From 06bd486c72342b4d979245c9f4051156e6492541 Mon Sep 17 00:00:00 2001 From: Sam Wilkins Date: Tue, 25 Jun 2019 21:22:29 -0400 Subject: scraping progress --- src/scraping/acm/index.js | 88 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 src/scraping/acm/index.js (limited to 'src/scraping/acm/index.js') diff --git a/src/scraping/acm/index.js b/src/scraping/acm/index.js new file mode 100644 index 000000000..81f775617 --- /dev/null +++ b/src/scraping/acm/index.js @@ -0,0 +1,88 @@ +const { Builder, By, Key, until } = require('selenium-webdriver'); +const fs = require("fs"); + +let driver; +fs.readFile("./citations.txt", { encoding: "utf8" }, scrapeTargets); +results = [] + +async function scrapeTargets(error, data) { + if (error) { + console.log("\nUnable to collect target citations from a citations.txt file stored in this directory.\nPlease make sure one is provided.") + return; + } + + driver = await new Builder().forBrowser('chrome').build(); + + let references = data.split("\n").map(entry => entry.replace("\r", "")).filter(line => line.match(/\d+/g)); + + let results = [] + let pdfs = [] + for (let id of references) { + let result = {} + let lines = [] + try { + let url = `https://dl.acm.org/citation.cfm?id=${id}`; + await driver.get(url); + await driver.sleep(500) + let candidates = await driver.findElements(By.xpath('.//a[contains(@href, "ft_gateway.cfm?id=")]')); + if (candidates.length > 0) { + pdfs.push(candidates[0]) + } + let webElements = await driver.findElements(By.id("abstract-body")) + for (let el of webElements) { + let text = await el.getText() + lines.push(text) + } + result.url = url + result.abstract = lines.join(" "); + await driver.findElement(By.xpath(`//*[@id="tab-1014-btnInnerEl"]/span`)).click() + await driver.sleep(500) + let authors = await driver.findElement(By.xpath('//*[@id="tabpanel-1009-body"]')).getText() + let sanitize = line => line.length > 0 && !(line.startsWith("No contact 
information") || line.startsWith("View colleagues of") || line.startsWith("Bibliometrics:")) + authorLines = authors.split("\n").map(line => line.trim()).filter(sanitize) + + let i = 0; + let allAuthors = [] + while (i < authorLines.length) { + let individual = []; + while (!authorLines[i].startsWith("Average citations")) { + individual.push(authorLines[i]) + i++ + } + individual.push(authorLines[i]) + allAuthors.push(individual); + i++ + } + result.authors = allAuthors.map(metadata => { + let publicationYears = metadata[1].substring(18).split("-"); + author = { + name: metadata[0], + publication_start: parseInt(publicationYears[0]), + publication_end: parseInt(publicationYears[1]) + }; + for (let count = 2; count < metadata.length; count++) { + let attr = metadata[count] + let char = attr.length - 1; + while (attr[char] != " ") { + char-- + } + let key = attr.substring(0, char).toLowerCase().replace(/ /g, "_").replace(/[\(\)]/g, ""); + let value = parseFloat(attr.substring(char + 1).replace(/,/g, "")); + author[key] = value + } + return author + }) + } catch (e) { + console.log(e) + await driver.quit(); + } + results.push(result) + } + + let output = ""; + results.forEach(res => output += (JSON.stringify(res, null, 4) + "\n")); + + fs.writeFile("./results.txt", output, function errorHandler(exception) { console.log(exception || "results successfully written") }) + + await driver.quit(); +} \ No newline at end of file -- cgit v1.2.3-70-g09d2 From 642d22526d102198ed624a2b1e2eaed3b8f731b6 Mon Sep 17 00:00:00 2001 From: Sam Wilkins Date: Sat, 29 Jun 2019 02:32:54 -0400 Subject: added acm scraping to master and made keyhandlers responsive to mac keystrokes --- src/.DS_Store | Bin 6148 -> 6148 bytes src/client/views/GlobalKeyHandler.ts | 12 +++-- src/scraping/acm/chromedriver | Bin 0 -> 10256192 bytes src/scraping/acm/chromedriver.exe | Bin 0 -> 7477760 bytes src/scraping/acm/citations.txt | 2 + src/scraping/acm/index.js | 97 +++++++++++++++++++++++++++++++++++ 
src/scraping/acm/package.json | 17 ++++++ src/scraping/acm/results.txt | 64 +++++++++++++++++++++++ 8 files changed, 188 insertions(+), 4 deletions(-) create mode 100755 src/scraping/acm/chromedriver create mode 100644 src/scraping/acm/chromedriver.exe create mode 100644 src/scraping/acm/citations.txt create mode 100644 src/scraping/acm/index.js create mode 100644 src/scraping/acm/package.json create mode 100644 src/scraping/acm/results.txt (limited to 'src/scraping/acm/index.js') diff --git a/src/.DS_Store b/src/.DS_Store index d70e95c0a..ff00f097e 100644 Binary files a/src/.DS_Store and b/src/.DS_Store differ diff --git a/src/client/views/GlobalKeyHandler.ts b/src/client/views/GlobalKeyHandler.ts index bb10f27cf..95a367cea 100644 --- a/src/client/views/GlobalKeyHandler.ts +++ b/src/client/views/GlobalKeyHandler.ts @@ -4,8 +4,9 @@ import { CollectionDockingView } from "./collections/CollectionDockingView"; import { MainView } from "./MainView"; import { DragManager } from "../util/DragManager"; import { action } from "mobx"; +import { emptyFunction } from "../../Utils"; -const modifiers = ["Control", "Meta", "Shift", "Alt"]; +const modifiers = ["control", "meta", "shift", "alt"]; type KeyHandler = (keycode: string) => KeyControlInfo; type KeyControlInfo = { preventDefault: boolean, @@ -19,10 +20,13 @@ export default class KeyManager { constructor(mainView: MainView) { this.mainView = mainView; + + let isMac = navigator.platform.toLowerCase().indexOf("mac") >= 0; + this.router.set("0000", this.unmodified); - this.router.set("0100", this.ctrl); - this.router.set("0010", this.alt); - this.router.set("1100", this.ctrl_shift); + this.router.set(isMac ? "0001" : "0100", this.ctrl); + this.router.set(isMac ? "0100" : "0010", this.alt); + this.router.set(isMac ? 
"1001" : "1100", this.ctrl_shift); } public handle = (e: KeyboardEvent) => { diff --git a/src/scraping/acm/chromedriver b/src/scraping/acm/chromedriver new file mode 100755 index 000000000..9e9b16717 Binary files /dev/null and b/src/scraping/acm/chromedriver differ diff --git a/src/scraping/acm/chromedriver.exe b/src/scraping/acm/chromedriver.exe new file mode 100644 index 000000000..6a362fd43 Binary files /dev/null and b/src/scraping/acm/chromedriver.exe differ diff --git a/src/scraping/acm/citations.txt b/src/scraping/acm/citations.txt new file mode 100644 index 000000000..e5018ddef --- /dev/null +++ b/src/scraping/acm/citations.txt @@ -0,0 +1,2 @@ +321046 +2412979 \ No newline at end of file diff --git a/src/scraping/acm/index.js b/src/scraping/acm/index.js new file mode 100644 index 000000000..b7455fe0d --- /dev/null +++ b/src/scraping/acm/index.js @@ -0,0 +1,97 @@ +const { + Builder, + By, + Key, + until +} = require('selenium-webdriver'); +const fs = require("fs"); + +let driver; +fs.readFile("./citations.txt", { + encoding: "utf8" +}, scrapeTargets); +results = [] + +async function scrapeTargets(error, data) { + if (error) { + console.log("\nUnable to collect target citations from a citations.txt file stored in this directory.\nPlease make sure one is provided.") + return; + } + + driver = await new Builder().forBrowser('chrome').build(); + + let references = data.split("\n").map(entry => entry.replace("\r", "")).filter(line => line.match(/\d+/g)); + + let results = [] + let pdfs = [] + for (let id of references) { + let result = {} + let lines = [] + try { + let url = `https://dl.acm.org/citation.cfm?id=${id}`; + await driver.get(url); + await driver.sleep(500) + let candidates = await driver.findElements(By.xpath('.//a[contains(@href, "ft_gateway.cfm?id=")]')); + if (candidates.length > 0) { + pdfs.push(candidates[0]) + } + let webElements = await driver.findElements(By.id("abstract-body")) + for (let el of webElements) { + let text = await el.getText() + 
lines.push(text) + } + result.url = url + result.abstract = lines.join(" "); + await driver.findElement(By.xpath(`//*[@id="tab-1014-btnInnerEl"]/span`)).click() + await driver.sleep(500) + let authors = await driver.findElement(By.xpath('//*[@id="tabpanel-1009-body"]')).getText() + let sanitize = line => line.length > 0 && !(line.startsWith("No contact information") || line.startsWith("View colleagues of") || line.startsWith("Bibliometrics:")) + authorLines = authors.split("\n").map(line => line.trim()).filter(sanitize) + + let i = 0; + let allAuthors = [] + while (i < authorLines.length) { + let individual = []; + while (!authorLines[i].startsWith("Average citations")) { + individual.push(authorLines[i]) + i++ + } + individual.push(authorLines[i]) + allAuthors.push(individual); + i++ + } + result.authors = allAuthors.map(metadata => { + let publicationYears = metadata[1].substring(18).split("-"); + author = { + name: metadata[0], + publication_start: parseInt(publicationYears[0]), + publication_end: parseInt(publicationYears[1]) + }; + for (let count = 2; count < metadata.length; count++) { + let attr = metadata[count] + let char = attr.length - 1; + while (attr[char] != " ") { + char-- + } + let key = attr.substring(0, char).toLowerCase().replace(/ /g, "_").replace(/[\(\)]/g, ""); + let value = parseFloat(attr.substring(char + 1).replace(/,/g, "")); + author[key] = value + } + return author + }) + } catch (e) { + console.log(e) + await driver.quit(); + } + results.push(result) + } + + let output = ""; + results.forEach(res => output += (JSON.stringify(res, null, 4) + "\n")); + + fs.writeFile("./results.txt", output, function errorHandler(exception) { + console.log(exception || "results successfully written") + }) + + await driver.quit(); +} \ No newline at end of file diff --git a/src/scraping/acm/package.json b/src/scraping/acm/package.json new file mode 100644 index 000000000..10f4d2156 --- /dev/null +++ b/src/scraping/acm/package.json @@ -0,0 +1,17 @@ +{ + 
"name": "scraper", + "version": "1.0.0", + "description": "", + "main": "index.js", + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1" + }, + "keywords": [], + "author": "", + "license": "ISC", + "dependencies": { + "axios": "^0.19.0", + "cheerio": "^1.0.0-rc.3", + "selenium-webdriver": "^4.0.0-alpha.4" + } +} diff --git a/src/scraping/acm/results.txt b/src/scraping/acm/results.txt new file mode 100644 index 000000000..4842139a9 --- /dev/null +++ b/src/scraping/acm/results.txt @@ -0,0 +1,64 @@ +{ + "url": "https://dl.acm.org/citation.cfm?id=321046", + "abstract": "It has been observed by many people that a striking number of quite diverse mathematical problems can be formulated as problems in integer programming, that is, linear programming problems in which some or all of the variables are required to assume integral values. This fact is rendered quite interesting by recent research on such problems, notably by R. E. Gomory [2, 3], which gives promise of yielding efficient computational techniques for their solution. The present paper provides yet another example of the versatility of integer programming as a mathematical modeling device by representing a generalization of the well-known “Travelling Salesman Problem” in integer programming terms. The authors have developed several such models, of which the one presented here is the most efficient in terms of generality, number of variables, and number of constraints. This model is due to the second author [4] and was presented briefly at the Symposium on Combinatorial Problems held at Princeton University, April 1960, sponsored by SIAM and IBM. The problem treated is: (1) A salesman is required to visit each of n cities, indexed by 1, … , n. He leaves from a “base city” indexed by 0, visits each of the n other cities exactly once, and returns to city 0. 
During his travels he must return to 0 exactly t times, including his final return (here t may be allowed to vary), and he must visit no more than p cities in one tour. (By a tour we mean a succession of visits to cities without stopping at city 0.) It is required to find such an itinerary which minimizes the total distance traveled by the salesman. Note that if t is fixed, then for the problem to have a solution we must have tp ≧ n. For t = 1, p ≧ n, we have the standard traveling salesman problem. Let dij (i ≠ j = 0, 1, … , n) be the distance covered in traveling from city i to city j. The following integer programming problem will be shown to be equivalent to (1): (2) Minimize the linear form ∑0≦i≠j≦n∑ dijxij over the set determined by the relations ∑ni=0i≠j xij = 1 (j = 1, … , n) ∑nj=0j≠i xij = 1 (i = 1, … , n) ui - uj + pxij ≦ p - 1 (1 ≦ i ≠ j ≦ n) where the xij are non-negative integers and the ui (i = 1, …, n) are arbitrary real numbers. (We shall see that it is permissible to restrict the ui to be non-negative integers as well.) If t is fixed it is necessary to add the additional relation: ∑nu=1 xi0 = t Note that the constraints require that xij = 0 or 1, so that a natural correspondence between these two problems exists if the xij are interpreted as follows: The salesman proceeds from city i to city j if and only if xij = 1. Under this correspondence the form to be minimized in (2) is the total distance to be traveled by the salesman in (1), so the burden of proof is to show that the two feasible sets correspond; i.e., a feasible solution to (2) has xij which do define a legitimate itinerary in (1), and, conversely a legitimate itinerary in (1) defines xij, which, together with appropriate ui, satisfy the constraints of (2). Consider a feasible solution to (2). The number of returns to city 0 is given by ∑ni=1 xi0. 
The constraints of the form ∑ xij = 1, all xij non-negative integers, represent the conditions that each city (other than zero) is visited exactly once. The ui play a role similar to node potentials in a network and the inequalities involving them serve to eliminate tours that do not begin and end at city 0 and tours that visit more than p cities. Consider any xr0r1 = 1 (r1 ≠ 0). There exists a unique r2 such that xr1r2 = 1. Unless r2 = 0, there is a unique r3 with xr2r3 = 1. We proceed in this fashion until some rj = 0. This must happen since the alternative is that at some point we reach an rk = rj, j + 1 < k. Since none of the r's are zero we have uri - uri + 1 + pxriri + 1 ≦ p - 1 or uri - uri + 1 ≦ - 1. Summing from i = j to k - 1, we have urj - urk = 0 ≦ j + 1 - k, which is a contradiction. Thus all tours include city 0. It remains to observe that no tours is of length greater than p. Suppose such a tour exists, x0r1 , xr1r2 , … , xrprp+1 = 1 with all ri ≠ 0. Then, as before, ur1 - urp+1 ≦ - p or urp+1 - ur1 ≧ p. But we have urp+1 - ur1 + pxrp+1r1 ≦ p - 1 or urp+1 - ur1 ≦ p (1 - xrp+1r1) - 1 ≦ p - 1, which is a contradiction. Conversely, if the xij correspond to a legitimate itinerary, it is clear that the ui can be adjusted so that ui = j if city i is the jth city visited in the tour which includes city i, for we then have ui - uj = - 1 if xij = 1, and always ui - uj ≦ p - 1. The above integer program involves n2 + n constraints (if t is not fixed) in n2 + 2n variables. Since the inequality form of constraint is fundamental for integer programming calculations, one may eliminate 2n variables, say the xi0 and x0j, by means of the equation constraints and produce an equivalent problem with n2 + n inequalities and n2 variables. The currently known integer programming procedures are sufficiently regular in their behavior to cast doubt on the heuristic value of machine experiments with our model. 
However, it seems appropriate to report the results of the five machine experiments we have conducted so far. The solution procedure used was the all-integer algorithm of R. E. Gomory [3] without the ranking procedure he describes. The first three experiments were simple model verification tests on a four-city standard traveling salesman problem with distance matrix [ 20 23 4 30 7 27 25 5 25 3 21 26 ] The first experiment was with a model, now obsolete, using roughly twice as many constraints and variables as the current model (for this problem, 28 constraints in 21 variables). The machine was halted after 4000 pivot steps had failed to produce a solution. The second experiment used the earlier model with the xi0 and x0j eliminated, resulting in a 28-constraint, 15-variable problem. Here the machine produced the optimal solution in 41 pivot steps. The third experiment used the current formulation with the xi0 and x0j eliminated, yielding 13 constraints and 9 variables. The optimal solution was reached in 7 pivot steps. The fourth and fifth experiments were used on a standard ten-city problem, due to Barachet, solved by Dantzig, Johnson and Fulkerson [1]. The current formulation was used, yielding 91 constraints in 81 variables. The fifth problem differed from the fourth only in that the ordering of the rows was altered to attempt to introduce more favorable pivot choices. In each case the machine was stopped after over 250 pivot steps had failed to produce the solution. In each case the last 100 pivot steps had failed to change the value of the objective function. It seems hopeful that more efficient integer programming procedures now under development will yield a satisfactory algorithmic solution to the traveling salesman problem, when applied to this model. In any case, the model serves to illustrate how problems of this sort may be succinctly formulated in integer programming terms.", + "authors": [ + { + "name": "C. E. 
Miller", + "publication_start": 1960, + "publication_end": 1960, + "publication_count": 1, + "citation_count": 179, + "available_for_download": 1, + "downloads_6_weeks": 133, + "downloads_12_months": 1002, + "downloads_cumulative": 9790, + "average_downloads_per_article": 9790, + "average_citations_per_article": 179 + }, + { + "name": "A. W. Tucker", + "publication_start": 1960, + "publication_end": 1993, + "publication_count": 5, + "citation_count": 196, + "available_for_download": 1, + "downloads_6_weeks": 133, + "downloads_12_months": 1002, + "downloads_cumulative": 9790, + "average_downloads_per_article": 9790, + "average_citations_per_article": 39.2 + }, + { + "name": "R. A. Zemlin", + "publication_start": 1960, + "publication_end": 1964, + "publication_count": 2, + "citation_count": 188, + "available_for_download": 2, + "downloads_6_weeks": 133, + "downloads_12_months": 1007, + "downloads_cumulative": 10021, + "average_downloads_per_article": 5010.5, + "average_citations_per_article": 94 + } + ] +} +{ + "url": "https://dl.acm.org/citation.cfm?id=2412979", + "abstract": "The STRUCT system utilizes the flexibility of a powerful graphics display system to provide a set of tools for program analysis. These tools allow the analysis of the static prograin structure and the dynamic execution behavior. of programs within the entire operating system/user program environment of the Brown University Graphics System (BUGS). Information is collected and presented in a manner which fully exploits two aspects of this environment. First, the operating system has been developed in a well-structured hierarcal manner following principles laid down by other researchers (2), (3). Second the programs under analysis have been written in a structured programming language following coding conventions which make available, at the source code level, valuable program control information. A new set of pictorial constructs is introduced for presenting a. 
program structure (static or dynamic) for inspection. These constructs combine the best features of an indented structured source code listing and the box odented nature of traditional flow charts. The graphical tools available are USed to provide for swift changes in. the desired level of detail displayed within a program structure, for traveling linearly through a program structure, for traveling through a complex program structure (following subroutine or system calls), for concurrently viewing multiple related program structures, and for presenting dynamic program behavior data using three-dimensional projections, The volume of a three-dimensional box representing a program block is proportional to the block's resource utilization. The scope of this paper is limited to a description of the STRUCT system. This system is currently being used to predict and analyze the performance advantages available through the migration of function (program modules) between levels of software and between software and firmware within BUGS. 
The results of this research on migration will be included in a doctoral dissertation currently being written.", + "authors": [ + { + "name": "Andries Van Dam", + "publication_start": 1975, + "publication_end": 1975, + "publication_count": 1, + "citation_count": 0, + "available_for_download": 0, + "downloads_6_weeks": 8, + "downloads_12_months": 97, + "downloads_cumulative": 97, + "average_downloads_per_article": 0, + "average_citations_per_article": 0 + } + ] +} -- cgit v1.2.3-70-g09d2 From a2e447925dd9fc9300d1c812cd202acf0de1aa95 Mon Sep 17 00:00:00 2001 From: Sam Wilkins Date: Mon, 1 Jul 2019 03:35:42 -0400 Subject: improved acm scraping script --- src/scraping/acm/index.js | 226 +++++++++++++++++++++++++++-------------- src/scraping/acm/results.txt | 26 ++--- src/scraping/buxton/scraper.py | 2 +- 3 files changed, 163 insertions(+), 91 deletions(-) (limited to 'src/scraping/acm/index.js') diff --git a/src/scraping/acm/index.js b/src/scraping/acm/index.js index b7455fe0d..3d5e801be 100644 --- a/src/scraping/acm/index.js +++ b/src/scraping/acm/index.js @@ -1,97 +1,167 @@ const { Builder, - By, - Key, - until + By } = require('selenium-webdriver'); -const fs = require("fs"); +const { + readFile, + writeFile +} = require('fs'); -let driver; -fs.readFile("./citations.txt", { - encoding: "utf8" -}, scrapeTargets); -results = [] +const driver_pause = 500; // milliseconds +const sample_line_char_max = 100; // characters +const target_browser = 'chrome'; + +// GENERAL UTILITY FUNCTIONS + +function log_read(content) { + process.stdout.write("reading " + content + "..."); +} + +function log_snippet(result) { + let ellipse = result.length > sample_line_char_max; + let i = sample_line_char_max; + if (ellipse) { + while (result[i] != " " && i < -1) { + i--; + } + } + console.log(` "${result.substring(0, i + 1).trim()}${ellipse ? "..." 
: ""}"`); +} + +// DRIVER UTILITY FUNCTIONS + +async function navigate_to(url) { + await driver.get(url); + await driver.sleep(driver_pause); +} + +async function click_on(xpath) { + await driver.findElement(By.xpath(xpath)).click(); + await driver.sleep(driver_pause); +} + +// TEXT SCRAPING + +async function read_title() { + log_read("title"); + let title_el = await driver.findElement(By.xpath('//*[@id="divmain"]/div/h1')); + let title = await title_el.getText(); + log_snippet(title); + return title; +} + +async function read_abstract() { + log_read("abstract"); + let lines = []; + let webElements = await driver.findElements(By.id("abstract-body")); + for (let el of webElements) { + let text = await el.getText(); + lines.push(text); + } + let abstract = lines.join(" "); + log_snippet(abstract); + return abstract; +} + +async function read_authors() { + log_read("authors"); + await click_on('//*[@id="tab-1014-btnInnerEl"]/span'); + + let authors_el = await driver.findElement(By.xpath('//*[@id="tabpanel-1009-body"]')); + let authors = await authors_el.getText(); + let sanitize = line => line.length > 0 && !(line.startsWith("No contact information") || line.startsWith("View colleagues of") || line.startsWith("Bibliometrics:")); + let author_lines = authors.split("\n").map(line => line.trim()).filter(sanitize); + + let all_authors = []; + let i = 0; + while (i < author_lines.length) { + let individual = []; + while (!author_lines[i].startsWith("Average citations")) { + individual.push(author_lines[i]); + i++; + } + individual.push(author_lines[i]); + all_authors.push(individual); + i++; + } -async function scrapeTargets(error, data) { + let multiple = all_authors.length == 1 ? 
"" : " et al."; + log_snippet(all_authors[0][0] + multiple); + return all_authors; +} + +// JSON / DASH CONVERSION AND EXPORT + +function parse_authors(metadata) { + let publicationYears = metadata[1].substring(18).split("-"); + author = { + name: metadata[0], + publication_start: parseInt(publicationYears[0]), + publication_end: parseInt(publicationYears[1]) + }; + for (let count = 2; count < metadata.length; count++) { + let attr = metadata[count]; + let char = attr.length - 1; + while (attr[char] != " ") { + char--; + } + let key = attr.substring(0, char).toLowerCase().replace(/ /g, "_").replace(/[\(\)]/g, ""); + let value = parseFloat(attr.substring(char + 1).replace(/,/g, "")); + author[key] = value; + } + return author; +} + +function write_results() { + let output = ""; + results.forEach(res => output += (JSON.stringify(res, null, 4) + "\n")); + + writeFile("./results.txt", output, function errorHandler(exception) { + console.log(exception || "scraped references successfully written as JSON to ./results.txt\n"); + }); +} + +async function scrape_targets(error, data) { if (error) { - console.log("\nUnable to collect target citations from a citations.txt file stored in this directory.\nPlease make sure one is provided.") + console.log("\nUnable to collect target citations from a citations.txt file stored in this directory.\nPlease make sure one is provided."); return; } - driver = await new Builder().forBrowser('chrome').build(); - let references = data.split("\n").map(entry => entry.replace("\r", "")).filter(line => line.match(/\d+/g)); + let quota = references.length; + console.log(`${references.join(", ")}\n`); + + driver = await new Builder().forBrowser(target_browser).build(); + + for (let i = 0; i < quota; i++) { + let result = {}; - let results = [] - let pdfs = [] - for (let id of references) { - let result = {} - let lines = [] try { - let url = `https://dl.acm.org/citation.cfm?id=${id}`; - await driver.get(url); - await driver.sleep(500) - let 
candidates = await driver.findElements(By.xpath('.//a[contains(@href, "ft_gateway.cfm?id=")]')); - if (candidates.length > 0) { - pdfs.push(candidates[0]) - } - let webElements = await driver.findElements(By.id("abstract-body")) - for (let el of webElements) { - let text = await el.getText() - lines.push(text) - } - result.url = url - result.abstract = lines.join(" "); - await driver.findElement(By.xpath(`//*[@id="tab-1014-btnInnerEl"]/span`)).click() - await driver.sleep(500) - let authors = await driver.findElement(By.xpath('//*[@id="tabpanel-1009-body"]')).getText() - let sanitize = line => line.length > 0 && !(line.startsWith("No contact information") || line.startsWith("View colleagues of") || line.startsWith("Bibliometrics:")) - authorLines = authors.split("\n").map(line => line.trim()).filter(sanitize) - - let i = 0; - let allAuthors = [] - while (i < authorLines.length) { - let individual = []; - while (!authorLines[i].startsWith("Average citations")) { - individual.push(authorLines[i]) - i++ - } - individual.push(authorLines[i]) - allAuthors.push(individual); - i++ - } - result.authors = allAuthors.map(metadata => { - let publicationYears = metadata[1].substring(18).split("-"); - author = { - name: metadata[0], - publication_start: parseInt(publicationYears[0]), - publication_end: parseInt(publicationYears[1]) - }; - for (let count = 2; count < metadata.length; count++) { - let attr = metadata[count] - let char = attr.length - 1; - while (attr[char] != " ") { - char-- - } - let key = attr.substring(0, char).toLowerCase().replace(/ /g, "_").replace(/[\(\)]/g, ""); - let value = parseFloat(attr.substring(char + 1).replace(/,/g, "")); - author[key] = value - } - return author - }) + let url = `https://dl.acm.org/citation.cfm?id=${references[i]}`; + await navigate_to(url); + console.log(`scraping ${i + 1}/${quota} (${url})`); + + result.url = url; + result.title = await read_title(); + result.abstract = await read_abstract(); + result.authors = (await 
read_authors()).map(parse_authors); } catch (e) { - console.log(e) + console.log(e); await driver.quit(); } - results.push(result) - } - let output = ""; - results.forEach(res => output += (JSON.stringify(res, null, 4) + "\n")); + results.push(result); + console.log(); + } - fs.writeFile("./results.txt", output, function errorHandler(exception) { - console.log(exception || "results successfully written") - }) + write_results(); await driver.quit(); -} \ No newline at end of file +} + +let driver; +let results = []; +console.log("reading references..."); +readFile("./citations.txt", { + encoding: "utf8" +}, scrape_targets); \ No newline at end of file diff --git a/src/scraping/acm/results.txt b/src/scraping/acm/results.txt index 4842139a9..fffa7ff51 100644 --- a/src/scraping/acm/results.txt +++ b/src/scraping/acm/results.txt @@ -1,5 +1,6 @@ { "url": "https://dl.acm.org/citation.cfm?id=321046", + "title": "Integer Programming Formulation of Traveling Salesman Problems", "abstract": "It has been observed by many people that a striking number of quite diverse mathematical problems can be formulated as problems in integer programming, that is, linear programming problems in which some or all of the variables are required to assume integral values. This fact is rendered quite interesting by recent research on such problems, notably by R. E. Gomory [2, 3], which gives promise of yielding efficient computational techniques for their solution. The present paper provides yet another example of the versatility of integer programming as a mathematical modeling device by representing a generalization of the well-known “Travelling Salesman Problem” in integer programming terms. The authors have developed several such models, of which the one presented here is the most efficient in terms of generality, number of variables, and number of constraints. 
This model is due to the second author [4] and was presented briefly at the Symposium on Combinatorial Problems held at Princeton University, April 1960, sponsored by SIAM and IBM. The problem treated is: (1) A salesman is required to visit each of n cities, indexed by 1, … , n. He leaves from a “base city” indexed by 0, visits each of the n other cities exactly once, and returns to city 0. During his travels he must return to 0 exactly t times, including his final return (here t may be allowed to vary), and he must visit no more than p cities in one tour. (By a tour we mean a succession of visits to cities without stopping at city 0.) It is required to find such an itinerary which minimizes the total distance traveled by the salesman. Note that if t is fixed, then for the problem to have a solution we must have tp ≧ n. For t = 1, p ≧ n, we have the standard traveling salesman problem. Let dij (i ≠ j = 0, 1, … , n) be the distance covered in traveling from city i to city j. The following integer programming problem will be shown to be equivalent to (1): (2) Minimize the linear form ∑0≦i≠j≦n∑ dijxij over the set determined by the relations ∑ni=0i≠j xij = 1 (j = 1, … , n) ∑nj=0j≠i xij = 1 (i = 1, … , n) ui - uj + pxij ≦ p - 1 (1 ≦ i ≠ j ≦ n) where the xij are non-negative integers and the ui (i = 1, …, n) are arbitrary real numbers. (We shall see that it is permissible to restrict the ui to be non-negative integers as well.) If t is fixed it is necessary to add the additional relation: ∑nu=1 xi0 = t Note that the constraints require that xij = 0 or 1, so that a natural correspondence between these two problems exists if the xij are interpreted as follows: The salesman proceeds from city i to city j if and only if xij = 1. 
Under this correspondence the form to be minimized in (2) is the total distance to be traveled by the salesman in (1), so the burden of proof is to show that the two feasible sets correspond; i.e., a feasible solution to (2) has xij which do define a legitimate itinerary in (1), and, conversely a legitimate itinerary in (1) defines xij, which, together with appropriate ui, satisfy the constraints of (2). Consider a feasible solution to (2). The number of returns to city 0 is given by ∑ni=1 xi0. The constraints of the form ∑ xij = 1, all xij non-negative integers, represent the conditions that each city (other than zero) is visited exactly once. The ui play a role similar to node potentials in a network and the inequalities involving them serve to eliminate tours that do not begin and end at city 0 and tours that visit more than p cities. Consider any xr0r1 = 1 (r1 ≠ 0). There exists a unique r2 such that xr1r2 = 1. Unless r2 = 0, there is a unique r3 with xr2r3 = 1. We proceed in this fashion until some rj = 0. This must happen since the alternative is that at some point we reach an rk = rj, j + 1 < k. Since none of the r's are zero we have uri - uri + 1 + pxriri + 1 ≦ p - 1 or uri - uri + 1 ≦ - 1. Summing from i = j to k - 1, we have urj - urk = 0 ≦ j + 1 - k, which is a contradiction. Thus all tours include city 0. It remains to observe that no tours is of length greater than p. Suppose such a tour exists, x0r1 , xr1r2 , … , xrprp+1 = 1 with all ri ≠ 0. Then, as before, ur1 - urp+1 ≦ - p or urp+1 - ur1 ≧ p. But we have urp+1 - ur1 + pxrp+1r1 ≦ p - 1 or urp+1 - ur1 ≦ p (1 - xrp+1r1) - 1 ≦ p - 1, which is a contradiction. Conversely, if the xij correspond to a legitimate itinerary, it is clear that the ui can be adjusted so that ui = j if city i is the jth city visited in the tour which includes city i, for we then have ui - uj = - 1 if xij = 1, and always ui - uj ≦ p - 1. 
The above integer program involves n2 + n constraints (if t is not fixed) in n2 + 2n variables. Since the inequality form of constraint is fundamental for integer programming calculations, one may eliminate 2n variables, say the xi0 and x0j, by means of the equation constraints and produce an equivalent problem with n2 + n inequalities and n2 variables. The currently known integer programming procedures are sufficiently regular in their behavior to cast doubt on the heuristic value of machine experiments with our model. However, it seems appropriate to report the results of the five machine experiments we have conducted so far. The solution procedure used was the all-integer algorithm of R. E. Gomory [3] without the ranking procedure he describes. The first three experiments were simple model verification tests on a four-city standard traveling salesman problem with distance matrix [ 20 23 4 30 7 27 25 5 25 3 21 26 ] The first experiment was with a model, now obsolete, using roughly twice as many constraints and variables as the current model (for this problem, 28 constraints in 21 variables). The machine was halted after 4000 pivot steps had failed to produce a solution. The second experiment used the earlier model with the xi0 and x0j eliminated, resulting in a 28-constraint, 15-variable problem. Here the machine produced the optimal solution in 41 pivot steps. The third experiment used the current formulation with the xi0 and x0j eliminated, yielding 13 constraints and 9 variables. The optimal solution was reached in 7 pivot steps. The fourth and fifth experiments were used on a standard ten-city problem, due to Barachet, solved by Dantzig, Johnson and Fulkerson [1]. The current formulation was used, yielding 91 constraints in 81 variables. The fifth problem differed from the fourth only in that the ordering of the rows was altered to attempt to introduce more favorable pivot choices. 
In each case the machine was stopped after over 250 pivot steps had failed to produce the solution. In each case the last 100 pivot steps had failed to change the value of the objective function. It seems hopeful that more efficient integer programming procedures now under development will yield a satisfactory algorithmic solution to the traveling salesman problem, when applied to this model. In any case, the model serves to illustrate how problems of this sort may be succinctly formulated in integer programming terms.", "authors": [ { @@ -9,10 +10,10 @@ "publication_count": 1, "citation_count": 179, "available_for_download": 1, - "downloads_6_weeks": 133, - "downloads_12_months": 1002, - "downloads_cumulative": 9790, - "average_downloads_per_article": 9790, + "downloads_6_weeks": 130, + "downloads_12_months": 1004, + "downloads_cumulative": 9792, + "average_downloads_per_article": 9792, "average_citations_per_article": 179 }, { @@ -22,10 +23,10 @@ "publication_count": 5, "citation_count": 196, "available_for_download": 1, - "downloads_6_weeks": 133, - "downloads_12_months": 1002, - "downloads_cumulative": 9790, - "average_downloads_per_article": 9790, + "downloads_6_weeks": 130, + "downloads_12_months": 1004, + "downloads_cumulative": 9792, + "average_downloads_per_article": 9792, "average_citations_per_article": 39.2 }, { @@ -35,16 +36,17 @@ "publication_count": 2, "citation_count": 188, "available_for_download": 2, - "downloads_6_weeks": 133, - "downloads_12_months": 1007, - "downloads_cumulative": 10021, - "average_downloads_per_article": 5010.5, + "downloads_6_weeks": 130, + "downloads_12_months": 1009, + "downloads_cumulative": 10023, + "average_downloads_per_article": 5011.5, "average_citations_per_article": 94 } ] } { "url": "https://dl.acm.org/citation.cfm?id=2412979", + "title": "STRUCT programming analysis system", "abstract": "The STRUCT system utilizes the flexibility of a powerful graphics display system to provide a set of tools for program analysis. 
These tools allow the analysis of the static program structure and the dynamic execution behavior of programs within the entire operating system/user program environment of the Brown University Graphics System (BUGS). Information is collected and presented in a manner which fully exploits two aspects of this environment. First, the operating system has been developed in a well-structured hierarchical manner following principles laid down by other researchers (2), (3). Second, the programs under analysis have been written in a structured programming language following coding conventions which make available, at the source code level, valuable program control information. A new set of pictorial constructs is introduced for presenting a program structure (static or dynamic) for inspection. These constructs combine the best features of an indented structured source code listing and the box oriented nature of traditional flow charts. The graphical tools available are used to provide for swift changes in the desired level of detail displayed within a program structure, for traveling linearly through a program structure, for traveling through a complex program structure (following subroutine or system calls), for concurrently viewing multiple related program structures, and for presenting dynamic program behavior data using three-dimensional projections. The volume of a three-dimensional box representing a program block is proportional to the block's resource utilization. The scope of this paper is limited to a description of the STRUCT system. This system is currently being used to predict and analyze the performance advantages available through the migration of function (program modules) between levels of software and between software and firmware within BUGS. 
The results of this research on migration will be included in a doctoral dissertation currently being written.", "authors": [ { diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py index 8766a54fd..02c6d8b74 100644 --- a/src/scraping/buxton/scraper.py +++ b/src/scraping/buxton/scraper.py @@ -118,7 +118,7 @@ def write_image(folder, name): image = Image.open(f"{dist}/{folder}/{name}") native_width, native_height = image.size - + view_doc = { "_id": view_doc_guid, "fields": { -- cgit v1.2.3-70-g09d2 From cd78d5b3371d73cc51dcb6f3dbfdca1a3bbcf6e1 Mon Sep 17 00:00:00 2001 From: Sam Wilkins Date: Mon, 1 Jul 2019 04:58:37 -0400 Subject: final scraping changes --- src/scraping/acm/index.js | 106 +++++++++++++++++++++++++++------------------- 1 file changed, 63 insertions(+), 43 deletions(-) (limited to 'src/scraping/acm/index.js') diff --git a/src/scraping/acm/index.js b/src/scraping/acm/index.js index 3d5e801be..ff4b099e7 100644 --- a/src/scraping/acm/index.js +++ b/src/scraping/acm/index.js @@ -17,15 +17,32 @@ function log_read(content) { process.stdout.write("reading " + content + "..."); } -function log_snippet(result) { - let ellipse = result.length > sample_line_char_max; - let i = sample_line_char_max; - if (ellipse) { - while (result[i] != " " && i < -1) { - i--; - } +function log_snippet(result, quotes = true) { + let snippet = "failed to create snippet"; + switch (typeof result) { + case "string": + let ellipse = result.length > sample_line_char_max; + let i = sample_line_char_max; + if (ellipse) { + while (result[i] != " " && i < -1) { + i--; + } + } + snippet = `${result.substring(0, i + 1).trim()}${ellipse ? "..." : ""}`; + snippet = quotes ? 
`"${snippet}"` : snippet; + break; + case "object": + snippet = result.map(res => { + switch (typeof res) { + case "string": + return res.substring(0, sample_line_char_max / result.length); + case "object": + return res[Object.keys(res)[0]]; + } + }).join(', '); } - console.log(` "${result.substring(0, i + 1).trim()}${ellipse ? "..." : ""}"`); + console.log(snippet); + return result; } // DRIVER UTILITY FUNCTIONS @@ -35,40 +52,41 @@ async function navigate_to(url) { await driver.sleep(driver_pause); } -async function click_on(xpath) { - await driver.findElement(By.xpath(xpath)).click(); +async function click_on(ref) { + await (await locate(ref)).click(); await driver.sleep(driver_pause); } -// TEXT SCRAPING +async function locate(ref, multiple = false) { + let locator = ref.startsWith("//") ? By.xpath(ref) : By.id(ref); + return await multiple ? driver.findElements(locator) : driver.findElement(locator); +} -async function read_title() { - log_read("title"); - let title_el = await driver.findElement(By.xpath('//*[@id="divmain"]/div/h1')); - let title = await title_el.getText(); - log_snippet(title); - return title; +async function text_of(ref) { + let element = await locate(ref); + return await element.getText(); } -async function read_abstract() { - log_read("abstract"); - let lines = []; - let webElements = await driver.findElements(By.id("abstract-body")); - for (let el of webElements) { - let text = await el.getText(); - lines.push(text); +async function text_of_all(ref) { + let elements = await locate(ref, true); + let results = []; + for (let element of elements) { + results.push(await element.getText()); } - let abstract = lines.join(" "); - log_snippet(abstract); - return abstract; + return results; } +async function logged_assign(key, value) { + log_read(key); + result[key] = log_snippet(value); +} + +// TEXT SCRAPING + async function read_authors() { - log_read("authors"); await click_on('//*[@id="tab-1014-btnInnerEl"]/span'); - let authors_el = await 
driver.findElement(By.xpath('//*[@id="tabpanel-1009-body"]')); - let authors = await authors_el.getText(); + let authors = await text_of('//*[@id="tabpanel-1009-body"]'); let sanitize = line => line.length > 0 && !(line.startsWith("No contact information") || line.startsWith("View colleagues of") || line.startsWith("Bibliometrics:")); let author_lines = authors.split("\n").map(line => line.trim()).filter(sanitize); @@ -85,8 +103,6 @@ async function read_authors() { i++; } - let multiple = all_authors.length == 1 ? "" : " et al."; - log_snippet(all_authors[0][0] + multiple); return all_authors; } @@ -113,6 +129,7 @@ function parse_authors(metadata) { } function write_results() { + console.log(); let output = ""; results.forEach(res => output += (JSON.stringify(res, null, 4) + "\n")); @@ -129,29 +146,29 @@ async function scrape_targets(error, data) { let references = data.split("\n").map(entry => entry.replace("\r", "")).filter(line => line.match(/\d+/g)); let quota = references.length; - console.log(`${references.join(", ")}\n`); + log_snippet(`found ${quota} references to scrape`, false); driver = await new Builder().forBrowser(target_browser).build(); for (let i = 0; i < quota; i++) { - let result = {}; - try { - let url = `https://dl.acm.org/citation.cfm?id=${references[i]}`; + result = {}; + + let id = references[i]; + let url = `https://dl.acm.org/citation.cfm?id=${id}`; + console.log(`\nscraping ${i + 1}/${quota} (${id})`); await navigate_to(url); - console.log(`scraping ${i + 1}/${quota} (${url})`); - result.url = url; - result.title = await read_title(); - result.abstract = await read_abstract(); - result.authors = (await read_authors()).map(parse_authors); + logged_assign("url", url); + logged_assign("title", await text_of('//*[@id="divmain"]/div/h1')); + logged_assign("abstract", (await text_of_all("abstract-body")).join(" ")); + logged_assign("authors", (await read_authors()).map(parse_authors)); } catch (e) { console.log(e); await driver.quit(); } 
results.push(result); - console.log(); } write_results(); @@ -161,7 +178,10 @@ async function scrape_targets(error, data) { let driver; let results = []; -console.log("reading references..."); +let result = {}; + +log_read("target references"); + readFile("./citations.txt", { encoding: "utf8" }, scrape_targets); \ No newline at end of file -- cgit v1.2.3-70-g09d2 From ab2b2317e601b9e440f9c48b4639c143d5187949 Mon Sep 17 00:00:00 2001 From: Sam Wilkins Date: Mon, 1 Jul 2019 19:29:15 -0400 Subject: added tooltips and improved acm scraping --- src/client/views/DocumentDecorations.tsx | 2 +- src/client/views/GlobalKeyHandler.ts | 2 +- src/client/views/MainView.tsx | 6 ++-- src/client/views/TemplateMenu.tsx | 2 +- src/scraping/acm/index.js | 56 +++++++++++++++++++++++++------- src/scraping/acm/results.txt | 34 +++++++++++-------- 6 files changed, 71 insertions(+), 31 deletions(-) (limited to 'src/scraping/acm/index.js') diff --git a/src/client/views/DocumentDecorations.tsx b/src/client/views/DocumentDecorations.tsx index 61e9d209a..e1eff5cdc 100644 --- a/src/client/views/DocumentDecorations.tsx +++ b/src/client/views/DocumentDecorations.tsx @@ -692,7 +692,7 @@ export class DocumentDecorations extends React.Component<{}, { value: string }> {this._edtingTitle ? :
{`${this.selectionTitle}`}
} -
X
+
X
e.preventDefault()}>
e.preventDefault()}>
e.preventDefault()}>
diff --git a/src/client/views/GlobalKeyHandler.ts b/src/client/views/GlobalKeyHandler.ts index e467d7c61..4c72bd683 100644 --- a/src/client/views/GlobalKeyHandler.ts +++ b/src/client/views/GlobalKeyHandler.ts @@ -13,7 +13,7 @@ type KeyControlInfo = { }; export default class KeyManager { - public static Handler: KeyManager = new KeyManager(); + public static Instance: KeyManager = new KeyManager(); private router = new Map(); constructor() { diff --git a/src/client/views/MainView.tsx b/src/client/views/MainView.tsx index fb88c284f..a9d75d8cc 100644 --- a/src/client/views/MainView.tsx +++ b/src/client/views/MainView.tsx @@ -64,8 +64,8 @@ export class MainView extends React.Component { } componentWillMount() { - window.removeEventListener("keydown", KeyManager.Handler.handle); - window.addEventListener("keydown", KeyManager.Handler.handle); + window.removeEventListener("keydown", KeyManager.Instance.handle); + window.addEventListener("keydown", KeyManager.Instance.handle); window.removeEventListener("pointerdown", this.pointerDown); window.addEventListener("pointerdown", this.pointerDown); @@ -78,7 +78,7 @@ export class MainView extends React.Component { pointerUp = (e: PointerEvent) => this.isPointerDown = false; componentWillUnMount() { - window.removeEventListener("keydown", KeyManager.Handler.handle); + window.removeEventListener("keydown", KeyManager.Instance.handle); window.removeEventListener("pointerdown", this.pointerDown); window.removeEventListener("pointerup", this.pointerUp); } diff --git a/src/client/views/TemplateMenu.tsx b/src/client/views/TemplateMenu.tsx index a9bc4d3d2..a1d59484a 100644 --- a/src/client/views/TemplateMenu.tsx +++ b/src/client/views/TemplateMenu.tsx @@ -79,7 +79,7 @@ export class TemplateMenu extends React.Component { return (
-
this.toggleTemplateActivity()}>+
+
this.toggleTemplateActivity()}>+
    {templateMenu}
diff --git a/src/scraping/acm/index.js b/src/scraping/acm/index.js index ff4b099e7..ad0f844ba 100644 --- a/src/scraping/acm/index.js +++ b/src/scraping/acm/index.js @@ -10,6 +10,17 @@ const { const driver_pause = 500; // milliseconds const sample_line_char_max = 100; // characters const target_browser = 'chrome'; +const tab_map = { + abstract: "11", + authors: "14", + references: "15", + cited_by: "16", + index_terms: "17", + publication: "18", + reviews: "19", + comments: "20", + table_of_contents: "21" +}; // GENERAL UTILITY FUNCTIONS @@ -32,14 +43,18 @@ function log_snippet(result, quotes = true) { snippet = quotes ? `"${snippet}"` : snippet; break; case "object": - snippet = result.map(res => { - switch (typeof res) { - case "string": - return res.substring(0, sample_line_char_max / result.length); - case "object": - return res[Object.keys(res)[0]]; - } - }).join(', '); + if (Array.isArray(result)) { + snippet = result.map(res => { + switch (typeof res) { + case "string": + return res.substring(0, sample_line_char_max / result.length); + case "object": + return res[Object.keys(res)[0]]; + } + }).join(', '); + } else { + snippet = result[Object.keys(result)[0]]; + } } console.log(snippet); return result; @@ -57,6 +72,10 @@ async function click_on(ref) { await driver.sleep(driver_pause); } +async function click_on_acm_tab(target) { + await click_on(`//*[@id="tab-10${tab_map[target]}-btnInnerEl"]/span`); +} + async function locate(ref, multiple = false) { let locator = ref.startsWith("//") ? By.xpath(ref) : By.id(ref); return await multiple ? 
driver.findElements(locator) : driver.findElement(locator); @@ -84,8 +103,6 @@ async function logged_assign(key, value) { // TEXT SCRAPING async function read_authors() { - await click_on('//*[@id="tab-1014-btnInnerEl"]/span'); - let authors = await text_of('//*[@id="tabpanel-1009-body"]'); let sanitize = line => line.length > 0 && !(line.startsWith("No contact information") || line.startsWith("View colleagues of") || line.startsWith("Bibliometrics:")); let author_lines = authors.split("\n").map(line => line.trim()).filter(sanitize); @@ -106,6 +123,17 @@ async function read_authors() { return all_authors; } +async function read_publication() { + let publciation_elements = (await text_of("source-body")).split("\n"); + let publication_module = {}; + for (let element of publciation_elements) { + if (element.startsWith("Title")) { + publication_module.title = element.substring(6); + } + } + return publication_module; +} + // JSON / DASH CONVERSION AND EXPORT function parse_authors(metadata) { @@ -134,7 +162,7 @@ function write_results() { results.forEach(res => output += (JSON.stringify(res, null, 4) + "\n")); writeFile("./results.txt", output, function errorHandler(exception) { - console.log(exception || "scraped references successfully written as JSON to ./results.txt\n"); + console.log(exception || "scraped references successfully written as JSON to ./results.txt"); }); } @@ -157,12 +185,18 @@ async function scrape_targets(error, data) { let id = references[i]; let url = `https://dl.acm.org/citation.cfm?id=${id}`; console.log(`\nscraping ${i + 1}/${quota} (${id})`); + await navigate_to(url); logged_assign("url", url); logged_assign("title", await text_of('//*[@id="divmain"]/div/h1')); logged_assign("abstract", (await text_of_all("abstract-body")).join(" ")); + + await click_on_acm_tab("authors"); logged_assign("authors", (await read_authors()).map(parse_authors)); + + await click_on_acm_tab("publication"); + logged_assign("publication", await read_publication()); } 
catch (e) { console.log(e); await driver.quit(); diff --git a/src/scraping/acm/results.txt b/src/scraping/acm/results.txt index fffa7ff51..ba66d61a7 100644 --- a/src/scraping/acm/results.txt +++ b/src/scraping/acm/results.txt @@ -10,10 +10,10 @@ "publication_count": 1, "citation_count": 179, "available_for_download": 1, - "downloads_6_weeks": 130, - "downloads_12_months": 1004, - "downloads_cumulative": 9792, - "average_downloads_per_article": 9792, + "downloads_6_weeks": 123, + "downloads_12_months": 922, + "downloads_cumulative": 9793, + "average_downloads_per_article": 9793, "average_citations_per_article": 179 }, { @@ -23,10 +23,10 @@ "publication_count": 5, "citation_count": 196, "available_for_download": 1, - "downloads_6_weeks": 130, - "downloads_12_months": 1004, - "downloads_cumulative": 9792, - "average_downloads_per_article": 9792, + "downloads_6_weeks": 123, + "downloads_12_months": 922, + "downloads_cumulative": 9793, + "average_downloads_per_article": 9793, "average_citations_per_article": 39.2 }, { @@ -36,13 +36,16 @@ "publication_count": 2, "citation_count": 188, "available_for_download": 2, - "downloads_6_weeks": 130, - "downloads_12_months": 1009, - "downloads_cumulative": 10023, - "average_downloads_per_article": 5011.5, + "downloads_6_weeks": 123, + "downloads_12_months": 927, + "downloads_cumulative": 10024, + "average_downloads_per_article": 5012, "average_citations_per_article": 94 } - ] + ], + "publication": { + "title": "Journal of the ACM (JACM) JACM Homepage table of contents archive" + } } { "url": "https://dl.acm.org/citation.cfm?id=2412979", @@ -62,5 +65,8 @@ "average_downloads_per_article": 0, "average_citations_per_article": 0 } - ] + ], + "publication": { + "title": "IEEE Transactions on Software Engineering table of contents archive" + } } -- cgit v1.2.3-70-g09d2 From 44ff64c8186b25069c14aaf4de0ae694f872c2d3 Mon Sep 17 00:00:00 2001 From: Sam Wilkins Date: Mon, 1 Jul 2019 19:31:13 -0400 Subject: scraping function tweak --- 
src/scraping/acm/index.js | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'src/scraping/acm/index.js') diff --git a/src/scraping/acm/index.js b/src/scraping/acm/index.js index ad0f844ba..39938ecca 100644 --- a/src/scraping/acm/index.js +++ b/src/scraping/acm/index.js @@ -28,6 +28,10 @@ function log_read(content) { process.stdout.write("reading " + content + "..."); } +function first_value(object) { + return object[Object.keys(object)[0]]; +} + function log_snippet(result, quotes = true) { let snippet = "failed to create snippet"; switch (typeof result) { @@ -49,11 +53,11 @@ function log_snippet(result, quotes = true) { case "string": return res.substring(0, sample_line_char_max / result.length); case "object": - return res[Object.keys(res)[0]]; + return first_value(res); } }).join(', '); } else { - snippet = result[Object.keys(result)[0]]; + snippet = first_value(result); } } console.log(snippet); -- cgit v1.2.3-70-g09d2 From 4d31e1038ece3fdf9565f226486aeff0e36bb48d Mon Sep 17 00:00:00 2001 From: Sam Wilkins Date: Tue, 2 Jul 2019 01:03:01 -0400 Subject: keyhandler and scraping --- src/client/views/GlobalKeyHandler.ts | 2 +- src/scraping/acm/index.js | 90 ++++++++++++++++++++++++++++-------- src/scraping/acm/results.txt | 47 +++++++++++++------ 3 files changed, 106 insertions(+), 33 deletions(-) (limited to 'src/scraping/acm/index.js') diff --git a/src/client/views/GlobalKeyHandler.ts b/src/client/views/GlobalKeyHandler.ts index 9a0c3a6b1..574e43ba3 100644 --- a/src/client/views/GlobalKeyHandler.ts +++ b/src/client/views/GlobalKeyHandler.ts @@ -100,7 +100,7 @@ export default class KeyManager { MainView.Instance.mainFreeform && CollectionDockingView.Instance.AddRightSplit(MainView.Instance.mainFreeform, undefined); break; case "arrowleft": - MainView.Instance.mainFreeform && CollectionDockingView.Instance.CloseRightSplit(MainView.Instance.mainFreeform) + MainView.Instance.mainFreeform && 
CollectionDockingView.Instance.CloseRightSplit(MainView.Instance.mainFreeform); break; case "f": MainView.Instance.isSearchVisible = !MainView.Instance.isSearchVisible; diff --git a/src/scraping/acm/index.js b/src/scraping/acm/index.js index 39938ecca..be844da31 100644 --- a/src/scraping/acm/index.js +++ b/src/scraping/acm/index.js @@ -7,9 +7,13 @@ const { writeFile } = require('fs'); +const target_source = './citations.txt'; +const target_browser = 'chrome'; +const target_dist = './results.txt'; + const driver_pause = 500; // milliseconds const sample_line_char_max = 100; // characters -const target_browser = 'chrome'; + const tab_map = { abstract: "11", authors: "14", @@ -22,16 +26,29 @@ const tab_map = { table_of_contents: "21" }; +String.prototype.removeAll = function (replacements, trim = true) { + let result = this; + for (let expression of replacements) { + result = result.replace(expression, ""); + } + return trim ? result.trim() : result; +}; + +String.prototype.remove = function (replacement, trim = true) { + let result = this.replace(replacement, ""); + return trim ? 
result.trim() : result; +}; + +Object.prototype.first = function () { + return this[Object.keys(this)[0]]; +}; + // GENERAL UTILITY FUNCTIONS function log_read(content) { process.stdout.write("reading " + content + "..."); } -function first_value(object) { - return object[Object.keys(object)[0]]; -} - function log_snippet(result, quotes = true) { let snippet = "failed to create snippet"; switch (typeof result) { @@ -53,11 +70,11 @@ function log_snippet(result, quotes = true) { case "string": return res.substring(0, sample_line_char_max / result.length); case "object": - return first_value(res); + return res.first(); } }).join(', '); } else { - snippet = first_value(result); + snippet = result.first(); } } console.log(snippet); @@ -130,9 +147,40 @@ async function read_authors() { async function read_publication() { let publciation_elements = (await text_of("source-body")).split("\n"); let publication_module = {}; + + let extract = (regex, target, index = 1) => regex.exec(target)[index]; + for (let element of publciation_elements) { + + let location = /Volume (\d+) Issue (\d+), ([\w.\d]+)/g; + let pages = /(\d+)-(\d+)/g; + let publication_date = /(\d{4}-\d{2}-\d{2})/g; + let publisher = /Publisher (.*)/g; + let issn = /ISSN: (\d{4}-\d{4})/g; + let eissn = /EISSN: ([\dA-Z]{4}-[\dA-Z]{4})/g; + let doi = /doi>([\.\d\/A-Z]+)/g; + if (element.startsWith("Title")) { - publication_module.title = element.substring(6); + publication_module.name = element.substring(6).removeAll(["table of contents", "archive", /\w+ Homepage/]); + } else if (element.startsWith("Volume")) { + let match = location.exec(element); + publication_module.volume = parseInt(match[1]); + publication_module.issue = parseInt(match[2]); + publication_module.month = match[3]; + } else if (element.startsWith("Pages")) { + let match = pages.exec(element); + publication_module.page_start = parseInt(match[1]); + publication_module.page_end = parseInt(match[2]); + } else if (element.startsWith("Publication Date 
")) { + publication_module.publication_date = extract(publication_date, element); + } else if (element.startsWith("Publisher ")) { + publication_module.publisher = extract(publisher, element); + } else if (element.startsWith("ISSN: ")) { + publication_module.issn = extract(issn, element); + if (element.includes("EISSN: ")) { + publication_module.eissn = extract(eissn, element); + } + publication_module.doi = extract(doi, element); } } return publication_module; @@ -153,8 +201,8 @@ function parse_authors(metadata) { while (attr[char] != " ") { char--; } - let key = attr.substring(0, char).toLowerCase().replace(/ /g, "_").replace(/[\(\)]/g, ""); - let value = parseFloat(attr.substring(char + 1).replace(/,/g, "")); + let key = attr.substring(0, char).toLowerCase().replace(/ /g, "_").remove(/[\(\)]/g); + let value = parseFloat(attr.substring(char + 1).remove(/,/g)); author[key] = value; } return author; @@ -165,7 +213,7 @@ function write_results() { let output = ""; results.forEach(res => output += (JSON.stringify(res, null, 4) + "\n")); - writeFile("./results.txt", output, function errorHandler(exception) { + writeFile(target_dist, output, function errorHandler(exception) { console.log(exception || "scraped references successfully written as JSON to ./results.txt"); }); } @@ -176,7 +224,7 @@ async function scrape_targets(error, data) { return; } - let references = data.split("\n").map(entry => entry.replace("\r", "")).filter(line => line.match(/\d+/g)); + let references = data.split("\n").map(entry => entry.removeAll(["\r"])).filter(line => line.match(/\d+/g)); let quota = references.length; log_snippet(`found ${quota} references to scrape`, false); @@ -185,6 +233,7 @@ async function scrape_targets(error, data) { for (let i = 0; i < quota; i++) { try { result = {}; + let target; let id = references[i]; let url = `https://dl.acm.org/citation.cfm?id=${id}`; @@ -194,13 +243,18 @@ async function scrape_targets(error, data) { logged_assign("url", url); 
logged_assign("title", await text_of('//*[@id="divmain"]/div/h1')); - logged_assign("abstract", (await text_of_all("abstract-body")).join(" ")); - await click_on_acm_tab("authors"); - logged_assign("authors", (await read_authors()).map(parse_authors)); + target = "abstract"; + await click_on_acm_tab(target); + logged_assign(target, (await text_of_all("abstract-body")).join(" ")); + + target = "authors"; + await click_on_acm_tab(target); + logged_assign(target, (await read_authors()).map(parse_authors)); - await click_on_acm_tab("publication"); - logged_assign("publication", await read_publication()); + target = "publication"; + await click_on_acm_tab(target); + logged_assign(target, await read_publication()); } catch (e) { console.log(e); await driver.quit(); @@ -220,6 +274,6 @@ let result = {}; log_read("target references"); -readFile("./citations.txt", { +readFile(target_source, { encoding: "utf8" }, scrape_targets); \ No newline at end of file diff --git a/src/scraping/acm/results.txt b/src/scraping/acm/results.txt index ba66d61a7..a15da8b10 100644 --- a/src/scraping/acm/results.txt +++ b/src/scraping/acm/results.txt @@ -10,10 +10,10 @@ "publication_count": 1, "citation_count": 179, "available_for_download": 1, - "downloads_6_weeks": 123, - "downloads_12_months": 922, - "downloads_cumulative": 9793, - "average_downloads_per_article": 9793, + "downloads_6_weeks": 124, + "downloads_12_months": 923, + "downloads_cumulative": 9794, + "average_downloads_per_article": 9794, "average_citations_per_article": 179 }, { @@ -23,10 +23,10 @@ "publication_count": 5, "citation_count": 196, "available_for_download": 1, - "downloads_6_weeks": 123, - "downloads_12_months": 922, - "downloads_cumulative": 9793, - "average_downloads_per_article": 9793, + "downloads_6_weeks": 124, + "downloads_12_months": 923, + "downloads_cumulative": 9794, + "average_downloads_per_article": 9794, "average_citations_per_article": 39.2 }, { @@ -36,15 +36,25 @@ "publication_count": 2, 
"citation_count": 188, "available_for_download": 2, - "downloads_6_weeks": 123, - "downloads_12_months": 927, - "downloads_cumulative": 10024, - "average_downloads_per_article": 5012, + "downloads_6_weeks": 124, + "downloads_12_months": 928, + "downloads_cumulative": 10025, + "average_downloads_per_article": 5012.5, "average_citations_per_article": 94 } ], "publication": { - "title": "Journal of the ACM (JACM) JACM Homepage table of contents archive" + "name": "Journal of the ACM (JACM)", + "volume": 7, + "issue": 4, + "month": "Oct.", + "page_start": 326, + "page_end": 329, + "publication_date": "1960-10-01", + "publisher": "ACM New York, NY, USA", + "issn": "0004-5411", + "eissn": "1557-735X", + "doi": "10.1145/321043.321046" } } { @@ -67,6 +77,15 @@ } ], "publication": { - "title": "IEEE Transactions on Software Engineering table of contents archive" + "name": "IEEE Transactions on Software Engineering", + "volume": 1, + "issue": 1, + "month": "March", + "page_start": 384, + "page_end": 389, + "publication_date": "1975-03-01", + "publisher": "IEEE Press Piscataway, NJ, USA", + "issn": "0098-5589", + "doi": "10.1109/TSE.1975.6312869" } } -- cgit v1.2.3-70-g09d2 From 49cf949250fb9b01a8457c2c3dee60b19f60c036 Mon Sep 17 00:00:00 2001 From: Sam Wilkins Date: Tue, 2 Jul 2019 11:16:00 -0400 Subject: scraping tweaks from last night --- src/scraping/acm/index.js | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'src/scraping/acm/index.js') diff --git a/src/scraping/acm/index.js b/src/scraping/acm/index.js index be844da31..51781dba8 100644 --- a/src/scraping/acm/index.js +++ b/src/scraping/acm/index.js @@ -107,13 +107,13 @@ async function text_of(ref) { return await element.getText(); } -async function text_of_all(ref) { +async function text_of_all(ref, delimiter = undefined) { let elements = await locate(ref, true); let results = []; for (let element of elements) { results.push(await element.getText()); } - return results; + return 
delimiter ? results.join(delimiter) : results; } async function logged_assign(key, value) { @@ -141,7 +141,7 @@ async function read_authors() { i++; } - return all_authors; + return all_authors.map(parse_author); } async function read_publication() { @@ -162,12 +162,12 @@ async function read_publication() { if (element.startsWith("Title")) { publication_module.name = element.substring(6).removeAll(["table of contents", "archive", /\w+ Homepage/]); - } else if (element.startsWith("Volume")) { + } else if (element.startsWith("Volume ")) { let match = location.exec(element); publication_module.volume = parseInt(match[1]); publication_module.issue = parseInt(match[2]); publication_module.month = match[3]; - } else if (element.startsWith("Pages")) { + } else if (element.startsWith("Pages ")) { let match = pages.exec(element); publication_module.page_start = parseInt(match[1]); publication_module.page_end = parseInt(match[2]); @@ -188,7 +188,7 @@ async function read_publication() { // JSON / DASH CONVERSION AND EXPORT -function parse_authors(metadata) { +function parse_author(metadata) { let publicationYears = metadata[1].substring(18).split("-"); author = { name: metadata[0], @@ -246,11 +246,11 @@ async function scrape_targets(error, data) { target = "abstract"; await click_on_acm_tab(target); - logged_assign(target, (await text_of_all("abstract-body")).join(" ")); + logged_assign(target, await text_of_all("abstract-body", " ")); target = "authors"; await click_on_acm_tab(target); - logged_assign(target, (await read_authors()).map(parse_authors)); + logged_assign(target, await read_authors()); target = "publication"; await click_on_acm_tab(target); -- cgit v1.2.3-70-g09d2