aboutsummaryrefslogtreecommitdiff
path: root/src/scraping/acm
diff options
context:
space:
mode:
authorSam Wilkins <samuel_wilkins@brown.edu>2019-06-25 21:22:29 -0400
committerSam Wilkins <samuel_wilkins@brown.edu>2019-06-25 21:22:29 -0400
commit06bd486c72342b4d979245c9f4051156e6492541 (patch)
tree3c0a18c8e682ab3c518eaa14369f088025041469 /src/scraping/acm
parent4be53f12fffa099f3381813192e60415898183d1 (diff)
scraping progress
Diffstat (limited to 'src/scraping/acm')
-rw-r--r--src/scraping/acm/chromedriver.exebin0 -> 7477760 bytes
-rw-r--r--src/scraping/acm/citations.txt2
-rw-r--r--src/scraping/acm/debug.log38
-rw-r--r--src/scraping/acm/index.js88
-rw-r--r--src/scraping/acm/package.json17
-rw-r--r--src/scraping/acm/results.txt64
6 files changed, 209 insertions, 0 deletions
diff --git a/src/scraping/acm/chromedriver.exe b/src/scraping/acm/chromedriver.exe
new file mode 100644
index 000000000..6a362fd43
--- /dev/null
+++ b/src/scraping/acm/chromedriver.exe
Binary files differ
diff --git a/src/scraping/acm/citations.txt b/src/scraping/acm/citations.txt
new file mode 100644
index 000000000..e5018ddef
--- /dev/null
+++ b/src/scraping/acm/citations.txt
@@ -0,0 +1,2 @@
+321046
+2412979 \ No newline at end of file
diff --git a/src/scraping/acm/debug.log b/src/scraping/acm/debug.log
new file mode 100644
index 000000000..8c0a148f4
--- /dev/null
+++ b/src/scraping/acm/debug.log
@@ -0,0 +1,38 @@
+[0625/170004.768:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/170004.769:ERROR:exception_snapshot_win.cc(98)] thread ID 17604 not found in process
+[0625/171124.644:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/171124.645:ERROR:exception_snapshot_win.cc(98)] thread ID 14348 not found in process
+[0625/171853.989:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/171853.990:ERROR:exception_snapshot_win.cc(98)] thread ID 12080 not found in process
+[0625/171947.744:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/171947.745:ERROR:exception_snapshot_win.cc(98)] thread ID 16160 not found in process
+[0625/172007.424:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/172007.425:ERROR:exception_snapshot_win.cc(98)] thread ID 13472 not found in process
+[0625/172059.353:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/172059.354:ERROR:exception_snapshot_win.cc(98)] thread ID 6396 not found in process
+[0625/172402.795:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/172402.796:ERROR:exception_snapshot_win.cc(98)] thread ID 10720 not found in process
+[0625/172618.850:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/172618.850:ERROR:exception_snapshot_win.cc(98)] thread ID 21136 not found in process
+[0625/172819.875:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/172819.876:ERROR:exception_snapshot_win.cc(98)] thread ID 17624 not found in process
+[0625/172953.674:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/172953.675:ERROR:exception_snapshot_win.cc(98)] thread ID 15180 not found in process
+[0625/173412.182:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/173412.182:ERROR:exception_snapshot_win.cc(98)] thread ID 13952 not found in process
+[0625/173447.806:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/173447.807:ERROR:exception_snapshot_win.cc(98)] thread ID 1572 not found in process
+[0625/173516.188:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/173516.189:ERROR:exception_snapshot_win.cc(98)] thread ID 5472 not found in process
+[0625/173528.446:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/173528.447:ERROR:exception_snapshot_win.cc(98)] thread ID 20420 not found in process
+[0625/173539.436:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/173539.437:ERROR:exception_snapshot_win.cc(98)] thread ID 16192 not found in process
+[0625/173643.139:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/173643.140:ERROR:exception_snapshot_win.cc(98)] thread ID 15716 not found in process
+[0625/173659.376:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/173659.377:ERROR:exception_snapshot_win.cc(98)] thread ID 11828 not found in process
+[0625/201137.209:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/201137.210:ERROR:exception_snapshot_win.cc(98)] thread ID 7688 not found in process
+[0625/210240.476:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/210240.477:ERROR:exception_snapshot_win.cc(98)] thread ID 20828 not found in process
diff --git a/src/scraping/acm/index.js b/src/scraping/acm/index.js
new file mode 100644
index 000000000..81f775617
--- /dev/null
+++ b/src/scraping/acm/index.js
@@ -0,0 +1,88 @@
+const { Builder, By, Key, until } = require('selenium-webdriver');
+const fs = require("fs");
+
+// WebDriver handle shared across the script; it is assigned inside scrapeTargets once the browser has been built.
+let driver;
+// Script entry point: read citations.txt asynchronously and pass (error, data) to scrapeTargets.
+fs.readFile("./citations.txt", { encoding: "utf8" }, scrapeTargets);
+// NOTE(review): assignment without a declaration (an implicit global in sloppy mode). scrapeTargets declares
+// its own local `results`, so this binding is never read anywhere in this file.
+results = []
+
+/**
+ * Callback for `fs.readFile` on citations.txt. Visits the ACM Digital Library
+ * citation page of every id found in `data`, collects the abstract and the
+ * per-author bibliometrics, and writes the records to ./results.txt.
+ *
+ * A failure while scraping one citation is logged and the (possibly partial)
+ * record is still emitted; the browser session is kept alive for the remaining
+ * ids and shut down exactly once when the run ends.
+ *
+ * @param {?Error} error - read error reported by fs.readFile, if any.
+ * @param {string} data - contents of citations.txt, one citation id per line.
+ * @returns {Promise<void>}
+ */
+async function scrapeTargets(error, data) {
+    if (error) {
+        console.log("\nUnable to collect target citations from a citations.txt file stored in this directory.\nPlease make sure one is provided.")
+        return;
+    }
+
+    driver = await new Builder().forBrowser('chrome').build();
+
+    try {
+        // Accept LF and CRLF files alike and drop stray whitespace so it never ends up in the URL.
+        const references = data.split(/\r?\n/).map(line => line.trim()).filter(line => /\d/.test(line));
+
+        const results = [];
+        for (const id of references) {
+            // Filled in step by step so that whatever was scraped before a failure is still reported.
+            const result = {};
+            try {
+                const url = `https://dl.acm.org/citation.cfm?id=${id}`;
+                await driver.get(url);
+                await driver.sleep(500);
+
+                const lines = [];
+                for (const el of await driver.findElements(By.id("abstract-body"))) {
+                    lines.push(await el.getText());
+                }
+                result.url = url;
+                result.abstract = lines.join(" ");
+
+                // Open the authors tab and give the panel time to render before reading it.
+                await driver.findElement(By.xpath(`//*[@id="tab-1014-btnInnerEl"]/span`)).click();
+                await driver.sleep(500);
+                const authors = await driver.findElement(By.xpath('//*[@id="tabpanel-1009-body"]')).getText();
+                const authorLines = authors.split("\n").map(line => line.trim()).filter(isAuthorMetadataLine);
+                result.authors = groupAuthorLines(authorLines).map(metadata => parseAuthor(metadata));
+            } catch (e) {
+                // Do not quit the driver here: the remaining ids still need the session.
+                console.log(e);
+            }
+            results.push(result);
+        }
+
+        let output = "";
+        results.forEach(res => output += (JSON.stringify(res, null, 4) + "\n"));
+
+        fs.writeFile("./results.txt", output, function errorHandler(exception) { console.log(exception || "results successfully written") });
+    } finally {
+        // Single shutdown point, reached on success and on unexpected failure alike.
+        await driver.quit();
+    }
+}
+
+/**
+ * Keeps the non-empty lines of the authors panel, discarding the boilerplate
+ * lines that carry no per-author metadata.
+ */
+function isAuthorMetadataLine(line) {
+    return line.length > 0 && !(line.startsWith("No contact information") || line.startsWith("View colleagues of") || line.startsWith("Bibliometrics:"));
+}
+
+/**
+ * Splits the flat list of panel lines into one array per author. A line that
+ * starts with "Average citations" closes the current author's group. Trailing
+ * lines without such a terminator are kept as a final group instead of being
+ * read past the end of the array.
+ */
+function groupAuthorLines(authorLines) {
+    const groups = [];
+    let current = [];
+    for (const line of authorLines) {
+        current.push(line);
+        if (line.startsWith("Average citations")) {
+            groups.push(current);
+            current = [];
+        }
+    }
+    if (current.length > 0) {
+        groups.push(current);
+    }
+    return groups;
+}
+
+/**
+ * Converts one author's lines into a record: line 0 is the name, line 1 holds
+ * the publication year range, and each remaining line is "<label> <number>".
+ */
+function parseAuthor(metadata) {
+    // Skips the 18-character label in front of the "start-end" range
+    // (presumably "Publication years" plus a separator -- confirm against the page text).
+    const publicationYears = (metadata[1] || "").substring(18).split("-");
+    const author = {
+        name: metadata[0],
+        publication_start: parseInt(publicationYears[0], 10),
+        publication_end: parseInt(publicationYears[1], 10)
+    };
+    for (const attr of metadata.slice(2)) {
+        // The value is the text after the last space; a line without any space has no value to parse.
+        const split = attr.lastIndexOf(" ");
+        if (split < 0) {
+            continue;
+        }
+        const key = attr.substring(0, split).toLowerCase().replace(/ /g, "_").replace(/[\(\)]/g, "");
+        const value = parseFloat(attr.substring(split + 1).replace(/,/g, ""));
+        author[key] = value;
+    }
+    return author;
+} \ No newline at end of file
diff --git a/src/scraping/acm/package.json b/src/scraping/acm/package.json
new file mode 100644
index 000000000..10f4d2156
--- /dev/null
+++ b/src/scraping/acm/package.json
@@ -0,0 +1,17 @@
+{
+ "name": "scraper",
+ "version": "1.0.0",
+ "description": "",
+ "main": "index.js",
+ "scripts": {
+ "test": "echo \"Error: no test specified\" && exit 1"
+ },
+ "keywords": [],
+ "author": "",
+ "license": "ISC",
+ "dependencies": {
+ "axios": "^0.19.0",
+ "cheerio": "^1.0.0-rc.3",
+ "selenium-webdriver": "^4.0.0-alpha.4"
+ }
+}
diff --git a/src/scraping/acm/results.txt b/src/scraping/acm/results.txt
new file mode 100644
index 000000000..05bb2be8b
--- /dev/null
+++ b/src/scraping/acm/results.txt
@@ -0,0 +1,64 @@
+{
+ "url": "https://dl.acm.org/citation.cfm?id=321046",
+ "abstract": "It has been observed by many people that a striking number of quite diverse mathematical problems can be formulated as problems in integer programming, that is, linear programming problems in which some or all of the variables are required to assume integral values. This fact is rendered quite interesting by recent research on such problems, notably by R. E. Gomory [2, 3], which gives promise of yielding efficient computational techniques for their solution. The present paper provides yet another example of the versatility of integer programming as a mathematical modeling device by representing a generalization of the well-known “Travelling Salesman Problem” in integer programming terms. The authors have developed several such models, of which the one presented here is the most efficient in terms of generality, number of variables, and number of constraints. This model is due to the second author [4] and was presented briefly at the Symposium on Combinatorial Problems held at Princeton University, April 1960, sponsored by SIAM and IBM. The problem treated is: (1) A salesman is required to visit each of n cities, indexed by 1, … , n. He leaves from a “base city” indexed by 0, visits each of the n other cities exactly once, and returns to city 0. During his travels he must return to 0 exactly t times, including his final return (here t may be allowed to vary), and he must visit no more than p cities in one tour. (By a tour we mean a succession of visits to cities without stopping at city 0.) It is required to find such an itinerary which minimizes the total distance traveled by the salesman. Note that if t is fixed, then for the problem to have a solution we must have tp ≧ n. For t = 1, p ≧ n, we have the standard traveling salesman problem. Let dij (i ≠ j = 0, 1, … , n) be the distance covered in traveling from city i to city j. 
The following integer programming problem will be shown to be equivalent to (1): (2) Minimize the linear form ∑0≦i≠j≦n∑ dijxij over the set determined by the relations ∑ni=0i≠j xij = 1 (j = 1, … , n) ∑nj=0j≠i xij = 1 (i = 1, … , n) ui - uj + pxij ≦ p - 1 (1 ≦ i ≠ j ≦ n) where the xij are non-negative integers and the ui (i = 1, …, n) are arbitrary real numbers. (We shall see that it is permissible to restrict the ui to be non-negative integers as well.) If t is fixed it is necessary to add the additional relation: ∑nu=1 xi0 = t Note that the constraints require that xij = 0 or 1, so that a natural correspondence between these two problems exists if the xij are interpreted as follows: The salesman proceeds from city i to city j if and only if xij = 1. Under this correspondence the form to be minimized in (2) is the total distance to be traveled by the salesman in (1), so the burden of proof is to show that the two feasible sets correspond; i.e., a feasible solution to (2) has xij which do define a legitimate itinerary in (1), and, conversely a legitimate itinerary in (1) defines xij, which, together with appropriate ui, satisfy the constraints of (2). Consider a feasible solution to (2). The number of returns to city 0 is given by ∑ni=1 xi0. The constraints of the form ∑ xij = 1, all xij non-negative integers, represent the conditions that each city (other than zero) is visited exactly once. The ui play a role similar to node potentials in a network and the inequalities involving them serve to eliminate tours that do not begin and end at city 0 and tours that visit more than p cities. Consider any xr0r1 = 1 (r1 ≠ 0). There exists a unique r2 such that xr1r2 = 1. Unless r2 = 0, there is a unique r3 with xr2r3 = 1. We proceed in this fashion until some rj = 0. This must happen since the alternative is that at some point we reach an rk = rj, j + 1 < k. Since none of the r's are zero we have uri - uri + 1 + pxriri + 1 ≦ p - 1 or uri - uri + 1 ≦ - 1. 
Summing from i = j to k - 1, we have urj - urk = 0 ≦ j + 1 - k, which is a contradiction. Thus all tours include city 0. It remains to observe that no tours is of length greater than p. Suppose such a tour exists, x0r1 , xr1r2 , … , xrprp+1 = 1 with all ri ≠ 0. Then, as before, ur1 - urp+1 ≦ - p or urp+1 - ur1 ≧ p. But we have urp+1 - ur1 + pxrp+1r1 ≦ p - 1 or urp+1 - ur1 ≦ p (1 - xrp+1r1) - 1 ≦ p - 1, which is a contradiction. Conversely, if the xij correspond to a legitimate itinerary, it is clear that the ui can be adjusted so that ui = j if city i is the jth city visited in the tour which includes city i, for we then have ui - uj = - 1 if xij = 1, and always ui - uj ≦ p - 1. The above integer program involves n2 + n constraints (if t is not fixed) in n2 + 2n variables. Since the inequality form of constraint is fundamental for integer programming calculations, one may eliminate 2n variables, say the xi0 and x0j, by means of the equation constraints and produce an equivalent problem with n2 + n inequalities and n2 variables. The currently known integer programming procedures are sufficiently regular in their behavior to cast doubt on the heuristic value of machine experiments with our model. However, it seems appropriate to report the results of the five machine experiments we have conducted so far. The solution procedure used was the all-integer algorithm of R. E. Gomory [3] without the ranking procedure he describes. The first three experiments were simple model verification tests on a four-city standard traveling salesman problem with distance matrix [ 20 23 4 30 7 27 25 5 25 3 21 26 ] The first experiment was with a model, now obsolete, using roughly twice as many constraints and variables as the current model (for this problem, 28 constraints in 21 variables). The machine was halted after 4000 pivot steps had failed to produce a solution. 
The second experiment used the earlier model with the xi0 and x0j eliminated, resulting in a 28-constraint, 15-variable problem. Here the machine produced the optimal solution in 41 pivot steps. The third experiment used the current formulation with the xi0 and x0j eliminated, yielding 13 constraints and 9 variables. The optimal solution was reached in 7 pivot steps. The fourth and fifth experiments were used on a standard ten-city problem, due to Barachet, solved by Dantzig, Johnson and Fulkerson [1]. The current formulation was used, yielding 91 constraints in 81 variables. The fifth problem differed from the fourth only in that the ordering of the rows was altered to attempt to introduce more favorable pivot choices. In each case the machine was stopped after over 250 pivot steps had failed to produce the solution. In each case the last 100 pivot steps had failed to change the value of the objective function. It seems hopeful that more efficient integer programming procedures now under development will yield a satisfactory algorithmic solution to the traveling salesman problem, when applied to this model. In any case, the model serves to illustrate how problems of this sort may be succinctly formulated in integer programming terms.",
+ "authors": [
+ {
+ "name": "C. E. Miller",
+ "publication_start": 1960,
+ "publication_end": 1960,
+ "publication_count": 1,
+ "citation_count": 179,
+ "available_for_download": 1,
+ "downloads_6_weeks": 132,
+ "downloads_12_months": 993,
+ "downloads_cumulative": 9781,
+ "average_downloads_per_article": 9781,
+ "average_citations_per_article": 179
+ },
+ {
+ "name": "A. W. Tucker",
+ "publication_start": 1960,
+ "publication_end": 1993,
+ "publication_count": 5,
+ "citation_count": 196,
+ "available_for_download": 1,
+ "downloads_6_weeks": 132,
+ "downloads_12_months": 993,
+ "downloads_cumulative": 9781,
+ "average_downloads_per_article": 9781,
+ "average_citations_per_article": 39.2
+ },
+ {
+ "name": "R. A. Zemlin",
+ "publication_start": 1960,
+ "publication_end": 1964,
+ "publication_count": 2,
+ "citation_count": 188,
+ "available_for_download": 2,
+ "downloads_6_weeks": 132,
+ "downloads_12_months": 998,
+ "downloads_cumulative": 10012,
+ "average_downloads_per_article": 5006,
+ "average_citations_per_article": 94
+ }
+ ]
+}
+{
+ "url": "https://dl.acm.org/citation.cfm?id=2412979",
+ "abstract": "The STRUCT system utilizes the flexibility of a powerful graphics display system to provide a set of tools for program analysis. These tools allow the analysis of the static prograin structure and the dynamic execution behavior. of programs within the entire operating system/user program environment of the Brown University Graphics System (BUGS). Information is collected and presented in a manner which fully exploits two aspects of this environment. First, the operating system has been developed in a well-structured hierarcal manner following principles laid down by other researchers (2), (3). Second the programs under analysis have been written in a structured programming language following coding conventions which make available, at the source code level, valuable program control information. A new set of pictorial constructs is introduced for presenting a. program structure (static or dynamic) for inspection. These constructs combine the best features of an indented structured source code listing and the box odented nature of traditional flow charts. The graphical tools available are USed to provide for swift changes in. the desired level of detail displayed within a program structure, for traveling linearly through a program structure, for traveling through a complex program structure (following subroutine or system calls), for concurrently viewing multiple related program structures, and for presenting dynamic program behavior data using three-dimensional projections, The volume of a three-dimensional box representing a program block is proportional to the block's resource utilization. The scope of this paper is limited to a description of the STRUCT system. This system is currently being used to predict and analyze the performance advantages available through the migration of function (program modules) between levels of software and between software and firmware within BUGS. 
The results of this research on migration will be included in a doctoral dissertation currently being written.",
+ "authors": [
+ {
+ "name": "Andries Van Dam",
+ "publication_start": 1975,
+ "publication_end": 1975,
+ "publication_count": 1,
+ "citation_count": 0,
+ "available_for_download": 0,
+ "downloads_6_weeks": 8,
+ "downloads_12_months": 97,
+ "downloads_cumulative": 97,
+ "average_downloads_per_article": 0,
+ "average_citations_per_article": 0
+ }
+ ]
+}