diff options
Diffstat (limited to 'src/scraping/acm')
-rw-r--r-- | src/scraping/acm/.gitignore | 2 | ||||
-rwxr-xr-x | src/scraping/acm/chromedriver | bin | 0 -> 10256192 bytes | |||
-rw-r--r-- | src/scraping/acm/index.js | 333 | ||||
-rw-r--r-- | src/scraping/acm/results.txt | 55 |
4 files changed, 305 insertions, 85 deletions
diff --git a/src/scraping/acm/.gitignore b/src/scraping/acm/.gitignore new file mode 100644 index 000000000..caca8b99c --- /dev/null +++ b/src/scraping/acm/.gitignore @@ -0,0 +1,2 @@ +./citations.txt +./results.txt
\ No newline at end of file diff --git a/src/scraping/acm/chromedriver b/src/scraping/acm/chromedriver Binary files differnew file mode 100755 index 000000000..9e9b16717 --- /dev/null +++ b/src/scraping/acm/chromedriver diff --git a/src/scraping/acm/index.js b/src/scraping/acm/index.js index 81f775617..b71d55226 100644 --- a/src/scraping/acm/index.js +++ b/src/scraping/acm/index.js @@ -1,88 +1,279 @@ -const { Builder, By, Key, until } = require('selenium-webdriver'); -const fs = require("fs"); +const { + Builder, + By +} = require('selenium-webdriver'); +const { + readFile, + writeFile +} = require('fs'); -let driver; -fs.readFile("./citations.txt", { encoding: "utf8" }, scrapeTargets); -results = [] +const target_source = './citations.txt'; +const target_browser = 'chrome'; +const target_dist = './results.txt'; -async function scrapeTargets(error, data) { - if (error) { - console.log("\nUnable to collect target citations from a citations.txt file stored in this directory.\nPlease make sure one is provided.") - return; +const driver_pause = 500; // milliseconds +const sample_line_char_max = 100; // characters + +const tab_map = { + abstract: "11", + authors: "14", + references: "15", + cited_by: "16", + index_terms: "17", + publication: "18", + reviews: "19", + comments: "20", + table_of_contents: "21" +}; + +String.prototype.removeAll = function (replacements, trim = true) { + let result = this; + for (let expression of replacements) { + result = result.replace(expression, ""); } + return trim ? result.trim() : result; +}; - driver = await new Builder().forBrowser('chrome').build(); +String.prototype.remove = function (replacement, trim = true) { + let result = this.replace(replacement, ""); + return trim ? result.trim() : result; +}; - let references = data.split("\n").map(entry => entry.replace("\r", "")).filter(line => line.match(/\d+/g)); +Object.prototype.first = function () { + return this[Object.keys(this)[0]]; +}; - let results = [] - let pdfs = [] - for (let id of references) { - let result = {} - let lines = [] - try { - let url = `https://dl.acm.org/citation.cfm?id=${id}`; - await driver.get(url); - await driver.sleep(500) - let candidates = await driver.findElements(By.xpath('.//a[contains(@href, "ft_gateway.cfm?id=")]')); - if (candidates.length > 0) { - pdfs.push(candidates[0]) - } - let webElements = await driver.findElements(By.id("abstract-body")) - for (let el of webElements) { - let text = await el.getText() - lines.push(text) - } - result.url = url - result.abstract = lines.join(" "); - await driver.findElement(By.xpath(`//*[@id="tab-1014-btnInnerEl"]/span`)).click() - await driver.sleep(500) - let authors = await driver.findElement(By.xpath('//*[@id="tabpanel-1009-body"]')).getText() - let sanitize = line => line.length > 0 && !(line.startsWith("No contact information") || line.startsWith("View colleagues of") || line.startsWith("Bibliometrics:")) - authorLines = authors.split("\n").map(line => line.trim()).filter(sanitize) - - let i = 0; - let allAuthors = [] - while (i < authorLines.length) { - let individual = []; - while (!authorLines[i].startsWith("Average citations")) { - individual.push(authorLines[i]) - i++ +// GENERAL UTILITY FUNCTIONS + +function log_read(content) { + process.stdout.write("reading " + content + "..."); +} + +function log_snippet(result, quotes = true) { + let snippet = "failed to create snippet"; + switch (typeof result) { + case "string": + let ellipse = result.length > sample_line_char_max; + let i = sample_line_char_max; + if (ellipse) { + while (result[i] != " " && i < -1) { + i--; } - individual.push(authorLines[i]) - allAuthors.push(individual); - i++ } - result.authors = allAuthors.map(metadata => { - let publicationYears = metadata[1].substring(18).split("-"); - author = { - name: metadata[0], - publication_start: parseInt(publicationYears[0]), - publication_end: parseInt(publicationYears[1]) - }; - for (let count = 2; count < metadata.length; count++) { - let attr = metadata[count] - let char = attr.length - 1; - while (attr[char] != " ") { - char-- + snippet = `${result.substring(0, i + 1).trim()}${ellipse ? "..." : ""}`; + snippet = quotes ? `"${snippet}"` : snippet; + break; + case "object": + if (Array.isArray(result)) { + snippet = result.map(res => { + switch (typeof res) { + case "string": + return res.substring(0, sample_line_char_max / result.length); + case "object": + return res.first(); } - let key = attr.substring(0, char).toLowerCase().replace(/ /g, "_").replace(/[\(\)]/g, ""); - let value = parseFloat(attr.substring(char + 1).replace(/,/g, "")); - author[key] = value - } - return author - }) - } catch (e) { - console.log(e) - await driver.quit(); + }).join(', '); + } else { + snippet = result.first(); + } + } + console.log(snippet); + return result; +} + +// DRIVER UTILITY FUNCTIONS + +async function navigate_to(url) { + await driver.get(url); + await driver.sleep(driver_pause); +} + +async function click_on(ref) { + await (await locate(ref)).click(); + await driver.sleep(driver_pause); +} + +async function click_on_acm_tab(target) { + await click_on(`//*[@id="tab-10${tab_map[target]}-btnInnerEl"]/span`); +} + +async function locate(ref, multiple = false) { + let locator = ref.startsWith("//") ? By.xpath(ref) : By.id(ref); + return await multiple ? driver.findElements(locator) : driver.findElement(locator); +} + +async function text_of(ref) { + let element = await locate(ref); + return await element.getText(); +} + +async function text_of_all(ref, delimiter = undefined) { + let elements = await locate(ref, true); + let results = []; + for (let element of elements) { + results.push(await element.getText()); + } + return delimiter ? results.join(delimiter) : results; +} + +async function logged_assign(key, value) { + log_read(key); + result[key] = log_snippet(value); +} + +// TEXT SCRAPING + +async function read_authors() { + let authors = await text_of('//*[@id="tabpanel-1009-body"]'); + let sanitize = line => line.length > 0 && !(line.startsWith("No contact information") || line.startsWith("View colleagues of") || line.startsWith("Bibliometrics:")); + let author_lines = authors.split("\n").map(line => line.trim()).filter(sanitize); + + let all_authors = []; + let i = 0; + while (i < author_lines.length) { + let individual = []; + while (!author_lines[i].startsWith("Average citations")) { + individual.push(author_lines[i]); + i++; + } + individual.push(author_lines[i]); + all_authors.push(individual); + i++; + } + + return all_authors.map(parse_author); +} + +async function read_publication() { + let publciation_elements = (await text_of("source-body")).split("\n"); + let publication_module = {}; + + let extract = (regex, target, index = 1) => regex.exec(target)[index]; + + for (let element of publciation_elements) { + + let location = /Volume (\d+) Issue (\d+), ([\w.\d]+)/g; + let pages = /(\d+)-(\d+)/g; + let publication_date = /(\d{4}-\d{2}-\d{2})/g; + let publisher = /Publisher (.*)/g; + let issn = /ISSN: (\d{4}-\d{4})/g; + let eissn = /EISSN: ([\dA-Z]{4}-[\dA-Z]{4})/g; + let doi = /doi>([\.\d\/A-Z]+)/g; + + if (element.startsWith("Title")) { + publication_module.name = element.substring(6).removeAll(["table of contents", "archive", /\w+ Homepage/]); + } else if (element.startsWith("Volume ")) { + let match = location.exec(element); + publication_module.volume = parseInt(match[1]); + publication_module.issue = parseInt(match[2]); + publication_module.month = match[3]; + } else if (element.startsWith("Pages ")) { + let match = pages.exec(element); + publication_module.page_start = parseInt(match[1]); + publication_module.page_end = parseInt(match[2]); + } else if (element.startsWith("Publication Date ")) { + publication_module.publication_date = extract(publication_date, element); + } else if (element.startsWith("Publisher ")) { + publication_module.publisher = extract(publisher, element); + } else if (element.startsWith("ISSN: ")) { + publication_module.issn = extract(issn, element); + if (element.includes("EISSN: ")) { + publication_module.eissn = extract(eissn, element); + } + publication_module.doi = extract(doi, element); } - results.push(result) } + return publication_module; +} + +// JSON / DASH CONVERSION AND EXPORT +function parse_author(metadata) { + let publicationYears = metadata[1].substring(18).split("-"); + author = { + name: metadata[0], + publication_start: parseInt(publicationYears[0]), + publication_end: parseInt(publicationYears[1]) + }; + for (let count = 2; count < metadata.length; count++) { + let attr = metadata[count]; + let char = attr.length - 1; + while (attr[char] != " ") { + char--; + } + let key = attr.substring(0, char).toLowerCase().replace(/ /g, "_").remove(/[\(\)]/g); + let value = parseFloat(attr.substring(char + 1).remove(/,/g)); + author[key] = value; + } + return author; +} + +function write_results() { + console.log(); let output = ""; results.forEach(res => output += (JSON.stringify(res, null, 4) + "\n")); - fs.writeFile("./results.txt", output, function errorHandler(exception) { console.log(exception || "results successfully written") }) + writeFile(target_dist, output, function errorHandler(exception) { + console.log(exception || "scraped references successfully written as JSON to ./results.txt"); + }); +} + +async function scrape_targets(error, data) { + if (error) { + console.log("\nUnable to collect target citations from a citations.txt file stored in this directory.\nPlease make sure one is provided."); + return; + } + + let references = data.split("\n").map(entry => entry.removeAll(["\r"])).filter(line => line.match(/\d+/g)); + let quota = references.length; + log_snippet(`found ${quota} references to scrape`, false); + + driver = await new Builder().forBrowser(target_browser).build(); + + for (let i = 0; i < quota; i++) { + try { + result = {}; + let target; + + let id = references[i]; + let url = `https://dl.acm.org/citation.cfm?id=${id}`; + console.log(`\nscraping ${i + 1}/${quota} (${id})`); + + await navigate_to(url); + + logged_assign("url", url); + logged_assign("title", await text_of('//*[@id="divmain"]/div/h1')); + + target = "abstract"; + await click_on_acm_tab(target); + logged_assign(target, await text_of_all("abstract-body", " ")); + + target = "authors"; + await click_on_acm_tab(target); + logged_assign(target, await read_authors()); + + target = "publication"; + await click_on_acm_tab(target); + logged_assign(target, await read_publication()); + } catch (e) { + console.log(e); + await driver.quit(); + } + + results.push(result); + } + + write_results(); await driver.quit(); -}
\ No newline at end of file +} + +let driver; +let results = []; +let result = {}; + +log_read("target references"); + +readFile(target_source, { + encoding: "utf8" +}, scrape_targets); diff --git a/src/scraping/acm/results.txt b/src/scraping/acm/results.txt index 05bb2be8b..a15da8b10 100644 --- a/src/scraping/acm/results.txt +++ b/src/scraping/acm/results.txt @@ -1,5 +1,6 @@ { "url": "https://dl.acm.org/citation.cfm?id=321046", + "title": "Integer Programming Formulation of Traveling Salesman Problems", "abstract": "It has been observed by many people that a striking number of quite diverse mathematical problems can be formulated as problems in integer programming, that is, linear programming problems in which some or all of the variables are required to assume integral values. This fact is rendered quite interesting by recent research on such problems, notably by R. E. Gomory [2, 3], which gives promise of yielding efficient computational techniques for their solution. The present paper provides yet another example of the versatility of integer programming as a mathematical modeling device by representing a generalization of the well-known “Travelling Salesman Problem” in integer programming terms. The authors have developed several such models, of which the one presented here is the most efficient in terms of generality, number of variables, and number of constraints. This model is due to the second author [4] and was presented briefly at the Symposium on Combinatorial Problems held at Princeton University, April 1960, sponsored by SIAM and IBM. The problem treated is: (1) A salesman is required to visit each of n cities, indexed by 1, … , n. He leaves from a “base city” indexed by 0, visits each of the n other cities exactly once, and returns to city 0. During his travels he must return to 0 exactly t times, including his final return (here t may be allowed to vary), and he must visit no more than p cities in one tour. (By a tour we mean a succession of visits to cities without stopping at city 0.) It is required to find such an itinerary which minimizes the total distance traveled by the salesman. Note that if t is fixed, then for the problem to have a solution we must have tp ≧ n. For t = 1, p ≧ n, we have the standard traveling salesman problem. Let dij (i ≠ j = 0, 1, … , n) be the distance covered in traveling from city i to city j. The following integer programming problem will be shown to be equivalent to (1): (2) Minimize the linear form ∑0≦i≠j≦n∑ dijxij over the set determined by the relations ∑ni=0i≠j xij = 1 (j = 1, … , n) ∑nj=0j≠i xij = 1 (i = 1, … , n) ui - uj + pxij ≦ p - 1 (1 ≦ i ≠ j ≦ n) where the xij are non-negative integers and the ui (i = 1, …, n) are arbitrary real numbers. (We shall see that it is permissible to restrict the ui to be non-negative integers as well.) If t is fixed it is necessary to add the additional relation: ∑nu=1 xi0 = t Note that the constraints require that xij = 0 or 1, so that a natural correspondence between these two problems exists if the xij are interpreted as follows: The salesman proceeds from city i to city j if and only if xij = 1. Under this correspondence the form to be minimized in (2) is the total distance to be traveled by the salesman in (1), so the burden of proof is to show that the two feasible sets correspond; i.e., a feasible solution to (2) has xij which do define a legitimate itinerary in (1), and, conversely a legitimate itinerary in (1) defines xij, which, together with appropriate ui, satisfy the constraints of (2). Consider a feasible solution to (2). The number of returns to city 0 is given by ∑ni=1 xi0. The constraints of the form ∑ xij = 1, all xij non-negative integers, represent the conditions that each city (other than zero) is visited exactly once. The ui play a role similar to node potentials in a network and the inequalities involving them serve to eliminate tours that do not begin and end at city 0 and tours that visit more than p cities. Consider any xr0r1 = 1 (r1 ≠ 0). There exists a unique r2 such that xr1r2 = 1. Unless r2 = 0, there is a unique r3 with xr2r3 = 1. We proceed in this fashion until some rj = 0. This must happen since the alternative is that at some point we reach an rk = rj, j + 1 < k. Since none of the r's are zero we have uri - uri + 1 + pxriri + 1 ≦ p - 1 or uri - uri + 1 ≦ - 1. Summing from i = j to k - 1, we have urj - urk = 0 ≦ j + 1 - k, which is a contradiction. Thus all tours include city 0. It remains to observe that no tours is of length greater than p. Suppose such a tour exists, x0r1 , xr1r2 , … , xrprp+1 = 1 with all ri ≠ 0. Then, as before, ur1 - urp+1 ≦ - p or urp+1 - ur1 ≧ p. But we have urp+1 - ur1 + pxrp+1r1 ≦ p - 1 or urp+1 - ur1 ≦ p (1 - xrp+1r1) - 1 ≦ p - 1, which is a contradiction. Conversely, if the xij correspond to a legitimate itinerary, it is clear that the ui can be adjusted so that ui = j if city i is the jth city visited in the tour which includes city i, for we then have ui - uj = - 1 if xij = 1, and always ui - uj ≦ p - 1. The above integer program involves n2 + n constraints (if t is not fixed) in n2 + 2n variables. Since the inequality form of constraint is fundamental for integer programming calculations, one may eliminate 2n variables, say the xi0 and x0j, by means of the equation constraints and produce an equivalent problem with n2 + n inequalities and n2 variables. The currently known integer programming procedures are sufficiently regular in their behavior to cast doubt on the heuristic value of machine experiments with our model. However, it seems appropriate to report the results of the five machine experiments we have conducted so far. The solution procedure used was the all-integer algorithm of R. E. Gomory [3] without the ranking procedure he describes. The first three experiments were simple model verification tests on a four-city standard traveling salesman problem with distance matrix [ 20 23 4 30 7 27 25 5 25 3 21 26 ] The first experiment was with a model, now obsolete, using roughly twice as many constraints and variables as the current model (for this problem, 28 constraints in 21 variables). The machine was halted after 4000 pivot steps had failed to produce a solution. The second experiment used the earlier model with the xi0 and x0j eliminated, resulting in a 28-constraint, 15-variable problem. Here the machine produced the optimal solution in 41 pivot steps. The third experiment used the current formulation with the xi0 and x0j eliminated, yielding 13 constraints and 9 variables. The optimal solution was reached in 7 pivot steps. The fourth and fifth experiments were used on a standard ten-city problem, due to Barachet, solved by Dantzig, Johnson and Fulkerson [1]. The current formulation was used, yielding 91 constraints in 81 variables. The fifth problem differed from the fourth only in that the ordering of the rows was altered to attempt to introduce more favorable pivot choices. In each case the machine was stopped after over 250 pivot steps had failed to produce the solution. In each case the last 100 pivot steps had failed to change the value of the objective function. It seems hopeful that more efficient integer programming procedures now under development will yield a satisfactory algorithmic solution to the traveling salesman problem, when applied to this model. In any case, the model serves to illustrate how problems of this sort may be succinctly formulated in integer programming terms.", "authors": [ { @@ -9,10 +10,10 @@ "publication_count": 1, "citation_count": 179, "available_for_download": 1, - "downloads_6_weeks": 132, - "downloads_12_months": 993, - "downloads_cumulative": 9781, - "average_downloads_per_article": 9781, + "downloads_6_weeks": 124, + "downloads_12_months": 923, + "downloads_cumulative": 9794, + "average_downloads_per_article": 9794, "average_citations_per_article": 179 }, { @@ -22,10 +23,10 @@ "publication_count": 5, "citation_count": 196, "available_for_download": 1, - "downloads_6_weeks": 132, - "downloads_12_months": 993, - "downloads_cumulative": 9781, - "average_downloads_per_article": 9781, + "downloads_6_weeks": 124, + "downloads_12_months": 923, + "downloads_cumulative": 9794, + "average_downloads_per_article": 9794, "average_citations_per_article": 39.2 }, { @@ -35,16 +36,30 @@ "publication_count": 2, "citation_count": 188, "available_for_download": 2, - "downloads_6_weeks": 132, - "downloads_12_months": 998, - "downloads_cumulative": 10012, - "average_downloads_per_article": 5006, + "downloads_6_weeks": 124, + "downloads_12_months": 928, + "downloads_cumulative": 10025, + "average_downloads_per_article": 5012.5, "average_citations_per_article": 94 } - ] + ], + "publication": { + "name": "Journal of the ACM (JACM)", + "volume": 7, + "issue": 4, + "month": "Oct.", + "page_start": 326, + "page_end": 329, + "publication_date": "1960-10-01", + "publisher": "ACM New York, NY, USA", + "issn": "0004-5411", + "eissn": "1557-735X", + "doi": "10.1145/321043.321046" + } } { "url": "https://dl.acm.org/citation.cfm?id=2412979", + "title": "STRUCT programming analysis system", "abstract": "The STRUCT system utilizes the flexibility of a powerful graphics display system to provide a set of tools for program analysis. These tools allow the analysis of the static prograin structure and the dynamic execution behavior. of programs within the entire operating system/user program environment of the Brown University Graphics System (BUGS). Information is collected and presented in a manner which fully exploits two aspects of this environment. First, the operating system has been developed in a well-structured hierarcal manner following principles laid down by other researchers (2), (3). Second the programs under analysis have been written in a structured programming language following coding conventions which make available, at the source code level, valuable program control information. A new set of pictorial constructs is introduced for presenting a. program structure (static or dynamic) for inspection. These constructs combine the best features of an indented structured source code listing and the box odented nature of traditional flow charts. The graphical tools available are USed to provide for swift changes in. the desired level of detail displayed within a program structure, for traveling linearly through a program structure, for traveling through a complex program structure (following subroutine or system calls), for concurrently viewing multiple related program structures, and for presenting dynamic program behavior data using three-dimensional projections, The volume of a three-dimensional box representing a program block is proportional to the block's resource utilization. The scope of this paper is limited to a description of the STRUCT system. This system is currently being used to predict and analyze the performance advantages available through the migration of function (program modules) between levels of software and between software and firmware within BUGS. The results of this research on migration will be included in a doctoral dissertation currently being written.", "authors": [ { @@ -60,5 +75,17 @@ "average_downloads_per_article": 0, "average_citations_per_article": 0 } - ] + ], + "publication": { + "name": "IEEE Transactions on Software Engineering", + "volume": 1, + "issue": 1, + "month": "March", + "page_start": 384, + "page_end": 389, + "publication_date": "1975-03-01", + "publisher": "IEEE Press Piscataway, NJ, USA", + "issn": "0098-5589", + "doi": "10.1109/TSE.1975.6312869" + } } |