aboutsummaryrefslogtreecommitdiff
path: root/src/scraping/acm/index.js
diff options
context:
space:
mode:
Diffstat (limited to 'src/scraping/acm/index.js')
-rw-r--r--src/scraping/acm/index.js226
1 files changed, 148 insertions, 78 deletions
diff --git a/src/scraping/acm/index.js b/src/scraping/acm/index.js
index b7455fe0d..3d5e801be 100644
--- a/src/scraping/acm/index.js
+++ b/src/scraping/acm/index.js
@@ -1,97 +1,167 @@
const {
Builder,
- By,
- Key,
- until
+ By
} = require('selenium-webdriver');
-const fs = require("fs");
+const {
+ readFile,
+ writeFile
+} = require('fs');
-let driver;
-fs.readFile("./citations.txt", {
- encoding: "utf8"
-}, scrapeTargets);
-results = []
+const driver_pause = 500; // milliseconds
+const sample_line_char_max = 100; // characters
+const target_browser = 'chrome';
+
+// GENERAL UTILITY FUNCTIONS
+
+function log_read(content) {
+ process.stdout.write("reading " + content + "...");
+}
+
+function log_snippet(result) {
+ let ellipse = result.length > sample_line_char_max;
+ let i = sample_line_char_max;
+ if (ellipse) {
+ while (result[i] != " " && i < -1) {
+ i--;
+ }
+ }
+ console.log(` "${result.substring(0, i + 1).trim()}${ellipse ? "..." : ""}"`);
+}
+
+// DRIVER UTILITY FUNCTIONS
+
+async function navigate_to(url) {
+ await driver.get(url);
+ await driver.sleep(driver_pause);
+}
+
+async function click_on(xpath) {
+ await driver.findElement(By.xpath(xpath)).click();
+ await driver.sleep(driver_pause);
+}
+
+// TEXT SCRAPING
+
+async function read_title() {
+ log_read("title");
+ let title_el = await driver.findElement(By.xpath('//*[@id="divmain"]/div/h1'));
+ let title = await title_el.getText();
+ log_snippet(title);
+ return title;
+}
+
+async function read_abstract() {
+ log_read("abstract");
+ let lines = [];
+ let webElements = await driver.findElements(By.id("abstract-body"));
+ for (let el of webElements) {
+ let text = await el.getText();
+ lines.push(text);
+ }
+ let abstract = lines.join(" ");
+ log_snippet(abstract);
+ return abstract;
+}
+
+async function read_authors() {
+ log_read("authors");
+ await click_on('//*[@id="tab-1014-btnInnerEl"]/span');
+
+ let authors_el = await driver.findElement(By.xpath('//*[@id="tabpanel-1009-body"]'));
+ let authors = await authors_el.getText();
+ let sanitize = line => line.length > 0 && !(line.startsWith("No contact information") || line.startsWith("View colleagues of") || line.startsWith("Bibliometrics:"));
+ let author_lines = authors.split("\n").map(line => line.trim()).filter(sanitize);
+
+ let all_authors = [];
+ let i = 0;
+ while (i < author_lines.length) {
+ let individual = [];
+ while (!author_lines[i].startsWith("Average citations")) {
+ individual.push(author_lines[i]);
+ i++;
+ }
+ individual.push(author_lines[i]);
+ all_authors.push(individual);
+ i++;
+ }
-async function scrapeTargets(error, data) {
+ let multiple = all_authors.length == 1 ? "" : " et al.";
+ log_snippet(all_authors[0][0] + multiple);
+ return all_authors;
+}
+
+// JSON / DASH CONVERSION AND EXPORT
+
+function parse_authors(metadata) {
+ let publicationYears = metadata[1].substring(18).split("-");
+ author = {
+ name: metadata[0],
+ publication_start: parseInt(publicationYears[0]),
+ publication_end: parseInt(publicationYears[1])
+ };
+ for (let count = 2; count < metadata.length; count++) {
+ let attr = metadata[count];
+ let char = attr.length - 1;
+ while (attr[char] != " ") {
+ char--;
+ }
+ let key = attr.substring(0, char).toLowerCase().replace(/ /g, "_").replace(/[\(\)]/g, "");
+ let value = parseFloat(attr.substring(char + 1).replace(/,/g, ""));
+ author[key] = value;
+ }
+ return author;
+}
+
+function write_results() {
+ let output = "";
+ results.forEach(res => output += (JSON.stringify(res, null, 4) + "\n"));
+
+ writeFile("./results.txt", output, function errorHandler(exception) {
+ console.log(exception || "scraped references successfully written as JSON to ./results.txt\n");
+ });
+}
+
+async function scrape_targets(error, data) {
if (error) {
- console.log("\nUnable to collect target citations from a citations.txt file stored in this directory.\nPlease make sure one is provided.")
+ console.log("\nUnable to collect target citations from a citations.txt file stored in this directory.\nPlease make sure one is provided.");
return;
}
- driver = await new Builder().forBrowser('chrome').build();
-
let references = data.split("\n").map(entry => entry.replace("\r", "")).filter(line => line.match(/\d+/g));
+ let quota = references.length;
+ console.log(`${references.join(", ")}\n`);
+
+ driver = await new Builder().forBrowser(target_browser).build();
+
+ for (let i = 0; i < quota; i++) {
+ let result = {};
- let results = []
- let pdfs = []
- for (let id of references) {
- let result = {}
- let lines = []
try {
- let url = `https://dl.acm.org/citation.cfm?id=${id}`;
- await driver.get(url);
- await driver.sleep(500)
- let candidates = await driver.findElements(By.xpath('.//a[contains(@href, "ft_gateway.cfm?id=")]'));
- if (candidates.length > 0) {
- pdfs.push(candidates[0])
- }
- let webElements = await driver.findElements(By.id("abstract-body"))
- for (let el of webElements) {
- let text = await el.getText()
- lines.push(text)
- }
- result.url = url
- result.abstract = lines.join(" ");
- await driver.findElement(By.xpath(`//*[@id="tab-1014-btnInnerEl"]/span`)).click()
- await driver.sleep(500)
- let authors = await driver.findElement(By.xpath('//*[@id="tabpanel-1009-body"]')).getText()
- let sanitize = line => line.length > 0 && !(line.startsWith("No contact information") || line.startsWith("View colleagues of") || line.startsWith("Bibliometrics:"))
- authorLines = authors.split("\n").map(line => line.trim()).filter(sanitize)
-
- let i = 0;
- let allAuthors = []
- while (i < authorLines.length) {
- let individual = [];
- while (!authorLines[i].startsWith("Average citations")) {
- individual.push(authorLines[i])
- i++
- }
- individual.push(authorLines[i])
- allAuthors.push(individual);
- i++
- }
- result.authors = allAuthors.map(metadata => {
- let publicationYears = metadata[1].substring(18).split("-");
- author = {
- name: metadata[0],
- publication_start: parseInt(publicationYears[0]),
- publication_end: parseInt(publicationYears[1])
- };
- for (let count = 2; count < metadata.length; count++) {
- let attr = metadata[count]
- let char = attr.length - 1;
- while (attr[char] != " ") {
- char--
- }
- let key = attr.substring(0, char).toLowerCase().replace(/ /g, "_").replace(/[\(\)]/g, "");
- let value = parseFloat(attr.substring(char + 1).replace(/,/g, ""));
- author[key] = value
- }
- return author
- })
+ let url = `https://dl.acm.org/citation.cfm?id=${references[i]}`;
+ await navigate_to(url);
+ console.log(`scraping ${i + 1}/${quota} (${url})`);
+
+ result.url = url;
+ result.title = await read_title();
+ result.abstract = await read_abstract();
+ result.authors = (await read_authors()).map(parse_authors);
} catch (e) {
- console.log(e)
+ console.log(e);
await driver.quit();
}
- results.push(result)
- }
- let output = "";
- results.forEach(res => output += (JSON.stringify(res, null, 4) + "\n"));
+ results.push(result);
+ console.log();
+ }
- fs.writeFile("./results.txt", output, function errorHandler(exception) {
- console.log(exception || "results successfully written")
- })
+ write_results();
await driver.quit();
-} \ No newline at end of file
+}
+
+let driver;
+let results = [];
+console.log("reading references...");
+readFile("./citations.txt", {
+ encoding: "utf8"
+}, scrape_targets); \ No newline at end of file