edits made

author: Monika <monika_hedman@brown.edu> 2019-07-01 15:39:37 -0400
committer: Monika <monika_hedman@brown.edu> 2019-07-01 15:39:37 -0400
commit: c5391bf0d352f69211ba28fc263d27204d7b8dc4 (patch)
tree: acbc632aff852e74015bd1c8eee4933a7565cb94 /src/scraping/acm/index.js
parent: ee4155a168dadad182719eb55df3459d6a937a45 (diff)
parent: cd78d5b3371d73cc51dcb6f3dbfdca1a3bbcf6e1 (diff)
1 files changed, 187 insertions, 0 deletions
diff --git a/src/scraping/acm/index.js b/src/scraping/acm/index.js
new file mode 100644
index 000000000..ff4b099e7
--- /dev/null
+++ b/src/scraping/acm/index.js
@@ -0,0 +1,187 @@
+const {
+    Builder,
+    By
+} = require('selenium-webdriver');
+const {
+    readFile,
+    writeFile
+} = require('fs');
+
+// Delay passed to driver.sleep() after each navigation and click.
+const driver_pause = 500; // milliseconds
+// Length limit used by log_snippet when shortening logged values.
+const sample_line_char_max = 100; // characters
+// Browser name handed to the selenium Builder via forBrowser().
+const target_browser = 'chrome';
+
+// GENERAL UTILITY FUNCTIONS
+
+/**
+ * Writes a `reading <content>...` progress prefix to stdout.
+ * No newline is appended.
+ */
+function log_read(content) {
+    const message = "reading " + content + "...";
+    process.stdout.write(message);
+}
+
+/**
+ * Logs a short one-line preview of a value and returns the value unchanged,
+ * so the call can be used inline where the value itself is needed.
+ *
+ * - Strings longer than sample_line_char_max are cut back to the last space
+ *   at or before that limit and suffixed with "...".
+ * - Arrays are previewed as a comma-separated list: string entries are
+ *   shortened to an equal share of the limit, object entries contribute the
+ *   value stored under their first key.
+ * - Anything else is logged as "failed to create snippet".
+ *
+ * @param {*} result value to preview
+ * @param {boolean} [quotes=true] wrap string previews in double quotes
+ * @returns {*} the input value, untouched
+ */
+function log_snippet(result, quotes = true) {
+    let snippet = "failed to create snippet";
+    if (typeof result === "string") {
+        const ellipse = result.length > sample_line_char_max;
+        let cut = sample_line_char_max;
+        if (ellipse) {
+            // Walk back to a word boundary so the preview does not end mid-word.
+            // (The previous guard `i < -1` was never true, so this never ran.)
+            while (cut > 0 && result[cut] !== " ") {
+                cut--;
+            }
+            // No usable space in range: fall back to a hard cut at the limit.
+            if (cut === 0) {
+                cut = sample_line_char_max;
+            }
+        }
+        snippet = `${result.substring(0, cut + 1).trim()}${ellipse ? "..." : ""}`;
+        snippet = quotes ? `"${snippet}"` : snippet;
+    } else if (Array.isArray(result)) {
+        // typeof null and typeof {} are also "object"; only arrays can be mapped.
+        const share = sample_line_char_max / result.length;
+        snippet = result.map(res => {
+            if (typeof res === "string") {
+                return res.substring(0, share);
+            }
+            if (res !== null && typeof res === "object") {
+                return res[Object.keys(res)[0]];
+            }
+            return undefined; // joined as an empty entry
+        }).join(', ');
+    }
+    console.log(snippet);
+    return result;
+}
+
+// DRIVER UTILITY FUNCTIONS
+
+/**
+ * Loads `url` in the shared driver, then sleeps for driver_pause
+ * milliseconds before resolving.
+ */
+async function navigate_to(url) {
+    const settle = () => driver.sleep(driver_pause);
+    await driver.get(url);
+    await settle();
+}
+
+/**
+ * Resolves `ref` to a single element via locate(), clicks it, then sleeps
+ * for driver_pause milliseconds.
+ */
+async function click_on(ref) {
+    const target = await locate(ref);
+    await target.click();
+    await driver.sleep(driver_pause);
+}
+
+/**
+ * Finds element(s) through the shared driver.
+ *
+ * @param {string} ref an XPath when it starts with "//", otherwise an element id
+ * @param {boolean} [multiple=false] when true, resolve to every match
+ *        (driver.findElements) instead of a single one (driver.findElement)
+ * @returns {Promise<*>} whatever the chosen driver lookup resolves to
+ */
+async function locate(ref, multiple = false) {
+    const locator = ref.startsWith("//") ? By.xpath(ref) : By.id(ref);
+    // `await multiple ? a : b` parses as `(await multiple) ? a : b`; branch
+    // explicitly so the await applies to the driver lookup itself.
+    if (multiple) {
+        return await driver.findElements(locator);
+    }
+    return await driver.findElement(locator);
+}
+
+/**
+ * Resolves to the text of the single element identified by `ref`
+ * (see locate() for how `ref` is interpreted).
+ */
+async function text_of(ref) {
+    return await (await locate(ref)).getText();
+}
+
+/**
+ * Resolves to an array with the text of every element matching `ref`,
+ * in the order locate() returned them. Texts are fetched one at a time.
+ */
+async function text_of_all(ref) {
+    const matches = await locate(ref, true);
+    const texts = [];
+    for (let idx = 0; idx < matches.length; idx++) {
+        const text = await matches[idx].getText();
+        texts.push(text);
+    }
+    return texts;
+}
+
+/**
+ * Announces `key` with log_read(), previews `value` with log_snippet(), and
+ * stores the value under `key` on the shared `result` object.
+ */
+async function logged_assign(key, value) {
+    log_read(key);
+    const echoed = log_snippet(value);
+    result[key] = echoed;
+}
+
+// TEXT SCRAPING
+
+/**
+ * Opens the authors tab and splits its text into one group of lines per
+ * author.
+ *
+ * Lines are trimmed; empty lines and lines starting with one of the ignored
+ * prefixes are dropped. A group ends with (and includes) the line starting
+ * with "Average citations".
+ *
+ * @returns {Promise<string[][]>} one array of lines per author
+ */
+async function read_authors() {
+    await click_on('//*[@id="tab-1014-btnInnerEl"]/span');
+
+    const authors = await text_of('//*[@id="tabpanel-1009-body"]');
+    const ignored_prefixes = ["No contact information", "View colleagues of", "Bibliometrics:"];
+    const author_lines = authors
+        .split("\n")
+        .map(line => line.trim())
+        .filter(line => line.length > 0 && !ignored_prefixes.some(prefix => line.startsWith(prefix)));
+
+    const all_authors = [];
+    let individual = [];
+    for (const line of author_lines) {
+        individual.push(line);
+        if (line.startsWith("Average citations")) {
+            all_authors.push(individual);
+            individual = [];
+        }
+    }
+    // A trailing group without an "Average citations" line used to run the
+    // scan past the end of the array and throw; keep what was collected.
+    if (individual.length > 0) {
+        all_authors.push(individual);
+    }
+
+    return all_authors;
+}
+
+// JSON / DASH CONVERSION AND EXPORT
+
+/**
+ * Converts one author's group of lines (as produced by read_authors) into a
+ * flat object.
+ *
+ * metadata[0] is the name; metadata[1] holds a "<start>-<end>" year range
+ * after an 18-character label (presumably "Publication years " - confirm
+ * against the page). Every later line is read as "<label> <number>": the
+ * label is lower-cased, spaces become underscores, parentheses are removed,
+ * and the number is parsed with thousands separators stripped.
+ *
+ * @param {string[]} metadata lines describing a single author
+ * @returns {Object} author record; numeric fields are NaN when unparsable
+ */
+function parse_authors(metadata) {
+    // Tolerate a missing year line rather than throwing on undefined.
+    const publicationYears = (metadata[1] || "").substring(18).split("-");
+    // Declared locally: the bare assignment used before created a global.
+    const author = {
+        name: metadata[0],
+        publication_start: parseInt(publicationYears[0], 10),
+        publication_end: parseInt(publicationYears[1], 10)
+    };
+    for (const attr of metadata.slice(2)) {
+        const split = attr.lastIndexOf(" ");
+        if (split === -1) {
+            // No "<label> <number>" shape; the manual backwards scan used
+            // before never terminated on such a line.
+            continue;
+        }
+        const key = attr.substring(0, split).toLowerCase().replace(/ /g, "_").replace(/[\(\)]/g, "");
+        const value = parseFloat(attr.substring(split + 1).replace(/,/g, ""));
+        author[key] = value;
+    }
+    return author;
+}
+
+/**
+ * Serializes every entry of the shared `results` array as 4-space-indented
+ * JSON, one entry after another, and writes the text to ./results.txt.
+ * The write callback logs the error if there is one, otherwise a success
+ * message.
+ */
+function write_results() {
+    console.log();
+    const output = results
+        .map(res => JSON.stringify(res, null, 4) + "\n")
+        .join("");
+
+    writeFile("./results.txt", output, exception => {
+        console.log(exception || "scraped references successfully written as JSON to ./results.txt\n");
+    });
+}
+
+/**
+ * readFile callback that drives the whole scrape.
+ *
+ * Keeps every line of `data` that contains a digit as a reference id, opens
+ * one browser session, and for each id loads its citation page and collects
+ * url, title, abstract and authors into the shared `result` object, which is
+ * then appended to `results`. A failure on one reference is logged and
+ * whatever was collected for it so far is still appended; the remaining
+ * references are still attempted. Finally the results are written out and
+ * the browser session is closed.
+ *
+ * @param {?Error} error read error, if citations.txt could not be read
+ * @param {string} data contents of citations.txt
+ */
+async function scrape_targets(error, data) {
+    if (error) {
+        console.log("\nUnable to collect target citations from a citations.txt file stored in this directory.\nPlease make sure one is provided.");
+        return;
+    }
+
+    const references = data.split("\n").map(entry => entry.replace("\r", "")).filter(line => line.match(/\d+/g));
+    const quota = references.length;
+    log_snippet(`found ${quota} references to scrape`, false);
+
+    driver = await new Builder().forBrowser(target_browser).build();
+
+    try {
+        for (let i = 0; i < quota; i++) {
+            result = {};
+            try {
+                const id = references[i];
+                const url = `https://dl.acm.org/citation.cfm?id=${id}`;
+                console.log(`\nscraping ${i + 1}/${quota} (${id})`);
+                await navigate_to(url);
+
+                await logged_assign("url", url);
+                await logged_assign("title", await text_of('//*[@id="divmain"]/div/h1'));
+                await logged_assign("abstract", (await text_of_all("abstract-body")).join(" "));
+                await logged_assign("authors", (await read_authors()).map(parse_authors));
+            } catch (e) {
+                // Log and move on. Quitting the driver here (as before) made
+                // every later reference fail against a closed session.
+                console.log(e);
+            }
+
+            results.push(result);
+        }
+
+        write_results();
+    } finally {
+        // Close the browser exactly once, whatever happened above.
+        await driver.quit();
+    }
+}
+
+// Shared state assigned by scrape_targets and read by the helper functions.
+let driver;
+let results = [];
+let result = {};
+
+// Entry point: read the reference list, then hand it to scrape_targets.
+log_read("target references");
+
+readFile("./citations.txt", "utf8", scrape_targets);
+\ No newline at end of file
author	Monika <monika_hedman@brown.edu>	2019-07-01 15:39:37 -0400
committer	Monika <monika_hedman@brown.edu>	2019-07-01 15:39:37 -0400
commit	c5391bf0d352f69211ba28fc263d27204d7b8dc4 (patch)
tree	acbc632aff852e74015bd1c8eee4933a7565cb94 /src/scraping/acm/index.js
parent	ee4155a168dadad182719eb55df3459d6a937a45 (diff)
parent	cd78d5b3371d73cc51dcb6f3dbfdca1a3bbcf6e1 (diff)