diff options
Diffstat (limited to 'src/scraping/acm/index.js')
-rw-r--r-- | src/scraping/acm/index.js | 333 |
1 files changed, 262 insertions, 71 deletions
diff --git a/src/scraping/acm/index.js b/src/scraping/acm/index.js index 81f775617..b71d55226 100644 --- a/src/scraping/acm/index.js +++ b/src/scraping/acm/index.js @@ -1,88 +1,279 @@ -const { Builder, By, Key, until } = require('selenium-webdriver'); -const fs = require("fs"); +const { + Builder, + By +} = require('selenium-webdriver'); +const { + readFile, + writeFile +} = require('fs'); -let driver; -fs.readFile("./citations.txt", { encoding: "utf8" }, scrapeTargets); -results = [] +const target_source = './citations.txt'; +const target_browser = 'chrome'; +const target_dist = './results.txt'; -async function scrapeTargets(error, data) { - if (error) { - console.log("\nUnable to collect target citations from a citations.txt file stored in this directory.\nPlease make sure one is provided.") - return; +const driver_pause = 500; // milliseconds +const sample_line_char_max = 100; // characters + +const tab_map = { + abstract: "11", + authors: "14", + references: "15", + cited_by: "16", + index_terms: "17", + publication: "18", + reviews: "19", + comments: "20", + table_of_contents: "21" +}; + +String.prototype.removeAll = function (replacements, trim = true) { + let result = this; + for (let expression of replacements) { + result = result.replace(expression, ""); } + return trim ? result.trim() : result; +}; - driver = await new Builder().forBrowser('chrome').build(); +String.prototype.remove = function (replacement, trim = true) { + let result = this.replace(replacement, ""); + return trim ? result.trim() : result; +}; - let references = data.split("\n").map(entry => entry.replace("\r", "")).filter(line => line.match(/\d+/g)); +Object.prototype.first = function () { + return this[Object.keys(this)[0]]; +}; - let results = [] - let pdfs = [] - for (let id of references) { - let result = {} - let lines = [] - try { - let url = `https://dl.acm.org/citation.cfm?id=${id}`; - await driver.get(url); - await driver.sleep(500) - let candidates = await driver.findElements(By.xpath('.//a[contains(@href, "ft_gateway.cfm?id=")]')); - if (candidates.length > 0) { - pdfs.push(candidates[0]) - } - let webElements = await driver.findElements(By.id("abstract-body")) - for (let el of webElements) { - let text = await el.getText() - lines.push(text) - } - result.url = url - result.abstract = lines.join(" "); - await driver.findElement(By.xpath(`//*[@id="tab-1014-btnInnerEl"]/span`)).click() - await driver.sleep(500) - let authors = await driver.findElement(By.xpath('//*[@id="tabpanel-1009-body"]')).getText() - let sanitize = line => line.length > 0 && !(line.startsWith("No contact information") || line.startsWith("View colleagues of") || line.startsWith("Bibliometrics:")) - authorLines = authors.split("\n").map(line => line.trim()).filter(sanitize) - - let i = 0; - let allAuthors = [] - while (i < authorLines.length) { - let individual = []; - while (!authorLines[i].startsWith("Average citations")) { - individual.push(authorLines[i]) - i++ +// GENERAL UTILITY FUNCTIONS + +function log_read(content) { + process.stdout.write("reading " + content + "..."); +} + +function log_snippet(result, quotes = true) { + let snippet = "failed to create snippet"; + switch (typeof result) { + case "string": + let ellipse = result.length > sample_line_char_max; + let i = sample_line_char_max; + if (ellipse) { + while (result[i] != " " && i < -1) { + i--; } - individual.push(authorLines[i]) - allAuthors.push(individual); - i++ } - result.authors = allAuthors.map(metadata => { - let publicationYears = metadata[1].substring(18).split("-"); - author = { - name: metadata[0], - publication_start: parseInt(publicationYears[0]), - publication_end: parseInt(publicationYears[1]) - }; - for (let count = 2; count < metadata.length; count++) { - let attr = metadata[count] - let char = attr.length - 1; - while (attr[char] != " ") { - char-- + snippet = `${result.substring(0, i + 1).trim()}${ellipse ? "..." : ""}`; + snippet = quotes ? `"${snippet}"` : snippet; + break; + case "object": + if (Array.isArray(result)) { + snippet = result.map(res => { + switch (typeof res) { + case "string": + return res.substring(0, sample_line_char_max / result.length); + case "object": + return res.first(); } - let key = attr.substring(0, char).toLowerCase().replace(/ /g, "_").replace(/[\(\)]/g, ""); - let value = parseFloat(attr.substring(char + 1).replace(/,/g, "")); - author[key] = value - } - return author - }) - } catch (e) { - console.log(e) - await driver.quit(); + }).join(', '); + } else { + snippet = result.first(); + } + } + console.log(snippet); + return result; +} + +// DRIVER UTILITY FUNCTIONS + +async function navigate_to(url) { + await driver.get(url); + await driver.sleep(driver_pause); +} + +async function click_on(ref) { + await (await locate(ref)).click(); + await driver.sleep(driver_pause); +} + +async function click_on_acm_tab(target) { + await click_on(`//*[@id="tab-10${tab_map[target]}-btnInnerEl"]/span`); +} + +async function locate(ref, multiple = false) { + let locator = ref.startsWith("//") ? By.xpath(ref) : By.id(ref); + return await multiple ? driver.findElements(locator) : driver.findElement(locator); +} + +async function text_of(ref) { + let element = await locate(ref); + return await element.getText(); +} + +async function text_of_all(ref, delimiter = undefined) { + let elements = await locate(ref, true); + let results = []; + for (let element of elements) { + results.push(await element.getText()); + } + return delimiter ? results.join(delimiter) : results; +} + +async function logged_assign(key, value) { + log_read(key); + result[key] = log_snippet(value); +} + +// TEXT SCRAPING + +async function read_authors() { + let authors = await text_of('//*[@id="tabpanel-1009-body"]'); + let sanitize = line => line.length > 0 && !(line.startsWith("No contact information") || line.startsWith("View colleagues of") || line.startsWith("Bibliometrics:")); + let author_lines = authors.split("\n").map(line => line.trim()).filter(sanitize); + + let all_authors = []; + let i = 0; + while (i < author_lines.length) { + let individual = []; + while (!author_lines[i].startsWith("Average citations")) { + individual.push(author_lines[i]); + i++; + } + individual.push(author_lines[i]); + all_authors.push(individual); + i++; + } + + return all_authors.map(parse_author); +} + +async function read_publication() { + let publciation_elements = (await text_of("source-body")).split("\n"); + let publication_module = {}; + + let extract = (regex, target, index = 1) => regex.exec(target)[index]; + + for (let element of publciation_elements) { + + let location = /Volume (\d+) Issue (\d+), ([\w.\d]+)/g; + let pages = /(\d+)-(\d+)/g; + let publication_date = /(\d{4}-\d{2}-\d{2})/g; + let publisher = /Publisher (.*)/g; + let issn = /ISSN: (\d{4}-\d{4})/g; + let eissn = /EISSN: ([\dA-Z]{4}-[\dA-Z]{4})/g; + let doi = /doi>([\.\d\/A-Z]+)/g; + + if (element.startsWith("Title")) { + publication_module.name = element.substring(6).removeAll(["table of contents", "archive", /\w+ Homepage/]); + } else if (element.startsWith("Volume ")) { + let match = location.exec(element); + publication_module.volume = parseInt(match[1]); + publication_module.issue = parseInt(match[2]); + publication_module.month = match[3]; + } else if (element.startsWith("Pages ")) { + let match = pages.exec(element); + publication_module.page_start = parseInt(match[1]); + publication_module.page_end = parseInt(match[2]); + } else if (element.startsWith("Publication Date ")) { + publication_module.publication_date = extract(publication_date, element); + } else if (element.startsWith("Publisher ")) { + publication_module.publisher = extract(publisher, element); + } else if (element.startsWith("ISSN: ")) { + publication_module.issn = extract(issn, element); + if (element.includes("EISSN: ")) { + publication_module.eissn = extract(eissn, element); + } + publication_module.doi = extract(doi, element); } - results.push(result) } + return publication_module; +} + +// JSON / DASH CONVERSION AND EXPORT +function parse_author(metadata) { + let publicationYears = metadata[1].substring(18).split("-"); + author = { + name: metadata[0], + publication_start: parseInt(publicationYears[0]), + publication_end: parseInt(publicationYears[1]) + }; + for (let count = 2; count < metadata.length; count++) { + let attr = metadata[count]; + let char = attr.length - 1; + while (attr[char] != " ") { + char--; + } + let key = attr.substring(0, char).toLowerCase().replace(/ /g, "_").remove(/[\(\)]/g); + let value = parseFloat(attr.substring(char + 1).remove(/,/g)); + author[key] = value; + } + return author; +} + +function write_results() { + console.log(); let output = ""; results.forEach(res => output += (JSON.stringify(res, null, 4) + "\n")); - fs.writeFile("./results.txt", output, function errorHandler(exception) { console.log(exception || "results successfully written") }) + writeFile(target_dist, output, function errorHandler(exception) { + console.log(exception || "scraped references successfully written as JSON to ./results.txt"); + }); +} + +async function scrape_targets(error, data) { + if (error) { + console.log("\nUnable to collect target citations from a citations.txt file stored in this directory.\nPlease make sure one is provided."); + return; + } + + let references = data.split("\n").map(entry => entry.removeAll(["\r"])).filter(line => line.match(/\d+/g)); + let quota = references.length; + log_snippet(`found ${quota} references to scrape`, false); + + driver = await new Builder().forBrowser(target_browser).build(); + + for (let i = 0; i < quota; i++) { + try { + result = {}; + let target; + + let id = references[i]; + let url = `https://dl.acm.org/citation.cfm?id=${id}`; + console.log(`\nscraping ${i + 1}/${quota} (${id})`); + + await navigate_to(url); + + logged_assign("url", url); + logged_assign("title", await text_of('//*[@id="divmain"]/div/h1')); + + target = "abstract"; + await click_on_acm_tab(target); + logged_assign(target, await text_of_all("abstract-body", " ")); + + target = "authors"; + await click_on_acm_tab(target); + logged_assign(target, await read_authors()); + + target = "publication"; + await click_on_acm_tab(target); + logged_assign(target, await read_publication()); + } catch (e) { + console.log(e); + await driver.quit(); + } + + results.push(result); + } + + write_results(); await driver.quit(); -}
\ No newline at end of file +} + +let driver; +let results = []; +let result = {}; + +log_read("target references"); + +readFile(target_source, { + encoding: "utf8" +}, scrape_targets); |