diff options
Diffstat (limited to 'src/scraping/acm/index.js')
-rw-r--r-- | src/scraping/acm/index.js | 226 |
1 files changed, 148 insertions, 78 deletions
diff --git a/src/scraping/acm/index.js b/src/scraping/acm/index.js index b7455fe0d..3d5e801be 100644 --- a/src/scraping/acm/index.js +++ b/src/scraping/acm/index.js @@ -1,97 +1,167 @@ const { Builder, - By, - Key, - until + By } = require('selenium-webdriver'); -const fs = require("fs"); +const { + readFile, + writeFile +} = require('fs'); -let driver; -fs.readFile("./citations.txt", { - encoding: "utf8" -}, scrapeTargets); -results = [] +const driver_pause = 500; // milliseconds +const sample_line_char_max = 100; // characters +const target_browser = 'chrome'; + +// GENERAL UTILITY FUNCTIONS + +function log_read(content) { + process.stdout.write("reading " + content + "..."); +} + +function log_snippet(result) { + let ellipse = result.length > sample_line_char_max; + let i = sample_line_char_max; + if (ellipse) { + while (result[i] != " " && i < -1) { + i--; + } + } + console.log(` "${result.substring(0, i + 1).trim()}${ellipse ? "..." : ""}"`); +} + +// DRIVER UTILITY FUNCTIONS + +async function navigate_to(url) { + await driver.get(url); + await driver.sleep(driver_pause); +} + +async function click_on(xpath) { + await driver.findElement(By.xpath(xpath)).click(); + await driver.sleep(driver_pause); +} + +// TEXT SCRAPING + +async function read_title() { + log_read("title"); + let title_el = await driver.findElement(By.xpath('//*[@id="divmain"]/div/h1')); + let title = await title_el.getText(); + log_snippet(title); + return title; +} + +async function read_abstract() { + log_read("abstract"); + let lines = []; + let webElements = await driver.findElements(By.id("abstract-body")); + for (let el of webElements) { + let text = await el.getText(); + lines.push(text); + } + let abstract = lines.join(" "); + log_snippet(abstract); + return abstract; +} + +async function read_authors() { + log_read("authors"); + await click_on('//*[@id="tab-1014-btnInnerEl"]/span'); + + let authors_el = await driver.findElement(By.xpath('//*[@id="tabpanel-1009-body"]')); + let authors = await authors_el.getText(); + let sanitize = line => line.length > 0 && !(line.startsWith("No contact information") || line.startsWith("View colleagues of") || line.startsWith("Bibliometrics:")); + let author_lines = authors.split("\n").map(line => line.trim()).filter(sanitize); + + let all_authors = []; + let i = 0; + while (i < author_lines.length) { + let individual = []; + while (!author_lines[i].startsWith("Average citations")) { + individual.push(author_lines[i]); + i++; + } + individual.push(author_lines[i]); + all_authors.push(individual); + i++; + } -async function scrapeTargets(error, data) { + let multiple = all_authors.length == 1 ? "" : " et al."; + log_snippet(all_authors[0][0] + multiple); + return all_authors; +} + +// JSON / DASH CONVERSION AND EXPORT + +function parse_authors(metadata) { + let publicationYears = metadata[1].substring(18).split("-"); + author = { + name: metadata[0], + publication_start: parseInt(publicationYears[0]), + publication_end: parseInt(publicationYears[1]) + }; + for (let count = 2; count < metadata.length; count++) { + let attr = metadata[count]; + let char = attr.length - 1; + while (attr[char] != " ") { + char--; + } + let key = attr.substring(0, char).toLowerCase().replace(/ /g, "_").replace(/[\(\)]/g, ""); + let value = parseFloat(attr.substring(char + 1).replace(/,/g, "")); + author[key] = value; + } + return author; +} + +function write_results() { + let output = ""; + results.forEach(res => output += (JSON.stringify(res, null, 4) + "\n")); + + writeFile("./results.txt", output, function errorHandler(exception) { + console.log(exception || "scraped references successfully written as JSON to ./results.txt\n"); + }); +} + +async function scrape_targets(error, data) { if (error) { - console.log("\nUnable to collect target citations from a citations.txt file stored in this directory.\nPlease make sure one is provided.") + console.log("\nUnable to collect target citations from a citations.txt file stored in this directory.\nPlease make sure one is provided."); return; } - driver = await new Builder().forBrowser('chrome').build(); - let references = data.split("\n").map(entry => entry.replace("\r", "")).filter(line => line.match(/\d+/g)); + let quota = references.length; + console.log(`${references.join(", ")}\n`); + + driver = await new Builder().forBrowser(target_browser).build(); + + for (let i = 0; i < quota; i++) { + let result = {}; - let results = [] - let pdfs = [] - for (let id of references) { - let result = {} - let lines = [] try { - let url = `https://dl.acm.org/citation.cfm?id=${id}`; - await driver.get(url); - await driver.sleep(500) - let candidates = await driver.findElements(By.xpath('.//a[contains(@href, "ft_gateway.cfm?id=")]')); - if (candidates.length > 0) { - pdfs.push(candidates[0]) - } - let webElements = await driver.findElements(By.id("abstract-body")) - for (let el of webElements) { - let text = await el.getText() - lines.push(text) - } - result.url = url - result.abstract = lines.join(" "); - await driver.findElement(By.xpath(`//*[@id="tab-1014-btnInnerEl"]/span`)).click() - await driver.sleep(500) - let authors = await driver.findElement(By.xpath('//*[@id="tabpanel-1009-body"]')).getText() - let sanitize = line => line.length > 0 && !(line.startsWith("No contact information") || line.startsWith("View colleagues of") || line.startsWith("Bibliometrics:")) - authorLines = authors.split("\n").map(line => line.trim()).filter(sanitize) - - let i = 0; - let allAuthors = [] - while (i < authorLines.length) { - let individual = []; - while (!authorLines[i].startsWith("Average citations")) { - individual.push(authorLines[i]) - i++ - } - individual.push(authorLines[i]) - allAuthors.push(individual); - i++ - } - result.authors = allAuthors.map(metadata => { - let publicationYears = metadata[1].substring(18).split("-"); - author = { - name: metadata[0], - publication_start: parseInt(publicationYears[0]), - publication_end: parseInt(publicationYears[1]) - }; - for (let count = 2; count < metadata.length; count++) { - let attr = metadata[count] - let char = attr.length - 1; - while (attr[char] != " ") { - char-- - } - let key = attr.substring(0, char).toLowerCase().replace(/ /g, "_").replace(/[\(\)]/g, ""); - let value = parseFloat(attr.substring(char + 1).replace(/,/g, "")); - author[key] = value - } - return author - }) + let url = `https://dl.acm.org/citation.cfm?id=${references[i]}`; + await navigate_to(url); + console.log(`scraping ${i + 1}/${quota} (${url})`); + + result.url = url; + result.title = await read_title(); + result.abstract = await read_abstract(); + result.authors = (await read_authors()).map(parse_authors); } catch (e) { - console.log(e) + console.log(e); await driver.quit(); } - results.push(result) - } - let output = ""; - results.forEach(res => output += (JSON.stringify(res, null, 4) + "\n")); + results.push(result); + console.log(); + } - fs.writeFile("./results.txt", output, function errorHandler(exception) { - console.log(exception || "results successfully written") - }) + write_results(); await driver.quit(); -}
\ No newline at end of file +} + +let driver; +let results = []; +console.log("reading references..."); +readFile("./citations.txt", { + encoding: "utf8" +}, scrape_targets);
\ No newline at end of file |