about summary refs log tree commit diff
path: root/src/scraping/acm/index.js
diff options
context:
space:
mode:
author Sam Wilkins <samuel_wilkins@brown.edu> 2019-06-25 21:22:29 -0400
committer Sam Wilkins <samuel_wilkins@brown.edu> 2019-06-25 21:22:29 -0400
commit 06bd486c72342b4d979245c9f4051156e6492541 (patch)
tree 3c0a18c8e682ab3c518eaa14369f088025041469 /src/scraping/acm/index.js
parent 4be53f12fffa099f3381813192e60415898183d1 (diff)
scraping progress
Diffstat (limited to 'src/scraping/acm/index.js')
-rw-r--r-- src/scraping/acm/index.js 88
1 files changed, 88 insertions, 0 deletions
diff --git a/src/scraping/acm/index.js b/src/scraping/acm/index.js
new file mode 100644
index 000000000..81f775617
--- /dev/null
+++ b/src/scraping/acm/index.js
@@ -0,0 +1,88 @@
+const { Builder, By, Key, until } = require('selenium-webdriver');
+const fs = require("fs");
+
+// Shared Selenium WebDriver handle; scrapeTargets assigns it once citations.txt has been read.
+let driver;
+// Start the run: scrapeTargets is invoked as the (error, data) callback of this read.
+fs.readFile("./citations.txt", { encoding: "utf8" }, scrapeTargets);
+// Declared with `let` so this is a module-scoped binding rather than an implicit global.
+// NOTE(review): scrapeTargets declares its own local `results`, so this binding is never read — consider removing it.
+let results = [];
+
+/**
+ * Splits the text of the scraped authors panel into one array of lines per author.
+ *
+ * Lines are trimmed; empty lines and lines starting with "No contact information",
+ * "View colleagues of" or "Bibliometrics:" are discarded. A group ends with (and includes)
+ * the first line that starts with "Average citations".
+ *
+ * @param {string} text - Raw text of the authors panel.
+ * @returns {string[][]} One array of lines per author, in page order.
+ */
+function groupAuthorLines(text) {
+    const isRelevant = line => line.length > 0 && !(line.startsWith("No contact information") || line.startsWith("View colleagues of") || line.startsWith("Bibliometrics:"));
+    const authorLines = text.split("\n").map(line => line.trim()).filter(isRelevant);
+
+    const groups = [];
+    let current = [];
+    for (const line of authorLines) {
+        current.push(line);
+        if (line.startsWith("Average citations")) {
+            groups.push(current);
+            current = [];
+        }
+    }
+    // Keep a trailing group that never reached its "Average citations" line instead of
+    // indexing past the end of the array.
+    if (current.length > 0) {
+        groups.push(current);
+    }
+    return groups;
+}
+
+/**
+ * Converts one author's group of lines into an author record.
+ *
+ * metadata[0] is stored as the name. metadata[1] is read as the publication-years line:
+ * everything from character 18 onwards is split on "-" into a start and an end year
+ * (presumably "Publication years" followed by one separator character — TODO confirm against the page).
+ * Each remaining line is split at its last space into a key (lower-cased, spaces replaced by
+ * underscores, parentheses removed) and a numeric value (thousands separators removed).
+ *
+ * @param {string[]} metadata - Lines belonging to a single author.
+ * @returns {Object} Record with name, publication_start, publication_end and one numeric
+ *     property per remaining line. Values that cannot be parsed are NaN.
+ */
+function parseAuthor(metadata) {
+    // A truncated group may have no second line; fall back to "" so both years become NaN.
+    const publicationYears = (metadata[1] || "").substring(18).split("-");
+    const author = {
+        name: metadata[0],
+        publication_start: parseInt(publicationYears[0], 10),
+        publication_end: parseInt(publicationYears[1], 10)
+    };
+    for (const attr of metadata.slice(2)) {
+        const split = attr.lastIndexOf(" ");
+        if (split < 0) {
+            // No space means there is no "label value" pair to extract; skip the line
+            // rather than scanning backwards forever.
+            continue;
+        }
+        const key = attr.substring(0, split).toLowerCase().replace(/ /g, "_").replace(/[\(\)]/g, "");
+        author[key] = parseFloat(attr.substring(split + 1).replace(/,/g, ""));
+    }
+    return author;
+}
+
+/**
+ * fs.readFile callback: visits the ACM citation page of every id found in the file contents,
+ * collects the page URL, abstract text and author records, and writes all results to
+ * ./results.txt as pretty-printed JSON objects separated by newlines.
+ *
+ * A failure while scraping one id is logged and whatever was gathered for that id is still
+ * written; the browser session is kept open for the remaining ids and closed exactly once
+ * when the run ends.
+ *
+ * @param {?Error} error - Read error passed by fs.readFile, if any.
+ * @param {string} data - Contents of citations.txt; lines containing at least one digit are used as ids.
+ * @returns {Promise<void>}
+ */
+async function scrapeTargets(error, data) {
+    if (error) {
+        console.log("\nUnable to collect target citations from a citations.txt file stored in this directory.\nPlease make sure one is provided.");
+        return;
+    }
+
+    driver = await new Builder().forBrowser('chrome').build();
+
+    try {
+        // Strip a carriage return from each line and keep only lines that contain a digit.
+        const references = data.split("\n").map(entry => entry.replace("\r", "")).filter(line => line.match(/\d+/g));
+
+        const results = [];
+        for (const id of references) {
+            const result = {};
+            try {
+                const url = `https://dl.acm.org/citation.cfm?id=${id}`;
+                await driver.get(url);
+                await driver.sleep(500);
+
+                const lines = [];
+                const webElements = await driver.findElements(By.id("abstract-body"));
+                for (const el of webElements) {
+                    lines.push(await el.getText());
+                }
+                result.url = url;
+                result.abstract = lines.join(" ");
+
+                // Presumably switches to the authors tab before its panel is read — the element ids are hard-coded; TODO confirm they are stable.
+                await driver.findElement(By.xpath(`//*[@id="tab-1014-btnInnerEl"]/span`)).click();
+                await driver.sleep(500);
+                const authors = await driver.findElement(By.xpath('//*[@id="tabpanel-1009-body"]')).getText();
+                result.authors = groupAuthorLines(authors).map(parseAuthor);
+            } catch (e) {
+                // Log and move on: quitting the driver here would make every later id fail.
+                console.log(e);
+            }
+            results.push(result);
+        }
+
+        let output = "";
+        results.forEach(res => output += (JSON.stringify(res, null, 4) + "\n"));
+
+        fs.writeFile("./results.txt", output, function errorHandler(exception) { console.log(exception || "results successfully written") });
+    } finally {
+        // Always release the browser, even if something above throws unexpectedly.
+        await driver.quit();
+    }
+} \ No newline at end of file