aboutsummaryrefslogtreecommitdiff
path: root/src/scraping/acm/index.js
diff options
context:
space:
mode:
authorBob Zeleznik <zzzman@gmail.com>2019-06-29 09:14:16 -0400
committerBob Zeleznik <zzzman@gmail.com>2019-06-29 09:14:16 -0400
commitb2c6bf39771f288fac10224c0e1c9dcbf4290730 (patch)
tree87ccfa225035fc3103cee91e17ba63c3eb0b8cd7 /src/scraping/acm/index.js
parent824066f6c6842d31c41b4686a6e1a9baae61c492 (diff)
parent5cfd32830586a3e1162ee81538e13d675edb79a7 (diff)
Merge branch 'master' of https://github.com/browngraphicslab/Dash-Web
Diffstat (limited to 'src/scraping/acm/index.js')
-rw-r--r--src/scraping/acm/index.js97
1 files changed, 97 insertions, 0 deletions
diff --git a/src/scraping/acm/index.js b/src/scraping/acm/index.js
new file mode 100644
index 000000000..b7455fe0d
--- /dev/null
+++ b/src/scraping/acm/index.js
@@ -0,0 +1,97 @@
+const {
+ Builder,
+ By,
+ Key,
+ until
+} = require('selenium-webdriver');
+const fs = require("fs");
+
+// WebDriver session used by the scraper; assigned inside scrapeTargets once the
+// citation list has been read successfully.
+let driver;
+
+// Read the citation ids and hand them to the scraper. scrapeTargets is async, so
+// its promise is handled here; otherwise a rejection would surface as an
+// unhandled promise rejection instead of a reported failure.
+fs.readFile("./citations.txt", {
+    encoding: "utf8"
+}, (error, data) => {
+    scrapeTargets(error, data).catch(failure => {
+        console.log(failure);
+        process.exitCode = 1;
+    });
+});
+
+/**
+ * Visits the ACM citation page of every id listed in the citations file, collects
+ * the abstract and per-author metadata, and writes the collected records to
+ * ./results.txt as pretty-printed JSON objects, one after another.
+ *
+ * Written to be used as an fs.readFile callback.
+ *
+ * @param {Error|null} error - read error reported by fs.readFile, if any
+ * @param {string} data - file contents; lines that contain at least one digit are
+ *     used as citation ids
+ * @returns {Promise<void>} settles once the browser session has been closed
+ */
+async function scrapeTargets(error, data) {
+    if (error) {
+        console.log("\nUnable to collect target citations from a citations.txt file stored in this directory.\nPlease make sure one is provided.");
+        return;
+    }
+
+    driver = await new Builder().forBrowser('chrome').build();
+
+    const references = data.split("\n").map(entry => entry.replace("\r", "")).filter(line => line.match(/\d+/g));
+    const results = [];
+
+    try {
+        for (const id of references) {
+            const result = {};
+            try {
+                const url = `https://dl.acm.org/citation.cfm?id=${id}`;
+                await driver.get(url);
+                await driver.sleep(500);
+
+                const lines = [];
+                const webElements = await driver.findElements(By.id("abstract-body"));
+                for (const el of webElements) {
+                    lines.push(await el.getText());
+                }
+                result.url = url;
+                result.abstract = lines.join(" ");
+
+                // NOTE(review): these generated-looking element ids presumably address the
+                // authors tab and its panel - confirm against the live page.
+                await driver.findElement(By.xpath(`//*[@id="tab-1014-btnInnerEl"]/span`)).click();
+                await driver.sleep(500);
+                const authors = await driver.findElement(By.xpath('//*[@id="tabpanel-1009-body"]')).getText();
+                result.authors = groupAuthorLines(authors).map(parseAuthor);
+            } catch (e) {
+                // A failure on one citation must not end the session: log it, keep
+                // whatever was gathered for this id, and move on to the next one.
+                console.log(e);
+            }
+            results.push(result);
+        }
+
+        const output = results.map(res => `${JSON.stringify(res, null, 4)}\n`).join("");
+        fs.writeFile("./results.txt", output, function errorHandler(exception) {
+            console.log(exception || "results successfully written");
+        });
+    } finally {
+        // Close the browser exactly once, whatever happened above.
+        await driver.quit();
+    }
+}
+
+/**
+ * Splits the text of the author panel into one group of lines per author.
+ *
+ * Lines are trimmed; empty lines and boilerplate lines are dropped. A group ends
+ * with the line that starts with "Average citations". Trailing lines that are not
+ * followed by such a terminator are kept as a final group instead of being read
+ * past the end of the list.
+ *
+ * @param {string} panelText - raw text of the author panel
+ * @returns {string[][]} one array of metadata lines per author
+ */
+function groupAuthorLines(panelText) {
+    const boilerplate = ["No contact information", "View colleagues of", "Bibliometrics:"];
+    const authorLines = panelText
+        .split("\n")
+        .map(line => line.trim())
+        .filter(line => line.length > 0 && !boilerplate.some(prefix => line.startsWith(prefix)));
+
+    const allAuthors = [];
+    let individual = [];
+    for (const line of authorLines) {
+        individual.push(line);
+        if (line.startsWith("Average citations")) {
+            allAuthors.push(individual);
+            individual = [];
+        }
+    }
+    if (individual.length > 0) {
+        allAuthors.push(individual);
+    }
+    return allAuthors;
+}
+
+/**
+ * Converts one author's metadata lines into a record.
+ *
+ * Line 0 is the name. Line 1 is read from character 18 onwards as a
+ * "start-end" year range. Every further line is split at its last space into a
+ * label (lower-cased, spaces replaced by underscores, parentheses removed) and a
+ * numeric value (thousands separators removed).
+ *
+ * @param {string[]} metadata - lines belonging to a single author
+ * @returns {Object} author record; numbers that cannot be parsed are NaN
+ */
+function parseAuthor(metadata) {
+    const publicationYears = (metadata[1] || "").substring(18).split("-");
+    const author = {
+        name: metadata[0],
+        publication_start: parseInt(publicationYears[0], 10),
+        publication_end: parseInt(publicationYears[1], 10)
+    };
+    for (const attr of metadata.slice(2)) {
+        const separator = attr.lastIndexOf(" ");
+        if (separator === -1) {
+            // No "label value" shape to split on; skip the line.
+            continue;
+        }
+        const key = attr.substring(0, separator).toLowerCase().replace(/ /g, "_").replace(/[\(\)]/g, "");
+        const value = parseFloat(attr.substring(separator + 1).replace(/,/g, ""));
+        author[key] = value;
+    }
+    return author;
+}
\ No newline at end of file