aboutsummaryrefslogtreecommitdiff
path: root/src/scraping/acm/index.js
diff options
context:
space:
mode:
Diffstat (limited to 'src/scraping/acm/index.js')
-rw-r--r--src/scraping/acm/index.js106
1 files changed, 63 insertions, 43 deletions
diff --git a/src/scraping/acm/index.js b/src/scraping/acm/index.js
index 3d5e801be..ff4b099e7 100644
--- a/src/scraping/acm/index.js
+++ b/src/scraping/acm/index.js
@@ -17,15 +17,32 @@ function log_read(content) {
process.stdout.write("reading " + content + "...");
}
-function log_snippet(result) {
- let ellipse = result.length > sample_line_char_max;
- let i = sample_line_char_max;
- if (ellipse) {
- while (result[i] != " " && i < -1) {
- i--;
- }
+function log_snippet(result, quotes = true) {
+ let snippet = "failed to create snippet";
+ switch (typeof result) {
+ case "string":
+ let ellipse = result.length > sample_line_char_max;
+ let i = sample_line_char_max;
+ if (ellipse) {
+ while (result[i] != " " && i < -1) {
+ i--;
+ }
+ }
+ snippet = `${result.substring(0, i + 1).trim()}${ellipse ? "..." : ""}`;
+ snippet = quotes ? `"${snippet}"` : snippet;
+ break;
+ case "object":
+ snippet = result.map(res => {
+ switch (typeof res) {
+ case "string":
+ return res.substring(0, sample_line_char_max / result.length);
+ case "object":
+ return res[Object.keys(res)[0]];
+ }
+ }).join(', ');
}
- console.log(` "${result.substring(0, i + 1).trim()}${ellipse ? "..." : ""}"`);
+ console.log(snippet);
+ return result;
}
// DRIVER UTILITY FUNCTIONS
@@ -35,40 +52,41 @@ async function navigate_to(url) {
await driver.sleep(driver_pause);
}
-async function click_on(xpath) {
- await driver.findElement(By.xpath(xpath)).click();
+async function click_on(ref) {
+ await (await locate(ref)).click();
await driver.sleep(driver_pause);
}
-// TEXT SCRAPING
+async function locate(ref, multiple = false) {
+ let locator = ref.startsWith("//") ? By.xpath(ref) : By.id(ref);
+ return await multiple ? driver.findElements(locator) : driver.findElement(locator);
+}
-async function read_title() {
- log_read("title");
- let title_el = await driver.findElement(By.xpath('//*[@id="divmain"]/div/h1'));
- let title = await title_el.getText();
- log_snippet(title);
- return title;
+async function text_of(ref) {
+ let element = await locate(ref);
+ return await element.getText();
}
-async function read_abstract() {
- log_read("abstract");
- let lines = [];
- let webElements = await driver.findElements(By.id("abstract-body"));
- for (let el of webElements) {
- let text = await el.getText();
- lines.push(text);
+async function text_of_all(ref) {
+ let elements = await locate(ref, true);
+ let results = [];
+ for (let element of elements) {
+ results.push(await element.getText());
}
- let abstract = lines.join(" ");
- log_snippet(abstract);
- return abstract;
+ return results;
}
+async function logged_assign(key, value) {
+ log_read(key);
+ result[key] = log_snippet(value);
+}
+
+// TEXT SCRAPING
+
async function read_authors() {
- log_read("authors");
await click_on('//*[@id="tab-1014-btnInnerEl"]/span');
- let authors_el = await driver.findElement(By.xpath('//*[@id="tabpanel-1009-body"]'));
- let authors = await authors_el.getText();
+ let authors = await text_of('//*[@id="tabpanel-1009-body"]');
let sanitize = line => line.length > 0 && !(line.startsWith("No contact information") || line.startsWith("View colleagues of") || line.startsWith("Bibliometrics:"));
let author_lines = authors.split("\n").map(line => line.trim()).filter(sanitize);
@@ -85,8 +103,6 @@ async function read_authors() {
i++;
}
- let multiple = all_authors.length == 1 ? "" : " et al.";
- log_snippet(all_authors[0][0] + multiple);
return all_authors;
}
@@ -113,6 +129,7 @@ function parse_authors(metadata) {
}
function write_results() {
+ console.log();
let output = "";
results.forEach(res => output += (JSON.stringify(res, null, 4) + "\n"));
@@ -129,29 +146,29 @@ async function scrape_targets(error, data) {
let references = data.split("\n").map(entry => entry.replace("\r", "")).filter(line => line.match(/\d+/g));
let quota = references.length;
- console.log(`${references.join(", ")}\n`);
+ log_snippet(`found ${quota} references to scrape`, false);
driver = await new Builder().forBrowser(target_browser).build();
for (let i = 0; i < quota; i++) {
- let result = {};
-
try {
- let url = `https://dl.acm.org/citation.cfm?id=${references[i]}`;
+ result = {};
+
+ let id = references[i];
+ let url = `https://dl.acm.org/citation.cfm?id=${id}`;
+ console.log(`\nscraping ${i + 1}/${quota} (${id})`);
await navigate_to(url);
- console.log(`scraping ${i + 1}/${quota} (${url})`);
- result.url = url;
- result.title = await read_title();
- result.abstract = await read_abstract();
- result.authors = (await read_authors()).map(parse_authors);
+ logged_assign("url", url);
+ logged_assign("title", await text_of('//*[@id="divmain"]/div/h1'));
+ logged_assign("abstract", (await text_of_all("abstract-body")).join(" "));
+ logged_assign("authors", (await read_authors()).map(parse_authors));
} catch (e) {
console.log(e);
await driver.quit();
}
results.push(result);
- console.log();
}
write_results();
@@ -161,7 +178,10 @@ async function scrape_targets(error, data) {
let driver;
let results = [];
-console.log("reading references...");
+let result = {};
+
+log_read("target references");
+
readFile("./citations.txt", {
encoding: "utf8"
}, scrape_targets); \ No newline at end of file