aboutsummaryrefslogtreecommitdiff
path: root/src/scraping/acm/index.js
diff options
context:
space:
mode:
author Sam Wilkins <samwilkins333@gmail.com> 2019-07-01 19:29:15 -0400
committer Sam Wilkins <samwilkins333@gmail.com> 2019-07-01 19:29:15 -0400
commit ab2b2317e601b9e440f9c48b4639c143d5187949 (patch)
tree 207c8fd2b7d0b2dd3a1e419bf6256ba0feb374a4 /src/scraping/acm/index.js
parent cd78d5b3371d73cc51dcb6f3dbfdca1a3bbcf6e1 (diff)
added tooltips and improved acm scraping
Diffstat (limited to 'src/scraping/acm/index.js')
-rw-r--r-- src/scraping/acm/index.js 56
1 files changed, 45 insertions, 11 deletions
diff --git a/src/scraping/acm/index.js b/src/scraping/acm/index.js
index ff4b099e7..ad0f844ba 100644
--- a/src/scraping/acm/index.js
+++ b/src/scraping/acm/index.js
@@ -10,6 +10,17 @@ const {
const driver_pause = 500; // milliseconds
const sample_line_char_max = 100; // characters
const target_browser = 'chrome';
+const tab_map = { // tab name -> suffix that click_on_acm_tab splices into the element id `tab-10<suffix>-btnInnerEl`
+ abstract: "11",
+ authors: "14",
+ references: "15",
+ cited_by: "16",
+ index_terms: "17",
+ publication: "18",
+ reviews: "19",
+ comments: "20",
+ table_of_contents: "21"
+}; // values are kept as strings because they are interpolated verbatim into an XPath expression
// GENERAL UTILITY FUNCTIONS
@@ -32,14 +43,18 @@ function log_snippet(result, quotes = true) {
snippet = quotes ? `"${snippet}"` : snippet;
break;
case "object":
- snippet = result.map(res => {
- switch (typeof res) {
- case "string":
- return res.substring(0, sample_line_char_max / result.length);
- case "object":
- return res[Object.keys(res)[0]];
- }
- }).join(', ');
+ if (Array.isArray(result)) {
+ snippet = result.map(res => {
+ switch (typeof res) {
+ case "string":
+ return res.substring(0, sample_line_char_max / result.length);
+ case "object":
+ return res[Object.keys(res)[0]];
+ }
+ }).join(', ');
+ } else {
+ snippet = result[Object.keys(result)[0]];
+ }
}
console.log(snippet);
return result;
@@ -57,6 +72,10 @@ async function click_on(ref) {
await driver.sleep(driver_pause);
}
+/**
+ * Clicks one of the tabs on the page by its symbolic name.
+ *
+ * The name is looked up in `tab_map`, and the resulting suffix is spliced
+ * into the id `tab-10<suffix>-btnInnerEl`; the `span` beneath that element
+ * is then clicked through `click_on`.
+ *
+ * @param {string} target - a key of `tab_map` (e.g. "authors", "publication")
+ * @throws {Error} if `target` is not a key of `tab_map`; without this check
+ *     the XPath would silently contain the text "undefined" and the failure
+ *     would only surface later as an element-lookup error
+ */
+async function click_on_acm_tab(target) {
+    // Own-property check so inherited names such as "toString" are rejected too.
+    if (!Object.prototype.hasOwnProperty.call(tab_map, target)) {
+        throw new Error(`unknown ACM tab "${target}"; expected one of: ${Object.keys(tab_map).join(", ")}`);
+    }
+    const tab_suffix = tab_map[target];
+    await click_on(`//*[@id="tab-10${tab_suffix}-btnInnerEl"]/span`);
+}
+
async function locate(ref, multiple = false) {
let locator = ref.startsWith("//") ? By.xpath(ref) : By.id(ref);
return await multiple ? driver.findElements(locator) : driver.findElement(locator);
@@ -84,8 +103,6 @@ async function logged_assign(key, value) {
// TEXT SCRAPING
async function read_authors() {
- await click_on('//*[@id="tab-1014-btnInnerEl"]/span');
-
let authors = await text_of('//*[@id="tabpanel-1009-body"]');
let sanitize = line => line.length > 0 && !(line.startsWith("No contact information") || line.startsWith("View colleagues of") || line.startsWith("Bibliometrics:"));
let author_lines = authors.split("\n").map(line => line.trim()).filter(sanitize);
@@ -106,6 +123,17 @@ async function read_authors() {
return all_authors;
}
+/**
+ * Reads the text of the "source-body" element and extracts the publication
+ * title from it.
+ *
+ * The text is split into lines; every line beginning with "Title" assigns
+ * `title` to that line minus its first six characters, so when several
+ * lines match, the last one wins.
+ *
+ * @returns {Promise<Object>} an object carrying a `title` property when a
+ *     matching line was found, otherwise an empty object
+ */
+async function read_publication() {
+    const source_text = await text_of("source-body");
+    const publication_module = {};
+    source_text
+        .split("\n")
+        .filter(line => line.startsWith("Title"))
+        .forEach(line => {
+            // Drops the six leading characters: "Title" plus one separator.
+            publication_module.title = line.substring(6);
+        });
+    return publication_module;
+}
+
// JSON / DASH CONVERSION AND EXPORT
function parse_authors(metadata) {
@@ -134,7 +162,7 @@ function write_results() {
results.forEach(res => output += (JSON.stringify(res, null, 4) + "\n"));
writeFile("./results.txt", output, function errorHandler(exception) {
- console.log(exception || "scraped references successfully written as JSON to ./results.txt\n");
+ console.log(exception || "scraped references successfully written as JSON to ./results.txt");
});
}
@@ -157,12 +185,18 @@ async function scrape_targets(error, data) {
let id = references[i];
let url = `https://dl.acm.org/citation.cfm?id=${id}`;
console.log(`\nscraping ${i + 1}/${quota} (${id})`);
+
await navigate_to(url);
logged_assign("url", url);
logged_assign("title", await text_of('//*[@id="divmain"]/div/h1'));
logged_assign("abstract", (await text_of_all("abstract-body")).join(" "));
+
+ await click_on_acm_tab("authors");
logged_assign("authors", (await read_authors()).map(parse_authors));
+
+ await click_on_acm_tab("publication");
+ logged_assign("publication", await read_publication());
} catch (e) {
console.log(e);
await driver.quit();