From ab2b2317e601b9e440f9c48b4639c143d5187949 Mon Sep 17 00:00:00 2001 From: Sam Wilkins Date: Mon, 1 Jul 2019 19:29:15 -0400 Subject: added tooltips and improved acm scraping --- src/scraping/acm/index.js | 56 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 11 deletions(-) (limited to 'src/scraping/acm/index.js') diff --git a/src/scraping/acm/index.js b/src/scraping/acm/index.js index ff4b099e7..ad0f844ba 100644 --- a/src/scraping/acm/index.js +++ b/src/scraping/acm/index.js @@ -10,6 +10,17 @@ const { const driver_pause = 500; // milliseconds const sample_line_char_max = 100; // characters const target_browser = 'chrome'; +const tab_map = { + abstract: "11", + authors: "14", + references: "15", + cited_by: "16", + index_terms: "17", + publication: "18", + reviews: "19", + comments: "20", + table_of_contents: "21" +}; // GENERAL UTILITY FUNCTIONS @@ -32,14 +43,18 @@ function log_snippet(result, quotes = true) { snippet = quotes ? `"${snippet}"` : snippet; break; case "object": - snippet = result.map(res => { - switch (typeof res) { - case "string": - return res.substring(0, sample_line_char_max / result.length); - case "object": - return res[Object.keys(res)[0]]; - } - }).join(', '); + if (Array.isArray(result)) { + snippet = result.map(res => { + switch (typeof res) { + case "string": + return res.substring(0, sample_line_char_max / result.length); + case "object": + return res[Object.keys(res)[0]]; + } + }).join(', '); + } else { + snippet = result[Object.keys(result)[0]]; + } } console.log(snippet); return result; @@ -57,6 +72,10 @@ async function click_on(ref) { await driver.sleep(driver_pause); } +async function click_on_acm_tab(target) { + await click_on(`//*[@id="tab-10${tab_map[target]}-btnInnerEl"]/span`); +} + async function locate(ref, multiple = false) { let locator = ref.startsWith("//") ? By.xpath(ref) : By.id(ref); return await multiple ? driver.findElements(locator) : driver.findElement(locator); @@ -84,8 +103,6 @@ async function logged_assign(key, value) { // TEXT SCRAPING async function read_authors() { - await click_on('//*[@id="tab-1014-btnInnerEl"]/span'); - let authors = await text_of('//*[@id="tabpanel-1009-body"]'); let sanitize = line => line.length > 0 && !(line.startsWith("No contact information") || line.startsWith("View colleagues of") || line.startsWith("Bibliometrics:")); let author_lines = authors.split("\n").map(line => line.trim()).filter(sanitize); @@ -106,6 +123,17 @@ async function read_authors() { return all_authors; } +async function read_publication() { + let publciation_elements = (await text_of("source-body")).split("\n"); + let publication_module = {}; + for (let element of publciation_elements) { + if (element.startsWith("Title")) { + publication_module.title = element.substring(6); + } + } + return publication_module; +} + // JSON / DASH CONVERSION AND EXPORT function parse_authors(metadata) { @@ -134,7 +162,7 @@ function write_results() { results.forEach(res => output += (JSON.stringify(res, null, 4) + "\n")); writeFile("./results.txt", output, function errorHandler(exception) { - console.log(exception || "scraped references successfully written as JSON to ./results.txt\n"); + console.log(exception || "scraped references successfully written as JSON to ./results.txt"); }); } @@ -157,12 +185,18 @@ async function scrape_targets(error, data) { let id = references[i]; let url = `https://dl.acm.org/citation.cfm?id=${id}`; console.log(`\nscraping ${i + 1}/${quota} (${id})`); + await navigate_to(url); logged_assign("url", url); logged_assign("title", await text_of('//*[@id="divmain"]/div/h1')); logged_assign("abstract", (await text_of_all("abstract-body")).join(" ")); + + await click_on_acm_tab("authors"); logged_assign("authors", (await read_authors()).map(parse_authors)); + + await click_on_acm_tab("publication"); + logged_assign("publication", await read_publication()); } catch (e) { console.log(e); await driver.quit(); -- cgit v1.2.3-70-g09d2 From 44ff64c8186b25069c14aaf4de0ae694f872c2d3 Mon Sep 17 00:00:00 2001 From: Sam Wilkins Date: Mon, 1 Jul 2019 19:31:13 -0400 Subject: scraping function tweak --- src/scraping/acm/index.js | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'src/scraping/acm/index.js') diff --git a/src/scraping/acm/index.js b/src/scraping/acm/index.js index ad0f844ba..39938ecca 100644 --- a/src/scraping/acm/index.js +++ b/src/scraping/acm/index.js @@ -28,6 +28,10 @@ function log_read(content) { process.stdout.write("reading " + content + "..."); } +function first_value(object) { + return object[Object.keys(object)[0]]; +} + function log_snippet(result, quotes = true) { let snippet = "failed to create snippet"; switch (typeof result) { @@ -49,11 +53,11 @@ function log_snippet(result, quotes = true) { case "string": return res.substring(0, sample_line_char_max / result.length); case "object": - return res[Object.keys(res)[0]]; + return first_value(res); } }).join(', '); } else { - snippet = result[Object.keys(result)[0]]; + snippet = first_value(result); } } console.log(snippet); -- cgit v1.2.3-70-g09d2