diff options
author | Sam Wilkins <samwilkins333@gmail.com> | 2019-07-01 19:29:15 -0400 |
---|---|---|
committer | Sam Wilkins <samwilkins333@gmail.com> | 2019-07-01 19:29:15 -0400 |
commit | ab2b2317e601b9e440f9c48b4639c143d5187949 (patch) | |
tree | 207c8fd2b7d0b2dd3a1e419bf6256ba0feb374a4 /src/scraping/acm | |
parent | cd78d5b3371d73cc51dcb6f3dbfdca1a3bbcf6e1 (diff) |
added tooltips and improved acm scraping
Diffstat (limited to 'src/scraping/acm')
-rw-r--r-- | src/scraping/acm/index.js | 56 | ||||
-rw-r--r-- | src/scraping/acm/results.txt | 34 |
2 files changed, 65 insertions, 25 deletions
diff --git a/src/scraping/acm/index.js b/src/scraping/acm/index.js index ff4b099e7..ad0f844ba 100644 --- a/src/scraping/acm/index.js +++ b/src/scraping/acm/index.js @@ -10,6 +10,17 @@ const { const driver_pause = 500; // milliseconds const sample_line_char_max = 100; // characters const target_browser = 'chrome'; +const tab_map = { + abstract: "11", + authors: "14", + references: "15", + cited_by: "16", + index_terms: "17", + publication: "18", + reviews: "19", + comments: "20", + table_of_contents: "21" +}; // GENERAL UTILITY FUNCTIONS @@ -32,14 +43,18 @@ function log_snippet(result, quotes = true) { snippet = quotes ? `"${snippet}"` : snippet; break; case "object": - snippet = result.map(res => { - switch (typeof res) { - case "string": - return res.substring(0, sample_line_char_max / result.length); - case "object": - return res[Object.keys(res)[0]]; - } - }).join(', '); + if (Array.isArray(result)) { + snippet = result.map(res => { + switch (typeof res) { + case "string": + return res.substring(0, sample_line_char_max / result.length); + case "object": + return res[Object.keys(res)[0]]; + } + }).join(', '); + } else { + snippet = result[Object.keys(result)[0]]; + } } console.log(snippet); return result; @@ -57,6 +72,10 @@ async function click_on(ref) { await driver.sleep(driver_pause); } +async function click_on_acm_tab(target) { + await click_on(`//*[@id="tab-10${tab_map[target]}-btnInnerEl"]/span`); +} + async function locate(ref, multiple = false) { let locator = ref.startsWith("//") ? By.xpath(ref) : By.id(ref); return await multiple ? driver.findElements(locator) : driver.findElement(locator); @@ -84,8 +103,6 @@ async function logged_assign(key, value) { // TEXT SCRAPING async function read_authors() { - await click_on('//*[@id="tab-1014-btnInnerEl"]/span'); - let authors = await text_of('//*[@id="tabpanel-1009-body"]'); let sanitize = line => line.length > 0 && !(line.startsWith("No contact information") || line.startsWith("View colleagues of") || line.startsWith("Bibliometrics:")); let author_lines = authors.split("\n").map(line => line.trim()).filter(sanitize); @@ -106,6 +123,17 @@ async function read_authors() { return all_authors; } +async function read_publication() { + let publciation_elements = (await text_of("source-body")).split("\n"); + let publication_module = {}; + for (let element of publciation_elements) { + if (element.startsWith("Title")) { + publication_module.title = element.substring(6); + } + } + return publication_module; +} + // JSON / DASH CONVERSION AND EXPORT function parse_authors(metadata) { @@ -134,7 +162,7 @@ function write_results() { results.forEach(res => output += (JSON.stringify(res, null, 4) + "\n")); writeFile("./results.txt", output, function errorHandler(exception) { - console.log(exception || "scraped references successfully written as JSON to ./results.txt\n"); + console.log(exception || "scraped references successfully written as JSON to ./results.txt"); }); } @@ -157,12 +185,18 @@ async function scrape_targets(error, data) { let id = references[i]; let url = `https://dl.acm.org/citation.cfm?id=${id}`; console.log(`\nscraping ${i + 1}/${quota} (${id})`); + await navigate_to(url); logged_assign("url", url); logged_assign("title", await text_of('//*[@id="divmain"]/div/h1')); logged_assign("abstract", (await text_of_all("abstract-body")).join(" ")); + + await click_on_acm_tab("authors"); logged_assign("authors", (await read_authors()).map(parse_authors)); + + await click_on_acm_tab("publication"); + logged_assign("publication", await read_publication()); } catch (e) { console.log(e); await driver.quit(); diff --git a/src/scraping/acm/results.txt b/src/scraping/acm/results.txt index fffa7ff51..ba66d61a7 100644 --- a/src/scraping/acm/results.txt +++ b/src/scraping/acm/results.txt @@ -10,10 +10,10 @@ "publication_count": 1, "citation_count": 179, "available_for_download": 1, - "downloads_6_weeks": 130, - "downloads_12_months": 1004, - "downloads_cumulative": 9792, - "average_downloads_per_article": 9792, + "downloads_6_weeks": 123, + "downloads_12_months": 922, + "downloads_cumulative": 9793, + "average_downloads_per_article": 9793, "average_citations_per_article": 179 }, { @@ -23,10 +23,10 @@ "publication_count": 5, "citation_count": 196, "available_for_download": 1, - "downloads_6_weeks": 130, - "downloads_12_months": 1004, - "downloads_cumulative": 9792, - "average_downloads_per_article": 9792, + "downloads_6_weeks": 123, + "downloads_12_months": 922, + "downloads_cumulative": 9793, + "average_downloads_per_article": 9793, "average_citations_per_article": 39.2 }, { @@ -36,13 +36,16 @@ "publication_count": 2, "citation_count": 188, "available_for_download": 2, - "downloads_6_weeks": 130, - "downloads_12_months": 1009, - "downloads_cumulative": 10023, - "average_downloads_per_article": 5011.5, + "downloads_6_weeks": 123, + "downloads_12_months": 927, + "downloads_cumulative": 10024, + "average_downloads_per_article": 5012, "average_citations_per_article": 94 } - ] + ], + "publication": { + "title": "Journal of the ACM (JACM) JACM Homepage table of contents archive" + } } { "url": "https://dl.acm.org/citation.cfm?id=2412979", @@ -62,5 +65,8 @@ "average_downloads_per_article": 0, "average_citations_per_article": 0 } - ] + ], + "publication": { + "title": "IEEE Transactions on Software Engineering table of contents archive" + } } |