diff options
Diffstat (limited to 'src/scraping/acm/index.js')
-rw-r--r-- | src/scraping/acm/index.js | 106 |
1 files changed, 63 insertions, 43 deletions
diff --git a/src/scraping/acm/index.js b/src/scraping/acm/index.js index 3d5e801be..ff4b099e7 100644 --- a/src/scraping/acm/index.js +++ b/src/scraping/acm/index.js @@ -17,15 +17,32 @@ function log_read(content) { process.stdout.write("reading " + content + "..."); } -function log_snippet(result) { - let ellipse = result.length > sample_line_char_max; - let i = sample_line_char_max; - if (ellipse) { - while (result[i] != " " && i < -1) { - i--; - } +function log_snippet(result, quotes = true) { + let snippet = "failed to create snippet"; + switch (typeof result) { + case "string": + let ellipse = result.length > sample_line_char_max; + let i = sample_line_char_max; + if (ellipse) { + while (result[i] != " " && i < -1) { + i--; + } + } + snippet = `${result.substring(0, i + 1).trim()}${ellipse ? "..." : ""}`; + snippet = quotes ? `"${snippet}"` : snippet; + break; + case "object": + snippet = result.map(res => { + switch (typeof res) { + case "string": + return res.substring(0, sample_line_char_max / result.length); + case "object": + return res[Object.keys(res)[0]]; + } + }).join(', '); } - console.log(` "${result.substring(0, i + 1).trim()}${ellipse ? "..." : ""}"`); + console.log(snippet); + return result; } // DRIVER UTILITY FUNCTIONS @@ -35,40 +52,41 @@ async function navigate_to(url) { await driver.sleep(driver_pause); } -async function click_on(xpath) { - await driver.findElement(By.xpath(xpath)).click(); +async function click_on(ref) { + await (await locate(ref)).click(); await driver.sleep(driver_pause); } -// TEXT SCRAPING +async function locate(ref, multiple = false) { + let locator = ref.startsWith("//") ? By.xpath(ref) : By.id(ref); + return await multiple ? driver.findElements(locator) : driver.findElement(locator); +} -async function read_title() { - log_read("title"); - let title_el = await driver.findElement(By.xpath('//*[@id="divmain"]/div/h1')); - let title = await title_el.getText(); - log_snippet(title); - return title; +async function text_of(ref) { + let element = await locate(ref); + return await element.getText(); } -async function read_abstract() { - log_read("abstract"); - let lines = []; - let webElements = await driver.findElements(By.id("abstract-body")); - for (let el of webElements) { - let text = await el.getText(); - lines.push(text); +async function text_of_all(ref) { + let elements = await locate(ref, true); + let results = []; + for (let element of elements) { + results.push(await element.getText()); } - let abstract = lines.join(" "); - log_snippet(abstract); - return abstract; + return results; } +async function logged_assign(key, value) { + log_read(key); + result[key] = log_snippet(value); +} + +// TEXT SCRAPING + async function read_authors() { - log_read("authors"); await click_on('//*[@id="tab-1014-btnInnerEl"]/span'); - let authors_el = await driver.findElement(By.xpath('//*[@id="tabpanel-1009-body"]')); - let authors = await authors_el.getText(); + let authors = await text_of('//*[@id="tabpanel-1009-body"]'); let sanitize = line => line.length > 0 && !(line.startsWith("No contact information") || line.startsWith("View colleagues of") || line.startsWith("Bibliometrics:")); let author_lines = authors.split("\n").map(line => line.trim()).filter(sanitize); @@ -85,8 +103,6 @@ async function read_authors() { i++; } - let multiple = all_authors.length == 1 ? "" : " et al."; - log_snippet(all_authors[0][0] + multiple); return all_authors; } @@ -113,6 +129,7 @@ function parse_authors(metadata) { } function write_results() { + console.log(); let output = ""; results.forEach(res => output += (JSON.stringify(res, null, 4) + "\n")); @@ -129,29 +146,29 @@ async function scrape_targets(error, data) { let references = data.split("\n").map(entry => entry.replace("\r", "")).filter(line => line.match(/\d+/g)); let quota = references.length; - console.log(`${references.join(", ")}\n`); + log_snippet(`found ${quota} references to scrape`, false); driver = await new Builder().forBrowser(target_browser).build(); for (let i = 0; i < quota; i++) { - let result = {}; - try { - let url = `https://dl.acm.org/citation.cfm?id=${references[i]}`; + result = {}; + + let id = references[i]; + let url = `https://dl.acm.org/citation.cfm?id=${id}`; + console.log(`\nscraping ${i + 1}/${quota} (${id})`); await navigate_to(url); - console.log(`scraping ${i + 1}/${quota} (${url})`); - result.url = url; - result.title = await read_title(); - result.abstract = await read_abstract(); - result.authors = (await read_authors()).map(parse_authors); + logged_assign("url", url); + logged_assign("title", await text_of('//*[@id="divmain"]/div/h1')); + logged_assign("abstract", (await text_of_all("abstract-body")).join(" ")); + logged_assign("authors", (await read_authors()).map(parse_authors)); } catch (e) { console.log(e); await driver.quit(); } results.push(result); - console.log(); } write_results(); @@ -161,7 +178,10 @@ async function scrape_targets(error, data) { let driver; let results = []; -console.log("reading references..."); +let result = {}; + +log_read("target references"); + readFile("./citations.txt", { encoding: "utf8" }, scrape_targets);
\ No newline at end of file |