diff options
-rw-r--r-- | src/client/views/GlobalKeyHandler.ts | 2 | ||||
-rw-r--r-- | src/scraping/acm/index.js | 90 | ||||
-rw-r--r-- | src/scraping/acm/results.txt | 47 |
3 files changed, 106 insertions, 33 deletions
diff --git a/src/client/views/GlobalKeyHandler.ts b/src/client/views/GlobalKeyHandler.ts index 9a0c3a6b1..574e43ba3 100644 --- a/src/client/views/GlobalKeyHandler.ts +++ b/src/client/views/GlobalKeyHandler.ts @@ -100,7 +100,7 @@ export default class KeyManager { MainView.Instance.mainFreeform && CollectionDockingView.Instance.AddRightSplit(MainView.Instance.mainFreeform, undefined); break; case "arrowleft": - MainView.Instance.mainFreeform && CollectionDockingView.Instance.CloseRightSplit(MainView.Instance.mainFreeform) + MainView.Instance.mainFreeform && CollectionDockingView.Instance.CloseRightSplit(MainView.Instance.mainFreeform); break; case "f": MainView.Instance.isSearchVisible = !MainView.Instance.isSearchVisible; diff --git a/src/scraping/acm/index.js b/src/scraping/acm/index.js index 39938ecca..be844da31 100644 --- a/src/scraping/acm/index.js +++ b/src/scraping/acm/index.js @@ -7,9 +7,13 @@ const { writeFile } = require('fs'); +const target_source = './citations.txt'; +const target_browser = 'chrome'; +const target_dist = './results.txt'; + const driver_pause = 500; // milliseconds const sample_line_char_max = 100; // characters -const target_browser = 'chrome'; + const tab_map = { abstract: "11", authors: "14", @@ -22,16 +26,29 @@ const tab_map = { table_of_contents: "21" }; +String.prototype.removeAll = function (replacements, trim = true) { + let result = this; + for (let expression of replacements) { + result = result.replace(expression, ""); + } + return trim ? result.trim() : result; +}; + +String.prototype.remove = function (replacement, trim = true) { + let result = this.replace(replacement, ""); + return trim ? result.trim() : result; +}; + +Object.prototype.first = function () { + return this[Object.keys(this)[0]]; +}; + // GENERAL UTILITY FUNCTIONS function log_read(content) { process.stdout.write("reading " + content + "..."); } -function first_value(object) { - return object[Object.keys(object)[0]]; -} - function log_snippet(result, quotes = true) { let snippet = "failed to create snippet"; switch (typeof result) { @@ -53,11 +70,11 @@ function log_snippet(result, quotes = true) { case "string": return res.substring(0, sample_line_char_max / result.length); case "object": - return first_value(res); + return res.first(); } }).join(', '); } else { - snippet = first_value(result); + snippet = result.first(); } } console.log(snippet); @@ -130,9 +147,40 @@ async function read_authors() { async function read_publication() { let publciation_elements = (await text_of("source-body")).split("\n"); let publication_module = {}; + + let extract = (regex, target, index = 1) => regex.exec(target)[index]; + for (let element of publciation_elements) { + + let location = /Volume (\d+) Issue (\d+), ([\w.\d]+)/g; + let pages = /(\d+)-(\d+)/g; + let publication_date = /(\d{4}-\d{2}-\d{2})/g; + let publisher = /Publisher (.*)/g; + let issn = /ISSN: (\d{4}-\d{4})/g; + let eissn = /EISSN: ([\dA-Z]{4}-[\dA-Z]{4})/g; + let doi = /doi>([\.\d\/A-Z]+)/g; + if (element.startsWith("Title")) { - publication_module.title = element.substring(6); + publication_module.name = element.substring(6).removeAll(["table of contents", "archive", /\w+ Homepage/]); + } else if (element.startsWith("Volume")) { + let match = location.exec(element); + publication_module.volume = parseInt(match[1]); + publication_module.issue = parseInt(match[2]); + publication_module.month = match[3]; + } else if (element.startsWith("Pages")) { + let match = pages.exec(element); + publication_module.page_start = parseInt(match[1]); + publication_module.page_end = parseInt(match[2]); + } else if (element.startsWith("Publication Date ")) { + publication_module.publication_date = extract(publication_date, element); + } else if (element.startsWith("Publisher ")) { + publication_module.publisher = extract(publisher, element); + } else if (element.startsWith("ISSN: ")) { + publication_module.issn = extract(issn, element); + if (element.includes("EISSN: ")) { + publication_module.eissn = extract(eissn, element); + } + publication_module.doi = extract(doi, element); } } return publication_module; @@ -153,8 +201,8 @@ function parse_authors(metadata) { while (attr[char] != " ") { char--; } - let key = attr.substring(0, char).toLowerCase().replace(/ /g, "_").replace(/[\(\)]/g, ""); - let value = parseFloat(attr.substring(char + 1).replace(/,/g, "")); + let key = attr.substring(0, char).toLowerCase().replace(/ /g, "_").remove(/[\(\)]/g); + let value = parseFloat(attr.substring(char + 1).remove(/,/g)); author[key] = value; } return author; @@ -165,7 +213,7 @@ function write_results() { let output = ""; results.forEach(res => output += (JSON.stringify(res, null, 4) + "\n")); - writeFile("./results.txt", output, function errorHandler(exception) { + writeFile(target_dist, output, function errorHandler(exception) { console.log(exception || "scraped references successfully written as JSON to ./results.txt"); }); } @@ -176,7 +224,7 @@ async function scrape_targets(error, data) { return; } - let references = data.split("\n").map(entry => entry.replace("\r", "")).filter(line => line.match(/\d+/g)); + let references = data.split("\n").map(entry => entry.removeAll(["\r"])).filter(line => line.match(/\d+/g)); let quota = references.length; log_snippet(`found ${quota} references to scrape`, false); @@ -185,6 +233,7 @@ async function scrape_targets(error, data) { for (let i = 0; i < quota; i++) { try { result = {}; + let target; let id = references[i]; let url = `https://dl.acm.org/citation.cfm?id=${id}`; @@ -194,13 +243,18 @@ async function scrape_targets(error, data) { logged_assign("url", url); logged_assign("title", await text_of('//*[@id="divmain"]/div/h1')); - logged_assign("abstract", (await text_of_all("abstract-body")).join(" ")); - await click_on_acm_tab("authors"); - logged_assign("authors", (await read_authors()).map(parse_authors)); + target = "abstract"; + await click_on_acm_tab(target); + logged_assign(target, (await text_of_all("abstract-body")).join(" ")); + + target = "authors"; + await click_on_acm_tab(target); + logged_assign(target, (await read_authors()).map(parse_authors)); - await click_on_acm_tab("publication"); - logged_assign("publication", await read_publication()); + target = "publication"; + await click_on_acm_tab(target); + logged_assign(target, await read_publication()); } catch (e) { console.log(e); await driver.quit(); @@ -220,6 +274,6 @@ let result = {}; log_read("target references"); -readFile("./citations.txt", { +readFile(target_source, { encoding: "utf8" }, scrape_targets);
\ No newline at end of file diff --git a/src/scraping/acm/results.txt b/src/scraping/acm/results.txt index ba66d61a7..a15da8b10 100644 --- a/src/scraping/acm/results.txt +++ b/src/scraping/acm/results.txt @@ -10,10 +10,10 @@ "publication_count": 1, "citation_count": 179, "available_for_download": 1, - "downloads_6_weeks": 123, - "downloads_12_months": 922, - "downloads_cumulative": 9793, - "average_downloads_per_article": 9793, + "downloads_6_weeks": 124, + "downloads_12_months": 923, + "downloads_cumulative": 9794, + "average_downloads_per_article": 9794, "average_citations_per_article": 179 }, { @@ -23,10 +23,10 @@ "publication_count": 5, "citation_count": 196, "available_for_download": 1, - "downloads_6_weeks": 123, - "downloads_12_months": 922, - "downloads_cumulative": 9793, - "average_downloads_per_article": 9793, + "downloads_6_weeks": 124, + "downloads_12_months": 923, + "downloads_cumulative": 9794, + "average_downloads_per_article": 9794, "average_citations_per_article": 39.2 }, { @@ -36,15 +36,25 @@ "publication_count": 2, "citation_count": 188, "available_for_download": 2, - "downloads_6_weeks": 123, - "downloads_12_months": 927, - "downloads_cumulative": 10024, - "average_downloads_per_article": 5012, + "downloads_6_weeks": 124, + "downloads_12_months": 928, + "downloads_cumulative": 10025, + "average_downloads_per_article": 5012.5, "average_citations_per_article": 94 } ], "publication": { - "title": "Journal of the ACM (JACM) JACM Homepage table of contents archive" + "name": "Journal of the ACM (JACM)", + "volume": 7, + "issue": 4, + "month": "Oct.", + "page_start": 326, + "page_end": 329, + "publication_date": "1960-10-01", + "publisher": "ACM New York, NY, USA", + "issn": "0004-5411", + "eissn": "1557-735X", + "doi": "10.1145/321043.321046" } } { @@ -67,6 +77,15 @@ } ], "publication": { - "title": "IEEE Transactions on Software Engineering table of contents archive" + "name": "IEEE Transactions on Software Engineering", + "volume": 1, + "issue": 1, + "month": "March", + "page_start": 384, + "page_end": 389, + "publication_date": "1975-03-01", + "publisher": "IEEE Press Piscataway, NJ, USA", + "issn": "0098-5589", + "doi": "10.1109/TSE.1975.6312869" } } |