Diffstat (limited to 'src/scraping')
-rw-r--r--  src/scraping/acm/index.js      | 138
-rw-r--r--  src/scraping/acm/results.txt   |  53
-rw-r--r--  src/scraping/buxton/scraper.py |  61
3 files changed, 212 insertions, 40 deletions
diff --git a/src/scraping/acm/index.js b/src/scraping/acm/index.js
index ff4b099e7..51781dba8 100644
--- a/src/scraping/acm/index.js
+++ b/src/scraping/acm/index.js
@@ -7,9 +7,41 @@ const { writeFile } = require('fs');
+const target_source = './citations.txt';
+const target_browser = 'chrome';
+const target_dist = './results.txt';
+
 const driver_pause = 500; // milliseconds
 const sample_line_char_max = 100; // characters
-const target_browser = 'chrome';
+
+const tab_map = {
+    abstract: "11",
+    authors: "14",
+    references: "15",
+    cited_by: "16",
+    index_terms: "17",
+    publication: "18",
+    reviews: "19",
+    comments: "20",
+    table_of_contents: "21"
+};
+
+String.prototype.removeAll = function (replacements, trim = true) {
+    let result = this;
+    for (let expression of replacements) {
+        result = result.replace(expression, "");
+    }
+    return trim ? result.trim() : result;
+};
+
+String.prototype.remove = function (replacement, trim = true) {
+    let result = this.replace(replacement, "");
+    return trim ? result.trim() : result;
+};
+
+Object.prototype.first = function () {
+    return this[Object.keys(this)[0]];
+};
 
 // GENERAL UTILITY FUNCTIONS
 
@@ -32,14 +64,18 @@ function log_snippet(result, quotes = true) {
             snippet = quotes ? `"${snippet}"` : snippet;
             break;
         case "object":
-            snippet = result.map(res => {
-                switch (typeof res) {
-                    case "string":
-                        return res.substring(0, sample_line_char_max / result.length);
-                    case "object":
-                        return res[Object.keys(res)[0]];
-                }
-            }).join(', ');
+            if (Array.isArray(result)) {
+                snippet = result.map(res => {
+                    switch (typeof res) {
+                        case "string":
+                            return res.substring(0, sample_line_char_max / result.length);
+                        case "object":
+                            return res.first();
+                    }
+                }).join(', ');
+            } else {
+                snippet = result.first();
+            }
     }
     console.log(snippet);
     return result;
 }
@@ -57,6 +93,10 @@ async function click_on(ref) {
     await driver.sleep(driver_pause);
 }
 
+async function click_on_acm_tab(target) {
+    await click_on(`//*[@id="tab-10${tab_map[target]}-btnInnerEl"]/span`);
+}
+
 async function locate(ref, multiple = false) {
     let locator = ref.startsWith("//") ? By.xpath(ref) : By.id(ref);
     return await multiple ? driver.findElements(locator) : driver.findElement(locator);
 }
 
@@ -67,13 +107,13 @@ async function text_of(ref) {
     let element = await locate(ref);
     return await element.getText();
 }
 
-async function text_of_all(ref) {
+async function text_of_all(ref, delimiter = undefined) {
     let elements = await locate(ref, true);
     let results = [];
     for (let element of elements) {
         results.push(await element.getText());
     }
-    return results;
+    return delimiter ? results.join(delimiter) : results;
 }
 
 async function logged_assign(key, value) {
@@ -84,8 +124,6 @@ async function logged_assign(key, value) {
 // TEXT SCRAPING
 
 async function read_authors() {
-    await click_on('//*[@id="tab-1014-btnInnerEl"]/span');
-
     let authors = await text_of('//*[@id="tabpanel-1009-body"]');
     let sanitize = line => line.length > 0 && !(line.startsWith("No contact information") || line.startsWith("View colleagues of") || line.startsWith("Bibliometrics:"));
     let author_lines = authors.split("\n").map(line => line.trim()).filter(sanitize);
@@ -103,12 +141,54 @@ async function read_authors() {
         i++;
     }
 
-    return all_authors;
+    return all_authors.map(parse_author);
+}
+
+async function read_publication() {
+    let publication_elements = (await text_of("source-body")).split("\n");
+    let publication_module = {};
+
+    let extract = (regex, target, index = 1) => regex.exec(target)[index];
+
+    for (let element of publication_elements) {
+
+        let location = /Volume (\d+) Issue (\d+), ([\w.\d]+)/g;
+        let pages = /(\d+)-(\d+)/g;
+        let publication_date = /(\d{4}-\d{2}-\d{2})/g;
+        let publisher = /Publisher (.*)/g;
+        let issn = /ISSN: (\d{4}-\d{4})/g;
+        let eissn = /EISSN: ([\dA-Z]{4}-[\dA-Z]{4})/g;
+        let doi = /doi>([\.\d\/A-Z]+)/g;
+
+        if (element.startsWith("Title")) {
+            publication_module.name = element.substring(6).removeAll(["table of contents", "archive", /\w+ Homepage/]);
+        } else if (element.startsWith("Volume ")) {
+            let match = location.exec(element);
+            publication_module.volume = parseInt(match[1]);
+            publication_module.issue = parseInt(match[2]);
+            publication_module.month = match[3];
+        } else if (element.startsWith("Pages ")) {
+            let match = pages.exec(element);
+            publication_module.page_start = parseInt(match[1]);
+            publication_module.page_end = parseInt(match[2]);
+        } else if (element.startsWith("Publication Date ")) {
+            publication_module.publication_date = extract(publication_date, element);
+        } else if (element.startsWith("Publisher ")) {
+            publication_module.publisher = extract(publisher, element);
+        } else if (element.startsWith("ISSN: ")) {
+            publication_module.issn = extract(issn, element);
+            if (element.includes("EISSN: ")) {
+                publication_module.eissn = extract(eissn, element);
+            }
+            publication_module.doi = extract(doi, element);
+        }
+    }
+    return publication_module;
 }
 
 // JSON / DASH CONVERSION AND EXPORT
 
-function parse_authors(metadata) {
+function parse_author(metadata) {
     let publicationYears = metadata[1].substring(18).split("-");
     author = {
         name: metadata[0],
@@ -121,8 +201,8 @@
         while (attr[char] != " ") {
             char--;
         }
-        let key = attr.substring(0, char).toLowerCase().replace(/ /g, "_").replace(/[\(\)]/g, "");
-        let value = parseFloat(attr.substring(char + 1).replace(/,/g, ""));
+        let key = attr.substring(0, char).toLowerCase().replace(/ /g, "_").remove(/[\(\)]/g);
+        let value = parseFloat(attr.substring(char + 1).remove(/,/g));
         author[key] = value;
     }
     return author;
@@ -133,8 +213,8 @@ function write_results() {
     let output = "";
     results.forEach(res => output += (JSON.stringify(res, null, 4) + "\n"));
 
-    writeFile("./results.txt", output, function errorHandler(exception) {
-        console.log(exception || "scraped references successfully written as JSON to ./results.txt\n");
+    writeFile(target_dist, output, function errorHandler(exception) {
+        console.log(exception || `scraped references successfully written as JSON to ${target_dist}`);
     });
 }
 
@@ -144,7 +224,7 @@ async function scrape_targets(error, data) {
         return;
     }
 
-    let references = data.split("\n").map(entry => entry.replace("\r", "")).filter(line => line.match(/\d+/g));
+    let references = data.split("\n").map(entry => entry.removeAll(["\r"])).filter(line => line.match(/\d+/g));
     let quota = references.length;
 
     log_snippet(`found ${quota} references to scrape`, false);
@@ -153,16 +233,28 @@ async function scrape_targets(error, data) {
     for (let i = 0; i < quota; i++) {
         try {
             result = {};
+            let target;
             let id = references[i];
             let url = `https://dl.acm.org/citation.cfm?id=${id}`;
             console.log(`\nscraping ${i + 1}/${quota} (${id})`);
+
             await navigate_to(url);
             logged_assign("url", url);
             logged_assign("title", await text_of('//*[@id="divmain"]/div/h1'));
-            logged_assign("abstract", (await text_of_all("abstract-body")).join(" "));
-            logged_assign("authors", (await read_authors()).map(parse_authors));
+
+            target = "abstract";
+            await click_on_acm_tab(target);
+            logged_assign(target, await text_of_all("abstract-body", " "));
+
+            target = "authors";
+            await click_on_acm_tab(target);
+            logged_assign(target, await read_authors());
+
+            target = "publication";
+            await click_on_acm_tab(target);
+            logged_assign(target, await read_publication());
         } catch (e) {
             console.log(e);
             await driver.quit();
@@ -182,6 +274,6 @@ let result = {};
 
 log_read("target references");
 
-readFile("./citations.txt", {
+readFile(target_source, {
     encoding: "utf8"
 }, scrape_targets);
\ No newline at end of file
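Note on the new String helpers: String.prototype.replace with a string or non-global RegExp pattern removes only the first occurrence, which is all removeAll relies on here. A minimal standalone sketch of the intended behavior (the sample title line is made up for illustration; no Selenium required):

    String.prototype.removeAll = function (replacements, trim = true) {
        // Strip the first occurrence of each pattern, then trim by default.
        let result = this;
        for (let expression of replacements) {
            result = result.replace(expression, "");
        }
        return trim ? result.trim() : result;
    };

    // A made-up ACM-style title line, decorated the way read_publication expects:
    const raw = "Journal of the ACM (JACM) table of contents archive JACM Homepage";
    console.log(raw.removeAll(["table of contents", "archive", /\w+ Homepage/]));
    // prints: Journal of the ACM (JACM)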
diff --git a/src/scraping/acm/results.txt b/src/scraping/acm/results.txt
index fffa7ff51..a15da8b10 100644
--- a/src/scraping/acm/results.txt
+++ b/src/scraping/acm/results.txt
@@ -10,10 +10,10 @@
             "publication_count": 1,
             "citation_count": 179,
             "available_for_download": 1,
-            "downloads_6_weeks": 130,
-            "downloads_12_months": 1004,
-            "downloads_cumulative": 9792,
-            "average_downloads_per_article": 9792,
+            "downloads_6_weeks": 124,
+            "downloads_12_months": 923,
+            "downloads_cumulative": 9794,
+            "average_downloads_per_article": 9794,
             "average_citations_per_article": 179
         },
         {
@@ -23,10 +23,10 @@
             "publication_count": 5,
             "citation_count": 196,
             "available_for_download": 1,
-            "downloads_6_weeks": 130,
-            "downloads_12_months": 1004,
-            "downloads_cumulative": 9792,
-            "average_downloads_per_article": 9792,
+            "downloads_6_weeks": 124,
+            "downloads_12_months": 923,
+            "downloads_cumulative": 9794,
+            "average_downloads_per_article": 9794,
             "average_citations_per_article": 39.2
         },
         {
@@ -36,13 +36,26 @@
             "publication_count": 2,
             "citation_count": 188,
             "available_for_download": 2,
-            "downloads_6_weeks": 130,
-            "downloads_12_months": 1009,
-            "downloads_cumulative": 10023,
-            "average_downloads_per_article": 5011.5,
+            "downloads_6_weeks": 124,
+            "downloads_12_months": 928,
+            "downloads_cumulative": 10025,
+            "average_downloads_per_article": 5012.5,
             "average_citations_per_article": 94
         }
-    ]
+    ],
+    "publication": {
+        "name": "Journal of the ACM (JACM)",
+        "volume": 7,
+        "issue": 4,
+        "month": "Oct.",
+        "page_start": 326,
+        "page_end": 329,
+        "publication_date": "1960-10-01",
+        "publisher": "ACM New York, NY, USA",
+        "issn": "0004-5411",
+        "eissn": "1557-735X",
+        "doi": "10.1145/321043.321046"
+    }
 }
 {
     "url": "https://dl.acm.org/citation.cfm?id=2412979",
@@ -62,5 +75,17 @@
             "average_downloads_per_article": 0,
             "average_citations_per_article": 0
         }
-    ]
+    ],
+    "publication": {
+        "name": "IEEE Transactions on Software Engineering",
+        "volume": 1,
+        "issue": 1,
+        "month": "March",
+        "page_start": 384,
+        "page_end": 389,
+        "publication_date": "1975-03-01",
+        "publisher": "IEEE Press Piscataway, NJ, USA",
+        "issn": "0098-5589",
+        "doi": "10.1109/TSE.1975.6312869"
+    }
 }
diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py
index 02c6d8b74..700269727 100644
--- a/src/scraping/buxton/scraper.py
+++ b/src/scraping/buxton/scraper.py
@@ -26,7 +26,7 @@ def extract_links(fileName):
         item = rels[rel]
         if item.reltype == RT.HYPERLINK and ".aspx" not in item._target:
             links.append(item._target)
-    return listify(links)
+    return text_doc_map(links)
 
 
 def extract_value(kv_string):
@@ -60,6 +60,12 @@ def protofy(fieldId):
     }
 
 
+def text_doc_map(string_list):
+    def guid_map(caption):
+        return write_text_doc(caption)
+    return listify(proxify_guids(list(map(guid_map, string_list))))
+
+
 def write_schema(parse_results, display_fields, storage_key):
     view_guids = parse_results["child_guids"]
 
@@ -110,6 +116,54 @@ def write_schema(parse_results, display_fields, storage_key):
     return view_doc_guid
 
 
+def write_text_doc(content):
+    data_doc_guid = guid()
+    view_doc_guid = guid()
+
+    view_doc = {
+        "_id": view_doc_guid,
+        "fields": {
+            "proto": protofy(data_doc_guid),
+            "x": 10,
+            "y": 10,
+            "width": 400,
+            "zIndex": 2,
+            "libraryBrush": False
+        },
+        "__type": "Doc"
+    }
+
+    data_doc = {
+        "_id": data_doc_guid,
+        "fields": {
+            "proto": protofy("textProto"),
+            "data": {
+                "Data": '{"doc":{"type":"doc","content":[{"type":"paragraph","content":[{"type":"text","text":"' + content + '"}]}]},"selection":{"type":"text","anchor":1,"head":1}' + '}',
+                "__type": "RichTextField"
+            },
+            "title": content,
+            "nativeWidth": 200,
+            "author": "Bill Buxton",
+            "creationDate": {
+                "date": datetime.datetime.utcnow().microsecond,
+                "__type": "date"
+            },
+            "isPrototype": True,
+            "autoHeight": True,
+            "page": -1,
+            "nativeHeight": 200,
+            "height": 200,
+            "data_text": content
+        },
+        "__type": "Doc"
+    }
+
+    db.newDocuments.insert_one(view_doc)
+    db.newDocuments.insert_one(data_doc)
+
+    return view_doc_guid
+
+
 def write_image(folder, name):
     path = f"http://localhost:1050/files/{folder}/{name}"
 
@@ -253,7 +307,7 @@ def parse_document(file_name: str):
         while lines[cur] != "Image":
             link_descriptions.append(lines[cur].strip())
             cur += 1
-        result["link_descriptions"] = listify(link_descriptions)
+        result["link_descriptions"] = text_doc_map(link_descriptions)
 
         result["hyperlinks"] = extract_links(source + "/" + file_name)
 
@@ -265,7 +319,8 @@ def parse_document(file_name: str):
             captions.append(lines[cur + 1])
             cur += 2
         result["images"] = listify(images)
-        result["captions"] = listify(captions)
+
+        result["captions"] = text_doc_map(captions)
 
         notes = []
         if (cur < len(lines) and lines[cur] == "NOTES:"):
