aboutsummaryrefslogtreecommitdiff
path: root/src/scraping/acm
diff options
context:
space:
mode:
Diffstat (limited to 'src/scraping/acm')
-rw-r--r--src/scraping/acm/index.js90
-rw-r--r--src/scraping/acm/results.txt47
2 files changed, 105 insertions, 32 deletions
diff --git a/src/scraping/acm/index.js b/src/scraping/acm/index.js
index 39938ecca..be844da31 100644
--- a/src/scraping/acm/index.js
+++ b/src/scraping/acm/index.js
@@ -7,9 +7,13 @@ const {
writeFile
} = require('fs');
+const target_source = './citations.txt';
+const target_browser = 'chrome';
+const target_dist = './results.txt';
+
const driver_pause = 500; // milliseconds
const sample_line_char_max = 100; // characters
-const target_browser = 'chrome';
+
const tab_map = {
abstract: "11",
authors: "14",
@@ -22,16 +26,29 @@ const tab_map = {
table_of_contents: "21"
};
+String.prototype.removeAll = function (replacements, trim = true) {
+ let result = this;
+ for (let expression of replacements) {
+ result = result.replace(expression, "");
+ }
+ return trim ? result.trim() : result;
+};
+
+String.prototype.remove = function (replacement, trim = true) {
+ let result = this.replace(replacement, "");
+ return trim ? result.trim() : result;
+};
+
+Object.prototype.first = function () {
+ return this[Object.keys(this)[0]];
+};
+
// GENERAL UTILITY FUNCTIONS
function log_read(content) {
process.stdout.write("reading " + content + "...");
}
-function first_value(object) {
- return object[Object.keys(object)[0]];
-}
-
function log_snippet(result, quotes = true) {
let snippet = "failed to create snippet";
switch (typeof result) {
@@ -53,11 +70,11 @@ function log_snippet(result, quotes = true) {
case "string":
return res.substring(0, sample_line_char_max / result.length);
case "object":
- return first_value(res);
+ return res.first();
}
}).join(', ');
} else {
- snippet = first_value(result);
+ snippet = result.first();
}
}
console.log(snippet);
@@ -130,9 +147,40 @@ async function read_authors() {
async function read_publication() {
let publciation_elements = (await text_of("source-body")).split("\n");
let publication_module = {};
+
+ let extract = (regex, target, index = 1) => regex.exec(target)[index];
+
for (let element of publciation_elements) {
+
+ let location = /Volume (\d+) Issue (\d+), ([\w.\d]+)/g;
+ let pages = /(\d+)-(\d+)/g;
+ let publication_date = /(\d{4}-\d{2}-\d{2})/g;
+ let publisher = /Publisher (.*)/g;
+ let issn = /ISSN: (\d{4}-\d{4})/g;
+ let eissn = /EISSN: ([\dA-Z]{4}-[\dA-Z]{4})/g;
+ let doi = /doi>([\.\d\/A-Z]+)/g;
+
if (element.startsWith("Title")) {
- publication_module.title = element.substring(6);
+ publication_module.name = element.substring(6).removeAll(["table of contents", "archive", /\w+ Homepage/]);
+ } else if (element.startsWith("Volume")) {
+ let match = location.exec(element);
+ publication_module.volume = parseInt(match[1]);
+ publication_module.issue = parseInt(match[2]);
+ publication_module.month = match[3];
+ } else if (element.startsWith("Pages")) {
+ let match = pages.exec(element);
+ publication_module.page_start = parseInt(match[1]);
+ publication_module.page_end = parseInt(match[2]);
+ } else if (element.startsWith("Publication Date ")) {
+ publication_module.publication_date = extract(publication_date, element);
+ } else if (element.startsWith("Publisher ")) {
+ publication_module.publisher = extract(publisher, element);
+ } else if (element.startsWith("ISSN: ")) {
+ publication_module.issn = extract(issn, element);
+ if (element.includes("EISSN: ")) {
+ publication_module.eissn = extract(eissn, element);
+ }
+ publication_module.doi = extract(doi, element);
}
}
return publication_module;
@@ -153,8 +201,8 @@ function parse_authors(metadata) {
while (attr[char] != " ") {
char--;
}
- let key = attr.substring(0, char).toLowerCase().replace(/ /g, "_").replace(/[\(\)]/g, "");
- let value = parseFloat(attr.substring(char + 1).replace(/,/g, ""));
+ let key = attr.substring(0, char).toLowerCase().replace(/ /g, "_").remove(/[\(\)]/g);
+ let value = parseFloat(attr.substring(char + 1).remove(/,/g));
author[key] = value;
}
return author;
@@ -165,7 +213,7 @@ function write_results() {
let output = "";
results.forEach(res => output += (JSON.stringify(res, null, 4) + "\n"));
- writeFile("./results.txt", output, function errorHandler(exception) {
+ writeFile(target_dist, output, function errorHandler(exception) {
console.log(exception || "scraped references successfully written as JSON to ./results.txt");
});
}
@@ -176,7 +224,7 @@ async function scrape_targets(error, data) {
return;
}
- let references = data.split("\n").map(entry => entry.replace("\r", "")).filter(line => line.match(/\d+/g));
+ let references = data.split("\n").map(entry => entry.removeAll(["\r"])).filter(line => line.match(/\d+/g));
let quota = references.length;
log_snippet(`found ${quota} references to scrape`, false);
@@ -185,6 +233,7 @@ async function scrape_targets(error, data) {
for (let i = 0; i < quota; i++) {
try {
result = {};
+ let target;
let id = references[i];
let url = `https://dl.acm.org/citation.cfm?id=${id}`;
@@ -194,13 +243,18 @@ async function scrape_targets(error, data) {
logged_assign("url", url);
logged_assign("title", await text_of('//*[@id="divmain"]/div/h1'));
- logged_assign("abstract", (await text_of_all("abstract-body")).join(" "));
- await click_on_acm_tab("authors");
- logged_assign("authors", (await read_authors()).map(parse_authors));
+ target = "abstract";
+ await click_on_acm_tab(target);
+ logged_assign(target, (await text_of_all("abstract-body")).join(" "));
+
+ target = "authors";
+ await click_on_acm_tab(target);
+ logged_assign(target, (await read_authors()).map(parse_authors));
- await click_on_acm_tab("publication");
- logged_assign("publication", await read_publication());
+ target = "publication";
+ await click_on_acm_tab(target);
+ logged_assign(target, await read_publication());
} catch (e) {
console.log(e);
await driver.quit();
@@ -220,6 +274,6 @@ let result = {};
log_read("target references");
-readFile("./citations.txt", {
+readFile(target_source, {
encoding: "utf8"
}, scrape_targets); \ No newline at end of file
diff --git a/src/scraping/acm/results.txt b/src/scraping/acm/results.txt
index ba66d61a7..a15da8b10 100644
--- a/src/scraping/acm/results.txt
+++ b/src/scraping/acm/results.txt
@@ -10,10 +10,10 @@
"publication_count": 1,
"citation_count": 179,
"available_for_download": 1,
- "downloads_6_weeks": 123,
- "downloads_12_months": 922,
- "downloads_cumulative": 9793,
- "average_downloads_per_article": 9793,
+ "downloads_6_weeks": 124,
+ "downloads_12_months": 923,
+ "downloads_cumulative": 9794,
+ "average_downloads_per_article": 9794,
"average_citations_per_article": 179
},
{
@@ -23,10 +23,10 @@
"publication_count": 5,
"citation_count": 196,
"available_for_download": 1,
- "downloads_6_weeks": 123,
- "downloads_12_months": 922,
- "downloads_cumulative": 9793,
- "average_downloads_per_article": 9793,
+ "downloads_6_weeks": 124,
+ "downloads_12_months": 923,
+ "downloads_cumulative": 9794,
+ "average_downloads_per_article": 9794,
"average_citations_per_article": 39.2
},
{
@@ -36,15 +36,25 @@
"publication_count": 2,
"citation_count": 188,
"available_for_download": 2,
- "downloads_6_weeks": 123,
- "downloads_12_months": 927,
- "downloads_cumulative": 10024,
- "average_downloads_per_article": 5012,
+ "downloads_6_weeks": 124,
+ "downloads_12_months": 928,
+ "downloads_cumulative": 10025,
+ "average_downloads_per_article": 5012.5,
"average_citations_per_article": 94
}
],
"publication": {
- "title": "Journal of the ACM (JACM) JACM Homepage table of contents archive"
+ "name": "Journal of the ACM (JACM)",
+ "volume": 7,
+ "issue": 4,
+ "month": "Oct.",
+ "page_start": 326,
+ "page_end": 329,
+ "publication_date": "1960-10-01",
+ "publisher": "ACM New York, NY, USA",
+ "issn": "0004-5411",
+ "eissn": "1557-735X",
+ "doi": "10.1145/321043.321046"
}
}
{
@@ -67,6 +77,15 @@
}
],
"publication": {
- "title": "IEEE Transactions on Software Engineering table of contents archive"
+ "name": "IEEE Transactions on Software Engineering",
+ "volume": 1,
+ "issue": 1,
+ "month": "March",
+ "page_start": 384,
+ "page_end": 389,
+ "publication_date": "1975-03-01",
+ "publisher": "IEEE Press Piscataway, NJ, USA",
+ "issn": "0098-5589",
+ "doi": "10.1109/TSE.1975.6312869"
}
}