Diffstat (limited to 'src/scraping')
-rw-r--r--  src/scraping/acm/index.js      | 138
-rw-r--r--  src/scraping/acm/results.txt   |  53
-rw-r--r--  src/scraping/buxton/scraper.py |  61
3 files changed, 212 insertions, 40 deletions
diff --git a/src/scraping/acm/index.js b/src/scraping/acm/index.js
index ff4b099e7..51781dba8 100644
--- a/src/scraping/acm/index.js
+++ b/src/scraping/acm/index.js
@@ -7,9 +7,41 @@ const { writeFile } = require('fs');
+const target_source = './citations.txt';
+const target_browser = 'chrome';
+const target_dist = './results.txt';
+
 const driver_pause = 500; // milliseconds
 const sample_line_char_max = 100; // characters
-const target_browser = 'chrome';
+
+const tab_map = {
+    abstract: "11",
+    authors: "14",
+    references: "15",
+    cited_by: "16",
+    index_terms: "17",
+    publication: "18",
+    reviews: "19",
+    comments: "20",
+    table_of_contents: "21"
+};
+
+String.prototype.removeAll = function (replacements, trim = true) {
+    let result = this;
+    for (let expression of replacements) {
+        result = result.replace(expression, "");
+    }
+    return trim ? result.trim() : result;
+};
+
+String.prototype.remove = function (replacement, trim = true) {
+    let result = this.replace(replacement, "");
+    return trim ? result.trim() : result;
+};
+
+Object.prototype.first = function () {
+    return this[Object.keys(this)[0]];
+};
 
 // GENERAL UTILITY FUNCTIONS
 
@@ -32,14 +64,18 @@ function log_snippet(result, quotes = true) {
             snippet = quotes ? `"${snippet}"` : snippet;
             break;
         case "object":
-            snippet = result.map(res => {
-                switch (typeof res) {
-                    case "string":
-                        return res.substring(0, sample_line_char_max / result.length);
-                    case "object":
-                        return res[Object.keys(res)[0]];
-                }
-            }).join(', ');
+            if (Array.isArray(result)) {
+                snippet = result.map(res => {
+                    switch (typeof res) {
+                        case "string":
+                            return res.substring(0, sample_line_char_max / result.length);
+                        case "object":
+                            return res.first();
+                    }
+                }).join(', ');
+            } else {
+                snippet = result.first();
+            }
     }
     console.log(snippet);
     return result;
 }
@@ -57,6 +93,10 @@ async function click_on(ref) {
     await driver.sleep(driver_pause);
 }
 
+async function click_on_acm_tab(target) {
+    await click_on(`//*[@id="tab-10${tab_map[target]}-btnInnerEl"]/span`);
+}
+
 async function locate(ref, multiple = false) {
     let locator = ref.startsWith("//") ? By.xpath(ref) : By.id(ref);
     return await multiple ? driver.findElements(locator) : driver.findElement(locator);
 }
 
@@ -67,13 +107,13 @@ async function text_of(ref) {
     let element = await locate(ref);
     return await element.getText();
 }
 
-async function text_of_all(ref) {
+async function text_of_all(ref, delimiter = undefined) {
     let elements = await locate(ref, true);
     let results = [];
     for (let element of elements) {
         results.push(await element.getText());
     }
-    return results;
+    return delimiter ? results.join(delimiter) : results;
 }
 
 async function logged_assign(key, value) {
@@ -84,8 +124,6 @@ async function logged_assign(key, value) {
 // TEXT SCRAPING
 
 async function read_authors() {
-    await click_on('//*[@id="tab-1014-btnInnerEl"]/span');
-
     let authors = await text_of('//*[@id="tabpanel-1009-body"]');
     let sanitize = line => line.length > 0 && !(line.startsWith("No contact information") || line.startsWith("View colleagues of") || line.startsWith("Bibliometrics:"));
     let author_lines = authors.split("\n").map(line => line.trim()).filter(sanitize);
@@ -103,12 +141,54 @@ async function read_authors() {
         i++;
     }
 
-    return all_authors;
+    return all_authors.map(parse_author);
+}
+
+async function read_publication() {
+    let publication_elements = (await text_of("source-body")).split("\n");
+    let publication_module = {};
+
+    let extract = (regex, target, index = 1) => regex.exec(target)[index];
+
+    for (let element of publication_elements) {
+
+        let location = /Volume (\d+) Issue (\d+), ([\w.\d]+)/g;
+        let pages = /(\d+)-(\d+)/g;
+        let publication_date = /(\d{4}-\d{2}-\d{2})/g;
+        let publisher = /Publisher (.*)/g;
+        let issn = /ISSN: (\d{4}-\d{4})/g;
+        let eissn = /EISSN: ([\dA-Z]{4}-[\dA-Z]{4})/g;
+        let doi = /doi>([\.\d\/A-Z]+)/g;
+
+        if (element.startsWith("Title")) {
+            publication_module.name = element.substring(6).removeAll(["table of contents", "archive", /\w+ Homepage/]);
+        } else if (element.startsWith("Volume ")) {
+            let match = location.exec(element);
+            publication_module.volume = parseInt(match[1]);
+            publication_module.issue = parseInt(match[2]);
+            publication_module.month = match[3];
+        } else if (element.startsWith("Pages ")) {
+            let match = pages.exec(element);
+            publication_module.page_start = parseInt(match[1]);
+            publication_module.page_end = parseInt(match[2]);
+        } else if (element.startsWith("Publication Date ")) {
+            publication_module.publication_date = extract(publication_date, element);
+        } else if (element.startsWith("Publisher ")) {
+            publication_module.publisher = extract(publisher, element);
+        } else if (element.startsWith("ISSN: ")) {
+            publication_module.issn = extract(issn, element);
+            if (element.includes("EISSN: ")) {
+                publication_module.eissn = extract(eissn, element);
+            }
+            publication_module.doi = extract(doi, element);
+        }
+    }
+    return publication_module;
 }
 
 // JSON / DASH CONVERSION AND EXPORT
 
-function parse_authors(metadata) {
+function parse_author(metadata) {
     let publicationYears = metadata[1].substring(18).split("-");
     author = {
         name: metadata[0],
@@ -121,8 +201,8 @@
         while (attr[char] != " ") {
             char--;
         }
-        let key = attr.substring(0, char).toLowerCase().replace(/ /g, "_").replace(/[\(\)]/g, "");
-        let value = parseFloat(attr.substring(char + 1).replace(/,/g, ""));
+        let key = attr.substring(0, char).toLowerCase().replace(/ /g, "_").remove(/[\(\)]/g);
+        let value = parseFloat(attr.substring(char + 1).remove(/,/g));
         author[key] = value;
     }
     return author;
@@ -133,8 +213,8 @@ function write_results() {
     let output = "";
     results.forEach(res => output += (JSON.stringify(res, null, 4) + "\n"));
 
-    writeFile("./results.txt", output, function errorHandler(exception) {
-        console.log(exception || "scraped references successfully written as JSON to ./results.txt\n");
+    writeFile(target_dist, output, function errorHandler(exception) {
+        console.log(exception || `scraped references successfully written as JSON to ${target_dist}`);
     });
 }
 
@@ -144,7 +224,7 @@ async function scrape_targets(error, data) {
         return;
     }
 
-    let references = data.split("\n").map(entry => entry.replace("\r", "")).filter(line => line.match(/\d+/g));
+    let references = data.split("\n").map(entry => entry.removeAll(["\r"])).filter(line => line.match(/\d+/g));
     let quota = references.length;
 
     log_snippet(`found ${quota} references to scrape`, false);
@@ -153,16 +233,28 @@ async function scrape_targets(error, data) {
     for (let i = 0; i < quota; i++) {
         try {
             result = {};
+            let target;
             let id = references[i];
             let url = `https://dl.acm.org/citation.cfm?id=${id}`;
             console.log(`\nscraping ${i + 1}/${quota} (${id})`);
+
             await navigate_to(url);
             logged_assign("url", url);
             logged_assign("title", await text_of('//*[@id="divmain"]/div/h1'));
-            logged_assign("abstract", (await text_of_all("abstract-body")).join(" "));
-            logged_assign("authors", (await read_authors()).map(parse_authors));
+
+            target = "abstract";
+            await click_on_acm_tab(target);
+            logged_assign(target, await text_of_all("abstract-body", " "));
+
+            target = "authors";
+            await click_on_acm_tab(target);
+            logged_assign(target, await read_authors());
+
+            target = "publication";
+            await click_on_acm_tab(target);
+            logged_assign(target, await read_publication());
         } catch (e) {
             console.log(e);
             await driver.quit();
@@ -182,6 +274,6 @@ let result = {};
 
 log_read("target references");
 
-readFile("./citations.txt", {
+readFile(target_source, {
     encoding: "utf8"
 }, scrape_targets);
\ No newline at end of file
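Note on the new String helpers: String.prototype.replace with a string or non-global RegExp pattern removes only the first occurrence, which is all removeAll relies on here. A minimal standalone sketch of the intended behavior (the sample title line is made up for illustration; no Selenium required):

    String.prototype.removeAll = function (replacements, trim = true) {
        // Strip the first occurrence of each pattern, then trim by default.
        let result = this;
        for (let expression of replacements) {
            result = result.replace(expression, "");
        }
        return trim ? result.trim() : result;
    };

    // A made-up ACM-style title line, decorated the way read_publication expects:
    const raw = "Journal of the ACM (JACM) table of contents archive JACM Homepage";
    console.log(raw.removeAll(["table of contents", "archive", /\w+ Homepage/]));
    // prints: Journal of the ACM (JACM)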
diff --git a/src/scraping/acm/results.txt b/src/scraping/acm/results.txt
index fffa7ff51..a15da8b10 100644
--- a/src/scraping/acm/results.txt
+++ b/src/scraping/acm/results.txt
@@ -10,10 +10,10 @@
             "publication_count": 1,
             "citation_count": 179,
             "available_for_download": 1,
-            "downloads_6_weeks": 130,
-            "downloads_12_months": 1004,
-            "downloads_cumulative": 9792,
-            "average_downloads_per_article": 9792,
+            "downloads_6_weeks": 124,
+            "downloads_12_months": 923,
+            "downloads_cumulative": 9794,
+            "average_downloads_per_article": 9794,
             "average_citations_per_article": 179
         },
         {
@@ -23,10 +23,10 @@
             "publication_count": 5,
             "citation_count": 196,
             "available_for_download": 1,
-            "downloads_6_weeks": 130,
-            "downloads_12_months": 1004,
-            "downloads_cumulative": 9792,
-            "average_downloads_per_article": 9792,
+            "downloads_6_weeks": 124,
+            "downloads_12_months": 923,
+            "downloads_cumulative": 9794,
+            "average_downloads_per_article": 9794,
             "average_citations_per_article": 39.2
         },
         {
@@ -36,13 +36,26 @@
             "publication_count": 2,
             "citation_count": 188,
             "available_for_download": 2,
-            "downloads_6_weeks": 130,
-            "downloads_12_months": 1009,
-            "downloads_cumulative": 10023,
-            "average_downloads_per_article": 5011.5,
+            "downloads_6_weeks": 124,
+            "downloads_12_months": 928,
+            "downloads_cumulative": 10025,
+            "average_downloads_per_article": 5012.5,
             "average_citations_per_article": 94
         }
-    ]
+    ],
+    "publication": {
+        "name": "Journal of the ACM (JACM)",
+        "volume": 7,
+        "issue": 4,
+        "month": "Oct.",
+        "page_start": 326,
+        "page_end": 329,
+        "publication_date": "1960-10-01",
+        "publisher": "ACM New York, NY, USA",
+        "issn": "0004-5411",
+        "eissn": "1557-735X",
+        "doi": "10.1145/321043.321046"
+    }
 }
 {
     "url": "https://dl.acm.org/citation.cfm?id=2412979",
@@ -62,5 +75,17 @@
             "average_downloads_per_article": 0,
             "average_citations_per_article": 0
         }
-    ]
+    ],
+    "publication": {
+        "name": "IEEE Transactions on Software Engineering",
+        "volume": 1,
+        "issue": 1,
+        "month": "March",
+        "page_start": 384,
+        "page_end": 389,
+        "publication_date": "1975-03-01",
+        "publisher": "IEEE Press Piscataway, NJ, USA",
+        "issn": "0098-5589",
+        "doi": "10.1109/TSE.1975.6312869"
+    }
 }
diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py
index 02c6d8b74..700269727 100644
--- a/src/scraping/buxton/scraper.py
+++ b/src/scraping/buxton/scraper.py
@@ -26,7 +26,7 @@ def extract_links(fileName):
         item = rels[rel]
         if item.reltype == RT.HYPERLINK and ".aspx" not in item._target:
             links.append(item._target)
-    return listify(links)
+    return text_doc_map(links)
 
 
 def extract_value(kv_string):
@@ -60,6 +60,12 @@ def protofy(fieldId):
     }
 
 
+def text_doc_map(string_list):
+    def guid_map(caption):
+        return write_text_doc(caption)
+    return listify(proxify_guids(list(map(guid_map, string_list))))
+
+
 def write_schema(parse_results, display_fields, storage_key):
     view_guids = parse_results["child_guids"]
 
@@ -110,6 +116,54 @@ def write_schema(parse_results, display_fields, storage_key):
     return view_doc_guid
 
 
+def write_text_doc(content):
+    data_doc_guid = guid()
+    view_doc_guid = guid()
+
+    view_doc = {
+        "_id": view_doc_guid,
+        "fields": {
+            "proto": protofy(data_doc_guid),
+            "x": 10,
+            "y": 10,
+            "width": 400,
+            "zIndex": 2,
+            "libraryBrush": False
+        },
+        "__type": "Doc"
+    }
+
+    data_doc = {
+        "_id": data_doc_guid,
+        "fields": {
+            "proto": protofy("textProto"),
+            "data": {
+                "Data": '{"doc":{"type":"doc","content":[{"type":"paragraph","content":[{"type":"text","text":"' + content + '"}]}]},"selection":{"type":"text","anchor":1,"head":1}' + '}',
+                "__type": "RichTextField"
+            },
+            "title": content,
+            "nativeWidth": 200,
+            "author": "Bill Buxton",
+            "creationDate": {
+                "date": datetime.datetime.utcnow().microsecond,
+                "__type": "date"
+            },
+            "isPrototype": True,
+            "autoHeight": True,
+            "page": -1,
+            "nativeHeight": 200,
+            "height": 200,
+            "data_text": content
+        },
+        "__type": "Doc"
+    }
+
+    db.newDocuments.insert_one(view_doc)
+    db.newDocuments.insert_one(data_doc)
+
+    return view_doc_guid
+
+
 def write_image(folder, name):
     path = f"http://localhost:1050/files/{folder}/{name}"
 
@@ -253,7 +307,7 @@ def parse_document(file_name: str):
         while lines[cur] != "Image":
             link_descriptions.append(lines[cur].strip())
             cur += 1
-        result["link_descriptions"] = listify(link_descriptions)
+        result["link_descriptions"] = text_doc_map(link_descriptions)
 
         result["hyperlinks"] = extract_links(source + "/" + file_name)
 
@@ -265,7 +319,8 @@ def parse_document(file_name: str):
             captions.append(lines[cur + 1])
             cur += 2
         result["images"] = listify(images)
-        result["captions"] = listify(captions)
+
+        result["captions"] = text_doc_map(captions)
 
         notes = []
         if (cur < len(lines) and lines[cur] == "NOTES:"):
