From 06bd486c72342b4d979245c9f4051156e6492541 Mon Sep 17 00:00:00 2001 From: Sam Wilkins Date: Tue, 25 Jun 2019 21:22:29 -0400 Subject: scraping progress --- src/buxton/scraper.py | 331 --------------------- src/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx | Bin 1675500 -> 0 bytes src/buxton/source/Bill_Notes_Braun_T3.docx | Bin 1671968 -> 0 bytes src/buxton/source/Bill_Notes_CasioC801.docx | Bin 574664 -> 0 bytes src/buxton/source/Bill_Notes_Casio_Mini.docx | Bin 581069 -> 0 bytes .../source/Bill_Notes_FingerWorks_Prototype.docx | Bin 585090 -> 0 bytes .../source/Bill_Notes_Fingerworks_TouchStream.docx | Bin 1722555 -> 0 bytes src/buxton/source/Bill_Notes_FrogPad.docx | Bin 840173 -> 0 bytes src/buxton/source/Bill_Notes_Gavilan_SC.docx | Bin 1695290 -> 0 bytes .../source/Bill_Notes_Grandjean_Stenotype.docx | Bin 2094142 -> 0 bytes src/buxton/source/Bill_Notes_Matias.docx | Bin 590407 -> 0 bytes src/buxton/source/Bill_Notes_MousePen.docx | Bin 505322 -> 0 bytes src/buxton/source/Bill_Notes_NewO.docx | Bin 2264571 -> 0 bytes src/buxton/source/Bill_Notes_OLPC.docx | Bin 6883659 -> 0 bytes src/buxton/source/Bill_Notes_PARCkbd.docx | Bin 631959 -> 0 bytes .../source/Bill_Notes_Philco_Mystery_Control.docx | Bin 1994439 -> 0 bytes src/buxton/source/Bill_Notes_TASA_Kbd.docx | Bin 461199 -> 0 bytes src/buxton/source/Bill_Notes_The_Tap.docx | Bin 711321 -> 0 bytes src/client/util/ClientUtils.ts | 4 + src/scraping/acm/chromedriver.exe | Bin 0 -> 7477760 bytes src/scraping/acm/citations.txt | 2 + src/scraping/acm/debug.log | 38 +++ src/scraping/acm/index.js | 88 ++++++ src/scraping/acm/package.json | 17 ++ src/scraping/acm/results.txt | 64 ++++ src/scraping/buxton/scraper.py | 331 +++++++++++++++++++++ .../buxton/source/Bill_Notes_Bill_Notes_CyKey.docx | Bin 0 -> 1675500 bytes .../buxton/source/Bill_Notes_Braun_T3.docx | Bin 0 -> 1671968 bytes .../buxton/source/Bill_Notes_CasioC801.docx | Bin 0 -> 574664 bytes .../buxton/source/Bill_Notes_Casio_Mini.docx | Bin 0 -> 581069 
bytes .../source/Bill_Notes_FingerWorks_Prototype.docx | Bin 0 -> 585090 bytes .../source/Bill_Notes_Fingerworks_TouchStream.docx | Bin 0 -> 1722555 bytes src/scraping/buxton/source/Bill_Notes_FrogPad.docx | Bin 0 -> 840173 bytes .../buxton/source/Bill_Notes_Gavilan_SC.docx | Bin 0 -> 1695290 bytes .../source/Bill_Notes_Grandjean_Stenotype.docx | Bin 0 -> 2094142 bytes src/scraping/buxton/source/Bill_Notes_Matias.docx | Bin 0 -> 590407 bytes .../buxton/source/Bill_Notes_MousePen.docx | Bin 0 -> 505322 bytes src/scraping/buxton/source/Bill_Notes_NewO.docx | Bin 0 -> 2264571 bytes src/scraping/buxton/source/Bill_Notes_OLPC.docx | Bin 0 -> 6883659 bytes src/scraping/buxton/source/Bill_Notes_PARCkbd.docx | Bin 0 -> 631959 bytes .../source/Bill_Notes_Philco_Mystery_Control.docx | Bin 0 -> 1994439 bytes .../buxton/source/Bill_Notes_TASA_Kbd.docx | Bin 0 -> 461199 bytes src/scraping/buxton/source/Bill_Notes_The_Tap.docx | Bin 0 -> 711321 bytes 43 files changed, 544 insertions(+), 331 deletions(-) delete mode 100644 src/buxton/scraper.py delete mode 100644 src/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx delete mode 100644 src/buxton/source/Bill_Notes_Braun_T3.docx delete mode 100644 src/buxton/source/Bill_Notes_CasioC801.docx delete mode 100644 src/buxton/source/Bill_Notes_Casio_Mini.docx delete mode 100644 src/buxton/source/Bill_Notes_FingerWorks_Prototype.docx delete mode 100644 src/buxton/source/Bill_Notes_Fingerworks_TouchStream.docx delete mode 100644 src/buxton/source/Bill_Notes_FrogPad.docx delete mode 100644 src/buxton/source/Bill_Notes_Gavilan_SC.docx delete mode 100644 src/buxton/source/Bill_Notes_Grandjean_Stenotype.docx delete mode 100644 src/buxton/source/Bill_Notes_Matias.docx delete mode 100644 src/buxton/source/Bill_Notes_MousePen.docx delete mode 100644 src/buxton/source/Bill_Notes_NewO.docx delete mode 100644 src/buxton/source/Bill_Notes_OLPC.docx delete mode 100644 src/buxton/source/Bill_Notes_PARCkbd.docx delete mode 100644 
src/buxton/source/Bill_Notes_Philco_Mystery_Control.docx delete mode 100644 src/buxton/source/Bill_Notes_TASA_Kbd.docx delete mode 100644 src/buxton/source/Bill_Notes_The_Tap.docx create mode 100644 src/client/util/ClientUtils.ts create mode 100644 src/scraping/acm/chromedriver.exe create mode 100644 src/scraping/acm/citations.txt create mode 100644 src/scraping/acm/debug.log create mode 100644 src/scraping/acm/index.js create mode 100644 src/scraping/acm/package.json create mode 100644 src/scraping/acm/results.txt create mode 100644 src/scraping/buxton/scraper.py create mode 100644 src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx create mode 100644 src/scraping/buxton/source/Bill_Notes_Braun_T3.docx create mode 100644 src/scraping/buxton/source/Bill_Notes_CasioC801.docx create mode 100644 src/scraping/buxton/source/Bill_Notes_Casio_Mini.docx create mode 100644 src/scraping/buxton/source/Bill_Notes_FingerWorks_Prototype.docx create mode 100644 src/scraping/buxton/source/Bill_Notes_Fingerworks_TouchStream.docx create mode 100644 src/scraping/buxton/source/Bill_Notes_FrogPad.docx create mode 100644 src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx create mode 100644 src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx create mode 100644 src/scraping/buxton/source/Bill_Notes_Matias.docx create mode 100644 src/scraping/buxton/source/Bill_Notes_MousePen.docx create mode 100644 src/scraping/buxton/source/Bill_Notes_NewO.docx create mode 100644 src/scraping/buxton/source/Bill_Notes_OLPC.docx create mode 100644 src/scraping/buxton/source/Bill_Notes_PARCkbd.docx create mode 100644 src/scraping/buxton/source/Bill_Notes_Philco_Mystery_Control.docx create mode 100644 src/scraping/buxton/source/Bill_Notes_TASA_Kbd.docx create mode 100644 src/scraping/buxton/source/Bill_Notes_The_Tap.docx (limited to 'src') diff --git a/src/buxton/scraper.py b/src/buxton/scraper.py deleted file mode 100644 index 043fd3cf4..000000000 --- a/src/buxton/scraper.py +++ 
/dev/null @@ -1,331 +0,0 @@ -import os -import docx2txt -from docx import Document -from docx.opc.constants import RELATIONSHIP_TYPE as RT -import re -from pymongo import MongoClient -import shutil -import uuid -import datetime -from PIL import Image -import math -import sys - -source = "./source" -dist = "../server/public/files" - -db = MongoClient("localhost", 27017)["Dash"] -schema_guids = [] - - -def extract_links(fileName): - links = [] - doc = Document(fileName) - rels = doc.part.rels - for rel in rels: - item = rels[rel] - if item.reltype == RT.HYPERLINK and ".aspx" not in item._target: - links.append(item._target) - return listify(links) - - -def extract_value(kv_string): - pieces = kv_string.split(":") - return (pieces[1] if len(pieces) > 1 else kv_string).strip() - - -def mkdir_if_absent(path): - try: - if not os.path.exists(path): - os.mkdir(path) - except OSError: - print("failed to create the appropriate directory structures for %s" % file_name) - - -def guid(): - return str(uuid.uuid4()) - - -def listify(list): - return { - "fields": list, - "__type": "list" - } - - -def protofy(fieldId): - return { - "fieldId": fieldId, - "__type": "proxy" - } - - -def write_schema(parse_results, display_fields): - view_guids = parse_results["child_guids"] - - data_doc = parse_results["schema"] - fields = data_doc["fields"] - - view_doc_guid = guid() - - view_doc = { - "_id": view_doc_guid, - "fields": { - "proto": protofy(data_doc["_id"]), - "x": 10, - "y": 10, - "width": 900, - "height": 600, - "panX": 0, - "panY": 0, - "zoomBasis": 0.5, - "zIndex": 2, - "libraryBrush": False, - "viewType": 2 - }, - "__type": "Doc" - } - - fields["proto"] = protofy("collectionProto") - fields["data"] = listify(proxify_guids(view_guids)) - fields["schemaColumns"] = listify(display_fields) - fields["backgroundColor"] = "white" - fields["scale"] = 0.5 - fields["viewType"] = 2 - fields["author"] = "Bill Buxton" - fields["creationDate"] = { - "date": 
datetime.datetime.utcnow().microsecond, - "__type": "date" - } - fields["isPrototype"] = True - fields["page"] = -1 - - db.newDocuments.insert_one(data_doc) - db.newDocuments.insert_one(view_doc) - - data_doc_guid = data_doc["_id"] - print(f"inserted view document ({view_doc_guid})") - print(f"inserted data document ({data_doc_guid})\n") - - return view_doc_guid - - -def write_image(folder, name): - path = f"http://localhost:1050/files/{folder}/{name}" - - data_doc_guid = guid() - view_doc_guid = guid() - - view_doc = { - "_id": view_doc_guid, - "fields": { - "proto": protofy(data_doc_guid), - "x": 10, - "y": 10, - "width": 300, - "zIndex": 2, - "libraryBrush": False - }, - "__type": "Doc" - } - - image = Image.open(f"{dist}/{folder}/{name}") - native_width, native_height = image.size - - data_doc = { - "_id": data_doc_guid, - "fields": { - "proto": protofy("imageProto"), - "data": { - "url": path, - "__type": "image" - }, - "title": name, - "nativeWidth": native_width, - "author": "Bill Buxton", - "creationDate": { - "date": datetime.datetime.utcnow().microsecond, - "__type": "date" - }, - "isPrototype": True, - "page": -1, - "nativeHeight": native_height, - "height": native_height - }, - "__type": "Doc" - } - - db.newDocuments.insert_one(view_doc) - db.newDocuments.insert_one(data_doc) - - return view_doc_guid - - -def parse_document(file_name: str): - print(f"parsing {file_name}...") - pure_name = file_name.split(".")[0] - - result = {} - - dir_path = dist + "/" + pure_name - mkdir_if_absent(dir_path) - - raw = str(docx2txt.process(source + "/" + file_name, dir_path)) - - view_guids = [] - count = 0 - for image in os.listdir(dir_path): - count += 1 - view_guids.append(write_image(pure_name, image)) - os.rename(dir_path + "/" + image, dir_path + - "/" + image.replace(".", "_m.", 1)) - print(f"extracted {count} images...") - - def sanitize(line): return re.sub("[\n\t]+", "", line).replace(u"\u00A0", " ").replace( - u"\u2013", "-").replace(u"\u201c", 
'''"''').replace(u"\u201d", '''"''').strip() - - def sanitize_price(raw: str): - raw = raw.replace(",", "") - start = raw.find("$") - if start > -1: - i = start + 1 - while (i < len(raw) and re.match(r"[0-9\.]", raw[i])): - i += 1 - price = raw[start + 1: i + 1] - return float(price) - elif (raw.lower().find("nfs")): - return -1 - else: - return math.nan - - def remove_empty(line): return len(line) > 1 - - lines = list(map(sanitize, raw.split("\n"))) - lines = list(filter(remove_empty, lines)) - - result["file_name"] = file_name - result["title"] = lines[2].strip() - result["short_description"] = lines[3].strip().replace( - "Short Description: ", "") - - cur = 5 - notes = "" - while lines[cur] != "Device Details": - notes += lines[cur] + " " - cur += 1 - result["buxton_notes"] = notes.strip() - - cur += 1 - clean = list( - map(lambda data: data.strip().split(":"), lines[cur].split("|"))) - result["company"] = clean[0][len(clean[0]) - 1].strip() - result["year"] = clean[1][len(clean[1]) - 1].strip() - result["original_price"] = sanitize_price( - clean[2][len(clean[2]) - 1].strip()) - - cur += 1 - result["degrees_of_freedom"] = extract_value( - lines[cur]).replace("NA", "N/A") - cur += 1 - - dimensions = lines[cur].lower() - if dimensions.startswith("dimensions"): - dim_concat = dimensions[11:].strip() - cur += 1 - while lines[cur] != "Key Words": - dim_concat += (" " + lines[cur].strip()) - cur += 1 - result["dimensions"] = dim_concat - else: - result["dimensions"] = "N/A" - - cur += 1 - result["primary_key"] = extract_value(lines[cur]) - cur += 1 - result["secondary_key"] = extract_value(lines[cur]) - - while lines[cur] != "Links": - result["secondary_key"] += (" " + extract_value(lines[cur]).strip()) - cur += 1 - - cur += 1 - link_descriptions = [] - while lines[cur] != "Image": - link_descriptions.append(lines[cur].strip()) - cur += 1 - result["link_descriptions"] = listify(link_descriptions) - - result["hyperlinks"] = extract_links(source + "/" + file_name) - - 
images = [] - captions = [] - cur += 3 - while cur + 1 < len(lines) and lines[cur] != "NOTES:": - images.append(lines[cur]) - captions.append(lines[cur + 1]) - cur += 2 - result["images"] = listify(images) - result["captions"] = listify(captions) - - notes = [] - if (cur < len(lines) and lines[cur] == "NOTES:"): - cur += 1 - while cur < len(lines): - notes.append(lines[cur]) - cur += 1 - if len(notes) > 0: - result["notes"] = listify(notes) - - print("writing child schema...") - - return { - "schema": { - "_id": guid(), - "fields": result, - "__type": "Doc" - }, - "child_guids": view_guids - } - - -def proxify_guids(guids): - return list(map(lambda guid: {"fieldId": guid, "__type": "proxy"}, guids)) - - -if os.path.exists(dist): - shutil.rmtree(dist) -while os.path.exists(dist): - pass -os.mkdir(dist) -mkdir_if_absent(source) - -candidates = 0 -for file_name in os.listdir(source): - if file_name.endswith('.docx'): - candidates += 1 - schema_guids.append(write_schema( - parse_document(file_name), ["title", "data"])) - -print("writing parent schema...") -parent_guid = write_schema({ - "schema": { - "_id": guid(), - "fields": {}, - "__type": "Doc" - }, - "child_guids": schema_guids -}, ["title", "short_description", "original_price"]) - -print("appending parent schema to main workspace...\n") -db.newDocuments.update_one( - {"fields.title": "WS collection 1"}, - {"$push": {"fields.data.fields": {"fieldId": parent_guid, "__type": "proxy"}}} -) - -print("rewriting .gitignore...\n") -lines = ['*', '!.gitignore'] -with open(dist + "/.gitignore", 'w') as f: - f.write('\n'.join(lines)) - -suffix = "" if candidates == 1 else "s" -print(f"conversion complete. 
{candidates} candidate{suffix} processed.") diff --git a/src/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx b/src/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx deleted file mode 100644 index 06094b4d3..000000000 Binary files a/src/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx and /dev/null differ diff --git a/src/buxton/source/Bill_Notes_Braun_T3.docx b/src/buxton/source/Bill_Notes_Braun_T3.docx deleted file mode 100644 index 356697092..000000000 Binary files a/src/buxton/source/Bill_Notes_Braun_T3.docx and /dev/null differ diff --git a/src/buxton/source/Bill_Notes_CasioC801.docx b/src/buxton/source/Bill_Notes_CasioC801.docx deleted file mode 100644 index cd89fb97b..000000000 Binary files a/src/buxton/source/Bill_Notes_CasioC801.docx and /dev/null differ diff --git a/src/buxton/source/Bill_Notes_Casio_Mini.docx b/src/buxton/source/Bill_Notes_Casio_Mini.docx deleted file mode 100644 index a503cddfc..000000000 Binary files a/src/buxton/source/Bill_Notes_Casio_Mini.docx and /dev/null differ diff --git a/src/buxton/source/Bill_Notes_FingerWorks_Prototype.docx b/src/buxton/source/Bill_Notes_FingerWorks_Prototype.docx deleted file mode 100644 index 4d13a8cf5..000000000 Binary files a/src/buxton/source/Bill_Notes_FingerWorks_Prototype.docx and /dev/null differ diff --git a/src/buxton/source/Bill_Notes_Fingerworks_TouchStream.docx b/src/buxton/source/Bill_Notes_Fingerworks_TouchStream.docx deleted file mode 100644 index 578a1be08..000000000 Binary files a/src/buxton/source/Bill_Notes_Fingerworks_TouchStream.docx and /dev/null differ diff --git a/src/buxton/source/Bill_Notes_FrogPad.docx b/src/buxton/source/Bill_Notes_FrogPad.docx deleted file mode 100644 index d01e1bf5c..000000000 Binary files a/src/buxton/source/Bill_Notes_FrogPad.docx and /dev/null differ diff --git a/src/buxton/source/Bill_Notes_Gavilan_SC.docx b/src/buxton/source/Bill_Notes_Gavilan_SC.docx deleted file mode 100644 index 7bd28b376..000000000 Binary files a/src/buxton/source/Bill_Notes_Gavilan_SC.docx 
and /dev/null differ diff --git a/src/buxton/source/Bill_Notes_Grandjean_Stenotype.docx b/src/buxton/source/Bill_Notes_Grandjean_Stenotype.docx deleted file mode 100644 index 0615c4953..000000000 Binary files a/src/buxton/source/Bill_Notes_Grandjean_Stenotype.docx and /dev/null differ diff --git a/src/buxton/source/Bill_Notes_Matias.docx b/src/buxton/source/Bill_Notes_Matias.docx deleted file mode 100644 index 547603256..000000000 Binary files a/src/buxton/source/Bill_Notes_Matias.docx and /dev/null differ diff --git a/src/buxton/source/Bill_Notes_MousePen.docx b/src/buxton/source/Bill_Notes_MousePen.docx deleted file mode 100644 index 4e1056636..000000000 Binary files a/src/buxton/source/Bill_Notes_MousePen.docx and /dev/null differ diff --git a/src/buxton/source/Bill_Notes_NewO.docx b/src/buxton/source/Bill_Notes_NewO.docx deleted file mode 100644 index a514926d2..000000000 Binary files a/src/buxton/source/Bill_Notes_NewO.docx and /dev/null differ diff --git a/src/buxton/source/Bill_Notes_OLPC.docx b/src/buxton/source/Bill_Notes_OLPC.docx deleted file mode 100644 index bfca0a9bb..000000000 Binary files a/src/buxton/source/Bill_Notes_OLPC.docx and /dev/null differ diff --git a/src/buxton/source/Bill_Notes_PARCkbd.docx b/src/buxton/source/Bill_Notes_PARCkbd.docx deleted file mode 100644 index c0cf6ba9a..000000000 Binary files a/src/buxton/source/Bill_Notes_PARCkbd.docx and /dev/null differ diff --git a/src/buxton/source/Bill_Notes_Philco_Mystery_Control.docx b/src/buxton/source/Bill_Notes_Philco_Mystery_Control.docx deleted file mode 100644 index ad06903f3..000000000 Binary files a/src/buxton/source/Bill_Notes_Philco_Mystery_Control.docx and /dev/null differ diff --git a/src/buxton/source/Bill_Notes_TASA_Kbd.docx b/src/buxton/source/Bill_Notes_TASA_Kbd.docx deleted file mode 100644 index e4c659de9..000000000 Binary files a/src/buxton/source/Bill_Notes_TASA_Kbd.docx and /dev/null differ diff --git a/src/buxton/source/Bill_Notes_The_Tap.docx 
b/src/buxton/source/Bill_Notes_The_Tap.docx deleted file mode 100644 index 8ceebc71e..000000000 Binary files a/src/buxton/source/Bill_Notes_The_Tap.docx and /dev/null differ diff --git a/src/client/util/ClientUtils.ts b/src/client/util/ClientUtils.ts new file mode 100644 index 000000000..425bde14a --- /dev/null +++ b/src/client/util/ClientUtils.ts @@ -0,0 +1,4 @@ +//AUTO-GENERATED FILE: DO NOT EDIT +export namespace ClientUtils { + export const RELEASE = false; +} \ No newline at end of file diff --git a/src/scraping/acm/chromedriver.exe b/src/scraping/acm/chromedriver.exe new file mode 100644 index 000000000..6a362fd43 Binary files /dev/null and b/src/scraping/acm/chromedriver.exe differ diff --git a/src/scraping/acm/citations.txt b/src/scraping/acm/citations.txt new file mode 100644 index 000000000..e5018ddef --- /dev/null +++ b/src/scraping/acm/citations.txt @@ -0,0 +1,2 @@ +321046 +2412979 \ No newline at end of file diff --git a/src/scraping/acm/debug.log b/src/scraping/acm/debug.log new file mode 100644 index 000000000..8c0a148f4 --- /dev/null +++ b/src/scraping/acm/debug.log @@ -0,0 +1,38 @@ +[0625/170004.768:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/170004.769:ERROR:exception_snapshot_win.cc(98)] thread ID 17604 not found in process +[0625/171124.644:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/171124.645:ERROR:exception_snapshot_win.cc(98)] thread ID 14348 not found in process +[0625/171853.989:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. 
(0xc0000022) +[0625/171853.990:ERROR:exception_snapshot_win.cc(98)] thread ID 12080 not found in process +[0625/171947.744:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/171947.745:ERROR:exception_snapshot_win.cc(98)] thread ID 16160 not found in process +[0625/172007.424:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/172007.425:ERROR:exception_snapshot_win.cc(98)] thread ID 13472 not found in process +[0625/172059.353:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/172059.354:ERROR:exception_snapshot_win.cc(98)] thread ID 6396 not found in process +[0625/172402.795:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/172402.796:ERROR:exception_snapshot_win.cc(98)] thread ID 10720 not found in process +[0625/172618.850:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/172618.850:ERROR:exception_snapshot_win.cc(98)] thread ID 21136 not found in process +[0625/172819.875:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/172819.876:ERROR:exception_snapshot_win.cc(98)] thread ID 17624 not found in process +[0625/172953.674:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. 
(0xc0000022) +[0625/172953.675:ERROR:exception_snapshot_win.cc(98)] thread ID 15180 not found in process +[0625/173412.182:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/173412.182:ERROR:exception_snapshot_win.cc(98)] thread ID 13952 not found in process +[0625/173447.806:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/173447.807:ERROR:exception_snapshot_win.cc(98)] thread ID 1572 not found in process +[0625/173516.188:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/173516.189:ERROR:exception_snapshot_win.cc(98)] thread ID 5472 not found in process +[0625/173528.446:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/173528.447:ERROR:exception_snapshot_win.cc(98)] thread ID 20420 not found in process +[0625/173539.436:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/173539.437:ERROR:exception_snapshot_win.cc(98)] thread ID 16192 not found in process +[0625/173643.139:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/173643.140:ERROR:exception_snapshot_win.cc(98)] thread ID 15716 not found in process +[0625/173659.376:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. 
(0xc0000022) +[0625/173659.377:ERROR:exception_snapshot_win.cc(98)] thread ID 11828 not found in process +[0625/201137.209:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/201137.210:ERROR:exception_snapshot_win.cc(98)] thread ID 7688 not found in process +[0625/210240.476:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/210240.477:ERROR:exception_snapshot_win.cc(98)] thread ID 20828 not found in process diff --git a/src/scraping/acm/index.js b/src/scraping/acm/index.js new file mode 100644 index 000000000..81f775617 --- /dev/null +++ b/src/scraping/acm/index.js @@ -0,0 +1,88 @@ +const { Builder, By, Key, until } = require('selenium-webdriver'); +const fs = require("fs"); + +let driver; +fs.readFile("./citations.txt", { encoding: "utf8" }, scrapeTargets); +results = [] + +async function scrapeTargets(error, data) { + if (error) { + console.log("\nUnable to collect target citations from a citations.txt file stored in this directory.\nPlease make sure one is provided.") + return; + } + + driver = await new Builder().forBrowser('chrome').build(); + + let references = data.split("\n").map(entry => entry.replace("\r", "")).filter(line => line.match(/\d+/g)); + + let results = [] + let pdfs = [] + for (let id of references) { + let result = {} + let lines = [] + try { + let url = `https://dl.acm.org/citation.cfm?id=${id}`; + await driver.get(url); + await driver.sleep(500) + let candidates = await driver.findElements(By.xpath('.//a[contains(@href, "ft_gateway.cfm?id=")]')); + if (candidates.length > 0) { + pdfs.push(candidates[0]) + } + let webElements = await driver.findElements(By.id("abstract-body")) + for (let el of webElements) { + let text = await el.getText() + lines.push(text) + } + result.url = url + 
result.abstract = lines.join(" "); + await driver.findElement(By.xpath(`//*[@id="tab-1014-btnInnerEl"]/span`)).click() + await driver.sleep(500) + let authors = await driver.findElement(By.xpath('//*[@id="tabpanel-1009-body"]')).getText() + let sanitize = line => line.length > 0 && !(line.startsWith("No contact information") || line.startsWith("View colleagues of") || line.startsWith("Bibliometrics:")) + authorLines = authors.split("\n").map(line => line.trim()).filter(sanitize) + + let i = 0; + let allAuthors = [] + while (i < authorLines.length) { + let individual = []; + while (!authorLines[i].startsWith("Average citations")) { + individual.push(authorLines[i]) + i++ + } + individual.push(authorLines[i]) + allAuthors.push(individual); + i++ + } + result.authors = allAuthors.map(metadata => { + let publicationYears = metadata[1].substring(18).split("-"); + author = { + name: metadata[0], + publication_start: parseInt(publicationYears[0]), + publication_end: parseInt(publicationYears[1]) + }; + for (let count = 2; count < metadata.length; count++) { + let attr = metadata[count] + let char = attr.length - 1; + while (attr[char] != " ") { + char-- + } + let key = attr.substring(0, char).toLowerCase().replace(/ /g, "_").replace(/[\(\)]/g, ""); + let value = parseFloat(attr.substring(char + 1).replace(/,/g, "")); + author[key] = value + } + return author + }) + } catch (e) { + console.log(e) + await driver.quit(); + } + results.push(result) + } + + let output = ""; + results.forEach(res => output += (JSON.stringify(res, null, 4) + "\n")); + + fs.writeFile("./results.txt", output, function errorHandler(exception) { console.log(exception || "results successfully written") }) + + await driver.quit(); +} \ No newline at end of file diff --git a/src/scraping/acm/package.json b/src/scraping/acm/package.json new file mode 100644 index 000000000..10f4d2156 --- /dev/null +++ b/src/scraping/acm/package.json @@ -0,0 +1,17 @@ +{ + "name": "scraper", + "version": "1.0.0", + 
"description": "", + "main": "index.js", + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1" + }, + "keywords": [], + "author": "", + "license": "ISC", + "dependencies": { + "axios": "^0.19.0", + "cheerio": "^1.0.0-rc.3", + "selenium-webdriver": "^4.0.0-alpha.4" + } +} diff --git a/src/scraping/acm/results.txt b/src/scraping/acm/results.txt new file mode 100644 index 000000000..05bb2be8b --- /dev/null +++ b/src/scraping/acm/results.txt @@ -0,0 +1,64 @@ +{ + "url": "https://dl.acm.org/citation.cfm?id=321046", + "abstract": "It has been observed by many people that a striking number of quite diverse mathematical problems can be formulated as problems in integer programming, that is, linear programming problems in which some or all of the variables are required to assume integral values. This fact is rendered quite interesting by recent research on such problems, notably by R. E. Gomory [2, 3], which gives promise of yielding efficient computational techniques for their solution. The present paper provides yet another example of the versatility of integer programming as a mathematical modeling device by representing a generalization of the well-known “Travelling Salesman Problem” in integer programming terms. The authors have developed several such models, of which the one presented here is the most efficient in terms of generality, number of variables, and number of constraints. This model is due to the second author [4] and was presented briefly at the Symposium on Combinatorial Problems held at Princeton University, April 1960, sponsored by SIAM and IBM. The problem treated is: (1) A salesman is required to visit each of n cities, indexed by 1, … , n. He leaves from a “base city” indexed by 0, visits each of the n other cities exactly once, and returns to city 0. During his travels he must return to 0 exactly t times, including his final return (here t may be allowed to vary), and he must visit no more than p cities in one tour. 
(By a tour we mean a succession of visits to cities without stopping at city 0.) It is required to find such an itinerary which minimizes the total distance traveled by the salesman. Note that if t is fixed, then for the problem to have a solution we must have tp ≧ n. For t = 1, p ≧ n, we have the standard traveling salesman problem. Let dij (i ≠ j = 0, 1, … , n) be the distance covered in traveling from city i to city j. The following integer programming problem will be shown to be equivalent to (1): (2) Minimize the linear form ∑0≦i≠j≦n∑ dijxij over the set determined by the relations ∑ni=0i≠j xij = 1 (j = 1, … , n) ∑nj=0j≠i xij = 1 (i = 1, … , n) ui - uj + pxij ≦ p - 1 (1 ≦ i ≠ j ≦ n) where the xij are non-negative integers and the ui (i = 1, …, n) are arbitrary real numbers. (We shall see that it is permissible to restrict the ui to be non-negative integers as well.) If t is fixed it is necessary to add the additional relation: ∑nu=1 xi0 = t Note that the constraints require that xij = 0 or 1, so that a natural correspondence between these two problems exists if the xij are interpreted as follows: The salesman proceeds from city i to city j if and only if xij = 1. Under this correspondence the form to be minimized in (2) is the total distance to be traveled by the salesman in (1), so the burden of proof is to show that the two feasible sets correspond; i.e., a feasible solution to (2) has xij which do define a legitimate itinerary in (1), and, conversely a legitimate itinerary in (1) defines xij, which, together with appropriate ui, satisfy the constraints of (2). Consider a feasible solution to (2). The number of returns to city 0 is given by ∑ni=1 xi0. The constraints of the form ∑ xij = 1, all xij non-negative integers, represent the conditions that each city (other than zero) is visited exactly once. 
The ui play a role similar to node potentials in a network and the inequalities involving them serve to eliminate tours that do not begin and end at city 0 and tours that visit more than p cities. Consider any xr0r1 = 1 (r1 ≠ 0). There exists a unique r2 such that xr1r2 = 1. Unless r2 = 0, there is a unique r3 with xr2r3 = 1. We proceed in this fashion until some rj = 0. This must happen since the alternative is that at some point we reach an rk = rj, j + 1 < k. Since none of the r's are zero we have uri - uri + 1 + pxriri + 1 ≦ p - 1 or uri - uri + 1 ≦ - 1. Summing from i = j to k - 1, we have urj - urk = 0 ≦ j + 1 - k, which is a contradiction. Thus all tours include city 0. It remains to observe that no tours is of length greater than p. Suppose such a tour exists, x0r1 , xr1r2 , … , xrprp+1 = 1 with all ri ≠ 0. Then, as before, ur1 - urp+1 ≦ - p or urp+1 - ur1 ≧ p. But we have urp+1 - ur1 + pxrp+1r1 ≦ p - 1 or urp+1 - ur1 ≦ p (1 - xrp+1r1) - 1 ≦ p - 1, which is a contradiction. Conversely, if the xij correspond to a legitimate itinerary, it is clear that the ui can be adjusted so that ui = j if city i is the jth city visited in the tour which includes city i, for we then have ui - uj = - 1 if xij = 1, and always ui - uj ≦ p - 1. The above integer program involves n2 + n constraints (if t is not fixed) in n2 + 2n variables. Since the inequality form of constraint is fundamental for integer programming calculations, one may eliminate 2n variables, say the xi0 and x0j, by means of the equation constraints and produce an equivalent problem with n2 + n inequalities and n2 variables. The currently known integer programming procedures are sufficiently regular in their behavior to cast doubt on the heuristic value of machine experiments with our model. However, it seems appropriate to report the results of the five machine experiments we have conducted so far. The solution procedure used was the all-integer algorithm of R. E. 
Gomory [3] without the ranking procedure he describes. The first three experiments were simple model verification tests on a four-city standard traveling salesman problem with distance matrix [ 20 23 4 30 7 27 25 5 25 3 21 26 ] The first experiment was with a model, now obsolete, using roughly twice as many constraints and variables as the current model (for this problem, 28 constraints in 21 variables). The machine was halted after 4000 pivot steps had failed to produce a solution. The second experiment used the earlier model with the xi0 and x0j eliminated, resulting in a 28-constraint, 15-variable problem. Here the machine produced the optimal solution in 41 pivot steps. The third experiment used the current formulation with the xi0 and x0j eliminated, yielding 13 constraints and 9 variables. The optimal solution was reached in 7 pivot steps. The fourth and fifth experiments were used on a standard ten-city problem, due to Barachet, solved by Dantzig, Johnson and Fulkerson [1]. The current formulation was used, yielding 91 constraints in 81 variables. The fifth problem differed from the fourth only in that the ordering of the rows was altered to attempt to introduce more favorable pivot choices. In each case the machine was stopped after over 250 pivot steps had failed to produce the solution. In each case the last 100 pivot steps had failed to change the value of the objective function. It seems hopeful that more efficient integer programming procedures now under development will yield a satisfactory algorithmic solution to the traveling salesman problem, when applied to this model. In any case, the model serves to illustrate how problems of this sort may be succinctly formulated in integer programming terms.", + "authors": [ + { + "name": "C. E. 
Miller", + "publication_start": 1960, + "publication_end": 1960, + "publication_count": 1, + "citation_count": 179, + "available_for_download": 1, + "downloads_6_weeks": 132, + "downloads_12_months": 993, + "downloads_cumulative": 9781, + "average_downloads_per_article": 9781, + "average_citations_per_article": 179 + }, + { + "name": "A. W. Tucker", + "publication_start": 1960, + "publication_end": 1993, + "publication_count": 5, + "citation_count": 196, + "available_for_download": 1, + "downloads_6_weeks": 132, + "downloads_12_months": 993, + "downloads_cumulative": 9781, + "average_downloads_per_article": 9781, + "average_citations_per_article": 39.2 + }, + { + "name": "R. A. Zemlin", + "publication_start": 1960, + "publication_end": 1964, + "publication_count": 2, + "citation_count": 188, + "available_for_download": 2, + "downloads_6_weeks": 132, + "downloads_12_months": 998, + "downloads_cumulative": 10012, + "average_downloads_per_article": 5006, + "average_citations_per_article": 94 + } + ] +} +{ + "url": "https://dl.acm.org/citation.cfm?id=2412979", + "abstract": "The STRUCT system utilizes the flexibility of a powerful graphics display system to provide a set of tools for program analysis. These tools allow the analysis of the static program structure and the dynamic execution behavior of programs within the entire operating system/user program environment of the Brown University Graphics System (BUGS). Information is collected and presented in a manner which fully exploits two aspects of this environment. First, the operating system has been developed in a well-structured hierarchical manner following principles laid down by other researchers (2), (3). Second, the programs under analysis have been written in a structured programming language following coding conventions which make available, at the source code level, valuable program control information. A new set of pictorial constructs is introduced for presenting a 
program structure (static or dynamic) for inspection. These constructs combine the best features of an indented structured source code listing and the box-oriented nature of traditional flow charts. The graphical tools available are used to provide for swift changes in the desired level of detail displayed within a program structure, for traveling linearly through a program structure, for traveling through a complex program structure (following subroutine or system calls), for concurrently viewing multiple related program structures, and for presenting dynamic program behavior data using three-dimensional projections. The volume of a three-dimensional box representing a program block is proportional to the block's resource utilization. The scope of this paper is limited to a description of the STRUCT system. This system is currently being used to predict and analyze the performance advantages available through the migration of function (program modules) between levels of software and between software and firmware within BUGS. 
"""Import the .docx device notes in ``source`` into the "Dash" MongoDB database.

Running this module as a script:

1. deletes and recreates the ``dist`` directory,
2. parses every ``.docx`` file found in ``source`` (text fields, hyperlinks
   and embedded images),
3. inserts one data document and one view document per parsed file and per
   extracted image into ``db.newDocuments``,
4. inserts a parent schema referencing every parsed file and appends it to
   the document titled "WS collection 1",
5. writes a ``.gitignore`` into ``dist``.

All of this happens at module level, so importing the module runs it.
"""
import datetime
import math
import os
import re
import shutil
import sys
import time
import uuid

import docx2txt
from docx import Document
from docx.opc.constants import RELATIONSHIP_TYPE as RT
from PIL import Image
from pymongo import MongoClient

# Directory scanned for .docx inputs, relative to the working directory.
source = "./source"
# Directory that receives one sub-folder of extracted images per input file.
dist = "../../server/public/files"

db = MongoClient("localhost", 27017)["Dash"]
# View-document ids of every per-file schema written so far.
schema_guids = []


def extract_links(fileName):
    """Return the hyperlink targets of a .docx file wrapped as a list field.

    Only relationships of type ``RT.HYPERLINK`` are kept, and targets that
    contain ".aspx" are skipped.
    """
    rels = Document(fileName).part.rels
    links = []
    for key in rels:
        item = rels[key]
        if item.reltype == RT.HYPERLINK and ".aspx" not in item._target:
            links.append(item._target)
    return listify(links)


def extract_value(kv_string):
    """Return the value part of a ``"key: value"`` string, stripped.

    Everything after the FIRST colon is kept, so values that themselves
    contain a colon (for example URLs) are no longer truncated at the second
    one. A string without any colon is returned stripped and otherwise
    unchanged.
    """
    _, separator, value = kv_string.partition(":")
    return (value if separator else kv_string).strip()


def mkdir_if_absent(path):
    """Create ``path`` (and missing parents) unless it already exists.

    Failures are reported on stdout instead of being raised, matching the
    best-effort behaviour of the rest of the script.
    """
    try:
        os.makedirs(path, exist_ok=True)
    except OSError:
        # Report the path that failed; the previous message formatted the
        # global `file_name`, which is undefined when this helper is called
        # before the main loop starts.
        print("failed to create the appropriate directory structures for %s" % path)


def guid():
    """Return a fresh random UUID4 as a string."""
    return str(uuid.uuid4())


def listify(list):
    """Wrap a Python list in the ``{"fields": ..., "__type": "list"}`` envelope."""
    # The parameter name shadows the builtin; it is kept so the signature
    # stays unchanged, and the builtin is not needed inside this function.
    return {
        "fields": list,
        "__type": "list"
    }


def protofy(fieldId):
    """Wrap a document id in the ``{"fieldId": ..., "__type": "proxy"}`` envelope."""
    return {
        "fieldId": fieldId,
        "__type": "proxy"
    }


def proxify_guids(guids):
    """Return a list with one proxy envelope per id in ``guids``."""
    return [protofy(field_id) for field_id in guids]


def _creation_date():
    """Return the ``creationDate`` field stored on every data document."""
    # NOTE(review): `.microsecond` is only the sub-second component
    # (0-999999) of the current UTC time, not a timestamp. The value is kept
    # as-is because the format the reader of this field expects is not
    # visible in this file - confirm and replace.
    return {
        "date": datetime.datetime.utcnow().microsecond,
        "__type": "date"
    }


def write_schema(parse_results, display_fields):
    """Insert the data and view documents for one schema (collection).

    ``parse_results`` is a dict with a ``"schema"`` document (``_id``,
    ``fields``, ``__type``) and a ``"child_guids"`` list of view-document ids.
    The schema's ``fields`` dict is extended in place with the collection
    settings before insertion. ``display_fields`` becomes ``schemaColumns``.

    Returns the id of the newly inserted view document.
    """
    view_guids = parse_results["child_guids"]

    data_doc = parse_results["schema"]
    fields = data_doc["fields"]

    view_doc_guid = guid()

    view_doc = {
        "_id": view_doc_guid,
        "fields": {
            "proto": protofy(data_doc["_id"]),
            "x": 10,
            "y": 10,
            "width": 900,
            "height": 600,
            "panX": 0,
            "panY": 0,
            "zoomBasis": 0.5,
            "zIndex": 2,
            "libraryBrush": False,
            "viewType": 2
        },
        "__type": "Doc"
    }

    fields["proto"] = protofy("collectionProto")
    fields["data"] = listify(proxify_guids(view_guids))
    fields["schemaColumns"] = listify(display_fields)
    fields["backgroundColor"] = "white"
    fields["scale"] = 0.5
    fields["viewType"] = 2
    fields["author"] = "Bill Buxton"
    fields["creationDate"] = _creation_date()
    fields["isPrototype"] = True
    fields["page"] = -1

    db.newDocuments.insert_one(data_doc)
    db.newDocuments.insert_one(view_doc)

    data_doc_guid = data_doc["_id"]
    print(f"inserted view document ({view_doc_guid})")
    print(f"inserted data document ({data_doc_guid})\n")

    return view_doc_guid


def write_image(folder, name):
    """Insert the view and data documents for one image under ``dist/folder``.

    The image is opened only to read its pixel size. Returns the id of the
    newly inserted view document.
    """
    path = f"http://localhost:1050/files/{folder}/{name}"

    data_doc_guid = guid()
    view_doc_guid = guid()

    view_doc = {
        "_id": view_doc_guid,
        "fields": {
            "proto": protofy(data_doc_guid),
            "x": 10,
            "y": 10,
            "width": 300,
            "zIndex": 2,
            "libraryBrush": False
        },
        "__type": "Doc"
    }

    # Close the file as soon as the size is known: the caller renames the
    # file right after this function returns, and an open handle would leak
    # (and can block the rename on platforms that lock open files).
    with Image.open(f"{dist}/{folder}/{name}") as image:
        native_width, native_height = image.size

    data_doc = {
        "_id": data_doc_guid,
        "fields": {
            "proto": protofy("imageProto"),
            "data": {
                "url": path,
                "__type": "image"
            },
            "title": name,
            "nativeWidth": native_width,
            "author": "Bill Buxton",
            "creationDate": _creation_date(),
            "isPrototype": True,
            "page": -1,
            "nativeHeight": native_height,
            "height": native_height
        },
        "__type": "Doc"
    }

    db.newDocuments.insert_one(view_doc)
    db.newDocuments.insert_one(data_doc)

    return view_doc_guid


def _sanitize_line(line):
    """Drop newlines/tabs, normalise a few Unicode characters, and strip."""
    return re.sub("[\n\t]+", "", line).replace(u"\u00A0", " ").replace(
        u"\u2013", "-").replace(u"\u201c", '''"''').replace(u"\u201d", '''"''').strip()


def _sanitize_price(raw: str):
    """Convert a free-text price into a number.

    Returns the amount following the first ``$`` as a float (thousands
    separators removed), ``-1`` when the text contains "nfs" in any case, and
    NaN when neither applies or the amount cannot be parsed.
    """
    raw = raw.replace(",", "")
    start = raw.find("$")
    if start > -1:
        # Take only the run of digits/dots right after the "$". The previous
        # slice ended one character too late, so text such as "$15USD" made
        # float() raise.
        amount = re.match(r"[0-9.]+", raw[start + 1:])
        if amount is None:
            return math.nan
        try:
            return float(amount.group(0))
        except ValueError:
            return math.nan
    # str.find() returns -1 (truthy) when the text is absent and 0 (falsy)
    # when it is at the start, so testing its result directly inverted this
    # branch; use a membership test instead.
    if "nfs" in raw.lower():
        return -1
    return math.nan


def parse_document(file_name: str):
    """Parse one .docx file from ``source`` into a schema description.

    Images found in the file's output folder are registered through
    ``write_image`` and then renamed with an ``_m`` suffix before the
    extension. The text is split into non-trivial lines and read
    positionally; the section markers "Device Details", "Key Words", "Links",
    "Image" and "NOTES:" must appear in that order, otherwise an IndexError
    is raised.

    Returns ``{"schema": <data document>, "child_guids": <image view ids>}``.
    """
    print(f"parsing {file_name}...")
    # splitext keeps dots that are part of the base name; splitting on the
    # first "." would map "a.b.docx" and "a.c.docx" to the same folder.
    pure_name = os.path.splitext(file_name)[0]

    result = {}

    dir_path = dist + "/" + pure_name
    mkdir_if_absent(dir_path)

    # dir_path is handed to docx2txt as its second argument; every file found
    # in dir_path afterwards is treated as an image of this document.
    raw = str(docx2txt.process(source + "/" + file_name, dir_path))

    view_guids = []
    count = 0
    for image in os.listdir(dir_path):
        count += 1
        view_guids.append(write_image(pure_name, image))
        os.rename(dir_path + "/" + image, dir_path +
                  "/" + image.replace(".", "_m.", 1))
    print(f"extracted {count} images...")

    lines = [_sanitize_line(line) for line in raw.split("\n")]
    # Lines of length 0 or 1 carry no content and are discarded.
    lines = [line for line in lines if len(line) > 1]

    result["file_name"] = file_name
    result["title"] = lines[2].strip()
    result["short_description"] = lines[3].strip().replace(
        "Short Description: ", "")

    # Everything from line 5 up to the "Device Details" marker is the notes.
    cur = 5
    notes = ""
    while lines[cur] != "Device Details":
        notes += lines[cur] + " "
        cur += 1
    result["buxton_notes"] = notes.strip()

    # One "|"-separated line: company | year | original price, each possibly
    # prefixed with "label:"; the last ":"-separated piece is the value.
    cur += 1
    clean = [data.strip().split(":") for data in lines[cur].split("|")]
    result["company"] = clean[0][-1].strip()
    result["year"] = clean[1][-1].strip()
    result["original_price"] = _sanitize_price(clean[2][-1].strip())

    cur += 1
    result["degrees_of_freedom"] = extract_value(
        lines[cur]).replace("NA", "N/A")
    cur += 1

    dimensions = lines[cur].lower()
    if dimensions.startswith("dimensions"):
        # Skip the 11 leading characters (the label), then fold every
        # following line up to "Key Words" into the same value.
        dim_concat = dimensions[11:].strip()
        cur += 1
        while lines[cur] != "Key Words":
            dim_concat += (" " + lines[cur].strip())
            cur += 1
        result["dimensions"] = dim_concat
    else:
        result["dimensions"] = "N/A"

    cur += 1
    result["primary_key"] = extract_value(lines[cur])
    cur += 1
    result["secondary_key"] = extract_value(lines[cur])
    # The line just read is already part of secondary_key; step past it so
    # the continuation loop does not append it a second time. If the line is
    # the "Links" marker itself, stay put so the marker is still found.
    if lines[cur] != "Links":
        cur += 1
    while lines[cur] != "Links":
        result["secondary_key"] += (" " + extract_value(lines[cur]).strip())
        cur += 1

    cur += 1
    link_descriptions = []
    while lines[cur] != "Image":
        link_descriptions.append(lines[cur].strip())
        cur += 1
    result["link_descriptions"] = listify(link_descriptions)

    result["hyperlinks"] = extract_links(source + "/" + file_name)

    # Skip "Image" and the two lines after it, then read (image, caption)
    # pairs until the optional "NOTES:" marker or the end of the text.
    images = []
    captions = []
    cur += 3
    while cur + 1 < len(lines) and lines[cur] != "NOTES:":
        images.append(lines[cur])
        captions.append(lines[cur + 1])
        cur += 2
    result["images"] = listify(images)
    result["captions"] = listify(captions)

    notes = []
    if cur < len(lines) and lines[cur] == "NOTES:":
        notes = lines[cur + 1:]
    if notes:
        result["notes"] = listify(notes)

    print("writing child schema...")

    return {
        "schema": {
            "_id": guid(),
            "fields": result,
            "__type": "Doc"
        },
        "child_guids": view_guids
    }


if os.path.exists(dist):
    shutil.rmtree(dist)
# The script waits until the directory is really gone before recreating it
# (presumably removal can lag on some platforms - not visible here). Sleep
# between checks instead of spinning a CPU core.
while os.path.exists(dist):
    time.sleep(0.05)
os.mkdir(dist)
mkdir_if_absent(source)

candidates = 0
for file_name in os.listdir(source):
    if file_name.endswith('.docx'):
        candidates += 1
        schema_guids.append(write_schema(
            parse_document(file_name), ["title", "data"]))

print("writing parent schema...")
parent_guid = write_schema({
    "schema": {
        "_id": guid(),
        "fields": {},
        "__type": "Doc"
    },
    "child_guids": schema_guids
}, ["title", "short_description", "original_price"])

print("appending parent schema to main workspace...\n")
db.newDocuments.update_one(
    {"fields.title": "WS collection 1"},
    {"$push": {"fields.data.fields": protofy(parent_guid)}}
)

print("rewriting .gitignore...\n")
lines = ['*', '!.gitignore']
with open(dist + "/.gitignore", 'w') as f:
    f.write('\n'.join(lines))

suffix = "" if candidates == 1 else "s"
print(f"conversion complete. {candidates} candidate{suffix} processed.")
a/src/scraping/buxton/source/Bill_Notes_FrogPad.docx b/src/scraping/buxton/source/Bill_Notes_FrogPad.docx new file mode 100644 index 000000000..d01e1bf5c Binary files /dev/null and b/src/scraping/buxton/source/Bill_Notes_FrogPad.docx differ diff --git a/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx b/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx new file mode 100644 index 000000000..7bd28b376 Binary files /dev/null and b/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx differ diff --git a/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx b/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx new file mode 100644 index 000000000..0615c4953 Binary files /dev/null and b/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx differ diff --git a/src/scraping/buxton/source/Bill_Notes_Matias.docx b/src/scraping/buxton/source/Bill_Notes_Matias.docx new file mode 100644 index 000000000..547603256 Binary files /dev/null and b/src/scraping/buxton/source/Bill_Notes_Matias.docx differ diff --git a/src/scraping/buxton/source/Bill_Notes_MousePen.docx b/src/scraping/buxton/source/Bill_Notes_MousePen.docx new file mode 100644 index 000000000..4e1056636 Binary files /dev/null and b/src/scraping/buxton/source/Bill_Notes_MousePen.docx differ diff --git a/src/scraping/buxton/source/Bill_Notes_NewO.docx b/src/scraping/buxton/source/Bill_Notes_NewO.docx new file mode 100644 index 000000000..a514926d2 Binary files /dev/null and b/src/scraping/buxton/source/Bill_Notes_NewO.docx differ diff --git a/src/scraping/buxton/source/Bill_Notes_OLPC.docx b/src/scraping/buxton/source/Bill_Notes_OLPC.docx new file mode 100644 index 000000000..bfca0a9bb Binary files /dev/null and b/src/scraping/buxton/source/Bill_Notes_OLPC.docx differ diff --git a/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx b/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx new file mode 100644 index 000000000..c0cf6ba9a Binary files /dev/null and 
b/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx differ diff --git a/src/scraping/buxton/source/Bill_Notes_Philco_Mystery_Control.docx b/src/scraping/buxton/source/Bill_Notes_Philco_Mystery_Control.docx new file mode 100644 index 000000000..ad06903f3 Binary files /dev/null and b/src/scraping/buxton/source/Bill_Notes_Philco_Mystery_Control.docx differ diff --git a/src/scraping/buxton/source/Bill_Notes_TASA_Kbd.docx b/src/scraping/buxton/source/Bill_Notes_TASA_Kbd.docx new file mode 100644 index 000000000..e4c659de9 Binary files /dev/null and b/src/scraping/buxton/source/Bill_Notes_TASA_Kbd.docx differ diff --git a/src/scraping/buxton/source/Bill_Notes_The_Tap.docx b/src/scraping/buxton/source/Bill_Notes_The_Tap.docx new file mode 100644 index 000000000..8ceebc71e Binary files /dev/null and b/src/scraping/buxton/source/Bill_Notes_The_Tap.docx differ -- cgit v1.2.3-70-g09d2