diff options
| author | Eleanor Eng <eleanor_eng@brown.edu> | 2019-08-06 10:54:33 -0400 |
|---|---|---|
| committer | Eleanor Eng <eleanor_eng@brown.edu> | 2019-08-06 10:54:33 -0400 |
| commit | b7194d88ba9733413063c7f371dedefd3a97e35f (patch) | |
| tree | 418203fe1ee1f4aa0771c555ab672541cc775473 /src/scraping | |
| parent | 2c4440be2807b3b1da691ea04b061c35e50ecb72 (diff) | |
| parent | 298d1c9b29d6ce2171fd9ac8274b64583b73f6f5 (diff) | |
merge with master
Diffstat (limited to 'src/scraping')
28 files changed, 33 insertions, 10 deletions
diff --git a/src/scraping/acm/chromedriver b/src/scraping/acm/chromedriver Binary files differindex 9e9b16717..9e9b16717 100755..100644 --- a/src/scraping/acm/chromedriver +++ b/src/scraping/acm/chromedriver diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py index 700269727..f0f45d8f9 100644 --- a/src/scraping/buxton/scraper.py +++ b/src/scraping/buxton/scraper.py @@ -1,4 +1,5 @@ import os +from shutil import copyfile import docx2txt from docx import Document from docx.opc.constants import RELATIONSHIP_TYPE as RT @@ -15,7 +16,9 @@ source = "./source" dist = "../../server/public/files" db = MongoClient("localhost", 27017)["Dash"] +target_collection = db.newDocuments schema_guids = [] +common_proto_id = "" def extract_links(fileName): @@ -84,7 +87,7 @@ def write_schema(parse_results, display_fields, storage_key): "height": 600, "panX": 0, "panY": 0, - "zoomBasis": 0.5, + "zoomBasis": 1, "zIndex": 2, "libraryBrush": False, "viewType": 2 @@ -92,7 +95,7 @@ def write_schema(parse_results, display_fields, storage_key): "__type": "Doc" } - fields["proto"] = protofy("collectionProto") + fields["proto"] = protofy(common_proto_id) fields[storage_key] = listify(proxify_guids(view_guids)) fields["schemaColumns"] = listify(display_fields) fields["backgroundColor"] = "white" @@ -106,8 +109,8 @@ def write_schema(parse_results, display_fields, storage_key): fields["isPrototype"] = True fields["page"] = -1 - db.newDocuments.insert_one(data_doc) - db.newDocuments.insert_one(view_doc) + target_collection.insert_one(data_doc) + target_collection.insert_one(view_doc) data_doc_guid = data_doc["_id"] print(f"inserted view document ({view_doc_guid})") @@ -158,8 +161,8 @@ def write_text_doc(content): "__type": "Doc" } - db.newDocuments.insert_one(view_doc) - db.newDocuments.insert_one(data_doc) + target_collection.insert_one(view_doc) + target_collection.insert_one(data_doc) return view_doc_guid @@ -209,8 +212,8 @@ def write_image(folder, name): "__type": "Doc" } - db.newDocuments.insert_one(view_doc) - db.newDocuments.insert_one(data_doc) + target_collection.insert_one(view_doc) + target_collection.insert_one(data_doc) return view_doc_guid @@ -231,7 +234,9 @@ def parse_document(file_name: str): for image in os.listdir(dir_path): count += 1 view_guids.append(write_image(pure_name, image)) - os.rename(dir_path + "/" + image, dir_path + + copyfile(dir_path + "/" + image, dir_path + + "/" + image.replace(".", "_o.", 1)) + copyfile(dir_path + "/" + image, dir_path + "/" + image.replace(".", "_m.", 1)) print(f"extracted {count} images...") @@ -347,6 +352,22 @@ def proxify_guids(guids): return list(map(lambda guid: {"fieldId": guid, "__type": "proxy"}, guids)) +def write_common_proto(): + id = guid() + common_proto = { + "_id": id, + "fields": { + "proto": protofy("collectionProto"), + "title": "Common Import Proto", + }, + "__type": "Doc" + } + + target_collection.insert_one(common_proto) + + return id + + if os.path.exists(dist): shutil.rmtree(dist) while os.path.exists(dist): @@ -354,6 +375,8 @@ while os.path.exists(dist): os.mkdir(dist) mkdir_if_absent(source) +common_proto_id = write_common_proto() + candidates = 0 for file_name in os.listdir(source): if file_name.endswith('.docx'): @@ -372,7 +395,7 @@ parent_guid = write_schema({ }, ["title", "short_description", "original_price"], "data") print("appending parent schema to main workspace...\n") -db.newDocuments.update_one( +target_collection.update_one( {"fields.title": "WS collection 1"}, {"$push": {"fields.data.fields": {"fieldId": parent_guid, "__type": "proxy"}}} ) diff --git a/src/scraping/buxton/source/Bill_Notes_3_button_optical_mouse.docx b/src/scraping/buxton/source/Bill_Notes_3_button_optical_mouse.docx Binary files differnew file mode 100644 index 000000000..a2ab04b78 --- /dev/null +++ b/src/scraping/buxton/source/Bill_Notes_3_button_optical_mouse.docx diff --git a/src/scraping/buxton/source/Bill_Notes_Amazon_Kindle_Keyboard.docx b/src/scraping/buxton/source/Bill_Notes_Amazon_Kindle_Keyboard.docx Binary files differnew file mode 100644 index 000000000..e4375ebeb --- /dev/null +++ b/src/scraping/buxton/source/Bill_Notes_Amazon_Kindle_Keyboard.docx diff --git a/src/scraping/buxton/source/Bill_Notes_Apple_Adj_Keyboard.docx b/src/scraping/buxton/source/Bill_Notes_Apple_Adj_Keyboard.docx Binary files differnew file mode 100644 index 000000000..99f7ad19d --- /dev/null +++ b/src/scraping/buxton/source/Bill_Notes_Apple_Adj_Keyboard.docx diff --git a/src/scraping/buxton/source/Bill_Notes_BAT.docx b/src/scraping/buxton/source/Bill_Notes_BAT.docx Binary files differnew file mode 100644 index 000000000..0e3368611 --- /dev/null +++ b/src/scraping/buxton/source/Bill_Notes_BAT.docx diff --git a/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx b/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx Binary files differindex 06094b4d3..649d636e3 100644 --- a/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx +++ b/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx diff --git a/src/scraping/buxton/source/Bill_Notes_Braun_T3.docx b/src/scraping/buxton/source/Bill_Notes_Braun_T3.docx Binary files differindex 356697092..b00080e08 100644 --- a/src/scraping/buxton/source/Bill_Notes_Braun_T3.docx +++ b/src/scraping/buxton/source/Bill_Notes_Braun_T3.docx diff --git a/src/scraping/buxton/source/Bill_Notes_CasioC801.docx b/src/scraping/buxton/source/Bill_Notes_CasioC801.docx Binary files differindex cd89fb97b..510a006e0 100644 --- a/src/scraping/buxton/source/Bill_Notes_CasioC801.docx +++ b/src/scraping/buxton/source/Bill_Notes_CasioC801.docx diff --git a/src/scraping/buxton/source/Bill_Notes_Casio_CZ-101.docx b/src/scraping/buxton/source/Bill_Notes_Casio_CZ-101.docx Binary files differnew file mode 100644 index 000000000..c8d3943c0 --- /dev/null +++ b/src/scraping/buxton/source/Bill_Notes_Casio_CZ-101.docx diff --git a/src/scraping/buxton/source/Bill_Notes_Casio_Mini.docx b/src/scraping/buxton/source/Bill_Notes_Casio_Mini.docx Binary files differindex a503cddfc..cea9e7b69 100644 --- a/src/scraping/buxton/source/Bill_Notes_Casio_Mini.docx +++ b/src/scraping/buxton/source/Bill_Notes_Casio_Mini.docx diff --git a/src/scraping/buxton/source/Bill_Notes_FingerWorks_Prototype.docx b/src/scraping/buxton/source/Bill_Notes_FingerWorks_Prototype.docx Binary files differindex 4d13a8cf5..f53402a06 100644 --- a/src/scraping/buxton/source/Bill_Notes_FingerWorks_Prototype.docx +++ b/src/scraping/buxton/source/Bill_Notes_FingerWorks_Prototype.docx diff --git a/src/scraping/buxton/source/Bill_Notes_Fingerworks_TouchStream.docx b/src/scraping/buxton/source/Bill_Notes_Fingerworks_TouchStream.docx Binary files differindex 578a1be08..0eec89949 100644 --- a/src/scraping/buxton/source/Bill_Notes_Fingerworks_TouchStream.docx +++ b/src/scraping/buxton/source/Bill_Notes_Fingerworks_TouchStream.docx diff --git a/src/scraping/buxton/source/Bill_Notes_FrogPad.docx b/src/scraping/buxton/source/Bill_Notes_FrogPad.docx Binary files differindex d01e1bf5c..ba80c1959 100644 --- a/src/scraping/buxton/source/Bill_Notes_FrogPad.docx +++ b/src/scraping/buxton/source/Bill_Notes_FrogPad.docx diff --git a/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx b/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx Binary files differindex 7bd28b376..8558a4e13 100644 --- a/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx +++ b/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx diff --git a/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx b/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx Binary files differindex 0615c4953..09e17f971 100644 --- a/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx +++ b/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx diff --git a/src/scraping/buxton/source/Bill_Notes_Kindle_3G_lighted_cover.docx b/src/scraping/buxton/source/Bill_Notes_Kindle_3G_lighted_cover.docx Binary files differnew file mode 100644 index 000000000..f00fcb772 --- /dev/null +++ b/src/scraping/buxton/source/Bill_Notes_Kindle_3G_lighted_cover.docx diff --git a/src/scraping/buxton/source/Bill_Notes_Matias.docx b/src/scraping/buxton/source/Bill_Notes_Matias.docx Binary files differindex 547603256..d2d014bbe 100644 --- a/src/scraping/buxton/source/Bill_Notes_Matias.docx +++ b/src/scraping/buxton/source/Bill_Notes_Matias.docx diff --git a/src/scraping/buxton/source/Bill_Notes_Microwriter.docx b/src/scraping/buxton/source/Bill_Notes_Microwriter.docx Binary files differnew file mode 100644 index 000000000..3ac272e42 --- /dev/null +++ b/src/scraping/buxton/source/Bill_Notes_Microwriter.docx diff --git a/src/scraping/buxton/source/Bill_Notes_MousePen.docx b/src/scraping/buxton/source/Bill_Notes_MousePen.docx Binary files differindex 4e1056636..cd0b3eab3 100644 --- a/src/scraping/buxton/source/Bill_Notes_MousePen.docx +++ b/src/scraping/buxton/source/Bill_Notes_MousePen.docx diff --git a/src/scraping/buxton/source/Bill_Notes_NewO.docx b/src/scraping/buxton/source/Bill_Notes_NewO.docx Binary files differindex a514926d2..2f4a04e81 100644 --- a/src/scraping/buxton/source/Bill_Notes_NewO.docx +++ b/src/scraping/buxton/source/Bill_Notes_NewO.docx diff --git a/src/scraping/buxton/source/Bill_Notes_OLPC.docx b/src/scraping/buxton/source/Bill_Notes_OLPC.docx Binary files differindex bfca0a9bb..7a636e2d6 100644 --- a/src/scraping/buxton/source/Bill_Notes_OLPC.docx +++ b/src/scraping/buxton/source/Bill_Notes_OLPC.docx diff --git a/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx b/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx Binary files differindex c0cf6ba9a..3038de363 100644 --- a/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx +++ b/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx diff --git a/src/scraping/buxton/source/Bill_Notes_Philco_Mystery_Control.docx b/src/scraping/buxton/source/Bill_Notes_Philco_Mystery_Control.docx Binary files differindex ad06903f3..af72fa662 100644 --- a/src/scraping/buxton/source/Bill_Notes_Philco_Mystery_Control.docx +++ b/src/scraping/buxton/source/Bill_Notes_Philco_Mystery_Control.docx diff --git a/src/scraping/buxton/source/Bill_Notes_TASA_Kbd.docx b/src/scraping/buxton/source/Bill_Notes_TASA_Kbd.docx Binary files differindex e4c659de9..5c2eb8d7f 100644 --- a/src/scraping/buxton/source/Bill_Notes_TASA_Kbd.docx +++ b/src/scraping/buxton/source/Bill_Notes_TASA_Kbd.docx diff --git a/src/scraping/buxton/source/Bill_Notes_The_Tap.docx b/src/scraping/buxton/source/Bill_Notes_The_Tap.docx Binary files differindex 8ceebc71e..c9ee2eaea 100644 --- a/src/scraping/buxton/source/Bill_Notes_The_Tap.docx +++ b/src/scraping/buxton/source/Bill_Notes_The_Tap.docx diff --git a/src/scraping/buxton/source/Bill_Notes_Twiddler.docx b/src/scraping/buxton/source/Bill_Notes_Twiddler.docx Binary files differnew file mode 100644 index 000000000..27b4acc85 --- /dev/null +++ b/src/scraping/buxton/source/Bill_Notes_Twiddler.docx diff --git a/src/scraping/buxton/source/Bill_Notes_orbiTouch.doc b/src/scraping/buxton/source/Bill_Notes_orbiTouch.doc Binary files differnew file mode 100644 index 000000000..6bd71f20e --- /dev/null +++ b/src/scraping/buxton/source/Bill_Notes_orbiTouch.doc |
