From d6cef0815815c2587b9cc791e6a37c742aba1b45 Mon Sep 17 00:00:00 2001 From: Sam Wilkins Date: Thu, 25 Jul 2019 03:31:34 -0400 Subject: cognitive services refactor, buxton python script fixes, covered up imagebox context menu bug --- src/scraping/buxton/scraper.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'src/scraping/buxton/scraper.py') diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py index 700269727..14490cfe4 100644 --- a/src/scraping/buxton/scraper.py +++ b/src/scraping/buxton/scraper.py @@ -15,6 +15,7 @@ source = "./source" dist = "../../server/public/files" db = MongoClient("localhost", 27017)["Dash"] +target_collection = db.newDocuments schema_guids = [] @@ -84,7 +85,7 @@ def write_schema(parse_results, display_fields, storage_key): "height": 600, "panX": 0, "panY": 0, - "zoomBasis": 0.5, + "zoomBasis": 1, "zIndex": 2, "libraryBrush": False, "viewType": 2 @@ -106,8 +107,8 @@ def write_schema(parse_results, display_fields, storage_key): fields["isPrototype"] = True fields["page"] = -1 - db.newDocuments.insert_one(data_doc) - db.newDocuments.insert_one(view_doc) + target_collection.insert_one(data_doc) + target_collection.insert_one(view_doc) data_doc_guid = data_doc["_id"] print(f"inserted view document ({view_doc_guid})") @@ -158,8 +159,8 @@ def write_text_doc(content): "__type": "Doc" } - db.newDocuments.insert_one(view_doc) - db.newDocuments.insert_one(data_doc) + target_collection.insert_one(view_doc) + target_collection.insert_one(data_doc) return view_doc_guid @@ -209,8 +210,8 @@ def write_image(folder, name): "__type": "Doc" } - db.newDocuments.insert_one(view_doc) - db.newDocuments.insert_one(data_doc) + target_collection.insert_one(view_doc) + target_collection.insert_one(data_doc) return view_doc_guid @@ -372,7 +373,7 @@ parent_guid = write_schema({ }, ["title", "short_description", "original_price"], "data") print("appending parent schema to main workspace...\n") -db.newDocuments.update_one( +target_collection.update_one( {"fields.title": "WS collection 1"}, {"$push": {"fields.data.fields": {"fieldId": parent_guid, "__type": "proxy"}}} ) -- cgit v1.2.3-70-g09d2 From e1c7add158ce245ce6cb557177986b31fe107dd8 Mon Sep 17 00:00:00 2001 From: Sam Wilkins Date: Thu, 25 Jul 2019 18:56:46 -0400 Subject: scraping common proto tweak --- src/scraping/buxton/scraper.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'src/scraping/buxton/scraper.py') diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py index 14490cfe4..48b8fe3fa 100644 --- a/src/scraping/buxton/scraper.py +++ b/src/scraping/buxton/scraper.py @@ -17,6 +17,7 @@ dist = "../../server/public/files" db = MongoClient("localhost", 27017)["Dash"] target_collection = db.newDocuments schema_guids = [] +common_proto_id = "" def extract_links(fileName): @@ -93,7 +94,7 @@ def write_schema(parse_results, display_fields, storage_key): "__type": "Doc" } - fields["proto"] = protofy("collectionProto") + fields["proto"] = protofy(common_proto_id) fields[storage_key] = listify(proxify_guids(view_guids)) fields["schemaColumns"] = listify(display_fields) fields["backgroundColor"] = "white" @@ -137,7 +138,7 @@ def write_text_doc(content): data_doc = { "_id": data_doc_guid, "fields": { - "proto": protofy("textProto"), + "proto": protofy("commonImportProto"), "data": { "Data": '{"doc":{"type":"doc","content":[{"type":"paragraph","content":[{"type":"text","text":"' + content + '"}]}]},"selection":{"type":"text","anchor":1,"head":1}' + '}', "__type": "RichTextField" @@ -348,6 +349,22 @@ def proxify_guids(guids): return list(map(lambda guid: {"fieldId": guid, "__type": "proxy"}, guids)) +def write_common_proto(): + id = guid() + common_proto = { + "_id": id, + "fields": { + "proto": protofy("collectionProto"), + "title": "Common Import Proto", + }, + "__type": "Doc" + } + + target_collection.insert_one(common_proto) + + return id + + if os.path.exists(dist): shutil.rmtree(dist) while os.path.exists(dist): @@ -355,6 +372,8 @@ while os.path.exists(dist): os.mkdir(dist) mkdir_if_absent(source) +common_proto_id = write_common_proto() + candidates = 0 for file_name in os.listdir(source): if file_name.endswith('.docx'): -- cgit v1.2.3-70-g09d2 From c1377587f27044d89ec84befa9953de627d49873 Mon Sep 17 00:00:00 2001 From: Tyler Schicke Date: Sat, 27 Jul 2019 14:38:53 -0400 Subject: Fixed up buxton scraper to get fullscreen images and remove bad images --- src/scraping/buxton/scraper.py | 3 +++ .../buxton/source/Bill_Notes_Bill_Notes_CyKey.docx | Bin 1675500 -> 1561425 bytes .../buxton/source/Bill_Notes_Braun_T3.docx | Bin 1671968 -> 1510917 bytes .../buxton/source/Bill_Notes_CasioC801.docx | Bin 574664 -> 413861 bytes .../buxton/source/Bill_Notes_Casio_Mini.docx | Bin 581069 -> 467304 bytes .../source/Bill_Notes_FingerWorks_Prototype.docx | Bin 585090 -> 423384 bytes .../source/Bill_Notes_Fingerworks_TouchStream.docx | Bin 1722555 -> 1558473 bytes src/scraping/buxton/source/Bill_Notes_FrogPad.docx | Bin 840173 -> 679241 bytes .../buxton/source/Bill_Notes_Gavilan_SC.docx | Bin 1695290 -> 1531689 bytes .../source/Bill_Notes_Grandjean_Stenotype.docx | Bin 2094142 -> 1933004 bytes src/scraping/buxton/source/Bill_Notes_Matias.docx | Bin 590407 -> 476141 bytes .../buxton/source/Bill_Notes_MousePen.docx | Bin 505322 -> 344083 bytes src/scraping/buxton/source/Bill_Notes_NewO.docx | Bin 2264571 -> 2150143 bytes src/scraping/buxton/source/Bill_Notes_OLPC.docx | Bin 6883659 -> 6721592 bytes src/scraping/buxton/source/Bill_Notes_PARCkbd.docx | Bin 631959 -> 517484 bytes .../source/Bill_Notes_Philco_Mystery_Control.docx | Bin 1994439 -> 1880816 bytes .../buxton/source/Bill_Notes_TASA_Kbd.docx | Bin 461199 -> 347612 bytes src/scraping/buxton/source/Bill_Notes_The_Tap.docx | Bin 711321 -> 597382 bytes 18 files changed, 3 insertions(+) (limited to 'src/scraping/buxton/scraper.py') diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py index 48b8fe3fa..182b22a1a 100644 --- a/src/scraping/buxton/scraper.py +++ b/src/scraping/buxton/scraper.py @@ -1,4 +1,5 @@ import os +from shutil import copyfile import docx2txt from docx import Document from docx.opc.constants import RELATIONSHIP_TYPE as RT @@ -233,6 +234,8 @@ def parse_document(file_name: str): for image in os.listdir(dir_path): count += 1 view_guids.append(write_image(pure_name, image)) + copyfile(dir_path + "/" + image, dir_path + + "/" + image.replace(".", "_o.", 1)) os.rename(dir_path + "/" + image, dir_path + "/" + image.replace(".", "_m.", 1)) print(f"extracted {count} images...") diff --git a/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx b/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx index 06094b4d3..649d636e3 100644 Binary files a/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx and b/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx differ diff --git a/src/scraping/buxton/source/Bill_Notes_Braun_T3.docx b/src/scraping/buxton/source/Bill_Notes_Braun_T3.docx index 356697092..b00080e08 100644 Binary files a/src/scraping/buxton/source/Bill_Notes_Braun_T3.docx and b/src/scraping/buxton/source/Bill_Notes_Braun_T3.docx differ diff --git a/src/scraping/buxton/source/Bill_Notes_CasioC801.docx b/src/scraping/buxton/source/Bill_Notes_CasioC801.docx index cd89fb97b..510a006e0 100644 Binary files a/src/scraping/buxton/source/Bill_Notes_CasioC801.docx and b/src/scraping/buxton/source/Bill_Notes_CasioC801.docx differ diff --git a/src/scraping/buxton/source/Bill_Notes_Casio_Mini.docx b/src/scraping/buxton/source/Bill_Notes_Casio_Mini.docx index a503cddfc..cea9e7b69 100644 Binary files a/src/scraping/buxton/source/Bill_Notes_Casio_Mini.docx and b/src/scraping/buxton/source/Bill_Notes_Casio_Mini.docx differ diff --git a/src/scraping/buxton/source/Bill_Notes_FingerWorks_Prototype.docx b/src/scraping/buxton/source/Bill_Notes_FingerWorks_Prototype.docx index 4d13a8cf5..f53402a06 100644 Binary files a/src/scraping/buxton/source/Bill_Notes_FingerWorks_Prototype.docx and b/src/scraping/buxton/source/Bill_Notes_FingerWorks_Prototype.docx differ diff --git a/src/scraping/buxton/source/Bill_Notes_Fingerworks_TouchStream.docx b/src/scraping/buxton/source/Bill_Notes_Fingerworks_TouchStream.docx index 578a1be08..0eec89949 100644 Binary files a/src/scraping/buxton/source/Bill_Notes_Fingerworks_TouchStream.docx and b/src/scraping/buxton/source/Bill_Notes_Fingerworks_TouchStream.docx differ diff --git a/src/scraping/buxton/source/Bill_Notes_FrogPad.docx b/src/scraping/buxton/source/Bill_Notes_FrogPad.docx index d01e1bf5c..ba80c1959 100644 Binary files a/src/scraping/buxton/source/Bill_Notes_FrogPad.docx and b/src/scraping/buxton/source/Bill_Notes_FrogPad.docx differ diff --git a/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx b/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx index 7bd28b376..8558a4e13 100644 Binary files a/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx and b/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx differ diff --git a/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx b/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx index 0615c4953..09e17f971 100644 Binary files a/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx and b/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx differ diff --git a/src/scraping/buxton/source/Bill_Notes_Matias.docx b/src/scraping/buxton/source/Bill_Notes_Matias.docx index 547603256..d2d014bbe 100644 Binary files a/src/scraping/buxton/source/Bill_Notes_Matias.docx and b/src/scraping/buxton/source/Bill_Notes_Matias.docx differ diff --git a/src/scraping/buxton/source/Bill_Notes_MousePen.docx b/src/scraping/buxton/source/Bill_Notes_MousePen.docx index 4e1056636..cd0b3eab3 100644 Binary files a/src/scraping/buxton/source/Bill_Notes_MousePen.docx and b/src/scraping/buxton/source/Bill_Notes_MousePen.docx differ diff --git a/src/scraping/buxton/source/Bill_Notes_NewO.docx b/src/scraping/buxton/source/Bill_Notes_NewO.docx index a514926d2..2f4a04e81 100644 Binary files a/src/scraping/buxton/source/Bill_Notes_NewO.docx and b/src/scraping/buxton/source/Bill_Notes_NewO.docx differ diff --git a/src/scraping/buxton/source/Bill_Notes_OLPC.docx b/src/scraping/buxton/source/Bill_Notes_OLPC.docx index bfca0a9bb..7a636e2d6 100644 Binary files a/src/scraping/buxton/source/Bill_Notes_OLPC.docx and b/src/scraping/buxton/source/Bill_Notes_OLPC.docx differ diff --git a/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx b/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx index c0cf6ba9a..3038de363 100644 Binary files a/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx and b/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx differ diff --git a/src/scraping/buxton/source/Bill_Notes_Philco_Mystery_Control.docx b/src/scraping/buxton/source/Bill_Notes_Philco_Mystery_Control.docx index ad06903f3..af72fa662 100644 Binary files a/src/scraping/buxton/source/Bill_Notes_Philco_Mystery_Control.docx and b/src/scraping/buxton/source/Bill_Notes_Philco_Mystery_Control.docx differ diff --git a/src/scraping/buxton/source/Bill_Notes_TASA_Kbd.docx b/src/scraping/buxton/source/Bill_Notes_TASA_Kbd.docx index e4c659de9..5c2eb8d7f 100644 Binary files a/src/scraping/buxton/source/Bill_Notes_TASA_Kbd.docx and b/src/scraping/buxton/source/Bill_Notes_TASA_Kbd.docx differ diff --git a/src/scraping/buxton/source/Bill_Notes_The_Tap.docx b/src/scraping/buxton/source/Bill_Notes_The_Tap.docx index 8ceebc71e..c9ee2eaea 100644 Binary files a/src/scraping/buxton/source/Bill_Notes_The_Tap.docx and b/src/scraping/buxton/source/Bill_Notes_The_Tap.docx differ -- cgit v1.2.3-70-g09d2