diff options
Diffstat (limited to 'src/scraping')
| -rw-r--r-- | src/scraping/buxton/scraper.py | 18 | ||||
| -rw-r--r-- | src/scraping/buxton/source/Bill_Notes_Apple_Mac_Portable.docx | bin | 0 -> 748412 bytes | |||
| -rw-r--r-- | src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx | bin | 1561425 -> 1675500 bytes | |||
| -rw-r--r-- | src/scraping/buxton/source/Bill_Notes_FrogPad.docx | bin | 679241 -> 840173 bytes | |||
| -rw-r--r-- | src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx | bin | 1531689 -> 1729610 bytes | |||
| -rw-r--r-- | src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx | bin | 1933004 -> 2094142 bytes | |||
| -rw-r--r-- | src/scraping/buxton/source/Bill_Notes_NewO.docx | bin | 2150143 -> 2264571 bytes | |||
| -rw-r--r-- | src/scraping/buxton/source/Bill_Notes_PARCkbd.docx | bin | 517484 -> 631959 bytes | |||
| -rw-r--r-- | src/scraping/buxton/source/Bill_Notes_PARCtab.doc | bin | 0 -> 4046250 bytes |
9 files changed, 10 insertions, 8 deletions
diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py index 29cb8a256..807216ef1 100644 --- a/src/scraping/buxton/scraper.py +++ b/src/scraping/buxton/scraper.py @@ -17,6 +17,7 @@ dist = "../../server/public/files" db = MongoClient("localhost", 27017)["Dash"] target_collection = db.newDocuments +target_doc_title = "Workspace 1" schema_guids = [] common_proto_id = "" @@ -69,7 +70,7 @@ def text_doc_map(string_list): return listify(proxify_guids(list(map(guid_map, string_list)))) -def write_schema(parse_results, display_fields, storage_key): +def write_collection(parse_results, display_fields, storage_key, viewType=2): view_guids = parse_results["child_guids"] data_doc = parse_results["schema"] @@ -88,8 +89,9 @@ def write_schema(parse_results, display_fields, storage_key): "panX": 0, "panY": 0, "zoomBasis": 1, - "zIndex": 2 - "viewType": 2 + "zIndex": 2, + "libraryBrush": False, + "viewType": viewType }, "__type": "Doc" } @@ -234,7 +236,7 @@ def parse_document(file_name: str): copyfile(dir_path + "/" + image, dir_path + "/" + image.replace(".", "_o.", 1)) copyfile(dir_path + "/" + image, dir_path + - "/" + image.replace(".", "_m.", 1)) + "/" + image.replace(".", "_m.", 1)) print(f"extracted {count} images...") def sanitize(line): return re.sub("[\n\t]+", "", line).replace(u"\u00A0", " ").replace( @@ -378,22 +380,22 @@ candidates = 0 for file_name in os.listdir(source): if file_name.endswith('.docx'): candidates += 1 - schema_guids.append(write_schema( + schema_guids.append(write_collection( parse_document(file_name), ["title", "data"], "image_data")) print("writing parent schema...") -parent_guid = write_schema({ +parent_guid = write_collection({ "schema": { "_id": guid(), "fields": {}, "__type": "Doc" }, "child_guids": schema_guids -}, ["title", "short_description", "original_price"], "data") +}, ["title", "short_description", "original_price"], "data", 1) print("appending parent schema to main workspace...\n") target_collection.update_one( - {"fields.title": "WS collection 1"}, + {"fields.title": target_doc_title}, {"$push": {"fields.data.fields": {"fieldId": parent_guid, "__type": "proxy"}}} ) diff --git a/src/scraping/buxton/source/Bill_Notes_Apple_Mac_Portable.docx b/src/scraping/buxton/source/Bill_Notes_Apple_Mac_Portable.docx Binary files differnew file mode 100644 index 000000000..df1aafe9c --- /dev/null +++ b/src/scraping/buxton/source/Bill_Notes_Apple_Mac_Portable.docx diff --git a/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx b/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx Binary files differindex 649d636e3..06094b4d3 100644 --- a/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx +++ b/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx diff --git a/src/scraping/buxton/source/Bill_Notes_FrogPad.docx b/src/scraping/buxton/source/Bill_Notes_FrogPad.docx Binary files differindex ba80c1959..d01e1bf5c 100644 --- a/src/scraping/buxton/source/Bill_Notes_FrogPad.docx +++ b/src/scraping/buxton/source/Bill_Notes_FrogPad.docx diff --git a/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx b/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx Binary files differindex 8558a4e13..b9a30c8a9 100644 --- a/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx +++ b/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx diff --git a/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx b/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx Binary files differindex 09e17f971..0615c4953 100644 --- a/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx +++ b/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx diff --git a/src/scraping/buxton/source/Bill_Notes_NewO.docx b/src/scraping/buxton/source/Bill_Notes_NewO.docx Binary files differindex 2f4a04e81..a514926d2 100644 --- a/src/scraping/buxton/source/Bill_Notes_NewO.docx +++ b/src/scraping/buxton/source/Bill_Notes_NewO.docx diff --git a/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx b/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx Binary files differindex 3038de363..c0cf6ba9a 100644 --- a/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx +++ b/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx diff --git a/src/scraping/buxton/source/Bill_Notes_PARCtab.doc b/src/scraping/buxton/source/Bill_Notes_PARCtab.doc Binary files differnew file mode 100644 index 000000000..3cdc2d21b --- /dev/null +++ b/src/scraping/buxton/source/Bill_Notes_PARCtab.doc |
