diff options
author | monikahedman <monika_hedman@brown.edu> | 2019-08-08 15:03:03 -0400 |
---|---|---|
committer | monikahedman <monika_hedman@brown.edu> | 2019-08-08 15:03:03 -0400 |
commit | 8f951d9110d096d665af6fbd295902ef8d3574e8 (patch) | |
tree | 37cc6881cbf93aeea5deae53a6415d6607377edc /src/scraping | |
parent | 030af1b9112cd12383abcd7f35142cc382ea4d6a (diff) | |
parent | 316c241d72fb83aad5f2bf9b143c317fdc906654 (diff) |
Merge branch 'master' of https://github.com/browngraphicslab/Dash-Web into checkbox
Diffstat (limited to 'src/scraping')
18 files changed, 10 insertions, 11 deletions
diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py index f0f45d8f9..807216ef1 100644 --- a/src/scraping/buxton/scraper.py +++ b/src/scraping/buxton/scraper.py @@ -17,6 +17,7 @@ dist = "../../server/public/files" db = MongoClient("localhost", 27017)["Dash"] target_collection = db.newDocuments +target_doc_title = "Workspace 1" schema_guids = [] common_proto_id = "" @@ -69,7 +70,7 @@ def text_doc_map(string_list): return listify(proxify_guids(list(map(guid_map, string_list)))) -def write_schema(parse_results, display_fields, storage_key): +def write_collection(parse_results, display_fields, storage_key, viewType=2): view_guids = parse_results["child_guids"] data_doc = parse_results["schema"] @@ -90,7 +91,7 @@ def write_schema(parse_results, display_fields, storage_key): "zoomBasis": 1, "zIndex": 2, "libraryBrush": False, - "viewType": 2 + "viewType": viewType }, "__type": "Doc" } @@ -130,8 +131,7 @@ def write_text_doc(content): "x": 10, "y": 10, "width": 400, - "zIndex": 2, - "libraryBrush": False + "zIndex": 2 }, "__type": "Doc" } @@ -183,8 +183,7 @@ def write_image(folder, name): "x": 10, "y": 10, "width": min(800, native_width), - "zIndex": 2, - "libraryBrush": False + "zIndex": 2 }, "__type": "Doc" } @@ -237,7 +236,7 @@ def parse_document(file_name: str): copyfile(dir_path + "/" + image, dir_path + "/" + image.replace(".", "_o.", 1)) copyfile(dir_path + "/" + image, dir_path + - "/" + image.replace(".", "_m.", 1)) + "/" + image.replace(".", "_m.", 1)) print(f"extracted {count} images...") def sanitize(line): return re.sub("[\n\t]+", "", line).replace(u"\u00A0", " ").replace( @@ -381,22 +380,22 @@ candidates = 0 for file_name in os.listdir(source): if file_name.endswith('.docx'): candidates += 1 - schema_guids.append(write_schema( + schema_guids.append(write_collection( parse_document(file_name), ["title", "data"], "image_data")) print("writing parent schema...") -parent_guid = write_schema({ +parent_guid = write_collection({ "schema": { "_id": guid(), "fields": {}, "__type": "Doc" }, "child_guids": schema_guids -}, ["title", "short_description", "original_price"], "data") +}, ["title", "short_description", "original_price"], "data", 1) print("appending parent schema to main workspace...\n") target_collection.update_one( - {"fields.title": "WS collection 1"}, + {"fields.title": target_doc_title}, {"$push": {"fields.data.fields": {"fieldId": parent_guid, "__type": "proxy"}}} ) diff --git a/src/scraping/buxton/source/Bill_Notes_3_button_optical_mouse.docx b/src/scraping/buxton/source/Bill_Notes_3_button_optical_mouse.docx Binary files differnew file mode 100644 index 000000000..a2ab04b78 --- /dev/null +++ b/src/scraping/buxton/source/Bill_Notes_3_button_optical_mouse.docx diff --git a/src/scraping/buxton/source/Bill_Notes_Amazon_Kindle_Keyboard.docx b/src/scraping/buxton/source/Bill_Notes_Amazon_Kindle_Keyboard.docx Binary files differnew file mode 100644 index 000000000..e4375ebeb --- /dev/null +++ b/src/scraping/buxton/source/Bill_Notes_Amazon_Kindle_Keyboard.docx diff --git a/src/scraping/buxton/source/Bill_Notes_Apple_Adj_Keyboard.docx b/src/scraping/buxton/source/Bill_Notes_Apple_Adj_Keyboard.docx Binary files differnew file mode 100644 index 000000000..99f7ad19d --- /dev/null +++ b/src/scraping/buxton/source/Bill_Notes_Apple_Adj_Keyboard.docx diff --git a/src/scraping/buxton/source/Bill_Notes_Apple_Mac_Portable.docx b/src/scraping/buxton/source/Bill_Notes_Apple_Mac_Portable.docx Binary files differnew file mode 100644 index 000000000..df1aafe9c --- /dev/null +++ b/src/scraping/buxton/source/Bill_Notes_Apple_Mac_Portable.docx diff --git a/src/scraping/buxton/source/Bill_Notes_BAT.docx b/src/scraping/buxton/source/Bill_Notes_BAT.docx Binary files differnew file mode 100644 index 000000000..0e3368611 --- /dev/null +++ b/src/scraping/buxton/source/Bill_Notes_BAT.docx diff --git a/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx b/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx Binary files differindex 649d636e3..06094b4d3 100644 --- a/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx +++ b/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx diff --git a/src/scraping/buxton/source/Bill_Notes_Casio_CZ-101.docx b/src/scraping/buxton/source/Bill_Notes_Casio_CZ-101.docx Binary files differnew file mode 100644 index 000000000..c8d3943c0 --- /dev/null +++ b/src/scraping/buxton/source/Bill_Notes_Casio_CZ-101.docx diff --git a/src/scraping/buxton/source/Bill_Notes_FrogPad.docx b/src/scraping/buxton/source/Bill_Notes_FrogPad.docx Binary files differindex ba80c1959..d01e1bf5c 100644 --- a/src/scraping/buxton/source/Bill_Notes_FrogPad.docx +++ b/src/scraping/buxton/source/Bill_Notes_FrogPad.docx diff --git a/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx b/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx Binary files differindex 8558a4e13..b9a30c8a9 100644 --- a/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx +++ b/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx diff --git a/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx b/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx Binary files differindex 09e17f971..0615c4953 100644 --- a/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx +++ b/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx diff --git a/src/scraping/buxton/source/Bill_Notes_Kindle_3G_lighted_cover.docx b/src/scraping/buxton/source/Bill_Notes_Kindle_3G_lighted_cover.docx Binary files differnew file mode 100644 index 000000000..f00fcb772 --- /dev/null +++ b/src/scraping/buxton/source/Bill_Notes_Kindle_3G_lighted_cover.docx diff --git a/src/scraping/buxton/source/Bill_Notes_Microwriter.docx b/src/scraping/buxton/source/Bill_Notes_Microwriter.docx Binary files differnew file mode 100644 index 000000000..3ac272e42 --- /dev/null +++ b/src/scraping/buxton/source/Bill_Notes_Microwriter.docx diff --git a/src/scraping/buxton/source/Bill_Notes_NewO.docx b/src/scraping/buxton/source/Bill_Notes_NewO.docx Binary files differindex 2f4a04e81..a514926d2 100644 --- a/src/scraping/buxton/source/Bill_Notes_NewO.docx +++ b/src/scraping/buxton/source/Bill_Notes_NewO.docx diff --git a/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx b/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx Binary files differindex 3038de363..c0cf6ba9a 100644 --- a/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx +++ b/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx diff --git a/src/scraping/buxton/source/Bill_Notes_PARCtab.doc b/src/scraping/buxton/source/Bill_Notes_PARCtab.doc Binary files differnew file mode 100644 index 000000000..3cdc2d21b --- /dev/null +++ b/src/scraping/buxton/source/Bill_Notes_PARCtab.doc diff --git a/src/scraping/buxton/source/Bill_Notes_Twiddler.docx b/src/scraping/buxton/source/Bill_Notes_Twiddler.docx Binary files differnew file mode 100644 index 000000000..27b4acc85 --- /dev/null +++ b/src/scraping/buxton/source/Bill_Notes_Twiddler.docx diff --git a/src/scraping/buxton/source/Bill_Notes_orbiTouch.doc b/src/scraping/buxton/source/Bill_Notes_orbiTouch.doc Binary files differnew file mode 100644 index 000000000..6bd71f20e --- /dev/null +++ b/src/scraping/buxton/source/Bill_Notes_orbiTouch.doc |