about | summary | refs | log | tree | commit | diff
path: root/src/scraping
diff options
context:
space:
mode:
author monikahedman <monika_hedman@brown.edu> 2019-08-08 15:03:03 -0400
committer monikahedman <monika_hedman@brown.edu> 2019-08-08 15:03:03 -0400
commit 8f951d9110d096d665af6fbd295902ef8d3574e8 (patch)
tree 37cc6881cbf93aeea5deae53a6415d6607377edc /src/scraping
parent 030af1b9112cd12383abcd7f35142cc382ea4d6a (diff)
parent 316c241d72fb83aad5f2bf9b143c317fdc906654 (diff)
Merge branch 'master' of https://github.com/browngraphicslab/Dash-Web into checkbox
Diffstat (limited to 'src/scraping')
-rw-r--r-- src/scraping/buxton/scraper.py 21
-rw-r--r-- src/scraping/buxton/source/Bill_Notes_3_button_optical_mouse.docx bin 0 -> 412208 bytes
-rw-r--r-- src/scraping/buxton/source/Bill_Notes_Amazon_Kindle_Keyboard.docx bin 0 -> 474022 bytes
-rw-r--r-- src/scraping/buxton/source/Bill_Notes_Apple_Adj_Keyboard.docx bin 0 -> 1758498 bytes
-rw-r--r-- src/scraping/buxton/source/Bill_Notes_Apple_Mac_Portable.docx bin 0 -> 748412 bytes
-rw-r--r-- src/scraping/buxton/source/Bill_Notes_BAT.docx bin 0 -> 1349620 bytes
-rw-r--r-- src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx bin 1561425 -> 1675500 bytes
-rw-r--r-- src/scraping/buxton/source/Bill_Notes_Casio_CZ-101.docx bin 0 -> 523939 bytes
-rw-r--r-- src/scraping/buxton/source/Bill_Notes_FrogPad.docx bin 679241 -> 840173 bytes
-rw-r--r-- src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx bin 1531689 -> 1729610 bytes
-rw-r--r-- src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx bin 1933004 -> 2094142 bytes
-rw-r--r-- src/scraping/buxton/source/Bill_Notes_Kindle_3G_lighted_cover.docx bin 0 -> 919789 bytes
-rw-r--r-- src/scraping/buxton/source/Bill_Notes_Microwriter.docx bin 0 -> 1042556 bytes
-rw-r--r-- src/scraping/buxton/source/Bill_Notes_NewO.docx bin 2150143 -> 2264571 bytes
-rw-r--r-- src/scraping/buxton/source/Bill_Notes_PARCkbd.docx bin 517484 -> 631959 bytes
-rw-r--r-- src/scraping/buxton/source/Bill_Notes_PARCtab.doc bin 0 -> 4046250 bytes
-rw-r--r-- src/scraping/buxton/source/Bill_Notes_Twiddler.docx bin 0 -> 526307 bytes
-rw-r--r-- src/scraping/buxton/source/Bill_Notes_orbiTouch.doc bin 0 -> 3945306 bytes
18 files changed, 10 insertions, 11 deletions
diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py
index f0f45d8f9..807216ef1 100644
--- a/src/scraping/buxton/scraper.py
+++ b/src/scraping/buxton/scraper.py
@@ -17,6 +17,7 @@ dist = "../../server/public/files"
db = MongoClient("localhost", 27017)["Dash"]
target_collection = db.newDocuments
+target_doc_title = "Workspace 1"
schema_guids = []
common_proto_id = ""
@@ -69,7 +70,7 @@ def text_doc_map(string_list):
return listify(proxify_guids(list(map(guid_map, string_list))))
-def write_schema(parse_results, display_fields, storage_key):
+def write_collection(parse_results, display_fields, storage_key, viewType=2):
view_guids = parse_results["child_guids"]
data_doc = parse_results["schema"]
@@ -90,7 +91,7 @@ def write_schema(parse_results, display_fields, storage_key):
"zoomBasis": 1,
"zIndex": 2,
"libraryBrush": False,
- "viewType": 2
+ "viewType": viewType
},
"__type": "Doc"
}
@@ -130,8 +131,7 @@ def write_text_doc(content):
"x": 10,
"y": 10,
"width": 400,
- "zIndex": 2,
- "libraryBrush": False
+ "zIndex": 2
},
"__type": "Doc"
}
@@ -183,8 +183,7 @@ def write_image(folder, name):
"x": 10,
"y": 10,
"width": min(800, native_width),
- "zIndex": 2,
- "libraryBrush": False
+ "zIndex": 2
},
"__type": "Doc"
}
@@ -237,7 +236,7 @@ def parse_document(file_name: str):
copyfile(dir_path + "/" + image, dir_path +
"/" + image.replace(".", "_o.", 1))
copyfile(dir_path + "/" + image, dir_path +
- "/" + image.replace(".", "_m.", 1))
+ "/" + image.replace(".", "_m.", 1))
print(f"extracted {count} images...")
def sanitize(line): return re.sub("[\n\t]+", "", line).replace(u"\u00A0", " ").replace(
@@ -381,22 +380,22 @@ candidates = 0
for file_name in os.listdir(source):
if file_name.endswith('.docx'):
candidates += 1
- schema_guids.append(write_schema(
+ schema_guids.append(write_collection(
parse_document(file_name), ["title", "data"], "image_data"))
print("writing parent schema...")
-parent_guid = write_schema({
+parent_guid = write_collection({
"schema": {
"_id": guid(),
"fields": {},
"__type": "Doc"
},
"child_guids": schema_guids
-}, ["title", "short_description", "original_price"], "data")
+}, ["title", "short_description", "original_price"], "data", 1)
print("appending parent schema to main workspace...\n")
target_collection.update_one(
- {"fields.title": "WS collection 1"},
+ {"fields.title": target_doc_title},
{"$push": {"fields.data.fields": {"fieldId": parent_guid, "__type": "proxy"}}}
)
diff --git a/src/scraping/buxton/source/Bill_Notes_3_button_optical_mouse.docx b/src/scraping/buxton/source/Bill_Notes_3_button_optical_mouse.docx
new file mode 100644
index 000000000..a2ab04b78
--- /dev/null
+++ b/src/scraping/buxton/source/Bill_Notes_3_button_optical_mouse.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Amazon_Kindle_Keyboard.docx b/src/scraping/buxton/source/Bill_Notes_Amazon_Kindle_Keyboard.docx
new file mode 100644
index 000000000..e4375ebeb
--- /dev/null
+++ b/src/scraping/buxton/source/Bill_Notes_Amazon_Kindle_Keyboard.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Apple_Adj_Keyboard.docx b/src/scraping/buxton/source/Bill_Notes_Apple_Adj_Keyboard.docx
new file mode 100644
index 000000000..99f7ad19d
--- /dev/null
+++ b/src/scraping/buxton/source/Bill_Notes_Apple_Adj_Keyboard.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Apple_Mac_Portable.docx b/src/scraping/buxton/source/Bill_Notes_Apple_Mac_Portable.docx
new file mode 100644
index 000000000..df1aafe9c
--- /dev/null
+++ b/src/scraping/buxton/source/Bill_Notes_Apple_Mac_Portable.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_BAT.docx b/src/scraping/buxton/source/Bill_Notes_BAT.docx
new file mode 100644
index 000000000..0e3368611
--- /dev/null
+++ b/src/scraping/buxton/source/Bill_Notes_BAT.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx b/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx
index 649d636e3..06094b4d3 100644
--- a/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx
+++ b/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Casio_CZ-101.docx b/src/scraping/buxton/source/Bill_Notes_Casio_CZ-101.docx
new file mode 100644
index 000000000..c8d3943c0
--- /dev/null
+++ b/src/scraping/buxton/source/Bill_Notes_Casio_CZ-101.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_FrogPad.docx b/src/scraping/buxton/source/Bill_Notes_FrogPad.docx
index ba80c1959..d01e1bf5c 100644
--- a/src/scraping/buxton/source/Bill_Notes_FrogPad.docx
+++ b/src/scraping/buxton/source/Bill_Notes_FrogPad.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx b/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx
index 8558a4e13..b9a30c8a9 100644
--- a/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx
+++ b/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx b/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx
index 09e17f971..0615c4953 100644
--- a/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx
+++ b/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Kindle_3G_lighted_cover.docx b/src/scraping/buxton/source/Bill_Notes_Kindle_3G_lighted_cover.docx
new file mode 100644
index 000000000..f00fcb772
--- /dev/null
+++ b/src/scraping/buxton/source/Bill_Notes_Kindle_3G_lighted_cover.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Microwriter.docx b/src/scraping/buxton/source/Bill_Notes_Microwriter.docx
new file mode 100644
index 000000000..3ac272e42
--- /dev/null
+++ b/src/scraping/buxton/source/Bill_Notes_Microwriter.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_NewO.docx b/src/scraping/buxton/source/Bill_Notes_NewO.docx
index 2f4a04e81..a514926d2 100644
--- a/src/scraping/buxton/source/Bill_Notes_NewO.docx
+++ b/src/scraping/buxton/source/Bill_Notes_NewO.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx b/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx
index 3038de363..c0cf6ba9a 100644
--- a/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx
+++ b/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_PARCtab.doc b/src/scraping/buxton/source/Bill_Notes_PARCtab.doc
new file mode 100644
index 000000000..3cdc2d21b
--- /dev/null
+++ b/src/scraping/buxton/source/Bill_Notes_PARCtab.doc
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Twiddler.docx b/src/scraping/buxton/source/Bill_Notes_Twiddler.docx
new file mode 100644
index 000000000..27b4acc85
--- /dev/null
+++ b/src/scraping/buxton/source/Bill_Notes_Twiddler.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_orbiTouch.doc b/src/scraping/buxton/source/Bill_Notes_orbiTouch.doc
new file mode 100644
index 000000000..6bd71f20e
--- /dev/null
+++ b/src/scraping/buxton/source/Bill_Notes_orbiTouch.doc
Binary files differ