aboutsummaryrefslogtreecommitdiff
path: root/src/scraping
diff options
context:
space:
mode:
Diffstat (limited to 'src/scraping')
-rw-r--r--[-rwxr-xr-x]src/scraping/acm/chromedriverbin10256192 -> 10256192 bytes
-rw-r--r--src/scraping/buxton/scraper.py43
-rw-r--r--src/scraping/buxton/source/Bill_Notes_3_button_optical_mouse.docxbin0 -> 412208 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_Amazon_Kindle_Keyboard.docxbin0 -> 474022 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_Apple_Adj_Keyboard.docxbin0 -> 1758498 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_BAT.docxbin0 -> 1349620 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docxbin1675500 -> 1561425 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_Braun_T3.docxbin1671968 -> 1510917 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_CasioC801.docxbin574664 -> 413861 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_Casio_CZ-101.docxbin0 -> 523939 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_Casio_Mini.docxbin581069 -> 467304 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_FingerWorks_Prototype.docxbin585090 -> 423384 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_Fingerworks_TouchStream.docxbin1722555 -> 1558473 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_FrogPad.docxbin840173 -> 679241 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docxbin1695290 -> 1531689 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docxbin2094142 -> 1933004 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_Kindle_3G_lighted_cover.docxbin0 -> 919789 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_Matias.docxbin590407 -> 476141 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_Microwriter.docxbin0 -> 1042556 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_MousePen.docxbin505322 -> 344083 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_NewO.docxbin2264571 -> 2150143 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_OLPC.docxbin6883659 -> 6721592 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_PARCkbd.docxbin631959 -> 517484 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_Philco_Mystery_Control.docxbin1994439 -> 1880816 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_TASA_Kbd.docxbin461199 -> 347612 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_The_Tap.docxbin711321 -> 597382 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_Twiddler.docxbin0 -> 526307 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_orbiTouch.docbin0 -> 3945306 bytes
28 files changed, 33 insertions, 10 deletions
diff --git a/src/scraping/acm/chromedriver b/src/scraping/acm/chromedriver
index 9e9b16717..9e9b16717 100755..100644
--- a/src/scraping/acm/chromedriver
+++ b/src/scraping/acm/chromedriver
Binary files differ
diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py
index 700269727..f0f45d8f9 100644
--- a/src/scraping/buxton/scraper.py
+++ b/src/scraping/buxton/scraper.py
@@ -1,4 +1,5 @@
import os
+from shutil import copyfile
import docx2txt
from docx import Document
from docx.opc.constants import RELATIONSHIP_TYPE as RT
@@ -15,7 +16,9 @@ source = "./source"
dist = "../../server/public/files"
db = MongoClient("localhost", 27017)["Dash"]
+target_collection = db.newDocuments
schema_guids = []
+common_proto_id = ""
def extract_links(fileName):
@@ -84,7 +87,7 @@ def write_schema(parse_results, display_fields, storage_key):
"height": 600,
"panX": 0,
"panY": 0,
- "zoomBasis": 0.5,
+ "zoomBasis": 1,
"zIndex": 2,
"libraryBrush": False,
"viewType": 2
@@ -92,7 +95,7 @@ def write_schema(parse_results, display_fields, storage_key):
"__type": "Doc"
}
- fields["proto"] = protofy("collectionProto")
+ fields["proto"] = protofy(common_proto_id)
fields[storage_key] = listify(proxify_guids(view_guids))
fields["schemaColumns"] = listify(display_fields)
fields["backgroundColor"] = "white"
@@ -106,8 +109,8 @@ def write_schema(parse_results, display_fields, storage_key):
fields["isPrototype"] = True
fields["page"] = -1
- db.newDocuments.insert_one(data_doc)
- db.newDocuments.insert_one(view_doc)
+ target_collection.insert_one(data_doc)
+ target_collection.insert_one(view_doc)
data_doc_guid = data_doc["_id"]
print(f"inserted view document ({view_doc_guid})")
@@ -158,8 +161,8 @@ def write_text_doc(content):
"__type": "Doc"
}
- db.newDocuments.insert_one(view_doc)
- db.newDocuments.insert_one(data_doc)
+ target_collection.insert_one(view_doc)
+ target_collection.insert_one(data_doc)
return view_doc_guid
@@ -209,8 +212,8 @@ def write_image(folder, name):
"__type": "Doc"
}
- db.newDocuments.insert_one(view_doc)
- db.newDocuments.insert_one(data_doc)
+ target_collection.insert_one(view_doc)
+ target_collection.insert_one(data_doc)
return view_doc_guid
@@ -231,7 +234,9 @@ def parse_document(file_name: str):
for image in os.listdir(dir_path):
count += 1
view_guids.append(write_image(pure_name, image))
- os.rename(dir_path + "/" + image, dir_path +
+ copyfile(dir_path + "/" + image, dir_path +
+ "/" + image.replace(".", "_o.", 1))
+ copyfile(dir_path + "/" + image, dir_path +
"/" + image.replace(".", "_m.", 1))
print(f"extracted {count} images...")
@@ -347,6 +352,22 @@ def proxify_guids(guids):
return list(map(lambda guid: {"fieldId": guid, "__type": "proxy"}, guids))
+def write_common_proto():
+ id = guid()
+ common_proto = {
+ "_id": id,
+ "fields": {
+ "proto": protofy("collectionProto"),
+ "title": "Common Import Proto",
+ },
+ "__type": "Doc"
+ }
+
+ target_collection.insert_one(common_proto)
+
+ return id
+
+
if os.path.exists(dist):
shutil.rmtree(dist)
while os.path.exists(dist):
@@ -354,6 +375,8 @@ while os.path.exists(dist):
os.mkdir(dist)
mkdir_if_absent(source)
+common_proto_id = write_common_proto()
+
candidates = 0
for file_name in os.listdir(source):
if file_name.endswith('.docx'):
@@ -372,7 +395,7 @@ parent_guid = write_schema({
}, ["title", "short_description", "original_price"], "data")
print("appending parent schema to main workspace...\n")
-db.newDocuments.update_one(
+target_collection.update_one(
{"fields.title": "WS collection 1"},
{"$push": {"fields.data.fields": {"fieldId": parent_guid, "__type": "proxy"}}}
)
diff --git a/src/scraping/buxton/source/Bill_Notes_3_button_optical_mouse.docx b/src/scraping/buxton/source/Bill_Notes_3_button_optical_mouse.docx
new file mode 100644
index 000000000..a2ab04b78
--- /dev/null
+++ b/src/scraping/buxton/source/Bill_Notes_3_button_optical_mouse.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Amazon_Kindle_Keyboard.docx b/src/scraping/buxton/source/Bill_Notes_Amazon_Kindle_Keyboard.docx
new file mode 100644
index 000000000..e4375ebeb
--- /dev/null
+++ b/src/scraping/buxton/source/Bill_Notes_Amazon_Kindle_Keyboard.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Apple_Adj_Keyboard.docx b/src/scraping/buxton/source/Bill_Notes_Apple_Adj_Keyboard.docx
new file mode 100644
index 000000000..99f7ad19d
--- /dev/null
+++ b/src/scraping/buxton/source/Bill_Notes_Apple_Adj_Keyboard.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_BAT.docx b/src/scraping/buxton/source/Bill_Notes_BAT.docx
new file mode 100644
index 000000000..0e3368611
--- /dev/null
+++ b/src/scraping/buxton/source/Bill_Notes_BAT.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx b/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx
index 06094b4d3..649d636e3 100644
--- a/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx
+++ b/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Braun_T3.docx b/src/scraping/buxton/source/Bill_Notes_Braun_T3.docx
index 356697092..b00080e08 100644
--- a/src/scraping/buxton/source/Bill_Notes_Braun_T3.docx
+++ b/src/scraping/buxton/source/Bill_Notes_Braun_T3.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_CasioC801.docx b/src/scraping/buxton/source/Bill_Notes_CasioC801.docx
index cd89fb97b..510a006e0 100644
--- a/src/scraping/buxton/source/Bill_Notes_CasioC801.docx
+++ b/src/scraping/buxton/source/Bill_Notes_CasioC801.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Casio_CZ-101.docx b/src/scraping/buxton/source/Bill_Notes_Casio_CZ-101.docx
new file mode 100644
index 000000000..c8d3943c0
--- /dev/null
+++ b/src/scraping/buxton/source/Bill_Notes_Casio_CZ-101.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Casio_Mini.docx b/src/scraping/buxton/source/Bill_Notes_Casio_Mini.docx
index a503cddfc..cea9e7b69 100644
--- a/src/scraping/buxton/source/Bill_Notes_Casio_Mini.docx
+++ b/src/scraping/buxton/source/Bill_Notes_Casio_Mini.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_FingerWorks_Prototype.docx b/src/scraping/buxton/source/Bill_Notes_FingerWorks_Prototype.docx
index 4d13a8cf5..f53402a06 100644
--- a/src/scraping/buxton/source/Bill_Notes_FingerWorks_Prototype.docx
+++ b/src/scraping/buxton/source/Bill_Notes_FingerWorks_Prototype.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Fingerworks_TouchStream.docx b/src/scraping/buxton/source/Bill_Notes_Fingerworks_TouchStream.docx
index 578a1be08..0eec89949 100644
--- a/src/scraping/buxton/source/Bill_Notes_Fingerworks_TouchStream.docx
+++ b/src/scraping/buxton/source/Bill_Notes_Fingerworks_TouchStream.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_FrogPad.docx b/src/scraping/buxton/source/Bill_Notes_FrogPad.docx
index d01e1bf5c..ba80c1959 100644
--- a/src/scraping/buxton/source/Bill_Notes_FrogPad.docx
+++ b/src/scraping/buxton/source/Bill_Notes_FrogPad.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx b/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx
index 7bd28b376..8558a4e13 100644
--- a/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx
+++ b/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx b/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx
index 0615c4953..09e17f971 100644
--- a/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx
+++ b/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Kindle_3G_lighted_cover.docx b/src/scraping/buxton/source/Bill_Notes_Kindle_3G_lighted_cover.docx
new file mode 100644
index 000000000..f00fcb772
--- /dev/null
+++ b/src/scraping/buxton/source/Bill_Notes_Kindle_3G_lighted_cover.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Matias.docx b/src/scraping/buxton/source/Bill_Notes_Matias.docx
index 547603256..d2d014bbe 100644
--- a/src/scraping/buxton/source/Bill_Notes_Matias.docx
+++ b/src/scraping/buxton/source/Bill_Notes_Matias.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Microwriter.docx b/src/scraping/buxton/source/Bill_Notes_Microwriter.docx
new file mode 100644
index 000000000..3ac272e42
--- /dev/null
+++ b/src/scraping/buxton/source/Bill_Notes_Microwriter.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_MousePen.docx b/src/scraping/buxton/source/Bill_Notes_MousePen.docx
index 4e1056636..cd0b3eab3 100644
--- a/src/scraping/buxton/source/Bill_Notes_MousePen.docx
+++ b/src/scraping/buxton/source/Bill_Notes_MousePen.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_NewO.docx b/src/scraping/buxton/source/Bill_Notes_NewO.docx
index a514926d2..2f4a04e81 100644
--- a/src/scraping/buxton/source/Bill_Notes_NewO.docx
+++ b/src/scraping/buxton/source/Bill_Notes_NewO.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_OLPC.docx b/src/scraping/buxton/source/Bill_Notes_OLPC.docx
index bfca0a9bb..7a636e2d6 100644
--- a/src/scraping/buxton/source/Bill_Notes_OLPC.docx
+++ b/src/scraping/buxton/source/Bill_Notes_OLPC.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx b/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx
index c0cf6ba9a..3038de363 100644
--- a/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx
+++ b/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Philco_Mystery_Control.docx b/src/scraping/buxton/source/Bill_Notes_Philco_Mystery_Control.docx
index ad06903f3..af72fa662 100644
--- a/src/scraping/buxton/source/Bill_Notes_Philco_Mystery_Control.docx
+++ b/src/scraping/buxton/source/Bill_Notes_Philco_Mystery_Control.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_TASA_Kbd.docx b/src/scraping/buxton/source/Bill_Notes_TASA_Kbd.docx
index e4c659de9..5c2eb8d7f 100644
--- a/src/scraping/buxton/source/Bill_Notes_TASA_Kbd.docx
+++ b/src/scraping/buxton/source/Bill_Notes_TASA_Kbd.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_The_Tap.docx b/src/scraping/buxton/source/Bill_Notes_The_Tap.docx
index 8ceebc71e..c9ee2eaea 100644
--- a/src/scraping/buxton/source/Bill_Notes_The_Tap.docx
+++ b/src/scraping/buxton/source/Bill_Notes_The_Tap.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Twiddler.docx b/src/scraping/buxton/source/Bill_Notes_Twiddler.docx
new file mode 100644
index 000000000..27b4acc85
--- /dev/null
+++ b/src/scraping/buxton/source/Bill_Notes_Twiddler.docx
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_orbiTouch.doc b/src/scraping/buxton/source/Bill_Notes_orbiTouch.doc
new file mode 100644
index 000000000..6bd71f20e
--- /dev/null
+++ b/src/scraping/buxton/source/Bill_Notes_orbiTouch.doc
Binary files differ