proof of concept, imports metadata-embedded schemas

author: Sam Wilkins <samuel_wilkins@brown.edu> 2019-06-19 17:44:56 -0400
committer: Sam Wilkins <samuel_wilkins@brown.edu> 2019-06-19 17:44:56 -0400
commit: 5202cb26929a4bfe7b0881473ebcdebb06e91248 (patch)
tree: d2320ab0bce0a8a7b6390aee1ea6544aaafa85e6 /src/buxton/scraper.py
parent: cbb2f4191e31d72c8c727976b5616983af15af45 (diff)
1 files changed, 80 insertions, 33 deletions
diff --git a/src/buxton/scraper.py b/src/buxton/scraper.py
index c7efd8f09..c89961f54 100644
--- a/src/buxton/scraper.py
+++ b/src/buxton/scraper.py
@@ -13,7 +13,7 @@ source = "./source"
 dist = "../server/public/files"
 
 db = MongoClient("localhost", 27017)["Dash"]
-view_doc_guids = []
+schema_guids = []
 
 
 def extract_links(fileName):
@@ -24,7 +24,7 @@ def extract_links(fileName):
         item = rels[rel]
         if item.reltype == RT.HYPERLINK and ".aspx" not in item._target:
             links.append(item._target)
-    return links
+    return listify(links)
 
 
 def extract_value(kv_string):
@@ -44,6 +44,66 @@ def guid():
     return str(uuid.uuid4())
 
 
+def listify(list):
+    """Wrap ``list`` as ``{"fields": list, "__type": "list"}`` (no copy)."""
+    wrapped = {"fields": list}
+    wrapped["__type"] = "list"
+    return wrapped
+
+
+def protofy(fieldId):
+    """Build the ``{"fieldId": fieldId, "__type": "proxy"}`` dict."""
+    proxy = {"fieldId": fieldId}
+    proxy["__type"] = "proxy"
+    return proxy
+
+
+def write_schema(parse_results):
+    """Insert the parsed schema doc and a new view doc; return the view's id.
+
+    ``parse_results`` is the dict returned by ``parse_document``: "schema" is
+    the data document (its "fields" are mutated in place here) and
+    "view_guids" holds the values collected from ``write_image``.
+    """
+    data_doc = parse_results["schema"]
+    fields = data_doc["fields"]
+    view_doc_guid = guid()
+    view_doc = {
+        "_id": view_doc_guid,
+        "fields": {
+            "proto": protofy(data_doc["_id"]),
+            "x": 10,
+            "y": 10,
+            "width": 900,
+            "height": 600,
+            "panX": 0,
+            "panY": 0,
+            "zoomBasis": 0.5,
+            "zIndex": 2,
+            "libraryBrush": False,
+            "viewType": 2
+        },
+        "__type": "Doc"
+    }
+
+    fields["proto"] = protofy("collectionProto")
+    fields["data"] = listify(proxify_guids(parse_results["view_guids"]))
+    fields["schemaColumns"] = listify(["title", "data"])
+    fields["backgroundColor"] = "white"
+    fields["scale"] = 0.5
+    fields["viewType"] = 2
+    fields["author"] = "Bill Buxton"
+    # Was ``utcnow().microsecond``: only the 0-999999 sub-second part, not a
+    # time. Epoch milliseconds assumed for the "date" field -- TODO confirm.
+    now = datetime.datetime.now(datetime.timezone.utc)
+    fields["creationDate"] = {"date": int(now.timestamp() * 1000), "__type": "date"}
+    fields["isPrototype"] = True
+    fields["page"] = -1
+    db.newDocuments.insert_one(data_doc)
+    db.newDocuments.insert_one(view_doc)
+    return view_doc_guid
+
+
 def write_image(folder, name):
     path = f"http://localhost:1050/files/{folder}/{name}"
 
@@ -53,10 +113,7 @@ def write_image(folder, name):
     view_doc = {
         "_id": view_doc_guid,
         "fields": {
-            "proto": {
-                "fieldId": data_doc_guid,
-                "__type": "proxy"
-            },
+            "proto": protofy(data_doc_guid),
             "x": 10,
             "y": 10,
             "width": 300,
@@ -72,10 +129,7 @@ def write_image(folder, name):
     data_doc = {
         "_id": data_doc_guid,
         "fields": {
-            "proto": {
-                "_id": "imageProto",
-                "__type": "proxy"
-            },
+            "proto": protofy("imageProto"),
             "data": {
                 "url": path,
                 "__type": "image"
@@ -115,8 +169,9 @@ def parse_document(file_name: str):
     raw = str(docx2txt.process(source + "/" + file_name, dir_path))
 
     print("Extracting images...\n")
+    view_guids = []
     for image in os.listdir(dir_path):
-        view_doc_guids.append(write_image(pure_name, image))
+        view_guids.append(write_image(pure_name, image))
         os.rename(dir_path + "/" + image, dir_path +
                   "/" + image.replace(".", "_m.", 1))
     print()
@@ -178,7 +233,7 @@ def parse_document(file_name: str):
     while lines[cur] != "Image":
         link_descriptions.append(lines[cur].strip())
         cur += 1
-    result["link_descriptions"] = link_descriptions
+    result["link_descriptions"] = listify(link_descriptions)
 
     result["hyperlinks"] = extract_links(source + "/" + file_name)
 
@@ -189,8 +244,8 @@ def parse_document(file_name: str):
         images.append(lines[cur])
         captions.append(lines[cur + 1])
         cur += 2
-    result["images"] = images
-    result["captions"] = captions
+    result["images"] = listify(images)
+    result["captions"] = listify(captions)
 
     notes = []
     if (cur < len(lines) and lines[cur] == "NOTES:"):
@@ -199,24 +254,22 @@ def parse_document(file_name: str):
             notes.append(lines[cur])
             cur += 1
     if len(notes) > 0:
-        result["notes"] = notes
+        result["notes"] = listify(notes)
 
     print("...contents dictionary constructed.")
 
-    return result
-
-
-def wrap(document):
     return {
-        "_id": guid(),
-        "fields": document,
-        "__type": "Doc"
+        "schema": {
+            "_id": guid(),
+            "fields": result,
+            "__type": "Doc"
+        },
+        "view_guids": view_guids
     }
 
 
-def upload(collection, mongofied):
-    for doc in mongofied:
-        collection.insert_one(doc)
+def proxify_guids(guids):  # returns one protofy() proxy dict per id, in order
+    return [protofy(doc_id) for doc_id in guids]
 
 
 if os.path.exists(dist):
@@ -227,20 +280,14 @@ os.mkdir(dist)
 mkdir_if_absent(source)
 
 candidates = 0
-mongofied = []
 for file_name in os.listdir(source):
     if file_name.endswith('.docx'):
         candidates += 1
-        mongofied.append(wrap(parse_document(file_name)))
-
-for doc in mongofied:
-    db.newDocuments.insert_one(doc)
+        schema_guids.append(write_schema(parse_document(file_name)))
 
-proxified = list(
-    map(lambda guid: {"fieldId": guid, "__type": "proxy"}, view_doc_guids))
 db.newDocuments.update_one(
     {"fields.title": "WS collection 1"},
-    {"$push": {"fields.data.fields": {"$each": proxified}}}
+    {"$push": {"fields.data.fields": {"$each": proxify_guids(schema_guids)}}}
 )
 
 print("...dictionaries written to Dash Document.\n")
author	Sam Wilkins <samuel_wilkins@brown.edu>	2019-06-19 17:44:56 -0400
committer	Sam Wilkins <samuel_wilkins@brown.edu>	2019-06-19 17:44:56 -0400
commit	5202cb26929a4bfe7b0881473ebcdebb06e91248 (patch)
tree	d2320ab0bce0a8a7b6390aee1ea6544aaafa85e6 /src/buxton/scraper.py
parent	cbb2f4191e31d72c8c727976b5616983af15af45 (diff)