From 90d6454c05cdeb109da25dd55d428c140defca49 Mon Sep 17 00:00:00 2001
From: Sam Wilkins
Date: Sun, 2 Feb 2020 12:46:57 -0500
Subject: fixed scraper

---
 src/scraping/buxton/.idea/workspace.xml            |  46 ++-
 src/scraping/buxton/narratives.py                  |  38 ++
 .../buxton/narratives/Theme - Chord Kbds.docx      | Bin 0 -> 5701815 bytes
 .../buxton/narratives/chord_keyboards.json         |  39 ++
 src/scraping/buxton/scraper.py                     | 399 ++++++++++++++++-----
 5 files changed, 411 insertions(+), 111 deletions(-)
 create mode 100644 src/scraping/buxton/narratives.py
 create mode 100644 src/scraping/buxton/narratives/Theme - Chord Kbds.docx
 create mode 100644 src/scraping/buxton/narratives/chord_keyboards.json

diff --git a/src/scraping/buxton/.idea/workspace.xml b/src/scraping/buxton/.idea/workspace.xml
index b2c7d4b8c..6f1ae3814 100644
--- a/src/scraping/buxton/.idea/workspace.xml
+++ b/src/scraping/buxton/.idea/workspace.xml
@@ -126,7 +126,7 @@
 [hunk body unrecoverable: the XML markup of this IDE workspace file was stripped in extraction; the changes touch only window/editor state]
@@ -188,30 +210,30 @@
 [hunk body unrecoverable, as above]

diff --git a/src/scraping/buxton/narratives.py b/src/scraping/buxton/narratives.py
new file mode 100644
index 000000000..947d60f91
--- /dev/null
+++ b/src/scraping/buxton/narratives.py
@@ -0,0 +1,38 @@
+from docx import Document
+import tempfile
+from zipfile import ZipFile
+import shutil
+from pathlib import Path
+from os import mkdir
+
+path = "./narratives/Theme - Chord Kbds.docx"
+doc = Document(path)
+
+# IMAGE_EXT = ('png', 'jpeg', 'jpg')
+#
+# with tempfile.TemporaryDirectory() as working_dir:
+#     with ZipFile(path) as working_zip:
+#         image_list = [name for name in working_zip.namelist() if any(name.endswith(ext) for ext in IMAGE_EXT)]
+#         working_zip.extractall(working_dir, image_list)
+#     mkdir("./test")
+#     for image in image_list:
+#         shutil.copy(Path(working_dir).resolve() / image, "./test")
+
+paragraphs = doc.paragraphs
+for i in range(len(paragraphs)):
+    print(f"{i}: {paragraphs[i].text}")
+
+# for section in doc.sections:
+#     print(section.orientation)
+
+# for shape in doc.inline_shapes:
+#     print(shape._inline)
+
+# images = doc.tables[0]
+# for row in images.rows:
+#     contents = []
+#     for cell in row.cells:
+#         contents.append(cell.text)
+#     print(contents)
+
+

diff --git a/src/scraping/buxton/narratives/Theme - Chord Kbds.docx b/src/scraping/buxton/narratives/Theme - Chord Kbds.docx
new file mode 100644
index 000000000..439a7d975
Binary files /dev/null and b/src/scraping/buxton/narratives/Theme - Chord Kbds.docx differ

diff --git a/src/scraping/buxton/narratives/chord_keyboards.json b/src/scraping/buxton/narratives/chord_keyboards.json
new file mode 100644
index 000000000..748578769
--- /dev/null
+++ b/src/scraping/buxton/narratives/chord_keyboards.json
@@ -0,0 +1,39 @@
+{
+  "slides": [{
+      "text": "Theme: Chord Keyboards\nFrom music to type\n\nChord keyboards require 2 or more keys to be pushed simultaneously to produce the intended output. Playing a chord on a piano or pressing both the shift key and a letter key on a typewriter to enter an upper-case character are examples.",
+      "devices": ["Casio CZ-101"]
+    },
+    {
+      "text": "This is an early mechanical keyboard for taking dictation. Instead of typing alphanumeric characters as on a typewriter, pressing different combinations prints shorthand symbols on the tape, each representing a different phoneme. Speech is easier to keep up with this way, since each phoneme typically represents multiple characters.\n\nThe downside – until AI came to the rescue – was that it then took hours to manually transcribe the shorthand into conventional readable text.",
+      "devices": ["Grandjean Sténotype"]
+    },
+    {
+      "text": "Designed and manufactured in the DDR, the purpose of this keyboard is to emboss dots representing Braille symbols onto paper. The effect is to enable blind users to use their tactile sensitivity to read with their fingers.\n\nEach Braille symbol consists of two columns of 3 embossed dots each. Which 3 dots are embossed in each column is determined by which of the three keys on either side are simultaneously pressed. The key in the middle, operated by either thumb, enters a space.",
+      "devices": ["Braille Writer"]
+    },
+    {
+      "text": "This combination is derived from the work of the inventor of the mouse, Doug Engelbart.\n\nWhile these are 2 distinct devices, they are not what they appear to be.\n\nFunctionally, there is a virtual 7-button chord keyboard, employing the 5 buttons on the keyset and the middle and right buttons of the mouse. And, using the left mouse button, there is also a 1-button mouse.\n\nText was entered using a minor variant of 7-bit ASCII. The intent was to enable entering small bits of text without moving back and forth between mouse and QWERTY keyboard. It didn’t catch on.",
+      "devices": ["Xerox PARC 5-Button Keyset & 3-Button Mouse"]
+    },
+    {
+      "text": "",
+      "devices": []
+    },
+    {
+      "text": "",
+      "devices": []
+    },
+    {
+      "text": "",
+      "devices": []
+    },
+    {
+      "text": "",
+      "devices": []
+    },
+    {
+      "text": "",
+      "devices": []
+    }
+  ]
+}
\ No newline at end of file

diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py
index 394958823..f7a38112d 100644
--- a/src/scraping/buxton/scraper.py
+++ b/src/scraping/buxton/scraper.py
@@ -1,32 +1,36 @@
 import os
+from shutil import copyfile
 import docx2txt
 from docx import Document
 from docx.opc.constants import RELATIONSHIP_TYPE as RT
 import re
+from pymongo import MongoClient
 import shutil
 import uuid
-import json
-import base64
-from shutil import copyfile
+import datetime
 from PIL import Image
+import math
+
+source = "./source"
+filesPath = "../../server/public/files"
+image_dist = filesPath + "/images/buxton"
 
-files_path = "../../server/public/files"
-source_path = "./source"
-temp_images_path = "./extracted_images"
-server_images_path = f"{files_path}/images/buxton"
-json_path = "./json"
+db = MongoClient("localhost", 27017)["Dash"]
+target_collection = db.newDocuments
+target_doc_title = "Collection 1"
+schema_guids = []
+common_proto_id = ""
 
 
-# noinspection PyProtectedMember
-def extract_links(file):
+def extract_links(fileName):
     links = []
-    doc = Document(file)
+    doc = Document(fileName)
     rels = doc.part.rels
     for rel in rels:
         item = rels[rel]
         if item.reltype == RT.HYPERLINK and ".aspx" not in item._target:
             links.append(item._target)
-    return links
+    return text_doc_map(links)
 
 
 def extract_value(kv_string):
@@ -46,58 +50,228 @@ def guid():
     return str(uuid.uuid4())
 
 
-def encode_image(folder: str, name: str):
-    with open(f"{temp_images_path}/{folder}/{name}", "rb") as image:
-        encoded = base64.b64encode(image.read())
-        return encoded.decode("utf-8")
-
-
-def parse_document(name: str):
-    print(f"parsing {name}...")
-    pure_name = name.split(".")[0]
+def listify(list):
+    return {
+        "fields": list,
+        "__type": "list"
+    }
+
+
+def protofy(fieldId):
+    return {
+        "fieldId": fieldId,
+        "__type": "proxy"
+    }
+
+
+def text_doc_map(string_list):
+    def guid_map(caption):
+        return write_text_doc(caption)
+    return listify(proxify_guids(list(map(guid_map, string_list))))
+
+
+def write_collection(parse_results, display_fields, storage_key, viewType):
+    view_guids = parse_results["child_guids"]
+
+    data_doc = parse_results["schema"]
+    fields = data_doc["fields"]
+
+    view_doc_guid = guid()
+
+    view_doc = {
+        "_id": view_doc_guid,
+        "fields": {
+            "proto": protofy(data_doc["_id"]),
+            "x": 10,
+            "y": 10,
+            "_width": 900,
+            "_height": 600,
+            "_panX": 0,
+            "_panY": 0,
+            "zIndex": 2,
+            "libraryBrush": False,
+            "_viewType": viewType,
+            "_LODdisable": True
+        },
+        "__type": "Doc"
+    }
+
+    fields["proto"] = protofy(common_proto_id)
+    fields[storage_key] = listify(proxify_guids(view_guids))
+    fields["schemaColumns"] = listify(display_fields)
+    fields["author"] = "Bill Buxton"
+    fields["creationDate"] = {
+        "date": datetime.datetime.utcnow().microsecond,
+        "__type": "date"
+    }
+    if "image_urls" in parse_results:
+        fields["hero"] = {
+            "url": parse_results["image_urls"][0],
+            "__type": "image"
+        }
+    fields["isPrototype"] = True
+
+    target_collection.insert_one(data_doc)
+    target_collection.insert_one(view_doc)
+
+    data_doc_guid = data_doc["_id"]
+    print(f"inserted view document ({view_doc_guid})")
+    print(f"inserted data document ({data_doc_guid})\n")
+
+    return view_doc_guid
+
+
+def write_text_doc(content):
+    data_doc_guid = guid()
+    view_doc_guid = guid()
+
+    view_doc = {
+        "_id": view_doc_guid,
+        "fields": {
+            "proto": protofy(data_doc_guid),
+            "x": 10,
+            "y": 10,
+            "_width": 400,
+            "zIndex": 2
+        },
+        "__type": "Doc"
+    }
+
+    data_doc = {
+        "_id": data_doc_guid,
+        "fields": {
+            "proto": protofy("textProto"),
+            "data": {
+                "Data": '{"doc":{"type":"doc","content":[{"type":"paragraph","content":[{"type":"text","text":"' + content + '"}]}]},"selection":{"type":"text","anchor":1,"head":1}' + '}',
+                "__type": "RichTextField"
+            },
+            "title": content,
+            "_nativeWidth": 200,
+            "author": "Bill Buxton",
+            "creationDate": {
+                "date": datetime.datetime.utcnow().microsecond,
+                "__type": "date"
+            },
+            "isPrototype": True,
+            "_autoHeight": True,
+            "page": -1,
+            "_nativeHeight": 200,
+            "_height": 200,
+            "data_text": content
+        },
+        "__type": "Doc"
+    }
+
+    target_collection.insert_one(view_doc)
+    target_collection.insert_one(data_doc)
+
+    return view_doc_guid
+
+
+def write_image(folder, name):
+    path = f"http://localhost:1050/files/images/buxton/{folder}/{name}"
+
+    data_doc_guid = guid()
+    view_doc_guid = guid()
+
+    image = Image.open(f"{image_dist}/{folder}/{name}")
+    native_width, native_height = image.size
+
+    if abs(native_width - native_height) < 10:
+        return None
+
+    view_doc = {
+        "_id": view_doc_guid,
+        "fields": {
+            "proto": protofy(data_doc_guid),
+            "x": 10,
+            "y": 10,
+            "_width": min(800, native_width),
+            "zIndex": 2,
+            "widthUnit": "*",
+            "widthMagnitude": 1
+        },
+        "__type": "Doc"
+    }
+
+    data_doc = {
+        "_id": data_doc_guid,
+        "fields": {
+            "proto": protofy("imageProto"),
+            "data": {
+                "url": path,
+                "__type": "image"
+            },
+            "title": name,
+            "_nativeWidth": native_width,
+            "author": "Bill Buxton",
+            "creationDate": {
+                "date": datetime.datetime.utcnow().microsecond,
+                "__type": "date"
+            },
+            "isPrototype": True,
+            "page": -1,
+            "_nativeHeight": native_height,
+            "_height": native_height
+        },
+        "__type": "Doc"
+    }
+
+    target_collection.insert_one(view_doc)
+    target_collection.insert_one(data_doc)
+
+    return {
+        "layout_id": view_doc_guid,
+        "url": path
+    }
+
+
+def parse_document(file_name: str):
+    print(f"parsing {file_name}...")
+    pure_name = file_name.split(".")[0]
     result = {}
 
-    saved_device_images_dir = server_images_path + "/" + pure_name
-    temp_device_images_dir = temp_images_path + "/" + pure_name
-    mkdir_if_absent(temp_device_images_dir)
-    mkdir_if_absent(saved_device_images_dir)
-
-    raw = str(docx2txt.process(source_path +
-                               "/" + name, temp_device_images_dir))
-
-    extracted_images = []
-    for image in os.listdir(temp_device_images_dir):
-        temp = f"{temp_device_images_dir}/{image}"
-        native_width, native_height = Image.open(temp).size
-        if abs(native_width - native_height) < 10:
-            continue
-        original = saved_device_images_dir + "/" + image.replace(".", "_o.", 1)
-        medium = saved_device_images_dir + "/" + image.replace(".", "_m.", 1)
-        copyfile(temp, original)
-        copyfile(temp, medium)
-        server_path = f"http://localhost:1050/files/images/buxton/{pure_name}/{image}"
-        extracted_images.append(server_path)
-    result["extracted_images"] = extracted_images
+    dir_path = image_dist + "/" + pure_name
+    print(dir_path)
+    mkdir_if_absent(dir_path)
+
+    raw = str(docx2txt.process(source + "/" + file_name, dir_path))
+
+    urls = []
+    view_guids = []
+    count = 0
+    for image in os.listdir(dir_path):
+        created = write_image(pure_name, image)
+        if created is not None:
+            urls.append(created["url"])
+            view_guids.append(created["layout_id"])
+            count += 1
+        resolved = dir_path + "/" + image
+        original = dir_path + "/" + image.replace(".", "_o.", 1)
+        medium = dir_path + "/" + image.replace(".", "_m.", 1)
+        copyfile(resolved, original)
+        copyfile(resolved, medium)
+    print(f"extracted {count} images...")
 
     def sanitize(line):
         return re.sub("[\n\t]+", "", line).replace(u"\u00A0", " ").replace(
             u"\u2013", "-").replace(u"\u201c", '''"''').replace(u"\u201d", '''"''').strip()
 
-    def sanitize_price(raw_price: str):
-        raw_price = raw_price.replace(",", "")
-        start = raw_price.find("$")
-        if "x" in raw_price.lower():
+    def sanitize_price(raw: str):
+        raw = raw.replace(",", "")
+        if "x" in raw.lower():
             return None
+        start = raw.find("$")
         if start > -1:
             i = start + 1
-            while i < len(raw_price) and re.match(r"[0-9.]", raw_price[i]):
+            while i < len(raw) and re.match(r"[0-9.]", raw[i]):
                 i += 1
-            price = raw_price[start + 1: i + 1]
+            price = raw[start + 1: i + 1]
             return float(price)
-        elif raw_price.lower().find("nfs"):
+        # str.find() returns -1 (truthy) when absent, so test membership instead
+        elif "nfs" in raw.lower():
             return -1
         else:
-            return None
+            return math.nan
 
     def remove_empty(line):
         return len(line) > 1
@@ -112,6 +286,7 @@ def parse_document(name: str):
     lines = list(map(sanitize, raw.split("\n")))
     lines = list(filter(remove_empty, lines))
 
+    result["file_name"] = file_name
     result["title"] = lines[2].strip()
     result["short_description"] = lines[3].strip().replace(
         "Short Description: ", "")
@@ -127,13 +302,11 @@ def parse_document(name: str):
     clean = list(
         map(lambda data: data.strip().split(":"), lines[cur].split("|")))
     result["company"] = clean[0][len(clean[0]) - 1].strip()
-
     result["year"] = try_parse(clean[1][len(clean[1]) - 1].strip())
     result["original_price"] = sanitize_price(
         clean[2][len(clean[2]) - 1].strip())
 
     cur += 1
-
    result["degrees_of_freedom"] = try_parse(extract_value(
         lines[cur]).replace("NA", "N/A"))
     cur += 1
@@ -161,71 +334,99 @@ def parse_document(name: str):
     cur += 1
     link_descriptions = []
     while lines[cur] != "Image":
-        description = lines[cur].strip().lower()
-        valid = True
-        for ignored in ["powerpoint", "vimeo", "xxx"]:
-            if ignored in description:
-                valid = False
-                break
-        if valid:
-            link_descriptions.append(description)
+        link_descriptions.append(lines[cur].strip())
         cur += 1
-    result["link_descriptions"] = link_descriptions
+    result["link_descriptions"] = text_doc_map(link_descriptions)
 
-    result["hyperlinks"] = extract_links(source_path + "/" + name)
+    result["hyperlinks"] = extract_links(source + "/" + file_name)
 
     images = []
     captions = []
     cur += 3
     while cur + 1 < len(lines) and lines[cur] != "NOTES:":
-        name = lines[cur]
-        if "full document" not in name.lower():
-            images.append(name)
-            captions.append(lines[cur + 1])
+        images.append(lines[cur])
+        captions.append(lines[cur + 1])
         cur += 2
-    result["table_image_names"] = images
+    result["images"] = listify(images)
 
-    result["captions"] = captions
+    result["captions"] = text_doc_map(captions)
 
     notes = []
-    if cur < len(lines) and lines[cur] == "NOTES:":
+    if (cur < len(lines) and lines[cur] == "NOTES:"):
         cur += 1
         while cur < len(lines):
            notes.append(lines[cur])
            cur += 1
     if len(notes) > 0:
-        result["notes"] = notes
-
-    return result
-
-
-if os.path.exists(server_images_path):
-    shutil.rmtree(server_images_path)
-while os.path.exists(server_images_path):
-    pass
-os.mkdir(server_images_path)
-
-mkdir_if_absent(source_path)
-mkdir_if_absent(json_path)
-mkdir_if_absent(temp_images_path)
-
-results = []
-
-candidates = 0
-for file_name in os.listdir(source_path):
-    if file_name.endswith('.docx') or file_name.endswith(".doc"):
-        candidates += 1
-        results.append(parse_document(file_name))
-
-
-with open(f"./json/buxton_collection.json", "w", encoding="utf-8") as out:
-    json.dump(results, out, ensure_ascii=False, indent=4)
-
-print(f"\nSuccessfully parsed {candidates} candidates.")
-
-print("\nrewriting .gitignore...")
-entries = ['*', '!.gitignore']
-with open(files_path + "/.gitignore", 'w') as f:
-    f.write('\n'.join(entries))
-
-shutil.rmtree(temp_images_path)
+        result["notes"] = listify(notes)
+
+    print("writing child schema...")
+
+    return {
+        "schema": {
+            "_id": guid(),
+            "fields": result,
+            "__type": "Doc"
+        },
+        "child_guids": view_guids,
+        "image_urls": urls
+    }
+
+
+def proxify_guids(guids):
+    return list(map(lambda guid: {"fieldId": guid, "__type": "prefetch_proxy"}, guids))
+
+
+def write_common_proto():
+    id = guid()
+    common_proto = {
+        "_id": id,
+        "fields": {
+            "proto": protofy("collectionProto"),
+            "title": "The Buxton Collection",
+        },
+        "__type": "Doc"
+    }
+    target_collection.insert_one(common_proto)
+    return id
+
+
+if os.path.exists(image_dist):
+    shutil.rmtree(image_dist)
+while os.path.exists(image_dist):
+    pass
+os.mkdir(image_dist)
+mkdir_if_absent(source)
+
+common_proto_id = write_common_proto()
+
+candidates = 0
+for file_name in os.listdir(source):
+    if file_name.endswith('.docx') or file_name.endswith('.doc'):
+        candidates += 1
+        schema_guids.append(write_collection(
+            parse_document(file_name), ["title", "data"], "data", 5))
+
+print("writing parent schema...")
+parent_guid = write_collection({
+    "schema": {
+        "_id": guid(),
+        "fields": {},
+        "__type": "Doc"
+    },
+    "child_guids": schema_guids
+}, ["title", "short_description", "original_price"], "data", 2)
+
+print("appending parent schema to main workspace...\n")
+target_collection.update_one(
+    {"fields.title": target_doc_title},
+    {"$push": {"fields.data.fields": {"fieldId": parent_guid, "__type": "proxy"}}}
+)
+
+print("rewriting .gitignore...\n")
+lines = ['*', '!.gitignore']
+with open(filesPath + "/.gitignore", 'w') as f:
+    f.write('\n'.join(lines))
+
+suffix = "" if candidates == 1 else "s"
+print(f"conversion complete. {candidates} candidate{suffix} processed.")
-- cgit v1.2.3-70-g09d2
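
A quick way to sanity-check a run of the new scraper is to query MongoDB for the documents it claims to have written. The sketch below is not part of the patch; it assumes the same localhost MongoDB instance, "Dash" database, and newDocuments collection that scraper.py targets, plus the "The Buxton Collection" title inserted by write_common_proto().

# Hypothetical post-run check (not part of this commit); assumes the same
# connection constants as scraper.py above.
from pymongo import MongoClient

db = MongoClient("localhost", 27017)["Dash"]
docs = db.newDocuments

# write_common_proto() inserts a single prototype document; every data
# document created by write_collection() references it through a proxy field.
proto = docs.find_one({"fields.title": "The Buxton Collection"})
if proto is None:
    raise SystemExit("common proto missing -- did scraper.py run?")

children = docs.count_documents({"fields.proto.fieldId": proto["_id"]})
print(f"common proto {proto['_id']}: referenced by {children} documents")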