author    Sam Wilkins <samwilkins333@gmail.com>  2020-02-02 12:46:57 -0500
committer Sam Wilkins <samwilkins333@gmail.com>  2020-02-02 12:46:57 -0500
commit    90d6454c05cdeb109da25dd55d428c140defca49 (patch)
tree      6b3450af3735be612fed4c0a0b40c3da2cce850f
parent    a4620be72d758223e41e60974e0bfb8fd764c274 (diff)
fixed scraper
-rw-r--r--  src/scraping/buxton/.idea/workspace.xml                  46
-rw-r--r--  src/scraping/buxton/narratives.py                        38
-rw-r--r--  src/scraping/buxton/narratives/Theme - Chord Kbds.docx   bin 0 -> 5701815 bytes
-rw-r--r--  src/scraping/buxton/narratives/chord_keyboards.json      39
-rw-r--r--  src/scraping/buxton/scraper.py                           399
5 files changed, 411 insertions, 111 deletions
diff --git a/src/scraping/buxton/.idea/workspace.xml b/src/scraping/buxton/.idea/workspace.xml
index b2c7d4b8c..6f1ae3814 100644
--- a/src/scraping/buxton/.idea/workspace.xml
+++ b/src/scraping/buxton/.idea/workspace.xml
@@ -126,7 +126,7 @@
<property name="last_opened_file_path" value="$PROJECT_DIR$" />
<property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
</component>
- <component name="RunManager" selected="Python.jsonifier">
+ <component name="RunManager" selected="Python.narratives">
<configuration name="jsonifier" type="PythonConfigurationType" factoryName="Python" nameIsGenerated="true">
<module name="buxton" />
<option name="INTERPRETER_OPTIONS" value="" />
@@ -148,6 +148,27 @@
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
+ <configuration name="narratives" type="PythonConfigurationType" factoryName="Python" nameIsGenerated="true">
+ <module name="buxton" />
+ <option name="INTERPRETER_OPTIONS" value="" />
+ <option name="PARENT_ENVS" value="true" />
+ <envs>
+ <env name="PYTHONUNBUFFERED" value="1" />
+ </envs>
+ <option name="SDK_HOME" value="/usr/local/bin/python3.7" />
+ <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
+ <option name="IS_MODULE_SDK" value="false" />
+ <option name="ADD_CONTENT_ROOTS" value="true" />
+ <option name="ADD_SOURCE_ROOTS" value="true" />
+ <option name="SCRIPT_NAME" value="$PROJECT_DIR$/narratives.py" />
+ <option name="PARAMETERS" value="" />
+ <option name="SHOW_COMMAND_LINE" value="false" />
+ <option name="EMULATE_TERMINAL" value="false" />
+ <option name="MODULE_MODE" value="false" />
+ <option name="REDIRECT_INPUT" value="false" />
+ <option name="INPUT_FILE" value="" />
+ <method v="2" />
+ </configuration>
<configuration name="scraper" type="PythonConfigurationType" factoryName="Python">
<module name="buxton" />
<option name="INTERPRETER_OPTIONS" value="" />
@@ -160,7 +181,7 @@
<option name="IS_MODULE_SDK" value="false" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
- <option name="SCRIPT_NAME" value="$PROJECT_DIR$/new_scraper.py" />
+ <option name="SCRIPT_NAME" value="$PROJECT_DIR$/scraper.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
@@ -172,6 +193,7 @@
<list>
<item itemvalue="Python.jsonifier" />
<item itemvalue="Python.scraper" />
+ <item itemvalue="Python.narratives" />
</list>
</component>
<component name="SvnConfiguration">
@@ -188,30 +210,30 @@
<servers />
</component>
<component name="WindowStateProjectService">
- <state x="184" y="103" key="#com.intellij.execution.impl.EditConfigurationsDialog" timestamp="1580610403225">
+ <state x="184" y="103" key="#com.intellij.execution.impl.EditConfigurationsDialog" timestamp="1580656983882">
<screen x="0" y="23" width="1440" height="836" />
</state>
- <state x="184" y="103" key="#com.intellij.execution.impl.EditConfigurationsDialog/0.23.1440.836@0.23.1440.836" timestamp="1580610403225" />
+ <state x="184" y="103" key="#com.intellij.execution.impl.EditConfigurationsDialog/0.23.1440.836@0.23.1440.836" timestamp="1580656983882" />
<state x="483" y="152" key="#xdebugger.evaluate" timestamp="1580601059439">
<screen x="0" y="23" width="1440" height="836" />
</state>
<state x="483" y="152" key="#xdebugger.evaluate/0.23.1440.836@0.23.1440.836" timestamp="1580601059439" />
- <state width="1419" height="216" key="GridCell.Tab.0.bottom" timestamp="1580612505537">
+ <state width="1419" height="216" key="GridCell.Tab.0.bottom" timestamp="1580656997013">
<screen x="0" y="23" width="1440" height="836" />
</state>
- <state width="1419" height="216" key="GridCell.Tab.0.bottom/0.23.1440.836@0.23.1440.836" timestamp="1580612505537" />
- <state width="1419" height="216" key="GridCell.Tab.0.center" timestamp="1580612505536">
+ <state width="1419" height="216" key="GridCell.Tab.0.bottom/0.23.1440.836@0.23.1440.836" timestamp="1580656997013" />
+ <state width="1419" height="216" key="GridCell.Tab.0.center" timestamp="1580656997012">
<screen x="0" y="23" width="1440" height="836" />
</state>
- <state width="1419" height="216" key="GridCell.Tab.0.center/0.23.1440.836@0.23.1440.836" timestamp="1580612505536" />
- <state width="1419" height="216" key="GridCell.Tab.0.left" timestamp="1580612505535">
+ <state width="1419" height="216" key="GridCell.Tab.0.center/0.23.1440.836@0.23.1440.836" timestamp="1580656997012" />
+ <state width="1419" height="216" key="GridCell.Tab.0.left" timestamp="1580656997012">
<screen x="0" y="23" width="1440" height="836" />
</state>
- <state width="1419" height="216" key="GridCell.Tab.0.left/0.23.1440.836@0.23.1440.836" timestamp="1580612505535" />
- <state width="1419" height="216" key="GridCell.Tab.0.right" timestamp="1580612505536">
+ <state width="1419" height="216" key="GridCell.Tab.0.left/0.23.1440.836@0.23.1440.836" timestamp="1580656997012" />
+ <state width="1419" height="216" key="GridCell.Tab.0.right" timestamp="1580656997012">
<screen x="0" y="23" width="1440" height="836" />
</state>
- <state width="1419" height="216" key="GridCell.Tab.0.right/0.23.1440.836@0.23.1440.836" timestamp="1580612505536" />
+ <state width="1419" height="216" key="GridCell.Tab.0.right/0.23.1440.836@0.23.1440.836" timestamp="1580656997012" />
<state width="1419" height="268" key="GridCell.Tab.1.bottom" timestamp="1580610405283">
<screen x="0" y="23" width="1440" height="836" />
</state>
diff --git a/src/scraping/buxton/narratives.py b/src/scraping/buxton/narratives.py
new file mode 100644
index 000000000..947d60f91
--- /dev/null
+++ b/src/scraping/buxton/narratives.py
@@ -0,0 +1,38 @@
+from docx import Document
+import tempfile
+from zipfile import ZipFile
+import shutil
+from pathlib import Path
+from os import mkdir
+
+path = "./narratives/Theme - Chord Kbds.docx"
+doc = Document(path)
+
+# IMAGE_EXT = ('png', 'jpeg', 'jpg')
+#
+# with tempfile.TemporaryDirectory() as working_dir:
+# with ZipFile(path) as working_zip:
+# image_list = [name for name in working_zip.namelist() if any(name.endswith(ext) for ext in IMAGE_EXT)]
+# working_zip.extractall(working_dir, image_list)
+# mkdir("./test")
+# for image in image_list:
+# shutil.copy(Path(working_dir).resolve() / image, "./test")
+
+paragraphs = doc.paragraphs
+for i in range(len(paragraphs)):
+ print(f"{i}: {paragraphs[i].text}")
+
+# for section in doc.sections:
+# print(section.orientation)
+
+# for shape in doc.inline_shapes:
+# print(shape._inline)
+
+# images = doc.tables[0]
+# for row in images.rows:
+# contents = []
+# for cell in row.cells:
+# contents.append(cell.text)
+ # print(contents)
+
+
diff --git a/src/scraping/buxton/narratives/Theme - Chord Kbds.docx b/src/scraping/buxton/narratives/Theme - Chord Kbds.docx
new file mode 100644
index 000000000..439a7d975
--- /dev/null
+++ b/src/scraping/buxton/narratives/Theme - Chord Kbds.docx
Binary files differ
diff --git a/src/scraping/buxton/narratives/chord_keyboards.json b/src/scraping/buxton/narratives/chord_keyboards.json
new file mode 100644
index 000000000..748578769
--- /dev/null
+++ b/src/scraping/buxton/narratives/chord_keyboards.json
@@ -0,0 +1,39 @@
+{
+ "slides": [{
+ "text": "Theme: Chord Keyboards\nFrom music to type\n\nChord keyboards require 2 or more keys to be simultaneously pushed to spawn the intended output. Playing a chord on a piano or pushing both the shift + a letter key on a typewriter to enter an upper case character are examples.",
+ "devices": ["Casio CZ-101"]
+ },
+ {
+ "text": "This is an early mechanical keyboard for taking dictation. Instead of typing alphanumeric characters as on a typewriter, pressing different combinations prints shorthand symbols on the tape, each representing a different phoneme. Speech is easier to keep up with this way, since each phoneme typically represents multiple characters.\n\nThe downside – until AI came to the rescue – was that it then took hours to manually transcribe to shorthand into conventional readable text.",
+ "devices": ["Grandjean Sténotype"]
+ },
+ {
+ "text": "Designed and manufactured in the DDR, the purpose of this keyboard is to emboss dots representing Braille symbols onto paper. The effect is to enable blind users to use their tactile sensitivity to read with their fingers.\n\nEach Braille symbol consists of two columns of 3 embossed dots each. Which 3 dots are embossed in each column is determined by which of the three keys on either side are simultaneously pressed. The key in the middle, operated by either thumb, enters a space.",
+ "devices": ["Braille Writer"]
+ },
+ {
+ "text": "This combination is derived from the work of the inventor of the mouse, Doug Engelbart\n\nWhile these are 2 distinct devices, they are not what they appear to be.\n\nFunctionally, there is a virtual 7-button chord keyboard, employing the 5 buttons on the keyset and the middle and right button of the mouse. And, using the left mouse button, there is also a 1-button mouse\n\nText was entered using a minor variant of 7-bit ASCII. The intent was to enable entering small bits of text without moving back-and-forth between mouse and QWERTY keyboard. It didn’t catch on.",
+ "devices": ["Xerox PARC 5-Button Keyset & 3-Button Mouse"]
+ },
+ {
+ "text": "",
+ "devices": []
+ },
+ {
+ "text": "",
+ "devices": []
+ },
+ {
+ "text": "",
+ "devices": []
+ },
+ {
+ "text": "",
+ "devices": []
+ },
+ {
+ "text": "",
+ "devices": []
+ }
+ ]
+}
\ No newline at end of file
diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py
index 394958823..f7a38112d 100644
--- a/src/scraping/buxton/scraper.py
+++ b/src/scraping/buxton/scraper.py
@@ -1,32 +1,36 @@
import os
+from shutil import copyfile
import docx2txt
from docx import Document
from docx.opc.constants import RELATIONSHIP_TYPE as RT
import re
+from pymongo import MongoClient
import shutil
import uuid
-import json
-import base64
-from shutil import copyfile
+import datetime
from PIL import Image
+import math
+
+source = "./source"
+filesPath = "../../server/public/files"
+image_dist = filesPath + "/images/buxton"
-files_path = "../../server/public/files"
-source_path = "./source"
-temp_images_path = "./extracted_images"
-server_images_path = f"{files_path}/images/buxton"
-json_path = "./json"
+db = MongoClient("localhost", 27017)["Dash"]
+target_collection = db.newDocuments
+target_doc_title = "Collection 1"
+schema_guids = []
+common_proto_id = ""
-# noinspection PyProtectedMember
-def extract_links(file):
+def extract_links(fileName):
links = []
- doc = Document(file)
+ doc = Document(fileName)
rels = doc.part.rels
for rel in rels:
item = rels[rel]
if item.reltype == RT.HYPERLINK and ".aspx" not in item._target:
links.append(item._target)
- return links
+ return text_doc_map(links)
def extract_value(kv_string):
@@ -46,58 +50,228 @@ def guid():
return str(uuid.uuid4())
-def encode_image(folder: str, name: str):
- with open(f"{temp_images_path}/{folder}/{name}", "rb") as image:
- encoded = base64.b64encode(image.read())
- return encoded.decode("utf-8")
-
-
-def parse_document(name: str):
- print(f"parsing {name}...")
- pure_name = name.split(".")[0]
+def listify(list):
+ return {
+ "fields": list,
+ "__type": "list"
+ }
+
+
+def protofy(fieldId):
+ return {
+ "fieldId": fieldId,
+ "__type": "proxy"
+ }
+
+
+def text_doc_map(string_list):
+ def guid_map(caption):
+ return write_text_doc(caption)
+ return listify(proxify_guids(list(map(guid_map, string_list))))
+
+
+def write_collection(parse_results, display_fields, storage_key, viewType):
+ view_guids = parse_results["child_guids"]
+
+ data_doc = parse_results["schema"]
+ fields = data_doc["fields"]
+
+ view_doc_guid = guid()
+
+ view_doc = {
+ "_id": view_doc_guid,
+ "fields": {
+ "proto": protofy(data_doc["_id"]),
+ "x": 10,
+ "y": 10,
+ "_width": 900,
+ "_height": 600,
+ "_panX": 0,
+ "_panY": 0,
+ "zIndex": 2,
+ "libraryBrush": False,
+ "_viewType": viewType,
+ "_LODdisable": True
+ },
+ "__type": "Doc"
+ }
+
+ fields["proto"] = protofy(common_proto_id)
+ fields[storage_key] = listify(proxify_guids(view_guids))
+ fields["schemaColumns"] = listify(display_fields)
+ fields["author"] = "Bill Buxton"
+ fields["creationDate"] = {
+ "date": datetime.datetime.utcnow().microsecond,
+ "__type": "date"
+ }
+ if "image_urls" in parse_results:
+ fields["hero"] = {
+ "url": parse_results["image_urls"][0],
+ "__type": "image"
+ }
+ fields["isPrototype"] = True
+
+ target_collection.insert_one(data_doc)
+ target_collection.insert_one(view_doc)
+
+ data_doc_guid = data_doc["_id"]
+ print(f"inserted view document ({view_doc_guid})")
+ print(f"inserted data document ({data_doc_guid})\n")
+
+ return view_doc_guid
+
+
+def write_text_doc(content):
+ data_doc_guid = guid()
+ view_doc_guid = guid()
+
+ view_doc = {
+ "_id": view_doc_guid,
+ "fields": {
+ "proto": protofy(data_doc_guid),
+ "x": 10,
+ "y": 10,
+ "_width": 400,
+ "zIndex": 2
+ },
+ "__type": "Doc"
+ }
+
+ data_doc = {
+ "_id": data_doc_guid,
+ "fields": {
+ "proto": protofy("textProto"),
+ "data": {
+ "Data": '{"doc":{"type":"doc","content":[{"type":"paragraph","content":[{"type":"text","text":"' + content + '"}]}]},"selection":{"type":"text","anchor":1,"head":1}' + '}',
+ "__type": "RichTextField"
+ },
+ "title": content,
+ "_nativeWidth": 200,
+ "author": "Bill Buxton",
+ "creationDate": {
+ "date": datetime.datetime.utcnow().microsecond,
+ "__type": "date"
+ },
+ "isPrototype": True,
+ "_autoHeight": True,
+ "page": -1,
+ "_nativeHeight": 200,
+ "_height": 200,
+ "data_text": content
+ },
+ "__type": "Doc"
+ }
+
+ target_collection.insert_one(view_doc)
+ target_collection.insert_one(data_doc)
+
+ return view_doc_guid
+
+
+def write_image(folder, name):
+ path = f"http://localhost:1050/files/images/buxton/{folder}/{name}"
+
+ data_doc_guid = guid()
+ view_doc_guid = guid()
+
+ image = Image.open(f"{image_dist}/{folder}/{name}")
+ native_width, native_height = image.size
+
+ if abs(native_width - native_height) < 10:
+ return None
+
+ view_doc = {
+ "_id": view_doc_guid,
+ "fields": {
+ "proto": protofy(data_doc_guid),
+ "x": 10,
+ "y": 10,
+ "_width": min(800, native_width),
+ "zIndex": 2,
+ "widthUnit": "*",
+ "widthMagnitude": 1
+ },
+ "__type": "Doc"
+ }
+
+ data_doc = {
+ "_id": data_doc_guid,
+ "fields": {
+ "proto": protofy("imageProto"),
+ "data": {
+ "url": path,
+ "__type": "image"
+ },
+ "title": name,
+ "_nativeWidth": native_width,
+ "author": "Bill Buxton",
+ "creationDate": {
+ "date": datetime.datetime.utcnow().microsecond,
+ "__type": "date"
+ },
+ "isPrototype": True,
+ "page": -1,
+ "_nativeHeight": native_height,
+ "_height": native_height
+ },
+ "__type": "Doc"
+ }
+
+ target_collection.insert_one(view_doc)
+ target_collection.insert_one(data_doc)
+
+ return {
+ "layout_id": view_doc_guid,
+ "url": path
+ }
+
+
+def parse_document(file_name: str):
+ print(f"parsing {file_name}...")
+ pure_name = file_name.split(".")[0]
result = {}
- saved_device_images_dir = server_images_path + "/" + pure_name
- temp_device_images_dir = temp_images_path + "/" + pure_name
- mkdir_if_absent(temp_device_images_dir)
- mkdir_if_absent(saved_device_images_dir)
-
- raw = str(docx2txt.process(source_path +
- "/" + name, temp_device_images_dir))
-
- extracted_images = []
- for image in os.listdir(temp_device_images_dir):
- temp = f"{temp_device_images_dir}/{image}"
- native_width, native_height = Image.open(temp).size
- if abs(native_width - native_height) < 10:
- continue
- original = saved_device_images_dir + "/" + image.replace(".", "_o.", 1)
- medium = saved_device_images_dir + "/" + image.replace(".", "_m.", 1)
- copyfile(temp, original)
- copyfile(temp, medium)
- server_path = f"http://localhost:1050/files/images/buxton/{pure_name}/{image}"
- extracted_images.append(server_path)
- result["extracted_images"] = extracted_images
+ dir_path = image_dist + "/" + pure_name
+ print(dir_path)
+ mkdir_if_absent(dir_path)
+
+ raw = str(docx2txt.process(source + "/" + file_name, dir_path))
+
+ urls = []
+ view_guids = []
+ count = 0
+ for image in os.listdir(dir_path):
+ created = write_image(pure_name, image)
+ if created != None:
+ urls.append(created["url"])
+ view_guids.append(created["layout_id"])
+ count += 1
+ resolved = dir_path + "/" + image
+ original = dir_path + "/" + image.replace(".", "_o.", 1)
+ medium = dir_path + "/" + image.replace(".", "_m.", 1)
+ copyfile(resolved, original)
+ copyfile(resolved, medium)
+ print(f"extracted {count} images...")
def sanitize(line): return re.sub("[\n\t]+", "", line).replace(u"\u00A0", " ").replace(
u"\u2013", "-").replace(u"\u201c", '''"''').replace(u"\u201d", '''"''').strip()
- def sanitize_price(raw_price: str):
- raw_price = raw_price.replace(",", "")
- start = raw_price.find("$")
- if "x" in raw_price.lower():
+ def sanitize_price(raw: str):
+ raw = raw.replace(",", "")
+ if "x" in raw.lower():
return None
+ start = raw.find("$")
if start > -1:
i = start + 1
- while i < len(raw_price) and re.match(r"[0-9.]", raw_price[i]):
+ while (i < len(raw) and re.match(r"[0-9\.]", raw[i])):
i += 1
- price = raw_price[start + 1: i + 1]
+ price = raw[start + 1: i + 1]
return float(price)
- elif raw_price.lower().find("nfs"):
+ elif (raw.lower().find("nfs")):
return -1
else:
- return None
+ return math.nan
def remove_empty(line): return len(line) > 1
@@ -112,6 +286,7 @@ def parse_document(name: str):
lines = list(map(sanitize, raw.split("\n")))
lines = list(filter(remove_empty, lines))
+ result["file_name"] = file_name
result["title"] = lines[2].strip()
result["short_description"] = lines[3].strip().replace(
"Short Description: ", "")
@@ -127,13 +302,11 @@ def parse_document(name: str):
clean = list(
map(lambda data: data.strip().split(":"), lines[cur].split("|")))
result["company"] = clean[0][len(clean[0]) - 1].strip()
-
result["year"] = try_parse(clean[1][len(clean[1]) - 1].strip())
result["original_price"] = sanitize_price(
clean[2][len(clean[2]) - 1].strip())
cur += 1
-
result["degrees_of_freedom"] = try_parse(extract_value(
lines[cur]).replace("NA", "N/A"))
cur += 1
@@ -161,71 +334,99 @@ def parse_document(name: str):
cur += 1
link_descriptions = []
while lines[cur] != "Image":
- description = lines[cur].strip().lower()
- valid = True
- for ignored in ["powerpoint", "vimeo", "xxx"]:
- if ignored in description:
- valid = False
- break
- if valid:
- link_descriptions.append(description)
+ link_descriptions.append(lines[cur].strip())
cur += 1
- result["link_descriptions"] = link_descriptions
+ result["link_descriptions"] = text_doc_map(link_descriptions)
- result["hyperlinks"] = extract_links(source_path + "/" + name)
+ result["hyperlinks"] = extract_links(source + "/" + file_name)
images = []
captions = []
cur += 3
while cur + 1 < len(lines) and lines[cur] != "NOTES:":
- name = lines[cur]
- if "full document" not in name.lower():
- images.append(name)
- captions.append(lines[cur + 1])
+ images.append(lines[cur])
+ captions.append(lines[cur + 1])
cur += 2
- result["table_image_names"] = images
+ result["images"] = listify(images)
- result["captions"] = captions
+ result["captions"] = text_doc_map(captions)
notes = []
- if cur < len(lines) and lines[cur] == "NOTES:":
+ if (cur < len(lines) and lines[cur] == "NOTES:"):
cur += 1
while cur < len(lines):
notes.append(lines[cur])
cur += 1
if len(notes) > 0:
- result["notes"] = notes
-
- return result
-
-
-if os.path.exists(server_images_path):
- shutil.rmtree(server_images_path)
-while os.path.exists(server_images_path):
+ result["notes"] = listify(notes)
+
+ print("writing child schema...")
+
+ return {
+ "schema": {
+ "_id": guid(),
+ "fields": result,
+ "__type": "Doc"
+ },
+ "child_guids": view_guids,
+ "image_urls": urls
+ }
+
+
+def proxify_guids(guids):
+ return list(map(lambda guid: {"fieldId": guid, "__type": "prefetch_proxy"}, guids))
+
+
+def write_common_proto():
+ id = guid()
+ common_proto = {
+ "_id": id,
+ "fields": {
+ "proto": protofy("collectionProto"),
+ "title": "The Buxton Collection",
+ },
+ "__type": "Doc"
+ }
+ target_collection.insert_one(common_proto)
+ return id
+
+
+if os.path.exists(image_dist):
+ shutil.rmtree(image_dist)
+while os.path.exists(image_dist):
pass
-os.mkdir(server_images_path)
+os.mkdir(image_dist)
+mkdir_if_absent(source)
-mkdir_if_absent(source_path)
-mkdir_if_absent(json_path)
-mkdir_if_absent(temp_images_path)
-
-results = []
+common_proto_id = write_common_proto()
candidates = 0
-for file_name in os.listdir(source_path):
- if file_name.endswith('.docx') or file_name.endswith(".doc"):
+for file_name in os.listdir(source):
+ if file_name.endswith('.docx') or file_name.endswith('.doc'):
candidates += 1
- results.append(parse_document(file_name))
-
-
-with open(f"./json/buxton_collection.json", "w", encoding="utf-8") as out:
- json.dump(results, out, ensure_ascii=False, indent=4)
-
-print(f"\nSuccessfully parsed {candidates} candidates.")
-
-print("\nrewriting .gitignore...")
-entries = ['*', '!.gitignore']
-with open(files_path + "/.gitignore", 'w') as f:
- f.write('\n'.join(entries))
-
-shutil.rmtree(temp_images_path)
+ schema_guids.append(write_collection(
+ parse_document(file_name), ["title", "data"], "data", 5))
+
+print("writing parent schema...")
+parent_guid = write_collection({
+ "schema": {
+ "_id": guid(),
+ "fields": {},
+ "__type": "Doc"
+ },
+ "child_guids": schema_guids
+}, ["title", "short_description", "original_price"], "data", 2)
+
+print("appending parent schema to main workspace...\n")
+target_collection.update_one(
+ {"fields.title": target_doc_title},
+ {"$push": {"fields.data.fields": {"fieldId": parent_guid, "__type": "proxy"}}}
+)
+
+print("rewriting .gitignore...\n")
+lines = ['*', '!.gitignore']
+with open(filesPath + "/.gitignore", 'w') as f:
+ f.write('\n'.join(lines))
+
+suffix = "" if candidates == 1 else "s"
+print(f"conversion complete. {candidates} candidate{suffix} processed.")