diff options
30 files changed, 360 insertions, 303 deletions
diff --git a/.gitignore b/.gitignore index 8376c385e..b88fed833 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ ClientUtils.ts solr-8.3.1/server/logs/ solr-8.3.1/server/solr/dash/data/tlog/* solr-8.3.1/server/solr/dash/data/index/* +src/scraping/buxton/source/ src/server/public/files/ src/scraping/acm/package-lock.json src/server/session_manager/logs/**/*.log diff --git a/src/scraping/buxton/.idea/workspace.xml b/src/scraping/buxton/.idea/workspace.xml new file mode 100644 index 000000000..b2c7d4b8c --- /dev/null +++ b/src/scraping/buxton/.idea/workspace.xml @@ -0,0 +1,248 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project version="4"> + <component name="ChangeListManager"> + <list default="true" id="693c6819-edcc-46d6-8260-3f51ec080a46" name="Default Changelist" comment=""> + <change afterPath="$PROJECT_DIR$/new_scraper.py" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/../../client/views/collections/CollectionSubView.tsx" beforeDir="false" afterPath="$PROJECT_DIR$/../../client/views/collections/CollectionSubView.tsx" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Apple_ADB_Mouse/image1.png" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Apple_ADB_Mouse/image10.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Apple_ADB_Mouse/image11.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Apple_ADB_Mouse/image12.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Apple_ADB_Mouse/image13.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Apple_ADB_Mouse/image14.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Apple_ADB_Mouse/image15.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Apple_ADB_Mouse/image16.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Apple_ADB_Mouse/image2.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Apple_ADB_Mouse/image3.png" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Apple_ADB_Mouse/image4.png" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Apple_ADB_Mouse/image5.png" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Apple_ADB_Mouse/image6.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Apple_ADB_Mouse/image7.png" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Apple_ADB_Mouse/image8.png" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Apple_ADB_Mouse/image9.png" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Casio_CZ-101/image1.png" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Casio_CZ-101/image10.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Casio_CZ-101/image11.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Casio_CZ-101/image2.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Casio_CZ-101/image3.png" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Casio_CZ-101/image4.png" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Casio_CZ-101/image5.png" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Casio_CZ-101/image6.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Casio_CZ-101/image7.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Casio_CZ-101/image8.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Casio_CZ-101/image9.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Contour_UniTrap/image1.png" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Contour_UniTrap/image10.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Contour_UniTrap/image11.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Contour_UniTrap/image12.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Contour_UniTrap/image13.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Contour_UniTrap/image14.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Contour_UniTrap/image15.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Contour_UniTrap/image16.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Contour_UniTrap/image17.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Contour_UniTrap/image18.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Contour_UniTrap/image2.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Contour_UniTrap/image3.png" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Contour_UniTrap/image4.png" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Contour_UniTrap/image5.png" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Contour_UniTrap/image6.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Contour_UniTrap/image7.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Contour_UniTrap/image8.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Contour_UniTrap/image9.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Dymo_MK-6/image1.png" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Dymo_MK-6/image10.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Dymo_MK-6/image11.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Dymo_MK-6/image12.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Dymo_MK-6/image13.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Dymo_MK-6/image14.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Dymo_MK-6/image15.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Dymo_MK-6/image16.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Dymo_MK-6/image17.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Dymo_MK-6/image18.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Dymo_MK-6/image19.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Dymo_MK-6/image2.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Dymo_MK-6/image20.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Dymo_MK-6/image21.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Dymo_MK-6/image3.png" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Dymo_MK-6/image4.png" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Dymo_MK-6/image5.png" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Dymo_MK-6/image6.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Dymo_MK-6/image7.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Dymo_MK-6/image8.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Dymo_MK-6/image9.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Helios-Klimax/image1.png" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Helios-Klimax/image10.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Helios-Klimax/image11.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Helios-Klimax/image12.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Helios-Klimax/image13.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Helios-Klimax/image14.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Helios-Klimax/image15.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Helios-Klimax/image16.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Helios-Klimax/image2.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Helios-Klimax/image3.png" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Helios-Klimax/image4.png" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Helios-Klimax/image5.png" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Helios-Klimax/image6.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Helios-Klimax/image7.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Helios-Klimax/image8.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_Helios-Klimax/image9.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_IBM_PS2_Mouse/image1.png" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_IBM_PS2_Mouse/image10.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_IBM_PS2_Mouse/image11.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_IBM_PS2_Mouse/image12.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_IBM_PS2_Mouse/image13.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_IBM_PS2_Mouse/image14.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_IBM_PS2_Mouse/image15.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_IBM_PS2_Mouse/image2.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_IBM_PS2_Mouse/image3.png" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_IBM_PS2_Mouse/image4.png" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_IBM_PS2_Mouse/image5.png" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_IBM_PS2_Mouse/image6.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_IBM_PS2_Mouse/image7.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_IBM_PS2_Mouse/image8.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/extracted_images/Bill_Notes_IBM_PS2_Mouse/image9.jpeg" beforeDir="false" /> + <change beforePath="$PROJECT_DIR$/jsonifier.py" beforeDir="false" afterPath="$PROJECT_DIR$/jsonifier.py" afterDir="false" /> + <change beforePath="$PROJECT_DIR$/scraper.py" beforeDir="false" afterPath="$PROJECT_DIR$/scraper.py" afterDir="false" /> + </list> + <option name="SHOW_DIALOG" value="false" /> + <option name="HIGHLIGHT_CONFLICTS" value="true" /> + <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" /> + <option name="LAST_RESOLUTION" value="IGNORE" /> + </component> + <component name="Git.Settings"> + <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$/../../.." /> + </component> + <component name="ProjectId" id="1XDYVVOvUV6lmODouwAWUpvxnni" /> + <component name="ProjectLevelVcsManager" settingsEditedManually="true" /> + <component name="ProjectViewState"> + <option name="hideEmptyMiddlePackages" value="true" /> + <option name="showExcludedFiles" value="true" /> + <option name="showLibraryContents" value="true" /> + </component> + <component name="PropertiesComponent"> + <property name="ASKED_SHARE_PROJECT_CONFIGURATION_FILES" value="true" /> + <property name="RunOnceActivity.ShowReadmeOnStart" value="true" /> + <property name="SHARE_PROJECT_CONFIGURATION_FILES" value="true" /> + <property name="last_opened_file_path" value="$PROJECT_DIR$" /> + <property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" /> + </component> + <component name="RunManager" selected="Python.jsonifier"> + <configuration name="jsonifier" type="PythonConfigurationType" factoryName="Python" nameIsGenerated="true"> + <module name="buxton" /> + <option name="INTERPRETER_OPTIONS" value="" /> + <option name="PARENT_ENVS" value="true" /> + <envs> + <env name="PYTHONUNBUFFERED" value="1" /> + </envs> + <option name="SDK_HOME" value="/usr/local/bin/python3.7" /> + <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" /> + <option name="IS_MODULE_SDK" value="false" /> + <option name="ADD_CONTENT_ROOTS" value="true" /> + <option name="ADD_SOURCE_ROOTS" value="true" /> + <option name="SCRIPT_NAME" value="$PROJECT_DIR$/jsonifier.py" /> + <option name="PARAMETERS" value="" /> + <option name="SHOW_COMMAND_LINE" value="false" /> + <option name="EMULATE_TERMINAL" value="false" /> + <option name="MODULE_MODE" value="false" /> + <option name="REDIRECT_INPUT" value="false" /> + <option name="INPUT_FILE" value="" /> + <method v="2" /> + </configuration> + <configuration name="scraper" type="PythonConfigurationType" factoryName="Python"> + <module name="buxton" /> + <option name="INTERPRETER_OPTIONS" value="" /> + <option name="PARENT_ENVS" value="true" /> + <envs> + <env name="PYTHONUNBUFFERED" value="1" /> + </envs> + <option name="SDK_HOME" value="/usr/local/bin/python3.7" /> + <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" /> + <option name="IS_MODULE_SDK" value="false" /> + <option name="ADD_CONTENT_ROOTS" value="true" /> + <option name="ADD_SOURCE_ROOTS" value="true" /> + <option name="SCRIPT_NAME" value="$PROJECT_DIR$/new_scraper.py" /> + <option name="PARAMETERS" value="" /> + <option name="SHOW_COMMAND_LINE" value="false" /> + <option name="EMULATE_TERMINAL" value="false" /> + <option name="MODULE_MODE" value="false" /> + <option name="REDIRECT_INPUT" value="false" /> + <option name="INPUT_FILE" value="" /> + <method v="2" /> + </configuration> + <list> + <item itemvalue="Python.jsonifier" /> + <item itemvalue="Python.scraper" /> + </list> + </component> + <component name="SvnConfiguration"> + <configuration /> + </component> + <component name="TaskManager"> + <task active="true" id="Default" summary="Default task"> + <changelist id="693c6819-edcc-46d6-8260-3f51ec080a46" name="Default Changelist" comment="" /> + <created>1580582155646</created> + <option name="number" value="Default" /> + <option name="presentableId" value="Default" /> + <updated>1580582155646</updated> + </task> + <servers /> + </component> + <component name="WindowStateProjectService"> + <state x="184" y="103" key="#com.intellij.execution.impl.EditConfigurationsDialog" timestamp="1580610403225"> + <screen x="0" y="23" width="1440" height="836" /> + </state> + <state x="184" y="103" key="#com.intellij.execution.impl.EditConfigurationsDialog/0.23.1440.836@0.23.1440.836" timestamp="1580610403225" /> + <state x="483" y="152" key="#xdebugger.evaluate" timestamp="1580601059439"> + <screen x="0" y="23" width="1440" height="836" /> + </state> + <state x="483" y="152" key="#xdebugger.evaluate/0.23.1440.836@0.23.1440.836" timestamp="1580601059439" /> + <state width="1419" height="216" key="GridCell.Tab.0.bottom" timestamp="1580612505537"> + <screen x="0" y="23" width="1440" height="836" /> + </state> + <state width="1419" height="216" key="GridCell.Tab.0.bottom/0.23.1440.836@0.23.1440.836" timestamp="1580612505537" /> + <state width="1419" height="216" key="GridCell.Tab.0.center" timestamp="1580612505536"> + <screen x="0" y="23" width="1440" height="836" /> + </state> + <state width="1419" height="216" key="GridCell.Tab.0.center/0.23.1440.836@0.23.1440.836" timestamp="1580612505536" /> + <state width="1419" height="216" key="GridCell.Tab.0.left" timestamp="1580612505535"> + <screen x="0" y="23" width="1440" height="836" /> + </state> + <state width="1419" height="216" key="GridCell.Tab.0.left/0.23.1440.836@0.23.1440.836" timestamp="1580612505535" /> + <state width="1419" height="216" key="GridCell.Tab.0.right" timestamp="1580612505536"> + <screen x="0" y="23" width="1440" height="836" /> + </state> + <state width="1419" height="216" key="GridCell.Tab.0.right/0.23.1440.836@0.23.1440.836" timestamp="1580612505536" /> + <state width="1419" height="268" key="GridCell.Tab.1.bottom" timestamp="1580610405283"> + <screen x="0" y="23" width="1440" height="836" /> + </state> + <state width="1419" height="268" key="GridCell.Tab.1.bottom/0.23.1440.836@0.23.1440.836" timestamp="1580610405283" /> + <state width="1419" height="268" key="GridCell.Tab.1.center" timestamp="1580610405282"> + <screen x="0" y="23" width="1440" height="836" /> + </state> + <state width="1419" height="268" key="GridCell.Tab.1.center/0.23.1440.836@0.23.1440.836" timestamp="1580610405282" /> + <state width="1419" height="268" key="GridCell.Tab.1.left" timestamp="1580610405282"> + <screen x="0" y="23" width="1440" height="836" /> + </state> + <state width="1419" height="268" key="GridCell.Tab.1.left/0.23.1440.836@0.23.1440.836" timestamp="1580610405282" /> + <state width="1419" height="268" key="GridCell.Tab.1.right" timestamp="1580610405282"> + <screen x="0" y="23" width="1440" height="836" /> + </state> + <state width="1419" height="268" key="GridCell.Tab.1.right/0.23.1440.836@0.23.1440.836" timestamp="1580610405282" /> + <state x="229" y="80" key="SettingsEditor" timestamp="1580610123068"> + <screen x="0" y="23" width="1440" height="836" /> + </state> + <state x="229" y="80" key="SettingsEditor/0.23.1440.836@0.23.1440.836" timestamp="1580610123068" /> + <state width="720" height="417" key="XDebugger.FullValuePopup" timestamp="1580584300118"> + <screen x="0" y="23" width="1440" height="836" /> + </state> + <state width="720" height="417" key="XDebugger.FullValuePopup/0.23.1440.836@0.23.1440.836" timestamp="1580584300118" /> + <state x="515" y="128" key="com.intellij.openapi.editor.actions.MultiplePasteAction$ClipboardContentChooser" timestamp="1580582281665"> + <screen x="0" y="23" width="1440" height="836" /> + </state> + <state x="515" y="128" key="com.intellij.openapi.editor.actions.MultiplePasteAction$ClipboardContentChooser/0.23.1440.836@0.23.1440.836" timestamp="1580582281665" /> + <state x="385" y="183" width="670" height="676" key="search.everywhere.popup" timestamp="1580585906043"> + <screen x="0" y="23" width="1440" height="836" /> + </state> + <state x="385" y="183" width="670" height="676" key="search.everywhere.popup/0.23.1440.836@0.23.1440.836" timestamp="1580585906043" /> + </component> +</project>
\ No newline at end of file diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py index ec9c3f72c..394958823 100644 --- a/src/scraping/buxton/scraper.py +++ b/src/scraping/buxton/scraper.py @@ -1,37 +1,32 @@ import os -from shutil import copyfile import docx2txt from docx import Document from docx.opc.constants import RELATIONSHIP_TYPE as RT import re -from pymongo import MongoClient import shutil import uuid -import datetime +import json +import base64 +from shutil import copyfile from PIL import Image -import math -import sys - -source = "./source" -filesPath = "../../server/public/files" -image_dist = filesPath + "/images/buxton" -db = MongoClient("localhost", 27017)["Dash"] -target_collection = db.newDocuments -target_doc_title = "Collection 1" -schema_guids = [] -common_proto_id = "" +files_path = "../../server/public/files" +source_path = "./source" +temp_images_path = "./extracted_images" +server_images_path = f"{files_path}/images/buxton" +json_path = "./json" -def extract_links(fileName): +# noinspection PyProtectedMember +def extract_links(file): links = [] - doc = Document(fileName) + doc = Document(file) rels = doc.part.rels for rel in rels: item = rels[rel] if item.reltype == RT.HYPERLINK and ".aspx" not in item._target: links.append(item._target) - return text_doc_map(links) + return links def extract_value(kv_string): @@ -51,233 +46,72 @@ def guid(): return str(uuid.uuid4()) -def listify(list): - return { - "fields": list, - "__type": "list" - } - - -def protofy(fieldId): - return { - "fieldId": fieldId, - "__type": "proxy" - } - - -def text_doc_map(string_list): - def guid_map(caption): - return write_text_doc(caption) - return listify(proxify_guids(list(map(guid_map, string_list)))) - - -def write_collection(parse_results, display_fields, storage_key, viewType): - view_guids = parse_results["child_guids"] - - data_doc = parse_results["schema"] - fields = data_doc["fields"] - - view_doc_guid = guid() - - view_doc = { - "_id": view_doc_guid, - "fields": { - "proto": protofy(data_doc["_id"]), - "x": 10, - "y": 10, - "_width": 900, - "_height": 600, - "_panX": 0, - "_panY": 0, - "zIndex": 2, - "libraryBrush": False, - "_viewType": viewType, - "_LODdisable": True - }, - "__type": "Doc" - } - - fields["proto"] = protofy(common_proto_id) - fields[storage_key] = listify(proxify_guids(view_guids)) - fields["schemaColumns"] = listify(display_fields) - fields["author"] = "Bill Buxton" - fields["creationDate"] = { - "date": datetime.datetime.utcnow().microsecond, - "__type": "date" - } - if "image_urls" in parse_results: - fields["hero"] = { - "url": parse_results["image_urls"][0], - "__type": "image" - } - fields["isPrototype"] = True - - target_collection.insert_one(data_doc) - target_collection.insert_one(view_doc) - - data_doc_guid = data_doc["_id"] - print(f"inserted view document ({view_doc_guid})") - print(f"inserted data document ({data_doc_guid})\n") - - return view_doc_guid - - -def write_text_doc(content): - data_doc_guid = guid() - view_doc_guid = guid() - - view_doc = { - "_id": view_doc_guid, - "fields": { - "proto": protofy(data_doc_guid), - "x": 10, - "y": 10, - "_width": 400, - "zIndex": 2 - }, - "__type": "Doc" - } - - data_doc = { - "_id": data_doc_guid, - "fields": { - "proto": protofy("textProto"), - "data": { - "Data": '{"doc":{"type":"doc","content":[{"type":"paragraph","content":[{"type":"text","text":"' + content + '"}]}]},"selection":{"type":"text","anchor":1,"head":1}' + '}', - "__type": "RichTextField" - }, - "title": content, - "_nativeWidth": 200, - "author": "Bill Buxton", - "creationDate": { - "date": datetime.datetime.utcnow().microsecond, - "__type": "date" - }, - "isPrototype": True, - "_autoHeight": True, - "page": -1, - "_nativeHeight": 200, - "_height": 200, - "data_text": content - }, - "__type": "Doc" - } - - target_collection.insert_one(view_doc) - target_collection.insert_one(data_doc) - - return view_doc_guid - - -def write_image(folder, name): - path = f"http://localhost:1050/files/images/buxton/{folder}/{name}" - - data_doc_guid = guid() - view_doc_guid = guid() - - image = Image.open(f"{image_dist}/{folder}/{name}") - native_width, native_height = image.size - - if abs(native_width - native_height) < 10: - return None - - view_doc = { - "_id": view_doc_guid, - "fields": { - "proto": protofy(data_doc_guid), - "x": 10, - "y": 10, - "_width": min(800, native_width), - "zIndex": 2, - "widthUnit": "*", - "widthMagnitude": 1 - }, - "__type": "Doc" - } - - data_doc = { - "_id": data_doc_guid, - "fields": { - "proto": protofy("imageProto"), - "data": { - "url": path, - "__type": "image" - }, - "title": name, - "_nativeWidth": native_width, - "author": "Bill Buxton", - "creationDate": { - "date": datetime.datetime.utcnow().microsecond, - "__type": "date" - }, - "isPrototype": True, - "page": -1, - "_nativeHeight": native_height, - "_height": native_height - }, - "__type": "Doc" - } - - target_collection.insert_one(view_doc) - target_collection.insert_one(data_doc) - - return { - "layout_id": view_doc_guid, - "url": path - } - - -def parse_document(file_name: str): - print(f"parsing {file_name}...") - pure_name = file_name.split(".")[0] +def encode_image(folder: str, name: str): + with open(f"{temp_images_path}/{folder}/{name}", "rb") as image: + encoded = base64.b64encode(image.read()) + return encoded.decode("utf-8") + + +def parse_document(name: str): + print(f"parsing {name}...") + pure_name = name.split(".")[0] result = {} - dir_path = image_dist + "/" + pure_name - print(dir_path) - mkdir_if_absent(dir_path) - - raw = str(docx2txt.process(source + "/" + file_name, dir_path)) - - urls = [] - view_guids = [] - count = 0 - for image in os.listdir(dir_path): - created = write_image(pure_name, image) - if created != None: - urls.append(created["url"]) - view_guids.append(created["layout_id"]) - count += 1 - resolved = dir_path + "/" + image - original = dir_path + "/" + image.replace(".", "_o.", 1) - medium = dir_path + "/" + image.replace(".", "_m.", 1) - copyfile(resolved, original) - copyfile(resolved, medium) - print(f"extracted {count} images...") + saved_device_images_dir = server_images_path + "/" + pure_name + temp_device_images_dir = temp_images_path + "/" + pure_name + mkdir_if_absent(temp_device_images_dir) + mkdir_if_absent(saved_device_images_dir) + + raw = str(docx2txt.process(source_path + + "/" + name, temp_device_images_dir)) + + extracted_images = [] + for image in os.listdir(temp_device_images_dir): + temp = f"{temp_device_images_dir}/{image}" + native_width, native_height = Image.open(temp).size + if abs(native_width - native_height) < 10: + continue + original = saved_device_images_dir + "/" + image.replace(".", "_o.", 1) + medium = saved_device_images_dir + "/" + image.replace(".", "_m.", 1) + copyfile(temp, original) + copyfile(temp, medium) + server_path = f"http://localhost:1050/files/images/buxton/{pure_name}/{image}" + extracted_images.append(server_path) + result["extracted_images"] = extracted_images def sanitize(line): return re.sub("[\n\t]+", "", line).replace(u"\u00A0", " ").replace( u"\u2013", "-").replace(u"\u201c", '''"''').replace(u"\u201d", '''"''').strip() - def sanitize_price(raw: str): - raw = raw.replace(",", "") - start = raw.find("$") + def sanitize_price(raw_price: str): + raw_price = raw_price.replace(",", "") + start = raw_price.find("$") + if "x" in raw_price.lower(): + return None if start > -1: i = start + 1 - while (i < len(raw) and re.match(r"[0-9\.]", raw[i])): + while i < len(raw_price) and re.match(r"[0-9.]", raw_price[i]): i += 1 - price = raw[start + 1: i + 1] + price = raw_price[start + 1: i + 1] return float(price) - elif (raw.lower().find("nfs")): + elif raw_price.lower().find("nfs"): return -1 else: - return math.nan + return None def remove_empty(line): return len(line) > 1 + def try_parse(to_parse: int): + value: int + try: + value = int(to_parse) + except ValueError: + value = None + return value + lines = list(map(sanitize, raw.split("\n"))) lines = list(filter(remove_empty, lines)) - result["file_name"] = file_name result["title"] = lines[2].strip() result["short_description"] = lines[3].strip().replace( "Short Description: ", "") @@ -293,13 +127,15 @@ def parse_document(file_name: str): clean = list( map(lambda data: data.strip().split(":"), lines[cur].split("|"))) result["company"] = clean[0][len(clean[0]) - 1].strip() - result["year"] = clean[1][len(clean[1]) - 1].strip() + + result["year"] = try_parse(clean[1][len(clean[1]) - 1].strip()) result["original_price"] = sanitize_price( clean[2][len(clean[2]) - 1].strip()) cur += 1 - result["degrees_of_freedom"] = extract_value( - lines[cur]).replace("NA", "N/A") + + result["degrees_of_freedom"] = try_parse(extract_value( + lines[cur]).replace("NA", "N/A")) cur += 1 dimensions = lines[cur].lower() @@ -325,99 +161,71 @@ def parse_document(file_name: str): cur += 1 link_descriptions = [] while lines[cur] != "Image": - link_descriptions.append(lines[cur].strip()) + description = lines[cur].strip().lower() + valid = True + for ignored in ["powerpoint", "vimeo", "xxx"]: + if ignored in description: + valid = False + break + if valid: + link_descriptions.append(description) cur += 1 - result["link_descriptions"] = text_doc_map(link_descriptions) + result["link_descriptions"] = link_descriptions - result["hyperlinks"] = extract_links(source + "/" + file_name) + result["hyperlinks"] = extract_links(source_path + "/" + name) images = [] captions = [] cur += 3 while cur + 1 < len(lines) and lines[cur] != "NOTES:": - images.append(lines[cur]) - captions.append(lines[cur + 1]) + name = lines[cur] + if "full document" not in name.lower(): + images.append(name) + captions.append(lines[cur + 1]) cur += 2 - result["images"] = listify(images) + result["table_image_names"] = images - result["captions"] = text_doc_map(captions) + result["captions"] = captions notes = [] - if (cur < len(lines) and lines[cur] == "NOTES:"): + if cur < len(lines) and lines[cur] == "NOTES:": cur += 1 while cur < len(lines): notes.append(lines[cur]) cur += 1 if len(notes) > 0: - result["notes"] = listify(notes) - - print("writing child schema...") - - return { - "schema": { - "_id": guid(), - "fields": result, - "__type": "Doc" - }, - "child_guids": view_guids, - "image_urls": urls - } - - -def proxify_guids(guids): - return list(map(lambda guid: {"fieldId": guid, "__type": "prefetch_proxy"}, guids)) - - -def write_common_proto(): - id = guid() - common_proto = { - "_id": id, - "fields": { - "proto": protofy("collectionProto"), - "title": "The Buxton Collection", - }, - "__type": "Doc" - } - target_collection.insert_one(common_proto) - return id - - -if os.path.exists(image_dist): - shutil.rmtree(image_dist) -while os.path.exists(image_dist): + result["notes"] = notes + + return result + + +if os.path.exists(server_images_path): + shutil.rmtree(server_images_path) +while os.path.exists(server_images_path): pass -os.mkdir(image_dist) -mkdir_if_absent(source) +os.mkdir(server_images_path) -common_proto_id = write_common_proto() +mkdir_if_absent(source_path) +mkdir_if_absent(json_path) +mkdir_if_absent(temp_images_path) + +results = [] candidates = 0 -for file_name in os.listdir(source): - if file_name.endswith('.docx'): +for file_name in os.listdir(source_path): + if file_name.endswith('.docx') or file_name.endswith(".doc"): candidates += 1 - schema_guids.append(write_collection( - parse_document(file_name), ["title", "data"], "data", 5)) - -print("writing parent schema...") -parent_guid = write_collection({ - "schema": { - "_id": guid(), - "fields": {}, - "__type": "Doc" - }, - "child_guids": schema_guids -}, ["title", "short_description", "original_price"], "data", 2) - -print("appending parent schema to main workspace...\n") -target_collection.update_one( - {"fields.title": target_doc_title}, - {"$push": {"fields.data.fields": {"fieldId": parent_guid, "__type": "proxy"}}} -) - -print("rewriting .gitignore...\n") -lines = ['*', '!.gitignore'] -with open(filesPath + "/.gitignore", 'w') as f: - f.write('\n'.join(lines)) - -suffix = "" if candidates == 1 else "s" -print(f"conversion complete. {candidates} candidate{suffix} processed.") + results.append(parse_document(file_name)) + + +with open(f"./json/buxton_collection.json", "w", encoding="utf-8") as out: + json.dump(results, out, ensure_ascii=False, indent=4) + +print(f"\nSuccessfully parsed {candidates} candidates.") + +print("\nrewriting .gitignore...") +entries = ['*', '!.gitignore'] +with open(files_path + "/.gitignore", 'w') as f: + f.write('\n'.join(entries)) + +shutil.rmtree(temp_images_path) diff --git a/src/scraping/buxton/source/Bill_Notes_3_button_optical_mouse.docx b/src/scraping/buxton/source/Bill_Notes_3_button_optical_mouse.docx Binary files differdeleted file mode 100644 index a2ab04b78..000000000 --- a/src/scraping/buxton/source/Bill_Notes_3_button_optical_mouse.docx +++ /dev/null diff --git a/src/scraping/buxton/source/Bill_Notes_Amazon_Kindle_Keyboard.docx b/src/scraping/buxton/source/Bill_Notes_Amazon_Kindle_Keyboard.docx Binary files differdeleted file mode 100644 index e4375ebeb..000000000 --- a/src/scraping/buxton/source/Bill_Notes_Amazon_Kindle_Keyboard.docx +++ /dev/null diff --git a/src/scraping/buxton/source/Bill_Notes_Apple_Adj_Keyboard.docx b/src/scraping/buxton/source/Bill_Notes_Apple_Adj_Keyboard.docx Binary files differdeleted file mode 100644 index 99f7ad19d..000000000 --- a/src/scraping/buxton/source/Bill_Notes_Apple_Adj_Keyboard.docx +++ /dev/null diff --git a/src/scraping/buxton/source/Bill_Notes_Apple_Mac_Portable.docx b/src/scraping/buxton/source/Bill_Notes_Apple_Mac_Portable.docx Binary files differdeleted file mode 100644 index df1aafe9c..000000000 --- a/src/scraping/buxton/source/Bill_Notes_Apple_Mac_Portable.docx +++ /dev/null diff --git a/src/scraping/buxton/source/Bill_Notes_BAT.docx b/src/scraping/buxton/source/Bill_Notes_BAT.docx Binary files differdeleted file mode 100644 index 0e3368611..000000000 --- a/src/scraping/buxton/source/Bill_Notes_BAT.docx +++ /dev/null diff --git a/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx b/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx Binary files differdeleted file mode 100644 index 06094b4d3..000000000 --- a/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx +++ /dev/null diff --git a/src/scraping/buxton/source/Bill_Notes_Braun_T3.docx b/src/scraping/buxton/source/Bill_Notes_Braun_T3.docx Binary files differdeleted file mode 100644 index b00080e08..000000000 --- a/src/scraping/buxton/source/Bill_Notes_Braun_T3.docx +++ /dev/null diff --git a/src/scraping/buxton/source/Bill_Notes_CasioC801.docx b/src/scraping/buxton/source/Bill_Notes_CasioC801.docx Binary files differdeleted file mode 100644 index 510a006e0..000000000 --- a/src/scraping/buxton/source/Bill_Notes_CasioC801.docx +++ /dev/null diff --git a/src/scraping/buxton/source/Bill_Notes_Casio_CZ-101.docx b/src/scraping/buxton/source/Bill_Notes_Casio_CZ-101.docx Binary files differdeleted file mode 100644 index c8d3943c0..000000000 --- a/src/scraping/buxton/source/Bill_Notes_Casio_CZ-101.docx +++ /dev/null diff --git a/src/scraping/buxton/source/Bill_Notes_Casio_Mini.docx b/src/scraping/buxton/source/Bill_Notes_Casio_Mini.docx Binary files differdeleted file mode 100644 index cea9e7b69..000000000 --- a/src/scraping/buxton/source/Bill_Notes_Casio_Mini.docx +++ /dev/null diff --git a/src/scraping/buxton/source/Bill_Notes_FingerWorks_Prototype.docx b/src/scraping/buxton/source/Bill_Notes_FingerWorks_Prototype.docx Binary files differdeleted file mode 100644 index f53402a06..000000000 --- a/src/scraping/buxton/source/Bill_Notes_FingerWorks_Prototype.docx +++ /dev/null diff --git a/src/scraping/buxton/source/Bill_Notes_Fingerworks_TouchStream.docx b/src/scraping/buxton/source/Bill_Notes_Fingerworks_TouchStream.docx Binary files differdeleted file mode 100644 index 0eec89949..000000000 --- a/src/scraping/buxton/source/Bill_Notes_Fingerworks_TouchStream.docx +++ /dev/null diff --git a/src/scraping/buxton/source/Bill_Notes_FrogPad.docx b/src/scraping/buxton/source/Bill_Notes_FrogPad.docx Binary files differdeleted file mode 100644 index d01e1bf5c..000000000 --- a/src/scraping/buxton/source/Bill_Notes_FrogPad.docx +++ /dev/null diff --git a/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx b/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx Binary files differdeleted file mode 100644 index b9a30c8a9..000000000 --- a/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx +++ /dev/null diff --git a/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx b/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx Binary files differdeleted file mode 100644 index 0615c4953..000000000 --- a/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx +++ /dev/null diff --git a/src/scraping/buxton/source/Bill_Notes_Kindle_3G_lighted_cover.docx b/src/scraping/buxton/source/Bill_Notes_Kindle_3G_lighted_cover.docx Binary files differdeleted file mode 100644 index f00fcb772..000000000 --- a/src/scraping/buxton/source/Bill_Notes_Kindle_3G_lighted_cover.docx +++ /dev/null diff --git a/src/scraping/buxton/source/Bill_Notes_Matias.docx b/src/scraping/buxton/source/Bill_Notes_Matias.docx Binary files differdeleted file mode 100644 index d2d014bbe..000000000 --- a/src/scraping/buxton/source/Bill_Notes_Matias.docx +++ /dev/null diff --git a/src/scraping/buxton/source/Bill_Notes_Microwriter.docx b/src/scraping/buxton/source/Bill_Notes_Microwriter.docx Binary files differdeleted file mode 100644 index 3ac272e42..000000000 --- a/src/scraping/buxton/source/Bill_Notes_Microwriter.docx +++ /dev/null diff --git a/src/scraping/buxton/source/Bill_Notes_MousePen.docx b/src/scraping/buxton/source/Bill_Notes_MousePen.docx Binary files differdeleted file mode 100644 index cd0b3eab3..000000000 --- a/src/scraping/buxton/source/Bill_Notes_MousePen.docx +++ /dev/null diff --git a/src/scraping/buxton/source/Bill_Notes_NB75D.docx b/src/scraping/buxton/source/Bill_Notes_NB75D.docx Binary files differdeleted file mode 100644 index a5a5e3d90..000000000 --- a/src/scraping/buxton/source/Bill_Notes_NB75D.docx +++ /dev/null diff --git a/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx b/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx Binary files differdeleted file mode 100644 index c0cf6ba9a..000000000 --- a/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx +++ /dev/null diff --git a/src/scraping/buxton/source/Bill_Notes_PARCtab.doc b/src/scraping/buxton/source/Bill_Notes_PARCtab.doc Binary files differdeleted file mode 100644 index 3cdc2d21b..000000000 --- a/src/scraping/buxton/source/Bill_Notes_PARCtab.doc +++ /dev/null diff --git a/src/scraping/buxton/source/Bill_Notes_Philco_Mystery_Control.docx b/src/scraping/buxton/source/Bill_Notes_Philco_Mystery_Control.docx Binary files differdeleted file mode 100644 index af72fa662..000000000 --- a/src/scraping/buxton/source/Bill_Notes_Philco_Mystery_Control.docx +++ /dev/null diff --git a/src/scraping/buxton/source/Bill_Notes_TASA_Kbd.docx b/src/scraping/buxton/source/Bill_Notes_TASA_Kbd.docx Binary files differdeleted file mode 100644 index 5c2eb8d7f..000000000 --- a/src/scraping/buxton/source/Bill_Notes_TASA_Kbd.docx +++ /dev/null diff --git a/src/scraping/buxton/source/Bill_Notes_The_Tap.docx b/src/scraping/buxton/source/Bill_Notes_The_Tap.docx Binary files differdeleted file mode 100644 index c9ee2eaea..000000000 --- a/src/scraping/buxton/source/Bill_Notes_The_Tap.docx +++ /dev/null diff --git a/src/scraping/buxton/source/Bill_Notes_Twiddler.docx b/src/scraping/buxton/source/Bill_Notes_Twiddler.docx Binary files differdeleted file mode 100644 index 27b4acc85..000000000 --- a/src/scraping/buxton/source/Bill_Notes_Twiddler.docx +++ /dev/null diff --git a/src/scraping/buxton/source/Bill_Notes_orbiTouch.doc b/src/scraping/buxton/source/Bill_Notes_orbiTouch.doc Binary files differdeleted file mode 100644 index 6bd71f20e..000000000 --- a/src/scraping/buxton/source/Bill_Notes_orbiTouch.doc +++ /dev/null |