aboutsummaryrefslogtreecommitdiff
path: root/src/scraping
diff options
context:
space:
mode:
Diffstat (limited to 'src/scraping')
-rw-r--r--src/scraping/buxton/.idea/buxton.iml8
-rw-r--r--src/scraping/buxton/.idea/inspectionProfiles/profiles_settings.xml6
-rw-r--r--src/scraping/buxton/.idea/misc.xml4
-rw-r--r--src/scraping/buxton/.idea/modules.xml8
-rw-r--r--src/scraping/buxton/.idea/vcs.xml6
-rw-r--r--src/scraping/buxton/.idea/workspace.xml173
-rw-r--r--src/scraping/buxton/final/BuxtonImporter.ts389
-rw-r--r--src/scraping/buxton/jsonifier.py231
-rw-r--r--src/scraping/buxton/narratives.py38
-rw-r--r--src/scraping/buxton/narratives/Theme - Chord Kbds.docxbin0 -> 5701815 bytes
-rw-r--r--src/scraping/buxton/narratives/chord_keyboards.json39
-rw-r--r--src/scraping/buxton/node_scraper.ts57
-rw-r--r--src/scraping/buxton/scraper.py37
-rw-r--r--src/scraping/buxton/source/Bill_Notes_3_button_optical_mouse.docxbin412208 -> 0 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_Amazon_Kindle_Keyboard.docxbin474022 -> 0 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_Apple_Adj_Keyboard.docxbin1758498 -> 0 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_Apple_Mac_Portable.docxbin748412 -> 0 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_BAT.docxbin1349620 -> 0 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docxbin1675500 -> 0 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_Braun_T3.docxbin1510917 -> 0 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_CasioC801.docxbin413861 -> 0 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_Casio_CZ-101.docxbin523939 -> 0 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_Casio_Mini.docxbin467304 -> 0 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_FingerWorks_Prototype.docxbin423384 -> 0 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_Fingerworks_TouchStream.docxbin1558473 -> 0 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_FrogPad.docxbin840173 -> 0 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docxbin1729610 -> 0 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docxbin2094142 -> 0 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_Kindle_3G_lighted_cover.docxbin919789 -> 0 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_Matias.docxbin476141 -> 0 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_Microwriter.docxbin1042556 -> 0 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_MousePen.docxbin344083 -> 0 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_NB75D.docxbin27696302 -> 0 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_PARCkbd.docxbin631959 -> 0 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_PARCtab.docbin4046250 -> 0 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_Philco_Mystery_Control.docxbin1880816 -> 0 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_TASA_Kbd.docxbin347612 -> 0 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_The_Tap.docxbin597382 -> 0 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_Twiddler.docxbin526307 -> 0 bytes
-rw-r--r--src/scraping/buxton/source/Bill_Notes_orbiTouch.docbin3945306 -> 0 bytes
40 files changed, 925 insertions, 71 deletions
diff --git a/src/scraping/buxton/.idea/buxton.iml b/src/scraping/buxton/.idea/buxton.iml
new file mode 100644
index 000000000..d0876a78d
--- /dev/null
+++ b/src/scraping/buxton/.idea/buxton.iml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+ <component name="NewModuleRootManager">
+ <content url="file://$MODULE_DIR$" />
+ <orderEntry type="inheritedJdk" />
+ <orderEntry type="sourceFolder" forTests="false" />
+ </component>
+</module> \ No newline at end of file
diff --git a/src/scraping/buxton/.idea/inspectionProfiles/profiles_settings.xml b/src/scraping/buxton/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 000000000..105ce2da2
--- /dev/null
+++ b/src/scraping/buxton/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+ <settings>
+ <option name="USE_PROJECT_PROFILE" value="false" />
+ <version value="1.0" />
+ </settings>
+</component> \ No newline at end of file
diff --git a/src/scraping/buxton/.idea/misc.xml b/src/scraping/buxton/.idea/misc.xml
new file mode 100644
index 000000000..a2e120dcc
--- /dev/null
+++ b/src/scraping/buxton/.idea/misc.xml
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+ <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7" project-jdk-type="Python SDK" />
+</project> \ No newline at end of file
diff --git a/src/scraping/buxton/.idea/modules.xml b/src/scraping/buxton/.idea/modules.xml
new file mode 100644
index 000000000..5bbca8f01
--- /dev/null
+++ b/src/scraping/buxton/.idea/modules.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+ <component name="ProjectModuleManager">
+ <modules>
+ <module fileurl="file://$PROJECT_DIR$/.idea/buxton.iml" filepath="$PROJECT_DIR$/.idea/buxton.iml" />
+ </modules>
+ </component>
+</project> \ No newline at end of file
diff --git a/src/scraping/buxton/.idea/vcs.xml b/src/scraping/buxton/.idea/vcs.xml
new file mode 100644
index 000000000..c2365ab11
--- /dev/null
+++ b/src/scraping/buxton/.idea/vcs.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+ <component name="VcsDirectoryMappings">
+ <mapping directory="$PROJECT_DIR$/../../.." vcs="Git" />
+ </component>
+</project> \ No newline at end of file
diff --git a/src/scraping/buxton/.idea/workspace.xml b/src/scraping/buxton/.idea/workspace.xml
new file mode 100644
index 000000000..c1db7a75b
--- /dev/null
+++ b/src/scraping/buxton/.idea/workspace.xml
@@ -0,0 +1,173 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+ <component name="ChangeListManager">
+ <list default="true" id="693c6819-edcc-46d6-8260-3f51ec080a46" name="Default Changelist" comment="">
+ <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
+ </list>
+ <option name="SHOW_DIALOG" value="false" />
+ <option name="HIGHLIGHT_CONFLICTS" value="true" />
+ <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
+ <option name="LAST_RESOLUTION" value="IGNORE" />
+ </component>
+ <component name="Git.Settings">
+ <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$/../../.." />
+ </component>
+ <component name="ProjectId" id="1XDYVVOvUV6lmODouwAWUpvxnni" />
+ <component name="ProjectLevelVcsManager" settingsEditedManually="true" />
+ <component name="ProjectViewState">
+ <option name="hideEmptyMiddlePackages" value="true" />
+ <option name="showExcludedFiles" value="true" />
+ <option name="showLibraryContents" value="true" />
+ </component>
+ <component name="PropertiesComponent">
+ <property name="ASKED_SHARE_PROJECT_CONFIGURATION_FILES" value="true" />
+ <property name="RunOnceActivity.ShowReadmeOnStart" value="true" />
+ <property name="last_opened_file_path" value="$PROJECT_DIR$" />
+ <property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
+ </component>
+ <component name="RunManager" selected="Python.narratives">
+ <configuration name="jsonifier" type="PythonConfigurationType" factoryName="Python" nameIsGenerated="true">
+ <module name="buxton" />
+ <option name="INTERPRETER_OPTIONS" value="" />
+ <option name="PARENT_ENVS" value="true" />
+ <envs>
+ <env name="PYTHONUNBUFFERED" value="1" />
+ </envs>
+ <option name="SDK_HOME" value="/usr/local/bin/python3.7" />
+ <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
+ <option name="IS_MODULE_SDK" value="false" />
+ <option name="ADD_CONTENT_ROOTS" value="true" />
+ <option name="ADD_SOURCE_ROOTS" value="true" />
+ <option name="SCRIPT_NAME" value="$PROJECT_DIR$/jsonifier.py" />
+ <option name="PARAMETERS" value="" />
+ <option name="SHOW_COMMAND_LINE" value="false" />
+ <option name="EMULATE_TERMINAL" value="false" />
+ <option name="MODULE_MODE" value="false" />
+ <option name="REDIRECT_INPUT" value="false" />
+ <option name="INPUT_FILE" value="" />
+ <method v="2" />
+ </configuration>
+ <configuration name="narratives" type="PythonConfigurationType" factoryName="Python" nameIsGenerated="true">
+ <module name="buxton" />
+ <option name="INTERPRETER_OPTIONS" value="" />
+ <option name="PARENT_ENVS" value="true" />
+ <envs>
+ <env name="PYTHONUNBUFFERED" value="1" />
+ </envs>
+ <option name="SDK_HOME" value="/usr/local/bin/python3.7" />
+ <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
+ <option name="IS_MODULE_SDK" value="false" />
+ <option name="ADD_CONTENT_ROOTS" value="true" />
+ <option name="ADD_SOURCE_ROOTS" value="true" />
+ <option name="SCRIPT_NAME" value="$PROJECT_DIR$/narratives.py" />
+ <option name="PARAMETERS" value="" />
+ <option name="SHOW_COMMAND_LINE" value="false" />
+ <option name="EMULATE_TERMINAL" value="false" />
+ <option name="MODULE_MODE" value="false" />
+ <option name="REDIRECT_INPUT" value="false" />
+ <option name="INPUT_FILE" value="" />
+ <method v="2" />
+ </configuration>
+ <configuration name="scraper" type="PythonConfigurationType" factoryName="Python">
+ <module name="buxton" />
+ <option name="INTERPRETER_OPTIONS" value="" />
+ <option name="PARENT_ENVS" value="true" />
+ <envs>
+ <env name="PYTHONUNBUFFERED" value="1" />
+ </envs>
+ <option name="SDK_HOME" value="/usr/local/bin/python3.7" />
+ <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
+ <option name="IS_MODULE_SDK" value="false" />
+ <option name="ADD_CONTENT_ROOTS" value="true" />
+ <option name="ADD_SOURCE_ROOTS" value="true" />
+ <option name="SCRIPT_NAME" value="$PROJECT_DIR$/scraper.py" />
+ <option name="PARAMETERS" value="" />
+ <option name="SHOW_COMMAND_LINE" value="false" />
+ <option name="EMULATE_TERMINAL" value="false" />
+ <option name="MODULE_MODE" value="false" />
+ <option name="REDIRECT_INPUT" value="false" />
+ <option name="INPUT_FILE" value="" />
+ <method v="2" />
+ </configuration>
+ <list>
+ <item itemvalue="Python.jsonifier" />
+ <item itemvalue="Python.narratives" />
+ <item itemvalue="Python.scraper" />
+ </list>
+ </component>
+ <component name="SvnConfiguration">
+ <configuration />
+ </component>
+ <component name="TaskManager">
+ <task active="true" id="Default" summary="Default task">
+ <changelist id="693c6819-edcc-46d6-8260-3f51ec080a46" name="Default Changelist" comment="" />
+ <created>1580582155646</created>
+ <option name="number" value="Default" />
+ <option name="presentableId" value="Default" />
+ <updated>1580582155646</updated>
+ </task>
+ <servers />
+ </component>
+ <component name="WindowStateProjectService">
+ <state x="184" y="103" key="#com.intellij.execution.impl.EditConfigurationsDialog" timestamp="1580656983882">
+ <screen x="0" y="23" width="1440" height="836" />
+ </state>
+ <state x="184" y="103" key="#com.intellij.execution.impl.EditConfigurationsDialog/0.23.1440.836@0.23.1440.836" timestamp="1580656983882" />
+ <state x="483" y="152" key="#xdebugger.evaluate" timestamp="1580601059439">
+ <screen x="0" y="23" width="1440" height="836" />
+ </state>
+ <state x="483" y="152" key="#xdebugger.evaluate/0.23.1440.836@0.23.1440.836" timestamp="1580601059439" />
+ <state width="1419" height="268" key="GridCell.Tab.0.bottom" timestamp="1580786975290">
+ <screen x="0" y="23" width="1440" height="836" />
+ </state>
+ <state width="1419" height="268" key="GridCell.Tab.0.bottom/0.23.1440.836@0.23.1440.836" timestamp="1580786975289" />
+ <state width="1419" height="268" key="GridCell.Tab.0.center" timestamp="1580786975289">
+ <screen x="0" y="23" width="1440" height="836" />
+ </state>
+ <state width="1419" height="268" key="GridCell.Tab.0.center/0.23.1440.836@0.23.1440.836" timestamp="1580786975289" />
+ <state width="1419" height="268" key="GridCell.Tab.0.left" timestamp="1580786975289">
+ <screen x="0" y="23" width="1440" height="836" />
+ </state>
+ <state width="1419" height="268" key="GridCell.Tab.0.left/0.23.1440.836@0.23.1440.836" timestamp="1580786975289" />
+ <state width="1419" height="268" key="GridCell.Tab.0.right" timestamp="1580786975289">
+ <screen x="0" y="23" width="1440" height="836" />
+ </state>
+ <state width="1419" height="268" key="GridCell.Tab.0.right/0.23.1440.836@0.23.1440.836" timestamp="1580786975289" />
+ <state width="1419" height="268" key="GridCell.Tab.1.bottom" timestamp="1580786975292">
+ <screen x="0" y="23" width="1440" height="836" />
+ </state>
+ <state width="1419" height="268" key="GridCell.Tab.1.bottom/0.23.1440.836@0.23.1440.836" timestamp="1580786975292" />
+ <state width="1419" height="268" key="GridCell.Tab.1.center" timestamp="1580786975291">
+ <screen x="0" y="23" width="1440" height="836" />
+ </state>
+ <state width="1419" height="268" key="GridCell.Tab.1.center/0.23.1440.836@0.23.1440.836" timestamp="1580786975291" />
+ <state width="1419" height="268" key="GridCell.Tab.1.left" timestamp="1580786975290">
+ <screen x="0" y="23" width="1440" height="836" />
+ </state>
+ <state width="1419" height="268" key="GridCell.Tab.1.left/0.23.1440.836@0.23.1440.836" timestamp="1580786975290" />
+ <state width="1419" height="268" key="GridCell.Tab.1.right" timestamp="1580786975292">
+ <screen x="0" y="23" width="1440" height="836" />
+ </state>
+ <state width="1419" height="268" key="GridCell.Tab.1.right/0.23.1440.836@0.23.1440.836" timestamp="1580786975292" />
+ <state x="229" y="80" key="SettingsEditor" timestamp="1580610123068">
+ <screen x="0" y="23" width="1440" height="836" />
+ </state>
+ <state x="229" y="80" key="SettingsEditor/0.23.1440.836@0.23.1440.836" timestamp="1580610123068" />
+ <state width="720" height="417" key="XDebugger.FullValuePopup" timestamp="1580584300118">
+ <screen x="0" y="23" width="1440" height="836" />
+ </state>
+ <state width="720" height="417" key="XDebugger.FullValuePopup/0.23.1440.836@0.23.1440.836" timestamp="1580584300118" />
+ <state x="399" y="273" key="com.intellij.ide.util.TipDialog" timestamp="1580799621511">
+ <screen x="0" y="23" width="1440" height="836" />
+ </state>
+ <state x="399" y="273" key="com.intellij.ide.util.TipDialog/0.23.1440.836@0.23.1440.836" timestamp="1580799621511" />
+ <state x="515" y="128" key="com.intellij.openapi.editor.actions.MultiplePasteAction$ClipboardContentChooser" timestamp="1580582281665">
+ <screen x="0" y="23" width="1440" height="836" />
+ </state>
+ <state x="515" y="128" key="com.intellij.openapi.editor.actions.MultiplePasteAction$ClipboardContentChooser/0.23.1440.836@0.23.1440.836" timestamp="1580582281665" />
+ <state x="385" y="183" width="670" height="676" key="search.everywhere.popup" timestamp="1580585906043">
+ <screen x="0" y="23" width="1440" height="836" />
+ </state>
+ <state x="385" y="183" width="670" height="676" key="search.everywhere.popup/0.23.1440.836@0.23.1440.836" timestamp="1580585906043" />
+ </component>
+</project> \ No newline at end of file
diff --git a/src/scraping/buxton/final/BuxtonImporter.ts b/src/scraping/buxton/final/BuxtonImporter.ts
new file mode 100644
index 000000000..e7a0d367d
--- /dev/null
+++ b/src/scraping/buxton/final/BuxtonImporter.ts
@@ -0,0 +1,389 @@
+import { readdirSync, writeFile, mkdirSync } from "fs";
+import * as path from "path";
+import { red, cyan, yellow } from "colors";
+import { Utils } from "../../../Utils";
+import rimraf = require("rimraf");
+import { DashUploadUtils } from "../../../server/DashUploadUtils";
+const StreamZip = require('node-stream-zip');
+const createImageSizeStream = require("image-size-stream");
+import { parseXml } from "libxmljs";
+import { strictEqual } from "assert";
+import { Readable, PassThrough } from "stream";
+
+/** Raw material pulled out of one Word document by extractFileContents, before any field parsing. */
+interface DocumentContents {
+ // text of the root element of word/document.xml (or a fallback message when there is no root)
+ body: string;
+ // one entry per image that writeImages kept and wrote to disk
+ imageData: ImageData[];
+ // targets of the hyperlink relationships found in word/_rels/document.xml.rels
+ hyperlinks: string[];
+ // second cell of every table row after the first
+ captions: string[];
+ // third cell of every table row after the first
+ embeddedFileNames: string[];
+}
+
+/**
+ * A fully parsed device record. The scalar fields are filled in by the processors
+ * registered in RegexMap; the array fields are copied over from DocumentContents.
+ */
+export interface DeviceDocument {
+ title: string;
+ shortDescription: string;
+ longDescription: string;
+ company: string;
+ year: number;
+ // -1 when the document lists the price as "NFS"
+ originalPrice?: number;
+ degreesOfFreedom?: number;
+ // NOTE(review): the "dimensions" processor in RegexMap produces an object
+ // ({ dim_length, dim_width, dim_height, dim_unit }), not a string - confirm the intended type.
+ dimensions?: string;
+ primaryKey: string;
+ secondaryKey: string;
+ attribute: string;
+ __images: ImageData[];
+ hyperlinks: string[];
+ captions: string[];
+ embeddedFileNames: string[];
+}
+
+/** Outcome of analyzing one document: analyze returns either `device` or `errors`, never both. */
+export interface AnalysisResult {
+ device?: DeviceDocument;
+ // keyed by field name; analyze always adds a `fileName` entry identifying the document
+ errors?: { [key: string]: string };
+}
+
+/** Converts the raw text captured by a Processor's regular expression into its final value. */
+type Transformer<T> = (raw: string) => TransformResult<T>;
+/** Result of a Transformer: a non-empty `error` marks the field as failed, otherwise `transformed` is used. */
+interface TransformResult<T> {
+ transformed?: T;
+ error?: string;
+}
+
+/** Summary counts handed to the TerminatorCallback once every document has been processed. */
+export interface ImportResults {
+ deviceCount: number;
+ errorCount: number;
+}
+
+/** Invoked once per document, as soon as that document has been analyzed. */
+type ResultCallback = (result: AnalysisResult) => void;
+/** Invoked once, after the output files have been written. */
+type TerminatorCallback = (result: ImportResults) => void;
+
+/** Describes how one DeviceDocument field is located in, and extracted from, the document body. */
+interface Processor<T> {
+ // pattern executed against the document body
+ exp: RegExp;
+ // capture group holding the value; analyze falls back to group 1 when omitted
+ matchIndex?: number;
+ // optional post-processing of the captured text
+ transformer?: Transformer<T>;
+ // analyze treats the field as required unless this is explicitly false
+ required?: boolean;
+}
+
+/** Location and native pixel size of an image extracted from a document. */
+interface ImageData {
+ url: string;
+ nativeWidth: number;
+ nativeHeight: number;
+}
+
+/**
+ * Small, mostly pure helpers shared by the field processors and the document extractor.
+ */
+namespace Utilities {
+
+    /**
+     * Parses a string into a number.
+     * @param raw the text to convert
+     * @returns `{ transformed }` on success, or `{ error }` when the text is blank
+     * or is not a valid numeric literal.
+     */
+    export function numberValue(raw: string): TransformResult<number> {
+        // Number("") and Number("   ") evaluate to 0, which would silently pass as a value
+        const transformed = raw.trim().length ? Number(raw) : NaN;
+        if (isNaN(transformed)) {
+            return { error: `${raw} cannot be parsed to a numeric value.` };
+        }
+        return { transformed };
+    }
+
+    /**
+     * Splits a keyword list on commas, the word "and" and whitespace, then returns the
+     * distinct tokens (compared case-insensitively), capitalized and sorted.
+     * @param raw the keyword text captured from the document
+     * @returns `{ transformed }` holding the sorted token list; never sets `error`.
+     */
+    export function collectUniqueTokens(raw: string): TransformResult<string[]> {
+        const pieces = raw.replace(/,|\s+and\s+/g, " ").split(/\s+/).filter(piece => piece.length);
+        const unique = new Set(pieces.map(token => token.toLowerCase().trim()));
+        return { transformed: Array.from(unique).map(capitalize).sort() };
+    }
+
+    /**
+     * Re-inserts the spacing lost after punctuation: a space is added after every
+     * '.', ':', ',' and '?', trailing whitespace is removed and runs of whitespace
+     * are collapsed to a single space.
+     * Note that this applies to every occurrence, so "3.5" becomes "3. 5".
+     * @param raw the text to repair
+     * @returns `{ transformed }` holding the repaired text; never sets `error`.
+     */
+    export function correctSentences(raw: string): TransformResult<string> {
+        raw = raw.replace(/\./g, ". ").replace(/\:/g, ": ").replace(/\,/g, ", ").replace(/\?/g, "? ").trimRight();
+        raw = raw.replace(/\s{2,}/g, " ");
+        return { transformed: raw };
+    }
+
+    /**
+     * Upper-cases the first character of a word, ignoring surrounding whitespace.
+     * @param word the word to capitalize
+     * @returns the trimmed, capitalized word, or the input unchanged when it is
+     * empty or whitespace only.
+     */
+    export function capitalize(word: string): string {
+        const clean = word.trim();
+        if (!clean.length) {
+            return word;
+        }
+        // operate on the trimmed text so that leading whitespace cannot defeat the capitalization
+        return clean.charAt(0).toUpperCase() + clean.slice(1);
+    }
+
+    /**
+     * Streams one entry out of an opened zip archive and parses it as XML.
+     * @param zip an opened archive exposing `stream(path, callback)`
+     * @param relativePath path of the entry inside the archive
+     * @returns the parsed XML document
+     * @throws rejects when the entry cannot be opened or the stream fails while reading
+     */
+    export async function readAndParseXml(zip: any, relativePath: string) {
+        console.log(`Text streaming ${relativePath}`);
+        const contents = await new Promise<string>((resolve, reject) => {
+            zip.stream(relativePath, (error: any, stream: any) => {
+                if (error) {
+                    // stop here: there is no stream to attach listeners to
+                    return reject(error);
+                }
+                // decode once at the end so a multi-byte character split across two chunks stays intact
+                const chunks: Buffer[] = [];
+                stream.on('data', (chunk: any) => chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk)));
+                stream.on('error', reject);
+                stream.on('end', () => resolve(Buffer.concat(chunks).toString("utf8")));
+            });
+        });
+
+        return parseXml(contents);
+    }
+
+}
+
+/**
+ * Transformer shared by the keyword fields (primaryKey, secondaryKey, attribute):
+ * tokenizes the captured text and keeps the alphabetically first distinct token.
+ * Reports an error instead of yielding `undefined` when no token is left.
+ */
+const firstSortedToken: Transformer<string> = raw => {
+    const { transformed, error } = Utilities.collectUniqueTokens(raw);
+    if (!transformed || !transformed.length) {
+        return { error: error ?? `no keywords could be extracted from "${raw}".` };
+    }
+    return { transformed: transformed[0] };
+};
+
+/**
+ * For every DeviceDocument field that is parsed out of the document body: the
+ * pattern that locates it, the capture group to read and the optional transformer
+ * that converts the captured text. Fields are required unless marked otherwise.
+ */
+const RegexMap = new Map<keyof DeviceDocument, Processor<any>>([
+    ["title", {
+        exp: /contact\s+(.*)Short Description:/
+    }],
+    ["company", {
+        exp: /Company:\s+([^\|]*)\s+\|/,
+        // drop periods, e.g. from "Inc."
+        transformer: (raw: string) => ({ transformed: raw.replace(/\./g, "") })
+    }],
+    ["year", {
+        exp: /Year:\s+([^\|]*)\s+\|/,
+        transformer: (raw: string) => {
+            // the captured text may carry extra characters; keep the first run of four digits
+            const fourDigits = /[0-9]{4}/.exec(raw);
+            if (!fourDigits) {
+                return { error: `${raw} does not contain a four-digit year.` };
+            }
+            return Utilities.numberValue(fourDigits[0]);
+        }
+    }],
+    ["primaryKey", {
+        exp: /Primary:\s+(.*)(Secondary|Additional):/,
+        transformer: firstSortedToken
+    }],
+    ["secondaryKey", {
+        exp: /(Secondary|Additional):\s+(.*)Attributes?:/,
+        transformer: firstSortedToken,
+        // group 1 is the label alternative, the value sits in group 2
+        matchIndex: 2
+    }],
+    ["attribute", {
+        exp: /Attributes?:\s+(.*)Links/,
+        transformer: firstSortedToken,
+    }],
+    ["originalPrice", {
+        exp: /Original Price \(USD\)\:\s+(\$[0-9\,]+\.[0-9]+|NFS)/,
+        transformer: (raw: string) => {
+            raw = raw.replace(/\,/g, "");
+            if (raw === "NFS") {
+                // sentinel for items marked "NFS" in the document
+                return { transformed: -1 };
+            }
+            // skip the leading dollar sign
+            return Utilities.numberValue(raw.slice(1));
+        },
+        required: false
+    }],
+    ["degreesOfFreedom", {
+        exp: /Degrees of Freedom:\s+([0-9]+)/,
+        transformer: Utilities.numberValue,
+        required: false
+    }],
+    ["dimensions", {
+        exp: /Dimensions\s+\(L x W x H\):\s+([0-9\.]+\s+x\s+[0-9\.]+\s+x\s+[0-9\.]+\s\([A-Za-z]+\))/,
+        transformer: (raw: string) => {
+            // split on the same whitespace classes the pattern accepts, not on literal single spaces
+            const [length, width, group] = raw.split(/\s+x\s+/);
+            const [height, unit] = group.split(/\s+/);
+            return {
+                transformed: {
+                    dim_length: Number(length),
+                    dim_width: Number(width),
+                    dim_height: Number(height),
+                    dim_unit: unit.replace(/[\(\)]+/g, "")
+                }
+            };
+        },
+        required: false
+    }],
+    ["shortDescription", {
+        exp: /Short Description:\s+(.*)Bill Buxton[’']s Notes/,
+        transformer: Utilities.correctSentences
+    }],
+    ["longDescription", {
+        exp: /Bill Buxton[’']s Notes(.*)Device Details/,
+        transformer: Utilities.correctSentences
+    }],
+]);
+
+const sourceDir = path.resolve(__dirname, "source");
+const outDir = path.resolve(__dirname, "json");
+const imageDir = path.resolve(__dirname, "../../../server/public/files/images/buxton");
+const successOut = "buxton.json";
+const failOut = "incomplete.json";
+const deviceKeys = Array.from(RegexMap.keys());
+
+/**
+ * Entry point of the import: lists the Word documents in the source directory,
+ * recreates the JSON and image output directories from scratch and parses every document.
+ * @param emitter invoked with the result of each document as soon as it is analyzed
+ * @param terminator invoked once with the final counts after the output files are written
+ * @returns the parsed devices, or `{ error }` when the source directory cannot be
+ * read or the output directories cannot be recreated.
+ */
+export default async function executeImport(emitter: ResultCallback, terminator: TerminatorCallback) {
+    let contents: string[];
+    try {
+        contents = readdirSync(sourceDir);
+    } catch (e) {
+        const message = [
+            "Unable to find a source directory.",
+            "Please ensure that the following directory exists and is populated with Word documents:",
+            `${sourceDir}`
+        ].join('\n');
+        console.log(red(message));
+        return { error: message };
+    }
+
+    const wordDocuments = contents
+        // "~$..." entries are lock files Word leaves next to open documents, not archives
+        .filter(file => /.*\.docx?$/.test(file) && !file.startsWith("~$"))
+        .map(file => `${sourceDir}/${file}`);
+
+    try {
+        [outDir, imageDir].forEach(dir => {
+            rimraf.sync(dir);
+            // recursive, so a missing parent directory does not abort the import
+            mkdirSync(dir, { recursive: true });
+        });
+    } catch (e) {
+        // reported separately so a failure here is not mistaken for a missing source directory
+        const message = `Unable to recreate the output directories (${outDir}, ${imageDir}): ${e instanceof Error ? e.message : e}`;
+        console.log(red(message));
+        return { error: message };
+    }
+
+    return parseFiles(wordDocuments, emitter, terminator);
+}
+
+/**
+ * Extracts and analyzes every document in turn, reports each result through
+ * `emitter`, writes the successful and failed records to their output files and
+ * finally reports the totals through `terminator`.
+ * A document whose contents cannot be extracted is recorded as a failed document
+ * instead of aborting the remaining imports.
+ * @param wordDocuments absolute paths of the documents to import
+ * @param emitter invoked once per document
+ * @param terminator invoked once with the final counts
+ * @returns the successfully parsed devices
+ * @throws when the number of devices and failures does not add up to the number of documents
+ */
+async function parseFiles(wordDocuments: string[], emitter: ResultCallback, terminator: TerminatorCallback): Promise<DeviceDocument[]> {
+    const results: AnalysisResult[] = [];
+    for (const filePath of wordDocuments) {
+        const fileName = path.basename(filePath).replace("Bill_Notes_", "");
+        console.log(cyan(`\nExtracting contents from ${fileName}...`));
+        let result: AnalysisResult;
+        try {
+            result = analyze(fileName, await extractFileContents(filePath));
+        } catch (e) {
+            // unreadable archive, failed caption assertion, image failure, ...
+            const reason = e instanceof Error ? e.message : String(e);
+            console.log(red(`@ ${cyan(fileName.toUpperCase())}...`));
+            console.log(red(reason));
+            result = { errors: { fileName, extraction: `ERR__EXTRACTION__: ${reason}` } };
+        }
+        emitter(result);
+        results.push(result);
+    }
+
+    const masterDevices: DeviceDocument[] = [];
+    const masterErrors: { [key: string]: string }[] = [];
+    for (const { device, errors } of results) {
+        if (device) {
+            masterDevices.push(device);
+        } else if (errors) {
+            masterErrors.push(errors);
+        }
+    }
+
+    // every document must have produced exactly one device or one error record
+    const total = wordDocuments.length;
+    if (masterDevices.length + masterErrors.length !== total) {
+        throw new Error(`Encountered a ${masterDevices.length} to ${masterErrors.length} mismatch in device / error split!`);
+    }
+
+    console.log();
+    await writeOutputFile(successOut, masterDevices, total, true);
+    await writeOutputFile(failOut, masterErrors, total, false);
+    console.log();
+
+    terminator({ deviceCount: masterDevices.length, errorCount: masterErrors.length });
+
+    return masterDevices;
+}
+
+const tableCellXPath = '//*[name()="w:tbl"]/*[name()="w:tr"]/*[name()="w:tc"]';
+const hyperlinkXPath = '//*[name()="Relationship" and contains(@Type, "hyperlink")]';
+
+/**
+ * Opens a Word document as a zip archive and pulls out its body text, the caption
+ * and file-name columns of its table, its hyperlink targets and its images.
+ * The archive is closed on every exit path.
+ * @param pathToDocument path of the document on disk
+ * @returns the extracted contents
+ * @throws when the archive cannot be opened or read, or when the table cells do not
+ * form at least two complete rows of three cells
+ */
+async function extractFileContents(pathToDocument: string): Promise<DocumentContents> {
+    console.log('Extracting text...');
+    const zip = new StreamZip({ file: pathToDocument, storeEntries: true });
+    // without the error listener a corrupt or unreadable archive would leave this promise pending forever
+    await new Promise<void>((resolve, reject) => {
+        zip.on('ready', resolve);
+        zip.on('error', reject);
+    });
+
+    try {
+        // extract the body of the document and, specifically, its captions
+        const document = await Utilities.readAndParseXml(zip, "word/document.xml");
+        const body = document.root()?.text() ?? "No body found. Check the import script's XML parser.";
+        const captions: string[] = [];
+        const embeddedFileNames: string[] = [];
+        const captionTargets = document.find(tableCellXPath).map(node => node.text());
+
+        const { length } = captionTargets;
+        strictEqual(length > 3, true, "No captions written.");
+        strictEqual(length % 3 === 0, true, "Improper caption formatting.");
+
+        // cells arrive flattened, three per row; the first row is skipped
+        for (let i = 3; i < captionTargets.length; i += 3) {
+            const row = captionTargets.slice(i, i + 3);
+            captions.push(row[1]);
+            embeddedFileNames.push(row[2]);
+        }
+
+        // extract all hyperlinks embedded in the document
+        const rels = await Utilities.readAndParseXml(zip, "word/_rels/document.xml.rels");
+        // look the attribute up by name: attribute order inside an element carries no meaning in XML
+        const hyperlinks = rels.find(hyperlinkXPath)
+            .map(el => el.attr("Target")?.value())
+            .filter((target): target is string => !!target);
+        console.log("Text extracted.");
+
+        console.log("Beginning image extraction...");
+        const imageData = await writeImages(zip);
+        console.log(`Extracted ${imageData.length} images.`);
+
+        return { body, imageData, captions, embeddedFileNames, hyperlinks };
+    } finally {
+        zip.close();
+    }
+}
+
+const imageEntry = /^word\/media\/\w+\.(jpeg|jpg|png|gif)/;
+
+/** Pixel size and format reported by the image size stream. */
+interface Dimensions {
+    width: number;
+    height: number;
+    type: string;
+}
+
+/**
+ * Measures every image stored under word/media in the archive and writes resized
+ * copies of the kept ones into the image directory under a generated file name.
+ * Images whose width and height differ by fewer than 10 pixels are skipped
+ * (presumably icons rather than photographs - confirm).
+ * @param zip an opened archive exposing `entries()` and `stream(path, callback)`
+ * @returns the url and native size of every image that was written
+ * @throws rejects when an image entry cannot be streamed or its size cannot be determined
+ */
+async function writeImages(zip: any): Promise<ImageData[]> {
+    const allEntries = Object.values<any>(zip.entries()).map(({ name }) => name);
+    const imageEntries = allEntries.filter(name => imageEntry.test(name));
+
+    const imageUrls: ImageData[] = [];
+    for (const mediaPath of imageEntries) {
+        // a fresh stream per consumer: one to measure the image, another for the resize step
+        const getImageStream = () => new Promise<Readable>((resolve, reject) => {
+            zip.stream(mediaPath, (error: any, stream: any) => error ? reject(error) : resolve(stream));
+        });
+
+        const { width, height, type } = await new Promise<Dimensions>((resolve, reject) => {
+            getImageStream().then(readStream => {
+                const sizeStream = (createImageSizeStream() as PassThrough)
+                    .on('size', (dimensions: Dimensions) => {
+                        // the header has been read; the rest of the image is not needed
+                        readStream.destroy();
+                        resolve(dimensions);
+                    })
+                    .on("error", (error: any) => {
+                        readStream.destroy();
+                        // settle the promise, otherwise the import would wait on this image forever
+                        reject(error);
+                    });
+                readStream.on("error", reject);
+                readStream.pipe(sizeStream);
+            }, reject);
+        });
+        if (Math.abs(width - height) < 10) {
+            continue;
+        }
+
+        const generatedFileName = `upload_${Utils.GenerateGuid()}.${type.toLowerCase()}`;
+        await DashUploadUtils.outputResizedImages(getImageStream, generatedFileName, imageDir);
+
+        imageUrls.push({
+            url: `/files/images/buxton/${generatedFileName}`,
+            nativeWidth: width,
+            nativeHeight: height
+        });
+    }
+
+    return imageUrls;
+}
+
+/**
+ * Builds a device record from extracted document contents by running every
+ * processor in RegexMap against the body text.
+ * @param fileName name of the document, used to label error output
+ * @param contents the contents extracted from the document
+ * @returns `{ device }` when every required field was captured and transformed,
+ * otherwise `{ errors }` keyed by field name plus a `fileName` entry.
+ */
+function analyze(fileName: string, contents: DocumentContents): AnalysisResult {
+    const { body, imageData, captions, hyperlinks, embeddedFileNames } = contents;
+    const device: any = {
+        hyperlinks,
+        captions,
+        embeddedFileNames,
+        __images: imageData
+    };
+    // seeded with the file name, so a single key means that no field failed
+    const errors: { [key: string]: string } = { fileName };
+
+    for (const key of deviceKeys) {
+        const { exp, transformer, matchIndex, required } = RegexMap.get(key)!;
+        const matches = exp.exec(body);
+
+        let captured: string;
+        if (matches && (captured = matches[matchIndex ?? 1])) {
+            captured = captured.replace(/\s{2,}/g, " ");
+            if (transformer) {
+                const { error, transformed } = transformer(captured);
+                if (error) {
+                    errors[key] = `__ERR__${key.toUpperCase()}__TRANSFORM__: ${error}`;
+                    continue;
+                }
+                captured = transformed;
+            }
+
+            device[key] = captured;
+        } else if (required ?? true) {
+            // either the pattern did not match at all, or it matched with an empty capture group
+            errors[key] = `ERR__${key.toUpperCase()}__: outer match ${matches === null ? "wasn't" : "was"} captured.`;
+            continue;
+        }
+    }
+
+    const errorKeys = Object.keys(errors);
+    if (errorKeys.length > 1) {
+        console.log(red(`@ ${cyan(fileName.toUpperCase())}...`));
+        // the file name is already in the header line above; print only the field errors
+        errorKeys
+            .filter(key => key !== "fileName")
+            .forEach(key => console.log(red(errors[key])));
+        return { errors };
+    }
+
+    return { device };
+}
+
+/**
+ * Logs a one-line summary and serializes `data` as indented JSON into the output directory.
+ * @param relativePath file name, resolved against the output directory
+ * @param data the records to serialize
+ * @param total number of candidate documents, used only in the log line
+ * @param success whether `data` holds the valid or the invalid documents, used only in the log line
+ * @returns a promise that settles once the file has been written, rejecting on a write error
+ */
+async function writeOutputFile(relativePath: string, data: any[], total: number, success: boolean) {
+    const verdict = success ? "valid" : "invalid";
+    console.log(yellow(`Encountered ${data.length} ${verdict} documents out of ${total} candidates. Writing ${relativePath}...`));
+    const target = path.resolve(outDir, relativePath);
+    const serialized = JSON.stringify(data, undefined, 4);
+    return new Promise<void>((resolve, reject) => {
+        writeFile(target, serialized, failure => {
+            if (failure) {
+                reject(failure);
+            } else {
+                resolve();
+            }
+        });
+    });
+} \ No newline at end of file
diff --git a/src/scraping/buxton/jsonifier.py b/src/scraping/buxton/jsonifier.py
new file mode 100644
index 000000000..a315d49c0
--- /dev/null
+++ b/src/scraping/buxton/jsonifier.py
@@ -0,0 +1,231 @@
+import os
+import docx2txt
+from docx import Document
+from docx.opc.constants import RELATIONSHIP_TYPE as RT
+import re
+import shutil
+import uuid
+import json
+import base64
+from shutil import copyfile
+from PIL import Image
+
+files_path = "../../server/public/files"
+source_path = "./source"
+temp_images_path = "./extracted_images"
+server_images_path = f"{files_path}/images/buxton"
+json_path = "./json"
+
+
+# noinspection PyProtectedMember
+def extract_links(file):
+    """Return the targets of the hyperlink relationships of a Word document.
+
+    Relationships whose target contains ".aspx" are left out.
+
+    :param file: path of (or file-like object for) the document to open
+    :return: list of hyperlink targets, in relationship iteration order
+    """
+    relationships = Document(file).part.rels
+    candidates = (relationships[key] for key in relationships)
+    return [
+        candidate._target
+        for candidate in candidates
+        if candidate.reltype == RT.HYPERLINK and ".aspx" not in candidate._target
+    ]
+
+
+def extract_value(kv_string):
+    """Return the value part of a "key: value" line, stripped of surrounding whitespace.
+
+    Only the first colon separates key from value, so values that themselves
+    contain colons (URLs, times, ratios) are returned whole. A line without any
+    colon is returned stripped but otherwise unchanged.
+
+    :param kv_string: the line to split
+    :return: the stripped value
+    """
+    _, separator, value = kv_string.partition(":")
+    return (value if separator else kv_string).strip()
+
+
+def mkdir_if_absent(path):
+    """Create the directory ``path`` unless it already exists.
+
+    Only the last path component is created. A failure (for example a missing
+    parent directory) is reported on stdout and otherwise ignored.
+
+    :param path: the directory to create
+    """
+    try:
+        if not os.path.exists(path):
+            os.mkdir(path)
+    except OSError:
+        # report the path that failed; no other name is guaranteed to exist at this point
+        print("failed to create the appropriate directory structures for %s" % path)
+
+
+def guid():
+    """Return a freshly generated random (version 4) UUID in its canonical string form."""
+    fresh = uuid.uuid4()
+    return f"{fresh}"
+
+
+def encode_image(folder: str, name: str):
+    """Return the base64 text of an image stored below the temporary image directory.
+
+    :param folder: sub-directory of ``temp_images_path`` holding the image
+    :param name: file name of the image
+    :return: the file's bytes, base64-encoded and decoded to a UTF-8 string
+    """
+    image_path = f"{temp_images_path}/{folder}/{name}"
+    with open(image_path, "rb") as handle:
+        raw_bytes = handle.read()
+    return base64.b64encode(raw_bytes).decode("utf-8")
+
+
+def parse_document(name: str):
+    """Parse one Word document from ``source_path`` into a dictionary of device fields.
+
+    The document text is split into sanitized, non-empty lines that are then read
+    positionally: title on line 2, short description on line 3, free-form notes from
+    line 5 up to "Device Details", followed by the company / year / price line,
+    degrees of freedom, optional dimensions, key words, link descriptions, the image
+    table and an optional trailing "NOTES:" section. Embedded images that are not
+    nearly square are copied into the server image directory.
+
+    :param name: file name of the document inside ``source_path``
+    :return: dict with the keys extracted_images, title, short_description,
+        buxton_notes, company, year, original_price, degrees_of_freedom, dimensions,
+        primary_key, secondary_key, link_descriptions, hyperlinks, table_image_names,
+        captions and, when present, notes
+    :raises IndexError: when an expected section marker is missing from the document
+    """
+    print(f"parsing {name}...")
+    pure_name = name.split(".")[0]
+
+    result = {}
+
+    saved_device_images_dir = server_images_path + "/" + pure_name
+    temp_device_images_dir = temp_images_path + "/" + pure_name
+    mkdir_if_absent(temp_device_images_dir)
+    mkdir_if_absent(saved_device_images_dir)
+
+    # the second argument is the directory the images are read back from below
+    raw = str(docx2txt.process(source_path +
+                               "/" + name, temp_device_images_dir))
+
+    extracted_images = []
+    for image in os.listdir(temp_device_images_dir):
+        temp = f"{temp_device_images_dir}/{image}"
+        # close the file handle right after reading the size
+        with Image.open(temp) as opened:
+            native_width, native_height = opened.size
+        # nearly square images are skipped
+        if abs(native_width - native_height) < 10:
+            continue
+        original = saved_device_images_dir + "/" + image.replace(".", "_o.", 1)
+        medium = saved_device_images_dir + "/" + image.replace(".", "_m.", 1)
+        copyfile(temp, original)
+        copyfile(temp, medium)
+        server_path = f"http://localhost:1050/files/images/buxton/{pure_name}/{image}"
+        extracted_images.append(server_path)
+    result["extracted_images"] = extracted_images
+
+    def sanitize(line):
+        """Drop newlines/tabs and replace non-breaking spaces, en dashes and curly quotes."""
+        return re.sub("[\n\t]+", "", line).replace(u"\u00A0", " ").replace(
+            u"\u2013", "-").replace(u"\u201c", '''"''').replace(u"\u201d", '''"''').strip()
+
+    def sanitize_price(raw_price: str):
+        """Return the price as a float, -1 for "NFS" items, or None when there is no usable price."""
+        raw_price = raw_price.replace(",", "")
+        start = raw_price.find("$")
+        if "x" in raw_price.lower():
+            return None
+        if start > -1:
+            i = start + 1
+            while i < len(raw_price) and re.match(r"[0-9.]", raw_price[i]):
+                i += 1
+            # i is the index of the first character that is not part of the number
+            price = raw_price[start + 1: i]
+            try:
+                return float(price)
+            except ValueError:
+                # a lone "$" or a malformed number such as "1.2.3"
+                return None
+        elif "nfs" in raw_price.lower():
+            return -1
+        else:
+            return None
+
+    def remove_empty(line):
+        """Keep only lines holding more than one character."""
+        return len(line) > 1
+
+    def try_parse(to_parse: str):
+        """Return the text as an int, or None when it is not an integer literal."""
+        try:
+            return int(to_parse)
+        except ValueError:
+            return None
+
+    lines = list(map(sanitize, raw.split("\n")))
+    lines = list(filter(remove_empty, lines))
+
+    result["title"] = lines[2].strip()
+    result["short_description"] = lines[3].strip().replace(
+        "Short Description: ", "")
+
+    cur = 5
+    notes = ""
+    while lines[cur] != "Device Details":
+        notes += lines[cur] + " "
+        cur += 1
+    result["buxton_notes"] = notes.strip()
+
+    # "Company: ... | Year: ... | Original Price: ..." - keep the last colon-separated piece of each part
+    cur += 1
+    clean = list(
+        map(lambda data: data.strip().split(":"), lines[cur].split("|")))
+    result["company"] = clean[0][len(clean[0]) - 1].strip()
+
+    result["year"] = try_parse(clean[1][len(clean[1]) - 1].strip())
+    result["original_price"] = sanitize_price(
+        clean[2][len(clean[2]) - 1].strip())
+
+    cur += 1
+
+    result["degrees_of_freedom"] = try_parse(extract_value(
+        lines[cur]).replace("NA", "N/A"))
+    cur += 1
+
+    dimensions = lines[cur].lower()
+    if dimensions.startswith("dimensions"):
+        dim_concat = dimensions[11:].strip()
+        cur += 1
+        # the dimensions may wrap over several lines
+        while lines[cur] != "Key Words":
+            dim_concat += (" " + lines[cur].strip())
+            cur += 1
+        result["dimensions"] = dim_concat
+    else:
+        result["dimensions"] = "N/A"
+
+    cur += 1
+    result["primary_key"] = extract_value(lines[cur])
+    cur += 1
+    result["secondary_key"] = extract_value(lines[cur])
+    # move past the line just consumed so that it is not appended a second time below
+    cur += 1
+
+    while lines[cur] != "Links":
+        result["secondary_key"] += (" " + extract_value(lines[cur]).strip())
+        cur += 1
+
+    cur += 1
+    link_descriptions = []
+    ignored_markers = ["powerpoint", "vimeo", "xxx"]
+    while lines[cur] != "Image":
+        description = lines[cur].strip().lower()
+        if not any(ignored in description for ignored in ignored_markers):
+            link_descriptions.append(description)
+        cur += 1
+    result["link_descriptions"] = link_descriptions
+
+    result["hyperlinks"] = extract_links(source_path + "/" + name)
+
+    images = []
+    captions = []
+    # skip the three header cells of the image table
+    cur += 3
+    while cur + 1 < len(lines) and lines[cur] != "NOTES:":
+        image_name = lines[cur]
+        # NOTE(review): the caption is recorded only for rows that are kept, which keeps
+        # both lists index-aligned - confirm against the original indentation.
+        if "full document" not in image_name.lower():
+            images.append(image_name)
+            captions.append(lines[cur + 1])
+        cur += 2
+    result["table_image_names"] = images
+
+    result["captions"] = captions
+
+    # NOTE(review): trailing lines are collected only after a "NOTES:" marker - confirm
+    # against the original indentation.
+    notes = []
+    if cur < len(lines) and lines[cur] == "NOTES:":
+        cur += 1
+        while cur < len(lines):
+            notes.append(lines[cur])
+            cur += 1
+    if len(notes) > 0:
+        result["notes"] = notes
+
+    return result
+
+
+# --- script body: rebuild the image directory, parse every document, write the JSON collection ---
+
+# start from an empty server image directory
+if os.path.exists(server_images_path):
+    shutil.rmtree(server_images_path)
+# wait until the directory is really gone before recreating it
+# (presumably for platforms where the removal completes asynchronously - confirm)
+while os.path.exists(server_images_path):
+    pass
+# makedirs, so that missing parent directories do not abort the run
+os.makedirs(server_images_path)
+
+mkdir_if_absent(source_path)
+mkdir_if_absent(json_path)
+mkdir_if_absent(temp_images_path)
+
+results = []
+
+candidates = 0
+for file_name in os.listdir(source_path):
+    if file_name.endswith(('.docx', '.doc')):
+        candidates += 1
+        results.append(parse_document(file_name))
+
+
+# write into the directory that was just ensured above instead of a second hard-coded copy of it
+with open(f"{json_path}/buxton_collection.json", "w", encoding="utf-8") as out:
+    json.dump(results, out, ensure_ascii=False, indent=4)
+
+print(f"\nSuccessfully parsed {candidates} candidates.")
+
+# keep everything below the files directory out of version control, except the .gitignore itself
+print("\nrewriting .gitignore...")
+entries = ['*', '!.gitignore']
+with open(files_path + "/.gitignore", 'w') as f:
+    f.write('\n'.join(entries))
+
+shutil.rmtree(temp_images_path)
diff --git a/src/scraping/buxton/narratives.py b/src/scraping/buxton/narratives.py
new file mode 100644
index 000000000..947d60f91
--- /dev/null
+++ b/src/scraping/buxton/narratives.py
@@ -0,0 +1,38 @@
+from docx import Document
+import tempfile
+from zipfile import ZipFile
+import shutil
+from pathlib import Path
+from os import mkdir
+
+path = "./narratives/Theme - Chord Kbds.docx"  # narrative document to inspect
+doc = Document(path)  # open it with the Document class imported from docx
+
+# IMAGE_EXT = ('png', 'jpeg', 'jpg')
+#
+# with tempfile.TemporaryDirectory() as working_dir:
+# with ZipFile(path) as working_zip:
+# image_list = [name for name in working_zip.namelist() if any(name.endswith(ext) for ext in IMAGE_EXT)]
+# working_zip.extractall(working_dir, image_list)
+# mkdir("./test")
+# for image in image_list:
+# shutil.copy(Path(working_dir).resolve() / image, "./test")
+
+paragraphs = doc.paragraphs
+for index, paragraph in enumerate(paragraphs):  # print each paragraph prefixed by its position
+    print(f"{index}: {paragraph.text}")
+
+# for section in doc.sections:
+# print(section.orientation)
+
+# for shape in doc.inline_shapes:
+# print(shape._inline)
+
+# images = doc.tables[0]
+# for row in images.rows:
+# contents = []
+# for cell in row.cells:
+# contents.append(cell.text)
+ # print(contents)
+
+
diff --git a/src/scraping/buxton/narratives/Theme - Chord Kbds.docx b/src/scraping/buxton/narratives/Theme - Chord Kbds.docx
new file mode 100644
index 000000000..439a7d975
--- /dev/null
+++ b/src/scraping/buxton/narratives/Theme - Chord Kbds.docx
Binary files differ
diff --git a/src/scraping/buxton/narratives/chord_keyboards.json b/src/scraping/buxton/narratives/chord_keyboards.json
new file mode 100644
index 000000000..748578769
--- /dev/null
+++ b/src/scraping/buxton/narratives/chord_keyboards.json
@@ -0,0 +1,39 @@
+{
+ "slides": [{
+ "text": "Theme: Chord Keyboards\nFrom music to type\n\nChord keyboards require 2 or more keys to be simultaneously pushed to spawn the intended output. Playing a chord on a piano or pushing both the shift + a letter key on a typewriter to enter an upper case character are examples.",
+ "devices": ["Casio CZ-101"]
+ },
+ {
+ "text": "This is an early mechanical keyboard for taking dictation. Instead of typing alphanumeric characters as on a typewriter, pressing different combinations prints shorthand symbols on the tape, each representing a different phoneme. Speech is easier to keep up with this way, since each phoneme typically represents multiple characters.\n\nThe downside – until AI came to the rescue – was that it then took hours to manually transcribe the shorthand into conventional readable text.",
+ "devices": ["Grandjean Sténotype"]
+ },
+ {
+ "text": "Designed and manufactured in the DDR, the purpose of this keyboard is to emboss dots representing Braille symbols onto paper. The effect is to enable blind users to use their tactile sensitivity to read with their fingers.\n\nEach Braille symbol consists of two columns of 3 embossed dots each. Which 3 dots are embossed in each column is determined by which of the three keys on either side are simultaneously pressed. The key in the middle, operated by either thumb, enters a space.",
+ "devices": ["Braille Writer"]
+ },
+ {
+ "text": "This combination is derived from the work of the inventor of the mouse, Doug Engelbart.\n\nWhile these are 2 distinct devices, they are not what they appear to be.\n\nFunctionally, there is a virtual 7-button chord keyboard, employing the 5 buttons on the keyset and the middle and right buttons of the mouse. And, using the left mouse button, there is also a 1-button mouse.\n\nText was entered using a minor variant of 7-bit ASCII. The intent was to enable entering small bits of text without moving back-and-forth between mouse and QWERTY keyboard. It didn’t catch on.",
+ "devices": ["Xerox PARC 5-Button Keyset & 3-Button Mouse"]
+ },
+ {
+ "text": "",
+ "devices": []
+ },
+ {
+ "text": "",
+ "devices": []
+ },
+ {
+ "text": "",
+ "devices": []
+ },
+ {
+ "text": "",
+ "devices": []
+ },
+ {
+ "text": "",
+ "devices": []
+ }
+ ]
+} \ No newline at end of file
diff --git a/src/scraping/buxton/node_scraper.ts b/src/scraping/buxton/node_scraper.ts
deleted file mode 100644
index ef1d989d4..000000000
--- a/src/scraping/buxton/node_scraper.ts
+++ /dev/null
@@ -1,57 +0,0 @@
-import { readdirSync } from "fs";
-import { resolve } from "path";
-
-const StreamZip = require('node-stream-zip');
-
-export async function open(path: string) {
- const zip = new StreamZip({
- file: path,
- storeEntries: true
- });
- return new Promise<string>((resolve, reject) => {
- zip.on('ready', () => {
- console.log("READY!", zip.entriesCount);
- for (const entry of Object.values(zip.entries()) as any[]) {
- const desc = entry.isDirectory ? 'directory' : `${entry.size} bytes`;
- console.log(`Entry ${entry.name}: ${desc}`);
- }
- let body = "";
- zip.stream("word/document.xml", (error: any, stream: any) => {
- if (error) {
- reject(error);
- }
- stream.on('data', (chunk: any) => body += chunk.toString());
- stream.on('end', () => {
- resolve(body);
- zip.close();
- });
- });
- });
- });
-}
-
-export async function extract(path: string) {
- const contents = await open(path);
- let body = "";
- const components = contents.toString().split('<w:t');
- for (const component of components) {
- const tags = component.split('>');
- console.log(tags[1]);
- const content = tags[1].replace(/<.*$/, "");
- body += content;
- }
- return body;
-}
-
-async function parse(): Promise<string[]> {
- const sourceDirectory = resolve(`${__dirname}/source`);
- const candidates = readdirSync(sourceDirectory).filter(file => file.endsWith(".doc") || file.endsWith(".docx")).map(file => `${sourceDirectory}/${file}`);
- await extract(candidates[0]);
- try {
- return Promise.all(candidates.map(extract));
- } catch {
- return [];
- }
-}
-
-parse(); \ No newline at end of file
diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py
index ec9c3f72c..1441a8621 100644
--- a/src/scraping/buxton/scraper.py
+++ b/src/scraping/buxton/scraper.py
@@ -10,7 +10,6 @@ import uuid
import datetime
from PIL import Image
import math
-import sys
source = "./source"
filesPath = "../../server/public/files"
@@ -116,8 +115,8 @@ def write_collection(parse_results, display_fields, storage_key, viewType):
target_collection.insert_one(view_doc)
data_doc_guid = data_doc["_id"]
- print(f"inserted view document ({view_doc_guid})")
- print(f"inserted data document ({data_doc_guid})\n")
+ # print(f"inserted view document ({view_doc_guid})")
+ # print(f"inserted data document ({data_doc_guid})\n")
return view_doc_guid
@@ -189,8 +188,8 @@ def write_image(folder, name):
"y": 10,
"_width": min(800, native_width),
"zIndex": 2,
- "widthUnit": "*",
- "widthMagnitude": 1
+ "dimUnit": "*",
+ "dimMagnitude": 1
},
"__type": "Doc"
}
@@ -234,7 +233,7 @@ def parse_document(file_name: str):
result = {}
dir_path = image_dist + "/" + pure_name
- print(dir_path)
+ # print(dir_path)
mkdir_if_absent(dir_path)
raw = str(docx2txt.process(source + "/" + file_name, dir_path))
@@ -253,13 +252,15 @@ def parse_document(file_name: str):
medium = dir_path + "/" + image.replace(".", "_m.", 1)
copyfile(resolved, original)
copyfile(resolved, medium)
- print(f"extracted {count} images...")
+ # print(f"extracted {count} images...")
def sanitize(line): return re.sub("[\n\t]+", "", line).replace(u"\u00A0", " ").replace(
u"\u2013", "-").replace(u"\u201c", '''"''').replace(u"\u201d", '''"''').strip()
def sanitize_price(raw: str):
raw = raw.replace(",", "")
+ if "x" in raw.lower():
+ return None
start = raw.find("$")
if start > -1:
i = start + 1
@@ -274,6 +275,14 @@ def parse_document(file_name: str):
def remove_empty(line): return len(line) > 1
+    def try_parse(to_parse: str):
+        """Return int(to_parse), or None when int() raises ValueError."""
+        # Callers pass text taken from the document, e.g. a year or "N/A".
+        try:
+            return int(to_parse)
+        except ValueError:
+            return None
+
lines = list(map(sanitize, raw.split("\n")))
lines = list(filter(remove_empty, lines))
@@ -293,13 +302,13 @@ def parse_document(file_name: str):
clean = list(
map(lambda data: data.strip().split(":"), lines[cur].split("|")))
result["company"] = clean[0][len(clean[0]) - 1].strip()
- result["year"] = clean[1][len(clean[1]) - 1].strip()
+ result["year"] = try_parse(clean[1][len(clean[1]) - 1].strip())
result["original_price"] = sanitize_price(
clean[2][len(clean[2]) - 1].strip())
cur += 1
- result["degrees_of_freedom"] = extract_value(
- lines[cur]).replace("NA", "N/A")
+ result["degrees_of_freedom"] = try_parse(extract_value(
+ lines[cur]).replace("NA", "N/A"))
cur += 1
dimensions = lines[cur].lower()
@@ -351,7 +360,7 @@ def parse_document(file_name: str):
if len(notes) > 0:
result["notes"] = listify(notes)
- print("writing child schema...")
+ # print("writing child schema...")
return {
"schema": {
@@ -383,7 +392,7 @@ def write_common_proto():
if os.path.exists(image_dist):
- shutil.rmtree(image_dist)
+ shutil.rmtree(image_dist, True)
while os.path.exists(image_dist):
pass
os.mkdir(image_dist)
@@ -393,7 +402,7 @@ common_proto_id = write_common_proto()
candidates = 0
for file_name in os.listdir(source):
- if file_name.endswith('.docx'):
+ if file_name.endswith('.docx') or file_name.endswith('.doc'):
candidates += 1
schema_guids.append(write_collection(
parse_document(file_name), ["title", "data"], "data", 5))
@@ -406,7 +415,7 @@ parent_guid = write_collection({
"__type": "Doc"
},
"child_guids": schema_guids
-}, ["title", "short_description", "original_price"], "data", 2)
+}, ["title", "short_description", "original_price"], "data", 4)
print("appending parent schema to main workspace...\n")
target_collection.update_one(
diff --git a/src/scraping/buxton/source/Bill_Notes_3_button_optical_mouse.docx b/src/scraping/buxton/source/Bill_Notes_3_button_optical_mouse.docx
deleted file mode 100644
index a2ab04b78..000000000
--- a/src/scraping/buxton/source/Bill_Notes_3_button_optical_mouse.docx
+++ /dev/null
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Amazon_Kindle_Keyboard.docx b/src/scraping/buxton/source/Bill_Notes_Amazon_Kindle_Keyboard.docx
deleted file mode 100644
index e4375ebeb..000000000
--- a/src/scraping/buxton/source/Bill_Notes_Amazon_Kindle_Keyboard.docx
+++ /dev/null
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Apple_Adj_Keyboard.docx b/src/scraping/buxton/source/Bill_Notes_Apple_Adj_Keyboard.docx
deleted file mode 100644
index 99f7ad19d..000000000
--- a/src/scraping/buxton/source/Bill_Notes_Apple_Adj_Keyboard.docx
+++ /dev/null
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Apple_Mac_Portable.docx b/src/scraping/buxton/source/Bill_Notes_Apple_Mac_Portable.docx
deleted file mode 100644
index df1aafe9c..000000000
--- a/src/scraping/buxton/source/Bill_Notes_Apple_Mac_Portable.docx
+++ /dev/null
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_BAT.docx b/src/scraping/buxton/source/Bill_Notes_BAT.docx
deleted file mode 100644
index 0e3368611..000000000
--- a/src/scraping/buxton/source/Bill_Notes_BAT.docx
+++ /dev/null
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx b/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx
deleted file mode 100644
index 06094b4d3..000000000
--- a/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx
+++ /dev/null
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Braun_T3.docx b/src/scraping/buxton/source/Bill_Notes_Braun_T3.docx
deleted file mode 100644
index b00080e08..000000000
--- a/src/scraping/buxton/source/Bill_Notes_Braun_T3.docx
+++ /dev/null
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_CasioC801.docx b/src/scraping/buxton/source/Bill_Notes_CasioC801.docx
deleted file mode 100644
index 510a006e0..000000000
--- a/src/scraping/buxton/source/Bill_Notes_CasioC801.docx
+++ /dev/null
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Casio_CZ-101.docx b/src/scraping/buxton/source/Bill_Notes_Casio_CZ-101.docx
deleted file mode 100644
index c8d3943c0..000000000
--- a/src/scraping/buxton/source/Bill_Notes_Casio_CZ-101.docx
+++ /dev/null
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Casio_Mini.docx b/src/scraping/buxton/source/Bill_Notes_Casio_Mini.docx
deleted file mode 100644
index cea9e7b69..000000000
--- a/src/scraping/buxton/source/Bill_Notes_Casio_Mini.docx
+++ /dev/null
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_FingerWorks_Prototype.docx b/src/scraping/buxton/source/Bill_Notes_FingerWorks_Prototype.docx
deleted file mode 100644
index f53402a06..000000000
--- a/src/scraping/buxton/source/Bill_Notes_FingerWorks_Prototype.docx
+++ /dev/null
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Fingerworks_TouchStream.docx b/src/scraping/buxton/source/Bill_Notes_Fingerworks_TouchStream.docx
deleted file mode 100644
index 0eec89949..000000000
--- a/src/scraping/buxton/source/Bill_Notes_Fingerworks_TouchStream.docx
+++ /dev/null
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_FrogPad.docx b/src/scraping/buxton/source/Bill_Notes_FrogPad.docx
deleted file mode 100644
index d01e1bf5c..000000000
--- a/src/scraping/buxton/source/Bill_Notes_FrogPad.docx
+++ /dev/null
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx b/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx
deleted file mode 100644
index b9a30c8a9..000000000
--- a/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx
+++ /dev/null
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx b/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx
deleted file mode 100644
index 0615c4953..000000000
--- a/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx
+++ /dev/null
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Kindle_3G_lighted_cover.docx b/src/scraping/buxton/source/Bill_Notes_Kindle_3G_lighted_cover.docx
deleted file mode 100644
index f00fcb772..000000000
--- a/src/scraping/buxton/source/Bill_Notes_Kindle_3G_lighted_cover.docx
+++ /dev/null
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Matias.docx b/src/scraping/buxton/source/Bill_Notes_Matias.docx
deleted file mode 100644
index d2d014bbe..000000000
--- a/src/scraping/buxton/source/Bill_Notes_Matias.docx
+++ /dev/null
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Microwriter.docx b/src/scraping/buxton/source/Bill_Notes_Microwriter.docx
deleted file mode 100644
index 3ac272e42..000000000
--- a/src/scraping/buxton/source/Bill_Notes_Microwriter.docx
+++ /dev/null
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_MousePen.docx b/src/scraping/buxton/source/Bill_Notes_MousePen.docx
deleted file mode 100644
index cd0b3eab3..000000000
--- a/src/scraping/buxton/source/Bill_Notes_MousePen.docx
+++ /dev/null
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_NB75D.docx b/src/scraping/buxton/source/Bill_Notes_NB75D.docx
deleted file mode 100644
index a5a5e3d90..000000000
--- a/src/scraping/buxton/source/Bill_Notes_NB75D.docx
+++ /dev/null
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx b/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx
deleted file mode 100644
index c0cf6ba9a..000000000
--- a/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx
+++ /dev/null
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_PARCtab.doc b/src/scraping/buxton/source/Bill_Notes_PARCtab.doc
deleted file mode 100644
index 3cdc2d21b..000000000
--- a/src/scraping/buxton/source/Bill_Notes_PARCtab.doc
+++ /dev/null
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Philco_Mystery_Control.docx b/src/scraping/buxton/source/Bill_Notes_Philco_Mystery_Control.docx
deleted file mode 100644
index af72fa662..000000000
--- a/src/scraping/buxton/source/Bill_Notes_Philco_Mystery_Control.docx
+++ /dev/null
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_TASA_Kbd.docx b/src/scraping/buxton/source/Bill_Notes_TASA_Kbd.docx
deleted file mode 100644
index 5c2eb8d7f..000000000
--- a/src/scraping/buxton/source/Bill_Notes_TASA_Kbd.docx
+++ /dev/null
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_The_Tap.docx b/src/scraping/buxton/source/Bill_Notes_The_Tap.docx
deleted file mode 100644
index c9ee2eaea..000000000
--- a/src/scraping/buxton/source/Bill_Notes_The_Tap.docx
+++ /dev/null
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_Twiddler.docx b/src/scraping/buxton/source/Bill_Notes_Twiddler.docx
deleted file mode 100644
index 27b4acc85..000000000
--- a/src/scraping/buxton/source/Bill_Notes_Twiddler.docx
+++ /dev/null
Binary files differ
diff --git a/src/scraping/buxton/source/Bill_Notes_orbiTouch.doc b/src/scraping/buxton/source/Bill_Notes_orbiTouch.doc
deleted file mode 100644
index 6bd71f20e..000000000
--- a/src/scraping/buxton/source/Bill_Notes_orbiTouch.doc
+++ /dev/null
Binary files differ