From d564601da06b696f59b97bf162fa52354d49f8c9 Mon Sep 17 00:00:00 2001 From: Sam Wilkins Date: Wed, 26 Jun 2019 14:55:38 -0400 Subject: scraping --- src/scraping/buxton/scraper.py | 331 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 331 insertions(+) create mode 100644 src/scraping/buxton/scraper.py (limited to 'src/scraping/buxton/scraper.py') diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py new file mode 100644 index 000000000..97af10519 --- /dev/null +++ b/src/scraping/buxton/scraper.py @@ -0,0 +1,331 @@ +import os +import docx2txt +from docx import Document +from docx.opc.constants import RELATIONSHIP_TYPE as RT +import re +from pymongo import MongoClient +import shutil +import uuid +import datetime +from PIL import Image +import math +import sys + +source = "./source" +dist = "../../server/public/files" + +db = MongoClient("localhost", 27017)["Dash"] +schema_guids = [] + + +def extract_links(fileName): + links = [] + doc = Document(fileName) + rels = doc.part.rels + for rel in rels: + item = rels[rel] + if item.reltype == RT.HYPERLINK and ".aspx" not in item._target: + links.append(item._target) + return listify(links) + + +def extract_value(kv_string): + pieces = kv_string.split(":") + return (pieces[1] if len(pieces) > 1 else kv_string).strip() + + +def mkdir_if_absent(path): + try: + if not os.path.exists(path): + os.mkdir(path) + except OSError: + print("failed to create the appropriate directory structures for %s" % file_name) + + +def guid(): + return str(uuid.uuid4()) + + +def listify(list): + return { + "fields": list, + "__type": "list" + } + + +def protofy(fieldId): + return { + "fieldId": fieldId, + "__type": "proxy" + } + + +def write_schema(parse_results, display_fields): + view_guids = parse_results["child_guids"] + + data_doc = parse_results["schema"] + fields = data_doc["fields"] + + view_doc_guid = guid() + + view_doc = { + "_id": view_doc_guid, + "fields": { + "proto": protofy(data_doc["_id"]), + "x": 10, + "y": 10, + "width": 900, + "height": 600, + "panX": 0, + "panY": 0, + "zoomBasis": 0.5, + "zIndex": 2, + "libraryBrush": False, + "viewType": 2 + }, + "__type": "Doc" + } + + fields["proto"] = protofy("collectionProto") + fields["data"] = listify(proxify_guids(view_guids)) + fields["schemaColumns"] = listify(display_fields) + fields["backgroundColor"] = "white" + fields["scale"] = 0.5 + fields["viewType"] = 2 + fields["author"] = "Bill Buxton" + fields["creationDate"] = { + "date": datetime.datetime.utcnow().microsecond, + "__type": "date" + } + fields["isPrototype"] = True + fields["page"] = -1 + + db.newDocuments.insert_one(data_doc) + db.newDocuments.insert_one(view_doc) + + data_doc_guid = data_doc["_id"] + print(f"inserted view document ({view_doc_guid})") + print(f"inserted data document ({data_doc_guid})\n") + + return view_doc_guid + + +def write_image(folder, name): + path = f"http://localhost:1050/files/{folder}/{name}" + + data_doc_guid = guid() + view_doc_guid = guid() + + view_doc = { + "_id": view_doc_guid, + "fields": { + "proto": protofy(data_doc_guid), + "x": 10, + "y": 10, + "width": 300, + "zIndex": 2, + "libraryBrush": False + }, + "__type": "Doc" + } + + image = Image.open(f"{dist}/{folder}/{name}") + native_width, native_height = image.size + + data_doc = { + "_id": data_doc_guid, + "fields": { + "proto": protofy("imageProto"), + "data": { + "url": path, + "__type": "image" + }, + "title": name, + "nativeWidth": native_width, + "author": "Bill Buxton", + "creationDate": { + "date": datetime.datetime.utcnow().microsecond, + "__type": "date" + }, + "isPrototype": True, + "page": -1, + "nativeHeight": native_height, + "height": native_height + }, + "__type": "Doc" + } + + db.newDocuments.insert_one(view_doc) + db.newDocuments.insert_one(data_doc) + + return view_doc_guid + + +def parse_document(file_name: str): + print(f"parsing {file_name}...") + pure_name = file_name.split(".")[0] + + result = {} + + dir_path = dist + "/" + pure_name + mkdir_if_absent(dir_path) + + raw = str(docx2txt.process(source + "/" + file_name, dir_path)) + + view_guids = [] + count = 0 + for image in os.listdir(dir_path): + count += 1 + view_guids.append(write_image(pure_name, image)) + os.rename(dir_path + "/" + image, dir_path + + "/" + image.replace(".", "_m.", 1)) + print(f"extracted {count} images...") + + def sanitize(line): return re.sub("[\n\t]+", "", line).replace(u"\u00A0", " ").replace( + u"\u2013", "-").replace(u"\u201c", '''"''').replace(u"\u201d", '''"''').strip() + + def sanitize_price(raw: str): + raw = raw.replace(",", "") + start = raw.find("$") + if start > -1: + i = start + 1 + while (i < len(raw) and re.match(r"[0-9\.]", raw[i])): + i += 1 + price = raw[start + 1: i + 1] + return float(price) + elif (raw.lower().find("nfs")): + return -1 + else: + return math.nan + + def remove_empty(line): return len(line) > 1 + + lines = list(map(sanitize, raw.split("\n"))) + lines = list(filter(remove_empty, lines)) + + result["file_name"] = file_name + result["title"] = lines[2].strip() + result["short_description"] = lines[3].strip().replace( + "Short Description: ", "") + + cur = 5 + notes = "" + while lines[cur] != "Device Details": + notes += lines[cur] + " " + cur += 1 + result["buxton_notes"] = notes.strip() + + cur += 1 + clean = list( + map(lambda data: data.strip().split(":"), lines[cur].split("|"))) + result["company"] = clean[0][len(clean[0]) - 1].strip() + result["year"] = clean[1][len(clean[1]) - 1].strip() + result["original_price"] = sanitize_price( + clean[2][len(clean[2]) - 1].strip()) + + cur += 1 + result["degrees_of_freedom"] = extract_value( + lines[cur]).replace("NA", "N/A") + cur += 1 + + dimensions = lines[cur].lower() + if dimensions.startswith("dimensions"): + dim_concat = dimensions[11:].strip() + cur += 1 + while lines[cur] != "Key Words": + dim_concat += (" " + lines[cur].strip()) + cur += 1 + result["dimensions"] = dim_concat + else: + result["dimensions"] = "N/A" + + cur += 1 + result["primary_key"] = extract_value(lines[cur]) + cur += 1 + result["secondary_key"] = extract_value(lines[cur]) + + while lines[cur] != "Links": + result["secondary_key"] += (" " + extract_value(lines[cur]).strip()) + cur += 1 + + cur += 1 + link_descriptions = [] + while lines[cur] != "Image": + link_descriptions.append(lines[cur].strip()) + cur += 1 + result["link_descriptions"] = listify(link_descriptions) + + result["hyperlinks"] = extract_links(source + "/" + file_name) + + images = [] + captions = [] + cur += 3 + while cur + 1 < len(lines) and lines[cur] != "NOTES:": + images.append(lines[cur]) + captions.append(lines[cur + 1]) + cur += 2 + result["images"] = listify(images) + result["captions"] = listify(captions) + + notes = [] + if (cur < len(lines) and lines[cur] == "NOTES:"): + cur += 1 + while cur < len(lines): + notes.append(lines[cur]) + cur += 1 + if len(notes) > 0: + result["notes"] = listify(notes) + + print("writing child schema...") + + return { + "schema": { + "_id": guid(), + "fields": result, + "__type": "Doc" + }, + "child_guids": view_guids + } + + +def proxify_guids(guids): + return list(map(lambda guid: {"fieldId": guid, "__type": "proxy"}, guids)) + + +if os.path.exists(dist): + shutil.rmtree(dist) +while os.path.exists(dist): + pass +os.mkdir(dist) +mkdir_if_absent(source) + +candidates = 0 +for file_name in os.listdir(source): + if file_name.endswith('.docx'): + candidates += 1 + schema_guids.append(write_schema( + parse_document(file_name), ["title", "data"])) + +print("writing parent schema...") +parent_guid = write_schema({ + "schema": { + "_id": guid(), + "fields": {}, + "__type": "Doc" + }, + "child_guids": schema_guids +}, ["title", "short_description", "original_price"]) + +print("appending parent schema to main workspace...\n") +db.newDocuments.update_one( + {"fields.title": "WS collection 1"}, + {"$push": {"fields.data.fields": {"fieldId": parent_guid, "__type": "proxy"}}} +) + +print("rewriting .gitignore...\n") +lines = ['*', '!.gitignore'] +with open(dist + "/.gitignore", 'w') as f: + f.write('\n'.join(lines)) + +suffix = "" if candidates == 1 else "s" +print(f"conversion complete. {candidates} candidate{suffix} processed.") -- cgit v1.2.3-70-g09d2 From 18b568ce20b66c4e16521c043df804279a5cd163 Mon Sep 17 00:00:00 2001 From: Sam Wilkins Date: Wed, 26 Jun 2019 23:19:12 -0400 Subject: implemented drag drop template from key value key selection --- src/client/views/nodes/KeyValueBox.scss | 6 +-- src/client/views/nodes/KeyValueBox.tsx | 72 ++++++++++++++++++++++++++++++-- src/client/views/nodes/KeyValuePair.scss | 39 +++++++++++++---- src/client/views/nodes/KeyValuePair.tsx | 62 ++++++++++++++++++++------- src/scraping/buxton/scraper.py | 8 ++-- 5 files changed, 153 insertions(+), 34 deletions(-) (limited to 'src/scraping/buxton/scraper.py') diff --git a/src/client/views/nodes/KeyValueBox.scss b/src/client/views/nodes/KeyValueBox.scss index 20cae03d4..87a9565e8 100644 --- a/src/client/views/nodes/KeyValueBox.scss +++ b/src/client/views/nodes/KeyValueBox.scss @@ -91,12 +91,12 @@ $header-height: 30px; width: 4px; float: left; height: 30px; - width: 10px; + width: 5px; z-index: 20; right: 0; top: 0; - border-radius: 10px; - background: gray; + border-radius: 0; + background: black; pointer-events: all; } .keyValueBox-dividerDragger{ diff --git a/src/client/views/nodes/KeyValueBox.tsx b/src/client/views/nodes/KeyValueBox.tsx index cd65c42bc..4beb70284 100644 --- a/src/client/views/nodes/KeyValueBox.tsx +++ b/src/client/views/nodes/KeyValueBox.tsx @@ -7,13 +7,23 @@ import { FieldView, FieldViewProps } from './FieldView'; import "./KeyValueBox.scss"; import { KeyValuePair } from "./KeyValuePair"; import React = require("react"); -import { NumCast, Cast, FieldValue } from "../../../new_fields/Types"; -import { Doc, Field } from "../../../new_fields/Doc"; +import { NumCast, Cast, FieldValue, StrCast } from "../../../new_fields/Types"; +import { Doc, Field, FieldResult } from "../../../new_fields/Doc"; import { ComputedField } from "../../../new_fields/ScriptField"; +import { SetupDrag } from "../../util/DragManager"; +import { Docs } from "../../documents/Documents"; +import { RawDataOperationParameters } from "../../northstar/model/idea/idea"; +import { Templates } from "../Templates"; +import { List } from "../../../new_fields/List"; +import { TextField } from "../../util/ProsemirrorCopy/prompt"; +import { RichTextField } from "../../../new_fields/RichTextField"; +import { ImageField } from "../../../new_fields/URLField"; @observer export class KeyValueBox extends React.Component { private _mainCont = React.createRef(); + private _keyHeader = React.createRef(); + @observable private rows: KeyValuePair[] = []; public static LayoutString(fieldStr: string = "data") { return FieldView.LayoutString(KeyValueBox, fieldStr); } @observable private _keyInput: string = ""; @@ -90,7 +100,7 @@ export class KeyValueBox extends React.Component { let rows: JSX.Element[] = []; let i = 0; for (let key of Object.keys(ids).sort()) { - rows.push(); + rows.push( { if (el) this.rows.push(el); }} keyWidth={100 - this.splitPercentage} rowStyle={"keyValueBox-" + (i++ % 2 ? "oddRow" : "evenRow")} key={key} keyName={key} />); } return rows; } @@ -134,6 +144,58 @@ export class KeyValueBox extends React.Component { document.addEventListener('pointerup', this.onDividerUp); } + getTemplate = async () => { + let parent = Docs.FreeformDocument([], { width: 800, height: 800, title: "Template" }); + for (let row of this.rows.filter(row => row.isChecked)) { + await this.createTemplateField(parent, row); + row.uncheck(); + } + return parent; + } + + createTemplateField = async (parent: Doc, row: KeyValuePair) => { + let collectionKeyProp = `fieldKey={"data"}`; + let metaKey = row.props.keyName; + let metaKeyProp = `fieldKey={"${metaKey}"}`; + + let sourceDoc = await Cast(this.props.Document.data, Doc); + if (!sourceDoc) { + return; + } + let target = this.inferType(sourceDoc[metaKey], metaKey); + + let template = Doc.MakeAlias(target); + template.proto = parent; + template.title = metaKey; + template.nativeWidth = 300; + template.nativeHeight = 300; + template.embed = true; + template.isTemplate = true; + template.templates = new List([Templates.TitleBar(metaKey)]); + if (target.backgroundLayout) { + let metaAnoKeyProp = `fieldKey={"${metaKey}"} fieldExt={"annotations"}`; + let collectionAnoKeyProp = `fieldKey={"annotations"}`; + template.layout = StrCast(target.layout).replace(collectionAnoKeyProp, metaAnoKeyProp); + template.backgroundLayout = StrCast(target.backgroundLayout).replace(collectionKeyProp, metaKeyProp); + } else { + template.layout = StrCast(target.layout).replace(collectionKeyProp, metaKeyProp); + } + Doc.AddDocToList(parent, "data", template); + row.uncheck(); + } + + inferType = (field: FieldResult, metaKey: string) => { + let options = { width: 300, height: 300, title: metaKey }; + if (field instanceof RichTextField || typeof field === "string" || typeof field === "number") { + return Docs.TextDocument(options); + } else if (field instanceof List) { + return Docs.FreeformDocument([], options); + } else if (field instanceof ImageField) { + return Docs.ImageDocument("https://www.freepik.com/free-icon/picture-frame-with-mountain-image_748687.htm", options); + } + return new Doc; + } + render() { let dividerDragger = this.splitPercentage === 0 ? (null) :
@@ -144,7 +206,9 @@ export class KeyValueBox extends React.Component { - + {this.createTable()} diff --git a/src/client/views/nodes/KeyValuePair.scss b/src/client/views/nodes/KeyValuePair.scss index a1c5d5537..f78767234 100644 --- a/src/client/views/nodes/KeyValuePair.scss +++ b/src/client/views/nodes/KeyValuePair.scss @@ -3,6 +3,7 @@ .keyValuePair-td-key { display:inline-block; + .keyValuePair-td-key-container{ width:100%; height:100%; @@ -10,14 +11,23 @@ flex-direction: row; flex-wrap: nowrap; justify-content: space-between; + align-items: center; .keyValuePair-td-key-delete{ position: relative; background-color: transparent; color:red; } + .keyValuePair-td-key-check { + position: relative; + margin: 0; + } .keyValuePair-keyField { width:100%; - text-align: center; + margin-left: 20px; + margin-top: -1px; + font-family: monospace; + // text-align: center; + align-self: center; position: relative; overflow: auto; } @@ -26,12 +36,25 @@ .keyValuePair-td-value { display:inline-block; overflow: scroll; - img { - max-height: 36px; - width: auto; - } - .videoBox-cont{ - width: auto; - max-height: 36px; + font-family: monospace; + height: 30px; + .keyValuePair-td-value-container { + display: flex; + align-items: center; + align-content: center; + flex-direction: row; + justify-content: space-between; + flex-wrap: nowrap; + width: 100%; + height: 100%; + + img { + max-height: 36px; + width: auto; + } + .videoBox-cont{ + width: auto; + max-height: 36px; + } } } \ No newline at end of file diff --git a/src/client/views/nodes/KeyValuePair.tsx b/src/client/views/nodes/KeyValuePair.tsx index ede4e3858..b5db52ac7 100644 --- a/src/client/views/nodes/KeyValuePair.tsx +++ b/src/client/views/nodes/KeyValuePair.tsx @@ -12,6 +12,7 @@ import React = require("react"); import { Doc, Opt, Field } from '../../../new_fields/Doc'; import { FieldValue } from '../../../new_fields/Types'; import { KeyValueBox } from './KeyValueBox'; +import { DragManager, SetupDrag } from '../../util/DragManager'; // Represents one row in a key value plane @@ -23,6 +24,20 @@ export interface KeyValuePairProps { } @observer export class KeyValuePair extends React.Component { + @observable private isPointerOver = false; + @observable public isChecked = false; + private checkbox = React.createRef(); + + @action + handleCheck = (e: React.ChangeEvent) => { + this.isChecked = e.currentTarget.checked; + } + + @action + uncheck = () => { + this.checkbox.current!.checked = false; + this.isChecked = false; + } render() { let props: FieldViewProps = { @@ -44,12 +59,16 @@ export class KeyValuePair extends React.Component { addDocTab: returnZero, }; let contents = ; - let fieldKey = Object.keys(props.Document).indexOf(props.fieldKey) !== -1 ? props.fieldKey : "(" + props.fieldKey + ")"; + // let fieldKey = Object.keys(props.Document).indexOf(props.fieldKey) !== -1 ? props.fieldKey : "(" + props.fieldKey + ")"; + let keyStyle = Object.keys(props.Document).indexOf(props.fieldKey) !== -1 ? "black" : "blue"; + + let hover = { transition: "0.3s ease opacity", opacity: this.isPointerOver || this.isChecked ? 1 : 0 }; + return ( - + this.isPointerOver = true)} onPointerLeave={action(() => this.isPointerOver = false)}> + let field = FieldValue(props.Document[props.fieldKey]); + if (Field.IsField(field)) { + return (onDelegate ? "=" : "") + Field.toScriptString(field); + } + return ""; + }} + SetValue={(value: string) => + KeyValueBox.SetField(props.Document, props.fieldKey, value)}> + + + ); } diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py index 97af10519..fcffeac13 100644 --- a/src/scraping/buxton/scraper.py +++ b/src/scraping/buxton/scraper.py @@ -60,7 +60,7 @@ def protofy(fieldId): } -def write_schema(parse_results, display_fields): +def write_schema(parse_results, display_fields, storage_key): view_guids = parse_results["child_guids"] data_doc = parse_results["schema"] @@ -87,7 +87,7 @@ def write_schema(parse_results, display_fields): } fields["proto"] = protofy("collectionProto") - fields["data"] = listify(proxify_guids(view_guids)) + fields[storage_key] = listify(proxify_guids(view_guids)) fields["schemaColumns"] = listify(display_fields) fields["backgroundColor"] = "white" fields["scale"] = 0.5 @@ -304,7 +304,7 @@ for file_name in os.listdir(source): if file_name.endswith('.docx'): candidates += 1 schema_guids.append(write_schema( - parse_document(file_name), ["title", "data"])) + parse_document(file_name), ["title", "data"], "image_data")) print("writing parent schema...") parent_guid = write_schema({ @@ -314,7 +314,7 @@ parent_guid = write_schema({ "__type": "Doc" }, "child_guids": schema_guids -}, ["title", "short_description", "original_price"]) +}, ["title", "short_description", "original_price"], "data") print("appending parent schema to main workspace...\n") db.newDocuments.update_one( -- cgit v1.2.3-70-g09d2 From 522ec4097d6f08c6a1025dac37f68152c47df339 Mon Sep 17 00:00:00 2001 From: Sam Wilkins <35748010+samwilkins333@users.noreply.github.com> Date: Thu, 27 Jun 2019 12:41:09 -0400 Subject: scraper image width fixes --- src/scraping/buxton/scraper.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'src/scraping/buxton/scraper.py') diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py index fcffeac13..8766a54fd 100644 --- a/src/scraping/buxton/scraper.py +++ b/src/scraping/buxton/scraper.py @@ -116,22 +116,22 @@ def write_image(folder, name): data_doc_guid = guid() view_doc_guid = guid() + image = Image.open(f"{dist}/{folder}/{name}") + native_width, native_height = image.size + view_doc = { "_id": view_doc_guid, "fields": { "proto": protofy(data_doc_guid), "x": 10, "y": 10, - "width": 300, + "width": min(800, native_width), "zIndex": 2, "libraryBrush": False }, "__type": "Doc" } - image = Image.open(f"{dist}/{folder}/{name}") - native_width, native_height = image.size - data_doc = { "_id": data_doc_guid, "fields": { -- cgit v1.2.3-70-g09d2
KeyKey Fields
- -
{fieldKey}
+ +
{props.fieldKey}
- { - const onDelegate = Object.keys(props.Document).includes(props.fieldKey); +
+ { + const onDelegate = Object.keys(props.Document).includes(props.fieldKey); - let field = FieldValue(props.Document[props.fieldKey]); - if (Field.IsField(field)) { - return (onDelegate ? "=" : "") + Field.toScriptString(field); - } - return ""; - }} - SetValue={(value: string) => - KeyValueBox.SetField(props.Document, props.fieldKey, value)}> -