diff options
author | Sam Wilkins <samuel_wilkins@brown.edu> | 2019-06-18 22:02:58 -0400 |
---|---|---|
committer | Sam Wilkins <samuel_wilkins@brown.edu> | 2019-06-18 22:02:58 -0400 |
commit | 4bec1d89eff45d6dcbb4041bc211db88d9da1c8f (patch) | |
tree | f2aa63e3202e2eab05eda2fa998661853e832cdc | |
parent | de0304b2966ebdede9d9db8c510e19020046115c (diff) |
fixed serialization typo and added draft of python word doc scraper to git directory for safety
-rw-r--r-- | src/buxton/scraper.py | 128 | ||||
-rw-r--r-- | src/client/DocServer.ts | 2 | ||||
-rw-r--r-- | src/client/util/SerializationHelper.ts | 4 | ||||
-rw-r--r-- | src/new_fields/util.ts | 4 |
4 files changed, 133 insertions, 5 deletions
# Reconstructed from the collapsed "new file" diff of src/buxton/scraper.py
# (diff --git a/src/buxton/scraper.py b/src/buxton/scraper.py,
#  new file mode 100644, index 000000000..0abebb485).
"""Scrape Word documents into the ``buxton`` MongoDB collection.

Every ``.docx`` file found in ``source`` is converted to plain text with
docx2txt, parsed line by line into a flat dictionary of fields and inserted
into the ``buxton`` collection of the ``Dash`` database.  Each document gets
its own sub-directory of ``dist``, which is passed to docx2txt as the image
output directory.

The parser relies on a fixed line layout (title on the third non-empty line,
``Device Details`` / ``Image`` / ``NOTES:`` marker lines, ...); the offsets
used below are taken as-is from the original draft.
"""

import os
import re
import shutil
import time
import uuid

import docx2txt
from docx import Document
from docx.opc.constants import RELATIONSHIP_TYPE as RT
from pymongo import MongoClient

source = "./source"
dist = "./Dash-Web/src/server/public/files"

collection_handle = MongoClient("localhost", 27017)["Dash"]["buxton"]

# Single-pass character clean-up applied to every extracted line: tabs and
# newlines are dropped, typographic characters become their ASCII equivalents.
_CHAR_FIXES = str.maketrans({
    "\n": None,
    "\t": None,
    u"\u00A0": " ",   # no-break space
    u"\u2013": "-",   # en dash
    u"\u201c": '"',   # left double quotation mark
    u"\u201d": '"',   # right double quotation mark
})


def extract_links(fileName):
    """Return the hyperlink targets referenced by a Word document.

    Args:
        fileName: Path of the ``.docx`` file to inspect.

    Returns:
        list: The target of every hyperlink relationship of the main document
        part, in relationship order, excluding targets containing ``.aspx``.
    """
    rels = Document(fileName).part.rels
    # ``target_ref`` is the public accessor; for external relationships such
    # as hyperlinks it yields the same URL string the private ``_target`` held.
    return [
        rel.target_ref
        for rel in rels.values()
        if rel.reltype == RT.HYPERLINK and ".aspx" not in rel.target_ref
    ]


def extract_value(kv_string):
    """Return the value part of a ``"key: value"`` string, stripped.

    Only the first colon separates key from value, so a value that itself
    contains colons (a ratio, a URL, a time) is returned whole instead of
    being cut at its first colon.

    Raises:
        IndexError: If ``kv_string`` contains no colon at all.
    """
    return kv_string.split(":", 1)[1].strip()


def mkdir_if_absent(path):
    """Create directory ``path``, including missing parents, if needed.

    Creation is best-effort: a failure is reported on stdout and not raised.
    """
    try:
        os.makedirs(path, exist_ok=True)
    except OSError:
        # Report the directory that actually failed (the draft formatted an
        # unrelated global into this message).
        print("Failed to create the appropriate directory structures for %s" % path)


def _sanitize(line):
    """Normalise one raw text line (see ``_CHAR_FIXES``) and trim it."""
    return line.translate(_CHAR_FIXES).strip()


def _collect_until(lines, start, marker):
    """Return ``(lines[start:end], end)`` where ``lines[end] == marker``.

    Raises:
        IndexError: If ``marker`` does not occur at or after ``start``.  The
            type matches what running off the end of the list used to raise,
            but the message now names the missing marker.
    """
    try:
        end = lines.index(marker, start)
    except ValueError:
        raise IndexError(
            "section marker %r not found at or after line %d" % (marker, start)
        ) from None
    return lines[start:end], end


def parse_document(file_name: str):
    """Parse one document from ``source`` into a dictionary of fields.

    Args:
        file_name: Name of the ``.docx`` file, relative to ``source``.

    Returns:
        dict: Keys, in insertion order: ``file_name``, ``title``,
        ``short_description``, ``buxton_notes``, ``company``, ``year``,
        ``original_price``, ``degrees_of_freedom``, ``dimensions``,
        ``primary_key``, ``secondary_key``, ``hyperlinks``,
        ``link_descriptions``, ``images``, ``captions`` and ``notes``.

    Raises:
        IndexError: If the document does not follow the expected line layout
            (missing marker line, missing colon, too few lines).
    """
    # splitext keeps dotted names intact ("a.b.docx" -> "a.b"), so two
    # documents can no longer end up sharing one image directory.
    pure_name = os.path.splitext(file_name)[0]
    doc_path = os.path.join(source, file_name)
    dir_path = os.path.join(dist, pure_name)
    mkdir_if_absent(dir_path)

    raw = str(docx2txt.process(doc_path, dir_path))

    lines = [_sanitize(line) for line in raw.split("\n")]
    # Lines of zero or one character are discarded.
    lines = [line for line in lines if len(line) > 1]

    result = {
        "file_name": file_name,
        "title": lines[2],
        "short_description": lines[3].replace("Short Description: ", ""),
    }

    # Free-text notes run from line 5 up to the "Device Details" marker.
    note_lines, cur = _collect_until(lines, 5, "Device Details")
    result["buxton_notes"] = " ".join(note_lines).strip()

    # The line after the marker holds "key: value | key: value | key: value".
    cur += 1
    details = lines[cur].split("|")
    result["company"] = extract_value(details[0])
    result["year"] = extract_value(details[1])
    result["original_price"] = extract_value(details[2])

    cur += 1
    result["degrees_of_freedom"] = extract_value(lines[cur])
    cur += 1
    result["dimensions"] = extract_value(lines[cur])

    # One line is skipped here (presumably a heading - confirm against the
    # document template).
    cur += 2
    result["primary_key"] = extract_value(lines[cur])
    cur += 1
    result["secondary_key"] = extract_value(lines[cur])

    result["hyperlinks"] = extract_links(doc_path)

    # Again one line is skipped before the link descriptions start.
    cur += 2
    link_descriptions, cur = _collect_until(lines, cur, "Image")
    result["link_descriptions"] = link_descriptions

    # Skip the "Image" line plus two more (presumably the remaining table
    # header cells - confirm), then read alternating image / caption lines.
    images = []
    captions = []
    cur += 3
    while cur + 1 < len(lines) and lines[cur] != "NOTES:":
        images.append(lines[cur])
        captions.append(lines[cur + 1])
        cur += 2
    result["images"] = images
    result["captions"] = captions

    # Everything after an optional "NOTES:" marker is kept line by line.
    if cur < len(lines) and lines[cur] == "NOTES:":
        result["notes"] = lines[cur + 1:]
    else:
        result["notes"] = []

    return result


def upload(document):
    """Insert ``document`` into the collection as the fields of a new ``Doc``.

    The stored record gets a freshly generated UUID4 string as its ``_id``.
    """
    collection_handle.insert_one({
        "_id": str(uuid.uuid4()),
        "fields": document,
        "__type": "Doc",
    })


# Start from an empty output directory.
if os.path.exists(dist):
    shutil.rmtree(dist)
# NOTE(review): the draft busy-waited here until the directory was gone,
# presumably because removal is not always visible immediately; the wait is
# kept but now sleeps instead of spinning a CPU core.
while os.path.exists(dist):
    time.sleep(0.01)
# makedirs also creates missing parent directories of the nested dist path.
os.makedirs(dist)

for file_name in os.listdir(source):
    if file_name.endswith('.docx'):
        upload(parse_document(file_name))

# Keep the generated files out of version control, except this .gitignore.
lines = ['*', '!.gitignore']
with open(dist + "/.gitignore", 'w') as f:
    f.write('\n'.join(lines))

# ---------------------------------------------------------------------------
# Not part of scraper.py: the collapsed source line also carried the next
# file's diff and the first half of the following diff header.  Preserved:
#
# diff --git a/src/client/DocServer.ts b/src/client/DocServer.ts
# index ad7c706b6..3b33657eb 100644
# --- a/src/client/DocServer.ts
# +++ b/src/client/DocServer.ts
# @@ -257,7 +257,7 @@ export namespace DocServer {
#       */
#      let _createField = (field: RefField) => {
#          _cache[field[Id]] = field;
# -        const initialState = SerializationHelper.serialize(field);
# +        const initialState = SerializationHelper.Serialize(field);
#          Utils.emit(_socket, MessageStore.CreateField, initialState);
#      };
# diff --git a/src/client/util/SerializationHelper.ts
b/src/client/util/SerializationHelper.ts index ea8af3834..7ded85e43 100644 --- a/src/client/util/SerializationHelper.ts +++ b/src/client/util/SerializationHelper.ts @@ -7,7 +7,7 @@ export namespace SerializationHelper { return serializing > 0; } - export function serialize(obj: Field): any { + export function Serialize(obj: Field): any { if (obj === undefined || obj === null) { return undefined; } @@ -124,7 +124,7 @@ export namespace Deserializable { export function autoObject(): PropSchema { return custom( - (s) => SerializationHelper.serialize(s), + (s) => SerializationHelper.Serialize(s), (s) => SerializationHelper.Deserialize(s) ); }
\ No newline at end of file diff --git a/src/new_fields/util.ts b/src/new_fields/util.ts index 7709d6c24..8caceb063 100644 --- a/src/new_fields/util.ts +++ b/src/new_fields/util.ts @@ -43,7 +43,7 @@ export const setter = action(function (target: any, prop: string | symbol | numb } else { target.__fields[prop] = value; } - target[Update]({ '$set': { ["fields." + prop]: value instanceof ObjectField ? SerializationHelper.serialize(value) : (value === undefined ? null : value) } }); + target[Update]({ '$set': { ["fields." + prop]: value instanceof ObjectField ? SerializationHelper.Serialize(value) : (value === undefined ? null : value) } }); UndoManager.AddEvent({ redo: () => receiver[prop] = value, undo: () => receiver[prop] = curValue @@ -103,7 +103,7 @@ export function updateFunction(target: any, prop: any, value: any, receiver: any let current = ObjectField.MakeCopy(value); return (diff?: any) => { if (true || !diff) { - diff = { '$set': { ["fields." + prop]: SerializationHelper.serialize(value) } }; + diff = { '$set': { ["fields." + prop]: SerializationHelper.Serialize(value) } }; const oldValue = current; const newValue = ObjectField.MakeCopy(value); current = newValue; |