diff options
19 files changed, 46 insertions, 23 deletions
diff --git a/src/.DS_Store b/src/.DS_Store Binary files differindex d70e95c0a..071dafa1e 100644 --- a/src/.DS_Store +++ b/src/.DS_Store diff --git a/src/buxton/scraper.py b/src/buxton/scraper.py index 0abebb485..854c99379 100644 --- a/src/buxton/scraper.py +++ b/src/buxton/scraper.py @@ -8,9 +8,12 @@ import shutil import uuid source = "./source" -dist = "./Dash-Web/src/server/public/files" +dist = "../server/public/files" + +db = MongoClient("localhost", 27017)["Dash"] +db.buxton.drop() +collection_handle = db.buxton -collection_handle = MongoClient("localhost", 27017)["Dash"]["buxton"] def extract_links(fileName): links = [] @@ -22,8 +25,11 @@ def extract_links(fileName): links.append(item._target) return links + def extract_value(kv_string): - return kv_string.split(":")[1].strip() + pieces = kv_string.split(":") + return (pieces[1] if len(pieces) > 1 else kv_string).strip() + def mkdir_if_absent(path): try: @@ -32,6 +38,7 @@ def mkdir_if_absent(path): except OSError: print("Failed to create the appropriate directory structures for %s" % file_name) + def parse_document(file_name: str): result = {} pure_name = file_name.split(".")[0] @@ -41,15 +48,18 @@ def parse_document(file_name: str): raw = str(docx2txt.process(source + "/" + file_name, dir_path)) - sanitize = lambda line: re.sub("[\n\t]+", "", line).strip().replace(u"\u00A0", " ").replace(u"\u2013", "-").replace(u"\u201c", '''"''').replace(u"\u201d", '''"''') - remove_empty = lambda line: len(line) > 1 + def sanitize(line): return re.sub("[\n\t]+", "", line).replace(u"\u00A0", " ").replace( + u"\u2013", "-").replace(u"\u201c", '''"''').replace(u"\u201d", '''"''').strip() + + def remove_empty(line): return len(line) > 1 lines = list(map(sanitize, raw.split("\n"))) lines = list(filter(remove_empty, lines)) result["file_name"] = file_name - result["title"] = lines[2] - result["short_description"] = lines[3].replace("Short Description: ", "") + result["title"] = lines[2].strip() + result["short_description"] = lines[3].strip().replace( + "Short Description: ", "") cur = 5 notes = "" @@ -57,32 +67,44 @@ def parse_document(file_name: str): notes += lines[cur] + " " cur += 1 result["buxton_notes"] = notes.strip() - + cur += 1 - clean = list(map(lambda data: data.strip().split(":"), lines[cur].split("|"))) - result["company"] = clean[0][1].strip() - result["year"] = clean[1][1].strip() - result["original_price"] = clean[2][1].strip() + clean = list( + map(lambda data: data.strip().split(":"), lines[cur].split("|"))) + result["company"] = clean[0][len(clean[0]) - 1].strip() + result["year"] = clean[1][len(clean[1]) - 1].strip() + result["original_price"] = clean[2][len(clean[2]) - 1].strip() cur += 1 result["degrees_of_freedom"] = extract_value(lines[cur]) cur += 1 - result["dimensions"] = extract_value(lines[cur]) - cur += 2 + dimensions = lines[cur].lower() + if dimensions.startswith("dimensions"): + result["dimensions"] = dimensions[11:].strip() + cur += 1 + while lines[cur] != "Key Words": + result["dimensions"] += (" " + lines[cur].strip()) + cur += 1 + + cur += 1 result["primary_key"] = extract_value(lines[cur]) cur += 1 result["secondary_key"] = extract_value(lines[cur]) - result["hyperlinks"] = extract_links(source + "/" + file_name) + while lines[cur] != "Links": + result["secondary_key"] += (" " + extract_value(lines[cur]).strip()) + cur += 1 - cur += 2 + cur += 1 link_descriptions = [] while lines[cur] != "Image": - link_descriptions.append(lines[cur]) + link_descriptions.append(lines[cur].strip()) cur += 1 result["link_descriptions"] = link_descriptions + result["hyperlinks"] = extract_links(source + "/" + file_name) + images = [] captions = [] cur += 3 @@ -99,9 +121,11 @@ def parse_document(file_name: str): while cur < len(lines): notes.append(lines[cur]) cur += 1 - result["notes"] = notes + if len(notes) > 0: + result["notes"] = notes + + return result - return result def upload(document): wrapper = {} @@ -110,11 +134,13 @@ def upload(document): wrapper["__type"] = "Doc" collection_handle.insert_one(wrapper) + if os.path.exists(dist): shutil.rmtree(dist) -while (os.path.exists(dist)): +while os.path.exists(dist): pass os.mkdir(dist) +mkdir_if_absent(source) for file_name in os.listdir(source): if file_name.endswith('.docx'): @@ -123,6 +149,3 @@ for file_name in os.listdir(source): lines = ['*', '!.gitignore'] with open(dist + "/.gitignore", 'w') as f: f.write('\n'.join(lines)) - - - diff --git a/src/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx b/src/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx Binary files differnew file mode 100644 index 000000000..06094b4d3 --- /dev/null +++ b/src/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx diff --git a/src/buxton/source/Bill_Notes_Braun_T3.docx b/src/buxton/source/Bill_Notes_Braun_T3.docx Binary files differnew file mode 100644 index 000000000..356697092 --- /dev/null +++ b/src/buxton/source/Bill_Notes_Braun_T3.docx diff --git a/src/buxton/source/Bill_Notes_CasioC801.docx b/src/buxton/source/Bill_Notes_CasioC801.docx Binary files differnew file mode 100644 index 000000000..cd89fb97b --- /dev/null +++ b/src/buxton/source/Bill_Notes_CasioC801.docx diff --git a/src/buxton/source/Bill_Notes_Casio_Mini.docx b/src/buxton/source/Bill_Notes_Casio_Mini.docx Binary files differnew file mode 100644 index 000000000..a503cddfc --- /dev/null +++ b/src/buxton/source/Bill_Notes_Casio_Mini.docx diff --git a/src/buxton/source/Bill_Notes_FingerWorks_Prototype.docx b/src/buxton/source/Bill_Notes_FingerWorks_Prototype.docx Binary files differnew file mode 100644 index 000000000..4d13a8cf5 --- /dev/null +++ b/src/buxton/source/Bill_Notes_FingerWorks_Prototype.docx diff --git a/src/buxton/source/Bill_Notes_Fingerworks_TouchStream.docx b/src/buxton/source/Bill_Notes_Fingerworks_TouchStream.docx Binary files differnew file mode 100644 index 000000000..578a1be08 --- /dev/null +++ b/src/buxton/source/Bill_Notes_Fingerworks_TouchStream.docx diff --git a/src/buxton/source/Bill_Notes_FrogPad.docx b/src/buxton/source/Bill_Notes_FrogPad.docx Binary files differnew file mode 100644 index 000000000..d01e1bf5c --- /dev/null +++ b/src/buxton/source/Bill_Notes_FrogPad.docx diff --git a/src/buxton/source/Bill_Notes_Gavilan_SC.docx b/src/buxton/source/Bill_Notes_Gavilan_SC.docx Binary files differnew file mode 100644 index 000000000..7bd28b376 --- /dev/null +++ b/src/buxton/source/Bill_Notes_Gavilan_SC.docx diff --git a/src/buxton/source/Bill_Notes_Grandjean_Stenotype.docx b/src/buxton/source/Bill_Notes_Grandjean_Stenotype.docx Binary files differnew file mode 100644 index 000000000..0615c4953 --- /dev/null +++ b/src/buxton/source/Bill_Notes_Grandjean_Stenotype.docx diff --git a/src/buxton/source/Bill_Notes_Matias.docx b/src/buxton/source/Bill_Notes_Matias.docx Binary files differnew file mode 100644 index 000000000..547603256 --- /dev/null +++ b/src/buxton/source/Bill_Notes_Matias.docx diff --git a/src/buxton/source/Bill_Notes_MousePen.docx b/src/buxton/source/Bill_Notes_MousePen.docx Binary files differnew file mode 100644 index 000000000..4e1056636 --- /dev/null +++ b/src/buxton/source/Bill_Notes_MousePen.docx diff --git a/src/buxton/source/Bill_Notes_NewO.docx b/src/buxton/source/Bill_Notes_NewO.docx Binary files differnew file mode 100644 index 000000000..a514926d2 --- /dev/null +++ b/src/buxton/source/Bill_Notes_NewO.docx diff --git a/src/buxton/source/Bill_Notes_OLPC.docx b/src/buxton/source/Bill_Notes_OLPC.docx Binary files differnew file mode 100644 index 000000000..bfca0a9bb --- /dev/null +++ b/src/buxton/source/Bill_Notes_OLPC.docx diff --git a/src/buxton/source/Bill_Notes_PARCkbd.docx b/src/buxton/source/Bill_Notes_PARCkbd.docx Binary files differnew file mode 100644 index 000000000..c0cf6ba9a --- /dev/null +++ b/src/buxton/source/Bill_Notes_PARCkbd.docx diff --git a/src/buxton/source/Bill_Notes_Philco_Mystery_Control.docx b/src/buxton/source/Bill_Notes_Philco_Mystery_Control.docx Binary files differnew file mode 100644 index 000000000..ad06903f3 --- /dev/null +++ b/src/buxton/source/Bill_Notes_Philco_Mystery_Control.docx diff --git a/src/buxton/source/Bill_Notes_TASA_Kbd.docx b/src/buxton/source/Bill_Notes_TASA_Kbd.docx Binary files differnew file mode 100644 index 000000000..e4c659de9 --- /dev/null +++ b/src/buxton/source/Bill_Notes_TASA_Kbd.docx diff --git a/src/buxton/source/Bill_Notes_The_Tap.docx b/src/buxton/source/Bill_Notes_The_Tap.docx Binary files differnew file mode 100644 index 000000000..8ceebc71e --- /dev/null +++ b/src/buxton/source/Bill_Notes_The_Tap.docx |