diff options
author | yipstanley <stanley_yip@brown.edu> | 2020-02-29 14:18:43 -0500 |
---|---|---|
committer | yipstanley <stanley_yip@brown.edu> | 2020-02-29 14:18:43 -0500 |
commit | 2f6e27c67d1790d4350eede3003f0b614460f4d1 (patch) | |
tree | ef5e70925b8cdeb8229af849e33e6f3a4cceae7f /src/scraping/buxton/scraper.py | |
parent | f1fcbeea5fb103b7623e795e72aacd4dfacc6c70 (diff) | |
parent | 640f14da28d97600fb32d09023fc932e3a4052c4 (diff) |
Merge branch 'master' of https://github.com/browngraphicslab/Dash-Web into pen
Diffstat (limited to 'src/scraping/buxton/scraper.py')
-rw-r--r-- | src/scraping/buxton/scraper.py | 37 |
1 files changed, 23 insertions, 14 deletions
diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py index ec9c3f72c..1441a8621 100644 --- a/src/scraping/buxton/scraper.py +++ b/src/scraping/buxton/scraper.py @@ -10,7 +10,6 @@ import uuid import datetime from PIL import Image import math -import sys source = "./source" filesPath = "../../server/public/files" @@ -116,8 +115,8 @@ def write_collection(parse_results, display_fields, storage_key, viewType): target_collection.insert_one(view_doc) data_doc_guid = data_doc["_id"] - print(f"inserted view document ({view_doc_guid})") - print(f"inserted data document ({data_doc_guid})\n") + # print(f"inserted view document ({view_doc_guid})") + # print(f"inserted data document ({data_doc_guid})\n") return view_doc_guid @@ -189,8 +188,8 @@ def write_image(folder, name): "y": 10, "_width": min(800, native_width), "zIndex": 2, - "widthUnit": "*", - "widthMagnitude": 1 + "dimUnit": "*", + "dimMagnitude": 1 }, "__type": "Doc" } @@ -234,7 +233,7 @@ def parse_document(file_name: str): result = {} dir_path = image_dist + "/" + pure_name - print(dir_path) + # print(dir_path) mkdir_if_absent(dir_path) raw = str(docx2txt.process(source + "/" + file_name, dir_path)) @@ -253,13 +252,15 @@ def parse_document(file_name: str): medium = dir_path + "/" + image.replace(".", "_m.", 1) copyfile(resolved, original) copyfile(resolved, medium) - print(f"extracted {count} images...") + # print(f"extracted {count} images...") def sanitize(line): return re.sub("[\n\t]+", "", line).replace(u"\u00A0", " ").replace( u"\u2013", "-").replace(u"\u201c", '''"''').replace(u"\u201d", '''"''').strip() def sanitize_price(raw: str): raw = raw.replace(",", "") + if "x" in raw.lower(): + return None start = raw.find("$") if start > -1: i = start + 1 @@ -274,6 +275,14 @@ def parse_document(file_name: str): def remove_empty(line): return len(line) > 1 + def try_parse(to_parse: int): + value: int + try: + value = int(to_parse) + except ValueError: + value = None + return value + lines = list(map(sanitize, raw.split("\n"))) lines = list(filter(remove_empty, lines)) @@ -293,13 +302,13 @@ def parse_document(file_name: str): clean = list( map(lambda data: data.strip().split(":"), lines[cur].split("|"))) result["company"] = clean[0][len(clean[0]) - 1].strip() - result["year"] = clean[1][len(clean[1]) - 1].strip() + result["year"] = try_parse(clean[1][len(clean[1]) - 1].strip()) result["original_price"] = sanitize_price( clean[2][len(clean[2]) - 1].strip()) cur += 1 - result["degrees_of_freedom"] = extract_value( - lines[cur]).replace("NA", "N/A") + result["degrees_of_freedom"] = try_parse(extract_value( + lines[cur]).replace("NA", "N/A")) cur += 1 dimensions = lines[cur].lower() @@ -351,7 +360,7 @@ def parse_document(file_name: str): if len(notes) > 0: result["notes"] = listify(notes) - print("writing child schema...") + # print("writing child schema...") return { "schema": { @@ -383,7 +392,7 @@ def write_common_proto(): if os.path.exists(image_dist): - shutil.rmtree(image_dist) + shutil.rmtree(image_dist, True) while os.path.exists(image_dist): pass os.mkdir(image_dist) @@ -393,7 +402,7 @@ common_proto_id = write_common_proto() candidates = 0 for file_name in os.listdir(source): - if file_name.endswith('.docx'): + if file_name.endswith('.docx') or file_name.endswith('.doc'): candidates += 1 schema_guids.append(write_collection( parse_document(file_name), ["title", "data"], "data", 5)) @@ -406,7 +415,7 @@ parent_guid = write_collection({ "__type": "Doc" }, "child_guids": schema_guids -}, ["title", "short_description", "original_price"], "data", 2) +}, ["title", "short_description", "original_price"], "data", 4) print("appending parent schema to main workspace...\n") target_collection.update_one( |