diff options
Diffstat (limited to 'src/scraping/buxton/scraper.py')
-rw-r--r-- | src/scraping/buxton/scraper.py | 129 |
1 files changed, 77 insertions, 52 deletions
diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py index 4c79af437..1441a8621 100644 --- a/src/scraping/buxton/scraper.py +++ b/src/scraping/buxton/scraper.py @@ -10,10 +10,10 @@ import uuid import datetime from PIL import Image import math -import sys source = "./source" -dist = "../../server/public/files" +filesPath = "../../server/public/files" +image_dist = filesPath + "/images/buxton" db = MongoClient("localhost", 27017)["Dash"] target_collection = db.newDocuments @@ -70,7 +70,7 @@ def text_doc_map(string_list): return listify(proxify_guids(list(map(guid_map, string_list)))) -def write_collection(parse_results, display_fields, storage_key, viewType=2): +def write_collection(parse_results, display_fields, storage_key, viewType): view_guids = parse_results["child_guids"] data_doc = parse_results["schema"] @@ -84,13 +84,14 @@ def write_collection(parse_results, display_fields, storage_key, viewType=2): "proto": protofy(data_doc["_id"]), "x": 10, "y": 10, - "width": 900, - "height": 600, - "panX": 0, - "panY": 0, + "_width": 900, + "_height": 600, + "_panX": 0, + "_panY": 0, "zIndex": 2, "libraryBrush": False, - "viewType": viewType + "_viewType": viewType, + "_LODdisable": True }, "__type": "Doc" } @@ -98,23 +99,24 @@ def write_collection(parse_results, display_fields, storage_key, viewType=2): fields["proto"] = protofy(common_proto_id) fields[storage_key] = listify(proxify_guids(view_guids)) fields["schemaColumns"] = listify(display_fields) - fields["backgroundColor"] = "white" - fields["scale"] = 0.5 - fields["viewType"] = 2 fields["author"] = "Bill Buxton" fields["creationDate"] = { "date": datetime.datetime.utcnow().microsecond, "__type": "date" } + if "image_urls" in parse_results: + fields["hero"] = { + "url": parse_results["image_urls"][0], + "__type": "image" + } fields["isPrototype"] = True - fields["page"] = -1 target_collection.insert_one(data_doc) target_collection.insert_one(view_doc) data_doc_guid = data_doc["_id"] - print(f"inserted view document ({view_doc_guid})") - print(f"inserted data document ({data_doc_guid})\n") + # print(f"inserted view document ({view_doc_guid})") + # print(f"inserted data document ({data_doc_guid})\n") return view_doc_guid @@ -129,7 +131,7 @@ def write_text_doc(content): "proto": protofy(data_doc_guid), "x": 10, "y": 10, - "width": 400, + "_width": 400, "zIndex": 2 }, "__type": "Doc" @@ -144,17 +146,17 @@ def write_text_doc(content): "__type": "RichTextField" }, "title": content, - "nativeWidth": 200, + "_nativeWidth": 200, "author": "Bill Buxton", "creationDate": { "date": datetime.datetime.utcnow().microsecond, "__type": "date" }, "isPrototype": True, - "autoHeight": True, + "_autoHeight": True, "page": -1, - "nativeHeight": 200, - "height": 200, + "_nativeHeight": 200, + "_height": 200, "data_text": content }, "__type": "Doc" @@ -167,22 +169,27 @@ def write_text_doc(content): def write_image(folder, name): - path = f"http://localhost:1050/files/{folder}/{name}" + path = f"http://localhost:1050/files/images/buxton/{folder}/{name}" data_doc_guid = guid() view_doc_guid = guid() - image = Image.open(f"{dist}/{folder}/{name}") + image = Image.open(f"{image_dist}/{folder}/{name}") native_width, native_height = image.size + if abs(native_width - native_height) < 10: + return None + view_doc = { "_id": view_doc_guid, "fields": { "proto": protofy(data_doc_guid), "x": 10, "y": 10, - "width": min(800, native_width), - "zIndex": 2 + "_width": min(800, native_width), + "zIndex": 2, + "dimUnit": "*", + "dimMagnitude": 1 }, "__type": "Doc" } @@ -196,7 +203,7 @@ def write_image(folder, name): "__type": "image" }, "title": name, - "nativeWidth": native_width, + "_nativeWidth": native_width, "author": "Bill Buxton", "creationDate": { "date": datetime.datetime.utcnow().microsecond, @@ -204,8 +211,8 @@ def write_image(folder, name): }, "isPrototype": True, "page": -1, - "nativeHeight": native_height, - "height": native_height + "_nativeHeight": native_height, + "_height": native_height }, "__type": "Doc" } @@ -213,7 +220,10 @@ def write_image(folder, name): target_collection.insert_one(view_doc) target_collection.insert_one(data_doc) - return view_doc_guid + return { + "layout_id": view_doc_guid, + "url": path + } def parse_document(file_name: str): @@ -222,27 +232,35 @@ def parse_document(file_name: str): result = {} - dir_path = dist + "/" + pure_name + dir_path = image_dist + "/" + pure_name + # print(dir_path) mkdir_if_absent(dir_path) raw = str(docx2txt.process(source + "/" + file_name, dir_path)) + urls = [] view_guids = [] count = 0 for image in os.listdir(dir_path): - count += 1 - view_guids.append(write_image(pure_name, image)) - copyfile(dir_path + "/" + image, dir_path + - "/" + image.replace(".", "_o.", 1)) - copyfile(dir_path + "/" + image, dir_path + - "/" + image.replace(".", "_m.", 1)) - print(f"extracted {count} images...") + created = write_image(pure_name, image) + if created != None: + urls.append(created["url"]) + view_guids.append(created["layout_id"]) + count += 1 + resolved = dir_path + "/" + image + original = dir_path + "/" + image.replace(".", "_o.", 1) + medium = dir_path + "/" + image.replace(".", "_m.", 1) + copyfile(resolved, original) + copyfile(resolved, medium) + # print(f"extracted {count} images...") def sanitize(line): return re.sub("[\n\t]+", "", line).replace(u"\u00A0", " ").replace( u"\u2013", "-").replace(u"\u201c", '''"''').replace(u"\u201d", '''"''').strip() def sanitize_price(raw: str): raw = raw.replace(",", "") + if "x" in raw.lower(): + return None start = raw.find("$") if start > -1: i = start + 1 @@ -257,6 +275,14 @@ def parse_document(file_name: str): def remove_empty(line): return len(line) > 1 + def try_parse(to_parse: int): + value: int + try: + value = int(to_parse) + except ValueError: + value = None + return value + lines = list(map(sanitize, raw.split("\n"))) lines = list(filter(remove_empty, lines)) @@ -276,13 +302,13 @@ def parse_document(file_name: str): clean = list( map(lambda data: data.strip().split(":"), lines[cur].split("|"))) result["company"] = clean[0][len(clean[0]) - 1].strip() - result["year"] = clean[1][len(clean[1]) - 1].strip() + result["year"] = try_parse(clean[1][len(clean[1]) - 1].strip()) result["original_price"] = sanitize_price( clean[2][len(clean[2]) - 1].strip()) cur += 1 - result["degrees_of_freedom"] = extract_value( - lines[cur]).replace("NA", "N/A") + result["degrees_of_freedom"] = try_parse(extract_value( + lines[cur]).replace("NA", "N/A")) cur += 1 dimensions = lines[cur].lower() @@ -334,7 +360,7 @@ def parse_document(file_name: str): if len(notes) > 0: result["notes"] = listify(notes) - print("writing child schema...") + # print("writing child schema...") return { "schema": { @@ -342,12 +368,13 @@ def parse_document(file_name: str): "fields": result, "__type": "Doc" }, - "child_guids": view_guids + "child_guids": view_guids, + "image_urls": urls } def proxify_guids(guids): - return list(map(lambda guid: {"fieldId": guid, "__type": "proxy"}, guids)) + return list(map(lambda guid: {"fieldId": guid, "__type": "prefetch_proxy"}, guids)) def write_common_proto(): @@ -356,31 +383,29 @@ def write_common_proto(): "_id": id, "fields": { "proto": protofy("collectionProto"), - "title": "Common Import Proto", + "title": "The Buxton Collection", }, "__type": "Doc" } - target_collection.insert_one(common_proto) - return id -if os.path.exists(dist): - shutil.rmtree(dist) -while os.path.exists(dist): +if os.path.exists(image_dist): + shutil.rmtree(image_dist, True) +while os.path.exists(image_dist): pass -os.mkdir(dist) +os.mkdir(image_dist) mkdir_if_absent(source) common_proto_id = write_common_proto() candidates = 0 for file_name in os.listdir(source): - if file_name.endswith('.docx'): + if file_name.endswith('.docx') or file_name.endswith('.doc'): candidates += 1 schema_guids.append(write_collection( - parse_document(file_name), ["title", "data"], "image_data")) + parse_document(file_name), ["title", "data"], "data", 5)) print("writing parent schema...") parent_guid = write_collection({ @@ -390,7 +415,7 @@ parent_guid = write_collection({ "__type": "Doc" }, "child_guids": schema_guids -}, ["title", "short_description", "original_price"], "data", 1) +}, ["title", "short_description", "original_price"], "data", 4) print("appending parent schema to main workspace...\n") target_collection.update_one( @@ -400,7 +425,7 @@ target_collection.update_one( print("rewriting .gitignore...\n") lines = ['*', '!.gitignore'] -with open(dist + "/.gitignore", 'w') as f: +with open(filesPath + "/.gitignore", 'w') as f: f.write('\n'.join(lines)) suffix = "" if candidates == 1 else "s" |