diff options
Diffstat (limited to 'src/scraping/buxton/scraper.py')
-rw-r--r-- | src/scraping/buxton/scraper.py | 44 |
1 files changed, 29 insertions, 15 deletions
diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py index 90205c40b..15b5844f7 100644 --- a/src/scraping/buxton/scraper.py +++ b/src/scraping/buxton/scraper.py @@ -71,7 +71,7 @@ def text_doc_map(string_list): return listify(proxify_guids(list(map(guid_map, string_list)))) -def write_collection(parse_results, display_fields, storage_key, viewType=2): +def write_collection(parse_results, display_fields, storage_key, viewType): view_guids = parse_results["child_guids"] data_doc = parse_results["schema"] @@ -107,6 +107,11 @@ def write_collection(parse_results, display_fields, storage_key, viewType=2): "date": datetime.datetime.utcnow().microsecond, "__type": "date" } + if "image_urls" in parse_results: + fields["hero"] = { + "url": parse_results["image_urls"][0], + "__type": "image" + } fields["isPrototype"] = True fields["page"] = -1 @@ -176,6 +181,9 @@ def write_image(folder, name): image = Image.open(f"{image_dist}/{folder}/{name}") native_width, native_height = image.size + if abs(native_width - native_height) < 10: + return None + view_doc = { "_id": view_doc_guid, "fields": { @@ -214,7 +222,10 @@ def write_image(folder, name): target_collection.insert_one(view_doc) target_collection.insert_one(data_doc) - return view_doc_guid + return { + "layout_id": view_doc_guid, + "url": path + } def parse_document(file_name: str): @@ -229,16 +240,20 @@ def parse_document(file_name: str): raw = str(docx2txt.process(source + "/" + file_name, dir_path)) + urls = [] view_guids = [] count = 0 for image in os.listdir(dir_path): - count += 1 - view_guids.append(write_image(pure_name, image)) - resolved = dir_path + "/" + image - original = dir_path + "/" + image.replace(".", "_o.", 1) - medium = dir_path + "/" + image.replace(".", "_m.", 1) - copyfile(resolved, original) - copyfile(resolved, medium) + created = write_image(pure_name, image) + if created != None: + urls.append(created["url"]) + view_guids.append(created["layout_id"]) + count += 1 + resolved = dir_path + "/" + image + original = dir_path + "/" + image.replace(".", "_o.", 1) + medium = dir_path + "/" + image.replace(".", "_m.", 1) + copyfile(resolved, original) + copyfile(resolved, medium) print(f"extracted {count} images...") def sanitize(line): return re.sub("[\n\t]+", "", line).replace(u"\u00A0", " ").replace( @@ -345,7 +360,8 @@ def parse_document(file_name: str): "fields": result, "__type": "Doc" }, - "child_guids": view_guids + "child_guids": view_guids, + "image_urls": urls } @@ -359,13 +375,11 @@ def write_common_proto(): "_id": id, "fields": { "proto": protofy("collectionProto"), - "title": "Common Import Proto", + "title": "The Buxton Collection", }, "__type": "Doc" } - target_collection.insert_one(common_proto) - return id @@ -383,7 +397,7 @@ for file_name in os.listdir(source): if file_name.endswith('.docx'): candidates += 1 schema_guids.append(write_collection( - parse_document(file_name), ["title", "data"], "image_data")) + parse_document(file_name), ["title", "data"], "data", 5)) print("writing parent schema...") parent_guid = write_collection({ @@ -393,7 +407,7 @@ parent_guid = write_collection({ "__type": "Doc" }, "child_guids": schema_guids -}, ["title", "short_description", "original_price"], "data", 1) +}, ["title", "short_description", "original_price"], "data", 4) print("appending parent schema to main workspace...\n") target_collection.update_one( |