Diffstat (limited to 'src/scraping/buxton/scraper.py')
-rw-r--r--	src/scraping/buxton/scraper.py	44
1 file changed, 29 insertions(+), 15 deletions(-)
diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py
index 90205c40b..15b5844f7 100644
--- a/src/scraping/buxton/scraper.py
+++ b/src/scraping/buxton/scraper.py
@@ -71,7 +71,7 @@ def text_doc_map(string_list):
     return listify(proxify_guids(list(map(guid_map, string_list))))


-def write_collection(parse_results, display_fields, storage_key, viewType=2):
+def write_collection(parse_results, display_fields, storage_key, viewType):
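+    # viewType is now required rather than defaulting to 2; call sites choose the view explicitly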
     view_guids = parse_results["child_guids"]

     data_doc = parse_results["schema"]
@@ -107,6 +107,11 @@ def write_collection(parse_results, display_fields, storage_key, viewType=2):
"date": datetime.datetime.utcnow().microsecond,
"__type": "date"
}
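+    # if the parser extracted any images, use the first one as this document's hero image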
+ if "image_urls" in parse_results:
+ fields["hero"] = {
+ "url": parse_results["image_urls"][0],
+ "__type": "image"
+ }
fields["isPrototype"] = True
fields["page"] = -1
@@ -176,6 +181,9 @@ def write_image(folder, name):
     image = Image.open(f"{image_dist}/{folder}/{name}")
     native_width, native_height = image.size

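+    # skip near-square images, which are presumably logos or icons rather than device photos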
+    if abs(native_width - native_height) < 10:
+        return None
+
     view_doc = {
         "_id": view_doc_guid,
         "fields": {
@@ -214,7 +222,10 @@ def write_image(folder, name):
     target_collection.insert_one(view_doc)
     target_collection.insert_one(data_doc)

-    return view_doc_guid
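+    # return the layout guid together with the image's url so callers can use both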
+    return {
+        "layout_id": view_doc_guid,
+        "url": path
+    }


 def parse_document(file_name: str):
@@ -229,16 +240,20 @@ def parse_document(file_name: str):
     raw = str(docx2txt.process(source + "/" + file_name, dir_path))

+    urls = []
     view_guids = []
     count = 0
     for image in os.listdir(dir_path):
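+        # write_image returns None for skipped (near-square) images, so only
+        # register, count, and copy the images it actually wrote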
-        count += 1
-        view_guids.append(write_image(pure_name, image))
-        resolved = dir_path + "/" + image
-        original = dir_path + "/" + image.replace(".", "_o.", 1)
-        medium = dir_path + "/" + image.replace(".", "_m.", 1)
-        copyfile(resolved, original)
-        copyfile(resolved, medium)
+        created = write_image(pure_name, image)
+        if created != None:
+            urls.append(created["url"])
+            view_guids.append(created["layout_id"])
+            count += 1
+            resolved = dir_path + "/" + image
+            original = dir_path + "/" + image.replace(".", "_o.", 1)
+            medium = dir_path + "/" + image.replace(".", "_m.", 1)
+            copyfile(resolved, original)
+            copyfile(resolved, medium)
print(f"extracted {count} images...")
def sanitize(line): return re.sub("[\n\t]+", "", line).replace(u"\u00A0", " ").replace(
@@ -345,7 +360,8 @@ def parse_document(file_name: str):
"fields": result,
"__type": "Doc"
},
- "child_guids": view_guids
+ "child_guids": view_guids,
+ "image_urls": urls
     }
@@ -359,13 +375,11 @@ def write_common_proto():
"_id": id,
"fields": {
"proto": protofy("collectionProto"),
- "title": "Common Import Proto",
+ "title": "The Buxton Collection",
},
"__type": "Doc"
}
-
     target_collection.insert_one(common_proto)
-
     return id
@@ -383,7 +397,7 @@ for file_name in os.listdir(source):
     if file_name.endswith('.docx'):
         candidates += 1
         schema_guids.append(write_collection(
-            parse_document(file_name), ["title", "data"], "image_data"))
+            parse_document(file_name), ["title", "data"], "data", 5))

 print("writing parent schema...")
 parent_guid = write_collection({
@@ -393,7 +407,7 @@ parent_guid = write_collection({
"__type": "Doc"
},
"child_guids": schema_guids
-}, ["title", "short_description", "original_price"], "data", 1)
+}, ["title", "short_description", "original_price"], "data", 4)
print("appending parent schema to main workspace...\n")
target_collection.update_one(