Diffstat (limited to 'src/scraping/buxton/scraper.py')
-rw-r--r--  src/scraping/buxton/scraper.py  129
1 file changed, 77 insertions(+), 52 deletions(-)
diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py
index 4c79af437..1441a8621 100644
--- a/src/scraping/buxton/scraper.py
+++ b/src/scraping/buxton/scraper.py
@@ -10,10 +10,10 @@ import uuid
 import datetime
 from PIL import Image
 import math
-import sys
 
 source = "./source"
-dist = "../../server/public/files"
+filesPath = "../../server/public/files"
+image_dist = filesPath + "/images/buxton"
 
 db = MongoClient("localhost", 27017)["Dash"]
 target_collection = db.newDocuments
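
Images now land in a Buxton-specific subtree of the server's public files
directory, and write_image (further down) builds the matching URL. A minimal
sketch of the assumed disk/URL correspondence; the port and /files mount are
taken from the URLs in this diff, and the helper name is hypothetical:

def buxton_image_paths(folder, name):
    # Where docx2txt drops the extracted image (see image_dist above).
    disk = f"{image_dist}/{folder}/{name}"
    # Where the Dash server is assumed to serve that same file from.
    url = f"http://localhost:1050/files/images/buxton/{folder}/{name}"
    return disk, url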
@@ -70,7 +70,7 @@ def text_doc_map(string_list):
     return listify(proxify_guids(list(map(guid_map, string_list))))
 
 
-def write_collection(parse_results, display_fields, storage_key, viewType=2):
+def write_collection(parse_results, display_fields, storage_key, viewType):
     view_guids = parse_results["child_guids"]
 
     data_doc = parse_results["schema"]
@@ -84,13 +84,14 @@ def write_collection(parse_results, display_fields, storage_key, viewType=2):
"proto": protofy(data_doc["_id"]),
"x": 10,
"y": 10,
- "width": 900,
- "height": 600,
- "panX": 0,
- "panY": 0,
+ "_width": 900,
+ "_height": 600,
+ "_panX": 0,
+ "_panY": 0,
"zIndex": 2,
"libraryBrush": False,
- "viewType": viewType
+ "_viewType": viewType,
+ "_LODdisable": True
},
"__type": "Doc"
}
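
This hunk renames layout fields to underscore-prefixed keys (width -> _width,
viewType -> _viewType, and so on). A hedged sketch of a one-off migration for
documents written by earlier runs of the scraper; the key list comes from this
diff, but the helper itself is hypothetical:

LEGACY_KEYS = ["width", "height", "panX", "panY", "viewType",
               "nativeWidth", "nativeHeight", "autoHeight"]

def migrate_layout_fields(fields):
    # Move each legacy key to its underscore-prefixed replacement.
    for key in LEGACY_KEYS:
        if key in fields:
            fields["_" + key] = fields.pop(key)
    return fields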
@@ -98,23 +99,24 @@ def write_collection(parse_results, display_fields, storage_key, viewType=2):
fields["proto"] = protofy(common_proto_id)
fields[storage_key] = listify(proxify_guids(view_guids))
fields["schemaColumns"] = listify(display_fields)
- fields["backgroundColor"] = "white"
- fields["scale"] = 0.5
- fields["viewType"] = 2
fields["author"] = "Bill Buxton"
fields["creationDate"] = {
"date": datetime.datetime.utcnow().microsecond,
"__type": "date"
}
+ if "image_urls" in parse_results:
+ fields["hero"] = {
+ "url": parse_results["image_urls"][0],
+ "__type": "image"
+ }
fields["isPrototype"] = True
- fields["page"] = -1
target_collection.insert_one(data_doc)
target_collection.insert_one(view_doc)
data_doc_guid = data_doc["_id"]
- print(f"inserted view document ({view_doc_guid})")
- print(f"inserted data document ({data_doc_guid})\n")
+ # print(f"inserted view document ({view_doc_guid})")
+ # print(f"inserted data document ({data_doc_guid})\n")
return view_doc_guid
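
One edge case in the hero block above: parse_document (below) now always
attaches an image_urls list, so the `in` check passes even when every image was
skipped as near-square, and indexing [0] would raise IndexError. A more
defensive sketch would require a non-empty list:

# Hypothetical tightening of the guard above.
if parse_results.get("image_urls"):
    fields["hero"] = {
        "url": parse_results["image_urls"][0],
        "__type": "image"
    }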
@@ -129,7 +131,7 @@ def write_text_doc(content):
"proto": protofy(data_doc_guid),
"x": 10,
"y": 10,
- "width": 400,
+ "_width": 400,
"zIndex": 2
},
"__type": "Doc"
@@ -144,17 +146,17 @@ def write_text_doc(content):
"__type": "RichTextField"
},
"title": content,
- "nativeWidth": 200,
+ "_nativeWidth": 200,
"author": "Bill Buxton",
"creationDate": {
"date": datetime.datetime.utcnow().microsecond,
"__type": "date"
},
"isPrototype": True,
- "autoHeight": True,
+ "_autoHeight": True,
"page": -1,
- "nativeHeight": 200,
- "height": 200,
+ "_nativeHeight": 200,
+ "_height": 200,
"data_text": content
},
"__type": "Doc"
@@ -167,22 +169,27 @@ def write_text_doc(content):
 
 
 def write_image(folder, name):
-    path = f"http://localhost:1050/files/{folder}/{name}"
+    path = f"http://localhost:1050/files/images/buxton/{folder}/{name}"
 
     data_doc_guid = guid()
     view_doc_guid = guid()
 
-    image = Image.open(f"{dist}/{folder}/{name}")
+    image = Image.open(f"{image_dist}/{folder}/{name}")
     native_width, native_height = image.size
 
+    if abs(native_width - native_height) < 10:
+        return None
+
     view_doc = {
         "_id": view_doc_guid,
         "fields": {
             "proto": protofy(data_doc_guid),
             "x": 10,
             "y": 10,
-            "width": min(800, native_width),
-            "zIndex": 2
+            "_width": min(800, native_width),
+            "zIndex": 2,
+            "dimUnit": "*",
+            "dimMagnitude": 1
         },
         "__type": "Doc"
     }
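
The new guard in write_image skips any image whose width and height differ by
fewer than 10 pixels, presumably to filter out square logos and icons embedded
in the .docx files. The same heuristic as a standalone predicate, with the
threshold kept from the diff:

from PIL import Image

def is_near_square(path, tolerance=10):
    # Mirrors the check above: treat images within `tolerance`
    # pixels of square as decorative and skip them.
    width, height = Image.open(path).size
    return abs(width - height) < tolerance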
@@ -196,7 +203,7 @@ def write_image(folder, name):
"__type": "image"
},
"title": name,
- "nativeWidth": native_width,
+ "_nativeWidth": native_width,
"author": "Bill Buxton",
"creationDate": {
"date": datetime.datetime.utcnow().microsecond,
@@ -204,8 +211,8 @@ def write_image(folder, name):
             },
             "isPrototype": True,
             "page": -1,
-            "nativeHeight": native_height,
-            "height": native_height
+            "_nativeHeight": native_height,
+            "_height": native_height
         },
         "__type": "Doc"
     }
@@ -213,7 +220,10 @@ def write_image(folder, name):
     target_collection.insert_one(view_doc)
     target_collection.insert_one(data_doc)
 
-    return view_doc_guid
+    return {
+        "layout_id": view_doc_guid,
+        "url": path
+    }
 
 
 def parse_document(file_name: str):
@@ -222,27 +232,35 @@ def parse_document(file_name: str):
     result = {}
 
-    dir_path = dist + "/" + pure_name
+    dir_path = image_dist + "/" + pure_name
+    # print(dir_path)
     mkdir_if_absent(dir_path)
 
     raw = str(docx2txt.process(source + "/" + file_name, dir_path))
 
+    urls = []
     view_guids = []
     count = 0
     for image in os.listdir(dir_path):
-        count += 1
-        view_guids.append(write_image(pure_name, image))
-        copyfile(dir_path + "/" + image, dir_path +
-                 "/" + image.replace(".", "_o.", 1))
-        copyfile(dir_path + "/" + image, dir_path +
-                 "/" + image.replace(".", "_m.", 1))
+        created = write_image(pure_name, image)
+        if created != None:
+            urls.append(created["url"])
+            view_guids.append(created["layout_id"])
+            count += 1
+        resolved = dir_path + "/" + image
+        original = dir_path + "/" + image.replace(".", "_o.", 1)
+        medium = dir_path + "/" + image.replace(".", "_m.", 1)
+        copyfile(resolved, original)
+        copyfile(resolved, medium)
 
-    print(f"extracted {count} images...")
+    # print(f"extracted {count} images...")
 
     def sanitize(line): return re.sub("[\n\t]+", "", line).replace(u"\u00A0", " ").replace(
         u"\u2013", "-").replace(u"\u201c", '''"''').replace(u"\u201d", '''"''').strip()
 
     def sanitize_price(raw: str):
         raw = raw.replace(",", "")
+        if "x" in raw.lower():
+            return None
         start = raw.find("$")
         if start > -1:
             i = start + 1
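
The rewritten loop above names the copied derivatives by inserting a suffix
before the first dot, which is presumably what the image server expects for its
original (_o) and medium (_m) variants:

image = "mouse.png"
image.replace(".", "_o.", 1)   # -> "mouse_o.png"
image.replace(".", "_m.", 1)   # -> "mouse_m.png"
# The count argument of 1 rewrites only the first dot, so a name like
# "trackball.v2.png" becomes "trackball_o.v2.png".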
@@ -257,6 +275,14 @@ def parse_document(file_name: str):
     def remove_empty(line): return len(line) > 1
 
+    def try_parse(to_parse: int):
+        value: int
+        try:
+            value = int(to_parse)
+        except ValueError:
+            value = None
+        return value
+
     lines = list(map(sanitize, raw.split("\n")))
     lines = list(filter(remove_empty, lines))
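
try_parse normalizes unparseable numeric fields to None instead of raising.
(Its parameter is annotated int even though the callers below pass strings;
int() accepts either.) Expected behavior, roughly:

try_parse("1983")      # -> 1983
try_parse("ca. 1983")  # -> None (the ValueError is swallowed)
try_parse("N/A")       # -> None, which is how "NA" degrees-of-freedom
                       #    values below end up null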
@@ -276,13 +302,13 @@ def parse_document(file_name: str):
     clean = list(
         map(lambda data: data.strip().split(":"), lines[cur].split("|")))
     result["company"] = clean[0][len(clean[0]) - 1].strip()
-    result["year"] = clean[1][len(clean[1]) - 1].strip()
+    result["year"] = try_parse(clean[1][len(clean[1]) - 1].strip())
     result["original_price"] = sanitize_price(
         clean[2][len(clean[2]) - 1].strip())
 
     cur += 1
-    result["degrees_of_freedom"] = extract_value(
-        lines[cur]).replace("NA", "N/A")
+    result["degrees_of_freedom"] = try_parse(extract_value(
+        lines[cur]).replace("NA", "N/A"))
     cur += 1
 
     dimensions = lines[cur].lower()
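
The pipe-then-colon split above expects a metadata line shaped like the
hypothetical sample below; clean[i][len(clean[i]) - 1] is just clean[i][-1],
i.e. the value to the right of each label:

line = "Company: Microsoft | Year: 1983 | Original Price: $195"
clean = [part.strip().split(":") for part in line.split("|")]
clean[0][-1].strip()   # -> "Microsoft"
clean[1][-1].strip()   # -> "1983", handed to try_parse
clean[2][-1].strip()   # -> "$195", handed to sanitize_price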
@@ -334,7 +360,7 @@ def parse_document(file_name: str):
     if len(notes) > 0:
         result["notes"] = listify(notes)
 
-    print("writing child schema...")
+    # print("writing child schema...")
 
     return {
         "schema": {
@@ -342,12 +368,13 @@ def parse_document(file_name: str):
"fields": result,
"__type": "Doc"
},
- "child_guids": view_guids
+ "child_guids": view_guids,
+ "image_urls": urls
}
def proxify_guids(guids):
- return list(map(lambda guid: {"fieldId": guid, "__type": "proxy"}, guids))
+ return list(map(lambda guid: {"fieldId": guid, "__type": "prefetch_proxy"}, guids))
def write_common_proto():
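
The proxify_guids change swaps every child reference from a plain proxy to a
prefetch_proxy, presumably so the client loads collection children eagerly
rather than on demand. The resulting field objects, for a sample guid:

proxify_guids(["3fa2"])
# before: [{"fieldId": "3fa2", "__type": "proxy"}]
# after:  [{"fieldId": "3fa2", "__type": "prefetch_proxy"}]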
@@ -356,31 +383,29 @@ def write_common_proto():
"_id": id,
"fields": {
"proto": protofy("collectionProto"),
- "title": "Common Import Proto",
+ "title": "The Buxton Collection",
},
"__type": "Doc"
}
-
target_collection.insert_one(common_proto)
-
return id
-if os.path.exists(dist):
- shutil.rmtree(dist)
-while os.path.exists(dist):
+if os.path.exists(image_dist):
+ shutil.rmtree(image_dist, True)
+while os.path.exists(image_dist):
pass
-os.mkdir(dist)
+os.mkdir(image_dist)
mkdir_if_absent(source)
common_proto_id = write_common_proto()
candidates = 0
for file_name in os.listdir(source):
- if file_name.endswith('.docx'):
+ if file_name.endswith('.docx') or file_name.endswith('.doc'):
candidates += 1
schema_guids.append(write_collection(
- parse_document(file_name), ["title", "data"], "image_data"))
+ parse_document(file_name), ["title", "data"], "data", 5))
print("writing parent schema...")
parent_guid = write_collection({
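
Two things worth flagging in this hunk. First, docx2txt reads the zip-based
.docx format, so the newly accepted .doc files will presumably fail inside
parse_document unless converted first. Second, rmtree's second positional
argument is ignore_errors, and rmtree is synchronous, so the while-loop above
never actually spins; a more conventional sketch of the same reset:

import os
import shutil

shutil.rmtree(image_dist, ignore_errors=True)  # keyword form of the True above
os.makedirs(image_dist, exist_ok=True)         # replaces the busy-wait + mkdir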
@@ -390,7 +415,7 @@ parent_guid = write_collection({
"__type": "Doc"
},
"child_guids": schema_guids
-}, ["title", "short_description", "original_price"], "data", 1)
+}, ["title", "short_description", "original_price"], "data", 4)
print("appending parent schema to main workspace...\n")
target_collection.update_one(
@@ -400,7 +425,7 @@ target_collection.update_one(
print("rewriting .gitignore...\n")
lines = ['*', '!.gitignore']
-with open(dist + "/.gitignore", 'w') as f:
+with open(filesPath + "/.gitignore", 'w') as f:
f.write('\n'.join(lines))
suffix = "" if candidates == 1 else "s"