author | Bob Zeleznik <zzzman@gmail.com> | 2020-02-02 11:19:48 -0500 |
---|---|---|
committer | Bob Zeleznik <zzzman@gmail.com> | 2020-02-02 11:19:48 -0500 |
commit | 0b4ac56df4dcac9485f484ffd444467d3c12e44c (patch) | |
tree | 98d8fa0e669274a618115493019ae4a095751b9e /src/scraping/buxton/scraper.py | |
parent | e544d3c4eb0e4263777735fedbf4d3987cd2b62c (diff) | |
parent | a4620be72d758223e41e60974e0bfb8fd764c274 (diff) | |
Merge branch 'master' of https://github.com/browngraphicslab/Dash-Web
Diffstat (limited to 'src/scraping/buxton/scraper.py')
-rw-r--r-- | src/scraping/buxton/scraper.py | 414 |
1 file changed, 111 insertions, 303 deletions
diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py
index ec9c3f72c..394958823 100644
--- a/src/scraping/buxton/scraper.py
+++ b/src/scraping/buxton/scraper.py
@@ -1,37 +1,32 @@
 import os
-from shutil import copyfile
 import docx2txt
 from docx import Document
 from docx.opc.constants import RELATIONSHIP_TYPE as RT
 import re
-from pymongo import MongoClient
 import shutil
 import uuid
-import datetime
+import json
+import base64
+from shutil import copyfile
 from PIL import Image
-import math
-import sys
-
-source = "./source"
-filesPath = "../../server/public/files"
-image_dist = filesPath + "/images/buxton"
-db = MongoClient("localhost", 27017)["Dash"]
-target_collection = db.newDocuments
-target_doc_title = "Collection 1"
-schema_guids = []
-common_proto_id = ""
+files_path = "../../server/public/files"
+source_path = "./source"
+temp_images_path = "./extracted_images"
+server_images_path = f"{files_path}/images/buxton"
+json_path = "./json"
 
 
-def extract_links(fileName):
+# noinspection PyProtectedMember
+def extract_links(file):
     links = []
-    doc = Document(fileName)
+    doc = Document(file)
     rels = doc.part.rels
     for rel in rels:
         item = rels[rel]
         if item.reltype == RT.HYPERLINK and ".aspx" not in item._target:
             links.append(item._target)
-    return text_doc_map(links)
+    return links
 
 
 def extract_value(kv_string):
@@ -51,233 +46,72 @@ def guid():
     return str(uuid.uuid4())
 
 
-def listify(list):
-    return {
-        "fields": list,
-        "__type": "list"
-    }
-
-
-def protofy(fieldId):
-    return {
-        "fieldId": fieldId,
-        "__type": "proxy"
-    }
-
-
-def text_doc_map(string_list):
-    def guid_map(caption):
-        return write_text_doc(caption)
-    return listify(proxify_guids(list(map(guid_map, string_list))))
-
-
-def write_collection(parse_results, display_fields, storage_key, viewType):
-    view_guids = parse_results["child_guids"]
-
-    data_doc = parse_results["schema"]
-    fields = data_doc["fields"]
-
-    view_doc_guid = guid()
-
-    view_doc = {
-        "_id": view_doc_guid,
-        "fields": {
-            "proto": protofy(data_doc["_id"]),
-            "x": 10,
-            "y": 10,
-            "_width": 900,
-            "_height": 600,
-            "_panX": 0,
-            "_panY": 0,
-            "zIndex": 2,
-            "libraryBrush": False,
-            "_viewType": viewType,
-            "_LODdisable": True
-        },
-        "__type": "Doc"
-    }
-
-    fields["proto"] = protofy(common_proto_id)
-    fields[storage_key] = listify(proxify_guids(view_guids))
-    fields["schemaColumns"] = listify(display_fields)
-    fields["author"] = "Bill Buxton"
-    fields["creationDate"] = {
-        "date": datetime.datetime.utcnow().microsecond,
-        "__type": "date"
-    }
-    if "image_urls" in parse_results:
-        fields["hero"] = {
-            "url": parse_results["image_urls"][0],
-            "__type": "image"
-        }
-    fields["isPrototype"] = True
-
-    target_collection.insert_one(data_doc)
-    target_collection.insert_one(view_doc)
-
-    data_doc_guid = data_doc["_id"]
-    print(f"inserted view document ({view_doc_guid})")
-    print(f"inserted data document ({data_doc_guid})\n")
-
-    return view_doc_guid
-
-
-def write_text_doc(content):
-    data_doc_guid = guid()
-    view_doc_guid = guid()
-
-    view_doc = {
-        "_id": view_doc_guid,
-        "fields": {
-            "proto": protofy(data_doc_guid),
-            "x": 10,
-            "y": 10,
-            "_width": 400,
-            "zIndex": 2
-        },
-        "__type": "Doc"
-    }
-
-    data_doc = {
-        "_id": data_doc_guid,
-        "fields": {
-            "proto": protofy("textProto"),
-            "data": {
-                "Data": '{"doc":{"type":"doc","content":[{"type":"paragraph","content":[{"type":"text","text":"' + content + '"}]}]},"selection":{"type":"text","anchor":1,"head":1}' + '}',
-                "__type": "RichTextField"
-            },
-            "title": content,
-            "_nativeWidth": 200,
-            "author": "Bill Buxton",
-            "creationDate": {
-                "date": datetime.datetime.utcnow().microsecond,
-                "__type": "date"
-            },
-            "isPrototype": True,
-            "_autoHeight": True,
-            "page": -1,
-            "_nativeHeight": 200,
-            "_height": 200,
-            "data_text": content
-        },
-        "__type": "Doc"
-    }
-
-    target_collection.insert_one(view_doc)
-    target_collection.insert_one(data_doc)
-
-    return view_doc_guid
-
-
-def write_image(folder, name):
-    path = f"http://localhost:1050/files/images/buxton/{folder}/{name}"
-
-    data_doc_guid = guid()
-    view_doc_guid = guid()
-
-    image = Image.open(f"{image_dist}/{folder}/{name}")
-    native_width, native_height = image.size
-
-    if abs(native_width - native_height) < 10:
-        return None
-
-    view_doc = {
-        "_id": view_doc_guid,
-        "fields": {
-            "proto": protofy(data_doc_guid),
-            "x": 10,
-            "y": 10,
-            "_width": min(800, native_width),
-            "zIndex": 2,
-            "widthUnit": "*",
-            "widthMagnitude": 1
-        },
-        "__type": "Doc"
-    }
-
-    data_doc = {
-        "_id": data_doc_guid,
-        "fields": {
-            "proto": protofy("imageProto"),
-            "data": {
-                "url": path,
-                "__type": "image"
-            },
-            "title": name,
-            "_nativeWidth": native_width,
-            "author": "Bill Buxton",
-            "creationDate": {
-                "date": datetime.datetime.utcnow().microsecond,
-                "__type": "date"
-            },
-            "isPrototype": True,
-            "page": -1,
-            "_nativeHeight": native_height,
-            "_height": native_height
-        },
-        "__type": "Doc"
-    }
-
-    target_collection.insert_one(view_doc)
-    target_collection.insert_one(data_doc)
-
-    return {
-        "layout_id": view_doc_guid,
-        "url": path
-    }
-
-
-def parse_document(file_name: str):
-    print(f"parsing {file_name}...")
-    pure_name = file_name.split(".")[0]
+def encode_image(folder: str, name: str):
+    with open(f"{temp_images_path}/{folder}/{name}", "rb") as image:
+        encoded = base64.b64encode(image.read())
+        return encoded.decode("utf-8")
+
+
+def parse_document(name: str):
+    print(f"parsing {name}...")
+    pure_name = name.split(".")[0]
     result = {}
-    dir_path = image_dist + "/" + pure_name
-    print(dir_path)
-    mkdir_if_absent(dir_path)
-
-    raw = str(docx2txt.process(source + "/" + file_name, dir_path))
-
-    urls = []
-    view_guids = []
-    count = 0
-    for image in os.listdir(dir_path):
-        created = write_image(pure_name, image)
-        if created != None:
-            urls.append(created["url"])
-            view_guids.append(created["layout_id"])
-            count += 1
-        resolved = dir_path + "/" + image
-        original = dir_path + "/" + image.replace(".", "_o.", 1)
-        medium = dir_path + "/" + image.replace(".", "_m.", 1)
-        copyfile(resolved, original)
-        copyfile(resolved, medium)
-    print(f"extracted {count} images...")
+    saved_device_images_dir = server_images_path + "/" + pure_name
+    temp_device_images_dir = temp_images_path + "/" + pure_name
+    mkdir_if_absent(temp_device_images_dir)
+    mkdir_if_absent(saved_device_images_dir)
+
+    raw = str(docx2txt.process(source_path +
+                               "/" + name, temp_device_images_dir))
+
+    extracted_images = []
+    for image in os.listdir(temp_device_images_dir):
+        temp = f"{temp_device_images_dir}/{image}"
+        native_width, native_height = Image.open(temp).size
+        if abs(native_width - native_height) < 10:
+            continue
+        original = saved_device_images_dir + "/" + image.replace(".", "_o.", 1)
+        medium = saved_device_images_dir + "/" + image.replace(".", "_m.", 1)
+        copyfile(temp, original)
+        copyfile(temp, medium)
+        server_path = f"http://localhost:1050/files/images/buxton/{pure_name}/{image}"
+        extracted_images.append(server_path)
+    result["extracted_images"] = extracted_images
 
     def sanitize(line):
         return re.sub("[\n\t]+", "", line).replace(u"\u00A0", " ").replace(
             u"\u2013", "-").replace(u"\u201c", '''"''').replace(u"\u201d", '''"''').strip()
 
-    def sanitize_price(raw: str):
-        raw = raw.replace(",", "")
-        start = raw.find("$")
+    def sanitize_price(raw_price: str):
+        raw_price = raw_price.replace(",", "")
+        start = raw_price.find("$")
+        if "x" in raw_price.lower():
+            return None
         if start > -1:
             i = start + 1
-            while (i < len(raw) and re.match(r"[0-9\.]", raw[i])):
+            while i < len(raw_price) and re.match(r"[0-9.]", raw_price[i]):
                 i += 1
-            price = raw[start + 1: i + 1]
+            price = raw_price[start + 1: i + 1]
             return float(price)
-        elif (raw.lower().find("nfs")):
+        elif raw_price.lower().find("nfs"):
             return -1
         else:
-            return math.nan
+            return None
 
     def remove_empty(line):
         return len(line) > 1
 
+    def try_parse(to_parse: int):
+        value: int
+        try:
+            value = int(to_parse)
+        except ValueError:
+            value = None
+        return value
+
     lines = list(map(sanitize, raw.split("\n")))
     lines = list(filter(remove_empty, lines))
 
-    result["file_name"] = file_name
     result["title"] = lines[2].strip()
     result["short_description"] = lines[3].strip().replace(
         "Short Description: ", "")
@@ -293,13 +127,15 @@ def parse_document(file_name: str):
     clean = list(
         map(lambda data: data.strip().split(":"), lines[cur].split("|")))
     result["company"] = clean[0][len(clean[0]) - 1].strip()
-    result["year"] = clean[1][len(clean[1]) - 1].strip()
+
+    result["year"] = try_parse(clean[1][len(clean[1]) - 1].strip())
     result["original_price"] = sanitize_price(
         clean[2][len(clean[2]) - 1].strip())
 
     cur += 1
-    result["degrees_of_freedom"] = extract_value(
-        lines[cur]).replace("NA", "N/A")
+
+    result["degrees_of_freedom"] = try_parse(extract_value(
+        lines[cur]).replace("NA", "N/A"))
     cur += 1
 
     dimensions = lines[cur].lower()
@@ -325,99 +161,71 @@ def parse_document(file_name: str):
     cur += 1
     link_descriptions = []
     while lines[cur] != "Image":
-        link_descriptions.append(lines[cur].strip())
+        description = lines[cur].strip().lower()
+        valid = True
+        for ignored in ["powerpoint", "vimeo", "xxx"]:
+            if ignored in description:
+                valid = False
+                break
+        if valid:
+            link_descriptions.append(description)
        cur += 1
-    result["link_descriptions"] = text_doc_map(link_descriptions)
+    result["link_descriptions"] = link_descriptions
 
-    result["hyperlinks"] = extract_links(source + "/" + file_name)
+    result["hyperlinks"] = extract_links(source_path + "/" + name)
 
     images = []
     captions = []
     cur += 3
     while cur + 1 < len(lines) and lines[cur] != "NOTES:":
-        images.append(lines[cur])
-        captions.append(lines[cur + 1])
+        name = lines[cur]
+        if "full document" not in name.lower():
+            images.append(name)
+            captions.append(lines[cur + 1])
         cur += 2
-    result["images"] = listify(images)
+    result["table_image_names"] = images
 
-    result["captions"] = text_doc_map(captions)
+    result["captions"] = captions
 
     notes = []
-    if (cur < len(lines) and lines[cur] == "NOTES:"):
+    if cur < len(lines) and lines[cur] == "NOTES:":
        cur += 1
        while cur < len(lines):
            notes.append(lines[cur])
            cur += 1
     if len(notes) > 0:
-        result["notes"] = listify(notes)
-
-    print("writing child schema...")
-
-    return {
-        "schema": {
-            "_id": guid(),
-            "fields": result,
-            "__type": "Doc"
-        },
-        "child_guids": view_guids,
-        "image_urls": urls
-    }
-
-
-def proxify_guids(guids):
-    return list(map(lambda guid: {"fieldId": guid, "__type": "prefetch_proxy"}, guids))
-
-
-def write_common_proto():
-    id = guid()
-    common_proto = {
-        "_id": id,
-        "fields": {
-            "proto": protofy("collectionProto"),
-            "title": "The Buxton Collection",
-        },
-        "__type": "Doc"
-    }
-    target_collection.insert_one(common_proto)
-    return id
-
-
-if os.path.exists(image_dist):
-    shutil.rmtree(image_dist)
-while os.path.exists(image_dist):
+        result["notes"] = notes
+
+    return result
+
+
+if os.path.exists(server_images_path):
+    shutil.rmtree(server_images_path)
+while os.path.exists(server_images_path):
     pass
-os.mkdir(image_dist)
-mkdir_if_absent(source)
+os.mkdir(server_images_path)
 
-common_proto_id = write_common_proto()
+mkdir_if_absent(source_path)
+mkdir_if_absent(json_path)
+mkdir_if_absent(temp_images_path)
+
+results = []
 candidates = 0
-for file_name in os.listdir(source):
-    if file_name.endswith('.docx'):
+for file_name in os.listdir(source_path):
+    if file_name.endswith('.docx') or file_name.endswith(".doc"):
         candidates += 1
-        schema_guids.append(write_collection(
-            parse_document(file_name), ["title", "data"], "data", 5))
-
-print("writing parent schema...")
-parent_guid = write_collection({
-    "schema": {
-        "_id": guid(),
-        "fields": {},
-        "__type": "Doc"
-    },
-    "child_guids": schema_guids
-}, ["title", "short_description", "original_price"], "data", 2)
-
-print("appending parent schema to main workspace...\n")
-target_collection.update_one(
-    {"fields.title": target_doc_title},
-    {"$push": {"fields.data.fields": {"fieldId": parent_guid, "__type": "proxy"}}}
-)
-
-print("rewriting .gitignore...\n")
-lines = ['*', '!.gitignore']
-with open(filesPath + "/.gitignore", 'w') as f:
-    f.write('\n'.join(lines))
-
-suffix = "" if candidates == 1 else "s"
-print(f"conversion complete. {candidates} candidate{suffix} processed.")
+        results.append(parse_document(file_name))
+
+
+with open(f"./json/buxton_collection.json", "w", encoding="utf-8") as out:
+    json.dump(results, out, ensure_ascii=False, indent=4)
+
+print(f"\nSuccessfully parsed {candidates} candidates.")
+
+print("\nrewriting .gitignore...")
+entries = ['*', '!.gitignore']
+with open(files_path + "/.gitignore", 'w') as f:
+    f.write('\n'.join(entries))
+
+shutil.rmtree(temp_images_path)