author     Sam Wilkins <samwilkins333@gmail.com>  2020-02-05 16:23:24 -0500
committer  Sam Wilkins <samwilkins333@gmail.com>  2020-02-05 16:23:24 -0500
commit     79e93e4bb852267f53710d865190c45041127538 (patch)
tree       5fe6ea4ba2990543e1da59a4dc3a6d177ac0ae71 /src/scraping/buxton/jsonifier.py
parent     23190ee16a7f54474060ea87767f5336242d03d7 (diff)
jsonifier
Diffstat (limited to 'src/scraping/buxton/jsonifier.py')
-rw-r--r--  src/scraping/buxton/jsonifier.py  231
1 file changed, 231 insertions, 0 deletions
diff --git a/src/scraping/buxton/jsonifier.py b/src/scraping/buxton/jsonifier.py
new file mode 100644
index 000000000..a315d49c0
--- /dev/null
+++ b/src/scraping/buxton/jsonifier.py
@@ -0,0 +1,231 @@
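+# Scrape the Buxton device write-ups (.docx) in ./source into a single JSON collection,
+# copying each document's embedded images into the server's public files directory.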
+import os
+import docx2txt
+from docx import Document
+from docx.opc.constants import RELATIONSHIP_TYPE as RT
+import re
+import shutil
+import uuid
+import json
+import base64
+from shutil import copyfile
+from PIL import Image
+
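+# Filesystem locations: Word sources, temporary image extraction, served images, and JSON output.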
+files_path = "../../server/public/files"
+source_path = "./source"
+temp_images_path = "./extracted_images"
+server_images_path = f"{files_path}/images/buxton"
+json_path = "./json"
+
+
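+# Collect the external hyperlink targets embedded in a .docx, ignoring .aspx targets.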
+# noinspection PyProtectedMember
+def extract_links(file):
+ links = []
+ doc = Document(file)
+ rels = doc.part.rels
+ for rel in rels:
+ item = rels[rel]
+ if item.reltype == RT.HYPERLINK and ".aspx" not in item._target:
+ links.append(item._target)
+ return links
+
+
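+# Return the value from a "key: value" line, or the whole line if it has no colon.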
+def extract_value(kv_string):
+ pieces = kv_string.split(":")
+ return (pieces[1] if len(pieces) > 1 else kv_string).strip()
+
+
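+# Create a directory if it does not already exist, reporting (but not raising) any OSError.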
+def mkdir_if_absent(path):
+ try:
+ if not os.path.exists(path):
+ os.mkdir(path)
+ except OSError:
+ print("failed to create the appropriate directory structures for %s" % file_name)
+
+
+def guid():
+ return str(uuid.uuid4())
+
+
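+# Base64-encode a previously extracted image (currently unused by the script below).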
+def encode_image(folder: str, name: str):
+ with open(f"{temp_images_path}/{folder}/{name}", "rb") as image:
+ encoded = base64.b64encode(image.read())
+ return encoded.decode("utf-8")
+
+
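+# Convert one Word document into a dictionary of metadata, image references, and notes.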
+def parse_document(name: str):
+ print(f"parsing {name}...")
+ pure_name = name.split(".")[0]
+
+ result = {}
+
+ saved_device_images_dir = server_images_path + "/" + pure_name
+ temp_device_images_dir = temp_images_path + "/" + pure_name
+ mkdir_if_absent(temp_device_images_dir)
+ mkdir_if_absent(saved_device_images_dir)
+
+ raw = str(docx2txt.process(source_path +
+ "/" + name, temp_device_images_dir))
+
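+    # Copy each extracted image to the server directory as original (_o) and medium (_m) variants,
+    # skipping near-square images (width within 10px of height), which are likely icons or logos.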
+ extracted_images = []
+ for image in os.listdir(temp_device_images_dir):
+ temp = f"{temp_device_images_dir}/{image}"
+ native_width, native_height = Image.open(temp).size
+ if abs(native_width - native_height) < 10:
+ continue
+ original = saved_device_images_dir + "/" + image.replace(".", "_o.", 1)
+ medium = saved_device_images_dir + "/" + image.replace(".", "_m.", 1)
+ copyfile(temp, original)
+ copyfile(temp, medium)
+ server_path = f"http://localhost:1050/files/images/buxton/{pure_name}/{image}"
+ extracted_images.append(server_path)
+ result["extracted_images"] = extracted_images
+
+    def sanitize(line):
+        return re.sub("[\n\t]+", "", line).replace(u"\u00A0", " ").replace(
+            u"\u2013", "-").replace(u"\u201c", '"').replace(u"\u201d", '"').strip()
+
+    def sanitize_price(raw_price: str):
+        raw_price = raw_price.replace(",", "")
+        start = raw_price.find("$")
+        if "x" in raw_price.lower():
+            return None
+        if start > -1:
+            i = start + 1
+            while i < len(raw_price) and re.match(r"[0-9.]", raw_price[i]):
+                i += 1
+            price = raw_price[start + 1: i]
+            return float(price)
+        elif "nfs" in raw_price.lower():
+            # "NFS" (not for sale) maps to a sentinel of -1
+            return -1
+        else:
+            return None
+
+ def remove_empty(line): return len(line) > 1
+
+    def try_parse(to_parse: str):
+        try:
+            return int(to_parse)
+        except ValueError:
+            return None
+
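+    # Normalize and drop empty lines, then walk the document section by section
+    # (title, details, key words, links, image table, notes).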
+ lines = list(map(sanitize, raw.split("\n")))
+ lines = list(filter(remove_empty, lines))
+
+ result["title"] = lines[2].strip()
+ result["short_description"] = lines[3].strip().replace(
+ "Short Description: ", "")
+
+ cur = 5
+ notes = ""
+ while lines[cur] != "Device Details":
+ notes += lines[cur] + " "
+ cur += 1
+ result["buxton_notes"] = notes.strip()
+
+ cur += 1
+ clean = list(
+ map(lambda data: data.strip().split(":"), lines[cur].split("|")))
+ result["company"] = clean[0][len(clean[0]) - 1].strip()
+
+ result["year"] = try_parse(clean[1][len(clean[1]) - 1].strip())
+ result["original_price"] = sanitize_price(
+ clean[2][len(clean[2]) - 1].strip())
+
+ cur += 1
+
+ result["degrees_of_freedom"] = try_parse(extract_value(
+ lines[cur]).replace("NA", "N/A"))
+ cur += 1
+
+ dimensions = lines[cur].lower()
+ if dimensions.startswith("dimensions"):
+ dim_concat = dimensions[11:].strip()
+ cur += 1
+ while lines[cur] != "Key Words":
+ dim_concat += (" " + lines[cur].strip())
+ cur += 1
+ result["dimensions"] = dim_concat
+ else:
+ result["dimensions"] = "N/A"
+
+ cur += 1
+ result["primary_key"] = extract_value(lines[cur])
+ cur += 1
+ result["secondary_key"] = extract_value(lines[cur])
+
+ while lines[cur] != "Links":
+ result["secondary_key"] += (" " + extract_value(lines[cur]).strip())
+ cur += 1
+
+ cur += 1
+ link_descriptions = []
+ while lines[cur] != "Image":
+ description = lines[cur].strip().lower()
+ valid = True
+ for ignored in ["powerpoint", "vimeo", "xxx"]:
+ if ignored in description:
+ valid = False
+ break
+ if valid:
+ link_descriptions.append(description)
+ cur += 1
+ result["link_descriptions"] = link_descriptions
+
+ result["hyperlinks"] = extract_links(source_path + "/" + name)
+
+ images = []
+ captions = []
+ cur += 3
+    # Table rows alternate image name / caption; skip the "full document" entry.
+    while cur + 1 < len(lines) and lines[cur] != "NOTES:":
+        image_name = lines[cur]
+        if "full document" not in image_name.lower():
+            images.append(image_name)
+            captions.append(lines[cur + 1])
+        cur += 2
+ result["table_image_names"] = images
+
+ result["captions"] = captions
+
+ notes = []
+ if cur < len(lines) and lines[cur] == "NOTES:":
+ cur += 1
+ while cur < len(lines):
+ notes.append(lines[cur])
+ cur += 1
+ if len(notes) > 0:
+ result["notes"] = notes
+
+ return result
+
+
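+# Rebuild the served image directory from scratch; the wait loop guards against the
+# deletion not yet being visible when the directory is recreated.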
+if os.path.exists(server_images_path):
+ shutil.rmtree(server_images_path)
+while os.path.exists(server_images_path):
+ pass
+os.mkdir(server_images_path)
+
+mkdir_if_absent(source_path)
+mkdir_if_absent(json_path)
+mkdir_if_absent(temp_images_path)
+
+results = []
+
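+# Parse every Word document in the source directory (note: python-docx and docx2txt
+# only handle .docx, not legacy .doc files).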
+candidates = 0
+for file_name in os.listdir(source_path):
+    if file_name.endswith((".docx", ".doc")):
+ candidates += 1
+ results.append(parse_document(file_name))
+
+
+with open(f"./json/buxton_collection.json", "w", encoding="utf-8") as out:
+ json.dump(results, out, ensure_ascii=False, indent=4)
+
+print(f"\nSuccessfully parsed {candidates} candidates.")
+
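+# Ensure the generated files directory stays out of version control.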
+print("\nrewriting .gitignore...")
+entries = ['*', '!.gitignore']
+with open(files_path + "/.gitignore", 'w') as f:
+ f.write('\n'.join(entries))
+
+shutil.rmtree(temp_images_path)