author | Sam Wilkins <samuel_wilkins@brown.edu> | 2019-06-18 22:02:58 -0400
---|---|---
committer | Sam Wilkins <samuel_wilkins@brown.edu> | 2019-06-18 22:02:58 -0400
commit | 4bec1d89eff45d6dcbb4041bc211db88d9da1c8f (patch) |
tree | f2aa63e3202e2eab05eda2fa998661853e832cdc /src/buxton/scraper.py |
parent | de0304b2966ebdede9d9db8c510e19020046115c (diff) |
fixed serialization typo and added draft of Python Word doc scraper to git directory for safety
Diffstat (limited to 'src/buxton/scraper.py')
-rw-r--r-- | src/buxton/scraper.py | 138 |
1 file changed, 138 insertions, 0 deletions
```diff
diff --git a/src/buxton/scraper.py b/src/buxton/scraper.py
new file mode 100644
index 000000000..0abebb485
--- /dev/null
+++ b/src/buxton/scraper.py
@@ -0,0 +1,138 @@
+import os
+import docx2txt
+from docx import Document
+from docx.opc.constants import RELATIONSHIP_TYPE as RT
+import re
+from pymongo import MongoClient
+import shutil
+import uuid
+
+source = "./source"
+dist = "./Dash-Web/src/server/public/files"
+
+collection_handle = MongoClient("localhost", 27017)["Dash"]["buxton"]
+
+# collect the targets of all external hyperlinks in the document,
+# skipping links to .aspx pages
+def extract_links(fileName):
+    links = []
+    doc = Document(fileName)
+    rels = doc.part.rels
+    for rel in rels:
+        item = rels[rel]
+        if item.reltype == RT.HYPERLINK and ".aspx" not in item._target:
+            links.append(item._target)
+    return links
+
+# pull the value out of a "key: value" line
+def extract_value(kv_string):
+    return kv_string.split(":")[1].strip()
+
+def mkdir_if_absent(path):
+    try:
+        if not os.path.exists(path):
+            os.mkdir(path)
+    except OSError:
+        print("Failed to create the appropriate directory structures for %s" % path)
+
+# walk the sanitized text of one Word document, reading off the
+# fields of the device record it describes
+def parse_document(file_name: str):
+    result = {}
+    pure_name = file_name.split(".")[0]
+
+    dir_path = dist + "/" + pure_name
+    mkdir_if_absent(dir_path)
+
+    # docx2txt returns the document text and writes embedded images into dir_path
+    raw = str(docx2txt.process(source + "/" + file_name, dir_path))
+
+    # collapse whitespace runs and normalize non-breaking spaces, en dashes and smart quotes
+    sanitize = lambda line: re.sub("[\n\t]+", "", line).strip().replace(u"\u00A0", " ").replace(u"\u2013", "-").replace(u"\u201c", '"').replace(u"\u201d", '"')
+    remove_empty = lambda line: len(line) > 1
+
+    lines = list(map(sanitize, raw.split("\n")))
+    lines = list(filter(remove_empty, lines))
+
+    result["file_name"] = file_name
+    result["title"] = lines[2]
+    result["short_description"] = lines[3].replace("Short Description: ", "")
+
+    # everything up to the "Device Details" heading is Buxton's free-form commentary
+    cur = 5
+    notes = ""
+    while lines[cur] != "Device Details":
+        notes += lines[cur] + " "
+        cur += 1
+    result["buxton_notes"] = notes.strip()
+
+    # the details line has the form "Company: ... | Year: ... | Original Price: ..."
+    cur += 1
+    clean = list(map(lambda data: data.strip().split(":"), lines[cur].split("|")))
+    result["company"] = clean[0][1].strip()
+    result["year"] = clean[1][1].strip()
+    result["original_price"] = clean[2][1].strip()
+
+    cur += 1
+    result["degrees_of_freedom"] = extract_value(lines[cur])
+    cur += 1
+    result["dimensions"] = extract_value(lines[cur])
+
+    cur += 2
+    result["primary_key"] = extract_value(lines[cur])
+    cur += 1
+    result["secondary_key"] = extract_value(lines[cur])
+
+    result["hyperlinks"] = extract_links(source + "/" + file_name)
+
+    cur += 2
+    link_descriptions = []
+    while lines[cur] != "Image":
+        link_descriptions.append(lines[cur])
+        cur += 1
+    result["link_descriptions"] = link_descriptions
+
+    # image file names and their captions alternate until the trailing notes section
+    images = []
+    captions = []
+    cur += 3
+    while cur + 1 < len(lines) and lines[cur] != "NOTES:":
+        images.append(lines[cur])
+        captions.append(lines[cur + 1])
+        cur += 2
+    result["images"] = images
+    result["captions"] = captions
+
+    notes = []
+    if cur < len(lines) and lines[cur] == "NOTES:":
+        cur += 1
+        while cur < len(lines):
+            notes.append(lines[cur])
+            cur += 1
+    result["notes"] = notes
+
+    return result
+
+# wrap the parsed fields in the envelope Dash expects and insert it
+def upload(document):
+    wrapper = {}
+    wrapper["_id"] = str(uuid.uuid4())
+    wrapper["fields"] = document
+    wrapper["__type"] = "Doc"
+    collection_handle.insert_one(wrapper)
+
+# start from a clean output directory, spinning until the deletion is visible
+if os.path.exists(dist):
+    shutil.rmtree(dist)
+while os.path.exists(dist):
+    pass
+os.mkdir(dist)
+
+for file_name in os.listdir(source):
+    if file_name.endswith('.docx'):
+        upload(parse_document(file_name))
+
+# keep the generated files out of version control, but track the .gitignore itself
+lines = ['*', '!.gitignore']
+with open(dist + "/.gitignore", 'w') as f:
+    f.write('\n'.join(lines))
```
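For reference, each record that `upload` inserts has the shape sketched below; the field names come straight from `parse_document`, but the values are illustrative placeholders, not data from a real document.

```python
# Shape of one inserted record, as built by upload() around the dict
# returned from parse_document(). All values shown are placeholders.
{
    "_id": "1b9d6bcd-bbfd-4b2d-9b5d-ab8dfbbd4bed",  # fresh uuid4 per record
    "__type": "Doc",  # type tag stored alongside the fields
    "fields": {
        "file_name": "example_device.docx",
        "title": "...",
        "short_description": "...",
        "buxton_notes": "...",
        "company": "...",
        "year": "...",
        "original_price": "...",
        "degrees_of_freedom": "...",
        "dimensions": "...",
        "primary_key": "...",
        "secondary_key": "...",
        "hyperlinks": ["..."],
        "link_descriptions": ["..."],
        "images": ["..."],
        "captions": ["..."],
        "notes": ["..."],
    },
}
```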
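And a minimal sketch for sanity-checking a run, assuming the same local mongod on the default port and the database/collection names hard-coded in the script:

```python
# Print one summary line per scraped device. Assumes mongod is running on
# localhost:27017 and scraper.py has already populated Dash.buxton.
from pymongo import MongoClient

collection = MongoClient("localhost", 27017)["Dash"]["buxton"]

for doc in collection.find():
    fields = doc["fields"]  # the dict produced by parse_document()
    print("%s (%s, %s): %d image(s)" % (
        fields["title"], fields["company"], fields["year"], len(fields["images"])))
```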