Diffstat (limited to 'src/buxton/scraper.py')
 src/buxton/scraper.py | 128 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 128 insertions(+)
diff --git a/src/buxton/scraper.py b/src/buxton/scraper.py
new file mode 100644
index 000000000..0abebb485
--- /dev/null
+++ b/src/buxton/scraper.py
@@ -0,0 +1,128 @@
+import os
+import re
+import shutil
+import uuid
+
+import docx2txt
+from docx import Document
+from docx.opc.constants import RELATIONSHIP_TYPE as RT
+from pymongo import MongoClient
+
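+# input .docx files are read from ./source; extracted text and images are
+# written under the Dash server's public files directory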
+source = "./source"
+dist = "./Dash-Web/src/server/public/files"
+
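+# handle to the "buxton" collection in the local Dash MongoDB instance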
+collection_handle = MongoClient("localhost", 27017)["Dash"]["buxton"]
+
+def extract_links(file_name):
+    links = []
+    doc = Document(file_name)
+ rels = doc.part.rels
+ for rel in rels:
+ item = rels[rel]
+ if item.reltype == RT.HYPERLINK and ".aspx" not in item._target:
+ links.append(item._target)
+ return links
+
+def extract_value(kv_string):
+    # split on the first colon only, so values that themselves contain ":" stay intact
+    return kv_string.split(":", 1)[1].strip()
+
+def mkdir_if_absent(path):
+ try:
+ if not os.path.exists(path):
+ os.mkdir(path)
+ except OSError:
+        print("Failed to create the appropriate directory structures for %s" % path)
+
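+# parse_document relies on the fixed layout of the Buxton device write-ups,
+# roughly (indices after sanitizing and filtering out near-empty lines):
+# [2] title, [3] "Short Description: ...", [5..] free-form notes ending at a
+# "Device Details" header, a "Company: ... | Year: ... | Original Price: ..."
+# line, "key: value" lines for degrees of freedom, dimensions, and the
+# primary/secondary keys, hyperlink descriptions ending at "Image",
+# alternating image/caption lines, and an optional trailing "NOTES:" section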
+def parse_document(file_name: str):
+ result = {}
+    pure_name = os.path.splitext(file_name)[0]
+
+ dir_path = dist + "/" + pure_name
+ mkdir_if_absent(dir_path)
+
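+    # docx2txt returns the document text and saves any embedded images into dir_path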
+ raw = str(docx2txt.process(source + "/" + file_name, dir_path))
+
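+    # normalize whitespace and Word's typographic characters, then drop noise lines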
+    sanitize = lambda line: (re.sub("[\n\t]+", "", line).strip()
+        .replace("\u00A0", " ").replace("\u2013", "-")
+        .replace("\u201c", '"').replace("\u201d", '"'))
+    remove_empty = lambda line: len(line) > 1
+
+ lines = list(map(sanitize, raw.split("\n")))
+ lines = list(filter(remove_empty, lines))
+
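+    # the title and short description sit at fixed indices in every write-up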
+ result["file_name"] = file_name
+ result["title"] = lines[2]
+ result["short_description"] = lines[3].replace("Short Description: ", "")
+
+    cur = 5
+    notes = ""
+    # free-form notes continue until the "Device Details" header
+    while cur < len(lines) and lines[cur] != "Device Details":
+        notes += lines[cur] + " "
+        cur += 1
+    result["buxton_notes"] = notes.strip()
+
+ cur += 1
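+    # this line has the form "Company: ... | Year: ... | Original Price: ..."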
+ clean = list(map(lambda data: data.strip().split(":"), lines[cur].split("|")))
+ result["company"] = clean[0][1].strip()
+ result["year"] = clean[1][1].strip()
+ result["original_price"] = clean[2][1].strip()
+
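+    # the remaining labeled fields are plain "key: value" lines; the cur += 2
+    # jumps presumably skip intervening section headers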
+ cur += 1
+ result["degrees_of_freedom"] = extract_value(lines[cur])
+ cur += 1
+ result["dimensions"] = extract_value(lines[cur])
+
+ cur += 2
+ result["primary_key"] = extract_value(lines[cur])
+ cur += 1
+ result["secondary_key"] = extract_value(lines[cur])
+
+ result["hyperlinks"] = extract_links(source + "/" + file_name)
+
+    cur += 2
+    link_descriptions = []
+    # one description per hyperlink, terminated by the "Image" header
+    while cur < len(lines) and lines[cur] != "Image":
+        link_descriptions.append(lines[cur])
+        cur += 1
+    result["link_descriptions"] = link_descriptions
+
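+    # image file names and their captions alternate until the optional "NOTES:" header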
+ images = []
+ captions = []
+ cur += 3
+ while cur + 1 < len(lines) and lines[cur] != "NOTES:":
+ images.append(lines[cur])
+ captions.append(lines[cur + 1])
+ cur += 2
+ result["images"] = images
+ result["captions"] = captions
+
+ notes = []
+    if cur < len(lines) and lines[cur] == "NOTES:":
+ cur += 1
+ while cur < len(lines):
+ notes.append(lines[cur])
+ cur += 1
+ result["notes"] = notes
+
+ return result
+
+def upload(document):
+    wrapper = {
+        "_id": str(uuid.uuid4()),
+        "fields": document,
+        "__type": "Doc",
+    }
+    collection_handle.insert_one(wrapper)
+
+# rebuild the output directory from scratch on every run
+if os.path.exists(dist):
+    shutil.rmtree(dist)
+while os.path.exists(dist):
+    # rmtree can return before the directory is fully gone on some platforms
+    pass
+os.mkdir(dist)
+
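+# parse and upload every Word document in the source directory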
+for file_name in os.listdir(source):
+ if file_name.endswith('.docx'):
+ upload(parse_document(file_name))
+
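+# keep the generated files directory out of version control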
+lines = ['*', '!.gitignore']
+with open(dist + "/.gitignore", 'w') as f:
+ f.write('\n'.join(lines))
+