author | Sam Wilkins <samuel_wilkins@brown.edu> | 2019-06-18 22:02:58 -0400
---|---|---
committer | Sam Wilkins <samuel_wilkins@brown.edu> | 2019-06-18 22:02:58 -0400
commit | 4bec1d89eff45d6dcbb4041bc211db88d9da1c8f (patch) |
tree | f2aa63e3202e2eab05eda2fa998661853e832cdc /src/buxton/scraper.py |
parent | de0304b2966ebdede9d9db8c510e19020046115c (diff) |
fixed serialization typo and added draft of Python Word doc scraper to git directory for safety
Diffstat (limited to 'src/buxton/scraper.py')
-rw-r--r-- | src/buxton/scraper.py | 138 |
1 file changed, 138 insertions, 0 deletions
```diff
diff --git a/src/buxton/scraper.py b/src/buxton/scraper.py
new file mode 100644
index 000000000..0abebb485
--- /dev/null
+++ b/src/buxton/scraper.py
@@ -0,0 +1,138 @@
+import os
+import docx2txt
+from docx import Document
+from docx.opc.constants import RELATIONSHIP_TYPE as RT
+import re
+from pymongo import MongoClient
+import shutil
+import uuid
+
+source = "./source"
+dist = "./Dash-Web/src/server/public/files"
+
+collection_handle = MongoClient("localhost", 27017)["Dash"]["buxton"]
+
+# collect the targets of all external hyperlinks in the document,
+# skipping links to .aspx pages
+def extract_links(fileName):
+    links = []
+    doc = Document(fileName)
+    rels = doc.part.rels
+    for rel in rels:
+        item = rels[rel]
+        if item.reltype == RT.HYPERLINK and ".aspx" not in item._target:
+            links.append(item._target)
+    return links
+
+# pull the value out of a "key: value" line
+def extract_value(kv_string):
+    return kv_string.split(":")[1].strip()
+
+def mkdir_if_absent(path):
+    try:
+        if not os.path.exists(path):
+            os.mkdir(path)
+    except OSError:
+        print("Failed to create the appropriate directory structures for %s" % path)
+
+# walk the sanitized text of one Word document, reading off the
+# fields of the device record it describes
+def parse_document(file_name: str):
+    result = {}
+    pure_name = file_name.split(".")[0]
+
+    dir_path = dist + "/" + pure_name
+    mkdir_if_absent(dir_path)
+
+    # docx2txt returns the document text and writes embedded images into dir_path
+    raw = str(docx2txt.process(source + "/" + file_name, dir_path))
+
+    # collapse whitespace runs and normalize non-breaking spaces, en dashes and smart quotes
+    sanitize = lambda line: re.sub("[\n\t]+", "", line).strip().replace(u"\u00A0", " ").replace(u"\u2013", "-").replace(u"\u201c", '"').replace(u"\u201d", '"')
+    remove_empty = lambda line: len(line) > 1
+
+    lines = list(map(sanitize, raw.split("\n")))
+    lines = list(filter(remove_empty, lines))
+
+    result["file_name"] = file_name
+    result["title"] = lines[2]
+    result["short_description"] = lines[3].replace("Short Description: ", "")
+
+    # everything up to the "Device Details" heading is Buxton's free-form commentary
+    cur = 5
+    notes = ""
+    while lines[cur] != "Device Details":
+        notes += lines[cur] + " "
+        cur += 1
+    result["buxton_notes"] = notes.strip()
+
+    # the details line has the form "Company: ... | Year: ... | Original Price: ..."
+    cur += 1
+    clean = list(map(lambda data: data.strip().split(":"), lines[cur].split("|")))
+    result["company"] = clean[0][1].strip()
+    result["year"] = clean[1][1].strip()
+    result["original_price"] = clean[2][1].strip()
+
+    cur += 1
+    result["degrees_of_freedom"] = extract_value(lines[cur])
+    cur += 1
+    result["dimensions"] = extract_value(lines[cur])
+
+    cur += 2
+    result["primary_key"] = extract_value(lines[cur])
+    cur += 1
+    result["secondary_key"] = extract_value(lines[cur])
+
+    result["hyperlinks"] = extract_links(source + "/" + file_name)
+
+    cur += 2
+    link_descriptions = []
+    while lines[cur] != "Image":
+        link_descriptions.append(lines[cur])
+        cur += 1
+    result["link_descriptions"] = link_descriptions
+
+    # image file names and their captions alternate until the trailing notes section
+    images = []
+    captions = []
+    cur += 3
+    while cur + 1 < len(lines) and lines[cur] != "NOTES:":
+        images.append(lines[cur])
+        captions.append(lines[cur + 1])
+        cur += 2
+    result["images"] = images
+    result["captions"] = captions
+
+    notes = []
+    if cur < len(lines) and lines[cur] == "NOTES:":
+        cur += 1
+        while cur < len(lines):
+            notes.append(lines[cur])
+            cur += 1
+    result["notes"] = notes
+
+    return result
+
+# wrap the parsed fields in the envelope Dash expects and insert it
+def upload(document):
+    wrapper = {}
+    wrapper["_id"] = str(uuid.uuid4())
+    wrapper["fields"] = document
+    wrapper["__type"] = "Doc"
+    collection_handle.insert_one(wrapper)
+
+# start from a clean output directory, spinning until the deletion is visible
+if os.path.exists(dist):
+    shutil.rmtree(dist)
+while os.path.exists(dist):
+    pass
+os.mkdir(dist)
+
+for file_name in os.listdir(source):
+    if file_name.endswith('.docx'):
+        upload(parse_document(file_name))
+
+# keep the generated files out of version control, but track the .gitignore itself
+lines = ['*', '!.gitignore']
+with open(dist + "/.gitignore", 'w') as f:
+    f.write('\n'.join(lines))
```
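For reference, each record that `upload` inserts has the shape sketched below; the field names come straight from `parse_document`, but the values are illustrative placeholders, not data from a real document.

```python
# Shape of one inserted record, as built by upload() around the dict
# returned from parse_document(). All values shown are placeholders.
{
    "_id": "1b9d6bcd-bbfd-4b2d-9b5d-ab8dfbbd4bed",  # fresh uuid4 per record
    "__type": "Doc",  # type tag stored alongside the fields
    "fields": {
        "file_name": "example_device.docx",
        "title": "...",
        "short_description": "...",
        "buxton_notes": "...",
        "company": "...",
        "year": "...",
        "original_price": "...",
        "degrees_of_freedom": "...",
        "dimensions": "...",
        "primary_key": "...",
        "secondary_key": "...",
        "hyperlinks": ["..."],
        "link_descriptions": ["..."],
        "images": ["..."],
        "captions": ["..."],
        "notes": ["..."],
    },
}
```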
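And a minimal sketch for sanity-checking a run, assuming the same local mongod on the default port and the database/collection names hard-coded in the script:

```python
# Print one summary line per scraped device. Assumes mongod is running on
# localhost:27017 and scraper.py has already populated Dash.buxton.
from pymongo import MongoClient

collection = MongoClient("localhost", 27017)["Dash"]["buxton"]

for doc in collection.find():
    fields = doc["fields"]  # the dict produced by parse_document()
    print("%s (%s, %s): %d image(s)" % (
        fields["title"], fields["company"], fields["year"], len(fields["images"])))
```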