diff options
author | Sam Wilkins <samwilkins333@gmail.com> | 2019-06-19 02:19:03 -0400 |
---|---|---|
committer | Sam Wilkins <samwilkins333@gmail.com> | 2019-06-19 02:19:03 -0400 |
commit | 5147528ef76ed069d7c5f1fc1feb7404c92227bc (patch) | |
tree | 4d5021bfc398c87e8675c1b4ea880e7c50a494f2 /src/buxton/scraper.py | |
parent | 4bec1d89eff45d6dcbb4041bc211db88d9da1c8f (diff) |
first pass at all documents, improved scraping for handling variation
Diffstat (limited to 'src/buxton/scraper.py')
-rw-r--r-- | src/buxton/scraper.py | 69 |
1 files changed, 46 insertions, 23 deletions
diff --git a/src/buxton/scraper.py b/src/buxton/scraper.py index 0abebb485..854c99379 100644 --- a/src/buxton/scraper.py +++ b/src/buxton/scraper.py @@ -8,9 +8,12 @@ import shutil import uuid source = "./source" -dist = "./Dash-Web/src/server/public/files" +dist = "../server/public/files" + +db = MongoClient("localhost", 27017)["Dash"] +db.buxton.drop() +collection_handle = db.buxton -collection_handle = MongoClient("localhost", 27017)["Dash"]["buxton"] def extract_links(fileName): links = [] @@ -22,8 +25,11 @@ def extract_links(fileName): links.append(item._target) return links + def extract_value(kv_string): - return kv_string.split(":")[1].strip() + pieces = kv_string.split(":") + return (pieces[1] if len(pieces) > 1 else kv_string).strip() + def mkdir_if_absent(path): try: @@ -32,6 +38,7 @@ def mkdir_if_absent(path): except OSError: print("Failed to create the appropriate directory structures for %s" % file_name) + def parse_document(file_name: str): result = {} pure_name = file_name.split(".")[0] @@ -41,15 +48,18 @@ def parse_document(file_name: str): raw = str(docx2txt.process(source + "/" + file_name, dir_path)) - sanitize = lambda line: re.sub("[\n\t]+", "", line).strip().replace(u"\u00A0", " ").replace(u"\u2013", "-").replace(u"\u201c", '''"''').replace(u"\u201d", '''"''') - remove_empty = lambda line: len(line) > 1 + def sanitize(line): return re.sub("[\n\t]+", "", line).replace(u"\u00A0", " ").replace( + u"\u2013", "-").replace(u"\u201c", '''"''').replace(u"\u201d", '''"''').strip() + + def remove_empty(line): return len(line) > 1 lines = list(map(sanitize, raw.split("\n"))) lines = list(filter(remove_empty, lines)) result["file_name"] = file_name - result["title"] = lines[2] - result["short_description"] = lines[3].replace("Short Description: ", "") + result["title"] = lines[2].strip() + result["short_description"] = lines[3].strip().replace( + "Short Description: ", "") cur = 5 notes = "" @@ -57,32 +67,44 @@ def parse_document(file_name: str): notes += lines[cur] + " " cur += 1 result["buxton_notes"] = notes.strip() - + cur += 1 - clean = list(map(lambda data: data.strip().split(":"), lines[cur].split("|"))) - result["company"] = clean[0][1].strip() - result["year"] = clean[1][1].strip() - result["original_price"] = clean[2][1].strip() + clean = list( + map(lambda data: data.strip().split(":"), lines[cur].split("|"))) + result["company"] = clean[0][len(clean[0]) - 1].strip() + result["year"] = clean[1][len(clean[1]) - 1].strip() + result["original_price"] = clean[2][len(clean[2]) - 1].strip() cur += 1 result["degrees_of_freedom"] = extract_value(lines[cur]) cur += 1 - result["dimensions"] = extract_value(lines[cur]) - cur += 2 + dimensions = lines[cur].lower() + if dimensions.startswith("dimensions"): + result["dimensions"] = dimensions[11:].strip() + cur += 1 + while lines[cur] != "Key Words": + result["dimensions"] += (" " + lines[cur].strip()) + cur += 1 + + cur += 1 result["primary_key"] = extract_value(lines[cur]) cur += 1 result["secondary_key"] = extract_value(lines[cur]) - result["hyperlinks"] = extract_links(source + "/" + file_name) + while lines[cur] != "Links": + result["secondary_key"] += (" " + extract_value(lines[cur]).strip()) + cur += 1 - cur += 2 + cur += 1 link_descriptions = [] while lines[cur] != "Image": - link_descriptions.append(lines[cur]) + link_descriptions.append(lines[cur].strip()) cur += 1 result["link_descriptions"] = link_descriptions + result["hyperlinks"] = extract_links(source + "/" + file_name) + images = [] captions = [] cur += 3 @@ -99,9 +121,11 @@ def parse_document(file_name: str): while cur < len(lines): notes.append(lines[cur]) cur += 1 - result["notes"] = notes + if len(notes) > 0: + result["notes"] = notes + + return result - return result def upload(document): wrapper = {} @@ -110,11 +134,13 @@ def upload(document): wrapper["__type"] = "Doc" collection_handle.insert_one(wrapper) + if os.path.exists(dist): shutil.rmtree(dist) -while (os.path.exists(dist)): +while os.path.exists(dist): pass os.mkdir(dist) +mkdir_if_absent(source) for file_name in os.listdir(source): if file_name.endswith('.docx'): @@ -123,6 +149,3 @@ for file_name in os.listdir(source): lines = ['*', '!.gitignore'] with open(dist + "/.gitignore", 'w') as f: f.write('\n'.join(lines)) - - - |