aboutsummaryrefslogtreecommitdiff
path: root/src/buxton/scraper.py
diff options
context:
space:
mode:
authorSam Wilkins <samwilkins333@gmail.com>2019-06-19 02:19:03 -0400
committerSam Wilkins <samwilkins333@gmail.com>2019-06-19 02:19:03 -0400
commit5147528ef76ed069d7c5f1fc1feb7404c92227bc (patch)
tree4d5021bfc398c87e8675c1b4ea880e7c50a494f2 /src/buxton/scraper.py
parent4bec1d89eff45d6dcbb4041bc211db88d9da1c8f (diff)
first pass at all documents, improved scraping for handling variation
Diffstat (limited to 'src/buxton/scraper.py')
-rw-r--r--src/buxton/scraper.py69
1 files changed, 46 insertions, 23 deletions
diff --git a/src/buxton/scraper.py b/src/buxton/scraper.py
index 0abebb485..854c99379 100644
--- a/src/buxton/scraper.py
+++ b/src/buxton/scraper.py
@@ -8,9 +8,12 @@ import shutil
import uuid
source = "./source"
-dist = "./Dash-Web/src/server/public/files"
+dist = "../server/public/files"
+
+db = MongoClient("localhost", 27017)["Dash"]
+db.buxton.drop()
+collection_handle = db.buxton
-collection_handle = MongoClient("localhost", 27017)["Dash"]["buxton"]
def extract_links(fileName):
links = []
@@ -22,8 +25,11 @@ def extract_links(fileName):
links.append(item._target)
return links
+
def extract_value(kv_string):
- return kv_string.split(":")[1].strip()
+ pieces = kv_string.split(":")
+ return (pieces[1] if len(pieces) > 1 else kv_string).strip()
+
def mkdir_if_absent(path):
try:
@@ -32,6 +38,7 @@ def mkdir_if_absent(path):
except OSError:
print("Failed to create the appropriate directory structures for %s" % file_name)
+
def parse_document(file_name: str):
result = {}
pure_name = file_name.split(".")[0]
@@ -41,15 +48,18 @@ def parse_document(file_name: str):
raw = str(docx2txt.process(source + "/" + file_name, dir_path))
- sanitize = lambda line: re.sub("[\n\t]+", "", line).strip().replace(u"\u00A0", " ").replace(u"\u2013", "-").replace(u"\u201c", '''"''').replace(u"\u201d", '''"''')
- remove_empty = lambda line: len(line) > 1
+ def sanitize(line): return re.sub("[\n\t]+", "", line).replace(u"\u00A0", " ").replace(
+ u"\u2013", "-").replace(u"\u201c", '''"''').replace(u"\u201d", '''"''').strip()
+
+ def remove_empty(line): return len(line) > 1
lines = list(map(sanitize, raw.split("\n")))
lines = list(filter(remove_empty, lines))
result["file_name"] = file_name
- result["title"] = lines[2]
- result["short_description"] = lines[3].replace("Short Description: ", "")
+ result["title"] = lines[2].strip()
+ result["short_description"] = lines[3].strip().replace(
+ "Short Description: ", "")
cur = 5
notes = ""
@@ -57,32 +67,44 @@ def parse_document(file_name: str):
notes += lines[cur] + " "
cur += 1
result["buxton_notes"] = notes.strip()
-
+
cur += 1
- clean = list(map(lambda data: data.strip().split(":"), lines[cur].split("|")))
- result["company"] = clean[0][1].strip()
- result["year"] = clean[1][1].strip()
- result["original_price"] = clean[2][1].strip()
+ clean = list(
+ map(lambda data: data.strip().split(":"), lines[cur].split("|")))
+ result["company"] = clean[0][len(clean[0]) - 1].strip()
+ result["year"] = clean[1][len(clean[1]) - 1].strip()
+ result["original_price"] = clean[2][len(clean[2]) - 1].strip()
cur += 1
result["degrees_of_freedom"] = extract_value(lines[cur])
cur += 1
- result["dimensions"] = extract_value(lines[cur])
- cur += 2
+ dimensions = lines[cur].lower()
+ if dimensions.startswith("dimensions"):
+ result["dimensions"] = dimensions[11:].strip()
+ cur += 1
+ while lines[cur] != "Key Words":
+ result["dimensions"] += (" " + lines[cur].strip())
+ cur += 1
+
+ cur += 1
result["primary_key"] = extract_value(lines[cur])
cur += 1
result["secondary_key"] = extract_value(lines[cur])
- result["hyperlinks"] = extract_links(source + "/" + file_name)
+ while lines[cur] != "Links":
+ result["secondary_key"] += (" " + extract_value(lines[cur]).strip())
+ cur += 1
- cur += 2
+ cur += 1
link_descriptions = []
while lines[cur] != "Image":
- link_descriptions.append(lines[cur])
+ link_descriptions.append(lines[cur].strip())
cur += 1
result["link_descriptions"] = link_descriptions
+ result["hyperlinks"] = extract_links(source + "/" + file_name)
+
images = []
captions = []
cur += 3
@@ -99,9 +121,11 @@ def parse_document(file_name: str):
while cur < len(lines):
notes.append(lines[cur])
cur += 1
- result["notes"] = notes
+ if len(notes) > 0:
+ result["notes"] = notes
+
+ return result
- return result
def upload(document):
wrapper = {}
@@ -110,11 +134,13 @@ def upload(document):
wrapper["__type"] = "Doc"
collection_handle.insert_one(wrapper)
+
if os.path.exists(dist):
shutil.rmtree(dist)
-while (os.path.exists(dist)):
+while os.path.exists(dist):
pass
os.mkdir(dist)
+mkdir_if_absent(source)
for file_name in os.listdir(source):
if file_name.endswith('.docx'):
@@ -123,6 +149,3 @@ for file_name in os.listdir(source):
lines = ['*', '!.gitignore']
with open(dist + "/.gitignore", 'w') as f:
f.write('\n'.join(lines))
-
-
-