aboutsummaryrefslogtreecommitdiff
path: root/src/scraping/buxton/scraper.py
diff options
context:
space:
mode:
authorbob <bcz@cs.brown.edu>2020-02-03 18:13:38 -0500
committerbob <bcz@cs.brown.edu>2020-02-03 18:13:38 -0500
commit03c14a47c834d735a655d002c5263c088e1fd1fe (patch)
tree7cd5f80dff2ecede5f467b76f4b2b36d60861736 /src/scraping/buxton/scraper.py
parentfdc0e0e157d70fdcc569525db58d8e6e642c0041 (diff)
parentb66532c0545fea6476c4b66c1fdee9b16ac5df44 (diff)
Merge branch 'master' of https://github.com/browngraphicslab/Dash-Web
Diffstat (limited to 'src/scraping/buxton/scraper.py')
-rw-r--r--src/scraping/buxton/scraper.py14
1 files changed, 7 insertions, 7 deletions
diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py
index 3375c1141..1441a8621 100644
--- a/src/scraping/buxton/scraper.py
+++ b/src/scraping/buxton/scraper.py
@@ -115,8 +115,8 @@ def write_collection(parse_results, display_fields, storage_key, viewType):
target_collection.insert_one(view_doc)
data_doc_guid = data_doc["_id"]
- print(f"inserted view document ({view_doc_guid})")
- print(f"inserted data document ({data_doc_guid})\n")
+ # print(f"inserted view document ({view_doc_guid})")
+ # print(f"inserted data document ({data_doc_guid})\n")
return view_doc_guid
@@ -233,7 +233,7 @@ def parse_document(file_name: str):
result = {}
dir_path = image_dist + "/" + pure_name
- print(dir_path)
+ # print(dir_path)
mkdir_if_absent(dir_path)
raw = str(docx2txt.process(source + "/" + file_name, dir_path))
@@ -252,7 +252,7 @@ def parse_document(file_name: str):
medium = dir_path + "/" + image.replace(".", "_m.", 1)
copyfile(resolved, original)
copyfile(resolved, medium)
- print(f"extracted {count} images...")
+ # print(f"extracted {count} images...")
def sanitize(line): return re.sub("[\n\t]+", "", line).replace(u"\u00A0", " ").replace(
u"\u2013", "-").replace(u"\u201c", '''"''').replace(u"\u201d", '''"''').strip()
@@ -360,7 +360,7 @@ def parse_document(file_name: str):
if len(notes) > 0:
result["notes"] = listify(notes)
- print("writing child schema...")
+ # print("writing child schema...")
return {
"schema": {
@@ -392,7 +392,7 @@ def write_common_proto():
if os.path.exists(image_dist):
- shutil.rmtree(image_dist)
+ shutil.rmtree(image_dist, True)
while os.path.exists(image_dist):
pass
os.mkdir(image_dist)
@@ -415,7 +415,7 @@ parent_guid = write_collection({
"__type": "Doc"
},
"child_guids": schema_guids
-}, ["title", "short_description", "original_price"], "data", 2)
+}, ["title", "short_description", "original_price"], "data", 4)
print("appending parent schema to main workspace...\n")
target_collection.update_one(