aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorSam Wilkins <samuel_wilkins@brown.edu>2019-06-19 14:13:46 -0400
committerSam Wilkins <samuel_wilkins@brown.edu>2019-06-19 14:13:46 -0400
commit48d2ece574f613c084a083ae05148498dd0030e3 (patch)
tree31710bbbd4a98f1066923eede7daeab0ad621172 /src
parent5147528ef76ed069d7c5f1fc1feb7404c92227bc (diff)
implemented simulation of view and data docs
Diffstat (limited to 'src')
-rw-r--r--src/buxton/scraper.py122
1 files changed, 110 insertions, 12 deletions
diff --git a/src/buxton/scraper.py b/src/buxton/scraper.py
index 854c99379..a3bbc75ac 100644
--- a/src/buxton/scraper.py
+++ b/src/buxton/scraper.py
@@ -6,13 +6,14 @@ import re
from pymongo import MongoClient
import shutil
import uuid
+import datetime
+from PIL import Image
source = "./source"
dist = "../server/public/files"
db = MongoClient("localhost", 27017)["Dash"]
-db.buxton.drop()
-collection_handle = db.buxton
+view_doc_guids = []
def extract_links(fileName):
@@ -39,7 +40,71 @@ def mkdir_if_absent(path):
print("Failed to create the appropriate directory structures for %s" % file_name)
+def guid():
+ return str(uuid.uuid4())
+
+
+def write_image(folder, name):
+ path = f"http://localhost:1050/files/{folder}/{name}"
+
+ data_doc_guid = guid()
+ view_doc_guid = guid()
+
+ view_doc = {
+ "_id": view_doc_guid,
+ "fields": {
+ "proto": {
+ "fieldId": data_doc_guid,
+ "__type": "proxy"
+ },
+ "x": 10,
+ "y": 10,
+ "width": 300,
+ "zIndex": 2,
+ "libraryBrush": False
+ },
+ "__type": "Doc"
+ }
+
+ image = Image.open(f"{dist}/{folder}/{name}")
+ native_width, native_height = image.size
+
+ data_doc = {
+ "_id": data_doc_guid,
+ "fields": {
+ "proto": {
+ "_id": "imageProto",
+ "__type": "proxy"
+ },
+ "data": {
+ "url": path,
+ "type": "image"
+ },
+ "title": name,
+ "nativeWidth": native_width,
+ "author": "Bill Buxton",
+ "creationDate": {
+ "date": datetime.datetime.utcnow().microsecond,
+ "__type": "date"
+ },
+ "isPrototype": True,
+ "page": -1,
+ "nativeHeight": native_height,
+ "height": native_height
+ },
+ "__type": "Doc"
+ }
+
+ db.newDocuments.insert_one(view_doc)
+ db.newDocuments.insert_one(data_doc)
+
+ print(path)
+
+ return view_doc_guid
+
+
def parse_document(file_name: str):
+ print(f"Parsing {file_name}...")
result = {}
pure_name = file_name.split(".")[0]
@@ -48,6 +113,11 @@ def parse_document(file_name: str):
raw = str(docx2txt.process(source + "/" + file_name, dir_path))
+ print("Extracting images...\n")
+ for image in os.listdir(dir_path):
+ view_doc_guids.append(write_image(pure_name, image))
+ print()
+
def sanitize(line): return re.sub("[\n\t]+", "", line).replace(u"\u00A0", " ").replace(
u"\u2013", "-").replace(u"\u201c", '''"''').replace(u"\u201d", '''"''').strip()
@@ -76,16 +146,20 @@ def parse_document(file_name: str):
result["original_price"] = clean[2][len(clean[2]) - 1].strip()
cur += 1
- result["degrees_of_freedom"] = extract_value(lines[cur])
+ result["degrees_of_freedom"] = extract_value(
+ lines[cur]).replace("NA", "N/A")
cur += 1
dimensions = lines[cur].lower()
if dimensions.startswith("dimensions"):
- result["dimensions"] = dimensions[11:].strip()
+ dim_concat = dimensions[11:].strip()
cur += 1
while lines[cur] != "Key Words":
- result["dimensions"] += (" " + lines[cur].strip())
+ dim_concat += (" " + lines[cur].strip())
cur += 1
+ result["dimensions"] = dim_concat
+ else:
+ result["dimensions"] = "N/A"
cur += 1
result["primary_key"] = extract_value(lines[cur])
@@ -124,15 +198,22 @@ def parse_document(file_name: str):
if len(notes) > 0:
result["notes"] = notes
+ print("...contents dictionary constructed.")
+
return result
-def upload(document):
- wrapper = {}
- wrapper["_id"] = str(uuid.uuid4())
- wrapper["fields"] = document
- wrapper["__type"] = "Doc"
- collection_handle.insert_one(wrapper)
+def wrap(document):
+ return {
+ "_id": guid(),
+ "fields": document,
+ "__type": "Doc"
+ }
+
+
+def upload(collection, mongofied):
+ for doc in mongofied:
+ collection.insert_one(doc)
if os.path.exists(dist):
@@ -142,9 +223,26 @@ while os.path.exists(dist):
os.mkdir(dist)
mkdir_if_absent(source)
+candidates = 0
+mongofied = []
for file_name in os.listdir(source):
if file_name.endswith('.docx'):
- upload(parse_document(file_name))
+ candidates += 1
+ mongofied.append(wrap(parse_document(file_name)))
+
+for doc in mongofied:
+ db.newDocuments.insert_one(doc)
+
+proxified = list(
+ map(lambda guid: {"fieldId": guid, "type": "proxy"}, view_doc_guids))
+db.newDocuments.update_one(
+ {"fields.title": "WS collection 1"},
+ {"$push": {"fields.data.fields": {"$each": proxified}}}
+)
+
+print("...dictionaries written to Dash Document.\n")
+
+print(f"{candidates} candidates processed.")
lines = ['*', '!.gitignore']
with open(dist + "/.gitignore", 'w') as f: