aboutsummaryrefslogtreecommitdiff
path: root/src/scraping/buxton/scraper.py
diff options
context:
space:
mode:
authorSam Wilkins <samwilkins333@gmail.com>2019-07-09 13:52:35 -0400
committerSam Wilkins <samwilkins333@gmail.com>2019-07-09 13:52:35 -0400
commit7fdb33cafaa3e8593d648ddba994356a9625ff56 (patch)
tree5a1b2657dc1e9d1e837f0ecee24f667aecc5c56a /src/scraping/buxton/scraper.py
parentda5ee792f64f74f514aceda1db9f54c893090d32 (diff)
string list to list of text doc map in scraper and cleaned up kv pane templating interaction code
Diffstat (limited to 'src/scraping/buxton/scraper.py')
-rw-r--r--src/scraping/buxton/scraper.py61
1 files changed, 58 insertions, 3 deletions
diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py
index 02c6d8b74..700269727 100644
--- a/src/scraping/buxton/scraper.py
+++ b/src/scraping/buxton/scraper.py
@@ -26,7 +26,7 @@ def extract_links(fileName):
item = rels[rel]
if item.reltype == RT.HYPERLINK and ".aspx" not in item._target:
links.append(item._target)
- return listify(links)
+ return text_doc_map(links)
def extract_value(kv_string):
@@ -60,6 +60,12 @@ def protofy(fieldId):
}
+def text_doc_map(string_list):
+ def guid_map(caption):
+ return write_text_doc(caption)
+ return listify(proxify_guids(list(map(guid_map, string_list))))
+
+
def write_schema(parse_results, display_fields, storage_key):
view_guids = parse_results["child_guids"]
@@ -110,6 +116,54 @@ def write_schema(parse_results, display_fields, storage_key):
return view_doc_guid
+def write_text_doc(content):
+ data_doc_guid = guid()
+ view_doc_guid = guid()
+
+ view_doc = {
+ "_id": view_doc_guid,
+ "fields": {
+ "proto": protofy(data_doc_guid),
+ "x": 10,
+ "y": 10,
+ "width": 400,
+ "zIndex": 2,
+ "libraryBrush": False
+ },
+ "__type": "Doc"
+ }
+
+ data_doc = {
+ "_id": data_doc_guid,
+ "fields": {
+ "proto": protofy("textProto"),
+ "data": {
+ "Data": '{"doc":{"type":"doc","content":[{"type":"paragraph","content":[{"type":"text","text":"' + content + '"}]}]},"selection":{"type":"text","anchor":1,"head":1}' + '}',
+ "__type": "RichTextField"
+ },
+ "title": content,
+ "nativeWidth": 200,
+ "author": "Bill Buxton",
+ "creationDate": {
+ "date": datetime.datetime.utcnow().microsecond,
+ "__type": "date"
+ },
+ "isPrototype": True,
+ "autoHeight": True,
+ "page": -1,
+ "nativeHeight": 200,
+ "height": 200,
+ "data_text": content
+ },
+ "__type": "Doc"
+ }
+
+ db.newDocuments.insert_one(view_doc)
+ db.newDocuments.insert_one(data_doc)
+
+ return view_doc_guid
+
+
def write_image(folder, name):
path = f"http://localhost:1050/files/{folder}/{name}"
@@ -253,7 +307,7 @@ def parse_document(file_name: str):
while lines[cur] != "Image":
link_descriptions.append(lines[cur].strip())
cur += 1
- result["link_descriptions"] = listify(link_descriptions)
+ result["link_descriptions"] = text_doc_map(link_descriptions)
result["hyperlinks"] = extract_links(source + "/" + file_name)
@@ -265,7 +319,8 @@ def parse_document(file_name: str):
captions.append(lines[cur + 1])
cur += 2
result["images"] = listify(images)
- result["captions"] = listify(captions)
+
+ result["captions"] = text_doc_map(captions)
notes = []
if (cur < len(lines) and lines[cur] == "NOTES:"):