diff options
Diffstat (limited to 'src/scraping/buxton/scraper.py')
-rw-r--r-- | src/scraping/buxton/scraper.py | 61 |
1 files changed, 58 insertions, 3 deletions
diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py index 02c6d8b74..700269727 100644 --- a/src/scraping/buxton/scraper.py +++ b/src/scraping/buxton/scraper.py @@ -26,7 +26,7 @@ def extract_links(fileName): item = rels[rel] if item.reltype == RT.HYPERLINK and ".aspx" not in item._target: links.append(item._target) - return listify(links) + return text_doc_map(links) def extract_value(kv_string): @@ -60,6 +60,12 @@ def protofy(fieldId): } +def text_doc_map(string_list): + def guid_map(caption): + return write_text_doc(caption) + return listify(proxify_guids(list(map(guid_map, string_list)))) + + def write_schema(parse_results, display_fields, storage_key): view_guids = parse_results["child_guids"] @@ -110,6 +116,54 @@ def write_schema(parse_results, display_fields, storage_key): return view_doc_guid +def write_text_doc(content): + data_doc_guid = guid() + view_doc_guid = guid() + + view_doc = { + "_id": view_doc_guid, + "fields": { + "proto": protofy(data_doc_guid), + "x": 10, + "y": 10, + "width": 400, + "zIndex": 2, + "libraryBrush": False + }, + "__type": "Doc" + } + + data_doc = { + "_id": data_doc_guid, + "fields": { + "proto": protofy("textProto"), + "data": { + "Data": '{"doc":{"type":"doc","content":[{"type":"paragraph","content":[{"type":"text","text":"' + content + '"}]}]},"selection":{"type":"text","anchor":1,"head":1}' + '}', + "__type": "RichTextField" + }, + "title": content, + "nativeWidth": 200, + "author": "Bill Buxton", + "creationDate": { + "date": datetime.datetime.utcnow().microsecond, + "__type": "date" + }, + "isPrototype": True, + "autoHeight": True, + "page": -1, + "nativeHeight": 200, + "height": 200, + "data_text": content + }, + "__type": "Doc" + } + + db.newDocuments.insert_one(view_doc) + db.newDocuments.insert_one(data_doc) + + return view_doc_guid + + def write_image(folder, name): path = f"http://localhost:1050/files/{folder}/{name}" @@ -253,7 +307,7 @@ def parse_document(file_name: str): while lines[cur] != "Image": link_descriptions.append(lines[cur].strip()) cur += 1 - result["link_descriptions"] = listify(link_descriptions) + result["link_descriptions"] = text_doc_map(link_descriptions) result["hyperlinks"] = extract_links(source + "/" + file_name) @@ -265,7 +319,8 @@ def parse_document(file_name: str): captions.append(lines[cur + 1]) cur += 2 result["images"] = listify(images) - result["captions"] = listify(captions) + + result["captions"] = text_doc_map(captions) notes = [] if (cur < len(lines) and lines[cur] == "NOTES:"): |