diff options
Diffstat (limited to 'src/scraping')
| -rw-r--r-- | src/scraping/acm/.gitignore | 2 | ||||
| -rw-r--r-- | src/scraping/acm/debug.log | 38 | ||||
| -rw-r--r-- | src/scraping/acm/index.js | 2 | ||||
| -rw-r--r-- | src/scraping/buxton/scraper.py | 61 |
4 files changed, 99 insertions, 4 deletions
diff --git a/src/scraping/acm/.gitignore b/src/scraping/acm/.gitignore new file mode 100644 index 000000000..caca8b99c --- /dev/null +++ b/src/scraping/acm/.gitignore @@ -0,0 +1,2 @@ +./citations.txt +./results.txt
\ No newline at end of file diff --git a/src/scraping/acm/debug.log b/src/scraping/acm/debug.log new file mode 100644 index 000000000..8c0a148f4 --- /dev/null +++ b/src/scraping/acm/debug.log @@ -0,0 +1,38 @@ +[0625/170004.768:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/170004.769:ERROR:exception_snapshot_win.cc(98)] thread ID 17604 not found in process +[0625/171124.644:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/171124.645:ERROR:exception_snapshot_win.cc(98)] thread ID 14348 not found in process +[0625/171853.989:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/171853.990:ERROR:exception_snapshot_win.cc(98)] thread ID 12080 not found in process +[0625/171947.744:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/171947.745:ERROR:exception_snapshot_win.cc(98)] thread ID 16160 not found in process +[0625/172007.424:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/172007.425:ERROR:exception_snapshot_win.cc(98)] thread ID 13472 not found in process +[0625/172059.353:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/172059.354:ERROR:exception_snapshot_win.cc(98)] thread ID 6396 not found in process +[0625/172402.795:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/172402.796:ERROR:exception_snapshot_win.cc(98)] thread ID 10720 not found in process +[0625/172618.850:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/172618.850:ERROR:exception_snapshot_win.cc(98)] thread ID 21136 not found in process +[0625/172819.875:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/172819.876:ERROR:exception_snapshot_win.cc(98)] thread ID 17624 not found in process +[0625/172953.674:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/172953.675:ERROR:exception_snapshot_win.cc(98)] thread ID 15180 not found in process +[0625/173412.182:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/173412.182:ERROR:exception_snapshot_win.cc(98)] thread ID 13952 not found in process +[0625/173447.806:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/173447.807:ERROR:exception_snapshot_win.cc(98)] thread ID 1572 not found in process +[0625/173516.188:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/173516.189:ERROR:exception_snapshot_win.cc(98)] thread ID 5472 not found in process +[0625/173528.446:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/173528.447:ERROR:exception_snapshot_win.cc(98)] thread ID 20420 not found in process +[0625/173539.436:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/173539.437:ERROR:exception_snapshot_win.cc(98)] thread ID 16192 not found in process +[0625/173643.139:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/173643.140:ERROR:exception_snapshot_win.cc(98)] thread ID 15716 not found in process +[0625/173659.376:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/173659.377:ERROR:exception_snapshot_win.cc(98)] thread ID 11828 not found in process +[0625/201137.209:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/201137.210:ERROR:exception_snapshot_win.cc(98)] thread ID 7688 not found in process +[0625/210240.476:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022) +[0625/210240.477:ERROR:exception_snapshot_win.cc(98)] thread ID 20828 not found in process diff --git a/src/scraping/acm/index.js b/src/scraping/acm/index.js index 51781dba8..b71d55226 100644 --- a/src/scraping/acm/index.js +++ b/src/scraping/acm/index.js @@ -276,4 +276,4 @@ log_read("target references"); readFile(target_source, { encoding: "utf8" -}, scrape_targets);
\ No newline at end of file +}, scrape_targets); diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py index 02c6d8b74..700269727 100644 --- a/src/scraping/buxton/scraper.py +++ b/src/scraping/buxton/scraper.py @@ -26,7 +26,7 @@ def extract_links(fileName): item = rels[rel] if item.reltype == RT.HYPERLINK and ".aspx" not in item._target: links.append(item._target) - return listify(links) + return text_doc_map(links) def extract_value(kv_string): @@ -60,6 +60,12 @@ def protofy(fieldId): } +def text_doc_map(string_list): + def guid_map(caption): + return write_text_doc(caption) + return listify(proxify_guids(list(map(guid_map, string_list)))) + + def write_schema(parse_results, display_fields, storage_key): view_guids = parse_results["child_guids"] @@ -110,6 +116,54 @@ def write_schema(parse_results, display_fields, storage_key): return view_doc_guid +def write_text_doc(content): + data_doc_guid = guid() + view_doc_guid = guid() + + view_doc = { + "_id": view_doc_guid, + "fields": { + "proto": protofy(data_doc_guid), + "x": 10, + "y": 10, + "width": 400, + "zIndex": 2, + "libraryBrush": False + }, + "__type": "Doc" + } + + data_doc = { + "_id": data_doc_guid, + "fields": { + "proto": protofy("textProto"), + "data": { + "Data": '{"doc":{"type":"doc","content":[{"type":"paragraph","content":[{"type":"text","text":"' + content + '"}]}]},"selection":{"type":"text","anchor":1,"head":1}' + '}', + "__type": "RichTextField" + }, + "title": content, + "nativeWidth": 200, + "author": "Bill Buxton", + "creationDate": { + "date": datetime.datetime.utcnow().microsecond, + "__type": "date" + }, + "isPrototype": True, + "autoHeight": True, + "page": -1, + "nativeHeight": 200, + "height": 200, + "data_text": content + }, + "__type": "Doc" + } + + db.newDocuments.insert_one(view_doc) + db.newDocuments.insert_one(data_doc) + + return view_doc_guid + + def write_image(folder, name): path = f"http://localhost:1050/files/{folder}/{name}" @@ -253,7 +307,7 @@ def parse_document(file_name: str): while lines[cur] != "Image": link_descriptions.append(lines[cur].strip()) cur += 1 - result["link_descriptions"] = listify(link_descriptions) + result["link_descriptions"] = text_doc_map(link_descriptions) result["hyperlinks"] = extract_links(source + "/" + file_name) @@ -265,7 +319,8 @@ def parse_document(file_name: str): captions.append(lines[cur + 1]) cur += 2 result["images"] = listify(images) - result["captions"] = listify(captions) + + result["captions"] = text_doc_map(captions) notes = [] if (cur < len(lines) and lines[cur] == "NOTES:"): |
