aboutsummaryrefslogtreecommitdiff
path: root/src/scraping
diff options
context:
space:
mode:
Diffstat (limited to 'src/scraping')
-rw-r--r--src/scraping/acm/.gitignore2
-rw-r--r--src/scraping/acm/debug.log38
-rw-r--r--src/scraping/acm/index.js2
-rw-r--r--src/scraping/buxton/scraper.py61
4 files changed, 99 insertions, 4 deletions
diff --git a/src/scraping/acm/.gitignore b/src/scraping/acm/.gitignore
new file mode 100644
index 000000000..caca8b99c
--- /dev/null
+++ b/src/scraping/acm/.gitignore
@@ -0,0 +1,2 @@
+./citations.txt
+./results.txt \ No newline at end of file
diff --git a/src/scraping/acm/debug.log b/src/scraping/acm/debug.log
new file mode 100644
index 000000000..8c0a148f4
--- /dev/null
+++ b/src/scraping/acm/debug.log
@@ -0,0 +1,38 @@
+[0625/170004.768:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/170004.769:ERROR:exception_snapshot_win.cc(98)] thread ID 17604 not found in process
+[0625/171124.644:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/171124.645:ERROR:exception_snapshot_win.cc(98)] thread ID 14348 not found in process
+[0625/171853.989:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/171853.990:ERROR:exception_snapshot_win.cc(98)] thread ID 12080 not found in process
+[0625/171947.744:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/171947.745:ERROR:exception_snapshot_win.cc(98)] thread ID 16160 not found in process
+[0625/172007.424:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/172007.425:ERROR:exception_snapshot_win.cc(98)] thread ID 13472 not found in process
+[0625/172059.353:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/172059.354:ERROR:exception_snapshot_win.cc(98)] thread ID 6396 not found in process
+[0625/172402.795:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/172402.796:ERROR:exception_snapshot_win.cc(98)] thread ID 10720 not found in process
+[0625/172618.850:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/172618.850:ERROR:exception_snapshot_win.cc(98)] thread ID 21136 not found in process
+[0625/172819.875:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/172819.876:ERROR:exception_snapshot_win.cc(98)] thread ID 17624 not found in process
+[0625/172953.674:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/172953.675:ERROR:exception_snapshot_win.cc(98)] thread ID 15180 not found in process
+[0625/173412.182:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/173412.182:ERROR:exception_snapshot_win.cc(98)] thread ID 13952 not found in process
+[0625/173447.806:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/173447.807:ERROR:exception_snapshot_win.cc(98)] thread ID 1572 not found in process
+[0625/173516.188:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/173516.189:ERROR:exception_snapshot_win.cc(98)] thread ID 5472 not found in process
+[0625/173528.446:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/173528.447:ERROR:exception_snapshot_win.cc(98)] thread ID 20420 not found in process
+[0625/173539.436:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/173539.437:ERROR:exception_snapshot_win.cc(98)] thread ID 16192 not found in process
+[0625/173643.139:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/173643.140:ERROR:exception_snapshot_win.cc(98)] thread ID 15716 not found in process
+[0625/173659.376:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/173659.377:ERROR:exception_snapshot_win.cc(98)] thread ID 11828 not found in process
+[0625/201137.209:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/201137.210:ERROR:exception_snapshot_win.cc(98)] thread ID 7688 not found in process
+[0625/210240.476:ERROR:process_reader_win.cc(123)] NtOpenThread: {Access Denied} A process has requested access to an object, but has not been granted those access rights. (0xc0000022)
+[0625/210240.477:ERROR:exception_snapshot_win.cc(98)] thread ID 20828 not found in process
diff --git a/src/scraping/acm/index.js b/src/scraping/acm/index.js
index 51781dba8..b71d55226 100644
--- a/src/scraping/acm/index.js
+++ b/src/scraping/acm/index.js
@@ -276,4 +276,4 @@ log_read("target references");
readFile(target_source, {
encoding: "utf8"
-}, scrape_targets); \ No newline at end of file
+}, scrape_targets);
diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py
index 02c6d8b74..700269727 100644
--- a/src/scraping/buxton/scraper.py
+++ b/src/scraping/buxton/scraper.py
@@ -26,7 +26,7 @@ def extract_links(fileName):
item = rels[rel]
if item.reltype == RT.HYPERLINK and ".aspx" not in item._target:
links.append(item._target)
- return listify(links)
+ return text_doc_map(links)
def extract_value(kv_string):
@@ -60,6 +60,12 @@ def protofy(fieldId):
}
+def text_doc_map(string_list):
+ def guid_map(caption):
+ return write_text_doc(caption)
+ return listify(proxify_guids(list(map(guid_map, string_list))))
+
+
def write_schema(parse_results, display_fields, storage_key):
view_guids = parse_results["child_guids"]
@@ -110,6 +116,54 @@ def write_schema(parse_results, display_fields, storage_key):
return view_doc_guid
+def write_text_doc(content):
+ data_doc_guid = guid()
+ view_doc_guid = guid()
+
+ view_doc = {
+ "_id": view_doc_guid,
+ "fields": {
+ "proto": protofy(data_doc_guid),
+ "x": 10,
+ "y": 10,
+ "width": 400,
+ "zIndex": 2,
+ "libraryBrush": False
+ },
+ "__type": "Doc"
+ }
+
+ data_doc = {
+ "_id": data_doc_guid,
+ "fields": {
+ "proto": protofy("textProto"),
+ "data": {
+ "Data": '{"doc":{"type":"doc","content":[{"type":"paragraph","content":[{"type":"text","text":"' + content + '"}]}]},"selection":{"type":"text","anchor":1,"head":1}' + '}',
+ "__type": "RichTextField"
+ },
+ "title": content,
+ "nativeWidth": 200,
+ "author": "Bill Buxton",
+ "creationDate": {
+ "date": datetime.datetime.utcnow().microsecond,
+ "__type": "date"
+ },
+ "isPrototype": True,
+ "autoHeight": True,
+ "page": -1,
+ "nativeHeight": 200,
+ "height": 200,
+ "data_text": content
+ },
+ "__type": "Doc"
+ }
+
+ db.newDocuments.insert_one(view_doc)
+ db.newDocuments.insert_one(data_doc)
+
+ return view_doc_guid
+
+
def write_image(folder, name):
path = f"http://localhost:1050/files/{folder}/{name}"
@@ -253,7 +307,7 @@ def parse_document(file_name: str):
while lines[cur] != "Image":
link_descriptions.append(lines[cur].strip())
cur += 1
- result["link_descriptions"] = listify(link_descriptions)
+ result["link_descriptions"] = text_doc_map(link_descriptions)
result["hyperlinks"] = extract_links(source + "/" + file_name)
@@ -265,7 +319,8 @@ def parse_document(file_name: str):
captions.append(lines[cur + 1])
cur += 2
result["images"] = listify(images)
- result["captions"] = listify(captions)
+
+ result["captions"] = text_doc_map(captions)
notes = []
if (cur < len(lines) and lines[cur] == "NOTES:"):