diff options
Diffstat (limited to 'src/scraping')
18 files changed, 4 insertions, 1 deletions
diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py index 48b8fe3fa..1ff0e3b31 100644 --- a/src/scraping/buxton/scraper.py +++ b/src/scraping/buxton/scraper.py @@ -1,4 +1,5 @@ import os +from shutil import copyfile import docx2txt from docx import Document from docx.opc.constants import RELATIONSHIP_TYPE as RT @@ -138,7 +139,7 @@ def write_text_doc(content): data_doc = { "_id": data_doc_guid, "fields": { - "proto": protofy("commonImportProto"), + "proto": protofy("textProto"), "data": { "Data": '{"doc":{"type":"doc","content":[{"type":"paragraph","content":[{"type":"text","text":"' + content + '"}]}]},"selection":{"type":"text","anchor":1,"head":1}' + '}', "__type": "RichTextField" @@ -233,6 +234,8 @@ def parse_document(file_name: str): for image in os.listdir(dir_path): count += 1 view_guids.append(write_image(pure_name, image)) + copyfile(dir_path + "/" + image, dir_path + + "/" + image.replace(".", "_o.", 1)) os.rename(dir_path + "/" + image, dir_path + "/" + image.replace(".", "_m.", 1)) print(f"extracted {count} images...") diff --git a/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx b/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx Binary files differindex 06094b4d3..649d636e3 100644 --- a/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx +++ b/src/scraping/buxton/source/Bill_Notes_Bill_Notes_CyKey.docx diff --git a/src/scraping/buxton/source/Bill_Notes_Braun_T3.docx b/src/scraping/buxton/source/Bill_Notes_Braun_T3.docx Binary files differindex 356697092..b00080e08 100644 --- a/src/scraping/buxton/source/Bill_Notes_Braun_T3.docx +++ b/src/scraping/buxton/source/Bill_Notes_Braun_T3.docx diff --git a/src/scraping/buxton/source/Bill_Notes_CasioC801.docx b/src/scraping/buxton/source/Bill_Notes_CasioC801.docx Binary files differindex cd89fb97b..510a006e0 100644 --- a/src/scraping/buxton/source/Bill_Notes_CasioC801.docx +++ b/src/scraping/buxton/source/Bill_Notes_CasioC801.docx diff --git a/src/scraping/buxton/source/Bill_Notes_Casio_Mini.docx b/src/scraping/buxton/source/Bill_Notes_Casio_Mini.docx Binary files differindex a503cddfc..cea9e7b69 100644 --- a/src/scraping/buxton/source/Bill_Notes_Casio_Mini.docx +++ b/src/scraping/buxton/source/Bill_Notes_Casio_Mini.docx diff --git a/src/scraping/buxton/source/Bill_Notes_FingerWorks_Prototype.docx b/src/scraping/buxton/source/Bill_Notes_FingerWorks_Prototype.docx Binary files differindex 4d13a8cf5..f53402a06 100644 --- a/src/scraping/buxton/source/Bill_Notes_FingerWorks_Prototype.docx +++ b/src/scraping/buxton/source/Bill_Notes_FingerWorks_Prototype.docx diff --git a/src/scraping/buxton/source/Bill_Notes_Fingerworks_TouchStream.docx b/src/scraping/buxton/source/Bill_Notes_Fingerworks_TouchStream.docx Binary files differindex 578a1be08..0eec89949 100644 --- a/src/scraping/buxton/source/Bill_Notes_Fingerworks_TouchStream.docx +++ b/src/scraping/buxton/source/Bill_Notes_Fingerworks_TouchStream.docx diff --git a/src/scraping/buxton/source/Bill_Notes_FrogPad.docx b/src/scraping/buxton/source/Bill_Notes_FrogPad.docx Binary files differindex d01e1bf5c..ba80c1959 100644 --- a/src/scraping/buxton/source/Bill_Notes_FrogPad.docx +++ b/src/scraping/buxton/source/Bill_Notes_FrogPad.docx diff --git a/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx b/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx Binary files differindex 7bd28b376..8558a4e13 100644 --- a/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx +++ b/src/scraping/buxton/source/Bill_Notes_Gavilan_SC.docx diff --git a/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx b/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx Binary files differindex 0615c4953..09e17f971 100644 --- a/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx +++ b/src/scraping/buxton/source/Bill_Notes_Grandjean_Stenotype.docx diff --git a/src/scraping/buxton/source/Bill_Notes_Matias.docx b/src/scraping/buxton/source/Bill_Notes_Matias.docx Binary files differindex 547603256..d2d014bbe 100644 --- a/src/scraping/buxton/source/Bill_Notes_Matias.docx +++ b/src/scraping/buxton/source/Bill_Notes_Matias.docx diff --git a/src/scraping/buxton/source/Bill_Notes_MousePen.docx b/src/scraping/buxton/source/Bill_Notes_MousePen.docx Binary files differindex 4e1056636..cd0b3eab3 100644 --- a/src/scraping/buxton/source/Bill_Notes_MousePen.docx +++ b/src/scraping/buxton/source/Bill_Notes_MousePen.docx diff --git a/src/scraping/buxton/source/Bill_Notes_NewO.docx b/src/scraping/buxton/source/Bill_Notes_NewO.docx Binary files differindex a514926d2..2f4a04e81 100644 --- a/src/scraping/buxton/source/Bill_Notes_NewO.docx +++ b/src/scraping/buxton/source/Bill_Notes_NewO.docx diff --git a/src/scraping/buxton/source/Bill_Notes_OLPC.docx b/src/scraping/buxton/source/Bill_Notes_OLPC.docx Binary files differindex bfca0a9bb..7a636e2d6 100644 --- a/src/scraping/buxton/source/Bill_Notes_OLPC.docx +++ b/src/scraping/buxton/source/Bill_Notes_OLPC.docx diff --git a/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx b/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx Binary files differindex c0cf6ba9a..3038de363 100644 --- a/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx +++ b/src/scraping/buxton/source/Bill_Notes_PARCkbd.docx diff --git a/src/scraping/buxton/source/Bill_Notes_Philco_Mystery_Control.docx b/src/scraping/buxton/source/Bill_Notes_Philco_Mystery_Control.docx Binary files differindex ad06903f3..af72fa662 100644 --- a/src/scraping/buxton/source/Bill_Notes_Philco_Mystery_Control.docx +++ b/src/scraping/buxton/source/Bill_Notes_Philco_Mystery_Control.docx diff --git a/src/scraping/buxton/source/Bill_Notes_TASA_Kbd.docx b/src/scraping/buxton/source/Bill_Notes_TASA_Kbd.docx Binary files differindex e4c659de9..5c2eb8d7f 100644 --- a/src/scraping/buxton/source/Bill_Notes_TASA_Kbd.docx +++ b/src/scraping/buxton/source/Bill_Notes_TASA_Kbd.docx diff --git a/src/scraping/buxton/source/Bill_Notes_The_Tap.docx b/src/scraping/buxton/source/Bill_Notes_The_Tap.docx Binary files differindex 8ceebc71e..c9ee2eaea 100644 --- a/src/scraping/buxton/source/Bill_Notes_The_Tap.docx +++ b/src/scraping/buxton/source/Bill_Notes_The_Tap.docx |