diff options
author | Sam Wilkins <samwilkins333@gmail.com> | 2019-07-30 03:49:18 -0400 |
---|---|---|
committer | Sam Wilkins <samwilkins333@gmail.com> | 2019-07-30 03:49:18 -0400 |
commit | 79f47098381fef8e247ddb45ce56993669f218cb (patch) | |
tree | c7fb7e88586c2eba65ad0b5e8e2bf11a2c1b29cd /src/scraping/buxton/scraper.py | |
parent | 3152e69dfafe1c393bed38f3aad1e55881e62a33 (diff) | |
parent | e041988b84553797699a5a232e26e72252460e01 (diff) |
successful implementation of different approach
Diffstat (limited to 'src/scraping/buxton/scraper.py')
-rw-r--r-- | src/scraping/buxton/scraper.py | 5 |
1 files changed, 4 insertions, 1 deletions
diff --git a/src/scraping/buxton/scraper.py b/src/scraping/buxton/scraper.py index 48b8fe3fa..1ff0e3b31 100644 --- a/src/scraping/buxton/scraper.py +++ b/src/scraping/buxton/scraper.py @@ -1,4 +1,5 @@ import os +from shutil import copyfile import docx2txt from docx import Document from docx.opc.constants import RELATIONSHIP_TYPE as RT @@ -138,7 +139,7 @@ def write_text_doc(content): data_doc = { "_id": data_doc_guid, "fields": { - "proto": protofy("commonImportProto"), + "proto": protofy("textProto"), "data": { "Data": '{"doc":{"type":"doc","content":[{"type":"paragraph","content":[{"type":"text","text":"' + content + '"}]}]},"selection":{"type":"text","anchor":1,"head":1}' + '}', "__type": "RichTextField" @@ -233,6 +234,8 @@ def parse_document(file_name: str): for image in os.listdir(dir_path): count += 1 view_guids.append(write_image(pure_name, image)) + copyfile(dir_path + "/" + image, dir_path + + "/" + image.replace(".", "_o.", 1)) os.rename(dir_path + "/" + image, dir_path + "/" + image.replace(".", "_m.", 1)) print(f"extracted {count} images...") |