diff options
| author | Sam Wilkins <samwilkins333@gmail.com> | 2020-04-26 19:29:50 -0700 |
|---|---|---|
| committer | Sam Wilkins <samwilkins333@gmail.com> | 2020-04-26 19:29:50 -0700 |
| commit | c75ffd4900acea74c55b6bf275a5e8082c15d573 (patch) | |
| tree | b8cedbc9da3061e13ab321d3a86134e41690b8be /src/scraping/buxton/final/BuxtonImporter.ts | |
| parent | 63985ba64dc4a3a94bac75a9120cd8a5ca791fec (diff) | |
formatted textbox disposers refactor, paragraph chunked rich text initialization and buxton importer updates
Diffstat (limited to 'src/scraping/buxton/final/BuxtonImporter.ts')
| -rw-r--r-- | src/scraping/buxton/final/BuxtonImporter.ts | 17 |
1 files changed, 10 insertions, 7 deletions
diff --git a/src/scraping/buxton/final/BuxtonImporter.ts b/src/scraping/buxton/final/BuxtonImporter.ts index 122415460..64b988610 100644 --- a/src/scraping/buxton/final/BuxtonImporter.ts +++ b/src/scraping/buxton/final/BuxtonImporter.ts @@ -16,6 +16,7 @@ interface DocumentContents { hyperlinks: string[]; captions: string[]; embeddedFileNames: string[]; + longDescriptionParagraphs: string[]; } export interface DeviceDocument { @@ -186,10 +187,6 @@ const RegexMap = new Map<keyof DeviceDocument, Processor<any>>([ exp: /Short Description:\s+(.*)Bill Buxton[’']s Notes/, transformer: Utilities.correctSentences }], - ["longDescription", { - exp: /Bill Buxton[’']s Notes(.*)Device Details/, - transformer: Utilities.correctSentences - }], ]); const sourceDir = path.resolve(__dirname, "source"); @@ -267,7 +264,12 @@ async function extractFileContents(pathToDocument: string): Promise<DocumentCont const body = document.root()?.text() ?? "No body found. Check the import script's XML parser."; const captions: string[] = []; const embeddedFileNames: string[] = []; - const captionTargets = document.find(tableCellXPath).map(node => node.text()); + const captionTargets = document.find(tableCellXPath).map(node => node.text().trim()); + + const paragraphs = document.find('//*[name()="w:p"]').map(node => Utilities.correctSentences(node.text()).transformed!); + const start = paragraphs.indexOf(paragraphs.find(el => /Bill Buxton[’']s Notes/.test(el))!) + 1; + const end = paragraphs.indexOf("Device Details"); + const longDescriptionParagraphs = paragraphs.slice(start, end); const { length } = captionTargets; strictEqual(length > 3, true, "No captions written."); @@ -290,7 +292,7 @@ async function extractFileContents(pathToDocument: string): Promise<DocumentCont zip.close(); - return { body, imageData, captions, embeddedFileNames, hyperlinks }; + return { body, longDescriptionParagraphs, imageData, captions, embeddedFileNames, hyperlinks }; } const imageEntry = /^word\/media\/\w+\.(jpeg|jpg|png|gif)/; @@ -337,7 +339,7 @@ async function writeImages(zip: any): Promise<ImageData[]> { } function analyze(fileName: string, contents: DocumentContents): AnalysisResult { - const { body, imageData, captions, hyperlinks, embeddedFileNames } = contents; + const { body, imageData, captions, hyperlinks, embeddedFileNames, longDescriptionParagraphs } = contents; const device: any = { hyperlinks, captions, @@ -376,6 +378,7 @@ function analyze(fileName: string, contents: DocumentContents): AnalysisResult { return { errors }; } + device.longDescription = longDescriptionParagraphs.join("\n\n"); return { device }; } |
