From c75ffd4900acea74c55b6bf275a5e8082c15d573 Mon Sep 17 00:00:00 2001 From: Sam Wilkins Date: Sun, 26 Apr 2020 19:29:50 -0700 Subject: formatted textbox disposers refactor, paragraph chunked rich text initialization and buxton importer updates --- src/scraping/buxton/final/BuxtonImporter.ts | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'src/scraping/buxton/final') diff --git a/src/scraping/buxton/final/BuxtonImporter.ts b/src/scraping/buxton/final/BuxtonImporter.ts index 122415460..64b988610 100644 --- a/src/scraping/buxton/final/BuxtonImporter.ts +++ b/src/scraping/buxton/final/BuxtonImporter.ts @@ -16,6 +16,7 @@ interface DocumentContents { hyperlinks: string[]; captions: string[]; embeddedFileNames: string[]; + longDescriptionParagraphs: string[]; } export interface DeviceDocument { @@ -186,10 +187,6 @@ const RegexMap = new Map>([ exp: /Short Description:\s+(.*)Bill Buxton[’']s Notes/, transformer: Utilities.correctSentences }], - ["longDescription", { - exp: /Bill Buxton[’']s Notes(.*)Device Details/, - transformer: Utilities.correctSentences - }], ]); const sourceDir = path.resolve(__dirname, "source"); @@ -267,7 +264,12 @@ async function extractFileContents(pathToDocument: string): Promise node.text()); + const captionTargets = document.find(tableCellXPath).map(node => node.text().trim()); + + const paragraphs = document.find('//*[name()="w:p"]').map(node => Utilities.correctSentences(node.text()).transformed!); + const start = paragraphs.indexOf(paragraphs.find(el => /Bill Buxton[’']s Notes/.test(el))!) + 1; + const end = paragraphs.indexOf("Device Details"); + const longDescriptionParagraphs = paragraphs.slice(start, end); const { length } = captionTargets; strictEqual(length > 3, true, "No captions written."); @@ -290,7 +292,7 @@ async function extractFileContents(pathToDocument: string): Promise { } function analyze(fileName: string, contents: DocumentContents): AnalysisResult { - const { body, imageData, captions, hyperlinks, embeddedFileNames } = contents; + const { body, imageData, captions, hyperlinks, embeddedFileNames, longDescriptionParagraphs } = contents; const device: any = { hyperlinks, captions, @@ -376,6 +378,7 @@ function analyze(fileName: string, contents: DocumentContents): AnalysisResult { return { errors }; } + device.longDescription = longDescriptionParagraphs.join("\n\n"); return { device }; } -- cgit v1.2.3-70-g09d2