diff options
Diffstat (limited to 'src/scraping/buxton')
-rw-r--r-- | src/scraping/buxton/final/BuxtonImporter.ts | 39 |
1 files changed, 23 insertions, 16 deletions
diff --git a/src/scraping/buxton/final/BuxtonImporter.ts b/src/scraping/buxton/final/BuxtonImporter.ts index 122415460..713207a07 100644 --- a/src/scraping/buxton/final/BuxtonImporter.ts +++ b/src/scraping/buxton/final/BuxtonImporter.ts @@ -16,6 +16,7 @@ interface DocumentContents { hyperlinks: string[]; captions: string[]; embeddedFileNames: string[]; + longDescription: string; } export interface DeviceDocument { @@ -186,10 +187,6 @@ const RegexMap = new Map<keyof DeviceDocument, Processor<any>>([ exp: /Short Description:\s+(.*)Bill Buxton[’']s Notes/, transformer: Utilities.correctSentences }], - ["longDescription", { - exp: /Bill Buxton[’']s Notes(.*)Device Details/, - transformer: Utilities.correctSentences - }], ]); const sourceDir = path.resolve(__dirname, "source"); @@ -267,7 +264,12 @@ async function extractFileContents(pathToDocument: string): Promise<DocumentCont const body = document.root()?.text() ?? "No body found. Check the import script's XML parser."; const captions: string[] = []; const embeddedFileNames: string[] = []; - const captionTargets = document.find(tableCellXPath).map(node => node.text()); + const captionTargets = document.find(tableCellXPath).map(node => node.text().trim()); + + const paragraphs = document.find('//*[name()="w:p"]').map(node => Utilities.correctSentences(node.text()).transformed!); + const start = paragraphs.indexOf(paragraphs.find(el => /Bill Buxton[’']s Notes/.test(el))!) + 1; + const end = paragraphs.indexOf("Device Details"); + const longDescription = paragraphs.slice(start, end).filter(paragraph => paragraph.length).join("\n\n"); const { length } = captionTargets; strictEqual(length > 3, true, "No captions written."); @@ -290,7 +292,7 @@ async function extractFileContents(pathToDocument: string): Promise<DocumentCont zip.close(); - return { body, imageData, captions, embeddedFileNames, hyperlinks }; + return { body, longDescription, imageData, captions, embeddedFileNames, hyperlinks }; } const imageEntry = /^word\/media\/\w+\.(jpeg|jpg|png|gif)/; @@ -306,26 +308,30 @@ async function writeImages(zip: any): Promise<ImageData[]> { const imageEntries = allEntries.filter(name => imageEntry.test(name)); const imageUrls: ImageData[] = []; - for (const mediaPath of imageEntries) { - const getImageStream = () => new Promise<Readable>((resolve, reject) => { - zip.stream(mediaPath, (error: any, stream: any) => error ? reject(error) : resolve(stream)); - }); + const valid: any[] = []; + const getImageStream = (mediaPath: string) => new Promise<Readable>((resolve, reject) => { + zip.stream(mediaPath, (error: any, stream: any) => error ? reject(error) : resolve(stream)); + }); + + for (const mediaPath of imageEntries) { const { width, height, type } = await new Promise<Dimensions>(async resolve => { const sizeStream = (createImageSizeStream() as PassThrough).on('size', (dimensions: Dimensions) => { readStream.destroy(); resolve(dimensions); }).on("error", () => readStream.destroy()); - const readStream = await getImageStream(); + const readStream = await getImageStream(mediaPath); readStream.pipe(sizeStream); }); - if (Math.abs(width - height) < 10) { - continue; + + if (Math.abs(width - height) > 10) { + valid.push({ width, height, type, mediaPath }); } + } + for (const { type, width, height, mediaPath } of valid) { const generatedFileName = `upload_${Utils.GenerateGuid()}.${type.toLowerCase()}`; - await DashUploadUtils.outputResizedImages(getImageStream, generatedFileName, imageDir); - + await DashUploadUtils.outputResizedImages(() => getImageStream(mediaPath), generatedFileName, imageDir); imageUrls.push({ url: `/files/images/buxton/${generatedFileName}`, nativeWidth: width, @@ -337,11 +343,12 @@ async function writeImages(zip: any): Promise<ImageData[]> { } function analyze(fileName: string, contents: DocumentContents): AnalysisResult { - const { body, imageData, captions, hyperlinks, embeddedFileNames } = contents; + const { body, imageData, captions, hyperlinks, embeddedFileNames, longDescription } = contents; const device: any = { hyperlinks, captions, embeddedFileNames, + longDescription, __images: imageData }; const errors: { [key: string]: string } = { fileName }; |