author     Bob Zeleznik <zzzman@gmail.com>  2020-05-30 21:17:58 -0400
committer  Bob Zeleznik <zzzman@gmail.com>  2020-05-30 21:17:58 -0400
commit     9b0ba3940ec9b718cbffb945298095bd2ddcedb9 (patch)
tree       6c78223ebd63fdf714c68d5f9b13f9acd687e57d /src/scraping
parent     dc42fe7c83b86a53839994f2eae1ef545fbd5a9d (diff)
parent     85721c9ed95b4c026d0a1c7891e1fee311e9f50e (diff)
Merge branch 'master' of https://github.com/browngraphicslab/Dash-Web
Diffstat (limited to 'src/scraping')
-rw-r--r--  src/scraping/buxton/final/BuxtonImporter.ts                74
-rw-r--r--  src/scraping/buxton/final/assets/pdfs/3DCad_Brochure.pdf   bin  0 -> 107790 bytes
2 files changed, 60 insertions, 14 deletions
diff --git a/src/scraping/buxton/final/BuxtonImporter.ts b/src/scraping/buxton/final/BuxtonImporter.ts
index e55850b29..684c00c0d 100644
--- a/src/scraping/buxton/final/BuxtonImporter.ts
+++ b/src/scraping/buxton/final/BuxtonImporter.ts
@@ -1,4 +1,4 @@
-import { readdirSync, writeFile, mkdirSync } from "fs";
+import { readdirSync, writeFile, mkdirSync, createReadStream, createWriteStream, existsSync, statSync } from "fs";
 import * as path from "path";
 import { red, cyan, yellow } from "colors";
 import { Utils } from "../../../Utils";
@@ -9,6 +9,7 @@ const createImageSizeStream = require("image-size-stream");
 import { parseXml } from "libxmljs";
 import { strictEqual } from "assert";
 import { Readable, PassThrough } from "stream";
+import { Directory, serverPathToFile, pathToDirectory } from "../../../server/ApiManagers/UploadManager";
 
 /**
  * This is an arbitrary bundle of data that gets populated
@@ -18,8 +19,7 @@ interface DocumentContents {
     body: string;
     imageData: ImageData[];
     hyperlinks: string[];
-    captions: string[];
-    embeddedFileNames: string[];
+    tableData: TableData[];
     longDescription: string;
 }
 
@@ -40,6 +40,7 @@ export interface DeviceDocument {
     secondaryKey: string;
     attribute: string;
     __images: ImageData[];
+    additionalMedia: ({ [type: string]: string } | undefined)[];
     hyperlinks: string[];
     captions: string[]; // from the table column
     embeddedFileNames: string[]; // from the table column
@@ -255,6 +256,8 @@ const FormatMap = new Map<keyof DeviceDocument, ValueFormatDefinition<any>>([
 ]);
 
 const sourceDir = path.resolve(__dirname, "source"); // where the Word documents are assumed to be stored
+const assetDir = path.resolve(__dirname, "assets"); // where any additional media content like pdfs will be stored. Each subdirectory of this
+// must follow the enum Directory.<type> naming scheme
 const outDir = path.resolve(__dirname, "json"); // where the JSON output of these device documents will be written
 const imageDir = path.resolve(__dirname, "../../../server/public/files/images/buxton"); // where, in the server, these images will be written
 const successOut = "buxton.json"; // the JSON list representing properly formatted documents
@@ -277,12 +280,13 @@ export default async function executeImport(emitter: ResultCallback, terminator
             rimraf.sync(dir);
             mkdirSync(dir);
         });
+        await transferAssets();
         return parseFiles(wordDocuments, emitter, terminator);
     } catch (e) {
         const message = [
             "Unable to find a source directory.",
-            "Please ensure that the following directory exists and is populated with Word documents:",
-            `${sourceDir}`
+            "Please ensure that the following directory exists:",
+            `${e.message}`
         ].join('\n');
         console.log(red(message));
         return { error: message };
@@ -290,6 +294,32 @@ export default async function executeImport(emitter: ResultCallback, terminator
 }
 
 /**
+ * Builds a mirrored directory structure of all media / asset files
+ * within the server's public directory.
+ */
+async function transferAssets() {
+    for (const assetType of readdirSync(assetDir)) {
+        const subroot = path.resolve(assetDir, assetType);
+        if (!statSync(subroot).isDirectory()) {
+            continue;
+        }
+        const outputSubroot = serverPathToFile(assetType as Directory, "buxton");
+        if (existsSync(outputSubroot)) {
+            continue;
+        } else {
+            mkdirSync(outputSubroot);
+        }
+        for (const fileName of readdirSync(subroot)) {
+            const readStream = createReadStream(path.resolve(subroot, fileName));
+            const writeStream = createWriteStream(path.resolve(outputSubroot, fileName));
+            await new Promise<void>(resolve => {
+                readStream.pipe(writeStream).on("close", resolve);
+            });
+        }
+    }
+}
+
+/**
  * Parse every Word document in the directory, notifying any callers as needed
  * at each iteration via the emitter.
  * @param wordDocuments the string list of Word document names to parse
@@ -356,6 +386,16 @@ const xPaths = {
     hyperlinks: '//*[name()="Relationship" and contains(@Type, "hyperlink")]'
 };
 
+interface TableData {
+    fileName: string;
+    caption: string;
+    additionalMedia?: { [type: string]: string };
+}
+
+const SuffixDirectoryMap = new Map<string, Directory>([
+    ["p", Directory.pdfs]
+]);
+
 /**
  * The meat of the script, images and text content are extracted here
  * @param pathToDocument the path to the document relative to the root of the zip
@@ -370,8 +410,7 @@ async function extractFileContents(pathToDocument: string): Promise<DocumentCont
     // get plain text
     const body = document.root()?.text() ?? "No body found. Check the import script's XML parser.";
     const captions: string[] = [];
-    const embeddedFileNames: string[] = [];
-
+    const tableData: TableData[] = [];
     // preserve paragraph formatting and line breaks that would otherwise get lost in the plain text parsing
     // of the XML hierarchy
     const paragraphs = document.find(xPaths.paragraphs).map(node => Utilities.correctSentences(node.text()).transformed!);
@@ -382,7 +421,7 @@ async function extractFileContents(pathToDocument: string): Promise<DocumentCont
     // extract captions from the table cells
     const tableRowsFlattened = document.find(xPaths.tableCells).map(node => node.text().trim());
     const { length } = tableRowsFlattened;
-    const numCols = 3;
+    const numCols = 4;
     strictEqual(length > numCols, true, "No captions written."); // first row has the headers, not content
     strictEqual(length % numCols === 0, true, "Improper caption formatting.");
 
@@ -392,8 +431,14 @@ async function extractFileContents(pathToDocument: string): Promise<DocumentCont
     // have been added or reordered since this was written, but follow the same appraoch)
     for (let i = numCols; i < tableRowsFlattened.length; i += numCols) {
         const row = tableRowsFlattened.slice(i, i + numCols);
-        embeddedFileNames.push(row[1]);
-        captions.push(row[2]);
+        const entry: TableData = { fileName: row[1], caption: row[2] };
+        const key = SuffixDirectoryMap.get(row[3].toLowerCase());
+        if (key) {
+            const media: any = {};
+            media[key] = `${entry.fileName.split(".")[0]}.pdf`;
+            entry.additionalMedia = media;
+        }
+        tableData.push(entry);
     }
 
     // extract all hyperlinks embedded in the document
@@ -409,7 +454,7 @@ async function extractFileContents(pathToDocument: string): Promise<DocumentCont
 
     // cleanup
     zip.close();
-    return { body, longDescription, imageData, captions, embeddedFileNames, hyperlinks };
+    return { body, longDescription, imageData, tableData, hyperlinks };
 }
 
 // zip relative path from root expression / filter used to isolate only media assets
@@ -492,11 +537,12 @@ async function writeImages(zip: any): Promise<ImageData[]> {
  * @param contents the data already computed / parsed by extractFileContents
  */
 function analyze(fileName: string, contents: DocumentContents): AnalysisResult {
-    const { body, imageData, captions, hyperlinks, embeddedFileNames, longDescription } = contents;
+    const { body, imageData, hyperlinks, tableData, longDescription } = contents;
     const device: any = {
         hyperlinks,
-        captions,
-        embeddedFileNames,
+        captions: tableData.map(({ caption }) => caption),
+        embeddedFileNames: tableData.map(({ fileName }) => fileName),
+        additionalMedia: tableData.map(({ additionalMedia }) => additionalMedia),
         longDescription,
         __images: imageData
     };
diff --git a/src/scraping/buxton/final/assets/pdfs/3DCad_Brochure.pdf b/src/scraping/buxton/final/assets/pdfs/3DCad_Brochure.pdf
new file mode 100644
index 000000000..4746d2f41
--- /dev/null
+++ b/src/scraping/buxton/final/assets/pdfs/3DCad_Brochure.pdf
Binary files differ
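
Note: the transferAssets() helper added above copies each asset by piping a read stream into a write stream and awaiting the write stream's "close" event, so only one copy is in flight at a time. The sketch below isolates that pattern using only Node's fs and path modules; the function name and directory arguments are hypothetical, not part of this commit.

import { createReadStream, createWriteStream, existsSync, mkdirSync, readdirSync } from "fs";
import * as path from "path";

// Copy every file in srcDir into dstDir, one file at a time. Awaiting the
// write stream's "close" event before starting the next copy mirrors the
// sequential loop in transferAssets(); srcDir is assumed to contain only files.
async function copyDirectorySequentially(srcDir: string, dstDir: string): Promise<void> {
    if (!existsSync(dstDir)) {
        mkdirSync(dstDir, { recursive: true });
    }
    for (const fileName of readdirSync(srcDir)) {
        await new Promise<void>((resolve, reject) => {
            createReadStream(path.resolve(srcDir, fileName))
                .pipe(createWriteStream(path.resolve(dstDir, fileName)))
                .on("close", resolve)
                .on("error", reject);
        });
    }
}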
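Note: the change from numCols = 3 to numCols = 4 reflects that each device table row now carries a fourth "suffix" cell, where a value of "p" flags a companion PDF named after the embedded image's base name. A minimal sketch of that fixed-width row slicing, using hypothetical sample cells rather than real document data:

// Flattened cells from a hypothetical 4-column device table; the first
// numCols cells are the header row and are skipped, as in extractFileContents().
const cells = [
    "Title", "File Name", "Caption", "Suffix",
    "3D CAD", "3DCad_Brochure.png", "Product brochure scan", "p"
];
const numCols = 4;
for (let i = numCols; i < cells.length; i += numCols) {
    const [, fileName, caption, suffix] = cells.slice(i, i + numCols);
    // a "p" suffix maps to an additional PDF sharing the image's base name
    const additionalPdf = suffix.toLowerCase() === "p" ? `${fileName.split(".")[0]}.pdf` : undefined;
    console.log({ fileName, caption, additionalPdf });
}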