diff options
| author | Bob Zeleznik <zzzman@gmail.com> | 2020-05-30 21:17:58 -0400 | 
|---|---|---|
| committer | Bob Zeleznik <zzzman@gmail.com> | 2020-05-30 21:17:58 -0400 | 
| commit | 9b0ba3940ec9b718cbffb945298095bd2ddcedb9 (patch) | |
| tree | 6c78223ebd63fdf714c68d5f9b13f9acd687e57d /src/scraping/buxton/final | |
| parent | dc42fe7c83b86a53839994f2eae1ef545fbd5a9d (diff) | |
| parent | 85721c9ed95b4c026d0a1c7891e1fee311e9f50e (diff) | |
Merge branch 'master' of https://github.com/browngraphicslab/Dash-Web
Diffstat (limited to 'src/scraping/buxton/final')
| -rw-r--r-- | src/scraping/buxton/final/BuxtonImporter.ts | 74 | ||||
| -rw-r--r-- | src/scraping/buxton/final/assets/pdfs/3DCad_Brochure.pdf | bin | 0 -> 107790 bytes | 
2 files changed, 60 insertions, 14 deletions
| diff --git a/src/scraping/buxton/final/BuxtonImporter.ts b/src/scraping/buxton/final/BuxtonImporter.ts index e55850b29..684c00c0d 100644 --- a/src/scraping/buxton/final/BuxtonImporter.ts +++ b/src/scraping/buxton/final/BuxtonImporter.ts @@ -1,4 +1,4 @@ -import { readdirSync, writeFile, mkdirSync } from "fs"; +import { readdirSync, writeFile, mkdirSync, createReadStream, createWriteStream, existsSync, statSync } from "fs";  import * as path from "path";  import { red, cyan, yellow } from "colors";  import { Utils } from "../../../Utils"; @@ -9,6 +9,7 @@ const createImageSizeStream = require("image-size-stream");  import { parseXml } from "libxmljs";  import { strictEqual } from "assert";  import { Readable, PassThrough } from "stream"; +import { Directory, serverPathToFile, pathToDirectory } from "../../../server/ApiManagers/UploadManager";  /**   * This is an arbitrary bundle of data that gets populated @@ -18,8 +19,7 @@ interface DocumentContents {      body: string;      imageData: ImageData[];      hyperlinks: string[]; -    captions: string[]; -    embeddedFileNames: string[]; +    tableData: TableData[];      longDescription: string;  } @@ -40,6 +40,7 @@ export interface DeviceDocument {      secondaryKey: string;      attribute: string;      __images: ImageData[]; +    additionalMedia: ({ [type: string]: string } | undefined)[];      hyperlinks: string[];      captions: string[]; // from the table column      embeddedFileNames: string[]; // from the table column @@ -255,6 +256,8 @@ const FormatMap = new Map<keyof DeviceDocument, ValueFormatDefinition<any>>([  ]);  const sourceDir = path.resolve(__dirname, "source"); // where the Word documents are assumed to be stored +const assetDir = path.resolve(__dirname, "assets"); // where any additional media content like pdfs will be stored. Each subdirectory of this +// must follow the enum Directory.<type> naming scheme  const outDir = path.resolve(__dirname, "json"); // where the JSON output of these device documents will be written  const imageDir = path.resolve(__dirname, "../../../server/public/files/images/buxton"); // where, in the server, these images will be written  const successOut = "buxton.json"; // the JSON list representing properly formatted documents @@ -277,12 +280,13 @@ export default async function executeImport(emitter: ResultCallback, terminator:              rimraf.sync(dir);              mkdirSync(dir);          }); +        await transferAssets();          return parseFiles(wordDocuments, emitter, terminator);      } catch (e) {          const message = [              "Unable to find a source directory.", -            "Please ensure that the following directory exists and is populated with Word documents:", -            `${sourceDir}` +            "Please ensure that the following directory exists:", +            `${e.message}`          ].join('\n');          console.log(red(message));          return { error: message }; @@ -290,6 +294,32 @@ export default async function executeImport(emitter: ResultCallback, terminator:  }  /** + * Builds a mirrored directory structure of all media / asset files + * within the server's public directory. + */ +async function transferAssets() { +    for (const assetType of readdirSync(assetDir)) { +        const subroot = path.resolve(assetDir, assetType); +        if (!statSync(subroot).isDirectory()) { +            continue; +        } +        const outputSubroot = serverPathToFile(assetType as Directory, "buxton"); +        if (existsSync(outputSubroot)) { +            continue; +        } else { +            mkdirSync(outputSubroot); +        } +        for (const fileName of readdirSync(subroot)) { +            const readStream = createReadStream(path.resolve(subroot, fileName)); +            const writeStream = createWriteStream(path.resolve(outputSubroot, fileName)); +            await new Promise<void>(resolve => { +                readStream.pipe(writeStream).on("close", resolve); +            }); +        } +    } +} + +/**   * Parse every Word document in the directory, notifying any callers as needed   * at each iteration via the emitter.   * @param wordDocuments the string list of Word document names to parse @@ -356,6 +386,16 @@ const xPaths = {      hyperlinks: '//*[name()="Relationship" and contains(@Type, "hyperlink")]'  }; +interface TableData { +    fileName: string; +    caption: string; +    additionalMedia?: { [type: string]: string }; +} + +const SuffixDirectoryMap = new Map<string, Directory>([ +    ["p", Directory.pdfs] +]); +  /**   * The meat of the script, images and text content are extracted here   * @param pathToDocument the path to the document relative to the root of the zip @@ -370,8 +410,7 @@ async function extractFileContents(pathToDocument: string): Promise<DocumentCont      // get plain text      const body = document.root()?.text() ?? "No body found. Check the import script's XML parser.";      const captions: string[] = []; -    const embeddedFileNames: string[] = []; - +    const tableData: TableData[] = [];      // preserve paragraph formatting and line breaks that would otherwise get lost in the plain text parsing      // of the XML hierarchy      const paragraphs = document.find(xPaths.paragraphs).map(node => Utilities.correctSentences(node.text()).transformed!); @@ -382,7 +421,7 @@ async function extractFileContents(pathToDocument: string): Promise<DocumentCont      // extract captions from the table cells      const tableRowsFlattened = document.find(xPaths.tableCells).map(node => node.text().trim());      const { length } = tableRowsFlattened; -    const numCols = 3; +    const numCols = 4;      strictEqual(length > numCols, true, "No captions written."); // first row has the headers, not content      strictEqual(length % numCols === 0, true, "Improper caption formatting."); @@ -392,8 +431,14 @@ async function extractFileContents(pathToDocument: string): Promise<DocumentCont      // have been added or reordered since this was written, but follow the same appraoch)      for (let i = numCols; i < tableRowsFlattened.length; i += numCols) {          const row = tableRowsFlattened.slice(i, i + numCols); -        embeddedFileNames.push(row[1]); -        captions.push(row[2]); +        const entry: TableData = { fileName: row[1], caption: row[2] }; +        const key = SuffixDirectoryMap.get(row[3].toLowerCase()); +        if (key) { +            const media: any = {}; +            media[key] = `${entry.fileName.split(".")[0]}.pdf`; +            entry.additionalMedia = media; +        } +        tableData.push(entry);      }      // extract all hyperlinks embedded in the document @@ -409,7 +454,7 @@ async function extractFileContents(pathToDocument: string): Promise<DocumentCont      // cleanup      zip.close(); -    return { body, longDescription, imageData, captions, embeddedFileNames, hyperlinks }; +    return { body, longDescription, imageData, tableData, hyperlinks };  }  // zip relative path from root expression / filter used to isolate only media assets @@ -492,11 +537,12 @@ async function writeImages(zip: any): Promise<ImageData[]> {   * @param contents the data already computed / parsed by extractFileContents   */  function analyze(fileName: string, contents: DocumentContents): AnalysisResult { -    const { body, imageData, captions, hyperlinks, embeddedFileNames, longDescription } = contents; +    const { body, imageData, hyperlinks, tableData, longDescription } = contents;      const device: any = {          hyperlinks, -        captions, -        embeddedFileNames, +        captions: tableData.map(({ caption }) => caption), +        embeddedFileNames: tableData.map(({ fileName }) => fileName), +        additionalMedia: tableData.map(({ additionalMedia }) => additionalMedia),          longDescription,          __images: imageData      }; diff --git a/src/scraping/buxton/final/assets/pdfs/3DCad_Brochure.pdf b/src/scraping/buxton/final/assets/pdfs/3DCad_Brochure.pdfBinary files differ new file mode 100644 index 000000000..4746d2f41 --- /dev/null +++ b/src/scraping/buxton/final/assets/pdfs/3DCad_Brochure.pdf | 
