import { readdirSync, writeFile, mkdirSync, createReadStream, createWriteStream, existsSync, statSync } from "fs";
import * as path from "path";
import { red, cyan, yellow } from "colors";
import { Utils } from "../../../Utils";
import rimraf = require("rimraf");
import { DashUploadUtils } from "../../../server/DashUploadUtils";
const StreamZip = require('node-stream-zip');
const createImageSizeStream = require("image-size-stream");
import { parseXml } from "libxmljs";
import { strictEqual } from "assert";
import { Readable, PassThrough } from "stream";
import { Directory, serverPathToFile, pathToDirectory } from "../../../server/ApiManagers/UploadManager";

/**
 * This is an arbitrary bundle of data that gets populated
 * in extractFileContents
 */
interface DocumentContents {
    body: string;
    imageData: ImageData[];
    hyperlinks: string[];
    tableData: TableData[];
    longDescription: string;
}

/**
 * A rough schema for everything that Bill has
 * included for each document
 */
export interface DeviceDocument {
    title: string;
    shortDescription: string;
    longDescription: string;
    company: string;
    year: number;
    originalPrice?: number;
    degreesOfFreedom?: number;
    dimensions?: string;
    primaryKey: string;
    secondaryKey: string;
    attribute: string;
    __images: ImageData[];
    additionalMedia: ({ [type: string]: string } | undefined)[];
    hyperlinks: string[];
    captions: string[]; // from the table column
    embeddedFileNames: string[]; // from the table column
}

/**
 * A layer of abstraction around a single parsing
 * attempt. The error is not a TypeScript error, but
 * rather an invalidly formatted value for a given key.
 */
export interface AnalysisResult {
    device?: DeviceDocument;
    invalid?: { [deviceProperty: string]: string };
}

/**
 * A mini API that takes in a string and returns
 * either the given T or an error indicating that the
 * transformation was rejected.
 */
type Transformer<T> = (raw: string) => TransformResult<T>;

interface TransformResult<T> {
    transformed?: T;
    error?: string;
}

/**
 * Simple bundle counting successful and failed imports
 */
export interface ImportResults {
    deviceCount: number;
    errorCount: number;
}

/**
 * Definitions for callback functions. Such instances are
 * invoked when a single document has been parsed
 * or the entire import is over. As of this writing, these
 * callbacks are supplied by WebSocket.ts and used to inform
 * the client of these events.
 */
type ResultCallback = (result: AnalysisResult) => void;
type TerminatorCallback = (result: ImportResults) => void;
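/**
 * Purely illustrative (not used by the import itself): a minimal sketch of the
 * Transformer contract above. Exactly one of `transformed` or `error` should be
 * populated in the returned TransformResult. The boolean parsing shown here is a
 * hypothetical example, not a format that appears in Bill's documents.
 */
export const exampleBooleanTransformer: Transformer<boolean> = (raw: string) => {
    const clean = raw.trim().toLowerCase();
    if (clean === "yes" || clean === "no") {
        return { transformed: clean === "yes" }; // recognized token => success
    }
    return { error: `${raw} is not a recognized boolean token.` }; // anything else => rejection
};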
/**
 * Defines everything needed to specify how a single key should be
 * formatted within the plain body text. The association between
 * keys and their format definitions is stored in FormatMap.
 */
interface ValueFormatDefinition<T> {
    exp: RegExp; // the expression that the key's value should match
    matchIndex?: number; // defaults to 1, but can be overridden to account for grouping in @param exp
    transformer?: Transformer<T>; // if desirable, how to transform the regex match
    required?: boolean; // defaults to true; for a whole document to be counted as successful,
    // all of its required values must be present and properly formatted
}

/**
 * The basic data we extract from each image in the document
 */
interface ImageData {
    url: string;
    nativeWidth: number;
    nativeHeight: number;
}

namespace Utilities {

    /**
     * Numeric 'try parse', fits with the Transformer API
     * @param raw the serialized number
     */
    export function numberValue(raw: string): TransformResult<number> {
        const transformed = Number(raw);
        if (isNaN(transformed)) {
            return { error: `${raw} cannot be parsed to a numeric value.` };
        }
        return { transformed };
    }

    /**
     * A simple tokenizer that splits along 'and' and commas, and removes duplicates.
     * Helpful mainly for attribute and primary key lists
     * @param raw the string to tokenize
     */
    export function collectUniqueTokens(raw: string): TransformResult<string[]> {
        const pieces = raw.replace(/,|\s+and\s+/g, " ").split(/\s+/).filter(piece => piece.length);
        const unique = new Set(pieces.map(token => token.toLowerCase().trim()));
        return { transformed: Array.from(unique).map(capitalize).sort() };
    }

    /**
     * Tries to correct an XML text parsing artifact where some sentences lose their
     * separating space, and others gain excess whitespace
     * @param raw the text to correct
     */
    export function correctSentences(raw: string): TransformResult<string> {
        raw = raw.replace(/\./g, ". ").replace(/\:/g, ": ").replace(/\,/g, ", ").replace(/\?/g, "? ").trimRight();
        raw = raw.replace(/\s{2,}/g, " ");
        return { transformed: raw };
    }

    /**
     * Simple capitalization
     * @param word the word to capitalize
     */
    export function capitalize(word: string): string {
        const clean = word.trim();
        if (!clean.length) {
            return word;
        }
        return word.charAt(0).toUpperCase() + word.slice(1);
    }

    /**
     * Streams the requested file at the relative path from the
     * root of the zip, then parses it with a library
     * @param zip the zip instance data source
     * @param relativePath the path to a .xml file within the zip to parse
     */
    export async function readAndParseXml(zip: any, relativePath: string) {
        console.log(`Text streaming ${relativePath}`);
        const contents = await new Promise<string>((resolve, reject) => {
            let body = "";
            zip.stream(relativePath, (error: any, stream: any) => {
                if (error) {
                    reject(error);
                }
                stream.on('data', (chunk: any) => body += chunk.toString());
                stream.on('end', () => resolve(body));
            });
        });
        return parseXml(contents);
    }

}
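/**
 * Purely illustrative (never invoked): expected behavior of the two text helpers
 * above, with fabricated inputs and their outputs recorded inline.
 */
export function utilitiesUsageSketch() {
    // duplicates collapse (case-insensitively); 'and' and commas act as separators
    Utilities.collectUniqueTokens("touch, pen and Touch"); // => { transformed: ["Pen", "Touch"] }
    // lost sentence breaks are re-spaced, then excess whitespace is collapsed
    Utilities.correctSentences("First sentence.Second   sentence."); // => { transformed: "First sentence. Second sentence." }
}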
/**
 * Defines how device values should be formatted. As you can see, the formatting is
 * not super consistent and has changed over time as edge cases have been found, but this
 * at least imposes some constraints, and will notify you if a document doesn't match the
 * specifications in this map.
 */
const FormatMap = new Map<keyof DeviceDocument, ValueFormatDefinition<any>>([
    ["title", {
        exp: /contact\s+(.*)Short Description:/
    }],
    ["company", {
        exp: /Company:\s+([^\|]*)\s+\|/,
        transformer: (raw: string) => ({ transformed: raw.replace(/\./g, "") })
    }],
    ["year", {
        exp: /Year:\s+([^\|]*)\s+\|/,
        transformer: (raw: string) => Utilities.numberValue(/[0-9]{4}/.exec(raw)![0])
    }],
    ["primaryKey", {
        exp: /Primary:\s+(.*)(Secondary|Additional):/,
        transformer: raw => {
            const { transformed, error } = Utilities.collectUniqueTokens(raw);
            return transformed ? { transformed: transformed[0] } : { error };
        }
    }],
    ["secondaryKey", {
        exp: /(Secondary|Additional):\s+(.*)Attributes?:/,
        transformer: raw => {
            const { transformed, error } = Utilities.collectUniqueTokens(raw);
            return transformed ? { transformed: transformed[0] } : { error };
        },
        matchIndex: 2
    }],
    ["attribute", {
        exp: /Attributes?:\s+(.*)Links/,
        transformer: raw => {
            const { transformed, error } = Utilities.collectUniqueTokens(raw);
            return transformed ? { transformed: transformed[0] } : { error };
        },
    }],
    ["originalPrice", {
        exp: /Original Price \(USD\)\:\s+(\$[0-9\,]+\.[0-9]+|NFS)/,
        transformer: (raw: string) => {
            raw = raw.replace(/\,/g, "");
            if (raw === "NFS") {
                return { transformed: -1 };
            }
            return Utilities.numberValue(raw.slice(1));
        },
        required: false
    }],
    ["degreesOfFreedom", {
        exp: /Degrees of Freedom:\s+([0-9]+)/,
        transformer: Utilities.numberValue,
        required: false
    }],
    ["dimensions", {
        exp: /Dimensions\s+\(L x W x H\):\s+([0-9\.]+\s+x\s+[0-9\.]+\s+x\s+[0-9\.]+\s\([A-Za-z]+\))/,
        transformer: (raw: string) => {
            const [length, width, group] = raw.split(" x ");
            const [height, unit] = group.split(" ");
            return {
                transformed: {
                    dim_length: Number(length),
                    dim_width: Number(width),
                    dim_height: Number(height),
                    dim_unit: unit.replace(/[\(\)]+/g, "")
                }
            };
        },
        required: false
    }],
    ["shortDescription", {
        exp: /Short Description:\s+(.*)Bill Buxton[’']s Notes/,
        transformer: Utilities.correctSentences
    }],
]);
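/**
 * A worked sketch (fabricated body text, not from a real document) of how a
 * FormatMap entry drives extraction: the RegExp captures the raw value, and the
 * transformer, if any, cleans it up.
 */
export function formatMapSketch() {
    const sampleBody = "Company: Microsoft Corp. | Year: circa 1999 |";
    const { exp, transformer } = FormatMap.get("company")!;
    const captured = exp.exec(sampleBody)![1]; // "Microsoft Corp."
    transformer!(captured); // => { transformed: "Microsoft Corp" } (periods stripped)
}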
const sourceDir = path.resolve(__dirname, "source"); // where the Word documents are assumed to be stored
const assetDir = path.resolve(__dirname, "assets"); // where any additional media content, like PDFs, will be stored. Each subdirectory
// of this must follow the Directory enum's naming scheme
const outDir = path.resolve(__dirname, "json"); // where the JSON output of these device documents will be written
const imageDir = path.resolve(__dirname, "../../../server/public/files/images/buxton"); // where, on the server, these images will be written

const successOut = "buxton.json"; // the JSON list representing properly formatted documents
const failOut = "incomplete.json"; // the JSON list representing improperly formatted documents

const deviceKeys = Array.from(FormatMap.keys()); // a way to iterate through all keys of the DeviceDocument interface

/**
 * Starts by REMOVING ALL EXISTING BUXTON RESOURCES. This might need to be
 * changed going forward.
 * @param emitter the callback invoked when each document is completed
 * @param terminator the callback invoked when the entire import is completed
 */
export default async function executeImport(emitter: ResultCallback, terminator: TerminatorCallback) {
    try {
        // get all Word documents in the source directory
        const contents = readdirSync(sourceDir);
        const wordDocuments = contents.filter(file => /.*\.docx?$/.test(file)).map(file => `${sourceDir}/${file}`);
        // removal takes place here
        [outDir, imageDir].forEach(dir => {
            rimraf.sync(dir);
            mkdirSync(dir);
        });
        await transferAssets();
        return parseFiles(wordDocuments, emitter, terminator);
    } catch (e) {
        const message = [
            "Unable to find a source directory.",
            "Please ensure that the following directory exists:",
            `${e.message}`
        ].join('\n');
        console.log(red(message));
        return { error: message };
    }
}

/**
 * Builds a mirrored directory structure of all media / asset files
 * within the server's public directory.
 */
async function transferAssets() {
    for (const assetType of readdirSync(assetDir)) {
        const subroot = path.resolve(assetDir, assetType);
        if (!statSync(subroot).isDirectory()) {
            continue;
        }
        const outputSubroot = serverPathToFile(assetType as Directory, "buxton");
        if (existsSync(outputSubroot)) {
            continue;
        } else {
            mkdirSync(outputSubroot);
        }
        for (const fileName of readdirSync(subroot)) {
            const readStream = createReadStream(path.resolve(subroot, fileName));
            const writeStream = createWriteStream(path.resolve(outputSubroot, fileName));
            await new Promise(resolve => {
                readStream.pipe(writeStream).on("close", resolve);
            });
        }
    }
}

/**
 * Parses every Word document in the directory, notifying any callers as needed
 * at each iteration via the emitter.
 * @param wordDocuments the string list of Word document names to parse
 * @param emitter the callback invoked when each document is completed
 * @param terminator the callback invoked when the entire import is completed
 */
async function parseFiles(wordDocuments: string[], emitter: ResultCallback, terminator: TerminatorCallback): Promise<DeviceDocument[]> {
    // execute the parent-most parse function
    const results: AnalysisResult[] = [];
    for (const filePath of wordDocuments) {
        const fileName = path.basename(filePath).replace("Bill_Notes_", ""); // not strictly needed, but cleaner
        console.log(cyan(`\nExtracting contents from ${fileName}...`));
        const result = analyze(fileName, await extractFileContents(filePath));
        emitter(result);
        results.push(result);
    }

    // collect information about errors and successes
    const masterDevices: DeviceDocument[] = [];
    const masterErrors: { [key: string]: string }[] = [];
    results.forEach(({ device, invalid: errors }) => {
        if (device) {
            masterDevices.push(device);
        } else if (errors) {
            masterErrors.push(errors);
        }
    });

    // something went wrong, since errors and successes should sum to the total number of inputs
    const total = wordDocuments.length;
    if (masterDevices.length + masterErrors.length !== total) {
        throw new Error(`Encountered a ${masterDevices.length} to ${masterErrors.length} mismatch in the device / error split!`);
    }

    // write the external JSON representations of this import
    console.log();
    await writeOutputFile(successOut, masterDevices, total, true);
    await writeOutputFile(failOut, masterErrors, total, false);
    console.log();

    // notify the caller that the import has finished
    terminator({ deviceCount: masterDevices.length, errorCount: masterErrors.length });

    return masterDevices;
}
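/**
 * A minimal sketch of how a caller (in practice, WebSocket.ts, per the notes
 * above) might drive the import. The logging here is hypothetical, not the
 * actual client code.
 */
export async function importInvocationSketch() {
    await executeImport(
        ({ device, invalid }) => console.log(device ? `Parsed ${device.title}` : `Rejected: ${invalid!.fileName}`),
        ({ deviceCount, errorCount }) => console.log(`Done: ${deviceCount} devices, ${errorCount} errors`)
    );
}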
/**
 * XPath definitions for the desired XML targets in their respective hierarchies.
 *
 * For table cells, this can be read as: "find me anything that looks like <w:tc>
 * in the XML, whose parent looks like <w:tr>, whose parent looks like <w:tbl>":
 *
 * <w:tbl>
 *   <w:tr>
 *     <w:tc>
 *     </w:tc>
 *   </w:tr>
 * </w:tbl>
 *
 * These were found by trial and error, using an online XML parser / prettifier
 * to inspect the structure, since the Node XML library does not expose the parsed
 * structure very well for searching, say, in the debug console.
 */
const xPaths = {
    paragraphs: '//*[name()="w:p"]',
    tableCells: '//*[name()="w:tbl"]/*[name()="w:tr"]/*[name()="w:tc"]',
    hyperlinks: '//*[name()="Relationship" and contains(@Type, "hyperlink")]'
};

interface TableData {
    fileName: string;
    caption: string;
    additionalMedia?: { [type: string]: string };
}

const SuffixDirectoryMap = new Map<string, Directory>([
    ["p", Directory.pdfs]
]);
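/**
 * Purely illustrative: how a row's suffix cell (see the table parsing below)
 * resolves to an additionalMedia entry. A hypothetical row for "mouse.png" with
 * suffix "P" would map Directory.pdfs to "mouse.pdf".
 */
export function additionalMediaSketch(fileName: string, suffix: string) {
    const key = SuffixDirectoryMap.get(suffix.toLowerCase());
    return key ? { [key]: `${fileName.split(".")[0]}.pdf` } : undefined;
}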
/**
 * The meat of the script: images and text content are extracted here
 * @param pathToDocument the path to the document relative to the root of the zip
 */
async function extractFileContents(pathToDocument: string): Promise<DocumentContents> {
    console.log('Extracting text...');
    const zip = new StreamZip({ file: pathToDocument, storeEntries: true });
    await new Promise(resolve => zip.on('ready', resolve));

    // extract the body of the document and, specifically, its captions
    const document = await Utilities.readAndParseXml(zip, "word/document.xml");

    // get plain text
    const body = document.root()?.text() ?? "No body found. Check the import script's XML parser.";
    const captions: string[] = [];
    const tableData: TableData[] = [];

    // preserve paragraph formatting and line breaks that would otherwise get lost in the plain text parsing
    // of the XML hierarchy
    const paragraphs = document.find(xPaths.paragraphs).map(node => Utilities.correctSentences(node.text()).transformed!);
    const start = paragraphs.indexOf(paragraphs.find(el => /Bill Buxton[’']s Notes/.test(el))!) + 1;
    const end = paragraphs.indexOf("Device Details");
    const longDescription = paragraphs.slice(start, end).filter(paragraph => paragraph.length).join("\n\n");

    // extract captions from the table cells
    const tableRowsFlattened = document.find(xPaths.tableCells).map(node => node.text().trim());
    const { length } = tableRowsFlattened;
    const numCols = 4;
    strictEqual(length > numCols, true, "No captions written."); // the first row has the headers, not content
    strictEqual(length % numCols === 0, true, "Improper caption formatting.");

    // break the flat list of strings into groups of numCols. Thus, each group represents
    // a row in the table, where the first cell has no text content since it's
    // the image, the second has the file name and the third has the caption (maybe additional columns
    // have been added or reordered since this was written, but they follow the same approach)
    for (let i = numCols; i < tableRowsFlattened.length; i += numCols) {
        const row = tableRowsFlattened.slice(i, i + numCols);
        const entry: TableData = { fileName: row[1], caption: row[2] };
        const key = SuffixDirectoryMap.get(row[3].toLowerCase());
        if (key) {
            const media: any = {};
            media[key] = `${entry.fileName.split(".")[0]}.pdf`;
            entry.additionalMedia = media;
        }
        tableData.push(entry);
    }

    // extract all hyperlinks embedded in the document
    const rels = await Utilities.readAndParseXml(zip, "word/_rels/document.xml.rels");
    const hyperlinks = rels.find(xPaths.hyperlinks).map(el => el.attrs()[2].value());
    console.log("Text extracted.");

    // write out the images for this document
    console.log("Beginning image extraction...");
    const imageData = await writeImages(zip);
    console.log(`Extracted ${imageData.length} images.`);

    // cleanup
    zip.close();

    return { body, longDescription, imageData, tableData, hyperlinks };
}
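/**
 * To make the grouping above concrete, a fabricated flattened table with
 * numCols = 4 might look like the following, where the header row is skipped
 * and each later row contributes one TableData entry:
 *
 *   ["", "File Name", "Caption", "Media"]
 *   ["", "mouse.png", "The first mouse.", "p"]  // => { fileName: "mouse.png", caption: "The first mouse.", additionalMedia: { [Directory.pdfs]: "mouse.pdf" } }
 *   ["", "pad.png", "A drawing pad.", ""]       // => { fileName: "pad.png", caption: "A drawing pad." }
 */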
// zip-relative path from the root; the expression / filter used to isolate only media assets
const imageEntry = /^word\/media\/\w+\.(jpeg|jpg|png|gif)/;

/**
 * Image dimensions and file suffix
 */
interface ImageAttrs {
    width: number;
    height: number;
    type: string;
}

/**
 * For each image, stream the file and get its size, checking whether it's an icon
 * (if it is, ignore it)
 * @param zip the zip instance data source
 */
async function writeImages(zip: any): Promise<ImageData[]> {
    const allEntries = Object.values(zip.entries()).map(({ name }) => name);
    const imageEntries = allEntries.filter(name => imageEntry.test(name));
    const imageUrls: ImageData[] = [];
    const valid: any[] = [];

    const getImageStream = (mediaPath: string) => new Promise<Readable>((resolve, reject) => {
        zip.stream(mediaPath, (error: any, stream: any) => error ? reject(error) : resolve(stream));
    });

    for (const mediaPath of imageEntries) {
        const { width, height, type } = await new Promise<ImageAttrs>(async resolve => {
            const sizeStream = (createImageSizeStream() as PassThrough).on('size', (dimensions: ImageAttrs) => {
                readStream.destroy();
                resolve(dimensions);
            }).on("error", () => readStream.destroy());
            const readStream = await getImageStream(mediaPath);
            readStream.pipe(sizeStream);
        });
        // keep it only if it's not an icon, by this rough heuristic (i.e. it's not square)
        const number = Number(/image(\d+)/.exec(mediaPath)![1]);
        if (number > 5 || width - height > 10) {
            valid.push({ width, height, type, mediaPath, number });
        }
    }
    valid.sort((a, b) => a.number - b.number);

    // if the first two images share an aspect ratio, they're likely duplicates, so drop the larger one
    const [{ width: first_w, height: first_h }, { width: second_w, height: second_h }] = valid;
    if (Math.abs(first_w / second_w - first_h / second_h) < 0.01) {
        const first_size = first_w * first_h;
        const second_size = second_w * second_h;
        const target = first_size >= second_size ? 1 : 0;
        valid.splice(target, 1);
        console.log(`Heuristically removed image with size ${target ? second_size : first_size}`);
    }

    // for each valid image, output the _o, _l, _m, and _s files
    // THIS IS WHERE THE SCRIPT SPENDS MOST OF ITS TIME
    for (const { type, width, height, mediaPath } of valid) {
        const generatedFileName = `upload_${Utils.GenerateGuid()}.${type.toLowerCase()}`;
        await DashUploadUtils.outputResizedImages(() => getImageStream(mediaPath), generatedFileName, imageDir);
        imageUrls.push({
            url: `/files/images/buxton/${generatedFileName}`,
            nativeWidth: width,
            nativeHeight: height
        });
    }

    return imageUrls;
}

/**
 * Takes the results of extractFileContents (which, relative to this, handles the
 * external media / preliminary text processing) and tests the given file's body
 * against the value definitions in FormatMap, to make sure the document contains
 * all required fields, properly formatted.
 * @param fileName the file whose body to inspect
 * @param contents the data already computed / parsed by extractFileContents
 */
function analyze(fileName: string, contents: DocumentContents): AnalysisResult {
    const { body, imageData, hyperlinks, tableData, longDescription } = contents;
    const device: any = {
        hyperlinks,
        captions: tableData.map(({ caption }) => caption),
        embeddedFileNames: tableData.map(({ fileName }) => fileName),
        additionalMedia: tableData.map(({ additionalMedia }) => additionalMedia),
        longDescription,
        __images: imageData
    };
    const errors: { [key: string]: string } = { fileName };

    for (const key of deviceKeys) {
        const { exp, transformer, matchIndex, required } = FormatMap.get(key)!;
        const matches = exp.exec(body);
        let captured: string;
        // if we matched and we got the specific match we're after
        if (matches && (captured = matches[matchIndex ?? 1])) { // matchIndex defaults to 1
            captured = captured.replace(/\s{2,}/g, " "); // remove excess whitespace
            // if supplied, apply the required transformation (recall, this is specified in FormatMap)
            if (transformer) {
                const { error, transformed } = transformer(captured);
                if (error) {
                    // we hit a snag trying to transform the valid match:
                    // still counts as a fundamental error
                    errors[key] = `__ERR__${key.toUpperCase()}__TRANSFORM__: ${error}`;
                    continue;
                }
                captured = transformed;
            }
            device[key] = captured;
        } else if (required ?? true) {
            // the field was either implicitly or explicitly required, and failed to match
            // the definition in FormatMap
            errors[key] = `ERR__${key.toUpperCase()}__: outer match ${matches === null ? "wasn't" : "was"} captured.`;
            continue;
        }
    }

    // print the errors - this logging can be removed
    const errorKeys = Object.keys(errors);
    if (errorKeys.length > 1) {
        console.log(red(`@ ${cyan(fileName.toUpperCase())}...`));
        errorKeys.forEach(key => key !== "fileName" && console.log(red(errors[key])));
        return { invalid: errors };
    }

    return { device };
}
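/**
 * For reference, a fabricated AnalysisResult failure from analyze might look like:
 *
 *   { invalid: { fileName: "Sample.docx", year: "ERR__YEAR__: outer match wasn't captured." } }
 *
 * while a success carries the fully populated DeviceDocument under `device`.
 */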
/**
 * A utility function that writes the JSON results of this import out to the desired path
 * @param relativePath where to write the JSON file
 * @param data valid device document objects, or errors
 * @param total used for more informative printing
 * @param success whether the caller is writing the successful parses or the failures
 */
async function writeOutputFile(relativePath: string, data: any[], total: number, success: boolean) {
    console.log(yellow(`Encountered ${data.length} ${success ? "valid" : "invalid"} documents out of ${total} candidates. Writing ${relativePath}...`));
    return new Promise<void>((resolve, reject) => {
        const destination = path.resolve(outDir, relativePath);
        const contents = JSON.stringify(data, undefined, 4); // format the JSON with whitespace
        writeFile(destination, contents, err => err ? reject(err) : resolve());
    });
}