diff options
author | Sam Wilkins <samwilkins333@gmail.com> | 2020-04-30 19:06:42 -0700 |
---|---|---|
committer | Sam Wilkins <samwilkins333@gmail.com> | 2020-04-30 19:06:42 -0700 |
commit | 1660defc561c904217ed5be34cd6e0fe64736fe1 (patch) | |
tree | a5dc2ccc790be5c2d6b815a5a4c4859b61ecea89 /src | |
parent | 22748f8d35235941fc6622b19a2d4d3f809ccee7 (diff) |
commented Buxton importer
Diffstat (limited to 'src')
-rw-r--r-- | src/scraping/buxton/final/BuxtonImporter.ts | 212 | ||||
-rw-r--r-- | src/server/authentication/models/current_user_utils.ts | 1 |
2 files changed, 179 insertions, 34 deletions
diff --git a/src/scraping/buxton/final/BuxtonImporter.ts b/src/scraping/buxton/final/BuxtonImporter.ts index 713207a07..21363f848 100644 --- a/src/scraping/buxton/final/BuxtonImporter.ts +++ b/src/scraping/buxton/final/BuxtonImporter.ts @@ -10,6 +10,10 @@ import { parseXml } from "libxmljs"; import { strictEqual } from "assert"; import { Readable, PassThrough } from "stream"; +/** + * This is an arbitrary bundle of data that gets populated + * in extractFileContents + */ interface DocumentContents { body: string; imageData: ImageData[]; @@ -19,6 +23,10 @@ interface DocumentContents { longDescription: string; } +/** + * A rough schema for everything that Bill has + * included for each document + */ export interface DeviceDocument { title: string; shortDescription: string; @@ -33,36 +41,65 @@ export interface DeviceDocument { attribute: string; __images: ImageData[]; hyperlinks: string[]; - captions: string[]; - embeddedFileNames: string[]; + captions: string[]; // from the table column + embeddedFileNames: string[]; // from the table column } +/** + * A layer of abstraction around a single parsing + * attempt. The error is not a TypeScript error, but + * rather an invalidly formatted value for a given key. + */ export interface AnalysisResult { device?: DeviceDocument; - errors?: { [key: string]: string }; + invalid?: { [deviceProperty: string]: string }; } +/** + * A mini API that takes in a string and returns + * either the given T or an error indicating that the + * transformation was rejected. + */ type Transformer<T> = (raw: string) => TransformResult<T>; interface TransformResult<T> { transformed?: T; error?: string; } +/** + * Simple bundle counting successful and failed imports + */ export interface ImportResults { deviceCount: number; errorCount: number; } +/** + * Definitions for callback functions. Such instances are + * invoked when a single document has been parsed + * or the entire import is over. 
As of this writing, these + * callbacks are supplied by WebSocket.ts and used to inform + * the client of these events. + */ type ResultCallback = (result: AnalysisResult) => void; type TerminatorCallback = (result: ImportResults) => void; -interface Processor<T> { - exp: RegExp; - matchIndex?: number; - transformer?: Transformer<T>; - required?: boolean; +/** + * Defines everything needed to define how a single key should be + * formatted within the plain body text. The association between + * keys and their format definitions is stored in FormatMap + */ +interface ValueFormatDefinition<T> { + exp: RegExp; // the expression that the key's value should match + matchIndex?: number; // defaults to 1, but can be overridden to account for grouping in @param exp + transformer?: Transformer<T>; // if desirable, how to transform the Regex match + required?: boolean; // defaults to true, confirms that for a whole document to be counted successful, + // all of its required values should be present and properly formatted } +/** + * The basic data we extract from each image in the document + */ interface ImageData { url: string; nativeWidth: number; @@ -71,6 +108,10 @@ interface ImageData { namespace Utilities { + /** + * Numeric 'try parse', fits with the Transformer API + * @param raw the serialized number + */ export function numberValue(raw: string): TransformResult<number> { const transformed = Number(raw); if (isNaN(transformed)) { @@ -79,18 +120,32 @@ namespace Utilities { return { transformed }; } + /** + * A simple tokenizer that splits along 'and' and commas, and removes duplicates + * Helpful mainly for attribute and primary key lists + * @param raw the string to tokenize + */ export function collectUniqueTokens(raw: string): TransformResult<string[]> { const pieces = raw.replace(/,|\s+and\s+/g, " ").split(/\s+/).filter(piece => piece.length); const unique = new Set(pieces.map(token => token.toLowerCase().trim())); return { transformed: 
Array.from(unique).map(capitalize).sort() }; } + /** + * Tries to correct XML text parsing artifact where some sentences lose their separating space, + * and others gain excess whitespace + * @param raw + */ export function correctSentences(raw: string): TransformResult<string> { raw = raw.replace(/\./g, ". ").replace(/\:/g, ": ").replace(/\,/g, ", ").replace(/\?/g, "? ").trimRight(); raw = raw.replace(/\s{2,}/g, " "); return { transformed: raw }; } + /** + * Simple capitalization + * @param word to capitalize + */ export function capitalize(word: string): string { const clean = word.trim(); if (!clean.length) { @@ -99,6 +154,12 @@ namespace Utilities { return word.charAt(0).toUpperCase() + word.slice(1); } + /** + * Streams the requested file at the relative path to the + * root of the zip, then parses it with a library + * @param zip the zip instance data source + * @param relativePath the path to a .xml file within the zip to parse + */ export async function readAndParseXml(zip: any, relativePath: string) { console.log(`Text streaming ${relativePath}`); const contents = await new Promise<string>((resolve, reject) => { @@ -111,13 +172,17 @@ namespace Utilities { stream.on('end', () => resolve(body)); }); }); - return parseXml(contents); } - } -const RegexMap = new Map<keyof DeviceDocument, Processor<any>>([ +/** + * Defines how device values should be formatted. As you can see, the formatting is + * not super consistent and has changed over time as edge cases have been found, but this + * at least imposes some constraints, and will notify you if a document doesn't match the specifications + * in this map. 
+ */ +const FormatMap = new Map<keyof DeviceDocument, ValueFormatDefinition<any>>([ ["title", { exp: /contact\s+(.*)Short Description:/ }], @@ -189,17 +254,25 @@ const RegexMap = new Map<keyof DeviceDocument, Processor<any>>([ }], ]); -const sourceDir = path.resolve(__dirname, "source"); -const outDir = path.resolve(__dirname, "json"); -const imageDir = path.resolve(__dirname, "../../../server/public/files/images/buxton"); -const successOut = "buxton.json"; -const failOut = "incomplete.json"; -const deviceKeys = Array.from(RegexMap.keys()); - +const sourceDir = path.resolve(__dirname, "source"); // where the Word documents are assumed to be stored +const outDir = path.resolve(__dirname, "json"); // where the JSON output of these device documents will be written +const imageDir = path.resolve(__dirname, "../../../server/public/files/images/buxton"); // where, in the server, these images will be written +const successOut = "buxton.json"; // the JSON list representing properly formatted documents +const failOut = "incomplete.json"; // the JSON list representing improperly formatted documents +const deviceKeys = Array.from(FormatMap.keys()); // a way to iterate through all keys of the DeviceDocument interface + +/** + * Starts by REMOVING ALL EXISTING BUXTON RESOURCES. 
This might need to be + * changed going forward + * @param emitter the callback when each document is completed + * @param terminator the callback when the entire import is completed + */ export default async function executeImport(emitter: ResultCallback, terminator: TerminatorCallback) { try { + // get all Word documents in the source directory const contents = readdirSync(sourceDir); const wordDocuments = contents.filter(file => /.*\.docx?$/.test(file)).map(file => `${sourceDir}/${file}`); + // removal takes place here [outDir, imageDir].forEach(dir => { rimraf.sync(dir); mkdirSync(dir); @@ -216,19 +289,28 @@ export default async function executeImport(emitter: ResultCallback, terminator: } } +/** + * Parse every Word document in the directory, notifying any callers as needed + * at each iteration via the emitter. + * @param wordDocuments the string list of Word document names to parse + * @param emitter the callback when each document is completed + * @param terminator the callback when the entire import is completed + */ async function parseFiles(wordDocuments: string[], emitter: ResultCallback, terminator: TerminatorCallback): Promise<DeviceDocument[]> { + // execute parent-most parse function const results: AnalysisResult[] = []; for (const filePath of wordDocuments) { - const fileName = path.basename(filePath).replace("Bill_Notes_", ""); + const fileName = path.basename(filePath).replace("Bill_Notes_", ""); // not strictly needed, but cleaner console.log(cyan(`\nExtracting contents from ${fileName}...`)); const result = analyze(fileName, await extractFileContents(filePath)); emitter(result); results.push(result); } + // collect information about errors and successes const masterDevices: DeviceDocument[] = []; const masterErrors: { [key: string]: string }[] = []; - results.forEach(({ device, errors }) => { + results.forEach(({ device, invalid: errors }) => { if (device) { masterDevices.push(device); } else if (errors) { @@ -236,24 +318,45 @@ async function 
parseFiles(wordDocuments: string[], emitter: ResultCallback, term } }); + // something went wrong, since errors and successes should sum to total inputs const total = wordDocuments.length; if (masterDevices.length + masterErrors.length !== total) { throw new Error(`Encountered a ${masterDevices.length} to ${masterErrors.length} mismatch in device / error split!`); } + // write the external JSON representations of this import console.log(); await writeOutputFile(successOut, masterDevices, total, true); await writeOutputFile(failOut, masterErrors, total, false); console.log(); + // notify the caller that the import has finished terminator({ deviceCount: masterDevices.length, errorCount: masterErrors.length }); return masterDevices; } +/** + * XPath definitions for desired XML targets in respective hierarchies. + * + * For table cells, can be read as: "find me anything that looks like <w:tc> in XML, whose + * parent looks like <w:tr>, whose parent looks like <w:tbl>" + * + * <w:tbl> + * <w:tr> + * <w:tc> + * + * These are found by trial and error, and using an online XML parser / prettifier + * to inspect the structure, since the Node XML library does not expose the parsed + * structure very well for searching, say in the debug console. 
+ */ const tableCellXPath = '//*[name()="w:tbl"]/*[name()="w:tr"]/*[name()="w:tc"]'; const hyperlinkXPath = '//*[name()="Relationship" and contains(@Type, "hyperlink")]'; +/** + * The meat of the script, images and text content are extracted here + * @param pathToDocument the path to the document relative to the root of the zip + */ async function extractFileContents(pathToDocument: string): Promise<DocumentContents> { console.log('Extracting text...'); const zip = new StreamZip({ file: pathToDocument, storeEntries: true }); @@ -261,22 +364,30 @@ async function extractFileContents(pathToDocument: string): Promise<DocumentCont // extract the body of the document and, specifically, its captions const document = await Utilities.readAndParseXml(zip, "word/document.xml"); + // get plain text const body = document.root()?.text() ?? "No body found. Check the import script's XML parser."; const captions: string[] = []; const embeddedFileNames: string[] = []; - const captionTargets = document.find(tableCellXPath).map(node => node.text().trim()); + // preserve paragraph formatting and line breaks that would otherwise get lost in the plain text parsing + // of the XML hierarchy const paragraphs = document.find('//*[name()="w:p"]').map(node => Utilities.correctSentences(node.text()).transformed!); const start = paragraphs.indexOf(paragraphs.find(el => /Bill Buxton[’']s Notes/.test(el))!) 
+ 1; const end = paragraphs.indexOf("Device Details"); const longDescription = paragraphs.slice(start, end).filter(paragraph => paragraph.length).join("\n\n"); - const { length } = captionTargets; + // extract captions from the table cells + const tableRowsFlattened = document.find(tableCellXPath).map(node => node.text().trim()); + const { length } = tableRowsFlattened; strictEqual(length > 3, true, "No captions written."); strictEqual(length % 3 === 0, true, "Improper caption formatting."); - for (let i = 3; i < captionTargets.length; i += 3) { - const row = captionTargets.slice(i, i + 3); + // break the flat list of strings into groups of three, since there + // currently are three columns in the table. Thus, each group represents + // a row in the table, where the first row has no text content since it's + // the image, the second has the file name and the third has the caption + for (let i = 3; i < tableRowsFlattened.length; i += 3) { + const row = tableRowsFlattened.slice(i, i + 3); embeddedFileNames.push(row[1]); captions.push(row[2]); } @@ -286,23 +397,34 @@ async function extractFileContents(pathToDocument: string): Promise<DocumentCont const hyperlinks = rels.find(hyperlinkXPath).map(el => el.attrs()[2].value()); console.log("Text extracted."); + // write out the images for this document console.log("Beginning image extraction..."); const imageData = await writeImages(zip); console.log(`Extracted ${imageData.length} images.`); + // cleanup zip.close(); return { body, longDescription, imageData, captions, embeddedFileNames, hyperlinks }; } +// zip relative path from root expression / filter used to isolate only media assets const imageEntry = /^word\/media\/\w+\.(jpeg|jpg|png|gif)/; -interface Dimensions { +/** + * Image dimensions and file suffix, + */ +interface ImageAttrs { width: number; height: number; type: string; } +/** + * For each image, stream the file, get its size, check if it's an icon + * (if it is, ignore it) + * @param zip the zip instance 
data source + */ async function writeImages(zip: any): Promise<ImageData[]> { const allEntries = Object.values<any>(zip.entries()).map(({ name }) => name); const imageEntries = allEntries.filter(name => imageEntry.test(name)); @@ -315,8 +437,8 @@ async function writeImages(zip: any): Promise<ImageData[]> { }); for (const mediaPath of imageEntries) { - const { width, height, type } = await new Promise<Dimensions>(async resolve => { - const sizeStream = (createImageSizeStream() as PassThrough).on('size', (dimensions: Dimensions) => { + const { width, height, type } = await new Promise<ImageAttrs>(async resolve => { + const sizeStream = (createImageSizeStream() as PassThrough).on('size', (dimensions: ImageAttrs) => { readStream.destroy(); resolve(dimensions); }).on("error", () => readStream.destroy()); @@ -324,11 +446,14 @@ async function writeImages(zip: any): Promise<ImageData[]> { readStream.pipe(sizeStream); }); + // if it's not an icon, by this rough heuristic, i.e. is it not square if (Math.abs(width - height) > 10) { valid.push({ width, height, type, mediaPath }); } } + // for each valid image, output the _o, _l, _m, and _s files + // THIS IS WHERE THE SCRIPT SPENDS MOST OF ITS TIME for (const { type, width, height, mediaPath } of valid) { const generatedFileName = `upload_${Utils.GenerateGuid()}.${type.toLowerCase()}`; await DashUploadUtils.outputResizedImages(() => getImageStream(mediaPath), generatedFileName, imageDir); @@ -342,6 +467,14 @@ async function writeImages(zip: any): Promise<ImageData[]> { return imageUrls; } +/** + * Takes the results of extractFileContents, which relative to this is sort of the + * external media / preliminary text processing, and now tests the given file + * against the value definitions in FormatMap to make sure the body of the document contains all + * required fields, properly formatted + * @param fileName the file whose body to inspect + * @param contents the data already computed / parsed by extractFileContents + */ function 
analyze(fileName: string, contents: DocumentContents): AnalysisResult { const { body, imageData, captions, hyperlinks, embeddedFileNames, longDescription } = contents; const device: any = { @@ -354,43 +487,56 @@ function analyze(fileName: string, contents: DocumentContents): AnalysisResult { const errors: { [key: string]: string } = { fileName }; for (const key of deviceKeys) { - const { exp, transformer, matchIndex, required } = RegexMap.get(key)!; + const { exp, transformer, matchIndex, required } = FormatMap.get(key)!; const matches = exp.exec(body); let captured: string; - if (matches && (captured = matches[matchIndex ?? 1])) { - captured = captured.replace(/\s{2,}/g, " "); + // if we matched and we got the specific match we're after + if (matches && (captured = matches[matchIndex ?? 1])) { // matchIndex defaults to 1 + captured = captured.replace(/\s{2,}/g, " "); // remove excess whitespace + // if supplied, apply the required transformation (recall this is specified in FormatMap) if (transformer) { const { error, transformed } = transformer(captured); if (error) { + // we hit a snag trying to transform the valid match + // still counts as a fundamental error errors[key] = `__ERR__${key.toUpperCase()}__TRANSFORM__: ${error}`; continue; } captured = transformed; } - device[key] = captured; } else if (required ?? true) { + // the field was either implicitly or explicitly required, and failed to match the definition in + // FormatMap errors[key] = `ERR__${key.toUpperCase()}__: outer match ${matches === null ? 
"wasn't" : "was"} captured.`; continue; } } + // print errors - this can be removed const errorKeys = Object.keys(errors); if (errorKeys.length > 1) { console.log(red(`@ ${cyan(fileName.toUpperCase())}...`)); errorKeys.forEach(key => key !== "filename" && console.log(red(errors[key]))); - return { errors }; + return { invalid: errors }; } return { device }; } +/** + * A utility function that writes the JSON results for this import out to the desired path + * @param relativePath where to write the JSON file + * @param data valid device document objects, or errors + * @param total used for more informative printing + * @param success whether or not the caller is writing the successful parses or the failures + */ async function writeOutputFile(relativePath: string, data: any[], total: number, success: boolean) { console.log(yellow(`Encountered ${data.length} ${success ? "valid" : "invalid"} documents out of ${total} candidates. Writing ${relativePath}...`)); return new Promise<void>((resolve, reject) => { const destination = path.resolve(outDir, relativePath); - const contents = JSON.stringify(data, undefined, 4); + const contents = JSON.stringify(data, undefined, 4); // format the JSON writeFile(destination, contents, err => err ? reject(err) : resolve()); }); }
\ No newline at end of file diff --git a/src/server/authentication/models/current_user_utils.ts b/src/server/authentication/models/current_user_utils.ts index 663343f47..d7cc1e6bf 100644 --- a/src/server/authentication/models/current_user_utils.ts +++ b/src/server/authentication/models/current_user_utils.ts @@ -9,7 +9,6 @@ import { List } from "../../../new_fields/List"; import { listSpec } from "../../../new_fields/Schema"; import { ScriptField, ComputedField } from "../../../new_fields/ScriptField"; import { Cast, PromiseValue, StrCast, NumCast } from "../../../new_fields/Types"; -import { Utils } from "../../../Utils"; import { nullAudio, ImageField } from "../../../new_fields/URLField"; import { DragManager } from "../../../client/util/DragManager"; import { InkingControl } from "../../../client/views/InkingControl"; |