From c75ffd4900acea74c55b6bf275a5e8082c15d573 Mon Sep 17 00:00:00 2001 From: Sam Wilkins Date: Sun, 26 Apr 2020 19:29:50 -0700 Subject: formatted textbox disposers refactor, paragraph chunked rich text initialization and buxton importer updates --- src/scraping/buxton/final/BuxtonImporter.ts | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'src/scraping/buxton/final') diff --git a/src/scraping/buxton/final/BuxtonImporter.ts b/src/scraping/buxton/final/BuxtonImporter.ts index 122415460..64b988610 100644 --- a/src/scraping/buxton/final/BuxtonImporter.ts +++ b/src/scraping/buxton/final/BuxtonImporter.ts @@ -16,6 +16,7 @@ interface DocumentContents { hyperlinks: string[]; captions: string[]; embeddedFileNames: string[]; + longDescriptionParagraphs: string[]; } export interface DeviceDocument { @@ -186,10 +187,6 @@ const RegexMap = new Map>([ exp: /Short Description:\s+(.*)Bill Buxton[’']s Notes/, transformer: Utilities.correctSentences }], - ["longDescription", { - exp: /Bill Buxton[’']s Notes(.*)Device Details/, - transformer: Utilities.correctSentences - }], ]); const sourceDir = path.resolve(__dirname, "source"); @@ -267,7 +264,12 @@ async function extractFileContents(pathToDocument: string): Promise node.text()); + const captionTargets = document.find(tableCellXPath).map(node => node.text().trim()); + + const paragraphs = document.find('//*[name()="w:p"]').map(node => Utilities.correctSentences(node.text()).transformed!); + const start = paragraphs.indexOf(paragraphs.find(el => /Bill Buxton[’']s Notes/.test(el))!) + 1; + const end = paragraphs.indexOf("Device Details"); + const longDescriptionParagraphs = paragraphs.slice(start, end); const { length } = captionTargets; strictEqual(length > 3, true, "No captions written."); @@ -290,7 +292,7 @@ async function extractFileContents(pathToDocument: string): Promise { } function analyze(fileName: string, contents: DocumentContents): AnalysisResult { - const { body, imageData, captions, hyperlinks, embeddedFileNames } = contents; + const { body, imageData, captions, hyperlinks, embeddedFileNames, longDescriptionParagraphs } = contents; const device: any = { hyperlinks, captions, @@ -376,6 +378,7 @@ function analyze(fileName: string, contents: DocumentContents): AnalysisResult { return { errors }; } + device.longDescription = longDescriptionParagraphs.join("\n\n"); return { device }; } -- cgit v1.2.3-70-g09d2 From fc470b25759e8f051dc527066f9bebcaf5e7707d Mon Sep 17 00:00:00 2001 From: Sam Wilkins Date: Sun, 26 Apr 2020 23:55:23 -0700 Subject: various buxton fixes --- src/client/documents/Documents.ts | 1 - .../views/collections/CollectionTreeView.tsx | 24 ++++++++-------- src/client/views/nodes/DocumentContentsView.tsx | 8 +++--- src/client/views/nodes/FormattedTextBox.tsx | 9 +----- src/scraping/buxton/final/BuxtonImporter.ts | 32 ++++++++++++---------- 5 files changed, 36 insertions(+), 38 deletions(-) (limited to 'src/scraping/buxton/final') diff --git a/src/client/documents/Documents.ts b/src/client/documents/Documents.ts index 1651a6d55..5e0890e76 100644 --- a/src/client/documents/Documents.ts +++ b/src/client/documents/Documents.ts @@ -408,7 +408,6 @@ export namespace Docs { const doc = StackingDocument(deviceImages, { title: device.title, _LODdisable: true }); const deviceProto = Doc.GetProto(doc); deviceProto.hero = new ImageField(constructed[0].url); - deviceProto.fontFamily = "Arial"; Docs.Get.FromJson({ data: device, appendToExisting: { targetDoc: deviceProto } }); Doc.AddDocToList(parentProto, "data", doc); } else if (errors) { diff --git a/src/client/views/collections/CollectionTreeView.tsx b/src/client/views/collections/CollectionTreeView.tsx index 362d43ee7..dcb5e116c 100644 --- a/src/client/views/collections/CollectionTreeView.tsx +++ b/src/client/views/collections/CollectionTreeView.tsx @@ -733,19 +733,21 @@ export class CollectionTreeView extends CollectionSubView { const style: { [key: string]: any } = {}; const divKeys = OmitKeys(this.props, ["children", "htmltag", "RootDoc", "Document", "key", "onInput", "onClick", "__proto__"]).omit; Object.keys(divKeys).map((prop: string) => { - let p = (this.props as any)[prop] as string; + const p = (this.props as any)[prop] as string; const replacer = (match: any, expr: string, offset: any, string: any) => { // bcz: this executes a script to convert a propery expression string: { script } into a value return ScriptField.MakeFunction(expr, { self: Doc.name, this: Doc.name })?.script.run({ self: this.props.RootDoc, this: this.props.Document }).result as string || ""; }; @@ -178,9 +178,9 @@ export class DocumentContentsView extends React.Component 1 ? splits[0] + splits[1].replace(/{([^{}]|(?R))*}/, replacer4) : ""; // might have been more elegant if javascript supported recursive patterns diff --git a/src/client/views/nodes/FormattedTextBox.tsx b/src/client/views/nodes/FormattedTextBox.tsx index 8d4b90c41..d98172823 100644 --- a/src/client/views/nodes/FormattedTextBox.tsx +++ b/src/client/views/nodes/FormattedTextBox.tsx @@ -860,15 +860,8 @@ export class FormattedTextBox extends ViewBoxAnnotatableComponent<(FieldViewProp }); const startupText = !rtfField && this._editorView && Field.toString(this.dataDoc[fieldKey] as Field); if (startupText) { - const paragraphSegments = startupText.split("\n\n"); const { state: { tr }, dispatch } = this._editorView; - if (paragraphSegments.length) { - for (const paragraph of paragraphSegments) { - dispatch(tr.insertText(paragraph)); - } - } else { - dispatch(tr.insertText(startupText)); - } + dispatch(tr.insertText(startupText)); } } diff --git a/src/scraping/buxton/final/BuxtonImporter.ts b/src/scraping/buxton/final/BuxtonImporter.ts index 64b988610..713207a07 100644 --- a/src/scraping/buxton/final/BuxtonImporter.ts +++ b/src/scraping/buxton/final/BuxtonImporter.ts @@ -16,7 +16,7 @@ interface DocumentContents { hyperlinks: string[]; captions: string[]; embeddedFileNames: string[]; - longDescriptionParagraphs: string[]; + longDescription: string; } export interface DeviceDocument { @@ -269,7 +269,7 @@ async function extractFileContents(pathToDocument: string): Promise Utilities.correctSentences(node.text()).transformed!); const start = paragraphs.indexOf(paragraphs.find(el => /Bill Buxton[’']s Notes/.test(el))!) + 1; const end = paragraphs.indexOf("Device Details"); - const longDescriptionParagraphs = paragraphs.slice(start, end); + const longDescription = paragraphs.slice(start, end).filter(paragraph => paragraph.length).join("\n\n"); const { length } = captionTargets; strictEqual(length > 3, true, "No captions written."); @@ -292,7 +292,7 @@ async function extractFileContents(pathToDocument: string): Promise { const imageEntries = allEntries.filter(name => imageEntry.test(name)); const imageUrls: ImageData[] = []; - for (const mediaPath of imageEntries) { - const getImageStream = () => new Promise((resolve, reject) => { - zip.stream(mediaPath, (error: any, stream: any) => error ? reject(error) : resolve(stream)); - }); + const valid: any[] = []; + + const getImageStream = (mediaPath: string) => new Promise((resolve, reject) => { + zip.stream(mediaPath, (error: any, stream: any) => error ? reject(error) : resolve(stream)); + }); + for (const mediaPath of imageEntries) { const { width, height, type } = await new Promise(async resolve => { const sizeStream = (createImageSizeStream() as PassThrough).on('size', (dimensions: Dimensions) => { readStream.destroy(); resolve(dimensions); }).on("error", () => readStream.destroy()); - const readStream = await getImageStream(); + const readStream = await getImageStream(mediaPath); readStream.pipe(sizeStream); }); - if (Math.abs(width - height) < 10) { - continue; + + if (Math.abs(width - height) > 10) { + valid.push({ width, height, type, mediaPath }); } + } + for (const { type, width, height, mediaPath } of valid) { const generatedFileName = `upload_${Utils.GenerateGuid()}.${type.toLowerCase()}`; - await DashUploadUtils.outputResizedImages(getImageStream, generatedFileName, imageDir); - + await DashUploadUtils.outputResizedImages(() => getImageStream(mediaPath), generatedFileName, imageDir); imageUrls.push({ url: `/files/images/buxton/${generatedFileName}`, nativeWidth: width, @@ -339,11 +343,12 @@ async function writeImages(zip: any): Promise { } function analyze(fileName: string, contents: DocumentContents): AnalysisResult { - const { body, imageData, captions, hyperlinks, embeddedFileNames, longDescriptionParagraphs } = contents; + const { body, imageData, captions, hyperlinks, embeddedFileNames, longDescription } = contents; const device: any = { hyperlinks, captions, embeddedFileNames, + longDescription, __images: imageData }; const errors: { [key: string]: string } = { fileName }; @@ -378,7 +383,6 @@ function analyze(fileName: string, contents: DocumentContents): AnalysisResult { return { errors }; } - device.longDescription = longDescriptionParagraphs.join("\n\n"); return { device }; } -- cgit v1.2.3-70-g09d2 From 1660defc561c904217ed5be34cd6e0fe64736fe1 Mon Sep 17 00:00:00 2001 From: Sam Wilkins Date: Thu, 30 Apr 2020 19:06:42 -0700 Subject: commented Buxton importer --- src/scraping/buxton/final/BuxtonImporter.ts | 212 +++++++++++++++++---- .../authentication/models/current_user_utils.ts | 1 - 2 files changed, 179 insertions(+), 34 deletions(-) (limited to 'src/scraping/buxton/final') diff --git a/src/scraping/buxton/final/BuxtonImporter.ts b/src/scraping/buxton/final/BuxtonImporter.ts index 713207a07..21363f848 100644 --- a/src/scraping/buxton/final/BuxtonImporter.ts +++ b/src/scraping/buxton/final/BuxtonImporter.ts @@ -10,6 +10,10 @@ import { parseXml } from "libxmljs"; import { strictEqual } from "assert"; import { Readable, PassThrough } from "stream"; +/** + * This is an arbitrary bundle of data that gets populated + * in extractFileContents + */ interface DocumentContents { body: string; imageData: ImageData[]; @@ -19,6 +23,10 @@ interface DocumentContents { longDescription: string; } +/** + * A rough schema for everything that Bill has + * included for each document + */ export interface DeviceDocument { title: string; shortDescription: string; @@ -33,36 +41,65 @@ export interface DeviceDocument { attribute: string; __images: ImageData[]; hyperlinks: string[]; - captions: string[]; - embeddedFileNames: string[]; + captions: string[]; // from the table column + embeddedFileNames: string[]; // from the table column } +/** + * A layer of abstraction around a single parsing + * attempt. The error is not a TypeScript error, but + * rather an invalidly formatted value for a given key. + */ export interface AnalysisResult { device?: DeviceDocument; - errors?: { [key: string]: string }; + invalid?: { [deviceProperty: string]: string }; } +/** + * A mini API that takes in a string and returns + * either the given T or an error indicating that the + * transformation was rejected. + */ type Transformer = (raw: string) => TransformResult; interface TransformResult { transformed?: T; error?: string; } +/** + * Simple bundle counting successful and failed imports + */ export interface ImportResults { deviceCount: number; errorCount: number; } +/** + * Definitions for callback functions. Such instances are + * just invoked by when a single document has been parsed + * or the entire import is over. As of this writing, these + * callbacks are supplied by WebSocket.ts and used to inform + * the client of these events. + */ type ResultCallback = (result: AnalysisResult) => void; type TerminatorCallback = (result: ImportResults) => void; -interface Processor { - exp: RegExp; - matchIndex?: number; - transformer?: Transformer; - required?: boolean; +/** + * Defines everything needed to define how a single key should be + * formatted within the plain body text. The association between + * keys and their format definitions is stored FormatMap + */ +interface ValueFormatDefinition { + exp: RegExp; // the expression that the key's value should match + matchIndex?: number; // defaults to 0, but can be overridden to account for grouping in @param exp + transformer?: Transformer; // if desirable, how to transform the Regex match + required?: boolean; // defaults to true, confirms that for a whole document to be counted successful, + // all of its required values should be present and properly formatted } +/** + * The basic data we extract from each image in the document + */ interface ImageData { url: string; nativeWidth: number; @@ -71,6 +108,10 @@ interface ImageData { namespace Utilities { + /** + * Numeric 'try parse', fits with the Transformer API + * @param raw the serialized number + */ export function numberValue(raw: string): TransformResult { const transformed = Number(raw); if (isNaN(transformed)) { @@ -79,18 +120,32 @@ namespace Utilities { return { transformed }; } + /** + * A simple tokenizer that splits along 'and' and commas, and removes duplicates + * Helpful mainly for attribute and primary key lists + * @param raw the string to tokenize + */ export function collectUniqueTokens(raw: string): TransformResult { const pieces = raw.replace(/,|\s+and\s+/g, " ").split(/\s+/).filter(piece => piece.length); const unique = new Set(pieces.map(token => token.toLowerCase().trim())); return { transformed: Array.from(unique).map(capitalize).sort() }; } + /** + * Tries to correct XML text parsing artifact where some sentences lose their separating space, + * and others gain excess whitespace + * @param raw + */ export function correctSentences(raw: string): TransformResult { raw = raw.replace(/\./g, ". ").replace(/\:/g, ": ").replace(/\,/g, ", ").replace(/\?/g, "? ").trimRight(); raw = raw.replace(/\s{2,}/g, " "); return { transformed: raw }; } + /** + * Simple capitalization + * @param word to capitalize + */ export function capitalize(word: string): string { const clean = word.trim(); if (!clean.length) { @@ -99,6 +154,12 @@ namespace Utilities { return word.charAt(0).toUpperCase() + word.slice(1); } + /** + * Streams the requeted file at the relative path to the + * root of the zip, then parses it with a library + * @param zip the zip instance data source + * @param relativePath the path to a .xml file within the zip to parse + */ export async function readAndParseXml(zip: any, relativePath: string) { console.log(`Text streaming ${relativePath}`); const contents = await new Promise((resolve, reject) => { @@ -111,13 +172,17 @@ namespace Utilities { stream.on('end', () => resolve(body)); }); }); - return parseXml(contents); } - } -const RegexMap = new Map>([ +/** + * Defines how device values should be formatted. As you can see, the formatting is + * not super consistent and has changed over time as edge cases have been found, but this + * at least imposes some constraints, and will notify you if a document doesn't match the specifications + * in this map. + */ +const FormatMap = new Map>([ ["title", { exp: /contact\s+(.*)Short Description:/ }], @@ -189,17 +254,25 @@ const RegexMap = new Map>([ }], ]); -const sourceDir = path.resolve(__dirname, "source"); -const outDir = path.resolve(__dirname, "json"); -const imageDir = path.resolve(__dirname, "../../../server/public/files/images/buxton"); -const successOut = "buxton.json"; -const failOut = "incomplete.json"; -const deviceKeys = Array.from(RegexMap.keys()); - +const sourceDir = path.resolve(__dirname, "source"); // where the Word documents are assumed to be stored +const outDir = path.resolve(__dirname, "json"); // where the JSON output of these device documents will be written +const imageDir = path.resolve(__dirname, "../../../server/public/files/images/buxton"); // where, in the server, these images will be written +const successOut = "buxton.json"; // the JSON list representing properly formatted documents +const failOut = "incomplete.json"; // the JSON list representing improperly formatted documents +const deviceKeys = Array.from(FormatMap.keys()); // a way to iterate through all keys of the DeviceDocument interface + +/** + * Starts by REMOVING ALL EXISTING BUXTON RESOURCES. This might need to be + * changed going forward + * @param emitter the callback when each document is completed + * @param terminator the callback when the entire import is completed + */ export default async function executeImport(emitter: ResultCallback, terminator: TerminatorCallback) { try { + // get all Word documents in the source directory const contents = readdirSync(sourceDir); const wordDocuments = contents.filter(file => /.*\.docx?$/.test(file)).map(file => `${sourceDir}/${file}`); + // removal takes place here [outDir, imageDir].forEach(dir => { rimraf.sync(dir); mkdirSync(dir); @@ -216,19 +289,28 @@ export default async function executeImport(emitter: ResultCallback, terminator: } } +/** + * Parse every Word document in the directory, notifying any callers as needed + * at each iteration via the emitter. + * @param wordDocuments the string list of Word document names to parse + * @param emitter the callback when each document is completed + * @param terminator the callback when the entire import is completed + */ async function parseFiles(wordDocuments: string[], emitter: ResultCallback, terminator: TerminatorCallback): Promise { + // execute parent-most parse function const results: AnalysisResult[] = []; for (const filePath of wordDocuments) { - const fileName = path.basename(filePath).replace("Bill_Notes_", ""); + const fileName = path.basename(filePath).replace("Bill_Notes_", ""); // not strictly needed, but cleaner console.log(cyan(`\nExtracting contents from ${fileName}...`)); const result = analyze(fileName, await extractFileContents(filePath)); emitter(result); results.push(result); } + // collect information about errors and successes const masterDevices: DeviceDocument[] = []; const masterErrors: { [key: string]: string }[] = []; - results.forEach(({ device, errors }) => { + results.forEach(({ device, invalid: errors }) => { if (device) { masterDevices.push(device); } else if (errors) { @@ -236,24 +318,45 @@ async function parseFiles(wordDocuments: string[], emitter: ResultCallback, term } }); + // something went wrong, since errors and successes should sum to total inputs const total = wordDocuments.length; if (masterDevices.length + masterErrors.length !== total) { throw new Error(`Encountered a ${masterDevices.length} to ${masterErrors.length} mismatch in device / error split!`); } + // write the external JSON representations of this import console.log(); await writeOutputFile(successOut, masterDevices, total, true); await writeOutputFile(failOut, masterErrors, total, false); console.log(); + // notify the caller that the import has finished terminator({ deviceCount: masterDevices.length, errorCount: masterErrors.length }); return masterDevices; } +/** + * XPath definitions for desired XML targets in respective hierarchies. + * + * For table cells, can be read as: "find me anything that looks like in XML, whose + * parent looks like , whose parent looks like " + * + * + * + * + * + * These are found by trial and error, and using an online XML parser / prettifier + * to inspect the structure, since the Node XML library does not expose the parsed + * structure very well for searching, say in the debug console. + */ const tableCellXPath = '//*[name()="w:tbl"]/*[name()="w:tr"]/*[name()="w:tc"]'; const hyperlinkXPath = '//*[name()="Relationship" and contains(@Type, "hyperlink")]'; +/** + * The meat of the script, images and text content are extracted here + * @param pathToDocument the path to the document relative to the root of the zip + */ async function extractFileContents(pathToDocument: string): Promise { console.log('Extracting text...'); const zip = new StreamZip({ file: pathToDocument, storeEntries: true }); @@ -261,22 +364,30 @@ async function extractFileContents(pathToDocument: string): Promise node.text().trim()); + // preserve paragraph formatting and line breaks that would otherwise get lost in the plain text parsing + // of the XML hierarchy const paragraphs = document.find('//*[name()="w:p"]').map(node => Utilities.correctSentences(node.text()).transformed!); const start = paragraphs.indexOf(paragraphs.find(el => /Bill Buxton[’']s Notes/.test(el))!) + 1; const end = paragraphs.indexOf("Device Details"); const longDescription = paragraphs.slice(start, end).filter(paragraph => paragraph.length).join("\n\n"); - const { length } = captionTargets; + // extract captions from the table cells + const tableRowsFlattened = document.find(tableCellXPath).map(node => node.text().trim()); + const { length } = tableRowsFlattened; strictEqual(length > 3, true, "No captions written."); strictEqual(length % 3 === 0, true, "Improper caption formatting."); - for (let i = 3; i < captionTargets.length; i += 3) { - const row = captionTargets.slice(i, i + 3); + // break the flat list of strings into groups of three, since there + // currently are three columns in the table. Thus, each group represents + // a row in the table, where the first row has no text content since it's + // the image, the second has the file name and the third has the caption + for (let i = 3; i < tableRowsFlattened.length; i += 3) { + const row = tableRowsFlattened.slice(i, i + 3); embeddedFileNames.push(row[1]); captions.push(row[2]); } @@ -286,23 +397,34 @@ async function extractFileContents(pathToDocument: string): Promise el.attrs()[2].value()); console.log("Text extracted."); + // write out the images for this document console.log("Beginning image extraction..."); const imageData = await writeImages(zip); console.log(`Extracted ${imageData.length} images.`); + // cleanup zip.close(); return { body, longDescription, imageData, captions, embeddedFileNames, hyperlinks }; } +// zip relative path from root expression / filter used to isolate only media assets const imageEntry = /^word\/media\/\w+\.(jpeg|jpg|png|gif)/; -interface Dimensions { +/** + * Image dimensions and file suffix, + */ +interface ImageAttrs { width: number; height: number; type: string; } +/** + * For each image, stream the file, get its size, check if it's an icon + * (if it is, ignore it) + * @param zip the zip instance data source + */ async function writeImages(zip: any): Promise { const allEntries = Object.values(zip.entries()).map(({ name }) => name); const imageEntries = allEntries.filter(name => imageEntry.test(name)); @@ -315,8 +437,8 @@ async function writeImages(zip: any): Promise { }); for (const mediaPath of imageEntries) { - const { width, height, type } = await new Promise(async resolve => { - const sizeStream = (createImageSizeStream() as PassThrough).on('size', (dimensions: Dimensions) => { + const { width, height, type } = await new Promise(async resolve => { + const sizeStream = (createImageSizeStream() as PassThrough).on('size', (dimensions: ImageAttrs) => { readStream.destroy(); resolve(dimensions); }).on("error", () => readStream.destroy()); @@ -324,11 +446,14 @@ async function writeImages(zip: any): Promise { readStream.pipe(sizeStream); }); + // if it's not an icon, by this rough heuristic, i.e. is it not square if (Math.abs(width - height) > 10) { valid.push({ width, height, type, mediaPath }); } } + // for each valid image, output the _o, _l, _m, and _s files + // THIS IS WHERE THE SCRIPT SPENDS MOST OF ITS TIME for (const { type, width, height, mediaPath } of valid) { const generatedFileName = `upload_${Utils.GenerateGuid()}.${type.toLowerCase()}`; await DashUploadUtils.outputResizedImages(() => getImageStream(mediaPath), generatedFileName, imageDir); @@ -342,6 +467,14 @@ async function writeImages(zip: any): Promise { return imageUrls; } +/** + * Takes the results of extractFileContents, which relative to this is sort of the + * external media / preliminary text processing, and now tests the given file name to + * with those value definitions to make sure the body of the document contains all + * required fields, properly formatted + * @param fileName the file whose body to inspect + * @param contents the data already computed / parsed by extractFileContents + */ function analyze(fileName: string, contents: DocumentContents): AnalysisResult { const { body, imageData, captions, hyperlinks, embeddedFileNames, longDescription } = contents; const device: any = { @@ -354,43 +487,56 @@ function analyze(fileName: string, contents: DocumentContents): AnalysisResult { const errors: { [key: string]: string } = { fileName }; for (const key of deviceKeys) { - const { exp, transformer, matchIndex, required } = RegexMap.get(key)!; + const { exp, transformer, matchIndex, required } = FormatMap.get(key)!; const matches = exp.exec(body); let captured: string; - if (matches && (captured = matches[matchIndex ?? 1])) { - captured = captured.replace(/\s{2,}/g, " "); + // if we matched and we got the specific match we're after + if (matches && (captured = matches[matchIndex ?? 1])) { // matchIndex defaults to 1 + captured = captured.replace(/\s{2,}/g, " "); // remove excess whitespace + // if supplied, apply the required transformation (recall this is specified in FormatMap) if (transformer) { const { error, transformed } = transformer(captured); if (error) { + // we hit a snag trying to transform the valid match + // still counts as a fundamental error errors[key] = `__ERR__${key.toUpperCase()}__TRANSFORM__: ${error}`; continue; } captured = transformed; } - device[key] = captured; } else if (required ?? true) { + // the field was either implicitly or explicitly required, and failed to match the definition in + // FormatMap errors[key] = `ERR__${key.toUpperCase()}__: outer match ${matches === null ? "wasn't" : "was"} captured.`; continue; } } + // print errors - this can be removed const errorKeys = Object.keys(errors); if (errorKeys.length > 1) { console.log(red(`@ ${cyan(fileName.toUpperCase())}...`)); errorKeys.forEach(key => key !== "filename" && console.log(red(errors[key]))); - return { errors }; + return { invalid: errors }; } return { device }; } +/** + * A utility function that writes the JSON results for this import out to the desired path + * @param relativePath where to write the JSON file + * @param data valid device document objects, or errors + * @param total used for more informative printing + * @param success whether or not the caller is writing the successful parses or the failures + */ async function writeOutputFile(relativePath: string, data: any[], total: number, success: boolean) { console.log(yellow(`Encountered ${data.length} ${success ? "valid" : "invalid"} documents out of ${total} candidates. Writing ${relativePath}...`)); return new Promise((resolve, reject) => { const destination = path.resolve(outDir, relativePath); - const contents = JSON.stringify(data, undefined, 4); + const contents = JSON.stringify(data, undefined, 4); // format the JSON writeFile(destination, contents, err => err ? reject(err) : resolve()); }); } \ No newline at end of file diff --git a/src/server/authentication/models/current_user_utils.ts b/src/server/authentication/models/current_user_utils.ts index 663343f47..d7cc1e6bf 100644 --- a/src/server/authentication/models/current_user_utils.ts +++ b/src/server/authentication/models/current_user_utils.ts @@ -9,7 +9,6 @@ import { List } from "../../../new_fields/List"; import { listSpec } from "../../../new_fields/Schema"; import { ScriptField, ComputedField } from "../../../new_fields/ScriptField"; import { Cast, PromiseValue, StrCast, NumCast } from "../../../new_fields/Types"; -import { Utils } from "../../../Utils"; import { nullAudio, ImageField } from "../../../new_fields/URLField"; import { DragManager } from "../../../client/util/DragManager"; import { InkingControl } from "../../../client/views/InkingControl"; -- cgit v1.2.3-70-g09d2 From b8a62e6404a695e57ab1305fd13be23e8d935360 Mon Sep 17 00:00:00 2001 From: Sam Wilkins Date: Sun, 3 May 2020 15:05:04 -0700 Subject: cleanup --- .../apis/google_docs/GooglePhotosClientUtils.ts | 28 +++++++++------------- src/scraping/buxton/final/BuxtonImporter.ts | 28 ++++++++++++---------- src/server/DashUploadUtils.ts | 7 +----- 3 files changed, 28 insertions(+), 35 deletions(-) (limited to 'src/scraping/buxton/final') diff --git a/src/client/apis/google_docs/GooglePhotosClientUtils.ts b/src/client/apis/google_docs/GooglePhotosClientUtils.ts index e3f801c46..ff471853a 100644 --- a/src/client/apis/google_docs/GooglePhotosClientUtils.ts +++ b/src/client/apis/google_docs/GooglePhotosClientUtils.ts @@ -76,7 +76,6 @@ export namespace GooglePhotos { } export const CollectionToAlbum = async (options: AlbumCreationOptions): Promise> => { - await GoogleAuthenticationManager.Instance.fetchOrGenerateAccessToken(); const { collection, title, descriptionKey, tag } = options; const dataDocument = Doc.GetProto(collection); const images = ((await DocListCastAsync(dataDocument.data)) || []).filter(doc => Cast(doc.data, ImageField)); @@ -157,24 +156,20 @@ export namespace GooglePhotos { images && images.forEach(image => tagMapping.set(image[Id], ContentCategories.NONE)); const values = Object.values(ContentCategories); for (const value of values) { - if (value !== ContentCategories.NONE) { - const results = await ContentSearch({ included: [value] }); - if (results.mediaItems) { - const ids = results.mediaItems.map(item => item.id); - for (const id of ids) { - const image = await Cast(idMapping[id], Doc); - if (image) { - const key = image[Id]; - const tags = tagMapping.get(key)!; - if (!tags.includes(value)) { - tagMapping.set(key, tags + delimiter + value); - } - } - } + if (value === ContentCategories.NONE) { + continue; + } + for (const id of (await ContentSearch({ included: [value] }))?.mediaItems?.map(({ id }) => id)) { + const image = await Cast(idMapping[id], Doc); + if (!image) { + continue; } + const key = image[Id]; + const tags = tagMapping.get(key); + !tags?.includes(value) && tagMapping.set(key, tags + delimiter + value); } } - images && images.forEach(image => { + images?.forEach(image => { const concatenated = tagMapping.get(image[Id])!; const tags = concatenated.split(delimiter); if (tags.length > 1) { @@ -184,7 +179,6 @@ export namespace GooglePhotos { image.googlePhotosTags = ContentCategories.NONE; } }); - }; interface DateRange { diff --git a/src/scraping/buxton/final/BuxtonImporter.ts b/src/scraping/buxton/final/BuxtonImporter.ts index 21363f848..94302c7b3 100644 --- a/src/scraping/buxton/final/BuxtonImporter.ts +++ b/src/scraping/buxton/final/BuxtonImporter.ts @@ -350,8 +350,11 @@ async function parseFiles(wordDocuments: string[], emitter: ResultCallback, term * to inspect the structure, since the Node XML library does not expose the parsed * structure very well for searching, say in the debug console. */ -const tableCellXPath = '//*[name()="w:tbl"]/*[name()="w:tr"]/*[name()="w:tc"]'; -const hyperlinkXPath = '//*[name()="Relationship" and contains(@Type, "hyperlink")]'; +const xPaths = { + paragraphs: '//*[name()="w:p"]', + tableCells: '//*[name()="w:tbl"]/*[name()="w:tr"]/*[name()="w:tc"]', + hyperlinks: '//*[name()="Relationship" and contains(@Type, "hyperlink")]' +}; /** * The meat of the script, images and text content are extracted here @@ -371,30 +374,31 @@ async function extractFileContents(pathToDocument: string): Promise Utilities.correctSentences(node.text()).transformed!); + const paragraphs = document.find(xPaths.paragraphs).map(node => Utilities.correctSentences(node.text()).transformed!); const start = paragraphs.indexOf(paragraphs.find(el => /Bill Buxton[’']s Notes/.test(el))!) + 1; const end = paragraphs.indexOf("Device Details"); const longDescription = paragraphs.slice(start, end).filter(paragraph => paragraph.length).join("\n\n"); // extract captions from the table cells - const tableRowsFlattened = document.find(tableCellXPath).map(node => node.text().trim()); + const tableRowsFlattened = document.find(xPaths.tableCells).map(node => node.text().trim()); const { length } = tableRowsFlattened; - strictEqual(length > 3, true, "No captions written."); - strictEqual(length % 3 === 0, true, "Improper caption formatting."); + const numCols = 3; + strictEqual(length > numCols, true, "No captions written."); // first row has the headers, not content + strictEqual(length % numCols === 0, true, "Improper caption formatting."); - // break the flat list of strings into groups of three, since there - // currently are three columns in the table. Thus, each group represents + // break the flat list of strings into groups of numColumns. Thus, each group represents // a row in the table, where the first row has no text content since it's - // the image, the second has the file name and the third has the caption - for (let i = 3; i < tableRowsFlattened.length; i += 3) { - const row = tableRowsFlattened.slice(i, i + 3); + // the image, the second has the file name and the third has the caption (maybe additional columns + // have been added or reordered since this was written, but follow the same appraoch) + for (let i = numCols; i < tableRowsFlattened.length; i += numCols) { + const row = tableRowsFlattened.slice(i, i + numCols); embeddedFileNames.push(row[1]); captions.push(row[2]); } // extract all hyperlinks embedded in the document const rels = await Utilities.readAndParseXml(zip, "word/_rels/document.xml.rels"); - const hyperlinks = rels.find(hyperlinkXPath).map(el => el.attrs()[2].value()); + const hyperlinks = rels.find(xPaths.hyperlinks).map(el => el.attrs()[2].value()); console.log("Text extracted."); // write out the images for this document diff --git a/src/server/DashUploadUtils.ts b/src/server/DashUploadUtils.ts index 3f903a861..8567631cd 100644 --- a/src/server/DashUploadUtils.ts +++ b/src/server/DashUploadUtils.ts @@ -325,12 +325,7 @@ export namespace DashUploadUtils { const outputPath = path.resolve(outputDirectory, writtenFiles[suffix] = InjectSize(outputFileName, suffix)); await new Promise(async (resolve, reject) => { const source = streamProvider(); - let readStream: Stream; - if (source instanceof Promise) { - readStream = await source; - } else { - readStream = source; - } + let readStream: Stream = source instanceof Promise ? await source : source; if (resizer) { readStream = readStream.pipe(resizer.withMetadata()); } -- cgit v1.2.3-70-g09d2 From 7b5b04560ba24b049d77d36562fed1f7dc190d43 Mon Sep 17 00:00:00 2001 From: Sam Wilkins Date: Wed, 13 May 2020 01:22:16 -0700 Subject: improved buxton heuristic, but still seems intractable --- src/scraping/buxton/final/BuxtonImporter.ts | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'src/scraping/buxton/final') diff --git a/src/scraping/buxton/final/BuxtonImporter.ts b/src/scraping/buxton/final/BuxtonImporter.ts index 94302c7b3..e55850b29 100644 --- a/src/scraping/buxton/final/BuxtonImporter.ts +++ b/src/scraping/buxton/final/BuxtonImporter.ts @@ -451,11 +451,23 @@ async function writeImages(zip: any): Promise { }); // if it's not an icon, by this rough heuristic, i.e. is it not square - if (Math.abs(width - height) > 10) { - valid.push({ width, height, type, mediaPath }); + const number = Number(/image(\d+)/.exec(mediaPath)![1]); + if (number > 5 || width - height > 10) { + valid.push({ width, height, type, mediaPath, number }); } } + valid.sort((a, b) => a.number - b.number); + + const [{ width: first_w, height: first_h }, { width: second_w, height: second_h }] = valid; + if (Math.abs(first_w / second_w - first_h / second_h) < 0.01) { + const first_size = first_w * first_h; + const second_size = second_w * second_h; + const target = first_size >= second_size ? 1 : 0; + valid.splice(target, 1); + console.log(`Heuristically removed image with size ${target ? second_size : first_size}`); + } + // for each valid image, output the _o, _l, _m, and _s files // THIS IS WHERE THE SCRIPT SPENDS MOST OF ITS TIME for (const { type, width, height, mediaPath } of valid) { -- cgit v1.2.3-70-g09d2