From c75ffd4900acea74c55b6bf275a5e8082c15d573 Mon Sep 17 00:00:00 2001
From: Sam Wilkins <samwilkins333@gmail.com>
Date: Sun, 26 Apr 2020 19:29:50 -0700
Subject: formatted textbox disposers refactor, paragraph chunked rich text
 initialization and buxton importer updates

---
 src/scraping/buxton/final/BuxtonImporter.ts | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'src/scraping/buxton/final')
diff --git a/src/scraping/buxton/final/BuxtonImporter.ts b/src/scraping/buxton/final/BuxtonImporter.ts
index 122415460..64b988610 100644
--- a/src/scraping/buxton/final/BuxtonImporter.ts
+++ b/src/scraping/buxton/final/BuxtonImporter.ts
@@ -16,6 +16,7 @@ interface DocumentContents {
     hyperlinks: string[];
     captions: string[];
     embeddedFileNames: string[];
+    longDescriptionParagraphs: string[];
 }
 
 export interface DeviceDocument {
@@ -186,10 +187,6 @@ const RegexMap = new Map<keyof DeviceDocument, Processor<any>>([
         exp: /Short Description:\s+(.*)Bill Buxton[’']s Notes/,
         transformer: Utilities.correctSentences
     }],
-    ["longDescription", {
-        exp: /Bill Buxton[’']s Notes(.*)Device Details/,
-        transformer: Utilities.correctSentences
-    }],
 ]);
 
 const sourceDir = path.resolve(__dirname, "source");
@@ -267,7 +264,12 @@ async function extractFileContents(pathToDocument: string): Promise<DocumentCont
     const body = document.root()?.text() ?? "No body found. Check the import script's XML parser.";
     const captions: string[] = [];
     const embeddedFileNames: string[] = [];
-    const captionTargets = document.find(tableCellXPath).map(node => node.text());
+    const captionTargets = document.find(tableCellXPath).map(node => node.text().trim());
+
+    const paragraphs = document.find('//*[name()="w:p"]').map(node => Utilities.correctSentences(node.text()).transformed!);
+    const start = paragraphs.indexOf(paragraphs.find(el => /Bill Buxton[’']s Notes/.test(el))!) + 1;
+    const end = paragraphs.indexOf("Device Details");
+    const longDescriptionParagraphs = paragraphs.slice(start, end);
 
     const { length } = captionTargets;
     strictEqual(length > 3, true, "No captions written.");
@@ -290,7 +292,7 @@ async function extractFileContents(pathToDocument: string): Promise<DocumentCont
 
     zip.close();
 
-    return { body, imageData, captions, embeddedFileNames, hyperlinks };
+    return { body, longDescriptionParagraphs, imageData, captions, embeddedFileNames, hyperlinks };
 }
 
 const imageEntry = /^word\/media\/\w+\.(jpeg|jpg|png|gif)/;
@@ -337,7 +339,7 @@ async function writeImages(zip: any): Promise<ImageData[]> {
 }
 
 function analyze(fileName: string, contents: DocumentContents): AnalysisResult {
-    const { body, imageData, captions, hyperlinks, embeddedFileNames } = contents;
+    const { body, imageData, captions, hyperlinks, embeddedFileNames, longDescriptionParagraphs } = contents;
     const device: any = {
         hyperlinks,
         captions,
@@ -376,6 +378,7 @@ function analyze(fileName: string, contents: DocumentContents): AnalysisResult {
         return { errors };
     }
 
+    device.longDescription = longDescriptionParagraphs.join("\n\n");
     return { device };
 }
 
-- 
cgit v1.2.3-70-g09d2


From fc470b25759e8f051dc527066f9bebcaf5e7707d Mon Sep 17 00:00:00 2001
From: Sam Wilkins <samwilkins333@gmail.com>
Date: Sun, 26 Apr 2020 23:55:23 -0700
Subject: various buxton fixes

---
 src/client/documents/Documents.ts                  |  1 -
 .../views/collections/CollectionTreeView.tsx       | 24 ++++++++--------
 src/client/views/nodes/DocumentContentsView.tsx    |  8 +++---
 src/client/views/nodes/FormattedTextBox.tsx        |  9 +-----
 src/scraping/buxton/final/BuxtonImporter.ts        | 32 ++++++++++++----------
 5 files changed, 36 insertions(+), 38 deletions(-)

(limited to 'src/scraping/buxton/final')

diff --git a/src/client/documents/Documents.ts b/src/client/documents/Documents.ts
index 1651a6d55..5e0890e76 100644
--- a/src/client/documents/Documents.ts
+++ b/src/client/documents/Documents.ts
@@ -408,7 +408,6 @@ export namespace Docs {
                     const doc = StackingDocument(deviceImages, { title: device.title, _LODdisable: true });
                     const deviceProto = Doc.GetProto(doc);
                     deviceProto.hero = new ImageField(constructed[0].url);
-                    deviceProto.fontFamily = "Arial";
                     Docs.Get.FromJson({ data: device, appendToExisting: { targetDoc: deviceProto } });
                     Doc.AddDocToList(parentProto, "data", doc);
                 } else if (errors) {
diff --git a/src/client/views/collections/CollectionTreeView.tsx b/src/client/views/collections/CollectionTreeView.tsx
index 362d43ee7..dcb5e116c 100644
--- a/src/client/views/collections/CollectionTreeView.tsx
+++ b/src/client/views/collections/CollectionTreeView.tsx
@@ -733,19 +733,21 @@ export class CollectionTreeView extends CollectionSubView<Document, Partial<coll
                 const { Document } = this.props;
                 const fallbackImg = "http://www.cs.brown.edu/~bcz/face.gif";
 
+                const carousel = CarouselDocument([], { title: "data", _height: 350, _itemIndex: 0, backgroundColor: "#9b9b9b3F" });
                 const textDoc = TextDocument("", { title: "details", _autoHeight: true });
-                const detailView = Docs.Create.StackingDocument([
-                    CarouselDocument([], { title: "data", _height: 350, _itemIndex: 0, backgroundColor: "#9b9b9b3F" }),
-                    TreeDocument([
-                        // textDoc,
-                        TextDocument("", { title: "shortDescription", _autoHeight: true }),
-                        // TreeDocument([], { title: "narratives", _height: 75, treeViewHideTitle: true }),
-                        TextDocument("", { title: "longDescription", _height: 350 })
-                    ], { title: "stuff", _height: 100 })
-                ], { _chromeStatus: "disabled", _width: 300, _height: 300, _autoHeight: true, title: "detailView" });
-                // const detailView = Cast(Cast(Doc.UserDoc()["template-button-detail"], Doc, null)?.dragFactor, Doc, null);
+                const short = TextDocument("", { title: "shortDescription", _autoHeight: true });
+                const long = TextDocument("", { title: "longDescription", _height: 350 });
+                long.treeViewExpandedView = "layout";
+                const long_wrapper = TreeDocument([long], { title: "Descriptions", _height: 350 });
+
+                // const narratives = TreeDocument([], { title: "narratives", _height: 75, treeViewHideTitle: true }),
+                // const detailView = Cast(Cast(Doc.UserDoc()["template-button-detail"], Doc, null)?.dragFactory, Doc, null);
+
+                textDoc.fontFamily = short.fontFamily = long.fontFamily = carousel.fontFamily = "Arial";
+
+                const detailViewOpts = { _chromeStatus: "disabled", _width: 300, _height: 300, _autoHeight: true, title: "detailView" };
+                const detailView = Docs.Create.StackingDocument([carousel, textDoc, short, long_wrapper], detailViewOpts);
                 detailView.isTemplateDoc = makeTemplate(detailView);
-                detailView.fontFamily = "Arial";
 
                 const buxtonFieldKeys = ["year", "originalPrice", "degreesOfFreedom", "company", "attribute", "primaryKey", "secondaryKey", "dimensions"];
                 const detailedTemplate = {
diff --git a/src/client/views/nodes/DocumentContentsView.tsx b/src/client/views/nodes/DocumentContentsView.tsx
index 8582f92ed..cd78ac7b3 100644
--- a/src/client/views/nodes/DocumentContentsView.tsx
+++ b/src/client/views/nodes/DocumentContentsView.tsx
@@ -78,7 +78,7 @@ export class HTMLtag extends React.Component<HTMLtagProps> {
         const style: { [key: string]: any } = {};
         const divKeys = OmitKeys(this.props, ["children", "htmltag", "RootDoc", "Document", "key", "onInput", "onClick", "__proto__"]).omit;
         Object.keys(divKeys).map((prop: string) => {
-            let p = (this.props as any)[prop] as string;
+            const p = (this.props as any)[prop] as string;
             const replacer = (match: any, expr: string, offset: any, string: any) => { // bcz: this executes a script to convert a propery expression string:  { script }  into a value
                 return ScriptField.MakeFunction(expr, { self: Doc.name, this: Doc.name })?.script.run({ self: this.props.RootDoc, this: this.props.Document }).result as string || "";
             };
@@ -178,9 +178,9 @@ export class DocumentContentsView extends React.Component<DocumentViewProps & {
             }
             return undefined;
             // add input function to props
-        }
-        let onClick = makeFuncProp("onClick");
-        let onInput = makeFuncProp("onInput");
+        };
+        const onClick = makeFuncProp("onClick");
+        const onInput = makeFuncProp("onInput");
 
         const bindings = this.CreateBindings(onClick, onInput);
         //  layoutFrame = splits.length > 1 ? splits[0] + splits[1].replace(/{([^{}]|(?R))*}/, replacer4) : ""; // might have been more elegant if javascript supported recursive patterns
diff --git a/src/client/views/nodes/FormattedTextBox.tsx b/src/client/views/nodes/FormattedTextBox.tsx
index 8d4b90c41..d98172823 100644
--- a/src/client/views/nodes/FormattedTextBox.tsx
+++ b/src/client/views/nodes/FormattedTextBox.tsx
@@ -860,15 +860,8 @@ export class FormattedTextBox extends ViewBoxAnnotatableComponent<(FieldViewProp
             });
             const startupText = !rtfField && this._editorView && Field.toString(this.dataDoc[fieldKey] as Field);
             if (startupText) {
-                const paragraphSegments = startupText.split("\n\n");
                 const { state: { tr }, dispatch } = this._editorView;
-                if (paragraphSegments.length) {
-                    for (const paragraph of paragraphSegments) {
-                        dispatch(tr.insertText(paragraph));
-                    }
-                } else {
-                    dispatch(tr.insertText(startupText));
-                }
+                dispatch(tr.insertText(startupText));
             }
         }
 
diff --git a/src/scraping/buxton/final/BuxtonImporter.ts b/src/scraping/buxton/final/BuxtonImporter.ts
index 64b988610..713207a07 100644
--- a/src/scraping/buxton/final/BuxtonImporter.ts
+++ b/src/scraping/buxton/final/BuxtonImporter.ts
@@ -16,7 +16,7 @@ interface DocumentContents {
     hyperlinks: string[];
     captions: string[];
     embeddedFileNames: string[];
-    longDescriptionParagraphs: string[];
+    longDescription: string;
 }
 
 export interface DeviceDocument {
@@ -269,7 +269,7 @@ async function extractFileContents(pathToDocument: string): Promise<DocumentCont
     const paragraphs = document.find('//*[name()="w:p"]').map(node => Utilities.correctSentences(node.text()).transformed!);
     const start = paragraphs.indexOf(paragraphs.find(el => /Bill Buxton[’']s Notes/.test(el))!) + 1;
     const end = paragraphs.indexOf("Device Details");
-    const longDescriptionParagraphs = paragraphs.slice(start, end);
+    const longDescription = paragraphs.slice(start, end).filter(paragraph => paragraph.length).join("\n\n");
 
     const { length } = captionTargets;
     strictEqual(length > 3, true, "No captions written.");
@@ -292,7 +292,7 @@ async function extractFileContents(pathToDocument: string): Promise<DocumentCont
 
     zip.close();
 
-    return { body, longDescriptionParagraphs, imageData, captions, embeddedFileNames, hyperlinks };
+    return { body, longDescription, imageData, captions, embeddedFileNames, hyperlinks };
 }
 
 const imageEntry = /^word\/media\/\w+\.(jpeg|jpg|png|gif)/;
@@ -308,26 +308,30 @@ async function writeImages(zip: any): Promise<ImageData[]> {
     const imageEntries = allEntries.filter(name => imageEntry.test(name));
 
     const imageUrls: ImageData[] = [];
-    for (const mediaPath of imageEntries) {
-        const getImageStream = () => new Promise<Readable>((resolve, reject) => {
-            zip.stream(mediaPath, (error: any, stream: any) => error ? reject(error) : resolve(stream));
-        });
+    const valid: any[] = [];
+
+    const getImageStream = (mediaPath: string) => new Promise<Readable>((resolve, reject) => {
+        zip.stream(mediaPath, (error: any, stream: any) => error ? reject(error) : resolve(stream));
+    });
 
+    for (const mediaPath of imageEntries) {
         const { width, height, type } = await new Promise<Dimensions>(async resolve => {
             const sizeStream = (createImageSizeStream() as PassThrough).on('size', (dimensions: Dimensions) => {
                 readStream.destroy();
                 resolve(dimensions);
             }).on("error", () => readStream.destroy());
-            const readStream = await getImageStream();
+            const readStream = await getImageStream(mediaPath);
             readStream.pipe(sizeStream);
         });
-        if (Math.abs(width - height) < 10) {
-            continue;
+
+        if (Math.abs(width - height) > 10) {
+            valid.push({ width, height, type, mediaPath });
         }
+    }
 
+    for (const { type, width, height, mediaPath } of valid) {
         const generatedFileName = `upload_${Utils.GenerateGuid()}.${type.toLowerCase()}`;
-        await DashUploadUtils.outputResizedImages(getImageStream, generatedFileName, imageDir);
-
+        await DashUploadUtils.outputResizedImages(() => getImageStream(mediaPath), generatedFileName, imageDir);
         imageUrls.push({
             url: `/files/images/buxton/${generatedFileName}`,
             nativeWidth: width,
@@ -339,11 +343,12 @@ async function writeImages(zip: any): Promise<ImageData[]> {
 }
 
 function analyze(fileName: string, contents: DocumentContents): AnalysisResult {
-    const { body, imageData, captions, hyperlinks, embeddedFileNames, longDescriptionParagraphs } = contents;
+    const { body, imageData, captions, hyperlinks, embeddedFileNames, longDescription } = contents;
     const device: any = {
         hyperlinks,
         captions,
         embeddedFileNames,
+        longDescription,
         __images: imageData
     };
     const errors: { [key: string]: string } = { fileName };
@@ -378,7 +383,6 @@ function analyze(fileName: string, contents: DocumentContents): AnalysisResult {
         return { errors };
     }
 
-    device.longDescription = longDescriptionParagraphs.join("\n\n");
     return { device };
 }
 
-- 
cgit v1.2.3-70-g09d2


From 1660defc561c904217ed5be34cd6e0fe64736fe1 Mon Sep 17 00:00:00 2001
From: Sam Wilkins <samwilkins333@gmail.com>
Date: Thu, 30 Apr 2020 19:06:42 -0700
Subject: commented Buxton importer

---
 src/scraping/buxton/final/BuxtonImporter.ts        | 212 +++++++++++++++++----
 .../authentication/models/current_user_utils.ts    |   1 -
 2 files changed, 179 insertions(+), 34 deletions(-)

(limited to 'src/scraping/buxton/final')

diff --git a/src/scraping/buxton/final/BuxtonImporter.ts b/src/scraping/buxton/final/BuxtonImporter.ts
index 713207a07..21363f848 100644
--- a/src/scraping/buxton/final/BuxtonImporter.ts
+++ b/src/scraping/buxton/final/BuxtonImporter.ts
@@ -10,6 +10,10 @@ import { parseXml } from "libxmljs";
 import { strictEqual } from "assert";
 import { Readable, PassThrough } from "stream";
 
+/**
+ * This is an arbitrary bundle of data that gets populated
+ * in extractFileContents
+ */
 interface DocumentContents {
     body: string;
     imageData: ImageData[];
@@ -19,6 +23,10 @@ interface DocumentContents {
     longDescription: string;
 }
 
+/**
+ * A rough schema for everything that Bill has
+ * included for each document
+ */
 export interface DeviceDocument {
     title: string;
     shortDescription: string;
@@ -33,36 +41,65 @@ export interface DeviceDocument {
     attribute: string;
     __images: ImageData[];
     hyperlinks: string[];
-    captions: string[];
-    embeddedFileNames: string[];
+    captions: string[]; // from the table column
+    embeddedFileNames: string[]; // from the table column
 }
 
+/**
+ * A layer of abstraction around a single parsing
+ * attempt. The error is not a TypeScript error, but
+ * rather an invalidly formatted value for a given key.
+ */
 export interface AnalysisResult {
     device?: DeviceDocument;
-    errors?: { [key: string]: string };
+    invalid?: { [deviceProperty: string]: string };
 }
 
+/**
+ * A mini API that takes in a string and returns
+ * either the given T or an error indicating that the
+ * transformation was rejected.
+ */
 type Transformer<T> = (raw: string) => TransformResult<T>;
 interface TransformResult<T> {
     transformed?: T;
     error?: string;
 }
 
+/**
+ * Simple bundle counting successful and failed imports
+ */
 export interface ImportResults {
     deviceCount: number;
     errorCount: number;
 }
 
+/**
+ * Definitions for callback functions. Such instances are
+ * invoked when a single document has been parsed
+ * or the entire import is over. As of this writing, these
+ * callbacks are supplied by WebSocket.ts and used to inform
+ * the client of these events.
+ */
 type ResultCallback = (result: AnalysisResult) => void;
 type TerminatorCallback = (result: ImportResults) => void;
 
-interface Processor<T> {
-    exp: RegExp;
-    matchIndex?: number;
-    transformer?: Transformer<T>;
-    required?: boolean;
+/**
+ * Defines everything needed to define how a single key should be
+ * formatted within the plain body text. The association between
+ * keys and their format definitions is stored in FormatMap
+ */
+interface ValueFormatDefinition<T> {
+    exp: RegExp; // the expression that the key's value should match
+    matchIndex?: number; // defaults to 1, but can be overridden to account for grouping in @param exp
+    transformer?: Transformer<T>; // if desirable, how to transform the Regex match
+    required?: boolean; // defaults to true, confirms that for a whole document to be counted successful,
+    // all of its required values should be present and properly formatted
 }
 
+/**
+ * The basic data we extract from each image in the document
+ */
 interface ImageData {
     url: string;
     nativeWidth: number;
@@ -71,6 +108,10 @@ interface ImageData {
 
 namespace Utilities {
 
+    /**
+     * Numeric 'try parse', fits with the Transformer API
+     * @param raw the serialized number
+     */
     export function numberValue(raw: string): TransformResult<number> {
         const transformed = Number(raw);
         if (isNaN(transformed)) {
@@ -79,18 +120,32 @@ namespace Utilities {
         return { transformed };
     }
 
+    /**
+     * A simple tokenizer that splits along 'and' and commas, and removes duplicates
+     * Helpful mainly for attribute and primary key lists
+     * @param raw the string to tokenize
+     */
     export function collectUniqueTokens(raw: string): TransformResult<string[]> {
         const pieces = raw.replace(/,|\s+and\s+/g, " ").split(/\s+/).filter(piece => piece.length);
         const unique = new Set(pieces.map(token => token.toLowerCase().trim()));
         return { transformed: Array.from(unique).map(capitalize).sort() };
     }
 
+    /**
+     * Tries to correct XML text parsing artifact where some sentences lose their separating space,
+     * and others gain excess whitespace
+     * @param raw the text to correct
+     */
     export function correctSentences(raw: string): TransformResult<string> {
         raw = raw.replace(/\./g, ". ").replace(/\:/g, ": ").replace(/\,/g, ", ").replace(/\?/g, "? ").trimRight();
         raw = raw.replace(/\s{2,}/g, " ");
         return { transformed: raw };
     }
 
+    /**
+     * Simple capitalization
+     * @param word to capitalize
+     */
     export function capitalize(word: string): string {
         const clean = word.trim();
         if (!clean.length) {
@@ -99,6 +154,12 @@ namespace Utilities {
         return word.charAt(0).toUpperCase() + word.slice(1);
     }
 
+    /**
+     * Streams the requested file at the relative path to the
+     * root of the zip, then parses it with a library
+     * @param zip the zip instance data source
+     * @param relativePath the path to a .xml file within the zip to parse
+     */
     export async function readAndParseXml(zip: any, relativePath: string) {
         console.log(`Text streaming ${relativePath}`);
         const contents = await new Promise<string>((resolve, reject) => {
@@ -111,13 +172,17 @@ namespace Utilities {
                 stream.on('end', () => resolve(body));
             });
         });
-
         return parseXml(contents);
     }
-
 }
 
-const RegexMap = new Map<keyof DeviceDocument, Processor<any>>([
+/**
+ * Defines how device values should be formatted. As you can see, the formatting is
+ * not super consistent and has changed over time as edge cases have been found, but this
+ * at least imposes some constraints, and will notify you if a document doesn't match the specifications
+ * in this map.
+ */
+const FormatMap = new Map<keyof DeviceDocument, ValueFormatDefinition<any>>([
     ["title", {
         exp: /contact\s+(.*)Short Description:/
     }],
@@ -189,17 +254,25 @@ const RegexMap = new Map<keyof DeviceDocument, Processor<any>>([
     }],
 ]);
 
-const sourceDir = path.resolve(__dirname, "source");
-const outDir = path.resolve(__dirname, "json");
-const imageDir = path.resolve(__dirname, "../../../server/public/files/images/buxton");
-const successOut = "buxton.json";
-const failOut = "incomplete.json";
-const deviceKeys = Array.from(RegexMap.keys());
-
+const sourceDir = path.resolve(__dirname, "source"); // where the Word documents are assumed to be stored
+const outDir = path.resolve(__dirname, "json"); // where the JSON output of these device documents will be written
+const imageDir = path.resolve(__dirname, "../../../server/public/files/images/buxton"); // where, in the server, these images will be written
+const successOut = "buxton.json"; // the JSON list representing properly formatted documents
+const failOut = "incomplete.json"; // the JSON list representing improperly formatted documents
+const deviceKeys = Array.from(FormatMap.keys()); // a way to iterate through all keys of the DeviceDocument interface
+
+/**
+ * Starts by REMOVING ALL EXISTING BUXTON RESOURCES. This might need to be
+ * changed going forward
+ * @param emitter the callback when each document is completed
+ * @param terminator the callback when the entire import is completed
+ */
 export default async function executeImport(emitter: ResultCallback, terminator: TerminatorCallback) {
     try {
+        // get all Word documents in the source directory
         const contents = readdirSync(sourceDir);
         const wordDocuments = contents.filter(file => /.*\.docx?$/.test(file)).map(file => `${sourceDir}/${file}`);
+        // removal takes place here
         [outDir, imageDir].forEach(dir => {
             rimraf.sync(dir);
             mkdirSync(dir);
@@ -216,19 +289,28 @@ export default async function executeImport(emitter: ResultCallback, terminator:
     }
 }
 
+/**
+ * Parse every Word document in the directory, notifying any callers as needed
+ * at each iteration via the emitter.
+ * @param wordDocuments the string list of Word document names to parse
+ * @param emitter the callback when each document is completed
+ * @param terminator the callback when the entire import is completed
+ */
 async function parseFiles(wordDocuments: string[], emitter: ResultCallback, terminator: TerminatorCallback): Promise<DeviceDocument[]> {
+    // execute parent-most parse function
     const results: AnalysisResult[] = [];
     for (const filePath of wordDocuments) {
-        const fileName = path.basename(filePath).replace("Bill_Notes_", "");
+        const fileName = path.basename(filePath).replace("Bill_Notes_", ""); // not strictly needed, but cleaner
         console.log(cyan(`\nExtracting contents from ${fileName}...`));
         const result = analyze(fileName, await extractFileContents(filePath));
         emitter(result);
         results.push(result);
     }
 
+    // collect information about errors and successes
     const masterDevices: DeviceDocument[] = [];
     const masterErrors: { [key: string]: string }[] = [];
-    results.forEach(({ device, errors }) => {
+    results.forEach(({ device, invalid: errors }) => {
         if (device) {
             masterDevices.push(device);
         } else if (errors) {
@@ -236,24 +318,45 @@ async function parseFiles(wordDocuments: string[], emitter: ResultCallback, term
         }
     });
 
+    // something went wrong, since errors and successes should sum to total inputs
     const total = wordDocuments.length;
     if (masterDevices.length + masterErrors.length !== total) {
         throw new Error(`Encountered a ${masterDevices.length} to ${masterErrors.length} mismatch in device / error split!`);
     }
 
+    // write the external JSON representations of this import
     console.log();
     await writeOutputFile(successOut, masterDevices, total, true);
     await writeOutputFile(failOut, masterErrors, total, false);
     console.log();
 
+    // notify the caller that the import has finished
     terminator({ deviceCount: masterDevices.length, errorCount: masterErrors.length });
 
     return masterDevices;
 }
 
+/**
+ * XPath definitions for desired XML targets in respective hierarchies.
+ * 
+ * For table cells, can be read as: "find me anything that looks like <w:tc> in XML, whose
+ * parent looks like <w:tr>, whose parent looks like <w:tbl>"
+ * 
+ * <w:tbl>
+ *      <w:tr>
+ *           <w:tc>
+ * 
+ * These are found by trial and error, and using an online XML parser / prettifier
+ * to inspect the structure, since the Node XML library does not expose the parsed
+ * structure very well for searching, say in the debug console.
+ */
 const tableCellXPath = '//*[name()="w:tbl"]/*[name()="w:tr"]/*[name()="w:tc"]';
 const hyperlinkXPath = '//*[name()="Relationship" and contains(@Type, "hyperlink")]';
 
+/**
+ * The meat of the script, images and text content are extracted here
+ * @param pathToDocument the file system path of the Word document, which is opened as a zip archive
+ */
 async function extractFileContents(pathToDocument: string): Promise<DocumentContents> {
     console.log('Extracting text...');
     const zip = new StreamZip({ file: pathToDocument, storeEntries: true });
@@ -261,22 +364,30 @@ async function extractFileContents(pathToDocument: string): Promise<DocumentCont
 
     // extract the body of the document and, specifically, its captions
     const document = await Utilities.readAndParseXml(zip, "word/document.xml");
+    // get plain text
     const body = document.root()?.text() ?? "No body found. Check the import script's XML parser.";
     const captions: string[] = [];
     const embeddedFileNames: string[] = [];
-    const captionTargets = document.find(tableCellXPath).map(node => node.text().trim());
 
+    // preserve paragraph formatting and line breaks that would otherwise get lost in the plain text parsing
+    // of the XML hierarchy
     const paragraphs = document.find('//*[name()="w:p"]').map(node => Utilities.correctSentences(node.text()).transformed!);
     const start = paragraphs.indexOf(paragraphs.find(el => /Bill Buxton[’']s Notes/.test(el))!) + 1;
     const end = paragraphs.indexOf("Device Details");
     const longDescription = paragraphs.slice(start, end).filter(paragraph => paragraph.length).join("\n\n");
 
-    const { length } = captionTargets;
+    // extract captions from the table cells
+    const tableRowsFlattened = document.find(tableCellXPath).map(node => node.text().trim());
+    const { length } = tableRowsFlattened;
     strictEqual(length > 3, true, "No captions written.");
     strictEqual(length % 3 === 0, true, "Improper caption formatting.");
 
-    for (let i = 3; i < captionTargets.length; i += 3) {
-        const row = captionTargets.slice(i, i + 3);
+    // break the flat list of strings into groups of three, since there
+    // currently are three columns in the table. Thus, each group represents
+    // a row in the table, where the first column has no text content since it's
+    // the image, the second has the file name and the third has the caption
+    for (let i = 3; i < tableRowsFlattened.length; i += 3) {
+        const row = tableRowsFlattened.slice(i, i + 3);
         embeddedFileNames.push(row[1]);
         captions.push(row[2]);
     }
@@ -286,23 +397,34 @@ async function extractFileContents(pathToDocument: string): Promise<DocumentCont
     const hyperlinks = rels.find(hyperlinkXPath).map(el => el.attrs()[2].value());
     console.log("Text extracted.");
 
+    // write out the images for this document
     console.log("Beginning image extraction...");
     const imageData = await writeImages(zip);
     console.log(`Extracted ${imageData.length} images.`);
 
+    // cleanup
     zip.close();
 
     return { body, longDescription, imageData, captions, embeddedFileNames, hyperlinks };
 }
 
+// zip relative path from root expression / filter used to isolate only media assets
 const imageEntry = /^word\/media\/\w+\.(jpeg|jpg|png|gif)/;
 
-interface Dimensions {
+/**
+ * Image dimensions and file suffix
+ */
+interface ImageAttrs {
     width: number;
     height: number;
     type: string;
 }
 
+/**
+ * For each image, stream the file, get its size, check if it's an icon
+ * (if it is, ignore it)
+ * @param zip the zip instance data source
+ */
 async function writeImages(zip: any): Promise<ImageData[]> {
     const allEntries = Object.values<any>(zip.entries()).map(({ name }) => name);
     const imageEntries = allEntries.filter(name => imageEntry.test(name));
@@ -315,8 +437,8 @@ async function writeImages(zip: any): Promise<ImageData[]> {
     });
 
     for (const mediaPath of imageEntries) {
-        const { width, height, type } = await new Promise<Dimensions>(async resolve => {
-            const sizeStream = (createImageSizeStream() as PassThrough).on('size', (dimensions: Dimensions) => {
+        const { width, height, type } = await new Promise<ImageAttrs>(async resolve => {
+            const sizeStream = (createImageSizeStream() as PassThrough).on('size', (dimensions: ImageAttrs) => {
                 readStream.destroy();
                 resolve(dimensions);
             }).on("error", () => readStream.destroy());
@@ -324,11 +446,14 @@ async function writeImages(zip: any): Promise<ImageData[]> {
             readStream.pipe(sizeStream);
         });
 
+        // if it's not an icon, by this rough heuristic, i.e. is it not square
         if (Math.abs(width - height) > 10) {
             valid.push({ width, height, type, mediaPath });
         }
     }
 
+    // for each valid image, output the _o, _l, _m, and _s files
+    // THIS IS WHERE THE SCRIPT SPENDS MOST OF ITS TIME
     for (const { type, width, height, mediaPath } of valid) {
         const generatedFileName = `upload_${Utils.GenerateGuid()}.${type.toLowerCase()}`;
         await DashUploadUtils.outputResizedImages(() => getImageStream(mediaPath), generatedFileName, imageDir);
@@ -342,6 +467,14 @@ async function writeImages(zip: any): Promise<ImageData[]> {
     return imageUrls;
 }
 
+/**
+ * Takes the results of extractFileContents, which relative to this is sort of the
+ * external media / preliminary text processing, and now tests the given file's body
+ * against the value definitions in FormatMap to make sure the document contains all
+ * required fields, properly formatted
+ * @param fileName the file whose body to inspect
+ * @param contents the data already computed / parsed by extractFileContents
+ */
 function analyze(fileName: string, contents: DocumentContents): AnalysisResult {
     const { body, imageData, captions, hyperlinks, embeddedFileNames, longDescription } = contents;
     const device: any = {
@@ -354,43 +487,56 @@ function analyze(fileName: string, contents: DocumentContents): AnalysisResult {
     const errors: { [key: string]: string } = { fileName };
 
     for (const key of deviceKeys) {
-        const { exp, transformer, matchIndex, required } = RegexMap.get(key)!;
+        const { exp, transformer, matchIndex, required } = FormatMap.get(key)!;
         const matches = exp.exec(body);
 
         let captured: string;
-        if (matches && (captured = matches[matchIndex ?? 1])) {
-            captured = captured.replace(/\s{2,}/g, " ");
+        // if we matched and we got the specific match we're after
+        if (matches && (captured = matches[matchIndex ?? 1])) { // matchIndex defaults to 1
+            captured = captured.replace(/\s{2,}/g, " "); // remove excess whitespace
+            // if supplied, apply the required transformation (recall this is specified in FormatMap)
             if (transformer) {
                 const { error, transformed } = transformer(captured);
                 if (error) {
+                    // we hit a snag trying to transform the valid match
+                    // still counts as a fundamental error
                     errors[key] = `__ERR__${key.toUpperCase()}__TRANSFORM__: ${error}`;
                     continue;
                 }
                 captured = transformed;
             }
-
             device[key] = captured;
         } else if (required ?? true) {
+            // the field was either implicitly or explicitly required, and failed to match the definition in
+            // FormatMap
             errors[key] = `ERR__${key.toUpperCase()}__: outer match ${matches === null ? "wasn't" : "was"} captured.`;
             continue;
         }
     }
 
+    // print errors - this can be removed
     const errorKeys = Object.keys(errors);
     if (errorKeys.length > 1) {
         console.log(red(`@ ${cyan(fileName.toUpperCase())}...`));
         errorKeys.forEach(key => key !== "filename" && console.log(red(errors[key])));
-        return { errors };
+        return { invalid: errors };
     }
 
     return { device };
 }
 
+/**
+ * A utility function that writes the JSON results for this import out to the desired path
+ * @param relativePath where to write the JSON file
+ * @param data valid device document objects, or errors
+ * @param total used for more informative printing
+ * @param success whether or not the caller is writing the successful parses or the failures
+ */
 async function writeOutputFile(relativePath: string, data: any[], total: number, success: boolean) {
     console.log(yellow(`Encountered ${data.length} ${success ? "valid" : "invalid"} documents out of ${total} candidates. Writing ${relativePath}...`));
     return new Promise<void>((resolve, reject) => {
         const destination = path.resolve(outDir, relativePath);
-        const contents = JSON.stringify(data, undefined, 4);
+        const contents = JSON.stringify(data, undefined, 4); // format the JSON
         writeFile(destination, contents, err => err ? reject(err) : resolve());
     });
 }
\ No newline at end of file
diff --git a/src/server/authentication/models/current_user_utils.ts b/src/server/authentication/models/current_user_utils.ts
index 663343f47..d7cc1e6bf 100644
--- a/src/server/authentication/models/current_user_utils.ts
+++ b/src/server/authentication/models/current_user_utils.ts
@@ -9,7 +9,6 @@ import { List } from "../../../new_fields/List";
 import { listSpec } from "../../../new_fields/Schema";
 import { ScriptField, ComputedField } from "../../../new_fields/ScriptField";
 import { Cast, PromiseValue, StrCast, NumCast } from "../../../new_fields/Types";
-import { Utils } from "../../../Utils";
 import { nullAudio, ImageField } from "../../../new_fields/URLField";
 import { DragManager } from "../../../client/util/DragManager";
 import { InkingControl } from "../../../client/views/InkingControl";
-- 
cgit v1.2.3-70-g09d2


From b8a62e6404a695e57ab1305fd13be23e8d935360 Mon Sep 17 00:00:00 2001
From: Sam Wilkins <samwilkins333@gmail.com>
Date: Sun, 3 May 2020 15:05:04 -0700
Subject: cleanup

---
 .../apis/google_docs/GooglePhotosClientUtils.ts    | 28 +++++++++-------------
 src/scraping/buxton/final/BuxtonImporter.ts        | 28 ++++++++++++----------
 src/server/DashUploadUtils.ts                      |  7 +-----
 3 files changed, 28 insertions(+), 35 deletions(-)

(limited to 'src/scraping/buxton/final')

diff --git a/src/client/apis/google_docs/GooglePhotosClientUtils.ts b/src/client/apis/google_docs/GooglePhotosClientUtils.ts
index e3f801c46..ff471853a 100644
--- a/src/client/apis/google_docs/GooglePhotosClientUtils.ts
+++ b/src/client/apis/google_docs/GooglePhotosClientUtils.ts
@@ -76,7 +76,6 @@ export namespace GooglePhotos {
         }
 
         export const CollectionToAlbum = async (options: AlbumCreationOptions): Promise<Opt<AlbumCreationResult>> => {
-            await GoogleAuthenticationManager.Instance.fetchOrGenerateAccessToken();
             const { collection, title, descriptionKey, tag } = options;
             const dataDocument = Doc.GetProto(collection);
             const images = ((await DocListCastAsync(dataDocument.data)) || []).filter(doc => Cast(doc.data, ImageField));
@@ -157,24 +156,20 @@ export namespace GooglePhotos {
             images && images.forEach(image => tagMapping.set(image[Id], ContentCategories.NONE));
             const values = Object.values(ContentCategories);
             for (const value of values) {
-                if (value !== ContentCategories.NONE) {
-                    const results = await ContentSearch({ included: [value] });
-                    if (results.mediaItems) {
-                        const ids = results.mediaItems.map(item => item.id);
-                        for (const id of ids) {
-                            const image = await Cast(idMapping[id], Doc);
-                            if (image) {
-                                const key = image[Id];
-                                const tags = tagMapping.get(key)!;
-                                if (!tags.includes(value)) {
-                                    tagMapping.set(key, tags + delimiter + value);
-                                }
-                            }
-                        }
+                if (value === ContentCategories.NONE) {
+                    continue;
+                }
+                for (const id of (await ContentSearch({ included: [value] }))?.mediaItems?.map(({ id }) => id)) {
+                    const image = await Cast(idMapping[id], Doc);
+                    if (!image) {
+                        continue;
                     }
+                    const key = image[Id];
+                    const tags = tagMapping.get(key);
+                    !tags?.includes(value) && tagMapping.set(key, tags + delimiter + value);
                 }
             }
-            images && images.forEach(image => {
+            images?.forEach(image => {
                 const concatenated = tagMapping.get(image[Id])!;
                 const tags = concatenated.split(delimiter);
                 if (tags.length > 1) {
@@ -184,7 +179,6 @@ export namespace GooglePhotos {
                     image.googlePhotosTags = ContentCategories.NONE;
                 }
             });
-
         };
 
         interface DateRange {
diff --git a/src/scraping/buxton/final/BuxtonImporter.ts b/src/scraping/buxton/final/BuxtonImporter.ts
index 21363f848..94302c7b3 100644
--- a/src/scraping/buxton/final/BuxtonImporter.ts
+++ b/src/scraping/buxton/final/BuxtonImporter.ts
@@ -350,8 +350,11 @@ async function parseFiles(wordDocuments: string[], emitter: ResultCallback, term
  * to inspect the structure, since the Node XML library does not expose the parsed
  * structure very well for searching, say in the debug console.
  */
-const tableCellXPath = '//*[name()="w:tbl"]/*[name()="w:tr"]/*[name()="w:tc"]';
-const hyperlinkXPath = '//*[name()="Relationship" and contains(@Type, "hyperlink")]';
+const xPaths = {
+    paragraphs: '//*[name()="w:p"]',
+    tableCells: '//*[name()="w:tbl"]/*[name()="w:tr"]/*[name()="w:tc"]',
+    hyperlinks: '//*[name()="Relationship" and contains(@Type, "hyperlink")]'
+};
 
 /**
  * The meat of the script, images and text content are extracted here
@@ -371,30 +374,31 @@ async function extractFileContents(pathToDocument: string): Promise<DocumentCont
 
     // preserve paragraph formatting and line breaks that would otherwise get lost in the plain text parsing
     // of the XML hierarchy
-    const paragraphs = document.find('//*[name()="w:p"]').map(node => Utilities.correctSentences(node.text()).transformed!);
+    const paragraphs = document.find(xPaths.paragraphs).map(node => Utilities.correctSentences(node.text()).transformed!);
     const start = paragraphs.indexOf(paragraphs.find(el => /Bill Buxton[’']s Notes/.test(el))!) + 1;
     const end = paragraphs.indexOf("Device Details");
     const longDescription = paragraphs.slice(start, end).filter(paragraph => paragraph.length).join("\n\n");
 
     // extract captions from the table cells
-    const tableRowsFlattened = document.find(tableCellXPath).map(node => node.text().trim());
+    const tableRowsFlattened = document.find(xPaths.tableCells).map(node => node.text().trim());
     const { length } = tableRowsFlattened;
-    strictEqual(length > 3, true, "No captions written.");
-    strictEqual(length % 3 === 0, true, "Improper caption formatting.");
+    const numCols = 3;
+    strictEqual(length > numCols, true, "No captions written."); // first row has the headers, not content
+    strictEqual(length % numCols === 0, true, "Improper caption formatting.");
 
-    // break the flat list of strings into groups of three, since there
-    // currently are three columns in the table. Thus, each group represents
+    // break the flat list of strings into groups of numCols. Thus, each group represents
     // a row in the table, where the first row has no text content since it's
-    // the image, the second has the file name and the third has the caption
-    for (let i = 3; i < tableRowsFlattened.length; i += 3) {
-        const row = tableRowsFlattened.slice(i, i + 3);
+    // the image, the second has the file name and the third has the caption (maybe additional columns
+    // have been added or reordered since this was written, but follow the same approach)
+    for (let i = numCols; i < tableRowsFlattened.length; i += numCols) {
+        const row = tableRowsFlattened.slice(i, i + numCols);
         embeddedFileNames.push(row[1]);
         captions.push(row[2]);
     }
 
     // extract all hyperlinks embedded in the document
     const rels = await Utilities.readAndParseXml(zip, "word/_rels/document.xml.rels");
-    const hyperlinks = rels.find(hyperlinkXPath).map(el => el.attrs()[2].value());
+    const hyperlinks = rels.find(xPaths.hyperlinks).map(el => el.attrs()[2].value());
     console.log("Text extracted.");
 
     // write out the images for this document
diff --git a/src/server/DashUploadUtils.ts b/src/server/DashUploadUtils.ts
index 3f903a861..8567631cd 100644
--- a/src/server/DashUploadUtils.ts
+++ b/src/server/DashUploadUtils.ts
@@ -325,12 +325,7 @@ export namespace DashUploadUtils {
             const outputPath = path.resolve(outputDirectory, writtenFiles[suffix] = InjectSize(outputFileName, suffix));
             await new Promise<void>(async (resolve, reject) => {
                 const source = streamProvider();
-                let readStream: Stream;
-                if (source instanceof Promise) {
-                    readStream = await source;
-                } else {
-                    readStream = source;
-                }
+                let readStream: Stream = source instanceof Promise ? await source : source;
                 if (resizer) {
                     readStream = readStream.pipe(resizer.withMetadata());
                 }
-- 
cgit v1.2.3-70-g09d2


From 7b5b04560ba24b049d77d36562fed1f7dc190d43 Mon Sep 17 00:00:00 2001
From: Sam Wilkins <samwilkins333@gmail.com>
Date: Wed, 13 May 2020 01:22:16 -0700
Subject: improved buxton heuristic, but still seems intractable

---
 src/scraping/buxton/final/BuxtonImporter.ts | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'src/scraping/buxton/final')

diff --git a/src/scraping/buxton/final/BuxtonImporter.ts b/src/scraping/buxton/final/BuxtonImporter.ts
index 94302c7b3..e55850b29 100644
--- a/src/scraping/buxton/final/BuxtonImporter.ts
+++ b/src/scraping/buxton/final/BuxtonImporter.ts
@@ -451,11 +451,23 @@ async function writeImages(zip: any): Promise<ImageData[]> {
         });
 
         // if it's not an icon, by this rough heuristic, i.e. is it not square
-        if (Math.abs(width - height) > 10) {
-            valid.push({ width, height, type, mediaPath });
+        const number = Number(/image(\d+)/.exec(mediaPath)![1]);
+        if (number > 5 || width - height > 10) {
+            valid.push({ width, height, type, mediaPath, number });
         }
     }
 
+    valid.sort((a, b) => a.number - b.number);
+
+    const [{ width: first_w, height: first_h }, { width: second_w, height: second_h }] = valid;
+    if (Math.abs(first_w / second_w - first_h / second_h) < 0.01) {
+        const first_size = first_w * first_h;
+        const second_size = second_w * second_h;
+        const target = first_size >= second_size ? 1 : 0;
+        valid.splice(target, 1);
+        console.log(`Heuristically removed image with size ${target ? second_size : first_size}`);
+    }
+
     // for each valid image, output the _o, _l, _m, and _s files
     // THIS IS WHERE THE SCRIPT SPENDS MOST OF ITS TIME
     for (const { type, width, height, mediaPath } of valid) {
-- 
cgit v1.2.3-70-g09d2