aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorSam Wilkins <samwilkins333@gmail.com>2020-04-30 19:06:42 -0700
committerSam Wilkins <samwilkins333@gmail.com>2020-04-30 19:06:42 -0700
commit1660defc561c904217ed5be34cd6e0fe64736fe1 (patch)
treea5dc2ccc790be5c2d6b815a5a4c4859b61ecea89 /src
parent22748f8d35235941fc6622b19a2d4d3f809ccee7 (diff)
commented Buxton importer
Diffstat (limited to 'src')
-rw-r--r--src/scraping/buxton/final/BuxtonImporter.ts212
-rw-r--r--src/server/authentication/models/current_user_utils.ts1
2 files changed, 179 insertions, 34 deletions
diff --git a/src/scraping/buxton/final/BuxtonImporter.ts b/src/scraping/buxton/final/BuxtonImporter.ts
index 713207a07..21363f848 100644
--- a/src/scraping/buxton/final/BuxtonImporter.ts
+++ b/src/scraping/buxton/final/BuxtonImporter.ts
@@ -10,6 +10,10 @@ import { parseXml } from "libxmljs";
import { strictEqual } from "assert";
import { Readable, PassThrough } from "stream";
+/**
+ * This is an arbitrary bundle of data that gets populated
+ * in extractFileContents
+ */
interface DocumentContents {
body: string;
imageData: ImageData[];
@@ -19,6 +23,10 @@ interface DocumentContents {
longDescription: string;
}
+/**
+ * A rough schema for everything that Bill has
+ * included for each document
+ */
export interface DeviceDocument {
title: string;
shortDescription: string;
@@ -33,36 +41,65 @@ export interface DeviceDocument {
attribute: string;
__images: ImageData[];
hyperlinks: string[];
- captions: string[];
- embeddedFileNames: string[];
+ captions: string[]; // from the table column
+ embeddedFileNames: string[]; // from the table column
}
+/**
+ * A layer of abstraction around a single parsing
+ * attempt. The error is not a TypeScript error, but
+ * rather an invalidly formatted value for a given key.
+ */
export interface AnalysisResult {
device?: DeviceDocument;
- errors?: { [key: string]: string };
+ invalid?: { [deviceProperty: string]: string };
}
+/**
+ * A mini API that takes in a string and returns
+ * either the given T or an error indicating that the
+ * transformation was rejected.
+ */
type Transformer<T> = (raw: string) => TransformResult<T>;
interface TransformResult<T> {
transformed?: T;
error?: string;
}
+/**
+ * Simple bundle counting successful and failed imports
+ */
export interface ImportResults {
deviceCount: number;
errorCount: number;
}
+/**
+ * Definitions for callback functions. Such instances are
+ * just invoked by when a single document has been parsed
+ * or the entire import is over. As of this writing, these
+ * callbacks are supplied by WebSocket.ts and used to inform
+ * the client of these events.
+ */
type ResultCallback = (result: AnalysisResult) => void;
type TerminatorCallback = (result: ImportResults) => void;
-interface Processor<T> {
- exp: RegExp;
- matchIndex?: number;
- transformer?: Transformer<T>;
- required?: boolean;
+/**
+ * Defines everything needed to define how a single key should be
+ * formatted within the plain body text. The association between
+ * keys and their format definitions is stored FormatMap
+ */
+interface ValueFormatDefinition<T> {
+ exp: RegExp; // the expression that the key's value should match
+ matchIndex?: number; // defaults to 0, but can be overridden to account for grouping in @param exp
+ transformer?: Transformer<T>; // if desirable, how to transform the Regex match
+ required?: boolean; // defaults to true, confirms that for a whole document to be counted successful,
+ // all of its required values should be present and properly formatted
}
+/**
+ * The basic data we extract from each image in the document
+ */
interface ImageData {
url: string;
nativeWidth: number;
@@ -71,6 +108,10 @@ interface ImageData {
namespace Utilities {
+ /**
+ * Numeric 'try parse', fits with the Transformer API
+ * @param raw the serialized number
+ */
export function numberValue(raw: string): TransformResult<number> {
const transformed = Number(raw);
if (isNaN(transformed)) {
@@ -79,18 +120,32 @@ namespace Utilities {
return { transformed };
}
+ /**
+ * A simple tokenizer that splits along 'and' and commas, and removes duplicates
+ * Helpful mainly for attribute and primary key lists
+ * @param raw the string to tokenize
+ */
export function collectUniqueTokens(raw: string): TransformResult<string[]> {
const pieces = raw.replace(/,|\s+and\s+/g, " ").split(/\s+/).filter(piece => piece.length);
const unique = new Set(pieces.map(token => token.toLowerCase().trim()));
return { transformed: Array.from(unique).map(capitalize).sort() };
}
+ /**
+ * Tries to correct XML text parsing artifact where some sentences lose their separating space,
+ * and others gain excess whitespace
+ * @param raw
+ */
export function correctSentences(raw: string): TransformResult<string> {
raw = raw.replace(/\./g, ". ").replace(/\:/g, ": ").replace(/\,/g, ", ").replace(/\?/g, "? ").trimRight();
raw = raw.replace(/\s{2,}/g, " ");
return { transformed: raw };
}
+ /**
+ * Simple capitalization
+ * @param word to capitalize
+ */
export function capitalize(word: string): string {
const clean = word.trim();
if (!clean.length) {
@@ -99,6 +154,12 @@ namespace Utilities {
return word.charAt(0).toUpperCase() + word.slice(1);
}
+ /**
+ * Streams the requeted file at the relative path to the
+ * root of the zip, then parses it with a library
+ * @param zip the zip instance data source
+ * @param relativePath the path to a .xml file within the zip to parse
+ */
export async function readAndParseXml(zip: any, relativePath: string) {
console.log(`Text streaming ${relativePath}`);
const contents = await new Promise<string>((resolve, reject) => {
@@ -111,13 +172,17 @@ namespace Utilities {
stream.on('end', () => resolve(body));
});
});
-
return parseXml(contents);
}
-
}
-const RegexMap = new Map<keyof DeviceDocument, Processor<any>>([
+/**
+ * Defines how device values should be formatted. As you can see, the formatting is
+ * not super consistent and has changed over time as edge cases have been found, but this
+ * at least imposes some constraints, and will notify you if a document doesn't match the specifications
+ * in this map.
+ */
+const FormatMap = new Map<keyof DeviceDocument, ValueFormatDefinition<any>>([
["title", {
exp: /contact\s+(.*)Short Description:/
}],
@@ -189,17 +254,25 @@ const RegexMap = new Map<keyof DeviceDocument, Processor<any>>([
}],
]);
-const sourceDir = path.resolve(__dirname, "source");
-const outDir = path.resolve(__dirname, "json");
-const imageDir = path.resolve(__dirname, "../../../server/public/files/images/buxton");
-const successOut = "buxton.json";
-const failOut = "incomplete.json";
-const deviceKeys = Array.from(RegexMap.keys());
-
+const sourceDir = path.resolve(__dirname, "source"); // where the Word documents are assumed to be stored
+const outDir = path.resolve(__dirname, "json"); // where the JSON output of these device documents will be written
+const imageDir = path.resolve(__dirname, "../../../server/public/files/images/buxton"); // where, in the server, these images will be written
+const successOut = "buxton.json"; // the JSON list representing properly formatted documents
+const failOut = "incomplete.json"; // the JSON list representing improperly formatted documents
+const deviceKeys = Array.from(FormatMap.keys()); // a way to iterate through all keys of the DeviceDocument interface
+
+/**
+ * Starts by REMOVING ALL EXISTING BUXTON RESOURCES. This might need to be
+ * changed going forward
+ * @param emitter the callback when each document is completed
+ * @param terminator the callback when the entire import is completed
+ */
export default async function executeImport(emitter: ResultCallback, terminator: TerminatorCallback) {
try {
+ // get all Word documents in the source directory
const contents = readdirSync(sourceDir);
const wordDocuments = contents.filter(file => /.*\.docx?$/.test(file)).map(file => `${sourceDir}/${file}`);
+ // removal takes place here
[outDir, imageDir].forEach(dir => {
rimraf.sync(dir);
mkdirSync(dir);
@@ -216,19 +289,28 @@ export default async function executeImport(emitter: ResultCallback, terminator:
}
}
+/**
+ * Parse every Word document in the directory, notifying any callers as needed
+ * at each iteration via the emitter.
+ * @param wordDocuments the string list of Word document names to parse
+ * @param emitter the callback when each document is completed
+ * @param terminator the callback when the entire import is completed
+ */
async function parseFiles(wordDocuments: string[], emitter: ResultCallback, terminator: TerminatorCallback): Promise<DeviceDocument[]> {
+ // execute parent-most parse function
const results: AnalysisResult[] = [];
for (const filePath of wordDocuments) {
- const fileName = path.basename(filePath).replace("Bill_Notes_", "");
+ const fileName = path.basename(filePath).replace("Bill_Notes_", ""); // not strictly needed, but cleaner
console.log(cyan(`\nExtracting contents from ${fileName}...`));
const result = analyze(fileName, await extractFileContents(filePath));
emitter(result);
results.push(result);
}
+ // collect information about errors and successes
const masterDevices: DeviceDocument[] = [];
const masterErrors: { [key: string]: string }[] = [];
- results.forEach(({ device, errors }) => {
+ results.forEach(({ device, invalid: errors }) => {
if (device) {
masterDevices.push(device);
} else if (errors) {
@@ -236,24 +318,45 @@ async function parseFiles(wordDocuments: string[], emitter: ResultCallback, term
}
});
+ // something went wrong, since errors and successes should sum to total inputs
const total = wordDocuments.length;
if (masterDevices.length + masterErrors.length !== total) {
throw new Error(`Encountered a ${masterDevices.length} to ${masterErrors.length} mismatch in device / error split!`);
}
+ // write the external JSON representations of this import
console.log();
await writeOutputFile(successOut, masterDevices, total, true);
await writeOutputFile(failOut, masterErrors, total, false);
console.log();
+ // notify the caller that the import has finished
terminator({ deviceCount: masterDevices.length, errorCount: masterErrors.length });
return masterDevices;
}
+/**
+ * XPath definitions for desired XML targets in respective hierarchies.
+ *
+ * For table cells, can be read as: "find me anything that looks like <w:tc> in XML, whose
+ * parent looks like <w:tr>, whose parent looks like <w:tbl>"
+ *
+ * <w:tbl>
+ * <w:tr>
+ * <w:tc>
+ *
+ * These are found by trial and error, and using an online XML parser / prettifier
+ * to inspect the structure, since the Node XML library does not expose the parsed
+ * structure very well for searching, say in the debug console.
+ */
const tableCellXPath = '//*[name()="w:tbl"]/*[name()="w:tr"]/*[name()="w:tc"]';
const hyperlinkXPath = '//*[name()="Relationship" and contains(@Type, "hyperlink")]';
+/**
+ * The meat of the script, images and text content are extracted here
+ * @param pathToDocument the path to the document relative to the root of the zip
+ */
async function extractFileContents(pathToDocument: string): Promise<DocumentContents> {
console.log('Extracting text...');
const zip = new StreamZip({ file: pathToDocument, storeEntries: true });
@@ -261,22 +364,30 @@ async function extractFileContents(pathToDocument: string): Promise<DocumentCont
// extract the body of the document and, specifically, its captions
const document = await Utilities.readAndParseXml(zip, "word/document.xml");
+ // get plain text
const body = document.root()?.text() ?? "No body found. Check the import script's XML parser.";
const captions: string[] = [];
const embeddedFileNames: string[] = [];
- const captionTargets = document.find(tableCellXPath).map(node => node.text().trim());
+ // preserve paragraph formatting and line breaks that would otherwise get lost in the plain text parsing
+ // of the XML hierarchy
const paragraphs = document.find('//*[name()="w:p"]').map(node => Utilities.correctSentences(node.text()).transformed!);
const start = paragraphs.indexOf(paragraphs.find(el => /Bill Buxton[’']s Notes/.test(el))!) + 1;
const end = paragraphs.indexOf("Device Details");
const longDescription = paragraphs.slice(start, end).filter(paragraph => paragraph.length).join("\n\n");
- const { length } = captionTargets;
+ // extract captions from the table cells
+ const tableRowsFlattened = document.find(tableCellXPath).map(node => node.text().trim());
+ const { length } = tableRowsFlattened;
strictEqual(length > 3, true, "No captions written.");
strictEqual(length % 3 === 0, true, "Improper caption formatting.");
- for (let i = 3; i < captionTargets.length; i += 3) {
- const row = captionTargets.slice(i, i + 3);
+ // break the flat list of strings into groups of three, since there
+ // currently are three columns in the table. Thus, each group represents
+ // a row in the table, where the first row has no text content since it's
+ // the image, the second has the file name and the third has the caption
+ for (let i = 3; i < tableRowsFlattened.length; i += 3) {
+ const row = tableRowsFlattened.slice(i, i + 3);
embeddedFileNames.push(row[1]);
captions.push(row[2]);
}
@@ -286,23 +397,34 @@ async function extractFileContents(pathToDocument: string): Promise<DocumentCont
const hyperlinks = rels.find(hyperlinkXPath).map(el => el.attrs()[2].value());
console.log("Text extracted.");
+ // write out the images for this document
console.log("Beginning image extraction...");
const imageData = await writeImages(zip);
console.log(`Extracted ${imageData.length} images.`);
+ // cleanup
zip.close();
return { body, longDescription, imageData, captions, embeddedFileNames, hyperlinks };
}
+// zip relative path from root expression / filter used to isolate only media assets
const imageEntry = /^word\/media\/\w+\.(jpeg|jpg|png|gif)/;
-interface Dimensions {
+/**
+ * Image dimensions and file suffix,
+ */
+interface ImageAttrs {
width: number;
height: number;
type: string;
}
+/**
+ * For each image, stream the file, get its size, check if it's an icon
+ * (if it is, ignore it)
+ * @param zip the zip instance data source
+ */
async function writeImages(zip: any): Promise<ImageData[]> {
const allEntries = Object.values<any>(zip.entries()).map(({ name }) => name);
const imageEntries = allEntries.filter(name => imageEntry.test(name));
@@ -315,8 +437,8 @@ async function writeImages(zip: any): Promise<ImageData[]> {
});
for (const mediaPath of imageEntries) {
- const { width, height, type } = await new Promise<Dimensions>(async resolve => {
- const sizeStream = (createImageSizeStream() as PassThrough).on('size', (dimensions: Dimensions) => {
+ const { width, height, type } = await new Promise<ImageAttrs>(async resolve => {
+ const sizeStream = (createImageSizeStream() as PassThrough).on('size', (dimensions: ImageAttrs) => {
readStream.destroy();
resolve(dimensions);
}).on("error", () => readStream.destroy());
@@ -324,11 +446,14 @@ async function writeImages(zip: any): Promise<ImageData[]> {
readStream.pipe(sizeStream);
});
+ // if it's not an icon, by this rough heuristic, i.e. is it not square
if (Math.abs(width - height) > 10) {
valid.push({ width, height, type, mediaPath });
}
}
+ // for each valid image, output the _o, _l, _m, and _s files
+ // THIS IS WHERE THE SCRIPT SPENDS MOST OF ITS TIME
for (const { type, width, height, mediaPath } of valid) {
const generatedFileName = `upload_${Utils.GenerateGuid()}.${type.toLowerCase()}`;
await DashUploadUtils.outputResizedImages(() => getImageStream(mediaPath), generatedFileName, imageDir);
@@ -342,6 +467,14 @@ async function writeImages(zip: any): Promise<ImageData[]> {
return imageUrls;
}
+/**
+ * Takes the results of extractFileContents, which relative to this is sort of the
+ * external media / preliminary text processing, and now tests the given file name to
+ * with those value definitions to make sure the body of the document contains all
+ * required fields, properly formatted
+ * @param fileName the file whose body to inspect
+ * @param contents the data already computed / parsed by extractFileContents
+ */
function analyze(fileName: string, contents: DocumentContents): AnalysisResult {
const { body, imageData, captions, hyperlinks, embeddedFileNames, longDescription } = contents;
const device: any = {
@@ -354,43 +487,56 @@ function analyze(fileName: string, contents: DocumentContents): AnalysisResult {
const errors: { [key: string]: string } = { fileName };
for (const key of deviceKeys) {
- const { exp, transformer, matchIndex, required } = RegexMap.get(key)!;
+ const { exp, transformer, matchIndex, required } = FormatMap.get(key)!;
const matches = exp.exec(body);
let captured: string;
- if (matches && (captured = matches[matchIndex ?? 1])) {
- captured = captured.replace(/\s{2,}/g, " ");
+ // if we matched and we got the specific match we're after
+ if (matches && (captured = matches[matchIndex ?? 1])) { // matchIndex defaults to 1
+ captured = captured.replace(/\s{2,}/g, " "); // remove excess whitespace
+ // if supplied, apply the required transformation (recall this is specified in FormatMap)
if (transformer) {
const { error, transformed } = transformer(captured);
if (error) {
+ // we hit a snag trying to transform the valid match
+ // still counts as a fundamental error
errors[key] = `__ERR__${key.toUpperCase()}__TRANSFORM__: ${error}`;
continue;
}
captured = transformed;
}
-
device[key] = captured;
} else if (required ?? true) {
+ // the field was either implicitly or explicitly required, and failed to match the definition in
+ // FormatMap
errors[key] = `ERR__${key.toUpperCase()}__: outer match ${matches === null ? "wasn't" : "was"} captured.`;
continue;
}
}
+ // print errors - this can be removed
const errorKeys = Object.keys(errors);
if (errorKeys.length > 1) {
console.log(red(`@ ${cyan(fileName.toUpperCase())}...`));
errorKeys.forEach(key => key !== "filename" && console.log(red(errors[key])));
- return { errors };
+ return { invalid: errors };
}
return { device };
}
+/**
+ * A utility function that writes the JSON results for this import out to the desired path
+ * @param relativePath where to write the JSON file
+ * @param data valid device document objects, or errors
+ * @param total used for more informative printing
+ * @param success whether or not the caller is writing the successful parses or the failures
+ */
async function writeOutputFile(relativePath: string, data: any[], total: number, success: boolean) {
console.log(yellow(`Encountered ${data.length} ${success ? "valid" : "invalid"} documents out of ${total} candidates. Writing ${relativePath}...`));
return new Promise<void>((resolve, reject) => {
const destination = path.resolve(outDir, relativePath);
- const contents = JSON.stringify(data, undefined, 4);
+ const contents = JSON.stringify(data, undefined, 4); // format the JSON
writeFile(destination, contents, err => err ? reject(err) : resolve());
});
} \ No newline at end of file
diff --git a/src/server/authentication/models/current_user_utils.ts b/src/server/authentication/models/current_user_utils.ts
index 663343f47..d7cc1e6bf 100644
--- a/src/server/authentication/models/current_user_utils.ts
+++ b/src/server/authentication/models/current_user_utils.ts
@@ -9,7 +9,6 @@ import { List } from "../../../new_fields/List";
import { listSpec } from "../../../new_fields/Schema";
import { ScriptField, ComputedField } from "../../../new_fields/ScriptField";
import { Cast, PromiseValue, StrCast, NumCast } from "../../../new_fields/Types";
-import { Utils } from "../../../Utils";
import { nullAudio, ImageField } from "../../../new_fields/URLField";
import { DragManager } from "../../../client/util/DragManager";
import { InkingControl } from "../../../client/views/InkingControl";