diff options
Diffstat (limited to 'src/scraping')
-rw-r--r-- | src/scraping/buxton/final/BuxtonImporter.ts | 52 |
1 files changed, 32 insertions, 20 deletions
diff --git a/src/scraping/buxton/final/BuxtonImporter.ts b/src/scraping/buxton/final/BuxtonImporter.ts index 3d7421e90..d9d48d68c 100644 --- a/src/scraping/buxton/final/BuxtonImporter.ts +++ b/src/scraping/buxton/final/BuxtonImporter.ts @@ -8,7 +8,6 @@ const StreamZip = require('node-stream-zip'); const createImageSizeStream = require("image-size-stream"); import { parseXml } from "libxmljs"; import { strictEqual } from "assert"; -import { BatchedArray, TimeUnit } from "array-batcher"; interface DocumentContents { body: string; @@ -24,21 +23,33 @@ export interface DeviceDocument { longDescription: string; company: string; year: number; - originalPrice: number; - degreesOfFreedom: number; + originalPrice?: number; + degreesOfFreedom?: number; dimensions?: string; primaryKey: string; secondaryKey: string; attribute: string; + __images: string[]; + hyperlinks: string[]; + captions: string[]; + embeddedFileNames: string[]; } -interface AnalysisResult { +export interface AnalysisResult { device?: DeviceDocument; - errors?: any; + errors?: { [key: string]: string }; } type Transformer<T> = (raw: string) => { transformed?: T, error?: string }; +export interface ImportResults { + deviceCount: number, + errorCount: number +} + +type ResultCallback = (result: AnalysisResult) => void; +type TerminatorCallback = (result: ImportResults) => void; + interface Processor<T> { exp: RegExp; matchIndex?: number; @@ -168,7 +179,7 @@ const successOut = "buxton.json"; const failOut = "incomplete.json"; const deviceKeys = Array.from(RegexMap.keys()); -export default async function executeImport() { +export default async function executeImport(emitter: ResultCallback, terminator: TerminatorCallback) { try { const contents = readdirSync(sourceDir); const wordDocuments = contents.filter(file => /.*\.docx?$/.test(file)).map(file => `${sourceDir}/${file}`); @@ -176,7 +187,7 @@ export default async function executeImport() { rimraf.sync(dir); mkdirSync(dir); }); - return parseFiles(wordDocuments); + return parseFiles(wordDocuments, emitter, terminator); } catch (e) { const message = [ "Unable to find a source directory.", @@ -188,23 +199,22 @@ export default async function executeImport() { } } -async function parseFiles(wordDocuments: string[]): Promise<DeviceDocument[]> { - const imported = await BatchedArray.from(wordDocuments, { batchSize: 8 }).batchedMapPatientInterval<{ fileName: string, contents: DocumentContents }>({ magnitude: 10, unit: TimeUnit.Seconds }, async (batch, collector) => { - for (const filePath of batch) { - const fileName = path.basename(filePath).replace("Bill_Notes_", ""); - console.log(cyan(`\nExtracting contents from ${fileName}...`)); - collector.push({ fileName, contents: await extractFileContents(filePath) }); - } - }); - console.log(yellow("\nAnalyzing the extracted document text...\n")); - const results = imported.map(({ fileName, contents }) => analyze(fileName, contents)); +async function parseFiles(wordDocuments: string[], emitter: ResultCallback, terminator: TerminatorCallback): Promise<DeviceDocument[]> { + const results: AnalysisResult[] = []; + for (const filePath of wordDocuments) { + const fileName = path.basename(filePath).replace("Bill_Notes_", ""); + console.log(cyan(`\nExtracting contents from ${fileName}...`)); + const result = analyze(fileName, await extractFileContents(filePath)); + emitter(result); + results.push(result); + } const masterDevices: DeviceDocument[] = []; - const masterErrors: any[] = []; + const masterErrors: { [key: string]: string }[] = []; results.forEach(({ device, errors }) => { if (device) { masterDevices.push(device); - } else { + } else if (errors) { masterErrors.push(errors); } }); @@ -219,6 +229,8 @@ async function parseFiles(wordDocuments: string[]): Promise<DeviceDocument[]> { await writeOutputFile(failOut, masterErrors, total, false); console.log(); + terminator({ deviceCount: masterDevices.length, errorCount: masterErrors.length }); + return masterDevices; } @@ -311,7 +323,7 @@ function analyze(fileName: string, contents: DocumentContents): AnalysisResult { embeddedFileNames, __images: imageUrls }; - const errors: any = { fileName }; + const errors: { [key: string]: string } = { fileName }; for (const key of deviceKeys) { const { exp, transformer, matchIndex, required } = RegexMap.get(key)!; |