diff options
author | Mohammad Amoush <47069173+mamoush34@users.noreply.github.com> | 2020-02-08 17:03:12 -0500 |
---|---|---|
committer | Mohammad Amoush <47069173+mamoush34@users.noreply.github.com> | 2020-02-08 17:03:12 -0500 |
commit | f9855e8d1ec83405ae3cc7d0113b46de63fc0848 (patch) | |
tree | bf4be61a021e59b771c1cd5958fd9fd43cac8693 /src/scraping/buxton/node_scraper.ts | |
parent | 87f5f043388b591c52e96a795fa461a79770550d (diff) | |
parent | 1b046f76cf39f1f6cb1875aa84b45db74b6d994e (diff) |
Merge branch 'master' of https://github.com/browngraphicslab/Dash-Web into webcam_mohammad
Diffstat (limited to 'src/scraping/buxton/node_scraper.ts')
-rw-r--r-- | src/scraping/buxton/node_scraper.ts | 256 |
1 file changed, 256 insertions, 0 deletions
// src/scraping/buxton/node_scraper.ts
//
// Reads Word documents from ./source, flattens each document's XML into plain
// text, extracts device fields from that text with the regular expressions in
// RegexMap, and writes the results to ./json/buxton.json (documents for which
// every field was extracted) and ./json/incomplete.json (per-document errors).
import { readdirSync, writeFile, existsSync, mkdirSync } from "fs";
import * as path from "path";
import { red, cyan, yellow, green } from "colors";
import { Opt } from "../../new_fields/Doc";
const StreamZip = require('node-stream-zip');

/**
 * One successfully scraped device record.
 *
 * NOTE(review): the transformers in RegexMap produce a string[] for
 * primaryKey / secondaryKey and a { length, width, height, unit } object for
 * dimensions, which does not match the `string` declared here. The interface
 * is exported, so it is left untouched; confirm with consumers before
 * tightening these types.
 */
export interface DeviceDocument {
    title: string;
    shortDescription: string;
    longDescription: string;
    company: string;
    year: number;
    originalPrice: number;
    degreesOfFreedom: number;
    dimensions: string;
    primaryKey: string;
    secondaryKey: string;
}

/** Outcome of analyzing one document: either a device or an error report, never both. */
interface AnalysisResult {
    device?: DeviceDocument;
    errors?: any;
}

/** Converts a captured string into its final value, or reports why it could not. */
type Converter<T> = (raw: string) => { transformed?: T, error?: string };

/** How to locate (exp + matchIndex, default 1) and post-process (transformer) one field. */
interface Processor<T> {
    exp: RegExp;
    matchIndex?: number;
    transformer?: Converter<T>;
}

// Insertion order matters: it determines the order in which fields are
// processed, the order errors are printed, and the key order in the JSON output.
const RegexMap = new Map<keyof DeviceDocument, Processor<any>>([
    ["title", {
        exp: /contact\s+(.*)Short Description:/
    }],
    ["company", {
        exp: /Company:\s+([^\|]*)\s+\|/,
        transformer: (raw: string) => ({ transformed: raw.replace(/\./g, "") })
    }],
    ["year", {
        exp: /Year:\s+([^\|]*)\s+\|/,
        transformer: numberValue
    }],
    ["primaryKey", {
        exp: /Primary:\s+(.*)(Secondary|Additional):/,
        transformer: collectUniqueTokens
    }],
    ["secondaryKey", {
        exp: /(Secondary|Additional):\s+([^\{\}]*)Links/,
        transformer: collectUniqueTokens,
        // Group 1 is the "Secondary|Additional" label; the payload is group 2.
        matchIndex: 2
    }],
    ["originalPrice", {
        exp: /Original Price \(USD\)\:\s+\$([0-9\.]+)/,
        transformer: numberValue
    }],
    ["degreesOfFreedom", {
        exp: /Degrees of Freedom:\s+([0-9]+)/,
        transformer: numberValue
    }],
    ["dimensions", {
        exp: /Dimensions\s+\(L x W x H\):\s+([0-9\.]+\s+x\s+[0-9\.]+\s+x\s+[0-9\.]+\s\([A-Za-z]+\))/,
        transformer: parseDimensions
    }],
    ["shortDescription", {
        exp: /Short Description:\s+(.*)Bill Buxton[’']s Notes/,
        transformer: correctSentences
    }],
    ["longDescription", {
        exp: /Bill Buxton[’']s Notes(.*)Device Details/,
        transformer: correctSentences
    }],
]);

/**
 * Parses a numeric capture.
 * @returns `{ transformed }` with the number, or `{ error }` naming the
 * offending raw text when it is not numeric.
 */
function numberValue(raw: string): { transformed?: number, error?: string } {
    const transformed = Number(raw);
    if (isNaN(transformed)) {
        // Report the raw text: the parsed value is always NaN on this path.
        return { error: `${raw} cannot be parsed to a numeric value.` };
    }
    return { transformed };
}

/**
 * Splits a keyword list on commas, the word "and", and whitespace, then
 * returns the distinct tokens capitalized and sorted.
 */
function collectUniqueTokens(raw: string): { transformed: string[] } {
    const tokens = raw
        .replace(/,|\s+and\s+/g, " ")
        .split(/\s+/)
        .map(token => token.toLowerCase().trim())
        // Leading/trailing whitespace in the capture yields "" tokens; drop them
        // so an empty string never shows up as a keyword.
        .filter(token => token.length > 0);
    return { transformed: Array.from(new Set(tokens)).map(capitalize).sort() };
}

/**
 * Parses a "<L> x <W> x <H> (<unit>)" capture into numeric components.
 * Splits on any whitespace run so that it accepts everything the dimensions
 * regular expression accepts (tabs, non-breaking spaces), not just plain spaces.
 */
function parseDimensions(raw: string) {
    const [length, width, rest] = raw.split(/\s+x\s+/);
    const [height, unit] = (rest ?? "").split(/\s+/);
    if (!unit) {
        return { error: `${raw} is not of the form L x W x H (unit).` };
    }
    return {
        transformed: {
            length: Number(length),
            width: Number(width),
            height: Number(height),
            unit: unit.replace(/[\(\)]+/g, "")
        }
    };
}

/**
 * Re-inserts a space after '.', ':', ',' and '?' and then collapses runs of
 * whitespace into a single space.
 */
function correctSentences(raw: string): { transformed: string } {
    const spaced = raw
        .replace(/\./g, ". ")
        .replace(/\:/g, ": ")
        .replace(/\,/g, ", ")
        .replace(/\?/g, "? ")
        .trimRight();
    return { transformed: spaced.replace(/\s{2,}/g, " ") };
}

const outDir = path.resolve(__dirname, "json");
const successOut = "buxton.json";
const failOut = "incomplete.json";
const deviceKeys = Array.from(RegexMap.keys());

/** Debugging aid: logs the entry count, then the name and size of every entry in an opened zip. */
function printEntries(zip: any) {
    const { entriesCount } = zip;
    console.log(`Recognized ${entriesCount} entr${entriesCount === 1 ? "y" : "ies"}.`);
    for (const entry of Object.values<any>(zip.entries())) {
        const desc = entry.isDirectory ? 'directory' : `${entry.size} bytes`;
        console.log(`${entry.name}: ${desc}`);
    }
}

/**
 * Opens the document at `pathToDocument` as a zip archive, reads
 * word/document.xml, and flattens it to plain text by keeping the text that
 * immediately follows each tag beginning with "<w:t".
 *
 * Rejects if the archive cannot be opened, the entry cannot be streamed, or
 * the stream fails.
 */
async function wordToPlainText(pathToDocument: string): Promise<string> {
    const zip = new StreamZip({ file: pathToDocument, storeEntries: true });
    const contents = await new Promise<string>((resolve, reject) => {
        const fail = (error: any) => {
            zip.close();
            reject(error);
        };
        // Without a listener, an 'error' event would be thrown by the emitter.
        zip.on('error', fail);
        zip.on('ready', () => {
            zip.stream("word/document.xml", (error: any, stream: any) => {
                if (error) {
                    // Must return: `stream` is not usable when an error is reported.
                    return fail(error);
                }
                // Decode once at the end so a multi-byte character that straddles
                // two chunks (e.g. the ’ the regular expressions look for) survives.
                const chunks: Buffer[] = [];
                stream.on('data', (chunk: any) => chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk)));
                stream.on('error', fail);
                stream.on('end', () => {
                    zip.close();
                    resolve(Buffer.concat(chunks).toString());
                });
            });
        });
    });
    let body = "";
    for (const component of contents.split('<w:t')) {
        // component looks like `<rest of tag>` + `>` + text + `<next tag…`;
        // a fragment without '>' contributes nothing.
        const afterTag = component.split('>')[1] ?? "";
        body += afterTag.replace(/<.*$/, "");
    }
    return body;
}

/**
 * Returns the capture group at `matchIndex`, or undefined when there was no
 * match, the group is empty, or the text is a placeholder ("to come", "xxx",
 * or nothing but dots / ellipses / whitespace).
 */
function tryGetValidCapture(matches: RegExpExecArray | null, matchIndex: number): Opt<string> {
    const captured = matches?.[matchIndex];
    if (!captured) {
        return undefined;
    }
    const lower = captured.toLowerCase();
    if (/to come/.test(lower) || lower.includes("xxx")) {
        return undefined;
    }
    if (!lower.replace(/[….\s]+/g, "").length) {
        return undefined;
    }
    return captured;
}

/** Upper-cases the first character of the trimmed word; whitespace-only input is returned as-is. */
function capitalize(word: string): string {
    const clean = word.trim();
    if (!clean.length) {
        return word;
    }
    return clean.charAt(0).toUpperCase() + clean.slice(1);
}

/**
 * Extracts every field listed in RegexMap from `body`.
 *
 * @param path "/"-separated path of the source document; its last segment
 * (minus "Bill_Notes_") labels the error report.
 * @param body plain text of the document.
 * @returns `{ device }` when every field was captured and transformed,
 * otherwise `{ errors }` keyed by field name (plus `filename`). Failures are
 * also logged to the console.
 */
function analyze(path: string, body: string): AnalysisResult {
    const device: any = {};

    const segments = path.split("/");
    const filename = segments[segments.length - 1].replace("Bill_Notes_", "");

    const errors: any = { filename };

    for (const key of deviceKeys) {
        const { exp, transformer, matchIndex } = RegexMap.get(key)!;
        const matches = exp.exec(body);

        let captured = tryGetValidCapture(matches, matchIndex ?? 1);
        if (!captured) {
            // Same "__ERR__<KEY>__" prefix as the transform error below.
            errors[key] = `__ERR__${key.toUpperCase()}__: outer match ${matches === null ? "wasn't" : "was"} captured.`;
            continue;
        }

        captured = captured.replace(/\s{2,}/g, " ");
        if (transformer) {
            const { error, transformed } = transformer(captured);
            if (error) {
                errors[key] = `__ERR__${key.toUpperCase()}__TRANSFORM__: ${error}`;
                continue;
            }
            captured = transformed;
        }

        device[key] = captured;
    }

    // `filename` is always present, so more than one key means a field failed.
    const errorKeys = Object.keys(errors);
    if (errorKeys.length > 1) {
        console.log(red(`\n@ ${cyan(filename.toUpperCase())}...`));
        errorKeys.forEach(key => key !== "filename" && console.log(red(errors[key])));
        return { errors };
    }

    return { device };
}

/**
 * Converts and analyzes every .doc / .docx file in ./source, writes the
 * success and failure reports, and returns the successfully parsed devices.
 * Rejects if any document cannot be read or a report cannot be written.
 */
async function parseFiles(): Promise<DeviceDocument[]> {
    const sourceDirectory = path.resolve(`${__dirname}/source`);
    // Joined with "/" because analyze() splits on "/" to recover the file name.
    const candidates = readdirSync(sourceDirectory)
        .filter(file => file.endsWith(".doc") || file.endsWith(".docx"))
        .map(file => `${sourceDirectory}/${file}`);
    const imported = await Promise.all(candidates.map(async filePath => ({ filePath, body: await wordToPlainText(filePath) })));
    const data = imported.map(({ filePath, body }) => analyze(filePath, body));
    const masterDevices: DeviceDocument[] = [];
    const masterErrors: any[] = [];
    data.forEach(({ device, errors }) => {
        if (device) {
            masterDevices.push(device);
        } else {
            masterErrors.push(errors);
        }
    });
    const total = candidates.length;
    if (masterDevices.length + masterErrors.length !== total) {
        throw new Error(`Encountered a ${masterDevices.length} to ${masterErrors.length} mismatch in device / error split!`);
    }
    console.log();
    await writeOutputFile(successOut, masterDevices, total, true);
    await writeOutputFile(failOut, masterErrors, total, false);
    console.log();

    return masterDevices;
}

/**
 * Logs a summary line and writes `data` as 4-space-indented JSON to
 * `relativePath` inside the output directory.
 */
async function writeOutputFile(relativePath: string, data: any[], total: number, success: boolean) {
    console.log(yellow(`Encountered ${data.length} ${success ? "valid" : "invalid"} documents out of ${total} candidates. Writing ${relativePath}...`));
    return new Promise<void>((resolve, reject) => {
        const destination = path.resolve(outDir, relativePath);
        const contents = JSON.stringify(data, undefined, 4);
        writeFile(destination, contents, err => err ? reject(err) : resolve());
    });
}

/** Ensures the output directory exists, then runs the scrape. */
export async function main() {
    if (!existsSync(outDir)) {
        mkdirSync(outDir);
    }
    return parseFiles();
}

// The module still runs the scrape on load; a failure is now reported instead
// of being left as an unhandled promise rejection.
main().catch(error => console.error(red(`Scraping failed: ${error instanceof Error ? error.message : error}`)));
\ No newline at end of file |