diff options
author | Sam Wilkins <samwilkins333@gmail.com> | 2020-02-08 17:32:33 -0500 |
---|---|---|
committer | Sam Wilkins <samwilkins333@gmail.com> | 2020-02-08 17:32:33 -0500 |
commit | 79e1323acd0d0f95d08a09cefce908e35d0e7558 (patch) | |
tree | 3ee8c780fca7d8a7a35847af5f59d2d3fd0875f6 /src/scraping/buxton/node_scraper.ts | |
parent | 36933b7b647a54aa7bda0600612d34b402d42919 (diff) |
initial commit, pending png read error bug fix
Diffstat (limited to 'src/scraping/buxton/node_scraper.ts')
-rw-r--r-- | src/scraping/buxton/node_scraper.ts | 256 |
1 files changed, 0 insertions, 256 deletions
diff --git a/src/scraping/buxton/node_scraper.ts b/src/scraping/buxton/node_scraper.ts deleted file mode 100644 index ab6c9dcb2..000000000 --- a/src/scraping/buxton/node_scraper.ts +++ /dev/null @@ -1,256 +0,0 @@ -import { readdirSync, writeFile, existsSync, mkdirSync } from "fs"; -import * as path from "path"; -import { red, cyan, yellow, green } from "colors"; -import { Opt } from "../../new_fields/Doc"; -const StreamZip = require('node-stream-zip'); - -export interface DeviceDocument { - title: string; - shortDescription: string; - longDescription: string; - company: string; - year: number; - originalPrice: number; - degreesOfFreedom: number; - dimensions: string; - primaryKey: string; - secondaryKey: string; -} - -interface AnalysisResult { - device?: DeviceDocument; - errors?: any; -} - -type Converter<T> = (raw: string) => { transformed?: T, error?: string }; - -interface Processor<T> { - exp: RegExp; - matchIndex?: number; - transformer?: Converter<T>; -} - -const RegexMap = new Map<keyof DeviceDocument, Processor<any>>([ - ["title", { - exp: /contact\s+(.*)Short Description:/ - }], - ["company", { - exp: /Company:\s+([^\|]*)\s+\|/, - transformer: (raw: string) => ({ transformed: raw.replace(/\./g, "") }) - }], - ["year", { - exp: /Year:\s+([^\|]*)\s+\|/, - transformer: numberValue - }], - ["primaryKey", { - exp: /Primary:\s+(.*)(Secondary|Additional):/, - transformer: collectUniqueTokens - }], - ["secondaryKey", { - exp: /(Secondary|Additional):\s+([^\{\}]*)Links/, - transformer: collectUniqueTokens, - matchIndex: 2 - }], - ["originalPrice", { - exp: /Original Price \(USD\)\:\s+\$([0-9\.]+)/, - transformer: numberValue - }], - ["degreesOfFreedom", { - exp: /Degrees of Freedom:\s+([0-9]+)/, - transformer: numberValue - }], - ["dimensions", { - exp: /Dimensions\s+\(L x W x H\):\s+([0-9\.]+\s+x\s+[0-9\.]+\s+x\s+[0-9\.]+\s\([A-Za-z]+\))/, - transformer: (raw: string) => { - const [length, width, group] = raw.split(" x "); - const [height, unit] = group.split(" "); - return { - transformed: { - length: Number(length), - width: Number(width), - height: Number(height), - unit: unit.replace(/[\(\)]+/g, "") - } - }; - } - }], - ["shortDescription", { - exp: /Short Description:\s+(.*)Bill Buxton[’']s Notes/, - transformer: correctSentences - }], - ["longDescription", { - exp: /Bill Buxton[’']s Notes(.*)Device Details/, - transformer: correctSentences - }], -]); - -function numberValue(raw: string) { - const transformed = Number(raw); - if (isNaN(transformed)) { - return { error: `${transformed} cannot be parsed to a numeric value.` }; - } - return { transformed }; -} - -function collectUniqueTokens(raw: string) { - return { transformed: Array.from(new Set(raw.replace(/,|\s+and\s+/g, " ").split(/\s+/).map(token => token.toLowerCase().trim()))).map(capitalize).sort() }; -} - -function correctSentences(raw: string) { - raw = raw.replace(/\./g, ". ").replace(/\:/g, ": ").replace(/\,/g, ", ").replace(/\?/g, "? ").trimRight(); - raw = raw.replace(/\s{2,}/g, " "); - return { transformed: raw }; -} - -const outDir = path.resolve(__dirname, "json"); -const successOut = "buxton.json"; -const failOut = "incomplete.json"; -const deviceKeys = Array.from(RegexMap.keys()); - -function printEntries(zip: any) { - const { entriesCount } = zip; - console.log(`Recognized ${entriesCount} entr${entriesCount === 1 ? "y" : "ies"}.`); - for (const entry of Object.values<any>(zip.entries())) { - const desc = entry.isDirectory ? 'directory' : `${entry.size} bytes`; - console.log(`${entry.name}: ${desc}`); - } -} - -async function wordToPlainText(pathToDocument: string): Promise<string> { - const zip = new StreamZip({ file: pathToDocument, storeEntries: true }); - const contents = await new Promise<string>((resolve, reject) => { - zip.on('ready', () => { - let body = ""; - zip.stream("word/document.xml", (error: any, stream: any) => { - if (error) { - reject(error); - } - stream.on('data', (chunk: any) => body += chunk.toString()); - stream.on('end', () => { - resolve(body); - zip.close(); - }); - }); - }); - }); - let body = ""; - const components = contents.toString().split('<w:t'); - for (const component of components) { - const tags = component.split('>'); - const content = tags[1].replace(/<.*$/, ""); - body += content; - } - return body; -} - -function tryGetValidCapture(matches: RegExpExecArray | null, matchIndex: number): Opt<string> { - let captured: string; - if (!matches || !(captured = matches[matchIndex])) { - return undefined; - } - const lower = captured.toLowerCase(); - if (/to come/.test(lower)) { - return undefined; - } - if (lower.includes("xxx")) { - return undefined; - } - if (!captured.toLowerCase().replace(/[….\s]+/g, "").length) { - return undefined; - } - return captured; -} - -function capitalize(word: string): string { - const clean = word.trim(); - if (!clean.length) { - return word; - } - return word.charAt(0).toUpperCase() + word.slice(1); -} - -function analyze(path: string, body: string): AnalysisResult { - const device: any = {}; - - const segments = path.split("/"); - const filename = segments[segments.length - 1].replace("Bill_Notes_", ""); - - const errors: any = { filename }; - - for (const key of deviceKeys) { - const { exp, transformer, matchIndex } = RegexMap.get(key)!; - const matches = exp.exec(body); - - let captured = tryGetValidCapture(matches, matchIndex ?? 1); - if (!captured) { - errors[key] = `ERR__${key.toUpperCase()}__: outer match ${matches === null ? "wasn't" : "was"} captured.`; - continue; - } - - captured = captured.replace(/\s{2,}/g, " "); - if (transformer) { - const { error, transformed } = transformer(captured); - if (error) { - errors[key] = `__ERR__${key.toUpperCase()}__TRANSFORM__: ${error}`; - continue; - } - captured = transformed; - } - - device[key] = captured; - } - - const errorKeys = Object.keys(errors); - if (errorKeys.length > 1) { - console.log(red(`\n@ ${cyan(filename.toUpperCase())}...`)); - errorKeys.forEach(key => key !== "filename" && console.log(red(errors[key]))); - return { errors }; - } - - return { device }; -} - -async function parseFiles(): Promise<DeviceDocument[]> { - const sourceDirectory = path.resolve(`${__dirname}/source`); - const candidates = readdirSync(sourceDirectory).filter(file => file.endsWith(".doc") || file.endsWith(".docx")).map(file => `${sourceDirectory}/${file}`); - const imported = await Promise.all(candidates.map(async path => ({ path, body: await wordToPlainText(path) }))); - // const imported = [{ path: candidates[10], body: await extract(candidates[10]) }]; - const data = imported.map(({ path, body }) => analyze(path, body)); - const masterDevices: DeviceDocument[] = []; - const masterErrors: any[] = []; - data.forEach(({ device, errors }) => { - if (device) { - masterDevices.push(device); - } else { - masterErrors.push(errors); - } - }); - const total = candidates.length; - if (masterDevices.length + masterErrors.length !== total) { - throw new Error(`Encountered a ${masterDevices.length} to ${masterErrors.length} mismatch in device / error split!`); - } - console.log(); - await writeOutputFile(successOut, masterDevices, total, true); - await writeOutputFile(failOut, masterErrors, total, false); - console.log(); - - return masterDevices; -} - -async function writeOutputFile(relativePath: string, data: any[], total: number, success: boolean) { - console.log(yellow(`Encountered ${data.length} ${success ? "valid" : "invalid"} documents out of ${total} candidates. Writing ${relativePath}...`)); - return new Promise<void>((resolve, reject) => { - const destination = path.resolve(outDir, relativePath); - const contents = JSON.stringify(data, undefined, 4); - writeFile(destination, contents, err => err ? reject(err) : resolve()); - }); -} - -export async function main() { - if (!existsSync(outDir)) { - mkdirSync(outDir); - } - return parseFiles(); -} - -main();
\ No newline at end of file |