diff options
author | Sam Wilkins <samwilkins333@gmail.com> | 2020-01-19 13:22:26 -0500 |
---|---|---|
committer | Sam Wilkins <samwilkins333@gmail.com> | 2020-01-19 13:22:26 -0500 |
commit | 3336d663bc04c602d652ebc1f63a26df68ff92c4 (patch) | |
tree | 801f3097f84f01c3bfbd56e01be66d854fb9f68c /src/scraping/buxton/node_scraper.ts | |
parent | aff9cc02750eb032ade98d77cf9ff45677063fc8 (diff) |
added node scraper
Diffstat (limited to 'src/scraping/buxton/node_scraper.ts')
-rw-r--r-- | src/scraping/buxton/node_scraper.ts | 57 |
1 files changed, 57 insertions, 0 deletions
// Reformatted from the collapsed patch hunk for src/scraping/buxton/node_scraper.ts
// (diff --git index e69de29bb..ef1d989d4, mode 100644).
import { readdirSync } from "fs";
import { join, resolve } from "path";

const StreamZip = require('node-stream-zip');

/** Subset of the archive entry fields that this script reads. */
interface ZipEntry {
    name: string;
    size: number;
    isDirectory: boolean;
}

/** Name of the archive entry whose contents `open` returns. */
const DOCUMENT_ENTRY = "word/document.xml";

/** The five predefined XML entities, keyed by the name between `&` and `;`. */
const XML_ENTITIES = new Map<string, string>([
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
    ["quot", '"'],
    ["apos", "'"]
]);

/**
 * Opens the zip archive at `path`, logs a listing of its entries and returns
 * the contents of `word/document.xml` as a UTF-8 string.
 *
 * @param path location of the archive on disk
 * @returns the raw XML of the document entry
 * @throws rejects when the archive cannot be read, when the entry is missing,
 *         or when the entry stream fails
 */
export async function open(path: string): Promise<string> {
    const zip = new StreamZip({
        file: path,
        storeEntries: true
    });
    return new Promise<string>((fulfill, reject) => {
        // Single failure path: settle the promise and release the archive.
        const fail = (error: unknown) => {
            reject(error);
            zip.close();
        };
        // Without this listener an unreadable or non-zip file would leave the
        // promise pending forever.
        zip.on('error', fail);
        zip.on('ready', () => {
            console.log("READY!", zip.entriesCount);
            for (const entry of Object.values(zip.entries()) as ZipEntry[]) {
                const desc = entry.isDirectory ? 'directory' : `${entry.size} bytes`;
                console.log(`Entry ${entry.name}: ${desc}`);
            }
            zip.stream(DOCUMENT_ENTRY, (error: unknown, stream: NodeJS.ReadableStream) => {
                if (error) {
                    // Stop here: no stream is available when an error is reported.
                    fail(error);
                    return;
                }
                // Decode once at the end so that a multi-byte character split
                // across two chunks is not corrupted.
                const chunks: Buffer[] = [];
                stream.on('data', (chunk: Buffer | string) => {
                    chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
                });
                stream.on('error', fail);
                stream.on('end', () => {
                    fulfill(Buffer.concat(chunks).toString("utf8"));
                    zip.close();
                });
            });
        });
    });
}

/** Replaces predefined and numeric XML character references with their characters. */
function decodeXmlEntities(text: string): string {
    return text.replace(/&(#x[0-9a-fA-F]+|#\d+|[a-zA-Z]+);/g, (match: string, ref: string) => {
        if (ref.startsWith("#")) {
            const codePoint = ref.startsWith("#x") ? parseInt(ref.slice(2), 16) : parseInt(ref.slice(1), 10);
            // Leave out-of-range references untouched instead of throwing.
            return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
        }
        const decoded = XML_ENTITIES.get(ref);
        return decoded === undefined ? match : decoded;
    });
}

/** Collects the still-encoded contents of every `<w:t>` element in `xml`, in document order. */
function textRuns(xml: string): string[] {
    // Requires whitespace or `>` right after `<w:t`, so that other tags sharing
    // the prefix (`<w:tbl`, `<w:tr`, `<w:tc`, `<w:tab/>`) are not mistaken for text.
    const runPattern = /<w:t(?:\s[^>]*)?>([^<]*)<\/w:t>/g;
    const runs: string[] = [];
    let match: RegExpExecArray | null;
    while ((match = runPattern.exec(xml)) !== null) {
        runs.push(match[1]);
    }
    return runs;
}

/**
 * Reads the document at `path` and returns the text of all its `<w:t>`
 * elements, concatenated without separators.
 *
 * @param path location of the document on disk
 * @returns the concatenated, entity-decoded text
 * @throws rejects with whatever `open` rejects with
 */
export async function extract(path: string): Promise<string> {
    const contents = await open(path);
    let body = "";
    for (const run of textRuns(contents)) {
        console.log(run);
        body += decodeXmlEntities(run);
    }
    return body;
}

/**
 * Extracts the text of every `.doc` / `.docx` file in the `source` directory
 * next to this script.
 *
 * @returns one string per candidate file, or an empty array when any
 *          extraction fails
 * @throws rejects when the `source` directory cannot be listed
 */
async function parse(): Promise<string[]> {
    const sourceDirectory = resolve(__dirname, "source");
    // NOTE(review): legacy `.doc` files are presumably not zip archives, in which
    // case `open` rejects for them and the whole batch yields [] — confirm
    // whether they should be filtered out here.
    const candidates = readdirSync(sourceDirectory)
        .filter(file => file.endsWith(".doc") || file.endsWith(".docx"))
        .map(file => join(sourceDirectory, file));
    try {
        // `await` is required: without it a rejection bypasses this `catch`.
        return await Promise.all(candidates.map(candidate => extract(candidate)));
    } catch (error) {
        console.error("Unable to extract text from the source documents:", error);
        return [];
    }
}

parse().catch(error => console.error("Scraping failed:", error));
\ No newline at end of file |