/*
 * Recovered from a cgit patch export (cgit v1.2.3-70-g09d2).
 *
 * Commit:  3336d663bc04c602d652ebc1f63a26df68ff92c4
 * Author:  Sam Wilkins
 * Date:    Sun, 19 Jan 2020 13:22:26 -0500
 * Subject: added node scraper
 * File:    src/scraping/buxton/node_scraper.ts (new file, index e69de29bb..ef1d989d4)
 */
import { readdirSync } from "fs";
import { resolve } from "path";

/** The subset of a node-stream-zip entry that this module reads. */
interface ZipEntry {
    name: string;
    size: number;
    isDirectory: boolean;
}

/**
 * Opens the zip archive at `path` and reads its `word/document.xml` entry.
 *
 * @param path location of the archive on disk
 * @returns the full text of `word/document.xml`, decoded as UTF-8
 * @throws (as a rejection) when the archive cannot be read, when the entry is
 * missing, or when the entry stream fails
 */
export async function open(path: string): Promise<string> {
    // node-stream-zip is loaded with `require`, as in the original file. It is
    // loaded here, on first use, so the module's pure helpers stay importable
    // when the package is not installed.
    const StreamZip = require('node-stream-zip');
    const zip = new StreamZip({
        file: path,
        storeEntries: true
    });
    // The executor parameters are deliberately not called `resolve`, which
    // would shadow the `resolve` imported from "path".
    return new Promise<string>((fulfill, fail) => {
        // Single failure path: settle the promise first, then release the archive.
        const abort = (error: unknown) => {
            fail(error);
            zip.close();
        };
        // Without this listener an unreadable or non-zip file would leave the
        // promise pending forever.
        zip.on('error', abort);
        zip.on('ready', () => {
            console.log("READY!", zip.entriesCount);
            for (const entry of Object.values(zip.entries()) as ZipEntry[]) {
                const desc = entry.isDirectory ? 'directory' : `${entry.size} bytes`;
                console.log(`Entry ${entry.name}: ${desc}`);
            }
            zip.stream("word/document.xml", (error: unknown, stream: any) => {
                if (error) {
                    // Stop here: `stream` is not usable when an error is reported.
                    abort(error);
                    return;
                }
                let body = "";
                // Let the stream decode UTF-8 so that a multi-byte character
                // split across two chunks is not corrupted.
                stream.setEncoding('utf8');
                stream.on('data', (chunk: string) => body += chunk);
                stream.on('error', abort);
                stream.on('end', () => {
                    fulfill(body);
                    zip.close();
                });
            });
        });
    });
}

/**
 * Concatenates the contents of every `<w:t>` element found in `documentXml`.
 *
 * Only `<w:t>` itself (with or without attributes) is matched; other tags that
 * merely start with the same letters, such as `<w:tbl>`, `<w:tc>` or
 * `<w:tab/>`, are ignored. Text is returned exactly as it appears in the XML;
 * entity references are not decoded.
 */
function collectText(documentXml: string): string {
    const textRun = /<w:t(?:\s[^>]*)?>([^<]*)<\/w:t>/g;
    const runs: string[] = [];
    let match: RegExpExecArray | null;
    while ((match = textRun.exec(documentXml)) !== null) {
        runs.push(match[1]);
    }
    return runs.join("");
}

/**
 * Reads the document at `path` and returns its text content.
 *
 * NOTE(review): the loop header of this function was lost from the patch text
 * this file was recovered from. The implementation assumes the intent was to
 * gather the contents of the `<w:t>` elements of `word/document.xml` — confirm
 * against the repository.
 *
 * @param path location of the document on disk
 * @returns the concatenated text runs, or an empty string if there are none
 */
export async function extract(path: string): Promise<string> {
    const contents = await open(path);
    return collectText(contents);
}

/**
 * Extracts the text of every `.doc` / `.docx` file in the `source` directory
 * next to this module.
 *
 * @returns one string per candidate file, in directory-listing order, or an
 * empty array when the directory cannot be read or any extraction fails
 */
async function parse(): Promise<string[]> {
    try {
        const sourceDirectory = resolve(__dirname, "source");
        // NOTE(review): `open` treats every candidate as a zip archive. Legacy
        // `.doc` files are presumably not zip archives, in which case a single
        // one would empty the whole result — confirm whether they belong here.
        const candidates = readdirSync(sourceDirectory)
            .filter(file => file.endsWith(".doc") || file.endsWith(".docx"))
            .map(file => resolve(sourceDirectory, file));
        // `await` is required for the surrounding `catch` to see rejections.
        return await Promise.all(candidates.map(candidate => extract(candidate)));
    } catch (error) {
        console.error("Unable to scrape the source directory:", error);
        return [];
    }
}

parse().catch(error => console.error("Scrape failed:", error));