diff options
author | Sam Wilkins <samwilkins333@gmail.com> | 2020-01-19 13:22:26 -0500 |
---|---|---|
committer | Sam Wilkins <samwilkins333@gmail.com> | 2020-01-19 13:22:26 -0500 |
commit | 3336d663bc04c602d652ebc1f63a26df68ff92c4 (patch) | |
tree | 801f3097f84f01c3bfbd56e01be66d854fb9f68c | |
parent | aff9cc02750eb032ade98d77cf9ff45677063fc8 (diff) |
added node scraper
-rw-r--r-- | package-lock.json | 5 | ||||
-rw-r--r-- | package.json | 1 | ||||
-rw-r--r-- | src/scraping/buxton/node_scraper.ts | 57 |
3 files changed, 63 insertions, 0 deletions
diff --git a/package-lock.json b/package-lock.json index 474084fe7..5119494c6 100644 --- a/package-lock.json +++ b/package-lock.json @@ -8711,6 +8711,11 @@ } } }, + "node-stream-zip": { + "version": "1.9.1", + "resolved": "https://registry.npmjs.org/node-stream-zip/-/node-stream-zip-1.9.1.tgz", + "integrity": "sha512-7/Xs9gkuYF0WBimz5OrSc6UVKLDTxvBG2yLGtEK8PSx94d86o/6iQLvIe/140ATz35JDqHKWIxh3GcA3u5hB0w==" + }, "nodemailer": { "version": "5.1.1", "resolved": "https://registry.npmjs.org/nodemailer/-/nodemailer-5.1.1.tgz", diff --git a/package.json b/package.json index 6a82cbdcb..5077e7118 100644 --- a/package.json +++ b/package.json @@ -172,6 +172,7 @@ "mongodb": "^3.4.1", "mongoose": "^5.8.4", "node-sass": "^4.13.0", + "node-stream-zip": "^1.9.1", "nodemailer": "^5.1.1", "nodemon": "^1.19.4", "normalize.css": "^8.0.1", diff --git a/src/scraping/buxton/node_scraper.ts b/src/scraping/buxton/node_scraper.ts index e69de29bb..ef1d989d4 100644 --- a/src/scraping/buxton/node_scraper.ts +++ b/src/scraping/buxton/node_scraper.ts @@ -0,0 +1,57 @@ +import { readdirSync } from "fs"; +import { resolve } from "path"; + +const StreamZip = require('node-stream-zip'); + +export async function open(path: string) { + const zip = new StreamZip({ + file: path, + storeEntries: true + }); + return new Promise<string>((resolve, reject) => { + zip.on('ready', () => { + console.log("READY!", zip.entriesCount); + for (const entry of Object.values(zip.entries()) as any[]) { + const desc = entry.isDirectory ? 'directory' : `${entry.size} bytes`; + console.log(`Entry ${entry.name}: ${desc}`); + } + let body = ""; + zip.stream("word/document.xml", (error: any, stream: any) => { + if (error) { + reject(error); + } + stream.on('data', (chunk: any) => body += chunk.toString()); + stream.on('end', () => { + resolve(body); + zip.close(); + }); + }); + }); + }); +} + +export async function extract(path: string) { + const contents = await open(path); + let body = ""; + const components = contents.toString().split('<w:t'); + for (const component of components) { + const tags = component.split('>'); + console.log(tags[1]); + const content = tags[1].replace(/<.*$/, ""); + body += content; + } + return body; +} + +async function parse(): Promise<string[]> { + const sourceDirectory = resolve(`${__dirname}/source`); + const candidates = readdirSync(sourceDirectory).filter(file => file.endsWith(".doc") || file.endsWith(".docx")).map(file => `${sourceDirectory}/${file}`); + await extract(candidates[0]); + try { + return Promise.all(candidates.map(extract)); + } catch { + return []; + } +} + +parse();
\ No newline at end of file |