diff options
author | yipstanley <stanley_yip@brown.edu> | 2020-02-29 14:18:43 -0500 |
---|---|---|
committer | yipstanley <stanley_yip@brown.edu> | 2020-02-29 14:18:43 -0500 |
commit | 2f6e27c67d1790d4350eede3003f0b614460f4d1 (patch) | |
tree | ef5e70925b8cdeb8229af849e33e6f3a4cceae7f /src/scraping/buxton/node_scraper.ts | |
parent | f1fcbeea5fb103b7623e795e72aacd4dfacc6c70 (diff) | |
parent | 640f14da28d97600fb32d09023fc932e3a4052c4 (diff) |
Merge branch 'master' of https://github.com/browngraphicslab/Dash-Web into pen
Diffstat (limited to 'src/scraping/buxton/node_scraper.ts')
-rw-r--r-- | src/scraping/buxton/node_scraper.ts | 57 |
1 files changed, 0 insertions, 57 deletions
```diff
diff --git a/src/scraping/buxton/node_scraper.ts b/src/scraping/buxton/node_scraper.ts
deleted file mode 100644
index ef1d989d4..000000000
--- a/src/scraping/buxton/node_scraper.ts
+++ /dev/null
@@ -1,57 +0,0 @@
-import { readdirSync } from "fs";
-import { resolve } from "path";
-
-const StreamZip = require('node-stream-zip');
-
-export async function open(path: string) {
-    const zip = new StreamZip({
-        file: path,
-        storeEntries: true
-    });
-    return new Promise<string>((resolve, reject) => {
-        zip.on('ready', () => {
-            console.log("READY!", zip.entriesCount);
-            for (const entry of Object.values(zip.entries()) as any[]) {
-                const desc = entry.isDirectory ? 'directory' : `${entry.size} bytes`;
-                console.log(`Entry ${entry.name}: ${desc}`);
-            }
-            let body = "";
-            zip.stream("word/document.xml", (error: any, stream: any) => {
-                if (error) {
-                    reject(error);
-                }
-                stream.on('data', (chunk: any) => body += chunk.toString());
-                stream.on('end', () => {
-                    resolve(body);
-                    zip.close();
-                });
-            });
-        });
-    });
-}
-
-export async function extract(path: string) {
-    const contents = await open(path);
-    let body = "";
-    const components = contents.toString().split('<w:t');
-    for (const component of components) {
-        const tags = component.split('>');
-        console.log(tags[1]);
-        const content = tags[1].replace(/<.*$/, "");
-        body += content;
-    }
-    return body;
-}
-
-async function parse(): Promise<string[]> {
-    const sourceDirectory = resolve(`${__dirname}/source`);
-    const candidates = readdirSync(sourceDirectory).filter(file => file.endsWith(".doc") || file.endsWith(".docx")).map(file => `${sourceDirectory}/${file}`);
-    await extract(candidates[0]);
-    try {
-        return Promise.all(candidates.map(extract));
-    } catch {
-        return [];
-    }
-}
-
-parse();
\ No newline at end of file
```