about summary refs log tree commit diff
path: root/src/scraping/buxton/node_scraper.ts
diff options
context:
space:
mode:
author yipstanley <stanley_yip@brown.edu> 2020-02-29 14:18:43 -0500
committer yipstanley <stanley_yip@brown.edu> 2020-02-29 14:18:43 -0500
commit 2f6e27c67d1790d4350eede3003f0b614460f4d1 (patch)
tree ef5e70925b8cdeb8229af849e33e6f3a4cceae7f /src/scraping/buxton/node_scraper.ts
parent f1fcbeea5fb103b7623e795e72aacd4dfacc6c70 (diff)
parent 640f14da28d97600fb32d09023fc932e3a4052c4 (diff)
Merge branch 'master' of https://github.com/browngraphicslab/Dash-Web into pen
Diffstat (limited to 'src/scraping/buxton/node_scraper.ts')
-rw-r--r-- src/scraping/buxton/node_scraper.ts 57
1 files changed, 0 insertions, 57 deletions
diff --git a/src/scraping/buxton/node_scraper.ts b/src/scraping/buxton/node_scraper.ts
deleted file mode 100644
index ef1d989d4..000000000
--- a/src/scraping/buxton/node_scraper.ts
+++ /dev/null
@@ -1,57 +0,0 @@
-import { readdirSync } from "fs";
-import { resolve } from "path";
-
-const StreamZip = require('node-stream-zip');
-
-/**
- * Opens the zip archive at `path` and resolves with the text of its
- * "word/document.xml" entry.
- *
- * Once the archive is ready, its entry count and a one-line description of
- * every entry are written to the console. The promise rejects when the archive
- * reports an error, when the entry cannot be streamed, or when the entry
- * stream fails part-way through. The archive is closed on every one of those
- * paths as well as on success.
- *
- * @param path location of the archive on disk
- * @returns the bytes of word/document.xml decoded with Buffer's default (UTF-8) encoding
- */
-export async function open(path: string): Promise<string> {
-    const zip = new StreamZip({
-        file: path,
-        storeEntries: true
-    });
-    return new Promise<string>((resolve, reject) => {
-        // Close the archive before reporting a failure so the file handle is released.
-        const fail = (error: unknown) => {
-            zip.close();
-            reject(error);
-        };
-        // Route archive-level failures (reported through the 'error' event) into the promise.
-        zip.on('error', fail);
-        zip.on('ready', () => {
-            console.log("READY!", zip.entriesCount);
-            for (const entry of Object.values(zip.entries()) as any[]) {
-                const desc = entry.isDirectory ? 'directory' : `${entry.size} bytes`;
-                console.log(`Entry ${entry.name}: ${desc}`);
-            }
-            zip.stream("word/document.xml", (error: any, stream: any) => {
-                if (error) {
-                    // The stream is unusable on error, so stop here instead of attaching listeners to it.
-                    return fail(error);
-                }
-                // Collect raw chunks and decode once, so a multi-byte character that
-                // straddles two chunks is not corrupted.
-                const chunks: Buffer[] = [];
-                stream.on('data', (chunk: any) => chunks.push(Buffer.from(chunk)));
-                stream.on('error', fail);
-                stream.on('end', () => {
-                    resolve(Buffer.concat(chunks).toString());
-                    zip.close();
-                });
-            });
-        });
-    });
-}
-
-/**
- * Reads the document XML of the archive at `path` (via `open`) and returns the
- * concatenated contents of its `<w:t>` elements.
- *
- * Only `<w:t>` elements are matched, with or without attributes. Other tags
- * whose names merely start with `w:t` (for example `w:tbl`, `w:tc`, `w:tab`)
- * are ignored, and so is everything that precedes the first text element.
- * XML entities are returned exactly as written in the document; they are not
- * decoded.
- *
- * @param path location of the archive on disk
- * @returns the text runs joined in document order, or "" when there are none
- */
-export async function extract(path: string) {
-    const contents = await open(path);
-    // Built per call: a global regex carries `lastIndex` state between exec() calls.
-    const textRun = /<w:t(?:\s[^>]*)?>([^<]*)<\/w:t>/g;
-    let body = "";
-    let match: RegExpExecArray | null;
-    while ((match = textRun.exec(contents)) !== null) {
-        body += match[1];
-    }
-    return body;
-}
-
-/**
- * Extracts the text of every ".doc" / ".docx" file found in the `source`
- * directory that sits next to this module.
- *
- * All files are extracted concurrently. If any extraction fails, the error is
- * logged and an empty array is returned. A failure to list the directory
- * itself is not caught here and rejects the returned promise.
- *
- * @returns one extracted string per candidate file, in directory-listing order
- */
-async function parse(): Promise<string[]> {
-    const sourceDirectory = resolve(__dirname, "source");
-    const candidates = readdirSync(sourceDirectory)
-        .filter(file => file.endsWith(".doc") || file.endsWith(".docx"))
-        .map(file => resolve(sourceDirectory, file));
-    try {
-        // The await is required: without it a rejection escapes the try/catch.
-        return await Promise.all(candidates.map(candidate => extract(candidate)));
-    } catch (error) {
-        console.error("Unable to extract text from the source documents:", error);
-        return [];
-    }
-}
-
-// Run the scraper as soon as the module is loaded; log a failure instead of
-// leaving the returned promise's rejection unhandled.
-parse().catch(error => console.error(error));
\ No newline at end of file