From 3336d663bc04c602d652ebc1f63a26df68ff92c4 Mon Sep 17 00:00:00 2001
From: Sam Wilkins <samwilkins333@gmail.com>
Date: Sun, 19 Jan 2020 13:22:26 -0500
Subject: added node scraper

---
 src/scraping/buxton/node_scraper.ts | 57 +++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

(limited to 'src')
diff --git a/src/scraping/buxton/node_scraper.ts b/src/scraping/buxton/node_scraper.ts
index e69de29bb..ef1d989d4 100644
--- a/src/scraping/buxton/node_scraper.ts
+++ b/src/scraping/buxton/node_scraper.ts
@@ -0,0 +1,57 @@
+import { readdirSync } from "fs";
+import { resolve } from "path";
+
+const StreamZip = require('node-stream-zip');
+
+/**
+ * Resolves with the text of `word/document.xml` from the zip archive at `path`.
+ * Rejects (rather than hanging or throwing inside a callback) on archive, entry
+ * or stream errors, and decodes all chunks at once so multi-byte characters survive.
+ */
+export async function open(path: string) {
+    const zip = new StreamZip({ file: path, storeEntries: true });
+    return new Promise<string>((fulfill, reject) => {
+        // Best-effort close before rejecting; guarded in case close() throws on an archive that never opened.
+        const fail = (error: any) => { try { zip.close(); } catch { /* ignored */ } reject(error); };
+        zip.on('error', fail);
+        zip.on('ready', () => {
+            console.log("READY!", zip.entriesCount);
+            for (const entry of Object.values(zip.entries()) as any[]) {
+                console.log(`Entry ${entry.name}: ${entry.isDirectory ? 'directory' : `${entry.size} bytes`}`);
+            }
+            zip.stream("word/document.xml", (error: any, stream: any) => {
+                if (error) { fail(error); return; }
+                const chunks: Buffer[] = [];
+                stream.on('error', fail);
+                stream.on('data', (chunk: any) => chunks.push(Buffer.from(chunk)));
+                stream.on('end', () => { zip.close(); fulfill(Buffer.concat(chunks).toString()); });
+            });
+        });
+    });
+}
+
+/** Resolves with the text of every `<w:t>` run in the document at `path`, joined in order with XML entities decoded. */
+export async function extract(path: string) {
+    const contents = await open(path);
+    // Whitespace or `>` must follow `<w:t`, so `<w:tbl>`, `<w:tc>`, `<w:tab/>` etc. are not mistaken for text runs.
+    const runs = /<w:t(?:\s[^>]*)?>([^<]*)<\/w:t>/g;
+    const entities: { [name: string]: string } = { amp: "&", lt: "<", gt: ">", quot: '"', apos: "'" };
+    let body = "";
+    for (let run = runs.exec(contents); run !== null; run = runs.exec(contents)) {
+        body += run[1].replace(/&(amp|lt|gt|quot|apos);/g, (_, name: string) => entities[name]);
+    }
+    return body;
+}
+
+/** Resolves with the extracted text of every .doc/.docx file in the `source` directory next to this module;
+ *  resolves with [] instead of rejecting when the directory cannot be listed or any extraction fails. */
+async function parse(): Promise<string[]> {
+    try {
+        const sourceDirectory = resolve(`${__dirname}/source`);
+        const candidates = readdirSync(sourceDirectory).filter(file => file.endsWith(".doc") || file.endsWith(".docx"));
+        // Awaited so a rejected extraction is caught below; a bare `return Promise.all(...)` would bypass the catch.
+        return await Promise.all(candidates.map(file => extract(`${sourceDirectory}/${file}`)));
+    } catch { return []; }
+}
+
+parse().catch(console.error); // Module entry point: report a failed run instead of leaving an unhandled rejection.
\ No newline at end of file
-- 
cgit v1.2.3-70-g09d2