aboutsummaryrefslogtreecommitdiff
path: root/src/scraping
diff options
context:
space:
mode:
Diffstat (limited to 'src/scraping')
-rw-r--r--src/scraping/buxton/node_scraper.ts57
1 files changed, 57 insertions, 0 deletions
diff --git a/src/scraping/buxton/node_scraper.ts b/src/scraping/buxton/node_scraper.ts
index e69de29bb..ef1d989d4 100644
--- a/src/scraping/buxton/node_scraper.ts
+++ b/src/scraping/buxton/node_scraper.ts
@@ -0,0 +1,57 @@
+import { readdirSync } from "fs";
+import { resolve } from "path";
+
+const StreamZip = require('node-stream-zip');
+
+export async function open(path: string) {
+ const zip = new StreamZip({
+ file: path,
+ storeEntries: true
+ });
+ return new Promise<string>((resolve, reject) => {
+ zip.on('ready', () => {
+ console.log("READY!", zip.entriesCount);
+ for (const entry of Object.values(zip.entries()) as any[]) {
+ const desc = entry.isDirectory ? 'directory' : `${entry.size} bytes`;
+ console.log(`Entry ${entry.name}: ${desc}`);
+ }
+ let body = "";
+ zip.stream("word/document.xml", (error: any, stream: any) => {
+ if (error) {
+ reject(error);
+ }
+ stream.on('data', (chunk: any) => body += chunk.toString());
+ stream.on('end', () => {
+ resolve(body);
+ zip.close();
+ });
+ });
+ });
+ });
+}
+
+export async function extract(path: string) {
+ const contents = await open(path);
+ let body = "";
+ const components = contents.toString().split('<w:t');
+ for (const component of components) {
+ const tags = component.split('>');
+ console.log(tags[1]);
+ const content = tags[1].replace(/<.*$/, "");
+ body += content;
+ }
+ return body;
+}
+
+async function parse(): Promise<string[]> {
+ const sourceDirectory = resolve(`${__dirname}/source`);
+ const candidates = readdirSync(sourceDirectory).filter(file => file.endsWith(".doc") || file.endsWith(".docx")).map(file => `${sourceDirectory}/${file}`);
+ await extract(candidates[0]);
+ try {
+ return Promise.all(candidates.map(extract));
+ } catch {
+ return [];
+ }
+}
+
+parse(); \ No newline at end of file