aboutsummaryrefslogtreecommitdiff
path: root/src/scraping/buxton/node_scraper.ts
diff options
context:
space:
mode:
Diffstat (limited to 'src/scraping/buxton/node_scraper.ts')
-rw-r--r--src/scraping/buxton/node_scraper.ts57
1 files changed, 0 insertions, 57 deletions
diff --git a/src/scraping/buxton/node_scraper.ts b/src/scraping/buxton/node_scraper.ts
deleted file mode 100644
index ef1d989d4..000000000
--- a/src/scraping/buxton/node_scraper.ts
+++ /dev/null
@@ -1,57 +0,0 @@
-import { readdirSync } from "fs";
-import { resolve } from "path";
-
-const StreamZip = require('node-stream-zip');
-
/**
 * Opens the zip archive at `path` and reads its `word/document.xml` entry
 * into a single string.
 *
 * The entry count and a one-line description of every archive entry are
 * logged to the console once the archive is ready.
 *
 * @param path Location of the archive on disk.
 * @returns A promise that resolves with the full text of `word/document.xml`.
 *   It rejects when the archive emits an error, when the entry cannot be
 *   opened, or when the entry stream fails while being read. The archive is
 *   closed in every case.
 */
export async function open(path: string): Promise<string> {
    const zip = new StreamZip({
        file: path,
        storeEntries: true
    });
    return new Promise<string>((resolve, reject) => {
        // Single failure path: release the archive, then report the error.
        const fail = (error: unknown) => {
            zip.close();
            reject(error);
        };

        // Without this listener an unreadable archive would never settle the promise.
        zip.on('error', fail);

        zip.on('ready', () => {
            console.log("READY!", zip.entriesCount);
            for (const entry of Object.values(zip.entries()) as any[]) {
                const desc = entry.isDirectory ? 'directory' : `${entry.size} bytes`;
                console.log(`Entry ${entry.name}: ${desc}`);
            }
            zip.stream("word/document.xml", (error: any, stream: any) => {
                if (error || !stream) {
                    // Stop here: there is no stream to attach listeners to.
                    fail(error || new Error(`Unable to stream word/document.xml from ${path}`));
                    return;
                }
                let body = "";
                // Let the stream decode UTF-8 so a multi-byte character split
                // across two chunks is not corrupted.
                stream.setEncoding('utf8');
                stream.on('data', (chunk: string) => body += chunk);
                stream.on('error', fail);
                stream.on('end', () => {
                    resolve(body);
                    zip.close();
                });
            });
        });
    });
}
-
/**
 * Reads the document XML of the file at `path` (via `open`) and returns the
 * concatenated contents of its `<w:t>` elements.
 *
 * Only `<w:t>` and `<w:t …attributes>` elements contribute to the result;
 * other tags whose names merely begin with the same characters (for example
 * `<w:tbl>`, `<w:tc>` or `<w:tab/>`) are ignored, as is everything before the
 * first text element. The captured text is returned exactly as it appears in
 * the XML: no separators are inserted between elements and character
 * entities such as `&amp;` are not decoded.
 *
 * @param path Location of the document on disk.
 * @returns The concatenated text, or an empty string when the XML contains
 *   no `<w:t>` elements.
 */
export async function extract(path: string): Promise<string> {
    const contents = await open(path);
    // Created per call because a global regex carries `lastIndex` state.
    const textElement = /<w:t(?:\s[^>]*)?>([^<]*)<\/w:t>/g;
    let body = "";
    let match: RegExpExecArray | null;
    while ((match = textElement.exec(contents)) !== null) {
        body += match[1];
    }
    return body;
}
-
/**
 * Extracts the text of every `.doc` / `.docx` file found in the `source`
 * directory next to this module.
 *
 * All files are extracted concurrently. If any single extraction fails, the
 * failure is logged and an empty array is returned instead of partial results.
 *
 * NOTE(review): `extract` reads a zip entry, so legacy binary `.doc` files
 * presumably cannot be processed and would cause the whole batch to fall back
 * to `[]` — confirm whether `.doc` should remain in the filter.
 *
 * @returns One extracted string per matching file, in directory-listing
 *   order; `[]` when no file matches or when an extraction fails.
 * @throws Rejects if the source directory itself cannot be listed.
 */
async function parse(): Promise<string[]> {
    const sourceDirectory = resolve(`${__dirname}/source`);
    const candidates = readdirSync(sourceDirectory)
        .filter(file => file.endsWith(".doc") || file.endsWith(".docx"))
        .map(file => resolve(sourceDirectory, file));
    try {
        // `await` is required here: without it a rejection escapes the try block.
        return await Promise.all(candidates.map(file => extract(file)));
    } catch (error) {
        console.error("Failed to extract document text:", error);
        return [];
    }
}
-
// Run the scraper when this module is loaded. The promise is deliberately not
// awaited, so attach a handler to keep a failure from becoming an unhandled rejection.
void parse().catch(error => console.error("Scraping failed:", error));