about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Sam Wilkins <samwilkins333@gmail.com> 2020-01-19 13:22:26 -0500
committer: Sam Wilkins <samwilkins333@gmail.com> 2020-01-19 13:22:26 -0500
commit: 3336d663bc04c602d652ebc1f63a26df68ff92c4 (patch)
tree: 801f3097f84f01c3bfbd56e01be66d854fb9f68c
parent: aff9cc02750eb032ade98d77cf9ff45677063fc8 (diff)
added node scraper
-rw-r--r-- package-lock.json 5
-rw-r--r-- package.json 1
-rw-r--r-- src/scraping/buxton/node_scraper.ts 57
3 files changed, 63 insertions, 0 deletions
diff --git a/package-lock.json b/package-lock.json
index 474084fe7..5119494c6 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -8711,6 +8711,11 @@
}
}
},
+ "node-stream-zip": {
+ "version": "1.9.1",
+ "resolved": "https://registry.npmjs.org/node-stream-zip/-/node-stream-zip-1.9.1.tgz",
+ "integrity": "sha512-7/Xs9gkuYF0WBimz5OrSc6UVKLDTxvBG2yLGtEK8PSx94d86o/6iQLvIe/140ATz35JDqHKWIxh3GcA3u5hB0w=="
+ },
"nodemailer": {
"version": "5.1.1",
"resolved": "https://registry.npmjs.org/nodemailer/-/nodemailer-5.1.1.tgz",
diff --git a/package.json b/package.json
index 6a82cbdcb..5077e7118 100644
--- a/package.json
+++ b/package.json
@@ -172,6 +172,7 @@
"mongodb": "^3.4.1",
"mongoose": "^5.8.4",
"node-sass": "^4.13.0",
+ "node-stream-zip": "^1.9.1",
"nodemailer": "^5.1.1",
"nodemon": "^1.19.4",
"normalize.css": "^8.0.1",
diff --git a/src/scraping/buxton/node_scraper.ts b/src/scraping/buxton/node_scraper.ts
index e69de29bb..ef1d989d4 100644
--- a/src/scraping/buxton/node_scraper.ts
+++ b/src/scraping/buxton/node_scraper.ts
@@ -0,0 +1,57 @@
+import { readdirSync } from "fs";
+import { resolve } from "path";
+
+const StreamZip = require('node-stream-zip');
+
+/**
+ * Opens the ZIP archive at `path` and reads its `word/document.xml` entry into a string.
+ *
+ * @param path location on disk of the archive to read
+ * @returns a promise for the complete contents of `word/document.xml`, decoded as UTF-8
+ * @throws the returned promise rejects when the archive cannot be read, when the entry
+ * cannot be opened, or when the entry stream reports an error
+ */
+export async function open(path: string): Promise<string> {
+    const zip = new StreamZip({
+        file: path,
+        storeEntries: true
+    });
+    // The settle callbacks are deliberately not called `resolve`, which would shadow path's `resolve`.
+    return new Promise<string>((fulfill, reject) => {
+        // Release the archive before reporting any failure.
+        const fail = (error: unknown) => {
+            zip.close();
+            reject(error);
+        };
+        // Without this listener a missing or corrupt archive would never settle the promise.
+        zip.on('error', fail);
+        zip.on('ready', () => {
+            console.log("READY!", zip.entriesCount);
+            for (const entry of Object.values(zip.entries()) as any[]) {
+                const desc = entry.isDirectory ? 'directory' : `${entry.size} bytes`;
+                console.log(`Entry ${entry.name}: ${desc}`);
+            }
+            // Collect raw chunks and decode once at the end: decoding chunk by chunk can
+            // split a multi-byte UTF-8 character that straddles a chunk boundary.
+            const chunks: Buffer[] = [];
+            zip.stream("word/document.xml", (error: any, stream: any) => {
+                if (error) {
+                    // Stop here; there is no stream to attach listeners to.
+                    fail(error);
+                    return;
+                }
+                stream.on('error', fail);
+                stream.on('data', (chunk: any) => chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk)));
+                stream.on('end', () => {
+                    fulfill(Buffer.concat(chunks).toString());
+                    zip.close();
+                });
+            });
+        });
+    });
+}
+
+/**
+ * Replaces the five predefined XML entities and numeric character references in `text`
+ * with the characters they stand for. Unrecognized or out-of-range references are left as is.
+ */
+function decodeXmlEntities(text: string): string {
+    const named: { [name: string]: string } = { lt: "<", gt: ">", amp: "&", quot: '"', apos: "'" };
+    // A single pass avoids double-decoding input such as `&amp;lt;`.
+    return text.replace(/&(#x[0-9a-fA-F]+|#[0-9]+|lt|gt|amp|quot|apos);/g, (whole: string, ref: string) => {
+        if (!ref.startsWith("#")) {
+            return named[ref];
+        }
+        const code = ref.startsWith("#x") ? parseInt(ref.slice(2), 16) : parseInt(ref.slice(1), 10);
+        return code <= 0x10FFFF ? String.fromCodePoint(code) : whole;
+    });
+}
+
+/**
+ * Reads the document at `path` via `open` and concatenates the text of its `<w:t>` elements.
+ *
+ * @param path location on disk of the document to read
+ * @returns the text of every `<w:t>` element in document order, joined with no separator
+ * and with XML entities decoded; an empty string when there are none
+ * @throws the returned promise rejects when `open` rejects
+ */
+export async function extract(path: string): Promise<string> {
+    const contents = await open(path);
+    // Match only real text elements: `<w:t>` or `<w:t attr="...">`. A plain split on
+    // "<w:t" would also hit `<w:tbl>`, `<w:tc>`, `<w:tr>` and `<w:tab/>`, and would treat
+    // whatever precedes the first match as if it were text.
+    const textRun = /<w:t(?:\s[^>]*)?>([^<]*)<\/w:t>/g;
+    let body = "";
+    for (let match = textRun.exec(contents); match !== null; match = textRun.exec(contents)) {
+        body += decodeXmlEntities(match[1]);
+    }
+    return body;
+}
+
+/**
+ * Runs `extract` over every `.doc` / `.docx` file in the `source` directory that sits
+ * next to this module.
+ *
+ * @returns one extracted string per matching file, in directory-listing order; an empty
+ * array when the directory cannot be read or any single extraction fails (the error is logged)
+ */
+async function parse(): Promise<string[]> {
+    try {
+        const sourceDirectory = resolve(__dirname, "source");
+        // NOTE(review): legacy `.doc` files are presumably not ZIP archives, in which case
+        // `open` will reject for them and the whole batch yields [] — confirm whether
+        // they should be filtered out here.
+        const candidates = readdirSync(sourceDirectory)
+            .filter(file => file.endsWith(".doc") || file.endsWith(".docx"))
+            .map(file => resolve(sourceDirectory, file));
+        // `await` is required: returning the bare promise would leave the try block
+        // before it settles, so the catch below could never see a rejection.
+        return await Promise.all(candidates.map(candidate => extract(candidate)));
+    } catch (error) {
+        // Best effort: report the failure and fall back to an empty result.
+        console.error(error);
+        return [];
+    }
+}
+
+// Run the scraper as soon as this module is loaded. parse() returns a promise, so attach
+// a rejection handler instead of leaving it floating.
+parse().catch((error: unknown) => console.error(error)); \ No newline at end of file