From aff9cc02750eb032ade98d77cf9ff45677063fc8 Mon Sep 17 00:00:00 2001
From: Sam Wilkins
Date: Sun, 19 Jan 2020 02:36:37 -0500
Subject: from field changes and scraping start

---
 package.json | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'package.json')

diff --git a/package.json b/package.json
index be1d3a9fe..6a82cbdcb 100644
--- a/package.json
+++ b/package.json
@@ -114,6 +114,7 @@
     "@types/typescript": "^2.0.0",
     "@types/uuid": "^3.4.6",
     "@types/webpack": "^4.41.1",
+    "@types/word-extractor": "^0.3.0",
     "@types/youtube": "0.0.38",
     "adm-zip": "^0.4.13",
     "archiver": "^3.1.1",
@@ -229,6 +230,7 @@
     "url-loader": "^1.1.2",
     "uuid": "^3.3.3",
     "wikijs": "^6.0.1",
+    "word-extractor": "^0.3.0",
     "words-to-numbers": "^1.5.1",
     "xoauth2": "^1.2.0",
     "youtube": "^0.1.0"
--
cgit v1.2.3-70-g09d2


From 3336d663bc04c602d652ebc1f63a26df68ff92c4 Mon Sep 17 00:00:00 2001
From: Sam Wilkins
Date: Sun, 19 Jan 2020 13:22:26 -0500
Subject: added node scraper

---
 package-lock.json                   |  5 ++++
 package.json                        |  1 +
 src/scraping/buxton/node_scraper.ts | 57 +++++++++++++++++++++++++++++++++++++
 3 files changed, 63 insertions(+)

(limited to 'package.json')

diff --git a/package-lock.json b/package-lock.json
index 474084fe7..5119494c6 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -8711,6 +8711,11 @@
         }
       }
     },
+    "node-stream-zip": {
+      "version": "1.9.1",
+      "resolved": "https://registry.npmjs.org/node-stream-zip/-/node-stream-zip-1.9.1.tgz",
+      "integrity": "sha512-7/Xs9gkuYF0WBimz5OrSc6UVKLDTxvBG2yLGtEK8PSx94d86o/6iQLvIe/140ATz35JDqHKWIxh3GcA3u5hB0w=="
+    },
     "nodemailer": {
       "version": "5.1.1",
       "resolved": "https://registry.npmjs.org/nodemailer/-/nodemailer-5.1.1.tgz",
diff --git a/package.json b/package.json
index 6a82cbdcb..5077e7118 100644
--- a/package.json
+++ b/package.json
@@ -172,6 +172,7 @@
     "mongodb": "^3.4.1",
     "mongoose": "^5.8.4",
     "node-sass": "^4.13.0",
+    "node-stream-zip": "^1.9.1",
     "nodemailer": "^5.1.1",
     "nodemon": "^1.19.4",
     "normalize.css": "^8.0.1",
diff --git a/src/scraping/buxton/node_scraper.ts b/src/scraping/buxton/node_scraper.ts
index e69de29bb..ef1d989d4 100644
--- a/src/scraping/buxton/node_scraper.ts
+++ b/src/scraping/buxton/node_scraper.ts
@@ -0,0 +1,57 @@
+import { readdirSync } from "fs";
+import { resolve } from "path";
+
+const StreamZip = require('node-stream-zip');
+
+export async function open(path: string) {
+    const zip = new StreamZip({
+        file: path,
+        storeEntries: true
+    });
+    return new Promise((resolve, reject) => {
+        zip.on('ready', () => {
+            console.log("READY!", zip.entriesCount);
+            for (const entry of Object.values(zip.entries()) as any[]) {
+                const desc = entry.isDirectory ? 'directory' : `${entry.size} bytes`;
+                console.log(`Entry ${entry.name}: ${desc}`);
+            }
+            let body = "";
+            zip.stream("word/document.xml", (error: any, stream: any) => {
+                if (error) {
+                    reject(error);
+                }
+                stream.on('data', (chunk: any) => body += chunk.toString());
+                stream.on('end', () => {
+                    resolve(body);
+                    zip.close();
+                });
+            });
+        });
+    });
+}
+
+export async function extract(path: string) {
+    const contents = await open(path);
+    let body = "";
+    const components = contents.toString().split('<w:t');
+    for (const component of components) {
+        const tags = component.split('>');
+        console.log(tags[1]);
+        const content = tags[1].replace(/<.*$/, "");
+        body += content;
+    }
+    return body;
+}
+
+async function parse(): Promise<string[]> {
+    const sourceDirectory = resolve(`${__dirname}/source`);
+    const candidates = readdirSync(sourceDirectory).filter(file => file.endsWith(".doc") || file.endsWith(".docx")).map(file => `${sourceDirectory}/${file}`);
+    await extract(candidates[0]);
+    try {
+        return Promise.all(candidates.map(extract));
+    } catch {
+        return [];
+    }
+}
+
+parse();
\ No newline at end of file
--
cgit v1.2.3-70-g09d2
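
Note on the word-extractor dependency added in the first commit: node_scraper.ts above pulls word/document.xml out of the .docx zip by hand with node-stream-zip, which only works for the XML-based .docx format, while the directory scan also accepts legacy .doc files. The sketch below shows how the word-extractor package could cover that case. It is a sketch only, not part of the patch: the function name extractWithLibrary and its wiring are illustrative, and the WordExtractor / extract() / getBody() calls are assumed from word-extractor's documented 0.3.x API rather than taken from this repository.

    // Sketch (illustrative, assumes word-extractor 0.3.x): extract plain text from
    // a Word file using the word-extractor dependency added in the first commit.
    const WordExtractor = require("word-extractor");

    async function extractWithLibrary(path: string): Promise<string> {
        const extractor = new WordExtractor();
        // extract(path) is assumed to resolve to a Document object
        const document = await extractor.extract(path);
        // getBody() returns the plain-text body of the document
        return document.getBody();
    }

Under the same assumption, extractWithLibrary(candidates[0]).then(body => console.log(body)) would print the extracted text for the first scanned file, mirroring what parse() does with the hand-rolled extract() above.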