aboutsummaryrefslogtreecommitdiff
path: root/src/scraping/buxton/node_scraper.ts
blob: ef1d989d4d4865bea54a3453412ae4196664c7c7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import { readdirSync } from "fs";
import { resolve } from "path";

const StreamZip = require('node-stream-zip');

/**
 * Reads the `word/document.xml` entry out of the zip archive at `path` and
 * resolves with its contents as a UTF-8 string.
 *
 * Every entry of the archive is logged to the console once the archive has
 * been indexed.
 *
 * @param path - Location of the zip archive to read.
 * @returns A promise for the text of `word/document.xml`.
 * @throws Rejects when the archive reports an error, when the entry cannot be
 * streamed, or when the entry stream itself fails.
 */
export async function open(path: string) {
    const zip = new StreamZip({
        file: path,
        storeEntries: true
    });
    return new Promise<string>((fulfill, reject) => {
        // Single failure path: release the archive, then reject. Rejecting an
        // already-settled promise is a no-op, so this is safe to call late.
        const fail = (error: unknown) => {
            zip.close();
            reject(error);
        };
        // Without this listener an unreadable archive would leave the promise
        // pending forever (or surface as an unhandled 'error' event).
        zip.on('error', fail);
        zip.on('ready', () => {
            console.log("READY!", zip.entriesCount);
            for (const entry of Object.values(zip.entries()) as any[]) {
                const desc = entry.isDirectory ? 'directory' : `${entry.size} bytes`;
                console.log(`Entry ${entry.name}: ${desc}`);
            }
            zip.stream("word/document.xml", (error: any, stream: any) => {
                if (error) {
                    // Stop here: `stream` is not usable when an error is reported.
                    fail(error);
                    return;
                }
                let body = "";
                // Decode at the stream level so a multi-byte UTF-8 character
                // that straddles two chunks is not corrupted.
                stream.setEncoding('utf8');
                stream.on('data', (chunk: string) => body += chunk);
                stream.on('error', fail);
                stream.on('end', () => {
                    zip.close();
                    fulfill(body);
                });
            });
        });
    });
}

/**
 * Extracts the text of the document archive at `path` by concatenating the
 * contents of every `<w:t>` element found in its `word/document.xml`.
 *
 * The text of consecutive elements is joined without any separator, and XML
 * character entities (e.g. `&amp;`) are returned as they appear in the markup.
 *
 * @param path - Location of the archive, passed straight to `open`.
 * @returns A promise for the concatenated text; empty when there are no
 * `<w:t>` elements.
 */
export async function extract(path: string) {
    const contents = await open(path);
    // The tag name must be followed by whitespace (attributes) or `>`, so that
    // other tags sharing the prefix - `<w:tbl>`, `<w:tc>`, `<w:tr>`, `<w:tab/>` -
    // are not mistaken for text elements. Self-closing `<w:t/>` carries no text
    // and is deliberately not matched.
    const textElement = /<w:t(?:\s[^>]*)?>([^<]*)<\/w:t>/g;
    let body = "";
    let match: RegExpExecArray | null;
    // Every match consumes at least `<w:t></w:t>`, so the loop always advances.
    while ((match = textElement.exec(contents)) !== null) {
        body += match[1];
    }
    return body;
}

/**
 * Extracts the text of every `.doc` / `.docx` file found directly inside the
 * `source` directory next to this module.
 *
 * @returns A promise for one extracted string per candidate file, in directory
 * listing order; an empty array when there are no candidates or when any
 * extraction fails.
 * @throws Rejects if the `source` directory cannot be listed.
 */
async function parse(): Promise<string[]> {
    const sourceDirectory = resolve(`${__dirname}/source`);
    // NOTE(review): legacy `.doc` files are presumably not zip archives, so
    // `extract` would likely fail on them - confirm whether they should match.
    const candidates = readdirSync(sourceDirectory)
        .filter(file => file.endsWith(".doc") || file.endsWith(".docx"))
        .map(file => `${sourceDirectory}/${file}`);
    try {
        // `await` is required here: returning the bare promise would leave the
        // `try` block before a rejection occurs, and `catch` would never run.
        return await Promise.all(candidates.map(candidate => extract(candidate)));
    } catch (error) {
        // Best-effort: report the failure and fall back to an empty result.
        console.error("Failed to extract source documents:", error);
        return [];
    }
}

// Run the scrape as soon as this module is loaded. The promise is not awaited,
// so attach a handler to keep a failure from becoming an unhandled rejection.
parse().catch((error: unknown) => {
    console.error("Scraping failed:", error);
});