aboutsummaryrefslogtreecommitdiff
path: root/src/scraping/acm/index.js
blob: b7455fe0d641ac481d35fd2c4284ef32aa4d0870 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
const {
    Builder,
    By,
    Key,
    until
} = require('selenium-webdriver');
const fs = require("fs");

// WebDriver session shared across the script; scrapeTargets assigns it once
// the list of citation ids has been read.
let driver;

// Read the citation ids and hand them to scrapeTargets. scrapeTargets is an
// async function, so its promise is given an explicit rejection handler
// instead of being left floating: a failure (for example the browser not
// starting) is reported and reflected in the process exit code.
fs.readFile("./citations.txt", {
    encoding: "utf8"
}, (error, data) => {
    scrapeTargets(error, data).catch(failure => {
        console.error(failure);
        process.exitCode = 1;
    });
});

/**
 * Decides whether a trimmed line of the authors panel carries author data.
 * Empty lines and the fixed boilerplate lines of the panel are rejected.
 *
 * @param {string} line - A trimmed line of panel text.
 * @returns {boolean} True when the line should be kept.
 */
function isAuthorDataLine(line) {
    if (line.length === 0) {
        return false;
    }
    return !(line.startsWith("No contact information") ||
        line.startsWith("View colleagues of") ||
        line.startsWith("Bibliometrics:"));
}

/**
 * Splits the flat list of author-panel lines into one array per author.
 * A line starting with "Average citations" closes the current author's group.
 * Trailing lines that are never closed by such a line are kept as a final,
 * incomplete group rather than being indexed past the end of the list.
 *
 * @param {string[]} authorLines - Filtered, trimmed lines of the authors panel.
 * @returns {string[][]} One array of lines per author, in page order.
 */
function groupAuthorLines(authorLines) {
    const groups = [];
    let current = [];
    for (const line of authorLines) {
        current.push(line);
        if (line.startsWith("Average citations")) {
            groups.push(current);
            current = [];
        }
    }
    if (current.length > 0) {
        groups.push(current);
    }
    return groups;
}

/**
 * Converts one author's group of lines into a plain object.
 * Line 0 is used as the name and line 1 as the publication-years line (its
 * first 18 characters are skipped and the rest is split on "-"). Every
 * further line is split at its last space into a label and a numeric value.
 *
 * @param {string[]} metadata - The lines belonging to a single author.
 * @returns {Object} The author record; numbers that cannot be parsed are NaN.
 */
function parseAuthor(metadata) {
    const yearsLine = metadata.length > 1 ? metadata[1] : "";
    const publicationYears = yearsLine.substring(18).split("-");
    const author = {
        name: metadata[0],
        publication_start: parseInt(publicationYears[0], 10),
        publication_end: parseInt(publicationYears[1], 10)
    };
    for (const attr of metadata.slice(2)) {
        const split = attr.lastIndexOf(" ");
        if (split === -1) {
            // Not a "label value" line; a manual backwards scan for the
            // space would never terminate here, so skip the line instead.
            continue;
        }
        const key = attr.substring(0, split).toLowerCase().replace(/ /g, "_").replace(/[\(\)]/g, "");
        const value = parseFloat(attr.substring(split + 1).replace(/,/g, ""));
        author[key] = value;
    }
    return author;
}

/**
 * Scrapes the abstract and author metadata of a single citation page.
 * Any failure is logged and whatever was collected up to that point is
 * returned, so one bad citation does not abort the whole run.
 *
 * @param {Object} webDriver - An open selenium-webdriver session.
 * @param {string} id - The citation id appended to the citation URL.
 * @returns {Promise<Object>} The (possibly partial) result for this citation.
 */
async function scrapeCitation(webDriver, id) {
    const result = {};
    try {
        const url = `https://dl.acm.org/citation.cfm?id=${id}`;
        await webDriver.get(url);
        await webDriver.sleep(500);

        const abstractParts = [];
        const abstractElements = await webDriver.findElements(By.id("abstract-body"));
        for (const element of abstractElements) {
            abstractParts.push(await element.getText());
        }
        result.url = url;
        result.abstract = abstractParts.join(" ");

        // NOTE(review): these element ids look auto-generated by the page's
        // UI framework and may not be stable — confirm against the live site.
        await webDriver.findElement(By.xpath(`//*[@id="tab-1014-btnInnerEl"]/span`)).click();
        await webDriver.sleep(500);
        const authors = await webDriver.findElement(By.xpath('//*[@id="tabpanel-1009-body"]')).getText();

        const authorLines = authors.split("\n").map(line => line.trim()).filter(isAuthorDataLine);
        result.authors = groupAuthorLines(authorLines).map(group => parseAuthor(group));
    } catch (e) {
        // Keep the session open: later citations still need it.
        console.log(e);
    }
    return result;
}

/**
 * fs.readFile callback: scrapes every citation id listed in the file contents
 * and writes the collected results, as pretty-printed JSON objects separated
 * by newlines, to ./results.txt.
 *
 * @param {?Error} error - Read error from fs.readFile, if any.
 * @param {string} data - File contents, one citation id per line; lines
 *     without any digit are ignored.
 * @returns {Promise<void>} Settles once the browser session has been closed.
 */
async function scrapeTargets(error, data) {
    if (error) {
        console.log("\nUnable to collect target citations from a citations.txt file stored in this directory.\nPlease make sure one is provided.");
        return;
    }

    // trim() also strips the "\r" left behind by Windows line endings.
    const references = data.split("\n").map(entry => entry.trim()).filter(line => /\d/.test(line));

    driver = await new Builder().forBrowser('chrome').build();

    try {
        const results = [];
        for (const id of references) {
            results.push(await scrapeCitation(driver, id));
        }

        const output = results.map(res => JSON.stringify(res, null, 4) + "\n").join("");
        fs.writeFile("./results.txt", output, function errorHandler(exception) {
            console.log(exception || "results successfully written");
        });
    } finally {
        // Close the browser exactly once, whatever happened above.
        await driver.quit();
    }
}