import { Networking } from '../../../../Network';
import { BaseTool } from './BaseTool';
import { v4 as uuidv4 } from 'uuid';
/**
 * Tool that scrapes the text content of specific websites the agent has
 * identified as relevant, and links each scraped URL to a document via the
 * provided callback.
 */
export class WebsiteInfoScraperTool extends BaseTool<{ url: string | string[] }> {
    private _addLinkedUrlDoc: (url: string, id: string) => void;

    constructor(addLinkedUrlDoc: (url: string, id: string) => void) {
        super(
            'websiteInfoScraper',
            'Scrape detailed information from specific websites identified as relevant to the user query',
            {
                url: {
                    type: 'string',
                    description: 'The URL(s) of the website(s) to scrape',
                    required: 'true',
                    max_inputs: '3',
                },
            },
            'Provide up to 3 URLs of websites that you have identified as the most relevant from the previous search. This tool will provide the text content of those specific websites. When providing a final response to the user based on information from these chunks, ideally cite as many of the URL chunks as possible (grounding your information in multiple sources, if possible) in order to provide a well-grounded result.',
            'Returns the text content of the webpages for analysis.'
        );
        this._addLinkedUrlDoc = addLinkedUrlDoc;
    }
    /**
     * Scrapes each provided URL, links it to a document, and returns the
     * scraped text wrapped in <chunk> tags for the agent to cite.
     */
    async execute(args: { url: string | string[] }): Promise<any> {
        // Normalize the input so a single URL and an array of URLs are handled the same way.
        const urls = Array.isArray(args.url) ? args.url : [args.url];
        const results: { type: string; text: string }[] = [];
        for (const url of urls) {
            try {
                // Ask the server to fetch the page and reduce it to plain text.
                const { website_plain_text } = await Networking.PostToServer('/scrapeWebsite', { url });
                // Register the scraped URL as a linked document under a fresh id.
                const id = uuidv4();
                this._addLinkedUrlDoc(url, id);
                results.push({ type: 'text', text: `<chunk chunk_id=${id} chunk_type=url>\n${website_plain_text}\n</chunk>\n` });
            } catch (error) {
                // Report per-URL failures without aborting the remaining scrapes.
                results.push({ type: 'text', text: `An error occurred while scraping the website: ${url}` });
            }
        }
        return results;
    }
}
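
// Illustrative usage sketch (not part of the tool itself): the surrounding agent
// framework is assumed to construct the tool with a callback that records each
// scraped URL against a generated document id, then call execute with up to three URLs.
//
//   const tool = new WebsiteInfoScraperTool((url, id) => {
//       console.log(`linked ${url} as document ${id}`);
//   });
//   const chunks = await tool.execute({ url: ['https://example.com'] });
//   // `chunks` is an array of { type: 'text', text: '<chunk ...>...</chunk>' } entries,
//   // one per URL, with failed scrapes reported as plain error text.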