aboutsummaryrefslogtreecommitdiff
path: root/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
blob: 1efb389b8b95e0c2f44f982a5ddfe1d60dda0bee (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import { v4 as uuidv4 } from 'uuid';
import { Networking } from '../../../../Network';
import { BaseTool } from './BaseTool';

/**
 * Chatbot tool that scrapes the plain-text content of one or more websites so
 * the model can ground its answer in the retrieved page text.
 *
 * Each successfully scraped page is registered as a linked URL document (via
 * the injected callback) and returned wrapped in a `<chunk>` tag whose
 * `chunk_id` the model uses for citations.
 */
export class WebsiteInfoScraperTool extends BaseTool<{ url: string | string[] }> {
    constructor(
        /** Callback that links a scraped URL into the document graph, keyed by a fresh chunk id. */
        private readonly _addLinkedUrlDoc: (url: string, id: string) => void
    ) {
        super(
            'websiteInfoScraper',
            'Scrape detailed information from specific websites relevant to the user query',
            {
                url: {
                    type: 'string',
                    description: 'The URL(s) of the website(s) to scrape',
                    required: true,
                    max_inputs: 3,
                },
            },
            `
            Your task is to provide a comprehensive response to the user's prompt using the content scraped from relevant websites. Ensure you follow these guidelines for structuring your response:

            1. Grounded Text Tag Structure:
               - Wrap all text derived from the scraped website(s) in <grounded_text> tags.
               - **Do not include non-sourced information** in <grounded_text> tags.
               - Use a single <grounded_text> tag for content derived from a single website. If citing multiple websites, create new <grounded_text> tags for each.
               - Ensure each <grounded_text> tag has a citation index corresponding to the scraped URL.

            2. Citation Tag Structure:
               - Create a <citation> tag for each distinct piece of information used from the website(s).
               - Each <citation> tag must reference a URL chunk using the chunk_id attribute.
               - For URL-based citations, leave the citation content empty, but reference the chunk_id and type as 'url'.

            3. Structural Integrity Checks:
               - Ensure all opening and closing tags are matched properly.
               - Verify that all citation_index attributes in <grounded_text> tags correspond to valid citations.
               - Do not over-cite—cite only the most relevant parts of the websites.

            Example Usage:

            <answer>
                <grounded_text citation_index="1">
                Based on data from the World Bank, economic growth has stabilized in recent years, following a surge in investments.
                </grounded_text>
                <grounded_text citation_index="2">
                According to information retrieved from the International Monetary Fund, the inflation rate has been gradually decreasing since 2020.
                </grounded_text>

                <citations>
                    <citation index="1" chunk_id="1234" type="url"></citation>
                    <citation index="2" chunk_id="5678" type="url"></citation>
                </citations>

                <follow_up_questions>
                    <question>What are the long-term economic impacts of increased investments on GDP?</question>
                    <question>How might inflation trends affect future monetary policy?</question>
                    <question>Are there additional factors that could influence economic growth beyond investments and inflation?</question>
                </follow_up_questions>
            </answer>
            `,
            'Returns the text content of the webpages for further analysis and grounding.'
        );
    }

    /**
     * Scrapes every requested URL and returns one text result per URL, in the
     * same order as the input.
     *
     * @param args - `url` is a single URL or an array of URLs to scrape.
     * @returns An array of `{ type: 'text', text }` entries; a successful scrape
     *          yields the page text wrapped in a `<chunk>` tag, a failed scrape
     *          yields an inline error message for that URL.
     */
    async execute(args: { url: string | string[] }): Promise<unknown> {
        const urls = Array.isArray(args.url) ? args.url : [args.url];

        // Scrape all URLs concurrently — the requests are independent, so there is
        // no reason to serialize them. Each mapped promise handles its own failure,
        // so one bad URL cannot reject the whole batch, and Promise.all preserves
        // the input order in the results.
        const results: { type: string; text: string }[] = await Promise.all(
            urls.map(async url => {
                try {
                    const { website_plain_text } = await Networking.PostToServer('/scrapeWebsite', { url });
                    const id = uuidv4();
                    this._addLinkedUrlDoc(url, id);
                    return { type: 'text', text: `<chunk chunk_id=${id} chunk_type=url>\n${website_plain_text}\n</chunk>\n` };
                } catch (error) {
                    // Log for diagnostics but degrade gracefully: the model still
                    // receives an explanatory entry for the failed URL.
                    console.error(error);
                    return { type: 'text', text: `An error occurred while scraping the website: ${url}` };
                }
            })
        );

        return results;
    }
}