From 4791cd23af08da70895204a3a7fbaf889d9af2d5 Mon Sep 17 00:00:00 2001
From: "A.J. Shulman" <Shulman.aj@gmail.com>
Date: Sat, 7 Sep 2024 12:43:05 -0400
Subject: completely restructured, added comments, and significantly reduced
 the length of the prompt (~72% shorter and cheaper)

---
 .../nodes/chatbot/tools/WebsiteInfoScraperTool.ts  | 83 ++++++++++++++++++++++
 1 file changed, 83 insertions(+)
 create mode 100644 src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts

(limited to 'src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts')
diff --git a/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts b/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
new file mode 100644
index 000000000..2118218f6
--- /dev/null
+++ b/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
@@ -0,0 +1,119 @@
+import { Networking } from '../../../../Network';
+import { BaseTool } from './BaseTool';
+import { v4 as uuidv4 } from 'uuid';
+
+/**
+ * Chatbot tool that fetches the text content of one or more web pages so that the
+ * model can ground its answer in what those pages say.
+ *
+ * Each successfully scraped page is returned wrapped in a `<chunk>` element that carries
+ * a freshly generated id; the same id is passed to the `addLinkedUrlDoc` callback so the
+ * caller can tie the chunk back to its source URL.
+ */
+export class WebsiteInfoScraperTool extends BaseTool<{ url: string | string[] }> {
+    private _addLinkedUrlDoc: (url: string, id: string) => void;
+
+    /**
+     * Passes the tool's name, description, parameter schema, citation rules and result
+     * summary to `BaseTool` (argument roles inferred from their contents; confirm against BaseTool).
+     *
+     * @param addLinkedUrlDoc Callback invoked once per successfully scraped URL with that
+     * URL and the id embedded in the chunk returned for it.
+     */
+    constructor(addLinkedUrlDoc: (url: string, id: string) => void) {
+        super(
+            'websiteInfoScraper',
+            'Scrape detailed information from specific websites relevant to the user query',
+            {
+                // NOTE(review): declared as 'string' although execute() also accepts string[];
+                // presumably max_inputs is what allows several values. Confirm against BaseTool.
+                url: {
+                    type: 'string',
+                    description: 'The URL(s) of the website(s) to scrape',
+                    required: true,
+                    max_inputs: 3,
+                },
+            },
+            `
+            Your task is to provide a comprehensive response to the user's prompt using the content scraped from relevant websites. Ensure you follow these guidelines for structuring your response:
+
+            1. Grounded Text Tag Structure:
+               - Wrap all text derived from the scraped website(s) in <grounded_text> tags.
+               - **Do not include non-sourced information** in <grounded_text> tags.
+               - Use a single <grounded_text> tag for content derived from a single website. If citing multiple websites, create new <grounded_text> tags for each.
+               - Ensure each <grounded_text> tag has a citation index corresponding to the scraped URL.
+
+            2. Citation Tag Structure:
+               - Create a <citation> tag for each distinct piece of information used from the website(s).
+               - Each <citation> tag must reference a URL chunk using the chunk_id attribute.
+               - For URL-based citations, leave the citation content empty, but reference the chunk_id and type as 'url'.
+
+            3. Structural Integrity Checks:
+               - Ensure all opening and closing tags are matched properly.
+               - Verify that all citation_index attributes in <grounded_text> tags correspond to valid citations.
+               - Do not over-cite—cite only the most relevant parts of the websites.
+
+            Example Usage:
+
+            <answer>
+                <grounded_text citation_index="1">
+                Based on data from the World Bank, economic growth has stabilized in recent years, following a surge in investments.
+                </grounded_text>
+                <grounded_text citation_index="2">
+                According to information retrieved from the International Monetary Fund, the inflation rate has been gradually decreasing since 2020.
+                </grounded_text>
+
+                <citations>
+                    <citation index="1" chunk_id="1234" type="url"></citation>
+                    <citation index="2" chunk_id="5678" type="url"></citation>
+                </citations>
+
+                <follow_up_questions>
+                    <question>What are the long-term economic impacts of increased investments on GDP?</question>
+                    <question>How might inflation trends affect future monetary policy?</question>
+                    <question>Are there additional factors that could influence economic growth beyond investments and inflation?</question>
+                </follow_up_questions>
+            </answer>
+            `,
+            'Returns the text content of the webpages for further analysis and grounding.'
+        );
+        this._addLinkedUrlDoc = addLinkedUrlDoc;
+    }
+
+    /**
+     * Scrapes each requested URL, one after another, by POSTing it to the `/scrapeWebsite`
+     * server route.
+     *
+     * A failure on one URL does not abort the others: the error is logged and an error
+     * message is returned in that URL's slot. A response without a string
+     * `website_plain_text` field is treated as a failure, so no document is linked and no
+     * chunk containing the text "undefined" is produced for it.
+     *
+     * @param args Tool arguments; `url` is either a single URL or an array of URLs.
+     * @returns One `{ type: 'text', text }` entry per requested URL, in request order.
+     */
+    async execute(args: { url: string | string[] }): Promise<any> {
+        const urls = Array.isArray(args.url) ? args.url : [args.url];
+        const results: { type: 'text'; text: string }[] = [];
+
+        for (const url of urls) {
+            try {
+                const response = await Networking.PostToServer('/scrapeWebsite', { url });
+                const website_plain_text = response?.website_plain_text;
+                // Reject malformed responses before linking a document or emitting a chunk.
+                if (typeof website_plain_text !== 'string') {
+                    throw new Error('Scrape response did not contain website_plain_text');
+                }
+                const id = uuidv4();
+                this._addLinkedUrlDoc(url, id);
+                results.push({ type: 'text', text: `<chunk chunk_id=${id} chunk_type=url>\n${website_plain_text}\n</chunk>\n` });
+            } catch (error) {
+                // Keep processing the remaining URLs, but leave a trace of what went wrong.
+                console.error(`Failed to scrape website ${url}:`, error);
+                results.push({ type: 'text', text: `An error occurred while scraping the website: ${url}` });
+            }
+        }
+
+        return results;
+    }
+}
-- 
cgit v1.2.3-70-g09d2