author     Joanne <zehan_ding@brown.edu>  2025-06-17 13:02:50 -0400
committer  Joanne <zehan_ding@brown.edu>  2025-06-17 13:02:50 -0400
commit     2aa2c26b95a539d220e46b20cdfbef6ae39d6c43 (patch)
tree       344a6f798f692fdd4921ab5a6762e907f5ad7b06 /src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
parent     430db63077868fa54829721d6530a810aa4d4588 (diff)
parent     ccfdf905400cd4b81d8cde0f16bb0e15cd65621b (diff)
Merge branch 'agent-paper-main' of https://github.com/brown-dash/Dash-Web into joanne-tutorialagent
Diffstat (limited to 'src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts')
-rw-r--r--  src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts  126
1 file changed, 106 insertions, 20 deletions
diff --git a/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts b/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
index 19ccd0b36..727d35e2c 100644
--- a/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
+++ b/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
@@ -3,12 +3,14 @@ import { Networking } from '../../../../Network';
import { BaseTool } from './BaseTool';
import { Observation } from '../types/types';
import { ParametersType, ToolInfo } from '../types/tool_types';
-
+import { AgentDocumentManager } from '../utils/AgentDocumentManager';
+import { Doc } from '../../../../../fields/Doc';
+import { StrCast, WebCast } from '../../../../../fields/Types';
const websiteInfoScraperToolParams = [
{
- name: 'urls',
+ name: 'chunk_ids',
type: 'string[]',
- description: 'The URLs of the websites to scrape',
+ description: 'The chunk_ids of the URLs to scrape, as returned by the SearchTool.',
required: true,
max_inputs: 3,
},
@@ -20,6 +22,7 @@ const websiteInfoScraperToolInfo: ToolInfo<WebsiteInfoScraperToolParamsType> = {
name: 'websiteInfoScraper',
description: 'Scrape detailed information from specific websites relevant to the user query. Returns the text content of the webpages for further analysis and grounding.',
citationRules: `
+ !IMPORTANT! THESE CHUNKS REPLACE THE CHUNKS THAT ARE RETURNED FROM THE SEARCHTOOL.
Your task is to provide a comprehensive response to the user's prompt using the content scraped from relevant websites. Ensure you follow these guidelines for structuring your response:
1. Grounded Text Tag Structure:
@@ -66,38 +69,121 @@ const websiteInfoScraperToolInfo: ToolInfo<WebsiteInfoScraperToolParamsType> = {
};
export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParamsType> {
- private _addLinkedUrlDoc: (url: string, id: string) => void;
+ private _docManager: AgentDocumentManager;
- constructor(addLinkedUrlDoc: (url: string, id: string) => void) {
+ constructor(docManager: AgentDocumentManager) {
super(websiteInfoScraperToolInfo);
- this._addLinkedUrlDoc = addLinkedUrlDoc;
+ this._docManager = docManager;
}
- async execute(args: ParametersType<WebsiteInfoScraperToolParamsType>): Promise<Observation[]> {
- const urls = args.urls;
-
- // Create an array of promises, each one handling a website scrape for a URL
- const scrapingPromises = urls.map(async url => {
+ /**
+ * Attempts to scrape a website with retry logic.
+ * @param chunkDoc The chunk document whose URL will be scraped
+ * @param maxRetries Maximum number of retry attempts
+ * @returns The scraped content or an error message
+ */
+ private async scrapeWithRetry(chunkDoc: Doc, maxRetries = 2): Promise<Observation> {
+ let lastError = '';
+ let retryCount = 0;
+ // Resolve the URL stored on the chunk document; assumes its data field holds a web field.
+ const url = WebCast(chunkDoc.data!)!.url.href;
+ const id = chunkDoc.id;
+ console.log(`Scraping chunk ${id}: ${url}`);
+ // Validate URL format
+ try {
+ new URL(url); // This will throw if URL is invalid
+ } catch (e) {
+ return {
+ type: 'text',
+ text: `Invalid URL format: ${url}. Please provide a valid URL including http:// or https://`,
+ } as Observation;
+ }
+
+ while (retryCount <= maxRetries) {
try {
- const { website_plain_text } = await Networking.PostToServer('/scrapeWebsite', { url });
- const id = uuidv4();
- this._addLinkedUrlDoc(url, id);
+ // Back off before retrying; the delay grows with each attempt
+ if (retryCount > 0) {
+ console.log(`Retry attempt ${retryCount} for ${url}`);
+ await new Promise(resolve => setTimeout(resolve, retryCount * 2000)); // Increasing delay for each retry
+ }
+
+ const response = await Networking.PostToServer('/scrapeWebsite', { url });
+
+ if (!response || typeof response !== 'object') {
+ lastError = 'Empty or invalid response from server';
+ retryCount++;
+ continue;
+ }
+
+ const { website_plain_text } = response as { website_plain_text: string };
+
+ // Validate content quality
+ if (!website_plain_text) {
+ lastError = 'Retrieved content was empty';
+ retryCount++;
+ continue;
+ }
+
+ if (website_plain_text.length < 100) {
+ console.warn(`Warning: Content from ${url} is very short (${website_plain_text.length} chars)`);
+
+ // Still return it if this is our last try
+ if (retryCount === maxRetries) {
+ return {
+ type: 'text',
+ text: `<chunk chunk_id="${id}" chunk_type="url">\n${website_plain_text}\nNote: Limited content was retrieved from this URL.\n</chunk>`,
+ } as Observation;
+ }
+
+ lastError = 'Retrieved content was too short, trying again';
+ retryCount++;
+ continue;
+ }
+
+ // Process and return content if it looks good
return {
type: 'text',
text: `<chunk chunk_id="${id}" chunk_type="url">\n${website_plain_text}\n</chunk>`,
} as Observation;
} catch (error) {
- console.log(error);
- return {
- type: 'text',
- text: `An error occurred while scraping the website: ${url}`,
- } as Observation;
+ lastError = error instanceof Error ? error.message : 'Unknown error';
+ console.log(`Error scraping ${url} (attempt ${retryCount + 1}):`, error);
}
- });
+
+ retryCount++;
+ }
+
+ // All attempts failed
+ return {
+ type: 'text',
+ text: `Unable to scrape website: ${url}. Error: ${lastError}`,
+ } as Observation;
+ }
+
+ async execute(args: ParametersType<WebsiteInfoScraperToolParamsType>): Promise<Observation[]> {
+ const chunk_ids = args.chunk_ids;
+
+ // Create an array of promises, each resolving a chunk document and scraping its URL
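+ // Note: getDocument is assumed to return a Doc for every chunk_id issued by the SearchTool;
+ // the non-null assertion on the next line encodes that assumption, so an unknown id would make scrapeWithRetry reject.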
+ const scrapingPromises = chunk_ids.map(chunk_id => this.scrapeWithRetry(this._docManager.getDocument(chunk_id)!));
// Wait for all scraping promises to resolve
const results = await Promise.all(scrapingPromises);
+ // Check if we got any successful results
+ const successfulResults = results.filter(result => {
+ if (result.type !== 'text') return false;
+ return (result as { type: 'text'; text: string }).text.includes('chunk_id') && !(result as { type: 'text'; text: string }).text.includes('Unable to scrape');
+ });
+
+ // If all scrapes failed, provide a more helpful error message
+ if (successfulResults.length === 0 && results.length > 0) {
+ results.push({
+ type: 'text',
+ text: `Note: All website scraping attempts failed. Please try with different URLs or try again later.`,
+ } as Observation);
+ }
+
return results;
}
}
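
For context, here is a minimal sketch of how the reworked tool might be wired up after this merge. Only WebsiteInfoScraperTool, AgentDocumentManager, and the chunk_ids parameter come from the diff above; the scrapeSearchResults driver, the argument shape passed to execute(), and the manager instance are assumptions for illustration.

import { WebsiteInfoScraperTool } from './WebsiteInfoScraperTool';
import { AgentDocumentManager } from '../utils/AgentDocumentManager';

// Hypothetical driver: the tool now takes the document manager directly,
// replacing the old addLinkedUrlDoc callback.
async function scrapeSearchResults(docManager: AgentDocumentManager, chunkIds: string[]) {
    const scraper = new WebsiteInfoScraperTool(docManager);
    // The chunk ids must have been registered by the SearchTool, since
    // execute() resolves each one through docManager.getDocument().
    const observations = await scraper.execute({ chunk_ids: chunkIds });
    for (const obs of observations) {
        if (obs.type === 'text') console.log(obs.text.slice(0, 120));
    }
}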