author     Joanne <zehan_ding@brown.edu>  2025-06-17 13:02:50 -0400
committer  Joanne <zehan_ding@brown.edu>  2025-06-17 13:02:50 -0400
commit     2aa2c26b95a539d220e46b20cdfbef6ae39d6c43 (patch)
tree       344a6f798f692fdd4921ab5a6762e907f5ad7b06 /src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
parent     430db63077868fa54829721d6530a810aa4d4588 (diff)
parent     ccfdf905400cd4b81d8cde0f16bb0e15cd65621b (diff)
Merge branch 'agent-paper-main' of https://github.com/brown-dash/Dash-Web into joanne-tutorialagent
Diffstat (limited to 'src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts')
-rw-r--r--  src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts  126
1 file changed, 106 insertions, 20 deletions
diff --git a/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts b/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
index 19ccd0b36..727d35e2c 100644
--- a/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
+++ b/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
@@ -3,12 +3,14 @@ import { Networking } from '../../../../Network';
import { BaseTool } from './BaseTool';
import { Observation } from '../types/types';
import { ParametersType, ToolInfo } from '../types/tool_types';
-
+import { AgentDocumentManager } from '../utils/AgentDocumentManager';
+import { Doc } from '../../../../../fields/Doc';
+import { StrCast, WebCast } from '../../../../../fields/Types';
const websiteInfoScraperToolParams = [
{
- name: 'urls',
+ name: 'chunk_ids',
type: 'string[]',
- description: 'The URLs of the websites to scrape',
+ description: 'The chunk_ids of the URLs to scrape, as returned by the SearchTool.',
required: true,
max_inputs: 3,
},
@@ -20,6 +22,7 @@ const websiteInfoScraperToolInfo: ToolInfo<WebsiteInfoScraperToolParamsType> = {
name: 'websiteInfoScraper',
description: 'Scrape detailed information from specific websites relevant to the user query. Returns the text content of the webpages for further analysis and grounding.',
citationRules: `
+ !IMPORTANT! THESE CHUNKS REPLACE THE CHUNKS THAT ARE RETURNED FROM THE SEARCHTOOL.
Your task is to provide a comprehensive response to the user's prompt using the content scraped from relevant websites. Ensure you follow these guidelines for structuring your response:
1. Grounded Text Tag Structure:
@@ -66,38 +69,121 @@ const websiteInfoScraperToolInfo: ToolInfo<WebsiteInfoScraperToolParamsType> = {
};
export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParamsType> {
- private _addLinkedUrlDoc: (url: string, id: string) => void;
+ private _docManager: AgentDocumentManager;
- constructor(addLinkedUrlDoc: (url: string, id: string) => void) {
+ constructor(docManager: AgentDocumentManager) {
super(websiteInfoScraperToolInfo);
- this._addLinkedUrlDoc = addLinkedUrlDoc;
+ this._docManager = docManager;
}
- async execute(args: ParametersType<WebsiteInfoScraperToolParamsType>): Promise<Observation[]> {
- const urls = args.urls;
-
- // Create an array of promises, each one handling a website scrape for a URL
- const scrapingPromises = urls.map(async url => {
+ /**
+ * Attempts to scrape a website with retry logic.
+ * @param chunkDoc The chunk document whose URL will be scraped
+ * @param maxRetries Maximum number of retry attempts
+ * @returns The scraped content or an error message
+ */
+ private async scrapeWithRetry(chunkDoc: Doc, maxRetries = 2): Promise<Observation> {
+ let lastError = '';
+ let retryCount = 0;
+ // Resolve the URL stored on the chunk document; assumes its data field holds a web field.
+ const url = WebCast(chunkDoc.data!)!.url.href;
+ const id = chunkDoc.id;
+ console.log(`Scraping chunk ${id}: ${url}`);
+ // Validate URL format
+ try {
+ new URL(url); // This will throw if URL is invalid
+ } catch (e) {
+ return {
+ type: 'text',
+ text: `Invalid URL format: ${url}. Please provide a valid URL including http:// or https://`,
+ } as Observation;
+ }
+
+ while (retryCount <= maxRetries) {
try {
- const { website_plain_text } = await Networking.PostToServer('/scrapeWebsite', { url });
- const id = uuidv4();
- this._addLinkedUrlDoc(url, id);
+ // Back off before retrying; the delay grows with each attempt
+ if (retryCount > 0) {
+ console.log(`Retry attempt ${retryCount} for ${url}`);
+ await new Promise(resolve => setTimeout(resolve, retryCount * 2000)); // Increasing delay for each retry
+ }
+
+ const response = await Networking.PostToServer('/scrapeWebsite', { url });
+
+ if (!response || typeof response !== 'object') {
+ lastError = 'Empty or invalid response from server';
+ retryCount++;
+ continue;
+ }
+
+ const { website_plain_text } = response as { website_plain_text: string };
+
+ // Validate content quality
+ if (!website_plain_text) {
+ lastError = 'Retrieved content was empty';
+ retryCount++;
+ continue;
+ }
+
+ if (website_plain_text.length < 100) {
+ console.warn(`Warning: Content from ${url} is very short (${website_plain_text.length} chars)`);
+
+ // Still return it if this is our last try
+ if (retryCount === maxRetries) {
+ return {
+ type: 'text',
+ text: `<chunk chunk_id="${id}" chunk_type="url">\n${website_plain_text}\nNote: Limited content was retrieved from this URL.\n</chunk>`,
+ } as Observation;
+ }
+
+ lastError = 'Retrieved content was too short, trying again';
+ retryCount++;
+ continue;
+ }
+
+ // Process and return content if it looks good
return {
type: 'text',
text: `<chunk chunk_id="${id}" chunk_type="url">\n${website_plain_text}\n</chunk>`,
} as Observation;
} catch (error) {
- console.log(error);
- return {
- type: 'text',
- text: `An error occurred while scraping the website: ${url}`,
- } as Observation;
+ lastError = error instanceof Error ? error.message : 'Unknown error';
+ console.log(`Error scraping ${url} (attempt ${retryCount + 1}):`, error);
}
- });
+
+ retryCount++;
+ }
+
+ // All attempts failed
+ return {
+ type: 'text',
+ text: `Unable to scrape website: ${url}. Error: ${lastError}`,
+ } as Observation;
+ }
+
+ async execute(args: ParametersType<WebsiteInfoScraperToolParamsType>): Promise<Observation[]> {
+ const chunk_ids = args.chunk_ids;
+
+ // Create an array of promises, each resolving a chunk document and scraping its URL
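+ // Note: getDocument is assumed to return a Doc for every chunk_id issued by the SearchTool;
+ // the non-null assertion on the next line encodes that assumption, so an unknown id would make scrapeWithRetry reject.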
+ const scrapingPromises = chunk_ids.map(chunk_id => this.scrapeWithRetry(this._docManager.getDocument(chunk_id)!));
// Wait for all scraping promises to resolve
const results = await Promise.all(scrapingPromises);
+ // Check if we got any successful results
+ const successfulResults = results.filter(result => {
+ if (result.type !== 'text') return false;
+ return (result as { type: 'text'; text: string }).text.includes('chunk_id') && !(result as { type: 'text'; text: string }).text.includes('Unable to scrape');
+ });
+
+ // If all scrapes failed, provide a more helpful error message
+ if (successfulResults.length === 0 && results.length > 0) {
+ results.push({
+ type: 'text',
+ text: `Note: All website scraping attempts failed. Please try with different URLs or try again later.`,
+ } as Observation);
+ }
+
return results;
}
}
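
For context, here is a minimal sketch of how the reworked tool might be wired up after this merge. Only WebsiteInfoScraperTool, AgentDocumentManager, and the chunk_ids parameter come from the diff above; the scrapeSearchResults driver, the argument shape passed to execute(), and the manager instance are assumptions for illustration.

import { WebsiteInfoScraperTool } from './WebsiteInfoScraperTool';
import { AgentDocumentManager } from '../utils/AgentDocumentManager';

// Hypothetical driver: the tool now takes the document manager directly,
// replacing the old addLinkedUrlDoc callback.
async function scrapeSearchResults(docManager: AgentDocumentManager, chunkIds: string[]) {
    const scraper = new WebsiteInfoScraperTool(docManager);
    // The chunk ids must have been registered by the SearchTool, since
    // execute() resolves each one through docManager.getDocument().
    const observations = await scraper.execute({ chunk_ids: chunkIds });
    for (const obs of observations) {
        if (obs.type === 'text') console.log(obs.text.slice(0, 120));
    }
}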