import { Networking } from '../../../../Network';
import { BaseTool } from './BaseTool';
import { Observation } from '../types/types';
import { ParametersType, ToolInfo } from '../types/tool_types';
import { AgentDocumentManager } from '../utils/AgentDocumentManager';
import { Doc } from '../../../../../fields/Doc';
import { WebCast } from '../../../../../fields/Types';

const websiteInfoScraperToolParams = [
    {
        name: 'chunk_ids',
        type: 'string[]',
        description: 'The chunk_ids of the URLs to scrape, as returned by the SearchTool.',
        required: true,
        max_inputs: 3,
    },
] as const;

type WebsiteInfoScraperToolParamsType = typeof websiteInfoScraperToolParams;

const websiteInfoScraperToolInfo: ToolInfo<WebsiteInfoScraperToolParamsType> = {
    name: 'websiteInfoScraper',
    description: 'Scrape detailed information from specific websites relevant to the user query. Returns the text content of the webpages for further analysis and grounding.',
    citationRules: `!IMPORTANT! THESE CHUNKS REPLACE THE CHUNKS THAT ARE RETURNED FROM THE SEARCHTOOL.
Your task is to provide a comprehensive response to the user's prompt using the content scraped from relevant websites.
Ensure you follow these guidelines for structuring your response:

1. Grounded Text Tag Structure:
   - Wrap all text derived from the scraped website(s) in <grounded_text> tags.
   - **Do not include non-sourced information** in <grounded_text> tags.
   - Use a single <grounded_text> tag for content derived from a single website. If citing multiple websites, create a new <grounded_text> tag for each.
   - Ensure each <grounded_text> tag has a citation_index corresponding to the scraped URL.

2. Citation Tag Structure:
   - Create a <citation> tag for each distinct piece of information used from the website(s).
   - Each <citation> tag must reference a URL chunk via its chunk_id attribute.
   - For URL-based citations, leave the citation content empty, but reference the chunk_id and set the type to 'url'.

3. Structural Integrity Checks:
   - Ensure all opening and closing tags are matched properly.
   - Verify that every citation_index attribute in a <grounded_text> tag corresponds to a valid citation.
   - Do not over-cite; cite only the most relevant parts of the websites.

Example Usage:
<grounded_text citation_index="1">Based on data from the World Bank, economic growth has stabilized in recent years, following a surge in investments.</grounded_text>
<citation citation_index="1" chunk_id="<chunk_id of the World Bank page>" type="url"></citation>
<grounded_text citation_index="2">According to information retrieved from the International Monetary Fund, the inflation rate has been gradually decreasing since 2020.</grounded_text>
<citation citation_index="2" chunk_id="<chunk_id of the IMF page>" type="url"></citation>

Follow-up questions:
- What are the long-term economic impacts of increased investments on GDP?
- How might inflation trends affect future monetary policy?
- Are there additional factors that could influence economic growth beyond investments and inflation?

***NOTE***: Ensure that the response is structured correctly and adheres to the guidelines provided. Also, if needed/possible, cite multiple websites to provide a comprehensive response.`,
    parameterRules: websiteInfoScraperToolParams,
};
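// A minimal sketch of the argument shape this tool receives, assuming
// ParametersType maps each parameter's declared `name` to its declared `type`
// (so `chunk_ids: 'string[]'` resolves to `chunk_ids: string[]`). The alias
// below is illustrative only and is not referenced elsewhere in this file:
//
// type WebsiteInfoScraperArgs = { chunk_ids: string[] };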
export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParamsType> {
    private _docManager: AgentDocumentManager;

    constructor(docManager: AgentDocumentManager) {
        super(websiteInfoScraperToolInfo);
        this._docManager = docManager;
    }

    /**
     * Attempts to scrape a website with retry logic.
     * @param chunkDoc Document whose data field holds the URL to scrape
     * @param maxRetries Maximum number of retry attempts
     * @returns The scraped content or an error message
     */
    private async scrapeWithRetry(chunkDoc: Doc, maxRetries = 2): Promise<Observation> {
        let lastError = '';
        let retryCount = 0;
        // Resolve the URL stored on the chunk document.
        const url = WebCast(chunkDoc.data!)!.url.href;
        const id = chunkDoc.id;

        // Validate the URL format up front; new URL() throws on malformed input.
        try {
            new URL(url);
        } catch {
            return {
                type: 'text',
                text: `Invalid URL format: ${url}. Please provide a valid URL including http:// or https://`,
            } as Observation;
        }

        while (retryCount <= maxRetries) {
            try {
                // Back off between retries, waiting longer on each attempt.
                if (retryCount > 0) {
                    console.log(`Retry attempt ${retryCount} for ${url}`);
                    await new Promise(resolve => setTimeout(resolve, retryCount * 2000));
                }

                const response = await Networking.PostToServer('/scrapeWebsite', { url });
                if (!response || typeof response !== 'object') {
                    lastError = 'Empty or invalid response from server';
                    retryCount++;
                    continue;
                }

                const { website_plain_text } = response as { website_plain_text: string };

                // Validate content quality before returning it.
                if (!website_plain_text) {
                    lastError = 'Retrieved content was empty';
                    retryCount++;
                    continue;
                }

                if (website_plain_text.length < 100) {
                    console.warn(`Warning: Content from ${url} is very short (${website_plain_text.length} chars)`);
                    // Still return the short content if this was the last attempt.
                    if (retryCount === maxRetries) {
                        return {
                            type: 'text',
                            text: `<chunk chunk_id="${id}" type="url">\n${website_plain_text}\n</chunk>\nNote: Limited content was retrieved from this URL.`,
                        } as Observation;
                    }
                    lastError = 'Retrieved content was too short, trying again';
                    retryCount++;
                    continue;
                }

                // Content looks good: wrap it with its chunk_id so that execute()
                // can recognize the scrape as successful and the model can cite it.
                return {
                    type: 'text',
                    text: `<chunk chunk_id="${id}" type="url">\n${website_plain_text}\n</chunk>`,
                } as Observation;
            } catch (error) {
                lastError = error instanceof Error ? error.message : 'Unknown error';
                console.log(`Error scraping ${url} (attempt ${retryCount + 1}):`, error);
            }
            retryCount++;
        }

        // All attempts failed.
        return {
            type: 'text',
            text: `Unable to scrape website: ${url}. Error: ${lastError}`,
        } as Observation;
    }

    /**
     * Scrapes every URL chunk referenced by the given chunk_ids in parallel and
     * returns one Observation per chunk.
     */
    async execute(args: ParametersType<WebsiteInfoScraperToolParamsType>): Promise<Observation[]> {
        const chunk_ids = args.chunk_ids;

        // Kick off one scrape per chunk id; scrapeWithRetry handles its own
        // errors, so Promise.all resolves with a result for every chunk.
        const scrapingPromises = chunk_ids.map(chunk_id => this.scrapeWithRetry(this._docManager.getDocument(chunk_id)!));
        const results = await Promise.all(scrapingPromises);

        // A successful result carries a chunk_id wrapper and no error marker.
        const successfulResults = results.filter(result => {
            if (result.type !== 'text') return false;
            const text = (result as { type: 'text'; text: string }).text;
            return text.includes('chunk_id') && !text.includes('Unable to scrape');
        });

        // If every scrape failed, append a more helpful error message.
        if (successfulResults.length === 0 && results.length > 0) {
            results.push({
                type: 'text',
                text: `Note: All website scraping attempts failed. Please try with different URLs or try again later.`,
            } as Observation);
        }

        return results;
    }
}
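// Usage sketch (hypothetical): how an agent might invoke this tool directly.
// `docManager` and the chunk id are placeholders; in practice the ids come
// from a prior SearchTool result and the agent framework calls execute().
//
// async function exampleScrape(docManager: AgentDocumentManager) {
//     const scraper = new WebsiteInfoScraperTool(docManager);
//     const observations = await scraper.execute({ chunk_ids: ['<chunk_id from SearchTool>'] });
//     observations.forEach(obs => obs.type === 'text' && console.log(obs.text.slice(0, 200)));
// }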