From 0db4583914e43e6efdba3e86a614a19956e73b5e Mon Sep 17 00:00:00 2001 From: "A.J. Shulman" Date: Sat, 10 May 2025 20:30:24 -0400 Subject: feat: changed web document to display screenshot --- .../nodes/chatbot/tools/WebsiteInfoScraperTool.ts | 105 ++++++++++++++++++--- 1 file changed, 93 insertions(+), 12 deletions(-) (limited to 'src/client/views/nodes/chatbot/tools') diff --git a/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts b/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts index bff38ae15..3c7b4e3db 100644 --- a/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts +++ b/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts @@ -73,30 +73,111 @@ export class WebsiteInfoScraperTool extends BaseTool): Promise { - const urls = args.urls; - - // Create an array of promises, each one handling a website scrape for a URL - const scrapingPromises = urls.map(async url => { + /** + * Attempts to scrape a website with retry logic + * @param url URL to scrape + * @param maxRetries Maximum number of retry attempts + * @returns The scraped content or error message + */ + private async scrapeWithRetry(url: string, maxRetries = 2): Promise { + let lastError = ''; + let retryCount = 0; + + // Validate URL format + try { + new URL(url); // This will throw if URL is invalid + } catch (e) { + return { + type: 'text', + text: `Invalid URL format: ${url}. Please provide a valid URL including http:// or https://`, + } as Observation; + } + + while (retryCount <= maxRetries) { try { - const { website_plain_text } = (await Networking.PostToServer('/scrapeWebsite', { url })) as { website_plain_text: string }; + // Add a slight delay between retries + if (retryCount > 0) { + console.log(`Retry attempt ${retryCount} for ${url}`); + await new Promise(resolve => setTimeout(resolve, retryCount * 2000)); // Increasing delay for each retry + } + + const response = await Networking.PostToServer('/scrapeWebsite', { url }); + + if (!response || typeof response !== 'object') { + lastError = 'Empty or invalid response from server'; + retryCount++; + continue; + } + + const { website_plain_text } = response as { website_plain_text: string }; const id = this._getLinkedUrlDocId(url); + + // Validate content quality + if (!website_plain_text) { + lastError = 'Retrieved content was empty'; + retryCount++; + continue; + } + + if (website_plain_text.length < 100) { + console.warn(`Warning: Content from ${url} is very short (${website_plain_text.length} chars)`); + + // Still return it if this is our last try + if (retryCount === maxRetries) { + return { + type: 'text', + text: `\n${website_plain_text}\nNote: Limited content was retrieved from this URL.\n`, + } as Observation; + } + + lastError = 'Retrieved content was too short, trying again'; + retryCount++; + continue; + } + + // Process and return content if it looks good return { type: 'text', text: `\n${website_plain_text}\n`, } as Observation; } catch (error) { - console.log(error); - return { - type: 'text', - text: `An error occurred while scraping the website: ${url}`, - } as Observation; + lastError = error instanceof Error ? error.message : 'Unknown error'; + console.log(`Error scraping ${url} (attempt ${retryCount + 1}):`, error); } - }); + + retryCount++; + } + + // All attempts failed + return { + type: 'text', + text: `Unable to scrape website: ${url}. Error: ${lastError}`, + } as Observation; + } + + async execute(args: ParametersType): Promise { + const urls = args.urls; + + // Create an array of promises, each one handling a website scrape for a URL + const scrapingPromises = urls.map(url => this.scrapeWithRetry(url)); // Wait for all scraping promises to resolve const results = await Promise.all(scrapingPromises); + // Check if we got any successful results + const successfulResults = results.filter(result => { + if (result.type !== 'text') return false; + return (result as { type: 'text'; text: string }).text.includes('chunk_id') && !(result as { type: 'text'; text: string }).text.includes('Unable to scrape'); + }); + + // If all scrapes failed, provide a more helpful error message + if (successfulResults.length === 0 && results.length > 0) { + results.push({ + type: 'text', + text: `Note: All website scraping attempts failed. Please try with different URLs or try again later.`, + } as Observation); + } + return results; } } -- cgit v1.2.3-70-g09d2