diff options
| author | A.J. Shulman <Shulman.aj@gmail.com> | 2025-05-10 20:30:24 -0400 |
|---|---|---|
| committer | A.J. Shulman <Shulman.aj@gmail.com> | 2025-05-10 20:30:24 -0400 |
| commit | 0db4583914e43e6efdba3e86a614a19956e73b5e (patch) | |
| tree | 68dfef85ea47d6d79e63a6ac0914922dc69c99c5 /src/client/views/nodes/chatbot | |
| parent | 0a05616fb9f685dc8534db4949a6f7ad6b85eadb (diff) | |
feat: changed web document to display screenshot
Diffstat (limited to 'src/client/views/nodes/chatbot')
| -rw-r--r-- | src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts | 105 |
1 files changed, 93 insertions, 12 deletions
diff --git a/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts b/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts index bff38ae15..3c7b4e3db 100644 --- a/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts +++ b/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts @@ -73,30 +73,111 @@ export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParam this._getLinkedUrlDocId = getLinkedUrlDocIds; } - async execute(args: ParametersType<WebsiteInfoScraperToolParamsType>): Promise<Observation[]> { - const urls = args.urls; - - // Create an array of promises, each one handling a website scrape for a URL - const scrapingPromises = urls.map(async url => { + /** + * Attempts to scrape a website with retry logic + * @param url URL to scrape + * @param maxRetries Maximum number of retry attempts + * @returns The scraped content or error message + */ + private async scrapeWithRetry(url: string, maxRetries = 2): Promise<Observation> { + let lastError = ''; + let retryCount = 0; + + // Validate URL format + try { + new URL(url); // This will throw if URL is invalid + } catch (e) { + return { + type: 'text', + text: `Invalid URL format: ${url}. Please provide a valid URL including http:// or https://`, + } as Observation; + } + + while (retryCount <= maxRetries) { try { - const { website_plain_text } = (await Networking.PostToServer('/scrapeWebsite', { url })) as { website_plain_text: string }; + // Add a slight delay between retries + if (retryCount > 0) { + console.log(`Retry attempt ${retryCount} for ${url}`); + await new Promise(resolve => setTimeout(resolve, retryCount * 2000)); // Increasing delay for each retry + } + + const response = await Networking.PostToServer('/scrapeWebsite', { url }); + + if (!response || typeof response !== 'object') { + lastError = 'Empty or invalid response from server'; + retryCount++; + continue; + } + + const { website_plain_text } = response as { website_plain_text: string }; const id = this._getLinkedUrlDocId(url); + + // Validate content quality + if (!website_plain_text) { + lastError = 'Retrieved content was empty'; + retryCount++; + continue; + } + + if (website_plain_text.length < 100) { + console.warn(`Warning: Content from ${url} is very short (${website_plain_text.length} chars)`); + + // Still return it if this is our last try + if (retryCount === maxRetries) { + return { + type: 'text', + text: `<chunk chunk_id="${id}" chunk_type="url">\n${website_plain_text}\nNote: Limited content was retrieved from this URL.\n</chunk>`, + } as Observation; + } + + lastError = 'Retrieved content was too short, trying again'; + retryCount++; + continue; + } + + // Process and return content if it looks good return { type: 'text', text: `<chunk chunk_id="${id}" chunk_type="url">\n${website_plain_text}\n</chunk>`, } as Observation; } catch (error) { - console.log(error); - return { - type: 'text', - text: `An error occurred while scraping the website: ${url}`, - } as Observation; + lastError = error instanceof Error ? error.message : 'Unknown error'; + console.log(`Error scraping ${url} (attempt ${retryCount + 1}):`, error); } - }); + + retryCount++; + } + + // All attempts failed + return { + type: 'text', + text: `Unable to scrape website: ${url}. Error: ${lastError}`, + } as Observation; + } + + async execute(args: ParametersType<WebsiteInfoScraperToolParamsType>): Promise<Observation[]> { + const urls = args.urls; + + // Create an array of promises, each one handling a website scrape for a URL + const scrapingPromises = urls.map(url => this.scrapeWithRetry(url)); // Wait for all scraping promises to resolve const results = await Promise.all(scrapingPromises); + // Check if we got any successful results + const successfulResults = results.filter(result => { + if (result.type !== 'text') return false; + return (result as { type: 'text'; text: string }).text.includes('chunk_id') && !(result as { type: 'text'; text: string }).text.includes('Unable to scrape'); + }); + + // If all scrapes failed, provide a more helpful error message + if (successfulResults.length === 0 && results.length > 0) { + results.push({ + type: 'text', + text: `Note: All website scraping attempts failed. Please try with different URLs or try again later.`, + } as Observation); + } + return results; } } |
