feat: changed web document to display screenshot

author: A.J. Shulman <Shulman.aj@gmail.com> 2025-05-10 20:30:24 -0400
committer: A.J. Shulman <Shulman.aj@gmail.com> 2025-05-10 20:30:24 -0400
commit: 0db4583914e43e6efdba3e86a614a19956e73b5e (patch)
tree: 68dfef85ea47d6d79e63a6ac0914922dc69c99c5 /src/client/views/nodes/chatbot
parent: 0a05616fb9f685dc8534db4949a6f7ad6b85eadb (diff)
1 files changed, 93 insertions, 12 deletions
diff --git a/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts b/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
index bff38ae15..3c7b4e3db 100644
--- a/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
+++ b/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
@@ -73,30 +73,111 @@ export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParam
         this._getLinkedUrlDocId = getLinkedUrlDocIds;
     }
 
-    async execute(args: ParametersType<WebsiteInfoScraperToolParamsType>): Promise<Observation[]> {
-        const urls = args.urls;
-
-        // Create an array of promises, each one handling a website scrape for a URL
-        const scrapingPromises = urls.map(async url => {
+    /**
+     * Scrapes one URL via the '/scrapeWebsite' server endpoint, retrying on errors and on empty or very short (< 100 chars) content
+     * @param url URL to scrape; rejected up front, without contacting the server, if `new URL(url)` throws
+     * @param maxRetries Number of retries after the first attempt, so up to maxRetries + 1 requests are made (default 2)
+     * @returns A text Observation: the content wrapped in a <chunk> tag on success, otherwise an error message (errors are caught and reported, not thrown)
+     */
+    private async scrapeWithRetry(url: string, maxRetries = 2): Promise<Observation> {
+        let lastError = '';
+        let retryCount = 0;
+
+        // Validate URL format before making any request; an invalid URL is reported immediately and never retried
+        try {
+            new URL(url); // This will throw if URL is invalid
+        } catch (e) {
+            return {
+                type: 'text',
+                text: `Invalid URL format: ${url}. Please provide a valid URL including http:// or https://`,
+            } as Observation;
+        }
+
+        while (retryCount <= maxRetries) { // runs at most maxRetries + 1 times (initial attempt plus retries)
             try {
-                const { website_plain_text } = (await Networking.PostToServer('/scrapeWebsite', { url })) as { website_plain_text: string };
+                // Wait before each retry (there is no delay before the first attempt)
+                if (retryCount > 0) {
+                    console.log(`Retry attempt ${retryCount} for ${url}`);
+                    await new Promise(resolve => setTimeout(resolve, retryCount * 2000)); // Linear backoff: 2s before retry 1, 4s before retry 2, ...
+                }
+
+                const response = await Networking.PostToServer('/scrapeWebsite', { url });
+
+                if (!response || typeof response !== 'object') { // a falsy or non-object response counts as a failed attempt
+                    lastError = 'Empty or invalid response from server';
+                    retryCount++;
+                    continue;
+                }
+
+                const { website_plain_text } = response as { website_plain_text: string };
                 const id = this._getLinkedUrlDocId(url);
+
+                // Validate content quality: empty content counts as a failed attempt
+                if (!website_plain_text) {
+                    lastError = 'Retrieved content was empty';
+                    retryCount++;
+                    continue;
+                }
+
+                if (website_plain_text.length < 100) { // short content is retried, but accepted on the final attempt
+                    console.warn(`Warning: Content from ${url} is very short (${website_plain_text.length} chars)`);
+
+                    // On the final attempt, return the short content with a note instead of reporting a failure
+                    if (retryCount === maxRetries) {
+                        return {
+                            type: 'text',
+                            text: `<chunk chunk_id="${id}" chunk_type="url">\n${website_plain_text}\nNote: Limited content was retrieved from this URL.\n</chunk>`,
+                        } as Observation;
+                    }
+
+                    lastError = 'Retrieved content was too short, trying again';
+                    retryCount++;
+                    continue;
+                }
+
+                // Content passed all checks: wrap it in a chunk tag carrying the id returned by _getLinkedUrlDocId
                 return {
                     type: 'text',
                     text: `<chunk chunk_id="${id}" chunk_type="url">\n${website_plain_text}\n</chunk>`,
                 } as Observation;
             } catch (error) {
-                console.log(error);
-                return {
-                    type: 'text',
-                    text: `An error occurred while scraping the website: ${url}`,
-                } as Observation;
+                lastError = error instanceof Error ? error.message : 'Unknown error'; // kept so it can be reported if every attempt fails
+                console.log(`Error scraping ${url} (attempt ${retryCount + 1}):`, error);
             }
-        });
+
+            retryCount++; // reached only after a caught exception; the other failure paths increment before their 'continue'
+        }
+
+        // All attempts failed: report the most recently recorded error
+        return {
+            type: 'text',
+            text: `Unable to scrape website: ${url}. Error: ${lastError}`,
+        } as Observation;
+    }
+
+    async execute(args: ParametersType<WebsiteInfoScraperToolParamsType>): Promise<Observation[]> {
+        const urls = args.urls;
+
+        // Scrape every URL concurrently: one scrapeWithRetry promise per URL, each resolving to a text Observation (content or error message)
+        const scrapingPromises = urls.map(url => this.scrapeWithRetry(url));
 
         // Wait for all scraping promises to resolve
         const results = await Promise.all(scrapingPromises);
 
+        // Count results that look successful: text containing 'chunk_id' and not 'Unable to scrape'. NOTE(review): substring heuristic - scraped page text containing 'Unable to scrape' would be counted as a failure
+        const successfulResults = results.filter(result => {
+            if (result.type !== 'text') return false;
+            return (result as { type: 'text'; text: string }).text.includes('chunk_id') && !(result as { type: 'text'; text: string }).text.includes('Unable to scrape');
+        });
+
+        // If no result looked successful (and at least one URL was given), append an extra explanatory note after the per-URL results
+        if (successfulResults.length === 0 && results.length > 0) {
+            results.push({
+                type: 'text',
+                text: `Note: All website scraping attempts failed. Please try with different URLs or try again later.`,
+            } as Observation);
+        }
+
         return results;
     }
 }
author	A.J. Shulman <Shulman.aj@gmail.com>	2025-05-10 20:30:24 -0400
committer	A.J. Shulman <Shulman.aj@gmail.com>	2025-05-10 20:30:24 -0400
commit	0db4583914e43e6efdba3e86a614a19956e73b5e (patch)
tree	68dfef85ea47d6d79e63a6ac0914922dc69c99c5 /src/client/views/nodes/chatbot
parent	0a05616fb9f685dc8534db4949a6f7ad6b85eadb (diff)