aboutsummaryrefslogtreecommitdiff
path: root/src/client/views/nodes/chatbot/tools
diff options
context:
space:
mode:
Diffstat (limited to 'src/client/views/nodes/chatbot/tools')
-rw-r--r-- src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts 105
1 files changed, 93 insertions, 12 deletions
diff --git a/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts b/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
index bff38ae15..3c7b4e3db 100644
--- a/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
+++ b/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
@@ -73,30 +73,111 @@ export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParam
this._getLinkedUrlDocId = getLinkedUrlDocIds;
}
- async execute(args: ParametersType<WebsiteInfoScraperToolParamsType>): Promise<Observation[]> {
- const urls = args.urls;
-
- // Create an array of promises, each one handling a website scrape for a URL
- const scrapingPromises = urls.map(async url => {
+ /**
+ * Scrapes a single website through the server's `/scrapeWebsite` endpoint, retrying on failure.
+ *
+ * The URL is first checked with the `URL` constructor; if it does not parse, an "Invalid URL format"
+ * observation is returned without contacting the server. Otherwise up to `maxRetries + 1` attempts are
+ * made, waiting `retryCount * 2000` ms before each retry. An attempt is retried when the response is
+ * missing or not an object, when the text is empty, when the text is shorter than 100 characters
+ * (except on the final attempt, where the short text is returned with a note), or when the request throws.
+ * Failures are reported in the text of the returned observation rather than thrown.
+ * @param url URL to scrape
+ * @param maxRetries Maximum number of retry attempts after the first try (default 2, i.e. 3 attempts total)
+ * @returns A text observation holding the scraped content wrapped in a `<chunk>` element, or an error message
+ */
+ private async scrapeWithRetry(url: string, maxRetries = 2): Promise<Observation> {
+ let lastError = ''; // reason for the most recent failed attempt; reported if every attempt fails
+ let retryCount = 0; // number of retries already performed; 0 on the first attempt
+
+ // Validate URL format before making any request
+ try {
+ new URL(url); // Constructed only for validation; throws if the URL cannot be parsed
+ } catch (e) {
+ return {
+ type: 'text',
+ text: `Invalid URL format: ${url}. Please provide a valid URL including http:// or https://`,
+ } as Observation;
+ }
+
+ while (retryCount <= maxRetries) { // inclusive bound: one initial attempt plus maxRetries retries
try {
- const { website_plain_text } = (await Networking.PostToServer('/scrapeWebsite', { url })) as { website_plain_text: string };
+ // Wait before every retry (never before the first attempt)
+ if (retryCount > 0) {
+ console.log(`Retry attempt ${retryCount} for ${url}`);
+ await new Promise(resolve => setTimeout(resolve, retryCount * 2000)); // Linearly increasing delay: 2s before retry 1, 4s before retry 2, ...
+ }
+
+ const response = await Networking.PostToServer('/scrapeWebsite', { url });
+
+ if (!response || typeof response !== 'object') { // null/undefined or a non-object response counts as a failed attempt
+ lastError = 'Empty or invalid response from server';
+ retryCount++;
+ continue;
+ }
+
+ const { website_plain_text } = response as { website_plain_text: string }; // NOTE(review): unchecked cast; a response without this field yields undefined and is handled as empty content below
const id = this._getLinkedUrlDocId(url); // id placed in the chunk_id attribute of the returned <chunk> wrapper
+
+ // Validate content quality: empty text counts as a failed attempt
+ if (!website_plain_text) {
+ lastError = 'Retrieved content was empty';
+ retryCount++;
+ continue;
+ }
+
+ if (website_plain_text.length < 100) { // fewer than 100 characters is treated as suspiciously short
+ console.warn(`Warning: Content from ${url} is very short (${website_plain_text.length} chars)`);
+
+ // On the final attempt, return the short content anyway, flagged with a note
+ if (retryCount === maxRetries) {
+ return {
+ type: 'text',
+ text: `<chunk chunk_id="${id}" chunk_type="url">\n${website_plain_text}\nNote: Limited content was retrieved from this URL.\n</chunk>`,
+ } as Observation;
+ }
+
+ lastError = 'Retrieved content was too short, trying again';
+ retryCount++;
+ continue;
+ }
+
+ // Content passed all checks: return it wrapped in a <chunk> element
return {
type: 'text',
text: `<chunk chunk_id="${id}" chunk_type="url">\n${website_plain_text}\n</chunk>`,
} as Observation;
} catch (error) {
- console.log(error);
- return {
- type: 'text',
- text: `An error occurred while scraping the website: ${url}`,
- } as Observation;
+ lastError = error instanceof Error ? error.message : 'Unknown error'; // keep the message so it can be reported if no attempt succeeds
+ console.log(`Error scraping ${url} (attempt ${retryCount + 1}):`, error);
}
- });
+
+ retryCount++; // reached only after a caught exception; the other failure paths increment and `continue` inside the try block
+ }
+
+ // All attempts failed: report the last recorded failure reason
+ return {
+ type: 'text',
+ text: `Unable to scrape website: ${url}. Error: ${lastError}`,
+ } as Observation;
+ }
+
+ /**
+ * Scrapes every URL in `args.urls` concurrently and returns one observation per URL, in input order.
+ * When the list is non-empty and none of the observations is a scraped `<chunk>`, a trailing note
+ * saying that all scraping attempts failed is appended to the results.
+ * @param args Tool arguments; `urls` is the list of URLs to scrape
+ * @returns One observation per URL, plus a final note when every scrape failed
+ */
+ async execute(args: ParametersType<WebsiteInfoScraperToolParamsType>): Promise<Observation[]> {
+ const urls = args.urls;
+
+ // Create an array of promises, each one handling a website scrape for a URL
+ const scrapingPromises = urls.map(url => this.scrapeWithRetry(url));
// Wait for all scraping promises to resolve
const results = await Promise.all(scrapingPromises);
+ // A successful scrape is always returned wrapped in a leading <chunk chunk_id="..."> element, while
+ // error observations start with plain text. Match on that prefix instead of searching the whole text,
+ // so page content or a URL that happens to contain "chunk_id" or "Unable to scrape" cannot flip the result.
+ const chunkPrefix = '<chunk chunk_id="';
+ const anySucceeded = results.some(result => result.type === 'text' && (result as { type: 'text'; text: string }).text.startsWith(chunkPrefix));
+
+ // If all scrapes failed, provide a more helpful error message
+ if (!anySucceeded && results.length > 0) {
+ results.push({
+ type: 'text',
+ text: `Note: All website scraping attempts failed. Please try with different URLs or try again later.`,
+ } as Observation);
+ }
+
return results;
}
}