Added codebase search and retrieval to Vectorstore

- Summary indexing: Added functionality to embed and index file summaries from file_summaries.json in Pinecone
- Vector search: Implemented semantic search to find the top 5 most relevant files for a query
- Content retrieval: Added method to fetch full file content from file_content.json
- API endpoints:
  - /getFileSummaries - Retrieves all file summaries
  - /getFileContent - Fetches file content by path
  - /getRawFileContent - Returns content as plain text to avoid JSON parsing errors
- Error handling: Added comprehensive error handling and debugging throughout
- Initialization: Implemented proper async initialization sequence with verification
- Performance: Added streaming for large files to improve memory efficiency
- Testing: Added automated test queries to validate functionality
author: A.J. Shulman <Shulman.aj@gmail.com> 2025-05-21 12:38:55 -0400
committer: A.J. Shulman <Shulman.aj@gmail.com> 2025-05-21 12:38:55 -0400
commit: 0e98320d3b237f1927b9f1367494dccd7f66eda9 (patch)
tree: 112fc95b0dfd2da8a93a37bbb2e1139067c993bd /src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
parent: 9437753fdebfc7c4b172eeda53610c08abe7287a (diff)
1 files changed, 4 insertions, 2 deletions
diff --git a/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts b/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
index 495a985cb..727d35e2c 100644
--- a/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
+++ b/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
@@ -22,6 +22,7 @@ const websiteInfoScraperToolInfo: ToolInfo<WebsiteInfoScraperToolParamsType> = {
     name: 'websiteInfoScraper',
     description: 'Scrape detailed information from specific websites relevant to the user query. Returns the text content of the webpages for further analysis and grounding.',
     citationRules: `
+      !IMPORTANT! THESE CHUNKS REPLACE THE CHUNKS THAT ARE RETURNED FROM THE SEARCHTOOL.
       Your task is to provide a comprehensive response to the user's prompt using the content scraped from relevant websites. Ensure you follow these guidelines for structuring your response:
 
       1. Grounded Text Tag Structure:
@@ -88,6 +89,7 @@ export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParam
         console.log(url);
         console.log(chunkDoc);
         console.log(chunkDoc.data);
+        const id = chunkDoc.id;
         // Validate URL format
         try {
             new URL(url); // This will throw if URL is invalid
@@ -130,7 +132,7 @@ export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParam
                     if (retryCount === maxRetries) {
                         return {
                             type: 'text',
-                            text: `<chunk chunk_id="${chunkDoc.id}" chunk_type="url">\n${website_plain_text}\nNote: Limited content was retrieved from this URL.\n</chunk>`,
+                            text: `<chunk chunk_id="${id}" chunk_type="url">\n${website_plain_text}\nNote: Limited content was retrieved from this URL.\n</chunk>`,
                         } as Observation;
                     }
 
@@ -142,7 +144,7 @@ export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParam
                 // Process and return content if it looks good
                 return {
                     type: 'text',
-                    text: `<chunk chunk_id="${chunkDoc.id}" chunk_type="url">\n${website_plain_text}\n</chunk>`,
+                    text: `<chunk chunk_id="${id}" chunk_type="url">\n${website_plain_text}\n</chunk>`,
                 } as Observation;
             } catch (error) {
                 lastError = error instanceof Error ? error.message : 'Unknown error';
author	A.J. Shulman <Shulman.aj@gmail.com>	2025-05-21 12:38:55 -0400
committer	A.J. Shulman <Shulman.aj@gmail.com>	2025-05-21 12:38:55 -0400
commit	0e98320d3b237f1927b9f1367494dccd7f66eda9 (patch)
tree	112fc95b0dfd2da8a93a37bbb2e1139067c993bd /src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
parent	9437753fdebfc7c4b172eeda53610c08abe7287a (diff)