Made it so chunk IDs are separately managed and made sure the doc ID is consistent and not created in the Python spawn

author: A.J. Shulman <Shulman.aj@gmail.com> 2025-05-11 13:42:00 -0400
committer: A.J. Shulman <Shulman.aj@gmail.com> 2025-05-11 13:42:00 -0400
commit: a5d7f5c38192b91b7df3bd6ecace5ba7365449a6 (patch)
tree: c6be94f983b5fcc65424b81d42ddb0718127404c /src/client/views/nodes/chatbot/vectorstore
parent: 3c28aa3a706869d818bc8a089e8d1a53f7234bc0 (diff)
1 files changed, 15 insertions, 18 deletions
diff --git a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
index 3df1294e9..1349df483 100644
--- a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
+++ b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
@@ -16,6 +16,7 @@ import { AI_Document, CHUNK_TYPE, RAGChunk } from '../types/types';
 import OpenAI from 'openai';
 import { Embedding } from 'openai/resources';
 import { AgentDocumentManager } from '../utils/AgentDocumentManager';
+import { Id } from '../../../../../fields/FieldSymbols';
 
 dotenv.config();
 
@@ -24,13 +25,12 @@ dotenv.config();
  * and OpenAI text-embedding-3-large for text embedding. It handles AI document management, uploads, and query-based retrieval.
  */
 export class Vectorstore {
-    private pinecone: Pinecone; // Pinecone client for managing the vector index.
+    private pinecone!: Pinecone; // Pinecone client for managing the vector index.
     private index!: Index; // The specific Pinecone index used for document chunks.
-    private openai: OpenAI; // OpenAI client for generating embeddings.
+    private openai!: OpenAI; // OpenAI client for generating embeddings.
     private indexName: string = 'pdf-chatbot'; // Default name for the index.
-    private _id: string; // Unique ID for the Vectorstore instance.
-    private docManager: AgentDocumentManager; // Document manager for handling documents
-
+    private _id!: string; // Unique ID for the Vectorstore instance.
+    private docManager!: AgentDocumentManager; // Document manager for handling documents
     documents: AI_Document[] = []; // Store the documents indexed in the vectorstore.
 
     /**
@@ -143,10 +143,8 @@ export class Vectorstore {
                     progressCallback(85, 'Embeddings generated. Finalizing document...');
 
                     doc.original_segments = JSON.stringify(typedResponse.full);
-                    const doc_id = uuidv4();
-
-                    // Register the document with the AgentDocumentManager
-                    this.docManager.addCustomId(doc, doc_id);
+                    const doc_id = doc[Id];
+                    console.log('doc_id in vectorstore', doc_id);
 
                     // Generate chunk IDs upfront so we can register them
                     const chunkIds = segmentedTranscript.map(() => uuidv4());
@@ -191,7 +189,7 @@ export class Vectorstore {
             } else {
                 // Process regular document
                 console.log('Processing regular document...');
-                const createDocumentResponse = await Networking.PostToServer('/createDocument', { file_path: local_file_path });
+                const createDocumentResponse = await Networking.PostToServer('/createDocument', { file_path: local_file_path, doc_id: doc[Id] });
 
                 // Type assertion for the response
                 const { jobId } = createDocumentResponse as { jobId: string };
@@ -211,12 +209,13 @@ export class Vectorstore {
                     }
                 }
 
-                // Register the document with the AgentDocumentManager
-                this.docManager.addCustomId(doc, result.doc_id);
-
                 // Collect all chunk IDs
                 const chunkIds = result.chunks.map(chunk => chunk.id);
 
+                if (result.doc_id !== doc[Id]) {
+                    console.log('doc_id in vectorstore', result.doc_id, 'does not match doc_id in doc', doc[Id]);
+                }
+
                 // Register chunks with the document manager
                 this.docManager.registerChunkIds(result.doc_id, chunkIds);
 
@@ -319,16 +318,14 @@ export class Vectorstore {
 
             const queryEmbedding = queryEmbeddingResponse.data[0].embedding;
 
-            // Get document IDs from the AgentDocumentManager
-            const docIds = Array.from(this.docManager.listDocs());
-            console.log('Using document IDs for retrieval:', docIds);
+            console.log('Using document IDs for retrieval:', this.docManager.docIds);
 
             // Query the Pinecone index using the embedding and filter by document IDs.
             // We'll query based on document IDs that are registered in the document manager
             const queryResponse: QueryResponse = await this.index.query({
                 vector: queryEmbedding,
                 filter: {
-                    doc_id: { $in: docIds },
+                    doc_id: { $in: this.docManager.docIds },
                 },
                 topK,
                 includeValues: true,
@@ -356,7 +353,7 @@ export class Vectorstore {
 
                 // Ensure the document manager knows about this chunk
                 // This is important for maintaining backwards compatibility
-                if (chunk.id && !this.docManager.getDocIdByChunkId(chunk.id)) {
+                if (chunk.id && !this.docManager.getDocByChunkId(chunk.id)) {
                     // If the chunk ID isn't registered but we have a doc_id in metadata
                     if (chunk.metadata.doc_id && this.docManager.has(chunk.metadata.doc_id)) {
                         // Register the chunk with its parent document
author	A.J. Shulman <Shulman.aj@gmail.com>	2025-05-11 13:42:00 -0400
committer	A.J. Shulman <Shulman.aj@gmail.com>	2025-05-11 13:42:00 -0400
commit	a5d7f5c38192b91b7df3bd6ecace5ba7365449a6 (patch)
tree	c6be94f983b5fcc65424b81d42ddb0718127404c /src/client/views/nodes/chatbot/vectorstore
parent	3c28aa3a706869d818bc8a089e8d1a53f7234bc0 (diff)