diff options
| author | A.J. Shulman <Shulman.aj@gmail.com> | 2025-05-11 13:42:00 -0400 |
|---|---|---|
| committer | A.J. Shulman <Shulman.aj@gmail.com> | 2025-05-11 13:42:00 -0400 |
| commit | a5d7f5c38192b91b7df3bd6ecace5ba7365449a6 (patch) | |
| tree | c6be94f983b5fcc65424b81d42ddb0718127404c /src/client/views/nodes/chatbot/vectorstore | |
| parent | 3c28aa3a706869d818bc8a089e8d1a53f7234bc0 (diff) | |
Made it so chunk IDs are separately managed and made sure the doc ID is consistent and not created in the Python spawn
Diffstat (limited to 'src/client/views/nodes/chatbot/vectorstore')
| -rw-r--r-- | src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts | 33 |
1 files changed, 15 insertions, 18 deletions
diff --git a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts index 3df1294e9..1349df483 100644 --- a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts +++ b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts @@ -16,6 +16,7 @@ import { AI_Document, CHUNK_TYPE, RAGChunk } from '../types/types'; import OpenAI from 'openai'; import { Embedding } from 'openai/resources'; import { AgentDocumentManager } from '../utils/AgentDocumentManager'; +import { Id } from '../../../../../fields/FieldSymbols'; dotenv.config(); @@ -24,13 +25,12 @@ dotenv.config(); * and OpenAI text-embedding-3-large for text embedding. It handles AI document management, uploads, and query-based retrieval. */ export class Vectorstore { - private pinecone: Pinecone; // Pinecone client for managing the vector index. + private pinecone!: Pinecone; // Pinecone client for managing the vector index. private index!: Index; // The specific Pinecone index used for document chunks. - private openai: OpenAI; // OpenAI client for generating embeddings. + private openai!: OpenAI; // OpenAI client for generating embeddings. private indexName: string = 'pdf-chatbot'; // Default name for the index. - private _id: string; // Unique ID for the Vectorstore instance. - private docManager: AgentDocumentManager; // Document manager for handling documents - + private _id!: string; // Unique ID for the Vectorstore instance. + private docManager!: AgentDocumentManager; // Document manager for handling documents documents: AI_Document[] = []; // Store the documents indexed in the vectorstore. /** @@ -143,10 +143,8 @@ export class Vectorstore { progressCallback(85, 'Embeddings generated. 
Finalizing document...'); doc.original_segments = JSON.stringify(typedResponse.full); - const doc_id = uuidv4(); - - // Register the document with the AgentDocumentManager - this.docManager.addCustomId(doc, doc_id); + const doc_id = doc[Id]; + console.log('doc_id in vectorstore', doc_id); // Generate chunk IDs upfront so we can register them const chunkIds = segmentedTranscript.map(() => uuidv4()); @@ -191,7 +189,7 @@ export class Vectorstore { } else { // Process regular document console.log('Processing regular document...'); - const createDocumentResponse = await Networking.PostToServer('/createDocument', { file_path: local_file_path }); + const createDocumentResponse = await Networking.PostToServer('/createDocument', { file_path: local_file_path, doc_id: doc[Id] }); // Type assertion for the response const { jobId } = createDocumentResponse as { jobId: string }; @@ -211,12 +209,13 @@ export class Vectorstore { } } - // Register the document with the AgentDocumentManager - this.docManager.addCustomId(doc, result.doc_id); - // Collect all chunk IDs const chunkIds = result.chunks.map(chunk => chunk.id); + if (result.doc_id !== doc[Id]) { + console.log('doc_id in vectorstore', result.doc_id, 'does not match doc_id in doc', doc[Id]); + } + // Register chunks with the document manager this.docManager.registerChunkIds(result.doc_id, chunkIds); @@ -319,16 +318,14 @@ export class Vectorstore { const queryEmbedding = queryEmbeddingResponse.data[0].embedding; - // Get document IDs from the AgentDocumentManager - const docIds = Array.from(this.docManager.listDocs()); - console.log('Using document IDs for retrieval:', docIds); + console.log('Using document IDs for retrieval:', this.docManager.docIds); // Query the Pinecone index using the embedding and filter by document IDs. 
// We'll query based on document IDs that are registered in the document manager const queryResponse: QueryResponse = await this.index.query({ vector: queryEmbedding, filter: { - doc_id: { $in: docIds }, + doc_id: { $in: this.docManager.docIds }, }, topK, includeValues: true, @@ -356,7 +353,7 @@ export class Vectorstore { // Ensure the document manager knows about this chunk // This is important for maintaining backwards compatibility - if (chunk.id && !this.docManager.getDocIdByChunkId(chunk.id)) { + if (chunk.id && !this.docManager.getDocByChunkId(chunk.id)) { // If the chunk ID isn't registered but we have a doc_id in metadata if (chunk.metadata.doc_id && this.docManager.has(chunk.metadata.doc_id)) { // Register the chunk with its parent document |
