aboutsummaryrefslogtreecommitdiff
path: root/src/client/views/nodes/chatbot/vectorstore
diff options
context:
space:
mode:
author: A.J. Shulman <Shulman.aj@gmail.com> 2025-05-11 13:42:00 -0400
committer: A.J. Shulman <Shulman.aj@gmail.com> 2025-05-11 13:42:00 -0400
commita5d7f5c38192b91b7df3bd6ecace5ba7365449a6 (patch)
treec6be94f983b5fcc65424b81d42ddb0718127404c /src/client/views/nodes/chatbot/vectorstore
parent3c28aa3a706869d818bc8a089e8d1a53f7234bc0 (diff)
Made it so chunk Ids are separately managed and made sure the doc id is consistent and not created in python spawn
Diffstat (limited to 'src/client/views/nodes/chatbot/vectorstore')
-rw-r--r--src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts33
1 files changed, 15 insertions, 18 deletions
diff --git a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
index 3df1294e9..1349df483 100644
--- a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
+++ b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
@@ -16,6 +16,7 @@ import { AI_Document, CHUNK_TYPE, RAGChunk } from '../types/types';
import OpenAI from 'openai';
import { Embedding } from 'openai/resources';
import { AgentDocumentManager } from '../utils/AgentDocumentManager';
+import { Id } from '../../../../../fields/FieldSymbols';
dotenv.config();
@@ -24,13 +25,12 @@ dotenv.config();
* and OpenAI text-embedding-3-large for text embedding. It handles AI document management, uploads, and query-based retrieval.
*/
export class Vectorstore {
- private pinecone: Pinecone; // Pinecone client for managing the vector index.
+ private pinecone!: Pinecone; // Pinecone client for managing the vector index.
private index!: Index; // The specific Pinecone index used for document chunks.
- private openai: OpenAI; // OpenAI client for generating embeddings.
+ private openai!: OpenAI; // OpenAI client for generating embeddings.
private indexName: string = 'pdf-chatbot'; // Default name for the index.
- private _id: string; // Unique ID for the Vectorstore instance.
- private docManager: AgentDocumentManager; // Document manager for handling documents
-
+ private _id!: string; // Unique ID for the Vectorstore instance.
+ private docManager!: AgentDocumentManager; // Document manager for handling documents
documents: AI_Document[] = []; // Store the documents indexed in the vectorstore.
/**
@@ -143,10 +143,8 @@ export class Vectorstore {
progressCallback(85, 'Embeddings generated. Finalizing document...');
doc.original_segments = JSON.stringify(typedResponse.full);
- const doc_id = uuidv4();
-
- // Register the document with the AgentDocumentManager
- this.docManager.addCustomId(doc, doc_id);
+ const doc_id = doc[Id];
+ console.log('doc_id in vectorstore', doc_id);
// Generate chunk IDs upfront so we can register them
const chunkIds = segmentedTranscript.map(() => uuidv4());
@@ -191,7 +189,7 @@ export class Vectorstore {
} else {
// Process regular document
console.log('Processing regular document...');
- const createDocumentResponse = await Networking.PostToServer('/createDocument', { file_path: local_file_path });
+ const createDocumentResponse = await Networking.PostToServer('/createDocument', { file_path: local_file_path, doc_id: doc[Id] });
// Type assertion for the response
const { jobId } = createDocumentResponse as { jobId: string };
@@ -211,12 +209,13 @@ export class Vectorstore {
}
}
- // Register the document with the AgentDocumentManager
- this.docManager.addCustomId(doc, result.doc_id);
-
// Collect all chunk IDs
const chunkIds = result.chunks.map(chunk => chunk.id);
+ if (result.doc_id !== doc[Id]) {
+ console.log('doc_id in vectorstore', result.doc_id, 'does not match doc_id in doc', doc[Id]);
+ }
+
// Register chunks with the document manager
this.docManager.registerChunkIds(result.doc_id, chunkIds);
@@ -319,16 +318,14 @@ export class Vectorstore {
const queryEmbedding = queryEmbeddingResponse.data[0].embedding;
- // Get document IDs from the AgentDocumentManager
- const docIds = Array.from(this.docManager.listDocs());
- console.log('Using document IDs for retrieval:', docIds);
+ console.log('Using document IDs for retrieval:', this.docManager.docIds);
// Query the Pinecone index using the embedding and filter by document IDs.
// We'll query based on document IDs that are registered in the document manager
const queryResponse: QueryResponse = await this.index.query({
vector: queryEmbedding,
filter: {
- doc_id: { $in: docIds },
+ doc_id: { $in: this.docManager.docIds },
},
topK,
includeValues: true,
@@ -356,7 +353,7 @@ export class Vectorstore {
// Ensure the document manager knows about this chunk
// This is important for maintaining backwards compatibility
- if (chunk.id && !this.docManager.getDocIdByChunkId(chunk.id)) {
+ if (chunk.id && !this.docManager.getDocByChunkId(chunk.id)) {
// If the chunk ID isn't registered but we have a doc_id in metadata
if (chunk.metadata.doc_id && this.docManager.has(chunk.metadata.doc_id)) {
// Register the chunk with its parent document