New attempt at citation unification: Vectorstore now registers document and chunk IDs through AgentDocumentManager

author: A.J. Shulman <Shulman.aj@gmail.com> 2025-04-27 13:14:49 -0400
committer: A.J. Shulman <Shulman.aj@gmail.com> 2025-04-27 13:14:49 -0400
commit: 3ef3d40506348d9fd537cc8f4aea975b9770689f (patch)
tree: afe779e8240e88c8b20ff6b68ac45840a927ee76 /src/client/views/nodes/chatbot/vectorstore
parent: 5ce2263849bfb901e276a4c5fc8ca2dbd8b80350 (diff)
1 files changed, 90 insertions, 40 deletions
diff --git a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
index afd34f28d..4bb61d8b2 100644
--- a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
+++ b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
@@ -15,7 +15,7 @@ import { Networking } from '../../../../Network';
 import { AI_Document, CHUNK_TYPE, RAGChunk } from '../types/types';
 import OpenAI from 'openai';
 import { Embedding } from 'openai/resources';
-import { PineconeEnvironmentVarsNotSupportedError } from '@pinecone-database/pinecone/dist/errors';
+import { AgentDocumentManager } from '../utils/AgentDocumentManager';
 
 dotenv.config();
 
@@ -29,7 +29,7 @@ export class Vectorstore {
     private openai: OpenAI; // OpenAI client for generating embeddings.
     private indexName: string = 'pdf-chatbot'; // Default name for the index.
     private _id: string; // Unique ID for the Vectorstore instance.
-    private _doc_ids: () => string[]; // List of document IDs handled by this instance.
+    private docManager: AgentDocumentManager; // Document manager for handling documents
 
     documents: AI_Document[] = []; // Store the documents indexed in the vectorstore.
 
@@ -37,9 +37,9 @@ export class Vectorstore {
      * Initializes the Pinecone and OpenAI clients, sets up the document ID list,
      * and initializes the Pinecone index.
      * @param id The unique identifier for the vectorstore instance.
-     * @param doc_ids A function that returns a list of document IDs.
+     * @param docManager An instance of AgentDocumentManager to handle document management.
      */
-    constructor(id: string, doc_ids: () => string[]) {
+    constructor(id: string, docManager: AgentDocumentManager) {
         const pineconeApiKey = process.env.PINECONE_API_KEY;
         if (!pineconeApiKey) {
             throw new Error('PINECONE_API_KEY is not defined.');
@@ -49,7 +49,7 @@ export class Vectorstore {
         this.pinecone = new Pinecone({ apiKey: pineconeApiKey });
         this.openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY, dangerouslyAllowBrowser: true });
         this._id = id;
-        this._doc_ids = doc_ids;
+        this.docManager = docManager;
         this.initializeIndex();
     }
 
@@ -109,15 +109,25 @@ export class Vectorstore {
 
             const isAudioOrVideo = local_file_path.endsWith('.mp3') || local_file_path.endsWith('.mp4');
             let result: AI_Document & { doc_id: string };
+
             if (isAudioOrVideo) {
                 console.log('Processing media file...');
                 const response = await Networking.PostToServer('/processMediaFile', { fileName: path.basename(local_file_path) });
-                const segmentedTranscript = response.condensed;
+
+                // Type assertion to handle the response properties
+                const typedResponse = response as {
+                    condensed: Array<{ text: string; indexes: string[]; start: number; end: number }>;
+                    full: Array<unknown>;
+                    summary: string;
+                };
+
+                const segmentedTranscript = typedResponse.condensed;
                 console.log(segmentedTranscript);
-                const summary = response.summary;
+                const summary = typedResponse.summary;
                 doc.summary = summary;
+
                 // Generate embeddings for each chunk
-                const texts = segmentedTranscript.map((chunk: any) => chunk.text);
+                const texts = segmentedTranscript.map(chunk => chunk.text);
 
                 try {
                     const embeddingsResponse = await this.openai.embeddings.create({
@@ -126,10 +136,19 @@ export class Vectorstore {
                         encoding_format: 'float',
                     });
 
-                    doc.original_segments = JSON.stringify(response.full);
+                    doc.original_segments = JSON.stringify(typedResponse.full);
                     doc.ai_type = local_file_path.endsWith('.mp3') ? 'audio' : 'video';
                     const doc_id = uuidv4();
 
+                    // Register the document with the AgentDocumentManager
+                    this.docManager.addCustomId(doc, doc_id);
+
+                    // Generate chunk IDs upfront so we can register them
+                    const chunkIds = segmentedTranscript.map(() => uuidv4());
+
+                    // Register all chunk IDs with the document manager
+                    this.docManager.registerChunkIds(doc_id, chunkIds);
+
                     // Add transcript and embeddings to metadata
                     result = {
                         doc_id,
@@ -137,13 +156,13 @@ export class Vectorstore {
                         file_name: local_file_path,
                         num_pages: 0,
                         summary: '',
-                        chunks: segmentedTranscript.map((chunk: any, index: number) => ({
-                            id: uuidv4(),
+                        chunks: segmentedTranscript.map((chunk, index) => ({
+                            id: chunkIds[index], // Use pre-generated chunk ID
                             values: (embeddingsResponse.data as Embedding[])[index].embedding, // Assign embedding
                             metadata: {
                                 indexes: chunk.indexes,
                                 original_document: local_file_path,
-                                doc_id: doc_id,
+                                doc_id: doc_id, // Ensure doc_id is consistent
                                 file_path: local_file_path,
                                 start_time: chunk.start,
                                 end_time: chunk.end,
@@ -159,20 +178,24 @@ export class Vectorstore {
                 }
 
                 doc.segmented_transcript = JSON.stringify(segmentedTranscript);
-                // Simplify chunks for storage
+                // Simplify chunks for storage - ensure simplified chunks use EXACTLY the same IDs
                 const simplifiedChunks = result.chunks.map(chunk => ({
-                    chunkId: chunk.id,
+                    chunkId: chunk.id, // Use the exact same ID as the full chunk
                     start_time: chunk.metadata.start_time,
                     end_time: chunk.metadata.end_time,
                     indexes: chunk.metadata.indexes,
                     chunkType: CHUNK_TYPE.VIDEO,
                     text: chunk.metadata.text,
+                    doc_id: chunk.metadata.doc_id, // Include parent doc_id for completeness
                 }));
                 doc.chunk_simpl = JSON.stringify({ chunks: simplifiedChunks });
             } else {
-                // Existing document processing logic remains unchanged
+                // Process regular document
                 console.log('Processing regular document...');
-                const { jobId } = await Networking.PostToServer('/createDocument', { file_path: local_file_path });
+                const createDocumentResponse = await Networking.PostToServer('/createDocument', { file_path: local_file_path });
+
+                // Type assertion for the response
+                const { jobId } = createDocumentResponse as { jobId: string };
 
                 while (true) {
                     await new Promise(resolve => setTimeout(resolve, 2000));
@@ -188,6 +211,16 @@ export class Vectorstore {
                         progressCallback(progressResponseJson.progress, progressResponseJson.step);
                     }
                 }
+
+                // Register the document with the AgentDocumentManager
+                this.docManager.addCustomId(doc, result.doc_id);
+
+                // Collect all chunk IDs
+                const chunkIds = result.chunks.map(chunk => chunk.id);
+
+                // Register chunks with the document manager
+                this.docManager.registerChunkIds(result.doc_id, chunkIds);
+
                 if (!doc.chunk_simpl) {
                     doc.chunk_simpl = JSON.stringify({ chunks: [] });
                 }
@@ -196,12 +229,13 @@ export class Vectorstore {
 
                 result.chunks.forEach((chunk: RAGChunk) => {
                     const chunkToAdd = {
-                        chunkId: chunk.id,
+                        chunkId: chunk.id, // Ensure we use the exact same ID
                         startPage: chunk.metadata.start_page,
                         endPage: chunk.metadata.end_page,
                         location: chunk.metadata.location,
                         chunkType: chunk.metadata.type as CHUNK_TYPE,
                         text: chunk.metadata.text,
+                        doc_id: chunk.metadata.doc_id, // Include parent doc_id for consistency
                     };
                     const new_chunk_simpl = JSON.parse(StrCast(doc.chunk_simpl));
                     new_chunk_simpl.chunks = new_chunk_simpl.chunks.concat(chunkToAdd);
@@ -298,39 +332,55 @@ export class Vectorstore {
 
             let queryEmbedding = queryEmbeddingResponse.data[0].embedding;
 
-            // Extract the embedding from the response.
+            // Get document IDs from the AgentDocumentManager
+            const docIds = Array.from(this.docManager.listDocs());
+            console.log('Using document IDs for retrieval:', docIds);
 
-            console.log(this._doc_ids());
             // Query the Pinecone index using the embedding and filter by document IDs.
+            // We'll query based on document IDs that are registered in the document manager
             const queryResponse: QueryResponse = await this.index.query({
                 vector: queryEmbedding,
                 filter: {
-                    doc_id: { $in: this._doc_ids() },
+                    doc_id: { $in: docIds },
                 },
                 topK,
                 includeValues: true,
                 includeMetadata: true,
             });
-            console.log(queryResponse);
-
-            // Map the results into RAGChunks and return them.
-            return queryResponse.matches.map(
-                match =>
-                    ({
-                        id: match.id,
-                        values: match.values as number[],
-                        metadata: match.metadata as {
-                            text: string;
-                            type: string;
-                            original_document: string;
-                            file_path: string;
-                            doc_id: string;
-                            location: string;
-                            start_page: number;
-                            end_page: number;
-                        },
-                    }) as RAGChunk
-            );
+            console.log(`Found ${queryResponse.matches.length} matching chunks`);
+
+            // For each retrieved chunk, make sure its chunk ID is mapped to its parent document in the document manager.
+            // This maintains compatibility with existing code; chunks whose doc_id is unknown to the manager are returned without being registered.
+            const processedMatches = queryResponse.matches.map(match => {
+                const chunk = {
+                    id: match.id,
+                    values: match.values as number[],
+                    metadata: match.metadata as {
+                        text: string;
+                        type: string;
+                        original_document: string;
+                        file_path: string;
+                        doc_id: string;
+                        location: string;
+                        start_page: number;
+                        end_page: number;
+                    },
+                } as RAGChunk;
+
+                // Ensure the document manager knows about this chunk
+                // This is important for maintaining backwards compatibility
+                if (chunk.id && !this.docManager.getDocIdByChunkId(chunk.id)) {
+                    // If the chunk ID isn't registered but we have a doc_id in metadata
+                    if (chunk.metadata.doc_id && this.docManager.has(chunk.metadata.doc_id)) {
+                        // Register the chunk with its parent document
+                        this.docManager.registerChunkIds(chunk.metadata.doc_id, [chunk.id]);
+                    }
+                }
+
+                return chunk;
+            });
+
+            return processedMatches;
         } catch (error) {
             console.error(`Error retrieving chunks: ${error}`);
             return [];
author	A.J. Shulman <Shulman.aj@gmail.com>	2025-04-27 13:14:49 -0400
committer	A.J. Shulman <Shulman.aj@gmail.com>	2025-04-27 13:14:49 -0400
commit	3ef3d40506348d9fd537cc8f4aea975b9770689f (patch)
tree	afe779e8240e88c8b20ff6b68ac45840a927ee76 /src/client/views/nodes/chatbot/vectorstore
parent	5ce2263849bfb901e276a4c5fc8ca2dbd8b80350 (diff)