about summary refs log tree commit diff
path: root/src/client/views/nodes/ChatBox/vectorstore
diff options
context:
space:
mode:
author    A.J. Shulman <Shulman.aj@gmail.com> 2024-09-07 11:48:36 -0400
committer A.J. Shulman <Shulman.aj@gmail.com> 2024-09-07 11:48:36 -0400
commit 210f8f5f1cd19e9416a12524cce119b273334fd3 (patch)
tree   02268cd69abc868c428f42e8d57812a4f29be1a7 /src/client/views/nodes/ChatBox/vectorstore
parent 0b3c2ed595b85391e9833a3b7710d2169439a582 (diff)
reorganized parsers, added comments to vectorstore, and added citation popup for text citations
Diffstat (limited to 'src/client/views/nodes/ChatBox/vectorstore')
-rw-r--r-- src/client/views/nodes/ChatBox/vectorstore/Vectorstore.ts  125
1 file changed, 87 insertions, 38 deletions
diff --git a/src/client/views/nodes/ChatBox/vectorstore/Vectorstore.ts b/src/client/views/nodes/ChatBox/vectorstore/Vectorstore.ts
index b5145c1f7..cc3b1ccd5 100644
--- a/src/client/views/nodes/ChatBox/vectorstore/Vectorstore.ts
+++ b/src/client/views/nodes/ChatBox/vectorstore/Vectorstore.ts
@@ -2,47 +2,55 @@ import { Pinecone, Index, IndexList, PineconeRecord, RecordMetadata, QueryRespon
import { CohereClient } from 'cohere-ai';
import { EmbedResponse } from 'cohere-ai/api';
import dotenv from 'dotenv';
-import axios from 'axios';
-import { SimplifiedChunk } from '../types';
-
import { RAGChunk, AI_Document, CHUNK_TYPE } from '../types';
import { Doc } from '../../../../../fields/Doc';
-import { DocData } from '../../../../../fields/DocSymbols';
import { CsvCast, PDFCast, StrCast } from '../../../../../fields/Types';
import { Networking } from '../../../../Network';
dotenv.config();
+/**
+ * The Vectorstore class integrates with Pinecone for vector-based document indexing and retrieval,
+ * and Cohere for text embedding. It handles AI document management, uploads, and query-based retrieval.
+ */
export class Vectorstore {
- private pinecone: Pinecone;
- private index!: Index;
- private cohere: CohereClient;
- private indexName: string = 'pdf-chatbot';
- private _id: string;
- private _doc_ids: string[] = [];
+ private pinecone: Pinecone; // Pinecone client for managing the vector index.
+ private index!: Index; // The specific Pinecone index used for document chunks.
+ private cohere: CohereClient; // Cohere client for generating embeddings.
+ private indexName: string = 'pdf-chatbot'; // Default name for the index.
+ private _id: string; // Unique ID for the Vectorstore instance.
+ private _doc_ids: string[] = []; // List of document IDs handled by this instance.
- documents: AI_Document[] = [];
+ documents: AI_Document[] = []; // Store the documents indexed in the vectorstore.
+ /**
+ * Constructor initializes the Pinecone and Cohere clients, sets up the document ID list,
+ * and initializes the Pinecone index.
+ * @param id The unique identifier for the vectorstore instance.
+ * @param doc_ids A function that returns a list of document IDs.
+ */
constructor(id: string, doc_ids: () => string[]) {
const pineconeApiKey = process.env.PINECONE_API_KEY;
if (!pineconeApiKey) {
throw new Error('PINECONE_API_KEY is not defined.');
}
- this.pinecone = new Pinecone({
- apiKey: pineconeApiKey,
- });
- this.cohere = new CohereClient({
- token: process.env.COHERE_API_KEY,
- });
+ // Initialize Pinecone and Cohere clients with API keys from the environment.
+ this.pinecone = new Pinecone({ apiKey: pineconeApiKey });
+ this.cohere = new CohereClient({ token: process.env.COHERE_API_KEY });
this._id = id;
this._doc_ids = doc_ids();
this.initializeIndex();
}
+ /**
+ * Initializes the Pinecone index by checking if it exists, and creating it if not.
+ * The index is set to use the cosine metric for vector similarity.
+ */
private async initializeIndex() {
const indexList: IndexList = await this.pinecone.listIndexes();
+ // Check if the index already exists, otherwise create it.
if (!indexList.indexes?.some(index => index.name === this.indexName)) {
await this.pinecone.createIndex({
name: this.indexName,
@@ -57,62 +65,76 @@ export class Vectorstore {
});
}
+ // Set the index for future use.
this.index = this.pinecone.Index(this.indexName);
}
+ /**
+ * Adds an AI document to the vectorstore. This method handles document chunking, uploading to the
+ * vectorstore, and updating the progress for long-running tasks like file uploads.
+ * @param doc The document to be added to the vectorstore.
+ * @param progressCallback Callback to update the progress of the upload.
+ */
async addAIDoc(doc: Doc, progressCallback: (progress: number, step: string) => void) {
console.log('Adding AI Document:', doc);
const ai_document_status: string = StrCast(doc.ai_document_status);
- if (ai_document_status !== undefined && ai_document_status !== null && ai_document_status.trim() !== '' && ai_document_status !== '{}') {
+ // Skip if the document is already in progress or completed.
+ if (ai_document_status !== undefined && ai_document_status.trim() !== '' && ai_document_status !== '{}') {
if (ai_document_status === 'IN PROGRESS') {
console.log('Already in progress.');
return;
}
- if (!this._doc_ids.includes(StrCast(doc.ai_doc_id))) this._doc_ids.push(StrCast(doc.ai_doc_id));
+ if (!this._doc_ids.includes(StrCast(doc.ai_doc_id))) {
+ this._doc_ids.push(StrCast(doc.ai_doc_id));
+ }
} else {
+ // Start processing the document.
doc.ai_document_status = 'PROGRESS';
console.log(doc);
+
+ // Get the local file path (CSV or PDF).
const local_file_path: string = CsvCast(doc.data)?.url?.pathname ?? PDFCast(doc.data)?.url?.pathname;
console.log('Local File Path:', local_file_path);
if (local_file_path) {
console.log('Creating AI Document...');
- // Start the document creation process
+ // Start the document creation process by sending the file to the server.
const { jobId } = await Networking.PostToServer('/createDocument', { file_path: local_file_path });
- // Poll the server for progress updates
+ // Poll the server for progress updates.
let inProgress: boolean = true;
let result: any = null;
while (inProgress) {
- await new Promise(resolve => setTimeout(resolve, 2000)); // Polling interval
+ // Polling interval for status updates.
+ await new Promise(resolve => setTimeout(resolve, 2000));
+ // Check if the job is completed.
const resultResponse = await Networking.FetchFromServer(`/getResult/${jobId}`);
const resultResponseJson = JSON.parse(resultResponse);
- //console.log('Result Response:', resultResponseJson);
if (resultResponseJson.status === 'completed') {
console.log('Result here:', resultResponseJson);
result = resultResponseJson;
break;
}
+ // Fetch progress information and update the progress callback.
const progressResponse = await Networking.FetchFromServer(`/getProgress/${jobId}`);
const progressResponseJson = JSON.parse(progressResponse);
- //console.log('Progress Response:', progressResponseJson);
-
if (progressResponseJson) {
- console.log('Progress:', progressResponseJson);
const progress = progressResponseJson.progress;
const step = progressResponseJson.step;
progressCallback(progress, step);
}
}
- // Process the final document result
+ // Once completed, process the document and add it to the vectorstore.
console.log('Document JSON:', result);
this.documents.push(result);
- await this.indexDocument(JSON.parse(JSON.stringify(result, (key, value) => (value === null || value === undefined ? undefined : value))));
+ await this.indexDocument(result);
console.log(`Document added: ${result.file_name}`);
+
+ // Update document metadata such as summary, purpose, and vectorstore ID.
doc.summary = result.summary;
doc.ai_doc_id = result.doc_id;
this._doc_ids.push(result.doc_id);
@@ -128,6 +150,7 @@ export class Vectorstore {
doc.chunk_simpl = JSON.stringify({ chunks: [] });
}
+ // Process each chunk of the document and update the document's chunk_simpl field.
result.chunks.forEach((chunk: RAGChunk) => {
const chunkToAdd = {
chunkId: chunk.id,
@@ -142,27 +165,41 @@ export class Vectorstore {
doc.chunk_simpl = JSON.stringify(new_chunk_simpl);
});
+ // Mark the document status as completed.
doc.ai_document_status = 'COMPLETED';
}
}
}
+ /**
+ * Indexes the processed document by uploading the document's vector chunks to the Pinecone index.
+ * @param document The processed document containing its chunks and metadata.
+ */
private async indexDocument(document: any) {
console.log('Uploading vectors to content namespace...');
- const pineconeRecords: PineconeRecord[] = (document.chunks as RAGChunk[]).map(
- chunk =>
- ({
- id: chunk.id,
- values: chunk.values,
- metadata: { ...chunk.metadata } as RecordMetadata,
- }) as PineconeRecord
- );
+
+ // Prepare Pinecone records for each chunk in the document.
+ const pineconeRecords: PineconeRecord[] = (document.chunks as RAGChunk[]).map(chunk => ({
+ id: chunk.id,
+ values: chunk.values,
+ metadata: { ...chunk.metadata } as RecordMetadata,
+ }));
+
+ // Upload the records to Pinecone.
await this.index.upsert(pineconeRecords);
}
- async retrieve(query: string, topK: number = 10): Promise {
+ /**
+ * Retrieves the top K document chunks relevant to the user's query.
+ * This involves embedding the query using Cohere, then querying Pinecone for matching vectors.
+ * @param query The search query string.
+ * @param topK The number of top results to return (default is 10).
+ * @returns A list of document chunks that match the query.
+ */
+ async retrieve(query: string, topK: number = 10): Promise<RAGChunk[]> {
console.log(`Retrieving chunks for query: ${query}`);
try {
+ // Generate an embedding for the query using Cohere.
const queryEmbeddingResponse: EmbedResponse = await this.cohere.embed({
texts: [query],
model: 'embed-english-v3.0',
@@ -171,6 +208,7 @@ export class Vectorstore {
let queryEmbedding: number[];
+ // Extract the embedding from the response.
if (Array.isArray(queryEmbeddingResponse.embeddings)) {
queryEmbedding = queryEmbeddingResponse.embeddings[0];
} else if (queryEmbeddingResponse.embeddings && 'embeddings' in queryEmbeddingResponse.embeddings) {
@@ -183,6 +221,7 @@ export class Vectorstore {
throw new Error('Query embedding is not an array');
}
+ // Query the Pinecone index using the embedding and filter by document IDs.
const queryResponse: QueryResponse = await this.index.query({
vector: queryEmbedding,
filter: {
@@ -193,12 +232,22 @@ export class Vectorstore {
includeMetadata: true,
});
+ // Map the results into RAGChunks and return them.
return queryResponse.matches.map(
match =>
({
id: match.id,
values: match.values as number[],
- metadata: match.metadata as { text: string; type: string; original_document: string; file_path: string; doc_id: string; location: string; start_page: number; end_page: number },
+ metadata: match.metadata as {
+ text: string;
+ type: string;
+ original_document: string;
+ file_path: string;
+ doc_id: string;
+ location: string;
+ start_page: number;
+ end_page: number;
+ },
}) as RAGChunk
);
} catch (error) {