/**
 * @file Vectorstore.ts
 * @description This file defines the Vectorstore class, which integrates with Pinecone for vector-based document indexing and OpenAI's text-embedding-3-large model for text embeddings.
 * It manages AI document handling, including adding documents, processing media files, combining document chunks, indexing documents,
 * and retrieving relevant sections based on user queries.
 */
import { Index, IndexList, Pinecone, PineconeRecord, QueryResponse, RecordMetadata } from '@pinecone-database/pinecone';
import dotenv from 'dotenv';
import path from 'path';
import { v4 as uuidv4 } from 'uuid';
import { Doc } from '../../../../../fields/Doc';
import { AudioCast, CsvCast, PDFCast, StrCast, VideoCast } from '../../../../../fields/Types';
import { Networking } from '../../../../Network';
import { AI_Document, CHUNK_TYPE, RAGChunk } from '../types/types';
import OpenAI from 'openai';
import { Embedding } from 'openai/resources';
import { AgentDocumentManager } from '../utils/AgentDocumentManager';
import { Id } from '../../../../../fields/FieldSymbols';

dotenv.config();

/**
 * The Vectorstore class integrates with Pinecone for vector-based document indexing and retrieval,
 * and OpenAI's text-embedding-3-large model for text embedding. It handles AI document management, uploads, and query-based retrieval.
 */
export class Vectorstore {
    private pinecone!: Pinecone; // Pinecone client for managing the vector index.
    private index!: Index; // The specific Pinecone index used for document chunks.
    private summaryIndex!: Index; // The Pinecone index used for file summaries.
    private openai!: OpenAI; // OpenAI client for generating embeddings.
    private indexName: string = 'pdf-chatbot'; // Default name for the index.
    private summaryIndexName: string = 'file-summaries'; // Name for the summaries index.
    private _id!: string; // Unique ID for the Vectorstore instance.
    private docManager!: AgentDocumentManager; // Document manager for handling documents.
    private summaryCacheCount: number = 0; // Cached count of indexed file summaries.
    documents: AI_Document[] = []; // Store the documents indexed in the vectorstore.
    private debug: boolean = true; // Enable debug logging.
    private initialized: boolean = false; // Set once async initialization completes.

    /**
     * Initializes the Pinecone and OpenAI clients, sets up the document ID list,
     * and initializes the Pinecone index.
     * @param id The unique identifier for the vectorstore instance.
     * @param docManager An instance of AgentDocumentManager to handle document management.
     */
    constructor(id: string, docManager: AgentDocumentManager) {
        if (this.debug) console.log(`[DEBUG] Initializing Vectorstore with ID: ${id}`);
        const pineconeApiKey = process.env.PINECONE_API_KEY; // Read from the environment; never hardcode credentials in source.
        if (!pineconeApiKey) {
            console.log('PINECONE_API_KEY is not defined - Vectorstore will be unavailable');
            return;
        }
        // Initialize Pinecone and OpenAI clients with API keys from the environment.
        this.pinecone = new Pinecone({ apiKey: pineconeApiKey });
        this.openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY, dangerouslyAllowBrowser: true });
        this._id = id;
        this.docManager = docManager;
        // Run the async initialization sequence without blocking the constructor.
        this.initializeAsync(id);
    }

    /**
     * Handles async initialization of all components.
     */
    private async initializeAsync(id: string) {
        try {
            if (this.debug) console.log(`[DEBUG] Starting async initialization sequence for Vectorstore ID: ${id}`);
            // Initialize the main document index.
            await this.initializeIndex();
            // Initialize the summary index.
            await this.initializeSummaryIndex();
            this.initialized = true;
            if (this.debug) console.log(`[DEBUG] ✅ Vectorstore initialization complete, running test query...`);
            // Run a single test query instead of multiple.
            await this.runSingleTestQuery();
        } catch (error) {
            console.error('[ERROR] Failed to initialize Vectorstore:', error);
        }
    }

    /**
     * Fetches the list of known file paths from the server.
     */
    async getFileNames() {
        const response = await Networking.FetchFromServer('/getFileNames');
        const filepaths = JSON.parse(response);
        return filepaths;
    }

    /**
     * Initializes the Pinecone index by checking if it exists and creating it if necessary.
     * Sets the index to use cosine similarity for vector similarity calculations.
     */
    private async initializeIndex() {
        if (this.debug) console.log(`[DEBUG] Initializing main document index: ${this.indexName}`);
        const indexList: IndexList = await this.pinecone.listIndexes();
        if (this.debug) console.log(`[DEBUG] Available Pinecone indexes: ${indexList.indexes?.map(i => i.name).join(', ') || 'none'}`);
        // Check if the index already exists, otherwise create it.
        if (!indexList.indexes?.some(index => index.name === this.indexName)) {
            if (this.debug) console.log(`[DEBUG] Creating new index: ${this.indexName}`);
            await this.pinecone.createIndex({
                name: this.indexName,
                dimension: 3072, // Matches the output dimension of text-embedding-3-large.
                metric: 'cosine',
                spec: {
                    serverless: {
                        cloud: 'aws',
                        region: 'us-east-1',
                    },
                },
            });
            if (this.debug) console.log(`[DEBUG] ✅ Index ${this.indexName} created successfully`);
        } else {
            if (this.debug) console.log(`[DEBUG] ✅ Using existing index: ${this.indexName}`);
        }
        // Set the index for future use.
        this.index = this.pinecone.Index(this.indexName);
    }

    /**
     * Initializes the Pinecone index for file summaries.
     * Checks if it exists and creates it if necessary.
     */
    private async initializeSummaryIndex() {
        if (this.debug) console.log(`[DEBUG] Initializing file summaries index: ${this.summaryIndexName}`);
        const indexList: IndexList = await this.pinecone.listIndexes();
        // Check if the index already exists, otherwise create it.
        if (!indexList.indexes?.some(index => index.name === this.summaryIndexName)) {
            if (this.debug) console.log(`[DEBUG] Creating new summary index: ${this.summaryIndexName}`);
            await this.pinecone.createIndex({
                name: this.summaryIndexName,
                dimension: 3072,
                metric: 'cosine',
                spec: {
                    serverless: {
                        cloud: 'aws',
                        region: 'us-east-1',
                    },
                },
            });
            if (this.debug) console.log(`[DEBUG] ✅ Summary index ${this.summaryIndexName} created successfully`);
        } else {
            if (this.debug) console.log(`[DEBUG] ✅ Using existing summary index: ${this.summaryIndexName}`);
        }
        // Set the summaries index for future use.
        this.summaryIndex = this.pinecone.Index(this.summaryIndexName);
        // Check if we need to index the file summaries.
        await this.processFileSummaries();
    }
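    // A minimal usage sketch (comment only, not part of the class). Assumes an
    // existing AgentDocumentManager; `docManager` is an illustrative name.
    // Construction kicks off initializeAsync in the background, so callers
    // should wait for initialization to finish before issuing queries:
    //
    //   const store = new Vectorstore(uuidv4(), docManager);
    //   // ...once initializeAsync has completed:
    //   const hits = await store.searchFileSummaries('document indexing');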
    /**
     * Processes file summaries from the JSON file if needed.
     * Checks if the index contains the correct number of summaries before embedding.
     */
    private async processFileSummaries() {
        if (this.debug) console.log(`[DEBUG] Starting file summaries processing`);
        try {
            // Get file summaries from the server.
            if (this.debug) console.log(`[DEBUG] Fetching file summaries from server...`);
            const response = await Networking.FetchFromServer('/getFileSummaries');
            if (!response) {
                console.error('[ERROR] Failed to fetch file summaries');
                return;
            }
            if (this.debug) console.log(`[DEBUG] File summaries response received (${response.length} bytes)`);
            const summaries = JSON.parse(response);
            const filepaths = Object.keys(summaries);
            const summaryCount = filepaths.length;
            this.summaryCacheCount = summaryCount;
            if (this.debug) {
                console.log(`[DEBUG] File summaries parsed: ${summaryCount} files`);
                console.log(`[DEBUG] Sample filepaths: ${filepaths.slice(0, 3).join(', ')}...`);
                console.log(`[DEBUG] Sample summary: "${summaries[filepaths[0]].substring(0, 100)}..."`);
            }
            // Check whether the index already has the correct number of summaries.
            try {
                if (this.debug) console.log(`[DEBUG] Checking summary index stats...`);
                const indexStats = await this.summaryIndex.describeIndexStats();
                const vectorCount = indexStats.totalRecordCount;
                if (this.debug) console.log(`[DEBUG] Summary index has ${vectorCount} records, expecting ${summaryCount}`);
                if (vectorCount === summaryCount) {
                    console.log(`[DEBUG] ✅ Summary index already contains ${vectorCount} entries, skipping embedding.`);
                    return;
                }
                if (this.debug) console.log(`[DEBUG] ⚠️ Summary index contains ${vectorCount} entries, but there are ${summaryCount} summaries. Re-indexing.`);
            } catch (error) {
                console.error('[ERROR] Error checking summary index stats:', error);
            }
            // If we get here, we need to embed the summaries.
            await this.embedAndIndexFileSummaries(summaries);
        } catch (error) {
            console.error('[ERROR] Error processing file summaries:', error);
        }
    }
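    // Expected shape of the /getFileSummaries response parsed above: a flat
    // object mapping file paths to one-paragraph summaries. The concrete keys
    // below are illustrative, not taken from the real dataset:
    //
    //   {
    //     "src/client/views/MainView.tsx": "Defines the top-level layout...",
    //     "src/server/ApiManager.ts": "Registers the REST endpoints...",
    //   }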
    /**
     * Embeds and indexes file summaries into the summary index.
     * @param summaries Object mapping filepaths to summaries.
     */
    private async embedAndIndexFileSummaries(summaries: Record<string, string>) {
        if (this.debug) console.log(`[DEBUG] Starting embedding and indexing of file summaries...`);
        const filepaths = Object.keys(summaries);
        const summaryTexts = Object.values(summaries);
        // Split into batches of 100 to avoid exceeding API limits.
        const batchSize = 100;
        const totalBatches = Math.ceil(filepaths.length / batchSize);
        if (this.debug) console.log(`[DEBUG] Processing ${filepaths.length} files in ${totalBatches} batches of size ${batchSize}`);
        for (let i = 0; i < filepaths.length; i += batchSize) {
            const batchFilepaths = filepaths.slice(i, i + batchSize);
            const batchTexts = summaryTexts.slice(i, i + batchSize);
            if (this.debug) {
                console.log(`[DEBUG] Processing batch ${Math.floor(i / batchSize) + 1}/${totalBatches}`);
                console.log(`[DEBUG] First file in batch: ${batchFilepaths[0]}`);
                console.log(`[DEBUG] First summary in batch: "${batchTexts[0].substring(0, 50)}..."`);
            }
            try {
                // Generate embeddings for this batch.
                if (this.debug) console.log(`[DEBUG] Generating embeddings for batch of ${batchTexts.length} summaries...`);
                const startTime = Date.now();
                const embeddingResponse = await this.openai.embeddings.create({
                    model: 'text-embedding-3-large',
                    input: batchTexts,
                    encoding_format: 'float',
                });
                const duration = Date.now() - startTime;
                if (this.debug) console.log(`[DEBUG] ✅ Embeddings generated in ${duration}ms`);
                // Prepare Pinecone records.
                if (this.debug) console.log(`[DEBUG] Preparing Pinecone records...`);
                const pineconeRecords: PineconeRecord[] = batchTexts.map((text, index) => {
                    const embedding = (embeddingResponse.data as Embedding[])[index].embedding;
                    if (this.debug && index === 0) console.log(`[DEBUG] Sample embedding dimensions: ${embedding.length}, first few values: [${embedding.slice(0, 5).join(', ')}...]`);
                    return {
                        id: uuidv4(), // Generate a unique ID for each summary.
                        values: embedding,
                        metadata: {
                            filepath: batchFilepaths[index],
                            summary: text,
                        } as RecordMetadata,
                    };
                });
                // Upload to Pinecone.
                if (this.debug) console.log(`[DEBUG] Upserting ${pineconeRecords.length} records to Pinecone...`);
                const upsertStart = Date.now();
                try {
                    await this.summaryIndex.upsert(pineconeRecords);
                    const upsertDuration = Date.now() - upsertStart;
                    if (this.debug) console.log(`[DEBUG] ✅ Batch ${Math.floor(i / batchSize) + 1}/${totalBatches} indexed in ${upsertDuration}ms`);
                } catch (upsertError) {
                    console.error(`[ERROR] Failed to upsert batch ${Math.floor(i / batchSize) + 1}/${totalBatches} to Pinecone:`, upsertError);
                    // Try again with smaller batches.
                    if (batchTexts.length > 20) {
                        console.log(`[DEBUG] 🔄 Retrying with smaller batch size...`);
                        // Split the batch in half and retry each half.
                        const midpoint = Math.floor(batchTexts.length / 2);
                        const firstHalf = {
                            filepaths: batchFilepaths.slice(0, midpoint),
                            texts: batchTexts.slice(0, midpoint),
                        };
                        const secondHalf = {
                            filepaths: batchFilepaths.slice(midpoint),
                            texts: batchTexts.slice(midpoint),
                        };
                        // Helper function to embed and upsert a smaller batch.
                        const retryBatch = async (paths: string[], texts: string[], batchNum: string) => {
                            try {
                                if (this.debug) console.log(`[DEBUG] Generating embeddings for sub-batch ${batchNum}...`);
                                const embRes = await this.openai.embeddings.create({
                                    model: 'text-embedding-3-large',
                                    input: texts,
                                    encoding_format: 'float',
                                });
                                const records = texts.map((t, idx) => ({
                                    id: uuidv4(),
                                    values: (embRes.data as Embedding[])[idx].embedding,
                                    metadata: {
                                        filepath: paths[idx],
                                        summary: t,
                                    } as RecordMetadata,
                                }));
                                if (this.debug) console.log(`[DEBUG] Upserting sub-batch ${batchNum} (${records.length} records)...`);
                                await this.summaryIndex.upsert(records);
                                if (this.debug) console.log(`[DEBUG] ✅ Sub-batch ${batchNum} upserted successfully`);
                            } catch (retryError) {
                                console.error(`[ERROR] Failed to upsert sub-batch ${batchNum}:`, retryError);
                            }
                        };
                        await retryBatch(firstHalf.filepaths, firstHalf.texts, `${Math.floor(i / batchSize) + 1}.1`);
                        await retryBatch(secondHalf.filepaths, secondHalf.texts, `${Math.floor(i / batchSize) + 1}.2`);
                    }
                }
            } catch (error) {
                console.error('[ERROR] Error processing batch:', error);
            }
        }
        if (this.debug) console.log(`[DEBUG] ✅ File summary indexing complete for all ${filepaths.length} files`);
        // Verify the index was populated correctly.
        try {
            const indexStats = await this.summaryIndex.describeIndexStats();
            const vectorCount = indexStats.totalRecordCount;
            if (this.debug) console.log(`[DEBUG] 🔍 Final index verification: ${vectorCount} records in Pinecone index (expected ${filepaths.length})`);
        } catch (error) {
            console.error('[ERROR] Failed to verify index stats:', error);
        }
    }
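    // The retry path above halves a failed batch exactly once. A more general
    // variant would recurse until single records remain; sketched here under
    // the assumption that upsert failures are size-related (e.g. payload limits):
    //
    //   const upsertWithSplit = async (records: PineconeRecord[]): Promise<void> => {
    //       try {
    //           await this.summaryIndex.upsert(records);
    //       } catch (err) {
    //           if (records.length <= 1) throw err; // Nothing left to split.
    //           const mid = Math.floor(records.length / 2);
    //           await upsertWithSplit(records.slice(0, mid));
    //           await upsertWithSplit(records.slice(mid));
    //       }
    //   };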
    /**
     * Searches for file summaries similar to the given query.
     * @param query The search query.
     * @param topK Number of results to return (default: 5).
     * @returns Array of filepath and summary pairs with relevance scores.
     */
    async searchFileSummaries(query: string, topK: number = 5): Promise<Array<{ filepath: string; summary: string; score?: number }>> {
        if (!this.initialized) {
            console.error('[ERROR] Cannot search - Vectorstore not fully initialized');
            return [];
        }
        if (this.debug) console.log(`[DEBUG] Searching file summaries for query: "${query}" (topK=${topK})`);
        try {
            // Generate an embedding for the query.
            if (this.debug) console.log(`[DEBUG] Generating embedding for query...`);
            const startTime = Date.now();
            const queryEmbeddingResponse = await this.openai.embeddings.create({
                model: 'text-embedding-3-large',
                input: query,
                encoding_format: 'float',
            });
            const duration = Date.now() - startTime;
            const queryEmbedding = queryEmbeddingResponse.data[0].embedding;
            if (this.debug) {
                console.log(`[DEBUG] ✅ Query embedding generated in ${duration}ms`);
                console.log(`[DEBUG] Query embedding dimensions: ${queryEmbedding.length}`);
            }
            // Check whether the summary index is ready.
            try {
                const indexStats = await this.summaryIndex.describeIndexStats();
                const vectorCount = indexStats.totalRecordCount;
                if (this.debug) console.log(`[DEBUG] Summary index contains ${vectorCount} records`);
                if (vectorCount === 0) {
                    console.error('[ERROR] Summary index is empty, cannot perform search');
                    return [];
                }
            } catch (statsError) {
                console.error('[ERROR] Failed to check summary index stats:', statsError);
                console.error('[ERROR] Stats error details:', JSON.stringify(statsError));
            }
            // Test direct API access to Pinecone.
            if (this.debug) console.log(`[DEBUG] Testing Pinecone connection...`);
            try {
                const indexes = await this.pinecone.listIndexes();
                console.log(`[DEBUG] Available Pinecone indexes: ${indexes.indexes?.map(idx => idx.name).join(', ')}`);
            } catch (connectionError) {
                console.error('[ERROR] Could not connect to Pinecone:', connectionError);
            }
            // Query the summaries index.
            if (this.debug) console.log(`[DEBUG] Querying Pinecone summary index (${this.summaryIndexName})...`);
            const queryStart = Date.now();
            let queryResponse;
            try {
                // First, make sure we can access the index.
                const indexInfo = await this.summaryIndex.describeIndexStats();
                if (this.debug) console.log(`[DEBUG] Index stats:`, indexInfo);
                queryResponse = await this.summaryIndex.query({
                    vector: queryEmbedding,
                    topK,
                    includeMetadata: true,
                });
                const queryDuration = Date.now() - queryStart;
                if (this.debug) {
                    console.log(`[DEBUG] ✅ Pinecone query completed in ${queryDuration}ms`);
                    console.log(`[DEBUG] Raw Pinecone response:`, JSON.stringify(queryResponse, null, 2));
                    if (queryResponse.matches) {
                        console.log(`[DEBUG] Found ${queryResponse.matches.length} matching summaries`);
                        console.log(`[DEBUG] Match scores: ${queryResponse.matches.map(m => m.score?.toFixed(4)).join(', ')}`);
                    } else {
                        console.log(`[DEBUG] No matches in response`);
                    }
                }
            } catch (queryError) {
                console.error('[ERROR] Pinecone query failed:', queryError);
                if (typeof queryError === 'object' && queryError !== null) {
                    console.error('[ERROR] Query error details:', JSON.stringify(queryError, null, 2));
                }
                return [];
            }
            if (!queryResponse || !queryResponse.matches || queryResponse.matches.length === 0) {
                console.log('[DEBUG] ⚠️ No matches found in Pinecone for query');
                return [];
            }
            // Format the results.
            const results = queryResponse.matches.map(match => {
                if (!match.metadata) {
                    console.error('[ERROR] Match is missing metadata:', match);
                    return { filepath: 'unknown', summary: 'No summary available' };
                }
                return {
                    filepath: (match.metadata as { filepath: string }).filepath || 'unknown',
                    summary: (match.metadata as { summary: string }).summary || 'No summary available',
                    score: match.score,
                };
            });
            if (this.debug) {
                if (results.length > 0) {
                    console.log(`[DEBUG] Top result filepath: ${results[0]?.filepath}`);
                    console.log(`[DEBUG] Top result score: ${results[0]?.score}`);
                    console.log(`[DEBUG] Top result summary excerpt: "${results[0]?.summary?.substring(0, 100)}..."`);
                } else {
                    console.log(`[DEBUG] No results returned after processing`);
                }
            }
            return results;
        } catch (error) {
            console.error('[ERROR] Error searching file summaries:', error);
            if (typeof error === 'object' && error !== null) {
                console.error('[ERROR] Full error details:', JSON.stringify(error, null, 2));
            }
            return [];
        }
    }
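    // Each element returned above has the shape { filepath, summary, score? },
    // where score is Pinecone's cosine similarity (higher means closer). A
    // usage sketch with an illustrative threshold, not a tuned value:
    //
    //   const [top] = await store.searchFileSummaries('vector indexing');
    //   if (top && (top.score ?? 0) > 0.3) console.log(`Best match: ${top.filepath}`);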
    /**
     * Runs a single test query after setup to validate the file summary search functionality.
     */
    private async runSingleTestQuery() {
        console.log(`\n[TEST] Running single test query to validate file summary search functionality...`);
        // Verify the index is accessible.
        try {
            const indexStats = await this.summaryIndex.describeIndexStats();
            console.log(`[TEST] Pinecone index stats:`, JSON.stringify(indexStats, null, 2));
            console.log(`[TEST] Summary index contains ${indexStats.totalRecordCount} indexed summaries`);
        } catch (error) {
            console.error('[TEST] ❌ Failed to access Pinecone index:', error);
            return;
        }
        // Add a brief delay to ensure Pinecone has finished processing.
        console.log('[TEST] Waiting 2 seconds for Pinecone indexing to complete...');
        await new Promise(resolve => setTimeout(resolve, 2000));
        // Run a single test query.
        const query = 'React components for the UI';
        console.log(`\n[TEST] Executing query: "${query}"`);
        try {
            const results = await this.searchFileSummaries(query);
            console.log(`[TEST] Search returned ${results.length} results:`);
            results.forEach((result, i) => {
                console.log(`\n[TEST] Result ${i + 1}:`);
                console.log(`[TEST] File: ${result.filepath}`);
                console.log(`[TEST] Score: ${result.score}`);
                console.log(`[TEST] Summary: "${result.summary?.substring(0, 150)}..."`);
            });
            // If we have results, fetch the content for the first one.
            if (results.length > 0) {
                const topFilepath = results[0].filepath;
                console.log(`\n[TEST] Fetching full content for top result: ${topFilepath}`);
                const content = await this.getFileContent(topFilepath);
                if (content) {
                    console.log(`[TEST] ✅ Content retrieved successfully (${content.length} chars)`);
                    console.log(`[TEST] Content excerpt:\n---\n${content.substring(0, 300)}...\n---`);
                } else {
                    console.log(`[TEST] ❌ Failed to retrieve content for ${topFilepath}`);
                }
            } else {
                console.log(`\n[TEST] ⚠️ No results to fetch content for`);
            }
            console.log(`\n[TEST] ✅ Test query completed`);
        } catch (testError) {
            console.error(`[TEST] ❌ Test query failed:`, testError);
            if (typeof testError === 'object' && testError !== null) {
                console.error('[TEST] Full error details:', JSON.stringify(testError, null, 2));
            }
        }
    }

    /**
     * Gets the full content of a file by its filepath.
     * @param filepath The filepath to look up.
     * @returns The file content, or null if not found.
     */
    async getFileContent(filepath: string): Promise<string | null> {
        if (this.debug) console.log(`[DEBUG] Getting file content for: ${filepath}`);
        try {
            const startTime = Date.now();
            // Fetch directly rather than through the Networking utility so the
            // response can be read as raw text instead of being JSON-parsed.
            const rawResponse = await fetch('/getRawFileContent', {
                method: 'POST',
                headers: {
                    'Content-Type': 'application/json',
                },
                body: JSON.stringify({ filepath }),
            });
            if (!rawResponse.ok) {
                const errorText = await rawResponse.text();
                console.error(`[ERROR] Server returned error ${rawResponse.status}: ${errorText}`);
                return null;
            }
            // Get the raw text content without JSON parsing.
            const content = await rawResponse.text();
            const duration = Date.now() - startTime;
            if (this.debug) {
                console.log(`[DEBUG] ✅ File content retrieved in ${duration}ms`);
                console.log(`[DEBUG] Content length: ${content.length} chars`);
                console.log(`[DEBUG] Content excerpt: "${content.substring(0, 100)}..."`);
            }
            return content;
        } catch (error) {
            console.error('[ERROR] Error getting file content:', error);
            if (typeof error === 'object' && error !== null) {
                console.error('[ERROR] Full error details:', JSON.stringify(error, null, 2));
            }
            return null;
        }
    }
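    // A two-step retrieval flow built from the two methods above: search the
    // summary index, then pull raw content for the best hit. Sketch only;
    // `store` stands in for an initialized Vectorstore instance:
    //
    //   const matches = await store.searchFileSummaries('vectorstore initialization', 1);
    //   if (matches.length > 0) {
    //       const text = await store.getFileContent(matches[0].filepath);
    //       if (text) console.log(text.substring(0, 200));
    //   }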
    /**
     * Adds an AI document to the vectorstore. Handles media file processing for audio/video,
     * and text embedding for all document types. Updates document metadata during processing.
     * @param doc The document to add.
     * @param progressCallback Callback to track the progress of the addition process.
     */
    async addAIDoc(doc: Doc, progressCallback: (progress: number, step: string) => void) {
        const ai_document_status: string = StrCast(doc.ai_document_status);
        // Skip if the document is already in progress or completed.
        if (ai_document_status !== undefined && ai_document_status.trim() !== '' && ai_document_status !== '{}') {
            if (ai_document_status === 'PROGRESS') {
                console.log('Already in progress.');
                return;
            } else if (ai_document_status === 'COMPLETED') {
                console.log('Already completed.');
                return;
            }
        } else {
            // Start processing the document.
            doc.ai_document_status = 'PROGRESS';
            const local_file_path = CsvCast(doc.data)?.url?.pathname ?? PDFCast(doc.data)?.url?.pathname ?? VideoCast(doc.data)?.url?.pathname ?? AudioCast(doc.data)?.url?.pathname;
            if (!local_file_path) {
                console.log('Not adding to vectorstore. Invalid file path for vectorstore addition.');
                return;
            }
            const isAudioOrVideo = local_file_path.endsWith('.mp3') || local_file_path.endsWith('.mp4');
            let result: AI_Document & { doc_id: string };
            if (isAudioOrVideo) {
                console.log('Processing media file...');
                progressCallback(10, 'Preparing media file for transcription...');
                // Post to the processMediaFile endpoint to get the transcript.
                const response = await Networking.PostToServer('/processMediaFile', { fileName: path.basename(local_file_path) });
                progressCallback(60, 'Transcription completed. Processing transcript...');
                // Type assertion to handle the response properties.
                const typedResponse = response as {
                    condensed: Array<{ text: string; indexes: string[]; start: number; end: number }>;
                    full: unknown[]; // Raw transcript segments; stored verbatim below, so the element type is left open.
                    summary: string;
                };
                const segmentedTranscript = typedResponse.condensed;
                console.log(segmentedTranscript);
                const summary = typedResponse.summary;
                doc.summary = summary;
                // Generate embeddings for each chunk.
                const texts = segmentedTranscript.map(chunk => chunk.text);
                try {
                    const embeddingsResponse = await this.openai.embeddings.create({
                        model: 'text-embedding-3-large',
                        input: texts,
                        encoding_format: 'float',
                    });
                    progressCallback(85, 'Embeddings generated. Finalizing document...');
                    doc.original_segments = JSON.stringify(typedResponse.full);
                    const doc_id = doc[Id];
                    console.log('doc_id in vectorstore', doc_id);
                    // Generate chunk IDs upfront so we can register them.
                    const chunkIds = segmentedTranscript.map(() => uuidv4());
                    // Add the transcript and embeddings to the metadata.
                    result = {
                        doc_id,
                        purpose: '',
                        file_name: local_file_path,
                        num_pages: 0,
                        summary: summary,
                        chunks: segmentedTranscript.map((chunk, index) => ({
                            id: chunkIds[index], // Use the pre-generated chunk ID.
                            values: (embeddingsResponse.data as Embedding[])[index].embedding, // Assign the embedding.
                            metadata: {
                                indexes: chunk.indexes,
                                original_document: local_file_path,
                                doc_id: doc_id, // Ensure doc_id is consistent.
                                file_path: local_file_path,
                                start_time: chunk.start,
                                end_time: chunk.end,
                                text: chunk.text,
                                type: local_file_path.endsWith('.mp3') ? CHUNK_TYPE.AUDIO : CHUNK_TYPE.VIDEO,
                            },
                        })),
                        type: 'media',
                    };
                    progressCallback(95, 'Adding document to vectorstore...');
                } catch (error) {
                    console.error('Error generating embeddings:', error);
                    doc.ai_document_status = 'ERROR';
                    throw new Error('Embedding generation failed');
                }
                doc.segmented_transcript = JSON.stringify(segmentedTranscript);
                // Use the document manager to add simplified chunks.
                const docType = local_file_path.endsWith('.mp3') ? 'audio' : 'video';
                const simplifiedChunks = this.docManager.getSimplifiedChunks(result.chunks, docType);
                doc.chunk_simplified = JSON.stringify(simplifiedChunks);
                this.docManager.addSimplifiedChunks(simplifiedChunks);
            } else {
                // Process a regular document.
                console.log('Processing regular document...');
                const createDocumentResponse = await Networking.PostToServer('/createDocument', { file_path: local_file_path, doc_id: doc[Id] });
                // Type assertion for the response.
                const { jobId } = createDocumentResponse as { jobId: string };
                // Poll the server until the document-processing job completes.
                while (true) {
                    await new Promise(resolve => setTimeout(resolve, 2000));
                    const resultResponse = await Networking.FetchFromServer(`/getResult/${jobId}`);
                    const resultResponseJson = JSON.parse(resultResponse);
                    if (resultResponseJson.status === 'completed') {
                        result = resultResponseJson;
                        break;
                    }
                    const progressResponse = await Networking.FetchFromServer(`/getProgress/${jobId}`);
                    const progressResponseJson = JSON.parse(progressResponse);
                    if (progressResponseJson) {
                        progressCallback(progressResponseJson.progress, progressResponseJson.step);
                    }
                }
                if (result.doc_id !== doc[Id]) {
                    console.log('doc_id in vectorstore', result.doc_id, 'does not match doc_id in doc', doc[Id]);
                }
                // Use the document manager to add simplified chunks; determine the document type from the file extension.
                const fileExt = path.extname(local_file_path).toLowerCase();
                const docType = fileExt === '.pdf' ? 'pdf' : fileExt === '.csv' ? 'csv' : 'text';
                const simplifiedChunks = this.docManager.getSimplifiedChunks(result.chunks, docType);
                doc.chunk_simplified = JSON.stringify(simplifiedChunks);
                this.docManager.addSimplifiedChunks(simplifiedChunks);
                doc.summary = result.summary;
                doc.ai_purpose = result.purpose;
            }
            // Index the document.
            await this.indexDocument(result);
            progressCallback(100, 'Document added successfully!');
            // Record this vectorstore's ID on the document, preserving any existing entries.
            if (!doc.vectorstore_id) {
                doc.vectorstore_id = JSON.stringify([this._id]);
            } else {
                doc.vectorstore_id = JSON.stringify(JSON.parse(StrCast(doc.vectorstore_id)).concat([this._id]));
            }
            doc.ai_doc_id = result.doc_id;
            console.log(`Document added: ${result.file_name}`);
            doc.ai_document_status = 'COMPLETED';
        }
    }

    /**
     * Uploads the document's vector chunks to the Pinecone index.
     * Prepares the metadata for each chunk and uses Pinecone's upsert operation.
     * @param document The processed document containing its chunks and metadata.
     */
    private async indexDocument(document: AI_Document) {
        console.log('Uploading vectors to content namespace...');
        // Prepare Pinecone records for each chunk in the document.
        const pineconeRecords: PineconeRecord[] = (document.chunks as RAGChunk[]).map(chunk => ({
            id: chunk.id,
            values: chunk.values,
            metadata: { ...chunk.metadata } as RecordMetadata,
        }));
        // Upload the records to Pinecone.
        await this.index.upsert(pineconeRecords);
    }
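    // Each record upserted above follows Pinecone's { id, values, metadata }
    // shape. An illustrative PDF-chunk record (values truncated; field names
    // taken from the metadata cast used in retrieve() below):
    //
    //   {
    //       id: '3f1c...',
    //       values: [0.0123, -0.0456 /* ...3072 dimensions in total */],
    //       metadata: { text: '...', doc_id: '...', file_path: '...', start_page: 3, end_page: 4, type: 'text' },
    //   }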
    /**
     * Combines document chunks until their combined text reaches a minimum word count (500 words).
     * This is used to optimize retrieval and indexing processes.
     * @param chunks The original chunks to combine.
     * @returns Combined chunks with updated text and metadata.
     */
    private combineChunks(chunks: RAGChunk[]): RAGChunk[] {
        const combinedChunks: RAGChunk[] = [];
        let currentChunk: RAGChunk | null = null;
        let wordCount = 0;
        chunks.forEach(chunk => {
            const textWords = chunk.metadata.text.split(' ').length;
            if (!currentChunk) {
                // Start the first combined chunk.
                currentChunk = { ...chunk, metadata: { ...chunk.metadata, text: chunk.metadata.text } };
                wordCount = textWords;
            } else if (wordCount + textWords >= 500) {
                // The current chunk is large enough; emit it and start a new one.
                combinedChunks.push(currentChunk);
                currentChunk = { ...chunk, metadata: { ...chunk.metadata, text: chunk.metadata.text } };
                wordCount = textWords;
            } else {
                // Append this chunk's text to the current combined chunk.
                currentChunk.metadata.text += ` ${chunk.metadata.text}`;
                wordCount += textWords;
            }
        });
        if (currentChunk) {
            combinedChunks.push(currentChunk);
        }
        return combinedChunks;
    }

    /**
     * Retrieves the most relevant document chunks for a given query.
     * Uses OpenAI for embedding the query and Pinecone for vector similarity matching.
     * @param query The search query string.
     * @param topK The number of top results to return (default is 15).
     * @param docIds Optional list of document IDs to restrict the search to.
     * @returns A list of document chunks that match the query.
     */
    async retrieve(query: string, topK: number = 15, docIds?: string[]): Promise<RAGChunk[]> {
        console.log(`Retrieving chunks for query: ${query}`);
        try {
            // Generate an embedding for the query using OpenAI.
            const queryEmbeddingResponse = await this.openai.embeddings.create({
                model: 'text-embedding-3-large',
                input: query,
                encoding_format: 'float',
            });
            const queryEmbedding = queryEmbeddingResponse.data[0].embedding;
            // Fall back to all document IDs registered in the document manager when none are given.
            const _docIds = docIds?.length === 0 || !docIds ? this.docManager.docIds : docIds;
            console.log('Using document IDs for retrieval:', _docIds);
            // Query the Pinecone index using the embedding, filtered by document IDs.
            const queryResponse: QueryResponse = await this.index.query({
                vector: queryEmbedding,
                filter: {
                    doc_id: { $in: _docIds },
                },
                topK,
                includeValues: true,
                includeMetadata: true,
            });
            console.log(`Found ${queryResponse.matches.length} matching chunks`);
            // Map each match back into a RAGChunk so callers get a consistent shape.
            const processedMatches = queryResponse.matches.map(match => {
                const chunk = {
                    id: match.id,
                    values: match.values as number[],
                    metadata: match.metadata as {
                        text: string;
                        type: string;
                        original_document: string;
                        file_path: string;
                        doc_id: string;
                        location: string;
                        start_page: number;
                        end_page: number;
                    },
                } as RAGChunk;
                return chunk;
            });
            return processedMatches;
        } catch (error) {
            console.error(`Error retrieving chunks: ${error}`);
            return [];
        }
    }
}
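// End-to-end usage sketch (comment only). Assumes a configured
// AgentDocumentManager and a Dash Doc with a PDF attached; `docManager` and
// `pdfDoc` are illustrative names, not exports of this module:
//
//   const store = new Vectorstore(uuidv4(), docManager);
//   await store.addAIDoc(pdfDoc, (pct, step) => console.log(`${pct}% - ${step}`));
//   const chunks = await store.retrieve('what does the introduction conclude?', 5);
//   chunks.forEach(c => console.log(c.metadata.text.slice(0, 120)));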