diff options
| author | Joanne <zehan_ding@brown.edu> | 2025-06-17 13:02:50 -0400 |
|---|---|---|
| committer | Joanne <zehan_ding@brown.edu> | 2025-06-17 13:02:50 -0400 |
| commit | 2aa2c26b95a539d220e46b20cdfbef6ae39d6c43 (patch) | |
| tree | 344a6f798f692fdd4921ab5a6762e907f5ad7b06 /src/client/views/nodes/chatbot/vectorstore | |
| parent | 430db63077868fa54829721d6530a810aa4d4588 (diff) | |
| parent | ccfdf905400cd4b81d8cde0f16bb0e15cd65621b (diff) | |
Merge branch 'agent-paper-main' of https://github.com/brown-dash/Dash-Web into joanne-tutorialagent
Diffstat (limited to 'src/client/views/nodes/chatbot/vectorstore')
| -rw-r--r-- | src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts | 662 |
1 files changed, 587 insertions, 75 deletions
diff --git a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts index 6d524e40f..72060973b 100644 --- a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts +++ b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts @@ -15,6 +15,8 @@ import { Networking } from '../../../../Network'; import { AI_Document, CHUNK_TYPE, RAGChunk } from '../types/types'; import OpenAI from 'openai'; import { Embedding } from 'openai/resources'; +import { AgentDocumentManager } from '../utils/AgentDocumentManager'; +import { Id } from '../../../../../fields/FieldSymbols'; dotenv.config(); @@ -23,23 +25,28 @@ dotenv.config(); * and OpenAI text-embedding-3-large for text embedding. It handles AI document management, uploads, and query-based retrieval. */ export class Vectorstore { - private pinecone: Pinecone; // Pinecone client for managing the vector index. + private pinecone!: Pinecone; // Pinecone client for managing the vector index. private index!: Index; // The specific Pinecone index used for document chunks. - private openai: OpenAI; // OpenAI client for generating embeddings. + private summaryIndex!: Index; // The Pinecone index used for file summaries. + private openai!: OpenAI; // OpenAI client for generating embeddings. private indexName: string = 'pdf-chatbot'; // Default name for the index. - private _id: string; // Unique ID for the Vectorstore instance. - private _doc_ids: () => string[]; // List of document IDs handled by this instance. - + private summaryIndexName: string = 'file-summaries'; // Name for the summaries index. + private _id!: string; // Unique ID for the Vectorstore instance. + private docManager!: AgentDocumentManager; // Document manager for handling documents + private summaryCacheCount: number = 0; // Cache for the number of summaries documents: AI_Document[] = []; // Store the documents indexed in the vectorstore. + private debug: boolean = true; // Enable debugging + private initialized: boolean = false; /** * Initializes the Pinecone and OpenAI clients, sets up the document ID list, * and initializes the Pinecone index. * @param id The unique identifier for the vectorstore instance. - * @param doc_ids A function that returns a list of document IDs. + * @param docManager An instance of AgentDocumentManager to handle document management. */ - constructor(id: string, doc_ids: () => string[]) { - const pineconeApiKey = process.env.PINECONE_API_KEY; + constructor(id: string, docManager: AgentDocumentManager) { + if (this.debug) console.log(`[DEBUG] Initializing Vectorstore with ID: ${id}`); + const pineconeApiKey = 'pcsk_3txLxJ_9fxdmAph4csnq4yxoDF5De5A8bJvjWaXXigBgshy4eoXggrXcxATJiH8vzXbrKm'; if (!pineconeApiKey) { console.log('PINECONE_API_KEY is not defined - Vectorstore will be unavailable'); return; @@ -49,8 +56,39 @@ export class Vectorstore { this.pinecone = new Pinecone({ apiKey: pineconeApiKey }); this.openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY, dangerouslyAllowBrowser: true }); this._id = id; - this._doc_ids = doc_ids; - this.initializeIndex(); + this.docManager = docManager; + + // Proper async initialization sequence + this.initializeAsync(id); + } + + /** + * Handles async initialization of all components + */ + private async initializeAsync(id: string) { + try { + if (this.debug) console.log(`[DEBUG] Starting async initialization sequence for Vectorstore ID: ${id}`); + + // Initialize the main document index + await this.initializeIndex(); + + // Initialize the summary index + await this.initializeSummaryIndex(); + + this.initialized = true; + if (this.debug) console.log(`[DEBUG] ✅ Vectorstore initialization complete, running test query...`); + + // Run a single test query instead of multiple + await this.runSingleTestQuery(); + } catch (error) { + console.error('[ERROR] Failed to initialize Vectorstore:', error); + } + } + + async getFileNames() { + const response = await Networking.FetchFromServer('/getFileNames'); + const filepaths = JSON.parse(response); + return filepaths; } /** @@ -58,10 +96,13 @@ export class Vectorstore { * Sets the index to use cosine similarity for vector similarity calculations. */ private async initializeIndex() { + if (this.debug) console.log(`[DEBUG] Initializing main document index: ${this.indexName}`); const indexList: IndexList = await this.pinecone.listIndexes(); + if (this.debug) console.log(`[DEBUG] Available Pinecone indexes: ${indexList.indexes?.map(i => i.name).join(', ') || 'none'}`); // Check if the index already exists, otherwise create it. if (!indexList.indexes?.some(index => index.name === this.indexName)) { + if (this.debug) console.log(`[DEBUG] Creating new index: ${this.indexName}`); await this.pinecone.createIndex({ name: this.indexName, dimension: 3072, @@ -73,6 +114,9 @@ export class Vectorstore { }, }, }); + if (this.debug) console.log(`[DEBUG] ✅ Index ${this.indexName} created successfully`); + } else { + if (this.debug) console.log(`[DEBUG] ✅ Using existing index: ${this.indexName}`); } // Set the index for future use. @@ -80,6 +124,453 @@ export class Vectorstore { } /** + * Initializes the Pinecone index for file summaries. + * Checks if it exists and creates it if necessary. + */ + private async initializeSummaryIndex() { + if (this.debug) console.log(`[DEBUG] Initializing file summaries index: ${this.summaryIndexName}`); + const indexList: IndexList = await this.pinecone.listIndexes(); + + // Check if the index already exists, otherwise create it. + if (!indexList.indexes?.some(index => index.name === this.summaryIndexName)) { + if (this.debug) console.log(`[DEBUG] Creating new summary index: ${this.summaryIndexName}`); + await this.pinecone.createIndex({ + name: this.summaryIndexName, + dimension: 3072, + metric: 'cosine', + spec: { + serverless: { + cloud: 'aws', + region: 'us-east-1', + }, + }, + }); + if (this.debug) console.log(`[DEBUG] ✅ Summary index ${this.summaryIndexName} created successfully`); + } else { + if (this.debug) console.log(`[DEBUG] ✅ Using existing summary index: ${this.summaryIndexName}`); + } + + // Set the summaries index for future use. + this.summaryIndex = this.pinecone.Index(this.summaryIndexName); + + // Check if we need to index the file summaries + await this.processFileSummaries(); + } + + /** + * Processes file summaries from the JSON file if needed. + * Checks if the index contains the correct number of summaries before embedding. + */ + private async processFileSummaries() { + if (this.debug) console.log(`[DEBUG] Starting file summaries processing`); + try { + // Get file summaries from the server + if (this.debug) console.log(`[DEBUG] Fetching file summaries from server...`); + const response = await Networking.FetchFromServer('/getFileSummaries'); + + if (!response) { + console.error('[ERROR] Failed to fetch file summaries'); + return; + } + if (this.debug) console.log(`[DEBUG] File summaries response received (${response.length} bytes)`); + + const summaries = JSON.parse(response); + const filepaths = Object.keys(summaries); + const summaryCount = filepaths.length; + this.summaryCacheCount = summaryCount; + + if (this.debug) { + console.log(`[DEBUG] File summaries parsed: ${summaryCount} files`); + console.log(`[DEBUG] Sample filepaths: ${filepaths.slice(0, 3).join(', ')}...`); + console.log(`[DEBUG] Sample summary: "${summaries[filepaths[0]].substring(0, 100)}..."`); + } + + // Check if index already has the correct number of summaries + try { + if (this.debug) console.log(`[DEBUG] Checking summary index stats...`); + const indexStats = await this.summaryIndex.describeIndexStats(); + const vectorCount = indexStats.totalRecordCount; + + if (this.debug) console.log(`[DEBUG] Summary index has ${vectorCount} records, expecting ${summaryCount}`); + + if (vectorCount === summaryCount) { + console.log(`[DEBUG] ✅ Summary index already contains ${vectorCount} entries, skipping embedding.`); + return; + } + + if (this.debug) console.log(`[DEBUG] ⚠️ Summary index contains ${vectorCount} entries, but there are ${summaryCount} summaries. Re-indexing.`); + } catch (error) { + console.error('[ERROR] Error checking summary index stats:', error); + } + + // If we get here, we need to embed the summaries + await this.embedAndIndexFileSummaries(summaries); + } catch (error) { + console.error('[ERROR] Error processing file summaries:', error); + } + } + + /** + * Embeds and indexes file summaries into the summary index. + * @param summaries Object mapping filepaths to summaries + */ + private async embedAndIndexFileSummaries(summaries: Record<string, string>) { + if (this.debug) console.log(`[DEBUG] Starting embedding and indexing of file summaries...`); + + const filepaths = Object.keys(summaries); + const summaryTexts = Object.values(summaries); + + // Split into batches of 100 to avoid exceeding API limits + const batchSize = 100; + const totalBatches = Math.ceil(filepaths.length / batchSize); + + if (this.debug) console.log(`[DEBUG] Processing ${filepaths.length} files in ${totalBatches} batches of size ${batchSize}`); + + for (let i = 0; i < filepaths.length; i += batchSize) { + const batchFilepaths = filepaths.slice(i, i + batchSize); + const batchTexts = summaryTexts.slice(i, i + batchSize); + + if (this.debug) { + console.log(`[DEBUG] Processing batch ${Math.floor(i / batchSize) + 1}/${totalBatches}`); + console.log(`[DEBUG] First file in batch: ${batchFilepaths[0]}`); + console.log(`[DEBUG] First summary in batch: "${batchTexts[0].substring(0, 50)}..."`); + } + + try { + // Generate embeddings for this batch + if (this.debug) console.log(`[DEBUG] Generating embeddings for batch of ${batchTexts.length} summaries...`); + const startTime = Date.now(); + const embeddingResponse = await this.openai.embeddings.create({ + model: 'text-embedding-3-large', + input: batchTexts, + encoding_format: 'float', + }); + const duration = Date.now() - startTime; + if (this.debug) console.log(`[DEBUG] ✅ Embeddings generated in ${duration}ms`); + + // Prepare Pinecone records + if (this.debug) console.log(`[DEBUG] Preparing Pinecone records...`); + const pineconeRecords: PineconeRecord[] = batchTexts.map((text, index) => { + const embedding = (embeddingResponse.data as Embedding[])[index].embedding; + if (this.debug && index === 0) console.log(`[DEBUG] Sample embedding dimensions: ${embedding.length}, first few values: [${embedding.slice(0, 5).join(', ')}...]`); + + return { + id: uuidv4(), // Generate a unique ID for each summary + values: embedding, + metadata: { + filepath: batchFilepaths[index], + summary: text, + } as RecordMetadata, + }; + }); + + // Upload to Pinecone + if (this.debug) console.log(`[DEBUG] Upserting ${pineconeRecords.length} records to Pinecone...`); + const upsertStart = Date.now(); + try { + await this.summaryIndex.upsert(pineconeRecords); + const upsertDuration = Date.now() - upsertStart; + if (this.debug) console.log(`[DEBUG] ✅ Batch ${Math.floor(i / batchSize) + 1}/${totalBatches} indexed in ${upsertDuration}ms`); + } catch (upsertError) { + console.error(`[ERROR] Failed to upsert batch ${Math.floor(i / batchSize) + 1}/${totalBatches} to Pinecone:`, upsertError); + // Try again with smaller batch + if (batchTexts.length > 20) { + console.log(`[DEBUG] 🔄 Retrying with smaller batch size...`); + // Split the batch in half and retry recursively + const midpoint = Math.floor(batchTexts.length / 2); + const firstHalf = { + filepaths: batchFilepaths.slice(0, midpoint), + texts: batchTexts.slice(0, midpoint), + }; + const secondHalf = { + filepaths: batchFilepaths.slice(midpoint), + texts: batchTexts.slice(midpoint), + }; + + // Create a helper function to retry smaller batches + const retryBatch = async (paths: string[], texts: string[], batchNum: string) => { + try { + if (this.debug) console.log(`[DEBUG] Generating embeddings for sub-batch ${batchNum}...`); + const embRes = await this.openai.embeddings.create({ + model: 'text-embedding-3-large', + input: texts, + encoding_format: 'float', + }); + + const records = texts.map((t, idx) => ({ + id: uuidv4(), + values: (embRes.data as Embedding[])[idx].embedding, + metadata: { + filepath: paths[idx], + summary: t, + } as RecordMetadata, + })); + + if (this.debug) console.log(`[DEBUG] Upserting sub-batch ${batchNum} (${records.length} records)...`); + await this.summaryIndex.upsert(records); + if (this.debug) console.log(`[DEBUG] ✅ Sub-batch ${batchNum} upserted successfully`); + } catch (retryError) { + console.error(`[ERROR] Failed to upsert sub-batch ${batchNum}:`, retryError); + } + }; + + await retryBatch(firstHalf.filepaths, firstHalf.texts, `${Math.floor(i / batchSize) + 1}.1`); + await retryBatch(secondHalf.filepaths, secondHalf.texts, `${Math.floor(i / batchSize) + 1}.2`); + } + } + } catch (error) { + console.error('[ERROR] Error processing batch:', error); + } + } + + if (this.debug) console.log(`[DEBUG] ✅ File summary indexing complete for all ${filepaths.length} files`); + + // Verify the index was populated correctly + try { + const indexStats = await this.summaryIndex.describeIndexStats(); + const vectorCount = indexStats.totalRecordCount; + if (this.debug) console.log(`[DEBUG] 🔍 Final index verification: ${vectorCount} records in Pinecone index (expected ${filepaths.length})`); + } catch (error) { + console.error('[ERROR] Failed to verify index stats:', error); + } + } + + /** + * Searches for file summaries similar to the given query. + * @param query The search query + * @param topK Number of results to return (default: 5) + * @returns Array of filepath and summary pairs with relevance scores + */ + async searchFileSummaries(query: string, topK: number = 5): Promise<Array<{ filepath: string; summary: string; score?: number }>> { + if (!this.initialized) { + console.error('[ERROR] Cannot search - Vectorstore not fully initialized'); + return []; + } + + if (this.debug) console.log(`[DEBUG] Searching file summaries for query: "${query}" (topK=${topK})`); + try { + // Generate embedding for the query + if (this.debug) console.log(`[DEBUG] Generating embedding for query...`); + const startTime = Date.now(); + const queryEmbeddingResponse = await this.openai.embeddings.create({ + model: 'text-embedding-3-large', + input: query, + encoding_format: 'float', + }); + const duration = Date.now() - startTime; + + const queryEmbedding = queryEmbeddingResponse.data[0].embedding; + if (this.debug) { + console.log(`[DEBUG] ✅ Query embedding generated in ${duration}ms`); + console.log(`[DEBUG] Query embedding dimensions: ${queryEmbedding.length}`); + } + + // Check if summary index is ready + try { + const indexStats = await this.summaryIndex.describeIndexStats(); + const vectorCount = indexStats.totalRecordCount; + if (this.debug) console.log(`[DEBUG] Summary index contains ${vectorCount} records`); + + if (vectorCount === 0) { + console.error('[ERROR] Summary index is empty, cannot perform search'); + return []; + } + } catch (statsError) { + console.error('[ERROR] Failed to check summary index stats:', statsError); + console.error('[ERROR] Stats error details:', JSON.stringify(statsError)); + } + + // Test direct API access to Pinecone + if (this.debug) console.log(`[DEBUG] Testing Pinecone connection...`); + try { + const indexes = await this.pinecone.listIndexes(); + console.log(`[DEBUG] Available Pinecone indexes: ${indexes.indexes?.map(idx => idx.name).join(', ')}`); + } catch (connectionError) { + console.error('[ERROR] Could not connect to Pinecone:', connectionError); + } + + // Query the summaries index + if (this.debug) console.log(`[DEBUG] Querying Pinecone summary index (${this.summaryIndexName})...`); + const queryStart = Date.now(); + + let queryResponse; + try { + // First, make sure we can access the index + const indexInfo = await this.summaryIndex.describeIndexStats(); + if (this.debug) console.log(`[DEBUG] Index stats:`, indexInfo); + + queryResponse = await this.summaryIndex.query({ + vector: queryEmbedding, + topK, + includeMetadata: true, + }); + + const queryDuration = Date.now() - queryStart; + + if (this.debug) { + console.log(`[DEBUG] ✅ Pinecone query completed in ${queryDuration}ms`); + console.log(`[DEBUG] Raw Pinecone response:`, JSON.stringify(queryResponse, null, 2)); + if (queryResponse.matches) { + console.log(`[DEBUG] Found ${queryResponse.matches.length} matching summaries`); + console.log(`[DEBUG] Match scores: ${queryResponse.matches.map(m => m.score?.toFixed(4)).join(', ')}`); + } else { + console.log(`[DEBUG] No matches in response`); + } + } + } catch (queryError) { + console.error('[ERROR] Pinecone query failed:', queryError); + if (typeof queryError === 'object' && queryError !== null) { + console.error('[ERROR] Query error details:', JSON.stringify(queryError, null, 2)); + } + return []; + } + + if (!queryResponse || !queryResponse.matches || queryResponse.matches.length === 0) { + console.log('[DEBUG] ⚠️ No matches found in Pinecone for query'); + return []; + } + + // Format results + const results = queryResponse.matches.map(match => { + if (!match.metadata) { + console.error('[ERROR] Match is missing metadata:', match); + return { filepath: 'unknown', summary: 'No summary available' }; + } + + return { + filepath: (match.metadata as { filepath: string }).filepath || 'unknown', + summary: (match.metadata as { summary: string }).summary || 'No summary available', + score: match.score, + }; + }); + + if (this.debug) { + if (results.length > 0) { + console.log(`[DEBUG] Top result filepath: ${results[0]?.filepath}`); + console.log(`[DEBUG] Top result score: ${results[0]?.score}`); + console.log(`[DEBUG] Top result summary excerpt: "${results[0]?.summary?.substring(0, 100)}..."`); + } else { + console.log(`[DEBUG] No results returned after processing`); + } + } + + return results; + } catch (error) { + console.error('[ERROR] Error searching file summaries:', error); + if (typeof error === 'object' && error !== null) { + console.error('[ERROR] Full error details:', JSON.stringify(error, null, 2)); + } + return []; + } + } + + /** + * Runs a single test query after setup to validate the file summary search functionality. + */ + private async runSingleTestQuery() { + console.log(`\n[TEST] Running single test query to validate file summary search functionality...`); + + // Verify the index is accessible + try { + const indexStats = await this.summaryIndex.describeIndexStats(); + console.log(`[TEST] Pinecone index stats:`, JSON.stringify(indexStats, null, 2)); + console.log(`[TEST] Summary index contains ${indexStats.totalRecordCount} indexed summaries`); + } catch (error) { + console.error('[TEST] ❌ Failed to access Pinecone index:', error); + return; + } + + // Add a brief delay to ensure Pinecone has finished processing + console.log('[TEST] Waiting 2 seconds for Pinecone indexing to complete...'); + await new Promise(resolve => setTimeout(resolve, 2000)); + + // Run a single test query + const query = 'React components for the UI'; + console.log(`\n[TEST] Executing query: "${query}"`); + + try { + const results = await this.searchFileSummaries(query); + console.log(`[TEST] Search returned ${results.length} results:`); + + results.forEach((result, i) => { + console.log(`\n[TEST] Result ${i + 1}:`); + console.log(`[TEST] File: ${result.filepath}`); + console.log(`[TEST] Score: ${result.score}`); + console.log(`[TEST] Summary: "${result.summary?.substring(0, 150)}..."`); + }); + + // If we have results, fetch the content for the first one + if (results.length > 0) { + const topFilepath = results[0].filepath; + console.log(`\n[TEST] Fetching full content for top result: ${topFilepath}`); + const content = await this.getFileContent(topFilepath); + + if (content) { + console.log(`[TEST] ✅ Content retrieved successfully (${content.length} chars)`); + console.log(`[TEST] Content excerpt:\n---\n${content.substring(0, 300)}...\n---`); + } else { + console.log(`[TEST] ❌ Failed to retrieve content for ${topFilepath}`); + } + } else { + console.log(`\n[TEST] ⚠️ No results to fetch content for`); + } + + console.log(`\n[TEST] ✅ Test query completed`); + } catch (testError) { + console.error(`[TEST] ❌ Test query failed:`, testError); + if (typeof testError === 'object' && testError !== null) { + console.error('[TEST] Full error details:', JSON.stringify(testError, null, 2)); + } + } + } + + /** + * Gets the full content of a file by its filepath. + * @param filepath The filepath to look up + * @returns The file content or null if not found + */ + async getFileContent(filepath: string): Promise<string | null> { + if (this.debug) console.log(`[DEBUG] Getting file content for: ${filepath}`); + try { + const startTime = Date.now(); + + // Use the Networking utility for consistent API access + // But convert the response to text manually to avoid JSON parsing + const rawResponse = await fetch('/getRawFileContent', { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ filepath }), + }); + + if (!rawResponse.ok) { + const errorText = await rawResponse.text(); + console.error(`[ERROR] Server returned error ${rawResponse.status}: ${errorText}`); + return null; + } + + // Get the raw text content without JSON parsing + const content = await rawResponse.text(); + const duration = Date.now() - startTime; + + if (this.debug) { + console.log(`[DEBUG] ✅ File content retrieved in ${duration}ms`); + console.log(`[DEBUG] Content length: ${content.length} chars`); + console.log(`[DEBUG] Content excerpt: "${content.substring(0, 100)}..."`); + } + + return content; + } catch (error) { + console.error('[ERROR] Error getting file content:', error); + if (typeof error === 'object' && error !== null) { + console.error('[ERROR] Full error details:', JSON.stringify(error, null, 2)); + } + return null; + } + } + + /** * Adds an AI document to the vectorstore. Handles media file processing for audio/video, * and text embedding for all document types. Updates document metadata during processing. * @param doc The document to add. @@ -103,21 +594,35 @@ export class Vectorstore { const local_file_path = CsvCast(doc.data)?.url?.pathname ?? PDFCast(doc.data)?.url?.pathname ?? VideoCast(doc.data)?.url?.pathname ?? AudioCast(doc.data)?.url?.pathname; if (!local_file_path) { - console.log('Invalid file path.'); + console.log('Not adding to vectorstore. Invalid file path for vectorstore addition.'); return; } const isAudioOrVideo = local_file_path.endsWith('.mp3') || local_file_path.endsWith('.mp4'); let result: AI_Document & { doc_id: string }; + if (isAudioOrVideo) { console.log('Processing media file...'); - const response = (await Networking.PostToServer('/processMediaFile', { fileName: path.basename(local_file_path) })) as { [key: string]: unknown }; - const segmentedTranscript = response.condensed; + progressCallback(10, 'Preparing media file for transcription...'); + + // Post to processMediaFile endpoint to get the transcript + const response = await Networking.PostToServer('/processMediaFile', { fileName: path.basename(local_file_path) }); + progressCallback(60, 'Transcription completed. Processing transcript...'); + + // Type assertion to handle the response properties + const typedResponse = response as { + condensed: Array<{ text: string; indexes: string[]; start: number; end: number }>; + full: Array<unknown>; + summary: string; + }; + + const segmentedTranscript = typedResponse.condensed; console.log(segmentedTranscript); - const summary = response.summary as string; + const summary = typedResponse.summary; doc.summary = summary; + // Generate embeddings for each chunk - const texts = (segmentedTranscript as { text: string }[])?.map(chunk => chunk.text); + const texts = segmentedTranscript.map(chunk => chunk.text); try { const embeddingsResponse = await this.openai.embeddings.create({ @@ -125,54 +630,57 @@ export class Vectorstore { input: texts, encoding_format: 'float', }); + progressCallback(85, 'Embeddings generated. Finalizing document...'); - doc.original_segments = JSON.stringify(response.full); - doc.ai_type = local_file_path.endsWith('.mp3') ? 'audio' : 'video'; - const doc_id = uuidv4(); + doc.original_segments = JSON.stringify(typedResponse.full); + const doc_id = doc[Id]; + console.log('doc_id in vectorstore', doc_id); + // Generate chunk IDs upfront so we can register them + const chunkIds = segmentedTranscript.map(() => uuidv4()); // Add transcript and embeddings to metadata result = { doc_id, purpose: '', file_name: local_file_path, num_pages: 0, - summary: '', - chunks: (segmentedTranscript as { text: string; start: number; end: number; indexes: string[] }[]).map((chunk, index) => ({ - id: uuidv4(), + summary: summary, + chunks: segmentedTranscript.map((chunk, index) => ({ + id: chunkIds[index], // Use pre-generated chunk ID values: (embeddingsResponse.data as Embedding[])[index].embedding, // Assign embedding metadata: { indexes: chunk.indexes, original_document: local_file_path, - doc_id: doc_id, + doc_id: doc_id, // Ensure doc_id is consistent file_path: local_file_path, start_time: chunk.start, end_time: chunk.end, text: chunk.text, - type: CHUNK_TYPE.VIDEO, + type: local_file_path.endsWith('.mp3') ? CHUNK_TYPE.AUDIO : CHUNK_TYPE.VIDEO, }, })), type: 'media', }; + progressCallback(95, 'Adding document to vectorstore...'); } catch (error) { console.error('Error generating embeddings:', error); + doc.ai_document_status = 'ERROR'; throw new Error('Embedding generation failed'); } doc.segmented_transcript = JSON.stringify(segmentedTranscript); - // Simplify chunks for storage - const simplifiedChunks = result.chunks.map(chunk => ({ - chunkId: chunk.id, - start_time: chunk.metadata.start_time, - end_time: chunk.metadata.end_time, - indexes: chunk.metadata.indexes, - chunkType: CHUNK_TYPE.VIDEO, - text: chunk.metadata.text, - })); - doc.chunk_simpl = JSON.stringify({ chunks: simplifiedChunks }); + // Use doc manager to add simplified chunks + const docType = local_file_path.endsWith('.mp3') ? 'audio' : 'video'; + const simplifiedChunks = this.docManager.getSimplifiedChunks(result.chunks, docType); + doc.chunk_simplified = JSON.stringify(simplifiedChunks); + this.docManager.addSimplifiedChunks(simplifiedChunks); } else { - // Existing document processing logic remains unchanged + // Process regular document console.log('Processing regular document...'); - const { jobId } = (await Networking.PostToServer('/createDocument', { file_path: local_file_path })) as { jobId: string }; + const createDocumentResponse = await Networking.PostToServer('/createDocument', { file_path: local_file_path, doc_id: doc[Id] }); + + // Type assertion for the response + const { jobId } = createDocumentResponse as { jobId: string }; while (true) { await new Promise(resolve => setTimeout(resolve, 2000)); @@ -188,29 +696,28 @@ export class Vectorstore { progressCallback(progressResponseJson.progress, progressResponseJson.step); } } - if (!doc.chunk_simpl) { - doc.chunk_simpl = JSON.stringify({ chunks: [] }); + + // Collect all chunk IDs + const chunkIds = result.chunks.map(chunk => chunk.id); + + if (result.doc_id !== doc[Id]) { + console.log('doc_id in vectorstore', result.doc_id, 'does not match doc_id in doc', doc[Id]); } + + // Use doc manager to add simplified chunks - determine document type from file extension + const fileExt = path.extname(local_file_path).toLowerCase(); + const docType = fileExt === '.pdf' ? 'pdf' : fileExt === '.csv' ? 'csv' : 'text'; + const simplifiedChunks = this.docManager.getSimplifiedChunks(result.chunks, docType); + doc.chunk_simplified = JSON.stringify(simplifiedChunks); + this.docManager.addSimplifiedChunks(simplifiedChunks); + doc.summary = result.summary; doc.ai_purpose = result.purpose; - - result.chunks.forEach((chunk: RAGChunk) => { - const chunkToAdd = { - chunkId: chunk.id, - startPage: chunk.metadata.start_page, - endPage: chunk.metadata.end_page, - location: chunk.metadata.location, - chunkType: chunk.metadata.type as CHUNK_TYPE, - text: chunk.metadata.text, - }; - const new_chunk_simpl = JSON.parse(StrCast(doc.chunk_simpl)); - new_chunk_simpl.chunks = new_chunk_simpl.chunks.concat(chunkToAdd); - doc.chunk_simpl = JSON.stringify(new_chunk_simpl); - }); } // Index the document await this.indexDocument(result); + progressCallback(100, 'Document added successfully!'); // Preserve existing metadata updates if (!doc.vectorstore_id) { @@ -286,7 +793,7 @@ export class Vectorstore { * @param topK The number of top results to return (default is 10). * @returns A list of document chunks that match the query. */ - async retrieve(query: string, topK: number = 10): Promise<RAGChunk[]> { + async retrieve(query: string, topK: number = 10, docIds?: string[]): Promise<RAGChunk[]> { console.log(`Retrieving chunks for query: ${query}`); try { // Generate an embedding for the query using OpenAI. @@ -297,40 +804,45 @@ export class Vectorstore { }); const queryEmbedding = queryEmbeddingResponse.data[0].embedding; + const _docIds = docIds?.length === 0 || !docIds ? this.docManager.docIds : docIds; - // Extract the embedding from the response. + console.log('Using document IDs for retrieval:', _docIds); - console.log(this._doc_ids()); // Query the Pinecone index using the embedding and filter by document IDs. + // We'll query based on document IDs that are registered in the document manager const queryResponse: QueryResponse = await this.index.query({ vector: queryEmbedding, filter: { - doc_id: { $in: this._doc_ids() }, + doc_id: { $in: _docIds }, }, topK, includeValues: true, includeMetadata: true, }); - console.log(queryResponse); - - // Map the results into RAGChunks and return them. - return queryResponse.matches.map( - match => - ({ - id: match.id, - values: match.values as number[], - metadata: match.metadata as { - text: string; - type: string; - original_document: string; - file_path: string; - doc_id: string; - location: string; - start_page: number; - end_page: number; - }, - }) as RAGChunk - ); + console.log(`Found ${queryResponse.matches.length} matching chunks`); + + // For each retrieved chunk, ensure its document ID is registered in the document manager + // This maintains compatibility with existing code while ensuring consistency + const processedMatches = queryResponse.matches.map(match => { + const chunk = { + id: match.id, + values: match.values as number[], + metadata: match.metadata as { + text: string; + type: string; + original_document: string; + file_path: string; + doc_id: string; + location: string; + start_page: number; + end_page: number; + }, + } as RAGChunk; + + return chunk; + }); + + return processedMatches; } catch (error) { console.error(`Error retrieving chunks: ${error}`); return []; |
