aboutsummaryrefslogtreecommitdiff
path: root/src/client/views/nodes/chatbot/vectorstore
diff options
context:
space:
mode:
authorJoanne <zehan_ding@brown.edu>2025-06-17 13:02:50 -0400
committerJoanne <zehan_ding@brown.edu>2025-06-17 13:02:50 -0400
commit2aa2c26b95a539d220e46b20cdfbef6ae39d6c43 (patch)
tree344a6f798f692fdd4921ab5a6762e907f5ad7b06 /src/client/views/nodes/chatbot/vectorstore
parent430db63077868fa54829721d6530a810aa4d4588 (diff)
parentccfdf905400cd4b81d8cde0f16bb0e15cd65621b (diff)
Merge branch 'agent-paper-main' of https://github.com/brown-dash/Dash-Web into joanne-tutorialagent
Diffstat (limited to 'src/client/views/nodes/chatbot/vectorstore')
-rw-r--r--src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts662
1 file changed, 587 insertions, 75 deletions
diff --git a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
index 6d524e40f..72060973b 100644
--- a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
+++ b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
@@ -15,6 +15,8 @@ import { Networking } from '../../../../Network';
import { AI_Document, CHUNK_TYPE, RAGChunk } from '../types/types';
import OpenAI from 'openai';
import { Embedding } from 'openai/resources';
+import { AgentDocumentManager } from '../utils/AgentDocumentManager';
+import { Id } from '../../../../../fields/FieldSymbols';
dotenv.config();
@@ -23,23 +25,28 @@ dotenv.config();
* and OpenAI text-embedding-3-large for text embedding. It handles AI document management, uploads, and query-based retrieval.
*/
export class Vectorstore {
- private pinecone: Pinecone; // Pinecone client for managing the vector index.
+ private pinecone!: Pinecone; // Pinecone client for managing the vector index.
private index!: Index; // The specific Pinecone index used for document chunks.
- private openai: OpenAI; // OpenAI client for generating embeddings.
+ private summaryIndex!: Index; // The Pinecone index used for file summaries.
+ private openai!: OpenAI; // OpenAI client for generating embeddings.
private indexName: string = 'pdf-chatbot'; // Default name for the index.
- private _id: string; // Unique ID for the Vectorstore instance.
- private _doc_ids: () => string[]; // List of document IDs handled by this instance.
-
+ private summaryIndexName: string = 'file-summaries'; // Name for the summaries index.
+ private _id!: string; // Unique ID for the Vectorstore instance.
+ private docManager!: AgentDocumentManager; // Document manager for handling documents
+ private summaryCacheCount: number = 0; // Cache for the number of summaries
documents: AI_Document[] = []; // Store the documents indexed in the vectorstore.
+ private debug: boolean = true; // Enable debugging
+ private initialized: boolean = false;
/**
* Initializes the Pinecone and OpenAI clients, sets up the document ID list,
* and initializes the Pinecone index.
* @param id The unique identifier for the vectorstore instance.
- * @param doc_ids A function that returns a list of document IDs.
+ * @param docManager An instance of AgentDocumentManager to handle document management.
*/
- constructor(id: string, doc_ids: () => string[]) {
- const pineconeApiKey = process.env.PINECONE_API_KEY;
+ constructor(id: string, docManager: AgentDocumentManager) {
+ if (this.debug) console.log(`[DEBUG] Initializing Vectorstore with ID: ${id}`);
+ const pineconeApiKey = process.env.PINECONE_API_KEY; // SECURITY: a live Pinecone API key ('pcsk_…') was hard-coded here and committed — credential redacted; the leaked key must be rotated and the secret loaded from the environment so the guard below remains meaningful
if (!pineconeApiKey) {
console.log('PINECONE_API_KEY is not defined - Vectorstore will be unavailable');
return;
@@ -49,8 +56,39 @@ export class Vectorstore {
this.pinecone = new Pinecone({ apiKey: pineconeApiKey });
this.openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY, dangerouslyAllowBrowser: true });
this._id = id;
- this._doc_ids = doc_ids;
- this.initializeIndex();
+ this.docManager = docManager;
+
+ // Proper async initialization sequence
+ this.initializeAsync(id);
+ }
+
+ /**
+ * Handles async initialization of all components
+ */
+ private async initializeAsync(id: string) {
+ try {
+ if (this.debug) console.log(`[DEBUG] Starting async initialization sequence for Vectorstore ID: ${id}`);
+
+ // Initialize the main document index
+ await this.initializeIndex();
+
+ // Initialize the summary index
+ await this.initializeSummaryIndex();
+
+ this.initialized = true;
+ if (this.debug) console.log(`[DEBUG] ✅ Vectorstore initialization complete, running test query...`);
+
+ // Run a single test query instead of multiple
+ await this.runSingleTestQuery();
+ } catch (error) {
+ console.error('[ERROR] Failed to initialize Vectorstore:', error);
+ }
+ }
+
+ async getFileNames() {
+ const response = await Networking.FetchFromServer('/getFileNames');
+ const filepaths = JSON.parse(response);
+ return filepaths;
}
/**
@@ -58,10 +96,13 @@ export class Vectorstore {
* Sets the index to use cosine similarity for vector similarity calculations.
*/
private async initializeIndex() {
+ if (this.debug) console.log(`[DEBUG] Initializing main document index: ${this.indexName}`);
const indexList: IndexList = await this.pinecone.listIndexes();
+ if (this.debug) console.log(`[DEBUG] Available Pinecone indexes: ${indexList.indexes?.map(i => i.name).join(', ') || 'none'}`);
// Check if the index already exists, otherwise create it.
if (!indexList.indexes?.some(index => index.name === this.indexName)) {
+ if (this.debug) console.log(`[DEBUG] Creating new index: ${this.indexName}`);
await this.pinecone.createIndex({
name: this.indexName,
dimension: 3072,
@@ -73,6 +114,9 @@ export class Vectorstore {
},
},
});
+ if (this.debug) console.log(`[DEBUG] ✅ Index ${this.indexName} created successfully`);
+ } else {
+ if (this.debug) console.log(`[DEBUG] ✅ Using existing index: ${this.indexName}`);
}
// Set the index for future use.
@@ -80,6 +124,453 @@ export class Vectorstore {
}
/**
+ * Initializes the Pinecone index for file summaries.
+ * Checks if it exists and creates it if necessary.
+ */
+ private async initializeSummaryIndex() {
+ if (this.debug) console.log(`[DEBUG] Initializing file summaries index: ${this.summaryIndexName}`);
+ const indexList: IndexList = await this.pinecone.listIndexes();
+
+ // Check if the index already exists, otherwise create it.
+ if (!indexList.indexes?.some(index => index.name === this.summaryIndexName)) {
+ if (this.debug) console.log(`[DEBUG] Creating new summary index: ${this.summaryIndexName}`);
+ await this.pinecone.createIndex({
+ name: this.summaryIndexName,
+ dimension: 3072,
+ metric: 'cosine',
+ spec: {
+ serverless: {
+ cloud: 'aws',
+ region: 'us-east-1',
+ },
+ },
+ });
+ if (this.debug) console.log(`[DEBUG] ✅ Summary index ${this.summaryIndexName} created successfully`);
+ } else {
+ if (this.debug) console.log(`[DEBUG] ✅ Using existing summary index: ${this.summaryIndexName}`);
+ }
+
+ // Set the summaries index for future use.
+ this.summaryIndex = this.pinecone.Index(this.summaryIndexName);
+
+ // Check if we need to index the file summaries
+ await this.processFileSummaries();
+ }
+
+ /**
+ * Processes file summaries from the JSON file if needed.
+ * Checks if the index contains the correct number of summaries before embedding.
+ */
+ private async processFileSummaries() {
+ if (this.debug) console.log(`[DEBUG] Starting file summaries processing`);
+ try {
+ // Get file summaries from the server
+ if (this.debug) console.log(`[DEBUG] Fetching file summaries from server...`);
+ const response = await Networking.FetchFromServer('/getFileSummaries');
+
+ if (!response) {
+ console.error('[ERROR] Failed to fetch file summaries');
+ return;
+ }
+ if (this.debug) console.log(`[DEBUG] File summaries response received (${response.length} bytes)`);
+
+ const summaries = JSON.parse(response);
+ const filepaths = Object.keys(summaries);
+ const summaryCount = filepaths.length;
+ this.summaryCacheCount = summaryCount;
+
+ if (this.debug) {
+ console.log(`[DEBUG] File summaries parsed: ${summaryCount} files`);
+ console.log(`[DEBUG] Sample filepaths: ${filepaths.slice(0, 3).join(', ')}...`);
+ console.log(`[DEBUG] Sample summary: "${summaries[filepaths[0]].substring(0, 100)}..."`);
+ }
+
+ // Check if index already has the correct number of summaries
+ try {
+ if (this.debug) console.log(`[DEBUG] Checking summary index stats...`);
+ const indexStats = await this.summaryIndex.describeIndexStats();
+ const vectorCount = indexStats.totalRecordCount;
+
+ if (this.debug) console.log(`[DEBUG] Summary index has ${vectorCount} records, expecting ${summaryCount}`);
+
+ if (vectorCount === summaryCount) {
+ console.log(`[DEBUG] ✅ Summary index already contains ${vectorCount} entries, skipping embedding.`);
+ return;
+ }
+
+ if (this.debug) console.log(`[DEBUG] ⚠️ Summary index contains ${vectorCount} entries, but there are ${summaryCount} summaries. Re-indexing.`);
+ } catch (error) {
+ console.error('[ERROR] Error checking summary index stats:', error);
+ }
+
+ // If we get here, we need to embed the summaries
+ await this.embedAndIndexFileSummaries(summaries);
+ } catch (error) {
+ console.error('[ERROR] Error processing file summaries:', error);
+ }
+ }
+
+ /**
+ * Embeds and indexes file summaries into the summary index.
+ * @param summaries Object mapping filepaths to summaries
+ */
+ private async embedAndIndexFileSummaries(summaries: Record<string, string>) {
+ if (this.debug) console.log(`[DEBUG] Starting embedding and indexing of file summaries...`);
+
+ const filepaths = Object.keys(summaries);
+ const summaryTexts = Object.values(summaries);
+
+ // Split into batches of 100 to avoid exceeding API limits
+ const batchSize = 100;
+ const totalBatches = Math.ceil(filepaths.length / batchSize);
+
+ if (this.debug) console.log(`[DEBUG] Processing ${filepaths.length} files in ${totalBatches} batches of size ${batchSize}`);
+
+ for (let i = 0; i < filepaths.length; i += batchSize) {
+ const batchFilepaths = filepaths.slice(i, i + batchSize);
+ const batchTexts = summaryTexts.slice(i, i + batchSize);
+
+ if (this.debug) {
+ console.log(`[DEBUG] Processing batch ${Math.floor(i / batchSize) + 1}/${totalBatches}`);
+ console.log(`[DEBUG] First file in batch: ${batchFilepaths[0]}`);
+ console.log(`[DEBUG] First summary in batch: "${batchTexts[0].substring(0, 50)}..."`);
+ }
+
+ try {
+ // Generate embeddings for this batch
+ if (this.debug) console.log(`[DEBUG] Generating embeddings for batch of ${batchTexts.length} summaries...`);
+ const startTime = Date.now();
+ const embeddingResponse = await this.openai.embeddings.create({
+ model: 'text-embedding-3-large',
+ input: batchTexts,
+ encoding_format: 'float',
+ });
+ const duration = Date.now() - startTime;
+ if (this.debug) console.log(`[DEBUG] ✅ Embeddings generated in ${duration}ms`);
+
+ // Prepare Pinecone records
+ if (this.debug) console.log(`[DEBUG] Preparing Pinecone records...`);
+ const pineconeRecords: PineconeRecord[] = batchTexts.map((text, index) => {
+ const embedding = (embeddingResponse.data as Embedding[])[index].embedding;
+ if (this.debug && index === 0) console.log(`[DEBUG] Sample embedding dimensions: ${embedding.length}, first few values: [${embedding.slice(0, 5).join(', ')}...]`);
+
+ return {
+ id: uuidv4(), // Generate a unique ID for each summary
+ values: embedding,
+ metadata: {
+ filepath: batchFilepaths[index],
+ summary: text,
+ } as RecordMetadata,
+ };
+ });
+
+ // Upload to Pinecone
+ if (this.debug) console.log(`[DEBUG] Upserting ${pineconeRecords.length} records to Pinecone...`);
+ const upsertStart = Date.now();
+ try {
+ await this.summaryIndex.upsert(pineconeRecords);
+ const upsertDuration = Date.now() - upsertStart;
+ if (this.debug) console.log(`[DEBUG] ✅ Batch ${Math.floor(i / batchSize) + 1}/${totalBatches} indexed in ${upsertDuration}ms`);
+ } catch (upsertError) {
+ console.error(`[ERROR] Failed to upsert batch ${Math.floor(i / batchSize) + 1}/${totalBatches} to Pinecone:`, upsertError);
+ // Try again with smaller batch
+ if (batchTexts.length > 20) {
+ console.log(`[DEBUG] 🔄 Retrying with smaller batch size...`);
+ // Split the batch in half and retry recursively
+ const midpoint = Math.floor(batchTexts.length / 2);
+ const firstHalf = {
+ filepaths: batchFilepaths.slice(0, midpoint),
+ texts: batchTexts.slice(0, midpoint),
+ };
+ const secondHalf = {
+ filepaths: batchFilepaths.slice(midpoint),
+ texts: batchTexts.slice(midpoint),
+ };
+
+ // Create a helper function to retry smaller batches
+ const retryBatch = async (paths: string[], texts: string[], batchNum: string) => {
+ try {
+ if (this.debug) console.log(`[DEBUG] Generating embeddings for sub-batch ${batchNum}...`);
+ const embRes = await this.openai.embeddings.create({
+ model: 'text-embedding-3-large',
+ input: texts,
+ encoding_format: 'float',
+ });
+
+ const records = texts.map((t, idx) => ({
+ id: uuidv4(),
+ values: (embRes.data as Embedding[])[idx].embedding,
+ metadata: {
+ filepath: paths[idx],
+ summary: t,
+ } as RecordMetadata,
+ }));
+
+ if (this.debug) console.log(`[DEBUG] Upserting sub-batch ${batchNum} (${records.length} records)...`);
+ await this.summaryIndex.upsert(records);
+ if (this.debug) console.log(`[DEBUG] ✅ Sub-batch ${batchNum} upserted successfully`);
+ } catch (retryError) {
+ console.error(`[ERROR] Failed to upsert sub-batch ${batchNum}:`, retryError);
+ }
+ };
+
+ await retryBatch(firstHalf.filepaths, firstHalf.texts, `${Math.floor(i / batchSize) + 1}.1`);
+ await retryBatch(secondHalf.filepaths, secondHalf.texts, `${Math.floor(i / batchSize) + 1}.2`);
+ }
+ }
+ } catch (error) {
+ console.error('[ERROR] Error processing batch:', error);
+ }
+ }
+
+ if (this.debug) console.log(`[DEBUG] ✅ File summary indexing complete for all ${filepaths.length} files`);
+
+ // Verify the index was populated correctly
+ try {
+ const indexStats = await this.summaryIndex.describeIndexStats();
+ const vectorCount = indexStats.totalRecordCount;
+ if (this.debug) console.log(`[DEBUG] 🔍 Final index verification: ${vectorCount} records in Pinecone index (expected ${filepaths.length})`);
+ } catch (error) {
+ console.error('[ERROR] Failed to verify index stats:', error);
+ }
+ }
+
+ /**
+ * Searches for file summaries similar to the given query.
+ * @param query The search query
+ * @param topK Number of results to return (default: 5)
+ * @returns Array of filepath and summary pairs with relevance scores
+ */
+ async searchFileSummaries(query: string, topK: number = 5): Promise<Array<{ filepath: string; summary: string; score?: number }>> {
+ if (!this.initialized) {
+ console.error('[ERROR] Cannot search - Vectorstore not fully initialized');
+ return [];
+ }
+
+ if (this.debug) console.log(`[DEBUG] Searching file summaries for query: "${query}" (topK=${topK})`);
+ try {
+ // Generate embedding for the query
+ if (this.debug) console.log(`[DEBUG] Generating embedding for query...`);
+ const startTime = Date.now();
+ const queryEmbeddingResponse = await this.openai.embeddings.create({
+ model: 'text-embedding-3-large',
+ input: query,
+ encoding_format: 'float',
+ });
+ const duration = Date.now() - startTime;
+
+ const queryEmbedding = queryEmbeddingResponse.data[0].embedding;
+ if (this.debug) {
+ console.log(`[DEBUG] ✅ Query embedding generated in ${duration}ms`);
+ console.log(`[DEBUG] Query embedding dimensions: ${queryEmbedding.length}`);
+ }
+
+ // Check if summary index is ready
+ try {
+ const indexStats = await this.summaryIndex.describeIndexStats();
+ const vectorCount = indexStats.totalRecordCount;
+ if (this.debug) console.log(`[DEBUG] Summary index contains ${vectorCount} records`);
+
+ if (vectorCount === 0) {
+ console.error('[ERROR] Summary index is empty, cannot perform search');
+ return [];
+ }
+ } catch (statsError) {
+ console.error('[ERROR] Failed to check summary index stats:', statsError);
+ console.error('[ERROR] Stats error details:', JSON.stringify(statsError));
+ }
+
+ // Test direct API access to Pinecone
+ if (this.debug) console.log(`[DEBUG] Testing Pinecone connection...`);
+ try {
+ const indexes = await this.pinecone.listIndexes();
+ console.log(`[DEBUG] Available Pinecone indexes: ${indexes.indexes?.map(idx => idx.name).join(', ')}`);
+ } catch (connectionError) {
+ console.error('[ERROR] Could not connect to Pinecone:', connectionError);
+ }
+
+ // Query the summaries index
+ if (this.debug) console.log(`[DEBUG] Querying Pinecone summary index (${this.summaryIndexName})...`);
+ const queryStart = Date.now();
+
+ let queryResponse;
+ try {
+ // First, make sure we can access the index
+ const indexInfo = await this.summaryIndex.describeIndexStats();
+ if (this.debug) console.log(`[DEBUG] Index stats:`, indexInfo);
+
+ queryResponse = await this.summaryIndex.query({
+ vector: queryEmbedding,
+ topK,
+ includeMetadata: true,
+ });
+
+ const queryDuration = Date.now() - queryStart;
+
+ if (this.debug) {
+ console.log(`[DEBUG] ✅ Pinecone query completed in ${queryDuration}ms`);
+ console.log(`[DEBUG] Raw Pinecone response:`, JSON.stringify(queryResponse, null, 2));
+ if (queryResponse.matches) {
+ console.log(`[DEBUG] Found ${queryResponse.matches.length} matching summaries`);
+ console.log(`[DEBUG] Match scores: ${queryResponse.matches.map(m => m.score?.toFixed(4)).join(', ')}`);
+ } else {
+ console.log(`[DEBUG] No matches in response`);
+ }
+ }
+ } catch (queryError) {
+ console.error('[ERROR] Pinecone query failed:', queryError);
+ if (typeof queryError === 'object' && queryError !== null) {
+ console.error('[ERROR] Query error details:', JSON.stringify(queryError, null, 2));
+ }
+ return [];
+ }
+
+ if (!queryResponse || !queryResponse.matches || queryResponse.matches.length === 0) {
+ console.log('[DEBUG] ⚠️ No matches found in Pinecone for query');
+ return [];
+ }
+
+ // Format results
+ const results = queryResponse.matches.map(match => {
+ if (!match.metadata) {
+ console.error('[ERROR] Match is missing metadata:', match);
+ return { filepath: 'unknown', summary: 'No summary available' };
+ }
+
+ return {
+ filepath: (match.metadata as { filepath: string }).filepath || 'unknown',
+ summary: (match.metadata as { summary: string }).summary || 'No summary available',
+ score: match.score,
+ };
+ });
+
+ if (this.debug) {
+ if (results.length > 0) {
+ console.log(`[DEBUG] Top result filepath: ${results[0]?.filepath}`);
+ console.log(`[DEBUG] Top result score: ${results[0]?.score}`);
+ console.log(`[DEBUG] Top result summary excerpt: "${results[0]?.summary?.substring(0, 100)}..."`);
+ } else {
+ console.log(`[DEBUG] No results returned after processing`);
+ }
+ }
+
+ return results;
+ } catch (error) {
+ console.error('[ERROR] Error searching file summaries:', error);
+ if (typeof error === 'object' && error !== null) {
+ console.error('[ERROR] Full error details:', JSON.stringify(error, null, 2));
+ }
+ return [];
+ }
+ }
+
+ /**
+ * Runs a single test query after setup to validate the file summary search functionality.
+ */
+ private async runSingleTestQuery() {
+ console.log(`\n[TEST] Running single test query to validate file summary search functionality...`);
+
+ // Verify the index is accessible
+ try {
+ const indexStats = await this.summaryIndex.describeIndexStats();
+ console.log(`[TEST] Pinecone index stats:`, JSON.stringify(indexStats, null, 2));
+ console.log(`[TEST] Summary index contains ${indexStats.totalRecordCount} indexed summaries`);
+ } catch (error) {
+ console.error('[TEST] ❌ Failed to access Pinecone index:', error);
+ return;
+ }
+
+ // Add a brief delay to ensure Pinecone has finished processing
+ console.log('[TEST] Waiting 2 seconds for Pinecone indexing to complete...');
+ await new Promise(resolve => setTimeout(resolve, 2000));
+
+ // Run a single test query
+ const query = 'React components for the UI';
+ console.log(`\n[TEST] Executing query: "${query}"`);
+
+ try {
+ const results = await this.searchFileSummaries(query);
+ console.log(`[TEST] Search returned ${results.length} results:`);
+
+ results.forEach((result, i) => {
+ console.log(`\n[TEST] Result ${i + 1}:`);
+ console.log(`[TEST] File: ${result.filepath}`);
+ console.log(`[TEST] Score: ${result.score}`);
+ console.log(`[TEST] Summary: "${result.summary?.substring(0, 150)}..."`);
+ });
+
+ // If we have results, fetch the content for the first one
+ if (results.length > 0) {
+ const topFilepath = results[0].filepath;
+ console.log(`\n[TEST] Fetching full content for top result: ${topFilepath}`);
+ const content = await this.getFileContent(topFilepath);
+
+ if (content) {
+ console.log(`[TEST] ✅ Content retrieved successfully (${content.length} chars)`);
+ console.log(`[TEST] Content excerpt:\n---\n${content.substring(0, 300)}...\n---`);
+ } else {
+ console.log(`[TEST] ❌ Failed to retrieve content for ${topFilepath}`);
+ }
+ } else {
+ console.log(`\n[TEST] ⚠️ No results to fetch content for`);
+ }
+
+ console.log(`\n[TEST] ✅ Test query completed`);
+ } catch (testError) {
+ console.error(`[TEST] ❌ Test query failed:`, testError);
+ if (typeof testError === 'object' && testError !== null) {
+ console.error('[TEST] Full error details:', JSON.stringify(testError, null, 2));
+ }
+ }
+ }
+
+ /**
+ * Gets the full content of a file by its filepath.
+ * @param filepath The filepath to look up
+ * @returns The file content or null if not found
+ */
+ async getFileContent(filepath: string): Promise<string | null> {
+ if (this.debug) console.log(`[DEBUG] Getting file content for: ${filepath}`);
+ try {
+ const startTime = Date.now();
+
+ // Use the Networking utility for consistent API access
+ // But convert the response to text manually to avoid JSON parsing
+ const rawResponse = await fetch('/getRawFileContent', {
+ method: 'POST',
+ headers: {
+ 'Content-Type': 'application/json',
+ },
+ body: JSON.stringify({ filepath }),
+ });
+
+ if (!rawResponse.ok) {
+ const errorText = await rawResponse.text();
+ console.error(`[ERROR] Server returned error ${rawResponse.status}: ${errorText}`);
+ return null;
+ }
+
+ // Get the raw text content without JSON parsing
+ const content = await rawResponse.text();
+ const duration = Date.now() - startTime;
+
+ if (this.debug) {
+ console.log(`[DEBUG] ✅ File content retrieved in ${duration}ms`);
+ console.log(`[DEBUG] Content length: ${content.length} chars`);
+ console.log(`[DEBUG] Content excerpt: "${content.substring(0, 100)}..."`);
+ }
+
+ return content;
+ } catch (error) {
+ console.error('[ERROR] Error getting file content:', error);
+ if (typeof error === 'object' && error !== null) {
+ console.error('[ERROR] Full error details:', JSON.stringify(error, null, 2));
+ }
+ return null;
+ }
+ }
+
+ /**
* Adds an AI document to the vectorstore. Handles media file processing for audio/video,
* and text embedding for all document types. Updates document metadata during processing.
* @param doc The document to add.
@@ -103,21 +594,35 @@ export class Vectorstore {
const local_file_path = CsvCast(doc.data)?.url?.pathname ?? PDFCast(doc.data)?.url?.pathname ?? VideoCast(doc.data)?.url?.pathname ?? AudioCast(doc.data)?.url?.pathname;
if (!local_file_path) {
- console.log('Invalid file path.');
+ console.log('Not adding to vectorstore. Invalid file path for vectorstore addition.');
return;
}
const isAudioOrVideo = local_file_path.endsWith('.mp3') || local_file_path.endsWith('.mp4');
let result: AI_Document & { doc_id: string };
+
if (isAudioOrVideo) {
console.log('Processing media file...');
- const response = (await Networking.PostToServer('/processMediaFile', { fileName: path.basename(local_file_path) })) as { [key: string]: unknown };
- const segmentedTranscript = response.condensed;
+ progressCallback(10, 'Preparing media file for transcription...');
+
+ // Post to processMediaFile endpoint to get the transcript
+ const response = await Networking.PostToServer('/processMediaFile', { fileName: path.basename(local_file_path) });
+ progressCallback(60, 'Transcription completed. Processing transcript...');
+
+ // Type assertion to handle the response properties
+ const typedResponse = response as {
+ condensed: Array<{ text: string; indexes: string[]; start: number; end: number }>;
+ full: Array<unknown>;
+ summary: string;
+ };
+
+ const segmentedTranscript = typedResponse.condensed;
console.log(segmentedTranscript);
- const summary = response.summary as string;
+ const summary = typedResponse.summary;
doc.summary = summary;
+
// Generate embeddings for each chunk
- const texts = (segmentedTranscript as { text: string }[])?.map(chunk => chunk.text);
+ const texts = segmentedTranscript.map(chunk => chunk.text);
try {
const embeddingsResponse = await this.openai.embeddings.create({
@@ -125,54 +630,57 @@ export class Vectorstore {
input: texts,
encoding_format: 'float',
});
+ progressCallback(85, 'Embeddings generated. Finalizing document...');
- doc.original_segments = JSON.stringify(response.full);
- doc.ai_type = local_file_path.endsWith('.mp3') ? 'audio' : 'video';
- const doc_id = uuidv4();
+ doc.original_segments = JSON.stringify(typedResponse.full);
+ const doc_id = doc[Id];
+ console.log('doc_id in vectorstore', doc_id);
+ // Generate chunk IDs upfront so we can register them
+ const chunkIds = segmentedTranscript.map(() => uuidv4());
// Add transcript and embeddings to metadata
result = {
doc_id,
purpose: '',
file_name: local_file_path,
num_pages: 0,
- summary: '',
- chunks: (segmentedTranscript as { text: string; start: number; end: number; indexes: string[] }[]).map((chunk, index) => ({
- id: uuidv4(),
+ summary: summary,
+ chunks: segmentedTranscript.map((chunk, index) => ({
+ id: chunkIds[index], // Use pre-generated chunk ID
values: (embeddingsResponse.data as Embedding[])[index].embedding, // Assign embedding
metadata: {
indexes: chunk.indexes,
original_document: local_file_path,
- doc_id: doc_id,
+ doc_id: doc_id, // Ensure doc_id is consistent
file_path: local_file_path,
start_time: chunk.start,
end_time: chunk.end,
text: chunk.text,
- type: CHUNK_TYPE.VIDEO,
+ type: local_file_path.endsWith('.mp3') ? CHUNK_TYPE.AUDIO : CHUNK_TYPE.VIDEO,
},
})),
type: 'media',
};
+ progressCallback(95, 'Adding document to vectorstore...');
} catch (error) {
console.error('Error generating embeddings:', error);
+ doc.ai_document_status = 'ERROR';
throw new Error('Embedding generation failed');
}
doc.segmented_transcript = JSON.stringify(segmentedTranscript);
- // Simplify chunks for storage
- const simplifiedChunks = result.chunks.map(chunk => ({
- chunkId: chunk.id,
- start_time: chunk.metadata.start_time,
- end_time: chunk.metadata.end_time,
- indexes: chunk.metadata.indexes,
- chunkType: CHUNK_TYPE.VIDEO,
- text: chunk.metadata.text,
- }));
- doc.chunk_simpl = JSON.stringify({ chunks: simplifiedChunks });
+ // Use doc manager to add simplified chunks
+ const docType = local_file_path.endsWith('.mp3') ? 'audio' : 'video';
+ const simplifiedChunks = this.docManager.getSimplifiedChunks(result.chunks, docType);
+ doc.chunk_simplified = JSON.stringify(simplifiedChunks);
+ this.docManager.addSimplifiedChunks(simplifiedChunks);
} else {
- // Existing document processing logic remains unchanged
+ // Process regular document
console.log('Processing regular document...');
- const { jobId } = (await Networking.PostToServer('/createDocument', { file_path: local_file_path })) as { jobId: string };
+ const createDocumentResponse = await Networking.PostToServer('/createDocument', { file_path: local_file_path, doc_id: doc[Id] });
+
+ // Type assertion for the response
+ const { jobId } = createDocumentResponse as { jobId: string };
while (true) {
await new Promise(resolve => setTimeout(resolve, 2000));
@@ -188,29 +696,28 @@ export class Vectorstore {
progressCallback(progressResponseJson.progress, progressResponseJson.step);
}
}
- if (!doc.chunk_simpl) {
- doc.chunk_simpl = JSON.stringify({ chunks: [] });
+
+ // Collect all chunk IDs
+ const chunkIds = result.chunks.map(chunk => chunk.id);
+
+ if (result.doc_id !== doc[Id]) {
+ console.log('doc_id in vectorstore', result.doc_id, 'does not match doc_id in doc', doc[Id]);
}
+
+ // Use doc manager to add simplified chunks - determine document type from file extension
+ const fileExt = path.extname(local_file_path).toLowerCase();
+ const docType = fileExt === '.pdf' ? 'pdf' : fileExt === '.csv' ? 'csv' : 'text';
+ const simplifiedChunks = this.docManager.getSimplifiedChunks(result.chunks, docType);
+ doc.chunk_simplified = JSON.stringify(simplifiedChunks);
+ this.docManager.addSimplifiedChunks(simplifiedChunks);
+
doc.summary = result.summary;
doc.ai_purpose = result.purpose;
-
- result.chunks.forEach((chunk: RAGChunk) => {
- const chunkToAdd = {
- chunkId: chunk.id,
- startPage: chunk.metadata.start_page,
- endPage: chunk.metadata.end_page,
- location: chunk.metadata.location,
- chunkType: chunk.metadata.type as CHUNK_TYPE,
- text: chunk.metadata.text,
- };
- const new_chunk_simpl = JSON.parse(StrCast(doc.chunk_simpl));
- new_chunk_simpl.chunks = new_chunk_simpl.chunks.concat(chunkToAdd);
- doc.chunk_simpl = JSON.stringify(new_chunk_simpl);
- });
}
// Index the document
await this.indexDocument(result);
+ progressCallback(100, 'Document added successfully!');
// Preserve existing metadata updates
if (!doc.vectorstore_id) {
@@ -286,7 +793,7 @@ export class Vectorstore {
* @param topK The number of top results to return (default is 10).
* @returns A list of document chunks that match the query.
*/
- async retrieve(query: string, topK: number = 10): Promise<RAGChunk[]> {
+ async retrieve(query: string, topK: number = 10, docIds?: string[]): Promise<RAGChunk[]> {
console.log(`Retrieving chunks for query: ${query}`);
try {
// Generate an embedding for the query using OpenAI.
@@ -297,40 +804,45 @@ export class Vectorstore {
});
const queryEmbedding = queryEmbeddingResponse.data[0].embedding;
+ const _docIds = docIds?.length === 0 || !docIds ? this.docManager.docIds : docIds;
- // Extract the embedding from the response.
+ console.log('Using document IDs for retrieval:', _docIds);
- console.log(this._doc_ids());
// Query the Pinecone index using the embedding and filter by document IDs.
+ // We'll query based on document IDs that are registered in the document manager
const queryResponse: QueryResponse = await this.index.query({
vector: queryEmbedding,
filter: {
- doc_id: { $in: this._doc_ids() },
+ doc_id: { $in: _docIds },
},
topK,
includeValues: true,
includeMetadata: true,
});
- console.log(queryResponse);
-
- // Map the results into RAGChunks and return them.
- return queryResponse.matches.map(
- match =>
- ({
- id: match.id,
- values: match.values as number[],
- metadata: match.metadata as {
- text: string;
- type: string;
- original_document: string;
- file_path: string;
- doc_id: string;
- location: string;
- start_page: number;
- end_page: number;
- },
- }) as RAGChunk
- );
+ console.log(`Found ${queryResponse.matches.length} matching chunks`);
+
+ // For each retrieved chunk, ensure its document ID is registered in the document manager
+ // This maintains compatibility with existing code while ensuring consistency
+ const processedMatches = queryResponse.matches.map(match => {
+ const chunk = {
+ id: match.id,
+ values: match.values as number[],
+ metadata: match.metadata as {
+ text: string;
+ type: string;
+ original_document: string;
+ file_path: string;
+ doc_id: string;
+ location: string;
+ start_page: number;
+ end_page: number;
+ },
+ } as RAGChunk;
+
+ return chunk;
+ });
+
+ return processedMatches;
} catch (error) {
console.error(`Error retrieving chunks: ${error}`);
return [];