Diffstat (limited to 'src')
 src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx   |   8
 src/client/views/nodes/chatbot/tools/RAGTool.ts                |   8
 src/client/views/nodes/chatbot/tools/SearchTool.ts             |   2
 src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts |   6
 src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts   |  20
 src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts      | 493
 src/server/ApiManagers/AssistantManager.ts                     | 180
 7 files changed, 701 insertions(+), 16 deletions(-)
diff --git a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx
index 91a7adf24..470f94a8d 100644
--- a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx
+++ b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx
@@ -164,7 +164,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
});
// Process the document first to ensure it has a valid ID
- this.docManager.processDocument(newLinkedDoc);
+ await this.docManager.processDocument(newLinkedDoc);
// Add the document to the vectorstore which will also register chunks
await this.vectorstore.addAIDoc(newLinkedDoc, this.updateProgress);
@@ -648,7 +648,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
const { foundChunk, doc, dataDoc } = this.docManager.getSimplifiedChunkById(chunkId);
console.log('doc: ', doc);
console.log('dataDoc: ', dataDoc);
- if (!foundChunk) {
+ if (!foundChunk || !doc) {
if (doc) {
console.warn(`Chunk not found in document, ${doc.id}, for chunk ID: ${chunkId}`);
DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {});
@@ -1102,8 +1102,8 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
// If there are stored doc IDs in our list of docs to add, process them
if (this._linked_docs_to_add.size > 0) {
- this._linked_docs_to_add.forEach(doc => {
- this.docManager.processDocument(doc);
+ this._linked_docs_to_add.forEach(async doc => {
+ await this.docManager.processDocument(doc);
});
}
}
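
Note on the ChatBox.tsx hunk above: Array.prototype.forEach discards the promises returned by an async callback, so the forEach(async doc => { await ... }) form still does not wait for processDocument to finish, and rejections go unhandled; the same pattern reappears in the AgentDocumentManager.ts diff below. A minimal sketch of alternatives, assuming Doc and AgentDocumentManager are imported from the repo and processDocument returns Promise<string> as in this diff:

    // Sequential: each document is fully processed before the next starts,
    // and a failure propagates to the caller.
    async function processSequentially(docs: Iterable<Doc>, docManager: AgentDocumentManager): Promise<void> {
        for (const doc of docs) {
            await docManager.processDocument(doc);
        }
    }

    // Concurrent: start all calls, then wait for every one to finish.
    async function processConcurrently(docs: Iterable<Doc>, docManager: AgentDocumentManager): Promise<void> {
        await Promise.all(Array.from(docs, doc => docManager.processDocument(doc)));
    }
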
diff --git a/src/client/views/nodes/chatbot/tools/RAGTool.ts b/src/client/views/nodes/chatbot/tools/RAGTool.ts
index 90b803d21..af44de520 100644
--- a/src/client/views/nodes/chatbot/tools/RAGTool.ts
+++ b/src/client/views/nodes/chatbot/tools/RAGTool.ts
@@ -12,6 +12,12 @@ const ragToolParams = [
description: "A detailed prompt representing an ideal chunk to embed and compare against document vectors to retrieve the most relevant content for answering the user's query.",
required: true,
},
+ {
+ name: 'doc_ids',
+ type: 'string[]',
+ description: 'An optional array of document IDs to retrieve chunks from. To retrieve chunks from all documents, pass an empty array: [] (do not omit this parameter).',
+ required: false,
+ },
] as const;
type RAGToolParamsType = typeof ragToolParams;
@@ -69,7 +75,7 @@ export class RAGTool extends BaseTool<RAGToolParamsType> {
}
async execute(args: ParametersType<RAGToolParamsType>): Promise<Observation[]> {
- const relevantChunks = await this.vectorstore.retrieve(args.hypothetical_document_chunk);
+ const relevantChunks = await this.vectorstore.retrieve(args.hypothetical_document_chunk, undefined, args.doc_ids ?? undefined);
const formattedChunks = await this.getFormattedChunks(relevantChunks);
return formattedChunks;
}
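
For reference, a hedged sketch of how an agent call would exercise the new doc_ids parameter; the invocation shape is assumed from ParametersType and the parameter rules above, and the IDs are hypothetical:

    // Restrict retrieval to two specific documents:
    const scoped = await ragTool.execute({
        hypothetical_document_chunk: 'A chunk describing the project architecture',
        doc_ids: ['doc-123', 'doc-456'],
    });

    // Retrieve across all registered documents by passing an empty array,
    // which retrieve() maps to docManager.docIds:
    const all = await ragTool.execute({
        hypothetical_document_chunk: 'A chunk describing the project architecture',
        doc_ids: [],
    });
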
diff --git a/src/client/views/nodes/chatbot/tools/SearchTool.ts b/src/client/views/nodes/chatbot/tools/SearchTool.ts
index 43f14ea83..8e6edce8c 100644
--- a/src/client/views/nodes/chatbot/tools/SearchTool.ts
+++ b/src/client/views/nodes/chatbot/tools/SearchTool.ts
@@ -22,7 +22,7 @@ type SearchToolParamsType = typeof searchToolParams;
const searchToolInfo: ToolInfo<SearchToolParamsType> = {
name: 'searchTool',
- citationRules: 'No citation needed. Cannot cite search results for a response. Use web scraping tools to cite specific information.',
+ citationRules: 'Cite search results whenever they are relevant to the response, using the chunk_id of each result. Do not cite results that are irrelevant to the response.',
parameterRules: searchToolParams,
description: 'Search the web to find a wide range of websites related to a query or multiple queries. Returns a list of websites and their overviews based on the search queries.',
};
diff --git a/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts b/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
index 495a985cb..727d35e2c 100644
--- a/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
+++ b/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts
@@ -22,6 +22,7 @@ const websiteInfoScraperToolInfo: ToolInfo<WebsiteInfoScraperToolParamsType> = {
name: 'websiteInfoScraper',
description: 'Scrape detailed information from specific websites relevant to the user query. Returns the text content of the webpages for further analysis and grounding.',
citationRules: `
+ !IMPORTANT! THESE CHUNKS REPLACE THE CHUNKS THAT ARE RETURNED FROM THE SEARCHTOOL.
Your task is to provide a comprehensive response to the user's prompt using the content scraped from relevant websites. Ensure you follow these guidelines for structuring your response:
1. Grounded Text Tag Structure:
@@ -88,6 +89,7 @@ export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParam
console.log(url);
console.log(chunkDoc);
console.log(chunkDoc.data);
+ const id = chunkDoc.id;
// Validate URL format
try {
new URL(url); // This will throw if URL is invalid
@@ -130,7 +132,7 @@ export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParam
if (retryCount === maxRetries) {
return {
type: 'text',
- text: `<chunk chunk_id="${chunkDoc.id}" chunk_type="url">\n${website_plain_text}\nNote: Limited content was retrieved from this URL.\n</chunk>`,
+ text: `<chunk chunk_id="${id}" chunk_type="url">\n${website_plain_text}\nNote: Limited content was retrieved from this URL.\n</chunk>`,
} as Observation;
}
@@ -142,7 +144,7 @@ export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParam
// Process and return content if it looks good
return {
type: 'text',
- text: `<chunk chunk_id="${chunkDoc.id}" chunk_type="url">\n${website_plain_text}\n</chunk>`,
+ text: `<chunk chunk_id="${id}" chunk_type="url">\n${website_plain_text}\n</chunk>`,
} as Observation;
} catch (error) {
lastError = error instanceof Error ? error.message : 'Unknown error';
diff --git a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts
index 33eec5972..3c8b49f33 100644
--- a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts
+++ b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts
@@ -153,9 +153,9 @@ export class AgentDocumentManager {
console.log(`Found ${linkedDocs.length} linked documents via LinkManager`);
// Process the linked documents
- linkedDocs.forEach((doc: Doc | undefined) => {
+ linkedDocs.forEach(async (doc: Doc | undefined) => {
if (doc) {
- this.processDocument(doc);
+ await this.processDocument(doc);
console.log('Processed linked document:', doc[Id], doc.title, doc.type);
}
});
@@ -170,7 +170,7 @@ export class AgentDocumentManager {
* @param doc The document to process
*/
@action
- public processDocument(doc: Doc): string {
+ public async processDocument(doc: Doc): Promise<string> {
// Ensure document has a persistent ID
const docId = this.ensureDocumentId(doc);
if (doc.chunk_simplified) {
@@ -900,7 +900,7 @@ export class AgentDocumentManager {
}
});
- const id = this.processDocument(doc);
+ const id = await this.processDocument(doc);
return id;
} else {
throw new Error(`Error creating document. Created document not found.`);
@@ -1081,6 +1081,18 @@ export class AgentDocumentManager {
return { foundChunk: this.simplifiedChunks.get(chunkId), doc: this.getDocument(this.simplifiedChunks.get(chunkId)?.doc_id || chunkId), dataDoc: this.getDataDocument(this.simplifiedChunks.get(chunkId)?.doc_id || chunkId) };
}
+ public getChunkIdsFromDocIds(docIds: string[]): string[] {
+ return docIds
+ .map(docId => {
+ for (const chunk of this.simplifiedChunks.values()) {
+ if (chunk.doc_id === docId) {
+ return chunk.chunkId;
+ }
+ }
+ })
+ .filter(chunkId => chunkId !== undefined) as string[];
+ }
+
/**
* Gets the original segments from a media document
* @param doc The document containing original media segments
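
Note that getChunkIdsFromDocIds above returns at most one chunk ID per document: the inner for loop returns as soon as the first chunk with a matching doc_id is found. If the intent is to collect every chunk belonging to each document, a sketch under the same assumptions (simplifiedChunks as a Map whose values carry doc_id and chunkId):

    public getAllChunkIdsFromDocIds(docIds: string[]): string[] {
        const wanted = new Set(docIds);
        // One pass over the chunks instead of one scan per document ID.
        return Array.from(this.simplifiedChunks.values())
            .filter(chunk => wanted.has(chunk.doc_id))
            .map(chunk => chunk.chunkId);
    }
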
diff --git a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
index 252672dfc..5c2d0e5ea 100644
--- a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
+++ b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
@@ -27,11 +27,16 @@ dotenv.config();
export class Vectorstore {
private pinecone!: Pinecone; // Pinecone client for managing the vector index.
private index!: Index; // The specific Pinecone index used for document chunks.
+ private summaryIndex!: Index; // The Pinecone index used for file summaries.
private openai!: OpenAI; // OpenAI client for generating embeddings.
private indexName: string = 'pdf-chatbot'; // Default name for the index.
+ private summaryIndexName: string = 'file-summaries'; // Name for the summaries index.
private _id!: string; // Unique ID for the Vectorstore instance.
private docManager!: AgentDocumentManager; // Document manager for handling documents
+ private summaryCacheCount: number = 0; // Cache for the number of summaries
documents: AI_Document[] = []; // Store the documents indexed in the vectorstore.
+ private debug: boolean = true; // Enable debugging
+ private initialized: boolean = false;
/**
* Initializes the Pinecone and OpenAI clients, sets up the document ID list,
@@ -40,6 +45,7 @@ export class Vectorstore {
* @param docManager An instance of AgentDocumentManager to handle document management.
*/
constructor(id: string, docManager: AgentDocumentManager) {
+ if (this.debug) console.log(`[DEBUG] Initializing Vectorstore with ID: ${id}`);
const pineconeApiKey = 'pcsk_3txLxJ_9fxdmAph4csnq4yxoDF5De5A8bJvjWaXXigBgshy4eoXggrXcxATJiH8vzXbrKm';
if (!pineconeApiKey) {
console.log('PINECONE_API_KEY is not defined - Vectorstore will be unavailable');
@@ -51,7 +57,32 @@ export class Vectorstore {
this.openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY, dangerouslyAllowBrowser: true });
this._id = id;
this.docManager = docManager;
- this.initializeIndex();
+
+ // Proper async initialization sequence
+ this.initializeAsync(id);
+ }
+
+ /**
+ * Handles async initialization of all components
+ */
+ private async initializeAsync(id: string) {
+ try {
+ if (this.debug) console.log(`[DEBUG] Starting async initialization sequence for Vectorstore ID: ${id}`);
+
+ // Initialize the main document index
+ await this.initializeIndex();
+
+ // Initialize the summary index
+ await this.initializeSummaryIndex();
+
+ this.initialized = true;
+ if (this.debug) console.log(`[DEBUG] ✅ Vectorstore initialization complete, running test query...`);
+
+ // Run a single test query instead of multiple
+ await this.runSingleTestQuery();
+ } catch (error) {
+ console.error('[ERROR] Failed to initialize Vectorstore:', error);
+ }
}
/**
@@ -59,10 +90,13 @@ export class Vectorstore {
* Sets the index to use cosine similarity for vector similarity calculations.
*/
private async initializeIndex() {
+ if (this.debug) console.log(`[DEBUG] Initializing main document index: ${this.indexName}`);
const indexList: IndexList = await this.pinecone.listIndexes();
+ if (this.debug) console.log(`[DEBUG] Available Pinecone indexes: ${indexList.indexes?.map(i => i.name).join(', ') || 'none'}`);
// Check if the index already exists, otherwise create it.
if (!indexList.indexes?.some(index => index.name === this.indexName)) {
+ if (this.debug) console.log(`[DEBUG] Creating new index: ${this.indexName}`);
await this.pinecone.createIndex({
name: this.indexName,
dimension: 3072,
@@ -74,6 +108,9 @@ export class Vectorstore {
},
},
});
+ if (this.debug) console.log(`[DEBUG] ✅ Index ${this.indexName} created successfully`);
+ } else {
+ if (this.debug) console.log(`[DEBUG] ✅ Using existing index: ${this.indexName}`);
}
// Set the index for future use.
@@ -81,6 +118,453 @@ export class Vectorstore {
}
/**
+ * Initializes the Pinecone index for file summaries.
+ * Checks if it exists and creates it if necessary.
+ */
+ private async initializeSummaryIndex() {
+ if (this.debug) console.log(`[DEBUG] Initializing file summaries index: ${this.summaryIndexName}`);
+ const indexList: IndexList = await this.pinecone.listIndexes();
+
+ // Check if the index already exists, otherwise create it.
+ if (!indexList.indexes?.some(index => index.name === this.summaryIndexName)) {
+ if (this.debug) console.log(`[DEBUG] Creating new summary index: ${this.summaryIndexName}`);
+ await this.pinecone.createIndex({
+ name: this.summaryIndexName,
+ dimension: 3072,
+ metric: 'cosine',
+ spec: {
+ serverless: {
+ cloud: 'aws',
+ region: 'us-east-1',
+ },
+ },
+ });
+ if (this.debug) console.log(`[DEBUG] ✅ Summary index ${this.summaryIndexName} created successfully`);
+ } else {
+ if (this.debug) console.log(`[DEBUG] ✅ Using existing summary index: ${this.summaryIndexName}`);
+ }
+
+ // Set the summaries index for future use.
+ this.summaryIndex = this.pinecone.Index(this.summaryIndexName);
+
+ // Check if we need to index the file summaries
+ await this.processFileSummaries();
+ }
+
+ /**
+ * Processes file summaries from the JSON file if needed.
+ * Checks if the index contains the correct number of summaries before embedding.
+ */
+ private async processFileSummaries() {
+ if (this.debug) console.log(`[DEBUG] Starting file summaries processing`);
+ try {
+ // Get file summaries from the server
+ if (this.debug) console.log(`[DEBUG] Fetching file summaries from server...`);
+ const response = await Networking.FetchFromServer('/getFileSummaries');
+
+ if (!response) {
+ console.error('[ERROR] Failed to fetch file summaries');
+ return;
+ }
+ if (this.debug) console.log(`[DEBUG] File summaries response received (${response.length} bytes)`);
+
+ const summaries = JSON.parse(response);
+ const filepaths = Object.keys(summaries);
+ const summaryCount = filepaths.length;
+ this.summaryCacheCount = summaryCount;
+
+ if (this.debug) {
+ console.log(`[DEBUG] File summaries parsed: ${summaryCount} files`);
+ console.log(`[DEBUG] Sample filepaths: ${filepaths.slice(0, 3).join(', ')}...`);
+ console.log(`[DEBUG] Sample summary: "${summaries[filepaths[0]].substring(0, 100)}..."`);
+ }
+
+ // Check if index already has the correct number of summaries
+ try {
+ if (this.debug) console.log(`[DEBUG] Checking summary index stats...`);
+ const indexStats = await this.summaryIndex.describeIndexStats();
+ const vectorCount = indexStats.totalRecordCount;
+
+ if (this.debug) console.log(`[DEBUG] Summary index has ${vectorCount} records, expecting ${summaryCount}`);
+
+ if (vectorCount === summaryCount) {
+ console.log(`[DEBUG] ✅ Summary index already contains ${vectorCount} entries, skipping embedding.`);
+ return;
+ }
+
+ if (this.debug) console.log(`[DEBUG] ⚠️ Summary index contains ${vectorCount} entries, but there are ${summaryCount} summaries. Re-indexing.`);
+ } catch (error) {
+ console.error('[ERROR] Error checking summary index stats:', error);
+ }
+
+ // If we get here, we need to embed the summaries
+ await this.embedAndIndexFileSummaries(summaries);
+ } catch (error) {
+ console.error('[ERROR] Error processing file summaries:', error);
+ }
+ }
+
+ /**
+ * Embeds and indexes file summaries into the summary index.
+ * @param summaries Object mapping filepaths to summaries
+ */
+ private async embedAndIndexFileSummaries(summaries: Record<string, string>) {
+ if (this.debug) console.log(`[DEBUG] Starting embedding and indexing of file summaries...`);
+
+ const filepaths = Object.keys(summaries);
+ const summaryTexts = Object.values(summaries);
+
+ // Split into batches of 100 to avoid exceeding API limits
+ const batchSize = 100;
+ const totalBatches = Math.ceil(filepaths.length / batchSize);
+
+ if (this.debug) console.log(`[DEBUG] Processing ${filepaths.length} files in ${totalBatches} batches of size ${batchSize}`);
+
+ for (let i = 0; i < filepaths.length; i += batchSize) {
+ const batchFilepaths = filepaths.slice(i, i + batchSize);
+ const batchTexts = summaryTexts.slice(i, i + batchSize);
+
+ if (this.debug) {
+ console.log(`[DEBUG] Processing batch ${Math.floor(i / batchSize) + 1}/${totalBatches}`);
+ console.log(`[DEBUG] First file in batch: ${batchFilepaths[0]}`);
+ console.log(`[DEBUG] First summary in batch: "${batchTexts[0].substring(0, 50)}..."`);
+ }
+
+ try {
+ // Generate embeddings for this batch
+ if (this.debug) console.log(`[DEBUG] Generating embeddings for batch of ${batchTexts.length} summaries...`);
+ const startTime = Date.now();
+ const embeddingResponse = await this.openai.embeddings.create({
+ model: 'text-embedding-3-large',
+ input: batchTexts,
+ encoding_format: 'float',
+ });
+ const duration = Date.now() - startTime;
+ if (this.debug) console.log(`[DEBUG] ✅ Embeddings generated in ${duration}ms`);
+
+ // Prepare Pinecone records
+ if (this.debug) console.log(`[DEBUG] Preparing Pinecone records...`);
+ const pineconeRecords: PineconeRecord[] = batchTexts.map((text, index) => {
+ const embedding = (embeddingResponse.data as Embedding[])[index].embedding;
+ if (this.debug && index === 0) console.log(`[DEBUG] Sample embedding dimensions: ${embedding.length}, first few values: [${embedding.slice(0, 5).join(', ')}...]`);
+
+ return {
+ id: uuidv4(), // Generate a unique ID for each summary
+ values: embedding,
+ metadata: {
+ filepath: batchFilepaths[index],
+ summary: text,
+ } as RecordMetadata,
+ };
+ });
+
+ // Upload to Pinecone
+ if (this.debug) console.log(`[DEBUG] Upserting ${pineconeRecords.length} records to Pinecone...`);
+ const upsertStart = Date.now();
+ try {
+ await this.summaryIndex.upsert(pineconeRecords);
+ const upsertDuration = Date.now() - upsertStart;
+ if (this.debug) console.log(`[DEBUG] ✅ Batch ${Math.floor(i / batchSize) + 1}/${totalBatches} indexed in ${upsertDuration}ms`);
+ } catch (upsertError) {
+ console.error(`[ERROR] Failed to upsert batch ${Math.floor(i / batchSize) + 1}/${totalBatches} to Pinecone:`, upsertError);
+ // Try again with smaller batch
+ if (batchTexts.length > 20) {
+ console.log(`[DEBUG] 🔄 Retrying with smaller batch size...`);
+ // Split the batch in half and retry recursively
+ const midpoint = Math.floor(batchTexts.length / 2);
+ const firstHalf = {
+ filepaths: batchFilepaths.slice(0, midpoint),
+ texts: batchTexts.slice(0, midpoint),
+ };
+ const secondHalf = {
+ filepaths: batchFilepaths.slice(midpoint),
+ texts: batchTexts.slice(midpoint),
+ };
+
+ // Create a helper function to retry smaller batches
+ const retryBatch = async (paths: string[], texts: string[], batchNum: string) => {
+ try {
+ if (this.debug) console.log(`[DEBUG] Generating embeddings for sub-batch ${batchNum}...`);
+ const embRes = await this.openai.embeddings.create({
+ model: 'text-embedding-3-large',
+ input: texts,
+ encoding_format: 'float',
+ });
+
+ const records = texts.map((t, idx) => ({
+ id: uuidv4(),
+ values: (embRes.data as Embedding[])[idx].embedding,
+ metadata: {
+ filepath: paths[idx],
+ summary: t,
+ } as RecordMetadata,
+ }));
+
+ if (this.debug) console.log(`[DEBUG] Upserting sub-batch ${batchNum} (${records.length} records)...`);
+ await this.summaryIndex.upsert(records);
+ if (this.debug) console.log(`[DEBUG] ✅ Sub-batch ${batchNum} upserted successfully`);
+ } catch (retryError) {
+ console.error(`[ERROR] Failed to upsert sub-batch ${batchNum}:`, retryError);
+ }
+ };
+
+ await retryBatch(firstHalf.filepaths, firstHalf.texts, `${Math.floor(i / batchSize) + 1}.1`);
+ await retryBatch(secondHalf.filepaths, secondHalf.texts, `${Math.floor(i / batchSize) + 1}.2`);
+ }
+ }
+ } catch (error) {
+ console.error('[ERROR] Error processing batch:', error);
+ }
+ }
+
+ if (this.debug) console.log(`[DEBUG] ✅ File summary indexing complete for all ${filepaths.length} files`);
+
+ // Verify the index was populated correctly
+ try {
+ const indexStats = await this.summaryIndex.describeIndexStats();
+ const vectorCount = indexStats.totalRecordCount;
+ if (this.debug) console.log(`[DEBUG] 🔍 Final index verification: ${vectorCount} records in Pinecone index (expected ${filepaths.length})`);
+ } catch (error) {
+ console.error('[ERROR] Failed to verify index stats:', error);
+ }
+ }
+
+ /**
+ * Searches for file summaries similar to the given query.
+ * @param query The search query
+ * @param topK Number of results to return (default: 5)
+ * @returns Array of filepath and summary pairs with relevance scores
+ */
+ async searchFileSummaries(query: string, topK: number = 5): Promise<Array<{ filepath: string; summary: string; score?: number }>> {
+ if (!this.initialized) {
+ console.error('[ERROR] Cannot search - Vectorstore not fully initialized');
+ return [];
+ }
+
+ if (this.debug) console.log(`[DEBUG] Searching file summaries for query: "${query}" (topK=${topK})`);
+ try {
+ // Generate embedding for the query
+ if (this.debug) console.log(`[DEBUG] Generating embedding for query...`);
+ const startTime = Date.now();
+ const queryEmbeddingResponse = await this.openai.embeddings.create({
+ model: 'text-embedding-3-large',
+ input: query,
+ encoding_format: 'float',
+ });
+ const duration = Date.now() - startTime;
+
+ const queryEmbedding = queryEmbeddingResponse.data[0].embedding;
+ if (this.debug) {
+ console.log(`[DEBUG] ✅ Query embedding generated in ${duration}ms`);
+ console.log(`[DEBUG] Query embedding dimensions: ${queryEmbedding.length}`);
+ }
+
+ // Check if summary index is ready
+ try {
+ const indexStats = await this.summaryIndex.describeIndexStats();
+ const vectorCount = indexStats.totalRecordCount;
+ if (this.debug) console.log(`[DEBUG] Summary index contains ${vectorCount} records`);
+
+ if (vectorCount === 0) {
+ console.error('[ERROR] Summary index is empty, cannot perform search');
+ return [];
+ }
+ } catch (statsError) {
+ console.error('[ERROR] Failed to check summary index stats:', statsError);
+ console.error('[ERROR] Stats error details:', JSON.stringify(statsError));
+ }
+
+ // Test direct API access to Pinecone
+ if (this.debug) console.log(`[DEBUG] Testing Pinecone connection...`);
+ try {
+ const indexes = await this.pinecone.listIndexes();
+ console.log(`[DEBUG] Available Pinecone indexes: ${indexes.indexes?.map(idx => idx.name).join(', ')}`);
+ } catch (connectionError) {
+ console.error('[ERROR] Could not connect to Pinecone:', connectionError);
+ }
+
+ // Query the summaries index
+ if (this.debug) console.log(`[DEBUG] Querying Pinecone summary index (${this.summaryIndexName})...`);
+ const queryStart = Date.now();
+
+ let queryResponse;
+ try {
+ // First, make sure we can access the index
+ const indexInfo = await this.summaryIndex.describeIndexStats();
+ if (this.debug) console.log(`[DEBUG] Index stats:`, indexInfo);
+
+ queryResponse = await this.summaryIndex.query({
+ vector: queryEmbedding,
+ topK,
+ includeMetadata: true,
+ });
+
+ const queryDuration = Date.now() - queryStart;
+
+ if (this.debug) {
+ console.log(`[DEBUG] ✅ Pinecone query completed in ${queryDuration}ms`);
+ console.log(`[DEBUG] Raw Pinecone response:`, JSON.stringify(queryResponse, null, 2));
+ if (queryResponse.matches) {
+ console.log(`[DEBUG] Found ${queryResponse.matches.length} matching summaries`);
+ console.log(`[DEBUG] Match scores: ${queryResponse.matches.map(m => m.score?.toFixed(4)).join(', ')}`);
+ } else {
+ console.log(`[DEBUG] No matches in response`);
+ }
+ }
+ } catch (queryError) {
+ console.error('[ERROR] Pinecone query failed:', queryError);
+ if (typeof queryError === 'object' && queryError !== null) {
+ console.error('[ERROR] Query error details:', JSON.stringify(queryError, null, 2));
+ }
+ return [];
+ }
+
+ if (!queryResponse || !queryResponse.matches || queryResponse.matches.length === 0) {
+ console.log('[DEBUG] ⚠️ No matches found in Pinecone for query');
+ return [];
+ }
+
+ // Format results
+ const results = queryResponse.matches.map(match => {
+ if (!match.metadata) {
+ console.error('[ERROR] Match is missing metadata:', match);
+ return { filepath: 'unknown', summary: 'No summary available' };
+ }
+
+ return {
+ filepath: (match.metadata as { filepath: string }).filepath || 'unknown',
+ summary: (match.metadata as { summary: string }).summary || 'No summary available',
+ score: match.score,
+ };
+ });
+
+ if (this.debug) {
+ if (results.length > 0) {
+ console.log(`[DEBUG] Top result filepath: ${results[0]?.filepath}`);
+ console.log(`[DEBUG] Top result score: ${results[0]?.score}`);
+ console.log(`[DEBUG] Top result summary excerpt: "${results[0]?.summary?.substring(0, 100)}..."`);
+ } else {
+ console.log(`[DEBUG] No results returned after processing`);
+ }
+ }
+
+ return results;
+ } catch (error) {
+ console.error('[ERROR] Error searching file summaries:', error);
+ if (typeof error === 'object' && error !== null) {
+ console.error('[ERROR] Full error details:', JSON.stringify(error, null, 2));
+ }
+ return [];
+ }
+ }
+
+ /**
+ * Runs a single test query after setup to validate the file summary search functionality.
+ */
+ private async runSingleTestQuery() {
+ console.log(`\n[TEST] Running single test query to validate file summary search functionality...`);
+
+ // Verify the index is accessible
+ try {
+ const indexStats = await this.summaryIndex.describeIndexStats();
+ console.log(`[TEST] Pinecone index stats:`, JSON.stringify(indexStats, null, 2));
+ console.log(`[TEST] Summary index contains ${indexStats.totalRecordCount} indexed summaries`);
+ } catch (error) {
+ console.error('[TEST] ❌ Failed to access Pinecone index:', error);
+ return;
+ }
+
+ // Add a brief delay to ensure Pinecone has finished processing
+ console.log('[TEST] Waiting 2 seconds for Pinecone indexing to complete...');
+ await new Promise(resolve => setTimeout(resolve, 2000));
+
+ // Run a single test query
+ const query = 'React components for the UI';
+ console.log(`\n[TEST] Executing query: "${query}"`);
+
+ try {
+ const results = await this.searchFileSummaries(query);
+ console.log(`[TEST] Search returned ${results.length} results:`);
+
+ results.forEach((result, i) => {
+ console.log(`\n[TEST] Result ${i + 1}:`);
+ console.log(`[TEST] File: ${result.filepath}`);
+ console.log(`[TEST] Score: ${result.score}`);
+ console.log(`[TEST] Summary: "${result.summary?.substring(0, 150)}..."`);
+ });
+
+ // If we have results, fetch the content for the first one
+ if (results.length > 0) {
+ const topFilepath = results[0].filepath;
+ console.log(`\n[TEST] Fetching full content for top result: ${topFilepath}`);
+ const content = await this.getFileContent(topFilepath);
+
+ if (content) {
+ console.log(`[TEST] ✅ Content retrieved successfully (${content.length} chars)`);
+ console.log(`[TEST] Content excerpt:\n---\n${content.substring(0, 300)}...\n---`);
+ } else {
+ console.log(`[TEST] ❌ Failed to retrieve content for ${topFilepath}`);
+ }
+ } else {
+ console.log(`\n[TEST] ⚠️ No results to fetch content for`);
+ }
+
+ console.log(`\n[TEST] ✅ Test query completed`);
+ } catch (testError) {
+ console.error(`[TEST] ❌ Test query failed:`, testError);
+ if (typeof testError === 'object' && testError !== null) {
+ console.error('[TEST] Full error details:', JSON.stringify(testError, null, 2));
+ }
+ }
+ }
+
+ /**
+ * Gets the full content of a file by its filepath.
+ * @param filepath The filepath to look up
+ * @returns The file content or null if not found
+ */
+ async getFileContent(filepath: string): Promise<string | null> {
+ if (this.debug) console.log(`[DEBUG] Getting file content for: ${filepath}`);
+ try {
+ const startTime = Date.now();
+
+ // Use the Networking utility for consistent API access
+ // But convert the response to text manually to avoid JSON parsing
+ const rawResponse = await fetch('/getRawFileContent', {
+ method: 'POST',
+ headers: {
+ 'Content-Type': 'application/json',
+ },
+ body: JSON.stringify({ filepath }),
+ });
+
+ if (!rawResponse.ok) {
+ const errorText = await rawResponse.text();
+ console.error(`[ERROR] Server returned error ${rawResponse.status}: ${errorText}`);
+ return null;
+ }
+
+ // Get the raw text content without JSON parsing
+ const content = await rawResponse.text();
+ const duration = Date.now() - startTime;
+
+ if (this.debug) {
+ console.log(`[DEBUG] ✅ File content retrieved in ${duration}ms`);
+ console.log(`[DEBUG] Content length: ${content.length} chars`);
+ console.log(`[DEBUG] Content excerpt: "${content.substring(0, 100)}..."`);
+ }
+
+ return content;
+ } catch (error) {
+ console.error('[ERROR] Error getting file content:', error);
+ if (typeof error === 'object' && error !== null) {
+ console.error('[ERROR] Full error details:', JSON.stringify(error, null, 2));
+ }
+ return null;
+ }
+ }
+
+ /**
* Adds an AI document to the vectorstore. Handles media file processing for audio/video,
* and text embedding for all document types. Updates document metadata during processing.
* @param doc The document to add.
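
The upsert fallback in embedAndIndexFileSummaries above splits a failed batch in half exactly once and retries each half through retryBatch; a half that fails again is only logged. A recursive sketch that keeps halving down to a minimum batch size, under the same assumptions (an embeddings call and a Pinecone upsert, wrapped here as injected functions; the helper name is hypothetical, the uuid import matches the diff):

    import { v4 as uuidv4 } from 'uuid';

    type SummaryRecord = { id: string; values: number[]; metadata: { filepath: string; summary: string } };

    // Recursively halve a failing batch until it succeeds or reaches minBatch.
    async function upsertWithHalving(
        paths: string[],
        texts: string[],
        embed: (texts: string[]) => Promise<number[][]>,
        upsert: (records: SummaryRecord[]) => Promise<void>,
        minBatch: number = 10
    ): Promise<void> {
        try {
            const vectors = await embed(texts);
            await upsert(texts.map((text, i) => ({ id: uuidv4(), values: vectors[i], metadata: { filepath: paths[i], summary: text } })));
        } catch (err) {
            if (texts.length <= minBatch) throw err; // give up below the minimum size
            const mid = Math.floor(texts.length / 2);
            await upsertWithHalving(paths.slice(0, mid), texts.slice(0, mid), embed, upsert, minBatch);
            await upsertWithHalving(paths.slice(mid), texts.slice(mid), embed, upsert, minBatch);
        }
    }
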
@@ -303,7 +787,7 @@ export class Vectorstore {
* @param topK The number of top results to return (default is 10).
* @returns A list of document chunks that match the query.
*/
- async retrieve(query: string, topK: number = 10): Promise<RAGChunk[]> {
+ async retrieve(query: string, topK: number = 10, docIds?: string[]): Promise<RAGChunk[]> {
console.log(`Retrieving chunks for query: ${query}`);
try {
// Generate an embedding for the query using OpenAI.
@@ -314,15 +798,16 @@ export class Vectorstore {
});
const queryEmbedding = queryEmbeddingResponse.data[0].embedding;
+ const _docIds = docIds?.length === 0 || !docIds ? this.docManager.docIds : docIds;
- console.log('Using document IDs for retrieval:', this.docManager.docIds);
+ console.log('Using document IDs for retrieval:', _docIds);
// Query the Pinecone index using the embedding and filter by document IDs.
// We'll query based on document IDs that are registered in the document manager
const queryResponse: QueryResponse = await this.index.query({
vector: queryEmbedding,
filter: {
- doc_id: { $in: this.docManager.docIds },
+ doc_id: { $in: _docIds },
},
topK,
includeValues: true,
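
One caveat on the Vectorstore constructor change above: initializeAsync is invoked without being awaited (constructors cannot be async), so consumers such as searchFileSummaries can only check the initialized flag and bail out if they run too early. A common alternative is to keep the in-flight promise so callers can await readiness; a minimal, self-contained sketch, where the class and method names are illustrative rather than the repo's API:

    // Sketch of the ready-promise pattern for async initialization.
    class VectorstoreSketch {
        private readyPromise: Promise<void>;

        constructor() {
            // Keep the promise instead of discarding it.
            this.readyPromise = this.initializeAsync();
        }

        private async initializeAsync(): Promise<void> {
            // ...index creation and summary processing would happen here...
        }

        // Callers await this instead of polling an initialized flag.
        whenReady(): Promise<void> {
            return this.readyPromise;
        }

        async searchFileSummaries(query: string): Promise<string[]> {
            await this.whenReady(); // waits for initialization; rethrows init failures
            return []; // ...query logic elided...
        }
    }
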
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts
index b7ce4f663..9d0427b52 100644
--- a/src/server/ApiManagers/AssistantManager.ts
+++ b/src/server/ApiManagers/AssistantManager.ts
@@ -39,6 +39,7 @@ export enum Directory {
csv = 'csv',
chunk_images = 'chunk_images',
scrape_images = 'scrape_images',
+ vectorstore = 'vectorstore',
}
// In-memory job tracking
@@ -92,6 +93,119 @@ export default class AssistantManager extends ApiManager {
const customsearch = google.customsearch('v1');
const openai = new OpenAI({ apiKey: env.OPENAI_API_KEY });
+ // Register an endpoint to retrieve file summaries from the json file
+ register({
+ method: Method.GET,
+ subscription: '/getFileSummaries',
+ secureHandler: async ({ req, res }) => {
+ try {
+ // Read the file summaries JSON file
+ const filePath = path.join(filesDirectory, Directory.vectorstore, 'file_summaries.json');
+
+ if (!fs.existsSync(filePath)) {
+ res.status(404).send({ error: 'File summaries not found' });
+ return;
+ }
+
+ const data = fs.readFileSync(filePath, 'utf8');
+ res.send(data);
+ } catch (error) {
+ console.error('Error retrieving file summaries:', error);
+ res.status(500).send({
+ error: 'Failed to retrieve file summaries',
+ });
+ }
+ },
+ });
+
+ // Register an endpoint to retrieve file content from the content json file
+ register({
+ method: Method.POST,
+ subscription: '/getFileContent',
+ secureHandler: async ({ req, res }) => {
+ const { filepath } = req.body;
+
+ if (!filepath) {
+ res.status(400).send({ error: 'Filepath is required' });
+ return;
+ }
+
+ try {
+ // Read the file content JSON file
+ const filePath = path.join(filesDirectory, Directory.vectorstore, 'file_content.json');
+
+ if (!fs.existsSync(filePath)) {
+ res.status(404).send({ error: 'File content database not found' });
+ return;
+ }
+
+ console.log(`[DEBUG] Retrieving content for: ${filepath}`);
+
+ // Read the JSON file in chunks to handle large files
+ const readStream = fs.createReadStream(filePath, { encoding: 'utf8' });
+ let jsonData = '';
+
+ readStream.on('data', chunk => {
+ jsonData += chunk;
+ });
+
+ readStream.on('end', () => {
+ try {
+ // Parse the JSON
+ const contentMap = JSON.parse(jsonData);
+
+ // Check if the filepath exists in the map
+ if (!contentMap[filepath]) {
+ console.log(`[DEBUG] Content not found for: ${filepath}`);
+ res.status(404).send({ error: `Content not found for filepath: ${filepath}` });
+ return;
+ }
+
+ // Return the file content as is, not as JSON
+ console.log(`[DEBUG] Found content for: ${filepath} (${contentMap[filepath].length} chars)`);
+ res.send(contentMap[filepath]);
+ } catch (parseError) {
+ console.error('Error parsing file_content.json:', parseError);
+ res.status(500).send({
+ error: 'Failed to parse file content database',
+ });
+ }
+ });
+
+ readStream.on('error', streamError => {
+ console.error('Error reading file_content.json:', streamError);
+ res.status(500).send({
+ error: 'Failed to read file content database',
+ });
+ });
+ } catch (error) {
+ console.error('Error retrieving file content:', error);
+ res.status(500).send({
+ error: 'Failed to retrieve file content',
+ });
+ }
+ },
+ });
+
+ // Register an endpoint to search file summaries
+ register({
+ method: Method.POST,
+ subscription: '/searchFileSummaries',
+ secureHandler: async ({ req, res }) => {
+ const { query, topK } = req.body;
+
+ if (!query) {
+ res.status(400).send({ error: 'Search query is required' });
+ return;
+ }
+
+ // This endpoint will be called by the client-side Vectorstore to perform the search
+ // The actual search is implemented in the Vectorstore class
+
+ res.send({ message: 'This endpoint should be called through the Vectorstore class' });
+ },
+ });
+
// Register Wikipedia summary API route
register({
method: Method.POST,
@@ -848,6 +962,72 @@ export default class AssistantManager extends ApiManager {
}
},
});
+
+ // Register an endpoint to retrieve raw file content as plain text (no JSON parsing)
+ register({
+ method: Method.POST,
+ subscription: '/getRawFileContent',
+ secureHandler: async ({ req, res }) => {
+ const { filepath } = req.body;
+
+ if (!filepath) {
+ res.status(400).send('Filepath is required');
+ return;
+ }
+
+ try {
+ // Read the file content JSON file
+ const filePath = path.join(filesDirectory, Directory.vectorstore, 'file_content.json');
+
+ if (!fs.existsSync(filePath)) {
+ res.status(404).send('File content database not found');
+ return;
+ }
+
+ console.log(`[DEBUG] Retrieving raw content for: ${filepath}`);
+
+ // Read the JSON file
+ const readStream = fs.createReadStream(filePath, { encoding: 'utf8' });
+ let jsonData = '';
+
+ readStream.on('data', chunk => {
+ jsonData += chunk;
+ });
+
+ readStream.on('end', () => {
+ try {
+ // Parse the JSON
+ const contentMap = JSON.parse(jsonData);
+
+ // Check if the filepath exists in the map
+ if (!contentMap[filepath]) {
+ console.log(`[DEBUG] Content not found for: ${filepath}`);
+ res.status(404).send(`Content not found for filepath: ${filepath}`);
+ return;
+ }
+
+ // Set content type to plain text to avoid JSON parsing
+ res.setHeader('Content-Type', 'text/plain');
+
+ // Return the file content as plain text
+ console.log(`[DEBUG] Found content for: ${filepath} (${contentMap[filepath].length} chars)`);
+ res.send(contentMap[filepath]);
+ } catch (parseError) {
+ console.error('Error parsing file_content.json:', parseError);
+ res.status(500).send('Failed to parse file content database');
+ }
+ });
+
+ readStream.on('error', streamError => {
+ console.error('Error reading file_content.json:', streamError);
+ res.status(500).send('Failed to read file content database');
+ });
+ } catch (error) {
+ console.error('Error retrieving file content:', error);
+ res.status(500).send('Failed to retrieve file content');
+ }
+ },
+ });
}
}
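
The /getFileContent and /getRawFileContent endpoints above share almost all of their logic, and both accumulate the stream inside event callbacks, which escapes the surrounding try/catch: the async secureHandler resolves before the stream finishes, so late errors never reach the catch block. Since file_content.json is read fully into memory either way, fs.promises.readFile keeps everything in one async path; a sketch of a shared helper (the helper name is hypothetical; filesDirectory and Directory.vectorstore are as defined in this file):

    import * as path from 'path';
    import { promises as fsPromises } from 'fs';

    // Resolve a filepath to its stored content, or null if absent.
    async function lookupFileContent(filesDirectory: string, filepath: string): Promise<string | null> {
        const dbPath = path.join(filesDirectory, 'vectorstore', 'file_content.json');
        const raw = await fsPromises.readFile(dbPath, 'utf8'); // errors propagate to the caller's try/catch
        const contentMap: Record<string, string> = JSON.parse(raw);
        return contentMap[filepath] ?? null;
    }

Both handlers could then await lookupFileContent(...) and differ only in how they serialize the success and not-found responses.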