path: root/src/client/views/nodes/chatbot/vectorstore
author     A.J. Shulman <Shulman.aj@gmail.com>    2025-04-27 13:14:49 -0400
committer  A.J. Shulman <Shulman.aj@gmail.com>    2025-04-27 13:14:49 -0400
commit     3ef3d40506348d9fd537cc8f4aea975b9770689f (patch)
tree       afe779e8240e88c8b20ff6b68ac45840a927ee76  /src/client/views/nodes/chatbot/vectorstore
parent     5ce2263849bfb901e276a4c5fc8ca2dbd8b80350 (diff)
new attempt with new citation unification
Diffstat (limited to 'src/client/views/nodes/chatbot/vectorstore')
-rw-r--r--  src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts  130
1 file changed, 90 insertions, 40 deletions
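
The change below replaces the Vectorstore's doc-ID closure with an AgentDocumentManager. The real class lives in ../utils/AgentDocumentManager and is not part of this diff; judging only from the call sites that follow, its relevant surface looks roughly like this sketch (the method names come from the diff, the signatures are assumptions):

    // Hypothetical sketch, inferred from call sites in this diff -- not the real class.
    interface AgentDocumentManagerLike {
        addCustomId(doc: unknown, docId: string): void;             // register a document under a caller-supplied ID
        registerChunkIds(docId: string, chunkIds: string[]): void;  // map a batch of chunk IDs to their parent document
        listDocs(): Iterable<string>;                               // enumerate registered document IDs
        getDocIdByChunkId(chunkId: string): string | undefined;     // reverse lookup: chunk ID -> owning document
        has(docId: string): boolean;                                // membership test for a document ID
    }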
diff --git a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
index afd34f28d..4bb61d8b2 100644
--- a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
+++ b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
@@ -15,7 +15,7 @@ import { Networking } from '../../../../Network';
import { AI_Document, CHUNK_TYPE, RAGChunk } from '../types/types';
import OpenAI from 'openai';
import { Embedding } from 'openai/resources';
-import { PineconeEnvironmentVarsNotSupportedError } from '@pinecone-database/pinecone/dist/errors';
+import { AgentDocumentManager } from '../utils/AgentDocumentManager';
dotenv.config();
@@ -29,7 +29,7 @@ export class Vectorstore {
private openai: OpenAI; // OpenAI client for generating embeddings.
private indexName: string = 'pdf-chatbot'; // Default name for the index.
private _id: string; // Unique ID for the Vectorstore instance.
- private _doc_ids: () => string[]; // List of document IDs handled by this instance.
+ private docManager: AgentDocumentManager; // Document manager for handling documents
documents: AI_Document[] = []; // Store the documents indexed in the vectorstore.
@@ -37,9 +37,9 @@ export class Vectorstore {
* Initializes the Pinecone and OpenAI clients, sets up the document ID list,
* and initializes the Pinecone index.
* @param id The unique identifier for the vectorstore instance.
- * @param doc_ids A function that returns a list of document IDs.
+ * @param docManager An instance of AgentDocumentManager to handle document management.
*/
- constructor(id: string, doc_ids: () => string[]) {
+ constructor(id: string, docManager: AgentDocumentManager) {
const pineconeApiKey = process.env.PINECONE_API_KEY;
if (!pineconeApiKey) {
throw new Error('PINECONE_API_KEY is not defined.');
@@ -49,7 +49,7 @@ export class Vectorstore {
this.pinecone = new Pinecone({ apiKey: pineconeApiKey });
this.openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY, dangerouslyAllowBrowser: true });
this._id = id;
- this._doc_ids = doc_ids;
+ this.docManager = docManager;
this.initializeIndex();
}
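
With the new constructor, callers hand the store a shared document manager instead of a closure over document IDs. A minimal usage sketch (only the Vectorstore signature comes from this diff; how an AgentDocumentManager is obtained or constructed is assumed):

    import { v4 as uuidv4 } from 'uuid';
    import { AgentDocumentManager } from '../utils/AgentDocumentManager';
    import { Vectorstore } from './Vectorstore';

    // Assumed construction -- the real AgentDocumentManager may take arguments.
    const docManager = new AgentDocumentManager();
    const store = new Vectorstore(uuidv4(), docManager);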
@@ -109,15 +109,25 @@ export class Vectorstore {
const isAudioOrVideo = local_file_path.endsWith('.mp3') || local_file_path.endsWith('.mp4');
let result: AI_Document & { doc_id: string };
+
if (isAudioOrVideo) {
console.log('Processing media file...');
const response = await Networking.PostToServer('/processMediaFile', { fileName: path.basename(local_file_path) });
- const segmentedTranscript = response.condensed;
+
+ // Type assertion to handle the response properties
+ const typedResponse = response as {
+ condensed: Array<{ text: string; indexes: string[]; start: number; end: number }>;
+ full: Array<unknown>;
+ summary: string;
+ };
+
+ const segmentedTranscript = typedResponse.condensed;
console.log(segmentedTranscript);
- const summary = response.summary;
+ const summary = typedResponse.summary;
doc.summary = summary;
+
// Generate embeddings for each chunk
- const texts = segmentedTranscript.map((chunk: any) => chunk.text);
+ const texts = segmentedTranscript.map(chunk => chunk.text);
try {
const embeddingsResponse = await this.openai.embeddings.create({
@@ -126,10 +136,19 @@ export class Vectorstore {
encoding_format: 'float',
});
- doc.original_segments = JSON.stringify(response.full);
+ doc.original_segments = JSON.stringify(typedResponse.full);
doc.ai_type = local_file_path.endsWith('.mp3') ? 'audio' : 'video';
const doc_id = uuidv4();
+ // Register the document with the AgentDocumentManager
+ this.docManager.addCustomId(doc, doc_id);
+
+ // Generate chunk IDs upfront so we can register them
+ const chunkIds = segmentedTranscript.map(() => uuidv4());
+
+ // Register all chunk IDs with the document manager
+ this.docManager.registerChunkIds(doc_id, chunkIds);
+
// Add transcript and embeddings to metadata
result = {
doc_id,
@@ -137,13 +156,13 @@ export class Vectorstore {
file_name: local_file_path,
num_pages: 0,
summary: '',
- chunks: segmentedTranscript.map((chunk: any, index: number) => ({
- id: uuidv4(),
+ chunks: segmentedTranscript.map((chunk, index) => ({
+ id: chunkIds[index], // Use pre-generated chunk ID
values: (embeddingsResponse.data as Embedding[])[index].embedding, // Assign embedding
metadata: {
indexes: chunk.indexes,
original_document: local_file_path,
- doc_id: doc_id,
+ doc_id: doc_id, // Ensure doc_id is consistent
file_path: local_file_path,
start_time: chunk.start,
end_time: chunk.end,
@@ -159,20 +178,24 @@ export class Vectorstore {
}
doc.segmented_transcript = JSON.stringify(segmentedTranscript);
- // Simplify chunks for storage
+ // Simplify chunks for storage - ensure simplified chunks use EXACTLY the same IDs
const simplifiedChunks = result.chunks.map(chunk => ({
- chunkId: chunk.id,
+ chunkId: chunk.id, // Use the exact same ID as the full chunk
start_time: chunk.metadata.start_time,
end_time: chunk.metadata.end_time,
indexes: chunk.metadata.indexes,
chunkType: CHUNK_TYPE.VIDEO,
text: chunk.metadata.text,
+ doc_id: chunk.metadata.doc_id, // Include parent doc_id for completeness
}));
doc.chunk_simpl = JSON.stringify({ chunks: simplifiedChunks });
} else {
- // Existing document processing logic remains unchanged
+ // Process regular document
console.log('Processing regular document...');
- const { jobId } = await Networking.PostToServer('/createDocument', { file_path: local_file_path });
+ const createDocumentResponse = await Networking.PostToServer('/createDocument', { file_path: local_file_path });
+
+ // Type assertion for the response
+ const { jobId } = createDocumentResponse as { jobId: string };
while (true) {
await new Promise(resolve => setTimeout(resolve, 2000));
@@ -188,6 +211,16 @@ export class Vectorstore {
progressCallback(progressResponseJson.progress, progressResponseJson.step);
}
}
+
+ // Register the document with the AgentDocumentManager
+ this.docManager.addCustomId(doc, result.doc_id);
+
+ // Collect all chunk IDs
+ const chunkIds = result.chunks.map(chunk => chunk.id);
+
+ // Register chunks with the document manager
+ this.docManager.registerChunkIds(result.doc_id, chunkIds);
+
if (!doc.chunk_simpl) {
doc.chunk_simpl = JSON.stringify({ chunks: [] });
}
@@ -196,12 +229,13 @@ export class Vectorstore {
result.chunks.forEach((chunk: RAGChunk) => {
const chunkToAdd = {
- chunkId: chunk.id,
+ chunkId: chunk.id, // Ensure we use the exact same ID
startPage: chunk.metadata.start_page,
endPage: chunk.metadata.end_page,
location: chunk.metadata.location,
chunkType: chunk.metadata.type as CHUNK_TYPE,
text: chunk.metadata.text,
+ doc_id: chunk.metadata.doc_id, // Include parent doc_id for consistency
};
const new_chunk_simpl = JSON.parse(StrCast(doc.chunk_simpl));
new_chunk_simpl.chunks = new_chunk_simpl.chunks.concat(chunkToAdd);
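
With doc_id now carried on each simplified chunk, one entry in doc.chunk_simpl for a PDF chunk would look roughly like the following (illustrative values; CHUNK_TYPE members other than VIDEO are assumed):

    // Shape of one simplified chunk after this change -- values are made up.
    const exampleEntry = {
        chunkId: '4f6b…',           // identical to the RAGChunk ID stored in Pinecone
        startPage: 2,
        endPage: 3,
        location: '…',
        chunkType: CHUNK_TYPE.TEXT, // assumed enum member
        text: '…chunk text…',
        doc_id: '9b2e…',            // parent document, newly included for consistency
    };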
@@ -298,39 +332,55 @@ export class Vectorstore {
let queryEmbedding = queryEmbeddingResponse.data[0].embedding;
- // Extract the embedding from the response.
+ // Get document IDs from the AgentDocumentManager
+ const docIds = Array.from(this.docManager.listDocs());
+ console.log('Using document IDs for retrieval:', docIds);
- console.log(this._doc_ids());
// Query the Pinecone index using the embedding and filter by document IDs.
+ // We'll query based on document IDs that are registered in the document manager
const queryResponse: QueryResponse = await this.index.query({
vector: queryEmbedding,
filter: {
- doc_id: { $in: this._doc_ids() },
+ doc_id: { $in: docIds },
},
topK,
includeValues: true,
includeMetadata: true,
});
- console.log(queryResponse);
-
- // Map the results into RAGChunks and return them.
- return queryResponse.matches.map(
- match =>
- ({
- id: match.id,
- values: match.values as number[],
- metadata: match.metadata as {
- text: string;
- type: string;
- original_document: string;
- file_path: string;
- doc_id: string;
- location: string;
- start_page: number;
- end_page: number;
- },
- }) as RAGChunk
- );
+ console.log(`Found ${queryResponse.matches.length} matching chunks`);
+
+ // For each retrieved chunk, ensure its document ID is registered in the document manager
+ // This maintains compatibility with existing code while ensuring consistency
+ const processedMatches = queryResponse.matches.map(match => {
+ const chunk = {
+ id: match.id,
+ values: match.values as number[],
+ metadata: match.metadata as {
+ text: string;
+ type: string;
+ original_document: string;
+ file_path: string;
+ doc_id: string;
+ location: string;
+ start_page: number;
+ end_page: number;
+ },
+ } as RAGChunk;
+
+ // Ensure the document manager knows about this chunk
+ // This is important for maintaining backwards compatibility
+ if (chunk.id && !this.docManager.getDocIdByChunkId(chunk.id)) {
+ // If the chunk ID isn't registered but we have a doc_id in metadata
+ if (chunk.metadata.doc_id && this.docManager.has(chunk.metadata.doc_id)) {
+ // Register the chunk with its parent document
+ this.docManager.registerChunkIds(chunk.metadata.doc_id, [chunk.id]);
+ }
+ }
+
+ return chunk;
+ });
+
+ return processedMatches;
} catch (error) {
console.error(`Error retrieving chunks: ${error}`);
return [];
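
End to end, retrieval now derives its Pinecone filter from the document manager and backfills any chunk-to-document mapping it has not yet seen, so every returned chunk can be resolved to its parent. A consumer-side sketch (the surrounding method is assumed to be retrieve(query, topK); its exact signature lies outside this diff's hunk context):

    // Illustrative call only -- method name and arguments are assumptions.
    const chunks = await store.retrieve('what does the paper cite?', 10);
    for (const chunk of chunks) {
        // After the backfill above, this lookup should succeed for registered docs.
        console.log(chunk.id, docManager.getDocIdByChunkId(chunk.id));
    }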