From 210f8f5f1cd19e9416a12524cce119b273334fd3 Mon Sep 17 00:00:00 2001 From: "A.J. Shulman" Date: Sat, 7 Sep 2024 11:48:36 -0400 Subject: reorganized parsers, added comments to vectorstore, and added citation popup for text citations --- src/client/views/nodes/ChatBox/Agent.ts | 4 +- src/client/views/nodes/ChatBox/AnswerParser.ts | 125 --------------------- src/client/views/nodes/ChatBox/ChatBox.scss | 27 +++++ src/client/views/nodes/ChatBox/ChatBox.tsx | 12 ++ src/client/views/nodes/ChatBox/ChunkManager.ts | 24 ---- .../views/nodes/ChatBox/StreamedAnswerParser.ts | 73 ------------ .../nodes/ChatBox/response_parsers/AnswerParser.ts | 125 +++++++++++++++++++++ .../response_parsers/StreamedAnswerParser.ts | 73 ++++++++++++ .../views/nodes/ChatBox/vectorstore/Vectorstore.ts | 125 ++++++++++++++------- 9 files changed, 326 insertions(+), 262 deletions(-) delete mode 100644 src/client/views/nodes/ChatBox/AnswerParser.ts delete mode 100644 src/client/views/nodes/ChatBox/ChunkManager.ts delete mode 100644 src/client/views/nodes/ChatBox/StreamedAnswerParser.ts create mode 100644 src/client/views/nodes/ChatBox/response_parsers/AnswerParser.ts create mode 100644 src/client/views/nodes/ChatBox/response_parsers/StreamedAnswerParser.ts (limited to 'src') diff --git a/src/client/views/nodes/ChatBox/Agent.ts b/src/client/views/nodes/ChatBox/Agent.ts index eaa17d283..9eb069c78 100644 --- a/src/client/views/nodes/ChatBox/Agent.ts +++ b/src/client/views/nodes/ChatBox/Agent.ts @@ -13,8 +13,8 @@ import { SearchTool } from './tools/SearchTool'; import { NoTool } from './tools/NoTool'; import { on } from 'events'; import { v4 as uuidv4 } from 'uuid'; -import { AnswerParser } from './AnswerParser'; -import { StreamedAnswerParser } from './StreamedAnswerParser'; +import { AnswerParser } from './response_parsers/AnswerParser'; +import { StreamedAnswerParser } from './response_parsers/StreamedAnswerParser'; import { CreateCSVTool } from './tools/CreateCSVTool'; dotenv.config(); diff --git a/src/client/views/nodes/ChatBox/AnswerParser.ts b/src/client/views/nodes/ChatBox/AnswerParser.ts deleted file mode 100644 index 885114195..000000000 --- a/src/client/views/nodes/ChatBox/AnswerParser.ts +++ /dev/null @@ -1,125 +0,0 @@ -import { ASSISTANT_ROLE, AssistantMessage, Citation, CHUNK_TYPE, TEXT_TYPE, getChunkType, ProcessingInfo } from './types'; -import { v4 as uuid } from 'uuid'; - -export class AnswerParser { - static parse(xml: string, processingInfo: ProcessingInfo[]): AssistantMessage { - const answerRegex = /([\s\S]*?)<\/answer>/; - const citationsRegex = /([\s\S]*?)<\/citations>/; - const citationRegex = /([\s\S]*?)<\/citation>/g; - const followUpQuestionsRegex = /([\s\S]*?)<\/follow_up_questions>/; - const questionRegex = /(.*?)<\/question>/g; - const groundedTextRegex = /([\s\S]*?)<\/grounded_text>/g; - const normalTextRegex = /([\s\S]*?)<\/normal_text>/g; - const loopSummaryRegex = /([\s\S]*?)<\/loop_summary>/; - - const answerMatch = answerRegex.exec(xml); - const citationsMatch = citationsRegex.exec(xml); - const followUpQuestionsMatch = followUpQuestionsRegex.exec(xml); - const loopSummaryMatch = loopSummaryRegex.exec(xml); - - if (!answerMatch) { - throw new Error('Invalid XML: Missing tag.'); - } - - let rawTextContent = answerMatch[1].trim(); - let content: AssistantMessage['content'] = []; - let citations: Citation[] = []; - let contentIndex = 0; - - // Remove citations and follow-up questions from rawTextContent - if (citationsMatch) { - rawTextContent = rawTextContent.replace(citationsMatch[0], '').trim(); - } - if (followUpQuestionsMatch) { - rawTextContent = rawTextContent.replace(followUpQuestionsMatch[0], '').trim(); - } - if (loopSummaryMatch) { - rawTextContent = rawTextContent.replace(loopSummaryMatch[0], '').trim(); - } - - // Parse citations - let citationMatch; - const citationMap = new Map(); - if (citationsMatch) { - const citationsContent = citationsMatch[1]; - while ((citationMatch = citationRegex.exec(citationsContent)) !== null) { - const [_, index, chunk_id, type, direct_text] = citationMatch; - const citation_id = uuid(); - citationMap.set(index, citation_id); - citations.push({ - direct_text: direct_text.trim(), - type: getChunkType(type), - chunk_id, - citation_id, - }); - } - } - - rawTextContent = rawTextContent.replace(normalTextRegex, '$1'); - - // Parse text content (normal and grounded) - let lastIndex = 0; - let match; - - while ((match = groundedTextRegex.exec(rawTextContent)) !== null) { - const [fullMatch, citationIndex, groundedText] = match; - - // Add normal text that is before the grounded text - if (match.index > lastIndex) { - const normalText = rawTextContent.slice(lastIndex, match.index).trim(); - if (normalText) { - content.push({ - index: contentIndex++, - type: TEXT_TYPE.NORMAL, - text: normalText, - citation_ids: null, - }); - } - } - - // Add grounded text - const citation_ids = citationIndex.split(',').map(index => citationMap.get(index) || ''); - content.push({ - index: contentIndex++, - type: TEXT_TYPE.GROUNDED, - text: groundedText.trim(), - citation_ids, - }); - - lastIndex = match.index + fullMatch.length; - } - - // Add any remaining normal text after the last grounded text - if (lastIndex < rawTextContent.length) { - const remainingText = rawTextContent.slice(lastIndex).trim(); - if (remainingText) { - content.push({ - index: contentIndex++, - type: TEXT_TYPE.NORMAL, - text: remainingText, - citation_ids: null, - }); - } - } - - let followUpQuestions: string[] = []; - if (followUpQuestionsMatch) { - const questionsText = followUpQuestionsMatch[1]; - let questionMatch; - while ((questionMatch = questionRegex.exec(questionsText)) !== null) { - followUpQuestions.push(questionMatch[1].trim()); - } - } - - const assistantResponse: AssistantMessage = { - role: ASSISTANT_ROLE.ASSISTANT, - content, - follow_up_questions: followUpQuestions, - citations, - processing_info: processingInfo, - loop_summary: loopSummaryMatch ? loopSummaryMatch[1].trim() : undefined, - }; - - return assistantResponse; - } -} diff --git a/src/client/views/nodes/ChatBox/ChatBox.scss b/src/client/views/nodes/ChatBox/ChatBox.scss index adb0663c3..42f6a0d61 100644 --- a/src/client/views/nodes/ChatBox/ChatBox.scss +++ b/src/client/views/nodes/ChatBox/ChatBox.scss @@ -116,6 +116,33 @@ $transition: all 0.3s ease; } } } + .citation-popup { + position: fixed; + bottom: 50px; + left: 50%; + transform: translateX(-50%); + background-color: rgba(0, 0, 0, 0.8); + color: white; + padding: 10px 20px; + border-radius: 10px; + box-shadow: 0 4px 12px rgba(0, 0, 0, 0.2); + z-index: 1000; + animation: fadeIn 0.3s ease-in-out; + + p { + margin: 0; + font-size: 14px; + } + + @keyframes fadeIn { + from { + opacity: 0; + } + to { + opacity: 1; + } + } + } } .message { diff --git a/src/client/views/nodes/ChatBox/ChatBox.tsx b/src/client/views/nodes/ChatBox/ChatBox.tsx index ff699aab3..98a2e6002 100644 --- a/src/client/views/nodes/ChatBox/ChatBox.tsx +++ b/src/client/views/nodes/ChatBox/ChatBox.tsx @@ -44,6 +44,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { @observable private linked_docs_to_add: ObservableSet = observable.set(); @observable private linked_csv_files: { filename: string; id: string; text: string }[] = []; @observable private isUploadingDocs: boolean = false; + @observable private citationPopup: { text: string; visible: boolean } = { text: '', visible: false }; // Private properties for managing OpenAI API, vector store, agent, and UI elements private openai: OpenAI; @@ -450,6 +451,9 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { DocumentManager.Instance.showDocument(highlightDoc, { willZoomCentered: true }, () => {}); break; case CHUNK_TYPE.TEXT: + this.citationPopup = { text: citation.direct_text ?? 'No text available', visible: true }; + setTimeout(() => (this.citationPopup.visible = false), 3000); // Hide after 3 seconds + DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => { const firstView = Array.from(doc[DocViews])[0] as DocumentView; firstView.ComponentView?.search?.(citation.direct_text ?? ''); @@ -730,6 +734,14 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { )} + {/* Popup for citation */} + {this.citationPopup.visible && ( +
+

+ Text from your document: {this.citationPopup.text} +

+
+ )} ); } diff --git a/src/client/views/nodes/ChatBox/ChunkManager.ts b/src/client/views/nodes/ChatBox/ChunkManager.ts deleted file mode 100644 index 64c073640..000000000 --- a/src/client/views/nodes/ChatBox/ChunkManager.ts +++ /dev/null @@ -1,24 +0,0 @@ -import { SimplifiedChunk } from './types'; - -class ChunkManager { - private chunks: SimplifiedChunk[]; - - constructor() { - this.chunks = []; - } - - addChunk(chunk: SimplifiedChunk) { - this.chunks.push(chunk); - } - - removeChunk(chunk: SimplifiedChunk) { - const index = this.chunks.indexOf(chunk); - if (index !== -1) { - this.chunks.splice(index, 1); - } - } - - getChunks() { - return this.chunks; - } -} diff --git a/src/client/views/nodes/ChatBox/StreamedAnswerParser.ts b/src/client/views/nodes/ChatBox/StreamedAnswerParser.ts deleted file mode 100644 index 3585cab4a..000000000 --- a/src/client/views/nodes/ChatBox/StreamedAnswerParser.ts +++ /dev/null @@ -1,73 +0,0 @@ -import { threadId } from 'worker_threads'; - -enum ParserState { - Outside, - InGroundedText, - InNormalText, -} - -export class StreamedAnswerParser { - private state: ParserState = ParserState.Outside; - private buffer: string = ''; - private result: string = ''; - private isStartOfLine: boolean = true; - - public parse(char: string): string { - switch (this.state) { - case ParserState.Outside: - if (char === '<') { - this.buffer = '<'; - } else if (char === '>') { - if (this.buffer.startsWith('') { - this.state = ParserState.Outside; - this.buffer = ''; - } else if (this.buffer.startsWith('') { - this.state = ParserState.Outside; - this.buffer = ''; - } else if (this.buffer.startsWith('<')) { - this.buffer += char; - } else { - this.processChar(char); - } - break; - } - - return this.result.trim(); - } - - private processChar(char: string): void { - if (this.isStartOfLine && char === ' ') { - // Skip leading spaces - return; - } - if (char === '\n') { - this.result += char; - this.isStartOfLine = true; - } else { - this.result += char; - this.isStartOfLine = false; - } - } - - public reset(): void { - this.state = ParserState.Outside; - this.buffer = ''; - this.result = ''; - this.isStartOfLine = true; - } -} diff --git a/src/client/views/nodes/ChatBox/response_parsers/AnswerParser.ts b/src/client/views/nodes/ChatBox/response_parsers/AnswerParser.ts new file mode 100644 index 000000000..79b53b0a3 --- /dev/null +++ b/src/client/views/nodes/ChatBox/response_parsers/AnswerParser.ts @@ -0,0 +1,125 @@ +import { ASSISTANT_ROLE, AssistantMessage, Citation, CHUNK_TYPE, TEXT_TYPE, getChunkType, ProcessingInfo } from '../types'; +import { v4 as uuid } from 'uuid'; + +export class AnswerParser { + static parse(xml: string, processingInfo: ProcessingInfo[]): AssistantMessage { + const answerRegex = /([\s\S]*?)<\/answer>/; + const citationsRegex = /([\s\S]*?)<\/citations>/; + const citationRegex = /([\s\S]*?)<\/citation>/g; + const followUpQuestionsRegex = /([\s\S]*?)<\/follow_up_questions>/; + const questionRegex = /(.*?)<\/question>/g; + const groundedTextRegex = /([\s\S]*?)<\/grounded_text>/g; + const normalTextRegex = /([\s\S]*?)<\/normal_text>/g; + const loopSummaryRegex = /([\s\S]*?)<\/loop_summary>/; + + const answerMatch = answerRegex.exec(xml); + const citationsMatch = citationsRegex.exec(xml); + const followUpQuestionsMatch = followUpQuestionsRegex.exec(xml); + const loopSummaryMatch = loopSummaryRegex.exec(xml); + + if (!answerMatch) { + throw new Error('Invalid XML: Missing tag.'); + } + + let rawTextContent = answerMatch[1].trim(); + let content: AssistantMessage['content'] = []; + let citations: Citation[] = []; + let contentIndex = 0; + + // Remove citations and follow-up questions from rawTextContent + if (citationsMatch) { + rawTextContent = rawTextContent.replace(citationsMatch[0], '').trim(); + } + if (followUpQuestionsMatch) { + rawTextContent = rawTextContent.replace(followUpQuestionsMatch[0], '').trim(); + } + if (loopSummaryMatch) { + rawTextContent = rawTextContent.replace(loopSummaryMatch[0], '').trim(); + } + + // Parse citations + let citationMatch; + const citationMap = new Map(); + if (citationsMatch) { + const citationsContent = citationsMatch[1]; + while ((citationMatch = citationRegex.exec(citationsContent)) !== null) { + const [_, index, chunk_id, type, direct_text] = citationMatch; + const citation_id = uuid(); + citationMap.set(index, citation_id); + citations.push({ + direct_text: direct_text.trim(), + type: getChunkType(type), + chunk_id, + citation_id, + }); + } + } + + rawTextContent = rawTextContent.replace(normalTextRegex, '$1'); + + // Parse text content (normal and grounded) + let lastIndex = 0; + let match; + + while ((match = groundedTextRegex.exec(rawTextContent)) !== null) { + const [fullMatch, citationIndex, groundedText] = match; + + // Add normal text that is before the grounded text + if (match.index > lastIndex) { + const normalText = rawTextContent.slice(lastIndex, match.index).trim(); + if (normalText) { + content.push({ + index: contentIndex++, + type: TEXT_TYPE.NORMAL, + text: normalText, + citation_ids: null, + }); + } + } + + // Add grounded text + const citation_ids = citationIndex.split(',').map(index => citationMap.get(index) || ''); + content.push({ + index: contentIndex++, + type: TEXT_TYPE.GROUNDED, + text: groundedText.trim(), + citation_ids, + }); + + lastIndex = match.index + fullMatch.length; + } + + // Add any remaining normal text after the last grounded text + if (lastIndex < rawTextContent.length) { + const remainingText = rawTextContent.slice(lastIndex).trim(); + if (remainingText) { + content.push({ + index: contentIndex++, + type: TEXT_TYPE.NORMAL, + text: remainingText, + citation_ids: null, + }); + } + } + + let followUpQuestions: string[] = []; + if (followUpQuestionsMatch) { + const questionsText = followUpQuestionsMatch[1]; + let questionMatch; + while ((questionMatch = questionRegex.exec(questionsText)) !== null) { + followUpQuestions.push(questionMatch[1].trim()); + } + } + + const assistantResponse: AssistantMessage = { + role: ASSISTANT_ROLE.ASSISTANT, + content, + follow_up_questions: followUpQuestions, + citations, + processing_info: processingInfo, + loop_summary: loopSummaryMatch ? loopSummaryMatch[1].trim() : undefined, + }; + + return assistantResponse; + } +} diff --git a/src/client/views/nodes/ChatBox/response_parsers/StreamedAnswerParser.ts b/src/client/views/nodes/ChatBox/response_parsers/StreamedAnswerParser.ts new file mode 100644 index 000000000..3585cab4a --- /dev/null +++ b/src/client/views/nodes/ChatBox/response_parsers/StreamedAnswerParser.ts @@ -0,0 +1,73 @@ +import { threadId } from 'worker_threads'; + +enum ParserState { + Outside, + InGroundedText, + InNormalText, +} + +export class StreamedAnswerParser { + private state: ParserState = ParserState.Outside; + private buffer: string = ''; + private result: string = ''; + private isStartOfLine: boolean = true; + + public parse(char: string): string { + switch (this.state) { + case ParserState.Outside: + if (char === '<') { + this.buffer = '<'; + } else if (char === '>') { + if (this.buffer.startsWith('') { + this.state = ParserState.Outside; + this.buffer = ''; + } else if (this.buffer.startsWith('') { + this.state = ParserState.Outside; + this.buffer = ''; + } else if (this.buffer.startsWith('<')) { + this.buffer += char; + } else { + this.processChar(char); + } + break; + } + + return this.result.trim(); + } + + private processChar(char: string): void { + if (this.isStartOfLine && char === ' ') { + // Skip leading spaces + return; + } + if (char === '\n') { + this.result += char; + this.isStartOfLine = true; + } else { + this.result += char; + this.isStartOfLine = false; + } + } + + public reset(): void { + this.state = ParserState.Outside; + this.buffer = ''; + this.result = ''; + this.isStartOfLine = true; + } +} diff --git a/src/client/views/nodes/ChatBox/vectorstore/Vectorstore.ts b/src/client/views/nodes/ChatBox/vectorstore/Vectorstore.ts index b5145c1f7..cc3b1ccd5 100644 --- a/src/client/views/nodes/ChatBox/vectorstore/Vectorstore.ts +++ b/src/client/views/nodes/ChatBox/vectorstore/Vectorstore.ts @@ -2,47 +2,55 @@ import { Pinecone, Index, IndexList, PineconeRecord, RecordMetadata, QueryRespon import { CohereClient } from 'cohere-ai'; import { EmbedResponse } from 'cohere-ai/api'; import dotenv from 'dotenv'; -import axios from 'axios'; -import { SimplifiedChunk } from '../types'; - import { RAGChunk, AI_Document, CHUNK_TYPE } from '../types'; import { Doc } from '../../../../../fields/Doc'; -import { DocData } from '../../../../../fields/DocSymbols'; import { CsvCast, PDFCast, StrCast } from '../../../../../fields/Types'; import { Networking } from '../../../../Network'; dotenv.config(); +/** + * The Vectorstore class integrates with Pinecone for vector-based document indexing and retrieval, + * and Cohere for text embedding. It handles AI document management, uploads, and query-based retrieval. + */ export class Vectorstore { - private pinecone: Pinecone; - private index!: Index; - private cohere: CohereClient; - private indexName: string = 'pdf-chatbot'; - private _id: string; - private _doc_ids: string[] = []; + private pinecone: Pinecone; // Pinecone client for managing the vector index. + private index!: Index; // The specific Pinecone index used for document chunks. + private cohere: CohereClient; // Cohere client for generating embeddings. + private indexName: string = 'pdf-chatbot'; // Default name for the index. + private _id: string; // Unique ID for the Vectorstore instance. + private _doc_ids: string[] = []; // List of document IDs handled by this instance. - documents: AI_Document[] = []; + documents: AI_Document[] = []; // Store the documents indexed in the vectorstore. + /** + * Constructor initializes the Pinecone and Cohere clients, sets up the document ID list, + * and initializes the Pinecone index. + * @param id The unique identifier for the vectorstore instance. + * @param doc_ids A function that returns a list of document IDs. + */ constructor(id: string, doc_ids: () => string[]) { const pineconeApiKey = process.env.PINECONE_API_KEY; if (!pineconeApiKey) { throw new Error('PINECONE_API_KEY is not defined.'); } - this.pinecone = new Pinecone({ - apiKey: pineconeApiKey, - }); - this.cohere = new CohereClient({ - token: process.env.COHERE_API_KEY, - }); + // Initialize Pinecone and Cohere clients with API keys from the environment. + this.pinecone = new Pinecone({ apiKey: pineconeApiKey }); + this.cohere = new CohereClient({ token: process.env.COHERE_API_KEY }); this._id = id; this._doc_ids = doc_ids(); this.initializeIndex(); } + /** + * Initializes the Pinecone index by checking if it exists, and creating it if not. + * The index is set to use the cosine metric for vector similarity. + */ private async initializeIndex() { const indexList: IndexList = await this.pinecone.listIndexes(); + // Check if the index already exists, otherwise create it. if (!indexList.indexes?.some(index => index.name === this.indexName)) { await this.pinecone.createIndex({ name: this.indexName, @@ -57,62 +65,76 @@ export class Vectorstore { }); } + // Set the index for future use. this.index = this.pinecone.Index(this.indexName); } + /** + * Adds an AI document to the vectorstore. This method handles document chunking, uploading to the + * vectorstore, and updating the progress for long-running tasks like file uploads. + * @param doc The document to be added to the vectorstore. + * @param progressCallback Callback to update the progress of the upload. + */ async addAIDoc(doc: Doc, progressCallback: (progress: number, step: string) => void) { console.log('Adding AI Document:', doc); const ai_document_status: string = StrCast(doc.ai_document_status); - if (ai_document_status !== undefined && ai_document_status !== null && ai_document_status.trim() !== '' && ai_document_status !== '{}') { + // Skip if the document is already in progress or completed. + if (ai_document_status !== undefined && ai_document_status.trim() !== '' && ai_document_status !== '{}') { if (ai_document_status === 'IN PROGRESS') { console.log('Already in progress.'); return; } - if (!this._doc_ids.includes(StrCast(doc.ai_doc_id))) this._doc_ids.push(StrCast(doc.ai_doc_id)); + if (!this._doc_ids.includes(StrCast(doc.ai_doc_id))) { + this._doc_ids.push(StrCast(doc.ai_doc_id)); + } } else { + // Start processing the document. doc.ai_document_status = 'PROGRESS'; console.log(doc); + + // Get the local file path (CSV or PDF). const local_file_path: string = CsvCast(doc.data)?.url?.pathname ?? PDFCast(doc.data)?.url?.pathname; console.log('Local File Path:', local_file_path); if (local_file_path) { console.log('Creating AI Document...'); - // Start the document creation process + // Start the document creation process by sending the file to the server. const { jobId } = await Networking.PostToServer('/createDocument', { file_path: local_file_path }); - // Poll the server for progress updates + // Poll the server for progress updates. let inProgress: boolean = true; let result: any = null; while (inProgress) { - await new Promise(resolve => setTimeout(resolve, 2000)); // Polling interval + // Polling interval for status updates. + await new Promise(resolve => setTimeout(resolve, 2000)); + // Check if the job is completed. const resultResponse = await Networking.FetchFromServer(`/getResult/${jobId}`); const resultResponseJson = JSON.parse(resultResponse); - //console.log('Result Response:', resultResponseJson); if (resultResponseJson.status === 'completed') { console.log('Result here:', resultResponseJson); result = resultResponseJson; break; } + // Fetch progress information and update the progress callback. const progressResponse = await Networking.FetchFromServer(`/getProgress/${jobId}`); const progressResponseJson = JSON.parse(progressResponse); - //console.log('Progress Response:', progressResponseJson); - if (progressResponseJson) { - console.log('Progress:', progressResponseJson); const progress = progressResponseJson.progress; const step = progressResponseJson.step; progressCallback(progress, step); } } - // Process the final document result + // Once completed, process the document and add it to the vectorstore. console.log('Document JSON:', result); this.documents.push(result); - await this.indexDocument(JSON.parse(JSON.stringify(result, (key, value) => (value === null || value === undefined ? undefined : value)))); + await this.indexDocument(result); console.log(`Document added: ${result.file_name}`); + + // Update document metadata such as summary, purpose, and vectorstore ID. doc.summary = result.summary; doc.ai_doc_id = result.doc_id; this._doc_ids.push(result.doc_id); @@ -128,6 +150,7 @@ export class Vectorstore { doc.chunk_simpl = JSON.stringify({ chunks: [] }); } + // Process each chunk of the document and update the document's chunk_simpl field. result.chunks.forEach((chunk: RAGChunk) => { const chunkToAdd = { chunkId: chunk.id, @@ -142,27 +165,41 @@ export class Vectorstore { doc.chunk_simpl = JSON.stringify(new_chunk_simpl); }); + // Mark the document status as completed. doc.ai_document_status = 'COMPLETED'; } } } + /** + * Indexes the processed document by uploading the document's vector chunks to the Pinecone index. + * @param document The processed document containing its chunks and metadata. + */ private async indexDocument(document: any) { console.log('Uploading vectors to content namespace...'); - const pineconeRecords: PineconeRecord[] = (document.chunks as RAGChunk[]).map( - chunk => - ({ - id: chunk.id, - values: chunk.values, - metadata: { ...chunk.metadata } as RecordMetadata, - }) as PineconeRecord - ); + + // Prepare Pinecone records for each chunk in the document. + const pineconeRecords: PineconeRecord[] = (document.chunks as RAGChunk[]).map(chunk => ({ + id: chunk.id, + values: chunk.values, + metadata: { ...chunk.metadata } as RecordMetadata, + })); + + // Upload the records to Pinecone. await this.index.upsert(pineconeRecords); } - async retrieve(query: string, topK: number = 10): Promise { + /** + * Retrieves the top K document chunks relevant to the user's query. + * This involves embedding the query using Cohere, then querying Pinecone for matching vectors. + * @param query The search query string. + * @param topK The number of top results to return (default is 10). + * @returns A list of document chunks that match the query. + */ + async retrieve(query: string, topK: number = 10): Promise { console.log(`Retrieving chunks for query: ${query}`); try { + // Generate an embedding for the query using Cohere. const queryEmbeddingResponse: EmbedResponse = await this.cohere.embed({ texts: [query], model: 'embed-english-v3.0', @@ -171,6 +208,7 @@ export class Vectorstore { let queryEmbedding: number[]; + // Extract the embedding from the response. if (Array.isArray(queryEmbeddingResponse.embeddings)) { queryEmbedding = queryEmbeddingResponse.embeddings[0]; } else if (queryEmbeddingResponse.embeddings && 'embeddings' in queryEmbeddingResponse.embeddings) { @@ -183,6 +221,7 @@ export class Vectorstore { throw new Error('Query embedding is not an array'); } + // Query the Pinecone index using the embedding and filter by document IDs. const queryResponse: QueryResponse = await this.index.query({ vector: queryEmbedding, filter: { @@ -193,12 +232,22 @@ export class Vectorstore { includeMetadata: true, }); + // Map the results into RAGChunks and return them. return queryResponse.matches.map( match => ({ id: match.id, values: match.values as number[], - metadata: match.metadata as { text: string; type: string; original_document: string; file_path: string; doc_id: string; location: string; start_page: number; end_page: number }, + metadata: match.metadata as { + text: string; + type: string; + original_document: string; + file_path: string; + doc_id: string; + location: string; + start_page: number; + end_page: number; + }, }) as RAGChunk ); } catch (error) { -- cgit v1.2.3-70-g09d2