about summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
author A.J. Shulman <Shulman.aj@gmail.com> 2024-09-07 11:48:36 -0400
committer A.J. Shulman <Shulman.aj@gmail.com> 2024-09-07 11:48:36 -0400
commit 210f8f5f1cd19e9416a12524cce119b273334fd3 (patch)
tree 02268cd69abc868c428f42e8d57812a4f29be1a7 /src
parent 0b3c2ed595b85391e9833a3b7710d2169439a582 (diff)
reorganized parsers, added comments to vectorstore, and added citation popup for text citations
Diffstat (limited to 'src')
-rw-r--r-- src/client/views/nodes/ChatBox/Agent.ts 4
-rw-r--r-- src/client/views/nodes/ChatBox/ChatBox.scss 27
-rw-r--r-- src/client/views/nodes/ChatBox/ChatBox.tsx 12
-rw-r--r-- src/client/views/nodes/ChatBox/ChunkManager.ts 24
-rw-r--r-- src/client/views/nodes/ChatBox/response_parsers/AnswerParser.ts (renamed from src/client/views/nodes/ChatBox/AnswerParser.ts) 2
-rw-r--r-- src/client/views/nodes/ChatBox/response_parsers/StreamedAnswerParser.ts (renamed from src/client/views/nodes/ChatBox/StreamedAnswerParser.ts) 0
-rw-r--r-- src/client/views/nodes/ChatBox/vectorstore/Vectorstore.ts 125
7 files changed, 129 insertions, 65 deletions
diff --git a/src/client/views/nodes/ChatBox/Agent.ts b/src/client/views/nodes/ChatBox/Agent.ts
index eaa17d283..9eb069c78 100644
--- a/src/client/views/nodes/ChatBox/Agent.ts
+++ b/src/client/views/nodes/ChatBox/Agent.ts
@@ -13,8 +13,8 @@ import { SearchTool } from './tools/SearchTool';
import { NoTool } from './tools/NoTool';
import { on } from 'events';
import { v4 as uuidv4 } from 'uuid';
-import { AnswerParser } from './AnswerParser';
-import { StreamedAnswerParser } from './StreamedAnswerParser';
+import { AnswerParser } from './response_parsers/AnswerParser';
+import { StreamedAnswerParser } from './response_parsers/StreamedAnswerParser';
import { CreateCSVTool } from './tools/CreateCSVTool';
dotenv.config();
diff --git a/src/client/views/nodes/ChatBox/ChatBox.scss b/src/client/views/nodes/ChatBox/ChatBox.scss
index adb0663c3..42f6a0d61 100644
--- a/src/client/views/nodes/ChatBox/ChatBox.scss
+++ b/src/client/views/nodes/ChatBox/ChatBox.scss
@@ -116,6 +116,33 @@ $transition: all 0.3s ease;
}
}
}
+ .citation-popup {
+ position: fixed;
+ bottom: 50px;
+ left: 50%;
+ transform: translateX(-50%);
+ background-color: rgba(0, 0, 0, 0.8);
+ color: white;
+ padding: 10px 20px;
+ border-radius: 10px;
+ box-shadow: 0 4px 12px rgba(0, 0, 0, 0.2);
+ z-index: 1000;
+ animation: fadeIn 0.3s ease-in-out;
+
+ p {
+ margin: 0;
+ font-size: 14px;
+ }
+
+ @keyframes fadeIn {
+ from {
+ opacity: 0;
+ }
+ to {
+ opacity: 1;
+ }
+ }
+ }
}
.message {
diff --git a/src/client/views/nodes/ChatBox/ChatBox.tsx b/src/client/views/nodes/ChatBox/ChatBox.tsx
index ff699aab3..98a2e6002 100644
--- a/src/client/views/nodes/ChatBox/ChatBox.tsx
+++ b/src/client/views/nodes/ChatBox/ChatBox.tsx
@@ -44,6 +44,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
@observable private linked_docs_to_add: ObservableSet = observable.set();
@observable private linked_csv_files: { filename: string; id: string; text: string }[] = [];
@observable private isUploadingDocs: boolean = false;
+ @observable private citationPopup: { text: string; visible: boolean } = { text: '', visible: false };
// Private properties for managing OpenAI API, vector store, agent, and UI elements
private openai: OpenAI;
@@ -450,6 +451,9 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
DocumentManager.Instance.showDocument(highlightDoc, { willZoomCentered: true }, () => {});
break;
case CHUNK_TYPE.TEXT:
+ this.citationPopup = { text: citation.direct_text ?? 'No text available', visible: true };
+ setTimeout(() => (this.citationPopup.visible = false), 3000); // Hide after 3 seconds
+
DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {
const firstView = Array.from(doc[DocViews])[0] as DocumentView;
firstView.ComponentView?.search?.(citation.direct_text ?? '');
@@ -730,6 +734,14 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
)}
</button>
</form>
+ {/* Popup for citation */}
+ {this.citationPopup.visible && (
+ <div className="citation-popup">
+ <p>
+ <strong>Text from your document: </strong> {this.citationPopup.text}
+ </p>
+ </div>
+ )}
</div>
);
}
diff --git a/src/client/views/nodes/ChatBox/ChunkManager.ts b/src/client/views/nodes/ChatBox/ChunkManager.ts
deleted file mode 100644
index 64c073640..000000000
--- a/src/client/views/nodes/ChatBox/ChunkManager.ts
+++ /dev/null
@@ -1,24 +0,0 @@
-import { SimplifiedChunk } from './types';
-
-class ChunkManager {
- private chunks: SimplifiedChunk[];
-
- constructor() {
- this.chunks = [];
- }
-
- addChunk(chunk: SimplifiedChunk) {
- this.chunks.push(chunk);
- }
-
- removeChunk(chunk: SimplifiedChunk) {
- const index = this.chunks.indexOf(chunk);
- if (index !== -1) {
- this.chunks.splice(index, 1);
- }
- }
-
- getChunks() {
- return this.chunks;
- }
-}
diff --git a/src/client/views/nodes/ChatBox/AnswerParser.ts b/src/client/views/nodes/ChatBox/response_parsers/AnswerParser.ts
index 885114195..79b53b0a3 100644
--- a/src/client/views/nodes/ChatBox/AnswerParser.ts
+++ b/src/client/views/nodes/ChatBox/response_parsers/AnswerParser.ts
@@ -1,4 +1,4 @@
-import { ASSISTANT_ROLE, AssistantMessage, Citation, CHUNK_TYPE, TEXT_TYPE, getChunkType, ProcessingInfo } from './types';
+import { ASSISTANT_ROLE, AssistantMessage, Citation, CHUNK_TYPE, TEXT_TYPE, getChunkType, ProcessingInfo } from '../types';
import { v4 as uuid } from 'uuid';
export class AnswerParser {
diff --git a/src/client/views/nodes/ChatBox/StreamedAnswerParser.ts b/src/client/views/nodes/ChatBox/response_parsers/StreamedAnswerParser.ts
index 3585cab4a..3585cab4a 100644
--- a/src/client/views/nodes/ChatBox/StreamedAnswerParser.ts
+++ b/src/client/views/nodes/ChatBox/response_parsers/StreamedAnswerParser.ts
diff --git a/src/client/views/nodes/ChatBox/vectorstore/Vectorstore.ts b/src/client/views/nodes/ChatBox/vectorstore/Vectorstore.ts
index b5145c1f7..cc3b1ccd5 100644
--- a/src/client/views/nodes/ChatBox/vectorstore/Vectorstore.ts
+++ b/src/client/views/nodes/ChatBox/vectorstore/Vectorstore.ts
@@ -2,47 +2,55 @@ import { Pinecone, Index, IndexList, PineconeRecord, RecordMetadata, QueryRespon
import { CohereClient } from 'cohere-ai';
import { EmbedResponse } from 'cohere-ai/api';
import dotenv from 'dotenv';
-import axios from 'axios';
-import { SimplifiedChunk } from '../types';
-
import { RAGChunk, AI_Document, CHUNK_TYPE } from '../types';
import { Doc } from '../../../../../fields/Doc';
-import { DocData } from '../../../../../fields/DocSymbols';
import { CsvCast, PDFCast, StrCast } from '../../../../../fields/Types';
import { Networking } from '../../../../Network';
dotenv.config();
+/**
+ * The Vectorstore class integrates with Pinecone for vector-based document indexing and retrieval,
+ * and Cohere for text embedding. It handles AI document management, uploads, and query-based retrieval.
+ */
export class Vectorstore {
- private pinecone: Pinecone;
- private index!: Index;
- private cohere: CohereClient;
- private indexName: string = 'pdf-chatbot';
- private _id: string;
- private _doc_ids: string[] = [];
+ private pinecone: Pinecone; // Pinecone client for managing the vector index.
+ private index!: Index; // The specific Pinecone index used for document chunks.
+ private cohere: CohereClient; // Cohere client for generating embeddings.
+ private indexName: string = 'pdf-chatbot'; // Default name for the index.
+ private _id: string; // Unique ID for the Vectorstore instance.
+ private _doc_ids: string[] = []; // List of document IDs handled by this instance.
- documents: AI_Document[] = [];
+ documents: AI_Document[] = []; // Store the documents indexed in the vectorstore.
+ /**
+ * Constructor initializes the Pinecone and Cohere clients, sets up the document ID list,
+ * and initializes the Pinecone index.
+ * @param id The unique identifier for the vectorstore instance.
+ * @param doc_ids A function that returns a list of document IDs.
+ */
constructor(id: string, doc_ids: () => string[]) {
const pineconeApiKey = process.env.PINECONE_API_KEY;
if (!pineconeApiKey) {
throw new Error('PINECONE_API_KEY is not defined.');
}
- this.pinecone = new Pinecone({
- apiKey: pineconeApiKey,
- });
- this.cohere = new CohereClient({
- token: process.env.COHERE_API_KEY,
- });
+ // Initialize Pinecone and Cohere clients with API keys from the environment.
+ this.pinecone = new Pinecone({ apiKey: pineconeApiKey });
+ this.cohere = new CohereClient({ token: process.env.COHERE_API_KEY });
this._id = id;
this._doc_ids = doc_ids();
this.initializeIndex();
}
+ /**
+ * Initializes the Pinecone index by checking if it exists, and creating it if not.
+ * The index is set to use the cosine metric for vector similarity.
+ */
private async initializeIndex() {
const indexList: IndexList = await this.pinecone.listIndexes();
+ // Check if the index already exists, otherwise create it.
if (!indexList.indexes?.some(index => index.name === this.indexName)) {
await this.pinecone.createIndex({
name: this.indexName,
@@ -57,62 +65,76 @@ export class Vectorstore {
});
}
+ // Set the index for future use.
this.index = this.pinecone.Index(this.indexName);
}
+ /**
+ * Adds an AI document to the vectorstore. This method handles document chunking, uploading to the
+ * vectorstore, and updating the progress for long-running tasks like file uploads.
+ * @param doc The document to be added to the vectorstore.
+ * @param progressCallback Callback to update the progress of the upload.
+ */
async addAIDoc(doc: Doc, progressCallback: (progress: number, step: string) => void) {
console.log('Adding AI Document:', doc);
const ai_document_status: string = StrCast(doc.ai_document_status);
- if (ai_document_status !== undefined && ai_document_status !== null && ai_document_status.trim() !== '' && ai_document_status !== '{}') {
+ // Skip if the document is already in progress or completed.
+ if (ai_document_status !== undefined && ai_document_status.trim() !== '' && ai_document_status !== '{}') {
if (ai_document_status === 'IN PROGRESS') {
console.log('Already in progress.');
return;
}
- if (!this._doc_ids.includes(StrCast(doc.ai_doc_id))) this._doc_ids.push(StrCast(doc.ai_doc_id));
+ if (!this._doc_ids.includes(StrCast(doc.ai_doc_id))) {
+ this._doc_ids.push(StrCast(doc.ai_doc_id));
+ }
} else {
+ // Start processing the document.
doc.ai_document_status = 'PROGRESS';
console.log(doc);
+
+ // Get the local file path (CSV or PDF).
const local_file_path: string = CsvCast(doc.data)?.url?.pathname ?? PDFCast(doc.data)?.url?.pathname;
console.log('Local File Path:', local_file_path);
if (local_file_path) {
console.log('Creating AI Document...');
- // Start the document creation process
+ // Start the document creation process by sending the file to the server.
const { jobId } = await Networking.PostToServer('/createDocument', { file_path: local_file_path });
- // Poll the server for progress updates
+ // Poll the server for progress updates.
let inProgress: boolean = true;
let result: any = null;
while (inProgress) {
- await new Promise(resolve => setTimeout(resolve, 2000)); // Polling interval
+ // Polling interval for status updates.
+ await new Promise(resolve => setTimeout(resolve, 2000));
+ // Check if the job is completed.
const resultResponse = await Networking.FetchFromServer(`/getResult/${jobId}`);
const resultResponseJson = JSON.parse(resultResponse);
- //console.log('Result Response:', resultResponseJson);
if (resultResponseJson.status === 'completed') {
console.log('Result here:', resultResponseJson);
result = resultResponseJson;
break;
}
+ // Fetch progress information and update the progress callback.
const progressResponse = await Networking.FetchFromServer(`/getProgress/${jobId}`);
const progressResponseJson = JSON.parse(progressResponse);
- //console.log('Progress Response:', progressResponseJson);
-
if (progressResponseJson) {
- console.log('Progress:', progressResponseJson);
const progress = progressResponseJson.progress;
const step = progressResponseJson.step;
progressCallback(progress, step);
}
}
- // Process the final document result
+ // Once completed, process the document and add it to the vectorstore.
console.log('Document JSON:', result);
this.documents.push(result);
- await this.indexDocument(JSON.parse(JSON.stringify(result, (key, value) => (value === null || value === undefined ? undefined : value))));
+ await this.indexDocument(result);
console.log(`Document added: ${result.file_name}`);
+
+ // Update document metadata such as summary, purpose, and vectorstore ID.
doc.summary = result.summary;
doc.ai_doc_id = result.doc_id;
this._doc_ids.push(result.doc_id);
@@ -128,6 +150,7 @@ export class Vectorstore {
doc.chunk_simpl = JSON.stringify({ chunks: [] });
}
+ // Process each chunk of the document and update the document's chunk_simpl field.
result.chunks.forEach((chunk: RAGChunk) => {
const chunkToAdd = {
chunkId: chunk.id,
@@ -142,27 +165,41 @@ export class Vectorstore {
doc.chunk_simpl = JSON.stringify(new_chunk_simpl);
});
+ // Mark the document status as completed.
doc.ai_document_status = 'COMPLETED';
}
}
}
+ /**
+ * Indexes the processed document by uploading the document's vector chunks to the Pinecone index.
+ * @param document The processed document containing its chunks and metadata.
+ */
private async indexDocument(document: any) {
console.log('Uploading vectors to content namespace...');
- const pineconeRecords: PineconeRecord[] = (document.chunks as RAGChunk[]).map(
- chunk =>
- ({
- id: chunk.id,
- values: chunk.values,
- metadata: { ...chunk.metadata } as RecordMetadata,
- }) as PineconeRecord
- );
+
+ // Prepare Pinecone records for each chunk in the document.
+ const pineconeRecords: PineconeRecord[] = (document.chunks as RAGChunk[]).map(chunk => ({
+ id: chunk.id,
+ values: chunk.values,
+ metadata: { ...chunk.metadata } as RecordMetadata,
+ }));
+
+ // Upload the records to Pinecone.
await this.index.upsert(pineconeRecords);
}
- async retrieve(query: string, topK: number = 10): Promise {
+ /**
+ * Retrieves the top K document chunks relevant to the user's query.
+ * This involves embedding the query using Cohere, then querying Pinecone for matching vectors.
+ * @param query The search query string.
+ * @param topK The number of top results to return (default is 10).
+ * @returns A list of document chunks that match the query.
+ */
+ async retrieve(query: string, topK: number = 10): Promise<RAGChunk[]> {
console.log(`Retrieving chunks for query: ${query}`);
try {
+ // Generate an embedding for the query using Cohere.
const queryEmbeddingResponse: EmbedResponse = await this.cohere.embed({
texts: [query],
model: 'embed-english-v3.0',
@@ -171,6 +208,7 @@ export class Vectorstore {
let queryEmbedding: number[];
+ // Extract the embedding from the response.
if (Array.isArray(queryEmbeddingResponse.embeddings)) {
queryEmbedding = queryEmbeddingResponse.embeddings[0];
} else if (queryEmbeddingResponse.embeddings && 'embeddings' in queryEmbeddingResponse.embeddings) {
@@ -183,6 +221,7 @@ export class Vectorstore {
throw new Error('Query embedding is not an array');
}
+ // Query the Pinecone index using the embedding and filter by document IDs.
const queryResponse: QueryResponse = await this.index.query({
vector: queryEmbedding,
filter: {
@@ -193,12 +232,22 @@ export class Vectorstore {
includeMetadata: true,
});
+ // Map the results into RAGChunks and return them.
return queryResponse.matches.map(
match =>
({
id: match.id,
values: match.values as number[],
- metadata: match.metadata as { text: string; type: string; original_document: string; file_path: string; doc_id: string; location: string; start_page: number; end_page: number },
+ metadata: match.metadata as {
+ text: string;
+ type: string;
+ original_document: string;
+ file_path: string;
+ doc_id: string;
+ location: string;
+ start_page: number;
+ end_page: number;
+ },
}) as RAGChunk
);
} catch (error) {