author     Nathan-SR <144961007+Nathan-SR@users.noreply.github.com>  2025-03-04 04:32:50 -0500
committer  Nathan-SR <144961007+Nathan-SR@users.noreply.github.com>  2025-03-04 04:32:50 -0500
commit     95abdada5a275fc258fa72781f7f3c40c0b306ea (patch)
tree       6d729cebe0937ae81108005de9895b5398d1f475 /src/client/views/nodes/chatbot/vectorstore
parent     0a8f3739cf5c30852f18751a4c05d81e0dabe928 (diff)
parent     215ad40efa2e343e290d18bffbc55884829f1a0d (diff)
Merge branch 'master' of https://github.com/brown-dash/Dash-Web into Merge
Diffstat (limited to 'src/client/views/nodes/chatbot/vectorstore')
-rw-r--r--  src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts  339
1 file changed, 339 insertions, 0 deletions
diff --git a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
new file mode 100644
index 000000000..afd34f28d
--- /dev/null
+++ b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
@@ -0,0 +1,339 @@
+/**
+ * @file Vectorstore.ts
+ * @description This file defines the Vectorstore class, which integrates with Pinecone for vector-based document indexing and OpenAI text-embedding-3-large for text embeddings.
+ * It manages AI document handling, including adding documents, processing media files, combining document chunks, indexing documents,
+ * and retrieving relevant sections based on user queries.
+ */
+
+import { Index, IndexList, Pinecone, PineconeRecord, QueryResponse, RecordMetadata } from '@pinecone-database/pinecone';
+import dotenv from 'dotenv';
+import path from 'path';
+import { v4 as uuidv4 } from 'uuid';
+import { Doc } from '../../../../../fields/Doc';
+import { AudioCast, CsvCast, PDFCast, StrCast, VideoCast } from '../../../../../fields/Types';
+import { Networking } from '../../../../Network';
+import { AI_Document, CHUNK_TYPE, RAGChunk } from '../types/types';
+import OpenAI from 'openai';
+import { Embedding } from 'openai/resources';
+
+dotenv.config();
+
+/**
+ * The Vectorstore class integrates with Pinecone for vector-based document indexing and retrieval,
+ * and OpenAI text-embedding-3-large for text embedding. It handles AI document management, uploads, and query-based retrieval.
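+ *
+ * A minimal usage sketch (document IDs and variable names are illustrative):
+ * @example
+ * const store = new Vectorstore(uuidv4(), () => ['doc-123']);
+ * await store.addAIDoc(pdfDoc, (progress, step) => console.log(step, progress));
+ * const matches = await store.retrieve('What is the main argument?');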
+ */
+export class Vectorstore {
+ private pinecone: Pinecone; // Pinecone client for managing the vector index.
+ private index!: Index; // The specific Pinecone index used for document chunks.
+ private openai: OpenAI; // OpenAI client for generating embeddings.
+ private indexName: string = 'pdf-chatbot'; // Default name for the index.
+ private _id: string; // Unique ID for the Vectorstore instance.
+    private _doc_ids: () => string[]; // Returns the current list of document IDs handled by this instance.
+
+ documents: AI_Document[] = []; // Store the documents indexed in the vectorstore.
+
+ /**
+ * Initializes the Pinecone and OpenAI clients, sets up the document ID list,
+ * and initializes the Pinecone index.
+ * @param id The unique identifier for the vectorstore instance.
+ * @param doc_ids A function that returns a list of document IDs.
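+     * @example
+     * // doc_ids is passed as a function so retrieval always filters against the
+     * // caller's current document list (names here are illustrative):
+     * const store = new Vectorstore('chat-1', () => linkedDocs.map(d => StrCast(d.ai_doc_id)));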
+ */
+ constructor(id: string, doc_ids: () => string[]) {
+ const pineconeApiKey = process.env.PINECONE_API_KEY;
+ if (!pineconeApiKey) {
+ throw new Error('PINECONE_API_KEY is not defined.');
+ }
+
+ // Initialize Pinecone and OpenAI clients with API keys from the environment.
+ this.pinecone = new Pinecone({ apiKey: pineconeApiKey });
+ this.openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY, dangerouslyAllowBrowser: true });
+ this._id = id;
+ this._doc_ids = doc_ids;
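+
+        // Kick off index setup. Note that this is asynchronous and not awaited, so
+        // calls that reach the index immediately after construction may race its creation.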
+ this.initializeIndex();
+ }
+
+ /**
+ * Initializes the Pinecone index by checking if it exists and creating it if necessary.
+ * Sets the index to use cosine similarity for vector similarity calculations.
+ */
+ private async initializeIndex() {
+ const indexList: IndexList = await this.pinecone.listIndexes();
+
+ // Check if the index already exists, otherwise create it.
+ if (!indexList.indexes?.some(index => index.name === this.indexName)) {
+ await this.pinecone.createIndex({
+ name: this.indexName,
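+                // text-embedding-3-large produces 3072-dimensional embeddings.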
+ dimension: 3072,
+ metric: 'cosine',
+ spec: {
+ serverless: {
+ cloud: 'aws',
+ region: 'us-east-1',
+ },
+ },
+ });
+ }
+
+ // Set the index for future use.
+ this.index = this.pinecone.Index(this.indexName);
+ }
+
+ /**
+ * Adds an AI document to the vectorstore. Handles media file processing for audio/video,
+ * and text embedding for all document types. Updates document metadata during processing.
+ * @param doc The document to add.
+ * @param progressCallback Callback to track the progress of the addition process.
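+     * @example
+     * // Hypothetical call with a simple progress logger:
+     * await store.addAIDoc(doc, (progress, step) => console.log(`${step}: ${progress}%`));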
+ */
+ async addAIDoc(doc: Doc, progressCallback: (progress: number, step: string) => void) {
+ const ai_document_status: string = StrCast(doc.ai_document_status);
+
+ // Skip if the document is already in progress or completed.
+        if (ai_document_status.trim() !== '' && ai_document_status !== '{}') {
+ if (ai_document_status === 'PROGRESS') {
+ console.log('Already in progress.');
+ return;
+ } else if (ai_document_status === 'COMPLETED') {
+ console.log('Already completed.');
+ return;
+ }
+ } else {
+ // Start processing the document.
+ doc.ai_document_status = 'PROGRESS';
+            const local_file_path = CsvCast(doc.data)?.url?.pathname ?? PDFCast(doc.data)?.url?.pathname ?? VideoCast(doc.data)?.url?.pathname ?? AudioCast(doc.data)?.url?.pathname;
+
+ if (!local_file_path) {
+                console.error('Invalid or unsupported file path.');
+ return;
+ }
+
+ const isAudioOrVideo = local_file_path.endsWith('.mp3') || local_file_path.endsWith('.mp4');
+ let result: AI_Document & { doc_id: string };
+ if (isAudioOrVideo) {
+ console.log('Processing media file...');
+ const response = await Networking.PostToServer('/processMediaFile', { fileName: path.basename(local_file_path) });
+                const segmentedTranscript = response.condensed;
+                doc.summary = response.summary;
+ // Generate embeddings for each chunk
+ const texts = segmentedTranscript.map((chunk: any) => chunk.text);
+
+ try {
+ const embeddingsResponse = await this.openai.embeddings.create({
+ model: 'text-embedding-3-large',
+ input: texts,
+ encoding_format: 'float',
+ });
+
+ doc.original_segments = JSON.stringify(response.full);
+ doc.ai_type = local_file_path.endsWith('.mp3') ? 'audio' : 'video';
+ const doc_id = uuidv4();
+
+ // Add transcript and embeddings to metadata
+ result = {
+ doc_id,
+ purpose: '',
+ file_name: local_file_path,
+ num_pages: 0,
+ summary: '',
+ chunks: segmentedTranscript.map((chunk: any, index: number) => ({
+ id: uuidv4(),
+ values: (embeddingsResponse.data as Embedding[])[index].embedding, // Assign embedding
+ metadata: {
+ indexes: chunk.indexes,
+ original_document: local_file_path,
+ doc_id: doc_id,
+ file_path: local_file_path,
+ start_time: chunk.start,
+ end_time: chunk.end,
+ text: chunk.text,
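+                        // Media chunks are uniformly tagged VIDEO, for both .mp3 and .mp4 sources.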
+ type: CHUNK_TYPE.VIDEO,
+ },
+ })),
+ type: 'media',
+ };
+ } catch (error) {
+ console.error('Error generating embeddings:', error);
+ throw new Error('Embedding generation failed');
+ }
+
+ doc.segmented_transcript = JSON.stringify(segmentedTranscript);
+ // Simplify chunks for storage
+ const simplifiedChunks = result.chunks.map(chunk => ({
+ chunkId: chunk.id,
+ start_time: chunk.metadata.start_time,
+ end_time: chunk.metadata.end_time,
+ indexes: chunk.metadata.indexes,
+ chunkType: CHUNK_TYPE.VIDEO,
+ text: chunk.metadata.text,
+ }));
+ doc.chunk_simpl = JSON.stringify({ chunks: simplifiedChunks });
+ } else {
+                // Process text-based documents (PDF/CSV) through the server-side extraction pipeline.
+ console.log('Processing regular document...');
+ const { jobId } = await Networking.PostToServer('/createDocument', { file_path: local_file_path });
+
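+                // Poll the server every two seconds: return the result once the job
+                // completes, otherwise forward the latest progress to the callback.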
+                while (true) {
+                    await new Promise(resolve => setTimeout(resolve, 2000));
+                    const resultResponse = await Networking.FetchFromServer(`/getResult/${jobId}`);
+                    const resultResponseJson = JSON.parse(resultResponse);
+                    if (resultResponseJson.status === 'completed') {
+                        result = resultResponseJson;
+                        break;
+                    }
+                    const progressResponse = await Networking.FetchFromServer(`/getProgress/${jobId}`);
+                    const progressResponseJson = JSON.parse(progressResponse);
+                    if (progressResponseJson) {
+                        progressCallback(progressResponseJson.progress, progressResponseJson.step);
+                    }
+                }
+                if (!doc.chunk_simpl) {
+                    doc.chunk_simpl = JSON.stringify({ chunks: [] });
+                }
+                doc.summary = result.summary;
+                doc.ai_purpose = result.purpose;
+
+                // Parse the stored chunk list once, append a summary of each new chunk, then serialize it back.
+                const new_chunk_simpl = JSON.parse(StrCast(doc.chunk_simpl));
+                result.chunks.forEach((chunk: RAGChunk) => {
+                    new_chunk_simpl.chunks.push({
+                        chunkId: chunk.id,
+                        startPage: chunk.metadata.start_page,
+                        endPage: chunk.metadata.end_page,
+                        location: chunk.metadata.location,
+                        chunkType: chunk.metadata.type as CHUNK_TYPE,
+                        text: chunk.metadata.text,
+                    });
+                });
+                doc.chunk_simpl = JSON.stringify(new_chunk_simpl);
+ }
+
+ // Index the document
+ await this.indexDocument(result);
+
+ // Preserve existing metadata updates
+ if (!doc.vectorstore_id) {
+ doc.vectorstore_id = JSON.stringify([this._id]);
+ } else {
+ doc.vectorstore_id = JSON.stringify(JSON.parse(StrCast(doc.vectorstore_id)).concat([this._id]));
+ }
+
+ doc.ai_doc_id = result.doc_id;
+
+ console.log(`Document added: ${result.file_name}`);
+ doc.ai_document_status = 'COMPLETED';
+ }
+ }
+
+ /**
+ * Uploads the document's vector chunks to the Pinecone index.
+ * Prepares the metadata for each chunk and uses Pinecone's upsert operation.
+ * @param document The processed document containing its chunks and metadata.
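+     * @example
+     * // Each chunk maps to one Pinecone record (sketch):
+     * // { id: chunk.id, values: chunk.values, metadata: { text, doc_id, type, ... } }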
+ */
+ private async indexDocument(document: AI_Document) {
+ console.log('Uploading vectors to content namespace...');
+
+ // Prepare Pinecone records for each chunk in the document.
+ const pineconeRecords: PineconeRecord[] = (document.chunks as RAGChunk[]).map(chunk => ({
+ id: chunk.id,
+ values: chunk.values,
+ metadata: { ...chunk.metadata } as RecordMetadata,
+ }));
+
+ // Upload the records to Pinecone.
+ await this.index.upsert(pineconeRecords);
+ }
+
+ /**
+ * Combines document chunks until their combined text reaches a minimum word count.
+ * This is used to optimize retrieval and indexing processes.
+ * @param chunks The original chunks to combine.
+ * @returns Combined chunks with updated text and metadata.
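+     * @example
+     * // Three 200-word chunks yield two combined chunks: the first two merge
+     * // (400 words), and the third starts fresh once the running total would reach 500.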
+ */
+ private combineChunks(chunks: RAGChunk[]): RAGChunk[] {
+ const combinedChunks: RAGChunk[] = [];
+ let currentChunk: RAGChunk | null = null;
+ let wordCount = 0;
+
+ chunks.forEach(chunk => {
+ const textWords = chunk.metadata.text.split(' ').length;
+
+ if (!currentChunk) {
+ currentChunk = { ...chunk, metadata: { ...chunk.metadata, text: chunk.metadata.text } };
+ wordCount = textWords;
+ } else if (wordCount + textWords >= 500) {
+ combinedChunks.push(currentChunk);
+ currentChunk = { ...chunk, metadata: { ...chunk.metadata, text: chunk.metadata.text } };
+ wordCount = textWords;
+ } else {
+ currentChunk.metadata.text += ` ${chunk.metadata.text}`;
+ wordCount += textWords;
+ }
+ });
+
+ if (currentChunk) {
+ combinedChunks.push(currentChunk);
+ }
+
+ return combinedChunks;
+ }
+
+ /**
+ * Retrieves the most relevant document chunks for a given query.
+ * Uses OpenAI for embedding the query and Pinecone for vector similarity matching.
+ * @param query The search query string.
+ * @param topK The number of top results to return (default is 10).
+ * @returns A list of document chunks that match the query.
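+     * @example
+     * // Hypothetical query for the five best-matching chunks:
+     * const hits = await store.retrieve('key findings in the report', 5);
+     * hits.forEach(h => console.log(h.metadata.text));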
+ */
+ async retrieve(query: string, topK: number = 10): Promise<RAGChunk[]> {
+ console.log(`Retrieving chunks for query: ${query}`);
+ try {
+ // Generate an embedding for the query using OpenAI.
+ const queryEmbeddingResponse = await this.openai.embeddings.create({
+ model: 'text-embedding-3-large',
+ input: query,
+ encoding_format: 'float',
+ });
+
+            // Extract the query embedding from the response.
+            const queryEmbedding = queryEmbeddingResponse.data[0].embedding;
+
+            // Query the Pinecone index using the embedding, filtered to this instance's document IDs.
+ const queryResponse: QueryResponse = await this.index.query({
+ vector: queryEmbedding,
+ filter: {
+ doc_id: { $in: this._doc_ids() },
+ },
+ topK,
+ includeValues: true,
+ includeMetadata: true,
+ });
+
+ // Map the results into RAGChunks and return them.
+ return queryResponse.matches.map(
+ match =>
+ ({
+ id: match.id,
+ values: match.values as number[],
+ metadata: match.metadata as {
+ text: string;
+ type: string;
+ original_document: string;
+ file_path: string;
+ doc_id: string;
+ location: string;
+ start_page: number;
+ end_page: number;
+ },
+ }) as RAGChunk
+ );
+ } catch (error) {
+ console.error(`Error retrieving chunks: ${error}`);
+ return [];
+ }
+ }
+}