diff options
author | bobzel <zzzman@gmail.com> | 2025-02-24 16:24:41 -0500 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-02-24 16:24:41 -0500 |
commit | 22763cc4d69ac8b2436a3ef53d79142a43299dbc (patch) | |
tree | 7ee37b74c8ba511ac59160f9b11f251861faab1d /src | |
parent | 383a8a2f017c12c578537d3cb3005e00be019bd7 (diff) | |
parent | 7b0bd66a0ad22b5a5cb17e76e811b59c6c7ca729 (diff) |
Merge pull request #333 from brown-dash/ajs-finalagent
Ajs finalagent
Diffstat (limited to 'src')
-rw-r--r-- | src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts | 66 | ||||
-rw-r--r-- | src/server/ApiManagers/AssistantManager.ts | 4 | ||||
-rw-r--r-- | src/server/chunker/pdf_chunker.py | 20 |
3 files changed, 35 insertions, 55 deletions
diff --git a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts index 7027aceb4..afd34f28d 100644 --- a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts +++ b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts @@ -1,13 +1,11 @@ /** * @file Vectorstore.ts - * @description This file defines the Vectorstore class, which integrates with Pinecone for vector-based document indexing and Cohere for text embeddings. + * @description This file defines the Vectorstore class, which integrates with Pinecone for vector-based document indexing and OpenAI text-embedding-3-large for text embeddings. * It manages AI document handling, including adding documents, processing media files, combining document chunks, indexing documents, * and retrieving relevant sections based on user queries. */ import { Index, IndexList, Pinecone, PineconeRecord, QueryResponse, RecordMetadata } from '@pinecone-database/pinecone'; -import { CohereClient } from 'cohere-ai'; -import { EmbedResponse } from 'cohere-ai/api'; import dotenv from 'dotenv'; import path from 'path'; import { v4 as uuidv4 } from 'uuid'; @@ -15,17 +13,20 @@ import { Doc } from '../../../../../fields/Doc'; import { AudioCast, CsvCast, PDFCast, StrCast, VideoCast } from '../../../../../fields/Types'; import { Networking } from '../../../../Network'; import { AI_Document, CHUNK_TYPE, RAGChunk } from '../types/types'; +import OpenAI from 'openai'; +import { Embedding } from 'openai/resources'; +import { PineconeEnvironmentVarsNotSupportedError } from '@pinecone-database/pinecone/dist/errors'; dotenv.config(); /** * The Vectorstore class integrates with Pinecone for vector-based document indexing and retrieval, - * and Cohere for text embedding. It handles AI document management, uploads, and query-based retrieval. + * and OpenAI text-embedding-3-large for text embedding. It handles AI document management, uploads, and query-based retrieval. */ export class Vectorstore { private pinecone: Pinecone; // Pinecone client for managing the vector index. private index!: Index; // The specific Pinecone index used for document chunks. - private cohere: CohereClient; // Cohere client for generating embeddings. + private openai: OpenAI; // OpenAI client for generating embeddings. private indexName: string = 'pdf-chatbot'; // Default name for the index. private _id: string; // Unique ID for the Vectorstore instance. private _doc_ids: () => string[]; // List of document IDs handled by this instance. @@ -33,20 +34,20 @@ export class Vectorstore { documents: AI_Document[] = []; // Store the documents indexed in the vectorstore. /** - * Initializes the Pinecone and Cohere clients, sets up the document ID list, + * Initializes the Pinecone and OpenAI clients, sets up the document ID list, * and initializes the Pinecone index. * @param id The unique identifier for the vectorstore instance. * @param doc_ids A function that returns a list of document IDs. */ constructor(id: string, doc_ids: () => string[]) { - const pineconeApiKey = process.env.PINECONE_API_KEY || '51738e9a-bea2-4c11-b6bf-48a825e774dc'; + const pineconeApiKey = process.env.PINECONE_API_KEY; if (!pineconeApiKey) { throw new Error('PINECONE_API_KEY is not defined.'); } - // Initialize Pinecone and Cohere clients with API keys from the environment. + // Initialize Pinecone and OpenAI clients with API keys from the environment. this.pinecone = new Pinecone({ apiKey: pineconeApiKey }); - // this.cohere = new CohereClient({ token: process.env.COHERE_API_KEY }); + this.openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY, dangerouslyAllowBrowser: true }); this._id = id; this._doc_ids = doc_ids; this.initializeIndex(); @@ -63,7 +64,7 @@ export class Vectorstore { if (!indexList.indexes?.some(index => index.name === this.indexName)) { await this.pinecone.createIndex({ name: this.indexName, - dimension: 1024, + dimension: 3072, metric: 'cosine', spec: { serverless: { @@ -119,23 +120,12 @@ export class Vectorstore { const texts = segmentedTranscript.map((chunk: any) => chunk.text); try { - const embeddingsResponse = await this.cohere.v2.embed({ - model: 'embed-english-v3.0', - inputType: 'classification', - embeddingTypes: ['float'], // Specify that embeddings should be floats - texts, // Pass the array of chunk texts + const embeddingsResponse = await this.openai.embeddings.create({ + model: 'text-embedding-3-large', + input: texts, + encoding_format: 'float', }); - if (!embeddingsResponse.embeddings.float || embeddingsResponse.embeddings.float.length !== texts.length) { - throw new Error('Mismatch between embeddings and the number of chunks'); - } - - // Assign embeddings to each chunk - segmentedTranscript.forEach((chunk: any, index: number) => { - if (!embeddingsResponse.embeddings || !embeddingsResponse.embeddings.float) { - throw new Error('Invalid embeddings response'); - } - }); doc.original_segments = JSON.stringify(response.full); doc.ai_type = local_file_path.endsWith('.mp3') ? 'audio' : 'video'; const doc_id = uuidv4(); @@ -149,7 +139,7 @@ export class Vectorstore { summary: '', chunks: segmentedTranscript.map((chunk: any, index: number) => ({ id: uuidv4(), - values: (embeddingsResponse.embeddings.float as number[][])[index], // Assign embedding + values: (embeddingsResponse.data as Embedding[])[index].embedding, // Assign embedding metadata: { indexes: chunk.indexes, original_document: local_file_path, @@ -291,7 +281,7 @@ export class Vectorstore { /** * Retrieves the most relevant document chunks for a given query. - * Uses Cohere for embedding the query and Pinecone for vector similarity matching. + * Uses OpenAI for embedding the query and Pinecone for vector similarity matching. * @param query The search query string. * @param topK The number of top results to return (default is 10). * @returns A list of document chunks that match the query. @@ -299,27 +289,17 @@ export class Vectorstore { async retrieve(query: string, topK: number = 10): Promise<RAGChunk[]> { console.log(`Retrieving chunks for query: ${query}`); try { - // Generate an embedding for the query using Cohere. - const queryEmbeddingResponse: EmbedResponse = await this.cohere.embed({ - texts: [query], - model: 'embed-english-v3.0', - inputType: 'search_query', + // Generate an embedding for the query using OpenAI. + const queryEmbeddingResponse = await this.openai.embeddings.create({ + model: 'text-embedding-3-large', + input: query, + encoding_format: 'float', }); - let queryEmbedding: number[]; + let queryEmbedding = queryEmbeddingResponse.data[0].embedding; // Extract the embedding from the response. - if (Array.isArray(queryEmbeddingResponse.embeddings)) { - queryEmbedding = queryEmbeddingResponse.embeddings[0]; - } else if (queryEmbeddingResponse.embeddings && 'embeddings' in queryEmbeddingResponse.embeddings) { - queryEmbedding = (queryEmbeddingResponse.embeddings as { embeddings: number[][] }).embeddings[0]; - } else { - throw new Error('Invalid embedding response format'); - } - if (!Array.isArray(queryEmbedding)) { - throw new Error('Query embedding is not an array'); - } console.log(this._doc_ids()); // Query the Pinecone index using the embedding and filter by document IDs. const queryResponse: QueryResponse = await this.index.query({ diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts index c41f697db..4719541b9 100644 --- a/src/server/ApiManagers/AssistantManager.ts +++ b/src/server/ApiManagers/AssistantManager.ts @@ -538,7 +538,7 @@ export default class AssistantManager extends ApiManager { // Spawn the Python process and track its progress/output // eslint-disable-next-line no-use-before-define - spawnPythonProcess(jobId, file_name, public_path); + spawnPythonProcess(jobId, public_path); // Send the job ID back to the client for tracking res.send({ jobId }); @@ -695,7 +695,7 @@ export default class AssistantManager extends ApiManager { * @param file_name The name of the file to process. * @param file_path The filepath of the file to process. */ -function spawnPythonProcess(jobId: string, file_name: string, file_path: string) { +function spawnPythonProcess(jobId: string, file_path: string) { const venvPath = path.join(__dirname, '../chunker/venv'); const requirementsPath = path.join(__dirname, '../chunker/requirements.txt'); const pythonScriptPath = path.join(__dirname, '../chunker/pdf_chunker.py'); diff --git a/src/server/chunker/pdf_chunker.py b/src/server/chunker/pdf_chunker.py index a9dbcbb0c..697550f2e 100644 --- a/src/server/chunker/pdf_chunker.py +++ b/src/server/chunker/pdf_chunker.py @@ -21,7 +21,7 @@ import json import os import uuid # For generating unique IDs from enum import Enum # Enums for types like document type and purpose -import cohere # Embedding client +import openai import numpy as np from PyPDF2 import PdfReader # PDF text extraction from openai import OpenAI # OpenAI client for text completion @@ -35,8 +35,8 @@ warnings.filterwarnings('ignore', message="torch.load") dotenv.load_dotenv() # Load environment variables # Fix for newer versions of PIL -if parse(PIL.__version__) >= parse('10.0.0'): - Image.LINEAR = Image.BILINEAR +# if parse(PIL.__version__) >= parse('10.0.0'): +# Image.LINEAR = Image.BILINEAR # Global dictionary to track progress of document processing jobs current_progress = {} @@ -727,19 +727,19 @@ class Document: """ Embed the text chunks using the Cohere API. """ - co = cohere.Client(os.getenv("COHERE_API_KEY")) # Initialize Cohere client with API key + openai = OpenAI() # Initialize Cohere client with API key batch_size = 90 # Batch size for embedding chunks_len = len(self.chunks) # Total number of chunks to embed for i in tqdm(range(0, chunks_len, batch_size), desc="Embedding Chunks"): batch = self.chunks[i: min(i + batch_size, chunks_len)] # Get batch of chunks texts = [chunk['metadata']['text'] for chunk in batch] # Extract text from each chunk - chunk_embs_batch = co.embed( - texts=texts, - model="embed-english-v3.0", # Use Cohere's embedding model - input_type="search_document" # Specify input type + chunk_embs_batch = openai.embeddings.create( + model="text-embedding-3-large", + input=texts, + encoding_format="float" ) - for j, emb in enumerate(chunk_embs_batch.embeddings): - self.chunks[i + j]['values'] = emb # Store the embeddings in the corresponding chunks + for j, data_val in enumerate(chunk_embs_batch.data): + self.chunks[i + j]['values'] = data_val.embedding # Store the embeddings in the corresponding chunks def _generate_summary(self) -> str: """ |