aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts66
-rw-r--r--src/server/ApiManagers/AssistantManager.ts4
-rw-r--r--src/server/chunker/pdf_chunker.py20
3 files changed, 35 insertions, 55 deletions
diff --git a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
index 7027aceb4..afd34f28d 100644
--- a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
+++ b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
@@ -1,13 +1,11 @@
/**
* @file Vectorstore.ts
- * @description This file defines the Vectorstore class, which integrates with Pinecone for vector-based document indexing and Cohere for text embeddings.
+ * @description This file defines the Vectorstore class, which integrates with Pinecone for vector-based document indexing and OpenAI text-embedding-3-large for text embeddings.
* It manages AI document handling, including adding documents, processing media files, combining document chunks, indexing documents,
* and retrieving relevant sections based on user queries.
*/
import { Index, IndexList, Pinecone, PineconeRecord, QueryResponse, RecordMetadata } from '@pinecone-database/pinecone';
-import { CohereClient } from 'cohere-ai';
-import { EmbedResponse } from 'cohere-ai/api';
import dotenv from 'dotenv';
import path from 'path';
import { v4 as uuidv4 } from 'uuid';
@@ -15,17 +13,20 @@ import { Doc } from '../../../../../fields/Doc';
import { AudioCast, CsvCast, PDFCast, StrCast, VideoCast } from '../../../../../fields/Types';
import { Networking } from '../../../../Network';
import { AI_Document, CHUNK_TYPE, RAGChunk } from '../types/types';
+import OpenAI from 'openai';
+import { Embedding } from 'openai/resources';
+import { PineconeEnvironmentVarsNotSupportedError } from '@pinecone-database/pinecone/dist/errors';
dotenv.config();
/**
* The Vectorstore class integrates with Pinecone for vector-based document indexing and retrieval,
- * and Cohere for text embedding. It handles AI document management, uploads, and query-based retrieval.
+ * and OpenAI text-embedding-3-large for text embedding. It handles AI document management, uploads, and query-based retrieval.
*/
export class Vectorstore {
private pinecone: Pinecone; // Pinecone client for managing the vector index.
private index!: Index; // The specific Pinecone index used for document chunks.
- private cohere: CohereClient; // Cohere client for generating embeddings.
+ private openai: OpenAI; // OpenAI client for generating embeddings.
private indexName: string = 'pdf-chatbot'; // Default name for the index.
private _id: string; // Unique ID for the Vectorstore instance.
private _doc_ids: () => string[]; // List of document IDs handled by this instance.
@@ -33,20 +34,20 @@ export class Vectorstore {
documents: AI_Document[] = []; // Store the documents indexed in the vectorstore.
/**
- * Initializes the Pinecone and Cohere clients, sets up the document ID list,
+ * Initializes the Pinecone and OpenAI clients, sets up the document ID list,
* and initializes the Pinecone index.
* @param id The unique identifier for the vectorstore instance.
* @param doc_ids A function that returns a list of document IDs.
*/
constructor(id: string, doc_ids: () => string[]) {
- const pineconeApiKey = process.env.PINECONE_API_KEY || '51738e9a-bea2-4c11-b6bf-48a825e774dc';
+ const pineconeApiKey = process.env.PINECONE_API_KEY;
if (!pineconeApiKey) {
throw new Error('PINECONE_API_KEY is not defined.');
}
- // Initialize Pinecone and Cohere clients with API keys from the environment.
+ // Initialize Pinecone and OpenAI clients with API keys from the environment.
this.pinecone = new Pinecone({ apiKey: pineconeApiKey });
- // this.cohere = new CohereClient({ token: process.env.COHERE_API_KEY });
+ this.openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY, dangerouslyAllowBrowser: true });
this._id = id;
this._doc_ids = doc_ids;
this.initializeIndex();
@@ -63,7 +64,7 @@ export class Vectorstore {
if (!indexList.indexes?.some(index => index.name === this.indexName)) {
await this.pinecone.createIndex({
name: this.indexName,
- dimension: 1024,
+ dimension: 3072,
metric: 'cosine',
spec: {
serverless: {
@@ -119,23 +120,12 @@ export class Vectorstore {
const texts = segmentedTranscript.map((chunk: any) => chunk.text);
try {
- const embeddingsResponse = await this.cohere.v2.embed({
- model: 'embed-english-v3.0',
- inputType: 'classification',
- embeddingTypes: ['float'], // Specify that embeddings should be floats
- texts, // Pass the array of chunk texts
+ const embeddingsResponse = await this.openai.embeddings.create({
+ model: 'text-embedding-3-large',
+ input: texts,
+ encoding_format: 'float',
});
- if (!embeddingsResponse.embeddings.float || embeddingsResponse.embeddings.float.length !== texts.length) {
- throw new Error('Mismatch between embeddings and the number of chunks');
- }
-
- // Assign embeddings to each chunk
- segmentedTranscript.forEach((chunk: any, index: number) => {
- if (!embeddingsResponse.embeddings || !embeddingsResponse.embeddings.float) {
- throw new Error('Invalid embeddings response');
- }
- });
doc.original_segments = JSON.stringify(response.full);
doc.ai_type = local_file_path.endsWith('.mp3') ? 'audio' : 'video';
const doc_id = uuidv4();
@@ -149,7 +139,7 @@ export class Vectorstore {
summary: '',
chunks: segmentedTranscript.map((chunk: any, index: number) => ({
id: uuidv4(),
- values: (embeddingsResponse.embeddings.float as number[][])[index], // Assign embedding
+ values: (embeddingsResponse.data as Embedding[])[index].embedding, // Assign embedding
metadata: {
indexes: chunk.indexes,
original_document: local_file_path,
@@ -291,7 +281,7 @@ export class Vectorstore {
/**
* Retrieves the most relevant document chunks for a given query.
- * Uses Cohere for embedding the query and Pinecone for vector similarity matching.
+ * Uses OpenAI for embedding the query and Pinecone for vector similarity matching.
* @param query The search query string.
* @param topK The number of top results to return (default is 10).
* @returns A list of document chunks that match the query.
@@ -299,27 +289,17 @@ export class Vectorstore {
async retrieve(query: string, topK: number = 10): Promise<RAGChunk[]> {
console.log(`Retrieving chunks for query: ${query}`);
try {
- // Generate an embedding for the query using Cohere.
- const queryEmbeddingResponse: EmbedResponse = await this.cohere.embed({
- texts: [query],
- model: 'embed-english-v3.0',
- inputType: 'search_query',
+ // Generate an embedding for the query using OpenAI.
+ const queryEmbeddingResponse = await this.openai.embeddings.create({
+ model: 'text-embedding-3-large',
+ input: query,
+ encoding_format: 'float',
});
- let queryEmbedding: number[];
+ let queryEmbedding = queryEmbeddingResponse.data[0].embedding;
// Extract the embedding from the response.
- if (Array.isArray(queryEmbeddingResponse.embeddings)) {
- queryEmbedding = queryEmbeddingResponse.embeddings[0];
- } else if (queryEmbeddingResponse.embeddings && 'embeddings' in queryEmbeddingResponse.embeddings) {
- queryEmbedding = (queryEmbeddingResponse.embeddings as { embeddings: number[][] }).embeddings[0];
- } else {
- throw new Error('Invalid embedding response format');
- }
- if (!Array.isArray(queryEmbedding)) {
- throw new Error('Query embedding is not an array');
- }
console.log(this._doc_ids());
// Query the Pinecone index using the embedding and filter by document IDs.
const queryResponse: QueryResponse = await this.index.query({
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts
index c41f697db..4719541b9 100644
--- a/src/server/ApiManagers/AssistantManager.ts
+++ b/src/server/ApiManagers/AssistantManager.ts
@@ -538,7 +538,7 @@ export default class AssistantManager extends ApiManager {
// Spawn the Python process and track its progress/output
// eslint-disable-next-line no-use-before-define
- spawnPythonProcess(jobId, file_name, public_path);
+ spawnPythonProcess(jobId, public_path);
// Send the job ID back to the client for tracking
res.send({ jobId });
@@ -695,7 +695,7 @@ export default class AssistantManager extends ApiManager {
* @param file_name The name of the file to process.
* @param file_path The filepath of the file to process.
*/
-function spawnPythonProcess(jobId: string, file_name: string, file_path: string) {
+function spawnPythonProcess(jobId: string, file_path: string) {
const venvPath = path.join(__dirname, '../chunker/venv');
const requirementsPath = path.join(__dirname, '../chunker/requirements.txt');
const pythonScriptPath = path.join(__dirname, '../chunker/pdf_chunker.py');
diff --git a/src/server/chunker/pdf_chunker.py b/src/server/chunker/pdf_chunker.py
index a9dbcbb0c..697550f2e 100644
--- a/src/server/chunker/pdf_chunker.py
+++ b/src/server/chunker/pdf_chunker.py
@@ -21,7 +21,7 @@ import json
import os
import uuid # For generating unique IDs
from enum import Enum # Enums for types like document type and purpose
-import cohere # Embedding client
+import openai
import numpy as np
from PyPDF2 import PdfReader # PDF text extraction
from openai import OpenAI # OpenAI client for text completion
@@ -35,8 +35,8 @@ warnings.filterwarnings('ignore', message="torch.load")
dotenv.load_dotenv() # Load environment variables
# Fix for newer versions of PIL
-if parse(PIL.__version__) >= parse('10.0.0'):
- Image.LINEAR = Image.BILINEAR
+# if parse(PIL.__version__) >= parse('10.0.0'):
+# Image.LINEAR = Image.BILINEAR
# Global dictionary to track progress of document processing jobs
current_progress = {}
@@ -727,19 +727,19 @@ class Document:
"""
Embed the text chunks using the Cohere API.
"""
- co = cohere.Client(os.getenv("COHERE_API_KEY")) # Initialize Cohere client with API key
+ openai = OpenAI() # Initialize OpenAI client (no API key passed explicitly here)
batch_size = 90 # Batch size for embedding
chunks_len = len(self.chunks) # Total number of chunks to embed
for i in tqdm(range(0, chunks_len, batch_size), desc="Embedding Chunks"):
batch = self.chunks[i: min(i + batch_size, chunks_len)] # Get batch of chunks
texts = [chunk['metadata']['text'] for chunk in batch] # Extract text from each chunk
- chunk_embs_batch = co.embed(
- texts=texts,
- model="embed-english-v3.0", # Use Cohere's embedding model
- input_type="search_document" # Specify input type
+ chunk_embs_batch = openai.embeddings.create(
+ model="text-embedding-3-large",
+ input=texts,
+ encoding_format="float"
)
- for j, emb in enumerate(chunk_embs_batch.embeddings):
- self.chunks[i + j]['values'] = emb # Store the embeddings in the corresponding chunks
+ for j, data_val in enumerate(chunk_embs_batch.data):
+ self.chunks[i + j]['values'] = data_val.embedding # Store the embeddings in the corresponding chunks
def _generate_summary(self) -> str:
"""