Merge branch 'master' into aarav_edit

author: bobzel <zzzman@gmail.com> 2025-03-06 16:17:47 -0500
committer: bobzel <zzzman@gmail.com> 2025-03-06 16:17:47 -0500
commit: 5ad858090f3006631062877d90120e3cc505fada (patch)
tree: 9f87a8e1e7098a1025f6f4aac332dbc854db5be3 /src/server/chunker/pdf_chunker.py
parent: 9c2a7c14fd9d0e44609aab30c6323583162009db (diff)
parent: adaa107aac8558fa6f46e6ba1263c650c212d506 (diff)
1 files changed, 10 insertions, 10 deletions
diff --git a/src/server/chunker/pdf_chunker.py b/src/server/chunker/pdf_chunker.py
index a9dbcbb0c..697550f2e 100644
--- a/src/server/chunker/pdf_chunker.py
+++ b/src/server/chunker/pdf_chunker.py
@@ -21,7 +21,7 @@ import json
 import os
 import uuid  # For generating unique IDs
 from enum import Enum  # Enums for types like document type and purpose
-import cohere  # Embedding client
+import openai
 import numpy as np
 from PyPDF2 import PdfReader  # PDF text extraction
 from openai import OpenAI  # OpenAI client for text completion
@@ -35,8 +35,8 @@ warnings.filterwarnings('ignore', message="torch.load")
 dotenv.load_dotenv()  # Load environment variables
 
 # Fix for newer versions of PIL
-if parse(PIL.__version__) >= parse('10.0.0'):
-    Image.LINEAR = Image.BILINEAR
+# if parse(PIL.__version__) >= parse('10.0.0'):
+#     Image.LINEAR = Image.BILINEAR
 
 # Global dictionary to track progress of document processing jobs
 current_progress = {}
@@ -727,19 +727,19 @@ class Document:
         """
         Embed the text chunks using the Cohere API.
         """
-        co = cohere.Client(os.getenv("COHERE_API_KEY"))  # Initialize Cohere client with API key
+        openai = OpenAI()  # Initialize OpenAI client used for embeddings
         batch_size = 90  # Batch size for embedding
         chunks_len = len(self.chunks)  # Total number of chunks to embed
         for i in tqdm(range(0, chunks_len, batch_size), desc="Embedding Chunks"):
             batch = self.chunks[i: min(i + batch_size, chunks_len)]  # Get batch of chunks
             texts = [chunk['metadata']['text'] for chunk in batch]  # Extract text from each chunk
-            chunk_embs_batch = co.embed(
-                texts=texts,
-                model="embed-english-v3.0",  # Use Cohere's embedding model
-                input_type="search_document"  # Specify input type
+            chunk_embs_batch = openai.embeddings.create(
+                model="text-embedding-3-large",
+                input=texts,
+                encoding_format="float"
             )
-            for j, emb in enumerate(chunk_embs_batch.embeddings):
-                self.chunks[i + j]['values'] = emb  # Store the embeddings in the corresponding chunks
+            for j, data_val in enumerate(chunk_embs_batch.data):
+                self.chunks[i + j]['values'] = data_val.embedding  # Store the embeddings in the corresponding chunks
 
     def _generate_summary(self) -> str:
         """
author	bobzel <zzzman@gmail.com>	2025-03-06 16:17:47 -0500
committer	bobzel <zzzman@gmail.com>	2025-03-06 16:17:47 -0500
commit	5ad858090f3006631062877d90120e3cc505fada (patch)
tree	9f87a8e1e7098a1025f6f4aac332dbc854db5be3 /src/server/chunker/pdf_chunker.py
parent	9c2a7c14fd9d0e44609aab30c6323583162009db (diff)
parent	adaa107aac8558fa6f46e6ba1263c650c212d506 (diff)