author     bobzel <zzzman@gmail.com>        2025-02-24 16:24:41 -0500
committer  GitHub <noreply@github.com>      2025-02-24 16:24:41 -0500
commit     22763cc4d69ac8b2436a3ef53d79142a43299dbc (patch)
tree       7ee37b74c8ba511ac59160f9b11f251861faab1d /src/server/chunker/pdf_chunker.py
parent     383a8a2f017c12c578537d3cb3005e00be019bd7 (diff)
parent     7b0bd66a0ad22b5a5cb17e76e811b59c6c7ca729 (diff)
Merge pull request #333 from brown-dash/ajs-finalagent
Ajs finalagent
Diffstat (limited to 'src/server/chunker/pdf_chunker.py')
-rw-r--r--  src/server/chunker/pdf_chunker.py  |  20
1 file changed, 10 insertions(+), 10 deletions(-)
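
For context: the diff below swaps the Cohere embedding client for OpenAI's embeddings API (text-embedding-3-large). A minimal standalone sketch of the new call pattern, assuming OPENAI_API_KEY is set in the environment and chunks shaped like those in this file (the sample data here is hypothetical):

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

# Hypothetical chunk layout mirroring self.chunks in pdf_chunker.py
chunks = [{"metadata": {"text": "example chunk text"}, "values": None}]
batch_size = 90  # same batch size as in the diff

for i in range(0, len(chunks), batch_size):
    batch = chunks[i:i + batch_size]
    texts = [c["metadata"]["text"] for c in batch]
    resp = client.embeddings.create(
        model="text-embedding-3-large",
        input=texts,
        encoding_format="float",
    )
    for j, item in enumerate(resp.data):
        chunks[i + j]["values"] = item.embedding  # one float vector per input text

The response's data entries come back in input order, so indexing by j lines each embedding up with its source chunk.
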
diff --git a/src/server/chunker/pdf_chunker.py b/src/server/chunker/pdf_chunker.py
index a9dbcbb0c..697550f2e 100644
--- a/src/server/chunker/pdf_chunker.py
+++ b/src/server/chunker/pdf_chunker.py
@@ -21,7 +21,7 @@ import json
import os
import uuid # For generating unique IDs
from enum import Enum # Enums for types like document type and purpose
-import cohere # Embedding client
+import openai # Embedding client
import numpy as np
from PyPDF2 import PdfReader # PDF text extraction
from openai import OpenAI # OpenAI client for text completion
@@ -35,8 +35,8 @@ warnings.filterwarnings('ignore', message="torch.load")
dotenv.load_dotenv() # Load environment variables
# Fix for newer versions of PIL
-if parse(PIL.__version__) >= parse('10.0.0'):
- Image.LINEAR = Image.BILINEAR
+# if parse(PIL.__version__) >= parse('10.0.0'):
+# Image.LINEAR = Image.BILINEAR
# Global dictionary to track progress of document processing jobs
current_progress = {}
@@ -727,19 +727,19 @@ class Document:
"""
Embed the text chunks using the Cohere API.
"""
- co = cohere.Client(os.getenv("COHERE_API_KEY")) # Initialize Cohere client with API key
+ openai = OpenAI() # Initialize OpenAI client (reads OPENAI_API_KEY from the environment)
batch_size = 90 # Batch size for embedding
chunks_len = len(self.chunks) # Total number of chunks to embed
for i in tqdm(range(0, chunks_len, batch_size), desc="Embedding Chunks"):
batch = self.chunks[i: min(i + batch_size, chunks_len)] # Get batch of chunks
texts = [chunk['metadata']['text'] for chunk in batch] # Extract text from each chunk
- chunk_embs_batch = co.embed(
- texts=texts,
- model="embed-english-v3.0", # Use Cohere's embedding model
- input_type="search_document" # Specify input type
+ chunk_embs_batch = openai.embeddings.create(
+ model="text-embedding-3-large",
+ input=texts,
+ encoding_format="float"
)
- for j, emb in enumerate(chunk_embs_batch.embeddings):
- self.chunks[i + j]['values'] = emb # Store the embeddings in the corresponding chunks
+ for j, data_val in enumerate(chunk_embs_batch.data):
+ self.chunks[i + j]['values'] = data_val.embedding # Store the embeddings in the corresponding chunks
def _generate_summary(self) -> str:
"""