diff options
author | bobzel <zzzman@gmail.com> | 2025-02-24 16:24:41 -0500 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-02-24 16:24:41 -0500 |
commit | 22763cc4d69ac8b2436a3ef53d79142a43299dbc (patch) | |
tree | 7ee37b74c8ba511ac59160f9b11f251861faab1d /src/server/chunker/pdf_chunker.py | |
parent | 383a8a2f017c12c578537d3cb3005e00be019bd7 (diff) | |
parent | 7b0bd66a0ad22b5a5cb17e76e811b59c6c7ca729 (diff) |
Merge pull request #333 from brown-dash/ajs-finalagent
Ajs finalagent
Diffstat (limited to 'src/server/chunker/pdf_chunker.py')
-rw-r--r-- | src/server/chunker/pdf_chunker.py | 20 |
1 file changed, 10 insertions, 10 deletions
diff --git a/src/server/chunker/pdf_chunker.py b/src/server/chunker/pdf_chunker.py index a9dbcbb0c..697550f2e 100644 --- a/src/server/chunker/pdf_chunker.py +++ b/src/server/chunker/pdf_chunker.py @@ -21,7 +21,7 @@ import json import os import uuid # For generating unique IDs from enum import Enum # Enums for types like document type and purpose -import cohere # Embedding client +import openai import numpy as np from PyPDF2 import PdfReader # PDF text extraction from openai import OpenAI # OpenAI client for text completion @@ -35,8 +35,8 @@ warnings.filterwarnings('ignore', message="torch.load") dotenv.load_dotenv() # Load environment variables # Fix for newer versions of PIL -if parse(PIL.__version__) >= parse('10.0.0'): - Image.LINEAR = Image.BILINEAR +# if parse(PIL.__version__) >= parse('10.0.0'): +# Image.LINEAR = Image.BILINEAR # Global dictionary to track progress of document processing jobs current_progress = {} @@ -727,19 +727,19 @@ class Document: """ Embed the text chunks using the Cohere API. 
""" - co = cohere.Client(os.getenv("COHERE_API_KEY")) # Initialize Cohere client with API key + openai = OpenAI() # Initialize Cohere client with API key batch_size = 90 # Batch size for embedding chunks_len = len(self.chunks) # Total number of chunks to embed for i in tqdm(range(0, chunks_len, batch_size), desc="Embedding Chunks"): batch = self.chunks[i: min(i + batch_size, chunks_len)] # Get batch of chunks texts = [chunk['metadata']['text'] for chunk in batch] # Extract text from each chunk - chunk_embs_batch = co.embed( - texts=texts, - model="embed-english-v3.0", # Use Cohere's embedding model - input_type="search_document" # Specify input type + chunk_embs_batch = openai.embeddings.create( + model="text-embedding-3-large", + input=texts, + encoding_format="float" ) - for j, emb in enumerate(chunk_embs_batch.embeddings): - self.chunks[i + j]['values'] = emb # Store the embeddings in the corresponding chunks + for j, data_val in enumerate(chunk_embs_batch.data): + self.chunks[i + j]['values'] = data_val.embedding # Store the embeddings in the corresponding chunks def _generate_summary(self) -> str: """ |