diff options
| author | bobzel <zzzman@gmail.com> | 2025-03-06 16:17:47 -0500 | 
|---|---|---|
| committer | bobzel <zzzman@gmail.com> | 2025-03-06 16:17:47 -0500 | 
| commit | 5ad858090f3006631062877d90120e3cc505fada (patch) | |
| tree | 9f87a8e1e7098a1025f6f4aac332dbc854db5be3 /src/server/chunker/pdf_chunker.py | |
| parent | 9c2a7c14fd9d0e44609aab30c6323583162009db (diff) | |
| parent | adaa107aac8558fa6f46e6ba1263c650c212d506 (diff) | |
Merge branch 'master' into aarav_edit
Diffstat (limited to 'src/server/chunker/pdf_chunker.py')
| -rw-r--r-- | src/server/chunker/pdf_chunker.py | 20 | 
1 file changed, 10 insertions, 10 deletions
```diff
diff --git a/src/server/chunker/pdf_chunker.py b/src/server/chunker/pdf_chunker.py
index a9dbcbb0c..697550f2e 100644
--- a/src/server/chunker/pdf_chunker.py
+++ b/src/server/chunker/pdf_chunker.py
@@ -21,7 +21,7 @@ import json
 import os
 import uuid  # For generating unique IDs
 from enum import Enum  # Enums for types like document type and purpose
-import cohere  # Embedding client
+import openai
 import numpy as np
 from PyPDF2 import PdfReader  # PDF text extraction
 from openai import OpenAI  # OpenAI client for text completion
@@ -35,8 +35,8 @@ warnings.filterwarnings('ignore', message="torch.load")
 dotenv.load_dotenv()  # Load environment variables
 
 # Fix for newer versions of PIL
-if parse(PIL.__version__) >= parse('10.0.0'):
-    Image.LINEAR = Image.BILINEAR
+# if parse(PIL.__version__) >= parse('10.0.0'):
+#     Image.LINEAR = Image.BILINEAR
 
 # Global dictionary to track progress of document processing jobs
 current_progress = {}
@@ -727,19 +727,19 @@ class Document:
         """
         Embed the text chunks using the Cohere API.
         """
-        co = cohere.Client(os.getenv("COHERE_API_KEY"))  # Initialize Cohere client with API key
+        openai = OpenAI()  # Initialize Cohere client with API key
         batch_size = 90  # Batch size for embedding
         chunks_len = len(self.chunks)  # Total number of chunks to embed
         for i in tqdm(range(0, chunks_len, batch_size), desc="Embedding Chunks"):
             batch = self.chunks[i: min(i + batch_size, chunks_len)]  # Get batch of chunks
             texts = [chunk['metadata']['text'] for chunk in batch]  # Extract text from each chunk
-            chunk_embs_batch = co.embed(
-                texts=texts,
-                model="embed-english-v3.0",  # Use Cohere's embedding model
-                input_type="search_document"  # Specify input type
+            chunk_embs_batch = openai.embeddings.create(
+                model="text-embedding-3-large",
+                input=texts,
+                encoding_format="float"
             )
-            for j, emb in enumerate(chunk_embs_batch.embeddings):
-                self.chunks[i + j]['values'] = emb  # Store the embeddings in the corresponding chunks
+            for j, data_val in enumerate(chunk_embs_batch.data):
+                self.chunks[i + j]['values'] = data_val.embedding  # Store the embeddings in the corresponding chunks
 
     def _generate_summary(self) -> str:
         """
```
