Diffstat (limited to 'src')
-rw-r--r--  src/server/ApiManagers/AssistantManager.ts |  2
-rw-r--r--  src/server/chunker/pdf_chunker.py          | 41
-rw-r--r--  src/server/chunker/requirements.txt        | 37
3 files changed, 62 insertions(+), 18 deletions(-)
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts
index c7c347c71..07c970a4e 100644
--- a/src/server/ApiManagers/AssistantManager.ts
+++ b/src/server/ApiManagers/AssistantManager.ts
@@ -1135,7 +1135,7 @@ function spawnPythonProcess(jobId: string, file_path: string, doc_id: string) {
         console.log('Virtual environment not found. Creating and setting up...');
 
         // Create venv
-        const createVenvProcess = spawn('python', ['-m', 'venv', venvPath]);
+        const createVenvProcess = spawn('python3.10', ['-m', 'venv', venvPath]);
 
         createVenvProcess.on('close', code => {
             if (code !== 0) {
diff --git a/src/server/chunker/pdf_chunker.py b/src/server/chunker/pdf_chunker.py
index e34753176..04d9f51a4 100644
--- a/src/server/chunker/pdf_chunker.py
+++ b/src/server/chunker/pdf_chunker.py
@@ -701,20 +701,43 @@ class Document:
         :return: The generated summary of the document.
         """
-        num_clusters = min(10, len(self.chunks))  # Set number of clusters for KMeans, capped at 10
-        kmeans = KMeans(n_clusters=num_clusters, random_state=42)  # Initialize KMeans with 10 clusters
-        doc_chunks = [chunk['values'] for chunk in self.chunks if 'values' in chunk]  # Extract embeddings
-        cluster_labels = kmeans.fit_predict(doc_chunks)  # Assign each chunk to a cluster
+        # num_clusters = min(10, len(self.chunks))  # Set number of clusters for KMeans, capped at 10
+        # kmeans = KMeans(n_clusters=num_clusters, random_state=42)  # Initialize KMeans with 10 clusters
+        # doc_chunks = [chunk['values'] for chunk in self.chunks if 'values' in chunk]  # Extract embeddings
+        # cluster_labels = kmeans.fit_predict(doc_chunks)  # Assign each chunk to a cluster
+
+        doc_chunks = [chunk['values'] for chunk in self.chunks if 'values' in chunk]
+        if not doc_chunks:
+            raise ValueError("No valid embedded chunks to summarize.")
+
+        # Remove duplicates (e.g., from OCR-ed blank pages or repeated captions)
+        unique_chunks = np.unique(np.array(doc_chunks), axis=0)
+
+        # Dynamically scale number of clusters to available signal
+        num_clusters = min(10, len(unique_chunks))
+        kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(unique_chunks)
+
+        # Predict cluster labels for original chunks (not just unique ones)
+        cluster_labels = kmeans.predict(np.array(doc_chunks))
+
         # Select representative chunks from each cluster
         selected_chunks = []
         for i in range(num_clusters):
-            cluster_chunks = [chunk for chunk, label in zip(self.chunks, cluster_labels) if label == i]  # Get all chunks in this cluster
-            cluster_embs = [emb for emb, label in zip(doc_chunks, cluster_labels) if label == i]  # Get embeddings for this cluster
+            # cluster_chunks = [chunk for chunk, label in zip(self.chunks, cluster_labels) if label == i]  # Get all chunks in this cluster
+            # cluster_embs = [emb for emb, label in zip(doc_chunks, cluster_labels) if label == i]  # Get embeddings for this cluster
+
+            cluster_idxs = np.where(cluster_labels == i)[0]
+            if len(cluster_idxs) == 0:
+                continue  # skip empty clusters (shouldn't happen after downsizing)
+
             centroid = kmeans.cluster_centers_[i]  # Get the centroid of the cluster
-            distances = [np.linalg.norm(np.array(emb) - centroid) for emb in cluster_embs]  # Compute distance to centroid
-            closest_chunk = cluster_chunks[np.argmin(distances)]  # Select chunk closest to the centroid
-            selected_chunks.append(closest_chunk)
+            distances = [np.linalg.norm(doc_chunks[idx] - centroid) for idx in cluster_idxs]
+            closest_idx = cluster_idxs[int(np.argmin(distances))]
+            selected_chunks.append(self.chunks[closest_idx])
+            # distances = [np.linalg.norm(np.array(emb) - centroid) for emb in cluster_embs]  # Compute distance to centroid
+            # closest_chunk = cluster_chunks[np.argmin(distances)]  # Select chunk closest to the centroid
+            # selected_chunks.append(closest_chunk)
 
         # Combine selected chunks into a summary
         combined_text = "\n\n".join([chunk['metadata']['text'] for chunk in selected_chunks])  # Concatenate chunk texts
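
Note for reviewers: the rewritten block above guards against a real failure mode. scikit-learn's KMeans raises once n_clusters exceeds the number of distinct samples, so near-duplicate embeddings (OCR-ed blank pages, repeated captions) could crash the old fit_predict call. A minimal standalone sketch of the same centroid-nearest selection, runnable outside the Document class; the function name and synthetic data here are illustrative, not part of this commit:

    import numpy as np
    from sklearn.cluster import KMeans

    def select_representative_chunks(embeddings: np.ndarray, max_clusters: int = 10) -> list[int]:
        """Return indices of the chunks sitting closest to each KMeans centroid."""
        if len(embeddings) == 0:
            raise ValueError("No valid embedded chunks to summarize.")

        # Deduplicate first so n_clusters never exceeds the number of distinct
        # points -- the same role np.unique() plays in the diff above.
        unique = np.unique(embeddings, axis=0)
        k = min(max_clusters, len(unique))

        kmeans = KMeans(n_clusters=k, random_state=42).fit(unique)
        labels = kmeans.predict(embeddings)  # label every original chunk, duplicates included

        selected = []
        for i in range(k):
            idxs = np.where(labels == i)[0]
            if len(idxs) == 0:
                continue  # defensive; rare once k is capped by the dedup step
            dists = np.linalg.norm(embeddings[idxs] - kmeans.cluster_centers_[i], axis=1)
            selected.append(int(idxs[np.argmin(dists)]))
        return selected

    # Smoke test: 40 synthetic 8-dimensional embeddings -> at most 10 indices
    rng = np.random.default_rng(0)
    print(select_representative_chunks(rng.normal(size=(40, 8))))

Predicting labels for the full, non-deduplicated array keeps the returned indices aligned with self.chunks, which is why the diff predicts on doc_chunks instead of reusing the labels from fitting.
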
diff --git a/src/server/chunker/requirements.txt b/src/server/chunker/requirements.txt
index 20bd486e5..3df3cdd24 100644
--- a/src/server/chunker/requirements.txt
+++ b/src/server/chunker/requirements.txt
@@ -1,15 +1,36 @@
+# Prefer official CPU wheels from the PyTorch index
+--extra-index-url https://download.pytorch.org/whl/cpu
+
+###############################################################################
+# Stable env for pdf_chunker.py                                               #
+###############################################################################
+
+# ─── LLM clients ─────────────────────────────────────────────────────────────
+openai==1.40.6
+httpx==0.27.2            # <0.28 → avoids "proxies=" crash
 anthropic==0.34.0
 cohere==5.8.0
-python-dotenv==1.0.1
+
+# ─── Torch stack (CPU) ───────────────────────────────────────────────────────
+torch==2.5.1
+torchvision==0.20.1      # matches torch 2.5.x
+torchaudio==2.5.1
+
+# ─── Vision / OCR / PDF processing ───────────────────────────────────────────
+ultralyticsplus==0.0.28
+easyocr==1.7.0
 pymupdf==1.22.2
-lxml==5.3.0
+PyPDF2==3.0.1
+pytesseract==0.3.10
+Pillow==10.4.0
 layoutparser==0.3.4
+lxml==5.3.0
+
+# ─── ML / maths ──────────────────────────────────────────────────────────────
 numpy==1.26.4
-openai==1.40.6
-Pillow==10.4.0
-pytesseract==0.3.10
-PyPDF2==3.0.1
 scikit-learn==1.5.1
+
+# ─── Utilities ───────────────────────────────────────────────────────────────
 tqdm==4.66.5
-ultralyticsplus==0.0.28
-easyocr==1.7.0
\ No newline at end of file
+python-dotenv==1.0.1
+packaging==24.0
\ No newline at end of file
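
The inline comments in the new requirements file encode two real constraints: httpx must stay below 0.28 (0.28 removed the deprecated proxies= argument, which crashes the pinned openai client), and torchvision 0.20.x is the release series matched to torch 2.5.x. Since packaging==24.0 is now pinned anyway, a fail-fast guard along these lines could run at chunker startup; this helper is a hypothetical sketch under those assumptions, not part of the commit:

    # Hypothetical startup guard for the pins above; not part of this commit.
    from importlib.metadata import version
    from packaging.version import Version

    def check_pins() -> None:
        # httpx >= 0.28 removed the 'proxies=' argument the pinned client still passes.
        if Version(version("httpx")) >= Version("0.28"):
            raise RuntimeError("httpx >= 0.28 breaks the pinned openai client")
        # torch 2.5.x pairs with torchvision 0.20.x.
        torch_v, tv_v = Version(version("torch")), Version(version("torchvision"))
        if (torch_v.major, torch_v.minor) != (2, 5) or (tv_v.major, tv_v.minor) != (0, 20):
            raise RuntimeError(f"torch {torch_v} / torchvision {tv_v} are not a matched pair")

    if __name__ == "__main__":
        check_pins()
        print("pinned environment looks consistent")

Run inside the venv after pip install -r requirements.txt; it catches silent drift (e.g., a stray pip install --upgrade httpx) before the first PDF job fails.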