aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorSkitty1238 <157652284+Skitty1238@users.noreply.github.com>2025-06-27 13:34:54 -0400
committerSkitty1238 <157652284+Skitty1238@users.noreply.github.com>2025-06-27 13:34:54 -0400
commita7eff29530d6e95058fee3eda6a71a7d168dc913 (patch)
tree8e3c5aa88a745000ddfcdb5b907fcecc6c9e60a7 /src
parentfc423eed6aa81aedfad4ee74f2e05e385858801d (diff)
Fix for small PDFs: dynamic cluster downsizing + summary stability
Diffstat (limited to 'src')
-rw-r--r--src/server/ApiManagers/AssistantManager.ts2
-rw-r--r--src/server/chunker/pdf_chunker.py41
-rw-r--r--src/server/chunker/requirements.txt37
3 files changed, 62 insertions, 18 deletions
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts
index c7c347c71..07c970a4e 100644
--- a/src/server/ApiManagers/AssistantManager.ts
+++ b/src/server/ApiManagers/AssistantManager.ts
@@ -1135,7 +1135,7 @@ function spawnPythonProcess(jobId: string, file_path: string, doc_id: string) {
console.log('Virtual environment not found. Creating and setting up...');
// Create venv
- const createVenvProcess = spawn('python', ['-m', 'venv', venvPath]);
+ const createVenvProcess = spawn('python3.10', ['-m', 'venv', venvPath]);
createVenvProcess.on('close', code => {
if (code !== 0) {
diff --git a/src/server/chunker/pdf_chunker.py b/src/server/chunker/pdf_chunker.py
index e34753176..04d9f51a4 100644
--- a/src/server/chunker/pdf_chunker.py
+++ b/src/server/chunker/pdf_chunker.py
@@ -701,20 +701,43 @@ class Document:
:return: The generated summary of the document.
"""
- num_clusters = min(10, len(self.chunks)) # Set number of clusters for KMeans, capped at 10
- kmeans = KMeans(n_clusters=num_clusters, random_state=42) # Initialize KMeans with 10 clusters
- doc_chunks = [chunk['values'] for chunk in self.chunks if 'values' in chunk] # Extract embeddings
- cluster_labels = kmeans.fit_predict(doc_chunks) # Assign each chunk to a cluster
+ # num_clusters = min(10, len(self.chunks)) # Set number of clusters for KMeans, capped at 10
+ # kmeans = KMeans(n_clusters=num_clusters, random_state=42) # Initialize KMeans with 10 clusters
+ # doc_chunks = [chunk['values'] for chunk in self.chunks if 'values' in chunk] # Extract embeddings
+ # cluster_labels = kmeans.fit_predict(doc_chunks) # Assign each chunk to a cluster
+
+ doc_chunks = [chunk['values'] for chunk in self.chunks if 'values' in chunk]
+ if not doc_chunks:
+ raise ValueError("No valid embedded chunks to summarize.")
+
+ # Remove duplicates (e.g., from OCR-ed blank pages or repeated captions)
+ unique_chunks = np.unique(np.array(doc_chunks), axis=0)
+
+ # Dynamically scale number of clusters to available signal
+ num_clusters = min(10, len(unique_chunks))
+ kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(unique_chunks)
+
+ # Predict cluster labels for original chunks (not just unique ones)
+ cluster_labels = kmeans.predict(np.array(doc_chunks))
+
# Select representative chunks from each cluster
selected_chunks = []
for i in range(num_clusters):
- cluster_chunks = [chunk for chunk, label in zip(self.chunks, cluster_labels) if label == i] # Get all chunks in this cluster
- cluster_embs = [emb for emb, label in zip(doc_chunks, cluster_labels) if label == i] # Get embeddings for this cluster
+ # cluster_chunks = [chunk for chunk, label in zip(self.chunks, cluster_labels) if label == i] # Get all chunks in this cluster
+ # cluster_embs = [emb for emb, label in zip(doc_chunks, cluster_labels) if label == i] # Get embeddings for this cluster
+
+ cluster_idxs = np.where(cluster_labels == i)[0]
+ if len(cluster_idxs) == 0:
+            continue  # defensive: can't occur in practice — every centroid was fit on a unique chunk that is also present in doc_chunks
+
centroid = kmeans.cluster_centers_[i] # Get the centroid of the cluster
- distances = [np.linalg.norm(np.array(emb) - centroid) for emb in cluster_embs] # Compute distance to centroid
- closest_chunk = cluster_chunks[np.argmin(distances)] # Select chunk closest to the centroid
- selected_chunks.append(closest_chunk)
+ distances = [np.linalg.norm(doc_chunks[idx] - centroid) for idx in cluster_idxs]
+ closest_idx = cluster_idxs[int(np.argmin(distances))]
+ selected_chunks.append(self.chunks[closest_idx])
+ # distances = [np.linalg.norm(np.array(emb) - centroid) for emb in cluster_embs] # Compute distance to centroid
+ # closest_chunk = cluster_chunks[np.argmin(distances)] # Select chunk closest to the centroid
+ # selected_chunks.append(closest_chunk)
# Combine selected chunks into a summary
combined_text = "\n\n".join([chunk['metadata']['text'] for chunk in selected_chunks]) # Concatenate chunk texts
diff --git a/src/server/chunker/requirements.txt b/src/server/chunker/requirements.txt
index 20bd486e5..3df3cdd24 100644
--- a/src/server/chunker/requirements.txt
+++ b/src/server/chunker/requirements.txt
@@ -1,15 +1,36 @@
+# Prefer official CPU wheels from the PyTorch index
+--extra-index-url https://download.pytorch.org/whl/cpu
+
+###############################################################################
+# Stable env for pdf_chunker.py #
+###############################################################################
+
+# ─── LLM clients ─────────────────────────────────────────────────────────────
+openai==1.40.6
httpx==0.27.2            # pin <0.28 — httpx 0.28 removed the deprecated "proxies=" kwarg, crashing older openai/anthropic clients
anthropic==0.34.0
cohere==5.8.0
-python-dotenv==1.0.1
+
+# ─── Torch stack (CPU) ───────────────────────────────────────────────────────
+torch==2.5.1
+torchvision==0.20.1 # matches torch 2.5.x
+torchaudio==2.5.1
+
+# ─── Vision / OCR / PDF processing ───────────────────────────────────────────
+ultralyticsplus==0.0.28
+easyocr==1.7.0
pymupdf==1.22.2
-lxml==5.3.0
+PyPDF2==3.0.1
+pytesseract==0.3.10
+Pillow==10.4.0
layoutparser==0.3.4
+lxml==5.3.0
+
+# ─── ML / maths ──────────────────────────────────────────────────────────────
numpy==1.26.4
-openai==1.40.6
-Pillow==10.4.0
-pytesseract==0.3.10
-PyPDF2==3.0.1
scikit-learn==1.5.1
+
+# ─── Utilities ──────────────────────────────────────────────────────────────
tqdm==4.66.5
-ultralyticsplus==0.0.28
-easyocr==1.7.0 \ No newline at end of file
+python-dotenv==1.0.1
+packaging==24.0 \ No newline at end of file