Diffstat (limited to 'src/server/chunker/pdf_chunker.py')
-rw-r--r-- | src/server/chunker/pdf_chunker.py | 41
1 file changed, 9 insertions, 32 deletions
diff --git a/src/server/chunker/pdf_chunker.py b/src/server/chunker/pdf_chunker.py
index 914594f1e..7cb7d077c 100644
--- a/src/server/chunker/pdf_chunker.py
+++ b/src/server/chunker/pdf_chunker.py
@@ -701,43 +701,20 @@ class Document:
         :return: The generated summary of the document.
         """
-        # num_clusters = min(10, len(self.chunks))  # Set number of clusters for KMeans, capped at 10
-        # kmeans = KMeans(n_clusters=num_clusters, random_state=42)  # Initialize KMeans with 10 clusters
-        # doc_chunks = [chunk['values'] for chunk in self.chunks if 'values' in chunk]  # Extract embeddings
-        # cluster_labels = kmeans.fit_predict(doc_chunks)  # Assign each chunk to a cluster
-
-        doc_chunks = [chunk['values'] for chunk in self.chunks if 'values' in chunk]
-        if not doc_chunks:
-            raise ValueError("No valid embedded chunks to summarize.")
-
-        # Remove duplicates (e.g., from OCR-ed blank pages or repeated captions)
-        unique_chunks = np.unique(np.array(doc_chunks), axis=0)
-
-        # Dynamically scale number of clusters to available signal
-        num_clusters = min(10, len(unique_chunks))
-        kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(unique_chunks)
-
-        # Predict cluster labels for original chunks (not just unique ones)
-        cluster_labels = kmeans.predict(np.array(doc_chunks))
-
+        num_clusters = min(10, len(self.chunks))  # Set number of clusters for KMeans, capped at 10
+        kmeans = KMeans(n_clusters=num_clusters, random_state=42)  # Initialize KMeans with 10 clusters
+        doc_chunks = [chunk['values'] for chunk in self.chunks if 'values' in chunk]  # Extract embeddings
+        cluster_labels = kmeans.fit_predict(doc_chunks)  # Assign each chunk to a cluster
         # Select representative chunks from each cluster
         selected_chunks = []
         for i in range(num_clusters):
-            # cluster_chunks = [chunk for chunk, label in zip(self.chunks, cluster_labels) if label == i]  # Get all chunks in this cluster
-            # cluster_embs = [emb for emb, label in zip(doc_chunks, cluster_labels) if label == i]  # Get embeddings for this cluster
-
-            cluster_idxs = np.where(cluster_labels == i)[0]
-            if len(cluster_idxs) == 0:
-                continue  # skip empty clusters (shouldn't happen after downsizing)
-
+            cluster_chunks = [chunk for chunk, label in zip(self.chunks, cluster_labels) if label == i]  # Get all chunks in this cluster
+            cluster_embs = [emb for emb, label in zip(doc_chunks, cluster_labels) if label == i]  # Get embeddings for this cluster
             centroid = kmeans.cluster_centers_[i]  # Get the centroid of the cluster
-            distances = [np.linalg.norm(doc_chunks[idx] - centroid) for idx in cluster_idxs]
-            closest_idx = cluster_idxs[int(np.argmin(distances))]
-            selected_chunks.append(self.chunks[closest_idx])
-            # distances = [np.linalg.norm(np.array(emb) - centroid) for emb in cluster_embs]  # Compute distance to centroid
-            # closest_chunk = cluster_chunks[np.argmin(distances)]  # Select chunk closest to the centroid
-            # selected_chunks.append(closest_chunk)
+            distances = [np.linalg.norm(np.array(emb) - centroid) for emb in cluster_embs]  # Compute distance to centroid
+            closest_chunk = cluster_chunks[np.argmin(distances)]  # Select chunk closest to the centroid
+            selected_chunks.append(closest_chunk)

         # Combine selected chunks into a summary
         combined_text = "\n\n".join([chunk['metadata']['text'] for chunk in selected_chunks])  # Concatenate chunk texts
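
For context, here is a minimal standalone sketch of the centroid-based selection that both sides of this hunk implement, including the deduplication guard the '-' side added. It assumes chunk dicts shaped like {'values': <embedding>, 'metadata': {'text': ...}}, as used in pdf_chunker.py; the function name select_representative_chunks and the max_clusters parameter are illustrative, not part of the source.

    import numpy as np
    from sklearn.cluster import KMeans

    def select_representative_chunks(chunks, max_clusters=10, random_state=42):
        """Return one chunk per cluster: the chunk nearest each KMeans centroid."""
        # Keep only chunks that carry an embedding, so indices into the
        # embedding matrix map back to the right chunk dict.
        embedded = [c for c in chunks if 'values' in c]
        if not embedded:
            raise ValueError("No valid embedded chunks to summarize.")
        doc_chunks = np.array([c['values'] for c in embedded])

        # Fit on deduplicated embeddings so n_clusters never exceeds the
        # number of distinct points (the failure mode the '-' side guards
        # against, e.g. OCR-ed blank pages producing identical vectors).
        unique_chunks = np.unique(doc_chunks, axis=0)
        num_clusters = min(max_clusters, len(unique_chunks))
        kmeans = KMeans(n_clusters=num_clusters, random_state=random_state, n_init=10)
        kmeans.fit(unique_chunks)

        # Label the original (non-deduplicated) chunks.
        cluster_labels = kmeans.predict(doc_chunks)

        selected = []
        for i in range(num_clusters):
            cluster_idxs = np.where(cluster_labels == i)[0]
            if len(cluster_idxs) == 0:
                continue  # predict() on the originals may leave a cluster empty
            centroid = kmeans.cluster_centers_[i]
            # Distance of each member to its centroid; keep the closest chunk.
            distances = np.linalg.norm(doc_chunks[cluster_idxs] - centroid, axis=1)
            selected.append(embedded[cluster_idxs[int(np.argmin(distances))]])
        return selected

A caller would then join the selected texts much as the hunk's tail does, e.g. "\n\n".join(c['metadata']['text'] for c in select_representative_chunks(doc.chunks)), where doc.chunks stands in for the Document instance's chunk list.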