path: root/src/server/chunker/pdf_chunker.py
Diffstat (limited to 'src/server/chunker/pdf_chunker.py')
-rw-r--r--   src/server/chunker/pdf_chunker.py   43
1 file changed, 10 insertions(+), 33 deletions(-)
diff --git a/src/server/chunker/pdf_chunker.py b/src/server/chunker/pdf_chunker.py
index 04d9f51a4..7cb7d077c 100644
--- a/src/server/chunker/pdf_chunker.py
+++ b/src/server/chunker/pdf_chunker.py
@@ -307,7 +307,7 @@ class PDFChunker:
page_texts = await self.extract_text_from_masked_pages(pages, job_id) # Extract text from masked pages
update_progress(job_id, "Processing text...", 0)
- text_chunks = self.chunk_text_with_metadata(page_texts, max_words=1000, job_id=job_id) # Chunk text into smaller parts
+ text_chunks = self.chunk_text_with_metadata(page_texts, max_words=2000, job_id=job_id) # Chunk text into smaller parts
# Combine text and visual elements into a unified structure (chunks)
chunks = self.combine_chunks(text_chunks, [elem for page in pages for elem in page.elements], file_name,
@@ -701,43 +701,20 @@ class Document:
:return: The generated summary of the document.
"""
- # num_clusters = min(10, len(self.chunks)) # Set number of clusters for KMeans, capped at 10
- # kmeans = KMeans(n_clusters=num_clusters, random_state=42) # Initialize KMeans with 10 clusters
- # doc_chunks = [chunk['values'] for chunk in self.chunks if 'values' in chunk] # Extract embeddings
- # cluster_labels = kmeans.fit_predict(doc_chunks) # Assign each chunk to a cluster
-
- doc_chunks = [chunk['values'] for chunk in self.chunks if 'values' in chunk]
- if not doc_chunks:
- raise ValueError("No valid embedded chunks to summarize.")
-
- # Remove duplicates (e.g., from OCR-ed blank pages or repeated captions)
- unique_chunks = np.unique(np.array(doc_chunks), axis=0)
-
- # Dynamically scale number of clusters to available signal
- num_clusters = min(10, len(unique_chunks))
- kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(unique_chunks)
-
- # Predict cluster labels for original chunks (not just unique ones)
- cluster_labels = kmeans.predict(np.array(doc_chunks))
-
+ num_clusters = min(10, len(self.chunks)) # Set number of clusters for KMeans, capped at 10
+ kmeans = KMeans(n_clusters=num_clusters, random_state=42) # Initialize KMeans with num_clusters clusters
+ doc_chunks = [chunk['values'] for chunk in self.chunks if 'values' in chunk] # Extract embeddings
+ cluster_labels = kmeans.fit_predict(doc_chunks) # Assign each chunk to a cluster
# Select representative chunks from each cluster
selected_chunks = []
for i in range(num_clusters):
- # cluster_chunks = [chunk for chunk, label in zip(self.chunks, cluster_labels) if label == i] # Get all chunks in this cluster
- # cluster_embs = [emb for emb, label in zip(doc_chunks, cluster_labels) if label == i] # Get embeddings for this cluster
-
- cluster_idxs = np.where(cluster_labels == i)[0]
- if len(cluster_idxs) == 0:
- continue # skip empty clusters (shouldn't happen after downsizing)
-
+ cluster_chunks = [chunk for chunk, label in zip(self.chunks, cluster_labels) if label == i] # Get all chunks in this cluster
+ cluster_embs = [emb for emb, label in zip(doc_chunks, cluster_labels) if label == i] # Get embeddings for this cluster
centroid = kmeans.cluster_centers_[i] # Get the centroid of the cluster
- distances = [np.linalg.norm(doc_chunks[idx] - centroid) for idx in cluster_idxs]
- closest_idx = cluster_idxs[int(np.argmin(distances))]
- selected_chunks.append(self.chunks[closest_idx])
- # distances = [np.linalg.norm(np.array(emb) - centroid) for emb in cluster_embs] # Compute distance to centroid
- # closest_chunk = cluster_chunks[np.argmin(distances)] # Select chunk closest to the centroid
- # selected_chunks.append(closest_chunk)
+ distances = [np.linalg.norm(np.array(emb) - centroid) for emb in cluster_embs] # Compute distance to centroid
+ closest_chunk = cluster_chunks[np.argmin(distances)] # Select chunk closest to the centroid
+ selected_chunks.append(closest_chunk)
# Combine selected chunks into a summary
combined_text = "\n\n".join([chunk['metadata']['text'] for chunk in selected_chunks]) # Concatenate chunk texts
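
The first hunk raises the chunking threshold from 1000 to 2000 words per chunk, which roughly halves the number of chunks (and embeddings) produced per document. The real chunk_text_with_metadata() implementation is not part of this diff; the sketch below only illustrates the kind of word-count splitting the max_words parameter controls, and its simplified signature and chunk fields are assumptions.

def chunk_text_with_metadata(page_texts, max_words=2000):
    """Illustrative only: split per-page text into chunks of at most max_words words."""
    chunks = []
    for page_num, text in enumerate(page_texts, start=1):
        words = text.split()
        for start in range(0, len(words), max_words):
            chunks.append({
                'text': " ".join(words[start:start + max_words]),  # chunk text
                'page': page_num,                                   # page provenance
            })
    return chunks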
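
The second hunk reverts the deduplication and dynamic-cluster logic back to the simpler flow: cluster all chunk embeddings with KMeans (capped at 10 clusters) and keep, per cluster, the chunk whose embedding is closest to the centroid. A minimal, self-contained sketch of that selection, assuming each chunk dict carries its embedding under 'values' as in the diff:

import numpy as np
from sklearn.cluster import KMeans

def select_representative_chunks(chunks, max_clusters=10):
    # Keep only chunks that carry an embedding under 'values'
    embedded = [chunk for chunk in chunks if 'values' in chunk]
    if not embedded:
        raise ValueError("No embedded chunks to summarize.")
    embeddings = np.array([chunk['values'] for chunk in embedded])

    # Cluster the embeddings; cap the cluster count at max_clusters
    num_clusters = min(max_clusters, len(embedded))
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(embeddings)

    # For each cluster, keep the chunk whose embedding is nearest the centroid
    selected = []
    for i in range(num_clusters):
        idxs = np.where(cluster_labels == i)[0]
        distances = np.linalg.norm(embeddings[idxs] - kmeans.cluster_centers_[i], axis=1)
        selected.append(embedded[idxs[int(np.argmin(distances))]])
    return selected

Unlike the reinstated method, the sketch indexes only the embedded chunks, so chunk-to-label alignment holds even when some chunks lack a 'values' entry.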