diff options
Diffstat (limited to 'src/server/chunker/pdf_chunker.py')
-rw-r--r-- | src/server/chunker/pdf_chunker.py | 40 |
1 file changed, 13 insertions, 27 deletions
diff --git a/src/server/chunker/pdf_chunker.py b/src/server/chunker/pdf_chunker.py index c9f6737e7..12e71c29d 100644 --- a/src/server/chunker/pdf_chunker.py +++ b/src/server/chunker/pdf_chunker.py @@ -26,6 +26,12 @@ import numpy as np from PyPDF2 import PdfReader # PDF text extraction from openai import OpenAI # OpenAI client for text completion from sklearn.cluster import KMeans # Clustering for summarization +import warnings + +# Silence specific warnings +warnings.filterwarnings('ignore', message="Valid config keys have changed") +warnings.filterwarnings('ignore', message="torch.load") + dotenv.load_dotenv() # Load environment variables @@ -36,7 +42,6 @@ if parse(PIL.__version__) >= parse('10.0.0'): # Global dictionary to track progress of document processing jobs current_progress = {} - def update_progress(job_id, step, progress_value): """ Output the progress in JSON format to stdout for the Node.js process to capture. @@ -46,15 +51,8 @@ def update_progress(job_id, step, progress_value): "step": step, "progress": progress_value } - print(json.dumps(progress_data)) # Output progress to stdout - sys.stdout.flush() # Ensure it's sent immediately - - -def get_current_progress(): - """ - Return the current progress of all jobs. - """ - return current_progress + print(json.dumps(progress_data), file=sys.stderr) # Use stderr for progress logs + sys.stderr.flush() # Ensure it's sent immediately class ElementExtractor: @@ -698,25 +696,13 @@ def process_document(file_data, file_name, job_id): return new_document.to_json() -def print_progress(job_id, step, progress_value): - """ - Output the progress in JSON format to stdout for the Node.js process to capture. - """ - progress_data = { - "job_id": job_id, - "step": step, - "progress": progress_value - } - print(json.dumps(progress_data)) # Output progress to stdout - sys.stdout.flush() # Ensure it's sent immediately - def main(): """ Main entry point for the script, called with arguments from Node.js. 
""" if len(sys.argv) != 4: - print(json.dumps({"error": "Invalid arguments"})) + print(json.dumps({"error": "Invalid arguments"}), file=sys.stderr) return job_id = sys.argv[1] @@ -730,14 +716,14 @@ def main(): # Process the document document_result = process_document(file_bytes, file_name, job_id) - # Output the final result as JSON + # Output the final result as JSON to stdout print(document_result) sys.stdout.flush() except Exception as e: - # If any error occurs, print the error to stdout for Node.js to capture - print(json.dumps({"error": str(e)})) - sys.stdout.flush() + # Print errors to stderr so they don't interfere with JSON output + print(json.dumps({"error": str(e)}), file=sys.stderr) + sys.stderr.flush() if __name__ == "__main__": |