Chunker now works with the Python script: progress and errors go to stderr, final JSON result stays on stdout

author: A.J. Shulman <Shulman.aj@gmail.com> 2024-09-19 12:36:18 -0400
committer: A.J. Shulman <Shulman.aj@gmail.com> 2024-09-19 12:36:18 -0400
commit: 2d61b3b0d00c239f05615c691ffbf4b98f3054e9 (patch)
tree: 2e20441e58bc19d2fa06462ea633fec661c14b4d /src/server/chunker/pdf_chunker.py
parent: badc8362c80ca33d2b3d93dda6a73b3bfb35a214 (diff)
1 files changed, 13 insertions, 27 deletions
diff --git a/src/server/chunker/pdf_chunker.py b/src/server/chunker/pdf_chunker.py
index c9f6737e7..12e71c29d 100644
--- a/src/server/chunker/pdf_chunker.py
+++ b/src/server/chunker/pdf_chunker.py
@@ -26,6 +26,12 @@ import numpy as np
 from PyPDF2 import PdfReader  # PDF text extraction
 from openai import OpenAI  # OpenAI client for text completion
 from sklearn.cluster import KMeans  # Clustering for summarization
+import warnings
+
+# Suppress warnings by message pattern (filterwarnings matches the regex against the start of the warning text)
+warnings.filterwarnings('ignore', message="Valid config keys have changed")
+warnings.filterwarnings('ignore', message="torch.load")
+
 
 dotenv.load_dotenv()  # Load environment variables
 
@@ -36,7 +42,6 @@ if parse(PIL.__version__) >= parse('10.0.0'):
 # Global dictionary to track progress of document processing jobs
 current_progress = {}
 
-
 def update_progress(job_id, step, progress_value):
     """
     Output the progress in JSON format to stdout for the Node.js process to capture.
@@ -46,15 +51,8 @@ def update_progress(job_id, step, progress_value):
         "step": step,
         "progress": progress_value
     }
-    print(json.dumps(progress_data))  # Output progress to stdout
-    sys.stdout.flush()  # Ensure it's sent immediately
-
-
-def get_current_progress():
-    """
-    Return the current progress of all jobs.
-    """
-    return current_progress
+    print(json.dumps(progress_data), file=sys.stderr)  # Use stderr for progress logs
+    sys.stderr.flush()  # Ensure it's sent immediately
 
 
 class ElementExtractor:
@@ -698,25 +696,13 @@ def process_document(file_data, file_name, job_id):
     return new_document.to_json()
 
 
-def print_progress(job_id, step, progress_value):
-    """
-    Output the progress in JSON format to stdout for the Node.js process to capture.
-    """
-    progress_data = {
-        "job_id": job_id,
-        "step": step,
-        "progress": progress_value
-    }
-    print(json.dumps(progress_data))  # Output progress to stdout
-    sys.stdout.flush()  # Ensure it's sent immediately
-
 
 def main():
     """
     Main entry point for the script, called with arguments from Node.js.
     """
     if len(sys.argv) != 4:
-        print(json.dumps({"error": "Invalid arguments"}))
+        print(json.dumps({"error": "Invalid arguments"}), file=sys.stderr)
         return
 
     job_id = sys.argv[1]
@@ -730,14 +716,14 @@ def main():
         # Process the document
         document_result = process_document(file_bytes, file_name, job_id)
 
-        # Output the final result as JSON
+        # Output the final result as JSON to stdout
         print(document_result)
         sys.stdout.flush()
 
     except Exception as e:
-        # If any error occurs, print the error to stdout for Node.js to capture
-        print(json.dumps({"error": str(e)}))
-        sys.stdout.flush()
+        # Print errors to stderr so they don't interfere with JSON output
+        print(json.dumps({"error": str(e)}), file=sys.stderr)
+        sys.stderr.flush()
 
 
 if __name__ == "__main__":
author	A.J. Shulman <Shulman.aj@gmail.com>	2024-09-19 12:36:18 -0400
committer	A.J. Shulman <Shulman.aj@gmail.com>	2024-09-19 12:36:18 -0400
commit	2d61b3b0d00c239f05615c691ffbf4b98f3054e9 (patch)
tree	2e20441e58bc19d2fa06462ea633fec661c14b4d /src/server/chunker/pdf_chunker.py
parent	badc8362c80ca33d2b3d93dda6a73b3bfb35a214 (diff)