about | summary | refs | log | tree | commit | diff
path: root/src/server/chunker/pdf_chunker.py
diff options
context:
space:
mode:
author: A.J. Shulman <Shulman.aj@gmail.com> 2024-09-19 12:36:18 -0400
committer: A.J. Shulman <Shulman.aj@gmail.com> 2024-09-19 12:36:18 -0400
commit: 2d61b3b0d00c239f05615c691ffbf4b98f3054e9 (patch)
tree: 2e20441e58bc19d2fa06462ea633fec661c14b4d /src/server/chunker/pdf_chunker.py
parent: badc8362c80ca33d2b3d93dda6a73b3bfb35a214 (diff)
Working now with Python script
Diffstat (limited to 'src/server/chunker/pdf_chunker.py')
-rw-r--r-- src/server/chunker/pdf_chunker.py 40
1 files changed, 13 insertions, 27 deletions
diff --git a/src/server/chunker/pdf_chunker.py b/src/server/chunker/pdf_chunker.py
index c9f6737e7..12e71c29d 100644
--- a/src/server/chunker/pdf_chunker.py
+++ b/src/server/chunker/pdf_chunker.py
@@ -26,6 +26,12 @@ import numpy as np
from PyPDF2 import PdfReader # PDF text extraction
from openai import OpenAI # OpenAI client for text completion
from sklearn.cluster import KMeans # Clustering for summarization
+import warnings
+
+# Silence specific warnings
+warnings.filterwarnings('ignore', message="Valid config keys have changed")
+warnings.filterwarnings('ignore', message="torch.load")
+
dotenv.load_dotenv() # Load environment variables
@@ -36,7 +42,6 @@ if parse(PIL.__version__) >= parse('10.0.0'):
# Global dictionary to track progress of document processing jobs
current_progress = {}
-
def update_progress(job_id, step, progress_value):
"""
Output the progress in JSON format to stdout for the Node.js process to capture.
@@ -46,15 +51,8 @@ def update_progress(job_id, step, progress_value):
"step": step,
"progress": progress_value
}
- print(json.dumps(progress_data)) # Output progress to stdout
- sys.stdout.flush() # Ensure it's sent immediately
-
-
-def get_current_progress():
- """
- Return the current progress of all jobs.
- """
- return current_progress
+ print(json.dumps(progress_data), file=sys.stderr) # Use stderr for progress logs
+ sys.stderr.flush() # Ensure it's sent immediately
class ElementExtractor:
@@ -698,25 +696,13 @@ def process_document(file_data, file_name, job_id):
return new_document.to_json()
-def print_progress(job_id, step, progress_value):
- """
- Output the progress in JSON format to stdout for the Node.js process to capture.
- """
- progress_data = {
- "job_id": job_id,
- "step": step,
- "progress": progress_value
- }
- print(json.dumps(progress_data)) # Output progress to stdout
- sys.stdout.flush() # Ensure it's sent immediately
-
def main():
"""
Main entry point for the script, called with arguments from Node.js.
"""
if len(sys.argv) != 4:
- print(json.dumps({"error": "Invalid arguments"}))
+ print(json.dumps({"error": "Invalid arguments"}), file=sys.stderr)
return
job_id = sys.argv[1]
@@ -730,14 +716,14 @@ def main():
# Process the document
document_result = process_document(file_bytes, file_name, job_id)
- # Output the final result as JSON
+ # Output the final result as JSON to stdout
print(document_result)
sys.stdout.flush()
except Exception as e:
- # If any error occurs, print the error to stdout for Node.js to capture
- print(json.dumps({"error": str(e)}))
- sys.stdout.flush()
+ # Print errors to stderr so they don't interfere with JSON output
+ print(json.dumps({"error": str(e)}), file=sys.stderr)
+ sys.stderr.flush()
if __name__ == "__main__":