Diffstat (limited to 'src')
-rw-r--r--  src/server/ApiManagers/AssistantManager.ts  47
-rw-r--r--  src/server/chunker/pdf_chunker.py            40
2 files changed, 44 insertions, 43 deletions
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts
index dfe5d747b..224d47d3b 100644
--- a/src/server/ApiManagers/AssistantManager.ts
+++ b/src/server/ApiManagers/AssistantManager.ts
@@ -291,7 +291,10 @@ export default class AssistantManager extends ApiManager {
                 if (jobProgress[jobId]) {
                     res.json(jobProgress[jobId]);
                 } else {
-                    res.status(404).send({ error: 'Job not found' });
+                    res.json({
+                        step: 'Processing Document...',
+                        progress: '0',
+                    });
                 }
             },
         });
@@ -452,43 +455,55 @@ function spawnPythonProcess(jobId: string, file_name: string, file_data: string)
    ]);

    let pythonOutput = ''; // Accumulate stdout data
+   let stderrOutput = ''; // For stderr logs and progress

-   // Handle stdout data (progress and final results)
+   // Handle stdout data (final result in JSON format)
    pythonProcess.stdout.on('data', data => {
-       pythonOutput += data.toString(); // Accumulate data
+       pythonOutput += data.toString(); // Accumulate data from stdout
+   });

-       const lines = pythonOutput.split('\n'); // Handle multi-line JSON
+   // Handle stderr (progress logs or errors)
+   pythonProcess.stderr.on('data', data => {
+       stderrOutput += data.toString();
+       const lines = stderrOutput.split('\n');
        lines.forEach(line => {
            if (line.trim()) {
                try {
-                   const parsedOutput = JSON.parse(line); // Parse each line of JSON
+                   // Progress and warnings are printed as JSON to stderr
+                   const parsedOutput = JSON.parse(line);

+                   // Handle progress updates
                    if (parsedOutput.job_id && parsedOutput.progress !== undefined) {
                        jobProgress[parsedOutput.job_id] = {
                            step: parsedOutput.step,
                            progress: parsedOutput.progress,
                        };
-                   } else if (parsedOutput.chunks) {
-                       jobResults[parsedOutput.job_id] = parsedOutput;
-                       jobProgress[parsedOutput.job_id] = { step: 'Complete', progress: 100 };
+                   } else if (parsedOutput.progress !== undefined) {
+                       jobProgress[jobId] = {
+                           step: parsedOutput.step,
+                           progress: parsedOutput.progress,
+                       };
                    }
                } catch (err) {
-                   console.error('Error parsing Python output:', err);
+                   console.error('Progress log from Python:', line);
                }
            }
        });
    });

-   // Handle stderr (error logging)
-   pythonProcess.stderr.on('data', data => {
-       console.error(`Python script error: ${data}`);
-   });
-
    // Handle process exit
    pythonProcess.on('close', code => {
-       if (code !== 0) {
+       if (code === 0) {
+           // Parse final JSON output (stdout)
+           try {
+               const finalResult = JSON.parse(pythonOutput); // Parse JSON from stdout
+               jobResults[jobId] = finalResult;
+               jobProgress[jobId] = { step: 'Complete', progress: 100 };
+           } catch (err) {
+               console.error('Error parsing final JSON result:', err);
+           }
+       } else {
            console.error(`Python process exited with code ${code}`);
-           console.error(`Command: python3 ${path.join(__dirname, '../chunker/pdf_chunker.py')} ${jobId} ${file_name}`);
            jobResults[jobId] = { error: 'Python process failed' };
        }
    });
diff --git a/src/server/chunker/pdf_chunker.py b/src/server/chunker/pdf_chunker.py
index c9f6737e7..12e71c29d 100644
--- a/src/server/chunker/pdf_chunker.py
+++ b/src/server/chunker/pdf_chunker.py
@@ -26,6 +26,12 @@ import numpy as np
 from PyPDF2 import PdfReader # PDF text extraction
 from openai import OpenAI # OpenAI client for text completion
 from sklearn.cluster import KMeans # Clustering for summarization
+import warnings
+
+# Silence specific warnings
+warnings.filterwarnings('ignore', message="Valid config keys have changed")
+warnings.filterwarnings('ignore', message="torch.load")
+

 dotenv.load_dotenv() # Load environment variables

@@ -36,7 +42,6 @@ if parse(PIL.__version__) >= parse('10.0.0'):
 # Global dictionary to track progress of document processing jobs
 current_progress = {}

-
 def update_progress(job_id, step, progress_value):
     """
     Output the progress in JSON format to stdout for the Node.js process to capture.
@@ -46,15 +51,8 @@ def update_progress(job_id, step, progress_value):
         "step": step,
         "progress": progress_value
     }
-    print(json.dumps(progress_data)) # Output progress to stdout
-    sys.stdout.flush() # Ensure it's sent immediately
-
-
-def get_current_progress():
-    """
-    Return the current progress of all jobs.
-    """
-    return current_progress
+    print(json.dumps(progress_data), file=sys.stderr) # Use stderr for progress logs
+    sys.stderr.flush() # Ensure it's sent immediately


 class ElementExtractor:
@@ -698,25 +696,13 @@ def process_document(file_data, file_name, job_id):
     return new_document.to_json()


-def print_progress(job_id, step, progress_value):
-    """
-    Output the progress in JSON format to stdout for the Node.js process to capture.
-    """
-    progress_data = {
-        "job_id": job_id,
-        "step": step,
-        "progress": progress_value
-    }
-    print(json.dumps(progress_data)) # Output progress to stdout
-    sys.stdout.flush() # Ensure it's sent immediately
-
 def main():
     """
     Main entry point for the script, called with arguments from Node.js.
     """
     if len(sys.argv) != 4:
-        print(json.dumps({"error": "Invalid arguments"}))
+        print(json.dumps({"error": "Invalid arguments"}), file=sys.stderr)
         return

     job_id = sys.argv[1]
@@ -730,14 +716,14 @@ def main():
         # Process the document
         document_result = process_document(file_bytes, file_name, job_id)

-        # Output the final result as JSON
+        # Output the final result as JSON to stdout
         print(document_result)
         sys.stdout.flush()

     except Exception as e:
-        # If any error occurs, print the error to stdout for Node.js to capture
-        print(json.dumps({"error": str(e)}))
-        sys.stdout.flush()
+        # Print errors to stderr so they don't interfere with JSON output
+        print(json.dumps({"error": str(e)}), file=sys.stderr)
+        sys.stderr.flush()


 if __name__ == "__main__":
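
Note on the contract this change establishes between AssistantManager.ts and pdf_chunker.py: progress updates and errors are emitted as one JSON object per line on stderr, while the final document result is the only data written to stdout, which the Node side parses after the process closes. The sketch below is a minimal, hypothetical illustration of that contract, not repository code; the helper names emit_progress and run_job are invented here and stand in for update_progress() and process_document().

import json
import sys

def emit_progress(job_id, step, progress_value):
    # One JSON object per line on stderr, mirroring update_progress() in pdf_chunker.py
    print(json.dumps({"job_id": job_id, "step": step, "progress": progress_value}), file=sys.stderr)
    sys.stderr.flush()

def run_job(job_id):
    # Stand-in for process_document(): report progress on stderr, then emit the
    # final result as the only JSON payload on stdout so the Node process can
    # JSON.parse the accumulated stdout once the child exits with code 0
    emit_progress(job_id, "Extracting Text", 50)
    emit_progress(job_id, "Summarizing Chunks", 90)
    print(json.dumps({"job_id": job_id, "chunks": []}))
    sys.stdout.flush()

if __name__ == "__main__":
    run_job(sys.argv[1] if len(sys.argv) > 1 else "demo-job")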