before changing the get result endpoint

author: A.J. Shulman <Shulman.aj@gmail.com> 2024-10-30 14:17:03 -0400
committer: A.J. Shulman <Shulman.aj@gmail.com> 2024-10-30 14:17:03 -0400
commit: a99b38e4cdc4ec995cf2d56e94980987d6f31cbb (patch)
tree: 8a5cc63224da68ef9d953cdecf22b16ab9efd289
parent: e8b724c22bed4b6ed01e34ba661228c348f50378 (diff)
2 files changed, 41 insertions, 31 deletions
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts
index d7b72bac7..cfa95cb4e 100644
--- a/src/server/ApiManagers/AssistantManager.ts
+++ b/src/server/ApiManagers/AssistantManager.ts
@@ -495,10 +495,12 @@ function spawnPythonProcess(jobId: string, file_name: string, file_data: string)
     const requirementsPath = path.join(__dirname, '../chunker/requirements.txt');
     const pythonScriptPath = path.join(__dirname, '../chunker/pdf_chunker.py');
 
+    const outputDirectory = pathToDirectory(Directory.chunk_images);
+
     function runPythonScript() {
         const pythonPath = process.platform === 'win32' ? path.join(venvPath, 'Scripts', 'python') : path.join(venvPath, 'bin', 'python3');
 
-        const pythonProcess = spawn(pythonPath, [pythonScriptPath, jobId, file_name, file_data]);
+        const pythonProcess = spawn(pythonPath, [pythonScriptPath, jobId, file_name, file_data, outputDirectory]);
 
         let pythonOutput = '';
         let stderrOutput = '';
@@ -510,23 +512,30 @@ function spawnPythonProcess(jobId: string, file_name: string, file_data: string)
         pythonProcess.stderr.on('data', data => {
             stderrOutput += data.toString();
             const lines = stderrOutput.split('\n');
+            stderrOutput = lines.pop() || ''; // Save the last partial line back to stderrOutput
             lines.forEach(line => {
                 if (line.trim()) {
-                    try {
-                        const parsedOutput = JSON.parse(line);
-                        if (parsedOutput.job_id && parsedOutput.progress !== undefined) {
-                            jobProgress[parsedOutput.job_id] = {
-                                step: parsedOutput.step,
-                                progress: parsedOutput.progress,
-                            };
-                        } else if (parsedOutput.progress !== undefined) {
-                            jobProgress[jobId] = {
-                                step: parsedOutput.step,
-                                progress: parsedOutput.progress,
-                            };
+                    if (line.startsWith('PROGRESS:')) {
+                        const jsonString = line.substring('PROGRESS:'.length);
+                        try {
+                            const parsedOutput = JSON.parse(jsonString);
+                            if (parsedOutput.job_id && parsedOutput.progress !== undefined) {
+                                jobProgress[parsedOutput.job_id] = {
+                                    step: parsedOutput.step,
+                                    progress: parsedOutput.progress,
+                                };
+                            } else if (parsedOutput.progress !== undefined) {
+                                jobProgress[jobId] = {
+                                    step: parsedOutput.step,
+                                    progress: parsedOutput.progress,
+                                };
+                            }
+                        } catch (err) {
+                            console.error('Error parsing progress JSON:', jsonString, err);
                         }
-                    } catch (err) {
-                        console.error('Progress log from Python:', line, err);
+                    } else {
+                        // Log other stderr output
+                        console.error('Python stderr:', line);
                     }
                 }
             });
diff --git a/src/server/chunker/pdf_chunker.py b/src/server/chunker/pdf_chunker.py
index 4fe3b9dbf..7a3244fbc 100644
--- a/src/server/chunker/pdf_chunker.py
+++ b/src/server/chunker/pdf_chunker.py
@@ -54,8 +54,9 @@ def update_progress(job_id, step, progress_value):
         "step": step,
         "progress": progress_value
     }
-    print(json.dumps(progress_data), file=sys.stderr)  # Use stderr for progress logs
-    sys.stderr.flush()  # Ensure it's sent immediately
+    print(f"PROGRESS:{json.dumps(progress_data)}", file=sys.stderr)
+    sys.stderr.flush()
+
 
 
 class ElementExtractor:
@@ -664,7 +665,7 @@ class Document:
     Represents a document being processed, such as a PDF, handling chunking, embedding, and summarization.
     """
 
-    def __init__(self, file_data: bytes, file_name: str, job_id: str):
+    def __init__(self, file_data: bytes, file_name: str, job_id: str, output_folder: str):
         """
         Initialize the Document with file data, file name, and job ID.
 
@@ -672,6 +673,7 @@ class Document:
         :param file_name: The name of the file being processed.
         :param job_id: The job ID associated with this document processing task.
         """
+        self.output_folder = output_folder
         self.file_data = file_data
         self.file_name = file_name
         self.job_id = job_id
@@ -680,14 +682,13 @@ class Document:
         self.chunks = []  # List to hold text and visual chunks
         self.num_pages = 0  # Number of pages in the document (if applicable)
         self.summary = ""  # The generated summary for the document
-
         self._process()  # Start processing the document
 
     def _process(self):
         """
         Process the document: extract chunks, embed them, and generate a summary.
         """
-        pdf_chunker = PDFChunker(output_folder="output")  # Initialize the PDF chunker
+        pdf_chunker = PDFChunker(output_folder=self.output_folder)
         self.chunks = asyncio.run(pdf_chunker.chunk_pdf(self.file_data, self.file_name, self.doc_id, self.job_id))  # Extract chunks
 
         self.num_pages = self._get_pdf_pages()  # Get the number of pages in the document
@@ -796,8 +797,7 @@ class Document:
             "doc_id": self.doc_id
         }, indent=2)  # Convert the document's attributes to JSON format
 
-
-def process_document(file_data, file_name, job_id):
+def process_document(file_data, file_name, job_id, output_folder):
     """
     Top-level function to process a document and return the JSON output.
 
@@ -806,28 +806,28 @@ def process_document(file_data, file_name, job_id):
     :param job_id: The job ID for this document processing task.
     :return: The processed document's data in JSON format.
     """
-    new_document = Document(file_data, file_name, job_id)  # Create a new Document object
-    return new_document.to_json()  # Return the document's JSON data
-
+    new_document = Document(file_data, file_name, job_id, output_folder)
+    return new_document.to_json()
 
 def main():
     """
     Main entry point for the script, called with arguments from Node.js.
     """
-    if len(sys.argv) != 4:
-        print(json.dumps({"error": "Invalid arguments"}), file=sys.stderr)  # Print error if incorrect number of arguments
+    if len(sys.argv) != 5:
+        print(json.dumps({"error": "Invalid arguments"}), file=sys.stderr)
         return
 
-    job_id = sys.argv[1]  # Get the job ID from command-line arguments
-    file_name = sys.argv[2]  # Get the file name from command-line arguments
-    file_data = sys.argv[3]  # Get the base64-encoded file data from command-line arguments
+    job_id = sys.argv[1]
+    file_name = sys.argv[2]
+    file_data = sys.argv[3]
+    output_folder = sys.argv[4]  # Get the output folder from arguments
 
     try:
         # Decode the base64 file data
         file_bytes = base64.b64decode(file_data)
 
         # Process the document
-        document_result = process_document(file_bytes, file_name, job_id)
+        document_result = process_document(file_bytes, file_name, job_id, output_folder)  # Pass output_folder
 
         # Output the final result as JSON to stdout
         print(document_result)
@@ -839,5 +839,6 @@ def main():
         sys.stderr.flush()
 
 
+
 if __name__ == "__main__":
     main()  # Execute the main function when the script is run
author	A.J. Shulman <Shulman.aj@gmail.com>	2024-10-30 14:17:03 -0400
committer	A.J. Shulman <Shulman.aj@gmail.com>	2024-10-30 14:17:03 -0400
commit	a99b38e4cdc4ec995cf2d56e94980987d6f31cbb (patch)
tree	8a5cc63224da68ef9d953cdecf22b16ab9efd289
parent	e8b724c22bed4b6ed01e34ba661228c348f50378 (diff)