2 files changed, 41 insertions, 54 deletions
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts
index cfa95cb4e..4d2068014 100644
--- a/src/server/ApiManagers/AssistantManager.ts
+++ b/src/server/ApiManagers/AssistantManager.ts
@@ -23,6 +23,7 @@ import { AI_Document } from '../../client/views/nodes/chatbot/types/types';
 import { Method } from '../RouteManager';
 import { filesDirectory, publicDirectory } from '../SocketData';
 import ApiManager, { Registration } from './ApiManager';
+import { getServerPath } from '../../client/util/reportManager/reportManagerUtils';
 
 // Enumeration of directories where different file types are stored
 export enum Directory {
@@ -349,47 +350,16 @@ export default class AssistantManager extends ApiManager {
             method: Method.GET,
             subscription: '/getResult/:jobId',
             secureHandler: async ({ req, res }) => {
-                const { jobId } = req.params; // Get the job ID from the URL parameters
-                // Check if the job result is available
+                const { jobId } = req.params;
                 if (jobResults[jobId]) {
                     const result = jobResults[jobId] as AI_Document & { status: string };
 
-                    // If the result contains image or table chunks, save the base64 data as image files
                     if (result.chunks && Array.isArray(result.chunks)) {
-                        await Promise.all(
-                            result.chunks.map(chunk => {
-                                if (chunk.metadata && (chunk.metadata.type === 'image' || chunk.metadata.type === 'table')) {
-                                    const files_directory = '/files/chunk_images/';
-                                    const directory = path.join(publicDirectory, files_directory);
-
-                                    // Ensure the directory exists or create it
-                                    if (!fs.existsSync(directory)) {
-                                        fs.mkdirSync(directory);
-                                    }
-
-                                    const fileName = path.basename(chunk.metadata.file_path); // Get the file name from the path
-                                    const filePath = path.join(directory, fileName); // Create the full file path
-
-                                    // Check if the chunk contains base64 encoded data
-                                    if (chunk.metadata.base64_data) {
-                                        // Decode the base64 data and write it to a file
-                                        const buffer = Buffer.from(chunk.metadata.base64_data, 'base64');
-                                        fs.promises.writeFile(filePath, buffer).then(() => {
-                                            // Update the file path in the chunk's metadata
-                                            chunk.metadata.file_path = path.join(files_directory, fileName);
-                                            chunk.metadata.base64_data = undefined; // Remove the base64 data from the metadata
-                                        });
-                                    } else {
-                                        console.warn(`No base64_data found for chunk: ${fileName}`);
-                                    }
-                                }
-                            })
-                        );
                         result.status = 'completed';
                     } else {
                         result.status = 'pending';
                     }
-                    res.json(result); // Send the result back to the client
+                    res.json(result);
                 } else {
                     res.status(202).send({ status: 'pending' });
                 }
@@ -417,7 +387,7 @@ export default class AssistantManager extends ApiManager {
                         // If the chunk is an image or table, read the corresponding file and encode it as base64
                         if (chunk.metadata.type === 'image' || chunk.metadata.type === 'table') {
                             try {
-                                const filePath = serverPathToFile(Directory.chunk_images, chunk.metadata.file_path); // Get the file path
+                                const filePath = path.join(pathToDirectory(Directory.chunk_images), chunk.metadata.file_path); // Get the file path
                                 readFileAsync(filePath).then(imageBuffer => {
                                     const base64Image = imageBuffer.toString('base64'); // Convert the image to base64
 
@@ -549,10 +519,24 @@ function spawnPythonProcess(jobId: string, file_name: string, file_data: string)
                     jobProgress[jobId] = { step: 'Complete', progress: 100 };
                 } catch (err) {
                     console.error('Error parsing final JSON result:', err);
+                    jobResults[jobId] = { error: 'Failed to parse final result' };
                 }
             } else {
                 console.error(`Python process exited with code ${code}`);
-                jobResults[jobId] = { error: 'Python process failed' };
+                // Check if there was an error message in stderr
+                if (stderrOutput) {
+                    // Try to parse the last line as JSON
+                    const lines = stderrOutput.trim().split('\n');
+                    const lastLine = lines[lines.length - 1];
+                    try {
+                        const errorOutput = JSON.parse(lastLine);
+                        jobResults[jobId] = errorOutput;
+                    } catch (err) {
+                        jobResults[jobId] = { error: 'Python process failed' };
+                    }
+                } else {
+                    jobResults[jobId] = { error: 'Python process failed' };
+                }
             }
         });
     }
diff --git a/src/server/chunker/pdf_chunker.py b/src/server/chunker/pdf_chunker.py
index 7a3244fbc..130987343 100644
--- a/src/server/chunker/pdf_chunker.py
+++ b/src/server/chunker/pdf_chunker.py
@@ -64,13 +64,15 @@ class ElementExtractor:
     A class that uses a YOLO model to extract tables and images from a PDF page.
     """
 
-    def __init__(self, output_folder: str):
+    def __init__(self, output_folder: str, doc_id: str):
         """
         Initializes the ElementExtractor with the output folder for saving images and the YOLO model.
 
         :param output_folder: Path to the folder where extracted elements will be saved.
         """
-        self.output_folder = output_folder
+        self.doc_id = doc_id
+        self.output_folder = os.path.join(output_folder, doc_id)
+        os.makedirs(self.output_folder, exist_ok=True)
         self.model = YOLO('keremberke/yolov8m-table-extraction')  # Load YOLO model for table extraction
         self.model.overrides['conf'] = 0.25  # Set confidence threshold for detection
         self.model.overrides['iou'] = 0.45  # Set Intersection over Union (IoU) threshold
@@ -114,20 +116,18 @@ class ElementExtractor:
 
             # Save the full page with the red outline
             table_filename = f"table_page{page_num + 1}_{idx + 1}.png"
+            file_path_for_client = f"{self.doc_id}/{table_filename}"
             table_path = os.path.join(self.output_folder, table_filename)
             page_with_outline.save(table_path)
 
-            # Convert the full-page image with red outline to base64
-            base64_data = self.image_to_base64(page_with_outline)
-
             tables.append({
                 'metadata': {
                     "type": "table",
                     "location": [x1 / img.width, y1 / img.height, x2 / img.width, y2 / img.height],
-                    "file_path": table_path,
+                    "file_path": file_path_for_client,
                     "start_page": page_num,
                     "end_page": page_num,
-                    "base64_data": base64_data,
+                    "base64_data": self.image_to_base64(page_with_outline)
                 }
             })
 
@@ -173,21 +173,19 @@ class ElementExtractor:
 
             # Save the full page with the red outline
             image_filename = f"image_page{page_num + 1}_{img_index + 1}.png"
+            file_path_for_client = f"{self.doc_id}/{image_filename}"
             image_path = os.path.join(self.output_folder, image_filename)
             page_with_outline.save(image_path)
 
-            # Convert the full-page image with red outline to base64
-            base64_data = self.image_to_base64(page_with_outline)
-
             images.append({
                 'metadata': {
                     "type": "image",
                     "location": [x1 / page.rect.width, y1 / page.rect.height, x2 / page.rect.width,
                                     y2 / page.rect.height],
-                    "file_path": image_path,
+                    "file_path": file_path_for_client,
                     "start_page": page_num,
                     "end_page": page_num,
-                    "base64_data": base64_data,
+                    "base64_data": self.image_to_base64(image)
                 }
             })
 
@@ -269,7 +267,7 @@ class PDFChunker:
     The main class responsible for chunking PDF files into text and visual elements (tables/images).
     """
 
-    def __init__(self, output_folder: str = "output", image_batch_size: int = 5) -> None:
+    def __init__(self, output_folder: str = "output", image_batch_size: int = 5, doc_id: str = '') -> None:  # doc_id is last so positional (output_folder, image_batch_size) callers keep their meaning
         """
         Initializes the PDFChunker with an output folder and an element extractor for visual elements.
 
@@ -279,7 +277,8 @@ class PDFChunker:
         self.client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))  # Initialize the Anthropic API client
         self.output_folder = output_folder
         self.image_batch_size = image_batch_size  # Batch size for image processing
-        self.element_extractor = ElementExtractor(output_folder)  # Initialize the element extractor
+        self.doc_id = doc_id  # Document ID; also passed to the element extractor below
+        self.element_extractor = ElementExtractor(output_folder, doc_id)  # Initialize the element extractor
 
     async def chunk_pdf(self, file_data: bytes, file_name: str, doc_id: str, job_id: str) -> List[Dict[str, Any]]:
         """
@@ -364,6 +363,7 @@ class PDFChunker:
                 for j, elem in enumerate(batch, start=1):
                     if j in summaries:
                         elem['metadata']['text'] = re.sub(r'^(Image|Table):\s*', '', summaries[j])
+                        elem['metadata']['base64_data'] = ''
                         processed_elements.append(elem)
 
                 progress = ((i // image_batch_size) + 1) / total_batches * 100  # Calculate progress
@@ -629,10 +629,11 @@ class PDFChunker:
 
             return summaries
 
-        except Exception:
-            #print(f"Error in batch_summarize_images: {str(e)}")
-            #print("Returning placeholder summaries")
-            return {number: "Error: No summary available" for number in images}
+        except Exception as e:
+            # Report the failure as one JSON line on stderr (keeps it out of the JSON output), then fall back to placeholder summaries so callers never receive None
+            print(json.dumps({"error": str(e)}), file=sys.stderr, flush=True)
+            return {number: "Error: No summary available" for number in images}
+
 
 class DocumentType(Enum):
     """
@@ -688,7 +689,7 @@ class Document:
         """
         Process the document: extract chunks, embed them, and generate a summary.
         """
-        pdf_chunker = PDFChunker(output_folder=self.output_folder)
+        pdf_chunker = PDFChunker(output_folder=self.output_folder, doc_id=self.doc_id)  # Initialize PDFChunker
         self.chunks = asyncio.run(pdf_chunker.chunk_pdf(self.file_data, self.file_name, self.doc_id, self.job_id))  # Extract chunks
 
         self.num_pages = self._get_pdf_pages()  # Get the number of pages in the document
@@ -823,6 +824,8 @@ def main():
     output_folder = sys.argv[4]  # Get the output folder from arguments
 
     try:
+        os.makedirs(output_folder, exist_ok=True)
+
         # Decode the base64 file data
         file_bytes = base64.b64decode(file_data)