Diffstat (limited to 'src/server')
 -rw-r--r--  src/server/ApiManagers/AssistantManager.ts | 157
 -rw-r--r--  src/server/ApiManagers/DataVizManager.ts   |   2
 -rw-r--r--  src/server/DashUploadUtils.ts              |  15
 -rw-r--r--  src/server/chunker/pdf_chunker.py          |  70
 -rw-r--r--  src/server/index.ts                        |   3
5 files changed, 150 insertions, 97 deletions
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts
index 8447a4934..4d2068014 100644
--- a/src/server/ApiManagers/AssistantManager.ts
+++ b/src/server/ApiManagers/AssistantManager.ts
@@ -9,7 +9,7 @@
  */
 import { Readability } from '@mozilla/readability';
-import axios from 'axios';
+import axios, { AxiosResponse } from 'axios';
 import { spawn } from 'child_process';
 import * as fs from 'fs';
 import { writeFile } from 'fs';
@@ -23,6 +23,7 @@ import { AI_Document } from '../../client/views/nodes/chatbot/types/types';
 import { Method } from '../RouteManager';
 import { filesDirectory, publicDirectory } from '../SocketData';
 import ApiManager, { Registration } from './ApiManager';
+import { getServerPath } from '../../client/util/reportManager/reportManagerUtils';
 
 // Enumeration of directories where different file types are stored
 export enum Directory {
@@ -115,29 +116,79 @@ export default class AssistantManager extends ApiManager {
             },
         });
 
-        // Register Google Web Search Results API route
         register({
             method: Method.POST,
             subscription: '/getWebSearchResults',
             secureHandler: async ({ req, res }) => {
                 const { query, max_results } = req.body;
-                try {
-                    // Fetch search results using Google Custom Search API
-                    const response = await customsearch.cse.list({
+                const MIN_VALID_RESULTS_RATIO = 0.75; // 3/4 threshold
+                let startIndex = 1; // Start at the first result initially
+                let validResults: any[] = [];
+
+                const fetchSearchResults = async (start: number) => {
+                    return customsearch.cse.list({
                         q: query,
                         cx: process.env._CLIENT_GOOGLE_SEARCH_ENGINE_ID,
                         key: process.env._CLIENT_GOOGLE_API_KEY,
                         safe: 'active',
                         num: max_results,
+                        start, // This controls which result index the search starts from
                     });
+                };
+
+                const filterResultsByXFrameOptions = async (results: any[]) => {
+                    const filteredResults = await Promise.all(
+                        results.map(async result => {
+                            try {
+                                const urlResponse: AxiosResponse = await axios.head(result.url, { timeout: 5000 });
+                                const xFrameOptions = urlResponse.headers['x-frame-options'];
+                                if (xFrameOptions && xFrameOptions.toUpperCase() === 'SAMEORIGIN') {
+                                    return result;
+                                }
+                            } catch (error) {
+                                console.error(`Error checking x-frame-options for URL: ${result.url}`, error);
+                            }
+                            return null; // Exclude the result if it doesn't match
+                        })
+                    );
+                    return filteredResults.filter(result => result !== null); // Remove null results
+                };
 
-                    const results =
+                try {
+                    // Fetch initial search results
+                    let response = await fetchSearchResults(startIndex);
+                    let initialResults =
                         response.data.items?.map(item => ({
                             url: item.link,
                             snippet: item.snippet,
                         })) || [];
 
-                    res.send({ results });
+                    // Filter the initial results
+                    validResults = await filterResultsByXFrameOptions(initialResults);
+
+                    // If valid results are less than 3/4 of max_results, fetch more results
+                    while (validResults.length < max_results * MIN_VALID_RESULTS_RATIO) {
+                        // Increment the start index by the max_results to fetch the next set of results
+                        startIndex += max_results;
+                        response = await fetchSearchResults(startIndex);
+
+                        const additionalResults =
+                            response.data.items?.map(item => ({
+                                url: item.link,
+                                snippet: item.snippet,
+                            })) || [];
+
+                        const additionalValidResults = await filterResultsByXFrameOptions(additionalResults);
+                        validResults = [...validResults, ...additionalValidResults]; // Combine valid results
+
+                        // Break if no more results are available
+                        if (additionalValidResults.length === 0 || response.data.items?.length === 0) {
+                            break;
+                        }
+                    }
+
+                    // Return the filtered valid results
+                    res.send({ results: validResults.slice(0, max_results) }); // Limit the results to max_results
                 } catch (error) {
                     console.error('Error performing web search:', error);
                     res.status(500).send({
@@ -299,47 +350,16 @@ export default class AssistantManager extends ApiManager {
             method: Method.GET,
             subscription: '/getResult/:jobId',
             secureHandler: async ({ req, res }) => {
-                const { jobId } = req.params; // Get the job ID from the URL parameters
-                // Check if the job result is available
+                const { jobId } = req.params;
                 if (jobResults[jobId]) {
                     const result = jobResults[jobId] as AI_Document & { status: string };
-                    // If the result contains image or table chunks, save the base64 data as image files
                     if (result.chunks && Array.isArray(result.chunks)) {
-                        await Promise.all(
-                            result.chunks.map(chunk => {
-                                if (chunk.metadata && (chunk.metadata.type === 'image' || chunk.metadata.type === 'table')) {
-                                    const files_directory = '/files/chunk_images/';
-                                    const directory = path.join(publicDirectory, files_directory);
-
-                                    // Ensure the directory exists or create it
-                                    if (!fs.existsSync(directory)) {
-                                        fs.mkdirSync(directory);
-                                    }
-
-                                    const fileName = path.basename(chunk.metadata.file_path); // Get the file name from the path
-                                    const filePath = path.join(directory, fileName); // Create the full file path
-
-                                    // Check if the chunk contains base64 encoded data
-                                    if (chunk.metadata.base64_data) {
-                                        // Decode the base64 data and write it to a file
-                                        const buffer = Buffer.from(chunk.metadata.base64_data, 'base64');
-                                        fs.promises.writeFile(filePath, buffer).then(() => {
-                                            // Update the file path in the chunk's metadata
-                                            chunk.metadata.file_path = path.join(files_directory, fileName);
-                                            chunk.metadata.base64_data = undefined; // Remove the base64 data from the metadata
-                                        });
-                                    } else {
-                                        console.warn(`No base64_data found for chunk: ${fileName}`);
-                                    }
-                                }
-                            })
-                        );
                         result.status = 'completed';
                     } else {
                         result.status = 'pending';
                     }
-                    res.json(result); // Send the result back to the client
+                    res.json(result);
                 } else {
                     res.status(202).send({ status: 'pending' });
                 }
@@ -367,7 +387,7 @@
                     // If the chunk is an image or table, read the corresponding file and encode it as base64
                     if (chunk.metadata.type === 'image' || chunk.metadata.type === 'table') {
                         try {
-                            const filePath = serverPathToFile(Directory.chunk_images, chunk.metadata.file_path); // Get the file path
+                            const filePath = path.join(pathToDirectory(Directory.chunk_images), chunk.metadata.file_path); // Get the file path
                             readFileAsync(filePath).then(imageBuffer => {
                                 const base64Image = imageBuffer.toString('base64'); // Convert the image to base64
@@ -445,10 +465,12 @@ function spawnPythonProcess(jobId: string, file_name: string, file_data: string)
     const requirementsPath = path.join(__dirname, '../chunker/requirements.txt');
     const pythonScriptPath = path.join(__dirname, '../chunker/pdf_chunker.py');
 
+    const outputDirectory = pathToDirectory(Directory.chunk_images);
+
     function runPythonScript() {
         const pythonPath = process.platform === 'win32' ? path.join(venvPath, 'Scripts', 'python') : path.join(venvPath, 'bin', 'python3');
 
-        const pythonProcess = spawn(pythonPath, [pythonScriptPath, jobId, file_name, file_data]);
+        const pythonProcess = spawn(pythonPath, [pythonScriptPath, jobId, file_name, file_data, outputDirectory]);
 
         let pythonOutput = '';
         let stderrOutput = '';
@@ -460,23 +482,30 @@ function spawnPythonProcess(jobId: string, file_name: string, file_data: string)
         pythonProcess.stderr.on('data', data => {
             stderrOutput += data.toString();
             const lines = stderrOutput.split('\n');
+            stderrOutput = lines.pop() || ''; // Save the last partial line back to stderrOutput
             lines.forEach(line => {
                 if (line.trim()) {
-                    try {
-                        const parsedOutput = JSON.parse(line);
-                        if (parsedOutput.job_id && parsedOutput.progress !== undefined) {
-                            jobProgress[parsedOutput.job_id] = {
-                                step: parsedOutput.step,
-                                progress: parsedOutput.progress,
-                            };
-                        } else if (parsedOutput.progress !== undefined) {
-                            jobProgress[jobId] = {
-                                step: parsedOutput.step,
-                                progress: parsedOutput.progress,
+                    if (line.startsWith('PROGRESS:')) {
+                        const jsonString = line.substring('PROGRESS:'.length);
+                        try {
+                            const parsedOutput = JSON.parse(jsonString);
+                            if (parsedOutput.job_id && parsedOutput.progress !== undefined) {
+                                jobProgress[parsedOutput.job_id] = {
+                                    step: parsedOutput.step,
+                                    progress: parsedOutput.progress,
+                                };
+                            } else if (parsedOutput.progress !== undefined) {
+                                jobProgress[jobId] = {
+                                    step: parsedOutput.step,
+                                    progress: parsedOutput.progress,
+                                };
+                            }
+                        } catch (err) {
+                            console.error('Error parsing progress JSON:', jsonString, err);
                         }
-                    } catch (err) {
-                        console.error('Progress log from Python:', line, err);
+                    } else {
+                        // Log other stderr output
+                        console.error('Python stderr:', line);
                     }
                 }
             });
@@ -490,10 +519,24 @@ function spawnPythonProcess(jobId: string, file_name: string, file_data: string)
                 jobProgress[jobId] = { step: 'Complete', progress: 100 };
             } catch (err) {
                 console.error('Error parsing final JSON result:', err);
+                jobResults[jobId] = { error: 'Failed to parse final result' };
             }
         } else {
            console.error(`Python process exited with code ${code}`);
-            jobResults[jobId] = { error: 'Python process failed' };
+            // Check if there was an error message in stderr
+            if (stderrOutput) {
+                // Try to parse the last line as JSON
+                const lines = stderrOutput.trim().split('\n');
+                const lastLine = lines[lines.length - 1];
+                try {
+                    const errorOutput = JSON.parse(lastLine);
+                    jobResults[jobId] = errorOutput;
+                } catch (err) {
+                    jobResults[jobId] = { error: 'Python process failed' };
+                }
+            } else {
+                jobResults[jobId] = { error: 'Python process failed' };
+            }
         }
     });
 }
diff --git a/src/server/ApiManagers/DataVizManager.ts b/src/server/ApiManagers/DataVizManager.ts
index 88f22992d..d2028f23b 100644
--- a/src/server/ApiManagers/DataVizManager.ts
+++ b/src/server/ApiManagers/DataVizManager.ts
@@ -9,7 +9,7 @@ export default class DataVizManager extends ApiManager {
         register({
             method: Method.GET,
             subscription: '/csvData',
-            secureHandler: async ({ req, res }) => {
+            secureHandler: ({ req, res }) => {
                 const uri = req.query.uri as string;
 
                 return new Promise<void>(resolve => {
diff --git a/src/server/DashUploadUtils.ts b/src/server/DashUploadUtils.ts
index 1e55a885a..351351ca5 100644
--- a/src/server/DashUploadUtils.ts
+++ b/src/server/DashUploadUtils.ts
@@ -1,3 +1,4 @@
+/* eslint-disable no-use-before-define */
 import axios from 'axios';
 import { exec, spawn } from 'child_process';
 import { green, red } from 'colors';
@@ -47,7 +48,8 @@ if (isMainThread) {
     async function workerResampleImage(message: { imgSourcePath: string; outputPath: string; origSuffix: string; unlinkSource: boolean }) {
         const { imgSourcePath, outputPath, origSuffix, unlinkSource } = message;
-        const sizes = !origSuffix ? [{ width: 400, suffix: SizeSuffix.Medium }] : DashUploadUtils.imageResampleSizes(path.extname(imgSourcePath));
+        const extension = path.extname(imgSourcePath);
+        const sizes = !origSuffix ? [{ width: 400, suffix: SizeSuffix.Medium }] : DashUploadUtils.imageResampleSizes(extension === '.xml' ? '.png' : extension);
         // prettier-ignore
         Jimp.read(imgSourcePath)
             .then(img =>
@@ -60,7 +62,7 @@ if (isMainThread) {
     }
 }
 
-// eslint-disable-next-line @typescript-eslint/no-var-requires
+// eslint-disable-next-line @typescript-eslint/no-require-imports
 const requestImageSize = require('../client/util/request-image-size');
 
 export enum SizeSuffix {
@@ -369,7 +371,8 @@ export namespace DashUploadUtils {
      */
     export const UploadInspectedImage = async (metadata: Upload.InspectionResults, filename: string, prefix = '', cleanUp = true): Promise<Upload.ImageInformation> => {
         const { requestable, source, ...remaining } = metadata;
-        const resolved = filename || `${prefix}upload_${Utils.GenerateGuid()}.${remaining.contentType.split('/')[1].toLowerCase()}`;
+        const dfltSuffix = remaining.contentType.split('/')[1].toLowerCase();
+        const resolved = filename || `${prefix}upload_${Utils.GenerateGuid()}.${dfltSuffix === 'xml' ? 'jpg' : dfltSuffix}`;
         const { images } = Directory;
         const information: Upload.ImageInformation = {
             accessPaths: {
@@ -400,10 +403,10 @@ export namespace DashUploadUtils {
                 writtenFiles = {};
             }
         } else {
-            const unlinkSrcWhenFinished = isLocal().test(source) && cleanUp;
+            const unlinkSrcWhenFinished = cleanUp; // isLocal().test(source) && cleanUp;
             try {
                 writtenFiles = await outputResizedImages(metadata.source, resolved, unlinkSrcWhenFinished);
-            } catch (e) {
+            } catch {
                 // input is a blob or other, try reading it to create a metadata source file.
                 const reqSource = request(metadata.source);
                 const readStream: Stream = reqSource instanceof Promise ? await reqSource : reqSource;
@@ -415,7 +418,7 @@ export namespace DashUploadUtils {
                         .on('error', () => rej());
                 });
                 writtenFiles = await outputResizedImages(readSource, resolved, unlinkSrcWhenFinished);
-                fs.unlink(readSource, err => console.log("Couldn't unlink temporary image file:" + readSource, err));
+                //fs.unlink(readSource, err => console.log("Couldn't unlink temporary image file:" + readSource, err));
             }
         }
 
         Array.from(Object.keys(writtenFiles)).forEach(suffix => {
diff --git a/src/server/chunker/pdf_chunker.py b/src/server/chunker/pdf_chunker.py
index 4fe3b9dbf..48b2dbf97 100644
--- a/src/server/chunker/pdf_chunker.py
+++ b/src/server/chunker/pdf_chunker.py
@@ -54,8 +54,9 @@ def update_progress(job_id, step, progress_value):
         "step": step,
         "progress": progress_value
     }
-    print(json.dumps(progress_data), file=sys.stderr)  # Use stderr for progress logs
-    sys.stderr.flush()  # Ensure it's sent immediately
+    print(f"PROGRESS:{json.dumps(progress_data)}", file=sys.stderr)
+    sys.stderr.flush()
+
 
 
 class ElementExtractor:
@@ -63,13 +64,15 @@ class ElementExtractor:
     A class that uses a YOLO model to extract tables and images from a PDF page.
     """
 
-    def __init__(self, output_folder: str):
+    def __init__(self, output_folder: str, doc_id: str):
         """
         Initializes the ElementExtractor with the output folder for saving images and the YOLO model.
 
         :param output_folder: Path to the folder where extracted elements will be saved.
         """
-        self.output_folder = output_folder
+        self.doc_id = doc_id
+        self.output_folder = os.path.join(output_folder, doc_id)
+        os.makedirs(self.output_folder, exist_ok=True)
         self.model = YOLO('keremberke/yolov8m-table-extraction')  # Load YOLO model for table extraction
         self.model.overrides['conf'] = 0.25  # Set confidence threshold for detection
         self.model.overrides['iou'] = 0.45  # Set Intersection over Union (IoU) threshold
@@ -116,17 +119,16 @@ class ElementExtractor:
                 table_path = os.path.join(self.output_folder, table_filename)
                 page_with_outline.save(table_path)
 
-                # Convert the full-page image with red outline to base64
-                base64_data = self.image_to_base64(page_with_outline)
+                file_path_for_client = f"{self.doc_id}/{table_filename}"
 
                 tables.append({
                     'metadata': {
                         "type": "table",
                         "location": [x1 / img.width, y1 / img.height, x2 / img.width, y2 / img.height],
-                        "file_path": table_path,
+                        "file_path": file_path_for_client,
                         "start_page": page_num,
                         "end_page": page_num,
-                        "base64_data": base64_data,
+                        "base64_data": self.image_to_base64(page_with_outline)
                     }
                 })
@@ -175,18 +177,17 @@ class ElementExtractor:
                 image_path = os.path.join(self.output_folder, image_filename)
                 page_with_outline.save(image_path)
 
-                # Convert the full-page image with red outline to base64
-                base64_data = self.image_to_base64(page_with_outline)
+                file_path_for_client = f"{self.doc_id}/{image_filename}"
 
                 images.append({
                     'metadata': {
                         "type": "image",
                         "location": [x1 / page.rect.width, y1 / page.rect.height, x2 / page.rect.width, y2 / page.rect.height],
-                        "file_path": image_path,
+                        "file_path": file_path_for_client,
                         "start_page": page_num,
                         "end_page": page_num,
-                        "base64_data": base64_data,
+                        "base64_data": self.image_to_base64(image)
                     }
                 })
@@ -268,7 +269,7 @@ class PDFChunker:
     The main class responsible for chunking PDF files into text and visual elements (tables/images).
     """
 
-    def __init__(self, output_folder: str = "output", image_batch_size: int = 5) -> None:
+    def __init__(self, output_folder: str = "output", doc_id: str = '', image_batch_size: int = 5) -> None:
         """
         Initializes the PDFChunker with an output folder and an element extractor for visual elements.
 
@@ -278,7 +279,8 @@ class PDFChunker:
         """
         self.client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))  # Initialize the Anthropic API client
         self.output_folder = output_folder
         self.image_batch_size = image_batch_size  # Batch size for image processing
-        self.element_extractor = ElementExtractor(output_folder)  # Initialize the element extractor
+        self.doc_id = doc_id  # Add doc_id
+        self.element_extractor = ElementExtractor(output_folder, doc_id)
 
     async def chunk_pdf(self, file_data: bytes, file_name: str, doc_id: str, job_id: str) -> List[Dict[str, Any]]:
         """
@@ -363,6 +365,7 @@ class PDFChunker:
                    for j, elem in enumerate(batch, start=1):
                         if j in summaries:
                             elem['metadata']['text'] = re.sub(r'^(Image|Table):\s*', '', summaries[j])
+                            elem['metadata']['base64_data'] = ''
                             processed_elements.append(elem)
 
                 progress = ((i // image_batch_size) + 1) / total_batches * 100  # Calculate progress
@@ -628,10 +631,11 @@ class PDFChunker:
 
             return summaries
 
-        except Exception:
-            #print(f"Error in batch_summarize_images: {str(e)}")
-            #print("Returning placeholder summaries")
-            return {number: "Error: No summary available" for number in images}
+        except Exception as e:
+            # Print errors to stderr so they don't interfere with JSON output
+            print(json.dumps({"error": str(e)}), file=sys.stderr)
+            sys.stderr.flush()
+
 
 class DocumentType(Enum):
     """
@@ -664,7 +668,7 @@ class Document:
     Represents a document being processed, such as a PDF, handling chunking, embedding, and summarization.
     """
 
-    def __init__(self, file_data: bytes, file_name: str, job_id: str):
+    def __init__(self, file_data: bytes, file_name: str, job_id: str, output_folder: str):
         """
         Initialize the Document with file data, file name, and job ID.
 
@@ -672,6 +676,7 @@ class Document:
         :param file_name: The name of the file being processed.
         :param job_id: The job ID associated with this document processing task.
         """
+        self.output_folder = output_folder
         self.file_data = file_data
         self.file_name = file_name
         self.job_id = job_id
@@ -680,14 +685,13 @@ class Document:
         self.chunks = []  # List to hold text and visual chunks
         self.num_pages = 0  # Number of pages in the document (if applicable)
         self.summary = ""  # The generated summary for the document
-        self._process()  # Start processing the document
 
     def _process(self):
         """
         Process the document: extract chunks, embed them, and generate a summary.
         """
-        pdf_chunker = PDFChunker(output_folder="output")  # Initialize the PDF chunker
+        pdf_chunker = PDFChunker(output_folder=self.output_folder, doc_id=self.doc_id)  # Initialize PDFChunker
         self.chunks = asyncio.run(pdf_chunker.chunk_pdf(self.file_data, self.file_name, self.doc_id, self.job_id))  # Extract chunks
         self.num_pages = self._get_pdf_pages()  # Get the number of pages in the document
@@ -796,8 +800,7 @@ class Document:
             "doc_id": self.doc_id
         }, indent=2)  # Convert the document's attributes to JSON format
 
-
-def process_document(file_data, file_name, job_id):
+def process_document(file_data, file_name, job_id, output_folder):
     """
     Top-level function to process a document and return the JSON output.
 
@@ -806,28 +809,30 @@ def process_document(file_data, file_name, job_id):
     :param job_id: The job ID for this document processing task.
     :return: The processed document's data in JSON format.
     """
-    new_document = Document(file_data, file_name, job_id)  # Create a new Document object
-    return new_document.to_json()  # Return the document's JSON data
-
+    new_document = Document(file_data, file_name, job_id, output_folder)
+    return new_document.to_json()
 
 def main():
     """
     Main entry point for the script, called with arguments from Node.js.
     """
-    if len(sys.argv) != 4:
-        print(json.dumps({"error": "Invalid arguments"}), file=sys.stderr)  # Print error if incorrect number of arguments
+    if len(sys.argv) != 5:
+        print(json.dumps({"error": "Invalid arguments"}), file=sys.stderr)
         return
 
-    job_id = sys.argv[1]  # Get the job ID from command-line arguments
-    file_name = sys.argv[2]  # Get the file name from command-line arguments
-    file_data = sys.argv[3]  # Get the base64-encoded file data from command-line arguments
+    job_id = sys.argv[1]
+    file_name = sys.argv[2]
+    file_data = sys.argv[3]
+    output_folder = sys.argv[4]  # Get the output folder from arguments
 
     try:
+        os.makedirs(output_folder, exist_ok=True)
+
         # Decode the base64 file data
         file_bytes = base64.b64decode(file_data)
 
         # Process the document
-        document_result = process_document(file_bytes, file_name, job_id)
+        document_result = process_document(file_bytes, file_name, job_id, output_folder)  # Pass output_folder
 
         # Output the final result as JSON to stdout
         print(document_result)
@@ -839,5 +844,6 @@ def main():
         sys.stderr.flush()
 
+
 if __name__ == "__main__":
     main()  # Execute the main function when the script is run
diff --git a/src/server/index.ts b/src/server/index.ts
index 88dbd232d..1f9af9ee0 100644
--- a/src/server/index.ts
+++ b/src/server/index.ts
@@ -7,6 +7,7 @@ import AssistantManager from './ApiManagers/AssistantManager';
 import DataVizManager from './ApiManagers/DataVizManager';
 import DeleteManager from './ApiManagers/DeleteManager';
 import DownloadManager from './ApiManagers/DownloadManager';
+import FireflyManager from './ApiManagers/FireflyManager';
 import GeneralGoogleManager from './ApiManagers/GeneralGoogleManager';
 import SessionManager from './ApiManagers/SessionManager';
 import UploadManager from './ApiManagers/UploadManager';
@@ -71,6 +72,7 @@ function routeSetter({ addSupervisedRoute, logRegistrationOutcome }: RouteManage
         new GeneralGoogleManager(),
         /* new GooglePhotosManager(), */
         new DataVizManager(),
         new AssistantManager(),
+        new FireflyManager(),
     ];
 
     // initialize API Managers
@@ -112,7 +114,6 @@ function routeSetter({ addSupervisedRoute, logRegistrationOutcome }: RouteManage
     });
 
     const serve: PublicHandler = ({ req, res }) => {
-        // eslint-disable-next-line new-cap
         const detector = new mobileDetect(req.headers['user-agent'] || '');
         const filename = detector.mobile() !== null ? 'mobile/image.html' : 'index.html';
         res.sendFile(path.join(__dirname, '../../deploy/' + filename));
