Diffstat (limited to 'src/server/ApiManagers/AssistantManager.ts')
-rw-r--r-- | src/server/ApiManagers/AssistantManager.ts | 157 |
1 file changed, 100 insertions, 57 deletions
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts
index 8447a4934..4d2068014 100644
--- a/src/server/ApiManagers/AssistantManager.ts
+++ b/src/server/ApiManagers/AssistantManager.ts
@@ -9,7 +9,7 @@
  */
 
 import { Readability } from '@mozilla/readability';
-import axios from 'axios';
+import axios, { AxiosResponse } from 'axios';
 import { spawn } from 'child_process';
 import * as fs from 'fs';
 import { writeFile } from 'fs';
@@ -23,6 +23,7 @@ import { AI_Document } from '../../client/views/nodes/chatbot/types/types';
 import { Method } from '../RouteManager';
 import { filesDirectory, publicDirectory } from '../SocketData';
 import ApiManager, { Registration } from './ApiManager';
+import { getServerPath } from '../../client/util/reportManager/reportManagerUtils';
 
 // Enumeration of directories where different file types are stored
 export enum Directory {
@@ -115,29 +116,79 @@ export default class AssistantManager extends ApiManager {
             },
         });
 
-        // Register Google Web Search Results API route
         register({
             method: Method.POST,
             subscription: '/getWebSearchResults',
             secureHandler: async ({ req, res }) => {
                 const { query, max_results } = req.body;
 
-                try {
-                    // Fetch search results using Google Custom Search API
-                    const response = await customsearch.cse.list({
+                const MIN_VALID_RESULTS_RATIO = 0.75; // 3/4 threshold
+                let startIndex = 1; // Start at the first result initially
+                let validResults: any[] = [];
+
+                const fetchSearchResults = async (start: number) => {
+                    return customsearch.cse.list({
                         q: query,
                         cx: process.env._CLIENT_GOOGLE_SEARCH_ENGINE_ID,
                         key: process.env._CLIENT_GOOGLE_API_KEY,
                         safe: 'active',
                         num: max_results,
+                        start, // This controls which result index the search starts from
                     });
+                };
+
+                const filterResultsByXFrameOptions = async (results: any[]) => {
+                    const filteredResults = await Promise.all(
+                        results.map(async result => {
+                            try {
+                                const urlResponse: AxiosResponse = await axios.head(result.url, { timeout: 5000 });
+                                const xFrameOptions = urlResponse.headers['x-frame-options'];
+                                if (xFrameOptions && xFrameOptions.toUpperCase() === 'SAMEORIGIN') {
+                                    return result;
+                                }
+                            } catch (error) {
+                                console.error(`Error checking x-frame-options for URL: ${result.url}`, error);
+                            }
+                            return null; // Exclude the result if it doesn't match
+                        })
+                    );
+                    return filteredResults.filter(result => result !== null); // Remove null results
+                };
 
-                    const results =
+                try {
+                    // Fetch initial search results
+                    let response = await fetchSearchResults(startIndex);
+                    let initialResults =
                         response.data.items?.map(item => ({
                             url: item.link,
                             snippet: item.snippet,
                         })) || [];
 
-                    res.send({ results });
+                    // Filter the initial results
+                    validResults = await filterResultsByXFrameOptions(initialResults);
+
+                    // If valid results are less than 3/4 of max_results, fetch more results
+                    while (validResults.length < max_results * MIN_VALID_RESULTS_RATIO) {
+                        // Increment the start index by the max_results to fetch the next set of results
+                        startIndex += max_results;
+                        response = await fetchSearchResults(startIndex);
+
+                        const additionalResults =
+                            response.data.items?.map(item => ({
+                                url: item.link,
+                                snippet: item.snippet,
+                            })) || [];
+
+                        const additionalValidResults = await filterResultsByXFrameOptions(additionalResults);
+                        validResults = [...validResults, ...additionalValidResults]; // Combine valid results
+
+                        // Break if no more results are available
+                        if (additionalValidResults.length === 0 || response.data.items?.length === 0) {
+                            break;
+                        }
+                    }
+
+                    // Return the filtered valid results
+                    res.send({ results: validResults.slice(0, max_results) }); // Limit the results to max_results
                 } catch (error) {
                     console.error('Error performing web search:', error);
                     res.status(500).send({
@@ -299,47 +350,16 @@ export default class AssistantManager extends ApiManager {
             method: Method.GET,
             subscription: '/getResult/:jobId',
             secureHandler: async ({ req, res }) => {
-                const { jobId } = req.params; // Get the job ID from the URL parameters
-                // Check if the job result is available
+                const { jobId } = req.params;
 
                 if (jobResults[jobId]) {
                     const result = jobResults[jobId] as AI_Document & { status: string };
-                    // If the result contains image or table chunks, save the base64 data as image files
                     if (result.chunks && Array.isArray(result.chunks)) {
-                        await Promise.all(
-                            result.chunks.map(chunk => {
-                                if (chunk.metadata && (chunk.metadata.type === 'image' || chunk.metadata.type === 'table')) {
-                                    const files_directory = '/files/chunk_images/';
-                                    const directory = path.join(publicDirectory, files_directory);
-
-                                    // Ensure the directory exists or create it
-                                    if (!fs.existsSync(directory)) {
-                                        fs.mkdirSync(directory);
-                                    }
-
-                                    const fileName = path.basename(chunk.metadata.file_path); // Get the file name from the path
-                                    const filePath = path.join(directory, fileName); // Create the full file path
-
-                                    // Check if the chunk contains base64 encoded data
-                                    if (chunk.metadata.base64_data) {
-                                        // Decode the base64 data and write it to a file
-                                        const buffer = Buffer.from(chunk.metadata.base64_data, 'base64');
-                                        fs.promises.writeFile(filePath, buffer).then(() => {
-                                            // Update the file path in the chunk's metadata
-                                            chunk.metadata.file_path = path.join(files_directory, fileName);
-                                            chunk.metadata.base64_data = undefined; // Remove the base64 data from the metadata
-                                        });
-                                    } else {
-                                        console.warn(`No base64_data found for chunk: ${fileName}`);
-                                    }
-                                }
-                            })
-                        );
                         result.status = 'completed';
                     } else {
                         result.status = 'pending';
                     }
-                    res.json(result); // Send the result back to the client
+                    res.json(result);
                 } else {
                     res.status(202).send({ status: 'pending' });
                 }
@@ -367,7 +387,7 @@ export default class AssistantManager extends ApiManager {
                     // If the chunk is an image or table, read the corresponding file and encode it as base64
                    if (chunk.metadata.type === 'image' || chunk.metadata.type === 'table') {
                        try {
-                            const filePath = serverPathToFile(Directory.chunk_images, chunk.metadata.file_path); // Get the file path
+                            const filePath = path.join(pathToDirectory(Directory.chunk_images), chunk.metadata.file_path); // Get the file path
                             readFileAsync(filePath).then(imageBuffer => {
                                 const base64Image = imageBuffer.toString('base64'); // Convert the image to base64
 
@@ -445,10 +465,12 @@ function spawnPythonProcess(jobId: string, file_name: string, file_data: string
 
     const requirementsPath = path.join(__dirname, '../chunker/requirements.txt');
     const pythonScriptPath = path.join(__dirname, '../chunker/pdf_chunker.py');
+    const outputDirectory = pathToDirectory(Directory.chunk_images);
+
     function runPythonScript() {
         const pythonPath = process.platform === 'win32' ? path.join(venvPath, 'Scripts', 'python') : path.join(venvPath, 'bin', 'python3');
-        const pythonProcess = spawn(pythonPath, [pythonScriptPath, jobId, file_name, file_data]);
+        const pythonProcess = spawn(pythonPath, [pythonScriptPath, jobId, file_name, file_data, outputDirectory]);
 
         let pythonOutput = '';
         let stderrOutput = '';
 
@@ -460,23 +482,30 @@ function spawnPythonProcess(jobId: string, file_name: string, file_data: string
         pythonProcess.stderr.on('data', data => {
             stderrOutput += data.toString();
             const lines = stderrOutput.split('\n');
+            stderrOutput = lines.pop() || ''; // Save the last partial line back to stderrOutput
             lines.forEach(line => {
                 if (line.trim()) {
-                    try {
-                        const parsedOutput = JSON.parse(line);
-                        if (parsedOutput.job_id && parsedOutput.progress !== undefined) {
-                            jobProgress[parsedOutput.job_id] = {
-                                step: parsedOutput.step,
-                                progress: parsedOutput.progress,
-                            };
-                        } else if (parsedOutput.progress !== undefined) {
-                            jobProgress[jobId] = {
-                                step: parsedOutput.step,
-                                progress: parsedOutput.progress,
-                            };
+                    if (line.startsWith('PROGRESS:')) {
+                        const jsonString = line.substring('PROGRESS:'.length);
+                        try {
+                            const parsedOutput = JSON.parse(jsonString);
+                            if (parsedOutput.job_id && parsedOutput.progress !== undefined) {
+                                jobProgress[parsedOutput.job_id] = {
+                                    step: parsedOutput.step,
+                                    progress: parsedOutput.progress,
+                                };
+                            } else if (parsedOutput.progress !== undefined) {
+                                jobProgress[jobId] = {
+                                    step: parsedOutput.step,
+                                    progress: parsedOutput.progress,
+                                };
+                            }
+                        } catch (err) {
+                            console.error('Error parsing progress JSON:', jsonString, err);
                         }
-                    } catch (err) {
-                        console.error('Progress log from Python:', line, err);
+                    } else {
+                        // Log other stderr output
+                        console.error('Python stderr:', line);
                     }
                 }
             });
@@ -490,10 +519,24 @@ function spawnPythonProcess(jobId: string, file_name: string, file_data: string
                     jobProgress[jobId] = { step: 'Complete', progress: 100 };
                 } catch (err) {
                     console.error('Error parsing final JSON result:', err);
+                    jobResults[jobId] = { error: 'Failed to parse final result' };
                 }
             } else {
                 console.error(`Python process exited with code ${code}`);
-                jobResults[jobId] = { error: 'Python process failed' };
+                // Check if there was an error message in stderr
+                if (stderrOutput) {
+                    // Try to parse the last line as JSON
+                    const lines = stderrOutput.trim().split('\n');
+                    const lastLine = lines[lines.length - 1];
+                    try {
+                        const errorOutput = JSON.parse(lastLine);
+                        jobResults[jobId] = errorOutput;
+                    } catch (err) {
+                        jobResults[jobId] = { error: 'Python process failed' };
+                    }
+                } else {
+                    jobResults[jobId] = { error: 'Python process failed' };
+                }
             }
         });
     }
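
Note on the /getWebSearchResults change: the handler now probes each candidate URL with an axios HEAD request and keeps only results whose X-Frame-Options header is SAMEORIGIN, paging through further Custom Search results until roughly three quarters of max_results survive the filter. The sketch below isolates that probe as a standalone helper; the names probeXFrameOptions and keepSameOriginResults and the 5-second timeout default are illustrative, not code from this commit.

import axios from 'axios';

// Probe a URL with a HEAD request and report its X-Frame-Options header,
// upper-cased, or null when the request fails or the header is absent.
async function probeXFrameOptions(url: string, timeoutMs = 5000): Promise<string | null> {
    try {
        const response = await axios.head(url, { timeout: timeoutMs });
        const header = response.headers['x-frame-options'];
        return typeof header === 'string' ? header.toUpperCase() : null;
    } catch {
        // Timeouts, network errors, and non-2xx responses all land here.
        return null;
    }
}

// Mirror of the handler's filter: keep only results that answer with SAMEORIGIN.
async function keepSameOriginResults<T extends { url: string }>(results: T[]): Promise<T[]> {
    const probed = await Promise.all(results.map(async r => ((await probeXFrameOptions(r.url)) === 'SAMEORIGIN' ? r : null)));
    return probed.filter((r): r is T => r !== null);
}

A HEAD request avoids downloading the page body, so probing a page of results stays cheap at the cost of one extra round trip per candidate URL.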
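
Note on the progress protocol: the stderr handler now buffers partial lines and only parses lines prefixed with PROGRESS: as JSON progress updates (job_id, step, progress), logging everything else verbatim; the extra outputDirectory argument presumably lets pdf_chunker.py write chunk images itself, which would explain why /getResult no longer decodes base64 data. A self-contained sketch of the line-buffered parsing pattern follows; the helper name createProgressLineParser is illustrative, not part of this commit.

type ProgressUpdate = { job_id?: string; step: string; progress: number };

// Line-buffered parser for a child process's stderr stream. Feed it raw chunks;
// complete lines prefixed with "PROGRESS:" are parsed as JSON progress updates,
// everything else goes to a plain log callback.
function createProgressLineParser(onProgress: (update: ProgressUpdate) => void, onLog: (line: string) => void) {
    let buffer = '';
    return (chunk: Buffer | string): void => {
        buffer += chunk.toString();
        const lines = buffer.split('\n');
        buffer = lines.pop() || ''; // keep the trailing partial line for the next chunk
        for (const line of lines) {
            if (!line.trim()) continue;
            if (line.startsWith('PROGRESS:')) {
                try {
                    onProgress(JSON.parse(line.substring('PROGRESS:'.length)));
                } catch {
                    onLog(`Unparseable progress line: ${line}`);
                }
            } else {
                onLog(line);
            }
        }
    };
}

// Usage mirroring the stderr handler in spawnPythonProcess:
// pythonProcess.stderr.on('data', createProgressLineParser(
//     update => { /* record update.step and update.progress for the job */ },
//     line => console.error('Python stderr:', line)
// ));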