path: root/src/server/ApiManagers/AssistantManager.ts
Diffstat (limited to 'src/server/ApiManagers/AssistantManager.ts')
-rw-r--r--  src/server/ApiManagers/AssistantManager.ts  510
1 file changed, 491 insertions(+), 19 deletions(-)
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts
index af25722a4..07c970a4e 100644
--- a/src/server/ApiManagers/AssistantManager.ts
+++ b/src/server/ApiManagers/AssistantManager.ts
@@ -39,6 +39,7 @@ export enum Directory {
csv = 'csv',
chunk_images = 'chunk_images',
scrape_images = 'scrape_images',
+ vectorstore = 'vectorstore',
}
// In-memory job tracking
@@ -92,6 +93,132 @@ export default class AssistantManager extends ApiManager {
const customsearch = google.customsearch('v1');
const openai = new OpenAI({ apiKey: env.OPENAI_API_KEY });
+ // Register an endpoint to retrieve file summaries from the file_summaries.json file
+ register({
+ method: Method.GET,
+ subscription: '/getFileSummaries',
+ secureHandler: async ({ req, res }) => {
+ try {
+ // Read the file summaries JSON file
+ const filePath = path.join(filesDirectory, Directory.vectorstore, 'file_summaries.json');
+
+ if (!fs.existsSync(filePath)) {
+ res.status(404).send({ error: 'File summaries not found' });
+ return;
+ }
+
+ const data = fs.readFileSync(filePath, 'utf8');
+ res.send(data);
+ } catch (error) {
+ console.error('Error retrieving file summaries:', error);
+ res.status(500).send({
+ error: 'Failed to retrieve file summaries',
+ });
+ }
+ },
+ });
+
+ // Register an endpoint to retrieve file names from the file_summaries.json file
+ register({
+ method: Method.GET,
+ subscription: '/getFileNames',
+ secureHandler: async ({ res }) => {
+ try {
+ const filePath = path.join(filesDirectory, Directory.vectorstore, 'file_summaries.json');
+
+ if (!fs.existsSync(filePath)) {
+ res.status(404).send({ error: 'File summaries not found' });
+ return;
+ }
+
+ // Parse once and return the top-level keys (the stored file names)
+ const data = fs.readFileSync(filePath, 'utf8');
+ res.send(Object.keys(JSON.parse(data)));
+ } catch (error) {
+ console.error('Error retrieving file names:', error);
+ res.status(500).send({ error: 'Failed to retrieve file names' });
+ }
+ },
+ });
+
+ // Register an endpoint to retrieve file content from the file_content.json file
+ register({
+ method: Method.POST,
+ subscription: '/getFileContent',
+ secureHandler: async ({ req, res }) => {
+ const { filepath } = req.body;
+
+ if (!filepath) {
+ res.status(400).send({ error: 'Filepath is required' });
+ return;
+ }
+
+ try {
+ // Read the file content JSON file
+ const filePath = path.join(filesDirectory, Directory.vectorstore, 'file_content.json');
+
+ if (!fs.existsSync(filePath)) {
+ res.status(404).send({ error: 'File content database not found' });
+ return;
+ }
+
+ console.log(`[DEBUG] Retrieving content for: ${filepath}`);
+
+ // Read the JSON file in chunks to handle large files
+ const readStream = fs.createReadStream(filePath, { encoding: 'utf8' });
+ let jsonData = '';
+
+ readStream.on('data', chunk => {
+ jsonData += chunk;
+ });
+
+ readStream.on('end', () => {
+ try {
+ // Parse the JSON
+ const contentMap = JSON.parse(jsonData);
+
+ // Check if the filepath exists in the map
+ if (!contentMap[filepath]) {
+ console.log(`[DEBUG] Content not found for: ${filepath}`);
+ res.status(404).send({ error: `Content not found for filepath: ${filepath}` });
+ return;
+ }
+
+ // Return the file content as is, not as JSON
+ console.log(`[DEBUG] Found content for: ${filepath} (${contentMap[filepath].length} chars)`);
+ res.send(contentMap[filepath]);
+ } catch (parseError) {
+ console.error('Error parsing file_content.json:', parseError);
+ res.status(500).send({
+ error: 'Failed to parse file content database',
+ });
+ }
+ });
+
+ readStream.on('error', streamError => {
+ console.error('Error reading file_content.json:', streamError);
+ res.status(500).send({
+ error: 'Failed to read file content database',
+ });
+ });
+ } catch (error) {
+ console.error('Error retrieving file content:', error);
+ res.status(500).send({
+ error: 'Failed to retrieve file content',
+ });
+ }
+ },
+ });
+
+ // Register an endpoint to search file summaries
+ register({
+ method: Method.POST,
+ subscription: '/searchFileSummaries',
+ secureHandler: async ({ req, res }) => {
+ const { query, topK } = req.body;
+
+ if (!query) {
+ res.status(400).send({ error: 'Search query is required' });
+ return;
+ }
+
+ // This endpoint will be called by the client-side Vectorstore to perform the search
+ // The actual search is implemented in the Vectorstore class
+
+ res.send({ message: 'This endpoint should be called through the Vectorstore class' });
+ },
+ });
+
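For orientation, here is a minimal client-side sketch of how these endpoints might be called (helper names are hypothetical; plain fetch is used, with the response shapes taken from the handlers above: /getFileNames sends a JSON array of keys, while /getFileContent sends the stored string as-is):

// Hypothetical client-side usage of the endpoints above; not part of this commit.
async function fetchFileNames(): Promise<string[]> {
    const res = await fetch('/getFileNames');
    if (!res.ok) throw new Error(`getFileNames failed: ${res.status}`);
    return res.json(); // the handler sends Object.keys(...) of file_summaries.json
}

async function fetchFileContent(filepath: string): Promise<string> {
    const res = await fetch('/getFileContent', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ filepath }),
    });
    if (!res.ok) throw new Error(`getFileContent failed: ${res.status}`);
    return res.text(); // the content string is sent directly, not wrapped in JSON
}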
// Register Wikipedia summary API route
register({
method: Method.POST,
@@ -485,36 +612,76 @@ export default class AssistantManager extends ApiManager {
subscription: '/scrapeWebsite',
secureHandler: async ({ req, res }) => {
const { url } = req.body;
+ let browser = null;
try {
+ // Set a longer timeout for slow-loading pages
+ const navigationTimeout = 60000; // 60 seconds
+
// Launch Puppeteer browser to navigate to the webpage
- const browser = await puppeteer.launch({
- args: ['--no-sandbox', '--disable-setuid-sandbox'],
+ browser = await puppeteer.launch({
+ args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
});
const page = await browser.newPage();
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36');
- await page.goto(url, { waitUntil: 'networkidle2' });
+
+ // Set timeout for navigation
+ page.setDefaultNavigationTimeout(navigationTimeout);
+
+ // Navigate with timeout and wait for content to load
+ await page.goto(url, {
+ waitUntil: 'networkidle2',
+ timeout: navigationTimeout,
+ });
+
+ // Wait a bit longer to ensure dynamic content loads
+ await new Promise(resolve => setTimeout(resolve, 2000));
// Extract HTML content
const htmlContent = await page.content();
await browser.close();
+ browser = null;
- // Parse HTML content using JSDOM
- const dom = new JSDOM(htmlContent, { url });
+ let extractedText = '';
- // Extract readable content using Mozilla's Readability API
- const reader = new Readability(dom.window.document);
- const article = reader.parse();
+ // First try with Readability
+ try {
+ // Parse HTML content using JSDOM
+ const dom = new JSDOM(htmlContent, { url });
+
+ // Extract readable content using Mozilla's Readability API
+ const reader = new Readability(dom.window.document, {
+ // Readability configuration to focus on text content
+ charThreshold: 100,
+ keepClasses: false,
+ });
+ const article = reader.parse();
- if (article) {
- const plainText = article.textContent;
- res.send({ website_plain_text: plainText });
- } else {
- res.status(500).send({ error: 'Failed to extract readable content' });
+ if (article && article.textContent) {
+ extractedText = article.textContent;
+ } else {
+ // If Readability doesn't return useful content, try alternate method
+ extractedText = await extractEnhancedContent(htmlContent);
+ }
+ } catch (parsingError) {
+ console.error('Error parsing website content with Readability:', parsingError);
+ // Fallback to enhanced content extraction
+ extractedText = await extractEnhancedContent(htmlContent);
}
+
+ // Clean up the extracted text
+ extractedText = cleanupText(extractedText);
+
+ res.send({ website_plain_text: extractedText });
} catch (error) {
console.error('Error scraping website:', error);
+
+ // Clean up browser if still open
+ if (browser) {
+ await browser.close().catch(e => console.error('Error closing browser:', e));
+ }
+
res.status(500).send({
- error: 'Failed to scrape website',
+ error: 'Failed to scrape website: ' + ((error as Error).message || 'Unknown error'),
});
}
},
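Condensed, the rewritten handler is a three-stage pipeline: Readability first, the extractEnhancedContent fallback when Readability fails or returns nothing, and cleanupText at the end. A sketch of that flow, using the helpers defined at the bottom of this file:

// Sketch of the extraction pipeline above: Readability -> enhanced-content fallback -> cleanup.
async function extractReadableText(htmlContent: string, url: string): Promise<string> {
    let text = '';
    try {
        const dom = new JSDOM(htmlContent, { url });
        const article = new Readability(dom.window.document, { charThreshold: 100, keepClasses: false }).parse();
        text = article?.textContent || (await extractEnhancedContent(htmlContent));
    } catch {
        text = await extractEnhancedContent(htmlContent); // fallback when Readability throws
    }
    return cleanupText(text);
}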
@@ -526,7 +693,7 @@ export default class AssistantManager extends ApiManager {
method: Method.POST,
subscription: '/createDocument',
secureHandler: async ({ req, res }) => {
- const { file_path } = req.body;
+ const { file_path, doc_id } = req.body;
const public_path = path.join(publicDirectory, file_path); // Resolve the file path in the public directory
const file_name = path.basename(file_path); // Extract the file name from the path
@@ -539,7 +706,7 @@ export default class AssistantManager extends ApiManager {
// Spawn the Python process and track its progress/output
// eslint-disable-next-line no-use-before-define
- spawnPythonProcess(jobId, public_path);
+ spawnPythonProcess(jobId, public_path, doc_id);
// Send the job ID back to the client for tracking
res.send({ jobId });
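A hedged sketch of the matching client-side call: the request now carries the document id alongside the file path, and the returned jobId is used to poll progress (helper name hypothetical):

// Hypothetical client-side call; the handler above reads file_path and doc_id and replies with { jobId }.
async function startDocumentChunking(file_path: string, doc_id: string): Promise<string> {
    const res = await fetch('/createDocument', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ file_path, doc_id }),
    });
    if (!res.ok) throw new Error(`createDocument failed: ${res.status}`);
    const { jobId } = await res.json();
    return jobId; // poll the job-tracking endpoints with this id
}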
@@ -687,6 +854,193 @@ export default class AssistantManager extends ApiManager {
}
},
});
+
+ // Register an API route to capture a screenshot of a webpage using Puppeteer
+ // and return the image URL for display in the WebBox component
+ register({
+ method: Method.POST,
+ subscription: '/captureWebScreenshot',
+ secureHandler: async ({ req, res }) => {
+ const { url, width, height, fullPage } = req.body;
+
+ if (!url) {
+ res.status(400).send({ error: 'URL is required' });
+ return;
+ }
+
+ let browser = null;
+ try {
+ // Increase timeout for websites that load slowly
+ const navigationTimeout = 60000; // 60 seconds
+
+ // Launch a headless browser with additional options to improve stability
+ browser = await puppeteer.launch({
+ headless: true, // Use headless mode
+ args: [
+ '--no-sandbox',
+ '--disable-setuid-sandbox',
+ '--disable-dev-shm-usage',
+ '--disable-accelerated-2d-canvas',
+ '--disable-gpu',
+ '--window-size=1200,800',
+ '--disable-web-security', // Helps with cross-origin issues
+ '--disable-features=IsolateOrigins,site-per-process', // Helps with frames
+ ],
+ timeout: navigationTimeout,
+ });
+
+ const page = await browser.newPage();
+
+ // Set a larger viewport to capture more content
+ await page.setViewport({
+ width: Number(width) || 1200,
+ height: Number(height) || 800,
+ deviceScaleFactor: 1,
+ });
+
+ // Enable request interception to speed up page loading
+ await page.setRequestInterception(true);
+ page.on('request', request => {
+ // Skip unnecessary resources to speed up loading
+ const resourceType = request.resourceType();
+ if (resourceType === 'font' || resourceType === 'media' || resourceType === 'websocket' || request.url().includes('analytics') || request.url().includes('tracker')) {
+ request.abort();
+ } else {
+ request.continue();
+ }
+ });
+
+ // Set navigation and timeout options
+ console.log(`Navigating to URL: ${url}`);
+
+ // Navigate to the URL and wait for the page to load
+ await page.goto(url, {
+ waitUntil: ['networkidle2'],
+ timeout: navigationTimeout,
+ });
+
+ // Wait for a short delay after navigation to allow content to render
+ await new Promise(resolve => setTimeout(resolve, 2000));
+
+ // Take a screenshot
+ console.log('Taking screenshot...');
+ const timestamp = Date.now();
+ const screenshotPath = `./src/server/public/files/images/webpage_${timestamp}.png`;
+ const screenshotOptions = {
+ path: screenshotPath,
+ fullPage: fullPage === true,
+ omitBackground: false,
+ type: 'png' as const,
+ clip:
+ fullPage !== true
+ ? {
+ x: 0,
+ y: 0,
+ width: Number(width) || 1200,
+ height: Number(height) || 800,
+ }
+ : undefined,
+ };
+
+ await page.screenshot(screenshotOptions);
+
+ // Get the full height of the page
+ const fullHeight = await page.evaluate(() => {
+ return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight, document.body.offsetHeight, document.documentElement.offsetHeight, document.body.clientHeight, document.documentElement.clientHeight);
+ });
+
+ console.log(`Screenshot captured successfully with height: ${fullHeight}px`);
+
+ // Return the URL to the screenshot
+ const screenshotUrl = `/files/images/webpage_${timestamp}.png`; // reuse the timestamp so the URL matches the file saved above
+ res.json({
+ screenshotUrl,
+ fullHeight,
+ });
+ } catch (error: any) {
+ console.error('Error capturing screenshot:', error);
+ res.status(500).send({
+ error: `Failed to capture screenshot: ${error.message}`,
+ details: error.stack,
+ });
+ } finally {
+ // Ensure browser is closed to free resources
+ if (browser) {
+ try {
+ await browser.close();
+ console.log('Browser closed successfully');
+ } catch (error) {
+ console.error('Error closing browser:', error);
+ }
+ }
+ }
+ },
+ });
+
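A minimal round-trip sketch from the client's perspective, assuming the WebBox simply drops the returned URL into an img element (names hypothetical):

// Hypothetical client-side sketch: request a capture, then display the saved image.
async function showScreenshot(url: string): Promise<HTMLImageElement> {
    const res = await fetch('/captureWebScreenshot', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ url, width: 1200, height: 800, fullPage: false }),
    });
    if (!res.ok) throw new Error(`captureWebScreenshot failed: ${res.status}`);
    const { screenshotUrl, fullHeight } = await res.json();
    const img = document.createElement('img');
    img.src = screenshotUrl; // served from the public files directory
    img.dataset.fullHeight = String(fullHeight); // full page height, if the caller needs it
    return img;
}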
+ // Register an endpoint to retrieve raw file content as plain text (no JSON parsing)
+ register({
+ method: Method.POST,
+ subscription: '/getRawFileContent',
+ secureHandler: async ({ req, res }) => {
+ const { filepath } = req.body;
+
+ if (!filepath) {
+ res.status(400).send('Filepath is required');
+ return;
+ }
+
+ try {
+ // Read the file content JSON file
+ const filePath = path.join(filesDirectory, Directory.vectorstore, 'file_content.json');
+
+ if (!fs.existsSync(filePath)) {
+ res.status(404).send('File content database not found');
+ return;
+ }
+
+ console.log(`[DEBUG] Retrieving raw content for: ${filepath}`);
+
+ // Read the JSON file
+ const readStream = fs.createReadStream(filePath, { encoding: 'utf8' });
+ let jsonData = '';
+
+ readStream.on('data', chunk => {
+ jsonData += chunk;
+ });
+
+ readStream.on('end', () => {
+ try {
+ // Parse the JSON
+ const contentMap = JSON.parse(jsonData);
+
+ // Check if the filepath exists in the map
+ if (!contentMap[filepath]) {
+ console.log(`[DEBUG] Content not found for: ${filepath}`);
+ res.status(404).send(`Content not found for filepath: ${filepath}`);
+ return;
+ }
+
+ // Set content type to plain text to avoid JSON parsing
+ res.setHeader('Content-Type', 'text/plain');
+
+ // Return the file content as plain text
+ console.log(`[DEBUG] Found content for: ${filepath} (${contentMap[filepath].length} chars)`);
+ res.send(contentMap[filepath]);
+ } catch (parseError) {
+ console.error('Error parsing file_content.json:', parseError);
+ res.status(500).send('Failed to parse file content database');
+ }
+ });
+
+ readStream.on('error', streamError => {
+ console.error('Error reading file_content.json:', streamError);
+ res.status(500).send('Failed to read file content database');
+ });
+ } catch (error) {
+ console.error('Error retrieving file content:', error);
+ res.status(500).send('Failed to retrieve file content');
+ }
+ },
+ });
}
}
@@ -696,7 +1050,7 @@ export default class AssistantManager extends ApiManager {
* @param jobId The ID used to track the job's progress.
* @param file_path The filepath of the file to process.
* @param doc_id The ID of the document the chunking output belongs to.
*/
-function spawnPythonProcess(jobId: string, file_path: string) {
+function spawnPythonProcess(jobId: string, file_path: string, doc_id: string) {
const venvPath = path.join(__dirname, '../chunker/venv');
const requirementsPath = path.join(__dirname, '../chunker/requirements.txt');
const pythonScriptPath = path.join(__dirname, '../chunker/pdf_chunker.py');
@@ -706,7 +1060,7 @@ function spawnPythonProcess(jobId: string, file_path: string) {
function runPythonScript() {
const pythonPath = process.platform === 'win32' ? path.join(venvPath, 'Scripts', 'python') : path.join(venvPath, 'bin', 'python3');
- const pythonProcess = spawn(pythonPath, [pythonScriptPath, jobId, file_path, outputDirectory]);
+ const pythonProcess = spawn(pythonPath, [pythonScriptPath, jobId, file_path, outputDirectory, doc_id]);
let pythonOutput = '';
let stderrOutput = '';
@@ -781,7 +1135,7 @@ function spawnPythonProcess(jobId: string, file_path: string) {
console.log('Virtual environment not found. Creating and setting up...');
// Create venv
- const createVenvProcess = spawn('python', ['-m', 'venv', venvPath]);
+ const createVenvProcess = spawn('python3.10', ['-m', 'venv', venvPath]);
createVenvProcess.on('close', code => {
if (code !== 0) {
@@ -829,3 +1183,121 @@ function spawnPythonProcess(jobId: string, file_path: string) {
runPythonScript();
}
}
+
+/**
+ * Enhanced content extraction that focuses on meaningful text content.
+ * @param html The HTML content to process
+ * @returns Extracted and cleaned text content
+ */
+async function extractEnhancedContent(html: string): Promise<string> {
+ try {
+ // Create DOM to extract content
+ const dom = new JSDOM(html, { runScripts: 'outside-only' });
+ const document = dom.window.document;
+
+ // Remove all non-content elements
+ const elementsToRemove = [
+ 'script',
+ 'style',
+ 'iframe',
+ 'noscript',
+ 'svg',
+ 'canvas',
+ 'header',
+ 'footer',
+ 'nav',
+ 'aside',
+ 'form',
+ 'button',
+ 'input',
+ 'select',
+ 'textarea',
+ 'meta',
+ 'link',
+ 'img',
+ 'video',
+ 'audio',
+ '.ad',
+ '.ads',
+ '.advertisement',
+ '.banner',
+ '.cookie',
+ '.popup',
+ '.modal',
+ '.newsletter',
+ '[role="banner"]',
+ '[role="navigation"]',
+ '[role="complementary"]',
+ ];
+
+ elementsToRemove.forEach(selector => {
+ const elements = document.querySelectorAll(selector);
+ elements.forEach(el => el.remove());
+ });
+
+ // Get all text paragraphs with meaningful content
+ const contentElements = [
+ ...Array.from(document.querySelectorAll('p')),
+ ...Array.from(document.querySelectorAll('h1')),
+ ...Array.from(document.querySelectorAll('h2')),
+ ...Array.from(document.querySelectorAll('h3')),
+ ...Array.from(document.querySelectorAll('h4')),
+ ...Array.from(document.querySelectorAll('h5')),
+ ...Array.from(document.querySelectorAll('h6')),
+ ...Array.from(document.querySelectorAll('li')),
+ ...Array.from(document.querySelectorAll('td')),
+ ...Array.from(document.querySelectorAll('article')),
+ ...Array.from(document.querySelectorAll('section')),
+ ...Array.from(document.querySelectorAll('div:not([class]):not([id])')),
+ ];
+
+ // Extract text from content elements that have meaningful text
+ const contentParts: string[] = [];
+ contentElements.forEach(el => {
+ const text = el.textContent?.trim();
+ // Only include elements with substantial text (more than just a few characters)
+ if (text && text.length > 10 && !contentParts.includes(text)) {
+ contentParts.push(text);
+ }
+ });
+
+ // If no significant content found with selective approach, fallback to body
+ if (contentParts.length < 3) {
+ return document.body.textContent || '';
+ }
+
+ return contentParts.join('\n\n');
+ } catch (error) {
+ console.error('Error extracting enhanced content:', error);
+ return 'Failed to extract content from the webpage.';
+ }
+}
+
+/**
+ * Cleans up extracted text to improve readability and focus on useful content.
+ * @param text The raw extracted text
+ * @returns Cleaned and formatted text
+ */
+function cleanupText(text: string): string {
+ if (!text) return '';
+
+ return (
+ text
+ // Collapse runs of spaces and tabs, then collapse 3+ consecutive line breaks into one paragraph break.
+ // (Collapsing all whitespace with /\s+/ first would remove every newline, leaving the next pattern nothing to match.)
+ .replace(/[ \t]+/g, ' ')
+ .replace(/\n\s*\n\s*\n+/g, '\n\n')
+ // Remove common boilerplate phrases
+ .replace(/cookie policy|privacy policy|terms of service|all rights reserved|copyright ©/gi, '')
+ // Remove email addresses
+ .replace(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g, '')
+ // Remove URLs
+ .replace(/https?:\/\/[^\s]+/g, '')
+ // Remove social media handles
+ .replace(/@[a-zA-Z0-9_]+/g, '')
+ // Clean up any remaining HTML tags that might have been missed
+ .replace(/<[^>]*>/g, '')
+ // Fix spacing issues after cleanup
+ .replace(/ +/g, ' ')
+ .trim()
+ );
+}