diff options
Diffstat (limited to 'src/server/ApiManagers/AssistantManager.ts')
-rw-r--r-- | src/server/ApiManagers/AssistantManager.ts | 515 |
1 file changed, 22 insertions, 493 deletions
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts index 07c970a4e..b917f555c 100644 --- a/src/server/ApiManagers/AssistantManager.ts +++ b/src/server/ApiManagers/AssistantManager.ts @@ -39,7 +39,6 @@ export enum Directory { csv = 'csv', chunk_images = 'chunk_images', scrape_images = 'scrape_images', - vectorstore = 'vectorstore', } // In-memory job tracking @@ -93,132 +92,6 @@ export default class AssistantManager extends ApiManager { const customsearch = google.customsearch('v1'); const openai = new OpenAI({ apiKey: env.OPENAI_API_KEY }); - // Register an endpoint to retrieve file summaries from the json file - register({ - method: Method.GET, - subscription: '/getFileSummaries', - secureHandler: async ({ req, res }) => { - try { - // Read the file summaries JSON file - const filePath = path.join(filesDirectory, Directory.vectorstore, 'file_summaries.json'); - - if (!fs.existsSync(filePath)) { - res.status(404).send({ error: 'File summaries not found' }); - return; - } - - const data = fs.readFileSync(filePath, 'utf8'); - res.send(data); - } catch (error) { - console.error('Error retrieving file summaries:', error); - res.status(500).send({ - error: 'Failed to retrieve file summaries', - }); - } - }, - }); - - // Register an endpoint to retrieve file names from the file_summaries.json file - register({ - method: Method.GET, - subscription: '/getFileNames', - secureHandler: async ({ res }) => { - const filePath = path.join(filesDirectory, Directory.vectorstore, 'file_summaries.json'); - const data = fs.readFileSync(filePath, 'utf8'); - console.log(Object.keys(JSON.parse(data))); - - res.send(Object.keys(JSON.parse(data))); - }, - }); - - // Register an endpoint to retrieve file content from the content json file - register({ - method: Method.POST, - subscription: '/getFileContent', - secureHandler: async ({ req, res }) => { - const { filepath } = req.body; - - if (!filepath) { - res.status(400).send({ error: 
'Filepath is required' }); - return; - } - - try { - // Read the file content JSON file - const filePath = path.join(filesDirectory, Directory.vectorstore, 'file_content.json'); - - if (!fs.existsSync(filePath)) { - res.status(404).send({ error: 'File content database not found' }); - return; - } - - console.log(`[DEBUG] Retrieving content for: ${filepath}`); - - // Read the JSON file in chunks to handle large files - const readStream = fs.createReadStream(filePath, { encoding: 'utf8' }); - let jsonData = ''; - - readStream.on('data', chunk => { - jsonData += chunk; - }); - - readStream.on('end', () => { - try { - // Parse the JSON - const contentMap = JSON.parse(jsonData); - - // Check if the filepath exists in the map - if (!contentMap[filepath]) { - console.log(`[DEBUG] Content not found for: ${filepath}`); - res.status(404).send({ error: `Content not found for filepath: ${filepath}` }); - return; - } - - // Return the file content as is, not as JSON - console.log(`[DEBUG] Found content for: ${filepath} (${contentMap[filepath].length} chars)`); - res.send(contentMap[filepath]); - } catch (parseError) { - console.error('Error parsing file_content.json:', parseError); - res.status(500).send({ - error: 'Failed to parse file content database', - }); - } - }); - - readStream.on('error', streamError => { - console.error('Error reading file_content.json:', streamError); - res.status(500).send({ - error: 'Failed to read file content database', - }); - }); - } catch (error) { - console.error('Error retrieving file content:', error); - res.status(500).send({ - error: 'Failed to retrieve file content', - }); - } - }, - }); - - // Register an endpoint to search file summaries - register({ - method: Method.POST, - subscription: '/searchFileSummaries', - secureHandler: async ({ req, res }) => { - const { query, topK } = req.body; - - if (!query) { - res.status(400).send({ error: 'Search query is required' }); - return; - } - - // This endpoint will be called by the 
client-side Vectorstore to perform the search - // The actual search is implemented in the Vectorstore class - - res.send({ message: 'This endpoint should be called through the Vectorstore class' }); - }, - }); - // Register Wikipedia summary API route register({ method: Method.POST, @@ -566,9 +439,9 @@ export default class AssistantManager extends ApiManager { try { const image = await openai.images.generate({ model: 'dall-e-3', prompt: image_prompt, response_format: 'url' }); console.log(image); - const result = await DashUploadUtils.UploadImage(image.data[0].url!); + const url = image.data?.[0].url; - const url = image.data[0].url; + const result = url ? await DashUploadUtils.UploadImage(url) : { error: 'Image generation failed' }; res.send({ result, url }); } catch (error) { @@ -612,76 +485,36 @@ export default class AssistantManager extends ApiManager { subscription: '/scrapeWebsite', secureHandler: async ({ req, res }) => { const { url } = req.body; - let browser = null; try { - // Set a longer timeout for slow-loading pages - const navigationTimeout = 60000; // 60 seconds - // Launch Puppeteer browser to navigate to the webpage - browser = await puppeteer.launch({ - args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'], + const browser = await puppeteer.launch({ + args: ['--no-sandbox', '--disable-setuid-sandbox'], }); const page = await browser.newPage(); await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'); - - // Set timeout for navigation - page.setDefaultNavigationTimeout(navigationTimeout); - - // Navigate with timeout and wait for content to load - await page.goto(url, { - waitUntil: 'networkidle2', - timeout: navigationTimeout, - }); - - // Wait a bit longer to ensure dynamic content loads - await new Promise(resolve => setTimeout(resolve, 2000)); + await page.goto(url, { waitUntil: 'networkidle2' }); // Extract HTML content const 
htmlContent = await page.content(); await browser.close(); - browser = null; - let extractedText = ''; + // Parse HTML content using JSDOM + const dom = new JSDOM(htmlContent, { url }); - // First try with Readability - try { - // Parse HTML content using JSDOM - const dom = new JSDOM(htmlContent, { url }); - - // Extract readable content using Mozilla's Readability API - const reader = new Readability(dom.window.document, { - // Readability configuration to focus on text content - charThreshold: 100, - keepClasses: false, - }); - const article = reader.parse(); + // Extract readable content using Mozilla's Readability API + const reader = new Readability(dom.window.document); + const article = reader.parse(); - if (article && article.textContent) { - extractedText = article.textContent; - } else { - // If Readability doesn't return useful content, try alternate method - extractedText = await extractEnhancedContent(htmlContent); - } - } catch (parsingError) { - console.error('Error parsing website content with Readability:', parsingError); - // Fallback to enhanced content extraction - extractedText = await extractEnhancedContent(htmlContent); + if (article) { + const plainText = article.textContent; + res.send({ website_plain_text: plainText }); + } else { + res.status(500).send({ error: 'Failed to extract readable content' }); } - - // Clean up the extracted text - extractedText = cleanupText(extractedText); - - res.send({ website_plain_text: extractedText }); } catch (error) { console.error('Error scraping website:', error); - - // Clean up browser if still open - if (browser) { - await browser.close().catch(e => console.error('Error closing browser:', e)); - } - res.status(500).send({ - error: 'Failed to scrape website: ' + ((error as Error).message || 'Unknown error'), + error: 'Failed to scrape website', }); } }, @@ -693,20 +526,20 @@ export default class AssistantManager extends ApiManager { method: Method.POST, subscription: '/createDocument', 
secureHandler: async ({ req, res }) => { - const { file_path, doc_id } = req.body; + const { file_path } = req.body; const public_path = path.join(publicDirectory, file_path); // Resolve the file path in the public directory const file_name = path.basename(file_path); // Extract the file name from the path try { // Read the file data and encode it as base64 - const file_data: string = fs.readFileSync(public_path, { encoding: 'base64' }); + const file_data = fs.readFileSync(public_path, { encoding: 'base64' }); // Generate a unique job ID for tracking const jobId = uuid.v4(); // Spawn the Python process and track its progress/output // eslint-disable-next-line no-use-before-define - spawnPythonProcess(jobId, public_path, doc_id); + spawnPythonProcess(jobId, public_path); // Send the job ID back to the client for tracking res.send({ jobId }); @@ -854,193 +687,6 @@ export default class AssistantManager extends ApiManager { } }, }); - - // Register an API route to capture a screenshot of a webpage using Puppeteer - // and return the image URL for display in the WebBox component - register({ - method: Method.POST, - subscription: '/captureWebScreenshot', - secureHandler: async ({ req, res }) => { - const { url, width, height, fullPage } = req.body; - - if (!url) { - res.status(400).send({ error: 'URL is required' }); - return; - } - - let browser = null; - try { - // Increase timeout for websites that load slowly - const navigationTimeout = 60000; // 60 seconds - - // Launch a headless browser with additional options to improve stability - browser = await puppeteer.launch({ - headless: true, // Use headless mode - args: [ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-accelerated-2d-canvas', - '--disable-gpu', - '--window-size=1200,800', - '--disable-web-security', // Helps with cross-origin issues - '--disable-features=IsolateOrigins,site-per-process', // Helps with frames - ], - timeout: navigationTimeout, - }); - - const 
page = await browser.newPage(); - - // Set a larger viewport to capture more content - await page.setViewport({ - width: Number(width) || 1200, - height: Number(height) || 800, - deviceScaleFactor: 1, - }); - - // Enable request interception to speed up page loading - await page.setRequestInterception(true); - page.on('request', request => { - // Skip unnecessary resources to speed up loading - const resourceType = request.resourceType(); - if (resourceType === 'font' || resourceType === 'media' || resourceType === 'websocket' || request.url().includes('analytics') || request.url().includes('tracker')) { - request.abort(); - } else { - request.continue(); - } - }); - - // Set navigation and timeout options - console.log(`Navigating to URL: ${url}`); - - // Navigate to the URL and wait for the page to load - await page.goto(url, { - waitUntil: ['networkidle2'], - timeout: navigationTimeout, - }); - - // Wait for a short delay after navigation to allow content to render - await new Promise(resolve => setTimeout(resolve, 2000)); - - // Take a screenshot - console.log('Taking screenshot...'); - const screenshotPath = `./src/server/public/files/images/webpage_${Date.now()}.png`; - const screenshotOptions = { - path: screenshotPath, - fullPage: fullPage === true, - omitBackground: false, - type: 'png' as 'png', - clip: - fullPage !== true - ? 
{ - x: 0, - y: 0, - width: Number(width) || 1200, - height: Number(height) || 800, - } - : undefined, - }; - - await page.screenshot(screenshotOptions); - - // Get the full height of the page - const fullHeight = await page.evaluate(() => { - return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight, document.body.offsetHeight, document.documentElement.offsetHeight, document.body.clientHeight, document.documentElement.clientHeight); - }); - - console.log(`Screenshot captured successfully with height: ${fullHeight}px`); - - // Return the URL to the screenshot - const screenshotUrl = `/files/images/webpage_${Date.now()}.png`; - res.json({ - screenshotUrl, - fullHeight, - }); - } catch (error: any) { - console.error('Error capturing screenshot:', error); - res.status(500).send({ - error: `Failed to capture screenshot: ${error.message}`, - details: error.stack, - }); - } finally { - // Ensure browser is closed to free resources - if (browser) { - try { - await browser.close(); - console.log('Browser closed successfully'); - } catch (error) { - console.error('Error closing browser:', error); - } - } - } - }, - }); - - // Register an endpoint to retrieve raw file content as plain text (no JSON parsing) - register({ - method: Method.POST, - subscription: '/getRawFileContent', - secureHandler: async ({ req, res }) => { - const { filepath } = req.body; - - if (!filepath) { - res.status(400).send('Filepath is required'); - return; - } - - try { - // Read the file content JSON file - const filePath = path.join(filesDirectory, Directory.vectorstore, 'file_content.json'); - - if (!fs.existsSync(filePath)) { - res.status(404).send('File content database not found'); - return; - } - - console.log(`[DEBUG] Retrieving raw content for: ${filepath}`); - - // Read the JSON file - const readStream = fs.createReadStream(filePath, { encoding: 'utf8' }); - let jsonData = ''; - - readStream.on('data', chunk => { - jsonData += chunk; - }); - - readStream.on('end', () 
=> { - try { - // Parse the JSON - const contentMap = JSON.parse(jsonData); - - // Check if the filepath exists in the map - if (!contentMap[filepath]) { - console.log(`[DEBUG] Content not found for: ${filepath}`); - res.status(404).send(`Content not found for filepath: ${filepath}`); - return; - } - - // Set content type to plain text to avoid JSON parsing - res.setHeader('Content-Type', 'text/plain'); - - // Return the file content as plain text - console.log(`[DEBUG] Found content for: ${filepath} (${contentMap[filepath].length} chars)`); - res.send(contentMap[filepath]); - } catch (parseError) { - console.error('Error parsing file_content.json:', parseError); - res.status(500).send('Failed to parse file content database'); - } - }); - - readStream.on('error', streamError => { - console.error('Error reading file_content.json:', streamError); - res.status(500).send('Failed to read file content database'); - }); - } catch (error) { - console.error('Error retrieving file content:', error); - res.status(500).send('Failed to retrieve file content'); - } - }, - }); } } @@ -1050,7 +696,7 @@ export default class AssistantManager extends ApiManager { * @param file_name The name of the file to process. * @param file_path The filepath of the file to process. */ -function spawnPythonProcess(jobId: string, file_path: string, doc_id: string) { +function spawnPythonProcess(jobId: string, file_path: string) { const venvPath = path.join(__dirname, '../chunker/venv'); const requirementsPath = path.join(__dirname, '../chunker/requirements.txt'); const pythonScriptPath = path.join(__dirname, '../chunker/pdf_chunker.py'); @@ -1060,7 +706,7 @@ function spawnPythonProcess(jobId: string, file_path: string, doc_id: string) { function runPythonScript() { const pythonPath = process.platform === 'win32' ? 
path.join(venvPath, 'Scripts', 'python') : path.join(venvPath, 'bin', 'python3'); - const pythonProcess = spawn(pythonPath, [pythonScriptPath, jobId, file_path, outputDirectory, doc_id]); + const pythonProcess = spawn(pythonPath, [pythonScriptPath, jobId, file_path, outputDirectory]); let pythonOutput = ''; let stderrOutput = ''; @@ -1135,6 +781,7 @@ function spawnPythonProcess(jobId: string, file_path: string, doc_id: string) { console.log('Virtual environment not found. Creating and setting up...'); // Create venv + // const createVenvProcess = spawn('python', ['-m', 'venv', venvPath]); const createVenvProcess = spawn('python3.10', ['-m', 'venv', venvPath]); createVenvProcess.on('close', code => { @@ -1183,121 +830,3 @@ function spawnPythonProcess(jobId: string, file_path: string, doc_id: string) { runPythonScript(); } } - -/** - * Enhanced content extraction that focuses on meaningful text content. - * @param html The HTML content to process - * @returns Extracted and cleaned text content - */ -async function extractEnhancedContent(html: string): Promise<string> { - try { - // Create DOM to extract content - const dom = new JSDOM(html, { runScripts: 'outside-only' }); - const document = dom.window.document; - - // Remove all non-content elements - const elementsToRemove = [ - 'script', - 'style', - 'iframe', - 'noscript', - 'svg', - 'canvas', - 'header', - 'footer', - 'nav', - 'aside', - 'form', - 'button', - 'input', - 'select', - 'textarea', - 'meta', - 'link', - 'img', - 'video', - 'audio', - '.ad', - '.ads', - '.advertisement', - '.banner', - '.cookie', - '.popup', - '.modal', - '.newsletter', - '[role="banner"]', - '[role="navigation"]', - '[role="complementary"]', - ]; - - elementsToRemove.forEach(selector => { - const elements = document.querySelectorAll(selector); - elements.forEach(el => el.remove()); - }); - - // Get all text paragraphs with meaningful content - const contentElements = [ - ...Array.from(document.querySelectorAll('p')), - 
...Array.from(document.querySelectorAll('h1')), - ...Array.from(document.querySelectorAll('h2')), - ...Array.from(document.querySelectorAll('h3')), - ...Array.from(document.querySelectorAll('h4')), - ...Array.from(document.querySelectorAll('h5')), - ...Array.from(document.querySelectorAll('h6')), - ...Array.from(document.querySelectorAll('li')), - ...Array.from(document.querySelectorAll('td')), - ...Array.from(document.querySelectorAll('article')), - ...Array.from(document.querySelectorAll('section')), - ...Array.from(document.querySelectorAll('div:not([class]):not([id])')), - ]; - - // Extract text from content elements that have meaningful text - let contentParts: string[] = []; - contentElements.forEach(el => { - const text = el.textContent?.trim(); - // Only include elements with substantial text (more than just a few characters) - if (text && text.length > 10 && !contentParts.includes(text)) { - contentParts.push(text); - } - }); - - // If no significant content found with selective approach, fallback to body - if (contentParts.length < 3) { - return document.body.textContent || ''; - } - - return contentParts.join('\n\n'); - } catch (error) { - console.error('Error extracting enhanced content:', error); - return 'Failed to extract content from the webpage.'; - } -} - -/** - * Cleans up extracted text to improve readability and focus on useful content. 
- * @param text The raw extracted text - * @returns Cleaned and formatted text - */ -function cleanupText(text: string): string { - if (!text) return ''; - - return ( - text - // Remove excessive whitespace and normalize line breaks - .replace(/\s+/g, ' ') - .replace(/\n\s*\n\s*\n+/g, '\n\n') - // Remove common boilerplate phrases - .replace(/cookie policy|privacy policy|terms of service|all rights reserved|copyright ©/gi, '') - // Remove email addresses - .replace(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g, '') - // Remove URLs - .replace(/https?:\/\/[^\s]+/g, '') - // Remove social media handles - .replace(/@[a-zA-Z0-9_]+/g, '') - // Clean up any remaining HTML tags that might have been missed - .replace(/<[^>]*>/g, '') - // Fix spacing issues after cleanup - .replace(/ +/g, ' ') - .trim() - ); -} |