diff options
Diffstat (limited to 'src/server/ApiManagers/AssistantManager.ts')
-rw-r--r-- | src/server/ApiManagers/AssistantManager.ts | 515 |
1 file changed, 22 insertions, 493 deletions
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts index 07c970a4e..b917f555c 100644 --- a/src/server/ApiManagers/AssistantManager.ts +++ b/src/server/ApiManagers/AssistantManager.ts @@ -39,7 +39,6 @@ export enum Directory { csv = 'csv', chunk_images = 'chunk_images', scrape_images = 'scrape_images', - vectorstore = 'vectorstore', } // In-memory job tracking @@ -93,132 +92,6 @@ export default class AssistantManager extends ApiManager { const customsearch = google.customsearch('v1'); const openai = new OpenAI({ apiKey: env.OPENAI_API_KEY }); - // Register an endpoint to retrieve file summaries from the json file - register({ - method: Method.GET, - subscription: '/getFileSummaries', - secureHandler: async ({ req, res }) => { - try { - // Read the file summaries JSON file - const filePath = path.join(filesDirectory, Directory.vectorstore, 'file_summaries.json'); - - if (!fs.existsSync(filePath)) { - res.status(404).send({ error: 'File summaries not found' }); - return; - } - - const data = fs.readFileSync(filePath, 'utf8'); - res.send(data); - } catch (error) { - console.error('Error retrieving file summaries:', error); - res.status(500).send({ - error: 'Failed to retrieve file summaries', - }); - } - }, - }); - - // Register an endpoint to retrieve file names from the file_summaries.json file - register({ - method: Method.GET, - subscription: '/getFileNames', - secureHandler: async ({ res }) => { - const filePath = path.join(filesDirectory, Directory.vectorstore, 'file_summaries.json'); - const data = fs.readFileSync(filePath, 'utf8'); - console.log(Object.keys(JSON.parse(data))); - - res.send(Object.keys(JSON.parse(data))); - }, - }); - - // Register an endpoint to retrieve file content from the content json file - register({ - method: Method.POST, - subscription: '/getFileContent', - secureHandler: async ({ req, res }) => { - const { filepath } = req.body; - - if (!filepath) { - res.status(400).send({ error: 
'Filepath is required' }); - return; - } - - try { - // Read the file content JSON file - const filePath = path.join(filesDirectory, Directory.vectorstore, 'file_content.json'); - - if (!fs.existsSync(filePath)) { - res.status(404).send({ error: 'File content database not found' }); - return; - } - - console.log(`[DEBUG] Retrieving content for: ${filepath}`); - - // Read the JSON file in chunks to handle large files - const readStream = fs.createReadStream(filePath, { encoding: 'utf8' }); - let jsonData = ''; - - readStream.on('data', chunk => { - jsonData += chunk; - }); - - readStream.on('end', () => { - try { - // Parse the JSON - const contentMap = JSON.parse(jsonData); - - // Check if the filepath exists in the map - if (!contentMap[filepath]) { - console.log(`[DEBUG] Content not found for: ${filepath}`); - res.status(404).send({ error: `Content not found for filepath: ${filepath}` }); - return; - } - - // Return the file content as is, not as JSON - console.log(`[DEBUG] Found content for: ${filepath} (${contentMap[filepath].length} chars)`); - res.send(contentMap[filepath]); - } catch (parseError) { - console.error('Error parsing file_content.json:', parseError); - res.status(500).send({ - error: 'Failed to parse file content database', - }); - } - }); - - readStream.on('error', streamError => { - console.error('Error reading file_content.json:', streamError); - res.status(500).send({ - error: 'Failed to read file content database', - }); - }); - } catch (error) { - console.error('Error retrieving file content:', error); - res.status(500).send({ - error: 'Failed to retrieve file content', - }); - } - }, - }); - - // Register an endpoint to search file summaries - register({ - method: Method.POST, - subscription: '/searchFileSummaries', - secureHandler: async ({ req, res }) => { - const { query, topK } = req.body; - - if (!query) { - res.status(400).send({ error: 'Search query is required' }); - return; - } - - // This endpoint will be called by the 
client-side Vectorstore to perform the search - // The actual search is implemented in the Vectorstore class - - res.send({ message: 'This endpoint should be called through the Vectorstore class' }); - }, - }); - // Register Wikipedia summary API route register({ method: Method.POST, @@ -566,9 +439,9 @@ export default class AssistantManager extends ApiManager { try { const image = await openai.images.generate({ model: 'dall-e-3', prompt: image_prompt, response_format: 'url' }); console.log(image); - const result = await DashUploadUtils.UploadImage(image.data[0].url!); + const url = image.data?.[0].url; - const url = image.data[0].url; + const result = url ? await DashUploadUtils.UploadImage(url) : { error: 'Image generation failed' }; res.send({ result, url }); } catch (error) { @@ -612,76 +485,36 @@ export default class AssistantManager extends ApiManager { subscription: '/scrapeWebsite', secureHandler: async ({ req, res }) => { const { url } = req.body; - let browser = null; try { - // Set a longer timeout for slow-loading pages - const navigationTimeout = 60000; // 60 seconds - // Launch Puppeteer browser to navigate to the webpage - browser = await puppeteer.launch({ - args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'], + const browser = await puppeteer.launch({ + args: ['--no-sandbox', '--disable-setuid-sandbox'], }); const page = await browser.newPage(); await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'); - - // Set timeout for navigation - page.setDefaultNavigationTimeout(navigationTimeout); - - // Navigate with timeout and wait for content to load - await page.goto(url, { - waitUntil: 'networkidle2', - timeout: navigationTimeout, - }); - - // Wait a bit longer to ensure dynamic content loads - await new Promise(resolve => setTimeout(resolve, 2000)); + await page.goto(url, { waitUntil: 'networkidle2' }); // Extract HTML content const 
htmlContent = await page.content(); await browser.close(); - browser = null; - let extractedText = ''; + // Parse HTML content using JSDOM + const dom = new JSDOM(htmlContent, { url }); - // First try with Readability - try { - // Parse HTML content using JSDOM - const dom = new JSDOM(htmlContent, { url }); - - // Extract readable content using Mozilla's Readability API - const reader = new Readability(dom.window.document, { - // Readability configuration to focus on text content - charThreshold: 100, - keepClasses: false, - }); - const article = reader.parse(); + // Extract readable content using Mozilla's Readability API + const reader = new Readability(dom.window.document); + const article = reader.parse(); - if (article && article.textContent) { - extractedText = article.textContent; - } else { - // If Readability doesn't return useful content, try alternate method - extractedText = await extractEnhancedContent(htmlContent); - } - } catch (parsingError) { - console.error('Error parsing website content with Readability:', parsingError); - // Fallback to enhanced content extraction - extractedText = await extractEnhancedContent(htmlContent); + if (article) { + const plainText = article.textContent; + res.send({ website_plain_text: plainText }); + } else { + res.status(500).send({ error: 'Failed to extract readable content' }); } - - // Clean up the extracted text - extractedText = cleanupText(extractedText); - - res.send({ website_plain_text: extractedText }); } catch (error) { console.error('Error scraping website:', error); - - // Clean up browser if still open - if (browser) { - await browser.close().catch(e => console.error('Error closing browser:', e)); - } - res.status(500).send({ - error: 'Failed to scrape website: ' + ((error as Error).message || 'Unknown error'), + error: 'Failed to scrape website', }); } }, @@ -693,20 +526,20 @@ export default class AssistantManager extends ApiManager { method: Method.POST, subscription: '/createDocument', 
secureHandler: async ({ req, res }) => { - const { file_path, doc_id } = req.body; + const { file_path } = req.body; const public_path = path.join(publicDirectory, file_path); // Resolve the file path in the public directory const file_name = path.basename(file_path); // Extract the file name from the path try { // Read the file data and encode it as base64 - const file_data: string = fs.readFileSync(public_path, { encoding: 'base64' }); + const file_data = fs.readFileSync(public_path, { encoding: 'base64' }); // Generate a unique job ID for tracking const jobId = uuid.v4(); // Spawn the Python process and track its progress/output // eslint-disable-next-line no-use-before-define - spawnPythonProcess(jobId, public_path, doc_id); + spawnPythonProcess(jobId, public_path); // Send the job ID back to the client for tracking res.send({ jobId }); @@ -854,193 +687,6 @@ export default class AssistantManager extends ApiManager { } }, }); - - // Register an API route to capture a screenshot of a webpage using Puppeteer - // and return the image URL for display in the WebBox component - register({ - method: Method.POST, - subscription: '/captureWebScreenshot', - secureHandler: async ({ req, res }) => { - const { url, width, height, fullPage } = req.body; - - if (!url) { - res.status(400).send({ error: 'URL is required' }); - return; - } - - let browser = null; - try { - // Increase timeout for websites that load slowly - const navigationTimeout = 60000; // 60 seconds - - // Launch a headless browser with additional options to improve stability - browser = await puppeteer.launch({ - headless: true, // Use headless mode - args: [ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-accelerated-2d-canvas', - '--disable-gpu', - '--window-size=1200,800', - '--disable-web-security', // Helps with cross-origin issues - '--disable-features=IsolateOrigins,site-per-process', // Helps with frames - ], - timeout: navigationTimeout, - }); - - const 
page = await browser.newPage(); - - // Set a larger viewport to capture more content - await page.setViewport({ - width: Number(width) || 1200, - height: Number(height) || 800, - deviceScaleFactor: 1, - }); - - // Enable request interception to speed up page loading - await page.setRequestInterception(true); - page.on('request', request => { - // Skip unnecessary resources to speed up loading - const resourceType = request.resourceType(); - if (resourceType === 'font' || resourceType === 'media' || resourceType === 'websocket' || request.url().includes('analytics') || request.url().includes('tracker')) { - request.abort(); - } else { - request.continue(); - } - }); - - // Set navigation and timeout options - console.log(`Navigating to URL: ${url}`); - - // Navigate to the URL and wait for the page to load - await page.goto(url, { - waitUntil: ['networkidle2'], - timeout: navigationTimeout, - }); - - // Wait for a short delay after navigation to allow content to render - await new Promise(resolve => setTimeout(resolve, 2000)); - - // Take a screenshot - console.log('Taking screenshot...'); - const screenshotPath = `./src/server/public/files/images/webpage_${Date.now()}.png`; - const screenshotOptions = { - path: screenshotPath, - fullPage: fullPage === true, - omitBackground: false, - type: 'png' as 'png', - clip: - fullPage !== true - ? 
{ - x: 0, - y: 0, - width: Number(width) || 1200, - height: Number(height) || 800, - } - : undefined, - }; - - await page.screenshot(screenshotOptions); - - // Get the full height of the page - const fullHeight = await page.evaluate(() => { - return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight, document.body.offsetHeight, document.documentElement.offsetHeight, document.body.clientHeight, document.documentElement.clientHeight); - }); - - console.log(`Screenshot captured successfully with height: ${fullHeight}px`); - - // Return the URL to the screenshot - const screenshotUrl = `/files/images/webpage_${Date.now()}.png`; - res.json({ - screenshotUrl, - fullHeight, - }); - } catch (error: any) { - console.error('Error capturing screenshot:', error); - res.status(500).send({ - error: `Failed to capture screenshot: ${error.message}`, - details: error.stack, - }); - } finally { - // Ensure browser is closed to free resources - if (browser) { - try { - await browser.close(); - console.log('Browser closed successfully'); - } catch (error) { - console.error('Error closing browser:', error); - } - } - } - }, - }); - - // Register an endpoint to retrieve raw file content as plain text (no JSON parsing) - register({ - method: Method.POST, - subscription: '/getRawFileContent', - secureHandler: async ({ req, res }) => { - const { filepath } = req.body; - - if (!filepath) { - res.status(400).send('Filepath is required'); - return; - } - - try { - // Read the file content JSON file - const filePath = path.join(filesDirectory, Directory.vectorstore, 'file_content.json'); - - if (!fs.existsSync(filePath)) { - res.status(404).send('File content database not found'); - return; - } - - console.log(`[DEBUG] Retrieving raw content for: ${filepath}`); - - // Read the JSON file - const readStream = fs.createReadStream(filePath, { encoding: 'utf8' }); - let jsonData = ''; - - readStream.on('data', chunk => { - jsonData += chunk; - }); - - readStream.on('end', () 
=> { - try { - // Parse the JSON - const contentMap = JSON.parse(jsonData); - - // Check if the filepath exists in the map - if (!contentMap[filepath]) { - console.log(`[DEBUG] Content not found for: ${filepath}`); - res.status(404).send(`Content not found for filepath: ${filepath}`); - return; - } - - // Set content type to plain text to avoid JSON parsing - res.setHeader('Content-Type', 'text/plain'); - - // Return the file content as plain text - console.log(`[DEBUG] Found content for: ${filepath} (${contentMap[filepath].length} chars)`); - res.send(contentMap[filepath]); - } catch (parseError) { - console.error('Error parsing file_content.json:', parseError); - res.status(500).send('Failed to parse file content database'); - } - }); - - readStream.on('error', streamError => { - console.error('Error reading file_content.json:', streamError); - res.status(500).send('Failed to read file content database'); - }); - } catch (error) { - console.error('Error retrieving file content:', error); - res.status(500).send('Failed to retrieve file content'); - } - }, - }); } } @@ -1050,7 +696,7 @@ export default class AssistantManager extends ApiManager { * @param file_name The name of the file to process. * @param file_path The filepath of the file to process. */ -function spawnPythonProcess(jobId: string, file_path: string, doc_id: string) { +function spawnPythonProcess(jobId: string, file_path: string) { const venvPath = path.join(__dirname, '../chunker/venv'); const requirementsPath = path.join(__dirname, '../chunker/requirements.txt'); const pythonScriptPath = path.join(__dirname, '../chunker/pdf_chunker.py'); @@ -1060,7 +706,7 @@ function spawnPythonProcess(jobId: string, file_path: string, doc_id: string) { function runPythonScript() { const pythonPath = process.platform === 'win32' ? 
path.join(venvPath, 'Scripts', 'python') : path.join(venvPath, 'bin', 'python3'); - const pythonProcess = spawn(pythonPath, [pythonScriptPath, jobId, file_path, outputDirectory, doc_id]); + const pythonProcess = spawn(pythonPath, [pythonScriptPath, jobId, file_path, outputDirectory]); let pythonOutput = ''; let stderrOutput = ''; @@ -1135,6 +781,7 @@ function spawnPythonProcess(jobId: string, file_path: string, doc_id: string) { console.log('Virtual environment not found. Creating and setting up...'); // Create venv + // const createVenvProcess = spawn('python', ['-m', 'venv', venvPath]); const createVenvProcess = spawn('python3.10', ['-m', 'venv', venvPath]); createVenvProcess.on('close', code => { @@ -1183,121 +830,3 @@ function spawnPythonProcess(jobId: string, file_path: string, doc_id: string) { runPythonScript(); } } - -/** - * Enhanced content extraction that focuses on meaningful text content. - * @param html The HTML content to process - * @returns Extracted and cleaned text content - */ -async function extractEnhancedContent(html: string): Promise<string> { - try { - // Create DOM to extract content - const dom = new JSDOM(html, { runScripts: 'outside-only' }); - const document = dom.window.document; - - // Remove all non-content elements - const elementsToRemove = [ - 'script', - 'style', - 'iframe', - 'noscript', - 'svg', - 'canvas', - 'header', - 'footer', - 'nav', - 'aside', - 'form', - 'button', - 'input', - 'select', - 'textarea', - 'meta', - 'link', - 'img', - 'video', - 'audio', - '.ad', - '.ads', - '.advertisement', - '.banner', - '.cookie', - '.popup', - '.modal', - '.newsletter', - '[role="banner"]', - '[role="navigation"]', - '[role="complementary"]', - ]; - - elementsToRemove.forEach(selector => { - const elements = document.querySelectorAll(selector); - elements.forEach(el => el.remove()); - }); - - // Get all text paragraphs with meaningful content - const contentElements = [ - ...Array.from(document.querySelectorAll('p')), - 
...Array.from(document.querySelectorAll('h1')), - ...Array.from(document.querySelectorAll('h2')), - ...Array.from(document.querySelectorAll('h3')), - ...Array.from(document.querySelectorAll('h4')), - ...Array.from(document.querySelectorAll('h5')), - ...Array.from(document.querySelectorAll('h6')), - ...Array.from(document.querySelectorAll('li')), - ...Array.from(document.querySelectorAll('td')), - ...Array.from(document.querySelectorAll('article')), - ...Array.from(document.querySelectorAll('section')), - ...Array.from(document.querySelectorAll('div:not([class]):not([id])')), - ]; - - // Extract text from content elements that have meaningful text - let contentParts: string[] = []; - contentElements.forEach(el => { - const text = el.textContent?.trim(); - // Only include elements with substantial text (more than just a few characters) - if (text && text.length > 10 && !contentParts.includes(text)) { - contentParts.push(text); - } - }); - - // If no significant content found with selective approach, fallback to body - if (contentParts.length < 3) { - return document.body.textContent || ''; - } - - return contentParts.join('\n\n'); - } catch (error) { - console.error('Error extracting enhanced content:', error); - return 'Failed to extract content from the webpage.'; - } -} - -/** - * Cleans up extracted text to improve readability and focus on useful content. 
- * @param text The raw extracted text - * @returns Cleaned and formatted text - */ -function cleanupText(text: string): string { - if (!text) return ''; - - return ( - text - // Remove excessive whitespace and normalize line breaks - .replace(/\s+/g, ' ') - .replace(/\n\s*\n\s*\n+/g, '\n\n') - // Remove common boilerplate phrases - .replace(/cookie policy|privacy policy|terms of service|all rights reserved|copyright ©/gi, '') - // Remove email addresses - .replace(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g, '') - // Remove URLs - .replace(/https?:\/\/[^\s]+/g, '') - // Remove social media handles - .replace(/@[a-zA-Z0-9_]+/g, '') - // Clean up any remaining HTML tags that might have been missed - .replace(/<[^>]*>/g, '') - // Fix spacing issues after cleanup - .replace(/ +/g, ' ') - .trim() - ); -} |