diff options
Diffstat (limited to 'src/server/ApiManagers/AssistantManager.ts')
-rw-r--r-- | src/server/ApiManagers/AssistantManager.ts | 229 |
1 files changed, 215 insertions, 14 deletions
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts index af25722a4..6d2779163 100644 --- a/src/server/ApiManagers/AssistantManager.ts +++ b/src/server/ApiManagers/AssistantManager.ts @@ -485,36 +485,69 @@ export default class AssistantManager extends ApiManager { subscription: '/scrapeWebsite', secureHandler: async ({ req, res }) => { const { url } = req.body; + let browser = null; try { + // Set a longer timeout for slow-loading pages + const navigationTimeout = 60000; // 60 seconds + // Launch Puppeteer browser to navigate to the webpage - const browser = await puppeteer.launch({ - args: ['--no-sandbox', '--disable-setuid-sandbox'], + browser = await puppeteer.launch({ + args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'], }); const page = await browser.newPage(); await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'); - await page.goto(url, { waitUntil: 'networkidle2' }); + + // Set timeout for navigation + page.setDefaultNavigationTimeout(navigationTimeout); + + // Navigate with timeout and wait for content to load + await page.goto(url, { + waitUntil: 'networkidle2', + timeout: navigationTimeout, + }); + + // Wait a bit longer to ensure dynamic content loads + await new Promise(resolve => setTimeout(resolve, 2000)); // Extract HTML content const htmlContent = await page.content(); await browser.close(); + browser = null; - // Parse HTML content using JSDOM - const dom = new JSDOM(htmlContent, { url }); + // Use a try-catch block specifically for JSDOM parsing + try { + // Parse HTML content using JSDOM + const dom = new JSDOM(htmlContent, { url }); - // Extract readable content using Mozilla's Readability API - const reader = new Readability(dom.window.document); - const article = reader.parse(); + // Extract readable content using Mozilla's Readability API + const reader = new Readability(dom.window.document); + const article = reader.parse(); - if (article) { - const plainText = article.textContent; - res.send({ website_plain_text: plainText }); - } else { - res.status(500).send({ error: 'Failed to extract readable content' }); + if (article) { + const plainText = article.textContent; + res.send({ website_plain_text: plainText }); + } else { + // If Readability fails, fallback to extracting main content + const mainContent = await extractMainContent(htmlContent); + res.send({ website_plain_text: mainContent }); + } + } catch (parsingError) { + console.error('Error parsing website content:', parsingError); + + // Fallback to a simplified extraction method + const mainContent = await extractMainContent(htmlContent); + res.send({ website_plain_text: mainContent }); } } catch (error) { console.error('Error scraping website:', error); + + // Clean up browser if still open + if (browser) { + await browser.close().catch(e => console.error('Error closing browser:', e)); + } + res.status(500).send({ - error: 'Failed to scrape website', + error: 'Failed to scrape website: ' + ((error as Error).message || 'Unknown error'), }); } }, @@ -687,6 +720,127 @@ export default class AssistantManager extends ApiManager { } }, }); + + // Register an API route to capture a screenshot of a webpage using Puppeteer + // and return the image URL for display in the WebBox component + register({ + method: Method.POST, + subscription: '/captureWebScreenshot', + secureHandler: async ({ req, res }) => { + const { url, width, height, fullPage } = req.body; + + if (!url) { + res.status(400).send({ error: 'URL is required' }); + return; + } + + let browser = null; + try { + // Increase timeout for websites that load slowly + const navigationTimeout = 60000; // 60 seconds + + // Launch a headless browser with additional options to improve stability + browser = await puppeteer.launch({ + headless: true, // Use headless mode + args: [ + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', + '--disable-accelerated-2d-canvas', + '--disable-gpu', + '--window-size=1200,800', + '--disable-web-security', // Helps with cross-origin issues + '--disable-features=IsolateOrigins,site-per-process', // Helps with frames + ], + timeout: navigationTimeout, + }); + + const page = await browser.newPage(); + + // Set a larger viewport to capture more content + await page.setViewport({ + width: Number(width) || 1200, + height: Number(height) || 800, + deviceScaleFactor: 1, + }); + + // Enable request interception to speed up page loading + await page.setRequestInterception(true); + page.on('request', request => { + // Skip unnecessary resources to speed up loading + const resourceType = request.resourceType(); + if (resourceType === 'font' || resourceType === 'media' || resourceType === 'websocket' || request.url().includes('analytics') || request.url().includes('tracker')) { + request.abort(); + } else { + request.continue(); + } + }); + + // Set navigation and timeout options + console.log(`Navigating to URL: ${url}`); + + // Navigate to the URL and wait for the page to load + await page.goto(url, { + waitUntil: ['networkidle2'], + timeout: navigationTimeout, + }); + + // Wait for a short delay after navigation to allow content to render + await new Promise(resolve => setTimeout(resolve, 2000)); + + // Take a screenshot + console.log('Taking screenshot...'); + const screenshotPath = `./src/server/public/files/images/webpage_${Date.now()}.png`; + const screenshotOptions = { + path: screenshotPath, + fullPage: fullPage === true, + omitBackground: false, + type: 'png' as 'png', + clip: + fullPage !== true + ? { + x: 0, + y: 0, + width: Number(width) || 1200, + height: Number(height) || 800, + } + : undefined, + }; + + await page.screenshot(screenshotOptions); + + // Get the full height of the page + const fullHeight = await page.evaluate(() => { + return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight, document.body.offsetHeight, document.documentElement.offsetHeight, document.body.clientHeight, document.documentElement.clientHeight); + }); + + console.log(`Screenshot captured successfully with height: ${fullHeight}px`); + + // Return the URL to the screenshot + const screenshotUrl = `/files/images/webpage_${Date.now()}.png`; + res.json({ + screenshotUrl, + fullHeight, + }); + } catch (error: any) { + console.error('Error capturing screenshot:', error); + res.status(500).send({ + error: `Failed to capture screenshot: ${error.message}`, + details: error.stack, + }); + } finally { + // Ensure browser is closed to free resources + if (browser) { + try { + await browser.close(); + console.log('Browser closed successfully'); + } catch (error) { + console.error('Error closing browser:', error); + } + } + } + }, + }); } } @@ -829,3 +983,50 @@ function spawnPythonProcess(jobId: string, file_path: string) { runPythonScript(); } } + +/** + * Extracts main content from HTML by removing scripts, styles, and non-content elements + * Used as a fallback when Readability fails + * @param html The HTML content to process + * @returns Extracted main text content + */ +async function extractMainContent(html: string): Promise<string> { + try { + // Create a simple DOM to extract content + const dom = new JSDOM(html, { runScripts: 'outside-only' }); + const document = dom.window.document; + + // Remove scripts, styles, and other non-content elements + const elementsToRemove = ['script', 'style', 'iframe', 'noscript', 'svg', 'header', 'footer', 'nav', 'aside', 'ads', 'banner', 'form', 'button', 'input']; + + elementsToRemove.forEach(tag => { + const elements = document.querySelectorAll(tag); + elements.forEach(el => el.remove()); + }); + + // Try to find the main content container using common selectors + const mainSelectors = ['main', 'article', '#content', '.content', '#main', '.main', '.post-content', '.article-content', '.entry-content']; + + let mainContent = ''; + + // Try each selector to find main content + for (const selector of mainSelectors) { + const element = document.querySelector(selector); + if (element && element.textContent && element.textContent.trim().length > 100) { + mainContent = element.textContent; + break; + } + } + + // If no main content found with selectors, use body content + if (!mainContent || mainContent.length < 200) { + mainContent = document.body.textContent || ''; + } + + // Clean up the text + return mainContent.replace(/\s+/g, ' ').replace(/\n+/g, '\n').trim(); + } catch (error) { + console.error('Error extracting main content:', error); + return 'Failed to extract content from the webpage.'; + } +} |