aboutsummaryrefslogtreecommitdiff
path: root/src/server/ApiManagers/AssistantManager.ts
diff options
context:
space:
mode:
Diffstat (limited to 'src/server/ApiManagers/AssistantManager.ts')
-rw-r--r--src/server/ApiManagers/AssistantManager.ts229
1 files changed, 215 insertions, 14 deletions
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts
index af25722a4..6d2779163 100644
--- a/src/server/ApiManagers/AssistantManager.ts
+++ b/src/server/ApiManagers/AssistantManager.ts
@@ -485,36 +485,69 @@ export default class AssistantManager extends ApiManager {
subscription: '/scrapeWebsite',
secureHandler: async ({ req, res }) => {
    const { url } = req.body;

    // Reject requests without a target URL up front (consistent with /captureWebScreenshot).
    if (!url) {
        res.status(400).send({ error: 'URL is required' });
        return;
    }

    let browser = null;
    try {
        // Allow slow-loading pages up to 60 seconds before giving up.
        const navigationTimeout = 60000; // 60 seconds

        // Launch Puppeteer browser to navigate to the webpage.
        // --disable-dev-shm-usage avoids /dev/shm exhaustion in containerized deployments.
        browser = await puppeteer.launch({
            args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
        });
        const page = await browser.newPage();
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36');

        // Set timeout for navigation.
        page.setDefaultNavigationTimeout(navigationTimeout);

        // Navigate with timeout; networkidle2 treats navigation as done once the network is mostly quiet.
        await page.goto(url, {
            waitUntil: 'networkidle2',
            timeout: navigationTimeout,
        });

        // Wait a bit longer to ensure dynamic (client-rendered) content loads.
        await new Promise(resolve => setTimeout(resolve, 2000));

        // Extract HTML content; the browser is no longer needed after this, so release it early.
        const htmlContent = await page.content();
        await browser.close();
        browser = null;

        // Use a try-catch block specifically for JSDOM/Readability parsing so a parser
        // failure can fall back to the heuristic extractor instead of a 500.
        try {
            // Parse HTML content using JSDOM.
            const dom = new JSDOM(htmlContent, { url });

            // Extract readable content using Mozilla's Readability API.
            const reader = new Readability(dom.window.document);
            const article = reader.parse();

            if (article) {
                res.send({ website_plain_text: article.textContent });
            } else {
                // If Readability fails, fall back to extracting main content heuristically.
                const mainContent = await extractMainContent(htmlContent);
                res.send({ website_plain_text: mainContent });
            }
        } catch (parsingError) {
            console.error('Error parsing website content:', parsingError);

            // Fallback to a simplified extraction method (extractMainContent never throws).
            const mainContent = await extractMainContent(htmlContent);
            res.send({ website_plain_text: mainContent });
        }
    } catch (error) {
        console.error('Error scraping website:', error);
        res.status(500).send({
            error: 'Failed to scrape website: ' + ((error as Error).message || 'Unknown error'),
        });
    } finally {
        // Ensure the browser is released on every path (navigation failure, send failure, …),
        // not only inside the catch block as before.
        if (browser) {
            await browser.close().catch(e => console.error('Error closing browser:', e));
        }
    }
},
@@ -687,6 +720,127 @@ export default class AssistantManager extends ApiManager {
}
},
});
+
+ // Register an API route to capture a screenshot of a webpage using Puppeteer
+ // and return the image URL for display in the WebBox component
+ register({
+ method: Method.POST,
+ subscription: '/captureWebScreenshot',
+ secureHandler: async ({ req, res }) => {
+ const { url, width, height, fullPage } = req.body;
+
+ if (!url) {
+ res.status(400).send({ error: 'URL is required' });
+ return;
+ }
+
+ let browser = null;
+ try {
+ // Increase timeout for websites that load slowly
+ const navigationTimeout = 60000; // 60 seconds
+
+ // Launch a headless browser with additional options to improve stability
+ browser = await puppeteer.launch({
+ headless: true, // Use headless mode
+ args: [
+ '--no-sandbox',
+ '--disable-setuid-sandbox',
+ '--disable-dev-shm-usage',
+ '--disable-accelerated-2d-canvas',
+ '--disable-gpu',
+ '--window-size=1200,800',
+ '--disable-web-security', // Helps with cross-origin issues
+ '--disable-features=IsolateOrigins,site-per-process', // Helps with frames
+ ],
+ timeout: navigationTimeout,
+ });
+
+ const page = await browser.newPage();
+
+ // Set a larger viewport to capture more content
+ await page.setViewport({
+ width: Number(width) || 1200,
+ height: Number(height) || 800,
+ deviceScaleFactor: 1,
+ });
+
+ // Enable request interception to speed up page loading
+ await page.setRequestInterception(true);
+ page.on('request', request => {
+ // Skip unnecessary resources to speed up loading
+ const resourceType = request.resourceType();
+ if (resourceType === 'font' || resourceType === 'media' || resourceType === 'websocket' || request.url().includes('analytics') || request.url().includes('tracker')) {
+ request.abort();
+ } else {
+ request.continue();
+ }
+ });
+
+ // Set navigation and timeout options
+ console.log(`Navigating to URL: ${url}`);
+
+ // Navigate to the URL and wait for the page to load
+ await page.goto(url, {
+ waitUntil: ['networkidle2'],
+ timeout: navigationTimeout,
+ });
+
+ // Wait for a short delay after navigation to allow content to render
+ await new Promise(resolve => setTimeout(resolve, 2000));
+
+ // Take a screenshot
+ console.log('Taking screenshot...');
+ const screenshotPath = `./src/server/public/files/images/webpage_${Date.now()}.png`;
+ const screenshotOptions = {
+ path: screenshotPath,
+ fullPage: fullPage === true,
+ omitBackground: false,
+ type: 'png' as 'png',
+ clip:
+ fullPage !== true
+ ? {
+ x: 0,
+ y: 0,
+ width: Number(width) || 1200,
+ height: Number(height) || 800,
+ }
+ : undefined,
+ };
+
+ await page.screenshot(screenshotOptions);
+
+ // Get the full height of the page
+ const fullHeight = await page.evaluate(() => {
+ return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight, document.body.offsetHeight, document.documentElement.offsetHeight, document.body.clientHeight, document.documentElement.clientHeight);
+ });
+
+ console.log(`Screenshot captured successfully with height: ${fullHeight}px`);
+
+ // Return the URL to the screenshot
+ const screenshotUrl = `/files/images/webpage_${Date.now()}.png`;
+ res.json({
+ screenshotUrl,
+ fullHeight,
+ });
+ } catch (error: any) {
+ console.error('Error capturing screenshot:', error);
+ res.status(500).send({
+ error: `Failed to capture screenshot: ${error.message}`,
+ details: error.stack,
+ });
+ } finally {
+ // Ensure browser is closed to free resources
+ if (browser) {
+ try {
+ await browser.close();
+ console.log('Browser closed successfully');
+ } catch (error) {
+ console.error('Error closing browser:', error);
+ }
+ }
+ }
+ },
+ });
}
}
@@ -829,3 +983,50 @@ function spawnPythonProcess(jobId: string, file_path: string) {
runPythonScript();
}
}
+
+/**
+ * Extracts main content from HTML by removing scripts, styles, and non-content elements
+ * Used as a fallback when Readability fails
+ * @param html The HTML content to process
+ * @returns Extracted main text content
+ */
+async function extractMainContent(html: string): Promise<string> {
+ try {
+ // Create a simple DOM to extract content
+ const dom = new JSDOM(html, { runScripts: 'outside-only' });
+ const document = dom.window.document;
+
+ // Remove scripts, styles, and other non-content elements
+ const elementsToRemove = ['script', 'style', 'iframe', 'noscript', 'svg', 'header', 'footer', 'nav', 'aside', 'ads', 'banner', 'form', 'button', 'input'];
+
+ elementsToRemove.forEach(tag => {
+ const elements = document.querySelectorAll(tag);
+ elements.forEach(el => el.remove());
+ });
+
+ // Try to find the main content container using common selectors
+ const mainSelectors = ['main', 'article', '#content', '.content', '#main', '.main', '.post-content', '.article-content', '.entry-content'];
+
+ let mainContent = '';
+
+ // Try each selector to find main content
+ for (const selector of mainSelectors) {
+ const element = document.querySelector(selector);
+ if (element && element.textContent && element.textContent.trim().length > 100) {
+ mainContent = element.textContent;
+ break;
+ }
+ }
+
+ // If no main content found with selectors, use body content
+ if (!mainContent || mainContent.length < 200) {
+ mainContent = document.body.textContent || '';
+ }
+
+ // Clean up the text
+ return mainContent.replace(/\s+/g, ' ').replace(/\n+/g, '\n').trim();
+ } catch (error) {
+ console.error('Error extracting main content:', error);
+ return 'Failed to extract content from the webpage.';
+ }
+}