path: root/src/server/ApiManagers/AssistantManager.ts
Diffstat (limited to 'src/server/ApiManagers/AssistantManager.ts')
-rw-r--r--  src/server/ApiManagers/AssistantManager.ts  510
1 file changed, 491 insertions(+), 19 deletions(-)
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts
index af25722a4..07c970a4e 100644
--- a/src/server/ApiManagers/AssistantManager.ts
+++ b/src/server/ApiManagers/AssistantManager.ts
@@ -39,6 +39,7 @@ export enum Directory {
csv = 'csv',
chunk_images = 'chunk_images',
scrape_images = 'scrape_images',
+ vectorstore = 'vectorstore',
}
// In-memory job tracking
@@ -92,6 +93,132 @@ export default class AssistantManager extends ApiManager {
const customsearch = google.customsearch('v1');
const openai = new OpenAI({ apiKey: env.OPENAI_API_KEY });
+ // Register an endpoint to retrieve file summaries from the file_summaries.json file
+ register({
+ method: Method.GET,
+ subscription: '/getFileSummaries',
+ secureHandler: async ({ req, res }) => {
+ try {
+ // Read the file summaries JSON file
+ const filePath = path.join(filesDirectory, Directory.vectorstore, 'file_summaries.json');
+
+ if (!fs.existsSync(filePath)) {
+ res.status(404).send({ error: 'File summaries not found' });
+ return;
+ }
+
+ const data = fs.readFileSync(filePath, 'utf8');
+ res.send(data);
+ } catch (error) {
+ console.error('Error retrieving file summaries:', error);
+ res.status(500).send({
+ error: 'Failed to retrieve file summaries',
+ });
+ }
+ },
+ });
+
+ // Register an endpoint to retrieve file names from the file_summaries.json file
+ register({
+ method: Method.GET,
+ subscription: '/getFileNames',
+ secureHandler: async ({ res }) => {
+ try {
+ const filePath = path.join(filesDirectory, Directory.vectorstore, 'file_summaries.json');
+
+ if (!fs.existsSync(filePath)) {
+ res.status(404).send({ error: 'File summaries not found' });
+ return;
+ }
+
+ // Parse once and return the top-level keys (the stored file names)
+ const data = fs.readFileSync(filePath, 'utf8');
+ res.send(Object.keys(JSON.parse(data)));
+ } catch (error) {
+ console.error('Error retrieving file names:', error);
+ res.status(500).send({ error: 'Failed to retrieve file names' });
+ }
+ },
+ });
+
+ // Register an endpoint to retrieve file content from the file_content.json file
+ register({
+ method: Method.POST,
+ subscription: '/getFileContent',
+ secureHandler: async ({ req, res }) => {
+ const { filepath } = req.body;
+
+ if (!filepath) {
+ res.status(400).send({ error: 'Filepath is required' });
+ return;
+ }
+
+ try {
+ // Read the file content JSON file
+ const filePath = path.join(filesDirectory, Directory.vectorstore, 'file_content.json');
+
+ if (!fs.existsSync(filePath)) {
+ res.status(404).send({ error: 'File content database not found' });
+ return;
+ }
+
+ console.log(`[DEBUG] Retrieving content for: ${filepath}`);
+
+ // Read the JSON file in chunks to handle large files
+ const readStream = fs.createReadStream(filePath, { encoding: 'utf8' });
+ let jsonData = '';
+
+ readStream.on('data', chunk => {
+ jsonData += chunk;
+ });
+
+ readStream.on('end', () => {
+ try {
+ // Parse the JSON
+ const contentMap = JSON.parse(jsonData);
+
+ // Check if the filepath exists in the map
+ if (!contentMap[filepath]) {
+ console.log(`[DEBUG] Content not found for: ${filepath}`);
+ res.status(404).send({ error: `Content not found for filepath: ${filepath}` });
+ return;
+ }
+
+ // Return the file content as is, not as JSON
+ console.log(`[DEBUG] Found content for: ${filepath} (${contentMap[filepath].length} chars)`);
+ res.send(contentMap[filepath]);
+ } catch (parseError) {
+ console.error('Error parsing file_content.json:', parseError);
+ res.status(500).send({
+ error: 'Failed to parse file content database',
+ });
+ }
+ });
+
+ readStream.on('error', streamError => {
+ console.error('Error reading file_content.json:', streamError);
+ res.status(500).send({
+ error: 'Failed to read file content database',
+ });
+ });
+ } catch (error) {
+ console.error('Error retrieving file content:', error);
+ res.status(500).send({
+ error: 'Failed to retrieve file content',
+ });
+ }
+ },
+ });
+
+ // Register an endpoint to search file summaries
+ register({
+ method: Method.POST,
+ subscription: '/searchFileSummaries',
+ secureHandler: async ({ req, res }) => {
+ const { query, topK } = req.body;
+
+ if (!query) {
+ res.status(400).send({ error: 'Search query is required' });
+ return;
+ }
+
+ // This endpoint will be called by the client-side Vectorstore to perform the search
+ // The actual search is implemented in the Vectorstore class
+
+ res.send({ message: 'This endpoint should be called through the Vectorstore class' });
+ },
+ });
+
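For orientation, here is a minimal client-side sketch of how these endpoints might be called (helper names are hypothetical; plain fetch is used, with the response shapes taken from the handlers above: /getFileNames sends a JSON array of keys, while /getFileContent sends the stored string as-is):

// Hypothetical client-side usage of the endpoints above; not part of this commit.
async function fetchFileNames(): Promise<string[]> {
    const res = await fetch('/getFileNames');
    if (!res.ok) throw new Error(`getFileNames failed: ${res.status}`);
    return res.json(); // the handler sends Object.keys(...) of file_summaries.json
}

async function fetchFileContent(filepath: string): Promise<string> {
    const res = await fetch('/getFileContent', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ filepath }),
    });
    if (!res.ok) throw new Error(`getFileContent failed: ${res.status}`);
    return res.text(); // the content string is sent directly, not wrapped in JSON
}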
// Register Wikipedia summary API route
register({
method: Method.POST,
@@ -485,36 +612,76 @@ export default class AssistantManager extends ApiManager {
subscription: '/scrapeWebsite',
secureHandler: async ({ req, res }) => {
const { url } = req.body;
+ let browser = null;
try {
+ // Set a longer timeout for slow-loading pages
+ const navigationTimeout = 60000; // 60 seconds
+
// Launch Puppeteer browser to navigate to the webpage
- const browser = await puppeteer.launch({
- args: ['--no-sandbox', '--disable-setuid-sandbox'],
+ browser = await puppeteer.launch({
+ args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
});
const page = await browser.newPage();
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36');
- await page.goto(url, { waitUntil: 'networkidle2' });
+
+ // Set timeout for navigation
+ page.setDefaultNavigationTimeout(navigationTimeout);
+
+ // Navigate with timeout and wait for content to load
+ await page.goto(url, {
+ waitUntil: 'networkidle2',
+ timeout: navigationTimeout,
+ });
+
+ // Wait a bit longer to ensure dynamic content loads
+ await new Promise(resolve => setTimeout(resolve, 2000));
// Extract HTML content
const htmlContent = await page.content();
await browser.close();
+ browser = null;
- // Parse HTML content using JSDOM
- const dom = new JSDOM(htmlContent, { url });
+ let extractedText = '';
- // Extract readable content using Mozilla's Readability API
- const reader = new Readability(dom.window.document);
- const article = reader.parse();
+ // First try with Readability
+ try {
+ // Parse HTML content using JSDOM
+ const dom = new JSDOM(htmlContent, { url });
+
+ // Extract readable content using Mozilla's Readability API
+ const reader = new Readability(dom.window.document, {
+ // Readability configuration to focus on text content
+ charThreshold: 100,
+ keepClasses: false,
+ });
+ const article = reader.parse();
- if (article) {
- const plainText = article.textContent;
- res.send({ website_plain_text: plainText });
- } else {
- res.status(500).send({ error: 'Failed to extract readable content' });
+ if (article && article.textContent) {
+ extractedText = article.textContent;
+ } else {
+ // If Readability doesn't return useful content, try alternate method
+ extractedText = await extractEnhancedContent(htmlContent);
+ }
+ } catch (parsingError) {
+ console.error('Error parsing website content with Readability:', parsingError);
+ // Fallback to enhanced content extraction
+ extractedText = await extractEnhancedContent(htmlContent);
}
+
+ // Clean up the extracted text
+ extractedText = cleanupText(extractedText);
+
+ res.send({ website_plain_text: extractedText });
} catch (error) {
console.error('Error scraping website:', error);
+
+ // Clean up browser if still open
+ if (browser) {
+ await browser.close().catch(e => console.error('Error closing browser:', e));
+ }
+
res.status(500).send({
- error: 'Failed to scrape website',
+ error: 'Failed to scrape website: ' + ((error as Error).message || 'Unknown error'),
});
}
},
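Condensed, the rewritten handler is a three-stage pipeline: Readability first, the extractEnhancedContent fallback when Readability fails or returns nothing, and cleanupText at the end. A sketch of that flow, using the helpers defined at the bottom of this file:

// Sketch of the extraction pipeline above: Readability -> enhanced-content fallback -> cleanup.
async function extractReadableText(htmlContent: string, url: string): Promise<string> {
    let text = '';
    try {
        const dom = new JSDOM(htmlContent, { url });
        const article = new Readability(dom.window.document, { charThreshold: 100, keepClasses: false }).parse();
        text = article?.textContent || (await extractEnhancedContent(htmlContent));
    } catch {
        text = await extractEnhancedContent(htmlContent); // fallback when Readability throws
    }
    return cleanupText(text);
}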
@@ -526,7 +693,7 @@ export default class AssistantManager extends ApiManager {
method: Method.POST,
subscription: '/createDocument',
secureHandler: async ({ req, res }) => {
- const { file_path } = req.body;
+ const { file_path, doc_id } = req.body;
const public_path = path.join(publicDirectory, file_path); // Resolve the file path in the public directory
const file_name = path.basename(file_path); // Extract the file name from the path
@@ -539,7 +706,7 @@ export default class AssistantManager extends ApiManager {
// Spawn the Python process and track its progress/output
// eslint-disable-next-line no-use-before-define
- spawnPythonProcess(jobId, public_path);
+ spawnPythonProcess(jobId, public_path, doc_id);
// Send the job ID back to the client for tracking
res.send({ jobId });
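A hedged sketch of the matching client-side call: the request now carries the document id alongside the file path, and the returned jobId is used to poll progress (helper name hypothetical):

// Hypothetical client-side call; the handler above reads file_path and doc_id and replies with { jobId }.
async function startDocumentChunking(file_path: string, doc_id: string): Promise<string> {
    const res = await fetch('/createDocument', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ file_path, doc_id }),
    });
    if (!res.ok) throw new Error(`createDocument failed: ${res.status}`);
    const { jobId } = await res.json();
    return jobId; // poll the job-tracking endpoints with this id
}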
@@ -687,6 +854,193 @@ export default class AssistantManager extends ApiManager {
}
},
});
+
+ // Register an API route to capture a screenshot of a webpage using Puppeteer
+ // and return the image URL for display in the WebBox component
+ register({
+ method: Method.POST,
+ subscription: '/captureWebScreenshot',
+ secureHandler: async ({ req, res }) => {
+ const { url, width, height, fullPage } = req.body;
+
+ if (!url) {
+ res.status(400).send({ error: 'URL is required' });
+ return;
+ }
+
+ let browser = null;
+ try {
+ // Increase timeout for websites that load slowly
+ const navigationTimeout = 60000; // 60 seconds
+
+ // Launch a headless browser with additional options to improve stability
+ browser = await puppeteer.launch({
+ headless: true, // Use headless mode
+ args: [
+ '--no-sandbox',
+ '--disable-setuid-sandbox',
+ '--disable-dev-shm-usage',
+ '--disable-accelerated-2d-canvas',
+ '--disable-gpu',
+ '--window-size=1200,800',
+ '--disable-web-security', // Helps with cross-origin issues
+ '--disable-features=IsolateOrigins,site-per-process', // Helps with frames
+ ],
+ timeout: navigationTimeout,
+ });
+
+ const page = await browser.newPage();
+
+ // Set a larger viewport to capture more content
+ await page.setViewport({
+ width: Number(width) || 1200,
+ height: Number(height) || 800,
+ deviceScaleFactor: 1,
+ });
+
+ // Enable request interception to speed up page loading
+ await page.setRequestInterception(true);
+ page.on('request', request => {
+ // Skip unnecessary resources to speed up loading
+ const resourceType = request.resourceType();
+ if (resourceType === 'font' || resourceType === 'media' || resourceType === 'websocket' || request.url().includes('analytics') || request.url().includes('tracker')) {
+ request.abort();
+ } else {
+ request.continue();
+ }
+ });
+
+ // Set navigation and timeout options
+ console.log(`Navigating to URL: ${url}`);
+
+ // Navigate to the URL and wait for the page to load
+ await page.goto(url, {
+ waitUntil: ['networkidle2'],
+ timeout: navigationTimeout,
+ });
+
+ // Wait for a short delay after navigation to allow content to render
+ await new Promise(resolve => setTimeout(resolve, 2000));
+
+ // Take a screenshot
+ console.log('Taking screenshot...');
+ const timestamp = Date.now();
+ const screenshotPath = `./src/server/public/files/images/webpage_${timestamp}.png`;
+ const screenshotOptions = {
+ path: screenshotPath,
+ fullPage: fullPage === true,
+ omitBackground: false,
+ type: 'png' as const,
+ clip:
+ fullPage !== true
+ ? {
+ x: 0,
+ y: 0,
+ width: Number(width) || 1200,
+ height: Number(height) || 800,
+ }
+ : undefined,
+ };
+
+ await page.screenshot(screenshotOptions);
+
+ // Get the full height of the page
+ const fullHeight = await page.evaluate(() => {
+ return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight, document.body.offsetHeight, document.documentElement.offsetHeight, document.body.clientHeight, document.documentElement.clientHeight);
+ });
+
+ console.log(`Screenshot captured successfully with height: ${fullHeight}px`);
+
+ // Return the URL to the screenshot
+ const screenshotUrl = `/files/images/webpage_${timestamp}.png`; // reuse the timestamp so the URL matches the file saved above
+ res.json({
+ screenshotUrl,
+ fullHeight,
+ });
+ } catch (error: any) {
+ console.error('Error capturing screenshot:', error);
+ res.status(500).send({
+ error: `Failed to capture screenshot: ${error.message}`,
+ details: error.stack,
+ });
+ } finally {
+ // Ensure browser is closed to free resources
+ if (browser) {
+ try {
+ await browser.close();
+ console.log('Browser closed successfully');
+ } catch (error) {
+ console.error('Error closing browser:', error);
+ }
+ }
+ }
+ },
+ });
+
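A minimal round-trip sketch from the client's perspective, assuming the WebBox simply drops the returned URL into an img element (names hypothetical):

// Hypothetical client-side sketch: request a capture, then display the saved image.
async function showScreenshot(url: string): Promise<HTMLImageElement> {
    const res = await fetch('/captureWebScreenshot', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ url, width: 1200, height: 800, fullPage: false }),
    });
    if (!res.ok) throw new Error(`captureWebScreenshot failed: ${res.status}`);
    const { screenshotUrl, fullHeight } = await res.json();
    const img = document.createElement('img');
    img.src = screenshotUrl; // served from the public files directory
    img.dataset.fullHeight = String(fullHeight); // full page height, if the caller needs it
    return img;
}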
+ // Register an endpoint to retrieve raw file content as plain text (no JSON parsing)
+ register({
+ method: Method.POST,
+ subscription: '/getRawFileContent',
+ secureHandler: async ({ req, res }) => {
+ const { filepath } = req.body;
+
+ if (!filepath) {
+ res.status(400).send('Filepath is required');
+ return;
+ }
+
+ try {
+ // Read the file content JSON file
+ const filePath = path.join(filesDirectory, Directory.vectorstore, 'file_content.json');
+
+ if (!fs.existsSync(filePath)) {
+ res.status(404).send('File content database not found');
+ return;
+ }
+
+ console.log(`[DEBUG] Retrieving raw content for: ${filepath}`);
+
+ // Read the JSON file
+ const readStream = fs.createReadStream(filePath, { encoding: 'utf8' });
+ let jsonData = '';
+
+ readStream.on('data', chunk => {
+ jsonData += chunk;
+ });
+
+ readStream.on('end', () => {
+ try {
+ // Parse the JSON
+ const contentMap = JSON.parse(jsonData);
+
+ // Check if the filepath exists in the map
+ if (!contentMap[filepath]) {
+ console.log(`[DEBUG] Content not found for: ${filepath}`);
+ res.status(404).send(`Content not found for filepath: ${filepath}`);
+ return;
+ }
+
+ // Set content type to plain text to avoid JSON parsing
+ res.setHeader('Content-Type', 'text/plain');
+
+ // Return the file content as plain text
+ console.log(`[DEBUG] Found content for: ${filepath} (${contentMap[filepath].length} chars)`);
+ res.send(contentMap[filepath]);
+ } catch (parseError) {
+ console.error('Error parsing file_content.json:', parseError);
+ res.status(500).send('Failed to parse file content database');
+ }
+ });
+
+ readStream.on('error', streamError => {
+ console.error('Error reading file_content.json:', streamError);
+ res.status(500).send('Failed to read file content database');
+ });
+ } catch (error) {
+ console.error('Error retrieving file content:', error);
+ res.status(500).send('Failed to retrieve file content');
+ }
+ },
+ });
}
}
@@ -696,7 +1050,7 @@ export default class AssistantManager extends ApiManager {
* @param jobId The ID used to track the job's progress.
* @param file_path The filepath of the file to process.
* @param doc_id The ID of the document the chunking output belongs to.
*/
-function spawnPythonProcess(jobId: string, file_path: string) {
+function spawnPythonProcess(jobId: string, file_path: string, doc_id: string) {
const venvPath = path.join(__dirname, '../chunker/venv');
const requirementsPath = path.join(__dirname, '../chunker/requirements.txt');
const pythonScriptPath = path.join(__dirname, '../chunker/pdf_chunker.py');
@@ -706,7 +1060,7 @@ function spawnPythonProcess(jobId: string, file_path: string) {
function runPythonScript() {
const pythonPath = process.platform === 'win32' ? path.join(venvPath, 'Scripts', 'python') : path.join(venvPath, 'bin', 'python3');
- const pythonProcess = spawn(pythonPath, [pythonScriptPath, jobId, file_path, outputDirectory]);
+ const pythonProcess = spawn(pythonPath, [pythonScriptPath, jobId, file_path, outputDirectory, doc_id]);
let pythonOutput = '';
let stderrOutput = '';
@@ -781,7 +1135,7 @@ function spawnPythonProcess(jobId: string, file_path: string) {
console.log('Virtual environment not found. Creating and setting up...');
// Create venv
- const createVenvProcess = spawn('python', ['-m', 'venv', venvPath]);
+ const createVenvProcess = spawn('python3.10', ['-m', 'venv', venvPath]);
createVenvProcess.on('close', code => {
if (code !== 0) {
@@ -829,3 +1183,121 @@ function spawnPythonProcess(jobId: string, file_path: string) {
runPythonScript();
}
}
+
+/**
+ * Enhanced content extraction that focuses on meaningful text content.
+ * @param html The HTML content to process
+ * @returns Extracted and cleaned text content
+ */
+async function extractEnhancedContent(html: string): Promise<string> {
+ try {
+ // Create DOM to extract content
+ const dom = new JSDOM(html, { runScripts: 'outside-only' });
+ const document = dom.window.document;
+
+ // Remove all non-content elements
+ const elementsToRemove = [
+ 'script',
+ 'style',
+ 'iframe',
+ 'noscript',
+ 'svg',
+ 'canvas',
+ 'header',
+ 'footer',
+ 'nav',
+ 'aside',
+ 'form',
+ 'button',
+ 'input',
+ 'select',
+ 'textarea',
+ 'meta',
+ 'link',
+ 'img',
+ 'video',
+ 'audio',
+ '.ad',
+ '.ads',
+ '.advertisement',
+ '.banner',
+ '.cookie',
+ '.popup',
+ '.modal',
+ '.newsletter',
+ '[role="banner"]',
+ '[role="navigation"]',
+ '[role="complementary"]',
+ ];
+
+ elementsToRemove.forEach(selector => {
+ const elements = document.querySelectorAll(selector);
+ elements.forEach(el => el.remove());
+ });
+
+ // Get all text paragraphs with meaningful content
+ const contentElements = [
+ ...Array.from(document.querySelectorAll('p')),
+ ...Array.from(document.querySelectorAll('h1')),
+ ...Array.from(document.querySelectorAll('h2')),
+ ...Array.from(document.querySelectorAll('h3')),
+ ...Array.from(document.querySelectorAll('h4')),
+ ...Array.from(document.querySelectorAll('h5')),
+ ...Array.from(document.querySelectorAll('h6')),
+ ...Array.from(document.querySelectorAll('li')),
+ ...Array.from(document.querySelectorAll('td')),
+ ...Array.from(document.querySelectorAll('article')),
+ ...Array.from(document.querySelectorAll('section')),
+ ...Array.from(document.querySelectorAll('div:not([class]):not([id])')),
+ ];
+
+ // Extract text from content elements that have meaningful text
+ const contentParts: string[] = [];
+ contentElements.forEach(el => {
+ const text = el.textContent?.trim();
+ // Only include elements with substantial text (more than just a few characters)
+ if (text && text.length > 10 && !contentParts.includes(text)) {
+ contentParts.push(text);
+ }
+ });
+
+ // If no significant content found with selective approach, fallback to body
+ if (contentParts.length < 3) {
+ return document.body.textContent || '';
+ }
+
+ return contentParts.join('\n\n');
+ } catch (error) {
+ console.error('Error extracting enhanced content:', error);
+ return 'Failed to extract content from the webpage.';
+ }
+}
+
+/**
+ * Cleans up extracted text to improve readability and focus on useful content.
+ * @param text The raw extracted text
+ * @returns Cleaned and formatted text
+ */
+function cleanupText(text: string): string {
+ if (!text) return '';
+
+ return (
+ text
+ // Collapse runs of spaces and tabs, then collapse 3+ consecutive line breaks into one paragraph break.
+ // (Collapsing all whitespace with /\s+/ first would remove every newline, leaving the next pattern nothing to match.)
+ .replace(/[ \t]+/g, ' ')
+ .replace(/\n\s*\n\s*\n+/g, '\n\n')
+ // Remove common boilerplate phrases
+ .replace(/cookie policy|privacy policy|terms of service|all rights reserved|copyright ©/gi, '')
+ // Remove email addresses
+ .replace(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g, '')
+ // Remove URLs
+ .replace(/https?:\/\/[^\s]+/g, '')
+ // Remove social media handles
+ .replace(/@[a-zA-Z0-9_]+/g, '')
+ // Clean up any remaining HTML tags that might have been missed
+ .replace(/<[^>]*>/g, '')
+ // Fix spacing issues after cleanup
+ .replace(/ +/g, ' ')
+ .trim()
+ );
+}