about summary refs log tree commit diff
path: root/src/server/ApiManagers/AssistantManager.ts
diff options
context:
space:
mode:
Diffstat (limited to 'src/server/ApiManagers/AssistantManager.ts')
-rw-r--r-- src/server/ApiManagers/AssistantManager.ts | 515
1 files changed, 22 insertions, 493 deletions
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts
index 07c970a4e..b917f555c 100644
--- a/src/server/ApiManagers/AssistantManager.ts
+++ b/src/server/ApiManagers/AssistantManager.ts
@@ -39,7 +39,6 @@ export enum Directory {
csv = 'csv',
chunk_images = 'chunk_images',
scrape_images = 'scrape_images',
- vectorstore = 'vectorstore',
}
// In-memory job tracking
@@ -93,132 +92,6 @@ export default class AssistantManager extends ApiManager {
const customsearch = google.customsearch('v1');
const openai = new OpenAI({ apiKey: env.OPENAI_API_KEY });
- // Register an endpoint to retrieve file summaries from the json file
- register({
- method: Method.GET,
- subscription: '/getFileSummaries',
- secureHandler: async ({ req, res }) => {
- try {
- // Read the file summaries JSON file
- const filePath = path.join(filesDirectory, Directory.vectorstore, 'file_summaries.json');
-
- if (!fs.existsSync(filePath)) {
- res.status(404).send({ error: 'File summaries not found' });
- return;
- }
-
- const data = fs.readFileSync(filePath, 'utf8');
- res.send(data);
- } catch (error) {
- console.error('Error retrieving file summaries:', error);
- res.status(500).send({
- error: 'Failed to retrieve file summaries',
- });
- }
- },
- });
-
- // Register an endpoint to retrieve file names from the file_summaries.json file
- register({
- method: Method.GET,
- subscription: '/getFileNames',
- secureHandler: async ({ res }) => {
- const filePath = path.join(filesDirectory, Directory.vectorstore, 'file_summaries.json');
- const data = fs.readFileSync(filePath, 'utf8');
- console.log(Object.keys(JSON.parse(data)));
-
- res.send(Object.keys(JSON.parse(data)));
- },
- });
-
- // Register an endpoint to retrieve file content from the content json file
- register({
- method: Method.POST,
- subscription: '/getFileContent',
- secureHandler: async ({ req, res }) => {
- const { filepath } = req.body;
-
- if (!filepath) {
- res.status(400).send({ error: 'Filepath is required' });
- return;
- }
-
- try {
- // Read the file content JSON file
- const filePath = path.join(filesDirectory, Directory.vectorstore, 'file_content.json');
-
- if (!fs.existsSync(filePath)) {
- res.status(404).send({ error: 'File content database not found' });
- return;
- }
-
- console.log(`[DEBUG] Retrieving content for: ${filepath}`);
-
- // Read the JSON file in chunks to handle large files
- const readStream = fs.createReadStream(filePath, { encoding: 'utf8' });
- let jsonData = '';
-
- readStream.on('data', chunk => {
- jsonData += chunk;
- });
-
- readStream.on('end', () => {
- try {
- // Parse the JSON
- const contentMap = JSON.parse(jsonData);
-
- // Check if the filepath exists in the map
- if (!contentMap[filepath]) {
- console.log(`[DEBUG] Content not found for: ${filepath}`);
- res.status(404).send({ error: `Content not found for filepath: ${filepath}` });
- return;
- }
-
- // Return the file content as is, not as JSON
- console.log(`[DEBUG] Found content for: ${filepath} (${contentMap[filepath].length} chars)`);
- res.send(contentMap[filepath]);
- } catch (parseError) {
- console.error('Error parsing file_content.json:', parseError);
- res.status(500).send({
- error: 'Failed to parse file content database',
- });
- }
- });
-
- readStream.on('error', streamError => {
- console.error('Error reading file_content.json:', streamError);
- res.status(500).send({
- error: 'Failed to read file content database',
- });
- });
- } catch (error) {
- console.error('Error retrieving file content:', error);
- res.status(500).send({
- error: 'Failed to retrieve file content',
- });
- }
- },
- });
-
- // Register an endpoint to search file summaries
- register({
- method: Method.POST,
- subscription: '/searchFileSummaries',
- secureHandler: async ({ req, res }) => {
- const { query, topK } = req.body;
-
- if (!query) {
- res.status(400).send({ error: 'Search query is required' });
- return;
- }
-
- // This endpoint will be called by the client-side Vectorstore to perform the search
- // The actual search is implemented in the Vectorstore class
-
- res.send({ message: 'This endpoint should be called through the Vectorstore class' });
- },
- });
-
// Register Wikipedia summary API route
register({
method: Method.POST,
@@ -566,9 +439,9 @@ export default class AssistantManager extends ApiManager {
try {
const image = await openai.images.generate({ model: 'dall-e-3', prompt: image_prompt, response_format: 'url' });
console.log(image);
- const result = await DashUploadUtils.UploadImage(image.data[0].url!);
+ const url = image.data?.[0].url;
- const url = image.data[0].url;
+ const result = url ? await DashUploadUtils.UploadImage(url) : { error: 'Image generation failed' };
res.send({ result, url });
} catch (error) {
@@ -612,76 +485,36 @@ export default class AssistantManager extends ApiManager {
subscription: '/scrapeWebsite',
secureHandler: async ({ req, res }) => {
const { url } = req.body;
- let browser = null;
try {
- // Set a longer timeout for slow-loading pages
- const navigationTimeout = 60000; // 60 seconds
-
// Launch Puppeteer browser to navigate to the webpage
- browser = await puppeteer.launch({
- args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
+ const browser = await puppeteer.launch({
+ args: ['--no-sandbox', '--disable-setuid-sandbox'],
});
const page = await browser.newPage();
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36');
-
- // Set timeout for navigation
- page.setDefaultNavigationTimeout(navigationTimeout);
-
- // Navigate with timeout and wait for content to load
- await page.goto(url, {
- waitUntil: 'networkidle2',
- timeout: navigationTimeout,
- });
-
- // Wait a bit longer to ensure dynamic content loads
- await new Promise(resolve => setTimeout(resolve, 2000));
+ await page.goto(url, { waitUntil: 'networkidle2' });
// Extract HTML content
const htmlContent = await page.content();
await browser.close();
- browser = null;
- let extractedText = '';
+ // Parse HTML content using JSDOM
+ const dom = new JSDOM(htmlContent, { url });
- // First try with Readability
- try {
- // Parse HTML content using JSDOM
- const dom = new JSDOM(htmlContent, { url });
-
- // Extract readable content using Mozilla's Readability API
- const reader = new Readability(dom.window.document, {
- // Readability configuration to focus on text content
- charThreshold: 100,
- keepClasses: false,
- });
- const article = reader.parse();
+ // Extract readable content using Mozilla's Readability API
+ const reader = new Readability(dom.window.document);
+ const article = reader.parse();
- if (article && article.textContent) {
- extractedText = article.textContent;
- } else {
- // If Readability doesn't return useful content, try alternate method
- extractedText = await extractEnhancedContent(htmlContent);
- }
- } catch (parsingError) {
- console.error('Error parsing website content with Readability:', parsingError);
- // Fallback to enhanced content extraction
- extractedText = await extractEnhancedContent(htmlContent);
+ if (article) {
+ const plainText = article.textContent;
+ res.send({ website_plain_text: plainText });
+ } else {
+ res.status(500).send({ error: 'Failed to extract readable content' });
}
-
- // Clean up the extracted text
- extractedText = cleanupText(extractedText);
-
- res.send({ website_plain_text: extractedText });
} catch (error) {
console.error('Error scraping website:', error);
-
- // Clean up browser if still open
- if (browser) {
- await browser.close().catch(e => console.error('Error closing browser:', e));
- }
-
res.status(500).send({
- error: 'Failed to scrape website: ' + ((error as Error).message || 'Unknown error'),
+ error: 'Failed to scrape website',
});
}
},
@@ -693,20 +526,20 @@ export default class AssistantManager extends ApiManager {
method: Method.POST,
subscription: '/createDocument',
secureHandler: async ({ req, res }) => {
- const { file_path, doc_id } = req.body;
+ const { file_path } = req.body;
const public_path = path.join(publicDirectory, file_path); // Resolve the file path in the public directory
const file_name = path.basename(file_path); // Extract the file name from the path
try {
// Read the file data and encode it as base64
- const file_data: string = fs.readFileSync(public_path, { encoding: 'base64' });
+ const file_data = fs.readFileSync(public_path, { encoding: 'base64' });
// Generate a unique job ID for tracking
const jobId = uuid.v4();
// Spawn the Python process and track its progress/output
// eslint-disable-next-line no-use-before-define
- spawnPythonProcess(jobId, public_path, doc_id);
+ spawnPythonProcess(jobId, public_path);
// Send the job ID back to the client for tracking
res.send({ jobId });
@@ -854,193 +687,6 @@ export default class AssistantManager extends ApiManager {
}
},
});
-
- // Register an API route to capture a screenshot of a webpage using Puppeteer
- // and return the image URL for display in the WebBox component
- register({
- method: Method.POST,
- subscription: '/captureWebScreenshot',
- secureHandler: async ({ req, res }) => {
- const { url, width, height, fullPage } = req.body;
-
- if (!url) {
- res.status(400).send({ error: 'URL is required' });
- return;
- }
-
- let browser = null;
- try {
- // Increase timeout for websites that load slowly
- const navigationTimeout = 60000; // 60 seconds
-
- // Launch a headless browser with additional options to improve stability
- browser = await puppeteer.launch({
- headless: true, // Use headless mode
- args: [
- '--no-sandbox',
- '--disable-setuid-sandbox',
- '--disable-dev-shm-usage',
- '--disable-accelerated-2d-canvas',
- '--disable-gpu',
- '--window-size=1200,800',
- '--disable-web-security', // Helps with cross-origin issues
- '--disable-features=IsolateOrigins,site-per-process', // Helps with frames
- ],
- timeout: navigationTimeout,
- });
-
- const page = await browser.newPage();
-
- // Set a larger viewport to capture more content
- await page.setViewport({
- width: Number(width) || 1200,
- height: Number(height) || 800,
- deviceScaleFactor: 1,
- });
-
- // Enable request interception to speed up page loading
- await page.setRequestInterception(true);
- page.on('request', request => {
- // Skip unnecessary resources to speed up loading
- const resourceType = request.resourceType();
- if (resourceType === 'font' || resourceType === 'media' || resourceType === 'websocket' || request.url().includes('analytics') || request.url().includes('tracker')) {
- request.abort();
- } else {
- request.continue();
- }
- });
-
- // Set navigation and timeout options
- console.log(`Navigating to URL: ${url}`);
-
- // Navigate to the URL and wait for the page to load
- await page.goto(url, {
- waitUntil: ['networkidle2'],
- timeout: navigationTimeout,
- });
-
- // Wait for a short delay after navigation to allow content to render
- await new Promise(resolve => setTimeout(resolve, 2000));
-
- // Take a screenshot
- console.log('Taking screenshot...');
- const screenshotPath = `./src/server/public/files/images/webpage_${Date.now()}.png`;
- const screenshotOptions = {
- path: screenshotPath,
- fullPage: fullPage === true,
- omitBackground: false,
- type: 'png' as 'png',
- clip:
- fullPage !== true
- ? {
- x: 0,
- y: 0,
- width: Number(width) || 1200,
- height: Number(height) || 800,
- }
- : undefined,
- };
-
- await page.screenshot(screenshotOptions);
-
- // Get the full height of the page
- const fullHeight = await page.evaluate(() => {
- return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight, document.body.offsetHeight, document.documentElement.offsetHeight, document.body.clientHeight, document.documentElement.clientHeight);
- });
-
- console.log(`Screenshot captured successfully with height: ${fullHeight}px`);
-
- // Return the URL to the screenshot
- const screenshotUrl = `/files/images/webpage_${Date.now()}.png`;
- res.json({
- screenshotUrl,
- fullHeight,
- });
- } catch (error: any) {
- console.error('Error capturing screenshot:', error);
- res.status(500).send({
- error: `Failed to capture screenshot: ${error.message}`,
- details: error.stack,
- });
- } finally {
- // Ensure browser is closed to free resources
- if (browser) {
- try {
- await browser.close();
- console.log('Browser closed successfully');
- } catch (error) {
- console.error('Error closing browser:', error);
- }
- }
- }
- },
- });
-
- // Register an endpoint to retrieve raw file content as plain text (no JSON parsing)
- register({
- method: Method.POST,
- subscription: '/getRawFileContent',
- secureHandler: async ({ req, res }) => {
- const { filepath } = req.body;
-
- if (!filepath) {
- res.status(400).send('Filepath is required');
- return;
- }
-
- try {
- // Read the file content JSON file
- const filePath = path.join(filesDirectory, Directory.vectorstore, 'file_content.json');
-
- if (!fs.existsSync(filePath)) {
- res.status(404).send('File content database not found');
- return;
- }
-
- console.log(`[DEBUG] Retrieving raw content for: ${filepath}`);
-
- // Read the JSON file
- const readStream = fs.createReadStream(filePath, { encoding: 'utf8' });
- let jsonData = '';
-
- readStream.on('data', chunk => {
- jsonData += chunk;
- });
-
- readStream.on('end', () => {
- try {
- // Parse the JSON
- const contentMap = JSON.parse(jsonData);
-
- // Check if the filepath exists in the map
- if (!contentMap[filepath]) {
- console.log(`[DEBUG] Content not found for: ${filepath}`);
- res.status(404).send(`Content not found for filepath: ${filepath}`);
- return;
- }
-
- // Set content type to plain text to avoid JSON parsing
- res.setHeader('Content-Type', 'text/plain');
-
- // Return the file content as plain text
- console.log(`[DEBUG] Found content for: ${filepath} (${contentMap[filepath].length} chars)`);
- res.send(contentMap[filepath]);
- } catch (parseError) {
- console.error('Error parsing file_content.json:', parseError);
- res.status(500).send('Failed to parse file content database');
- }
- });
-
- readStream.on('error', streamError => {
- console.error('Error reading file_content.json:', streamError);
- res.status(500).send('Failed to read file content database');
- });
- } catch (error) {
- console.error('Error retrieving file content:', error);
- res.status(500).send('Failed to retrieve file content');
- }
- },
- });
}
}
@@ -1050,7 +696,7 @@ export default class AssistantManager extends ApiManager {
* @param file_name The name of the file to process.
* @param file_path The filepath of the file to process.
*/
-function spawnPythonProcess(jobId: string, file_path: string, doc_id: string) {
+function spawnPythonProcess(jobId: string, file_path: string) {
const venvPath = path.join(__dirname, '../chunker/venv');
const requirementsPath = path.join(__dirname, '../chunker/requirements.txt');
const pythonScriptPath = path.join(__dirname, '../chunker/pdf_chunker.py');
@@ -1060,7 +706,7 @@ function spawnPythonProcess(jobId: string, file_path: string, doc_id: string) {
function runPythonScript() {
const pythonPath = process.platform === 'win32' ? path.join(venvPath, 'Scripts', 'python') : path.join(venvPath, 'bin', 'python3');
- const pythonProcess = spawn(pythonPath, [pythonScriptPath, jobId, file_path, outputDirectory, doc_id]);
+ const pythonProcess = spawn(pythonPath, [pythonScriptPath, jobId, file_path, outputDirectory]);
let pythonOutput = '';
let stderrOutput = '';
@@ -1135,6 +781,7 @@ function spawnPythonProcess(jobId: string, file_path: string, doc_id: string) {
console.log('Virtual environment not found. Creating and setting up...');
// Create venv
+ // const createVenvProcess = spawn('python', ['-m', 'venv', venvPath]);
const createVenvProcess = spawn('python3.10', ['-m', 'venv', venvPath]);
createVenvProcess.on('close', code => {
@@ -1183,121 +830,3 @@ function spawnPythonProcess(jobId: string, file_path: string, doc_id: string) {
runPythonScript();
}
}
-
-/**
- * Enhanced content extraction that focuses on meaningful text content.
- * @param html The HTML content to process
- * @returns Extracted and cleaned text content
- */
-async function extractEnhancedContent(html: string): Promise<string> {
- try {
- // Create DOM to extract content
- const dom = new JSDOM(html, { runScripts: 'outside-only' });
- const document = dom.window.document;
-
- // Remove all non-content elements
- const elementsToRemove = [
- 'script',
- 'style',
- 'iframe',
- 'noscript',
- 'svg',
- 'canvas',
- 'header',
- 'footer',
- 'nav',
- 'aside',
- 'form',
- 'button',
- 'input',
- 'select',
- 'textarea',
- 'meta',
- 'link',
- 'img',
- 'video',
- 'audio',
- '.ad',
- '.ads',
- '.advertisement',
- '.banner',
- '.cookie',
- '.popup',
- '.modal',
- '.newsletter',
- '[role="banner"]',
- '[role="navigation"]',
- '[role="complementary"]',
- ];
-
- elementsToRemove.forEach(selector => {
- const elements = document.querySelectorAll(selector);
- elements.forEach(el => el.remove());
- });
-
- // Get all text paragraphs with meaningful content
- const contentElements = [
- ...Array.from(document.querySelectorAll('p')),
- ...Array.from(document.querySelectorAll('h1')),
- ...Array.from(document.querySelectorAll('h2')),
- ...Array.from(document.querySelectorAll('h3')),
- ...Array.from(document.querySelectorAll('h4')),
- ...Array.from(document.querySelectorAll('h5')),
- ...Array.from(document.querySelectorAll('h6')),
- ...Array.from(document.querySelectorAll('li')),
- ...Array.from(document.querySelectorAll('td')),
- ...Array.from(document.querySelectorAll('article')),
- ...Array.from(document.querySelectorAll('section')),
- ...Array.from(document.querySelectorAll('div:not([class]):not([id])')),
- ];
-
- // Extract text from content elements that have meaningful text
- let contentParts: string[] = [];
- contentElements.forEach(el => {
- const text = el.textContent?.trim();
- // Only include elements with substantial text (more than just a few characters)
- if (text && text.length > 10 && !contentParts.includes(text)) {
- contentParts.push(text);
- }
- });
-
- // If no significant content found with selective approach, fallback to body
- if (contentParts.length < 3) {
- return document.body.textContent || '';
- }
-
- return contentParts.join('\n\n');
- } catch (error) {
- console.error('Error extracting enhanced content:', error);
- return 'Failed to extract content from the webpage.';
- }
-}
-
-/**
- * Cleans up extracted text to improve readability and focus on useful content.
- * @param text The raw extracted text
- * @returns Cleaned and formatted text
- */
-function cleanupText(text: string): string {
- if (!text) return '';
-
- return (
- text
- // Remove excessive whitespace and normalize line breaks
- .replace(/\s+/g, ' ')
- .replace(/\n\s*\n\s*\n+/g, '\n\n')
- // Remove common boilerplate phrases
- .replace(/cookie policy|privacy policy|terms of service|all rights reserved|copyright ©/gi, '')
- // Remove email addresses
- .replace(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g, '')
- // Remove URLs
- .replace(/https?:\/\/[^\s]+/g, '')
- // Remove social media handles
- .replace(/@[a-zA-Z0-9_]+/g, '')
- // Clean up any remaining HTML tags that might have been missed
- .replace(/<[^>]*>/g, '')
- // Fix spacing issues after cleanup
- .replace(/ +/g, ' ')
- .trim()
- );
-}