Diffstat (limited to 'src/server')
 -rw-r--r--  src/server/ApiManagers/AssistantManager.ts | 508
 -rw-r--r--  src/server/api/dynamicTools.ts             | 130
 -rw-r--r--  src/server/chunker/pdf_chunker.py          | 187
 -rw-r--r--  src/server/index.ts                        |   1
 -rw-r--r--  src/server/server_Initialization.ts        |   5
 5 files changed, 697 insertions(+), 134 deletions(-)
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts
index af25722a4..c7c347c71 100644
--- a/src/server/ApiManagers/AssistantManager.ts
+++ b/src/server/ApiManagers/AssistantManager.ts
@@ -39,6 +39,7 @@ export enum Directory {
     csv = 'csv',
     chunk_images = 'chunk_images',
     scrape_images = 'scrape_images',
+    vectorstore = 'vectorstore',
 }
 
 // In-memory job tracking
@@ -92,6 +93,132 @@ export default class AssistantManager extends ApiManager {
         const customsearch = google.customsearch('v1');
         const openai = new OpenAI({ apiKey: env.OPENAI_API_KEY });
 
+        // Register an endpoint to retrieve file summaries from the JSON file
+        register({
+            method: Method.GET,
+            subscription: '/getFileSummaries',
+            secureHandler: async ({ req, res }) => {
+                try {
+                    // Read the file summaries JSON file
+                    const filePath = path.join(filesDirectory, Directory.vectorstore, 'file_summaries.json');
+
+                    if (!fs.existsSync(filePath)) {
+                        res.status(404).send({ error: 'File summaries not found' });
+                        return;
+                    }
+
+                    const data = fs.readFileSync(filePath, 'utf8');
+                    res.send(data);
+                } catch (error) {
+                    console.error('Error retrieving file summaries:', error);
+                    res.status(500).send({
+                        error: 'Failed to retrieve file summaries',
+                    });
+                }
+            },
+        });
+
+        // Register an endpoint to retrieve file names from the file_summaries.json file
+        register({
+            method: Method.GET,
+            subscription: '/getFileNames',
+            secureHandler: async ({ res }) => {
+                const filePath = path.join(filesDirectory, Directory.vectorstore, 'file_summaries.json');
+                const data = fs.readFileSync(filePath, 'utf8');
+                const fileNames = Object.keys(JSON.parse(data)); // parse once and reuse
+                console.log(fileNames);
+
+                res.send(fileNames);
+            },
+        });
+
+        // Register an endpoint to retrieve file content from the content JSON file
+        register({
+            method: Method.POST,
+            subscription: '/getFileContent',
+            secureHandler: async ({ req, res }) => {
+                const { filepath } = req.body;
+
+                if (!filepath) {
+                    res.status(400).send({ error: 'Filepath is required' });
+                    return;
+                }
+
+                try {
+                    // Read the file content JSON file
+                    const filePath = path.join(filesDirectory, Directory.vectorstore, 'file_content.json');
+
+                    if (!fs.existsSync(filePath)) {
+                        res.status(404).send({ error: 'File content database not found' });
+                        return;
+                    }
+
+                    console.log(`[DEBUG] Retrieving content for: ${filepath}`);
+
+                    // Stream the JSON file in chunks to handle large files
+                    const readStream = fs.createReadStream(filePath, { encoding: 'utf8' });
+                    let jsonData = '';
+
+                    readStream.on('data', chunk => {
+                        jsonData += chunk;
+                    });
+
+                    readStream.on('end', () => {
+                        try {
+                            // Parse the JSON
+                            const contentMap = JSON.parse(jsonData);
+
+                            // Check if the filepath exists in the map
+                            if (!contentMap[filepath]) {
+                                console.log(`[DEBUG] Content not found for: ${filepath}`);
+                                res.status(404).send({ error: `Content not found for filepath: ${filepath}` });
+                                return;
+                            }
+
+                            // Return the file content as is, not as JSON
+                            console.log(`[DEBUG] Found content for: ${filepath} (${contentMap[filepath].length} chars)`);
+                            res.send(contentMap[filepath]);
+                        } catch (parseError) {
+                            console.error('Error parsing file_content.json:', parseError);
+                            res.status(500).send({
+                                error: 'Failed to parse file content database',
+                            });
+                        }
+                    });
+
+                    readStream.on('error', streamError => {
+                        console.error('Error reading file_content.json:', streamError);
+                        res.status(500).send({
+                            error: 'Failed to read file content database',
+                        });
+                    });
+                } catch (error) {
+                    console.error('Error retrieving file content:', error);
+                    res.status(500).send({
+                        error: 'Failed to retrieve file content',
+                    });
+                }
+            },
+        });
+
+        // Register an endpoint to search file summaries
+        register({
+            method: Method.POST,
+            subscription: '/searchFileSummaries',
+            secureHandler: async ({ req, res }) => {
+                const { query, topK } = req.body;
+
+                if (!query) {
+                    res.status(400).send({ error: 'Search query is required' });
+                    return;
+                }
+
+                // This endpoint will be called by the client-side Vectorstore to perform the search
+                // The actual search is implemented in the Vectorstore class
+
+                res.send({ message: 'This endpoint should be called through the Vectorstore class' });
+            },
+        });
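For reference, a minimal client-side sketch of how these summary endpoints might be called (the base URL is an assumption; the response shapes are inferred from the handlers above):

    // Hypothetical usage; adjust the base URL to the actual server
    async function loadSummaries(base = 'http://localhost:1050') {
        const names: string[] = await (await fetch(`${base}/getFileNames`)).json();
        const content = await (
            await fetch(`${base}/getFileContent`, {
                method: 'POST',
                headers: { 'Content-Type': 'application/json' },
                body: JSON.stringify({ filepath: names[0] }),
            })
        ).text(); // the handler sends the stored string directly, not JSON
        return { names, content };
    }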
+
         // Register Wikipedia summary API route
         register({
             method: Method.POST,
@@ -485,36 +612,76 @@ export default class AssistantManager extends ApiManager {
             subscription: '/scrapeWebsite',
             secureHandler: async ({ req, res }) => {
                 const { url } = req.body;
+                let browser = null;
 
                 try {
+                    // Set a longer timeout for slow-loading pages
+                    const navigationTimeout = 60000; // 60 seconds
+
                     // Launch Puppeteer browser to navigate to the webpage
-                    const browser = await puppeteer.launch({
-                        args: ['--no-sandbox', '--disable-setuid-sandbox'],
+                    browser = await puppeteer.launch({
+                        args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
                     });
                     const page = await browser.newPage();
                     await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36');
-                    await page.goto(url, { waitUntil: 'networkidle2' });
+
+                    // Set timeout for navigation
+                    page.setDefaultNavigationTimeout(navigationTimeout);
+
+                    // Navigate with timeout and wait for content to load
+                    await page.goto(url, {
+                        waitUntil: 'networkidle2',
+                        timeout: navigationTimeout,
+                    });
+
+                    // Wait a bit longer to ensure dynamic content loads
+                    await new Promise(resolve => setTimeout(resolve, 2000));
 
                     // Extract HTML content
                     const htmlContent = await page.content();
 
                     await browser.close();
+                    browser = null;
 
-                    // Parse HTML content using JSDOM
-                    const dom = new JSDOM(htmlContent, { url });
+                    let extractedText = '';
 
-                    // Extract readable content using Mozilla's Readability API
-                    const reader = new Readability(dom.window.document);
-                    const article = reader.parse();
+                    // First try with Readability
+                    try {
+                        // Parse HTML content using JSDOM
+                        const dom = new JSDOM(htmlContent, { url });
+
+                        // Extract readable content using Mozilla's Readability API
+                        const reader = new Readability(dom.window.document, {
+                            // Readability configuration to focus on text content
+                            charThreshold: 100,
+                            keepClasses: false,
+                        });
+                        const article = reader.parse();
 
-                    if (article) {
-                        const plainText = article.textContent;
-                        res.send({ website_plain_text: plainText });
-                    } else {
-                        res.status(500).send({ error: 'Failed to extract readable content' });
+                        if (article && article.textContent) {
+                            extractedText = article.textContent;
+                        } else {
+                            // If Readability doesn't return useful content, try alternate method
+                            extractedText = await extractEnhancedContent(htmlContent);
+                        }
+                    } catch (parsingError) {
+                        console.error('Error parsing website content with Readability:', parsingError);
+                        // Fallback to enhanced content extraction
+                        extractedText = await extractEnhancedContent(htmlContent);
                     }
+
+                    // Clean up the extracted text
+                    extractedText = cleanupText(extractedText);
+
+                    res.send({ website_plain_text: extractedText });
                 } catch (error) {
                     console.error('Error scraping website:', error);
+
+                    // Clean up browser if still open
+                    if (browser) {
+                        await browser.close().catch(e => console.error('Error closing browser:', e));
+                    }
+
                     res.status(500).send({
-                        error: 'Failed to scrape website',
+                        error: 'Failed to scrape website: ' + ((error as Error).message || 'Unknown error'),
                     });
                 }
             },
@@ -526,7 +693,7 @@ export default class AssistantManager extends ApiManager {
             method: Method.POST,
             subscription: '/createDocument',
             secureHandler: async ({ req, res }) => {
-                const { file_path } = req.body;
+                const { file_path, doc_id } = req.body;
                 const public_path = path.join(publicDirectory, file_path); // Resolve the file path in the public directory
                 const file_name = path.basename(file_path); // Extract the file name from the path
@@ -539,7 +706,7 @@ export default class AssistantManager extends ApiManager {
 
                 // Spawn the Python process and track its progress/output
                 // eslint-disable-next-line no-use-before-define
-                spawnPythonProcess(jobId, public_path);
+                spawnPythonProcess(jobId, public_path, doc_id);
 
                 // Send the job ID back to the client for tracking
                 res.send({ jobId });
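A sketch of starting the chunker through the updated route with the new doc_id field (base URL and file path are assumptions; the handler responds with a jobId for progress polling):

    // Hypothetical call to /createDocument
    const { jobId } = await (
        await fetch('http://localhost:1050/createDocument', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({ file_path: 'files/pdfs/sample.pdf', doc_id: 'doc-123' }),
        })
    ).json();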
@@ -687,6 +854,193 @@ export default class AssistantManager extends ApiManager {
                 }
             },
         });
+
+        // Register an API route to capture a screenshot of a webpage using Puppeteer
+        // and return the image URL for display in the WebBox component
+        register({
+            method: Method.POST,
+            subscription: '/captureWebScreenshot',
+            secureHandler: async ({ req, res }) => {
+                const { url, width, height, fullPage } = req.body;
+
+                if (!url) {
+                    res.status(400).send({ error: 'URL is required' });
+                    return;
+                }
+
+                let browser = null;
+                try {
+                    // Increase timeout for websites that load slowly
+                    const navigationTimeout = 60000; // 60 seconds
+
+                    // Launch a headless browser with additional options to improve stability
+                    browser = await puppeteer.launch({
+                        headless: true, // Use headless mode
+                        args: [
+                            '--no-sandbox',
+                            '--disable-setuid-sandbox',
+                            '--disable-dev-shm-usage',
+                            '--disable-accelerated-2d-canvas',
+                            '--disable-gpu',
+                            '--window-size=1200,800',
+                            '--disable-web-security', // Helps with cross-origin issues
+                            '--disable-features=IsolateOrigins,site-per-process', // Helps with frames
+                        ],
+                        timeout: navigationTimeout,
+                    });
+
+                    const page = await browser.newPage();
+
+                    // Set a larger viewport to capture more content
+                    await page.setViewport({
+                        width: Number(width) || 1200,
+                        height: Number(height) || 800,
+                        deviceScaleFactor: 1,
+                    });
+
+                    // Enable request interception to speed up page loading
+                    await page.setRequestInterception(true);
+                    page.on('request', request => {
+                        // Skip unnecessary resources to speed up loading
+                        const resourceType = request.resourceType();
+                        if (resourceType === 'font' || resourceType === 'media' || resourceType === 'websocket' || request.url().includes('analytics') || request.url().includes('tracker')) {
+                            request.abort();
+                        } else {
+                            request.continue();
+                        }
+                    });
+
+                    // Set navigation and timeout options
+                    console.log(`Navigating to URL: ${url}`);
+
+                    // Navigate to the URL and wait for the page to load
+                    await page.goto(url, {
+                        waitUntil: ['networkidle2'],
+                        timeout: navigationTimeout,
+                    });
+
+                    // Wait for a short delay after navigation to allow content to render
+                    await new Promise(resolve => setTimeout(resolve, 2000));
+
+                    // Take a screenshot; compute the timestamp once so the saved file
+                    // and the returned URL refer to the same image
+                    console.log('Taking screenshot...');
+                    const timestamp = Date.now();
+                    const screenshotPath = `./src/server/public/files/images/webpage_${timestamp}.png`;
+                    const screenshotOptions = {
+                        path: screenshotPath,
+                        fullPage: fullPage === true,
+                        omitBackground: false,
+                        type: 'png' as 'png',
+                        clip:
+                            fullPage !== true
+                                ? {
+                                      x: 0,
+                                      y: 0,
+                                      width: Number(width) || 1200,
+                                      height: Number(height) || 800,
+                                  }
+                                : undefined,
+                    };
+
+                    await page.screenshot(screenshotOptions);
+
+                    // Get the full height of the page
+                    const fullHeight = await page.evaluate(() => {
+                        return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight, document.body.offsetHeight, document.documentElement.offsetHeight, document.body.clientHeight, document.documentElement.clientHeight);
+                    });
+
+                    console.log(`Screenshot captured successfully with height: ${fullHeight}px`);
+
+                    // Return the URL to the screenshot
+                    const screenshotUrl = `/files/images/webpage_${timestamp}.png`;
+                    res.json({
+                        screenshotUrl,
+                        fullHeight,
+                    });
+                } catch (error: any) {
+                    console.error('Error capturing screenshot:', error);
+                    res.status(500).send({
+                        error: `Failed to capture screenshot: ${error.message}`,
+                        details: error.stack,
+                    });
+                } finally {
+                    // Ensure browser is closed to free resources
+                    if (browser) {
+                        try {
+                            await browser.close();
+                            console.log('Browser closed successfully');
+                        } catch (error) {
+                            console.error('Error closing browser:', error);
+                        }
+                    }
+                }
+            },
+        });
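A sketch of invoking the screenshot route (base URL assumed; the response fields come from the res.json call above):

    // Hypothetical call to /captureWebScreenshot
    const resp = await fetch('http://localhost:1050/captureWebScreenshot', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ url: 'https://example.com', width: 1200, height: 800, fullPage: false }),
    });
    const { screenshotUrl, fullHeight } = await resp.json(); // e.g. '/files/images/webpage_<timestamp>.png'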
+
+        // Register an endpoint to retrieve raw file content as plain text (no JSON parsing)
+        register({
+            method: Method.POST,
+            subscription: '/getRawFileContent',
+            secureHandler: async ({ req, res }) => {
+                const { filepath } = req.body;
+
+                if (!filepath) {
+                    res.status(400).send('Filepath is required');
+                    return;
+                }
+
+                try {
+                    // Read the file content JSON file
+                    const filePath = path.join(filesDirectory, Directory.vectorstore, 'file_content.json');
+
+                    if (!fs.existsSync(filePath)) {
+                        res.status(404).send('File content database not found');
+                        return;
+                    }
+
+                    console.log(`[DEBUG] Retrieving raw content for: ${filepath}`);
+
+                    // Stream the JSON file
+                    const readStream = fs.createReadStream(filePath, { encoding: 'utf8' });
+                    let jsonData = '';
+
+                    readStream.on('data', chunk => {
+                        jsonData += chunk;
+                    });
+
+                    readStream.on('end', () => {
+                        try {
+                            // Parse the JSON
+                            const contentMap = JSON.parse(jsonData);
+
+                            // Check if the filepath exists in the map
+                            if (!contentMap[filepath]) {
+                                console.log(`[DEBUG] Content not found for: ${filepath}`);
+                                res.status(404).send(`Content not found for filepath: ${filepath}`);
+                                return;
+                            }
+
+                            // Set content type to plain text to avoid JSON parsing
+                            res.setHeader('Content-Type', 'text/plain');
+
+                            // Return the file content as plain text
+                            console.log(`[DEBUG] Found content for: ${filepath} (${contentMap[filepath].length} chars)`);
+                            res.send(contentMap[filepath]);
+                        } catch (parseError) {
+                            console.error('Error parsing file_content.json:', parseError);
+                            res.status(500).send('Failed to parse file content database');
+                        }
+                    });
+
+                    readStream.on('error', streamError => {
+                        console.error('Error reading file_content.json:', streamError);
+                        res.status(500).send('Failed to read file content database');
+                    });
+                } catch (error) {
+                    console.error('Error retrieving file content:', error);
+                    res.status(500).send('Failed to retrieve file content');
+                }
+            },
+        });
     }
 }
 
@@ -696,7 +1050,7 @@ export default class AssistantManager extends ApiManager {
  * @param file_name The name of the file to process.
  * @param file_path The filepath of the file to process.
  */
-function spawnPythonProcess(jobId: string, file_path: string) {
+function spawnPythonProcess(jobId: string, file_path: string, doc_id: string) {
     const venvPath = path.join(__dirname, '../chunker/venv');
     const requirementsPath = path.join(__dirname, '../chunker/requirements.txt');
     const pythonScriptPath = path.join(__dirname, '../chunker/pdf_chunker.py');
@@ -706,7 +1060,7 @@ function spawnPythonProcess(jobId: string, file_path: string) {
     function runPythonScript() {
         const pythonPath = process.platform === 'win32' ? path.join(venvPath, 'Scripts', 'python') : path.join(venvPath, 'bin', 'python3');
 
-        const pythonProcess = spawn(pythonPath, [pythonScriptPath, jobId, file_path, outputDirectory]);
+        const pythonProcess = spawn(pythonPath, [pythonScriptPath, jobId, file_path, outputDirectory, doc_id]);
 
         let pythonOutput = '';
         let stderrOutput = '';
@@ -829,3 +1183,121 @@ function spawnPythonProcess(jobId: string, file_path: string) {
         runPythonScript();
     }
 }
+
+/**
+ * Enhanced content extraction that focuses on meaningful text content.
+ * @param html The HTML content to process
+ * @returns Extracted and cleaned text content
+ */
+async function extractEnhancedContent(html: string): Promise<string> {
+    try {
+        // Create DOM to extract content
+        const dom = new JSDOM(html, { runScripts: 'outside-only' });
+        const document = dom.window.document;
+
+        // Remove all non-content elements
+        const elementsToRemove = [
+            'script',
+            'style',
+            'iframe',
+            'noscript',
+            'svg',
+            'canvas',
+            'header',
+            'footer',
+            'nav',
+            'aside',
+            'form',
+            'button',
+            'input',
+            'select',
+            'textarea',
+            'meta',
+            'link',
+            'img',
+            'video',
+            'audio',
+            '.ad',
+            '.ads',
+            '.advertisement',
+            '.banner',
+            '.cookie',
+            '.popup',
+            '.modal',
+            '.newsletter',
+            '[role="banner"]',
+            '[role="navigation"]',
+            '[role="complementary"]',
+        ];
+
+        elementsToRemove.forEach(selector => {
+            const elements = document.querySelectorAll(selector);
+            elements.forEach(el => el.remove());
+        });
+
+        // Get all text paragraphs with meaningful content
+        const contentElements = [
+            ...Array.from(document.querySelectorAll('p')),
+            ...Array.from(document.querySelectorAll('h1')),
+            ...Array.from(document.querySelectorAll('h2')),
+            ...Array.from(document.querySelectorAll('h3')),
+            ...Array.from(document.querySelectorAll('h4')),
+            ...Array.from(document.querySelectorAll('h5')),
+            ...Array.from(document.querySelectorAll('h6')),
+            ...Array.from(document.querySelectorAll('li')),
+            ...Array.from(document.querySelectorAll('td')),
+            ...Array.from(document.querySelectorAll('article')),
+            ...Array.from(document.querySelectorAll('section')),
+            ...Array.from(document.querySelectorAll('div:not([class]):not([id])')),
+        ];
+
+        // Extract text from content elements that have meaningful text
+        const contentParts: string[] = [];
+        contentElements.forEach(el => {
+            const text = el.textContent?.trim();
+            // Only include elements with substantial text (more than just a few characters)
+            if (text && text.length > 10 && !contentParts.includes(text)) {
+                contentParts.push(text);
+            }
+        });
+
+        // If no significant content found with selective approach, fallback to body
+        if (contentParts.length < 3) {
+            return document.body.textContent || '';
+        }
+
+        return contentParts.join('\n\n');
+    } catch (error) {
+        console.error('Error extracting enhanced content:', error);
+        return 'Failed to extract content from the webpage.';
+    }
+}
+
+/**
+ * Cleans up extracted text to improve readability and focus on useful content.
+ * @param text The raw extracted text
+ * @returns Cleaned and formatted text
+ */
+function cleanupText(text: string): string {
+    if (!text) return '';
+
+    return (
+        text
+            // Collapse runs of blank lines first, then normalize remaining whitespace
+            // (collapsing all whitespace first would make the line-break rule unreachable)
+            .replace(/\n\s*\n\s*\n+/g, '\n\n')
+            .replace(/[ \t]+/g, ' ')
+            // Remove common boilerplate phrases
+            .replace(/cookie policy|privacy policy|terms of service|all rights reserved|copyright ©/gi, '')
+            // Remove email addresses
+            .replace(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g, '')
+            // Remove URLs
+            .replace(/https?:\/\/[^\s]+/g, '')
+            // Remove social media handles
+            .replace(/@[a-zA-Z0-9_]+/g, '')
+            // Clean up any remaining HTML tags that might have been missed
+            .replace(/<[^>]*>/g, '')
+            // Fix spacing issues after cleanup
+            .replace(/ +/g, ' ')
+            .trim()
+    );
+}
diff --git a/src/server/api/dynamicTools.ts b/src/server/api/dynamicTools.ts
new file mode 100644
index 000000000..a7b7e1478
--- /dev/null
+++ b/src/server/api/dynamicTools.ts
@@ -0,0 +1,130 @@
+import * as express from 'express';
+import * as fs from 'fs';
+import * as path from 'path';
+
+// Define handler types to match project patterns
+type RouteHandler = (req: express.Request, res: express.Response) => any;
+
+/**
+ * Handles API endpoints for dynamic tools created by the agent
+ */
+export function setupDynamicToolsAPI(app: express.Express): void {
+    // Directory where dynamic tools will be stored
+    const dynamicToolsDir = path.join(process.cwd(), 'src', 'client', 'views', 'nodes', 'chatbot', 'tools', 'dynamic');
+
+    console.log(`Dynamic tools directory path: ${dynamicToolsDir}`);
+
+    // Ensure directory exists
+    if (!fs.existsSync(dynamicToolsDir)) {
+        try {
+            fs.mkdirSync(dynamicToolsDir, { recursive: true });
+            console.log(`Created dynamic tools directory at ${dynamicToolsDir}`);
+        } catch (error) {
+            console.error(`Failed to create dynamic tools directory: ${error}`);
+        }
+    }
+
+    /**
+     * Save a dynamic tool to the server
+     */
+    const saveDynamicTool: RouteHandler = (req, res) => {
+        try {
+            const { toolName, toolCode } = req.body;
+
+            if (!toolName || !toolCode) {
+                return res.status(400).json({
+                    success: false,
+                    error: 'Missing toolName or toolCode in request body',
+                });
+            }
+
+            // Validate the tool name (should be PascalCase)
+            if (!/^[A-Z][a-zA-Z0-9]*$/.test(toolName)) {
+                return res.status(400).json({
+                    success: false,
+                    error: 'Tool name must be in PascalCase format',
+                });
+            }
+
+            // Create the file path
+            const filePath = path.join(dynamicToolsDir, `${toolName}.ts`);
+
+            // Check if file already exists and is different
+            let existingCode = '';
+            if (fs.existsSync(filePath)) {
+                existingCode = fs.readFileSync(filePath, 'utf8');
+            }
+
+            // Only write if the file doesn't exist or the content is different
+            if (existingCode !== toolCode) {
+                fs.writeFileSync(filePath, toolCode, 'utf8');
+                console.log(`Saved dynamic tool: ${toolName}`);
+            } else {
+                console.log(`Dynamic tool ${toolName} already exists with the same content`);
+            }
+
+            return res.json({ success: true, toolName });
+        } catch (error) {
+            console.error('Error saving dynamic tool:', error);
+            return res.status(500).json({
+                success: false,
+                error: error instanceof Error ? error.message : 'Unknown error',
+            });
+        }
+    };
+
+    /**
+     * Get a list of all available dynamic tools
+     */
+    const getDynamicTools: RouteHandler = (req, res) => {
+        try {
+            // Get all TypeScript files in the dynamic tools directory
+            const files = fs
+                .readdirSync(dynamicToolsDir)
+                .filter(file => file.endsWith('.ts'))
+                .map(file => ({
+                    name: path.basename(file, '.ts'),
+                    path: path.join('dynamic', file),
+                }));
+
+            return res.json({ success: true, tools: files });
+        } catch (error) {
+            console.error('Error getting dynamic tools:', error);
+            return res.status(500).json({
+                success: false,
+                error: error instanceof Error ? error.message : 'Unknown error',
+            });
+        }
+    };
+
+    /**
+     * Get the code for a specific dynamic tool
+     */
+    const getDynamicTool: RouteHandler = (req, res) => {
+        try {
+            const { toolName } = req.params;
+            const filePath = path.join(dynamicToolsDir, `${toolName}.ts`);
+
+            if (!fs.existsSync(filePath)) {
+                return res.status(404).json({
+                    success: false,
+                    error: `Tool ${toolName} not found`,
+                });
+            }
+
+            const toolCode = fs.readFileSync(filePath, 'utf8');
+            return res.json({ success: true, toolName, toolCode });
+        } catch (error) {
+            console.error('Error getting dynamic tool:', error);
+            return res.status(500).json({
+                success: false,
+                error: error instanceof Error ? error.message : 'Unknown error',
+            });
+        }
+    };
+
+    // Register routes
+    app.post('/saveDynamicTool', saveDynamicTool);
+    app.get('/getDynamicTools', getDynamicTools);
+    app.get('/getDynamicTool/:toolName', getDynamicTool);
+}
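A minimal sketch of exercising the new dynamic-tool routes (base URL and tool contents are assumptions; toolName must be PascalCase per the validation above):

    // Hypothetical round trip: save a tool, then fetch it back
    await fetch('http://localhost:1050/saveDynamicTool', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ toolName: 'WordCounter', toolCode: 'export const count = (s: string) => s.trim().split(/\\s+/).length;' }),
    });
    const { toolCode } = await (await fetch('http://localhost:1050/getDynamicTool/WordCounter')).json();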
diff --git a/src/server/chunker/pdf_chunker.py b/src/server/chunker/pdf_chunker.py
index 697550f2e..e34753176 100644
--- a/src/server/chunker/pdf_chunker.py
+++ b/src/server/chunker/pdf_chunker.py
@@ -153,7 +153,7 @@ class ElementExtractor:
                 xref = img_info[0]  # XREF of the image in the PDF
                 base_image = page.parent.extract_image(xref)  # Extract the image by its XREF
                 image_bytes = base_image["image"]
-                image = Image.open(io.BytesIO(image_bytes))  # Convert bytes to PIL image
+                image = Image.open(io.BytesIO(image_bytes)).convert("RGB")  # Ensure it's RGB before saving as PNG
 
                 width_ratio = img.width / page.rect.width  # Scale factor for width
                 height_ratio = img.height / page.rect.height  # Scale factor for height
@@ -276,12 +276,13 @@ class PDFChunker:
         :param output_folder: Folder to store the output files (extracted tables/images).
         :param image_batch_size: The batch size for processing visual elements.
         """
-        self.client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))  # Initialize the Anthropic API client
+        self.client = OpenAI()  # Initialize the OpenAI API client (replaces the Anthropic client)
         self.output_folder = output_folder
         self.image_batch_size = image_batch_size  # Batch size for image processing
         self.doc_id = doc_id  # Add doc_id
         self.element_extractor = ElementExtractor(output_folder, doc_id)
 
+
     async def chunk_pdf(self, file_data: bytes, file_name: str, doc_id: str, job_id: str) -> List[Dict[str, Any]]:
         """
         Processes a PDF file, extracting text and visual elements, and returning structured chunks.
@@ -518,124 +519,77 @@ class PDFChunker:
     def batch_summarize_images(self, images: Dict[int, str]) -> Dict[int, str]:
         """
-        Summarize images or tables by generating descriptive text.
-
-        :param images: A dictionary mapping image numbers to base64-encoded image data.
-        :return: A dictionary mapping image numbers to their generated summaries.
-        """
-        # Prompt for the AI model to summarize images and tables
-        prompt = f"""<instruction>
-    <task>
-        You are tasked with summarizing a series of {len(images)} images and tables for use in a RAG (Retrieval-Augmented Generation) system.
-        Your goal is to create concise, informative summaries that capture the essential content of each image or table.
-        These summaries will be used for embedding, so they should be descriptive and relevant. The image or table will be outlined in red on an image of the full page that it is on. Where necessary, use the context of the full page to help with the summary but don't summarize other content on the page.
-    </task>
-
-    <steps>
-        <step>Identify whether it's an image or a table.</step>
-        <step>Examine its content carefully.</step>
-        <step>
-            Write a detailed summary that captures the main points or visual elements:
-            <details>
-                <table>After summarizing what the table is about, include the column headers, a detailed summary of the data, and any notable data trends.</table>
-                <image>Describe the main subjects, actions, or notable features.</image>
-            </details>
-        </step>
-        <step>Focus on writing summaries that would make it easy to retrieve the content if compared to a user query using vector similarity search.</step>
-        <step>Keep summaries concise and include important words that may help with retrieval (but do not include numbers and numerical data).</step>
-    </steps>
-
-    <important_notes>
-        <note>Avoid using special characters like &, <, >, ", ', $, %, etc. Instead, use their word equivalents:</note>
-        <note>Use "and" instead of &.</note>
-        <note>Use "dollars" instead of $.</note>
-        <note>Use "percent" instead of %.</note>
-        <note>Refrain from using quotation marks " or apostrophes ' unless absolutely necessary.</note>
-        <note>Ensure your output is in valid XML format.</note>
-    </important_notes>
-
-    <formatting>
-        <note>Enclose all summaries within a root element called <summaries>.</note>
-        <note>Use <summary> tags to enclose each individual summary.</note>
-        <note>Include an attribute 'number' in each <summary> tag to indicate the sequence, matching the provided image numbers.</note>
-        <note>Start each summary by indicating whether it's an image or a table (e.g., "This image shows..." or "The table presents...").</note>
-        <note>If an image is completely blank, leave the summary blank (e.g., <summary number="3"></summary>).</note>
-    </formatting>
-
-    <example>
-        <note>Do not replicate the example below—stay grounded to the content of the table or image and describe it completely and accurately.</note>
-        <output>
-            <summaries>
-                <summary number="1">
-                    The image shows two men shaking hands on stage at a formal event. The man on the left, in a dark suit and glasses, has a professional appearance, possibly an academic or business figure. The man on the right, Tim Cook, CEO of Apple, is recognizable by his silver hair and dark blue blazer. Cook holds a document titled "Tsinghua SEM EMBA," suggesting a link to Tsinghua University’s Executive MBA program. The backdrop displays English and Chinese text about business management and education, with the event dated October 23, 2014.
-                </summary>
-                <summary number="2">
-                    The table compares the company's assets between December 30, 2023, and September 30, 2023. Key changes include an increase in cash and cash equivalents, while marketable securities had a slight rise. Accounts receivable and vendor non-trade receivables decreased. Inventories and other current assets saw minor fluctuations. Non-current assets like marketable securities slightly declined, while property, plant, and equipment remained stable. Total assets showed minimal change, holding steady at around three hundred fifty-three billion dollars.
-                </summary>
-                <summary number="3">
-                    The table outlines the company's shareholders' equity as of December 30, 2023, versus September 30, 2023. Common stock and additional paid-in capital increased, and retained earnings shifted from a deficit to a positive figure. Accumulated other comprehensive loss decreased. Overall, total shareholders' equity rose significantly, while total liabilities and equity remained nearly unchanged at about three hundred fifty-three billion dollars.
-                </summary>
-                <summary number="4">
-                    The table details the company's liabilities as of December 30, 2023, compared to September 30, 2023. Current liabilities decreased due to lower accounts payable and other current liabilities, while deferred revenue slightly increased. Commercial paper significantly decreased, and term debt rose modestly. Non-current liabilities were stable, with minimal changes in term debt and other non-current liabilities. Total liabilities dropped from two hundred ninety billion dollars to two hundred seventy-nine billion dollars.
-                </summary>
-                <summary number="5">
-                </summary>
-            </summaries>
-        </output>
-    </example>
-
-    <final_notes>
-        <note>Process each image or table in the order provided.</note>
-        <note>Maintain consistent formatting throughout your response.</note>
-        <note>Ensure the output is in full, valid XML format with the root <summaries> element and each summary being within a <summary> element with the summary number specified as well.</note>
-    </final_notes>
-</instruction>
-    """
-        content = []
-        for number, img in images.items():
-            content.append({"type": "text", "text": f"\nImage {number}:\n"})
-            content.append({"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": img}})
+        Summarize a batch of images/tables with GPT-4o using Structured Outputs.
+
+        :param images: {image_number: base64_png}
+        :return: {image_number: summary_text}
+        """
+        # -------- 1. Build the prompt -----------
+        content: list[dict] = []
+        for n, b64 in images.items():
+            content.append({"type": "text",
+                            "text": f"\nImage {n} (outlined in red on the page):"})
+            content.append({"type": "image_url",
+                            "image_url": {"url": f"data:image/png;base64,{b64}"}})
 
         messages = [
-            {"role": "user", "content": content}
+            {
+                "role": "system",
+                "content": (
+                    "You are generating retrieval-ready summaries for each highlighted "
+                    "image or table. Start by identifying whether the element is an "
+                    "image or a table, then write one informative sentence that a vector "
+                    "search would find useful. Provide detail but limit to a couple of paragraphs per image."
+                ),
+            },
+            {"role": "user", "content": content},
         ]
 
+        schema = {
+            "type": "object",
+            "properties": {
+                "summaries": {
+                    "type": "array",
+                    "items": {
+                        "type": "object",
+                        "properties": {
+                            "number": {"type": "integer"},
+                            "type": {"type": "string", "enum": ["image", "table"]},
+                            "summary": {"type": "string"}
+                        },
+                        "required": ["number", "type", "summary"],
+                        "additionalProperties": False
+                    }
+                }
+            },
+            "required": ["summaries"],
+            "additionalProperties": False
+        }
+
+        # ---------- OpenAI call -----------------------------------------------------
         try:
-            response = self.client.messages.create(
-                model='claude-3-5-sonnet-20240620',
-                system=prompt,
-                max_tokens=400 * len(images),  # Increased token limit for more detailed summaries
+            resp = self.client.chat.completions.create(
+                model="gpt-4o",
                 messages=messages,
+                max_tokens=400 * len(images),
                 temperature=0,
-                extra_headers={"anthropic-beta": "max-tokens-3-5-sonnet-2024-07-15"}
+                response_format={
+                    "type": "json_schema",
+                    "json_schema": {
+                        "name": "image_batch_summaries",  # required
+                        "schema": schema,                 # required
+                        "strict": True                    # strongly recommended
+                    },
+                },
             )
 
-            # Parse the response
-            text = response.content[0].text
-            # Attempt to parse and fix the XML if necessary
-            parser = etree.XMLParser(recover=True)
-            root = etree.fromstring(text, parser=parser)
-            # Extract summaries
-            summaries = {}
-            for summary in root.findall('summary'):
-                number = int(summary.get('number'))
-                content = summary.text.strip() if summary.text else ""
-                if content:  # Only include non-empty summaries
-                    summaries[number] = content
-
-            return summaries
+            parsed = json.loads(resp.choices[0].message.content)  # schema-safe
+            return {item["number"]: item["summary"]
+                    for item in parsed["summaries"]}
         except Exception as e:
-            # Print errors to stderr so they don't interfere with JSON output
-            print(json.dumps({"error": str(e)}), file=sys.stderr)
-            sys.stderr.flush()
-
+            # Log and fall back gracefully
+            print(json.dumps({"error": str(e)}), file=sys.stderr, flush=True)
+            return {}
 
 class DocumentType(Enum):
     """
@@ -668,7 +622,7 @@ class Document:
     Represents a document being processed, such as a PDF, handling chunking, embedding, and summarization.
     """
 
-    def __init__(self, file_path: str, file_name: str, job_id: str, output_folder: str):
+    def __init__(self, file_path: str, file_name: str, job_id: str, output_folder: str, doc_id: str):
         """
         Initialize the Document with file data, file name, and job ID.
 
@@ -681,7 +635,7 @@ class Document:
         self.file_path = file_path
         self.job_id = job_id
         self.type = self._get_document_type(file_name)  # Determine the document type (PDF, CSV, etc.)
-        self.doc_id = job_id  # Use the job ID as the document ID
+        self.doc_id = doc_id  # Use the provided doc_id as the document ID
         self.chunks = []  # List to hold text and visual chunks
         self.num_pages = 0  # Number of pages in the document (if applicable)
         self.summary = ""  # The generated summary for the document
@@ -767,7 +721,7 @@ class Document:
         client = OpenAI()  # Initialize OpenAI client for text generation
 
         completion = client.chat.completions.create(
-            model="gpt-3.5-turbo",  # Specify the language model
+            model="gpt-4o",  # Specify the language model
            messages=[
                {"role": "system", "content": "You are an AI assistant tasked with summarizing a document. You are provided with important chunks from the document and provide a summary, as best you can, of what the document will contain overall. Be concise and brief with your response."},
@@ -801,7 +755,7 @@ class Document:
             "doc_id": self.doc_id
         }, indent=2)  # Convert the document's attributes to JSON format
 
-def process_document(file_path, job_id, output_folder):
+def process_document(file_path, job_id, output_folder, doc_id):
     """
     Top-level function to process a document and return the JSON output.
 
@@ -809,26 +763,27 @@ def process_document(file_path, job_id, output_folder):
     :param job_id: The job ID for this document processing task.
     :return: The processed document's data in JSON format.
     """
-    new_document = Document(file_path, file_path, job_id, output_folder)
+    new_document = Document(file_path, file_path, job_id, output_folder, doc_id)
     return new_document.to_json()
 
 def main():
     """
     Main entry point for the script, called with arguments from Node.js.
     """
-    if len(sys.argv) != 4:
+    if len(sys.argv) != 5:
         print(json.dumps({"error": "Invalid arguments"}), file=sys.stderr)
         return
 
     job_id = sys.argv[1]
     file_path = sys.argv[2]
     output_folder = sys.argv[3]  # Get the output folder from arguments
+    doc_id = sys.argv[4]
 
     try:
         os.makedirs(output_folder, exist_ok=True)
 
         # Process the document
-        document_result = process_document(file_path, job_id, output_folder)  # Pass output_folder
+        document_result = process_document(file_path, job_id, output_folder, doc_id)  # Pass output_folder and doc_id
 
         # Output the final result as JSON to stdout
         print(document_result)
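For clarity, the payload shape that the json_schema response_format constrains GPT-4o to return, sketched here as a TypeScript type (field names mirror the schema in the diff):

    // Shape of resp.choices[0].message.content after JSON.parse
    type ImageBatchSummaries = {
        summaries: Array<{
            number: number;          // matches the image number sent in the prompt
            type: 'image' | 'table'; // element kind identified by the model
            summary: string;         // retrieval-ready description
        }>;
    };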
diff --git a/src/server/index.ts b/src/server/index.ts
index 3b77359ec..887974ed8 100644
--- a/src/server/index.ts
+++ b/src/server/index.ts
@@ -2,6 +2,7 @@ import { yellow } from 'colors';
 import * as dotenv from 'dotenv';
 import * as mobileDetect from 'mobile-detect';
 import * as path from 'path';
+import * as express from 'express';
 import { logExecution } from './ActionUtilities';
 import AssistantManager from './ApiManagers/AssistantManager';
 import FlashcardManager from './ApiManagers/FlashcardManager';
diff --git a/src/server/server_Initialization.ts b/src/server/server_Initialization.ts
index 641a88312..5deb66caf 100644
--- a/src/server/server_Initialization.ts
+++ b/src/server/server_Initialization.ts
@@ -21,6 +21,7 @@ import { Database } from './database';
 import { WebSocket } from './websocket';
 import axios from 'axios';
 import { JSDOM } from 'jsdom';
+import { setupDynamicToolsAPI } from './api/dynamicTools';
 
 /* RouteSetter is a wrapper around the server that prevents the server from being exposed. */
 
@@ -213,6 +214,10 @@ export default async function InitializeServer(routeSetter: RouteSetter) {
     // app.use(cors({ origin: (_origin: any, callback: any) => callback(null, true) }));
     registerAuthenticationRoutes(app); // this adds routes to authenticate a user (login, etc)
     registerCorsProxy(app); // this adds a /corsproxy/ route to allow clients to get to urls that would otherwise be blocked by cors policies
+
+    // Set up the dynamic tools API
+    setupDynamicToolsAPI(app);
+
     isRelease && !SSL.Loaded && SSL.exit();
     routeSetter(new RouteManager(app, isRelease)); // this sets up all the regular supervised routes (things like /home, download/upload api's, pdf, search, session, etc)
     isRelease && process.env.serverPort && (resolvedPorts.server = Number(process.env.serverPort));
