Merge branch 'nathan-starter' of https://github.com/brown-dash/Dash-Web into nathan-starter

author: Nathan-SR <144961007+Nathan-SR@users.noreply.github.com> 2024-10-10 19:30:06 -0400
committer: Nathan-SR <144961007+Nathan-SR@users.noreply.github.com> 2024-10-10 19:30:06 -0400
commit: 373340938a4bc48edb4b9345f28e562de41153d6 (patch)
tree: d6604992d93a12920e1b62a1f906735d59434765 /src/server/ApiManagers/AssistantManager.ts
parent: 772c7a4c4d8867cbc33a673c3e3c6f3e330d395d (diff)
parent: 5752dff8ff7b1b2858542feec0b1bb037461bf1a (diff)
1 files changed, 486 insertions, 69 deletions
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts
index b42314e41..b7d4191ca 100644
--- a/src/server/ApiManagers/AssistantManager.ts
+++ b/src/server/ApiManagers/AssistantManager.ts
@@ -1,13 +1,31 @@
+/**
+ * @file AssistantManager.ts
+ * @description This file defines the AssistantManager class, responsible for managing various
+ * API routes related to the Assistant functionality. It provides features such as file handling,
+ * web scraping, and integration with third-party APIs like OpenAI and Google Custom Search.
+ * It also handles job tracking and progress reporting for tasks like document creation and web scraping.
+ * Utility functions for path manipulation and file operations are included, along with
+ * a mechanism for handling retry logic during API calls.
+ */
+
+import { Readability } from '@mozilla/readability';
+import axios from 'axios';
+import { spawn } from 'child_process';
 import * as fs from 'fs';
-import { createReadStream, writeFile } from 'fs';
+import { writeFile } from 'fs';
+import { google } from 'googleapis';
+import { JSDOM } from 'jsdom';
 import OpenAI from 'openai';
 import * as path from 'path';
+import * as puppeteer from 'puppeteer';
 import { promisify } from 'util';
 import * as uuid from 'uuid';
-import { filesDirectory, publicDirectory } from '../SocketData';
+import { AI_Document } from '../../client/views/nodes/chatbot/types/types';
 import { Method } from '../RouteManager';
+import { filesDirectory, publicDirectory } from '../SocketData';
 import ApiManager, { Registration } from './ApiManager';
 
+// Enumeration of directories where different file types are stored
 export enum Directory {
     parsed_files = 'parsed_files',
     images = 'images',
@@ -17,115 +35,514 @@ export enum Directory {
     pdf_thumbnails = 'pdf_thumbnails',
     audio = 'audio',
     csv = 'csv',
+    chunk_images = 'chunk_images',
+    scrape_images = 'scrape_images',
 }
 
+// In-memory job tracking
+const jobResults: { [key: string]: unknown } = {};
+const jobProgress: { [key: string]: unknown } = {};
+
+/**
+ * Constructs a normalized path to a file in the server's file system.
+ * @param directory The directory where the file is stored.
+ * @param filename The name of the file.
+ * @returns The full normalized path to the file.
+ */
 export function serverPathToFile(directory: Directory, filename: string) {
     return path.normalize(`${filesDirectory}/${directory}/${filename}`);
 }
 
+/**
+ * Constructs a normalized path to a directory in the server's file system.
+ * @param directory The directory to access.
+ * @returns The full normalized path to the directory.
+ */
 export function pathToDirectory(directory: Directory) {
     return path.normalize(`${filesDirectory}/${directory}`);
 }
 
+/**
+ * Constructs the client-accessible URL for a file.
+ * @param directory The directory where the file is stored.
+ * @param filename The name of the file.
+ * @returns The URL path to the file.
+ */
 export function clientPathToFile(directory: Directory, filename: string) {
     return `/files/${directory}/${filename}`;
 }
 
+// Promisified versions of filesystem functions
 const writeFileAsync = promisify(writeFile);
 const readFileAsync = promisify(fs.readFile);
 
+/**
+ * Class responsible for handling various API routes related to the Assistant functionality.
+ * This class extends `ApiManager` and handles registration of routes and secure request handlers.
+ */
 export default class AssistantManager extends ApiManager {
+    /**
+     * Registers all API routes and initializes necessary services like OpenAI and Google Custom Search.
+     * @param register The registration method to register routes and handlers.
+     */
     protected initialize(register: Registration): void {
-        const openai = new OpenAI({ apiKey: process.env.OPENAI_KEY, dangerouslyAllowBrowser: true });
+        // Initialize Google Custom Search API
+        const customsearch = google.customsearch('v1');
+
+        // Register Wikipedia summary API route
+        register({
+            method: Method.POST,
+            subscription: '/getWikipediaSummary',
+            secureHandler: async ({ req, res }) => {
+                const { title } = req.body;
+                try {
+                    // Fetch summary from Wikipedia using axios
+                    const response = await axios.get('https://en.wikipedia.org/w/api.php', {
+                        params: {
+                            action: 'query',
+                            list: 'search',
+                            srsearch: title,
+                            format: 'json',
+                        },
+                    });
+                    const summary = response.data.query.search[0]?.snippet || 'No article found with that title.';
+                    res.send({ text: summary });
+                } catch (error) {
+                    console.error('Error retrieving Wikipedia summary:', error);
+                    res.status(500).send({
+                        error: 'Error retrieving article summary from Wikipedia.',
+                    });
+                }
+            },
+        });
 
+        // Register Google Web Search Results API route
         register({
             method: Method.POST,
-            subscription: '/uploadPDFToVectorStore',
+            subscription: '/getWebSearchResults',
             secureHandler: async ({ req, res }) => {
-                const { urls, threadID, assistantID, vector_store_id } = req.body;
-
-                const csvFilesIds: string[] = [];
-                const otherFileIds: string[] = [];
-                const allFileIds: string[] = [];
-
-                const fileProcesses = urls.map(async (source: string) => {
-                    const fullPath = path.join(publicDirectory, source);
-                    const fileData = await openai.files.create({ file: createReadStream(fullPath), purpose: 'assistants' });
-                    allFileIds.push(fileData.id);
-                    if (source.endsWith('.csv')) {
-                        console.log(source);
-                        csvFilesIds.push(fileData.id);
+                const { query, max_results } = req.body;
+                try {
+                    // Fetch search results using Google Custom Search API
+                    const response = await customsearch.cse.list({
+                        q: query,
+                        cx: process.env._CLIENT_GOOGLE_SEARCH_ENGINE_ID,
+                        key: process.env._CLIENT_GOOGLE_API_KEY,
+                        safe: 'active',
+                        num: max_results,
+                    });
+
+                    const results =
+                        response.data.items?.map(item => ({
+                            url: item.link,
+                            snippet: item.snippet,
+                        })) || [];
+
+                    res.send({ results });
+                } catch (error) {
+                    console.error('Error performing web search:', error);
+                    res.status(500).send({
+                        error: 'Failed to perform web search',
+                    });
+                }
+            },
+        });
+
+        // Axios instance with custom headers for scraping
+        const axiosInstance = axios.create({
+            headers: {
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            },
+        });
+
+        /**
+         * Utility function to introduce delay (used for retries).
+         * @param ms Delay in milliseconds.
+         */
+        const delay = (ms: number) => new Promise(resolve => setTimeout(resolve, ms));
+
+        /**
+         * Function to fetch a URL with retry logic, handling rate limits.
+         * Retries a request if it fails due to rate limits (HTTP status 429).
+         * @param url The URL to fetch.
+         * @param retries The number of retry attempts.
+         * @param backoff Initial backoff time in milliseconds.
+         */
+        const fetchWithRetry = async (url: string, retries = 3, backoff = 300): Promise<unknown> => {
+            try {
+                const response = await axiosInstance.get(url);
+                return response.data;
+            } catch (error) {
+                if (retries > 0 && (error as { response: { status: number } }).response?.status === 429) {   // bcz: don't know the error type
+                    console.log(`Rate limited. Retrying in ${backoff}ms...`);
+                    await delay(backoff);
+                    return fetchWithRetry(url, retries - 1, backoff * 2);
+                } // prettier-ignore
+                throw error;
+            }
+        };
+
+        // Register a proxy fetch API route
+        register({
+            method: Method.POST,
+            subscription: '/proxyFetch',
+            secureHandler: async ({ req, res }) => {
+                const { url } = req.body;
+
+                if (!url) {
+                    res.status(400).send({ error: 'No URL provided' });
+                    return;
+                }
+
+                try {
+                    const data = await fetchWithRetry(url);
+                    res.send({ data });
+                } catch (error) {
+                    console.error('Error fetching the URL:', error);
+                    res.status(500).send({
+                        error: 'Failed to fetch the URL',
+                    });
+                }
+            },
+        });
+
+        // Register an API route to scrape website content using Puppeteer and JSDOM
+        register({
+            method: Method.POST,
+            subscription: '/scrapeWebsite',
+            secureHandler: async ({ req, res }) => {
+                const { url } = req.body;
+                try {
+                    // Launch Puppeteer browser to navigate to the webpage
+                    const browser = await puppeteer.launch({
+                        args: ['--no-sandbox', '--disable-setuid-sandbox'],
+                    });
+                    const page = await browser.newPage();
+                    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36');
+                    await page.goto(url, { waitUntil: 'networkidle2' });
+
+                    // Extract HTML content
+                    const htmlContent = await page.content();
+                    await browser.close();
+
+                    // Parse HTML content using JSDOM
+                    const dom = new JSDOM(htmlContent, { url });
+
+                    // Extract readable content using Mozilla's Readability API
+                    const reader = new Readability(dom.window.document);
+                    const article = reader.parse();
+
+                    if (article) {
+                        const plainText = article.textContent;
+                        res.send({ website_plain_text: plainText });
                     } else {
-                        openai.beta.vectorStores.files.create(vector_store_id, { file_id: fileData.id });
-                        otherFileIds.push(fileData.id);
+                        res.status(500).send({ error: 'Failed to extract readable content' });
                     }
-                });
-                try {
-                    await Promise.all(fileProcesses).then(() => {
-                        res.send({ vector_store_id: vector_store_id, openai_file_ids: allFileIds });
+                } catch (error) {
+                    console.error('Error scraping website:', error);
+                    res.status(500).send({
+                        error: 'Failed to scrape website',
                     });
+                }
+            },
+        });
+
+        register({
+            method: Method.POST,
+            subscription: '/createDocument',
+            secureHandler: async ({ req, res }) => {
+                const { file_path } = req.body;
+                const public_path = path.join(publicDirectory, file_path); // Resolve the file path in the public directory
+                const file_name = path.basename(file_path); // Extract the file name from the path
+
+                try {
+                    // Read the file data and encode it as base64
+                    const file_data: string = fs.readFileSync(public_path, { encoding: 'base64' });
+
+                    // Generate a unique job ID for tracking
+                    const jobId = uuid.v4();
+
+                    // Spawn the Python process and track its progress/output
+                    // eslint-disable-next-line no-use-before-define
+                    spawnPythonProcess(jobId, file_name, file_data);
+
+                    // Send the job ID back to the client for tracking
+                    res.send({ jobId });
                 } catch (error) {
-                    res.status(500).send({ error: 'Failed to process files' + error });
+                    console.error('Error initiating document creation:', error);
+                    res.status(500).send({
+                        error: 'Failed to initiate document creation',
+                    });
                 }
             },
         });
 
+        // Register an API route to check the progress of a document creation job
+        register({
+            method: Method.GET,
+            subscription: '/getProgress/:jobId',
+            secureHandler: async ({ req, res }) => {
+                const { jobId } = req.params; // Get the job ID from the URL parameters
+                // Check if the job progress is available
+                if (jobProgress[jobId]) {
+                    res.json(jobProgress[jobId]);
+                } else {
+                    res.json({
+                        step: 'Processing Document...',
+                        progress: '0',
+                    });
+                }
+            },
+        });
+
+        // Register an API route to get the final result of a document creation job
+        register({
+            method: Method.GET,
+            subscription: '/getResult/:jobId',
+            secureHandler: async ({ req, res }) => {
+                const { jobId } = req.params; // Get the job ID from the URL parameters
+                // Check if the job result is available
+                if (jobResults[jobId]) {
+                    const result = jobResults[jobId] as AI_Document & { status: string };
+
+                    // If the result contains image or table chunks, save the base64 data as image files
+                    if (result.chunks && Array.isArray(result.chunks)) {
+                        for (const chunk of result.chunks) {
+                            if (chunk.metadata && (chunk.metadata.type === 'image' || chunk.metadata.type === 'table')) {
+                                const files_directory = '/files/chunk_images/';
+                                const directory = path.join(publicDirectory, files_directory);
+
+                                // Ensure the directory exists or create it
+                                if (!fs.existsSync(directory)) {
+                                    fs.mkdirSync(directory);
+                                }
+
+                                const fileName = path.basename(chunk.metadata.file_path); // Get the file name from the path
+                                const filePath = path.join(directory, fileName); // Create the full file path
+
+                                // Check if the chunk contains base64 encoded data
+                                if (chunk.metadata.base64_data) {
+                                    // Decode the base64 data and write it to a file
+                                    const buffer = Buffer.from(chunk.metadata.base64_data, 'base64');
+                                    await fs.promises.writeFile(filePath, buffer);
+
+                                    // Update the file path in the chunk's metadata
+                                    chunk.metadata.file_path = path.join(files_directory, fileName);
+                                    chunk.metadata.base64_data = undefined; // Remove the base64 data from the metadata
+                                } else {
+                                    console.warn(`No base64_data found for chunk: ${fileName}`);
+                                }
+                            }
+                        }
+                        result.status = 'completed';
+                    } else {
+                        result.status = 'pending';
+                    }
+                    res.json(result); // Send the result back to the client
+                } else {
+                    res.status(202).send({ status: 'pending' });
+                }
+            },
+        });
+
+        // Register an API route to format chunks (e.g., text or image chunks) for display
         register({
             method: Method.POST,
-            subscription: '/downloadFileFromOpenAI',
+            subscription: '/formatChunks',
             secureHandler: async ({ req, res }) => {
-                const { file_id, file_name } = req.body;
-                //let files_directory: string;
-                let files_directory = '/files/openAIFiles/';
-                switch (file_name.split('.').pop()) {
-                    case 'pdf':
-                        files_directory = '/files/pdfs/';
-                        break;
-                    case 'csv':
-                        files_directory = '/files/csv/';
-                        break;
-                    case 'png':
-                    case 'jpg':
-                    case 'jpeg':
-                        files_directory = '/files/images/';
-                        break;
-                    default:
-                        break;
+                const { relevantChunks } = req.body; // Get the relevant chunks from the request body
+
+                // Initialize an array to hold the formatted content
+                const content: { type: string; text?: string; image_url?: { url: string } }[] = [{ type: 'text', text: '<chunks>' }];
+
+                for (const chunk of relevantChunks) {
+                    // Format each chunk by adding its metadata and content
+                    content.push({
+                        type: 'text',
+                        text: `<chunk chunk_id=${chunk.id} chunk_type="${chunk.metadata.type}">`,
+                    });
+
+                    // If the chunk is an image or table, read the corresponding file and encode it as base64
+                    if (chunk.metadata.type === 'image' || chunk.metadata.type === 'table') {
+                        try {
+                            const filePath = serverPathToFile(Directory.chunk_images, chunk.metadata.file_path); // Get the file path
+                            const imageBuffer = await readFileAsync(filePath); // Read the image file
+                            const base64Image = imageBuffer.toString('base64'); // Convert the image to base64
+
+                            // Add the base64-encoded image to the content array
+                            if (base64Image) {
+                                content.push({
+                                    type: 'image_url',
+                                    image_url: {
+                                        url: `data:image/jpeg;base64,${base64Image}`,
+                                    },
+                                });
+                            } else {
+                                console.log(`Failed to encode image for chunk ${chunk.id}`);
+                            }
+                        } catch (error) {
+                            console.error(`Error reading image file for chunk ${chunk.id}:`, error);
+                        }
+                    }
+
+                    // Add the chunk's text content to the formatted content
+                    content.push({ type: 'text', text: `${chunk.metadata.text}\n</chunk>\n` });
                 }
 
-                const directory = path.join(publicDirectory, files_directory);
+                content.push({ type: 'text', text: '</chunks>' });
 
-                if (!fs.existsSync(directory)) {
-                    fs.mkdirSync(directory);
+                // Send the formatted content back to the client
+                res.send({ formattedChunks: content });
+            },
+        });
+
+        // Register an API route to create and save a CSV file on the server
+        register({
+            method: Method.POST,
+            subscription: '/createCSV',
+            secureHandler: async ({ req, res }) => {
+                const { filename, data } = req.body;
+
+                // Validate that both the filename and data are provided
+                if (!filename || !data) {
+                    res.status(400).send({ error: 'Filename and data fields are required.' });
+                    return;
                 }
-                const file = await openai.files.content(file_id);
-                const new_file_name = `${uuid.v4()}-${file_name}`;
-                const file_path = path.join(directory, new_file_name);
-                const file_array_buffer = await file.arrayBuffer();
-                const bufferView = new Uint8Array(file_array_buffer);
+
                 try {
-                    const written_file = await writeFileAsync(file_path, bufferView);
-                    console.log(written_file);
-                    console.log(file_path);
-                    console.log(file_array_buffer);
-                    console.log(bufferView);
-                    const file_object = new File([bufferView], file_name);
-                    //DashUploadUtils.upload(file_object, 'openAIFiles');
-                    res.send({ file_path: path.join(files_directory, new_file_name) });
-                    /* res.send( {
-                        source: "file",
-                        result: {
-                                  accessPaths: {
-                                      agnostic: {client: path.join('/files/openAIFiles/', `${uuid.v4()}-${file_name}`)}
-                                  },
-                                  rawText: "",
-                                  duration: 0,
-                              },
-                    } ); */
+                    // Generate a UUID for the file to ensure unique naming
+                    const uuidv4 = uuid.v4();
+                    const fullFilename = `${uuidv4}-${filename}`; // Prefix the file name with the UUID
+
+                    // Get the full server path where the file will be saved
+                    const serverFilePath = serverPathToFile(Directory.csv, fullFilename);
+
+                    // Write the CSV data (which is a raw string) to the file
+                    await writeFileAsync(serverFilePath, data, 'utf8');
+
+                    // Construct the client-accessible URL for the file
+                    const fileUrl = clientPathToFile(Directory.csv, fullFilename);
+
+                    // Send the file URL and UUID back to the client
+                    res.send({ fileUrl, id: uuidv4 });
                 } catch (error) {
-                    res.status(500).send({ error: 'Failed to write file' + error });
+                    console.error('Error creating CSV file:', error);
+                    res.status(500).send({
+                        error: 'Failed to create CSV file.',
+                    });
                 }
             },
         });
     }
 }
+
+function spawnPythonProcess(jobId: string, file_name: string, file_data: string) {
+    const venvPath = path.join(__dirname, '../chunker/venv');
+    const requirementsPath = path.join(__dirname, '../chunker/requirements.txt');
+    const pythonScriptPath = path.join(__dirname, '../chunker/pdf_chunker.py');
+
+    function runPythonScript() {
+        const pythonPath = process.platform === 'win32' ? path.join(venvPath, 'Scripts', 'python') : path.join(venvPath, 'bin', 'python3');
+
+        const pythonProcess = spawn(pythonPath, [pythonScriptPath, jobId, file_name, file_data]);
+
+        let pythonOutput = '';
+        let stderrOutput = '';
+
+        pythonProcess.stdout.on('data', data => {
+            pythonOutput += data.toString();
+        });
+
+        pythonProcess.stderr.on('data', data => {
+            stderrOutput += data.toString();
+            const lines = stderrOutput.split('\n');
+            lines.forEach(line => {
+                if (line.trim()) {
+                    try {
+                        const parsedOutput = JSON.parse(line);
+                        if (parsedOutput.job_id && parsedOutput.progress !== undefined) {
+                            jobProgress[parsedOutput.job_id] = {
+                                step: parsedOutput.step,
+                                progress: parsedOutput.progress,
+                            };
+                        } else if (parsedOutput.progress !== undefined) {
+                            jobProgress[jobId] = {
+                                step: parsedOutput.step,
+                                progress: parsedOutput.progress,
+                            };
+                        }
+                    } catch (err) {
+                        console.error('Progress log from Python:', line, err);
+                    }
+                }
+            });
+        });
+
+        pythonProcess.on('close', code => {
+            if (code === 0) {
+                try {
+                    const finalResult = JSON.parse(pythonOutput);
+                    jobResults[jobId] = finalResult;
+                    jobProgress[jobId] = { step: 'Complete', progress: 100 };
+                } catch (err) {
+                    console.error('Error parsing final JSON result:', err);
+                }
+            } else {
+                console.error(`Python process exited with code ${code}`);
+                jobResults[jobId] = { error: 'Python process failed' };
+            }
+        });
+    }
+    // Check if venv exists
+    if (!fs.existsSync(venvPath)) {
+        console.log('Virtual environment not found. Creating and setting up...');
+
+        // Create venv
+        const createVenvProcess = spawn('python', ['-m', 'venv', venvPath]);
+
+        createVenvProcess.on('close', code => {
+            if (code !== 0) {
+                console.error(`Failed to create virtual environment. Exit code: ${code}`);
+                return;
+            }
+
+            console.log('Virtual environment created. Installing requirements...');
+
+            // Determine the pip path based on the OS
+            const pipPath = process.platform === 'win32' ? path.join(venvPath, 'Scripts', 'pip.exe') : path.join(venvPath, 'bin', 'pip3'); // Try 'pip3' for Unix-like systems
+
+            if (!fs.existsSync(pipPath)) {
+                console.error(`pip executable not found at ${pipPath}`);
+                return;
+            }
+
+            // Install requirements
+            const installRequirementsProcess = spawn(pipPath, ['install', '-r', requirementsPath]);
+
+            installRequirementsProcess.stdout.on('data', data => {
+                console.log(`pip stdout: ${data}`);
+            });
+
+            installRequirementsProcess.stderr.on('data', data => {
+                console.error(`pip stderr: ${data}`);
+            });
+
+            installRequirementsProcess.on('error', error => {
+                console.error(`Error starting pip process: ${error}`);
+            });
+
+            installRequirementsProcess.on('close', closecode => {
+                if (closecode !== 0) {
+                    console.error(`Failed to install requirements. Exit code: ${closecode}`);
+                    return;
+                }
+
+                console.log('Requirements installed. Running Python script...');
+                runPythonScript();
+            });
+        });
+    } else {
+        console.log('Virtual environment found. Running Python script...');
+        runPythonScript();
+    }
+}
author	Nathan-SR <144961007+Nathan-SR@users.noreply.github.com>	2024-10-10 19:30:06 -0400
committer	Nathan-SR <144961007+Nathan-SR@users.noreply.github.com>	2024-10-10 19:30:06 -0400
commit	373340938a4bc48edb4b9345f28e562de41153d6 (patch)
tree	d6604992d93a12920e1b62a1f906735d59434765 /src/server/ApiManagers/AssistantManager.ts
parent	772c7a4c4d8867cbc33a673c3e3c6f3e330d395d (diff)
parent	5752dff8ff7b1b2858542feec0b1bb037461bf1a (diff)