aboutsummaryrefslogtreecommitdiff
path: root/src/server/ApiManagers/AssistantManager.ts
diff options
context:
space:
mode:
authorNathan-SR <144961007+Nathan-SR@users.noreply.github.com>2024-10-10 19:30:06 -0400
committerNathan-SR <144961007+Nathan-SR@users.noreply.github.com>2024-10-10 19:30:06 -0400
commit373340938a4bc48edb4b9345f28e562de41153d6 (patch)
treed6604992d93a12920e1b62a1f906735d59434765 /src/server/ApiManagers/AssistantManager.ts
parent772c7a4c4d8867cbc33a673c3e3c6f3e330d395d (diff)
parent5752dff8ff7b1b2858542feec0b1bb037461bf1a (diff)
Merge branch 'nathan-starter' of https://github.com/brown-dash/Dash-Web into nathan-starter
Diffstat (limited to 'src/server/ApiManagers/AssistantManager.ts')
-rw-r--r--src/server/ApiManagers/AssistantManager.ts555
1 files changed, 486 insertions, 69 deletions
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts
index b42314e41..b7d4191ca 100644
--- a/src/server/ApiManagers/AssistantManager.ts
+++ b/src/server/ApiManagers/AssistantManager.ts
@@ -1,13 +1,31 @@
+/**
+ * @file AssistantManager.ts
+ * @description This file defines the AssistantManager class, responsible for managing various
+ * API routes related to the Assistant functionality. It provides features such as file handling,
+ * web scraping, and integration with third-party APIs like OpenAI and Google Custom Search.
+ * It also handles job tracking and progress reporting for tasks like document creation and web scraping.
+ * Utility functions for path manipulation and file operations are included, along with
+ * a mechanism for handling retry logic during API calls.
+ */
+
+import { Readability } from '@mozilla/readability';
+import axios from 'axios';
+import { spawn } from 'child_process';
import * as fs from 'fs';
-import { createReadStream, writeFile } from 'fs';
+import { writeFile } from 'fs';
+import { google } from 'googleapis';
+import { JSDOM } from 'jsdom';
import OpenAI from 'openai';
import * as path from 'path';
+import * as puppeteer from 'puppeteer';
import { promisify } from 'util';
import * as uuid from 'uuid';
-import { filesDirectory, publicDirectory } from '../SocketData';
+import { AI_Document } from '../../client/views/nodes/chatbot/types/types';
import { Method } from '../RouteManager';
+import { filesDirectory, publicDirectory } from '../SocketData';
import ApiManager, { Registration } from './ApiManager';
+// Enumeration of directories where different file types are stored
export enum Directory {
parsed_files = 'parsed_files',
images = 'images',
@@ -17,115 +35,514 @@ export enum Directory {
pdf_thumbnails = 'pdf_thumbnails',
audio = 'audio',
csv = 'csv',
+ chunk_images = 'chunk_images',
+ scrape_images = 'scrape_images',
}
+// In-memory job tracking
+const jobResults: { [key: string]: unknown } = {};
+const jobProgress: { [key: string]: unknown } = {};
+
+/**
+ * Constructs a normalized path to a file in the server's file system.
+ * @param directory The directory where the file is stored.
+ * @param filename The name of the file.
+ * @returns The full normalized path to the file.
+ */
export function serverPathToFile(directory: Directory, filename: string) {
 return path.normalize(`${filesDirectory}/${directory}/${filename}`);
}
+/**
+ * Constructs a normalized path to a directory in the server's file system.
+ * @param directory The directory to access.
+ * @returns The full normalized path to the directory.
+ */
export function pathToDirectory(directory: Directory) {
return path.normalize(`${filesDirectory}/${directory}`);
}
+/**
+ * Constructs the client-accessible URL for a file.
+ * @param directory The directory where the file is stored.
+ * @param filename The name of the file.
+ * @returns The URL path to the file.
+ */
export function clientPathToFile(directory: Directory, filename: string) {
 return `/files/${directory}/${filename}`;
}
+// Promisified versions of filesystem functions
const writeFileAsync = promisify(writeFile);
const readFileAsync = promisify(fs.readFile);
+/**
+ * Class responsible for handling various API routes related to the Assistant functionality.
+ * This class extends `ApiManager` and handles registration of routes and secure request handlers.
+ */
export default class AssistantManager extends ApiManager {
+ /**
+ * Registers all API routes and initializes necessary services like OpenAI and Google Custom Search.
+ * @param register The registration method to register routes and handlers.
+ */
protected initialize(register: Registration): void {
- const openai = new OpenAI({ apiKey: process.env.OPENAI_KEY, dangerouslyAllowBrowser: true });
+ // Initialize Google Custom Search API
+ const customsearch = google.customsearch('v1');
+
+ // Register Wikipedia summary API route
+ register({
+ method: Method.POST,
+ subscription: '/getWikipediaSummary',
+ secureHandler: async ({ req, res }) => {
+ const { title } = req.body;
+ try {
+ // Fetch summary from Wikipedia using axios
+ const response = await axios.get('https://en.wikipedia.org/w/api.php', {
+ params: {
+ action: 'query',
+ list: 'search',
+ srsearch: title,
+ format: 'json',
+ },
+ });
+ const summary = response.data.query.search[0]?.snippet || 'No article found with that title.';
+ res.send({ text: summary });
+ } catch (error) {
+ console.error('Error retrieving Wikipedia summary:', error);
+ res.status(500).send({
+ error: 'Error retrieving article summary from Wikipedia.',
+ });
+ }
+ },
+ });
+ // Register Google Web Search Results API route
register({
method: Method.POST,
- subscription: '/uploadPDFToVectorStore',
+ subscription: '/getWebSearchResults',
secureHandler: async ({ req, res }) => {
- const { urls, threadID, assistantID, vector_store_id } = req.body;
-
- const csvFilesIds: string[] = [];
- const otherFileIds: string[] = [];
- const allFileIds: string[] = [];
-
- const fileProcesses = urls.map(async (source: string) => {
- const fullPath = path.join(publicDirectory, source);
- const fileData = await openai.files.create({ file: createReadStream(fullPath), purpose: 'assistants' });
- allFileIds.push(fileData.id);
- if (source.endsWith('.csv')) {
- console.log(source);
- csvFilesIds.push(fileData.id);
+ const { query, max_results } = req.body;
+ try {
+ // Fetch search results using Google Custom Search API
+ const response = await customsearch.cse.list({
+ q: query,
+ cx: process.env._CLIENT_GOOGLE_SEARCH_ENGINE_ID,
+ key: process.env._CLIENT_GOOGLE_API_KEY,
+ safe: 'active',
+ num: max_results,
+ });
+
+ const results =
+ response.data.items?.map(item => ({
+ url: item.link,
+ snippet: item.snippet,
+ })) || [];
+
+ res.send({ results });
+ } catch (error) {
+ console.error('Error performing web search:', error);
+ res.status(500).send({
+ error: 'Failed to perform web search',
+ });
+ }
+ },
+ });
+
+ // Axios instance with custom headers for scraping
+ const axiosInstance = axios.create({
+ headers: {
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+ },
+ });
+
+ /**
+ * Utility function to introduce delay (used for retries).
+ * @param ms Delay in milliseconds.
+ */
+ const delay = (ms: number) => new Promise(resolve => setTimeout(resolve, ms));
+
+ /**
+ * Function to fetch a URL with retry logic, handling rate limits.
+ * Retries a request if it fails due to rate limits (HTTP status 429).
+ * @param url The URL to fetch.
+ * @param retries The number of retry attempts.
+ * @param backoff Initial backoff time in milliseconds.
+ */
+ const fetchWithRetry = async (url: string, retries = 3, backoff = 300): Promise<unknown> => {
+ try {
+ const response = await axiosInstance.get(url);
+ return response.data;
+ } catch (error) {
+ if (retries > 0 && (error as { response: { status: number } }).response?.status === 429) { // bcz: don't know the error type
+ console.log(`Rate limited. Retrying in ${backoff}ms...`);
+ await delay(backoff);
+ return fetchWithRetry(url, retries - 1, backoff * 2);
+ } // prettier-ignore
+ throw error;
+ }
+ };
+
+ // Register a proxy fetch API route
+ register({
+ method: Method.POST,
+ subscription: '/proxyFetch',
+ secureHandler: async ({ req, res }) => {
+ const { url } = req.body;
+
+ if (!url) {
+ res.status(400).send({ error: 'No URL provided' });
+ return;
+ }
+
+ try {
+ const data = await fetchWithRetry(url);
+ res.send({ data });
+ } catch (error) {
+ console.error('Error fetching the URL:', error);
+ res.status(500).send({
+ error: 'Failed to fetch the URL',
+ });
+ }
+ },
+ });
+
+ // Register an API route to scrape website content using Puppeteer and JSDOM
+ register({
+ method: Method.POST,
+ subscription: '/scrapeWebsite',
+ secureHandler: async ({ req, res }) => {
+ const { url } = req.body;
+ try {
+ // Launch Puppeteer browser to navigate to the webpage
+ const browser = await puppeteer.launch({
+ args: ['--no-sandbox', '--disable-setuid-sandbox'],
+ });
+ const page = await browser.newPage();
+ await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36');
+ await page.goto(url, { waitUntil: 'networkidle2' });
+
+ // Extract HTML content
+ const htmlContent = await page.content();
+ await browser.close();
+
+ // Parse HTML content using JSDOM
+ const dom = new JSDOM(htmlContent, { url });
+
+ // Extract readable content using Mozilla's Readability API
+ const reader = new Readability(dom.window.document);
+ const article = reader.parse();
+
+ if (article) {
+ const plainText = article.textContent;
+ res.send({ website_plain_text: plainText });
} else {
- openai.beta.vectorStores.files.create(vector_store_id, { file_id: fileData.id });
- otherFileIds.push(fileData.id);
+ res.status(500).send({ error: 'Failed to extract readable content' });
}
- });
- try {
- await Promise.all(fileProcesses).then(() => {
- res.send({ vector_store_id: vector_store_id, openai_file_ids: allFileIds });
+ } catch (error) {
+ console.error('Error scraping website:', error);
+ res.status(500).send({
+ error: 'Failed to scrape website',
});
+ }
+ },
+ });
+
+ register({
+ method: Method.POST,
+ subscription: '/createDocument',
+ secureHandler: async ({ req, res }) => {
+ const { file_path } = req.body;
+ const public_path = path.join(publicDirectory, file_path); // Resolve the file path in the public directory
+ const file_name = path.basename(file_path); // Extract the file name from the path
+
+ try {
+ // Read the file data and encode it as base64
+ const file_data: string = fs.readFileSync(public_path, { encoding: 'base64' });
+
+ // Generate a unique job ID for tracking
+ const jobId = uuid.v4();
+
+ // Spawn the Python process and track its progress/output
+ // eslint-disable-next-line no-use-before-define
+ spawnPythonProcess(jobId, file_name, file_data);
+
+ // Send the job ID back to the client for tracking
+ res.send({ jobId });
} catch (error) {
- res.status(500).send({ error: 'Failed to process files' + error });
+ console.error('Error initiating document creation:', error);
+ res.status(500).send({
+ error: 'Failed to initiate document creation',
+ });
}
},
});
+ // Register an API route to check the progress of a document creation job
+ register({
+ method: Method.GET,
+ subscription: '/getProgress/:jobId',
+ secureHandler: async ({ req, res }) => {
+ const { jobId } = req.params; // Get the job ID from the URL parameters
+ // Check if the job progress is available
+ if (jobProgress[jobId]) {
+ res.json(jobProgress[jobId]);
+ } else {
+ res.json({
+ step: 'Processing Document...',
+ progress: '0',
+ });
+ }
+ },
+ });
+
+ // Register an API route to get the final result of a document creation job
+ register({
+ method: Method.GET,
+ subscription: '/getResult/:jobId',
+ secureHandler: async ({ req, res }) => {
+ const { jobId } = req.params; // Get the job ID from the URL parameters
+ // Check if the job result is available
+ if (jobResults[jobId]) {
+ const result = jobResults[jobId] as AI_Document & { status: string };
+
+ // If the result contains image or table chunks, save the base64 data as image files
+ if (result.chunks && Array.isArray(result.chunks)) {
+ for (const chunk of result.chunks) {
+ if (chunk.metadata && (chunk.metadata.type === 'image' || chunk.metadata.type === 'table')) {
+ const files_directory = '/files/chunk_images/';
+ const directory = path.join(publicDirectory, files_directory);
+
+ // Ensure the directory exists or create it
+ if (!fs.existsSync(directory)) {
+ fs.mkdirSync(directory);
+ }
+
+ const fileName = path.basename(chunk.metadata.file_path); // Get the file name from the path
+ const filePath = path.join(directory, fileName); // Create the full file path
+
+ // Check if the chunk contains base64 encoded data
+ if (chunk.metadata.base64_data) {
+ // Decode the base64 data and write it to a file
+ const buffer = Buffer.from(chunk.metadata.base64_data, 'base64');
+ await fs.promises.writeFile(filePath, buffer);
+
+ // Update the file path in the chunk's metadata
+ chunk.metadata.file_path = path.join(files_directory, fileName);
+ chunk.metadata.base64_data = undefined; // Remove the base64 data from the metadata
+ } else {
+ console.warn(`No base64_data found for chunk: ${fileName}`);
+ }
+ }
+ }
+ result.status = 'completed';
+ } else {
+ result.status = 'pending';
+ }
+ res.json(result); // Send the result back to the client
+ } else {
+ res.status(202).send({ status: 'pending' });
+ }
+ },
+ });
+
+ // Register an API route to format chunks (e.g., text or image chunks) for display
register({
method: Method.POST,
- subscription: '/downloadFileFromOpenAI',
+ subscription: '/formatChunks',
secureHandler: async ({ req, res }) => {
- const { file_id, file_name } = req.body;
- //let files_directory: string;
- let files_directory = '/files/openAIFiles/';
- switch (file_name.split('.').pop()) {
- case 'pdf':
- files_directory = '/files/pdfs/';
- break;
- case 'csv':
- files_directory = '/files/csv/';
- break;
- case 'png':
- case 'jpg':
- case 'jpeg':
- files_directory = '/files/images/';
- break;
- default:
- break;
+ const { relevantChunks } = req.body; // Get the relevant chunks from the request body
+
+ // Initialize an array to hold the formatted content
+ const content: { type: string; text?: string; image_url?: { url: string } }[] = [{ type: 'text', text: '<chunks>' }];
+
+ for (const chunk of relevantChunks) {
+ // Format each chunk by adding its metadata and content
+ content.push({
+ type: 'text',
+ text: `<chunk chunk_id=${chunk.id} chunk_type="${chunk.metadata.type}">`,
+ });
+
+ // If the chunk is an image or table, read the corresponding file and encode it as base64
+ if (chunk.metadata.type === 'image' || chunk.metadata.type === 'table') {
+ try {
+ const filePath = serverPathToFile(Directory.chunk_images, chunk.metadata.file_path); // Get the file path
+ const imageBuffer = await readFileAsync(filePath); // Read the image file
+ const base64Image = imageBuffer.toString('base64'); // Convert the image to base64
+
+ // Add the base64-encoded image to the content array
+ if (base64Image) {
+ content.push({
+ type: 'image_url',
+ image_url: {
+ url: `data:image/jpeg;base64,${base64Image}`,
+ },
+ });
+ } else {
+ console.log(`Failed to encode image for chunk ${chunk.id}`);
+ }
+ } catch (error) {
+ console.error(`Error reading image file for chunk ${chunk.id}:`, error);
+ }
+ }
+
+ // Add the chunk's text content to the formatted content
+ content.push({ type: 'text', text: `${chunk.metadata.text}\n</chunk>\n` });
}
- const directory = path.join(publicDirectory, files_directory);
+ content.push({ type: 'text', text: '</chunks>' });
- if (!fs.existsSync(directory)) {
- fs.mkdirSync(directory);
+ // Send the formatted content back to the client
+ res.send({ formattedChunks: content });
+ },
+ });
+
+ // Register an API route to create and save a CSV file on the server
+ register({
+ method: Method.POST,
+ subscription: '/createCSV',
+ secureHandler: async ({ req, res }) => {
+ const { filename, data } = req.body;
+
+ // Validate that both the filename and data are provided
+ if (!filename || !data) {
+ res.status(400).send({ error: 'Filename and data fields are required.' });
+ return;
}
- const file = await openai.files.content(file_id);
- const new_file_name = `${uuid.v4()}-${file_name}`;
- const file_path = path.join(directory, new_file_name);
- const file_array_buffer = await file.arrayBuffer();
- const bufferView = new Uint8Array(file_array_buffer);
+
try {
- const written_file = await writeFileAsync(file_path, bufferView);
- console.log(written_file);
- console.log(file_path);
- console.log(file_array_buffer);
- console.log(bufferView);
- const file_object = new File([bufferView], file_name);
- //DashUploadUtils.upload(file_object, 'openAIFiles');
- res.send({ file_path: path.join(files_directory, new_file_name) });
- /* res.send( {
- source: "file",
- result: {
- accessPaths: {
- agnostic: {client: path.join('/files/openAIFiles/', `${uuid.v4()}-${file_name}`)}
- },
- rawText: "",
- duration: 0,
- },
- } ); */
+ // Generate a UUID for the file to ensure unique naming
+ const uuidv4 = uuid.v4();
+ const fullFilename = `${uuidv4}-${filename}`; // Prefix the file name with the UUID
+
+ // Get the full server path where the file will be saved
+ const serverFilePath = serverPathToFile(Directory.csv, fullFilename);
+
+ // Write the CSV data (which is a raw string) to the file
+ await writeFileAsync(serverFilePath, data, 'utf8');
+
+ // Construct the client-accessible URL for the file
+ const fileUrl = clientPathToFile(Directory.csv, fullFilename);
+
+ // Send the file URL and UUID back to the client
+ res.send({ fileUrl, id: uuidv4 });
} catch (error) {
- res.status(500).send({ error: 'Failed to write file' + error });
+ console.error('Error creating CSV file:', error);
+ res.status(500).send({
+ error: 'Failed to create CSV file.',
+ });
}
},
});
}
}
+
+function spawnPythonProcess(jobId: string, file_name: string, file_data: string) {
+ const venvPath = path.join(__dirname, '../chunker/venv');
+ const requirementsPath = path.join(__dirname, '../chunker/requirements.txt');
+ const pythonScriptPath = path.join(__dirname, '../chunker/pdf_chunker.py');
+
+ function runPythonScript() {
+ const pythonPath = process.platform === 'win32' ? path.join(venvPath, 'Scripts', 'python') : path.join(venvPath, 'bin', 'python3');
+
+ const pythonProcess = spawn(pythonPath, [pythonScriptPath, jobId, file_name, file_data]);
+
+ let pythonOutput = '';
+ let stderrOutput = '';
+
+ pythonProcess.stdout.on('data', data => {
+ pythonOutput += data.toString();
+ });
+
+ pythonProcess.stderr.on('data', data => {
+ stderrOutput += data.toString();
+ const lines = stderrOutput.split('\n');
+ lines.forEach(line => {
+ if (line.trim()) {
+ try {
+ const parsedOutput = JSON.parse(line);
+ if (parsedOutput.job_id && parsedOutput.progress !== undefined) {
+ jobProgress[parsedOutput.job_id] = {
+ step: parsedOutput.step,
+ progress: parsedOutput.progress,
+ };
+ } else if (parsedOutput.progress !== undefined) {
+ jobProgress[jobId] = {
+ step: parsedOutput.step,
+ progress: parsedOutput.progress,
+ };
+ }
+ } catch (err) {
+ console.error('Progress log from Python:', line, err);
+ }
+ }
+ });
+ });
+
+ pythonProcess.on('close', code => {
+ if (code === 0) {
+ try {
+ const finalResult = JSON.parse(pythonOutput);
+ jobResults[jobId] = finalResult;
+ jobProgress[jobId] = { step: 'Complete', progress: 100 };
+ } catch (err) {
+ console.error('Error parsing final JSON result:', err);
+ }
+ } else {
+ console.error(`Python process exited with code ${code}`);
+ jobResults[jobId] = { error: 'Python process failed' };
+ }
+ });
+ }
+ // Check if venv exists
+ if (!fs.existsSync(venvPath)) {
+ console.log('Virtual environment not found. Creating and setting up...');
+
+ // Create venv
+ const createVenvProcess = spawn('python', ['-m', 'venv', venvPath]);
+
+ createVenvProcess.on('close', code => {
+ if (code !== 0) {
+ console.error(`Failed to create virtual environment. Exit code: ${code}`);
+ return;
+ }
+
+ console.log('Virtual environment created. Installing requirements...');
+
+ // Determine the pip path based on the OS
+ const pipPath = process.platform === 'win32' ? path.join(venvPath, 'Scripts', 'pip.exe') : path.join(venvPath, 'bin', 'pip3'); // Try 'pip3' for Unix-like systems
+
+ if (!fs.existsSync(pipPath)) {
+ console.error(`pip executable not found at ${pipPath}`);
+ return;
+ }
+
+ // Install requirements
+ const installRequirementsProcess = spawn(pipPath, ['install', '-r', requirementsPath]);
+
+ installRequirementsProcess.stdout.on('data', data => {
+ console.log(`pip stdout: ${data}`);
+ });
+
+ installRequirementsProcess.stderr.on('data', data => {
+ console.error(`pip stderr: ${data}`);
+ });
+
+ installRequirementsProcess.on('error', error => {
+ console.error(`Error starting pip process: ${error}`);
+ });
+
+ installRequirementsProcess.on('close', closecode => {
+ if (closecode !== 0) {
+ console.error(`Failed to install requirements. Exit code: ${closecode}`);
+ return;
+ }
+
+ console.log('Requirements installed. Running Python script...');
+ runPythonScript();
+ });
+ });
+ } else {
+ console.log('Virtual environment found. Running Python script...');
+ runPythonScript();
+ }
+}