Diffstat (limited to 'src/server/ApiManagers/AssistantManager.ts')
-rw-r--r-- | src/server/ApiManagers/AssistantManager.ts | 411 |
1 file changed, 344 insertions, 67 deletions
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts
index b42314e41..2ffc99e58 100644
--- a/src/server/ApiManagers/AssistantManager.ts
+++ b/src/server/ApiManagers/AssistantManager.ts
@@ -7,6 +7,17 @@ import * as uuid from 'uuid';
 import { filesDirectory, publicDirectory } from '../SocketData';
 import { Method } from '../RouteManager';
 import ApiManager, { Registration } from './ApiManager';
+import axios from 'axios';
+import { RAGChunk } from '../../client/views/nodes/ChatBox/types';
+import { UnstructuredClient } from 'unstructured-client';
+import { PartitionResponse } from 'unstructured-client/sdk/models/operations';
+import { ChunkingStrategy, Strategy } from 'unstructured-client/sdk/models/shared';
+import * as cheerio from 'cheerio';
+import { ScrapflyClient, ScrapeConfig } from 'scrapfly-sdk';
+import { google } from 'googleapis';
+import * as puppeteer from 'puppeteer';
+import { JSDOM } from 'jsdom';
+import { Readability } from '@mozilla/readability';
 
 export enum Directory {
     parsed_files = 'parsed_files',
@@ -17,6 +28,8 @@ export enum Directory {
     pdf_thumbnails = 'pdf_thumbnails',
     audio = 'audio',
     csv = 'csv',
+    chunk_images = 'chunk_images',
+    scrape_images = 'scrape_images',
 }
 
 export function serverPathToFile(directory: Directory, filename: string) {
@@ -36,94 +49,358 @@ const readFileAsync = promisify(fs.readFile);
 export default class AssistantManager extends ApiManager {
     protected initialize(register: Registration): void {
-        const openai = new OpenAI({ apiKey: process.env.OPENAI_KEY, dangerouslyAllowBrowser: true });
+        const openai = new OpenAI({
+            apiKey: process.env._CLIENT_OPENAI_KEY, // Use the client key so the key does not have to be set separately for client and server.
+            dangerouslyAllowBrowser: true,
+        });
+        const unstructuredClient = new UnstructuredClient({
+            security: {
+                apiKeyAuth: process.env._CLIENT_UNSTRUCTURED_API_KEY!,
+            },
+        });
+        const scrapflyClient = new ScrapflyClient({ key: process.env._CLIENT_SCRAPFLY_API_KEY! });
+        const customsearch = google.customsearch('v1');
 
         register({
             method: Method.POST,
-            subscription: '/uploadPDFToVectorStore',
+            subscription: '/getWikipediaSummary',
             secureHandler: async ({ req, res }) => {
-                const { urls, threadID, assistantID, vector_store_id } = req.body;
-
-                const csvFilesIds: string[] = [];
-                const otherFileIds: string[] = [];
-                const allFileIds: string[] = [];
-
-                const fileProcesses = urls.map(async (source: string) => {
-                    const fullPath = path.join(publicDirectory, source);
-                    const fileData = await openai.files.create({ file: createReadStream(fullPath), purpose: 'assistants' });
-                    allFileIds.push(fileData.id);
-                    if (source.endsWith('.csv')) {
-                        console.log(source);
-                        csvFilesIds.push(fileData.id);
+                const { title } = req.body;
+                try {
+                    const response = await axios.get('https://en.wikipedia.org/w/api.php', {
+                        params: {
+                            action: 'query',
+                            list: 'search',
+                            srsearch: title,
+                            format: 'json',
+                        },
+                    });
+                    const summary = response.data.query.search[0].snippet;
+                    if (!summary || summary.length === 0 || summary === '' || summary === ' ') {
+                        res.send({ text: 'No article found with that title.' });
                     } else {
-                        openai.beta.vectorStores.files.create(vector_store_id, { file_id: fileData.id });
-                        otherFileIds.push(fileData.id);
+                        res.send({ text: summary });
                     }
-                });
+                } catch (error: any) {
+                    console.error('Error retrieving article summary from Wikipedia:', error);
+                    res.status(500).send({ error: 'Error retrieving article summary from Wikipedia.', details: error.message });
+                }
+            },
+        });
+
+        register({
+            method: Method.POST,
+            subscription: '/getWebSearchResults',
+            secureHandler: async ({ req, res }) => {
+                const { query, max_results } = req.body;
 
                 try {
-                    await Promise.all(fileProcesses).then(() => {
-                        res.send({ vector_store_id: vector_store_id, openai_file_ids: allFileIds });
+                    const response = await customsearch.cse.list({
+                        q: query,
+                        cx: process.env._CLIENT_GOOGLE_SEARCH_ENGINE_ID,
+                        key: process.env._CLIENT_GOOGLE_API_KEY,
+                        safe: 'active',
+                        num: max_results,
                     });
-                } catch (error) {
-                    res.status(500).send({ error: 'Failed to process files' + error });
+
+                    const results =
+                        response.data.items?.map((item: any) => ({
+                            url: item.link,
+                            snippet: item.snippet,
+                        })) || [];
+
+                    res.send({ results });
+                } catch (error: any) {
+                    console.error('Error performing web search:', error);
+                    res.status(500).send({ error: 'Failed to perform web search', details: error.message });
                 }
             },
         });
 
+        const axiosInstance = axios.create({
+            headers: {
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            },
+        });
+
+        const delay = (ms: number) => new Promise(resolve => setTimeout(resolve, ms));
+
+        const fetchWithRetry = async (url: string, retries = 3, backoff = 300) => {
+            try {
+                const response = await axiosInstance.get(url);
+                return response.data;
+            } catch (error: any) {
+                if (retries > 0 && error.response && error.response.status === 429) {
+                    console.log(`Rate limited. Retrying in ${backoff}ms...`);
+                    await delay(backoff);
+                    return fetchWithRetry(url, retries - 1, backoff * 2);
+                }
+                throw error;
+            }
+        };
+
         register({
             method: Method.POST,
-            subscription: '/downloadFileFromOpenAI',
+            subscription: '/scrapeWebsite',
             secureHandler: async ({ req, res }) => {
-                const { file_id, file_name } = req.body;
-                //let files_directory: string;
-                let files_directory = '/files/openAIFiles/';
-                switch (file_name.split('.').pop()) {
-                    case 'pdf':
-                        files_directory = '/files/pdfs/';
-                        break;
-                    case 'csv':
-                        files_directory = '/files/csv/';
-                        break;
-                    case 'png':
-                    case 'jpg':
-                    case 'jpeg':
-                        files_directory = '/files/images/';
-                        break;
-                    default:
-                        break;
+                const { url } = req.body;
+                try {
+                    // Launch Puppeteer to navigate to the webpage
+                    const browser = await puppeteer.launch({
+                        args: ['--no-sandbox', '--disable-setuid-sandbox'],
+                    });
+                    const page = await browser.newPage();
+                    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36');
+                    await page.goto(url, { waitUntil: 'networkidle2' });
+
+                    // Get the HTML content of the page
+                    const htmlContent = await page.content();
+                    await browser.close();
+
+                    // Use JSDOM to parse the HTML content
+                    const dom = new JSDOM(htmlContent, { url });
+
+                    // Use Readability to extract the readable content
+                    const reader = new Readability(dom.window.document);
+                    const article = reader.parse();
+
+                    if (article) {
+                        // Extract the plain text from the article content
+                        const plainText = article.textContent;
+
+                        // Return the plain text content
+                        res.send({ website_plain_text: plainText });
+                    } else {
+                        res.status(500).send({ error: 'Failed to extract readable content' });
+                    }
+                } catch (error: any) {
+                    console.error('Error scraping website:', error);
+                    res.status(500).send({ error: 'Failed to scrape website', details: error.message });
                 }
+            },
+        });
+
+        register({
+            method: Method.POST,
+            subscription: '/createDocument',
+            secureHandler: async ({ req, res }) => {
+                const { file_path } = req.body;
+                const public_path = path.join(publicDirectory, file_path);
+                const file_name = path.basename(file_path);
+
+                try {
+                    // Read file data and convert to base64
+                    const file_data: string = fs.readFileSync(public_path, { encoding: 'base64' });
 
-                const directory = path.join(publicDirectory, files_directory);
+                    const response = await axios.post(
+                        'http://localhost:8080/createDocument',
+                        {
+                            file_data,
+                            file_name,
+                        },
+                        {
+                            headers: {
+                                'Content-Type': 'application/json',
+                            },
+                        }
+                    );
 
-                if (!fs.existsSync(directory)) {
-                    fs.mkdirSync(directory);
+                    const jobId = response.data['job_id'];
+                    console.log('Job ID:', jobId);
+
+                    res.send({ jobId });
+                } catch (error: any) {
+                    console.error('Error communicating with chatbot:', error);
+                    res.status(500).send({ error: 'Failed to communicate with the chatbot', details: error.message });
+                }
+            },
+        });
+
+        register({
+            method: Method.GET,
+            subscription: '/getProgress/:jobId',
+            secureHandler: async ({ req, res }) => {
+                const { jobId } = req.params;
+                try {
+                    const progressResponse = await axios.get(`http://localhost:8080/getProgress/${jobId}`);
+                    console.log(`Current step: ${progressResponse.data.step}, Progress within step: ${progressResponse.data.progress}%`);
+                    res.json(progressResponse.data);
+                } catch (error: any) {
+                    console.error('Error getting progress:', error);
+                    res.status(500).send({ error: 'Failed to get progress', details: error.message });
                 }
-                const file = await openai.files.content(file_id);
-
-                const new_file_name = `${uuid.v4()}-${file_name}`;
-                const file_path = path.join(directory, new_file_name);
-                const file_array_buffer = await file.arrayBuffer();
-                const bufferView = new Uint8Array(file_array_buffer);
+            },
+        });
+
+        register({
+            method: Method.GET,
+            subscription: '/getResult/:jobId',
+            secureHandler: async ({ req, res }) => {
+                const { jobId } = req.params;
                 try {
-                    const written_file = await writeFileAsync(file_path, bufferView);
-                    console.log(written_file);
-                    console.log(file_path);
-                    console.log(file_array_buffer);
-                    console.log(bufferView);
-                    const file_object = new File([bufferView], file_name);
-                    //DashUploadUtils.upload(file_object, 'openAIFiles');
-                    res.send({ file_path: path.join(files_directory, new_file_name) });
-                    /* res.send( {
-                        source: "file",
-                        result: {
-                            accessPaths: {
-                                agnostic: {client: path.join('/files/openAIFiles/', `${uuid.v4()}-${file_name}`)}
-                            },
-                            rawText: "",
-                            duration: 0,
-                        },
-                    } ); */
+                    const finalResponse = await axios.get(`http://localhost:8080/getResult/${jobId}`);
+                    console.log('Result:', finalResponse.data);
+                    const result = finalResponse.data;
+
+                    if (result.chunks && Array.isArray(result.chunks)) {
+                        for (const chunk of result.chunks) {
+                            if (chunk.metadata && (chunk.metadata.type === 'image' || chunk.metadata.type === 'table')) {
+                                let files_directory = '/files/chunk_images/';
+                                const directory = path.join(publicDirectory, files_directory);
+
+                                if (!fs.existsSync(directory)) {
+                                    fs.mkdirSync(directory);
+                                }
+
+                                const fileName = path.basename(chunk.metadata.file_path);
+                                const filePath = path.join(directory, fileName);
+
+                                // Check if base64_data exists
+                                if (chunk.metadata.base64_data) {
+                                    // Decode Base64 and save as file
+                                    const buffer = Buffer.from(chunk.metadata.base64_data, 'base64');
+                                    await fs.promises.writeFile(filePath, buffer);
+
+                                    // Update the file path in the chunk
+                                    chunk.metadata.file_path = path.join(files_directory, fileName);
+                                    chunk.metadata.base64_data = undefined;
+                                } else {
+                                    console.warn(`No base64_data found for chunk: ${fileName}`);
+                                }
+                            }
+                        }
+                        result['status'] = 'completed';
+                    } else {
+                        console.warn('Not ready');
+                        result.status = 'pending';
+                    }
+                    res.json(result);
                 } catch (error) {
-                    res.status(500).send({ error: 'Failed to write file' + error });
+                    console.error('Error getting result:', error);
+                    res.status(500).send({ error: 'Failed to get result', details: error });
+                }
+            },
+        });
+
+        register({
+            method: Method.POST,
+            subscription: '/formatChunks',
+            secureHandler: async ({ req, res }) => {
+                const { relevantChunks } = req.body;
+                const content: { type: string; text?: string; image_url?: { url: string } }[] = [{ type: 'text', text: '<chunks>' }];
+
+                for (const chunk of relevantChunks) {
+                    content.push({
+                        type: 'text',
+                        text: `<chunk chunk_id=${chunk.id} chunk_type=${chunk.metadata.type}>`,
+                    });
+
+                    if (chunk.metadata.type === 'image' || chunk.metadata.type === 'table') {
+                        try {
+                            const filePath = serverPathToFile(Directory.chunk_images, chunk.metadata.file_path);
+                            const imageBuffer = await readFileAsync(filePath);
+                            const base64Image = imageBuffer.toString('base64');
+                            if (base64Image) {
+                                content.push({
+                                    type: 'image_url',
+                                    image_url: {
+                                        url: `data:image/jpeg;base64,${base64Image}`,
+                                    },
+                                });
+                            } else {
+                                console.log(`Failed to encode image for chunk ${chunk.id}`);
+                            }
+                        } catch (error) {
+                            console.error(`Error reading image file for chunk ${chunk.id}:`, error);
+                        }
+                    }
+                    content.push({ type: 'text', text: `${chunk.metadata.text}\n</chunk>\n` });
+                }
+
+                content.push({ type: 'text', text: '</chunks>' });
+
+                res.send({ formattedChunks: content });
+            },
+        });
+
+        register({
+            method: Method.POST,
+            subscription: '/createCSV',
+            secureHandler: async ({ req, res }) => {
+                const { filename, data } = req.body;
+
+                // Validate input
+                if (!filename || !data) {
+                    res.status(400).send({ error: 'Filename and data fields are required.' });
+                    return;
+                }
+
+                try {
+                    // Generate a UUID for the file
+                    const uuidv4 = uuid.v4();
+
+                    // Construct the full filename with the UUID prefix
+                    const fullFilename = `${uuidv4}-${filename}`;
+
+                    // Get the full server path where the file will be saved
+                    const serverFilePath = serverPathToFile(Directory.csv, fullFilename);
+
+                    // Write the CSV data (which is a raw string) to the file
+                    await writeFileAsync(serverFilePath, data, 'utf8');
+
+                    // Construct the full client URL for accessing the file
+                    const fileUrl = clientPathToFile(Directory.csv, fullFilename);
+
+                    // Return the file URL and UUID to the client
+                    res.send({ fileUrl, id: uuidv4 });
+                } catch (error: any) {
+                    console.error('Error creating CSV file:', error);
+                    res.status(500).send({ error: 'Failed to create CSV file.', details: error.message });
+                }
+            },
+        });
+
+        register({
+            method: Method.POST,
+            subscription: '/chunkDocument',
+            secureHandler: async ({ req, res }) => {
+                const { file_path } = req.body;
+                const public_path = path.join(publicDirectory, file_path);
+                const file_name = path.basename(file_path);
+
+                try {
+                    // Read the file into a buffer for the Unstructured API
+                    const file_data = await fs.promises.readFile(public_path);
+
+                    try {
+                        const result = await unstructuredClient.general.partition({
+                            partitionParameters: {
+                                files: {
+                                    content: file_data,
+                                    fileName: file_name,
+                                },
+                                strategy: Strategy.Auto,
+                                chunkingStrategy: ChunkingStrategy.ByTitle,
+                                extractImageBlockTypes: ['Image', 'Table'],
+                            },
+                        });
+
+                        if (result.statusCode === 200) {
+                            console.log(result.elements);
+                            const jsonElements = JSON.stringify(result.elements, null, 2);
+                            // Print the processed data.
+                            console.log(jsonElements);
+                            res.send({ document_json: jsonElements });
+                        } else {
+                            console.error(`Unexpected status code: ${result.statusCode}`);
+                            res.status(result.statusCode).send({ error: 'Failed to process the document', details: result });
+                        }
+                    } catch (e: any) {
+                        console.error('Error during partitioning:', e);
+                        res.status(500).send({ error: 'Failed to partition the document', details: e.message });
+                    }
+                } catch (error: any) {
+                    console.error('Error reading file:', error);
+                    res.status(500).send({ error: 'Failed to read the file', details: error.message });
                 }
             },
         });
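
For reference, the new document-processing routes form a job-based flow: POST /createDocument returns a jobId, GET /getProgress/:jobId reports the current step and percentage, and GET /getResult/:jobId returns the parsed chunks once ready. A minimal client-side sketch of that flow follows, assuming an axios client pointed at the same origin as the Dash server; the helper name and the 1-second polling interval are illustrative assumptions, not part of this change:

    // Hypothetical client helper; assumes the routes registered above are served from the same origin.
    import axios from 'axios';

    const wait = (ms: number) => new Promise(resolve => setTimeout(resolve, ms));

    async function processDocument(file_path: string) {
        // Start the server-side job; AssistantManager responds with { jobId }.
        const { data } = await axios.post('/createDocument', { file_path });
        const jobId: string = data.jobId;

        for (;;) {
            // Progress is proxied from the local processing service (step + % within step).
            const { data: progress } = await axios.get(`/getProgress/${jobId}`);
            console.log(`step ${progress.step}: ${progress.progress}%`);

            // /getResult marks the payload 'completed' once chunks (with rewritten
            // file_path entries for extracted images/tables) are available.
            const { data: result } = await axios.get(`/getResult/${jobId}`);
            if (result.status === 'completed') {
                return result;
            }
            await wait(1000); // polling interval chosen arbitrarily for this sketch
        }
    }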