import * as fs from 'fs';
import { createReadStream, writeFile } from 'fs';
import OpenAI from 'openai';
import * as path from 'path';
import { promisify } from 'util';
import * as uuid from 'uuid';
import { filesDirectory, publicDirectory } from '../SocketData';
import { Method } from '../RouteManager';
import ApiManager, { Registration } from './ApiManager';
import axios from 'axios';
import { RAGChunk } from '../../client/views/nodes/ChatBox/types';
import { UnstructuredClient } from 'unstructured-client';
import { PartitionResponse } from 'unstructured-client/sdk/models/operations';
import { ChunkingStrategy, Strategy } from 'unstructured-client/sdk/models/shared';
import * as cheerio from 'cheerio';
import { ScrapflyClient, ScrapeConfig } from 'scrapfly-sdk';
import { google } from 'googleapis';
import * as puppeteer from 'puppeteer';
import { JSDOM } from 'jsdom';
import { Readability } from '@mozilla/readability';

export enum Directory {
    parsed_files = 'parsed_files',
    images = 'images',
    videos = 'videos',
    pdfs = 'pdfs',
    text = 'text',
    pdf_thumbnails = 'pdf_thumbnails',
    audio = 'audio',
    csv = 'csv',
    chunk_images = 'chunk_images',
    scrape_images = 'scrape_images',
}

export function serverPathToFile(directory: Directory, filename: string) {
    return path.normalize(`${filesDirectory}/${directory}/${filename}`);
}

export function pathToDirectory(directory: Directory) {
    return path.normalize(`${filesDirectory}/${directory}`);
}

export function clientPathToFile(directory: Directory, filename: string) {
    return `/files/${directory}/${filename}`;
}

const writeFileAsync = promisify(writeFile);
const readFileAsync = promisify(fs.readFile);

export default class AssistantManager extends ApiManager {
    protected initialize(register: Registration): void {
        const openai = new OpenAI({
            apiKey: process.env._CLIENT_OPENAI_KEY, // Use the client key so a key does not have to be set separately for client and server.
            dangerouslyAllowBrowser: true,
        });
        const unstructuredClient = new UnstructuredClient({
            security: {
                apiKeyAuth: process.env._CLIENT_UNSTRUCTURED_API_KEY!,
            },
        });
        const scrapflyClient = new ScrapflyClient({ key: process.env._CLIENT_SCRAPFLY_API_KEY! });
        const customsearch = google.customsearch('v1');

        register({
            method: Method.POST,
            subscription: '/getWikipediaSummary',
            secureHandler: async ({ req, res }) => {
                const { title } = req.body;
                try {
                    const response = await axios.get('https://en.wikipedia.org/w/api.php', {
                        params: {
                            action: 'query',
                            list: 'search',
                            srsearch: title,
                            format: 'json',
                        },
                    });
                    // Guard against empty search results before reading the snippet.
                    const summary = response.data.query.search?.[0]?.snippet;
                    if (!summary || summary.trim().length === 0) {
                        res.send({ text: 'No article found with that title.' });
                    } else {
                        res.send({ text: summary });
                    }
                } catch (error: any) {
                    console.error('Error retrieving article summary from Wikipedia:', error);
                    res.status(500).send({ error: 'Error retrieving article summary from Wikipedia.', details: error.message });
                }
            },
        });
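        // Note: MediaWiki search snippets arrive with HTML highlighting markup such as
        // <span class="searchmatch">…</span>. A caller that wants plain text could strip it
        // before display; a minimal sketch (the regex is an assumption, not part of this route's contract):
        //   const plainSummary = summary.replace(/<[^>]*>/g, '');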
        register({
            method: Method.POST,
            subscription: '/getWebSearchResults',
            secureHandler: async ({ req, res }) => {
                const { query, max_results } = req.body;
                try {
                    const response = await customsearch.cse.list({
                        q: query,
                        cx: process.env._CLIENT_GOOGLE_SEARCH_ENGINE_ID,
                        key: process.env._CLIENT_GOOGLE_API_KEY,
                        safe: 'active',
                        num: max_results,
                    });
                    const results =
                        response.data.items?.map((item: any) => ({
                            url: item.link,
                            snippet: item.snippet,
                        })) || [];
                    res.send({ results });
                } catch (error: any) {
                    console.error('Error performing web search:', error);
                    res.status(500).send({ error: 'Failed to perform web search', details: error.message });
                }
            },
        });

        const axiosInstance = axios.create({
            headers: {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            },
        });

        const delay = (ms: number) => new Promise(resolve => setTimeout(resolve, ms));

        const fetchWithRetry = async (url: string, retries = 3, backoff = 300) => {
            try {
                const response = await axiosInstance.get(url);
                return response.data;
            } catch (error: any) {
                if (retries > 0 && error.response && error.response.status === 429) {
                    console.log(`Rate limited. Retrying in ${backoff}ms...`);
                    await delay(backoff);
                    return fetchWithRetry(url, retries - 1, backoff * 2);
                }
                throw error;
            }
        };
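        // fetchWithRetry implements exponential backoff for 429 (Too Many Requests) responses but is
        // not wired to a route above. A hypothetical caller could swap it in for a bare axios.get
        // wherever a scraped host rate-limits, e.g.:
        //   const html = await fetchWithRetry('https://example.com/article'); // retries after 300ms, 600ms, 1200ms
        //   const $ = cheerio.load(html); // cheerio (imported above) could then parse the returned HTML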
        register({
            method: Method.POST,
            subscription: '/scrapeWebsite',
            secureHandler: async ({ req, res }) => {
                const { url } = req.body;
                let browser: puppeteer.Browser | undefined;
                try {
                    // Launch Puppeteer and navigate to the webpage.
                    browser = await puppeteer.launch({
                        args: ['--no-sandbox', '--disable-setuid-sandbox'],
                    });
                    const page = await browser.newPage();
                    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36');
                    await page.goto(url, { waitUntil: 'networkidle2' });

                    // Get the fully rendered HTML content of the page.
                    const htmlContent = await page.content();

                    // Use JSDOM to parse the HTML content.
                    const dom = new JSDOM(htmlContent, { url });

                    // Use Readability to extract the readable article content.
                    const reader = new Readability(dom.window.document);
                    const article = reader.parse();

                    if (article) {
                        // Return the plain text extracted from the article content.
                        const plainText = article.textContent;
                        res.send({ website_plain_text: plainText });
                    } else {
                        res.status(500).send({ error: 'Failed to extract readable content' });
                    }
                } catch (error: any) {
                    console.error('Error scraping website:', error);
                    res.status(500).send({ error: 'Failed to scrape website', details: error.message });
                } finally {
                    // Always release the browser, even if scraping failed partway through.
                    await browser?.close();
                }
            },
        });

        register({
            method: Method.POST,
            subscription: '/createDocument',
            secureHandler: async ({ req, res }) => {
                const { file_path } = req.body;
                const public_path = path.join(publicDirectory, file_path);
                const file_name = path.basename(file_path);
                try {
                    // Read the file and encode it as base64 for transport.
                    const file_data = fs.readFileSync(public_path, { encoding: 'base64' });
                    const response = await axios.post(
                        'http://localhost:8080/createDocument',
                        { file_data, file_name },
                        { headers: { 'Content-Type': 'application/json' } }
                    );
                    const jobId = response.data.job_id;

                    // Poll for results until the job completes.
                    let result;
                    while (!result) {
                        await new Promise(resolve => setTimeout(resolve, 5000)); // Wait 5 seconds between polls.
                        const resultResponse = await axios.get(`http://localhost:8080/getResult/${jobId}`);
                        if (resultResponse.status === 200) {
                            result = resultResponse.data;
                        }
                    }

                    if (result.chunks && Array.isArray(result.chunks)) {
                        for (const chunk of result.chunks) {
                            if (chunk.metadata && (chunk.metadata.type === 'image' || chunk.metadata.type === 'table')) {
                                const files_directory = '/files/chunk_images/';
                                const directory = path.join(publicDirectory, files_directory);
                                if (!fs.existsSync(directory)) {
                                    fs.mkdirSync(directory, { recursive: true });
                                }
                                const fileName = path.basename(chunk.metadata.file_path);
                                const filePath = path.join(directory, fileName);

                                if (chunk.metadata.base64_data) {
                                    // Decode the base64 payload and save it as a file.
                                    const buffer = Buffer.from(chunk.metadata.base64_data, 'base64');
                                    await fs.promises.writeFile(filePath, buffer);
                                    // Point the chunk at the saved file and drop the inline payload.
                                    chunk.metadata.file_path = path.join(files_directory, fileName);
                                    chunk.metadata.base64_data = undefined;
                                } else {
                                    console.warn(`No base64_data found for chunk: ${fileName}`);
                                }
                            }
                        }
                    } else {
                        console.warn("Result does not contain an iterable 'chunks' property");
                    }

                    res.send({ document_json: result });
                } catch (error: any) {
                    console.error('Error communicating with chatbot:', error);
                    res.status(500).send({ error: 'Failed to communicate with the chatbot', details: error.message });
                }
            },
        });

        register({
            method: Method.POST,
            subscription: '/formatChunks',
            secureHandler: async ({ req, res }) => {
                const { relevantChunks } = req.body;
                const content: { type: string; text?: string; image_url?: { url: string } }[] = [{ type: 'text', text: '' }];

                for (const chunk of relevantChunks) {
                    content.push({ type: 'text', text: '' });

                    if (chunk.metadata.type === 'image' || chunk.metadata.type === 'table') {
                        try {
                            const filePath = serverPathToFile(Directory.parsed_files, chunk.metadata.file_path);
                            const imageBuffer = await readFileAsync(filePath);
                            const base64Image = imageBuffer.toString('base64');
                            if (base64Image) {
                                content.push({
                                    type: 'image_url',
                                    image_url: {
                                        url: `data:image/jpeg;base64,${base64Image}`,
                                    },
                                });
                            } else {
                                console.log(`Failed to encode image for chunk ${chunk.id}`);
                            }
                        } catch (error) {
                            console.error(`Error reading image file for chunk ${chunk.id}:`, error);
                        }
                        content.push({ type: 'text', text: '\n\n' });
                    } else {
                        content.push({ type: 'text', text: `${chunk.metadata.text}\n\n` });
                    }
                }

                content.push({ type: 'text', text: '' });
                res.send({ formattedChunks: content });
            },
        });
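        // Sketch of a downstream consumer for /formatChunks (hypothetical; the real client call lives
        // elsewhere): the formattedChunks array matches the OpenAI chat "content parts" shape, so it
        // could be forwarded as a user message to a vision-capable model:
        //   const completion = await openai.chat.completions.create({
        //       model: 'gpt-4o', // assumed model choice, not taken from this file
        //       messages: [{ role: 'user', content: formattedChunks }],
        //   });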
        register({
            method: Method.POST,
            subscription: '/chunkDocument',
            secureHandler: async ({ req, res }) => {
                const { file_path } = req.body;
                const public_path = path.join(publicDirectory, file_path);
                const file_name = path.basename(file_path);
                try {
                    // Read the raw file data for the partitioning request.
                    const file_data = await fs.promises.readFile(public_path);
                    try {
                        const result = await unstructuredClient.general.partition({
                            partitionParameters: {
                                files: {
                                    content: file_data,
                                    fileName: file_name,
                                },
                                strategy: Strategy.Auto,
                                chunkingStrategy: ChunkingStrategy.ByTitle,
                                extractImageBlockTypes: ['Image', 'Table'],
                            },
                        });
                        if (result.statusCode === 200) {
                            const jsonElements = JSON.stringify(result.elements, null, 2);
                            // Print the processed data.
                            console.log(jsonElements);
                            res.send({ document_json: jsonElements });
                        } else {
                            console.error(`Unexpected status code: ${result.statusCode}`);
                            res.status(result.statusCode).send({ error: 'Failed to process the document', details: result });
                        }
                    } catch (e: any) {
                        console.error('Error during partitioning:', e);
                        res.status(500).send({ error: 'Failed to partition the document', details: e.message });
                    }
                } catch (error: any) {
                    console.error('Error reading file:', error);
                    res.status(500).send({ error: 'Failed to read the file', details: error.message });
                }
            },
        });
    }
}
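// Hypothetical client-side call for /chunkDocument (a sketch; the actual client code is not shown here):
//   const { data } = await axios.post('/chunkDocument', { file_path: 'pdfs/report.pdf' });
//   const elements = JSON.parse(data.document_json); // partitioned elements, chunked by title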