Diffstat (limited to 'src/server')
-rw-r--r-- | src/server/ApiManagers/AssistantManager.ts | 842
-rw-r--r-- | src/server/ApiManagers/DataVizManager.ts | 2
-rw-r--r-- | src/server/ApiManagers/FireflyManager.ts | 410
-rw-r--r-- | src/server/ApiManagers/FlashcardManager.ts | 161
-rw-r--r-- | src/server/ApiManagers/UploadManager.ts | 35
-rw-r--r-- | src/server/DashUploadUtils.ts | 59
-rw-r--r-- | src/server/GarbageCollector.ts | 3
-rw-r--r-- | src/server/RouteManager.ts | 3
-rw-r--r-- | src/server/apis/google/GoogleApiServerUtils.ts | 93
-rw-r--r-- | src/server/authentication/DashUserModel.ts | 5
-rw-r--r-- | src/server/chunker/pdf_chunker.py | 843
-rw-r--r-- | src/server/chunker/requirements.txt | 15
-rw-r--r-- | src/server/flashcard/labels.py | 285
-rw-r--r-- | src/server/flashcard/requirements.txt | 12
-rw-r--r-- | src/server/flashcard/venv/pyvenv.cfg | 3
-rw-r--r-- | src/server/index.ts | 5
-rw-r--r-- | src/server/server_Initialization.ts | 12
-rw-r--r-- | src/server/websocket.ts | 42 |
18 files changed, 2639 insertions, 191 deletions
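
The centerpiece of this change is AssistantManager.ts, which puts a Python chunking pipeline behind three new job-tracking routes (/createDocument, /getProgress/:jobId, /getResult/:jobId). As a reading aid for the diff below, here is a minimal, hypothetical client-side sketch of that flow: the request/response shapes ({ file_path } in, { jobId } out, { step, progress } for progress, HTTP 202 while a result is pending) mirror the handlers added in this commit, while the helper name, base URL, and 2-second polling interval are illustrative assumptions.

// Hypothetical driver for the job-tracking routes added in AssistantManager.ts.
// Base URL and polling interval are assumptions, not part of this commit.
async function createDocumentAndWait(filePath: string, baseUrl = 'http://localhost:1050'): Promise<unknown> {
    // Kick off processing; the server spawns the Python chunker and returns a job ID.
    const start = await fetch(`${baseUrl}/createDocument`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ file_path: filePath }),
    });
    const { jobId } = (await start.json()) as { jobId: string };

    for (;;) {
        // The progress route relays PROGRESS lines parsed from the Python script's stderr.
        const progress = (await (await fetch(`${baseUrl}/getProgress/${jobId}`)).json()) as { step: string; progress: string | number };
        console.log(`${progress.step}: ${progress.progress}%`);

        // The result route answers 202 until the job finishes; a 'completed' body carries the chunks.
        const resultRes = await fetch(`${baseUrl}/getResult/${jobId}`);
        if (resultRes.status !== 202) {
            const result = (await resultRes.json()) as { status?: string };
            if (result.status !== 'pending') return result;
        }
        await new Promise(r => setTimeout(r, 2000)); // assumed 2s poll interval
    }
}
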
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts index b42314e41..af25722a4 100644 --- a/src/server/ApiManagers/AssistantManager.ts +++ b/src/server/ApiManagers/AssistantManager.ts @@ -1,13 +1,33 @@ +/** + * @file AssistantManager.ts + * @description This file defines the AssistantManager class, responsible for managing various + * API routes related to the Assistant functionality. It provides features such as file handling, + * web scraping, and integration with third-party APIs like OpenAI and Google Custom Search. + * It also handles job tracking and progress reporting for tasks like document creation and web scraping. + * Utility functions for path manipulation and file operations are included, along with + * a mechanism for handling retry logic during API calls. + */ + +import { Readability } from '@mozilla/readability'; +import axios from 'axios'; +import { spawn } from 'child_process'; import * as fs from 'fs'; -import { createReadStream, writeFile } from 'fs'; +import { writeFile } from 'fs'; +import { google } from 'googleapis'; +import { JSDOM } from 'jsdom'; import OpenAI from 'openai'; import * as path from 'path'; +import * as puppeteer from 'puppeteer'; import { promisify } from 'util'; import * as uuid from 'uuid'; -import { filesDirectory, publicDirectory } from '../SocketData'; +import { AI_Document } from '../../client/views/nodes/chatbot/types/types'; +import { DashUploadUtils } from '../DashUploadUtils'; import { Method } from '../RouteManager'; +import { filesDirectory, publicDirectory } from '../SocketData'; import ApiManager, { Registration } from './ApiManager'; +import { env } from 'process'; +// Enumeration of directories where different file types are stored export enum Directory { parsed_files = 'parsed_files', images = 'images', @@ -17,115 +37,795 @@ export enum Directory { pdf_thumbnails = 'pdf_thumbnails', audio = 'audio', csv = 'csv', + chunk_images = 'chunk_images', + scrape_images = 'scrape_images', } +// In-memory job tracking +const jobResults: { [key: string]: unknown } = {}; +const jobProgress: { [key: string]: unknown } = {}; + +/** + * Constructs a normalized path to a file in the server's file system. + * @param directory The directory where the file is stored. + * @param filename The name of the file. + * @returns The full normalized path to the file. + */ export function serverPathToFile(directory: Directory, filename: string) { return path.normalize(`${filesDirectory}/${directory}/${filename}`); } +/** + * Constructs a normalized path to a directory in the server's file system. + * @param directory The directory to access. + * @returns The full normalized path to the directory. + */ export function pathToDirectory(directory: Directory) { return path.normalize(`${filesDirectory}/${directory}`); } +/** + * Constructs the client-accessible URL for a file. + * @param directory The directory where the file is stored. + * @param filename The name of the file. + * @returns The URL path to the file. + */ export function clientPathToFile(directory: Directory, filename: string) { return `/files/${directory}/${filename}`; } +// Promisified versions of filesystem functions const writeFileAsync = promisify(writeFile); const readFileAsync = promisify(fs.readFile); +/** + * Class responsible for handling various API routes related to the Assistant functionality. + * This class extends `ApiManager` and handles registration of routes and secure request handlers. 
+ */ export default class AssistantManager extends ApiManager { + /** + * Registers all API routes and initializes necessary services like OpenAI and Google Custom Search. + * @param register The registration method to register routes and handlers. + */ protected initialize(register: Registration): void { - const openai = new OpenAI({ apiKey: process.env.OPENAI_KEY, dangerouslyAllowBrowser: true }); + // Initialize Google Custom Search API + const customsearch = google.customsearch('v1'); + const openai = new OpenAI({ apiKey: env.OPENAI_API_KEY }); + + // Register Wikipedia summary API route + register({ + method: Method.POST, + subscription: '/getWikipediaSummary', + secureHandler: async ({ req, res }) => { + const { title } = req.body; + try { + // Fetch summary from Wikipedia using axios + const response = await axios.get('https://en.wikipedia.org/w/api.php', { + params: { + action: 'query', + list: 'search', + srsearch: title, + format: 'json', + }, + }); + const summary = response.data.query.search[0]?.snippet || 'No article found with that title.'; + res.send({ text: summary }); + } catch (error) { + console.error('Error retrieving Wikipedia summary:', error); + res.status(500).send({ + error: 'Error retrieving article summary from Wikipedia.', + }); + } + }, + }); + // Register an API route to retrieve web search results using Google Custom Search + // This route filters results by checking their x-frame-options headers for security purposes register({ method: Method.POST, - subscription: '/uploadPDFToVectorStore', + subscription: '/getWebSearchResults', secureHandler: async ({ req, res }) => { - const { urls, threadID, assistantID, vector_store_id } = req.body; - - const csvFilesIds: string[] = []; - const otherFileIds: string[] = []; - const allFileIds: string[] = []; - - const fileProcesses = urls.map(async (source: string) => { - const fullPath = path.join(publicDirectory, source); - const fileData = await openai.files.create({ file: createReadStream(fullPath), purpose: 'assistants' }); - allFileIds.push(fileData.id); - if (source.endsWith('.csv')) { - console.log(source); - csvFilesIds.push(fileData.id); + const { query, max_results } = req.body; + const MIN_VALID_RESULTS_RATIO = 0.75; // 3/4 threshold + let startIndex = 1; // Start at the first result initially + const fetchSearchResults = async (start: number) => { + return customsearch.cse.list({ + q: query, + cx: process.env._CLIENT_GOOGLE_SEARCH_ENGINE_ID, + key: process.env._CLIENT_GOOGLE_API_KEY, + safe: 'active', + num: max_results, + start, // This controls which result index the search starts from + }); + }; + + const filterResultsByXFrameOptions = async ( + results: { + url: string | null | undefined; + snippet: string | null | undefined; + }[] + ) => { + const filteredResults = await Promise.all( + results + .filter(result => result.url) + .map(async result => { + try { + const urlResponse = await axios.head(result.url!, { timeout: 5000 }); + const xFrameOptions = urlResponse.headers['x-frame-options']; + if (xFrameOptions && xFrameOptions.toUpperCase() === 'SAMEORIGIN') { + return result; + } + } catch (error) { + console.error(`Error checking x-frame-options for URL: ${result.url}`, error); + } + return null; // Exclude the result if it doesn't match + }) + ); + return filteredResults.filter(result => result !== null); // Remove null results + }; + + try { + // Fetch initial search results + let response = await fetchSearchResults(startIndex); + const initialResults = + response.data.items?.map(item => ({ + url: 
item.link, + snippet: item.snippet, + })) || []; + + // Filter the initial results + let validResults = await filterResultsByXFrameOptions(initialResults); + + // If valid results are less than 3/4 of max_results, fetch more results + while (validResults.length < max_results * MIN_VALID_RESULTS_RATIO) { + // Increment the start index by the max_results to fetch the next set of results + startIndex += max_results; + response = await fetchSearchResults(startIndex); + + const additionalResults = + response.data.items?.map(item => ({ + url: item.link, + snippet: item.snippet, + })) || []; + + const additionalValidResults = await filterResultsByXFrameOptions(additionalResults); + validResults = [...validResults, ...additionalValidResults]; // Combine valid results + + // Break if no more results are available + if (additionalValidResults.length === 0 || response.data.items?.length === 0) { + break; + } + } + + // Return the filtered valid results + res.send({ results: validResults.slice(0, max_results) }); // Limit the results to max_results + } catch (error) { + console.error('Error performing web search:', error); + res.status(500).send({ + error: 'Failed to perform web search', + }); + } + }, + }); + + /** + * Converts a video file to audio format using ffmpeg. + * @param videoPath The path to the input video file. + * @param outputAudioPath The path to the output audio file. + * @returns A promise that resolves when the conversion is complete. + */ + function convertVideoToAudio(videoPath: string, outputAudioPath: string): Promise<void> { + return new Promise((resolve, reject) => { + const ffmpegProcess = spawn('ffmpeg', [ + '-i', + videoPath, // Input file + '-vn', // No video + '-acodec', + 'pcm_s16le', // Audio codec + '-ac', + '1', // Number of audio channels + '-ar', + '16000', // Audio sampling frequency + '-f', + 'wav', // Output format + outputAudioPath, // Output file + ]); + + ffmpegProcess.on('error', error => { + console.error('Error running ffmpeg:', error); + reject(error); + }); + + ffmpegProcess.on('close', code => { + if (code === 0) { + console.log('Audio extraction complete:', outputAudioPath); + resolve(); } else { - openai.beta.vectorStores.files.create(vector_store_id, { file_id: fileData.id }); - otherFileIds.push(fileData.id); + reject(new Error(`ffmpeg exited with code ${code}`)); } }); + }); + } + + // Register an API route to process a media file (audio or video) + // Extracts audio from video files, transcribes the audio using OpenAI Whisper, and provides a summary + register({ + method: Method.POST, + subscription: '/processMediaFile', + secureHandler: async ({ req, res }) => { + const { fileName } = req.body; + + // Ensure the filename is provided + if (!fileName) { + res.status(400).send({ error: 'Filename is required' }); + return; + } + + try { + // Determine the file type and location + const isAudio = fileName.toLowerCase().endsWith('.mp3'); + const directory = isAudio ? Directory.audio : Directory.videos; + const filePath = serverPathToFile(directory, fileName); + + // Check if the file exists + if (!fs.existsSync(filePath)) { + res.status(404).send({ error: 'File not found' }); + return; + } + + console.log(`Processing ${isAudio ? 
'audio' : 'video'} file: ${fileName}`); + + // Step 1: Extract audio if it's a video + let audioPath = filePath; + if (!isAudio) { + const audioFileName = `${path.basename(fileName, path.extname(fileName))}.wav`; + audioPath = path.join(pathToDirectory(Directory.audio), audioFileName); + + console.log('Extracting audio from video...'); + await convertVideoToAudio(filePath, audioPath); + } + + // Step 2: Transcribe audio using OpenAI Whisper + console.log('Transcribing audio...'); + const transcription = await openai.audio.transcriptions.create({ + file: fs.createReadStream(audioPath), + model: 'whisper-1', + response_format: 'verbose_json', + timestamp_granularities: ['segment'], + }); + + console.log('Audio transcription complete.'); + + // Step 3: Extract concise JSON + console.log('Extracting concise JSON...'); + const originalSegments = transcription.segments?.map((segment, index) => ({ + index: index.toString(), + text: segment.text, + start: segment.start, + end: segment.end, + })); + + interface ConciseSegment { + text: string; + indexes: string[]; + start: number | null; + end: number | null; + } + + const combinedSegments = []; + let currentGroup: ConciseSegment = { text: '', indexes: [], start: null, end: null }; + let currentDuration = 0; + + originalSegments?.forEach(segment => { + const segmentDuration = segment.end - segment.start; + + if (currentDuration + segmentDuration <= 4000) { + // Add segment to the current group + currentGroup.text += (currentGroup.text ? ' ' : '') + segment.text; + currentGroup.indexes.push(segment.index); + if (currentGroup.start === null) { + currentGroup.start = segment.start; + } + currentGroup.end = segment.end; + currentDuration += segmentDuration; + } else { + // Push the current group and start a new one + combinedSegments.push({ ...currentGroup }); + currentGroup = { + text: segment.text, + indexes: [segment.index], + start: segment.start, + end: segment.end, + }; + currentDuration = segmentDuration; + } + }); + + // Push the final group if it has content + if (currentGroup.text) { + combinedSegments.push({ ...currentGroup }); + } + const lastSegment = combinedSegments[combinedSegments.length - 1]; + + // Check if the last segment is too short and combine it with the second last + if (combinedSegments.length > 1 && lastSegment.end && lastSegment.start) { + const secondLastSegment = combinedSegments[combinedSegments.length - 2]; + const lastDuration = lastSegment.end - lastSegment.start; + + if (lastDuration < 30) { + // Combine the last segment with the second last + secondLastSegment.text += (secondLastSegment.text ? ' ' : '') + lastSegment.text; + secondLastSegment.indexes = secondLastSegment.indexes.concat(lastSegment.indexes); + secondLastSegment.end = lastSegment.end; + + // Remove the last segment from the array + combinedSegments.pop(); + } + } + + console.log('Segments combined successfully.'); + + console.log('Generating summary using GPT-4...'); + const combinedText = combinedSegments.map(segment => segment.text).join(' '); + + let summary = ''; + try { + const completion = await openai.chat.completions.create({ + messages: [{ role: 'system', content: `Summarize the following text in a concise paragraph:\n\n${combinedText}` }], + model: 'gpt-4o', + }); + console.log('Summary generation complete.'); + summary = completion.choices[0].message.content ?? 
'Summary could not be generated.'; + } catch (summaryError) { + console.error('Error generating summary:', summaryError); + summary = 'Summary could not be generated.'; + } + // Step 5: Return the JSON result + res.send({ full: originalSegments, condensed: combinedSegments, summary }); + } catch (error) { + console.error('Error processing media file:', error); + res.status(500).send({ error: 'Failed to process media file' }); + } + }, + }); + + // Axios instance with custom headers for scraping + const axiosInstance = axios.create({ + headers: { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', + }, + }); + + /** + * Utility function to introduce delay (used for retries). + * @param ms Delay in milliseconds. + */ + const delay = (ms: number) => new Promise(resolve => setTimeout(resolve, ms)); + + /** + * Function to fetch a URL with retry logic, handling rate limits. + * Retries a request if it fails due to rate limits (HTTP status 429). + * @param url The URL to fetch. + * @param retries The number of retry attempts. + * @param backoff Initial backoff time in milliseconds. + */ + const fetchWithRetry = async (url: string, retries = 3, backoff = 300): Promise<unknown> => { + try { + const response = await axiosInstance.get(url); + return response.data; + } catch (error) { + if (retries > 0 && (error as { response: { status: number } }).response?.status === 429) { // bcz: don't know the error type + console.log(`Rate limited. Retrying in ${backoff}ms...`); + await delay(backoff); + return fetchWithRetry(url, retries - 1, backoff * 2); + } // prettier-ignore + throw error; + } + }; + + // Register an API route to generate an image using OpenAI's DALL-E model + // Uploads the generated image to the server and provides a URL for access + register({ + method: Method.POST, + subscription: '/generateImage', + secureHandler: async ({ req, res }) => { + const { image_prompt } = req.body; + + if (!image_prompt) { + res.status(400).send({ error: 'No prompt provided' }); + return; + } + + try { + const image = await openai.images.generate({ model: 'dall-e-3', prompt: image_prompt, response_format: 'url' }); + console.log(image); + const result = await DashUploadUtils.UploadImage(image.data[0].url!); + + const url = image.data[0].url; + + res.send({ result, url }); + } catch (error) { + console.error('Error fetching the URL:', error); + res.status(500).send({ + error: 'Failed to fetch the URL', + }); + } + }, + }); + + // Register an API route to fetch data from a URL using a proxy with retry logic + // Useful for bypassing rate limits or scraping inaccessible data + register({ + method: Method.POST, + subscription: '/proxyFetch', + secureHandler: async ({ req, res }) => { + const { url } = req.body; + + if (!url) { + res.status(400).send({ error: 'No URL provided' }); + return; + } + + try { + const data = await fetchWithRetry(url); + res.send({ data }); + } catch (error) { + console.error('Error fetching the URL:', error); + res.status(500).send({ + error: 'Failed to fetch the URL', + }); + } + }, + }); + + // Register an API route to scrape website content using Puppeteer and JSDOM + // Extracts and returns readable content from a given URL + register({ + method: Method.POST, + subscription: '/scrapeWebsite', + secureHandler: async ({ req, res }) => { + const { url } = req.body; try { - await Promise.all(fileProcesses).then(() => { - res.send({ vector_store_id: vector_store_id, openai_file_ids: allFileIds }); + // 
Launch Puppeteer browser to navigate to the webpage + const browser = await puppeteer.launch({ + args: ['--no-sandbox', '--disable-setuid-sandbox'], }); + const page = await browser.newPage(); + await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'); + await page.goto(url, { waitUntil: 'networkidle2' }); + + // Extract HTML content + const htmlContent = await page.content(); + await browser.close(); + + // Parse HTML content using JSDOM + const dom = new JSDOM(htmlContent, { url }); + + // Extract readable content using Mozilla's Readability API + const reader = new Readability(dom.window.document); + const article = reader.parse(); + + if (article) { + const plainText = article.textContent; + res.send({ website_plain_text: plainText }); + } else { + res.status(500).send({ error: 'Failed to extract readable content' }); + } } catch (error) { - res.status(500).send({ error: 'Failed to process files' + error }); + console.error('Error scraping website:', error); + res.status(500).send({ + error: 'Failed to scrape website', + }); } }, }); + // Register an API route to create a document and start a background job for processing + // Uses Python scripts to process files and generate document chunks for further use register({ method: Method.POST, - subscription: '/downloadFileFromOpenAI', + subscription: '/createDocument', secureHandler: async ({ req, res }) => { - const { file_id, file_name } = req.body; - //let files_directory: string; - let files_directory = '/files/openAIFiles/'; - switch (file_name.split('.').pop()) { - case 'pdf': - files_directory = '/files/pdfs/'; - break; - case 'csv': - files_directory = '/files/csv/'; - break; - case 'png': - case 'jpg': - case 'jpeg': - files_directory = '/files/images/'; - break; - default: - break; - } - - const directory = path.join(publicDirectory, files_directory); - - if (!fs.existsSync(directory)) { - fs.mkdirSync(directory); - } - const file = await openai.files.content(file_id); - const new_file_name = `${uuid.v4()}-${file_name}`; - const file_path = path.join(directory, new_file_name); - const file_array_buffer = await file.arrayBuffer(); - const bufferView = new Uint8Array(file_array_buffer); + const { file_path } = req.body; + const public_path = path.join(publicDirectory, file_path); // Resolve the file path in the public directory + const file_name = path.basename(file_path); // Extract the file name from the path + try { - const written_file = await writeFileAsync(file_path, bufferView); - console.log(written_file); - console.log(file_path); - console.log(file_array_buffer); - console.log(bufferView); - const file_object = new File([bufferView], file_name); - //DashUploadUtils.upload(file_object, 'openAIFiles'); - res.send({ file_path: path.join(files_directory, new_file_name) }); - /* res.send( { - source: "file", - result: { - accessPaths: { - agnostic: {client: path.join('/files/openAIFiles/', `${uuid.v4()}-${file_name}`)} - }, - rawText: "", - duration: 0, - }, - } ); */ + // Read the file data and encode it as base64 + const file_data: string = fs.readFileSync(public_path, { encoding: 'base64' }); + + // Generate a unique job ID for tracking + const jobId = uuid.v4(); + + // Spawn the Python process and track its progress/output + // eslint-disable-next-line no-use-before-define + spawnPythonProcess(jobId, public_path); + + // Send the job ID back to the client for tracking + res.send({ jobId }); } catch (error) { - res.status(500).send({ 
error: 'Failed to write file' + error }); + console.error('Error initiating document creation:', error); + res.status(500).send({ + error: 'Failed to initiate document creation', + }); } }, }); + + // Register an API route to check the progress of a document creation job + // Returns the current step and progress percentage + register({ + method: Method.GET, + subscription: '/getProgress/:jobId', + secureHandler: async ({ req, res }) => { + const { jobId } = req.params; // Get the job ID from the URL parameters + // Check if the job progress is available + if (jobProgress[jobId]) { + res.json(jobProgress[jobId]); + } else { + res.json({ + step: 'Processing Document...', + progress: '0', + }); + } + }, + }); + + // Register an API route to retrieve the final result of a document creation job + // Returns the processed data or an error status if the job is incomplete + register({ + method: Method.GET, + subscription: '/getResult/:jobId', + secureHandler: async ({ req, res }) => { + const { jobId } = req.params; + if (jobResults[jobId]) { + const result = jobResults[jobId] as AI_Document & { status: string }; + + if (result.chunks && Array.isArray(result.chunks)) { + result.status = 'completed'; + } else { + result.status = 'pending'; + } + res.json(result); + } else { + res.status(202).send({ status: 'pending' }); + } + }, + }); + + // Register an API route to format chunks of text or images for structured display + // Converts raw chunk data into a structured format for frontend consumption + register({ + method: Method.POST, + subscription: '/formatChunks', + secureHandler: async ({ req, res }) => { + const { relevantChunks } = req.body; // Get the relevant chunks from the request body + + // Initialize an array to hold the formatted content + const content: { type: string; text?: string; image_url?: { url: string } }[] = [{ type: 'text', text: '<chunks>' }]; + + await Promise.all( + relevantChunks.map((chunk: { id: string; metadata: { type: string; text: TimeRanges; file_path: string } }) => { + // Format each chunk by adding its metadata and content + content.push({ + type: 'text', + text: `<chunk chunk_id=${chunk.id} chunk_type="${chunk.metadata.type}">`, + }); + + // If the chunk is an image or table, read the corresponding file and encode it as base64 + if (chunk.metadata.type === 'image' || chunk.metadata.type === 'table') { + try { + const filePath = path.join(pathToDirectory(Directory.chunk_images), chunk.metadata.file_path); // Get the file path + console.log(filePath); + readFileAsync(filePath).then(imageBuffer => { + const base64Image = imageBuffer.toString('base64'); // Convert the image to base64 + + // Add the base64-encoded image to the content array + if (base64Image) { + content.push({ + type: 'image_url', + image_url: { + url: `data:image/jpeg;base64,${base64Image}`, + }, + }); + } else { + console.log(`Failed to encode image for chunk ${chunk.id}`); + } + }); + } catch (error) { + console.error(`Error reading image file for chunk ${chunk.id}:`, error); + } + } + + // Add the chunk's text content to the formatted content + content.push({ type: 'text', text: `${chunk.metadata.text}\n</chunk>\n` }); + }) + ); + + content.push({ type: 'text', text: '</chunks>' }); + + // Send the formatted content back to the client + res.send({ formattedChunks: content }); + }, + }); + + // Register an API route to create and save a CSV file on the server + // Writes the CSV content to a unique file and provides a URL for download + register({ + method: Method.POST, + subscription: 
'/createCSV', + secureHandler: async ({ req, res }) => { + const { filename, data } = req.body; + + // Validate that both the filename and data are provided + if (!filename || !data) { + res.status(400).send({ error: 'Filename and data fields are required.' }); + return; + } + + try { + // Generate a UUID for the file to ensure unique naming + const uuidv4 = uuid.v4(); + const fullFilename = `${uuidv4}-${filename}`; // Prefix the file name with the UUID + + // Get the full server path where the file will be saved + const serverFilePath = serverPathToFile(Directory.csv, fullFilename); + + // Write the CSV data (which is a raw string) to the file + await writeFileAsync(serverFilePath, data, 'utf8'); + + // Construct the client-accessible URL for the file + const fileUrl = clientPathToFile(Directory.csv, fullFilename); + + // Send the file URL and UUID back to the client + res.send({ fileUrl, id: uuidv4 }); + } catch (error) { + console.error('Error creating CSV file:', error); + res.status(500).send({ + error: 'Failed to create CSV file.', + }); + } + }, + }); + } +} + +/** + * Spawns a Python process to handle file processing tasks. + * @param jobId The job ID for tracking progress. + * @param file_path The filepath of the file to process. + */ +function spawnPythonProcess(jobId: string, file_path: string) { + const venvPath = path.join(__dirname, '../chunker/venv'); + const requirementsPath = path.join(__dirname, '../chunker/requirements.txt'); + const pythonScriptPath = path.join(__dirname, '../chunker/pdf_chunker.py'); + + const outputDirectory = pathToDirectory(Directory.chunk_images); + + function runPythonScript() { + const pythonPath = process.platform === 'win32' ? path.join(venvPath, 'Scripts', 'python') : path.join(venvPath, 'bin', 'python3'); + + const pythonProcess = spawn(pythonPath, [pythonScriptPath, jobId, file_path, outputDirectory]); + + let pythonOutput = ''; + let stderrOutput = ''; + + pythonProcess.stdout.on('data', data => { + pythonOutput += data.toString(); + }); + + pythonProcess.stderr.on('data', data => { + stderrOutput += data.toString(); + const lines = stderrOutput.split('\n'); + stderrOutput = lines.pop() || ''; // Save the last partial line back to stderrOutput + lines.forEach(line => { + if (line.trim()) { + if (line.startsWith('PROGRESS:')) { + const jsonString = line.substring('PROGRESS:'.length); + try { + const parsedOutput = JSON.parse(jsonString); + if (parsedOutput.job_id && parsedOutput.progress !== undefined) { + jobProgress[parsedOutput.job_id] = { + step: parsedOutput.step, + progress: parsedOutput.progress, + }; + } else if (parsedOutput.progress !== undefined) { + jobProgress[jobId] = { + step: parsedOutput.step, + progress: parsedOutput.progress, + }; + } + } catch (err) { + console.error('Error parsing progress JSON:', jsonString, err); + } + } else { + // Log other stderr output + console.error('Python stderr:', line); + } + } + }); + }); + + pythonProcess.on('close', code => { + if (code === 0) { + try { + const finalResult = JSON.parse(pythonOutput); + jobResults[jobId] = finalResult; + jobProgress[jobId] = { step: 'Complete', progress: 100 }; + } catch (err) { + console.error('Error parsing final JSON result:', err); + jobResults[jobId] = { error: 'Failed to parse final result' }; + } + } else { + console.error(`Python process exited with code ${code}`); + // Check if there was an error message in stderr + if (stderrOutput) { + // Try to parse the last line as JSON + const lines = 
stderrOutput.trim().split('\n'); + const lastLine = lines[lines.length - 1]; + try { + const errorOutput = JSON.parse(lastLine); + jobResults[jobId] = errorOutput; + } catch { + jobResults[jobId] = { error: 'Python process failed' }; + } + } else { + jobResults[jobId] = { error: 'Python process failed' }; + } + } + }); + } + // Check if venv exists + if (!fs.existsSync(venvPath)) { + console.log('Virtual environment not found. Creating and setting up...'); + + // Create venv + const createVenvProcess = spawn('python', ['-m', 'venv', venvPath]); + + createVenvProcess.on('close', code => { + if (code !== 0) { + console.error(`Failed to create virtual environment. Exit code: ${code}`); + return; + } + + console.log('Virtual environment created. Installing requirements...'); + + // Determine the pip path based on the OS + const pipPath = process.platform === 'win32' ? path.join(venvPath, 'Scripts', 'pip.exe') : path.join(venvPath, 'bin', 'pip3'); // Try 'pip3' for Unix-like systems + + if (!fs.existsSync(pipPath)) { + console.error(`pip executable not found at ${pipPath}`); + return; + } + + // Install requirements + const installRequirementsProcess = spawn(pipPath, ['install', '-r', requirementsPath]); + + installRequirementsProcess.stdout.on('data', data => { + console.log(`pip stdout: ${data}`); + }); + + installRequirementsProcess.stderr.on('data', data => { + console.error(`pip stderr: ${data}`); + }); + + installRequirementsProcess.on('error', error => { + console.error(`Error starting pip process: ${error}`); + }); + + installRequirementsProcess.on('close', closecode => { + if (closecode !== 0) { + console.error(`Failed to install requirements. Exit code: ${closecode}`); + return; + } + + console.log('Requirements installed. Running Python script...'); + runPythonScript(); + }); + }); + } else { + console.log('Virtual environment found. 
Running Python script...'); + runPythonScript(); } } diff --git a/src/server/ApiManagers/DataVizManager.ts b/src/server/ApiManagers/DataVizManager.ts index 88f22992d..d2028f23b 100644 --- a/src/server/ApiManagers/DataVizManager.ts +++ b/src/server/ApiManagers/DataVizManager.ts @@ -9,7 +9,7 @@ export default class DataVizManager extends ApiManager { register({ method: Method.GET, subscription: '/csvData', - secureHandler: async ({ req, res }) => { + secureHandler: ({ req, res }) => { const uri = req.query.uri as string; return new Promise<void>(resolve => { diff --git a/src/server/ApiManagers/FireflyManager.ts b/src/server/ApiManagers/FireflyManager.ts new file mode 100644 index 000000000..e75ede9df --- /dev/null +++ b/src/server/ApiManagers/FireflyManager.ts @@ -0,0 +1,410 @@ +import axios from 'axios'; +import { Dropbox } from 'dropbox'; +import * as fs from 'fs'; +import * as multipart from 'parse-multipart-data'; +import * as path from 'path'; +import { DashUserModel } from '../authentication/DashUserModel'; +import { DashUploadUtils } from '../DashUploadUtils'; +import { _error, _invalid, _success, Method } from '../RouteManager'; +import { Directory, filesDirectory } from '../SocketData'; +import ApiManager, { Registration } from './ApiManager'; + +export default class FireflyManager extends ApiManager { + getBearerToken = () => + fetch('https://ims-na1.adobelogin.com/ims/token/v3', { + method: 'POST', + headers: { + 'Content-Type': 'application/x-www-form-urlencoded', + }, + body: `grant_type=client_credentials&client_id=${process.env._CLIENT_FIREFLY_CLIENT_ID}&client_secret=${process.env._CLIENT_FIREFLY_SECRET}&scope=openid,AdobeID,session,additional_info,read_organizations,firefly_api,ff_apis`, + }).catch(error => { + console.error('Error:', error); + return undefined; + }); + + generateImageFromStructure = (prompt: string = 'a realistic illustration of a cat coding', width: number = 2048, height: number = 2048, structureUrl: string, strength: number = 50, styles: string[], styleUrl: string | undefined) => + this.getBearerToken().then(response => + response?.json().then((data: { access_token: string }) => + //prettier-ignore + fetch('https://firefly-api.adobe.io/v3/images/generate', { + method: 'POST', + headers: [ + ['Content-Type', 'application/json'], + ['Accept', 'application/json'], + ['x-api-key', process.env._CLIENT_FIREFLY_CLIENT_ID ?? ''], + ['Authorization', `Bearer ${data.access_token}`], + ], + body: JSON.stringify({ + prompt, + numVariations: 4, + detailLevel: 'preview', + modelVersion: 'image3_fast', + size: { width, height }, + structure: !structureUrl + ? undefined + : { + strength, + imageReference: { + source: { url: structureUrl }, + }, + }, + // prettier-ignore + style: { + presets: styles, + imageReference : !styleUrl + ? 
undefined + : { + source: { url: styleUrl }, + } + } + }), + }) + .then(response2 => response2.json().then(json => + { + if (json.outputs?.length) + return (json.outputs as {image: {url:string }}[]).map(output => output.image); + throw new Error(JSON.stringify(json)); + }) + ) + ) + ); + + uploadImageToDropbox = (fileUrl: string, user: DashUserModel | undefined, dbx = new Dropbox({ accessToken: user?.dropboxToken || '' })) => + new Promise<string | Error>((res, rej) => + fs.readFile(path.join(filesDirectory, `${Directory.images}/${path.basename(fileUrl)}`), undefined, (err, contents) => { + if (err) { + console.log('Error: ', err); + rej(); + } else { + dbx.filesUpload({ path: `/Apps/browndash/${path.basename(fileUrl)}`, contents }) + .then(response => { + dbx.filesGetTemporaryLink({ path: response.result.path_display ?? '' }) + .then(link => res(link.result.link)) + .catch(e => res(new Error(e.toString()))); + }) + .catch(e => { + if (user?.dropboxRefresh) { + console.log('*********** try refresh dropbox for: ' + user.email + ' ***********'); + this.refreshDropboxToken(user).then(token => { + if (!token) { + console.log('Dropbox error: cannot refresh token'); + res(new Error(e.toString())); + } else { + const dbxNew = new Dropbox({ accessToken: user.dropboxToken || '' }); + dbxNew + .filesUpload({ path: `/Apps/browndash/${path.basename(fileUrl)}`, contents }) + .then(response => { + dbxNew + .filesGetTemporaryLink({ path: response.result.path_display ?? '' }) + .then(link => res(link.result.link)) + .catch(linkErr => res(new Error(linkErr.toString()))); + }) + .catch(uploadErr => { + console.log('Dropbox error:', uploadErr); + res(new Error(uploadErr.toString())); + }); + } + }); + } else { + console.log('Dropbox error:', e); + res(new Error(e.toString())); + } + }); + } + }) + ); + + generateImage = (prompt: string = 'a realistic illustration of a cat coding', width: number = 2048, height: number = 2048, seed?: number) => { + let body = `{ "prompt": "${prompt}", "size": { "width": ${width}, "height": ${height}} }`; + if (seed) { + console.log('RECEIVED SEED', seed); + body = `{ "prompt": "${prompt}", "size": { "width": ${width}, "height": ${height}}, "seeds": [${seed}]}`; + } + const fetched = this.getBearerToken().then(response => + response?.json().then((data: { access_token: string }) => + fetch('https://firefly-api.adobe.io/v3/images/generate', { + method: 'POST', + headers: [ + ['Content-Type', 'application/json'], + ['Accept', 'application/json'], + ['x-api-key', process.env._CLIENT_FIREFLY_CLIENT_ID ?? ''], + ['Authorization', `Bearer ${data.access_token}`], + ], + body: body, + }) + .then(response2 => response2.json()) + .then(json => (json.error_code ? json : { seed: json.outputs?.[0]?.seed, url: json.outputs?.[0]?.image?.url })) + .catch(error => { + console.error('Error:', error); + return undefined; + }) + ) + ); + return fetched; + }; + expandImage = (imgUrl: string, prompt?: string) => { + const dropboxImgUrl = imgUrl; + const fetched = this.getBearerToken().then(response => + response + ?.json() + .then((data: { access_token: string }) => { + return fetch('https://firefly-api.adobe.io/v3/images/expand', { + method: 'POST', + headers: [ + ['Content-Type', 'application/json'], + ['Accept', 'application/json'], + ['x-api-key', process.env._CLIENT_FIREFLY_CLIENT_ID ?? 
''], + ['Authorization', `Bearer ${data.access_token}`], + ], + body: JSON.stringify({ + image: { + source: { + url: dropboxImgUrl, + }, + }, + numVariations: 1, + seeds: [0], + size: { + width: 3048, + height: 2048, + }, + prompt: prompt ?? 'cloudy skies', + placement: { + inset: { + left: 0, + top: 0, + right: 0, + bottom: 0, + }, + alignment: { + horizontal: 'center', + vertical: 'center', + }, + }, + }), + }); + }) + .then(resp => resp.json()) + ); + return fetched; + }; + getImageText = (imageBlob: Blob) => { + const inputFileVarName = 'infile'; + const outputVarName = 'result'; + const fetched = this.getBearerToken().then(response => + response?.json().then((data: { access_token: string }) => { + return fetch('https://sensei.adobe.io/services/v2/predict', { + method: 'POST', + headers: [ + ['Prefer', 'respond-async, wait=59'], + ['x-api-key', process.env._CLIENT_FIREFLY_CLIENT_ID ?? ''], + // ['content-type', 'multipart/form-data'], // bcz: Don't set this!! content-type will get set automatically including the Boundary string + ['Authorization', `Bearer ${data.access_token}`], + ], + body: ((form: FormData) => { + form.set(inputFileVarName, imageBlob); + form.set( + 'contentAnalyzerRequests', + JSON.stringify({ + 'sensei:name': 'Feature:cintel-object-detection:Service-b9ace8b348b6433e9e7d82371aa16690', + 'sensei:invocation_mode': 'asynchronous', + 'sensei:invocation_batch': false, + 'sensei:engines': [ + { + 'sensei:execution_info': { + 'sensei:engine': 'Feature:cintel-object-detection:Service-b9ace8b348b6433e9e7d82371aa16690', + }, + 'sensei:inputs': { + documents: [ + { + 'sensei:multipart_field_name': inputFileVarName, + 'dc:format': 'image/png', + }, + ], + }, + 'sensei:params': { + correct_with_dictionary: true, + }, + 'sensei:outputs': { + result: { + 'sensei:multipart_field_name': outputVarName, + 'dc:format': 'application/json', + }, + }, + }, + ], + }) + ); + return form; + })(new FormData()), + }).then(response2 => { + const contentType = response2.headers.get('content-type') ?? ''; + if (contentType.includes('application/json')) { + return response2.json().then((json: object) => JSON.stringify(json)); + } + if (contentType.includes('multipart')) { + return response2 + .arrayBuffer() + .then(arrayBuffer => + multipart + .parse(Buffer.from(arrayBuffer), 'Boundary' + (response2.headers.get('content-type')?.match(/=Boundary(.*);/)?.[1] ?? '')) + .filter(part => part.name === outputVarName) + .map(part => JSON.parse(part.data.toString())[0]) + .reduce((text, json) => text + (json?.is_text_present ? 
json.tags.map((tag: { text: string }) => tag.text).join(' ') : ''), '') + ) + .catch(error => { + console.error('Error:', error); + return ''; + }); + } + return response2.text(); + }); + }) + ); + return fetched; + }; + + refreshDropboxToken = (user: DashUserModel) => + axios + .post( + 'https://api.dropbox.com/oauth2/token', + new URLSearchParams({ + refresh_token: user.dropboxRefresh || '', + grant_type: 'refresh_token', + client_id: process.env._CLIENT_DROPBOX_CLIENT_ID || '', + client_secret: process.env._CLIENT_DROPBOX_SECRET || '', + }).toString() + ) + .then(refresh => { + console.log('***** dropbox token refreshed for ' + user?.email + ' ******* '); + user.dropboxToken = refresh.data.access_token; + user.save(); + return user.dropboxToken; + }) + .catch(e => { + console.log(e); + return undefined; + }); + + protected initialize(register: Registration): void { + register({ + method: Method.POST, + subscription: '/queryFireflyImageFromStructure', + secureHandler: ({ req, res }) => + new Promise<void>(resolver => { + (req.body.styleUrl ? this.uploadImageToDropbox(req.body.styleUrl, req.user as DashUserModel) : Promise.resolve(undefined)) + .then(styleUrl => { + if (styleUrl instanceof Error) { + _invalid(res, styleUrl.message); + throw new Error('Error uploading images to dropbox'); + } + this.uploadImageToDropbox(req.body.structureUrl, req.user as DashUserModel) + .then(dropboxStructureUrl => { + if (dropboxStructureUrl instanceof Error) { + _invalid(res, dropboxStructureUrl.message); + throw new Error('Error uploading images to dropbox'); + } + return { styleUrl, structureUrl: dropboxStructureUrl }; + }) + .then(uploads => + this.generateImageFromStructure(req.body.prompt, req.body.width, req.body.height, uploads.structureUrl, req.body.strength, req.body.presets, uploads.styleUrl) + .then(images => { + Promise.all((images ?? [new Error('no images were generated')]).map(fire => (fire instanceof Error ? fire : DashUploadUtils.UploadImage(fire.url)))) + .then(dashImages => { + if (dashImages.every(img => img instanceof Error)) _invalid(res, dashImages[0]!.message); + else _success(res, JSON.stringify(dashImages.filter(img => !(img instanceof Error)))); + }) + .then(resolver); + }) + .catch(e => { + _invalid(res, e.message); + resolver(); + }) + ); + }) + .catch(() => { + /* do nothing */ + resolver(); + }); + }), + }); + register({ + method: Method.POST, + subscription: '/queryFireflyImage', + secureHandler: ({ req, res }) => + this.generateImage(req.body.prompt, req.body.width, req.body.height, req.body.seed).then(img => + img.error_code + ? _invalid(res, img.message) + : DashUploadUtils.UploadImage(img?.url ?? '', undefined, img?.seed).then(info => { + if (info instanceof Error) _invalid(res, info.message); + else _success(res, info); + }) + ), + }); + + register({ + method: Method.POST, + subscription: '/queryFireflyImageText', + secureHandler: ({ req, res }) => + fetch(req.body.file).then(json => + json.blob().then(file => + this.getImageText(file).then(text => { + _success(res, text); + }) + ) + ), + }); + register({ + method: Method.POST, + subscription: '/expandImage', + secureHandler: ({ req, res }) => + this.uploadImageToDropbox(req.body.file, req.user as DashUserModel).then(uploadUrl => + uploadUrl instanceof Error + ? 
_invalid(res, uploadUrl.message) + : this.expandImage(uploadUrl, req.body.prompt).then(text => { + if (text.error_code) _error(res, text.message); + else + DashUploadUtils.UploadImage(text.outputs[0].image.url).then(info => { + if (info instanceof Error) _invalid(res, info.message); + else _success(res, info); + }); + }) + ), + }); + + // construct this url and send user to it. It will allow them to authorize their dropbox account and will send the resulting token to our endpoint /refreshDropbox + // https://www.dropbox.com/oauth2/authorize?client_id=DROPBOX_CLIENT_ID&response_type=code&token_access_type=offline&redirect_uri=http://localhost:1050/refreshDropbox + // see: https://dropbox.tech/developers/using-oauth-2-0-with-offline-access + // + register({ + method: Method.GET, + subscription: '/refreshDropbox', + secureHandler: ({ req, res }) => { + const user = req.user as DashUserModel; + console.log(`******************* dropbox authorized for ${user?.email} ******************`); + _success(res, 'dropbox authorized for ' + user?.email); + + const data = new URLSearchParams({ + code: req.query.code as string, + grant_type: 'authorization_code', + client_id: process.env._CLIENT_DROPBOX_CLIENT_ID ?? '', + client_secret: process.env._CLIENT_DROPBOX_SECRET ?? '', + redirect_uri: 'http://localhost:1050/refreshDropbox', + }); + axios + .post('https://api.dropbox.com/oauth2/token', data.toString()) + .then(response => { + console.log('***** dropbox token (and refresh) received for ' + user?.email + ' ******* '); + user.dropboxToken = response.data.access_token; + user.dropboxRefresh = response.data.refresh_token; + user.save(); + + setTimeout(() => this.refreshDropboxToken(user), response.data.expires_in - 600); + }) + .catch(e => { + console.log(e); + }); + }, + }); + } +} diff --git a/src/server/ApiManagers/FlashcardManager.ts b/src/server/ApiManagers/FlashcardManager.ts new file mode 100644 index 000000000..fd7c42437 --- /dev/null +++ b/src/server/ApiManagers/FlashcardManager.ts @@ -0,0 +1,161 @@ +/** + * @file FlashcardManager.ts + * @description This file defines the FlashcardManager class, responsible for managing API routes + * related to flashcard creation and manipulation. It provides functionality for handling file processing, + * running Python scripts in a virtual environment, and managing dependencies. + */ + +import { spawn } from 'child_process'; +import * as fs from 'fs'; +import * as path from 'path'; +import { Method } from '../RouteManager'; +import ApiManager, { Registration } from './ApiManager'; + +/** + * Runs a Python script using the provided virtual environment and passes file and option arguments. + * @param {string} venvPath - Path to the virtual environment. + * @param {string} scriptPath - Path to the Python script. + * @param {string} [file] - Optional file to pass to the Python script. + * @param {string} [drag] - Optional argument to control drag mode. + * @param {string} [smart] - Optional argument to control smart mode. + * @returns {Promise<string>} - Resolves with the output from the Python script, or rejects on error. + */ +function runPythonScript(venvPath: string, scriptPath: string, file?: string, drag?: string, smart?: string): Promise<string> { + return new Promise((resolve, reject) => { + const pythonPath = process.platform === 'win32' ? 
path.join(venvPath, 'Scripts', 'python.exe') : path.join(venvPath, 'bin', 'python3'); + + const tempFilePath = path.join(__dirname, `temp_data.txt`); // Shared temp file used to pass the raw data to the script + + if (file) { + // Write the raw file data to the temp file without conversion + fs.writeFileSync(tempFilePath, file, 'utf8'); + } + + const pythonProcess = spawn( + pythonPath, + [scriptPath, file ? tempFilePath : undefined, drag, smart].filter(arg => arg !== undefined) + ); + + let pythonOutput = ''; + let stderrOutput = ''; + + pythonProcess.stdout.on('data', data => { + pythonOutput += data.toString(); + }); + + pythonProcess.stderr.on('data', data => { + stderrOutput += data.toString(); + }); + + pythonProcess.on('close', code => { + if (code === 0) { + resolve(pythonOutput); + } else { + reject(`Python process exited with code ${code}: ${stderrOutput}`); + } + }); + }); +} + +/** + * Installs Python dependencies using pip in the specified virtual environment. + * @param {string} venvPath - Path to the virtual environment. + * @param {string} requirementsPath - Path to the requirements.txt file. + * @returns {Promise<void>} - Resolves when dependencies are successfully installed, rejects on failure. + */ +function installDependencies(venvPath: string, requirementsPath: string): Promise<void> { + return new Promise((resolve, reject) => { + const pipPath = process.platform === 'win32' ? path.join(venvPath, 'Scripts', 'pip.exe') : path.join(venvPath, 'bin', 'pip3'); + + const installProcess = spawn(pipPath, ['install', '-r', requirementsPath]); + + installProcess.stdout.on('data', data => { + console.log(`pip stdout: ${data}`); + }); + + installProcess.stderr.on('data', data => { + console.error(`pip stderr: ${data}`); + }); + + installProcess.on('close', code => { + if (code !== 0) { + reject(`Failed to install dependencies. Exit code: ${code}`); + } else { + resolve(); + } + }); + }); +} + +/** + * Creates a new Python virtual environment. + * @param {string} venvPath - Path to the virtual environment that will be created. + * @returns {Promise<void>} - Resolves when the virtual environment is successfully created, rejects on failure. + */ +function createVirtualEnvironment(venvPath: string): Promise<void> { + return new Promise((resolve, reject) => { + const createVenvProcess = spawn('python3', ['-m', 'venv', venvPath]); + + createVenvProcess.on('close', code => { + if (code !== 0) { + reject(`Failed to create virtual environment. Exit code: ${code}`); + } else { + resolve(); + } + }); + }); +} + +/** + * Manages the creation of the virtual environment, installation of dependencies, and running of the Python script. + * @param {string} [file] - Optional file data to be processed by the Python script. + * @param {string} [drag] - Optional argument controlling drag mode. + * @param {string} [smart] - Optional argument controlling smart mode. + * @returns {Promise<string>} - Resolves with the Python script output, or rejects on failure. 
+ */ +async function manageVenvAndRunScript(file?: string, drag?: string, smart?: string): Promise<string> { + const venvPath = path.join(__dirname, '../flashcard/venv'); // Virtual environment path + const requirementsPath = path.join(__dirname, '../flashcard/requirements.txt'); + const pythonScriptPath = path.join(__dirname, '../flashcard/labels.py'); + console.log('venvPath:', venvPath); + + // Check if the virtual environment exists + if (!fs.existsSync(path.join(venvPath, 'bin', 'python3')) && !fs.existsSync(path.join(venvPath, 'Scripts', 'python.exe'))) { + await createVirtualEnvironment(venvPath); + + await installDependencies(venvPath, requirementsPath); + } + + return runPythonScript(venvPath, pythonScriptPath, file, drag, smart); +} + +/** + * FlashcardManager class responsible for managing API routes related to flashcard functionality. + * It initializes API routes for label creation using a Python backend. + */ +export default class FlashcardManager extends ApiManager { + /** + * Initializes the API routes for the FlashcardManager class. + * @param {Registration} register - The registration function for defining API routes. + */ + protected initialize(register: Registration): void { + register({ + method: Method.POST, + subscription: '/labels', + secureHandler: async ({ req, res }) => { + const { file, drag, smart } = req.body; + + try { + // Run the Python process + const result = await manageVenvAndRunScript(file, drag, smart); + res.status(200).send({ result }); + } catch (error) { + console.error('Error initiating document creation:', error); + res.status(500).send({ + error: 'Failed to initiate document creation', + }); + } + }, + }); + } +} diff --git a/src/server/ApiManagers/UploadManager.ts b/src/server/ApiManagers/UploadManager.ts index 868373474..c9d5df547 100644 --- a/src/server/ApiManagers/UploadManager.ts +++ b/src/server/ApiManagers/UploadManager.ts @@ -70,10 +70,16 @@ export default class UploadManager extends ApiManager { ]); } else { fileguids.split(';').map(guid => DashUploadUtils.uploadProgress.set(guid, `resampling images`)); + // original filenames with '.'s, such as a Macbook screenshot, can be a problem - their extension is not kept in formidable's newFilename. + // This makes sure that the extension is preserved in the newFilename. + const fixNewFilename = (f: formidable.File) => { + if (path.extname(f.originalFilename ?? '') !== path.extname(f.newFilename)) f.newFilename = f.newFilename + path.extname(f.originalFilename ?? ''); + return f; + }; const results = ( await Promise.all( Array.from(Object.keys(files)).map( - async key => (!files[key] ? undefined : DashUploadUtils.upload(files[key]![0] /* , key */)) // key is the guid used by the client to track upload progress. + async key => (!files[key] ? undefined : DashUploadUtils.upload(fixNewFilename(files[key][0]) /* , key */)) // key is the guid used by the client to track upload progress. 
) ) ).filter(result => result && !(result.result instanceof Error)); @@ -147,13 +153,10 @@ export default class UploadManager extends ApiManager { if (doc.id) { doc.id = getId(doc.id); } - // eslint-disable-next-line no-restricted-syntax for (const key in doc.fields) { - // eslint-disable-next-line no-continue if (!Object.prototype.hasOwnProperty.call(doc.fields, key)) continue; const field = doc.fields[key]; - // eslint-disable-next-line no-continue if (field === undefined || field === null) continue; if (field.__type === 'Doc') { @@ -182,11 +185,9 @@ export default class UploadManager extends ApiManager { let docids: string[] = []; let linkids: string[] = []; try { - // eslint-disable-next-line no-restricted-syntax for (const name in files) { if (Object.prototype.hasOwnProperty.call(files, name)) { const f = files[name]; - // eslint-disable-next-line no-continue if (!f) continue; const path2 = f[0]; // what about the rest of the array? are we guaranteed only one value is set? const zip = new AdmZip(path2.filepath); @@ -273,14 +274,20 @@ export default class UploadManager extends ApiManager { .filter(f => regex.test(f)) .map(f => fs.unlinkSync(serverPath + f)); } - imageDataUri.outputFile(uri, serverPathToFile(Directory.images, InjectSize(filename, origSuffix))).then((savedName: string) => { - const ext = path.extname(savedName).toLowerCase(); - const outputPath = serverPathToFile(Directory.images, filename + ext); - if (AcceptableMedia.imageFormats.includes(ext)) { - workerResample(savedName, outputPath, origSuffix, false); - } - res.send(clientPathToFile(Directory.images, filename + ext)); - }); + imageDataUri + .outputFile(uri, serverPathToFile(Directory.images, InjectSize(filename, origSuffix))) + .then((savedName: string) => { + const ext = path.extname(savedName).toLowerCase(); + const outputPath = serverPathToFile(Directory.images, filename + ext); + if (AcceptableMedia.imageFormats.includes(ext)) { + workerResample(savedName, outputPath, origSuffix, false); + } + res.send(clientPathToFile(Directory.images, filename + ext)); + }) + // eslint-disable-next-line @typescript-eslint/no-explicit-any + .catch((e: any) => { + res.status(404).json({ error: e.toString() }); + }); }, }); } diff --git a/src/server/DashUploadUtils.ts b/src/server/DashUploadUtils.ts index 1e55a885a..a2747257a 100644 --- a/src/server/DashUploadUtils.ts +++ b/src/server/DashUploadUtils.ts @@ -1,3 +1,4 @@ +/* eslint-disable no-use-before-define */ import axios from 'axios'; import { exec, spawn } from 'child_process'; import { green, red } from 'colors'; @@ -21,8 +22,8 @@ import { AzureManager } from './ApiManagers/AzureManager'; import { AcceptableMedia, Upload } from './SharedMediaTypes'; import { Directory, clientPathToFile, filesDirectory, pathToDirectory, publicDirectory, serverPathToFile } from './SocketData'; import { resolvedServerUrl } from './server_Initialization'; - import { Worker, isMainThread, parentPort } from 'worker_threads'; +import requestImageSize from '../client/util/request-image-size'; // Create an array to store worker threads enum workertasks { @@ -47,22 +48,21 @@ if (isMainThread) { async function workerResampleImage(message: { imgSourcePath: string; outputPath: string; origSuffix: string; unlinkSource: boolean }) { const { imgSourcePath, outputPath, origSuffix, unlinkSource } = message; - const sizes = !origSuffix ? 
[{ width: 400, suffix: SizeSuffix.Medium }] : DashUploadUtils.imageResampleSizes(path.extname(imgSourcePath)); + const extension = path.extname(imgSourcePath); + const sizes = !origSuffix ? [{ width: 400, suffix: SizeSuffix.Medium }] : DashUploadUtils.imageResampleSizes(extension === '.xml' ? '.png' : extension); // prettier-ignore Jimp.read(imgSourcePath) .then(img => sizes.forEach(({ width, suffix }) => img.resize({ w: width || img.bitmap.width }) - .write(InjectSize(outputPath, suffix) as `${string}.${string}`) + .write(InjectSize(outputPath, suffix) as `${string}.${string}`) + .catch(e => console.log("Jimp error:", e)) )) .catch(e => console.log('Error Jimp:', e)) .finally(() => unlinkSource && unlinkSync(imgSourcePath)); } } -// eslint-disable-next-line @typescript-eslint/no-var-requires -const requestImageSize = require('../client/util/request-image-size'); - export enum SizeSuffix { Small = '_s', Medium = '_m', @@ -221,7 +221,6 @@ export namespace DashUploadUtils { const parseExifData = async (source: string) => { const image = await request.get(source, { encoding: null }); const { /* data, */ error } = await new Promise<{ data: ExifData; error: string | undefined }>(resolve => { - // eslint-disable-next-line no-new new ExifImage({ image }, (exifError, data) => { resolve({ data, error: exifError?.message }); }); @@ -300,7 +299,6 @@ export namespace DashUploadUtils { // Bundle up the information into an object return { source, - // eslint-disable-next-line radix contentSize: parseInt(headers[size]), contentType: headers[type], nativeWidth, @@ -343,15 +341,24 @@ export namespace DashUploadUtils { const outputPath = path.resolve(pathToDirectory(Directory.images), outputFileName); const sizes = imageResampleSizes(path.extname(outputFileName)); - const imgReadStream = new Duplex(); - imgReadStream.push(fs.readFileSync(imgSourcePath)); - imgReadStream.push(null); - await Promise.all( - sizes.map(({ suffix }) => - new Promise<unknown>(res => - imgReadStream.pipe(createWriteStream(writtenFiles[suffix] = InjectSize(outputPath, suffix))).on('close', res) - ) - )); // prettier-ignore + if (unlinkSource) { + const imgReadStream = new Duplex(); + imgReadStream.push(fs.readFileSync(imgSourcePath)); + imgReadStream.push(null); + await Promise.all( + sizes.map(({ suffix }) => + new Promise<void>(res => + imgReadStream.pipe(createWriteStream(writtenFiles[suffix] = InjectSize(outputPath, suffix))).on('close', res) + ) + )); // prettier-ignore + } else { + await Promise.all( + sizes.map(({ suffix }) => + new Promise<void>(res => + request.get(imgSourcePath).pipe(createWriteStream(writtenFiles[suffix] = InjectSize(outputPath, suffix))).on('close', res) + ) + )); // prettier-ignore + } workerResample(imgSourcePath, outputPath, SizeSuffix.Original, unlinkSource); return writtenFiles; @@ -368,8 +375,9 @@ export namespace DashUploadUtils { * @returns the accessPaths for the resized files. */ export const UploadInspectedImage = async (metadata: Upload.InspectionResults, filename: string, prefix = '', cleanUp = true): Promise<Upload.ImageInformation> => { - const { requestable, source, ...remaining } = metadata; - const resolved = filename || `${prefix}upload_${Utils.GenerateGuid()}.${remaining.contentType.split('/')[1].toLowerCase()}`; + const { requestable, ...remaining } = metadata; + const dfltSuffix = remaining.contentType.split('/')[1].toLowerCase(); + const resolved = filename || `${prefix}upload_${Utils.GenerateGuid()}.${dfltSuffix === 'xml' ? 
'jpg' : dfltSuffix}`; const { images } = Directory; const information: Upload.ImageInformation = { accessPaths: { @@ -400,10 +408,10 @@ export namespace DashUploadUtils { writtenFiles = {}; } } else { - const unlinkSrcWhenFinished = isLocal().test(source) && cleanUp; + const unlinkSrcWhenFinished = cleanUp; // isLocal().test(source) && cleanUp; try { writtenFiles = await outputResizedImages(metadata.source, resolved, unlinkSrcWhenFinished); - } catch (e) { + } catch { // input is a blob or other, try reading it to create a metadata source file. const reqSource = request(metadata.source); const readStream: Stream = reqSource instanceof Promise ? await reqSource : reqSource; @@ -415,7 +423,7 @@ export namespace DashUploadUtils { .on('error', () => rej()); }); writtenFiles = await outputResizedImages(readSource, resolved, unlinkSrcWhenFinished); - fs.unlink(readSource, err => console.log("Couldn't unlink temporary image file:" + readSource, err)); + //fs.unlink(readSource, err => console.log("Couldn't unlink temporary image file:" + readSource, err)); } } Array.from(Object.keys(writtenFiles)).forEach(suffix => { @@ -448,8 +456,7 @@ export namespace DashUploadUtils { return { name: result.name, message: result.message }; } const outputFile = filename || result.filename || ''; - - return UploadInspectedImage(result, outputFile, prefix); + return UploadInspectedImage(result, outputFile, prefix, isLocal().exec(source) || source.startsWith('data:') ? true : false); }; type md5 = 'md5'; @@ -567,7 +574,9 @@ export namespace DashUploadUtils { switch (category) { case 'image': if (imageFormats.includes(format)) { - const result = await UploadImage(filepath, basename(filepath)); + const outputName = basename(filepath); + const extname = path.extname(originalFilename ?? ''); + const result = await UploadImage(filepath, outputName.endsWith(extname) ? 
outputName : outputName + extname, undefined); return { source: file, result }; } fs.unlink(filepath, () => {}); diff --git a/src/server/GarbageCollector.ts b/src/server/GarbageCollector.ts index 041f65592..74e8c288a 100644 --- a/src/server/GarbageCollector.ts +++ b/src/server/GarbageCollector.ts @@ -1,7 +1,4 @@ /* eslint-disable no-await-in-loop */ -/* eslint-disable no-continue */ -/* eslint-disable no-cond-assign */ -/* eslint-disable no-restricted-syntax */ import * as fs from 'fs'; import * as path from 'path'; import { Database } from './database'; diff --git a/src/server/RouteManager.ts b/src/server/RouteManager.ts index d8e0455f6..2f6cf80b5 100644 --- a/src/server/RouteManager.ts +++ b/src/server/RouteManager.ts @@ -39,8 +39,7 @@ export function _success(res: Response, body: any) { } export function _invalid(res: Response, message: string) { - res.statusMessage = message; - res.status(STATUS.BAD_REQUEST).send(); + res.status(STATUS.BAD_REQUEST).send(message); } export function _permissionDenied(res: Response, message?: string) { diff --git a/src/server/apis/google/GoogleApiServerUtils.ts b/src/server/apis/google/GoogleApiServerUtils.ts index 21c405bee..7373df473 100644 --- a/src/server/apis/google/GoogleApiServerUtils.ts +++ b/src/server/apis/google/GoogleApiServerUtils.ts @@ -1,7 +1,7 @@ +/* eslint-disable no-use-before-define */ import { GaxiosResponse } from 'gaxios'; import { Credentials, OAuth2Client, OAuth2ClientOptions } from 'google-auth-library'; import { google } from 'googleapis'; -import * as qs from 'query-string'; import * as request from 'request-promise'; import { Opt } from '../../../fields/Doc'; import { Database } from '../../database'; @@ -21,7 +21,6 @@ const scope = ['documents.readonly', 'documents', 'presentations', 'presentation * This namespace manages server side authentication for Google API queries, either * from the standard v1 APIs or the Google Photos REST API. */ - export namespace GoogleApiServerUtils { /** * As we expand out to more Google APIs that are accessible from @@ -71,29 +70,29 @@ export namespace GoogleApiServerUtils { /** * A briefer format for the response from a 'googleapis' API request */ - export type ApiResponse = Promise<GaxiosResponse>; + export type ApiResponse = Promise<GaxiosResponse<unknown>>; /** * A generic form for a handler that executes some request on the endpoint */ - export type ApiRouter = (endpoint: Endpoint, parameters: any) => ApiResponse; + export type ApiRouter = (endpoint: Endpoint, parameters: Record<string, unknown>) => ApiResponse; /** * A generic form for the asynchronous function that actually submits the - * request to the API and returns the corresporing response. Helpful when + * request to the API and returns the corresponding response. Helpful when * making an extensible endpoint definition. */ - export type ApiHandler = (parameters: any, methodOptions?: any) => ApiResponse; + export type ApiHandler = (parameters: Record<string, unknown>, methodOptions?: Record<string, unknown>) => ApiResponse; /** * A literal union type indicating the valid actions for these 'googleapis' - * requestions + * requests */ export type Action = 'create' | 'retrieve' | 'update'; /** * An interface defining any entity on which one can invoke - * anuy of the following handlers. All 'googleapis' wrappers + * any of the following handlers. All 'googleapis' wrappers * such as google.docs().documents and google.slides().presentations * satisfy this interface. 
*/ @@ -109,7 +108,7 @@ export namespace GoogleApiServerUtils { * of needless duplicate clients that would arise from * making one new client instance per request. */ - const authenticationClients = new Map<String, OAuth2Client>(); + const authenticationClients = new Map<string, OAuth2Client>(); /** * This function receives the target sector ("which G-Suite app's API am I interested in?") @@ -120,23 +119,21 @@ export namespace GoogleApiServerUtils { * @returns the relevant 'googleapis' wrapper, if any */ export async function GetEndpoint(sector: string, userId: string): Promise<Endpoint | void> { - return new Promise(async resolve => { - const auth = await retrieveOAuthClient(userId); - if (!auth) { - return resolve(); - } - let routed: Opt<Endpoint>; - const parameters: any = { auth, version: 'v1' }; - switch (sector) { - case Service.Documents: - routed = google.docs(parameters).documents; - break; - case Service.Slides: - routed = google.slides(parameters).presentations; - break; - } - resolve(routed); - }); + const auth = await retrieveOAuthClient(userId); + if (!auth) { + return; + } + let routed: Opt<Endpoint>; + const parameters: { version: 'v1' } = { /* auth, */ version: 'v1' }; ///* auth: OAuth2Client;*/ + switch (sector) { + case Service.Documents: + routed = google.docs(parameters).documents; + break; + case Service.Slides: + routed = google.slides(parameters).presentations; + break; + } + return routed; } /** @@ -149,19 +146,17 @@ export namespace GoogleApiServerUtils { * security. */ export async function retrieveOAuthClient(userId: string): Promise<OAuth2Client | void> { - return new Promise(async resolve => { - const { credentials, refreshed } = await retrieveCredentials(userId); - if (!credentials) { - return resolve(); - } - let client = authenticationClients.get(userId); - if (!client) { - authenticationClients.set(userId, (client = generateClient(credentials))); - } else if (refreshed) { - client.setCredentials(credentials); - } - resolve(client); - }); + const { credentials, refreshed } = await retrieveCredentials(userId); + if (!credentials) { + return; + } + let client = authenticationClients.get(userId); + if (!client) { + authenticationClients.set(userId, (client = generateClient(credentials))); + } else if (refreshed) { + client.setCredentials(credentials); + } + return client; } /** @@ -173,7 +168,9 @@ export namespace GoogleApiServerUtils { */ function generateClient(credentials?: Credentials): OAuth2Client { const client = new google.auth.OAuth2(oAuthOptions); - credentials && client.setCredentials(credentials); + if (credentials) { + client.setCredentials(credentials); + } return client; } @@ -206,7 +203,7 @@ export namespace GoogleApiServerUtils { */ export async function processNewUser(userId: string, authenticationCode: string): Promise<EnrichedCredentials> { const credentials = await new Promise<Credentials>((resolve, reject) => { - worker.getToken(authenticationCode, async (err, credentials) => { + worker.getToken(authenticationCode, (err, credentials) => { if (err || !credentials) { reject(err); return; @@ -221,7 +218,7 @@ export namespace GoogleApiServerUtils { /** * This type represents the union of the full set of OAuth2 credentials - * and all of a Google user's publically available information. This is the strucure + * and all of a Google user's publicly available information. This is the structure * of the JSON object we ultimately store in the googleAuthentication table of the database. 
*/ export type EnrichedCredentials = Credentials & { userInfo: UserInfo }; @@ -297,15 +294,15 @@ export namespace GoogleApiServerUtils { async function refreshAccessToken(credentials: Credentials, userId: string): Promise<Credentials> { const headerParameters = { headers: { 'Content-Type': 'application/x-www-form-urlencoded' } }; const { client_id, client_secret } = GoogleCredentialsLoader.ProjectCredentials; - const url = `https://oauth2.googleapis.com/token?${qs.stringify({ - refreshToken: credentials.refresh_token, + const params = new URLSearchParams({ + refresh_token: credentials.refresh_token!, client_id, client_secret, grant_type: 'refresh_token', - })}`; - const { access_token, expires_in } = await new Promise<any>(async resolve => { - const response = await request.post(url, headerParameters); - resolve(JSON.parse(response)); + }); + const url = `https://oauth2.googleapis.com/token?${params.toString()}`; + const { access_token, expires_in } = await new Promise<{ access_token: string; expires_in: number }>(resolve => { + request.post(url, headerParameters).then(response => resolve(JSON.parse(response))); }); // expires_in is in seconds, but we're building the new expiry date in milliseconds const expiry_date = new Date().getTime() + expires_in * 1000; diff --git a/src/server/authentication/DashUserModel.ts b/src/server/authentication/DashUserModel.ts index bfa6d7bdb..debeef60c 100644 --- a/src/server/authentication/DashUserModel.ts +++ b/src/server/authentication/DashUserModel.ts @@ -9,6 +9,9 @@ export type DashUserModel = mongoose.Document & { passwordResetToken?: string; passwordResetExpires?: Date; + dropboxRefresh?: string; + dropboxToken?: string; + userDocumentId: string; sharingDocumentId: string; linkDatabaseId: string; @@ -37,6 +40,8 @@ const userSchema = new mongoose.Schema( passwordResetToken: String, passwordResetExpires: Date, + dropboxRefresh: String, + dropboxToken: String, userDocumentId: String, // id that identifies a document which hosts all of a user's account data sharingDocumentId: String, // id that identifies a document that stores documents shared to a user, their user color, and any additional info needed to communicate between users linkDatabaseId: String, diff --git a/src/server/chunker/pdf_chunker.py b/src/server/chunker/pdf_chunker.py new file mode 100644 index 000000000..697550f2e --- /dev/null +++ b/src/server/chunker/pdf_chunker.py @@ -0,0 +1,843 @@ +import asyncio +import concurrent +import sys + +from tqdm.asyncio import tqdm_asyncio # Progress bar for async tasks +import PIL +from anthropic import Anthropic # For language model API +from packaging.version import parse # Version checking +import pytesseract # OCR library for text extraction from images +import re +import dotenv # For environment variable loading +from lxml import etree # XML parsing +from tqdm import tqdm # Progress bar for non-async tasks +import fitz # PyMuPDF, PDF processing library +from PIL import Image, ImageDraw # Image processing +from typing import List, Dict, Any, TypedDict # Typing for function annotations +from ultralyticsplus import YOLO # Object detection model (YOLO) +import base64 +import io +import json +import os +import uuid # For generating unique IDs +from enum import Enum # Enums for types like document type and purpose +import openai +import numpy as np +from PyPDF2 import PdfReader # PDF text extraction +from openai import OpenAI # OpenAI client for text completion +from sklearn.cluster import KMeans # Clustering for summarization +import warnings + +# 
Silence specific warnings
+warnings.filterwarnings('ignore', message="Valid config keys have changed")
+warnings.filterwarnings('ignore', message="torch.load")
+
+dotenv.load_dotenv() # Load environment variables
+
+# Fix for newer versions of PIL
+# if parse(PIL.__version__) >= parse('10.0.0'):
+# Image.LINEAR = Image.BILINEAR
+
+# Global dictionary to track progress of document processing jobs
+current_progress = {}
+
+def update_progress(job_id, step, progress_value):
+ """
+ Output the progress in JSON format to stderr for the Node.js process to capture.
+
+ :param job_id: The unique identifier for the processing job.
+ :param step: The current step of the job.
+ :param progress_value: The percentage of completion for the current step.
+ """
+ progress_data = {
+ "job_id": job_id,
+ "step": step,
+ "progress": progress_value
+ }
+ print(f"PROGRESS:{json.dumps(progress_data)}", file=sys.stderr)
+ sys.stderr.flush()
+
+
+
+class ElementExtractor:
+ """
+ A class that uses a YOLO model to extract tables and images from a PDF page.
+ """
+
+ def __init__(self, output_folder: str, doc_id: str):
+ """
+ Initializes the ElementExtractor with the output folder for saving images and the YOLO model.
+
+ :param output_folder: Path to the folder where extracted elements will be saved.
+ """
+ self.doc_id = doc_id
+ self.output_folder = os.path.join(output_folder, doc_id)
+ os.makedirs(self.output_folder, exist_ok=True)
+ self.model = YOLO('keremberke/yolov8m-table-extraction') # Load YOLO model for table extraction
+ self.model.overrides['conf'] = 0.25 # Set confidence threshold for detection
+ self.model.overrides['iou'] = 0.45 # Set Intersection over Union (IoU) threshold
+ self.padding = 5 # Padding around detected elements
+
+ async def extract_elements(self, page, padding: int = 20) -> List[Dict[str, Any]]:
+ """
+ Asynchronously extract tables and images from a PDF page.
+
+ :param page: A Page object representing a PDF page.
+ :param padding: Padding around the extracted elements.
+ :return: A list of dictionaries containing the extracted elements.
+ """
+ tasks = [
+ asyncio.create_task(self.extract_tables(page.image, page.page_num)), # Extract tables from the page
+ asyncio.create_task(self.extract_images(page.page, page.image, page.page_num)) # Extract images from the page
+ ]
+ results = await asyncio.gather(*tasks) # Wait for both tasks to complete
+ return [item for sublist in results for item in sublist] # Flatten and return results
+
+ async def extract_tables(self, img: Image.Image, page_num: int) -> List[Dict[str, Any]]:
+ """
+ Asynchronously extract tables from a given page image using the YOLO model.
+
+ :param img: The image of the PDF page.
+ :param page_num: The current page number.
+ :return: A list of dictionaries with metadata about the detected tables.
+ """ + results = self.model.predict(img, verbose=False) # Predict table locations using YOLO + tables = [] + + for idx, box in enumerate(results[0].boxes): + x1, y1, x2, y2 = map(int, box.xyxy[0]) # Extract bounding box coordinates + + # Draw a red rectangle on the full page image around the table + page_with_outline = img.copy() + draw = ImageDraw.Draw(page_with_outline) + draw.rectangle( + [max(0, x1 + self.padding), max(0, y1 + self.padding), min(page_with_outline.width, x2 + self.padding), + min(page_with_outline.height, y2 + self.padding)], outline="red", width=2) # Draw red outline + + # Save the full page with the red outline + table_filename = f"table_page{page_num + 1}_{idx + 1}.png" + table_path = os.path.join(self.output_folder, table_filename) + page_with_outline.save(table_path) + + file_path_for_client = f"{self.doc_id}/{table_filename}" + + tables.append({ + 'metadata': { + "type": "table", + "location": [x1 / img.width, y1 / img.height, x2 / img.width, y2 / img.height], + "file_path": file_path_for_client, + "start_page": page_num, + "end_page": page_num, + "base64_data": self.image_to_base64(page_with_outline) + } + }) + + return tables + + async def extract_images(self, page: fitz.Page, img: Image.Image, page_num: int) -> List[Dict[str, Any]]: + """ + Asynchronously extract embedded images from a PDF page. + + :param page: A fitz.Page object representing the PDF page. + :param img: The image of the PDF page. + :param page_num: The current page number. + :return: A list of dictionaries with metadata about the detected images. + """ + images = [] + image_list = page.get_images(full=True) # Get a list of images on the page + + if not image_list: + return images + + for img_index, img_info in enumerate(image_list): + xref = img_info[0] # XREF of the image in the PDF + base_image = page.parent.extract_image(xref) # Extract the image by its XREF + image_bytes = base_image["image"] + image = Image.open(io.BytesIO(image_bytes)) # Convert bytes to PIL image + width_ratio = img.width / page.rect.width # Scale factor for width + height_ratio = img.height / page.rect.height # Scale factor for height + + # Get image coordinates or default to page rectangle + rect_list = page.get_image_rects(xref) + if rect_list: + rect = rect_list[0] + x1, y1, x2, y2 = rect + else: + rect = page.rect + x1, y1, x2, y2 = rect + + # Draw a red rectangle on the full page image around the embedded image + page_with_outline = img.copy() + draw = ImageDraw.Draw(page_with_outline) + draw.rectangle([x1 * width_ratio, y1 * height_ratio, x2 * width_ratio, y2 * height_ratio], + outline="red", width=2) # Draw red outline + + # Save the full page with the red outline + image_filename = f"image_page{page_num + 1}_{img_index + 1}.png" + image_path = os.path.join(self.output_folder, image_filename) + page_with_outline.save(image_path) + + file_path_for_client = f"{self.doc_id}/{image_filename}" + + images.append({ + 'metadata': { + "type": "image", + "location": [x1 / page.rect.width, y1 / page.rect.height, x2 / page.rect.width, + y2 / page.rect.height], + "file_path": file_path_for_client, + "start_page": page_num, + "end_page": page_num, + "base64_data": self.image_to_base64(image) + } + }) + + return images + + @staticmethod + def image_to_base64(image: Image.Image) -> str: + """ + Convert a PIL image to a base64-encoded string. + + :param image: The PIL image to be converted. + :return: The base64-encoded string of the image. 
+ """ + buffered = io.BytesIO() + image.save(buffered, format="PNG") # Save image as PNG to an in-memory buffer + return base64.b64encode(buffered.getvalue()).decode('utf-8') # Convert to base64 and return + + +class ChunkMetaData(TypedDict): + """ + A TypedDict that defines the metadata structure for chunks of text and visual elements. + """ + text: str + type: str + original_document: str + file_path: str + doc_id: str + location: str + start_page: int + end_page: int + base64_data: str + + +class Chunk(TypedDict): + """ + A TypedDict that defines the structure for a document chunk, including metadata and embeddings. + """ + id: str + values: List[float] + metadata: ChunkMetaData + + +class Page: + """ + A class that represents a single PDF page, handling its image representation and element masking. + """ + + def __init__(self, page: fitz.Page, page_num: int): + """ + Initializes the Page with its page number and the image representation of the page. + + :param page: A fitz.Page object representing the PDF page. + :param page_num: The number of the page in the PDF. + """ + self.page = page + self.page_num = page_num + # Get high-resolution image of the page (for table/image extraction) + self.pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) + self.image = Image.frombytes("RGB", [self.pix.width, self.pix.height], self.pix.samples) + self.masked_image = self.image.copy() # Image with masked elements (tables/images) + self.draw = ImageDraw.Draw(self.masked_image) + self.elements = [] # List to store extracted elements + + def add_element(self, element): + """ + Adds a detected element (table/image) to the page and masks its location on the page image. + + :param element: A dictionary containing metadata about the detected element. + """ + self.elements.append(element) + # Mask the element on the page image by drawing a white rectangle over its location + x1, y1, x2, y2 = [coord * self.image.width if i % 2 == 0 else coord * self.image.height + for i, coord in enumerate(element['metadata']['location'])] + self.draw.rectangle([x1, y1, x2, y2], fill="white") # Draw a white rectangle to mask the element + + +class PDFChunker: + """ + The main class responsible for chunking PDF files into text and visual elements (tables/images). + """ + + def __init__(self, output_folder: str = "output", doc_id: str = '', image_batch_size: int = 5) -> None: + """ + Initializes the PDFChunker with an output folder and an element extractor for visual elements. + + :param output_folder: Folder to store the output files (extracted tables/images). + :param image_batch_size: The batch size for processing visual elements. + """ + self.client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY")) # Initialize the Anthropic API client + self.output_folder = output_folder + self.image_batch_size = image_batch_size # Batch size for image processing + self.doc_id = doc_id # Add doc_id + self.element_extractor = ElementExtractor(output_folder, doc_id) + + async def chunk_pdf(self, file_data: bytes, file_name: str, doc_id: str, job_id: str) -> List[Dict[str, Any]]: + """ + Processes a PDF file, extracting text and visual elements, and returning structured chunks. + + :param file_data: The binary data of the PDF file. + :param file_name: The name of the PDF file. + :param doc_id: The unique document ID for this job. + :param job_id: The unique job ID for the processing task. + :return: A list of structured chunks containing text and visual elements. 
+ """ + with fitz.open(stream=file_data, filetype="pdf") as pdf_document: + num_pages = len(pdf_document) # Get the total number of pages in the PDF + pages = [Page(pdf_document[i], i) for i in tqdm(range(num_pages), desc="Initializing Pages")] # Initialize each page + + update_progress(job_id, "Extracting tables and images...", 0) + await self.extract_and_mask_elements(pages, job_id) # Extract and mask elements (tables/images) + + update_progress(job_id, "Processing tables and images...", 0) + await self.process_visual_elements(pages, self.image_batch_size, job_id) # Process visual elements + + update_progress(job_id, "Extracting text...", 0) + page_texts = await self.extract_text_from_masked_pages(pages, job_id) # Extract text from masked pages + + update_progress(job_id, "Processing text...", 0) + text_chunks = self.chunk_text_with_metadata(page_texts, max_words=1000, job_id=job_id) # Chunk text into smaller parts + + # Combine text and visual elements into a unified structure (chunks) + chunks = self.combine_chunks(text_chunks, [elem for page in pages for elem in page.elements], file_name, + doc_id) + + return chunks + + async def extract_and_mask_elements(self, pages: List[Page], job_id: str): + """ + Extract visual elements (tables and images) from each page and mask them on the page. + + :param pages: A list of Page objects representing the PDF pages. + :param job_id: The unique job ID for the processing task. + """ + total_pages = len(pages) + tasks = [] + + for i, page in enumerate(pages): + tasks.append(asyncio.create_task(self.element_extractor.extract_elements(page))) # Extract elements asynchronously + progress = ((i + 1) / total_pages) * 100 # Calculate progress + update_progress(job_id, "Extracting tables and images...", progress) + + # Gather all extraction results + results = await asyncio.gather(*tasks) + + # Mask the detected elements on the page images + for page, elements in zip(pages, results): + for element in elements: + page.add_element(element) # Mask each extracted element on the page + + async def process_visual_elements(self, pages: List[Page], image_batch_size: int, job_id: str) -> List[Dict[str, Any]]: + """ + Process extracted visual elements in batches, generating summaries or descriptions. + + :param pages: A list of Page objects representing the PDF pages. + :param image_batch_size: The batch size for processing visual elements. + :param job_id: The unique job ID for the processing task. + :return: A list of processed elements with metadata and generated summaries. 
+ """ + pre_elements = [element for page in pages for element in page.elements] # Flatten list of elements + processed_elements = [] + total_batches = (len(pre_elements) // image_batch_size) + 1 # Calculate total number of batches + + loop = asyncio.get_event_loop() + with concurrent.futures.ThreadPoolExecutor() as executor: + # Process elements in batches + for i in tqdm(range(0, len(pre_elements), image_batch_size), desc="Processing Visual Elements"): + batch = pre_elements[i:i + image_batch_size] + # Run image summarization in a separate thread + summaries = await loop.run_in_executor( + executor, self.batch_summarize_images, + {j + 1: element.get('metadata').get('base64_data') for j, element in enumerate(batch)} + ) + + # Append generated summaries to the elements + for j, elem in enumerate(batch, start=1): + if j in summaries: + elem['metadata']['text'] = re.sub(r'^(Image|Table):\s*', '', summaries[j]) + elem['metadata']['base64_data'] = '' + processed_elements.append(elem) + + progress = ((i // image_batch_size) + 1) / total_batches * 100 # Calculate progress + update_progress(job_id, "Processing tables and images...", progress) + + return processed_elements + + async def extract_text_from_masked_pages(self, pages: List[Page], job_id: str) -> Dict[int, str]: + """ + Extract text from masked page images (where tables and images have been masked out). + + :param pages: A list of Page objects representing the PDF pages. + :param job_id: The unique job ID for the processing task. + :return: A dictionary mapping page numbers to extracted text. + """ + total_pages = len(pages) + tasks = [] + + for i, page in enumerate(pages): + tasks.append(asyncio.create_task(self.extract_text(page.masked_image, page.page_num))) # Perform OCR on each page + progress = ((i + 1) / total_pages) * 100 # Calculate progress + update_progress(job_id, "Extracting text...", progress) + + # Return extracted text from each page + return dict(await asyncio.gather(*tasks)) + + @staticmethod + async def extract_text(image: Image.Image, page_num: int) -> (int, str): + """ + Perform OCR on the provided image to extract text. + + :param image: The PIL image of the page. + :param page_num: The current page number. + :return: A tuple containing the page number and the extracted text. + """ + result = pytesseract.image_to_string(image) # Extract text using Tesseract OCR + return page_num + 1, result.strip() # Return the page number and extracted text + + def chunk_text_with_metadata(self, page_texts: Dict[int, str], max_words: int, job_id: str) -> List[Dict[str, Any]]: + """ + Break the extracted text into smaller chunks with metadata (e.g., page numbers). + + :param page_texts: A dictionary mapping page numbers to extracted text. + :param max_words: The maximum number of words allowed in a chunk. + :param job_id: The unique job ID for the processing task. + :return: A list of dictionaries containing text chunks with metadata. 
+ """ + chunks = [] + current_chunk = "" + current_start_page = 0 + total_words = 0 + + def add_chunk(chunk_text, start_page, end_page): + # Add a chunk of text with metadata + chunks.append({ + "text": chunk_text.strip(), + "start_page": start_page, + "end_page": end_page + }) + + total_pages = len(page_texts) + for i, (page_num, text) in enumerate(tqdm(page_texts.items(), desc="Chunking Text")): + sentences = self.split_into_sentences(text) + for sentence in sentences: + word_count = len(sentence.split()) + # If adding this sentence exceeds max_words, create a new chunk + if total_words + word_count > max_words: + add_chunk(current_chunk, current_start_page, page_num) + current_chunk = sentence + " " + current_start_page = page_num + total_words = word_count + else: + current_chunk += sentence + " " + total_words += word_count + current_chunk += "\n\n" + + progress = ((i + 1) / total_pages) * 100 # Calculate progress + update_progress(job_id, "Processing text...", progress) + + # Add the last chunk if there is leftover text + if current_chunk.strip(): + add_chunk(current_chunk, current_start_page, page_num) + + return chunks + + @staticmethod + def split_into_sentences(text): + """ + Split the text into sentences using regular expressions. + + :param text: The raw text to be split into sentences. + :return: A list of sentences. + """ + return re.split(r'(?<=[.!?])\s+', text) + + @staticmethod + def combine_chunks(text_chunks: List[Dict[str, Any]], visual_elements: List[Dict[str, Any]], pdf_path: str, + doc_id: str) -> List[Chunk]: + """ + Combine text and visual chunks into a unified list. + + :param text_chunks: A list of dictionaries containing text chunks with metadata. + :param visual_elements: A list of dictionaries containing visual elements (tables/images) with metadata. + :param pdf_path: The path to the original PDF file. + :param doc_id: The unique document ID for this job. + :return: A list of Chunk objects representing the combined data. + """ + combined_chunks = [] + # Add text chunks + for text_chunk in text_chunks: + chunk_metadata: ChunkMetaData = { + "text": text_chunk["text"], + "type": "text", + "original_document": pdf_path, + "file_path": "", + "location": "", + "start_page": text_chunk["start_page"], + "end_page": text_chunk["end_page"], + "base64_data": "", + "doc_id": doc_id, + } + chunk_dict: Chunk = { + "id": str(uuid.uuid4()), # Generate a unique ID for the chunk + "values": [], + "metadata": chunk_metadata, + } + combined_chunks.append(chunk_dict) + + # Add visual chunks (tables/images) + for elem in visual_elements: + visual_chunk_metadata: ChunkMetaData = { + "type": elem['metadata']['type'], + "file_path": elem['metadata']['file_path'], + "text": elem['metadata'].get('text', ''), + "start_page": elem['metadata']['start_page'], + "end_page": elem['metadata']['end_page'], + "location": str(elem['metadata']['location']), + "base64_data": elem['metadata']['base64_data'], + "doc_id": doc_id, + "original_document": pdf_path, + } + visual_chunk_dict: Chunk = { + "id": str(uuid.uuid4()), # Generate a unique ID for the visual chunk + "values": [], + "metadata": visual_chunk_metadata, + } + combined_chunks.append(visual_chunk_dict) + + return combined_chunks + + def batch_summarize_images(self, images: Dict[int, str]) -> Dict[int, str]: + """ + Summarize images or tables by generating descriptive text. + + :param images: A dictionary mapping image numbers to base64-encoded image data. + :return: A dictionary mapping image numbers to their generated summaries. 
+ """ + # Prompt for the AI model to summarize images and tables + prompt = f"""<instruction> + <task> + You are tasked with summarizing a series of {len(images)} images and tables for use in a RAG (Retrieval-Augmented Generation) system. + Your goal is to create concise, informative summaries that capture the essential content of each image or table. + These summaries will be used for embedding, so they should be descriptive and relevant. The image or table will be outlined in red on an image of the full page that it is on. Where necessary, use the context of the full page to heklp with the summary but don't summarize other content on the page. + </task> + + <steps> + <step>Identify whether it's an image or a table.</step> + <step>Examine its content carefully.</step> + <step> + Write a detailed summary that captures the main points or visual elements: + <details> + <table>After summarizing what the table is about, include the column headers, a detailed summary of the data, and any notable data trends.</table> + <image>Describe the main subjects, actions, or notable features.</image> + </details> + </step> + <step>Focus on writing summaries that would make it easy to retrieve the content if compared to a user query using vector similarity search.</step> + <step>Keep summaries concise and include important words that may help with retrieval (but do not include numbers and numerical data).</step> + </steps> + + <important_notes> + <note>Avoid using special characters like &, <, >, ", ', $, %, etc. Instead, use their word equivalents:</note> + <note>Use "and" instead of &.</note> + <note>Use "dollars" instead of $.</note> + <note>Use "percent" instead of %.</note> + <note>Refrain from using quotation marks " or apostrophes ' unless absolutely necessary.</note> + <note>Ensure your output is in valid XML format.</note> + </important_notes> + + <formatting> + <note>Enclose all summaries within a root element called <summaries>.</note> + <note>Use <summary> tags to enclose each individual summary.</note> + <note>Include an attribute 'number' in each <summary> tag to indicate the sequence, matching the provided image numbers.</note> + <note>Start each summary by indicating whether it's an image or a table (e.g., "This image shows..." or "The table presents...").</note> + <note>If an image is completely blank, leave the summary blank (e.g., <summary number="3"></summary>).</note> + </formatting> + + <example> + <note>Do not replicate the example below—stay grounded to the content of the table or image and describe it completely and accurately.</note> + <output> + <summaries> + <summary number="1"> + The image shows two men shaking hands on stage at a formal event. The man on the left, in a dark suit and glasses, has a professional appearance, possibly an academic or business figure. The man on the right, Tim Cook, CEO of Apple, is recognizable by his silver hair and dark blue blazer. Cook holds a document titled "Tsinghua SEM EMBA," suggesting a link to Tsinghua University’s Executive MBA program. The backdrop displays English and Chinese text about business management and education, with the event dated October 23, 2014. + </summary> + <summary number="2"> + The table compares the company's assets between December 30, 2023, and September 30, 2023. Key changes include an increase in cash and cash equivalents, while marketable securities had a slight rise. Accounts receivable and vendor non-trade receivables decreased. Inventories and other current assets saw minor fluctuations. 
Non-current assets like marketable securities slightly declined, while property, plant, and equipment remained stable. Total assets showed minimal change, holding steady at around three hundred fifty-three billion dollars. + </summary> + <summary number="3"> + The table outlines the company's shareholders' equity as of December 30, 2023, versus September 30, 2023. Common stock and additional paid-in capital increased, and retained earnings shifted from a deficit to a positive figure. Accumulated other comprehensive loss decreased. Overall, total shareholders' equity rose significantly, while total liabilities and equity remained nearly unchanged at about three hundred fifty-three billion dollars. + </summary> + <summary number="4"> + The table details the company's liabilities as of December 30, 2023, compared to September 30, 2023. Current liabilities decreased due to lower accounts payable and other current liabilities, while deferred revenue slightly increased. Commercial paper significantly decreased, and term debt rose modestly. Non-current liabilities were stable, with minimal changes in term debt and other non-current liabilities. Total liabilities dropped from two hundred ninety billion dollars to two hundred seventy-nine billion dollars. + </summary> + <summary number="5"> + </summary> + </summaries> + </output> + </example> + + <final_notes> + <note>Process each image or table in the order provided.</note> + <note>Maintain consistent formatting throughout your response.</note> + <note>Ensure the output is in full, valid XML format with the root <summaries> element and each summary being within a <summary> element with the summary number specified as well.</note> + </final_notes> +</instruction> + """ + content = [] + for number, img in images.items(): + content.append({"type": "text", "text": f"\nImage {number}:\n"}) + content.append({"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": img}}) + + messages = [ + {"role": "user", "content": content} + ] + + try: + response = self.client.messages.create( + model='claude-3-5-sonnet-20240620', + system=prompt, + max_tokens=400 * len(images), # Increased token limit for more detailed summaries + messages=messages, + temperature=0, + extra_headers={"anthropic-beta": "max-tokens-3-5-sonnet-2024-07-15"} + ) + + # Parse the response + text = response.content[0].text + #print(text) + # Attempt to parse and fix the XML if necessary + parser = etree.XMLParser(recover=True) + root = etree.fromstring(text, parser=parser) + # Check if there were errors corrected + # if parser.error_log: + # #print("XML Parsing Errors:") + # for error in parser.error_log: + # #print(error) + # Extract summaries + summaries = {} + for summary in root.findall('summary'): + number = int(summary.get('number')) + content = summary.text.strip() if summary.text else "" + if content: # Only include non-empty summaries + summaries[number] = content + + return summaries + + except Exception as e: + # Print errors to stderr so they don't interfere with JSON output + print(json.dumps({"error": str(e)}), file=sys.stderr) + sys.stderr.flush() + + +class DocumentType(Enum): + """ + Enum representing different types of documents that can be processed. + """ + PDF = "pdf" # PDF file type + CSV = "csv" # CSV file type + TXT = "txt" # Plain text file type + HTML = "html" # HTML file type + + +class FileTypeNotSupportedException(Exception): + """ + Exception raised when a file type is unsupported during document processing. 
+ """ + + def __init__(self, file_extension: str): + """ + Initialize the exception with the unsupported file extension. + + :param file_extension: The file extension that triggered the exception. + """ + self.file_extension = file_extension + self.message = f"File type '{file_extension}' is not supported." + super().__init__(self.message) # Call the parent class constructor with the message + + +class Document: + """ + Represents a document being processed, such as a PDF, handling chunking, embedding, and summarization. + """ + + def __init__(self, file_path: str, file_name: str, job_id: str, output_folder: str): + """ + Initialize the Document with file data, file name, and job ID. + + :param file_data: The binary data of the file being processed. + :param file_name: The name of the file being processed. + :param job_id: The job ID associated with this document processing task. + """ + self.output_folder = output_folder + self.file_name = file_name + self.file_path = file_path + self.job_id = job_id + self.type = self._get_document_type(file_name) # Determine the document type (PDF, CSV, etc.) + self.doc_id = job_id # Use the job ID as the document ID + self.chunks = [] # List to hold text and visual chunks + self.num_pages = 0 # Number of pages in the document (if applicable) + self.summary = "" # The generated summary for the document + self._process() # Start processing the document + + def _process(self): + """ + Process the document: extract chunks, embed them, and generate a summary. + """ + with open(self.file_path, 'rb') as file: + pdf_data = file.read() + pdf_chunker = PDFChunker(output_folder=self.output_folder, doc_id=self.doc_id) # Initialize PDFChunker + self.chunks = asyncio.run(pdf_chunker.chunk_pdf(pdf_data, os.path.basename(self.file_path), self.doc_id, self.job_id)) # Extract chunks + self.num_pages = self._get_pdf_pages(pdf_data) # Get the number of pages in the document + self._embed_chunks() # Embed the text chunks into embeddings + self.summary = self._generate_summary() # Generate a summary for the document + + def _get_pdf_pages(self, pdf_data: bytes) -> int: + """ + Get the total number of pages in the PDF document. + """ + pdf_file = io.BytesIO(pdf_data) # Convert the file data to an in-memory binary stream + pdf_reader = PdfReader(pdf_file) # Initialize PDF reader + return len(pdf_reader.pages) # Return the number of pages in the PDF + + + def _get_document_type(self, file_name: str) -> DocumentType: + """ + Determine the document type based on its file extension. + + :param file_name: The name of the file being processed. + :return: The DocumentType enum value corresponding to the file extension. + """ + _, extension = os.path.splitext(file_name) # Split the file name to get the extension + extension = extension.lower().lstrip('.') # Convert to lowercase and remove leading period + try: + return DocumentType(extension) # Try to match the extension to a DocumentType + except ValueError: + raise FileTypeNotSupportedException(extension) # Raise exception if file type is unsupported + + + def _embed_chunks(self) -> None: + """ + Embed the text chunks using the Cohere API. 
+ """ + openai = OpenAI() # Initialize Cohere client with API key + batch_size = 90 # Batch size for embedding + chunks_len = len(self.chunks) # Total number of chunks to embed + for i in tqdm(range(0, chunks_len, batch_size), desc="Embedding Chunks"): + batch = self.chunks[i: min(i + batch_size, chunks_len)] # Get batch of chunks + texts = [chunk['metadata']['text'] for chunk in batch] # Extract text from each chunk + chunk_embs_batch = openai.embeddings.create( + model="text-embedding-3-large", + input=texts, + encoding_format="float" + ) + for j, data_val in enumerate(chunk_embs_batch.data): + self.chunks[i + j]['values'] = data_val.embedding # Store the embeddings in the corresponding chunks + + def _generate_summary(self) -> str: + """ + Generate a summary of the document using KMeans clustering and a language model. + + :return: The generated summary of the document. + """ + num_clusters = min(10, len(self.chunks)) # Set number of clusters for KMeans, capped at 10 + kmeans = KMeans(n_clusters=num_clusters, random_state=42) # Initialize KMeans with 10 clusters + doc_chunks = [chunk['values'] for chunk in self.chunks if 'values' in chunk] # Extract embeddings + cluster_labels = kmeans.fit_predict(doc_chunks) # Assign each chunk to a cluster + + # Select representative chunks from each cluster + selected_chunks = [] + for i in range(num_clusters): + cluster_chunks = [chunk for chunk, label in zip(self.chunks, cluster_labels) if label == i] # Get all chunks in this cluster + cluster_embs = [emb for emb, label in zip(doc_chunks, cluster_labels) if label == i] # Get embeddings for this cluster + centroid = kmeans.cluster_centers_[i] # Get the centroid of the cluster + distances = [np.linalg.norm(np.array(emb) - centroid) for emb in cluster_embs] # Compute distance to centroid + closest_chunk = cluster_chunks[np.argmin(distances)] # Select chunk closest to the centroid + selected_chunks.append(closest_chunk) + + # Combine selected chunks into a summary + combined_text = "\n\n".join([chunk['metadata']['text'] for chunk in selected_chunks]) # Concatenate chunk texts + + client = OpenAI() # Initialize OpenAI client for text generation + completion = client.chat.completions.create( + model="gpt-3.5-turbo", # Specify the language model + messages=[ + {"role": "system", + "content": "You are an AI assistant tasked with summarizing a document. You are provided with important chunks from the document and provide a summary, as best you can, of what the document will contain overall. Be concise and brief with your response."}, + {"role": "user", "content": f"""Please provide a comprehensive summary of what you think the document from which these chunks were sampled would be. + Ensure the summary captures the main ideas and key points from all provided chunks. Be concise and brief and only provide the summary in paragraph form. + + Sample text chunks: + ``` + {combined_text} + ``` + ********** + Summary: + """} + ], + max_tokens=300 # Set max tokens for the summary + ) + return completion.choices[0].message.content.strip() # Return the generated summary + + def to_json(self) -> str: + """ + Return the document's data in JSON format. + + :return: JSON string representing the document's metadata, chunks, and summary. 
+ """ + return json.dumps({ + "file_name": self.file_name, + "num_pages": self.num_pages, + "summary": self.summary, + "chunks": self.chunks, + "type": self.type.value, + "doc_id": self.doc_id + }, indent=2) # Convert the document's attributes to JSON format + +def process_document(file_path, job_id, output_folder): + """ + Top-level function to process a document and return the JSON output. + + :param file_path: The path to the file being processed. + :param job_id: The job ID for this document processing task. + :return: The processed document's data in JSON format. + """ + new_document = Document(file_path, file_path, job_id, output_folder) + return new_document.to_json() + +def main(): + """ + Main entry point for the script, called with arguments from Node.js. + """ + if len(sys.argv) != 4: + print(json.dumps({"error": "Invalid arguments"}), file=sys.stderr) + return + + job_id = sys.argv[1] + file_path = sys.argv[2] + output_folder = sys.argv[3] # Get the output folder from arguments + + try: + os.makedirs(output_folder, exist_ok=True) + + # Process the document + document_result = process_document(file_path, job_id, output_folder) # Pass output_folder + + # Output the final result as JSON to stdout + print(document_result) + sys.stdout.flush() + + except Exception as e: + # Print errors to stderr so they don't interfere with JSON output + print(json.dumps({"error": str(e)}), file=sys.stderr) + sys.stderr.flush() + +if __name__ == "__main__": + main() # Execute the main function when the script is run
\ No newline at end of file diff --git a/src/server/chunker/requirements.txt b/src/server/chunker/requirements.txt new file mode 100644 index 000000000..20bd486e5 --- /dev/null +++ b/src/server/chunker/requirements.txt @@ -0,0 +1,15 @@ +anthropic==0.34.0 +cohere==5.8.0 +python-dotenv==1.0.1 +pymupdf==1.22.2 +lxml==5.3.0 +layoutparser==0.3.4 +numpy==1.26.4 +openai==1.40.6 +Pillow==10.4.0 +pytesseract==0.3.10 +PyPDF2==3.0.1 +scikit-learn==1.5.1 +tqdm==4.66.5 +ultralyticsplus==0.0.28 +easyocr==1.7.0
\ No newline at end of file diff --git a/src/server/flashcard/labels.py b/src/server/flashcard/labels.py new file mode 100644 index 000000000..546fc4bd3 --- /dev/null +++ b/src/server/flashcard/labels.py @@ -0,0 +1,285 @@ +import base64 +import numpy as np +import base64 +import easyocr +import sys +from PIL import Image +from io import BytesIO +import requests +import json +import numpy as np + +class BoundingBoxUtils: + """Utility class for bounding box operations and OCR result corrections.""" + + @staticmethod + def is_close(box1, box2, x_threshold=20, y_threshold=20): + """ + Determines if two bounding boxes are horizontally and vertically close. + + Parameters: + box1, box2 (list): The bounding boxes to compare. + x_threshold (int): The threshold for horizontal proximity. + y_threshold (int): The threshold for vertical proximity. + + Returns: + bool: True if boxes are close, False otherwise. + """ + horizontally_close = (abs(box1[2] - box2[0]) < x_threshold or # Right edge of box1 and left edge of box2 + abs(box2[2] - box1[0]) < x_threshold or # Right edge of box2 and left edge of box1 + abs(box1[2] - box2[2]) < x_threshold or + abs(box2[0] - box1[0]) < x_threshold) + + vertically_close = (abs(box1[3] - box2[1]) < y_threshold or # Bottom edge of box1 and top edge of box2 + abs(box2[3] - box1[1]) < y_threshold or + box1[1] == box2[1] or box1[3] == box2[3]) + + return horizontally_close and vertically_close + + @staticmethod + def adjust_bounding_box(bbox, original_text, corrected_text): + """ + Adjusts a bounding box based on differences in text length. + + Parameters: + bbox (list): The original bounding box coordinates. + original_text (str): The original text detected by OCR. + corrected_text (str): The corrected text after cleaning. + + Returns: + list: The adjusted bounding box. + """ + if not bbox or len(bbox) != 4: + return bbox + + # Adjust the x-coordinates slightly to account for text correction + x_adjustment = 5 + adjusted_bbox = [ + [bbox[0][0] + x_adjustment, bbox[0][1]], + [bbox[1][0], bbox[1][1]], + [bbox[2][0] + x_adjustment, bbox[2][1]], + [bbox[3][0], bbox[3][1]] + ] + return adjusted_bbox + + @staticmethod + def correct_ocr_results(results): + """ + Corrects common OCR misinterpretations in the detected text and adjusts bounding boxes accordingly. + + Parameters: + results (list): A list of OCR results, each containing bounding box, text, and confidence score. + + Returns: + list: Corrected OCR results with adjusted bounding boxes. + """ + corrections = { + "~": "", # Replace '~' with empty string + "-": "" # Replace '-' with empty string + } + + corrected_results = [] + for (bbox, text, prob) in results: + corrected_text = ''.join(corrections.get(char, char) for char in text) + adjusted_bbox = BoundingBoxUtils.adjust_bounding_box(bbox, text, corrected_text) + corrected_results.append((adjusted_bbox, corrected_text, prob)) + + return corrected_results + + @staticmethod + def convert_to_json_serializable(data): + """ + Converts a list containing various types, including numpy types, to a JSON-serializable format. + + Parameters: + data (list): A list containing numpy or other non-serializable types. + + Returns: + list: A JSON-serializable version of the input list. 
+ """ + def convert_element(element): + if isinstance(element, list): + return [convert_element(e) for e in element] + elif isinstance(element, tuple): + return tuple(convert_element(e) for e in element) + elif isinstance(element, np.integer): + return int(element) + elif isinstance(element, np.floating): + return float(element) + elif isinstance(element, np.ndarray): + return element.tolist() + else: + return element + + return convert_element(data) + +class ImageLabelProcessor: + """Class to process images and perform OCR with EasyOCR.""" + + VERTICAL_THRESHOLD = 20 + HORIZONTAL_THRESHOLD = 8 + + def __init__(self, img_source, source_type, smart_mode): + self.img_source = img_source + self.source_type = source_type + self.smart_mode = smart_mode + self.img_val = self.load_image() + + def load_image(self): + """Load image from either a base64 string or URL.""" + if self.source_type == 'drag': + return self._load_base64_image() + else: + return self._load_url_image() + + def _load_base64_image(self): + """Decode and save the base64 image.""" + base64_string = self.img_source + if base64_string.startswith("data:image"): + base64_string = base64_string.split(",")[1] + + + # Decode the base64 string + image_data = base64.b64decode(base64_string) + image = Image.open(BytesIO(image_data)).convert('RGB') + image.save("temp_image.jpg") + return "temp_image.jpg" + + def _load_url_image(self): + """Download image from URL and return it in byte format.""" + url = self.img_source + response = requests.get(url) + image = Image.open(BytesIO(response.content)).convert('RGB') + + image_bytes = BytesIO() + image.save(image_bytes, format='PNG') + return image_bytes.getvalue() + + def process_image(self): + """Process the image and return the OCR results.""" + if self.smart_mode: + return self._process_smart_mode() + else: + return self._process_standard_mode() + + def _process_smart_mode(self): + """Process the image in smart mode using EasyOCR.""" + reader = easyocr.Reader(['en']) + result = reader.readtext(self.img_val, detail=1, paragraph=True) + + all_boxes = [bbox for bbox, text in result] + all_texts = [text for bbox, text in result] + + response_data = { + 'status': 'success', + 'message': 'Data received', + 'boxes': BoundingBoxUtils.convert_to_json_serializable(all_boxes), + 'text': BoundingBoxUtils.convert_to_json_serializable(all_texts), + } + + return response_data + + def _process_standard_mode(self): + """Process the image in standard mode using EasyOCR.""" + reader = easyocr.Reader(['en']) + results = reader.readtext(self.img_val) + + filtered_results = BoundingBoxUtils.correct_ocr_results([ + (bbox, text, prob) for bbox, text, prob in results if prob >= 0.7 + ]) + + return self._merge_and_prepare_response(filtered_results) + + def are_vertically_close(self, box1, box2): + """Check if two bounding boxes are vertically close.""" + box1_bottom = max(box1[2][1], box1[3][1]) + box2_top = min(box2[0][1], box2[1][1]) + vertical_distance = box2_top - box1_bottom + + box1_left = box1[0][0] + box2_left = box2[0][0] + box1_right = box1[1][0] + box2_right = box2[1][0] + hori_close = abs(box2_left - box1_left) <= self.HORIZONTAL_THRESHOLD or abs(box2_right - box1_right) <= self.HORIZONTAL_THRESHOLD + + return vertical_distance <= self.VERTICAL_THRESHOLD and hori_close + + def merge_boxes(self, boxes, texts): + """Merge multiple bounding boxes and their associated text.""" + x_coords = [] + y_coords = [] + + # Collect all x and y coordinates + for box in boxes: + for point in box: + 
x_coords.append(point[0])
+ y_coords.append(point[1])
+
+ # Create the merged bounding box
+ merged_box = [
+ [min(x_coords), min(y_coords)],
+ [max(x_coords), min(y_coords)],
+ [max(x_coords), max(y_coords)],
+ [min(x_coords), max(y_coords)]
+ ]
+
+ # Combine the texts
+ merged_text = ' '.join(texts)
+
+ return merged_box, merged_text
+
+ def _merge_and_prepare_response(self, filtered_results):
+ """Merge vertically close boxes and prepare the final response."""
+ current_boxes, current_texts = [], []
+ all_boxes, all_texts = [], []
+
+ for ind in range(len(filtered_results) - 1):
+ if not current_boxes:
+ current_boxes.append(filtered_results[ind][0])
+ current_texts.append(filtered_results[ind][1])
+
+ if self.are_vertically_close(filtered_results[ind][0], filtered_results[ind + 1][0]):
+ current_boxes.append(filtered_results[ind + 1][0])
+ current_texts.append(filtered_results[ind + 1][1])
+ else:
+ merged = self.merge_boxes(current_boxes, current_texts)
+ all_boxes.append(merged[0])
+ all_texts.append(merged[1])
+ current_boxes, current_texts = [], []
+
+ if current_boxes:
+ merged = self.merge_boxes(current_boxes, current_texts)
+ all_boxes.append(merged[0])
+ all_texts.append(merged[1])
+
+ if not current_boxes and filtered_results:
+ merged = self.merge_boxes([filtered_results[-1][0]], [filtered_results[-1][1]])
+ all_boxes.append(merged[0])
+ all_texts.append(merged[1])
+
+ response = {
+ 'status': 'success',
+ 'message': 'Data received',
+ 'boxes': BoundingBoxUtils.convert_to_json_serializable(all_boxes),
+ 'text': BoundingBoxUtils.convert_to_json_serializable(all_texts),
+ }
+
+ return response
+
+# Main execution function
+def labels():
+ """Main function to handle image OCR processing based on input arguments."""
+ source_type = sys.argv[2]
+ smart_mode = (sys.argv[3] == 'smart')
+ with open(sys.argv[1], 'r') as f:
+ img_source = f.read()
+ # Create ImageLabelProcessor instance
+ processor = ImageLabelProcessor(img_source, source_type, smart_mode)
+ response = processor.process_image()
+
+ # Print the response as JSON for the calling process, and return it
+ print(json.dumps(response))
+ return response
+
+
+labels()
diff --git a/src/server/flashcard/requirements.txt b/src/server/flashcard/requirements.txt
new file mode 100644
index 000000000..eb92a819b
--- /dev/null
+++ b/src/server/flashcard/requirements.txt
@@ -0,0 +1,12 @@
+easyocr==1.7.1
+requests==2.32.3
+pillow==10.4.0
+numpy==1.26.4
+tqdm==4.66.4
+Werkzeug==3.0.3
+python-dateutil==2.9.0.post0
+six==1.16.0
+certifi==2024.6.2
+charset-normalizer==3.3.2
+idna==3.7
+urllib3==1.26.19
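[Editor's note] labels.py has an analogous command-line contract: argv[1] is the path of a temporary file whose contents are the image source (a base64 string when argv[2] is 'drag', otherwise a URL), and argv[3] enables smart mode when it equals 'smart'. Below is a hedged sketch of a TypeScript caller; the helper name runOcrLabels, the temp-file path, and the python3 invocation are assumptions for illustration, not the actual FlashcardManager code, and the JSON.parse of stdout relies on the script printing JSON as in the print(json.dumps(...)) call above.

import { spawn } from 'child_process';
import { writeFileSync } from 'fs';
import * as os from 'os';
import * as path from 'path';

function runOcrLabels(imgSource: string, sourceType: 'drag' | 'url', smart: boolean): Promise<{ boxes: number[][][]; text: string[] }> {
    // labels.py reads the image source out of a file rather than argv, since base64 data can be very large.
    const tmp = path.join(os.tmpdir(), `ocr_source_${Date.now()}.txt`); // hypothetical temp path
    writeFileSync(tmp, imgSource);
    return new Promise((resolve, reject) => {
        const proc = spawn('python3', ['src/server/flashcard/labels.py', tmp, sourceType, smart ? 'smart' : 'standard']);
        let out = '';
        proc.stdout.on('data', d => { out += d.toString(); });
        proc.on('close', code => (code === 0 ? resolve(JSON.parse(out)) : reject(new Error(`labels.py exited with code ${code}`))));
    });
}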
\ No newline at end of file diff --git a/src/server/flashcard/venv/pyvenv.cfg b/src/server/flashcard/venv/pyvenv.cfg new file mode 100644 index 000000000..740014e00 --- /dev/null +++ b/src/server/flashcard/venv/pyvenv.cfg @@ -0,0 +1,3 @@ +home = /Library/Frameworks/Python.framework/Versions/3.10/bin +include-system-site-packages = false +version = 3.10.11 diff --git a/src/server/index.ts b/src/server/index.ts index 88dbd232d..3b77359ec 100644 --- a/src/server/index.ts +++ b/src/server/index.ts @@ -4,9 +4,11 @@ import * as mobileDetect from 'mobile-detect'; import * as path from 'path'; import { logExecution } from './ActionUtilities'; import AssistantManager from './ApiManagers/AssistantManager'; +import FlashcardManager from './ApiManagers/FlashcardManager'; import DataVizManager from './ApiManagers/DataVizManager'; import DeleteManager from './ApiManagers/DeleteManager'; import DownloadManager from './ApiManagers/DownloadManager'; +import FireflyManager from './ApiManagers/FireflyManager'; import GeneralGoogleManager from './ApiManagers/GeneralGoogleManager'; import SessionManager from './ApiManagers/SessionManager'; import UploadManager from './ApiManagers/UploadManager'; @@ -71,6 +73,8 @@ function routeSetter({ addSupervisedRoute, logRegistrationOutcome }: RouteManage new GeneralGoogleManager(), /* new GooglePhotosManager(), */ new DataVizManager(), new AssistantManager(), + new FlashcardManager(), + new FireflyManager(), ]; // initialize API Managers @@ -112,7 +116,6 @@ function routeSetter({ addSupervisedRoute, logRegistrationOutcome }: RouteManage }); const serve: PublicHandler = ({ req, res }) => { - // eslint-disable-next-line new-cap const detector = new mobileDetect(req.headers['user-agent'] || ''); const filename = detector.mobile() !== null ? 'mobile/image.html' : 'index.html'; res.sendFile(path.join(__dirname, '../../deploy/' + filename)); diff --git a/src/server/server_Initialization.ts b/src/server/server_Initialization.ts index 0cf9a6e58..a56ab5d18 100644 --- a/src/server/server_Initialization.ts +++ b/src/server/server_Initialization.ts @@ -108,14 +108,14 @@ function registerEmbeddedBrowseRelativePathHandler(server: express.Express) { // detect search query and use default search engine res.redirect(req.headers.referer + 'corsProxy/' + encodeURIComponent('http://www.google.com' + relativeUrl)); } else { - res.end(); + res.status(404).json({ error: 'no such file or endpoint: try /home /logout /login' }); } }); } // eslint-disable-next-line @typescript-eslint/no-explicit-any function proxyServe(req: any, requrl: string, response: any) { - // eslint-disable-next-line global-require, @typescript-eslint/no-require-imports, @typescript-eslint/no-var-requires + // eslint-disable-next-line @typescript-eslint/no-require-imports const htmlBodyMemoryStream = new (require('memorystream'))(); let wasinBrFormat = false; const sendModifiedBody = () => { @@ -130,7 +130,7 @@ function proxyServe(req: any, requrl: string, response: any) { const htmlText = htmlInputText .toString('utf8') .replace('<head>', '<head> <style>[id ^= "google"] { display: none; } </style>') - .replace(/(src|href)=(['"])(https?[^\2\n]*)\1/g, refToCors) // replace src or href='http(s)://...' or href="http(s)://.." + .replace(/(src|href)=(['"])(https?[^\n]*)\1/g, refToCors) // replace src or href='http(s)://...' or href="http(s)://.." 
             // .replace(/= *"\/([^"]*)"/g, relpathToCors)
             .replace(/data-srcset="[^"]*"/g, '')
             .replace(/srcset="[^"]*"/g, '')
@@ -189,7 +189,6 @@ function proxyServe(req: any, requrl: string, response: any) {
             res.headers['x-permitted-cross-domain-policies'] = 'all';
             res.headers['x-frame-options'] = '';
             res.headers['content-security-policy'] = '';
-            // eslint-disable-next-line no-multi-assign
             response.headers = response._headers = res.headers;
         })
         .on('end', sendModifiedBody)
@@ -247,6 +246,10 @@ export default async function InitializeServer(routeSetter: RouteSetter) {
     const app = buildWithMiddleware(express());
     const compiler = webpack(config as webpack.Configuration);
+    // Default route
+    app.get('/', (req, res) => {
+        res.redirect(req.user ? '/home' : '/login'); //res.send('This is the default route.');
+    });
 
     // route table managed by express. routes are tested sequentially against each of these map rules. when a match is found, the handler is called to process the request
     app.use(wdm(compiler, { publicPath: config.output.publicPath }));
     app.use(whm(compiler));
@@ -259,7 +262,6 @@
     isRelease && !SSL.Loaded && SSL.exit();
     routeSetter(new RouteManager(app, isRelease)); // this sets up all the regular supervised routes (things like /home, download/upload APIs, pdf, search, session, etc)
     registerEmbeddedBrowseRelativePathHandler(app); // this allows rendered web pages which internally have relative paths to find their content
-    isRelease && process.env.serverPort && (resolvedPorts.server = Number(process.env.serverPort));
 
     const server = isRelease ? createServer(SSL.Credentials, app) : app;
     await new Promise<void>(resolve => {
diff --git a/src/server/websocket.ts b/src/server/websocket.ts
index 1e25a8a27..effe94219 100644
--- a/src/server/websocket.ts
+++ b/src/server/websocket.ts
@@ -61,27 +61,6 @@ export namespace WebSocket {
         Database.Instance.getDocuments(ids, callback);
     }
 
-    const pendingOps = new Map<string, { diff: Diff; socket: Socket }[]>();
-
-    function dispatchNextOp(id: string): unknown {
-        const next = pendingOps.get(id)?.shift();
-        // eslint-disable-next-line @typescript-eslint/no-unused-vars
-        const nextOp = (res: boolean) => dispatchNextOp(id);
-        if (next) {
-            const { diff, socket } = next;
-            // ideally, we'd call the Database update method for all actions, but for now we handle list insertion/removal on our own
-            switch (diff.diff.$addToSet ? 'add' : diff.diff.$remFromSet ? 'rem' : 'set') {
-                case 'add': return GetRefFieldLocal(id, (result) => addToListField(socket, diff, result, nextOp)); // prettier-ignore
-                case 'rem': return GetRefFieldLocal(id, (result) => remFromListField(socket, diff, result, nextOp)); // prettier-ignore
-                default: return Database.Instance.update(id, diff.diff,
-                    () => nextOp(socket.broadcast.emit(MessageStore.UpdateField.Message, diff)),
-                    false
-                ); // prettier-ignore
-            }
-        }
-        return !pendingOps.get(id)?.length && pendingOps.delete(id);
-    }
-
     function addToListField(socket: Socket, diff: Diff, listDoc: serializedDoctype | undefined, cb: (res: boolean) => void): void {
         const $addToSet = diff.diff.$addToSet as serializedFieldsType;
         const updatefield = Array.from(Object.keys($addToSet ?? {}))[0];
@@ -181,6 +160,27 @@ export namespace WebSocket {
         } else cb(false);
     }
 
+    const pendingOps = new Map<string, { diff: Diff; socket: Socket }[]>();
+
+    function dispatchNextOp(id: string): unknown {
+        const next = pendingOps.get(id)?.shift();
+        // eslint-disable-next-line @typescript-eslint/no-unused-vars
+        const nextOp = (res: boolean) => dispatchNextOp(id);
+        if (next) {
+            const { diff, socket } = next;
+            // ideally, we'd call the Database update method for all actions, but for now we handle list insertion/removal on our own
+            switch (diff.diff.$addToSet ? 'add' : diff.diff.$remFromSet ? 'rem' : 'set') {
+                case 'add': return GetRefFieldLocal(id, (result) => addToListField(socket, diff, result, nextOp)); // prettier-ignore
+                case 'rem': return GetRefFieldLocal(id, (result) => remFromListField(socket, diff, result, nextOp)); // prettier-ignore
+                default: return Database.Instance.update(id, diff.diff,
+                    () => nextOp(socket.broadcast.emit(MessageStore.UpdateField.Message, diff)),
+                    false
+                ); // prettier-ignore
+            }
+        }
+        return !pendingOps.get(id)?.length && pendingOps.delete(id);
+    }
+
     function UpdateField(socket: Socket, diff: Diff) {
         const curUser = socketMap.get(socket);
         if (curUser) {
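The relocated pendingOps/dispatchNextOp block above serializes field updates per document id: each queued op runs only when the completion callback of the previous one fires, and the map entry is deleted once its queue drains. Below is a stripped-down sketch of the same per-key queue pattern, with illustrative names rather than Dash's actual API, and promises standing in for the callback style used here:

type Op = () => Promise<void>;

const queues = new Map<string, Op[]>();

// Queue an op for the given id; start draining if this id was idle.
function enqueue(id: string, op: Op): void {
    const queue = queues.get(id);
    if (queue) {
        queue.push(op); // an op for this id is already in flight; this one runs later
    } else {
        queues.set(id, [op]);
        void drain(id); // first op for an idle id: start the chain
    }
}

// Run queued ops for one id strictly in order; ops for other ids proceed independently.
async function drain(id: string): Promise<void> {
    const next = queues.get(id)?.shift();
    if (!next) {
        queues.delete(id); // queue exhausted; drop the entry (mirrors pendingOps.delete)
        return;
    }
    await next();
    void drain(id); // dispatch the next queued op, as dispatchNextOp does via nextOp
}

UpdateField presumably pushes { diff, socket } onto pendingOps and kicks off dispatchNextOp when the queue for that id was empty, but the enqueue site falls outside the hunks shown here.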