aboutsummaryrefslogtreecommitdiff
path: root/src/server
diff options
context:
space:
mode:
Diffstat (limited to 'src/server')
-rw-r--r--src/server/ApiManagers/AssistantManager.ts157
-rw-r--r--src/server/chunker/pdf_chunker.py70
-rw-r--r--src/server/flashcard/labels.py285
-rw-r--r--src/server/flashcard/requirements.txt12
-rw-r--r--src/server/flashcard/venv/pyvenv.cfg3
5 files changed, 438 insertions, 89 deletions
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts
index 8447a4934..4d2068014 100644
--- a/src/server/ApiManagers/AssistantManager.ts
+++ b/src/server/ApiManagers/AssistantManager.ts
@@ -9,7 +9,7 @@
*/
import { Readability } from '@mozilla/readability';
-import axios from 'axios';
+import axios, { AxiosResponse } from 'axios';
import { spawn } from 'child_process';
import * as fs from 'fs';
import { writeFile } from 'fs';
@@ -23,6 +23,7 @@ import { AI_Document } from '../../client/views/nodes/chatbot/types/types';
import { Method } from '../RouteManager';
import { filesDirectory, publicDirectory } from '../SocketData';
import ApiManager, { Registration } from './ApiManager';
+import { getServerPath } from '../../client/util/reportManager/reportManagerUtils';
// Enumeration of directories where different file types are stored
export enum Directory {
@@ -115,29 +116,79 @@ export default class AssistantManager extends ApiManager {
},
});
- // Register Google Web Search Results API route
register({
method: Method.POST,
subscription: '/getWebSearchResults',
secureHandler: async ({ req, res }) => {
const { query, max_results } = req.body;
- try {
- // Fetch search results using Google Custom Search API
- const response = await customsearch.cse.list({
+ const MIN_VALID_RESULTS_RATIO = 0.75; // 3/4 threshold
+ let startIndex = 1; // Start at the first result initially
+ let validResults: any[] = [];
+
+ const fetchSearchResults = async (start: number) => {
+ return customsearch.cse.list({
q: query,
cx: process.env._CLIENT_GOOGLE_SEARCH_ENGINE_ID,
key: process.env._CLIENT_GOOGLE_API_KEY,
safe: 'active',
num: max_results,
+ start, // This controls which result index the search starts from
});
+ };
+
+ const filterResultsByXFrameOptions = async (results: any[]) => {
+ const filteredResults = await Promise.all(
+ results.map(async result => {
+ try {
+ const urlResponse: AxiosResponse = await axios.head(result.url, { timeout: 5000 });
+ const xFrameOptions = urlResponse.headers['x-frame-options'];
+ if (xFrameOptions && xFrameOptions.toUpperCase() === 'SAMEORIGIN') {
+ return result;
+ }
+ } catch (error) {
+ console.error(`Error checking x-frame-options for URL: ${result.url}`, error);
+ }
+ return null; // Exclude the result if it doesn't match
+ })
+ );
+ return filteredResults.filter(result => result !== null); // Remove null results
+ };
- const results =
+ try {
+ // Fetch initial search results
+ let response = await fetchSearchResults(startIndex);
+ let initialResults =
response.data.items?.map(item => ({
url: item.link,
snippet: item.snippet,
})) || [];
- res.send({ results });
+ // Filter the initial results
+ validResults = await filterResultsByXFrameOptions(initialResults);
+
+ // If valid results are less than 3/4 of max_results, fetch more results
+ while (validResults.length < max_results * MIN_VALID_RESULTS_RATIO) {
+ // Increment the start index by the max_results to fetch the next set of results
+ startIndex += max_results;
+ response = await fetchSearchResults(startIndex);
+
+ const additionalResults =
+ response.data.items?.map(item => ({
+ url: item.link,
+ snippet: item.snippet,
+ })) || [];
+
+ const additionalValidResults = await filterResultsByXFrameOptions(additionalResults);
+ validResults = [...validResults, ...additionalValidResults]; // Combine valid results
+
+ // Break if no more results are available
+ if (additionalValidResults.length === 0 || response.data.items?.length === 0) {
+ break;
+ }
+ }
+
+ // Return the filtered valid results
+ res.send({ results: validResults.slice(0, max_results) }); // Limit the results to max_results
} catch (error) {
console.error('Error performing web search:', error);
res.status(500).send({
@@ -299,47 +350,16 @@ export default class AssistantManager extends ApiManager {
method: Method.GET,
subscription: '/getResult/:jobId',
secureHandler: async ({ req, res }) => {
- const { jobId } = req.params; // Get the job ID from the URL parameters
- // Check if the job result is available
+ const { jobId } = req.params;
if (jobResults[jobId]) {
const result = jobResults[jobId] as AI_Document & { status: string };
- // If the result contains image or table chunks, save the base64 data as image files
if (result.chunks && Array.isArray(result.chunks)) {
- await Promise.all(
- result.chunks.map(chunk => {
- if (chunk.metadata && (chunk.metadata.type === 'image' || chunk.metadata.type === 'table')) {
- const files_directory = '/files/chunk_images/';
- const directory = path.join(publicDirectory, files_directory);
-
- // Ensure the directory exists or create it
- if (!fs.existsSync(directory)) {
- fs.mkdirSync(directory);
- }
-
- const fileName = path.basename(chunk.metadata.file_path); // Get the file name from the path
- const filePath = path.join(directory, fileName); // Create the full file path
-
- // Check if the chunk contains base64 encoded data
- if (chunk.metadata.base64_data) {
- // Decode the base64 data and write it to a file
- const buffer = Buffer.from(chunk.metadata.base64_data, 'base64');
- fs.promises.writeFile(filePath, buffer).then(() => {
- // Update the file path in the chunk's metadata
- chunk.metadata.file_path = path.join(files_directory, fileName);
- chunk.metadata.base64_data = undefined; // Remove the base64 data from the metadata
- });
- } else {
- console.warn(`No base64_data found for chunk: ${fileName}`);
- }
- }
- })
- );
result.status = 'completed';
} else {
result.status = 'pending';
}
- res.json(result); // Send the result back to the client
+ res.json(result);
} else {
res.status(202).send({ status: 'pending' });
}
@@ -367,7 +387,7 @@ export default class AssistantManager extends ApiManager {
// If the chunk is an image or table, read the corresponding file and encode it as base64
if (chunk.metadata.type === 'image' || chunk.metadata.type === 'table') {
try {
- const filePath = serverPathToFile(Directory.chunk_images, chunk.metadata.file_path); // Get the file path
+ const filePath = path.join(pathToDirectory(Directory.chunk_images), chunk.metadata.file_path); // Get the file path
readFileAsync(filePath).then(imageBuffer => {
const base64Image = imageBuffer.toString('base64'); // Convert the image to base64
@@ -445,10 +465,12 @@ function spawnPythonProcess(jobId: string, file_name: string, file_data: string)
const requirementsPath = path.join(__dirname, '../chunker/requirements.txt');
const pythonScriptPath = path.join(__dirname, '../chunker/pdf_chunker.py');
+ const outputDirectory = pathToDirectory(Directory.chunk_images);
+
function runPythonScript() {
const pythonPath = process.platform === 'win32' ? path.join(venvPath, 'Scripts', 'python') : path.join(venvPath, 'bin', 'python3');
- const pythonProcess = spawn(pythonPath, [pythonScriptPath, jobId, file_name, file_data]);
+ const pythonProcess = spawn(pythonPath, [pythonScriptPath, jobId, file_name, file_data, outputDirectory]);
let pythonOutput = '';
let stderrOutput = '';
@@ -460,23 +482,30 @@ function spawnPythonProcess(jobId: string, file_name: string, file_data: string)
pythonProcess.stderr.on('data', data => {
stderrOutput += data.toString();
const lines = stderrOutput.split('\n');
+ stderrOutput = lines.pop() || ''; // Save the last partial line back to stderrOutput
lines.forEach(line => {
if (line.trim()) {
- try {
- const parsedOutput = JSON.parse(line);
- if (parsedOutput.job_id && parsedOutput.progress !== undefined) {
- jobProgress[parsedOutput.job_id] = {
- step: parsedOutput.step,
- progress: parsedOutput.progress,
- };
- } else if (parsedOutput.progress !== undefined) {
- jobProgress[jobId] = {
- step: parsedOutput.step,
- progress: parsedOutput.progress,
- };
+ if (line.startsWith('PROGRESS:')) {
+ const jsonString = line.substring('PROGRESS:'.length);
+ try {
+ const parsedOutput = JSON.parse(jsonString);
+ if (parsedOutput.job_id && parsedOutput.progress !== undefined) {
+ jobProgress[parsedOutput.job_id] = {
+ step: parsedOutput.step,
+ progress: parsedOutput.progress,
+ };
+ } else if (parsedOutput.progress !== undefined) {
+ jobProgress[jobId] = {
+ step: parsedOutput.step,
+ progress: parsedOutput.progress,
+ };
+ }
+ } catch (err) {
+ console.error('Error parsing progress JSON:', jsonString, err);
}
- } catch (err) {
- console.error('Progress log from Python:', line, err);
+ } else {
+ // Log other stderr output
+ console.error('Python stderr:', line);
}
}
});
@@ -490,10 +519,24 @@ function spawnPythonProcess(jobId: string, file_name: string, file_data: string)
jobProgress[jobId] = { step: 'Complete', progress: 100 };
} catch (err) {
console.error('Error parsing final JSON result:', err);
+ jobResults[jobId] = { error: 'Failed to parse final result' };
}
} else {
console.error(`Python process exited with code ${code}`);
- jobResults[jobId] = { error: 'Python process failed' };
+ // Check if there was an error message in stderr
+ if (stderrOutput) {
+ // Try to parse the last line as JSON
+ const lines = stderrOutput.trim().split('\n');
+ const lastLine = lines[lines.length - 1];
+ try {
+ const errorOutput = JSON.parse(lastLine);
+ jobResults[jobId] = errorOutput;
+ } catch (err) {
+ jobResults[jobId] = { error: 'Python process failed' };
+ }
+ } else {
+ jobResults[jobId] = { error: 'Python process failed' };
+ }
}
});
}
diff --git a/src/server/chunker/pdf_chunker.py b/src/server/chunker/pdf_chunker.py
index 4fe3b9dbf..48b2dbf97 100644
--- a/src/server/chunker/pdf_chunker.py
+++ b/src/server/chunker/pdf_chunker.py
@@ -54,8 +54,9 @@ def update_progress(job_id, step, progress_value):
"step": step,
"progress": progress_value
}
- print(json.dumps(progress_data), file=sys.stderr) # Use stderr for progress logs
- sys.stderr.flush() # Ensure it's sent immediately
+ print(f"PROGRESS:{json.dumps(progress_data)}", file=sys.stderr)
+ sys.stderr.flush()
+
class ElementExtractor:
@@ -63,13 +64,15 @@ class ElementExtractor:
A class that uses a YOLO model to extract tables and images from a PDF page.
"""
- def __init__(self, output_folder: str):
+ def __init__(self, output_folder: str, doc_id: str):
"""
Initializes the ElementExtractor with the output folder for saving images and the YOLO model.
:param output_folder: Path to the folder where extracted elements will be saved.
"""
- self.output_folder = output_folder
+ self.doc_id = doc_id
+ self.output_folder = os.path.join(output_folder, doc_id)
+ os.makedirs(self.output_folder, exist_ok=True)
self.model = YOLO('keremberke/yolov8m-table-extraction') # Load YOLO model for table extraction
self.model.overrides['conf'] = 0.25 # Set confidence threshold for detection
self.model.overrides['iou'] = 0.45 # Set Intersection over Union (IoU) threshold
@@ -116,17 +119,16 @@ class ElementExtractor:
table_path = os.path.join(self.output_folder, table_filename)
page_with_outline.save(table_path)
- # Convert the full-page image with red outline to base64
- base64_data = self.image_to_base64(page_with_outline)
+ file_path_for_client = f"{self.doc_id}/{table_filename}"
tables.append({
'metadata': {
"type": "table",
"location": [x1 / img.width, y1 / img.height, x2 / img.width, y2 / img.height],
- "file_path": table_path,
+ "file_path": file_path_for_client,
"start_page": page_num,
"end_page": page_num,
- "base64_data": base64_data,
+ "base64_data": self.image_to_base64(page_with_outline)
}
})
@@ -175,18 +177,17 @@ class ElementExtractor:
image_path = os.path.join(self.output_folder, image_filename)
page_with_outline.save(image_path)
- # Convert the full-page image with red outline to base64
- base64_data = self.image_to_base64(page_with_outline)
+ file_path_for_client = f"{self.doc_id}/{image_filename}"
images.append({
'metadata': {
"type": "image",
"location": [x1 / page.rect.width, y1 / page.rect.height, x2 / page.rect.width,
y2 / page.rect.height],
- "file_path": image_path,
+ "file_path": file_path_for_client,
"start_page": page_num,
"end_page": page_num,
- "base64_data": base64_data,
+ "base64_data": self.image_to_base64(image)
}
})
@@ -268,7 +269,7 @@ class PDFChunker:
The main class responsible for chunking PDF files into text and visual elements (tables/images).
"""
- def __init__(self, output_folder: str = "output", image_batch_size: int = 5) -> None:
+ def __init__(self, output_folder: str = "output", doc_id: str = '', image_batch_size: int = 5) -> None:
"""
Initializes the PDFChunker with an output folder and an element extractor for visual elements.
@@ -278,7 +279,8 @@ class PDFChunker:
self.client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY")) # Initialize the Anthropic API client
self.output_folder = output_folder
self.image_batch_size = image_batch_size # Batch size for image processing
- self.element_extractor = ElementExtractor(output_folder) # Initialize the element extractor
+ self.doc_id = doc_id # Add doc_id
+ self.element_extractor = ElementExtractor(output_folder, doc_id)
async def chunk_pdf(self, file_data: bytes, file_name: str, doc_id: str, job_id: str) -> List[Dict[str, Any]]:
"""
@@ -363,6 +365,7 @@ class PDFChunker:
for j, elem in enumerate(batch, start=1):
if j in summaries:
elem['metadata']['text'] = re.sub(r'^(Image|Table):\s*', '', summaries[j])
+ elem['metadata']['base64_data'] = ''
processed_elements.append(elem)
progress = ((i // image_batch_size) + 1) / total_batches * 100 # Calculate progress
@@ -628,10 +631,11 @@ class PDFChunker:
return summaries
- except Exception:
- #print(f"Error in batch_summarize_images: {str(e)}")
- #print("Returning placeholder summaries")
- return {number: "Error: No summary available" for number in images}
+ except Exception as e:
+ # Print errors to stderr so they don't interfere with JSON output
+ print(json.dumps({"error": str(e)}), file=sys.stderr)
+ sys.stderr.flush()
+
class DocumentType(Enum):
"""
@@ -664,7 +668,7 @@ class Document:
Represents a document being processed, such as a PDF, handling chunking, embedding, and summarization.
"""
- def __init__(self, file_data: bytes, file_name: str, job_id: str):
+ def __init__(self, file_data: bytes, file_name: str, job_id: str, output_folder: str):
"""
Initialize the Document with file data, file name, and job ID.
@@ -672,6 +676,7 @@ class Document:
:param file_name: The name of the file being processed.
:param job_id: The job ID associated with this document processing task.
"""
+ self.output_folder = output_folder
self.file_data = file_data
self.file_name = file_name
self.job_id = job_id
@@ -680,14 +685,13 @@ class Document:
self.chunks = [] # List to hold text and visual chunks
self.num_pages = 0 # Number of pages in the document (if applicable)
self.summary = "" # The generated summary for the document
-
self._process() # Start processing the document
def _process(self):
"""
Process the document: extract chunks, embed them, and generate a summary.
"""
- pdf_chunker = PDFChunker(output_folder="output") # Initialize the PDF chunker
+ pdf_chunker = PDFChunker(output_folder=self.output_folder, doc_id=self.doc_id) # Initialize PDFChunker
self.chunks = asyncio.run(pdf_chunker.chunk_pdf(self.file_data, self.file_name, self.doc_id, self.job_id)) # Extract chunks
self.num_pages = self._get_pdf_pages() # Get the number of pages in the document
@@ -796,8 +800,7 @@ class Document:
"doc_id": self.doc_id
}, indent=2) # Convert the document's attributes to JSON format
-
-def process_document(file_data, file_name, job_id):
+def process_document(file_data, file_name, job_id, output_folder):
"""
Top-level function to process a document and return the JSON output.
@@ -806,28 +809,30 @@ def process_document(file_data, file_name, job_id):
:param job_id: The job ID for this document processing task.
:return: The processed document's data in JSON format.
"""
- new_document = Document(file_data, file_name, job_id) # Create a new Document object
- return new_document.to_json() # Return the document's JSON data
-
+ new_document = Document(file_data, file_name, job_id, output_folder)
+ return new_document.to_json()
def main():
"""
Main entry point for the script, called with arguments from Node.js.
"""
- if len(sys.argv) != 4:
- print(json.dumps({"error": "Invalid arguments"}), file=sys.stderr) # Print error if incorrect number of arguments
+ if len(sys.argv) != 5:
+ print(json.dumps({"error": "Invalid arguments"}), file=sys.stderr)
return
- job_id = sys.argv[1] # Get the job ID from command-line arguments
- file_name = sys.argv[2] # Get the file name from command-line arguments
- file_data = sys.argv[3] # Get the base64-encoded file data from command-line arguments
+ job_id = sys.argv[1]
+ file_name = sys.argv[2]
+ file_data = sys.argv[3]
+ output_folder = sys.argv[4] # Get the output folder from arguments
try:
+ os.makedirs(output_folder, exist_ok=True)
+
# Decode the base64 file data
file_bytes = base64.b64decode(file_data)
# Process the document
- document_result = process_document(file_bytes, file_name, job_id)
+ document_result = process_document(file_bytes, file_name, job_id, output_folder) # Pass output_folder
# Output the final result as JSON to stdout
print(document_result)
@@ -839,5 +844,6 @@ def main():
sys.stderr.flush()
+
if __name__ == "__main__":
main() # Execute the main function when the script is run
diff --git a/src/server/flashcard/labels.py b/src/server/flashcard/labels.py
new file mode 100644
index 000000000..546fc4bd3
--- /dev/null
+++ b/src/server/flashcard/labels.py
@@ -0,0 +1,285 @@
+import base64
+import numpy as np
+import base64
+import easyocr
+import sys
+from PIL import Image
+from io import BytesIO
+import requests
+import json
+import numpy as np
+
class BoundingBoxUtils:
    """Static helpers for comparing, adjusting, and serializing OCR bounding boxes."""

    @staticmethod
    def is_close(box1, box2, x_threshold=20, y_threshold=20):
        """
        Determine whether two boxes are both horizontally and vertically near.

        Parameters:
            box1, box2 (list): Flat [x1, y1, x2, y2] boxes.
                NOTE(review): other helpers in this class use 4-point
                [[x,y],...] boxes — confirm callers pass the flat form here.
            x_threshold (int): Maximum horizontal edge distance.
            y_threshold (int): Maximum vertical edge distance.

        Returns:
            bool: True when the boxes are close on both axes.
        """
        # Horizontal: any of these edge pairings within threshold.
        edge_pairs = (
            (box1[2], box2[0]),  # right of box1 vs left of box2
            (box2[2], box1[0]),  # right of box2 vs left of box1
            (box1[2], box2[2]),  # right edges
            (box2[0], box1[0]),  # left edges
        )
        x_near = any(abs(a - b) < x_threshold for a, b in edge_pairs)

        # Vertical: bottom/top edge proximity, or exactly aligned tops/bottoms.
        y_near = (abs(box1[3] - box2[1]) < y_threshold
                  or abs(box2[3] - box1[1]) < y_threshold
                  or box1[1] == box2[1]
                  or box1[3] == box2[3])

        return x_near and y_near

    @staticmethod
    def adjust_bounding_box(bbox, original_text, corrected_text):
        """
        Nudge a 4-point bounding box to account for text correction.

        Parameters:
            bbox (list): 4-point box [[x, y], ...] (top-left, top-right,
                bottom-right, bottom-left order assumed).
            original_text (str): Text before cleaning.
            corrected_text (str): Text after cleaning.
                NOTE(review): both text arguments are currently unused — the
                shift is a fixed 5px regardless of length change.

        Returns:
            list: The adjusted box, or the input unchanged if malformed.
        """
        if not bbox or len(bbox) != 4:
            return bbox

        x_shift = 5  # fixed horizontal nudge applied to 1st and 3rd corners
        p0, p1, p2, p3 = bbox
        return [
            [p0[0] + x_shift, p0[1]],
            [p1[0], p1[1]],
            [p2[0] + x_shift, p2[1]],
            [p3[0], p3[1]],
        ]

    @staticmethod
    def correct_ocr_results(results):
        """
        Strip common OCR artifacts ('~', '-') from detected text and adjust
        each result's bounding box accordingly.

        Parameters:
            results (list): (bbox, text, confidence) triples from EasyOCR.

        Returns:
            list: (adjusted_bbox, cleaned_text, confidence) triples.
        """
        replacements = {"~": "", "-": ""}

        cleaned = []
        for bbox, text, prob in results:
            fixed_text = ''.join(replacements.get(ch, ch) for ch in text)
            fixed_bbox = BoundingBoxUtils.adjust_bounding_box(bbox, text, fixed_text)
            cleaned.append((fixed_bbox, fixed_text, prob))
        return cleaned

    @staticmethod
    def convert_to_json_serializable(data):
        """
        Recursively convert numpy scalars/arrays inside nested lists/tuples
        into plain Python values so the structure can be json-encoded.

        Parameters:
            data (list): Possibly nested structure with numpy types.

        Returns:
            list: Same structure with only JSON-friendly element types.
        """
        def _convert(value):
            if isinstance(value, list):
                return [_convert(v) for v in value]
            if isinstance(value, tuple):
                return tuple(_convert(v) for v in value)
            if isinstance(value, np.integer):
                return int(value)
            if isinstance(value, np.floating):
                return float(value)
            if isinstance(value, np.ndarray):
                return value.tolist()
            return value

        return _convert(data)
+
class ImageLabelProcessor:
    """Loads an image from base64 data or a URL and runs EasyOCR over it.

    Two modes:
      * smart mode    — paragraph-grouped OCR, boxes/text returned as-is;
      * standard mode — per-word OCR filtered by confidence, then vertically
        close boxes are merged into multi-line labels.
    """

    # Max vertical gap (px) for two boxes to be merged into one label.
    VERTICAL_THRESHOLD = 20
    # Max left- or right-edge misalignment (px) for merging.
    HORIZONTAL_THRESHOLD = 8

    def __init__(self, img_source, source_type, smart_mode):
        """
        :param img_source: base64 image data (source_type 'drag') or a URL.
        :param source_type: 'drag' selects base64 decoding; anything else URL.
        :param smart_mode: True for paragraph-grouped OCR.
        """
        self.img_source = img_source
        self.source_type = source_type
        self.smart_mode = smart_mode
        # Either a temp-file path (base64 path) or PNG bytes (URL path);
        # both forms are later handed to easyocr.Reader.readtext.
        self.img_val = self.load_image()

    def load_image(self):
        """Load image from either a base64 string or URL."""
        if self.source_type == 'drag':
            return self._load_base64_image()
        else:
            return self._load_url_image()

    def _load_base64_image(self):
        """Decode the base64 image, save it to disk, and return the file path.

        NOTE(review): writes a fixed "temp_image.jpg" in the working
        directory — concurrent invocations would clobber each other's file.
        """
        base64_string = self.img_source
        # Strip a data-URI prefix ("data:image/...;base64,") if present.
        if base64_string.startswith("data:image"):
            base64_string = base64_string.split(",")[1]


        # Decode the base64 string
        image_data = base64.b64decode(base64_string)
        image = Image.open(BytesIO(image_data)).convert('RGB')
        image.save("temp_image.jpg")
        return "temp_image.jpg"

    def _load_url_image(self):
        """Download image from URL and return it as PNG-encoded bytes.

        NOTE(review): requests.get has no timeout here — a hung host would
        block this call indefinitely; consider adding one.
        """
        url = self.img_source
        response = requests.get(url)
        image = Image.open(BytesIO(response.content)).convert('RGB')

        # Re-encode to PNG bytes so the OCR reader gets a uniform format.
        image_bytes = BytesIO()
        image.save(image_bytes, format='PNG')
        return image_bytes.getvalue()

    def process_image(self):
        """Run OCR and return a JSON-serializable response dict."""
        if self.smart_mode:
            return self._process_smart_mode()
        else:
            return self._process_standard_mode()

    def _process_smart_mode(self):
        """Paragraph-grouped OCR: EasyOCR merges lines into paragraphs itself.

        With paragraph=True each result is a (box, text) pair — no
        per-detection confidence is available in this mode.
        """
        reader = easyocr.Reader(['en'])
        result = reader.readtext(self.img_val, detail=1, paragraph=True)

        all_boxes = [bbox for bbox, text in result]
        all_texts = [text for bbox, text in result]

        response_data = {
            'status': 'success',
            'message': 'Data received',
            'boxes': BoundingBoxUtils.convert_to_json_serializable(all_boxes),
            'text': BoundingBoxUtils.convert_to_json_serializable(all_texts),
        }

        return response_data

    def _process_standard_mode(self):
        """Per-detection OCR: keep confident results (>= 0.7), clean the text,
        then merge vertically adjacent boxes into multi-line labels."""
        reader = easyocr.Reader(['en'])
        results = reader.readtext(self.img_val)

        filtered_results = BoundingBoxUtils.correct_ocr_results([
            (bbox, text, prob) for bbox, text, prob in results if prob >= 0.7
        ])

        return self._merge_and_prepare_response(filtered_results)

    def are_vertically_close(self, box1, box2):
        """True when box2 sits directly below box1 and is roughly aligned.

        Boxes are 4-point [[x,y],...] quads in (top-left, top-right,
        bottom-right, bottom-left) order — assumed from the indexing here;
        TODO confirm against EasyOCR's output order.
        """
        # Vertical gap between box1's bottom edge and box2's top edge.
        box1_bottom = max(box1[2][1], box1[3][1])
        box2_top = min(box2[0][1], box2[1][1])
        vertical_distance = box2_top - box1_bottom

        # Aligned if either the left or the right edges nearly line up.
        box1_left = box1[0][0]
        box2_left = box2[0][0]
        box1_right = box1[1][0]
        box2_right = box2[1][0]
        hori_close = abs(box2_left - box1_left) <= self.HORIZONTAL_THRESHOLD or abs(box2_right - box1_right) <= self.HORIZONTAL_THRESHOLD

        return vertical_distance <= self.VERTICAL_THRESHOLD and hori_close

    def merge_boxes(self, boxes, texts):
        """Merge several boxes into one axis-aligned bounding box and join
        their texts with spaces."""
        x_coords = []
        y_coords = []

        # Collect all x and y coordinates
        for box in boxes:
            for point in box:
                x_coords.append(point[0])
                y_coords.append(point[1])

        # Create the merged bounding box
        merged_box = [
            [min(x_coords), min(y_coords)],
            [max(x_coords), min(y_coords)],
            [max(x_coords), max(y_coords)],
            [min(x_coords), max(y_coords)]
        ]

        # Combine the texts
        merged_text = ' '.join(texts)

        return merged_box, merged_text

    def _merge_and_prepare_response(self, filtered_results):
        """Group runs of consecutive, vertically-close detections, merge each
        run into one box+text, and build the response dict.

        The grouping compares each detection only with its immediate
        successor, so a group grows while the chain of pairwise closeness
        holds and is flushed when it breaks.
        """
        current_boxes, current_texts = [], []
        all_boxes, all_texts = [], []

        for ind in range(len(filtered_results) - 1):
            # Start a new group with the current detection if none is open.
            if not current_boxes:
                current_boxes.append(filtered_results[ind][0])
                current_texts.append(filtered_results[ind][1])

            if self.are_vertically_close(filtered_results[ind][0], filtered_results[ind + 1][0]):
                current_boxes.append(filtered_results[ind + 1][0])
                current_texts.append(filtered_results[ind + 1][1])
            else:
                # Chain broken: flush the accumulated group.
                merged = self.merge_boxes(current_boxes, current_texts)
                all_boxes.append(merged[0])
                all_texts.append(merged[1])
                current_boxes, current_texts = [], []

        # Flush the trailing group (it already includes the last detection
        # when the final pair was close).
        if current_boxes:
            merged = self.merge_boxes(current_boxes, current_texts)
            all_boxes.append(merged[0])
            all_texts.append(merged[1])

        # If the final pair was not close (or there was only one detection),
        # the last detection never joined a group — emit it on its own.
        if not current_boxes and filtered_results:
            merged = self.merge_boxes([filtered_results[-1][0]], [filtered_results[-1][1]])
            all_boxes.append(merged[0])
            all_texts.append(merged[1])

        response = {
            'status': 'success',
            'message': 'Data received',
            'boxes': BoundingBoxUtils.convert_to_json_serializable(all_boxes),
            'text': BoundingBoxUtils.convert_to_json_serializable(all_texts),
        }

        return response
+
# Main execution function
def labels():
    """CLI entry point: run OCR labelling based on command-line arguments.

    Expected argv: [script, source_file, source_type, mode]
      * argv[1]: path to a file whose contents are the image source
                 (base64 data or a URL, depending on argv[2])
      * argv[2]: 'drag' selects base64 decoding; anything else treats the
                 source as a URL
      * argv[3]: 'smart' enables paragraph-grouped OCR

    Prints the OCR response as JSON on stdout and returns the response dict.
    On bad arguments, prints a JSON error to stderr and returns it instead
    of crashing with IndexError.
    """
    if len(sys.argv) != 4:
        error = {'status': 'error',
                 'message': 'Usage: labels.py <source_file> <source_type> <mode>'}
        print(json.dumps(error), file=sys.stderr)
        return error

    source_type = sys.argv[2]
    smart_mode = sys.argv[3] == 'smart'
    # The image payload is passed via a file to avoid OS argv size limits.
    with open(sys.argv[1], 'r') as f:
        img_source = f.read()

    # Create ImageLabelProcessor instance and run OCR
    processor = ImageLabelProcessor(img_source, source_type, smart_mode)
    response = processor.process_image()

    # BUG FIX: emit JSON, not the Python dict repr. repr() uses single
    # quotes, which JSON.parse on the consuming side would reject.
    print(json.dumps(response))
    return response


if __name__ == "__main__":
    labels()
diff --git a/src/server/flashcard/requirements.txt b/src/server/flashcard/requirements.txt
new file mode 100644
index 000000000..eb92a819b
--- /dev/null
+++ b/src/server/flashcard/requirements.txt
@@ -0,0 +1,12 @@
+easyocr==1.7.1
+requests==2.32.3
+pillow==10.4.0
+numpy==1.26.4
+tqdm==4.66.4
+Werkzeug==3.0.3
+python-dateutil==2.9.0.post0
+six==1.16.0
+certifi==2024.6.2
+charset-normalizer==3.3.2
+idna==3.7
+urllib3==1.26.19 \ No newline at end of file
diff --git a/src/server/flashcard/venv/pyvenv.cfg b/src/server/flashcard/venv/pyvenv.cfg
new file mode 100644
index 000000000..740014e00
--- /dev/null
+++ b/src/server/flashcard/venv/pyvenv.cfg
@@ -0,0 +1,3 @@
+home = /Library/Frameworks/Python.framework/Versions/3.10/bin
+include-system-site-packages = false
+version = 3.10.11