1 files changed, 24 insertions, 30 deletions
diff --git a/src/server/chunker/pdf_chunker.py b/src/server/chunker/pdf_chunker.py
index 48b2dbf97..a9dbcbb0c 100644
--- a/src/server/chunker/pdf_chunker.py
+++ b/src/server/chunker/pdf_chunker.py
@@ -668,7 +668,7 @@ class Document:
     Represents a document being processed, such as a PDF, handling chunking, embedding, and summarization.
     """
 
-    def __init__(self, file_data: bytes, file_name: str, job_id: str, output_folder: str):
+    def __init__(self, file_path: str, file_name: str, job_id: str, output_folder: str):
         """
         Initialize the Document with file data, file name, and job ID.
 
@@ -677,8 +677,8 @@ class Document:
         :param job_id: The job ID associated with this document processing task.
         """
         self.output_folder = output_folder
-        self.file_data = file_data
         self.file_name = file_name
+        self.file_path = file_path
         self.job_id = job_id
         self.type = self._get_document_type(file_name)  # Determine the document type (PDF, CSV, etc.)
         self.doc_id = job_id  # Use the job ID as the document ID
@@ -691,13 +691,23 @@ class Document:
         """
         Process the document: extract chunks, embed them, and generate a summary.
         """
+        with open(self.file_path, 'rb') as file:
+            pdf_data = file.read()
         pdf_chunker = PDFChunker(output_folder=self.output_folder, doc_id=self.doc_id)  # Initialize PDFChunker
-        self.chunks = asyncio.run(pdf_chunker.chunk_pdf(self.file_data, self.file_name, self.doc_id, self.job_id))  # Extract chunks
-
-        self.num_pages = self._get_pdf_pages()  # Get the number of pages in the document
+        self.chunks = asyncio.run(pdf_chunker.chunk_pdf(pdf_data, os.path.basename(self.file_path), self.doc_id, self.job_id))  # Extract chunks
+        self.num_pages = self._get_pdf_pages(pdf_data)  # Get the number of pages in the document
         self._embed_chunks()  # Embed the text chunks into embeddings
         self.summary = self._generate_summary()  # Generate a summary for the document
 
+    def _get_pdf_pages(self, pdf_data: bytes) -> int:
+        """
+        Return the number of pages in the PDF whose raw bytes are given in ``pdf_data``.
+        """
+        pdf_file = io.BytesIO(pdf_data)  # Convert the file data to an in-memory binary stream
+        pdf_reader = PdfReader(pdf_file)  # Initialize PDF reader
+        return len(pdf_reader.pages)  # Return the number of pages in the PDF
+
+
     def _get_document_type(self, file_name: str) -> DocumentType:
         """
         Determine the document type based on its file extension.
@@ -712,15 +722,6 @@ class Document:
         except ValueError:
             raise FileTypeNotSupportedException(extension)  # Raise exception if file type is unsupported
 
-    def _get_pdf_pages(self) -> int:
-        """
-        Get the total number of pages in the PDF document.
-
-        :return: The number of pages in the PDF.
-        """
-        pdf_file = io.BytesIO(self.file_data)  # Convert the file data to an in-memory binary stream
-        pdf_reader = PdfReader(pdf_file)  # Initialize PDF reader
-        return len(pdf_reader.pages)  # Return the number of pages in the PDF
 
     def _embed_chunks(self) -> None:
         """
@@ -800,39 +801,34 @@ class Document:
             "doc_id": self.doc_id
         }, indent=2)  # Convert the document's attributes to JSON format
 
-def process_document(file_data, file_name, job_id, output_folder):
+def process_document(file_path, job_id, output_folder):
     """
     Top-level function to process a document and return the JSON output.
 
-    :param file_data: The binary data of the file being processed.
-    :param file_name: The name of the file being processed.
+    :param file_path: The path to the file being processed; its base name is used as the document's file name.
     :param job_id: The job ID for this document processing task.
     :return: The processed document's data in JSON format.
     """
-    new_document = Document(file_data, file_name, job_id, output_folder)
+    new_document = Document(file_path, os.path.basename(file_path), job_id, output_folder)
     return new_document.to_json()
 
 def main():
     """
     Main entry point for the script, called with arguments from Node.js.
     """
-    if len(sys.argv) != 5:
+    if len(sys.argv) != 4:
         print(json.dumps({"error": "Invalid arguments"}), file=sys.stderr)
         return
 
     job_id = sys.argv[1]
-    file_name = sys.argv[2]
-    file_data = sys.argv[3]
-    output_folder = sys.argv[4]  # Get the output folder from arguments
+    file_path = sys.argv[2]
+    output_folder = sys.argv[3]  # Get the output folder from arguments
 
     try:
         os.makedirs(output_folder, exist_ok=True)
-
-        # Decode the base64 file data
-        file_bytes = base64.b64decode(file_data)
-
+
         # Process the document
-        document_result = process_document(file_bytes, file_name, job_id, output_folder)  # Pass output_folder
+        document_result = process_document(file_path, job_id, output_folder)  # Pass output_folder
 
         # Output the final result as JSON to stdout
         print(document_result)
@@ -843,7 +839,5 @@ def main():
         print(json.dumps({"error": str(e)}), file=sys.stderr)
         sys.stderr.flush()
 
-
-
 if __name__ == "__main__":
-    main()  # Execute the main function when the script is run
+    main()  # Execute the main function when the script is run
+\ No newline at end of file