aboutsummaryrefslogtreecommitdiff
path: root/src/server/chunker/pdf_chunker.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/server/chunker/pdf_chunker.py')
-rw-r--r--src/server/chunker/pdf_chunker.py54
1 files changed, 24 insertions, 30 deletions
diff --git a/src/server/chunker/pdf_chunker.py b/src/server/chunker/pdf_chunker.py
index 48b2dbf97..a9dbcbb0c 100644
--- a/src/server/chunker/pdf_chunker.py
+++ b/src/server/chunker/pdf_chunker.py
@@ -668,7 +668,7 @@ class Document:
Represents a document being processed, such as a PDF, handling chunking, embedding, and summarization.
"""
- def __init__(self, file_data: bytes, file_name: str, job_id: str, output_folder: str):
+ def __init__(self, file_path: str, file_name: str, job_id: str, output_folder: str):
"""
Initialize the Document with file data, file name, and job ID.
@@ -677,8 +677,8 @@ class Document:
:param job_id: The job ID associated with this document processing task.
"""
self.output_folder = output_folder
- self.file_data = file_data
self.file_name = file_name
+ self.file_path = file_path
self.job_id = job_id
self.type = self._get_document_type(file_name) # Determine the document type (PDF, CSV, etc.)
self.doc_id = job_id # Use the job ID as the document ID
@@ -691,13 +691,23 @@ class Document:
"""
Process the document: extract chunks, embed them, and generate a summary.
"""
+ with open(self.file_path, 'rb') as file:
+ pdf_data = file.read()
pdf_chunker = PDFChunker(output_folder=self.output_folder, doc_id=self.doc_id) # Initialize PDFChunker
- self.chunks = asyncio.run(pdf_chunker.chunk_pdf(self.file_data, self.file_name, self.doc_id, self.job_id)) # Extract chunks
-
- self.num_pages = self._get_pdf_pages() # Get the number of pages in the document
+ self.chunks = asyncio.run(pdf_chunker.chunk_pdf(pdf_data, os.path.basename(self.file_path), self.doc_id, self.job_id)) # Extract chunks
+ self.num_pages = self._get_pdf_pages(pdf_data) # Get the number of pages in the document
self._embed_chunks() # Embed the text chunks into embeddings
self.summary = self._generate_summary() # Generate a summary for the document
+ def _get_pdf_pages(self, pdf_data: bytes) -> int:
+ """
+ Get the total number of pages in the PDF document.
+ """
+ pdf_file = io.BytesIO(pdf_data) # Convert the file data to an in-memory binary stream
+ pdf_reader = PdfReader(pdf_file) # Initialize PDF reader
+ return len(pdf_reader.pages) # Return the number of pages in the PDF
+
+
def _get_document_type(self, file_name: str) -> DocumentType:
"""
Determine the document type based on its file extension.
@@ -712,15 +722,6 @@ class Document:
except ValueError:
raise FileTypeNotSupportedException(extension) # Raise exception if file type is unsupported
- def _get_pdf_pages(self) -> int:
- """
- Get the total number of pages in the PDF document.
-
- :return: The number of pages in the PDF.
- """
- pdf_file = io.BytesIO(self.file_data) # Convert the file data to an in-memory binary stream
- pdf_reader = PdfReader(pdf_file) # Initialize PDF reader
- return len(pdf_reader.pages) # Return the number of pages in the PDF
def _embed_chunks(self) -> None:
"""
@@ -800,39 +801,34 @@ class Document:
"doc_id": self.doc_id
}, indent=2) # Convert the document's attributes to JSON format
-def process_document(file_data, file_name, job_id, output_folder):
+def process_document(file_path, job_id, output_folder):
"""
Top-level function to process a document and return the JSON output.
- :param file_data: The binary data of the file being processed.
- :param file_name: The name of the file being processed.
+ :param file_path: The path to the file being processed.
:param job_id: The job ID for this document processing task.
:return: The processed document's data in JSON format.
"""
- new_document = Document(file_data, file_name, job_id, output_folder)
+ new_document = Document(file_path, file_path, job_id, output_folder)
return new_document.to_json()
def main():
"""
Main entry point for the script, called with arguments from Node.js.
"""
- if len(sys.argv) != 5:
+ if len(sys.argv) != 4:
print(json.dumps({"error": "Invalid arguments"}), file=sys.stderr)
return
job_id = sys.argv[1]
- file_name = sys.argv[2]
- file_data = sys.argv[3]
- output_folder = sys.argv[4] # Get the output folder from arguments
+ file_path = sys.argv[2]
+ output_folder = sys.argv[3] # Get the output folder from arguments
try:
os.makedirs(output_folder, exist_ok=True)
-
- # Decode the base64 file data
- file_bytes = base64.b64decode(file_data)
-
+
# Process the document
- document_result = process_document(file_bytes, file_name, job_id, output_folder) # Pass output_folder
+ document_result = process_document(file_path, job_id, output_folder) # Pass output_folder
# Output the final result as JSON to stdout
print(document_result)
@@ -843,7 +839,5 @@ def main():
print(json.dumps({"error": str(e)}), file=sys.stderr)
sys.stderr.flush()
-
-
if __name__ == "__main__":
- main() # Execute the main function when the script is run
+ main() # Execute the main function when the script is run \ No newline at end of file