Diffstat (limited to 'src')
-rw-r--r--  src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx |  28
-rw-r--r--  src/server/chunker/pdf_chunker.py                            | 169
2 files changed, 77 insertions, 120 deletions
diff --git a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx
index 3152fe923..33419e559 100644
--- a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx
+++ b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx
@@ -107,21 +107,12 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
             this.vectorstore_id = StrCast(this.dataDoc.vectorstore_id);
         }
         this.vectorstore = new Vectorstore(this.vectorstore_id, this.retrieveDocIds);
-        this.agent = new Agent(
-            this.vectorstore,
-            this.retrieveSummaries,
-            this.retrieveFormattedHistory,
-            this.retrieveCSVData,
-            this.addLinkedUrlDoc,
-            this.createImageInDash,
-            this.createDocInDash,
-            this.createCSVInDash
-        );
-
+        this.agent = new Agent(this.vectorstore, this.retrieveSummaries, this.retrieveFormattedHistory, this.retrieveCSVData, this.addLinkedUrlDoc, this.createImageInDash, this.createDocInDash, this.createCSVInDash);
+
         // Reinitialize the DocumentMetadataTool with a direct reference to this ChatBox instance
         // This ensures the tool can properly access documents in the same Freeform view
         this.agent.reinitializeDocumentMetadataTool(this);
-
+
         this.messagesRef = React.createRef<HTMLDivElement>();
 
         // Reaction to update dataDoc when chat history changes
@@ -743,6 +734,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
                 DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {});
                 return;
             }
+
             const x1 = parseFloat(values[0]) * Doc.NativeWidth(doc);
             const y1 = parseFloat(values[1]) * Doc.NativeHeight(doc) + foundChunk.startPage * Doc.NativeHeight(doc);
             const x2 = parseFloat(values[2]) * Doc.NativeWidth(doc);
@@ -751,9 +743,18 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
                     const annotationKey = Doc.LayoutFieldKey(doc) + '_annotations';
                     const existingDoc = DocListCast(doc[DocData][annotationKey]).find(d => d.citation_id === citation.citation_id);
 
+                    if (existingDoc) {
+                        existingDoc.x = x1;
+                        existingDoc.y = y1;
+                        existingDoc._width = x2 - x1;
+                        existingDoc._height = y2 - y1;
+                    }
                     const highlightDoc = existingDoc ?? this.createImageCitationHighlight(x1, y1, x2, y2, citation, annotationKey, doc);
-                    DocumentManager.Instance.showDocument(highlightDoc, { willZoomCentered: true }, () => {});
+                    //doc.layout_scroll = y1;
+                    doc._layout_curPage = foundChunk.startPage + 1;
+                    DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {});
+                    //DocumentManager.Instance.showDocument(highlightDoc, { willZoomCentered: true }, () => {});
 
                 }
                 break;
             case CHUNK_TYPE.TEXT:
@@ -795,6 +796,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
             backgroundColor: 'rgba(255, 255, 0, 0.5)',
         });
         highlight_doc[DocData].citation_id = citation.citation_id;
+        highlight_doc.freeform_scale = 1;
         Doc.AddDocToList(pdfDoc[DocData], annotationKey, highlight_doc);
         highlight_doc.annotationOn = pdfDoc;
         Doc.SetContainer(highlight_doc, pdfDoc);
diff --git a/src/server/chunker/pdf_chunker.py b/src/server/chunker/pdf_chunker.py
index feb437f1f..e9b9ef2b3 100644
--- a/src/server/chunker/pdf_chunker.py
+++ b/src/server/chunker/pdf_chunker.py
@@ -276,12 +276,13 @@ class PDFChunker:
         :param output_folder: Folder to store the output files (extracted tables/images).
         :param image_batch_size: The batch size for processing visual elements.
         """
-        self.client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))  # Initialize the Anthropic API client
+        self.client = OpenAI()  # ← replaces Anthropic()
         self.output_folder = output_folder
         self.image_batch_size = image_batch_size  # Batch size for image processing
         self.doc_id = doc_id  # Add doc_id
         self.element_extractor = ElementExtractor(output_folder, doc_id)
+
     async def chunk_pdf(self, file_data: bytes, file_name: str, doc_id: str, job_id: str) -> List[Dict[str, Any]]:
         """
         Processes a PDF file, extracting text and visual elements, and returning structured chunks.
@@ -518,123 +519,77 @@ class PDFChunker:
 
     def batch_summarize_images(self, images: Dict[int, str]) -> Dict[int, str]:
         """
-        Summarize images or tables by generating descriptive text.
-
-        :param images: A dictionary mapping image numbers to base64-encoded image data.
-        :return: A dictionary mapping image numbers to their generated summaries.
-        """
-        # Prompt for the AI model to summarize images and tables
-        prompt = f"""<instruction>
-            <task>
-                You are tasked with summarizing a series of {len(images)} images and tables for use in a RAG (Retrieval-Augmented Generation) system.
-                Your goal is to create concise, informative summaries that capture the essential content of each image or table.
-                These summaries will be used for embedding, so they should be descriptive and relevant. The image or table will be outlined in red on an image of the full page that it is on. Where necessary, use the context of the full page to heklp with the summary but don't summarize other content on the page.
-            </task>
-
-            <steps>
-                <step>Identify whether it's an image or a table.</step>
-                <step>Examine its content carefully.</step>
-                <step>
-                    Write a detailed summary that captures the main points or visual elements:
-                    <details>
-                        <table>After summarizing what the table is about, include the column headers, a detailed summary of the data, and any notable data trends.</table>
-                        <image>Describe the main subjects, actions, or notable features.</image>
-                    </details>
-                </step>
-                <step>Focus on writing summaries that would make it easy to retrieve the content if compared to a user query using vector similarity search.</step>
-                <step>Keep summaries concise and include important words that may help with retrieval (but do not include numbers and numerical data).</step>
-            </steps>
-
-            <important_notes>
-                <note>Avoid using special characters like &, <, >, ", ', $, %, etc. Instead, use their word equivalents:</note>
-                <note>Use "and" instead of &.</note>
-                <note>Use "dollars" instead of $.</note>
-                <note>Use "percent" instead of %.</note>
-                <note>Refrain from using quotation marks " or apostrophes ' unless absolutely necessary.</note>
-                <note>Ensure your output is in valid XML format.</note>
-            </important_notes>
-
-            <formatting>
-                <note>Enclose all summaries within a root element called <summaries>.</note>
-                <note>Use <summary> tags to enclose each individual summary.</note>
-                <note>Include an attribute 'number' in each <summary> tag to indicate the sequence, matching the provided image numbers.</note>
-                <note>Start each summary by indicating whether it's an image or a table (e.g., "This image shows..." or "The table presents...").</note>
-                <note>If an image is completely blank, leave the summary blank (e.g., <summary number="3"></summary>).</note>
-            </formatting>
-
-            <example>
-                <note>Do not replicate the example below—stay grounded to the content of the table or image and describe it completely and accurately.</note>
-                <output>
-                    <summaries>
-                        <summary number="1">
-                            The image shows two men shaking hands on stage at a formal event. The man on the left, in a dark suit and glasses, has a professional appearance, possibly an academic or business figure. The man on the right, Tim Cook, CEO of Apple, is recognizable by his silver hair and dark blue blazer. Cook holds a document titled "Tsinghua SEM EMBA," suggesting a link to Tsinghua University’s Executive MBA program. The backdrop displays English and Chinese text about business management and education, with the event dated October 23, 2014.
-                        </summary>
-                        <summary number="2">
-                            The table compares the company's assets between December 30, 2023, and September 30, 2023. Key changes include an increase in cash and cash equivalents, while marketable securities had a slight rise. Accounts receivable and vendor non-trade receivables decreased. Inventories and other current assets saw minor fluctuations. Non-current assets like marketable securities slightly declined, while property, plant, and equipment remained stable. Total assets showed minimal change, holding steady at around three hundred fifty-three billion dollars.
-                        </summary>
-                        <summary number="3">
-                            The table outlines the company's shareholders' equity as of December 30, 2023, versus September 30, 2023. Common stock and additional paid-in capital increased, and retained earnings shifted from a deficit to a positive figure. Accumulated other comprehensive loss decreased. Overall, total shareholders' equity rose significantly, while total liabilities and equity remained nearly unchanged at about three hundred fifty-three billion dollars.
-                        </summary>
-                        <summary number="4">
-                            The table details the company's liabilities as of December 30, 2023, compared to September 30, 2023. Current liabilities decreased due to lower accounts payable and other current liabilities, while deferred revenue slightly increased. Commercial paper significantly decreased, and term debt rose modestly. Non-current liabilities were stable, with minimal changes in term debt and other non-current liabilities. Total liabilities dropped from two hundred ninety billion dollars to two hundred seventy-nine billion dollars.
-                        </summary>
-                        <summary number="5">
-                        </summary>
-                    </summaries>
-                </output>
-            </example>
-
-            <final_notes>
-                <note>Process each image or table in the order provided.</note>
-                <note>Maintain consistent formatting throughout your response.</note>
-                <note>Ensure the output is in full, valid XML format with the root <summaries> element and each summary being within a <summary> element with the summary number specified as well.</note>
-            </final_notes>
-        </instruction>
-        """
-        content = []
-        for number, img in images.items():
-            content.append({"type": "text", "text": f"\nImage {number}:\n"})
-            content.append({"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": img}})
+        Summarise a batch of images/tables with GPT‑4o using Structured Outputs.
+        :param images: {image_number: base64_png}
+        :return: {image_number: summary_text}
+        """
+        # -------- 1. Build the prompt -----------
+        content: list[dict] = []
+        for n, b64 in images.items():
+            content.append({"type": "text",
+                            "text": f"\nImage {n} (outlined in red on the page):"})
+            content.append({"type": "image_url",
+                            "image_url": {"url": f"data:image/png;base64,{b64}"}})
         messages = [
-            {"role": "user", "content": content}
+            {
+                "role": "system",
+                "content": (
+                    "You are generating retrieval‑ready summaries for each highlighted "
+                    "image or table. Start by identifying whether the element is an "
+                    "image or a table, then write one informative sentence that a vector "
+                    "search would find useful. Provide detail but limit to a couple of paragraphs per image."
+                ),
+            },
+            {"role": "user", "content": content},
         ]
+        schema = {
+            "type": "object",
+            "properties": {
+                "summaries": {
+                    "type": "array",
+                    "items": {
+                        "type": "object",
+                        "properties": {
+                            "number": {"type": "integer"},
+                            "type": {"type": "string", "enum": ["image", "table"]},
+                            "summary": {"type": "string"}
+                        },
+                        "required": ["number", "type", "summary"],
+                        "additionalProperties": False
+                    }
+                }
+            },
+            "required": ["summaries"],
+            "additionalProperties": False
+        }
+
+        # ---------- OpenAI call -----------------------------------------------------
         try:
-            response = self.client.messages.create(
-                model='claude-3-7-sonnet-20250219',
-                system=prompt,
-                max_tokens=400 * len(images),  # Increased token limit for more detailed summaries
+            resp = self.client.chat.completions.create(
+                model="gpt-4o",
                 messages=messages,
+                max_tokens=400 * len(images),
                 temperature=0,
+                response_format={
+                    "type": "json_schema",
+                    "json_schema": {
+                        "name": "image_batch_summaries",   # ← REQUIRED
+                        "schema": schema,                  # ← REQUIRED
+                        "strict": True                     # ← strongly recommended
+                    },
+                },
             )
-            # Parse the response
-            text = response.content[0].text
-            #print(text)
-            # Attempt to parse and fix the XML if necessary
-            parser = etree.XMLParser(recover=True)
-            root = etree.fromstring(text, parser=parser)
-            # Check if there were errors corrected
-            # if parser.error_log:
-            #     #print("XML Parsing Errors:")
-            #     for error in parser.error_log:
-            #         #print(error)
-            # Extract summaries
-            summaries = {}
-            for summary in root.findall('summary'):
-                number = int(summary.get('number'))
-                content = summary.text.strip() if summary.text else ""
-                if content:  # Only include non-empty summaries
-                    summaries[number] = content
-
-            return summaries
+            parsed = json.loads(resp.choices[0].message.content)  # schema‑safe
+            return {item["number"]: item["summary"]
+                    for item in parsed["summaries"]}
         except Exception as e:
-            # Print errors to stderr so they don't interfere with JSON output
-            print(json.dumps({"error": str(e)}), file=sys.stderr)
-            sys.stderr.flush()
-
+            # Log and fall back gracefully
+            print(json.dumps({"error": str(e)}), file=sys.stderr, flush=True)
+            return {}
 
 class DocumentType(Enum):
     """
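
Reviewer note: the new Structured Outputs request above can be exercised in isolation, without running the whole chunker. The sketch below is not part of the commit; it assumes the openai Python package (v1 client, which reads OPENAI_API_KEY from the environment) and a hypothetical page.png test image. The schema, model name, and response_format payload mirror the diff.

import base64
import json

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

# One fake batch entry, matching batch_summarize_images' input shape
# ({image_number: base64_png}). "page.png" is a placeholder test image.
with open("page.png", "rb") as f:
    images = {1: base64.b64encode(f.read()).decode()}

# Same strict JSON schema as in the diff: one entry per highlighted element.
schema = {
    "type": "object",
    "properties": {
        "summaries": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "number": {"type": "integer"},
                    "type": {"type": "string", "enum": ["image", "table"]},
                    "summary": {"type": "string"},
                },
                "required": ["number", "type", "summary"],
                "additionalProperties": False,
            },
        }
    },
    "required": ["summaries"],
    "additionalProperties": False,
}

# Build the multimodal user message the same way the new code does.
content = []
for n, b64 in images.items():
    content.append({"type": "text", "text": f"\nImage {n} (outlined in red on the page):"})
    content.append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}})

resp = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": content}],
    temperature=0,
    response_format={
        "type": "json_schema",
        "json_schema": {"name": "image_batch_summaries", "schema": schema, "strict": True},
    },
)

# With strict=True the reply is constrained to the schema, so json.loads
# plus key access is safe here (the old XML-recovery parsing goes away).
parsed = json.loads(resp.choices[0].message.content)
print({item["number"]: item["summary"] for item in parsed["summaries"]})

One behavioral difference worth flagging in review: the removed XML path dropped blank images (the "only include non-empty summaries" filter), while the strict schema always yields a summary string for every element, so any blank-page filtering now has to happen in the caller if it is still wanted.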