new attempt with new citation unification

author: A.J. Shulman <Shulman.aj@gmail.com> 2025-04-27 13:14:49 -0400
committer: A.J. Shulman <Shulman.aj@gmail.com> 2025-04-27 13:14:49 -0400
commit: 3ef3d40506348d9fd537cc8f4aea975b9770689f (patch)
tree: afe779e8240e88c8b20ff6b68ac45840a927ee76 /src
parent: 5ce2263849bfb901e276a4c5fc8ca2dbd8b80350 (diff)
7 files changed, 510 insertions, 278 deletions
diff --git a/src/client/views/nodes/chatbot/agentsystem/Agent.ts b/src/client/views/nodes/chatbot/agentsystem/Agent.ts
index c021d141e..80fdb6533 100644
--- a/src/client/views/nodes/chatbot/agentsystem/Agent.ts
+++ b/src/client/views/nodes/chatbot/agentsystem/Agent.ts
@@ -65,12 +65,9 @@ export class Agent {
         summaries: () => string,
         history: () => string,
         csvData: () => { filename: string; id: string; text: string }[],
-        addLinkedUrlDoc: (url: string, id: string) => void,
         getLinkedUrlDocId: (url: string) => string[],
         createImage: (result: Upload.FileInformation & Upload.InspectionResults, options: DocumentOptions) => void,
-        // eslint-disable-next-line @typescript-eslint/no-unused-vars
         createCSVInDash: (url: string, title: string, id: string, data: string) => void,
-        chatBox: ChatBox,
         docManager: AgentDocumentManager
     ) {
         // Initialize OpenAI client with API key from environment
@@ -87,7 +84,7 @@ export class Agent {
             rag: new RAGTool(this.vectorstore),
             dataAnalysis: new DataAnalysisTool(csvData),
             websiteInfoScraper: new WebsiteInfoScraperTool(getLinkedUrlDocId),
-            searchTool: new SearchTool(addLinkedUrlDoc),
+            searchTool: new SearchTool(this._docManager),
             noTool: new NoTool(),
             //imageCreationTool: new ImageCreationTool(createImage),
             documentMetadata: new DocumentMetadataTool(this._docManager),
diff --git a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx
index 43765c1ce..35dbee3e9 100644
--- a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx
+++ b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx
@@ -71,7 +71,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
     @observable private _citationPopup: { text: string; visible: boolean } = { text: '', visible: false };
 
     // Private properties for managing OpenAI API, vector store, agent, and UI elements
-    private openai: OpenAI;
+    private openai!: OpenAI; // Using definite assignment assertion
     private vectorstore_id: string;
     private vectorstore: Vectorstore;
     private agent: Agent;
@@ -98,25 +98,34 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
      */
     constructor(props: FieldViewProps) {
         super(props);
-        makeObservable(this); // Enable MobX observables
+        makeObservable(this);
 
-        // Initialize OpenAI, vectorstore, and agent
-        this.openai = this.initializeOpenAI();
-        if (StrCast(this.dataDoc.vectorstore_id) == '') {
-            this.vectorstore_id = uuidv4();
-            this.dataDoc.vectorstore_id = this.vectorstore_id;
-        } else {
-            this.vectorstore_id = StrCast(this.dataDoc.vectorstore_id);
-        }
-        this.vectorstore = new Vectorstore(this.vectorstore_id, this.retrieveDocIds);
+        this.messagesRef = React.createRef();
         this.docManager = new AgentDocumentManager(this);
-        this.agent = new Agent(this.vectorstore, this.retrieveSummaries, this.retrieveFormattedHistory, this.retrieveCSVData, this.addLinkedUrlDoc, this.getLinkedUrlDocIds, this.createImageInDash, this.createCSVInDash, this, this.docManager);
 
-        // Reinitialize the DocumentMetadataTool with a direct reference to this ChatBox instance
-        // This ensures the tool can properly access documents in the same Freeform view
-        this.agent.reinitializeDocumentMetadataTool();
+        // Initialize OpenAI client
+        this.initializeOpenAI();
+
+        // Create a unique vectorstore ID for this ChatBox
+        this.vectorstore_id = uuidv4();
+
+        // Initialize vectorstore with the document manager
+        this.vectorstore = new Vectorstore(this.vectorstore_id, this.docManager);
+
+        // Create an agent with the vectorstore
+        this.agent = new Agent(
+            this.vectorstore,
+            this.retrieveSummaries.bind(this),
+            this.retrieveFormattedHistory.bind(this),
+            this.retrieveCSVData.bind(this),
+            this.retrieveDocIds.bind(this),
+            this.createImageInDash.bind(this),
+            this.createCSVInDash.bind(this),
+            this.docManager
+        );
 
-        this.messagesRef = React.createRef<HTMLDivElement>();
+        // Add event listeners
+        this.addScrollListener();
 
         // Reaction to update dataDoc when chat history changes
         reaction(
@@ -140,22 +149,25 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
      */
     @action
     addDocToVectorstore = async (newLinkedDoc: Doc) => {
-        this._uploadProgress = 0;
-        this._currentStep = 'Initializing...';
-        this._isUploadingDocs = true;
-
         try {
-            // Add the document to the vectorstore
+            this._isUploadingDocs = true;
+
+            // Process the document first to ensure it has a valid ID
+            this.docManager.processDocument(newLinkedDoc);
+
+            // Add the document to the vectorstore which will also register chunks
             await this.vectorstore.addAIDoc(newLinkedDoc, this.updateProgress);
-        } catch (error) {
-            console.error('Error uploading document:', error);
-            this._currentStep = 'Error during upload';
-        } finally {
-            runInAction(() => {
-                this._isUploadingDocs = false;
-                this._uploadProgress = 0;
-                this._currentStep = '';
-            });
+
+            // No longer needed as documents are tracked by the AgentDocumentManager
+            // this._linked_docs_to_add.add(newLinkedDoc);
+
+            this._isUploadingDocs = false;
+
+            return true;
+        } catch (err) {
+            console.error('Error adding document to vectorstore:', err);
+            this._isUploadingDocs = false;
+            return false;
         }
     };
 
@@ -238,7 +250,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
             apiKey: process.env.OPENAI_KEY,
             dangerouslyAllowBrowser: true,
         };
-        return new OpenAI(configuration);
+        this.openai = new OpenAI(configuration);
     }
 
     /**
@@ -376,49 +388,6 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
     };
 
     /**
-     * Adds a linked document from a URL for future reference and analysis.
-     * @param url The URL of the document to add.
-     * @param id The unique identifier for the document.
-     */
-    @action
-    addLinkedUrlDoc = async (url: string, id: string) => {
-        const doc = Docs.Create.WebDocument(url, { data_useCors: true });
-        this.docManager.addCustomId(doc, id);
-        const linkDoc = Docs.Create.LinkDocument(this.Document, doc);
-        LinkManager.Instance.addLink(linkDoc);
-
-        const chunkToAdd = {
-            chunkId: id,
-            chunkType: CHUNK_TYPE.URL,
-            url: url,
-        };
-
-        doc.chunk_simpl = JSON.stringify({ chunks: [chunkToAdd] });
-        this.docManager.processDocument(doc);
-    };
-
-    /**
-     * Retrieves the IDs of linked url documents.
-     * @returns An array of document IDs.
-     */
-    @action
-    getLinkedUrlDocIds = () => {
-        const linkedDocs: Doc[] = this.linkedDocs;
-        const linkedUrlDocIds: string[] = [];
-
-        for (const doc of linkedDocs) {
-            if (doc.chunk_simpl) {
-                const docChunkSimpl = JSON.parse(StrCast(doc.chunk_simpl)) as { chunks: SimplifiedChunk[] };
-                const foundChunk = docChunkSimpl.chunks.find(chunk => chunk.chunkType === CHUNK_TYPE.URL);
-                if (foundChunk) {
-                    linkedUrlDocIds.push(foundChunk.chunkId);
-                }
-            }
-        }
-        return linkedUrlDocIds;
-    };
-
-    /**
      * Getter to retrieve the current user's name from the client utils.
      */
     @computed
@@ -613,82 +582,224 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
      */
     @action
     handleCitationClick = async (citation: Citation) => {
-        const currentLinkedDocs: Doc[] = this.linkedDocs;
-        const chunkId = citation.chunk_id;
+        try {
+            // Extract values from MobX proxy object if needed
+            const chunkId = typeof citation.chunk_id === 'object' ? (citation.chunk_id as any).toString() : citation.chunk_id;
+
+            // For debugging
+            console.log('Citation clicked:', {
+                chunkId,
+                citation: JSON.stringify(citation, null, 2),
+            });
 
-        for (const doc of currentLinkedDocs) {
-            if (doc.chunk_simpl) {
-                const docChunkSimpl = JSON.parse(StrCast(doc.chunk_simpl)) as { chunks: SimplifiedChunk[] };
-                const foundChunk = docChunkSimpl.chunks.find(chunk => chunk.chunkId === chunkId);
+            // Try to find the document
+            const linkedDocs = this.linkedDocs;
+            let doc: Doc | undefined;
 
-                if (foundChunk) {
-                    // Handle media chunks specifically
+            // First try to find the document using the document manager's chunk ID lookup
+            const parentDocId = this.docManager.getDocIdByChunkId(chunkId);
+            if (parentDocId) {
+                doc = this.docManager.getDocument(parentDocId);
+                console.log(`Found document by chunk ID lookup: ${parentDocId}`);
+            }
 
-                    if (doc.ai_type == 'video' || doc.ai_type == 'audio') {
-                        const directMatchSegmentStart = this.getDirectMatchingSegmentStart(doc, citation.direct_text || '', foundChunk.indexes || []);
+            // If not found, fall back to searching through linked docs (maintains compatibility)
+            if (!doc) {
+                for (const linkedDoc of linkedDocs) {
+                    if (linkedDoc.chunk_simpl) {
+                        try {
+                            const docChunkSimpl = JSON.parse(StrCast(linkedDoc.chunk_simpl)) as { chunks: SimplifiedChunk[] };
+                            const foundChunk = docChunkSimpl.chunks.find(chunk => chunk.chunkId === chunkId);
+                            if (foundChunk) {
+                                doc = linkedDoc;
+                                console.log(`Found document by iterating through linked docs`);
+                                break;
+                            }
+                        } catch (e) {
+                            console.error(`Error parsing chunk_simpl for doc ${linkedDoc.id}:`, e);
+                        }
+                    }
+                }
+            }
 
-                        if (directMatchSegmentStart) {
-                            // Navigate to the segment's start time in the media player
-                            await this.goToMediaTimestamp(doc, directMatchSegmentStart, doc.ai_type);
-                        } else {
-                            console.error('No direct matching segment found for the citation.');
+            if (!doc) {
+                console.warn(`Document not found for citation with chunk_id: ${chunkId}`);
+                return;
+            }
+
+            // Process the chunk data
+            let docChunkSimpl: { chunks: SimplifiedChunk[] } = { chunks: [] };
+            try {
+                docChunkSimpl = JSON.parse(StrCast(doc.chunk_simpl) || '{"chunks":[]}');
+            } catch (e) {
+                console.error(`Error parsing chunk_simpl for the found document:`, e);
+                return;
+            }
+
+            const foundChunk = docChunkSimpl.chunks.find((chunk: SimplifiedChunk) => chunk.chunkId === chunkId);
+
+            // Handle different chunk types
+            if (foundChunk) {
+                console.log(`Found chunk in document:`, foundChunk);
+
+                // Handle video chunks
+                if (foundChunk.chunkType === CHUNK_TYPE.VIDEO) {
+                    if (foundChunk.start_time !== undefined) {
+                        await this.goToMediaTimestamp(doc, foundChunk.start_time, 'video');
+                    } else {
+                        console.warn('Video chunk missing start_time:', foundChunk);
+                    }
+                }
+                // Handle audio chunks - note that we're using string comparison since 'audio' isn't in CHUNK_TYPE enum
+                else if (String(foundChunk.chunkType).toLowerCase() === 'audio') {
+                    if (foundChunk.start_time !== undefined) {
+                        await this.goToMediaTimestamp(doc, foundChunk.start_time, 'audio');
+                    } else {
+                        console.warn('Audio chunk missing start_time:', foundChunk);
+                    }
+                }
+                // Handle table or image chunks
+                else if (foundChunk.chunkType === CHUNK_TYPE.TABLE || foundChunk.chunkType === CHUNK_TYPE.IMAGE) {
+                    this.handleOtherChunkTypes(foundChunk, citation, doc);
+                }
+                // Handle text chunks
+                else if (foundChunk.chunkType === CHUNK_TYPE.TEXT) {
+                    // Find text from the document's chunks metadata
+                    let chunkText = '';
+
+                    try {
+                        // We already parsed the chunks earlier, so use that
+                        const matchingChunk = docChunkSimpl.chunks.find(c => c.chunkId === foundChunk.chunkId);
+                        if (matchingChunk && 'text' in matchingChunk) {
+                            // Use the chunk's optional `text` property when it is present
+                            chunkText = String(matchingChunk['text'] || '');
                         }
+                    } catch (e) {
+                        console.error('Error getting chunk text:', e);
+                    }
+
+                    // Default text if none found
+                    if (!chunkText) {
+                        chunkText = 'Text content not available';
+                    }
+
+                    this._citationPopup = {
+                        text: chunkText,
+                        visible: true,
+                    };
+                }
+                // Handle URL chunks
+                else if (foundChunk.chunkType === CHUNK_TYPE.URL) {
+                    if (foundChunk.url) {
+                        // Instead of opening the URL in a new window, show the document in the viewer
+                        DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {});
+                        console.log(`Navigated to web document with URL: ${foundChunk.url}`);
                     } else {
-                        // Handle other chunk types as before
-                        this.handleOtherChunkTypes(foundChunk, citation, doc);
+                        console.warn('URL chunk missing URL:', foundChunk);
                     }
                 }
+            } else if (doc?.original_segments) {
+                // Handle original segments for media files
+                let original_segments: any[] = [];
+                try {
+                    original_segments = JSON.parse(StrCast(doc.original_segments));
+                } catch (e) {
+                    console.error(`Error parsing original_segments:`, e);
+                    return;
+                }
+
+                // Check if there's direct text to find in the segments
+                if (citation.direct_text) {
+                    // Find the segment that contains the direct text
+                    const start = this.getDirectMatchingSegmentStart(doc, citation.direct_text, []);
+                    if (start !== -1) {
+                        await this.goToMediaTimestamp(doc, start, doc.ai_type === 'audio' ? 'audio' : 'video');
+                    }
+                }
+            } else {
+                console.warn('Unable to find chunk or segments for citation', citation);
             }
+        } catch (error) {
+            console.error('Error handling citation click:', error);
         }
     };
 
+    /**
+     * Finds a matching segment in a document based on text content.
+     * @param doc The document to search in
+     * @param citationText The text to find in the document
+     * @param indexesOfSegments Optional indexes of segments to search in
+     * @returns The start time of the best-matching segment; when no segment matches well, the first segment's start; -1 only when the citation text or the segments are missing
+     */
     getDirectMatchingSegmentStart = (doc: Doc, citationText: string, indexesOfSegments: string[]): number => {
-        const originalSegments = JSON.parse(StrCast(doc.original_segments!)).map((segment: any, index: number) => ({
-            index: index.toString(),
-            text: segment.text,
-            start: segment.start,
-            end: segment.end,
-        }));
-
-        if (!Array.isArray(originalSegments) || originalSegments.length === 0 || !Array.isArray(indexesOfSegments)) {
-            return 0;
+        if (!doc || !citationText) return -1;
+
+        // Get original segments from the document
+        const original_segments = doc.original_segments ? JSON.parse(StrCast(doc.original_segments)) : [];
+
+        if (!original_segments || !Array.isArray(original_segments) || original_segments.length === 0) {
+            return -1;
         }
 
-        // Create itemsToSearch array based on indexesOfSegments
-        const itemsToSearch = indexesOfSegments.map((indexStr: string) => {
-            const index = parseInt(indexStr, 10);
-            const segment = originalSegments[index];
-            return { text: segment.text, start: segment.start };
-        });
+        let segments = original_segments;
 
-        console.log('Constructed itemsToSearch:', itemsToSearch);
+        // If specific indexes are provided, filter segments by those indexes
+        if (indexesOfSegments && indexesOfSegments.length > 0) {
+            segments = original_segments.filter((segment: any) => indexesOfSegments.includes(segment.index));
+        }
+
+        // If no segments match the indexes, use all segments
+        if (segments.length === 0) {
+            segments = original_segments;
+        }
 
-        // Helper function to calculate word overlap score
+        // First try to find an exact match
+        const exactMatch = segments.find((segment: any) => segment.text && segment.text.includes(citationText));
+
+        if (exactMatch) {
+            return exactMatch.start;
+        }
+
+        // If no exact match, find segment with best word overlap
         const calculateWordOverlap = (text1: string, text2: string): number => {
-            const words1 = new Set(text1.toLowerCase().split(/\W+/));
-            const words2 = new Set(text2.toLowerCase().split(/\W+/));
-            const intersection = new Set([...words1].filter(word => words2.has(word)));
-            return intersection.size / Math.max(words1.size, words2.size); // Jaccard similarity
+            if (!text1 || !text2) return 0;
+
+            const words1 = text1.toLowerCase().split(/\s+/);
+            const words2 = text2.toLowerCase().split(/\s+/);
+            const wordSet1 = new Set(words1);
+
+            let overlap = 0;
+            for (const word of words2) {
+                if (wordSet1.has(word)) {
+                    overlap++;
+                }
+            }
+
+            // Return the overlap as a fraction of the shorter text's word count (repeated words can push it above 1)
+            return overlap / Math.min(words1.length, words2.length);
         };
 
-        // Search for the best matching segment
-        let bestMatchStart = 0;
-        let bestScore = 0;
-
-        console.log(`Searching for best match for query: "${citationText}"`);
-        itemsToSearch.forEach(item => {
-            const score = calculateWordOverlap(citationText, item.text);
-            console.log(`Comparing query to segment: "${item.text}" | Score: ${score}`);
-            if (score > bestScore) {
-                bestScore = score;
-                bestMatchStart = item.start;
+        // Find segment with highest word overlap
+        let bestMatch = null;
+        let highestOverlap = 0;
+
+        for (const segment of segments) {
+            if (!segment.text) continue;
+
+            const overlap = calculateWordOverlap(segment.text, citationText);
+            if (overlap > highestOverlap) {
+                highestOverlap = overlap;
+                bestMatch = segment;
             }
-        });
+        }
 
-        console.log('Best match found with score:', bestScore, '| Start time:', bestMatchStart);
+        // Only return matches with significant overlap (more than 30%)
+        if (bestMatch && highestOverlap > 0.3) {
+            return bestMatch.start;
+        }
 
-        // Return the start time of the best match
-        return bestMatchStart;
+        // If no good match found, return the start of the first segment as fallback
+        return segments.length > 0 ? segments[0].start : -1;
     };
 
     /**
@@ -772,7 +883,9 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
                 break;
             case CHUNK_TYPE.CSV:
             case CHUNK_TYPE.URL:
-                DocumentManager.Instance.showDocument(doc, { willZoomCentered: true });
+                DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {
+                    console.log(`Showing web document in viewer with URL: ${foundChunk.url}`);
+                });
                 break;
             default:
                 console.error('Unhandled chunk type:', foundChunk.chunkType);
@@ -879,6 +992,16 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
             }
         });
         this.addScrollListener();
+
+        // Initialize the document manager by finding existing documents
+        this.docManager.initializeFindDocsFreeform();
+
+        // If any documents are queued in _linked_docs_to_add, register them with the document manager
+        if (this._linked_docs_to_add.size > 0) {
+            this._linked_docs_to_add.forEach(doc => {
+                this.docManager.processDocument(doc);
+            });
+        }
     }
 
     /**
@@ -892,28 +1015,28 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
     /**
      * Getter that retrieves all linked documents for the current document.
      */
-    @computed
-    get linkedDocs() {
-        return LinkManager.Instance.getAllRelatedLinks(this.Document)
-            .map(d => DocCast(LinkManager.getOppositeAnchor(d, this.Document)))
-            .map(d => DocCast(d?.annotationOn, d))
-            .filter(d => d);
+    @computed get linkedDocs(): Doc[] {
+        const docIds = this.docManager.listDocs();
+        const docs: Doc[] = [];
+
+        // Get documents from the document manager using the getDocument method
+        docIds.forEach(id => {
+            const doc = this.docManager.getDocument(id);
+            if (doc) {
+                docs.push(doc);
+            }
+        });
+
+        return docs;
     }
 
     /**
-     * Getter that retrieves document IDs of linked documents that have AI-related content.
+     * Getter that retrieves the IDs of all documents listed by the document manager.
      */
     @computed
-    get docIds() {
-        return LinkManager.Instance.getAllRelatedLinks(this.Document)
-            .map(d => DocCast(LinkManager.getOppositeAnchor(d, this.Document)))
-            .map(d => DocCast(d?.annotationOn, d))
-            .filter(d => d)
-            .filter(d => {
-                console.log(d.ai_doc_id);
-                return d.ai_doc_id;
-            })
-            .map(d => StrCast(d.ai_doc_id));
+    get docIds(): string[] {
+        // Use the document manager to get all document IDs
+        return Array.from(this.docManager.listDocs());
     }
 
     /**
@@ -921,23 +1044,18 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
      */
     @computed
     get summaries(): string {
-        return (
-            LinkManager.Instance.getAllRelatedLinks(this.Document)
-                .map(d => DocCast(LinkManager.getOppositeAnchor(d, this.Document)))
-                .map(d => DocCast(d?.annotationOn, d))
-                .filter(d => d)
-                .filter(d => d.summary)
-                .map((doc, index) => {
-                    if (PDFCast(doc.data)) {
-                        return `<summary file_name="${PDFCast(doc.data).url.pathname}" applicable_tools=["rag"]>${doc.summary}</summary>`;
-                    } else if (CsvCast(doc.data)) {
-                        return `<summary file_name="${CsvCast(doc.data).url.pathname}" applicable_tools=["dataAnalysis"]>${doc.summary}</summary>`;
-                    } else {
-                        return `${index + 1}) ${doc.summary}`;
-                    }
-                })
-                .join('\n') + '\n'
-        );
+        const linkedDocs = Array.from(this.docManager.listDocs())
+            .map(id => {
+                const doc = this.docManager.extractDocumentMetadata(id);
+                if (doc && doc.fields && (doc.fields.layout.summary || doc.fields.data.summary)) {
+                    return doc.fields.layout.summary || doc.fields.data.summary;
+                }
+                return null;
+            })
+            .filter(Boolean)
+            .join('\n\n');
+
+        return linkedDocs;
     }
 
     /**
@@ -965,7 +1083,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
 
     // Other helper methods for retrieving document data and processing
 
-    retrieveSummaries = () => {
+    retrieveSummaries = (): string => {
         return this.summaries;
     };
 
@@ -973,12 +1091,12 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
         return this.linkedCSVs;
     };
 
-    retrieveFormattedHistory = () => {
+    retrieveFormattedHistory = (): string => {
         return this.formattedHistory;
     };
 
-    retrieveDocIds = () => {
-        return this.docIds;
+    retrieveDocIds = (): string[] => {
+        return Array.from(this.docManager.listDocs());
     };
 
     /**
diff --git a/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts b/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts
index 4b751acc0..e6c2421e5 100644
--- a/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts
+++ b/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts
@@ -417,9 +417,9 @@ export class DocumentMetadataTool extends BaseTool<DocumentMetadataToolParamsTyp
                     const title = String(args.title);
                     const data = String(args.data);
 
-                    const createdDoc = this._docManager.createDocInDash(docType, title, data);
+                    const id = this._docManager.createDocInDash(docType, data, { title: title });
 
-                    if (!createdDoc) {
+                    if (!id) {
                         return [
                             {
                                 type: 'text',
@@ -427,18 +427,14 @@ export class DocumentMetadataTool extends BaseTool<DocumentMetadataToolParamsTyp
                             },
                         ];
                     }
-
-                    // Update our local document maps with the new document
-                    this._docManager.processDocument(createdDoc);
-
                     // Get the created document's metadata
-                    const createdMetadata = this._docManager.extractDocumentMetadata(this._docManager.createAgentDoc(createdDoc));
+                    const createdMetadata = this._docManager.extractDocumentMetadata(id);
 
                     return [
                         {
                             type: 'text',
                             text: `Document created successfully.
-Document ID: ${createdDoc.id}
+Document ID: ${id}
 Type: ${docType}
 Title: "${title}"
 
@@ -447,9 +443,9 @@ You can now use the "edit" action to modify additional properties of this docume
 
 Next steps:
 1. Use the "getFieldOptions" action to understand available editable/addable fields/properties and their dependencies.
-2. To modify this document, use: { action: "edit", documentId: "${createdDoc.id}", fieldEdits: [{"fieldName":"property","fieldValue":"value"}] }
+2. To modify this document, use: { action: "edit", documentId: "${id}", fieldEdits: [{"fieldName":"property","fieldValue":"value"}] }
 3. To add styling, consider setting backgroundColor, fontColor, or other properties
-4. For text documents, you can edit the content with: { action: "edit", documentId: "${createdDoc.id}", fieldEdits: [{"fieldName":"text","fieldValue":"New content"}] }
+4. For text documents, you can edit the content with: { action: "edit", documentId: "${id}", fieldEdits: [{"fieldName":"text","fieldValue":"New content"}] }
 
 Full metadata for the created document:
 ${JSON.stringify(createdMetadata, null, 2)}`,
diff --git a/src/client/views/nodes/chatbot/tools/SearchTool.ts b/src/client/views/nodes/chatbot/tools/SearchTool.ts
index 2ee30f0cf..53f5fc109 100644
--- a/src/client/views/nodes/chatbot/tools/SearchTool.ts
+++ b/src/client/views/nodes/chatbot/tools/SearchTool.ts
@@ -3,6 +3,9 @@ import { Networking } from '../../../../Network';
 import { BaseTool } from './BaseTool';
 import { Observation } from '../types/types';
 import { ParametersType, ToolInfo } from '../types/tool_types';
+import { Agent } from 'http';
+import { AgentDocumentManager } from '../utils/AgentDocumentManager';
+import { StrCast } from '../../../../../fields/Types';
 
 const searchToolParams = [
     {
@@ -25,12 +28,12 @@ const searchToolInfo: ToolInfo<SearchToolParamsType> = {
 };
 
 export class SearchTool extends BaseTool<SearchToolParamsType> {
-    private _addLinkedUrlDoc: (url: string, id: string) => void;
+    private _docManager: AgentDocumentManager;
     private _max_results: number;
 
-    constructor(addLinkedUrlDoc: (url: string, id: string) => void, max_results: number = 3) {
+    constructor(docManager: AgentDocumentManager, max_results: number = 3) {
         super(searchToolInfo);
-        this._addLinkedUrlDoc = addLinkedUrlDoc;
+        this._docManager = docManager;
         this._max_results = max_results;
     }
 
@@ -46,8 +49,13 @@ export class SearchTool extends BaseTool<SearchToolParamsType> {
                     max_results: this._max_results,
                 })) as { results: { url: string; snippet: string }[] };
                 const data = results.map((result: { url: string; snippet: string }) => {
-                    const id = uuidv4();
-                    this._addLinkedUrlDoc(result.url, id);
+                    // Create a web document with the URL
+                    const id = this._docManager.createDocInDash('web', result.url, {
+                        title: `Search Result: ${result.url}`,
+                        text_html: result.snippet,
+                        data_useCors: true,
+                    });
+
                     return {
                         type: 'text' as const,
                         text: `<chunk chunk_id="${id}" chunk_type="url"><url>${result.url}</url><overview>${result.snippet}</overview></chunk>`,
diff --git a/src/client/views/nodes/chatbot/types/types.ts b/src/client/views/nodes/chatbot/types/types.ts
index 882e74ebb..dcb132ec7 100644
--- a/src/client/views/nodes/chatbot/types/types.ts
+++ b/src/client/views/nodes/chatbot/types/types.ts
@@ -108,6 +108,7 @@ export interface SimplifiedChunk {
     start_time?: number;
     end_time?: number;
     indexes?: string[];
+    text?: string;
 }
 
 export interface AI_Document {
diff --git a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts
index 4eeac3c6a..c3beebcde 100644
--- a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts
+++ b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts
@@ -165,22 +165,18 @@ export class AgentDocumentManager {
         }
     }
 
-    public addCustomId(doc: Doc, id: string) {
-        doc.id = id;
-        doc.DOCUMENT_ID_FIELD = id;
-    }
-
     /**
      * Process a document by ensuring it has an ID and adding it to the appropriate collections
      * @param doc The document to process
      */
-    public processDocument(doc: Doc) {
+    public processDocument(doc: Doc): string {
         // Ensure document has a persistent ID
         const docId = this.ensureDocumentId(doc);
         // Only add if we haven't already processed this document
         if (!this.documentsById.has(docId)) {
             this.documentsById.set(docId, { layoutDoc: doc, dataDoc: doc[DocData] });
         }
+        return docId;
     }
 
     /**
@@ -232,7 +228,9 @@ export class AgentDocumentManager {
      * @param docId The ID of the document to extract metadata from
      * @returns An object containing the document's metadata
      */
-    public extractDocumentMetadata(doc?: AgentDocument) {
+    public extractDocumentMetadata(id: string) {
+        if (!id) return null;
+        const doc = this.documentsById.get(id);
         if (!doc) return null;
         const layoutDoc = doc.layoutDoc;
         const dataDoc = doc.dataDoc;
@@ -729,16 +727,14 @@ export class AgentDocumentManager {
      */
     public getDocumentMetadata(documentId?: string): any {
         if (documentId) {
-            const doc = this.documentsById.get(documentId);
-            // Get metadata for a specific document
-            return this.extractDocumentMetadata(doc);
+            console.log(`Returning document metadata for docID, ${documentId}:`, this.extractDocumentMetadata(documentId));
+            return this.extractDocumentMetadata(documentId);
         } else {
             // Get metadata for all documents
             const documentsMetadata: Record<string, any> = {};
-            for (const doc of this.documentsById.values()) {
-                documentsMetadata.add(this.extractDocumentMetadata(doc) ?? { documentId: doc.layoutDoc.id, title: doc.layoutDoc.title, type: doc.layoutDoc.type });
+            for (const documentId of this.documentsById.keys()) {
+                documentsMetadata[documentId] = this.extractDocumentMetadata(documentId); // key by ID: a plain object has no .add()
             }
-
             return {
                 documentCount: this.documentsById.size,
                 documents: documentsMetadata,
@@ -845,14 +841,15 @@ export class AgentDocumentManager {
         return Object.values(supportedDocTypes).includes(docType as supportedDocTypes);
     }
     /**
-     * Creates a document in the dashboard.
+     * Creates a document in the dashboard and returns its ID.
+     * This is a public API used by tools like SearchTool.
      *
-     * @param {string} doc_type - The type of document to create.
-     * @param {string} data - The data used to generate the document.
-     * @param {DocumentOptions} options - Configuration options for the document.
-     * @returns {Promise<void>} A promise that resolves once the document is created and displayed.
+     * @param docType The type of document to create
+     * @param data The data for the document
+     * @param options Optional configuration options
+     * @returns The ID of the created document
      */
-    createDocInDash = (docType: string, title: string, data: string) => {
+    public createDocInDash(docType: string, data: string, options?: any): string {
         // Validate doc_type
         if (!this.isValidDocType(docType)) {
             throw new Error(`Invalid document type: ${docType}`);
@@ -862,10 +859,10 @@ export class AgentDocumentManager {
             // Create simple document with just title and data
             const simpleDoc: parsedDoc = {
                 doc_type: docType,
-                title: title,
+                title: options?.title ?? `Untitled Document ${this.documentsById.size + 1}`,
                 data: data,
-                x: 0,
-                y: 0,
+                x: options?.x ?? 0,
+                y: options?.y ?? 0,
                 _width: 300,
                 _height: 300,
                 _layout_fitWidth: false,
@@ -884,46 +881,111 @@ export class AgentDocumentManager {
                 }
             };
             const doc = this.chatBox.whichDoc(simpleDoc, false);
-            if (doc) linkAndShowDoc(doc);
-            return doc;
+            if (doc) {
+                linkAndShowDoc(doc);
+                const id = this.processDocument(doc);
+                return id;
+            } else {
+                throw new Error(`Error creating document. Created document not found.`);
+            }
         } catch (error) {
             throw new Error(`Error creating document: ${error}`);
         }
-    };
+    }
 
     public has(docId: string) {
         return this.documentsById.has(docId);
     }
 
-    public listDocs() {
-        // List all available documents in simple format
-        const docs = Array.from(this.documentsById.entries()).map(([id, doc]) => ({
-            id,
-            title: doc.layoutDoc.title || 'Untitled Document',
-            type: doc.layoutDoc.type || doc.dataDoc.type || 'Unknown Type',
-        }));
-
-        if (docs.length === 0) {
-            return [
-                {
-                    type: 'text',
-                    text: 'No documents found in the current view.',
-                },
-            ];
-        }
-
-        return [
-            {
-                type: 'text',
-                text: `Found ${docs.length} document(s) in the current view:\n${JSON.stringify(docs, null, 2)}`,
-            },
-        ];
+    /**
+     * Returns a list of all document IDs in the manager.
+     * @returns An array of document IDs (strings).
+     */
+    public listDocs(): string[] {
+        return Array.from(this.documentsById.keys());
+    }
+
+    /**
+     * Adds a document with a custom ID to the manager
+     * @param doc The document to add
+     * @param customId The custom ID to assign to the document
+     * @returns The customId that was assigned, or '' when doc is null/undefined
+     */
+    public addCustomId(doc: Doc, customId: string): string {
+        if (!doc) {
+            console.error('Cannot add null document with custom ID');
+            return '';
+        }
+
+        // Set the custom ID in the document's metadata
+        doc[this.DOCUMENT_ID_FIELD] = customId;
+
+        // Store the layout/data pair the same way processDocument does (data doc via DocData)
+        this.documentsById.set(customId, {
+            layoutDoc: doc,
+            dataDoc: doc[DocData],
+        });
+
+        return customId;
+    }
 
-    public createAgentDoc(doc: Doc) {
-        // Ideally check if Doc is already in there.
-        const agentDoc = { layoutDoc: doc, dataDoc: doc[DocData] };
-        this.documentsById.set(this.ensureDocumentId(doc), agentDoc);
-        return agentDoc;
+    /**
+     * Gets a document by its ID
+     * @param docId The ID of the document to retrieve
+     * @returns The document if found, undefined otherwise
+     */
+    public getDocument(docId: string): Doc | undefined {
+        const docInfo = this.documentsById.get(docId);
+        return docInfo?.layoutDoc;
+    }
+
+    /**
+     * Registers chunk IDs for a document and indexes each chunk ID in the manager
+     * @param docId The parent document ID (must already be known to the manager)
+     * @param chunkIds Array of chunk IDs associated with this document
+     */
+    public registerChunkIds(docId: string, chunkIds: string[]): void {
+        const docInfo = this.documentsById.get(docId);
+        if (!docInfo) {
+            console.warn(`Cannot register chunks for unknown document ID: ${docId}`);
+            return;
+        }
+
+        // chunk_ids is persisted as a JSON string; tolerate malformed or non-array content rather than throwing
+        const doc = docInfo.layoutDoc;
+        let existingIds: string[] = [];
+        if (doc.chunk_ids) {
+            try {
+                const parsed: unknown = JSON.parse(doc.chunk_ids as string);
+                if (Array.isArray(parsed)) existingIds = parsed.filter((id): id is string => typeof id === 'string');
+            } catch {
+                console.warn(`Ignoring malformed chunk_ids on document ID: ${docId}`);
+            }
+        }
+        doc.chunk_ids = JSON.stringify([...new Set([...existingIds, ...chunkIds])]); // Set drops duplicates
+
+        // Index each chunk ID under the parent's entry so a document can be found by any of its chunk IDs
+        chunkIds.forEach(chunkId => {
+            // Never overwrite an entry that already exists under this key
+            if (!this.documentsById.has(chunkId)) {
+                this.documentsById.set(chunkId, {
+                    layoutDoc: doc,
+                    dataDoc: docInfo.dataDoc,
+                });
+            }
+        });
+    }
+
+    /**
+     * Gets a document ID by a chunk ID
+     * @param chunkId The chunk ID to look up
+     * @returns The parent document ID if found
+     */
+    public getDocIdByChunkId(chunkId: string): string | undefined {
+        const docInfo = this.documentsById.get(chunkId);
+        if (docInfo) {
+            return docInfo.layoutDoc[this.DOCUMENT_ID_FIELD] as string;
+        }
+        return undefined;
     }
 }
diff --git a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
index afd34f28d..4bb61d8b2 100644
--- a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
+++ b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
@@ -15,7 +15,7 @@ import { Networking } from '../../../../Network';
 import { AI_Document, CHUNK_TYPE, RAGChunk } from '../types/types';
 import OpenAI from 'openai';
 import { Embedding } from 'openai/resources';
-import { PineconeEnvironmentVarsNotSupportedError } from '@pinecone-database/pinecone/dist/errors';
+import { AgentDocumentManager } from '../utils/AgentDocumentManager';
 
 dotenv.config();
 
@@ -29,7 +29,7 @@ export class Vectorstore {
     private openai: OpenAI; // OpenAI client for generating embeddings.
     private indexName: string = 'pdf-chatbot'; // Default name for the index.
     private _id: string; // Unique ID for the Vectorstore instance.
-    private _doc_ids: () => string[]; // List of document IDs handled by this instance.
+    private docManager: AgentDocumentManager; // Document manager for handling documents
 
     documents: AI_Document[] = []; // Store the documents indexed in the vectorstore.
 
@@ -37,9 +37,9 @@ export class Vectorstore {
      * Initializes the Pinecone and OpenAI clients, sets up the document ID list,
      * and initializes the Pinecone index.
      * @param id The unique identifier for the vectorstore instance.
-     * @param doc_ids A function that returns a list of document IDs.
+     * @param docManager An instance of AgentDocumentManager to handle document management.
      */
-    constructor(id: string, doc_ids: () => string[]) {
+    constructor(id: string, docManager: AgentDocumentManager) {
         const pineconeApiKey = process.env.PINECONE_API_KEY;
         if (!pineconeApiKey) {
             throw new Error('PINECONE_API_KEY is not defined.');
@@ -49,7 +49,7 @@ export class Vectorstore {
         this.pinecone = new Pinecone({ apiKey: pineconeApiKey });
         this.openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY, dangerouslyAllowBrowser: true });
         this._id = id;
-        this._doc_ids = doc_ids;
+        this.docManager = docManager;
         this.initializeIndex();
     }
 
@@ -109,15 +109,25 @@ export class Vectorstore {
 
             const isAudioOrVideo = local_file_path.endsWith('.mp3') || local_file_path.endsWith('.mp4');
             let result: AI_Document & { doc_id: string };
+
             if (isAudioOrVideo) {
                 console.log('Processing media file...');
                 const response = await Networking.PostToServer('/processMediaFile', { fileName: path.basename(local_file_path) });
-                const segmentedTranscript = response.condensed;
+
+                // Type assertion to handle the response properties
+                const typedResponse = response as {
+                    condensed: Array<{ text: string; indexes: string[]; start: number; end: number }>;
+                    full: Array<unknown>;
+                    summary: string;
+                };
+
+                const segmentedTranscript = typedResponse.condensed;
                 console.log(segmentedTranscript);
-                const summary = response.summary;
+                const summary = typedResponse.summary;
                 doc.summary = summary;
+
                 // Generate embeddings for each chunk
-                const texts = segmentedTranscript.map((chunk: any) => chunk.text);
+                const texts = segmentedTranscript.map(chunk => chunk.text);
 
                 try {
                     const embeddingsResponse = await this.openai.embeddings.create({
@@ -126,10 +136,19 @@ export class Vectorstore {
                         encoding_format: 'float',
                     });
 
-                    doc.original_segments = JSON.stringify(response.full);
+                    doc.original_segments = JSON.stringify(typedResponse.full);
                     doc.ai_type = local_file_path.endsWith('.mp3') ? 'audio' : 'video';
                     const doc_id = uuidv4();
 
+                    // Register the document with the AgentDocumentManager
+                    this.docManager.addCustomId(doc, doc_id);
+
+                    // Generate chunk IDs upfront so we can register them
+                    const chunkIds = segmentedTranscript.map(() => uuidv4());
+
+                    // Register all chunk IDs with the document manager
+                    this.docManager.registerChunkIds(doc_id, chunkIds);
+
                     // Add transcript and embeddings to metadata
                     result = {
                         doc_id,
@@ -137,13 +156,13 @@ export class Vectorstore {
                         file_name: local_file_path,
                         num_pages: 0,
                         summary: '',
-                        chunks: segmentedTranscript.map((chunk: any, index: number) => ({
-                            id: uuidv4(),
+                        chunks: segmentedTranscript.map((chunk, index) => ({
+                            id: chunkIds[index], // Use pre-generated chunk ID
                             values: (embeddingsResponse.data as Embedding[])[index].embedding, // Assign embedding
                             metadata: {
                                 indexes: chunk.indexes,
                                 original_document: local_file_path,
-                                doc_id: doc_id,
+                                doc_id: doc_id, // Ensure doc_id is consistent
                                 file_path: local_file_path,
                                 start_time: chunk.start,
                                 end_time: chunk.end,
@@ -159,20 +178,24 @@ export class Vectorstore {
                 }
 
                 doc.segmented_transcript = JSON.stringify(segmentedTranscript);
-                // Simplify chunks for storage
+                // Simplify chunks for storage - ensure simplified chunks use EXACTLY the same IDs
                 const simplifiedChunks = result.chunks.map(chunk => ({
-                    chunkId: chunk.id,
+                    chunkId: chunk.id, // Use the exact same ID as the full chunk
                     start_time: chunk.metadata.start_time,
                     end_time: chunk.metadata.end_time,
                     indexes: chunk.metadata.indexes,
                     chunkType: CHUNK_TYPE.VIDEO,
                     text: chunk.metadata.text,
+                    doc_id: chunk.metadata.doc_id, // Include parent doc_id for completeness
                 }));
                 doc.chunk_simpl = JSON.stringify({ chunks: simplifiedChunks });
             } else {
-                // Existing document processing logic remains unchanged
+                // Process regular document
                 console.log('Processing regular document...');
-                const { jobId } = await Networking.PostToServer('/createDocument', { file_path: local_file_path });
+                const createDocumentResponse = await Networking.PostToServer('/createDocument', { file_path: local_file_path });
+
+                // Type assertion for the response
+                const { jobId } = createDocumentResponse as { jobId: string };
 
                 while (true) {
                     await new Promise(resolve => setTimeout(resolve, 2000));
@@ -188,6 +211,16 @@ export class Vectorstore {
                         progressCallback(progressResponseJson.progress, progressResponseJson.step);
                     }
                 }
+
+                // Register the document with the AgentDocumentManager
+                this.docManager.addCustomId(doc, result.doc_id);
+
+                // Collect all chunk IDs
+                const chunkIds = result.chunks.map(chunk => chunk.id);
+
+                // Register chunks with the document manager
+                this.docManager.registerChunkIds(result.doc_id, chunkIds);
+
                 if (!doc.chunk_simpl) {
                     doc.chunk_simpl = JSON.stringify({ chunks: [] });
                 }
@@ -196,12 +229,13 @@ export class Vectorstore {
 
                 result.chunks.forEach((chunk: RAGChunk) => {
                     const chunkToAdd = {
-                        chunkId: chunk.id,
+                        chunkId: chunk.id, // Ensure we use the exact same ID
                         startPage: chunk.metadata.start_page,
                         endPage: chunk.metadata.end_page,
                         location: chunk.metadata.location,
                         chunkType: chunk.metadata.type as CHUNK_TYPE,
                         text: chunk.metadata.text,
+                        doc_id: chunk.metadata.doc_id, // Include parent doc_id for consistency
                     };
                     const new_chunk_simpl = JSON.parse(StrCast(doc.chunk_simpl));
                     new_chunk_simpl.chunks = new_chunk_simpl.chunks.concat(chunkToAdd);
@@ -298,39 +332,55 @@ export class Vectorstore {
 
             let queryEmbedding = queryEmbeddingResponse.data[0].embedding;
 
-            // Extract the embedding from the response.
+            // Get document IDs from the AgentDocumentManager
+            const docIds = Array.from(this.docManager.listDocs());
+            console.log('Using document IDs for retrieval:', docIds);
 
-            console.log(this._doc_ids());
             // Query the Pinecone index using the embedding and filter by document IDs.
+            // We'll query based on document IDs that are registered in the document manager
             const queryResponse: QueryResponse = await this.index.query({
                 vector: queryEmbedding,
                 filter: {
-                    doc_id: { $in: this._doc_ids() },
+                    doc_id: { $in: docIds },
                 },
                 topK,
                 includeValues: true,
                 includeMetadata: true,
             });
-            console.log(queryResponse);
-
-            // Map the results into RAGChunks and return them.
-            return queryResponse.matches.map(
-                match =>
-                    ({
-                        id: match.id,
-                        values: match.values as number[],
-                        metadata: match.metadata as {
-                            text: string;
-                            type: string;
-                            original_document: string;
-                            file_path: string;
-                            doc_id: string;
-                            location: string;
-                            start_page: number;
-                            end_page: number;
-                        },
-                    }) as RAGChunk
-            );
+            console.log(`Found ${queryResponse.matches.length} matching chunks`);
+
+            // For each retrieved chunk, ensure its document ID is registered in the document manager
+            // This maintains compatibility with existing code while ensuring consistency
+            const processedMatches = queryResponse.matches.map(match => {
+                const chunk = {
+                    id: match.id,
+                    values: match.values as number[],
+                    metadata: match.metadata as {
+                        text: string;
+                        type: string;
+                        original_document: string;
+                        file_path: string;
+                        doc_id: string;
+                        location: string;
+                        start_page: number;
+                        end_page: number;
+                    },
+                } as RAGChunk;
+
+                // Ensure the document manager knows about this chunk
+                // This is important for maintaining backwards compatibility
+                if (chunk.id && !this.docManager.getDocIdByChunkId(chunk.id)) {
+                    // If the chunk ID isn't registered but we have a doc_id in metadata
+                    if (chunk.metadata.doc_id && this.docManager.has(chunk.metadata.doc_id)) {
+                        // Register the chunk with its parent document
+                        this.docManager.registerChunkIds(chunk.metadata.doc_id, [chunk.id]);
+                    }
+                }
+
+                return chunk;
+            });
+
+            return processedMatches;
         } catch (error) {
             console.error(`Error retrieving chunks: ${error}`);
             return [];
author	A.J. Shulman <Shulman.aj@gmail.com>	2025-04-27 13:14:49 -0400
committer	A.J. Shulman <Shulman.aj@gmail.com>	2025-04-27 13:14:49 -0400
commit	3ef3d40506348d9fd537cc8f4aea975b9770689f (patch)
tree	afe779e8240e88c8b20ff6b68ac45840a927ee76 /src
parent	5ce2263849bfb901e276a4c5fc8ca2dbd8b80350 (diff)