diff options
author | A.J. Shulman <Shulman.aj@gmail.com> | 2025-04-27 13:14:49 -0400 |
---|---|---|
committer | A.J. Shulman <Shulman.aj@gmail.com> | 2025-04-27 13:14:49 -0400 |
commit | 3ef3d40506348d9fd537cc8f4aea975b9770689f (patch) | |
tree | afe779e8240e88c8b20ff6b68ac45840a927ee76 /src | |
parent | 5ce2263849bfb901e276a4c5fc8ca2dbd8b80350 (diff) |
new attempt with new citation unification
Diffstat (limited to 'src')
7 files changed, 510 insertions, 278 deletions
diff --git a/src/client/views/nodes/chatbot/agentsystem/Agent.ts b/src/client/views/nodes/chatbot/agentsystem/Agent.ts index c021d141e..80fdb6533 100644 --- a/src/client/views/nodes/chatbot/agentsystem/Agent.ts +++ b/src/client/views/nodes/chatbot/agentsystem/Agent.ts @@ -65,12 +65,9 @@ export class Agent { summaries: () => string, history: () => string, csvData: () => { filename: string; id: string; text: string }[], - addLinkedUrlDoc: (url: string, id: string) => void, getLinkedUrlDocId: (url: string) => string[], createImage: (result: Upload.FileInformation & Upload.InspectionResults, options: DocumentOptions) => void, - // eslint-disable-next-line @typescript-eslint/no-unused-vars createCSVInDash: (url: string, title: string, id: string, data: string) => void, - chatBox: ChatBox, docManager: AgentDocumentManager ) { // Initialize OpenAI client with API key from environment @@ -87,7 +84,7 @@ export class Agent { rag: new RAGTool(this.vectorstore), dataAnalysis: new DataAnalysisTool(csvData), websiteInfoScraper: new WebsiteInfoScraperTool(getLinkedUrlDocId), - searchTool: new SearchTool(addLinkedUrlDoc), + searchTool: new SearchTool(this._docManager), noTool: new NoTool(), //imageCreationTool: new ImageCreationTool(createImage), documentMetadata: new DocumentMetadataTool(this._docManager), diff --git a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx index 43765c1ce..35dbee3e9 100644 --- a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx +++ b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx @@ -71,7 +71,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { @observable private _citationPopup: { text: string; visible: boolean } = { text: '', visible: false }; // Private properties for managing OpenAI API, vector store, agent, and UI elements - private openai: OpenAI; + private openai!: OpenAI; // Using definite assignment assertion private vectorstore_id: string; private vectorstore: Vectorstore; private agent: Agent; @@ -98,25 +98,34 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { */ constructor(props: FieldViewProps) { super(props); - makeObservable(this); // Enable MobX observables + makeObservable(this); - // Initialize OpenAI, vectorstore, and agent - this.openai = this.initializeOpenAI(); - if (StrCast(this.dataDoc.vectorstore_id) == '') { - this.vectorstore_id = uuidv4(); - this.dataDoc.vectorstore_id = this.vectorstore_id; - } else { - this.vectorstore_id = StrCast(this.dataDoc.vectorstore_id); - } - this.vectorstore = new Vectorstore(this.vectorstore_id, this.retrieveDocIds); + this.messagesRef = React.createRef(); this.docManager = new AgentDocumentManager(this); - this.agent = new Agent(this.vectorstore, this.retrieveSummaries, this.retrieveFormattedHistory, this.retrieveCSVData, this.addLinkedUrlDoc, this.getLinkedUrlDocIds, this.createImageInDash, this.createCSVInDash, this, this.docManager); - // Reinitialize the DocumentMetadataTool with a direct reference to this ChatBox instance - // This ensures the tool can properly access documents in the same Freeform view - this.agent.reinitializeDocumentMetadataTool(); + // Initialize OpenAI client + this.initializeOpenAI(); + + // Create a unique vectorstore ID for this ChatBox + this.vectorstore_id = uuidv4(); + + // Initialize vectorstore with the document manager + this.vectorstore = new Vectorstore(this.vectorstore_id, this.docManager); + + // Create an agent with the vectorstore + this.agent = new Agent( + this.vectorstore, + this.retrieveSummaries.bind(this), + this.retrieveFormattedHistory.bind(this), + this.retrieveCSVData.bind(this), + this.retrieveDocIds.bind(this), + this.createImageInDash.bind(this), + this.createCSVInDash.bind(this), + this.docManager + ); - this.messagesRef = React.createRef<HTMLDivElement>(); + // Add event listeners + this.addScrollListener(); // Reaction to update dataDoc when chat history changes reaction( @@ -140,22 +149,25 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { */ @action addDocToVectorstore = async (newLinkedDoc: Doc) => { - this._uploadProgress = 0; - this._currentStep = 'Initializing...'; - this._isUploadingDocs = true; - try { - // Add the document to the vectorstore + this._isUploadingDocs = true; + + // Process the document first to ensure it has a valid ID + this.docManager.processDocument(newLinkedDoc); + + // Add the document to the vectorstore which will also register chunks await this.vectorstore.addAIDoc(newLinkedDoc, this.updateProgress); - } catch (error) { - console.error('Error uploading document:', error); - this._currentStep = 'Error during upload'; - } finally { - runInAction(() => { - this._isUploadingDocs = false; - this._uploadProgress = 0; - this._currentStep = ''; - }); + + // No longer needed as documents are tracked by the AgentDocumentManager + // this._linked_docs_to_add.add(newLinkedDoc); + + this._isUploadingDocs = false; + + return true; + } catch (err) { + console.error('Error adding document to vectorstore:', err); + this._isUploadingDocs = false; + return false; } }; @@ -238,7 +250,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { apiKey: process.env.OPENAI_KEY, dangerouslyAllowBrowser: true, }; - return new OpenAI(configuration); + this.openai = new OpenAI(configuration); } /** @@ -376,49 +388,6 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { }; /** - * Adds a linked document from a URL for future reference and analysis. - * @param url The URL of the document to add. - * @param id The unique identifier for the document. - */ - @action - addLinkedUrlDoc = async (url: string, id: string) => { - const doc = Docs.Create.WebDocument(url, { data_useCors: true }); - this.docManager.addCustomId(doc, id); - const linkDoc = Docs.Create.LinkDocument(this.Document, doc); - LinkManager.Instance.addLink(linkDoc); - - const chunkToAdd = { - chunkId: id, - chunkType: CHUNK_TYPE.URL, - url: url, - }; - - doc.chunk_simpl = JSON.stringify({ chunks: [chunkToAdd] }); - this.docManager.processDocument(doc); - }; - - /** - * Retrieves the IDs of linked url documents. - * @returns An array of document IDs. - */ - @action - getLinkedUrlDocIds = () => { - const linkedDocs: Doc[] = this.linkedDocs; - const linkedUrlDocIds: string[] = []; - - for (const doc of linkedDocs) { - if (doc.chunk_simpl) { - const docChunkSimpl = JSON.parse(StrCast(doc.chunk_simpl)) as { chunks: SimplifiedChunk[] }; - const foundChunk = docChunkSimpl.chunks.find(chunk => chunk.chunkType === CHUNK_TYPE.URL); - if (foundChunk) { - linkedUrlDocIds.push(foundChunk.chunkId); - } - } - } - return linkedUrlDocIds; - }; - - /** * Getter to retrieve the current user's name from the client utils. */ @computed @@ -613,82 +582,224 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { */ @action handleCitationClick = async (citation: Citation) => { - const currentLinkedDocs: Doc[] = this.linkedDocs; - const chunkId = citation.chunk_id; + try { + // Extract values from MobX proxy object if needed + const chunkId = typeof citation.chunk_id === 'object' ? (citation.chunk_id as any).toString() : citation.chunk_id; + + // For debugging + console.log('Citation clicked:', { + chunkId, + citation: JSON.stringify(citation, null, 2), + }); - for (const doc of currentLinkedDocs) { - if (doc.chunk_simpl) { - const docChunkSimpl = JSON.parse(StrCast(doc.chunk_simpl)) as { chunks: SimplifiedChunk[] }; - const foundChunk = docChunkSimpl.chunks.find(chunk => chunk.chunkId === chunkId); + // Try to find the document + const linkedDocs = this.linkedDocs; + let doc: Doc | undefined; - if (foundChunk) { - // Handle media chunks specifically + // First try to find the document using the document manager's chunk ID lookup + const parentDocId = this.docManager.getDocIdByChunkId(chunkId); + if (parentDocId) { + doc = this.docManager.getDocument(parentDocId); + console.log(`Found document by chunk ID lookup: ${parentDocId}`); + } - if (doc.ai_type == 'video' || doc.ai_type == 'audio') { - const directMatchSegmentStart = this.getDirectMatchingSegmentStart(doc, citation.direct_text || '', foundChunk.indexes || []); + // If not found, fall back to searching through linked docs (maintains compatibility) + if (!doc) { + for (const linkedDoc of linkedDocs) { + if (linkedDoc.chunk_simpl) { + try { + const docChunkSimpl = JSON.parse(StrCast(linkedDoc.chunk_simpl)) as { chunks: SimplifiedChunk[] }; + const foundChunk = docChunkSimpl.chunks.find(chunk => chunk.chunkId === chunkId); + if (foundChunk) { + doc = linkedDoc; + console.log(`Found document by iterating through linked docs`); + break; + } + } catch (e) { + console.error(`Error parsing chunk_simpl for doc ${linkedDoc.id}:`, e); + } + } + } + } - if (directMatchSegmentStart) { - // Navigate to the segment's start time in the media player - await this.goToMediaTimestamp(doc, directMatchSegmentStart, doc.ai_type); - } else { - console.error('No direct matching segment found for the citation.'); + if (!doc) { + console.warn(`Document not found for citation with chunk_id: ${chunkId}`); + return; + } + + // Process the chunk data + let docChunkSimpl: { chunks: SimplifiedChunk[] } = { chunks: [] }; + try { + docChunkSimpl = JSON.parse(StrCast(doc.chunk_simpl) || '{"chunks":[]}'); + } catch (e) { + console.error(`Error parsing chunk_simpl for the found document:`, e); + return; + } + + const foundChunk = docChunkSimpl.chunks.find((chunk: SimplifiedChunk) => chunk.chunkId === chunkId); + + // Handle different chunk types + if (foundChunk) { + console.log(`Found chunk in document:`, foundChunk); + + // Handle video chunks + if (foundChunk.chunkType === CHUNK_TYPE.VIDEO) { + if (foundChunk.start_time !== undefined) { + await this.goToMediaTimestamp(doc, foundChunk.start_time, 'video'); + } else { + console.warn('Video chunk missing start_time:', foundChunk); + } + } + // Handle audio chunks - note that we're using string comparison since 'audio' isn't in CHUNK_TYPE enum + else if (String(foundChunk.chunkType).toLowerCase() === 'audio') { + if (foundChunk.start_time !== undefined) { + await this.goToMediaTimestamp(doc, foundChunk.start_time, 'audio'); + } else { + console.warn('Audio chunk missing start_time:', foundChunk); + } + } + // Handle table or image chunks + else if (foundChunk.chunkType === CHUNK_TYPE.TABLE || foundChunk.chunkType === CHUNK_TYPE.IMAGE) { + this.handleOtherChunkTypes(foundChunk, citation, doc); + } + // Handle text chunks + else if (foundChunk.chunkType === CHUNK_TYPE.TEXT) { + // Find text from the document's chunks metadata + let chunkText = ''; + + try { + // We already parsed the chunks earlier, so use that + const matchingChunk = docChunkSimpl.chunks.find(c => c.chunkId === foundChunk.chunkId); + if (matchingChunk && 'text' in matchingChunk) { + // If the text property exists on the chunk (even though it's not in the type) + chunkText = String(matchingChunk['text'] || ''); } + } catch (e) { + console.error('Error getting chunk text:', e); + } + + // Default text if none found + if (!chunkText) { + chunkText = 'Text content not available'; + } + + this._citationPopup = { + text: chunkText, + visible: true, + }; + } + // Handle URL chunks + else if (foundChunk.chunkType === CHUNK_TYPE.URL) { + if (foundChunk.url) { + // Instead of opening the URL in a new window, show the document in the viewer + DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {}); + console.log(`Navigated to web document with URL: ${foundChunk.url}`); } else { - // Handle other chunk types as before - this.handleOtherChunkTypes(foundChunk, citation, doc); + console.warn('URL chunk missing URL:', foundChunk); } } + } else if (doc?.original_segments) { + // Handle original segments for media files + let original_segments: any[] = []; + try { + original_segments = JSON.parse(StrCast(doc.original_segments)); + } catch (e) { + console.error(`Error parsing original_segments:`, e); + return; + } + + // Check if there's direct text to find in the segments + if (citation.direct_text) { + // Find the segment that contains the direct text + const start = this.getDirectMatchingSegmentStart(doc, citation.direct_text, []); + if (start !== -1) { + await this.goToMediaTimestamp(doc, start, doc.ai_type === 'audio' ? 'audio' : 'video'); + } + } + } else { + console.warn('Unable to find chunk or segments for citation', citation); } + } catch (error) { + console.error('Error handling citation click:', error); } }; + /** + * Finds a matching segment in a document based on text content. + * @param doc The document to search in + * @param citationText The text to find in the document + * @param indexesOfSegments Optional indexes of segments to search in + * @returns The starting timestamp of the matching segment, or -1 if not found + */ getDirectMatchingSegmentStart = (doc: Doc, citationText: string, indexesOfSegments: string[]): number => { - const originalSegments = JSON.parse(StrCast(doc.original_segments!)).map((segment: any, index: number) => ({ - index: index.toString(), - text: segment.text, - start: segment.start, - end: segment.end, - })); - - if (!Array.isArray(originalSegments) || originalSegments.length === 0 || !Array.isArray(indexesOfSegments)) { - return 0; + if (!doc || !citationText) return -1; + + // Get original segments from the document + const original_segments = doc.original_segments ? JSON.parse(StrCast(doc.original_segments)) : []; + + if (!original_segments || !Array.isArray(original_segments) || original_segments.length === 0) { + return -1; } - // Create itemsToSearch array based on indexesOfSegments - const itemsToSearch = indexesOfSegments.map((indexStr: string) => { - const index = parseInt(indexStr, 10); - const segment = originalSegments[index]; - return { text: segment.text, start: segment.start }; - }); + let segments = original_segments; - console.log('Constructed itemsToSearch:', itemsToSearch); + // If specific indexes are provided, filter segments by those indexes + if (indexesOfSegments && indexesOfSegments.length > 0) { + segments = original_segments.filter((segment: any) => indexesOfSegments.includes(segment.index)); + } + + // If no segments match the indexes, use all segments + if (segments.length === 0) { + segments = original_segments; + } - // Helper function to calculate word overlap score + // First try to find an exact match + const exactMatch = segments.find((segment: any) => segment.text && segment.text.includes(citationText)); + + if (exactMatch) { + return exactMatch.start; + } + + // If no exact match, find segment with best word overlap const calculateWordOverlap = (text1: string, text2: string): number => { - const words1 = new Set(text1.toLowerCase().split(/\W+/)); - const words2 = new Set(text2.toLowerCase().split(/\W+/)); - const intersection = new Set([...words1].filter(word => words2.has(word))); - return intersection.size / Math.max(words1.size, words2.size); // Jaccard similarity + if (!text1 || !text2) return 0; + + const words1 = text1.toLowerCase().split(/\s+/); + const words2 = text2.toLowerCase().split(/\s+/); + const wordSet1 = new Set(words1); + + let overlap = 0; + for (const word of words2) { + if (wordSet1.has(word)) { + overlap++; + } + } + + // Return percentage of overlap relative to the shorter text + return overlap / Math.min(words1.length, words2.length); }; - // Search for the best matching segment - let bestMatchStart = 0; - let bestScore = 0; - - console.log(`Searching for best match for query: "${citationText}"`); - itemsToSearch.forEach(item => { - const score = calculateWordOverlap(citationText, item.text); - console.log(`Comparing query to segment: "${item.text}" | Score: ${score}`); - if (score > bestScore) { - bestScore = score; - bestMatchStart = item.start; + // Find segment with highest word overlap + let bestMatch = null; + let highestOverlap = 0; + + for (const segment of segments) { + if (!segment.text) continue; + + const overlap = calculateWordOverlap(segment.text, citationText); + if (overlap > highestOverlap) { + highestOverlap = overlap; + bestMatch = segment; } - }); + } - console.log('Best match found with score:', bestScore, '| Start time:', bestMatchStart); + // Only return matches with significant overlap (more than 30%) + if (bestMatch && highestOverlap > 0.3) { + return bestMatch.start; + } - // Return the start time of the best match - return bestMatchStart; + // If no good match found, return the start of the first segment as fallback + return segments.length > 0 ? segments[0].start : -1; }; /** @@ -772,7 +883,9 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { break; case CHUNK_TYPE.CSV: case CHUNK_TYPE.URL: - DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }); + DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => { + console.log(`Showing web document in viewer with URL: ${foundChunk.url}`); + }); break; default: console.error('Unhandled chunk type:', foundChunk.chunkType); @@ -879,6 +992,16 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { } }); this.addScrollListener(); + + // Initialize the document manager by finding existing documents + this.docManager.initializeFindDocsFreeform(); + + // If there are stored doc IDs in our list of docs to add, process them + if (this._linked_docs_to_add.size > 0) { + this._linked_docs_to_add.forEach(doc => { + this.docManager.processDocument(doc); + }); + } } /** @@ -892,28 +1015,28 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { /** * Getter that retrieves all linked documents for the current document. */ - @computed - get linkedDocs() { - return LinkManager.Instance.getAllRelatedLinks(this.Document) - .map(d => DocCast(LinkManager.getOppositeAnchor(d, this.Document))) - .map(d => DocCast(d?.annotationOn, d)) - .filter(d => d); + @computed get linkedDocs(): Doc[] { + const docIds = this.docManager.listDocs(); + const docs: Doc[] = []; + + // Get documents from the document manager using the getDocument method + docIds.forEach(id => { + const doc = this.docManager.getDocument(id); + if (doc) { + docs.push(doc); + } + }); + + return docs; } /** - * Getter that retrieves document IDs of linked documents that have AI-related content. + * Getter that retrieves document IDs of linked documents that have PDF_chunker–parsed content. */ @computed - get docIds() { - return LinkManager.Instance.getAllRelatedLinks(this.Document) - .map(d => DocCast(LinkManager.getOppositeAnchor(d, this.Document))) - .map(d => DocCast(d?.annotationOn, d)) - .filter(d => d) - .filter(d => { - console.log(d.ai_doc_id); - return d.ai_doc_id; - }) - .map(d => StrCast(d.ai_doc_id)); + get docIds(): string[] { + // Use the document manager to get all document IDs + return Array.from(this.docManager.listDocs()); } /** @@ -921,23 +1044,18 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { */ @computed get summaries(): string { - return ( - LinkManager.Instance.getAllRelatedLinks(this.Document) - .map(d => DocCast(LinkManager.getOppositeAnchor(d, this.Document))) - .map(d => DocCast(d?.annotationOn, d)) - .filter(d => d) - .filter(d => d.summary) - .map((doc, index) => { - if (PDFCast(doc.data)) { - return `<summary file_name="${PDFCast(doc.data).url.pathname}" applicable_tools=["rag"]>${doc.summary}</summary>`; - } else if (CsvCast(doc.data)) { - return `<summary file_name="${CsvCast(doc.data).url.pathname}" applicable_tools=["dataAnalysis"]>${doc.summary}</summary>`; - } else { - return `${index + 1}) ${doc.summary}`; - } - }) - .join('\n') + '\n' - ); + const linkedDocs = Array.from(this.docManager.listDocs()) + .map(id => { + const doc = this.docManager.extractDocumentMetadata(id); + if (doc && doc.fields && (doc.fields.layout.summary || doc.fields.data.summary)) { + return doc.fields.layout.summary || doc.fields.data.summary; + } + return null; + }) + .filter(Boolean) + .join('\n\n'); + + return linkedDocs; } /** @@ -965,7 +1083,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { // Other helper methods for retrieving document data and processing - retrieveSummaries = () => { + retrieveSummaries = (): string => { return this.summaries; }; @@ -973,12 +1091,12 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { return this.linkedCSVs; }; - retrieveFormattedHistory = () => { + retrieveFormattedHistory = (): string => { return this.formattedHistory; }; - retrieveDocIds = () => { - return this.docIds; + retrieveDocIds = (): string[] => { + return Array.from(this.docManager.listDocs()); }; /** diff --git a/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts b/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts index 4b751acc0..e6c2421e5 100644 --- a/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts +++ b/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts @@ -417,9 +417,9 @@ export class DocumentMetadataTool extends BaseTool<DocumentMetadataToolParamsTyp const title = String(args.title); const data = String(args.data); - const createdDoc = this._docManager.createDocInDash(docType, title, data); + const id = this._docManager.createDocInDash(docType, data, { title: title }); - if (!createdDoc) { + if (!id) { return [ { type: 'text', @@ -427,18 +427,14 @@ export class DocumentMetadataTool extends BaseTool<DocumentMetadataToolParamsTyp }, ]; } - - // Update our local document maps with the new document - this._docManager.processDocument(createdDoc); - // Get the created document's metadata - const createdMetadata = this._docManager.extractDocumentMetadata(this._docManager.createAgentDoc(createdDoc)); + const createdMetadata = this._docManager.extractDocumentMetadata(id); return [ { type: 'text', text: `Document created successfully. -Document ID: ${createdDoc.id} +Document ID: ${id} Type: ${docType} Title: "${title}" @@ -447,9 +443,9 @@ You can now use the "edit" action to modify additional properties of this docume Next steps: 1. Use the "getFieldOptions" action to understand available editable/addable fields/properties and their dependencies. -2. To modify this document, use: { action: "edit", documentId: "${createdDoc.id}", fieldEdits: [{"fieldName":"property","fieldValue":"value"}] } +2. To modify this document, use: { action: "edit", documentId: "${id}", fieldEdits: [{"fieldName":"property","fieldValue":"value"}] } 3. To add styling, consider setting backgroundColor, fontColor, or other properties -4. For text documents, you can edit the content with: { action: "edit", documentId: "${createdDoc.id}", fieldEdits: [{"fieldName":"text","fieldValue":"New content"}] } +4. For text documents, you can edit the content with: { action: "edit", documentId: "${id}", fieldEdits: [{"fieldName":"text","fieldValue":"New content"}] } Full metadata for the created document: ${JSON.stringify(createdMetadata, null, 2)}`, diff --git a/src/client/views/nodes/chatbot/tools/SearchTool.ts b/src/client/views/nodes/chatbot/tools/SearchTool.ts index 2ee30f0cf..53f5fc109 100644 --- a/src/client/views/nodes/chatbot/tools/SearchTool.ts +++ b/src/client/views/nodes/chatbot/tools/SearchTool.ts @@ -3,6 +3,9 @@ import { Networking } from '../../../../Network'; import { BaseTool } from './BaseTool'; import { Observation } from '../types/types'; import { ParametersType, ToolInfo } from '../types/tool_types'; +import { Agent } from 'http'; +import { AgentDocumentManager } from '../utils/AgentDocumentManager'; +import { StrCast } from '../../../../../fields/Types'; const searchToolParams = [ { @@ -25,12 +28,12 @@ const searchToolInfo: ToolInfo<SearchToolParamsType> = { }; export class SearchTool extends BaseTool<SearchToolParamsType> { - private _addLinkedUrlDoc: (url: string, id: string) => void; + private _docManager: AgentDocumentManager; private _max_results: number; - constructor(addLinkedUrlDoc: (url: string, id: string) => void, max_results: number = 3) { + constructor(docManager: AgentDocumentManager, max_results: number = 3) { super(searchToolInfo); - this._addLinkedUrlDoc = addLinkedUrlDoc; + this._docManager = docManager; this._max_results = max_results; } @@ -46,8 +49,13 @@ export class SearchTool extends BaseTool<SearchToolParamsType> { max_results: this._max_results, })) as { results: { url: string; snippet: string }[] }; const data = results.map((result: { url: string; snippet: string }) => { - const id = uuidv4(); - this._addLinkedUrlDoc(result.url, id); + // Create a web document with the URL + const id = this._docManager.createDocInDash('web', result.url, { + title: `Search Result: ${result.url}`, + text_html: result.snippet, + data_useCors: true, + }); + return { type: 'text' as const, text: `<chunk chunk_id="${id}" chunk_type="url"><url>${result.url}</url><overview>${result.snippet}</overview></chunk>`, diff --git a/src/client/views/nodes/chatbot/types/types.ts b/src/client/views/nodes/chatbot/types/types.ts index 882e74ebb..dcb132ec7 100644 --- a/src/client/views/nodes/chatbot/types/types.ts +++ b/src/client/views/nodes/chatbot/types/types.ts @@ -108,6 +108,7 @@ export interface SimplifiedChunk { start_time?: number; end_time?: number; indexes?: string[]; + text?: string; } export interface AI_Document { diff --git a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts index 4eeac3c6a..c3beebcde 100644 --- a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts +++ b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts @@ -165,22 +165,18 @@ export class AgentDocumentManager { } } - public addCustomId(doc: Doc, id: string) { - doc.id = id; - doc.DOCUMENT_ID_FIELD = id; - } - /** * Process a document by ensuring it has an ID and adding it to the appropriate collections * @param doc The document to process */ - public processDocument(doc: Doc) { + public processDocument(doc: Doc): string { // Ensure document has a persistent ID const docId = this.ensureDocumentId(doc); // Only add if we haven't already processed this document if (!this.documentsById.has(docId)) { this.documentsById.set(docId, { layoutDoc: doc, dataDoc: doc[DocData] }); } + return docId; } /** @@ -232,7 +228,9 @@ export class AgentDocumentManager { * @param docId The ID of the document to extract metadata from * @returns An object containing the document's metadata */ - public extractDocumentMetadata(doc?: AgentDocument) { + public extractDocumentMetadata(id: string) { + if (!id) return null; + const doc = this.documentsById.get(id); if (!doc) return null; const layoutDoc = doc.layoutDoc; const dataDoc = doc.dataDoc; @@ -729,16 +727,14 @@ export class AgentDocumentManager { */ public getDocumentMetadata(documentId?: string): any { if (documentId) { - const doc = this.documentsById.get(documentId); - // Get metadata for a specific document - return this.extractDocumentMetadata(doc); + console.log(`Returning document metadata for docID, ${documentId}:`, this.extractDocumentMetadata(documentId)); + return this.extractDocumentMetadata(documentId); } else { // Get metadata for all documents const documentsMetadata: Record<string, any> = {}; - for (const doc of this.documentsById.values()) { - documentsMetadata.add(this.extractDocumentMetadata(doc) ?? { documentId: doc.layoutDoc.id, title: doc.layoutDoc.title, type: doc.layoutDoc.type }); + for (const documentId of this.documentsById.keys()) { + documentsMetadata.add(this.extractDocumentMetadata(documentId)); } - return { documentCount: this.documentsById.size, documents: documentsMetadata, @@ -845,14 +841,15 @@ export class AgentDocumentManager { return Object.values(supportedDocTypes).includes(docType as supportedDocTypes); } /** - * Creates a document in the dashboard. + * Creates a document in the dashboard and returns its ID. + * This is a public API used by tools like SearchTool. * - * @param {string} doc_type - The type of document to create. - * @param {string} data - The data used to generate the document. - * @param {DocumentOptions} options - Configuration options for the document. - * @returns {Promise<void>} A promise that resolves once the document is created and displayed. + * @param docType The type of document to create + * @param data The data for the document + * @param options Optional configuration options + * @returns The ID of the created document */ - createDocInDash = (docType: string, title: string, data: string) => { + public createDocInDash(docType: string, data: string, options?: any): string { // Validate doc_type if (!this.isValidDocType(docType)) { throw new Error(`Invalid document type: ${docType}`); @@ -862,10 +859,10 @@ export class AgentDocumentManager { // Create simple document with just title and data const simpleDoc: parsedDoc = { doc_type: docType, - title: title, + title: options?.title ?? `Untitled Document ${this.documentsById.size + 1}`, data: data, - x: 0, - y: 0, + x: options?.x ?? 0, + y: options?.y ?? 0, _width: 300, _height: 300, _layout_fitWidth: false, @@ -884,46 +881,111 @@ export class AgentDocumentManager { } }; const doc = this.chatBox.whichDoc(simpleDoc, false); - if (doc) linkAndShowDoc(doc); - return doc; + if (doc) { + linkAndShowDoc(doc); + const id = this.processDocument(doc); + return id; + } else { + throw new Error(`Error creating document. Created document not found.`); + } } catch (error) { throw new Error(`Error creating document: ${error}`); } - }; + } public has(docId: string) { return this.documentsById.has(docId); } - public listDocs() { - // List all available documents in simple format - const docs = Array.from(this.documentsById.entries()).map(([id, doc]) => ({ - id, - title: doc.layoutDoc.title || 'Untitled Document', - type: doc.layoutDoc.type || doc.dataDoc.type || 'Unknown Type', - })); - - if (docs.length === 0) { - return [ - { - type: 'text', - text: 'No documents found in the current view.', - }, - ]; - } - - return [ - { - type: 'text', - text: `Found ${docs.length} document(s) in the current view:\n${JSON.stringify(docs, null, 2)}`, - }, - ]; + /** + * Returns a list of all document IDs in the manager. + * @returns An array of document IDs (strings). + */ + public listDocs(): string[] { + return Array.from(this.documentsById.keys()); + } + + /** + * Adds a document with a custom ID to the manager + * @param doc The document to add + * @param customId The custom ID to assign to the document + * @returns The customId that was assigned + */ + public addCustomId(doc: Doc, customId: string): string { + if (!doc) { + console.error('Cannot add null document with custom ID'); + return ''; + } + + // Set the custom ID in the document's metadata + doc[this.DOCUMENT_ID_FIELD] = customId; + + // Store the document in our map + this.documentsById.set(customId, { + layoutDoc: doc, + dataDoc: doc, + }); + + return customId; } - public createAgentDoc(doc: Doc) { - // Ideally check if Doc is already in there. - const agentDoc = { layoutDoc: doc, dataDoc: doc[DocData] }; - this.documentsById.set(this.ensureDocumentId(doc), agentDoc); - return agentDoc; + /** + * Gets a document by its ID + * @param docId The ID of the document to retrieve + * @returns The document if found, undefined otherwise + */ + public getDocument(docId: string): Doc | undefined { + const docInfo = this.documentsById.get(docId); + return docInfo?.layoutDoc; + } + + /** + * Registers chunk IDs associated with a document in the manager + * @param docId The parent document ID + * @param chunkIds Array of chunk IDs associated with this document + */ + public registerChunkIds(docId: string, chunkIds: string[]): void { + // Get the document if it exists + const docInfo = this.documentsById.get(docId); + if (!docInfo) { + console.warn(`Cannot register chunks for unknown document ID: ${docId}`); + return; + } + + // Store chunk IDs on the document for future reference + const doc = docInfo.layoutDoc; + if (!doc.chunk_ids) { + doc.chunk_ids = JSON.stringify(chunkIds); + } else { + // Merge with existing chunk IDs if they exist + const existingIds = JSON.parse(doc.chunk_ids as string); + const updatedIds = [...new Set([...existingIds, ...chunkIds])]; // Remove duplicates + doc.chunk_ids = JSON.stringify(updatedIds); + } + + // Ensure each chunk ID can be linked back to its parent document + chunkIds.forEach(chunkId => { + // Store a mapping from chunk ID to parent document ID + // This allows us to easily find a document by any of its chunk IDs + if (!this.documentsById.has(chunkId)) { + this.documentsById.set(chunkId, { + layoutDoc: doc, + dataDoc: docInfo.dataDoc, + }); + } + }); + } + + /** + * Gets a document ID by a chunk ID + * @param chunkId The chunk ID to look up + * @returns The parent document ID if found + */ + public getDocIdByChunkId(chunkId: string): string | undefined { + const docInfo = this.documentsById.get(chunkId); + if (docInfo) { + return docInfo.layoutDoc[this.DOCUMENT_ID_FIELD] as string; + } + return undefined; } } diff --git a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts index afd34f28d..4bb61d8b2 100644 --- a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts +++ b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts @@ -15,7 +15,7 @@ import { Networking } from '../../../../Network'; import { AI_Document, CHUNK_TYPE, RAGChunk } from '../types/types'; import OpenAI from 'openai'; import { Embedding } from 'openai/resources'; -import { PineconeEnvironmentVarsNotSupportedError } from '@pinecone-database/pinecone/dist/errors'; +import { AgentDocumentManager } from '../utils/AgentDocumentManager'; dotenv.config(); @@ -29,7 +29,7 @@ export class Vectorstore { private openai: OpenAI; // OpenAI client for generating embeddings. private indexName: string = 'pdf-chatbot'; // Default name for the index. private _id: string; // Unique ID for the Vectorstore instance. - private _doc_ids: () => string[]; // List of document IDs handled by this instance. + private docManager: AgentDocumentManager; // Document manager for handling documents documents: AI_Document[] = []; // Store the documents indexed in the vectorstore. @@ -37,9 +37,9 @@ export class Vectorstore { * Initializes the Pinecone and OpenAI clients, sets up the document ID list, * and initializes the Pinecone index. * @param id The unique identifier for the vectorstore instance. - * @param doc_ids A function that returns a list of document IDs. + * @param docManager An instance of AgentDocumentManager to handle document management. */ - constructor(id: string, doc_ids: () => string[]) { + constructor(id: string, docManager: AgentDocumentManager) { const pineconeApiKey = process.env.PINECONE_API_KEY; if (!pineconeApiKey) { throw new Error('PINECONE_API_KEY is not defined.'); @@ -49,7 +49,7 @@ export class Vectorstore { this.pinecone = new Pinecone({ apiKey: pineconeApiKey }); this.openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY, dangerouslyAllowBrowser: true }); this._id = id; - this._doc_ids = doc_ids; + this.docManager = docManager; this.initializeIndex(); } @@ -109,15 +109,25 @@ export class Vectorstore { const isAudioOrVideo = local_file_path.endsWith('.mp3') || local_file_path.endsWith('.mp4'); let result: AI_Document & { doc_id: string }; + if (isAudioOrVideo) { console.log('Processing media file...'); const response = await Networking.PostToServer('/processMediaFile', { fileName: path.basename(local_file_path) }); - const segmentedTranscript = response.condensed; + + // Type assertion to handle the response properties + const typedResponse = response as { + condensed: Array<{ text: string; indexes: string[]; start: number; end: number }>; + full: Array<unknown>; + summary: string; + }; + + const segmentedTranscript = typedResponse.condensed; console.log(segmentedTranscript); - const summary = response.summary; + const summary = typedResponse.summary; doc.summary = summary; + // Generate embeddings for each chunk - const texts = segmentedTranscript.map((chunk: any) => chunk.text); + const texts = segmentedTranscript.map(chunk => chunk.text); try { const embeddingsResponse = await this.openai.embeddings.create({ @@ -126,10 +136,19 @@ export class Vectorstore { encoding_format: 'float', }); - doc.original_segments = JSON.stringify(response.full); + doc.original_segments = JSON.stringify(typedResponse.full); doc.ai_type = local_file_path.endsWith('.mp3') ? 'audio' : 'video'; const doc_id = uuidv4(); + // Register the document with the AgentDocumentManager + this.docManager.addCustomId(doc, doc_id); + + // Generate chunk IDs upfront so we can register them + const chunkIds = segmentedTranscript.map(() => uuidv4()); + + // Register all chunk IDs with the document manager + this.docManager.registerChunkIds(doc_id, chunkIds); + // Add transcript and embeddings to metadata result = { doc_id, @@ -137,13 +156,13 @@ export class Vectorstore { file_name: local_file_path, num_pages: 0, summary: '', - chunks: segmentedTranscript.map((chunk: any, index: number) => ({ - id: uuidv4(), + chunks: segmentedTranscript.map((chunk, index) => ({ + id: chunkIds[index], // Use pre-generated chunk ID values: (embeddingsResponse.data as Embedding[])[index].embedding, // Assign embedding metadata: { indexes: chunk.indexes, original_document: local_file_path, - doc_id: doc_id, + doc_id: doc_id, // Ensure doc_id is consistent file_path: local_file_path, start_time: chunk.start, end_time: chunk.end, @@ -159,20 +178,24 @@ export class Vectorstore { } doc.segmented_transcript = JSON.stringify(segmentedTranscript); - // Simplify chunks for storage + // Simplify chunks for storage - ensure simplified chunks use EXACTLY the same IDs const simplifiedChunks = result.chunks.map(chunk => ({ - chunkId: chunk.id, + chunkId: chunk.id, // Use the exact same ID as the full chunk start_time: chunk.metadata.start_time, end_time: chunk.metadata.end_time, indexes: chunk.metadata.indexes, chunkType: CHUNK_TYPE.VIDEO, text: chunk.metadata.text, + doc_id: chunk.metadata.doc_id, // Include parent doc_id for completeness })); doc.chunk_simpl = JSON.stringify({ chunks: simplifiedChunks }); } else { - // Existing document processing logic remains unchanged + // Process regular document console.log('Processing regular document...'); - const { jobId } = await Networking.PostToServer('/createDocument', { file_path: local_file_path }); + const createDocumentResponse = await Networking.PostToServer('/createDocument', { file_path: local_file_path }); + + // Type assertion for the response + const { jobId } = createDocumentResponse as { jobId: string }; while (true) { await new Promise(resolve => setTimeout(resolve, 2000)); @@ -188,6 +211,16 @@ export class Vectorstore { progressCallback(progressResponseJson.progress, progressResponseJson.step); } } + + // Register the document with the AgentDocumentManager + this.docManager.addCustomId(doc, result.doc_id); + + // Collect all chunk IDs + const chunkIds = result.chunks.map(chunk => chunk.id); + + // Register chunks with the document manager + this.docManager.registerChunkIds(result.doc_id, chunkIds); + if (!doc.chunk_simpl) { doc.chunk_simpl = JSON.stringify({ chunks: [] }); } @@ -196,12 +229,13 @@ export class Vectorstore { result.chunks.forEach((chunk: RAGChunk) => { const chunkToAdd = { - chunkId: chunk.id, + chunkId: chunk.id, // Ensure we use the exact same ID startPage: chunk.metadata.start_page, endPage: chunk.metadata.end_page, location: chunk.metadata.location, chunkType: chunk.metadata.type as CHUNK_TYPE, text: chunk.metadata.text, + doc_id: chunk.metadata.doc_id, // Include parent doc_id for consistency }; const new_chunk_simpl = JSON.parse(StrCast(doc.chunk_simpl)); new_chunk_simpl.chunks = new_chunk_simpl.chunks.concat(chunkToAdd); @@ -298,39 +332,55 @@ export class Vectorstore { let queryEmbedding = queryEmbeddingResponse.data[0].embedding; - // Extract the embedding from the response. + // Get document IDs from the AgentDocumentManager + const docIds = Array.from(this.docManager.listDocs()); + console.log('Using document IDs for retrieval:', docIds); - console.log(this._doc_ids()); // Query the Pinecone index using the embedding and filter by document IDs. + // We'll query based on document IDs that are registered in the document manager const queryResponse: QueryResponse = await this.index.query({ vector: queryEmbedding, filter: { - doc_id: { $in: this._doc_ids() }, + doc_id: { $in: docIds }, }, topK, includeValues: true, includeMetadata: true, }); - console.log(queryResponse); - - // Map the results into RAGChunks and return them. - return queryResponse.matches.map( - match => - ({ - id: match.id, - values: match.values as number[], - metadata: match.metadata as { - text: string; - type: string; - original_document: string; - file_path: string; - doc_id: string; - location: string; - start_page: number; - end_page: number; - }, - }) as RAGChunk - ); + console.log(`Found ${queryResponse.matches.length} matching chunks`); + + // For each retrieved chunk, ensure its document ID is registered in the document manager + // This maintains compatibility with existing code while ensuring consistency + const processedMatches = queryResponse.matches.map(match => { + const chunk = { + id: match.id, + values: match.values as number[], + metadata: match.metadata as { + text: string; + type: string; + original_document: string; + file_path: string; + doc_id: string; + location: string; + start_page: number; + end_page: number; + }, + } as RAGChunk; + + // Ensure the document manager knows about this chunk + // This is important for maintaining backwards compatibility + if (chunk.id && !this.docManager.getDocIdByChunkId(chunk.id)) { + // If the chunk ID isn't registered but we have a doc_id in metadata + if (chunk.metadata.doc_id && this.docManager.has(chunk.metadata.doc_id)) { + // Register the chunk with its parent document + this.docManager.registerChunkIds(chunk.metadata.doc_id, [chunk.id]); + } + } + + return chunk; + }); + + return processedMatches; } catch (error) { console.error(`Error retrieving chunks: ${error}`); return []; |