From 3ef3d40506348d9fd537cc8f4aea975b9770689f Mon Sep 17 00:00:00 2001 From: "A.J. Shulman" Date: Sun, 27 Apr 2025 13:14:49 -0400 Subject: new attempt with new citation unification --- .../views/nodes/chatbot/vectorstore/Vectorstore.ts | 130 ++++++++++++++------- 1 file changed, 90 insertions(+), 40 deletions(-) (limited to 'src/client/views/nodes/chatbot/vectorstore') diff --git a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts index afd34f28d..4bb61d8b2 100644 --- a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts +++ b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts @@ -15,7 +15,7 @@ import { Networking } from '../../../../Network'; import { AI_Document, CHUNK_TYPE, RAGChunk } from '../types/types'; import OpenAI from 'openai'; import { Embedding } from 'openai/resources'; -import { PineconeEnvironmentVarsNotSupportedError } from '@pinecone-database/pinecone/dist/errors'; +import { AgentDocumentManager } from '../utils/AgentDocumentManager'; dotenv.config(); @@ -29,7 +29,7 @@ export class Vectorstore { private openai: OpenAI; // OpenAI client for generating embeddings. private indexName: string = 'pdf-chatbot'; // Default name for the index. private _id: string; // Unique ID for the Vectorstore instance. - private _doc_ids: () => string[]; // List of document IDs handled by this instance. + private docManager: AgentDocumentManager; // Document manager for handling documents documents: AI_Document[] = []; // Store the documents indexed in the vectorstore. @@ -37,9 +37,9 @@ export class Vectorstore { * Initializes the Pinecone and OpenAI clients, sets up the document ID list, * and initializes the Pinecone index. * @param id The unique identifier for the vectorstore instance. - * @param doc_ids A function that returns a list of document IDs. + * @param docManager An instance of AgentDocumentManager to handle document management. */ - constructor(id: string, doc_ids: () => string[]) { + constructor(id: string, docManager: AgentDocumentManager) { const pineconeApiKey = process.env.PINECONE_API_KEY; if (!pineconeApiKey) { throw new Error('PINECONE_API_KEY is not defined.'); @@ -49,7 +49,7 @@ export class Vectorstore { this.pinecone = new Pinecone({ apiKey: pineconeApiKey }); this.openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY, dangerouslyAllowBrowser: true }); this._id = id; - this._doc_ids = doc_ids; + this.docManager = docManager; this.initializeIndex(); } @@ -109,15 +109,25 @@ export class Vectorstore { const isAudioOrVideo = local_file_path.endsWith('.mp3') || local_file_path.endsWith('.mp4'); let result: AI_Document & { doc_id: string }; + if (isAudioOrVideo) { console.log('Processing media file...'); const response = await Networking.PostToServer('/processMediaFile', { fileName: path.basename(local_file_path) }); - const segmentedTranscript = response.condensed; + + // Type assertion to handle the response properties + const typedResponse = response as { + condensed: Array<{ text: string; indexes: string[]; start: number; end: number }>; + full: Array; + summary: string; + }; + + const segmentedTranscript = typedResponse.condensed; console.log(segmentedTranscript); - const summary = response.summary; + const summary = typedResponse.summary; doc.summary = summary; + // Generate embeddings for each chunk - const texts = segmentedTranscript.map((chunk: any) => chunk.text); + const texts = segmentedTranscript.map(chunk => chunk.text); try { const embeddingsResponse = await this.openai.embeddings.create({ @@ -126,10 +136,19 @@ export class Vectorstore { encoding_format: 'float', }); - doc.original_segments = JSON.stringify(response.full); + doc.original_segments = JSON.stringify(typedResponse.full); doc.ai_type = local_file_path.endsWith('.mp3') ? 'audio' : 'video'; const doc_id = uuidv4(); + // Register the document with the AgentDocumentManager + this.docManager.addCustomId(doc, doc_id); + + // Generate chunk IDs upfront so we can register them + const chunkIds = segmentedTranscript.map(() => uuidv4()); + + // Register all chunk IDs with the document manager + this.docManager.registerChunkIds(doc_id, chunkIds); + // Add transcript and embeddings to metadata result = { doc_id, @@ -137,13 +156,13 @@ export class Vectorstore { file_name: local_file_path, num_pages: 0, summary: '', - chunks: segmentedTranscript.map((chunk: any, index: number) => ({ - id: uuidv4(), + chunks: segmentedTranscript.map((chunk, index) => ({ + id: chunkIds[index], // Use pre-generated chunk ID values: (embeddingsResponse.data as Embedding[])[index].embedding, // Assign embedding metadata: { indexes: chunk.indexes, original_document: local_file_path, - doc_id: doc_id, + doc_id: doc_id, // Ensure doc_id is consistent file_path: local_file_path, start_time: chunk.start, end_time: chunk.end, @@ -159,20 +178,24 @@ export class Vectorstore { } doc.segmented_transcript = JSON.stringify(segmentedTranscript); - // Simplify chunks for storage + // Simplify chunks for storage - ensure simplified chunks use EXACTLY the same IDs const simplifiedChunks = result.chunks.map(chunk => ({ - chunkId: chunk.id, + chunkId: chunk.id, // Use the exact same ID as the full chunk start_time: chunk.metadata.start_time, end_time: chunk.metadata.end_time, indexes: chunk.metadata.indexes, chunkType: CHUNK_TYPE.VIDEO, text: chunk.metadata.text, + doc_id: chunk.metadata.doc_id, // Include parent doc_id for completeness })); doc.chunk_simpl = JSON.stringify({ chunks: simplifiedChunks }); } else { - // Existing document processing logic remains unchanged + // Process regular document console.log('Processing regular document...'); - const { jobId } = await Networking.PostToServer('/createDocument', { file_path: local_file_path }); + const createDocumentResponse = await Networking.PostToServer('/createDocument', { file_path: local_file_path }); + + // Type assertion for the response + const { jobId } = createDocumentResponse as { jobId: string }; while (true) { await new Promise(resolve => setTimeout(resolve, 2000)); @@ -188,6 +211,16 @@ export class Vectorstore { progressCallback(progressResponseJson.progress, progressResponseJson.step); } } + + // Register the document with the AgentDocumentManager + this.docManager.addCustomId(doc, result.doc_id); + + // Collect all chunk IDs + const chunkIds = result.chunks.map(chunk => chunk.id); + + // Register chunks with the document manager + this.docManager.registerChunkIds(result.doc_id, chunkIds); + if (!doc.chunk_simpl) { doc.chunk_simpl = JSON.stringify({ chunks: [] }); } @@ -196,12 +229,13 @@ export class Vectorstore { result.chunks.forEach((chunk: RAGChunk) => { const chunkToAdd = { - chunkId: chunk.id, + chunkId: chunk.id, // Ensure we use the exact same ID startPage: chunk.metadata.start_page, endPage: chunk.metadata.end_page, location: chunk.metadata.location, chunkType: chunk.metadata.type as CHUNK_TYPE, text: chunk.metadata.text, + doc_id: chunk.metadata.doc_id, // Include parent doc_id for consistency }; const new_chunk_simpl = JSON.parse(StrCast(doc.chunk_simpl)); new_chunk_simpl.chunks = new_chunk_simpl.chunks.concat(chunkToAdd); @@ -298,39 +332,55 @@ export class Vectorstore { let queryEmbedding = queryEmbeddingResponse.data[0].embedding; - // Extract the embedding from the response. + // Get document IDs from the AgentDocumentManager + const docIds = Array.from(this.docManager.listDocs()); + console.log('Using document IDs for retrieval:', docIds); - console.log(this._doc_ids()); // Query the Pinecone index using the embedding and filter by document IDs. + // We'll query based on document IDs that are registered in the document manager const queryResponse: QueryResponse = await this.index.query({ vector: queryEmbedding, filter: { - doc_id: { $in: this._doc_ids() }, + doc_id: { $in: docIds }, }, topK, includeValues: true, includeMetadata: true, }); - console.log(queryResponse); - - // Map the results into RAGChunks and return them. - return queryResponse.matches.map( - match => - ({ - id: match.id, - values: match.values as number[], - metadata: match.metadata as { - text: string; - type: string; - original_document: string; - file_path: string; - doc_id: string; - location: string; - start_page: number; - end_page: number; - }, - }) as RAGChunk - ); + console.log(`Found ${queryResponse.matches.length} matching chunks`); + + // For each retrieved chunk, ensure its document ID is registered in the document manager + // This maintains compatibility with existing code while ensuring consistency + const processedMatches = queryResponse.matches.map(match => { + const chunk = { + id: match.id, + values: match.values as number[], + metadata: match.metadata as { + text: string; + type: string; + original_document: string; + file_path: string; + doc_id: string; + location: string; + start_page: number; + end_page: number; + }, + } as RAGChunk; + + // Ensure the document manager knows about this chunk + // This is important for maintaining backwards compatibility + if (chunk.id && !this.docManager.getDocIdByChunkId(chunk.id)) { + // If the chunk ID isn't registered but we have a doc_id in metadata + if (chunk.metadata.doc_id && this.docManager.has(chunk.metadata.doc_id)) { + // Register the chunk with its parent document + this.docManager.registerChunkIds(chunk.metadata.doc_id, [chunk.id]); + } + } + + return chunk; + }); + + return processedMatches; } catch (error) { console.error(`Error retrieving chunks: ${error}`); return []; -- cgit v1.2.3-70-g09d2 From 67a7996278ce176e227393fa410e7afc80228a83 Mon Sep 17 00:00:00 2001 From: "A.J. Shulman" Date: Sun, 27 Apr 2025 13:37:37 -0400 Subject: a bit more consistent --- .../nodes/chatbot/chatboxcomponents/ChatBox.tsx | 69 +++------------------- src/client/views/nodes/chatbot/types/types.ts | 3 +- .../views/nodes/chatbot/vectorstore/Vectorstore.ts | 3 +- 3 files changed, 12 insertions(+), 63 deletions(-) (limited to 'src/client/views/nodes/chatbot/vectorstore') diff --git a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx index 35dbee3e9..b11bf7405 100644 --- a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx +++ b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx @@ -593,7 +593,6 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { }); // Try to find the document - const linkedDocs = this.linkedDocs; let doc: Doc | undefined; // First try to find the document using the document manager's chunk ID lookup @@ -603,25 +602,6 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { console.log(`Found document by chunk ID lookup: ${parentDocId}`); } - // If not found, fall back to searching through linked docs (maintains compatibility) - if (!doc) { - for (const linkedDoc of linkedDocs) { - if (linkedDoc.chunk_simpl) { - try { - const docChunkSimpl = JSON.parse(StrCast(linkedDoc.chunk_simpl)) as { chunks: SimplifiedChunk[] }; - const foundChunk = docChunkSimpl.chunks.find(chunk => chunk.chunkId === chunkId); - if (foundChunk) { - doc = linkedDoc; - console.log(`Found document by iterating through linked docs`); - break; - } - } catch (e) { - console.error(`Error parsing chunk_simpl for doc ${linkedDoc.id}:`, e); - } - } - } - } - if (!doc) { console.warn(`Document not found for citation with chunk_id: ${chunkId}`); return; @@ -641,29 +621,16 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { // Handle different chunk types if (foundChunk) { console.log(`Found chunk in document:`, foundChunk); - - // Handle video chunks - if (foundChunk.chunkType === CHUNK_TYPE.VIDEO) { - if (foundChunk.start_time !== undefined) { - await this.goToMediaTimestamp(doc, foundChunk.start_time, 'video'); + if (foundChunk.chunkType === CHUNK_TYPE.AUDIO || foundChunk.chunkType === CHUNK_TYPE.VIDEO) { + const directMatchSegmentStart = this.getDirectMatchingSegmentStart(doc, citation.direct_text || '', foundChunk.indexes || []); + if (directMatchSegmentStart) { + await this.goToMediaTimestamp(doc, directMatchSegmentStart, foundChunk.chunkType); } else { - console.warn('Video chunk missing start_time:', foundChunk); + console.error('No direct matching segment found for the citation.'); } - } - // Handle audio chunks - note that we're using string comparison since 'audio' isn't in CHUNK_TYPE enum - else if (String(foundChunk.chunkType).toLowerCase() === 'audio') { - if (foundChunk.start_time !== undefined) { - await this.goToMediaTimestamp(doc, foundChunk.start_time, 'audio'); - } else { - console.warn('Audio chunk missing start_time:', foundChunk); - } - } - // Handle table or image chunks - else if (foundChunk.chunkType === CHUNK_TYPE.TABLE || foundChunk.chunkType === CHUNK_TYPE.IMAGE) { + } else if (foundChunk.chunkType === CHUNK_TYPE.TABLE || foundChunk.chunkType === CHUNK_TYPE.IMAGE) { this.handleOtherChunkTypes(foundChunk, citation, doc); - } - // Handle text chunks - else if (foundChunk.chunkType === CHUNK_TYPE.TEXT) { + } else if (foundChunk.chunkType === CHUNK_TYPE.TEXT) { // Find text from the document's chunks metadata let chunkText = ''; @@ -691,33 +658,15 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { // Handle URL chunks else if (foundChunk.chunkType === CHUNK_TYPE.URL) { if (foundChunk.url) { - // Instead of opening the URL in a new window, show the document in the viewer DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {}); console.log(`Navigated to web document with URL: ${foundChunk.url}`); } else { console.warn('URL chunk missing URL:', foundChunk); } } - } else if (doc?.original_segments) { - // Handle original segments for media files - let original_segments: any[] = []; - try { - original_segments = JSON.parse(StrCast(doc.original_segments)); - } catch (e) { - console.error(`Error parsing original_segments:`, e); - return; - } - - // Check if there's direct text to find in the segments - if (citation.direct_text) { - // Find the segment that contains the direct text - const start = this.getDirectMatchingSegmentStart(doc, citation.direct_text, []); - if (start !== -1) { - await this.goToMediaTimestamp(doc, start, doc.ai_type === 'audio' ? 'audio' : 'video'); - } - } } else { - console.warn('Unable to find chunk or segments for citation', citation); + console.warn('Navigating to doc. Unable to find chunk or segments for citation', citation); + DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {}); } } catch (error) { console.error('Error handling citation click:', error); diff --git a/src/client/views/nodes/chatbot/types/types.ts b/src/client/views/nodes/chatbot/types/types.ts index dcb132ec7..90b5e7e11 100644 --- a/src/client/views/nodes/chatbot/types/types.ts +++ b/src/client/views/nodes/chatbot/types/types.ts @@ -15,8 +15,9 @@ export enum CHUNK_TYPE { TABLE = 'table', URL = 'url', CSV = 'CSV', - MEDIA = 'media', + //MEDIA = 'media', VIDEO = 'video', + AUDIO = 'audio', } export enum PROCESSING_TYPE { diff --git a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts index 4bb61d8b2..4512ae3e6 100644 --- a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts +++ b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts @@ -137,7 +137,6 @@ export class Vectorstore { }); doc.original_segments = JSON.stringify(typedResponse.full); - doc.ai_type = local_file_path.endsWith('.mp3') ? 'audio' : 'video'; const doc_id = uuidv4(); // Register the document with the AgentDocumentManager @@ -167,7 +166,7 @@ export class Vectorstore { start_time: chunk.start, end_time: chunk.end, text: chunk.text, - type: CHUNK_TYPE.VIDEO, + type: local_file_path.endsWith('.mp3') ? CHUNK_TYPE.AUDIO : CHUNK_TYPE.VIDEO, }, })), type: 'media', -- cgit v1.2.3-70-g09d2 From 393b7f8286422c933102449eba1ba82874a48896 Mon Sep 17 00:00:00 2001 From: "A.J. Shulman" Date: Sun, 27 Apr 2025 14:57:39 -0400 Subject: improved consistency across doc types and parsing --- src/client/documents/Documents.ts | 1 + .../views/nodes/chatbot/agentsystem/Agent.ts | 15 +- .../nodes/chatbot/chatboxcomponents/ChatBox.tsx | 176 +++++++++------- .../chatbot/chatboxcomponents/ProgressBar.scss | 40 +++- .../nodes/chatbot/utils/AgentDocumentManager.ts | 234 ++++++++++++++++++++- .../views/nodes/chatbot/vectorstore/Vectorstore.ts | 49 ++--- 6 files changed, 390 insertions(+), 125 deletions(-) (limited to 'src/client/views/nodes/chatbot/vectorstore') diff --git a/src/client/documents/Documents.ts b/src/client/documents/Documents.ts index 317bb7feb..f87bd7092 100644 --- a/src/client/documents/Documents.ts +++ b/src/client/documents/Documents.ts @@ -273,6 +273,7 @@ export class DocumentOptions { _layout_reflowHorizontal?: BOOLt = new BoolInfo('permit horizontal resizing with content reflow'); _layout_noSidebar?: BOOLt = new BoolInfo('whether to display the sidebar toggle button'); layout_boxShadow?: string; // box-shadow css string OR "standard" to use dash standard box shadow + _iframe_sandbox?: STRt = new StrInfo('sandbox attributes for iframes in web documents (e.g., allow-scripts, allow-same-origin)'); layout_maxShown?: NUMt = new NumInfo('maximum number of children to display at one time (see multicolumnview)'); _layout_columnWidth?: NUMt = new NumInfo('width of table column', false); _layout_columnCount?: NUMt = new NumInfo('number of columns in a masonry view'); diff --git a/src/client/views/nodes/chatbot/agentsystem/Agent.ts b/src/client/views/nodes/chatbot/agentsystem/Agent.ts index 80fdb6533..24471bf5b 100644 --- a/src/client/views/nodes/chatbot/agentsystem/Agent.ts +++ b/src/client/views/nodes/chatbot/agentsystem/Agent.ts @@ -41,7 +41,6 @@ export class Agent { private interMessages: AgentMessage[] = []; private vectorstore: Vectorstore; private _history: () => string; - private _summaries: () => string; private _csvData: () => { filename: string; id: string; text: string }[]; private actionNumber: number = 0; private thoughtNumber: number = 0; @@ -54,11 +53,13 @@ export class Agent { /** * The constructor initializes the agent with the vector store and toolset, and sets up the OpenAI client. * @param _vectorstore Vector store instance for document storage and retrieval. - * @param summaries A function to retrieve document summaries. + * @param summaries A function to retrieve document summaries (deprecated, now using docManager directly). * @param history A function to retrieve chat history. * @param csvData A function to retrieve CSV data linked to the assistant. - * @param addLinkedUrlDoc A function to add a linked document from a URL. + * @param getLinkedUrlDocId A function to get document IDs from URLs. + * @param createImage A function to create images in the dashboard. * @param createCSVInDash A function to create a CSV document in the dashboard. + * @param docManager The document manager instance. */ constructor( _vectorstore: Vectorstore, @@ -74,7 +75,6 @@ export class Agent { this.client = new OpenAI({ apiKey: process.env.OPENAI_KEY, dangerouslyAllowBrowser: true }); this.vectorstore = _vectorstore; this._history = history; - this._summaries = summaries; this._csvData = csvData; this._docManager = docManager; @@ -124,7 +124,12 @@ export class Agent { // Retrieve chat history and generate system prompt const chatHistory = this._history(); - const systemPrompt = getReactPrompt(Object.values(this.tools), this._summaries, chatHistory); + // Get document summaries directly from document manager + const documentSummaries = this._docManager.getAllDocumentSummaries(); + // Create a function that returns document summaries for the prompt + const getSummaries = () => documentSummaries; + // Generate the system prompt with the summaries + const systemPrompt = getReactPrompt(Object.values(this.tools), getSummaries, chatHistory); // Initialize intermediate messages this.interMessages = [{ role: 'system', content: systemPrompt }]; diff --git a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx index b11bf7405..ba30cb42b 100644 --- a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx +++ b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx @@ -18,7 +18,7 @@ import { Doc, DocListCast, Opt } from '../../../../../fields/Doc'; import { DocData, DocViews } from '../../../../../fields/DocSymbols'; import { RichTextField } from '../../../../../fields/RichTextField'; import { ScriptField } from '../../../../../fields/ScriptField'; -import { CsvCast, DocCast, NumCast, PDFCast, RTFCast, StrCast } from '../../../../../fields/Types'; +import { CsvCast, DocCast, NumCast, PDFCast, RTFCast, StrCast, VideoCast, AudioCast } from '../../../../../fields/Types'; import { DocUtils } from '../../../../documents/DocUtils'; import { CollectionViewType, DocumentType } from '../../../../documents/DocumentTypes'; import { Docs, DocumentOptions } from '../../../../documents/Documents'; @@ -48,7 +48,14 @@ import { AgentDocumentManager } from '../utils/AgentDocumentManager'; dotenv.config(); -export type parsedDocData = { doc_type: string; data: unknown }; +export type parsedDocData = { + doc_type: string; + data: unknown; + _disable_resource_loading?: boolean; + _sandbox_iframe?: boolean; + _iframe_sandbox?: string; + data_useCors?: boolean; +}; export type parsedDoc = DocumentOptions & parsedDocData; /** * ChatBox is the main class responsible for managing the interaction between the user and the assistant, @@ -150,7 +157,14 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { @action addDocToVectorstore = async (newLinkedDoc: Doc) => { try { - this._isUploadingDocs = true; + const isAudioOrVideo = VideoCast(newLinkedDoc.data)?.url?.pathname || AudioCast(newLinkedDoc.data)?.url?.pathname; + + // Set UI state to show the processing overlay + runInAction(() => { + this._isUploadingDocs = true; + this._uploadProgress = 0; + this._currentStep = isAudioOrVideo ? 'Preparing media file...' : 'Processing document...'; + }); // Process the document first to ensure it has a valid ID this.docManager.processDocument(newLinkedDoc); @@ -158,15 +172,36 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { // Add the document to the vectorstore which will also register chunks await this.vectorstore.addAIDoc(newLinkedDoc, this.updateProgress); - // No longer needed as documents are tracked by the AgentDocumentManager - // this._linked_docs_to_add.add(newLinkedDoc); + // Give a slight delay to show the completion message + if (this._uploadProgress === 100) { + await new Promise(resolve => setTimeout(resolve, 1000)); + } - this._isUploadingDocs = false; + // Reset UI state + runInAction(() => { + this._isUploadingDocs = false; + this._uploadProgress = 0; + this._currentStep = ''; + }); return true; } catch (err) { console.error('Error adding document to vectorstore:', err); - this._isUploadingDocs = false; + + // Show error in UI + runInAction(() => { + this._currentStep = `Error: ${err instanceof Error ? err.message : 'Failed to process document'}`; + }); + + await new Promise(resolve => setTimeout(resolve, 2000)); + + // Reset UI state + runInAction(() => { + this._isUploadingDocs = false; + this._uploadProgress = 0; + this._currentStep = ''; + }); + return false; } }; @@ -178,8 +213,15 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { */ @action updateProgress = (progress: number, step: string) => { - this._uploadProgress = progress; + // Ensure progress is within expected bounds + const validProgress = Math.min(Math.max(0, progress), 100); + this._uploadProgress = validProgress; this._currentStep = step; + + // Force UI update + if (process.env.NODE_ENV !== 'production') { + console.log(`Progress: ${validProgress}%, Step: ${step}`); + } }; /** @@ -453,7 +495,19 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { case supportedDocTypes.image: return Docs.Create.ImageDocument(data as string, options); case supportedDocTypes.equation: return Docs.Create.EquationDocument(data as string, options); case supportedDocTypes.notetaking: return Docs.Create.NoteTakingDocument([], options); - case supportedDocTypes.web: return Docs.Create.WebDocument(data as string, { ...options, data_useCors: true }); + case supportedDocTypes.web: + // Create web document with enhanced safety options + const webOptions = { + ...options, + data_useCors: true + }; + + // If iframe_sandbox was passed from AgentDocumentManager, add it to the options + if ('_iframe_sandbox' in options) { + (webOptions as any)._iframe_sandbox = options._iframe_sandbox; + } + + return Docs.Create.WebDocument(data as string, webOptions); case supportedDocTypes.dataviz: return Docs.Create.DataVizDocument('/users/rz/Downloads/addresses.csv', options); case supportedDocTypes.pdf: return Docs.Create.PdfDocument(data as string, options); case supportedDocTypes.video: return Docs.Create.VideoDocument(data as string, options); @@ -607,65 +661,36 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { return; } - // Process the chunk data - let docChunkSimpl: { chunks: SimplifiedChunk[] } = { chunks: [] }; - try { - docChunkSimpl = JSON.parse(StrCast(doc.chunk_simpl) || '{"chunks":[]}'); - } catch (e) { - console.error(`Error parsing chunk_simpl for the found document:`, e); + // Get the simplified chunk using the document manager + const foundChunk = this.docManager.getSimplifiedChunkById(doc, chunkId); + if (!foundChunk) { + console.warn(`Chunk not found in document for chunk ID: ${chunkId}`); + DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {}); return; } - const foundChunk = docChunkSimpl.chunks.find((chunk: SimplifiedChunk) => chunk.chunkId === chunkId); + console.log(`Found chunk in document:`, foundChunk); // Handle different chunk types - if (foundChunk) { - console.log(`Found chunk in document:`, foundChunk); - if (foundChunk.chunkType === CHUNK_TYPE.AUDIO || foundChunk.chunkType === CHUNK_TYPE.VIDEO) { - const directMatchSegmentStart = this.getDirectMatchingSegmentStart(doc, citation.direct_text || '', foundChunk.indexes || []); - if (directMatchSegmentStart) { - await this.goToMediaTimestamp(doc, directMatchSegmentStart, foundChunk.chunkType); - } else { - console.error('No direct matching segment found for the citation.'); - } - } else if (foundChunk.chunkType === CHUNK_TYPE.TABLE || foundChunk.chunkType === CHUNK_TYPE.IMAGE) { - this.handleOtherChunkTypes(foundChunk, citation, doc); - } else if (foundChunk.chunkType === CHUNK_TYPE.TEXT) { - // Find text from the document's chunks metadata - let chunkText = ''; - - try { - // We already parsed the chunks earlier, so use that - const matchingChunk = docChunkSimpl.chunks.find(c => c.chunkId === foundChunk.chunkId); - if (matchingChunk && 'text' in matchingChunk) { - // If the text property exists on the chunk (even though it's not in the type) - chunkText = String(matchingChunk['text'] || ''); - } - } catch (e) { - console.error('Error getting chunk text:', e); - } - - // Default text if none found - if (!chunkText) { - chunkText = 'Text content not available'; - } - - this._citationPopup = { - text: chunkText, - visible: true, - }; - } - // Handle URL chunks - else if (foundChunk.chunkType === CHUNK_TYPE.URL) { - if (foundChunk.url) { - DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {}); - console.log(`Navigated to web document with URL: ${foundChunk.url}`); - } else { - console.warn('URL chunk missing URL:', foundChunk); - } + if (foundChunk.chunkType === CHUNK_TYPE.AUDIO || foundChunk.chunkType === CHUNK_TYPE.VIDEO) { + const directMatchSegmentStart = this.getDirectMatchingSegmentStart(doc, citation.direct_text || '', foundChunk.indexes || []); + if (directMatchSegmentStart) { + await this.goToMediaTimestamp(doc, directMatchSegmentStart, foundChunk.chunkType); + } else { + console.error('No direct matching segment found for the citation.'); } + } else if (foundChunk.chunkType === CHUNK_TYPE.TABLE || foundChunk.chunkType === CHUNK_TYPE.IMAGE) { + this.handleOtherChunkTypes(foundChunk, citation, doc); } else { - console.warn('Navigating to doc. Unable to find chunk or segments for citation', citation); + // Show the chunk text in citation popup + let chunkText = foundChunk.text || 'Text content not available'; + + this._citationPopup = { + text: chunkText, + visible: true, + }; + + // Also navigate to the document DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {}); } } catch (error) { @@ -683,8 +708,8 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { getDirectMatchingSegmentStart = (doc: Doc, citationText: string, indexesOfSegments: string[]): number => { if (!doc || !citationText) return -1; - // Get original segments from the document - const original_segments = doc.original_segments ? JSON.parse(StrCast(doc.original_segments)) : []; + // Get original segments using document manager + const original_segments = this.docManager.getOriginalSegments(doc); if (!original_segments || !Array.isArray(original_segments) || original_segments.length === 0) { return -1; @@ -993,18 +1018,8 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { */ @computed get summaries(): string { - const linkedDocs = Array.from(this.docManager.listDocs()) - .map(id => { - const doc = this.docManager.extractDocumentMetadata(id); - if (doc && doc.fields && (doc.fields.layout.summary || doc.fields.data.summary)) { - return doc.fields.layout.summary || doc.fields.data.summary; - } - return null; - }) - .filter(Boolean) - .join('\n\n'); - - return linkedDocs; + // Use the document manager to get all summaries + return this.docManager.getAllDocumentSummaries(); } /** @@ -1033,7 +1048,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { // Other helper methods for retrieving document data and processing retrieveSummaries = (): string => { - return this.summaries; + return this.docManager.getAllDocumentSummaries(); }; retrieveCSVData = () => { @@ -1068,8 +1083,13 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { {this._isUploadingDocs && (
- -
{this._currentStep}
+
+
+
+
+
{Math.round(this._uploadProgress)}%
+
{this._currentStep}
+
)} diff --git a/src/client/views/nodes/chatbot/chatboxcomponents/ProgressBar.scss b/src/client/views/nodes/chatbot/chatboxcomponents/ProgressBar.scss index ff5be4a38..3a8334695 100644 --- a/src/client/views/nodes/chatbot/chatboxcomponents/ProgressBar.scss +++ b/src/client/views/nodes/chatbot/chatboxcomponents/ProgressBar.scss @@ -58,12 +58,48 @@ flex-direction: column; align-items: center; text-align: center; + width: 80%; + max-width: 400px; + background-color: white; + padding: 20px; + border-radius: 8px; + box-shadow: 0 4px 12px rgba(0, 0, 0, 0.15); } -.step-name { +.progress-bar-wrapper { + width: 100%; + height: 12px; + background-color: #e0e0e0; + border-radius: 6px; + overflow: hidden; + margin-bottom: 10px; +} + +.progress-bar { + height: 100%; + background-color: #4a90e2; + border-radius: 6px; + transition: width 0.5s ease; +} + +.progress-details { + display: flex; + flex-direction: column; + align-items: center; + width: 100%; +} + +.progress-percentage { font-size: 18px; + font-weight: bold; color: #333; + margin-bottom: 5px; +} + +.step-name { + font-size: 16px; + color: #666; text-align: center; width: 100%; - margin-top: -10px; // Adjust to move the text closer to the spinner + margin-top: 5px; } diff --git a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts index c3beebcde..cff8380db 100644 --- a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts +++ b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts @@ -14,6 +14,8 @@ import { parsedDoc } from '../chatboxcomponents/ChatBox'; import { faThumbTackSlash } from '@fortawesome/free-solid-svg-icons'; import { DocumentManager } from '../../../../util/DocumentManager'; import { DocumentView } from '../../DocumentView'; +import { RAGChunk, CHUNK_TYPE } from '../types/types'; +import { runInAction } from 'mobx'; /** * Interface representing a document in the freeform view @@ -869,20 +871,43 @@ export class AgentDocumentManager { _layout_autoHeight: true, }; - // Use the chatBox's createDocInDash method to create and link the document + // Additional handling for web documents + if (docType === 'web') { + // For web documents, don't sanitize the URL here + // Instead, set properties to handle content safely when loaded + simpleDoc._disable_resource_loading = true; + simpleDoc._sandbox_iframe = true; + simpleDoc.data_useCors = true; + + // Specify a more permissive sandbox to allow content to render properly + // but still maintain security + simpleDoc._iframe_sandbox = 'allow-same-origin allow-scripts allow-popups allow-forms'; + } + + // Use the chatBox's createDocInDash method to create the document if (!this.chatBox) { throw new Error('ChatBox instance not available for creating document'); } - const linkAndShowDoc = (doc: Opt) => { - if (doc) { - LinkManager.Instance.addLink(Docs.Create.LinkDocument(this.chatBoxDocument!, doc)); - this.chatBox._props.addDocument?.(doc); - DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {}); - } - }; + const doc = this.chatBox.whichDoc(simpleDoc, false); if (doc) { - linkAndShowDoc(doc); + // Use MobX runInAction to properly modify observable state + runInAction(() => { + if (this.chatBoxDocument && doc) { + // Create link and add it to the document system + const linkDoc = Docs.Create.LinkDocument(this.chatBoxDocument, doc); + LinkManager.Instance.addLink(linkDoc); + + // Add document to view + this.chatBox._props.addDocument?.(doc); + + // Show document - defer actual display to prevent immediate resource loading + setTimeout(() => { + DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {}); + }, 100); + } + }); + const id = this.processDocument(doc); return id; } else { @@ -893,6 +918,62 @@ export class AgentDocumentManager { } } + /** + * Sanitizes web content to prevent errors with external resources + * @param content The web content to sanitize + * @returns Sanitized content + */ + private sanitizeWebContent(content: string): string { + if (!content) return content; + + try { + // Replace problematic resource references that might cause errors + const sanitized = content + // Remove preload links that might cause errors + .replace(/]*rel=["']preload["'][^>]*>/gi, '') + // Remove map file references + .replace(/\/\/# sourceMappingURL=.*\.map/gi, '') + // Remove external CSS map files references + .replace(/\/\*# sourceMappingURL=.*\.css\.map.*\*\//gi, '') + // Add sandbox to iframes + .replace(/