diff options
author | A.J. Shulman <Shulman.aj@gmail.com> | 2025-04-27 14:57:39 -0400 |
---|---|---|
committer | A.J. Shulman <Shulman.aj@gmail.com> | 2025-04-27 14:57:39 -0400 |
commit | 393b7f8286422c933102449eba1ba82874a48896 (patch) | |
tree | c34cd5dffc7306a66fcfe54c81d8656c341facb9 /src | |
parent | 67a7996278ce176e227393fa410e7afc80228a83 (diff) |
improved consistency across doc types and parsing
Diffstat (limited to 'src')
6 files changed, 390 insertions, 125 deletions
diff --git a/src/client/documents/Documents.ts b/src/client/documents/Documents.ts index 317bb7feb..f87bd7092 100644 --- a/src/client/documents/Documents.ts +++ b/src/client/documents/Documents.ts @@ -273,6 +273,7 @@ export class DocumentOptions { _layout_reflowHorizontal?: BOOLt = new BoolInfo('permit horizontal resizing with content reflow'); _layout_noSidebar?: BOOLt = new BoolInfo('whether to display the sidebar toggle button'); layout_boxShadow?: string; // box-shadow css string OR "standard" to use dash standard box shadow + _iframe_sandbox?: STRt = new StrInfo('sandbox attributes for iframes in web documents (e.g., allow-scripts, allow-same-origin)'); layout_maxShown?: NUMt = new NumInfo('maximum number of children to display at one time (see multicolumnview)'); _layout_columnWidth?: NUMt = new NumInfo('width of table column', false); _layout_columnCount?: NUMt = new NumInfo('number of columns in a masonry view'); diff --git a/src/client/views/nodes/chatbot/agentsystem/Agent.ts b/src/client/views/nodes/chatbot/agentsystem/Agent.ts index 80fdb6533..24471bf5b 100644 --- a/src/client/views/nodes/chatbot/agentsystem/Agent.ts +++ b/src/client/views/nodes/chatbot/agentsystem/Agent.ts @@ -41,7 +41,6 @@ export class Agent { private interMessages: AgentMessage[] = []; private vectorstore: Vectorstore; private _history: () => string; - private _summaries: () => string; private _csvData: () => { filename: string; id: string; text: string }[]; private actionNumber: number = 0; private thoughtNumber: number = 0; @@ -54,11 +53,13 @@ export class Agent { /** * The constructor initializes the agent with the vector store and toolset, and sets up the OpenAI client. * @param _vectorstore Vector store instance for document storage and retrieval. - * @param summaries A function to retrieve document summaries. + * @param summaries A function to retrieve document summaries (deprecated, now using docManager directly). * @param history A function to retrieve chat history. 
* @param csvData A function to retrieve CSV data linked to the assistant. - * @param addLinkedUrlDoc A function to add a linked document from a URL. + * @param getLinkedUrlDocId A function to get document IDs from URLs. + * @param createImage A function to create images in the dashboard. * @param createCSVInDash A function to create a CSV document in the dashboard. + * @param docManager The document manager instance. */ constructor( _vectorstore: Vectorstore, @@ -74,7 +75,6 @@ export class Agent { this.client = new OpenAI({ apiKey: process.env.OPENAI_KEY, dangerouslyAllowBrowser: true }); this.vectorstore = _vectorstore; this._history = history; - this._summaries = summaries; this._csvData = csvData; this._docManager = docManager; @@ -124,7 +124,12 @@ export class Agent { // Retrieve chat history and generate system prompt const chatHistory = this._history(); - const systemPrompt = getReactPrompt(Object.values(this.tools), this._summaries, chatHistory); + // Get document summaries directly from document manager + const documentSummaries = this._docManager.getAllDocumentSummaries(); + // Create a function that returns document summaries for the prompt + const getSummaries = () => documentSummaries; + // Generate the system prompt with the summaries + const systemPrompt = getReactPrompt(Object.values(this.tools), getSummaries, chatHistory); // Initialize intermediate messages this.interMessages = [{ role: 'system', content: systemPrompt }]; diff --git a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx index b11bf7405..ba30cb42b 100644 --- a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx +++ b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx @@ -18,7 +18,7 @@ import { Doc, DocListCast, Opt } from '../../../../../fields/Doc'; import { DocData, DocViews } from '../../../../../fields/DocSymbols'; import { RichTextField } from '../../../../../fields/RichTextField'; import 
{ ScriptField } from '../../../../../fields/ScriptField'; -import { CsvCast, DocCast, NumCast, PDFCast, RTFCast, StrCast } from '../../../../../fields/Types'; +import { CsvCast, DocCast, NumCast, PDFCast, RTFCast, StrCast, VideoCast, AudioCast } from '../../../../../fields/Types'; import { DocUtils } from '../../../../documents/DocUtils'; import { CollectionViewType, DocumentType } from '../../../../documents/DocumentTypes'; import { Docs, DocumentOptions } from '../../../../documents/Documents'; @@ -48,7 +48,14 @@ import { AgentDocumentManager } from '../utils/AgentDocumentManager'; dotenv.config(); -export type parsedDocData = { doc_type: string; data: unknown }; +export type parsedDocData = { + doc_type: string; + data: unknown; + _disable_resource_loading?: boolean; + _sandbox_iframe?: boolean; + _iframe_sandbox?: string; + data_useCors?: boolean; +}; export type parsedDoc = DocumentOptions & parsedDocData; /** * ChatBox is the main class responsible for managing the interaction between the user and the assistant, @@ -150,7 +157,14 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { @action addDocToVectorstore = async (newLinkedDoc: Doc) => { try { - this._isUploadingDocs = true; + const isAudioOrVideo = VideoCast(newLinkedDoc.data)?.url?.pathname || AudioCast(newLinkedDoc.data)?.url?.pathname; + + // Set UI state to show the processing overlay + runInAction(() => { + this._isUploadingDocs = true; + this._uploadProgress = 0; + this._currentStep = isAudioOrVideo ? 'Preparing media file...' 
: 'Processing document...'; + }); // Process the document first to ensure it has a valid ID this.docManager.processDocument(newLinkedDoc); @@ -158,15 +172,36 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { // Add the document to the vectorstore which will also register chunks await this.vectorstore.addAIDoc(newLinkedDoc, this.updateProgress); - // No longer needed as documents are tracked by the AgentDocumentManager - // this._linked_docs_to_add.add(newLinkedDoc); + // Give a slight delay to show the completion message + if (this._uploadProgress === 100) { + await new Promise(resolve => setTimeout(resolve, 1000)); + } - this._isUploadingDocs = false; + // Reset UI state + runInAction(() => { + this._isUploadingDocs = false; + this._uploadProgress = 0; + this._currentStep = ''; + }); return true; } catch (err) { console.error('Error adding document to vectorstore:', err); - this._isUploadingDocs = false; + + // Show error in UI + runInAction(() => { + this._currentStep = `Error: ${err instanceof Error ? 
err.message : 'Failed to process document'}`; + }); + + await new Promise(resolve => setTimeout(resolve, 2000)); + + // Reset UI state + runInAction(() => { + this._isUploadingDocs = false; + this._uploadProgress = 0; + this._currentStep = ''; + }); + return false; } }; @@ -178,8 +213,15 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { */ @action updateProgress = (progress: number, step: string) => { - this._uploadProgress = progress; + // Ensure progress is within expected bounds + const validProgress = Math.min(Math.max(0, progress), 100); + this._uploadProgress = validProgress; this._currentStep = step; + + // Force UI update + if (process.env.NODE_ENV !== 'production') { + console.log(`Progress: ${validProgress}%, Step: ${step}`); + } }; /** @@ -453,7 +495,19 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { case supportedDocTypes.image: return Docs.Create.ImageDocument(data as string, options); case supportedDocTypes.equation: return Docs.Create.EquationDocument(data as string, options); case supportedDocTypes.notetaking: return Docs.Create.NoteTakingDocument([], options); - case supportedDocTypes.web: return Docs.Create.WebDocument(data as string, { ...options, data_useCors: true }); + case supportedDocTypes.web: + // Create web document with enhanced safety options + const webOptions = { + ...options, + data_useCors: true + }; + + // If iframe_sandbox was passed from AgentDocumentManager, add it to the options + if ('_iframe_sandbox' in options) { + (webOptions as any)._iframe_sandbox = options._iframe_sandbox; + } + + return Docs.Create.WebDocument(data as string, webOptions); case supportedDocTypes.dataviz: return Docs.Create.DataVizDocument('/users/rz/Downloads/addresses.csv', options); case supportedDocTypes.pdf: return Docs.Create.PdfDocument(data as string, options); case supportedDocTypes.video: return Docs.Create.VideoDocument(data as string, options); @@ -607,65 +661,36 @@ export class 
ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { return; } - // Process the chunk data - let docChunkSimpl: { chunks: SimplifiedChunk[] } = { chunks: [] }; - try { - docChunkSimpl = JSON.parse(StrCast(doc.chunk_simpl) || '{"chunks":[]}'); - } catch (e) { - console.error(`Error parsing chunk_simpl for the found document:`, e); + // Get the simplified chunk using the document manager + const foundChunk = this.docManager.getSimplifiedChunkById(doc, chunkId); + if (!foundChunk) { + console.warn(`Chunk not found in document for chunk ID: ${chunkId}`); + DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {}); return; } - const foundChunk = docChunkSimpl.chunks.find((chunk: SimplifiedChunk) => chunk.chunkId === chunkId); + console.log(`Found chunk in document:`, foundChunk); // Handle different chunk types - if (foundChunk) { - console.log(`Found chunk in document:`, foundChunk); - if (foundChunk.chunkType === CHUNK_TYPE.AUDIO || foundChunk.chunkType === CHUNK_TYPE.VIDEO) { - const directMatchSegmentStart = this.getDirectMatchingSegmentStart(doc, citation.direct_text || '', foundChunk.indexes || []); - if (directMatchSegmentStart) { - await this.goToMediaTimestamp(doc, directMatchSegmentStart, foundChunk.chunkType); - } else { - console.error('No direct matching segment found for the citation.'); - } - } else if (foundChunk.chunkType === CHUNK_TYPE.TABLE || foundChunk.chunkType === CHUNK_TYPE.IMAGE) { - this.handleOtherChunkTypes(foundChunk, citation, doc); - } else if (foundChunk.chunkType === CHUNK_TYPE.TEXT) { - // Find text from the document's chunks metadata - let chunkText = ''; - - try { - // We already parsed the chunks earlier, so use that - const matchingChunk = docChunkSimpl.chunks.find(c => c.chunkId === foundChunk.chunkId); - if (matchingChunk && 'text' in matchingChunk) { - // If the text property exists on the chunk (even though it's not in the type) - chunkText = String(matchingChunk['text'] || ''); - } - } catch 
(e) { - console.error('Error getting chunk text:', e); - } - - // Default text if none found - if (!chunkText) { - chunkText = 'Text content not available'; - } - - this._citationPopup = { - text: chunkText, - visible: true, - }; - } - // Handle URL chunks - else if (foundChunk.chunkType === CHUNK_TYPE.URL) { - if (foundChunk.url) { - DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {}); - console.log(`Navigated to web document with URL: ${foundChunk.url}`); - } else { - console.warn('URL chunk missing URL:', foundChunk); - } + if (foundChunk.chunkType === CHUNK_TYPE.AUDIO || foundChunk.chunkType === CHUNK_TYPE.VIDEO) { + const directMatchSegmentStart = this.getDirectMatchingSegmentStart(doc, citation.direct_text || '', foundChunk.indexes || []); + if (directMatchSegmentStart) { + await this.goToMediaTimestamp(doc, directMatchSegmentStart, foundChunk.chunkType); + } else { + console.error('No direct matching segment found for the citation.'); } + } else if (foundChunk.chunkType === CHUNK_TYPE.TABLE || foundChunk.chunkType === CHUNK_TYPE.IMAGE) { + this.handleOtherChunkTypes(foundChunk, citation, doc); } else { - console.warn('Navigating to doc. Unable to find chunk or segments for citation', citation); + // Show the chunk text in citation popup + let chunkText = foundChunk.text || 'Text content not available'; + + this._citationPopup = { + text: chunkText, + visible: true, + }; + + // Also navigate to the document DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {}); } } catch (error) { @@ -683,8 +708,8 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { getDirectMatchingSegmentStart = (doc: Doc, citationText: string, indexesOfSegments: string[]): number => { if (!doc || !citationText) return -1; - // Get original segments from the document - const original_segments = doc.original_segments ? 
JSON.parse(StrCast(doc.original_segments)) : []; + // Get original segments using document manager + const original_segments = this.docManager.getOriginalSegments(doc); if (!original_segments || !Array.isArray(original_segments) || original_segments.length === 0) { return -1; @@ -993,18 +1018,8 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { */ @computed get summaries(): string { - const linkedDocs = Array.from(this.docManager.listDocs()) - .map(id => { - const doc = this.docManager.extractDocumentMetadata(id); - if (doc && doc.fields && (doc.fields.layout.summary || doc.fields.data.summary)) { - return doc.fields.layout.summary || doc.fields.data.summary; - } - return null; - }) - .filter(Boolean) - .join('\n\n'); - - return linkedDocs; + // Use the document manager to get all summaries + return this.docManager.getAllDocumentSummaries(); } /** @@ -1033,7 +1048,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { // Other helper methods for retrieving document data and processing retrieveSummaries = (): string => { - return this.summaries; + return this.docManager.getAllDocumentSummaries(); }; retrieveCSVData = () => { @@ -1068,8 +1083,13 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { {this._isUploadingDocs && ( <div className="uploading-overlay"> <div className="progress-container"> - <ProgressBar /> - <div className="step-name">{this._currentStep}</div> + <div className="progress-bar-wrapper"> + <div className="progress-bar" style={{ width: `${this._uploadProgress}%` }} /> + </div> + <div className="progress-details"> + <div className="progress-percentage">{Math.round(this._uploadProgress)}%</div> + <div className="step-name">{this._currentStep}</div> + </div> </div> </div> )} diff --git a/src/client/views/nodes/chatbot/chatboxcomponents/ProgressBar.scss b/src/client/views/nodes/chatbot/chatboxcomponents/ProgressBar.scss index ff5be4a38..3a8334695 100644 --- 
a/src/client/views/nodes/chatbot/chatboxcomponents/ProgressBar.scss +++ b/src/client/views/nodes/chatbot/chatboxcomponents/ProgressBar.scss @@ -58,12 +58,48 @@ flex-direction: column; align-items: center; text-align: center; + width: 80%; + max-width: 400px; + background-color: white; + padding: 20px; + border-radius: 8px; + box-shadow: 0 4px 12px rgba(0, 0, 0, 0.15); } -.step-name { +.progress-bar-wrapper { + width: 100%; + height: 12px; + background-color: #e0e0e0; + border-radius: 6px; + overflow: hidden; + margin-bottom: 10px; +} + +.progress-bar { + height: 100%; + background-color: #4a90e2; + border-radius: 6px; + transition: width 0.5s ease; +} + +.progress-details { + display: flex; + flex-direction: column; + align-items: center; + width: 100%; +} + +.progress-percentage { font-size: 18px; + font-weight: bold; color: #333; + margin-bottom: 5px; +} + +.step-name { + font-size: 16px; + color: #666; text-align: center; width: 100%; - margin-top: -10px; // Adjust to move the text closer to the spinner + margin-top: 5px; } diff --git a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts index c3beebcde..cff8380db 100644 --- a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts +++ b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts @@ -14,6 +14,8 @@ import { parsedDoc } from '../chatboxcomponents/ChatBox'; import { faThumbTackSlash } from '@fortawesome/free-solid-svg-icons'; import { DocumentManager } from '../../../../util/DocumentManager'; import { DocumentView } from '../../DocumentView'; +import { RAGChunk, CHUNK_TYPE } from '../types/types'; +import { runInAction } from 'mobx'; /** * Interface representing a document in the freeform view @@ -869,20 +871,43 @@ export class AgentDocumentManager { _layout_autoHeight: true, }; - // Use the chatBox's createDocInDash method to create and link the document + // Additional handling for web documents + if (docType === 'web') 
{ + // For web documents, don't sanitize the URL here + // Instead, set properties to handle content safely when loaded + simpleDoc._disable_resource_loading = true; + simpleDoc._sandbox_iframe = true; + simpleDoc.data_useCors = true; + + // Specify a more permissive sandbox to allow content to render properly + // but still maintain security + simpleDoc._iframe_sandbox = 'allow-same-origin allow-scripts allow-popups allow-forms'; + } + + // Use the chatBox's createDocInDash method to create the document if (!this.chatBox) { throw new Error('ChatBox instance not available for creating document'); } - const linkAndShowDoc = (doc: Opt<Doc>) => { - if (doc) { - LinkManager.Instance.addLink(Docs.Create.LinkDocument(this.chatBoxDocument!, doc)); - this.chatBox._props.addDocument?.(doc); - DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {}); - } - }; + const doc = this.chatBox.whichDoc(simpleDoc, false); if (doc) { - linkAndShowDoc(doc); + // Use MobX runInAction to properly modify observable state + runInAction(() => { + if (this.chatBoxDocument && doc) { + // Create link and add it to the document system + const linkDoc = Docs.Create.LinkDocument(this.chatBoxDocument, doc); + LinkManager.Instance.addLink(linkDoc); + + // Add document to view + this.chatBox._props.addDocument?.(doc); + + // Show document - defer actual display to prevent immediate resource loading + setTimeout(() => { + DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {}); + }, 100); + } + }); + const id = this.processDocument(doc); return id; } else { @@ -893,6 +918,62 @@ export class AgentDocumentManager { } } + /** + * Sanitizes web content to prevent errors with external resources + * @param content The web content to sanitize + * @returns Sanitized content + */ + private sanitizeWebContent(content: string): string { + if (!content) return content; + + try { + // Replace problematic resource references that might cause errors + const sanitized 
= content + // Remove preload links that might cause errors + .replace(/<link[^>]*rel=["']preload["'][^>]*>/gi, '') + // Remove map file references + .replace(/\/\/# sourceMappingURL=.*\.map/gi, '') + // Remove external CSS map files references + .replace(/\/\*# sourceMappingURL=.*\.css\.map.*\*\//gi, '') + // Add sandbox to iframes + .replace(/<iframe/gi, '<iframe sandbox="allow-same-origin" loading="lazy"') + // Prevent automatic resource loading for images + .replace(/<img/gi, '<img loading="lazy"') + // Prevent automatic resource loading for scripts + .replace(/<script/gi, '<script type="text/disabled"') + // Handle invalid URIs by converting relative URLs to absolute ones + .replace(/href=["'](\/[^"']+)["']/gi, (match, p1) => { + // Only handle relative URLs starting with / + if (p1.startsWith('/')) { + return `href="#disabled-link"`; + } + return match; + }) + // Prevent automatic loading of CSS + .replace(/<link[^>]*rel=["']stylesheet["'][^>]*href=["']([^"']+)["']/gi, (match, href) => `<link rel="prefetch" data-original-href="${href}" />`); + + // Wrap the content in a sandboxed container + return ` + <div class="sandboxed-web-content"> + <style> + /* Override styles to prevent external resource loading */ + @font-face { font-family: 'disabled'; src: local('Arial'); } + * { font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, 'Open Sans', 'Helvetica Neue', sans-serif !important; } + img, iframe, frame, embed, object { max-width: 100%; } + </style> + ${sanitized} + </div>`; + } catch (e) { + console.warn('Error sanitizing web content:', e); + // Fall back to a safe container with the content as text + return ` + <div class="sandboxed-web-content"> + <p>Content could not be safely displayed. 
Raw content:</p> + <pre>${content.replace(/</g, '&lt;').replace(/>/g, '&gt;')}</pre> + </div>`; + } + } + public has(docId: string) { return this.documentsById.has(docId); } @@ -988,4 +1069,139 @@ export class AgentDocumentManager { } return undefined; } + + /** + * Adds simplified chunks to a document for citation handling + * @param doc The document to add simplified chunks to + * @param chunks Array of full RAG chunks to simplify + * @param docType The type of document (e.g., 'pdf', 'video', 'audio', etc.) + * @returns The updated document with simplified chunks + */ + public addSimplifiedChunks(doc: Doc, chunks: RAGChunk[], docType: string): Doc { + if (!doc) { + console.error('Cannot add simplified chunks to null document'); + return doc; + } + + // Initialize empty chunks array if not exists + if (!doc.chunk_simpl) { + doc.chunk_simpl = JSON.stringify({ chunks: [] }); + } + + // Create array of simplified chunks based on document type + const simplifiedChunks = chunks.map(chunk => { + // Common properties across all chunk types + const baseChunk = { + chunkId: chunk.id, + text: chunk.metadata.text, + doc_id: chunk.metadata.doc_id, + chunkType: chunk.metadata.type || CHUNK_TYPE.TEXT, + }; + + // Add type-specific properties + if (docType === 'video' || docType === 'audio') { + return { + ...baseChunk, + start_time: chunk.metadata.start_time, + end_time: chunk.metadata.end_time, + indexes: chunk.metadata.indexes, + chunkType: docType === 'video' ? 
CHUNK_TYPE.VIDEO : CHUNK_TYPE.AUDIO, + }; + } else if (docType === 'pdf') { + return { + ...baseChunk, + startPage: chunk.metadata.start_page, + endPage: chunk.metadata.end_page, + location: chunk.metadata.location, + }; + } else if (docType === 'csv') { + return { + ...baseChunk, + rowStart: (chunk.metadata as any).row_start, + rowEnd: (chunk.metadata as any).row_end, + colStart: (chunk.metadata as any).col_start, + colEnd: (chunk.metadata as any).col_end, + }; + } else { + // Default for other document types + return baseChunk; + } + }); + + // Update the document with all simplified chunks at once + doc.chunk_simpl = JSON.stringify({ chunks: simplifiedChunks }); + + return doc; + } + + /** + * Gets the simplified chunks from a document + * @param doc The document to get simplified chunks from + * @returns Array of simplified chunks or empty array if none exist + */ + public getSimplifiedChunks(doc: Doc): any[] { + if (!doc || !doc.chunk_simpl) { + return []; + } + + try { + const parsed = JSON.parse(StrCast(doc.chunk_simpl)); + return parsed.chunks || []; + } catch (e) { + console.error('Error parsing simplified chunks:', e); + return []; + } + } + + /** + * Gets a specific simplified chunk by ID + * @param doc The document containing chunks + * @param chunkId The ID of the chunk to retrieve + * @returns The simplified chunk if found, undefined otherwise + */ + public getSimplifiedChunkById(doc: Doc, chunkId: string): any | undefined { + const chunks = this.getSimplifiedChunks(doc); + return chunks.find(chunk => chunk.chunkId === chunkId); + } + + /** + * Gets the original segments from a media document + * @param doc The document containing original media segments + * @returns Array of media segments or empty array if none exist + */ + public getOriginalSegments(doc: Doc): any[] { + if (!doc || !doc.original_segments) { + return []; + } + + try { + return JSON.parse(StrCast(doc.original_segments)) || []; + } catch (e) { + console.error('Error parsing original 
segments:', e); + return []; + } + } + + /** + * Gets all document summaries combined into a single string + * @returns String containing all document summaries + */ + public getAllDocumentSummaries(): string { + const summaries = Array.from(this.documentsById.keys()) + .map(id => { + const doc = this.getDocument(id); + if (doc) { + // Try to get summary from either the document or its data document + const summary = doc.summary || (doc[DocData] && doc[DocData].summary); + if (summary) { + return StrCast(summary); + } + } + return null; + }) + .filter(Boolean) + .join('\n\n'); + + return summaries; + } } diff --git a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts index 4512ae3e6..4268c0180 100644 --- a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts +++ b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts @@ -103,7 +103,7 @@ export class Vectorstore { const local_file_path: string = CsvCast(doc.data)?.url?.pathname ?? PDFCast(doc.data)?.url?.pathname ?? VideoCast(doc.data)?.url?.pathname ?? AudioCast(doc.data)?.url?.pathname; if (!local_file_path) { - console.log('Invalid file path.'); + console.log('Not adding to vectorstore. Invalid file path for vectorstore addition.'); return; } @@ -112,7 +112,11 @@ export class Vectorstore { if (isAudioOrVideo) { console.log('Processing media file...'); + progressCallback(10, 'Preparing media file for transcription...'); + + // Post to processMediaFile endpoint to get the transcript const response = await Networking.PostToServer('/processMediaFile', { fileName: path.basename(local_file_path) }); + progressCallback(60, 'Transcription completed. Processing transcript...'); // Type assertion to handle the response properties const typedResponse = response as { @@ -135,6 +139,7 @@ export class Vectorstore { input: texts, encoding_format: 'float', }); + progressCallback(85, 'Embeddings generated. 
Finalizing document...'); doc.original_segments = JSON.stringify(typedResponse.full); const doc_id = uuidv4(); @@ -154,7 +159,7 @@ export class Vectorstore { purpose: '', file_name: local_file_path, num_pages: 0, - summary: '', + summary: summary, chunks: segmentedTranscript.map((chunk, index) => ({ id: chunkIds[index], // Use pre-generated chunk ID values: (embeddingsResponse.data as Embedding[])[index].embedding, // Assign embedding @@ -171,23 +176,17 @@ export class Vectorstore { })), type: 'media', }; + progressCallback(95, 'Adding document to vectorstore...'); } catch (error) { console.error('Error generating embeddings:', error); + doc.ai_document_status = 'ERROR'; throw new Error('Embedding generation failed'); } doc.segmented_transcript = JSON.stringify(segmentedTranscript); - // Simplify chunks for storage - ensure simplified chunks use EXACTLY the same IDs - const simplifiedChunks = result.chunks.map(chunk => ({ - chunkId: chunk.id, // Use the exact same ID as the full chunk - start_time: chunk.metadata.start_time, - end_time: chunk.metadata.end_time, - indexes: chunk.metadata.indexes, - chunkType: CHUNK_TYPE.VIDEO, - text: chunk.metadata.text, - doc_id: chunk.metadata.doc_id, // Include parent doc_id for completeness - })); - doc.chunk_simpl = JSON.stringify({ chunks: simplifiedChunks }); + // Use doc manager to add simplified chunks + const docType = local_file_path.endsWith('.mp3') ? 
'audio' : 'video'; + this.docManager.addSimplifiedChunks(doc, result.chunks, docType); } else { // Process regular document console.log('Processing regular document...'); @@ -220,30 +219,18 @@ export class Vectorstore { // Register chunks with the document manager this.docManager.registerChunkIds(result.doc_id, chunkIds); - if (!doc.chunk_simpl) { - doc.chunk_simpl = JSON.stringify({ chunks: [] }); - } + // Use doc manager to add simplified chunks - determine document type from file extension + const fileExt = path.extname(local_file_path).toLowerCase(); + const docType = fileExt === '.pdf' ? 'pdf' : fileExt === '.csv' ? 'csv' : 'text'; + this.docManager.addSimplifiedChunks(doc, result.chunks, docType); + doc.summary = result.summary; doc.ai_purpose = result.purpose; - - result.chunks.forEach((chunk: RAGChunk) => { - const chunkToAdd = { - chunkId: chunk.id, // Ensure we use the exact same ID - startPage: chunk.metadata.start_page, - endPage: chunk.metadata.end_page, - location: chunk.metadata.location, - chunkType: chunk.metadata.type as CHUNK_TYPE, - text: chunk.metadata.text, - doc_id: chunk.metadata.doc_id, // Include parent doc_id for consistency - }; - const new_chunk_simpl = JSON.parse(StrCast(doc.chunk_simpl)); - new_chunk_simpl.chunks = new_chunk_simpl.chunks.concat(chunkToAdd); - doc.chunk_simpl = JSON.stringify(new_chunk_simpl); - }); } // Index the document await this.indexDocument(result); + progressCallback(100, 'Document added successfully!'); // Preserve existing metadata updates if (!doc.vectorstore_id) { |