about | summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: A.J. Shulman <Shulman.aj@gmail.com> 2025-04-27 14:57:39 -0400
committer: A.J. Shulman <Shulman.aj@gmail.com> 2025-04-27 14:57:39 -0400
commit: 393b7f8286422c933102449eba1ba82874a48896 (patch)
tree: c34cd5dffc7306a66fcfe54c81d8656c341facb9 /src
parent: 67a7996278ce176e227393fa410e7afc80228a83 (diff)
improved consistency across doc types and parsing
Diffstat (limited to 'src')
-rw-r--r-- src/client/documents/Documents.ts | 1
-rw-r--r-- src/client/views/nodes/chatbot/agentsystem/Agent.ts | 15
-rw-r--r-- src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx | 176
-rw-r--r-- src/client/views/nodes/chatbot/chatboxcomponents/ProgressBar.scss | 40
-rw-r--r-- src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts | 234
-rw-r--r-- src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts | 49
6 files changed, 390 insertions, 125 deletions
diff --git a/src/client/documents/Documents.ts b/src/client/documents/Documents.ts
index 317bb7feb..f87bd7092 100644
--- a/src/client/documents/Documents.ts
+++ b/src/client/documents/Documents.ts
@@ -273,6 +273,7 @@ export class DocumentOptions {
_layout_reflowHorizontal?: BOOLt = new BoolInfo('permit horizontal resizing with content reflow');
_layout_noSidebar?: BOOLt = new BoolInfo('whether to display the sidebar toggle button');
layout_boxShadow?: string; // box-shadow css string OR "standard" to use dash standard box shadow
+ _iframe_sandbox?: STRt = new StrInfo('sandbox attributes for iframes in web documents (e.g., allow-scripts, allow-same-origin)');
layout_maxShown?: NUMt = new NumInfo('maximum number of children to display at one time (see multicolumnview)');
_layout_columnWidth?: NUMt = new NumInfo('width of table column', false);
_layout_columnCount?: NUMt = new NumInfo('number of columns in a masonry view');
diff --git a/src/client/views/nodes/chatbot/agentsystem/Agent.ts b/src/client/views/nodes/chatbot/agentsystem/Agent.ts
index 80fdb6533..24471bf5b 100644
--- a/src/client/views/nodes/chatbot/agentsystem/Agent.ts
+++ b/src/client/views/nodes/chatbot/agentsystem/Agent.ts
@@ -41,7 +41,6 @@ export class Agent {
private interMessages: AgentMessage[] = [];
private vectorstore: Vectorstore;
private _history: () => string;
- private _summaries: () => string;
private _csvData: () => { filename: string; id: string; text: string }[];
private actionNumber: number = 0;
private thoughtNumber: number = 0;
@@ -54,11 +53,13 @@ export class Agent {
/**
* The constructor initializes the agent with the vector store and toolset, and sets up the OpenAI client.
* @param _vectorstore Vector store instance for document storage and retrieval.
- * @param summaries A function to retrieve document summaries.
+ * @param summaries A function to retrieve document summaries (deprecated, now using docManager directly).
* @param history A function to retrieve chat history.
* @param csvData A function to retrieve CSV data linked to the assistant.
- * @param addLinkedUrlDoc A function to add a linked document from a URL.
+ * @param getLinkedUrlDocId A function to get document IDs from URLs.
+ * @param createImage A function to create images in the dashboard.
* @param createCSVInDash A function to create a CSV document in the dashboard.
+ * @param docManager The document manager instance.
*/
constructor(
_vectorstore: Vectorstore,
@@ -74,7 +75,6 @@ export class Agent {
this.client = new OpenAI({ apiKey: process.env.OPENAI_KEY, dangerouslyAllowBrowser: true });
this.vectorstore = _vectorstore;
this._history = history;
- this._summaries = summaries;
this._csvData = csvData;
this._docManager = docManager;
@@ -124,7 +124,12 @@ export class Agent {
// Retrieve chat history and generate system prompt
const chatHistory = this._history();
- const systemPrompt = getReactPrompt(Object.values(this.tools), this._summaries, chatHistory);
+ // Get document summaries directly from document manager
+ const documentSummaries = this._docManager.getAllDocumentSummaries();
+ // Create a function that returns document summaries for the prompt
+ const getSummaries = () => documentSummaries;
+ // Generate the system prompt with the summaries
+ const systemPrompt = getReactPrompt(Object.values(this.tools), getSummaries, chatHistory);
// Initialize intermediate messages
this.interMessages = [{ role: 'system', content: systemPrompt }];
diff --git a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx
index b11bf7405..ba30cb42b 100644
--- a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx
+++ b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx
@@ -18,7 +18,7 @@ import { Doc, DocListCast, Opt } from '../../../../../fields/Doc';
import { DocData, DocViews } from '../../../../../fields/DocSymbols';
import { RichTextField } from '../../../../../fields/RichTextField';
import { ScriptField } from '../../../../../fields/ScriptField';
-import { CsvCast, DocCast, NumCast, PDFCast, RTFCast, StrCast } from '../../../../../fields/Types';
+import { CsvCast, DocCast, NumCast, PDFCast, RTFCast, StrCast, VideoCast, AudioCast } from '../../../../../fields/Types';
import { DocUtils } from '../../../../documents/DocUtils';
import { CollectionViewType, DocumentType } from '../../../../documents/DocumentTypes';
import { Docs, DocumentOptions } from '../../../../documents/Documents';
@@ -48,7 +48,14 @@ import { AgentDocumentManager } from '../utils/AgentDocumentManager';
dotenv.config();
-export type parsedDocData = { doc_type: string; data: unknown };
+/**
+ * Payload describing a document to be created: its kind, its raw content, and optional
+ * web-document flags. AgentDocumentManager sets all four optional flags when the
+ * document type is 'web'.
+ */
+export type parsedDocData = {
+ // Document kind — presumably matched against supportedDocTypes when the document is built; verify against whichDoc.
+ doc_type: string;
+ // Raw content for the document; call sites cast it to the type they need (e.g. `data as string`).
+ data: unknown;
+ // Set to true for web documents. NOTE(review): no consumer of this flag is visible in this change — confirm it is read somewhere.
+ _disable_resource_loading?: boolean;
+ // Set to true for web documents. NOTE(review): no consumer of this flag is visible in this change — confirm it is read somewhere.
+ _sandbox_iframe?: boolean;
+ // Sandbox attribute tokens for the web document's iframe (e.g. 'allow-same-origin allow-scripts').
+ _iframe_sandbox?: string;
+ // Set to true for web documents.
+ data_useCors?: boolean;
+};
export type parsedDoc = DocumentOptions & parsedDocData;
/**
* ChatBox is the main class responsible for managing the interaction between the user and the assistant,
@@ -150,7 +157,14 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
@action
addDocToVectorstore = async (newLinkedDoc: Doc) => {
try {
- this._isUploadingDocs = true;
+ const isAudioOrVideo = VideoCast(newLinkedDoc.data)?.url?.pathname || AudioCast(newLinkedDoc.data)?.url?.pathname;
+
+ // Set UI state to show the processing overlay
+ runInAction(() => {
+ this._isUploadingDocs = true;
+ this._uploadProgress = 0;
+ this._currentStep = isAudioOrVideo ? 'Preparing media file...' : 'Processing document...';
+ });
// Process the document first to ensure it has a valid ID
this.docManager.processDocument(newLinkedDoc);
@@ -158,15 +172,36 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
// Add the document to the vectorstore which will also register chunks
await this.vectorstore.addAIDoc(newLinkedDoc, this.updateProgress);
- // No longer needed as documents are tracked by the AgentDocumentManager
- // this._linked_docs_to_add.add(newLinkedDoc);
+ // Give a slight delay to show the completion message
+ if (this._uploadProgress === 100) {
+ await new Promise(resolve => setTimeout(resolve, 1000));
+ }
- this._isUploadingDocs = false;
+ // Reset UI state
+ runInAction(() => {
+ this._isUploadingDocs = false;
+ this._uploadProgress = 0;
+ this._currentStep = '';
+ });
return true;
} catch (err) {
console.error('Error adding document to vectorstore:', err);
- this._isUploadingDocs = false;
+
+ // Show error in UI
+ runInAction(() => {
+ this._currentStep = `Error: ${err instanceof Error ? err.message : 'Failed to process document'}`;
+ });
+
+ await new Promise(resolve => setTimeout(resolve, 2000));
+
+ // Reset UI state
+ runInAction(() => {
+ this._isUploadingDocs = false;
+ this._uploadProgress = 0;
+ this._currentStep = '';
+ });
+
return false;
}
};
@@ -178,8 +213,15 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
*/
@action
updateProgress = (progress: number, step: string) => {
- this._uploadProgress = progress;
+ // Clamp progress to [0, 100]. NaN survives Math.min/Math.max unchanged and would be
+ // rendered as "NaN%" with an invalid CSS width, so it is mapped to 0 explicitly.
+ const validProgress = Number.isNaN(progress) ? 0 : Math.min(Math.max(0, progress), 100);
+ this._uploadProgress = validProgress;
this._currentStep = step;
+
+ // Development-only trace of progress updates (this only logs; it does not trigger a re-render)
+ if (process.env.NODE_ENV !== 'production') {
+ console.log(`Progress: ${validProgress}%, Step: ${step}`);
+ }
};
/**
@@ -453,7 +495,19 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
case supportedDocTypes.image: return Docs.Create.ImageDocument(data as string, options);
case supportedDocTypes.equation: return Docs.Create.EquationDocument(data as string, options);
case supportedDocTypes.notetaking: return Docs.Create.NoteTakingDocument([], options);
- case supportedDocTypes.web: return Docs.Create.WebDocument(data as string, { ...options, data_useCors: true });
+ case supportedDocTypes.web:
+ // Create web document with enhanced safety options
+ const webOptions = {
+ ...options,
+ data_useCors: true
+ };
+
+ // If iframe_sandbox was passed from AgentDocumentManager, add it to the options
+ if ('_iframe_sandbox' in options) {
+ (webOptions as any)._iframe_sandbox = options._iframe_sandbox;
+ }
+
+ return Docs.Create.WebDocument(data as string, webOptions);
case supportedDocTypes.dataviz: return Docs.Create.DataVizDocument('/users/rz/Downloads/addresses.csv', options);
case supportedDocTypes.pdf: return Docs.Create.PdfDocument(data as string, options);
case supportedDocTypes.video: return Docs.Create.VideoDocument(data as string, options);
@@ -607,65 +661,36 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
return;
}
- // Process the chunk data
- let docChunkSimpl: { chunks: SimplifiedChunk[] } = { chunks: [] };
- try {
- docChunkSimpl = JSON.parse(StrCast(doc.chunk_simpl) || '{"chunks":[]}');
- } catch (e) {
- console.error(`Error parsing chunk_simpl for the found document:`, e);
+ // Get the simplified chunk using the document manager
+ const foundChunk = this.docManager.getSimplifiedChunkById(doc, chunkId);
+ if (!foundChunk) {
+ console.warn(`Chunk not found in document for chunk ID: ${chunkId}`);
+ DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {});
return;
}
- const foundChunk = docChunkSimpl.chunks.find((chunk: SimplifiedChunk) => chunk.chunkId === chunkId);
+ console.log(`Found chunk in document:`, foundChunk);
// Handle different chunk types
- if (foundChunk) {
- console.log(`Found chunk in document:`, foundChunk);
- if (foundChunk.chunkType === CHUNK_TYPE.AUDIO || foundChunk.chunkType === CHUNK_TYPE.VIDEO) {
- const directMatchSegmentStart = this.getDirectMatchingSegmentStart(doc, citation.direct_text || '', foundChunk.indexes || []);
- if (directMatchSegmentStart) {
- await this.goToMediaTimestamp(doc, directMatchSegmentStart, foundChunk.chunkType);
- } else {
- console.error('No direct matching segment found for the citation.');
- }
- } else if (foundChunk.chunkType === CHUNK_TYPE.TABLE || foundChunk.chunkType === CHUNK_TYPE.IMAGE) {
- this.handleOtherChunkTypes(foundChunk, citation, doc);
- } else if (foundChunk.chunkType === CHUNK_TYPE.TEXT) {
- // Find text from the document's chunks metadata
- let chunkText = '';
-
- try {
- // We already parsed the chunks earlier, so use that
- const matchingChunk = docChunkSimpl.chunks.find(c => c.chunkId === foundChunk.chunkId);
- if (matchingChunk && 'text' in matchingChunk) {
- // If the text property exists on the chunk (even though it's not in the type)
- chunkText = String(matchingChunk['text'] || '');
- }
- } catch (e) {
- console.error('Error getting chunk text:', e);
- }
-
- // Default text if none found
- if (!chunkText) {
- chunkText = 'Text content not available';
- }
-
- this._citationPopup = {
- text: chunkText,
- visible: true,
- };
- }
- // Handle URL chunks
- else if (foundChunk.chunkType === CHUNK_TYPE.URL) {
- if (foundChunk.url) {
- DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {});
- console.log(`Navigated to web document with URL: ${foundChunk.url}`);
- } else {
- console.warn('URL chunk missing URL:', foundChunk);
- }
+ if (foundChunk.chunkType === CHUNK_TYPE.AUDIO || foundChunk.chunkType === CHUNK_TYPE.VIDEO) {
+ const directMatchSegmentStart = this.getDirectMatchingSegmentStart(doc, citation.direct_text || '', foundChunk.indexes || []);
+ if (directMatchSegmentStart) {
+ await this.goToMediaTimestamp(doc, directMatchSegmentStart, foundChunk.chunkType);
+ } else {
+ console.error('No direct matching segment found for the citation.');
}
+ } else if (foundChunk.chunkType === CHUNK_TYPE.TABLE || foundChunk.chunkType === CHUNK_TYPE.IMAGE) {
+ this.handleOtherChunkTypes(foundChunk, citation, doc);
} else {
- console.warn('Navigating to doc. Unable to find chunk or segments for citation', citation);
+ // Show the chunk text in citation popup
+ let chunkText = foundChunk.text || 'Text content not available';
+
+ this._citationPopup = {
+ text: chunkText,
+ visible: true,
+ };
+
+ // Also navigate to the document
DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {});
}
} catch (error) {
@@ -683,8 +708,8 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
getDirectMatchingSegmentStart = (doc: Doc, citationText: string, indexesOfSegments: string[]): number => {
if (!doc || !citationText) return -1;
- // Get original segments from the document
- const original_segments = doc.original_segments ? JSON.parse(StrCast(doc.original_segments)) : [];
+ // Get original segments using document manager
+ const original_segments = this.docManager.getOriginalSegments(doc);
if (!original_segments || !Array.isArray(original_segments) || original_segments.length === 0) {
return -1;
@@ -993,18 +1018,8 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
*/
@computed
get summaries(): string {
- const linkedDocs = Array.from(this.docManager.listDocs())
- .map(id => {
- const doc = this.docManager.extractDocumentMetadata(id);
- if (doc && doc.fields && (doc.fields.layout.summary || doc.fields.data.summary)) {
- return doc.fields.layout.summary || doc.fields.data.summary;
- }
- return null;
- })
- .filter(Boolean)
- .join('\n\n');
-
- return linkedDocs;
+ // Delegate to the document manager, which returns the summaries as a single string.
+ // NOTE(review): this getter is @computed — if the manager's document map is not a MobX
+ // observable, the cached value may go stale after documents are added; confirm.
+ return this.docManager.getAllDocumentSummaries();
}
/**
@@ -1033,7 +1048,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
// Other helper methods for retrieving document data and processing
retrieveSummaries = (): string => {
- return this.summaries;
+ // Read straight from the document manager instead of going through the computed `summaries` getter.
+ return this.docManager.getAllDocumentSummaries();
};
retrieveCSVData = () => {
@@ -1068,8 +1083,13 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
{this._isUploadingDocs && (
<div className="uploading-overlay">
<div className="progress-container">
- <ProgressBar />
- <div className="step-name">{this._currentStep}</div>
+ <div className="progress-bar-wrapper">
+ <div className="progress-bar" style={{ width: `${this._uploadProgress}%` }} />
+ </div>
+ <div className="progress-details">
+ <div className="progress-percentage">{Math.round(this._uploadProgress)}%</div>
+ <div className="step-name">{this._currentStep}</div>
+ </div>
</div>
</div>
)}
diff --git a/src/client/views/nodes/chatbot/chatboxcomponents/ProgressBar.scss b/src/client/views/nodes/chatbot/chatboxcomponents/ProgressBar.scss
index ff5be4a38..3a8334695 100644
--- a/src/client/views/nodes/chatbot/chatboxcomponents/ProgressBar.scss
+++ b/src/client/views/nodes/chatbot/chatboxcomponents/ProgressBar.scss
@@ -58,12 +58,48 @@
flex-direction: column;
align-items: center;
text-align: center;
+ width: 80%;
+ max-width: 400px;
+ background-color: white;
+ padding: 20px;
+ border-radius: 8px;
+ box-shadow: 0 4px 12px rgba(0, 0, 0, 0.15);
}
-.step-name {
+.progress-bar-wrapper {
+ width: 100%;
+ height: 12px;
+ background-color: #e0e0e0;
+ border-radius: 6px;
+ overflow: hidden;
+ margin-bottom: 10px;
+}
+
+.progress-bar {
+ height: 100%;
+ background-color: #4a90e2;
+ border-radius: 6px;
+ transition: width 0.5s ease;
+}
+
+.progress-details {
+ display: flex;
+ flex-direction: column;
+ align-items: center;
+ width: 100%;
+}
+
+.progress-percentage {
font-size: 18px;
+ font-weight: bold;
color: #333;
+ margin-bottom: 5px;
+}
+
+.step-name {
+ font-size: 16px;
+ color: #666;
text-align: center;
width: 100%;
- margin-top: -10px; // Adjust to move the text closer to the spinner
+ margin-top: 5px;
}
diff --git a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts
index c3beebcde..cff8380db 100644
--- a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts
+++ b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts
@@ -14,6 +14,8 @@ import { parsedDoc } from '../chatboxcomponents/ChatBox';
import { faThumbTackSlash } from '@fortawesome/free-solid-svg-icons';
import { DocumentManager } from '../../../../util/DocumentManager';
import { DocumentView } from '../../DocumentView';
+import { RAGChunk, CHUNK_TYPE } from '../types/types';
+import { runInAction } from 'mobx';
/**
* Interface representing a document in the freeform view
@@ -869,20 +871,43 @@ export class AgentDocumentManager {
_layout_autoHeight: true,
};
- // Use the chatBox's createDocInDash method to create and link the document
+ // Additional handling for web documents
+ if (docType === 'web') {
+ // For web documents, don't sanitize the URL here
+ // Instead, set properties to handle content safely when loaded
+ simpleDoc._disable_resource_loading = true;
+ simpleDoc._sandbox_iframe = true;
+ simpleDoc.data_useCors = true;
+
+ // Specify a more permissive sandbox to allow content to render properly
+ // but still maintain security
+ simpleDoc._iframe_sandbox = 'allow-same-origin allow-scripts allow-popups allow-forms';
+ }
+
+ // Use the chatBox's createDocInDash method to create the document
if (!this.chatBox) {
throw new Error('ChatBox instance not available for creating document');
}
- const linkAndShowDoc = (doc: Opt<Doc>) => {
- if (doc) {
- LinkManager.Instance.addLink(Docs.Create.LinkDocument(this.chatBoxDocument!, doc));
- this.chatBox._props.addDocument?.(doc);
- DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {});
- }
- };
+
const doc = this.chatBox.whichDoc(simpleDoc, false);
if (doc) {
- linkAndShowDoc(doc);
+ // Use MobX runInAction to properly modify observable state
+ runInAction(() => {
+ if (this.chatBoxDocument && doc) {
+ // Create link and add it to the document system
+ const linkDoc = Docs.Create.LinkDocument(this.chatBoxDocument, doc);
+ LinkManager.Instance.addLink(linkDoc);
+
+ // Add document to view
+ this.chatBox._props.addDocument?.(doc);
+
+ // Show document - defer actual display to prevent immediate resource loading
+ setTimeout(() => {
+ DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {});
+ }, 100);
+ }
+ });
+
const id = this.processDocument(doc);
return id;
} else {
@@ -893,6 +918,62 @@ export class AgentDocumentManager {
}
}
+ /**
+  * Rewrites an HTML string so that rendering it does not eagerly fetch external resources.
+  *
+  * Steps, in order: preload <link> tags and source-map comments are removed; stylesheet
+  * <link> tags are replaced by an inert placeholder that records the original URL;
+  * <iframe>, <img> and <script> opening tags receive sandbox / lazy-loading / disabling
+  * attributes; root-relative href values are pointed at "#disabled-link". The result is
+  * wrapped in a "sandboxed-web-content" container with a style block that forces system fonts.
+  *
+  * This is a best-effort, regex-based cleanup, NOT an XSS sanitizer: inline event handlers,
+  * javascript: URLs and similar vectors are left untouched.
+  *
+  * @param content The HTML markup to sanitize
+  * @returns The wrapped, rewritten markup; `content` itself when it is empty; or an
+  * HTML-escaped plain-text fallback if rewriting throws
+  */
+ private sanitizeWebContent(content: string): string {
+     if (!content) return content;
+
+     try {
+         const sanitized = content
+             // Remove preload links that might cause errors
+             .replace(/<link[^>]*rel=["']preload["'][^>]*>/gi, '')
+             // Remove JS source-map references
+             .replace(/\/\/# sourceMappingURL=.*\.map/gi, '')
+             // Remove CSS source-map references
+             .replace(/\/\*# sourceMappingURL=.*\.css\.map.*\*\//gi, '')
+             // Swap each stylesheet <link> for an inert placeholder. The WHOLE tag is consumed so no
+             // trailing attributes or stray '>' leak into the output, and this runs before hrefs are
+             // disabled below so data-original-href keeps the real URL.
+             .replace(/<link\b[^>]*>/gi, tag => {
+                 if (!/[^\w-]rel=["']stylesheet["']/i.test(tag)) return tag;
+                 const href = /[^\w-]href=["']([^"']+)["']/i.exec(tag)?.[1];
+                 return href ? `<link rel="prefetch" data-original-href="${href}" />` : tag;
+             })
+             // Add sandbox to iframes
+             .replace(/<iframe/gi, '<iframe sandbox="allow-same-origin" loading="lazy"')
+             // Defer image loading
+             .replace(/<img/gi, '<img loading="lazy"')
+             // Keep scripts from executing by giving them a non-executable type
+             .replace(/<script/gi, '<script type="text/disabled"')
+             // Disable root-relative links (they would resolve against the host app, not the source site).
+             // The leading character class keeps attributes such as data-original-href from matching.
+             .replace(/(^|[^\w-])href=["']\/[^"']+["']/gi, '$1href="#disabled-link"');
+
+         // Wrap the content in a sandboxed container
+         return `
+ <div class="sandboxed-web-content">
+ <style>
+ /* Override styles to prevent external resource loading */
+ @font-face { font-family: 'disabled'; src: local('Arial'); }
+ * { font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, 'Open Sans', 'Helvetica Neue', sans-serif !important; }
+ img, iframe, frame, embed, object { max-width: 100%; }
+ </style>
+ ${sanitized}
+ </div>`;
+     } catch (e) {
+         console.warn('Error sanitizing web content:', e);
+         // Fall back to showing the content as text; '&' is escaped first so existing entities
+         // in the raw content are displayed literally instead of being decoded.
+         const escaped = content.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
+         return `
+ <div class="sandboxed-web-content">
+ <p>Content could not be safely displayed. Raw content:</p>
+ <pre>${escaped}</pre>
+ </div>`;
+     }
+ }
+
/** Returns whether `documentsById` contains an entry for the given document ID. */
public has(docId: string) {
return this.documentsById.has(docId);
}
@@ -988,4 +1069,139 @@ export class AgentDocumentManager {
}
return undefined;
}
+
+ /**
+  * Stores a simplified copy of the given RAG chunks on the document for citation handling.
+  * The document's `chunk_simpl` field is REPLACED (not appended to) with a JSON string of
+  * the form `{ chunks: [...] }`.
+  * @param doc The document to write simplified chunks to
+  * @param chunks Array of full RAG chunks to simplify; a missing array is treated as empty
+  * @param docType The type of document ('video', 'audio', 'pdf', 'csv'; anything else gets only the common fields)
+  * @returns The same document, updated; returned untouched if it is null/undefined
+  */
+ public addSimplifiedChunks(doc: Doc, chunks: RAGChunk[], docType: string): Doc {
+     if (!doc) {
+         console.error('Cannot add simplified chunks to null document');
+         return doc;
+     }
+
+     const simplifiedChunks = (chunks ?? []).map(chunk => {
+         const { metadata } = chunk;
+         // Properties shared by every chunk type
+         const baseChunk = {
+             chunkId: chunk.id,
+             text: metadata.text,
+             doc_id: metadata.doc_id,
+             chunkType: metadata.type || CHUNK_TYPE.TEXT,
+         };
+
+         switch (docType) {
+             case 'video':
+             case 'audio':
+                 return {
+                     ...baseChunk,
+                     start_time: metadata.start_time,
+                     end_time: metadata.end_time,
+                     indexes: metadata.indexes,
+                     // Media chunks take their type from the document kind, overriding the metadata type
+                     chunkType: docType === 'video' ? CHUNK_TYPE.VIDEO : CHUNK_TYPE.AUDIO,
+                 };
+             case 'pdf':
+                 return {
+                     ...baseChunk,
+                     startPage: metadata.start_page,
+                     endPage: metadata.end_page,
+                     location: metadata.location,
+                 };
+             case 'csv': {
+                 // Row/column bounds are presumably not declared on the metadata type, hence the loose cast
+                 const csvMeta = metadata as any;
+                 return {
+                     ...baseChunk,
+                     rowStart: csvMeta.row_start,
+                     rowEnd: csvMeta.row_end,
+                     colStart: csvMeta.col_start,
+                     colEnd: csvMeta.col_end,
+                 };
+             }
+             default:
+                 return baseChunk;
+         }
+     });
+
+     // Single write of the final value; no placeholder is written first because it would be overwritten here anyway
+     doc.chunk_simpl = JSON.stringify({ chunks: simplifiedChunks });
+
+     return doc;
+ }
+
+ /**
+  * Gets the simplified chunks stored in a document's `chunk_simpl` field.
+  * @param doc The document to get simplified chunks from
+  * @returns Array of simplified chunks; an empty array if the field is missing, is not
+  * valid JSON, or its `chunks` entry is not an array
+  */
+ public getSimplifiedChunks(doc: Doc): any[] {
+     if (!doc || !doc.chunk_simpl) {
+         return [];
+     }
+
+     try {
+         const parsed: unknown = JSON.parse(StrCast(doc.chunk_simpl));
+         const chunks = (parsed as { chunks?: unknown } | null)?.chunks;
+         // Callers use array methods such as find() on the result, so never hand back a non-array
+         return Array.isArray(chunks) ? chunks : [];
+     } catch (e) {
+         console.error('Error parsing simplified chunks:', e);
+         return [];
+     }
+ }
+
+ /**
+  * Looks up one simplified chunk of a document by its chunk ID.
+  * @param doc The document whose simplified chunks are searched
+  * @param chunkId The ID to match against each chunk's `chunkId`
+  * @returns The first chunk with a matching ID, or undefined when there is none
+  */
+ public getSimplifiedChunkById(doc: Doc, chunkId: string): any | undefined {
+     for (const candidate of this.getSimplifiedChunks(doc)) {
+         if (candidate.chunkId === chunkId) {
+             return candidate;
+         }
+     }
+     return undefined;
+ }
+
+ /**
+  * Gets the original segments stored in a media document's `original_segments` field.
+  * @param doc The document containing original media segments
+  * @returns Array of media segments; an empty array if the field is missing, is not valid
+  * JSON, or does not contain an array
+  */
+ public getOriginalSegments(doc: Doc): any[] {
+     if (!doc || !doc.original_segments) {
+         return [];
+     }
+
+     try {
+         const parsed: unknown = JSON.parse(StrCast(doc.original_segments));
+         // Honor the array contract even if the stored JSON is an object or a primitive
+         return Array.isArray(parsed) ? parsed : [];
+     } catch (e) {
+         console.error('Error parsing original segments:', e);
+         return [];
+     }
+ }
+
+ /**
+  * Builds one string out of the summaries of all tracked documents.
+  * @returns The non-empty summaries, separated by blank lines
+  */
+ public getAllDocumentSummaries(): string {
+     const collected: string[] = [];
+
+     // Snapshot the IDs before iterating
+     for (const docId of [...this.documentsById.keys()]) {
+         const trackedDoc = this.getDocument(docId);
+         if (!trackedDoc) continue;
+
+         // Prefer the summary on the document itself, then fall back to its data document
+         const rawSummary = trackedDoc.summary || (trackedDoc[DocData] && trackedDoc[DocData].summary);
+         if (!rawSummary) continue;
+
+         const text = StrCast(rawSummary);
+         if (text) {
+             collected.push(text);
+         }
+     }
+
+     return collected.join('\n\n');
+ }
}
diff --git a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
index 4512ae3e6..4268c0180 100644
--- a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
+++ b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
@@ -103,7 +103,7 @@ export class Vectorstore {
const local_file_path: string = CsvCast(doc.data)?.url?.pathname ?? PDFCast(doc.data)?.url?.pathname ?? VideoCast(doc.data)?.url?.pathname ?? AudioCast(doc.data)?.url?.pathname;
if (!local_file_path) {
- console.log('Invalid file path.');
+ console.log('Not adding to vectorstore. Invalid file path for vectorstore addition.');
return;
}
@@ -112,7 +112,11 @@ export class Vectorstore {
if (isAudioOrVideo) {
console.log('Processing media file...');
+ progressCallback(10, 'Preparing media file for transcription...');
+
+ // Post to processMediaFile endpoint to get the transcript
const response = await Networking.PostToServer('/processMediaFile', { fileName: path.basename(local_file_path) });
+ progressCallback(60, 'Transcription completed. Processing transcript...');
// Type assertion to handle the response properties
const typedResponse = response as {
@@ -135,6 +139,7 @@ export class Vectorstore {
input: texts,
encoding_format: 'float',
});
+ progressCallback(85, 'Embeddings generated. Finalizing document...');
doc.original_segments = JSON.stringify(typedResponse.full);
const doc_id = uuidv4();
@@ -154,7 +159,7 @@ export class Vectorstore {
purpose: '',
file_name: local_file_path,
num_pages: 0,
- summary: '',
+ summary: summary,
chunks: segmentedTranscript.map((chunk, index) => ({
id: chunkIds[index], // Use pre-generated chunk ID
values: (embeddingsResponse.data as Embedding[])[index].embedding, // Assign embedding
@@ -171,23 +176,17 @@ export class Vectorstore {
})),
type: 'media',
};
+ progressCallback(95, 'Adding document to vectorstore...');
} catch (error) {
console.error('Error generating embeddings:', error);
+ doc.ai_document_status = 'ERROR';
throw new Error('Embedding generation failed');
}
doc.segmented_transcript = JSON.stringify(segmentedTranscript);
- // Simplify chunks for storage - ensure simplified chunks use EXACTLY the same IDs
- const simplifiedChunks = result.chunks.map(chunk => ({
- chunkId: chunk.id, // Use the exact same ID as the full chunk
- start_time: chunk.metadata.start_time,
- end_time: chunk.metadata.end_time,
- indexes: chunk.metadata.indexes,
- chunkType: CHUNK_TYPE.VIDEO,
- text: chunk.metadata.text,
- doc_id: chunk.metadata.doc_id, // Include parent doc_id for completeness
- }));
- doc.chunk_simpl = JSON.stringify({ chunks: simplifiedChunks });
+ // Use doc manager to add simplified chunks
+ const docType = local_file_path.endsWith('.mp3') ? 'audio' : 'video';
+ this.docManager.addSimplifiedChunks(doc, result.chunks, docType);
} else {
// Process regular document
console.log('Processing regular document...');
@@ -220,30 +219,18 @@ export class Vectorstore {
// Register chunks with the document manager
this.docManager.registerChunkIds(result.doc_id, chunkIds);
- if (!doc.chunk_simpl) {
- doc.chunk_simpl = JSON.stringify({ chunks: [] });
- }
+ // Use doc manager to add simplified chunks - determine document type from file extension
+ const fileExt = path.extname(local_file_path).toLowerCase();
+ const docType = fileExt === '.pdf' ? 'pdf' : fileExt === '.csv' ? 'csv' : 'text';
+ this.docManager.addSimplifiedChunks(doc, result.chunks, docType);
+
doc.summary = result.summary;
doc.ai_purpose = result.purpose;
-
- result.chunks.forEach((chunk: RAGChunk) => {
- const chunkToAdd = {
- chunkId: chunk.id, // Ensure we use the exact same ID
- startPage: chunk.metadata.start_page,
- endPage: chunk.metadata.end_page,
- location: chunk.metadata.location,
- chunkType: chunk.metadata.type as CHUNK_TYPE,
- text: chunk.metadata.text,
- doc_id: chunk.metadata.doc_id, // Include parent doc_id for consistency
- };
- const new_chunk_simpl = JSON.parse(StrCast(doc.chunk_simpl));
- new_chunk_simpl.chunks = new_chunk_simpl.chunks.concat(chunkToAdd);
- doc.chunk_simpl = JSON.stringify(new_chunk_simpl);
- });
}
// Index the document
await this.indexDocument(result);
+ progressCallback(100, 'Document added successfully!');
// Preserve existing metadata updates
if (!doc.vectorstore_id) {