aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorA.J. Shulman <Shulman.aj@gmail.com>2025-04-27 13:14:49 -0400
committerA.J. Shulman <Shulman.aj@gmail.com>2025-04-27 13:14:49 -0400
commit3ef3d40506348d9fd537cc8f4aea975b9770689f (patch)
treeafe779e8240e88c8b20ff6b68ac45840a927ee76 /src
parent5ce2263849bfb901e276a4c5fc8ca2dbd8b80350 (diff)
new attempt with new citation unification
Diffstat (limited to 'src')
-rw-r--r--src/client/views/nodes/chatbot/agentsystem/Agent.ts5
-rw-r--r--src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx450
-rw-r--r--src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts16
-rw-r--r--src/client/views/nodes/chatbot/tools/SearchTool.ts18
-rw-r--r--src/client/views/nodes/chatbot/types/types.ts1
-rw-r--r--src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts168
-rw-r--r--src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts130
7 files changed, 510 insertions, 278 deletions
diff --git a/src/client/views/nodes/chatbot/agentsystem/Agent.ts b/src/client/views/nodes/chatbot/agentsystem/Agent.ts
index c021d141e..80fdb6533 100644
--- a/src/client/views/nodes/chatbot/agentsystem/Agent.ts
+++ b/src/client/views/nodes/chatbot/agentsystem/Agent.ts
@@ -65,12 +65,9 @@ export class Agent {
summaries: () => string,
history: () => string,
csvData: () => { filename: string; id: string; text: string }[],
- addLinkedUrlDoc: (url: string, id: string) => void,
getLinkedUrlDocId: (url: string) => string[],
createImage: (result: Upload.FileInformation & Upload.InspectionResults, options: DocumentOptions) => void,
- // eslint-disable-next-line @typescript-eslint/no-unused-vars
createCSVInDash: (url: string, title: string, id: string, data: string) => void,
- chatBox: ChatBox,
docManager: AgentDocumentManager
) {
// Initialize OpenAI client with API key from environment
@@ -87,7 +84,7 @@ export class Agent {
rag: new RAGTool(this.vectorstore),
dataAnalysis: new DataAnalysisTool(csvData),
websiteInfoScraper: new WebsiteInfoScraperTool(getLinkedUrlDocId),
- searchTool: new SearchTool(addLinkedUrlDoc),
+ searchTool: new SearchTool(this._docManager),
noTool: new NoTool(),
//imageCreationTool: new ImageCreationTool(createImage),
documentMetadata: new DocumentMetadataTool(this._docManager),
diff --git a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx
index 43765c1ce..35dbee3e9 100644
--- a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx
+++ b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx
@@ -71,7 +71,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
@observable private _citationPopup: { text: string; visible: boolean } = { text: '', visible: false };
// Private properties for managing OpenAI API, vector store, agent, and UI elements
- private openai: OpenAI;
+ private openai!: OpenAI; // Using definite assignment assertion
private vectorstore_id: string;
private vectorstore: Vectorstore;
private agent: Agent;
@@ -98,25 +98,34 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
*/
constructor(props: FieldViewProps) {
super(props);
- makeObservable(this); // Enable MobX observables
+ makeObservable(this);
- // Initialize OpenAI, vectorstore, and agent
- this.openai = this.initializeOpenAI();
- if (StrCast(this.dataDoc.vectorstore_id) == '') {
- this.vectorstore_id = uuidv4();
- this.dataDoc.vectorstore_id = this.vectorstore_id;
- } else {
- this.vectorstore_id = StrCast(this.dataDoc.vectorstore_id);
- }
- this.vectorstore = new Vectorstore(this.vectorstore_id, this.retrieveDocIds);
+ this.messagesRef = React.createRef();
this.docManager = new AgentDocumentManager(this);
- this.agent = new Agent(this.vectorstore, this.retrieveSummaries, this.retrieveFormattedHistory, this.retrieveCSVData, this.addLinkedUrlDoc, this.getLinkedUrlDocIds, this.createImageInDash, this.createCSVInDash, this, this.docManager);
- // Reinitialize the DocumentMetadataTool with a direct reference to this ChatBox instance
- // This ensures the tool can properly access documents in the same Freeform view
- this.agent.reinitializeDocumentMetadataTool();
+ // Initialize OpenAI client
+ this.initializeOpenAI();
+
+ // Create a unique vectorstore ID for this ChatBox
+ this.vectorstore_id = uuidv4();
+
+ // Initialize vectorstore with the document manager
+ this.vectorstore = new Vectorstore(this.vectorstore_id, this.docManager);
+
+ // Create an agent with the vectorstore
+ this.agent = new Agent(
+ this.vectorstore,
+ this.retrieveSummaries.bind(this),
+ this.retrieveFormattedHistory.bind(this),
+ this.retrieveCSVData.bind(this),
+ this.retrieveDocIds.bind(this),
+ this.createImageInDash.bind(this),
+ this.createCSVInDash.bind(this),
+ this.docManager
+ );
- this.messagesRef = React.createRef<HTMLDivElement>();
+ // Add event listeners
+ this.addScrollListener();
// Reaction to update dataDoc when chat history changes
reaction(
@@ -140,22 +149,25 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
*/
@action
addDocToVectorstore = async (newLinkedDoc: Doc) => {
- this._uploadProgress = 0;
- this._currentStep = 'Initializing...';
- this._isUploadingDocs = true;
-
try {
- // Add the document to the vectorstore
+ this._isUploadingDocs = true;
+
+ // Process the document first to ensure it has a valid ID
+ this.docManager.processDocument(newLinkedDoc);
+
+ // Add the document to the vectorstore which will also register chunks
await this.vectorstore.addAIDoc(newLinkedDoc, this.updateProgress);
- } catch (error) {
- console.error('Error uploading document:', error);
- this._currentStep = 'Error during upload';
- } finally {
- runInAction(() => {
- this._isUploadingDocs = false;
- this._uploadProgress = 0;
- this._currentStep = '';
- });
+
+ // No longer needed as documents are tracked by the AgentDocumentManager
+ // this._linked_docs_to_add.add(newLinkedDoc);
+
+ this._isUploadingDocs = false;
+
+ return true;
+ } catch (err) {
+ console.error('Error adding document to vectorstore:', err);
+ this._isUploadingDocs = false;
+ return false;
}
};
@@ -238,7 +250,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
apiKey: process.env.OPENAI_KEY,
dangerouslyAllowBrowser: true,
};
- return new OpenAI(configuration);
+ this.openai = new OpenAI(configuration);
}
/**
@@ -376,49 +388,6 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
};
/**
- * Adds a linked document from a URL for future reference and analysis.
- * @param url The URL of the document to add.
- * @param id The unique identifier for the document.
- */
- @action
- addLinkedUrlDoc = async (url: string, id: string) => {
- const doc = Docs.Create.WebDocument(url, { data_useCors: true });
- this.docManager.addCustomId(doc, id);
- const linkDoc = Docs.Create.LinkDocument(this.Document, doc);
- LinkManager.Instance.addLink(linkDoc);
-
- const chunkToAdd = {
- chunkId: id,
- chunkType: CHUNK_TYPE.URL,
- url: url,
- };
-
- doc.chunk_simpl = JSON.stringify({ chunks: [chunkToAdd] });
- this.docManager.processDocument(doc);
- };
-
- /**
- * Retrieves the IDs of linked url documents.
- * @returns An array of document IDs.
- */
- @action
- getLinkedUrlDocIds = () => {
- const linkedDocs: Doc[] = this.linkedDocs;
- const linkedUrlDocIds: string[] = [];
-
- for (const doc of linkedDocs) {
- if (doc.chunk_simpl) {
- const docChunkSimpl = JSON.parse(StrCast(doc.chunk_simpl)) as { chunks: SimplifiedChunk[] };
- const foundChunk = docChunkSimpl.chunks.find(chunk => chunk.chunkType === CHUNK_TYPE.URL);
- if (foundChunk) {
- linkedUrlDocIds.push(foundChunk.chunkId);
- }
- }
- }
- return linkedUrlDocIds;
- };
-
- /**
* Getter to retrieve the current user's name from the client utils.
*/
@computed
@@ -613,82 +582,224 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
*/
@action
handleCitationClick = async (citation: Citation) => {
- const currentLinkedDocs: Doc[] = this.linkedDocs;
- const chunkId = citation.chunk_id;
+ try {
+ // Extract values from MobX proxy object if needed
+ const chunkId = typeof citation.chunk_id === 'object' ? (citation.chunk_id as any).toString() : citation.chunk_id;
+
+ // For debugging
+ console.log('Citation clicked:', {
+ chunkId,
+ citation: JSON.stringify(citation, null, 2),
+ });
- for (const doc of currentLinkedDocs) {
- if (doc.chunk_simpl) {
- const docChunkSimpl = JSON.parse(StrCast(doc.chunk_simpl)) as { chunks: SimplifiedChunk[] };
- const foundChunk = docChunkSimpl.chunks.find(chunk => chunk.chunkId === chunkId);
+ // Try to find the document
+ const linkedDocs = this.linkedDocs;
+ let doc: Doc | undefined;
- if (foundChunk) {
- // Handle media chunks specifically
+ // First try to find the document using the document manager's chunk ID lookup
+ const parentDocId = this.docManager.getDocIdByChunkId(chunkId);
+ if (parentDocId) {
+ doc = this.docManager.getDocument(parentDocId);
+ console.log(`Found document by chunk ID lookup: ${parentDocId}`);
+ }
- if (doc.ai_type == 'video' || doc.ai_type == 'audio') {
- const directMatchSegmentStart = this.getDirectMatchingSegmentStart(doc, citation.direct_text || '', foundChunk.indexes || []);
+ // If not found, fall back to searching through linked docs (maintains compatibility)
+ if (!doc) {
+ for (const linkedDoc of linkedDocs) {
+ if (linkedDoc.chunk_simpl) {
+ try {
+ const docChunkSimpl = JSON.parse(StrCast(linkedDoc.chunk_simpl)) as { chunks: SimplifiedChunk[] };
+ const foundChunk = docChunkSimpl.chunks.find(chunk => chunk.chunkId === chunkId);
+ if (foundChunk) {
+ doc = linkedDoc;
+ console.log(`Found document by iterating through linked docs`);
+ break;
+ }
+ } catch (e) {
+ console.error(`Error parsing chunk_simpl for doc ${linkedDoc.id}:`, e);
+ }
+ }
+ }
+ }
- if (directMatchSegmentStart) {
- // Navigate to the segment's start time in the media player
- await this.goToMediaTimestamp(doc, directMatchSegmentStart, doc.ai_type);
- } else {
- console.error('No direct matching segment found for the citation.');
+ if (!doc) {
+ console.warn(`Document not found for citation with chunk_id: ${chunkId}`);
+ return;
+ }
+
+ // Process the chunk data
+ let docChunkSimpl: { chunks: SimplifiedChunk[] } = { chunks: [] };
+ try {
+ docChunkSimpl = JSON.parse(StrCast(doc.chunk_simpl) || '{"chunks":[]}');
+ } catch (e) {
+ console.error(`Error parsing chunk_simpl for the found document:`, e);
+ return;
+ }
+
+ const foundChunk = docChunkSimpl.chunks.find((chunk: SimplifiedChunk) => chunk.chunkId === chunkId);
+
+ // Handle different chunk types
+ if (foundChunk) {
+ console.log(`Found chunk in document:`, foundChunk);
+
+ // Handle video chunks
+ if (foundChunk.chunkType === CHUNK_TYPE.VIDEO) {
+ if (foundChunk.start_time !== undefined) {
+ await this.goToMediaTimestamp(doc, foundChunk.start_time, 'video');
+ } else {
+ console.warn('Video chunk missing start_time:', foundChunk);
+ }
+ }
+ // Handle audio chunks - note that we're using string comparison since 'audio' isn't in CHUNK_TYPE enum
+ else if (String(foundChunk.chunkType).toLowerCase() === 'audio') {
+ if (foundChunk.start_time !== undefined) {
+ await this.goToMediaTimestamp(doc, foundChunk.start_time, 'audio');
+ } else {
+ console.warn('Audio chunk missing start_time:', foundChunk);
+ }
+ }
+ // Handle table or image chunks
+ else if (foundChunk.chunkType === CHUNK_TYPE.TABLE || foundChunk.chunkType === CHUNK_TYPE.IMAGE) {
+ this.handleOtherChunkTypes(foundChunk, citation, doc);
+ }
+ // Handle text chunks
+ else if (foundChunk.chunkType === CHUNK_TYPE.TEXT) {
+ // Find text from the document's chunks metadata
+ let chunkText = '';
+
+ try {
+ // We already parsed the chunks earlier, so use that
+ const matchingChunk = docChunkSimpl.chunks.find(c => c.chunkId === foundChunk.chunkId);
+ if (matchingChunk && 'text' in matchingChunk) {
+ // If the text property exists on the chunk (even though it's not in the type)
+ chunkText = String(matchingChunk['text'] || '');
}
+ } catch (e) {
+ console.error('Error getting chunk text:', e);
+ }
+
+ // Default text if none found
+ if (!chunkText) {
+ chunkText = 'Text content not available';
+ }
+
+ this._citationPopup = {
+ text: chunkText,
+ visible: true,
+ };
+ }
+ // Handle URL chunks
+ else if (foundChunk.chunkType === CHUNK_TYPE.URL) {
+ if (foundChunk.url) {
+ // Instead of opening the URL in a new window, show the document in the viewer
+ DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {});
+ console.log(`Navigated to web document with URL: ${foundChunk.url}`);
} else {
- // Handle other chunk types as before
- this.handleOtherChunkTypes(foundChunk, citation, doc);
+ console.warn('URL chunk missing URL:', foundChunk);
}
}
+ } else if (doc?.original_segments) {
+ // Handle original segments for media files
+ let original_segments: any[] = [];
+ try {
+ original_segments = JSON.parse(StrCast(doc.original_segments));
+ } catch (e) {
+ console.error(`Error parsing original_segments:`, e);
+ return;
+ }
+
+ // Check if there's direct text to find in the segments
+ if (citation.direct_text) {
+ // Find the segment that contains the direct text
+ const start = this.getDirectMatchingSegmentStart(doc, citation.direct_text, []);
+ if (start !== -1) {
+ await this.goToMediaTimestamp(doc, start, doc.ai_type === 'audio' ? 'audio' : 'video');
+ }
+ }
+ } else {
+ console.warn('Unable to find chunk or segments for citation', citation);
}
+ } catch (error) {
+ console.error('Error handling citation click:', error);
}
};
+ /**
+ * Finds a matching segment in a document based on text content.
+ * @param doc The document to search in
+ * @param citationText The text to find in the document
+ * @param indexesOfSegments Optional indexes of segments to search in
+ * @returns The starting timestamp of the matching segment, or -1 if not found
+ */
getDirectMatchingSegmentStart = (doc: Doc, citationText: string, indexesOfSegments: string[]): number => {
- const originalSegments = JSON.parse(StrCast(doc.original_segments!)).map((segment: any, index: number) => ({
- index: index.toString(),
- text: segment.text,
- start: segment.start,
- end: segment.end,
- }));
-
- if (!Array.isArray(originalSegments) || originalSegments.length === 0 || !Array.isArray(indexesOfSegments)) {
- return 0;
+ if (!doc || !citationText) return -1;
+
+ // Get original segments from the document
+ const original_segments = doc.original_segments ? JSON.parse(StrCast(doc.original_segments)) : [];
+
+ if (!original_segments || !Array.isArray(original_segments) || original_segments.length === 0) {
+ return -1;
}
- // Create itemsToSearch array based on indexesOfSegments
- const itemsToSearch = indexesOfSegments.map((indexStr: string) => {
- const index = parseInt(indexStr, 10);
- const segment = originalSegments[index];
- return { text: segment.text, start: segment.start };
- });
+ let segments = original_segments;
- console.log('Constructed itemsToSearch:', itemsToSearch);
+ // If specific indexes are provided, filter segments by those indexes
+ if (indexesOfSegments && indexesOfSegments.length > 0) {
+ segments = original_segments.filter((segment: any) => indexesOfSegments.includes(segment.index));
+ }
+
+ // If no segments match the indexes, use all segments
+ if (segments.length === 0) {
+ segments = original_segments;
+ }
- // Helper function to calculate word overlap score
+ // First try to find an exact match
+ const exactMatch = segments.find((segment: any) => segment.text && segment.text.includes(citationText));
+
+ if (exactMatch) {
+ return exactMatch.start;
+ }
+
+ // If no exact match, find segment with best word overlap
const calculateWordOverlap = (text1: string, text2: string): number => {
- const words1 = new Set(text1.toLowerCase().split(/\W+/));
- const words2 = new Set(text2.toLowerCase().split(/\W+/));
- const intersection = new Set([...words1].filter(word => words2.has(word)));
- return intersection.size / Math.max(words1.size, words2.size); // Jaccard similarity
+ if (!text1 || !text2) return 0;
+
+ const words1 = text1.toLowerCase().split(/\s+/);
+ const words2 = text2.toLowerCase().split(/\s+/);
+ const wordSet1 = new Set(words1);
+
+ let overlap = 0;
+ for (const word of words2) {
+ if (wordSet1.has(word)) {
+ overlap++;
+ }
+ }
+
+ // Return percentage of overlap relative to the shorter text
+ return overlap / Math.min(words1.length, words2.length);
};
- // Search for the best matching segment
- let bestMatchStart = 0;
- let bestScore = 0;
-
- console.log(`Searching for best match for query: "${citationText}"`);
- itemsToSearch.forEach(item => {
- const score = calculateWordOverlap(citationText, item.text);
- console.log(`Comparing query to segment: "${item.text}" | Score: ${score}`);
- if (score > bestScore) {
- bestScore = score;
- bestMatchStart = item.start;
+ // Find segment with highest word overlap
+ let bestMatch = null;
+ let highestOverlap = 0;
+
+ for (const segment of segments) {
+ if (!segment.text) continue;
+
+ const overlap = calculateWordOverlap(segment.text, citationText);
+ if (overlap > highestOverlap) {
+ highestOverlap = overlap;
+ bestMatch = segment;
}
- });
+ }
- console.log('Best match found with score:', bestScore, '| Start time:', bestMatchStart);
+ // Only return matches with significant overlap (more than 30%)
+ if (bestMatch && highestOverlap > 0.3) {
+ return bestMatch.start;
+ }
- // Return the start time of the best match
- return bestMatchStart;
+ // If no good match found, return the start of the first segment as fallback
+ return segments.length > 0 ? segments[0].start : -1;
};
/**
@@ -772,7 +883,9 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
break;
case CHUNK_TYPE.CSV:
case CHUNK_TYPE.URL:
- DocumentManager.Instance.showDocument(doc, { willZoomCentered: true });
+ DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {
+ console.log(`Showing web document in viewer with URL: ${foundChunk.url}`);
+ });
break;
default:
console.error('Unhandled chunk type:', foundChunk.chunkType);
@@ -879,6 +992,16 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
}
});
this.addScrollListener();
+
+ // Initialize the document manager by finding existing documents
+ this.docManager.initializeFindDocsFreeform();
+
+ // If there are stored doc IDs in our list of docs to add, process them
+ if (this._linked_docs_to_add.size > 0) {
+ this._linked_docs_to_add.forEach(doc => {
+ this.docManager.processDocument(doc);
+ });
+ }
}
/**
@@ -892,28 +1015,28 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
/**
* Getter that retrieves all linked documents for the current document.
*/
- @computed
- get linkedDocs() {
- return LinkManager.Instance.getAllRelatedLinks(this.Document)
- .map(d => DocCast(LinkManager.getOppositeAnchor(d, this.Document)))
- .map(d => DocCast(d?.annotationOn, d))
- .filter(d => d);
+ @computed get linkedDocs(): Doc[] {
+ const docIds = this.docManager.listDocs();
+ const docs: Doc[] = [];
+
+ // Get documents from the document manager using the getDocument method
+ docIds.forEach(id => {
+ const doc = this.docManager.getDocument(id);
+ if (doc) {
+ docs.push(doc);
+ }
+ });
+
+ return docs;
}
/**
- * Getter that retrieves document IDs of linked documents that have AI-related content.
+ * Getter that retrieves document IDs of linked documents that have PDF_chunker–parsed content.
*/
@computed
- get docIds() {
- return LinkManager.Instance.getAllRelatedLinks(this.Document)
- .map(d => DocCast(LinkManager.getOppositeAnchor(d, this.Document)))
- .map(d => DocCast(d?.annotationOn, d))
- .filter(d => d)
- .filter(d => {
- console.log(d.ai_doc_id);
- return d.ai_doc_id;
- })
- .map(d => StrCast(d.ai_doc_id));
+ get docIds(): string[] {
+ // Use the document manager to get all document IDs
+ return Array.from(this.docManager.listDocs());
}
/**
@@ -921,23 +1044,18 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
*/
@computed
get summaries(): string {
- return (
- LinkManager.Instance.getAllRelatedLinks(this.Document)
- .map(d => DocCast(LinkManager.getOppositeAnchor(d, this.Document)))
- .map(d => DocCast(d?.annotationOn, d))
- .filter(d => d)
- .filter(d => d.summary)
- .map((doc, index) => {
- if (PDFCast(doc.data)) {
- return `<summary file_name="${PDFCast(doc.data).url.pathname}" applicable_tools=["rag"]>${doc.summary}</summary>`;
- } else if (CsvCast(doc.data)) {
- return `<summary file_name="${CsvCast(doc.data).url.pathname}" applicable_tools=["dataAnalysis"]>${doc.summary}</summary>`;
- } else {
- return `${index + 1}) ${doc.summary}`;
- }
- })
- .join('\n') + '\n'
- );
+ const linkedDocs = Array.from(this.docManager.listDocs())
+ .map(id => {
+ const doc = this.docManager.extractDocumentMetadata(id);
+ if (doc && doc.fields && (doc.fields.layout.summary || doc.fields.data.summary)) {
+ return doc.fields.layout.summary || doc.fields.data.summary;
+ }
+ return null;
+ })
+ .filter(Boolean)
+ .join('\n\n');
+
+ return linkedDocs;
}
/**
@@ -965,7 +1083,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
// Other helper methods for retrieving document data and processing
- retrieveSummaries = () => {
+ retrieveSummaries = (): string => {
return this.summaries;
};
@@ -973,12 +1091,12 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
return this.linkedCSVs;
};
- retrieveFormattedHistory = () => {
+ retrieveFormattedHistory = (): string => {
return this.formattedHistory;
};
- retrieveDocIds = () => {
- return this.docIds;
+ retrieveDocIds = (): string[] => {
+ return Array.from(this.docManager.listDocs());
};
/**
diff --git a/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts b/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts
index 4b751acc0..e6c2421e5 100644
--- a/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts
+++ b/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts
@@ -417,9 +417,9 @@ export class DocumentMetadataTool extends BaseTool<DocumentMetadataToolParamsTyp
const title = String(args.title);
const data = String(args.data);
- const createdDoc = this._docManager.createDocInDash(docType, title, data);
+ const id = this._docManager.createDocInDash(docType, data, { title: title });
- if (!createdDoc) {
+ if (!id) {
return [
{
type: 'text',
@@ -427,18 +427,14 @@ export class DocumentMetadataTool extends BaseTool<DocumentMetadataToolParamsTyp
},
];
}
-
- // Update our local document maps with the new document
- this._docManager.processDocument(createdDoc);
-
// Get the created document's metadata
- const createdMetadata = this._docManager.extractDocumentMetadata(this._docManager.createAgentDoc(createdDoc));
+ const createdMetadata = this._docManager.extractDocumentMetadata(id);
return [
{
type: 'text',
text: `Document created successfully.
-Document ID: ${createdDoc.id}
+Document ID: ${id}
Type: ${docType}
Title: "${title}"
@@ -447,9 +443,9 @@ You can now use the "edit" action to modify additional properties of this docume
Next steps:
1. Use the "getFieldOptions" action to understand available editable/addable fields/properties and their dependencies.
-2. To modify this document, use: { action: "edit", documentId: "${createdDoc.id}", fieldEdits: [{"fieldName":"property","fieldValue":"value"}] }
+2. To modify this document, use: { action: "edit", documentId: "${id}", fieldEdits: [{"fieldName":"property","fieldValue":"value"}] }
3. To add styling, consider setting backgroundColor, fontColor, or other properties
-4. For text documents, you can edit the content with: { action: "edit", documentId: "${createdDoc.id}", fieldEdits: [{"fieldName":"text","fieldValue":"New content"}] }
+4. For text documents, you can edit the content with: { action: "edit", documentId: "${id}", fieldEdits: [{"fieldName":"text","fieldValue":"New content"}] }
Full metadata for the created document:
${JSON.stringify(createdMetadata, null, 2)}`,
diff --git a/src/client/views/nodes/chatbot/tools/SearchTool.ts b/src/client/views/nodes/chatbot/tools/SearchTool.ts
index 2ee30f0cf..53f5fc109 100644
--- a/src/client/views/nodes/chatbot/tools/SearchTool.ts
+++ b/src/client/views/nodes/chatbot/tools/SearchTool.ts
@@ -3,6 +3,9 @@ import { Networking } from '../../../../Network';
import { BaseTool } from './BaseTool';
import { Observation } from '../types/types';
import { ParametersType, ToolInfo } from '../types/tool_types';
+import { Agent } from 'http';
+import { AgentDocumentManager } from '../utils/AgentDocumentManager';
+import { StrCast } from '../../../../../fields/Types';
const searchToolParams = [
{
@@ -25,12 +28,12 @@ const searchToolInfo: ToolInfo<SearchToolParamsType> = {
};
export class SearchTool extends BaseTool<SearchToolParamsType> {
- private _addLinkedUrlDoc: (url: string, id: string) => void;
+ private _docManager: AgentDocumentManager;
private _max_results: number;
- constructor(addLinkedUrlDoc: (url: string, id: string) => void, max_results: number = 3) {
+ constructor(docManager: AgentDocumentManager, max_results: number = 3) {
super(searchToolInfo);
- this._addLinkedUrlDoc = addLinkedUrlDoc;
+ this._docManager = docManager;
this._max_results = max_results;
}
@@ -46,8 +49,13 @@ export class SearchTool extends BaseTool<SearchToolParamsType> {
max_results: this._max_results,
})) as { results: { url: string; snippet: string }[] };
const data = results.map((result: { url: string; snippet: string }) => {
- const id = uuidv4();
- this._addLinkedUrlDoc(result.url, id);
+ // Create a web document with the URL
+ const id = this._docManager.createDocInDash('web', result.url, {
+ title: `Search Result: ${result.url}`,
+ text_html: result.snippet,
+ data_useCors: true,
+ });
+
return {
type: 'text' as const,
text: `<chunk chunk_id="${id}" chunk_type="url"><url>${result.url}</url><overview>${result.snippet}</overview></chunk>`,
diff --git a/src/client/views/nodes/chatbot/types/types.ts b/src/client/views/nodes/chatbot/types/types.ts
index 882e74ebb..dcb132ec7 100644
--- a/src/client/views/nodes/chatbot/types/types.ts
+++ b/src/client/views/nodes/chatbot/types/types.ts
@@ -108,6 +108,7 @@ export interface SimplifiedChunk {
start_time?: number;
end_time?: number;
indexes?: string[];
+ text?: string;
}
export interface AI_Document {
diff --git a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts
index 4eeac3c6a..c3beebcde 100644
--- a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts
+++ b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts
@@ -165,22 +165,18 @@ export class AgentDocumentManager {
}
}
- public addCustomId(doc: Doc, id: string) {
- doc.id = id;
- doc.DOCUMENT_ID_FIELD = id;
- }
-
/**
* Process a document by ensuring it has an ID and adding it to the appropriate collections
* @param doc The document to process
*/
- public processDocument(doc: Doc) {
+ public processDocument(doc: Doc): string {
// Ensure document has a persistent ID
const docId = this.ensureDocumentId(doc);
// Only add if we haven't already processed this document
if (!this.documentsById.has(docId)) {
this.documentsById.set(docId, { layoutDoc: doc, dataDoc: doc[DocData] });
}
+ return docId;
}
/**
@@ -232,7 +228,9 @@ export class AgentDocumentManager {
* @param docId The ID of the document to extract metadata from
* @returns An object containing the document's metadata
*/
- public extractDocumentMetadata(doc?: AgentDocument) {
+ public extractDocumentMetadata(id: string) {
+ if (!id) return null;
+ const doc = this.documentsById.get(id);
if (!doc) return null;
const layoutDoc = doc.layoutDoc;
const dataDoc = doc.dataDoc;
@@ -729,16 +727,14 @@ export class AgentDocumentManager {
*/
public getDocumentMetadata(documentId?: string): any {
if (documentId) {
- const doc = this.documentsById.get(documentId);
- // Get metadata for a specific document
- return this.extractDocumentMetadata(doc);
+ console.log(`Returning document metadata for docID, ${documentId}:`, this.extractDocumentMetadata(documentId));
+ return this.extractDocumentMetadata(documentId);
} else {
// Get metadata for all documents
const documentsMetadata: Record<string, any> = {};
- for (const doc of this.documentsById.values()) {
- documentsMetadata.add(this.extractDocumentMetadata(doc) ?? { documentId: doc.layoutDoc.id, title: doc.layoutDoc.title, type: doc.layoutDoc.type });
+ for (const documentId of this.documentsById.keys()) {
+ documentsMetadata.add(this.extractDocumentMetadata(documentId));
}
-
return {
documentCount: this.documentsById.size,
documents: documentsMetadata,
@@ -845,14 +841,15 @@ export class AgentDocumentManager {
return Object.values(supportedDocTypes).includes(docType as supportedDocTypes);
}
/**
- * Creates a document in the dashboard.
+ * Creates a document in the dashboard and returns its ID.
+ * This is a public API used by tools like SearchTool.
*
- * @param {string} doc_type - The type of document to create.
- * @param {string} data - The data used to generate the document.
- * @param {DocumentOptions} options - Configuration options for the document.
- * @returns {Promise<void>} A promise that resolves once the document is created and displayed.
+ * @param docType The type of document to create
+ * @param data The data for the document
+ * @param options Optional configuration options
+ * @returns The ID of the created document
*/
- createDocInDash = (docType: string, title: string, data: string) => {
+ public createDocInDash(docType: string, data: string, options?: any): string {
// Validate doc_type
if (!this.isValidDocType(docType)) {
throw new Error(`Invalid document type: ${docType}`);
@@ -862,10 +859,10 @@ export class AgentDocumentManager {
// Create simple document with just title and data
const simpleDoc: parsedDoc = {
doc_type: docType,
- title: title,
+ title: options?.title ?? `Untitled Document ${this.documentsById.size + 1}`,
data: data,
- x: 0,
- y: 0,
+ x: options?.x ?? 0,
+ y: options?.y ?? 0,
_width: 300,
_height: 300,
_layout_fitWidth: false,
@@ -884,46 +881,111 @@ export class AgentDocumentManager {
}
};
const doc = this.chatBox.whichDoc(simpleDoc, false);
- if (doc) linkAndShowDoc(doc);
- return doc;
+ if (doc) {
+ linkAndShowDoc(doc);
+ const id = this.processDocument(doc);
+ return id;
+ } else {
+ throw new Error(`Error creating document. Created document not found.`);
+ }
} catch (error) {
throw new Error(`Error creating document: ${error}`);
}
- };
+ }
/**
 * Reports whether the given ID is currently a key in the manager's document map.
 * @param docId The ID to look up
 * @returns True if an entry exists for docId, false otherwise
 */
public has(docId: string) {
return this.documentsById.has(docId);
}
- public listDocs() {
- // List all available documents in simple format
- const docs = Array.from(this.documentsById.entries()).map(([id, doc]) => ({
- id,
- title: doc.layoutDoc.title || 'Untitled Document',
- type: doc.layoutDoc.type || doc.dataDoc.type || 'Unknown Type',
- }));
-
- if (docs.length === 0) {
- return [
- {
- type: 'text',
- text: 'No documents found in the current view.',
- },
- ];
- }
-
- return [
- {
- type: 'text',
- text: `Found ${docs.length} document(s) in the current view:\n${JSON.stringify(docs, null, 2)}`,
- },
- ];
+ /**
+  * Returns the IDs of the documents tracked by the manager.
+  *
+  * registerChunkIds stores chunk IDs as additional keys in documentsById (each pointing at its
+  * parent document) and records them in the parent's chunk_ids JSON field. Those alias keys
+  * are excluded here so callers (e.g. the vectorstore's doc_id filter) get document IDs only.
+  *
+  * @returns An array of document IDs (strings), without chunk-ID aliases.
+  */
+ public listDocs(): string[] {
+     // Cache the parsed chunk-ID set per layout doc so each chunk_ids string is parsed once.
+     const chunkIdsByDoc = new Map<Doc, Set<string>>();
+     const chunkIdsOf = (doc: Doc): Set<string> => {
+         const cached = chunkIdsByDoc.get(doc);
+         if (cached) return cached;
+         const ids = new Set<string>();
+         if (doc.chunk_ids) {
+             try {
+                 const parsed: unknown = JSON.parse(doc.chunk_ids as string);
+                 if (Array.isArray(parsed)) {
+                     for (const id of parsed) {
+                         if (typeof id === 'string') ids.add(id);
+                     }
+                 }
+             } catch {
+                 // Malformed chunk_ids: treat the document as having no chunk aliases.
+             }
+         }
+         chunkIdsByDoc.set(doc, ids);
+         return ids;
+     };
+
+     // A key is a chunk alias when it appears in the chunk_ids of the document it maps to.
+     return Array.from(this.documentsById.entries())
+         .filter(([id, info]) => !chunkIdsOf(info.layoutDoc).has(id))
+         .map(([id]) => id);
+ }
+
+ /**
+  * Registers a document in the manager under a caller-supplied ID.
+  * The ID is written onto the document (DOCUMENT_ID_FIELD) and the document is stored
+  * in the ID map with the same Doc used for both the layout and data slots.
+  * @param doc The document to register
+  * @param customId The ID to assign to the document
+  * @returns The customId that was assigned, or an empty string when doc is missing
+  */
+ public addCustomId(doc: Doc, customId: string): string {
+     if (doc) {
+         // Stamp the ID onto the document so it can be read back from the Doc itself
+         doc[this.DOCUMENT_ID_FIELD] = customId;
+
+         // Index the document by its new ID
+         const entry = { layoutDoc: doc, dataDoc: doc };
+         this.documentsById.set(customId, entry);
+
+         return customId;
+     }
+
+     console.error('Cannot add null document with custom ID');
+     return '';
+ }
- public createAgentDoc(doc: Doc) {
- // Ideally check if Doc is already in there.
- const agentDoc = { layoutDoc: doc, dataDoc: doc[DocData] };
- this.documentsById.set(this.ensureDocumentId(doc), agentDoc);
- return agentDoc;
+ /**
+  * Looks up the layout document stored under an ID.
+  * @param docId The ID of the document to retrieve
+  * @returns The entry's layout document, or undefined when the ID is not in the map
+  */
+ public getDocument(docId: string): Doc | undefined {
+     return this.documentsById.get(docId)?.layoutDoc;
+ }
+
+ /**
+  * Registers chunk IDs associated with a document in the manager.
+  *
+  * The chunk IDs are merged (without duplicates) into the JSON array stored in the parent
+  * document's chunk_ids field, and each chunk ID not already present in the ID map is added
+  * as a key pointing at the parent document's entry.
+  *
+  * @param docId The parent document ID
+  * @param chunkIds Array of chunk IDs associated with this document
+  */
+ public registerChunkIds(docId: string, chunkIds: string[]): void {
+     const docInfo = this.documentsById.get(docId);
+     if (!docInfo) {
+         console.warn(`Cannot register chunks for unknown document ID: ${docId}`);
+         return;
+     }
+
+     const doc = docInfo.layoutDoc;
+
+     // Read any previously stored chunk IDs defensively: a malformed or non-array value
+     // must not abort registration of the new chunks.
+     let existingIds: string[] = [];
+     if (doc.chunk_ids) {
+         try {
+             const parsed: unknown = JSON.parse(doc.chunk_ids as string);
+             if (Array.isArray(parsed)) {
+                 existingIds = parsed.filter((id): id is string => typeof id === 'string');
+             }
+         } catch (error) {
+             console.warn(`Ignoring malformed chunk_ids on document ${docId}: ${error}`);
+         }
+     }
+
+     // Store the merged, de-duplicated list back on the document
+     doc.chunk_ids = JSON.stringify([...new Set([...existingIds, ...chunkIds])]);
+
+     // Map each chunk ID to the parent document's entry so the parent can be found
+     // from any of its chunk IDs; existing keys are never overwritten.
+     chunkIds.forEach(chunkId => {
+         if (!this.documentsById.has(chunkId)) {
+             this.documentsById.set(chunkId, {
+                 layoutDoc: doc,
+                 dataDoc: docInfo.dataDoc,
+             });
+         }
+     });
+ }
+
+ /**
+  * Resolves a chunk ID to the ID stored on the document it is mapped to.
+  * @param chunkId The chunk ID to look up
+  * @returns The value of DOCUMENT_ID_FIELD on the mapped layout document, or undefined
+  *          when the chunk ID is not in the map
+  */
+ public getDocIdByChunkId(chunkId: string): string | undefined {
+     const owner = this.documentsById.get(chunkId);
+     return owner ? (owner.layoutDoc[this.DOCUMENT_ID_FIELD] as string) : undefined;
+ }
}
diff --git a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
index afd34f28d..4bb61d8b2 100644
--- a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
+++ b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
@@ -15,7 +15,7 @@ import { Networking } from '../../../../Network';
import { AI_Document, CHUNK_TYPE, RAGChunk } from '../types/types';
import OpenAI from 'openai';
import { Embedding } from 'openai/resources';
-import { PineconeEnvironmentVarsNotSupportedError } from '@pinecone-database/pinecone/dist/errors';
+import { AgentDocumentManager } from '../utils/AgentDocumentManager';
dotenv.config();
@@ -29,7 +29,7 @@ export class Vectorstore {
private openai: OpenAI; // OpenAI client for generating embeddings.
private indexName: string = 'pdf-chatbot'; // Default name for the index.
private _id: string; // Unique ID for the Vectorstore instance.
- private _doc_ids: () => string[]; // List of document IDs handled by this instance.
+ private docManager: AgentDocumentManager; // Document manager for handling documents
documents: AI_Document[] = []; // Store the documents indexed in the vectorstore.
@@ -37,9 +37,9 @@ export class Vectorstore {
* Initializes the Pinecone and OpenAI clients, stores the document manager,
* and initializes the Pinecone index.
* @param id The unique identifier for the vectorstore instance.
- * @param doc_ids A function that returns a list of document IDs.
+ * @param docManager An instance of AgentDocumentManager to handle document management.
*/
- constructor(id: string, doc_ids: () => string[]) {
+ constructor(id: string, docManager: AgentDocumentManager) {
const pineconeApiKey = process.env.PINECONE_API_KEY;
if (!pineconeApiKey) {
throw new Error('PINECONE_API_KEY is not defined.');
@@ -49,7 +49,7 @@ export class Vectorstore {
this.pinecone = new Pinecone({ apiKey: pineconeApiKey });
this.openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY, dangerouslyAllowBrowser: true });
this._id = id;
- this._doc_ids = doc_ids;
+ this.docManager = docManager;
this.initializeIndex();
}
@@ -109,15 +109,25 @@ export class Vectorstore {
const isAudioOrVideo = local_file_path.endsWith('.mp3') || local_file_path.endsWith('.mp4');
let result: AI_Document & { doc_id: string };
+
if (isAudioOrVideo) {
console.log('Processing media file...');
const response = await Networking.PostToServer('/processMediaFile', { fileName: path.basename(local_file_path) });
- const segmentedTranscript = response.condensed;
+
+ // Type assertion to handle the response properties
+ const typedResponse = response as {
+ condensed: Array<{ text: string; indexes: string[]; start: number; end: number }>;
+ full: Array<unknown>;
+ summary: string;
+ };
+
+ const segmentedTranscript = typedResponse.condensed;
console.log(segmentedTranscript);
- const summary = response.summary;
+ const summary = typedResponse.summary;
doc.summary = summary;
+
// Generate embeddings for each chunk
- const texts = segmentedTranscript.map((chunk: any) => chunk.text);
+ const texts = segmentedTranscript.map(chunk => chunk.text);
try {
const embeddingsResponse = await this.openai.embeddings.create({
@@ -126,10 +136,19 @@ export class Vectorstore {
encoding_format: 'float',
});
- doc.original_segments = JSON.stringify(response.full);
+ doc.original_segments = JSON.stringify(typedResponse.full);
doc.ai_type = local_file_path.endsWith('.mp3') ? 'audio' : 'video';
const doc_id = uuidv4();
+ // Register the document with the AgentDocumentManager
+ this.docManager.addCustomId(doc, doc_id);
+
+ // Generate chunk IDs upfront so we can register them
+ const chunkIds = segmentedTranscript.map(() => uuidv4());
+
+ // Register all chunk IDs with the document manager
+ this.docManager.registerChunkIds(doc_id, chunkIds);
+
// Add transcript and embeddings to metadata
result = {
doc_id,
@@ -137,13 +156,13 @@ export class Vectorstore {
file_name: local_file_path,
num_pages: 0,
summary: '',
- chunks: segmentedTranscript.map((chunk: any, index: number) => ({
- id: uuidv4(),
+ chunks: segmentedTranscript.map((chunk, index) => ({
+ id: chunkIds[index], // Use pre-generated chunk ID
values: (embeddingsResponse.data as Embedding[])[index].embedding, // Assign embedding
metadata: {
indexes: chunk.indexes,
original_document: local_file_path,
- doc_id: doc_id,
+ doc_id: doc_id, // Ensure doc_id is consistent
file_path: local_file_path,
start_time: chunk.start,
end_time: chunk.end,
@@ -159,20 +178,24 @@ export class Vectorstore {
}
doc.segmented_transcript = JSON.stringify(segmentedTranscript);
- // Simplify chunks for storage
+ // Simplify chunks for storage - ensure simplified chunks use EXACTLY the same IDs
const simplifiedChunks = result.chunks.map(chunk => ({
- chunkId: chunk.id,
+ chunkId: chunk.id, // Use the exact same ID as the full chunk
start_time: chunk.metadata.start_time,
end_time: chunk.metadata.end_time,
indexes: chunk.metadata.indexes,
chunkType: CHUNK_TYPE.VIDEO,
text: chunk.metadata.text,
+ doc_id: chunk.metadata.doc_id, // Include parent doc_id for completeness
}));
doc.chunk_simpl = JSON.stringify({ chunks: simplifiedChunks });
} else {
- // Existing document processing logic remains unchanged
+ // Process regular document
console.log('Processing regular document...');
- const { jobId } = await Networking.PostToServer('/createDocument', { file_path: local_file_path });
+ const createDocumentResponse = await Networking.PostToServer('/createDocument', { file_path: local_file_path });
+
+ // Type assertion for the response
+ const { jobId } = createDocumentResponse as { jobId: string };
while (true) {
await new Promise(resolve => setTimeout(resolve, 2000));
@@ -188,6 +211,16 @@ export class Vectorstore {
progressCallback(progressResponseJson.progress, progressResponseJson.step);
}
}
+
+ // Register the document with the AgentDocumentManager
+ this.docManager.addCustomId(doc, result.doc_id);
+
+ // Collect all chunk IDs
+ const chunkIds = result.chunks.map(chunk => chunk.id);
+
+ // Register chunks with the document manager
+ this.docManager.registerChunkIds(result.doc_id, chunkIds);
+
if (!doc.chunk_simpl) {
doc.chunk_simpl = JSON.stringify({ chunks: [] });
}
@@ -196,12 +229,13 @@ export class Vectorstore {
result.chunks.forEach((chunk: RAGChunk) => {
const chunkToAdd = {
- chunkId: chunk.id,
+ chunkId: chunk.id, // Ensure we use the exact same ID
startPage: chunk.metadata.start_page,
endPage: chunk.metadata.end_page,
location: chunk.metadata.location,
chunkType: chunk.metadata.type as CHUNK_TYPE,
text: chunk.metadata.text,
+ doc_id: chunk.metadata.doc_id, // Include parent doc_id for consistency
};
const new_chunk_simpl = JSON.parse(StrCast(doc.chunk_simpl));
new_chunk_simpl.chunks = new_chunk_simpl.chunks.concat(chunkToAdd);
@@ -298,39 +332,55 @@ export class Vectorstore {
let queryEmbedding = queryEmbeddingResponse.data[0].embedding;
- // Extract the embedding from the response.
+ // Get document IDs from the AgentDocumentManager
+ const docIds = Array.from(this.docManager.listDocs());
+ console.log('Using document IDs for retrieval:', docIds);
- console.log(this._doc_ids());
// Query the Pinecone index using the embedding and filter by document IDs.
+ // We'll query based on document IDs that are registered in the document manager
const queryResponse: QueryResponse = await this.index.query({
vector: queryEmbedding,
filter: {
- doc_id: { $in: this._doc_ids() },
+ doc_id: { $in: docIds },
},
topK,
includeValues: true,
includeMetadata: true,
});
- console.log(queryResponse);
-
- // Map the results into RAGChunks and return them.
- return queryResponse.matches.map(
- match =>
- ({
- id: match.id,
- values: match.values as number[],
- metadata: match.metadata as {
- text: string;
- type: string;
- original_document: string;
- file_path: string;
- doc_id: string;
- location: string;
- start_page: number;
- end_page: number;
- },
- }) as RAGChunk
- );
+ console.log(`Found ${queryResponse.matches.length} matching chunks`);
+
+ // For each retrieved chunk, register its chunk ID under its parent document when that document is already known to the document manager
+ // This maintains compatibility with existing code while ensuring consistency
+ const processedMatches = queryResponse.matches.map(match => {
+ const chunk = {
+ id: match.id,
+ values: match.values as number[],
+ metadata: match.metadata as {
+ text: string;
+ type: string;
+ original_document: string;
+ file_path: string;
+ doc_id: string;
+ location: string;
+ start_page: number;
+ end_page: number;
+ },
+ } as RAGChunk;
+
+ // Ensure the document manager knows about this chunk
+ // This is important for maintaining backwards compatibility
+ if (chunk.id && !this.docManager.getDocIdByChunkId(chunk.id)) {
+ // If the chunk ID isn't registered but we have a doc_id in metadata
+ if (chunk.metadata.doc_id && this.docManager.has(chunk.metadata.doc_id)) {
+ // Register the chunk with its parent document
+ this.docManager.registerChunkIds(chunk.metadata.doc_id, [chunk.id]);
+ }
+ }
+
+ return chunk;
+ });
+
+ return processedMatches;
} catch (error) {
console.error(`Error retrieving chunks: ${error}`);
return [];