From 5ce2263849bfb901e276a4c5fc8ca2dbd8b80350 Mon Sep 17 00:00:00 2001 From: "A.J. Shulman" Date: Thu, 24 Apr 2025 13:21:00 -0400 Subject: attempt at linking docs but listing metadata doesn't work --- src/client/views/nodes/chatbot/tools/SearchTool.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/client/views/nodes/chatbot/tools/SearchTool.ts') diff --git a/src/client/views/nodes/chatbot/tools/SearchTool.ts b/src/client/views/nodes/chatbot/tools/SearchTool.ts index 6a11407a5..2ee30f0cf 100644 --- a/src/client/views/nodes/chatbot/tools/SearchTool.ts +++ b/src/client/views/nodes/chatbot/tools/SearchTool.ts @@ -28,7 +28,7 @@ export class SearchTool extends BaseTool { private _addLinkedUrlDoc: (url: string, id: string) => void; private _max_results: number; - constructor(addLinkedUrlDoc: (url: string, id: string) => void, max_results: number = 4) { + constructor(addLinkedUrlDoc: (url: string, id: string) => void, max_results: number = 3) { super(searchToolInfo); this._addLinkedUrlDoc = addLinkedUrlDoc; this._max_results = max_results; -- cgit v1.2.3-70-g09d2 From 3ef3d40506348d9fd537cc8f4aea975b9770689f Mon Sep 17 00:00:00 2001 From: "A.J. Shulman" Date: Sun, 27 Apr 2025 13:14:49 -0400 Subject: new attempt with new citation unification --- .../views/nodes/chatbot/agentsystem/Agent.ts | 5 +- .../nodes/chatbot/chatboxcomponents/ChatBox.tsx | 450 +++++++++++++-------- .../nodes/chatbot/tools/DocumentMetadataTool.ts | 16 +- src/client/views/nodes/chatbot/tools/SearchTool.ts | 18 +- src/client/views/nodes/chatbot/types/types.ts | 1 + .../nodes/chatbot/utils/AgentDocumentManager.ts | 168 +++++--- .../views/nodes/chatbot/vectorstore/Vectorstore.ts | 130 ++++-- 7 files changed, 510 insertions(+), 278 deletions(-) (limited to 'src/client/views/nodes/chatbot/tools/SearchTool.ts') diff --git a/src/client/views/nodes/chatbot/agentsystem/Agent.ts b/src/client/views/nodes/chatbot/agentsystem/Agent.ts index c021d141e..80fdb6533 100644 --- a/src/client/views/nodes/chatbot/agentsystem/Agent.ts +++ b/src/client/views/nodes/chatbot/agentsystem/Agent.ts @@ -65,12 +65,9 @@ export class Agent { summaries: () => string, history: () => string, csvData: () => { filename: string; id: string; text: string }[], - addLinkedUrlDoc: (url: string, id: string) => void, getLinkedUrlDocId: (url: string) => string[], createImage: (result: Upload.FileInformation & Upload.InspectionResults, options: DocumentOptions) => void, - // eslint-disable-next-line @typescript-eslint/no-unused-vars createCSVInDash: (url: string, title: string, id: string, data: string) => void, - chatBox: ChatBox, docManager: AgentDocumentManager ) { // Initialize OpenAI client with API key from environment @@ -87,7 +84,7 @@ export class Agent { rag: new RAGTool(this.vectorstore), dataAnalysis: new DataAnalysisTool(csvData), websiteInfoScraper: new WebsiteInfoScraperTool(getLinkedUrlDocId), - searchTool: new SearchTool(addLinkedUrlDoc), + searchTool: new SearchTool(this._docManager), noTool: new NoTool(), //imageCreationTool: new ImageCreationTool(createImage), documentMetadata: new DocumentMetadataTool(this._docManager), diff --git a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx index 43765c1ce..35dbee3e9 100644 --- a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx +++ b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx @@ -71,7 +71,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { @observable private _citationPopup: { text: string; visible: boolean } = { text: '', visible: false }; // Private properties for managing OpenAI API, vector store, agent, and UI elements - private openai: OpenAI; + private openai!: OpenAI; // Using definite assignment assertion private vectorstore_id: string; private vectorstore: Vectorstore; private agent: Agent; @@ -98,25 +98,34 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { */ constructor(props: FieldViewProps) { super(props); - makeObservable(this); // Enable MobX observables + makeObservable(this); - // Initialize OpenAI, vectorstore, and agent - this.openai = this.initializeOpenAI(); - if (StrCast(this.dataDoc.vectorstore_id) == '') { - this.vectorstore_id = uuidv4(); - this.dataDoc.vectorstore_id = this.vectorstore_id; - } else { - this.vectorstore_id = StrCast(this.dataDoc.vectorstore_id); - } - this.vectorstore = new Vectorstore(this.vectorstore_id, this.retrieveDocIds); + this.messagesRef = React.createRef(); this.docManager = new AgentDocumentManager(this); - this.agent = new Agent(this.vectorstore, this.retrieveSummaries, this.retrieveFormattedHistory, this.retrieveCSVData, this.addLinkedUrlDoc, this.getLinkedUrlDocIds, this.createImageInDash, this.createCSVInDash, this, this.docManager); - // Reinitialize the DocumentMetadataTool with a direct reference to this ChatBox instance - // This ensures the tool can properly access documents in the same Freeform view - this.agent.reinitializeDocumentMetadataTool(); + // Initialize OpenAI client + this.initializeOpenAI(); + + // Create a unique vectorstore ID for this ChatBox + this.vectorstore_id = uuidv4(); + + // Initialize vectorstore with the document manager + this.vectorstore = new Vectorstore(this.vectorstore_id, this.docManager); + + // Create an agent with the vectorstore + this.agent = new Agent( + this.vectorstore, + this.retrieveSummaries.bind(this), + this.retrieveFormattedHistory.bind(this), + this.retrieveCSVData.bind(this), + this.retrieveDocIds.bind(this), + this.createImageInDash.bind(this), + this.createCSVInDash.bind(this), + this.docManager + ); - this.messagesRef = React.createRef(); + // Add event listeners + this.addScrollListener(); // Reaction to update dataDoc when chat history changes reaction( @@ -140,22 +149,25 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { */ @action addDocToVectorstore = async (newLinkedDoc: Doc) => { - this._uploadProgress = 0; - this._currentStep = 'Initializing...'; - this._isUploadingDocs = true; - try { - // Add the document to the vectorstore + this._isUploadingDocs = true; + + // Process the document first to ensure it has a valid ID + this.docManager.processDocument(newLinkedDoc); + + // Add the document to the vectorstore which will also register chunks await this.vectorstore.addAIDoc(newLinkedDoc, this.updateProgress); - } catch (error) { - console.error('Error uploading document:', error); - this._currentStep = 'Error during upload'; - } finally { - runInAction(() => { - this._isUploadingDocs = false; - this._uploadProgress = 0; - this._currentStep = ''; - }); + + // No longer needed as documents are tracked by the AgentDocumentManager + // this._linked_docs_to_add.add(newLinkedDoc); + + this._isUploadingDocs = false; + + return true; + } catch (err) { + console.error('Error adding document to vectorstore:', err); + this._isUploadingDocs = false; + return false; } }; @@ -238,7 +250,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { apiKey: process.env.OPENAI_KEY, dangerouslyAllowBrowser: true, }; - return new OpenAI(configuration); + this.openai = new OpenAI(configuration); } /** @@ -375,49 +387,6 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { } }; - /** - * Adds a linked document from a URL for future reference and analysis. - * @param url The URL of the document to add. - * @param id The unique identifier for the document. - */ - @action - addLinkedUrlDoc = async (url: string, id: string) => { - const doc = Docs.Create.WebDocument(url, { data_useCors: true }); - this.docManager.addCustomId(doc, id); - const linkDoc = Docs.Create.LinkDocument(this.Document, doc); - LinkManager.Instance.addLink(linkDoc); - - const chunkToAdd = { - chunkId: id, - chunkType: CHUNK_TYPE.URL, - url: url, - }; - - doc.chunk_simpl = JSON.stringify({ chunks: [chunkToAdd] }); - this.docManager.processDocument(doc); - }; - - /** - * Retrieves the IDs of linked url documents. - * @returns An array of document IDs. - */ - @action - getLinkedUrlDocIds = () => { - const linkedDocs: Doc[] = this.linkedDocs; - const linkedUrlDocIds: string[] = []; - - for (const doc of linkedDocs) { - if (doc.chunk_simpl) { - const docChunkSimpl = JSON.parse(StrCast(doc.chunk_simpl)) as { chunks: SimplifiedChunk[] }; - const foundChunk = docChunkSimpl.chunks.find(chunk => chunk.chunkType === CHUNK_TYPE.URL); - if (foundChunk) { - linkedUrlDocIds.push(foundChunk.chunkId); - } - } - } - return linkedUrlDocIds; - }; - /** * Getter to retrieve the current user's name from the client utils. */ @@ -613,82 +582,224 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { */ @action handleCitationClick = async (citation: Citation) => { - const currentLinkedDocs: Doc[] = this.linkedDocs; - const chunkId = citation.chunk_id; + try { + // Extract values from MobX proxy object if needed + const chunkId = typeof citation.chunk_id === 'object' ? (citation.chunk_id as any).toString() : citation.chunk_id; + + // For debugging + console.log('Citation clicked:', { + chunkId, + citation: JSON.stringify(citation, null, 2), + }); - for (const doc of currentLinkedDocs) { - if (doc.chunk_simpl) { - const docChunkSimpl = JSON.parse(StrCast(doc.chunk_simpl)) as { chunks: SimplifiedChunk[] }; - const foundChunk = docChunkSimpl.chunks.find(chunk => chunk.chunkId === chunkId); + // Try to find the document + const linkedDocs = this.linkedDocs; + let doc: Doc | undefined; - if (foundChunk) { - // Handle media chunks specifically + // First try to find the document using the document manager's chunk ID lookup + const parentDocId = this.docManager.getDocIdByChunkId(chunkId); + if (parentDocId) { + doc = this.docManager.getDocument(parentDocId); + console.log(`Found document by chunk ID lookup: ${parentDocId}`); + } - if (doc.ai_type == 'video' || doc.ai_type == 'audio') { - const directMatchSegmentStart = this.getDirectMatchingSegmentStart(doc, citation.direct_text || '', foundChunk.indexes || []); + // If not found, fall back to searching through linked docs (maintains compatibility) + if (!doc) { + for (const linkedDoc of linkedDocs) { + if (linkedDoc.chunk_simpl) { + try { + const docChunkSimpl = JSON.parse(StrCast(linkedDoc.chunk_simpl)) as { chunks: SimplifiedChunk[] }; + const foundChunk = docChunkSimpl.chunks.find(chunk => chunk.chunkId === chunkId); + if (foundChunk) { + doc = linkedDoc; + console.log(`Found document by iterating through linked docs`); + break; + } + } catch (e) { + console.error(`Error parsing chunk_simpl for doc ${linkedDoc.id}:`, e); + } + } + } + } - if (directMatchSegmentStart) { - // Navigate to the segment's start time in the media player - await this.goToMediaTimestamp(doc, directMatchSegmentStart, doc.ai_type); - } else { - console.error('No direct matching segment found for the citation.'); + if (!doc) { + console.warn(`Document not found for citation with chunk_id: ${chunkId}`); + return; + } + + // Process the chunk data + let docChunkSimpl: { chunks: SimplifiedChunk[] } = { chunks: [] }; + try { + docChunkSimpl = JSON.parse(StrCast(doc.chunk_simpl) || '{"chunks":[]}'); + } catch (e) { + console.error(`Error parsing chunk_simpl for the found document:`, e); + return; + } + + const foundChunk = docChunkSimpl.chunks.find((chunk: SimplifiedChunk) => chunk.chunkId === chunkId); + + // Handle different chunk types + if (foundChunk) { + console.log(`Found chunk in document:`, foundChunk); + + // Handle video chunks + if (foundChunk.chunkType === CHUNK_TYPE.VIDEO) { + if (foundChunk.start_time !== undefined) { + await this.goToMediaTimestamp(doc, foundChunk.start_time, 'video'); + } else { + console.warn('Video chunk missing start_time:', foundChunk); + } + } + // Handle audio chunks - note that we're using string comparison since 'audio' isn't in CHUNK_TYPE enum + else if (String(foundChunk.chunkType).toLowerCase() === 'audio') { + if (foundChunk.start_time !== undefined) { + await this.goToMediaTimestamp(doc, foundChunk.start_time, 'audio'); + } else { + console.warn('Audio chunk missing start_time:', foundChunk); + } + } + // Handle table or image chunks + else if (foundChunk.chunkType === CHUNK_TYPE.TABLE || foundChunk.chunkType === CHUNK_TYPE.IMAGE) { + this.handleOtherChunkTypes(foundChunk, citation, doc); + } + // Handle text chunks + else if (foundChunk.chunkType === CHUNK_TYPE.TEXT) { + // Find text from the document's chunks metadata + let chunkText = ''; + + try { + // We already parsed the chunks earlier, so use that + const matchingChunk = docChunkSimpl.chunks.find(c => c.chunkId === foundChunk.chunkId); + if (matchingChunk && 'text' in matchingChunk) { + // If the text property exists on the chunk (even though it's not in the type) + chunkText = String(matchingChunk['text'] || ''); } + } catch (e) { + console.error('Error getting chunk text:', e); + } + + // Default text if none found + if (!chunkText) { + chunkText = 'Text content not available'; + } + + this._citationPopup = { + text: chunkText, + visible: true, + }; + } + // Handle URL chunks + else if (foundChunk.chunkType === CHUNK_TYPE.URL) { + if (foundChunk.url) { + // Instead of opening the URL in a new window, show the document in the viewer + DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {}); + console.log(`Navigated to web document with URL: ${foundChunk.url}`); } else { - // Handle other chunk types as before - this.handleOtherChunkTypes(foundChunk, citation, doc); + console.warn('URL chunk missing URL:', foundChunk); } } + } else if (doc?.original_segments) { + // Handle original segments for media files + let original_segments: any[] = []; + try { + original_segments = JSON.parse(StrCast(doc.original_segments)); + } catch (e) { + console.error(`Error parsing original_segments:`, e); + return; + } + + // Check if there's direct text to find in the segments + if (citation.direct_text) { + // Find the segment that contains the direct text + const start = this.getDirectMatchingSegmentStart(doc, citation.direct_text, []); + if (start !== -1) { + await this.goToMediaTimestamp(doc, start, doc.ai_type === 'audio' ? 'audio' : 'video'); + } + } + } else { + console.warn('Unable to find chunk or segments for citation', citation); } + } catch (error) { + console.error('Error handling citation click:', error); } }; + /** + * Finds a matching segment in a document based on text content. + * @param doc The document to search in + * @param citationText The text to find in the document + * @param indexesOfSegments Optional indexes of segments to search in + * @returns The starting timestamp of the matching segment, or -1 if not found + */ getDirectMatchingSegmentStart = (doc: Doc, citationText: string, indexesOfSegments: string[]): number => { - const originalSegments = JSON.parse(StrCast(doc.original_segments!)).map((segment: any, index: number) => ({ - index: index.toString(), - text: segment.text, - start: segment.start, - end: segment.end, - })); - - if (!Array.isArray(originalSegments) || originalSegments.length === 0 || !Array.isArray(indexesOfSegments)) { - return 0; + if (!doc || !citationText) return -1; + + // Get original segments from the document + const original_segments = doc.original_segments ? JSON.parse(StrCast(doc.original_segments)) : []; + + if (!original_segments || !Array.isArray(original_segments) || original_segments.length === 0) { + return -1; } - // Create itemsToSearch array based on indexesOfSegments - const itemsToSearch = indexesOfSegments.map((indexStr: string) => { - const index = parseInt(indexStr, 10); - const segment = originalSegments[index]; - return { text: segment.text, start: segment.start }; - }); + let segments = original_segments; - console.log('Constructed itemsToSearch:', itemsToSearch); + // If specific indexes are provided, filter segments by those indexes + if (indexesOfSegments && indexesOfSegments.length > 0) { + segments = original_segments.filter((segment: any) => indexesOfSegments.includes(segment.index)); + } + + // If no segments match the indexes, use all segments + if (segments.length === 0) { + segments = original_segments; + } - // Helper function to calculate word overlap score + // First try to find an exact match + const exactMatch = segments.find((segment: any) => segment.text && segment.text.includes(citationText)); + + if (exactMatch) { + return exactMatch.start; + } + + // If no exact match, find segment with best word overlap const calculateWordOverlap = (text1: string, text2: string): number => { - const words1 = new Set(text1.toLowerCase().split(/\W+/)); - const words2 = new Set(text2.toLowerCase().split(/\W+/)); - const intersection = new Set([...words1].filter(word => words2.has(word))); - return intersection.size / Math.max(words1.size, words2.size); // Jaccard similarity + if (!text1 || !text2) return 0; + + const words1 = text1.toLowerCase().split(/\s+/); + const words2 = text2.toLowerCase().split(/\s+/); + const wordSet1 = new Set(words1); + + let overlap = 0; + for (const word of words2) { + if (wordSet1.has(word)) { + overlap++; + } + } + + // Return percentage of overlap relative to the shorter text + return overlap / Math.min(words1.length, words2.length); }; - // Search for the best matching segment - let bestMatchStart = 0; - let bestScore = 0; - - console.log(`Searching for best match for query: "${citationText}"`); - itemsToSearch.forEach(item => { - const score = calculateWordOverlap(citationText, item.text); - console.log(`Comparing query to segment: "${item.text}" | Score: ${score}`); - if (score > bestScore) { - bestScore = score; - bestMatchStart = item.start; + // Find segment with highest word overlap + let bestMatch = null; + let highestOverlap = 0; + + for (const segment of segments) { + if (!segment.text) continue; + + const overlap = calculateWordOverlap(segment.text, citationText); + if (overlap > highestOverlap) { + highestOverlap = overlap; + bestMatch = segment; } - }); + } - console.log('Best match found with score:', bestScore, '| Start time:', bestMatchStart); + // Only return matches with significant overlap (more than 30%) + if (bestMatch && highestOverlap > 0.3) { + return bestMatch.start; + } - // Return the start time of the best match - return bestMatchStart; + // If no good match found, return the start of the first segment as fallback + return segments.length > 0 ? segments[0].start : -1; }; /** @@ -772,7 +883,9 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { break; case CHUNK_TYPE.CSV: case CHUNK_TYPE.URL: - DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }); + DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => { + console.log(`Showing web document in viewer with URL: ${foundChunk.url}`); + }); break; default: console.error('Unhandled chunk type:', foundChunk.chunkType); @@ -879,6 +992,16 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { } }); this.addScrollListener(); + + // Initialize the document manager by finding existing documents + this.docManager.initializeFindDocsFreeform(); + + // If there are stored doc IDs in our list of docs to add, process them + if (this._linked_docs_to_add.size > 0) { + this._linked_docs_to_add.forEach(doc => { + this.docManager.processDocument(doc); + }); + } } /** @@ -892,28 +1015,28 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { /** * Getter that retrieves all linked documents for the current document. */ - @computed - get linkedDocs() { - return LinkManager.Instance.getAllRelatedLinks(this.Document) - .map(d => DocCast(LinkManager.getOppositeAnchor(d, this.Document))) - .map(d => DocCast(d?.annotationOn, d)) - .filter(d => d); + @computed get linkedDocs(): Doc[] { + const docIds = this.docManager.listDocs(); + const docs: Doc[] = []; + + // Get documents from the document manager using the getDocument method + docIds.forEach(id => { + const doc = this.docManager.getDocument(id); + if (doc) { + docs.push(doc); + } + }); + + return docs; } /** - * Getter that retrieves document IDs of linked documents that have AI-related content. + * Getter that retrieves document IDs of linked documents that have PDF_chunker–parsed content. */ @computed - get docIds() { - return LinkManager.Instance.getAllRelatedLinks(this.Document) - .map(d => DocCast(LinkManager.getOppositeAnchor(d, this.Document))) - .map(d => DocCast(d?.annotationOn, d)) - .filter(d => d) - .filter(d => { - console.log(d.ai_doc_id); - return d.ai_doc_id; - }) - .map(d => StrCast(d.ai_doc_id)); + get docIds(): string[] { + // Use the document manager to get all document IDs + return Array.from(this.docManager.listDocs()); } /** @@ -921,23 +1044,18 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { */ @computed get summaries(): string { - return ( - LinkManager.Instance.getAllRelatedLinks(this.Document) - .map(d => DocCast(LinkManager.getOppositeAnchor(d, this.Document))) - .map(d => DocCast(d?.annotationOn, d)) - .filter(d => d) - .filter(d => d.summary) - .map((doc, index) => { - if (PDFCast(doc.data)) { - return `${doc.summary}`; - } else if (CsvCast(doc.data)) { - return `${doc.summary}`; - } else { - return `${index + 1}) ${doc.summary}`; - } - }) - .join('\n') + '\n' - ); + const linkedDocs = Array.from(this.docManager.listDocs()) + .map(id => { + const doc = this.docManager.extractDocumentMetadata(id); + if (doc && doc.fields && (doc.fields.layout.summary || doc.fields.data.summary)) { + return doc.fields.layout.summary || doc.fields.data.summary; + } + return null; + }) + .filter(Boolean) + .join('\n\n'); + + return linkedDocs; } /** @@ -965,7 +1083,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { // Other helper methods for retrieving document data and processing - retrieveSummaries = () => { + retrieveSummaries = (): string => { return this.summaries; }; @@ -973,12 +1091,12 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { return this.linkedCSVs; }; - retrieveFormattedHistory = () => { + retrieveFormattedHistory = (): string => { return this.formattedHistory; }; - retrieveDocIds = () => { - return this.docIds; + retrieveDocIds = (): string[] => { + return Array.from(this.docManager.listDocs()); }; /** diff --git a/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts b/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts index 4b751acc0..e6c2421e5 100644 --- a/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts +++ b/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts @@ -417,9 +417,9 @@ export class DocumentMetadataTool extends BaseTool = { }; export class SearchTool extends BaseTool { - private _addLinkedUrlDoc: (url: string, id: string) => void; + private _docManager: AgentDocumentManager; private _max_results: number; - constructor(addLinkedUrlDoc: (url: string, id: string) => void, max_results: number = 3) { + constructor(docManager: AgentDocumentManager, max_results: number = 3) { super(searchToolInfo); - this._addLinkedUrlDoc = addLinkedUrlDoc; + this._docManager = docManager; this._max_results = max_results; } @@ -46,8 +49,13 @@ export class SearchTool extends BaseTool { max_results: this._max_results, })) as { results: { url: string; snippet: string }[] }; const data = results.map((result: { url: string; snippet: string }) => { - const id = uuidv4(); - this._addLinkedUrlDoc(result.url, id); + // Create a web document with the URL + const id = this._docManager.createDocInDash('web', result.url, { + title: `Search Result: ${result.url}`, + text_html: result.snippet, + data_useCors: true, + }); + return { type: 'text' as const, text: `${result.url}${result.snippet}`, diff --git a/src/client/views/nodes/chatbot/types/types.ts b/src/client/views/nodes/chatbot/types/types.ts index 882e74ebb..dcb132ec7 100644 --- a/src/client/views/nodes/chatbot/types/types.ts +++ b/src/client/views/nodes/chatbot/types/types.ts @@ -108,6 +108,7 @@ export interface SimplifiedChunk { start_time?: number; end_time?: number; indexes?: string[]; + text?: string; } export interface AI_Document { diff --git a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts index 4eeac3c6a..c3beebcde 100644 --- a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts +++ b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts @@ -165,22 +165,18 @@ export class AgentDocumentManager { } } - public addCustomId(doc: Doc, id: string) { - doc.id = id; - doc.DOCUMENT_ID_FIELD = id; - } - /** * Process a document by ensuring it has an ID and adding it to the appropriate collections * @param doc The document to process */ - public processDocument(doc: Doc) { + public processDocument(doc: Doc): string { // Ensure document has a persistent ID const docId = this.ensureDocumentId(doc); // Only add if we haven't already processed this document if (!this.documentsById.has(docId)) { this.documentsById.set(docId, { layoutDoc: doc, dataDoc: doc[DocData] }); } + return docId; } /** @@ -232,7 +228,9 @@ export class AgentDocumentManager { * @param docId The ID of the document to extract metadata from * @returns An object containing the document's metadata */ - public extractDocumentMetadata(doc?: AgentDocument) { + public extractDocumentMetadata(id: string) { + if (!id) return null; + const doc = this.documentsById.get(id); if (!doc) return null; const layoutDoc = doc.layoutDoc; const dataDoc = doc.dataDoc; @@ -729,16 +727,14 @@ export class AgentDocumentManager { */ public getDocumentMetadata(documentId?: string): any { if (documentId) { - const doc = this.documentsById.get(documentId); - // Get metadata for a specific document - return this.extractDocumentMetadata(doc); + console.log(`Returning document metadata for docID, ${documentId}:`, this.extractDocumentMetadata(documentId)); + return this.extractDocumentMetadata(documentId); } else { // Get metadata for all documents const documentsMetadata: Record = {}; - for (const doc of this.documentsById.values()) { - documentsMetadata.add(this.extractDocumentMetadata(doc) ?? { documentId: doc.layoutDoc.id, title: doc.layoutDoc.title, type: doc.layoutDoc.type }); + for (const documentId of this.documentsById.keys()) { + documentsMetadata.add(this.extractDocumentMetadata(documentId)); } - return { documentCount: this.documentsById.size, documents: documentsMetadata, @@ -845,14 +841,15 @@ export class AgentDocumentManager { return Object.values(supportedDocTypes).includes(docType as supportedDocTypes); } /** - * Creates a document in the dashboard. + * Creates a document in the dashboard and returns its ID. + * This is a public API used by tools like SearchTool. * - * @param {string} doc_type - The type of document to create. - * @param {string} data - The data used to generate the document. - * @param {DocumentOptions} options - Configuration options for the document. - * @returns {Promise} A promise that resolves once the document is created and displayed. + * @param docType The type of document to create + * @param data The data for the document + * @param options Optional configuration options + * @returns The ID of the created document */ - createDocInDash = (docType: string, title: string, data: string) => { + public createDocInDash(docType: string, data: string, options?: any): string { // Validate doc_type if (!this.isValidDocType(docType)) { throw new Error(`Invalid document type: ${docType}`); @@ -862,10 +859,10 @@ export class AgentDocumentManager { // Create simple document with just title and data const simpleDoc: parsedDoc = { doc_type: docType, - title: title, + title: options?.title ?? `Untitled Document ${this.documentsById.size + 1}`, data: data, - x: 0, - y: 0, + x: options?.x ?? 0, + y: options?.y ?? 0, _width: 300, _height: 300, _layout_fitWidth: false, @@ -884,46 +881,111 @@ export class AgentDocumentManager { } }; const doc = this.chatBox.whichDoc(simpleDoc, false); - if (doc) linkAndShowDoc(doc); - return doc; + if (doc) { + linkAndShowDoc(doc); + const id = this.processDocument(doc); + return id; + } else { + throw new Error(`Error creating document. Created document not found.`); + } } catch (error) { throw new Error(`Error creating document: ${error}`); } - }; + } public has(docId: string) { return this.documentsById.has(docId); } - public listDocs() { - // List all available documents in simple format - const docs = Array.from(this.documentsById.entries()).map(([id, doc]) => ({ - id, - title: doc.layoutDoc.title || 'Untitled Document', - type: doc.layoutDoc.type || doc.dataDoc.type || 'Unknown Type', - })); - - if (docs.length === 0) { - return [ - { - type: 'text', - text: 'No documents found in the current view.', - }, - ]; - } - - return [ - { - type: 'text', - text: `Found ${docs.length} document(s) in the current view:\n${JSON.stringify(docs, null, 2)}`, - }, - ]; + /** + * Returns a list of all document IDs in the manager. + * @returns An array of document IDs (strings). + */ + public listDocs(): string[] { + return Array.from(this.documentsById.keys()); + } + + /** + * Adds a document with a custom ID to the manager + * @param doc The document to add + * @param customId The custom ID to assign to the document + * @returns The customId that was assigned + */ + public addCustomId(doc: Doc, customId: string): string { + if (!doc) { + console.error('Cannot add null document with custom ID'); + return ''; + } + + // Set the custom ID in the document's metadata + doc[this.DOCUMENT_ID_FIELD] = customId; + + // Store the document in our map + this.documentsById.set(customId, { + layoutDoc: doc, + dataDoc: doc, + }); + + return customId; } - public createAgentDoc(doc: Doc) { - // Ideally check if Doc is already in there. - const agentDoc = { layoutDoc: doc, dataDoc: doc[DocData] }; - this.documentsById.set(this.ensureDocumentId(doc), agentDoc); - return agentDoc; + /** + * Gets a document by its ID + * @param docId The ID of the document to retrieve + * @returns The document if found, undefined otherwise + */ + public getDocument(docId: string): Doc | undefined { + const docInfo = this.documentsById.get(docId); + return docInfo?.layoutDoc; + } + + /** + * Registers chunk IDs associated with a document in the manager + * @param docId The parent document ID + * @param chunkIds Array of chunk IDs associated with this document + */ + public registerChunkIds(docId: string, chunkIds: string[]): void { + // Get the document if it exists + const docInfo = this.documentsById.get(docId); + if (!docInfo) { + console.warn(`Cannot register chunks for unknown document ID: ${docId}`); + return; + } + + // Store chunk IDs on the document for future reference + const doc = docInfo.layoutDoc; + if (!doc.chunk_ids) { + doc.chunk_ids = JSON.stringify(chunkIds); + } else { + // Merge with existing chunk IDs if they exist + const existingIds = JSON.parse(doc.chunk_ids as string); + const updatedIds = [...new Set([...existingIds, ...chunkIds])]; // Remove duplicates + doc.chunk_ids = JSON.stringify(updatedIds); + } + + // Ensure each chunk ID can be linked back to its parent document + chunkIds.forEach(chunkId => { + // Store a mapping from chunk ID to parent document ID + // This allows us to easily find a document by any of its chunk IDs + if (!this.documentsById.has(chunkId)) { + this.documentsById.set(chunkId, { + layoutDoc: doc, + dataDoc: docInfo.dataDoc, + }); + } + }); + } + + /** + * Gets a document ID by a chunk ID + * @param chunkId The chunk ID to look up + * @returns The parent document ID if found + */ + public getDocIdByChunkId(chunkId: string): string | undefined { + const docInfo = this.documentsById.get(chunkId); + if (docInfo) { + return docInfo.layoutDoc[this.DOCUMENT_ID_FIELD] as string; + } + return undefined; } } diff --git a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts index afd34f28d..4bb61d8b2 100644 --- a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts +++ b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts @@ -15,7 +15,7 @@ import { Networking } from '../../../../Network'; import { AI_Document, CHUNK_TYPE, RAGChunk } from '../types/types'; import OpenAI from 'openai'; import { Embedding } from 'openai/resources'; -import { PineconeEnvironmentVarsNotSupportedError } from '@pinecone-database/pinecone/dist/errors'; +import { AgentDocumentManager } from '../utils/AgentDocumentManager'; dotenv.config(); @@ -29,7 +29,7 @@ export class Vectorstore { private openai: OpenAI; // OpenAI client for generating embeddings. private indexName: string = 'pdf-chatbot'; // Default name for the index. private _id: string; // Unique ID for the Vectorstore instance. - private _doc_ids: () => string[]; // List of document IDs handled by this instance. + private docManager: AgentDocumentManager; // Document manager for handling documents documents: AI_Document[] = []; // Store the documents indexed in the vectorstore. @@ -37,9 +37,9 @@ export class Vectorstore { * Initializes the Pinecone and OpenAI clients, sets up the document ID list, * and initializes the Pinecone index. * @param id The unique identifier for the vectorstore instance. - * @param doc_ids A function that returns a list of document IDs. + * @param docManager An instance of AgentDocumentManager to handle document management. */ - constructor(id: string, doc_ids: () => string[]) { + constructor(id: string, docManager: AgentDocumentManager) { const pineconeApiKey = process.env.PINECONE_API_KEY; if (!pineconeApiKey) { throw new Error('PINECONE_API_KEY is not defined.'); @@ -49,7 +49,7 @@ export class Vectorstore { this.pinecone = new Pinecone({ apiKey: pineconeApiKey }); this.openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY, dangerouslyAllowBrowser: true }); this._id = id; - this._doc_ids = doc_ids; + this.docManager = docManager; this.initializeIndex(); } @@ -109,15 +109,25 @@ export class Vectorstore { const isAudioOrVideo = local_file_path.endsWith('.mp3') || local_file_path.endsWith('.mp4'); let result: AI_Document & { doc_id: string }; + if (isAudioOrVideo) { console.log('Processing media file...'); const response = await Networking.PostToServer('/processMediaFile', { fileName: path.basename(local_file_path) }); - const segmentedTranscript = response.condensed; + + // Type assertion to handle the response properties + const typedResponse = response as { + condensed: Array<{ text: string; indexes: string[]; start: number; end: number }>; + full: Array; + summary: string; + }; + + const segmentedTranscript = typedResponse.condensed; console.log(segmentedTranscript); - const summary = response.summary; + const summary = typedResponse.summary; doc.summary = summary; + // Generate embeddings for each chunk - const texts = segmentedTranscript.map((chunk: any) => chunk.text); + const texts = segmentedTranscript.map(chunk => chunk.text); try { const embeddingsResponse = await this.openai.embeddings.create({ @@ -126,10 +136,19 @@ export class Vectorstore { encoding_format: 'float', }); - doc.original_segments = JSON.stringify(response.full); + doc.original_segments = JSON.stringify(typedResponse.full); doc.ai_type = local_file_path.endsWith('.mp3') ? 'audio' : 'video'; const doc_id = uuidv4(); + // Register the document with the AgentDocumentManager + this.docManager.addCustomId(doc, doc_id); + + // Generate chunk IDs upfront so we can register them + const chunkIds = segmentedTranscript.map(() => uuidv4()); + + // Register all chunk IDs with the document manager + this.docManager.registerChunkIds(doc_id, chunkIds); + // Add transcript and embeddings to metadata result = { doc_id, @@ -137,13 +156,13 @@ export class Vectorstore { file_name: local_file_path, num_pages: 0, summary: '', - chunks: segmentedTranscript.map((chunk: any, index: number) => ({ - id: uuidv4(), + chunks: segmentedTranscript.map((chunk, index) => ({ + id: chunkIds[index], // Use pre-generated chunk ID values: (embeddingsResponse.data as Embedding[])[index].embedding, // Assign embedding metadata: { indexes: chunk.indexes, original_document: local_file_path, - doc_id: doc_id, + doc_id: doc_id, // Ensure doc_id is consistent file_path: local_file_path, start_time: chunk.start, end_time: chunk.end, @@ -159,20 +178,24 @@ export class Vectorstore { } doc.segmented_transcript = JSON.stringify(segmentedTranscript); - // Simplify chunks for storage + // Simplify chunks for storage - ensure simplified chunks use EXACTLY the same IDs const simplifiedChunks = result.chunks.map(chunk => ({ - chunkId: chunk.id, + chunkId: chunk.id, // Use the exact same ID as the full chunk start_time: chunk.metadata.start_time, end_time: chunk.metadata.end_time, indexes: chunk.metadata.indexes, chunkType: CHUNK_TYPE.VIDEO, text: chunk.metadata.text, + doc_id: chunk.metadata.doc_id, // Include parent doc_id for completeness })); doc.chunk_simpl = JSON.stringify({ chunks: simplifiedChunks }); } else { - // Existing document processing logic remains unchanged + // Process regular document console.log('Processing regular document...'); - const { jobId } = await Networking.PostToServer('/createDocument', { file_path: local_file_path }); + const createDocumentResponse = await Networking.PostToServer('/createDocument', { file_path: local_file_path }); + + // Type assertion for the response + const { jobId } = createDocumentResponse as { jobId: string }; while (true) { await new Promise(resolve => setTimeout(resolve, 2000)); @@ -188,6 +211,16 @@ export class Vectorstore { progressCallback(progressResponseJson.progress, progressResponseJson.step); } } + + // Register the document with the AgentDocumentManager + this.docManager.addCustomId(doc, result.doc_id); + + // Collect all chunk IDs + const chunkIds = result.chunks.map(chunk => chunk.id); + + // Register chunks with the document manager + this.docManager.registerChunkIds(result.doc_id, chunkIds); + if (!doc.chunk_simpl) { doc.chunk_simpl = JSON.stringify({ chunks: [] }); } @@ -196,12 +229,13 @@ export class Vectorstore { result.chunks.forEach((chunk: RAGChunk) => { const chunkToAdd = { - chunkId: chunk.id, + chunkId: chunk.id, // Ensure we use the exact same ID startPage: chunk.metadata.start_page, endPage: chunk.metadata.end_page, location: chunk.metadata.location, chunkType: chunk.metadata.type as CHUNK_TYPE, text: chunk.metadata.text, + doc_id: chunk.metadata.doc_id, // Include parent doc_id for consistency }; const new_chunk_simpl = JSON.parse(StrCast(doc.chunk_simpl)); new_chunk_simpl.chunks = new_chunk_simpl.chunks.concat(chunkToAdd); @@ -298,39 +332,55 @@ export class Vectorstore { let queryEmbedding = queryEmbeddingResponse.data[0].embedding; - // Extract the embedding from the response. + // Get document IDs from the AgentDocumentManager + const docIds = Array.from(this.docManager.listDocs()); + console.log('Using document IDs for retrieval:', docIds); - console.log(this._doc_ids()); // Query the Pinecone index using the embedding and filter by document IDs. + // We'll query based on document IDs that are registered in the document manager const queryResponse: QueryResponse = await this.index.query({ vector: queryEmbedding, filter: { - doc_id: { $in: this._doc_ids() }, + doc_id: { $in: docIds }, }, topK, includeValues: true, includeMetadata: true, }); - console.log(queryResponse); - - // Map the results into RAGChunks and return them. - return queryResponse.matches.map( - match => - ({ - id: match.id, - values: match.values as number[], - metadata: match.metadata as { - text: string; - type: string; - original_document: string; - file_path: string; - doc_id: string; - location: string; - start_page: number; - end_page: number; - }, - }) as RAGChunk - ); + console.log(`Found ${queryResponse.matches.length} matching chunks`); + + // For each retrieved chunk, ensure its document ID is registered in the document manager + // This maintains compatibility with existing code while ensuring consistency + const processedMatches = queryResponse.matches.map(match => { + const chunk = { + id: match.id, + values: match.values as number[], + metadata: match.metadata as { + text: string; + type: string; + original_document: string; + file_path: string; + doc_id: string; + location: string; + start_page: number; + end_page: number; + }, + } as RAGChunk; + + // Ensure the document manager knows about this chunk + // This is important for maintaining backwards compatibility + if (chunk.id && !this.docManager.getDocIdByChunkId(chunk.id)) { + // If the chunk ID isn't registered but we have a doc_id in metadata + if (chunk.metadata.doc_id && this.docManager.has(chunk.metadata.doc_id)) { + // Register the chunk with its parent document + this.docManager.registerChunkIds(chunk.metadata.doc_id, [chunk.id]); + } + } + + return chunk; + }); + + return processedMatches; } catch (error) { console.error(`Error retrieving chunks: ${error}`); return []; -- cgit v1.2.3-70-g09d2 From e141307dbd9b951f76c908610e7b89e296ad92b8 Mon Sep 17 00:00:00 2001 From: "A.J. Shulman" Date: Sun, 11 May 2025 17:18:18 -0400 Subject: chanegd everything to be more consistent - made both web related tools use doc manager and chunk Ids --- .../views/nodes/chatbot/agentsystem/Agent.ts | 9 +- .../nodes/chatbot/chatboxcomponents/ChatBox.tsx | 50 ++---- .../nodes/chatbot/tools/DocumentMetadataTool.ts | 2 +- src/client/views/nodes/chatbot/tools/SearchTool.ts | 26 +-- .../nodes/chatbot/tools/WebsiteInfoScraperTool.ts | 30 ++-- .../views/nodes/chatbot/tools/WikipediaTool.ts | 2 +- src/client/views/nodes/chatbot/types/types.ts | 1 + .../nodes/chatbot/utils/AgentDocumentManager.ts | 192 +++++---------------- .../views/nodes/chatbot/vectorstore/Vectorstore.ts | 21 +-- src/server/ApiManagers/AssistantManager.ts | 160 ++++++++++++----- 10 files changed, 215 insertions(+), 278 deletions(-) (limited to 'src/client/views/nodes/chatbot/tools/SearchTool.ts') diff --git a/src/client/views/nodes/chatbot/agentsystem/Agent.ts b/src/client/views/nodes/chatbot/agentsystem/Agent.ts index 24471bf5b..86d40864e 100644 --- a/src/client/views/nodes/chatbot/agentsystem/Agent.ts +++ b/src/client/views/nodes/chatbot/agentsystem/Agent.ts @@ -63,10 +63,8 @@ export class Agent { */ constructor( _vectorstore: Vectorstore, - summaries: () => string, history: () => string, csvData: () => { filename: string; id: string; text: string }[], - getLinkedUrlDocId: (url: string) => string[], createImage: (result: Upload.FileInformation & Upload.InspectionResults, options: DocumentOptions) => void, createCSVInDash: (url: string, title: string, id: string, data: string) => void, docManager: AgentDocumentManager @@ -83,7 +81,7 @@ export class Agent { calculate: new CalculateTool(), rag: new RAGTool(this.vectorstore), dataAnalysis: new DataAnalysisTool(csvData), - websiteInfoScraper: new WebsiteInfoScraperTool(getLinkedUrlDocId), + websiteInfoScraper: new WebsiteInfoScraperTool(this._docManager), searchTool: new SearchTool(this._docManager), noTool: new NoTool(), //imageCreationTool: new ImageCreationTool(createImage), @@ -125,11 +123,8 @@ export class Agent { // Retrieve chat history and generate system prompt const chatHistory = this._history(); // Get document summaries directly from document manager - const documentSummaries = this._docManager.getAllDocumentSummaries(); - // Create a function that returns document summaries for the prompt - const getSummaries = () => documentSummaries; // Generate the system prompt with the summaries - const systemPrompt = getReactPrompt(Object.values(this.tools), getSummaries, chatHistory); + const systemPrompt = getReactPrompt(Object.values(this.tools), () => JSON.stringify(this._docManager.listDocs), chatHistory); // Initialize intermediate messages this.interMessages = [{ role: 'system', content: systemPrompt }]; diff --git a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx index 6349e554e..867e78860 100644 --- a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx +++ b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx @@ -121,16 +121,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { this.vectorstore = new Vectorstore(this.vectorstore_id, this.docManager); // Create an agent with the vectorstore - this.agent = new Agent( - this.vectorstore, - this.retrieveSummaries.bind(this), - this.retrieveFormattedHistory.bind(this), - this.retrieveCSVData.bind(this), - this.retrieveDocIds.bind(this), - this.createImageInDash.bind(this), - this.createCSVInDash.bind(this), - this.docManager - ); + this.agent = new Agent(this.vectorstore, this.retrieveFormattedHistory.bind(this), this.retrieveCSVData.bind(this), this.createImageInDash.bind(this), this.createCSVInDash.bind(this), this.docManager); // Add event listeners this.addScrollListener(); @@ -228,6 +219,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { } }; + //TODO: Update for new chunk_simpl on agentDocument /** * Adds a CSV file for analysis by sending it to OpenAI and generating a summary. * @param newLinkedDoc The linked document representing the CSV file. @@ -650,18 +642,15 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { citation: JSON.stringify(citation, null, 2), }); - // First try to find the document using the document manager's chunk ID lookup - const doc: Doc | undefined = this.docManager.getDocByChunkId(chunkId); - if (!doc) { - console.warn(`Document not found for citation with chunk_id: ${chunkId}`); - return; - } - // Get the simplified chunk using the document manager - const foundChunk = this.docManager.getSimplifiedChunkById(doc, chunkId); + const { foundChunk, doc } = this.docManager.getSimplifiedChunkById(chunkId); if (!foundChunk) { - console.warn(`Chunk not found in document for chunk ID: ${chunkId}`); - DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {}); + if (doc) { + console.warn(`Chunk not found in document, ${doc.id}, for chunk ID: ${chunkId}`); + DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {}); + } else { + console.warn(`Chunk not found for chunk ID: ${chunkId}`); + } return; } @@ -678,6 +667,10 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { } else if (foundChunk.chunkType === CHUNK_TYPE.TABLE || foundChunk.chunkType === CHUNK_TYPE.IMAGE) { this.handleOtherChunkTypes(foundChunk, citation, doc); } else { + if (doc.type === 'web') { + DocumentManager.Instance.showDocument(doc, { openLocation: OpenWhere.addRight }, () => {}); + return; + } // Show the chunk text in citation popup let chunkText = citation.direct_text || 'Text content not available'; this.showCitationPopup(chunkText); @@ -986,16 +979,6 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { return Array.from(this.docManager.listDocs); } - /** - * Getter that retrieves summaries of all linked documents. - */ - @computed - get summaries(): string { - // Use the document manager to get all summaries - console.log(this.docManager.listDocs); - return JSON.stringify(this.docManager.listDocs); - } - /** * Getter that retrieves all linked CSV files for analysis. */ @@ -1022,7 +1005,8 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { // Other helper methods for retrieving document data and processing retrieveSummaries = (): string => { - return this.docManager.getAllDocumentSummaries(); + console.log(this.docManager.listDocs); + return JSON.stringify(this.docManager.listDocs); }; retrieveCSVData = () => { @@ -1033,10 +1017,6 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { return this.formattedHistory; }; - retrieveDocIds = (): string[] => { - return Array.from(this.docManager.docIds); - }; - /** * Handles follow-up questions when the user clicks on them. * Automatically sets the input value to the clicked follow-up question. diff --git a/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts b/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts index 5297292bf..405949c1e 100644 --- a/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts +++ b/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts @@ -408,7 +408,7 @@ export class DocumentMetadataTool extends BaseTool { query, max_results: this._max_results, })) as { results: { url: string; snippet: string }[] }; - const data = results.map((result: { url: string; snippet: string }) => { - // Create a web document with the URL - const id = this._docManager.createDocInDash('web', result.url, { - title: `Search Result: ${result.url}`, - text_html: result.snippet, - data_useCors: true, - }); + const data = await Promise.all( + results.map(async (result: { url: string; snippet: string }) => { + // Create a web document with the URL + const id = await this._docManager.createDocInDash('web', result.url, { + title: `Search Result: ${result.url}`, + text_html: result.snippet, + data_useCors: true, + }); - return { - type: 'text' as const, - text: `${result.url}${result.snippet}`, - }; - }); + return { + type: 'text' as const, + text: `${result.url}${result.snippet}`, + }; + }) + ); return data; } catch (error) { console.log(error); diff --git a/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts b/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts index 3c7b4e3db..495a985cb 100644 --- a/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts +++ b/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts @@ -3,12 +3,14 @@ import { Networking } from '../../../../Network'; import { BaseTool } from './BaseTool'; import { Observation } from '../types/types'; import { ParametersType, ToolInfo } from '../types/tool_types'; - +import { AgentDocumentManager } from '../utils/AgentDocumentManager'; +import { Doc } from '../../../../../fields/Doc'; +import { StrCast, WebCast } from '../../../../../fields/Types'; const websiteInfoScraperToolParams = [ { - name: 'urls', + name: 'chunk_ids', type: 'string[]', - description: 'The URLs of the websites to scrape', + description: 'The chunk_ids of the urls to scrape from the SearchTool.', required: true, max_inputs: 3, }, @@ -66,11 +68,11 @@ const websiteInfoScraperToolInfo: ToolInfo = { }; export class WebsiteInfoScraperTool extends BaseTool { - private _getLinkedUrlDocId: (url: string) => string[]; + private _docManager: AgentDocumentManager; - constructor(getLinkedUrlDocIds: (url: string) => string[]) { + constructor(docManager: AgentDocumentManager) { super(websiteInfoScraperToolInfo); - this._getLinkedUrlDocId = getLinkedUrlDocIds; + this._docManager = docManager; } /** @@ -79,10 +81,13 @@ export class WebsiteInfoScraperTool extends BaseTool { + private async scrapeWithRetry(chunkDoc: Doc, maxRetries = 2): Promise { let lastError = ''; let retryCount = 0; - + const url = WebCast(chunkDoc.data!)!.url.href; + console.log(url); + console.log(chunkDoc); + console.log(chunkDoc.data); // Validate URL format try { new URL(url); // This will throw if URL is invalid @@ -110,7 +115,6 @@ export class WebsiteInfoScraperTool extends BaseTool\n${website_plain_text}\nNote: Limited content was retrieved from this URL.\n`, + text: `\n${website_plain_text}\nNote: Limited content was retrieved from this URL.\n`, } as Observation; } @@ -138,7 +142,7 @@ export class WebsiteInfoScraperTool extends BaseTool\n${website_plain_text}\n`, + text: `\n${website_plain_text}\n`, } as Observation; } catch (error) { lastError = error instanceof Error ? error.message : 'Unknown error'; @@ -156,10 +160,10 @@ export class WebsiteInfoScraperTool extends BaseTool): Promise { - const urls = args.urls; + const chunk_ids = args.chunk_ids; // Create an array of promises, each one handling a website scrape for a URL - const scrapingPromises = urls.map(url => this.scrapeWithRetry(url)); + const scrapingPromises = chunk_ids.map(chunk_id => this.scrapeWithRetry(this._docManager.getDocument(chunk_id)!)); // Wait for all scraping promises to resolve const results = await Promise.all(scrapingPromises); diff --git a/src/client/views/nodes/chatbot/tools/WikipediaTool.ts b/src/client/views/nodes/chatbot/tools/WikipediaTool.ts index ee815532a..ec5d83e52 100644 --- a/src/client/views/nodes/chatbot/tools/WikipediaTool.ts +++ b/src/client/views/nodes/chatbot/tools/WikipediaTool.ts @@ -32,7 +32,7 @@ export class WikipediaTool extends BaseTool { async execute(args: ParametersType): Promise { try { - const { text } = await Networking.PostToServer('/getWikipediaSummary', { title: args.title }); + const { text } = (await Networking.PostToServer('/getWikipediaSummary', { title: args.title })) as { text: string }; const id = uuidv4(); const url = `https://en.wikipedia.org/wiki/${args.title.replace(/ /g, '_')}`; this._addLinkedUrlDoc(url, id); diff --git a/src/client/views/nodes/chatbot/types/types.ts b/src/client/views/nodes/chatbot/types/types.ts index 90b5e7e11..0d1804b2d 100644 --- a/src/client/views/nodes/chatbot/types/types.ts +++ b/src/client/views/nodes/chatbot/types/types.ts @@ -101,6 +101,7 @@ export interface RAGChunk { export interface SimplifiedChunk { chunkId: string; + doc_id: string; startPage?: number; endPage?: number; location?: string; diff --git a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts index c8a6bb16b..5a09b945b 100644 --- a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts +++ b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts @@ -13,7 +13,7 @@ import { LinkManager, UPDATE_SERVER_CACHE } from '../../../../util/LinkManager'; import { DocumentView } from '../../DocumentView'; import { ChatBox, parsedDoc } from '../chatboxcomponents/ChatBox'; import { supportedDocTypes } from '../types/tool_types'; -import { CHUNK_TYPE, RAGChunk } from '../types/types'; +import { CHUNK_TYPE, RAGChunk, SimplifiedChunk } from '../types/types'; /** * Interface representing a document in the freeform view @@ -31,7 +31,7 @@ export class AgentDocumentManager { private chatBox: ChatBox; private chatBoxDocument: Doc | null = null; private fieldMetadata: Record = {}; - @observable private documentIdsFromChunkIds: ObservableMap; + @observable private simplifiedChunks: ObservableMap; /** * Creates a new DocumentManager @@ -40,17 +40,21 @@ export class AgentDocumentManager { constructor(chatBox: ChatBox) { makeObservable(this); const agentDoc = DocCast(chatBox.Document.agentDocument) ?? new Doc(); - const chunkIds = DocCast(agentDoc.chunkIds) ?? new Doc(); + const chunk_simpl = DocCast(agentDoc.chunk_simpl) ?? new Doc(); agentDoc.title = chatBox.Document.title + '_agentDocument'; - chunkIds.title = '_chunkIds'; + chunk_simpl.title = '_chunk_simpl'; chatBox.Document.agentDocument = agentDoc; - DocCast(chatBox.Document.agentDocument)!.chunkIds = chunkIds; - this.documentIdsFromChunkIds = StrListCast(chunkIds.mapping).reduce((mapping, content) => { - const [chunkId, docId] = content.split(':'); - mapping.set(chunkId, docId); + DocCast(chatBox.Document.agentDocument)!.chunk_simpl = chunk_simpl; + + this.simplifiedChunks = StrListCast(chunk_simpl.mapping).reduce((mapping, chunks) => { + StrListCast(chunks).forEach(chunk => { + const parsed = JSON.parse(StrCast(chunk)); + mapping.set(parsed.chunkId, parsed); + }); return mapping; - }, new ObservableMap()); + }, new ObservableMap()); + this.documentsById = StrListCast(agentDoc.mapping).reduce((mapping, content) => { const [id, layoutId, docId] = content.split(':'); const layoutDoc = DocServer.GetCachedRefField(layoutId); @@ -76,14 +80,10 @@ export class AgentDocumentManager { //{ fireImmediately: true } ); reaction( - () => this.documentIdsFromChunkIds.values(), + () => this.simplifiedChunks.values(), () => { if (this.chatBoxDocument && DocCast(this.chatBoxDocument.agentDocument)) { - // Store the mapping with chunkId:docId format for consistency - const chunkIdsDoc = DocCast(DocCast(this.chatBoxDocument.agentDocument)!.chunkIds); - if (chunkIdsDoc) { - chunkIdsDoc.mapping = new List(Array.from(this.documentIdsFromChunkIds.entries()).map(([chunkId, docId]) => `${chunkId}:${docId}`)); - } + DocCast(DocCast(this.chatBoxDocument.agentDocument)!.chunk_simpl)!.mapping = new List(Array.from(this.simplifiedChunks.values()).map(chunk => JSON.stringify(chunk))); } } //{ fireImmediately: true } @@ -831,7 +831,8 @@ export class AgentDocumentManager { * @param options Optional configuration options * @returns The ID of the created document */ - public createDocInDash(docType: string, data: string, options?: any): string { + + public async createDocInDash(docType: string, data: string, options?: any): Promise { // Validate doc_type if (!this.isValidDocType(docType)) { throw new Error(`Invalid document type: ${docType}`); @@ -877,14 +878,15 @@ export class AgentDocumentManager { // Create link and add it to the document system const linkDoc = Docs.Create.LinkDocument(this.chatBoxDocument, doc); LinkManager.Instance.addLink(linkDoc); - - // Add document to view - this.chatBox._props.addDocument?.(doc); - - // Show document - defer actual display to prevent immediate resource loading - setTimeout(() => { - DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {}); - }, 100); + if (doc.type !== 'web') { + // Add document to view + this.chatBox._props.addDocument?.(doc); + + // Show document - defer actual display to prevent immediate resource loading + setTimeout(() => { + DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {}); + }, 100); + } } }); @@ -985,67 +987,6 @@ export class AgentDocumentManager { return docInfo?.layoutDoc; } - /** - * Registers chunk IDs associated with a document in the manager - * @param docId The parent document ID - * @param chunkIds Array of chunk IDs associated with this document - */ - @action - public registerChunkIds(docId: string, chunkIds: string[]): void { - // Get the document if it exists - const docInfo = this.documentsById.get(docId); - if (!docInfo) { - console.warn(`Cannot register chunks for unknown document ID: ${docId}`); - return; - } - - // Store chunk IDs on the document for future reference - const doc = docInfo.layoutDoc; - if (!doc.chunk_ids) { - doc.chunk_ids = JSON.stringify(chunkIds); - } else { - // Merge with existing chunk IDs if they exist - const existingIds = JSON.parse(doc.chunk_ids as string); - const updatedIds = [...new Set([...existingIds, ...chunkIds])]; // Remove duplicates - doc.chunk_ids = JSON.stringify(updatedIds); - } - for (const chunkId of chunkIds) { - // Ensure each chunk ID can be linked back to its parent document - // Store a mapping from chunk ID to parent document ID - // This allows us to easily find a document by any of its chunk IDs - if (!this.documentIdsFromChunkIds.has(chunkId) && doc) { - this.documentIdsFromChunkIds.set(chunkId, doc[Id]); - } - } - } - - /** - * Gets a document ID by a chunk ID - * @param chunkId The chunk ID to look up - * @returns The parent document ID if found - */ - public getDocByChunkId(chunkId: string): Doc | undefined { - // First, look up the document ID using the chunk ID mapping - const docId = this.documentIdsFromChunkIds.get(chunkId); - console.log('this.documentIdsFromChunkIds', this.documentIdsFromChunkIds); - console.log('docId', docId); - if (!docId) { - if (this.documentsById.has(chunkId)) { - return this.documentsById.get(chunkId)?.layoutDoc; - } else { - console.error('No document found for chunkId and docId', chunkId); - return undefined; - } - } - // Then get the document using the document ID - const docInfo = this.documentsById.get(docId); - if (docInfo) { - return docInfo.layoutDoc; - } - console.error('No document found for docId', docId); - return undefined; - } - /** * Adds simplified chunks to a document for citation handling * @param doc The document to add simplified chunks to @@ -1053,21 +994,13 @@ export class AgentDocumentManager { * @param docType The type of document (e.g., 'pdf', 'video', 'audio', etc.) * @returns The updated document with simplified chunks */ - public addSimplifiedChunks(doc: Doc, chunks: RAGChunk[], docType: string): Doc { - if (!doc) { - console.error('Cannot add simplified chunks to null document'); - return doc; - } - - // Initialize empty chunks array if not exists - if (!doc.chunk_simpl) { - doc.chunk_simpl = JSON.stringify({ chunks: [] }); - } - + @action + public addSimplifiedChunks(chunks: RAGChunk[], docType: string) { + console.log('chunks', chunks, 'simplifiedChunks', this.simplifiedChunks); // Create array of simplified chunks based on document type - const simplifiedChunks = chunks.map(chunk => { + for (const chunk of chunks) { // Common properties across all chunk types - const baseChunk = { + const baseChunk: SimplifiedChunk = { chunkId: chunk.id, text: chunk.metadata.text, doc_id: chunk.metadata.doc_id, @@ -1076,38 +1009,33 @@ export class AgentDocumentManager { // Add type-specific properties if (docType === 'video' || docType === 'audio') { - return { + this.simplifiedChunks.set(chunk.id, { ...baseChunk, start_time: chunk.metadata.start_time, end_time: chunk.metadata.end_time, indexes: chunk.metadata.indexes, chunkType: docType === 'video' ? CHUNK_TYPE.VIDEO : CHUNK_TYPE.AUDIO, - }; + } as SimplifiedChunk); } else if (docType === 'pdf') { - return { + this.simplifiedChunks.set(chunk.id, { ...baseChunk, startPage: chunk.metadata.start_page, endPage: chunk.metadata.end_page, location: chunk.metadata.location, - }; + } as SimplifiedChunk); } else if (docType === 'csv') { - return { + this.simplifiedChunks.set(chunk.id, { ...baseChunk, rowStart: (chunk.metadata as any).row_start, rowEnd: (chunk.metadata as any).row_end, colStart: (chunk.metadata as any).col_start, colEnd: (chunk.metadata as any).col_end, - }; + } as SimplifiedChunk); } else { // Default for other document types - return baseChunk; + this.simplifiedChunks.set(chunk.id, baseChunk as SimplifiedChunk); } - }); - console.log('simplifiedChunks', simplifiedChunks); - // Update the document with all simplified chunks at once - doc.chunk_simpl = JSON.stringify({ chunks: simplifiedChunks }); - - return doc; + } } /** @@ -1116,21 +1044,10 @@ export class AgentDocumentManager { * @param chunkId The ID of the chunk to retrieve * @returns The simplified chunk if found, undefined otherwise */ - public getSimplifiedChunkById(doc: Doc, chunkId: string): any | undefined { - let chunks: any[] = []; - if (!doc || !doc.chunk_simpl) { - chunks = []; - console.warn('No chunk found for chunkId', chunkId, '. Checking if document exists in documentsById.'); - return []; - } - try { - const parsed = JSON.parse(StrCast(doc.chunk_simpl)); - chunks = parsed.chunks || []; - } catch (e) { - console.error('Error parsing simplified chunks:', e); - return []; - } - return chunks.find(chunk => chunk.chunkId === chunkId); + public getSimplifiedChunkById(chunkId: string): any | undefined { + console.log('chunkId', chunkId, 'simplifiedChunks', this.simplifiedChunks); + console.log('doc', this.getDocument(this.simplifiedChunks.get(chunkId)?.doc_id || '')); + return { foundChunk: this.simplifiedChunks.get(chunkId), doc: this.getDocument(this.simplifiedChunks.get(chunkId)?.doc_id || '') }; } /** @@ -1150,27 +1067,4 @@ export class AgentDocumentManager { return []; } } - - /** - * Gets all document summaries combined into a single string - * @returns String containing all document summaries - */ - public getAllDocumentSummaries(): string { - const summaries = Array.from(this.documentsById.keys()) - .map(id => { - const doc = this.getDocument(id); - if (doc) { - // Try to get summary from either the document or its data document - const summary = doc.summary || (doc[DocData] && doc[DocData].summary); - if (summary) { - return StrCast(summary); - } - } - return null; - }) - .filter(Boolean) - .join('\n\n'); - - return summaries; - } } diff --git a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts index 1349df483..f1fae6f11 100644 --- a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts +++ b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts @@ -148,10 +148,6 @@ export class Vectorstore { // Generate chunk IDs upfront so we can register them const chunkIds = segmentedTranscript.map(() => uuidv4()); - - // Register all chunk IDs with the document manager - this.docManager.registerChunkIds(doc_id, chunkIds); - // Add transcript and embeddings to metadata result = { doc_id, @@ -185,7 +181,7 @@ export class Vectorstore { doc.segmented_transcript = JSON.stringify(segmentedTranscript); // Use doc manager to add simplified chunks const docType = local_file_path.endsWith('.mp3') ? 'audio' : 'video'; - this.docManager.addSimplifiedChunks(doc, result.chunks, docType); + this.docManager.addSimplifiedChunks(result.chunks, docType); } else { // Process regular document console.log('Processing regular document...'); @@ -216,13 +212,10 @@ export class Vectorstore { console.log('doc_id in vectorstore', result.doc_id, 'does not match doc_id in doc', doc[Id]); } - // Register chunks with the document manager - this.docManager.registerChunkIds(result.doc_id, chunkIds); - // Use doc manager to add simplified chunks - determine document type from file extension const fileExt = path.extname(local_file_path).toLowerCase(); const docType = fileExt === '.pdf' ? 'pdf' : fileExt === '.csv' ? 'csv' : 'text'; - this.docManager.addSimplifiedChunks(doc, result.chunks, docType); + this.docManager.addSimplifiedChunks(result.chunks, docType); doc.summary = result.summary; doc.ai_purpose = result.purpose; @@ -351,16 +344,6 @@ export class Vectorstore { }, } as RAGChunk; - // Ensure the document manager knows about this chunk - // This is important for maintaining backwards compatibility - if (chunk.id && !this.docManager.getDocByChunkId(chunk.id)) { - // If the chunk ID isn't registered but we have a doc_id in metadata - if (chunk.metadata.doc_id && this.docManager.has(chunk.metadata.doc_id)) { - // Register the chunk with its parent document - this.docManager.registerChunkIds(chunk.metadata.doc_id, [chunk.id]); - } - } - return chunk; }); diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts index 378f14094..b7ce4f663 100644 --- a/src/server/ApiManagers/AssistantManager.ts +++ b/src/server/ApiManagers/AssistantManager.ts @@ -514,30 +514,37 @@ export default class AssistantManager extends ApiManager { await browser.close(); browser = null; - // Use a try-catch block specifically for JSDOM parsing + let extractedText = ''; + + // First try with Readability try { // Parse HTML content using JSDOM const dom = new JSDOM(htmlContent, { url }); // Extract readable content using Mozilla's Readability API - const reader = new Readability(dom.window.document); + const reader = new Readability(dom.window.document, { + // Readability configuration to focus on text content + charThreshold: 100, + keepClasses: false, + }); const article = reader.parse(); - if (article) { - const plainText = article.textContent; - res.send({ website_plain_text: plainText }); + if (article && article.textContent) { + extractedText = article.textContent; } else { - // If Readability fails, fallback to extracting main content - const mainContent = await extractMainContent(htmlContent); - res.send({ website_plain_text: mainContent }); + // If Readability doesn't return useful content, try alternate method + extractedText = await extractEnhancedContent(htmlContent); } } catch (parsingError) { - console.error('Error parsing website content:', parsingError); - - // Fallback to a simplified extraction method - const mainContent = await extractMainContent(htmlContent); - res.send({ website_plain_text: mainContent }); + console.error('Error parsing website content with Readability:', parsingError); + // Fallback to enhanced content extraction + extractedText = await extractEnhancedContent(htmlContent); } + + // Clean up the extracted text + extractedText = cleanupText(extractedText); + + res.send({ website_plain_text: extractedText }); } catch (error) { console.error('Error scraping website:', error); @@ -985,48 +992,119 @@ function spawnPythonProcess(jobId: string, file_path: string, doc_id: string) { } /** - * Extracts main content from HTML by removing scripts, styles, and non-content elements - * Used as a fallback when Readability fails + * Enhanced content extraction that focuses on meaningful text content. * @param html The HTML content to process - * @returns Extracted main text content + * @returns Extracted and cleaned text content */ -async function extractMainContent(html: string): Promise { +async function extractEnhancedContent(html: string): Promise { try { - // Create a simple DOM to extract content + // Create DOM to extract content const dom = new JSDOM(html, { runScripts: 'outside-only' }); const document = dom.window.document; - // Remove scripts, styles, and other non-content elements - const elementsToRemove = ['script', 'style', 'iframe', 'noscript', 'svg', 'header', 'footer', 'nav', 'aside', 'ads', 'banner', 'form', 'button', 'input']; - - elementsToRemove.forEach(tag => { - const elements = document.querySelectorAll(tag); + // Remove all non-content elements + const elementsToRemove = [ + 'script', + 'style', + 'iframe', + 'noscript', + 'svg', + 'canvas', + 'header', + 'footer', + 'nav', + 'aside', + 'form', + 'button', + 'input', + 'select', + 'textarea', + 'meta', + 'link', + 'img', + 'video', + 'audio', + '.ad', + '.ads', + '.advertisement', + '.banner', + '.cookie', + '.popup', + '.modal', + '.newsletter', + '[role="banner"]', + '[role="navigation"]', + '[role="complementary"]', + ]; + + elementsToRemove.forEach(selector => { + const elements = document.querySelectorAll(selector); elements.forEach(el => el.remove()); }); - // Try to find the main content container using common selectors - const mainSelectors = ['main', 'article', '#content', '.content', '#main', '.main', '.post-content', '.article-content', '.entry-content']; - - let mainContent = ''; - - // Try each selector to find main content - for (const selector of mainSelectors) { - const element = document.querySelector(selector); - if (element && element.textContent && element.textContent.trim().length > 100) { - mainContent = element.textContent; - break; + // Get all text paragraphs with meaningful content + const contentElements = [ + ...Array.from(document.querySelectorAll('p')), + ...Array.from(document.querySelectorAll('h1')), + ...Array.from(document.querySelectorAll('h2')), + ...Array.from(document.querySelectorAll('h3')), + ...Array.from(document.querySelectorAll('h4')), + ...Array.from(document.querySelectorAll('h5')), + ...Array.from(document.querySelectorAll('h6')), + ...Array.from(document.querySelectorAll('li')), + ...Array.from(document.querySelectorAll('td')), + ...Array.from(document.querySelectorAll('article')), + ...Array.from(document.querySelectorAll('section')), + ...Array.from(document.querySelectorAll('div:not([class]):not([id])')), + ]; + + // Extract text from content elements that have meaningful text + let contentParts: string[] = []; + contentElements.forEach(el => { + const text = el.textContent?.trim(); + // Only include elements with substantial text (more than just a few characters) + if (text && text.length > 10 && !contentParts.includes(text)) { + contentParts.push(text); } - } + }); - // If no main content found with selectors, use body content - if (!mainContent || mainContent.length < 200) { - mainContent = document.body.textContent || ''; + // If no significant content found with selective approach, fallback to body + if (contentParts.length < 3) { + return document.body.textContent || ''; } - // Clean up the text - return mainContent.replace(/\s+/g, ' ').replace(/\n+/g, '\n').trim(); + return contentParts.join('\n\n'); } catch (error) { - console.error('Error extracting main content:', error); + console.error('Error extracting enhanced content:', error); return 'Failed to extract content from the webpage.'; } } + +/** + * Cleans up extracted text to improve readability and focus on useful content. + * @param text The raw extracted text + * @returns Cleaned and formatted text + */ +function cleanupText(text: string): string { + if (!text) return ''; + + return ( + text + // Remove excessive whitespace and normalize line breaks + .replace(/\s+/g, ' ') + .replace(/\n\s*\n\s*\n+/g, '\n\n') + // Remove common boilerplate phrases + .replace(/cookie policy|privacy policy|terms of service|all rights reserved|copyright ©/gi, '') + // Remove email addresses + .replace(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g, '') + // Remove URLs + .replace(/https?:\/\/[^\s]+/g, '') + // Remove social media handles + .replace(/@[a-zA-Z0-9_]+/g, '') + // Clean up any remaining HTML tags that might have been missed + .replace(/<[^>]*>/g, '') + // Fix spacing issues after cleanup + .replace(/ +/g, ' ') + .trim() + ); +} -- cgit v1.2.3-70-g09d2 From 0e98320d3b237f1927b9f1367494dccd7f66eda9 Mon Sep 17 00:00:00 2001 From: "A.J. Shulman" Date: Wed, 21 May 2025 12:38:55 -0400 Subject: Added codebase search and retrieval to Vectorstore Summary indexing: Added functionality to embed and index file summaries from file_summaries.json in Pinecone Vector search: Implemented semantic search to find the top 5 most relevant files for a query Content retrieval: Added method to fetch full file content from file_content.json API endpoints: /getFileSummaries - Retrieves all file summaries /getFileContent - Fetches file content by path /getRawFileContent - Returns content as plain text to avoid JSON parsing errors Error handling: Added comprehensive error handling and debugging throughout Initialization: Implemented proper async initialization sequence with verification Performance: Added streaming for large files to improve memory efficiency Testing: Added automated test queries to validate functionality --- .../nodes/chatbot/chatboxcomponents/ChatBox.tsx | 8 +- src/client/views/nodes/chatbot/tools/RAGTool.ts | 8 +- src/client/views/nodes/chatbot/tools/SearchTool.ts | 2 +- .../nodes/chatbot/tools/WebsiteInfoScraperTool.ts | 6 +- .../nodes/chatbot/utils/AgentDocumentManager.ts | 20 +- .../views/nodes/chatbot/vectorstore/Vectorstore.ts | 493 ++++++++++++++++++++- src/server/ApiManagers/AssistantManager.ts | 180 ++++++++ 7 files changed, 701 insertions(+), 16 deletions(-) (limited to 'src/client/views/nodes/chatbot/tools/SearchTool.ts') diff --git a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx index 91a7adf24..470f94a8d 100644 --- a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx +++ b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx @@ -164,7 +164,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { }); // Process the document first to ensure it has a valid ID - this.docManager.processDocument(newLinkedDoc); + await this.docManager.processDocument(newLinkedDoc); // Add the document to the vectorstore which will also register chunks await this.vectorstore.addAIDoc(newLinkedDoc, this.updateProgress); @@ -648,7 +648,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { const { foundChunk, doc, dataDoc } = this.docManager.getSimplifiedChunkById(chunkId); console.log('doc: ', doc); console.log('dataDoc: ', dataDoc); - if (!foundChunk) { + if (!foundChunk || !doc) { if (doc) { console.warn(`Chunk not found in document, ${doc.id}, for chunk ID: ${chunkId}`); DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => {}); @@ -1102,8 +1102,8 @@ export class ChatBox extends ViewBoxAnnotatableComponent() { // If there are stored doc IDs in our list of docs to add, process them if (this._linked_docs_to_add.size > 0) { - this._linked_docs_to_add.forEach(doc => { - this.docManager.processDocument(doc); + this._linked_docs_to_add.forEach(async doc => { + await this.docManager.processDocument(doc); }); } } diff --git a/src/client/views/nodes/chatbot/tools/RAGTool.ts b/src/client/views/nodes/chatbot/tools/RAGTool.ts index 90b803d21..af44de520 100644 --- a/src/client/views/nodes/chatbot/tools/RAGTool.ts +++ b/src/client/views/nodes/chatbot/tools/RAGTool.ts @@ -12,6 +12,12 @@ const ragToolParams = [ description: "A detailed prompt representing an ideal chunk to embed and compare against document vectors to retrieve the most relevant content for answering the user's query.", required: true, }, + { + name: 'doc_ids', + type: 'string[]', + description: 'An optional array of document IDs to retrieve chunks from. If you want to retrieve chunks from all documents, leave this as an empty array: [] (DO NOT LEAVE THIS EMPTY).', + required: false, + }, ] as const; type RAGToolParamsType = typeof ragToolParams; @@ -69,7 +75,7 @@ export class RAGTool extends BaseTool { } async execute(args: ParametersType): Promise { - const relevantChunks = await this.vectorstore.retrieve(args.hypothetical_document_chunk); + const relevantChunks = await this.vectorstore.retrieve(args.hypothetical_document_chunk, undefined, args.doc_ids ?? undefined); const formattedChunks = await this.getFormattedChunks(relevantChunks); return formattedChunks; } diff --git a/src/client/views/nodes/chatbot/tools/SearchTool.ts b/src/client/views/nodes/chatbot/tools/SearchTool.ts index 43f14ea83..8e6edce8c 100644 --- a/src/client/views/nodes/chatbot/tools/SearchTool.ts +++ b/src/client/views/nodes/chatbot/tools/SearchTool.ts @@ -22,7 +22,7 @@ type SearchToolParamsType = typeof searchToolParams; const searchToolInfo: ToolInfo = { name: 'searchTool', - citationRules: 'No citation needed. Cannot cite search results for a response. Use web scraping tools to cite specific information.', + citationRules: 'Always cite the search results for a response, if the search results are relevant to the response. Use the chunk_id to cite the search results. If the search results are not relevant to the response, do not cite them. ', parameterRules: searchToolParams, description: 'Search the web to find a wide range of websites related to a query or multiple queries. Returns a list of websites and their overviews based on the search queries.', }; diff --git a/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts b/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts index 495a985cb..727d35e2c 100644 --- a/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts +++ b/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts @@ -22,6 +22,7 @@ const websiteInfoScraperToolInfo: ToolInfo = { name: 'websiteInfoScraper', description: 'Scrape detailed information from specific websites relevant to the user query. Returns the text content of the webpages for further analysis and grounding.', citationRules: ` + !IMPORTANT! THESE CHUNKS REPLACE THE CHUNKS THAT ARE RETURNED FROM THE SEARCHTOOL. Your task is to provide a comprehensive response to the user's prompt using the content scraped from relevant websites. Ensure you follow these guidelines for structuring your response: 1. Grounded Text Tag Structure: @@ -88,6 +89,7 @@ export class WebsiteInfoScraperTool extends BaseTool\n${website_plain_text}\nNote: Limited content was retrieved from this URL.\n`, + text: `\n${website_plain_text}\nNote: Limited content was retrieved from this URL.\n`, } as Observation; } @@ -142,7 +144,7 @@ export class WebsiteInfoScraperTool extends BaseTool\n${website_plain_text}\n`, + text: `\n${website_plain_text}\n`, } as Observation; } catch (error) { lastError = error instanceof Error ? error.message : 'Unknown error'; diff --git a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts index 33eec5972..3c8b49f33 100644 --- a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts +++ b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts @@ -153,9 +153,9 @@ export class AgentDocumentManager { console.log(`Found ${linkedDocs.length} linked documents via LinkManager`); // Process the linked documents - linkedDocs.forEach((doc: Doc | undefined) => { + linkedDocs.forEach(async (doc: Doc | undefined) => { if (doc) { - this.processDocument(doc); + await this.processDocument(doc); console.log('Processed linked document:', doc[Id], doc.title, doc.type); } }); @@ -170,7 +170,7 @@ export class AgentDocumentManager { * @param doc The document to process */ @action - public processDocument(doc: Doc): string { + public async processDocument(doc: Doc): Promise { // Ensure document has a persistent ID const docId = this.ensureDocumentId(doc); if (doc.chunk_simplified) { @@ -900,7 +900,7 @@ export class AgentDocumentManager { } }); - const id = this.processDocument(doc); + const id = await this.processDocument(doc); return id; } else { throw new Error(`Error creating document. Created document not found.`); @@ -1081,6 +1081,18 @@ export class AgentDocumentManager { return { foundChunk: this.simplifiedChunks.get(chunkId), doc: this.getDocument(this.simplifiedChunks.get(chunkId)?.doc_id || chunkId), dataDoc: this.getDataDocument(this.simplifiedChunks.get(chunkId)?.doc_id || chunkId) }; } + public getChunkIdsFromDocIds(docIds: string[]): string[] { + return docIds + .map(docId => { + for (const chunk of this.simplifiedChunks.values()) { + if (chunk.doc_id === docId) { + return chunk.chunkId; + } + } + }) + .filter(chunkId => chunkId !== undefined) as string[]; + } + /** * Gets the original segments from a media document * @param doc The document containing original media segments diff --git a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts index 252672dfc..5c2d0e5ea 100644 --- a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts +++ b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts @@ -27,11 +27,16 @@ dotenv.config(); export class Vectorstore { private pinecone!: Pinecone; // Pinecone client for managing the vector index. private index!: Index; // The specific Pinecone index used for document chunks. + private summaryIndex!: Index; // The Pinecone index used for file summaries. private openai!: OpenAI; // OpenAI client for generating embeddings. private indexName: string = 'pdf-chatbot'; // Default name for the index. + private summaryIndexName: string = 'file-summaries'; // Name for the summaries index. private _id!: string; // Unique ID for the Vectorstore instance. private docManager!: AgentDocumentManager; // Document manager for handling documents + private summaryCacheCount: number = 0; // Cache for the number of summaries documents: AI_Document[] = []; // Store the documents indexed in the vectorstore. + private debug: boolean = true; // Enable debugging + private initialized: boolean = false; /** * Initializes the Pinecone and OpenAI clients, sets up the document ID list, @@ -40,6 +45,7 @@ export class Vectorstore { * @param docManager An instance of AgentDocumentManager to handle document management. */ constructor(id: string, docManager: AgentDocumentManager) { + if (this.debug) console.log(`[DEBUG] Initializing Vectorstore with ID: ${id}`); const pineconeApiKey = 'pcsk_3txLxJ_9fxdmAph4csnq4yxoDF5De5A8bJvjWaXXigBgshy4eoXggrXcxATJiH8vzXbrKm'; if (!pineconeApiKey) { console.log('PINECONE_API_KEY is not defined - Vectorstore will be unavailable'); @@ -51,7 +57,32 @@ export class Vectorstore { this.openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY, dangerouslyAllowBrowser: true }); this._id = id; this.docManager = docManager; - this.initializeIndex(); + + // Proper async initialization sequence + this.initializeAsync(id); + } + + /** + * Handles async initialization of all components + */ + private async initializeAsync(id: string) { + try { + if (this.debug) console.log(`[DEBUG] Starting async initialization sequence for Vectorstore ID: ${id}`); + + // Initialize the main document index + await this.initializeIndex(); + + // Initialize the summary index + await this.initializeSummaryIndex(); + + this.initialized = true; + if (this.debug) console.log(`[DEBUG] ✅ Vectorstore initialization complete, running test query...`); + + // Run a single test query instead of multiple + await this.runSingleTestQuery(); + } catch (error) { + console.error('[ERROR] Failed to initialize Vectorstore:', error); + } } /** @@ -59,10 +90,13 @@ export class Vectorstore { * Sets the index to use cosine similarity for vector similarity calculations. */ private async initializeIndex() { + if (this.debug) console.log(`[DEBUG] Initializing main document index: ${this.indexName}`); const indexList: IndexList = await this.pinecone.listIndexes(); + if (this.debug) console.log(`[DEBUG] Available Pinecone indexes: ${indexList.indexes?.map(i => i.name).join(', ') || 'none'}`); // Check if the index already exists, otherwise create it. if (!indexList.indexes?.some(index => index.name === this.indexName)) { + if (this.debug) console.log(`[DEBUG] Creating new index: ${this.indexName}`); await this.pinecone.createIndex({ name: this.indexName, dimension: 3072, @@ -74,12 +108,462 @@ export class Vectorstore { }, }, }); + if (this.debug) console.log(`[DEBUG] ✅ Index ${this.indexName} created successfully`); + } else { + if (this.debug) console.log(`[DEBUG] ✅ Using existing index: ${this.indexName}`); } // Set the index for future use. this.index = this.pinecone.Index(this.indexName); } + /** + * Initializes the Pinecone index for file summaries. + * Checks if it exists and creates it if necessary. + */ + private async initializeSummaryIndex() { + if (this.debug) console.log(`[DEBUG] Initializing file summaries index: ${this.summaryIndexName}`); + const indexList: IndexList = await this.pinecone.listIndexes(); + + // Check if the index already exists, otherwise create it. + if (!indexList.indexes?.some(index => index.name === this.summaryIndexName)) { + if (this.debug) console.log(`[DEBUG] Creating new summary index: ${this.summaryIndexName}`); + await this.pinecone.createIndex({ + name: this.summaryIndexName, + dimension: 3072, + metric: 'cosine', + spec: { + serverless: { + cloud: 'aws', + region: 'us-east-1', + }, + }, + }); + if (this.debug) console.log(`[DEBUG] ✅ Summary index ${this.summaryIndexName} created successfully`); + } else { + if (this.debug) console.log(`[DEBUG] ✅ Using existing summary index: ${this.summaryIndexName}`); + } + + // Set the summaries index for future use. + this.summaryIndex = this.pinecone.Index(this.summaryIndexName); + + // Check if we need to index the file summaries + await this.processFileSummaries(); + } + + /** + * Processes file summaries from the JSON file if needed. + * Checks if the index contains the correct number of summaries before embedding. + */ + private async processFileSummaries() { + if (this.debug) console.log(`[DEBUG] Starting file summaries processing`); + try { + // Get file summaries from the server + if (this.debug) console.log(`[DEBUG] Fetching file summaries from server...`); + const response = await Networking.FetchFromServer('/getFileSummaries'); + + if (!response) { + console.error('[ERROR] Failed to fetch file summaries'); + return; + } + if (this.debug) console.log(`[DEBUG] File summaries response received (${response.length} bytes)`); + + const summaries = JSON.parse(response); + const filepaths = Object.keys(summaries); + const summaryCount = filepaths.length; + this.summaryCacheCount = summaryCount; + + if (this.debug) { + console.log(`[DEBUG] File summaries parsed: ${summaryCount} files`); + console.log(`[DEBUG] Sample filepaths: ${filepaths.slice(0, 3).join(', ')}...`); + console.log(`[DEBUG] Sample summary: "${summaries[filepaths[0]].substring(0, 100)}..."`); + } + + // Check if index already has the correct number of summaries + try { + if (this.debug) console.log(`[DEBUG] Checking summary index stats...`); + const indexStats = await this.summaryIndex.describeIndexStats(); + const vectorCount = indexStats.totalRecordCount; + + if (this.debug) console.log(`[DEBUG] Summary index has ${vectorCount} records, expecting ${summaryCount}`); + + if (vectorCount === summaryCount) { + console.log(`[DEBUG] ✅ Summary index already contains ${vectorCount} entries, skipping embedding.`); + return; + } + + if (this.debug) console.log(`[DEBUG] ⚠️ Summary index contains ${vectorCount} entries, but there are ${summaryCount} summaries. Re-indexing.`); + } catch (error) { + console.error('[ERROR] Error checking summary index stats:', error); + } + + // If we get here, we need to embed the summaries + await this.embedAndIndexFileSummaries(summaries); + } catch (error) { + console.error('[ERROR] Error processing file summaries:', error); + } + } + + /** + * Embeds and indexes file summaries into the summary index. + * @param summaries Object mapping filepaths to summaries + */ + private async embedAndIndexFileSummaries(summaries: Record) { + if (this.debug) console.log(`[DEBUG] Starting embedding and indexing of file summaries...`); + + const filepaths = Object.keys(summaries); + const summaryTexts = Object.values(summaries); + + // Split into batches of 100 to avoid exceeding API limits + const batchSize = 100; + const totalBatches = Math.ceil(filepaths.length / batchSize); + + if (this.debug) console.log(`[DEBUG] Processing ${filepaths.length} files in ${totalBatches} batches of size ${batchSize}`); + + for (let i = 0; i < filepaths.length; i += batchSize) { + const batchFilepaths = filepaths.slice(i, i + batchSize); + const batchTexts = summaryTexts.slice(i, i + batchSize); + + if (this.debug) { + console.log(`[DEBUG] Processing batch ${Math.floor(i / batchSize) + 1}/${totalBatches}`); + console.log(`[DEBUG] First file in batch: ${batchFilepaths[0]}`); + console.log(`[DEBUG] First summary in batch: "${batchTexts[0].substring(0, 50)}..."`); + } + + try { + // Generate embeddings for this batch + if (this.debug) console.log(`[DEBUG] Generating embeddings for batch of ${batchTexts.length} summaries...`); + const startTime = Date.now(); + const embeddingResponse = await this.openai.embeddings.create({ + model: 'text-embedding-3-large', + input: batchTexts, + encoding_format: 'float', + }); + const duration = Date.now() - startTime; + if (this.debug) console.log(`[DEBUG] ✅ Embeddings generated in ${duration}ms`); + + // Prepare Pinecone records + if (this.debug) console.log(`[DEBUG] Preparing Pinecone records...`); + const pineconeRecords: PineconeRecord[] = batchTexts.map((text, index) => { + const embedding = (embeddingResponse.data as Embedding[])[index].embedding; + if (this.debug && index === 0) console.log(`[DEBUG] Sample embedding dimensions: ${embedding.length}, first few values: [${embedding.slice(0, 5).join(', ')}...]`); + + return { + id: uuidv4(), // Generate a unique ID for each summary + values: embedding, + metadata: { + filepath: batchFilepaths[index], + summary: text, + } as RecordMetadata, + }; + }); + + // Upload to Pinecone + if (this.debug) console.log(`[DEBUG] Upserting ${pineconeRecords.length} records to Pinecone...`); + const upsertStart = Date.now(); + try { + await this.summaryIndex.upsert(pineconeRecords); + const upsertDuration = Date.now() - upsertStart; + if (this.debug) console.log(`[DEBUG] ✅ Batch ${Math.floor(i / batchSize) + 1}/${totalBatches} indexed in ${upsertDuration}ms`); + } catch (upsertError) { + console.error(`[ERROR] Failed to upsert batch ${Math.floor(i / batchSize) + 1}/${totalBatches} to Pinecone:`, upsertError); + // Try again with smaller batch + if (batchTexts.length > 20) { + console.log(`[DEBUG] 🔄 Retrying with smaller batch size...`); + // Split the batch in half and retry recursively + const midpoint = Math.floor(batchTexts.length / 2); + const firstHalf = { + filepaths: batchFilepaths.slice(0, midpoint), + texts: batchTexts.slice(0, midpoint), + }; + const secondHalf = { + filepaths: batchFilepaths.slice(midpoint), + texts: batchTexts.slice(midpoint), + }; + + // Create a helper function to retry smaller batches + const retryBatch = async (paths: string[], texts: string[], batchNum: string) => { + try { + if (this.debug) console.log(`[DEBUG] Generating embeddings for sub-batch ${batchNum}...`); + const embRes = await this.openai.embeddings.create({ + model: 'text-embedding-3-large', + input: texts, + encoding_format: 'float', + }); + + const records = texts.map((t, idx) => ({ + id: uuidv4(), + values: (embRes.data as Embedding[])[idx].embedding, + metadata: { + filepath: paths[idx], + summary: t, + } as RecordMetadata, + })); + + if (this.debug) console.log(`[DEBUG] Upserting sub-batch ${batchNum} (${records.length} records)...`); + await this.summaryIndex.upsert(records); + if (this.debug) console.log(`[DEBUG] ✅ Sub-batch ${batchNum} upserted successfully`); + } catch (retryError) { + console.error(`[ERROR] Failed to upsert sub-batch ${batchNum}:`, retryError); + } + }; + + await retryBatch(firstHalf.filepaths, firstHalf.texts, `${Math.floor(i / batchSize) + 1}.1`); + await retryBatch(secondHalf.filepaths, secondHalf.texts, `${Math.floor(i / batchSize) + 1}.2`); + } + } + } catch (error) { + console.error('[ERROR] Error processing batch:', error); + } + } + + if (this.debug) console.log(`[DEBUG] ✅ File summary indexing complete for all ${filepaths.length} files`); + + // Verify the index was populated correctly + try { + const indexStats = await this.summaryIndex.describeIndexStats(); + const vectorCount = indexStats.totalRecordCount; + if (this.debug) console.log(`[DEBUG] 🔍 Final index verification: ${vectorCount} records in Pinecone index (expected ${filepaths.length})`); + } catch (error) { + console.error('[ERROR] Failed to verify index stats:', error); + } + } + + /** + * Searches for file summaries similar to the given query. + * @param query The search query + * @param topK Number of results to return (default: 5) + * @returns Array of filepath and summary pairs with relevance scores + */ + async searchFileSummaries(query: string, topK: number = 5): Promise> { + if (!this.initialized) { + console.error('[ERROR] Cannot search - Vectorstore not fully initialized'); + return []; + } + + if (this.debug) console.log(`[DEBUG] Searching file summaries for query: "${query}" (topK=${topK})`); + try { + // Generate embedding for the query + if (this.debug) console.log(`[DEBUG] Generating embedding for query...`); + const startTime = Date.now(); + const queryEmbeddingResponse = await this.openai.embeddings.create({ + model: 'text-embedding-3-large', + input: query, + encoding_format: 'float', + }); + const duration = Date.now() - startTime; + + const queryEmbedding = queryEmbeddingResponse.data[0].embedding; + if (this.debug) { + console.log(`[DEBUG] ✅ Query embedding generated in ${duration}ms`); + console.log(`[DEBUG] Query embedding dimensions: ${queryEmbedding.length}`); + } + + // Check if summary index is ready + try { + const indexStats = await this.summaryIndex.describeIndexStats(); + const vectorCount = indexStats.totalRecordCount; + if (this.debug) console.log(`[DEBUG] Summary index contains ${vectorCount} records`); + + if (vectorCount === 0) { + console.error('[ERROR] Summary index is empty, cannot perform search'); + return []; + } + } catch (statsError) { + console.error('[ERROR] Failed to check summary index stats:', statsError); + console.error('[ERROR] Stats error details:', JSON.stringify(statsError)); + } + + // Test direct API access to Pinecone + if (this.debug) console.log(`[DEBUG] Testing Pinecone connection...`); + try { + const indexes = await this.pinecone.listIndexes(); + console.log(`[DEBUG] Available Pinecone indexes: ${indexes.indexes?.map(idx => idx.name).join(', ')}`); + } catch (connectionError) { + console.error('[ERROR] Could not connect to Pinecone:', connectionError); + } + + // Query the summaries index + if (this.debug) console.log(`[DEBUG] Querying Pinecone summary index (${this.summaryIndexName})...`); + const queryStart = Date.now(); + + let queryResponse; + try { + // First, make sure we can access the index + const indexInfo = await this.summaryIndex.describeIndexStats(); + if (this.debug) console.log(`[DEBUG] Index stats:`, indexInfo); + + queryResponse = await this.summaryIndex.query({ + vector: queryEmbedding, + topK, + includeMetadata: true, + }); + + const queryDuration = Date.now() - queryStart; + + if (this.debug) { + console.log(`[DEBUG] ✅ Pinecone query completed in ${queryDuration}ms`); + console.log(`[DEBUG] Raw Pinecone response:`, JSON.stringify(queryResponse, null, 2)); + if (queryResponse.matches) { + console.log(`[DEBUG] Found ${queryResponse.matches.length} matching summaries`); + console.log(`[DEBUG] Match scores: ${queryResponse.matches.map(m => m.score?.toFixed(4)).join(', ')}`); + } else { + console.log(`[DEBUG] No matches in response`); + } + } + } catch (queryError) { + console.error('[ERROR] Pinecone query failed:', queryError); + if (typeof queryError === 'object' && queryError !== null) { + console.error('[ERROR] Query error details:', JSON.stringify(queryError, null, 2)); + } + return []; + } + + if (!queryResponse || !queryResponse.matches || queryResponse.matches.length === 0) { + console.log('[DEBUG] ⚠️ No matches found in Pinecone for query'); + return []; + } + + // Format results + const results = queryResponse.matches.map(match => { + if (!match.metadata) { + console.error('[ERROR] Match is missing metadata:', match); + return { filepath: 'unknown', summary: 'No summary available' }; + } + + return { + filepath: (match.metadata as { filepath: string }).filepath || 'unknown', + summary: (match.metadata as { summary: string }).summary || 'No summary available', + score: match.score, + }; + }); + + if (this.debug) { + if (results.length > 0) { + console.log(`[DEBUG] Top result filepath: ${results[0]?.filepath}`); + console.log(`[DEBUG] Top result score: ${results[0]?.score}`); + console.log(`[DEBUG] Top result summary excerpt: "${results[0]?.summary?.substring(0, 100)}..."`); + } else { + console.log(`[DEBUG] No results returned after processing`); + } + } + + return results; + } catch (error) { + console.error('[ERROR] Error searching file summaries:', error); + if (typeof error === 'object' && error !== null) { + console.error('[ERROR] Full error details:', JSON.stringify(error, null, 2)); + } + return []; + } + } + + /** + * Runs a single test query after setup to validate the file summary search functionality. + */ + private async runSingleTestQuery() { + console.log(`\n[TEST] Running single test query to validate file summary search functionality...`); + + // Verify the index is accessible + try { + const indexStats = await this.summaryIndex.describeIndexStats(); + console.log(`[TEST] Pinecone index stats:`, JSON.stringify(indexStats, null, 2)); + console.log(`[TEST] Summary index contains ${indexStats.totalRecordCount} indexed summaries`); + } catch (error) { + console.error('[TEST] ❌ Failed to access Pinecone index:', error); + return; + } + + // Add a brief delay to ensure Pinecone has finished processing + console.log('[TEST] Waiting 2 seconds for Pinecone indexing to complete...'); + await new Promise(resolve => setTimeout(resolve, 2000)); + + // Run a single test query + const query = 'React components for the UI'; + console.log(`\n[TEST] Executing query: "${query}"`); + + try { + const results = await this.searchFileSummaries(query); + console.log(`[TEST] Search returned ${results.length} results:`); + + results.forEach((result, i) => { + console.log(`\n[TEST] Result ${i + 1}:`); + console.log(`[TEST] File: ${result.filepath}`); + console.log(`[TEST] Score: ${result.score}`); + console.log(`[TEST] Summary: "${result.summary?.substring(0, 150)}..."`); + }); + + // If we have results, fetch the content for the first one + if (results.length > 0) { + const topFilepath = results[0].filepath; + console.log(`\n[TEST] Fetching full content for top result: ${topFilepath}`); + const content = await this.getFileContent(topFilepath); + + if (content) { + console.log(`[TEST] ✅ Content retrieved successfully (${content.length} chars)`); + console.log(`[TEST] Content excerpt:\n---\n${content.substring(0, 300)}...\n---`); + } else { + console.log(`[TEST] ❌ Failed to retrieve content for ${topFilepath}`); + } + } else { + console.log(`\n[TEST] ⚠️ No results to fetch content for`); + } + + console.log(`\n[TEST] ✅ Test query completed`); + } catch (testError) { + console.error(`[TEST] ❌ Test query failed:`, testError); + if (typeof testError === 'object' && testError !== null) { + console.error('[TEST] Full error details:', JSON.stringify(testError, null, 2)); + } + } + } + + /** + * Gets the full content of a file by its filepath. + * @param filepath The filepath to look up + * @returns The file content or null if not found + */ + async getFileContent(filepath: string): Promise { + if (this.debug) console.log(`[DEBUG] Getting file content for: ${filepath}`); + try { + const startTime = Date.now(); + + // Use the Networking utility for consistent API access + // But convert the response to text manually to avoid JSON parsing + const rawResponse = await fetch('/getRawFileContent', { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ filepath }), + }); + + if (!rawResponse.ok) { + const errorText = await rawResponse.text(); + console.error(`[ERROR] Server returned error ${rawResponse.status}: ${errorText}`); + return null; + } + + // Get the raw text content without JSON parsing + const content = await rawResponse.text(); + const duration = Date.now() - startTime; + + if (this.debug) { + console.log(`[DEBUG] ✅ File content retrieved in ${duration}ms`); + console.log(`[DEBUG] Content length: ${content.length} chars`); + console.log(`[DEBUG] Content excerpt: "${content.substring(0, 100)}..."`); + } + + return content; + } catch (error) { + console.error('[ERROR] Error getting file content:', error); + if (typeof error === 'object' && error !== null) { + console.error('[ERROR] Full error details:', JSON.stringify(error, null, 2)); + } + return null; + } + } + /** * Adds an AI document to the vectorstore. Handles media file processing for audio/video, * and text embedding for all document types. Updates document metadata during processing. @@ -303,7 +787,7 @@ export class Vectorstore { * @param topK The number of top results to return (default is 10). * @returns A list of document chunks that match the query. */ - async retrieve(query: string, topK: number = 10): Promise { + async retrieve(query: string, topK: number = 10, docIds?: string[]): Promise { console.log(`Retrieving chunks for query: ${query}`); try { // Generate an embedding for the query using OpenAI. @@ -314,15 +798,16 @@ export class Vectorstore { }); const queryEmbedding = queryEmbeddingResponse.data[0].embedding; + const _docIds = docIds?.length === 0 || !docIds ? this.docManager.docIds : docIds; - console.log('Using document IDs for retrieval:', this.docManager.docIds); + console.log('Using document IDs for retrieval:', _docIds); // Query the Pinecone index using the embedding and filter by document IDs. // We'll query based on document IDs that are registered in the document manager const queryResponse: QueryResponse = await this.index.query({ vector: queryEmbedding, filter: { - doc_id: { $in: this.docManager.docIds }, + doc_id: { $in: _docIds }, }, topK, includeValues: true, diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts index b7ce4f663..9d0427b52 100644 --- a/src/server/ApiManagers/AssistantManager.ts +++ b/src/server/ApiManagers/AssistantManager.ts @@ -39,6 +39,7 @@ export enum Directory { csv = 'csv', chunk_images = 'chunk_images', scrape_images = 'scrape_images', + vectorstore = 'vectorstore', } // In-memory job tracking @@ -92,6 +93,119 @@ export default class AssistantManager extends ApiManager { const customsearch = google.customsearch('v1'); const openai = new OpenAI({ apiKey: env.OPENAI_API_KEY }); + // Register an endpoint to retrieve file summaries from the json file + register({ + method: Method.GET, + subscription: '/getFileSummaries', + secureHandler: async ({ req, res }) => { + try { + // Read the file summaries JSON file + const filePath = path.join(filesDirectory, Directory.vectorstore, 'file_summaries.json'); + + if (!fs.existsSync(filePath)) { + res.status(404).send({ error: 'File summaries not found' }); + return; + } + + const data = fs.readFileSync(filePath, 'utf8'); + res.send(data); + } catch (error) { + console.error('Error retrieving file summaries:', error); + res.status(500).send({ + error: 'Failed to retrieve file summaries', + }); + } + }, + }); + + // Register an endpoint to retrieve file content from the content json file + register({ + method: Method.POST, + subscription: '/getFileContent', + secureHandler: async ({ req, res }) => { + const { filepath } = req.body; + + if (!filepath) { + res.status(400).send({ error: 'Filepath is required' }); + return; + } + + try { + // Read the file content JSON file + const filePath = path.join(filesDirectory, Directory.vectorstore, 'file_content.json'); + + if (!fs.existsSync(filePath)) { + res.status(404).send({ error: 'File content database not found' }); + return; + } + + console.log(`[DEBUG] Retrieving content for: ${filepath}`); + + // Read the JSON file in chunks to handle large files + const readStream = fs.createReadStream(filePath, { encoding: 'utf8' }); + let jsonData = ''; + + readStream.on('data', chunk => { + jsonData += chunk; + }); + + readStream.on('end', () => { + try { + // Parse the JSON + const contentMap = JSON.parse(jsonData); + + // Check if the filepath exists in the map + if (!contentMap[filepath]) { + console.log(`[DEBUG] Content not found for: ${filepath}`); + res.status(404).send({ error: `Content not found for filepath: ${filepath}` }); + return; + } + + // Return the file content as is, not as JSON + console.log(`[DEBUG] Found content for: ${filepath} (${contentMap[filepath].length} chars)`); + res.send(contentMap[filepath]); + } catch (parseError) { + console.error('Error parsing file_content.json:', parseError); + res.status(500).send({ + error: 'Failed to parse file content database', + }); + } + }); + + readStream.on('error', streamError => { + console.error('Error reading file_content.json:', streamError); + res.status(500).send({ + error: 'Failed to read file content database', + }); + }); + } catch (error) { + console.error('Error retrieving file content:', error); + res.status(500).send({ + error: 'Failed to retrieve file content', + }); + } + }, + }); + + // Register an endpoint to search file summaries + register({ + method: Method.POST, + subscription: '/searchFileSummaries', + secureHandler: async ({ req, res }) => { + const { query, topK } = req.body; + + if (!query) { + res.status(400).send({ error: 'Search query is required' }); + return; + } + + // This endpoint will be called by the client-side Vectorstore to perform the search + // The actual search is implemented in the Vectorstore class + + res.send({ message: 'This endpoint should be called through the Vectorstore class' }); + }, + }); + // Register Wikipedia summary API route register({ method: Method.POST, @@ -848,6 +962,72 @@ export default class AssistantManager extends ApiManager { } }, }); + + // Register an endpoint to retrieve raw file content as plain text (no JSON parsing) + register({ + method: Method.POST, + subscription: '/getRawFileContent', + secureHandler: async ({ req, res }) => { + const { filepath } = req.body; + + if (!filepath) { + res.status(400).send('Filepath is required'); + return; + } + + try { + // Read the file content JSON file + const filePath = path.join(filesDirectory, Directory.vectorstore, 'file_content.json'); + + if (!fs.existsSync(filePath)) { + res.status(404).send('File content database not found'); + return; + } + + console.log(`[DEBUG] Retrieving raw content for: ${filepath}`); + + // Read the JSON file + const readStream = fs.createReadStream(filePath, { encoding: 'utf8' }); + let jsonData = ''; + + readStream.on('data', chunk => { + jsonData += chunk; + }); + + readStream.on('end', () => { + try { + // Parse the JSON + const contentMap = JSON.parse(jsonData); + + // Check if the filepath exists in the map + if (!contentMap[filepath]) { + console.log(`[DEBUG] Content not found for: ${filepath}`); + res.status(404).send(`Content not found for filepath: ${filepath}`); + return; + } + + // Set content type to plain text to avoid JSON parsing + res.setHeader('Content-Type', 'text/plain'); + + // Return the file content as plain text + console.log(`[DEBUG] Found content for: ${filepath} (${contentMap[filepath].length} chars)`); + res.send(contentMap[filepath]); + } catch (parseError) { + console.error('Error parsing file_content.json:', parseError); + res.status(500).send('Failed to parse file content database'); + } + }); + + readStream.on('error', streamError => { + console.error('Error reading file_content.json:', streamError); + res.status(500).send('Failed to read file content database'); + }); + } catch (error) { + console.error('Error retrieving file content:', error); + res.status(500).send('Failed to retrieve file content'); + } + }, + }); } } -- cgit v1.2.3-70-g09d2