diff options
| author | A.J. Shulman <Shulman.aj@gmail.com> | 2025-05-12 15:46:30 -0400 |
|---|---|---|
| committer | A.J. Shulman <Shulman.aj@gmail.com> | 2025-05-12 15:46:30 -0400 |
| commit | b3aa238043d01cbc58293b45867706fa9b36cefe (patch) | |
| tree | 770f07542f97e4bda2c56e00ef8118688e32fce9 /src/client/views/nodes/chatbot | |
| parent | 0a6f3fc649b37e273a501302c1dd645a5e9a18ac (diff) | |
working better
Diffstat (limited to 'src/client/views/nodes/chatbot')
3 files changed, 170 insertions, 16 deletions
diff --git a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx index 00077d68d..af689f243 100644 --- a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx +++ b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx @@ -15,7 +15,7 @@ import * as React from 'react'; import { v4 as uuidv4 } from 'uuid'; import { ClientUtils, OmitKeys } from '../../../../../ClientUtils'; import { Doc, DocListCast, Opt } from '../../../../../fields/Doc'; -import { DocData, DocViews } from '../../../../../fields/DocSymbols'; +import { DocData, DocLayout, DocViews } from '../../../../../fields/DocSymbols'; import { RichTextField } from '../../../../../fields/RichTextField'; import { ScriptField } from '../../../../../fields/ScriptField'; import { CsvCast, DocCast, NumCast, PDFCast, RTFCast, StrCast, VideoCast, AudioCast } from '../../../../../fields/Types'; @@ -644,6 +644,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { // Get the simplified chunk using the document manager const { foundChunk, doc } = this.docManager.getSimplifiedChunkById(chunkId); + console.log('doc: ', doc); if (!foundChunk) { if (doc) { console.warn(`Chunk not found in document, ${doc.id}, for chunk ID: ${chunkId}`); @@ -665,12 +666,14 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { console.error('No direct matching segment found for the citation.'); } } else if (foundChunk.chunkType === CHUNK_TYPE.TABLE || foundChunk.chunkType === CHUNK_TYPE.IMAGE) { - this.handleOtherChunkTypes(foundChunk, citation, doc); + console.log('here: ', foundChunk); + this.handleOtherChunkTypes(foundChunk as SimplifiedChunk, citation, doc); } else { if (doc.type === 'web') { DocumentManager.Instance.showDocument(doc, { openLocation: OpenWhere.addRight }, () => {}); return; } + this.handleOtherChunkTypes(foundChunk, citation, doc); // Show the chunk text in citation popup let chunkText = 
citation.direct_text || 'Text content not available'; this.showCitationPopup(chunkText); @@ -834,10 +837,45 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { this._citationPopup = { text: citation.direct_text ?? 'No text available', visible: true }; this.startCitationPopupTimer(); + // Check if the document is a PDF (has a PDF viewer component) + const isPDF = PDFCast(doc.data) !== null || doc.type === DocumentType.PDF; + DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => { - const firstView = Array.from(doc[DocViews])[0] as DocumentView; - (firstView.ComponentView as PDFBox)?.gotoPage?.(foundChunk.startPage ?? 0); - (firstView.ComponentView as PDFBox)?.search?.(citation.direct_text ?? ''); + // Add a delay to ensure document is fully loaded and rendered + setTimeout(() => { + try { + // Safety check: ensure the document has views + if (!doc[DocViews] || doc[DocViews].size === 0) { + console.warn('Document views not available yet, retrying...'); + this.retryPdfSearch(doc, citation, foundChunk, isPDF, 1); + return; + } + + const views = Array.from(doc[DocViews]); + if (!views.length) { + console.warn('No document views found, retrying...'); + this.retryPdfSearch(doc, citation, foundChunk, isPDF, 1); + return; + } + + const firstView = views[0] as DocumentView; + if (!firstView || !firstView.ComponentView) { + console.warn('Component view not available yet, retrying...'); + this.retryPdfSearch(doc, citation, foundChunk, isPDF, 1); + return; + } + + const pdfComponent = firstView.ComponentView as PDFBox; + if (isPDF && pdfComponent && citation.direct_text) { + // Use our helper to ensure fuzzy search is enabled and execute the search + this.ensureFuzzySearchAndExecute(pdfComponent, citation.direct_text.trim(), foundChunk.startPage); + } + } catch (error) { + console.error('Error accessing PDF component:', error); + // Retry with exponential backoff + this.retryPdfSearch(doc, citation, foundChunk, isPDF, 1); + } + 
}, 500); // Initial delay before first attempt }); break; case CHUNK_TYPE.CSV: @@ -851,6 +889,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { break; } }; + /** * Creates an annotation highlight on a PDF document for image citations. * @param x1 X-coordinate of the top-left corner of the highlight. @@ -1092,6 +1131,100 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { }; /** + * Retry PDF search with exponential backoff + */ + retryPdfSearch = (doc: Doc, citation: Citation, foundChunk: SimplifiedChunk, isPDF: boolean, attempt: number) => { + if (attempt > 5) { + console.error('Maximum retry attempts reached for PDF search'); + return; + } + + const delay = Math.min(2000, 500 * Math.pow(1.5, attempt)); // Exponential backoff with max delay of 2 seconds + + setTimeout(() => { + try { + if (!doc[DocViews] || doc[DocViews].size === 0) { + this.retryPdfSearch(doc, citation, foundChunk, isPDF, attempt + 1); + return; + } + + const views = Array.from(doc[DocViews]); + if (!views.length) { + this.retryPdfSearch(doc, citation, foundChunk, isPDF, attempt + 1); + return; + } + + const firstView = views[0] as DocumentView; + if (!firstView || !firstView.ComponentView) { + this.retryPdfSearch(doc, citation, foundChunk, isPDF, attempt + 1); + return; + } + + const pdfComponent = firstView.ComponentView as PDFBox; + if (isPDF && pdfComponent && citation.direct_text) { + console.log(`PDF component found on attempt ${attempt}, executing search...`); + this.ensureFuzzySearchAndExecute(pdfComponent, citation.direct_text.trim(), foundChunk.startPage); + } + } catch (error) { + console.error(`Error on retry attempt ${attempt}:`, error); + this.retryPdfSearch(doc, citation, foundChunk, isPDF, attempt + 1); + } + }, delay); + }; + + /** + * Ensures fuzzy search is enabled in PDFBox and performs a search + * @param pdfComponent The PDFBox component + * @param searchText The text to search for + * @param startPage Optional page 
to navigate to before searching + */ + private ensureFuzzySearchAndExecute = (pdfComponent: PDFBox, searchText: string, startPage?: number) => { + if (!pdfComponent) { + console.warn('PDF component is undefined, cannot perform search'); + return; + } + + if (!searchText?.trim()) { + console.warn('Search text is empty, skipping search'); + return; + } + + try { + // Check if the component has required methods + if (typeof pdfComponent.gotoPage !== 'function' || typeof pdfComponent.toggleFuzzySearch !== 'function' || typeof pdfComponent.search !== 'function') { + console.warn('PDF component missing required methods'); + return; + } + + // Navigate to the page if specified + if (typeof startPage === 'number') { + pdfComponent.gotoPage(startPage + 1); + } + + // Always try to enable fuzzy search + try { + // PDFBox.tsx toggles fuzzy search state internally + // We'll call it once to make sure it's enabled + pdfComponent.toggleFuzzySearch(); + } catch (toggleError) { + console.warn('Error toggling fuzzy search:', toggleError); + } + + // Add a sufficient delay to ensure PDF is fully loaded before searching + setTimeout(() => { + try { + console.log('Performing fuzzy search for text:', searchText); + pdfComponent.search(searchText); + } catch (searchError) { + console.error('Error performing search:', searchError); + } + }, 1000); // Increased delay for better reliability + } catch (error) { + console.error('Error in fuzzy search setup:', error); + } + }; + + /** * Main render method for the ChatBox */ render() { diff --git a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts index e9d41efbd..784e90c3c 100644 --- a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts +++ b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts @@ -173,6 +173,16 @@ export class AgentDocumentManager { public processDocument(doc: Doc): string { // Ensure document has a persistent ID const docId = 
this.ensureDocumentId(doc); + if (doc.chunk_simplified) { + const newChunks: SimplifiedChunk[] = []; + for (const chunk of JSON.parse(StrCast(doc.chunk_simplified))) { + console.log('chunk', chunk); + newChunks.push(chunk as SimplifiedChunk); + } + console.log('Added simplified chunks to simplifiedChunks:', docId, newChunks); + this.addSimplifiedChunks(newChunks); + //DocCast(DocCast(this.chatBoxDocument!.agentDocument)!.chunk_simpl)!.mapping = new List<string>(Array.from(this.simplifiedChunks.values()).map(chunk => JSON.stringify(chunk))); + } // Only add if we haven't already processed this document if (!this.documentsById.has(docId)) { this.documentsById.set(docId, { layoutDoc: doc, dataDoc: doc[DocData] }); @@ -1005,21 +1015,28 @@ export class AgentDocumentManager { * @returns The updated document with simplified chunks */ @action - public addSimplifiedChunks(chunks: RAGChunk[], docType: string) { + public addSimplifiedChunks(simplifiedChunks: SimplifiedChunk[]) { + simplifiedChunks.forEach(chunk => { + this.simplifiedChunks.set(chunk.chunkId, chunk); + }); + } + + public getSimplifiedChunks(chunks: RAGChunk[], docType: string): SimplifiedChunk[] { console.log('chunks', chunks, 'simplifiedChunks', this.simplifiedChunks); + const simplifiedChunks: SimplifiedChunk[] = []; // Create array of simplified chunks based on document type for (const chunk of chunks) { // Common properties across all chunk types const baseChunk: SimplifiedChunk = { chunkId: chunk.id, - text: chunk.metadata.text, + //text: chunk.metadata.text, doc_id: chunk.metadata.doc_id, chunkType: chunk.metadata.type || CHUNK_TYPE.TEXT, }; // Add type-specific properties if (docType === 'video' || docType === 'audio') { - this.simplifiedChunks.set(chunk.id, { + simplifiedChunks.push({ ...baseChunk, start_time: chunk.metadata.start_time, end_time: chunk.metadata.end_time, @@ -1027,14 +1044,14 @@ export class AgentDocumentManager { chunkType: docType === 'video' ? 
CHUNK_TYPE.VIDEO : CHUNK_TYPE.AUDIO, } as SimplifiedChunk); } else if (docType === 'pdf') { - this.simplifiedChunks.set(chunk.id, { + simplifiedChunks.push({ ...baseChunk, startPage: chunk.metadata.start_page, endPage: chunk.metadata.end_page, location: chunk.metadata.location, } as SimplifiedChunk); } else if (docType === 'csv') { - this.simplifiedChunks.set(chunk.id, { + simplifiedChunks.push({ ...baseChunk, rowStart: (chunk.metadata as any).row_start, rowEnd: (chunk.metadata as any).row_end, @@ -1043,9 +1060,10 @@ export class AgentDocumentManager { } as SimplifiedChunk); } else { // Default for other document types - this.simplifiedChunks.set(chunk.id, baseChunk as SimplifiedChunk); + simplifiedChunks.push(baseChunk as SimplifiedChunk); } } + return simplifiedChunks; } /** @@ -1054,9 +1072,8 @@ export class AgentDocumentManager { * @param chunkId The ID of the chunk to retrieve * @returns The simplified chunk if found, undefined otherwise */ + @action public getSimplifiedChunkById(chunkId: string): any | undefined { - console.log('chunkId', chunkId, 'simplifiedChunks', this.simplifiedChunks); - console.log('doc', this.getDocument(this.simplifiedChunks.get(chunkId)?.doc_id || '')); return { foundChunk: this.simplifiedChunks.get(chunkId), doc: this.getDocument(this.simplifiedChunks.get(chunkId)?.doc_id || chunkId) }; } diff --git a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts index f1fae6f11..252672dfc 100644 --- a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts +++ b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts @@ -40,7 +40,7 @@ export class Vectorstore { * @param docManager An instance of AgentDocumentManager to handle document management. 
*/ constructor(id: string, docManager: AgentDocumentManager) { - const pineconeApiKey = process.env.PINECONE_API_KEY; + const pineconeApiKey = 'pcsk_3txLxJ_9fxdmAph4csnq4yxoDF5De5A8bJvjWaXXigBgshy4eoXggrXcxATJiH8vzXbrKm'; if (!pineconeApiKey) { console.log('PINECONE_API_KEY is not defined - Vectorstore will be unavailable'); return; @@ -181,7 +181,9 @@ export class Vectorstore { doc.segmented_transcript = JSON.stringify(segmentedTranscript); // Use doc manager to add simplified chunks const docType = local_file_path.endsWith('.mp3') ? 'audio' : 'video'; - this.docManager.addSimplifiedChunks(result.chunks, docType); + const simplifiedChunks = this.docManager.getSimplifiedChunks(result.chunks, docType); + doc.chunk_simplified = JSON.stringify(simplifiedChunks); + this.docManager.addSimplifiedChunks(simplifiedChunks); } else { // Process regular document console.log('Processing regular document...'); @@ -215,7 +217,9 @@ export class Vectorstore { // Use doc manager to add simplified chunks - determine document type from file extension const fileExt = path.extname(local_file_path).toLowerCase(); const docType = fileExt === '.pdf' ? 'pdf' : fileExt === '.csv' ? 'csv' : 'text'; - this.docManager.addSimplifiedChunks(result.chunks, docType); + const simplifiedChunks = this.docManager.getSimplifiedChunks(result.chunks, docType); + doc.chunk_simplified = JSON.stringify(simplifiedChunks); + this.docManager.addSimplifiedChunks(simplifiedChunks); doc.summary = result.summary; doc.ai_purpose = result.purpose; |
