diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/client/views/nodes/PDFBox.scss | 23 | ||||
-rw-r--r-- | src/client/views/nodes/PDFBox.tsx | 12 | ||||
-rw-r--r-- | src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx | 143 | ||||
-rw-r--r-- | src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts | 33 | ||||
-rw-r--r-- | src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts | 10 | ||||
-rw-r--r-- | src/client/views/pdf/PDFViewer.tsx | 572 |
6 files changed, 762 insertions, 31 deletions
diff --git a/src/client/views/nodes/PDFBox.scss b/src/client/views/nodes/PDFBox.scss index eaea272dc..44013a96d 100644 --- a/src/client/views/nodes/PDFBox.scss +++ b/src/client/views/nodes/PDFBox.scss @@ -344,3 +344,26 @@ font-size: 30px; } } + +.pdfBox-fuzzy { + border: none; + background-color: #4a4a4a; + color: white; + padding: 0 8px; + height: 24px; + cursor: pointer; + margin-right: 4px; + border-radius: 3px; + display: flex; + align-items: center; + justify-content: center; + + &.active { + background-color: #3498db; + color: white; + } + + &:hover { + background-color: #2980b9; + } +} diff --git a/src/client/views/nodes/PDFBox.tsx b/src/client/views/nodes/PDFBox.tsx index 55e6d5596..4ecbd65b6 100644 --- a/src/client/views/nodes/PDFBox.tsx +++ b/src/client/views/nodes/PDFBox.tsx @@ -53,6 +53,7 @@ export class PDFBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { private _sidebarRef = React.createRef<SidebarAnnos>(); @observable private _searching: boolean = false; + @observable private _fuzzySearchEnabled: boolean = true; @observable private _pdf: Opt<Pdfjs.PDFDocumentProxy> = undefined; @observable private _pageControls = false; @@ -272,6 +273,14 @@ export class PDFBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { !this.Document._layout_fitWidth && (this.Document._height = NumCast(this.Document._width) * (p.height / p.width)); }; + @action + toggleFuzzySearch = () => { + this._fuzzySearchEnabled = !this._fuzzySearchEnabled; + this._pdfViewer?.toggleFuzzySearch(); + // Clear existing search results when switching modes + this.search('', false, true); + }; + override search = action((searchString: string, bwd?: boolean, clear: boolean = false) => { if (!this._searching && !clear) { this._searching = true; @@ -412,6 +421,9 @@ export class PDFBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { <button type="button" className="pdfBox-search" title="Search" onClick={e => this.search(this._searchString, e.shiftKey)}> <FontAwesomeIcon icon="search" size="sm" /> </button> + <button type="button" className={`pdfBox-fuzzy ${this._fuzzySearchEnabled ? 'active' : ''}`} title={`${this._fuzzySearchEnabled ? 'Disable' : 'Enable'} Fuzzy Search`} onClick={this.toggleFuzzySearch}> + <FontAwesomeIcon icon="magic" size="sm" /> + </button> <button type="button" className="pdfBox-prevIcon" title="Previous Annotation" onClick={this.prevAnnotation}> <FontAwesomeIcon icon="arrow-up" size="lg" /> </button> diff --git a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx index 00077d68d..af689f243 100644 --- a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx +++ b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx @@ -15,7 +15,7 @@ import * as React from 'react'; import { v4 as uuidv4 } from 'uuid'; import { ClientUtils, OmitKeys } from '../../../../../ClientUtils'; import { Doc, DocListCast, Opt } from '../../../../../fields/Doc'; -import { DocData, DocViews } from '../../../../../fields/DocSymbols'; +import { DocData, DocLayout, DocViews } from '../../../../../fields/DocSymbols'; import { RichTextField } from '../../../../../fields/RichTextField'; import { ScriptField } from '../../../../../fields/ScriptField'; import { CsvCast, DocCast, NumCast, PDFCast, RTFCast, StrCast, VideoCast, AudioCast } from '../../../../../fields/Types'; @@ -644,6 +644,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { // Get the simplified chunk using the document manager const { foundChunk, doc } = this.docManager.getSimplifiedChunkById(chunkId); + console.log('doc: ', doc); if (!foundChunk) { if (doc) { console.warn(`Chunk not found in document, ${doc.id}, for chunk ID: ${chunkId}`); @@ -665,12 +666,14 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { console.error('No direct matching segment found for the citation.'); } } else if (foundChunk.chunkType === CHUNK_TYPE.TABLE || foundChunk.chunkType === CHUNK_TYPE.IMAGE) { - this.handleOtherChunkTypes(foundChunk, citation, doc); + console.log('here: ', foundChunk); + this.handleOtherChunkTypes(foundChunk as SimplifiedChunk, citation, doc); } else { if (doc.type === 'web') { DocumentManager.Instance.showDocument(doc, { openLocation: OpenWhere.addRight }, () => {}); return; } + this.handleOtherChunkTypes(foundChunk, citation, doc); // Show the chunk text in citation popup let chunkText = citation.direct_text || 'Text content not available'; this.showCitationPopup(chunkText); @@ -834,10 +837,45 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { this._citationPopup = { text: citation.direct_text ?? 'No text available', visible: true }; this.startCitationPopupTimer(); + // Check if the document is a PDF (has a PDF viewer component) + const isPDF = PDFCast(doc.data) !== null || doc.type === DocumentType.PDF; + DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => { - const firstView = Array.from(doc[DocViews])[0] as DocumentView; - (firstView.ComponentView as PDFBox)?.gotoPage?.(foundChunk.startPage ?? 0); - (firstView.ComponentView as PDFBox)?.search?.(citation.direct_text ?? ''); + // Add a delay to ensure document is fully loaded and rendered + setTimeout(() => { + try { + // Safety check: ensure the document has views + if (!doc[DocViews] || doc[DocViews].size === 0) { + console.warn('Document views not available yet, retrying...'); + this.retryPdfSearch(doc, citation, foundChunk, isPDF, 1); + return; + } + + const views = Array.from(doc[DocViews]); + if (!views.length) { + console.warn('No document views found, retrying...'); + this.retryPdfSearch(doc, citation, foundChunk, isPDF, 1); + return; + } + + const firstView = views[0] as DocumentView; + if (!firstView || !firstView.ComponentView) { + console.warn('Component view not available yet, retrying...'); + this.retryPdfSearch(doc, citation, foundChunk, isPDF, 1); + return; + } + + const pdfComponent = firstView.ComponentView as PDFBox; + if (isPDF && pdfComponent && citation.direct_text) { + // Use our helper to ensure fuzzy search is enabled and execute the search + this.ensureFuzzySearchAndExecute(pdfComponent, citation.direct_text.trim(), foundChunk.startPage); + } + } catch (error) { + console.error('Error accessing PDF component:', error); + // Retry with exponential backoff + this.retryPdfSearch(doc, citation, foundChunk, isPDF, 1); + } + }, 500); // Initial delay before first attempt }); break; case CHUNK_TYPE.CSV: @@ -851,6 +889,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { break; } }; + /** * Creates an annotation highlight on a PDF document for image citations. * @param x1 X-coordinate of the top-left corner of the highlight. @@ -1092,6 +1131,100 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { }; /** + * Retry PDF search with exponential backoff + */ + retryPdfSearch = (doc: Doc, citation: Citation, foundChunk: SimplifiedChunk, isPDF: boolean, attempt: number) => { + if (attempt > 5) { + console.error('Maximum retry attempts reached for PDF search'); + return; + } + + const delay = Math.min(2000, 500 * Math.pow(1.5, attempt)); // Exponential backoff with max delay of 2 seconds + + setTimeout(() => { + try { + if (!doc[DocViews] || doc[DocViews].size === 0) { + this.retryPdfSearch(doc, citation, foundChunk, isPDF, attempt + 1); + return; + } + + const views = Array.from(doc[DocViews]); + if (!views.length) { + this.retryPdfSearch(doc, citation, foundChunk, isPDF, attempt + 1); + return; + } + + const firstView = views[0] as DocumentView; + if (!firstView || !firstView.ComponentView) { + this.retryPdfSearch(doc, citation, foundChunk, isPDF, attempt + 1); + return; + } + + const pdfComponent = firstView.ComponentView as PDFBox; + if (isPDF && pdfComponent && citation.direct_text) { + console.log(`PDF component found on attempt ${attempt}, executing search...`); + this.ensureFuzzySearchAndExecute(pdfComponent, citation.direct_text.trim(), foundChunk.startPage); + } + } catch (error) { + console.error(`Error on retry attempt ${attempt}:`, error); + this.retryPdfSearch(doc, citation, foundChunk, isPDF, attempt + 1); + } + }, delay); + }; + + /** + * Ensures fuzzy search is enabled in PDFBox and performs a search + * @param pdfComponent The PDFBox component + * @param searchText The text to search for + * @param startPage Optional page to navigate to before searching + */ + private ensureFuzzySearchAndExecute = (pdfComponent: PDFBox, searchText: string, startPage?: number) => { + if (!pdfComponent) { + console.warn('PDF component is undefined, cannot perform search'); + return; + } + + if (!searchText?.trim()) { + console.warn('Search text is empty, skipping search'); + return; + } + + try { + // Check if the component has required methods + if (typeof pdfComponent.gotoPage !== 'function' || typeof pdfComponent.toggleFuzzySearch !== 'function' || typeof pdfComponent.search !== 'function') { + console.warn('PDF component missing required methods'); + return; + } + + // Navigate to the page if specified + if (typeof startPage === 'number') { + pdfComponent.gotoPage(startPage + 1); + } + + // Always try to enable fuzzy search + try { + // PDFBox.tsx toggles fuzzy search state internally + // We'll call it once to make sure it's enabled + pdfComponent.toggleFuzzySearch(); + } catch (toggleError) { + console.warn('Error toggling fuzzy search:', toggleError); + } + + // Add a sufficient delay to ensure PDF is fully loaded before searching + setTimeout(() => { + try { + console.log('Performing fuzzy search for text:', searchText); + pdfComponent.search(searchText); + } catch (searchError) { + console.error('Error performing search:', searchError); + } + }, 1000); // Increased delay for better reliability + } catch (error) { + console.error('Error in fuzzy search setup:', error); + } + }; + + /** * Main render method for the ChatBox */ render() { diff --git a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts index e9d41efbd..784e90c3c 100644 --- a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts +++ b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts @@ -173,6 +173,16 @@ export class AgentDocumentManager { public processDocument(doc: Doc): string { // Ensure document has a persistent ID const docId = this.ensureDocumentId(doc); + if (doc.chunk_simplified) { + const newChunks: SimplifiedChunk[] = []; + for (const chunk of JSON.parse(StrCast(doc.chunk_simplified))) { + console.log('chunk', chunk); + newChunks.push(chunk as SimplifiedChunk); + } + console.log('Added simplified chunks to simplifiedChunks:', docId, newChunks); + this.addSimplifiedChunks(newChunks); + //DocCast(DocCast(this.chatBoxDocument!.agentDocument)!.chunk_simpl)!.mapping = new List<string>(Array.from(this.simplifiedChunks.values()).map(chunk => JSON.stringify(chunk))); + } // Only add if we haven't already processed this document if (!this.documentsById.has(docId)) { this.documentsById.set(docId, { layoutDoc: doc, dataDoc: doc[DocData] }); @@ -1005,21 +1015,28 @@ export class AgentDocumentManager { * @returns The updated document with simplified chunks */ @action - public addSimplifiedChunks(chunks: RAGChunk[], docType: string) { + public addSimplifiedChunks(simplifiedChunks: SimplifiedChunk[]) { + simplifiedChunks.forEach(chunk => { + this.simplifiedChunks.set(chunk.chunkId, chunk); + }); + } + + public getSimplifiedChunks(chunks: RAGChunk[], docType: string): SimplifiedChunk[] { console.log('chunks', chunks, 'simplifiedChunks', this.simplifiedChunks); + const simplifiedChunks: SimplifiedChunk[] = []; // Create array of simplified chunks based on document type for (const chunk of chunks) { // Common properties across all chunk types const baseChunk: SimplifiedChunk = { chunkId: chunk.id, - text: chunk.metadata.text, + //text: chunk.metadata.text, doc_id: chunk.metadata.doc_id, chunkType: chunk.metadata.type || CHUNK_TYPE.TEXT, }; // Add type-specific properties if (docType === 'video' || docType === 'audio') { - this.simplifiedChunks.set(chunk.id, { + simplifiedChunks.push({ ...baseChunk, start_time: chunk.metadata.start_time, end_time: chunk.metadata.end_time, @@ -1027,14 +1044,14 @@ export class AgentDocumentManager { chunkType: docType === 'video' ? CHUNK_TYPE.VIDEO : CHUNK_TYPE.AUDIO, } as SimplifiedChunk); } else if (docType === 'pdf') { - this.simplifiedChunks.set(chunk.id, { + simplifiedChunks.push({ ...baseChunk, startPage: chunk.metadata.start_page, endPage: chunk.metadata.end_page, location: chunk.metadata.location, } as SimplifiedChunk); } else if (docType === 'csv') { - this.simplifiedChunks.set(chunk.id, { + simplifiedChunks.push({ ...baseChunk, rowStart: (chunk.metadata as any).row_start, rowEnd: (chunk.metadata as any).row_end, @@ -1043,9 +1060,10 @@ export class AgentDocumentManager { } as SimplifiedChunk); } else { // Default for other document types - this.simplifiedChunks.set(chunk.id, baseChunk as SimplifiedChunk); + simplifiedChunks.push(baseChunk as SimplifiedChunk); } } + return simplifiedChunks; } /** @@ -1054,9 +1072,8 @@ export class AgentDocumentManager { * @param chunkId The ID of the chunk to retrieve * @returns The simplified chunk if found, undefined otherwise */ + @action public getSimplifiedChunkById(chunkId: string): any | undefined { - console.log('chunkId', chunkId, 'simplifiedChunks', this.simplifiedChunks); - console.log('doc', this.getDocument(this.simplifiedChunks.get(chunkId)?.doc_id || '')); return { foundChunk: this.simplifiedChunks.get(chunkId), doc: this.getDocument(this.simplifiedChunks.get(chunkId)?.doc_id || chunkId) }; } diff --git a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts index f1fae6f11..252672dfc 100644 --- a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts +++ b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts @@ -40,7 +40,7 @@ export class Vectorstore { * @param docManager An instance of AgentDocumentManager to handle document management. */ constructor(id: string, docManager: AgentDocumentManager) { - const pineconeApiKey = process.env.PINECONE_API_KEY; + const pineconeApiKey = 'pcsk_3txLxJ_9fxdmAph4csnq4yxoDF5De5A8bJvjWaXXigBgshy4eoXggrXcxATJiH8vzXbrKm'; if (!pineconeApiKey) { console.log('PINECONE_API_KEY is not defined - Vectorstore will be unavailable'); return; @@ -181,7 +181,9 @@ export class Vectorstore { doc.segmented_transcript = JSON.stringify(segmentedTranscript); // Use doc manager to add simplified chunks const docType = local_file_path.endsWith('.mp3') ? 'audio' : 'video'; - this.docManager.addSimplifiedChunks(result.chunks, docType); + const simplifiedChunks = this.docManager.getSimplifiedChunks(result.chunks, docType); + doc.chunk_simplified = JSON.stringify(simplifiedChunks); + this.docManager.addSimplifiedChunks(simplifiedChunks); } else { // Process regular document console.log('Processing regular document...'); @@ -215,7 +217,9 @@ export class Vectorstore { // Use doc manager to add simplified chunks - determine document type from file extension const fileExt = path.extname(local_file_path).toLowerCase(); const docType = fileExt === '.pdf' ? 'pdf' : fileExt === '.csv' ? 'csv' : 'text'; - this.docManager.addSimplifiedChunks(result.chunks, docType); + const simplifiedChunks = this.docManager.getSimplifiedChunks(result.chunks, docType); + doc.chunk_simplified = JSON.stringify(simplifiedChunks); + this.docManager.addSimplifiedChunks(simplifiedChunks); doc.summary = result.summary; doc.ai_purpose = result.purpose; diff --git a/src/client/views/pdf/PDFViewer.tsx b/src/client/views/pdf/PDFViewer.tsx index fc2567fbc..a88d8b282 100644 --- a/src/client/views/pdf/PDFViewer.tsx +++ b/src/client/views/pdf/PDFViewer.tsx @@ -50,6 +50,15 @@ interface IViewerProps extends FieldViewProps { crop: (region: Doc | undefined, addCrop?: boolean) => Doc | undefined; } +// Add this type definition right after the existing imports +interface FuzzySearchResult { + pageIndex: number; + matchIndex: number; + text: string; + score?: number; + isParagraph?: boolean; +} + /** * Handles rendering and virtualization of the pdf */ @@ -68,6 +77,9 @@ export class PDFViewer extends ObservableReactComponent<IViewerProps> { @observable _showWaiting = true; @observable Index: number = -1; @observable private _loading = false; + @observable private _fuzzySearchEnabled = true; + @observable private _fuzzySearchResults: FuzzySearchResult[] = []; + @observable private _currentFuzzyMatchIndex = 0; private _pdfViewer!: PDFJSViewer.PDFViewer; private _styleRule: number | undefined; // stylesheet rule for making hyperlinks clickable @@ -334,27 +346,557 @@ export class PDFViewer extends ObservableReactComponent<IViewerProps> { return index; }; + // Normalize text by removing extra spaces, punctuation, and converting to lowercase + private normalizeText(text: string): string { + return text + .toLowerCase() + .replace(/\s+/g, ' ') + .replace(/[^\w\s]/g, ' ') + .trim(); + } + + // Compute similarity between two strings (0-1 where 1 is exact match) + private computeSimilarity(str1: string, str2: string): number { + const s1 = this.normalizeText(str1); + const s2 = this.normalizeText(str2); + + if (s1 === s2) return 1; + if (s1.length === 0 || s2.length === 0) return 0; + + // For very long texts, check if one contains chunks of the other + if (s1.length > 50 || s2.length > 50) { + // For long texts, check if significant chunks overlap + const longerText = s1.length > s2.length ? s1 : s2; + const shorterText = s1.length > s2.length ? s2 : s1; + + // Break the shorter text into chunks + const words = shorterText.split(' '); + const chunkSize = Math.min(5, Math.floor(words.length / 2)); + + if (chunkSize > 0) { + let maxChunkMatch = 0; + + // Check different chunks of the shorter text against the longer text + for (let i = 0; i <= words.length - chunkSize; i++) { + const chunk = words.slice(i, i + chunkSize).join(' '); + if (longerText.includes(chunk)) { + maxChunkMatch = Math.max(maxChunkMatch, chunk.length / shorterText.length); + } + } + + if (maxChunkMatch > 0.2) { + return Math.min(0.9, maxChunkMatch + 0.3); // Boost the score, max 0.9 + } + } + + // Check for substantial overlap in content + const words1 = new Set(s1.split(' ')); + const words2 = new Set(s2.split(' ')); + + let commonWords = 0; + for (const word of words1) { + if (word.length > 2 && words2.has(word)) { + // Only count meaningful words (length > 2) + commonWords++; + } + } + + // Calculate ratio of common words + const overlapRatio = commonWords / Math.min(words1.size, words2.size); + + // For long text, a lower match can still be significant + if (overlapRatio > 0.4) { + return Math.min(0.9, overlapRatio); + } + } + + // Simple contains check for shorter texts + if (s1.includes(s2) || s2.includes(s1)) { + return (0.8 * Math.min(s1.length, s2.length)) / Math.max(s1.length, s2.length); + } + + // For shorter texts, use Levenshtein for more precision + if (s1.length < 100 && s2.length < 100) { + // Calculate Levenshtein distance + const dp: number[][] = Array(s1.length + 1) + .fill(0) + .map(() => Array(s2.length + 1).fill(0)); + + for (let i = 0; i <= s1.length; i++) dp[i][0] = i; + for (let j = 0; j <= s2.length; j++) dp[0][j] = j; + + for (let i = 1; i <= s1.length; i++) { + for (let j = 1; j <= s2.length; j++) { + const cost = s1[i - 1] === s2[j - 1] ? 0 : 1; + dp[i][j] = Math.min( + dp[i - 1][j] + 1, // deletion + dp[i][j - 1] + 1, // insertion + dp[i - 1][j - 1] + cost // substitution + ); + } + } + + const distance = dp[s1.length][s2.length]; + return 1 - distance / Math.max(s1.length, s2.length); + } + + return 0; + } + + // Perform fuzzy search on PDF text content + private async performFuzzySearch(searchString: string, bwd?: boolean): Promise<boolean> { + if (!this._pdfViewer || !searchString.trim()) return false; + + const normalizedSearch = this.normalizeText(searchString); + this._fuzzySearchResults = []; + + // Adjust threshold based on text length - more lenient for longer text + let similarityThreshold = 0.6; + if (searchString.length > 100) similarityThreshold = 0.35; + else if (searchString.length > 50) similarityThreshold = 0.45; + + console.log(`Using similarity threshold: ${similarityThreshold} for query length: ${searchString.length}`); + + // For longer queries, also look for partial matches + const searchWords = normalizedSearch.split(' ').filter(w => w.length > 3); + const isLongQuery = searchWords.length > 5; + + // Track best match for debugging + let bestMatchScore = 0; + let bestMatchText = ''; + + // Fallback strategy: extract key phrases for very long search queries + let keyPhrases: string[] = []; + if (searchString.length > 200) { + // Extract key phrases (chunks of 3-6 words) from the search string + const words = normalizedSearch.split(' '); + for (let i = 0; i < words.length - 2; i += 2) { + const phraseLength = Math.min(5, words.length - i); + if (phraseLength >= 3) { + keyPhrases.push(words.slice(i, i + phraseLength).join(' ')); + } + } + console.log(`Using ${keyPhrases.length} key phrases for long search text`); + } + + // Process PDF in batches to avoid memory issues + const totalPages = this._pageSizes.length; + const BATCH_SIZE = 10; // Process 10 pages at a time + + console.log(`Searching all ${totalPages} pages in batches of ${BATCH_SIZE}`); + + // Process PDF in batches + for (let batchStart = 0; batchStart < totalPages; batchStart += BATCH_SIZE) { + const batchEnd = Math.min(batchStart + BATCH_SIZE, totalPages); + console.log(`Processing pages ${batchStart + 1} to ${batchEnd} of ${totalPages}`); + + // Process each page in current batch + for (let pageIndex = batchStart; pageIndex < batchEnd; pageIndex++) { + try { + const page = await this._props.pdf.getPage(pageIndex + 1); + const textContent = await page.getTextContent(); + + // For long text, try to reconstruct paragraphs first + let paragraphs: string[] = []; + + try { + if (isLongQuery) { + // Group text items into paragraphs based on positions + let currentY: number | null = null; + let currentParagraph = ''; + + // Sort by Y position first, then X + const sortedItems = [...textContent.items].sort((a: any, b: any) => { + const aTransform = (a as any).transform || []; + const bTransform = (b as any).transform || []; + if (Math.abs(aTransform[5] - bTransform[5]) < 5) { + return (aTransform[4] || 0) - (bTransform[4] || 0); + } + return (aTransform[5] || 0) - (bTransform[5] || 0); + }); + + // Limit paragraph size to avoid overflows + const MAX_PARAGRAPH_LENGTH = 1000; + + for (const item of sortedItems) { + const text = (item as any).str || ''; + const transform = (item as any).transform || []; + const y = transform[5]; + + // If this is a new line or first item + if (currentY === null || Math.abs(y - currentY) > 5 || currentParagraph.length + text.length > MAX_PARAGRAPH_LENGTH) { + if (currentParagraph) { + paragraphs.push(currentParagraph.trim()); + } + currentParagraph = text; + currentY = y; + } else { + // Continue the current paragraph + currentParagraph += ' ' + text; + } + } + + // Add the last paragraph + if (currentParagraph) { + paragraphs.push(currentParagraph.trim()); + } + + // Limit the number of paragraph combinations to avoid exponential growth + const MAX_COMBINED_PARAGRAPHS = 5; + + // Also create overlapping larger paragraphs for better context, but limit size + if (paragraphs.length > 1) { + const combinedCount = Math.min(paragraphs.length - 1, MAX_COMBINED_PARAGRAPHS); + for (let i = 0; i < combinedCount; i++) { + if (paragraphs[i].length + paragraphs[i + 1].length < MAX_PARAGRAPH_LENGTH) { + paragraphs.push(paragraphs[i] + ' ' + paragraphs[i + 1]); + } + } + } + } + } catch (paragraphError) { + console.warn('Error during paragraph reconstruction:', paragraphError); + // Continue with individual items if paragraph reconstruction fails + } + + // For extremely long search texts, use our key phrases approach + if (keyPhrases.length > 0) { + // Check each paragraph for key phrases + for (const paragraph of paragraphs) { + let matchingPhrases = 0; + let bestPhraseScore = 0; + + for (const phrase of keyPhrases) { + const similarity = this.computeSimilarity(paragraph, phrase); + if (similarity > 0.7) matchingPhrases++; + bestPhraseScore = Math.max(bestPhraseScore, similarity); + } + + // If multiple key phrases match, this is likely a good result + if (matchingPhrases > 1 || bestPhraseScore > 0.8) { + this._fuzzySearchResults.push({ + pageIndex, + matchIndex: paragraphs.indexOf(paragraph), + text: paragraph, + score: 0.7 + matchingPhrases * 0.05, + isParagraph: true, + }); + } + } + + // Also check each item directly + for (const item of textContent.items) { + const text = (item as any).str || ''; + if (!text.trim()) continue; + + for (const phrase of keyPhrases) { + const similarity = this.computeSimilarity(text, phrase); + if (similarity > 0.7) { + this._fuzzySearchResults.push({ + pageIndex, + matchIndex: textContent.items.indexOf(item), + text: text, + score: similarity, + isParagraph: false, + }); + break; // One matching phrase is enough for direct items + } + } + } + + continue; // Skip normal processing for this page, we've used the key phrases approach + } + + // Ensure paragraphs aren't too large before checking + paragraphs = paragraphs.filter(p => p.length < 5000); + + // Check both individual items and reconstructed paragraphs + try { + const itemsToCheck = [ + ...textContent.items.map((item: any) => ({ + idx: textContent.items.indexOf(item), + text: (item as any).str || '', + isParagraph: false, + })), + ...paragraphs.map((p, i) => ({ + idx: i, + text: p, + isParagraph: true, + })), + ]; + + for (const item of itemsToCheck) { + if (!item.text.trim() || item.text.length > 5000) continue; + + const similarity = this.computeSimilarity(item.text, normalizedSearch); + + // Track best match for debugging + if (similarity > bestMatchScore) { + bestMatchScore = similarity; + bestMatchText = item.text.substring(0, 100); + } + + if (similarity > similarityThreshold) { + this._fuzzySearchResults.push({ + pageIndex, + matchIndex: item.idx, + text: item.text, + score: similarity, + isParagraph: item.isParagraph, + }); + } + } + } catch (itemCheckError) { + console.warn('Error checking items on page:', itemCheckError); + } + } catch (error) { + console.error(`Error extracting text from page ${pageIndex + 1}:`, error); + // Continue with other pages even if one fails + } + } + + // Check if we already have good matches after each batch + // This allows us to stop early if we've found excellent matches + if (this._fuzzySearchResults.length > 0) { + // Sort results by similarity (descending) + this._fuzzySearchResults.sort((a, b) => (b.score || 0) - (a.score || 0)); + + // If we have an excellent match (score > 0.8), stop searching + if (this._fuzzySearchResults[0]?.score && this._fuzzySearchResults[0].score > 0.8) { + console.log(`Found excellent match (score: ${this._fuzzySearchResults[0].score?.toFixed(2)}) - stopping early`); + break; + } + + // If we have several good matches (score > 0.6), stop searching + if (this._fuzzySearchResults.length >= 3 && this._fuzzySearchResults.every(r => r.score && r.score > 0.6)) { + console.log(`Found ${this._fuzzySearchResults.length} good matches - stopping early`); + break; + } + } + + // Perform cleanup between batches to avoid memory buildup + if (batchEnd < totalPages) { + // Give the browser a moment to breathe and release memory + await new Promise(resolve => setTimeout(resolve, 1)); + } + } + + // If no results with advanced search, try standard search with key terms + if (this._fuzzySearchResults.length === 0 && searchWords.length > 3) { + // Find the most distinctive words (longer words are often more specific) + const distinctiveWords = searchWords + .filter(w => w.length > 4) + .sort((a, b) => b.length - a.length) + .slice(0, 3); + + if (distinctiveWords.length > 0) { + console.log(`Falling back to standard search with distinctive term: ${distinctiveWords[0]}`); + this._pdfViewer.eventBus.dispatch('find', { + query: distinctiveWords[0], + phraseSearch: false, + highlightAll: true, + findPrevious: false, + }); + return true; + } + } + + console.log(`Best match (${bestMatchScore.toFixed(2)}): "${bestMatchText}"`); + console.log(`Found ${this._fuzzySearchResults.length} matches above threshold ${similarityThreshold}`); + + // Sort results by similarity (descending) + this._fuzzySearchResults.sort((a, b) => (b.score || 0) - (a.score || 0)); + + // Navigate to the first/last result based on direction + if (this._fuzzySearchResults.length > 0) { + this._currentFuzzyMatchIndex = bwd ? this._fuzzySearchResults.length - 1 : 0; + this.navigateToFuzzyMatch(this._currentFuzzyMatchIndex); + return true; + } else if (bestMatchScore > 0) { + // If we found some match but below threshold, adjust threshold and try again + if (bestMatchScore > similarityThreshold * 0.7) { + console.log(`Lowering threshold to ${bestMatchScore * 0.9} and retrying search`); + similarityThreshold = bestMatchScore * 0.9; + return this.performFuzzySearch(searchString, bwd); + } + } + + // Ultimate fallback: Use standard PDF.js search with the most common words + if (this._fuzzySearchResults.length === 0) { + // Extract a few words from the middle of the search string + const words = normalizedSearch.split(' '); + const middleIndex = Math.floor(words.length / 2); + const searchPhrase = words.slice(Math.max(0, middleIndex - 1), Math.min(words.length, middleIndex + 2)).join(' '); + + console.log(`Falling back to standard search with phrase: ${searchPhrase}`); + this._pdfViewer.eventBus.dispatch('find', { + query: searchPhrase, + phraseSearch: true, + highlightAll: true, + findPrevious: false, + }); + return true; + } + + return false; + } + + // Navigate to a specific fuzzy match + private navigateToFuzzyMatch(index: number): void { + if (index >= 0 && index < this._fuzzySearchResults.length) { + const match = this._fuzzySearchResults[index]; + console.log(`Navigating to match: ${match.text.substring(0, 50)}... (score: ${match.score?.toFixed(2) || 'unknown'})`); + + // Scroll to the page containing the match + this._pdfViewer.scrollPageIntoView({ + pageNumber: match.pageIndex + 1, + }); + + // For paragraph matches, use a more specific approach + if (match.isParagraph) { + // Break the text into smaller chunks to improve highlighting + const words = match.text.split(/\s+/); + const normalizedSearch = this.normalizeText(match.text); + + // Try to highlight with shorter chunks to get better visual feedback + if (words.length > 5) { + // Create 5-word overlapping chunks + const chunks = []; + for (let i = 0; i < words.length - 4; i += 3) { + chunks.push(words.slice(i, i + 5).join(' ')); + } + + // Highlight each chunk + if (chunks.length > 0) { + // Highlight the first chunk immediately + this._pdfViewer.eventBus.dispatch('find', { + query: chunks[0], + phraseSearch: true, + highlightAll: true, + findPrevious: false, + }); + + // Highlight the rest with small delays to avoid conflicts + chunks.slice(1).forEach((chunk, i) => { + setTimeout( + () => { + this._pdfViewer.eventBus.dispatch('find', { + query: chunk, + phraseSearch: true, + highlightAll: true, + findPrevious: false, + }); + }, + (i + 1) * 100 + ); + }); + return; + } + } + } + + // Standard highlighting for non-paragraph matches or short text + if (this._pdfViewer.findController) { + // For longer text, try to find the most unique phrases to highlight + if (match.text.length > 50) { + const words = match.text.split(/\s+/); + // Look for 3-5 word phrases that are likely to be unique + let phraseToHighlight = match.text; + + if (words.length >= 5) { + // Take a phrase from the middle of the text + const middleIndex = Math.floor(words.length / 2); + phraseToHighlight = words.slice(middleIndex - 2, middleIndex + 3).join(' '); + } + + console.log(`Highlighting phrase: "${phraseToHighlight}"`); + + this._pdfViewer.eventBus.dispatch('find', { + query: phraseToHighlight, + phraseSearch: true, + highlightAll: true, + findPrevious: false, + }); + } else { + // For shorter text, use the entire match + this._pdfViewer.eventBus.dispatch('find', { + query: match.text, + phraseSearch: true, + highlightAll: true, + findPrevious: false, + }); + } + } + } + } + + // Navigate to next fuzzy match + private nextFuzzyMatch(): boolean { + if (this._fuzzySearchResults.length === 0) return false; + + this._currentFuzzyMatchIndex = (this._currentFuzzyMatchIndex + 1) % this._fuzzySearchResults.length; + this.navigateToFuzzyMatch(this._currentFuzzyMatchIndex); + return true; + } + + // Navigate to previous fuzzy match + private prevFuzzyMatch(): boolean { + if (this._fuzzySearchResults.length === 0) return false; + + this._currentFuzzyMatchIndex = (this._currentFuzzyMatchIndex - 1 + this._fuzzySearchResults.length) % this._fuzzySearchResults.length; + this.navigateToFuzzyMatch(this._currentFuzzyMatchIndex); + return true; + } + @action search = (searchString: string, bwd?: boolean, clear: boolean = false) => { - const findOpts = { - caseSensitive: false, - findPrevious: bwd, - highlightAll: true, - phraseSearch: true, - query: searchString, - }; if (clear) { + this._fuzzySearchResults = []; this._pdfViewer?.eventBus.dispatch('findbarclose', {}); - } else if (!searchString) { + return true; + } + + if (!searchString) { bwd ? this.prevAnnotation() : this.nextAnnotation(); - } else if (this._pdfViewer?.pageViewsReady) { - this._pdfViewer?.eventBus.dispatch('find', { ...findOpts, type: 'again' }); - } else if (this._mainCont.current) { - const executeFind = () => this._pdfViewer?.eventBus.dispatch('find', findOpts); - this._mainCont.current.addEventListener('pagesloaded', executeFind); - this._mainCont.current.addEventListener('pagerendered', executeFind); + return true; } - return true; + + // If we already have fuzzy search results, navigate through them + if (this._fuzzySearchEnabled && this._fuzzySearchResults.length > 0) { + return bwd ? this.prevFuzzyMatch() : this.nextFuzzyMatch(); + } + + // For new search, decide between fuzzy and standard search + if (this._fuzzySearchEnabled) { + // Start fuzzy search + this.performFuzzySearch(searchString, bwd); + return true; + } else { + // Use original PDF.js search + const findOpts = { + caseSensitive: false, + findPrevious: bwd, + highlightAll: true, + phraseSearch: true, + query: searchString, + }; + + if (this._pdfViewer?.pageViewsReady) { + this._pdfViewer?.eventBus.dispatch('find', { ...findOpts, type: 'again' }); + } else if (this._mainCont.current) { + const executeFind = () => this._pdfViewer?.eventBus.dispatch('find', findOpts); + this._mainCont.current.addEventListener('pagesloaded', executeFind); + this._mainCont.current.addEventListener('pagerendered', executeFind); + } + return true; + } + }; + + // Toggle fuzzy search mode + @action + toggleFuzzySearch = (): boolean => { + this._fuzzySearchEnabled = !this._fuzzySearchEnabled; + return this._fuzzySearchEnabled; }; @action |