about summary refs log tree commit diff
path: root/src/client/views/pdf/PDFViewer.tsx
diff options
context:
space:
mode:
Diffstat (limited to 'src/client/views/pdf/PDFViewer.tsx')
-rw-r--r-- src/client/views/pdf/PDFViewer.tsx 572
1 files changed, 557 insertions, 15 deletions
diff --git a/src/client/views/pdf/PDFViewer.tsx b/src/client/views/pdf/PDFViewer.tsx
index c293750e1..6e8ce0ce9 100644
--- a/src/client/views/pdf/PDFViewer.tsx
+++ b/src/client/views/pdf/PDFViewer.tsx
@@ -50,6 +50,15 @@ interface IViewerProps extends FieldViewProps {
crop: (region: Doc | undefined, addCrop?: boolean) => Doc | undefined;
}
+// One candidate hit produced by the fuzzy PDF text search (see performFuzzySearch).
+interface FuzzySearchResult {
+ pageIndex: number; // zero-based page index; navigation scrolls to page pageIndex + 1
+ matchIndex: number; // position of the hit among the page's text items, or among the reconstructed paragraphs when isParagraph is true
+ text: string; // raw (un-normalized) text of the matched item or paragraph
+ score?: number; // ranking score; results are sorted by it in descending order
+ isParagraph?: boolean; // true when the hit is a reconstructed paragraph rather than a single text item
+}
+
/**
* Handles rendering and virtualization of the pdf
*/
@@ -68,6 +77,9 @@ export class PDFViewer extends ObservableReactComponent<IViewerProps> {
@observable _showWaiting = true;
@observable Index: number = -1;
@observable private _loading = false;
+ @observable private _fuzzySearchEnabled = true; // when true, search() routes text queries through the fuzzy matcher instead of the plain PDF.js find
+ @observable private _fuzzySearchResults: FuzzySearchResult[] = []; // hits of the most recent fuzzy search, sorted best score first
+ @observable private _currentFuzzyMatchIndex = 0; // index into _fuzzySearchResults of the hit last navigated to
private _pdfViewer!: PDFJSViewer.PDFViewer;
private _styleRule: number | undefined; // stylesheet rule for making hyperlinks clickable
@@ -326,27 +338,557 @@ export class PDFViewer extends ObservableReactComponent<IViewerProps> {
return index;
};
+ /**
+  * Canonicalizes text for comparison: lowercases it, turns every character that is
+  * neither a word character (`\w`) nor whitespace into a space, collapses whitespace
+  * runs into a single space and trims both ends.
+  *
+  * @param text raw text taken from the PDF or from the user's query
+  * @returns the normalized text, with tokens separated by exactly one space
+  */
+ private normalizeText(text: string): string {
+     return text
+         .toLowerCase()
+         // Punctuation must become spaces BEFORE whitespace is collapsed; doing it the
+         // other way round turns "a, b" into "a  b" (two spaces), which no longer equals
+         // the normalization of "a b" and produces empty tokens when split on ' '.
+         .replace(/[^\w\s]/g, ' ')
+         .replace(/\s+/g, ' ')
+         .trim();
+ }
+
+ /**
+  * Scores how alike two strings are after normalization.
+  *
+  * Strategy, in order:
+  *  1. identical normalized text scores 1; an empty side scores 0;
+  *  2. if either side is longer than 50 characters, look for word chunks of the
+  *     shorter text inside the longer one, then for shared-word overlap (both
+  *     capped at 0.9);
+  *  3. if one side contains the other, score by length ratio (at most 0.8);
+  *  4. if both sides are shorter than 100 characters, use normalized Levenshtein
+  *     distance;
+  *  5. otherwise 0.
+  *
+  * @param str1 first string (any casing/punctuation)
+  * @param str2 second string (any casing/punctuation)
+  * @returns a similarity score where 1 means the normalized strings are equal and 0 means no measurable similarity
+  */
+ private computeSimilarity(str1: string, str2: string): number {
+     const s1 = this.normalizeText(str1);
+     const s2 = this.normalizeText(str2);
+
+     if (s1 === s2) return 1;
+     if (s1.length === 0 || s2.length === 0) return 0;
+
+     if (s1.length > 50 || s2.length > 50) {
+         const longerText = s1.length > s2.length ? s1 : s2;
+         const shorterText = s1.length > s2.length ? s2 : s1;
+
+         // Slide a window of up to 5 words over the shorter text and test each window
+         // for verbatim presence in the longer text.
+         const words = shorterText.split(' ');
+         const chunkSize = Math.min(5, Math.floor(words.length / 2));
+
+         if (chunkSize > 0) {
+             let maxChunkMatch = 0;
+             for (let i = 0; i <= words.length - chunkSize; i++) {
+                 const chunk = words.slice(i, i + chunkSize).join(' ');
+                 if (longerText.includes(chunk)) {
+                     maxChunkMatch = Math.max(maxChunkMatch, chunk.length / shorterText.length);
+                 }
+             }
+
+             if (maxChunkMatch > 0.2) {
+                 return Math.min(0.9, maxChunkMatch + 0.3); // boosted, but never above 0.9
+             }
+         }
+
+         // Bag-of-words overlap; only words longer than 2 characters count as shared.
+         const words1 = new Set(s1.split(' '));
+         const words2 = new Set(s2.split(' '));
+
+         let commonWords = 0;
+         for (const word of words1) {
+             if (word.length > 2 && words2.has(word)) commonWords++;
+         }
+
+         const overlapRatio = commonWords / Math.min(words1.size, words2.size);
+         if (overlapRatio > 0.4) {
+             return Math.min(0.9, overlapRatio);
+         }
+     }
+
+     // Plain containment, weighted by how much of the longer string is covered.
+     if (s1.includes(s2) || s2.includes(s1)) {
+         return (0.8 * Math.min(s1.length, s2.length)) / Math.max(s1.length, s2.length);
+     }
+
+     if (s1.length < 100 && s2.length < 100) {
+         // Levenshtein distance with two rolling rows instead of a full matrix: this
+         // method runs once per text item per query, so avoiding an (n+1)x(m+1)
+         // allocation on every call matters. The resulting distance is identical.
+         let previousRow: number[] = Array.from({ length: s2.length + 1 }, (_, j) => j);
+         let currentRow: number[] = new Array<number>(s2.length + 1).fill(0);
+
+         for (let i = 1; i <= s1.length; i++) {
+             currentRow[0] = i;
+             for (let j = 1; j <= s2.length; j++) {
+                 const cost = s1[i - 1] === s2[j - 1] ? 0 : 1;
+                 currentRow[j] = Math.min(
+                     previousRow[j] + 1, // deletion
+                     currentRow[j - 1] + 1, // insertion
+                     previousRow[j - 1] + cost // substitution
+                 );
+             }
+             [previousRow, currentRow] = [currentRow, previousRow];
+         }
+
+         // After the final swap the last computed row lives in previousRow.
+         const distance = previousRow[s2.length];
+         return 1 - distance / Math.max(s1.length, s2.length);
+     }
+
+     return 0;
+ }
+
+ /**
+  * Scans the PDF's text for passages that approximately match `searchString`,
+  * stores the hits (best score first) in `_fuzzySearchResults` and navigates to
+  * the first hit (or the last one when `bwd` is set).
+  *
+  * Pages are read in batches of 10. Scanning stops early once a batch yields a hit
+  * scoring above 0.8, or at least three hits that all score above 0.6. Queries
+  * longer than 200 characters are matched through short key phrases instead of as
+  * a whole. When nothing clears the threshold the method falls back, in order, to:
+  * a PDF.js find on the longest distinctive query word, a relaxed threshold
+  * derived from the best near miss, and finally a PDF.js phrase find on the middle
+  * words of the query.
+  *
+  * NOTE(review): observable fields are assigned here outside a MobX action, as in
+  * the previous implementation; wrap the assignments in `runInAction` if the
+  * project enforces actions.
+  *
+  * @param searchString text the user is looking for
+  * @param bwd when true, start at the last (lowest-scoring) hit instead of the first
+  * @returns true when a hit was navigated to or a PDF.js find fallback was dispatched;
+  *          false when the viewer is missing or the query has no searchable content
+  */
+ private async performFuzzySearch(searchString: string, bwd?: boolean): Promise<boolean> {
+     if (!this._pdfViewer || !searchString.trim()) return false;
+
+     const normalizedSearch = this.normalizeText(searchString);
+     this._fuzzySearchResults = [];
+
+     // A query made only of punctuation normalizes to nothing; every comparison would
+     // score 0 and the final fallback would dispatch an empty find query.
+     if (!normalizedSearch) return false;
+
+     // Longer queries rarely match verbatim, so they get a more lenient threshold.
+     let similarityThreshold = 0.6;
+     if (searchString.length > 100) similarityThreshold = 0.35;
+     else if (searchString.length > 50) similarityThreshold = 0.45;
+
+     console.log(`Using similarity threshold: ${similarityThreshold} for query length: ${searchString.length}`);
+
+     const searchWords = normalizedSearch.split(' ').filter(w => w.length > 3);
+     const isLongQuery = searchWords.length > 5;
+
+     // Best score seen so far, kept for diagnostics and for the relaxed-threshold fallback.
+     let bestMatchScore = 0;
+     let bestMatchText = '';
+
+     // Hits are gathered locally and published to the observable once, at the end.
+     const results: FuzzySearchResult[] = [];
+
+     // Candidates that missed the threshold but could still pass the relaxed one.
+     // The relaxed threshold is bestMatchScore * 0.9 and is only used when
+     // bestMatchScore > similarityThreshold * 0.7, so nothing at or below this floor
+     // can ever qualify. Keeping them avoids re-reading the whole PDF.
+     const nearMissFloor = similarityThreshold * 0.7 * 0.9;
+     const nearMisses: FuzzySearchResult[] = [];
+
+     const byScoreDescending = (a: FuzzySearchResult, b: FuzzySearchResult) => (b.score || 0) - (a.score || 0);
+
+     // Very long queries: derive overlapping key phrases (3-5 words, stepping by 2 words).
+     const keyPhrases: string[] = [];
+     if (searchString.length > 200) {
+         const words = normalizedSearch.split(' ');
+         for (let i = 0; i < words.length - 2; i += 2) {
+             const phraseLength = Math.min(5, words.length - i);
+             if (phraseLength >= 3) {
+                 keyPhrases.push(words.slice(i, i + phraseLength).join(' '));
+             }
+         }
+         console.log(`Using ${keyPhrases.length} key phrases for long search text`);
+     }
+
+     const totalPages = this._pageSizes.length;
+     const BATCH_SIZE = 10;
+
+     console.log(`Searching all ${totalPages} pages in batches of ${BATCH_SIZE}`);
+
+     for (let batchStart = 0; batchStart < totalPages; batchStart += BATCH_SIZE) {
+         const batchEnd = Math.min(batchStart + BATCH_SIZE, totalPages);
+         console.log(`Processing pages ${batchStart + 1} to ${batchEnd} of ${totalPages}`);
+
+         for (let pageIndex = batchStart; pageIndex < batchEnd; pageIndex++) {
+             try {
+                 const page = await this._props.pdf.getPage(pageIndex + 1);
+                 const textContent = await page.getTextContent();
+
+                 // Reconstructed lines ("paragraphs") of this page; only built for long queries.
+                 let paragraphs: string[] = [];
+
+                 try {
+                     if (isLongQuery) {
+                         let currentY: number | null = null;
+                         let currentParagraph = '';
+
+                         // Order items in reading order: top-to-bottom, then left-to-right.
+                         // transform[5] is the y translation in PDF user space, whose origin is
+                         // at the bottom-left with y growing upwards, so a larger y is higher on
+                         // the page and lines must be sorted by DESCENDING y.
+                         const sortedItems = [...textContent.items].sort((a: any, b: any) => {
+                             const aTransform = (a as any).transform || [];
+                             const bTransform = (b as any).transform || [];
+                             if (Math.abs(aTransform[5] - bTransform[5]) < 5) {
+                                 return (aTransform[4] || 0) - (bTransform[4] || 0);
+                             }
+                             return (bTransform[5] || 0) - (aTransform[5] || 0);
+                         });
+
+                         const MAX_PARAGRAPH_LENGTH = 1000;
+
+                         for (const item of sortedItems) {
+                             const text = (item as any).str || '';
+                             const transform = (item as any).transform || [];
+                             const y = transform[5];
+
+                             // Start a new paragraph on the first item, on a vertical jump of
+                             // more than 5 units, or when the current one would grow too long.
+                             if (currentY === null || Math.abs(y - currentY) > 5 || currentParagraph.length + text.length > MAX_PARAGRAPH_LENGTH) {
+                                 if (currentParagraph) {
+                                     paragraphs.push(currentParagraph.trim());
+                                 }
+                                 currentParagraph = text;
+                                 currentY = y;
+                             } else {
+                                 currentParagraph += ' ' + text;
+                             }
+                         }
+
+                         if (currentParagraph) {
+                             paragraphs.push(currentParagraph.trim());
+                         }
+
+                         // Also offer a few adjacent pairs joined together so that a match
+                         // spanning a line break can be found; the count is capped.
+                         const MAX_COMBINED_PARAGRAPHS = 5;
+                         if (paragraphs.length > 1) {
+                             const combinedCount = Math.min(paragraphs.length - 1, MAX_COMBINED_PARAGRAPHS);
+                             for (let i = 0; i < combinedCount; i++) {
+                                 if (paragraphs[i].length + paragraphs[i + 1].length < MAX_PARAGRAPH_LENGTH) {
+                                     paragraphs.push(paragraphs[i] + ' ' + paragraphs[i + 1]);
+                                 }
+                             }
+                         }
+                     }
+                 } catch (paragraphError) {
+                     console.warn('Error during paragraph reconstruction:', paragraphError);
+                     // Fall through: individual items (and any paragraphs built so far) are still checked.
+                 }
+
+                 if (keyPhrases.length > 0) {
+                     // Key-phrase mode: a paragraph qualifies when several phrases match it
+                     // or one phrase matches it very well.
+                     for (let paragraphIndex = 0; paragraphIndex < paragraphs.length; paragraphIndex++) {
+                         const paragraph = paragraphs[paragraphIndex];
+                         let matchingPhrases = 0;
+                         let bestPhraseScore = 0;
+
+                         for (const phrase of keyPhrases) {
+                             const similarity = this.computeSimilarity(paragraph, phrase);
+                             if (similarity > 0.7) matchingPhrases++;
+                             bestPhraseScore = Math.max(bestPhraseScore, similarity);
+                         }
+
+                         if (matchingPhrases > 1 || bestPhraseScore > 0.8) {
+                             results.push({
+                                 pageIndex,
+                                 matchIndex: paragraphIndex,
+                                 text: paragraph,
+                                 score: 0.7 + matchingPhrases * 0.05,
+                                 isParagraph: true,
+                             });
+                         }
+                     }
+
+                     // Individual text items: the first sufficiently similar phrase is enough.
+                     for (let itemIndex = 0; itemIndex < textContent.items.length; itemIndex++) {
+                         const text = (textContent.items[itemIndex] as any).str || '';
+                         if (!text.trim()) continue;
+
+                         for (const phrase of keyPhrases) {
+                             const similarity = this.computeSimilarity(text, phrase);
+                             if (similarity > 0.7) {
+                                 results.push({
+                                     pageIndex,
+                                     matchIndex: itemIndex,
+                                     text: text,
+                                     score: similarity,
+                                     isParagraph: false,
+                                 });
+                                 break;
+                             }
+                         }
+                     }
+
+                     continue; // key-phrase mode replaces whole-query matching for this page
+                 }
+
+                 paragraphs = paragraphs.filter(p => p.length < 5000);
+
+                 // Whole-query mode: compare the query with every item and every paragraph.
+                 try {
+                     const itemsToCheck = [
+                         ...textContent.items.map((item: any, idx: number) => ({
+                             idx,
+                             text: (item as any).str || '',
+                             isParagraph: false,
+                         })),
+                         ...paragraphs.map((p, i) => ({
+                             idx: i,
+                             text: p,
+                             isParagraph: true,
+                         })),
+                     ];
+
+                     for (const item of itemsToCheck) {
+                         if (!item.text.trim() || item.text.length > 5000) continue;
+
+                         const similarity = this.computeSimilarity(item.text, normalizedSearch);
+
+                         if (similarity > bestMatchScore) {
+                             bestMatchScore = similarity;
+                             bestMatchText = item.text.substring(0, 100);
+                         }
+
+                         const candidate: FuzzySearchResult = {
+                             pageIndex,
+                             matchIndex: item.idx,
+                             text: item.text,
+                             score: similarity,
+                             isParagraph: item.isParagraph,
+                         };
+
+                         if (similarity > similarityThreshold) {
+                             results.push(candidate);
+                         } else if (similarity > nearMissFloor) {
+                             nearMisses.push(candidate);
+                         }
+                     }
+                 } catch (itemCheckError) {
+                     console.warn('Error checking items on page:', itemCheckError);
+                 }
+             } catch (error) {
+                 console.error(`Error extracting text from page ${pageIndex + 1}:`, error);
+                 // Keep going: one unreadable page must not abort the whole search.
+             }
+         }
+
+         // Early exit once the hits found so far are good enough.
+         if (results.length > 0) {
+             results.sort(byScoreDescending);
+
+             if (results[0]?.score && results[0].score > 0.8) {
+                 console.log(`Found excellent match (score: ${results[0].score?.toFixed(2)}) - stopping early`);
+                 break;
+             }
+
+             if (results.length >= 3 && results.every(r => r.score && r.score > 0.6)) {
+                 console.log(`Found ${results.length} good matches - stopping early`);
+                 break;
+             }
+         }
+
+         // Yield to the event loop between batches so the UI stays responsive.
+         if (batchEnd < totalPages) {
+             await new Promise(resolve => setTimeout(resolve, 1));
+         }
+     }
+
+     // Fallback 1: nothing found for a multi-word query -> PDF.js find on its longest word.
+     if (results.length === 0 && searchWords.length > 3) {
+         const distinctiveWords = searchWords
+             .filter(w => w.length > 4)
+             .sort((a, b) => b.length - a.length)
+             .slice(0, 3);
+
+         if (distinctiveWords.length > 0) {
+             console.log(`Falling back to standard search with distinctive term: ${distinctiveWords[0]}`);
+             this._pdfViewer.eventBus.dispatch('find', {
+                 query: distinctiveWords[0],
+                 phraseSearch: false,
+                 highlightAll: true,
+                 findPrevious: false,
+             });
+             return true;
+         }
+     }
+
+     console.log(`Best match (${bestMatchScore.toFixed(2)}): "${bestMatchText}"`);
+     console.log(`Found ${results.length} matches above threshold ${similarityThreshold}`);
+
+     results.sort(byScoreDescending);
+
+     if (results.length > 0) {
+         this._fuzzySearchResults = results;
+         this._currentFuzzyMatchIndex = bwd ? results.length - 1 : 0;
+         this.navigateToFuzzyMatch(this._currentFuzzyMatchIndex);
+         return true;
+     }
+
+     // Fallback 2: the best candidate came close -> relax the threshold to 90% of its
+     // score and accept the recorded near misses that clear it. (Re-invoking this
+     // method instead would reset the threshold and re-scan the PDF without end.)
+     if (bestMatchScore > similarityThreshold * 0.7) {
+         const relaxedThreshold = bestMatchScore * 0.9;
+         console.log(`Lowering threshold to ${relaxedThreshold} and retrying search`);
+
+         const relaxedResults = nearMisses.filter(r => (r.score || 0) > relaxedThreshold).sort(byScoreDescending);
+         if (relaxedResults.length > 0) {
+             this._fuzzySearchResults = relaxedResults;
+             this._currentFuzzyMatchIndex = bwd ? relaxedResults.length - 1 : 0;
+             this.navigateToFuzzyMatch(this._currentFuzzyMatchIndex);
+             return true;
+         }
+     }
+
+     // Fallback 3: PDF.js phrase find on up to three words from the middle of the query.
+     const words = normalizedSearch.split(' ');
+     const middleIndex = Math.floor(words.length / 2);
+     const searchPhrase = words.slice(Math.max(0, middleIndex - 1), Math.min(words.length, middleIndex + 2)).join(' ');
+
+     console.log(`Falling back to standard search with phrase: ${searchPhrase}`);
+     this._pdfViewer.eventBus.dispatch('find', {
+         query: searchPhrase,
+         phraseSearch: true,
+         highlightAll: true,
+         findPrevious: false,
+     });
+     return true;
+ }
+
+ // Handles of the delayed chunk-highlight dispatches scheduled by navigateToFuzzyMatch.
+ private _fuzzyHighlightTimers: ReturnType<typeof setTimeout>[] = [];
+
+ /**
+  * Scrolls to the page of the fuzzy hit at `index` and asks PDF.js to highlight it.
+  *
+  * Paragraph hits with more than 5 words are highlighted as a series of overlapping
+  * 5-word chunks dispatched 100 ms apart. Other hits are highlighted with a single
+  * find: the full text when it is at most 50 characters, otherwise a 5-word phrase
+  * from its middle (or the full text when it has fewer than 5 words).
+  *
+  * @param index position in `_fuzzySearchResults`; out-of-range values are ignored
+  */
+ private navigateToFuzzyMatch(index: number): void {
+     if (index < 0 || index >= this._fuzzySearchResults.length) return;
+
+     // Drop chunk highlights still pending from a previous navigation; otherwise they
+     // would fire after this call and replace the highlight of the new match.
+     this._fuzzyHighlightTimers.forEach(timer => clearTimeout(timer));
+     this._fuzzyHighlightTimers = [];
+
+     const match = this._fuzzySearchResults[index];
+     console.log(`Navigating to match: ${match.text.substring(0, 50)}... (score: ${match.score?.toFixed(2) || 'unknown'})`);
+
+     this._pdfViewer.scrollPageIntoView({
+         pageNumber: match.pageIndex + 1,
+     });
+
+     // Every highlight uses the same find options; only the query differs.
+     const highlight = (query: string) =>
+         this._pdfViewer.eventBus.dispatch('find', {
+             query,
+             phraseSearch: true,
+             highlightAll: true,
+             findPrevious: false,
+         });
+
+     const words = match.text.split(/\s+/);
+
+     if (match.isParagraph && words.length > 5) {
+         // Overlapping 5-word windows, advancing 3 words at a time. With more than 5
+         // words the loop always yields at least one chunk.
+         const chunks: string[] = [];
+         for (let i = 0; i < words.length - 4; i += 3) {
+             chunks.push(words.slice(i, i + 5).join(' '));
+         }
+
+         highlight(chunks[0]);
+         // Stagger the remaining chunks so the dispatches do not collide.
+         this._fuzzyHighlightTimers = chunks.slice(1).map((chunk, i) => setTimeout(() => highlight(chunk), (i + 1) * 100));
+         return;
+     }
+
+     // Single-item hits and short paragraphs: one find, only if a find controller exists.
+     if (!this._pdfViewer.findController) return;
+
+     let phraseToHighlight = match.text;
+     if (match.text.length > 50) {
+         if (words.length >= 5) {
+             // A short phrase from the middle is more likely to be found verbatim than the whole text.
+             const middleIndex = Math.floor(words.length / 2);
+             phraseToHighlight = words.slice(middleIndex - 2, middleIndex + 3).join(' ');
+         }
+         console.log(`Highlighting phrase: "${phraseToHighlight}"`);
+     }
+     highlight(phraseToHighlight);
+ }
+
+ /**
+  * Moves to the hit after the current one, wrapping from the last hit to the first.
+  *
+  * @returns false when there are no fuzzy hits, true after navigating
+  */
+ private nextFuzzyMatch(): boolean {
+     const total = this._fuzzySearchResults.length;
+     if (total === 0) return false;
+
+     const following = (this._currentFuzzyMatchIndex + 1) % total;
+     this._currentFuzzyMatchIndex = following;
+     this.navigateToFuzzyMatch(following);
+     return true;
+ }
+
+ /**
+  * Moves to the hit before the current one, wrapping from the first hit to the last.
+  *
+  * @returns false when there are no fuzzy hits, true after navigating
+  */
+ private prevFuzzyMatch(): boolean {
+     const total = this._fuzzySearchResults.length;
+     if (total === 0) return false;
+
+     // Adding `total` before the modulo keeps the index non-negative when stepping back from 0.
+     const preceding = (this._currentFuzzyMatchIndex - 1 + total) % total;
+     this._currentFuzzyMatchIndex = preceding;
+     this.navigateToFuzzyMatch(preceding);
+     return true;
+ }
+
+ // Query whose hits are currently cached in _fuzzySearchResults ('' when none).
+ private _lastFuzzyQuery = '';
+
+ /**
+  * Entry point for text search in the PDF.
+  *  - `clear`: drops cached fuzzy hits and closes the PDF.js find bar.
+  *  - empty `searchString`: steps to the previous/next annotation instead.
+  *  - fuzzy mode on: repeating the same query steps through the cached hits
+  *    (backwards when `bwd`); a different query starts a fresh fuzzy search.
+  *  - fuzzy mode off: dispatches a plain PDF.js find, deferring it until pages are
+  *    loaded/rendered when the viewer is not ready yet.
+  *
+  * @param searchString text to look for
+  * @param bwd search/navigate backwards
+  * @param clear when true, only reset the search state
+  * @returns true once the request was handled; false only when stepping through
+  *          cached hits finds none
+  */
@action
search = (searchString: string, bwd?: boolean, clear: boolean = false) => {
- const findOpts = {
- caseSensitive: false,
- findPrevious: bwd,
- highlightAll: true,
- phraseSearch: true,
- query: searchString,
- };
if (clear) {
+         this._fuzzySearchResults = [];
+         this._lastFuzzyQuery = '';
this._pdfViewer?.eventBus.dispatch('findbarclose', {});
- } else if (!searchString) {
+         return true;
+     }
+
+     if (!searchString) {
bwd ? this.prevAnnotation() : this.nextAnnotation();
- } else if (this._pdfViewer?.pageViewsReady) {
- this._pdfViewer?.eventBus.dispatch('find', { ...findOpts, type: 'again' });
- } else if (this._mainCont.current) {
- const executeFind = () => this._pdfViewer?.eventBus.dispatch('find', findOpts);
- this._mainCont.current.addEventListener('pagesloaded', executeFind);
- this._mainCont.current.addEventListener('pagerendered', executeFind);
+         return true;
}
- return true;
+
+     if (this._fuzzySearchEnabled) {
+         // Cached hits are only valid for the query that produced them; stepping
+         // through them for a different query would show the old query's matches.
+         if (this._fuzzySearchResults.length > 0 && searchString === this._lastFuzzyQuery) {
+             return bwd ? this.prevFuzzyMatch() : this.nextFuzzyMatch();
+         }
+
+         this._lastFuzzyQuery = searchString;
+         // The scan runs asynchronously; report failures instead of leaving an
+         // unhandled promise rejection.
+         this.performFuzzySearch(searchString, bwd).catch((error: unknown) => console.error('Fuzzy PDF search failed:', error));
+         return true;
+     }
+
+     // Fuzzy mode off: original PDF.js search.
+     const findOpts = {
+         caseSensitive: false,
+         findPrevious: bwd,
+         highlightAll: true,
+         phraseSearch: true,
+         query: searchString,
+     };
+
+     if (this._pdfViewer?.pageViewsReady) {
+         this._pdfViewer?.eventBus.dispatch('find', { ...findOpts, type: 'again' });
+     } else if (this._mainCont.current) {
+         // Viewer not ready yet: run the find once pages have loaded or rendered.
+         const executeFind = () => this._pdfViewer?.eventBus.dispatch('find', findOpts);
+         this._mainCont.current.addEventListener('pagesloaded', executeFind);
+         this._mainCont.current.addEventListener('pagerendered', executeFind);
+     }
+     return true;
+ };
+
+ // Switches fuzzy matching on or off and returns the state that is now in effect
+ @action
+ toggleFuzzySearch = (): boolean => {
+     const nowEnabled = !this._fuzzySearchEnabled;
+     this._fuzzySearchEnabled = nowEnabled;
+     return nowEnabled;
};
@action