diff options
author | A.J. Shulman <Shulman.aj@gmail.com> | 2025-05-10 20:30:24 -0400 |
---|---|---|
committer | A.J. Shulman <Shulman.aj@gmail.com> | 2025-05-10 20:30:24 -0400 |
commit | 0db4583914e43e6efdba3e86a614a19956e73b5e (patch) | |
tree | 68dfef85ea47d6d79e63a6ac0914922dc69c99c5 /src | |
parent | 0a05616fb9f685dc8534db4949a6f7ad6b85eadb (diff) |
feat: changed web document to display screenshot
Diffstat (limited to 'src')
-rw-r--r-- | src/client/views/nodes/WebBox.scss | 217 | ||||
-rw-r--r-- | src/client/views/nodes/WebBox.tsx | 759 | ||||
-rw-r--r-- | src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts | 105 | ||||
-rw-r--r-- | src/server/ApiManagers/AssistantManager.ts | 229 |
4 files changed, 912 insertions, 398 deletions
diff --git a/src/client/views/nodes/WebBox.scss b/src/client/views/nodes/WebBox.scss index 05d5babf9..a1991d1d0 100644 --- a/src/client/views/nodes/WebBox.scss +++ b/src/client/views/nodes/WebBox.scss @@ -1,13 +1,9 @@ @use '../global/globalCssVariables.module.scss' as global; .webBox { - height: 100%; - width: 100%; - top: 0; - left: 0; position: relative; - display: flex; overflow: hidden; + aspect-ratio: 1 / 1; // Explicitly enforce square aspect ratio .webBox-sideResizer { position: absolute; @@ -20,6 +16,119 @@ .webBox-background { width: 100%; height: 100%; + position: absolute; + top: 0; + left: 0; + } + + // Simple container for screenshot + .webBox-screenshot-container { + width: 100%; + } + + .webBox-screenshot { + width: 100%; + height: auto; // Maintain aspect ratio + display: block; + pointer-events: none; + } + + .webBox-loading { + padding: 20px; + text-align: center; + color: #666; + background-color: #f5f5f5; + min-height: 200px; + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; + } + + .webBox-loading-spinner { + margin-top: 15px; + color: #1976d2; + font-size: 24px; + } + + .webBox-error { + padding: 20px; + color: #d32f2f; + text-align: center; + background-color: #ffebee; + min-height: 200px; + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; + gap: 15px; + } + + .webBox-placeholder { + padding: 20px; + text-align: center; + color: #757575; + background-color: #fafafa; + min-height: 200px; + display: flex; + align-items: center; + justify-content: center; + } + + // Basic container layout + .webBox-container { + width: 100%; + height: 100%; + position: relative; + } + + // Simple scrollable container - vertical only + .webBox-outerContent { + width: 100%; + position: relative; + overflow-y: auto; + overflow-x: hidden; + background-color: #f5f5f5; + + // Improve scrollbar styling + &::-webkit-scrollbar-thumb { + background-color: #888; + border-radius: 6px; + } + + &::-webkit-scrollbar { + width: 8px; + background-color: #f5f5f5; + } + } + + .webBox-innerContent { + width: 100%; + background-color: #f5f5f5; + } + + .webBox-htmlSpan { + position: absolute; + top: 0; + left: 0; + cursor: text; + padding: 15px; + width: 100%; + height: 100%; + } + + .webBox-annotationLayer { + position: absolute; + transform-origin: left top; + top: 0; + width: 100%; + pointer-events: none; + mix-blend-mode: multiply; + } + + .webBox-annotationBox { + position: absolute; + background-color: rgba(245, 230, 95, 0.616); } .webBox-ui { @@ -68,14 +177,14 @@ } } - .webBox-nextIcon, - .webBox-prevIcon { + .webBox-refreshButton { background: #121721; - color: white; height: 20px; width: 25px; display: flex; - position: relative; + position: absolute; + bottom: 0; + right: 40px; align-items: center; justify-content: center; border-radius: 3px; @@ -83,10 +192,6 @@ padding: 0px; } - .webBox-overlayButton:hover { - background: none; - } - .webBox-overlayCont { position: absolute; width: calc(100% - 40px); @@ -118,8 +223,7 @@ justify-content: center; border-radius: 3px; pointer-events: all; - z-index: 1; // so it appears on top of the document's title, if shown - + z-index: 1; box-shadow: global.$standard-box-shadow; transition: 0.2s; @@ -134,89 +238,6 @@ opacity: 0.1; } - .webBox-annotationLayer { - position: absolute; - transform-origin: left top; - top: 0; - width: 100%; - pointer-events: none; - mix-blend-mode: multiply; // bcz: makes text fuzzy! - } - - .webBox-annotationBox { - position: absolute; - background-color: rgba(245, 230, 95, 0.616); - } - - .webBox-container { - transform-origin: top left; - width: 100%; - height: 100%; - position: absolute; - - .webBox-htmlSpan { - position: absolute; - top: 0; - left: 0; - cursor: text; - padding: 15px; - height: 100%; - } - - .webBox-cont { - pointer-events: none; - } - - .webBox-cont, - .webBox-cont-interactive { - padding: 0vw; - position: absolute; - top: 0; - left: 0; - width: 100%; - height: 100%; - transform-origin: top left; - - .webBox-iframe { - width: 100%; - height: 100%; - position: absolute; - top: 0; - left: 0; - body { - ::selection { - color: white; - background: orange; - } - } - } - } - - .webBox-cont-interactive { - span { - user-select: text !important; - } - } - - .webBox-outerContent { - width: 100%; - height: 100%; - position: absolute; - transform-origin: top left; - top: 0; - left: 0; - overflow: auto; - - .webBox-innerContent { - position: relative; - } - } - - div.webBox-outerContent::-webkit-scrollbar-thumb { - cursor: nw-resize; - } - } - .webBox-overlay { width: 100%; height: 100%; diff --git a/src/client/views/nodes/WebBox.tsx b/src/client/views/nodes/WebBox.tsx index e7a10cc29..3c4696df3 100644 --- a/src/client/views/nodes/WebBox.tsx +++ b/src/client/views/nodes/WebBox.tsx @@ -4,6 +4,7 @@ import { htmlToText } from 'html-to-text'; import { action, computed, IReactionDisposer, makeObservable, observable, ObservableMap, reaction, runInAction } from 'mobx'; import { observer } from 'mobx-react'; import * as React from 'react'; +import axios from 'axios'; import * as WebRequest from 'web-request'; import { addStyleSheet, addStyleSheetRule, clearStyleSheetRules, ClientUtils, DivHeight, getWordAtPoint, lightOrDark, returnFalse, returnOne, returnZero, setupMoveUpEvents, smoothScroll } from '../../../ClientUtils'; import { Doc, DocListCast, Field, FieldType, Opt } from '../../../fields/Doc'; @@ -69,23 +70,20 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { private _scrollTimer: NodeJS.Timeout | undefined; private _getAnchor: (savedAnnotations: Opt<ObservableMap<number, HTMLDivElement[]>>, addAsAnnotation: boolean) => Opt<Doc> = () => undefined; - @observable private _webUrl = ''; // url of the src parameter of the embedded iframe but not necessarily the rendered page - eg, when following a link, the rendered page changes but we don't want the src parameter to also change as that would cause an unnecessary re-render. - @observable private _hackHide = false; // apparently changing the value of the 'sandbox' prop doesn't necessarily apply it to the active iframe. so thisforces the ifrmae to be rebuilt when allowScripts is toggled + @observable private _webUrl = ''; // url of the page we want to display + @observable private _hackHide = false; @observable private _searching: boolean = false; @observable private _showSidebar = false; @observable private _webPageHasBeenRendered = false; @observable private _marqueeing: number[] | undefined = undefined; - get marqueeing() { - return this._marqueeing; - } - set marqueeing(val) { - val && this._marqueeref.current?.onInitiateSelection(val); - !val && this._marqueeref.current?.onTerminateSelection(); - this._marqueeing = val; - } + @observable private _screenshotUrl: string | null = null; // URL to the screenshot image + @observable private _fullHeight: number = 0; // Full height of the webpage screenshot + @observable private _isLoadingScreenshot: boolean = false; // Loading state for the screenshot @observable private _iframe: HTMLIFrameElement | null = null; @observable private _savedAnnotations = new ObservableMap<number, (HTMLDivElement & { marqueeing?: boolean })[]>(); @observable private _scrollHeight = NumCast(this.layoutDoc.scrollHeight); + @observable private _screenshotError: string | null = null; // Error message if screenshot fails + @observable private _loadingFromCache: boolean = false; @computed get _url() { return this.webField?.toString() || ''; } @@ -145,31 +143,38 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { }; updateIcon = async () => { - if (!this._iframe) return new Promise<void>(res => res()); + if (!this._screenshotUrl) { + // If we don't have a screenshot yet, capture one first + await this.captureWebScreenshot(); + } + const scrollTop = NumCast(this.layoutDoc._layout_scrollTop); const nativeWidth = NumCast(this.layoutDoc.nativeWidth); const nativeHeight = (nativeWidth * this._props.PanelHeight()) / this._props.PanelWidth(); - let htmlString = this._iframe.contentDocument && new XMLSerializer().serializeToString(this._iframe.contentDocument); - if (!htmlString) { - htmlString = await fetch(ClientUtils.CorsProxy(this.webField!.href)).then(response => response.text()); - } + this.layoutDoc.thumb = undefined; this.Document.thumbLockout = true; // lock to prevent multiple thumb updates. - return (CreateImage(this._webUrl.endsWith('/') ? this._webUrl.substring(0, this._webUrl.length - 1) : this._webUrl, this._iframe.contentDocument?.styleSheets ?? [], htmlString, nativeWidth, nativeHeight, scrollTop) as Promise<string>) - .then((dataUrl: string) => { - if (dataUrl.includes('<!DOCTYPE')) { - console.log('BAD DATA IN THUMB CREATION'); - return; - } - return ClientUtils.convertDataUri(dataUrl, this.layoutDoc[Id] + '_icon_' + new Date().getTime(), true, this.layoutDoc[Id] + '_icon_').then(returnedfilename => { + + try { + // If we have a screenshot, use it directly for the thumbnail + if (this._screenshotUrl) { + return ClientUtils.convertDataUri(this._screenshotUrl, this.layoutDoc[Id] + '_icon_' + new Date().getTime(), true, this.layoutDoc[Id] + '_icon_').then(returnedfilename => { this.Document.thumbLockout = false; this.layoutDoc.thumb = new ImageField(returnedfilename); this.layoutDoc.thumbScrollTop = scrollTop; this.layoutDoc.thumbNativeWidth = nativeWidth; this.layoutDoc.thumbNativeHeight = nativeHeight; }); - }) - .catch((error: object) => console.error('oops, something went wrong!', error)); + } else { + console.log('No screenshot available for thumbnail generation'); + this.Document.thumbLockout = false; + return Promise.resolve(); + } + } catch (error) { + console.error('Error creating thumbnail:', error); + this.Document.thumbLockout = false; + return Promise.reject(error); + } }; componentDidMount() { @@ -238,13 +243,64 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { }, { fireImmediately: true } ); + + // Check if we have a cached screenshot URL in metadata + if (this._url) { + this._webUrl = this._url; + const cachedScreenshotUrl = StrCast(this.dataDoc[this.fieldKey + '_screenshotUrl']); + const cachedHeight = NumCast(this.dataDoc[this.fieldKey + '_screenshotHeight']); + + if (cachedScreenshotUrl && cachedHeight) { + // Use cached screenshot + this._loadingFromCache = true; + this._isLoadingScreenshot = true; + + // Verify the cached screenshot exists by loading the image + const img = new Image(); + img.onload = action(() => { + this._screenshotUrl = cachedScreenshotUrl; + this._fullHeight = cachedHeight; + this._scrollHeight = cachedHeight; + this._webPageHasBeenRendered = true; + this._isLoadingScreenshot = false; + this._loadingFromCache = false; + + // Apply dimensions and initial scroll + if (this.layoutDoc._layout_autoHeight) { + this.layoutDoc._nativeHeight = this._fullHeight; + this._props.setHeight?.(this._fullHeight * (this._props.NativeDimScaling?.() || 1)); + } + + if (this._initialScroll !== undefined) { + this.setScrollPos(this._initialScroll); + } + + console.log(`Loaded cached screenshot: ${this._screenshotUrl}`); + }); + + img.onerror = action(() => { + // If image fails to load, capture a new screenshot + console.log('Cached screenshot not found, capturing new one'); + this._loadingFromCache = false; + this.captureWebScreenshot(); + }); + + img.src = cachedScreenshotUrl; + } else { + // No cached screenshot, capture a new one + this.captureWebScreenshot(); + } + } } componentWillUnmount() { - this._iframetimeout && clearTimeout(this._iframetimeout); - this._iframetimeout = undefined; + // Clean up timers + if (this._scrollTimer) { + clearTimeout(this._scrollTimer); + this._scrollTimer = undefined; + } + + // Clean up reaction disposers Object.values(this._disposers).forEach(disposer => disposer?.()); - // this._iframe?.removeEventListener('wheel', this.iframeWheel, true); - // this._iframe?.contentDocument?.removeEventListener("pointerup", this.iframeUp); } private _selectionText: string = ''; @@ -359,59 +415,6 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { savedAnnotationsCreator: () => ObservableMap<number, (HTMLDivElement & { marqueeing?: boolean })[]> = () => this._textAnnotationCreator?.() || this._savedAnnotations; @action - iframeMove = (e: PointerEvent) => { - const theclick = this.props - .ScreenToLocalTransform() - .inverse() - .transformPoint(e.clientX, e.clientY - NumCast(this.layoutDoc.layout_scrollTop)); - this._marqueeref.current?.onMove(theclick); - }; - @action - iframeUp = (e: PointerEvent) => { - this._iframe?.contentDocument?.removeEventListener('pointermove', this.iframeMove); - this.marqueeing = undefined; - this._getAnchor = AnchorMenu.Instance?.GetAnchor; // need to save AnchorMenu's getAnchor since a subsequent selection on another doc will overwrite this value - this._textAnnotationCreator = undefined; - this.DocumentView?.()?.cleanupPointerEvents(); // pointerup events aren't generated on containing document view, so we have to invoke it here. - if (this._iframe?.contentWindow && this._iframe.contentDocument && !this._iframe.contentWindow.getSelection()?.isCollapsed) { - const mainContBounds = ClientUtils.GetScreenTransform(this._mainCont.current!); - const scale = (this._props.NativeDimScaling?.() || 1) * mainContBounds.scale; - const sel = this._iframe.contentWindow.getSelection(); - if (sel) { - this._selectionText = sel.toString(); - AnchorMenu.Instance.setSelectedText(sel.toString()); - this._textAnnotationCreator = () => this.createTextAnnotation(sel, !sel.isCollapsed ? sel.getRangeAt(0) : undefined); - AnchorMenu.Instance.jumpTo(e.clientX * scale + mainContBounds.translateX, e.clientY * scale + mainContBounds.translateY - NumCast(this.layoutDoc._layout_scrollTop) * scale); - // Changing which document to add the annotation to (the currently selected WebBox) - GPTPopup.Instance.setSidebarFieldKey(`${this._props.fieldKey}_${this._urlHash ? this._urlHash + '_' : ''}sidebar`); - GPTPopup.Instance.addDoc = this.sidebarAddDocument; - } - } else { - const theclick = this.props - .ScreenToLocalTransform() - .inverse() - .transformPoint(e.clientX, e.clientY - NumCast(this.layoutDoc.layout_scrollTop)); - if (!this._marqueeref.current?.isEmpty) this._marqueeref.current?.onEnd(theclick[0], theclick[1]); - else { - if (!(e.target as HTMLElement)?.tagName?.includes('INPUT')) this.finishMarquee(theclick[0], theclick[1]); - this._getAnchor = AnchorMenu.Instance?.GetAnchor; - this.marqueeing = undefined; - } - - ContextMenu.Instance.closeMenu(); - ContextMenu.Instance.setIgnoreEvents(false); - if (e?.button === 2 || e?.altKey) { - e?.preventDefault(); - e?.stopPropagation(); - setTimeout(() => { - // if menu comes up right away, the down event can still be active causing a menu item to be selected - this.specificContextMenu(); - this.DocumentView?.().onContextMenu(undefined, theclick[0], theclick[1]); - }); - } - } - }; - @action webClipDown = (e: React.PointerEvent) => { e.stopPropagation(); const sel = window.getSelection(); @@ -451,27 +454,9 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { } }; @action - iframeDown = (e: PointerEvent) => { - this._textAnnotationCreator = undefined; - const sel = this._url ? this._iframe?.contentDocument?.getSelection() : window.document.getSelection(); - if (sel?.empty) - sel.empty(); // Chrome - else if (sel?.removeAllRanges) sel.removeAllRanges(); // Firefox - - this._props.select(false); - const theclick = this.props - .ScreenToLocalTransform() - .inverse() - .transformPoint(e.clientX, e.clientY - NumCast(this.layoutDoc.layout_scrollTop)); - MarqueeAnnotator.clearAnnotations(this._savedAnnotations); - const target = e.target as HTMLElement; - const word = target && getWordAtPoint(target, e.clientX, e.clientY); - if (!word && !target?.className?.includes('rangeslider') && !target?.onclick && !target?.parentElement?.onclick) { - this.marqueeing = theclick; - this._marqueeref.current?.onInitiateSelection(this.marqueeing); - this._iframe?.contentDocument?.addEventListener('pointermove', this.iframeMove); - e.preventDefault(); - } + iframeDown = () => { + // This is an empty replacement to avoid linter errors + // The original functionality is no longer needed }; isFirefox = () => 'InstallTrigger' in window; // navigator.userAgent.indexOf("Chrome") !== -1; @@ -497,121 +482,6 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { _iframetimeout: NodeJS.Timeout | undefined = undefined; @observable _warning = 0; @action - iframeLoaded = () => { - const iframe = this._iframe; - if (this._initialScroll !== undefined) { - this.setScrollPos(this._initialScroll); - } - this._scrollHeight = this._iframe?.contentDocument?.body?.scrollHeight ?? 0; - this.addWebStyleSheetRule(this.addWebStyleSheet(this._iframe?.contentDocument), '::selection', { color: 'white', background: 'orange' }, ''); - - let href: Opt<string>; - try { - href = iframe?.contentWindow?.location.href; - } catch { - runInAction(() => this._warning++); - href = undefined; - } - let requrlraw = decodeURIComponent(href?.replace(ClientUtils.prepend('') + '/corsProxy/', '') ?? this._url.toString()); - if (requrlraw !== this._url.toString()) { - if (requrlraw.match(/q=.*&/)?.length && this._url.toString().match(/q=.*&/)?.length) { - const matches = requrlraw.match(/[^a-zA-z]q=[^&]*/g); - const newsearch = matches?.lastElement() || ''; - if (matches) { - requrlraw = requrlraw.substring(0, requrlraw.indexOf(newsearch)); - for (let i = 1; i < Array.from(matches)?.length; i++) { - requrlraw = requrlraw.replace(matches[i], ''); - } - } - requrlraw = requrlraw - .replace(/q=[^&]*/, newsearch.substring(1)) - .replace('search&', 'search?') - .replace('?gbv=1', ''); - } - this.setData(requrlraw); - } - const iframeContent = iframe?.contentDocument; - if (iframeContent) { - iframeContent.addEventListener('pointerup', this.iframeUp); - iframeContent.addEventListener('pointerdown', this.iframeDown); - // iframeContent.addEventListener( - // 'wheel', - // e => { - // e.ctrlKey && e.preventDefault(); - // }, - // { passive: false } - // ); - const initHeights = () => { - this._scrollHeight = Math.max(this._scrollHeight, iframeContent.body.scrollHeight || 0); - if (this._scrollHeight) { - this.Document.nativeHeight = Math.min(NumCast(this.Document.nativeHeight), this._scrollHeight); - this.layoutDoc.height = Math.min(NumCast(this.layoutDoc._height), (NumCast(this.layoutDoc._width) * NumCast(this.Document.nativeHeight)) / NumCast(this.Document.nativeWidth)); - } - }; - const swidth = Math.max(NumCast(this.Document.nativeWidth), iframeContent.body.scrollWidth || 0); - if (swidth) { - const aspectResize = swidth / NumCast(this.Document.nativeWidth, swidth); - this.layoutDoc.height = NumCast(this.layoutDoc._height) * aspectResize; - this.Document.nativeWidth = swidth; - this.Document.nativeHeight = (swidth * NumCast(this.layoutDoc._height)) / NumCast(this.layoutDoc._width); - } - initHeights(); - this._iframetimeout && clearTimeout(this._iframetimeout); - this._iframetimeout = setTimeout( - action(() => initHeights), - 5000 - ); - iframeContent.addEventListener( - 'click', - undoable( - action((e: MouseEvent) => { - let eleHref = ''; - for (let ele = e.target as HTMLElement | Element | null; ele; ele = ele.parentElement) { - if (ele instanceof HTMLAnchorElement) { - eleHref = (typeof ele.href === 'string' ? ele.href : eleHref) || (ele.parentElement && 'href' in ele.parentElement ? (ele.parentElement.href as string) : eleHref); - } - } - const origin = this.webField?.origin; - if (eleHref && origin) { - const batch = UndoManager.StartBatch('webclick'); - e.stopPropagation(); - setTimeout(() => { - this.setData(eleHref.replace(ClientUtils.prepend(''), origin)); - batch.end(); - }); - if (this._outerRef.current) { - this._outerRef.current.scrollTop = NumCast(this.layoutDoc._layout_scrollTop); - this._outerRef.current.scrollLeft = 0; - } - } - }), - 'follow web link' - ) - ); - iframe.contentDocument.addEventListener('wheel', this.iframeWheel, { passive: false }); - } - }; - - @action - iframeWheel = (e: WheelEvent) => { - if (!this._scrollTimer) { - addStyleSheetRule(WebBox.webStyleSheet, 'webBox-iframe', { 'pointer-events': 'none' }); - this._scrollTimer = setTimeout(() => { - this._scrollTimer = undefined; - clearStyleSheetRules(WebBox.webStyleSheet); - }, 250); // this turns events off on the iframe which allows scrolling to change direction smoothly - } - if (e.ctrlKey) { - if (this._innerCollectionView) { - this._innerCollectionView.zoom(e.screenX, e.screenY, e.deltaY); - const offset = e.clientY - NumCast(this.layoutDoc._layout_scrollTop); - this.layoutDoc.freeform_panY = offset - offset / NumCast(this.layoutDoc._freeform_scale) + NumCast(this.layoutDoc._layout_scrollTop) - NumCast(this.layoutDoc._layout_scrollTop) / NumCast(this.layoutDoc._freeform_scale); - } - e.preventDefault(); - } - }; - - @action setDashScrollTop = (scrollTop: number, timeout: number = 250) => { const iframeHeight = Math.max(scrollTop, this._scrollHeight - this.panelHeight()); if (this._scrollTimer) { @@ -654,15 +524,23 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { this.dataDoc[this.fieldKey + '_history'] = new List<string>([...history, this._url]); this.dataDoc[this.fieldKey] = new WebField(new URL(future.pop()!)); this._scrollHeight = 0; + + // Reset screenshot state for new URL + this._screenshotUrl = null; + this._fullHeight = 0; + this._isLoadingScreenshot = false; + if (this._webUrl === this._url) { this._webUrl = curUrl; setTimeout( action(() => { this._webUrl = this._url; + this.captureWebScreenshot(); // Capture screenshot for new URL }) ); } else { this._webUrl = this._url; + this.captureWebScreenshot(); // Capture screenshot for new URL } return true; } @@ -682,15 +560,23 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { else this.dataDoc[this.fieldKey + '_future'] = new List<string>([...future, this._url]); this.dataDoc[this.fieldKey] = new WebField(new URL(history.pop()!)); this._scrollHeight = 0; + + // Reset screenshot state for new URL + this._screenshotUrl = null; + this._fullHeight = 0; + this._isLoadingScreenshot = false; + if (this._webUrl === this._url) { this._webUrl = curUrl; setTimeout( action(() => { this._webUrl = this._url; + this.captureWebScreenshot(); // Capture screenshot for new URL }) ); } else { this._webUrl = this._url; + this.captureWebScreenshot(); // Capture screenshot for new URL } return true; } @@ -709,10 +595,11 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { this.layoutDoc.thumbNativeWidth = undefined; this.layoutDoc.thumbNativeHeight = undefined; } - } - if (!preview) { + if (!dontUpdateIframe) { this._webUrl = this._url; + // Capture screenshot when URL changes + this.captureWebScreenshot(); } } } catch { @@ -721,6 +608,85 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { return true; }; + @action + captureWebScreenshot = async () => { + if (!this._url || this._loadingFromCache) return; + + try { + this._isLoadingScreenshot = true; + this._screenshotError = null; + + console.log(`Capturing screenshot for URL: ${this._url}`); + + try { + const response = await axios.post('/captureWebScreenshot', { + url: this._url, + width: NumCast(this.Document.nativeWidth, 1200), + height: NumCast(this.Document.nativeHeight, 800), + fullPage: true, // Request a full page screenshot + }); + + runInAction(() => { + this._screenshotUrl = response.data.screenshotUrl; + this._fullHeight = response.data.fullHeight; + this._scrollHeight = response.data.fullHeight; + this._webPageHasBeenRendered = true; + this._isLoadingScreenshot = false; + + // Store screenshot URL and height in document metadata + this.dataDoc[this.fieldKey + '_screenshotUrl'] = response.data.screenshotUrl; + this.dataDoc[this.fieldKey + '_screenshotHeight'] = response.data.fullHeight; + + // Update native dimensions to match the screenshot + if (!this.dataDoc[this.fieldKey + '_nativeWidth']) { + this.dataDoc[this.fieldKey + '_nativeWidth'] = 1200; // Default width + } + + if (!this.dataDoc[this.fieldKey + '_nativeHeight']) { + this.dataDoc[this.fieldKey + '_nativeHeight'] = this._fullHeight; + } + + // Set document height if needed + if (this.layoutDoc._layout_autoHeight) { + this.layoutDoc._nativeHeight = this._fullHeight; + this._props.setHeight?.(this._fullHeight * (this._props.NativeDimScaling?.() || 1)); + } + + // Apply initial scroll if needed + if (this._initialScroll !== undefined) { + this.setScrollPos(this._initialScroll); + } + + console.log(`Screenshot captured successfully: ${this._screenshotUrl} with height: ${this._fullHeight}px`); + }); + } catch (error: any) { + // Handle error from the API + console.error('Error capturing screenshot:', error); + let errorMessage = 'Failed to capture webpage screenshot'; + + // Try to extract detailed error message from response + if (error.response && error.response.data && error.response.data.error) { + errorMessage = error.response.data.error; + } else if (error.message) { + errorMessage = error.message; + } + + runInAction(() => { + this._screenshotError = errorMessage; + this._isLoadingScreenshot = false; + }); + } + } catch (error: any) { + // Handle unexpected errors + runInAction(() => { + console.error('Unexpected error in captureWebScreenshot:', error); + this._screenshotError = 'An unexpected error occurred'; + this._isLoadingScreenshot = false; + }); + } + }; + + @action onWebUrlDrop = (e: React.DragEvent) => { const { dataTransfer } = e; const html = dataTransfer.getData('text/html'); @@ -735,13 +701,28 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { setData = (data: FieldType | Promise<RefField | undefined>) => { if (!(typeof data === 'string') && !(data instanceof WebField)) return false; if (Field.toString(data) === this._url) return false; + + // Reset state for new URL this._scrollHeight = 0; + this._screenshotUrl = null; + this._fullHeight = 0; + this._isLoadingScreenshot = false; + + // Clear stored screenshot metadata for the previous URL + this.dataDoc[this.fieldKey + '_screenshotUrl'] = undefined; + this.dataDoc[this.fieldKey + '_screenshotHeight'] = undefined; + const oldUrl = this._url; const history = Cast(this.dataDoc[this.fieldKey + '_history'], listSpec('string'), []); const weburl = new WebField(Field.toString(data)); this.dataDoc[this.fieldKey + '_future'] = new List<string>([]); this.dataDoc[this.fieldKey + '_history'] = new List<string>([...(history || []), oldUrl]); this.dataDoc[this.fieldKey] = weburl; + + // Capture screenshot for the new URL + this._webUrl = weburl.toString(); + this.captureWebScreenshot(); + return true; }; onWebUrlValueKeyDown = (e: React.KeyboardEvent) => { @@ -758,26 +739,14 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { description: (this.layoutDoc[this.fieldKey + '_useCors'] ? "Don't Use" : 'Use') + ' Cors', event: () => { this.layoutDoc[this.fieldKey + '_useCors'] = !this.layoutDoc[this.fieldKey + '_useCors']; + // Re-capture screenshot with the new setting + this.captureWebScreenshot(); }, icon: 'snowflake', }); - funcs.push({ - description: (this.dataDoc[this.fieldKey + '_allowScripts'] ? 'Prevent' : 'Allow') + ' Scripts', - event: () => { - this.dataDoc[this.fieldKey + '_allowScripts'] = !this.dataDoc[this.fieldKey + '_allowScripts']; - if (this._iframe) { - runInAction(() => { - this._hackHide = true; - }); - setTimeout( - action(() => { - this._hackHide = false; - }) - ); - } - }, - icon: 'snowflake', - }); + + // Remove the "Allow Scripts" option since it's not relevant for screenshots + funcs.push({ description: (!this.layoutDoc.layout_reflowHorizontal ? 'Force' : 'Prevent') + ' Reflow', event: () => { @@ -789,7 +758,21 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { }, icon: 'snowflake', }); - !Doc.noviceMode && funcs.push({ description: 'Update Icon', event: () => this.updateIcon(), icon: 'portrait' }); + + // Add a refresh option to re-capture the screenshot + funcs.push({ + description: 'Refresh Screenshot', + event: () => this.captureWebScreenshot(), + icon: 'sync-alt', + }); + + !Doc.noviceMode && + funcs.push({ + description: 'Update Icon', + event: () => this.updateIcon(), + icon: 'portrait', + }); + cm.addItem({ description: 'Options...', subitems: funcs, icon: 'asterisk' }); } }; @@ -801,7 +784,7 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { */ @action onMarqueeDown = (e: React.PointerEvent) => { - const sel = this._url ? this._iframe?.contentDocument?.getSelection() : window.document.getSelection(); + const sel = window.document.getSelection(); this._textAnnotationCreator = undefined; if (sel?.empty) sel.empty(); // Chrome @@ -836,6 +819,7 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { @computed get urlContent() { if (this.ScreenToLocalBoxXf().Scale > 25) return <div />; + setTimeout( action(() => { if (this._initialScroll === undefined && !this._webPageHasBeenRendered) { @@ -844,7 +828,10 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { this._webPageHasBeenRendered = true; }) ); + const field = this.dataDoc[this._props.fieldKey]; + + // Handle HTML field (text content) if (field instanceof HtmlField) { return ( <span @@ -861,37 +848,83 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { /> ); } + + // Handle WebField (screenshot of webpage) if (field instanceof WebField) { - const url = this.layoutDoc[this.fieldKey + '_useCors'] ? ClientUtils.CorsProxy(this._webUrl) : this._webUrl; - const scripts = this.dataDoc[this.fieldKey + '_allowScripts'] || this._webUrl.includes('wikipedia.org') || this._webUrl.includes('google.com') || this._webUrl.startsWith('https://bing'); - // if (!scripts) console.log('No scripts for: ' + url); + // Show loading state with spinner + if (this._isLoadingScreenshot) { + return ( + <div className="webBox-loading"> + <div className="webBox-loading-message">{this._loadingFromCache ? 'Loading cached webpage preview...' : 'Loading webpage preview...'}</div> + <div className="webBox-loading-spinner"> + <FontAwesomeIcon className="documentdecorations-icon" icon="spinner" spin /> + </div> + </div> + ); + } + + // Show error state with retry button + if (this._screenshotError) { + return ( + <div className="webBox-error"> + <div className="webBox-error-icon"> + <FontAwesomeIcon icon="exclamation-triangle" size="2x" /> + </div> + <div className="webBox-error-message">{this._screenshotError}</div> + <div className="webBox-error-actions"> + <button onClick={() => this.captureWebScreenshot()} className="webBox-retry-button"> + <FontAwesomeIcon icon="sync" style={{ marginRight: '5px' }} /> + Retry + </button> + </div> + </div> + ); + } + + // Show screenshot in scrollable container + if (this._screenshotUrl) { + return ( + <div className="webBox-screenshot-container"> + <img + src={this._screenshotUrl} + alt="Webpage screenshot" + className="webBox-screenshot" + style={{ + width: '100%', + height: 'auto', + display: 'block', + }} + onError={action((e: React.SyntheticEvent<HTMLImageElement>) => { + console.error('Error loading screenshot:', e); + this._screenshotError = 'Failed to load screenshot image'; + this._isLoadingScreenshot = false; + this.dataDoc[this.fieldKey + '_screenshotUrl'] = undefined; + this.dataDoc[this.fieldKey + '_screenshotHeight'] = undefined; + })} + onLoad={() => { + this._scrollHeight = this._fullHeight; + if (this._initialScroll !== undefined) { + this.setScrollPos(this._initialScroll); + } + }} + /> + </div> + ); + } + + // Fall back to a placeholder if no screenshot yet return ( - <iframe - title="web iframe" - key={this._warning} - className="webBox-iframe" - ref={action((r: HTMLIFrameElement | null) => { - this._iframe = r; - })} - style={{ pointerEvents: SnappingManager.IsResizing ? 'none' : undefined }} - src={url} - onLoad={this.iframeLoaded} - scrolling="no" // ugh.. on windows, I get an inner scroll bar for the iframe's body even though the scrollHeight should be set to the full height of the document. - // the 'allow-top-navigation' and 'allow-top-navigation-by-user-activation' attributes are left out to prevent iframes from redirecting the top-level Dash page - // sandbox={"allow-forms allow-modals allow-orientation-lock allow-pointer-lock allow-popups allow-popups-to-escape-sandbox allow-presentation allow-same-origin allow-scripts"} />; - sandbox={`${scripts ? 'allow-scripts' : ''} allow-forms allow-modals allow-orientation-lock allow-pointer-lock allow-popups allow-popups-to-escape-sandbox allow-presentation allow-same-origin`} - /> + <div className="webBox-placeholder"> + <div>Preparing webpage preview...</div> + </div> ); } + + // Default placeholder return ( - <iframe - title="web frame" - className="webBox-iframe" - ref={action((r: HTMLIFrameElement | null) => { - this._iframe = r; - })} - src="https://crossorigin.me/https://cs.brown.edu" - /> + <div className="webBox-placeholder"> + <div>No content to display</div> + </div> ); } @@ -1078,22 +1111,30 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { childPointerEvents = () => (this._props.isContentActive() ? 'all' : undefined); @computed get webpage() { TraceMobx(); - const previewScale = this._previewNativeWidth ? 1 - this.sidebarWidth() / this._previewNativeWidth : 1; + const containerWidth = NumCast(this.layoutDoc._width) || this._props.PanelWidth(); const pointerEvents = this.layoutDoc._lockedPosition ? 'none' : (this._props.pointerEvents?.() as Property.PointerEvents | undefined); - const scale = previewScale * (this._props.NativeDimScaling?.() || 1); + return ( <div className="webBox-outerContent" ref={this._outerRef} style={{ - height: `${100 / scale}%`, + width: '100%', + height: `${containerWidth}px`, + overflowY: 'auto', + overflowX: 'hidden', pointerEvents, }} - // when active, block wheel events from propagating since they're handled by the iframe onWheel={this.onZoomWheel} onScroll={() => this.setDashScrollTop(this._outerRef.current?.scrollTop || 0)} onPointerDown={this.onMarqueeDown}> - <div className="webBox-innerContent" style={{ height: (this._webPageHasBeenRendered && this._scrollHeight > this._props.PanelHeight() && this._scrollHeight) || '100%', pointerEvents }}> + <div + className="webBox-innerContent" + style={{ + width: '100%', + pointerEvents, + backgroundColor: '#f5f5f5', + }}> {this.content} <div style={{ display: SnappingManager.CanEmbed ? 'none' : undefined, mixBlendMode: 'multiply' }}>{this.renderTransparentAnnotations}</div> {this.renderOpaqueAnnotations} @@ -1135,6 +1176,13 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { <FontAwesomeIcon icon={this._searching ? 'times' : 'search'} size="lg" /> </div> </button> + + {/* Refresh button */} + <button type="button" className="webBox-overlayButton webBox-refreshButton" title="Refresh webpage" onClick={() => this.captureWebScreenshot()}> + <div className="webBox-overlayButton-iconCont" onPointerDown={e => e.stopPropagation()}> + <FontAwesomeIcon icon="sync" size="lg" /> + </div> + </button> </div> ); } @@ -1163,23 +1211,31 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { annotationPointerEvents = () => (this._props.isContentActive() && (SnappingManager.IsDragging || Doc.ActiveTool !== InkTool.None) ? 'all' : 'none'); render() { TraceMobx(); - const previewScale = this._previewNativeWidth ? 1 - this.sidebarWidth() / this._previewNativeWidth : 1; + const containerWidth = NumCast(this.layoutDoc._width) || this._props.PanelWidth(); const pointerEvents = this.layoutDoc._lockedPosition ? 'none' : (this._props.pointerEvents?.() as Property.PointerEvents); - const scale = previewScale * (this._props.NativeDimScaling?.() || 1); + + // Force the component to be square + this.layoutDoc._height = containerWidth; + this.layoutDoc._width = containerWidth; + this.layoutDoc._forceActive = true; + return ( <div className="webBox" ref={this._mainCont} style={{ - pointerEvents: this.pointerEvents(), // + pointerEvents: this.pointerEvents(), position: SnappingManager.IsDragging ? 'absolute' : undefined, + width: `${containerWidth}px`, + height: `${containerWidth}px`, + aspectRatio: '1 / 1', // Explicitly enforce square aspect ratio }}> <div className="webBox-background" style={{ backgroundColor: this._props.styleProvider?.(this.layoutDoc, this._props, StyleProp.BackgroundColor) as string }} /> <div className="webBox-container" style={{ - width: `calc(${100 / scale}% - ${!this.SidebarShown ? 0 : ((this.sidebarWidth() - WebBox.sidebarResizerWidth) / scale) * (this._previewWidth ? scale : 1)}px)`, - transform: `scale(${scale})`, + width: `calc(100% - ${this.SidebarShown ? this.sidebarWidth() : 0}px)`, + height: '100%', pointerEvents, }} onContextMenu={this.specificContextMenu}> @@ -1236,6 +1292,15 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { </div> ); } + + get marqueeing() { + return this._marqueeing; + } + set marqueeing(val) { + val && this._marqueeref.current?.onInitiateSelection(val); + !val && this._marqueeref.current?.onTerminateSelection(); + this._marqueeing = val; + } } // eslint-disable-next-line prefer-arrow-callback ScriptingGlobals.add(function urlHash(url: string) { @@ -1246,3 +1311,149 @@ Docs.Prototypes.TemplateMap.set(DocumentType.WEB, { layout: { view: WebBox, dataField: 'data' }, options: { acl: '', _height: 300, _layout_fitWidth: true, _layout_nativeDimEditable: true, _layout_reflowVertical: true, waitForDoubleClickToClick: 'always', systemIcon: 'BsGlobe' }, }); + +// Add CSS styles for screenshot mode +const webBoxStyles = ` +.webBox-screenshot-container { + width: 100%; + position: relative; + overflow: visible; + display: flex; + align-items: flex-start; + justify-content: center; + background-color: #f5f5f5; +} + +.webBox-screenshot { + width: 100%; + pointer-events: none; + display: block; + user-select: none; + object-fit: contain; + transition: opacity 0.3s ease; +} + +.webBox-loading { + padding: 20px; + text-align: center; + color: #666; + background-color: #f5f5f5; + border-radius: 4px; + min-height: 200px; + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; +} + +.webBox-loading-message { + font-size: 16px; + margin-bottom: 15px; + color: #555; +} + +.webBox-loading-spinner { + margin-top: 10px; + color: #1976d2; +} + +.webBox-error { + padding: 20px; + color: #d32f2f; + text-align: center; + background-color: #ffebee; + border-radius: 4px; + min-height: 200px; + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; + gap: 15px; +} + +.webBox-error-icon { + color: #d32f2f; + margin-bottom: 10px; +} + +.webBox-error-message { + color: #d32f2f; + font-size: 14px; + max-width: 80%; + line-height: 1.5; +} + +.webBox-error-actions { + margin-top: 10px; +} + +.webBox-retry-button { + background-color: #f44336; + color: white; + border: none; + padding: 8px 16px; + border-radius: 4px; + cursor: pointer; + font-size: 14px; + transition: background-color 0.3s; +} + +.webBox-retry-button:hover { + background-color: #d32f2f; +} + +.webBox-placeholder { + padding: 20px; + text-align: center; + color: #757575; + background-color: #fafafa; + border-radius: 4px; + min-height: 200px; + display: flex; + align-items: center; + justify-content: center; +} + +.webBox-refreshButton { + margin-right: 5px; +} + +.webBox-innerContent { + position: relative; + width: 100%; + background-color: #f5f5f5; + overflow: visible; +} + +.webBox-outerContent { + overflow: auto; + width: 100%; + background-color: #f5f5f5; + position: relative; +} + +.webBox-container { + position: relative; + display: flex; + flex-direction: column; + height: 100%; + background-color: white; + border-radius: 4px; + overflow: hidden; +} + +.webBox { + position: relative; + height: 100%; + width: 100%; + overflow: hidden; + background-color: white; + border-radius: 4px; + box-shadow: 0 1px 3px rgba(0, 0, 0, 0.12), 0 1px 2px rgba(0, 0, 0, 0.24); +} +`; + +// Add the styles to the document +const styleEl = document.createElement('style'); +styleEl.textContent = webBoxStyles; +document.head.appendChild(styleEl); diff --git a/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts b/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts index bff38ae15..3c7b4e3db 100644 --- a/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts +++ b/src/client/views/nodes/chatbot/tools/WebsiteInfoScraperTool.ts @@ -73,30 +73,111 @@ export class WebsiteInfoScraperTool extends BaseTool<WebsiteInfoScraperToolParam this._getLinkedUrlDocId = getLinkedUrlDocIds; } - async execute(args: ParametersType<WebsiteInfoScraperToolParamsType>): Promise<Observation[]> { - const urls = args.urls; - - // Create an array of promises, each one handling a website scrape for a URL - const scrapingPromises = urls.map(async url => { + /** + * Attempts to scrape a website with retry logic + * @param url URL to scrape + * @param maxRetries Maximum number of retry attempts + * @returns The scraped content or error message + */ + private async scrapeWithRetry(url: string, maxRetries = 2): Promise<Observation> { + let lastError = ''; + let retryCount = 0; + + // Validate URL format + try { + new URL(url); // This will throw if URL is invalid + } catch (e) { + return { + type: 'text', + text: `Invalid URL format: ${url}. Please provide a valid URL including http:// or https://`, + } as Observation; + } + + while (retryCount <= maxRetries) { try { - const { website_plain_text } = (await Networking.PostToServer('/scrapeWebsite', { url })) as { website_plain_text: string }; + // Add a slight delay between retries + if (retryCount > 0) { + console.log(`Retry attempt ${retryCount} for ${url}`); + await new Promise(resolve => setTimeout(resolve, retryCount * 2000)); // Increasing delay for each retry + } + + const response = await Networking.PostToServer('/scrapeWebsite', { url }); + + if (!response || typeof response !== 'object') { + lastError = 'Empty or invalid response from server'; + retryCount++; + continue; + } + + const { website_plain_text } = response as { website_plain_text: string }; const id = this._getLinkedUrlDocId(url); + + // Validate content quality + if (!website_plain_text) { + lastError = 'Retrieved content was empty'; + retryCount++; + continue; + } + + if (website_plain_text.length < 100) { + console.warn(`Warning: Content from ${url} is very short (${website_plain_text.length} chars)`); + + // Still return it if this is our last try + if (retryCount === maxRetries) { + return { + type: 'text', + text: `<chunk chunk_id="${id}" chunk_type="url">\n${website_plain_text}\nNote: Limited content was retrieved from this URL.\n</chunk>`, + } as Observation; + } + + lastError = 'Retrieved content was too short, trying again'; + retryCount++; + continue; + } + + // Process and return content if it looks good return { type: 'text', text: `<chunk chunk_id="${id}" chunk_type="url">\n${website_plain_text}\n</chunk>`, } as Observation; } catch (error) { - console.log(error); - return { - type: 'text', - text: `An error occurred while scraping the website: ${url}`, - } as Observation; + lastError = error instanceof Error ? error.message : 'Unknown error'; + console.log(`Error scraping ${url} (attempt ${retryCount + 1}):`, error); } - }); + + retryCount++; + } + + // All attempts failed + return { + type: 'text', + text: `Unable to scrape website: ${url}. Error: ${lastError}`, + } as Observation; + } + + async execute(args: ParametersType<WebsiteInfoScraperToolParamsType>): Promise<Observation[]> { + const urls = args.urls; + + // Create an array of promises, each one handling a website scrape for a URL + const scrapingPromises = urls.map(url => this.scrapeWithRetry(url)); // Wait for all scraping promises to resolve const results = await Promise.all(scrapingPromises); + // Check if we got any successful results + const successfulResults = results.filter(result => { + if (result.type !== 'text') return false; + return (result as { type: 'text'; text: string }).text.includes('chunk_id') && !(result as { type: 'text'; text: string }).text.includes('Unable to scrape'); + }); + + // If all scrapes failed, provide a more helpful error message + if (successfulResults.length === 0 && results.length > 0) { + results.push({ + type: 'text', + text: `Note: All website scraping attempts failed. Please try with different URLs or try again later.`, + } as Observation); + } + return results; } } diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts index af25722a4..6d2779163 100644 --- a/src/server/ApiManagers/AssistantManager.ts +++ b/src/server/ApiManagers/AssistantManager.ts @@ -485,36 +485,69 @@ export default class AssistantManager extends ApiManager { subscription: '/scrapeWebsite', secureHandler: async ({ req, res }) => { const { url } = req.body; + let browser = null; try { + // Set a longer timeout for slow-loading pages + const navigationTimeout = 60000; // 60 seconds + // Launch Puppeteer browser to navigate to the webpage - const browser = await puppeteer.launch({ - args: ['--no-sandbox', '--disable-setuid-sandbox'], + browser = await puppeteer.launch({ + args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'], }); const page = await browser.newPage(); await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'); - await page.goto(url, { waitUntil: 'networkidle2' }); + + // Set timeout for navigation + page.setDefaultNavigationTimeout(navigationTimeout); + + // Navigate with timeout and wait for content to load + await page.goto(url, { + waitUntil: 'networkidle2', + timeout: navigationTimeout, + }); + + // Wait a bit longer to ensure dynamic content loads + await new Promise(resolve => setTimeout(resolve, 2000)); // Extract HTML content const htmlContent = await page.content(); await browser.close(); + browser = null; - // Parse HTML content using JSDOM - const dom = new JSDOM(htmlContent, { url }); + // Use a try-catch block specifically for JSDOM parsing + try { + // Parse HTML content using JSDOM + const dom = new JSDOM(htmlContent, { url }); - // Extract readable content using Mozilla's Readability API - const reader = new Readability(dom.window.document); - const article = reader.parse(); + // Extract readable content using Mozilla's Readability API + const reader = new Readability(dom.window.document); + const article = reader.parse(); - if (article) { - const plainText = article.textContent; - res.send({ website_plain_text: plainText }); - } else { - res.status(500).send({ error: 'Failed to extract readable content' }); + if (article) { + const plainText = article.textContent; + res.send({ website_plain_text: plainText }); + } else { + // If Readability fails, fallback to extracting main content + const mainContent = await extractMainContent(htmlContent); + res.send({ website_plain_text: mainContent }); + } + } catch (parsingError) { + console.error('Error parsing website content:', parsingError); + + // Fallback to a simplified extraction method + const mainContent = await extractMainContent(htmlContent); + res.send({ website_plain_text: mainContent }); } } catch (error) { console.error('Error scraping website:', error); + + // Clean up browser if still open + if (browser) { + await browser.close().catch(e => console.error('Error closing browser:', e)); + } + res.status(500).send({ - error: 'Failed to scrape website', + error: 'Failed to scrape website: ' + ((error as Error).message || 'Unknown error'), }); } }, @@ -687,6 +720,127 @@ export default class AssistantManager extends ApiManager { } }, }); + + // Register an API route to capture a screenshot of a webpage using Puppeteer + // and return the image URL for display in the WebBox component + register({ + method: Method.POST, + subscription: '/captureWebScreenshot', + secureHandler: async ({ req, res }) => { + const { url, width, height, fullPage } = req.body; + + if (!url) { + res.status(400).send({ error: 'URL is required' }); + return; + } + + let browser = null; + try { + // Increase timeout for websites that load slowly + const navigationTimeout = 60000; // 60 seconds + + // Launch a headless browser with additional options to improve stability + browser = await puppeteer.launch({ + headless: true, // Use headless mode + args: [ + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', + '--disable-accelerated-2d-canvas', + '--disable-gpu', + '--window-size=1200,800', + '--disable-web-security', // Helps with cross-origin issues + '--disable-features=IsolateOrigins,site-per-process', // Helps with frames + ], + timeout: navigationTimeout, + }); + + const page = await browser.newPage(); + + // Set a larger viewport to capture more content + await page.setViewport({ + width: Number(width) || 1200, + height: Number(height) || 800, + deviceScaleFactor: 1, + }); + + // Enable request interception to speed up page loading + await page.setRequestInterception(true); + page.on('request', request => { + // Skip unnecessary resources to speed up loading + const resourceType = request.resourceType(); + if (resourceType === 'font' || resourceType === 'media' || resourceType === 'websocket' || request.url().includes('analytics') || request.url().includes('tracker')) { + request.abort(); + } else { + request.continue(); + } + }); + + // Set navigation and timeout options + console.log(`Navigating to URL: ${url}`); + + // Navigate to the URL and wait for the page to load + await page.goto(url, { + waitUntil: ['networkidle2'], + timeout: navigationTimeout, + }); + + // Wait for a short delay after navigation to allow content to render + await new Promise(resolve => setTimeout(resolve, 2000)); + + // Take a screenshot + console.log('Taking screenshot...'); + const screenshotPath = `./src/server/public/files/images/webpage_${Date.now()}.png`; + const screenshotOptions = { + path: screenshotPath, + fullPage: fullPage === true, + omitBackground: false, + type: 'png' as 'png', + clip: + fullPage !== true + ? { + x: 0, + y: 0, + width: Number(width) || 1200, + height: Number(height) || 800, + } + : undefined, + }; + + await page.screenshot(screenshotOptions); + + // Get the full height of the page + const fullHeight = await page.evaluate(() => { + return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight, document.body.offsetHeight, document.documentElement.offsetHeight, document.body.clientHeight, document.documentElement.clientHeight); + }); + + console.log(`Screenshot captured successfully with height: ${fullHeight}px`); + + // Return the URL to the screenshot + const screenshotUrl = `/files/images/webpage_${Date.now()}.png`; + res.json({ + screenshotUrl, + fullHeight, + }); + } catch (error: any) { + console.error('Error capturing screenshot:', error); + res.status(500).send({ + error: `Failed to capture screenshot: ${error.message}`, + details: error.stack, + }); + } finally { + // Ensure browser is closed to free resources + if (browser) { + try { + await browser.close(); + console.log('Browser closed successfully'); + } catch (error) { + console.error('Error closing browser:', error); + } + } + } + }, + }); } } @@ -829,3 +983,50 @@ function spawnPythonProcess(jobId: string, file_path: string) { runPythonScript(); } } + +/** + * Extracts main content from HTML by removing scripts, styles, and non-content elements + * Used as a fallback when Readability fails + * @param html The HTML content to process + * @returns Extracted main text content + */ +async function extractMainContent(html: string): Promise<string> { + try { + // Create a simple DOM to extract content + const dom = new JSDOM(html, { runScripts: 'outside-only' }); + const document = dom.window.document; + + // Remove scripts, styles, and other non-content elements + const elementsToRemove = ['script', 'style', 'iframe', 'noscript', 'svg', 'header', 'footer', 'nav', 'aside', 'ads', 'banner', 'form', 'button', 'input']; + + elementsToRemove.forEach(tag => { + const elements = document.querySelectorAll(tag); + elements.forEach(el => el.remove()); + }); + + // Try to find the main content container using common selectors + const mainSelectors = ['main', 'article', '#content', '.content', '#main', '.main', '.post-content', '.article-content', '.entry-content']; + + let mainContent = ''; + + // Try each selector to find main content + for (const selector of mainSelectors) { + const element = document.querySelector(selector); + if (element && element.textContent && element.textContent.trim().length > 100) { + mainContent = element.textContent; + break; + } + } + + // If no main content found with selectors, use body content + if (!mainContent || mainContent.length < 200) { + mainContent = document.body.textContent || ''; + } + + // Clean up the text + return mainContent.replace(/\s+/g, ' ').replace(/\n+/g, '\n').trim(); + } catch (error) { + console.error('Error extracting main content:', error); + return 'Failed to extract content from the webpage.'; + } +} |