about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author A.J. Shulman <Shulman.aj@gmail.com> 2025-05-11 13:42:00 -0400
committer A.J. Shulman <Shulman.aj@gmail.com> 2025-05-11 13:42:00 -0400
commit a5d7f5c38192b91b7df3bd6ecace5ba7365449a6 (patch)
tree c6be94f983b5fcc65424b81d42ddb0718127404c
parent 3c28aa3a706869d818bc8a089e8d1a53f7234bc0 (diff)
Made it so chunk Ids are separately managed and made sure the doc id is consistent and not created in python spawn
-rw-r--r-- src/client/views/nodes/WebBox.scss 241
-rw-r--r-- src/client/views/nodes/WebBox.tsx 605
-rw-r--r-- src/client/views/nodes/WebBoxRenderer.js 103
-rw-r--r-- src/client/views/nodes/chatbot/agentsystem/prompts.ts 4
-rw-r--r-- src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx 36
-rw-r--r-- src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts 38
-rw-r--r-- src/client/views/nodes/chatbot/tools/RAGTool.ts 5
-rw-r--r-- src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts 213
-rw-r--r-- src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts 33
-rw-r--r-- src/server/ApiManagers/AssistantManager.ts 8
-rw-r--r-- src/server/chunker/pdf_chunker.py 13
11 files changed, 548 insertions, 751 deletions
diff --git a/src/client/views/nodes/WebBox.scss b/src/client/views/nodes/WebBox.scss
index a1991d1d0..77d7716f4 100644
--- a/src/client/views/nodes/WebBox.scss
+++ b/src/client/views/nodes/WebBox.scss
@@ -1,9 +1,13 @@
@use '../global/globalCssVariables.module.scss' as global;
.webBox {
+ height: 100%;
+ width: 100%;
+ top: 0;
+ left: 0;
position: relative;
+ display: flex;
overflow: hidden;
- aspect-ratio: 1 / 1; // Explicitly enforce square aspect ratio
.webBox-sideResizer {
position: absolute;
@@ -16,119 +20,6 @@
.webBox-background {
width: 100%;
height: 100%;
- position: absolute;
- top: 0;
- left: 0;
- }
-
- // Simple container for screenshot
- .webBox-screenshot-container {
- width: 100%;
- }
-
- .webBox-screenshot {
- width: 100%;
- height: auto; // Maintain aspect ratio
- display: block;
- pointer-events: none;
- }
-
- .webBox-loading {
- padding: 20px;
- text-align: center;
- color: #666;
- background-color: #f5f5f5;
- min-height: 200px;
- display: flex;
- flex-direction: column;
- align-items: center;
- justify-content: center;
- }
-
- .webBox-loading-spinner {
- margin-top: 15px;
- color: #1976d2;
- font-size: 24px;
- }
-
- .webBox-error {
- padding: 20px;
- color: #d32f2f;
- text-align: center;
- background-color: #ffebee;
- min-height: 200px;
- display: flex;
- flex-direction: column;
- align-items: center;
- justify-content: center;
- gap: 15px;
- }
-
- .webBox-placeholder {
- padding: 20px;
- text-align: center;
- color: #757575;
- background-color: #fafafa;
- min-height: 200px;
- display: flex;
- align-items: center;
- justify-content: center;
- }
-
- // Basic container layout
- .webBox-container {
- width: 100%;
- height: 100%;
- position: relative;
- }
-
- // Simple scrollable container - vertical only
- .webBox-outerContent {
- width: 100%;
- position: relative;
- overflow-y: auto;
- overflow-x: hidden;
- background-color: #f5f5f5;
-
- // Improve scrollbar styling
- &::-webkit-scrollbar-thumb {
- background-color: #888;
- border-radius: 6px;
- }
-
- &::-webkit-scrollbar {
- width: 8px;
- background-color: #f5f5f5;
- }
- }
-
- .webBox-innerContent {
- width: 100%;
- background-color: #f5f5f5;
- }
-
- .webBox-htmlSpan {
- position: absolute;
- top: 0;
- left: 0;
- cursor: text;
- padding: 15px;
- width: 100%;
- height: 100%;
- }
-
- .webBox-annotationLayer {
- position: absolute;
- transform-origin: left top;
- top: 0;
- width: 100%;
- pointer-events: none;
- mix-blend-mode: multiply;
- }
-
- .webBox-annotationBox {
- position: absolute;
- background-color: rgba(245, 230, 95, 0.616);
}
.webBox-ui {
@@ -177,14 +68,14 @@
}
}
- .webBox-refreshButton {
+ .webBox-nextIcon,
+ .webBox-prevIcon {
background: #121721;
+ color: white;
height: 20px;
width: 25px;
display: flex;
- position: absolute;
- bottom: 0;
- right: 40px;
+ position: relative;
align-items: center;
justify-content: center;
border-radius: 3px;
@@ -192,6 +83,10 @@
padding: 0px;
}
+ .webBox-overlayButton:hover {
+ background: none;
+ }
+
.webBox-overlayCont {
position: absolute;
width: calc(100% - 40px);
@@ -223,7 +118,8 @@
justify-content: center;
border-radius: 3px;
pointer-events: all;
- z-index: 1;
+ z-index: 1; // so it appears on top of the document's title, if shown
+
box-shadow: global.$standard-box-shadow;
transition: 0.2s;
@@ -238,6 +134,89 @@
opacity: 0.1;
}
+ .webBox-annotationLayer {
+ position: absolute;
+ transform-origin: left top;
+ top: 0;
+ width: 100%;
+ pointer-events: none;
+ mix-blend-mode: multiply; // bcz: makes text fuzzy!
+ }
+
+ .webBox-annotationBox {
+ position: absolute;
+ background-color: rgba(245, 230, 95, 0.616);
+ }
+
+ .webBox-container {
+ transform-origin: top left;
+ width: 100%;
+ height: 100%;
+ position: absolute;
+
+ .webBox-htmlSpan {
+ position: absolute;
+ top: 0;
+ left: 0;
+ cursor: text;
+ padding: 15px;
+ height: 100%;
+ }
+
+ .webBox-cont {
+ pointer-events: none;
+ }
+
+ .webBox-cont,
+ .webBox-cont-interactive {
+ padding: 0vw;
+ position: absolute;
+ top: 0;
+ left: 0;
+ width: 100%;
+ height: 100%;
+ transform-origin: top left;
+
+ .webBox-iframe {
+ width: 100%;
+ height: 100%;
+ position: absolute;
+ top: 0;
+ left: 0;
+ body {
+ ::selection {
+ color: white;
+ background: orange;
+ }
+ }
+ }
+ }
+
+ .webBox-cont-interactive {
+ span {
+ user-select: text !important;
+ }
+ }
+
+ .webBox-outerContent {
+ width: 100%;
+ height: 100%;
+ position: absolute;
+ transform-origin: top left;
+ top: 0;
+ left: 0;
+ overflow: auto;
+
+ .webBox-innerContent {
+ position: relative;
+ }
+ }
+
+ div.webBox-outerContent::-webkit-scrollbar-thumb {
+ cursor: nw-resize;
+ }
+ }
+
.webBox-overlay {
width: 100%;
height: 100%;
@@ -277,13 +256,37 @@
width: 100%;
height: 100%;
position: absolute;
+ pointer-events: all;
.indicator {
position: absolute;
+ transition: background-color 0.2s ease;
+ border-radius: 2px;
&.active {
background-color: rgba(0, 0, 0, 0.1);
+ box-shadow: 0 0 2px rgba(0, 0, 0, 0.2);
}
}
}
+
+ // Add styles to hide font errors and improve user experience
+ .font-error-hidden {
+ font-family:
+ system-ui,
+ -apple-system,
+ BlinkMacSystemFont,
+ 'Segoe UI',
+ Roboto,
+ Arial,
+ sans-serif !important;
+ }
+
+ // Change iframe behavior when resource loading errors occur
+ iframe.webBox-iframe {
+ &.loading-error {
+ // Make full content accessible when external resources fail
+ pointer-events: all !important;
+ }
+ }
}
diff --git a/src/client/views/nodes/WebBox.tsx b/src/client/views/nodes/WebBox.tsx
index 045af7ecd..1e158f484 100644
--- a/src/client/views/nodes/WebBox.tsx
+++ b/src/client/views/nodes/WebBox.tsx
@@ -4,7 +4,6 @@ import { htmlToText } from 'html-to-text';
import { action, computed, IReactionDisposer, makeObservable, observable, ObservableMap, reaction, runInAction } from 'mobx';
import { observer } from 'mobx-react';
import * as React from 'react';
-import axios from 'axios';
import * as WebRequest from 'web-request';
import { addStyleSheet, addStyleSheetRule, clearStyleSheetRules, ClientUtils, DivHeight, getWordAtPoint, lightOrDark, returnFalse, returnOne, returnZero, setupMoveUpEvents, smoothScroll } from '../../../ClientUtils';
import { Doc, DocListCast, Field, FieldType, Opt, StrListCast } from '../../../fields/Doc';
@@ -70,20 +69,23 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
private _scrollTimer: NodeJS.Timeout | undefined;
private _getAnchor: (savedAnnotations: Opt<ObservableMap<number, HTMLDivElement[]>>, addAsAnnotation: boolean) => Opt<Doc> = () => undefined;
- @observable private _webUrl = ''; // url of the page we want to display
- @observable private _hackHide = false;
+ @observable private _webUrl = ''; // url of the src parameter of the embedded iframe but not necessarily the rendered page - eg, when following a link, the rendered page changes but we don't want the src parameter to also change as that would cause an unnecessary re-render.
+ @observable private _hackHide = false; // apparently changing the value of the 'sandbox' prop doesn't necessarily apply it to the active iframe, so this forces the iframe to be rebuilt when allowScripts is toggled
@observable private _searching: boolean = false;
@observable private _showSidebar = false;
@observable private _webPageHasBeenRendered = false;
@observable private _marqueeing: number[] | undefined = undefined;
- @observable private _screenshotUrl: string | null = null; // URL to the screenshot image
- @observable private _fullHeight: number = 0; // Full height of the webpage screenshot
- @observable private _isLoadingScreenshot: boolean = false; // Loading state for the screenshot
+ get marqueeing() {
+ return this._marqueeing;
+ }
+ set marqueeing(val) {
+ val && this._marqueeref.current?.onInitiateSelection(val);
+ !val && this._marqueeref.current?.onTerminateSelection();
+ this._marqueeing = val;
+ }
@observable private _iframe: HTMLIFrameElement | null = null;
@observable private _savedAnnotations = new ObservableMap<number, (HTMLDivElement & { marqueeing?: boolean })[]>();
@observable private _scrollHeight = NumCast(this.layoutDoc.scrollHeight);
- @observable private _screenshotError: string | null = null; // Error message if screenshot fails
- @observable private _loadingFromCache: boolean = false;
@computed get _url() {
return this.webField?.toString() || '';
}
@@ -143,38 +145,31 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
};
updateIcon = async () => {
- if (!this._screenshotUrl) {
- // If we don't have a screenshot yet, capture one first
- await this.captureWebScreenshot();
- }
-
+ if (!this._iframe) return new Promise<void>(res => res());
const scrollTop = NumCast(this.layoutDoc._layout_scrollTop);
const nativeWidth = NumCast(this.layoutDoc.nativeWidth);
const nativeHeight = (nativeWidth * this._props.PanelHeight()) / this._props.PanelWidth();
-
+ let htmlString = this._iframe.contentDocument && new XMLSerializer().serializeToString(this._iframe.contentDocument);
+ if (!htmlString) {
+ htmlString = await fetch(ClientUtils.CorsProxy(this.webField!.href)).then(response => response.text());
+ }
this.layoutDoc.thumb = undefined;
this.Document.thumbLockout = true; // lock to prevent multiple thumb updates.
-
- try {
- // If we have a screenshot, use it directly for the thumbnail
- if (this._screenshotUrl) {
- return ClientUtils.convertDataUri(this._screenshotUrl, this.layoutDoc[Id] + '_icon_' + new Date().getTime(), true, this.layoutDoc[Id] + '_icon_').then(returnedfilename => {
+ return (CreateImage(this._webUrl.endsWith('/') ? this._webUrl.substring(0, this._webUrl.length - 1) : this._webUrl, this._iframe.contentDocument?.styleSheets ?? [], htmlString, nativeWidth, nativeHeight, scrollTop) as Promise<string>)
+ .then((dataUrl: string) => {
+ if (dataUrl.includes('<!DOCTYPE')) {
+ console.log('BAD DATA IN THUMB CREATION');
+ return;
+ }
+ return ClientUtils.convertDataUri(dataUrl, this.layoutDoc[Id] + '_icon_' + new Date().getTime(), true, this.layoutDoc[Id] + '_icon_').then(returnedfilename => {
this.Document.thumbLockout = false;
this.layoutDoc.thumb = new ImageField(returnedfilename);
this.layoutDoc.thumbScrollTop = scrollTop;
this.layoutDoc.thumbNativeWidth = nativeWidth;
this.layoutDoc.thumbNativeHeight = nativeHeight;
});
- } else {
- console.log('No screenshot available for thumbnail generation');
- this.Document.thumbLockout = false;
- return Promise.resolve();
- }
- } catch (error) {
- console.error('Error creating thumbnail:', error);
- this.Document.thumbLockout = false;
- return Promise.reject(error);
- }
+ })
+ .catch((error: object) => console.error('oops, something went wrong!', error));
};
componentDidMount() {
@@ -243,64 +238,13 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
},
{ fireImmediately: true }
);
-
- // Check if we have a cached screenshot URL in metadata
- if (this._url) {
- this._webUrl = this._url;
- const cachedScreenshotUrl = StrCast(this.dataDoc[this.fieldKey + '_screenshotUrl']);
- const cachedHeight = NumCast(this.dataDoc[this.fieldKey + '_screenshotHeight']);
-
- if (cachedScreenshotUrl && cachedHeight) {
- // Use cached screenshot
- this._loadingFromCache = true;
- this._isLoadingScreenshot = true;
-
- // Verify the cached screenshot exists by loading the image
- const img = new Image();
- img.onload = action(() => {
- this._screenshotUrl = cachedScreenshotUrl;
- this._fullHeight = cachedHeight;
- this._scrollHeight = cachedHeight;
- this._webPageHasBeenRendered = true;
- this._isLoadingScreenshot = false;
- this._loadingFromCache = false;
-
- // Apply dimensions and initial scroll
- if (this.layoutDoc._layout_autoHeight) {
- this.layoutDoc._nativeHeight = this._fullHeight;
- this._props.setHeight?.(this._fullHeight * (this._props.NativeDimScaling?.() || 1));
- }
-
- if (this._initialScroll !== undefined) {
- this.setScrollPos(this._initialScroll);
- }
-
- console.log(`Loaded cached screenshot: ${this._screenshotUrl}`);
- });
-
- img.onerror = action(() => {
- // If image fails to load, capture a new screenshot
- console.log('Cached screenshot not found, capturing new one');
- this._loadingFromCache = false;
- this.captureWebScreenshot();
- });
-
- img.src = cachedScreenshotUrl;
- } else {
- // No cached screenshot, capture a new one
- this.captureWebScreenshot();
- }
- }
}
componentWillUnmount() {
- // Clean up timers
- if (this._scrollTimer) {
- clearTimeout(this._scrollTimer);
- this._scrollTimer = undefined;
- }
-
- // Clean up reaction disposers
+ this._iframetimeout && clearTimeout(this._iframetimeout);
+ this._iframetimeout = undefined;
Object.values(this._disposers).forEach(disposer => disposer?.());
+ // this._iframe?.removeEventListener('wheel', this.iframeWheel, true);
+ // this._iframe?.contentDocument?.removeEventListener("pointerup", this.iframeUp);
}
private _selectionText: string = '';
@@ -415,6 +359,59 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
savedAnnotationsCreator: () => ObservableMap<number, (HTMLDivElement & { marqueeing?: boolean })[]> = () => this._textAnnotationCreator?.() || this._savedAnnotations;
@action
+ iframeMove = (e: PointerEvent) => {
+ const theclick = this.props
+ .ScreenToLocalTransform()
+ .inverse()
+ .transformPoint(e.clientX, e.clientY - NumCast(this.layoutDoc.layout_scrollTop));
+ this._marqueeref.current?.onMove(theclick);
+ };
+ @action
+ iframeUp = (e: PointerEvent) => {
+ this._iframe?.contentDocument?.removeEventListener('pointermove', this.iframeMove);
+ this.marqueeing = undefined;
+ this._getAnchor = AnchorMenu.Instance?.GetAnchor; // need to save AnchorMenu's getAnchor since a subsequent selection on another doc will overwrite this value
+ this._textAnnotationCreator = undefined;
+ this.DocumentView?.()?.cleanupPointerEvents(); // pointerup events aren't generated on containing document view, so we have to invoke it here.
+ if (this._iframe?.contentWindow && this._iframe.contentDocument && !this._iframe.contentWindow.getSelection()?.isCollapsed) {
+ const mainContBounds = ClientUtils.GetScreenTransform(this._mainCont.current!);
+ const scale = (this._props.NativeDimScaling?.() || 1) * mainContBounds.scale;
+ const sel = this._iframe.contentWindow.getSelection();
+ if (sel) {
+ this._selectionText = sel.toString();
+ AnchorMenu.Instance.setSelectedText(sel.toString());
+ this._textAnnotationCreator = () => this.createTextAnnotation(sel, !sel.isCollapsed ? sel.getRangeAt(0) : undefined);
+ AnchorMenu.Instance.jumpTo(e.clientX * scale + mainContBounds.translateX, e.clientY * scale + mainContBounds.translateY - NumCast(this.layoutDoc._layout_scrollTop) * scale);
+ // Changing which document to add the annotation to (the currently selected WebBox)
+ GPTPopup.Instance.setSidebarFieldKey(`${this._props.fieldKey}_${this._urlHash ? this._urlHash + '_' : ''}sidebar`);
+ GPTPopup.Instance.addDoc = this.sidebarAddDocument;
+ }
+ } else {
+ const theclick = this.props
+ .ScreenToLocalTransform()
+ .inverse()
+ .transformPoint(e.clientX, e.clientY - NumCast(this.layoutDoc.layout_scrollTop));
+ if (!this._marqueeref.current?.isEmpty) this._marqueeref.current?.onEnd(theclick[0], theclick[1]);
+ else {
+ if (!(e.target as HTMLElement)?.tagName?.includes('INPUT')) this.finishMarquee(theclick[0], theclick[1]);
+ this._getAnchor = AnchorMenu.Instance?.GetAnchor;
+ this.marqueeing = undefined;
+ }
+
+ ContextMenu.Instance.closeMenu();
+ ContextMenu.Instance.setIgnoreEvents(false);
+ if (e?.button === 2 || e?.altKey) {
+ e?.preventDefault();
+ e?.stopPropagation();
+ setTimeout(() => {
+ // if menu comes up right away, the down event can still be active causing a menu item to be selected
+ this.specificContextMenu();
+ this.DocumentView?.().onContextMenu(undefined, theclick[0], theclick[1]);
+ });
+ }
+ }
+ };
+ @action
webClipDown = (e: React.PointerEvent) => {
e.stopPropagation();
const sel = window.getSelection();
@@ -508,6 +505,98 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
this._scrollHeight = this._iframe?.contentDocument?.body?.scrollHeight ?? 0;
this.addWebStyleSheetRule(this.addWebStyleSheet(this._iframe?.contentDocument), '::selection', { color: 'white', background: 'orange' }, '');
+ // Add error handler to suppress font CORS errors
+ if (this._iframe?.contentWindow) {
+ try {
+ // Track if any resource errors occurred
+ let hasResourceErrors = false;
+
+ // Override the console.error to filter out font CORS errors
+ const win = this._iframe.contentWindow as Window & { console: Console };
+ const originalConsoleError = win.console.error;
+ win.console.error = (...args: unknown[]) => {
+ const errorMsg = args.map(arg => String(arg)).join(' ');
+ if (errorMsg.includes('Access to font') && errorMsg.includes('has been blocked by CORS policy')) {
+ // Mark that we have font errors
+ hasResourceErrors = true;
+ // Ignore font CORS errors
+ return;
+ }
+ // Also catch other resource loading errors
+ if (errorMsg.includes('ERR_FAILED') || errorMsg.includes('ERR_BLOCKED_BY_CLIENT')) {
+ hasResourceErrors = true;
+ }
+ originalConsoleError.apply(win.console, args);
+ };
+
+ // Listen for resource loading errors
+ this._iframe.contentWindow.addEventListener(
+ 'error',
+ (e: Event) => {
+ const target = e.target as HTMLElement;
+ if (target instanceof HTMLElement) {
+ // If it's a resource that failed to load
+ if (target.tagName === 'LINK' || target.tagName === 'IMG' || target.tagName === 'SCRIPT') {
+ hasResourceErrors = true;
+ // Apply error class after a short delay to allow initial content to load
+ setTimeout(() => {
+ if (this._iframe && hasResourceErrors) {
+ this._iframe.classList.add('loading-error');
+ }
+ }, 1000);
+ }
+ }
+ },
+ true
+ );
+
+ // Add fallback CSS for fonts that fail to load
+ const style = this._iframe.contentDocument?.createElement('style');
+ if (style) {
+ style.textContent = `
+ @font-face {
+ font-family: 'CORS-fallback-serif';
+ src: local('Times New Roman'), local('Georgia'), serif;
+ }
+ @font-face {
+ font-family: 'CORS-fallback-sans';
+ src: local('Arial'), local('Helvetica'), sans-serif;
+ }
+ /* Fallback for all fonts that fail to load */
+ @font-face {
+ font-display: swap !important;
+ }
+
+ /* Add a script to find and fix elements with failed fonts */
+ @font-face {
+ font-family: '__failed_font__';
+ src: local('Arial');
+ unicode-range: U+0000;
+ }
+ `;
+ this._iframe.contentDocument?.head.appendChild(style);
+
+ // Add a script to detect and fix font loading issues
+ const script = this._iframe.contentDocument?.createElement('script');
+ if (script) {
+ script.textContent = `
+ // Fix font loading issues with fallbacks
+ setTimeout(function() {
+ document.querySelectorAll('*').forEach(function(el) {
+ if (window.getComputedStyle(el).fontFamily.includes('__failed_font__')) {
+ el.classList.add('font-error-hidden');
+ }
+ });
+ }, 1000);
+ `;
+ this._iframe.contentDocument?.head.appendChild(script);
+ }
+ }
+ } catch (e) {
+ console.log('Error setting up font error handling:', e);
+ }
+ }
+
let href: Opt<string>;
try {
href = iframe?.contentWindow?.location.href;
@@ -658,23 +747,15 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
this.dataDoc[this.fieldKey + '_history'] = new List<string>([...history, this._url]);
this.dataDoc[this.fieldKey] = new WebField(new URL(future.pop()!));
this._scrollHeight = 0;
-
- // Reset screenshot state for new URL
- this._screenshotUrl = null;
- this._fullHeight = 0;
- this._isLoadingScreenshot = false;
-
if (this._webUrl === this._url) {
this._webUrl = curUrl;
setTimeout(
action(() => {
this._webUrl = this._url;
- this.captureWebScreenshot(); // Capture screenshot for new URL
})
);
} else {
this._webUrl = this._url;
- this.captureWebScreenshot(); // Capture screenshot for new URL
}
return true;
}
@@ -694,18 +775,11 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
else this.dataDoc[this.fieldKey + '_future'] = new List<string>([...future, this._url]);
this.dataDoc[this.fieldKey] = new WebField(new URL(history.pop()!));
this._scrollHeight = 0;
-
- // Reset screenshot state for new URL
- this._screenshotUrl = null;
- this._fullHeight = 0;
- this._isLoadingScreenshot = false;
-
if (this._webUrl === this._url) {
this._webUrl = curUrl;
setTimeout(action(() => (this._webUrl = this._url)));
} else {
this._webUrl = this._url;
- this.captureWebScreenshot(); // Capture screenshot for new URL
}
return true;
}
@@ -724,11 +798,10 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
this.layoutDoc.thumbNativeWidth = undefined;
this.layoutDoc.thumbNativeHeight = undefined;
}
-
+ }
+ if (!preview) {
if (!dontUpdateIframe) {
this._webUrl = this._url;
- // Capture screenshot when URL changes
- this.captureWebScreenshot();
}
}
} catch {
@@ -737,85 +810,6 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
return true;
};
- @action
- captureWebScreenshot = async () => {
- if (!this._url || this._loadingFromCache) return;
-
- try {
- this._isLoadingScreenshot = true;
- this._screenshotError = null;
-
- console.log(`Capturing screenshot for URL: ${this._url}`);
-
- try {
- const response = await axios.post('/captureWebScreenshot', {
- url: this._url,
- width: NumCast(this.Document.nativeWidth, 1200),
- height: NumCast(this.Document.nativeHeight, 800),
- fullPage: true, // Request a full page screenshot
- });
-
- runInAction(() => {
- this._screenshotUrl = response.data.screenshotUrl;
- this._fullHeight = response.data.fullHeight;
- this._scrollHeight = response.data.fullHeight;
- this._webPageHasBeenRendered = true;
- this._isLoadingScreenshot = false;
-
- // Store screenshot URL and height in document metadata
- this.dataDoc[this.fieldKey + '_screenshotUrl'] = response.data.screenshotUrl;
- this.dataDoc[this.fieldKey + '_screenshotHeight'] = response.data.fullHeight;
-
- // Update native dimensions to match the screenshot
- if (!this.dataDoc[this.fieldKey + '_nativeWidth']) {
- this.dataDoc[this.fieldKey + '_nativeWidth'] = 1200; // Default width
- }
-
- if (!this.dataDoc[this.fieldKey + '_nativeHeight']) {
- this.dataDoc[this.fieldKey + '_nativeHeight'] = this._fullHeight;
- }
-
- // Set document height if needed
- if (this.layoutDoc._layout_autoHeight) {
- this.layoutDoc._nativeHeight = this._fullHeight;
- this._props.setHeight?.(this._fullHeight * (this._props.NativeDimScaling?.() || 1));
- }
-
- // Apply initial scroll if needed
- if (this._initialScroll !== undefined) {
- this.setScrollPos(this._initialScroll);
- }
-
- console.log(`Screenshot captured successfully: ${this._screenshotUrl} with height: ${this._fullHeight}px`);
- });
- } catch (error: any) {
- // Handle error from the API
- console.error('Error capturing screenshot:', error);
- let errorMessage = 'Failed to capture webpage screenshot';
-
- // Try to extract detailed error message from response
- if (error.response && error.response.data && error.response.data.error) {
- errorMessage = error.response.data.error;
- } else if (error.message) {
- errorMessage = error.message;
- }
-
- runInAction(() => {
- this._screenshotError = errorMessage;
- this._isLoadingScreenshot = false;
- });
- }
- } catch (error: any) {
- // Handle unexpected errors
- runInAction(() => {
- console.error('Unexpected error in captureWebScreenshot:', error);
- this._screenshotError = 'An unexpected error occurred';
- this._isLoadingScreenshot = false;
- });
- }
- };
-
- @action
onWebUrlDrop = (e: React.DragEvent) => {
const { dataTransfer } = e;
const html = dataTransfer.getData('text/html');
@@ -830,28 +824,13 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
setData = (data: FieldType | Promise<RefField | undefined>) => {
if (!(typeof data === 'string') && !(data instanceof WebField)) return false;
if (Field.toString(data) === this._url) return false;
-
- // Reset state for new URL
this._scrollHeight = 0;
- this._screenshotUrl = null;
- this._fullHeight = 0;
- this._isLoadingScreenshot = false;
-
- // Clear stored screenshot metadata for the previous URL
- this.dataDoc[this.fieldKey + '_screenshotUrl'] = undefined;
- this.dataDoc[this.fieldKey + '_screenshotHeight'] = undefined;
-
const oldUrl = this._url;
const history = Cast(this.dataDoc[this.fieldKey + '_history'], listSpec('string'), []);
const weburl = new WebField(Field.toString(data));
this.dataDoc[this.fieldKey + '_future'] = new List<string>([]);
this.dataDoc[this.fieldKey + '_history'] = new List<string>([...(history || []), oldUrl]);
this.dataDoc[this.fieldKey] = weburl;
-
- // Capture screenshot for the new URL
- this._webUrl = weburl.toString();
- this.captureWebScreenshot();
-
return true;
};
onWebUrlValueKeyDown = (e: React.KeyboardEvent) => {
@@ -868,14 +847,26 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
description: (this.layoutDoc[this.fieldKey + '_useCors'] ? "Don't Use" : 'Use') + ' Cors',
event: () => {
this.layoutDoc[this.fieldKey + '_useCors'] = !this.layoutDoc[this.fieldKey + '_useCors'];
- // Re-capture screenshot with the new setting
- this.captureWebScreenshot();
},
icon: 'snowflake',
});
-
- // Remove the "Allow Scripts" option since it's not relevant for screenshots
-
+ funcs.push({
+ description: (this.dataDoc[this.fieldKey + '_allowScripts'] ? 'Prevent' : 'Allow') + ' Scripts',
+ event: () => {
+ this.dataDoc[this.fieldKey + '_allowScripts'] = !this.dataDoc[this.fieldKey + '_allowScripts'];
+ if (this._iframe) {
+ runInAction(() => {
+ this._hackHide = true;
+ });
+ setTimeout(
+ action(() => {
+ this._hackHide = false;
+ })
+ );
+ }
+ },
+ icon: 'snowflake',
+ });
funcs.push({
description: (!this.layoutDoc.layout_reflowHorizontal ? 'Force' : 'Prevent') + ' Reflow',
event: () => {
@@ -887,21 +878,7 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
},
icon: 'snowflake',
});
-
- // Add a refresh option to re-capture the screenshot
- funcs.push({
- description: 'Refresh Screenshot',
- event: () => this.captureWebScreenshot(),
- icon: 'sync-alt',
- });
-
- !Doc.noviceMode &&
- funcs.push({
- description: 'Update Icon',
- event: () => this.updateIcon(),
- icon: 'portrait',
- });
-
+ !Doc.noviceMode && funcs.push({ description: 'Update Icon', event: () => this.updateIcon(), icon: 'portrait' });
cm.addItem({ description: 'Options...', subitems: funcs, icon: 'asterisk' });
}
};
@@ -913,7 +890,7 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
*/
@action
onMarqueeDown = (e: React.PointerEvent) => {
- const sel = window.document.getSelection();
+ const sel = this._url ? this._iframe?.contentDocument?.getSelection() : window.document.getSelection();
this._textAnnotationCreator = undefined;
if (sel?.empty)
sel.empty(); // Chrome
@@ -948,7 +925,6 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
@computed get urlContent() {
if (this.ScreenToLocalBoxXf().Scale > 25) return <div />;
-
setTimeout(
action(() => {
if (this._initialScroll === undefined && !this._webPageHasBeenRendered) {
@@ -957,10 +933,7 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
this._webPageHasBeenRendered = true;
})
);
-
const field = this.dataDoc[this._props.fieldKey];
-
- // Handle HTML field (text content)
if (field instanceof HtmlField) {
return (
<span
@@ -977,8 +950,6 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
/>
);
}
-
- // Handle WebField (screenshot of webpage)
if (field instanceof WebField) {
const url = this.layoutDoc[this.fieldKey + '_useCors'] ? '/corsproxy/' + this._webUrl : this._webUrl;
const scripts = this.dataDoc[this.fieldKey + '_allowScripts'] || this._webUrl.includes('wikipedia.org') || this._webUrl.includes('google.com') || this._webUrl.startsWith('https://bing');
@@ -1198,7 +1169,7 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
TraceMobx();
// const previewScale = this._previewNativeWidth ? 1 - this.sidebarWidth() / this._previewNativeWidth : 1;
const pointerEvents = this.layoutDoc._lockedPosition ? 'none' : (this._props.pointerEvents?.() as Property.PointerEvents | undefined);
- const scale = this._props.NativeDimScaling?.() || 1;
+ // const scale = previewScale * (this._props.NativeDimScaling?.() || 1);
return (
<div
className="webBox-outerContent"
@@ -1207,16 +1178,11 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
height: '100%', //`${100 / scale}%`,
pointerEvents,
}}
+ // when active, block wheel events from propagating since they're handled by the iframe
onWheel={this.onZoomWheel}
onScroll={() => this.setDashScrollTop(this._outerRef.current?.scrollTop || 0)}
onPointerDown={this.onMarqueeDown}>
- <div
- className="webBox-innerContent"
- style={{
- width: '100%',
- pointerEvents,
- backgroundColor: '#f5f5f5',
- }}>
+ <div className="webBox-innerContent" style={{ height: (this._webPageHasBeenRendered && this._scrollHeight > this._props.PanelHeight() && this._scrollHeight) || '100%', pointerEvents }}>
{this.content}
<div style={{ display: SnappingManager.CanEmbed ? 'none' : undefined, mixBlendMode: 'multiply' }}>{this.renderTransparentAnnotations}</div>
{this.renderOpaqueAnnotations}
@@ -1258,13 +1224,6 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
<FontAwesomeIcon icon={this._searching ? 'times' : 'search'} size="lg" />
</div>
</button>
-
- {/* Refresh button */}
- <button type="button" className="webBox-overlayButton webBox-refreshButton" title="Refresh webpage" onClick={() => this.captureWebScreenshot()}>
- <div className="webBox-overlayButton-iconCont" onPointerDown={e => e.stopPropagation()}>
- <FontAwesomeIcon icon="sync" size="lg" />
- </div>
- </button>
</div>
);
}
@@ -1293,25 +1252,16 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
annotationPointerEvents = () => (this._props.isContentActive() && (SnappingManager.IsDragging || Doc.ActiveTool !== InkTool.None) ? 'all' : 'none');
render() {
TraceMobx();
- const containerWidth = NumCast(this.layoutDoc._width) || this._props.PanelWidth();
+ const previewScale = this._previewNativeWidth ? 1 - this.sidebarWidth() / this._previewNativeWidth : 1;
const pointerEvents = this.layoutDoc._lockedPosition ? 'none' : (this._props.pointerEvents?.() as Property.PointerEvents);
- const scale = this._props.NativeDimScaling?.() || 1;
-
- // Force the component to be square
- this.layoutDoc._height = containerWidth;
- this.layoutDoc._width = containerWidth;
- this.layoutDoc._forceActive = true;
-
+ const scale = previewScale * (this._props.NativeDimScaling?.() || 1);
return (
<div
className="webBox"
ref={this._mainCont}
style={{
- pointerEvents: this.pointerEvents(),
+ pointerEvents: this.pointerEvents(), //
position: SnappingManager.IsDragging ? 'absolute' : undefined,
- width: `${containerWidth}px`,
- height: `${containerWidth}px`,
- aspectRatio: '1 / 1', // Explicitly enforce square aspect ratio
}}>
<div className="webBox-background" style={{ backgroundColor: this._props.styleProvider?.(this.layoutDoc, this._props, StyleProp.BackgroundColor) as string }} />
<div
@@ -1376,15 +1326,6 @@ export class WebBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
</div>
);
}
-
- get marqueeing() {
- return this._marqueeing;
- }
- set marqueeing(val) {
- val && this._marqueeref.current?.onInitiateSelection(val);
- !val && this._marqueeref.current?.onTerminateSelection();
- this._marqueeing = val;
- }
}
// eslint-disable-next-line prefer-arrow-callback
ScriptingGlobals.add(function urlHash(url: string) {
@@ -1395,149 +1336,3 @@ Docs.Prototypes.TemplateMap.set(DocumentType.WEB, {
layout: { view: WebBox, dataField: 'data' },
options: { acl: '', _height: 300, _layout_fitWidth: true, _layout_nativeDimEditable: true, _layout_reflowVertical: true, waitForDoubleClickToClick: 'always', systemIcon: 'BsGlobe' },
});
-
-// Add CSS styles for screenshot mode
-const webBoxStyles = `
-.webBox-screenshot-container {
- width: 100%;
- position: relative;
- overflow: visible;
- display: flex;
- align-items: flex-start;
- justify-content: center;
- background-color: #f5f5f5;
-}
-
-.webBox-screenshot {
- width: 100%;
- pointer-events: none;
- display: block;
- user-select: none;
- object-fit: contain;
- transition: opacity 0.3s ease;
-}
-
-.webBox-loading {
- padding: 20px;
- text-align: center;
- color: #666;
- background-color: #f5f5f5;
- border-radius: 4px;
- min-height: 200px;
- display: flex;
- flex-direction: column;
- align-items: center;
- justify-content: center;
-}
-
-.webBox-loading-message {
- font-size: 16px;
- margin-bottom: 15px;
- color: #555;
-}
-
-.webBox-loading-spinner {
- margin-top: 10px;
- color: #1976d2;
-}
-
-.webBox-error {
- padding: 20px;
- color: #d32f2f;
- text-align: center;
- background-color: #ffebee;
- border-radius: 4px;
- min-height: 200px;
- display: flex;
- flex-direction: column;
- align-items: center;
- justify-content: center;
- gap: 15px;
-}
-
-.webBox-error-icon {
- color: #d32f2f;
- margin-bottom: 10px;
-}
-
-.webBox-error-message {
- color: #d32f2f;
- font-size: 14px;
- max-width: 80%;
- line-height: 1.5;
-}
-
-.webBox-error-actions {
- margin-top: 10px;
-}
-
-.webBox-retry-button {
- background-color: #f44336;
- color: white;
- border: none;
- padding: 8px 16px;
- border-radius: 4px;
- cursor: pointer;
- font-size: 14px;
- transition: background-color 0.3s;
-}
-
-.webBox-retry-button:hover {
- background-color: #d32f2f;
-}
-
-.webBox-placeholder {
- padding: 20px;
- text-align: center;
- color: #757575;
- background-color: #fafafa;
- border-radius: 4px;
- min-height: 200px;
- display: flex;
- align-items: center;
- justify-content: center;
-}
-
-.webBox-refreshButton {
- margin-right: 5px;
-}
-
-.webBox-innerContent {
- position: relative;
- width: 100%;
- background-color: #f5f5f5;
- overflow: visible;
-}
-
-.webBox-outerContent {
- overflow: auto;
- width: 100%;
- background-color: #f5f5f5;
- position: relative;
-}
-
-.webBox-container {
- position: relative;
- display: flex;
- flex-direction: column;
- height: 100%;
- background-color: white;
- border-radius: 4px;
- overflow: hidden;
-}
-
-.webBox {
- position: relative;
- height: 100%;
- width: 100%;
- overflow: hidden;
- background-color: white;
- border-radius: 4px;
- box-shadow: 0 1px 3px rgba(0, 0, 0, 0.12), 0 1px 2px rgba(0, 0, 0, 0.24);
-}
-`;
-
-// Add the styles to the document
-const styleEl = document.createElement('style');
-styleEl.textContent = webBoxStyles;
-document.head.appendChild(styleEl);
diff --git a/src/client/views/nodes/WebBoxRenderer.js b/src/client/views/nodes/WebBoxRenderer.js
index ef465c453..31e0ef5e4 100644
--- a/src/client/views/nodes/WebBoxRenderer.js
+++ b/src/client/views/nodes/WebBoxRenderer.js
@@ -146,6 +146,29 @@ const ForeignHtmlRenderer = function (styleSheets) {
};
/**
+ * Collects the url() references found in the src descriptors of every @font-face block in a CSS string.
+ * @param {String} cssRuleStr CSS text to scan (the caller passes the cssText of a single rule)
+ * @returns {String[]} the URLs found, or an empty array when no @font-face block matches
+ */
+ const getFontFaceUrlsFromCss = function (cssRuleStr) {
+ const fontFaceUrls = [];
+ // Match whole @font-face blocks; [^}]* stops at the first closing brace
+ const fontFaceBlocks = cssRuleStr.match(/@font-face\s*{[^}]*}/g) || [];
+
+ fontFaceBlocks.forEach(block => {
+ // Grab each src declaration up to (not including) its terminating semicolon
+ const urls = block.match(/src\s*:\s*[^;]*/g) || [];
+ urls.forEach(srcDeclaration => {
+ // Delegate url() extraction to the same helper that is used for ordinary CSS rules
+ const fontUrls = getUrlsFromCssString(srcDeclaration);
+ fontFaceUrls.push(...fontUrls);
+ });
+ });
+
+ return fontFaceUrls;
+ };
+
+ /**
*
* @param {String} html
* @returns {String[]}
@@ -159,6 +182,61 @@ const ForeignHtmlRenderer = function (styleSheets) {
};
/**
+ * Builds CSS declaring two local-only fallback families (src lists local() sources only; generic keywords are not valid there) plus the [data-font-error] selectors that apply them.
+ * @returns {String} CSS text intended to be appended to the collected stylesheet rules
+ */
+ const createFallbackFontFaceRules = function () {
+ return `
+ @font-face {
+ font-family: 'CORS-fallback-serif';
+ src: local('Times New Roman'), local('Georgia');
+ }
+ @font-face {
+ font-family: 'CORS-fallback-sans';
+ src: local('Arial'), local('Helvetica');
+ }
+ /* Add fallback font handling */
+ [data-font-error] {
+ font-family: 'CORS-fallback-sans', sans-serif !important;
+ }
+ [data-font-error="serif"] {
+ font-family: 'CORS-fallback-serif', serif !important;
+ }
+ `;
+ };
+
+ /**
+ * Appends the fallback font rules to the collected CSS and routes absolute font-file URLs through CorsProxy.
+ * @param {String} cssStyles concatenated cssText of the rules read from styleSheets
+ * @returns {String} the augmented CSS text
+ */
+ const optimizeCssForRendering = function (cssStyles) {
+ // Append the fallback font rules after the collected rules
+ const enhanced = cssStyles + createFallbackFontFaceRules();
+
+ // Visit every url(...) that points at an http(s) resource; non-font URLs are returned unchanged
+ let optimized = enhanced.replace(/(url\(['"]?)(https?:\/\/[^)'"]+)(['"]?\))/gi, (match, prefix, url, suffix) => {
+ // Only woff/woff2/ttf/eot/otf files (optionally followed by a query string) are proxied
+ if (url.match(/\.(woff2?|ttf|eot|otf)(\?.*)?$/i)) {
+ return `${prefix}${CorsProxy(url)}${suffix}`;
+ }
+ return match;
+ });
+
+ // NOTE(review): an @font-face rule without font-family/src is presumably ignored by browsers - confirm this block has any effect
+ optimized += `
+ /* Suppress font CORS errors in console */
+ @supports (font-display: swap) {
+ @font-face {
+ font-display: swap !important;
+ }
+ }
+ `;
+
+ return optimized;
+ };
+
+ /**
*
* @param {String} contentHtml
* @param {Number} width
@@ -175,6 +253,7 @@ const ForeignHtmlRenderer = function (styleSheets) {
// copy styles
let cssStyles = '';
const urlsFoundInCss = [];
+ const fontUrlsInCss = [];
for (let i = 0; i < styleSheets.length; i += 1) {
try {
@@ -182,6 +261,7 @@ const ForeignHtmlRenderer = function (styleSheets) {
for (let j = 0; j < rules.length; j += 1) {
const cssRuleStr = rules[j].cssText;
urlsFoundInCss.push(...getUrlsFromCssString(cssRuleStr));
+ fontUrlsInCss.push(...getFontFaceUrlsFromCss(cssRuleStr));
cssStyles += cssRuleStr;
}
} catch (e) {
@@ -189,6 +269,9 @@ const ForeignHtmlRenderer = function (styleSheets) {
}
}
+ // Optimize and enhance CSS
+ cssStyles = optimizeCssForRendering(cssStyles);
+
// const fetchedResourcesFromStylesheets = await getMultipleResourcesAsBase64(webUrl, urlsFoundInCss);
// for (let i = 0; i < fetchedResourcesFromStylesheets.length; i++) {
// const r = fetchedResourcesFromStylesheets[i];
@@ -203,6 +286,26 @@ const ForeignHtmlRenderer = function (styleSheets) {
.replace(/<div class="mediaset"><\/div>/g, '') // when scripting isn't available (ie, rendering web pages here), <noscript> tags should become <div>'s. But for Brown CS, there's a layout problem if you leave the empty <mediaset> tag
.replace(/<link[^>]*>/g, '') // don't need to keep any linked style sheets because we've already processed all style sheets above
.replace(/srcset="([^ "]*)[^"]*"/g, 'src="$1"'); // instead of converting each item in the srcset to a data url, just convert the first one and use that
+
+ // Add script to handle font loading errors
+ contentHtml += `
+ <script>
+ // Handle font loading errors with fallbacks
+ document.addEventListener('DOMContentLoaded', function() {
+ // Mark elements with font issues
+ document.querySelectorAll('*').forEach(function(el) {
+ const style = window.getComputedStyle(el);
+ const fontFamily = style.getPropertyValue('font-family');
+ if (fontFamily && !fontFamily.includes('serif') && !fontFamily.includes('sans')) {
+ el.setAttribute('data-font-error', 'sans');
+ } else if (fontFamily && fontFamily.includes('serif')) {
+ el.setAttribute('data-font-error', 'serif');
+ }
+ });
+ });
+ </script>
+ `;
+
const urlsFoundInHtml = getImageUrlsFromFromHtml(contentHtml).filter(url => !url.startsWith('data:'));
return getMultipleResourcesAsBase64(webUrl, urlsFoundInHtml).then(fetchedResources => {
for (let i = 0; i < fetchedResources.length; i += 1) {
diff --git a/src/client/views/nodes/chatbot/agentsystem/prompts.ts b/src/client/views/nodes/chatbot/agentsystem/prompts.ts
index e551ef830..fcb4ab450 100644
--- a/src/client/views/nodes/chatbot/agentsystem/prompts.ts
+++ b/src/client/views/nodes/chatbot/agentsystem/prompts.ts
@@ -103,9 +103,9 @@ export function getReactPrompt(tools: BaseTool<ReadonlyArray<Parameter>>[], summ
<note>If no external tool is required, use 'no_tool', but if there might be relevant external information, use the appropriate tool.</note>
</tools>
- <summaries>
+ <available_documents>
${summaries()}
- </summaries>
+ </available_documents>
<chat_history>
${chatHistory}
diff --git a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx
index d919b5f7f..34a1ade2e 100644
--- a/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx
+++ b/src/client/views/nodes/chatbot/chatboxcomponents/ChatBox.tsx
@@ -650,16 +650,8 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
citation: JSON.stringify(citation, null, 2),
});
- // Try to find the document
- let doc: Doc | undefined;
-
// First try to find the document using the document manager's chunk ID lookup
- const parentDocId = this.docManager.getDocIdByChunkId(chunkId);
- if (parentDocId) {
- doc = this.docManager.getDocument(parentDocId);
- console.log(`Found document by chunk ID lookup: ${parentDocId}`);
- }
-
+ const doc: Doc | undefined = this.docManager.getDocByChunkId(chunkId);
if (!doc) {
console.warn(`Document not found for citation with chunk_id: ${chunkId}`);
return;
@@ -989,32 +981,13 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
componentWillUnmount() {
this.removeScrollListener();
}
-
- /**
- * Getter that retrieves all linked documents for the current document.
- */
- @computed get linkedDocs(): Doc[] {
- const docIds = this.docManager.listDocs();
- const docs: Doc[] = [];
-
- // Get documents from the document manager using the getDocument method
- docIds.forEach(id => {
- const doc = this.docManager.getDocument(id);
- if (doc) {
- docs.push(doc);
- }
- });
-
- return docs;
- }
-
/**
* Getter that retrieves document IDs of linked documents that have PDF_chunker–parsed content.
*/
@computed
get docIds(): string[] {
// Use the document manager to get all document IDs
- return Array.from(this.docManager.listDocs());
+ return Array.from(this.docManager.docIds);
}
/**
@@ -1023,7 +996,8 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
@computed
get summaries(): string {
// Use the document manager to get all summaries
- return this.docManager.getAllDocumentSummaries();
+ // listDocs yields one JSON descriptor (id, title, type, summary) per managed document
+ return JSON.stringify(this.docManager.listDocs);
}
/**
@@ -1064,7 +1038,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() {
};
retrieveDocIds = (): string[] => {
- return Array.from(this.docManager.listDocs());
+ return Array.from(this.docManager.docIds);
};
/**
diff --git a/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts b/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts
index e6c2421e5..5297292bf 100644
--- a/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts
+++ b/src/client/views/nodes/chatbot/tools/DocumentMetadataTool.ts
@@ -18,13 +18,13 @@ const parameterDefinitions: ReadonlyArray<Parameter> = [
name: 'action',
type: 'string',
required: true,
- description: 'The action to perform: "get" to retrieve metadata, "edit" to modify metadata, "list" to enumerate documents, "getFieldOptions" to retrieve all available field options, or "create" to create a new document',
+ description: 'The action to perform: "get" to retrieve metadata, "edit" to modify metadata, "getFieldOptions" to retrieve all available field options, or "create" to create a new document',
},
{
name: 'documentId',
type: 'string',
required: false,
- description: 'The ID of the document to get or edit metadata for. Required for "edit", optional for "get", ignored for "list", "getFieldOptions", and "create"',
+ description: 'The ID of the document to get or edit metadata for. Required for "edit", optional for "get", ignored for "getFieldOptions", and "create"',
},
{
name: 'fieldEdits',
@@ -68,7 +68,6 @@ This tool provides the following capabilities:
- Get metadata from a specific document
- Edit metadata fields on documents (in either layout or data documents)
- Edit multiple fields at once (useful for updating dependent fields together)
-- List all available documents in the current view
- Retrieve all available field options with metadata (IMPORTANT: always call this before editing)
- Understand which fields are stored where (layout vs data document)
- Get detailed information about all available document fields
@@ -137,8 +136,8 @@ SPECIAL FIELD HANDLING:
- Width/Height: Set layout_autoHeight/layout_autoWidth to false before editing
RECOMMENDED WORKFLOW:
-1. First call action="list" to identify available documents
-2. Then call action="getFieldOptions" to understand available fields
+1. Understand the currently available documents that were provided as <available_documents> in the prompt
+2. Call action="getFieldOptions" to understand available fields
3. Get document metadata with action="get" to see current values
4. Edit fields with action="edit" using proper dependencies
OR
@@ -159,10 +158,6 @@ HANDLING DEPENDENT FIELDS:
- width → layout_autoWidth (set to false to allow manual width)
- Other auto-sizing related properties
-To LIST available documents:
-- Use action="list" to get a simple list of all documents in the current view
-- This is useful when you need to identify documents before getting details or editing them
-
Editing fields follows these rules:
1. First checks if the field exists on the layout document using Doc.Get
2. If it exists on the layout document, it's updated there
@@ -172,7 +167,6 @@ Editing fields follows these rules:
Examples:
- To get field options: { action: "getFieldOptions" }
-- To list all documents: { action: "list" }
- To get all document metadata: { action: "get" }
- To get metadata for a specific document: { action: "get", documentId: "doc123" }
- To edit a single field: { action: "edit", documentId: "doc123", fieldEdits: [{ fieldName: "backgroundColor", fieldValue: "#ff0000" }] }
@@ -186,7 +180,8 @@ Examples:
{ fieldName: "layout_autoHeight", fieldValue: false },
{ fieldName: "height", fieldValue: 200 }
]}
-- IMPORTANT: MULTI STEP WORKFLOWS ARE NOT ONLY ALLOWED BUT ENCOURAGED. TAKE THINGS 1 STEP AT A TIME.`;
+- IMPORTANT: MULTI STEP WORKFLOWS ARE NOT ONLY ALLOWED BUT ENCOURAGED. TAKE THINGS 1 STEP AT A TIME.
+- IMPORTANT: WHEN CITING A DOCUMENT, MAKE THE CHUNK ID THE DOCUMENT ID. WHENEVER YOU CITE A DOCUMENT, ALWAYS MAKE THE CITATION TYPE "text", THE "direct_text" FIELD BLANK, AND THE "chunk_id" FIELD THE DOCUMENT ID.`;
const documentMetadataToolInfo: ToolInfo<DocumentMetadataToolParamsType> = {
name: 'documentMetadata',
description: toolDescription,
@@ -232,11 +227,11 @@ export class DocumentMetadataTool extends BaseTool<DocumentMetadataToolParamsTyp
// Ensure the action is valid and convert to string
const action = String(args.action);
- if (!['get', 'edit', 'list', 'getFieldOptions', 'create'].includes(action)) {
+ if (!['get', 'edit', 'getFieldOptions', 'create'].includes(action)) {
return [
{
type: 'text',
- text: 'Error: Invalid action. Valid actions are "get", "edit", "list", "getFieldOptions", or "create".',
+ text: 'Error: Invalid action. Valid actions are "get", "edit", "getFieldOptions", or "create".',
},
];
}
@@ -386,10 +381,6 @@ export class DocumentMetadataTool extends BaseTool<DocumentMetadataToolParamsTyp
}
}
- case 'list': {
- this._docManager.listDocs();
- }
-
case 'getFieldOptions': {
// Get all available field options with metadata
const fieldOptions = this._docManager.getAllFieldMetadata();
@@ -457,7 +448,7 @@ ${JSON.stringify(createdMetadata, null, 2)}`,
return [
{
type: 'text',
- text: 'Error: Unknown action. Valid actions are "get", "edit", "list", "getFieldOptions", or "create".',
+ text: 'Error: Unknown action. Valid actions are "get", "edit", "getFieldOptions", or "create".',
},
];
}
@@ -537,11 +528,6 @@ ${JSON.stringify(createdMetadata, null, 2)}`,
return true;
}
- // list action doesn't require any additional parameters
- if (params.action === 'list') {
- return true;
- }
-
return true;
}
@@ -552,7 +538,7 @@ ${JSON.stringify(createdMetadata, null, 2)}`,
*/
private getParameterRequirementsByAction(action?: string): string {
if (!action) {
- return 'Please specify an action: "get", "edit", "list", "getFieldOptions", or "create".';
+ return 'Please specify an action: "get", "edit", "getFieldOptions", or "create".';
}
switch (action.toLowerCase()) {
@@ -560,14 +546,12 @@ ${JSON.stringify(createdMetadata, null, 2)}`,
return 'The "get" action accepts an optional documentId parameter.';
case 'edit':
return 'The "edit" action requires documentId and fieldEdits parameters. fieldEdits must be a JSON array of field edits.';
- case 'list':
- return 'The "list" action does not require any additional parameters.';
case 'getFieldOptions':
return 'The "getFieldOptions" action does not require any additional parameters. It returns metadata about all available document fields.';
case 'create':
return 'The "create" action requires title, data, and doc_type parameters.';
default:
- return `Unknown action "${action}". Valid actions are "get", "edit", "list", "getFieldOptions", or "create".`;
+ return `Unknown action "${action}". Valid actions are "get", "edit", "getFieldOptions", or "create".`;
}
}
}
diff --git a/src/client/views/nodes/chatbot/tools/RAGTool.ts b/src/client/views/nodes/chatbot/tools/RAGTool.ts
index ef374ed22..90b803d21 100644
--- a/src/client/views/nodes/chatbot/tools/RAGTool.ts
+++ b/src/client/views/nodes/chatbot/tools/RAGTool.ts
@@ -3,6 +3,7 @@ import { Observation, RAGChunk } from '../types/types';
import { ParametersType, ToolInfo } from '../types/tool_types';
import { Vectorstore } from '../vectorstore/Vectorstore';
import { BaseTool } from './BaseTool';
+import { DocumentMetadataTool } from './DocumentMetadataTool';
const ragToolParams = [
{
@@ -17,7 +18,7 @@ type RAGToolParamsType = typeof ragToolParams;
const ragToolInfo: ToolInfo<RAGToolParamsType> = {
name: 'rag',
- description: 'Performs a RAG (Retrieval-Augmented Generation) search on user documents and returns a set of document chunks (text or images) to provide a grounded response based on user documents.',
+ description: `Performs a RAG (Retrieval-Augmented Generation) search on user documents (only PDF, audio, and video are supported—for information about other document types, use the documentMetadata tool) and returns a set of document chunks (text or images) to provide a grounded response based on user documents.`,
citationRules: `When using the RAG tool, the structure must adhere to the format described in the ReAct prompt. Below are additional guidelines specifically for RAG-based responses:
1. **Grounded Text Guidelines**:
@@ -75,7 +76,7 @@ export class RAGTool extends BaseTool<RAGToolParamsType> {
async getFormattedChunks(relevantChunks: RAGChunk[]): Promise<Observation[]> {
try {
- const { formattedChunks } = await Networking.PostToServer('/formatChunks', { relevantChunks }) as { formattedChunks: Observation[]}
+ const { formattedChunks } = (await Networking.PostToServer('/formatChunks', { relevantChunks })) as { formattedChunks: Observation[] };
if (!formattedChunks) {
throw new Error('Failed to format chunks');
diff --git a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts
index 14cffcb70..c8a6bb16b 100644
--- a/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts
+++ b/src/client/views/nodes/chatbot/utils/AgentDocumentManager.ts
@@ -1,4 +1,4 @@
-import { action, makeObservable, observable, ObservableMap, reaction, runInAction } from 'mobx';
+import { action, computed, makeObservable, observable, ObservableMap, reaction, runInAction } from 'mobx';
import { observer } from 'mobx-react';
import { v4 as uuidv4 } from 'uuid';
import { Doc, StrListCast } from '../../../../../fields/Doc';
@@ -31,7 +31,7 @@ export class AgentDocumentManager {
private chatBox: ChatBox;
private chatBoxDocument: Doc | null = null;
private fieldMetadata: Record<string, any> = {};
- private readonly DOCUMENT_ID_FIELD = '_dash_document_id';
+ @observable private documentIdsFromChunkIds: ObservableMap<string, string>;
/**
* Creates a new DocumentManager
@@ -40,8 +40,17 @@ export class AgentDocumentManager {
constructor(chatBox: ChatBox) {
makeObservable(this);
const agentDoc = DocCast(chatBox.Document.agentDocument) ?? new Doc();
+ const chunkIds = DocCast(agentDoc.chunkIds) ?? new Doc();
+
agentDoc.title = chatBox.Document.title + '_agentDocument';
+ chunkIds.title = '_chunkIds';
chatBox.Document.agentDocument = agentDoc;
+ DocCast(chatBox.Document.agentDocument)!.chunkIds = chunkIds;
+ this.documentIdsFromChunkIds = StrListCast(chunkIds.mapping).reduce((mapping, content) => {
+ const [chunkId, docId] = content.split(':');
+ mapping.set(chunkId, docId);
+ return mapping;
+ }, new ObservableMap<string, string>());
this.documentsById = StrListCast(agentDoc.mapping).reduce((mapping, content) => {
const [id, layoutId, docId] = content.split(':');
const layoutDoc = DocServer.GetCachedRefField(layoutId);
@@ -66,6 +75,19 @@ export class AgentDocumentManager {
}
//{ fireImmediately: true }
);
+ reaction(
+ () => this.documentIdsFromChunkIds.values(),
+ () => {
+ if (this.chatBoxDocument && DocCast(this.chatBoxDocument.agentDocument)) {
+ // Store the mapping with chunkId:docId format for consistency
+ const chunkIdsDoc = DocCast(DocCast(this.chatBoxDocument.agentDocument)!.chunkIds);
+ if (chunkIdsDoc) {
+ chunkIdsDoc.mapping = new List<string>(Array.from(this.documentIdsFromChunkIds.entries()).map(([chunkId, docId]) => `${chunkId}:${docId}`));
+ }
+ }
+ }
+ //{ fireImmediately: true }
+ );
this.processDocument(this.chatBoxDocument);
this.initializeFieldMetadata();
}
@@ -120,7 +142,7 @@ export class AgentDocumentManager {
try {
// Use the LinkManager approach which is proven to work in ChatBox
if (this.chatBoxDocument) {
- console.log('Finding documents linked to ChatBox document with ID:', this.chatBoxDocument.id);
+ console.log('Finding documents linked to ChatBox document with ID:', this.chatBoxDocument[Id]);
// Get directly linked documents via LinkManager
const linkedDocs = LinkManager.Instance.getAllRelatedLinks(this.chatBoxDocument)
@@ -134,57 +156,10 @@ export class AgentDocumentManager {
linkedDocs.forEach((doc: Doc | undefined) => {
if (doc) {
this.processDocument(doc);
- console.log('Processed linked document:', doc.id, doc.title, doc.type);
+ console.log('Processed linked document:', doc[Id], doc.title, doc.type);
}
});
-
- // Include the ChatBox document itself
- this.processDocument(this.chatBoxDocument);
-
- // If we have access to the Document's parent, try to find sibling documents
- if (this.chatBoxDocument.parent) {
- const parent = this.chatBoxDocument.parent;
- console.log('Found parent document, checking for siblings');
-
- // Check if parent is a Doc type and has a childDocs function
- if (parent && typeof parent === 'object' && 'childDocs' in parent && typeof parent.childDocs === 'function') {
- try {
- const siblingDocs = parent.childDocs();
- if (Array.isArray(siblingDocs)) {
- console.log(`Found ${siblingDocs.length} sibling documents via parent.childDocs()`);
- siblingDocs.forEach((doc: Doc) => {
- if (doc) {
- this.processDocument(doc);
- }
- });
- }
- } catch (e) {
- console.warn('Error accessing parent.childDocs:', e);
- }
- }
- }
- } else if (this.chatBox && this.chatBox.linkedDocs) {
- // If we have direct access to the linkedDocs computed property from ChatBox
- console.log('Using ChatBox.linkedDocs directly');
- const linkedDocs = this.chatBox.linkedDocs;
- if (Array.isArray(linkedDocs)) {
- console.log(`Found ${linkedDocs.length} documents via ChatBox.linkedDocs`);
- linkedDocs.forEach((doc: Doc) => {
- if (doc) {
- this.processDocument(doc);
- }
- });
- }
-
- // Process the ChatBox document if available
- if (this.chatBox.Document) {
- this.processDocument(this.chatBox.Document);
- }
- } else {
- console.warn('No ChatBox document reference available for finding linked documents');
}
-
- console.log(`DocumentMetadataTool found ${this.documentsById.size} total documents`);
} catch (error) {
console.error('Error finding documents in Freeform view:', error);
}
@@ -201,6 +176,7 @@ export class AgentDocumentManager {
// Only add if we haven't already processed this document
if (!this.documentsById.has(docId)) {
this.documentsById.set(docId, { layoutDoc: doc, dataDoc: doc[DocData] });
+ console.log('Added document to documentsById:', doc[Id], docId, doc[Id], doc[DocData][Id]);
}
return docId;
}
@@ -213,37 +189,12 @@ export class AgentDocumentManager {
private ensureDocumentId(doc: Doc): string {
let docId: string | undefined;
- // First try to get the ID from our custom field
- if (doc[this.DOCUMENT_ID_FIELD]) {
- docId = String(doc[this.DOCUMENT_ID_FIELD]);
- return docId;
- }
-
- // Try different ways to get a document ID
-
// 1. Try the direct id property if it exists
- if (doc.id && typeof doc.id === 'string') {
- docId = doc.id;
- }
- // 2. Try doc._id if it exists
- else if (doc._id && typeof doc._id === 'string') {
- docId = doc._id;
- }
- // 3. Try doc.data?.id if it exists
- else if (doc.data && typeof doc.data === 'object' && 'id' in doc.data && typeof doc.data.id === 'string') {
- docId = doc.data.id;
- }
- // 4. If none of the above work, generate a UUID
- else {
- docId = uuidv4();
- console.log(`Generated new UUID for document with title: ${doc.title || 'Untitled'}`);
- }
-
- // Store the ID in the document's metadata so it persists
- try {
- doc[this.DOCUMENT_ID_FIELD] = docId;
- } catch (e) {
- console.warn(`Could not assign ID to document property`, e);
+ if (doc[Id]) {
+ console.log('Found document ID (normal):', doc[Id]);
+ docId = doc[Id];
+ } else {
+ throw new Error('No document ID found');
}
return docId;
@@ -256,13 +207,13 @@ export class AgentDocumentManager {
*/
public extractDocumentMetadata(id: string) {
if (!id) return null;
- const doc = this.documentsById.get(id);
- if (!doc) return null;
- const layoutDoc = doc.layoutDoc;
- const dataDoc = doc.dataDoc;
+ const agentDoc = this.documentsById.get(id);
+ if (!agentDoc) return null;
+ const layoutDoc = agentDoc.layoutDoc;
+ const dataDoc = agentDoc.dataDoc;
const metadata: Record<string, any> = {
- id: layoutDoc.dash_document_id || layoutDoc.id || '',
+ id: layoutDoc[Id] || dataDoc[Id] || '',
title: layoutDoc.title || '',
type: layoutDoc.type || '',
fields: {
@@ -355,7 +306,7 @@ export class AgentDocumentManager {
if (value instanceof Doc) {
return {
type: 'Doc',
- id: value.id || this.ensureDocumentId(value),
+ id: value[Id] || this.ensureDocumentId(value),
title: value.title || '',
docType: value.type || '',
};
@@ -1011,33 +962,17 @@ export class AgentDocumentManager {
* Returns a list of all document IDs in the manager.
* @returns An array of document IDs (strings).
*/
- public listDocs(): string[] {
- return Array.from(this.documentsById.keys());
+ @computed
+ public get listDocs(): string[] {
+ // Despite the name, each entry is a JSON descriptor ({ id, title, type, summary }) rather than a bare ID; use docIds for IDs
+ return Array.from(this.documentsById.entries()).map(([id, agentDoc]) =>
+ JSON.stringify({ id, title: agentDoc.layoutDoc.title, type: agentDoc.layoutDoc.type, summary: agentDoc.layoutDoc.summary || 'No summary available for this document.' })
+ );
}
- /**
- * Adds a document with a custom ID to the manager
- * @param doc The document to add
- * @param customId The custom ID to assign to the document
- * @returns The customId that was assigned
- */
- @action
- public addCustomId(doc: Doc, customId: string): string {
- if (!doc) {
- console.error('Cannot add null document with custom ID');
- return '';
- }
-
- // Set the custom ID in the document's metadata
- doc[this.DOCUMENT_ID_FIELD] = customId;
-
- // Store the document in our map
- this.documentsById.set(customId, {
- layoutDoc: doc,
- dataDoc: doc,
- });
-
- return customId;
+ @computed
+ public get docIds(): string[] {
+ return Array.from(this.documentsById.keys());
}
/**
@@ -1078,11 +1013,8 @@ export class AgentDocumentManager {
// Ensure each chunk ID can be linked back to its parent document
// Store a mapping from chunk ID to parent document ID
// This allows us to easily find a document by any of its chunk IDs
- if (!this.documentsById.has(chunkId)) {
- this.documentsById.set(chunkId, {
- layoutDoc: doc,
- dataDoc: docInfo.dataDoc,
- });
+ if (!this.documentIdsFromChunkIds.has(chunkId) && doc) {
+ this.documentIdsFromChunkIds.set(chunkId, doc[Id]);
}
}
}
@@ -1092,11 +1024,25 @@ export class AgentDocumentManager {
* @param chunkId The chunk ID to look up
* @returns The parent document ID if found
*/
- public getDocIdByChunkId(chunkId: string): string | undefined {
- const docInfo = this.documentsById.get(chunkId);
+ public getDocByChunkId(chunkId: string): Doc | undefined {
+ // Resolve the chunk to its parent document ID via the chunkId -> docId map
+ const docId = this.documentIdsFromChunkIds.get(chunkId);
+ if (!docId) {
+ // Whole-document citations are asked to use the document ID as the chunk ID, so retry the lookup by document ID
+ const direct = this.documentsById.get(chunkId);
+ if (!direct) {
+ console.error('No document found for chunkId and docId', chunkId);
+ return undefined;
+ }
+ return direct.layoutDoc;
+ }
+ // Then get the document using the document ID
+ const docInfo = this.documentsById.get(docId);
if (docInfo) {
- return docInfo.layoutDoc[this.DOCUMENT_ID_FIELD] as string;
+ return docInfo.layoutDoc;
}
+ console.error('No document found for docId', docId);
return undefined;
}
@@ -1157,7 +1103,7 @@ export class AgentDocumentManager {
return baseChunk;
}
});
-
+ console.log('simplifiedChunks', simplifiedChunks);
// Update the document with all simplified chunks at once
doc.chunk_simpl = JSON.stringify({ chunks: simplifiedChunks });
@@ -1165,32 +1111,25 @@ export class AgentDocumentManager {
}
/**
- * Gets the simplified chunks from a document
- * @param doc The document to get simplified chunks from
- * @returns Array of simplified chunks or empty array if none exist
+ * Gets a specific simplified chunk by ID
+ * @param doc The document containing chunks
+ * @param chunkId The ID of the chunk to retrieve
+ * @returns The simplified chunk if found, undefined otherwise
*/
- public getSimplifiedChunks(doc: Doc): any[] {
+ public getSimplifiedChunkById(doc: Doc, chunkId: string): any | undefined {
+ let chunks: any[] = [];
if (!doc || !doc.chunk_simpl) {
+ // Nothing to search: return undefined (as the signature promises) rather than a truthy empty array
+ console.warn('No simplified chunks available on the document; cannot resolve chunkId', chunkId);
- return [];
+ return undefined;
}
-
try {
const parsed = JSON.parse(StrCast(doc.chunk_simpl));
- return parsed.chunks || [];
+ chunks = parsed.chunks || [];
} catch (e) {
console.error('Error parsing simplified chunks:', e);
- return [];
+ return undefined;
}
- }
-
- /**
- * Gets a specific simplified chunk by ID
- * @param doc The document containing chunks
- * @param chunkId The ID of the chunk to retrieve
- * @returns The simplified chunk if found, undefined otherwise
- */
- public getSimplifiedChunkById(doc: Doc, chunkId: string): any | undefined {
- const chunks = this.getSimplifiedChunks(doc);
return chunks.find(chunk => chunk.chunkId === chunkId);
}
diff --git a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
index 3df1294e9..1349df483 100644
--- a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
+++ b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
@@ -16,6 +16,7 @@ import { AI_Document, CHUNK_TYPE, RAGChunk } from '../types/types';
import OpenAI from 'openai';
import { Embedding } from 'openai/resources';
import { AgentDocumentManager } from '../utils/AgentDocumentManager';
+import { Id } from '../../../../../fields/FieldSymbols';
dotenv.config();
@@ -24,13 +25,12 @@ dotenv.config();
* and OpenAI text-embedding-3-large for text embedding. It handles AI document management, uploads, and query-based retrieval.
*/
export class Vectorstore {
- private pinecone: Pinecone; // Pinecone client for managing the vector index.
+ private pinecone!: Pinecone; // Pinecone client for managing the vector index.
private index!: Index; // The specific Pinecone index used for document chunks.
- private openai: OpenAI; // OpenAI client for generating embeddings.
+ private openai!: OpenAI; // OpenAI client for generating embeddings.
private indexName: string = 'pdf-chatbot'; // Default name for the index.
- private _id: string; // Unique ID for the Vectorstore instance.
- private docManager: AgentDocumentManager; // Document manager for handling documents
-
+ private _id!: string; // Unique ID for the Vectorstore instance.
+ private docManager!: AgentDocumentManager; // Document manager for handling documents
documents: AI_Document[] = []; // Store the documents indexed in the vectorstore.
/**
@@ -143,10 +143,8 @@ export class Vectorstore {
progressCallback(85, 'Embeddings generated. Finalizing document...');
doc.original_segments = JSON.stringify(typedResponse.full);
- const doc_id = uuidv4();
-
- // Register the document with the AgentDocumentManager
- this.docManager.addCustomId(doc, doc_id);
+ const doc_id = doc[Id];
+ console.log('doc_id in vectorstore', doc_id);
// Generate chunk IDs upfront so we can register them
const chunkIds = segmentedTranscript.map(() => uuidv4());
@@ -191,7 +189,7 @@ export class Vectorstore {
} else {
// Process regular document
console.log('Processing regular document...');
- const createDocumentResponse = await Networking.PostToServer('/createDocument', { file_path: local_file_path });
+ const createDocumentResponse = await Networking.PostToServer('/createDocument', { file_path: local_file_path, doc_id: doc[Id] });
// Type assertion for the response
const { jobId } = createDocumentResponse as { jobId: string };
@@ -211,12 +209,13 @@ export class Vectorstore {
}
}
- // Register the document with the AgentDocumentManager
- this.docManager.addCustomId(doc, result.doc_id);
-
// Collect all chunk IDs
const chunkIds = result.chunks.map(chunk => chunk.id);
+ if (result.doc_id !== doc[Id]) {
+ console.log('doc_id in vectorstore', result.doc_id, 'does not match doc_id in doc', doc[Id]);
+ }
+
// Register chunks with the document manager
this.docManager.registerChunkIds(result.doc_id, chunkIds);
@@ -319,16 +318,14 @@ export class Vectorstore {
const queryEmbedding = queryEmbeddingResponse.data[0].embedding;
- // Get document IDs from the AgentDocumentManager
- const docIds = Array.from(this.docManager.listDocs());
- console.log('Using document IDs for retrieval:', docIds);
+ console.log('Using document IDs for retrieval:', this.docManager.docIds);
// Query the Pinecone index using the embedding and filter by document IDs.
// We'll query based on document IDs that are registered in the document manager
const queryResponse: QueryResponse = await this.index.query({
vector: queryEmbedding,
filter: {
- doc_id: { $in: docIds },
+ doc_id: { $in: this.docManager.docIds },
},
topK,
includeValues: true,
@@ -356,7 +353,7 @@ export class Vectorstore {
// Ensure the document manager knows about this chunk
// This is important for maintaining backwards compatibility
- if (chunk.id && !this.docManager.getDocIdByChunkId(chunk.id)) {
+ if (chunk.id && !this.docManager.getDocByChunkId(chunk.id)) {
// If the chunk ID isn't registered but we have a doc_id in metadata
if (chunk.metadata.doc_id && this.docManager.has(chunk.metadata.doc_id)) {
// Register the chunk with its parent document
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts
index 6d2779163..378f14094 100644
--- a/src/server/ApiManagers/AssistantManager.ts
+++ b/src/server/ApiManagers/AssistantManager.ts
@@ -559,7 +559,7 @@ export default class AssistantManager extends ApiManager {
method: Method.POST,
subscription: '/createDocument',
secureHandler: async ({ req, res }) => {
- const { file_path } = req.body;
+ const { file_path, doc_id } = req.body;
const public_path = path.join(publicDirectory, file_path); // Resolve the file path in the public directory
const file_name = path.basename(file_path); // Extract the file name from the path
@@ -572,7 +572,7 @@ export default class AssistantManager extends ApiManager {
// Spawn the Python process and track its progress/output
// eslint-disable-next-line no-use-before-define
- spawnPythonProcess(jobId, public_path);
+ spawnPythonProcess(jobId, public_path, doc_id);
// Send the job ID back to the client for tracking
res.send({ jobId });
@@ -850,7 +850,7 @@ export default class AssistantManager extends ApiManager {
* @param file_name The name of the file to process.
* @param file_path The filepath of the file to process.
*/
-function spawnPythonProcess(jobId: string, file_path: string) {
+function spawnPythonProcess(jobId: string, file_path: string, doc_id: string) {
const venvPath = path.join(__dirname, '../chunker/venv');
const requirementsPath = path.join(__dirname, '../chunker/requirements.txt');
const pythonScriptPath = path.join(__dirname, '../chunker/pdf_chunker.py');
@@ -860,7 +860,7 @@ function spawnPythonProcess(jobId: string, file_path: string) {
function runPythonScript() {
const pythonPath = process.platform === 'win32' ? path.join(venvPath, 'Scripts', 'python') : path.join(venvPath, 'bin', 'python3');
- const pythonProcess = spawn(pythonPath, [pythonScriptPath, jobId, file_path, outputDirectory]);
+ const pythonProcess = spawn(pythonPath, [pythonScriptPath, jobId, file_path, outputDirectory, doc_id]);
let pythonOutput = '';
let stderrOutput = '';
diff --git a/src/server/chunker/pdf_chunker.py b/src/server/chunker/pdf_chunker.py
index e9b9ef2b3..e34753176 100644
--- a/src/server/chunker/pdf_chunker.py
+++ b/src/server/chunker/pdf_chunker.py
@@ -622,7 +622,7 @@ class Document:
Represents a document being processed, such as a PDF, handling chunking, embedding, and summarization.
"""
- def __init__(self, file_path: str, file_name: str, job_id: str, output_folder: str):
+ def __init__(self, file_path: str, file_name: str, job_id: str, output_folder: str, doc_id: str):
"""
Initialize the Document with file data, file name, and job ID.
@@ -635,7 +635,7 @@ class Document:
self.file_path = file_path
self.job_id = job_id
self.type = self._get_document_type(file_name) # Determine the document type (PDF, CSV, etc.)
- self.doc_id = job_id # Use the job ID as the document ID
+ self.doc_id = doc_id # Use the caller-supplied document ID (no longer the job ID)
self.chunks = [] # List to hold text and visual chunks
self.num_pages = 0 # Number of pages in the document (if applicable)
self.summary = "" # The generated summary for the document
@@ -755,7 +755,7 @@ class Document:
"doc_id": self.doc_id
}, indent=2) # Convert the document's attributes to JSON format
-def process_document(file_path, job_id, output_folder):
+def process_document(file_path, job_id, output_folder, doc_id):
"""
Top-level function to process a document and return the JSON output.
@@ -763,26 +763,27 @@ def process_document(file_path, job_id, output_folder):
:param job_id: The job ID for this document processing task.
:return: The processed document's data in JSON format.
"""
- new_document = Document(file_path, file_path, job_id, output_folder)
+ new_document = Document(file_path, file_path, job_id, output_folder, doc_id)
return new_document.to_json()
def main():
"""
Main entry point for the script, called with arguments from Node.js.
"""
- if len(sys.argv) != 4:
+ if len(sys.argv) != 5:
print(json.dumps({"error": "Invalid arguments"}), file=sys.stderr)
return
job_id = sys.argv[1]
file_path = sys.argv[2]
output_folder = sys.argv[3] # Get the output folder from arguments
+ doc_id = sys.argv[4]
try:
os.makedirs(output_folder, exist_ok=True)
# Process the document
- document_result = process_document(file_path, job_id, output_folder) # Pass output_folder
+ document_result = process_document(file_path, job_id, output_folder,doc_id) # Pass output_folder and doc_id
# Output the final result as JSON to stdout
print(document_result)