diff options
-rw-r--r-- | package-lock.json | 153 | ||||
-rw-r--r-- | package.json | 3 | ||||
-rw-r--r-- | src/client/views/nodes/ChatBox/Agent.ts | 26 | ||||
-rw-r--r-- | src/client/views/nodes/ChatBox/ChatBox.tsx | 186 | ||||
-rw-r--r-- | src/client/views/nodes/ChatBox/ChunkManager.ts | 24 | ||||
-rw-r--r-- | src/client/views/nodes/ChatBox/prompts.ts | 420 | ||||
-rw-r--r-- | src/client/views/nodes/ChatBox/tools/BaseTool.ts | 19 | ||||
-rw-r--r-- | src/client/views/nodes/ChatBox/tools/DataAnalysisTool.ts | 17 | ||||
-rw-r--r-- | src/client/views/nodes/ChatBox/tools/SearchTool.ts | 47 | ||||
-rw-r--r-- | src/client/views/nodes/ChatBox/tools/WebsiteInfoScraperTool.ts | 35 | ||||
-rw-r--r-- | src/client/views/nodes/ChatBox/tools/WikipediaTool.ts | 10 | ||||
-rw-r--r-- | src/client/views/nodes/ChatBox/types.ts | 15 | ||||
-rw-r--r-- | src/client/views/nodes/ChatBox/vectorstore/Vectorstore.ts | 35 | ||||
-rw-r--r-- | src/server/ApiManagers/AssistantManager.ts | 97 |
14 files changed, 806 insertions, 281 deletions
diff --git a/package-lock.json b/package-lock.json index 0a485dcb7..c73470775 100644 --- a/package-lock.json +++ b/package-lock.json @@ -74,6 +74,7 @@ "bson": "^6.2.0", "canvas": "^2.11.2", "chart.js": "^4.4.0", + "cheerio": "^1.0.0", "child_process": "^1.0.2", "class-transformer": "^0.5.1", "cohere-ai": "^7.10.6", @@ -306,7 +307,7 @@ "eslint-plugin-react": "^7.34.1", "eslint-plugin-react-hooks": "^4.6.0", "globals": "^15.1.0", - "jsdom": "^24.0.0", + "jsdom": "^24.1.1", "mocha": "^10.2.0", "prettier": "^3.1.0", "scss-loader": "0.0.1", @@ -16599,6 +16600,79 @@ "node": ">= 16" } }, + "node_modules/cheerio": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.0.0.tgz", + "integrity": "sha512-quS9HgjQpdaXOvsZz82Oz7uxtXiy6UIsIQcpBj7HRw2M63Skasm9qlDocAM7jNuaxdhpPU7c4kJN+gA5MCu4ww==", + "dependencies": { + "cheerio-select": "^2.1.0", + "dom-serializer": "^2.0.0", + "domhandler": "^5.0.3", + "domutils": "^3.1.0", + "encoding-sniffer": "^0.2.0", + "htmlparser2": "^9.1.0", + "parse5": "^7.1.2", + "parse5-htmlparser2-tree-adapter": "^7.0.0", + "parse5-parser-stream": "^7.1.2", + "undici": "^6.19.5", + "whatwg-mimetype": "^4.0.0" + }, + "engines": { + "node": ">=18.17" + }, + "funding": { + "url": "https://github.com/cheeriojs/cheerio?sponsor=1" + } + }, + "node_modules/cheerio-select": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/cheerio-select/-/cheerio-select-2.1.0.tgz", + "integrity": "sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g==", + "dependencies": { + "boolbase": "^1.0.0", + "css-select": "^5.1.0", + "css-what": "^6.1.0", + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3", + "domutils": "^3.0.1" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, + "node_modules/cheerio-select/node_modules/css-select": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/css-select/-/css-select-5.1.0.tgz", + "integrity": "sha512-nwoRF1rvRRnnCqqY7updORDsuqKzqYJ28+oSMaJMMgOauh3fvwHqMS7EZpIPqK8GL+g9mKxF1vP/ZjSeNjEVHg==", + "dependencies": { + "boolbase": "^1.0.0", + "css-what": "^6.1.0", + "domhandler": "^5.0.2", + "domutils": "^3.0.1", + "nth-check": "^2.0.1" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, + "node_modules/cheerio/node_modules/htmlparser2": { + "version": "9.1.0", + "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-9.1.0.tgz", + "integrity": "sha512-5zfg6mHUoaer/97TxnGpxmbR7zJtPwIYFMZ/H5ucTlPZhKvtum05yiPK3Mgai3a0DyVxv7qYqoweaEd2nrYQzQ==", + "funding": [ + "https://github.com/fb55/htmlparser2?sponsor=1", + { + "type": "github", + "url": "https://github.com/sponsors/fb55" + } + ], + "dependencies": { + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3", + "domutils": "^3.1.0", + "entities": "^4.5.0" + } + }, "node_modules/child_process": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/child_process/-/child_process-1.0.2.tgz", @@ -18866,6 +18940,29 @@ "iconv-lite": "^0.6.2" } }, + "node_modules/encoding-sniffer": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/encoding-sniffer/-/encoding-sniffer-0.2.0.tgz", + "integrity": "sha512-ju7Wq1kg04I3HtiYIOrUrdfdDvkyO9s5XM8QAj/bN61Yo/Vb4vgJxy5vi4Yxk01gWHbrofpPtpxM8bKger9jhg==", + "dependencies": { + "iconv-lite": "^0.6.3", + "whatwg-encoding": "^3.1.1" + }, + "funding": { + "url": "https://github.com/fb55/encoding-sniffer?sponsor=1" + } + }, + "node_modules/encoding-sniffer/node_modules/iconv-lite": { + "version": "0.6.3", + "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz", + "integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==", + "dependencies": { + "safer-buffer": ">= 2.1.2 < 3.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/encoding/node_modules/iconv-lite": { "version": "0.6.3", "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz", @@ -24992,9 +25089,9 @@ "integrity": "sha512-UVU9dibq2JcFWxQPA6KCqj5O42VOmAY3zQUfEKxU0KpTGXwNoCjkX1e13eHNvw/xPynt6pU0rZ1htjWTNTSXsg==" }, "node_modules/jsdom": { - "version": "24.1.0", - "resolved": "https://registry.npmjs.org/jsdom/-/jsdom-24.1.0.tgz", - "integrity": "sha512-6gpM7pRXCwIOKxX47cgOyvyQDN/Eh0f1MeKySBV2xGdKtqJBLj8P25eY3EVCWo2mglDDzozR2r2MW4T+JiNUZA==", + "version": "24.1.1", + "resolved": "https://registry.npmjs.org/jsdom/-/jsdom-24.1.1.tgz", + "integrity": "sha512-5O1wWV99Jhq4DV7rCLIoZ/UIhyQeDR7wHVyZAHAshbrvZsLs+Xzz7gtwnlJTJDjleiTKh54F4dXrX70vJQTyJQ==", "dev": true, "dependencies": { "cssstyle": "^4.0.1", @@ -25003,11 +25100,11 @@ "form-data": "^4.0.0", "html-encoding-sniffer": "^4.0.0", "http-proxy-agent": "^7.0.2", - "https-proxy-agent": "^7.0.4", + "https-proxy-agent": "^7.0.5", "is-potential-custom-element-name": "^1.0.1", - "nwsapi": "^2.2.10", + "nwsapi": "^2.2.12", "parse5": "^7.1.2", - "rrweb-cssom": "^0.7.0", + "rrweb-cssom": "^0.7.1", "saxes": "^6.0.0", "symbol-tree": "^3.2.4", "tough-cookie": "^4.1.4", @@ -25016,7 +25113,7 @@ "whatwg-encoding": "^3.1.1", "whatwg-mimetype": "^4.0.0", "whatwg-url": "^14.0.0", - "ws": "^8.17.0", + "ws": "^8.18.0", "xml-name-validator": "^5.0.0" }, "engines": { @@ -35598,9 +35695,9 @@ } }, "node_modules/nwsapi": { - "version": "2.2.10", - "resolved": "https://registry.npmjs.org/nwsapi/-/nwsapi-2.2.10.tgz", - "integrity": "sha512-QK0sRs7MKv0tKe1+5uZIQk/C8XGza4DAnztJG8iD+TpJIORARrCxczA738awHrZoHeTjSSoHqao2teO0dC/gFQ==", + "version": "2.2.12", + "resolved": "https://registry.npmjs.org/nwsapi/-/nwsapi-2.2.12.tgz", + "integrity": "sha512-qXDmcVlZV4XRtKFzddidpfVP4oMSGhga+xdMc25mv8kaLUHtgzCDhUxkrN8exkGdTlLNaXj7CV3GtON7zuGZ+w==", "dev": true }, "node_modules/oauth": { @@ -36157,6 +36254,29 @@ "url": "https://github.com/inikulin/parse5?sponsor=1" } }, + "node_modules/parse5-htmlparser2-tree-adapter": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/parse5-htmlparser2-tree-adapter/-/parse5-htmlparser2-tree-adapter-7.0.0.tgz", + "integrity": "sha512-B77tOZrqqfUfnVcOrUvfdLbz4pu4RopLD/4vmu3HUPswwTA8OH0EMW9BlWR2B0RCoiZRAHEUu7IxeP1Pd1UU+g==", + "dependencies": { + "domhandler": "^5.0.2", + "parse5": "^7.0.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, + "node_modules/parse5-parser-stream": { + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/parse5-parser-stream/-/parse5-parser-stream-7.1.2.tgz", + "integrity": "sha512-JyeQc9iwFLn5TbvvqACIF/VXG6abODeB3Fwmv/TGdLk2LfbWkaySGY72at4+Ty7EkPZj854u4CrICqNk2qIbow==", + "dependencies": { + "parse5": "^7.0.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, "node_modules/parseley": { "version": "0.12.1", "resolved": "https://registry.npmjs.org/parseley/-/parseley-0.12.1.tgz", @@ -42277,6 +42397,14 @@ "resolved": "https://registry.npmjs.org/undefsafe/-/undefsafe-2.0.5.tgz", "integrity": "sha512-WxONCrssBM8TSPRqN5EmsjVrsv4A8X12J4ArBiiayv3DyyG3ZlIg6yysuuSYdZsVz3TKcTg2fd//Ujd4CHV1iA==" }, + "node_modules/undici": { + "version": "6.19.7", + "resolved": "https://registry.npmjs.org/undici/-/undici-6.19.7.tgz", + "integrity": "sha512-HR3W/bMGPSr90i8AAp2C4DM3wChFdJPLrWYpIS++LxS8K+W535qftjt+4MyjNYHeWabMj1nvtmLIi7l++iq91A==", + "engines": { + "node": ">=18.17" + } + }, "node_modules/undici-types": { "version": "5.26.5", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", @@ -43380,7 +43508,6 @@ "version": "3.1.1", "resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz", "integrity": "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==", - "dev": true, "dependencies": { "iconv-lite": "0.6.3" }, @@ -43392,7 +43519,6 @@ "version": "0.6.3", "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz", "integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==", - "dev": true, "dependencies": { "safer-buffer": ">= 2.1.2 < 3.0.0" }, @@ -43409,7 +43535,6 @@ "version": "4.0.0", "resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-4.0.0.tgz", "integrity": "sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==", - "dev": true, "engines": { "node": ">=18" } diff --git a/package.json b/package.json index 4e9946e2e..e285205dd 100644 --- a/package.json +++ b/package.json @@ -81,7 +81,7 @@ "eslint-plugin-react": "^7.34.1", "eslint-plugin-react-hooks": "^4.6.0", "globals": "^15.1.0", - "jsdom": "^24.0.0", + "jsdom": "^24.1.1", "mocha": "^10.2.0", "prettier": "^3.1.0", "scss-loader": "0.0.1", @@ -159,6 +159,7 @@ "bson": "^6.2.0", "canvas": "^2.11.2", "chart.js": "^4.4.0", + "cheerio": "^1.0.0", "child_process": "^1.0.2", "class-transformer": "^0.5.1", "cohere-ai": "^7.10.6", diff --git a/src/client/views/nodes/ChatBox/Agent.ts b/src/client/views/nodes/ChatBox/Agent.ts index 69b83c1b5..825cd831b 100644 --- a/src/client/views/nodes/ChatBox/Agent.ts +++ b/src/client/views/nodes/ChatBox/Agent.ts @@ -10,6 +10,11 @@ import { Vectorstore } from './vectorstore/Vectorstore'; import { ChatCompletionAssistantMessageParam, ChatCompletionMessageParam } from 'openai/resources'; import dotenv from 'dotenv'; import { ChatBox } from './ChatBox'; +import { DataAnalysisTool } from './tools/DataAnalysisTool'; +import { string } from 'cohere-ai/core/schemas'; +import { WebsiteInfoScraperTool } from './tools/WebsiteInfoScraperTool'; +import { SearchTool } from './tools/SearchTool'; +import { add } from 'lodash'; dotenv.config(); export class Agent { @@ -20,17 +25,22 @@ export class Agent { private vectorstore: Vectorstore; private _history: () => string; private _summaries: () => string; + private _csvData: () => { filename: string; id: string; text: string }[]; - constructor(_vectorstore: Vectorstore, summaries: () => string, history: () => string) { + constructor(_vectorstore: Vectorstore, summaries: () => string, history: () => string, csvData: () => { filename: string; id: string; text: string }[], addLinkedUrlDoc: (url: string, id: string) => void) { console.log(process.env.OPENAI_KEY); this.client = new OpenAI({ apiKey: process.env.OPENAI_KEY, dangerouslyAllowBrowser: true }); this.vectorstore = _vectorstore; this._history = history; this._summaries = summaries; + this._csvData = csvData; this.tools = { - wikipedia: new WikipediaTool(), + //wikipedia: new WikipediaTool(addLinkedUrlDoc), calculate: new CalculateTool(), rag: new RAGTool(this.vectorstore), + dataAnalysis: new DataAnalysisTool(csvData), + websiteInfoScraper: new WebsiteInfoScraperTool(addLinkedUrlDoc), + searchTool: new SearchTool(addLinkedUrlDoc), no_tool: new NoTool(), }; } @@ -44,13 +54,13 @@ export class Agent { console.log(`System prompt: ${systemPrompt}`); this.interMessages = [{ role: 'system', content: systemPrompt }]; - this.interMessages.push({ role: 'user', content: `<step0 role="user"><query>${question}</query></step>` }); + this.interMessages.push({ role: 'user', content: `<step1 role="user"><query>${question}</query></step>` }); const parser = new XMLParser(); const builder = new XMLBuilder(); let currentAction: string | undefined; - for (let i = 1; i < maxTurns; i++) { + for (let i = 3; i < maxTurns; i += 2) { console.log(`Turn ${i}/${maxTurns}`); const result = await this.execute(); @@ -74,12 +84,10 @@ export class Agent { currentAction = step[key] as string; console.log(`Action: ${currentAction}`); if (this.tools[currentAction]) { - i++; - console.log(builder.build({ action_rules: this.tools[currentAction].getActionRule(true) })); const nextPrompt = [ { type: 'text', - text: `<step${i} role="user">` + builder.build({ action_rules: this.tools[currentAction].getActionRule(true) }) + `<\step>`, + text: `<step${i} role="user">` + builder.build({ action_rules: this.tools[currentAction].getActionRule() }) + `<\step>`, }, ]; this.interMessages.push({ role: 'user', content: nextPrompt }); @@ -87,7 +95,6 @@ export class Agent { break; } else { console.log('Error: No valid action'); - i++; this.interMessages.push({ role: 'user', content: `<step${i}>No valid action, try again.</step>` }); break; } @@ -101,8 +108,7 @@ export class Agent { // const rootTagName = stepElement.tagName; // const match = rootTagName.match(/step(\d+)/); // const currentStep = match ? parseInt(match[1]) + 1 : 1; - i++; - const nextPrompt = [{ type: 'text', text: `<step${i}<observation>` }, ...observation, { type: 'text', text: '</observation></step>' }]; + const nextPrompt = [{ type: 'text', text: `<step${i}> <observation>` }, ...observation, { type: 'text', text: '</observation></step>' }]; console.log(observation); this.interMessages.push({ role: 'user', content: nextPrompt }); break; diff --git a/src/client/views/nodes/ChatBox/ChatBox.tsx b/src/client/views/nodes/ChatBox/ChatBox.tsx index 56c1e37f8..8d09cde1e 100644 --- a/src/client/views/nodes/ChatBox/ChatBox.tsx +++ b/src/client/views/nodes/ChatBox/ChatBox.tsx @@ -11,7 +11,7 @@ import { ViewBoxAnnotatableComponent } from '../../DocComponent'; import { FieldView, FieldViewProps } from '../FieldView'; import './ChatBox.scss'; import MessageComponentBox from './MessageComponent'; -import { ASSISTANT_ROLE, AssistantMessage, AI_Document, Citation, CHUNK_TYPE, RAGChunk, getChunkType, TEXT_TYPE } from './types'; +import { ASSISTANT_ROLE, AssistantMessage, AI_Document, Citation, CHUNK_TYPE, RAGChunk, getChunkType, TEXT_TYPE, SimplifiedChunk } from './types'; import { Vectorstore } from './vectorstore/Vectorstore'; import { Agent } from './Agent'; import dotenv from 'dotenv'; @@ -19,6 +19,8 @@ import { DocData, DocViews } from '../../../../fields/DocSymbols'; import { AnswerParser } from './AnswerParser'; import { DocumentManager } from '../../../util/DocumentManager'; import { v4 as uuidv4 } from 'uuid'; +import { chunk } from 'lodash'; +import { DocUtils } from '../../../documents/DocUtils'; dotenv.config(); @@ -32,6 +34,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { @observable expandedScratchpadIndex: number | null = null; @observable inputValue: string = ''; @observable private linked_docs_to_add: ObservableSet<Doc> = observable.set(); + @observable private linked_csv_files: { filename: string; id: string; text: string }[] = []; private openai: OpenAI; private vectorstore_id: string; private documents: AI_Document[] = []; @@ -55,7 +58,7 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { this.vectorstore_id = StrCast(this.dataDoc.vectorstore_id); } this.vectorstore = new Vectorstore(this.vectorstore_id, this.retrieveDocIds); - this.agent = new Agent(this.vectorstore, this.retrieveSummaries, this.retrieveFormattedHistory); + this.agent = new Agent(this.vectorstore, this.retrieveSummaries, this.retrieveFormattedHistory, this.retrieveCSVData, this.addLinkedUrlDoc); reaction( () => this.history.map((msg: AssistantMessage) => ({ role: msg.role, content: msg.content, follow_up_questions: msg.follow_up_questions, citations: msg.citations })), @@ -71,6 +74,52 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { }; @action + addCSVForAnalysis = async (newLinkedDoc: Doc) => { + console.log('adding csv file for analysis'); + if (!newLinkedDoc.chunk_simpl) { + const csvData: string = StrCast(newLinkedDoc.text); + console.log('CSV Data:', csvData); + const completion = await this.openai.chat.completions.create({ + messages: [ + { + role: 'system', + content: + 'You are an AI assistant tasked with summarizing the content of a CSV file. You will be provided with the data from the CSV file and your goal is to generate a concise summary that captures the main themes, trends, and key points represented in the data.', + }, + { + role: 'user', + content: `Please provide a comprehensive summary of the CSV file based on the provided data. Ensure the summary highlights the most important information, patterns, and insights. Your response should be in paragraph form and be concise. + + CSV Data: + + ${csvData} + + ********** + Summary:`, + }, + ], + model: 'gpt-3.5-turbo', + }); + console.log('CSV Data:', csvData); + const csvId = uuidv4(); + + this.linked_csv_files.push({ + filename: CsvCast(newLinkedDoc.data).url.pathname, + id: csvId, + text: csvData, + }); + + console.log(this.linked_csv_files); + const chunkToAdd = { + chunkId: csvId, + chunkType: CHUNK_TYPE.CSV, + }; + newLinkedDoc.chunk_simpl = JSON.stringify({ chunks: [chunkToAdd] }); + newLinkedDoc.summary = completion.choices[0].message.content!; + } + }; + + @action toggleToolLogs = (index: number) => { this.expandedScratchpadIndex = this.expandedScratchpadIndex === index ? null : index; }; @@ -132,61 +181,77 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { }; @action - handleCitationClick = (citation: Citation) => { - console.log('Citation clicked:', citation); - const currentLinkedDocs: Doc[] = this.linkedDocs; - const chunk_id = citation.chunk_id; - for (let doc of currentLinkedDocs) { - if (doc.chunk_simpl) { - //console.log(JSON.parse(StrCast(doc.chunk_simpl))); - const doc_chunk_simpl = JSON.parse(StrCast(doc.chunk_simpl)); - console.log(doc_chunk_simpl); - const text_chunks = doc_chunk_simpl.text_chunks as [{ chunk_id: string; start_page: number; end_page: number }] | []; - const image_chunks = doc_chunk_simpl.image_chunks as [{ chunk_id: string; location: string; page: number }] | []; - - const found_text_chunk = text_chunks.find(chunk => chunk.chunk_id === chunk_id); - if (found_text_chunk) { - const doc_url = CsvCast(doc.data, PDFCast(doc.data)).url.pathname; - console.log('URL: ' + doc_url); - - //const ai_field_id = doc[this.Document[Id] + '_ai_field_id']; - DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => { - console.log(doc.data); - //look at context path for each docview and choose the doc view that has as - //its parent the same collection view the chatbox is in - const first_view = Array.from(doc[DocViews])[0]; - first_view.ComponentView?.search?.(citation.direct_text); - }); - } + addLinkedUrlDoc = async (url: string, id: string) => { + const doc = Docs.Create.WebDocument(url); - const found_image_chunk = image_chunks.find(chunk => chunk.chunk_id === chunk_id); - if (found_image_chunk) { - const location_string: string = found_image_chunk.location; + const linkDoc = Docs.Create.LinkDocument(this.Document, doc); + LinkManager.Instance.addLink(linkDoc); - // Extract variables from location_string - const values = location_string.replace(/[\[\]]/g, '').split(','); - - // Ensure we have exactly 4 values - if (values.length !== 4) { - console.error('Location string must contain exactly 4 numbers'); - return; // or handle this error as appropriate - } + const chunkToAdd = { + chunkId: id, + chunkType: CHUNK_TYPE.URL, + }; - const x1 = parseFloat(values[0]) * Doc.NativeWidth(doc); - const y1 = parseFloat(values[1]) * Doc.NativeHeight(doc); - const x2 = parseFloat(values[2]) * Doc.NativeWidth(doc); - const y2 = parseFloat(values[3]) * Doc.NativeHeight(doc); + doc.chunk_simpl = JSON.stringify({ chunks: [chunkToAdd] }); + }; - const annotationKey = Doc.LayoutFieldKey(doc) + '_annotations'; + @action + handleCitationClick = (citation: Citation) => { + console.log('Citation clicked:', citation); + const currentLinkedDocs: Doc[] = this.linkedDocs; - const existingDoc = DocListCast(doc[DocData][annotationKey]).find(d => d.citation_id === citation.citation_id); - const highlight_doc = existingDoc ?? this.createImageCitationHighlight(x1, y1, x2, y2, citation, annotationKey, doc); + const chunkId = citation.chunk_id; - DocumentManager.Instance.showDocument(highlight_doc, { willZoomCentered: true }, () => {}); + for (let doc of currentLinkedDocs) { + if (doc.chunk_simpl) { + const docChunkSimpl = JSON.parse(StrCast(doc.chunk_simpl)) as { chunks: SimplifiedChunk[] }; + console.log(docChunkSimpl); + const foundChunk = docChunkSimpl.chunks.find(chunk => chunk.chunkId === chunkId); + if (foundChunk) { + switch (getChunkType(foundChunk.chunkType)) { + case CHUNK_TYPE.IMAGE: + const values = foundChunk.location?.replace(/[\[\]]/g, '').split(','); + + if (values?.length !== 4) { + console.error('Location string must contain exactly 4 numbers'); + return; + } + + const x1 = parseFloat(values[0]) * Doc.NativeWidth(doc); + const y1 = parseFloat(values[1]) * Doc.NativeHeight(doc); + const x2 = parseFloat(values[2]) * Doc.NativeWidth(doc); + const y2 = parseFloat(values[3]) * Doc.NativeHeight(doc); + + const annotationKey = Doc.LayoutFieldKey(doc) + '_annotations'; + + const existingDoc = DocListCast(doc[DocData][annotationKey]).find(d => d.citation_id === citation.citation_id); + const highlightDoc = existingDoc ?? this.createImageCitationHighlight(x1, y1, x2, y2, citation, annotationKey, doc); + + DocumentManager.Instance.showDocument(highlightDoc, { willZoomCentered: true }, () => {}); + break; + case CHUNK_TYPE.TEXT: + DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => { + const firstView = Array.from(doc[DocViews])[0]; + firstView.ComponentView?.search?.(citation.direct_text); + }); + break; + case CHUNK_TYPE.URL: + DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => { + const firstView = Array.from(doc[DocViews])[0]; + }); + break; + case CHUNK_TYPE.CSV: + DocumentManager.Instance.showDocument(doc, { willZoomCentered: true }, () => { + const firstView = Array.from(doc[DocViews])[0]; + }); + break; + default: + console.log('Chunk type not supported'); + break; + } } } } - // You can implement additional functionality here, such as showing a modal with the full citation content }; createImageCitationHighlight = (x1: number, y1: number, x2: number, y2: number, citation: Citation, annotationKey: string, pdfDoc: Doc): Doc => { @@ -247,7 +312,11 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { runInAction(() => { this.isUploadingDocs = true; }); - this.addDocToVectorstore(change.newValue); + if (PDFCast(change.newValue.data)) { + this.addDocToVectorstore(change.newValue); + } else if (CsvCast(change.newValue.data)) { + this.addCSVForAnalysis(change.newValue); + } runInAction(() => { this.isUploadingDocs = false; }); @@ -283,12 +352,25 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { .map(d => DocCast(d?.annotationOn, d)) .filter(d => d) .filter(d => d.summary) - .map((doc, index) => `${index + 1}) ${doc.summary}`) + .map((doc, index) => { + if (PDFCast(doc.data)) { + return `<summary file_name="${PDFCast(doc.data).url.pathname}" applicable_tools=["rag"]>${doc.summary}</summary>`; + } else if (CsvCast(doc.data)) { + return `<summary file_name="${CsvCast(doc.data).url.pathname}" applicable_tools=["dataAnalysis"]>${doc.summary}</summary>`; + } else { + return `${index + 1}) ${doc.summary}`; + } + }) .join('\n') + '\n' ); } @computed + get linkedCSVs(): { filename: string; id: string; text: string }[] { + return this.linked_csv_files; + } + + @computed get formattedHistory(): string { let history = '<chat_history>\n'; for (const message of this.history) { @@ -302,6 +384,10 @@ export class ChatBox extends ViewBoxAnnotatableComponent<FieldViewProps>() { return this.summaries; }; + retrieveCSVData = () => { + return this.linkedCSVs; + }; + retrieveFormattedHistory = () => { return this.formattedHistory; }; diff --git a/src/client/views/nodes/ChatBox/ChunkManager.ts b/src/client/views/nodes/ChatBox/ChunkManager.ts new file mode 100644 index 000000000..64c073640 --- /dev/null +++ b/src/client/views/nodes/ChatBox/ChunkManager.ts @@ -0,0 +1,24 @@ +import { SimplifiedChunk } from './types'; + +class ChunkManager { + private chunks: SimplifiedChunk[]; + + constructor() { + this.chunks = []; + } + + addChunk(chunk: SimplifiedChunk) { + this.chunks.push(chunk); + } + + removeChunk(chunk: SimplifiedChunk) { + const index = this.chunks.indexOf(chunk); + if (index !== -1) { + this.chunks.splice(index, 1); + } + } + + getChunks() { + return this.chunks; + } +} diff --git a/src/client/views/nodes/ChatBox/prompts.ts b/src/client/views/nodes/ChatBox/prompts.ts index d520a7b7d..2d04595bc 100644 --- a/src/client/views/nodes/ChatBox/prompts.ts +++ b/src/client/views/nodes/ChatBox/prompts.ts @@ -3,169 +3,263 @@ import { Tool } from './types'; export function getReactPrompt(tools: Tool[], summaries: () => string, chatHistory: string): string { - const toolDescriptions = tools.map(tool => `${tool.name}:\n${tool.briefSummary}`).join('\n*****\n'); - - return ` -<step1 type="system"> - You are an advanced AI assistant with access to various tools. Your task is to answer user queries accurately and efficiently. Follow these instructions meticulously: - - 1. Operation Loop: - You operate in a loop of Thought, Action, (STOP), *Action Rules*, Action Input, (STOP), *Observation*, and Answer. Each iteration is numbered (step1, step2, etc.). - - 2. Response Structure: - a. Enclose each step in numbered XML tags: <step1>, <step2>, etc. (the number of steps will vary for each situation) - b. Within each step, use the following tags as needed: - <thought> - Your reasoning process - <action> - The tool you choose to use - <action_input> - Parameters for the chosen tool - <answer> - Your final response (only in the last step) - c. Stop after <action> and <action_input> tags for system processing. You will receive a user response after each time you stop in the form of either action rules or an observation. - - 3. Detailed Process: - a. Analyze the user's query carefully. - b. Determine if a tool is necessary or if you can answer directly. - c. If a tool is needed: - c.1) Select the most appropriate tool. - c.2) Use <action> to specify the tool. - c.3) End your response in order for action rules to be provided. - c.4) Based on the action rules, provide tool parameters in <action_input>. - c.5) End your response again and wait for the observation from the tool. - d. If no tool is needed, use the 'no_tool' action but still follow the same structure. - e. !!!Use the retrieval (RAG) tool ANYTIME the question may potentially (even if you are not sure) relate to one of the user's documents. Here are the summaries of the user's documents: - ${summaries()} - f. Based on observations or your knowledge, formulate your answer. - g. Provide the final answer in the <answer> tag, including user-perspective follow-up questions. - - 4. Available Tools: - ${toolDescriptions} - no_tool: Use when no external tool is required to answer the question. If a user document may relate to the query, do not use, and instead, use the RAG tool (even if unsure). - - 5. !!!Critical Rules!!!: - - Use tools ONLY when absolutely necessary for accurate answers (except when query may relate to user documents, then use RAG tool ALWAYS to start). - - Only provide one step at a time and only write assistant steps. Do not skip steps. Do not provide multiple steps at once. Decide the step that you will provide based on previous steps taken. - - Ensure ALL XML is valid, properly nested, and complete. - - ALWAYS stop after <action> and <action_input> tags. - - If the initial answer is inadequate, iterate through additional steps to refine it. - - Utilize context from past conversations when relevant (provided in Chat History). - - ALWAYS include your final response within a single <answer> tag. - - 6. Answer Format: - Your final <answer> tag must contain: - - The complete answer to the user's query. - - An array of EXACTLY 3 follow-up questions (WRITTEN IN THE PERSPECTIVE OF THE USER ASKING A FOLLOW-UP QUESTION) within <follow_up_questions> tags. - - 7. Example Interaction (YOU ONLY OUTPUT THE ASSISTANT STEPS): - SYSTEM: - <step1 role="system"> - ***SYSTEM MESSAGE ELIDED*** - </step1> - - USER: - <step2 role="user"> - <query>What is the population of Tokyo, and how does it compare to New York City?</query> - </step2> - - ASSISTANT: - <step3 role="assistant"> - <thought>To answer this question accurately, I need to look up the current population figures for both Tokyo and New York City. I'll use the Wikipedia tool for this information.</thought> - <action>wikipedia</action> - </step3> - - USER: - <step4 role="user"> - <action_rules> - { - "wikipedia": { - "name": "wikipedia", - "description": "Search Wikipedia and return a summary", - "parameters": [ - { - "title": { - "type": "string", - "description": "The title of the Wikipedia article to search", - "required": "true" - } - } - ] - } - } - </action_rules> - </step4> - - ASSISTANT: - <step5 role="assistant"> - <action_input> - <title>Tokyo</title> - </action_input> - </step5> - - USER: - <step6 role="user"> - <observation>Tokyo is the capital and most populous prefecture of Japan. The Tokyo metropolitan area, which includes Tokyo and several surrounding prefectures, is the world's most populous metropolitan area with an estimated 37.468 million residents as of 2018.</observation> - </step6> - - ASSISTANT: - <step7 role="assistant"> - <thought>Now that I have information about Tokyo, I need to get information about New York City for comparison.</thought> - <action>wikipedia</action> - </step7> - - USER: - <step8 role="user"> - <action_rules> - { - "wikipedia": { - "name": "wikipedia", - "description": "Search Wikipedia and return a summary", - "parameters": [ - { - "title": { - "type": "string", - "description": "The title of the Wikipedia article to search", - "required": "true" - } - } - ] - } - } - </action_rules> - </step8> - - ASSISTANT: - <step9 role="assistant"> - <action_input> - <title>New York City</title> - </action_input> - </step9> - - USER: - <step10 role="user"> - <observation>New York City is the most populous city in the United States. With an estimated 2020 population of 8,804,190 distributed over 300.46 square miles (778.2 km2), New York City is also the most densely populated major city in the United States.</observation> - </step10> - - ASSISTANT: - <step11 role="assistant"> - <thought>Now that I have the population information for both Tokyo and New York City, I can provide a comprehensive answer comparing the two.</thought> - <answer> - The population of Tokyo metropolitan area is approximately 37.468 million (as of 2018), while New York City has a population of about 8.8 million (as of 2020). Tokyo's population is significantly larger, more than four times that of New York City. It's important to note that the Tokyo figure refers to the broader metropolitan area, while the New York City figure is for the city proper. Even accounting for this difference, Tokyo remains substantially more populous than New York City. - - <follow_up_questions> - <question>What factors contribute to Tokyo's significantly larger population compared to New York City?</question> - <question>How do the population densities of Tokyo and New York City compare?</question> - <question>What challenges do these megacities face due to their large populations?</question> - </follow_up_questions> - </answer> - </step11> - - 8. Chat History: - ${chatHistory} - - Remember to use this history for context when appropriate. - - Now, process the user's query and provide your response following the format and rules outlined above. Ensure your final answer is comprehensive and entirely contained within a single <answer> tag. - - !!!IMPORTANT Very importantly, even if you use no tool or have an answer, follow the structure and output ONE step at a time. You will be provided with user steps and will output the appropriate single assistant step. FOLLOW THE STRUCTURE; Do not skip to the answer right away or skip steps. -</step1>`; + const toolDescriptions: string = tools + .map( + tool => ` + <tool> + <title>${tool.name}</title> + <brief_summary>${tool.briefSummary}</brief_summary> + </tool> + ` + ) + .join('\n'); + + return `<system_message> + <task> + You are an advanced AI assistant equipped with various tools to answer user queries accurately and efficiently. Your task is to provide a comprehensive response based on the user’s prompt using available tools, chat history, and provided information. Follow these guidelines meticulously to ensure the accuracy and structure of your response. + </task> + + <critical_points> + <point>**MOST IMPORTANT**: Always output responses within step number tags, using the step number and the assistant role as the root tag (e.g., <step2 role="assistant">, <step4 role="assistant">, etc.). This is crucial and should never be overlooked.</point> + <point>**STOP after every step and wait for the system to provide the next input (e.g., action rules or observations).</point> + <point>Only output **ONE step at a time** in your responses. Do not skip steps or provide multiple steps at once. Thus, you should only output even step number root tags.</point> + <point>Always structure your responses using valid, well-formed XML with properly nested tags.</point> + <point>If a tool is needed, ALWAYS select the most appropriate tool based on the user's query.</point> + <point>If the query could relate to user documents or require external information (e.g., RAG, search + website scraping, data analysis, Wikipedia), USE the appropriate tool to gather that information.</point> + <point>If there are no user docs or the user docs have not yielded helpful information, either use Wikipedia if a brief summary of a direct topic is all that is needed, or use the search tool to find websites followed by the website scraper tool to get useful infromation from one of those websites. + </critical_points> + + <response_structure> + <instruction> + When providing your final response, use the following structure: + </instruction> + <answer> + <section> + <tag><grounded_text> - Wrap text that is derived from tool-based or chunk-based information within these tags, ensuring proper citation.</tag> + <tag><normal_text> - Wrap text that is not derived from tool-based or chunk-based information within these tags.</tag> + </section> + <citations> + <tag><citation> - Provide citations for each grounded text, referencing the tool or chunk used.</tag> + </citations> + <follow_up_questions> + <tag><question> - Include exactly three follow-up questions from the user’s perspective within these tags.</tag> + </follow_up_questions> + </answer> + </response_structure> + + <grounded_text_guidelines> + <step>Wrap all information derived from tools (e.g., RAG, Wikipedia, CSV analysis) or chunks in <grounded_text> tags.</step> + <step>DO NOT PUT ANYTHING THAT IS NOT DIRECTLY DERIVED FROM TOOLS OR CHUNKS IN <grounded_text> TAGS.</step> + <step>Use a single <grounded_text> tag for sequential and closely related information that references the same citation.</step> + <step>If other citations are used sequentially, create new <grounded_text> tags.</step> + <step>Ensure each <grounded_text> tag has corresponding citations (up to three, and one is fine). Separate multiple citation indices with commas.</step> + <step>Grounded text can be as short as a few words or as long as several sentences.</step> + <step>Avoid overlapping or nesting <grounded_text> tags; use sequential tags instead.</step> + <step>Should be in Markdown format.</step> + </grounded_text_guidelines> + + <normal_text_guidelines> + <step>Wrap all information that is not derived from tools or chunks in <normal_text> tags.</step> + <step>Ensure that these tags are used for your reasoning, background knowledge, or general information that does not require a citation.</step> + <step>Do not use <normal_text> tags for information that needs grounding or citation.</step> + <step>Anything that is in any user docs should be grounded text and cited, not normal text, even if it is background or general information.</step> + <step>Should be in Markdown format.</step> + </normal_text_guidelines> + + <citation_guidelines> + <step>Create a unique citation for each distinct piece of information from tools or chunks that is used to support <grounded_text>.</step> + <step>Ensure each citation has a unique index number.</step> + <step>Specify the correct type: "text", "image", "table", "csv", or "url".</step> + <step>For text-based information, include only the relevant subset of the original information that the <grounded_text> is based on.</step> + <step>For image, table, csv, or url citation types, leave the citation content empty.</step> + <step>ALL CITATIONS MUST use the chunk_id field to reference the source, whether it’s from RAG, Wikipedia, CSV analysis, or any other tool.</step> + <step>One citation can be used for multiple <grounded_text> tags if they are based on the same tool or chunk information.</step> + <step>!!!DO NOT OVERCITE - only include citations for information that is directly relevant to the <grounded_text>.</step> + </citation_guidelines> + + <operational_process> + <step>Analyze the user’s query carefully.</step> + <step>Determine whether a tool is required to answer the query accurately.</step> + <step>If a tool is necessary:</step> + <substeps> + <substep>Select the most appropriate tool.</substep> + <substep>Use the <action> tag to specify the tool.</substep> + <substep>End your response after the <action> tag and wait for action rules to be provided.</substep> + <substep>Based on the action rules, provide the necessary tool parameters within the <action_input> tag.</substep> + <substep>End your response again and wait for the observation from the tool.</substep> + </substeps> + <step>If no tool is needed, use the 'no_tool' action but still follow the same response structure.</step> + <step>If the query might relate to user documents or requires external information, **ALWAYS** use the appropriate tool to retrieve the information (either rag or dataAnalysis).</step> + <step>Once all observations are collected, or if no tool was needed, provide your comprehensive answer within the <answer> tag, using the <grounded_text> and <normal_text> tags as required.</step> + </operational_process> + + <final_answer_requirements> + <requirement>Your final <answer> tag must contain:</requirement> + <elements> + <element>The complete answer to the user’s query, with grounded information wrapped in <grounded_text> tags and general information wrapped in <normal_text> tags.</element> + <element>Exactly three follow-up questions written from the user’s perspective, enclosed within <follow_up_questions> tags.</element> + </elements> + </final_answer_requirements> + + <tools> + ${toolDescriptions} + <note>If no external tool is required to answer the question, use the 'no_tool' action. However, if the query might relate to user documents or require external information, do not use 'no_tool'—instead, use the appropriate tool (RAG, Wikipedia, etc.), even if unsure.</note> + </tools> + + <user_information> + <summaries> + ${summaries()} + </summaries> + </user_information> + + <example_interaction> + <note>YOU ONLY OUTPUT THE ASSISTANT STEPS:</note> + <interaction> + <step1 role="system"> + ***SYSTEM MESSAGE ELIDED*** + </step1> + + <step2 role="user"> + <query>Could you provide a detailed overview of the 2010 Vancouver Winter Olympics, including the overall summary of the games, key moments from official statements, and how the medal count compared across countries?</query> + </step2> + + <step3 role="assistant"> + <thought> + To answer this query comprehensively, I will start by retrieving an overview of the 2010 Vancouver Winter Olympics from Wikipedia. After that, I'll look for any official press release statements for key moments during the games using the RAG tool. Finally, I'll analyze the medal count using the CSV data to compare how different countries performed. + </thought> + <action>wikipedia</action> + </step3> + + <step4 role="user"> + <action_rules>***Action rules elided***</action_rules> + </step4> + + <step5 role="assistant"> + <action_input> + <title>2010 Winter Olympics</title> + </action_input> + </step5> + + <step6 role="user"> + <chunk chunk_id="123e4567-e89b-12d3-a456-426614174000" chunk_type="url"> + The 2010 Winter Olympics, officially known as the XXI Olympic Winter Games, took place in Vancouver, British Columbia, Canada, from February 12 to 28, 2010. It featured 86 events in 15 disciplines across 7 sports, with 2,566 athletes from 82 National Olympic Committees participating. This edition of the Winter Olympics was notable for being the first hosted by Canada since the 1988 Winter Olympics in Calgary. + </chunk> + </step6> + + <step7 role="assistant"> + <thought> + Now that I have an overview from Wikipedia, I will use the RAG tool to find relevant information from the user's documents, specifically focusing on key moments and statements from an official press release. + </thought> + <action>rag</action> + </step7> + + <step8 role="user"> + <action_rules>***Action rules elided***</action_rules> + </step8> + + <step9 role="assistant"> + <action_input> + <hypothetical_document_chunk> + The user is asking for key moments and statements from official sources regarding the 2010 Vancouver Winter Olympics. Search the provided documents for any press releases or official statements that highlight significant events, achievements, or noteworthy aspects of the games. + </hypothetical_document_chunk> + </action_input> + </step9> + + <step10 role="user"> + <chunk chunk_id="987f6543-e21b-43c9-a987-654321fedcba" chunk_type="text"> + PRESS RELEASE: 2010 VANCOUVER WINTER OLYMPICS + + Vancouver, BC - The 2010 Winter Olympics have been hailed as a monumental success, showcasing the best of Canadian culture and athletic prowess. The Opening Ceremony, held on February 12, 2010, was a spectacular celebration of the nation's history and diversity... + + Key Moments: + - The Canadian men's and women's ice hockey teams both claimed gold medals, an achievement that brought immense pride to the host nation. + - Tragedy struck early in the games with the death of Georgian luger Nodar Kumaritashvili during a training run. This incident led to increased scrutiny of safety measures across all events... + + ***More chunk content elided for brevity*** + + The games concluded on February 28, 2010, with a memorable Closing Ceremony that highlighted the achievements of the athletes and the spirit of unity that defined the 2010 Vancouver Winter Olympics. + </chunk> + </step10> + + <step11 role="assistant"> + <thought> + With information on key moments from the press release, I will now analyze the medal count data to compare how different countries performed during the games. + </thought> + <action>dataAnalysis</action> + </step11> + + <step12 role="user"> + <action_rules>***Action rules elided***</action_rules> + </step12> + + <step13 role="assistant"> + <action_input> + <csv_file_name>2010_Vancouver_Olympics_Medal_Count.csv</csv_file_name> + </action_input> + </step13> + + <step14 role="user"> + <chunk chunk_id="789a1234-e567-890f-gh12-34567ijklmn8" chunk_type="csv"> + Country,Gold,Silver,Bronze,Total + USA,9,15,13,37 + Germany,10,13,7,30 + Canada,14,7,5,26 + Norway,9,8,6,23 + </chunk> + </step14> + + <step15 role="assistant"> + <thought> + Now that I have gathered comprehensive information from Wikipedia, the official press release, and the medal count data, I will provide a detailed answer to the user's query. + </thought> + <answer> + <grounded_text citation_index="1"> + The 2010 Winter Olympics, officially known as the XXI Olympic Winter Games, took place in Vancouver, British Columbia, Canada, from February 12 to 28, 2010. It featured 86 events in 15 disciplines across 7 sports, with 2,566 athletes from 82 National Olympic Committees participating. + </grounded_text> + <grounded_text citation_index="2"> + The Opening Ceremony was one of the most spectacular in Olympic history, celebrating Canadian culture and heritage. A significant highlight for the host nation was the victory of the Canadian men's and women's ice hockey teams, both securing gold medals, which was a source of immense national pride. + </grounded_text> + <grounded_text citation_index="3"> + In terms of medal performance, the United States led the overall tally with 37 medals, followed by Germany with 30 medals. Canada, while placing third in total medals, achieved a record number of 14 gold medals, the highest of any country in the 2010 games. + </grounded_text> + <grounded_text citation_index="4"> + However, the games were also marked by tragedy with the unfortunate death of Georgian luger Nodar Kumaritashvili during a training run, which deeply affected the Olympic community. + </grounded_text> + <citations> + <citation index="1" chunk_id="123e4567-e89b-12d3-a456-426614174000" type="url"></citation> + <citation index="2" chunk_id="987f6543-e21b-43c9-a987-654321fedcba" type="text"> + Vancouver, BC - The 2010 Winter Olympics have been hailed as a monumental success, showcasing the best of Canadian culture and athletic prowess. The Opening Ceremony, held on February 12, 2010, was a spectacular celebration of the nation's history and diversity... + + Key Moments: + - The Canadian men's and women's ice hockey teams both claimed gold medals, an achievement that brought immense pride to the host nation. + </citation> + <citation index="3" chunk_id="789a1234-e567-890f-gh12-34567ijklmn8" type="csv"></citation> + <citation index="4" chunk_id="987f6543-e21b-43c9-a987-654321fedcba" type="text"> + Tragedy struck early in the games with the death of Georgian luger Nodar Kumaritashvili during a training run. + </citation> + </citations> + + <follow_up_questions> + <question>What were the economic impacts on Vancouver after hosting the 2010 Winter Olympics?</question> + <question>How did the tragic accident of Nodar Kumaritashvili influence safety protocols in luge and other winter sports?</question> + <question>Can you provide more information on other significant performances by athletes during the 2010 Winter Olympics?</question> + </follow_up_questions> + </answer> + </step15> + </interaction> +</example_interaction> + + + <chat_history> + ${chatHistory} + <note>Use this history for context when appropriate.</note> + </chat_history> + + <final_instruction> + Now, process the user’s query and provide your response following the format and rules outlined above. Ensure your final answer is comprehensive, correctly cited, and entirely contained within the structured tags. Do not get stuck in infinite loops and keep responses concise, grounded, and most importantly, HELPFUL AND USEFUL! + </final_instruction> +</system_message> +`; } export function getSummarizedChunksPrompt(chunks: string): string { diff --git a/src/client/views/nodes/ChatBox/tools/BaseTool.ts b/src/client/views/nodes/ChatBox/tools/BaseTool.ts index c7942e359..2e2267653 100644 --- a/src/client/views/nodes/ChatBox/tools/BaseTool.ts +++ b/src/client/views/nodes/ChatBox/tools/BaseTool.ts @@ -5,26 +5,19 @@ export abstract class BaseTool<T extends Record<string, any> = Record<string, an public name: string, public description: string, public parameters: Record<string, any>, - public useRules: string, + public citationRules: string, public briefSummary: string ) {} abstract execute(args: T): Promise<any>; - getActionRule(isCurrentTool: boolean): Record<string, any> { - if (isCurrentTool) { - return { - [this.name]: { - name: this.name, - useRules: this.useRules, - description: this.description, - parameters: this.parameters, - }, - }; - } + getActionRule(): Record<string, any> { return { [this.name]: { - description: 'This tool is not currently selected.', + name: this.name, + citationRules: this.citationRules, + description: this.description, + parameters: this.parameters, }, }; } diff --git a/src/client/views/nodes/ChatBox/tools/DataAnalysisTool.ts b/src/client/views/nodes/ChatBox/tools/DataAnalysisTool.ts index d2edc4847..b45733639 100644 --- a/src/client/views/nodes/ChatBox/tools/DataAnalysisTool.ts +++ b/src/client/views/nodes/ChatBox/tools/DataAnalysisTool.ts @@ -1,8 +1,9 @@ import { BaseTool } from './BaseTool'; export class DataAnalysisTool extends BaseTool<{ csv_file_name: string }> { - private csv_files_function: () => { [filename: string]: string }; - constructor(csv_files: () => { [filename: string]: string }) { + private csv_files_function: () => { filename: string; id: string; text: string }[]; + + constructor(csv_files: () => { filename: string; id: string; text: string }[]) { super( 'dataAnalysis', 'Analyzes, and provides insights, from a CSV file', @@ -21,10 +22,18 @@ export class DataAnalysisTool extends BaseTool<{ csv_file_name: string }> { getFileContent(filename: string): string | undefined { const files = this.csv_files_function(); - return files[filename]; + const file = files.find(f => f.filename === filename); + return file?.text; + } + + getFileID(filename: string): string | undefined { + const files = this.csv_files_function(); + const file = files.find(f => f.filename === filename); + return file?.id; } async execute(args: { csv_file_name: string }): Promise<any> { - return [{ type: 'text', text: this.getFileContent(args.csv_file_name) }]; + console.log(this.csv_files_function()); + return [{ type: 'text', text: `<chunk chunk_id=${this.getFileID(args.csv_file_name)} chunk_type=csv}>` + this.getFileContent(args.csv_file_name) + '</chunk>' }]; } } diff --git a/src/client/views/nodes/ChatBox/tools/SearchTool.ts b/src/client/views/nodes/ChatBox/tools/SearchTool.ts new file mode 100644 index 000000000..91ecc71ff --- /dev/null +++ b/src/client/views/nodes/ChatBox/tools/SearchTool.ts @@ -0,0 +1,47 @@ +import { Networking } from '../../../../Network'; +import { BaseTool } from './BaseTool'; +import { v4 as uuidv4 } from 'uuid'; + +export class SearchTool extends BaseTool<{ query: string }> { + private _addLinkedUrlDoc: (url: string, id: string) => void; + + constructor(addLinkedUrlDoc: (url: string, id: string) => void) { + super( + 'searchTool', + 'Search the web to find a wide range of websites related to a query', + { + query: { + type: 'string', + description: 'The search query to use for finding websites', + required: true, + }, + }, + 'Provide a search query to find a broad range of websites. This tool is intended to help you identify relevant websites, but not to be used for providing the final answer. Use this information to determine which specific website to investigate further.', + 'Returns a list of websites and their overviews based on the search query, helping to identify which website might contain the most relevant information.' + ); + this._addLinkedUrlDoc = addLinkedUrlDoc; + } + + async execute(args: { query: string }): Promise<any> { + try { + const { results } = await Networking.PostToServer('/getWebSearchResults', { query: args.query }); + console.log(results); + const data: { type: string; text: string }[] = results.map((result: { url: string; snippet: string }) => { + console.log; + const id = uuidv4(); + this._addLinkedUrlDoc(result.url, id); + return { + type: 'text', + text: `<chunk chunk_id="${id}" chunk_type="text"> + <url>${result.url}</url> + <overview>${result.snippet}</overview> + </chunk>`, + }; + }); + return data; + } catch (error) { + console.log(error); + return [{ type: 'text', text: 'An error occurred while performing the web search.' }]; + } + } +} diff --git a/src/client/views/nodes/ChatBox/tools/WebsiteInfoScraperTool.ts b/src/client/views/nodes/ChatBox/tools/WebsiteInfoScraperTool.ts new file mode 100644 index 000000000..59fd47b7a --- /dev/null +++ b/src/client/views/nodes/ChatBox/tools/WebsiteInfoScraperTool.ts @@ -0,0 +1,35 @@ +import { Networking } from '../../../../Network'; +import { BaseTool } from './BaseTool'; +import { v4 as uuidv4 } from 'uuid'; + +export class WebsiteInfoScraperTool extends BaseTool<{ url: string }> { + private _addLinkedUrlDoc: (url: string, id: string) => void; + + constructor(addLinkedUrlDoc: (url: string, id: string) => void) { + super( + 'websiteInfoScraper', + 'Scrape detailed information from a specific website identified as the most relevant', + { + url: { + type: 'string', + description: 'The URL of the website to scrape', + required: true, + }, + }, + 'Provide the URL of the website that you have identified as the most relevant from the previous search. This tool will scrape and process detailed information from that specific website. It will also create a document from the scraped content for future reference.', + 'Returns the full HTML content from the provided URL and creates a document from the content for further analysis.' + ); + this._addLinkedUrlDoc = addLinkedUrlDoc; + } + + async execute(args: { url: string }): Promise<any> { + try { + const { html } = await Networking.PostToServer('/scrapeWebsite', { url: args.url }); + const id = uuidv4(); + this._addLinkedUrlDoc(args.url, id); + return [{ type: 'text', text: `<chunk chunk_id=${id} chunk_type=text> ${html} </chunk>` }]; + } catch (error) { + return [{ type: 'text', text: 'An error occurred while scraping the website.' }]; + } + } +} diff --git a/src/client/views/nodes/ChatBox/tools/WikipediaTool.ts b/src/client/views/nodes/ChatBox/tools/WikipediaTool.ts index e2c5009a1..143d91d80 100644 --- a/src/client/views/nodes/ChatBox/tools/WikipediaTool.ts +++ b/src/client/views/nodes/ChatBox/tools/WikipediaTool.ts @@ -2,9 +2,11 @@ import { title } from 'process'; import { Networking } from '../../../../Network'; import { BaseTool } from './BaseTool'; import axios from 'axios'; +import { v4 as uuidv4 } from 'uuid'; export class WikipediaTool extends BaseTool<{ title: string }> { - constructor() { + private _addLinkedUrlDoc: (url: string, id: string) => void; + constructor(addLinkedUrlDoc: (url: string, id: string) => void) { super( 'wikipedia', 'Search Wikipedia and return a summary', @@ -18,12 +20,16 @@ export class WikipediaTool extends BaseTool<{ title: string }> { 'Provide simply the title you want to search on Wikipedia and nothing more. If re-using this tool, try a different title for different information.', 'Returns a summary from searching an article title on Wikipedia' ); + this._addLinkedUrlDoc = addLinkedUrlDoc; } async execute(args: { title: string }): Promise<any> { try { const { text } = await Networking.PostToServer('/getWikipediaSummary', { title: args.title }); - return [{ type: 'text', text: text }]; + const id = uuidv4(); + const url = `https://en.wikipedia.org/wiki/${args.title.replace(/ /g, '_')}`; + this._addLinkedUrlDoc(url, id); + return [{ type: 'text', text: `<chunk chunk_id=${id} chunk_type=csv}> ${text} </chunk>` }]; } catch (error) { return [{ type: 'text', text: 'An error occurred while fetching the article.' }]; } diff --git a/src/client/views/nodes/ChatBox/types.ts b/src/client/views/nodes/ChatBox/types.ts index 4a0a9cfce..1c7aaa4b7 100644 --- a/src/client/views/nodes/ChatBox/types.ts +++ b/src/client/views/nodes/ChatBox/types.ts @@ -1,3 +1,4 @@ +import { breadcrumbsClasses } from '@mui/material'; import { Doc } from '../../../../fields/Doc'; import { StrCast } from '../../../../fields/Types'; @@ -16,20 +17,29 @@ export enum CHUNK_TYPE { IMAGE = 'image', TABLE = 'table', URL = 'url', + CSV = 'CSV', } export function getChunkType(type: string): CHUNK_TYPE { switch (type.toLowerCase()) { case 'text': return CHUNK_TYPE.TEXT; + break; case 'image': return CHUNK_TYPE.IMAGE; + break; case 'table': return CHUNK_TYPE.TABLE; + break; + case 'CSV': + return CHUNK_TYPE.CSV; + break; case 'url': return CHUNK_TYPE.URL; + break; default: return CHUNK_TYPE.TEXT; + break; } } @@ -52,6 +62,7 @@ export interface Citation { type: CHUNK_TYPE; chunk_id: string; citation_id: string; + url?: string; } export interface RAGChunk { @@ -93,10 +104,10 @@ export interface Tool<T extends Record<string, any> = Record<string, any>> { name: string; description: string; parameters: Record<string, any>; - useRules: string; + citationRules: string; briefSummary: string; execute: (args: T) => Promise<any>; - getActionRule: (isCurrentTool: boolean) => Record<string, any>; + getActionRule: () => Record<string, any>; } export interface AgentMessage { diff --git a/src/client/views/nodes/ChatBox/vectorstore/Vectorstore.ts b/src/client/views/nodes/ChatBox/vectorstore/Vectorstore.ts index 8e7be6eec..4383bb72d 100644 --- a/src/client/views/nodes/ChatBox/vectorstore/Vectorstore.ts +++ b/src/client/views/nodes/ChatBox/vectorstore/Vectorstore.ts @@ -3,6 +3,7 @@ import { CohereClient } from 'cohere-ai'; import { EmbedResponse } from 'cohere-ai/api'; import dotenv from 'dotenv'; import axios from 'axios'; +import { SimplifiedChunk } from '../types'; import { RAGChunk, AI_Document, CHUNK_TYPE } from '../types'; import { Doc } from '../../../../../fields/Doc'; @@ -19,6 +20,7 @@ export class Vectorstore { private indexName: string = 'pdf-chatbot'; private _id: string; private _doc_ids: string[] = []; + documents: AI_Document[] = []; constructor(id: string, doc_ids: () => string[]) { @@ -91,31 +93,20 @@ export class Vectorstore { doc.vectorstore_id = JSON.stringify(JSON.parse(StrCast(doc.vectorstore_id)).concat([this._id])); } if (doc.chunk_simpl === undefined || doc.chunk_simpl === null || doc.chunk_simpl === '' || doc.chunk_simpl === '[]') { - doc.chunk_simpl = JSON.stringify({ text_chunks: [], image_chunks: [] }); + doc.chunk_simpl = JSON.stringify({ chunks: [] }); } - let new_chunk_simpl: { text_chunks: { chunk_id: string; start_page: number; end_page: number }[]; image_chunks: { chunk_id: string; location: string; page: number }[] } = { - text_chunks: [], - image_chunks: [], - }; document_json.chunks.forEach((chunk: RAGChunk) => { - let chunk_to_add: { chunk_id: string; start_page: number; end_page: number }[] | { chunk_id: string; location: string; page: number }[]; - switch (chunk.metadata.type) { - case CHUNK_TYPE.TEXT: - chunk_to_add = [{ chunk_id: chunk.id, start_page: chunk.metadata.start_page, end_page: chunk.metadata.end_page }]; - new_chunk_simpl = JSON.parse(StrCast(doc.chunk_simpl)); - new_chunk_simpl.text_chunks = new_chunk_simpl.text_chunks.concat(chunk_to_add); - doc.chunk_simpl = JSON.stringify(new_chunk_simpl); - break; - case CHUNK_TYPE.IMAGE: - case CHUNK_TYPE.TABLE: - console.log('Location:', chunk.metadata.location); - chunk_to_add = [{ chunk_id: chunk.id, location: chunk.metadata.location, page: chunk.metadata.start_page }]; - new_chunk_simpl = JSON.parse(StrCast(doc.chunk_simpl)); - new_chunk_simpl.image_chunks = new_chunk_simpl.image_chunks.concat(chunk_to_add); - doc.chunk_simpl = JSON.stringify(new_chunk_simpl); - break; - } + const chunkToAdd = { + chunkId: chunk.id, + startPage: chunk.metadata.start_page, + endPage: chunk.metadata.end_page, + location: chunk.metadata.location, + chunkType: chunk.metadata.type as CHUNK_TYPE, + }; + const new_chunk_simpl = JSON.parse(StrCast(doc.chunk_simpl)); + new_chunk_simpl.chunks = new_chunk_simpl.chunks.concat(chunkToAdd); + doc.chunk_simpl = JSON.stringify(new_chunk_simpl); }); doc.ai_document_status = 'COMPLETED'; diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts index 91185e042..6460edb9a 100644 --- a/src/server/ApiManagers/AssistantManager.ts +++ b/src/server/ApiManagers/AssistantManager.ts @@ -12,6 +12,7 @@ import { RAGChunk } from '../../client/views/nodes/ChatBox/types'; import { UnstructuredClient } from 'unstructured-client'; import { PartitionResponse } from 'unstructured-client/sdk/models/operations'; import { ChunkingStrategy, Strategy } from 'unstructured-client/sdk/models/shared'; +import * as cheerio from 'cheerio'; export enum Directory { parsed_files = 'parsed_files', @@ -80,6 +81,102 @@ export default class AssistantManager extends ApiManager { register({ method: Method.POST, + subscription: '/getWebSearchResults', + secureHandler: async ({ req, res }) => { + const { query } = req.body; + try { + const response = await axios.get('http://api.serpstack.com/search', { + params: { + access_key: process.env._CLIENT_SERPSTACK_API_KEY, + query: query, + }, + }); + console.log(response.data); + + const results = response.data.organic_results.map((result: any) => ({ + url: result.url, + snippet: result.snippet, + })); + + console.log(results); + + res.send({ results }); + } catch (error: any) { + console.error('Error performing web search:', error); + res.status(500).send({ error: 'Failed to perform web search', details: error.message }); + } + }, + }); + + const axiosInstance = axios.create({ + headers: { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', + }, + }); + + const delay = (ms: number) => new Promise(resolve => setTimeout(resolve, ms)); + + const fetchWithRetry = async (url: string, retries = 3, backoff = 300) => { + try { + const response = await axiosInstance.get(url); + return response.data; + } catch (error: any) { + if (retries > 0 && error.response && error.response.status === 429) { + console.log(`Rate limited. Retrying in ${backoff}ms...`); + await delay(backoff); + return fetchWithRetry(url, retries - 1, backoff * 2); + } + throw error; + } + }; + + register({ + method: Method.POST, + subscription: '/scrapeWebsite', + secureHandler: async ({ req, res }) => { + const { url } = req.body; + try { + const html = await fetchWithRetry(url); + const $ = cheerio.load(html); + + // Remove script, style tags, and other non-content elements + $('script, style, noscript, iframe, svg, img, video, audio').remove(); + + // Function to extract text from an element and its children + function extractText(element: any): string { + let text = ''; + element.contents().each((_: any, el: any) => { + if (el.type === 'text') { + text += $(el).text().trim() + ' '; + } else if (el.type === 'tag' && !['script', 'style'].includes(el.name)) { + text += extractText($(el)) + ' '; + } + }); + return text.trim(); + } + + // Extract all visible text from the body + const bodyText = extractText($('body')); + + // Split the text into lines and remove empty lines + const lines = bodyText + .split('\n') + .map(line => line.trim()) + .filter(line => line.length > 0); + + // Join the lines back together + const extractedContent = lines.join('\n'); + + res.send({ content: extractedContent }); + } catch (error: any) { + console.error('Error scraping website:', error); + res.status(500).send({ error: 'Failed to scrape website', details: error.message }); + } + }, + }); + + register({ + method: Method.POST, subscription: '/createDocument', secureHandler: async ({ req, res }) => { const { file_path } = req.body; |