diff options
-rw-r--r-- | package-lock.json | 54 | ||||
-rw-r--r-- | package.json | 1 | ||||
-rw-r--r-- | src/client/views/nodes/ChatBox/Agent.ts | 2 | ||||
-rw-r--r-- | src/client/views/nodes/ChatBox/ChatBox.tsx | 2 | ||||
-rw-r--r-- | src/client/views/nodes/ChatBox/tools/RAGTool.ts | 2 | ||||
-rw-r--r-- | src/client/views/nodes/ChatBox/vectorstore/Vectorstore.ts (renamed from src/client/views/nodes/ChatBox/vectorstore/VectorstoreUpload.ts) | 38 | ||||
-rw-r--r-- | src/server/ApiManagers/AssistantManager.ts | 54 |
7 files changed, 132 insertions, 21 deletions
diff --git a/package-lock.json b/package-lock.json index 74a7826cd..0a485dcb7 100644 --- a/package-lock.json +++ b/package-lock.json @@ -231,6 +231,7 @@ "typescript-collections": "^1.3.3", "typescript-language-server": "^4.1.3", "uninstall": "^0.0.0", + "unstructured-client": "^0.14.3", "url": "^0.11.3", "url-loader": "^4.1.1", "util": "^0.12.5", @@ -5080,6 +5081,22 @@ "@octokit/openapi-types": "^22.2.0" } }, + "node_modules/@pdf-lib/standard-fonts": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/@pdf-lib/standard-fonts/-/standard-fonts-1.0.0.tgz", + "integrity": "sha512-hU30BK9IUN/su0Mn9VdlVKsWBS6GyhVfqjwl1FjZN4TxP6cCw0jP2w7V3Hf5uX7M0AZJ16vey9yE0ny7Sa59ZA==", + "dependencies": { + "pako": "^1.0.6" + } + }, + "node_modules/@pdf-lib/upng": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/@pdf-lib/upng/-/upng-1.0.1.tgz", + "integrity": "sha512-dQK2FUMQtowVP00mtIksrlZhdFXQZPC+taih1q4CvPZ5vqdxR/LKBaFg0oAfzd1GlHZXXSPdQfzQnt+ViGvEIQ==", + "dependencies": { + "pako": "^1.0.10" + } + }, "node_modules/@pinecone-database/pinecone": { "version": "2.2.2", "resolved": "https://registry.npmjs.org/@pinecone-database/pinecone/-/pinecone-2.2.2.tgz", @@ -36337,6 +36354,22 @@ "pbf": "bin/pbf" } }, + "node_modules/pdf-lib": { + "version": "1.17.1", + "resolved": "https://registry.npmjs.org/pdf-lib/-/pdf-lib-1.17.1.tgz", + "integrity": "sha512-V/mpyJAoTsN4cnP31vc0wfNA1+p20evqqnap0KLoRUN0Yk/p3wN52DOEsL4oBFcLdb76hlpKPtzJIgo67j/XLw==", + "dependencies": { + "@pdf-lib/standard-fonts": "^1.0.0", + "@pdf-lib/upng": "^1.0.1", + "pako": "^1.0.11", + "tslib": "^1.11.1" + } + }, + "node_modules/pdf-lib/node_modules/tslib": { + "version": "1.14.1", + "resolved": "https://registry.npmjs.org/tslib/-/tslib-1.14.1.tgz", + "integrity": "sha512-Xni35NKzjgMrwevysHTCArtLDpPvye8zV/0E4EyYn43P7/7qvQwPh9BGkHewbMulVntbigmcT7rdX3BNo9wRJg==" + }, "node_modules/pdf-parse": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/pdf-parse/-/pdf-parse-1.1.1.tgz", @@ -42494,6 +42527,18 @@ "node": ">= 0.8" } }, + "node_modules/unstructured-client": { + "version": "0.14.3", + "resolved": "https://registry.npmjs.org/unstructured-client/-/unstructured-client-0.14.3.tgz", + "integrity": "sha512-+QEkwNm0GtLGs3TdbaG3ES1upR5ygzM8Jtm3epqNlz1cLREruI/jl342jGRwg75N9jkrMWsye3GHgZcN4TpfCw==", + "dependencies": { + "async": "^3.2.5", + "pdf-lib": "^1.17.1" + }, + "peerDependencies": { + "zod": ">= 3" + } + }, "node_modules/update-browserslist-db": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.1.0.tgz", @@ -43958,6 +44003,15 @@ "node": ">= 14" } }, + "node_modules/zod": { + "version": "3.23.8", + "resolved": "https://registry.npmjs.org/zod/-/zod-3.23.8.tgz", + "integrity": "sha512-XBx9AXhXktjUqnepgTiE5flcKIYWi/rme0Eaj+5Y0lftuGBq+jyRu/md4WnuxqgP1ubdpNCsYEYPxrzVHD8d6g==", + "peer": true, + "funding": { + "url": "https://github.com/sponsors/colinhacks" + } + }, "node_modules/zwitch": { "version": "2.0.4", "resolved": "https://registry.npmjs.org/zwitch/-/zwitch-2.0.4.tgz", diff --git a/package.json b/package.json index 52ad21dea..4e9946e2e 100644 --- a/package.json +++ b/package.json @@ -316,6 +316,7 @@ "typescript-collections": "^1.3.3", "typescript-language-server": "^4.1.3", "uninstall": "^0.0.0", + "unstructured-client": "^0.14.3", "url": "^0.11.3", "url-loader": "^4.1.1", "util": "^0.12.5", diff --git a/src/client/views/nodes/ChatBox/Agent.ts b/src/client/views/nodes/ChatBox/Agent.ts index 04729414a..7b3703449 100644 --- a/src/client/views/nodes/ChatBox/Agent.ts +++ b/src/client/views/nodes/ChatBox/Agent.ts @@ -6,7 +6,7 @@ import { WikipediaTool } from './tools/WikipediaTool'; import { CalculateTool } from './tools/CalculateTool'; import { RAGTool } from './tools/RAGTool'; import { NoTool } from './tools/NoTool'; -import { Vectorstore } from './vectorstore/VectorstoreUpload'; +import { Vectorstore } from './vectorstore/Vectorstore'; import { ChatCompletionAssistantMessageParam, ChatCompletionMessageParam } from 'openai/resources'; import dotenv from 'dotenv'; import { ChatBox } from './ChatBox'; diff --git a/src/client/views/nodes/ChatBox/ChatBox.tsx b/src/client/views/nodes/ChatBox/ChatBox.tsx index 3de5c1da3..13c418b32 100644 --- a/src/client/views/nodes/ChatBox/ChatBox.tsx +++ b/src/client/views/nodes/ChatBox/ChatBox.tsx @@ -12,7 +12,7 @@ import { FieldView, FieldViewProps } from '../FieldView'; import './ChatBox.scss'; import MessageComponentBox from './MessageComponent'; import { ASSISTANT_ROLE, AssistantMessage, AI_Document, Citation, CHUNK_TYPE, Chunk, getChunkType, TEXT_TYPE } from './types'; -import { Vectorstore } from './vectorstore/VectorstoreUpload'; +import { Vectorstore } from './vectorstore/Vectorstore'; import { Agent } from './Agent'; import dotenv from 'dotenv'; import { DocData, DocViews } from '../../../../fields/DocSymbols'; diff --git a/src/client/views/nodes/ChatBox/tools/RAGTool.ts b/src/client/views/nodes/ChatBox/tools/RAGTool.ts index 23b93b0f0..be591fa9a 100644 --- a/src/client/views/nodes/ChatBox/tools/RAGTool.ts +++ b/src/client/views/nodes/ChatBox/tools/RAGTool.ts @@ -1,5 +1,5 @@ import { BaseTool } from './BaseTool'; -import { Vectorstore } from '../vectorstore/VectorstoreUpload'; +import { Vectorstore } from '../vectorstore/Vectorstore'; import { Chunk } from '../types'; import * as fs from 'fs'; import { Networking } from '../../../../Network'; diff --git a/src/client/views/nodes/ChatBox/vectorstore/VectorstoreUpload.ts b/src/client/views/nodes/ChatBox/vectorstore/Vectorstore.ts index 787705bb6..25aec751f 100644 --- a/src/client/views/nodes/ChatBox/vectorstore/VectorstoreUpload.ts +++ b/src/client/views/nodes/ChatBox/vectorstore/Vectorstore.ts @@ -2,6 +2,7 @@ import { Pinecone, Index, IndexList, PineconeRecord, RecordMetadata, QueryRespon import { CohereClient } from 'cohere-ai'; import { EmbedResponse } from 'cohere-ai/api'; import dotenv from 'dotenv'; +import axios from 'axios'; import { Chunk, AI_Document, CHUNK_TYPE } from '../types'; import { Doc } from '../../../../../fields/Doc'; @@ -43,7 +44,7 @@ export class Vectorstore { if (!indexList.indexes?.some(index => index.name === this.indexName)) { await this.pinecone.createIndex({ name: this.indexName, - dimension: 1024, + dimension: 768, metric: 'cosine', spec: { serverless: { @@ -138,25 +139,26 @@ export class Vectorstore { async retrieve(query: string, topK: number = 10): Promise<Chunk[]> { console.log(`Retrieving chunks for query: ${query}`); try { - const queryEmbeddingResponse: EmbedResponse = await this.cohere.embed({ - texts: [query], - model: 'embed-english-v3.0', - inputType: 'search_query', - }); - - let queryEmbedding: number[]; - - if (Array.isArray(queryEmbeddingResponse.embeddings)) { - queryEmbedding = queryEmbeddingResponse.embeddings[0]; - } else if (queryEmbeddingResponse.embeddings && 'embeddings' in queryEmbeddingResponse.embeddings) { - queryEmbedding = (queryEmbeddingResponse.embeddings as { embeddings: number[][] }).embeddings[0]; - } else { - throw new Error('Invalid embedding response format'); + const url = 'https://api.jina.ai/v1/embeddings'; + const headers = { + 'Content-Type': 'application/json', + Authorization: `Bearer ${process.env.JINA_API_KEY}`, + }; + const data = { + model: 'jina-clip-v1', + normalized: true, + embedding_type: 'float', + input: [{ text: query }], + }; + + const response = await axios.post(url, data, { headers }); + const embeddings = response.data?.data?.[0]?.embedding; + + if (!embeddings || !Array.isArray(embeddings)) { + throw new Error('Invalid embedding response format from Jina API'); } - if (!Array.isArray(queryEmbedding)) { - throw new Error('Query embedding is not an array'); - } + const queryEmbedding = embeddings; const queryResponse: QueryResponse<RecordMetadata> = await this.index.query({ vector: queryEmbedding, diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts index 36468157a..f69ca1383 100644 --- a/src/server/ApiManagers/AssistantManager.ts +++ b/src/server/ApiManagers/AssistantManager.ts @@ -9,6 +9,9 @@ import { Method } from '../RouteManager'; import ApiManager, { Registration } from './ApiManager'; import axios from 'axios'; import { Chunk } from '../../client/views/nodes/ChatBox/types'; +import { UnstructuredClient } from 'unstructured-client'; +import { PartitionResponse } from 'unstructured-client/sdk/models/operations'; +import { ChunkingStrategy, Strategy } from 'unstructured-client/sdk/models/shared'; export enum Directory { parsed_files = 'parsed_files', @@ -42,6 +45,11 @@ export default class AssistantManager extends ApiManager { apiKey: process.env._CLIENT_OPENAI_KEY, // Use client key so don't have to set key seperately for client and server. dangerouslyAllowBrowser: true, }); + const unstructuredClient = new UnstructuredClient({ + security: { + apiKeyAuth: process.env._CLIENT_UNSTRUCTURED_API_KEY!, + }, + }); register({ method: Method.POST, @@ -187,5 +195,51 @@ export default class AssistantManager extends ApiManager { res.send({ formattedChunks: content }); }, }); + + register({ + method: Method.POST, + subscription: '/chunkDocument', + secureHandler: async ({ req, res }) => { + const { file_path } = req.body; + const public_path = path.join(publicDirectory, file_path); + const file_name = path.basename(file_path); + + try { + // Read file data and convert to base64 + const file_data = await fs.promises.readFile(public_path); + + try { + const result = await unstructuredClient.general.partition({ + partitionParameters: { + files: { + content: file_data, + fileName: file_name, + }, + strategy: Strategy.Auto, + chunkingStrategy: ChunkingStrategy.ByTitle, + extractImageBlockTypes: ['Image', 'Table'], + }, + }); + + if (result.statusCode === 200) { + console.log(result.elements); + const jsonElements = JSON.stringify(result.elements, null, 2); + // Print the processed data. + console.log(jsonElements); + res.send({ document_json: jsonElements }); + } else { + console.error(`Unexpected status code: ${result.statusCode}`); + res.status(result.statusCode).send({ error: 'Failed to process the document', details: result }); + } + } catch (e: any) { + console.error('Error during partitioning:', e); + res.status(500).send({ error: 'Failed to partition the document', details: e.message }); + } + } catch (error: any) { + console.error('Error reading file:', error); + res.status(500).send({ error: 'Failed to read the file', details: error.message }); + } + }, + }); } } |