aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorA.J. Shulman <Shulman.aj@gmail.com>2024-08-14 12:48:39 -0400
committerA.J. Shulman <Shulman.aj@gmail.com>2024-08-14 12:48:39 -0400
commitcd4b13bacd6639d2a731a05dfca700b201b2073c (patch)
tree00044399c9b8028f5c3d82f830879faaef881cac
parentb7c024c8c5b85f91828d6cd20ffc3bfca229af21 (diff)
attempt at new multimodal vector
-rw-r--r--package-lock.json54
-rw-r--r--package.json1
-rw-r--r--src/client/views/nodes/ChatBox/Agent.ts2
-rw-r--r--src/client/views/nodes/ChatBox/ChatBox.tsx2
-rw-r--r--src/client/views/nodes/ChatBox/tools/RAGTool.ts2
-rw-r--r--src/client/views/nodes/ChatBox/vectorstore/Vectorstore.ts (renamed from src/client/views/nodes/ChatBox/vectorstore/VectorstoreUpload.ts)38
-rw-r--r--src/server/ApiManagers/AssistantManager.ts54
7 files changed, 132 insertions, 21 deletions
diff --git a/package-lock.json b/package-lock.json
index 74a7826cd..0a485dcb7 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -231,6 +231,7 @@
"typescript-collections": "^1.3.3",
"typescript-language-server": "^4.1.3",
"uninstall": "^0.0.0",
+ "unstructured-client": "^0.14.3",
"url": "^0.11.3",
"url-loader": "^4.1.1",
"util": "^0.12.5",
@@ -5080,6 +5081,22 @@
"@octokit/openapi-types": "^22.2.0"
}
},
+ "node_modules/@pdf-lib/standard-fonts": {
+ "version": "1.0.0",
+ "resolved": "https://registry.npmjs.org/@pdf-lib/standard-fonts/-/standard-fonts-1.0.0.tgz",
+ "integrity": "sha512-hU30BK9IUN/su0Mn9VdlVKsWBS6GyhVfqjwl1FjZN4TxP6cCw0jP2w7V3Hf5uX7M0AZJ16vey9yE0ny7Sa59ZA==",
+ "dependencies": {
+ "pako": "^1.0.6"
+ }
+ },
+ "node_modules/@pdf-lib/upng": {
+ "version": "1.0.1",
+ "resolved": "https://registry.npmjs.org/@pdf-lib/upng/-/upng-1.0.1.tgz",
+ "integrity": "sha512-dQK2FUMQtowVP00mtIksrlZhdFXQZPC+taih1q4CvPZ5vqdxR/LKBaFg0oAfzd1GlHZXXSPdQfzQnt+ViGvEIQ==",
+ "dependencies": {
+ "pako": "^1.0.10"
+ }
+ },
"node_modules/@pinecone-database/pinecone": {
"version": "2.2.2",
"resolved": "https://registry.npmjs.org/@pinecone-database/pinecone/-/pinecone-2.2.2.tgz",
@@ -36337,6 +36354,22 @@
"pbf": "bin/pbf"
}
},
+ "node_modules/pdf-lib": {
+ "version": "1.17.1",
+ "resolved": "https://registry.npmjs.org/pdf-lib/-/pdf-lib-1.17.1.tgz",
+ "integrity": "sha512-V/mpyJAoTsN4cnP31vc0wfNA1+p20evqqnap0KLoRUN0Yk/p3wN52DOEsL4oBFcLdb76hlpKPtzJIgo67j/XLw==",
+ "dependencies": {
+ "@pdf-lib/standard-fonts": "^1.0.0",
+ "@pdf-lib/upng": "^1.0.1",
+ "pako": "^1.0.11",
+ "tslib": "^1.11.1"
+ }
+ },
+ "node_modules/pdf-lib/node_modules/tslib": {
+ "version": "1.14.1",
+ "resolved": "https://registry.npmjs.org/tslib/-/tslib-1.14.1.tgz",
+ "integrity": "sha512-Xni35NKzjgMrwevysHTCArtLDpPvye8zV/0E4EyYn43P7/7qvQwPh9BGkHewbMulVntbigmcT7rdX3BNo9wRJg=="
+ },
"node_modules/pdf-parse": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/pdf-parse/-/pdf-parse-1.1.1.tgz",
@@ -42494,6 +42527,18 @@
"node": ">= 0.8"
}
},
+ "node_modules/unstructured-client": {
+ "version": "0.14.3",
+ "resolved": "https://registry.npmjs.org/unstructured-client/-/unstructured-client-0.14.3.tgz",
+ "integrity": "sha512-+QEkwNm0GtLGs3TdbaG3ES1upR5ygzM8Jtm3epqNlz1cLREruI/jl342jGRwg75N9jkrMWsye3GHgZcN4TpfCw==",
+ "dependencies": {
+ "async": "^3.2.5",
+ "pdf-lib": "^1.17.1"
+ },
+ "peerDependencies": {
+ "zod": ">= 3"
+ }
+ },
"node_modules/update-browserslist-db": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.1.0.tgz",
@@ -43958,6 +44003,15 @@
"node": ">= 14"
}
},
+ "node_modules/zod": {
+ "version": "3.23.8",
+ "resolved": "https://registry.npmjs.org/zod/-/zod-3.23.8.tgz",
+ "integrity": "sha512-XBx9AXhXktjUqnepgTiE5flcKIYWi/rme0Eaj+5Y0lftuGBq+jyRu/md4WnuxqgP1ubdpNCsYEYPxrzVHD8d6g==",
+ "peer": true,
+ "funding": {
+ "url": "https://github.com/sponsors/colinhacks"
+ }
+ },
"node_modules/zwitch": {
"version": "2.0.4",
"resolved": "https://registry.npmjs.org/zwitch/-/zwitch-2.0.4.tgz",
diff --git a/package.json b/package.json
index 52ad21dea..4e9946e2e 100644
--- a/package.json
+++ b/package.json
@@ -316,6 +316,7 @@
"typescript-collections": "^1.3.3",
"typescript-language-server": "^4.1.3",
"uninstall": "^0.0.0",
+ "unstructured-client": "^0.14.3",
"url": "^0.11.3",
"url-loader": "^4.1.1",
"util": "^0.12.5",
diff --git a/src/client/views/nodes/ChatBox/Agent.ts b/src/client/views/nodes/ChatBox/Agent.ts
index 04729414a..7b3703449 100644
--- a/src/client/views/nodes/ChatBox/Agent.ts
+++ b/src/client/views/nodes/ChatBox/Agent.ts
@@ -6,7 +6,7 @@ import { WikipediaTool } from './tools/WikipediaTool';
import { CalculateTool } from './tools/CalculateTool';
import { RAGTool } from './tools/RAGTool';
import { NoTool } from './tools/NoTool';
-import { Vectorstore } from './vectorstore/VectorstoreUpload';
+import { Vectorstore } from './vectorstore/Vectorstore';
import { ChatCompletionAssistantMessageParam, ChatCompletionMessageParam } from 'openai/resources';
import dotenv from 'dotenv';
import { ChatBox } from './ChatBox';
diff --git a/src/client/views/nodes/ChatBox/ChatBox.tsx b/src/client/views/nodes/ChatBox/ChatBox.tsx
index 3de5c1da3..13c418b32 100644
--- a/src/client/views/nodes/ChatBox/ChatBox.tsx
+++ b/src/client/views/nodes/ChatBox/ChatBox.tsx
@@ -12,7 +12,7 @@ import { FieldView, FieldViewProps } from '../FieldView';
import './ChatBox.scss';
import MessageComponentBox from './MessageComponent';
import { ASSISTANT_ROLE, AssistantMessage, AI_Document, Citation, CHUNK_TYPE, Chunk, getChunkType, TEXT_TYPE } from './types';
-import { Vectorstore } from './vectorstore/VectorstoreUpload';
+import { Vectorstore } from './vectorstore/Vectorstore';
import { Agent } from './Agent';
import dotenv from 'dotenv';
import { DocData, DocViews } from '../../../../fields/DocSymbols';
diff --git a/src/client/views/nodes/ChatBox/tools/RAGTool.ts b/src/client/views/nodes/ChatBox/tools/RAGTool.ts
index 23b93b0f0..be591fa9a 100644
--- a/src/client/views/nodes/ChatBox/tools/RAGTool.ts
+++ b/src/client/views/nodes/ChatBox/tools/RAGTool.ts
@@ -1,5 +1,5 @@
import { BaseTool } from './BaseTool';
-import { Vectorstore } from '../vectorstore/VectorstoreUpload';
+import { Vectorstore } from '../vectorstore/Vectorstore';
import { Chunk } from '../types';
import * as fs from 'fs';
import { Networking } from '../../../../Network';
diff --git a/src/client/views/nodes/ChatBox/vectorstore/VectorstoreUpload.ts b/src/client/views/nodes/ChatBox/vectorstore/Vectorstore.ts
index 787705bb6..25aec751f 100644
--- a/src/client/views/nodes/ChatBox/vectorstore/VectorstoreUpload.ts
+++ b/src/client/views/nodes/ChatBox/vectorstore/Vectorstore.ts
@@ -2,6 +2,7 @@ import { Pinecone, Index, IndexList, PineconeRecord, RecordMetadata, QueryRespon
import { CohereClient } from 'cohere-ai';
import { EmbedResponse } from 'cohere-ai/api';
import dotenv from 'dotenv';
+import axios from 'axios';
import { Chunk, AI_Document, CHUNK_TYPE } from '../types';
import { Doc } from '../../../../../fields/Doc';
@@ -43,7 +44,7 @@ export class Vectorstore {
if (!indexList.indexes?.some(index => index.name === this.indexName)) {
await this.pinecone.createIndex({
name: this.indexName,
- dimension: 1024,
+ dimension: 768,
metric: 'cosine',
spec: {
serverless: {
@@ -138,25 +139,26 @@ export class Vectorstore {
async retrieve(query: string, topK: number = 10): Promise<Chunk[]> {
console.log(`Retrieving chunks for query: ${query}`);
try {
- const queryEmbeddingResponse: EmbedResponse = await this.cohere.embed({
- texts: [query],
- model: 'embed-english-v3.0',
- inputType: 'search_query',
- });
-
- let queryEmbedding: number[];
-
- if (Array.isArray(queryEmbeddingResponse.embeddings)) {
- queryEmbedding = queryEmbeddingResponse.embeddings[0];
- } else if (queryEmbeddingResponse.embeddings && 'embeddings' in queryEmbeddingResponse.embeddings) {
- queryEmbedding = (queryEmbeddingResponse.embeddings as { embeddings: number[][] }).embeddings[0];
- } else {
- throw new Error('Invalid embedding response format');
+ const url = 'https://api.jina.ai/v1/embeddings';
+ const headers = {
+ 'Content-Type': 'application/json',
+ Authorization: `Bearer ${process.env.JINA_API_KEY}`,
+ };
+ const data = {
+ model: 'jina-clip-v1',
+ normalized: true,
+ embedding_type: 'float',
+ input: [{ text: query }],
+ };
+
+ const response = await axios.post(url, data, { headers });
+ const embeddings = response.data?.data?.[0]?.embedding;
+
+ if (!embeddings || !Array.isArray(embeddings)) {
+ throw new Error('Invalid embedding response format from Jina API');
}
- if (!Array.isArray(queryEmbedding)) {
- throw new Error('Query embedding is not an array');
- }
+ const queryEmbedding = embeddings;
const queryResponse: QueryResponse<RecordMetadata> = await this.index.query({
vector: queryEmbedding,
diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts
index 36468157a..f69ca1383 100644
--- a/src/server/ApiManagers/AssistantManager.ts
+++ b/src/server/ApiManagers/AssistantManager.ts
@@ -9,6 +9,9 @@ import { Method } from '../RouteManager';
import ApiManager, { Registration } from './ApiManager';
import axios from 'axios';
import { Chunk } from '../../client/views/nodes/ChatBox/types';
+import { UnstructuredClient } from 'unstructured-client';
+import { PartitionResponse } from 'unstructured-client/sdk/models/operations';
+import { ChunkingStrategy, Strategy } from 'unstructured-client/sdk/models/shared';
export enum Directory {
parsed_files = 'parsed_files',
@@ -42,6 +45,11 @@ export default class AssistantManager extends ApiManager {
apiKey: process.env._CLIENT_OPENAI_KEY, // Use client key so don't have to set key seperately for client and server.
dangerouslyAllowBrowser: true,
});
+ const unstructuredClient = new UnstructuredClient({
+ security: {
+ apiKeyAuth: process.env._CLIENT_UNSTRUCTURED_API_KEY!,
+ },
+ });
register({
method: Method.POST,
@@ -187,5 +195,51 @@ export default class AssistantManager extends ApiManager {
res.send({ formattedChunks: content });
},
});
+
+ register({
+ method: Method.POST,
+ subscription: '/chunkDocument',
+ secureHandler: async ({ req, res }) => {
+ const { file_path } = req.body;
+ const public_path = path.join(publicDirectory, file_path);
+ const file_name = path.basename(file_path);
+
+ try {
+ // Read the file's raw bytes (no encoding is passed, so no base64 conversion happens here)
+ const file_data = await fs.promises.readFile(public_path);
+
+ try {
+ const result = await unstructuredClient.general.partition({
+ partitionParameters: {
+ files: {
+ content: file_data,
+ fileName: file_name,
+ },
+ strategy: Strategy.Auto,
+ chunkingStrategy: ChunkingStrategy.ByTitle,
+ extractImageBlockTypes: ['Image', 'Table'],
+ },
+ });
+
+ if (result.statusCode === 200) {
+ console.log(result.elements);
+ const jsonElements = JSON.stringify(result.elements, null, 2);
+ // Print the processed data.
+ console.log(jsonElements);
+ res.send({ document_json: jsonElements });
+ } else {
+ console.error(`Unexpected status code: ${result.statusCode}`);
+ res.status(result.statusCode).send({ error: 'Failed to process the document', details: result });
+ }
+ } catch (e: any) {
+ console.error('Error during partitioning:', e);
+ res.status(500).send({ error: 'Failed to partition the document', details: e.message });
+ }
+ } catch (error: any) {
+ console.error('Error reading file:', error);
+ res.status(500).send({ error: 'Failed to read the file', details: error.message });
+ }
+ },
+ });
}
}