From cd4b13bacd6639d2a731a05dfca700b201b2073c Mon Sep 17 00:00:00 2001 From: "A.J. Shulman" Date: Wed, 14 Aug 2024 12:48:39 -0400 Subject: attempt at new multimodal vector --- src/client/views/nodes/ChatBox/Agent.ts | 2 +- src/client/views/nodes/ChatBox/ChatBox.tsx | 2 +- src/client/views/nodes/ChatBox/tools/RAGTool.ts | 2 +- .../views/nodes/ChatBox/vectorstore/Vectorstore.ts | 186 +++++++++++++++++++++ .../nodes/ChatBox/vectorstore/VectorstoreUpload.ts | 184 -------------------- src/server/ApiManagers/AssistantManager.ts | 54 ++++++ 6 files changed, 243 insertions(+), 187 deletions(-) create mode 100644 src/client/views/nodes/ChatBox/vectorstore/Vectorstore.ts delete mode 100644 src/client/views/nodes/ChatBox/vectorstore/VectorstoreUpload.ts (limited to 'src') diff --git a/src/client/views/nodes/ChatBox/Agent.ts b/src/client/views/nodes/ChatBox/Agent.ts index 04729414a..7b3703449 100644 --- a/src/client/views/nodes/ChatBox/Agent.ts +++ b/src/client/views/nodes/ChatBox/Agent.ts @@ -6,7 +6,7 @@ import { WikipediaTool } from './tools/WikipediaTool'; import { CalculateTool } from './tools/CalculateTool'; import { RAGTool } from './tools/RAGTool'; import { NoTool } from './tools/NoTool'; -import { Vectorstore } from './vectorstore/VectorstoreUpload'; +import { Vectorstore } from './vectorstore/Vectorstore'; import { ChatCompletionAssistantMessageParam, ChatCompletionMessageParam } from 'openai/resources'; import dotenv from 'dotenv'; import { ChatBox } from './ChatBox'; diff --git a/src/client/views/nodes/ChatBox/ChatBox.tsx b/src/client/views/nodes/ChatBox/ChatBox.tsx index 3de5c1da3..13c418b32 100644 --- a/src/client/views/nodes/ChatBox/ChatBox.tsx +++ b/src/client/views/nodes/ChatBox/ChatBox.tsx @@ -12,7 +12,7 @@ import { FieldView, FieldViewProps } from '../FieldView'; import './ChatBox.scss'; import MessageComponentBox from './MessageComponent'; import { ASSISTANT_ROLE, AssistantMessage, AI_Document, Citation, CHUNK_TYPE, Chunk, getChunkType, TEXT_TYPE } from './types'; -import { Vectorstore } from './vectorstore/VectorstoreUpload'; +import { Vectorstore } from './vectorstore/Vectorstore'; import { Agent } from './Agent'; import dotenv from 'dotenv'; import { DocData, DocViews } from '../../../../fields/DocSymbols'; diff --git a/src/client/views/nodes/ChatBox/tools/RAGTool.ts b/src/client/views/nodes/ChatBox/tools/RAGTool.ts index 23b93b0f0..be591fa9a 100644 --- a/src/client/views/nodes/ChatBox/tools/RAGTool.ts +++ b/src/client/views/nodes/ChatBox/tools/RAGTool.ts @@ -1,5 +1,5 @@ import { BaseTool } from './BaseTool'; -import { Vectorstore } from '../vectorstore/VectorstoreUpload'; +import { Vectorstore } from '../vectorstore/Vectorstore'; import { Chunk } from '../types'; import * as fs from 'fs'; import { Networking } from '../../../../Network'; diff --git a/src/client/views/nodes/ChatBox/vectorstore/Vectorstore.ts b/src/client/views/nodes/ChatBox/vectorstore/Vectorstore.ts new file mode 100644 index 000000000..25aec751f --- /dev/null +++ b/src/client/views/nodes/ChatBox/vectorstore/Vectorstore.ts @@ -0,0 +1,186 @@ +import { Pinecone, Index, IndexList, PineconeRecord, RecordMetadata, QueryResponse } from '@pinecone-database/pinecone'; +import { CohereClient } from 'cohere-ai'; +import { EmbedResponse } from 'cohere-ai/api'; +import dotenv from 'dotenv'; +import axios from 'axios'; + +import { Chunk, AI_Document, CHUNK_TYPE } from '../types'; +import { Doc } from '../../../../../fields/Doc'; +import { DocData } from '../../../../../fields/DocSymbols'; +import { CsvCast, PDFCast, StrCast } from '../../../../../fields/Types'; +import { Networking } from '../../../../Network'; + +dotenv.config(); + +export class Vectorstore { + private pinecone: Pinecone; + private index!: Index; + private cohere: CohereClient; + private indexName: string = 'pdf-chatbot'; + private _id: string; + private _doc_ids: string[] = []; + documents: AI_Document[] = []; + + constructor(id: string, doc_ids: () => string[]) { + const pineconeApiKey = process.env.PINECONE_API_KEY; + if (!pineconeApiKey) { + throw new Error('PINECONE_API_KEY is not defined.'); + } + + this.pinecone = new Pinecone({ + apiKey: pineconeApiKey, + }); + this.cohere = new CohereClient({ + token: process.env.COHERE_API_KEY, + }); + this._id = id; + this._doc_ids = doc_ids(); + this.initializeIndex(); + } + + private async initializeIndex() { + const indexList: IndexList = await this.pinecone.listIndexes(); + + if (!indexList.indexes?.some(index => index.name === this.indexName)) { + await this.pinecone.createIndex({ + name: this.indexName, + dimension: 768, + metric: 'cosine', + spec: { + serverless: { + cloud: 'aws', + region: 'us-east-1', + }, + }, + }); + } + + this.index = this.pinecone.Index(this.indexName); + } + + async addAIDoc(doc: Doc) { + console.log('Adding AI Document:', doc); + const ai_document_status: string = StrCast(doc.ai_document_status); + if (ai_document_status !== undefined && ai_document_status !== null && ai_document_status !== '' && ai_document_status !== ' ' && ai_document_status !== '{}') { + if (ai_document_status === 'IN PROGRESS') { + console.log('Already in progress.'); + return; + } + if (!this._doc_ids.includes(StrCast(doc.ai_doc_id))) this._doc_ids.push(StrCast(doc.ai_doc_id)); + } else { + doc.ai_document_status = 'PROGRESS'; + console.log(doc); + console.log(PDFCast(doc.data)?.url?.pathname); + console.log(CsvCast(doc.data)?.url?.pathname); + const local_file_path: string = CsvCast(doc.data)?.url?.pathname ?? PDFCast(doc.data)?.url?.pathname; + console.log('Local File Path:', local_file_path); + if (local_file_path !== undefined || local_file_path !== null || local_file_path !== '') { + const { document_json } = await Networking.PostToServer('/createDocument', { file_path: local_file_path }); + console.log('Document JSON:', document_json); + //const ai_document: AI_Document = convertToAIDocument(document_json); + this.documents.push(document_json); + await this.indexDocument(JSON.parse(JSON.stringify(document_json, (key, value) => (value === null || value === undefined ? undefined : value)))); + console.log(`Document added: ${document_json.file_name}`); + doc.summary = document_json.summary; + doc.ai_doc_id = document_json.doc_id; + this._doc_ids.push(document_json.doc_id); + doc.ai_purpose = document_json.purpose; + if (doc.vectorstore_id === undefined || doc.vectorstore_id === null || doc.vectorstore_id === '' || doc.vectorstore_id === '[]') { + doc.vectorstore_id = JSON.stringify([this._id]); + } else { + doc.vectorstore_id = JSON.stringify(JSON.parse(StrCast(doc.vectorstore_id)).concat([this._id])); + } + if (doc.chunk_simpl === undefined || doc.chunk_simpl === null || doc.chunk_simpl === '' || doc.chunk_simpl === '[]') { + doc.chunk_simpl = JSON.stringify({ text_chunks: [], image_chunks: [] }); + } + let new_chunk_simpl: { text_chunks: { chunk_id: string; start_page: number; end_page: number }[]; image_chunks: { chunk_id: string; location: string; page: number }[] } = { + text_chunks: [], + image_chunks: [], + }; + + document_json.chunks.forEach((chunk: Chunk) => { + let chunk_to_add: { chunk_id: string; start_page: number; end_page: number }[] | { chunk_id: string; location: string; page: number }[]; + switch (chunk.metadata.type) { + case CHUNK_TYPE.TEXT: + chunk_to_add = [{ chunk_id: chunk.id, start_page: chunk.metadata.start_page, end_page: chunk.metadata.end_page }]; + new_chunk_simpl = JSON.parse(StrCast(doc.chunk_simpl)); + new_chunk_simpl.text_chunks = new_chunk_simpl.text_chunks.concat(chunk_to_add); + doc.chunk_simpl = JSON.stringify(new_chunk_simpl); + break; + case CHUNK_TYPE.IMAGE: + case CHUNK_TYPE.TABLE: + console.log('Location:', chunk.metadata.location); + chunk_to_add = [{ chunk_id: chunk.id, location: chunk.metadata.location, page: chunk.metadata.start_page }]; + new_chunk_simpl = JSON.parse(StrCast(doc.chunk_simpl)); + new_chunk_simpl.image_chunks = new_chunk_simpl.image_chunks.concat(chunk_to_add); + doc.chunk_simpl = JSON.stringify(new_chunk_simpl); + break; + } + }); + + doc.ai_document_status = 'COMPLETED'; + } + } + } + + private async indexDocument(document: any) { + console.log('Uploading vectors to content namespace...'); + const pineconeRecords: PineconeRecord[] = (document.chunks as Chunk[]).map( + chunk => + ({ + id: chunk.id, + values: chunk.values, + metadata: { ...chunk.metadata } as RecordMetadata, + }) as PineconeRecord + ); + await this.index.upsert(pineconeRecords); + } + + async retrieve(query: string, topK: number = 10): Promise { + console.log(`Retrieving chunks for query: ${query}`); + try { + const url = 'https://api.jina.ai/v1/embeddings'; + const headers = { + 'Content-Type': 'application/json', + Authorization: `Bearer ${process.env.JINA_API_KEY}`, + }; + const data = { + model: 'jina-clip-v1', + normalized: true, + embedding_type: 'float', + input: [{ text: query }], + }; + + const response = await axios.post(url, data, { headers }); + const embeddings = response.data?.data?.[0]?.embedding; + + if (!embeddings || !Array.isArray(embeddings)) { + throw new Error('Invalid embedding response format from Jina API'); + } + + const queryEmbedding = embeddings; + + const queryResponse: QueryResponse = await this.index.query({ + vector: queryEmbedding, + filter: { + doc_id: { $in: this._doc_ids }, + }, + topK, + includeValues: true, + includeMetadata: true, + }); + + return queryResponse.matches.map( + match => + ({ + id: match.id, + values: match.values as number[], + metadata: match.metadata as { text: string; type: string; original_document: string; file_path: string; doc_id: string; location: string; start_page: number; end_page: number }, + }) as Chunk + ); + } catch (error) { + console.error(`Error retrieving chunks: ${error}`); + return []; + } + } +} diff --git a/src/client/views/nodes/ChatBox/vectorstore/VectorstoreUpload.ts b/src/client/views/nodes/ChatBox/vectorstore/VectorstoreUpload.ts deleted file mode 100644 index 787705bb6..000000000 --- a/src/client/views/nodes/ChatBox/vectorstore/VectorstoreUpload.ts +++ /dev/null @@ -1,184 +0,0 @@ -import { Pinecone, Index, IndexList, PineconeRecord, RecordMetadata, QueryResponse } from '@pinecone-database/pinecone'; -import { CohereClient } from 'cohere-ai'; -import { EmbedResponse } from 'cohere-ai/api'; -import dotenv from 'dotenv'; - -import { Chunk, AI_Document, CHUNK_TYPE } from '../types'; -import { Doc } from '../../../../../fields/Doc'; -import { DocData } from '../../../../../fields/DocSymbols'; -import { CsvCast, PDFCast, StrCast } from '../../../../../fields/Types'; -import { Networking } from '../../../../Network'; - -dotenv.config(); - -export class Vectorstore { - private pinecone: Pinecone; - private index!: Index; - private cohere: CohereClient; - private indexName: string = 'pdf-chatbot'; - private _id: string; - private _doc_ids: string[] = []; - documents: AI_Document[] = []; - - constructor(id: string, doc_ids: () => string[]) { - const pineconeApiKey = process.env.PINECONE_API_KEY; - if (!pineconeApiKey) { - throw new Error('PINECONE_API_KEY is not defined.'); - } - - this.pinecone = new Pinecone({ - apiKey: pineconeApiKey, - }); - this.cohere = new CohereClient({ - token: process.env.COHERE_API_KEY, - }); - this._id = id; - this._doc_ids = doc_ids(); - this.initializeIndex(); - } - - private async initializeIndex() { - const indexList: IndexList = await this.pinecone.listIndexes(); - - if (!indexList.indexes?.some(index => index.name === this.indexName)) { - await this.pinecone.createIndex({ - name: this.indexName, - dimension: 1024, - metric: 'cosine', - spec: { - serverless: { - cloud: 'aws', - region: 'us-east-1', - }, - }, - }); - } - - this.index = this.pinecone.Index(this.indexName); - } - - async addAIDoc(doc: Doc) { - console.log('Adding AI Document:', doc); - const ai_document_status: string = StrCast(doc.ai_document_status); - if (ai_document_status !== undefined && ai_document_status !== null && ai_document_status !== '' && ai_document_status !== ' ' && ai_document_status !== '{}') { - if (ai_document_status === 'IN PROGRESS') { - console.log('Already in progress.'); - return; - } - if (!this._doc_ids.includes(StrCast(doc.ai_doc_id))) this._doc_ids.push(StrCast(doc.ai_doc_id)); - } else { - doc.ai_document_status = 'PROGRESS'; - console.log(doc); - console.log(PDFCast(doc.data)?.url?.pathname); - console.log(CsvCast(doc.data)?.url?.pathname); - const local_file_path: string = CsvCast(doc.data)?.url?.pathname ?? PDFCast(doc.data)?.url?.pathname; - console.log('Local File Path:', local_file_path); - if (local_file_path !== undefined || local_file_path !== null || local_file_path !== '') { - const { document_json } = await Networking.PostToServer('/createDocument', { file_path: local_file_path }); - console.log('Document JSON:', document_json); - //const ai_document: AI_Document = convertToAIDocument(document_json); - this.documents.push(document_json); - await this.indexDocument(JSON.parse(JSON.stringify(document_json, (key, value) => (value === null || value === undefined ? undefined : value)))); - console.log(`Document added: ${document_json.file_name}`); - doc.summary = document_json.summary; - doc.ai_doc_id = document_json.doc_id; - this._doc_ids.push(document_json.doc_id); - doc.ai_purpose = document_json.purpose; - if (doc.vectorstore_id === undefined || doc.vectorstore_id === null || doc.vectorstore_id === '' || doc.vectorstore_id === '[]') { - doc.vectorstore_id = JSON.stringify([this._id]); - } else { - doc.vectorstore_id = JSON.stringify(JSON.parse(StrCast(doc.vectorstore_id)).concat([this._id])); - } - if (doc.chunk_simpl === undefined || doc.chunk_simpl === null || doc.chunk_simpl === '' || doc.chunk_simpl === '[]') { - doc.chunk_simpl = JSON.stringify({ text_chunks: [], image_chunks: [] }); - } - let new_chunk_simpl: { text_chunks: { chunk_id: string; start_page: number; end_page: number }[]; image_chunks: { chunk_id: string; location: string; page: number }[] } = { - text_chunks: [], - image_chunks: [], - }; - - document_json.chunks.forEach((chunk: Chunk) => { - let chunk_to_add: { chunk_id: string; start_page: number; end_page: number }[] | { chunk_id: string; location: string; page: number }[]; - switch (chunk.metadata.type) { - case CHUNK_TYPE.TEXT: - chunk_to_add = [{ chunk_id: chunk.id, start_page: chunk.metadata.start_page, end_page: chunk.metadata.end_page }]; - new_chunk_simpl = JSON.parse(StrCast(doc.chunk_simpl)); - new_chunk_simpl.text_chunks = new_chunk_simpl.text_chunks.concat(chunk_to_add); - doc.chunk_simpl = JSON.stringify(new_chunk_simpl); - break; - case CHUNK_TYPE.IMAGE: - case CHUNK_TYPE.TABLE: - console.log('Location:', chunk.metadata.location); - chunk_to_add = [{ chunk_id: chunk.id, location: chunk.metadata.location, page: chunk.metadata.start_page }]; - new_chunk_simpl = JSON.parse(StrCast(doc.chunk_simpl)); - new_chunk_simpl.image_chunks = new_chunk_simpl.image_chunks.concat(chunk_to_add); - doc.chunk_simpl = JSON.stringify(new_chunk_simpl); - break; - } - }); - - doc.ai_document_status = 'COMPLETED'; - } - } - } - - private async indexDocument(document: any) { - console.log('Uploading vectors to content namespace...'); - const pineconeRecords: PineconeRecord[] = (document.chunks as Chunk[]).map( - chunk => - ({ - id: chunk.id, - values: chunk.values, - metadata: { ...chunk.metadata } as RecordMetadata, - }) as PineconeRecord - ); - await this.index.upsert(pineconeRecords); - } - - async retrieve(query: string, topK: number = 10): Promise { - console.log(`Retrieving chunks for query: ${query}`); - try { - const queryEmbeddingResponse: EmbedResponse = await this.cohere.embed({ - texts: [query], - model: 'embed-english-v3.0', - inputType: 'search_query', - }); - - let queryEmbedding: number[]; - - if (Array.isArray(queryEmbeddingResponse.embeddings)) { - queryEmbedding = queryEmbeddingResponse.embeddings[0]; - } else if (queryEmbeddingResponse.embeddings && 'embeddings' in queryEmbeddingResponse.embeddings) { - queryEmbedding = (queryEmbeddingResponse.embeddings as { embeddings: number[][] }).embeddings[0]; - } else { - throw new Error('Invalid embedding response format'); - } - - if (!Array.isArray(queryEmbedding)) { - throw new Error('Query embedding is not an array'); - } - - const queryResponse: QueryResponse = await this.index.query({ - vector: queryEmbedding, - filter: { - doc_id: { $in: this._doc_ids }, - }, - topK, - includeValues: true, - includeMetadata: true, - }); - - return queryResponse.matches.map( - match => - ({ - id: match.id, - values: match.values as number[], - metadata: match.metadata as { text: string; type: string; original_document: string; file_path: string; doc_id: string; location: string; start_page: number; end_page: number }, - }) as Chunk - ); - } catch (error) { - console.error(`Error retrieving chunks: ${error}`); - return []; - } - } -} diff --git a/src/server/ApiManagers/AssistantManager.ts b/src/server/ApiManagers/AssistantManager.ts index 36468157a..f69ca1383 100644 --- a/src/server/ApiManagers/AssistantManager.ts +++ b/src/server/ApiManagers/AssistantManager.ts @@ -9,6 +9,9 @@ import { Method } from '../RouteManager'; import ApiManager, { Registration } from './ApiManager'; import axios from 'axios'; import { Chunk } from '../../client/views/nodes/ChatBox/types'; +import { UnstructuredClient } from 'unstructured-client'; +import { PartitionResponse } from 'unstructured-client/sdk/models/operations'; +import { ChunkingStrategy, Strategy } from 'unstructured-client/sdk/models/shared'; export enum Directory { parsed_files = 'parsed_files', @@ -42,6 +45,11 @@ export default class AssistantManager extends ApiManager { apiKey: process.env._CLIENT_OPENAI_KEY, // Use client key so don't have to set key seperately for client and server. dangerouslyAllowBrowser: true, }); + const unstructuredClient = new UnstructuredClient({ + security: { + apiKeyAuth: process.env._CLIENT_UNSTRUCTURED_API_KEY!, + }, + }); register({ method: Method.POST, @@ -187,5 +195,51 @@ export default class AssistantManager extends ApiManager { res.send({ formattedChunks: content }); }, }); + + register({ + method: Method.POST, + subscription: '/chunkDocument', + secureHandler: async ({ req, res }) => { + const { file_path } = req.body; + const public_path = path.join(publicDirectory, file_path); + const file_name = path.basename(file_path); + + try { + // Read file data and convert to base64 + const file_data = await fs.promises.readFile(public_path); + + try { + const result = await unstructuredClient.general.partition({ + partitionParameters: { + files: { + content: file_data, + fileName: file_name, + }, + strategy: Strategy.Auto, + chunkingStrategy: ChunkingStrategy.ByTitle, + extractImageBlockTypes: ['Image', 'Table'], + }, + }); + + if (result.statusCode === 200) { + console.log(result.elements); + const jsonElements = JSON.stringify(result.elements, null, 2); + // Print the processed data. + console.log(jsonElements); + res.send({ document_json: jsonElements }); + } else { + console.error(`Unexpected status code: ${result.statusCode}`); + res.status(result.statusCode).send({ error: 'Failed to process the document', details: result }); + } + } catch (e: any) { + console.error('Error during partitioning:', e); + res.status(500).send({ error: 'Failed to partition the document', details: e.message }); + } + } catch (error: any) { + console.error('Error reading file:', error); + res.status(500).send({ error: 'Failed to read the file', details: error.message }); + } + }, + }); } } -- cgit v1.2.3-70-g09d2