diff options
author | A.J. Shulman <Shulman.aj@gmail.com> | 2024-07-09 13:55:03 -0400 |
---|---|---|
committer | A.J. Shulman <Shulman.aj@gmail.com> | 2024-07-09 13:55:03 -0400 |
commit | c789d3d41a68c89e75fdfc12b1b05377ceef32d1 (patch) | |
tree | 1079016e962a4f0ece1dd02aff92e07c6c2826ab /src/client/apis | |
parent | a578f43335b0009927df4c341be3aee4f74be6d9 (diff) |
starting to improve vectorstore
Diffstat (limited to 'src/client/apis')
-rw-r--r-- | src/client/apis/vectorstore/VectorstoreUpload.ts | 130 |
1 files changed, 0 insertions, 130 deletions
diff --git a/src/client/apis/vectorstore/VectorstoreUpload.ts b/src/client/apis/vectorstore/VectorstoreUpload.ts deleted file mode 100644 index 6c60ad0c8..000000000 --- a/src/client/apis/vectorstore/VectorstoreUpload.ts +++ /dev/null @@ -1,130 +0,0 @@ -import { Pinecone, Index, IndexList, PineconeRecord } from '@pinecone-database/pinecone'; -import { CohereClient } from 'cohere-ai'; -import { EmbedResponse } from 'cohere-ai/api'; -import dotenv from 'dotenv'; - -dotenv.config(); - -interface ChunkMetadata { - text: string; - type: string; - original_document: string; - file_path: string; - location: string; - start_page: number; - end_page: number; - [key: string]: string | number; // Add this line -} - -interface Chunk { - id: string; - values: number[]; - metadata: ChunkMetadata; -} - -interface Document { - purpose: string; - file_name: string; - num_pages: number; - summary: string; - chunks: Chunk[]; - type: string; -} - -class Vectorstore { - private pinecone: Pinecone; - private index: Index; - private cohere: CohereClient; - private indexName: string = 'pdf-chatbot'; - private documents: Document[] = []; - - constructor() { - this.pinecone = new Pinecone({ - apiKey: process.env.PINECONE_API_KEY!, - }); - this.cohere = new CohereClient({ - token: process.env.COHERE_API_KEY!, - }); - this.createIndex(); - } - - private async createIndex() { - const indexList: IndexList = await this.pinecone.listIndexes(); - - if (!indexList.indexes?.some(index => index.name === this.indexName)) { - await this.pinecone.createIndex({ - name: this.indexName, - dimension: 1024, - metric: 'cosine', - spec: { - serverless: { - cloud: 'aws', - region: 'us-east-1', - }, - }, - }); - } - - this.index = this.pinecone.Index(this.indexName); - } - - async addDocument(document: Document) { - this.documents.push(document); - await this.indexDocument(document); - } - - private async indexDocument(document: Document) { - console.log('Uploading vectors to content namespace...'); - const pineconeRecords: PineconeRecord[] = document.chunks.map(chunk => ({ - id: chunk.id, - values: chunk.values, - metadata: chunk.metadata, - })); - await this.index.upsert(pineconeRecords); - } - - async retrieve(query: string, topK: number = 10): Promise<Chunk[]> { - console.log(`Retrieving chunks for query: ${query}`); - try { - const queryEmbeddingResponse: EmbedResponse = await this.cohere.embed({ - texts: [query], - model: 'embed-english-v3.0', - inputType: 'search_query', - }); - - let queryEmbedding: number[]; - - if (Array.isArray(queryEmbeddingResponse.embeddings)) { - queryEmbedding = queryEmbeddingResponse.embeddings[0]; - } else if (queryEmbeddingResponse.embeddings && 'embeddings' in queryEmbeddingResponse.embeddings) { - queryEmbedding = (queryEmbeddingResponse.embeddings as { embeddings: number[][] }).embeddings[0]; - } else { - throw new Error('Invalid embedding response format'); - } - - if (!Array.isArray(queryEmbedding)) { - throw new Error('Query embedding is not an array'); - } - - const queryResponse = await this.index.query({ - vector: queryEmbedding, - topK, - includeValues: true, - includeMetadata: true, - }); - - return queryResponse.matches.map(match => ({ - id: match.id, - values: match.values as number[], - metadata: match.metadata as ChunkMetadata, - })); - } catch (error) { - console.error(`Error retrieving chunks: ${error}`); - return []; - } - } - - getSummaries(): string { - return this.documents.map((doc, index) => `${index + 1}. ${doc.summary}`).join('\n') + '\n'; - } -} |