diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/client/apis/vectorstore/VectorstoreUpload.ts | 146 |
1 files changed, 82 insertions, 64 deletions
diff --git a/src/client/apis/vectorstore/VectorstoreUpload.ts b/src/client/apis/vectorstore/VectorstoreUpload.ts index 78f652d9a..6c60ad0c8 100644 --- a/src/client/apis/vectorstore/VectorstoreUpload.ts +++ b/src/client/apis/vectorstore/VectorstoreUpload.ts @@ -1,19 +1,11 @@ -import * as dotenv from 'dotenv'; -import { Pinecone, ServerlessSpec } from '@pinecone-database/pinecone'; -import { Configuration, OpenAI } from 'openai'; -import * as fs from 'fs'; -import * as path from 'path'; -import { Document } from './file_processing'; // Assuming you have this file -import { getSummarizedSystemPrompt, getSummarizedChunksPrompt } from './prompt_generator'; // Assuming you have this file +import { Pinecone, Index, IndexList, PineconeRecord } from '@pinecone-database/pinecone'; import { CohereClient } from 'cohere-ai'; +import { EmbedResponse } from 'cohere-ai/api'; +import dotenv from 'dotenv'; dotenv.config(); -const pinecone = new Pinecone({ - apiKey: process.env.PINECONE_API_KEY || '', -}); - -interface ChunkMetaData { +interface ChunkMetadata { text: string; type: string; original_document: string; @@ -21,92 +13,118 @@ interface ChunkMetaData { location: string; start_page: number; end_page: number; + [key: string]: string | number; // Add this line } interface Chunk { id: string; values: number[]; - metadata: ChunkMetaData; + metadata: ChunkMetadata; +} + +interface Document { + purpose: string; + file_name: string; + num_pages: number; + summary: string; + chunks: Chunk[]; + type: string; } class Vectorstore { - private documents: Document[]; - private index_name: string; - private index: any; // Type this properly based on Pinecone's TypeScript definitions - private documents_folder: string; + private pinecone: Pinecone; + private index: Index; + private cohere: CohereClient; + private indexName: string = 'pdf-chatbot'; + private documents: Document[] = []; constructor() { - this.documents = []; - this.index_name = 'pdf-chatbot'; - this.index = this.createIndex(); - this.documents_folder = path.join('output', 'documents'); - fs.mkdirSync(this.documents_folder, { recursive: true }); + this.pinecone = new Pinecone({ + apiKey: process.env.PINECONE_API_KEY!, + }); + this.cohere = new CohereClient({ + token: process.env.COHERE_API_KEY!, + }); + this.createIndex(); + } + + private async createIndex() { + const indexList: IndexList = await this.pinecone.listIndexes(); + + if (!indexList.indexes?.some(index => index.name === this.indexName)) { + await this.pinecone.createIndex({ + name: this.indexName, + dimension: 1024, + metric: 'cosine', + spec: { + serverless: { + cloud: 'aws', + region: 'us-east-1', + }, + }, + }); + } + + this.index = this.pinecone.Index(this.indexName); } - addDocument(document: Document): void { + async addDocument(document: Document) { this.documents.push(document); - this.indexDocument(document); + await this.indexDocument(document); } - private async indexDocument(document: Document): Promise<void> { - console.log('Uploading vectors to content namespace..'); - await this.index.upsert(document.chunks); + private async indexDocument(document: Document) { + console.log('Uploading vectors to content namespace...'); + const pineconeRecords: PineconeRecord[] = document.chunks.map(chunk => ({ + id: chunk.id, + values: chunk.values, + metadata: chunk.metadata, + })); + await this.index.upsert(pineconeRecords); } - async retrieve(query: string, top_k: number = 10): Promise<Chunk[]> { + async retrieve(query: string, topK: number = 10): Promise<Chunk[]> { console.log(`Retrieving chunks for query: ${query}`); - - const cohere = new CohereClient({ - token: process.env.COHERE_API_KEY || '', - }); - try { - const embedResponse = await cohere.embed({ + const queryEmbeddingResponse: EmbedResponse = await this.cohere.embed({ texts: [query], model: 'embed-english-v3.0', inputType: 'search_query', }); - const queryEmb = embedResponse.embeddings[0]; + let queryEmbedding: number[]; + + if (Array.isArray(queryEmbeddingResponse.embeddings)) { + queryEmbedding = queryEmbeddingResponse.embeddings[0]; + } else if (queryEmbeddingResponse.embeddings && 'embeddings' in queryEmbeddingResponse.embeddings) { + queryEmbedding = (queryEmbeddingResponse.embeddings as { embeddings: number[][] }).embeddings[0]; + } else { + throw new Error('Invalid embedding response format'); + } + + if (!Array.isArray(queryEmbedding)) { + throw new Error('Query embedding is not an array'); + } const queryResponse = await this.index.query({ - vector: queryEmb, - topK: top_k, + vector: queryEmbedding, + topK, includeValues: true, includeMetadata: true, }); - return queryResponse.matches as Chunk[]; - } catch (e) { - console.error(`Error embedding query: ${e}`); + return queryResponse.matches.map(match => ({ + id: match.id, + values: match.values as number[], + metadata: match.metadata as ChunkMetadata, + })); + } catch (error) { + console.error(`Error retrieving chunks: ${error}`); return []; } } getSummaries(): string { - const summaries = this.documents.map(doc => doc.summary); - return summaries.map((summary, i) => `${i + 1}. ${summary}`).join('\n') + '\n'; - } - - private async createIndex(): Promise<any> { - const indexes = await pinecone.listIndexes(); - if (indexes.includes(this.index_name)) { - console.log('Index already exists...'); - } else { - await pinecone.createIndex({ - name: this.index_name, - dimension: 1024, - metric: 'cosine', - spec: { - serverless: { - cloud: 'aws', - region: 'us-east-1', - }, - }, - }); - } - return pinecone.Index(this.index_name); + return this.documents.map((doc, index) => `${index + 1}. ${doc.summary}`).join('\n') + '\n'; } } - -export { Vectorstore, Chunk, ChunkMetaData }; |