diff options
Diffstat (limited to 'src/client')
-rw-r--r-- | src/client/apis/vectorstore/VectorstoreUpload.ts | 112 |
1 files changed, 112 insertions, 0 deletions
// diff --git a/src/client/apis/vectorstore/VectorstoreUpload.ts b/src/client/apis/vectorstore/VectorstoreUpload.ts
// new file mode 100644 index 000000000..78f652d9a
import * as dotenv from 'dotenv';
import { Pinecone, ServerlessSpec } from '@pinecone-database/pinecone';
import { Configuration, OpenAI } from 'openai';
import * as fs from 'fs';
import * as path from 'path';
import { Document } from './file_processing'; // Assuming you have this file
import { getSummarizedSystemPrompt, getSummarizedChunksPrompt } from './prompt_generator'; // Assuming you have this file
import { CohereClient } from 'cohere-ai';

// Load environment variables (PINECONE_API_KEY, COHERE_API_KEY) before any client is built.
dotenv.config();

/** Shared Pinecone client; falls back to an empty API key when the variable is unset. */
const pinecone = new Pinecone({
  apiKey: process.env.PINECONE_API_KEY ?? '',
});

/**
 * Metadata stored alongside each vector.
 * Field meanings are inferred from their names — TODO confirm against the code
 * that produces chunks in `./file_processing`.
 */
interface ChunkMetaData {
  text: string;
  type: string;
  original_document: string;
  file_path: string;
  location: string;
  start_page: number;
  end_page: number;
}

/** A single vector record: identifier, embedding values and its metadata. */
interface Chunk {
  id: string;
  values: number[];
  metadata: ChunkMetaData;
}

/**
 * Keeps an in-memory list of documents and mirrors their chunks into the
 * `pdf-chatbot` Pinecone index, which is created on first use if missing.
 */
class Vectorstore {
  private documents: Document[];
  private index_name: string;
  /**
   * Resolves to the Pinecone index handle once `createIndex` has finished.
   * Kept as a promise because a constructor cannot await; every use awaits it.
   * TODO: type this properly based on Pinecone's TypeScript definitions.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  private indexReady: Promise<any>;
  private documents_folder: string;

  /**
   * Starts index initialisation in the background and makes sure the local
   * `output/documents` folder exists.
   */
  constructor() {
    this.documents = [];
    this.index_name = 'pdf-chatbot';
    this.indexReady = this.createIndex();
    // Log an initialisation failure once so it is not an unhandled rejection;
    // later `await this.indexReady` calls still observe the same rejection.
    this.indexReady.catch((err: unknown) => {
      console.error(`Error initializing Pinecone index: ${err}`);
    });
    this.documents_folder = path.join('output', 'documents');
    fs.mkdirSync(this.documents_folder, { recursive: true });
  }

  /**
   * Registers a document locally and uploads its chunks in the background.
   * Upload failures are logged, not thrown, because this method is synchronous.
   *
   * @param document - Document whose `chunks` are upserted into the index.
   */
  addDocument(document: Document): void {
    this.documents.push(document);
    void this.indexDocument(document).catch((err: unknown) => {
      console.error(`Error indexing document: ${err}`);
    });
  }

  /**
   * Upserts the document's chunks into the Pinecone index.
   *
   * @param document - Document whose `chunks` are sent to `upsert`.
   * @throws Whatever index initialisation or the upsert call rejects with.
   */
  private async indexDocument(document: Document): Promise<void> {
    console.log('Uploading vectors to content namespace..');
    // Wait for the handle: `indexReady` is a promise, not the index itself.
    const index = await this.indexReady;
    await index.upsert(document.chunks);
  }

  /**
   * Embeds the query with Cohere and returns the closest chunks from Pinecone.
   *
   * @param query - Natural-language search text.
   * @param top_k - Maximum number of matches to request (default 10).
   * @returns Matching chunks, or an empty array if embedding or querying fails.
   */
  async retrieve(query: string, top_k: number = 10): Promise<Chunk[]> {
    console.log(`Retrieving chunks for query: ${query}`);

    const cohere = new CohereClient({
      token: process.env.COHERE_API_KEY ?? '',
    });

    try {
      const embedResponse = await cohere.embed({
        texts: [query],
        model: 'embed-english-v3.0',
        inputType: 'search_query',
      });

      // Only a plain array of float vectors is usable as a query vector.
      const embeddings: unknown = embedResponse.embeddings;
      const queryEmb: unknown = Array.isArray(embeddings) ? embeddings[0] : undefined;
      if (!Array.isArray(queryEmb)) {
        throw new Error('Cohere returned no float embedding for the query');
      }

      const index = await this.indexReady;
      const queryResponse = await index.query({
        vector: queryEmb,
        topK: top_k,
        includeValues: true,
        includeMetadata: true,
      });

      // `matches` may be absent on the response; normalise to an empty list.
      return (queryResponse.matches ?? []) as Chunk[];
    } catch (e) {
      console.error(`Error embedding query: ${e}`);
      return [];
    }
  }

  /**
   * Builds a numbered, newline-separated list of every stored document summary.
   *
   * @returns Lines of the form `"1. <summary>"`, always ending in a newline
   *   (a lone `"\n"` when no documents are stored).
   */
  getSummaries(): string {
    const numbered = this.documents.map((doc, i) => `${i + 1}. ${doc.summary}`);
    return numbered.join('\n') + '\n';
  }

  /**
   * Ensures the index exists, creating a serverless one if needed, and returns
   * a handle to it.
   *
   * @returns The Pinecone index handle for `index_name`.
   * @throws Whatever the list or create call rejects with.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  private async createIndex(): Promise<any> {
    // `listIndexes` resolves to an object holding index descriptions, not to an
    // array of names, so existence has to be checked on each entry's `name`.
    const { indexes = [] } = await pinecone.listIndexes();
    const alreadyExists = indexes.some((existing) => existing.name === this.index_name);

    if (alreadyExists) {
      console.log('Index already exists...');
    } else {
      await pinecone.createIndex({
        name: this.index_name,
        // Must equal the embedding size used in `retrieve`
        // (presumably 1024 for embed-english-v3.0 — confirm against Cohere docs).
        dimension: 1024,
        metric: 'cosine',
        spec: {
          serverless: {
            cloud: 'aws',
            region: 'us-east-1',
          },
        },
        // Block until the new index can serve requests, so the first
        // upsert/query does not race index provisioning.
        waitUntilReady: true,
      });
    }
    return pinecone.Index(this.index_name);
  }
}

export { Vectorstore, Chunk, ChunkMetaData };