import * as dotenv from 'dotenv';
import { Pinecone, ServerlessSpec } from '@pinecone-database/pinecone';
import { Configuration, OpenAI } from 'openai';
import * as fs from 'fs';
import * as path from 'path';
import { Document } from './file_processing'; // Assuming you have this file
import { getSummarizedSystemPrompt, getSummarizedChunksPrompt } from './prompt_generator'; // Assuming you have this file
import { CohereClient } from 'cohere-ai';

// Load variables from .env before any client reads process.env.
dotenv.config();

// Shared Pinecone client; falls back to an empty API key when the variable is unset.
const pinecone = new Pinecone({
  apiKey: process.env.PINECONE_API_KEY || '',
});

/** Metadata stored alongside each chunk vector. */
interface ChunkMetaData {
  text: string;
  type: string;
  original_document: string;
  file_path: string;
  location: string;
  start_page: number;
  end_page: number;
}

/** A single vector record: identifier, embedding values and its metadata. */
interface Chunk {
  id: string;
  values: number[];
  metadata: ChunkMetaData;
}

/**
 * Keeps an in-memory list of documents and mirrors their chunks into the
 * `pdf-chatbot` Pinecone index, which is created on construction if missing.
 */
class Vectorstore {
  private readonly documents: Document[];
  private readonly index_name: string;
  /**
   * Resolves to the Pinecone index handle. It is a promise because a
   * constructor cannot await index creation; every use must `await` it first.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- TODO: type against Pinecone's Index definitions
  private readonly index: Promise<any>;
  private readonly documents_folder: string;

  /** Starts index creation in the background and ensures `output/documents` exists. */
  constructor() {
    this.documents = [];
    this.index_name = 'pdf-chatbot';
    this.index = this.createIndex();
    // Observe a failed initialisation so it is logged rather than surfacing as an
    // unhandled rejection; later `await this.index` calls still see the rejection.
    this.index.catch((err: unknown) => {
      console.error(`Failed to initialise Pinecone index: ${err}`);
    });
    this.documents_folder = path.join('output', 'documents');
    fs.mkdirSync(this.documents_folder, { recursive: true });
  }

  /**
   * Records the document locally and uploads its chunks in the background.
   * The upload is fire-and-forget; failures are logged, not thrown.
   *
   * @param document - Document whose `chunks` are upserted into the index.
   */
  addDocument(document: Document): void {
    this.documents.push(document);
    void this.indexDocument(document).catch((err: unknown) => {
      console.error(`Error indexing document: ${err}`);
    });
  }

  /**
   * Upserts the document's chunks into the index.
   *
   * @param document - Document whose `chunks` are uploaded.
   * @throws Rejects if index initialisation or the upsert fails.
   */
  private async indexDocument(document: Document): Promise<void> {
    console.log('Uploading vectors to content namespace..');
    const index = await this.index;
    await index.upsert(document.chunks);
  }

  /**
   * Embeds the query with Cohere and returns the closest matches from the index.
   *
   * @param query - Free-text search query.
   * @param top_k - Maximum number of matches to request (default 10).
   * @returns The matches reported by the index, or an empty array when
   *          embedding or querying fails (the error is logged).
   */
  async retrieve(query: string, top_k: number = 10): Promise<Chunk[]> {
    console.log(`Retrieving chunks for query: ${query}`);
    const cohere = new CohereClient({
      token: process.env.COHERE_API_KEY || '',
    });

    try {
      const embedResponse = await cohere.embed({
        texts: [query],
        model: 'embed-english-v3.0',
        inputType: 'search_query',
      });

      // Only a plain array of float vectors is usable here; anything else is
      // treated as a failure instead of sending an undefined vector to Pinecone.
      const embeddings: unknown = embedResponse.embeddings;
      const queryEmb: unknown = Array.isArray(embeddings) ? embeddings[0] : undefined;
      if (!Array.isArray(queryEmb)) {
        throw new Error('embedding response did not contain a vector for the query');
      }

      const index = await this.index;
      const queryResponse = await index.query({
        vector: queryEmb,
        topK: top_k,
        includeValues: true,
        includeMetadata: true,
      });

      // NOTE(review): matches are assumed to carry Chunk-shaped metadata — confirm against what is upserted.
      return (queryResponse.matches ?? []) as Chunk[];
    } catch (e) {
      console.error(`Error embedding query: ${e}`);
      return [];
    }
  }

  /**
   * Builds a 1-based numbered list of every stored document's summary,
   * one per line, followed by a trailing newline.
   */
  getSummaries(): string {
    const numbered = this.documents.map((doc, i) => `${i + 1}. ${doc.summary}`);
    return numbered.join('\n') + '\n';
  }

  /**
   * Ensures the index exists (creating a serverless cosine index if not)
   * and returns a handle to it.
   *
   * @returns The Pinecone index handle for `index_name`.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- see `index` field
  private async createIndex(): Promise<any> {
    // listIndexes() resolves to an object with an `indexes` array of index
    // descriptions, not a list of names, so compare on each entry's `name`.
    const { indexes = [] } = await pinecone.listIndexes();
    const exists = indexes.some((entry) => entry.name === this.index_name);

    if (exists) {
      console.log('Index already exists...');
    } else {
      await pinecone.createIndex({
        name: this.index_name,
        // Must equal the embedding size of the model used in retrieve();
        // presumably 1024 for embed-english-v3.0 — TODO confirm.
        dimension: 1024,
        metric: 'cosine',
        spec: {
          serverless: {
            cloud: 'aws',
            region: 'us-east-1',
          },
        },
        // Block until the index can serve requests so the first upsert does not race creation.
        waitUntilReady: true,
      });
    }

    return pinecone.Index(this.index_name);
  }
}

export { Vectorstore, Chunk, ChunkMetaData };