diff options
| author | A.J. Shulman <Shulman.aj@gmail.com> | 2024-07-08 13:47:53 -0400 | 
|---|---|---|
| committer | A.J. Shulman <Shulman.aj@gmail.com> | 2024-07-08 13:47:53 -0400 | 
| commit | ca8a9125ee88cc633755d80ca6b3bb888a6dc7d7 (patch) | |
| tree | 9d7191d3cfdfd8cae54a4d41ec2f5d8fb3103d55 /src/client/apis/vectorstore/VectorstoreUpload.ts | |
| parent | e297c75cdcc8bb5b1b138d1272f1f6f27b222f4c (diff) | |
starting vectorstore
Diffstat (limited to 'src/client/apis/vectorstore/VectorstoreUpload.ts')
| -rw-r--r-- | src/client/apis/vectorstore/VectorstoreUpload.ts | 112 | 
1 files changed, 112 insertions, 0 deletions
/**
 * Vector store backed by a Pinecone index, with queries embedded through Cohere.
 *
 * Documents are kept in memory (for their summaries) and their chunks are
 * upserted into the Pinecone index named by `Vectorstore.index_name`.
 */
import * as dotenv from 'dotenv';
import { Pinecone, ServerlessSpec } from '@pinecone-database/pinecone';
import { Configuration, OpenAI } from 'openai';
import * as fs from 'fs';
import * as path from 'path';
import { Document } from './file_processing'; // Assuming you have this file
import { getSummarizedSystemPrompt, getSummarizedChunksPrompt } from './prompt_generator'; // Assuming you have this file
import { CohereClient } from 'cohere-ai';

// Load environment variables (PINECONE_API_KEY, COHERE_API_KEY) before any client is built.
dotenv.config();

const pinecone = new Pinecone({
    apiKey: process.env.PINECONE_API_KEY || '',
});

/** Metadata stored alongside each chunk vector. */
interface ChunkMetaData {
    text: string;
    type: string;
    original_document: string;
    file_path: string;
    location: string;
    start_page: number;
    end_page: number;
}

/** A single vector record: an id, its embedding values, and its metadata. */
interface Chunk {
    id: string;
    values: number[];
    metadata: ChunkMetaData;
}

/**
 * Reports whether an index called `name` appears in the result of `pinecone.listIndexes()`.
 *
 * The shape of that result differs between SDK versions, so this accepts an object
 * carrying an `indexes` array as well as a bare array, and entries that are either
 * plain name strings or objects with a `name` property.
 *
 * @param listing - Whatever `listIndexes()` resolved to.
 * @param name - Index name to look for.
 * @returns `true` if an entry with that name is present, `false` otherwise
 *          (including when `listing` has an unrecognised shape).
 */
function indexListContains(listing: unknown, name: string): boolean {
    const entries = Array.isArray(listing) ? listing : (listing as { indexes?: unknown } | null | undefined)?.indexes;
    if (!Array.isArray(entries)) {
        return false;
    }
    return entries.some(entry => (typeof entry === 'string' ? entry === name : (entry as { name?: unknown } | null | undefined)?.name === name));
}

class Vectorstore {
    private documents: Document[];
    private index_name: string;
    // Resolves to the Pinecone index handle once `createIndex` has finished.
    // Kept as `any` because the record type of `Document.chunks` is declared in another file.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    private index: Promise<any>;
    private documents_folder: string;

    /**
     * Creates the local output folder and starts preparing the Pinecone index.
     *
     * Index preparation is asynchronous; methods that need the index await it,
     * so the instance can be used immediately after construction.
     */
    constructor() {
        this.documents = [];
        this.index_name = 'pdf-chatbot';
        this.documents_folder = path.join('output', 'documents');
        fs.mkdirSync(this.documents_folder, { recursive: true });

        this.index = this.createIndex();
        // Report a failed setup once here so it never surfaces as an unhandled rejection;
        // callers that await `this.index` still observe the rejection themselves.
        this.index.catch(e => console.error(`Error preparing index ${this.index_name}: ${e}`));
    }

    /**
     * Registers a document and uploads its chunks to the index in the background.
     *
     * The upload is fire-and-forget: failures are logged rather than thrown,
     * because this method returns before the upload completes.
     *
     * @param document - Document whose `chunks` are upserted and whose `summary` is later listed.
     */
    addDocument(document: Document): void {
        this.documents.push(document);
        void this.indexDocument(document).catch(e => console.error(`Error indexing document: ${e}`));
    }

    /**
     * Upserts all chunks of `document` into the Pinecone index.
     *
     * @param document - Document providing the `chunks` to upload.
     */
    private async indexDocument(document: Document): Promise<void> {
        console.log('Uploading vectors to content namespace..');
        const index = await this.index;
        await index.upsert(document.chunks);
    }

    /**
     * Embeds `query` with Cohere and returns the closest chunks from the index.
     *
     * @param query - Natural-language query text.
     * @param top_k - Maximum number of matches to request (default 10).
     * @returns The matching chunks, or an empty array if embedding or querying fails.
     */
    async retrieve(query: string, top_k: number = 10): Promise<Chunk[]> {
        console.log(`Retrieving chunks for query: ${query}`);

        const cohere = new CohereClient({
            token: process.env.COHERE_API_KEY || '',
        });

        try {
            const embedResponse = await cohere.embed({
                texts: [query],
                model: 'embed-english-v3.0',
                inputType: 'search_query',
            });

            // The embed response carries either a plain array of vectors or an
            // object keyed by embedding type; accept both.
            const embeddings: unknown = embedResponse.embeddings;
            const queryEmb = Array.isArray(embeddings) ? embeddings[0] : (embeddings as { float?: number[][] } | null | undefined)?.float?.[0];
            if (!queryEmb) {
                console.error('Error embedding query: no embedding returned');
                return [];
            }

            const index = await this.index;
            const queryResponse = await index.query({
                vector: queryEmb,
                topK: top_k,
                includeValues: true,
                includeMetadata: true,
            });

            return (queryResponse.matches ?? []) as Chunk[];
        } catch (e) {
            console.error(`Error embedding query: ${e}`);
            return [];
        }
    }

    /**
     * Builds a numbered list of the summaries of every added document.
     *
     * @returns One line per document in the form `N. summary`, followed by a trailing newline.
     */
    getSummaries(): string {
        const summaries = this.documents.map(doc => doc.summary);
        return summaries.map((summary, i) => `${i + 1}. ${summary}`).join('\n') + '\n';
    }

    /**
     * Ensures the Pinecone index exists and returns a handle to it.
     *
     * A missing index is created as a serverless cosine index of dimension 1024.
     * The dimension must match the vectors being upserted and queried.
     *
     * @returns The index handle for `this.index_name`.
     */
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    private async createIndex(): Promise<any> {
        const existing = await pinecone.listIndexes();
        if (indexListContains(existing, this.index_name)) {
            console.log('Index already exists...');
        } else {
            await pinecone.createIndex({
                name: this.index_name,
                dimension: 1024,
                metric: 'cosine',
                spec: {
                    serverless: {
                        cloud: 'aws',
                        region: 'us-east-1',
                    },
                },
                // Block until the new index can serve requests, so the first
                // upsert/query does not race index provisioning.
                waitUntilReady: true,
            });
        }
        return pinecone.Index(this.index_name);
    }
}

export { Vectorstore, Chunk, ChunkMetaData };
