about | summary | refs | log | tree | commit | diff
path: root/src/client/apis
diff options
context:
space:
mode:
Diffstat (limited to 'src/client/apis')
-rw-r--r-- src/client/apis/vectorstore/VectorstoreUpload.ts | 112
1 files changed, 112 insertions, 0 deletions
diff --git a/src/client/apis/vectorstore/VectorstoreUpload.ts b/src/client/apis/vectorstore/VectorstoreUpload.ts
new file mode 100644
index 000000000..78f652d9a
--- /dev/null
+++ b/src/client/apis/vectorstore/VectorstoreUpload.ts
@@ -0,0 +1,112 @@
+import * as dotenv from 'dotenv';
+import { Pinecone, ServerlessSpec } from '@pinecone-database/pinecone';
+import { Configuration, OpenAI } from 'openai';
+import * as fs from 'fs';
+import * as path from 'path';
+import { Document } from './file_processing'; // Assuming you have this file
+import { getSummarizedSystemPrompt, getSummarizedChunksPrompt } from './prompt_generator'; // Assuming you have this file
+import { CohereClient } from 'cohere-ai';
+
+// Load environment variables via dotenv before any API key is looked up.
+dotenv.config();
+
+// Module-wide Pinecone client shared by every Vectorstore instance.
+// An unset PINECONE_API_KEY falls back to an empty string.
+const pineconeApiKey = process.env.PINECONE_API_KEY || '';
+const pinecone = new Pinecone({ apiKey: pineconeApiKey });
+
+interface ChunkMetaData { // Metadata attached to each Chunk (see Chunk.metadata). Field meanings below are inferred from names only — confirm against ./file_processing.
+ text: string; // presumably the chunk's text content — TODO confirm
+ type: string; // presumably the kind of content in the chunk; allowed values are not visible in this file
+ original_document: string; // presumably the name of the document the chunk came from — TODO confirm
+ file_path: string; // presumably the path of the source file — TODO confirm
+ location: string; // NOTE(review): meaning is not evident from this file
+ start_page: number; // presumably the first page the chunk covers; 0- vs 1-based is unknown here
+ end_page: number; // presumably the last page the chunk covers — TODO confirm inclusive/exclusive
+}
+
+interface Chunk { // Shape that Vectorstore.retrieve() casts Pinecone query matches to.
+ id: string; // record identifier
+ values: number[]; // vector values; the index is created with dimension 1024, so presumably that length — confirm
+ metadata: ChunkMetaData; // descriptive fields carried alongside the vector
+}
+
+/**
+ * Keeps the documents added so far and mirrors their chunks into the
+ * Pinecone index named by `index_name`, creating that index when it does not
+ * exist yet.
+ *
+ * Index setup starts in the constructor but finishes asynchronously; every
+ * method that talks to Pinecone awaits the pending handle before using it.
+ */
+class Vectorstore {
+  private documents: Document[];
+  private index_name: string;
+  // Pending handle to the Pinecone index. Stored as a promise because index
+  // setup is async and a constructor cannot await it.
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- TODO: type this with Pinecone's Index definition
+  private index: Promise<any>;
+  private documents_folder: string;
+
+  /**
+   * Creates the local output folder and kicks off index setup.
+   *
+   * @throws if the output folder cannot be created.
+   */
+  constructor() {
+    this.documents = [];
+    this.index_name = 'pdf-chatbot';
+    this.documents_folder = path.join('output', 'documents');
+    fs.mkdirSync(this.documents_folder, { recursive: true });
+
+    this.index = this.createIndex();
+    // Attach a handler right away so a failed setup is reported instead of
+    // surfacing as an unhandled rejection. Later awaits of `this.index`
+    // still observe the failure.
+    this.index.catch((err: unknown) => {
+      console.error(
+        `Error preparing index '${this.index_name}': ${Vectorstore.describeError(err)}`,
+      );
+    });
+  }
+
+  /**
+   * Registers a document and uploads its chunks in the background.
+   *
+   * The method stays synchronous for existing callers, so an upload failure
+   * is logged rather than thrown.
+   *
+   * @param document Document whose `chunks` are upserted into the index.
+   */
+  addDocument(document: Document): void {
+    this.documents.push(document);
+    void this.indexDocument(document).catch((err: unknown) => {
+      console.error(`Error indexing document: ${Vectorstore.describeError(err)}`);
+    });
+  }
+
+  /**
+   * Upserts the document's chunks once the index handle is available.
+   *
+   * NOTE(review): the log line mentions a "content namespace", but the
+   * upsert targets the index without selecting a namespace — confirm which
+   * is intended.
+   */
+  private async indexDocument(document: Document): Promise<void> {
+    // Nothing to upload; skip the request instead of sending an empty batch.
+    if (!document.chunks?.length) {
+      return;
+    }
+    console.log('Uploading vectors to content namespace..');
+    const index = await this.index;
+    await index.upsert(document.chunks);
+  }
+
+  /**
+   * Embeds the query with Cohere and returns the closest stored chunks.
+   *
+   * @param query Free-text search query.
+   * @param top_k Maximum number of matches to request (default 10).
+   * @returns The matches cast to `Chunk`, or an empty array when embedding,
+   *   index setup, or the index query fails (the error is logged).
+   */
+  async retrieve(query: string, top_k: number = 10): Promise<Chunk[]> {
+    console.log(`Retrieving chunks for query: ${query}`);
+
+    const cohere = new CohereClient({
+      token: process.env.COHERE_API_KEY || '',
+    });
+
+    try {
+      const embedResponse = await cohere.embed({
+        texts: [query],
+        model: 'embed-english-v3.0',
+        inputType: 'search_query',
+      });
+      // One input text was sent, so the first embedding is the query's.
+      const queryEmb = embedResponse.embeddings[0];
+
+      const index = await this.index;
+      const queryResponse = await index.query({
+        vector: queryEmb,
+        topK: top_k,
+        includeValues: true,
+        includeMetadata: true,
+      });
+
+      return (queryResponse.matches ?? []) as Chunk[];
+    } catch (e: unknown) {
+      // This catch covers the embedding call, index setup, and the index
+      // query, so the message does not single out one of them.
+      console.error(`Error retrieving chunks: ${Vectorstore.describeError(e)}`);
+      return [];
+    }
+  }
+
+  /**
+   * Lists the summaries of all added documents, one per line.
+   *
+   * @returns Lines of the form `1. <summary>`, each terminated by a newline;
+   *   a single newline when no documents have been added.
+   */
+  getSummaries(): string {
+    const lines = this.documents.map((doc, i) => `${i + 1}. ${doc.summary}`);
+    return lines.join('\n') + '\n';
+  }
+
+  /**
+   * Ensures the index exists and resolves to a handle for it.
+   *
+   * @returns The Pinecone index handle for `index_name`.
+   * @throws whatever the Pinecone client rejects with while listing or
+   *   creating indexes.
+   */
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- see the `index` field
+  private async createIndex(): Promise<any> {
+    // listIndexes() resolves to an object whose `indexes` array holds index
+    // descriptions, so existence is decided by comparing each entry's name.
+    const { indexes } = await pinecone.listIndexes();
+    const alreadyExists =
+      indexes?.some((existing) => existing.name === this.index_name) ?? false;
+
+    if (alreadyExists) {
+      console.log('Index already exists...');
+    } else {
+      await pinecone.createIndex({
+        name: this.index_name,
+        // Must equal the length of the vectors produced in retrieve().
+        dimension: 1024,
+        metric: 'cosine',
+        spec: {
+          serverless: {
+            cloud: 'aws',
+            region: 'us-east-1',
+          },
+        },
+        // Resolve only once the new index can accept requests.
+        waitUntilReady: true,
+      });
+    }
+    return pinecone.Index(this.index_name);
+  }
+
+  /** Extracts a readable message from a caught value. */
+  private static describeError(err: unknown): string {
+    return err instanceof Error ? err.message : String(err);
+  }
+}
+
+export { Vectorstore, Chunk, ChunkMetaData }; // Module's public surface: the store class and the two chunk shapes it works with.