about summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/client/apis/vectorstore/VectorstoreUpload.ts146
1 file changed, 82 insertions, 64 deletions
diff --git a/src/client/apis/vectorstore/VectorstoreUpload.ts b/src/client/apis/vectorstore/VectorstoreUpload.ts
index 78f652d9a..6c60ad0c8 100644
--- a/src/client/apis/vectorstore/VectorstoreUpload.ts
+++ b/src/client/apis/vectorstore/VectorstoreUpload.ts
@@ -1,19 +1,11 @@
-import * as dotenv from 'dotenv';
-import { Pinecone, ServerlessSpec } from '@pinecone-database/pinecone';
-import { Configuration, OpenAI } from 'openai';
-import * as fs from 'fs';
-import * as path from 'path';
-import { Document } from './file_processing'; // Assuming you have this file
-import { getSummarizedSystemPrompt, getSummarizedChunksPrompt } from './prompt_generator'; // Assuming you have this file
+import { Pinecone, Index, IndexList, PineconeRecord } from '@pinecone-database/pinecone';
import { CohereClient } from 'cohere-ai';
+import { EmbedResponse } from 'cohere-ai/api';
+import dotenv from 'dotenv';
dotenv.config();
-const pinecone = new Pinecone({
- apiKey: process.env.PINECONE_API_KEY || '',
-});
-
-interface ChunkMetaData {
+interface ChunkMetadata {
text: string;
type: string;
original_document: string;
@@ -21,92 +13,118 @@ interface ChunkMetaData {
location: string;
start_page: number;
end_page: number;
+  [key: string]: string | number; // Index signature so ChunkMetadata is assignable to Pinecone's RecordMetadata
}
interface Chunk {
id: string;
values: number[];
- metadata: ChunkMetaData;
+ metadata: ChunkMetadata;
+}
+
+interface Document {
+ purpose: string;
+ file_name: string;
+ num_pages: number;
+ summary: string;
+ chunks: Chunk[];
+ type: string;
}
class Vectorstore {
- private documents: Document[];
- private index_name: string;
- private index: any; // Type this properly based on Pinecone's TypeScript definitions
- private documents_folder: string;
+ private pinecone: Pinecone;
+ private index: Index;
+ private cohere: CohereClient;
+ private indexName: string = 'pdf-chatbot';
+ private documents: Document[] = [];
constructor() {
- this.documents = [];
- this.index_name = 'pdf-chatbot';
- this.index = this.createIndex();
- this.documents_folder = path.join('output', 'documents');
- fs.mkdirSync(this.documents_folder, { recursive: true });
+ this.pinecone = new Pinecone({
+ apiKey: process.env.PINECONE_API_KEY!,
+ });
+ this.cohere = new CohereClient({
+ token: process.env.COHERE_API_KEY!,
+ });
+ this.createIndex();
+ }
+
+ private async createIndex() {
+ const indexList: IndexList = await this.pinecone.listIndexes();
+
+ if (!indexList.indexes?.some(index => index.name === this.indexName)) {
+ await this.pinecone.createIndex({
+ name: this.indexName,
+ dimension: 1024,
+ metric: 'cosine',
+ spec: {
+ serverless: {
+ cloud: 'aws',
+ region: 'us-east-1',
+ },
+ },
+ });
+ }
+
+ this.index = this.pinecone.Index(this.indexName);
}
- addDocument(document: Document): void {
+ async addDocument(document: Document) {
this.documents.push(document);
- this.indexDocument(document);
+ await this.indexDocument(document);
}
- private async indexDocument(document: Document): Promise<void> {
- console.log('Uploading vectors to content namespace..');
- await this.index.upsert(document.chunks);
+ private async indexDocument(document: Document) {
+ console.log('Uploading vectors to content namespace...');
+ const pineconeRecords: PineconeRecord[] = document.chunks.map(chunk => ({
+ id: chunk.id,
+ values: chunk.values,
+ metadata: chunk.metadata,
+ }));
+ await this.index.upsert(pineconeRecords);
}
- async retrieve(query: string, top_k: number = 10): Promise<Chunk[]> {
+ async retrieve(query: string, topK: number = 10): Promise<Chunk[]> {
console.log(`Retrieving chunks for query: ${query}`);
-
- const cohere = new CohereClient({
- token: process.env.COHERE_API_KEY || '',
- });
-
try {
- const embedResponse = await cohere.embed({
+ const queryEmbeddingResponse: EmbedResponse = await this.cohere.embed({
texts: [query],
model: 'embed-english-v3.0',
inputType: 'search_query',
});
- const queryEmb = embedResponse.embeddings[0];
+ let queryEmbedding: number[];
+
+ if (Array.isArray(queryEmbeddingResponse.embeddings)) {
+ queryEmbedding = queryEmbeddingResponse.embeddings[0];
+ } else if (queryEmbeddingResponse.embeddings && 'embeddings' in queryEmbeddingResponse.embeddings) {
+ queryEmbedding = (queryEmbeddingResponse.embeddings as { embeddings: number[][] }).embeddings[0];
+ } else {
+ throw new Error('Invalid embedding response format');
+ }
+
+ if (!Array.isArray(queryEmbedding)) {
+ throw new Error('Query embedding is not an array');
+ }
const queryResponse = await this.index.query({
- vector: queryEmb,
- topK: top_k,
+ vector: queryEmbedding,
+ topK,
includeValues: true,
includeMetadata: true,
});
- return queryResponse.matches as Chunk[];
- } catch (e) {
- console.error(`Error embedding query: ${e}`);
+ return queryResponse.matches.map(match => ({
+ id: match.id,
+ values: match.values as number[],
+ metadata: match.metadata as ChunkMetadata,
+ }));
+ } catch (error) {
+ console.error(`Error retrieving chunks: ${error}`);
return [];
}
}
getSummaries(): string {
- const summaries = this.documents.map(doc => doc.summary);
- return summaries.map((summary, i) => `${i + 1}. ${summary}`).join('\n') + '\n';
- }
-
- private async createIndex(): Promise<any> {
- const indexes = await pinecone.listIndexes();
- if (indexes.includes(this.index_name)) {
- console.log('Index already exists...');
- } else {
- await pinecone.createIndex({
- name: this.index_name,
- dimension: 1024,
- metric: 'cosine',
- spec: {
- serverless: {
- cloud: 'aws',
- region: 'us-east-1',
- },
- },
- });
- }
- return pinecone.Index(this.index_name);
+ return this.documents.map((doc, index) => `${index + 1}. ${doc.summary}`).join('\n') + '\n';
}
}
-
-export { Vectorstore, Chunk, ChunkMetaData };