aboutsummaryrefslogtreecommitdiff
path: root/src/client/views/nodes/chatbot/vectorstore
diff options
context:
space:
mode:
authorA.J. Shulman <Shulman.aj@gmail.com>2024-12-18 20:34:33 -0500
committerA.J. Shulman <Shulman.aj@gmail.com>2024-12-18 20:34:33 -0500
commit57e3c9b9977228a561e8972a469a67f17f4bcd9c (patch)
tree1a4f23921e121ca891b3fa6a49a30a92ea76d233 /src/client/views/nodes/chatbot/vectorstore
parentad1e0cf62187e0f8bbb19b4720b7681585361de9 (diff)
trying new image generation plus new implementation of video and audio
Diffstat (limited to 'src/client/views/nodes/chatbot/vectorstore')
-rw-r--r--src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts256
1 file changed, 141 insertions, 115 deletions
diff --git a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
index af27ebe80..3ed433778 100644
--- a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
+++ b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
@@ -15,6 +15,7 @@ import { Networking } from '../../../../Network';
import { AI_Document, CHUNK_TYPE, RAGChunk } from '../types/types';
import path from 'path';
import { v4 as uuidv4 } from 'uuid';
+import { indexes } from 'd3';
dotenv.config();
@@ -28,7 +29,7 @@ export class Vectorstore {
private cohere: CohereClient; // Cohere client for generating embeddings.
private indexName: string = 'pdf-chatbot'; // Default name for the index.
private _id: string; // Unique ID for the Vectorstore instance.
- private _doc_ids: string[] = []; // List of document IDs handled by this instance.
+ private _doc_ids: () => string[]; // List of document IDs handled by this instance.
documents: AI_Document[] = []; // Store the documents indexed in the vectorstore.
@@ -48,7 +49,7 @@ export class Vectorstore {
this.pinecone = new Pinecone({ apiKey: pineconeApiKey });
this.cohere = new CohereClient({ token: process.env.COHERE_API_KEY });
this._id = id;
- this._doc_ids = doc_ids();
+ this._doc_ids = doc_ids;
this.initializeIndex();
}
@@ -85,131 +86,155 @@ export class Vectorstore {
* @param progressCallback Callback to track progress.
*/
async addAIDoc(doc: Doc, progressCallback: (progress: number, step: string) => void) {
- const local_file_path: string = CsvCast(doc.data)?.url?.pathname ?? PDFCast(doc.data)?.url?.pathname ?? VideoCast(doc.data)?.url?.pathname ?? AudioCast(doc.data)?.url?.pathname;
-
- if (!local_file_path) {
- throw new Error('Invalid file path.');
- }
-
- const isAudioOrVideo = local_file_path.endsWith('.mp3') || local_file_path.endsWith('.mp4');
- let result: AI_Document & { doc_id: string };
-
- if (isAudioOrVideo) {
- console.log('Processing media file...');
- const response = await Networking.PostToServer('/processMediaFile', { fileName: path.basename(local_file_path) });
- const segmentedTranscript = response;
+ const ai_document_status: string = StrCast(doc.ai_document_status);
+
+ // Skip if the document is already in progress or completed.
+ if (ai_document_status !== undefined && ai_document_status.trim() !== '' && ai_document_status !== '{}') {
+ if (ai_document_status === 'PROGRESS') {
+ console.log('Already in progress.');
+ return;
+ } else if (ai_document_status === 'COMPLETED') {
+ console.log('Already completed.');
+ return;
+ }
+ } else {
+ // Start processing the document.
+ doc.ai_document_status = 'PROGRESS';
+ const local_file_path: string = CsvCast(doc.data)?.url?.pathname ?? PDFCast(doc.data)?.url?.pathname ?? VideoCast(doc.data)?.url?.pathname ?? AudioCast(doc.data)?.url?.pathname;
- // Generate embeddings for each chunk
- const texts = segmentedTranscript.map((chunk: any) => chunk.text);
+ if (!local_file_path) {
+ console.log('Invalid file path.');
+ return;
+ }
- try {
- const embeddingsResponse = await this.cohere.v2.embed({
- model: 'embed-english-v3.0',
- inputType: 'classification',
- embeddingTypes: ['float'], // Specify that embeddings should be floats
- texts, // Pass the array of chunk texts
- });
+ const isAudioOrVideo = local_file_path.endsWith('.mp3') || local_file_path.endsWith('.mp4');
+ let result: AI_Document & { doc_id: string };
+ if (isAudioOrVideo) {
+ console.log('Processing media file...');
+ const response = await Networking.PostToServer('/processMediaFile', { fileName: path.basename(local_file_path) });
+ const segmentedTranscript = response.condensed;
+ console.log(segmentedTranscript);
+ const summary = response.summary;
+ doc.summary = summary;
+ // Generate embeddings for each chunk
+ const texts = segmentedTranscript.map((chunk: any) => chunk.text);
+
+ try {
+ const embeddingsResponse = await this.cohere.v2.embed({
+ model: 'embed-english-v3.0',
+ inputType: 'classification',
+ embeddingTypes: ['float'], // Specify that embeddings should be floats
+ texts, // Pass the array of chunk texts
+ });
+
+ if (!embeddingsResponse.embeddings.float || embeddingsResponse.embeddings.float.length !== texts.length) {
+ throw new Error('Mismatch between embeddings and the number of chunks');
+ }
- if (!embeddingsResponse.embeddings.float || embeddingsResponse.embeddings.float.length !== texts.length) {
- throw new Error('Mismatch between embeddings and the number of chunks');
+ // Assign embeddings to each chunk
+ segmentedTranscript.forEach((chunk: any, index: number) => {
+ if (!embeddingsResponse.embeddings || !embeddingsResponse.embeddings.float) {
+ throw new Error('Invalid embeddings response');
+ }
+ });
+ doc.original_segments = JSON.stringify(response.full);
+ doc.ai_type = local_file_path.endsWith('.mp3') ? 'audio' : 'video';
+ const doc_id = uuidv4();
+
+ // Add transcript and embeddings to metadata
+ result = {
+ doc_id,
+ purpose: '',
+ file_name: local_file_path,
+ num_pages: 0,
+ summary: '',
+ chunks: segmentedTranscript.map((chunk: any, index: number) => ({
+ id: uuidv4(),
+ values: (embeddingsResponse.embeddings.float as number[][])[index], // Assign embedding
+ metadata: {
+ indexes: chunk.indexes,
+ original_document: local_file_path,
+ doc_id: doc_id,
+ file_path: local_file_path,
+ start_time: chunk.start,
+ end_time: chunk.end,
+ text: chunk.text,
+ chunkType: 'text',
+ },
+ })),
+ type: 'media',
+ };
+ } catch (error) {
+ console.error('Error generating embeddings:', error);
+ throw new Error('Embedding generation failed');
}
- // Assign embeddings to each chunk
- segmentedTranscript.forEach((chunk: any, index: number) => {
- if (!embeddingsResponse.embeddings || !embeddingsResponse.embeddings.float) {
- throw new Error('Invalid embeddings response');
+ doc.segmented_transcript = JSON.stringify(segmentedTranscript);
+ // Simplify chunks for storage
+ const simplifiedChunks = result.chunks.map(chunk => ({
+ chunkId: chunk.id,
+ start_time: chunk.metadata.start_time,
+ end_time: chunk.metadata.end_time,
+ indexes: chunk.metadata.indexes,
+ chunkType: CHUNK_TYPE.TEXT,
+ text: chunk.metadata.text,
+ }));
+ doc.chunk_simpl = JSON.stringify({ chunks: simplifiedChunks });
+ } else {
+ // Existing document processing logic remains unchanged
+ console.log('Processing regular document...');
+ const { jobId } = await Networking.PostToServer('/createDocument', { file_path: local_file_path });
+
+ while (true) {
+ await new Promise(resolve => setTimeout(resolve, 2000));
+ const resultResponse = await Networking.FetchFromServer(`/getResult/${jobId}`);
+ const resultResponseJson = JSON.parse(resultResponse);
+ if (resultResponseJson.status === 'completed') {
+ result = resultResponseJson;
+ break;
+ }
+ const progressResponse = await Networking.FetchFromServer(`/getProgress/${jobId}`);
+ const progressResponseJson = JSON.parse(progressResponse);
+ if (progressResponseJson) {
+ progressCallback(progressResponseJson.progress, progressResponseJson.step);
}
- //chunk.embedding = embeddingsResponse.embeddings.float[index];
- });
-
- // Add transcript and embeddings to metadata
- result = {
- purpose: '',
- file_name: path.basename(local_file_path),
- num_pages: 0,
- summary: '',
- chunks: segmentedTranscript.map((chunk: any, index: number) => ({
- id: uuidv4(),
- values: (embeddingsResponse.embeddings.float as number[][])[index], // Assign embedding
- metadata: {
- ...chunk,
- original_document: doc.id,
- doc_id: doc.id,
- file_path: local_file_path,
- start_time: chunk.start,
- end_time: chunk.end,
- text: chunk.text,
- },
- })),
- type: 'media',
- doc_id: StrCast(doc.id),
- };
- } catch (error) {
- console.error('Error generating embeddings:', error);
- throw new Error('Embedding generation failed');
- }
-
- doc.segmented_transcript = JSON.stringify(segmentedTranscript);
- } else {
- // Existing document processing logic remains unchanged
- console.log('Processing regular document...');
- const { jobId } = await Networking.PostToServer('/createDocument', { file_path: local_file_path });
-
- while (true) {
- await new Promise(resolve => setTimeout(resolve, 2000));
- const resultResponse = await Networking.FetchFromServer(`/getResult/${jobId}`);
- const resultResponseJson = JSON.parse(resultResponse);
- if (resultResponseJson.status === 'completed') {
- result = resultResponseJson;
- break;
}
- const progressResponse = await Networking.FetchFromServer(`/getProgress/${jobId}`);
- const progressResponseJson = JSON.parse(progressResponse);
- if (progressResponseJson) {
- progressCallback(progressResponseJson.progress, progressResponseJson.step);
+ if (!doc.chunk_simpl) {
+ doc.chunk_simpl = JSON.stringify({ chunks: [] });
}
+ doc.summary = result.summary;
+ doc.ai_purpose = result.purpose;
+
+ result.chunks.forEach((chunk: RAGChunk) => {
+ const chunkToAdd = {
+ chunkId: chunk.id,
+ startPage: chunk.metadata.start_page,
+ endPage: chunk.metadata.end_page,
+ location: chunk.metadata.location,
+ chunkType: chunk.metadata.type as CHUNK_TYPE,
+ text: chunk.metadata.text,
+ };
+ const new_chunk_simpl = JSON.parse(StrCast(doc.chunk_simpl));
+ new_chunk_simpl.chunks = new_chunk_simpl.chunks.concat(chunkToAdd);
+ doc.chunk_simpl = JSON.stringify(new_chunk_simpl);
+ });
}
- }
- // Index the document
- await this.indexDocument(result);
+ // Index the document
+ await this.indexDocument(result);
- // Simplify chunks for storage
- const simplifiedChunks = result.chunks.map(chunk => ({
- chunkId: chunk.id,
- start_time: chunk.metadata.start_time,
- end_time: chunk.metadata.end_time,
- chunkType: CHUNK_TYPE.TEXT,
- text: chunk.metadata.text,
- }));
- doc.chunk_simpl = JSON.stringify({ chunks: simplifiedChunks });
+ // Preserve existing metadata updates
+ if (!doc.vectorstore_id) {
+ doc.vectorstore_id = JSON.stringify([this._id]);
+ } else {
+ doc.vectorstore_id = JSON.stringify(JSON.parse(StrCast(doc.vectorstore_id)).concat([this._id]));
+ }
- // Preserve existing metadata updates
- if (!doc.vectorstore_id) {
- doc.vectorstore_id = JSON.stringify([this._id]);
- } else {
- doc.vectorstore_id = JSON.stringify(JSON.parse(StrCast(doc.vectorstore_id)).concat([this._id]));
- }
+ doc.ai_doc_id = result.doc_id;
- if (!doc.chunk_simpl) {
- doc.chunk_simpl = JSON.stringify({ chunks: [] });
+ console.log(`Document added: ${result.file_name}`);
+ doc.ai_document_status = 'COMPLETED';
}
-
- result.chunks.forEach((chunk: RAGChunk) => {
- const chunkToAdd = {
- chunkId: chunk.id,
- startPage: chunk.metadata.start_page,
- endPage: chunk.metadata.end_page,
- location: chunk.metadata.location,
- chunkType: chunk.metadata.type as CHUNK_TYPE,
- text: chunk.metadata.text,
- };
- const new_chunk_simpl = JSON.parse(StrCast(doc.chunk_simpl));
- new_chunk_simpl.chunks = new_chunk_simpl.chunks.concat(chunkToAdd);
- doc.chunk_simpl = JSON.stringify(new_chunk_simpl);
- });
-
- console.log(`Document added: ${result.file_name}`);
}
/**
@@ -294,17 +319,18 @@ export class Vectorstore {
if (!Array.isArray(queryEmbedding)) {
throw new Error('Query embedding is not an array');
}
-
+ console.log(this._doc_ids());
// Query the Pinecone index using the embedding and filter by document IDs.
const queryResponse: QueryResponse = await this.index.query({
vector: queryEmbedding,
filter: {
- doc_id: { $in: this._doc_ids },
+ doc_id: { $in: this._doc_ids() },
},
topK,
includeValues: true,
includeMetadata: true,
});
+ console.log(queryResponse);
// Map the results into RAGChunks and return them.
return queryResponse.matches.map(