trying new image generation plus new implementation of video and audio

author: A.J. Shulman <Shulman.aj@gmail.com> 2024-12-18 20:34:33 -0500
committer: A.J. Shulman <Shulman.aj@gmail.com> 2024-12-18 20:34:33 -0500
commit: 57e3c9b9977228a561e8972a469a67f17f4bcd9c (patch)
tree: 1a4f23921e121ca891b3fa6a49a30a92ea76d233 /src/client/views/nodes/chatbot/vectorstore
parent: ad1e0cf62187e0f8bbb19b4720b7681585361de9 (diff)
1 files changed, 141 insertions, 115 deletions
diff --git a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
index af27ebe80..3ed433778 100644
--- a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
+++ b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
@@ -15,6 +15,7 @@ import { Networking } from '../../../../Network';
 import { AI_Document, CHUNK_TYPE, RAGChunk } from '../types/types';
 import path from 'path';
 import { v4 as uuidv4 } from 'uuid';
+import { indexes } from 'd3';
 
 dotenv.config();
 
@@ -28,7 +29,7 @@ export class Vectorstore {
     private cohere: CohereClient; // Cohere client for generating embeddings.
     private indexName: string = 'pdf-chatbot'; // Default name for the index.
     private _id: string; // Unique ID for the Vectorstore instance.
-    private _doc_ids: string[] = []; // List of document IDs handled by this instance.
+    private _doc_ids: () => string[]; // List of document IDs handled by this instance.
 
     documents: AI_Document[] = []; // Store the documents indexed in the vectorstore.
 
@@ -48,7 +49,7 @@ export class Vectorstore {
         this.pinecone = new Pinecone({ apiKey: pineconeApiKey });
         this.cohere = new CohereClient({ token: process.env.COHERE_API_KEY });
         this._id = id;
-        this._doc_ids = doc_ids();
+        this._doc_ids = doc_ids;
         this.initializeIndex();
     }
 
@@ -85,131 +86,155 @@ export class Vectorstore {
      * @param progressCallback Callback to track progress.
      */
     async addAIDoc(doc: Doc, progressCallback: (progress: number, step: string) => void) {
-        const local_file_path: string = CsvCast(doc.data)?.url?.pathname ?? PDFCast(doc.data)?.url?.pathname ?? VideoCast(doc.data)?.url?.pathname ?? AudioCast(doc.data)?.url?.pathname;
-
-        if (!local_file_path) {
-            throw new Error('Invalid file path.');
-        }
-
-        const isAudioOrVideo = local_file_path.endsWith('.mp3') || local_file_path.endsWith('.mp4');
-        let result: AI_Document & { doc_id: string };
-
-        if (isAudioOrVideo) {
-            console.log('Processing media file...');
-            const response = await Networking.PostToServer('/processMediaFile', { fileName: path.basename(local_file_path) });
-            const segmentedTranscript = response;
+        const ai_document_status: string = StrCast(doc.ai_document_status);
+
+        // Skip if the document is already in progress or completed.
+        if (ai_document_status !== undefined && ai_document_status.trim() !== '' && ai_document_status !== '{}') {
+            if (ai_document_status === 'PROGRESS') {
+                console.log('Already in progress.');
+                return;
+            } else if (ai_document_status === 'COMPLETED') {
+                console.log('Already completed.');
+                return;
+            }
+        } else {
+            // Start processing the document.
+            doc.ai_document_status = 'PROGRESS';
+            const local_file_path: string = CsvCast(doc.data)?.url?.pathname ?? PDFCast(doc.data)?.url?.pathname ?? VideoCast(doc.data)?.url?.pathname ?? AudioCast(doc.data)?.url?.pathname;
 
-            // Generate embeddings for each chunk
-            const texts = segmentedTranscript.map((chunk: any) => chunk.text);
+            if (!local_file_path) {
+                console.log('Invalid file path.');
+                return;
+            }
 
-            try {
-                const embeddingsResponse = await this.cohere.v2.embed({
-                    model: 'embed-english-v3.0',
-                    inputType: 'classification',
-                    embeddingTypes: ['float'], // Specify that embeddings should be floats
-                    texts, // Pass the array of chunk texts
-                });
+            const isAudioOrVideo = local_file_path.endsWith('.mp3') || local_file_path.endsWith('.mp4');
+            let result: AI_Document & { doc_id: string };
+            if (isAudioOrVideo) {
+                console.log('Processing media file...');
+                const response = await Networking.PostToServer('/processMediaFile', { fileName: path.basename(local_file_path) });
+                const segmentedTranscript = response.condensed;
+                console.log(segmentedTranscript);
+                const summary = response.summary;
+                doc.summary = summary;
+                // Generate embeddings for each chunk
+                const texts = segmentedTranscript.map((chunk: any) => chunk.text);
+
+                try {
+                    const embeddingsResponse = await this.cohere.v2.embed({
+                        model: 'embed-english-v3.0',
+                        inputType: 'classification',
+                        embeddingTypes: ['float'], // Specify that embeddings should be floats
+                        texts, // Pass the array of chunk texts
+                    });
+
+                    if (!embeddingsResponse.embeddings.float || embeddingsResponse.embeddings.float.length !== texts.length) {
+                        throw new Error('Mismatch between embeddings and the number of chunks');
+                    }
 
-                if (!embeddingsResponse.embeddings.float || embeddingsResponse.embeddings.float.length !== texts.length) {
-                    throw new Error('Mismatch between embeddings and the number of chunks');
+                    // Assign embeddings to each chunk
+                    segmentedTranscript.forEach((chunk: any, index: number) => {
+                        if (!embeddingsResponse.embeddings || !embeddingsResponse.embeddings.float) {
+                            throw new Error('Invalid embeddings response');
+                        }
+                    });
+                    doc.original_segments = JSON.stringify(response.full);
+                    doc.ai_type = local_file_path.endsWith('.mp3') ? 'audio' : 'video';
+                    const doc_id = uuidv4();
+
+                    // Add transcript and embeddings to metadata
+                    result = {
+                        doc_id,
+                        purpose: '',
+                        file_name: local_file_path,
+                        num_pages: 0,
+                        summary: '',
+                        chunks: segmentedTranscript.map((chunk: any, index: number) => ({
+                            id: uuidv4(),
+                            values: (embeddingsResponse.embeddings.float as number[][])[index], // Assign embedding
+                            metadata: {
+                                indexes: chunk.indexes,
+                                original_document: local_file_path,
+                                doc_id: doc_id,
+                                file_path: local_file_path,
+                                start_time: chunk.start,
+                                end_time: chunk.end,
+                                text: chunk.text,
+                                chunkType: 'text',
+                            },
+                        })),
+                        type: 'media',
+                    };
+                } catch (error) {
+                    console.error('Error generating embeddings:', error);
+                    throw new Error('Embedding generation failed');
                 }
 
-                // Assign embeddings to each chunk
-                segmentedTranscript.forEach((chunk: any, index: number) => {
-                    if (!embeddingsResponse.embeddings || !embeddingsResponse.embeddings.float) {
-                        throw new Error('Invalid embeddings response');
+                doc.segmented_transcript = JSON.stringify(segmentedTranscript);
+                // Simplify chunks for storage
+                const simplifiedChunks = result.chunks.map(chunk => ({
+                    chunkId: chunk.id,
+                    start_time: chunk.metadata.start_time,
+                    end_time: chunk.metadata.end_time,
+                    indexes: chunk.metadata.indexes,
+                    chunkType: CHUNK_TYPE.TEXT,
+                    text: chunk.metadata.text,
+                }));
+                doc.chunk_simpl = JSON.stringify({ chunks: simplifiedChunks });
+            } else {
+                // Existing document processing logic remains unchanged
+                console.log('Processing regular document...');
+                const { jobId } = await Networking.PostToServer('/createDocument', { file_path: local_file_path });
+
+                while (true) {
+                    await new Promise(resolve => setTimeout(resolve, 2000));
+                    const resultResponse = await Networking.FetchFromServer(`/getResult/${jobId}`);
+                    const resultResponseJson = JSON.parse(resultResponse);
+                    if (resultResponseJson.status === 'completed') {
+                        result = resultResponseJson;
+                        break;
+                    }
+                    const progressResponse = await Networking.FetchFromServer(`/getProgress/${jobId}`);
+                    const progressResponseJson = JSON.parse(progressResponse);
+                    if (progressResponseJson) {
+                        progressCallback(progressResponseJson.progress, progressResponseJson.step);
                     }
-                    //chunk.embedding = embeddingsResponse.embeddings.float[index];
-                });
-
-                // Add transcript and embeddings to metadata
-                result = {
-                    purpose: '',
-                    file_name: path.basename(local_file_path),
-                    num_pages: 0,
-                    summary: '',
-                    chunks: segmentedTranscript.map((chunk: any, index: number) => ({
-                        id: uuidv4(),
-                        values: (embeddingsResponse.embeddings.float as number[][])[index], // Assign embedding
-                        metadata: {
-                            ...chunk,
-                            original_document: doc.id,
-                            doc_id: doc.id,
-                            file_path: local_file_path,
-                            start_time: chunk.start,
-                            end_time: chunk.end,
-                            text: chunk.text,
-                        },
-                    })),
-                    type: 'media',
-                    doc_id: StrCast(doc.id),
-                };
-            } catch (error) {
-                console.error('Error generating embeddings:', error);
-                throw new Error('Embedding generation failed');
-            }
-
-            doc.segmented_transcript = JSON.stringify(segmentedTranscript);
-        } else {
-            // Existing document processing logic remains unchanged
-            console.log('Processing regular document...');
-            const { jobId } = await Networking.PostToServer('/createDocument', { file_path: local_file_path });
-
-            while (true) {
-                await new Promise(resolve => setTimeout(resolve, 2000));
-                const resultResponse = await Networking.FetchFromServer(`/getResult/${jobId}`);
-                const resultResponseJson = JSON.parse(resultResponse);
-                if (resultResponseJson.status === 'completed') {
-                    result = resultResponseJson;
-                    break;
                 }
-                const progressResponse = await Networking.FetchFromServer(`/getProgress/${jobId}`);
-                const progressResponseJson = JSON.parse(progressResponse);
-                if (progressResponseJson) {
-                    progressCallback(progressResponseJson.progress, progressResponseJson.step);
+                if (!doc.chunk_simpl) {
+                    doc.chunk_simpl = JSON.stringify({ chunks: [] });
                 }
+                doc.summary = result.summary;
+                doc.ai_purpose = result.purpose;
+
+                result.chunks.forEach((chunk: RAGChunk) => {
+                    const chunkToAdd = {
+                        chunkId: chunk.id,
+                        startPage: chunk.metadata.start_page,
+                        endPage: chunk.metadata.end_page,
+                        location: chunk.metadata.location,
+                        chunkType: chunk.metadata.type as CHUNK_TYPE,
+                        text: chunk.metadata.text,
+                    };
+                    const new_chunk_simpl = JSON.parse(StrCast(doc.chunk_simpl));
+                    new_chunk_simpl.chunks = new_chunk_simpl.chunks.concat(chunkToAdd);
+                    doc.chunk_simpl = JSON.stringify(new_chunk_simpl);
+                });
             }
-        }
 
-        // Index the document
-        await this.indexDocument(result);
+            // Index the document
+            await this.indexDocument(result);
 
-        // Simplify chunks for storage
-        const simplifiedChunks = result.chunks.map(chunk => ({
-            chunkId: chunk.id,
-            start_time: chunk.metadata.start_time,
-            end_time: chunk.metadata.end_time,
-            chunkType: CHUNK_TYPE.TEXT,
-            text: chunk.metadata.text,
-        }));
-        doc.chunk_simpl = JSON.stringify({ chunks: simplifiedChunks });
+            // Preserve existing metadata updates
+            if (!doc.vectorstore_id) {
+                doc.vectorstore_id = JSON.stringify([this._id]);
+            } else {
+                doc.vectorstore_id = JSON.stringify(JSON.parse(StrCast(doc.vectorstore_id)).concat([this._id]));
+            }
 
-        // Preserve existing metadata updates
-        if (!doc.vectorstore_id) {
-            doc.vectorstore_id = JSON.stringify([this._id]);
-        } else {
-            doc.vectorstore_id = JSON.stringify(JSON.parse(StrCast(doc.vectorstore_id)).concat([this._id]));
-        }
+            doc.ai_doc_id = result.doc_id;
 
-        if (!doc.chunk_simpl) {
-            doc.chunk_simpl = JSON.stringify({ chunks: [] });
+            console.log(`Document added: ${result.file_name}`);
+            doc.ai_document_status = 'COMPLETED';
         }
-
-        result.chunks.forEach((chunk: RAGChunk) => {
-            const chunkToAdd = {
-                chunkId: chunk.id,
-                startPage: chunk.metadata.start_page,
-                endPage: chunk.metadata.end_page,
-                location: chunk.metadata.location,
-                chunkType: chunk.metadata.type as CHUNK_TYPE,
-                text: chunk.metadata.text,
-            };
-            const new_chunk_simpl = JSON.parse(StrCast(doc.chunk_simpl));
-            new_chunk_simpl.chunks = new_chunk_simpl.chunks.concat(chunkToAdd);
-            doc.chunk_simpl = JSON.stringify(new_chunk_simpl);
-        });
-
-        console.log(`Document added: ${result.file_name}`);
     }
 
     /**
@@ -294,17 +319,18 @@ export class Vectorstore {
             if (!Array.isArray(queryEmbedding)) {
                 throw new Error('Query embedding is not an array');
             }
-
+            console.log(this._doc_ids());
             // Query the Pinecone index using the embedding and filter by document IDs.
             const queryResponse: QueryResponse = await this.index.query({
                 vector: queryEmbedding,
                 filter: {
-                    doc_id: { $in: this._doc_ids },
+                    doc_id: { $in: this._doc_ids() },
                 },
                 topK,
                 includeValues: true,
                 includeMetadata: true,
             });
+            console.log(queryResponse);
 
             // Map the results into RAGChunks and return them.
             return queryResponse.matches.map(
author	A.J. Shulman <Shulman.aj@gmail.com>	2024-12-18 20:34:33 -0500
committer	A.J. Shulman <Shulman.aj@gmail.com>	2024-12-18 20:34:33 -0500
commit	57e3c9b9977228a561e8972a469a67f17f4bcd9c (patch)
tree	1a4f23921e121ca891b3fa6a49a30a92ea76d233 /src/client/views/nodes/chatbot/vectorstore
parent	ad1e0cf62187e0f8bbb19b4720b7681585361de9 (diff)