diff options
| author | A.J. Shulman <Shulman.aj@gmail.com> | 2024-12-18 20:34:33 -0500 |
|---|---|---|
| committer | A.J. Shulman <Shulman.aj@gmail.com> | 2024-12-18 20:34:33 -0500 |
| commit | 57e3c9b9977228a561e8972a469a67f17f4bcd9c (patch) | |
| tree | 1a4f23921e121ca891b3fa6a49a30a92ea76d233 /src/client/views/nodes/chatbot/vectorstore | |
| parent | ad1e0cf62187e0f8bbb19b4720b7681585361de9 (diff) | |
trying new image generation plus new implementation of video and audio
Diffstat (limited to 'src/client/views/nodes/chatbot/vectorstore')
| -rw-r--r-- | src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts | 256 |
1 files changed, 141 insertions, 115 deletions
diff --git a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts index af27ebe80..3ed433778 100644 --- a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts +++ b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts @@ -15,6 +15,7 @@ import { Networking } from '../../../../Network'; import { AI_Document, CHUNK_TYPE, RAGChunk } from '../types/types'; import path from 'path'; import { v4 as uuidv4 } from 'uuid'; +import { indexes } from 'd3'; dotenv.config(); @@ -28,7 +29,7 @@ export class Vectorstore { private cohere: CohereClient; // Cohere client for generating embeddings. private indexName: string = 'pdf-chatbot'; // Default name for the index. private _id: string; // Unique ID for the Vectorstore instance. - private _doc_ids: string[] = []; // List of document IDs handled by this instance. + private _doc_ids: () => string[]; // List of document IDs handled by this instance. documents: AI_Document[] = []; // Store the documents indexed in the vectorstore. @@ -48,7 +49,7 @@ export class Vectorstore { this.pinecone = new Pinecone({ apiKey: pineconeApiKey }); this.cohere = new CohereClient({ token: process.env.COHERE_API_KEY }); this._id = id; - this._doc_ids = doc_ids(); + this._doc_ids = doc_ids; this.initializeIndex(); } @@ -85,131 +86,155 @@ export class Vectorstore { * @param progressCallback Callback to track progress. */ async addAIDoc(doc: Doc, progressCallback: (progress: number, step: string) => void) { - const local_file_path: string = CsvCast(doc.data)?.url?.pathname ?? PDFCast(doc.data)?.url?.pathname ?? VideoCast(doc.data)?.url?.pathname ?? 
AudioCast(doc.data)?.url?.pathname; - - if (!local_file_path) { - throw new Error('Invalid file path.'); - } - - const isAudioOrVideo = local_file_path.endsWith('.mp3') || local_file_path.endsWith('.mp4'); - let result: AI_Document & { doc_id: string }; - - if (isAudioOrVideo) { - console.log('Processing media file...'); - const response = await Networking.PostToServer('/processMediaFile', { fileName: path.basename(local_file_path) }); - const segmentedTranscript = response; + const ai_document_status: string = StrCast(doc.ai_document_status); + + // Skip if the document is already in progress or completed. + if (ai_document_status !== undefined && ai_document_status.trim() !== '' && ai_document_status !== '{}') { + if (ai_document_status === 'PROGRESS') { + console.log('Already in progress.'); + return; + } else if (ai_document_status === 'COMPLETED') { + console.log('Already completed.'); + return; + } + } else { + // Start processing the document. + doc.ai_document_status = 'PROGRESS'; + const local_file_path: string = CsvCast(doc.data)?.url?.pathname ?? PDFCast(doc.data)?.url?.pathname ?? VideoCast(doc.data)?.url?.pathname ?? 
AudioCast(doc.data)?.url?.pathname; - // Generate embeddings for each chunk - const texts = segmentedTranscript.map((chunk: any) => chunk.text); + if (!local_file_path) { + console.log('Invalid file path.'); + return; + } - try { - const embeddingsResponse = await this.cohere.v2.embed({ - model: 'embed-english-v3.0', - inputType: 'classification', - embeddingTypes: ['float'], // Specify that embeddings should be floats - texts, // Pass the array of chunk texts - }); + const isAudioOrVideo = local_file_path.endsWith('.mp3') || local_file_path.endsWith('.mp4'); + let result: AI_Document & { doc_id: string }; + if (isAudioOrVideo) { + console.log('Processing media file...'); + const response = await Networking.PostToServer('/processMediaFile', { fileName: path.basename(local_file_path) }); + const segmentedTranscript = response.condensed; + console.log(segmentedTranscript); + const summary = response.summary; + doc.summary = summary; + // Generate embeddings for each chunk + const texts = segmentedTranscript.map((chunk: any) => chunk.text); + + try { + const embeddingsResponse = await this.cohere.v2.embed({ + model: 'embed-english-v3.0', + inputType: 'classification', + embeddingTypes: ['float'], // Specify that embeddings should be floats + texts, // Pass the array of chunk texts + }); + + if (!embeddingsResponse.embeddings.float || embeddingsResponse.embeddings.float.length !== texts.length) { + throw new Error('Mismatch between embeddings and the number of chunks'); + } - if (!embeddingsResponse.embeddings.float || embeddingsResponse.embeddings.float.length !== texts.length) { - throw new Error('Mismatch between embeddings and the number of chunks'); + // Assign embeddings to each chunk + segmentedTranscript.forEach((chunk: any, index: number) => { + if (!embeddingsResponse.embeddings || !embeddingsResponse.embeddings.float) { + throw new Error('Invalid embeddings response'); + } + }); + doc.original_segments = JSON.stringify(response.full); + doc.ai_type = 
local_file_path.endsWith('.mp3') ? 'audio' : 'video'; + const doc_id = uuidv4(); + + // Add transcript and embeddings to metadata + result = { + doc_id, + purpose: '', + file_name: local_file_path, + num_pages: 0, + summary: '', + chunks: segmentedTranscript.map((chunk: any, index: number) => ({ + id: uuidv4(), + values: (embeddingsResponse.embeddings.float as number[][])[index], // Assign embedding + metadata: { + indexes: chunk.indexes, + original_document: local_file_path, + doc_id: doc_id, + file_path: local_file_path, + start_time: chunk.start, + end_time: chunk.end, + text: chunk.text, + chunkType: 'text', + }, + })), + type: 'media', + }; + } catch (error) { + console.error('Error generating embeddings:', error); + throw new Error('Embedding generation failed'); } - // Assign embeddings to each chunk - segmentedTranscript.forEach((chunk: any, index: number) => { - if (!embeddingsResponse.embeddings || !embeddingsResponse.embeddings.float) { - throw new Error('Invalid embeddings response'); + doc.segmented_transcript = JSON.stringify(segmentedTranscript); + // Simplify chunks for storage + const simplifiedChunks = result.chunks.map(chunk => ({ + chunkId: chunk.id, + start_time: chunk.metadata.start_time, + end_time: chunk.metadata.end_time, + indexes: chunk.metadata.indexes, + chunkType: CHUNK_TYPE.TEXT, + text: chunk.metadata.text, + })); + doc.chunk_simpl = JSON.stringify({ chunks: simplifiedChunks }); + } else { + // Existing document processing logic remains unchanged + console.log('Processing regular document...'); + const { jobId } = await Networking.PostToServer('/createDocument', { file_path: local_file_path }); + + while (true) { + await new Promise(resolve => setTimeout(resolve, 2000)); + const resultResponse = await Networking.FetchFromServer(`/getResult/${jobId}`); + const resultResponseJson = JSON.parse(resultResponse); + if (resultResponseJson.status === 'completed') { + result = resultResponseJson; + break; + } + const progressResponse = await 
Networking.FetchFromServer(`/getProgress/${jobId}`); + const progressResponseJson = JSON.parse(progressResponse); + if (progressResponseJson) { + progressCallback(progressResponseJson.progress, progressResponseJson.step); } - //chunk.embedding = embeddingsResponse.embeddings.float[index]; - }); - - // Add transcript and embeddings to metadata - result = { - purpose: '', - file_name: path.basename(local_file_path), - num_pages: 0, - summary: '', - chunks: segmentedTranscript.map((chunk: any, index: number) => ({ - id: uuidv4(), - values: (embeddingsResponse.embeddings.float as number[][])[index], // Assign embedding - metadata: { - ...chunk, - original_document: doc.id, - doc_id: doc.id, - file_path: local_file_path, - start_time: chunk.start, - end_time: chunk.end, - text: chunk.text, - }, - })), - type: 'media', - doc_id: StrCast(doc.id), - }; - } catch (error) { - console.error('Error generating embeddings:', error); - throw new Error('Embedding generation failed'); - } - - doc.segmented_transcript = JSON.stringify(segmentedTranscript); - } else { - // Existing document processing logic remains unchanged - console.log('Processing regular document...'); - const { jobId } = await Networking.PostToServer('/createDocument', { file_path: local_file_path }); - - while (true) { - await new Promise(resolve => setTimeout(resolve, 2000)); - const resultResponse = await Networking.FetchFromServer(`/getResult/${jobId}`); - const resultResponseJson = JSON.parse(resultResponse); - if (resultResponseJson.status === 'completed') { - result = resultResponseJson; - break; } - const progressResponse = await Networking.FetchFromServer(`/getProgress/${jobId}`); - const progressResponseJson = JSON.parse(progressResponse); - if (progressResponseJson) { - progressCallback(progressResponseJson.progress, progressResponseJson.step); + if (!doc.chunk_simpl) { + doc.chunk_simpl = JSON.stringify({ chunks: [] }); } + doc.summary = result.summary; + doc.ai_purpose = result.purpose; + + 
result.chunks.forEach((chunk: RAGChunk) => { + const chunkToAdd = { + chunkId: chunk.id, + startPage: chunk.metadata.start_page, + endPage: chunk.metadata.end_page, + location: chunk.metadata.location, + chunkType: chunk.metadata.type as CHUNK_TYPE, + text: chunk.metadata.text, + }; + const new_chunk_simpl = JSON.parse(StrCast(doc.chunk_simpl)); + new_chunk_simpl.chunks = new_chunk_simpl.chunks.concat(chunkToAdd); + doc.chunk_simpl = JSON.stringify(new_chunk_simpl); + }); } - } - // Index the document - await this.indexDocument(result); + // Index the document + await this.indexDocument(result); - // Simplify chunks for storage - const simplifiedChunks = result.chunks.map(chunk => ({ - chunkId: chunk.id, - start_time: chunk.metadata.start_time, - end_time: chunk.metadata.end_time, - chunkType: CHUNK_TYPE.TEXT, - text: chunk.metadata.text, - })); - doc.chunk_simpl = JSON.stringify({ chunks: simplifiedChunks }); + // Preserve existing metadata updates + if (!doc.vectorstore_id) { + doc.vectorstore_id = JSON.stringify([this._id]); + } else { + doc.vectorstore_id = JSON.stringify(JSON.parse(StrCast(doc.vectorstore_id)).concat([this._id])); + } - // Preserve existing metadata updates - if (!doc.vectorstore_id) { - doc.vectorstore_id = JSON.stringify([this._id]); - } else { - doc.vectorstore_id = JSON.stringify(JSON.parse(StrCast(doc.vectorstore_id)).concat([this._id])); - } + doc.ai_doc_id = result.doc_id; - if (!doc.chunk_simpl) { - doc.chunk_simpl = JSON.stringify({ chunks: [] }); + console.log(`Document added: ${result.file_name}`); + doc.ai_document_status = 'COMPLETED'; } - - result.chunks.forEach((chunk: RAGChunk) => { - const chunkToAdd = { - chunkId: chunk.id, - startPage: chunk.metadata.start_page, - endPage: chunk.metadata.end_page, - location: chunk.metadata.location, - chunkType: chunk.metadata.type as CHUNK_TYPE, - text: chunk.metadata.text, - }; - const new_chunk_simpl = JSON.parse(StrCast(doc.chunk_simpl)); - new_chunk_simpl.chunks = 
new_chunk_simpl.chunks.concat(chunkToAdd); - doc.chunk_simpl = JSON.stringify(new_chunk_simpl); - }); - - console.log(`Document added: ${result.file_name}`); } /** @@ -294,17 +319,18 @@ export class Vectorstore { if (!Array.isArray(queryEmbedding)) { throw new Error('Query embedding is not an array'); } - + console.log(this._doc_ids()); // Query the Pinecone index using the embedding and filter by document IDs. const queryResponse: QueryResponse = await this.index.query({ vector: queryEmbedding, filter: { - doc_id: { $in: this._doc_ids }, + doc_id: { $in: this._doc_ids() }, }, topK, includeValues: true, includeMetadata: true, }); + console.log(queryResponse); // Map the results into RAGChunks and return them. return queryResponse.matches.map( |
