diff options
| | | |
|---|---|---|
| author | A.J. Shulman <Shulman.aj@gmail.com> | 2025-04-27 14:57:39 -0400 |
| committer | A.J. Shulman <Shulman.aj@gmail.com> | 2025-04-27 14:57:39 -0400 |
| commit | 393b7f8286422c933102449eba1ba82874a48896 (patch) | |
| tree | c34cd5dffc7306a66fcfe54c81d8656c341facb9 /src/client/views/nodes/chatbot/vectorstore | |
| parent | 67a7996278ce176e227393fa410e7afc80228a83 (diff) | |
improved consistency across doc types and parsing
Diffstat (limited to 'src/client/views/nodes/chatbot/vectorstore')
| -rw-r--r-- | src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts | 49 |
1 files changed, 18 insertions, 31 deletions
diff --git a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts index 4512ae3e6..4268c0180 100644 --- a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts +++ b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts @@ -103,7 +103,7 @@ export class Vectorstore { const local_file_path: string = CsvCast(doc.data)?.url?.pathname ?? PDFCast(doc.data)?.url?.pathname ?? VideoCast(doc.data)?.url?.pathname ?? AudioCast(doc.data)?.url?.pathname; if (!local_file_path) { - console.log('Invalid file path.'); + console.log('Not adding to vectorstore. Invalid file path for vectorstore addition.'); return; } @@ -112,7 +112,11 @@ export class Vectorstore { if (isAudioOrVideo) { console.log('Processing media file...'); + progressCallback(10, 'Preparing media file for transcription...'); + + // Post to processMediaFile endpoint to get the transcript const response = await Networking.PostToServer('/processMediaFile', { fileName: path.basename(local_file_path) }); + progressCallback(60, 'Transcription completed. Processing transcript...'); // Type assertion to handle the response properties const typedResponse = response as { @@ -135,6 +139,7 @@ export class Vectorstore { input: texts, encoding_format: 'float', }); + progressCallback(85, 'Embeddings generated. 
Finalizing document...'); doc.original_segments = JSON.stringify(typedResponse.full); const doc_id = uuidv4(); @@ -154,7 +159,7 @@ export class Vectorstore { purpose: '', file_name: local_file_path, num_pages: 0, - summary: '', + summary: summary, chunks: segmentedTranscript.map((chunk, index) => ({ id: chunkIds[index], // Use pre-generated chunk ID values: (embeddingsResponse.data as Embedding[])[index].embedding, // Assign embedding @@ -171,23 +176,17 @@ export class Vectorstore { })), type: 'media', }; + progressCallback(95, 'Adding document to vectorstore...'); } catch (error) { console.error('Error generating embeddings:', error); + doc.ai_document_status = 'ERROR'; throw new Error('Embedding generation failed'); } doc.segmented_transcript = JSON.stringify(segmentedTranscript); - // Simplify chunks for storage - ensure simplified chunks use EXACTLY the same IDs - const simplifiedChunks = result.chunks.map(chunk => ({ - chunkId: chunk.id, // Use the exact same ID as the full chunk - start_time: chunk.metadata.start_time, - end_time: chunk.metadata.end_time, - indexes: chunk.metadata.indexes, - chunkType: CHUNK_TYPE.VIDEO, - text: chunk.metadata.text, - doc_id: chunk.metadata.doc_id, // Include parent doc_id for completeness - })); - doc.chunk_simpl = JSON.stringify({ chunks: simplifiedChunks }); + // Use doc manager to add simplified chunks + const docType = local_file_path.endsWith('.mp3') ? 
'audio' : 'video'; + this.docManager.addSimplifiedChunks(doc, result.chunks, docType); } else { // Process regular document console.log('Processing regular document...'); @@ -220,30 +219,18 @@ export class Vectorstore { // Register chunks with the document manager this.docManager.registerChunkIds(result.doc_id, chunkIds); - if (!doc.chunk_simpl) { - doc.chunk_simpl = JSON.stringify({ chunks: [] }); - } + // Use doc manager to add simplified chunks - determine document type from file extension + const fileExt = path.extname(local_file_path).toLowerCase(); + const docType = fileExt === '.pdf' ? 'pdf' : fileExt === '.csv' ? 'csv' : 'text'; + this.docManager.addSimplifiedChunks(doc, result.chunks, docType); + doc.summary = result.summary; doc.ai_purpose = result.purpose; - - result.chunks.forEach((chunk: RAGChunk) => { - const chunkToAdd = { - chunkId: chunk.id, // Ensure we use the exact same ID - startPage: chunk.metadata.start_page, - endPage: chunk.metadata.end_page, - location: chunk.metadata.location, - chunkType: chunk.metadata.type as CHUNK_TYPE, - text: chunk.metadata.text, - doc_id: chunk.metadata.doc_id, // Include parent doc_id for consistency - }; - const new_chunk_simpl = JSON.parse(StrCast(doc.chunk_simpl)); - new_chunk_simpl.chunks = new_chunk_simpl.chunks.concat(chunkToAdd); - doc.chunk_simpl = JSON.stringify(new_chunk_simpl); - }); } // Index the document await this.indexDocument(result); + progressCallback(100, 'Document added successfully!'); // Preserve existing metadata updates if (!doc.vectorstore_id) { |
