about | summary | refs | log | tree | commit | diff
path: root/src/client/views/nodes/chatbot/vectorstore
diff options
context:
space:
mode:
author A.J. Shulman <Shulman.aj@gmail.com> 2025-04-27 14:57:39 -0400
committer A.J. Shulman <Shulman.aj@gmail.com> 2025-04-27 14:57:39 -0400
commit 393b7f8286422c933102449eba1ba82874a48896 (patch)
tree c34cd5dffc7306a66fcfe54c81d8656c341facb9 /src/client/views/nodes/chatbot/vectorstore
parent 67a7996278ce176e227393fa410e7afc80228a83 (diff)
improved consistency across doc types and parsing
Diffstat (limited to 'src/client/views/nodes/chatbot/vectorstore')
-rw-r--r-- src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts 49
1 files changed, 18 insertions, 31 deletions
diff --git a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
index 4512ae3e6..4268c0180 100644
--- a/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
+++ b/src/client/views/nodes/chatbot/vectorstore/Vectorstore.ts
@@ -103,7 +103,7 @@ export class Vectorstore {
const local_file_path: string = CsvCast(doc.data)?.url?.pathname ?? PDFCast(doc.data)?.url?.pathname ?? VideoCast(doc.data)?.url?.pathname ?? AudioCast(doc.data)?.url?.pathname;
if (!local_file_path) {
- console.log('Invalid file path.');
+ console.log('Not adding to vectorstore. Invalid file path for vectorstore addition.');
return;
}
@@ -112,7 +112,11 @@ export class Vectorstore {
if (isAudioOrVideo) {
console.log('Processing media file...');
+ progressCallback(10, 'Preparing media file for transcription...');
+
+ // Post to processMediaFile endpoint to get the transcript
const response = await Networking.PostToServer('/processMediaFile', { fileName: path.basename(local_file_path) });
+ progressCallback(60, 'Transcription completed. Processing transcript...');
// Type assertion to handle the response properties
const typedResponse = response as {
@@ -135,6 +139,7 @@ export class Vectorstore {
input: texts,
encoding_format: 'float',
});
+ progressCallback(85, 'Embeddings generated. Finalizing document...');
doc.original_segments = JSON.stringify(typedResponse.full);
const doc_id = uuidv4();
@@ -154,7 +159,7 @@ export class Vectorstore {
purpose: '',
file_name: local_file_path,
num_pages: 0,
- summary: '',
+ summary: summary,
chunks: segmentedTranscript.map((chunk, index) => ({
id: chunkIds[index], // Use pre-generated chunk ID
values: (embeddingsResponse.data as Embedding[])[index].embedding, // Assign embedding
@@ -171,23 +176,17 @@ export class Vectorstore {
})),
type: 'media',
};
+ progressCallback(95, 'Adding document to vectorstore...');
} catch (error) {
console.error('Error generating embeddings:', error);
+ doc.ai_document_status = 'ERROR';
throw new Error('Embedding generation failed');
}
doc.segmented_transcript = JSON.stringify(segmentedTranscript);
- // Simplify chunks for storage - ensure simplified chunks use EXACTLY the same IDs
- const simplifiedChunks = result.chunks.map(chunk => ({
- chunkId: chunk.id, // Use the exact same ID as the full chunk
- start_time: chunk.metadata.start_time,
- end_time: chunk.metadata.end_time,
- indexes: chunk.metadata.indexes,
- chunkType: CHUNK_TYPE.VIDEO,
- text: chunk.metadata.text,
- doc_id: chunk.metadata.doc_id, // Include parent doc_id for completeness
- }));
- doc.chunk_simpl = JSON.stringify({ chunks: simplifiedChunks });
+ // Use doc manager to add simplified chunks
+ const docType = local_file_path.endsWith('.mp3') ? 'audio' : 'video';
+ this.docManager.addSimplifiedChunks(doc, result.chunks, docType);
} else {
// Process regular document
console.log('Processing regular document...');
@@ -220,30 +219,18 @@ export class Vectorstore {
// Register chunks with the document manager
this.docManager.registerChunkIds(result.doc_id, chunkIds);
- if (!doc.chunk_simpl) {
- doc.chunk_simpl = JSON.stringify({ chunks: [] });
- }
+ // Use doc manager to add simplified chunks - determine document type from file extension
+ const fileExt = path.extname(local_file_path).toLowerCase();
+ const docType = fileExt === '.pdf' ? 'pdf' : fileExt === '.csv' ? 'csv' : 'text';
+ this.docManager.addSimplifiedChunks(doc, result.chunks, docType);
+
doc.summary = result.summary;
doc.ai_purpose = result.purpose;
-
- result.chunks.forEach((chunk: RAGChunk) => {
- const chunkToAdd = {
- chunkId: chunk.id, // Ensure we use the exact same ID
- startPage: chunk.metadata.start_page,
- endPage: chunk.metadata.end_page,
- location: chunk.metadata.location,
- chunkType: chunk.metadata.type as CHUNK_TYPE,
- text: chunk.metadata.text,
- doc_id: chunk.metadata.doc_id, // Include parent doc_id for consistency
- };
- const new_chunk_simpl = JSON.parse(StrCast(doc.chunk_simpl));
- new_chunk_simpl.chunks = new_chunk_simpl.chunks.concat(chunkToAdd);
- doc.chunk_simpl = JSON.stringify(new_chunk_simpl);
- });
}
// Index the document
await this.indexDocument(result);
+ progressCallback(100, 'Document added successfully!');
// Preserve existing metadata updates
if (!doc.vectorstore_id) {