// src/client/views/nodes/chatbot/response_parsers/AnswerParser.ts

/**
 * @file AnswerParser.ts
 * @description This file defines the AnswerParser class, which processes structured XML-like responses
 * from the AI system, parsing grounded text, normal text, citations, follow-up questions, and loop summaries.
 * The parser converts the XML response into an AssistantMessage format, extracting key information like
 * citations and processing steps for further use in the assistant's workflow.
 */

import { v4 as uuid } from 'uuid';
import { ASSISTANT_ROLE, AssistantMessage, Citation, ProcessingInfo, TEXT_TYPE, getChunkType } from '../types/types';

/**
 * Parses the XML-like response text produced by the AI system into an AssistantMessage.
 *
 * Parsing is regex based (not a real XML parser), so tags and attributes must appear exactly in the
 * form the patterns below expect.
 */
export class AnswerParser {
    /**
     * Converts a raw response string into an AssistantMessage.
     *
     * The first `<answer>...</answer>` element supplies the message body. The first `<citations>`,
     * `<follow_up_questions>` and `<loop_summary>` elements found anywhere in `xml` are parsed separately and,
     * when they occur inside the answer body, stripped from it. `<normal_text>` wrappers are removed, and the
     * remaining body is split into grounded chunks (`<grounded_text citation_index="...">`) and the normal
     * text between them.
     *
     * @param xml - Raw response text.
     * @param processingInfo - Returned unchanged as `processing_info` on the result.
     * @returns The assembled assistant message; `loop_summary` is `undefined` when the tag is absent.
     * @throws Error when no `<answer>...</answer>` element is found.
     */
    static parse(xml: string, processingInfo: ProcessingInfo[]): AssistantMessage {
        const answerMatch = /<answer>([\s\S]*?)<\/answer>/.exec(xml);
        const citationsMatch = /<citations>([\s\S]*?)<\/citations>/.exec(xml);
        const followUpQuestionsMatch = /<follow_up_questions>([\s\S]*?)<\/follow_up_questions>/.exec(xml);
        const loopSummaryMatch = /<loop_summary>([\s\S]*?)<\/loop_summary>/.exec(xml);

        if (!answerMatch) {
            throw new Error('Invalid XML: Missing <answer> tag.');
        }

        // Strip the citations, follow-up questions and loop summary sections out of the answer body so that
        // only displayable text (normal and grounded) is left.
        let answerBody = answerMatch[1].trim();
        for (const section of [citationsMatch, followUpQuestionsMatch, loopSummaryMatch]) {
            if (section) {
                answerBody = answerBody.replace(section[0], '').trim();
            }
        }

        // Unwrap <normal_text> elements; anything that is not grounded text is treated as normal text below.
        answerBody = answerBody.replace(/<normal_text>([\s\S]*?)<\/normal_text>/g, '$1');

        const { citations, citationIdByIndex } = AnswerParser.parseCitations(citationsMatch ? citationsMatch[1] : '');

        const assistantResponse: AssistantMessage = {
            role: ASSISTANT_ROLE.ASSISTANT,
            content: AnswerParser.parseContent(answerBody, citationIdByIndex),
            follow_up_questions: AnswerParser.parseFollowUpQuestions(followUpQuestionsMatch ? followUpQuestionsMatch[1] : ''),
            citations,
            processing_info: processingInfo,
            loop_summary: loopSummaryMatch ? loopSummaryMatch[1].trim() : undefined,
        };

        return assistantResponse;
    }

    /**
     * Extracts every `<citation index chunk_id type>` element from the inner text of `<citations>`.
     * Each citation receives a freshly generated id; the returned map links the (trimmed) `index`
     * attribute to that id so grounded text can refer to it.
     */
    private static parseCitations(citationsXml: string): { citations: Citation[]; citationIdByIndex: Map<string, string> } {
        const citationRegex = /<citation index="([^"]+)" chunk_id="([^"]+)" type="([^"]+)">([\s\S]*?)<\/citation>/g;
        const citations: Citation[] = [];
        const citationIdByIndex = new Map<string, string>();

        let citationMatch: RegExpExecArray | null;
        while ((citationMatch = citationRegex.exec(citationsXml)) !== null) {
            const [, index, chunk_id, type, direct_text] = citationMatch;
            const citation_id = uuid();
            citationIdByIndex.set(index.trim(), citation_id);
            citations.push({
                direct_text: direct_text.trim(),
                type: getChunkType(type),
                chunk_id,
                citation_id,
            });
        }

        return { citations, citationIdByIndex };
    }

    /**
     * Splits the answer body into an ordered list of normal and grounded text chunks.
     * Whitespace-only stretches between grounded chunks are skipped. A grounded chunk's `citation_index`
     * may be a comma separated list; each entry is trimmed before lookup, and an entry with no matching
     * citation yields an empty string id.
     */
    private static parseContent(answerBody: string, citationIdByIndex: Map<string, string>): AssistantMessage['content'] {
        const groundedTextRegex = /<grounded_text citation_index="([^"]+)">([\s\S]*?)<\/grounded_text>/g;
        const content: AssistantMessage['content'] = [];

        // Appends a normal-text chunk unless the slice is empty after trimming.
        const pushNormalText = (rawText: string): void => {
            const text = rawText.trim();
            if (text) {
                content.push({
                    index: content.length,
                    type: TEXT_TYPE.NORMAL,
                    text,
                    citation_ids: null,
                });
            }
        };

        let lastIndex = 0;
        let match: RegExpExecArray | null;
        while ((match = groundedTextRegex.exec(answerBody)) !== null) {
            const [fullMatch, citationIndex, groundedText] = match;

            // Normal text sitting between the previous grounded chunk (or the start) and this one.
            if (match.index > lastIndex) {
                pushNormalText(answerBody.slice(lastIndex, match.index));
            }

            // Trim each index so that "1, 2" resolves the same way as "1,2".
            const citation_ids = citationIndex.split(',').map(index => citationIdByIndex.get(index.trim()) ?? '');
            content.push({
                index: content.length,
                type: TEXT_TYPE.GROUNDED,
                text: groundedText.trim(),
                citation_ids,
            });

            lastIndex = match.index + fullMatch.length;
        }

        // Whatever follows the last grounded chunk (or the whole body when there was none).
        if (lastIndex < answerBody.length) {
            pushNormalText(answerBody.slice(lastIndex));
        }

        return content;
    }

    /**
     * Collects the trimmed text of every `<question>` element inside `<follow_up_questions>`.
     * Questions may span several lines.
     */
    private static parseFollowUpQuestions(questionsXml: string): string[] {
        const questionRegex = /<question>([\s\S]*?)<\/question>/g;
        const followUpQuestions: string[] = [];

        let questionMatch: RegExpExecArray | null;
        while ((questionMatch = questionRegex.exec(questionsXml)) !== null) {
            followUpQuestions.push(questionMatch[1].trim());
        }

        return followUpQuestions;
    }
}