src/client/views/nodes/ChatBox/AnswerParser.ts


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125

import { ASSISTANT_ROLE, AssistantMessage, Citation, CHUNK_TYPE, TEXT_TYPE, getChunkType, ProcessingInfo } from './types';
import { v4 as uuid } from 'uuid';

/**
 * Converts an XML-like response string into a structured `AssistantMessage`.
 *
 * The expected shape is an `<answer>` element whose body mixes plain text,
 * `<normal_text>` and `<grounded_text citation_index="...">` segments, and which
 * may be accompanied by `<citations>`, `<follow_up_questions>` and
 * `<loop_summary>` sections.
 */
export class AnswerParser {
    /**
     * Parses a response string into an `AssistantMessage`.
     *
     * @param xml - The raw response text containing an `<answer>` element.
     * @param processingInfo - Passed through unchanged as `processing_info` on the result.
     * @returns The assistant message with ordered content segments, citations,
     *          follow-up questions and (if present) the trimmed loop summary.
     * @throws Error if `xml` contains no `<answer>...</answer>` element.
     */
    static parse(xml: string, processingInfo: ProcessingInfo[]): AssistantMessage {
        const answerMatch = /<answer>([\s\S]*?)<\/answer>/.exec(xml);
        if (!answerMatch) {
            throw new Error('Invalid XML: Missing <answer> tag.');
        }

        // The auxiliary sections are searched for in the whole input, not only inside <answer>.
        const citationsMatch = /<citations>([\s\S]*?)<\/citations>/.exec(xml);
        const followUpQuestionsMatch = /<follow_up_questions>([\s\S]*?)<\/follow_up_questions>/.exec(xml);
        const loopSummaryMatch = /<loop_summary>([\s\S]*?)<\/loop_summary>/.exec(xml);

        // Strip the citations, follow-up questions and loop summary sections out of the
        // answer body so that only the displayable text segments remain.
        let rawTextContent = answerMatch[1].trim();
        for (const sectionMatch of [citationsMatch, followUpQuestionsMatch, loopSummaryMatch]) {
            if (sectionMatch) {
                rawTextContent = rawTextContent.replace(sectionMatch[0], '').trim();
            }
        }

        const { citations, citationMap } = AnswerParser.parseCitations(citationsMatch ? citationsMatch[1] : '');

        // Unwrap <normal_text> so its body is treated like any other un-tagged text.
        rawTextContent = rawTextContent.replace(/<normal_text>([\s\S]*?)<\/normal_text>/g, '$1');

        const assistantResponse: AssistantMessage = {
            role: ASSISTANT_ROLE.ASSISTANT,
            content: AnswerParser.parseContent(rawTextContent, citationMap),
            follow_up_questions: AnswerParser.parseFollowUpQuestions(followUpQuestionsMatch ? followUpQuestionsMatch[1] : ''),
            citations,
            processing_info: processingInfo,
            loop_summary: loopSummaryMatch ? loopSummaryMatch[1].trim() : undefined,
        };

        return assistantResponse;
    }

    /**
     * Extracts every `<citation>` element from the body of a `<citations>` section.
     *
     * Each citation receives a freshly generated id; `citationMap` maps the
     * (trimmed) `index` attribute of the citation to that id.
     */
    private static parseCitations(citationsContent: string): { citations: Citation[]; citationMap: Map<string, string> } {
        // The regex is created per call because a /g regex carries `lastIndex` state.
        const citationRegex = /<citation index="([^"]+)" chunk_id="([^"]+)" type="([^"]+)">([\s\S]*?)<\/citation>/g;
        const citations: Citation[] = [];
        const citationMap = new Map<string, string>();

        let citationMatch: RegExpExecArray | null;
        while ((citationMatch = citationRegex.exec(citationsContent)) !== null) {
            const [, index, chunk_id, type, direct_text] = citationMatch;
            const citation_id = uuid();
            // Trim the key so it matches the trimmed lookups done for grounded text.
            citationMap.set(index.trim(), citation_id);
            citations.push({
                direct_text: direct_text.trim(),
                type: getChunkType(type),
                chunk_id,
                citation_id,
            });
        }

        return { citations, citationMap };
    }

    /**
     * Splits the answer body into ordered segments: each `<grounded_text>`
     * element becomes a grounded segment, and any non-blank text between or
     * around those elements becomes a normal segment.
     */
    private static parseContent(rawTextContent: string, citationMap: Map<string, string>): AssistantMessage['content'] {
        const groundedTextRegex = /<grounded_text citation_index="([^"]+)">([\s\S]*?)<\/grounded_text>/g;
        const content: AssistantMessage['content'] = [];

        // Appends a normal segment unless the text is blank after trimming.
        const pushNormalText = (text: string): void => {
            const normalText = text.trim();
            if (normalText) {
                content.push({
                    index: content.length,
                    type: TEXT_TYPE.NORMAL,
                    text: normalText,
                    citation_ids: null,
                });
            }
        };

        let lastIndex = 0;
        let match: RegExpExecArray | null;
        while ((match = groundedTextRegex.exec(rawTextContent)) !== null) {
            const [fullMatch, citationIndex, groundedText] = match;

            // Text between the previous grounded segment and this one.
            pushNormalText(rawTextContent.slice(lastIndex, match.index));

            // `citation_index` may list several indices ("1,2" or "1, 2"); trim each
            // one so whitespace after a comma does not break the lookup. Indices with
            // no matching citation map to '' to keep positions aligned.
            const citation_ids = citationIndex.split(',').map(index => citationMap.get(index.trim()) || '');
            content.push({
                index: content.length,
                type: TEXT_TYPE.GROUNDED,
                text: groundedText.trim(),
                citation_ids,
            });

            lastIndex = match.index + fullMatch.length;
        }

        // Text after the last grounded segment (or the whole body if there was none).
        pushNormalText(rawTextContent.slice(lastIndex));

        return content;
    }

    /**
     * Collects the trimmed text of every `<question>` element in the body of a
     * `<follow_up_questions>` section.
     */
    private static parseFollowUpQuestions(questionsText: string): string[] {
        // [\s\S] rather than `.` so that a question spanning several lines is still captured.
        const questionRegex = /<question>([\s\S]*?)<\/question>/g;
        const followUpQuestions: string[] = [];

        let questionMatch: RegExpExecArray | null;
        while ((questionMatch = questionRegex.exec(questionsText)) !== null) {
            followUpQuestions.push(questionMatch[1].trim());
        }

        return followUpQuestions;
    }
}