import { ASSISTANT_ROLE, AssistantMessage, Citation, CHUNK_TYPE, TEXT_TYPE, getChunkType } from './types';
import { v4 as uuid } from 'uuid';

/** Lookup from a citation's declared index to the id generated for it. */
type CitationIdLookup = Map<number, Citation['citation_id']>;

/**
 * Converts the XML-like markup of an assistant answer into an
 * `AssistantMessage`.
 *
 * Recognised elements:
 * - `<answer>…</answer>` (required): the body that is split into content
 *   segments.
 * - `<citation …>…</citation>` inside the answer: the opening tag must carry
 *   three double-quoted attributes which are read *positionally* as
 *   (index, chunk id, chunk type); the element body is the quoted source text.
 * - `<grounded_text …>…</grounded_text>` inside the answer: the opening tag
 *   must carry one double-quoted attribute holding a comma-separated list of
 *   citation indexes; the element body is the grounded text.
 * - `<follow_up_questions>` containing single-line `<question>` elements
 *   (optional).
 *
 * NOTE(review): attribute *names* are deliberately not pinned by the patterns
 * below (only their order is) — confirm the exact tag format against whatever
 * produces this markup.
 *
 * NOTE(review): `<citation>` elements are not removed from the answer body
 * before it is split, so their markup stays inside the surrounding
 * normal-text segments — confirm whether that is intended.
 */
export class AnswerParser {
  /**
   * Parses `xml` into an assistant message.
   *
   * @param xml Raw markup containing an `<answer>` element.
   * @returns The message with ordered content segments, the citations found in
   *   the answer (each given a fresh UUID), and any follow-up questions.
   * @throws Error when the `<answer>` element is missing, or when a grounded
   *   segment references a citation index that no citation declares.
   */
  static parse(xml: string): AssistantMessage {
    const answerMatch = /<answer>([\s\S]*?)<\/answer>/.exec(xml);
    if (!answerMatch) {
      throw new Error('Invalid XML: Missing <answer> tag.');
    }
    const answerBody = answerMatch[1].trim();

    const { citations, idByIndex } = AnswerParser.parseCitations(answerBody);

    return {
      role: ASSISTANT_ROLE.ASSISTANT,
      content: AnswerParser.parseContent(answerBody, idByIndex),
      follow_up_questions: AnswerParser.parseFollowUpQuestions(xml),
      citations,
    };
  }

  /** Collects every `<citation>` in the answer body and indexes the generated ids. */
  private static parseCitations(answerBody: string): { citations: Citation[]; idByIndex: CitationIdLookup } {
    // Built per call: a shared /g regex would keep a stale `lastIndex` if a
    // previous parse threw part-way through its scan.
    const citationRegex =
      /<citation\s+[\w-]+="([^"]*)"\s+[\w-]+="([^"]*)"\s+[\w-]+="([^"]*)"\s*>([\s\S]*?)<\/citation>/g;

    const citations: Citation[] = [];
    const idByIndex: CitationIdLookup = new Map();

    let match: RegExpExecArray | null;
    while ((match = citationRegex.exec(answerBody)) !== null) {
      const [, index, chunk_id, type, direct_text] = match;
      const citation: Citation = {
        direct_text: direct_text.trim(),
        type: getChunkType(type),
        chunk_id,
        citation_id: uuid(),
      };
      citations.push(citation);

      // Key by the index the citation declares so references stay correct even
      // if citations appear out of order; fall back to the 1-based position
      // when the declared value is not an integer.
      const declaredIndex = Number.parseInt(index, 10);
      idByIndex.set(Number.isNaN(declaredIndex) ? citations.length : declaredIndex, citation.citation_id);
    }

    return { citations, idByIndex };
  }

  /**
   * Splits the answer body into ordered segments: grounded segments come from
   * `<grounded_text>` elements, normal segments are the non-blank text between
   * them (and before the first / after the last).
   */
  private static parseContent(answerBody: string, idByIndex: CitationIdLookup): AssistantMessage['content'] {
    const groundedTextRegex = /<grounded_text\s+[\w-]+="([^"]*)"\s*>([\s\S]*?)<\/grounded_text>/g;

    const content: AssistantMessage['content'] = [];
    let cursor = 0; // end offset of the last grounded element consumed

    // Emits the text between `cursor` and `end` as a normal segment, unless it
    // is empty or whitespace only.
    const pushNormalTextUpTo = (end: number): void => {
      const text = answerBody.slice(cursor, end).trim();
      if (text) {
        content.push({ index: content.length, type: TEXT_TYPE.NORMAL, text, citation_ids: null });
      }
    };

    // `exec` with /g yields matches in ascending offset order, so no sort is needed.
    let match: RegExpExecArray | null;
    while ((match = groundedTextRegex.exec(answerBody)) !== null) {
      const [element, citationIndexes, groundedText] = match;

      pushNormalTextUpTo(match.index);
      content.push({
        index: content.length,
        type: TEXT_TYPE.GROUNDED,
        text: groundedText.trim(),
        citation_ids: AnswerParser.resolveCitationIds(citationIndexes, idByIndex),
      });
      cursor = match.index + element.length;
    }
    pushNormalTextUpTo(answerBody.length);

    return content;
  }

  /**
   * Maps a comma-separated list of citation indexes to citation ids.
   * Blank entries (e.g. from a trailing comma or an empty list) are skipped.
   *
   * @throws Error when an entry does not correspond to a known citation.
   */
  private static resolveCitationIds(citationIndexes: string, idByIndex: CitationIdLookup): Citation['citation_id'][] {
    return citationIndexes
      .split(',')
      .map(entry => entry.trim())
      .filter(entry => entry !== '')
      .map(entry => {
        const citationId = idByIndex.get(Number.parseInt(entry, 10));
        if (citationId === undefined) {
          throw new Error(`Invalid XML: grounded text references unknown citation index "${entry}".`);
        }
        return citationId;
      });
  }

  /** Extracts the trimmed text of each `<question>` inside `<follow_up_questions>`, if present. */
  private static parseFollowUpQuestions(xml: string): string[] {
    const followUpQuestionsMatch = /<follow_up_questions>([\s\S]*?)<\/follow_up_questions>/.exec(xml);
    if (!followUpQuestionsMatch) {
      return [];
    }

    // `.` does not cross newlines, so each question must sit on a single line.
    const questionRegex = /<question>(.*?)<\/question>/g;
    const followUpQuestions: string[] = [];

    let match: RegExpExecArray | null;
    while ((match = questionRegex.exec(followUpQuestionsMatch[1])) !== null) {
      followUpQuestions.push(match[1].trim());
    }
    return followUpQuestions;
  }
}