diff options
author | A.J. Shulman <Shulman.aj@gmail.com> | 2024-07-15 12:31:35 -0400 |
---|---|---|
committer | A.J. Shulman <Shulman.aj@gmail.com> | 2024-07-15 12:31:35 -0400 |
commit | ef79b7d617035c52fea159225ba9a39b8222e8f4 (patch) | |
tree | 2ad28baadada16e2688f922c906216b39652c28b /src/client/views/nodes/ChatBox/AnswerParser.ts | |
parent | f1cdfc1d02488c4a513fbf67f729f702526a345d (diff) |
citation parsing working much better
Diffstat (limited to 'src/client/views/nodes/ChatBox/AnswerParser.ts')
-rw-r--r-- | src/client/views/nodes/ChatBox/AnswerParser.ts | 60 |
1 files changed, 60 insertions, 0 deletions
import { ASSISTANT_ROLE, AssistantMessage, Citation, getChunkType } from './types';

/**
 * Converts the XML-like text produced by the assistant into an `AssistantMessage`.
 */
export class AnswerParser {
    /**
     * Extracts the answer text, its inline citations and any follow-up questions from `xml`.
     *
     * Recognized markup:
     * - `<answer>…</answer>` (required): the answer body.
     * - `<citation chunk_id="…" type="…">…</citation>`: inline citations inside the answer.
     *   The attributes must appear in exactly this order, double-quoted and non-empty.
     * - `<follow_up_questions>` wrapping `<question>…</question>` entries (optional); it is
     *   looked up anywhere in `xml`, and a copy embedded in the answer is removed from the text.
     *
     * @param xml Raw assistant response.
     * @returns A message whose `text_content` is the trimmed answer with citation and follow-up
     *          markup removed. Each citation's `location` is the character offset in
     *          `text_content` at which its tag stood.
     * @throws Error when `xml` contains no `<answer>` element.
     */
    static parse(xml: string): AssistantMessage {
        // The global regexes are created per call so no `lastIndex` state leaks between calls.
        // `[\s\S]*?` is used instead of `.*?` because `.` does not match line terminators and
        // citation/question bodies may span several lines.
        const answerRegex = /<answer>([\s\S]*?)<\/answer>/;
        const citationRegex = /<citation chunk_id="([^"]+)" type="([^"]+)">([\s\S]*?)<\/citation>/g;
        const followUpQuestionsRegex = /<follow_up_questions>([\s\S]*?)<\/follow_up_questions>/;
        const questionRegex = /<question>([\s\S]*?)<\/question>/g;

        const answerMatch = answerRegex.exec(xml);
        if (!answerMatch) {
            throw new Error('Invalid XML: Missing <answer> tag.');
        }

        // Remove an embedded follow-up block before locating citations, so that offsets are
        // measured against the text that is actually returned.
        const answerBody = answerMatch[1].replace(followUpQuestionsRegex, '');

        // Walk the citations once, building the citation-free text as we go. The length of the
        // text accumulated so far is the position the citation occupies in the plain text.
        const found: { chunkId: string; type: string; directText: string; offset: number }[] = [];
        let plainText = '';
        let cursor = 0;
        let match: RegExpExecArray | null;
        while ((match = citationRegex.exec(answerBody)) !== null) {
            const [fullMatch, chunkId, type, directText] = match;
            plainText += answerBody.slice(cursor, match.index);
            found.push({ chunkId, type, directText: directText.trim(), offset: plainText.length });
            cursor = match.index + fullMatch.length;
        }
        plainText += answerBody.slice(cursor);

        // `text_content` is trimmed, so shift offsets by the leading whitespace that was removed
        // and clamp them into the final string.
        const leadingWhitespace = plainText.length - plainText.replace(/^\s+/, '').length;
        const textContent = plainText.trim();

        const citations: Citation[] = found.map(entry => ({
            direct_text: entry.directText,
            type: getChunkType(entry.type),
            chunk_id: entry.chunkId,
            location: Math.min(Math.max(entry.offset - leadingWhitespace, 0), textContent.length),
        }));

        const followUpQuestions: string[] = [];
        const followUpQuestionsMatch = followUpQuestionsRegex.exec(xml);
        if (followUpQuestionsMatch) {
            const questionsText = followUpQuestionsMatch[1];
            let questionMatch: RegExpExecArray | null;
            while ((questionMatch = questionRegex.exec(questionsText)) !== null) {
                followUpQuestions.push(questionMatch[1].trim());
            }
        }

        return {
            role: ASSISTANT_ROLE.ASSISTANT,
            text_content: textContent,
            follow_up_questions: followUpQuestions,
            citations: citations,
        };
    }
}