diff --git a/model/parsers/qwen3vl.go b/model/parsers/qwen3vl.go index a8e7376c..87f49e89 100644 --- a/model/parsers/qwen3vl.go +++ b/model/parsers/qwen3vl.go @@ -16,6 +16,8 @@ const ( CollectingThinkingContent qwenParserState = iota CollectingContent CollectingToolContent + ThinkingDoneEatingWhitespace + ToolCallDoneEatingWhitespace ) const ( @@ -111,17 +113,28 @@ func (p *Qwen3VLParser) parseEvents() []qwenEvent { return all } -func emitContentBeforeTag(p *Qwen3VLParser, events []qwenEvent, tag string) []qwenEvent { +func splitAtTag(p *Qwen3VLParser, tag string, trimAfter bool) (string, string) { split := strings.SplitN(p.buffer.String(), tag, 2) before := split[0] before = strings.TrimRightFunc(before, unicode.IsSpace) - if len(before) > 0 { - events = append(events, qwenEventContent{content: before}) - } after := split[1] + if trimAfter { + after = strings.TrimLeftFunc(after, unicode.IsSpace) + } p.buffer.Reset() p.buffer.WriteString(after) - return events + return before, after // return events +} + +func (p *Qwen3VLParser) eatLeadingWhitespaceAndTransitionTo(nextState qwenParserState) ([]qwenEvent, bool) { + trimmed := strings.TrimLeftFunc(p.buffer.String(), unicode.IsSpace) + p.buffer.Reset() + if trimmed == "" { + return nil, false + } + p.state = nextState + p.buffer.WriteString(trimmed) + return nil, true } func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) { @@ -130,7 +143,11 @@ func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) { switch p.state { case CollectingContent: if strings.Contains(p.buffer.String(), toolOpenTag) { - events = emitContentBeforeTag(p, events, toolOpenTag) + // events = emitContentBeforeTag(p, events, toolOpenTag) + before, _ := splitAtTag(p, toolOpenTag, false) + if len(before) > 0 { + events = append(events, qwenEventContent{content: before}) + } p.state = CollectingToolContent return events, true } else if overlapLen := overlap(p.buffer.String(), toolOpenTag); overlapLen > 0 { @@ -167,27 +184,26 @@ func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) { slog.Warn("qwen tool call closing tag found but no content before it") } - after := strings.TrimLeftFunc(split[1], unicode.IsSpace) + after := split[1] events = append(events, qwenEventRawToolCall{raw: before}) p.buffer.Reset() p.buffer.WriteString(after) - p.state = CollectingContent + p.state = ToolCallDoneEatingWhitespace return events, true } else { return events, false } case CollectingThinkingContent: if strings.Contains(p.buffer.String(), thinkingCloseTag) { - split := strings.SplitN(p.buffer.String(), thinkingCloseTag, 2) - // before := split[0] - before := strings.TrimRightFunc(split[0], unicode.IsSpace) - after := strings.TrimLeftFunc(split[1], unicode.IsSpace) - if len(before) > 0 { - events = append(events, qwenEventThinkingContent{content: before}) + thinking, remaining := splitAtTag(p, thinkingCloseTag, true) + if len(thinking) > 0 { + events = append(events, qwenEventThinkingContent{content: thinking}) + } + if remaining == "" { + p.state = ThinkingDoneEatingWhitespace + } else { + p.state = CollectingContent } - p.buffer.Reset() - p.buffer.WriteString(after) - p.state = CollectingContent return events, true } else if overlapLen := overlap(p.buffer.String(), thinkingCloseTag); overlapLen > 0 { beforePartialTag := p.buffer.String()[:len(p.buffer.String())-overlapLen] @@ -215,6 +231,10 @@ func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) { } return events, false } + case ThinkingDoneEatingWhitespace: + return p.eatLeadingWhitespaceAndTransitionTo(CollectingContent) + case ToolCallDoneEatingWhitespace: + return p.eatLeadingWhitespaceAndTransitionTo(CollectingContent) default: panic("unreachable") } diff --git a/model/parsers/qwen3vl_nonthinking_test.go b/model/parsers/qwen3vl_nonthinking_test.go index 74392946..e0b9a02b 100644 --- a/model/parsers/qwen3vl_nonthinking_test.go +++ b/model/parsers/qwen3vl_nonthinking_test.go @@ -653,3 +653,189 @@ func TestQwen3VLNonThinkingToolParser(t *testing.T) { } } } + +func TestQwen3VLNonThinkingToolCallWhitespaceHandling(t *testing.T) { + type step struct { + input string + wantEvents []qwenEvent + } + + cases := []struct { + desc string + steps []step + only bool + }{ + { + desc: "whitespace inside tool call preserves trailing space", + steps: []step{ + { + input: "before tool content after", + wantEvents: []qwenEvent{ + qwenEventContent{content: "before"}, + qwenEventRawToolCall{raw: " tool content "}, + qwenEventContent{content: "after"}, + }, + }, + }, + }, + { + desc: "whitespace inside tool call preserves trailing space", + steps: []step{ + { + input: "\n \n \n \n \n \n blahhhhhhhhhh blahhhh blahhhh \n\n\n\t\t tool content \n\n\n\n\n\n\n after", + wantEvents: []qwenEvent{ + qwenEventContent{content: "\n \n \n \n \n \n blahhhhhhhhhh blahhhh blahhhh"}, + qwenEventRawToolCall{raw: " tool content "}, + qwenEventContent{content: "after"}, + }, + }, + }, + }, + { + desc: "whitespace inside tool call preserves trailing space", + steps: []step{ + { + input: " tool content ", + wantEvents: []qwenEvent{ + qwenEventRawToolCall{raw: " tool content "}, + }, + }, + { + input: "\n \n \n \n \n \n blahhhhhhhhhh blahhhh blahhhh \n\n\n\t\t anotha one \n\n\n\n\n\n\n after \n\n\n\n\n\n blep", + wantEvents: []qwenEvent{ + qwenEventContent{content: "blahhhhhhhhhh blahhhh blahhhh"}, + qwenEventRawToolCall{raw: " anotha one "}, + qwenEventContent{content: "after \n\n\n\n\n\n blep"}, + }, + }, + }, + }, + { + desc: "whitespace between content and tool call", + steps: []step{ + { + input: "content \n tool \n more content", + wantEvents: []qwenEvent{ + qwenEventContent{content: "content"}, + qwenEventRawToolCall{raw: "tool"}, + qwenEventContent{content: "more content"}, + }, + }, + }, + }, + { + desc: "consecutive tool calls with whitespace", + steps: []step{ + { + input: "first \n second \n third", + wantEvents: []qwenEvent{ + qwenEventRawToolCall{raw: "first"}, + qwenEventRawToolCall{raw: "second"}, + qwenEventRawToolCall{raw: "third"}, + }, + }, + }, + }, + { + desc: "whitespace before and after tool open tag", + steps: []step{ + { + input: "text \n content", + wantEvents: []qwenEvent{ + qwenEventContent{content: "text"}, + qwenEventRawToolCall{raw: "content"}, + }, + }, + }, + }, + { + desc: "unicode whitespace around tool calls", + steps: []step{ + { + input: "text\u00a0\u3000content\u00a0\u3000text", + wantEvents: []qwenEvent{ + qwenEventContent{content: "text"}, + qwenEventRawToolCall{raw: "content"}, + qwenEventContent{content: "text"}, + }, + }, + }, + }, + { + desc: "empty tool call with surrounding whitespace", + steps: []step{ + { + input: "before after", + wantEvents: []qwenEvent{ + qwenEventContent{content: "before"}, + qwenEventRawToolCall{raw: ""}, + qwenEventContent{content: "after"}, + }, + }, + }, + }, + { + desc: "whitespace in tool call split across chunks", + steps: []step{ + { + input: "before ", + wantEvents: []qwenEvent{qwenEventContent{content: "before"}}, + }, + { + input: "tool", + wantEvents: []qwenEvent{}, + }, + { + input: " after", + wantEvents: []qwenEvent{ + qwenEventRawToolCall{raw: " tool "}, + qwenEventContent{content: "after"}, + }, + }, + }, + }, + { + desc: "mixed whitespace types between tool calls", + steps: []step{ + { + input: "first \t\n\r second", + wantEvents: []qwenEvent{ + qwenEventRawToolCall{raw: "first"}, + qwenEventRawToolCall{raw: "second"}, + }, + }, + }, + }, + } + + anyOnlies := false + for _, tc := range cases { + if tc.only { + anyOnlies = true + } + } + + for _, tc := range cases { + if anyOnlies && !tc.only { + continue + } + + t.Run(tc.desc, func(t *testing.T) { + parser := Qwen3VLParser{hasThinkingSupport: false} + parser.Init([]api.Tool{}, nil) + + for i, step := range tc.steps { + parser.buffer.WriteString(step.input) + gotEvents := parser.parseEvents() + + if len(gotEvents) == 0 && len(step.wantEvents) == 0 { + continue + } + + if !reflect.DeepEqual(gotEvents, step.wantEvents) { + t.Errorf("step %d: input %q: got events %#v, want %#v", i, step.input, gotEvents, step.wantEvents) + } + } + }) + } +} diff --git a/model/parsers/qwen3vl_thinking_test.go b/model/parsers/qwen3vl_thinking_test.go index d85a60fd..04b2a7db 100644 --- a/model/parsers/qwen3vl_thinking_test.go +++ b/model/parsers/qwen3vl_thinking_test.go @@ -546,3 +546,333 @@ func TestQwen3VLThinkingParserStreamingAssistantPrefillContent(t *testing.T) { } } } + +func TestQwen3VLThinkingWhitespaceHandling(t *testing.T) { + type step struct { + input string + wantEvents []qwenEvent + } + + cases := []struct { + desc string + steps []step + only bool + }{ + { + desc: "whitespace after thinking tag is trimmed", + steps: []step{ + { + input: "thinking content \n\t content starts here", + wantEvents: []qwenEvent{ + qwenEventThinkingContent{content: "thinking content"}, + qwenEventContent{content: "content starts here"}, + }, + }, + }, + }, + { + desc: "whitespace after thinking tag split across chunks", + steps: []step{ + { + input: "thinking content ", + wantEvents: []qwenEvent{qwenEventThinkingContent{content: "thinking content"}}, + }, + { + input: " \n\t", + wantEvents: []qwenEvent{}, + }, + { + input: "content", + wantEvents: []qwenEvent{ + qwenEventContent{content: "content"}, + }, + }, + }, + }, + { + desc: "only whitespace after thinking tag", + steps: []step{ + { + input: "thinking content \n\t ", + wantEvents: []qwenEvent{qwenEventThinkingContent{content: "thinking content"}}, + }, + }, + }, + { + desc: "multiple spaces and tabs after thinking", + steps: []step{ + { + input: "think \t\t\n\n text", + wantEvents: []qwenEvent{ + qwenEventThinkingContent{content: "think"}, + qwenEventContent{content: "text"}, + }, + }, + }, + }, + { + desc: "trailing whitespace before thinking tag is preserved in content", + steps: []step{ + { + input: "thinking with spaces text", + wantEvents: []qwenEvent{ + qwenEventThinkingContent{content: "thinking with spaces"}, + qwenEventContent{content: "text"}, + }, + }, + }, + }, + { + desc: "whitespace between thinking and tool call", + steps: []step{ + { + input: "thinking \n {\"name\":\"test\"}", + wantEvents: []qwenEvent{ + qwenEventThinkingContent{content: "thinking"}, + qwenEventRawToolCall{raw: "{\"name\":\"test\"}"}, + }, + }, + }, + }, + { + desc: "no whitespace after thinking tag", + steps: []step{ + { + input: "thinkingcontent", + wantEvents: []qwenEvent{ + qwenEventThinkingContent{content: "thinking"}, + qwenEventContent{content: "content"}, + }, + }, + }, + }, + { + desc: "unicode whitespace after thinking tag", + steps: []step{ + { + input: "thinking\u00a0\u3000content", + wantEvents: []qwenEvent{ + qwenEventThinkingContent{content: "thinking"}, + qwenEventContent{content: "content"}, + }, + }, + }, + }, + { + desc: "whitespace split with partial thinking tag", + steps: []step{ + { + input: "thinking \n", + wantEvents: []qwenEvent{}, + }, + { + input: " content", + wantEvents: []qwenEvent{ + qwenEventContent{content: "content"}, + }, + }, + }, + }, + { + desc: "empty thinking tag with whitespace after", + steps: []step{ + { + input: " \ncontent", + wantEvents: []qwenEvent{ + qwenEventContent{content: "content"}, + }, + }, + }, + }, + { + desc: "whitespace inside tool call preserves trailing space", + steps: []step{ + { + input: "bruh \n \n \n \n \n \n blahhhhhhhhhh blahhhh blahhhh \n\n\n\t\t tool content \n\n\n\n\n\n\n after", + wantEvents: []qwenEvent{ + qwenEventThinkingContent{content: "bruh"}, + qwenEventContent{content: "blahhhhhhhhhh blahhhh blahhhh"}, + qwenEventRawToolCall{raw: " tool content "}, + qwenEventContent{content: "after"}, + }, + }, + }, + }, + { + desc: "whitespace inside tool call preserves trailing space", + steps: []step{ + { + input: "bruh shdjfhksdhfj ", + wantEvents: []qwenEvent{ + qwenEventThinkingContent{content: "bruh"}, + qwenEventContent{content: "shdjfhksdhfj"}, + }, + }, + { + input: "another word ", + wantEvents: []qwenEvent{ + qwenEventContent{content: " another word"}, + }, + }, + { + input: " tool content ", + wantEvents: []qwenEvent{ + qwenEventRawToolCall{raw: " tool content "}, + }, + }, + { + input: "\n \n \n \n \n \n blahhhhhhhhhh blahhhh blahhhh \n\n\n\t\t anotha one \n\n\n\n\n\n\n after \n\n\n\n\n\n blep", + wantEvents: []qwenEvent{ + qwenEventContent{content: "blahhhhhhhhhh blahhhh blahhhh"}, + qwenEventRawToolCall{raw: " anotha one "}, + qwenEventContent{content: "after \n\n\n\n\n\n blep"}, + }, + }, + }, + }, + } + + anyOnlies := false + for _, tc := range cases { + if tc.only { + anyOnlies = true + } + } + + for _, tc := range cases { + if anyOnlies && !tc.only { + continue + } + + t.Run(tc.desc, func(t *testing.T) { + parser := Qwen3VLParser{hasThinkingSupport: true} + parser.Init([]api.Tool{}, nil) + + for i, step := range tc.steps { + parser.buffer.WriteString(step.input) + gotEvents := parser.parseEvents() + + if len(gotEvents) == 0 && len(step.wantEvents) == 0 { + continue + } + + if !reflect.DeepEqual(gotEvents, step.wantEvents) { + t.Errorf("step %d: input %q: got events %#v, want %#v", i, step.input, gotEvents, step.wantEvents) + } + } + }) + } +} + +func TestQwen3VLToolCallWhitespaceHandling(t *testing.T) { + type step struct { + input string + wantEvents []qwenEvent + } + + cases := []struct { + desc string + steps []step + only bool + prefillMsg *api.Message // allows starting in content mode instead of thinking mode + }{ + { + desc: "whitespace inside tool call is fully preserved (with content prefill)", + prefillMsg: &api.Message{Role: "assistant", Content: "prefill"}, + steps: []step{ + { + input: "before tool content \n after", + wantEvents: []qwenEvent{ + qwenEventContent{content: "before"}, + qwenEventRawToolCall{raw: " tool content "}, + qwenEventContent{content: "after"}, + }, + }, + }, + }, + { + desc: "whitespace after tool call trimmed across chunks (with content prefill)", + prefillMsg: &api.Message{Role: "assistant", Content: "prefill"}, + steps: []step{ + { + input: "beforetool ", + wantEvents: []qwenEvent{ + qwenEventContent{content: "before"}, + qwenEventRawToolCall{raw: "tool"}, + }, + }, + { + input: "\n\t", + wantEvents: []qwenEvent{}, + }, + { + input: "after \n this is a song", + wantEvents: []qwenEvent{ + qwenEventContent{content: "after \n this is a song"}, + }, + }, + }, + }, + { + desc: "multiple tool calls with whitespace between (with content prefill)", + prefillMsg: &api.Message{Role: "assistant", Content: "prefill"}, + steps: []step{ + { + input: "first \n second", + wantEvents: []qwenEvent{ + qwenEventRawToolCall{raw: "first"}, + qwenEventRawToolCall{raw: "second"}, + }, + }, + }, + }, + { + desc: "thinking with whitespace then tool call", + steps: []step{ + { + input: "thinking \n tool \n content", + wantEvents: []qwenEvent{ + qwenEventThinkingContent{content: "thinking"}, + qwenEventRawToolCall{raw: "tool"}, + qwenEventContent{content: "content"}, + }, + }, + }, + }, + } + + anyOnlies := false + for _, tc := range cases { + if tc.only { + anyOnlies = true + } + } + + for _, tc := range cases { + if anyOnlies && !tc.only { + continue + } + + t.Run(tc.desc, func(t *testing.T) { + parser := Qwen3VLParser{hasThinkingSupport: true} + parser.Init([]api.Tool{}, tc.prefillMsg) + + for i, step := range tc.steps { + parser.buffer.WriteString(step.input) + gotEvents := parser.parseEvents() + + if len(gotEvents) == 0 && len(step.wantEvents) == 0 { + continue + } + + if !reflect.DeepEqual(gotEvents, step.wantEvents) { + t.Errorf("step %d: input %q: got events %#v, want %#v", i, step.input, gotEvents, step.wantEvents) + } + } + }) + } +}