parent 75e75d9afe
commit f67a6df110

llama/patches/0032-interleave-multi-rope.patch (new normal file, 113 lines)
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <git@mxy.ng>
Date: Wed, 16 Oct 2025 20:37:19 -0700
Subject: [PATCH] interleave multi rope

since ollama doesn't use mrope for anything else, change it to mean the
interleaved version used for qwen3vl
---
 ggml/src/ggml-cpu/ops.cpp                           |  7 ++-----
 ggml/src/ggml-cuda/rope.cu                          | 12 +++---------
 ggml/src/ggml-metal/ggml-metal.metal                | 10 +++-------
 ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp | 12 +++---------
 4 files changed, 11 insertions(+), 30 deletions(-)

diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 31478dd8e..4d1ed207e 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -5509,15 +5509,12 @@ static void ggml_mrope_cache_init(
         }

         float theta = theta_t;
-        if (sector >= sections[0] && sector < sec_w) {
+        if (sector % 3 == 1 && sector < 1 + 3 * sections[1]) {
             theta = theta_h;
         }
-        else if (sector >= sec_w && sector < sec_w + sections[2]) {
+        else if (sector % 3 == 2 && sector < 2 + 3 * sections[2]) {
             theta = theta_w;
         }
-        else if (sector >= sec_w + sections[2]) {
-            theta = theta_e;
-        }

         rope_yarn(
             theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu
index d058504cd..287fe9d2c 100644
--- a/ggml/src/ggml-cuda/rope.cu
+++ b/ggml/src/ggml-cuda/rope.cu
@@ -151,19 +151,13 @@ static __global__ void rope_multi(
     const int sec_w = sections.v[1] + sections.v[0];
     const int sector = (i0 / 2) % sect_dims;

-    float theta_base = 0.0;
-    if (sector < sections.v[0]) {
-        theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
-    }
-    else if (sector >= sections.v[0] && sector < sec_w) {
+    float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
+    if (sector % 3 == 1 && sector < 1 + 3 * sections.v[1]) {
         theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f);
     }
-    else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
+    else if (sector % 3 == 2 && sector < 2 + 3 * sections.v[2]) {
         theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f);
     }
-    else if (sector >= sec_w + sections.v[2]) {
-        theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f);
-    }

     const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;

diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 375a0c7fd..9866c96b4 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -3858,15 +3858,11 @@ kernel void kernel_rope_multi(
     const int sec_w012 = args.sect_0 + args.sect_1 + args.sect_2; // end of section 2
     const int sector = ic % sect_dims;

-    float theta_base;
-    if (sector < args.sect_0) {
-        theta_base = (float) pos[i2];
-    } else if (sector < sec_w01) {
+    float theta_base = (float) pos[i2];
+    if (sector % 3 == 1 && sector < 1 + 3 * args.sect_1) {
         theta_base = (float) pos[i2 + args.ne02];
-    } else if (sector < sec_w012) {
+    } else if (sector % 3 == 2 && sector < 2 + 3 * args.sect_2) {
         theta_base = (float) pos[i2 + args.ne02 * 2];
-    } else {
-        theta_base = (float) pos[i2 + args.ne02 * 3];
     }
     // end of mrope

diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp
index 111286b49..6fc2b42f8 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp
@@ -31,19 +31,13 @@ void main() {
     const int sec_w = p.sections[1] + p.sections[0];
     const uint sector = (i0 / 2) % sect_dims;

-    float theta_base = 0.0;
-    if (sector < p.sections[0]) {
-        theta_base = data_pos[channel_x]*pow(p.theta_scale, i0/2.0f);
-    }
-    else if (sector >= p.sections[0] && sector < sec_w) {
+    float theta_base = data_pos[channel_x]*pow(p.theta_scale, i0/2.0f);
+    if (sector % 3 == 1 && sector < 1 + 3 * p.sections[1]) {
         theta_base = data_pos[channel_x + ne2 * 1]*pow(p.theta_scale, i0/2.0f);
     }
-    else if (sector >= sec_w && sector < sec_w + p.sections[2]) {
+    else if (sector % 3 == 2 && sector < 2 + 3 * p.sections[2]) {
         theta_base = data_pos[channel_x + ne2 * 2]*pow(p.theta_scale, i0/2.0f);
     }
-    else if (sector >= sec_w + p.sections[2]) {
-        theta_base = data_pos[channel_x + ne2 * 3]*pow(p.theta_scale, i0/2.0f);
-    }

     const float freq_factor = p.has_ff != 0 ? data_ff[i0/2] : 1.0f;
@@ -11,6 +11,7 @@ package ggml
 import "C"

 import (
+    "cmp"
     "context"
     "encoding/binary"
     "errors"
@@ -1490,14 +1491,7 @@ func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {

 func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase, ropeScale float32, options ...func(*rope.Options)) ml.Tensor {
     // Default options
-    opts := rope.Options{
-        Factors:               &Tensor{},
-        OriginalContextLength: 131072,
-        ExtrapolationFactor:   0.,
-        AttentionFactor:       1.,
-        BetaFast:              32.,
-        BetaSlow:              1.,
-    }
+    opts := rope.Options{Factors: &Tensor{}}

     // Apply any provided options
     for _, option := range options {
@@ -1509,24 +1503,44 @@ func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase
         dequant = C.ggml_cast(ctx.(*Context).ctx, t.t, C.GGML_TYPE_F32)
     }

-    return &Tensor{
-        b: t.b,
-        t: C.ggml_rope_ext(
+    var tt *C.struct_ggml_tensor
+    if len(opts.MRoPE.Sections) > 0 {
+        mropeSections := make([]C.int32_t, 4)
+        for i, section := range opts.MRoPE.Sections {
+            mropeSections[i] = C.int32_t(section)
+        }
+
+        tt = C.ggml_rope_multi(
             ctx.(*Context).ctx,
             dequant,
             positions.(*Tensor).t,
             opts.Factors.(*Tensor).t,
             C.int(ropeDim),
+            unsafe.SliceData(mropeSections),
             C.int(opts.Type),
-            C.int(opts.OriginalContextLength),
-            C.float(ropeBase),
-            C.float(ropeScale),
-            C.float(opts.ExtrapolationFactor),
-            C.float(opts.AttentionFactor),
-            C.float(opts.BetaFast),
-            C.float(opts.BetaSlow),
-        ),
+            cmp.Or(C.int(opts.YaRN.OriginalContextLength), 128<<10),
+            C.float(ropeBase), C.float(ropeScale),
+            C.float(opts.YaRN.ExtrapolationFactor),
+            cmp.Or(C.float(opts.YaRN.AttentionFactor), 1),
+            cmp.Or(C.float(opts.YaRN.BetaFast), 32),
+            cmp.Or(C.float(opts.YaRN.BetaSlow), 1),
+        )
+    } else {
+        tt = C.ggml_rope_ext(
+            ctx.(*Context).ctx,
+            dequant,
+            positions.(*Tensor).t,
+            opts.Factors.(*Tensor).t,
+            C.int(ropeDim), C.int(opts.Type),
+            cmp.Or(C.int(opts.YaRN.OriginalContextLength), 128<<10),
+            C.float(ropeBase), C.float(ropeScale),
+            C.float(opts.YaRN.ExtrapolationFactor),
+            cmp.Or(C.float(opts.YaRN.AttentionFactor), 1),
+            cmp.Or(C.float(opts.YaRN.BetaFast), 32),
+            cmp.Or(C.float(opts.YaRN.BetaSlow), 1),
+        )
     }
+    return &Tensor{b: t.b, t: tt}
 }

 func (t *Tensor) IM2Col(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
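
On the Go side, the binding no longer bakes defaults into the rope.Options literal; it passes zero values through and applies fallbacks at the call site with cmp.Or from the standard library, which returns its first non-zero argument. A small standalone illustration of that defaulting pattern (the fallback values mirror the ones in the diff; the rest of the snippet is hypothetical, not ollama code):

package main

import (
    "cmp"
    "fmt"
)

// yarnOptions mimics the shape of the new nested YaRN options: zero values
// mean "use the default", and the defaults are applied where they are used.
type yarnOptions struct {
    OriginalContextLength int
    AttentionFactor       float32
    BetaFast              float32
    BetaSlow              float32
}

func main() {
    var opts yarnOptions // the caller set nothing, so everything is zero

    // cmp.Or returns the first of its arguments that is not the zero value,
    // so unset fields fall back to the same defaults the binding passes to ggml.
    fmt.Println(cmp.Or(opts.OriginalContextLength, 128<<10)) // 131072
    fmt.Println(cmp.Or(opts.AttentionFactor, 1))             // 1
    fmt.Println(cmp.Or(opts.BetaFast, 32))                   // 32
    fmt.Println(cmp.Or(opts.BetaSlow, 1))                    // 1

    opts.BetaFast = 16 // an explicitly set value wins
    fmt.Println(cmp.Or(opts.BetaFast, 32)) // 16
}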
ml/backend/ggml/ggml/src/ggml-cpu/ops.cpp (vendored, 7 changes)

@@ -5509,15 +5509,12 @@ static void ggml_mrope_cache_init(
         }

         float theta = theta_t;
-        if (sector >= sections[0] && sector < sec_w) {
+        if (sector % 3 == 1 && sector < 1 + 3 * sections[1]) {
             theta = theta_h;
         }
-        else if (sector >= sec_w && sector < sec_w + sections[2]) {
+        else if (sector % 3 == 2 && sector < 2 + 3 * sections[2]) {
             theta = theta_w;
         }
-        else if (sector >= sec_w + sections[2]) {
-            theta = theta_e;
-        }

         rope_yarn(
             theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
ml/backend/ggml/ggml/src/ggml-cuda/rope.cu (vendored, 12 changes)

@@ -151,19 +151,13 @@ static __global__ void rope_multi(
     const int sec_w = sections.v[1] + sections.v[0];
     const int sector = (i0 / 2) % sect_dims;

-    float theta_base = 0.0;
-    if (sector < sections.v[0]) {
-        theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
-    }
-    else if (sector >= sections.v[0] && sector < sec_w) {
+    float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
+    if (sector % 3 == 1 && sector < 1 + 3 * sections.v[1]) {
         theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f);
     }
-    else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
+    else if (sector % 3 == 2 && sector < 2 + 3 * sections.v[2]) {
         theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f);
     }
-    else if (sector >= sec_w + sections.v[2]) {
-        theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f);
-    }

     const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
@@ -6523,15 +6523,11 @@ kernel void kernel_rope_multi(
     const int sec_w012 = args.sect_0 + args.sect_1 + args.sect_2; // end of section 2
     const int sector = ic % sect_dims;

-    float theta_base;
-    if (sector < args.sect_0) {
-        theta_base = (float) pos[i2];
-    } else if (sector < sec_w01) {
+    float theta_base = (float) pos[i2];
+    if (sector % 3 == 1 && sector < 1 + 3 * args.sect_1) {
         theta_base = (float) pos[i2 + args.ne02];
-    } else if (sector < sec_w012) {
+    } else if (sector % 3 == 2 && sector < 2 + 3 * args.sect_2) {
         theta_base = (float) pos[i2 + args.ne02 * 2];
-    } else {
-        theta_base = (float) pos[i2 + args.ne02 * 3];
     }
     // end of mrope
@@ -3858,15 +3858,11 @@ kernel void kernel_rope_multi(
     const int sec_w012 = args.sect_0 + args.sect_1 + args.sect_2; // end of section 2
     const int sector = ic % sect_dims;

-    float theta_base;
-    if (sector < args.sect_0) {
-        theta_base = (float) pos[i2];
-    } else if (sector < sec_w01) {
+    float theta_base = (float) pos[i2];
+    if (sector % 3 == 1 && sector < 1 + 3 * args.sect_1) {
         theta_base = (float) pos[i2 + args.ne02];
-    } else if (sector < sec_w012) {
+    } else if (sector % 3 == 2 && sector < 2 + 3 * args.sect_2) {
         theta_base = (float) pos[i2 + args.ne02 * 2];
-    } else {
-        theta_base = (float) pos[i2 + args.ne02 * 3];
     }
     // end of mrope
@@ -31,19 +31,13 @@ void main() {
     const int sec_w = p.sections[1] + p.sections[0];
     const uint sector = (i0 / 2) % sect_dims;

-    float theta_base = 0.0;
-    if (sector < p.sections[0]) {
-        theta_base = data_pos[channel_x]*pow(p.theta_scale, i0/2.0f);
-    }
-    else if (sector >= p.sections[0] && sector < sec_w) {
+    float theta_base = data_pos[channel_x]*pow(p.theta_scale, i0/2.0f);
+    if (sector % 3 == 1 && sector < 1 + 3 * p.sections[1]) {
         theta_base = data_pos[channel_x + ne2 * 1]*pow(p.theta_scale, i0/2.0f);
     }
-    else if (sector >= sec_w && sector < sec_w + p.sections[2]) {
+    else if (sector % 3 == 2 && sector < 2 + 3 * p.sections[2]) {
         theta_base = data_pos[channel_x + ne2 * 2]*pow(p.theta_scale, i0/2.0f);
     }
-    else if (sector >= sec_w + p.sections[2]) {
-        theta_base = data_pos[channel_x + ne2 * 3]*pow(p.theta_scale, i0/2.0f);
-    }

     const float freq_factor = p.has_ff != 0 ? data_ff[i0/2] : 1.0f;
@@ -4,21 +4,21 @@ import "github.com/ollama/ollama/ml"

 // Options contains optional parameters for RoPE function
 type Options struct {
     Type    int
     Factors ml.Tensor
-    OriginalContextLength int

     // YaRN options
-    ExtrapolationFactor,
-    AttentionFactor,
-    BetaFast,
-    BetaSlow float32
-}
+    YaRN struct {
+        OriginalContextLength int
+        ExtrapolationFactor,
+        AttentionFactor,
+        BetaFast,
+        BetaSlow float32
+    }

-// WithOriginalContextLength sets a custom context length
-func WithOriginalContextLength(n int) func(*Options) {
-    return func(opts *Options) {
-        opts.OriginalContextLength = n
+    // MRoPE options
+    MRoPE struct {
+        Sections []int
     }
 }

@@ -38,14 +38,28 @@ func WithFactors(factors ml.Tensor) func(*Options) {
     }
 }

+// WithOriginalContextLength sets a custom context length
+func WithOriginalContextLength(n int) func(*Options) {
+    return func(opts *Options) {
+        opts.YaRN.OriginalContextLength = n
+    }
+}
+
 func WithExtrapolationFactor(extrapolationFactor float32) func(*Options) {
     return func(opts *Options) {
-        opts.ExtrapolationFactor = extrapolationFactor
+        opts.YaRN.ExtrapolationFactor = extrapolationFactor
     }
 }

 func WithAttentionFactor(attentionFactor float32) func(*Options) {
     return func(opts *Options) {
-        opts.AttentionFactor = attentionFactor
+        opts.YaRN.AttentionFactor = attentionFactor
+    }
+}
+
+func WithMRoPESections(sections []int) func(*Options) {
+    return func(opts *Options) {
+        opts.Type |= 1 << 3
+        opts.MRoPE.Sections = sections
     }
 }
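
rope.Options now groups the YaRN tuning knobs and the mrope section sizes into nested structs, and WithMRoPESections both records the sections and flips a bit in Type to select the mrope path. The standalone sketch below mirrors that shape with a toy options type (the 1 << 3 shift is copied from the diff; the section sizes and identifiers are illustrative, not the library's API):

package main

import "fmt"

// options mirrors the shape of the reworked rope.Options: YaRN knobs and
// MRoPE sections live in nested structs, and the mrope variant is flagged
// by a bit in Type.
type options struct {
    Type int

    YaRN struct {
        OriginalContextLength int
    }
    MRoPE struct {
        Sections []int
    }
}

func withOriginalContextLength(n int) func(*options) {
    return func(o *options) { o.YaRN.OriginalContextLength = n }
}

func withMRoPESections(sections []int) func(*options) {
    return func(o *options) {
        o.Type |= 1 << 3 // mark the rope type as mrope
        o.MRoPE.Sections = sections
    }
}

func main() {
    var o options
    // The same application loop the binding uses: each option mutates the struct.
    for _, opt := range []func(*options){
        withOriginalContextLength(32768),
        withMRoPESections([]int{24, 20, 20, 0}), // hypothetical section sizes
    } {
        opt(&o)
    }
    fmt.Printf("%+v\n", o)
}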
@@ -112,7 +112,8 @@ func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
 }

 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-    positionSlice := slices.Collect(makeSlice2D[int32](3, len(batch.Positions)))
+    // ggml mrope requires 4 positions per token: [time, height, width, extra]
+    positionSlice := slices.Collect(makeSlice2D[int32](4, len(batch.Positions)))
     for i, id := range batch.Positions {
         if id < int32(len(m.positionCache)) {
             id = m.positionCache[id]
@@ -123,6 +124,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
         positionSlice[0][i] = id
         positionSlice[1][i] = id
         positionSlice[2][i] = id
+        // positionSlice[3] is intentionally left as zeros
     }

     hiddenStates := m.TextModel.TokenEmbedding.Forward(ctx, batch.Inputs).Duplicate(ctx)
@@ -147,8 +149,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
         }
     }

-    positions := ctx.Input().FromInts(slices.Concat(positionSlice...), len(positionSlice[0]), len(positionSlice))
-    cos, sin := m.rotaryEmbedding(ctx, positions)
+    positions := ctx.Input().FromInts(slices.Concat(positionSlice...), len(positionSlice[0])*len(positionSlice))
     for i, layer := range m.TextModel.Layers {
         if m.Cache != nil {
             m.Cache.SetLayer(i)
@@ -159,7 +160,7 @@ func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
             outputs = batch.Outputs
         }

-        hiddenStates = layer.Forward(ctx, hiddenStates, cos, sin, outputs, m.Cache, m.Options)
+        hiddenStates = layer.Forward(ctx, hiddenStates, positions, outputs, m.Cache, m.Options)
         if i < len(deepstackVisualEmbeds) {
             hiddenStates = hiddenStates.Add(ctx, deepstackVisualEmbeds[i])
         }
@@ -191,9 +192,10 @@ func New(c fs.Config) (model.Model, error) {
         ImageProcessor: newImageProcessor(c),
     }

-    m.Cache = kvcache.NewCausalCache(func(ctx ml.Context, layer int, key, position ml.Tensor) (ml.Tensor, error) {
+    m.Cache = kvcache.NewCausalCache(func(ctx ml.Context, layer int, key, positions ml.Tensor) (ml.Tensor, error) {
         m.positionCache = nil
-        return nil, kvcache.ErrNotSupported
+        positions = positions.Repeat(ctx, 1, 4).Reshape(ctx, -1)
+        return m.Options.applyRotaryPositionalEmbedding(ctx, key, positions), nil
     })
     return &m, nil
 }
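
Forward now allocates four position rows per token rather than three, matching what ggml's mrope expects, and flattens them into a single buffer. The standalone sketch below reproduces that layout for plain text tokens (it ignores the positionCache lookup that the real code applies first):

package main

import (
    "fmt"
    "slices"
)

// buildPositions mimics what Forward now does for text-only tokens: every
// token gets four position entries, [time, height, width, extra], with the
// first three set to the same value and the fourth left at zero.
func buildPositions(ids []int32) []int32 {
    positionSlice := make([][]int32, 4)
    for row := range positionSlice {
        positionSlice[row] = make([]int32, len(ids))
    }
    for i, id := range ids {
        positionSlice[0][i] = id // time
        positionSlice[1][i] = id // height
        positionSlice[2][i] = id // width
        // positionSlice[3] stays zero
    }
    // The rows are concatenated into a single flat buffer of length 4*len(ids),
    // matching the one-dimensional positions tensor the model now builds.
    return slices.Concat(positionSlice...)
}

func main() {
    fmt.Println(buildPositions([]int32{0, 1, 2})) // [0 1 2 0 1 2 0 1 2 0 0 0]
}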
@@ -10,6 +10,8 @@ import (
     "github.com/ollama/ollama/kvcache"
     "github.com/ollama/ollama/ml"
     "github.com/ollama/ollama/ml/nn"
+    "github.com/ollama/ollama/ml/nn/fast"
+    "github.com/ollama/ollama/ml/nn/rope"
     "github.com/ollama/ollama/model"
 )
@@ -27,14 +29,18 @@ type TextOptions struct {

     numExperts, numExpertsUsed int
     normTopKProb bool
-
-    inverseFrequenciesCache []float32
 }

 func (o TextOptions) headDim() int {
     return cmp.Or(o.keyLength, o.valueLength, o.hiddenSize/o.numHeads)
 }
+
+func (o TextOptions) applyRotaryPositionalEmbedding(ctx ml.Context, t, p ml.Tensor) ml.Tensor {
+    return fast.RoPE(ctx, t, p, o.headDim(), o.ropeBase, 1/float32(math.Sqrt(float64(o.ropeScale))),
+        rope.WithMRoPESections(o.mropeSections),
+    )
+}

 type TextAttention struct {
     Query     *nn.Linear  `gguf:"attn_q"`
     QueryNorm *nn.RMSNorm `gguf:"attn_q_norm"`
@@ -44,7 +50,7 @@ type TextAttention struct {
     Output    *nn.Linear  `gguf:"attn_output"`
 }

-func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, cos, sin ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
+func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
     batchSize := hiddenStates.Dim(1)

     query := sa.Query.Forward(ctx, hiddenStates)
@@ -58,8 +64,8 @@ func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, cos, sin ml.Tenso
     query = sa.QueryNorm.Forward(ctx, query, opts.eps)
     key = sa.KeyNorm.Forward(ctx, key, opts.eps)

-    query = applyRotaryPositionalEmbedding(ctx, query, cos, sin)
-    key = applyRotaryPositionalEmbedding(ctx, key, cos, sin)
+    query = opts.applyRotaryPositionalEmbedding(ctx, query, positions)
+    key = opts.applyRotaryPositionalEmbedding(ctx, key, positions)

     attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(opts.headDim())), cache)
     attention = attention.Reshape(ctx, attention.Dim(0)*attention.Dim(1), batchSize)
@@ -125,10 +131,10 @@ type TextLayer struct {
     TextMLP
 }

-func (d *TextLayer) Forward(ctx ml.Context, hiddenStates, cos, sin, outputs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
+func (d *TextLayer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
     residual := hiddenStates
     hiddenStates = d.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
-    hiddenStates = d.TextAttention.Forward(ctx, hiddenStates, cos, sin, cache, opts)
+    hiddenStates = d.TextAttention.Forward(ctx, hiddenStates, positions, cache, opts)

     if outputs != nil {
         hiddenStates = hiddenStates.Rows(ctx, outputs)
@@ -153,42 +159,6 @@ type TextModel struct {
     Options *TextOptions
 }

-func (m *TextModel) rotaryEmbedding(ctx ml.Context, positions ml.Tensor) (_, _ ml.Tensor) {
-    positions = positions.Reshape(ctx, 1, positions.Dim(0), positions.Dim(1))
-    if len(m.Options.inverseFrequenciesCache) == 0 {
-        m.Options.inverseFrequenciesCache = make([]float32, m.Options.headDim()/2)
-        for i := range m.Options.inverseFrequenciesCache {
-            frequency := float32(math.Pow(float64(m.Options.ropeBase), float64(i*2)/float64(m.Options.headDim())))
-            m.Options.inverseFrequenciesCache[i] = 1 / frequency
-        }
-    }
-
-    inverseFrequencies := ctx.Input().FromFloats(m.Options.inverseFrequenciesCache, 1, len(m.Options.inverseFrequenciesCache))
-
-    positions = positions.Cast(ctx, ml.DTypeF32)
-    frequencies := inverseFrequencies.Mulmat(ctx, positions)
-
-    interleaved := frequencies.View(ctx,
-        0, frequencies.Dim(0),
-        frequencies.Stride(1), frequencies.Dim(1),
-    )
-
-    for _, i := range []int{1, 2} {
-        args := []int{
-            i * frequencies.Stride(0), 1,
-            3 * frequencies.Stride(0), m.Options.mropeSections[i],
-            frequencies.Stride(1), frequencies.Dim(1),
-        }
-
-        ctx.Forward(frequencies.View(ctx, i*frequencies.Stride(2)+args[0], args[1:]...).
-            Copy(ctx, interleaved.View(ctx, args[0], args[1:]...)))
-    }
-
-    interleaved = interleaved.Concat(ctx, interleaved, 0)
-    interleaved = interleaved.Reshape(ctx, interleaved.Dim(0), 1, interleaved.Dim(1), interleaved.Dim(2))
-    return interleaved.Cos(ctx), interleaved.Sin(ctx)
-}
-
 var _ model.Model = (*Model)(nil)

 func newTextModel(c fs.Config) *TextModel {
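
The deleted rotaryEmbedding helper precomputed cos/sin tables on the Go side from inverse frequencies 1/ropeBase^(2i/headDim); after this change the raw positions go straight to fast.RoPE and the backend kernels derive the angles. For reference, a standalone sketch of the frequency and angle computation that helper performed (head size, base, and position are hypothetical values chosen only to keep the output small):

package main

import (
    "fmt"
    "math"
)

// inverseFrequencies reproduces the table the removed rotaryEmbedding helper
// cached: invFreq[i] = 1 / base^(2i/headDim), one entry per rotary pair.
func inverseFrequencies(headDim int, base float64) []float64 {
    invFreq := make([]float64, headDim/2)
    for i := range invFreq {
        invFreq[i] = 1 / math.Pow(base, float64(2*i)/float64(headDim))
    }
    return invFreq
}

func main() {
    const headDim, base = 8, 10000.0
    invFreq := inverseFrequencies(headDim, base)

    // The rotation angle for a given position is pos * invFreq[i]; the backend
    // kernels now compute the cos/sin of these angles instead of the Go code.
    pos := 3.0
    for i, f := range invFreq {
        fmt.Printf("pair %d: theta = %.6f\n", i, pos*f)
    }
}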