Implement narrow from a regular tensor to jagged tensor (#112770)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/112770 Approved by: https://github.com/cpuhrsch
This commit is contained in:
parent 3700894099
commit 1aece432ba
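For orientation before the diffs, here is a minimal sketch of the API this change adds, assuming a PyTorch build that contains it (shapes and values below are illustrative, not taken from the PR):

import torch

# A padded (B, S_max, D) buffer where each row only uses a window of S_max.
base = torch.randn(3, 8, 4)
starts = torch.tensor([0, 2, 1], dtype=torch.int64)
lengths = torch.tensor([5, 3, 6], dtype=torch.int64)

# Jagged-layout narrow builds a nested tensor that views `base` instead of copying it.
nt = torch.nested.narrow(base, 1, starts, lengths, layout=torch.jagged)

print(nt.is_contiguous())  # False: ragged windows into a padded buffer
print(nt.offsets())        # per-row offsets into the flattened values buffer
print(nt.lengths())        # per-row lengths (None when the jagged NT is contiguous)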
@@ -14,7 +14,11 @@ from torch._dynamo.testing import normalize_gm
 from torch._higher_order_ops.wrap import wrap
 from torch.fx.experimental.symbolic_shapes import DimDynamic, ShapeEnv
-from torch.nested._internal.nested_tensor import jagged_from_list, ViewBufferFromNested
+from torch.nested._internal.nested_tensor import (
+    jagged_from_list,
+    jagged_from_tensor_and_lengths,
+    ViewBufferFromNested,
+)
 from torch.testing._internal.inductor_utils import HAS_CUDA

@@ -799,6 +803,19 @@ class TestNestedTensor(torch._dynamo.test_case.TestCase):
         )
         return jagged_from_list(out, offsets)

+    def _get_nc_jagged_tensor(self, inner_dim, starts, lengths, requires_grad=True):
+        # Makes a jagged tensor with N constituent tensors with size
+        # as specified ((S0, S1, S2), D)
+        max_dim = (starts + lengths).max()
+        values_tensor = torch.randn(
+            starts.shape[0],
+            max_dim.item(),
+            inner_dim,
+            requires_grad=requires_grad,
+            dtype=torch.float64,
+        )
+        return jagged_from_tensor_and_lengths(values_tensor, starts, lengths)
+
     def _check_recompiles(self, fn, inputs1, inputs2, recompiles):
         compile_count = [0]
@@ -3325,6 +3325,66 @@ class TestNestedTensorSubclass(NestedTestCase):
         nt, _ = jagged_from_list(test_tensor_list, None)
         _ = torch.nn.functional.layer_norm(nt, (nt.shape[-2], nt.shape[-1]))

+    def test_narrow(self, device):
+        starts = torch.tensor([0, 1, 2, 3, 4], device=device, dtype=torch.int64)
+        lengths = torch.tensor([3, 2, 2, 1, 5], device=device, dtype=torch.int64)
+        nt = torch.nested.narrow(
+            torch.arange(0, 10, device=device, dtype=torch.int64).unsqueeze(0).expand(5, -1).clone().detach(),
+            1,
+            starts,
+            lengths,
+            layout=torch.jagged
+        )
+
+        # TODO: Use this approach when unbind is functional
+        # unbinded_nt = nt.unbind()
+        # for i in range(starts.shape[0]):
+        #     self.assertEqual(torch.arange(starts[i], starts[i] + lengths[i], device=device, dtype=torch.int64), unbinded_nt[i])
+        for i in range(starts.shape[0]):
+            self.assertEqual(
+                torch.arange(starts[i], starts[i] + lengths[i], device=device, dtype=torch.int64),
+                nt.values()[nt.offsets()[i]:(nt.offsets()[i] + nt.lengths()[i])]
+            )
+
+    def test_is_contiguous(self, device):
+        a = torch.randn(2, 3, requires_grad=True, dtype=torch.float64, device=device)
+        b = torch.randn(3, 3, requires_grad=True, dtype=torch.float64, device=device)
+        c = torch.randn(4, 3, requires_grad=True, dtype=torch.float64, device=device)
+        nt_contiguous, _ = jagged_from_list([a, b, c], None)
+
+        starts_nc = torch.tensor([0, 1, 2, 3, 4], device=device, dtype=torch.int64)
+        lengths_nc = torch.tensor([3, 2, 2, 1, 5], device=device, dtype=torch.int64)
+        narrow_base = torch.arange(0, 10, device=device, dtype=torch.int64).unsqueeze(0).expand(5, -1).clone()
+        nt_noncontiguous = torch.nested.narrow(
+            narrow_base,
+            1,
+            starts_nc,
+            lengths_nc,
+            layout=torch.jagged
+        )
+
+        starts_c = torch.tensor([1, 0, 0, 0, 0], device=device, dtype=torch.int64)
+        lengths_c = torch.tensor([9, 10, 10, 10, 8], device=device, dtype=torch.int64)
+        nt_contiguous_narrow = torch.nested.narrow(
+            narrow_base,
+            1,
+            starts_c,
+            lengths_c,
+            layout=torch.jagged
+        )
+
+        # Test contiguous case
+        assert nt_contiguous.is_contiguous()
+
+        # Test narrow case
+        assert not nt_noncontiguous.is_contiguous()
+        assert nt_contiguous_narrow.is_contiguous()
+
+        # Test querying by memory_format
+        self.assertTrue(nt_contiguous.is_contiguous(memory_format=torch.contiguous_format))
+        self.assertTrue(not nt_noncontiguous.is_contiguous(memory_format=torch.contiguous_format))
+        self.assertTrue(nt_contiguous_narrow.is_contiguous(memory_format=torch.contiguous_format))
+

 instantiate_parametrized_tests(TestNestedTensor)
 instantiate_device_type_tests(TestNestedTensorDeviceType, globals())
@@ -309,7 +309,7 @@ class MetaConverter:
                 from torch._dynamo.source import AttrSource
                 from torch.fx.experimental.symbolic_shapes import DimDynamic

-                if shape_env and not t.is_nested:
+                if shape_env and not t.is_nested and not t._base.is_nested:
                     base_dynamic_dims = [DimDynamic.STATIC] * t._base.dim()
                 else:
                     base_dynamic_dims = None
@@ -1,7 +1,7 @@
-from typing import List, Optional
+from typing import List, Optional, Union

 import torch
-from torch import Tensor
+from torch import SymInt, Tensor
 from torch._C import _add_docstr, _nested  # type: ignore[attr-defined]

 from torch.types import _device as Device, _dtype as DType

@@ -10,6 +10,7 @@ __all__ = [
     "to_padded_tensor",
     "as_nested_tensor",
    "nested_tensor",
+    "narrow",
 ]

 # Nested Tensor constructor functions
@@ -187,3 +188,69 @@ Example::
         return nt
     else:
         raise RuntimeError(f"Specified layout is unsupported for nested tensors: {layout}")
+
+
+def narrow(tensor: Tensor, dim: int, start: Union[int, Tensor], length: Union[int, Tensor], layout=torch.strided) -> Tensor:
+    r"""
+    Constructs a nested tensor (which might be a view) from :attr:`tensor`, a strided tensor. This follows
+    similar semantics to torch.Tensor.narrow, where in the :attr:`dim`-th dimension the new nested tensor
+    shows only the elements in the interval `[start, start+length)`. As nested representations
+    allow for a different `start` and `length` at each 'row' of that dimension, :attr:`start` and :attr:`length`
+    can also be tensors of shape `tensor.shape[0]`.
+
+    There are some differences depending on the layout you use for the nested tensor. With the strided
+    layout, torch.narrow copies the narrowed data into a contiguous NT with strided layout, while with the
+    jagged layout, narrow() creates a non-contiguous view of your original strided tensor. This particular
+    representation is really useful for representing kv-caches in Transformer models, as specialized
+    SDPA kernels can deal with this format easily, resulting in performance improvements.
+
+    Args:
+        tensor (:class:`torch.Tensor`): a strided tensor, which will be used as the underlying data
+            for the nested tensor if using the jagged layout or will be copied for the strided layout.
+        dim (int): the dimension where narrow will be applied. Only `dim=1` is supported for the
+            jagged layout, while the strided layout supports all dims.
+        start (Union[int, :class:`torch.Tensor`]): starting element for the narrow operation
+        length (Union[int, :class:`torch.Tensor`]): number of elements taken during the narrow op
+
+    Keyword arguments:
+        layout (:class:`torch.layout`, optional): the desired layout of the returned nested tensor.
+            Only strided and jagged layouts are supported. Default: if None, the strided layout.
+
+    Example::
+
+        >>> starts = torch.tensor([0, 1, 2, 3, 4], dtype=torch.int64)
+        >>> lengths = torch.tensor([3, 2, 2, 1, 5], dtype=torch.int64)
+        >>> narrow_base = torch.randn(5, 10, 20)
+        >>> nt_narrowed = torch.nested.narrow(narrow_base, 1, starts, lengths, layout=torch.jagged)
+        >>> nt_narrowed.is_contiguous()
+        False
+    """
+    if not isinstance(start, (int, SymInt, Tensor)):
+        raise RuntimeError("start must be an integer or a tensor")
+
+    if not isinstance(length, (int, SymInt, Tensor)):
+        raise RuntimeError("length must be an integer or a tensor")
+
+    if layout == torch.strided:
+        if isinstance(start, Tensor) or isinstance(length, Tensor):
+            raise RuntimeError("start and length must be integers for the strided layout NT impl")
+        # TODO: switch to as_nested_tensor(tensor) when it is available
+        nt = as_nested_tensor(torch.unbind(tensor), layout=torch.strided).narrow(dim, start, length)
+    elif layout == torch.jagged:
+        if dim != 1:
+            raise RuntimeError("jagged layout only supports dim=1")
+
+        from torch.nested._internal.nested_tensor import jagged_from_tensor_and_lengths
+
+        if isinstance(start, (int, SymInt)):
+            start = torch.tensor([start], device=tensor.device, dtype=torch.int64)
+
+        if isinstance(length, (int, SymInt)):
+            length = torch.tensor([length], device=tensor.device, dtype=torch.int64)
+
+        nt, _, _ = jagged_from_tensor_and_lengths(tensor, start, length)
+    else:
+        raise RuntimeError(f"Specified layout is unsupported for nested narrow: {layout}")
+
+    return nt
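To make the layout note in the docstring concrete, a small hedged sketch (illustrative tensors; assumes a build with this function): the jagged path returns a view that shares storage with the input, and tensor-valued start/length with the strided layout is rejected by the check shown above.

import torch

# e.g. a kv-cache: a padded (B, S_max, D) buffer with a different valid length per sequence.
kv_cache = torch.randn(4, 16, 8)
starts = torch.zeros(4, dtype=torch.int64)
seq_lens = torch.tensor([3, 16, 7, 12], dtype=torch.int64)

nt = torch.nested.narrow(kv_cache, 1, starts, seq_lens, layout=torch.jagged)

# The jagged result views the original buffer, so no data is copied.
assert nt.values().data_ptr() == kv_cache.data_ptr()
print(nt.is_contiguous())  # False

# Tensor-valued start/length are only allowed for the jagged layout.
try:
    torch.nested.narrow(kv_cache, 1, starts, seq_lens, layout=torch.strided)
except RuntimeError as e:
    print(e)  # "start and length must be integers for the strided layout NT impl"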
@@ -2,6 +2,7 @@ from typing import Tuple

 import torch
 from torch._C import DispatchKey, DispatchKeySet
+from torch._prims_common import is_expandable_to
 from torch.fx.experimental.symbolic_shapes import has_free_symbols
 from torch.utils.weak import WeakTensorKeyDictionary
 from typing import *  # noqa: F403

@@ -21,6 +22,7 @@ def get_tensor_id(tensor, *, coeff=1):
 class NestedTensor(torch.Tensor):
     _values: torch.Tensor  # type: ignore[assignment]
     _offsets: torch.Tensor
+    _lengths: Optional[torch.Tensor]
     # NOTE [ Singleton ints for ragged sizes and strides ]
     #
     # Jagged layout tensors are tensors that represent a n-dim tensor with a

@@ -46,6 +48,7 @@ class NestedTensor(torch.Tensor):
         values,
         offsets,
         *,
+        lengths=None,
         ragged_size=None,
         **kwargs,
     ):

@@ -69,7 +72,7 @@ class NestedTensor(torch.Tensor):
         )
         return r

-    def __init__(self, values, offsets, *, ragged_size=None, **kwargs):
+    def __init__(self, values, offsets, *, lengths=None, ragged_size=None, **kwargs):
         super().__init__()
         # Only support jagged for now.
         assert offsets is not None

@@ -82,7 +85,10 @@ class NestedTensor(torch.Tensor):
             # we perform operations on fake nested tensors.
             # Calling get_tensor_id won't work in those cases because we want
             # the existing symbolic ragged_size to be propagated.
-            ragged_size = get_tensor_id(offsets, coeff=1)
+            if lengths is None:
+                ragged_size = get_tensor_id(offsets, coeff=1)
+            else:
+                ragged_size = get_tensor_id(lengths, coeff=1)
         B = offsets.shape[0] - 1
         Ds = values.shape[1:]
         self._size = (B, ragged_size, *Ds)

@@ -97,6 +103,7 @@ class NestedTensor(torch.Tensor):
         )
         self._values = values
         self._offsets = offsets
+        self._lengths = lengths

     def values(self):
         return self._values

@@ -104,6 +111,9 @@ class NestedTensor(torch.Tensor):
     def offsets(self):
         return self._offsets

+    def lengths(self):
+        return self._lengths
+
     def __repr__(self):
         # We should implement this in torch/_tensor_str.py instead
         grad_fn_str = (

@@ -111,7 +121,7 @@ class NestedTensor(torch.Tensor):
         )
         if self.grad_fn:
             grad_fn_str = f", grad_fn={self.grad_fn}"
-        return f"NestedTensor(size={self._size}, offsets={self._offsets}{grad_fn_str})"
+        return f"NestedTensor(size={self._size}, offsets={self._offsets}{grad_fn_str}, contiguous={self._lengths is None})"

     def __reduce_ex__(self, proto):
         state = torch._utils._get_obj_state(self)

@@ -131,13 +141,20 @@ class NestedTensor(torch.Tensor):
             "requires_grad": self.requires_grad,
             "ragged_size": self._size[self._ragged_idx],
         }
-        return ["_values", "_offsets"], ctx
+        inner_tensors = ["_values", "_offsets"]
+        if self._lengths is not None:
+            inner_tensors.append("_lengths")
+        return inner_tensors, ctx

     @staticmethod
     def __tensor_unflatten__(inner_tensors: Dict, meta):
-        assert len(inner_tensors) == 2
+        assert len(inner_tensors) >= 2 and len(inner_tensors) <= 3
         values = inner_tensors["_values"]
         offsets = inner_tensors["_offsets"]
+        if "_lengths" in inner_tensors and inner_tensors["_lengths"] is not None:
+            lengths = inner_tensors["_lengths"]
+        else:
+            lengths = None

         # NOTE [ Storing symbolic values as plain attributes on subclasses ]
         #

@@ -173,6 +190,7 @@ class NestedTensor(torch.Tensor):
         return NestedTensor(
             values,
             offsets=offsets,
+            lengths=lengths,
             ragged_size=meta["ragged_size"],
             requires_grad=meta["requires_grad"],
         )

@@ -232,6 +250,18 @@ class ViewNestedFromBuffer(torch.autograd.Function):
         return gO.values(), None, None


+# Not actually a view!
+# NOTE: @jbschlosser is working on making it a view
+class ViewNonContiguousNestedFromBuffer(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, values: torch.Tensor, offsets: torch.Tensor, lengths: torch.Tensor):  # type: ignore[override]
+        return NestedTensor(values.detach(), offsets=offsets, lengths=lengths)
+
+    @staticmethod
+    def backward(ctx, gO: NestedTensor):  # type: ignore[override]
+        return gO.values(), None, None
+
+
 # Need to make it obvious that users should be passing in offsets
 def jagged_from_list(
     tensors: List[torch.Tensor],
@@ -285,5 +315,65 @@ def jagged_from_list(
     return ViewNestedFromBuffer.apply(values, offsets), offsets  # type: ignore[call-overload]


+def jagged_from_tensor_and_lengths(
+    tensor: torch.Tensor, starts: torch.Tensor, lengths: torch.Tensor
+) -> Tuple[NestedTensor, torch.Tensor, Optional[torch.Tensor]]:
+    """Constructs a NestedTensor backed by jagged layout from a tensor, starts of sequences, and sequence lengths"""
+    batch_size = tensor.shape[0]
+    if is_expandable_to(starts.shape, (batch_size,)) and is_expandable_to(
+        lengths.shape, (batch_size,)
+    ):
+        start_list = starts.expand(batch_size)
+        length_list = lengths.expand(batch_size)
+    else:
+        raise RuntimeError(
+            "When constructing a jagged nested tensor using narrow(), "
+            "your start and length must be Tensors that broadcast to input.shape[0]"
+        )
+
+    # Calculate jagged offsets
+    assert (
+        len(tensor.shape) >= 2
+    ), "tensor must at least be 2D for the nested narrow op to work"
+    max_seq_len = tensor.shape[1]
+    offset_lengths = max_seq_len * torch.arange(
+        0, batch_size, dtype=torch.int64, device=tensor.device
+    )
+    # Jagged layout specifies that offsets are stored as int64 on the same device as values.
+    offsets = torch.cat(
+        [
+            start_list + offset_lengths,
+            (start_list[-1] + offset_lengths[-1] + length_list[-1]).unsqueeze(0),
+        ]
+    )
+
+    # Reshape buffer to flatten the 1st and 2nd dimension (view used to enforce non-copy)
+    if len(tensor.shape) > 2:
+        values = tensor.view(-1, *tensor.shape[2:])
+    else:
+        values = tensor.view(-1)
+
+    # Check if offsets and lengths make it possibly contiguous and return a regular NT
+    is_contiguous = True
+    orig_dim = tensor.shape[1]
+    if torch.any(length_list[1:-1].ne(orig_dim)):
+        is_contiguous = False
+    if torch.any(offsets[1:-2].diff().ne(orig_dim)):
+        is_contiguous = False
+    if offsets[0] + length_list[0] != orig_dim:
+        is_contiguous = False
+
+    if is_contiguous:
+        return (
+            ViewNestedFromBuffer.apply(
+                values[offsets[0] : offsets[-1]], offsets - offsets[0]
+            ),
+            offsets,
+            None,
+        )
+
+    return ViewNonContiguousNestedFromBuffer.apply(values, offsets, length_list), offsets, length_list  # type: ignore[call-overload]
+
+
 def buffer_from_jagged(jagged):
     return ViewBufferFromNested.apply(jagged)
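As a worked example of the offset arithmetic in jagged_from_tensor_and_lengths (illustrative numbers only): row i of the padded input starts at i * max_seq_len in the flattened values buffer, so the per-row offsets are starts + max_seq_len * arange(B), with one final boundary appended.

import torch

batch_size, max_seq_len = 3, 10
starts = torch.tensor([0, 1, 2], dtype=torch.int64)
lengths = torch.tensor([3, 2, 2], dtype=torch.int64)

offset_lengths = max_seq_len * torch.arange(0, batch_size, dtype=torch.int64)  # [0, 10, 20]
offsets = torch.cat(
    [
        starts + offset_lengths,                                       # [0, 11, 22]
        (starts[-1] + offset_lengths[-1] + lengths[-1]).unsqueeze(0),  # [24]
    ]
)
print(offsets)  # tensor([ 0, 11, 22, 24])

# Row i of the jagged tensor is values[offsets[i] : offsets[i] + lengths[i]],
# where values is the (batch_size * max_seq_len, D) flattening of the padded input.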
@@ -1,4 +1,5 @@
 import functools
+import math

 import torch

@@ -65,7 +66,11 @@ def check_schema(schema_str: str, func, *args, **kwargs) -> None:

     arg_type_check_fns = {
         "t": lambda x: isinstance(x, torch.Tensor) and not isinstance(x, NestedTensor),
-        "jt": lambda x: isinstance(x, NestedTensor),
+        "jt": lambda x: isinstance(x, NestedTensor)
+        and x._lengths is None,  # ops with "jt" require contiguous JT only
+        "jt_all": lambda x: isinstance(
+            x, NestedTensor
+        ),  # ops with "jt_all" can accept all kinds of JT
         "any": lambda x: True,
     }
     for i, named_arg_type in enumerate(named_arg_types):

@@ -297,7 +302,7 @@ def jagged_torch_function(func, *args, **kwargs):
         torch.ops.aten.sym_stride.default,
         torch.ops.aten.sym_storage_offset.default,
     ],
-    "self: jt",
+    "self: jt_all",
 )
 def tensor_attr_supported_getter(func, *args, **kwargs):
     if func == torch.ops.aten.is_non_overlapping_and_dense.default:

@@ -310,23 +315,25 @@ def tensor_attr_supported_getter(func, *args, **kwargs):
         return len(args[0]._size)

     if func == torch.ops.aten.sym_numel.default:
+        if args[0]._lengths is not None:
+            return int(sum(args[0]._lengths) * math.prod(args[0]._size[2:]))
         return args[0]._values.numel()

     if func == torch.ops.aten.sym_stride.default:
         return args[0]._strides

     if func == torch.ops.aten.sym_storage_offset.default:
-        return 0
+        return args[0]._values.storage_offset()


-@register_jagged_func(torch.ops.prim.layout.default, "self: jt")
+@register_jagged_func(torch.ops.prim.layout.default, "self: jt_all")
 def prim_layout_default(func, *args, **kwargs):
     return torch.jagged


 @register_jagged_func(
     [torch.ops.aten.size.default],
-    "self: jt",
+    "self: jt_all",
 )
 def tensor_attr_unsupported_getter(func, *args, **kwargs):
     if func == torch.ops.aten.size.default:
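To illustrate the sym_numel branch added above for lengths-backed (non-contiguous) jagged tensors, a tiny worked example with made-up numbers: only the selected elements count, not the whole padded values buffer.

import math
import torch

lengths = torch.tensor([3, 2, 5])  # per-row lengths of a non-contiguous jagged NT
trailing_dims = (4,)               # the non-ragged feature dims, i.e. _size[2:]

numel = int(sum(lengths) * math.prod(trailing_dims))
print(numel)  # 40, while the padded values buffer holds B * max_seq_len * 4 elements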
@@ -336,7 +343,7 @@ def tensor_attr_unsupported_getter(func, *args, **kwargs):
     )


-@register_jagged_func(torch.ops.aten.is_contiguous.default, "self: jt")
+@register_jagged_func(torch.ops.aten.is_contiguous.default, "self: jt_all")
 def is_contiguous_general(func, *args, **kwargs):
     from torch._prims_common import is_contiguous_for_memory_format

@@ -344,16 +351,21 @@ def is_contiguous_general(func, *args, **kwargs):
         func, args=args, kwargs=kwargs, normalize_to_only_use_kwargs=True
     )
     inp = new_kwargs.pop("input")

+    # If created from narrow() check for lengths
+    if inp.lengths() is not None:
+        return False
+
     new_kwargs["memory_format"] = new_kwargs.get(
         "memory_format", torch.contiguous_format
     )
     if new_kwargs["memory_format"] == torch.preserve_format:
         return True
-    return is_contiguous_for_memory_format(inp, **new_kwargs)
+    return is_contiguous_for_memory_format(inp.values(), **new_kwargs)


 register_jagged_func(
-    torch.ops.aten.is_contiguous.memory_format, "self: jt, memory_format: any?"
+    torch.ops.aten.is_contiguous.memory_format, "self: jt_all, memory_format: any?"
 )(is_contiguous_general)

@@ -508,7 +520,7 @@ def split_with_sizes_default(func, *args, **kwargs):
     ]


-@register_jagged_func(torch.ops.aten.unbind.int, "self: jt, dim: any?")
+@register_jagged_func(torch.ops.aten.unbind.int, "self: jt_all, dim: any?")
 def unbind_int(func, *args, **kwargs):
     # Note that this specializes on the length of the offsets
     _, new_kwargs = normalize_function(

@@ -520,10 +532,15 @@ def unbind_int(func, *args, **kwargs):
         raise RuntimeError("unbind(): only supported for NestedTensor on dim=0")

     inp = new_kwargs.pop("input")
-    values = inp._values
+    values = inp.values()
     offsets = inp.offsets()
+    lengths = inp.lengths()

-    return torch.split(values, offsets.diff().tolist())
+    if lengths is None:
+        return torch.split(values, offsets.diff().tolist())
+    return [
+        values[offsets[i] : (offsets[i] + lengths[i])] for i in range(lengths.shape[0])
+    ]


 @register_jagged_func(torch.ops.aten.unsqueeze.default, "self: jt, dim: any")
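Finally, a hedged sketch of what the lengths-aware unbind branch above produces for a narrow()-built nested tensor (it mirrors the test added earlier in this commit and assumes a build with this change):

import torch

base = torch.arange(0, 10, dtype=torch.int64).unsqueeze(0).expand(5, -1).clone()
starts = torch.tensor([0, 1, 2, 3, 4], dtype=torch.int64)
lengths = torch.tensor([3, 2, 2, 1, 5], dtype=torch.int64)
nt = torch.nested.narrow(base, 1, starts, lengths, layout=torch.jagged)

values, offsets, lens = nt.values(), nt.offsets(), nt.lengths()
for i in range(lens.shape[0]):
    # The same slicing the new unbind branch performs per constituent tensor.
    print(values[offsets[i] : offsets[i] + lens[i]])
# Row i prints torch.arange(starts[i], starts[i] + lengths[i]).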