diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp
index 0655a390205..73f7a10fe2d 100644
--- a/aten/src/ATen/native/TensorShape.cpp
+++ b/aten/src/ATen/native/TensorShape.cpp
@@ -1217,7 +1217,9 @@ Tensor narrow_copy_dense(const Tensor& self, int64_t dim, int64_t start, int64_t
 // Should just use narrow_copy_out, but this API is used internally at Meta:
 // https://github.com/pytorch/pytorch/pull/87045#issuecomment-1309353561
 Tensor narrow_copy_dense_cpu(const Tensor& self, int64_t dim, int64_t start, int64_t length){
-  auto output = at::empty_like(self);
+  // narrow_copy_dense_cpu_out always resizes the output, so it is enough to
+  // create a zero-size tensor here.
+  auto output = at::empty({0}, self.options());
   return narrow_copy_dense_cpu_out(self, dim, start, length, output);
 }
 
diff --git a/test/functorch/test_vmap.py b/test/functorch/test_vmap.py
index 8bdda6a1a9c..47f871367b0 100644
--- a/test/functorch/test_vmap.py
+++ b/test/functorch/test_vmap.py
@@ -3542,7 +3542,6 @@ class TestVmapOperatorsOpInfo(TestCase):
         xfail('bitwise_left_shift', device_type='cpu'),
         decorate('bitwise_right_shift', device_type='cpu',
                  decorator=expectedFailureIf(not (IS_MACOS and IS_X86))),
-        xfail('narrow_copy', device_type='cpu'),
 
         # UBSAN: runtime error: shift exponent -1 is negative
         decorate('bitwise_left_shift', decorator=unittest.skipIf(TEST_WITH_UBSAN, "Fails with above error")),
@@ -3721,11 +3720,6 @@ class TestVmapOperatorsOpInfo(TestCase):
         xfail('le'),
         xfail('lt'),
         xfail('ne'),
-        # AssertionError
-        # Mismatched elements: 18 / 20 (90.0%)
-        # Greatest absolute difference: 14.031710147857666 at index (0, 5) (up to 0.0001 allowed)
-        # Greatest relative difference: 2.9177700113052603 at index (0, 3) (up to 0.0001 allowed)
-        xfail('narrow_copy', device_type='cpu'),
         # UBSAN: runtime error: 1.27043e+262 is outside the range of representable values of type 'float'
         decorate('special.zeta', decorator=unittest.skipIf(TEST_WITH_UBSAN, "Fails with above error")),
         # RuntimeError: Expected all tensors to be on the same device,
diff --git a/test/test_torch.py b/test/test_torch.py
index aae0a7297f3..d58f14aac59 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -2971,6 +2971,13 @@ else:
             sz[d] = 0
             self.assertEqual(sz, y.size())
 
+    def test_narrow_copy_non_contiguous(self, device):
+        # Non-contiguous input must match the contiguous result; see https://github.com/pytorch/pytorch/issues/91690.
+        inp = torch.randn(10, 2, device=device).movedim(-1, 0)
+        expected = torch.narrow_copy(inp.contiguous(), 1, 0, 10)
+        actual = torch.narrow_copy(inp, 1, 0, 10)
+        self.assertEqual(expected, actual)
+
     # FIXME: move to indexing test suite
     @parametrize("reduce", ['prod', 'amin', 'amax', 'mean'])
     @dtypes(*all_types_and(torch.half, torch.bfloat16))