Jetson Update for CI Redo (#94549)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/94549 Approved by: https://github.com/ezyang, https://github.com/malfet
2025-12-06 12:20:52 +01:00 · 2023-02-21 17:13:38 +00:00 · 2023-02-21 17:13:38 +00:00 · 8aa34602f7
commit 8aa34602f7
parent c6d8d10b3e
17 changed files with 94 additions and 27 deletions
--- a/test/inductor/test_minifier.py
+++ b/test/inductor/test_minifier.py
@ -7,7 +7,7 @@ import torch
 import torch._dynamo
 import torch._inductor.utils
 from torch._dynamo.test_minifier_common import MinifierTestBase
-from torch.testing._internal.common_utils import IS_MACOS
+from torch.testing._internal.common_utils import IS_JETSON, IS_MACOS

 _HAS_TRITON = torch._inductor.utils.has_triton()
 requires_cuda = functools.partial(unittest.skipIf, not _HAS_TRITON, "requires cuda")
@ -99,11 +99,13 @@ torch._dynamo.config.debug_dir_root = "{self.DEBUG_DIR}"
            (test_proc.returncode, repro_proc.returncode),
        )

+    @unittest.skipIf(IS_JETSON, "Fails on Jetson")
    def test_after_aot_cpu_compile_error(self):
        (tb1, tb2), _ = self._test_after_aot("cpu", CPP_COMPILE_ERROR, 2)
        self.assertIn("CppCompileError", tb1)
        self.assertIn("CppCompileError", tb2)

+    @unittest.skipIf(IS_JETSON, "Fails on Jetson")
    def test_after_aot_cpu_accuracy_error(self):
        (tb1, tb2), _ = self._test_after_aot("cpu", CPP_ACCURACY_ERROR, 4)
        self.assertIn("AccuracyError", tb1)
@ -149,6 +151,7 @@ torch._dynamo.config.debug_dir_root = "{self.DEBUG_DIR}"
        self.assertEqual(test_proc.returncode, repro_proc.returncode)
        self.assertNotEqual(test_proc.returncode, 0)

+    @unittest.skipIf(IS_JETSON, "Fails on Jetson")
    def test_after_aot_cpu_runtime_error(self):
        self._test_after_aot_runtime_error("cpu", CPP_RUNTIME_ERROR)

@ -181,12 +184,15 @@ torch._dynamo.config.debug_dir_root = "{self.DEBUG_DIR}"
        self.assertEqual(proc.returncode, 0)
        self.assertIsNone(repro_dir)

+    @unittest.skipIf(IS_JETSON, "Fails on Jetson")
    def test_after_aot_cpu_compile_backend_passes(self):
        self._test_after_aot_backend_passes("cpu", 2, CPP_COMPILE_ERROR)

+    @unittest.skipIf(IS_JETSON, "Fails on Jetson")
    def test_after_aot_cpu_runtime_backend_passes(self):
        self._test_after_aot_backend_passes("cpu", 2, CPP_RUNTIME_ERROR)

+    @unittest.skipIf(IS_JETSON, "Fails on Jetson")
    def test_after_aot_cpu_accuracy_backend_passes(self):
        self._test_after_aot_backend_passes("cpu", 4, CPP_ACCURACY_ERROR)

@ -206,6 +212,7 @@ torch._dynamo.config.debug_dir_root = "{self.DEBUG_DIR}"

    # Test that inductor config can be saved and restored, especially class
    # variables.
+    @unittest.skipIf(IS_JETSON, "Fails on Jetson")
    def test_inductor_config_serialization(self):
        run_code = textwrap.dedent(
            """\
@ -248,11 +255,13 @@ inner(torch.randn(20, 20).to("cpu"))
        )
        return (test_proc.stderr.decode("utf-8"), repro_proc.stderr.decode("utf-8"))

+    @unittest.skipIf(IS_JETSON, "Fails on Jetson")
    def test_after_aot_with_modified_config_compile_error(self):
        tb1, tb2 = self._test_after_aot_with_modified_config(CPP_COMPILE_ERROR, 2)
        self.assertIn("CppCompileError", tb1)
        self.assertIn("CppCompileError", tb2)

+    @unittest.skipIf(IS_JETSON, "Fails on Jetson")
    def test_after_aot_with_modified_config_accuracy_error(self):
        tb1, tb2 = self._test_after_aot_with_modified_config(CPP_ACCURACY_ERROR, 4)
        self.assertIn("AccuracyError", tb1)
@ -287,21 +296,25 @@ inner(torch.randn(20, 20).to("cpu"))
            (test_proc.returncode, repro_proc.returncode),
        )

+    @unittest.skipIf(IS_JETSON, "Fails on Jetson")
    def test_torch_compile_after_dynamo_compile_error(self):
        (tb1, tb2), _ = self._test_torch_compile("dynamo", 2, CPP_COMPILE_ERROR)
        self.assertIn("CppCompileError", tb1)
        self.assertIn("CppCompileError", tb2)

+    @unittest.skipIf(IS_JETSON, "Fails on Jetson")
    def test_torch_compile_after_dynamo_accuracy_error(self):
        (tb1, tb2), _ = self._test_torch_compile("dynamo", 4, CPP_ACCURACY_ERROR)
        self.assertIn("AccuracyError", tb1)
        self.assertIn("AccuracyError", tb2)

+    @unittest.skipIf(IS_JETSON, "Fails on Jetson")
    def test_torch_compile_after_aot_compile_error(self):
        (tb1, tb2), _ = self._test_torch_compile("aot", 2, CPP_COMPILE_ERROR)
        self.assertIn("CppCompileError", tb1)
        self.assertIn("CppCompileError", tb2)

+    @unittest.skipIf(IS_JETSON, "Fails on Jetson")
    def test_torch_compile_after_aot_accuracy_error(self):
        (tb1, tb2), _ = self._test_torch_compile("aot", 4, CPP_ACCURACY_ERROR)
        self.assertIn("AccuracyError", tb1)
--- a/test/inductor/test_smoke.py
+++ b/test/inductor/test_smoke.py
@ -1,10 +1,12 @@
 # Owner(s): ["module: inductor"]
 import logging
+import unittest

 import torch
 import torch._dynamo as torchdynamo
 import torch._inductor.config as torchinductor_config
 from torch.testing._internal.common_utils import IS_LINUX, TestCase
+from torch.testing._internal.inductor_utils import HAS_CUDA


 class MLP(torch.nn.Module):
@ -24,6 +26,7 @@ def _test_f(x):


 class SmokeTest(TestCase):
+    @unittest.skipIf(not HAS_CUDA, "Triton is not available")
    def test_mlp(self):
        torchdynamo.config.log_level = logging.INFO
        torchdynamo.config.verbose = True
@ -36,6 +39,7 @@ class SmokeTest(TestCase):
        torchdynamo.config.verbose = False
        torchinductor_config.debug = False

+    @unittest.skipIf(not HAS_CUDA, "Triton is not available")
    def test_compile_decorator(self):
        @torch.compile
        def foo(x):
--- a/test/nn/test_embedding.py
+++ b/test/nn/test_embedding.py
@ -6,7 +6,7 @@ from itertools import product

 import torch
 from torch.testing._internal.common_utils import run_tests, set_default_dtype, \
-    instantiate_parametrized_tests, parametrize as parametrize_test, _assertGradAndGradgradChecks
+    instantiate_parametrized_tests, parametrize as parametrize_test, _assertGradAndGradgradChecks, IS_JETSON
 from torch.testing._internal.common_cuda import TEST_CUDA
 from torch.testing._internal.common_nn import NNTestCase
 from torch.testing._internal.common_device_type import onlyNativeDeviceTypes, dtypes, \
@ -1172,6 +1172,8 @@ class TestEmbeddingNNDeviceType(NNTestCase):
    @dtypesIfCUDA(*itertools.product((torch.int, torch.long), (torch.int, torch.long),
                                     (torch.float, torch.double, torch.half)))
    def test_embedding_bag_device(self, device, dtypes):
+        if IS_JETSON and torch.bfloat16 in dtypes and device == "cpu":
+            self.skipTest("bfloat16 not supported with Jetson cpu")
        with set_default_dtype(torch.double):
            self._test_EmbeddingBag(device, 'sum', False, wdtype=dtypes[2], dtype=dtypes[0], odtype=dtypes[1])
            self._test_EmbeddingBag(device, 'mean', False, wdtype=dtypes[2], dtype=dtypes[0], odtype=dtypes[1])
--- a/test/nn/test_pooling.py
+++ b/test/nn/test_pooling.py
@ -14,7 +14,7 @@ from torch import inf, nan
 import torch
 from torch.testing import make_tensor
 from torch.testing._internal.common_utils import TestCase, run_tests, TEST_WITH_UBSAN, set_default_dtype, \
-    instantiate_parametrized_tests, slowTest, parametrize as parametrize_test, subtest, skipIfMps
+    instantiate_parametrized_tests, slowTest, parametrize as parametrize_test, subtest, skipIfMps, gcIfJetson
 from torch.testing._internal.common_cuda import TEST_CUDA
 from torch.testing._internal.common_nn import NNTestCase, _test_bfloat16_ops, _test_module_empty_input
 from torch.testing._internal.common_device_type import largeTensorTest, onlyNativeDeviceTypes, dtypes, \
@ -711,6 +711,7 @@ torch.cuda.synchronize()
                    output = module(input)

    @onlyNativeDeviceTypes
+    @gcIfJetson
    @dtypes(torch.float, torch.double)
    @dtypesIfCUDA(torch.half, torch.float, torch.double)
    def test_avg_pool2d_nhwc(self, device, dtype):
@ -798,6 +799,7 @@ torch.cuda.synchronize()
        check(tensor.transpose(1, 2), 3, 2, 1, 2, ceil_mode=True)

    @onlyCUDA
+    @gcIfJetson
    def test_max_pool2d(self, device):
        def helper(n, c, h, w, ks):
            x = torch.randn(n, c, h, w, device='cuda', dtype=torch.float, requires_grad=True)
@ -821,6 +823,7 @@ torch.cuda.synchronize()
    @onlyNativeDeviceTypes
    @dtypes(torch.float, torch.double)
    @dtypesIfCUDA(torch.half, torch.float, torch.double)
+    @gcIfJetson
    def test_max_pool2d_nhwc(self, device, dtype):
        def helper(n, c, h, w, kernel_size, stride=None):
            if stride is None:
@ -857,6 +860,7 @@ torch.cuda.synchronize()
    @onlyNativeDeviceTypes
    @dtypes(torch.half, torch.float, torch.double)
    @onlyCUDA
+    @gcIfJetson
    def test_max_pool3d_ndhwc(self, device, dtype):
        def helper(n, c, h, w, d, kernel_size, stride=None):
            batch = n
@ -946,6 +950,7 @@ torch.cuda.synchronize()
        helper(1, 19, 20, 10, 8, 2, torch.channels_last)

    @onlyCUDA
+    @gcIfJetson
    def test_max_pool2d_indices(self, device):
        def helper(n, c, h, w, ks):
            if n is None:
@ -1259,6 +1264,7 @@ torch.cuda.synchronize()
    @dtypesIfCUDA(torch.half, torch.float, torch.double)
    @dtypes(torch.float)
    @onlyNativeDeviceTypes  # TODO: Fails on XLA
+    @gcIfJetson
    def test_max_pool_nan_inf(self, device, dtype):
        for adaptive in ['', 'adaptive_']:
            for num_dim in [1, 2, 3]:
--- a/test/profiler/test_profiler.py
+++ b/test/profiler/test_profiler.py
@ -57,6 +57,7 @@ from torch.profiler._pattern_matcher import (
 from torch.testing._internal.common_cuda import TEST_MULTIGPU
 from torch.testing._internal.common_device_type import skipCUDAVersionIn
 from torch.testing._internal.common_utils import (
+    IS_JETSON,
    IS_WINDOWS,
    instantiate_parametrized_tests,
    parametrize,
@ -924,6 +925,7 @@ class TestProfiler(TestCase):
                ]
            )

+    @unittest.skipIf(IS_JETSON, "Jetson has a guard against OOM since host and gpu memory are shared")
    def test_oom_tracing(self):
        def run_profiler(tensor_creation_fn):
            with _profile(profile_memory=True, record_shapes=True) as prof:
@ -2685,6 +2687,7 @@ class TestExperimentalUtils(TestCase):
 0 [CPU (After GPU)]
 100000 [CPU (After GPU)]""")

+    @unittest.skipIf(IS_JETSON, "JSON not behaving as expected on Jetson")
    def test_utils_get_optimizable_events(self):
        basic_evaluation = _utils.BasicEvaluation(self.load_mock_profile())
        optimizable_events = basic_evaluation.get_optimizable_events(
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@ -28,7 +28,7 @@ from torch.utils.checkpoint import checkpoint_sequential
 from torch.testing._internal.common_utils import TestCase, freeze_rng_state, run_tests, \
    NO_MULTIPROCESSING_SPAWN, skipIfRocm, load_tests, IS_REMOTE_GPU, IS_SANDCASTLE, IS_WINDOWS, \
    slowTest, skipCUDANonDefaultStreamIf, skipCUDAMemoryLeakCheckIf, TEST_WITH_ROCM, TEST_NUMPY, \
-    get_cycles_per_ms, parametrize, instantiate_parametrized_tests, subtest
+    get_cycles_per_ms, parametrize, instantiate_parametrized_tests, subtest, IS_JETSON, gcIfJetson
 from torch.testing._internal.autocast_test_lists import AutocastTestLists

 # load_tests from common_utils is used to automatically filter tests for
@ -390,7 +390,7 @@ class TestCuda(TestCase):
        self.assertTrue((tensor == 1).all())


-    @unittest.skipIf(TEST_CUDAMALLOCASYNC, "Segmentation fault (core dumped)")
+    @unittest.skipIf(TEST_CUDAMALLOCASYNC or IS_JETSON, "Segmentation fault (core dumped)")
    def test_out_of_memory_retry(self):
        torch.cuda.empty_cache()
        total_memory = torch.cuda.get_device_properties(0).total_memory
@ -1746,6 +1746,10 @@ except RuntimeError as e:
            before_free_bytes, before_available_bytes = torch.cuda.mem_get_info(idx)
            # increasing to 8MB to force acquiring a new block and overcome blocksize differences across platforms
            t = torch.randn(1024 * 1024 * 8, device='cuda:' + str(idx))
+            if IS_JETSON:
+                # w/o syncing, mem_get_info will run before memory allocated has actually increased.
+                # This race condition causes consistent failure
+                torch.cuda.synchronize()
            after_free_bytes, after_available_bytes = torch.cuda.mem_get_info(idx)

            self.assertTrue(after_free_bytes < before_free_bytes)
@ -1769,9 +1773,18 @@ except RuntimeError as e:
            l.append(torch.randn(1024 * 1024 * 8, device=torch.device("cuda:0")))

        no_leak()
-
-        with self.assertRaisesRegex(RuntimeError, r"CUDA driver API confirmed .+ on device 0.+"):
-            leak_gpu0()
+        regex = r"CUDA driver API confirmed .+ on device 0.+"
+        if IS_JETSON:
+            try:
+                leak_gpu0()
+            except RuntimeError as e:
+                import re
+                assert re.match(regex, str(e)), str(e) + "\n does not match: \n" + regex
+        else:
+            # The Jetson branch above exists because assertRaisesRegex does not pass
+            # with Python for Jetson, even though the RuntimeError matches regex using re.match
+            with self.assertRaisesRegex(RuntimeError, regex):
+                leak_gpu0()

        if TEST_MULTIGPU:
            @self.wrap_with_cuda_memory_check
@ -1800,6 +1813,7 @@ except RuntimeError as e:
        self.assertEqual(y[0, 0, 0, 2**30], expected)

    @unittest.skipIf(not TEST_LARGE_TENSOR, "not enough memory")
+    @gcIfJetson
    def test_cuda_kernel_loop_overflow_large(self):
        # Make sure input.numel() > INT_MAX is handled:
        x = torch.randn(1, 1, 1, 2**31, dtype=torch.float16, device="cuda")
--- a/test/test_cuda_nvml_based_avail.py
+++ b/test/test_cuda_nvml_based_avail.py
@ -13,7 +13,7 @@ with patch.dict(os.environ, {"PYTORCH_NVML_BASED_CUDA_CHECK": "1"}):
    # Before executing the desired tests, we need to disable CUDA initialization and fork_handler additions that would
    # otherwise be triggered by the `torch.testing._internal.common_utils` module import
    from torch.testing._internal.common_utils import (parametrize, instantiate_parametrized_tests, run_tests, TestCase,
-                                                      IS_WINDOWS)
+                                                      IS_WINDOWS, IS_JETSON)
    # NOTE: Because `remove_device_and_dtype_suffixes` initializes CUDA context (triggered via the import of
    # `torch.testing._internal.common_device_type` which imports `torch.testing._internal.common_cuda`) we need
    # to bypass that method here which should be irrelevant to the parameterized tests in this module.
@ -48,6 +48,8 @@ class TestExtendedCUDAIsAvail(TestCase):
    @parametrize("nvml_avail", [True, False])
    @parametrize("avoid_init", ['1', '0', None])
    def test_cuda_is_available(self, avoid_init, nvml_avail):
+        if IS_JETSON and nvml_avail and avoid_init == '1':
+            self.skipTest('Not working for Jetson')
        patch_env = {"PYTORCH_NVML_BASED_CUDA_CHECK": avoid_init} if avoid_init else {}
        with patch.dict(os.environ, **patch_env):
            if nvml_avail:
--- a/test/test_dataloader.py
+++ b/test/test_dataloader.py
@ -31,7 +31,7 @@ from torch.utils.data._utils import MP_STATUS_CHECK_INTERVAL
 from torch.utils.data.dataset import random_split
 from torch.utils.data.datapipes.iter import IterableWrapper
 from torch._utils import ExceptionWrapper
-from torch.testing._internal.common_utils import (TestCase, run_tests, TEST_NUMPY, IS_WINDOWS,
+from torch.testing._internal.common_utils import (TestCase, run_tests, TEST_NUMPY, IS_WINDOWS, IS_JETSON,
                                                  IS_CI, NO_MULTIPROCESSING_SPAWN, skipIfRocm, slowTest,
                                                  load_tests, TEST_WITH_ASAN, TEST_WITH_TSAN, IS_SANDCASTLE,
                                                  IS_MACOS)
@ -78,11 +78,6 @@ load_tests = load_tests
 # as well during the execution of this test suite, and it will cause
 # CUDA OOM error on Windows.
 TEST_CUDA = torch.cuda.is_available()
-if TEST_CUDA:
-    dev_name = torch.cuda.get_device_name(torch.cuda.current_device()).lower()
-    IS_JETSON = 'xavier' in dev_name or 'nano' in dev_name or 'jetson' in dev_name or 'tegra' in dev_name
-else:
-    IS_JETSON = False

 if not NO_MULTIPROCESSING_SPAWN:
    # We want to use `spawn` if able because some of our tests check that the
@ -1111,6 +1106,7 @@ except RuntimeError as e:
            self.assertTrue(input.is_pinned())
            self.assertTrue(target.is_pinned())

+    @unittest.skipIf(IS_JETSON, "Not working on Jetson")
    def test_multiple_dataloaders(self):
        for multiprocessing_context in supported_multiprocessing_contexts:
            loader1_it = iter(self._get_data_loader(self.dataset, num_workers=1))
@ -1435,6 +1431,7 @@ except RuntimeError as e:
            list(iter(ChainDataset([dataset1, self.dataset])))

    @unittest.skipIf(IS_MACOS, "Not working on macos")
+    @unittest.skipIf(IS_MACOS or IS_JETSON, "Not working on macos or Jetson")
    @skipIfRocm  # https://github.com/pytorch/pytorch/issues/90940
    def test_multiprocessing_contexts(self):
        reference = [
@ -1460,6 +1457,7 @@ except RuntimeError as e:
                    reference, list(self._get_data_loader(ds_cls(counting_ds_n), multiprocessing_context=ctx, **dl_common_args)))

    @skipIfNoNumpy
+    @unittest.skipIf(IS_JETSON, "Not working on Jetson")
    def test_multiprocessing_iterdatapipe(self):
        # Testing to make sure that function from global scope (e.g. imported from library) can be serialized
        # and used with multiprocess DataLoader
--- a/test/test_dlpack.py
+++ b/test/test_dlpack.py
@ -3,7 +3,7 @@

 import torch
 from torch.testing import make_tensor
-from torch.testing._internal.common_utils import TestCase, run_tests
+from torch.testing._internal.common_utils import TestCase, run_tests, IS_JETSON
 from torch.testing._internal.common_device_type import (
    instantiate_device_type_tests, onlyCUDA, dtypes, skipMeta,
    onlyNativeDeviceTypes)
@ -52,6 +52,10 @@ class TestTorchDlPack(TestCase):
        # (hence data dependency) at the exchange boundary.
        # DLPack manages this synchronization for us, so we don't need to
        # explicitly wait until x is populated
+        if IS_JETSON:
+            # DLPack protocol that establishes correct stream order
+            # does not behave as expected on Jetson
+            stream.synchronize()
        stream = torch.cuda.Stream()
        with torch.cuda.stream(stream):
            z = from_dlpack(x)
--- a/test/test_matmul_cuda.py
+++ b/test/test_matmul_cuda.py
@ -17,6 +17,7 @@ from torch.testing._internal.common_device_type import (

 from torch.testing._internal.common_utils import (
    IS_ARM64,
+    IS_JETSON,
    parametrize,
    run_tests,
    TEST_WITH_ROCM,
@ -114,6 +115,7 @@ class TestMatmulCuda(TestCase):

    @onlyCUDA
    @unittest.skipIf(TEST_WITH_ROCM, "Only CUDA 11+ is supported")
+    @unittest.skipIf(IS_JETSON, "Too large for Jetson")
    @toleranceOverride({torch.float32: xtol(atol=1e-5, rtol=1e-5)})
    @dtypes(*([torch.float32, torch.float16] +
              [torch.bfloat16] if TEST_WITH_ROCM or SM53OrLater else []))
--- a/test/test_nn.py
+++ b/test/test_nn.py
@ -39,7 +39,7 @@ from torch.testing._internal.common_utils import freeze_rng_state, run_tests, Te
    download_file, get_function_arglist, load_tests, skipIfMps,\
    TEST_WITH_UBSAN, IS_PPC, \
    parametrize as parametrize_test, subtest, instantiate_parametrized_tests, \
-    skipIfTorchDynamo, IS_WINDOWS
+    skipIfTorchDynamo, IS_WINDOWS, gcIfJetson
 from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU, TEST_CUDNN, TEST_CUDNN_VERSION
 from torch.testing._internal.common_nn import NNTestCase, NewModuleTest, CriterionTest, \
    module_tests, criterion_tests, loss_reference_fns, _create_basic_net, \
@ -9625,6 +9625,7 @@ class TestNNDeviceType(NNTestCase):
                    )

    @onlyCUDA
+    @gcIfJetson
    def test_masked_softmax_devices_parity(self):
        # Test that softmax with mask type 0 (LxL attention mask), mask type 1 (BxL padding mask),
        # and mask type 2 (BxHxLxL generic mask) gives the same result on CPU and on CUDA.
@ -10220,6 +10221,7 @@ class TestNNDeviceType(NNTestCase):
        self.assertEqual(out_ref, out)

    @onlyCUDA
+    @gcIfJetson
    def test_upsamplingNearest3d_launch_config(self, device):
        m = nn.Upsample(scale_factor=2)
        inp = torch.rand(2**25, 1, 1, 1, 1, device=device)
--- a/test/test_public_bindings.py
+++ b/test/test_public_bindings.py
@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # Owner(s): ["module: autograd"]

-from torch.testing._internal.common_utils import TestCase, run_tests, IS_WINDOWS
+from torch.testing._internal.common_utils import TestCase, run_tests, IS_JETSON, IS_WINDOWS
 import pkgutil
 import torch
 import sys
@ -271,7 +271,7 @@ class TestPublicBindings(TestCase):
        self.assertTrue(torch_C_bindings.issubset(torch_C_allowlist_superset), msg)

    # AttributeError: module 'torch.distributed' has no attribute '_shard'
-    @unittest.skipIf(IS_WINDOWS, "Distributed Attribute Error")
+    @unittest.skipIf(IS_WINDOWS or IS_JETSON, "Distributed Attribute Error")
    def test_correct_module_names(self):
        '''
        An API is considered public, if  its  `__module__` starts with `torch.`
--- a/test/test_shape_ops.py
+++ b/test/test_shape_ops.py
@ -7,11 +7,12 @@ from itertools import product, combinations, permutations, chain
 from functools import partial
 import random
 import warnings
+import unittest

 from torch import nan
 from torch.testing import make_tensor
 from torch.testing._internal.common_utils import (
-    TestCase, run_tests, skipIfTorchDynamo, torch_to_numpy_dtype_dict)
+    TestCase, run_tests, skipIfTorchDynamo, torch_to_numpy_dtype_dict, IS_JETSON)
 from torch.testing._internal.common_device_type import (
    instantiate_device_type_tests, onlyCPU, onlyCUDA, dtypes, onlyNativeDeviceTypes,
    dtypesIfCUDA, largeTensorTest)
@ -505,6 +506,7 @@ class TestShapeOps(TestCase):
    @onlyCUDA  # CPU is too slow
    @largeTensorTest('17GB')  # 4 tensors of 4GB (in, out) x (torch, numpy) + 1GB
    @largeTensorTest("81GB", "cpu")  # even for CUDA test, sufficient system memory is required
+    @unittest.skipIf(IS_JETSON, "Too large for Jetson")
    def test_flip_large_tensor(self, device):
        t_in = torch.empty(2**32 + 1, dtype=torch.uint8).random_()
        torch_fn = partial(torch.flip, dims=(0,))
--- a/test/test_tensor_creation_ops.py
+++ b/test/test_tensor_creation_ops.py
@ -14,7 +14,7 @@ from torch.testing import make_tensor
 from torch.testing._internal.common_utils import (
    TestCase, run_tests, do_test_empty_full, TEST_WITH_ROCM, suppress_warnings,
    torch_to_numpy_dtype_dict, numpy_to_torch_dtype_dict, slowTest,
-    TEST_SCIPY, IS_MACOS, IS_PPC, IS_WINDOWS, parametrize, skipIfTorchDynamo)
+    TEST_SCIPY, IS_MACOS, IS_PPC, IS_JETSON, IS_WINDOWS, parametrize, skipIfTorchDynamo)
 from torch.testing._internal.common_device_type import (
    expectedFailureMeta, instantiate_device_type_tests, deviceCountAtLeast, onlyNativeDeviceTypes,
    onlyCPU, largeTensorTest, precisionOverride, dtypes,
@ -953,8 +953,9 @@ class TestTensorCreation(TestCase):
    # errors with UBSAN. These casts are deliberate in PyTorch, however, and
    # NumPy has the same behavior.
    @onlyNativeDeviceTypes
-    @unittest.skipIf(IS_MACOS, "Test is broken on MacOS, see https://github.com/pytorch/pytorch/issues/38752")
-    @unittest.skipIf(IS_PPC, "Test is borken on PowerPC, see https://github.com/pytorch/pytorch/issues/39671")
+    @unittest.skipIf(IS_MACOS or IS_JETSON, "Test is broken on MacOS and Jetson, \
+        see https://github.com/pytorch/pytorch/issues/38752")
+    @unittest.skipIf(IS_PPC, "Test is broken on PowerPC, see https://github.com/pytorch/pytorch/issues/39671")
    @dtypes(torch.bool, torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64)
    def test_float_to_int_conversion_finite(self, device, dtype):
        min = torch.finfo(torch.float).min
--- a/test/test_testing.py
+++ b/test/test_testing.py
@ -18,7 +18,7 @@ import torch

 from torch.testing import make_tensor
 from torch.testing._internal.common_utils import \
-    (IS_FBCODE, IS_MACOS, IS_SANDCASTLE, IS_WINDOWS, TestCase, run_tests, skipIfRocm, slowTest,
+    (IS_FBCODE, IS_JETSON, IS_MACOS, IS_SANDCASTLE, IS_WINDOWS, TestCase, run_tests, skipIfRocm, slowTest,
     parametrize, subtest, instantiate_parametrized_tests, dtype_name, TEST_WITH_ROCM)
 from torch.testing._internal.common_device_type import \
    (PYTORCH_TESTING_DEVICE_EXCEPT_FOR_KEY, PYTORCH_TESTING_DEVICE_ONLY_FOR_KEY, dtypes,
@ -1992,9 +1992,9 @@ class TestImports(TestCase):
        # See https://github.com/pytorch/pytorch/issues/77801
        if not sys.version_info >= (3, 9):
            ignored_modules.append("torch.utils.benchmark")
-        if IS_WINDOWS or IS_MACOS:
+        if IS_WINDOWS or IS_MACOS or IS_JETSON:
            # Distributed should be importable on Windows(except nn.api.), but not on Mac
-            if IS_MACOS:
+            if IS_MACOS or IS_JETSON:
                ignored_modules.append("torch.distributed.")
            else:
                ignored_modules.append("torch.distributed.nn.api.")
--- a/test/test_torch.py
+++ b/test/test_torch.py
@ -30,7 +30,7 @@ from functools import partial
 from torch import multiprocessing as mp
 from torch.testing import make_tensor
 from torch.testing._internal.common_utils import (
-    TEST_WITH_TORCHINDUCTOR, TestCase, TEST_WITH_ROCM, run_tests,
+    TEST_WITH_TORCHINDUCTOR, TestCase, TEST_WITH_ROCM, run_tests, IS_JETSON,
    IS_WINDOWS, IS_FILESYSTEM_UTF8_ENCODING, NO_MULTIPROCESSING_SPAWN,
    IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU, load_tests, skipIfTorchInductor, slowTest,
    TEST_WITH_CROSSREF, skipIfTorchDynamo,
@ -2781,6 +2781,7 @@ else:
        torch.testing.assert_close(expected, actual)

    @unittest.skipIf(IS_FBCODE and IS_REMOTE_GPU, "sandcastle OOM with current tpx gpu/re configuration")
+    @unittest.skipIf(IS_JETSON, "psutil issue for largeTensorTest. Too large for Jetson.")
    @onlyCUDA
    @dtypes(torch.half)  # only small dtype not to get oom
    @largeTensorTest('25GB', device='cpu')
@ -2797,6 +2798,7 @@ else:
    @dtypes(torch.half)  # only small dtype not to get oom
    @largeTensorTest('25GB', device='cpu')
    @largeTensorTest('4GB', device='cuda')
+    @unittest.skipIf(IS_JETSON, "psutil issue for largeTensorTest. Too large for Jetson.")
    def test_large_cumprod(self, device, dtype):
        # initialization to avoid overflow and half caveats
        x = torch.empty(2**30 + 200, device=device, dtype=dtype)
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@ -124,6 +124,18 @@ if os.getenv("DISABLED_TESTS_FILE", ""):

 NATIVE_DEVICES = ('cpu', 'cuda', 'meta')

+check_names = ['orin', 'concord', 'galen', 'xavier', 'nano', 'jetson', 'tegra']
+IS_JETSON = any(name in platform.platform() for name in check_names)  # platform string contains a listed name
+
+def gcIfJetson(fn):
+    """Decorator: on Jetson, collect garbage and empty the CUDA cache before calling ``fn``."""
+    @functools.wraps(fn)
+    def wrapper(*args, **kwargs):
+        if IS_JETSON:  # irregular host/device memory setup: clean up so the test is not killed
+            gc.collect()
+            torch.cuda.empty_cache()
+        return fn(*args, **kwargs)  # propagate the wrapped callable's result
+    return wrapper

 class _TestParametrizer:
    """