mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 12:20:52 +01:00
[ROCm] Fixes to enable VM-based MI300 CI runners (#152133)
New VM-based MI300 CI runners, tested in https://github.com/pytorch/pytorch/pull/151708, exposed some issues in CI that this PR fixes:

* `HSAKMT_DEBUG_LEVEL` is a debug env var that was introduced to debug driver issues. However, because the new MI300 runners being tested run inside a VM, the driver emits the debug message `Failed to map remapped mmio page on gpu_mem 0` when calling `rocminfo` or doing other GPU-related work. This causes multiple PyTorch unit tests to fail when they string-match stdout against the expected output.
* `HSA_FORCE_FINE_GRAIN_PCIE` was relevant for an rccl performance improvement, but is no longer required.
* amdsmi doesn't return metrics like [power_info](https://rocm.docs.amd.com/projects/amdsmi/en/latest/reference/amdsmi-py-api.html#amdsmi-get-power-cap-info) and [clock_info](https://rocm.docs.amd.com/projects/amdsmi/en/latest/reference/amdsmi-py-api.html#amdsmi-get-clock-info) in a VM ("Guest") environment. Return 0 as the default in cases where amdsmi returns "N/A".
* amdsmi throws an exception when calling `amdsmi.amdsmi_get_clock_info` on the VM-based runners. Temporarily skip the unit test for MI300 until we find a resolution.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/152133
Approved by: https://github.com/jeffdaily
This commit is contained in:
parent
0dae27d75b
commit
bcf1031cb8
|
|
@ -13,10 +13,6 @@ if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then
|
|||
echo 'Skipping tests'
|
||||
exit 0
|
||||
fi
|
||||
if [[ "${BUILD_ENVIRONMENT}" == *-rocm* ]]; then
|
||||
# temporary to locate some kernel issues on the CI nodes
|
||||
export HSAKMT_DEBUG_LEVEL=4
|
||||
fi
|
||||
# These additional packages are needed for circleci ROCm builds.
|
||||
if [[ $BUILD_ENVIRONMENT == *rocm* ]]; then
|
||||
# Need networkx 2.0 because bellman_ford was moved in 2.1. Scikit-image by
|
||||
|
|
|
|||
|
|
@ -13,10 +13,6 @@ if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
|
|||
# HIP_PLATFORM is auto-detected by hipcc; unset to avoid build errors
|
||||
unset HIP_PLATFORM
|
||||
export PYTORCH_TEST_WITH_ROCM=1
|
||||
# temporary to locate some kernel issues on the CI nodes
|
||||
export HSAKMT_DEBUG_LEVEL=4
|
||||
# improve rccl performance for distributed tests
|
||||
export HSA_FORCE_FINE_GRAIN_PCIE=1
|
||||
fi
|
||||
|
||||
# TODO: Re-enable libtorch testing for MacOS, see https://github.com/pytorch/pytorch/issues/62598
|
||||
|
|
|
|||
|
|
@ -60,6 +60,7 @@ from torch.testing._internal.common_utils import (
|
|||
IS_WINDOWS,
|
||||
IS_X86,
|
||||
load_tests,
|
||||
MI300_ARCH,
|
||||
parametrize,
|
||||
run_tests,
|
||||
serialTest,
|
||||
|
|
@ -67,6 +68,7 @@ from torch.testing._internal.common_utils import (
|
|||
skipCUDAMemoryLeakCheckIf,
|
||||
skipCUDANonDefaultStreamIf,
|
||||
skipIfRocm,
|
||||
skipIfRocmArch,
|
||||
slowTest,
|
||||
subtest,
|
||||
TemporaryFileName,
|
||||
|
|
@ -4382,6 +4384,7 @@ class TestCudaMallocAsync(TestCase):
|
|||
self.assertTrue(torch.cuda.power_draw() >= 0)
|
||||
|
||||
@unittest.skipIf(not TEST_PYNVML, "pynvml/amdsmi is not available")
|
||||
@skipIfRocmArch(MI300_ARCH)
|
||||
def test_clock_speed(self):
|
||||
self.assertTrue(torch.cuda.clock_rate() >= 0)
|
||||
|
||||
|
|
|
|||
|
|
@ -1254,16 +1254,24 @@ def _get_amdsmi_power_draw(device: Optional[Union[Device, int]] = None) -> int:
|
|||
if socket_power != "N/A":
|
||||
return socket_power
|
||||
else:
|
||||
return amdsmi.amdsmi_get_power_info(handle)["current_socket_power"]
|
||||
socket_power = amdsmi.amdsmi_get_power_info(handle)["current_socket_power"]
|
||||
if socket_power != "N/A":
|
||||
return socket_power
|
||||
else:
|
||||
return 0
|
||||
|
||||
|
||||
def _get_amdsmi_clock_rate(device: Optional[Union[Device, int]] = None) -> int:
|
||||
handle = _get_amdsmi_handler(device)
|
||||
clock_info = amdsmi.amdsmi_get_clock_info(handle, amdsmi.AmdSmiClkType.GFX)
|
||||
if "cur_clk" in clock_info: # ROCm 6.2 deprecation
|
||||
return clock_info["cur_clk"]
|
||||
clock_rate = clock_info["cur_clk"]
|
||||
else:
|
||||
return clock_info["clk"]
|
||||
clock_rate = clock_info["clk"]
|
||||
if clock_rate != "N/A":
|
||||
return clock_rate
|
||||
else:
|
||||
return 0
|
||||
|
||||
|
||||
def device_memory_used(device: Optional[Union[Device, int]] = None) -> int:
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user