[torch][amdsmi] Look for amdsmi in ROCM_HOME/ROCM_PATH before using rpath (#147117)

Summary: ROCm uses the ROCM_HOME/ROCM_PATH environment variables to specify which version of ROCm the user wants to use. This is especially important in multi-version setups. Let's respect that behavior when loading amdsmi.

Test Plan:
CI
```
NCCL_DEBUG=INFO NCCL_DEBUG_SUBSYS=INIT,COLL MSCCL_ALGO_DIR=~/2fbsource/third-party/rccl/develop/tools/msccl-algorithms RCCL_MSCCLPP_THRESHOLD=(math '128*1024*1024')  RCCL_MSCCLPP_ENABLE=1 ENABLE_MSCCLPP=1 buck2 run fbcode//mode/opt-amd-gpu -m rocm621 fbcode//accelerators/workloads/microbench:bench_comm -- --shape moe_17b --comm_algo nccl_allreduce
```

Differential Revision: D69597647

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147117
Approved by: https://github.com/malfet
This commit is contained in:
Dan Zimmerman 2025-02-14 01:11:59 +00:00 committed by PyTorch MergeBot
parent 20a369aa3a
commit 6419076db9

View File

@ -82,15 +82,20 @@ try:
class amdsmi_cdll_hook:
def __init__(self) -> None:
self.original_CDLL = ctypes.CDLL # type: ignore[misc,assignment]
paths = ["libamd_smi.so"]
if rocm_home := os.getenv("ROCM_HOME", os.getenv("ROCM_PATH")):
paths = [os.path.join(rocm_home, "lib/libamd_smi.so")] + paths
self.paths: List[str] = paths
def hooked_CDLL(
self, name: Union[str, Path, None], *args: Any, **kwargs: Any
) -> ctypes.CDLL:
if name and Path(name).name == "libamd_smi.so":
try:
return self.original_CDLL("libamd_smi.so", *args, **kwargs)
except OSError:
pass
for path in self.paths:
try:
return self.original_CDLL(path, *args, **kwargs)
except OSError:
pass
return self.original_CDLL(name, *args, **kwargs) # type: ignore[arg-type]
def __enter__(self) -> None: