mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 12:20:52 +01:00
[torch][amdsmi] Look for amdsmi in ROCM_HOME/ROCM_PATH before using rpath (#147117)
Summary: ROCm uses ROCM_HOME/ROCM_PATH to specify which version of ROCm the user wants to use. This is especially important in multi-version setups. Let's respect that behavior when loading amdsmi.

Test Plan: CI

```
NCCL_DEBUG=INFO NCCL_DEBUG_SUBSYS=INIT,COLL MSCCL_ALGO_DIR=~/2fbsource/third-party/rccl/develop/tools/msccl-algorithms RCCL_MSCCLPP_THRESHOLD=(math '128*1024*1024') RCCL_MSCCLPP_ENABLE=1 ENABLE_MSCCLPP=1 buck2 run fbcode//mode/opt-amd-gpu -m rocm621 fbcode//accelerators/workloads/microbench:bench_comm -- --shape moe_17b --comm_algo nccl_allreduce
```

Differential Revision: D69597647

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147117

Approved by: https://github.com/malfet
This commit is contained in:
parent
20a369aa3a
commit
6419076db9
|
|
@ -82,15 +82,20 @@ try:
|
|||
class amdsmi_cdll_hook:
|
||||
def __init__(self) -> None:
|
||||
self.original_CDLL = ctypes.CDLL # type: ignore[misc,assignment]
|
||||
paths = ["libamd_smi.so"]
|
||||
if rocm_home := os.getenv("ROCM_HOME", os.getenv("ROCM_PATH")):
|
||||
paths = [os.path.join(rocm_home, "lib/libamd_smi.so")] + paths
|
||||
self.paths: List[str] = paths
|
||||
|
||||
def hooked_CDLL(
|
||||
self, name: Union[str, Path, None], *args: Any, **kwargs: Any
|
||||
) -> ctypes.CDLL:
|
||||
if name and Path(name).name == "libamd_smi.so":
|
||||
try:
|
||||
return self.original_CDLL("libamd_smi.so", *args, **kwargs)
|
||||
except OSError:
|
||||
pass
|
||||
for path in self.paths:
|
||||
try:
|
||||
return self.original_CDLL(path, *args, **kwargs)
|
||||
except OSError:
|
||||
pass
|
||||
return self.original_CDLL(name, *args, **kwargs) # type: ignore[arg-type]
|
||||
|
||||
def __enter__(self) -> None:
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user