# Motivation

Add the context variable `torch.backends.mkldnn.allow_tf32` to control tf32 computation in convolution kernels on the XPU side. The tf32 data type is beneficial for improving the performance of deep learning workloads during training/inference. This PR uses the [oneDNN API fpmath_mode](https://oneapi-src.github.io/oneDNN/dev_guide_attributes_fpmath_mode.html#the-floating-point-math-mode-attribute) to trigger tf32 acceleration in convolution kernels.

# Validation

* Unit test for the context variable: `python test/xpu/test_conv.py -k test_mkldnn_allow_tf32_get_set`
* Runtime exemplification:

```
onednn_verbose,primitive,exec,gpu:0,convolution,jit:ir,forward_training,src_f32::blocked:abcd::f0 wei_f32::blocked:abcd::f0 bia_f32::blocked:a::f0 dst_f32::blocked:abcd::f0,attr-scratchpad:user attr-fpmath:tf32,alg:convolution_direct,mb20_ic16oc33_ih50oh24kh3sh2dh0ph0_iw100ow49kw3sw2dw0pw0,0.649902
onednn_verbose,primitive,exec,gpu:0,convolution,jit:ir,forward_training,src_f32::blocked:abcd::f0 wei_f32::blocked:abcd::f0 bia_f32::blocked:a::f0 dst_f32::blocked:abcd::f0,attr-scratchpad:user attr-fpmath:tf32,alg:convolution_direct,mb20_ic33oc33_ih24oh24kh3sh1dh0ph1_iw49ow49kw3sw1dw0pw1,0.151855
onednn_verbose,primitive,exec,gpu:0,convolution,jit:ir,backward_data,src_f32::blocked:abcd::f0 wei_f32::blocked:abcd::f0 bia_undef::undef::: dst_f32::blocked:abcd::f0,attr-scratchpad:user attr-fpmath:tf32,alg:convolution_direct,mb20_ic33oc33_ih24oh24kh3sh1dh0ph1_iw49ow49kw3sw1dw0pw1,0.167969
onednn_verbose,primitive,exec,gpu:0,convolution,jit:ir,backward_weights,src_f32::blocked:abcd::f0 wei_f32::blocked:abcd::f0 bia_f32::blocked:a::f0 dst_f32::blocked:abcd::f0,attr-scratchpad:user attr-fpmath:tf32,alg:convolution_direct,mb20_ic33oc33_ih24oh24kh3sh1dh0ph1_iw49ow49kw3sw1dw0pw1,0.26709
onednn_verbose,primitive,exec,gpu:0,convolution,jit:ir,backward_weights,src_f32::blocked:abcd::f0 wei_f32::blocked:abcd::f0 bia_f32::blocked:a::f0 dst_f32::blocked:abcd::f0,attr-scratchpad:user attr-fpmath:tf32,alg:convolution_direct,mb20_ic16oc33_ih50oh24kh3sh2dh0ph0_iw100ow49kw3sw2dw0pw0,0.219971
```

The `attr-fpmath:tf32` field in the verbose output shows that the new context setting successfully triggers tf32 computation in the convolution forward/backward_data/backward_weights kernels.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/137570
Approved by: https://github.com/guangyey, https://github.com/EikanWang, https://github.com/atalman, https://github.com/malfet

Co-authored-by: Yu, Guangye <guangye.yu@intel.com>
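A minimal usage sketch of the new flag (assuming an XPU build of PyTorch with oneDNN; `model` and `data` are placeholders):

```python
import torch

# Allow oneDNN to use tf32 math for f32 convolutions on XPU.
torch.backends.mkldnn.allow_tf32 = True
out = model(data)  # with DNNL_VERBOSE=1, the log should report attr-fpmath:tf32

# Restore strict f32 math.
torch.backends.mkldnn.allow_tf32 = False
```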
# mypy: allow-untyped-defs
import sys
from contextlib import contextmanager
from typing import TYPE_CHECKING

import torch
from torch.backends import __allow_nonbracketed_mutation, ContextProp, PropModule
def is_available():
    r"""Return whether PyTorch is built with MKL-DNN support."""
    return torch._C._has_mkldnn


VERBOSE_OFF = 0
VERBOSE_ON = 1
VERBOSE_ON_CREATION = 2
class verbose:
    """
    On-demand oneDNN (formerly MKL-DNN) verbose logging functionality.

    To make it easier to debug performance issues, oneDNN can dump verbose
    messages containing information such as kernel size, input data size, and
    execution duration while executing the kernel. The verbose output can also
    be enabled via the environment variable ``DNNL_VERBOSE``, but that approach
    dumps messages at every step, producing a large volume of output. Moreover,
    when investigating a performance issue, verbose messages for a single
    iteration are generally enough. This on-demand functionality makes it
    possible to control the scope of verbose message dumping. In the following
    example, verbose messages are dumped for the second inference only.

    .. highlight:: python
    .. code-block:: python

        import torch

        model(data)
        with torch.backends.mkldnn.verbose(torch.backends.mkldnn.VERBOSE_ON):
            model(data)

    Args:
        level: Verbose level

            - ``VERBOSE_OFF``: Disable verbosing
            - ``VERBOSE_ON``: Enable verbosing
            - ``VERBOSE_ON_CREATION``: Enable verbosing, including oneDNN kernel creation
    """

    def __init__(self, level):
        self.level = level

    def __enter__(self):
        if self.level == VERBOSE_OFF:
            return
        st = torch._C._verbose.mkldnn_set_verbose(self.level)
        assert (
            st
        ), "Failed to set MKLDNN into verbose mode. Please consider disabling this verbose scope."
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        torch._C._verbose.mkldnn_set_verbose(VERBOSE_OFF)
        return False
def set_flags(_enabled=None, _deterministic=None, _allow_tf32=None):
    orig_flags = (
        torch._C._get_mkldnn_enabled(),
        torch._C._get_mkldnn_deterministic(),
        torch._C._get_onednn_allow_tf32(),
    )
    if _enabled is not None:
        torch._C._set_mkldnn_enabled(_enabled)
    if _deterministic is not None:
        torch._C._set_mkldnn_deterministic(_deterministic)
    if _allow_tf32 is not None:
        torch._C._set_onednn_allow_tf32(_allow_tf32)
    return orig_flags
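# A minimal usage sketch: ``set_flags`` mutates global state and returns the
# previous (enabled, deterministic, allow_tf32) tuple, so a caller can save
# and restore state around a temporary change:
#
#     prev = set_flags(_allow_tf32=True)
#     ...  # run convolutions with TF32 allowed
#     set_flags(*prev)  # restore the original flags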
@contextmanager
def flags(enabled=False, deterministic=False, allow_tf32=True):
    with __allow_nonbracketed_mutation():
        orig_flags = set_flags(enabled, deterministic, allow_tf32)
    try:
        yield
    finally:
        with __allow_nonbracketed_mutation():
            set_flags(*orig_flags)
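# A minimal usage sketch of the ``flags`` context manager: it applies the
# given settings on entry and restores the previous ones on exit, even if an
# exception is raised. ``model`` and ``data`` are placeholders:
#
#     with torch.backends.mkldnn.flags(enabled=True, allow_tf32=False):
#         out = model(data)  # mkldnn on, TF32 disallowed inside the block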
class MkldnnModule(PropModule):
    def __init__(self, m, name):
        super().__init__(m, name)

    enabled = ContextProp(torch._C._get_mkldnn_enabled, torch._C._set_mkldnn_enabled)
    deterministic = ContextProp(
        torch._C._get_mkldnn_deterministic, torch._C._set_mkldnn_deterministic
    )
    allow_tf32 = ContextProp(
        torch._C._get_onednn_allow_tf32, torch._C._set_onednn_allow_tf32
    )


if TYPE_CHECKING:
    enabled: ContextProp
    deterministic: ContextProp
    allow_tf32: ContextProp

sys.modules[__name__] = MkldnnModule(sys.modules[__name__], __name__)
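# Because the module object is replaced by an ``MkldnnModule`` instance, the
# ``ContextProp`` descriptors expose the flags as plain module attributes.
# A minimal sketch of what that enables:
#
#     import torch
#
#     torch.backends.mkldnn.allow_tf32 = True   # routes to torch._C._set_onednn_allow_tf32
#     print(torch.backends.mkldnn.allow_tf32)   # routes to torch._C._get_onednn_allow_tf32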