Fixes jiterator cache macro include + updates CUDA note with cache variables (#71452)

Summary:
Per title.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/71452

Reviewed By: ngimel

Differential Revision: D33646495

Pulled By: mruberry

fbshipit-source-id: bbf627e6d7a724a83a3ea2ae9c0f50430f8d578e
(cherry picked from commit d1e72b144a)
Author: Mike Ruberry, 2022-01-18 19:38:51 -08:00 (committed by PyTorch MergeBot)
Parent: 125bdb6d51
Commit: 9b9b878c89
3 changed files with 25 additions and 3 deletions

aten/src/ATen/jit_macros.h

@@ -14,4 +14,6 @@
 // Currently unsupported on Windows
 #ifndef _WIN32
 #define BUILD_JITERATOR_WITH_CACHE true
+#else
+#define BUILD_JITERATOR_WITH_CACHE false
 #endif // _WIN32
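
Why this matters: with the new #else branch, BUILD_JITERATOR_WITH_CACHE is defined on every platform, as either true or false, so guards that only test for the macro's existence would now fire on Windows too. A minimal standalone illustration of the difference (not PyTorch source, just the preprocessor rule the fix below relies on):

    #define BUILD_JITERATOR_WITH_CACHE false  // as on Windows after this change

    #ifdef BUILD_JITERATOR_WITH_CACHE
    // Entered even here: #ifdef checks only that the macro is defined,
    // ignoring the value it expands to.
    #endif

    #if BUILD_JITERATOR_WITH_CACHE
    // Skipped: #if evaluates the macro's value (the C++ preprocessor treats
    // false as 0), so cache code is compiled only where the value is true.
    #endif

This is why the #ifdef guards in jit_utils below are switched to #if.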

aten/src/ATen/native/cuda/jit_utils.cpp

@@ -3,6 +3,7 @@
 #include <c10/util/hash.h>
 #include <c10/util/Optional.h>
 #include <c10/cuda/CUDACachingAllocator.h>
+#include <ATen/jit_macros.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <ATen/cuda/detail/OffsetCalculator.cuh>
 #include <ATen/code_template.h>
@@ -727,7 +728,7 @@ std::string generate_code(
 }
-#ifdef BUILD_JITERATOR_WITH_CACHE
+#if BUILD_JITERATOR_WITH_CACHE
 // Acquires (possibly creating) the kernel cache directory
 c10::optional<std::string> get_cache_dir() {
   // If the environment variable USE_PYTORCH_KERNEL_CACHE is set to "0" then no persistent cache is used
@@ -813,7 +814,7 @@ NvrtcFunction jit_pwise_function(
   NvrtcFunction compiled_kernel_;
   std::string name = kernel_name + "_kernel";
-#ifdef BUILD_JITERATOR_WITH_CACHE
+#if BUILD_JITERATOR_WITH_CACHE
   static const c10::optional<std::string> cache_dir = get_cache_dir();
   std::string file_path;
@@ -927,7 +928,7 @@ NvrtcFunction jit_pwise_function(
   // TODO: use guards to avoid leaking
   AT_CUDA_NVRTC_CHECK(nvrtc.nvrtcDestroyProgram(&program));
-#ifdef BUILD_JITERATOR_WITH_CACHE
+#if BUILD_JITERATOR_WITH_CACHE
   if (cache_dir.has_value()) {
     // Writes the program to the cache if caching
     // NOTE: Actually writes to a per-process temporary file to avoid multi-process contention.
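
Taken together with the comment above, the cache lookup amounts to a small amount of environment-variable logic. A simplified sketch of that resolution order (a hypothetical helper for illustration only; the real get_cache_dir also creates the directory and validates access, which is omitted here, and the exact precedence is an assumption):

    #include <cstdlib>
    #include <string>
    #include <c10/util/Optional.h>

    // Sketch of the resolution order, per the comment and the CUDA note below:
    //   1. USE_PYTORCH_KERNEL_CACHE=0 disables the persistent cache
    //   2. PYTORCH_KERNEL_CACHE_PATH overrides the cache location
    //   3. otherwise $XDG_CACHE_HOME/torch/kernels, else $HOME/.cache/torch/kernels
    c10::optional<std::string> resolve_cache_dir() {
      const char* use_cache = std::getenv("USE_PYTORCH_KERNEL_CACHE");
      if (use_cache != nullptr && std::string(use_cache) == "0") {
        return c10::nullopt;  // caching explicitly disabled
      }
      if (const char* path = std::getenv("PYTORCH_KERNEL_CACHE_PATH")) {
        return std::string(path);
      }
      if (const char* xdg = std::getenv("XDG_CACHE_HOME")) {
        return std::string(xdg) + "/torch/kernels";
      }
      if (const char* home = std::getenv("HOME")) {
        return std::string(home) + "/.cache/torch/kernels";
      }
      return c10::nullopt;  // no usable default location
    }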

docs/source/notes/cuda.rst

@@ -393,6 +393,25 @@ object or a device index, and access one of the above attributes. E.g., to set
 the capacity of the cache for device ``1``, one can write
 ``torch.backends.cuda.cufft_plan_cache[1].max_size = 10``.
 
+.. _cuda-just-in-time-compilation:
+
+Just-in-Time Compilation
+------------------------
+
+PyTorch just-in-time compiles some operations, like ``torch.special.zeta``, when
+they are performed on CUDA tensors. This compilation can be time consuming (up to
+a few seconds, depending on your hardware and software) and may occur multiple
+times for a single operator, since many PyTorch operators select from a variety
+of kernels depending on their input, each of which must be compiled once. Without
+a kernel cache, this compilation happens once per process; with one, just once.
+
+By default, PyTorch creates a kernel cache in ``$XDG_CACHE_HOME/torch/kernels``
+if ``XDG_CACHE_HOME`` is defined, and in ``$HOME/.cache/torch/kernels`` if it is
+not (except on Windows, where the kernel cache is not yet supported). Caching
+behavior can be controlled directly with two environment variables: if
+``USE_PYTORCH_KERNEL_CACHE`` is set to ``0``, no cache is used, and if
+``PYTORCH_KERNEL_CACHE_PATH`` is set, that path replaces the default location.
+
 Best practices
 --------------
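
In practice, the note above means a run can be pointed at a shared cache or opted out entirely at launch time. For example (the path and script name here are only illustrative), ``PYTORCH_KERNEL_CACHE_PATH=/tmp/torch_kernels python train.py`` keeps compiled kernels under ``/tmp/torch_kernels`` across processes, while ``USE_PYTORCH_KERNEL_CACHE=0 python train.py`` recompiles in every process.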