Fixes jiterator cache macro include + updates CUDA note with cache variables (#71452)

Summary:
Per title.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/71452

Reviewed By: ngimel

Differential Revision: D33646495

Pulled By: mruberry

fbshipit-source-id: bbf627e6d7a724a83a3ea2ae9c0f50430f8d578e
(cherry picked from commit d1e72b144a)
Author: Mike Ruberry, 2022-01-18 19:38:51 -08:00 (committed by PyTorch MergeBot)
Parent: 125bdb6d51
Commit: 9b9b878c89
3 changed files with 25 additions and 3 deletions

aten/src/ATen/jit_macros.h

@@ -14,4 +14,6 @@
 // Currently unsupported on Windows
 #ifndef _WIN32
 #define BUILD_JITERATOR_WITH_CACHE true
+#else
+#define BUILD_JITERATOR_WITH_CACHE false
 #endif // _WIN32
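
Why this matters: with the new #else branch, BUILD_JITERATOR_WITH_CACHE is defined on every platform, as either true or false, so guards that only test for the macro's existence would now fire on Windows too. A minimal standalone illustration of the difference (not PyTorch source, just the preprocessor rule the fix below relies on):

    #define BUILD_JITERATOR_WITH_CACHE false  // as on Windows after this change

    #ifdef BUILD_JITERATOR_WITH_CACHE
    // Entered even here: #ifdef checks only that the macro is defined,
    // ignoring the value it expands to.
    #endif

    #if BUILD_JITERATOR_WITH_CACHE
    // Skipped: #if evaluates the macro's value (the C++ preprocessor treats
    // false as 0), so cache code is compiled only where the value is true.
    #endif

This is why the #ifdef guards in jit_utils below are switched to #if.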

aten/src/ATen/native/cuda/jit_utils.cpp

@@ -3,6 +3,7 @@
 #include <c10/util/hash.h>
 #include <c10/util/Optional.h>
 #include <c10/cuda/CUDACachingAllocator.h>
+#include <ATen/jit_macros.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <ATen/cuda/detail/OffsetCalculator.cuh>
 #include <ATen/code_template.h>
@@ -727,7 +728,7 @@ std::string generate_code(
 }
-#ifdef BUILD_JITERATOR_WITH_CACHE
+#if BUILD_JITERATOR_WITH_CACHE
 // Acquires (possibly creating) the kernel cache directory
 c10::optional<std::string> get_cache_dir() {
   // If the environment variable USE_PYTORCH_KERNEL_CACHE is set to "0" then no persistent cache is used
@@ -813,7 +814,7 @@ NvrtcFunction jit_pwise_function(
   NvrtcFunction compiled_kernel_;
   std::string name = kernel_name + "_kernel";
-#ifdef BUILD_JITERATOR_WITH_CACHE
+#if BUILD_JITERATOR_WITH_CACHE
   static const c10::optional<std::string> cache_dir = get_cache_dir();
   std::string file_path;
@@ -927,7 +928,7 @@ NvrtcFunction jit_pwise_function(
   // TODO: use guards to avoid leaking
   AT_CUDA_NVRTC_CHECK(nvrtc.nvrtcDestroyProgram(&program));
-#ifdef BUILD_JITERATOR_WITH_CACHE
+#if BUILD_JITERATOR_WITH_CACHE
   if (cache_dir.has_value()) {
     // Writes the program to the cache if caching
     // NOTE: Actually writes to a per-process temporary file to avoid multi-process contention.
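
Taken together with the comment above, the cache lookup amounts to a small amount of environment-variable logic. A simplified sketch of that resolution order (a hypothetical helper for illustration only; the real get_cache_dir also creates the directory and validates access, which is omitted here, and the exact precedence is an assumption):

    #include <cstdlib>
    #include <string>
    #include <c10/util/Optional.h>

    // Sketch of the resolution order, per the comment and the CUDA note below:
    //   1. USE_PYTORCH_KERNEL_CACHE=0 disables the persistent cache
    //   2. PYTORCH_KERNEL_CACHE_PATH overrides the cache location
    //   3. otherwise $XDG_CACHE_HOME/torch/kernels, else $HOME/.cache/torch/kernels
    c10::optional<std::string> resolve_cache_dir() {
      const char* use_cache = std::getenv("USE_PYTORCH_KERNEL_CACHE");
      if (use_cache != nullptr && std::string(use_cache) == "0") {
        return c10::nullopt;  // caching explicitly disabled
      }
      if (const char* path = std::getenv("PYTORCH_KERNEL_CACHE_PATH")) {
        return std::string(path);
      }
      if (const char* xdg = std::getenv("XDG_CACHE_HOME")) {
        return std::string(xdg) + "/torch/kernels";
      }
      if (const char* home = std::getenv("HOME")) {
        return std::string(home) + "/.cache/torch/kernels";
      }
      return c10::nullopt;  // no usable default location
    }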

docs/source/notes/cuda.rst

@@ -393,6 +393,25 @@ object or a device index, and access one of the above attributes. E.g., to set
 the capacity of the cache for device ``1``, one can write
 ``torch.backends.cuda.cufft_plan_cache[1].max_size = 10``.
 
+.. _cuda-just-in-time-compilation:
+
+Just-in-Time Compilation
+------------------------
+
+PyTorch just-in-time compiles some operations, like ``torch.special.zeta``, when
+they are performed on CUDA tensors. This compilation can be time consuming (up to
+a few seconds, depending on your hardware and software) and may occur multiple
+times for a single operator, since many PyTorch operators select from a variety
+of kernels depending on their input, each of which must be compiled once. Without
+a kernel cache, this compilation happens once per process; with one, just once.
+
+By default, PyTorch creates a kernel cache in ``$XDG_CACHE_HOME/torch/kernels``
+if ``XDG_CACHE_HOME`` is defined, and in ``$HOME/.cache/torch/kernels`` if it is
+not (except on Windows, where the kernel cache is not yet supported). Caching
+behavior can be controlled directly with two environment variables: if
+``USE_PYTORCH_KERNEL_CACHE`` is set to ``0``, no cache is used, and if
+``PYTORCH_KERNEL_CACHE_PATH`` is set, that path replaces the default location.
+
 Best practices
 --------------
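
In practice, the note above means a run can be pointed at a shared cache or opted out entirely at launch time. For example (the path and script name here are only illustrative), ``PYTORCH_KERNEL_CACHE_PATH=/tmp/torch_kernels python train.py`` keeps compiled kernels under ``/tmp/torch_kernels`` across processes, while ``USE_PYTORCH_KERNEL_CACHE=0 python train.py`` recompiles in every process.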