add Cuda{2D,3D}LaunchConfig that maximizes occupancy (#10032)

* add Cuda{2D,3D}LaunchConfig that max occupancy * remove default val, check input<=0 * add max size check * fix typo * tests, docs, and related changes * build the test * buildify * cudaOccupancy... call check success, and style fix
2025-12-07 12:20:24 +01:00 · 2017-06-06 15:33:15 -04:00 · 2017-06-06 15:33:15 -04:00 · b440abce7f
commit b440abce7f
parent 187d233374
4 changed files with 518 additions and 18 deletions
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@ -82,6 +82,7 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_test_mkl")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test_gpu")
 load("//tensorflow:tensorflow.bzl", "tf_cc_tests_gpu")
 load("//tensorflow:tensorflow.bzl", "tf_version_info_genrule")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_only_cc_test")
 # For platform specific build config
 load(
@ -2323,6 +2324,18 @@ tf_cc_test_gpu(
    ],
 )
 # Test target for util/cuda_kernel_helper_test.cu.cc, declared through the
 # tf_cuda_only_cc_test macro loaded from //tensorflow:tensorflow.bzl.
 tf_cuda_only_cc_test(
    name = "util_cuda_kernel_helper_test",
    srcs = [
        "util/cuda_kernel_helper_test.cu.cc",
    ],
    deps = [
        ":test",
        ":test_main",
        "//third_party/eigen3",
    ],
 )
 tf_cc_test_gpu(
    name = "memory_types_test",
    size = "small",
--- a/tensorflow/core/util/cuda_kernel_helper.h
+++ b/tensorflow/core/util/cuda_kernel_helper.h
@ -20,13 +20,95 @@ limitations under the License.
 #include <algorithm>
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/platform/default/logging.h"
 #include "tensorflow/core/platform/types.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 // Usage of GetCudaLaunchConfig, GetCuda2DLaunchConfig, and
 // GetCuda3DLaunchConfig:
 //
 // There are two versions of GetCudaLaunchConfig and GetCuda2DLaunchConfig, one
 // version uses heuristics without any knowledge of the device kernel, the other
 // version uses cudaOccupancyMaxPotentialBlockSize to determine the theoretical
 // launch parameters that maximize occupancy. Currently, only the maximum
 // occupancy version of GetCuda3DLaunchConfig is available.
 //
 // For large number of work elements, the convention is that each kernel would
 // iterate through its assigned range. The return value of GetCudaLaunchConfig
 // is struct CudaLaunchConfig, which contains all the information needed for the
 // kernel launch, including: virtual number of threads, the number of blocks
 // and the number of threads per block used inside <<< >>> of a kernel
 // launch. GetCuda2DLaunchConfig and GetCuda3DLaunchConfig do the same thing
 // as GetCudaLaunchConfig. The only difference is the dimension. The macros
 // CUDA_1D_KERNEL_LOOP and CUDA_AXIS_KERNEL_LOOP can be used for the inner loop.
 //
 /* Sample code:
 __global__ void MyKernel1D(CudaLaunchConfig config, other_args...) {
  CUDA_1D_KERNEL_LOOP(x, config.virtual_thread_count) {
    do_your_job_here;
  }
 }
 __global__ void MyKernel2D(Cuda2DLaunchConfig config, other_args...) {
  CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count, x) {
    CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count, y) {
      do_your_job_here;
    }
  }
 }
 __global__ void MyKernel3D(Cuda3DLaunchConfig config, other_args...) {
  CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count, x) {
    CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count, y) {
      CUDA_AXIS_KERNEL_LOOP(z, config.virtual_thread_count, z) {
        do_your_job_here;
      }
    }
  }
 }
 void MyDriverFunc(const GPUDevice &d) {
  // use heuristics
  CudaLaunchConfig cfg1 = GetCudaLaunchConfig(10240, d);
  MyKernel1D <<<cfg1.block_count,
                cfg1.thread_per_block, 0, d.stream()>>> (cfg1, other_args...);
  Cuda2DLaunchConfig cfg2 = GetCuda2DLaunchConfig(10240, 10240, d);
  MyKernel2D <<<cfg2.block_count,
                cfg2.thread_per_block, 0, d.stream()>>> (cfg2, other_args...);
  Cuda3DLaunchConfig cfg3 = GetCuda3DLaunchConfig(4096, 4096, 100, d);
  MyKernel3D <<<cfg3.block_count,
                cfg3.thread_per_block, 0, d.stream()>>> (cfg3, other_args...);
  // maximize occupancy
  CudaLaunchConfig cfg4 = GetCudaLaunchConfig(10240, d, MyKernel1D, 0, 0 );
  MyKernel1D <<<cfg4.block_count,
                cfg4.thread_per_block, 0, d.stream()>>> (cfg4, other_args...);
  Cuda2DLaunchConfig cfg5 = GetCuda2DLaunchConfig(10240, 10240, d,
                                                  MyKernel2D, 0, 0);
  MyKernel2D <<<cfg5.block_count,
                cfg5.thread_per_block, 0, d.stream()>>> (cfg5, other_args...);
  Cuda3DLaunchConfig cfg6 = GetCuda3DLaunchConfig(4096, 4096, 100, d,
                                                  MyKernel3D, 0, 0);
  MyKernel3D <<<cfg6.block_count,
                cfg6.thread_per_block, 0, d.stream()>>> (cfg6, other_args...);
 }
 // See the test for this for more example:
 // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
 */
 // Loop header for kernels: declares `int i`, starts it at this thread's global
 // x index (blockIdx.x * blockDim.x + threadIdx.x) and advances it by
 // blockDim.x * gridDim.x while i < n. `n` is parenthesized so that expression
 // arguments (for example `a ? b : c`) keep their intended meaning.
 #define CUDA_1D_KERNEL_LOOP(i, n)                              \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)
 // Per-axis variant of the kernel loop header: `axis` is one of x, y or z and
 // selects the matching component of blockIdx/blockDim/threadIdx/gridDim as
 // well as of `n` (an object with .x/.y/.z members, e.g. a dim3). `n` is
 // parenthesized so that a non-trivial expression can be passed safely.
 #define CUDA_AXIS_KERNEL_LOOP(i, n, axis)                                    \
  for (int i = blockIdx.axis * blockDim.axis + threadIdx.axis; i < (n).axis; \
       i += blockDim.axis * gridDim.axis)
 // Integer ceiling division, intended for non-negative `a` and positive `b`.
 // Written as quotient plus a remainder test so that the intermediate sum
 // a + b - 1 is never formed and cannot overflow when `a` is close to INT_MAX.
 // Note that both arguments are evaluated more than once.
 #define DIV_UP(a, b) ((a) / (b) + (((a) % (b)) != 0 ? 1 : 0))
 namespace tensorflow {
 typedef Eigen::GpuDevice GPUDevice;
@ -47,16 +129,22 @@ struct CudaLaunchConfig {
 // memory-limited.
 inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
                                            const GPUDevice& d) {
  CudaLaunchConfig config;
  // in case of invalid input, return the default value config, which has all -1
  if (work_element_count <= 0) {
    return config;
  }
  const int virtual_thread_count = work_element_count;
  const int physical_thread_count = std::min(
      d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor(),
      virtual_thread_count);
  const int thread_per_block = std::min(1024, d.maxCudaThreadsPerBlock());
-  const int block_count = std::min(
+  const int block_count =
-      (physical_thread_count + thread_per_block - 1) / thread_per_block,
+      std::min(DIV_UP(physical_thread_count, thread_per_block),
-      d.getNumCudaMultiProcessors());
+               d.getNumCudaMultiProcessors());
  CudaLaunchConfig config;
  config.virtual_thread_count = virtual_thread_count;
  config.thread_per_block = thread_per_block;
  config.block_count = block_count;
@ -70,16 +158,23 @@ inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
                                            const GPUDevice& d, DeviceFunc func,
                                            size_t dynamic_shared_memory_size,
                                            int block_size_limit) {
  CudaLaunchConfig config;
  if (work_element_count <= 0) {
    return config;
  }
  int block_count = 0;
  int thread_per_block = 0;
  cudaOccupancyMaxPotentialBlockSize(&block_count, &thread_per_block, func,
                                     dynamic_shared_memory_size,
                                     block_size_limit);
  block_count =
      std::min(block_count,
               (work_element_count + thread_per_block - 1) / thread_per_block);
-  CudaLaunchConfig config;
+  cudaError_t err = cudaOccupancyMaxPotentialBlockSize(
      &block_count, &thread_per_block, func, dynamic_shared_memory_size,
      block_size_limit);
  CHECK_EQ(err, cudaSuccess);
  block_count =
      std::min(block_count, DIV_UP(work_element_count, thread_per_block));
  config.virtual_thread_count = work_element_count;
  config.thread_per_block = thread_per_block;
  config.block_count = block_count;
@ -87,16 +182,18 @@ inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
 }
 struct Cuda2DLaunchConfig {
-  dim3 virtual_thread_count;
+  dim3 virtual_thread_count = dim3(0, 0, 0);
-  dim3 thread_per_block;
+  dim3 thread_per_block = dim3(0, 0, 0);
-  dim3 block_count;
+  dim3 block_count = dim3(0, 0, 0);
 };
 inline Cuda2DLaunchConfig GetCuda2DLaunchConfig(int xdim, int ydim,
                                                const GPUDevice& d) {
  Cuda2DLaunchConfig config;
-  config.virtual_thread_count = dim3(xdim, ydim, 1);
+  if (xdim <= 0 || ydim <= 0) {
    return config;
  }
  const int kThreadsPerBlock = 256;
  int block_cols = std::min(xdim, kThreadsPerBlock);
@ -108,16 +205,78 @@ inline Cuda2DLaunchConfig GetCuda2DLaunchConfig(int xdim, int ydim,
  const int max_blocks = std::max(physical_thread_count / kThreadsPerBlock, 1);
  config.virtual_thread_count = dim3(xdim, ydim, 1);
  config.thread_per_block = dim3(block_cols, block_rows, 1);
-  int grid_x = std::min((xdim + block_cols - 1) / block_cols, max_blocks);
+  int grid_x = std::min(DIV_UP(xdim, block_cols), max_blocks);
  config.block_count = dim3(
      grid_x, std::min(max_blocks / grid_x, std::max(ydim / block_rows, 1)), 1);
  return config;
 }
 // Calculate the Cuda 2D and 3D launch config we should use for a kernel launch.
 // This variant takes the resource limits of func into account to maximize
 // occupancy.
 using Cuda3DLaunchConfig = Cuda2DLaunchConfig;
 // Computes a 3D launch configuration for `func` from the block size and grid
 // size suggested by cudaOccupancyMaxPotentialBlockSize.
 //
 // xdim, ydim, zdim: extent of the work in each dimension. If any of them is
 //   <= 0 the default-constructed config is returned unchanged.
 // d: not consulted here; device limits are queried for the device returned by
 //   cudaGetDevice.
 // func, dynamic_shared_memory_size, block_size_limit: forwarded to
 //   cudaOccupancyMaxPotentialBlockSize.
 //
 // The suggested threads-per-block and block count are distributed over x first,
 // then y, then z, and each component is clamped to the work extent and to the
 // device's maxThreadsDim / maxGridSize. Any failing CUDA runtime call aborts
 // via CHECK_EQ, so device limits are never read from an unfilled struct.
 template <typename DeviceFunc>
 inline Cuda3DLaunchConfig GetCuda3DLaunchConfig(
    int xdim, int ydim, int zdim, const GPUDevice& d, DeviceFunc func,
    size_t dynamic_shared_memory_size, int block_size_limit) {
  Cuda3DLaunchConfig config;
  if (xdim <= 0 || ydim <= 0 || zdim <= 0) {
    return config;
  }
  int dev = 0;
  cudaError_t err = cudaGetDevice(&dev);
  CHECK_EQ(err, cudaSuccess);
  cudaDeviceProp deviceProp;
  err = cudaGetDeviceProperties(&deviceProp, dev);
  CHECK_EQ(err, cudaSuccess);
  const int xthreadlimit = deviceProp.maxThreadsDim[0];
  const int ythreadlimit = deviceProp.maxThreadsDim[1];
  const int zthreadlimit = deviceProp.maxThreadsDim[2];
  const int xgridlimit = deviceProp.maxGridSize[0];
  const int ygridlimit = deviceProp.maxGridSize[1];
  const int zgridlimit = deviceProp.maxGridSize[2];
  int block_count = 0;
  int thread_per_block = 0;
  err = cudaOccupancyMaxPotentialBlockSize(
      &block_count, &thread_per_block, func, dynamic_shared_memory_size,
      block_size_limit);
  CHECK_EQ(err, cudaSuccess);
  // Smallest of three ints; a lambda avoids a function-local macro.
  const auto min3 = [](int a, int b, int c) {
    return std::min(a, std::min(b, c));
  };
  // Spend the per-block thread budget on x, then hand what is left to y and z
  // (never less than one thread per axis).
  const int threadsx = min3(xdim, thread_per_block, xthreadlimit);
  const int threadsy =
      min3(ydim, std::max(thread_per_block / threadsx, 1), ythreadlimit);
  const int threadsz =
      min3(zdim, std::max(thread_per_block / (threadsx * threadsy), 1),
           zthreadlimit);
  // Same scheme for the suggested number of blocks.
  const int blocksx = min3(block_count, DIV_UP(xdim, threadsx), xgridlimit);
  const int blocksy =
      min3(DIV_UP(block_count, blocksx), DIV_UP(ydim, threadsy), ygridlimit);
  const int blocksz = min3(DIV_UP(block_count, (blocksx * blocksy)),
                           DIV_UP(zdim, threadsz), zgridlimit);
  config.virtual_thread_count = dim3(xdim, ydim, zdim);
  config.thread_per_block = dim3(threadsx, threadsy, threadsz);
  config.block_count = dim3(blocksx, blocksy, blocksz);
  return config;
 }
 // Occupancy-based 2D launch configuration: delegates to GetCuda3DLaunchConfig
 // with a z extent of 1 and returns its result.
 template <typename DeviceFunc>
 inline Cuda2DLaunchConfig GetCuda2DLaunchConfig(
    int xdim, int ydim, const GPUDevice& d, DeviceFunc func,
    size_t dynamic_shared_memory_size, int block_size_limit) {
  const int kZDim = 1;
  Cuda2DLaunchConfig config_2d =
      GetCuda3DLaunchConfig(xdim, ydim, kZDim, d, func,
                            dynamic_shared_memory_size, block_size_limit);
  return config_2d;
 }
 namespace gpu {
 template <typename IntType>
@ -511,6 +670,8 @@ __device__ EIGEN_ALWAYS_INLINE double CudaShuffleXor(double value, int laneMask,
 }  // namespace tensorflow
 #undef DIV_UP
 #endif  // GOOGLE_CUDA
 #endif  // TENSORFLOW_CORE_UTIL_CUDA_KERNEL_HELPER_H_
--- a/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
+++ b/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
@ -0,0 +1,303 @@
 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #if GOOGLE_CUDA
 #define EIGEN_USE_GPU
 #include <numeric>
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
 // Calls cudaDeviceSynchronize(), then reads cudaGetLastError() and reports a
 // non-fatal gtest failure (with the CUDA error string) unless it is
 // cudaSuccess. Expands to a braced block, so no trailing semicolon is needed.
 #define CUDA_EXPECT_SUCCESS                                       \
  {                                                               \
    cudaDeviceSynchronize();                                      \
    const cudaError_t status = cudaGetLastError();                \
    EXPECT_EQ(cudaSuccess, status) << cudaGetErrorString(status); \
  }
 // Calls cudaDeviceSynchronize(), then reads cudaGetLastError() and raises a
 // fatal gtest failure (with the CUDA error string) unless it is cudaSuccess.
 // Expands to a braced block, so no trailing semicolon is needed.
 #define CUDA_ASSERT_SUCCESS                                       \
  {                                                               \
    cudaDeviceSynchronize();                                      \
    const cudaError_t status = cudaGetLastError();                \
    ASSERT_EQ(cudaSuccess, status) << cudaGetErrorString(status); \
  }
 namespace tensorflow {
 namespace {
 // Writes 0 to outbuf[i] for every i below config.virtual_thread_count.
 __global__ void SetOutbufZero(CudaLaunchConfig config, int* outbuf) {
  CUDA_1D_KERNEL_LOOP(i, config.virtual_thread_count) {
    outbuf[i] = 0;
  }
 }
 // Counts visited work items: each loop iteration atomically adds 1 to the
 // outbuf slot selected by the item index modulo bufsize.
 __global__ void Count1D(CudaLaunchConfig config, int bufsize, int* outbuf) {
  CUDA_1D_KERNEL_LOOP(item, config.virtual_thread_count) {
    // The index might overflow when testing extreme cases; stop if it did.
    if (item < 0) break;
    int* slot = &outbuf[item % bufsize];
    atomicAdd(slot, 1);
  }
 }
 // 2D counterpart of Count1D: for every visited (x, y) pair, atomically adds 1
 // to the outbuf slot chosen by the flattened index modulo bufsize.
 __global__ void Count2D(Cuda2DLaunchConfig config, int bufsize, int* outbuf) {
  CUDA_AXIS_KERNEL_LOOP(ix, config.virtual_thread_count, x) {
    // An index might overflow when testing extreme cases; stop if it did.
    if (ix < 0) break;
    CUDA_AXIS_KERNEL_LOOP(iy, config.virtual_thread_count, y) {
      if (iy < 0) break;
      int flat = ix * config.virtual_thread_count.y + iy;
      int* slot = &outbuf[flat % bufsize];
      atomicAdd(slot, 1);
    }
  }
 }
 // 3D counterpart of Count1D: for every visited (x, y, z) triple, atomically
 // adds 1 to the outbuf slot chosen by the flattened index modulo bufsize.
 __global__ void Count3D(Cuda3DLaunchConfig config, int bufsize, int* outbuf) {
  CUDA_AXIS_KERNEL_LOOP(ix, config.virtual_thread_count, x) {
    // An index might overflow when testing extreme cases; stop if it did.
    if (ix < 0) break;
    CUDA_AXIS_KERNEL_LOOP(iy, config.virtual_thread_count, y) {
      if (iy < 0) break;
      CUDA_AXIS_KERNEL_LOOP(iz, config.virtual_thread_count, z) {
        if (iz < 0) break;
        int flat =
            ix * config.virtual_thread_count.y * config.virtual_thread_count.z +
            iy * config.virtual_thread_count.z + iz;
        int* slot = &outbuf[flat % bufsize];
        atomicAdd(slot, 1);
      }
    }
  }
 }
 }  // namespace
 // Test fixture shared by the launch-config tests. It owns a buffer of
 // `bufsize` ints allocated with cudaMallocManaged and a GPUDevice built on an
 // Eigen::CudaStreamDevice.
 class CudaLaunchConfigTest : public ::testing::Test {
 protected:
  // Number of int slots in outbuf; the counting kernels index it modulo this.
  const int bufsize = 1024;
  // Buffer allocated in SetUp() and released in TearDown().
  int* outbuf = nullptr;
  // `stream` is declared before `d` because `d` is initialized from &stream.
  Eigen::CudaStreamDevice stream;
  GPUDevice d = GPUDevice(&stream);
  // Allocates outbuf; a failed allocation is a fatal test failure.
  virtual void SetUp() {
    cudaError_t err = cudaMallocManaged(&outbuf, sizeof(int) * bufsize);
    ASSERT_EQ(cudaSuccess, err) << cudaGetErrorString(err);
  }
  // Synchronizes the device before freeing outbuf.
  virtual void TearDown() {
    cudaDeviceSynchronize();
    cudaFree(outbuf);
    outbuf = nullptr;
  }
 };
 // Exercises both GetCudaLaunchConfig overloads (heuristic and
 // occupancy-based). Non-positive work counts must return the default config;
 // for valid counts the total number of atomic increments performed by Count1D
 // must equal the requested work count.
 TEST_F(CudaLaunchConfigTest, GetCudaLaunchConfig) {
  CudaLaunchConfig cfg;
  // test invalid inputs
  CudaLaunchConfig default_value;
  cfg = GetCudaLaunchConfig(0, d);
  EXPECT_EQ(default_value.virtual_thread_count, cfg.virtual_thread_count);
  EXPECT_EQ(default_value.block_count, cfg.block_count);
  EXPECT_EQ(default_value.thread_per_block, cfg.thread_per_block);
  cfg = GetCudaLaunchConfig(-1, d);
  EXPECT_EQ(default_value.virtual_thread_count, cfg.virtual_thread_count);
  EXPECT_EQ(default_value.block_count, cfg.block_count);
  EXPECT_EQ(default_value.thread_per_block, cfg.thread_per_block);
  cfg = GetCudaLaunchConfig(0, d, Count1D, 0, 0);
  EXPECT_EQ(default_value.virtual_thread_count, cfg.virtual_thread_count);
  EXPECT_EQ(default_value.block_count, cfg.block_count);
  EXPECT_EQ(default_value.thread_per_block, cfg.thread_per_block);
  cfg = GetCudaLaunchConfig(-1, d, Count1D, 0, 0);
  EXPECT_EQ(default_value.virtual_thread_count, cfg.virtual_thread_count);
  EXPECT_EQ(default_value.block_count, cfg.block_count);
  EXPECT_EQ(default_value.thread_per_block, cfg.thread_per_block);
  // test valid inputs
  // Each expansion zeroes outbuf, launches Count1D with the config under test
  // and compares the sum over outbuf with the work count; it does this once
  // for the heuristic overload and once for the occupancy-based overload.
  #define TEST_LAUNCH_PARAMETER(work_element_count)                             \
    cfg = GetCudaLaunchConfig(bufsize, d);                                      \
    SetOutbufZero<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>     \
                                                                (cfg, outbuf);  \
    CUDA_ASSERT_SUCCESS                                                         \
    cfg = GetCudaLaunchConfig(work_element_count, d);                           \
    Count1D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>> (         \
        cfg, bufsize, outbuf);                                                  \
    CUDA_EXPECT_SUCCESS                                                         \
    EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0));\
                                                                                \
    cfg = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);                 \
    SetOutbufZero<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>     \
                                                                (cfg, outbuf);  \
    CUDA_ASSERT_SUCCESS                                                         \
    cfg = GetCudaLaunchConfig(work_element_count, d, Count1D, 0, 0);            \
    Count1D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>> (         \
        cfg, bufsize, outbuf);                                                  \
    CUDA_EXPECT_SUCCESS                                                         \
    EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0))
  TEST_LAUNCH_PARAMETER(128);
  TEST_LAUNCH_PARAMETER(129);
  TEST_LAUNCH_PARAMETER(511);
  TEST_LAUNCH_PARAMETER(512);
  TEST_LAUNCH_PARAMETER(2048);
  TEST_LAUNCH_PARAMETER(2049);
  TEST_LAUNCH_PARAMETER(8191);
  TEST_LAUNCH_PARAMETER(8192);
  TEST_LAUNCH_PARAMETER(123456);
  // Largest case: 2^30 elements. The earlier spelling `1 << 31 - 1` parses as
  // `1 << (31 - 1)`, i.e. this same value and not the maximum int.
  TEST_LAUNCH_PARAMETER((1 << 30));
  #undef TEST_LAUNCH_PARAMETER
 }
 // Component-wise equality for Cuda2DLaunchConfig (Cuda3DLaunchConfig is an
 // alias of the same struct). All three members are compared:
 // thread_per_block, block_count and virtual_thread_count. The previous version
 // compared thread_per_block twice and never looked at virtual_thread_count.
 bool operator==(const Cuda2DLaunchConfig& a, const Cuda2DLaunchConfig& b) {
  return a.thread_per_block.x == b.thread_per_block.x &&
         a.thread_per_block.y == b.thread_per_block.y &&
         a.thread_per_block.z == b.thread_per_block.z &&
         a.block_count.x == b.block_count.x &&
         a.block_count.y == b.block_count.y &&
         a.block_count.z == b.block_count.z &&
         a.virtual_thread_count.x == b.virtual_thread_count.x &&
         a.virtual_thread_count.y == b.virtual_thread_count.y &&
         a.virtual_thread_count.z == b.virtual_thread_count.z;
 }
 // Exercises both GetCuda2DLaunchConfig overloads (heuristic and
 // occupancy-based). Any non-positive dimension must return the default
 // config; for valid dimensions the total number of atomic increments
 // performed by Count2D must equal dimx * dimy.
 TEST_F(CudaLaunchConfigTest, GetCuda2DLaunchConfig) {
  Cuda2DLaunchConfig cfg;
  CudaLaunchConfig cfg1d;
  // test invalid inputs
  Cuda2DLaunchConfig default_value;
  cfg = GetCuda2DLaunchConfig(1, 0, d);
  EXPECT_EQ(default_value, cfg);
  cfg = GetCuda2DLaunchConfig(1, -1, d);
  EXPECT_EQ(default_value, cfg);
  // (0, 1) replaces a duplicated (-1, 1) case so that a zero x extent is
  // covered as well.
  cfg = GetCuda2DLaunchConfig(0, 1, d);
  EXPECT_EQ(default_value, cfg);
  cfg = GetCuda2DLaunchConfig(-1, 1, d);
  EXPECT_EQ(default_value, cfg);
  cfg = GetCuda2DLaunchConfig(0, -1, d);
  EXPECT_EQ(default_value, cfg);
  cfg = GetCuda2DLaunchConfig(0, 0, d);
  EXPECT_EQ(default_value, cfg);
  cfg = GetCuda2DLaunchConfig(1, 0, d, Count2D, 0, 0);
  EXPECT_EQ(default_value, cfg);
  cfg = GetCuda2DLaunchConfig(1, -1, d, Count2D, 0, 0);
  EXPECT_EQ(default_value, cfg);
  cfg = GetCuda2DLaunchConfig(0, 1, d, Count2D, 0, 0);
  EXPECT_EQ(default_value, cfg);
  cfg = GetCuda2DLaunchConfig(-1, 1, d, Count2D, 0, 0);
  EXPECT_EQ(default_value, cfg);
  cfg = GetCuda2DLaunchConfig(0, -1, d, Count2D, 0, 0);
  EXPECT_EQ(default_value, cfg);
  cfg = GetCuda2DLaunchConfig(0, 0, d, Count2D, 0, 0);
  EXPECT_EQ(default_value, cfg);
  // test valid inputs
  // Each expansion zeroes outbuf, launches Count2D with the config under test
  // and compares the sum over outbuf with dimx * dimy; it does this once for
  // the heuristic overload and once for the occupancy-based overload.
  #define TEST_LAUNCH_PARAMETER(dimx, dimy)                                     \
    cfg1d = GetCudaLaunchConfig(bufsize, d);                                    \
    SetOutbufZero<<<cfg1d.block_count, cfg1d.thread_per_block, 0, d.stream()>>> \
                                                                (cfg1d, outbuf);\
    CUDA_ASSERT_SUCCESS                                                         \
    cfg = GetCuda2DLaunchConfig(dimx, dimy, d);                                 \
    Count2D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>> (         \
        cfg, bufsize, outbuf);                                                  \
    CUDA_EXPECT_SUCCESS                                                         \
    EXPECT_EQ(dimx * dimy, std::accumulate(outbuf, outbuf + bufsize, 0));       \
                                                                                \
    cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);               \
    SetOutbufZero<<<cfg1d.block_count, cfg1d.thread_per_block, 0, d.stream()>>> \
                                                                (cfg1d, outbuf);\
    CUDA_ASSERT_SUCCESS                                                         \
    cfg = GetCuda2DLaunchConfig(dimx, dimy, d, Count2D, 0, 0);                  \
    Count2D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>> (         \
        cfg, bufsize, outbuf);                                                  \
    CUDA_EXPECT_SUCCESS                                                         \
    EXPECT_EQ(dimx * dimy, std::accumulate(outbuf, outbuf + bufsize, 0))
  TEST_LAUNCH_PARAMETER(128, 128);
  TEST_LAUNCH_PARAMETER(129, 64);
  TEST_LAUNCH_PARAMETER(511, 2048);
  TEST_LAUNCH_PARAMETER(512, 512);
  TEST_LAUNCH_PARAMETER(2048, 1024);
  TEST_LAUNCH_PARAMETER(2049, 32);
  TEST_LAUNCH_PARAMETER(8191, 1);
  TEST_LAUNCH_PARAMETER(8192, 10);
  TEST_LAUNCH_PARAMETER(123456, 12);
  // Largest cases: 2^30 along one axis. The earlier spelling `1 << 31 - 1`
  // parses as `1 << (31 - 1)`, i.e. this same value.
  TEST_LAUNCH_PARAMETER(1, (1 << 30));
  TEST_LAUNCH_PARAMETER((1 << 30), 1);
  #undef TEST_LAUNCH_PARAMETER
 }
 // Exercises the occupancy-based GetCuda3DLaunchConfig. Any non-positive
 // dimension must return the default config; for valid dimensions the total
 // number of atomic increments performed by Count3D must equal
 // dimx * dimy * dimz.
 TEST_F(CudaLaunchConfigTest, GetCuda3DLaunchConfig) {
  Cuda3DLaunchConfig cfg;
  CudaLaunchConfig cfg1d;
  // test invalid inputs
  Cuda3DLaunchConfig default_value;
  cfg = GetCuda3DLaunchConfig(0, 1, 1, d, Count3D, 0, 0);
  EXPECT_EQ(default_value, cfg);
  cfg = GetCuda3DLaunchConfig(-1, 1, 1, d, Count3D, 0, 0);
  EXPECT_EQ(default_value, cfg);
  cfg = GetCuda3DLaunchConfig(1, 0, 1, d, Count3D, 0, 0);
  EXPECT_EQ(default_value, cfg);
  cfg = GetCuda3DLaunchConfig(1, -1, 1, d, Count3D, 0, 0);
  EXPECT_EQ(default_value, cfg);
  cfg = GetCuda3DLaunchConfig(1, 1, 0, d, Count3D, 0, 0);
  EXPECT_EQ(default_value, cfg);
  cfg = GetCuda3DLaunchConfig(1, 1, -1, d, Count3D, 0, 0);
  EXPECT_EQ(default_value, cfg);
  cfg = GetCuda3DLaunchConfig(0, 0, 0, d, Count3D, 0, 0);
  EXPECT_EQ(default_value, cfg);
  cfg = GetCuda3DLaunchConfig(-1, -1, -1, d, Count3D, 0, 0);
  EXPECT_EQ(default_value, cfg);
  // test valid inputs
  // Each expansion zeroes outbuf, launches Count3D with the config under test
  // and compares the sum over outbuf with dimx * dimy * dimz.
  #define TEST_LAUNCH_PARAMETER(dimx, dimy, dimz)                               \
    cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);               \
    SetOutbufZero<<<cfg1d.block_count, cfg1d.thread_per_block, 0, d.stream()>>> \
                                                                (cfg1d, outbuf);\
    CUDA_ASSERT_SUCCESS                                                         \
    cfg = GetCuda3DLaunchConfig(dimx, dimy, dimz, d, Count3D, 0, 0);            \
    Count3D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>> (         \
        cfg, bufsize, outbuf);                                                  \
    CUDA_EXPECT_SUCCESS                                                         \
    EXPECT_EQ(dimx * dimy * dimz, std::accumulate(outbuf, outbuf + bufsize, 0))
  TEST_LAUNCH_PARAMETER(128, 128, 128);
  TEST_LAUNCH_PARAMETER(129, 64, 1024);
  TEST_LAUNCH_PARAMETER(511, 2048, 128);
  TEST_LAUNCH_PARAMETER(512, 512, 64);
  TEST_LAUNCH_PARAMETER(2048, 1024, 128);
  TEST_LAUNCH_PARAMETER(2049, 32, 1024);
  TEST_LAUNCH_PARAMETER(8191, 1, 1024);
  TEST_LAUNCH_PARAMETER(8192, 10, 32);
  TEST_LAUNCH_PARAMETER(123456, 12, 21);
  // Largest cases: 2^30 along one axis. The earlier spelling `1 << 31 - 1`
  // parses as `1 << (31 - 1)`, i.e. this same value.
  TEST_LAUNCH_PARAMETER(1, 1, (1 << 30));
  TEST_LAUNCH_PARAMETER(1, (1 << 30), 1);
  TEST_LAUNCH_PARAMETER((1 << 30), 1, 1);
  #undef TEST_LAUNCH_PARAMETER
 }
 }  // namespace tensorflow
 #endif  // GOOGLE_CUDA
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@ -461,6 +461,29 @@ def tf_cuda_cc_test(name,
      linkopts=linkopts,
      args=args)
 # Declares a single native.cc_test named "<name>_gpu". It is compiled with
 # _cuda_copts() + tf_copts(), and under if_cuda it additionally depends on
 # //tensorflow/core:cuda and //tensorflow/core:gpu_lib.
 def tf_cuda_only_cc_test(name,
                    srcs=[],
                    deps=[],
                    tags=[],
                    data=[],
                    size="medium",
                    linkstatic=0,
                    args=[],
                    linkopts=[]):
  gpu_test_name = "%s%s" % (name, "_gpu")
  cuda_only_deps = if_cuda([
      clean_dep("//tensorflow/core:cuda"),
      clean_dep("//tensorflow/core:gpu_lib"),
  ])
  base_linkopts = ["-lpthread", "-lm"]
  native.cc_test(
    name=gpu_test_name,
    srcs=srcs,
    size=size,
    args=args,
    copts=_cuda_copts() + tf_copts(),
    data=data,
    deps=deps + cuda_only_deps,
    linkopts=base_linkopts + linkopts,
    linkstatic=linkstatic,
    tags=tags)
 # Create a cc_test for each of the tensorflow tests listed in "tests"
 def tf_cc_tests(srcs,