Mirror of https://github.com/zebrajr/tensorflow.git (synced 2025-12-06 12:20:11 +01:00)

commit fc6cd33c33 (parent 2c164ed32f)

    Move contrib/nccl to core/nccl.

    PiperOrigin-RevId: 218908694
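For callers, the substance of this change is an import-path move; the public NCCL helpers (all_sum, all_prod, all_min, all_max, reduce_sum, broadcast) keep their signatures. A minimal before/after sketch of a call site, assuming a CUDA build of TensorFlow at this commit with at least two visible GPUs (the tensors and device count here are hypothetical):

    # Old import, removed by this commit:
    #   from tensorflow.contrib import nccl
    # New import, which every call site updated below now uses:
    from tensorflow.python.framework import ops
    from tensorflow.python.ops import array_ops
    from tensorflow.python.ops import nccl_ops

    per_gpu = []
    for i in range(2):
      # The device placement of each input decides which GPUs participate.
      with ops.device('/device:GPU:%d' % i):
        per_gpu.append(array_ops.identity([1.0, 2.0]))

    summed = nccl_ops.all_sum(per_gpu)  # one all-reduced tensor per input device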
@@ -1,6 +1,7 @@
 # Where component owners are known, add them here.

 /tenosrflow/core/debug @caisq
+/tensorflow/core/nccl/ @azaks @csigg
 /tensorflow/core/platform/windows/ @mrry
 /tensorflow/core/platform/s3 @yongtang
 /tensorflow/go @asimshankar
@@ -46,7 +47,6 @@
 /tensorflow/contrib/losses/ @alextp @ispirmustafa
 /tensorflow/contrib/makefile/ @petewarden @satok16 @wolffg
 /tensorflow/contrib/metrics/ @alextp @honkentuber @ispirmustafa
-/tensorflow/contrib/nccl/ @cwhipkey @zheng-xq
 /tensorflow/contrib/opt/ @strategist333 @alextp
 /tensorflow/contrib/pi_examples/ @maciekcc
 /tensorflow/contrib/quantization/ @petewarden
@@ -72,7 +72,6 @@ py_library(
         "//tensorflow/contrib/metrics:metrics_py",
         "//tensorflow/contrib/mixed_precision:mixed_precision",
         "//tensorflow/contrib/model_pruning",
-        "//tensorflow/contrib/nccl:nccl_py",
         "//tensorflow/contrib/nearest_neighbor:nearest_neighbor_py",
         "//tensorflow/contrib/nn:nn_py",
         "//tensorflow/contrib/opt:opt_py",
@@ -179,9 +178,7 @@ cc_library(
        "//tensorflow/contrib/tensor_forest:stats_ops_kernels",
        "//tensorflow/contrib/tensor_forest:tensor_forest_kernels",
        "//tensorflow/contrib/text:all_kernels",
-    ] + if_mpi(["//tensorflow/contrib/mpi_collectives:mpi_collectives_py"]) + if_cuda([
-        "//tensorflow/contrib/nccl:nccl_kernels",
-    ]) + select({
+    ] + if_mpi(["//tensorflow/contrib/mpi_collectives:mpi_collectives_py"]) + select({
        "//tensorflow:android": [],
        "//tensorflow:ios": [],
        "//tensorflow:linux_s390x": [],
@@ -215,7 +212,6 @@ cc_library(
        "//tensorflow/contrib/hadoop:dataset_ops_op_lib",
        "//tensorflow/contrib/input_pipeline:input_pipeline_ops_op_lib",
        "//tensorflow/contrib/layers:sparse_feature_cross_op_op_lib",
-        "//tensorflow/contrib/nccl:nccl_ops_op_lib",
        "//tensorflow/contrib/nearest_neighbor:nearest_neighbor_ops_op_lib",
        "//tensorflow/contrib/rnn:all_ops",
        "//tensorflow/contrib/seq2seq:beam_search_ops_op_lib",
@@ -62,7 +62,6 @@ from tensorflow.contrib import memory_stats
 from tensorflow.contrib import metrics
 from tensorflow.contrib import mixed_precision
 from tensorflow.contrib import model_pruning
-from tensorflow.contrib import nccl
 from tensorflow.contrib import nn
 from tensorflow.contrib import opt
 from tensorflow.contrib import periodic_resample
@@ -29,10 +29,10 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/contrib/nccl:nccl_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:nccl_ops",
     ],
 )
@@ -21,11 +21,11 @@ from __future__ import print_function
 import collections
 import math

-from tensorflow.contrib import nccl
 from tensorflow.python.framework import device as device_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nccl_ops


 def _flatten_tensors(tensors):
@@ -693,7 +693,7 @@ def build_nccl_all_reduce(input_tensors, red_op, un_op=None):
     ValueError: red_op not supported.
   """
   if red_op == math_ops.add:
-    output_tensors = nccl.all_sum(input_tensors)
+    output_tensors = nccl_ops.all_sum(input_tensors)
   else:
     raise ValueError("red_op not supported by NCCL all-reduce: ", red_op)
   if un_op:
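The hunk above keeps build_nccl_all_reduce's contract intact: math_ops.add remains the only supported red_op, and anything else still raises ValueError. A sketch of a call, under the same hypothetical two-GPU setup as above (graph construction only; running it needs a CUDA build with NCCL):

    from tensorflow.contrib.all_reduce.python import all_reduce
    from tensorflow.python.framework import ops
    from tensorflow.python.ops import array_ops
    from tensorflow.python.ops import math_ops

    input_tensors = []
    for i in range(2):
      with ops.device('/device:GPU:%d' % i):
        input_tensors.append(array_ops.identity([1.0, 2.0]))

    # Only math_ops.add is accepted; internally this now calls nccl_ops.all_sum.
    output_tensors = all_reduce.build_nccl_all_reduce(input_tensors, math_ops.add)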
@@ -745,7 +745,7 @@ def _build_nccl_hybrid(input_tensors, red_op, upper_level_f):
   for w in range(0, num_workers):
     dst_tensors = []
     with ops.device(per_worker_devices[w][0]):
-      broadcast_src = nccl.broadcast(array_ops.identity(level_2_output[w]))
+      broadcast_src = nccl_ops.broadcast(array_ops.identity(level_2_output[w]))
     for d in per_worker_devices[w]:
       with ops.device(d):
         dst_tensors.append(array_ops.identity(broadcast_src))
@@ -308,11 +308,6 @@ tensorflow/contrib/model_pruning/examples
 tensorflow/contrib/model_pruning/examples/cifar10
 tensorflow/contrib/model_pruning/python
 tensorflow/contrib/model_pruning/python/layers
-tensorflow/contrib/nccl
-tensorflow/contrib/nccl/kernels
-tensorflow/contrib/nccl/ops
-tensorflow/contrib/nccl/python
-tensorflow/contrib/nccl/python/ops
 tensorflow/contrib/nearest_neighbor
 tensorflow/contrib/nearest_neighbor/kernels
 tensorflow/contrib/nearest_neighbor/ops
@@ -97,9 +97,6 @@ if(tensorflow_BUILD_CONTRIB_KERNELS)
     "${tensorflow_source_dir}/tensorflow/contrib/layers/ops/sparse_feature_cross_op.cc"
     "${tensorflow_source_dir}/tensorflow/contrib/libsvm/kernels/decode_libsvm_op.cc"
     "${tensorflow_source_dir}/tensorflow/contrib/libsvm/ops/libsvm_ops.cc"
-    "${tensorflow_source_dir}/tensorflow/contrib/nccl/kernels/nccl_manager.cc"
-    "${tensorflow_source_dir}/tensorflow/contrib/nccl/kernels/nccl_ops.cc"
-    "${tensorflow_source_dir}/tensorflow/contrib/nccl/ops/nccl_ops.cc"
     "${tensorflow_source_dir}/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc"
     "${tensorflow_source_dir}/tensorflow/contrib/nearest_neighbor/ops/nearest_neighbor_ops.cc"
     "${tensorflow_source_dir}/tensorflow/contrib/resampler/kernels/resampler_ops.cc"
@@ -99,7 +99,6 @@ GENERATE_CONTRIB_OP_LIBRARY(image_distort_image "${tensorflow_source_dir}/tensor
 GENERATE_CONTRIB_OP_LIBRARY(image_sirds "${tensorflow_source_dir}/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(layers_sparse_feature_cross "${tensorflow_source_dir}/tensorflow/contrib/layers/ops/sparse_feature_cross_op.cc")
 GENERATE_CONTRIB_OP_LIBRARY(memory_stats "${tensorflow_source_dir}/tensorflow/contrib/memory_stats/ops/memory_stats_ops.cc")
-GENERATE_CONTRIB_OP_LIBRARY(nccl "${tensorflow_source_dir}/tensorflow/contrib/nccl/ops/nccl_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(periodic_resample "${tensorflow_source_dir}/tensorflow/contrib/periodic_resample/ops/array_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(nearest_neighbor "${tensorflow_source_dir}/tensorflow/contrib/nearest_neighbor/ops/nearest_neighbor_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(resampler "${tensorflow_source_dir}/tensorflow/contrib/resampler/ops/resampler_ops.cc")
@@ -594,7 +594,6 @@ py_library(
     deps = [
         ":values",
         "//tensorflow/contrib/all_reduce:all_reduce_py",
-        "//tensorflow/contrib/nccl:nccl_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:collective_ops",
         "//tensorflow/python:device",
@@ -602,6 +601,7 @@ py_library(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:gradients",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:nccl_ops",
     ],
 )
@@ -21,7 +21,6 @@ from __future__ import print_function
 import collections as pycoll
 import threading

-from tensorflow.contrib import nccl
 from tensorflow.contrib.all_reduce.python import all_reduce
 from tensorflow.contrib.distribute.python import values as value_lib
 from tensorflow.python.framework import device as pydev
@@ -31,6 +30,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import collective_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nccl_ops


 def aggregate_gradients_using_nccl(replica_grads):
@@ -38,7 +38,7 @@ def aggregate_gradients_using_nccl(replica_grads):
   agg_all_g_and_v = []
   for single_g_and_v in zip(*replica_grads):
     single_grads = [g for g, _ in single_g_and_v]
-    agg_grads = nccl.all_sum(single_grads)
+    agg_grads = nccl_ops.all_sum(single_grads)
     agg_all_g_and_v.append(
         [(g, v) for g, (_, v) in zip(agg_grads, single_g_and_v)])
@@ -376,7 +376,7 @@ def sum_grad_and_var_all_reduce(grad_and_vars,
   # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
   scaled_grads = [g for g, _ in grad_and_vars]
   if alg == 'nccl':
-    summed_grads = nccl.all_sum(scaled_grads)
+    summed_grads = nccl_ops.all_sum(scaled_grads)
   elif alg == 'xring':
     summed_grads = all_reduce.build_ring_all_reduce(
         scaled_grads, num_workers, num_shards, gpu_indices, math_ops.add)
@@ -1,177 +0,0 @@
-# Description:
-#   Wrap NVIDIA (https://github.com/NVIDIA/nccl) NCCL with tensorflow ops.
-#   APIs are meant to change over time.
-
-package(default_visibility = ["//tensorflow:__subpackages__"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-load(
-    "//tensorflow:tensorflow.bzl",
-    "tf_custom_op_library",
-    "tf_gen_op_libs",
-    "tf_gen_op_wrapper_py",
-)
-load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
-load("//tensorflow:tensorflow.bzl", "cuda_py_test")
-load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
-load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
-load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
-load("//tensorflow:tensorflow.bzl", "if_not_windows_cuda")
-
-tf_custom_op_library(
-    name = "python/ops/_nccl_ops.so",
-    srcs = [
-        "ops/nccl_ops.cc",
-    ],
-    gpu_srcs = if_not_windows_cuda([
-        "kernels/nccl_manager.cc",
-        "kernels/nccl_manager.h",
-        "kernels/nccl_ops.cc",
-    ]),
-    deps = [] + if_cuda([
-        "@local_config_nccl//:nccl",
-        "//tensorflow/core:gpu_headers_lib",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:protos_all_proto_text",
-    ]),
-)
-
-tf_cuda_cc_test(
-    name = "nccl_manager_test",
-    size = "medium",
-    srcs = if_cuda(
-        [
-            "kernels/nccl_manager.cc",
-            "kernels/nccl_manager.h",
-            "kernels/nccl_manager_test.cc",
-        ],
-        [],
-    ),
-    # Disabled on jenkins until errors finding nvmlShutdown are found.
-    tags = [
-        "manual",
-        "multi_gpu",
-        "no_oss",
-        "noguitar",
-        "notap",
-    ],
-    deps =
-        if_cuda([
-            "@local_config_nccl//:nccl",
-            "//tensorflow/core:cuda",
-            "//tensorflow/core:test",
-            "//tensorflow/core:test_main",
-            "//tensorflow/core:testlib",
-        ]),
-)
-
-tf_kernel_library(
-    name = "nccl_kernels",
-    srcs = if_cuda([
-        "kernels/nccl_manager.cc",
-        "kernels/nccl_manager.h",
-        "kernels/nccl_ops.cc",
-        "kernels/nccl_rewrite.cc",
-    ]),
-    deps = if_cuda([
-        "@local_config_nccl//:nccl",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:gpu_headers_lib",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:stream_executor",
-    ]),
-    alwayslink = 1,
-)
-
-tf_gen_op_libs(
-    op_lib_names = ["nccl_ops"],
-    deps = [
-        "//tensorflow/core:lib",
-    ],
-)
-
-tf_gen_op_wrapper_py(
-    name = "nccl_ops",
-    deps = [":nccl_ops_op_lib"],
-)
-
-# Test only nccl ops lib without dso to test behavior when NCCL lib is not
-# installed. See nccl_dependency_test for more details.
-#
-# Users should use the public nccl_py lib that also adds the dso.
-tf_custom_op_py_library(
-    name = "nccl_ops_lib_without_dso",
-    srcs = [
-        "__init__.py",
-        "python/ops/nccl_ops.py",
-    ],
-    kernels = if_cuda([":nccl_kernels"]) + [
-        ":nccl_ops_op_lib",
-    ],
-    deps = [
-        ":nccl_ops",
-        "//tensorflow/contrib/util:util_py",
-        "//tensorflow/python:device",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:util",
-        "//tensorflow/python/eager:context",
-    ],
-)
-
-tf_custom_op_py_library(
-    name = "nccl_py",
-    dso = [":python/ops/_nccl_ops.so"],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":nccl_ops_lib_without_dso",
-    ],
-)
-
-cuda_py_test(
-    name = "nccl_ops_test",
-    size = "small",
-    srcs = ["python/ops/nccl_ops_test.py"],
-    additional_deps = [
-        ":nccl_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
-    ],
-    # Disabled on jenkins until errors finding nvmlShutdown are found.
-    tags = [
-        "manual",
-        "multi_gpu",
-        "no_oss",
-        "noguitar",
-        "notap",
-    ],
-)
-
-cuda_py_test(
-    name = "nccl_dependency_test",
-    size = "small",
-    srcs = ["python/ops/nccl_dependency_test.py"],
-    additional_deps = [
-        ":nccl_ops_lib_without_dso",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:util",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:platform_test",
-    ],
-    # Disable this test internally as static linking is used internally and only
-    # run for OSS to verify that NCCL is an optional dynamic dependency.
-    tags = [
-        "manual",
-        "noguitar",
-        "notap",
-    ],
-)
@@ -1,38 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Functions for using NVIDIA nccl collective ops.
-
-@@all_max
-@@all_min
-@@all_prod
-@@all_sum
-@@reduce_sum
-@@broadcast
-
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.nccl.python.ops.nccl_ops import all_max
-from tensorflow.contrib.nccl.python.ops.nccl_ops import all_min
-from tensorflow.contrib.nccl.python.ops.nccl_ops import all_prod
-from tensorflow.contrib.nccl.python.ops.nccl_ops import all_sum
-from tensorflow.contrib.nccl.python.ops.nccl_ops import broadcast
-from tensorflow.contrib.nccl.python.ops.nccl_ops import reduce_sum
-
-from tensorflow.python.util.all_util import remove_undocumented
-remove_undocumented(__name__)
@@ -1,59 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Dependency test for nccl to test behavior when NCCL is not installed."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib import nccl
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import errors_impl
-from tensorflow.python.framework import ops
-from tensorflow.python.platform import test
-from tensorflow.python.util import tf_inspect
-
-
-class NcclDependencyTest(test.TestCase):
-  """Verifies that importing nccl ops lib does not fail even if NCCL is not
-  installed but nccl ops throws an exception on use if NCCL is not installed.
-  """
-
-  def test_nccl_ops(self):
-    """Tests behavior of nccl ops when NCCL is not installed."""
-    public_methods = [
-        m[0]
-        for m in tf_inspect.getmembers(nccl, tf_inspect.isfunction)
-        if not m[0].startswith('_')
-    ]
-    for method_name in public_methods:
-      with ops.device('/device:CPU:0'):
-        tensor = constant_op.constant(1)
-
-      if method_name == 'broadcast':
-        arg = tensor
-      else:
-        arg = [tensor]
-
-      nccl_op = getattr(nccl, method_name)
-      with ops.device('/device:CPU:0'):
-        with self.assertRaisesRegexp(errors_impl.NotFoundError,
-                                     r'cannot open shared object file'):
-          nccl_op(arg)
-
-
-if __name__ == '__main__':
-  test.main()
@@ -1068,6 +1068,7 @@ tf_gen_op_libs(
         "logging_ops",
         "manip_ops",
         "math_ops",
+        "nccl_ops",
         "nn_ops",
         "no_op",
         "parsing_ops",
@@ -1216,6 +1217,7 @@ cc_library(
         ":lookup_ops_op_lib",
         ":manip_ops_op_lib",
         ":math_ops_op_lib",
+        ":nccl_ops_op_lib",
        ":nn_ops_op_lib",
        ":no_op_op_lib",
        ":parsing_ops_op_lib",
@@ -1395,6 +1397,7 @@ cc_library(
        "//tensorflow/core/kernels:fact_op",
        "//tensorflow/core/kernels:array_not_windows",
        "//tensorflow/core/kernels:math_not_windows",
+        "//tensorflow/core/kernels:nccl_kernels",
        "//tensorflow/core/kernels:quantized_ops",
        "//tensorflow/core/kernels/neon:neon_depthwise_conv_op",
     ]) + if_mkl([
tensorflow/core/api_def/base_api/api_def_NcclAllReduce.pbtxt (new file, 19 lines)
@@ -0,0 +1,19 @@
+op {
+  graph_op_name: "NcclAllReduce"
+  summary: "Outputs a tensor containing the reduction across all input tensors."
+  description: <<END
+Outputs a tensor containing the reduction across all input tensors passed to ops
+within the same `shared_name.
+
+The graph should be constructed so if one op runs with shared_name value `c`,
+then `num_devices` ops will run with shared_name value `c`. Failure to do so
+will cause the graph execution to fail to complete.
+
+input: the input to the reduction
+data: the value of the reduction across all `num_devices` devices.
+reduction: the reduction operation to perform.
+num_devices: The number of devices participating in this reduction.
+shared_name: Identifier that shared between ops of the same reduction.
+END
+  visibility: HIDDEN
+}
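This description is the same text as the `.Doc()` block removed from the NcclAllReduce registration later in this diff. The shared_name/num_devices contract it spells out is what the Python wrapper arranges: every participating device gets its own NcclAllReduce op, and all of them must agree on one shared_name. A sketch of that wiring, assuming the generated wrapper gen_nccl_ops.nccl_all_reduce produced by the nccl_ops_gen rule added below (the helper function itself is hypothetical):

    from tensorflow.python.framework import ops
    from tensorflow.python.ops import gen_nccl_ops

    def all_reduce_sketch(tensors, shared_name):
      # One op per input; all len(tensors) ops must share shared_name and
      # num_devices, or graph execution fails to complete (see above).
      outputs = []
      for t in tensors:
        with ops.device(t.device):
          outputs.append(gen_nccl_ops.nccl_all_reduce(
              input=t,
              reduction='sum',
              num_devices=len(tensors),
              shared_name=shared_name))
      return outputs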
tensorflow/core/api_def/base_api/api_def_NcclBroadcast.pbtxt (new file, 17 lines)
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "NcclBroadcast"
+  summary: "Sends `input` to all devices that are connected to the output."
+  description: <<END
+Sends `input` to all devices that are connected to the output.
+
+The graph should be constructed so that all ops connected to the output have a
+valid device assignment, and the op itself is assigned one of these devices.
+
+input: The input to the broadcast.
+output: The same as input.
+shape: The shape of the input tensor.
+
+END
+
+  visibility: HIDDEN
+}
tensorflow/core/api_def/base_api/api_def_NcclReduce.pbtxt (new file, 15 lines)
@@ -0,0 +1,15 @@
+op {
+  graph_op_name: "NcclReduce"
+  summary: "Reduces `input` from `num_devices` using `reduction` to a single device."
+  description: <<END
+Reduces `input` from `num_devices` using `reduction` to a single device.
+
+The graph should be constructed so that all inputs have a valid device
+assignment, and the op itself is assigned one of these devices.
+
+input: The input to the reduction.
+data: the value of the reduction across all `num_devices` devices.
+reduction: the reduction operation to perform.
+END
+  visibility: HIDDEN
+}
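The two files above document the single-sender and single-receiver collectives the same way. Through the public wrappers, which insert the hidden send/recv ops during graph rewriting, usage looks roughly like this sketch, again under a hypothetical two-GPU setup:

    from tensorflow.python.framework import ops
    from tensorflow.python.ops import array_ops
    from tensorflow.python.ops import nccl_ops

    with ops.device('/device:GPU:0'):
      src = array_ops.identity([1.0, 2.0, 3.0])
    with ops.device('/device:GPU:1'):
      other = array_ops.identity([4.0, 5.0, 6.0])

    # Broadcast: consumers on other devices read the returned tensor.
    b = nccl_ops.broadcast(src)
    with ops.device('/device:GPU:1'):
      received = array_ops.identity(b)

    # Reduce: many inputs, one summed output.
    total = nccl_ops.reduce_sum([src, other])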
@@ -270,6 +270,20 @@ cc_library(
     ],
 )

+tf_kernel_library(
+    name = "nccl_kernels",
+    srcs = if_cuda([
+        "nccl_ops.cc",
+    ]),
+    deps = if_cuda([
+        "@local_config_nccl//:nccl",
+        "//tensorflow/core/nccl:nccl_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:gpu_headers_lib",
+        "//tensorflow/core:nccl_ops_op_lib",
+    ]),
+)
+
 tf_cuda_library(
     name = "ops_testutil",
     testonly = 1,
@@ -18,8 +18,8 @@ limitations under the License.
 #include <vector>

 #include "third_party/nccl/nccl.h"
-#include "tensorflow/contrib/nccl/kernels/nccl_manager.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/nccl/nccl_manager.h"

 namespace tensorflow {
 namespace {
tensorflow/core/nccl/BUILD (new file, 60 lines)
@@ -0,0 +1,60 @@
+# Description:
+#   Wrap NVIDIA (https://github.com/NVIDIA/nccl) NCCL with tensorflow ops.
+#   APIs are meant to change over time.
+
+package(default_visibility = ["//tensorflow:__subpackages__"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
+load("//tensorflow:tensorflow.bzl", "tf_copts")
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+
+cc_library(
+    name = "nccl_lib",
+    srcs = if_cuda([
+        "nccl_manager.cc",
+        "nccl_manager.h",
+        "nccl_rewrite.cc",
+    ]),
+    copts = tf_copts(),
+    deps = if_cuda([
+        "@local_config_nccl//:nccl",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:gpu_headers_lib",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:stream_executor",
+    ]),
+    alwayslink = 1,
+)
+
+tf_cuda_cc_test(
+    name = "nccl_manager_test",
+    size = "medium",
+    srcs = if_cuda(
+        [
+            "nccl_manager_test.cc",
+        ],
+        [],
+    ),
+    # Disabled on jenkins until errors finding nvmlShutdown are found.
+    tags = [
+        "manual",
+        "multi_gpu",
+        "no_oss",
+        "noguitar",
+        "notap",
+    ],
+    deps =
+        if_cuda([
+            ":nccl_lib",
+            "@local_config_nccl//:nccl",
+            "//tensorflow/core:cuda",
+            "//tensorflow/core:test",
+            "//tensorflow/core:test_main",
+            "//tensorflow/core:testlib",
+        ]),
+)
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/nccl/kernels/nccl_manager.h"
+#include "tensorflow/core/nccl/nccl_manager.h"

 #include <utility>
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_NCCL_KERNELS_NCCL_MANAGER_H_
-#define TENSORFLOW_CONTRIB_NCCL_KERNELS_NCCL_MANAGER_H_
+#ifndef TENSORFLOW_CORE_NCCL_NCCL_MANAGER_H_
+#define TENSORFLOW_CORE_NCCL_NCCL_MANAGER_H_

 #ifdef GOOGLE_CUDA
@@ -135,4 +135,4 @@ class NcclManager {

 #endif  // GOOGLE_CUDA

-#endif  // TENSORFLOW_CONTRIB_NCCL_KERNELS_NCCL_MANAGER_H_
+#endif  // TENSORFLOW_CORE_NCCL_NCCL_MANAGER_H_
@@ -19,11 +19,11 @@ limitations under the License.
 #include <random>
 #include <vector>

-#include "tensorflow/contrib/nccl/kernels/nccl_manager.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_device.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/nccl/nccl_manager.h"
 #include "tensorflow/core/platform/test.h"

 namespace tensorflow {
@@ -29,21 +29,7 @@ REGISTER_OP("NcclAllReduce")
     .Attr("num_devices: int")
     .Attr("shared_name: string")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Outputs a tensor containing the reduction across all input tensors passed to ops
-within the same `shared_name.
-
-The graph should be constructed so if one op runs with shared_name value `c`,
-then `num_devices` ops will run with shared_name value `c`. Failure to do so
-will cause the graph execution to fail to complete.
-
-input: the input to the reduction
-data: the value of the reduction across all `num_devices` devices.
-reduction: the reduction operation to perform.
-num_devices: The number of devices participating in this reduction.
-shared_name: Identifier that shared between ops of the same reduction.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);

 // Note: This op has no kernel implementation, but is replaced by
 // _NcclReduceSend and _NcclReduceRecv during graph optimization stage.
@@ -54,17 +40,7 @@ REGISTER_OP("NcclReduce")
     .Attr("T: {half, float, float64, int32, int64}")
     .Attr("num_devices: int")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Reduces `input` from `num_devices` using `reduction` to a single device.
-
-The graph should be constructed so that all inputs have a valid device
-assignment, and the op itself is assigned one of these devices.
-
-input: The input to the reduction.
-data: the value of the reduction across all `num_devices` devices.
-reduction: the reduction operation to perform.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);

 REGISTER_OP("_NcclReduceSend")
     .Input("input: T")
@@ -121,17 +97,7 @@ REGISTER_OP("NcclBroadcast")
     .Attr("T: {half, float, float64, int32, int64}")
     .Attr("shape: shape")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Sends `input` to all devices that are connected to the output.
-
-The graph should be constructed so that all ops connected to the output have a
-valid device assignment, and the op itself is assigned one of these devices.
-
-input: The input to the broadcast.
-output: The same as input.
-shape: The shape of the input tensor.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);

 REGISTER_OP("_NcclBroadcastSend")
     .Input("input: T")
@@ -109,6 +109,7 @@ py_library(
         ":manip_ops",
         ":math_ops",
        ":metrics",
+        ":nccl_ops",
        ":nn",
        ":ops",
        ":platform",
@@ -5757,6 +5758,48 @@ py_test(
     ],
 )

+tf_gen_op_wrapper_private_py(
+    name = "nccl_ops_gen",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/core:nccl_ops_op_lib",
+    ],
+)
+
+py_library(
+    name = "nccl_ops",
+    srcs = ["ops/nccl_ops.py"],
+    srcs_version = "PY2AND3",
+    visibility = visibility + [
+        "//learning/deepmind/tensorflow:__subpackages__",
+    ],
+    deps = [
+        ":framework_for_generated_wrappers",
+        ":nccl_ops_gen",
+    ],
+)
+
+cuda_py_test(
+    name = "nccl_ops_test",
+    size = "small",
+    srcs = ["ops/nccl_ops_test.py"],
+    additional_deps = [
+        ":nccl_ops",
+        ":array_ops",
+        ":client_testlib",
+        ":framework_test_lib",
+        ":platform_test",
+    ],
+    # Disabled on jenkins until errors finding nvmlShutdown are found.
+    tags = [
+        "manual",
+        "multi_gpu",
+        "no_oss",
+        "noguitar",
+        "notap",
+    ],
+)
+
 py_binary(
     name = "graph_analyzer",
     srcs = [
@@ -19,15 +19,11 @@ from __future__ import print_function

 import threading

-from tensorflow.contrib.nccl.ops import gen_nccl_ops
-from tensorflow.contrib.util import loader
-from tensorflow.python.eager import context
 from tensorflow.python.framework import device
 from tensorflow.python.framework import ops
-from tensorflow.python.platform import resource_loader
+from tensorflow.python.ops import gen_nccl_ops


-_nccl_ops_so = None
 _module_lock = threading.Lock()
 _shared_name_counter = 0
@@ -182,7 +178,6 @@ def broadcast(tensor):
     A tensor with the value of `src_tensor`, which can be used as input to
     ops on other GPU devices.
   """
-  _validate_and_load_nccl_so()
   _check_device(tensor)

   with ops.device(tensor.device):
@@ -214,7 +209,6 @@ def _apply_all_reduce(reduction, tensors):
   """Helper function for all_* functions."""
   if not tensors:
     raise ValueError('Must pass >0 tensors to all reduce operations')
-  _validate_and_load_nccl_so()

   shared_name = _get_shared_name()
   res = []
@@ -236,7 +230,6 @@ def _apply_reduce(reduction, tensors):
   """Helper function for reduce_* functions."""
   if not tensors:
     raise ValueError('Must pass >0 tensors to reduce operations')
-  _validate_and_load_nccl_so()

   for t in tensors:
     _check_device(t)
@@ -262,27 +255,3 @@ def _check_device(tensor, expected=None):
     raise ValueError('Device assignment required for nccl collective ops')
   if expected and expected != tensor.device:
     raise ValueError('Expected device %s, got %s' % (expected, tensor.device))
-
-
-def _maybe_load_nccl_ops_so():
-  """Loads nccl ops so if it hasn't been loaded already."""
-
-  with _module_lock:
-    global _nccl_ops_so
-    if not _nccl_ops_so:
-      _nccl_ops_so = loader.load_op_library(
-          resource_loader.get_path_to_datafile('_nccl_ops.so'))
-
-
-def _validate_and_load_nccl_so():
-  """Validates calling context and loads nccl ops so file.
-
-  Raises:
-    ValueError: Ops are not supported.
-    errors_impl.NotFoundError: nccl library is not installed.
-  """
-
-  if context.executing_eagerly():
-    raise ValueError('Nccl ops are not supported in eager mode')
-
-  _maybe_load_nccl_ops_so()
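With the kernels statically linked through the new core targets, the dynamic-loading path (_maybe_load_nccl_ops_so / _validate_and_load_nccl_so) disappears entirely, along with the eager-mode guard; only the module lock and the shared-name counter remain. For reference, a sketch of how those remaining globals could yield a fresh shared_name per collective (the body of _get_shared_name is not shown in this diff; this reconstruction is an assumption from the surrounding code):

    # Hypothetical reconstruction of _get_shared_name, not part of this diff:
    def _get_shared_name():
      global _shared_name_counter
      with _module_lock:
        val = _shared_name_counter
        _shared_name_counter += 1
      return 'c%s' % val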
@@ -19,14 +19,13 @@ from __future__ import division
 from __future__ import print_function

 from functools import partial
-import os
 import numpy as np

-from tensorflow.contrib import nccl
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients
+from tensorflow.python.ops import nccl_ops
 from tensorflow.python.platform import test
@@ -52,7 +51,7 @@ def _NcclBroadcast(tensors, devices):
   sender = np.random.randint(0, len(devices))
   with ops.device(devices[sender]):
     tensor = array_ops.identity(tensors[0])
-    broadcast = nccl.broadcast(tensor)
+    broadcast = nccl_ops.broadcast(tensor)
   return _DeviceTensors([broadcast] * len(devices), devices)
@@ -61,7 +60,6 @@ class NcclTestCase(test.TestCase):
   def _Test(self,
             nccl_reduce,
             numpy_fn,
-            dtypes=[np.float16, np.float32, np.int32, np.int64, np.float64],
             device_sets=(['/device:GPU:1', '/device:GPU:2', '/device:GPU:0'],
                          ['/device:GPU:1', '/device:GPU:0'])):
     """Tests that nccl_reduce does the same as reduction with numpy_fn.
@@ -74,10 +72,7 @@ class NcclTestCase(test.TestCase):
         two.
       device_sets: Tuple of virtual devices to run test on.
     """
-    # Enable NCCL printouts.
-    os.environ["NCCL_DEBUG"] = "INFO"
-
-    for dtype in dtypes:
+    for dtype in [np.float16, np.float32, np.int32, np.int64, np.float64]:
       # Create session inside outer loop to test use of
       # same communicator across multiple sessions.
       with self.test_session(use_gpu=True) as sess:
@@ -129,36 +124,36 @@ class NcclTestCase(test.TestCase):
           reduce_tensors, inputs, losses, colocate_gradients_with_ops=True)
       return [g for g in grads if g is not None]

-    # int types are considered not 'trainable' and no gradients are generated.
-    self._Test(_Gradient, numpy_fn, dtypes=[np.float16, np.float32, np.float64])
+    self._Test(_Gradient, numpy_fn)


 class AllReduceTest(NcclTestCase):

   def testAllReduce(self):
-    self._Test(partial(_NcclAllReduce, nccl.all_sum), lambda x, y: x + y)
-    self._Test(partial(_NcclAllReduce, nccl.all_prod), lambda x, y: x * y)
-    self._Test(partial(_NcclAllReduce, nccl.all_min), np.minimum)
-    self._Test(partial(_NcclAllReduce, nccl.all_max), np.maximum)
+    self._Test(partial(_NcclAllReduce, nccl_ops.all_sum), lambda x, y: x + y)
+    self._Test(partial(_NcclAllReduce, nccl_ops.all_prod), lambda x, y: x * y)
+    self._Test(partial(_NcclAllReduce, nccl_ops.all_min), np.minimum)
+    self._Test(partial(_NcclAllReduce, nccl_ops.all_max), np.maximum)

   def testAllSumGrad(self):
     self._TestGradient(
-        partial(_NcclAllReduce, nccl.all_sum), lambda x, y: x + y)
+        partial(_NcclAllReduce, nccl_ops.all_sum), lambda x, y: x + y)

   def testErrors(self):
     with self.assertRaisesRegexp(ValueError, 'Device assignment required'):
-      nccl.all_sum([array_ops.identity(np.random.random_sample((3, 4)))])
+      nccl_ops.all_sum([array_ops.identity(np.random.random_sample((3, 4)))])
     with self.assertRaisesRegexp(ValueError, 'Must pass >0 tensors'):
-      nccl.all_sum([])
+      nccl_ops.all_sum([])


 class SingleReduceTest(NcclTestCase):

   def testSum(self):
-    self._Test(partial(_NcclReduce, nccl.reduce_sum), lambda x, y: x + y)
+    self._Test(partial(_NcclReduce, nccl_ops.reduce_sum), lambda x, y: x + y)

   def testSumGrad(self):
-    self._TestGradient(partial(_NcclReduce, nccl.reduce_sum), lambda x, y: x)
+    self._TestGradient(partial(_NcclReduce, nccl_ops.reduce_sum),
+                       lambda x, y: x)


 class BroadcastTest(NcclTestCase):
@@ -189,8 +184,8 @@ class CombinedTest(NcclTestCase):
   """Test all-reduce vs. single-reduce plus broadcast in one session.run."""

   def _Combined(self, tensors, devices):
-    all_reduce_tensors = _NcclAllReduce(nccl.all_sum, tensors, devices)
-    single_reduce_tensors = _NcclReduce(nccl.reduce_sum, tensors, devices)
+    all_reduce_tensors = _NcclAllReduce(nccl_ops.all_sum, tensors, devices)
+    single_reduce_tensors = _NcclReduce(nccl_ops.reduce_sum, tensors, devices)
     broadcast_tensors = _NcclBroadcast(single_reduce_tensors, devices)
     return all_reduce_tensors + broadcast_tensors