diff --git a/.gitignore b/.gitignore
index fdc61ee8251..c227f50d552 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,3 +13,5 @@ node_modules
 __pycache__
 *.swp
 .vscode/
+cmake_build/
+.idea/**
diff --git a/CODEOWNERS b/CODEOWNERS
index 1401951b865..0a12176aaa2 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -1,52 +1,53 @@
+# NOTE: All entries below are temporarily disabled (commented out) because CODEOWNERS is too noisy on pushes.
 # Where component owners are known, add them here.
 
-tensorflow/core/platform/windows/* @mrry
-tensorflow/java/* @asimshankar
-tensorflow/tensorboard/* @jart @dandelionmane
-tensorflow/tools/docs/* @markdaoust
+#tensorflow/core/platform/windows/* @mrry
+#tensorflow/java/* @asimshankar
+#tensorflow/tensorboard/* @jart @dandelionmane
+#tensorflow/tools/docs/* @markdaoust
 
 # contrib
 
 # NEED OWNER: tensorflow/contrib/avro/*
-tensorflow/contrib/batching/* @alextp @chrisolston
-tensorflow/contrib/bayesflow/* @ebrevdo @rsepassi @jvdillon
-tensorflow/contrib/cmake/* @mrry @benoitsteiner
-tensorflow/contrib/copy_graph/* @tucker @poxvoculi
-tensorflow/contrib/crf/* @kentonl
-tensorflow/contrib/data/* @mrry
-tensorflow/contrib/distributions/* @jvdillon @langmore @rsepassi
-tensorflow/contrib/factorization/* @agarwal-ashish @xavigonzalvo
-tensorflow/contrib/ffmpeg/* @fredbertsch
+#tensorflow/contrib/batching/* @alextp @chrisolston
+#tensorflow/contrib/bayesflow/* @ebrevdo @rsepassi @jvdillon
+#tensorflow/contrib/cmake/* @mrry @benoitsteiner
+#tensorflow/contrib/copy_graph/* @tucker @poxvoculi
+#tensorflow/contrib/crf/* @kentonl
+#tensorflow/contrib/data/* @mrry
+#tensorflow/contrib/distributions/* @jvdillon @langmore @rsepassi
+#tensorflow/contrib/factorization/* @agarwal-ashish @xavigonzalvo
+#tensorflow/contrib/ffmpeg/* @fredbertsch
 # NEED OWNER: tensorflow/contrib/framework/*
-tensorflow/contrib/graph_editor/* @purpledog
+#tensorflow/contrib/graph_editor/* @purpledog
 # NEED OWNER: tensorflow/contrib/grid_rnn/*
-tensorflow/contrib/hvx/* @satok16
-tensorflow/contrib/imperative/* @keveman
-tensorflow/contrib/integrate/* @shoyer
-tensorflow/contrib/kernel_methods/* @petrosmol
-tensorflow/contrib/ios_examples/* @petewarden
-tensorflow/contrib/labeled_tensor/* @shoyer
-tensorflow/contrib/layers/* @fchollet @martinwicke
-tensorflow/contrib/learn/* @martinwicke @ispirmustafa @alextp
-tensorflow/contrib/linalg/* @langmore
-tensorflow/contrib/linear_optimizer/* @petrosmol @andreasst @katsiapis
-tensorflow/contrib/lookup/* @ysuematsu @andreasst
-tensorflow/contrib/losses/* @alextp @ispirmustafa
-tensorflow/contrib/makefile/* @petewarden @satok16 @wolffg
-tensorflow/contrib/metrics/* @alextp @honkentuber @ispirmustafa
-tensorflow/contrib/nccl/* @cwhipkey @zheng-xq
-tensorflow/contrib/opt/* @strategist333
-tensorflow/contrib/pi_examples/* @maciekcc
-tensorflow/contrib/quantization/* @petewarden @cwhipkey @keveman
-tensorflow/contrib/rnn/* @ebrevdo
-tensorflow/contrib/saved_model/* @nfiedel @sukritiramesh
-tensorflow/contrib/seq2seq/* @lukaszkaiser
-tensorflow/contrib/session_bundle/* @nfiedel @sukritiramesh
-tensorflow/contrib/slim/* @sguada @thenbasilmanran
-tensorflow/contrib/stateless/* @girving
-tensorflow/contrib/tensor_forest/* @gilberthendry @thomascolthurst
-tensorflow/contrib/testing/* @dandelionmane
-tensorflow/contrib/timeseries/* @allenlavoie
-tensorflow/contrib/tpu/* @frankchn @saeta @jhseu
-tensorflow/contrib/training/* @joel-shor @ebrevdo
-tensorflow/contrib/util/* @sherrym
+#tensorflow/contrib/hvx/* @satok16
+#tensorflow/contrib/imperative/* @keveman
+#tensorflow/contrib/integrate/* @shoyer
+#tensorflow/contrib/kernel_methods/* @petrosmol
+#tensorflow/contrib/ios_examples/* @petewarden
+#tensorflow/contrib/labeled_tensor/* @shoyer
+#tensorflow/contrib/layers/* @fchollet @martinwicke
+#tensorflow/contrib/learn/* @martinwicke @ispirmustafa @alextp
+#tensorflow/contrib/linalg/* @langmore
+#tensorflow/contrib/linear_optimizer/* @petrosmol @andreasst @katsiapis
+#tensorflow/contrib/lookup/* @ysuematsu @andreasst
+#tensorflow/contrib/losses/* @alextp @ispirmustafa
+#tensorflow/contrib/makefile/* @petewarden @satok16 @wolffg
+#tensorflow/contrib/metrics/* @alextp @honkentuber @ispirmustafa
+#tensorflow/contrib/nccl/* @cwhipkey @zheng-xq
+#tensorflow/contrib/opt/* @strategist333
+#tensorflow/contrib/pi_examples/* @maciekcc
+#tensorflow/contrib/quantization/* @petewarden @cwhipkey @keveman
+#tensorflow/contrib/rnn/* @ebrevdo
+#tensorflow/contrib/saved_model/* @nfiedel @sukritiramesh
+#tensorflow/contrib/seq2seq/* @lukaszkaiser
+#tensorflow/contrib/session_bundle/* @nfiedel @sukritiramesh
+#tensorflow/contrib/slim/* @sguada @thenbasilmanran
+#tensorflow/contrib/stateless/* @girving
+#tensorflow/contrib/tensor_forest/* @gilberthendry @thomascolthurst
+#tensorflow/contrib/testing/* @dandelionmane
+#tensorflow/contrib/timeseries/* @allenlavoie
+#tensorflow/contrib/tpu/* @frankchn @saeta @jhseu
+#tensorflow/contrib/training/* @joel-shor @ebrevdo
+#tensorflow/contrib/util/* @sherrym
diff --git a/README.md b/README.md
index a1f9dad22c2..d265949194e 100644
--- a/README.md
+++ b/README.md
@@ -30,16 +30,16 @@ tracking requests and bugs. So please see
 and discussion, and please direct specific questions to [Stack Overflow](https://stackoverflow.com/questions/tagged/tensorflow).**
 
 ## Installation
-*See [Installing TensorFlow](https://www.tensorflow.org/install) for instructions on how to install our release binaries or how to build from source.*
+*See [Installing TensorFlow](https://www.tensorflow.org/install) for instructions on how to install our release binaries or how to build from source.*
 
 People who are a little more adventurous can also try our nightly binaries:
 
 
-* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.3.0rc1-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.3.0rc1-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.3.0rc1-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
-* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.3.0rc1-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.3.0rc1-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.3.0rc1-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
-* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.3.0rc1-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.3.0rc1-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
-* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.3.0rc1-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.3.0rc1-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/))
-* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.3.0rc1-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.3.0rc1-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/))
+* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.3.0rc2-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.3.0rc2-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.3.0rc2-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
+* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.3.0rc2-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.3.0rc2-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.3.0rc2-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
+* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.3.0rc2-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.3.0rc2-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
+* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.3.0rc2-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.3.0rc2-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/))
+* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.3.0rc2-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.3.0rc2-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/))
 * Android: [demo APK](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk), [native libs](http://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/native/)
 ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-android/))
 
diff --git a/RELEASE.md b/RELEASE.md
index da297b2e86d..ffe38004a2d 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -9,6 +9,7 @@
   * `DNNLinearCombinedClassifier`
   * `DNNLinearCombinedRegressor`.
 * All our prebuilt binaries have been built with cuDNN 6.
+* `import tensorflow` is now much faster.
 * Adds a file cache to the GCS filesystem with configurable max staleness for file contents. This permits caching of file contents across close/open boundaries.
 * Added an axis parameter to `tf.gather`.
 * Added a `constant_values` keyword argument to `tf.pad`.
@@ -31,6 +32,7 @@
 * GPU kernels and speed improvements for for unary `tf.where` and `tf.nn.top_k`.
 * Monotonic Attention wrappers added to `tf.contrib.seq2seq`.
 * Added `tf.contrib.signal`, a library for signal processing primitives.
+* Added `tf.contrib.resampler`, containing CPU and GPU ops for differentiable resampling of images.
 
 ## Breaking Changes to the API
 * `tf.RewriterConfig` was removed from the Python API after being available in 1.2 release candidates (it was never in an actual release). Graph rewriting is still available, just not as `tf.RewriterConfig`. Instead add an explicit import.
@@ -64,7 +66,7 @@
 * Exported model signatures using the 'predict' method will no longer have their input and output keys silently ignored and rewritten to 'inputs' and 'outputs'. If a model was exported with different names before 1.2, and is now served with tensorflow/serving, it will accept requests using 'inputs' and 'outputs'. Starting at 1.2, such a model will accept the keys specified during export. Therefore, inference requests using 'inputs' and 'outputs' may start to fail. To fix this, either update any inference clients to send requests with the actual input and output keys used by the trainer code, or conversely, update the trainer code to name the input and output Tensors 'inputs' and 'outputs', respectively. Signatures using the 'classify' and 'regress' methods are not affected by this change; they will continue to standardize their input and output keys as before.
 * Add in-memory caching to the Dataset API.
 * Set default end_of_sequence variable in datasets iterators to false.
-* [Performance] Increase performance of `tf.layers.con2d` when setting use_bias=True by 2x by using nn.bias_add.
+* [Performance] Increase performance of `tf.layers.conv2d` when setting use_bias=True by 2x by using nn.bias_add.
 * Update iOS examples to use CocoaPods, and moved to tensorflow/examples/ios.
 * Adds a family= attribute in `tf.summary` ops to allow controlling the tab name used in Tensorboard for organizing summaries.
 * When GPU is configured, do not require --config=cuda, instead, automatically build for GPU if this is requested in the configure script.
diff --git a/configure.py b/configure.py
index a78399079aa..36466702637 100644
--- a/configure.py
+++ b/configure.py
@@ -384,12 +384,16 @@ def set_action_env_var(environ_cp,
 def convert_version_to_int(version):
   """Convert a version number to a integer that can be used to compare.
 
+  Version strings of the form X.YZ and X.Y.Z-xxxxx are supported. The
+  'xxxxx' part, for instance 'homebrew' on OS X, is ignored.
+
   Args:
-    version: a version to be covnerted
+    version: a version to be converted
 
   Returns:
     An integer if converted successfully, otherwise return None.
   """
+  version = version.split('-')[0]
   version_segments = version.split('.')
   for seg in version_segments:
     if not seg.isdigit():
@@ -428,6 +432,8 @@ def check_bazel_version(min_version):
     print('Make sure you are running at least bazel %s' % min_version)
     return curr_version
 
+  print("You have bazel %s installed." % curr_version)
+
   if curr_version_int < min_version_int:
     print('Please upgrade your bazel installation to version %s or higher to '
           'build TensorFlow!' % min_version)
@@ -938,6 +944,8 @@ def main():
                 'with_hdfs_support', False)
   set_build_var(environ_cp, 'TF_ENABLE_XLA', 'XLA JIT', 'with_xla_support',
                 False)
+  set_build_var(environ_cp, 'TF_NEED_GDR', 'GDR', 'with_gdr_support',
+                False)
   set_build_var(environ_cp, 'TF_NEED_VERBS', 'VERBS', 'with_verbs_support',
                 False)
 
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 71f6d83da3f..2296c6d5e0d 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -182,6 +182,12 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+config_setting(
+    name = "with_gdr_support",
+    values = {"define": "with_gdr_support=true"},
+    visibility = ["//visibility:public"],
+)
+
 config_setting(
     name = "with_verbs_support",
     values = {"define": "with_verbs_support=true"},
diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index e3c4bb02d23..77e0cf6757c 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -146,7 +146,7 @@ class TF_ManagedBuffer : public TensorBuffer {
 void* allocate_tensor(const char* operation, size_t len) {
   void* data =
       tensorflow::cpu_allocator()->AllocateRaw(EIGEN_MAX_ALIGN_BYTES, len);
-  if (tensorflow::LogMemory::IsEnabled()) {
+  if (tensorflow::LogMemory::IsEnabled() && data != nullptr) {
     tensorflow::LogMemory::RecordRawAllocation(
         operation, tensorflow::LogMemory::EXTERNAL_TENSOR_ALLOCATION_STEP_ID,
         len, data, tensorflow::cpu_allocator());
@@ -155,7 +155,7 @@ void* allocate_tensor(const char* operation, size_t len) {
 }
 
 void deallocate_buffer(void* data, size_t len, void* arg) {
-  if (tensorflow::LogMemory::IsEnabled()) {
+  if (tensorflow::LogMemory::IsEnabled() && data != nullptr) {
     tensorflow::LogMemory::RecordRawDeallocation(
         "TensorFlow C Api",
         tensorflow::LogMemory::EXTERNAL_TENSOR_ALLOCATION_STEP_ID, data,
diff --git a/tensorflow/cc/tutorials/example_trainer.cc b/tensorflow/cc/tutorials/example_trainer.cc
index 49d3cca3a4e..3675d72ee35 100644
--- a/tensorflow/cc/tutorials/example_trainer.cc
+++ b/tensorflow/cc/tutorials/example_trainer.cc
@@ -101,7 +101,7 @@ void ConcurrentSteps(const Options* opts, int session_index) {
   std::unique_ptr<Session> session(NewSession(options));
   GraphDef def = CreateGraphDef();
   if (options.target.empty()) {
-    graph::SetDefaultDevice(opts->use_gpu ? "/gpu:0" : "/cpu:0", &def);
+    graph::SetDefaultDevice(opts->use_gpu ? "/device:GPU:0" : "/cpu:0", &def);
   }
 
   TF_CHECK_OK(session->Create(def));
diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer.cc b/tensorflow/compiler/xla/service/gpu/while_transformer.cc
index 3034ed06b7e..cecbb01ff88 100644
--- a/tensorflow/compiler/xla/service/gpu/while_transformer.cc
+++ b/tensorflow/compiler/xla/service/gpu/while_transformer.cc
@@ -222,7 +222,7 @@ class MatcherBase {
   TF_DISALLOW_COPY_AND_ASSIGN(MatcherBase);
 };
 
-// WhileConditionComputationMatcher attempst to match a target computation
+// WhileConditionComputationMatcher attempts to match a target computation
 // pattern in the while condition sub-computation.
 // If the target pattern is matched, two pieces of information are extracted
 // from 'tagged' instructions returned by the matcher:
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 0126f0b9d83..83128a2e49e 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -626,7 +626,7 @@ HloInstruction* HloInstruction::CloneAndFuseInternal(
   CHECK_EQ(opcode_, HloOpcode::kFusion);
   CHECK(instruction_to_fuse->IsFusable());
   if (GetModule()) {
-    XLA_VLOG_LINES(1, GetModule()->ToString());
+    XLA_VLOG_LINES(3, GetModule()->ToString());
   }
   HloInstruction* clone = nullptr;
   if (called_computations_.empty()) {
@@ -1909,9 +1909,10 @@ bool HloInstruction::IsFusable() const {
     case HloOpcode::kRecv:
       return false;
     // Only fuse Rng if it is used once, otherwise the random numbers generated
-    // will be different in each fusion.
+    // will be different in each fusion. If it is the root (user count = 0)
+    // then it is the equivalent of having one user.
     case HloOpcode::kRng:
-      return users_.size() == 1;
+      return users_.size() <= 1;
     default:
       return true;
   }
diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
index f795a6ef629..ea5749581b5 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
@@ -1077,6 +1077,48 @@ TEST_F(HloInstructionTest, CloneOfFusionPreservesShape) {
                                root2->operand(1)->operand(0)->shape()));
 }
 
+TEST_F(HloInstructionTest, IsRandomFusable) {
+  // Rng must be fusable both as the root (no users) and with a single user.
+  auto shape = ShapeUtil::MakeShape(F32, {2, 2});
+  {
+    auto builder = HloComputation::Builder(TestName());
+    auto hlo_module = CreateNewModule();
+    auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
+        Literal::CreateR0<float>(0.0)));
+    auto const1 = builder.AddInstruction(HloInstruction::CreateConstant(
+        Literal::CreateR0<float>(1.0)));
+    auto rng = builder.AddInstruction(HloInstruction::CreateRng(
+        shape, RandomDistribution::RNG_NORMAL, {const0, const1}));
+
+    auto* computation = hlo_module->AddEntryComputation(builder.Build());
+    computation->CreateFusionInstruction({rng, const0, const1},
+      HloInstruction::FusionKind::kLoop);
+
+    auto* root = computation->root_instruction();
+
+    EXPECT_EQ(HloOpcode::kFusion, root->opcode());
+  }
+  {
+    auto builder = HloComputation::Builder(TestName());
+    auto hlo_module = CreateNewModule();
+    auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
+        Literal::CreateR0<float>(0.0)));
+    auto const1 = builder.AddInstruction(HloInstruction::CreateConstant(
+        Literal::CreateR0<float>(1.0)));
+    auto rng = builder.AddInstruction(HloInstruction::CreateRng(
+        shape, RandomDistribution::RNG_NORMAL, {const0, const1}));
+    builder.AddInstruction(HloInstruction::CreateUnary(
+        shape, HloOpcode::kNegate, rng));
+    auto* computation = hlo_module->AddEntryComputation(builder.Build());
+    computation->CreateFusionInstruction({rng, const0, const1},
+      HloInstruction::FusionKind::kLoop);
+
+    auto* root = computation->root_instruction();
+
+    EXPECT_EQ(HloOpcode::kFusion, root->operand(0)->opcode());
+  }
+}
+
 TEST_F(HloInstructionTest, CloneSuffixNames) {
   // Test that the suffix string added to cloned instructions is not
   // duplicated. Rather a numeric incrementing value should be appended. That
diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
index 7dba4e52f03..192477555d0 100644
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@@ -57,7 +57,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, NegConstantZeroElementF32) {
   ComputeAndCompareR1<float>(&builder, {}, {}, error_spec_);
 }
 
-TEST_F(ArrayElementwiseOpTest, NegConstantF32) {
+XLA_TEST_F(ArrayElementwiseOpTest, NegConstantF32) {
   ComputationBuilder builder(client_, TestName());
   auto a = builder.ConstantR1<float>({-2.5f, 3.14f, 2.25f, -10.0f, 6.0f});
   auto result = builder.Neg(a);
@@ -66,7 +66,7 @@ TEST_F(ArrayElementwiseOpTest, NegConstantF32) {
                              error_spec_);
 }
 
-TEST_F(ArrayElementwiseOpTest, NegConstantS32) {
+XLA_TEST_F(ArrayElementwiseOpTest, NegConstantS32) {
   ComputationBuilder builder(client_, TestName());
   auto a = builder.ConstantR1<int32>({-1, 0, 1, 324,
                                       std::numeric_limits<int32>::min(),
@@ -126,7 +126,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, IsFiniteR1F32s) {
                             {});
 }
 
-TEST_F(ArrayElementwiseOpTest, AddTwoConstantF32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantF32s) {
   ComputationBuilder builder(client_, TestName());
   auto a = builder.ConstantR1<float>({-2.5f, 3.14f, 2.25f, -10.0f, 6.0f});
   auto b = builder.ConstantR1<float>({100.0f, 3.13f, 2.75f, 10.5f, -999.0f});
@@ -185,7 +185,7 @@ TEST_P(ArrayElementwiseOpTestParamCount, AddManyValues) {
                              error_spec_);
 }
 
-TEST_F(ArrayElementwiseOpTest, SubTwoConstantF32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantF32s) {
   ComputationBuilder builder(client_, TestName());
   auto a = builder.ConstantR1<float>({-2.5f, 3.14f, 2.25f, -10.0f, 6.0f});
   auto b = builder.ConstantR1<float>({100.0f, 3.13f, 2.75f, 10.5f, -999.0f});
@@ -204,7 +204,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantZeroElementF32s) {
   ComputeAndCompareR1<float>(&builder, {}, {}, error_spec_);
 }
 
-TEST_F(ArrayElementwiseOpTest, SubTwoConstantS32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantS32s) {
   ComputationBuilder builder(client_, TestName());
   auto a = builder.ConstantR1<int32>({-1, 0, 2, 1000000000});
   auto b = builder.ConstantR1<int32>({-1, 2, 1, -1});
@@ -222,7 +222,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantZeroElementS32s) {
   ComputeAndCompareR1<int32>(&builder, {}, {});
 }
 
-TEST_F(ArrayElementwiseOpTest, DivTwoConstantF32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, DivTwoConstantF32s) {
   ComputationBuilder builder(client_, TestName());
   auto a = builder.ConstantR1<float>({-2.5f, 25.5f, 2.25f, -10.0f, 6.0f});
   auto b = builder.ConstantR1<float>({10.0f, 5.1f, 1.0f, 10.0f, -6.0f});
@@ -241,7 +241,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivTwoConstantZeroElementF32s) {
   ComputeAndCompareR1<float>(&builder, {}, {}, error_spec_);
 }
 
-TEST_F(ArrayElementwiseOpTest, DivS32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, DivS32s) {
   // clang-format off
   // Some interesting values to test.
   std::vector<int32> vals = {
@@ -316,7 +316,7 @@ TEST_F(ArrayElementwiseOpTest, DivS32s) {
   }
 }
 
-TEST_F(ArrayElementwiseOpTest, DivU32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, DivU32s) {
   // clang-format off
   // Some interesting values to test.
   std::vector<uint32> vals = {
@@ -420,7 +420,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, RemF64s) {
       error_spec_);
 }
 
-TEST_F(ArrayElementwiseOpTest, MulTwoConstantF32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, MulTwoConstantF32s) {
   ComputationBuilder builder(client_, TestName());
   auto a = builder.ConstantR1<float>({-2.5f, 25.5f, 2.25f, -10.0f, 6.0f});
   auto b = builder.ConstantR1<float>({10.0f, 5.0f, 1.0f, 10.0f, -6.0f});
@@ -439,7 +439,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, MulTwoConstantZeroElementF32s) {
   ComputeAndCompareR1<float>(&builder, {}, {}, error_spec_);
 }
 
-TEST_F(ArrayElementwiseOpTest, MulTwoConstantS32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, MulTwoConstantS32s) {
   std::vector<int32> data = {0,
                              1,
                              -1,
@@ -474,7 +474,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, MulTwoConstantZeroElementS32s) {
   ComputeAndCompareR1<int32>(&builder, {}, {});
 }
 
-TEST_F(ArrayElementwiseOpTest, MulTwoConstantU32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, MulTwoConstantU32s) {
   std::vector<uint32> data = {0,          1,          0xDEADBEEF, 1234,
                               0x1a243514, 0xFFFFFFFF, 0x80808080};
 
@@ -496,7 +496,7 @@ TEST_F(ArrayElementwiseOpTest, MulTwoConstantU32s) {
   ComputeAndCompareR1<uint32>(&builder, expected, {});
 }
 
-TEST_F(ArrayElementwiseOpTest, LogicalAnd) {
+XLA_TEST_F(ArrayElementwiseOpTest, LogicalAnd) {
   ComputationBuilder builder(client_, TestName());
   auto a = builder.ConstantR1<bool>({false, false, true, true});
   auto b = builder.ConstantR1<bool>({false, true, false, true});
@@ -514,7 +514,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, LogicalAndZeroElement) {
   ComputeAndCompareR1<bool>(&builder, {}, {});
 }
 
-TEST_F(ArrayElementwiseOpTest, LogicalOr) {
+XLA_TEST_F(ArrayElementwiseOpTest, LogicalOr) {
   ComputationBuilder builder(client_, TestName());
   auto a = builder.ConstantR1<bool>({false, false, true, true});
   auto b = builder.ConstantR1<bool>({false, true, false, true});
@@ -532,7 +532,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, LogicalOrZeroElement) {
   ComputeAndCompareR1<bool>(&builder, {}, {});
 }
 
-TEST_F(ArrayElementwiseOpTest, LogicalNot) {
+XLA_TEST_F(ArrayElementwiseOpTest, LogicalNot) {
   ComputationBuilder builder(client_, TestName());
   auto a = builder.ConstantR1<bool>({false, true, true, false});
   auto out = builder.LogicalNot(a);
@@ -548,7 +548,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, LogicalNotZeroElement) {
   ComputeAndCompareR1<bool>(&builder, {}, {});
 }
 
-TEST_F(ArrayElementwiseOpTest, CompareEqF32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, CompareEqF32s) {
   SetFastMathDisabled(true);
   ComputationBuilder builder(client_, TestName());
   auto lhs = builder.ConstantR1<float>({-2.5f, 25.5f, 2.25f, NAN, 6.0f});
@@ -567,7 +567,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareEqZeroElementF32s) {
   ComputeAndCompareR1<bool>(&builder, {}, {});
 }
 
-TEST_F(ArrayElementwiseOpTest, CompareGeF32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, CompareGeF32s) {
   SetFastMathDisabled(true);
   ComputationBuilder builder(client_, TestName());
   auto lhs = builder.ConstantR1<float>({-2.5f, 25.5f, 2.25f, NAN, 6.0f});
@@ -577,7 +577,7 @@ TEST_F(ArrayElementwiseOpTest, CompareGeF32s) {
   ComputeAndCompareR1<bool>(&builder, {false, true, true, false, false}, {});
 }
 
-TEST_F(ArrayElementwiseOpTest, CompareGtF32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, CompareGtF32s) {
   SetFastMathDisabled(true);
   ComputationBuilder builder(client_, TestName());
   auto lhs = builder.ConstantR1<float>({-2.5f, 25.5f, 2.25f, NAN, 6.0f});
@@ -587,7 +587,7 @@ TEST_F(ArrayElementwiseOpTest, CompareGtF32s) {
   ComputeAndCompareR1<bool>(&builder, {false, true, true, false, false}, {});
 }
 
-TEST_F(ArrayElementwiseOpTest, CompareLeF32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, CompareLeF32s) {
   SetFastMathDisabled(true);
   ComputationBuilder builder(client_, TestName());
   auto lhs = builder.ConstantR1<float>({-2.5f, 5.0f, 2.25f, NAN, 6.0f});
@@ -597,7 +597,7 @@ TEST_F(ArrayElementwiseOpTest, CompareLeF32s) {
   ComputeAndCompareR1<bool>(&builder, {true, true, false, false, false}, {});
 }
 
-TEST_F(ArrayElementwiseOpTest, CompareLtF32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, CompareLtF32s) {
   SetFastMathDisabled(true);
   ComputationBuilder builder(client_, TestName());
   auto lhs = builder.ConstantR1<float>({-2.5f, 25.5f, 2.25f, NAN, 6.0f});
@@ -607,7 +607,7 @@ TEST_F(ArrayElementwiseOpTest, CompareLtF32s) {
   ComputeAndCompareR1<bool>(&builder, {true, false, false, false, false}, {});
 }
 
-TEST_F(ArrayElementwiseOpTest, CompareEqS32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, CompareEqS32s) {
   const int32 min = std::numeric_limits<int32>::min();
   const int32 max = std::numeric_limits<int32>::max();
   ComputationBuilder builder(client_, TestName());
@@ -629,7 +629,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareEqZeroElementS32s) {
   ComputeAndCompareR1<bool>(&builder, {}, {});
 }
 
-TEST_F(ArrayElementwiseOpTest, CompareNeF32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, CompareNeF32s) {
   // Disable fast-math because we're operating on NaNs.
   SetFastMathDisabled(true);
 
@@ -641,7 +641,7 @@ TEST_F(ArrayElementwiseOpTest, CompareNeF32s) {
   ComputeAndCompareR1<bool>(&builder, {true, false, true, true, true}, {});
 }
 
-TEST_F(ArrayElementwiseOpTest, CompareNeS32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, CompareNeS32s) {
   const int32 min = std::numeric_limits<int32>::min();
   const int32 max = std::numeric_limits<int32>::max();
   ComputationBuilder builder(client_, TestName());
@@ -653,7 +653,7 @@ TEST_F(ArrayElementwiseOpTest, CompareNeS32s) {
       &builder, {false, true, true, true, false, true, true, true, false}, {});
 }
 
-TEST_F(ArrayElementwiseOpTest, CompareGeS32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, CompareGeS32s) {
   const int32 min = std::numeric_limits<int32>::min();
   const int32 max = std::numeric_limits<int32>::max();
   ComputationBuilder builder(client_, TestName());
@@ -665,7 +665,7 @@ TEST_F(ArrayElementwiseOpTest, CompareGeS32s) {
       &builder, {true, false, false, true, true, false, true, true, true}, {});
 }
 
-TEST_F(ArrayElementwiseOpTest, CompareGtS32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, CompareGtS32s) {
   const int32 min = std::numeric_limits<int32>::min();
   const int32 max = std::numeric_limits<int32>::max();
   ComputationBuilder builder(client_, TestName());
@@ -678,7 +678,7 @@ TEST_F(ArrayElementwiseOpTest, CompareGtS32s) {
       {});
 }
 
-TEST_F(ArrayElementwiseOpTest, CompareLeS32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, CompareLeS32s) {
   const int32 min = std::numeric_limits<int32>::min();
   const int32 max = std::numeric_limits<int32>::max();
   ComputationBuilder builder(client_, TestName());
@@ -690,7 +690,7 @@ TEST_F(ArrayElementwiseOpTest, CompareLeS32s) {
       &builder, {true, true, true, false, true, true, false, false, true}, {});
 }
 
-TEST_F(ArrayElementwiseOpTest, CompareLtS32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, CompareLtS32s) {
   const int32 min = std::numeric_limits<int32>::min();
   const int32 max = std::numeric_limits<int32>::max();
   ComputationBuilder builder(client_, TestName());
@@ -703,7 +703,7 @@ TEST_F(ArrayElementwiseOpTest, CompareLtS32s) {
       {});
 }
 
-TEST_F(ArrayElementwiseOpTest, CompareEqU32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, CompareEqU32s) {
   const uint32 max = std::numeric_limits<uint32>::max();
   ComputationBuilder builder(client_, TestName());
   auto lhs = builder.ConstantR1<uint32>({0, 0, 0, 5, 5, 5, max, max, max});
@@ -715,7 +715,7 @@ TEST_F(ArrayElementwiseOpTest, CompareEqU32s) {
       {});
 }
 
-TEST_F(ArrayElementwiseOpTest, CompareNeU32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, CompareNeU32s) {
   const uint32 max = std::numeric_limits<uint32>::max();
   ComputationBuilder builder(client_, TestName());
   auto lhs = builder.ConstantR1<uint32>({0, 0, 0, 5, 5, 5, max, max, max});
@@ -726,7 +726,7 @@ TEST_F(ArrayElementwiseOpTest, CompareNeU32s) {
       &builder, {false, true, true, true, false, true, true, true, false}, {});
 }
 
-TEST_F(ArrayElementwiseOpTest, CompareGeU32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, CompareGeU32s) {
   const uint32 max = std::numeric_limits<uint32>::max();
   ComputationBuilder builder(client_, TestName());
   auto lhs = builder.ConstantR1<uint32>({0, 0, 0, 5, 5, 5, max, max, max});
@@ -737,7 +737,7 @@ TEST_F(ArrayElementwiseOpTest, CompareGeU32s) {
       &builder, {true, false, false, true, true, false, true, true, true}, {});
 }
 
-TEST_F(ArrayElementwiseOpTest, CompareGtU32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, CompareGtU32s) {
   const uint32 max = std::numeric_limits<uint32>::max();
   ComputationBuilder builder(client_, TestName());
   auto lhs = builder.ConstantR1<uint32>({0, 0, 0, 5, 5, 5, max, max, max});
@@ -749,7 +749,7 @@ TEST_F(ArrayElementwiseOpTest, CompareGtU32s) {
       {});
 }
 
-TEST_F(ArrayElementwiseOpTest, CompareLeU32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, CompareLeU32s) {
   const uint32 max = std::numeric_limits<uint32>::max();
   ComputationBuilder builder(client_, TestName());
   auto lhs = builder.ConstantR1<uint32>({0, 0, 0, 5, 5, 5, max, max, max});
@@ -760,7 +760,7 @@ TEST_F(ArrayElementwiseOpTest, CompareLeU32s) {
       &builder, {true, true, true, false, true, true, false, false, true}, {});
 }
 
-TEST_F(ArrayElementwiseOpTest, CompareLtU32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, CompareLtU32s) {
   const uint32 max = std::numeric_limits<uint32>::max();
   ComputationBuilder builder(client_, TestName());
   auto lhs = builder.ConstantR1<uint32>({0, 0, 0, 5, 5, 5, max, max, max});
@@ -772,7 +772,7 @@ TEST_F(ArrayElementwiseOpTest, CompareLtU32s) {
       {});
 }
 
-TEST_F(ArrayElementwiseOpTest, PowF32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, PowF32s) {
   SetFastMathDisabled(true);
   ComputationBuilder builder(client_, TestName());
   auto lhs =
@@ -795,7 +795,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, PowZeroElementF32s) {
 }
 
 // Some Pow cases that can be implemented more efficiently.
-TEST_F(ArrayElementwiseOpTest, PowSpecialF32) {
+XLA_TEST_F(ArrayElementwiseOpTest, PowSpecialF32) {
   ComputationBuilder b(client_, TestName());
 
   std::vector<float> values = {1.0f, 2.0f, 3.2f, -4.0f};
@@ -823,7 +823,7 @@ TEST_F(ArrayElementwiseOpTest, PowSpecialF32) {
   ComputeAndCompareR1<float>(&b, expected, {param_data.get()}, error_spec_);
 }
 
-TEST_F(ArrayElementwiseOpTest, PowOfExpF32) {
+XLA_TEST_F(ArrayElementwiseOpTest, PowOfExpF32) {
   ComputationBuilder b(client_, TestName());
 
   std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.0f, 5.7f};
@@ -848,7 +848,7 @@ TEST_F(ArrayElementwiseOpTest, PowOfExpF32) {
                              error_spec_);
 }
 
-TEST_F(ArrayElementwiseOpTest, LogOfPowerF32) {
+XLA_TEST_F(ArrayElementwiseOpTest, LogOfPowerF32) {
   ComputationBuilder b(client_, TestName());
 
   std::vector<float> values0 = {1.0f, 2.0f, 3.2f, 4.0f, 0.5f, 5.7f};
@@ -873,7 +873,7 @@ TEST_F(ArrayElementwiseOpTest, LogOfPowerF32) {
                              error_spec_);
 }
 
-TEST_F(ArrayElementwiseOpTest, MulOfExpF32) {
+XLA_TEST_F(ArrayElementwiseOpTest, MulOfExpF32) {
   ComputationBuilder b(client_, TestName());
 
   std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.0f, 5.7f};
@@ -898,7 +898,7 @@ TEST_F(ArrayElementwiseOpTest, MulOfExpF32) {
                              error_spec_);
 }
 
-TEST_F(ArrayElementwiseOpTest, DivOfExpF32) {
+XLA_TEST_F(ArrayElementwiseOpTest, DivOfExpF32) {
   ComputationBuilder b(client_, TestName());
 
   std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.0f, 5.7f};
@@ -923,7 +923,7 @@ TEST_F(ArrayElementwiseOpTest, DivOfExpF32) {
                              error_spec_);
 }
 
-TEST_F(ArrayElementwiseOpTest, Div3_lhs_F32) {
+XLA_TEST_F(ArrayElementwiseOpTest, Div3_lhs_F32) {
   ComputationBuilder b(client_, TestName());
 
   std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.45f, 5.7f};
@@ -955,7 +955,7 @@ TEST_F(ArrayElementwiseOpTest, Div3_lhs_F32) {
       &b, expected, {data0.get(), data1.get(), data2.get()}, error_spec_);
 }
 
-TEST_F(ArrayElementwiseOpTest, Div3_rhs_F32) {
+XLA_TEST_F(ArrayElementwiseOpTest, Div3_rhs_F32) {
   ComputationBuilder b(client_, TestName());
 
   std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.45f, 5.7f};
@@ -988,7 +988,7 @@ TEST_F(ArrayElementwiseOpTest, Div3_rhs_F32) {
       &b, expected, {data0.get(), data1.get(), data2.get()}, error_spec_);
 }
 
-TEST_F(ArrayElementwiseOpTest, DivOfPowerF32) {
+XLA_TEST_F(ArrayElementwiseOpTest, DivOfPowerF32) {
   ComputationBuilder b(client_, TestName());
 
   std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.45f, 5.7f};
@@ -1021,7 +1021,7 @@ TEST_F(ArrayElementwiseOpTest, DivOfPowerF32) {
       &b, expected, {data0.get(), data1.get(), data2.get()}, error_spec_);
 }
 
-TEST_F(ArrayElementwiseOpTest, Div4F32) {
+XLA_TEST_F(ArrayElementwiseOpTest, Div4F32) {
   ComputationBuilder b(client_, TestName());
 
   std::vector<float> values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.45f, 5.7f};
@@ -1081,7 +1081,7 @@ TEST_P(ArrayElementwiseOpTestParamCount, SquareManyValues) {
   ComputeAndCompareR1<float>(&builder, expected, {}, error_spec_);
 }
 
-TEST_F(ArrayElementwiseOpTest, SquareIn4D) {
+XLA_TEST_F(ArrayElementwiseOpTest, SquareIn4D) {
   ComputationBuilder builder(client_, TestName());
   Array4D<float> values(2, 2, 2, 2);
 
@@ -1120,7 +1120,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, SquareIn4DZeroElements) {
 //
 // TODO(b/28180546): Make this compile in a way that is consistent
 // among backends.
-TEST_F(ArrayElementwiseOpTest, MinF32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, MinF32s) {
   ComputationBuilder builder(client_, TestName());
 #if !defined(XLA_TEST_BACKEND_CPU)
   auto lhs = builder.ConstantR1<float>({1.0f, 1.0f, 2.25f});
@@ -1174,7 +1174,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, MinF64s) {
 
 // TODO(b/28180546): Make this compile in a way that is consistent
 // among backends. See comment on MinF32s test above.
-TEST_F(ArrayElementwiseOpTest, MaxF32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, MaxF32s) {
   ComputationBuilder builder(client_, TestName());
 #if !defined(XLA_TEST_BACKEND_CPU)
   auto lhs = builder.ConstantR1<float>({1.0f, 1.0f, 2.25f});
@@ -1226,7 +1226,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, MaxF64s) {
                               {}, error_spec_);
 }
 
-TEST_F(ArrayElementwiseOpTest, MaxS32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, MaxS32s) {
   const int32 min = std::numeric_limits<int32>::min();
   const int32 max = std::numeric_limits<int32>::max();
   ComputationBuilder builder(client_, TestName());
@@ -1241,7 +1241,7 @@ TEST_F(ArrayElementwiseOpTest, MaxS32s) {
   ComputeAndCompareR1<int32>(&builder, expected, {});
 }
 
-TEST_F(ArrayElementwiseOpTest, MinS32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, MinS32s) {
   const int32 min = std::numeric_limits<int32>::min();
   const int32 max = std::numeric_limits<int32>::max();
   ComputationBuilder builder(client_, TestName());
@@ -1256,7 +1256,7 @@ TEST_F(ArrayElementwiseOpTest, MinS32s) {
   ComputeAndCompareR1<int32>(&builder, expected, {});
 }
 
-TEST_F(ArrayElementwiseOpTest, MaxU32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, MaxU32s) {
   const uint32 max = std::numeric_limits<uint32>::max();
   ComputationBuilder builder(client_, TestName());
   auto x = builder.ConstantR1<uint32>({0, 0, 1, 1, 1, max, max, max});
@@ -1267,7 +1267,7 @@ TEST_F(ArrayElementwiseOpTest, MaxU32s) {
   ComputeAndCompareR1<uint32>(&builder, expected, {});
 }
 
-TEST_F(ArrayElementwiseOpTest, MinU32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, MinU32s) {
   const uint32 max = std::numeric_limits<uint32>::max();
   ComputationBuilder builder(client_, TestName());
   auto x = builder.ConstantR1<uint32>({0, 0, 1, 1, 1, max, max, max});
@@ -1278,7 +1278,7 @@ TEST_F(ArrayElementwiseOpTest, MinU32s) {
   ComputeAndCompareR1<uint32>(&builder, expected, {});
 }
 
-TEST_F(ArrayElementwiseOpTest, MaxTenF32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, MaxTenF32s) {
   ComputationBuilder builder(client_, TestName());
   auto x = builder.ConstantR1<float>(
       {-0.0, 1.0, 2.0, -3.0, -4.0, 5.0, 6.0, -7.0, -8.0, 9.0});
@@ -1311,7 +1311,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, MaxR1S0AndR2S0x2F32s) {
   }
 }
 
-TEST_F(ArrayElementwiseOpTest, Max1DAnd2DF32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, Max1DAnd2DF32s) {
   ComputationBuilder builder(client_, TestName());
   auto v = builder.ConstantR1<float>({2.0f, 3.0f, 4.0f});
   auto m =
@@ -1354,7 +1354,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Max3DAndScalarZeroElementS32s) {
   ComputeAndCompareR3<int32>(&builder, expected, {});
 }
 
-TEST_F(ArrayElementwiseOpTest, Min2DTo1DF32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, Min2DTo1DF32s) {
   ComputationBuilder builder(client_, TestName());
   auto m =
       builder.ConstantR2<float>({{-10.4f, 64.0f, 6.0f}, {0.1f, 32.0f, 16.1f}});
@@ -1431,7 +1431,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, RemTwoConstantS32s) {
   ComputeAndCompareR1<int32>(&builder, {-3, 1, 0, -1, 1}, {});
 }
 
-TEST_F(ArrayElementwiseOpTest, NonNanClampF32) {
+XLA_TEST_F(ArrayElementwiseOpTest, NonNanClampF32) {
   ComputationBuilder builder(client_, TestName());
   auto minimum = builder.ConstantR1<float>({1.0f, -6.5f, 1.0f, 2.25f, 0.0f});
   auto argument = builder.ConstantR1<float>({2.0f, 10.0f, -5.0f, 1.0f, 10.0f});
@@ -1442,7 +1442,7 @@ TEST_F(ArrayElementwiseOpTest, NonNanClampF32) {
                              error_spec_);
 }
 
-TEST_F(ArrayElementwiseOpTest, ClampF32Scalar) {
+XLA_TEST_F(ArrayElementwiseOpTest, ClampF32Scalar) {
   ComputationBuilder builder(client_, TestName());
   auto minimum = builder.ConstantR0<float>(0.0f);
   auto argument = builder.ConstantR1<float>({2.0f, 10.0f, -5.0f, 1.0f, 4.0f});
@@ -1453,7 +1453,7 @@ TEST_F(ArrayElementwiseOpTest, ClampF32Scalar) {
                              error_spec_);
 }
 
-TEST_F(ArrayElementwiseOpTest, ClampF32ScalarVector) {
+XLA_TEST_F(ArrayElementwiseOpTest, ClampF32ScalarVector) {
   ComputationBuilder builder(client_, TestName());
   auto min_scalar = builder.ConstantR0<float>(0.0f);
   auto min_vector = builder.ConstantR1<float>({1.0f, -6.5f, 1.0f, 2.25f, 0.0f});
@@ -1472,7 +1472,7 @@ TEST_F(ArrayElementwiseOpTest, ClampF32ScalarVector) {
                              error_spec_);
 }
 
-TEST_F(ArrayElementwiseOpTest, AddTwoParametersF32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, AddTwoParametersF32s) {
   ComputationBuilder builder(client_, TestName());
 
   std::unique_ptr<Literal> param0_literal =
@@ -1516,7 +1516,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddTwoParametersZeroElementF32s) {
       &builder, expected, {param0_data.get(), param1_data.get()}, error_spec_);
 }
 
-TEST_F(ArrayElementwiseOpTest, AddParameterToConstantF32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, AddParameterToConstantF32s) {
   ComputationBuilder builder(client_, TestName());
 
   std::unique_ptr<Literal> param0_literal =
@@ -1550,7 +1550,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, SinF32s) {
                              error_spec_);
 }
 
-TEST_F(ArrayElementwiseOpTest, TanhF32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, TanhF32s) {
   ComputationBuilder builder(client_, TestName());
   auto a = builder.ConstantR1<float>({-2.5f, 3.14f, 2.25f});
   auto result = builder.Tanh(a);
@@ -1559,7 +1559,7 @@ TEST_F(ArrayElementwiseOpTest, TanhF32s) {
                              error_spec_);
 }
 
-TEST_F(ArrayElementwiseOpTest, TanhF32sVector) {
+XLA_TEST_F(ArrayElementwiseOpTest, TanhF32sVector) {
   // This is like the test ArrayElementwiseOpTest.TanhF32s above, except that
   // the input tensor is large enough to exercise the vectorized tanh
   // implementation.
@@ -1603,7 +1603,7 @@ TEST_F(ArrayElementwiseOpTest, TanhF32sVector) {
       ErrorSpec(0.004, 0.004));
 }
 
-TEST_F(ArrayElementwiseOpTest, AddChainFoldLeft) {
+XLA_TEST_F(ArrayElementwiseOpTest, AddChainFoldLeft) {
   // a ------ (add) --------- (add)
   //         /               /
   // b -----/               /
@@ -1621,7 +1621,7 @@ TEST_F(ArrayElementwiseOpTest, AddChainFoldLeft) {
                              error_spec_);
 }
 
-TEST_F(ArrayElementwiseOpTest, AddChainFoldRight) {
+XLA_TEST_F(ArrayElementwiseOpTest, AddChainFoldRight) {
   // b ------ (add) --------- (add)
   //         /               /
   // c -----/               /
@@ -1639,7 +1639,7 @@ TEST_F(ArrayElementwiseOpTest, AddChainFoldRight) {
                              error_spec_);
 }
 
-TEST_F(ArrayElementwiseOpTest, AddWithNeg) {
+XLA_TEST_F(ArrayElementwiseOpTest, AddWithNeg) {
   // a ----- (neg) ----- (add)
   //                    /
   // b ----- (neg) ----/
@@ -1656,7 +1656,7 @@ TEST_F(ArrayElementwiseOpTest, AddWithNeg) {
                              error_spec_);
 }
 
-TEST_F(ArrayElementwiseOpTest, AddChainTwoSide) {
+XLA_TEST_F(ArrayElementwiseOpTest, AddChainTwoSide) {
   // a ------ (add) ------------\
   //         /                   \
   // b -----/                    (add)
@@ -1679,7 +1679,7 @@ TEST_F(ArrayElementwiseOpTest, AddChainTwoSide) {
                              error_spec_);
 }
 
-TEST_F(ArrayElementwiseOpTest, 2DBinaryOpF32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, 2DBinaryOpF32s) {
   ComputationBuilder builder(client_, TestName());
   auto a =
       builder.ConstantR2<float>({{-2.5f, 3.14f, 1.0f}, {2.25f, -10.0f, 3.33f}});
@@ -1704,7 +1704,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, ScalarPlus2DF32) {
   ComputeAndCompareR2<float>(&builder, expected_array, {}, error_spec_);
 }
 
-TEST_F(ArrayElementwiseOpTest, 2DPlusScalarF32) {
+XLA_TEST_F(ArrayElementwiseOpTest, 2DPlusScalarF32) {
   // Add a matrix + scalar.
   ComputationBuilder builder(client_, TestName());
   auto a =
@@ -1820,7 +1820,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Lt) {
   EXPECT_EQ(expected, ExecuteToString(&builder, {}));
 }
 
-TEST_F(ArrayElementwiseOpTest, Mul2Dby1DF32) {
+XLA_TEST_F(ArrayElementwiseOpTest, Mul2Dby1DF32) {
   // Test simple broadcasting of a R1F32 over R2F32 when the order of binary op
   // arguments is reversed.
   ComputationBuilder builder(client_, TestName());
@@ -1831,7 +1831,7 @@ TEST_F(ArrayElementwiseOpTest, Mul2Dby1DF32) {
   ComputeAndCompareR2<float>(&builder, expected_array, {}, error_spec_);
 }
 
-TEST_F(ArrayElementwiseOpTest, Add2DTo2DWithDegenerateDim1) {
+XLA_TEST_F(ArrayElementwiseOpTest, Add2DTo2DWithDegenerateDim1) {
   // Tests broadcasting for arrays with degenerate (size == 1) dimensions.
   ComputationBuilder builder(client_, TestName());
   // m's shape in XLA notation is {3, 2}
@@ -1891,7 +1891,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add1DTo2DF32TwoWaysOver1) {
   ComputeAndCompareR2<float>(&builder, expected_array, {}, error_spec_);
 }
 
-TEST_F(ArrayElementwiseOpTest, Add1DTo2DF32TwoWaysOver0) {
+XLA_TEST_F(ArrayElementwiseOpTest, Add1DTo2DF32TwoWaysOver0) {
   // Add together a (2,2) array and a (2) array, using dimension 1 for
   // broadcasting (though there are two ways to broadcast these shapes).
   ComputationBuilder builder(client_, TestName());
@@ -1902,7 +1902,7 @@ TEST_F(ArrayElementwiseOpTest, Add1DTo2DF32TwoWaysOver0) {
   ComputeAndCompareR2<float>(&builder, expected_array, {}, error_spec_);
 }
 
-TEST_F(ArrayElementwiseOpTest, 3DBinaryOpF32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, 3DBinaryOpF32s) {
   // Binary add of two R3s together
   ComputationBuilder builder(client_, TestName());
   Array3D<float> a_3d({{{1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}},
@@ -2033,7 +2033,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareGtR3F32sWithDegenerateDim2) {
   EXPECT_EQ(expected, ExecuteToString(&builder, {}));
 }
 
-TEST_F(ArrayElementwiseOpTest, 4DBinaryOpF32s) {
+XLA_TEST_F(ArrayElementwiseOpTest, 4DBinaryOpF32s) {
   ComputationBuilder builder(client_, TestName());
 
   std::unique_ptr<Array4D<float>> operand_a_4d(new Array4D<float>(2, 3, 4, 5));
@@ -2060,7 +2060,7 @@ TEST_F(ArrayElementwiseOpTest, 4DBinaryOpF32s) {
   ComputeAndCompareR4<float>(&builder, *expected_4d, {}, error_spec_);
 }
 
-TEST_F(ArrayElementwiseOpTest, R4PlusR1InDim1) {
+XLA_TEST_F(ArrayElementwiseOpTest, R4PlusR1InDim1) {
   ComputationBuilder builder(client_, TestName());
 
   std::unique_ptr<Array4D<float>> operand_a_4d(new Array4D<float>(2, 3, 4, 5));
@@ -2088,7 +2088,7 @@ TEST_F(ArrayElementwiseOpTest, R4PlusR1InDim1) {
   ComputeAndCompareR4<float>(&builder, *expected_4d, {}, error_spec_);
 }
 
-TEST_F(ArrayElementwiseOpTest, R4_16x16x2x2_Plus_R1_16) {
+XLA_TEST_F(ArrayElementwiseOpTest, R4_16x16x2x2_Plus_R1_16) {
   constexpr int d0 = 16;
   constexpr int d1 = 16;
   constexpr int d2 = 2;
@@ -2119,7 +2119,7 @@ TEST_F(ArrayElementwiseOpTest, R4_16x16x2x2_Plus_R1_16) {
 }
 
 // Show that we can't add two opaques.
-TEST_F(ArrayElementwiseOpTest, CannotAddOpaques) {
+XLA_TEST_F(ArrayElementwiseOpTest, CannotAddOpaques) {
   ComputationBuilder builder(client_, TestName());
   auto shape = ShapeUtil::MakeOpaqueShape();
   auto x = builder.Parameter(0, shape, "x");
@@ -2133,7 +2133,7 @@ TEST_F(ArrayElementwiseOpTest, CannotAddOpaques) {
 
 // Regression test for b/31927799. "slice - y" is fused and requires implicit
 // broadcast.
-TEST_F(ArrayElementwiseOpTest, ImplictBroadcastInFusedExpressions) {
+XLA_TEST_F(ArrayElementwiseOpTest, ImplictBroadcastInFusedExpressions) {
   ComputationBuilder builder(client_, TestName());
   auto x_literal = Literal::CreateR1<float>({1, 2, 3});
   auto y_literal = Literal::CreateR1<float>({4, 5});
diff --git a/tensorflow/compiler/xla/tests/build_defs.bzl b/tensorflow/compiler/xla/tests/build_defs.bzl
index 7b707cd3601..82935157f51 100644
--- a/tensorflow/compiler/xla/tests/build_defs.bzl
+++ b/tensorflow/compiler/xla/tests/build_defs.bzl
@@ -31,6 +31,7 @@ def xla_test(name,
              args=[],
              tags=[],
              copts=[],
+             data=[],
              backend_tags={},
              backend_args={},
              **kwargs):
@@ -114,6 +115,7 @@ def xla_test(name,
     this_backend_tags = ["xla_%s" % backend]
     this_backend_copts = []
     this_backend_args = backend_args.get(backend, [])
+    this_backend_data = []
     if backend == "cpu":
       backend_deps = ["//tensorflow/compiler/xla/service:cpu_plugin"]
       backend_deps += ["//tensorflow/compiler/xla/tests:test_macros_cpu"]
@@ -130,6 +132,7 @@ def xla_test(name,
       this_backend_copts += plugins[backend]["copts"]
       this_backend_tags += plugins[backend]["tags"]
       this_backend_args += plugins[backend]["args"]
+      this_backend_data += plugins[backend]["data"]
     else:
       fail("Unknown backend %s" % backend)
 
@@ -145,6 +148,7 @@ def xla_test(name,
         this_backend_copts,
         args=args + this_backend_args,
         deps=deps + backend_deps,
+        data=data + this_backend_data,
         **kwargs)
 
     test_names.append(test_name)
@@ -227,14 +231,18 @@ def generate_backend_test_macros(backends=[]):
   if not backends:
     backends = all_backends
   for backend in filter_backends(backends):
+    manifest = ""
+    if backend in plugins:
+      manifest = plugins[backend]["disabled_manifest"]
+
     native.cc_library(
         name="test_macros_%s" % backend,
         testonly = True,
         srcs = ["test_macros.cc"],
         hdrs = ["test_macros.h"],
         copts = [
-            "-DXLA_PLATFORM=\\\"%s\\\"" % backend.upper(),
-            "-DXLA_DISABLED_MANIFEST=\\\"\\\""
+          "-DXLA_PLATFORM=\\\"%s\\\"" % backend.upper(),
+          "-DXLA_DISABLED_MANIFEST=\\\"%s\\\"" % manifest,
         ],
         deps = [
             "//tensorflow/compiler/xla:types",
diff --git a/tensorflow/compiler/xla/tests/plugin.bzl b/tensorflow/compiler/xla/tests/plugin.bzl
index 1b10c778ce3..8a5d91363b6 100644
--- a/tensorflow/compiler/xla/tests/plugin.bzl
+++ b/tensorflow/compiler/xla/tests/plugin.bzl
@@ -22,9 +22,13 @@
 #       "//tensorflow/compiler/plugin/foo:foo_lib",
 #       "//tensorflow/compiler/plugin/foo:test_macros",
 #     ],
+#     "disabled_manifest": "tensorflow/compiler/plugin/foo/disabled_test_manifest.txt",
 #     "copts": [],
 #     "tags": [],
-#     "args": []
+#     "args": [],
+#     "data": [
+#       "//tensorflow/compiler/plugin/foo:disabled_test_manifest.txt",
+#     ],
 #   },
 # }
 
diff --git a/tensorflow/compiler/xla/tests/scalar_computations_test.cc b/tensorflow/compiler/xla/tests/scalar_computations_test.cc
index 25ca035366c..f3cbc013238 100644
--- a/tensorflow/compiler/xla/tests/scalar_computations_test.cc
+++ b/tensorflow/compiler/xla/tests/scalar_computations_test.cc
@@ -69,35 +69,35 @@ class ScalarComputationsTest : public ClientLibraryTestBase {
   }
 };
 
-TEST_F(ScalarComputationsTest, NegateScalarF32) {
+XLA_TEST_F(ScalarComputationsTest, NegateScalarF32) {
   ComputationBuilder builder(client_, TestName());
   builder.Neg(builder.ConstantR0<float>(2.1f));
 
   ComputeAndCompareR0<float>(&builder, -2.1f, {}, error_spec_);
 }
 
-TEST_F(ScalarComputationsTest, NegateScalarS32) {
+XLA_TEST_F(ScalarComputationsTest, NegateScalarS32) {
   ComputationBuilder builder(client_, TestName());
   builder.Neg(builder.ConstantR0<int32>(2));
 
   ComputeAndCompareR0<int32>(&builder, -2, {});
 }
 
-TEST_F(ScalarComputationsTest, AddTwoScalarsF32) {
+XLA_TEST_F(ScalarComputationsTest, AddTwoScalarsF32) {
   ComputationBuilder builder(client_, TestName());
   builder.Add(builder.ConstantR0<float>(2.1f), builder.ConstantR0<float>(5.5f));
 
   ComputeAndCompareR0<float>(&builder, 7.6f, {}, error_spec_);
 }
 
-TEST_F(ScalarComputationsTest, AddTwoScalarsS32) {
+XLA_TEST_F(ScalarComputationsTest, AddTwoScalarsS32) {
   ComputationBuilder builder(client_, TestName());
   builder.Add(builder.ConstantR0<int32>(2), builder.ConstantR0<int32>(5));
 
   ComputeAndCompareR0<int32>(&builder, 7, {});
 }
 
-TEST_F(ScalarComputationsTest, AddTwoScalarsU32) {
+XLA_TEST_F(ScalarComputationsTest, AddTwoScalarsU32) {
   ComputationBuilder builder(client_, TestName());
   builder.Add(builder.ConstantR0<uint32>(35), builder.ConstantR0<uint32>(57));
 
@@ -137,21 +137,21 @@ XLA_TEST_F(ScalarComputationsTest, AddTwoScalarsF64) {
   ComputeAndCompareR0<double>(&builder, 3.75, {});
 }
 
-TEST_F(ScalarComputationsTest, SubtractTwoScalarsF32) {
+XLA_TEST_F(ScalarComputationsTest, SubtractTwoScalarsF32) {
   ComputationBuilder builder(client_, TestName());
   builder.Sub(builder.ConstantR0<float>(2.1f), builder.ConstantR0<float>(5.5f));
 
   ComputeAndCompareR0<float>(&builder, -3.4f, {}, error_spec_);
 }
 
-TEST_F(ScalarComputationsTest, SubtractTwoScalarsS32) {
+XLA_TEST_F(ScalarComputationsTest, SubtractTwoScalarsS32) {
   ComputationBuilder builder(client_, TestName());
   builder.Sub(builder.ConstantR0<int32>(2), builder.ConstantR0<int32>(5));
 
   ComputeAndCompareR0<int32>(&builder, -3, {});
 }
 
-TEST_F(ScalarComputationsTest, MulThreeScalarsF32) {
+XLA_TEST_F(ScalarComputationsTest, MulThreeScalarsF32) {
   ComputationBuilder builder(client_, TestName());
   builder.Mul(builder.Mul(builder.ConstantR0<float>(2.1f),
                           builder.ConstantR0<float>(5.5f)),
@@ -160,7 +160,7 @@ TEST_F(ScalarComputationsTest, MulThreeScalarsF32) {
   ComputeAndCompareR0<float>(&builder, 5.775f, {}, error_spec_);
 }
 
-TEST_F(ScalarComputationsTest, MulTwoScalarsS32) {
+XLA_TEST_F(ScalarComputationsTest, MulTwoScalarsS32) {
   std::vector<int32> data = {0,
                              1,
                              -1,
@@ -184,7 +184,7 @@ TEST_F(ScalarComputationsTest, MulTwoScalarsS32) {
   }
 }
 
-TEST_F(ScalarComputationsTest, MulTwoScalarsU32) {
+XLA_TEST_F(ScalarComputationsTest, MulTwoScalarsU32) {
   std::vector<uint32> data = {0,          1,          0xDEADBEEF, 1234,
                               0x1a243514, 0xFFFFFFFF, 0x80808080};
 
@@ -199,7 +199,7 @@ TEST_F(ScalarComputationsTest, MulTwoScalarsU32) {
   }
 }
 
-TEST_F(ScalarComputationsTest, MulThreeScalarsS32) {
+XLA_TEST_F(ScalarComputationsTest, MulThreeScalarsS32) {
   ComputationBuilder builder(client_, TestName());
   builder.Mul(
       builder.Mul(builder.ConstantR0<int32>(2), builder.ConstantR0<int32>(5)),
@@ -208,7 +208,7 @@ TEST_F(ScalarComputationsTest, MulThreeScalarsS32) {
   ComputeAndCompareR0<int32>(&builder, 10, {});
 }
 
-TEST_F(ScalarComputationsTest, MulThreeScalarsF32Params) {
+XLA_TEST_F(ScalarComputationsTest, MulThreeScalarsF32Params) {
   ComputationBuilder builder(client_, TestName());
   std::unique_ptr<Literal> a_literal = Literal::CreateR0<float>(2.1f);
   std::unique_ptr<Literal> b_literal = Literal::CreateR0<float>(5.5f);
@@ -231,7 +231,7 @@ TEST_F(ScalarComputationsTest, MulThreeScalarsF32Params) {
                              error_spec_);
 }
 
-TEST_F(ScalarComputationsTest, DivideTwoScalarsF32) {
+XLA_TEST_F(ScalarComputationsTest, DivideTwoScalarsF32) {
   ComputationBuilder builder(client_, TestName());
   builder.Div(builder.ConstantR0<float>(5.0f), builder.ConstantR0<float>(2.5f));
 
@@ -337,7 +337,7 @@ INSTANTIATE_TEST_CASE_P(
         DivS32Params{INT32_MIN, -0x40000000, 2, 0},                //
         DivS32Params{INT32_MIN + 1, -0x40000000, 1, -0x3fffffff}));
 
-TEST_F(ScalarComputationsTest, DivU32s) {
+XLA_TEST_F(ScalarComputationsTest, DivU32s) {
   // clang-format off
   // Some interesting values to test.
   std::vector<uint32> vals = {
@@ -378,7 +378,7 @@ TEST_F(ScalarComputationsTest, DivU32s) {
   }
 }
 
-TEST_F(ScalarComputationsTest, RemU32s) {
+XLA_TEST_F(ScalarComputationsTest, RemU32s) {
   // clang-format off
   // Some interesting values to test.
   std::vector<uint32> vals = {
@@ -419,7 +419,7 @@ TEST_F(ScalarComputationsTest, RemU32s) {
   }
 }
 
-TEST_F(ScalarComputationsTest, RemainderTwoScalarsNonConstDividendS32) {
+XLA_TEST_F(ScalarComputationsTest, RemainderTwoScalarsNonConstDividendS32) {
   ComputationBuilder builder(client_, TestName());
   auto x = builder.Parameter(0, ShapeUtil::MakeShape(S32, {}), "x");
   builder.Rem(x, builder.ConstantR0<int32>(80000));
@@ -446,7 +446,7 @@ XLA_TEST_F(ScalarComputationsTest, RemTwoScalarsU32) {
   ComputeAndCompareR0<uint32>(&builder, 2, {});
 }
 
-TEST_F(ScalarComputationsTest, LogicalAnd) {
+XLA_TEST_F(ScalarComputationsTest, LogicalAnd) {
   for (bool x : {false, true}) {
     for (bool y : {false, true}) {
       ComputationBuilder builder(client_, TestName());
@@ -458,7 +458,7 @@ TEST_F(ScalarComputationsTest, LogicalAnd) {
   }
 }
 
-TEST_F(ScalarComputationsTest, LogicalOr) {
+XLA_TEST_F(ScalarComputationsTest, LogicalOr) {
   for (bool x : {false, true}) {
     for (bool y : {false, true}) {
       ComputationBuilder builder(client_, TestName());
@@ -470,7 +470,7 @@ TEST_F(ScalarComputationsTest, LogicalOr) {
   }
 }
 
-TEST_F(ScalarComputationsTest, LogicalNot) {
+XLA_TEST_F(ScalarComputationsTest, LogicalNot) {
   for (bool x : {false, true}) {
     ComputationBuilder builder(client_, TestName());
     builder.LogicalNot(builder.ConstantR0<bool>(x));
@@ -479,7 +479,7 @@ TEST_F(ScalarComputationsTest, LogicalNot) {
   }
 }
 
-TEST_F(ScalarComputationsTest, SelectScalarTrue) {
+XLA_TEST_F(ScalarComputationsTest, SelectScalarTrue) {
   ComputationBuilder builder(client_, TestName());
   builder.Select(builder.ConstantR0<bool>(true),     // The predicate.
                  builder.ConstantR0<float>(123.0f),  // The value on true.
@@ -488,7 +488,7 @@ TEST_F(ScalarComputationsTest, SelectScalarTrue) {
   ComputeAndCompareR0<float>(&builder, 123.0f, {}, error_spec_);
 }
 
-TEST_F(ScalarComputationsTest, SelectScalarFalse) {
+XLA_TEST_F(ScalarComputationsTest, SelectScalarFalse) {
   ComputationBuilder builder(client_, TestName());
   builder.Select(builder.ConstantR0<bool>(false),    // The predicate.
                  builder.ConstantR0<float>(123.0f),  // The value on true.
@@ -499,7 +499,7 @@ TEST_F(ScalarComputationsTest, SelectScalarFalse) {
 
 // This test is an explicit version of what is happening in the following
 // templatized comparison tests.
-TEST_F(ScalarComputationsTest, CompareGtScalar) {
+XLA_TEST_F(ScalarComputationsTest, CompareGtScalar) {
   ComputationBuilder builder(client_, TestName());
   builder.Gt(builder.ConstantR0<float>(2.0f), builder.ConstantR0<float>(1.0f));
 
@@ -507,30 +507,30 @@ TEST_F(ScalarComputationsTest, CompareGtScalar) {
 }
 
 // S32 comparisons.
-TEST_F(ScalarComputationsTest, CompareEqS32Greater) {
+XLA_TEST_F(ScalarComputationsTest, CompareEqS32Greater) {
   TestCompare<int32>(2, 1, false, &ComputationBuilder::Eq);
 }
-TEST_F(ScalarComputationsTest, CompareEqS32Equal) {
+XLA_TEST_F(ScalarComputationsTest, CompareEqS32Equal) {
   TestCompare<int32>(3, 3, true, &ComputationBuilder::Eq);
 }
 
-TEST_F(ScalarComputationsTest, CompareNeS32) {
+XLA_TEST_F(ScalarComputationsTest, CompareNeS32) {
   TestCompare<int32>(2, 1, true, &ComputationBuilder::Ne);
 }
 
-TEST_F(ScalarComputationsTest, CompareGeS32) {
+XLA_TEST_F(ScalarComputationsTest, CompareGeS32) {
   TestCompare<int32>(2, 1, true, &ComputationBuilder::Ge);
 }
 
-TEST_F(ScalarComputationsTest, CompareGtS32) {
+XLA_TEST_F(ScalarComputationsTest, CompareGtS32) {
   TestCompare<int32>(1, 5, false, &ComputationBuilder::Gt);
 }
 
-TEST_F(ScalarComputationsTest, CompareLeS32) {
+XLA_TEST_F(ScalarComputationsTest, CompareLeS32) {
   TestCompare<int32>(2, 1, false, &ComputationBuilder::Le);
 }
 
-TEST_F(ScalarComputationsTest, CompareLtS32) {
+XLA_TEST_F(ScalarComputationsTest, CompareLtS32) {
   TestCompare<int32>(9, 7, false, &ComputationBuilder::Lt);
   TestCompare<int32>(std::numeric_limits<int32>::min(),
                      std::numeric_limits<int32>::max(), true,
@@ -538,105 +538,105 @@ TEST_F(ScalarComputationsTest, CompareLtS32) {
 }
 
 // U32 comparisons.
-TEST_F(ScalarComputationsTest, CompareEqU32False) {
+XLA_TEST_F(ScalarComputationsTest, CompareEqU32False) {
   TestCompare<uint32>(2, 1, false, &ComputationBuilder::Eq);
 }
 
-TEST_F(ScalarComputationsTest, CompareNeU32) {
+XLA_TEST_F(ScalarComputationsTest, CompareNeU32) {
   TestCompare<uint32>(2, 1, true, &ComputationBuilder::Ne);
 }
 
-TEST_F(ScalarComputationsTest, CompareGeU32Greater) {
+XLA_TEST_F(ScalarComputationsTest, CompareGeU32Greater) {
   TestCompare<uint32>(2, 1, true, &ComputationBuilder::Ge);
 }
 
-TEST_F(ScalarComputationsTest, CompareGeU32Equal) {
+XLA_TEST_F(ScalarComputationsTest, CompareGeU32Equal) {
   TestCompare<uint32>(3, 3, true, &ComputationBuilder::Ge);
 }
 
-TEST_F(ScalarComputationsTest, CompareGtU32) {
+XLA_TEST_F(ScalarComputationsTest, CompareGtU32) {
   TestCompare<uint32>(1, 5, false, &ComputationBuilder::Gt);
   TestCompare<uint32>(5, 5, false, &ComputationBuilder::Gt);
   TestCompare<uint32>(5, 1, true, &ComputationBuilder::Gt);
 }
 
-TEST_F(ScalarComputationsTest, CompareLeU32) {
+XLA_TEST_F(ScalarComputationsTest, CompareLeU32) {
   TestCompare<uint32>(2, 1, false, &ComputationBuilder::Le);
 }
 
-TEST_F(ScalarComputationsTest, CompareLtU32) {
+XLA_TEST_F(ScalarComputationsTest, CompareLtU32) {
   TestCompare<uint32>(9, 7, false, &ComputationBuilder::Lt);
   TestCompare<uint32>(0, std::numeric_limits<uint32>::max(), true,
                       &ComputationBuilder::Lt);
 }
 
 // F32 comparisons.
-TEST_F(ScalarComputationsTest, CompareEqF32False) {
+XLA_TEST_F(ScalarComputationsTest, CompareEqF32False) {
   TestCompare<float>(2.0, 1.3, false, &ComputationBuilder::Eq);
 }
 
-TEST_F(ScalarComputationsTest, CompareNeF32) {
+XLA_TEST_F(ScalarComputationsTest, CompareNeF32) {
   TestCompare<float>(2.0, 1.3, true, &ComputationBuilder::Ne);
 }
 
-TEST_F(ScalarComputationsTest, CompareGeF32Greater) {
+XLA_TEST_F(ScalarComputationsTest, CompareGeF32Greater) {
   TestCompare<float>(2.0, 1.9, true, &ComputationBuilder::Ge);
 }
-TEST_F(ScalarComputationsTest, CompareGeF32Equal) {
+XLA_TEST_F(ScalarComputationsTest, CompareGeF32Equal) {
   TestCompare<float>(3.5, 3.5, true, &ComputationBuilder::Ge);
 }
 
-TEST_F(ScalarComputationsTest, CompareGtF32) {
+XLA_TEST_F(ScalarComputationsTest, CompareGtF32) {
   TestCompare<float>(1.0, 5.2, false, &ComputationBuilder::Gt);
 }
 
-TEST_F(ScalarComputationsTest, CompareLeF32) {
+XLA_TEST_F(ScalarComputationsTest, CompareLeF32) {
   TestCompare<float>(2.0, 1.2, false, &ComputationBuilder::Le);
 }
 
-TEST_F(ScalarComputationsTest, CompareLtF32) {
+XLA_TEST_F(ScalarComputationsTest, CompareLtF32) {
   TestCompare<float>(9.0, 7.2, false, &ComputationBuilder::Lt);
 }
 
 // F32 comparisons with exceptional values.  The test names encode the
 // left/right operands at the end, and use Minf and Mzero for -inf and -0.0.
-TEST_F(ScalarComputationsTest, CompareLtF32MinfMzero) {
+XLA_TEST_F(ScalarComputationsTest, CompareLtF32MinfMzero) {
   TestCompare<float>(-INFINITY, -0.0, true, &ComputationBuilder::Lt);
 }
-TEST_F(ScalarComputationsTest, CompareLtF32MzeroZero) {
+XLA_TEST_F(ScalarComputationsTest, CompareLtF32MzeroZero) {
   // Comparisons of 0.0 to -0.0 consider them equal in IEEE 754.
   TestCompare<float>(-0.0, 0.0, false, &ComputationBuilder::Lt);
 }
-TEST_F(ScalarComputationsTest, CompareLtF32ZeroInf) {
+XLA_TEST_F(ScalarComputationsTest, CompareLtF32ZeroInf) {
   TestCompare<float>(0.0, INFINITY, true, &ComputationBuilder::Lt);
 }
 
-TEST_F(ScalarComputationsTest, CompareGeF32MinfMzero) {
+XLA_TEST_F(ScalarComputationsTest, CompareGeF32MinfMzero) {
   TestCompare<float>(-INFINITY, -0.0, false, &ComputationBuilder::Ge);
 }
-TEST_F(ScalarComputationsTest, CompareGeF32MzeroZero) {
+XLA_TEST_F(ScalarComputationsTest, CompareGeF32MzeroZero) {
   // Comparisons of 0.0 to -0.0 consider them equal in IEEE 754.
   TestCompare<float>(-0.0, 0.0, true, &ComputationBuilder::Ge);
 }
-TEST_F(ScalarComputationsTest, CompareGeF32ZeroInf) {
+XLA_TEST_F(ScalarComputationsTest, CompareGeF32ZeroInf) {
   TestCompare<float>(0.0, INFINITY, false, &ComputationBuilder::Ge);
 }
 
-TEST_F(ScalarComputationsTest, ExpScalar) {
+XLA_TEST_F(ScalarComputationsTest, ExpScalar) {
   ComputationBuilder builder(client_, TestName());
   builder.Exp(builder.ConstantR0<float>(2.0f));
 
   ComputeAndCompareR0<float>(&builder, 7.3890562, {}, error_spec_);
 }
 
-TEST_F(ScalarComputationsTest, LogScalar) {
+XLA_TEST_F(ScalarComputationsTest, LogScalar) {
   ComputationBuilder builder(client_, "log");
   builder.Log(builder.ConstantR0<float>(2.0f));
 
   ComputeAndCompareR0<float>(&builder, 0.6931471, {}, error_spec_);
 }
 
-TEST_F(ScalarComputationsTest, TanhScalar) {
+XLA_TEST_F(ScalarComputationsTest, TanhScalar) {
   ComputationBuilder builder(client_, TestName());
   builder.Tanh(builder.ConstantR0<float>(2.0f));
 
@@ -650,14 +650,14 @@ XLA_TEST_F(ScalarComputationsTest, TanhDoubleScalar) {
   ComputeAndCompareR0<double>(&builder, 0.96402758, {}, error_spec_);
 }
 
-TEST_F(ScalarComputationsTest, PowScalar) {
+XLA_TEST_F(ScalarComputationsTest, PowScalar) {
   ComputationBuilder builder(client_, TestName());
   builder.Pow(builder.ConstantR0<float>(2.0f), builder.ConstantR0<float>(3.0f));
 
   ComputeAndCompareR0<float>(&builder, 8.0, {}, error_spec_);
 }
 
-TEST_F(ScalarComputationsTest, ClampScalarHigh) {
+XLA_TEST_F(ScalarComputationsTest, ClampScalarHigh) {
   ComputationBuilder builder(client_, TestName());
   builder.Clamp(builder.ConstantR0<float>(2.0f),   // The lower bound.
                 builder.ConstantR0<float>(5.0f),   // The operand to be clamped.
@@ -666,7 +666,7 @@ TEST_F(ScalarComputationsTest, ClampScalarHigh) {
   ComputeAndCompareR0<float>(&builder, 3.0, {}, error_spec_);
 }
 
-TEST_F(ScalarComputationsTest, ClampScalarMiddle) {
+XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddle) {
   ComputationBuilder builder(client_, TestName());
   builder.Clamp(builder.ConstantR0<float>(2.0f),   // The lower bound.
                 builder.ConstantR0<float>(2.5f),   // The operand to be clamped.
@@ -675,7 +675,7 @@ TEST_F(ScalarComputationsTest, ClampScalarMiddle) {
   ComputeAndCompareR0<float>(&builder, 2.5, {}, error_spec_);
 }
 
-TEST_F(ScalarComputationsTest, ClampScalarLow) {
+XLA_TEST_F(ScalarComputationsTest, ClampScalarLow) {
   ComputationBuilder builder(client_, TestName());
   builder.Clamp(builder.ConstantR0<float>(2.0f),   // The lower bound.
                 builder.ConstantR0<float>(-5.0f),  // The operand to be clamped.
@@ -684,57 +684,57 @@ TEST_F(ScalarComputationsTest, ClampScalarLow) {
   ComputeAndCompareR0<float>(&builder, 2.0, {}, error_spec_);
 }
 
-TEST_F(ScalarComputationsTest, MinS32Above) {
+XLA_TEST_F(ScalarComputationsTest, MinS32Above) {
   TestMinMax<int32>(10, 3, 3, &ComputationBuilder::Min);
 }
 
-TEST_F(ScalarComputationsTest, MinS32Below) {
+XLA_TEST_F(ScalarComputationsTest, MinS32Below) {
   TestMinMax<int32>(-100, 3, -100, &ComputationBuilder::Min);
 }
 
-TEST_F(ScalarComputationsTest, MaxS32Above) {
+XLA_TEST_F(ScalarComputationsTest, MaxS32Above) {
   TestMinMax<int32>(10, 3, 10, &ComputationBuilder::Max);
 }
 
-TEST_F(ScalarComputationsTest, MaxS32Below) {
+XLA_TEST_F(ScalarComputationsTest, MaxS32Below) {
   TestMinMax<int32>(-100, 3, 3, &ComputationBuilder::Max);
 }
 
-TEST_F(ScalarComputationsTest, MinU32Above) {
+XLA_TEST_F(ScalarComputationsTest, MinU32Above) {
   const uint32 large = std::numeric_limits<int32>::max();
   TestMinMax<uint32>(large, 3, 3, &ComputationBuilder::Min);
 }
 
-TEST_F(ScalarComputationsTest, MinU32Below) {
+XLA_TEST_F(ScalarComputationsTest, MinU32Below) {
   TestMinMax<uint32>(0, 5, 0, &ComputationBuilder::Min);
 }
 
-TEST_F(ScalarComputationsTest, MaxU32Above) {
+XLA_TEST_F(ScalarComputationsTest, MaxU32Above) {
   const uint32 large = std::numeric_limits<int32>::max();
   TestMinMax<uint32>(large, 3, large, &ComputationBuilder::Max);
 }
 
-TEST_F(ScalarComputationsTest, MaxU32Below) {
+XLA_TEST_F(ScalarComputationsTest, MaxU32Below) {
   TestMinMax<uint32>(0, 5, 5, &ComputationBuilder::Max);
 }
 
-TEST_F(ScalarComputationsTest, MinF32Above) {
+XLA_TEST_F(ScalarComputationsTest, MinF32Above) {
   TestMinMax<float>(10.1f, 3.1f, 3.1f, &ComputationBuilder::Min);
 }
 
-TEST_F(ScalarComputationsTest, MinF32Below) {
+XLA_TEST_F(ScalarComputationsTest, MinF32Below) {
   TestMinMax<float>(-100.1f, 3.1f, -100.1f, &ComputationBuilder::Min);
 }
 
-TEST_F(ScalarComputationsTest, MaxF32Above) {
+XLA_TEST_F(ScalarComputationsTest, MaxF32Above) {
   TestMinMax<float>(10.1f, 3.1f, 10.1f, &ComputationBuilder::Max);
 }
 
-TEST_F(ScalarComputationsTest, MaxF32Below) {
+XLA_TEST_F(ScalarComputationsTest, MaxF32Below) {
   TestMinMax<float>(-100.1f, 3.1f, 3.1f, &ComputationBuilder::Max);
 }
 
-TEST_F(ScalarComputationsTest, ComplicatedArithmeticExpressionF32) {
+XLA_TEST_F(ScalarComputationsTest, ComplicatedArithmeticExpressionF32) {
   // Compute the expression (1 * (3 - 1) * (7 + 0) - 4) / 20.
   ComputationBuilder b(client_, TestName());
   b.Div(
@@ -747,7 +747,7 @@ TEST_F(ScalarComputationsTest, ComplicatedArithmeticExpressionF32) {
   ComputeAndCompareR0<float>(&b, 0.5, {}, error_spec_);
 }
 
-TEST_F(ScalarComputationsTest, ComplicatedArithmeticExpressionS32) {
+XLA_TEST_F(ScalarComputationsTest, ComplicatedArithmeticExpressionS32) {
   // Compute the expression 1 * (3 - 1) * (7 + 0) - 4.
   ComputationBuilder b(client_, TestName());
   b.Sub(b.Mul(b.ConstantR0<int32>(1),
@@ -758,7 +758,7 @@ TEST_F(ScalarComputationsTest, ComplicatedArithmeticExpressionS32) {
   ComputeAndCompareR0<int32>(&b, 10, {});
 }
 
-TEST_F(ScalarComputationsTest, SqrtF320) {
+XLA_TEST_F(ScalarComputationsTest, SqrtF320) {
   ComputationBuilder builder(client_, TestName());
   Literal zero_literal = Literal::Zero(PrimitiveType::F32);
 
diff --git a/tensorflow/compiler/xla/tests/unary_op_test.cc b/tensorflow/compiler/xla/tests/unary_op_test.cc
index 35a9fcb055e..efae13a43a0 100644
--- a/tensorflow/compiler/xla/tests/unary_op_test.cc
+++ b/tensorflow/compiler/xla/tests/unary_op_test.cc
@@ -85,12 +85,12 @@ XLA_TEST_F(UnaryOpTest, AbsTestR1Size0) {
   AbsSize0TestHelper<float>();
 }
 
-TEST_F(UnaryOpTest, AbsTestR1) {
+XLA_TEST_F(UnaryOpTest, AbsTestR1) {
   AbsTestHelper<int>();
   AbsTestHelper<float>();
 }
 
-TEST_F(UnaryOpTest, AbsTestR0) {
+XLA_TEST_F(UnaryOpTest, AbsTestR0) {
   ComputationBuilder builder(client_, TestName());
   auto argi = builder.ConstantR0<int>(-5);
   auto absi = builder.Abs(argi);
@@ -104,7 +104,7 @@ TEST_F(UnaryOpTest, AbsTestR0) {
   ComputeAndCompareR0<float>(&builder, 8.0f, {});
 }
 
-TEST_F(UnaryOpTest, SignTestR0) {
+XLA_TEST_F(UnaryOpTest, SignTestR0) {
   ComputationBuilder builder(client_, TestName());
   auto argi = builder.ConstantR0<int>(-5);
   auto absi = builder.Sign(argi);
@@ -118,17 +118,17 @@ TEST_F(UnaryOpTest, SignTestR0) {
   ComputeAndCompareR0<float>(&builder, -2.0f, {});
 }
 
-TEST_F(UnaryOpTest, SignTestR1) {
+XLA_TEST_F(UnaryOpTest, SignTestR1) {
   SignTestHelper<int>();
   SignTestHelper<float>();
 }
 
-TEST_F(UnaryOpTest, SignAbsTestR1) {
+XLA_TEST_F(UnaryOpTest, SignAbsTestR1) {
   SignAbsTestHelper<int>();
   SignAbsTestHelper<float>();
 }
 
-TEST_F(UnaryOpTest, UnsignedAbsTestR1) {
+XLA_TEST_F(UnaryOpTest, UnsignedAbsTestR1) {
   ComputationBuilder builder(client_, TestName());
   auto arg = builder.ConstantR1<unsigned int>(
       {2, 25, 0, 123, std::numeric_limits<unsigned int>::max()});
@@ -138,7 +138,7 @@ TEST_F(UnaryOpTest, UnsignedAbsTestR1) {
       &builder, {2, 25, 0, 123, std::numeric_limits<unsigned int>::max()}, {});
 }
 
-TEST_F(UnaryOpTest, UnsignedSignTestR1) {
+XLA_TEST_F(UnaryOpTest, UnsignedSignTestR1) {
   ComputationBuilder builder(client_, TestName());
   auto arg = builder.ConstantR1<unsigned int>(
       {2, 25, 0, 123, std::numeric_limits<unsigned int>::max()});
@@ -147,7 +147,7 @@ TEST_F(UnaryOpTest, UnsignedSignTestR1) {
   ComputeAndCompareR1<unsigned int>(&builder, {1, 1, 0, 1, 1}, {});
 }
 
-TEST_F(UnaryOpTest, SignAbsTestR2) {
+XLA_TEST_F(UnaryOpTest, SignAbsTestR2) {
   ComputationBuilder builder(client_, TestName());
   auto arg = builder.ConstantR2<float>({{1.0, -2.0}, {-3.0, 4.0}});
   auto sign = builder.Sign(arg);
diff --git a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
index fbb9c259da8..48a85f16a22 100644
--- a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
@@ -48,7 +48,7 @@ class VecOpsSimpleTest : public ClientLibraryTestBase {
   ErrorSpec error_spec_{0.0001};
 };
 
-TEST_F(VecOpsSimpleTest, ExpTenValues) {
+XLA_TEST_F(VecOpsSimpleTest, ExpTenValues) {
   ComputationBuilder builder(client_, TestName());
   auto x = builder.ConstantR1<float>(
       {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6});
@@ -61,7 +61,7 @@ TEST_F(VecOpsSimpleTest, ExpTenValues) {
   ComputeAndCompareR1<float>(&builder, expected, {}, error_spec_);
 }
 
-TEST_F(VecOpsSimpleTest, ExpManyValues) {
+XLA_TEST_F(VecOpsSimpleTest, ExpManyValues) {
   for (int count : {63, 64, 65, 127, 128, 129, 17 * 4096}) {
     ComputationBuilder builder(client_, TestName());
     std::vector<float> exponents;
@@ -83,7 +83,7 @@ TEST_F(VecOpsSimpleTest, ExpManyValues) {
   }
 }
 
-TEST_F(VecOpsSimpleTest, ExpIn4D) {
+XLA_TEST_F(VecOpsSimpleTest, ExpIn4D) {
   ComputationBuilder builder(client_, TestName());
   Array4D<float> exponents(2, 2, 2, 2);
 
@@ -105,7 +105,7 @@ TEST_F(VecOpsSimpleTest, ExpIn4D) {
                              ErrorSpec(/*aabs=*/1e-2, /*arel=*/1e-3));
 }
 
-TEST_F(VecOpsSimpleTest, NegateTenFloatValues) {
+XLA_TEST_F(VecOpsSimpleTest, NegateTenFloatValues) {
   ComputationBuilder builder(client_, TestName());
   auto x = builder.ConstantR1<float>(
       {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6});
@@ -116,7 +116,7 @@ TEST_F(VecOpsSimpleTest, NegateTenFloatValues) {
   ComputeAndCompareR1<float>(&builder, expected, {}, error_spec_);
 }
 
-TEST_F(VecOpsSimpleTest, NegateTenInt32Values) {
+XLA_TEST_F(VecOpsSimpleTest, NegateTenInt32Values) {
   ComputationBuilder builder(client_, TestName());
   auto x = builder.ConstantR1<int32>({2, -2, 12, -4, 5, 20, -15, 0, -2, 1});
   builder.Neg(x);
@@ -125,7 +125,7 @@ TEST_F(VecOpsSimpleTest, NegateTenInt32Values) {
   ComputeAndCompareR1<int32>(&builder, expected, {});
 }
 
-TEST_F(VecOpsSimpleTest, NegateUint32Values) {
+XLA_TEST_F(VecOpsSimpleTest, NegateUint32Values) {
   ComputationBuilder builder(client_, TestName());
   auto x = builder.ConstantR1<uint32>(
       {0, 1, 42, static_cast<uint32>(-1), static_cast<uint32>(-12)});
@@ -135,7 +135,7 @@ TEST_F(VecOpsSimpleTest, NegateUint32Values) {
   ComputeAndCompareR1<uint32>(&builder, expected, {});
 }
 
-TEST_F(VecOpsSimpleTest, SquareTenValues) {
+XLA_TEST_F(VecOpsSimpleTest, SquareTenValues) {
   ComputationBuilder builder(client_, TestName());
   auto x = builder.ConstantR1<float>(
       {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6});
@@ -146,7 +146,7 @@ TEST_F(VecOpsSimpleTest, SquareTenValues) {
   ComputeAndCompareR1<float>(&builder, expected, {}, error_spec_);
 }
 
-TEST_F(VecOpsSimpleTest, ReciprocalTenValues) {
+XLA_TEST_F(VecOpsSimpleTest, ReciprocalTenValues) {
   ComputationBuilder builder(client_, TestName());
   auto x = builder.ConstantR1<float>(
       {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6});
@@ -187,7 +187,7 @@ XLA_TEST_F(VecOpsSimpleTest, InvSqrtSevenValues) {
   ComputeAndCompareR1<float>(&builder, expected, {}, error_spec_);
 }
 
-TEST_F(VecOpsSimpleTest, AddTenValuesViaMap) {
+XLA_TEST_F(VecOpsSimpleTest, AddTenValuesViaMap) {
   ComputationBuilder builder(client_, TestName());
   auto add = CreateScalarAddComputation(F32, &builder);
 
@@ -202,7 +202,7 @@ TEST_F(VecOpsSimpleTest, AddTenValuesViaMap) {
   ComputeAndCompareR1<float>(&builder, expected, {}, error_spec_);
 }
 
-TEST_F(VecOpsSimpleTest, MaxTenValues) {
+XLA_TEST_F(VecOpsSimpleTest, MaxTenValues) {
   ComputationBuilder builder(client_, TestName());
   auto x = builder.ConstantR1<float>(
       {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6});
@@ -215,7 +215,7 @@ TEST_F(VecOpsSimpleTest, MaxTenValues) {
   ComputeAndCompareR1<float>(&builder, expected, {});
 }
 
-TEST_F(VecOpsSimpleTest, MaxTenValuesFromParams) {
+XLA_TEST_F(VecOpsSimpleTest, MaxTenValuesFromParams) {
   // Similar to MaxTenValues, except that the inputs come from params rather
   // than constants.
   ComputationBuilder builder(client_, TestName());
@@ -233,7 +233,7 @@ TEST_F(VecOpsSimpleTest, MaxTenValuesFromParams) {
                              error_spec_);
 }
 
-TEST_F(VecOpsSimpleTest, Max15000ValuesFromParams) {
+XLA_TEST_F(VecOpsSimpleTest, Max15000ValuesFromParams) {
   // Similar to MaxTenValuesFromParams, except that the data size passed in and
   // out is large.
   ComputationBuilder builder(client_, TestName());
@@ -273,7 +273,7 @@ TEST_F(VecOpsSimpleTest, Max15000ValuesFromParams) {
                              error_spec_);
 }
 
-TEST_F(VecOpsSimpleTest, MaxTenValuesWithScalar) {
+XLA_TEST_F(VecOpsSimpleTest, MaxTenValuesWithScalar) {
   ComputationBuilder builder(client_, TestName());
   auto x = builder.ConstantR1<float>(
       {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6});
@@ -285,7 +285,7 @@ TEST_F(VecOpsSimpleTest, MaxTenValuesWithScalar) {
   ComputeAndCompareR1<float>(&builder, expected, {});
 }
 
-TEST_F(VecOpsSimpleTest, MinTenValues) {
+XLA_TEST_F(VecOpsSimpleTest, MinTenValues) {
   ComputationBuilder builder(client_, TestName());
   auto x = builder.ConstantR1<float>(
       {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6});
@@ -298,7 +298,7 @@ TEST_F(VecOpsSimpleTest, MinTenValues) {
   ComputeAndCompareR1<float>(&builder, expected, {});
 }
 
-TEST_F(VecOpsSimpleTest, MinMaxTenValues) {
+XLA_TEST_F(VecOpsSimpleTest, MinMaxTenValues) {
   ComputationBuilder builder(client_, TestName());
   auto zero = builder.ConstantR0<float>(0);
   auto one = builder.ConstantR0<float>(1);
@@ -311,7 +311,7 @@ TEST_F(VecOpsSimpleTest, MinMaxTenValues) {
   ComputeAndCompareR1<float>(&builder, expected, {});
 }
 
-TEST_F(VecOpsSimpleTest, ClampTenValuesConstant) {
+XLA_TEST_F(VecOpsSimpleTest, ClampTenValuesConstant) {
   ComputationBuilder builder(client_, TestName());
   auto zero = builder.ConstantR0<float>(0);
   auto one = builder.ConstantR0<float>(1);
@@ -324,7 +324,7 @@ TEST_F(VecOpsSimpleTest, ClampTenValuesConstant) {
   ComputeAndCompareR1<float>(&builder, expected, {});
 }
 
-TEST_F(VecOpsSimpleTest, ClampTwoValuesConstant) {
+XLA_TEST_F(VecOpsSimpleTest, ClampTwoValuesConstant) {
   ComputationBuilder builder(client_, TestName());
   auto zero = builder.ConstantR1<float>({0.0f, 0.0f});
   auto one = builder.ConstantR1<float>({1.0f, 1.0f});
@@ -335,7 +335,7 @@ TEST_F(VecOpsSimpleTest, ClampTwoValuesConstant) {
   ComputeAndCompareR1<float>(&builder, expected, {});
 }
 
-TEST_F(VecOpsSimpleTest, ClampTenValuesConstantNonzeroLower) {
+XLA_TEST_F(VecOpsSimpleTest, ClampTenValuesConstantNonzeroLower) {
   ComputationBuilder builder(client_, TestName());
   auto one = builder.ConstantR0<float>(1);
   auto two = builder.ConstantR0<float>(2);
@@ -348,7 +348,7 @@ TEST_F(VecOpsSimpleTest, ClampTenValuesConstantNonzeroLower) {
   ComputeAndCompareR1<float>(&builder, expected, {});
 }
 
-TEST_F(VecOpsSimpleTest, MapTenValues) {
+XLA_TEST_F(VecOpsSimpleTest, MapTenValues) {
   Computation add_half;
   {
     // add_half(x) = x + 0.5
diff --git a/tensorflow/contrib/batching/python/ops/batch_ops.py b/tensorflow/contrib/batching/python/ops/batch_ops.py
index bec4b98cc23..cee4d7b4a97 100644
--- a/tensorflow/contrib/batching/python/ops/batch_ops.py
+++ b/tensorflow/contrib/batching/python/ops/batch_ops.py
@@ -67,7 +67,7 @@ def batch_function(num_batch_threads, max_batch_size, batch_timeout_micros,
 
   So, for example, in the following code
 
-  ```
+  ```python
   @batch_function(1, 2, 3)
   def layer(a):
     return tf.matmul(a, a)
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index 83c82c75bea..3cbb430f0b1 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -29,6 +29,7 @@ option(tensorflow_BUILD_ALL_KERNELS "Build all OpKernels" ON)
 option(tensorflow_BUILD_CONTRIB_KERNELS "Build OpKernels from tensorflow/contrib/..." ON)
 option(tensorflow_BUILD_CC_TESTS "Build cc unit tests " OFF)
 option(tensorflow_BUILD_PYTHON_TESTS "Build python unit tests " OFF)
+option(tensorflow_BUILD_MORE_PYTHON_TESTS "Build more python unit tests for contrib packages" OFF)
 option(tensorflow_BUILD_SHARED_LIB "Build TensorFlow as a shared library" OFF)
 option(tensorflow_OPTIMIZE_FOR_NATIVE_ARCH "Enable compiler optimizations for the native processor architecture (if available)" ON)
 option(tensorflow_WIN_CPU_SIMD_OPTIONS "Enables CPU SIMD instructions")
diff --git a/tensorflow/contrib/cmake/README.md b/tensorflow/contrib/cmake/README.md
index 8ad85275591..4ddfec5960d 100644
--- a/tensorflow/contrib/cmake/README.md
+++ b/tensorflow/contrib/cmake/README.md
@@ -241,6 +241,13 @@ Step-by-step Windows build
      ```
      ctest -C RelWithDebInfo
      ```
+   * `-Dtensorflow_BUILD_MORE_PYTHON_TESTS=(ON|OFF)`. Defaults to `OFF`. This enables python tests on
+     several major packages. This option takes effect only when both it and `tensorflow_BUILD_PYTHON_TESTS` are set to `ON`.
+     After building the python wheel, you need to install the new wheel before running the tests.
+     To execute the tests, use
+     ```
+     ctest -C RelWithDebInfo
+     ```
 
 4. Invoke MSBuild to build TensorFlow.
 
diff --git a/tensorflow/contrib/cmake/tf_core_kernels.cmake b/tensorflow/contrib/cmake/tf_core_kernels.cmake
index 4e5741d81dc..335551d3998 100644
--- a/tensorflow/contrib/cmake/tf_core_kernels.cmake
+++ b/tensorflow/contrib/cmake/tf_core_kernels.cmake
@@ -76,7 +76,9 @@ if(tensorflow_BUILD_CONTRIB_KERNELS)
       #"${tensorflow_source_dir}/tensorflow/contrib/ffmpeg/encode_audio_op.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/framework/kernels/generate_vocab_remapping_op.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/framework/kernels/load_and_remap_matrix_op.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/framework/kernels/zero_initializer_op.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/framework/ops/checkpoint_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/framework/ops/variable_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/layers/ops/sparse_feature_cross_op.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/nccl/kernels/nccl_manager.cc"
diff --git a/tensorflow/contrib/cmake/tf_tests.cmake b/tensorflow/contrib/cmake/tf_tests.cmake
index a4ee010fceb..6e8b48089f6 100644
--- a/tensorflow/contrib/cmake/tf_tests.cmake
+++ b/tensorflow/contrib/cmake/tf_tests.cmake
@@ -156,6 +156,21 @@ if (tensorflow_BUILD_PYTHON_TESTS)
     "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/python/*_test.py"
   )
 
+  if (tensorflow_BUILD_MORE_PYTHON_TESTS)
+    # Adding other major packages
+    file(GLOB_RECURSE tf_test_src_py
+      ${tf_test_src_py}
+      "${tensorflow_source_dir}/tensorflow/contrib/legacy_seq2seq/*_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/linalg/*_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/graph_editor/*_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/bayesflow/*_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/framework/*_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/keras/*_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/distributions/*_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/learn/*_test.py"
+    )
+  endif()
+
   # exclude the ones we don't want
   set(tf_test_src_py_exclude
     # Python source line inspection tests are flaky on Windows (b/36375074).
@@ -183,6 +198,9 @@ if (tensorflow_BUILD_PYTHON_TESTS)
     # Loading resources in contrib doesn't seem to work on Windows
     "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/client/random_forest_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py"
+    # dask needs a fix
+    "${tensorflow_source_dir}/tensorflow/contrib/learn/python/learn/learn_io/generator_io_test.py"
+    "${tensorflow_source_dir}/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py"
     # Test is flaky on Windows GPU builds (b/38283730).
     "${tensorflow_source_dir}/tensorflow/contrib/factorization/python/ops/gmm_test.py"
   )
@@ -215,11 +233,8 @@ if (tensorflow_BUILD_PYTHON_TESTS)
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/py_func_test.py"
       # training tests
       "${tensorflow_source_dir}/tensorflow/python/training/basic_session_run_hooks_test.py"  # Needs tf.contrib fix.
-      "${tensorflow_source_dir}/tensorflow/python/training/evaluation_test.py"  # Needs tf.contrib fix.
       "${tensorflow_source_dir}/tensorflow/python/training/localhost_cluster_performance_test.py"  # Needs portpicker.
-      "${tensorflow_source_dir}/tensorflow/python/training/monitored_session_test.py"  # Needs tf.contrib fix.
       "${tensorflow_source_dir}/tensorflow/python/training/quantize_training_test.py"  # Needs quantization ops to be included in windows.
-      "${tensorflow_source_dir}/tensorflow/python/training/saver_large_variable_test.py"  # Overflow error.
       "${tensorflow_source_dir}/tensorflow/python/training/supervisor_test.py"  # Flaky I/O error on rename.
       "${tensorflow_source_dir}/tensorflow/python/training/sync_replicas_optimizer_test.py"  # Needs portpicker.
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/array_ops_test.py"  # depends on python/framework/test_ops
@@ -233,6 +248,45 @@ if (tensorflow_BUILD_PYTHON_TESTS)
       "${tensorflow_source_dir}/tensorflow/python/ops/cloud/bigquery_reader_ops_test.py"  # No libcurl support
       # Newly running on Windows since TensorBoard backend move. Fail on Windows and need debug.
       "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py"  # Segfaults on Windows.
+      # Dask.Dataframe bugs on Windows build
+      "${tensorflow_source_dir}/tensorflow/contrib/learn/python/learn/tests/dataframe/tensorflow_dataframe_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/learn/python/learn/learn_io/data_feeder_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/learn/python/learn/learn_io/io_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/learn/python/learn/graph_actions_test.py"
+      # Need extra build
+      "${tensorflow_source_dir}/tensorflow/contrib/distributions/python/kernel_tests/conditional_distribution_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/distributions/python/kernel_tests/conditional_transformed_distribution_test.py"
+      # Windows Path
+      "${tensorflow_source_dir}/tensorflow/contrib/framework/python/ops/checkpoint_ops_test.py" #TODO: Fix path
+      "${tensorflow_source_dir}/tensorflow/contrib/keras/python/keras/models_test.py"
+      # Related to Windows Multiprocessing https://github.com/fchollet/keras/issues/5071
+      "${tensorflow_source_dir}/tensorflow/contrib/keras/python/keras/engine/training_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/keras/python/keras/utils/data_utils_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/keras/python/keras/callbacks_test.py"
+      # Scipy needed
+      "${tensorflow_source_dir}/tensorflow/contrib/keras/python/keras/preprocessing/image_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/distributions/python/kernel_tests/binomial_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/distributions/python/kernel_tests/chi2_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/distributions/python/kernel_tests/geometric_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/distributions/python/kernel_tests/inverse_gamma_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/distributions/python/kernel_tests/logistic_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/distributions/python/kernel_tests/mvn_tril_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/distributions/python/kernel_tests/negative_binomial_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/distributions/python/kernel_tests/poisson_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/distributions/python/kernel_tests/quantized_distribution_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/distributions/python/kernel_tests/relaxed_bernoulli_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/distributions/python/kernel_tests/relaxed_onehot_categorical_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/distributions/python/kernel_tests/transformed_distribution_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/distributions/python/kernel_tests/vector_student_t_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/learn/python/learn/estimators/kmeans_test.py"
+      # Failing with TF 1.3 (TODO)
+      "${tensorflow_source_dir}/tensorflow/contrib/distributions/python/kernel_tests/estimator_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_test.py"
   )
   endif()
   list(REMOVE_ITEM tf_test_src_py ${tf_test_src_py_exclude})
diff --git a/tensorflow/contrib/crf/python/kernel_tests/crf_test.py b/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
index 448bcafffe6..9174c5eb989 100644
--- a/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
+++ b/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
@@ -23,6 +23,7 @@ import itertools
 import numpy as np
 
 from tensorflow.contrib.crf.python.ops import crf
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
@@ -199,6 +200,52 @@ class CrfTest(test.TestCase):
       self.assertEqual(actual_max_sequence,
                        expected_max_sequence[:sequence_lengths])
 
+  def testCrfDecode(self):
+    inputs = np.array(
+        [[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=np.float32)
+    transition_params = np.array(
+        [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32)
+    sequence_lengths = np.array(3, dtype=np.int32)
+    num_words = inputs.shape[0]
+    num_tags = inputs.shape[1]
+
+    with self.test_session() as sess:
+      all_sequence_scores = []
+      all_sequences = []
+
+      # Compare the dynamic program with brute force computation.
+      for tag_indices in itertools.product(
+          range(num_tags), repeat=sequence_lengths):
+        tag_indices = list(tag_indices)
+        tag_indices.extend([0] * (num_words - sequence_lengths))
+        all_sequences.append(tag_indices)
+        sequence_score = crf.crf_sequence_score(
+            inputs=array_ops.expand_dims(inputs, 0),
+            tag_indices=array_ops.expand_dims(tag_indices, 0),
+            sequence_lengths=array_ops.expand_dims(sequence_lengths, 0),
+            transition_params=constant_op.constant(transition_params))
+        sequence_score = array_ops.squeeze(sequence_score, [0])
+        all_sequence_scores.append(sequence_score)
+
+      tf_all_sequence_scores = sess.run(all_sequence_scores)
+
+      expected_max_sequence_index = np.argmax(tf_all_sequence_scores)
+      expected_max_sequence = all_sequences[expected_max_sequence_index]
+      expected_max_score = tf_all_sequence_scores[expected_max_sequence_index]
+
+      actual_max_sequence, actual_max_score = crf.crf_decode(
+          array_ops.expand_dims(inputs, 0),
+          constant_op.constant(transition_params),
+          array_ops.expand_dims(sequence_lengths, 0))
+      actual_max_sequence = array_ops.squeeze(actual_max_sequence, [0])
+      actual_max_score = array_ops.squeeze(actual_max_score, [0])
+      tf_actual_max_sequence, tf_actual_max_score = sess.run(
+          [actual_max_sequence, actual_max_score])
+
+      self.assertAllClose(tf_actual_max_score, expected_max_score)
+      self.assertEqual(list(tf_actual_max_sequence[:sequence_lengths]),
+                       expected_max_sequence[:sequence_lengths])
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/crf/python/ops/crf.py b/tensorflow/contrib/crf/python/ops/crf.py
index a19c70717a5..7166e38b283 100644
--- a/tensorflow/contrib/crf/python/ops/crf.py
+++ b/tensorflow/contrib/crf/python/ops/crf.py
@@ -16,13 +16,24 @@
 
 The following snippet is an example of a CRF layer on top of a batched sequence
 of unary scores (logits for every word). This example also decodes the most
-likely sequence at test time:
+likely sequence at test time. There are two ways to do decoding. One
+is using crf_decode to do decoding in TensorFlow, and the other one is using
+viterbi_decode in Numpy.
 
 log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
     unary_scores, gold_tags, sequence_lengths)
+
 loss = tf.reduce_mean(-log_likelihood)
 train_op = tf.train.GradientDescentOptimizer(0.01).minimize(loss)
 
+# Decoding in Tensorflow.
+viterbi_sequence, viterbi_score = tf.contrib.crf.crf_decode(
+    unary_scores, transition_params, sequence_lengths)
+
+tf_viterbi_sequence, tf_viterbi_score, _ = session.run(
+    [viterbi_sequence, viterbi_score, train_op])
+
+# Decoding in Numpy.
 tf_unary_scores, tf_sequence_lengths, tf_transition_params, _ = session.run(
     [unary_scores, sequence_lengths, transition_params, train_op])
 for tf_unary_scores_, tf_sequence_length_ in zip(tf_unary_scores,
@@ -31,7 +42,7 @@ for tf_unary_scores_, tf_sequence_length_ in zip(tf_unary_scores,
 tf_unary_scores_ = tf_unary_scores_[:tf_sequence_length_]
 
 # Compute the highest score and its tag sequence.
-viterbi_sequence, viterbi_score = tf.contrib.crf.viterbi_decode(
+tf_viterbi_sequence, tf_viterbi_score = tf.contrib.crf.viterbi_decode(
     tf_unary_scores_, tf_transition_params)
 """
 
@@ -43,6 +54,7 @@ import numpy as np
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import rnn
 from tensorflow.python.ops import rnn_cell
@@ -50,7 +62,9 @@ from tensorflow.python.ops import variable_scope as vs
 
 __all__ = [
     "crf_sequence_score", "crf_log_norm", "crf_log_likelihood",
-    "crf_unary_score", "crf_binary_score", "CrfForwardRnnCell", "viterbi_decode"
+    "crf_unary_score", "crf_binary_score", "CrfForwardRnnCell",
+    "viterbi_decode", "crf_decode", "CrfDecodeForwardRnnCell",
+    "CrfDecodeBackwardRnnCell"
 ]
 
 
@@ -310,3 +324,154 @@ def viterbi_decode(score, transition_params):
 
   viterbi_score = np.max(trellis[-1])
   return viterbi, viterbi_score
+
+
+class CrfDecodeForwardRnnCell(rnn_cell.RNNCell):
+  """Computes the forward decoding in a linear-chain CRF.
+  """
+
+  def __init__(self, transition_params):
+    """Initialize the CrfDecodeForwardRnnCell.
+
+    Args:
+      transition_params: A [num_tags, num_tags] matrix of binary
+        potentials. This matrix is expanded into a
+        [1, num_tags, num_tags] in preparation for the broadcast
+        summation occurring within the cell.
+    """
+    self._transition_params = array_ops.expand_dims(transition_params, 0)
+    self._num_tags = transition_params.get_shape()[0].value
+
+  @property
+  def state_size(self):
+    return self._num_tags
+
+  @property
+  def output_size(self):
+    return self._num_tags
+
+  def __call__(self, inputs, state, scope=None):
+    """Build the CrfDecodeForwardRnnCell.
+
+    Args:
+      inputs: A [batch_size, num_tags] matrix of unary potentials.
+      state: A [batch_size, num_tags] matrix containing the previous step's
+            score values.
+      scope: Unused variable scope of this cell.
+
+    Returns:
+      backpointers: [batch_size, num_tags], containing backpointers.
+      new_state: [batch_size, num_tags], containing new score values.
+    """
+    # For simplicity, in shape comments, denote:
+    # 'batch_size' by 'B', 'max_seq_len' by 'T', 'num_tags' by 'O' (output).
+    state = array_ops.expand_dims(state, 2)                         # [B, O, 1]
+
+    # This addition op broadcasts self._transition_params along the zeroth
+    # dimension and state along the second dimension.
+    # [B, O, 1] + [1, O, O] -> [B, O, O]
+    transition_scores = state + self._transition_params             # [B, O, O]
+    new_state = inputs + math_ops.reduce_max(transition_scores, [1])  # [B, O]
+    backpointers = math_ops.argmax(transition_scores, 1)
+    backpointers = math_ops.cast(backpointers, dtype=dtypes.int32)    # [B, O]
+    return backpointers, new_state
+
+
+class CrfDecodeBackwardRnnCell(rnn_cell.RNNCell):
+  """Computes backward decoding in a linear-chain CRF.
+  """
+
+  def __init__(self, num_tags):
+    """Initialize the CrfDecodeBackwardRnnCell.
+
+    Args:
+      num_tags: The number of tags.
+    """
+    self._num_tags = num_tags
+
+  @property
+  def state_size(self):
+    return 1
+
+  @property
+  def output_size(self):
+    return 1
+
+  def __call__(self, inputs, state, scope=None):
+    """Build the CrfDecodeBackwardRnnCell.
+
+    Args:
+      inputs: [batch_size, num_tags], backpointer of next step (in time order).
+      state: [batch_size, 1], next position's tag index.
+      scope: Unused variable scope of this cell.
+
+    Returns:
+      new_tags, new_tags: A pair of [batch_size, 1]
+        tensors containing the new tag indices.
+    """
+    state = array_ops.squeeze(state, axis=[1])                # [B]
+    batch_size = array_ops.shape(inputs)[0]
+    b_indices = math_ops.range(batch_size)                    # [B]
+    indices = array_ops.stack([b_indices, state], axis=1)     # [B, 2]
+    new_tags = array_ops.expand_dims(
+        gen_array_ops.gather_nd(inputs, indices),             # [B]
+        axis=-1)                                              # [B, 1]
+
+    return new_tags, new_tags
+
+
+def crf_decode(potentials, transition_params, sequence_length):
+  """Decode the highest scoring sequence of tags in TensorFlow.
+
+  This function operates on tensors; viterbi_decode is the NumPy counterpart.
+
+  Args:
+    potentials: A [batch_size, max_seq_len, num_tags] tensor, matrix of
+              unary potentials.
+    transition_params: A [num_tags, num_tags] tensor, matrix of
+              binary potentials.
+    sequence_length: A [batch_size] tensor, containing sequence lengths.
+
+  Returns:
+    decode_tags: A [batch_size, max_seq_len] tensor, with dtype tf.int32.
+                Contains the highest scoring tag indices.
+    best_score: A [batch_size] tensor, containing the score of decode_tags.
+  """
+  # For simplicity, in shape comments, denote:
+  # 'batch_size' by 'B', 'max_seq_len' by 'T', 'num_tags' by 'O' (output).
+  num_tags = potentials.get_shape()[2].value
+
+  # Computes forward decoding. Get last score and backpointers.
+  crf_fwd_cell = CrfDecodeForwardRnnCell(transition_params)
+  initial_state = array_ops.slice(potentials, [0, 0, 0], [-1, 1, -1])
+  initial_state = array_ops.squeeze(initial_state, axis=[1])      # [B, O]
+  inputs = array_ops.slice(potentials, [0, 1, 0], [-1, -1, -1])   # [B, T-1, O]
+  backpointers, last_score = rnn.dynamic_rnn(
+      crf_fwd_cell,
+      inputs=inputs,
+      sequence_length=sequence_length - 1,
+      initial_state=initial_state,
+      time_major=False,
+      dtype=dtypes.int32)             # [B, T - 1, O], [B, O]
+  backpointers = gen_array_ops.reverse_sequence(
+      backpointers, sequence_length - 1, seq_dim=1)               # [B, T-1, O]
+
+  # Computes backward decoding. Extract tag indices from backpointers.
+  crf_bwd_cell = CrfDecodeBackwardRnnCell(num_tags)
+  initial_state = math_ops.cast(math_ops.argmax(last_score, axis=1),
+                                dtype=dtypes.int32)               # [B]
+  initial_state = array_ops.expand_dims(initial_state, axis=-1)   # [B, 1]
+  decode_tags, _ = rnn.dynamic_rnn(
+      crf_bwd_cell,
+      inputs=backpointers,
+      sequence_length=sequence_length - 1,
+      initial_state=initial_state,
+      time_major=False,
+      dtype=dtypes.int32)           # [B, T - 1, 1]
+  decode_tags = array_ops.squeeze(decode_tags, axis=[2])           # [B, T - 1]
+  decode_tags = array_ops.concat([initial_state, decode_tags], axis=1)  # [B, T]
+  decode_tags = gen_array_ops.reverse_sequence(
+      decode_tags, sequence_length, seq_dim=1)                     # [B, T]
+
+  best_score = math_ops.reduce_max(last_score, axis=1)             # [B]
+  return decode_tags, best_score
diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py
index 6ca38c2e479..ff409ac7182 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py
@@ -93,7 +93,7 @@ class CudnnRNNBenchmark(test.Benchmark):
       batch_size = config["batch_size"]
       seq_length = config["seq_length"]
 
-      with ops.Graph().as_default(), ops.device("/gpu:0"):
+      with ops.Graph().as_default(), ops.device("/device:GPU:0"):
         model = cudnn_rnn_ops.CudnnLSTM(num_layers, num_units, num_units)
         params_size_t = model.params_size()
         input_data = variables.Variable(
@@ -125,7 +125,7 @@ class CudnnRNNBenchmark(test.Benchmark):
       batch_size = config["batch_size"]
       seq_length = config["seq_length"]
 
-      with ops.Graph().as_default(), ops.device("/gpu:0"):
+      with ops.Graph().as_default(), ops.device("/device:GPU:0"):
         inputs = seq_length * [
             array_ops.zeros([batch_size, num_units], dtypes.float32)
         ]
@@ -153,7 +153,7 @@ class CudnnRNNBenchmark(test.Benchmark):
       batch_size = config["batch_size"]
       seq_length = config["seq_length"]
 
-      with ops.Graph().as_default(), ops.device("/gpu:0"):
+      with ops.Graph().as_default(), ops.device("/device:GPU:0"):
         inputs = seq_length * [
             array_ops.zeros([batch_size, num_units], dtypes.float32)
         ]
diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py
index 2e70d2d5ec8..aebdcea10b5 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py
@@ -286,14 +286,14 @@ class CudnnRNNTestSaveRestore(TensorFlowTestCase):
       save_path = os.path.join(self.get_temp_dir(),
                                "save-restore-variable-test")
       saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)
-      # Passing graph explictly, otherwise an old sess would be reused.
+      # Passing graph explicitly, otherwise an old sess would be reused.
       with self.test_session(
           use_gpu=True, graph=ops.get_default_graph()) as sess:
         sess.run(variables.global_variables_initializer())
         params_v = sess.run(params)
         val = saver.save(sess, save_path)
         self.assertEqual(save_path, val)
-      # Passing graph explictly, otherwise an old sess would be reused.
+      # Passing graph explicitly, otherwise an old sess would be reused.
       with self.test_session(
           use_gpu=True, graph=ops.get_default_graph()) as sess:
         reset_params = state_ops.assign(
@@ -328,14 +328,14 @@ class CudnnRNNTestSaveRestore(TensorFlowTestCase):
       save_path = os.path.join(self.get_temp_dir(),
                                "save-restore-variable-test")
       saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)
-      # Passing graph explictly, otherwise an old sess would be reused.
+      # Passing graph explicitly, otherwise an old sess would be reused.
       with self.test_session(
           use_gpu=True, graph=ops.get_default_graph()) as sess:
         sess.run(variables.global_variables_initializer())
         params_v = sess.run(param_vars)
         val = saver.save(sess, save_path)
         self.assertEqual(save_path, val)
-      # Passing graph explictly, otherwise an old sess would be reused.
+      # Passing graph explicitly, otherwise an old sess would be reused.
       with self.test_session(
           use_gpu=True, graph=ops.get_default_graph()) as sess:
         reset_params = [
@@ -398,14 +398,14 @@ class CudnnRNNTestSaveRestore(TensorFlowTestCase):
             params=params,
             is_training=False)
       total_sum = sum(map(math_ops.reduce_sum, outputs))
-      # Passing graph explictly, otherwise an old sess would be reused.
+      # Passing graph explicitly, otherwise an old sess would be reused.
       with self.test_session(
           use_gpu=True, graph=ops.get_default_graph()) as sess:
         sess.run(variables.global_variables_initializer())
         total_sum_v = sess.run(total_sum)
         val = saver.save(sess, save_path)
         self.assertEqual(save_path, val)
-      # Passing graph explictly, otherwise an old sess would be reused.
+      # Passing graph explicitly, otherwise an old sess would be reused.
       with self.test_session(
           use_gpu=True, graph=ops.get_default_graph()) as sess:
         reset_params = state_ops.assign(
diff --git a/tensorflow/contrib/data/python/ops/dataset_ops.py b/tensorflow/contrib/data/python/ops/dataset_ops.py
index 949453bb736..6ef960037f0 100644
--- a/tensorflow/contrib/data/python/ops/dataset_ops.py
+++ b/tensorflow/contrib/data/python/ops/dataset_ops.py
@@ -258,11 +258,12 @@ class Iterator(object):
       # initializers that simply reset their state to the beginning.
       raise ValueError("Iterator does not have an initializer.")
 
-  def make_initializer(self, dataset):
+  def make_initializer(self, dataset, name=None):
     """Returns a `tf.Operation` that initializes this iterator on `dataset`.
 
     Args:
       dataset: A `Dataset` with compatible structure to this iterator.
+      name: (Optional.) A name for the created operation.
 
     Returns:
       A `tf.Operation` that can be run to initialize this iterator on the given
@@ -272,22 +273,25 @@ class Iterator(object):
       TypeError: If `dataset` and this iterator do not have a compatible
         element structure.
     """
-    nest.assert_same_structure(self._output_types, dataset.output_types)
-    nest.assert_same_structure(self._output_shapes, dataset.output_shapes)
-    for iterator_dtype, dataset_dtype in zip(
-        nest.flatten(self._output_types), nest.flatten(dataset.output_types)):
-      if iterator_dtype != dataset_dtype:
-        raise TypeError(
-            "Expected output types %r but got dataset with output types %r." %
-            (self._output_types, dataset.output_types))
-    for iterator_shape, dataset_shape in zip(
-        nest.flatten(self._output_shapes), nest.flatten(dataset.output_shapes)):
-      if not iterator_shape.is_compatible_with(dataset_shape):
-        raise TypeError("Expected output shapes compatible with %r but got "
-                        "dataset with output shapes %r." %
-                        (self._output_shapes, dataset.output_shapes))
-    return gen_dataset_ops.make_iterator(dataset.make_dataset_resource(),
-                                         self._iterator_resource)
+    with ops.name_scope(name, "make_initializer") as name:
+      nest.assert_same_structure(self._output_types, dataset.output_types)
+      nest.assert_same_structure(self._output_shapes, dataset.output_shapes)
+      for iterator_dtype, dataset_dtype in zip(
+          nest.flatten(self._output_types), nest.flatten(dataset.output_types)):
+        if iterator_dtype != dataset_dtype:
+          raise TypeError(
+              "Expected output types %r but got dataset with output types %r." %
+              (self._output_types, dataset.output_types))
+      for iterator_shape, dataset_shape in zip(
+          nest.flatten(self._output_shapes),
+          nest.flatten(dataset.output_shapes)):
+        if not iterator_shape.is_compatible_with(dataset_shape):
+          raise TypeError("Expected output shapes compatible with %r but got "
+                          "dataset with output shapes %r." %
+                          (self._output_shapes, dataset.output_shapes))
+      return gen_dataset_ops.make_iterator(dataset.make_dataset_resource(),
+                                           self._iterator_resource,
+                                           name=name)
 
   def get_next(self, name=None):
     """Returns a nested structure of `tf.Tensor`s containing the next element.
diff --git a/tensorflow/contrib/distributions/__init__.py b/tensorflow/contrib/distributions/__init__.py
index 3bbf1c2f5e2..dfded47b003 100644
--- a/tensorflow/contrib/distributions/__init__.py
+++ b/tensorflow/contrib/distributions/__init__.py
@@ -49,6 +49,7 @@ from tensorflow.contrib.distributions.python.ops.quantized_distribution import *
 from tensorflow.contrib.distributions.python.ops.relaxed_bernoulli import *
 from tensorflow.contrib.distributions.python.ops.relaxed_onehot_categorical import *
 from tensorflow.contrib.distributions.python.ops.sample_stats import *
+from tensorflow.contrib.distributions.python.ops.test_util import *
 from tensorflow.contrib.distributions.python.ops.vector_exponential_diag import *
 from tensorflow.contrib.distributions.python.ops.vector_laplace_diag import *
 from tensorflow.contrib.distributions.python.ops.wishart import *
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py
index aa523a95118..2705b96f271 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py
@@ -634,7 +634,7 @@ class MixtureBenchmark(test.Benchmark):
     np.random.seed(127)
     with session.Session(config=config, graph=ops.Graph()) as sess:
       random_seed.set_random_seed(0)
-      with ops.device("/gpu:0" if use_gpu else "/cpu:0"):
+      with ops.device("/device:GPU:0" if use_gpu else "/cpu:0"):
         mixture = create_distribution(
             num_components=num_components,
             batch_size=batch_size,
diff --git a/tensorflow/contrib/framework/python/framework/tensor_util.py b/tensorflow/contrib/framework/python/framework/tensor_util.py
index ec68d3b170e..8839da2947a 100644
--- a/tensorflow/contrib/framework/python/framework/tensor_util.py
+++ b/tensorflow/contrib/framework/python/framework/tensor_util.py
@@ -17,7 +17,9 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+
 import numpy as np
+
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
diff --git a/tensorflow/contrib/framework/python/framework/tensor_util_test.py b/tensorflow/contrib/framework/python/framework/tensor_util_test.py
index bc6bc952ee4..2effe8eb26e 100644
--- a/tensorflow/contrib/framework/python/framework/tensor_util_test.py
+++ b/tensorflow/contrib/framework/python/framework/tensor_util_test.py
@@ -20,7 +20,9 @@ from __future__ import division
 from __future__ import print_function
 
 import re
+
 import numpy as np
+
 from tensorflow.contrib.framework.python.framework import tensor_util
 from tensorflow.contrib.framework.python.ops import variables as variables_lib2
 from tensorflow.python.framework import constant_op
diff --git a/tensorflow/contrib/framework/python/ops/variables.py b/tensorflow/contrib/framework/python/ops/variables.py
index 411b4facdb1..1bd9a14a7f3 100644
--- a/tensorflow/contrib/framework/python/ops/variables.py
+++ b/tensorflow/contrib/framework/python/ops/variables.py
@@ -37,6 +37,7 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.platform import resource_loader
 from tensorflow.python.training import saver as tf_saver
 from tensorflow.python.training import training_util
+from tensorflow.python.util.deprecation import deprecated
 
 
 __all__ = ['add_model_variable',
@@ -82,7 +83,7 @@ def zero_initializer(ref, use_locking=True, name="zero_initializer"):
       resource_loader.get_path_to_datafile("_variable_ops.so"))
   return gen_variable_ops.zero_initializer(ref, name=name)
 
-
+@deprecated(None, "Please switch to tf.train.assert_global_step")
 def assert_global_step(global_step_tensor):
   training_util.assert_global_step(global_step_tensor)
 
@@ -110,11 +111,11 @@ def assert_or_get_global_step(graph=None, global_step_tensor=None):
     assert_global_step(global_step_tensor)
   return global_step_tensor
 
-
+@deprecated(None, "Please switch to tf.train.get_global_step")
 def get_global_step(graph=None):
   return training_util.get_global_step(graph)
 
-
+@deprecated(None, "Please switch to tf.train.create_global_step")
 def create_global_step(graph=None):
   """Create global step tensor in graph.
 
@@ -132,7 +133,7 @@ def create_global_step(graph=None):
   """
   return training_util.create_global_step(graph)
 
-
+@deprecated(None, "Please switch to tf.train.get_or_create_global_step")
 def get_or_create_global_step(graph=None):
   """Returns and create (if necessary) the global step tensor.
 
@@ -561,7 +562,7 @@ def assign_from_checkpoint(model_path, var_list, ignore_missing_vars=False):
       grouped_vars[ckpt_name].append(var)
 
   else:
-    for ckpt_name, value in var_list.iteritems():
+    for ckpt_name, value in var_list.items():
       if isinstance(value, (tuple, list)):
         grouped_vars[ckpt_name] = value
       else:
diff --git a/tensorflow/contrib/framework/python/ops/variables_test.py b/tensorflow/contrib/framework/python/ops/variables_test.py
index cb278707202..6a74e4e8666 100644
--- a/tensorflow/contrib/framework/python/ops/variables_test.py
+++ b/tensorflow/contrib/framework/python/ops/variables_test.py
@@ -443,19 +443,19 @@ class VariablesTest(test.TestCase):
         e = variables_lib2.variable('e', initializer=e_init)
       # The values below highlight how the VariableDeviceChooser puts initial
       # values on the same device as the variable job.
-      self.assertDeviceEqual(a.device, '/gpu:0')
+      self.assertDeviceEqual(a.device, '/device:GPU:0')
       self.assertEqual(a.initial_value.op.colocation_groups(),
                        a.op.colocation_groups())
-      self.assertDeviceEqual(b.device, '/gpu:0')
+      self.assertDeviceEqual(b.device, '/device:GPU:0')
       self.assertEqual(b.initial_value.op.colocation_groups(),
                        b.op.colocation_groups())
       self.assertDeviceEqual(c.device, '/cpu:12')
       self.assertEqual(c.initial_value.op.colocation_groups(),
                        c.op.colocation_groups())
-      self.assertDeviceEqual(d.device, '/gpu:0')
+      self.assertDeviceEqual(d.device, '/device:GPU:0')
       self.assertEqual(d.initial_value.op.colocation_groups(),
                        d.op.colocation_groups())
-      self.assertDeviceEqual(e.device, '/gpu:0')
+      self.assertDeviceEqual(e.device, '/device:GPU:0')
       self.assertDeviceEqual(e.initial_value.device, '/cpu:99')
 
 
diff --git a/tensorflow/contrib/gdr/BUILD b/tensorflow/contrib/gdr/BUILD
new file mode 100644
index 00000000000..645e364d191
--- /dev/null
+++ b/tensorflow/contrib/gdr/BUILD
@@ -0,0 +1,125 @@
+# Description:
+#   GPU Direct RDMA Out-of-Band Tensor transport for TensorFlow.
+
+package(default_visibility = [
+    "//tensorflow:__subpackages__",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+filegroup(
+    name = "c_srcs",
+    data = glob([
+        "**/*.cc",
+        "**/*.h",
+    ]),
+)
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cuda_library",
+)
+
+# For platform specific build config
+load(
+    "//tensorflow/core:platform/default/build_config.bzl",
+    "tf_proto_library_cc",
+)
+
+tf_proto_library_cc(
+    name = "gdr_proto",
+    srcs = ["gdr.proto"],
+    cc_api_version = 2,
+    visibility = [
+        "//tensorflow:__subpackages__",
+    ],
+)
+
+tf_cuda_library(
+    name = "gdr_memory_manager",
+    srcs = ["gdr_memory_manager.cc"],
+    hdrs = ["gdr_memory_manager.h"],
+    linkopts = select({
+        "//tensorflow:with_gdr_support": [
+            "-libverbs",
+            "-lrdmacm",
+        ],
+        "//conditions:default": [],
+    }),
+    deps = [
+        ":gdr_proto_cc",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:gpu_runtime",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_cuda_library(
+    name = "gdr_worker",
+    srcs = ["gdr_worker.cc"],
+    hdrs = ["gdr_worker.h"],
+    deps = [
+        ":gdr_memory_manager",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:gpu_runtime",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/distributed_runtime:graph_mgr",
+        "//tensorflow/core/distributed_runtime:rendezvous_mgr_interface",
+        "//tensorflow/core/distributed_runtime:worker",
+        "//tensorflow/core/distributed_runtime:worker_cache",
+        "//tensorflow/core/distributed_runtime:worker_env",
+        "//tensorflow/core/distributed_runtime:worker_session",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_call",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_tensor_coding",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_util",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_worker_service",
+    ],
+)
+
+cc_library(
+    name = "gdr_rendezvous_mgr",
+    srcs = ["gdr_rendezvous_mgr.cc"],
+    hdrs = ["gdr_rendezvous_mgr.h"],
+    deps = [
+        ":gdr_memory_manager",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/distributed_runtime:base_rendezvous_mgr",
+        "//tensorflow/core/distributed_runtime:worker_cache",
+        "//tensorflow/core/distributed_runtime:worker_env",
+        "//tensorflow/core/distributed_runtime:worker_interface",
+    ],
+)
+
+cc_library(
+    name = "gdr_server_lib",
+    srcs = ["gdr_server_lib.cc"],
+    hdrs = ["gdr_server_lib.h"],
+    linkstatic = 1,  # Seems to be needed since alwayslink is broken in bazel
+    deps = [
+        ":gdr_memory_manager",
+        ":gdr_rendezvous_mgr",
+        ":gdr_worker",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
+    ],
+    alwayslink = 1,
+)
diff --git a/tensorflow/contrib/gdr/README.md b/tensorflow/contrib/gdr/README.md
new file mode 100644
index 00000000000..34ce60b3608
--- /dev/null
+++ b/tensorflow/contrib/gdr/README.md
@@ -0,0 +1,122 @@
+Introduction
+===
+
+This is an implementation of GDR out-of-band transport for the TensorFlow distributed runtime, complementary to the current gRPC transport. It uses gRPC as the control plane to set up a rendezvous for each tensor transmission, and utilizes [GPU Direct RDMA](https://developer.nvidia.com/gpudirect) whenever possible to transmit tensors in remote GPU memory through the network interface card (NIC), bypassing host memory and CPU entirely. It gracefully falls back to ordinary RDMA or even gRPC when GDR is not available.
+
+Design
+===
+
+The GDR out-of-band transport is designed to avoid any unnecessary memory copies, especially for large tensors (>100MB). That typically requires registration of tensor buffers to NIC in an ad-hoc manner, which is rather slow as described in the design trade-off of the verbs runtime. The verbs runtime thus chooses to manage its own NIC-registered buffers and copy the tensors from/to those buffers for every single tensor transfer.
+
+We show, however, that such a design trade-off is not always relevant. In this patch, we manage both computation and communication buffers in a unified manner. By pre-registering large buffers with the NIC and allocating small tensors from the buffer pool using a BFC allocator, it is possible to avoid both ad-hoc buffer registration and memory copies altogether.
+
+For the actual tensor transport, we rely on gRPC to transmit the [remote buffer information](gdr.proto). This greatly simplifies our design, and there are only 2 types of RDMA messages: a single READ to retrieve the tensor data (bypassing the remote CPU), and an invalidate message using WRITE with IMM to release the tensor buffer on the remote side. The remote side only polls for the invalidate message and calls `Unref` on the tensor buffers that have been read by its peer.
+
+Environment
+===
+
+To fully utilize GDR, the target environment has to meet 3 conditions:
+
+1. There is an RDMA capable device with corresponding [OFED package](https://www.openfabrics.org/index.php/overview.html) installed (detailed information is available from your [Infiniband/RoCE](http://www.mellanox.com/page/products_dyn?product_family=116)/[iWarp](http://www.chelsio.com/gpudirect-rdma/) vendor), which could be verified through `ibv_devinfo`, e.g.
+
+```
+$ ibv_devinfo
+hca_id:	mlx4_0
+	transport:			InfiniBand (0)
+	fw_ver:				2.40.7000
+	node_guid:			248a:0703:00f6:3370
+	sys_image_guid:			248a:0703:00f6:3370
+	vendor_id:			0x02c9
+	vendor_part_id:			4099
+	hw_ver:				0x1
+	board_id:			MT_1090110023
+	phys_port_cnt:			2
+	Device ports:
+		port:	1
+			state:			PORT_ACTIVE (4)
+			max_mtu:		4096 (5)
+			active_mtu:		1024 (3)
+			sm_lid:			0
+			port_lid:		0
+			port_lmc:		0x00
+			link_layer:		Ethernet
+
+		port:	2
+			state:			PORT_ACTIVE (4)
+			max_mtu:		4096 (5)
+			active_mtu:		1024 (3)
+			sm_lid:			0
+			port_lid:		0
+			port_lmc:		0x00
+			link_layer:		Ethernet
+```
+
+2. There is a GDR capable GPU, i.e. of Fermi, Kepler or later architecture with [corresponding driver](http://docs.nvidia.com/cuda/gpudirect-rdma/index.html) installed. The PCI-e topology could be confirmed by `nvidia-smi topo -m`. For example, in the following topology, `GPU2` and `GPU3` are adjacent to `mlx4_0`, and tensors on these devices could benefit from GDR in current implementation.
+
+```
+$ nvidia-smi topo -m
+	GPU0	GPU1	GPU2	GPU3	mlx4_0	CPU Affinity
+GPU0	 X 	PHB	SOC	SOC	SOC	0-5
+GPU1	PHB	 X 	SOC	SOC	SOC	0-5
+GPU2	SOC	SOC	 X 	PHB	PHB	6-11
+GPU3	SOC	SOC	PHB	 X 	PHB	6-11
+mlx4_0	SOC	SOC	PHB	PHB	 X
+
+Legend:
+
+  X   = Self
+  SOC  = Connection traversing PCIe as well as the SMP link between CPU sockets(e.g. QPI)
+  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
+  PXB  = Connection traversing multiple PCIe switches (without traversing the PCIe Host Bridge)
+  PIX  = Connection traversing a single PCIe switch
+  NV#  = Connection traversing a bonded set of # NVLinks
+```
+
+3. The [`nv_peer_mem`](https://github.com/Mellanox/nv_peer_memory) kernel module is installed.
+
+How to build and run in GDR mode
+===
+
+To test it out on a GDR capable environment, choose to enable GDR in your configure script.
+
+```
+Do you wish to build TensorFlow with GDR support? [y/N]: y
+GDR support will be enabled for TensorFlow.
+```
+
+Change your `protocol` to `grpc+gdr` to enable GDR in your deployment.
+
+```
+server = tf.train.Server(cluster, job_name="local", task_index=0, protocol='grpc+gdr') # default protocol is 'grpc'
+```
+
+Currently the out-of-band transport service listens to the same IP and port address as specified in gRPC.
+
+A successful initialization looks like this:
+
+```
+2017-08-05 19:10:38.601718: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1045] Creating TensorFlow device (/gpu:0) -> (device: 0, name: Tesla K40m, pci bus id: 0000:02:00.0)
+2017-08-05 19:10:38.601728: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1045] Creating TensorFlow device (/gpu:1) -> (device: 1, name: Tesla K40m, pci bus id: 0000:03:00.0)
+2017-08-05 19:10:38.601736: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1045] Creating TensorFlow device (/gpu:2) -> (device: 2, name: Tesla K40m, pci bus id: 0000:82:00.0)
+2017-08-05 19:10:38.601742: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1045] Creating TensorFlow device (/gpu:3) -> (device: 3, name: Tesla K40m, pci bus id: 0000:83:00.0)
+2017-08-05 19:10:39.591026: I tensorflow/contrib/gdr/gdr_memory_manager.cc:235] RDMA server is listening on 10.40.2.200:5001
+2017-08-05 19:10:39.591071: I tensorflow/contrib/gdr/gdr_memory_manager.cc:285] Instrumenting CPU allocator cuda_host_bfc
+2017-08-05 19:10:39.591083: I tensorflow/contrib/gdr/gdr_memory_manager.cc:285] Instrumenting CPU allocator cpu_pool
+2017-08-05 19:10:39.591095: I tensorflow/contrib/gdr/gdr_memory_manager.cc:285] Instrumenting CPU allocator cpu_rdma_bfc
+2017-08-05 19:10:39.591278: I tensorflow/contrib/gdr/gdr_memory_manager.cc:78] NUMA node for device: mlx4_0 is 1
+2017-08-05 19:10:39.740253: I tensorflow/contrib/gdr/gdr_memory_manager.cc:296] Instrumenting GPU allocator with bus_id 2
+```
+
+The last line suggests that the GPUs with bus id 2 (mapped to pci bus ids prefixed with 0000:8) will benefit from GDR and host memory bypass, which are `/gpu:2` and `/gpu:3` in this case.
+
+Caveats
+===
+
+In the current implementation, only tensors that reside in host memory, or in GPU memory where the GPU is adjacent to an RDMA capable NIC, will use direct RDMA as their transport. When RDMA is available but not GDR, a temporary tensor copy in host memory will be used as the RDMA source/destination (and copied from/to the target device). When there is no RDMA device present, it can even fall back to the original gRPC runtime. While it is theoretically possible to mix GDR enabled TF with non-GDR deployments in the same job, make sure the environment is properly set up so that GDR mode is enabled whenever possible (i.e. do not fall back to gRPC when it is not absolutely necessary).
+
+In the original design (as in the reference), tensor buffers are only registered to the NIC when we can determine that the tensor will be either a source of Send or a sink of Recv across a physical machine boundary. However, to implement such precise allocations, we would need to change all the devices to possibly return a NIC compatible allocator. As GDR is currently in contrib, we would like to avoid unnecessary code disruption to the TF core, so we allocate all tensors from NIC-registered buffers using a BFC allocator. This behaviour is similar to the effect of enabling the extra GPU option `force_gpu_compatible`, which allocates all host tensors in GPU-registered buffers regardless of whether they will be transferred from/to GPUs or not.
+
+Reference
+===
+
+Bairen Yi, Jiacheng Xia, Li Chen, and Kai Chen. 2017. Towards Zero Copy Dataflows using RDMA. In Proceedings of SIGCOMM Posters and Demos'17, Los Angeles, CA, USA, August 22-24, 2017, 3 pages. https://doi.org/10.1145/3123878.3123907
diff --git a/tensorflow/contrib/gdr/gdr.proto b/tensorflow/contrib/gdr/gdr.proto
new file mode 100644
index 00000000000..c0b89245b15
--- /dev/null
+++ b/tensorflow/contrib/gdr/gdr.proto
@@ -0,0 +1,13 @@
+syntax = "proto3";
+
+package tensorflow;
+option cc_enable_arenas = true;
+
+// Describes a tensor buffer on the sending side that the receiver reads via
+// RDMA. Populated in GdrMemoryManager::TransportOptionsFromTensor and
+// consumed in GdrMemoryManager::TensorFromTransportOptions.
+message RemoteMemoryRegion {
+  // Host and port of the sender's memory manager; the receiver connects its
+  // RDMA endpoint to this address.
+  string host = 1;
+  string port = 2;
+  // Address of the tensor buffer's data in the sender's address space.
+  uint64 addr = 3;
+  // Remote key of the registered memory region that contains `addr`.
+  uint32 rkey = 4;
+  // Key under which the sender holds a reference to the buffer; the receiver
+  // sends it back as immediate data so the sender can release the buffer.
+  uint32 tensor_key = 5;
+  // Tensor checksum; only computed when verbose logging (VLOG level 2) is
+  // on, otherwise 0.
+  uint64 checksum = 6;
+}
diff --git a/tensorflow/contrib/gdr/gdr_memory_manager.cc b/tensorflow/contrib/gdr/gdr_memory_manager.cc
new file mode 100644
index 00000000000..c55989e3e5c
--- /dev/null
+++ b/tensorflow/contrib/gdr/gdr_memory_manager.cc
@@ -0,0 +1,682 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef TENSORFLOW_USE_GDR
+
+#include "tensorflow/contrib/gdr/gdr_memory_manager.h"
+
+#include <atomic>
+#include <cerrno>
+#include <fstream>
+#include <list>
+#include <map>
+#include <set>
+
+#include <fcntl.h>
+#include <rdma/rdma_cma.h>
+#include <rdma/rdma_verbs.h>
+#include <sys/epoll.h>
+
+#include "tensorflow/contrib/gdr/gdr.pb.h"
+#include "tensorflow/core/common_runtime/bfc_allocator.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#if GOOGLE_CUDA
+#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
+#include "tensorflow/core/common_runtime/gpu/process_state.h"
+#endif  // GOOGLE_CUDA
+#include "tensorflow/core/framework/allocator_registry.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+
+namespace {
+
+// Reports whether GPU Direct RDMA can be used on this host.
+//
+// Always false on macOS and Windows. Elsewhere, scans /proc/modules and
+// returns true iff one of the listed modules is named "nv_peer_mem". If the
+// file cannot be opened the loop never runs and the result is false.
+bool IsGDRAvailable() {
+#if defined(__APPLE__)
+  return false;
+#elif defined(PLATFORM_WINDOWS)
+  return false;
+#else
+  std::ifstream ifs("/proc/modules");
+  string line;
+  while (std::getline(ifs, line)) {
+    // The module name is the first space-delimited field. A line without a
+    // space is compared as a whole (substr clamps npos to the string length)
+    // instead of aborting the process: this is only a probe for an optional
+    // feature, so malformed input must not be fatal.
+    if (line.substr(0, line.find(' ')) == "nv_peer_mem") {
+      return true;
+    }
+  }
+  return false;
+#endif
+}
+
+// Best-effort lookup of the NUMA node that `device` is attached to.
+//
+// Returns 0 on macOS and Windows. Elsewhere, reads
+// <ibdev_path>/device/numa_node:
+//   - a non-negative value is returned as is;
+//   - a negative value is mapped to 0, since at least one node must exist;
+//   - if the file cannot be read or parsed, returns kUnknownNumaNode (-1).
+int TryToReadNumaNode(ibv_device* device) {
+#if defined(__APPLE__)
+  LOG(INFO) << "OS X does not support NUMA - returning NUMA node 0";
+  return 0;
+#elif defined(PLATFORM_WINDOWS)
+  // Windows support for NUMA is not currently implemented. Return node 0.
+  return 0;
+#else
+  VLOG(2) << "Trying to read NUMA node for device: " << device->name;
+  static const int kUnknownNumaNode = -1;
+
+  auto filename = string(device->ibdev_path) + "/device/numa_node";
+
+  std::ifstream ifs(filename.c_str());
+  string content;
+  if (!std::getline(ifs, content)) {
+    // A missing or unreadable sysfs entry is reported the same way as an
+    // unparsable one rather than aborting the process.
+    LOG(WARNING) << "Cannot read NUMA node for device: " << device->name
+                 << " from " << filename;
+    return kUnknownNumaNode;
+  }
+
+  int32 value;
+  if (strings::safe_strto32(content, &value)) {
+    if (value < 0) {
+      LOG(INFO) << "Successful NUMA node read from SysFS had negative value ("
+                << value << "), but there must be at least one NUMA node"
+                            ", so returning NUMA node zero";
+      return 0;
+    }
+    LOG(INFO) << "NUMA node for device: " << device->name << " is " << value;
+    return value;
+  }
+  return kUnknownNumaNode;
+#endif
+}
+
+// Deleter used by RdmaEndpointPtr: tears down the endpoint with
+// rdma_destroy_ep. A null pointer is ignored.
+void EndpointDeleter(rdma_cm_id* id) {
+  if (id == nullptr) return;
+  rdma_destroy_ep(id);
+}
+
+// Deleter used by MemoryRegionPtr: deregisters the memory region with
+// rdma_dereg_mr. A null pointer is ignored.
+void MRDeleter(ibv_mr* mr) {
+  if (mr == nullptr) return;
+  rdma_dereg_mr(mr);
+}
+
+using RdmaEndpointPtr = std::unique_ptr<rdma_cm_id, decltype(&EndpointDeleter)>;
+
+using MemoryRegionPtr = std::unique_ptr<ibv_mr, decltype(&MRDeleter)>;
+
+// RemoteMemoryManager implementation on top of librdmacm / libibverbs.
+//
+// It listens for RDMA connections on host:port, tracks the memory regions it
+// has registered, and keeps every tensor buffer advertised to a peer
+// referenced until that peer sends back the matching tensor key.
+class GdrMemoryManager : public RemoteMemoryManager {
+ public:
+  GdrMemoryManager(const string& host, const string& port);
+
+  virtual ~GdrMemoryManager();
+
+  // Creates the listening endpoint and epoll instance and hooks the memory
+  // allocators so that their allocations get registered.
+  virtual Status Init() override;
+
+  // Event loop: accepts connections and processes release notifications
+  // until Stop() is called.
+  virtual void Run() override;
+
+  // Asks Run() to return.
+  virtual void Stop() override;
+
+  // Sender side: pins `tensor`'s buffer and describes it in
+  // `mutable_transport_options` as a RemoteMemoryRegion.
+  virtual Status TransportOptionsFromTensor(
+      ::google::protobuf::Any* mutable_transport_options, const Tensor& tensor,
+      Device* device, DeviceContext* device_context, bool on_host) override;
+
+  // Receiver side: reads the remote buffer described by `transport_options`
+  // into `tensor`.
+  virtual Status TensorFromTransportOptions(
+      Tensor* tensor, const ::google::protobuf::Any& transport_options,
+      Device* device, DeviceContext* device_context, bool on_host) override;
+
+ protected:
+  // Connects a new endpoint to rdma://host:port and stores it in `endpoint`.
+  Status CreateEndpoint(const string& host, const string& port,
+                        RdmaEndpointPtr& endpoint);
+
+  // Ordering predicate for std::upper_bound over mrs_: true when `ptr` lies
+  // before the end of region `other`.
+  static bool Comparator(const void* ptr, const MemoryRegionPtr& other) {
+    return ptr < reinterpret_cast<char*>(other->addr) + other->length;
+  }
+
+  // Returns the registered region containing `addr`, or nullptr.
+  ibv_mr* FindMemoryRegion(void* addr, size_t length);
+
+  // Registers [addr, addr + length) and records the region in mrs_.
+  void InsertMemoryRegion(void* addr, size_t length);
+
+#if GOOGLE_CUDA
+  void InsertCUDAMemoryRegion(void* addr, size_t length);
+#endif
+
+  // Drops the region that starts at `addr` from mrs_, deregistering it.
+  void EvictMemoryRegion(void* addr, size_t length);
+
+ private:
+  const string host_;
+  const string port_;
+  RdmaEndpointPtr listening_;
+  // True until Run() starts and again once Stop() has been called.
+  std::atomic<bool> stopped_;
+  // epoll instance watching the listening endpoint and accepted clients.
+  int epfd_;
+
+  // Server side endpoints
+  // Accessed sequentially in Run() so not protected by lock
+  std::list<RdmaEndpointPtr> server_clients_;
+
+  using TensorKey = uint32_t;
+  // Source of the keys handed out in TransportOptionsFromTensor().
+  std::atomic<TensorKey> next_key_;
+
+  // Server side on-the-fly tensor buffers
+  mutex server_mu_;
+  std::map<TensorKey, const TensorBuffer*> tensor_buffers_
+      GUARDED_BY(server_mu_);
+
+  // Client side endpoints
+  mutex client_mu_;
+  std::map<std::pair<string, string>, RdmaEndpointPtr> clients_
+      GUARDED_BY(client_mu_);
+
+  // Managed memory regions
+  mutex alloc_mu_;
+  std::vector<MemoryRegionPtr> mrs_ GUARDED_BY(alloc_mu_);
+
+  TF_DISALLOW_COPY_AND_ASSIGN(GdrMemoryManager);
+};
+
+// TODO(byronyi): remove this class duplicated from the one in
+// common/runtime/gpu/pool_allocator.h when it is available in common_runtime
+// SubAllocator that forwards directly to port::AlignedMalloc and
+// port::AlignedFree.
+class BasicCPUAllocator : public SubAllocator {
+ public:
+  ~BasicCPUAllocator() override {}
+
+  // Returns `num_bytes` of memory aligned to `alignment`.
+  void* Alloc(size_t alignment, size_t num_bytes) override {
+    return port::AlignedMalloc(num_bytes, alignment);
+  }
+  // Releases memory obtained from Alloc(); the size argument is unused.
+  void Free(void* ptr, size_t) override { port::AlignedFree(ptr); }
+};
+
+// TODO(byronyi): remove this class and its registration when the default
+// cpu_allocator() returns visitable allocator
+// BFC allocator named "cpu_rdma_bfc" that draws its memory from a
+// BasicCPUAllocator.
+class BFCRdmaAllocator : public BFCAllocator {
+ public:
+  // NOTE(review): 1LL << 36 is 2^36 (64 GiB if the unit is bytes) and `true`
+  // is presumably the allow-growth flag -- confirm against BFCAllocator's
+  // constructor.
+  BFCRdmaAllocator()
+      : BFCAllocator(new BasicCPUAllocator(), 1LL << 36, true, "cpu_rdma_bfc") {
+  }
+};
+
+// Registers the allocator under the name "BFCRdmaAllocator".
+// NOTE(review): 101 is presumably the registration priority -- confirm
+// against allocator_registry.h.
+REGISTER_MEM_ALLOCATOR("BFCRdmaAllocator", 101, BFCRdmaAllocator);
+
+// Records the address to listen on. No system resources are acquired here:
+// the listening endpoint stays null and the epoll descriptor stays invalid
+// (-1) until Init() creates them.
+GdrMemoryManager::GdrMemoryManager(const string& host, const string& port)
+    : host_(host),
+      port_(port),
+      listening_(nullptr, EndpointDeleter),
+      stopped_(true),
+      epfd_(-1),
+      next_key_(0) {}
+
+// Closes the epoll descriptor, but only if Init() actually opened one, so
+// that an unrelated descriptor is never closed by accident.
+GdrMemoryManager::~GdrMemoryManager() {
+  if (epfd_ >= 0) {
+    close(epfd_);
+  }
+}
+
+// Prepares the manager for Run():
+//   1. creates the epoll instance;
+//   2. resolves host_:port_ and creates a passive (listening) RDMA endpoint;
+//   3. switches the endpoint's event channel to non-blocking mode and adds
+//      it to epoll;
+//   4. installs alloc/free visitors on the host allocators so that their
+//      memory is registered/deregistered as it is allocated/freed;
+//   5. with CUDA, and when GDR is available, installs an alloc visitor on
+//      the GPU allocator of the bus derived from the NIC's NUMA node.
+// Returns a non-OK status describing the first step that failed.
+Status GdrMemoryManager::Init() {
+  epfd_ = epoll_create1(0);
+  if (epfd_ == -1) {
+    return errors::Unavailable(strerror(errno), ": ", "epoll_create");
+  }
+
+  rdma_addrinfo* addrinfo;
+  rdma_addrinfo hints = {};
+  hints.ai_port_space = RDMA_PS_TCP;
+  hints.ai_flags = RAI_PASSIVE;
+  if (rdma_getaddrinfo(const_cast<char*>(host_.c_str()),
+                       const_cast<char*>(port_.c_str()), &hints, &addrinfo)) {
+    return errors::Unavailable(strerror(errno), ": ", "cannot resolve rdma://",
+                               host_, ":", port_);
+  }
+
+  ibv_qp_init_attr init_attr = {};
+  init_attr.qp_type = IBV_QPT_RC;
+  init_attr.cap.max_recv_wr = 32;
+  init_attr.cap.max_send_wr = 1;
+  init_attr.cap.max_recv_sge = 1;
+  init_attr.cap.max_send_sge = 1;
+
+  // Create listening endpoint
+  rdma_cm_id* id;
+  if (rdma_create_ep(&id, addrinfo, nullptr, &init_attr)) {
+    // Save errno before releasing the address info so that the cleanup call
+    // cannot clobber the reason reported to the caller.
+    const int create_errno = errno;
+    rdma_freeaddrinfo(addrinfo);
+    return errors::Unavailable(strerror(create_errno), ": ",
+                               "cannot bind to rdma://", host_, ":", port_);
+  }
+  listening_.reset(id);
+  rdma_freeaddrinfo(addrinfo);
+
+  // Listen without backlog
+  if (rdma_listen(listening_.get(), 0)) {
+    return errors::Unavailable(strerror(errno), ": ",
+                               "cannot listen on rdma://", host_, ":", port_);
+  }
+  LOG(INFO) << "RDMA server is listening on " << host_ << ":" << port_;
+
+  if (listening_->verbs == nullptr) {
+    return errors::Unimplemented(
+        "Unsupported address ", host_, ":", port_,
+        " as it does not bind to a particular RDMA device");
+  }
+
+  int flags = fcntl(listening_->channel->fd, F_GETFL, 0);
+  if (fcntl(listening_->channel->fd, F_SETFL, flags | O_NONBLOCK)) {
+    return errors::Unavailable(strerror(errno), ": ",
+                               "cannot set server to non-blocking mode");
+  }
+
+  epoll_event event = {};
+  event.events = EPOLLIN | EPOLLPRI;
+  // Run() recognizes connection requests by comparing against this pointer.
+  event.data.ptr = listening_.get();
+  if (epoll_ctl(epfd_, EPOLL_CTL_ADD, listening_->channel->fd, &event)) {
+    return errors::Unavailable(strerror(errno), ": ",
+                               "cannot add server to epoll");
+  }
+
+  Allocator* allocators[] = {
+#if GOOGLE_CUDA
+    ProcessState::singleton()->GetCUDAHostAllocator(0),
+    ProcessState::singleton()->GetCPUAllocator(0),
+#endif  // GOOGLE_CUDA
+    cpu_allocator(),
+  };
+
+  using namespace std::placeholders;
+  VisitableAllocator::Visitor alloc_visitor =
+      std::bind(&GdrMemoryManager::InsertMemoryRegion, this, _1, _2);
+  VisitableAllocator::Visitor free_visitor =
+      std::bind(&GdrMemoryManager::EvictMemoryRegion, this, _1, _2);
+
+  std::set<Allocator*> instrumented_;
+
+  // Host memory allocators
+  for (Allocator* allocator : allocators) {
+    auto* visitable_allocator = dynamic_cast<VisitableAllocator*>(allocator);
+    CHECK(visitable_allocator) << allocator->Name()
+                               << " is not visitable for instrumentation";
+    // Make sure we don't instrument the same allocator twice
+    if (instrumented_.find(allocator) == std::end(instrumented_)) {
+      visitable_allocator->AddAllocVisitor(alloc_visitor);
+      visitable_allocator->AddFreeVisitor(free_visitor);
+      instrumented_.insert(allocator);
+      LOG(INFO) << "Instrumenting CPU allocator " << allocator->Name();
+    }
+  }
+
+#if GOOGLE_CUDA
+  VisitableAllocator::Visitor cuda_alloc_visitor =
+      std::bind(&GdrMemoryManager::InsertMemoryRegion, this, _1, _2);
+  if (IsGDRAvailable()) {
+    // Note we don't free allocated GPU memory so there is no free visitor
+    int32_t bus_id = TryToReadNumaNode(listening_->verbs->device) + 1;
+    ProcessState::singleton()->AddGPUAllocVisitor(bus_id, cuda_alloc_visitor);
+    LOG(INFO) << "Instrumenting GPU allocator with bus_id " << bus_id;
+  }
+#endif  // GOOGLE_CUDA
+
+  return Status::OK();
+}
+
+// Prepares a freshly accepted endpoint for use by the event loop: arms CQ
+// notification, pre-posts 32 empty receives (matching max_recv_wr of the
+// listening endpoint), switches the CQ channel to non-blocking mode and adds
+// it to the epoll instance `epfd`. Logs and returns false on the first
+// failure; the caller keeps ownership of `id` either way.
+bool ArmServerClient(rdma_cm_id* id, int epfd) {
+  if (ibv_req_notify_cq(id->recv_cq, 0)) {
+    LOG(ERROR) << strerror(errno) << ": ibv_req_notify_cq failed";
+    return false;
+  }
+  for (int n = 0; n < 32; n++) {
+    if (rdma_post_recvv(id, nullptr, nullptr, 0)) {
+      LOG(ERROR) << strerror(errno) << ": rdma_post_recvv failed";
+      return false;
+    }
+  }
+  int flags = fcntl(id->recv_cq_channel->fd, F_GETFL, 0);
+  if (fcntl(id->recv_cq_channel->fd, F_SETFL, flags | O_NONBLOCK)) {
+    LOG(ERROR) << strerror(errno)
+               << ": cannot set server_client to non-blocking mode";
+    return false;
+  }
+  epoll_event event = {};
+  event.events = EPOLLIN | EPOLLPRI;
+  event.data.ptr = id;
+  if (epoll_ctl(epfd, EPOLL_CTL_ADD, id->recv_cq_channel->fd, &event)) {
+    LOG(ERROR) << strerror(errno) << ": cannot add server client to epoll";
+    return false;
+  }
+  return true;
+}
+
+// Server event loop. Until Stop() is called it waits (1 ms at a time) on the
+// epoll instance and handles two kinds of events:
+//   - on the listening endpoint: accepts the pending connection and starts
+//     watching its receive completion queue;
+//   - on an accepted endpoint: drains receive completions; each
+//     RDMA-write-with-immediate carries a tensor key whose buffer is then
+//     unreferenced and forgotten.
+// Returns early if epoll_wait fails for a reason other than EINTR.
+void GdrMemoryManager::Run() {
+  stopped_ = false;
+  while (!stopped_) {
+    epoll_event events[32];
+    int num_events = epoll_wait(epfd_, events, 32, 1);
+    if (num_events == -1) {
+      // An interrupting signal is not a failure of the loop itself.
+      if (errno == EINTR) continue;
+      LOG(ERROR) << "epoll_wait: " << strerror(errno);
+      return;
+    }
+    for (int i = 0; i < num_events; i++) {
+      rdma_cm_id* id = static_cast<rdma_cm_id*>(events[i].data.ptr);
+      if (id == listening_.get()) {
+        // Accept incoming connections
+        rdma_cm_id* client = nullptr;
+        if (rdma_get_request(listening_.get(), &client)) {
+          continue;
+        }
+        if (rdma_accept(client, nullptr)) {
+          LOG(ERROR) << strerror(errno) << ": rdma_accept failed";
+          EndpointDeleter(client);
+          continue;
+        }
+        LOG(INFO) << "Accepted new RDMA connection";
+        if (!ArmServerClient(client, epfd_)) {
+          // Destroy the endpoint exactly once and never touch it again.
+          EndpointDeleter(client);
+          continue;
+        }
+        server_clients_.push_back({client, EndpointDeleter});
+        continue;
+      }
+
+      // Polling work completions
+      ibv_cq* cq;
+      void* context;
+      if (ibv_get_cq_event(id->recv_cq_channel, &cq, &context)) {
+        continue;
+      }
+      ibv_ack_cq_events(id->recv_cq, 1);
+      if (ibv_req_notify_cq(id->recv_cq, 0)) {
+        LOG(ERROR) << strerror(errno) << ": ibv_req_notify_cq failed";
+        continue;
+      }
+      ibv_wc wc[32];
+      int num_wc = ibv_poll_cq(id->recv_cq, 32, wc);
+      if (num_wc < 0) {
+        LOG(ERROR) << "ibv_poll_cq failed";
+        continue;
+      }
+      for (int j = 0; j < num_wc; j++) {
+        if (wc[j].status != IBV_WC_SUCCESS) {
+          // For a failed completion the opcode and immediate data are not
+          // valid, so it must not be used to release a tensor buffer.
+          LOG(ERROR) << ibv_wc_status_str(wc[j].status);
+          continue;
+        }
+        if (wc[j].opcode != IBV_WC_RECV_RDMA_WITH_IMM) {
+          LOG(ERROR) << "Received unknown operation " << wc[j].opcode;
+        } else {
+          TensorKey tensor_key = ntohl(wc[j].imm_data);
+          mutex_lock l(server_mu_);
+          auto iter = tensor_buffers_.find(tensor_key);
+          if (iter == std::end(tensor_buffers_)) {
+            LOG(ERROR) << "Cannot find tensor buffer for tensor key "
+                       << tensor_key;
+          } else {
+            // Drop the reference taken in TransportOptionsFromTensor().
+            const TensorBuffer* buffer = iter->second;
+            buffer->Unref();
+            tensor_buffers_.erase(iter);
+          }
+        }
+        // Replace the receive that this completion consumed.
+        if (rdma_post_recvv(id, nullptr, nullptr, 0)) {
+          LOG(ERROR) << strerror(errno) << ": rdma_post_recvv failed";
+        }
+      }
+    }
+  }
+}
+
+// Raises the stop flag that the loop in Run() checks on every iteration.
+void GdrMemoryManager::Stop() {
+  stopped_.store(true);
+}
+
+// Sender side of a transfer: advertises `tensor`'s buffer to a remote peer.
+//
+// The buffer must lie in a registered memory region. With CUDA, a device
+// tensor (`!on_host`) whose buffer is registered is used directly after
+// syncing the device; an unregistered device tensor is first copied into a
+// host tensor taken from the CUDA host allocator, and that copy is
+// advertised instead.
+//
+// On success an extra reference to the advertised buffer is stored under a
+// fresh tensor key (released by Run() when the peer sends the key back) and
+// `mutable_transport_options` receives a packed RemoteMemoryRegion.
+//
+// Returns Unavailable for empty tensors or when no registered region covers
+// the buffer, or the error of the device-to-host copy.
+Status GdrMemoryManager::TransportOptionsFromTensor(
+    ::google::protobuf::Any* mutable_transport_options, const Tensor& tensor,
+    Device* device, DeviceContext* device_context, bool on_host) {
+  auto buffer = DMAHelper::buffer(&tensor);
+  void* addr = buffer->data();
+  size_t length = buffer->size();
+  if (length == 0) {
+    return errors::Unavailable("Cannot register tensor buffer of size 0");
+  }
+
+  ibv_mr* mr = FindMemoryRegion(addr, length);
+
+  Tensor host_copy;
+#if GOOGLE_CUDA
+  if (!on_host && mr != nullptr) {
+    TF_RETURN_IF_ERROR(GPUUtil::Sync(device));
+  } else if (!on_host) {
+    Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0);
+    host_copy = Tensor(alloc, tensor.dtype(), tensor.shape());
+    Status s;
+    Notification n;
+    GPUUtil::CopyGPUTensorToCPU(device, device_context, &tensor, &host_copy,
+                                [&s, &n](const Status& status) {
+                                  s.Update(status);
+                                  n.Notify();
+                                });
+    // Block until the asynchronous copy has completed.
+    n.WaitForNotification();
+    if (!s.ok()) {
+      return s;
+    }
+    // From here on the host copy is the buffer that gets advertised.
+    buffer = DMAHelper::buffer(&host_copy);
+    addr = buffer->data();
+    length = buffer->size();
+    mr = FindMemoryRegion(addr, length);
+  }
+#endif
+
+  if (mr == nullptr) {
+    return errors::Unavailable("Cannot find pinned memory region");
+  }
+
+  // Keep the buffer alive until the peer reports that it has read it.
+  buffer->Ref();
+  TensorKey tensor_key = next_key_++;
+  {
+    mutex_lock l(server_mu_);
+    tensor_buffers_.insert(std::make_pair(tensor_key, buffer));
+  }
+
+  // The checksum is a debugging aid and is only computed at VLOG level 2.
+  uint64_t checksum = 0;
+  if (VLOG_IS_ON(2)) {
+#if GOOGLE_CUDA
+    if (device->tensorflow_gpu_device_info() && (!on_host)) {
+      if (host_copy.NumElements() > 0) {
+        checksum = GPUUtil::Checksum(device, device_context, host_copy);
+      } else {
+        checksum = GPUUtil::Checksum(device, device_context, tensor);
+      }
+    } else {
+      checksum = GPUUtil::Checksum(tensor);
+    }
+#endif
+  }
+
+  RemoteMemoryRegion remote_mr;
+  remote_mr.set_host(host_);
+  remote_mr.set_port(port_);
+  remote_mr.set_addr(reinterpret_cast<uint64_t>(addr));
+  remote_mr.set_rkey(mr->rkey);
+  remote_mr.set_tensor_key(tensor_key);
+  remote_mr.set_checksum(checksum);
+  mutable_transport_options->PackFrom(remote_mr);
+
+  return Status::OK();
+}
+
+// Receiver side of a transfer: fills `tensor` from the remote buffer
+// described by `transport_options`.
+//
+// The destination must lie in a registered memory region. With CUDA, a
+// registered device tensor is written to directly after syncing the device;
+// an unregistered device tensor is received into a host tensor taken from
+// the CUDA host allocator and copied to the device afterwards.
+//
+// The transfer is an RDMA READ followed by a fenced, signaled
+// WRITE-with-immediate that carries the tensor key back to the sender; this
+// call blocks until the completion of the latter is available.
+//
+// Returns NotFound if `transport_options` holds no RemoteMemoryRegion,
+// Unavailable if the destination is not registered or an RDMA operation
+// fails, or the error of endpoint creation / host-to-device copy.
+Status GdrMemoryManager::TensorFromTransportOptions(
+    Tensor* tensor, const ::google::protobuf::Any& transport_options,
+    Device* device, DeviceContext* device_context, bool on_host) {
+  RemoteMemoryRegion remote_mr;
+  if (!transport_options.UnpackTo(&remote_mr)) {
+    return errors::NotFound("No RDMA transport options found");
+  }
+
+  auto buffer = DMAHelper::buffer(tensor);
+  void* addr = buffer->data();
+  size_t length = buffer->size();
+  ibv_mr* mr = FindMemoryRegion(addr, length);
+
+  Tensor host_copy;
+#if GOOGLE_CUDA
+  if (!on_host && mr != nullptr) {
+    TF_RETURN_IF_ERROR(GPUUtil::Sync(device));
+  } else if (!on_host) {
+    // Receive into a host staging tensor; it is copied to the device below.
+    Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0);
+    host_copy = Tensor(alloc, tensor->dtype(), tensor->shape());
+    buffer = DMAHelper::buffer(&host_copy);
+    addr = buffer->data();
+    length = buffer->size();
+    mr = FindMemoryRegion(addr, length);
+  }
+#endif  // GOOGLE_CUDA
+
+  if (mr == nullptr) {
+    return errors::Unavailable("Cannot find pinned memory region");
+  }
+
+  // Reuse the endpoint for this peer if there is one. An entry holding a
+  // null endpoint is left behind by an earlier failed attempt and retried.
+  decltype(clients_)::iterator iter;
+  bool success;
+  {
+    mutex_lock l(client_mu_);
+    std::tie(iter, success) = clients_.insert(
+        std::make_pair(std::make_pair(remote_mr.host(), remote_mr.port()),
+                       RdmaEndpointPtr(nullptr, EndpointDeleter)));
+    if (success || iter->second.get() == nullptr) {
+      TF_RETURN_IF_ERROR(
+          CreateEndpoint(remote_mr.host(), remote_mr.port(), iter->second));
+    }
+  }
+  rdma_cm_id* id = iter->second.get();
+
+  uint64_t start = Env::Default()->NowMicros();
+
+  if (rdma_post_read(id, nullptr, buffer->data(), buffer->size(), mr, 0,
+                     remote_mr.addr(), remote_mr.rkey())) {
+    return errors::Unavailable(strerror(errno), ": ", "rdma_post_read failed");
+  }
+
+  // Zero-length write whose immediate data tells the sender which tensor
+  // buffer it may release. The fence orders it after the read above.
+  ibv_send_wr wr = {};
+  wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
+  wr.imm_data = htonl(remote_mr.tensor_key());
+  wr.send_flags = IBV_SEND_FENCE | IBV_SEND_SIGNALED;
+  ibv_send_wr* bad_wr;
+  if (ibv_post_send(id->qp, &wr, &bad_wr)) {
+    return errors::Unavailable(strerror(errno), ": ", "ibv_post_send failed");
+  }
+
+  ibv_wc wc = {};
+  int ret = rdma_get_send_comp(id, &wc);
+  if (ret < 0) {
+    // No completion was retrieved, so `wc` carries no status; report errno.
+    return errors::Unavailable(strerror(errno), ": ",
+                               "rdma_get_send_comp failed");
+  }
+  if (wc.status) {
+    return errors::Unavailable(ibv_wc_status_str(wc.status));
+  }
+
+#if GOOGLE_CUDA
+  if (host_copy.NumElements() > 0) {
+    Status s;
+    Notification n;
+    GPUUtil::CopyCPUTensorToGPU(&host_copy, device_context, device, tensor,
+                                [&s, &n](const Status& status) {
+                                  s.Update(status);
+                                  n.Notify();
+                                });
+    // Block until the asynchronous copy has completed.
+    n.WaitForNotification();
+    if (!s.ok()) {
+      return s;
+    }
+  }
+#endif  // GOOGLE_CUDA
+
+  uint64_t end = Env::Default()->NowMicros();
+
+  VLOG(2) << "RDMA from remote memory region " << remote_mr.rkey()
+          << " of size " << buffer->size() << " with tensor key "
+          << remote_mr.tensor_key() << " took " << (end - start) << " micros";
+
+  // At VLOG level 2 the received data is verified against the checksum the
+  // sender computed; a mismatch is fatal.
+  uint64_t checksum = 0;
+  if (VLOG_IS_ON(2)) {
+#if GOOGLE_CUDA
+    if (device->tensorflow_gpu_device_info() && (!on_host)) {
+      if (host_copy.NumElements() > 0) {
+        checksum = GPUUtil::Checksum(device, device_context, host_copy);
+      } else {
+        checksum = GPUUtil::Checksum(device, device_context, *tensor);
+      }
+    } else {
+      checksum = GPUUtil::Checksum(*tensor);
+    }
+    CHECK(checksum == remote_mr.checksum()) << "Checksum mismatch: " << checksum
+                                            << "!=" << remote_mr.checksum();
+#endif
+  }
+  return Status::OK();
+}
+
+// Resolves rdma://host:port, creates a reliable-connection endpoint for it
+// and connects. On success `endpoint` takes ownership of the new
+// rdma_cm_id; on failure `endpoint` is left untouched and the returned
+// status names the step that failed.
+Status GdrMemoryManager::CreateEndpoint(const string& host, const string& port,
+                                        RdmaEndpointPtr& endpoint) {
+  rdma_addrinfo hints = {};
+  hints.ai_port_space = RDMA_PS_TCP;
+  rdma_addrinfo* resolved;
+  if (rdma_getaddrinfo(const_cast<char*>(host.c_str()),
+                       const_cast<char*>(port.c_str()), &hints, &resolved)) {
+    return errors::InvalidArgument(
+        strerror(errno), ": ", "cannot connect to rdma://", host, ":", port);
+  }
+
+  ibv_qp_init_attr qp_attr = {};
+  qp_attr.qp_type = IBV_QPT_RC;
+  qp_attr.cap.max_send_wr = 32;
+  qp_attr.cap.max_recv_wr = 1;
+  qp_attr.cap.max_send_sge = 1;
+  qp_attr.cap.max_recv_sge = 1;
+
+  // The resolved address is only needed to create the endpoint, so it is
+  // released right after the call whatever the outcome.
+  rdma_cm_id* raw = nullptr;
+  const bool create_failed =
+      rdma_create_ep(&raw, resolved, nullptr, &qp_attr) != 0;
+  rdma_freeaddrinfo(resolved);
+  if (create_failed) {
+    return errors::Unavailable(strerror(errno), ": ",
+                               "cannot create endpoint to rdma://", host, ":",
+                               port);
+  }
+
+  if (rdma_connect(raw, nullptr)) {
+    rdma_destroy_ep(raw);
+    return errors::Unavailable(strerror(errno), ": ",
+                               "cannot connect to rdma://", host, ":", port);
+  }
+
+  LOG(INFO) << "RDMA endpoint connected to rdma://" << host << ":" << port;
+  endpoint = RdmaEndpointPtr(raw, EndpointDeleter);
+  return Status::OK();
+}
+
+// Looks up the registered memory region that contains `addr`.
+//
+// Picks the first region in mrs_ whose end lies beyond `addr` and returns it
+// if it also starts at or before `addr`. Returns nullptr otherwise, and for
+// a zero `length` (which is not otherwise consulted).
+ibv_mr* GdrMemoryManager::FindMemoryRegion(void* addr, size_t length) {
+  if (length == 0) return nullptr;
+  mutex_lock l(alloc_mu_);
+  auto candidate =
+      std::upper_bound(mrs_.begin(), mrs_.end(), addr, &Comparator);
+  if (candidate != std::end(mrs_) && candidate->get()->addr <= addr) {
+    return candidate->get();
+  }
+  return nullptr;
+}
+
+// Registers [addr, addr + length) for remote reads on the listening
+// endpoint and stores the resulting region in mrs_ at the position given by
+// Comparator. Zero-length requests are ignored; a failed registration only
+// produces a warning.
+void GdrMemoryManager::InsertMemoryRegion(void* addr, size_t length) {
+  if (length == 0) return;
+  ibv_mr* registered = rdma_reg_read(listening_.get(), addr, length);
+  if (registered == nullptr) {
+    LOG(WARNING) << "Cannot register memory region";
+    return;
+  }
+  mutex_lock l(alloc_mu_);
+  auto pos = std::upper_bound(mrs_.begin(), mrs_.end(), addr, &Comparator);
+  mrs_.insert(pos, {registered, &MRDeleter});
+}
+
+// Removes the region that starts exactly at `addr` from mrs_; erasing the
+// entry runs MRDeleter and thereby deregisters it. Zero-length requests are
+// ignored; if no such region is found only a warning is logged.
+void GdrMemoryManager::EvictMemoryRegion(void* addr, size_t length) {
+  if (length == 0) return;
+  mutex_lock l(alloc_mu_);
+  auto pos = std::upper_bound(mrs_.begin(), mrs_.end(), addr, &Comparator);
+  const bool found = pos != std::end(mrs_) && pos->get()->addr == addr;
+  if (!found) {
+    LOG(WARNING) << "Failed to de-register memory region";
+    return;
+  }
+  mrs_.erase(pos);
+}
+
+}  // namespace
+
+// Factory for the GDR-backed RemoteMemoryManager bound to `host`:`port`.
+// The object is allocated with `new` and returned as a raw pointer; it is
+// not initialized here (Init() still has to be called).
+// NOTE(review): presumably the caller takes ownership -- confirm at the
+// call site.
+RemoteMemoryManager* CreateRemoteMemoryManager(const string& host,
+                                               const string& port) {
+  return new GdrMemoryManager(host, port);
+}
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_USE_GDR
diff --git a/tensorflow/contrib/gdr/gdr_memory_manager.h b/tensorflow/contrib/gdr/gdr_memory_manager.h
new file mode 100644
index 00000000000..7e9fe01e979
--- /dev/null
+++ b/tensorflow/contrib/gdr/gdr_memory_manager.h
@@ -0,0 +1,63 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef GDR_MEMORY_MANAGER_H_
+#define GDR_MEMORY_MANAGER_H_
+
+#include "tensorflow/core/lib/core/status.h"
+
+namespace google {
+namespace protobuf {
+class Any;
+}
+}
+
+namespace tensorflow {
+
+class Device;
+class DeviceContext;
+class Tensor;
+
+// Abstract interface that handles out-of-band tensor transport.
+//
+// The transport options are encoded into a protocol buffer and transmitted via
+// some other communication channels like RPC.
+// See RecvTensorRequest in tensorflow/core/protobuf/worker.proto
+class RemoteMemoryManager {
+ public:
+  virtual ~RemoteMemoryManager() {}
+  virtual Status Init() = 0;  // Called from GdrServer::Init().
+  virtual void Run() = 0;     // GdrServer runs this on a dedicated thread.
+  virtual void Stop() = 0;    // Called from GdrServer::Stop().
+
+  // Encodes the information describing `tensor` into the protocol buffer
+  // `mutable_transport_options`, which must then be sent via another channel.
+  virtual Status TransportOptionsFromTensor(
+      ::google::protobuf::Any* mutable_transport_options, const Tensor& tensor,
+      Device* device, DeviceContext* device_context, bool on_host) = 0;
+
+  // Retrieves the tensor described by `transport_options` into `tensor`,
+  // which has to be allocated already but does not need to be initialized.
+  virtual Status TensorFromTransportOptions(
+      Tensor* tensor, const ::google::protobuf::Any& transport_options,
+      Device* device, DeviceContext* device_context, bool on_host) = 0;
+};
+
+RemoteMemoryManager* CreateRemoteMemoryManager(const string& host,
+                                               const string& port);
+
+}  // namespace tensorflow
+
+#endif  // GDR_MEMORY_MANAGER_H_
diff --git a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
new file mode 100644
index 00000000000..259ee8817dd
--- /dev/null
+++ b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
@@ -0,0 +1,201 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/gdr/gdr_rendezvous_mgr.h"
+
+#include "google/protobuf/any.pb.h"
+#include "tensorflow/contrib/gdr/gdr_memory_manager.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/distributed_runtime/tensor_coding.h"
+#include "tensorflow/core/distributed_runtime/worker_cache.h"
+#include "tensorflow/core/distributed_runtime/worker_interface.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+namespace {
+
+// RecvTensor RPC that fills the tensor via RemoteMemoryManager when possible.
+class GdrRecvTensorCall : public BaseRecvTensorCall {
+ public:
+  GdrRecvTensorCall(WorkerInterface* wi, Device* dst_device,
+                    RemoteMemoryManager* remote_memory_manager,
+                    const Rendezvous::Args& recv_args, int64 step_id,
+                    StringPiece key)
+      : wi_(wi),
+        dst_device_(dst_device),
+        remote_memory_manager_(remote_memory_manager),
+        recv_args_(recv_args) {
+    req_.set_step_id(step_id);
+    req_.set_rendezvous_key(key.data(), key.size());
+  }
+
+  ~GdrRecvTensorCall() override {}
+
+  // Issues the RPC; `recv_done` runs after the response has been processed.
+  void Start(std::function<void()> recv_done) override {
+    req_.set_dma_ok(true);
+    resp_.InitAlloc(dst_device_, recv_args_.alloc_attrs);
+    StatusCallback cb = [this, recv_done](const Status& s) {
+      const bool dma_ok = resp_.metadata().has_transport_options();
+      if (s.ok() && tensor().TotalBytes() > 0 && (!is_dead()) && dma_ok) {
+        const auto& transport_options = resp_.metadata().transport_options();
+        const bool on_host =
+            (dst_device_->tensorflow_gpu_device_info() == nullptr) ||
+            recv_args_.alloc_attrs.on_host();
+        Status dma_status = remote_memory_manager_->TensorFromTransportOptions(
+            const_cast<Tensor*>(&tensor()), transport_options, dst_device_,
+            recv_args_.device_context, on_host);
+        if (!dma_status.ok()) {
+          mutex_lock l(mu_);
+          status_.Update(dma_status);
+          LOG(ERROR)
+              << "Cannot find pinned memory region from allocator "
+              << dst_device_->GetAllocator(recv_args_.alloc_attrs)->Name()
+              << ": " << dma_status;
+        }
+      }
+      if (!s.ok()) {
+        mutex_lock l(mu_);
+        status_.Update(s);
+      }
+      recv_done();
+    };
+    wi_->RecvTensorAsync(&opts_, &req_, &resp_, std::move(cb));
+  }
+
+  void StartAbort(const Status& s) override {
+    {
+      mutex_lock l(mu_);
+      status_.Update(s);
+    }
+    opts_.StartCancel();
+  }
+
+  Status status() const override {
+    mutex_lock l(mu_);
+    return status_;
+  }
+
+  const Tensor& tensor() const { return resp_.tensor(); }
+  bool is_dead() const { return resp_.metadata().is_dead(); }
+  Device* dst_device() const { return dst_device_; }
+  const Rendezvous::Args& recv_args() const { return recv_args_; }
+
+ private:
+  WorkerInterface* wi_;
+  Device* dst_device_;
+  RemoteMemoryManager* remote_memory_manager_;
+  CallOptions opts_;
+  RecvTensorRequest req_;
+  TensorResponse resp_;
+  Rendezvous::Args recv_args_;
+
+  mutable mutex mu_;
+  Status status_ GUARDED_BY(mu_);
+
+  TF_DISALLOW_COPY_AND_ASSIGN(GdrRecvTensorCall);
+};
+
+// BaseRemoteRendezvous that receives remote tensors via GdrRecvTensorCall.
+class GdrRemoteRendezvous : public BaseRemoteRendezvous {
+ public:
+  GdrRemoteRendezvous(const WorkerEnv* env, int64 step_id,
+                      RemoteMemoryManager* remote_memory_manager)
+      : BaseRemoteRendezvous(env, step_id),
+        remote_memory_manager_(remote_memory_manager) {}
+
+ protected:
+  // Starts a GdrRecvTensorCall for `parsed`. If the source worker or the
+  // destination device cannot be resolved, `done` gets the error instead.
+  void RecvFromRemoteAsync(const Rendezvous::ParsedKey& parsed,
+                           const Rendezvous::Args& recv_args,
+                           DoneCallback done) override {
+    CHECK(is_initialized());
+
+    string worker_name;
+    string relative_device;
+    if (!DeviceNameUtils::SplitDeviceName(parsed.src_device, &worker_name,
+                                          &relative_device)) {
+      done(errors::Internal(parsed.src_device,
+                            " is invalid remote source device."),
+           Args(), recv_args, Tensor{}, false);
+      return;
+    }
+
+    WorkerSession* sess = session();
+    WorkerInterface* peer = sess->worker_cache->CreateWorker(worker_name);
+    if (peer == nullptr) {
+      done(errors::Internal("No worker known as ", worker_name), Args(),
+           recv_args, Tensor{}, false);
+      return;
+    }
+
+    Device* dst_device;
+    const Status lookup_status =
+        sess->device_mgr->LookupDevice(parsed.dst_device, &dst_device);
+    if (!lookup_status.ok()) {
+      sess->worker_cache->ReleaseWorker(worker_name, peer);
+      done(lookup_status, Args(), recv_args, Tensor{}, false);
+      return;
+    }
+
+    // Register the call before starting it so that it can be aborted.
+    GdrRecvTensorCall* call =
+        new GdrRecvTensorCall(peer, dst_device, remote_memory_manager_,
+                              recv_args, step_id_, parsed.FullKey());
+    RegisterCall(call);
+
+    // This reference is released at the end of the completion callback.
+    Ref();
+    call->Start([this, call, worker_name, peer, done]() {
+      // After deregistration the call can no longer be aborted.
+      DeregisterCall(call);
+      done(call->status(), Args(), call->recv_args(), call->tensor(),
+           call->is_dead());
+      session()->worker_cache->ReleaseWorker(worker_name, peer);
+      delete call;
+      Unref();
+    });
+  }
+
+ private:
+  ~GdrRemoteRendezvous() override {}
+
+  RemoteMemoryManager* remote_memory_manager_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(GdrRemoteRendezvous);
+};
+
+}  // namespace
+
+GdrRendezvousMgr::GdrRendezvousMgr(const WorkerEnv* env,
+                                   RemoteMemoryManager* remote_memory_manager)
+    : BaseRendezvousMgr(env), remote_memory_manager_(remote_memory_manager) {}
+// Every rendezvous created below shares the same RemoteMemoryManager pointer.
+BaseRemoteRendezvous* GdrRendezvousMgr::Create(int64 step_id,
+                                               const WorkerEnv* worker_env) {
+  return new GdrRemoteRendezvous(worker_env, step_id, remote_memory_manager_);
+}
+
+}  // end namespace tensorflow
diff --git a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.h b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.h
new file mode 100644
index 00000000000..7fedd04f549
--- /dev/null
+++ b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.h
@@ -0,0 +1,42 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef GDR_RENDEZVOUS_MGR_H_
+#define GDR_RENDEZVOUS_MGR_H_
+
+#include "tensorflow/contrib/gdr/gdr_memory_manager.h"
+#include "tensorflow/core/distributed_runtime/base_rendezvous_mgr.h"
+#include "tensorflow/core/distributed_runtime/worker_env.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace tensorflow {
+
+class GdrRendezvousMgr : public BaseRendezvousMgr {
+ public:
+  explicit GdrRendezvousMgr(const WorkerEnv* env,
+                            RemoteMemoryManager* remote_memory_manager);
+
+ protected:
+  BaseRemoteRendezvous* Create(int64 step_id,
+                               const WorkerEnv* worker_env) override;
+
+ private:
+  RemoteMemoryManager* remote_memory_manager_;  // Not owned
+  TF_DISALLOW_COPY_AND_ASSIGN(GdrRendezvousMgr);
+};
+
+}  // end namespace tensorflow
+
+#endif  // GDR_RENDEZVOUS_MGR_H_
diff --git a/tensorflow/contrib/gdr/gdr_server_lib.cc b/tensorflow/contrib/gdr/gdr_server_lib.cc
new file mode 100644
index 00000000000..ae6a612ecfc
--- /dev/null
+++ b/tensorflow/contrib/gdr/gdr_server_lib.cc
@@ -0,0 +1,127 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/gdr/gdr_server_lib.h"
+#include "tensorflow/contrib/gdr/gdr_memory_manager.h"
+#include "tensorflow/contrib/gdr/gdr_rendezvous_mgr.h"
+#include "tensorflow/contrib/gdr/gdr_worker.h"
+
+#include "net/grpc/public/include/grpc/support/alloc.h"
+
+namespace tensorflow {
+
+// Looks up this task's address in the cluster definition and builds the
+// RemoteMemoryManager from its host and port (both empty if none matched).
+GdrServer::GdrServer(const ServerDef& server_def, Env* env)
+    : GrpcServer(server_def, env) {
+  string host;
+  string port;
+  for (const auto& job : server_def.cluster().job()) {
+    if (job.name() != server_def.job_name()) continue;
+    const auto task = job.tasks().find(server_def.task_index());
+    if (task == job.tasks().end()) continue;
+    // Only an address that splits into exactly two ':'-separated fields is
+    // accepted; anything else leaves host and port untouched.
+    const std::vector<string> parts = str_util::Split(task->second, ':');
+    if (parts.size() != 2) continue;
+    host = parts[0];
+    port = parts[1];
+  }
+
+  remote_memory_manager_.reset(CreateRemoteMemoryManager(host, port));
+}
+
+GdrServer::~GdrServer() {}  // Members are released by their unique_ptrs.
+
+// Wires GDR-specific rendezvous-manager and worker factories into the base
+// gRPC server, then initializes the RemoteMemoryManager.
+Status GdrServer::Init() {
+  WorkerCreationFunction make_worker = [this](WorkerEnv* env) {
+    RemoteMemoryManager* manager = remote_memory_manager_.get();
+    return std::unique_ptr<GdrWorker>(new GdrWorker(env, manager));
+  };
+  RendezvousMgrCreationFunction make_mgr = [this](const WorkerEnv* env) {
+    return new GdrRendezvousMgr(env, remote_memory_manager_.get());
+  };
+  TF_RETURN_IF_ERROR(GrpcServer::Init(nullptr, make_mgr, make_worker));
+
+  return remote_memory_manager_->Init();
+}
+
+Status GdrServer::Start() {
+  auto run_manager = [this] { remote_memory_manager_->Run(); };
+  {  // gdr_thread_ is guarded by mu_; release the lock before starting gRPC.
+    mutex_lock lock(mu_);
+    gdr_thread_.reset(worker_env()->env->StartThread(
+        ThreadOptions(), "TF_gdr_service", run_manager));
+  }
+  return GrpcServer::Start();
+}
+
+Status GdrServer::Stop() {  // Stops the manager only if gRPC stopped cleanly.
+  const Status grpc_status = GrpcServer::Stop();
+  if (grpc_status.ok()) remote_memory_manager_->Stop();
+  return grpc_status;
+}
+
+Status GdrServer::Join() {
+  {
+    mutex_lock l(mu_);    // gdr_thread_ is GUARDED_BY(mu_).
+    gdr_thread_.reset();  // Destroys the Thread; presumably joins it (confirm).
+  }
+  return GrpcServer::Join();
+}
+
+/* static */
+Status GdrServer::Create(const ServerDef& server_def, Env* env,
+                         std::unique_ptr<ServerInterface>* out_server) {
+  if (env == nullptr) env = Env::Default();  // Fall back to the default Env.
+  std::unique_ptr<GdrServer> server(new GdrServer(server_def, env));
+  const Status init_status = server->Init();
+  if (init_status.ok()) *out_server = std::move(server);
+  return init_status;  // On failure `*out_server` is left untouched.
+}
+
+namespace {
+
+class GdrServerFactory : public ServerFactory {
+ public:
+  bool AcceptsOptions(const ServerDef& server_def) override {
+    return server_def.protocol() == "grpc+gdr";  // Exact protocol match.
+  }
+  // Builds a GdrServer on the default Env and stores it in `*out_server`.
+  Status NewServer(const ServerDef& server_def,
+                   std::unique_ptr<ServerInterface>* out_server) override {
+    return GdrServer::Create(server_def, Env::Default(), out_server);
+  }
+};
+
+// Registers a `ServerFactory` for `GdrServer` instances.
+class GdrServerRegistrar {
+ public:
+  GdrServerRegistrar() {
+    gpr_allocation_functions alloc_fns;
+    memset(&alloc_fns, 0, sizeof(alloc_fns));  // Zero fields not set below.
+    alloc_fns.malloc_fn = port::Malloc;
+    alloc_fns.realloc_fn = port::Realloc;
+    alloc_fns.free_fn = port::Free;
+    gpr_set_allocation_functions(alloc_fns);  // gRPC now allocates via port::.
+    ServerFactory::Register("GDR_SERVER", new GdrServerFactory());
+  }
+};
+static GdrServerRegistrar registrar;  // Registers at static-init time.
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/gdr/gdr_server_lib.h b/tensorflow/contrib/gdr/gdr_server_lib.h
new file mode 100644
index 00000000000..d6c40d429e2
--- /dev/null
+++ b/tensorflow/contrib/gdr/gdr_server_lib.h
@@ -0,0 +1,52 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef GDR_SERVER_LIB_H_
+#define GDR_SERVER_LIB_H_
+
+#include "tensorflow/contrib/gdr/gdr_memory_manager.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h"
+
+namespace tensorflow {
+
+// GrpcServer variant that also owns a RemoteMemoryManager and the thread that
+// runs it. Instances are obtained through Create().
+class GdrServer : public GrpcServer {
+ public:
+  // Builds and initializes a server; a null `env` selects Env::Default().
+  static Status Create(const ServerDef& server_def, Env* env,
+                       std::unique_ptr<ServerInterface>* out_server);
+
+  ~GdrServer() override;
+
+  // Lifecycle overrides that additionally drive the RemoteMemoryManager.
+  Status Start() override;
+  Status Stop() override;
+  Status Join() override;
+
+ protected:
+  GdrServer(const ServerDef& server_def, Env* env);
+
+  Status Init();
+
+ private:
+  mutex mu_;
+  std::unique_ptr<RemoteMemoryManager> remote_memory_manager_;
+  std::unique_ptr<Thread> gdr_thread_ GUARDED_BY(mu_);
+};
+
+}  // namespace tensorflow
+
+#endif  // GDR_SERVER_LIB_H_
diff --git a/tensorflow/contrib/gdr/gdr_worker.cc b/tensorflow/contrib/gdr/gdr_worker.cc
new file mode 100644
index 00000000000..0bff0aff6d3
--- /dev/null
+++ b/tensorflow/contrib/gdr/gdr_worker.cc
@@ -0,0 +1,146 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/gdr/gdr_worker.h"
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#if GOOGLE_CUDA
+#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
+#endif  // GOOGLE_CUDA
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/common_runtime/step_stats_collector.h"
+#include "tensorflow/core/distributed_runtime/graph_mgr.h"
+#include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_call.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+#include "tensorflow/core/distributed_runtime/worker.h"
+#include "tensorflow/core/distributed_runtime/worker_cache.h"
+#include "tensorflow/core/distributed_runtime/worker_session.h"
+#include "tensorflow/core/framework/cancellation.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/tracing.h"
+
+namespace tensorflow {
+
+GdrWorker::GdrWorker(WorkerEnv* worker_env,
+                     RemoteMemoryManager* remote_memory_manager)  // Not owned.
+    : GrpcWorker(worker_env), remote_memory_manager_(remote_memory_manager) {}
+
+void GdrWorker::GrpcRecvTensorAsync(CallOptions* opts,
+                                    const RecvTensorRequest* request,
+                                    ::grpc::ByteBuffer* response,
+                                    StatusCallback done) {
+  const int64 step_id = request->step_id();
+  const string& key = request->rendezvous_key();
+  TRACEPRINTF("RecvTensor: %lld %s", step_id, key.c_str());
+  Rendezvous::ParsedKey parsed;
+  Status s = Rendezvous::ParseKey(key, &parsed);
+  Device* src_dev = nullptr;
+  if (s.ok()) {
+    s = PrepareRecvTensor(parsed, &src_dev);
+  }
+  if (!s.ok()) {  // Key parsing or preparation failed: report and stop.
+    done(s);
+    return;
+  }
+
+  // Request the tensor associated with the rendezvous key. Any time
+  // while waiting for the tensor to be produced, up until the start
+  // of execution of the callback lambda body below, an RPC
+  // cancellation should abort the rendezvous.
+  opts->SetCancelCallback([this, step_id]() { AbortStep(step_id); });
+  const bool dma_ok = request->dma_ok();  // Set by the requesting side.
+  env_->rendezvous_mgr->RecvLocalAsync(
+      step_id, parsed,
+      [this, opts, response, done, src_dev, dma_ok](
+          const Status& status, const Rendezvous::Args& send_args,
+          const Rendezvous::Args&, const Tensor& val, const bool is_dead) {
+        opts->ClearCancelCallback();
+        if (status.ok()) {
+          // The out-of-band (DMA) path is taken only when the requester set
+          // dma_ok, DMAHelper::CanUseDMA() accepts the tensor, and the tensor
+          // is neither zero-sized nor dead (a dead tensor has an uninit
+          // value). `on_host` is true when the source device has no GPU
+          // device info or the tensor has the on_host allocation attribute,
+          // i.e. it's in CPU RAM *independent of its assigned device type*.
+          const bool on_host =
+              (src_dev->tensorflow_gpu_device_info() == nullptr) ||
+              send_args.alloc_attrs.on_host();
+          if (val.TotalBytes() > 0 && (!is_dead) &&
+              DMAHelper::CanUseDMA(&val) && dma_ok) {
+            // DMA case: reply with transport options plus dtype/shape only.
+            RecvTensorResponse proto;
+            auto transport_options = proto.mutable_transport_options();
+            Status s = remote_memory_manager_->TransportOptionsFromTensor(
+                transport_options, val, src_dev, send_args.device_context,
+                on_host);
+            if (s.ok()) {
+              proto.set_is_dead(is_dead);
+              proto.set_send_start_micros(Env::Default()->NowMicros());
+              TensorProto* tensor_proto = proto.mutable_tensor();
+              tensor_proto->set_dtype(val.dtype());
+              val.shape().AsProto(tensor_proto->mutable_tensor_shape());
+              grpc::EncodeRecvTensorResponseToByteBuffer(proto, response);
+              done(Status::OK());
+              return;
+            } else {
+              done(s);
+              return;
+            }
+          } else {
+            // Non-DMA cases: the tensor content is encoded into `response`.
+            if (src_dev->tensorflow_gpu_device_info() && (!on_host)) {
+#if GOOGLE_CUDA
+              const DeviceContext* send_dev_context = send_args.device_context;
+              AllocatorAttributes alloc_attrs;
+              alloc_attrs.set_gpu_compatible(true);
+              alloc_attrs.set_on_host(true);
+              Allocator* alloc = src_dev->GetAllocator(alloc_attrs);
+              Tensor* copy = new Tensor(alloc, val.dtype(), val.shape());
+              CHECK(send_dev_context)
+                  << "send dev name: " << src_dev->name()
+                  << " gpu_info: " << src_dev->tensorflow_gpu_device_info();
+              // "val" is on a GPU. Uses GPUUtil to fill the response proto.
+              StatusCallback copy_ready = [response, done, copy,
+                                           is_dead](const Status& s) {
+                // The value is now ready to be returned on the wire.
+                grpc::EncodeTensorToByteBuffer(is_dead, *copy, response);
+                done(s);
+                delete copy;
+              };
+
+              GPUUtil::CopyGPUTensorToCPU(src_dev, send_dev_context, &val, copy,
+                                          copy_ready);
+#else
+              done(errors::Internal("No GPU device in process"));
+#endif  // GOOGLE_CUDA
+            } else {
+              grpc::EncodeTensorToByteBuffer(is_dead, val, response);
+              done(Status::OK());
+            }
+          }
+        } else {
+          // !status.ok(): forward the rendezvous error to the caller.
+          done(status);
+        }
+      });
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/gdr/gdr_worker.h b/tensorflow/contrib/gdr/gdr_worker.h
new file mode 100644
index 00000000000..a30b7baaedc
--- /dev/null
+++ b/tensorflow/contrib/gdr/gdr_worker.h
@@ -0,0 +1,45 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef GDR_WORKER_H_
+#define GDR_WORKER_H_
+
+#include "tensorflow/contrib/gdr/gdr_memory_manager.h"
+
+#include "tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h"
+
+namespace tensorflow {
+
+// GrpcWorker whose RecvTensor handler can hand tensors to a
+// RemoteMemoryManager instead of serializing them into the response.
+class GdrWorker : public GrpcWorker {
+ public:
+  GdrWorker(WorkerEnv* env, RemoteMemoryManager* remote_memory_manager);
+
+  // Answers a RecvTensorRequest. When out-of-band transfer is possible the
+  // response carries only the transport options and tensor metadata needed
+  // for RDMA; otherwise the tensor content itself is encoded into the
+  // grpc::ByteBuffer (the regular in-band gRPC path).
+  void GrpcRecvTensorAsync(CallOptions* opts, const RecvTensorRequest* request,
+                           ::grpc::ByteBuffer* response,
+                           StatusCallback done) override;
+
+ private:
+  RemoteMemoryManager* remote_memory_manager_;  // Not owned
+};
+
+}  // namespace tensorflow
+
+#endif  // GDR_WORKER_H_
diff --git a/tensorflow/contrib/keras/python/keras/backend.py b/tensorflow/contrib/keras/python/keras/backend.py
index 4fa4ec0dd49..6d7429d20d0 100644
--- a/tensorflow/contrib/keras/python/keras/backend.py
+++ b/tensorflow/contrib/keras/python/keras/backend.py
@@ -3570,7 +3570,7 @@ def local_conv1d(inputs, kernel, kernel_size, strides, data_format=None):
 
   Returns:
       the tensor after 1d conv with un-shared weights, with shape (batch_size,
-      output_lenght, filters)
+      output_length, filters)
 
   Raises:
       ValueError: if `data_format` is neither `channels_last` or
diff --git a/tensorflow/contrib/keras/python/keras/utils/generic_utils.py b/tensorflow/contrib/keras/python/keras/utils/generic_utils.py
index ed57144f9c8..3428476b173 100644
--- a/tensorflow/contrib/keras/python/keras/utils/generic_utils.py
+++ b/tensorflow/contrib/keras/python/keras/utils/generic_utils.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 import marshal
+import os
 import sys
 import time
 import types as python_types
@@ -195,7 +196,10 @@ def func_dump(func):
   Returns:
       A tuple `(code, defaults, closure)`.
   """
-  code = marshal.dumps(func.__code__).decode('raw_unicode_escape')
+  if os.name == 'nt':
+    code = marshal.dumps(func.__code__).replace(b'\\',b'/').decode('raw_unicode_escape')
+  else:
+    code = marshal.dumps(func.__code__).decode('raw_unicode_escape')
   defaults = func.__defaults__
   if func.__closure__:
     closure = tuple(c.cell_contents for c in func.__closure__)
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index 0499b115542..09def361495 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -1944,7 +1944,7 @@ def gdn(inputs,
   spatial dimensions. It is similar to local response normalization, but much
   more flexible, as `beta` and `gamma` are trainable parameters.
 
-  Arguments:
+  Args:
     inputs: Tensor input.
     inverse: If `False` (default), compute GDN response. If `True`, compute IGDN
       response (one step of fixed point iteration to invert GDN; the division
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
index 22a6fa317bc..e2e2988cf29 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
@@ -66,11 +66,11 @@ from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import builder as saved_model_builder
 from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.summary import summary as core_summary
 from tensorflow.python.training import basic_session_run_hooks
 from tensorflow.python.training import device_setter
 from tensorflow.python.training import monitored_session
 from tensorflow.python.training import saver
-from tensorflow.python.training import summary_io
 from tensorflow.python.training import training_util
 from tensorflow.python.util import compat
 from tensorflow.python.util import tf_decorator
@@ -337,7 +337,7 @@ def _write_dict_to_summary(output_dir, dictionary, current_global_step):
   """
   logging.info('Saving dict for global step %d: %s', current_global_step,
                _dict_to_str(dictionary))
-  summary_writer = summary_io.SummaryWriterCache.get(output_dir)
+  summary_writer = core_summary.FileWriterCache.get(output_dir)
   summary_proto = summary_pb2.Summary()
   for key in dictionary:
     if dictionary[key] is None:
@@ -1034,7 +1034,7 @@ class BaseEstimator(
         loss = None
         while not mon_sess.should_stop():
           _, loss = mon_sess.run([model_fn_ops.train_op, model_fn_ops.loss])
-      summary_io.SummaryWriterCache.clear()
+      core_summary.FileWriterCache.clear()
       return loss
 
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
index fe712bdf7b2..be2b0cb3ca9 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
@@ -506,7 +506,7 @@ class EstimatorModelFnTest(test.TestCase):
       return input_fn_utils.InputFnOps(
           features, labels, {'examples': serialized_tf_example})
 
-    est.export_savedmodel(est.model_dir + '/export', serving_input_fn)
+    est.export_savedmodel(os.path.join(est.model_dir, 'export'), serving_input_fn)
     self.assertTrue(self.mock_saver.restore.called)
 
 
@@ -988,10 +988,11 @@ class EstimatorTest(test.TestCase):
         self.assertTrue('input_example_tensor' in graph_ops)
         self.assertTrue('ParseExample/ParseExample' in graph_ops)
         self.assertTrue('linear/linear/feature/matmul' in graph_ops)
-        self.assertSameElements(
-            ['bogus_lookup', 'feature'],
-            graph.get_collection(
-                constants.COLLECTION_DEF_KEY_FOR_INPUT_FEATURE_KEYS))
+        self.assertItemsEqual(
+          ['bogus_lookup', 'feature'],
+          [compat.as_str_any(x) for x in graph.get_collection(
+            constants.COLLECTION_DEF_KEY_FOR_INPUT_FEATURE_KEYS)])
+
 
     # cleanup
     gfile.DeleteRecursively(tmpdir)
diff --git a/tensorflow/contrib/learn/python/learn/monitors.py b/tensorflow/contrib/learn/python/learn/monitors.py
index d9d22485eea..3051f4048fa 100644
--- a/tensorflow/contrib/learn/python/learn/monitors.py
+++ b/tensorflow/contrib/learn/python/learn/monitors.py
@@ -44,15 +44,16 @@ import six
 
 from tensorflow.contrib.framework import deprecated
 from tensorflow.contrib.framework.python.ops import variables as contrib_variables
-from tensorflow.contrib.learn.python.learn import session_run_hook
 from tensorflow.contrib.learn.python.learn.summary_writer_cache import SummaryWriterCache
 from tensorflow.core.framework.summary_pb2 import Summary
 from tensorflow.core.util.event_pb2 import SessionLog
 from tensorflow.python.estimator import estimator as core_estimator
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.summary import summary as core_summary
 from tensorflow.python.training import saver as saver_lib
-from tensorflow.python.training import summary_io
+from tensorflow.python.training import session_run_hook
+from tensorflow.python.training import training_util
 from tensorflow.python.util import deprecation
 from tensorflow.python.util import tf_inspect
 
@@ -521,7 +522,7 @@ class SummarySaver(EveryN):
     self._summary_op = summary_op
     self._summary_writer = summary_writer
     if summary_writer is None and output_dir:
-      self._summary_writer = summary_io.SummaryWriter(output_dir)
+      self._summary_writer = core_summary.FileWriter(output_dir)
     self._scaffold = scaffold
     # TODO(mdan): Throw an error if output_dir and summary_writer are None.
 
@@ -529,7 +530,7 @@ class SummarySaver(EveryN):
     super(SummarySaver, self).set_estimator(estimator)
     # TODO(mdan): This line looks redundant.
     if self._summary_writer is None:
-      self._summary_writer = summary_io.SummaryWriter(estimator.model_dir)
+      self._summary_writer = core_summary.FileWriter(estimator.model_dir)
 
   def every_n_step_begin(self, step):
     super(SummarySaver, self).every_n_step_begin(step)
@@ -1029,7 +1030,7 @@ class CheckpointSaver(BaseMonitor):
     logging.info("Create CheckpointSaver.")
     super(CheckpointSaver, self).__init__()
     self._saver = saver
-    self._summary_writer = SummaryWriterCache.get(checkpoint_dir)
+    self._summary_writer = core_summary.FileWriterCache.get(checkpoint_dir)
     self._save_path = os.path.join(checkpoint_dir, checkpoint_basename)
     self._scaffold = scaffold
     self._save_secs = save_secs
@@ -1098,12 +1099,12 @@ class StepCounter(EveryN):
     self._last_reported_time = None
     self._summary_writer = summary_writer
     if summary_writer is None and output_dir:
-      self._summary_writer = SummaryWriterCache.get(output_dir)
+      self._summary_writer = core_summary.FileWriterCache.get(output_dir)
 
   def set_estimator(self, estimator):
     super(StepCounter, self).set_estimator(estimator)
     if self._summary_writer is None:
-      self._summary_writer = SummaryWriterCache.get(estimator.model_dir)
+      self._summary_writer = core_summary.FileWriterCache.get(estimator.model_dir)
 
   def every_n_step_end(self, current_step, outputs):
     current_time = time.time()
@@ -1169,7 +1170,7 @@ class RunHookAdapterForMonitors(session_run_hook.SessionRunHook):
 
   def begin(self):
     self._last_step = None
-    self._global_step_tensor = contrib_variables.get_global_step()
+    self._global_step_tensor = training_util.get_global_step()
     for m in self._monitors:
       m.begin(max_steps=None)
 
diff --git a/tensorflow/contrib/learn/python/learn/monitors_test.py b/tensorflow/contrib/learn/python/learn/monitors_test.py
index e8fe6026b71..b2b24776c60 100644
--- a/tensorflow/contrib/learn/python/learn/monitors_test.py
+++ b/tensorflow/contrib/learn/python/learn/monitors_test.py
@@ -27,7 +27,6 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.contrib import testing
 from tensorflow.contrib.framework.python.framework import checkpoint_utils
-from tensorflow.contrib.framework.python.ops import variables as variables_lib
 from tensorflow.contrib.learn.python import learn
 from tensorflow.contrib.learn.python.learn import estimators
 from tensorflow.python.client import session as session_lib
@@ -43,6 +42,7 @@ from tensorflow.python.summary import summary
 from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import monitored_session
 from tensorflow.python.training import saver
+from tensorflow.python.training import training_util
 
 
 class _MyEveryN(learn.monitors.EveryN):
@@ -616,7 +616,7 @@ class CheckpointSaverTest(test.TestCase):
     self.graph = ops.Graph()
     with self.graph.as_default():
       self.scaffold = monitored_session.Scaffold()
-      self.global_step = variables_lib.get_or_create_global_step()
+      self.global_step = training_util.get_or_create_global_step()
       self.train_op = state_ops.assign_add(self.global_step, 1)
 
   def tearDown(self):
@@ -780,7 +780,7 @@ class RunHookAdapterForMonitorsTest(test.TestCase):
 
   def test_calls_and_steps(self):
     with ops.Graph().as_default(), session_lib.Session() as sess:
-      global_step_tensor = variables_lib.create_global_step()
+      global_step_tensor = training_util.create_global_step()
       inc_5 = state_ops.assign_add(global_step_tensor, 5)
       mock_mon = FakeMonitor()
       mock_mon2 = FakeMonitor()
@@ -821,7 +821,7 @@ class RunHookAdapterForMonitorsTest(test.TestCase):
 
   def test_requests(self):
     with ops.Graph().as_default(), session_lib.Session() as sess:
-      variables_lib.create_global_step()
+      training_util.create_global_step()
       mock_mon = FakeMonitor()
       mock_mon2 = FakeMonitor()
 
diff --git a/tensorflow/contrib/learn/python/learn/utils/export_test.py b/tensorflow/contrib/learn/python/learn/utils/export_test.py
index ce1d73256a6..95070ada3b9 100644
--- a/tensorflow/contrib/learn/python/learn/utils/export_test.py
+++ b/tensorflow/contrib/learn/python/learn/utils/export_test.py
@@ -31,6 +31,7 @@ from tensorflow.contrib.session_bundle import exporter
 from tensorflow.contrib.session_bundle import manifest_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import gfile
@@ -49,9 +50,8 @@ def _training_input_fn():
 
 
 class ExportTest(test.TestCase):
-
   def _get_default_signature(self, export_meta_filename):
-    """Gets the default signature from the export.meta file."""
+    """ Gets the default signature from the export.meta file. """
     with session.Session():
       save = saver.import_meta_graph(export_meta_filename)
       meta_graph_def = save.export_meta_graph()
@@ -68,18 +68,19 @@ class ExportTest(test.TestCase):
     self.assertTrue(gfile.Exists(export_dir))
     # Only the written checkpoints are exported.
     self.assertTrue(
-        saver.checkpoint_exists(export_dir + '00000001/export'),
+        saver.checkpoint_exists(os.path.join(export_dir, '00000001', 'export')),
         'Exported checkpoint expected but not found: %s' %
-        (export_dir + '00000001/export'))
+        os.path.join(export_dir, '00000001', 'export'))
     self.assertTrue(
-        saver.checkpoint_exists(export_dir + '00000010/export'),
+        saver.checkpoint_exists(os.path.join(export_dir, '00000010', 'export')),
         'Exported checkpoint expected but not found: %s' %
-        (export_dir + '00000010/export'))
+        os.path.join(export_dir, '00000010', 'export'))
     self.assertEquals(
         six.b(os.path.join(export_dir, '00000010')),
         export_monitor.last_export_dir)
     # Validate the signature
-    signature = self._get_default_signature(export_dir + '00000010/export.meta')
+    signature = self._get_default_signature(
+      os.path.join(export_dir, '00000010', 'export.meta'))
     self.assertTrue(signature.HasField(expected_signature))
 
   def testExportMonitor_EstimatorProvidesSignature(self):
@@ -88,7 +89,7 @@ class ExportTest(test.TestCase):
     y = 2 * x + 3
     cont_features = [feature_column.real_valued_column('', dimension=1)]
     regressor = learn.LinearRegressor(feature_columns=cont_features)
-    export_dir = tempfile.mkdtemp() + 'export/'
+    export_dir = os.path.join(tempfile.mkdtemp(), 'export')
     export_monitor = learn.monitors.ExportMonitor(
         every_n_steps=1, export_dir=export_dir, exports_to_keep=2)
     regressor.fit(x, y, steps=10, monitors=[export_monitor])
@@ -99,7 +100,7 @@ class ExportTest(test.TestCase):
     x = np.random.rand(1000)
     y = 2 * x + 3
     cont_features = [feature_column.real_valued_column('', dimension=1)]
-    export_dir = tempfile.mkdtemp() + 'export/'
+    export_dir = os.path.join(tempfile.mkdtemp(), 'export')
     export_monitor = learn.monitors.ExportMonitor(
         every_n_steps=1,
         export_dir=export_dir,
@@ -122,7 +123,7 @@ class ExportTest(test.TestCase):
     input_feature_key = 'my_example_key'
     monitor = learn.monitors.ExportMonitor(
         every_n_steps=1,
-        export_dir=tempfile.mkdtemp() + 'export/',
+        export_dir=os.path.join(tempfile.mkdtemp(), 'export'),
         input_fn=_serving_input_fn,
         input_feature_key=input_feature_key,
         exports_to_keep=2,
@@ -140,7 +141,7 @@ class ExportTest(test.TestCase):
 
     monitor = learn.monitors.ExportMonitor(
         every_n_steps=1,
-        export_dir=tempfile.mkdtemp() + 'export/',
+        export_dir=os.path.join(tempfile.mkdtemp(), 'export'),
         input_fn=_serving_input_fn,
         input_feature_key=input_feature_key,
         exports_to_keep=2,
@@ -165,7 +166,7 @@ class ExportTest(test.TestCase):
 
     monitor = learn.monitors.ExportMonitor(
         every_n_steps=1,
-        export_dir=tempfile.mkdtemp() + 'export/',
+        export_dir=os.path.join(tempfile.mkdtemp(), 'export'),
         input_fn=_serving_input_fn,
         input_feature_key=input_feature_key,
         exports_to_keep=2,
@@ -187,7 +188,7 @@ class ExportTest(test.TestCase):
 
     monitor = learn.monitors.ExportMonitor(
         every_n_steps=1,
-        export_dir=tempfile.mkdtemp() + 'export/',
+        export_dir=os.path.join(tempfile.mkdtemp(), 'export'),
         input_fn=_serving_input_fn,
         input_feature_key=input_feature_key,
         exports_to_keep=2,
@@ -210,7 +211,7 @@ class ExportTest(test.TestCase):
                   shape=(1,), minval=0.0, maxval=1000.0)
       }, None
 
-    export_dir = tempfile.mkdtemp() + 'export/'
+    export_dir = os.path.join(tempfile.mkdtemp(), 'export')
     monitor = learn.monitors.ExportMonitor(
         every_n_steps=1,
         export_dir=export_dir,
@@ -235,7 +236,7 @@ class ExportTest(test.TestCase):
     y = 2 * x + 3
     cont_features = [feature_column.real_valued_column('', dimension=1)]
     regressor = learn.LinearRegressor(feature_columns=cont_features)
-    export_dir = tempfile.mkdtemp() + 'export/'
+    export_dir = os.path.join(tempfile.mkdtemp(), 'export')
     export_monitor = learn.monitors.ExportMonitor(
         every_n_steps=1,
         export_dir=export_dir,
@@ -244,10 +245,13 @@ class ExportTest(test.TestCase):
     regressor.fit(x, y, steps=10, monitors=[export_monitor])
 
     self.assertTrue(gfile.Exists(export_dir))
-    self.assertFalse(saver.checkpoint_exists(export_dir + '00000000/export'))
-    self.assertTrue(saver.checkpoint_exists(export_dir + '00000010/export'))
+    with self.assertRaises(errors.NotFoundError):
+      saver.checkpoint_exists(os.path.join(export_dir, '00000000', 'export'))
+    self.assertTrue(
+      saver.checkpoint_exists(os.path.join(export_dir, '00000010', 'export')))
     # Validate the signature
-    signature = self._get_default_signature(export_dir + '00000010/export.meta')
+    signature = self._get_default_signature(
+      os.path.join(export_dir, '00000010', 'export.meta'))
     self.assertTrue(signature.HasField('regression_signature'))
 
 
diff --git a/tensorflow/contrib/learn/python/learn/utils/gc_test.py b/tensorflow/contrib/learn/python/learn/utils/gc_test.py
index 0c1a1f43279..76cfd88e1d6 100644
--- a/tensorflow/contrib/learn/python/learn/utils/gc_test.py
+++ b/tensorflow/contrib/learn/python/learn/utils/gc_test.py
@@ -33,8 +33,13 @@ from tensorflow.python.util import compat
 def _create_parser(base_dir):
   # create a simple parser that pulls the export_version from the directory.
   def parser(path):
-    match = re.match("^" + compat.as_str_any(base_dir) + "/(\\d+)$",
-                     compat.as_str_any(path.path))
+    # Modify the path object for RegEx match for Windows Paths
+    if os.name == 'nt':
+      match = re.match("^" + compat.as_str_any(base_dir).replace('\\','/') + "/(\\d+)$",
+                      compat.as_str_any(path.path).replace('\\','/'))
+    else:
+      match = re.match("^" + compat.as_str_any(base_dir) + "/(\\d+)$",
+                      compat.as_str_any(path.path))
     if not match:
       return None
     return path._replace(export_version=int(match.group(1)))
@@ -48,13 +53,13 @@ class GcTest(test_util.TensorFlowTestCase):
     paths = [gc.Path("/foo", 8), gc.Path("/foo", 9), gc.Path("/foo", 10)]
     newest = gc.largest_export_versions(2)
     n = newest(paths)
-    self.assertEquals(n, [gc.Path("/foo", 9), gc.Path("/foo", 10)])
+    self.assertEqual(n, [gc.Path("/foo", 9), gc.Path("/foo", 10)])
 
   def testLargestExportVersionsDoesNotDeleteZeroFolder(self):
     paths = [gc.Path("/foo", 0), gc.Path("/foo", 3)]
     newest = gc.largest_export_versions(2)
     n = newest(paths)
-    self.assertEquals(n, [gc.Path("/foo", 0), gc.Path("/foo", 3)])
+    self.assertEqual(n, [gc.Path("/foo", 0), gc.Path("/foo", 3)])
 
   def testModExportVersion(self):
     paths = [
@@ -62,9 +67,9 @@ class GcTest(test_util.TensorFlowTestCase):
         gc.Path("/foo", 9)
     ]
     mod = gc.mod_export_version(2)
-    self.assertEquals(mod(paths), [gc.Path("/foo", 4), gc.Path("/foo", 6)])
+    self.assertEqual(mod(paths), [gc.Path("/foo", 4), gc.Path("/foo", 6)])
     mod = gc.mod_export_version(3)
-    self.assertEquals(mod(paths), [gc.Path("/foo", 6), gc.Path("/foo", 9)])
+    self.assertEqual(mod(paths), [gc.Path("/foo", 6), gc.Path("/foo", 9)])
 
   def testOneOfEveryNExportVersions(self):
     paths = [
@@ -73,7 +78,7 @@ class GcTest(test_util.TensorFlowTestCase):
         gc.Path("/foo", 8), gc.Path("/foo", 33)
     ]
     one_of = gc.one_of_every_n_export_versions(3)
-    self.assertEquals(
+    self.assertEqual(
         one_of(paths), [
             gc.Path("/foo", 3), gc.Path("/foo", 6), gc.Path("/foo", 8),
             gc.Path("/foo", 33)
@@ -84,14 +89,14 @@ class GcTest(test_util.TensorFlowTestCase):
     # Test that here.
     paths = [gc.Path("/foo", 0), gc.Path("/foo", 4), gc.Path("/foo", 5)]
     one_of = gc.one_of_every_n_export_versions(3)
-    self.assertEquals(one_of(paths), [gc.Path("/foo", 0), gc.Path("/foo", 5)])
+    self.assertEqual(one_of(paths), [gc.Path("/foo", 0), gc.Path("/foo", 5)])
 
   def testUnion(self):
     paths = []
     for i in xrange(10):
       paths.append(gc.Path("/foo", i))
     f = gc.union(gc.largest_export_versions(3), gc.mod_export_version(3))
-    self.assertEquals(
+    self.assertEqual(
         f(paths), [
             gc.Path("/foo", 0), gc.Path("/foo", 3), gc.Path("/foo", 6),
             gc.Path("/foo", 7), gc.Path("/foo", 8), gc.Path("/foo", 9)
@@ -103,9 +108,9 @@ class GcTest(test_util.TensorFlowTestCase):
         gc.Path("/foo", 9)
     ]
     mod = gc.negation(gc.mod_export_version(2))
-    self.assertEquals(mod(paths), [gc.Path("/foo", 5), gc.Path("/foo", 9)])
+    self.assertEqual(mod(paths), [gc.Path("/foo", 5), gc.Path("/foo", 9)])
     mod = gc.negation(gc.mod_export_version(3))
-    self.assertEquals(mod(paths), [gc.Path("/foo", 4), gc.Path("/foo", 5)])
+    self.assertEqual(mod(paths), [gc.Path("/foo", 4), gc.Path("/foo", 5)])
 
   def testPathsWithParse(self):
     base_dir = os.path.join(test.get_temp_dir(), "paths_parse")
@@ -115,7 +120,7 @@ class GcTest(test_util.TensorFlowTestCase):
     # add a base_directory to ignore
     gfile.MakeDirs(os.path.join(base_dir, "ignore"))
 
-    self.assertEquals(
+    self.assertEqual(
         gc.get_paths(base_dir, _create_parser(base_dir)),
         [
             gc.Path(os.path.join(base_dir, "0"), 0),
diff --git a/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc b/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc
index 23a682fb905..3b88535dce4 100644
--- a/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc
+++ b/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc
@@ -57,6 +57,11 @@ REGISTER_KERNEL_BUILDER(Name("BytesLimit").Device(DEVICE_CPU), BytesLimitOp);
 REGISTER_KERNEL_BUILDER(Name("BytesLimit").Device(DEVICE_GPU).HostMemory("out"),
                         BytesLimitOp);
 
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("BytesLimit").Device(DEVICE_SYCL).HostMemory("out"),
+                        BytesLimitOp);
+#endif // TENSORFLOW_USE_SYCL
+
 // Op that measures the peak memory in bytes.
 class MaxBytesInUseOp : public MemoryStatsOp {
  public:
@@ -76,4 +81,10 @@ REGISTER_KERNEL_BUILDER(
     Name("MaxBytesInUse").Device(DEVICE_GPU).HostMemory("out"),
     MaxBytesInUseOp);
 
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(
+    Name("MaxBytesInUse").Device(DEVICE_SYCL).HostMemory("out"),
+    MaxBytesInUseOp);
+#endif // TENSORFLOW_USE_SYCL
+
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/mpi/mpi_server_lib.cc b/tensorflow/contrib/mpi/mpi_server_lib.cc
index 3b2fba97a99..d585c0565eb 100644
--- a/tensorflow/contrib/mpi/mpi_server_lib.cc
+++ b/tensorflow/contrib/mpi/mpi_server_lib.cc
@@ -20,6 +20,8 @@ limitations under the License.
 #include <string>
 #include <utility>
 
+#include "grpc/support/alloc.h"
+
 #include "tensorflow/core/distributed_runtime/server_lib.h"
 #include "tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h"
 #include "tensorflow/core/lib/core/status.h"
diff --git a/tensorflow/contrib/mpi/mpi_utils.cc b/tensorflow/contrib/mpi/mpi_utils.cc
index b8e7d1a274f..8184b856264 100644
--- a/tensorflow/contrib/mpi/mpi_utils.cc
+++ b/tensorflow/contrib/mpi/mpi_utils.cc
@@ -61,7 +61,7 @@ void MPIUtils::InitMPI() {
     MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &number_of_procs));
     MPI_CHECK(MPI_Get_processor_name(my_host_name, &len));
     fprintf(stderr,
-            "MPI Environment initialised. Process id: %d Total processes: %d "
+            "MPI Environment initialized. Process id: %d Total processes: %d "
             "|| Hostname: %s \n",
             proc_id, number_of_procs, my_host_name);
   }
diff --git a/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py b/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py
index 130cb4ca12c..ae658e73227 100644
--- a/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py
+++ b/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py
@@ -43,7 +43,7 @@ class AllReduceTest(test.TestCase):
         self._testSingleAllReduce(sess, dtype, nccl.all_max, np.maximum)
 
   def _testSingleAllReduce(self, sess, np_type, nccl_fn, numpy_accumulation_fn):
-    for devices in [['/gpu:0', '/gpu:0', '/gpu:0'], ['/gpu:0', '/gpu:0']]:
+    for devices in [['/device:GPU:0', '/device:GPU:0', '/device:GPU:0'], ['/device:GPU:0', '/device:GPU:0']]:
       shape = (3, 4)
       np_ans = None
       tensors = []
@@ -84,7 +84,7 @@ class BroadcastTest(test.TestCase):
       # Create session inside outer loop to test use of
       # same communicator across multiple sessions.
       with self.test_session(use_gpu=True) as sess:
-        for devices in [['/gpu:0', '/gpu:0', '/gpu:0'], ['/gpu:0', '/gpu:0']]:
+        for devices in [['/device:GPU:0', '/device:GPU:0', '/device:GPU:0'], ['/device:GPU:0', '/device:GPU:0']]:
           shape = (3, 4)
           sender = np.random.randint(0, len(devices) - 1)
           with ops.device(devices[sender]):
@@ -115,7 +115,7 @@ class CombinedTest(test.TestCase):
       # Create session inside outer loop to test use of
       # same communicator across multiple sessions.
       with self.test_session(use_gpu=True) as sess:
-        for devices in [['/gpu:0', '/gpu:0', '/gpu:0'], ['/gpu:0', '/gpu:0']]:
+        for devices in [['/device:GPU:0', '/device:GPU:0', '/device:GPU:0'], ['/device:GPU:0', '/device:GPU:0']]:
           shape = (3, 4)
 
           # all-reduce
diff --git a/tensorflow/contrib/nn/BUILD b/tensorflow/contrib/nn/BUILD
index af33496e5d7..a5535e771b8 100644
--- a/tensorflow/contrib/nn/BUILD
+++ b/tensorflow/contrib/nn/BUILD
@@ -15,6 +15,7 @@ py_library(
         "__init__.py",
         "python/__init__.py",
         "python/ops/__init__.py",
+        "python/ops/alpha_dropout.py",
         "python/ops/cross_entropy.py",
         "python/ops/sampling_ops.py",
     ],
@@ -44,6 +45,23 @@ py_test(
     ],
 )
 
+py_test(
+    name = "alpha_dropout_test",
+    size = "small",
+    srcs = ["python/ops/alpha_dropout_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":nn_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:nn",
+        "//tensorflow/python:random_ops",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/nn/__init__.py b/tensorflow/contrib/nn/__init__.py
index ec832cbd490..2cfeaa955dd 100644
--- a/tensorflow/contrib/nn/__init__.py
+++ b/tensorflow/contrib/nn/__init__.py
@@ -14,6 +14,7 @@
 # ==============================================================================
 """Module for variants of ops in tf.nn.
 
+@@alpha_dropout
 @@deprecated_flipped_softmax_cross_entropy_with_logits
 @@deprecated_flipped_sparse_softmax_cross_entropy_with_logits
 @@deprecated_flipped_sigmoid_cross_entropy_with_logits
@@ -27,6 +28,7 @@ from __future__ import print_function
 # pylint: disable=unused-import,wildcard-import
 from tensorflow.contrib.nn.python.ops.cross_entropy import *
 from tensorflow.contrib.nn.python.ops.sampling_ops import *
+from tensorflow.contrib.nn.python.ops.alpha_dropout import *
 # pylint: enable=unused-import,wildcard-import
 
 from tensorflow.python.util.all_util import remove_undocumented
diff --git a/tensorflow/contrib/nn/python/ops/alpha_dropout.py b/tensorflow/contrib/nn/python/ops/alpha_dropout.py
new file mode 100644
index 00000000000..d7b61a58447
--- /dev/null
+++ b/tensorflow/contrib/nn/python/ops/alpha_dropout.py
@@ -0,0 +1,88 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numbers
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_impl
+
+
+def alpha_dropout(x, keep_prob, noise_shape=None, seed=None, name=None): # pylint: disable=invalid-name
+  """Computes alpha dropout.
+
+  Alpha Dropout is a dropout that maintains the self-normalizing property. For
+  an input with zero mean and unit standard deviation, the output of
+  Alpha Dropout maintains the original mean and standard deviation of the input.
+
+  See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
+
+  Args:
+    x: A tensor, or a value convertible to one.
+    keep_prob: A Python number or a scalar `Tensor` with the same type as `x`.
+      The probability that each element is kept.
+    noise_shape: A 1-D `Tensor` of type `int32`, representing the shape for
+      randomly generated keep/drop flags. Defaults to the shape of `x`.
+    seed: A Python integer. Used to create random seeds. See
+      @{tf.set_random_seed} for behavior.
+    name: A name for this operation (optional).
+
+  Returns:
+    A Tensor of the same shape as `x`. When `keep_prob` is statically known
+    to be 1, the converted `x` is returned unchanged.
+
+  Raises:
+    ValueError: If `keep_prob` is a Python number outside `(0, 1]`.
+  """
+  with ops.name_scope(name, "alpha_dropout", [x]) as name:
+    x = ops.convert_to_tensor(x, name="x")
+    if isinstance(keep_prob, numbers.Real) and not 0 < keep_prob <= 1.:
+      raise ValueError("keep_prob must be a scalar tensor or a float in the "
+                       "range (0, 1], got %g" % keep_prob)
+    keep_prob = ops.convert_to_tensor(keep_prob,
+                                      dtype=x.dtype,
+                                      name="keep_prob")
+    keep_prob.get_shape().assert_is_compatible_with(tensor_shape.scalar())
+
+    # Fast path: keep_prob is statically known to be 1, so nothing is dropped.
+    if tensor_util.constant_value(keep_prob) == 1:
+      return x
+
+    alpha = -1.7580993408473766  # Value that dropped elements are set to.
+
+    noise_shape = noise_shape if noise_shape is not None else array_ops.shape(x)
+    random_tensor = random_ops.random_uniform(noise_shape,
+                                              seed=seed,
+                                              dtype=x.dtype)
+    kept_idx = gen_math_ops.greater_equal(random_tensor, 1 - keep_prob)
+    kept_idx = math_ops.cast(kept_idx, x.dtype)
+    # Mask: kept elements pass through, dropped elements are set to alpha.
+    x = x * kept_idx + alpha * (1 - kept_idx)
+
+    # Affine transformation parameters (functions of keep_prob and alpha)
+    a = (keep_prob + keep_prob * (1 - keep_prob) * alpha ** 2) ** -0.5
+    b = -a * alpha * (1 - keep_prob)
+
+    # Affine transformation applied to the masked tensor
+    return a * x + b
diff --git a/tensorflow/contrib/nn/python/ops/alpha_dropout_test.py b/tensorflow/contrib/nn/python/ops/alpha_dropout_test.py
new file mode 100644
index 00000000000..2ff978ab897
--- /dev/null
+++ b/tensorflow/contrib/nn/python/ops/alpha_dropout_test.py
@@ -0,0 +1,88 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for alpha_dropout.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.nn.python.ops.alpha_dropout import alpha_dropout
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import nn_impl
+from tensorflow.python.platform import test
+
+
+class AlphaDropoutTest(test.TestCase):
+
+  def testAlphaDropout(self):
+    x_dim, y_dim = 40, 30
+    for keep_prob in [0.1, 0.5, 0.8]:
+      with self.test_session():
+        t = random_ops.random_normal([x_dim, y_dim])
+        output = alpha_dropout(t, keep_prob)
+        self.assertEqual([x_dim, y_dim], output.get_shape())
+        t_mean, t_std = nn_impl.moments(t, axes=[0, 1])
+        output_mean, output_std = nn_impl.moments(output, axes=[0, 1])
+        self.assertLess(abs(t_mean.eval() - output_mean.eval()), 0.1)
+        self.assertLess(abs(t_std.eval() - output_std.eval()), 0.1)
+
+  def testShapedDropoutShapeError(self):
+    # Noise shapes that cannot broadcast against `t` must raise ValueError.
+    x_dim = 40
+    y_dim = 30
+    keep_prob = 0.5
+    t = constant_op.constant(1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
+    with self.assertRaises(ValueError):
+      _ = alpha_dropout(t, keep_prob, noise_shape=[x_dim, y_dim + 10])
+    with self.assertRaises(ValueError):
+      _ = alpha_dropout(t, keep_prob, noise_shape=[x_dim, y_dim, 5])
+    with self.assertRaises(ValueError):
+      _ = alpha_dropout(t, keep_prob, noise_shape=[x_dim + 3])
+    with self.assertRaises(ValueError):
+      _ = alpha_dropout(t, keep_prob, noise_shape=[x_dim])
+
+    # Noise shapes that do broadcast against `t` must be accepted.
+    _ = alpha_dropout(t, keep_prob, noise_shape=[y_dim])
+    _ = alpha_dropout(t, keep_prob, noise_shape=[1, y_dim])
+    _ = alpha_dropout(t, keep_prob, noise_shape=[x_dim, 1])
+    _ = alpha_dropout(t, keep_prob, noise_shape=[1, 1])
+
+  def testInvalidKeepProb(self):
+    x_dim, y_dim = 40, 30
+    t = constant_op.constant(1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
+    with self.assertRaises(ValueError):
+      alpha_dropout(t, -1.0)
+    with self.assertRaises(ValueError):
+      alpha_dropout(t, 1.1)
+    with self.assertRaises(ValueError):
+      alpha_dropout(t, [0.0, 1.0])
+    with self.assertRaises(ValueError):
+      alpha_dropout(t, array_ops.placeholder(dtypes.float64))
+    with self.assertRaises(ValueError):
+      alpha_dropout(t, array_ops.placeholder(dtypes.float32, shape=[2]))
+
+  def testNoDropoutFast(self):
+    x = array_ops.zeros((5,))
+    for p in 1, constant_op.constant(1.0):
+      y = alpha_dropout(x, keep_prob=p)
+      self.assertTrue(x is y)  # The input tensor itself is returned.
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/rnn/__init__.py b/tensorflow/contrib/rnn/__init__.py
index d39c1f062aa..895f1c2fbdd 100644
--- a/tensorflow/contrib/rnn/__init__.py
+++ b/tensorflow/contrib/rnn/__init__.py
@@ -50,6 +50,10 @@ See @{$python/contrib.rnn} guide.
 @@UGRNNCell
 @@IntersectionRNNCell
 @@PhasedLSTMCell
+@@ConvLSTMCell
+@@Conv1DLSTMCell
+@@Conv2DLSTMCell
+@@Conv3DLSTMCell
 @@HighwayWrapper
 @@GLSTMCell
 
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
index c14463bdad2..a77097e0c3a 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
@@ -40,6 +40,7 @@ from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
+from tensorflow.python.framework import test_util
 
 
 # pylint: enable=protected-access
@@ -445,11 +446,12 @@ class RNNCellTest(test.TestCase):
       # Can't perform this test w/o a GPU
       return
 
+    gpu_dev = test.gpu_device_name()
     with self.test_session(use_gpu=True) as sess:
       with variable_scope.variable_scope(
           "root", initializer=init_ops.constant_initializer(0.5)):
         x = array_ops.zeros([1, 1, 3])
-        cell = rnn_cell_impl.DeviceWrapper(rnn_cell_impl.GRUCell(3), "/gpu:0")
+        cell = rnn_cell_impl.DeviceWrapper(rnn_cell_impl.GRUCell(3), gpu_dev)
         with ops.device("/cpu:0"):
           outputs, _ = rnn.dynamic_rnn(
               cell=cell, inputs=x, dtype=dtypes.float32)
@@ -461,7 +463,7 @@ class RNNCellTest(test.TestCase):
         _ = sess.run(outputs, options=opts, run_metadata=run_metadata)
 
       step_stats = run_metadata.step_stats
-      ix = 0 if "gpu" in step_stats.dev_stats[0].device else 1
+      ix = 0 if gpu_dev in step_stats.dev_stats[0].device else 1
       gpu_stats = step_stats.dev_stats[ix].node_stats
       cpu_stats = step_stats.dev_stats[1 - ix].node_stats
       self.assertFalse([s for s in cpu_stats if "gru_cell" in s.node_name])
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
index 701590a8feb..40a3fb2fb0b 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
@@ -42,7 +42,6 @@ from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
 from tensorflow.python.util import nest
-from tensorflow.python.framework import test_util
 
 class Plus1RNNCell(rnn_lib.RNNCell):
   """RNN Cell generating (output, new_state) = (input + 1, state + 1)."""
@@ -2208,11 +2207,11 @@ class TensorArrayOnCorrectDeviceTest(test.TestCase):
     if not test.is_gpu_available():
       return  # Test requires access to a GPU
 
+    gpu_dev = test.gpu_device_name()
     run_metadata = self._execute_rnn_on(
-        rnn_device="/cpu:0", cell_device=test_util.gpu_device_name())
+        rnn_device="/cpu:0", cell_device=gpu_dev)
     step_stats = run_metadata.step_stats
-    ix = 0 if (("gpu" in step_stats.dev_stats[0].device) or
-    ("sycl" in step_stats.dev_stats[0].device)) else 1
+    ix = 0 if (gpu_dev in step_stats.dev_stats[0].device) else 1
     gpu_stats = step_stats.dev_stats[ix].node_stats
     cpu_stats = step_stats.dev_stats[1 - ix].node_stats
 
@@ -2233,12 +2232,12 @@ class TensorArrayOnCorrectDeviceTest(test.TestCase):
     if not test.is_gpu_available():
       return  # Test requires access to a GPU
 
+    gpu_dev = test.gpu_device_name()
     run_metadata = self._execute_rnn_on(
         rnn_device="/cpu:0", cell_device="/cpu:0",
-        input_device=test_util.gpu_device_name())
+        input_device=gpu_dev)
     step_stats = run_metadata.step_stats
-    ix = 0 if (("gpu" in step_stats.dev_stats[0].device) or
-    ("sycl" in step_stats.dev_stats[0].device)) else 1
+    ix = 0 if (gpu_dev in step_stats.dev_stats[0].device) else 1
     gpu_stats = step_stats.dev_stats[ix].node_stats
     cpu_stats = step_stats.dev_stats[1 - ix].node_stats
 
@@ -2253,11 +2252,11 @@ class TensorArrayOnCorrectDeviceTest(test.TestCase):
     if not test.is_gpu_available():
       return  # Test requires access to a GPU
 
+    gpu_dev = test.gpu_device_name()
     run_metadata = self._execute_rnn_on(
-        input_device=test_util.gpu_device_name())
+        input_device=gpu_dev)
     step_stats = run_metadata.step_stats
-    ix = 0 if (("gpu" in step_stats.dev_stats[0].device) or
-    ("sycl" in step_stats.dev_stats[0].device)) else 1
+    ix = 0 if (gpu_dev in step_stats.dev_stats[0].device) else 1
     gpu_stats = step_stats.dev_stats[ix].node_stats
     cpu_stats = step_stats.dev_stats[1 - ix].node_stats
 
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/gru_ops_test.py b/tensorflow/contrib/rnn/python/kernel_tests/gru_ops_test.py
index baf17431f35..4239e32ab93 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/gru_ops_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/gru_ops_test.py
@@ -357,7 +357,7 @@ def training_gru_block_vs_gru_cell(batch_size,
   ops.reset_default_graph()
   with session.Session(graph=ops.Graph()) as sess:
     # Specify the device which is been used.
-    with ops.device("/cpu:0" if not use_gpu else "/gpu:0"):
+    with ops.device("/cpu:0" if not use_gpu else "/device:GPU:0"):
 
       # Random initializers.
       seed = 1994
@@ -429,7 +429,7 @@ def inference_gru_block_vs_gru_cell(batch_size,
   """Benchmark inference speed between GRUBlockCell vs GRUCell."""
   ops.reset_default_graph()
   with session.Session(graph=ops.Graph()) as sess:
-    with ops.device("/cpu:0" if not use_gpu else "/gpu:0"):
+    with ops.device("/cpu:0" if not use_gpu else "/device:GPU:0"):
 
       # Random initializers.
       seed = 1994
@@ -484,7 +484,7 @@ def single_bprop_step_gru_block_vs_gru_cell(batch_size,
   """Benchmark single bprop step speed between GRUBlockCell vs GRUCell."""
   ops.reset_default_graph()
   with session.Session(graph=ops.Graph()) as sess:
-    with ops.device("/cpu:0" if not use_gpu else "/gpu:0"):
+    with ops.device("/cpu:0" if not use_gpu else "/device:GPU:0"):
       initializer = init_ops.random_uniform_initializer(-1, 1, seed=1989)
       # Inputs
       x = vs.get_variable("x", [batch_size, input_size])
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
index fb91fe14f4e..ebd4564f120 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
@@ -875,6 +875,152 @@ class RNNCellTest(test.TestCase):
         self.assertAllClose(res[1].c, expected_state_c)
         self.assertAllClose(res[1].h, expected_state_h)
 
+  def testConv1DLSTMCell(self):
+    # Runs one Conv1DLSTMCell step, feeding `hidden[0]` and the input `x`.
+    with self.test_session() as sess:
+      shape = [2, 1]
+      filter_size = [3]
+      num_features = 1
+      expected_state_c = np.array(
+          [[[1.4375670191], [1.4375670191]],
+           [[2.7542609292], [2.7542609292]]],
+          dtype=np.float32)
+      expected_state_h = np.array(
+          [[[0.6529865603], [0.6529865603]],
+           [[0.8736877431], [0.8736877431]]],
+          dtype=np.float32)
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(1.0 / 2.0)):
+        x = array_ops.placeholder(dtypes.float32, [None, None, 1])
+        cell = contrib_rnn_cell.Conv1DLSTMCell(input_shape=shape,
+                                               kernel_shape=filter_size,
+                                               output_channels=num_features)
+        hidden = cell.zero_state(array_ops.shape(x)[0], dtypes.float32)
+        output, state = cell(x, hidden)
+
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([output, state], {
+            hidden[0].name:
+                np.array([[[1.], [1.]],
+                          [[2.], [2.]]]),
+            x.name:
+                np.array([[[1.], [1.]],
+                          [[2.], [2.]]]),
+        })
+        # This is a smoke test, making sure expected values are unchanged.
+        self.assertEqual(len(res), 2)
+        self.assertAllClose(res[0], res[1].h)
+        self.assertAllClose(res[1].c, expected_state_c)
+        self.assertAllClose(res[1].h, expected_state_h)
+
+  def testConv2DLSTMCell(self):
+    # Runs one Conv2DLSTMCell step, feeding `hidden[0]` and the input `x`.
+    with self.test_session() as sess:
+      shape = [2, 2, 1]
+      filter_size = [3, 3]
+      num_features = 1
+      expected_state_c = np.array(
+          [[[[1.4375670191], [1.4375670191]],
+            [[1.4375670191], [1.4375670191]]],
+           [[[2.7542609292], [2.7542609292]],
+            [[2.7542609292], [2.7542609292]]]],
+          dtype=np.float32)
+      expected_state_h = np.array(
+          [[[[0.6529865603], [0.6529865603]],
+            [[0.6529865603], [0.6529865603]]],
+           [[[0.8736877431], [0.8736877431]],
+            [[0.8736877431], [0.8736877431]]]],
+          dtype=np.float32)
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(1.0 / 4.0)):
+        x = array_ops.placeholder(dtypes.float32, [None, None, None, 1])
+        cell = contrib_rnn_cell.Conv2DLSTMCell(input_shape=shape,
+                                               kernel_shape=filter_size,
+                                               output_channels=num_features)
+        hidden = cell.zero_state(array_ops.shape(x)[0], dtypes.float32)
+        output, state = cell(x, hidden)
+
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([output, state], {
+            hidden[0].name:
+                np.array([[[[1.], [1.]],
+                           [[1.], [1.]]],
+                          [[[2.], [2.]],
+                           [[2.], [2.]]]]),
+            x.name:
+                np.array([[[[1.], [1.]],
+                           [[1.], [1.]]],
+                          [[[2.], [2.]],
+                           [[2.], [2.]]]]),
+        })
+        # This is a smoke test, making sure expected values are unchanged.
+        self.assertEqual(len(res), 2)
+        self.assertAllClose(res[0], res[1].h)
+        self.assertAllClose(res[1].c, expected_state_c)
+        self.assertAllClose(res[1].h, expected_state_h)
+
+  def testConv3DLSTMCell(self):
+    # Runs one Conv3DLSTMCell step, feeding `hidden[0]` and the input `x`.
+    with self.test_session() as sess:
+      shape = [2, 2, 2, 1]
+      filter_size = [3, 3, 3]
+      num_features = 1
+      expected_state_c = np.array(
+          [[[[[1.4375670191], [1.4375670191]],
+             [[1.4375670191], [1.4375670191]]],
+            [[[1.4375670191], [1.4375670191]],
+             [[1.4375670191], [1.4375670191]]]],
+           [[[[2.7542609292], [2.7542609292]],
+             [[2.7542609292], [2.7542609292]]],
+            [[[2.7542609292], [2.7542609292]],
+             [[2.7542609292], [2.7542609292]]]]],
+          dtype=np.float32)
+      expected_state_h = np.array(
+          [[[[[0.6529865603], [0.6529865603]],
+             [[0.6529865603], [0.6529865603]]],
+            [[[0.6529865603], [0.6529865603]],
+             [[0.6529865603], [0.6529865603]]]],
+           [[[[0.8736877431], [0.8736877431]],
+             [[0.8736877431], [0.8736877431]]],
+            [[[0.8736877431], [0.8736877431]],
+             [[0.8736877431], [0.8736877431]]]]],
+          dtype=np.float32)
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(1.0 / 8.0)):
+        x = array_ops.placeholder(dtypes.float32, [None, None, None, None, 1])
+        cell = contrib_rnn_cell.Conv3DLSTMCell(input_shape=shape,
+                                               kernel_shape=filter_size,
+                                               output_channels=num_features)
+        hidden = cell.zero_state(array_ops.shape(x)[0], dtypes.float32)
+        output, state = cell(x, hidden)
+
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([output, state], {
+            hidden[0].name:
+                np.array([[[[[1.], [1.]],
+                            [[1.], [1.]]],
+                           [[[1.], [1.]],
+                            [[1.], [1.]]]],
+                          [[[[2.], [2.]],
+                            [[2.], [2.]]],
+                           [[[2.], [2.]],
+                            [[2.], [2.]]]]]),
+            x.name:
+                np.array([[[[[1.], [1.]],
+                            [[1.], [1.]]],
+                           [[[1.], [1.]],
+                            [[1.], [1.]]]],
+                          [[[[2.], [2.]],
+                            [[2.], [2.]]],
+                           [[[2.], [2.]],
+                            [[2.], [2.]]]]]),
+        })
+        # This is a smoke test, making sure expected values are unchanged.
+        self.assertEqual(len(res), 2)
+        self.assertAllClose(res[0], res[1].h)
+        self.assertAllClose(res[1].c, expected_state_c)
+        self.assertAllClose(res[1].h, expected_state_h)
+
   def testHighwayWrapper(self):
     with self.test_session() as sess:
       with variable_scope.variable_scope(
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index 090d28a078d..7b28222257f 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -26,6 +26,7 @@ from tensorflow.contrib.layers.python.layers import layers
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import op_def_registry
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import init_ops
@@ -1921,6 +1922,181 @@ class PhasedLSTMCell(rnn_cell_impl.RNNCell):
 
     return new_h, new_state
 
+class ConvLSTMCell(rnn_cell_impl.RNNCell):
+  """Convolutional LSTM recurrent network cell.
+
+  https://arxiv.org/pdf/1506.04214v1.pdf
+  """
+
+  def __init__(self,
+               conv_ndims,
+               input_shape,
+               output_channels,
+               kernel_shape,
+               use_bias=True,
+               skip_connection=False,
+               forget_bias=1.0,
+               initializers=None,
+               name="conv_lstm_cell"):
+    """Construct ConvLSTMCell.
+    Args:
+      conv_ndims: Convolution dimensionality (1, 2 or 3).
+      input_shape: Shape of the input as int list/tuple, excluding batch size.
+      output_channels: int, number of output channels of the conv LSTM.
+      kernel_shape: Shape of kernel as int list/tuple (of size 1, 2 or 3).
+      use_bias: Use bias in convolutions.
+      skip_connection: If set to `True`, concatenate the input to the
+        output of the conv LSTM. Default: `False`.
+      forget_bias: Forget bias.
+      initializers: Accepted but currently unused by this cell.
+      name: Name of the module.
+    Raises:
+      ValueError: If `input_shape` is incompatible with `conv_ndims`.
+    """
+    super(ConvLSTMCell, self).__init__(name=name)
+    if conv_ndims != len(input_shape) - 1:
+      raise ValueError("Invalid input_shape {} for conv_ndims={}.".format(
+          input_shape, conv_ndims))
+
+    self._conv_ndims = conv_ndims
+    # Copy to lists so the `+ [...]` concatenations also work for tuples.
+    self._input_shape = list(input_shape)
+    self._output_channels = output_channels
+    self._kernel_shape = list(kernel_shape)
+    self._use_bias = use_bias
+    self._forget_bias = forget_bias
+    self._skip_connection = skip_connection
+
+    self._total_output_channels = output_channels
+    if self._skip_connection:
+      self._total_output_channels += self._input_shape[-1]
+
+    spatial_dims = self._input_shape[:-1]
+    state_size = tensor_shape.TensorShape(spatial_dims + [output_channels])
+    self._state_size = rnn_cell_impl.LSTMStateTuple(state_size, state_size)
+    self._output_size = tensor_shape.TensorShape(
+        spatial_dims + [self._total_output_channels])
+
+  @property
+  def output_size(self):
+    return self._output_size
+
+  @property
+  def state_size(self):
+    return self._state_size
+
+  def call(self, inputs, state, scope=None):
+    """Runs one step; returns `(output, LSTMStateTuple(new_cell, output))`."""
+    cell, hidden = state
+    # A single convolution of [inputs, hidden] produces all four gates.
+    new_hidden = _conv([inputs, hidden], self._kernel_shape,
+                       4 * self._output_channels, self._use_bias)
+    # Split along the last (channel) axis, i.e. axis conv_ndims + 1.
+    gates = array_ops.split(
+        value=new_hidden, num_or_size_splits=4, axis=self._conv_ndims + 1)
+
+    input_gate, new_input, forget_gate, output_gate = gates
+    new_cell = math_ops.sigmoid(forget_gate + self._forget_bias) * cell
+    new_cell += math_ops.sigmoid(input_gate) * math_ops.tanh(new_input)
+    output = math_ops.tanh(new_cell) * math_ops.sigmoid(output_gate)
+
+    if self._skip_connection:
+      output = array_ops.concat([output, inputs], axis=-1)
+    new_state = rnn_cell_impl.LSTMStateTuple(new_cell, output)
+    return output, new_state
+
+class Conv1DLSTMCell(ConvLSTMCell):
+  """1D Convolutional LSTM recurrent network cell.
+
+  https://arxiv.org/pdf/1506.04214v1.pdf
+  """
+  def __init__(self, name="conv_1d_lstm_cell", **kwargs):
+    """Construct Conv1DLSTM; `name` and `kwargs` go to `ConvLSTMCell`."""
+    super(Conv1DLSTMCell, self).__init__(conv_ndims=1, name=name, **kwargs)
+
+class Conv2DLSTMCell(ConvLSTMCell):
+  """2D Convolutional LSTM recurrent network cell.
+
+  https://arxiv.org/pdf/1506.04214v1.pdf
+  """
+  def __init__(self, name="conv_2d_lstm_cell", **kwargs):
+    """Construct Conv2DLSTM; `name` and `kwargs` go to `ConvLSTMCell`."""
+    super(Conv2DLSTMCell, self).__init__(conv_ndims=2, name=name, **kwargs)
+
+class Conv3DLSTMCell(ConvLSTMCell):
+  """3D Convolutional LSTM recurrent network cell.
+
+  https://arxiv.org/pdf/1506.04214v1.pdf
+  """
+  def __init__(self, name="conv_3d_lstm_cell", **kwargs):
+    """Construct Conv3DLSTM; `name` and `kwargs` go to `ConvLSTMCell`."""
+    super(Conv3DLSTMCell, self).__init__(conv_ndims=3, name=name, **kwargs)
+
+def _conv(args,
+          filter_size,
+          num_features,
+          bias,
+          bias_start=0.0):
+  """Convolves the channel-wise concatenation of `args` with one kernel.
+
+  Args:
+    args: a list of 3D, 4D or 5D Tensors; all must have the same rank and are
+      concatenated along their last axis before the convolution.
+    filter_size: list or tuple of ints, leading (spatial) dims of the kernel.
+    num_features: int, number of features.
+    bias: if true, a "biases" variable is added to the convolution result.
+    bias_start: starting value to initialize the bias; 0 by default.
+  Returns:
+    A 3D, 4D, or 5D Tensor with shape [batch ... num_features]
+  Raises:
+    ValueError: if an argument is not 3D, 4D or 5D, or the ranks differ.
+  """
+
+  # Sum the static size of the last (channel) axis over all arguments.
+  total_arg_size_depth = 0
+  shapes = [a.get_shape().as_list() for a in args]
+  shape_length = len(shapes[0])
+  for shape in shapes:
+    if len(shape) not in [3, 4, 5]:
+      raise ValueError(
+          "Conv Linear expects 3D, 4D or 5D arguments: %s" % str(shapes))
+    if len(shape) != shape_length:
+      raise ValueError(
+          "Conv Linear expects all args to be of same Dimension: %s"
+          % str(shapes))
+    total_arg_size_depth += shape[-1]
+  dtype = args[0].dtype
+
+  # Pick the convolution matching the rank; conv1d is given a scalar stride.
+  if shape_length == 3:
+    conv_op = nn_ops.conv1d
+    strides = 1
+  elif shape_length == 4:
+    conv_op = nn_ops.conv2d
+    strides = shape_length * [1]
+  else:  # shape_length == 5, guaranteed by the rank check above.
+    conv_op = nn_ops.conv3d
+    strides = shape_length * [1]
+
+  # Now the computation. list() lets callers pass the kernel shape as a tuple.
+  kernel = vs.get_variable(
+      "kernel",
+      list(filter_size) + [total_arg_size_depth, num_features],
+      dtype=dtype)
+  # A single argument needs no concatenation.
+  if len(args) == 1:
+    conv_input = args[0]
+  else:
+    conv_input = array_ops.concat(axis=shape_length - 1, values=args)
+  res = conv_op(conv_input, kernel, strides, padding='SAME')
+  if not bias:
+    return res
+  bias_term = vs.get_variable(
+      "biases", [num_features],
+      dtype=dtype,
+      initializer=init_ops.constant_initializer(
+          bias_start, dtype=dtype))
+  return res + bias_term
 
 class GLSTMCell(rnn_cell_impl.RNNCell):
   """Group LSTM cell (G-LSTM).
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py
index 3496b355b4b..50cccf392fd 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py
@@ -78,7 +78,7 @@ class GatherTreeTest(test.TestCase):
     sequence_length = [[3, 3, 3]]
     expected_result = _transpose_batch_time(
         [[[2, -1, 2], [6, 5, 6], [7, 8, 9], [-1, -1, -1]]])
-    with ops.device("/gpu:0"):
+    with ops.device("/device:GPU:0"):
       beams = beam_search_ops.gather_tree(
           step_ids=step_ids, parent_ids=parent_ids,
           sequence_length=sequence_length)
diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index c434113520f..259c8e08ad9 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -979,9 +979,9 @@ def _compute_attention(attention_mechanism, cell_output, previous_alignments,
   # alignments shape is
   #   [batch_size, 1, memory_time]
   # attention_mechanism.values shape is
-  #   [batch_size, memory_time, attention_mechanism.num_units]
+  #   [batch_size, memory_time, memory_size]
   # the batched matmul is over memory_time, so the output shape is
-  #   [batch_size, 1, attention_mechanism.num_units].
+  #   [batch_size, 1, memory_size].
   # we then squeeze out the singleton dim.
   context = math_ops.matmul(expanded_alignments, attention_mechanism.values)
   context = array_ops.squeeze(context, [1])
diff --git a/tensorflow/contrib/session_bundle/exporter.py b/tensorflow/contrib/session_bundle/exporter.py
index dcc7fbaa2d6..f6f663aae76 100644
--- a/tensorflow/contrib/session_bundle/exporter.py
+++ b/tensorflow/contrib/session_bundle/exporter.py
@@ -301,7 +301,12 @@ class Exporter(object):
     if exports_to_keep:
       # create a simple parser that pulls the export_version from the directory.
       def parser(path):
-        match = re.match("^" + export_dir_base + "/(\\d{8})$", path.path)
+        # Escape the base dir so regex metacharacters in it match literally;
+        # on Windows compare with forward slashes on both sides.
+        base, found = export_dir_base, path.path
+        if os.name == 'nt':
+          base, found = base.replace('\\', '/'), found.replace('\\', '/')
+        match = re.match("^" + re.escape(base) + "/(\\d{8})$", found)
         if not match:
           return None
         return path._replace(export_version=int(match.group(1)))
diff --git a/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay.py b/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay.py
new file mode 100644
index 00000000000..0ef5f111b2a
--- /dev/null
+++ b/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay.py
@@ -0,0 +1,187 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""SGDR learning rate decay function."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import math_ops, control_flow_ops
+
+
+def sgdr_decay(learning_rate, global_step, initial_period_steps,
+               t_mul=2.0, m_mul=1.0, name=None):
+  """Implements Stochastic Gradient Descent with Warm Restarts (SGDR).
+
+  As described in "SGDR: Stochastic Gradient Descent
+  with Warm Restarts" by Ilya Loshchilov & Frank Hutter, Proceedings of
+  ICLR'2017, available at https://arxiv.org/pdf/1608.03983.pdf
+
+  The learning rate decreases according to cosine annealing:
+
+  ```python
+  learning_rate * 0.5 * (1 + cos(x_val * pi)) # for x_val defined in [0, 1]
+  ```
+
+  Thus, at the beginning (when the restart index i = 0),
+  the learning rate decreases for `initial_period_steps` steps from the initial
+  learning rate `learning_rate` (when `x_val=0`, we get `cos(0)=1`) to
+  0 (when `x_val=1`, we get `cos(pi)=-1`).
+
+  The decrease within the i-th period takes `t_i` steps,
+  where `t_0` = `initial_period_steps` is the user-defined number of batch
+  iterations (not epochs as in the paper) to be performed before the first
+  restart is launched.
+
+  Then, we perform the first restart (i=1) by setting the learning rate to
+  `learning_rate*(m_mul^i)`, where `m_mul in [0,1]` (set to 1 by default).
+  The i-th restart runs for `t_i=t_0*(t_mul^i)` steps, i.e., every new
+  restart runs `t_mul` times longer than the previous one.
+
+  Importantly, when one has no access to a validation set, SGDR suggests
+  to report the best expected / recommended solution in the following way:
+  When we are within our initial run (i=0), every new solution represents
+  SGDR's recommended solution. Instead, when i>0, the recommended solution is
+  the one obtained at the end of each restart.
+
+  Note that the minimum learning rate is set to 0 for simplicity,
+  you can adjust the code to deal with any positive minimum learning rate
+  as defined in the paper.
+
+  `initial_period_steps` is the duration of the first period measured in terms
+  of number of minibatch updates. If one wants to use epochs, one should compute
+  the number of updates required for an epoch.
+
+  For example, assume the following parameters and intention:
+      Minibatch size: 100
+      Training dataset size: 10000
+      If the user wants the first decay period to span across 5 epochs, then
+      `initial_period_steps` = 5 * 10000/100 = 500
+
+      As a second example, train for 10000 batch iterations with the initial
+      learning rate set to 0.1, then restart to run 2 times longer, i.e., for
+      20000 batch iterations and with the initial learning rate 0.05, then
+      restart again and again, doubling the runtime of each new period and
+      halving its initial learning rate.
+
+  To accomplish the above, one would write:
+
+  ```python
+  ...
+  global_step = tf.Variable(0, trainable=False)
+  starter_learning_rate = 0.1
+  learning_rate = sgdr_decay(starter_learning_rate, global_step,
+                             initial_period_steps=10000, t_mul=2, m_mul=0.5)
+  # Passing global_step to minimize() will increment it at each step.
+  learning_step = (
+      tf.train.GradientDescentOptimizer(learning_rate)
+      .minimize(...my loss..., global_step=global_step)
+  )
+
+  # Step  | 0   | 1000  | 5000 | 9000  | 9999 | 10000 | 11000  |
+  # LR    | 0.1 | 0.097 | 0.05 | 0.002 | 0.00 | 0.05  | 0.0496 |
+
+  # Step  | 20000 | 29000  | 29999 | 30000 |
+  # LR    | 0.025 | 0.0003 | 0.00  | 0.025 |
+  ```
+
+  Args:
+    learning_rate: A scalar `float32` or `float64` `Tensor` or a
+      Python number.  The initial learning rate.
+    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
+      Global step to use for the decay computation.  Must not be negative.
+    initial_period_steps: Duration of the first period measured as the number
+      of minibatch updates, if one wants to use epochs, one should compute
+      the number of updates required for an epoch.
+    t_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
+      Must be positive.
+      Used to derive the number of iterations in the i-th period:
+      `initial_period_steps * (t_mul^i)`. Defaults to 2.0.
+    m_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
+      Must be positive. Used to derive the initial learning rate of the i-th
+      period: `learning_rate * (m_mul^i)`. Defaults to 1.0.
+    name: Optional name for the operation. Defaults to "SGDRDecay".
+
+  Returns:
+    A scalar `Tensor` of the same type as `learning_rate`.
+    The learning rate for a provided global_step.
+  Raises:
+    ValueError: if `global_step` is not supplied.
+  """
+
+  if global_step is None:
+    raise ValueError("global_step is required for sgdr_decay.")
+  with ops.name_scope(name, "SGDRDecay",
+                      [learning_rate, global_step,
+                       initial_period_steps, t_mul, m_mul]) as name:
+    learning_rate = ops.convert_to_tensor(learning_rate,
+                                          name="initial_learning_rate")
+    dtype = learning_rate.dtype
+    global_step = math_ops.cast(global_step, dtype)
+    t_0 = math_ops.cast(initial_period_steps, dtype)
+    t_mul = math_ops.cast(t_mul, dtype)
+    m_mul = math_ops.cast(m_mul, dtype)
+
+    c_one = math_ops.cast(constant_op.constant(1.0), dtype)
+    c_half = math_ops.cast(constant_op.constant(0.5), dtype)
+    c_pi = math_ops.cast(constant_op.constant(math.pi), dtype)
+
+    # Find normalized value of the current step
+    x_val = math_ops.div(global_step, t_0)
+
+    def compute_step(x_val, geometric=False):
+      if geometric:
+        # Consider geometric series where t_mul != 1
+        # 1 + t_mul + t_mul^2 ... = (1 - t_mul^i_restart) / (1 - t_mul)
+
+        # First find how many restarts were performed for a given x_val
+        # Find maximal integer i_restart value for which this equation holds
+        # x_val >= (1 - t_mul^i_restart) / (1 - t_mul)
+        # x_val * (1 - t_mul) <= (1 - t_mul^i_restart)
+        # t_mul^i_restart <= (1 - x_val * (1 - t_mul))
+
+        # tensorflow allows only log with base e
+        # i_restart <= log(1 - x_val * (1 - t_mul)) / log(t_mul)
+        # Find how many restarts were performed
+
+        i_restart = math_ops.floor(
+            math_ops.log(c_one - x_val * (c_one - t_mul)) / math_ops.log(t_mul))
+        # Compute the sum of all restarts before the current one
+        sum_r = (c_one - t_mul ** i_restart) / (c_one - t_mul)
+        # Compute our position within the current restart
+        x_val = (x_val - sum_r) / t_mul ** i_restart
+
+      else:
+        # Find how many restarts were performed
+        i_restart = math_ops.floor(x_val)
+        # Compute our position within the current restart
+        x_val = x_val - i_restart
+      return i_restart, x_val
+
+    i_restart, x_val = control_flow_ops.cond(
+        math_ops.equal(t_mul, c_one),
+        lambda: compute_step(x_val, geometric=False),
+        lambda: compute_step(x_val, geometric=True))
+
+    # If m_mul < 1, then the initial learning rate of every new restart will be
+    # smaller, i.e., by a factor of m_mul ** i_restart at i_restart-th restart
+    m_fac = learning_rate * (m_mul ** i_restart)
+
+  return math_ops.multiply(c_half * m_fac,
+                           (math_ops.cos(x_val * c_pi) + c_one), name=name)
diff --git a/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay_test.py b/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay_test.py
new file mode 100644
index 00000000000..4a46e9a49ef
--- /dev/null
+++ b/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay_test.py
@@ -0,0 +1,145 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Functional test for sgdr learning rate decay."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+from sgdr_learning_rate_decay import sgdr_decay
+from tensorflow.python.platform import googletest
+from tensorflow.python.framework import test_util
+from tensorflow.python.framework import dtypes
+from tensorflow import placeholder
+
+
+class SGDRDecayTest(test_util.TensorFlowTestCase):
+  """Unit tests for SGDR learning rate decay."""
+
+  def get_original_values(self, lr, t_e, mult_factor, iter_per_epoch, epochs):
+    """Return the learning rate of every step, computed as in the original
+    implementation (https://github.com/loshchil/SGDR/blob/master/SGDR_WRNs.py).
+    Here t_e is the first period, in epochs; mult_factor scales later ones."""
+    t0 = math.pi / 2.0
+    tt = 0
+    te_next = t_e
+
+    lr_values = []
+    sh_lr = lr
+    for epoch in range(epochs):
+      for _ in range(iter_per_epoch):
+        # The original code runs the training function at this point.
+        lr_values.append(sh_lr)
+        dt = 2.0 * math.pi / float(2.0 * t_e)  # = pi / t_e per epoch
+        tt = tt + float(dt) / iter_per_epoch
+        if tt >= math.pi:
+          tt = tt - math.pi
+        cur_t = t0 + tt
+        new_lr = lr * (1.0 + math.sin(cur_t)) / 2.0  # lr_min = 0, lr_max = lr
+        sh_lr = new_lr
+      if (epoch + 1) == te_next:  # time to restart
+        sh_lr = lr
+        tt = 0                # resetting tt to 0 sets lr back to lr_max
+        t_e = t_e * mult_factor  # change the period of restarts
+        te_next = te_next + t_e  # note the next restart's epoch
+
+    return lr_values
+
+  def get_sgdr_values(self, lr, initial_period_steps, t_mul, iters):
+    """Return the learning rates that sgdr_decay yields for the steps
+    0 .. iters - 1, feeding one step value at a time."""
+    with self.test_session():
+      step = placeholder(dtypes.int32)
+
+      decay = sgdr_decay(lr, step, initial_period_steps, t_mul)
+      lr_values = []
+      for i in range(iters):
+        lr_values.append(decay.eval(feed_dict={step: i}))
+
+      return lr_values
+
+  def testCompareToOriginal(self):
+    """Compare sgdr_decay, step by step, with the original implementation
+    (https://github.com/loshchil/SGDR/blob/master/SGDR_WRNs.py). The first
+    period is given in epochs there and in steps (epochs * iters) here."""
+    with self.test_session():
+      lr = 10.0
+      init_steps = 2  # first period, in epochs
+      t_mul = 3
+      iters = 10  # iterations per epoch
+      epochs = 50
+
+      org_lr = self.get_original_values(lr, init_steps, t_mul, iters, epochs)
+      sgdr_lr = self.get_sgdr_values(lr, init_steps*iters, t_mul, iters*epochs)
+
+      for org, sgdr in zip(org_lr, sgdr_lr):  # zip stops at the shorter list
+        self.assertAllClose(org, sgdr)
+
+  def testMDecay(self):
+    """Test the m_mul argument: the learning rate at the start of the first,
+    second, third and fourth period must be lr * m_mul ** (period - 1)."""
+    with self.test_session():
+      step = placeholder(dtypes.int32)
+
+      lr = 0.1
+      t_e = 10
+      t_mul = 3
+      m_mul = 0.9
+
+      decay = sgdr_decay(lr, step, t_e, t_mul, m_mul)
+
+      test_step = 0  # start of the first period
+      self.assertAllClose(decay.eval(feed_dict={step: test_step}),
+                          lr)
+
+      test_step = t_e  # start of the second period
+      self.assertAllClose(decay.eval(feed_dict={step: test_step}),
+                          lr * m_mul)
+
+      test_step = t_e + t_e*t_mul  # start of the third period
+      self.assertAllClose(decay.eval(feed_dict={step: test_step}),
+                          lr * m_mul**2)
+
+      test_step = t_e + t_e*t_mul + t_e * (t_mul**2)  # start of the fourth
+      self.assertAllClose(decay.eval(feed_dict={step: test_step}),
+                          lr * (m_mul**3))
+
+  def testCos(self):
+    """With t_mul = 1, check the rate at the start (lr) and at the middle
+    (lr / 2) of the first two periods."""
+    with self.test_session():
+      step = placeholder(dtypes.int32)
+      lr = 0.2
+      t_e = 1000
+      t_mul = 1
+
+      decay = sgdr_decay(lr, step, t_e, t_mul)
+
+      test_step = 0  # start of the first period
+      self.assertAllClose(decay.eval(feed_dict={step: test_step}), lr)
+
+      test_step = t_e//2  # middle of the first period
+      self.assertAllClose(decay.eval(feed_dict={step: test_step}), lr/2)
+
+      test_step = t_e  # start of the second period (restart)
+      self.assertAllClose(decay.eval(feed_dict={step: test_step}), lr)
+
+      test_step = t_e*3//2  # middle of the second period
+      self.assertAllClose(decay.eval(feed_dict={step: test_step}), lr/2)
+
+if __name__ == "__main__":  # run the tests when executed as a script
+  googletest.main()
diff --git a/tensorflow/contrib/verbs/verbs_server_lib.cc b/tensorflow/contrib/verbs/verbs_server_lib.cc
index c3597249354..6d1c79c0fb2 100644
--- a/tensorflow/contrib/verbs/verbs_server_lib.cc
+++ b/tensorflow/contrib/verbs/verbs_server_lib.cc
@@ -17,6 +17,8 @@ limitations under the License.
 
 #include "tensorflow/contrib/verbs/verbs_server_lib.h"
 
+#include "grpc/support/alloc.h"
+
 #include "tensorflow/contrib/verbs/rdma_mgr.h"
 #include "tensorflow/contrib/verbs/rdma_rendezvous_mgr.h"
 #include "tensorflow/core/distributed_runtime/server_lib.h"
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 7b6022fa337..1d7afa072b5 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -116,6 +116,7 @@ load(
     "tf_lib_proto_parsing_deps",
     "tf_additional_verbs_lib_defines",
     "tf_additional_mpi_lib_defines",
+    "tf_additional_gdr_lib_defines",
     "tf_additional_gpu_tracer_srcs",
     "tf_additional_gpu_tracer_deps",
     "tf_additional_gpu_tracer_cuda_deps",
@@ -1245,72 +1246,36 @@ tf_proto_library_cc(
     ],
 )
 
-LIB_INTERNAL_WINDOWS_DEPS = glob(
-    [
-        "lib/**/*.h",
-        "lib/**/*.cc",
-        "platform/*.h",
-        "platform/*.cc",
-        "platform/profile_utils/**/*.h",
-        "platform/profile_utils/**/*.cc",
-    ] + [
-        "framework/resource_handle.h",
-        "framework/resource_handle.cc",
-        "framework/variant_tensor_data.h",
-        "framework/variant_tensor_data.cc",
-    ],
-    exclude = [
-        "**/*test*",
-        "lib/hash/crc32c_accelerate.cc",
-        "lib/gif/**/*",
-        "lib/jpeg/**/*",
-        "platform/gif.h",
-        "platform/jpeg.h",
-        "platform/**/env_time.cc",
-        "platform/**/cuda.h",
-        "platform/**/cuda_libdevice_path.cc",
-        "platform/**/stream_executor.h",
-        "platform/load_library.cc",
-        "platform/variant_coding.cc",
-        "platform/**/variant_cord_coding.cc",
-    ],
-)
-
 cc_library(
     name = "lib_internal",
-    srcs = select({
-        "//tensorflow:windows": LIB_INTERNAL_WINDOWS_DEPS,
-        "//tensorflow:windows_msvc": LIB_INTERNAL_WINDOWS_DEPS,
-        "//conditions:default": glob(
-            [
-                "lib/**/*.h",
-                "lib/**/*.cc",
-                "platform/*.h",
-                "platform/*.cc",
-                "platform/profile_utils/**/*.h",
-                "platform/profile_utils/**/*.cc",
-                "framework/resource_handle.h",
-                "framework/resource_handle.cc",
-            ],
-            exclude = [
-                "**/*test*",
-                "framework/variant.cc",
-                "platform/variant_coding.cc",
-                "lib/hash/crc32c_accelerate.cc",
-                "lib/gif/**/*",
-                "lib/jpeg/**/*",
-                "platform/gif.h",
-                "platform/jpeg.h",
-                "platform/**/env_time.cc",
-                "platform/**/cuda.h",
-                "platform/**/cuda_libdevice_path.cc",
-                "platform/**/stream_executor.h",
-                "platform/**/gpu_tracer.cc",
-                "platform/variant_coding.cc",
-                "platform/**/variant_cord_coding.cc",
-            ],
-        ),
-    }) + tf_additional_lib_srcs(
+    srcs = glob(
+        [
+            "lib/**/*.h",
+            "lib/**/*.cc",
+            "platform/*.h",
+            "platform/*.cc",
+            "platform/profile_utils/**/*.h",
+            "platform/profile_utils/**/*.cc",
+            "framework/resource_handle.h",
+            "framework/resource_handle.cc",
+        ],
+        exclude = [
+            "**/*test*",
+            "framework/variant.cc",
+            "lib/hash/crc32c_accelerate.cc",
+            "lib/gif/**/*",
+            "lib/jpeg/**/*",
+            "platform/gif.h",
+            "platform/jpeg.h",
+            "platform/**/env_time.cc",
+            "platform/**/cuda.h",
+            "platform/**/cuda_libdevice_path.cc",
+            "platform/**/stream_executor.h",
+            "platform/**/gpu_tracer.cc",
+            "platform/variant_coding.cc",
+            "platform/**/variant_cord_coding.cc",
+        ],
+    ) + tf_additional_lib_srcs(
         exclude = [
             "**/*test*",
             "platform/**/cuda.h",
@@ -1370,9 +1335,12 @@ cc_library(
     defines = tf_additional_lib_defines() + [
                   "SNAPPY",
               ] + tf_additional_verbs_lib_defines() +
-              tf_additional_mpi_lib_defines(),
+              tf_additional_mpi_lib_defines() +
+              tf_additional_gdr_lib_defines(),
     linkopts = select({
         "//tensorflow:freebsd": [],
+        "//tensorflow:windows": [],
+        "//tensorflow:windows_msvc": [],
         "//conditions:default": [
             "-ldl",
             "-lpthread",
@@ -1407,6 +1375,8 @@ cc_library(
     copts = tf_copts(),
     linkopts = select({
         "//tensorflow:freebsd": [],
+        "//tensorflow:windows": [],
+        "//tensorflow:windows_msvc": [],
         "//conditions:default": ["-ldl"],
     }),
     deps = [
@@ -1430,6 +1400,8 @@ cc_library(
     copts = tf_copts(),
     linkopts = select({
         "//tensorflow:freebsd": [],
+        "//tensorflow:windows": [],
+        "//tensorflow:windows_msvc": [],
         "//conditions:default": ["-ldl"],
     }),
     deps = [
@@ -1605,6 +1577,8 @@ tf_cuda_library(
     copts = tf_copts(),
     linkopts = select({
         "//tensorflow:freebsd": [],
+        "//tensorflow:windows": [],
+        "//tensorflow:windows_msvc": [],
         "//conditions:default": ["-ldl"],
     }) + [
         "-lm",
diff --git a/tensorflow/core/common_runtime/device.h b/tensorflow/core/common_runtime/device.h
index ded7e383d17..1d450aad7ff 100644
--- a/tensorflow/core/common_runtime/device.h
+++ b/tensorflow/core/common_runtime/device.h
@@ -22,7 +22,7 @@ limitations under the License.
 // Device names
 // * Every Device should have a unique name with the format:
 //     /job:___/replica:___/task:___/(gpu|cpu):___
-//   An example name would be "/job:train/replica:0/task:3/gpu:2".
+//   An example name would be "/job:train/replica:0/task:3/device:GPU:2".
 // * Task numbers are within the specified replica, so there are as
 //   many "task zeros" as replicas.
 
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index b92bf620b17..a6630f38a51 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -471,7 +471,7 @@ Status DirectSession::Run(const RunOptions& run_options,
   args.step_id = step_id_counter_.fetch_add(1);
 
   TF_RETURN_IF_ERROR(
-      GetOrCreateExecutors(pool, input_tensor_names, output_names, target_nodes,
+      GetOrCreateExecutors(input_tensor_names, output_names, target_nodes,
                            &executors_and_keys, &run_state_args));
   const int64 executor_step_count = executors_and_keys->step_count.fetch_add(1);
 
@@ -711,7 +711,7 @@ Status DirectSession::PRunSetup(const std::vector<string>& input_names,
   DebugOptions debug_options;
   RunStateArgs run_state_args(debug_options);
   run_state_args.is_partial_run = true;
-  TF_RETURN_IF_ERROR(GetOrCreateExecutors(pool, input_names, output_names,
+  TF_RETURN_IF_ERROR(GetOrCreateExecutors(input_names, output_names,
                                           target_nodes, &executors_and_keys,
                                           &run_state_args));
 
@@ -1042,9 +1042,9 @@ Status DirectSession::CheckFetch(const NamedTensorList& feeds,
 }
 
 Status DirectSession::GetOrCreateExecutors(
-    thread::ThreadPool* pool, gtl::ArraySlice<string> inputs,
-    gtl::ArraySlice<string> outputs, gtl::ArraySlice<string> target_nodes,
-    ExecutorsAndKeys** executors_and_keys, RunStateArgs* run_state_args) {
+    gtl::ArraySlice<string> inputs, gtl::ArraySlice<string> outputs,
+    gtl::ArraySlice<string> target_nodes, ExecutorsAndKeys** executors_and_keys,
+    RunStateArgs* run_state_args) {
   int64 handle_name_counter_value = -1;
   if (LogMemory::IsEnabled() || run_state_args->is_partial_run) {
     handle_name_counter_value = handle_name_counter_.fetch_add(1);
diff --git a/tensorflow/core/common_runtime/direct_session.h b/tensorflow/core/common_runtime/direct_session.h
index 8c6fe0d88a5..020831d6cc5 100644
--- a/tensorflow/core/common_runtime/direct_session.h
+++ b/tensorflow/core/common_runtime/direct_session.h
@@ -194,8 +194,8 @@ class DirectSession : public Session {
   // Retrieves an already existing set of executors to run 'inputs' and
   // 'outputs', or creates and caches them for future use.
   ::tensorflow::Status GetOrCreateExecutors(
-      thread::ThreadPool* pool, gtl::ArraySlice<string> inputs,
-      gtl::ArraySlice<string> outputs, gtl::ArraySlice<string> target_nodes,
+      gtl::ArraySlice<string> inputs, gtl::ArraySlice<string> outputs,
+      gtl::ArraySlice<string> target_nodes,
       ExecutorsAndKeys** executors_and_keys, RunStateArgs* run_state_args);
 
   // Creates several graphs given the existing graph_def_ and the
diff --git a/tensorflow/core/common_runtime/direct_session_test.cc b/tensorflow/core/common_runtime/direct_session_test.cc
index 097dab8406f..05f683f6082 100644
--- a/tensorflow/core/common_runtime/direct_session_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_test.cc
@@ -476,7 +476,7 @@ TEST(DirectSessionTest, PlacePrunedGraph) {
     vx.scalar<float>()() = 1.0;
     Node* x = test::graph::Constant(&g, vx);
     Node* y = test::graph::Unary(&g, "Darth", x);
-    y->set_assigned_device_name("/job:localhost/replica:0/task:0/gpu:0");
+    y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0");
     GraphDef def;
     test::graph::ToGraphDef(&g, &def);
 
@@ -494,7 +494,7 @@ TEST(DirectSessionTest, PlacePrunedGraph) {
     vx.scalar<float>()() = 1.0;
     Node* x = test::graph::Constant(&g, vx);
     Node* y = test::graph::Unary(&g, "Darth", x);
-    y->set_assigned_device_name("/job:localhost/replica:0/task:0/gpu:0");
+    y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0");
     GraphDef def;
     test::graph::ToGraphDef(&g, &def);
 
diff --git a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
index da76ac83db7..459c20ef20b 100644
--- a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
@@ -154,14 +154,14 @@ static void TestHWAccelerator(bool enableHWTrace) {
   Tensor x_tensor(DT_FLOAT, TensorShape({2, 1}));
   test::FillValues<float>(&x_tensor, {1, 1});
   Node* x = test::graph::Constant(&graph, x_tensor);
-  x->set_assigned_device_name("/job:localhost/replica:0/task:0/gpu:0");
+  x->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0");
 #ifdef TENSORFLOW_USE_SYCL
   x->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0");
 #endif // TENSORFLOW_USE_SYCL
 
   // y = A * x
   Node* y = test::graph::Matmul(&graph, a, x, false, false);
-  y->set_assigned_device_name("/job:localhost/replica:0/task:0/gpu:0");
+  y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0");
 #ifdef TENSORFLOW_USE_SYCL
 y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0");
 #endif // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index d3b6099d5bf..63956afce25 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -114,14 +114,14 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
             << num_bytes << ". See error logs for more detailed info.";
       }
     }
-    if (LogMemory::IsEnabled()) {
+    if (LogMemory::IsEnabled() && ret != nullptr) {
       LogMemory::RecordRawAllocation(operation_, step_id_, num_bytes, ret,
                                      allocator_);
     }
     return ret;
   }
   void deallocate(void* buffer) const override {
-    if (LogMemory::IsEnabled()) {
+    if (LogMemory::IsEnabled() && buffer != nullptr) {
       LogMemory::RecordRawDeallocation(operation_, step_id_, buffer, allocator_,
                                        true);
     }
@@ -588,7 +588,7 @@ Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options,
   for (int i = 0; i < n; i++) {
     BaseGPUDevice* gpu_device;
     TF_RETURN_IF_ERROR(CreateGPUDevice(options,
-                                       strings::StrCat(name_prefix, "/gpu:", i),
+                                       strings::StrCat(name_prefix, "/device:GPU:", i),
                                        valid_gpu_ids[i], &gpu_device));
     TF_RETURN_IF_ERROR(gpu_device->Init(options));
     devices->push_back(gpu_device);
@@ -1049,7 +1049,7 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
     size_t new_id = ids->size();
     ids->push_back(visible_gpu_id);
 
-    LOG(INFO) << "Creating TensorFlow device (/gpu:" << new_id << ") -> "
+    LOG(INFO) << "Creating TensorFlow device (/device:GPU:" << new_id << ") -> "
               << "(" << GetShortDeviceDescription(visible_gpu_id, desc) << ")";
   }
 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h
index 08c58867eed..a7e078e97cc 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.h
@@ -141,7 +141,7 @@ class BaseGPUDeviceFactory : public DeviceFactory {
                                          Allocator* cpu_allocator) = 0;
 
   // Returns into 'ids' the list of valid GPU ids, in the order that
-  // they should map to logical gpu ids "/gpu:0", "/gpu:1", etc, based
+  // they should map to logical gpu ids "/device:GPU:0", "/device:GPU:1", etc, based
   // upon 'visible_device_list', a comma-separated list of 'visible
   // gpu ids'.
   Status GetValidDeviceIds(const string& visible_device_list,
diff --git a/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc b/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc
index a8bad5b94dc..003e416bbe6 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc
@@ -106,9 +106,9 @@ TEST_F(GpuStreamUtilTest, SimpleGraphManyStreams) {
 TEST_F(GpuStreamUtilTest, StreamOverrides) {
   auto root = Scope::NewRootScope().ExitOnError();
   ops::_Recv(root.WithOpName("input"), DT_FLOAT, "input", "/cpu:0", 0,
-             "/gpu:0");
+             "/device:GPU:0");
   Output n = ops::MatMul(root, {}, {});
-  ops::_Send(root.WithOpName("output"), n, "output", "/gpu:0", 0, "/cpu:0");
+  ops::_Send(root.WithOpName("output"), n, "output", "/device:GPU:0", 0, "/cpu:0");
   Graph g(OpRegistry::Global());
   TF_ASSERT_OK(root.ToGraph(&g));
 
diff --git a/tensorflow/core/common_runtime/gpu/process_state.cc b/tensorflow/core/common_runtime/gpu/process_state.cc
index 7a1c10d900d..6b3c58ac9c5 100644
--- a/tensorflow/core/common_runtime/gpu/process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/process_state.cc
@@ -167,7 +167,7 @@ Allocator* ProcessState::GetCPUAllocator(int numa_node) {
     if (!status.ok()) {
       LOG(ERROR) << "GetCPUAllocator: " << status.error_message();
     }
-    Allocator* allocator;
+    VisitableAllocator* allocator;
     if (use_bfc_allocator) {
       // TODO(reedwm): evaluate whether 64GB by default is the best choice.
       int64 cpu_mem_limit_in_mb = -1;
@@ -192,7 +192,7 @@ Allocator* ProcessState::GetCPUAllocator(int numa_node) {
     if (LogMemory::IsEnabled()) {
       // Wrap the allocator to track allocation ids for better logging
       // at the cost of performance.
-      allocator = new TrackingAllocator(allocator, true);
+      allocator = new TrackingVisitableAllocator(allocator, true);
     }
     cpu_allocators_.push_back(allocator);
   }
@@ -237,14 +237,14 @@ Allocator* ProcessState::GetCUDAHostAllocator(int numa_node) {
       LOG(ERROR) << "GetCUDAHostAllocator: " << status.error_message();
     }
     int64 cuda_host_mem_limit = cuda_host_mem_limit_in_mb * (1LL << 20);
-    Allocator* allocator =
+    VisitableAllocator* allocator =
         new BFCAllocator(new CUDAHostAllocator(se), cuda_host_mem_limit,
                          true /*allow_growth*/, "cuda_host_bfc" /*name*/);
 
     if (LogMemory::IsEnabled()) {
       // Wrap the allocator to track allocation ids for better logging
       // at the cost of performance.
-      allocator = new TrackingAllocator(allocator, true);
+      allocator = new TrackingVisitableAllocator(allocator, true);
     }
     cuda_host_allocators_.push_back(allocator);
     if (FLAGS_brain_gpu_record_mem_types) {
diff --git a/tensorflow/core/common_runtime/memory_types_test.cc b/tensorflow/core/common_runtime/memory_types_test.cc
index b3a43d35046..2a834ddca42 100644
--- a/tensorflow/core/common_runtime/memory_types_test.cc
+++ b/tensorflow/core/common_runtime/memory_types_test.cc
@@ -53,7 +53,7 @@ TEST(MemoryTypeChecker, Int32NotOk) {
   EXPECT_TRUE(errors::IsInternal(ValidateMemoryTypes(DEVICE_GPU, g)));
 
   // But we can insert _HostSend/_HostRecv to ensure the invariant.
-  TF_EXPECT_OK(EnsureMemoryTypes(DEVICE_GPU, "/gpu:0", g));
+  TF_EXPECT_OK(EnsureMemoryTypes(DEVICE_GPU, "/device:GPU:0", g));
   TF_EXPECT_OK(ValidateMemoryTypes(DEVICE_GPU, g));
 #endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/common_runtime/simple_placer.cc b/tensorflow/core/common_runtime/simple_placer.cc
index 5e6c3d164b1..f3406ac850b 100644
--- a/tensorflow/core/common_runtime/simple_placer.cc
+++ b/tensorflow/core/common_runtime/simple_placer.cc
@@ -682,7 +682,7 @@ Status SimplePlacer::Run() {
       int dst_root_id = colocation_graph.FindRoot(dst->id());
       auto& src_root = colocation_graph.members_[src_root_id];
       auto& dst_root = colocation_graph.members_[dst_root_id];
-      // If both the source node and this node have paritally
+      // If both the source node and this node have partially
       // specified a device, then 'node's device should be
       // cleared: the reference edge forces 'node' to be on the
       // same device as the source node.
diff --git a/tensorflow/core/common_runtime/sycl/sycl_allocator.cc b/tensorflow/core/common_runtime/sycl/sycl_allocator.cc
index 0ddd4dce513..65b0db5bf61 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_allocator.cc
+++ b/tensorflow/core/common_runtime/sycl/sycl_allocator.cc
@@ -19,7 +19,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-SYCLAllocator::SYCLAllocator(Eigen::QueueInterface *queue)
+SYCLAllocator::SYCLAllocator(Eigen::QueueInterface* queue)
     : sycl_device_(new Eigen::SyclDevice(queue)) {
   cl::sycl::queue& sycl_queue = sycl_device_->sycl_queue();
   const cl::sycl::device& device = sycl_queue.get_device();
@@ -28,14 +28,15 @@ SYCLAllocator::SYCLAllocator(Eigen::QueueInterface *queue)
 }
 
 SYCLAllocator::~SYCLAllocator() {
-  if(sycl_device_) {
+  if (sycl_device_) {
     delete sycl_device_;
   }
 }
 
 string SYCLAllocator::Name() { return "device:SYCL"; }
 
-void *SYCLAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
+void* SYCLAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
+  mutex_lock lock(mu_);
   assert(sycl_device_);
   if (num_bytes == 0) {
     // Cannot allocate no bytes in SYCL, so instead allocate a single byte
@@ -45,7 +46,6 @@ void *SYCLAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   const auto& allocated_buffer = sycl_device_->get_sycl_buffer(p);
   const std::size_t bytes_allocated = allocated_buffer.get_range().size();
 
-  mutex_lock lock(mu_);
   ++stats_.num_allocs;
   stats_.bytes_in_use += bytes_allocated;
   stats_.max_bytes_in_use =
@@ -56,12 +56,12 @@ void *SYCLAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   return p;
 }
 
-void SYCLAllocator::DeallocateRaw(void *ptr) {
-  const auto& buffer_to_delete = sycl_device_->get_sycl_buffer(ptr);
-  const std::size_t dealloc_size = buffer_to_delete.get_range().size();
+void SYCLAllocator::DeallocateRaw(void* ptr) {
   mutex_lock lock(mu_);
-  stats_.bytes_in_use -= dealloc_size;
   if (sycl_device_) {
+    const auto& buffer_to_delete = sycl_device_->get_sycl_buffer(ptr);
+    const std::size_t dealloc_size = buffer_to_delete.get_range().size();
+    stats_.bytes_in_use -= dealloc_size;
     sycl_device_->deallocate(ptr);
   }
 }
@@ -72,6 +72,10 @@ void SYCLAllocator::GetStats(AllocatorStats* stats) {
 }
 
 size_t SYCLAllocator::RequestedSize(void* ptr) {
+  mutex_lock lock(mu_);
+  if(!sycl_device_) {
+    return 0;
+  }
   const auto& buffer = sycl_device_->get_sycl_buffer(ptr);
   return buffer.get_size();
 }
diff --git a/tensorflow/core/common_runtime/sycl/sycl_allocator.h b/tensorflow/core/common_runtime/sycl/sycl_allocator.h
index 3597afa5bab..3066e0e4426 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_allocator.h
+++ b/tensorflow/core/common_runtime/sycl/sycl_allocator.h
@@ -29,15 +29,20 @@ namespace tensorflow {
 
 class SYCLAllocator : public Allocator {
  public:
-  SYCLAllocator(Eigen::QueueInterface *queue);
+  SYCLAllocator(Eigen::QueueInterface* queue);
   virtual ~SYCLAllocator() override;
   string Name() override;
-  void *AllocateRaw(size_t alignment, size_t num_bytes) override;
-  void DeallocateRaw(void *ptr) override;
+  void* AllocateRaw(size_t alignment, size_t num_bytes) override;
+  void DeallocateRaw(void* ptr) override;
 
   virtual bool ShouldAllocateEmptyTensors() override final { return true; }
-  void Synchronize() { sycl_device_->synchronize(); }
-  bool Ok() { return sycl_device_->ok(); }
+  void Synchronize() {
+    mutex_lock lock(mu_);  // sycl_device_ is GUARDED_BY(mu_)
+    if (sycl_device_) {  // nothing to do once the device has been cleared
+      sycl_device_->synchronize();
+    }
+  }
+  bool Ok() { mutex_lock lock(mu_); return sycl_device_ && sycl_device_->ok(); }
   void GetStats(AllocatorStats* stats) override;
   // The SYCL buffers keep track of their size, so we already have tracking.
   bool TracksAllocationSizes() override { return true; }
@@ -46,10 +51,19 @@ class SYCLAllocator : public Allocator {
   // AllocatedSize(void* ptr) by default.
   size_t RequestedSize(void* ptr) override;
   Eigen::SyclDevice* getSyclDevice() { return sycl_device_; }
+  // Deletes the owned SYCL device, if any, and resets the pointer to null.
+  void ClearSYCLDevice() {
+    mutex_lock lock(mu_);  // sycl_device_ is GUARDED_BY(mu_)
+    if(sycl_device_) {
+      delete sycl_device_;
+      sycl_device_ = nullptr;  // later null checks then see "no device"
+    }
+  }
+
  private:
-  Eigen::SyclDevice *sycl_device_;  // owned
 
   mutable mutex mu_;
+  Eigen::SyclDevice* sycl_device_ GUARDED_BY(mu_);  // owned
   AllocatorStats stats_ GUARDED_BY(mu_);
 
   TF_DISALLOW_COPY_AND_ASSIGN(SYCLAllocator);
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device.cc b/tensorflow/core/common_runtime/sycl/sycl_device.cc
index 17f5edd5725..6e1a45b3efa 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_device.cc
+++ b/tensorflow/core/common_runtime/sycl/sycl_device.cc
@@ -22,20 +22,10 @@ limitations under the License.
 #include "tensorflow/core/platform/tracing.h"
 
 namespace tensorflow {
-std::mutex GSYCLInterface::mutex_;
-GSYCLInterface *GSYCLInterface::s_instance = 0;
-
-void ShutdownSycl() {
-  GSYCLInterface::Reset();
-}
-
-void SYCLDevice::RegisterDevice() {
-    atexit(ShutdownSycl);
-}
 
 SYCLDevice::~SYCLDevice() {}
 
-void SYCLDevice::Compute(OpKernel *op_kernel, OpKernelContext *context) {
+void SYCLDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
   assert(context);
   if (port::Tracing::IsActive()) {
     // TODO(pbar) We really need a useful identifier of the graph node.
@@ -46,16 +36,16 @@ void SYCLDevice::Compute(OpKernel *op_kernel, OpKernelContext *context) {
   op_kernel->Compute(context);
 }
 
-Allocator *SYCLDevice::GetAllocator(AllocatorAttributes attr) {
+Allocator* SYCLDevice::GetAllocator(AllocatorAttributes attr) {
   if (attr.on_host())
     return cpu_allocator_;
   else
     return sycl_allocator_;
 }
 
-Status SYCLDevice::MakeTensorFromProto(const TensorProto &tensor_proto,
+Status SYCLDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
                                        const AllocatorAttributes alloc_attrs,
-                                       Tensor *tensor) {
+                                       Tensor* tensor) {
   AllocatorAttributes attr;
   attr.set_on_host(true);
   Allocator* host_alloc = GetAllocator(attr);
@@ -79,18 +69,18 @@ Status SYCLDevice::MakeTensorFromProto(const TensorProto &tensor_proto,
     }
 
     device_context_->CopyCPUTensorToDevice(
-        &parsed, this, &copy, [&status](const Status &s) { status = s; });
+        &parsed, this, &copy, [&status](const Status& s) { status = s; });
     *tensor = copy;
   }
   return status;
 }
 
-Status SYCLDevice::FillContextMap(const Graph *graph,
-                                  DeviceContextMap *device_context_map) {
+Status SYCLDevice::FillContextMap(const Graph* graph,
+                                  DeviceContextMap* device_context_map) {
   // Fill in the context map.  It is OK for this map to contain
   // duplicate DeviceContexts so long as we increment the refcount.
   device_context_map->resize(graph->num_node_ids());
-  for (Node *n : graph->nodes()) {
+  for (Node* n : graph->nodes()) {
     device_context_->Ref();
     (*device_context_map)[n->id()] = device_context_;
   }
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device.h b/tensorflow/core/common_runtime/sycl/sycl_device.h
index b4123ca071a..9caa076c722 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_device.h
+++ b/tensorflow/core/common_runtime/sycl/sycl_device.h
@@ -27,201 +27,190 @@ limitations under the License.
 
 namespace tensorflow {
 
-
-class GSYCLInterface
-{
-    std::vector<Eigen::QueueInterface*>     m_queue_interface_;    // owned
-    std::vector<Allocator*>                 m_cpu_allocator_;      // not owned
-    std::vector<SYCLAllocator*>             m_sycl_allocator_;     // owned
-    std::vector<SYCLDeviceContext*>         m_sycl_context_;       // owned
-
-    static std::mutex mutex_;
-    static GSYCLInterface* s_instance;
-    GSYCLInterface() {
-      bool found_device =false;
-      auto device_list = Eigen::get_sycl_supported_devices();
-      // Obtain list of supported devices from Eigen
-      for (const auto& device : device_list) {
-        if(device.is_gpu()) {
-          // returns first found GPU
-          AddDevice(device);
-          found_device = true;
-        }
-      }
-
-      if(!found_device) {
-        // Currently Intel GPU is not supported
-        LOG(WARNING) << "No OpenCL GPU found that is supported by ComputeCpp, trying OpenCL CPU";
-      }
-
-      for (const auto& device : device_list) {
-        if(device.is_cpu()) {
-          // returns first found CPU
-          AddDevice(device);
-          found_device = true;
-        }
-      }
-
-      if(!found_device) {
-        // Currently Intel GPU is not supported
-        LOG(FATAL) << "No OpenCL GPU nor CPU found that is supported by ComputeCpp";
-      } else {
-        LOG(INFO) << "Found following OpenCL devices:";
-        for (int i = 0; i < device_list.size(); i++) {
-          LOG(INFO) << GetShortDeviceDescription(i);
-        }
+class GSYCLInterface {
+  std::vector<Eigen::QueueInterface*> m_queue_interface_;  // owned
+  std::vector<Allocator*> m_cpu_allocator_;                // not owned
+  std::vector<SYCLAllocator*> m_sycl_allocator_;           // owned
+  std::vector<SYCLDeviceContext*> m_sycl_context_;         // ref counted
+  GSYCLInterface() {
+    bool found_device = false;
+    auto device_list = Eigen::get_sycl_supported_devices();
+    // Obtain list of supported devices from Eigen
+    for (const auto& device : device_list) {
+      if (device.is_gpu()) {
+        // Register every supported GPU (the loop does not stop at the first)
+        AddDevice(device);
+        found_device = true;
       }
     }
 
-    ~GSYCLInterface() {
-        m_cpu_allocator_.clear();
-
-        for (auto p : m_sycl_allocator_) {
-          p->Synchronize();
-          delete p;
-        }
-        m_sycl_allocator_.clear();
-
-        for(auto p : m_sycl_context_) {
-          p->Unref();
-        }
-        m_sycl_context_.clear();
-
-        for (auto p : m_queue_interface_) {
-          p->deallocate_all();
-          delete p;
-          p = nullptr;
-        }
-        m_queue_interface_.clear();
+    if (!found_device) {
+      // Currently Intel GPU is not supported
+      LOG(WARNING) << "No OpenCL GPU found that is supported by ComputeCpp, "
+                      "trying OpenCL CPU";
     }
 
-    void AddDevice(const cl::sycl::device & d) {
-      m_queue_interface_.push_back(new Eigen::QueueInterface(d));
-      m_cpu_allocator_.push_back(cpu_allocator());
-      m_sycl_allocator_.push_back(new SYCLAllocator(m_queue_interface_.back()));
-      m_sycl_context_.push_back(new SYCLDeviceContext());
-    }
-
-  public:
-    static GSYCLInterface *instance()
-    {
-      std::lock_guard<std::mutex> lock(mutex_);
-      if (!s_instance) {
-        s_instance = new GSYCLInterface();
-      }
-      return s_instance;
-    }
-
-    static void Reset()
-    {
-      std::lock_guard<std::mutex> lock(mutex_);
-      if(s_instance) {
-        delete s_instance;
-        s_instance = NULL;
+    for (const auto& device : device_list) {
+      if (device.is_cpu()) {
+        // Register every supported CPU (the loop does not stop at the first)
+        AddDevice(device);
+        found_device = true;
       }
     }
 
-    Eigen::QueueInterface * GetQueueInterface(size_t i = 0) {
-      if(!m_queue_interface_.empty()) {
-        return m_queue_interface_[i];
-      } else {
-        std::cerr << "No cl::sycl::device has been added" << std::endl;
-        return nullptr;
+    if (!found_device) {
+      // Neither a supported GPU nor a supported CPU could be registered
+      LOG(FATAL)
+          << "No OpenCL GPU nor CPU found that is supported by ComputeCpp";
+    } else {
+      LOG(INFO) << "Found following OpenCL devices:";
+      for (int i = 0; i < static_cast<int>(m_queue_interface_.size()); i++) {
+        LOG(INFO) << GetShortDeviceDescription(i);
       }
     }
+  }
 
-    SYCLAllocator * GetSYCLAllocator(size_t i = 0) {
-      if(!m_sycl_allocator_.empty()) {
-        return m_sycl_allocator_[i];
-      } else {
-        std::cerr << "No cl::sycl::device has been added" << std::endl;
-        return nullptr;
-      }
+  ~GSYCLInterface() {
+    m_cpu_allocator_.clear();
+
+    for (auto p : m_sycl_allocator_) {
+      p->Synchronize();
+      p->ClearSYCLDevice();
+      // Cannot delete the Allocator instances, as the Allocator lifetime
+      // needs to exceed any Tensor created by it. There is no way of
+      // knowing when all Tensors have been deallocated, as they are
+      // RefCounted and wait until all instances of a Tensor have been
+      // destroyed before calling Allocator.Deallocate. This could happen at
+      // program exit, which can set up a race condition between destroying
+      // Tensors and Allocators when the program is cleaning up.
+    }
+    m_sycl_allocator_.clear();
+
+    for (auto p : m_sycl_context_) {
+      p->Unref();
+    }
+    m_sycl_context_.clear();
+
+    for (auto p : m_queue_interface_) {
+      p->deallocate_all();
+      delete p;
+    }
+    m_queue_interface_.clear();
+  }
+
+  void AddDevice(const cl::sycl::device& d) {
+    m_queue_interface_.push_back(new Eigen::QueueInterface(d));
+    m_cpu_allocator_.push_back(cpu_allocator());
+    m_sycl_allocator_.push_back(new SYCLAllocator(m_queue_interface_.back()));
+    m_sycl_context_.push_back(new SYCLDeviceContext());
+  }
+
+ public:
+  static const GSYCLInterface* instance() {
+    // C++11 guarantees thread-safe, one-time construction of this local static
+    static const GSYCLInterface instance;
+    return &instance;
+  }
+
+  // Returns device `i`'s queue interface, or nullptr if `i` was never added.
+  Eigen::QueueInterface* GetQueueInterface(size_t i = 0) const {
+    if (i >= m_queue_interface_.size()) {
+      std::cerr << "No cl::sycl::device has been added" << std::endl;
+      return nullptr;
+    }
+    return m_queue_interface_[i];
+  }
+
+  // Returns device `i`'s SYCL allocator, or nullptr if `i` was never added.
+  SYCLAllocator* GetSYCLAllocator(size_t i = 0) const {
+    if (i >= m_sycl_allocator_.size()) {
+      std::cerr << "No cl::sycl::device has been added" << std::endl;
+      return nullptr;
+    }
+    return m_sycl_allocator_[i];
+  }
+
+  // Returns device `i`'s CPU allocator, or nullptr if `i` was never added.
+  Allocator* GetCPUAllocator(size_t i = 0) const {
+    if (i >= m_cpu_allocator_.size()) {
+      std::cerr << "No cl::sycl::device has been added" << std::endl;
+      return nullptr;
+    }
+    return m_cpu_allocator_[i];
+  }
+
+  // Returns device `i`'s device context, or nullptr if `i` was never added.
+  SYCLDeviceContext* GetSYCLContext(size_t i = 0) const {
+    if (i >= m_sycl_context_.size()) {
+      std::cerr << "No cl::sycl::device has been added" << std::endl;
+      return nullptr;
+    }
+    return m_sycl_context_[i];
+  }
+
+  string GetShortDeviceDescription(int device_id = 0) const {
+    Eigen::QueueInterface* queue_ptr = GetQueueInterface(device_id);
+    if (!queue_ptr) {
+      LOG(ERROR)
+          << "Device name cannot be given after Eigen QueueInterface destroyed";
+      return "";
+    }
+    auto device = queue_ptr->sycl_queue().get_device();
+    auto name = device.get_info<cl::sycl::info::device::name>();
+    auto vendor = device.get_info<cl::sycl::info::device::vendor>();
+    auto profile = device.get_info<cl::sycl::info::device::profile>();
+
+    std::string type;
+    if (device.is_host()) {
+      type = "Host";
+    } else if (device.is_cpu()) {
+      type = "CPU";
+    } else if (device.is_gpu()) {
+      type = "GPU";
+    } else if (device.is_accelerator()) {
+      type = "Accelerator";
+    } else {
+      type = "Unknown";
     }
 
-    Allocator * GetCPUAllocator(size_t i = 0) {
-      if(!m_cpu_allocator_.empty()) {
-        return m_cpu_allocator_[i];
-      } else {
-        std::cerr << "No cl::sycl::device has been added" << std::endl;
-        return nullptr;
-      }
-    }
-
-    SYCLDeviceContext * GetSYCLContext(size_t i = 0) {
-      if(!m_sycl_context_.empty()) {
-        return m_sycl_context_[i];
-      } else {
-        std::cerr << "No cl::sycl::device has been added" << std::endl;
-        return nullptr;
-      }
-    }
-
-    string GetShortDeviceDescription(int device_id = 0) {
-      auto _device = GetSYCLAllocator(device_id)
-                         ->getSyclDevice()
-                         ->sycl_queue()
-                         .get_device();
-      auto _name = _device.get_info<cl::sycl::info::device::name>();
-      auto _vendor = _device.get_info<cl::sycl::info::device::vendor>();
-      auto _profile = _device.get_info<cl::sycl::info::device::profile>();
-
-      std::string _type;
-      if (_device.is_host()) {
-        _type = "Host";
-      } else if (_device.is_cpu()) {
-        _type = "CPU";
-      } else if (_device.is_gpu()) {
-        _type = "GPU";
-      } else if (_device.is_accelerator()) {
-        _type = "Accelerator";
-      } else {
-        _type = "Unknown";
-      }
-
-      return strings::StrCat("id: ", device_id, " ,type: ", _type, " ,name: ",
-                             _name.c_str(), " ,vendor: ", _vendor.c_str(),
-                             " ,profile: ", _profile.c_str());
-    }
+    return strings::StrCat("id: ", device_id, ", type: ", type, ", name: ",
+                           name.c_str(), ", vendor: ", vendor.c_str(),
+                           ", profile: ", profile.c_str());
+  }
 };
 
-
 class SYCLDevice : public LocalDevice {
  public:
-  SYCLDevice(const SessionOptions &options, const string &name,
-             Bytes memory_limit, const DeviceLocality &locality,
-             const string &physical_device_desc, SYCLAllocator * sycl_allocator,
-             Allocator *cpu_allocator, SYCLDeviceContext* ctx)
-      : LocalDevice(
-            options,
-            Device::BuildDeviceAttributes(name, DEVICE_SYCL, memory_limit,
-                                          locality, physical_device_desc)),
+  SYCLDevice(const SessionOptions& options, const string& name,
+             Bytes memory_limit, const DeviceLocality& locality,
+             const string& physical_device_desc, SYCLAllocator* sycl_allocator,
+             Allocator* cpu_allocator, SYCLDeviceContext* ctx)
+      : LocalDevice(options, Device::BuildDeviceAttributes(
+                                 name, DEVICE_SYCL, memory_limit, locality,
+                                 physical_device_desc)),
         cpu_allocator_(cpu_allocator),
         sycl_allocator_(sycl_allocator),
         device_context_(ctx) {
-    RegisterDevice();
     set_eigen_sycl_device(sycl_allocator->getSyclDevice());
   }
 
   ~SYCLDevice() override;
 
-  void Compute(OpKernel *op_kernel, OpKernelContext *context) override;
-  Allocator *GetAllocator(AllocatorAttributes attr) override;
-  Status MakeTensorFromProto(const TensorProto &tensor_proto,
+  void Compute(OpKernel* op_kernel, OpKernelContext* context) override;
+  Allocator* GetAllocator(AllocatorAttributes attr) override;
+  Status MakeTensorFromProto(const TensorProto& tensor_proto,
                              const AllocatorAttributes alloc_attrs,
-                             Tensor *tensor) override;
+                             Tensor* tensor) override;
 
-  Status FillContextMap(const Graph *graph,
-                        DeviceContextMap *device_context_map) override;
+  Status FillContextMap(const Graph* graph,
+                        DeviceContextMap* device_context_map) override;
 
   Status Sync() override;
 
  private:
-  void RegisterDevice();
-
-  Allocator         *cpu_allocator_;           // not owned
-  SYCLAllocator     *sycl_allocator_;          // not owned
-  SYCLDeviceContext *device_context_;
+  Allocator* cpu_allocator_;           // not owned
+  SYCLAllocator* sycl_allocator_;      // not owned
+  SYCLDeviceContext* device_context_;  // not owned
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/sycl/sycl_util.h b/tensorflow/core/common_runtime/sycl/sycl_util.h
index f58614c4ff9..83016b706a5 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_util.h
+++ b/tensorflow/core/common_runtime/sycl/sycl_util.h
@@ -21,17 +21,60 @@ limitations under the License.
 #define TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_UTIL_H_
 
 #include "tensorflow/core/common_runtime/device.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 // For DMA helper
 #include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/framework/tensor.h"
 
 namespace tensorflow {
-  inline void* GetBase(const Tensor* src) {
-    return const_cast<void*>(DMAHelper::base(src));
+inline void const* GetBase(const Tensor* src) { return DMAHelper::base(src); }
+inline void* GetBase(Tensor* dst) { return DMAHelper::base(dst); }
+
+// Copies src_tensor's TotalBytes() bytes into dst_tensor with device.memcpy.
+inline void SYCLmemcpy(Eigen::SyclDevice const& device,
+                       Tensor const& src_tensor, Tensor* dst_tensor) {
+  const size_t size = src_tensor.TotalBytes();
+  void* dst_ptr = GetBase(dst_tensor);
+  void const* src_ptr = GetBase(&src_tensor);
+#define COPY_WITH_TYPE(T) \
+  device.memcpy(dst_ptr, static_cast<T const*>(src_ptr), size)
+  switch (src_tensor.dtype()) {
+    case DT_COMPLEX128:
+      COPY_WITH_TYPE(cl::sycl::cl_ulong2);
+      break;
+    case DT_DOUBLE:
+    case DT_COMPLEX64:
+    case DT_INT64:
+      COPY_WITH_TYPE(cl::sycl::cl_ulong);
+      break;
+    case DT_FLOAT:
+    case DT_INT32:
+    case DT_QINT32:
+      COPY_WITH_TYPE(cl::sycl::cl_uint);
+      break;
+    case DT_INT16:
+    case DT_UINT16:
+    case DT_BFLOAT16:
+    case DT_QINT16:
+    case DT_QUINT16:
+    case DT_HALF:
+      COPY_WITH_TYPE(cl::sycl::cl_ushort);
+      break;
+    case DT_BOOL:
+      COPY_WITH_TYPE(bool);
+      break;
+    case DT_UINT8:
+    case DT_INT8:
+    case DT_QINT8:
+    case DT_QUINT8:
+      COPY_WITH_TYPE(cl::sycl::cl_uchar);
+      break;
+    default:
+      LOG(FATAL) << "Unknown data type " << src_tensor.dtype();
+      break;
   }
-
-  inline void* GetBase(Tensor* dst) { return DMAHelper::base(dst); }
-
+#undef COPY_WITH_TYPE
 }
+}  // namespace tensorflow
 
-#endif // TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_UTIL_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_UTIL_H_
diff --git a/tensorflow/core/debug/debug_gateway.cc b/tensorflow/core/debug/debug_gateway.cc
index 2aaed9563a6..616ced3d0f3 100644
--- a/tensorflow/core/debug/debug_gateway.cc
+++ b/tensorflow/core/debug/debug_gateway.cc
@@ -86,7 +86,7 @@ void DebugGateway::CopyTensor(const string& node_name, const int output_slot,
     // Determine if the tensor is on device (GPU) or host (CPU).
     // The second part of the check is necessary because even an OpKernel on
     // may have output tensors allocated on CPU.
-    if ((device->name().find("gpu:") != string::npos || device->name().find("SYCL:") != string::npos) &&
+    if ((device->name().find("GPU:") != string::npos || device->name().find("SYCL:") != string::npos) &&
         !ctx->output_alloc_attr(output_slot).on_host()) {
       // GPU tensors: Copy it to host (CPU).
       DeviceContext* device_ctxt = ctx->op_device_context();
diff --git a/tensorflow/core/debug/debug_gateway_test.cc b/tensorflow/core/debug/debug_gateway_test.cc
index f25d91a3c27..9a74a4bb4cf 100644
--- a/tensorflow/core/debug/debug_gateway_test.cc
+++ b/tensorflow/core/debug/debug_gateway_test.cc
@@ -47,7 +47,7 @@ class SessionDebugMinusAXTest : public ::testing::Test {
     Graph graph(OpRegistry::Global());
 
 #if GOOGLE_CUDA
-    const string kDeviceName = "/job:localhost/replica:0/task:0/gpu:0";
+    const string kDeviceName = "/job:localhost/replica:0/task:0/device:GPU:0";
 #elif defined(TENSORFLOW_USE_SYCL)
     const string kDeviceName = "/job:localhost/replica:0/task:0/device:SYCL:0";
 #else
@@ -505,7 +505,7 @@ class SessionDebugOutputSlotWithoutOngoingEdgeTest : public ::testing::Test {
     Graph graph(OpRegistry::Global());
 
 #if GOOGLE_CUDA
-    const string kDeviceName = "/job:localhost/replica:0/task:0/gpu:0";
+    const string kDeviceName = "/job:localhost/replica:0/task:0/device:GPU:0";
 #elif defined(TENSORFLOW_USE_SYCL)
     const string kDeviceName = "/job:localhost/replica:0/task:0/device:SYCL:0";
 #else
@@ -607,7 +607,7 @@ class SessionDebugVariableTest : public ::testing::Test {
     Graph graph(OpRegistry::Global());
 
 #if GOOGLE_CUDA
-    const string kDeviceName = "/job:localhost/replica:0/task:0/gpu:0";
+    const string kDeviceName = "/job:localhost/replica:0/task:0/device:GPU:0";
 #elif defined(TENSORFLOW_USE_SYCL)
     const string kDeviceName = "/job:localhost/replica:0/task:0/device:SYCL:0";
 #else
@@ -879,7 +879,7 @@ class SessionDebugGPUSwitchTest : public ::testing::Test {
     Graph graph(OpRegistry::Global());
 
 #ifdef GOOGLE_CUDA
-    const string kDeviceName = "/job:localhost/replica:0/task:0/gpu:0";
+    const string kDeviceName = "/job:localhost/replica:0/task:0/device:GPU:0";
 #elif TENSORFLOW_USE_SYCL
     const string kDeviceName = "/job:localhost/replica:0/task:0/device:SYCL:0";
 #endif
diff --git a/tensorflow/core/debug/debug_io_utils_test.cc b/tensorflow/core/debug/debug_io_utils_test.cc
index eee9d3f97e7..c0bb65e7f45 100644
--- a/tensorflow/core/debug/debug_io_utils_test.cc
+++ b/tensorflow/core/debug/debug_io_utils_test.cc
@@ -53,14 +53,14 @@ class DebugIOUtilsTest : public ::testing::Test {
 };
 
 TEST_F(DebugIOUtilsTest, ConstructDebugNodeKey) {
-  DebugNodeKey debug_node_key("/job:worker/replica:1/task:0/gpu:2",
+  DebugNodeKey debug_node_key("/job:worker/replica:1/task:0/device:GPU:2",
                               "hidden_1/MatMul", 0, "DebugIdentity");
-  EXPECT_EQ("/job:worker/replica:1/task:0/gpu:2", debug_node_key.device_name);
+  EXPECT_EQ("/job:worker/replica:1/task:0/device:GPU:2", debug_node_key.device_name);
   EXPECT_EQ("hidden_1/MatMul", debug_node_key.node_name);
   EXPECT_EQ(0, debug_node_key.output_slot);
   EXPECT_EQ("DebugIdentity", debug_node_key.debug_op);
   EXPECT_EQ("hidden_1/MatMul:0:DebugIdentity", debug_node_key.debug_node_name);
-  EXPECT_EQ("_tfdbg_device_,job_worker,replica_1,task_0,gpu_2",
+  EXPECT_EQ("_tfdbg_device_,job_worker,replica_1,task_0,device_GPU_2",
             debug_node_key.device_path);
 }
 
diff --git a/tensorflow/core/distributed_runtime/executor_test.cc b/tensorflow/core/distributed_runtime/executor_test.cc
index 1a4980a61b2..5b115f9a4d4 100644
--- a/tensorflow/core/distributed_runtime/executor_test.cc
+++ b/tensorflow/core/distributed_runtime/executor_test.cc
@@ -140,7 +140,7 @@ Rendezvous::ParsedKey Key(const string& sender, const uint64 incarnation,
 }
 
 #define ALICE "/job:j/replica:0/task:0/cpu:0"
-#define BOB "/job:j/replica:0/task:0/gpu:0"
+#define BOB "/job:j/replica:0/task:0/device:GPU:0"
 
 TEST_F(ExecutorTest, SimpleAdd) {
   // c = a + b
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_channel_test.cc b/tensorflow/core/distributed_runtime/rpc/grpc_channel_test.cc
index c975563a21f..a17acc85b38 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_channel_test.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_channel_test.cc
@@ -31,9 +31,9 @@ TEST(GrpcChannelTest, IsSameAddressSpace) {
   EXPECT_TRUE(IsSameAddrSp("/job:mnist/replica:10/task:10/cpu:0",
                            "/job:mnist/replica:10/task:10/cpu:1"));
   EXPECT_TRUE(IsSameAddrSp("/job:mnist/replica:10/task:10/cpu:0",
-                           "/job:mnist/replica:10/task:10/gpu:2"));
+                           "/job:mnist/replica:10/task:10/device:GPU:2"));
   EXPECT_TRUE(IsSameAddrSp("/job:mnist/replica:10/task:10",
-                           "/job:mnist/replica:10/task:10/gpu:2"));
+                           "/job:mnist/replica:10/task:10/device:GPU:2"));
   EXPECT_TRUE(IsSameAddrSp("/job:mnist/replica:10/task:10/cpu:1",
                            "/job:mnist/replica:10/task:10"));
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
index 9ee471b0761..a94f75418eb 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
@@ -129,28 +129,14 @@ class GrpcRemoteWorker : public WorkerInterface {
                        TensorResponse* response, StatusCallback done) override {
     VLOG(1) << "RecvTensorAsync req: " << request->DebugString();
     int64 start_usec = Env::Default()->NowMicros();
-    // Don't propagate dma_ok over gRPC.
-    RecvTensorRequest* req_copy = nullptr;
-    if (request->dma_ok()) {
-      req_copy = new RecvTensorRequest;
-      *req_copy = *request;
-      req_copy->set_dma_ok(false);
-    }
     // Type-specialized logging for this method.
     bool logging_active = logger_->LoggingActive() || VLOG_IS_ON(2);
     StatusCallback wrapper_done;
     const StatusCallback* cb_to_use;
-    if (!logging_active && req_copy == nullptr) {
+    if (!logging_active) {
       cb_to_use = &done;  // No additional work to do, so just use done directly
-    } else if (!logging_active) {
-      wrapper_done = [req_copy, done](Status s) {
-        delete req_copy;
-        done(s);
-      };
-      cb_to_use = &wrapper_done;
     } else {
-      wrapper_done = [this, request, req_copy, response, done,
-                      start_usec](Status s) {
+      wrapper_done = [this, request, response, done, start_usec](Status s) {
         if (logger_->LoggingActive()) {
           int64 end_usec = Env::Default()->NowMicros();
           int64 step_id = request->step_id();
@@ -189,14 +175,12 @@ class GrpcRemoteWorker : public WorkerInterface {
         }
         VLOG(2) << "done callback, req: " << request->DebugString()
                 << " response " << response->metadata().DebugString();
-        delete req_copy;
         done(s);
       };
       cb_to_use = &wrapper_done;
     }
 
-    IssueRequest(req_copy ? req_copy : request, response, recvtensor_,
-                 *cb_to_use, call_opts);
+    IssueRequest(request, response, recvtensor_, *cb_to_use, call_opts);
   }
 
   void LoggingAsync(const LoggingRequest* request, LoggingResponse* response,
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
index 3867dd1f4d0..4883e503e69 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
@@ -105,7 +105,8 @@ GrpcServer::~GrpcServer() {
 
 Status GrpcServer::Init(
     ServiceInitFunction service_func,
-    const RendezvousMgrCreationFunction& rendezvous_mgr_func) {
+    const RendezvousMgrCreationFunction& rendezvous_mgr_func,
+    const WorkerCreationFunction& worker_func) {
   mutex_lock l(mu_);
   CHECK_EQ(state_, NEW);
   master_env_.env = env_;
@@ -183,7 +184,8 @@ Status GrpcServer::Init(
   master_impl_ = CreateMaster(&master_env_);
   master_service_ = NewGrpcMasterService(
       master_impl_.get(), config.operation_timeout_in_ms(), &builder);
-  worker_impl_ = NewGrpcWorker(&worker_env_);
+  worker_impl_ =
+      worker_func ? worker_func(&worker_env_) : NewGrpcWorker(&worker_env_);
   worker_service_ =
       NewGrpcWorkerService(worker_impl_.get(), &builder).release();
   // extra service:
@@ -239,7 +241,13 @@ Status GrpcServer::Init(
   return Status::OK();
 }
 
-Status GrpcServer::Init() { return Init(nullptr, nullptr); }
+Status GrpcServer::Init(
+    ServiceInitFunction service_func,
+    const RendezvousMgrCreationFunction& rendezvous_mgr_func) {
+  return Init(service_func, rendezvous_mgr_func, nullptr);
+}
+
+Status GrpcServer::Init() { return Init(nullptr, nullptr, nullptr); }
 
 Status GrpcServer::ParseChannelSpec(const WorkerCacheFactoryOptions& options,
                                     GrpcChannelSpec* channel_spec) {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
index 7b54bb84c88..c3f513d4926 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
@@ -45,6 +45,10 @@ typedef std::function<RendezvousMgrInterface*(const WorkerEnv*)>
 typedef std::function<void(const WorkerEnv*, ::grpc::ServerBuilder*)>
     ServiceInitFunction;
 
+// Function that creates a gRPC-based worker implementation.
+typedef std::function<std::unique_ptr<GrpcWorker>(WorkerEnv*)>
+    WorkerCreationFunction;
+
 class GrpcServer : public ServerInterface {
  protected:
   GrpcServer(const ServerDef& server_def, Env* env);
@@ -64,6 +68,10 @@ class GrpcServer : public ServerInterface {
   const string target() const override;
 
  protected:
+  Status Init(ServiceInitFunction service_func,
+              const RendezvousMgrCreationFunction& rendezvous_mgr_func,
+              const WorkerCreationFunction& worker_func);
+
   Status Init(ServiceInitFunction service_func,
               const RendezvousMgrCreationFunction& rendezvous_mgr_func);
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
index a3b523943f2..4ee5ae09017 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
@@ -347,32 +347,25 @@ void GrpcWorker::GrpcRecvTensorAsync(CallOptions* opts,
             if (src_dev->tensorflow_gpu_device_info() && (!on_host)) {
 #if GOOGLE_CUDA
               const DeviceContext* send_dev_context = send_args.device_context;
-              RecvTensorResponse* tmp = new RecvTensorResponse;
-              tmp->set_is_dead(is_dead);
+              AllocatorAttributes alloc_attrs;
+              alloc_attrs.set_gpu_compatible(true);
+              alloc_attrs.set_on_host(true);
+              Allocator* alloc = src_dev->GetAllocator(alloc_attrs);
+              Tensor* copy = new Tensor(alloc, val.dtype(), val.shape());
               CHECK(send_dev_context)
                   << "send dev name: " << src_dev->name()
                   << " gpu_info: " << src_dev->tensorflow_gpu_device_info();
-              // "val" is on a GPU. Uses GPUUtil to fill the response proto.
-              StatusCallback response_ready = [response, done,
-                                               tmp](const Status& s) {
+              // "val" is on a GPU. Uses GPUUtil to fill the copy on host.
+              StatusCallback copy_ready = [response, done, copy,
+                                           is_dead](const Status& s) {
                 // The value is now ready to be returned on the wire.
-                tmp->set_send_start_micros(Env::Default()->NowMicros());
-
-                grpc::EncodeRecvTensorResponseToByteBuffer(*tmp, response);
+                grpc::EncodeTensorToByteBuffer(is_dead, *copy, response);
                 done(s);
-                delete tmp;
+                delete copy;
               };
 
-              // TODO (jeff,sanjay,mrry): Avoid copy on GPU path by
-              // modifying GPUUtil::SetProtoFromGPU to accept a
-              // ::grpc::ByteBuffer to serialize to, rather than
-              // encoding into a protocol buffer and then
-              // serializing that (i.e. figure out how to use
-              // EncodeTensorToByteBuffer on this path rather than
-              // EncodeRecvTensorResponseToByteBuffer)
-              GPUUtil::SetProtoFromGPU(val, src_dev, send_dev_context,
-                                       tmp->mutable_tensor(), is_dead,
-                                       response_ready);
+              GPUUtil::CopyGPUTensorToCPU(src_dev, send_dev_context, &val, copy,
+                                          copy_ready);
 #else
               done(errors::Internal("No GPU device in process"));
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
index f6cf0f9c7ad..64d7c986daf 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
@@ -34,8 +34,10 @@ class GrpcWorker : public Worker {
   GrpcWorker(WorkerEnv* env);
 
   // Specialized version of RecvTensor for gRPC, which avoids a copy.
-  void GrpcRecvTensorAsync(CallOptions* opts, const RecvTensorRequest* request,
-                           ::grpc::ByteBuffer* response, StatusCallback done);
+  virtual void GrpcRecvTensorAsync(CallOptions* opts,
+                                   const RecvTensorRequest* request,
+                                   ::grpc::ByteBuffer* response,
+                                   StatusCallback done);
 
   WorkerEnv* env();
 };
diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc
index 9df5cbdec06..bd5d6e4af4e 100644
--- a/tensorflow/core/framework/common_shape_fns.cc
+++ b/tensorflow/core/framework/common_shape_fns.cc
@@ -673,6 +673,124 @@ Status MaxPoolShape(shape_inference::InferenceContext* c) {
   return Status::OK();
 }
 
+// Shape function for MaxPoolV2-like ops, whose window and strides come either
+// from the "ksize"/"strides" attrs or from the node's last two inputs.
+// `num_inputs` is the input count of the variant that passes them as inputs;
+// a node with two fewer inputs is read through its attrs instead. If the
+// value of either input is unavailable, the output shape is left unknown.
+Status MaxPoolV2Shape(shape_inference::InferenceContext* c, int num_inputs) {
+  ShapeHandle input_shape;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape));
+
+  string data_format;
+  Status s = c->GetAttr("data_format", &data_format);
+
+  std::vector<int32> kernel_sizes;
+  std::vector<int32> strides;
+
+  if (c->num_inputs() + 2 == num_inputs) {
+    TF_RETURN_IF_ERROR(c->GetAttr("ksize", &kernel_sizes));
+
+    TF_RETURN_IF_ERROR(c->GetAttr("strides", &strides));
+  } else {
+    // Verify shape of ksize and strides input.
+    ShapeHandle size;
+    DimensionHandle unused;
+    TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 2), 1, &size));
+    TF_RETURN_IF_ERROR(c->WithValue(c->Dim(size, 0), 4, &unused));
+    TF_RETURN_IF_ERROR(c->WithRank(c->input(c->num_inputs() - 1), 1, &size));
+    TF_RETURN_IF_ERROR(c->WithValue(c->Dim(size, 0), 4, &unused));
+
+    const Tensor* kernel_sizes_tensor = c->input_tensor(c->num_inputs() - 2);
+    if (kernel_sizes_tensor == nullptr) {
+      c->set_output(0, c->UnknownShape());
+      return Status::OK();
+    }
+    kernel_sizes.resize(kernel_sizes_tensor->shape().num_elements());
+    auto kernel_sizes_vec = kernel_sizes_tensor->flat<int32>();
+    std::copy_n(&kernel_sizes_vec(0), kernel_sizes.size(), kernel_sizes.begin());
+
+    const Tensor* strides_tensor = c->input_tensor(c->num_inputs() - 1);
+    if (strides_tensor == nullptr) {
+      c->set_output(0, c->UnknownShape());
+      return Status::OK();
+    }
+    strides.resize(strides_tensor->shape().num_elements());
+    auto strides_vec = strides_tensor->flat<int32>();
+    std::copy_n(&strides_vec(0), strides.size(), strides.begin());
+  }
+
+  if (strides.size() != 4) {
+    return errors::InvalidArgument(
+        "MaxPool requires the stride attribute to contain 4 values, but "
+        "got: ",
+        strides.size());
+  }
+  if (kernel_sizes.size() != 4) {
+    return errors::InvalidArgument(
+        "MaxPool requires the ksize attribute to contain 4 values, but got: ",
+        kernel_sizes.size());
+  }
+
+  int32 stride_rows, stride_cols, stride_depth;
+  int32 kernel_rows, kernel_cols, kernel_depth;
+
+  // Decide the layout once so the input and output conversions below agree;
+  // if the data_format attr cannot be read, the NHWC path is used.
+  const bool is_nchw = s.ok() && data_format == "NCHW";
+  if (is_nchw) {
+    // Canonicalize input shape to NHWC so the shape inference code below can
+    // process it.
+    auto dim = [&](char dimension) {
+      return c->Dim(input_shape, GetTensorDimIndex<2>(FORMAT_NCHW, dimension));
+    };
+    input_shape = c->MakeShape({{dim('N'), dim('0'), dim('1'), dim('C')}});
+    stride_depth = strides[1];
+    stride_rows = strides[2];
+    stride_cols = strides[3];
+    kernel_depth = kernel_sizes[1];
+    kernel_rows = kernel_sizes[2];
+    kernel_cols = kernel_sizes[3];
+  } else {
+    stride_rows = strides[1];
+    stride_cols = strides[2];
+    stride_depth = strides[3];
+    kernel_rows = kernel_sizes[1];
+    kernel_cols = kernel_sizes[2];
+    kernel_depth = kernel_sizes[3];
+  }
+
+  DimensionHandle batch_size_dim = c->Dim(input_shape, 0);
+  DimensionHandle in_rows_dim = c->Dim(input_shape, 1);
+  DimensionHandle in_cols_dim = c->Dim(input_shape, 2);
+  DimensionHandle in_depth_dim = c->Dim(input_shape, 3);
+
+  Padding padding;
+  TF_RETURN_IF_ERROR(c->GetAttr("padding", &padding));
+
+  ShapeHandle output_shape;
+  DimensionHandle output_rows, output_cols, output_depth;
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDims(
+      c, in_rows_dim, kernel_rows, stride_rows, padding, &output_rows));
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDims(
+      c, in_cols_dim, kernel_cols, stride_cols, padding, &output_cols));
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDims(
+      c, in_depth_dim, kernel_depth, stride_depth, padding, &output_depth));
+
+  output_shape =
+      c->MakeShape({batch_size_dim, output_rows, output_cols, output_depth});
+  if (is_nchw) {
+    // Convert output shape back to expected NCHW data format.
+    auto dim = [&](char dimension) {
+      return c->Dim(output_shape, GetTensorDimIndex<2>(FORMAT_NHWC, dimension));
+    };
+    output_shape = c->MakeShape({{dim('N'), dim('C'), dim('0'), dim('1')}});
+  }
+
+  c->set_output(0, output_shape);
+  return Status::OK();
+}
+
 Status Pool3DShape(shape_inference::InferenceContext* c) {
   ShapeHandle input_shape;
   TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 5, &input_shape));
diff --git a/tensorflow/core/framework/common_shape_fns.h b/tensorflow/core/framework/common_shape_fns.h
index 73b915652f6..fb79df07a4f 100644
--- a/tensorflow/core/framework/common_shape_fns.h
+++ b/tensorflow/core/framework/common_shape_fns.h
@@ -179,6 +179,9 @@ Status AvgPoolShape(shape_inference::InferenceContext* c);
 // Shape function for MaxPool-like operations.
 Status MaxPoolShape(shape_inference::InferenceContext* c);
 
+// Shape function for MaxPoolV2-like operations.
+Status MaxPoolV2Shape(shape_inference::InferenceContext* c, int num_inputs);
+
 // Shape function for 3D Pooling operations.
 Status Pool3DShape(shape_inference::InferenceContext* c);
 
diff --git a/tensorflow/core/framework/node_def.proto b/tensorflow/core/framework/node_def.proto
index d145fac8c14..53aa03108ab 100644
--- a/tensorflow/core/framework/node_def.proto
+++ b/tensorflow/core/framework/node_def.proto
@@ -38,8 +38,8 @@ message NodeDef {
   //              | ( ("gpu" | "cpu") ":" ([1-9][0-9]* | "*") )
   //
   // Valid values for this string include:
-  // * "/job:worker/replica:0/task:1/gpu:3"  (full specification)
-  // * "/job:worker/gpu:3"                   (partial specification)
+  // * "/job:worker/replica:0/task:1/device:GPU:3"  (full specification)
+  // * "/job:worker/device:GPU:3"                   (partial specification)
   // * ""                                    (no specification)
   //
   // If the constraints do not resolve to a single device (or if this
diff --git a/tensorflow/core/framework/rendezvous_test.cc b/tensorflow/core/framework/rendezvous_test.cc
index fe37b16bb6c..32b8ad784d5 100644
--- a/tensorflow/core/framework/rendezvous_test.cc
+++ b/tensorflow/core/framework/rendezvous_test.cc
@@ -39,11 +39,11 @@ namespace {
 TEST(RendezvousTest, Key) {
   const string key = Rendezvous::CreateKey(
       "/job:mnist/replica:1/task:2/CPU:0", 7890,
-      "/job:mnist/replica:1/task:2/GPU:0", "var0", FrameAndIter(0, 0));
+      "/job:mnist/replica:1/task:2/device:GPU:0", "var0", FrameAndIter(0, 0));
   EXPECT_EQ(key,
             "/job:mnist/replica:1/task:2/CPU:0;"
             "0000000000001ed2;"  // 7890 = 0x1ed2
-            "/job:mnist/replica:1/task:2/GPU:0;"
+            "/job:mnist/replica:1/task:2/device:GPU:0;"
             "var0;"
             "0:0");
   Rendezvous::ParsedKey parsed;
@@ -51,12 +51,12 @@ TEST(RendezvousTest, Key) {
   EXPECT_EQ(parsed.src_device, "/job:mnist/replica:1/task:2/CPU:0");
   EXPECT_EQ(parsed.src_incarnation, 7890);
   EXPECT_EQ(parsed.src.type, "CPU");
-  EXPECT_EQ(parsed.dst_device, "/job:mnist/replica:1/task:2/GPU:0");
+  EXPECT_EQ(parsed.dst_device, "/job:mnist/replica:1/task:2/device:GPU:0");
   EXPECT_EQ(parsed.dst.type, "GPU");
 
   EXPECT_FALSE(Rendezvous::ParseKey("foo;bar;baz", &parsed).ok());
   EXPECT_FALSE(Rendezvous::ParseKey("/job:mnist/replica:1/task:2/CPU:0;"
-                                    "/job:mnist/replica:1/task:2/GPU:0;",
+                                    "/job:mnist/replica:1/task:2/device:GPU:0;",
                                     &parsed)
                    .ok());
   EXPECT_FALSE(
@@ -99,7 +99,7 @@ string V(const Tensor& tensor) {
 
 Rendezvous::ParsedKey MakeKey(const string& name) {
   string s = Rendezvous::CreateKey("/job:mnist/replica:1/task:2/CPU:0", 7890,
-                                   "/job:mnist/replica:1/task:2/GPU:0", name,
+                                   "/job:mnist/replica:1/task:2/device:GPU:0", name,
                                    FrameAndIter(0, 0));
   Rendezvous::ParsedKey k;
   TF_EXPECT_OK(Rendezvous::ParseKey(s, &k));
diff --git a/tensorflow/core/framework/tensor_slice.h b/tensorflow/core/framework/tensor_slice.h
index 3a00e523c4d..6019737342a 100644
--- a/tensorflow/core/framework/tensor_slice.h
+++ b/tensorflow/core/framework/tensor_slice.h
@@ -126,7 +126,7 @@ class TensorSlice {
   // Interaction with other TensorSlices.
 
   // Compute the intersection with another slice and if "result" is not
-  // nullptr, store the results in *result; returns true is there is any real
+  // nullptr, store the results in *result; returns true if there is any real
   // intersection.
   bool Intersect(const TensorSlice& other, TensorSlice* result) const;
   // A short hand.
diff --git a/tensorflow/core/graph/graph_constructor_test.cc b/tensorflow/core/graph/graph_constructor_test.cc
index f222b9b5f1d..6be8e36ab6a 100644
--- a/tensorflow/core/graph/graph_constructor_test.cc
+++ b/tensorflow/core/graph/graph_constructor_test.cc
@@ -2325,7 +2325,93 @@ TEST_F(GraphConstructorTest, ImportGraphDefProvidedShapeRefinerVersions) {
   ImportGraphDefOptions opts;
   // A valid graph at producer version 20, but one
   // that would not import if the graph_def_version were 21.
-  string gdef_ascii = strings::StrCat(R"EOF(
+  string gdef_ascii;
+#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+  gdef_ascii = strings::StrCat(R"EOF(
+node {
+  name: "Sum/input"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 1
+          }
+        }
+        tensor_content: "\000\000\000\001\000\000\000\002"
+      }
+    }
+  }
+}
+node {
+  name: "Sum/reduction_indices"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 1
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\001"
+      }
+    }
+  }
+}
+node {
+  name: "Sum"
+  op: "Sum"
+  input: "Sum/input"
+  input: "Sum/reduction_indices"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+versions {
+  producer: 20
+})EOF");
+
+#else
+  gdef_ascii = strings::StrCat(R"EOF(
 node {
   name: "Sum/input"
   op: "Const"
@@ -2407,7 +2493,7 @@ node {
 versions {
   producer: 20
 })EOF");
-
+#endif
   // Create a shape refiner with the latest TF_GRAPH_DEF_VERSION.
   // Importing the graphdef with an existing refiner should
   // make the refiner inherit the graphdef version from the
@@ -2416,6 +2502,40 @@ versions {
   ExpectOK(gdef_ascii, opts, &refiner);
 
   // Add another node with a higher producer
+#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+  gdef_ascii = strings::StrCat(R"EOF(
+node {
+  name: "RandomConst"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 1
+          }
+        }
+        tensor_content: "\000\000\000\001\000\000\000\002"
+      }
+    }
+  }
+}
+versions {
+  producer: 21
+})EOF");
+
+#else
   gdef_ascii = strings::StrCat(R"EOF(
 node {
   name: "RandomConst"
@@ -2447,6 +2567,7 @@ node {
 versions {
   producer: 21
 })EOF");
+#endif
 
   ExpectOK(gdef_ascii, opts, &refiner);
   // Check that the refiner's graph def version is the lowest of
@@ -2454,6 +2575,40 @@ versions {
   EXPECT_EQ(20, refiner.graph_def_version());
 
   // Add another node with a lower producer
+#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+  gdef_ascii = strings::StrCat(R"EOF(
+node {
+  name: "RandomConst2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 1
+          }
+        }
+        tensor_content: "\000\000\000\001\000\000\000\002"
+      }
+    }
+  }
+}
+versions {
+  producer: 17
+})EOF");
+
+#else
   gdef_ascii = strings::StrCat(R"EOF(
 node {
   name: "RandomConst2"
@@ -2485,6 +2640,7 @@ node {
 versions {
   producer: 17
 })EOF");
+#endif
   ExpectOK(gdef_ascii, opts, &refiner);
 
   // Check that the refiner's graph def version is the lowest of
diff --git a/tensorflow/core/graph/graph_partition_test.cc b/tensorflow/core/graph/graph_partition_test.cc
index 3c12ed2689e..d84c62d4546 100644
--- a/tensorflow/core/graph/graph_partition_test.cc
+++ b/tensorflow/core/graph/graph_partition_test.cc
@@ -50,7 +50,7 @@ extern Status TopologicalSortNodesWithTimePriority(
 
 namespace {
 
-const char gpu_device[] = "/job:a/replica:0/task:0/gpu:0";
+const char gpu_device[] = "/job:a/replica:0/task:0/device:GPU:0";
 
 string SplitByDevice(const Node* node) { return node->assigned_device_name(); }
 
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 625780e7c91..2f9ceaa3bd0 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -477,27 +477,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   static ContextInfo biasaddgrad_matmul_context_;
   static ContextInfo biasaddgrad_conv2dwithbias_context_;
 
-  /// Hash table to maintain nodes visited in the graph.
-  std::unordered_set<const Node*> visited_nodes_;
-
  private:
-  // Check if we rewrote node 'n'
-  //
-  // If we rewrote the node, then the rewritten node will produce
-  // Mkl tensor as output. If we did not rewrite the node, then
-  // we need to insert dummy Mkl node on the input side.
-  //
-  // Returns true if node is rewritten, false otherwise.
-  inline bool IsRewrittenNode(Node* n) const {
-    return visited_nodes_.find(n) != visited_nodes_.end();
-  }
-
-  // Mark the node as rewritten
-  inline void MarkRewrittenNode(Node* n) { visited_nodes_.insert(n); }
-
-  // Clear all visited nodes
-  inline void UnMarkRewrittenNodes() { visited_nodes_.clear(); }
-
   // Is OpDef::ArgDef a list type? It could be N * T or list(type).
   // Refer to opdef.proto for details of list type.
   inline bool ArgIsList(const OpDef::ArgDef& arg) const {
@@ -1087,15 +1067,13 @@ void MklLayoutRewritePass::GetNodeProducingMklTensor(std::unique_ptr<Graph>* g,
   CHECK_NOTNULL(n);
   CHECK_NOTNULL(mkl_node);
   CHECK_NOTNULL(mkl_node_output_slot);
-  if (IsRewrittenNode(n)) {
-    // If we have visited this node and rewritten it, then it will generate
-    // an edge that will receive Mkl tensor from a node.
-    // First, let's assert that this op is Mkl layer.
-    DataType T;
-    TF_CHECK_OK(GetNodeAttr(n->def(), "T", &T));
-    // If this op has been rewritten, then its name must have been same as
-    // Mkl op.
-    CHECK_EQ(mkl_op_registry::IsMklOp(n->type_string(), T), true);
+
+  // If this is an MKL op, then it will create extra output for MKL layout.
+  DataType T;
+  if (GetNodeAttr(n->def(), "T", &T).ok() &&
+      mkl_op_registry::IsMklOp(n->type_string(), T)) {
+    // Node n itself produces the Mkl tensor, so the edge that carries the
+    // Mkl tensor to the consumer originates at n. The
     // output slot number for Mkl tensor would be N+slot number of TensorFlow
     // tensor, where N is total number of TensorFlow tensors.
     *mkl_node = n;
@@ -1801,7 +1779,6 @@ Status MklLayoutRewritePass::MergeNode(std::unique_ptr<Graph>* g, Node* succ,
 
     (*g)->RemoveNode(succ);
     (*g)->RemoveNode(pred);
-    MarkRewrittenNode(new_node);
 
     return Status::OK();
   }
@@ -1932,7 +1909,6 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
 
   // Delete original node and mark new node as rewritten.
   (*g)->RemoveNode(orig_node);
-  MarkRewrittenNode(new_node);
 
   VLOG(1) << "MklLayoutRewritePass: New node:" << new_node->DebugString();
   return Status::OK();
@@ -2062,9 +2038,6 @@ bool MklLayoutRewritePass::RunPass(std::unique_ptr<Graph>* g) {
 
   DumpGraph("After running MklLayoutRewritePass", &**g);
 
-  // Clear marked nodes as the same graph pass may be used multiple times.
-  UnMarkRewrittenNodes();
-
   return result;
 }
 
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc
index efbe2134e0f..482e339802f 100644
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -40,7 +40,7 @@ namespace tensorflow {
 namespace {
 
 const char kCPUDevice[] = "/job:a/replica:0/task:0/cpu:0";
-const char kGPUDevice[] = "/job:a/replica:0/task:0/gpu:0";
+const char kGPUDevice[] = "/job:a/replica:0/task:0/device:GPU:0";
 
 static void InitGraph(const string& s, Graph* graph,
                       const string& device = kCPUDevice) {
diff --git a/tensorflow/core/grappler/clusters/single_machine.cc b/tensorflow/core/grappler/clusters/single_machine.cc
index 3481b2b158d..7139c2444a1 100644
--- a/tensorflow/core/grappler/clusters/single_machine.cc
+++ b/tensorflow/core/grappler/clusters/single_machine.cc
@@ -89,7 +89,7 @@ Status SingleMachine::Provision() {
   VLOG(1) << "Number of GPUs: " << num_gpus_;
   for (int i = 0; i < num_gpus_; ++i) {
     string device_name =
-        strings::StrCat("/job:localhost/replica:0/task:0/gpu:", i);
+        strings::StrCat("/job:localhost/replica:0/task:0/device:GPU:", i);
     VLOG(1) << "Adding GPU device " << device_name;
     devices_[device_name] = GetLocalGPUInfo(i);
   }
@@ -112,10 +112,10 @@ Status SingleMachine::Shutdown() {
   TF_RETURN_IF_ERROR(CloseSession(true /*use_timeout*/));
 
   // Delete the threadpool: this ensures that all the pending closures complete
-  // before we return. Note that if that if TF deadlocked on us, the closures
-  // will never complete, and the call to thread_pool_.reset() will never
-  // return: therefore we need to delete the threadpool with the background
-  // thread. That thread itself will also never complete, so the user should
+  // before we return. Note that if TF deadlocked on us, the closures will
+  // never complete, and the call to thread_pool_.reset() will never return:
+  // therefore we need to delete the threadpool with the background thread.
+  // That thread itself will also never complete, so the user should
   // abort the process to avoid leaking too many resources.
   auto n = std::make_shared<Notification>();
   Env::Default()->SchedClosure([this, n]() {
diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc b/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
index 02156fbf580..d1f3e36aa81 100644
--- a/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
@@ -42,7 +42,7 @@ class AnalyticalCostEstimatorTest : public ::testing::Test {
     gpu_device.set_frequency(1100);
     gpu_device.set_bandwidth(180 * 1024 * 1024);
     (*gpu_device.mutable_environment())["architecture"] = "6";
-    devices["/job:localhost/replica:0/task:0/gpu:0"] = gpu_device;
+    devices["/job:localhost/replica:0/task:0/device:GPU:0"] = gpu_device;
 
     cluster_.reset(new VirtualCluster(devices));
   }
diff --git a/tensorflow/core/grappler/costs/virtual_placer_test.cc b/tensorflow/core/grappler/costs/virtual_placer_test.cc
index 65a03fb5575..a16455cb703 100644
--- a/tensorflow/core/grappler/costs/virtual_placer_test.cc
+++ b/tensorflow/core/grappler/costs/virtual_placer_test.cc
@@ -30,14 +30,14 @@ TEST(VirtualPlacerTest, LocalDevices) {
   devices["/job:localhost/replica:0/task:0/cpu:0"] = cpu_device;
   DeviceProperties gpu_device;
   gpu_device.set_type("GPU");
-  devices["/job:localhost/replica:0/task:0/gpu:0"] = gpu_device;
+  devices["/job:localhost/replica:0/task:0/device:GPU:0"] = gpu_device;
   VirtualCluster cluster(devices);
   VirtualPlacer placer(&cluster);
 
   NodeDef node;
   node.set_op("Conv2D");
   EXPECT_EQ("GPU", placer.get_device(node).type());
-  EXPECT_EQ("/job:localhost/replica:0/task:0/gpu:0",
+  EXPECT_EQ("/job:localhost/replica:0/task:0/device:GPU:0",
             placer.get_canonical_device_name(node));
 
   node.set_device("CPU");
@@ -47,7 +47,7 @@ TEST(VirtualPlacerTest, LocalDevices) {
 
   node.set_device("GPU:0");
   EXPECT_EQ("GPU", placer.get_device(node).type());
-  EXPECT_EQ("/job:localhost/replica:0/task:0/gpu:0",
+  EXPECT_EQ("/job:localhost/replica:0/task:0/device:GPU:0",
             placer.get_canonical_device_name(node));
 }
 
@@ -60,7 +60,7 @@ TEST(VirtualPlacerTest, EmptyJobBecomesLocalhost) {
   devices["/job:localhost/replica:0/task:0/cpu:0"] = cpu_device;
   DeviceProperties gpu_device;
   gpu_device.set_type("GPU");
-  devices["/job:localhost/replica:0/task:0/gpu:0"] = gpu_device;
+  devices["/job:localhost/replica:0/task:0/device:GPU:0"] = gpu_device;
   VirtualCluster cluster(devices);
   VirtualPlacer placer(&cluster);
 
@@ -70,7 +70,7 @@ TEST(VirtualPlacerTest, EmptyJobBecomesLocalhost) {
   EXPECT_EQ("/job:localhost/replica:0/task:0/cpu:0",
             placer.get_canonical_device_name(node));
   node.set_device("/device:GPU:0");
-  EXPECT_EQ("/job:localhost/replica:0/task:0/gpu:0",
+  EXPECT_EQ("/job:localhost/replica:0/task:0/device:GPU:0",
             placer.get_canonical_device_name(node));
 }
 
@@ -113,7 +113,7 @@ TEST(VirtualPlacerTest, RemoteDevices) {
   devices["/job:my_job/replica:0/task:0/cpu:0"] = cpu_device;
   DeviceProperties gpu_device;
   gpu_device.set_type("GPU");
-  devices["/job:my_job/replica:0/task:0/gpu:0"] = gpu_device;
+  devices["/job:my_job/replica:0/task:0/device:GPU:0"] = gpu_device;
   VirtualCluster cluster(devices);
   VirtualPlacer placer(&cluster);
 
@@ -122,7 +122,7 @@ TEST(VirtualPlacerTest, RemoteDevices) {
 
   // Device falls back to GPU.
   EXPECT_EQ("GPU", placer.get_device(node).type());
-  EXPECT_EQ("/job:my_job/replica:0/task:0/gpu:0",
+  EXPECT_EQ("/job:my_job/replica:0/task:0/device:GPU:0",
             placer.get_canonical_device_name(node));
 
   node.set_device("/job:my_job/replica:0/task:0/cpu:0");
@@ -130,27 +130,27 @@ TEST(VirtualPlacerTest, RemoteDevices) {
   EXPECT_EQ("/job:my_job/replica:0/task:0/cpu:0",
             placer.get_canonical_device_name(node));
 
-  node.set_device("/job:my_job/replica:0/task:0/gpu:0");
+  node.set_device("/job:my_job/replica:0/task:0/device:GPU:0");
   EXPECT_EQ("GPU", placer.get_device(node).type());
-  EXPECT_EQ("/job:my_job/replica:0/task:0/gpu:0",
+  EXPECT_EQ("/job:my_job/replica:0/task:0/device:GPU:0",
             placer.get_canonical_device_name(node));
 
   // There is no local cpu available. Device falls back to GPU.
   node.set_device("CPU");
   EXPECT_EQ("GPU", placer.get_device(node).type());
-  EXPECT_EQ("/job:my_job/replica:0/task:0/gpu:0",
+  EXPECT_EQ("/job:my_job/replica:0/task:0/device:GPU:0",
             placer.get_canonical_device_name(node));
 
   node.set_device("GPU:0");
   // There is no local GPU available. Fall back to default GPU.
   EXPECT_EQ("GPU", placer.get_device(node).type());
-  EXPECT_EQ("/job:my_job/replica:0/task:0/gpu:0",
+  EXPECT_EQ("/job:my_job/replica:0/task:0/device:GPU:0",
             placer.get_canonical_device_name(node));
 
   // This isn't a valid name. Fall back to GPU.
   node.set_device("/job:my_job/replica:0/task:0");
   EXPECT_EQ("GPU", placer.get_device(node).type());
-  EXPECT_EQ("/job:my_job/replica:0/task:0/gpu:0",
+  EXPECT_EQ("/job:my_job/replica:0/task:0/device:GPU:0",
             placer.get_canonical_device_name(node));
 }
 
diff --git a/tensorflow/core/grappler/optimizers/model_pruner_test.cc b/tensorflow/core/grappler/optimizers/model_pruner_test.cc
index aea1fcd7c93..ee722f311ed 100644
--- a/tensorflow/core/grappler/optimizers/model_pruner_test.cc
+++ b/tensorflow/core/grappler/optimizers/model_pruner_test.cc
@@ -320,14 +320,14 @@ TEST_F(ModelPrunerTest, PruningPerservesCrossDeviceIdentity) {
   Output c = ops::Const(s.WithOpName("c").WithDevice("/cpu:0"), 0.0f, {10, 10});
 
   // Node i1 should be preserved.
-  Output i1 = ops::Identity(s.WithOpName("i1").WithDevice("/gpu:0"), c);
-  Output a1 = ops::Sqrt(s.WithOpName("a1").WithDevice("/gpu:0"), {i1});
-  Output a2 = ops::Sqrt(s.WithOpName("a2").WithDevice("/gpu:0"), {i1});
+  Output i1 = ops::Identity(s.WithOpName("i1").WithDevice("/device:GPU:0"), c);
+  Output a1 = ops::Sqrt(s.WithOpName("a1").WithDevice("/device:GPU:0"), {i1});
+  Output a2 = ops::Sqrt(s.WithOpName("a2").WithDevice("/device:GPU:0"), {i1});
 
   // Node i2 should be pruned since it resides on the sender's device.
   Output i2 = ops::Identity(s.WithOpName("i2").WithDevice("/cpu:0"), c);
-  Output a3 = ops::Sqrt(s.WithOpName("a3").WithDevice("/gpu:0"), {i2});
-  Output a4 = ops::Sqrt(s.WithOpName("a4").WithDevice("/gpu:0"), {i2});
+  Output a3 = ops::Sqrt(s.WithOpName("a3").WithDevice("/device:GPU:0"), {i2});
+  Output a4 = ops::Sqrt(s.WithOpName("a4").WithDevice("/device:GPU:0"), {i2});
 
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index a5e3a5feea2..05974f5a902 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -103,7 +103,6 @@ tf_kernel_library(
         "strided_slice_op.h",
         "strided_slice_op_impl.h",
         "strided_slice_op_gpu.cu.cc",
-        "slice_op_gpu.cu.cc",
     ],
     deps = [
         ":bounds_check",
diff --git a/tensorflow/core/kernels/bias_op.cc b/tensorflow/core/kernels/bias_op.cc
index 10f5d4ce85d..b3a77d1caad 100644
--- a/tensorflow/core/kernels/bias_op.cc
+++ b/tensorflow/core/kernels/bias_op.cc
@@ -35,14 +35,13 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T>
-class BiasOp;
-
-template <typename T>
-class BiasOp<CPUDevice, T> : public BinaryOp<T> {
+class BiasOp : public BinaryOp<T> {
  public:
-  typedef CPUDevice Device;
   explicit BiasOp(OpKernelConstruction* context) : BinaryOp<T>(context) {
     string data_format;
     if (context->GetAttr("data_format", &data_format).ok()) {
@@ -52,7 +51,8 @@ class BiasOp<CPUDevice, T> : public BinaryOp<T> {
       data_format_ = FORMAT_NHWC;
     }
     OP_REQUIRES(context, data_format_ == FORMAT_NHWC,
-                errors::InvalidArgument("CPU BiasOp only supports NHWC."));
+                errors::InvalidArgument(context->device()->attributes().name() +
+                                        " BiasOp only supports NHWC."));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -122,6 +122,21 @@ class BiasOp<CPUDevice, T> : public BinaryOp<T> {
 TF_CALL_NUMBER_TYPES(REGISTER_KERNEL);
 #undef REGISTER_KERNEL
 
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_KERNEL(type)                                          \
+  REGISTER_KERNEL_BUILDER(                                             \
+      Name("BiasAdd").Device(DEVICE_SYCL).TypeConstraint<type>("T"),   \
+      BiasOp<SYCLDevice, type>);                                       \
+  REGISTER_KERNEL_BUILDER(                                             \
+      Name("BiasAddV1").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
+      BiasOp<SYCLDevice, type>);
+
+TF_CALL_INTEGRAL_TYPES(REGISTER_KERNEL);
+REGISTER_KERNEL(float);
+REGISTER_KERNEL(double);
+#undef REGISTER_KERNEL
+#endif  // TENSORFLOW_USE_SYCL
+
 namespace {
 
 void GetBiasValueDims(const Tensor& value_tensor, TensorFormat data_format,
@@ -165,12 +180,8 @@ struct AccumulatorType<Eigen::half> {
 }  // namespace
 
 template <typename Device, typename T>
-class BiasGradOp;
-
-template <typename T>
-class BiasGradOp<CPUDevice, T> : public OpKernel {
+class BiasGradOp : public OpKernel {
  public:
-  typedef CPUDevice Device;
   explicit BiasGradOp(OpKernelConstruction* context) : OpKernel(context) {
     string data_format;
     if (context->GetAttr("data_format", &data_format).ok()) {
@@ -180,7 +191,8 @@ class BiasGradOp<CPUDevice, T> : public OpKernel {
       data_format_ = FORMAT_NHWC;
     }
     OP_REQUIRES(context, data_format_ == FORMAT_NHWC,
-                errors::InvalidArgument("CPU BiasGradOp only supports NHWC."));
+                errors::InvalidArgument(context->device()->attributes().name() +
+                                        " BiasGradOp only supports NHWC."));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -192,8 +204,9 @@ class BiasGradOp<CPUDevice, T> : public OpKernel {
                                         output_backprop.shape().DebugString()));
 
     OP_REQUIRES(
-        context, FastBoundsCheck(output_backprop.NumElements(),
-                                 std::numeric_limits<int32>::max()),
+        context,
+        FastBoundsCheck(output_backprop.NumElements(),
+                        std::numeric_limits<int32>::max()),
         errors::InvalidArgument("BiasGrad requires tensor size <= int32 max"));
 
     int32 batch, height, width, channel;
@@ -215,7 +228,7 @@ class BiasGradOp<CPUDevice, T> : public OpKernel {
 #else
       Eigen::array<int, 1> reduction_axis = {0};
 #endif
-      output->template flat<T>().device(context->eigen_device<CPUDevice>()) =
+      output->template flat<T>().device(context->eigen_device<Device>()) =
           output_backprop.flat<T>()
               .template cast<typename AccumulatorType<T>::type>()
               .reshape(two_dims)
@@ -237,6 +250,18 @@ class BiasGradOp<CPUDevice, T> : public OpKernel {
 TF_CALL_NUMBER_TYPES(REGISTER_KERNEL);
 #undef REGISTER_KERNEL
 
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_KERNEL(type)                                            \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("BiasAddGrad").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
+      BiasGradOp<SYCLDevice, type>);
+
+TF_CALL_INTEGRAL_TYPES(REGISTER_KERNEL);
+REGISTER_KERNEL(float);
+REGISTER_KERNEL(double);
+#undef REGISTER_KERNEL
+#endif  // TENSORFLOW_USE_SYCL
+
 #if GOOGLE_CUDA
 template <typename T>
 class BiasOp<GPUDevice, T> : public BinaryOp<T> {
diff --git a/tensorflow/core/kernels/concat_lib_gpu.cc b/tensorflow/core/kernels/concat_lib_gpu.cc
index 5159cdaa6ec..319ead49efd 100644
--- a/tensorflow/core/kernels/concat_lib_gpu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu.cc
@@ -117,6 +117,7 @@ TF_CALL_complex64(REGISTER);
 TF_CALL_complex128(REGISTER);
 TF_CALL_int64(REGISTER);
 REGISTER(bfloat16);
+REGISTER(bool);
 
 #undef REGISTER
 
diff --git a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
index f971637d5db..0f7adaf24a8 100644
--- a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
@@ -203,24 +203,28 @@ TF_CALL_complex64(REGISTER_GPUCONCAT32);
 TF_CALL_complex128(REGISTER_GPUCONCAT32);
 TF_CALL_int64(REGISTER_GPUCONCAT32);
 REGISTER_GPUCONCAT32(bfloat16);
+REGISTER_GPUCONCAT32(bool);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPUCONCAT64);
 TF_CALL_complex64(REGISTER_GPUCONCAT64);
 TF_CALL_complex128(REGISTER_GPUCONCAT64);
 TF_CALL_int64(REGISTER_GPUCONCAT64);
 REGISTER_GPUCONCAT64(bfloat16);
+REGISTER_GPUCONCAT64(bool);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU32);
 TF_CALL_complex64(REGISTER_GPU32);
 TF_CALL_complex128(REGISTER_GPU32);
 TF_CALL_int64(REGISTER_GPU32);
 REGISTER_GPU32(bfloat16);
+REGISTER_GPU32(bool);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU64);
 TF_CALL_complex64(REGISTER_GPU64);
 TF_CALL_complex128(REGISTER_GPU64);
 TF_CALL_int64(REGISTER_GPU64);
 REGISTER_GPU64(bfloat16);
+REGISTER_GPU64(bool);
 
 #undef REGISTER_GPUCONCAT32
 #undef REGISTER_GPUCONCAT64
diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc
index 01a744dc7ec..8e480aa9952 100644
--- a/tensorflow/core/kernels/concat_op.cc
+++ b/tensorflow/core/kernels/concat_op.cc
@@ -196,6 +196,7 @@ REGISTER_GPU(bfloat16);
 TF_CALL_complex64(REGISTER_GPU);
 TF_CALL_complex128(REGISTER_GPU);
 TF_CALL_int64(REGISTER_GPU);
+REGISTER_GPU(bool);
 #undef REGISTER_GPU
 
 // A special GPU kernel for int32.
diff --git a/tensorflow/core/kernels/debug_ops.h b/tensorflow/core/kernels/debug_ops.h
index ef12e2e42cb..2c210531211 100644
--- a/tensorflow/core/kernels/debug_ops.h
+++ b/tensorflow/core/kernels/debug_ops.h
@@ -94,12 +94,7 @@ class CopyOp : public OpKernel {
                             !context->input_alloc_attr(0).on_host();
 
       if (off_host_input) {
-        auto size = src_tensor.NumElements() * sizeof(src_tensor.dtype());
-        auto dst_ptr = GetBase(copied_tensor);
-        auto src_ptr = GetBase(&src_tensor);
-        typedef decltype(src_tensor.dtype()) ttype;
-        context->eigen_sycl_device().memcpy(
-            dst_ptr, static_cast<const ttype*>(src_ptr), size);
+        SYCLmemcpy(context->eigen_sycl_device(), src_tensor, copied_tensor);
       } else {
         *copied_tensor = tensor::DeepCopy(src_tensor);
       }
diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc
index 6cb56797bff..8d825c13d76 100644
--- a/tensorflow/core/kernels/maxpooling_op.cc
+++ b/tensorflow/core/kernels/maxpooling_op.cc
@@ -208,22 +208,26 @@ class MaxPoolingGradOp : public OpKernel {
         errors::InvalidArgument("Default MaxPoolingGradOp only supports NHWC ",
                                 "on device type ",
                                 DeviceTypeString(context->device_type())));
-    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
-    OP_REQUIRES(context, ksize_.size() == 4,
-                errors::InvalidArgument("Sliding window ksize field must "
-                                        "specify 4 dimensions"));
-    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
-    OP_REQUIRES(context, stride_.size() == 4,
-                errors::InvalidArgument("Sliding window strides field must "
-                                        "specify 4 dimensions"));
+
+    if (context->num_inputs() == 3) {
+      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+      OP_REQUIRES(context, ksize_.size() == 4,
+                  errors::InvalidArgument("Sliding window ksize field must "
+                                          "specify 4 dimensions"));
+      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+      OP_REQUIRES(context, stride_.size() == 4,
+                  errors::InvalidArgument("Sliding window strides field must "
+                                          "specify 4 dimensions"));
+      OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+                  errors::Unimplemented(
+                      "Pooling is not yet supported on the batch dimension."));
+      OP_REQUIRES(
+          context, ksize_[3] == 1 && stride_[3] == 1,
+          errors::Unimplemented(
+              "MaxPoolingGrad is not yet supported on the depth dimension."));
+    }
+
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
-    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
-                errors::Unimplemented(
-                    "Pooling is not yet supported on the batch dimension."));
-    OP_REQUIRES(
-        context, ksize_[3] == 1 && stride_[3] == 1,
-        errors::Unimplemented(
-            "MaxPoolingGrad is not yet supported on the depth dimension."));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -250,8 +254,35 @@ class MaxPoolingGradOp : public OpKernel {
     OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<int64>::v(),
                                                    tensor_out.shape(),
                                                    &tensor_out_arg_max));
+    std::vector<int32> ksize = ksize_;
+    std::vector<int32> stride = stride_;
+    if (context->num_inputs() == 5) {
+      const Tensor& tensor_ksize = context->input(3);
+      auto value_ksize = tensor_ksize.flat<int32>();
+      ksize.resize(tensor_ksize.shape().num_elements());
+      std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());
 
-    PoolParameters params{context,  ksize_,      stride_,
+      const Tensor& tensor_stride = context->input(4);
+      auto value_stride = tensor_stride.flat<int32>();
+      stride.resize(tensor_stride.shape().num_elements());
+      std::copy_n(&value_stride(0), stride.size(), stride.begin());
+    }
+
+    OP_REQUIRES(context, ksize.size() == 4,
+                errors::InvalidArgument("Sliding window ksize field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES(context, stride.size() == 4,
+                errors::InvalidArgument("Sliding window strides field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES(context, ksize[0] == 1 && stride[0] == 1,
+                errors::Unimplemented(
+                    "Pooling is not yet supported on the batch dimension."));
+    OP_REQUIRES(
+        context, ksize[3] == 1 && stride[3] == 1,
+        errors::Unimplemented(
+            "MaxPoolingGrad is not yet supported on the depth dimension."));
+
+    PoolParameters params{context,  ksize,       stride,
                           padding_, FORMAT_NHWC, tensor_in.shape()};
     if (!context->status().ok()) {
       return;
@@ -309,20 +340,22 @@ class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                 errors::InvalidArgument("Invalid data format"));
-    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
-    OP_REQUIRES(context, ksize_.size() == 4,
-                errors::InvalidArgument("Sliding window ksize field must "
-                                        "specify 4 dimensions"));
-    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
-    OP_REQUIRES(context, stride_.size() == 4,
-                errors::InvalidArgument("Sliding window strides field must "
-                                        "specify 4 dimensions"));
+    if (context->num_inputs() == 3) {
+      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+      OP_REQUIRES(context, ksize_.size() == 4,
+                  errors::InvalidArgument("Sliding window ksize field must "
+                                          "specify 4 dimensions"));
+      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+      OP_REQUIRES(context, stride_.size() == 4,
+                  errors::InvalidArgument("Sliding window strides field must "
+                                          "specify 4 dimensions"));
+      const int32 ksize_n = GetTensorDim(ksize_, data_format_, 'N');
+      const int32 stride_n = GetTensorDim(stride_, data_format_, 'N');
+      OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
+                  errors::Unimplemented(
+                      "Pooling is not yet supported on the batch dimension."));
+    }
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
-    const int32 ksize_n = GetTensorDim(ksize_, data_format_, 'N');
-    const int32 stride_n = GetTensorDim(stride_, data_format_, 'N');
-    OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
-                errors::Unimplemented(
-                    "Pooling is not yet supported on the batch dimension."));
 
     use_dnn_ = CanUseCudnn();
   }
@@ -343,15 +376,40 @@ class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
 
     TensorShape output_shape = tensor_in.shape();
 
+    std::vector<int32> ksize = ksize_;
+    std::vector<int32> stride = stride_;
+    if (context->num_inputs() == 5) {
+      const Tensor& tensor_ksize = context->input(3);
+      auto value_ksize = tensor_ksize.flat<int32>();
+      ksize.resize(tensor_ksize.shape().num_elements());
+      std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());
+
+      const Tensor& tensor_stride = context->input(4);
+      auto value_stride = tensor_stride.flat<int32>();
+      stride.resize(tensor_stride.shape().num_elements());
+      std::copy_n(&value_stride(0), stride.size(), stride.begin());
+    }
+    OP_REQUIRES(context, ksize.size() == 4,
+                errors::InvalidArgument("Sliding window ksize field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES(context, stride.size() == 4,
+                errors::InvalidArgument("Sliding window strides field must "
+                                        "specify 4 dimensions"));
+    const int32 ksize_n = GetTensorDim(ksize, data_format_, 'N');
+    const int32 stride_n = GetTensorDim(stride, data_format_, 'N');
+    OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
+                errors::Unimplemented(
+                    "Pooling is not yet supported on the batch dimension."));
+
     if (use_dnn_) {
       DnnPoolingGradOp<T>::Compute(
-          context, perftools::gputools::dnn::PoolingMode::kMaximum, ksize_,
-          stride_, padding_, data_format_, &tensor_in, &tensor_out,
-          out_backprop, output_shape);
+          context, perftools::gputools::dnn::PoolingMode::kMaximum, ksize,
+          stride, padding_, data_format_, &tensor_in, &tensor_out, out_backprop,
+          output_shape);
     } else {
       CHECK(data_format_ == FORMAT_NHWC)
           << "Non-Cudnn MaxPoolGrad only supports NHWC format";
-      MaxPoolingBackwardCustomKernel<T>(context, ksize_, stride_, padding_,
+      MaxPoolingBackwardCustomKernel<T>(context, ksize, stride, padding_,
                                         &tensor_in, out_backprop, output_shape);
     }
   }
@@ -386,22 +444,25 @@ class MaxPoolingGradGradOp : public OpKernel {
         errors::InvalidArgument(
             "Default MaxPoolingGradGradOp only supports NHWC ",
             "on device type ", DeviceTypeString(context->device_type())));
-    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
-    OP_REQUIRES(context, ksize_.size() == 4,
-                errors::InvalidArgument("Sliding window ksize field must "
-                                        "specify 4 dimensions"));
-    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
-    OP_REQUIRES(context, stride_.size() == 4,
-                errors::InvalidArgument("Sliding window strides field must "
-                                        "specify 4 dimensions"));
+
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
-    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
-                errors::Unimplemented(
-                    "Pooling is not yet supported on the batch dimension."));
-    OP_REQUIRES(
-        context, ksize_[3] == 1 && stride_[3] == 1,
-        errors::Unimplemented(
-            "MaxPoolingGradGrad is not yet supported on the depth dimension."));
+
+    if (context->num_inputs() == 3) {
+      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+      OP_REQUIRES(context, ksize_.size() == 4,
+                  errors::InvalidArgument("Sliding window ksize field must "
+                                          "specify 4 dimensions"));
+      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+      OP_REQUIRES(context, stride_.size() == 4,
+                  errors::InvalidArgument("Sliding window strides field must "
+                                          "specify 4 dimensions"));
+      OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+                  errors::Unimplemented(
+                      "Pooling is not yet supported on the batch dimension."));
+      OP_REQUIRES(context, ksize_[3] == 1 && stride_[3] == 1,
+                  errors::Unimplemented("MaxPoolingGradGrad is not yet "
+                                        "supported on the depth dimension."));
+    }
   }
 
   void Compute(OpKernelContext* context) override {
@@ -419,7 +480,35 @@ class MaxPoolingGradGradOp : public OpKernel {
         context, out_grad_backprop.dims() == 4,
         errors::InvalidArgument("out_grad_backprop must be 4-dimensional"));
 
-    PoolParameters params{context,  ksize_,      stride_,
+    std::vector<int32> ksize = ksize_;
+    std::vector<int32> stride = stride_;
+    if (context->num_inputs() == 5) {
+      const Tensor& tensor_ksize = context->input(3);
+      auto value_ksize = tensor_ksize.flat<int32>();
+      ksize.resize(tensor_ksize.shape().num_elements());
+      std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());
+
+      const Tensor& tensor_stride = context->input(4);
+      auto value_stride = tensor_stride.flat<int32>();
+      stride.resize(tensor_stride.shape().num_elements());
+      std::copy_n(&value_stride(0), stride.size(), stride.begin());
+    }
+
+    OP_REQUIRES(context, ksize.size() == 4,
+                errors::InvalidArgument("Sliding window ksize field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES(context, stride.size() == 4,
+                errors::InvalidArgument("Sliding window strides field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES(context, ksize[0] == 1 && stride[0] == 1,
+                errors::Unimplemented(
+                    "Pooling is not yet supported on the batch dimension."));
+    OP_REQUIRES(
+        context, ksize[3] == 1 && stride[3] == 1,
+        errors::Unimplemented(
+            "MaxPoolingGradGrad is not yet supported on the depth dimension."));
+
+    PoolParameters params{context,  ksize,       stride,
                           padding_, FORMAT_NHWC, tensor_in.shape()};
     Tensor* output = nullptr;
     OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
@@ -474,7 +563,7 @@ class MaxPoolingGradGradOp : public OpKernel {
     //    tensor_out_as_matrix with the corresponding values in
     //    top_diff_as_matrix.
     auto shard = [&params, &in_mat, &out_mat, &top_diff_mat, &bottom_diff_mat](
-                     int64 start, int64 limit) {
+        int64 start, int64 limit) {
       const int32 depth = params.depth;
       const int32 in_rows = params.tensor_in_rows;
       const int32 in_cols = params.tensor_in_cols;
@@ -555,20 +644,22 @@ class MaxPoolingGradGradOp<Eigen::GpuDevice, T> : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                 errors::InvalidArgument("Invalid data format"));
-    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
-    OP_REQUIRES(context, ksize_.size() == 4,
-                errors::InvalidArgument("Sliding window ksize field must "
-                                        "specify 4 dimensions"));
-    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
-    OP_REQUIRES(context, stride_.size() == 4,
-                errors::InvalidArgument("Sliding window strides field must "
-                                        "specify 4 dimensions"));
+    if (context->num_inputs() == 3) {
+      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+      OP_REQUIRES(context, ksize_.size() == 4,
+                  errors::InvalidArgument("Sliding window ksize field must "
+                                          "specify 4 dimensions"));
+      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+      OP_REQUIRES(context, stride_.size() == 4,
+                  errors::InvalidArgument("Sliding window strides field must "
+                                          "specify 4 dimensions"));
+      const int32 ksize_n = GetTensorDim(ksize_, data_format_, 'N');
+      const int32 stride_n = GetTensorDim(stride_, data_format_, 'N');
+      OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
+                  errors::Unimplemented(
+                      "Pooling is not yet supported on the batch dimension."));
+    }
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
-    const int32 ksize_n = GetTensorDim(ksize_, data_format_, 'N');
-    const int32 stride_n = GetTensorDim(stride_, data_format_, 'N');
-    OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
-                errors::Unimplemented(
-                    "Pooling is not yet supported on the batch dimension."));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -590,7 +681,33 @@ class MaxPoolingGradGradOp<Eigen::GpuDevice, T> : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, tensor_out.shape(), &output));
 
-    PoolParameters params{context,  ksize_,       stride_,
+    std::vector<int32> ksize = ksize_;
+    std::vector<int32> stride = stride_;
+    if (context->num_inputs() == 5) {
+      const Tensor& tensor_ksize = context->input(3);
+      auto value_ksize = tensor_ksize.flat<int32>();
+      ksize.resize(tensor_ksize.shape().num_elements());
+      std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());
+
+      const Tensor& tensor_stride = context->input(4);
+      auto value_stride = tensor_stride.flat<int32>();
+      stride.resize(tensor_stride.shape().num_elements());
+      std::copy_n(&value_stride(0), stride.size(), stride.begin());
+    }
+
+    OP_REQUIRES(context, ksize.size() == 4,
+                errors::InvalidArgument("Sliding window ksize field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES(context, stride.size() == 4,
+                errors::InvalidArgument("Sliding window strides field must "
+                                        "specify 4 dimensions"));
+    const int32 ksize_n = GetTensorDim(ksize, data_format_, 'N');
+    const int32 stride_n = GetTensorDim(stride, data_format_, 'N');
+    OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
+                errors::Unimplemented(
+                    "Pooling is not yet supported on the batch dimension."));
+
+    PoolParameters params{context,  ksize,        stride,
                           padding_, data_format_, tensor_in.shape()};
 
     functor::MaxPoolGradBackwardNoMask<T>()(
@@ -669,6 +786,84 @@ class MaxPoolingNoMaskOp : public OpKernel {
   TensorFormat data_format_;
 };
 
+template <typename Device, typename T>
+class MaxPoolingNoMaskV2Op : public OpKernel {
+ public:
+  explicit MaxPoolingNoMaskV2Op(OpKernelConstruction* context)
+      : OpKernel(context) {
+    string data_format;
+    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
+                errors::InvalidArgument("Invalid data format"));
+    OP_REQUIRES(
+        context, data_format_ == FORMAT_NHWC,
+        errors::InvalidArgument(
+            "Default MaxPoolingNoMaskV2Op only supports NHWC on device type ",
+            DeviceTypeString(context->device_type())));
+    if (context->num_inputs() == 1) {
+      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+      OP_REQUIRES(context, ksize_.size() == 4,
+                  errors::InvalidArgument("Sliding window ksize field must "
+                                          "specify 4 dimensions"));
+      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+      OP_REQUIRES(context, stride_.size() == 4,
+                  errors::InvalidArgument("Sliding window stride field must "
+                                          "specify 4 dimensions"));
+      OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+                  errors::Unimplemented(
+                      "Pooling is not yet supported on the batch dimension."));
+    }
+    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& tensor_in = context->input(0);
+
+    std::vector<int32> ksize = ksize_;
+    std::vector<int32> stride = stride_;
+
+    if (context->num_inputs() != 1) {
+      const Tensor& tensor_ksize = context->input(1);
+      auto value_ksize = tensor_ksize.flat<int32>();
+      ksize.resize(tensor_ksize.shape().num_elements());
+      std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());
+
+      const Tensor& tensor_stride = context->input(2);
+      auto value_stride = tensor_stride.flat<int32>();
+      stride.resize(tensor_stride.shape().num_elements());
+      std::copy_n(&value_stride(0), stride.size(), stride.begin());
+    }
+    OP_REQUIRES(context, ksize.size() == 4,
+                errors::InvalidArgument("Sliding window ksize field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES(context, stride.size() == 4,
+                errors::InvalidArgument("Sliding window stride field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES(context, ksize[0] == 1 && stride[0] == 1,
+                errors::Unimplemented(
+                    "Pooling is not yet supported on the batch dimension."));
+    PoolParameters params{context,  ksize,        stride,
+                          padding_, data_format_, tensor_in.shape()};
+    if (!context->status().ok()) {
+      return;
+    }
+
+    TensorShape out_shape({params.tensor_in_batch, params.out_height,
+                           params.out_width, params.depth});
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
+
+    LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
+                                              output);
+  }
+
+ private:
+  std::vector<int32> ksize_;
+  std::vector<int32> stride_;
+  Padding padding_;
+  TensorFormat data_format_;
+};
+
 template <typename Device, typename T>
 struct LaunchMaxPoolingWithArgmax;
 
@@ -878,6 +1073,95 @@ class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel {
   bool use_dnn_;
 };
 
+template <typename T>
+class MaxPoolingNoMaskV2Op<GPUDevice, T> : public OpKernel {
+ public:
+  typedef GPUDevice Device;
+  explicit MaxPoolingNoMaskV2Op(OpKernelConstruction* context)
+      : OpKernel(context) {
+    string data_format;
+    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
+                errors::InvalidArgument("Invalid data format"));
+    if (context->num_inputs() == 1) {
+      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+      OP_REQUIRES(context, ksize_.size() == 4,
+                  errors::InvalidArgument("Sliding window ksize field must "
+                                          "specify 4 dimensions"));
+      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+      OP_REQUIRES(context, stride_.size() == 4,
+                  errors::InvalidArgument("Sliding window stride field must "
+                                          "specify 4 dimensions"));
+      const int32 ksize_n = GetTensorDim(ksize_, data_format_, 'N');
+      const int32 stride_n = GetTensorDim(stride_, data_format_, 'N');
+      OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
+                  errors::Unimplemented(
+                      "Pooling is not yet supported on the batch dimension."));
+    }
+    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+    use_dnn_ = CanUseCudnn();
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& tensor_in = context->input(0);
+
+    std::vector<int32> ksize = ksize_;
+    std::vector<int32> stride = stride_;
+
+    if (context->num_inputs() != 1) {
+      const Tensor& tensor_ksize = context->input(1);
+      auto value_ksize = tensor_ksize.flat<int32>();
+      ksize.resize(tensor_ksize.shape().num_elements());
+      std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());
+
+      const Tensor& tensor_stride = context->input(2);
+      auto value_stride = tensor_stride.flat<int32>();
+      stride.resize(tensor_stride.shape().num_elements());
+      std::copy_n(&value_stride(0), stride.size(), stride.begin());
+    }
+    OP_REQUIRES(context, ksize.size() == 4,
+                errors::InvalidArgument("Sliding window ksize field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES(context, stride.size() == 4,
+                errors::InvalidArgument("Sliding window stride field must "
+                                        "specify 4 dimensions"));
+    const int32 ksize_n = GetTensorDim(ksize, data_format_, 'N');
+    const int32 stride_n = GetTensorDim(stride, data_format_, 'N');
+    OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
+                errors::Unimplemented(
+                    "Pooling is not yet supported on the batch dimension."));
+
+    PoolParameters params{context,  ksize,        stride,
+                          padding_, data_format_, tensor_in.shape()};
+    if (!context->status().ok()) {
+      return;
+    }
+
+    TensorShape out_shape =
+        ShapeFromFormat(data_format_, params.tensor_in_batch, params.out_height,
+                        params.out_width, params.depth);
+    if (use_dnn_ && data_format_ == FORMAT_NCHW) {
+      DnnPoolingOp<T>::Compute(
+          context, perftools::gputools::dnn::PoolingMode::kMaximum, ksize,
+          stride, padding_, data_format_, tensor_in, out_shape);
+    } else {
+      CHECK(data_format_ == FORMAT_NHWC)
+          << "Non-Cudnn MaxPool only supports NHWC format";
+      Tensor* output = nullptr;
+      OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
+      LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
+                                                output);
+    }
+  }
+
+ private:
+  std::vector<int32> ksize_;
+  std::vector<int32> stride_;
+  Padding padding_;
+  TensorFormat data_format_;
+  bool use_dnn_;
+};
+
 template <typename T>
 struct LaunchMaxPoolingNoMask<Eigen::GpuDevice, T> {
   static void launch(OpKernelContext* context, const PoolParameters& params,
@@ -969,13 +1253,28 @@ struct LaunchMaxPoolingGradGradWithArgmax<Eigen::GpuDevice, T> {
       MaxPoolingGradOp<D##Device, T>);                                   \
   REGISTER_KERNEL_BUILDER(                                               \
       Name("MaxPoolGradGrad").Device(DEVICE_##D).TypeConstraint<T>("T"), \
-      MaxPoolingGradGradOp<D##Device, T>);
+      MaxPoolingGradGradOp<D##Device, T>);                               \
+  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradV2")                          \
+                              .Device(DEVICE_##D)                        \
+                              .HostMemory("ksize")                       \
+                              .HostMemory("strides")                     \
+                              .TypeConstraint<T>("T"),                   \
+                          MaxPoolingGradOp<D##Device, T>);               \
+  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradGradV2")                      \
+                              .Device(DEVICE_##D)                        \
+                              .HostMemory("ksize")                       \
+                              .HostMemory("strides")                     \
+                              .TypeConstraint<T>("T"),                   \
+                          MaxPoolingGradGradOp<D##Device, T>);
 
 // Below kernels implemented only for CPU device.
-#define REGISTER_CPU_ONLY_POOL_KERNELS(T)                        \
-  REGISTER_KERNEL_BUILDER(                                       \
-      Name("MaxPool").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
-      MaxPoolingOp<CPUDevice, T>);
+#define REGISTER_CPU_ONLY_POOL_KERNELS(T)                          \
+  REGISTER_KERNEL_BUILDER(                                         \
+      Name("MaxPool").Device(DEVICE_CPU).TypeConstraint<T>("T"),   \
+      MaxPoolingOp<CPUDevice, T>);                                 \
+  REGISTER_KERNEL_BUILDER(                                         \
+      Name("MaxPoolV2").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      MaxPoolingV2Op<CPUDevice, T>);
 TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_ONLY_POOL_KERNELS);
 #undef REGISTER_CPU_ONLY_POOL_KERNELS
 
@@ -1015,9 +1314,22 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_MAX_POOL_KERNELS);
                               .TypeConstraint<T>("T")                \
                               .Label("eigen_tensor"),                \
                           MaxPoolingOp<GPUDevice, T>);               \
+  REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")                          \
+                              .Device(DEVICE_GPU)                    \
+                              .HostMemory("ksize")                   \
+                              .HostMemory("strides")                 \
+                              .TypeConstraint<T>("T")                \
+                              .Label("eigen_tensor"),                \
+                          MaxPoolingV2Op<GPUDevice, T>);             \
   REGISTER_KERNEL_BUILDER(                                           \
       Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<T>("T"),     \
       MaxPoolingNoMaskOp<GPUDevice, T>);                             \
+  REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")                          \
+                              .Device(DEVICE_GPU)                    \
+                              .HostMemory("ksize")                   \
+                              .HostMemory("strides")                 \
+                              .TypeConstraint<T>("T"),               \
+                          MaxPoolingNoMaskV2Op<GPUDevice, T>);       \
   REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")                  \
                               .Device(DEVICE_GPU)                    \
                               .TypeConstraint<int64>("Targmax")      \
diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
index 3b23c72f0f1..f81a448e515 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
@@ -206,15 +206,10 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel {
     // Mkl needs the entities in its native format.
     // So create temporary tensors along with buffers to
     // convert the received entities.
-    Tensor mkl_tmp_input_buf_tensor, mkl_tmp_out_backprop_buf_tensor,
-           mkl_tmp_buf_trans_input;
+    Tensor mkl_tmp_input_buf_tensor, mkl_tmp_out_backprop_buf_tensor;
     // This preparation sets (1) dnnResourceSrc (2) dnnResourceDiffDst
-    mkl_context.MklPrepareInputs(context, data_format_,
-                                 input_in_mkl_format,
-                                 out_backprop_in_mkl_format,
-                                 &mkl_tmp_input_buf_tensor,
-                                 &mkl_tmp_out_backprop_buf_tensor,
-                                 &mkl_tmp_buf_trans_input);
+    mkl_context.MklPrepareInputs(context, &mkl_tmp_input_buf_tensor,
+                                 &mkl_tmp_out_backprop_buf_tensor);
 
     // Final conv-grad-filter should be in TF layout.
     Tensor* grad_filter;
@@ -312,58 +307,34 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel {
 
     // Compare incoming tensor layouts with MKL preferred layouts and convert
     // data to the preferred layout if necessary
-    void MklPrepareInputs(OpKernelContext* context, TensorFormat format,
-                          bool input_in_mkl_format,
-                          bool out_backprop_in_mkl_format,
+    void MklPrepareInputs(OpKernelContext* context,
                           Tensor* mkl_tmp_input_buf_tensor,
-                          Tensor* mkl_tmp_out_backprop_buf_tensor,
-                          Tensor* mkl_tmp_buf_trans_input) {
+                          Tensor* mkl_tmp_out_backprop_buf_tensor) {
       bool mkl_convert_input, mkl_convert_out_backprop;
       dnnPrimitive_t mkl_prim_convert_input, mkl_prim_convert_out_backprop;
-      dnnLayout_t mkl_lt_internal_input, mkl_lt_internal_out_backprop,
-                  mkl_lt_trans_input;
+      dnnLayout_t mkl_lt_internal_input, mkl_lt_internal_out_backprop;
       void *mkl_buf_convert_input, *mkl_buf_convert_out_backprop;
-      void *mkl_buf_input, *mkl_buf_out_backprop;
 
       mkl_prim_convert_input = nullptr;
       mkl_prim_convert_out_backprop = nullptr;
       mkl_lt_internal_input = nullptr;
       mkl_lt_internal_out_backprop = nullptr;
-      mkl_lt_trans_input = nullptr;
       mkl_buf_convert_input = nullptr;
       mkl_buf_convert_out_backprop = nullptr;
-      mkl_buf_input = nullptr;
-      mkl_buf_out_backprop = nullptr;
 
       // Compare with internal layouts and convert if needed
       const Tensor& input = MklGetInput(context, 0);
-      if (!input_in_mkl_format && format == FORMAT_NHWC){
-        TensorShape nchw_shape = ShapeFromFormat(FORMAT_NCHW,
-            in_sizes[MklDims::N], in_sizes[MklDims::H],
-            in_sizes[MklDims::W], in_sizes[MklDims::C]);
-        OP_REQUIRES_OK(context, context->allocate_temp(
-            DataTypeToEnum<float>::value, nchw_shape, mkl_tmp_buf_trans_input));
-        MklNHWCToNCHW(input, &mkl_tmp_buf_trans_input);
-        mkl_buf_input = const_cast<void*>(static_cast<const void*>(
-            mkl_tmp_buf_trans_input->flat<float>().data()));
-        size_t strides[4];
-        GetStridesFromSizes(FORMAT_NCHW, strides, in_sizes);
-        CHECK_EQ(dnnLayoutCreate_F32(&mkl_lt_trans_input, in_dims, in_sizes,
-            strides), E_SUCCESS);
-      }
-      else {
-        mkl_buf_input =
-            const_cast<void*>(static_cast<const void*>(input.flat<T>().data()));
-        mkl_lt_trans_input = lt_input;
-      }
+      void* mkl_buf_input =
+          const_cast<void*>(static_cast<const void*>(input.flat<T>().data()));
       CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
                    &mkl_lt_internal_input, prim_conv_bwdfilter, dnnResourceSrc),
                E_SUCCESS);
       mkl_convert_input =
-          !dnnLayoutCompare_F32(mkl_lt_internal_input, mkl_lt_trans_input);
+          !dnnLayoutCompare_F32(mkl_lt_internal_input, lt_input);
       if (mkl_convert_input) {
-        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input,
-                     mkl_lt_trans_input, mkl_lt_internal_input), E_SUCCESS);
+        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, lt_input,
+                                         mkl_lt_internal_input),
+                 E_SUCCESS);
         AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input,
                        &mkl_buf_convert_input);
         CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_input, mkl_buf_input,
@@ -372,30 +343,26 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel {
         dnnDelete_F32(mkl_prim_convert_input);
       }
       dnnLayoutDelete_F32(mkl_lt_internal_input);
-      if (!input_in_mkl_format && format == FORMAT_NHWC)
-        dnnLayoutDelete_F32(mkl_lt_trans_input);
-
 
       conv_res[dnnResourceSrc] =
           (mkl_convert_input) ? mkl_buf_convert_input : mkl_buf_input;
 
       const Tensor& out_backprop = MklGetInput(context, 2);
-      mkl_buf_out_backprop = const_cast<void*>(
-                      static_cast<const void*>(out_backprop.flat<T>().data()));
+      void* mkl_buf_out_backprop = const_cast<void*>(static_cast<const void*>(
+                                      out_backprop.flat<T>().data()));
 
       CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_out_backprop,
                                                 prim_conv_bwdfilter,
                                                 dnnResourceDiffDst),
                E_SUCCESS);
       mkl_convert_out_backprop =
-          !dnnLayoutCompare_F32(mkl_lt_internal_out_backprop,
-                                lt_out_backprop);
+          !dnnLayoutCompare_F32(mkl_lt_internal_out_backprop, lt_out_backprop);
       if (mkl_convert_out_backprop) {
         CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_out_backprop,
                       lt_out_backprop, mkl_lt_internal_out_backprop),
                  E_SUCCESS);
         AllocTmpBuffer(context, mkl_tmp_out_backprop_buf_tensor,
-            mkl_lt_internal_out_backprop, &mkl_buf_convert_out_backprop);
+            lt_out_backprop, &mkl_buf_convert_out_backprop);
         CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_out_backprop,
                                           mkl_buf_out_backprop,
                                           mkl_buf_convert_out_backprop),
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index 45d22556aa7..203e6946314 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -272,13 +272,11 @@ class MklConv2DOp : public OpKernel {
 
     // Temp tensor used to allocate tmp buffers
     Tensor mkl_tmp_input_buf_tensor, mkl_tmp_filter_buf_tensor,
-        mkl_tmp_bias_buf_tensor, mkl_tmp_buf_trans_input;
-    mkl_context.MklPrepareConvolutionInputs(context, data_format_,
-                                            input_in_mkl_format,
+        mkl_tmp_bias_buf_tensor;
+    mkl_context.MklPrepareConvolutionInputs(context,
                                             &mkl_tmp_input_buf_tensor,
                                             &mkl_tmp_filter_buf_tensor,
-                                            &mkl_tmp_bias_buf_tensor,
-                                            &mkl_tmp_buf_trans_input);
+                                            &mkl_tmp_bias_buf_tensor);
 
     // Execute convolution
     CHECK_EQ(dnnExecute_F32(mkl_context.prim_fwd, mkl_context.conv_res),
@@ -329,59 +327,38 @@ class MklConv2DOp : public OpKernel {
     // Compare incoming tensor layouts with MKL preferred layouts and convert
     // data to the preferred layout if necessary
     void MklPrepareConvolutionInputs(OpKernelContext* context,
-                                     TensorFormat format,
-                                     bool input_in_mkl_format,
                                      Tensor* mkl_tmp_input_buf_tensor,
                                      Tensor* mkl_tmp_filter_buf_tensor,
-                                     Tensor* mkl_tmp_bias_buf_tensor,
-                                     Tensor* mkl_tmp_buf_trans_input) {
+                                     Tensor* mkl_tmp_bias_buf_tensor) {
       bool mkl_convert_input, mkl_convert_filter, mkl_convert_bias;
       dnnPrimitive_t mkl_prim_convert_filter, mkl_prim_convert_bias,
           mkl_prim_convert_input;
       dnnLayout_t mkl_lt_internal_filter, mkl_lt_internal_bias,
-          mkl_lt_internal_input, mkl_lt_trans_input;
+          mkl_lt_internal_input;
       void *mkl_buf_convert_input, *mkl_buf_convert_filter,
-          *mkl_buf_convert_bias, *mkl_buf_input;
+          *mkl_buf_convert_bias;
       mkl_prim_convert_filter = nullptr;
       mkl_prim_convert_bias = nullptr;
       mkl_prim_convert_input = nullptr;
       mkl_lt_internal_filter = nullptr;
       mkl_lt_internal_bias = nullptr;
       mkl_lt_internal_input = nullptr;
-      mkl_lt_trans_input = nullptr;
       mkl_buf_convert_input = nullptr;
       mkl_buf_convert_filter = nullptr;
       mkl_buf_convert_bias = nullptr;
-      mkl_buf_input = nullptr;
 
       // Compare with internal layouts and convert if needed
       const Tensor& input = MklGetInput(context, 0);
-      if (!input_in_mkl_format && format == FORMAT_NHWC) {
-        TensorShape nchw_shape = ShapeFromFormat(FORMAT_NCHW,
-            in_sizes[MklDims::N], in_sizes[MklDims::H],
-            in_sizes[MklDims::W], in_sizes[MklDims::C]);
-        OP_REQUIRES_OK(context, context->allocate_temp(
-            DataTypeToEnum<float>::value, nchw_shape, mkl_tmp_buf_trans_input));
-        MklNHWCToNCHW(input, &mkl_tmp_buf_trans_input);
-        mkl_buf_input = const_cast<void*>(static_cast<const void*>(
-            mkl_tmp_buf_trans_input->flat<float>().data()));
-        size_t strides[4];
-        GetStridesFromSizes(FORMAT_NCHW, strides, in_sizes);
-        CHECK_EQ(dnnLayoutCreate_F32(&mkl_lt_trans_input, in_dims, in_sizes,
-            strides), E_SUCCESS);
-      } else {
-          mkl_buf_input = const_cast<void*>(
-                              static_cast<const void*>(input.flat<T>().data()));
-          mkl_lt_trans_input = lt_input;
-      }
+      void* mkl_buf_input =
+          const_cast<void*>(static_cast<const void*>(input.flat<T>().data()));
       CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_input,
                                                 prim_fwd, dnnResourceSrc),
                E_SUCCESS);
       mkl_convert_input =
-          !dnnLayoutCompare_F32(mkl_lt_internal_input, mkl_lt_trans_input);
+          !dnnLayoutCompare_F32(mkl_lt_internal_input, lt_input);
       if (mkl_convert_input) {
         CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input,
-                 mkl_lt_trans_input, mkl_lt_internal_input), E_SUCCESS);
+                 lt_input, mkl_lt_internal_input), E_SUCCESS);
         AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input,
                        &mkl_buf_convert_input);
         CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_input, mkl_buf_input,
@@ -390,8 +367,6 @@ class MklConv2DOp : public OpKernel {
         dnnDelete_F32(mkl_prim_convert_input);
       }
       dnnLayoutDelete_F32(mkl_lt_internal_input);
-      if (!input_in_mkl_format && format == FORMAT_NHWC)
-        dnnLayoutDelete_F32(mkl_lt_trans_input);
 
       conv_res[dnnResourceSrc] =
           (mkl_convert_input) ? mkl_buf_convert_input : mkl_buf_input;
diff --git a/tensorflow/core/kernels/mkl_tfconv_op.cc b/tensorflow/core/kernels/mkl_tfconv_op.cc
index c8e5df32ce5..b48c735d124 100644
--- a/tensorflow/core/kernels/mkl_tfconv_op.cc
+++ b/tensorflow/core/kernels/mkl_tfconv_op.cc
@@ -83,42 +83,16 @@ class MklToTfOp : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, output_shape, &output_tensor));
 
-    // If data format is NHWC, transform MKL tensor to NCHW format and then
-    // do NCHW -> NHWC.
-    dnnLayout_t lt_trans_input = nullptr;
-    Tensor mkl_tmp_trans_input_buf_tensor;
-    void* buf_trans_input = nullptr;
-    bool input_fmt_nhwc = input_shape.IsTensorInNHWCFormat();
-    if (input_fmt_nhwc && ndims == 4 && has_avx512f_) {
-      size_t strides_nchw[4];
-      GetStridesFromSizes(FORMAT_NCHW, strides_nchw, in_sizes);
-      CHECK_EQ(
-          dnnLayoutCreate_F32(&lt_trans_input, ndims, in_sizes, strides_nchw),
-          E_SUCCESS);
-      AllocTmpBuffer(context, &mkl_tmp_trans_input_buf_tensor, lt_trans_input,
-                     &buf_trans_input);
-    } else {
-      lt_trans_input = static_cast<dnnLayout_t>(input_shape.GetTfLayout());
-      buf_trans_input =
-          static_cast<void*>(const_cast<T*>(output_tensor->flat<T>().data()));
-    }
-
+    dnnLayout_t output_layout =
+        static_cast<dnnLayout_t>(input_shape.GetTfLayout());
     // Execute DNNConversion.
     void* input_buffer =
         static_cast<void*>(const_cast<T*>(input_tensor.flat<T>().data()));
-    input_shape.GetConvertedFlatData(lt_trans_input, input_buffer,
-                                     buf_trans_input);
-    // NCHW -> NHWC, if data format is NHWC
-    if (input_fmt_nhwc && ndims == 4 && has_avx512f_) {
-      dnnLayoutDelete_F32(lt_trans_input);
-      TensorShape nhwc_shape = ShapeFromFormat(
-          FORMAT_NHWC, in_sizes[MklDims::N], in_sizes[MklDims::H],
-          in_sizes[MklDims::W], in_sizes[MklDims::C]);
-      MklNCHWToNHWC(mkl_tmp_trans_input_buf_tensor, &output_tensor);
-    }
-
     delete[] in_sizes;
-
+    void* output_buffer =
+        static_cast<void*>(const_cast<T*>(output_tensor->flat<T>().data()));
+    input_shape.GetConvertedFlatData(output_layout, input_buffer,
+                                     output_buffer);
     VLOG(1) << "MKLToTFConversion complete successfully.";
   }
 
diff --git a/tensorflow/core/kernels/pack_op.cc b/tensorflow/core/kernels/pack_op.cc
index 75820e3106f..814128d99ac 100644
--- a/tensorflow/core/kernels/pack_op.cc
+++ b/tensorflow/core/kernels/pack_op.cc
@@ -158,6 +158,7 @@ REGISTER_PACK(string);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
 TF_CALL_int64(REGISTER_GPU);
+REGISTER_GPU(bool);
 #undef REGISTER_GPU
 
 // A special GPU kernel for int32.
diff --git a/tensorflow/core/kernels/pooling_ops_common.h b/tensorflow/core/kernels/pooling_ops_common.h
index 2c097c0ce24..1b59c18df79 100644
--- a/tensorflow/core/kernels/pooling_ops_common.h
+++ b/tensorflow/core/kernels/pooling_ops_common.h
@@ -69,6 +69,8 @@ struct PoolParameters {
 };
 
 // An implementation of MaxPooling (forward).
+// TODO(yongtang): Remove MaxPoolingOp and use MaxPoolingV2Op instead.
+//     QuantizedMaxPoolingOp depends on MaxPoolingOp, so keep it intact for now.
 template <typename Device, typename T>
 class MaxPoolingOp : public OpKernel {
  public:
@@ -254,6 +256,219 @@ class MaxPoolingOp : public OpKernel {
   TensorFormat data_format_;
 };
 
+// An implementation of MaxPooling (forward) whose ksize and strides come
+// either from attrs (when the op has a single input) or from the second and
+// third input tensors, which are read on every Compute(). NHWC only.
+template <typename Device, typename T>
+class MaxPoolingV2Op : public OpKernel {
+ public:
+  // Parses data_format, padding and, for the one-input form, ksize/strides.
+  explicit MaxPoolingV2Op(OpKernelConstruction* context) : OpKernel(context) {
+    string data_format;
+    auto status = context->GetAttr("data_format", &data_format);
+    if (status.ok()) {
+      OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
+                  errors::InvalidArgument("Invalid data format"));
+      OP_REQUIRES(
+          context, data_format_ == FORMAT_NHWC,
+          errors::InvalidArgument("Default MaxPoolingOp only supports NHWC."));
+    } else {
+      data_format_ = FORMAT_NHWC;
+    }
+    if (context->num_inputs() == 1) {
+      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
+      OP_REQUIRES(context, ksize_.size() == 4,
+                  errors::InvalidArgument("Sliding window ksize field must "
+                                          "specify 4 dimensions"));
+      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
+      OP_REQUIRES(context, stride_.size() == 4,
+                  errors::InvalidArgument("Sliding window stride field must "
+                                          "specify 4 dimensions"));
+      OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
+                  errors::Unimplemented(
+                      "Pooling is not yet supported on the batch dimension."));
+    }
+    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+  }
+
+  // Validates ksize/strides, allocates the output and runs the max pool.
+  void Compute(OpKernelContext* context) override {
+    const Tensor& tensor_in = context->input(0);
+
+    std::vector<int32> ksize = ksize_;
+    std::vector<int32> stride = stride_;
+
+    if (context->num_inputs() != 1) {
+      // ksize/strides arrive as tensors; data() is safe even when empty.
+      const Tensor& tensor_ksize = context->input(1);
+      auto value_ksize = tensor_ksize.flat<int32>();
+      ksize.resize(tensor_ksize.shape().num_elements());
+      std::copy_n(value_ksize.data(), ksize.size(), ksize.begin());
+
+      const Tensor& tensor_stride = context->input(2);
+      auto value_stride = tensor_stride.flat<int32>();
+      stride.resize(tensor_stride.shape().num_elements());
+      std::copy_n(value_stride.data(), stride.size(), stride.begin());
+    }
+
+    OP_REQUIRES(context, ksize.size() == 4,
+                errors::InvalidArgument("Sliding window ksize field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES(context, stride.size() == 4,
+                errors::InvalidArgument("Sliding window stride field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES(context, ksize[0] == 1 && stride[0] == 1,
+                errors::Unimplemented(
+                    "Pooling is not yet supported on the batch dimension."));
+
+    PoolParameters params{context,  ksize,       stride,
+                          padding_, FORMAT_NHWC, tensor_in.shape()};
+    if (!context->status().ok()) {
+      return;
+    }
+
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                0, params.forward_output_shape(), &output));
+
+    if (params.depth_window > 1) {
+      // Validate spec against the current implementation.  A
+      // relaxation of these requirements would be ideal.
+      OP_REQUIRES(context, params.depth % params.depth_window == 0,
+                  errors::Unimplemented(
+                      "Depthwise max pooling requires "
+                      "the depth window to evenly divide the input depth."));
+      OP_REQUIRES(
+          context, params.depth_window == params.depth_stride,
+          errors::Unimplemented("Depthwise max pooling requires "
+                                "the depth window to equal the depth stride."));
+
+      DepthwiseMaxPool(context, output, tensor_in, params);
+    } else {
+      SpatialMaxPool(context, output, tensor_in, params, padding_);
+    }
+  }
+
+ private:
+  // Single-threaded implementation of DepthwiseMaxPool which
+  // does not handle all of the same options as SpatialMaxPool
+  // (strict assumptions on no padding, stride).
+  //
+  // TODO(vrv): implement a more general depthwise-max pool that works
+  // on GPU as well.
+  void DepthwiseMaxPool(OpKernelContext* context, Tensor* output,
+                        const Tensor& tensor_in, const PoolParameters& params) {
+    Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
+        in_by_pool(tensor_in.flat<T>().data(), params.depth_window,
+                   tensor_in.NumElements() / params.depth_window);
+    Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> out_by_pool(
+        output->flat<T>().data(), 1, output->NumElements());
+    out_by_pool = in_by_pool.colwise().maxCoeff();
+  }
+
+  void SpatialMaxPool(OpKernelContext* context, Tensor* output,
+                      const Tensor& tensor_in, const PoolParameters& params,
+                      const Padding& padding) {
+    // On GPU, use Eigen's Spatial Max Pooling.  On CPU, use an EigenMatrix
+    // version that is currently faster than Eigen's Spatial MaxPooling
+    // implementation.
+    // TODO(vrv): Remove this once we no longer need it.
+    if (std::is_same<Device, GPUDevice>::value) {
+      Eigen::PaddingType pt = BrainPadding2EigenPadding(padding);
+      functor::SpatialMaxPooling<Device, T>()(
+          context->eigen_device<Device>(), output->tensor<T, 4>(),
+          tensor_in.tensor<T, 4>(), params.window_rows, params.window_cols,
+          params.row_stride, params.col_stride, pt);
+    } else {
+      typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
+          ConstEigenMatrixMap;
+      typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
+          EigenMatrixMap;
+
+      ConstEigenMatrixMap in_mat(tensor_in.flat<T>().data(), params.depth,
+                                 params.tensor_in_cols * params.tensor_in_rows *
+                                     params.tensor_in_batch);
+      EigenMatrixMap out_mat(
+          output->flat<T>().data(), params.depth,
+          params.out_width * params.out_height * params.tensor_in_batch);
+
+      const DeviceBase::CpuWorkerThreads& worker_threads =
+          *(context->device()->tensorflow_cpu_worker_threads());
+
+      // The sharded code below works on two-dimensional views of the tensors:
+      //   in_mat:  depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
+      //   out_mat: depth by (out_width * out_height * tensor_in_batch)
+      // Each shard handles the batch range [start, limit): it first fills its
+      // slice of out_mat with the lowest value of T, then walks every input
+      // column and folds it (elementwise max) into each output column whose
+      // pooling window contains it.
+      auto shard = [&params, &in_mat, &out_mat](int64 start, int64 limit) {
+        const int32 in_rows = params.tensor_in_rows;
+        const int32 in_cols = params.tensor_in_cols;
+        const int32 pad_rows = params.pad_rows;
+        const int32 pad_cols = params.pad_cols;
+        const int32 window_rows = params.window_rows;
+        const int32 window_cols = params.window_cols;
+        const int32 row_stride = params.row_stride;
+        const int32 col_stride = params.col_stride;
+        const int32 out_height = params.out_height;
+        const int32 out_width = params.out_width;
+
+        {
+          // Initializes this shard's part of the output with lowest<T>.
+          const int32 output_image_size = out_height * out_width * params.depth;
+          EigenMatrixMap out_shard(out_mat.data() + start * output_image_size,
+                                   1, (limit - start) * output_image_size);
+          out_shard.setConstant(Eigen::NumTraits<T>::lowest());
+        }
+
+        for (int32 b = start; b < limit; ++b) {
+          const int32 out_offset_batch = b * out_height;
+          for (int32 h = 0; h < in_rows; ++h) {
+            for (int32 w = 0; w < in_cols; ++w) {
+              // (h_start, h_end) * (w_start, w_end) is the range that the input
+              // vector projects to.
+              const int32 hpad = h + pad_rows;
+              const int32 wpad = w + pad_cols;
+              const int32 h_start = (hpad < window_rows)
+                                        ? 0
+                                        : (hpad - window_rows) / row_stride + 1;
+              const int32 h_end = std::min(hpad / row_stride + 1, out_height);
+              const int32 w_start = (wpad < window_cols)
+                                        ? 0
+                                        : (wpad - window_cols) / col_stride + 1;
+              const int32 w_end = std::min(wpad / col_stride + 1, out_width);
+              // compute elementwise max
+              const int32 in_offset = (b * in_rows + h) * in_cols + w;
+              for (int32 ph = h_start; ph < h_end; ++ph) {
+                const int32 out_offset_base =
+                    (out_offset_batch + ph) * out_width;
+                for (int32 pw = w_start; pw < w_end; ++pw) {
+                  const int32 out_offset = out_offset_base + pw;
+                  out_mat.col(out_offset) =
+                      out_mat.col(out_offset).cwiseMax(in_mat.col(in_offset));
+                }
+              }
+            }
+          }
+        }
+      };
+
+      // TODO(andydavis) Consider sharding across batch x rows x cols.
+      // TODO(andydavis) Consider a higher resolution shard cost model.
+      const int64 shard_cost =
+          params.tensor_in_rows * params.tensor_in_cols * params.depth;
+      Shard(worker_threads.num_threads, worker_threads.workers,
+            params.tensor_in_batch, shard_cost, shard);
+    }
+  }
+
+  std::vector<int32> ksize_;
+  std::vector<int32> stride_;
+  Padding padding_;
+  TensorFormat data_format_;
+};
+
 template <typename Device, typename T>
 void SpatialAvgPool(OpKernelContext* context, Tensor* output,
                     const Tensor& input, const PoolParameters& params,
diff --git a/tensorflow/core/kernels/qr_op_impl.h b/tensorflow/core/kernels/qr_op_impl.h
index 029ef834808..ab664fa6d33 100644
--- a/tensorflow/core/kernels/qr_op_impl.h
+++ b/tensorflow/core/kernels/qr_op_impl.h
@@ -20,6 +20,10 @@ limitations under the License.
 // improve compilation times.
 #include <algorithm>
 
+#ifdef INTEL_MKL
+#define EIGEN_USE_MKL_ALL
+#endif // INTEL_MKL
+
 #include "third_party/eigen3/Eigen/QR"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
diff --git a/tensorflow/core/kernels/reshape_op.cc b/tensorflow/core/kernels/reshape_op.cc
index 04454b76c1a..16db8a6bb13 100644
--- a/tensorflow/core/kernels/reshape_op.cc
+++ b/tensorflow/core/kernels/reshape_op.cc
@@ -32,6 +32,7 @@ REGISTER_KERNEL_BUILDER(Name("Reshape")
                               .TypeConstraint<int32>("Tshape"), \
                           ReshapeOp);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
+REGISTER_GPU_KERNEL(bool);
 #undef REGISTER_GPU_KERNEL
 
 #ifdef TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/sparse_matmul_op.h b/tensorflow/core/kernels/sparse_matmul_op.h
index 098b2d65000..308b641b54f 100644
--- a/tensorflow/core/kernels/sparse_matmul_op.h
+++ b/tensorflow/core/kernels/sparse_matmul_op.h
@@ -153,6 +153,32 @@ EIGEN_STRONG_INLINE Packet4f pload2bf16<Packet4f>(const float* from) {
 }
 #endif
 
+#if defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
+// Returns a packet in which every lane holds element 0 (the first) of `a`.
+template <>
+EIGEN_STRONG_INLINE Packet4f pbroadcast_first<Packet4f>(const Packet4f& a) {
+  return vec_splat (a, 0);
+}
+
+// Returns a packet in which every lane holds element 1 (the second) of `a`.
+template <>
+EIGEN_STRONG_INLINE Packet4f pbroadcast_second<Packet4f>(const Packet4f& a) {
+  return vec_splat (a, 1);
+}
+
+// Returns a packet in which every lane holds element 2 (the third) of `a`.
+template <>
+EIGEN_STRONG_INLINE Packet4f pbroadcast_third<Packet4f>(const Packet4f& a) {
+  return vec_splat (a, 2);
+}
+
+// Returns a packet in which every lane holds element 3 (the fourth) of `a`.
+template <>
+EIGEN_STRONG_INLINE Packet4f pbroadcast_fourth<Packet4f>(const Packet4f& a) {
+  return vec_splat (a, 3);
+}
+#endif
+
 #ifdef EIGEN_VECTORIZE_SSE2
 // For PacketSize of 4 floats the Packet is not modified
 template <>
diff --git a/tensorflow/core/kernels/tile_ops.cc b/tensorflow/core/kernels/tile_ops.cc
index f1da3c8afb4..c49ebc06852 100644
--- a/tensorflow/core/kernels/tile_ops.cc
+++ b/tensorflow/core/kernels/tile_ops.cc
@@ -536,6 +536,12 @@ REGISTER_KERNEL_BUILDER(Name("Tile")
                             .TypeConstraint<int32>("Tmultiples")
                             .HostMemory("multiples"),
                         TileOp<GPUDevice>);
+REGISTER_KERNEL_BUILDER(Name("Tile")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<int32>("T")
+                            .TypeConstraint<int32>("Tmultiples")
+                            .HostMemory("multiples"),
+                        TileOp<GPUDevice>);
 REGISTER_KERNEL_BUILDER(Name("Tile")
                             .Device(DEVICE_GPU)
                             .TypeConstraint<complex64>("T")
@@ -573,6 +579,12 @@ REGISTER_KERNEL_BUILDER(Name("TileGrad")
                             .TypeConstraint<int32>("Tmultiples")
                             .HostMemory("multiples"),
                         TileGradientOp<GPUDevice>);
+REGISTER_KERNEL_BUILDER(Name("TileGrad")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<int32>("T")
+                            .TypeConstraint<int32>("Tmultiples")
+                            .HostMemory("multiples"),
+                        TileGradientOp<GPUDevice>);
 REGISTER_KERNEL_BUILDER(Name("TileGrad")
                             .Device(DEVICE_GPU)
                             .TypeConstraint<complex64>("T")
diff --git a/tensorflow/core/kernels/unique_op.cc b/tensorflow/core/kernels/unique_op.cc
index 6e51696d6f4..701c5f6d2b3 100644
--- a/tensorflow/core/kernels/unique_op.cc
+++ b/tensorflow/core/kernels/unique_op.cc
@@ -26,7 +26,7 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-template <typename T>
+template <typename T, typename TIndex>
 class UniqueOp : public OpKernel {
  public:
   explicit UniqueOp(OpKernelConstruction* context) : OpKernel(context) {}
@@ -48,9 +48,9 @@ class UniqueOp : public OpKernel {
     Tensor* idx = nullptr;
     OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
                                 {0}, 1, input.shape(), &idx));
-    auto idx_vec = idx->template vec<int32>();
+    auto idx_vec = idx->template vec<TIndex>();
 
-    std::unordered_map<T, int32> uniq;
+    std::unordered_map<T, TIndex> uniq;
     uniq.reserve(2 * N);
     for (int64 i = 0, j = 0; i < N; ++i) {
       auto it = uniq.insert(std::make_pair(Tin(i), j));
@@ -72,7 +72,7 @@ class UniqueOp : public OpKernel {
     if (num_outputs() > 2) {
       OP_REQUIRES_OK(context, context->allocate_output(
                                   2, TensorShape({uniq_size}), &output));
-      auto count_output_vec = output->template vec<int32>();
+      auto count_output_vec = output->template vec<TIndex>();
       count_output_vec.setZero();
       for (int64 i = 0; i < N; ++i) {
         count_output_vec(idx_vec(i))++;
@@ -86,12 +86,22 @@ class UniqueOp : public OpKernel {
                               .Device(DEVICE_CPU)                \
                               .TypeConstraint<type>("T")         \
                               .TypeConstraint<int32>("out_idx"), \
-                          UniqueOp<type>);                       \
+                          UniqueOp<type, int32>);                \
+  REGISTER_KERNEL_BUILDER(Name("Unique")                         \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .TypeConstraint<int64>("out_idx"), \
+                          UniqueOp<type, int64>);                \
   REGISTER_KERNEL_BUILDER(Name("UniqueWithCounts")               \
                               .Device(DEVICE_CPU)                \
                               .TypeConstraint<type>("T")         \
                               .TypeConstraint<int32>("out_idx"), \
-                          UniqueOp<type>)
+                          UniqueOp<type, int32>);                \
+  REGISTER_KERNEL_BUILDER(Name("UniqueWithCounts")               \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .TypeConstraint<int64>("out_idx"), \
+                          UniqueOp<type, int64>)
 TF_CALL_REAL_NUMBER_TYPES(REGISTER_UNIQUE);
 REGISTER_UNIQUE(string)
 #undef REGISTER_UNIQUE
@@ -107,7 +117,15 @@ REGISTER_KERNEL_BUILDER(Name("Unique")
                             .HostMemory("x")
                             .HostMemory("y")
                             .HostMemory("idx"),
-                        UniqueOp<int32>);
+                        UniqueOp<int32, int32>);
+REGISTER_KERNEL_BUILDER(Name("Unique")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<int32>("T")
+                            .TypeConstraint<int64>("out_idx")
+                            .HostMemory("x")
+                            .HostMemory("y")
+                            .HostMemory("idx"),
+                        UniqueOp<int32, int64>);
 REGISTER_KERNEL_BUILDER(Name("Unique")
                             .Device(DEVICE_GPU)
                             .TypeConstraint<int64>("T")
@@ -115,7 +133,15 @@ REGISTER_KERNEL_BUILDER(Name("Unique")
                             .HostMemory("x")
                             .HostMemory("y")
                             .HostMemory("idx"),
-                        UniqueOp<int64>);
+                        UniqueOp<int64, int32>);
+REGISTER_KERNEL_BUILDER(Name("Unique")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<int64>("T")
+                            .TypeConstraint<int64>("out_idx")
+                            .HostMemory("x")
+                            .HostMemory("y")
+                            .HostMemory("idx"),
+                        UniqueOp<int64, int64>);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("Unique")
@@ -125,7 +151,7 @@ REGISTER_KERNEL_BUILDER(Name("Unique")
                             .HostMemory("x")
                             .HostMemory("y")
                             .HostMemory("idx"),
-                        UniqueOp<int32>);
+                        UniqueOp<int32, int32>);
 REGISTER_KERNEL_BUILDER(Name("Unique")
                             .Device(DEVICE_SYCL)
                             .TypeConstraint<int64>("T")
@@ -133,6 +159,22 @@ REGISTER_KERNEL_BUILDER(Name("Unique")
                             .HostMemory("x")
                             .HostMemory("y")
                             .HostMemory("idx"),
-                        UniqueOp<int64>);
+                        UniqueOp<int64, int32>);
+REGISTER_KERNEL_BUILDER(Name("Unique")
+                            .Device(DEVICE_SYCL)
+                            .TypeConstraint<int32>("T")
+                            .TypeConstraint<int64>("out_idx")
+                            .HostMemory("x")
+                            .HostMemory("y")
+                            .HostMemory("idx"),
+                        UniqueOp<int32, int64>);
+REGISTER_KERNEL_BUILDER(Name("Unique")
+                            .Device(DEVICE_SYCL)
+                            .TypeConstraint<int64>("T")
+                            .TypeConstraint<int64>("out_idx")
+                            .HostMemory("x")
+                            .HostMemory("y")
+                            .HostMemory("idx"),
+                        UniqueOp<int64, int64>);
 #endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/unpack_op.cc b/tensorflow/core/kernels/unpack_op.cc
index c3bebfcbf9d..7fd1def1fe0 100644
--- a/tensorflow/core/kernels/unpack_op.cc
+++ b/tensorflow/core/kernels/unpack_op.cc
@@ -69,6 +69,8 @@ class UnpackOp : public OpKernel {
                                  std::numeric_limits<Eigen::DenseIndex>::max()),
         errors::InvalidArgument("output size must fit in Eigen DenseIndex"));
 
+// This optimization is currently not applicable for SYCL devices
+#ifndef TENSORFLOW_USE_SYCL
     // Special case: Aligned, so we can share the underlying buffer.
     //
     // Apply this optimization conservatively: if input is aligned,
@@ -85,6 +87,7 @@ class UnpackOp : public OpKernel {
       }
       return;
     }
+#endif  // TENSORFLOW_USE_SYCL
 
     int64 before_dim = 1;
     for (int i = 0; i < axis; ++i) {
diff --git a/tensorflow/core/lib/core/status.h b/tensorflow/core/lib/core/status.h
index 85eb607cc7f..3b8a322854f 100644
--- a/tensorflow/core/lib/core/status.h
+++ b/tensorflow/core/lib/core/status.h
@@ -120,17 +120,19 @@ typedef std::function<void(const Status&)> StatusCallback;
 
 extern tensorflow::string* TfCheckOpHelperOutOfLine(
     const ::tensorflow::Status& v, const char* msg);
+
 inline tensorflow::string* TfCheckOpHelper(::tensorflow::Status v,
                                            const char* msg) {
   if (v.ok()) return nullptr;
   return TfCheckOpHelperOutOfLine(v, msg);
 }
-#define TF_CHECK_OK(val)                                             \
-  while (::tensorflow::string* _result = TfCheckOpHelper(val, #val)) \
-  LOG(FATAL) << *(_result)
-#define TF_QCHECK_OK(val)                                            \
-  while (::tensorflow::string* _result = TfCheckOpHelper(val, #val)) \
-  LOG(QFATAL) << *(_result)
+
+#define TF_DO_CHECK_OK(val, level)                  \
+  while (auto _result = TfCheckOpHelper(val, #val)) \
+    LOG(level) << *(_result)
+
+#define TF_CHECK_OK(val)  TF_DO_CHECK_OK(val, FATAL)
+#define TF_QCHECK_OK(val) TF_DO_CHECK_OK(val, QFATAL)
 
 // DEBUG only version of TF_CHECK_OK.  Compiler still parses 'val' even in opt
 // mode.
diff --git a/tensorflow/core/lib/wav/wav_io.cc b/tensorflow/core/lib/wav/wav_io.cc
index 1db4746c89e..77d3c88998e 100644
--- a/tensorflow/core/lib/wav/wav_io.cc
+++ b/tensorflow/core/lib/wav/wav_io.cc
@@ -111,7 +111,9 @@ Status ReadValue(const string& data, T* value, int* offset) {
         reinterpret_cast<const uint8*>(data.data() + *offset);
     int shift = 0;
     for (int i = 0; i < sizeof(T); ++i, shift += 8) {
-      *value = *value | (data_buf[i] >> shift);
+      // Widen the byte to T before shifting: a bare uint8 promotes to int,
+      // which is too narrow once the shift reaches the top byte of a 32-bit T.
+      *value = *value | (static_cast<T>(data_buf[i]) << shift);
     }
   }
   *offset = new_offset;
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index 10187425214..0a96258dd1f 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -1368,6 +1368,34 @@ input: 4-D input to pool over.
 output: The max pooled output tensor.
 )doc");
 
+REGISTER_OP("MaxPoolV2")
+    .Attr("T: realnumbertype = DT_FLOAT")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .Input("input: T")
+    .Input("ksize: int32")
+    .Input("strides: int32")
+    .Output("output: T")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::MaxPoolV2Shape(c, 3));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Performs max pooling on the input.
+
+ksize: The size of the window for each dimension of the input tensor.
+strides: The stride of the sliding window for each dimension of the
+  input tensor.
+padding: The type of padding algorithm to use.
+data_format: Specify the data format of the input and output data. With the
+    default format "NHWC", the data is stored in the order of:
+        [batch, in_height, in_width, in_channels].
+    Alternatively, the format could be "NCHW", the data storage order of:
+        [batch, in_channels, in_height, in_width].
+input: 4-D input to pool over.
+output: The max pooled output tensor.
+)doc");
+
 REGISTER_OP("MaxPoolGrad")
     .Attr("ksize: list(int) >= 4")
     .Attr("strides: list(int) >= 4")
@@ -1399,6 +1427,37 @@ grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
 output: Gradients w.r.t. the input to `max_pool`.
 )doc");
 
+REGISTER_OP("MaxPoolGradV2")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .Input("orig_input: T")
+    .Input("orig_output: T")
+    .Input("grad: T")
+    .Input("ksize: int32")
+    .Input("strides: int32")
+    .Output("output: T")
+    .Attr("T: realnumbertype = DT_FLOAT")
+    .SetShapeFn([](InferenceContext* c) {
+      return UnchangedShapeWithRank(c, 4);
+    })
+    .Doc(R"doc(
+Computes gradients of the maxpooling function.
+
+ksize: The size of the window for each dimension of the input tensor.
+strides: The stride of the sliding window for each dimension of the
+  input tensor.
+padding: The type of padding algorithm to use.
+data_format: Specify the data format of the input and output data. With the
+    default format "NHWC", the data is stored in the order of:
+        [batch, in_height, in_width, in_channels].
+    Alternatively, the format could be "NCHW", the data storage order of:
+        [batch, in_channels, in_height, in_width].
+orig_input: The original input tensor.
+orig_output: The original output tensor.
+grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
+output: Gradients w.r.t. the input to `max_pool`.
+)doc");
+
 REGISTER_OP("MaxPoolGradGrad")
     .Attr("ksize: list(int) >= 4")
     .Attr("strides: list(int) >= 4")
@@ -1436,6 +1495,43 @@ grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
 output: Gradients of gradients w.r.t. the input to `max_pool`.
 )doc");
 
+REGISTER_OP("MaxPoolGradGradV2")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .Input("orig_input: T")
+    .Input("orig_output: T")
+    .Input("grad: T")
+    .Input("ksize: int32")
+    .Input("strides: int32")
+    .Output("output: T")
+    .Attr("T: realnumbertype")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::MaxPoolV2Shape(c, 5));
+      ShapeHandle unused;
+      // Validate 'orig_input' is the same shape as 'grad'
+      TF_RETURN_IF_ERROR(c->Merge(c->input(0), c->input(2), &unused));
+      // Validate 'orig_output' is same shape as 'output'
+      TF_RETURN_IF_ERROR(c->Merge(c->input(1), c->output(0), &unused));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Computes second-order gradients of the maxpooling function.
+
+ksize: The size of the window for each dimension of the input tensor.
+strides: The stride of the sliding window for each dimension of the
+  input tensor.
+padding: The type of padding algorithm to use.
+data_format: Specify the data format of the input and output data. With the
+    default format "NHWC", the data is stored in the order of:
+        [batch, in_height, in_width, in_channels].
+    Alternatively, the format could be "NCHW", the data storage order of:
+        [batch, in_channels, in_height, in_width].
+orig_input: The original input tensor.
+orig_output: The original output tensor.
+grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
+output: Gradients of gradients w.r.t. the input to `max_pool`.
+)doc");
+
 REGISTER_OP("MaxPoolWithArgmax")
     .Attr("ksize: list(int) >= 4")
     .Attr("strides: list(int) >= 4")
diff --git a/tensorflow/core/platform/cloud/oauth_client.cc b/tensorflow/core/platform/cloud/oauth_client.cc
index 7a9588b56ad..b2ada534fc3 100644
--- a/tensorflow/core/platform/cloud/oauth_client.cc
+++ b/tensorflow/core/platform/cloud/oauth_client.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <openssl/bio.h>
 #include <openssl/evp.h>
 #include <openssl/pem.h>
+#include <openssl/rsa.h>
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/base64.h"
 #include "tensorflow/core/platform/cloud/http_request.h"
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index ffb38a169f7..0af38affd5e 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -293,6 +293,12 @@ def tf_additional_mpi_lib_defines():
       "//conditions:default": [],
   })
 
+def tf_additional_gdr_lib_defines():
+  return select({
+      "//tensorflow:with_gdr_support": ["TENSORFLOW_USE_GDR"],
+      "//conditions:default": [],
+  })
+
 def tf_pyclif_proto_library(name, proto_lib, proto_srcfile="", visibility=None,
                             **kwargs):
   pass
diff --git a/tensorflow/core/platform/default/build_config_root.bzl b/tensorflow/core/platform/default/build_config_root.bzl
index 04bf2aeca65..1ef4588965b 100644
--- a/tensorflow/core/platform/default/build_config_root.bzl
+++ b/tensorflow/core/platform/default/build_config_root.bzl
@@ -39,3 +39,11 @@ def tf_additional_mpi_deps():
       ],
       "//conditions:default": [],
   })
+
+def tf_additional_gdr_deps():
+  return select({
+      "//tensorflow:with_gdr_support": [
+          "//tensorflow/contrib/gdr:gdr_server_lib",
+      ],
+      "//conditions:default": [],
+  })
diff --git a/tensorflow/core/platform/default/gpu_tracer.cc b/tensorflow/core/platform/default/gpu_tracer.cc
index 50c27b3cf6b..3f855461276 100644
--- a/tensorflow/core/platform/default/gpu_tracer.cc
+++ b/tensorflow/core/platform/default/gpu_tracer.cc
@@ -579,8 +579,8 @@ Status GPUTracerImpl::Collect(StepStatsCollector *collector) {
   // TODO(pbar) Handle device IDs and prefix properly.
   const string prefix = "";
   const int id = 0;
-  const string stream_device = strings::StrCat(prefix, "/gpu:", id, "/stream:");
-  const string memcpy_device = strings::StrCat(prefix, "/gpu:", id, "/memcpy");
+  const string stream_device = strings::StrCat(prefix, "/device:GPU:", id, "/stream:");
+  const string memcpy_device = strings::StrCat(prefix, "/device:GPU:", id, "/memcpy");
 
   mutex_lock l2(trace_mu_);
   for (const auto &rec : kernel_records_) {
diff --git a/tensorflow/core/platform/env.cc b/tensorflow/core/platform/env.cc
index 44f11aef968..12ef55ec26e 100644
--- a/tensorflow/core/platform/env.cc
+++ b/tensorflow/core/platform/env.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #endif
 #if defined(PLATFORM_WINDOWS)
 #include <windows.h>
+#include "tensorflow/core/platform/windows/windows_file_system.h"
 #define PATH_MAX MAX_PATH
 #else
 #include <unistd.h>
@@ -266,8 +267,14 @@ string Env::GetExecutablePath() {
   _NSGetExecutablePath(unresolved_path, &buffer_size);
   CHECK(realpath(unresolved_path, exe_path));
 #elif defined(PLATFORM_WINDOWS)
-  HMODULE hModule = GetModuleHandle(NULL);
-  GetModuleFileName(hModule, exe_path, MAX_PATH);
+  HMODULE hModule = GetModuleHandleW(NULL);
+  WCHAR wc_file_path[MAX_PATH] = {0};
+  GetModuleFileNameW(hModule, wc_file_path, MAX_PATH);
+  string file_path = WindowsFileSystem::WideCharToUtf8(wc_file_path);
+  // UTF-8 can need several bytes per wide character, so the converted path
+  // may be longer than MAX_PATH; fail loudly instead of overrunning exe_path.
+  CHECK_LT(file_path.size(), sizeof(exe_path));
+  std::copy(file_path.begin(), file_path.end(), exe_path);
 #else
   CHECK_NE(-1, readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1));
 #endif
diff --git a/tensorflow/core/platform/gpu_tracer_test.cc b/tensorflow/core/platform/gpu_tracer_test.cc
index 713282c1fd8..f6c2c6cb379 100644
--- a/tensorflow/core/platform/gpu_tracer_test.cc
+++ b/tensorflow/core/platform/gpu_tracer_test.cc
@@ -63,12 +63,12 @@ class GPUTracerTest : public ::testing::Test {
     Tensor x_tensor(DT_FLOAT, TensorShape({2, 1}));
     test::FillValues<float>(&x_tensor, {1, 1});
     Node* x = test::graph::Constant(&graph, x_tensor);
-    x->set_assigned_device_name("/job:localhost/replica:0/task:0/gpu:0");
+    x->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0");
     x_ = x->name();
 
     // y = A * x
     Node* y = test::graph::Matmul(&graph, a, x, false, false);
-    y->set_assigned_device_name("/job:localhost/replica:0/task:0/gpu:0");
+    y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0");
     y_ = y->name();
 
     // Use an Identity op to force a memcpy to CPU and back to GPU.
@@ -77,7 +77,7 @@ class GPUTracerTest : public ::testing::Test {
 
     Node* y_neg = test::graph::Unary(&graph, "Neg", i);
     y_neg_ = y_neg->name();
-    y_neg->set_assigned_device_name("/job:localhost/replica:0/task:0/gpu:0");
+    y_neg->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0");
 
     test::graph::ToGraphDef(&graph, &def_);
   }
diff --git a/tensorflow/core/platform/profile_utils/cpu_utils.cc b/tensorflow/core/platform/profile_utils/cpu_utils.cc
index 52df84e81ce..d3362690d7e 100644
--- a/tensorflow/core/platform/profile_utils/cpu_utils.cc
+++ b/tensorflow/core/platform/profile_utils/cpu_utils.cc
@@ -28,7 +28,7 @@ namespace profile_utils {
 
 static ICpuUtilsHelper* cpu_utils_helper_instance_ = nullptr;
 
-#if defined(__powerpc__) || defined(__ppc__) && ( __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#if (defined(__powerpc__) || defined(__ppc__) && ( __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || (defined(__s390x__))
    /* static */ uint64 CpuUtils::GetCycleCounterFrequency() {
      static const uint64 cpu_frequency = GetCycleCounterFrequencyImpl();
      return cpu_frequency;
diff --git a/tensorflow/core/platform/profile_utils/cpu_utils.h b/tensorflow/core/platform/profile_utils/cpu_utils.h
index 8979a40ea10..5d215b4804d 100644
--- a/tensorflow/core/platform/profile_utils/cpu_utils.h
+++ b/tensorflow/core/platform/profile_utils/cpu_utils.h
@@ -97,7 +97,7 @@ class CpuUtils {
   // Return cycle counter frequency.
   // As this method caches the cpu frequency internally,
   // the first call will incur overhead, but not subsequent calls.
-  #if defined(__powerpc__) || defined(__ppc__) && ( __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+  #if (defined(__powerpc__) || defined(__ppc__) && ( __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || (defined(__s390x__))
      static uint64 GetCycleCounterFrequency();
   #else
      static int64 GetCycleCounterFrequency();
diff --git a/tensorflow/core/platform/profile_utils/cpu_utils_test.cc b/tensorflow/core/platform/profile_utils/cpu_utils_test.cc
index e1ec4aaac0f..5b11b684dd9 100644
--- a/tensorflow/core/platform/profile_utils/cpu_utils_test.cc
+++ b/tensorflow/core/platform/profile_utils/cpu_utils_test.cc
@@ -53,7 +53,7 @@ TEST_F(CpuUtilsTest, CheckGetCurrentClockCycle) {
 }
 
 TEST_F(CpuUtilsTest, CheckCycleCounterFrequency) {
-  #if defined(__powerpc__) || defined(__ppc__) && ( __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+  #if (defined(__powerpc__) || defined(__ppc__) && ( __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || (defined(__s390x__))
      const uint64 cpu_frequency = CpuUtils::GetCycleCounterFrequency();
      CHECK_GT(cpu_frequency, 0);
      CHECK_NE(cpu_frequency, unsigned(CpuUtils::INVALID_FREQUENCY));
diff --git a/tensorflow/core/platform/windows/env.cc b/tensorflow/core/platform/windows/env.cc
index 98fcf927ac4..788a4bf4b1a 100644
--- a/tensorflow/core/platform/windows/env.cc
+++ b/tensorflow/core/platform/windows/env.cc
@@ -59,7 +59,7 @@ class WindowsEnv : public Env {
     // versions of Windows. For that reason, we try to look it up in
     // kernel32.dll at runtime and use an alternative option if the function
     // is not available.
-    HMODULE module = GetModuleHandle("kernel32.dll");
+    HMODULE module = GetModuleHandleW(L"kernel32.dll");
     if (module != NULL) {
       auto func = (FnGetSystemTimePreciseAsFileTime)GetProcAddress(
           module, "GetSystemTimePreciseAsFileTime");
@@ -72,7 +72,9 @@ class WindowsEnv : public Env {
   }
 
   bool MatchPath(const string& path, const string& pattern) override {
-    return PathMatchSpec(path.c_str(), pattern.c_str()) == TRUE;
+    std::wstring ws_path(WindowsFileSystem::Utf8ToWideChar(path));
+    std::wstring ws_pattern(WindowsFileSystem::Utf8ToWideChar(pattern));
+    return PathMatchSpecW(ws_path.c_str(), ws_pattern.c_str()) == TRUE;
   }
 
   void SleepForMicroseconds(int64 micros) override { Sleep(micros / 1000); }
@@ -124,7 +126,9 @@ class WindowsEnv : public Env {
     std::string file_name = library_filename;
     std::replace(file_name.begin(), file_name.end(), '/', '\\');
 
-    HMODULE hModule = LoadLibraryEx(file_name.c_str(), NULL,
+    std::wstring ws_file_name(WindowsFileSystem::Utf8ToWideChar(file_name));
+
+    HMODULE hModule = LoadLibraryExW(ws_file_name.c_str(), NULL,
       LOAD_WITH_ALTERED_SEARCH_PATH);
     if (!hModule) {
       return errors::NotFound(file_name + " not found");
diff --git a/tensorflow/core/platform/windows/env_time.cc b/tensorflow/core/platform/windows/env_time.cc
index 2765cb72500..16cc9dc6755 100644
--- a/tensorflow/core/platform/windows/env_time.cc
+++ b/tensorflow/core/platform/windows/env_time.cc
@@ -30,7 +30,7 @@ class WindowsEnvTime : public EnvTime {
     // versions of Windows. For that reason, we try to look it up in
     // kernel32.dll at runtime and use an alternative option if the function
     // is not available.
-    HMODULE module = GetModuleHandle("kernel32.dll");
+    HMODULE module = GetModuleHandleW(L"kernel32.dll");
     if (module != NULL) {
       auto func = (FnGetSystemTimePreciseAsFileTime)GetProcAddress(
           module, "GetSystemTimePreciseAsFileTime");
diff --git a/tensorflow/core/platform/windows/windows_file_system.cc b/tensorflow/core/platform/windows/windows_file_system.cc
index 72e7e06e65c..604348fe03a 100644
--- a/tensorflow/core/platform/windows/windows_file_system.cc
+++ b/tensorflow/core/platform/windows/windows_file_system.cc
@@ -227,6 +227,7 @@ class WinReadOnlyMemoryRegion : public ReadOnlyMemoryRegion {
 Status WindowsFileSystem::NewRandomAccessFile(
     const string& fname, std::unique_ptr<RandomAccessFile>* result) {
   string translated_fname = TranslateName(fname);
+  std::wstring ws_translated_fname = Utf8ToWideChar(translated_fname);
   result->reset();
 
   // Open the file for read-only random access
@@ -237,7 +238,7 @@ Status WindowsFileSystem::NewRandomAccessFile(
   // almost all tests would work with a possible exception of fault_injection.
   DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
 
-  HANDLE hfile = ::CreateFileA(translated_fname.c_str(), GENERIC_READ,
+  HANDLE hfile = ::CreateFileW(ws_translated_fname.c_str(), GENERIC_READ,
                                share_mode, NULL, OPEN_EXISTING, file_flags,
                                NULL);
 
@@ -253,10 +254,11 @@ Status WindowsFileSystem::NewRandomAccessFile(
 Status WindowsFileSystem::NewWritableFile(
     const string& fname, std::unique_ptr<WritableFile>* result) {
   string translated_fname = TranslateName(fname);
+  std::wstring ws_translated_fname = Utf8ToWideChar(translated_fname);
   result->reset();
 
   DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
-  HANDLE hfile = ::CreateFileA(translated_fname.c_str(), GENERIC_WRITE,
+  HANDLE hfile = ::CreateFileW(ws_translated_fname.c_str(), GENERIC_WRITE,
                                share_mode, NULL, CREATE_ALWAYS,
                                FILE_ATTRIBUTE_NORMAL, NULL);
 
@@ -272,10 +274,11 @@ Status WindowsFileSystem::NewWritableFile(
 Status WindowsFileSystem::NewAppendableFile(
     const string& fname, std::unique_ptr<WritableFile>* result) {
   string translated_fname = TranslateName(fname);
+  std::wstring ws_translated_fname = Utf8ToWideChar(translated_fname);
   result->reset();
 
   DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
-  HANDLE hfile = ::CreateFileA(translated_fname.c_str(), GENERIC_WRITE,
+  HANDLE hfile = ::CreateFileW(ws_translated_fname.c_str(), GENERIC_WRITE,
                                share_mode, NULL, OPEN_ALWAYS,
                                FILE_ATTRIBUTE_NORMAL, NULL);
 
@@ -301,6 +304,7 @@ Status WindowsFileSystem::NewAppendableFile(
 Status WindowsFileSystem::NewReadOnlyMemoryRegionFromFile(
     const string& fname, std::unique_ptr<ReadOnlyMemoryRegion>* result) {
   string translated_fname = TranslateName(fname);
+  std::wstring ws_translated_fname = Utf8ToWideChar(translated_fname);
   result->reset();
   Status s = Status::OK();
 
@@ -312,7 +316,7 @@ Status WindowsFileSystem::NewReadOnlyMemoryRegionFromFile(
   file_flags |= FILE_FLAG_OVERLAPPED;
 
   DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
-  HANDLE hfile = ::CreateFileA(translated_fname.c_str(), GENERIC_READ,
+  HANDLE hfile = ::CreateFileW(ws_translated_fname.c_str(), GENERIC_READ,
                                share_mode, NULL, OPEN_EXISTING, file_flags,
                                NULL);
 
@@ -382,28 +386,30 @@ Status WindowsFileSystem::FileExists(const string& fname) {
 Status WindowsFileSystem::GetChildren(const string& dir,
                                       std::vector<string>* result) {
   string translated_dir = TranslateName(dir);
+  std::wstring ws_translated_dir = Utf8ToWideChar(translated_dir);
   result->clear();
 
-  string pattern = translated_dir;
+  std::wstring pattern = ws_translated_dir;
   if (!pattern.empty() && pattern.back() != '\\' && pattern.back() != '/') {
-    pattern += "\\*";
+    pattern += L"\\*";
   } else {
-    pattern += '*';
+    pattern += L'*';
   }
 
-  WIN32_FIND_DATA find_data;
-  HANDLE find_handle = ::FindFirstFileA(pattern.c_str(), &find_data);
+  WIN32_FIND_DATAW find_data;
+  HANDLE find_handle = ::FindFirstFileW(pattern.c_str(), &find_data);
   if (find_handle == INVALID_HANDLE_VALUE) {
     string context = "FindFirstFile failed for: " + translated_dir;
     return IOErrorFromWindowsError(context, ::GetLastError());
   }
 
   do {
-    const StringPiece basename = find_data.cFileName;
+    string file_name = WideCharToUtf8(find_data.cFileName);
+    const StringPiece basename = file_name;
     if (basename != "." && basename != "..") {
-      result->push_back(find_data.cFileName);
+      result->push_back(file_name);
     }
-  } while (::FindNextFileA(find_handle, &find_data));
+  } while (::FindNextFileW(find_handle, &find_data));
 
   if (!::FindClose(find_handle)) {
     string context = "FindClose failed for: " + translated_dir;
@@ -415,7 +421,8 @@ Status WindowsFileSystem::GetChildren(const string& dir,
 
 Status WindowsFileSystem::DeleteFile(const string& fname) {
   Status result;
-  if (unlink(TranslateName(fname).c_str()) != 0) {
+  std::wstring file_name = Utf8ToWideChar(TranslateName(fname));
+  if (_wunlink(file_name.c_str()) != 0) {
     result = IOError("Failed to delete a file: " + fname, errno);
   }
   return result;
@@ -423,7 +430,8 @@ Status WindowsFileSystem::DeleteFile(const string& fname) {
 
 Status WindowsFileSystem::CreateDir(const string& name) {
   Status result;
-  if (_mkdir(TranslateName(name).c_str()) != 0) {
+  std::wstring ws_name = Utf8ToWideChar(TranslateName(name));
+  if (_wmkdir(ws_name.c_str()) != 0) {
     result = IOError("Failed to create a directory: " + name, errno);
   }
   return result;
@@ -431,7 +439,8 @@ Status WindowsFileSystem::CreateDir(const string& name) {
 
 Status WindowsFileSystem::DeleteDir(const string& name) {
   Status result;
-  if (_rmdir(TranslateName(name).c_str()) != 0) {
+  std::wstring ws_name = Utf8ToWideChar(TranslateName(name));
+  if (_wrmdir(ws_name.c_str()) != 0) {
     result = IOError("Failed to remove a directory: " + name, errno);
   }
   return result;
@@ -439,9 +448,10 @@ Status WindowsFileSystem::DeleteDir(const string& name) {
 
 Status WindowsFileSystem::GetFileSize(const string& fname, uint64* size) {
   string translated_fname = TranslateName(fname);
+  std::wstring ws_translated_fname = Utf8ToWideChar(translated_fname);
   Status result;
   WIN32_FILE_ATTRIBUTE_DATA attrs;
-  if (TRUE == ::GetFileAttributesExA(translated_fname.c_str(),
+  if (TRUE == ::GetFileAttributesExW(ws_translated_fname.c_str(),
                                      GetFileExInfoStandard, &attrs)) {
     ULARGE_INTEGER file_size;
     file_size.HighPart = attrs.nFileSizeHigh;
@@ -459,7 +469,9 @@ Status WindowsFileSystem::RenameFile(const string& src, const string& target) {
   Status result;
   // rename() is not capable of replacing the existing file as on Linux
   // so use OS API directly
-  if (!::MoveFileExA(TranslateName(src).c_str(), TranslateName(target).c_str(),
+  std::wstring ws_translated_src = Utf8ToWideChar(TranslateName(src));
+  std::wstring ws_translated_target = Utf8ToWideChar(TranslateName(target));
+  if (!::MoveFileExW(ws_translated_src.c_str(), ws_translated_target.c_str(),
       MOVEFILE_REPLACE_EXISTING)) {
     string context(strings::StrCat("Failed to rename: ", src, " to: ", target));
     result = IOErrorFromWindowsError(context, ::GetLastError());
@@ -487,12 +499,13 @@ Status WindowsFileSystem::GetMatchingPaths(const string& pattern,
 Status WindowsFileSystem::Stat(const string& fname, FileStatistics* stat) {
   Status result;
   struct _stat sbuf;
-  if (_stat(TranslateName(fname).c_str(), &sbuf) != 0) {
+  std::wstring ws_translated_fname = Utf8ToWideChar(TranslateName(fname));
+  if (_wstat(ws_translated_fname.c_str(), &sbuf) != 0) {
     result = IOError(fname, errno);
   } else {
     stat->mtime_nsec = sbuf.st_mtime * 1e9;
     stat->length = sbuf.st_size;
-    stat->is_directory = PathIsDirectory(TranslateName(fname).c_str());
+    stat->is_directory = PathIsDirectoryW(ws_translated_fname.c_str());
   }
   return result;
 }
diff --git a/tensorflow/core/platform/windows/windows_file_system.h b/tensorflow/core/platform/windows/windows_file_system.h
index 507290e9e66..8dcc1530370 100644
--- a/tensorflow/core/platform/windows/windows_file_system.h
+++ b/tensorflow/core/platform/windows/windows_file_system.h
@@ -66,6 +66,39 @@ class WindowsFileSystem : public FileSystem {
   string TranslateName(const string& name) const override {
     return name;
   }
+
+  // Converts the UTF-8 encoded `utf8str` to a UTF-16 wide string suitable for
+  // the wide-character ("W") Win32 and CRT entry points. Returns an empty
+  // string when the input is empty or the conversion fails.
+  static std::wstring Utf8ToWideChar(const string& utf8str) {
+    // MultiByteToWideChar rejects a zero-length input, so handle it up front
+    // (mirrors the empty check in WideCharToUtf8 below).
+    if (utf8str.empty()) return std::wstring();
+    // First call: ask only for the number of wide characters needed.
+    int size_required = MultiByteToWideChar(CP_UTF8, 0, utf8str.c_str(),
+                                            (int)utf8str.size(), NULL, 0);
+    if (size_required <= 0) return std::wstring();
+    std::wstring ws_translated_str(size_required, 0);
+    // Second call: convert into the pre-sized buffer.
+    MultiByteToWideChar(CP_UTF8, 0, utf8str.c_str(), (int)utf8str.size(),
+                        &ws_translated_str[0], size_required);
+    return ws_translated_str;
+  }
+
+  // Converts the UTF-16 wide string `wstr` to a UTF-8 encoded string. Returns
+  // an empty string when the input is empty or the conversion fails.
+  static string WideCharToUtf8(const std::wstring& wstr) {
+    if (wstr.empty()) return std::string();
+    // First call: ask only for the number of bytes needed.
+    int size_required = WideCharToMultiByte(
+        CP_UTF8, 0, wstr.c_str(), (int)wstr.size(), NULL, 0, NULL, NULL);
+    if (size_required <= 0) return std::string();
+    string utf8_translated_str(size_required, 0);
+    // Second call: convert into the pre-sized buffer.
+    WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), (int)wstr.size(),
+                        &utf8_translated_str[0], size_required, NULL, NULL);
+    return utf8_translated_str;
+  }
 };
 
 class LocalWinFileSystem : public WindowsFileSystem {
diff --git a/tensorflow/core/profiler/README.md b/tensorflow/core/profiler/README.md
index 6db38a59aef..06118e6eb21 100644
--- a/tensorflow/core/profiler/README.md
+++ b/tensorflow/core/profiler/README.md
@@ -127,10 +127,10 @@ tfprof> advise
 Not running under xxxx. Skip JobChecker.
 
 AcceleratorUtilizationChecker:
-device: /job:worker/replica:0/task:0/gpu:0 low utilization: 0.03
-device: /job:worker/replica:0/task:0/gpu:1 low utilization: 0.08
-device: /job:worker/replica:0/task:0/gpu:2 low utilization: 0.04
-device: /job:worker/replica:0/task:0/gpu:3 low utilization: 0.21
+device: /job:worker/replica:0/task:0/device:GPU:0 low utilization: 0.03
+device: /job:worker/replica:0/task:0/device:GPU:1 low utilization: 0.08
+device: /job:worker/replica:0/task:0/device:GPU:2 low utilization: 0.04
+device: /job:worker/replica:0/task:0/device:GPU:3 low utilization: 0.21
 
 OperationChecker:
 Found operation using NHWC data_format on GPU. Maybe NCHW is faster.
diff --git a/tensorflow/core/profiler/g3doc/advise.md b/tensorflow/core/profiler/g3doc/advise.md
index cc16c8fdffd..d87b0d8603d 100644
--- a/tensorflow/core/profiler/g3doc/advise.md
+++ b/tensorflow/core/profiler/g3doc/advise.md
@@ -31,10 +31,10 @@ tfprof --graph_path=graph.pbtxt \
 
 tfprof> advise
 AcceleratorUtilizationChecker:
-device: /job:worker/replica:0/task:0/gpu:0 low utilization: 0.03
-device: /job:worker/replica:0/task:0/gpu:1 low utilization: 0.08
-device: /job:worker/replica:0/task:0/gpu:2 low utilization: 0.04
-device: /job:worker/replica:0/task:0/gpu:3 low utilization: 0.21
+device: /job:worker/replica:0/task:0/device:GPU:0 low utilization: 0.03
+device: /job:worker/replica:0/task:0/device:GPU:1 low utilization: 0.08
+device: /job:worker/replica:0/task:0/device:GPU:2 low utilization: 0.04
+device: /job:worker/replica:0/task:0/device:GPU:3 low utilization: 0.21
 
 OperationChecker:
 Found operation using NHWC data_format on GPU. Maybe NCHW is faster.
diff --git a/tensorflow/core/profiler/g3doc/profile_time.md b/tensorflow/core/profiler/g3doc/profile_time.md
index db555b36174..e11a75553b2 100644
--- a/tensorflow/core/profiler/g3doc/profile_time.md
+++ b/tensorflow/core/profiler/g3doc/profile_time.md
@@ -134,7 +134,7 @@ AddN                            50.10ms (17.33%, 1.34%),       5481
 tfprof> op -select micros,device -order_by micros
 node name | execution time | assigned devices
 SoftmaxCrossEntropyWithLogits     1.37sec (100.00%, 36.44%), /job:worker/replica:0/task:0/cpu:0
-MatMul                        618.97ms (63.56%, 16.51%), |/job:worker/replica:0/task:0/cpu:0|/job:worker/replica:0/task:0/gpu:0|/job:worker/replica:0/task:0/gpu:1|/job:worker/replica:0/task:0/gpu:2|/job:worker/replica:0/task:0/gpu:3
+MatMul                        618.97ms (63.56%, 16.51%), |/job:worker/replica:0/task:0/cpu:0|/job:worker/replica:0/task:0/device:GPU:0|/job:worker/replica:0/task:0/device:GPU:1|/job:worker/replica:0/task:0/device:GPU:2|/job:worker/replica:0/task:0/device:GPU:3
 ```
 
 
diff --git a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
index 096c1d915ca..23ed287f7bb 100644
--- a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
+++ b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
@@ -53,10 +53,10 @@ class TFProfAdvisorTest : public ::testing::Test {
     NodeExecStats node_stat;
     node_stat.set_all_start_micros(start_miros);
     node_stat.set_op_end_rel_micros(end_rel_micros);
-    node->AddStepStat(step, "/job:localhost/replica:0/task:0/gpu:0", node_stat);
-    node->AddStepStat(step, "/job:localhost/replica:0/task:0/gpu:0:stream:all",
+    node->AddStepStat(step, "/job:localhost/replica:0/task:0/device:GPU:0", node_stat);
+    node->AddStepStat(step, "/job:localhost/replica:0/task:0/device:GPU:0:stream:all",
                       node_stat);
-    node->AddStepStat(step, "/job:localhost/replica:0/task:0/gpu:0:stream:0",
+    node->AddStepStat(step, "/job:localhost/replica:0/task:0/device:GPU:0:stream:0",
                       node_stat);
     return node;
   }
diff --git a/tensorflow/core/profiler/internal/tfprof_code.h b/tensorflow/core/profiler/internal/tfprof_code.h
index 5e64104d9fa..8da036e6b7b 100644
--- a/tensorflow/core/profiler/internal/tfprof_code.h
+++ b/tensorflow/core/profiler/internal/tfprof_code.h
@@ -14,7 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 // Build a tree structure based on the TensorFlow model's python code stacks.
-// Stats are aggregated from descendants from ancestors.
+// Stats are aggregated from descendants to ancestors.
 
 #ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CODE_H_
 #define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CODE_H_
diff --git a/tensorflow/core/profiler/internal/tfprof_node.cc b/tensorflow/core/profiler/internal/tfprof_node.cc
index 70b91c37e4b..d4a784ffaa6 100644
--- a/tensorflow/core/profiler/internal/tfprof_node.cc
+++ b/tensorflow/core/profiler/internal/tfprof_node.cc
@@ -25,7 +25,7 @@ bool CountAsAcceleratorTime(const string& device) {
 }
 
 bool CountAsCPUTime(const string& device) {
-  return RE2::FullMatch(device, ".*/(gpu|cpu|device:sycl):\\d+");
+  return RE2::FullMatch(device, ".*/(device:gpu|gpu|cpu|device:sycl):\\d+");
 }
 
 bool IsCanonicalDevice(const string& device) { return CountAsCPUTime(device); }
@@ -143,7 +143,7 @@ void TFGraphNode::AddStepStat(int64 step, const string& device,
 
   // TODO(xpan): Make this more robust?
   // See run_metadata_test.py
-  // It can be /job:0/replica:0/xxxx/gpu:0, or simply /gpu:0.
+  // It can be /job:0/replica:0/xxxx/device:GPU:0, or simply /device:GPU:0.
   // It can has some ad-hoc suffix, such as /stream:xx or /memcpy:xx.
   if (IsCanonicalDevice(dev)) {
     if (!canonical_device_.empty()) {
diff --git a/tensorflow/core/profiler/internal/tfprof_scope.h b/tensorflow/core/profiler/internal/tfprof_scope.h
index 5e1fa2a32ad..710991dde6b 100644
--- a/tensorflow/core/profiler/internal/tfprof_scope.h
+++ b/tensorflow/core/profiler/internal/tfprof_scope.h
@@ -15,7 +15,7 @@ limitations under the License.
 
 // Build a tree structure based on the TensorFlow op names.
 // For example, 'name1/name2' is a child of 'name1'.
-// Stats are aggregated from descendants from ancestors.
+// Stats are aggregated from descendants to ancestors.
 
 #ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SCOPE_H_
 #define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SCOPE_H_
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index 69311e3a7f3..56bb709e119 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -42,7 +42,7 @@ message GPUOptions {
   // A comma-separated list of GPU ids that determines the 'visible'
   // to 'virtual' mapping of GPU devices.  For example, if TensorFlow
   // can see 8 GPU devices in the process, and one wanted to map
-  // visible GPU devices 5 and 3 as "/gpu:0", and "/gpu:1", then one
+  // visible GPU devices 5 and 3 as "/device:GPU:0", and "/device:GPU:1", then one
   // would specify this field as "5,3".  This field is similar in
   // spirit to the CUDA_VISIBLE_DEVICES environment variable, except
   // it applies to the visible GPU devices in the process.
diff --git a/tensorflow/core/protobuf/rewriter_config.proto b/tensorflow/core/protobuf/rewriter_config.proto
index b71edb4568e..aea00b17d9b 100644
--- a/tensorflow/core/protobuf/rewriter_config.proto
+++ b/tensorflow/core/protobuf/rewriter_config.proto
@@ -30,7 +30,7 @@ message RewriterConfig {
   // Fold constants (default is OFF)
   Toggle constant_folding = 3;
 
-  // If true, don't remove unecessary ops from the graph
+  // If true, don't remove unnecessary ops from the graph
   bool disable_model_pruning = 2;
 
   enum MemOptType {
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 4626ab8ea52..2fefa67d7d0 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -19,12 +19,12 @@ limitations under the License.
 // TensorFlow uses semantic versioning, see http://semver.org/.
 
 #define TF_MAJOR_VERSION 1
-#define TF_MINOR_VERSION 2
-#define TF_PATCH_VERSION 1
+#define TF_MINOR_VERSION 3
+#define TF_PATCH_VERSION 0
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX "-rc1"
+#define TF_VERSION_SUFFIX "-rc2"
 
 #define TF_STR_HELPER(x) #x
 #define TF_STR(x) TF_STR_HELPER(x)
diff --git a/tensorflow/core/util/device_name_utils_test.cc b/tensorflow/core/util/device_name_utils_test.cc
index 008100aa446..9a3f8849a65 100644
--- a/tensorflow/core/util/device_name_utils_test.cc
+++ b/tensorflow/core/util/device_name_utils_test.cc
@@ -76,21 +76,21 @@ TEST(DeviceNameUtilsTest, Basic) {
     DeviceNameUtils::ParsedName p;
     EXPECT_FALSE(DeviceNameUtils::ParseFullName("foobar", &p));
     EXPECT_FALSE(
-        DeviceNameUtils::ParseFullName("/job:123/replica:1/task:2/gpu:3", &p));
+        DeviceNameUtils::ParseFullName("/job:123/replica:1/task:2/device:GPU:3", &p));
     EXPECT_FALSE(
         DeviceNameUtils::ParseFullName("/job:123/replica:1/task:2/gpu:", &p));
     EXPECT_FALSE(DeviceNameUtils::ParseFullName(
         "/job:123/replica:1/task:2/device:gpu:", &p));
     EXPECT_FALSE(
-        DeviceNameUtils::ParseFullName("/job:foo/replica:-1/task:2/gpu:3", &p));
+        DeviceNameUtils::ParseFullName("/job:foo/replica:-1/task:2/device:GPU:3", &p));
     EXPECT_FALSE(
-        DeviceNameUtils::ParseFullName("/job:foo/replica:1/task:-2/gpu:3", &p));
+        DeviceNameUtils::ParseFullName("/job:foo/replica:1/task:-2/device:GPU:3", &p));
     EXPECT_FALSE(
         DeviceNameUtils::ParseFullName("/job:foo/replica:1/task:2/bar:3", &p));
     EXPECT_FALSE(DeviceNameUtils::ParseFullName(
-        "/job:foo/replica:1/task:2/gpu:3/extra", &p));
+        "/job:foo/replica:1/task:2/device:GPU:3/extra", &p));
     EXPECT_TRUE(
-        DeviceNameUtils::ParseFullName("/job:foo/replica:1/task:2/gpu:3", &p));
+        DeviceNameUtils::ParseFullName("/job:foo/replica:1/task:2/device:GPU:3", &p));
     EXPECT_TRUE(p.has_job);
     EXPECT_TRUE(p.has_replica);
     EXPECT_TRUE(p.has_task);
@@ -106,7 +106,7 @@ TEST(DeviceNameUtilsTest, Basic) {
     // Allow _ in job names.
     DeviceNameUtils::ParsedName p;
     EXPECT_TRUE(DeviceNameUtils::ParseFullName(
-        "/job:foo_bar/replica:1/task:2/gpu:3", &p));
+        "/job:foo_bar/replica:1/task:2/device:GPU:3", &p));
     EXPECT_TRUE(p.has_job);
     EXPECT_TRUE(p.has_replica);
     EXPECT_TRUE(p.has_task);
@@ -193,7 +193,7 @@ TEST(DeviceNameUtilsTest, Basic) {
   }
   {
     DeviceNameUtils::ParsedName p;
-    EXPECT_TRUE(DeviceNameUtils::ParseFullName("/job:*/replica:4/gpu:5", &p));
+    EXPECT_TRUE(DeviceNameUtils::ParseFullName("/job:*/replica:4/device:GPU:5", &p));
     EXPECT_FALSE(p.has_job);
     EXPECT_TRUE(p.has_replica);
     EXPECT_FALSE(p.has_task);
@@ -216,13 +216,13 @@ TEST(DeviceNameUtilsTest, Basic) {
   }
 
   EXPECT_TRUE(DeviceNameUtils::IsSameAddressSpace(
-      "/job:foo/replica:1/task:2/cpu:3", "/job:foo/replica:1/task:2/gpu:4"));
+      "/job:foo/replica:1/task:2/cpu:3", "/job:foo/replica:1/task:2/device:GPU:4"));
   EXPECT_FALSE(DeviceNameUtils::IsSameAddressSpace(
-      "/job:foo/replica:1/task:2/cpu:3", "/job:foo/replica:1/task:3/gpu:4"));
+      "/job:foo/replica:1/task:2/cpu:3", "/job:foo/replica:1/task:3/device:GPU:4"));
   EXPECT_FALSE(DeviceNameUtils::IsSameAddressSpace(
-      "/job:foo/replica:1/task:2/cpu:3", "/job:foo/replica:10/task:2/gpu:4"));
+      "/job:foo/replica:1/task:2/cpu:3", "/job:foo/replica:10/task:2/device:GPU:4"));
   EXPECT_FALSE(DeviceNameUtils::IsSameAddressSpace(
-      "/job:foo/replica:1/task:2/cpu:3", "/job:bar/replica:1/task:2/gpu:4"));
+      "/job:foo/replica:1/task:2/cpu:3", "/job:bar/replica:1/task:2/device:GPU:4"));
 
   EXPECT_EQ(DeviceNameUtils::LocalName("CPU", 1), "CPU:1");
   EXPECT_EQ(DeviceNameUtils::LocalName("GPU", 2), "GPU:2");
@@ -284,17 +284,17 @@ static bool IsCSHelper(StringPiece pattern, StringPiece actual) {
 }
 
 TEST(DeviceNameUtilsTest, IsCompleteSpecification) {
-  EXPECT_TRUE(IsCSHelper("/job:*", "/job:work/replica:1/task:2/gpu:3"));
+  EXPECT_TRUE(IsCSHelper("/job:*", "/job:work/replica:1/task:2/device:GPU:3"));
   EXPECT_TRUE(
-      IsCSHelper("/job:*/replica:*", "/job:work/replica:1/task:2/gpu:3"));
-  EXPECT_TRUE(IsCSHelper("/job:*/task:*", "/job:work/replica:1/task:2/gpu:3"));
+      IsCSHelper("/job:*/replica:*", "/job:work/replica:1/task:2/device:GPU:3"));
+  EXPECT_TRUE(IsCSHelper("/job:*/task:*", "/job:work/replica:1/task:2/device:GPU:3"));
   EXPECT_TRUE(IsCSHelper("/job:*/replica:*/task:*",
-                         "/job:work/replica:1/task:2/gpu:3"));
+                         "/job:work/replica:1/task:2/device:GPU:3"));
   EXPECT_TRUE(
-      IsCSHelper("/job:*/replica:*/gpu:*", "/job:work/replica:1/task:2/gpu:3"));
-  EXPECT_FALSE(IsCSHelper("/cpu:*", "/job:worker/replica:1/task:2/gpu:3"));
-  EXPECT_FALSE(IsCSHelper("/gpu:2", "/job:worker/replica:1/task:2/gpu:1"));
-  EXPECT_TRUE(IsCSHelper("/gpu:*", "/job:worker/replica:1/task:2/gpu:3"));
+      IsCSHelper("/job:*/replica:*/gpu:*", "/job:work/replica:1/task:2/device:GPU:3"));
+  EXPECT_FALSE(IsCSHelper("/cpu:*", "/job:worker/replica:1/task:2/device:GPU:3"));
+  EXPECT_FALSE(IsCSHelper("/device:GPU:2", "/job:worker/replica:1/task:2/device:GPU:1"));
+  EXPECT_TRUE(IsCSHelper("/gpu:*", "/job:worker/replica:1/task:2/device:GPU:3"));
 }
 
 static bool IsSpecHelper(StringPiece pattern, StringPiece actual) {
@@ -305,36 +305,36 @@ static bool IsSpecHelper(StringPiece pattern, StringPiece actual) {
 }
 
 TEST(DeviceNameUtilsTest, IsSpecification) {
-  EXPECT_TRUE(IsSpecHelper("/job:*", "/job:work/replica:1/task:2/gpu:3"));
-  EXPECT_TRUE(IsSpecHelper("/job:*", "/job:work/replica:1/gpu:3"));
+  EXPECT_TRUE(IsSpecHelper("/job:*", "/job:work/replica:1/task:2/device:GPU:3"));
+  EXPECT_TRUE(IsSpecHelper("/job:*", "/job:work/replica:1/device:GPU:3"));
   EXPECT_TRUE(IsSpecHelper("/job:*", "/job:work/replica:1"));
   EXPECT_TRUE(IsSpecHelper("/job:*", "/replica:1"));
   EXPECT_TRUE(IsSpecHelper("/job:*", "/job:work"));
   EXPECT_TRUE(
-      IsSpecHelper("/job:*/replica:*", "/job:work/replica:1/task:2/gpu:3"));
+      IsSpecHelper("/job:*/replica:*", "/job:work/replica:1/task:2/device:GPU:3"));
   EXPECT_TRUE(IsSpecHelper("/job:work/replica:1/gpu:*",
-                           "/job:work/replica:1/task:2/gpu:3"));
-  EXPECT_TRUE(IsSpecHelper("/job:work/replica:1/gpu:3",
-                           "/job:work/replica:1/task:2/gpu:3"));
+                           "/job:work/replica:1/task:2/device:GPU:3"));
+  EXPECT_TRUE(IsSpecHelper("/job:work/replica:1/device:GPU:3",
+                           "/job:work/replica:1/task:2/device:GPU:3"));
   EXPECT_TRUE(IsSpecHelper("/job:work/replica:1/task:2",
-                           "/job:work/replica:1/task:2/gpu:3"));
+                           "/job:work/replica:1/task:2/device:GPU:3"));
   EXPECT_TRUE(IsSpecHelper("/job:work/replica:*/task:2",
-                           "/job:work/replica:1/task:2/gpu:3"));
-  EXPECT_TRUE(IsSpecHelper("/task:*", "/job:*/replica:1/task:2/gpu:3"));
-  EXPECT_TRUE(IsSpecHelper("/task:2", "/job:*/replica:1/task:2/gpu:3"));
+                           "/job:work/replica:1/task:2/device:GPU:3"));
+  EXPECT_TRUE(IsSpecHelper("/task:*", "/job:*/replica:1/task:2/device:GPU:3"));
+  EXPECT_TRUE(IsSpecHelper("/task:2", "/job:*/replica:1/task:2/device:GPU:3"));
   EXPECT_TRUE(IsSpecHelper("/cpu:*", "/job:*/replica:1/task:2/cpu:1"));
   EXPECT_TRUE(IsSpecHelper("/cpu:0", "/cpu:0"));
-  EXPECT_TRUE(IsSpecHelper("/gpu:*", "/job:worker/replica:1/task:2/gpu:3"));
+  EXPECT_TRUE(IsSpecHelper("/gpu:*", "/job:worker/replica:1/task:2/device:GPU:3"));
 
-  EXPECT_FALSE(IsSpecHelper("/job:worker/replica:1/task:2/gpu:3", "/gpu:*"));
+  EXPECT_FALSE(IsSpecHelper("/job:worker/replica:1/task:2/device:GPU:3", "/gpu:*"));
   EXPECT_FALSE(IsSpecHelper("/cpu:*", "/job:*/replica:1/task:2"));
-  EXPECT_FALSE(IsSpecHelper("/cpu:*", "/job:*/replica:1/task:2/gpu:1"));
-  EXPECT_FALSE(IsSpecHelper("/cpu:*", "/job:worker/replica:1/task:2/gpu:3"));
-  EXPECT_FALSE(IsSpecHelper("/gpu:2", "/job:worker/replica:1/task:2/gpu:1"));
+  EXPECT_FALSE(IsSpecHelper("/cpu:*", "/job:*/replica:1/task:2/device:GPU:1"));
+  EXPECT_FALSE(IsSpecHelper("/cpu:*", "/job:worker/replica:1/task:2/device:GPU:3"));
+  EXPECT_FALSE(IsSpecHelper("/device:GPU:2", "/job:worker/replica:1/task:2/device:GPU:1"));
   EXPECT_FALSE(IsSpecHelper("/job:work/replica:*/task:0",
-                            "/job:work/replica:1/task:2/gpu:3"));
+                            "/job:work/replica:1/task:2/device:GPU:3"));
   EXPECT_FALSE(IsSpecHelper("/job:work/replica:0/task:2",
-                            "/job:work/replica:*/task:2/gpu:3"));
+                            "/job:work/replica:*/task:2/device:GPU:3"));
 }
 
 TEST(DeviceNameUtilsTest, SplitDeviceName) {
@@ -348,7 +348,7 @@ TEST(DeviceNameUtilsTest, SplitDeviceName) {
       "/job:foo/cpu:1/task:2/replica:1", &task, &device));
   EXPECT_EQ("/job:foo/replica:1/task:2", task);
   EXPECT_EQ("CPU:1", device);
-  EXPECT_TRUE(DeviceNameUtils::SplitDeviceName("/gpu:3", &task, &device));
+  EXPECT_TRUE(DeviceNameUtils::SplitDeviceName("/device:GPU:3", &task, &device));
   EXPECT_EQ("", task);
   EXPECT_EQ("GPU:3", device);
   EXPECT_FALSE(DeviceNameUtils::SplitDeviceName("gpu:3", &task, &device));
@@ -413,11 +413,11 @@ TEST(DeviceNameUtilsTest, MergeDevNames) {
   MergeDevNamesHelper("", "/job:foo", "/job:foo");
   MergeDevNamesHelper("", "/replica:2", "/replica:2");
   MergeDevNamesHelper("", "/task:7", "/task:7");
-  // MergeDevNamesHelper("", "/gpu:1", "/gpu:1");
+  // MergeDevNamesHelper("", "/device:GPU:1", "/device:GPU:1");
 
   // Combining disjoint names.
   MergeDevNamesHelper("/job:foo", "/task:7", "/job:foo/task:7");
-  MergeDevNamesHelper("/job:foo", "/gpu:1", "/job:foo/gpu:1");
+  MergeDevNamesHelper("/job:foo", "/device:GPU:1", "/job:foo/device:GPU:1");
 
   // Combining overlapping names.
   MergeDevNamesHelper("/job:foo/replica:0", "/replica:0/task:1",
@@ -426,25 +426,25 @@ TEST(DeviceNameUtilsTest, MergeDevNames) {
   // Wildcard tests.
   MergeDevNamesHelper("", "/gpu:*", "/gpu:*");
   MergeDevNamesHelper("/gpu:*", "/gpu:*", "/gpu:*");
-  MergeDevNamesHelper("/gpu:1", "/gpu:*", "/gpu:1");
+  MergeDevNamesHelper("/device:GPU:1", "/gpu:*", "/device:GPU:1");
 
   // Incompatible components.
   MergeDevNamesError("/job:foo", "/job:bar", "incompatible jobs");
   MergeDevNamesError("/replica:0", "/replica:1", "incompatible replicas");
   MergeDevNamesError("/task:0", "/task:1", "incompatible tasks");
   MergeDevNamesError("/gpu:*", "/cpu:*", "incompatible types");
-  MergeDevNamesError("/gpu:0", "/gpu:1", "incompatible ids");
+  MergeDevNamesError("/device:GPU:0", "/device:GPU:1", "incompatible ids");
 }
 
 TEST(DeviceNameUtilsTest, MergeDevNamesAllowSoftPlacement) {
   // Incompatible components with allow_soft_placement.
   MergeDevNamesHelperAllowSoftPlacement("/gpu:*", "/cpu:1", "");
-  MergeDevNamesHelperAllowSoftPlacement("/cpu:*", "/gpu:1", "");
-  MergeDevNamesHelperAllowSoftPlacement("/gpu:1", "/gpu:2", "/gpu:*");
+  MergeDevNamesHelperAllowSoftPlacement("/cpu:*", "/device:GPU:1", "");
+  MergeDevNamesHelperAllowSoftPlacement("/device:GPU:1", "/device:GPU:2", "/device:GPU:*");
 }
 
 TEST(DeviceNameUtilsTest, GetNamesForDeviceMappings) {
-  DeviceNameUtils::ParsedName p = Name("/job:foo/replica:10/task:0/gpu:1");
+  DeviceNameUtils::ParsedName p = Name("/job:foo/replica:10/task:0/device:GPU:1");
   EXPECT_EQ(str_util::Join(DeviceNameUtils::GetNamesForDeviceMappings(p), ","),
             "/job:foo/replica:10/task:0/device:GPU:1,"
             "/job:foo/replica:10/task:0/gpu:1");
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 35aca709d92..cb22a50e8f1 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -616,8 +616,6 @@ inline void ForwarMklTensorInToOut(OpKernelContext* context,
   }
 }
 
-  // TODO(intel_tf): Remove this routine when faster MKL layout conversion is
-  // out. 
 inline void MklNHWCToNCHW(const Tensor& input, Tensor** output) {
   const float* buf_in = input.flat<float>().data();
   float* buf_out = (*output)->flat<float>().data();
@@ -634,8 +632,6 @@ inline void MklNHWCToNCHW(const Tensor& input, Tensor** output) {
   }
 }
 
-  // TODO(intel_tf): Remove this routine when faster MKL layout conversion is
-  // out. 
 inline void MklNCHWToNHWC(const Tensor& input, Tensor** output) {
   const float* buf_in = input.flat<float>().data();
   float* buf_out = (*output)->flat<float>().data();
diff --git a/tensorflow/docs_src/api_guides/python/contrib.seq2seq.md b/tensorflow/docs_src/api_guides/python/contrib.seq2seq.md
index b56a4884b4c..496d43dfd7e 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.seq2seq.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.seq2seq.md
@@ -73,12 +73,12 @@ other wrappers and the dynamic decoder described below.  For example, one can
 write:
 
 ```python
-cell = tf.contrib.rnn.DeviceWrapper(LSTMCell(512), "/gpu:0")
+cell = tf.contrib.rnn.DeviceWrapper(LSTMCell(512), "/device:GPU:0")
 attention_mechanism = tf.contrib.seq2seq.LuongAttention(512, encoder_outputs)
 attn_cell = tf.contrib.seq2seq.AttentionWrapper(
   cell, attention_mechanism, attention_size=256)
-attn_cell = tf.contrib.rnn.DeviceWrapper(attn_cell, "/gpu:1")
-top_cell = tf.contrib.rnn.DeviceWrapper(LSTMCell(512), "/gpu:1")
+attn_cell = tf.contrib.rnn.DeviceWrapper(attn_cell, "/device:GPU:1")
+top_cell = tf.contrib.rnn.DeviceWrapper(LSTMCell(512), "/device:GPU:1")
 multi_cell = MultiRNNCell([attn_cell, top_cell])
 ```
 
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index 1426fb3e021..ec373116247 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -35,7 +35,7 @@ enable TensorFlow for C:
          OS="linux" # Change to "darwin" for Mac OS
          TARGET_DIRECTORY="/usr/local"
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.3.0-rc1.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.3.0-rc2.tar.gz" |
            sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md
index f0299f516d0..b7dc033efc0 100644
--- a/tensorflow/docs_src/install/install_go.md
+++ b/tensorflow/docs_src/install/install_go.md
@@ -35,7 +35,7 @@ steps to install this library and enable TensorFlow for Go:
          TF_TYPE="cpu" # Change to "gpu" for GPU support
          TARGET_DIRECTORY='/usr/local'
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.3.0-rc1.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.3.0-rc2.tar.gz" |
          sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index 2d177d7ffdf..f9b7b322ca3 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -34,7 +34,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>tensorflow</artifactId>
-  <version>1.3.0-rc1</version>
+  <version>1.3.0-rc2</version>
 </dependency>
 ```
 
@@ -63,7 +63,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
                <dependency>
                  <groupId>org.tensorflow</groupId>
                  <artifactId>tensorflow</artifactId>
-                 <version>1.3.0-rc1</version>
+                 <version>1.3.0-rc2</version>
                </dependency>
              </dependencies>
          </project>
@@ -122,7 +122,7 @@ refer to the simpler instructions above instead.
 Take the following steps to install TensorFlow for Java on Linux or Mac OS:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.3.0-rc1.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.3.0-rc2.jar),
      which is the TensorFlow Java Archive (JAR).
 
   2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -141,7 +141,7 @@ Take the following steps to install TensorFlow for Java on Linux or Mac OS:
          OS=$(uname -s | tr '[:upper:]' '[:lower:]')
          mkdir -p ./jni
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.3.0-rc1.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.3.0-rc2.tar.gz" |
            tar -xz -C ./jni
 
 ### Install on Windows
@@ -149,10 +149,10 @@ Take the following steps to install TensorFlow for Java on Linux or Mac OS:
 Take the following steps to install TensorFlow for Java on Windows:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.3.0-rc1.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.3.0-rc2.jar),
      which is the TensorFlow Java Archive (JAR).
   2. Download the following Java Native Interface (JNI) file appropriate for
-     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.3.0-rc1.zip).
+     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.3.0-rc2.zip).
   3. Extract this .zip file.
 
 
@@ -200,7 +200,7 @@ must be part of your `classpath`. For example, you can include the
 downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
 as follows:
 
-<pre><b>javac -cp libtensorflow-1.3.0-rc1.jar HelloTF.java</b></pre>
+<pre><b>javac -cp libtensorflow-1.3.0-rc2.jar HelloTF.java</b></pre>
 
 
 ### Running
@@ -214,11 +214,11 @@ two files are available to the JVM:
 For example, the following command line executes the `HelloTF` program on Linux
 and Mac OS X:
 
-<pre><b>java -cp libtensorflow-1.3.0-rc1.jar:. -Djava.library.path=./jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.3.0-rc2.jar:. -Djava.library.path=./jni HelloTF</b></pre>
 
 And the following command line executes the `HelloTF` program on Windows:
 
-<pre><b>java -cp libtensorflow-1.3.0-rc1.jar;. -Djava.library.path=jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.3.0-rc2.jar;. -Djava.library.path=jni HelloTF</b></pre>
 
 If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
 installed TensorFlow for Java and are ready to use the API.  If the program
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 4885bb12c53..85182cc74fc 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -172,7 +172,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      virtualenv environment:
 
      <pre>(tensorflow)$ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0rc2-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common_installation_problems).
@@ -277,7 +277,7 @@ take the following steps:
 
      <pre>
      $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0rc1-cp34-cp34m-linux_x86_64.whl</b>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0rc2-cp34-cp34m-linux_x86_64.whl</b>
      </pre>
 
      If this step fails, see
@@ -464,7 +464,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 
      <pre>
      (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0rc2-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -632,14 +632,14 @@ This section documents the relevant values for Linux installations.
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0rc1-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0rc2-cp27-none-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.3.0rc1-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.3.0rc2-cp27-none-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -651,14 +651,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0rc1-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0rc2-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.3.0rc1-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.3.0rc2-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -670,14 +670,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0rc1-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0rc2-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.3.0rc1-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.3.0rc2-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
@@ -689,14 +689,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0rc1-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.3.0rc2-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.3.0rc1-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.3.0rc2-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index 6fa63dd14ca..733ecc37fbb 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -109,7 +109,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      TensorFlow in the active Virtualenv is as follows:
 
      <pre> $ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.3.0rc1-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.3.0rc2-py2-none-any.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common-installation-problems).
@@ -230,7 +230,7 @@ take the following steps:
      issue the following command:
 
      <pre> $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.3.0rc1-py2-none-any.whl</b> </pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.3.0rc2-py2-none-any.whl</b> </pre>
 
      If the preceding command fails, see
      [installation problems](#common-installation-problems).
@@ -339,7 +339,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      TensorFlow for Python 2.7:
 
      <pre> (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.3.0rc1-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.3.0rc2-py2-none-any.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -512,7 +512,7 @@ This section documents the relevant values for Mac OS installations.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.3.0rc1-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.3.0rc2-py2-none-any.whl
 </pre>
 
 
@@ -520,7 +520,7 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.3.0rc1-py2-none-a
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.3.0rc1-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.3.0rc2-py3-none-any.whl
 </pre>
 
 
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index 4f365893d7a..a69f982d76c 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -343,10 +343,10 @@ Invoke `pip install` to install that pip package.
 The filename of the `.whl` file depends on your platform.
 For example, the following command will install the pip package
 
-for TensorFlow 1.3.0rc1 on Linux:
+for TensorFlow 1.3.0rc2 on Linux:
 
 <pre>
-$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.3.0rc1-py2-none-any.whl</b>
+$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.3.0rc2-py2-none-any.whl</b>
 </pre>
 
 ## Validate your installation
diff --git a/tensorflow/docs_src/install/install_windows.md b/tensorflow/docs_src/install/install_windows.md
index 2895438f465..a9d7dd955ab 100644
--- a/tensorflow/docs_src/install/install_windows.md
+++ b/tensorflow/docs_src/install/install_windows.md
@@ -115,12 +115,12 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      environment. To install the CPU-only version of TensorFlow, enter the
      following command:
 
-     <pre>(tensorflow)C:\> <b>pip install --ignore-installed --upgrade https://storage.googleapis.com/tensorflow/windows/cpu/tensorflow-1.3.0rc1-cp35-cp35m-win_amd64.whl</b> </pre>
+     <pre>(tensorflow)C:\> <b>pip install --ignore-installed --upgrade https://storage.googleapis.com/tensorflow/windows/cpu/tensorflow-1.3.0rc2-cp35-cp35m-win_amd64.whl</b> </pre>
 
      To install the GPU version of TensorFlow, enter the following command
      (on a single line):
 
-     <pre>(tensorflow)C:\> <b>pip install --ignore-installed --upgrade https://storage.googleapis.com/tensorflow/windows/gpu/tensorflow_gpu-1.3.0rc1-cp35-cp35m-win_amd64.whl</b> </pre>
+     <pre>(tensorflow)C:\> <b>pip install --ignore-installed --upgrade https://storage.googleapis.com/tensorflow/windows/gpu/tensorflow_gpu-1.3.0rc2-cp35-cp35m-win_amd64.whl</b> </pre>
 
 ## Validate your installation
 
diff --git a/tensorflow/docs_src/performance/quantization.md b/tensorflow/docs_src/performance/quantization.md
index d050fc5c56d..544274cab68 100644
--- a/tensorflow/docs_src/performance/quantization.md
+++ b/tensorflow/docs_src/performance/quantization.md
@@ -89,12 +89,14 @@ here's how you can translate the latest GoogLeNet model into a version that uses
 eight-bit computations:
 
 ```sh
-curl http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz -o /tmp/inceptionv3.tgz
-tar xzf /tmp/inceptionv3.tgz -C /tmp/
+curl -L "https://storage.googleapis.com/download.tensorflow.org/models/inception_v3_2016_08_28_frozen.pb.tar.gz" |
+  tar -C tensorflow/examples/label_image/data -xz
 bazel build tensorflow/tools/graph_transforms:transform_graph
 bazel-bin/tensorflow/tools/graph_transforms/transform_graph \
-  --inputs="Mul" --in_graph=/tmp/classify_image_graph_def.pb \
-  --outputs="softmax" --out_graph=/tmp/quantized_graph.pb \
+  --in_graph=tensorflow/examples/label_image/data/inception_v3_2016_08_28_frozen.pb \
+  --out_graph=/tmp/quantized_graph.pb \
+  --inputs=input \
+  --outputs=InceptionV3/Predictions/Reshape_1 \
   --transforms='add_default_attributes strip_unused_nodes(type=float, shape="1,299,299,3")
     remove_nodes(op=Identity, op=CheckNumerics) fold_constants(ignore_errors=true)
     fold_batch_norms fold_old_batch_norms quantize_weights quantize_nodes
@@ -110,15 +112,7 @@ outputs though, and you should get equivalent results. Here's an example:
 ```sh
 bazel build tensorflow/examples/label_image:label_image
 bazel-bin/tensorflow/examples/label_image/label_image \
---image=<input-image> \
---graph=/tmp/quantized_graph.pb \
+--graph=/tmp/quantized_graph.pb
---labels=/tmp/imagenet_synset_to_human_label_map.txt \
---input_width=299 \
---input_height=299 \
---input_mean=128 \
---input_std=128 \
---input_layer="Mul:0" \
---output_layer="softmax:0"
 ```
 
 You'll see that this runs the newly-quantized graph, and outputs a very similar
diff --git a/tensorflow/docs_src/programmers_guide/variables.md b/tensorflow/docs_src/programmers_guide/variables.md
index dd18760e1dd..b265dbbe3e1 100644
--- a/tensorflow/docs_src/programmers_guide/variables.md
+++ b/tensorflow/docs_src/programmers_guide/variables.md
@@ -110,7 +110,7 @@ devices. For example, the following snippet creates a variable named `v` and
 places it on the second GPU device:
 
 ``` python
-with tf.device("/gpu:1"):
+with tf.device("/device:GPU:1"):
   v = tf.get_variable("v", [1])
 ```
 
diff --git a/tensorflow/docs_src/tutorials/deep_cnn.md b/tensorflow/docs_src/tutorials/deep_cnn.md
index a9802b0849f..591b8ea6aad 100644
--- a/tensorflow/docs_src/tutorials/deep_cnn.md
+++ b/tensorflow/docs_src/tutorials/deep_cnn.md
@@ -411,7 +411,7 @@ the first tower are prepended with `tower_0`, e.g. `tower_0/conv1/Conv2D`.
 
 * A preferred hardware device to run the operation within a tower.
 @{tf.device} specifies this. For
-instance, all operations in the first tower reside within `device('/gpu:0')`
+instance, all operations in the first tower reside within `device('/device:GPU:0')`
 scope indicating that they should be run on the first GPU.
 
 All variables are pinned to the CPU and accessed via
diff --git a/tensorflow/docs_src/tutorials/using_gpu.md b/tensorflow/docs_src/tutorials/using_gpu.md
index dcec62d2749..b6edbe33451 100644
--- a/tensorflow/docs_src/tutorials/using_gpu.md
+++ b/tensorflow/docs_src/tutorials/using_gpu.md
@@ -7,8 +7,8 @@ supported device types are `CPU` and `GPU`. They are represented as `strings`.
 For example:
 
 *   `"/cpu:0"`: The CPU of your machine.
-*   `"/gpu:0"`: The GPU of your machine, if you have one.
-*   `"/gpu:1"`: The second GPU of your machine, etc.
+*   `"/device:GPU:0"`: The GPU of your machine, if you have one.
+*   `"/device:GPU:1"`: The second GPU of your machine, etc.
 
 If a TensorFlow operation has both CPU and GPU implementations, the GPU devices
 will be given priority when the operation is assigned to a device. For example,
@@ -35,11 +35,11 @@ You should see the following output:
 
 ```
 Device mapping:
-/job:localhost/replica:0/task:0/gpu:0 -> device: 0, name: Tesla K40c, pci bus
+/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: Tesla K40c, pci bus
 id: 0000:05:00.0
-b: /job:localhost/replica:0/task:0/gpu:0
-a: /job:localhost/replica:0/task:0/gpu:0
-MatMul: /job:localhost/replica:0/task:0/gpu:0
+b: /job:localhost/replica:0/task:0/device:GPU:0
+a: /job:localhost/replica:0/task:0/device:GPU:0
+MatMul: /job:localhost/replica:0/task:0/device:GPU:0
 [[ 22.  28.]
  [ 49.  64.]]
 
@@ -71,11 +71,11 @@ example) and automatically copy tensors between devices if required.
 
 ```
 Device mapping:
-/job:localhost/replica:0/task:0/gpu:0 -> device: 0, name: Tesla K40c, pci bus
+/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: Tesla K40c, pci bus
 id: 0000:05:00.0
 b: /job:localhost/replica:0/task:0/cpu:0
 a: /job:localhost/replica:0/task:0/cpu:0
-MatMul: /job:localhost/replica:0/task:0/gpu:0
+MatMul: /job:localhost/replica:0/task:0/device:GPU:0
 [[ 22.  28.]
  [ 49.  64.]]
 ```
@@ -127,7 +127,7 @@ to specify the preference explicitly:
 
 ```python
 # Creates a graph.
-with tf.device('/gpu:2'):
+with tf.device('/device:GPU:2'):
   a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a')
   b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')
   c = tf.matmul(a, b)
@@ -142,9 +142,9 @@ If the device you have specified does not exist, you will get
 
 ```
 InvalidArgumentError: Invalid argument: Cannot assign a device to node 'b':
-Could not satisfy explicit device specification '/gpu:2'
+Could not satisfy explicit device specification '/device:GPU:2'
    [[Node: b = Const[dtype=DT_FLOAT, value=Tensor<type: float shape: [3,2]
-   values: 1 2 3...>, _device="/gpu:2"]()]]
+   values: 1 2 3...>, _device="/device:GPU:2"]()]]
 ```
 
 If you would like TensorFlow to automatically choose an existing and supported
@@ -154,7 +154,7 @@ the session.
 
 ```python
 # Creates a graph.
-with tf.device('/gpu:2'):
+with tf.device('/device:GPU:2'):
   a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a')
   b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')
   c = tf.matmul(a, b)
@@ -175,7 +175,7 @@ For example:
 ```
 # Creates a graph.
 c = []
-for d in ['/gpu:2', '/gpu:3']:
+for d in ['/device:GPU:2', '/device:GPU:3']:
   with tf.device(d):
     a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3])
     b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2])
@@ -192,20 +192,20 @@ You will see the following output.
 
 ```
 Device mapping:
-/job:localhost/replica:0/task:0/gpu:0 -> device: 0, name: Tesla K20m, pci bus
+/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: Tesla K20m, pci bus
 id: 0000:02:00.0
-/job:localhost/replica:0/task:0/gpu:1 -> device: 1, name: Tesla K20m, pci bus
+/job:localhost/replica:0/task:0/device:GPU:1 -> device: 1, name: Tesla K20m, pci bus
 id: 0000:03:00.0
-/job:localhost/replica:0/task:0/gpu:2 -> device: 2, name: Tesla K20m, pci bus
+/job:localhost/replica:0/task:0/device:GPU:2 -> device: 2, name: Tesla K20m, pci bus
 id: 0000:83:00.0
-/job:localhost/replica:0/task:0/gpu:3 -> device: 3, name: Tesla K20m, pci bus
+/job:localhost/replica:0/task:0/device:GPU:3 -> device: 3, name: Tesla K20m, pci bus
 id: 0000:84:00.0
-Const_3: /job:localhost/replica:0/task:0/gpu:3
-Const_2: /job:localhost/replica:0/task:0/gpu:3
-MatMul_1: /job:localhost/replica:0/task:0/gpu:3
-Const_1: /job:localhost/replica:0/task:0/gpu:2
-Const: /job:localhost/replica:0/task:0/gpu:2
-MatMul: /job:localhost/replica:0/task:0/gpu:2
+Const_3: /job:localhost/replica:0/task:0/device:GPU:3
+Const_2: /job:localhost/replica:0/task:0/device:GPU:3
+MatMul_1: /job:localhost/replica:0/task:0/device:GPU:3
+Const_1: /job:localhost/replica:0/task:0/device:GPU:2
+Const: /job:localhost/replica:0/task:0/device:GPU:2
+MatMul: /job:localhost/replica:0/task:0/device:GPU:2
 AddN: /job:localhost/replica:0/task:0/cpu:0
 [[  44.   56.]
  [  98.  128.]]
diff --git a/tensorflow/docs_src/tutorials/wide.md b/tensorflow/docs_src/tutorials/wide.md
index fdf43955eaf..3571a55a2ec 100644
--- a/tensorflow/docs_src/tutorials/wide.md
+++ b/tensorflow/docs_src/tutorials/wide.md
@@ -24,13 +24,13 @@ To try the code for this tutorial:
         # Ubuntu/Linux 64-bit
         $ sudo apt-get install python-pip python-dev
 
-        # Mac OS X
+        # macOS
         $ sudo easy_install pip
         $ sudo easy_install --upgrade six
 
     b. Use `pip` to install pandas:
 
-        $ sudo pip install pandas
+        $ pip install -U pandas
 
     If you have trouble installing pandas, consult the
     [instructions](http://pandas.pydata.org/pandas-docs/stable/install.html)
@@ -127,7 +127,7 @@ Here's a list of columns available in the Census Income dataset:
 :                :             : individual.                       :
 | income         | Categorical | ">50K" or "<=50K", meaning        |
 :                :             : whether the person makes more     :
-:                :             : than \$50,000 annually.           :
+:                :             : than $50,000 annually.            :
 
 ## Converting Data into Tensors
 
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/tracking/ObjectTracker.java b/tensorflow/examples/android/src/org/tensorflow/demo/tracking/ObjectTracker.java
index 69f202b5681..8b4248d8fbc 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/tracking/ObjectTracker.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/tracking/ObjectTracker.java
@@ -481,7 +481,7 @@ public class ObjectTracker {
   /**
    * A TrackedObject represents a native TrackedObject, and provides access to the
    * relevant native tracking information available after every frame update. They may
-   * be safely passed around and acessed externally, but will become invalid after
+   * be safely passed around and accessed externally, but will become invalid after
    * stopTracking() is called or the related creating ObjectTracker is deactivated.
    *
    * @author andrewharp@google.com (Andrew Harp)
diff --git a/tensorflow/examples/learn/multiple_gpu.py b/tensorflow/examples/learn/multiple_gpu.py
index c7364d1f720..a294950a386 100644
--- a/tensorflow/examples/learn/multiple_gpu.py
+++ b/tensorflow/examples/learn/multiple_gpu.py
@@ -47,12 +47,12 @@ def my_model(features, labels, mode):
   # Create three fully connected layers respectively of size 10, 20, and 10 with
   # each layer having a dropout probability of 0.1.
   net = features[X_FEATURE]
-  with tf.device('/gpu:1'):
+  with tf.device('/device:GPU:1'):
     for units in [10, 20, 10]:
       net = tf.layers.dense(net, units=units, activation=tf.nn.relu)
       net = tf.layers.dropout(net, rate=0.1)
 
-  with tf.device('/gpu:2'):
+  with tf.device('/device:GPU:2'):
     # Compute logits (1 per class).
     logits = tf.layers.dense(net, 3, activation=None)
 
diff --git a/tensorflow/go/README.md b/tensorflow/go/README.md
index 9c2fa600176..376e22b3808 100644
--- a/tensorflow/go/README.md
+++ b/tensorflow/go/README.md
@@ -23,9 +23,9 @@ from source.
 
 -   [bazel](https://www.bazel.build/versions/master/docs/install.html)
 -   Environment to build TensorFlow from source code
-    ([Linux](https://www.tensorflow.org/versions/master/get_started/os_setup.html#prepare-environment-for-linux)
+    ([Linux](https://www.tensorflow.org/install/install_sources#PrepareLinux)
     or [OS
-    X](https://www.tensorflow.org/versions/master/get_started/os_setup.html#prepare-environment-for-mac-os-x)).
+    X](https://www.tensorflow.org/install/install_sources#PrepareMac)).
     If you don't need GPU support, then try the following: `sh # Linux sudo
     apt-get install python swig python-numpy # OS X with homebrew brew install
     swig`
diff --git a/tensorflow/java/README.md b/tensorflow/java/README.md
index 2abee05f4e2..2f1ce253b2f 100644
--- a/tensorflow/java/README.md
+++ b/tensorflow/java/README.md
@@ -22,9 +22,9 @@ native libraries will need to be built from source.
 1.  Install [bazel](https://www.bazel.build/versions/master/docs/install.html)
 
 2.  Setup the environment to build TensorFlow from source code
-    ([Linux](https://www.tensorflow.org/versions/master/get_started/os_setup.html#prepare-environment-for-linux)
+    ([Linux](https://www.tensorflow.org/install/install_sources#PrepareLinux)
     or [Mac OS
-    X](https://www.tensorflow.org/versions/master/get_started/os_setup.html#prepare-environment-for-mac-os-x)).
+    X](https://www.tensorflow.org/install/install_sources#PrepareMac)).
     If you'd like to skip reading those details and do not care about GPU
     support, try the following:
 
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 6af0d2bbe05..b5a6781afb3 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -30,6 +30,7 @@ load("//tensorflow/core:platform/default/build_config_root.bzl", "tf_additional_
 load("//tensorflow/python:build_defs.bzl", "tf_gen_op_wrapper_private_py")
 load("//tensorflow/core:platform/default/build_config_root.bzl", "tf_additional_verbs_deps")
 load("//tensorflow/core:platform/default/build_config_root.bzl", "tf_additional_mpi_deps")
+load("//tensorflow/core:platform/default/build_config_root.bzl", "tf_additional_gdr_deps")
 
 py_library(
     name = "python",
@@ -2380,6 +2381,7 @@ cuda_py_test(
         ":math_ops",
         "//third_party/py/numpy",
     ],
+    tags = ["no_windows_gpu"],
 )
 
 cuda_py_test(
@@ -2397,6 +2399,7 @@ cuda_py_test(
         ":variables",
         "//third_party/py/numpy",
     ],
+    tags = ["no_windows_gpu"],
 )
 
 cuda_py_test(
@@ -2477,6 +2480,7 @@ cuda_py_test(
         ":special_math_ops",
         "//third_party/py/numpy",
     ],
+    tags = ["no_windows_gpu"],
 )
 
 py_library(
@@ -2877,7 +2881,8 @@ tf_py_wrap_cc(
     ] + (tf_additional_lib_deps() +
          tf_additional_plugin_deps() +
          tf_additional_verbs_deps() +
-         tf_additional_mpi_deps()),
+         tf_additional_mpi_deps() +
+         tf_additional_gdr_deps()),
 )
 
 py_library(
diff --git a/tensorflow/python/client/session_clusterspec_prop_test.py b/tensorflow/python/client/session_clusterspec_prop_test.py
index 6a89755bbda..b77912b4f74 100644
--- a/tensorflow/python/client/session_clusterspec_prop_test.py
+++ b/tensorflow/python/client/session_clusterspec_prop_test.py
@@ -173,7 +173,7 @@ class SessionClusterSpecPropagationTest(test_util.TensorFlowTestCase):
     #
     # W0718 17:14:41.521534  190121 device_mgr.cc:107] Unknown device:
     #     /job:worker/replica:0/task:0/device:CPU:0 all devices:
-    #     /job:local/replica:0/task:0/gpu:0,
+    #     /job:local/replica:0/task:0/device:GPU:0,
     #     /job:local/replica:0/task:0/device:GPU:0,
     #     /job:local/replica:0/task:0/cpu:1, CPU:0, GPU:0,
     #     /job:local/replica:0/task:0/device:CPU:1,
@@ -198,7 +198,7 @@ class SessionClusterSpecPropagationTest(test_util.TensorFlowTestCase):
         sum1 = input1 + input2
 
       if test.is_gpu_available():
-        device_str = '/job:worker/task:0/gpu:0'
+        device_str = '/job:worker/task:0/device:GPU:0'
       else:
         device_str = '/job:worker/task:0/cpu:1'
       with ops.device(device_str):
diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py
index 15e7ae18bb0..b4f0fd6f404 100644
--- a/tensorflow/python/client/session_test.py
+++ b/tensorflow/python/client/session_test.py
@@ -1124,7 +1124,7 @@ class SessionTest(test_util.TensorFlowTestCase):
     # which is why placing this is invalid.  If at some point
     # GPU kernels are added to this test, some other different
     # op / device combo should be chosen.
-    with ops.device('/gpu:0'):
+    with ops.device('/device:GPU:0'):
       a = constant_op.constant(1.0, shape=[1, 2])
 
     b = constant_op.constant(1.0, shape=[1, 2])
@@ -1145,7 +1145,7 @@ class SessionTest(test_util.TensorFlowTestCase):
     # which is why placing this is invalid.  If at some point
     # GPU kernels are added to this test, some other different
     # op / device combo should be chosen.
-    with ops.device('/gpu:0'):
+    with ops.device('/device:GPU:0'):
       _ = constant_op.constant(1.0, shape=[1, 2])
 
     b = constant_op.constant(1.0, shape=[1, 2])
@@ -1494,7 +1494,7 @@ class SessionTest(test_util.TensorFlowTestCase):
         allow_soft_placement=True,
         graph_options=config_pb2.GraphOptions(build_cost_model=100))
     with session.Session(config=config) as sess:
-      with ops.device('/gpu:0'):
+      with ops.device('/device:GPU:0'):
         a = array_ops.placeholder(dtypes.float32, shape=[])
         b = math_ops.add(a, a)
         c = array_ops.identity(b)
diff --git a/tensorflow/python/client/timeline_test.py b/tensorflow/python/client/timeline_test.py
index e8797712e91..8396df5f400 100644
--- a/tensorflow/python/client/timeline_test.py
+++ b/tensorflow/python/client/timeline_test.py
@@ -100,8 +100,8 @@ class TimelineTest(test.TestCase):
     self.assertTrue(run_metadata.HasField('step_stats'))
     step_stats = run_metadata.step_stats
     devices = [d.device for d in step_stats.dev_stats]
-    self.assertTrue('/job:localhost/replica:0/task:0/gpu:0' in devices)
-    self.assertTrue('/gpu:0/stream:all' in devices)
+    self.assertTrue('/job:localhost/replica:0/task:0/device:GPU:0' in devices)
+    self.assertTrue('/device:GPU:0/stream:all' in devices)
     tl = timeline.Timeline(step_stats)
     ctf = tl.generate_chrome_trace_format()
     self._validateTrace(ctf)
diff --git a/tensorflow/python/debug/lib/debug_data.py b/tensorflow/python/debug/lib/debug_data.py
index 044a91a7ce6..b2b3ec5d470 100644
--- a/tensorflow/python/debug/lib/debug_data.py
+++ b/tensorflow/python/debug/lib/debug_data.py
@@ -380,7 +380,8 @@ def device_path_to_device_name(device_dir):
   path_items = os.path.basename(device_dir)[
       len(METADATA_FILE_PREFIX) + len(DEVICE_TAG):].split(",")
   return "/".join([
-      path_item.replace("_", ":", 1) for path_item in path_items])
+      path_item.replace("device_", "device:").replace("_", ":", 1)
+      for path_item in path_items])
 
 
 class DebugTensorDatum(object):
diff --git a/tensorflow/python/debug/lib/debug_data_test.py b/tensorflow/python/debug/lib/debug_data_test.py
index eff70b662bd..694010a23cd 100644
--- a/tensorflow/python/debug/lib/debug_data_test.py
+++ b/tensorflow/python/debug/lib/debug_data_test.py
@@ -237,11 +237,11 @@ class DebugDumpDirTest(test_util.TensorFlowTestCase):
     gpu_0_dir = os.path.join(
         self._dump_root,
         debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG +
-        ",job_localhost,replica_0,task_0,gpu_0")
+        ",job_localhost,replica_0,task_0,device_GPU_0")
     gpu_1_dir = os.path.join(
         self._dump_root,
         debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG +
-        ",job_localhost,replica_0,task_0,gpu_1")
+        ",job_localhost,replica_0,task_0,device_GPU_1")
     os.makedirs(cpu_0_dir)
     os.makedirs(gpu_0_dir)
     os.makedirs(gpu_1_dir)
@@ -281,12 +281,12 @@ class DebugDumpDirTest(test_util.TensorFlowTestCase):
     node = graph_gpu_0.node.add()
     node.name = "node_foo_1"
     node.op = "FooOp"
-    node.device = "/job:localhost/replica:0/task:0/gpu:0"
+    node.device = "/job:localhost/replica:0/task:0/device:GPU:0"
     graph_gpu_1 = graph_pb2.GraphDef()
     node = graph_gpu_1.node.add()
     node.name = "node_foo_1"
     node.op = "FooOp"
-    node.device = "/job:localhost/replica:0/task:0/gpu:1"
+    node.device = "/job:localhost/replica:0/task:0/device:GPU:1"
 
     dump_dir = debug_data.DebugDumpDir(
         self._dump_root,
@@ -294,14 +294,14 @@ class DebugDumpDirTest(test_util.TensorFlowTestCase):
 
     self.assertItemsEqual(
         ["/job:localhost/replica:0/task:0/cpu:0",
-         "/job:localhost/replica:0/task:0/gpu:0",
-         "/job:localhost/replica:0/task:0/gpu:1"], dump_dir.devices())
+         "/job:localhost/replica:0/task:0/device:GPU:0",
+         "/job:localhost/replica:0/task:0/device:GPU:1"], dump_dir.devices())
     self.assertEqual(1472563253536385, dump_dir.t0)
     self.assertEqual(3, dump_dir.size)
 
     with self.assertRaisesRegexp(
         ValueError, r"Invalid device name: "):
-      dump_dir.nodes("/job:localhost/replica:0/task:0/gpu:2")
+      dump_dir.nodes("/job:localhost/replica:0/task:0/device:GPU:2")
     self.assertItemsEqual(["node_foo_1", "node_foo_1", "node_foo_1"],
                           dump_dir.nodes())
     self.assertItemsEqual(
@@ -319,16 +319,16 @@ class DebugDumpDirTest(test_util.TensorFlowTestCase):
     node = graph_gpu_0.node.add()
     node.name = "node_foo_1"
     node.op = "FooOp"
-    node.device = "/job:localhost/replica:0/task:0/gpu:0"
+    node.device = "/job:localhost/replica:0/task:0/device:GPU:0"
     graph_gpu_1 = graph_pb2.GraphDef()
     node = graph_gpu_1.node.add()
     node.name = "node_foo_1"
     node.op = "FooOp"
-    node.device = "/job:localhost/replica:0/task:0/gpu:1"
+    node.device = "/job:localhost/replica:0/task:0/device:GPU:1"
     node = graph_gpu_1.node.add()  # Here is the duplicate.
     node.name = "node_foo_1"
     node.op = "FooOp"
-    node.device = "/job:localhost/replica:0/task:0/gpu:1"
+    node.device = "/job:localhost/replica:0/task:0/device:GPU:1"
 
     with self.assertRaisesRegexp(
         ValueError, r"Duplicate node name on device "):
diff --git a/tensorflow/python/debug/lib/session_debug_testlib.py b/tensorflow/python/debug/lib/session_debug_testlib.py
index e54590adfea..08b3e75e7c8 100644
--- a/tensorflow/python/debug/lib/session_debug_testlib.py
+++ b/tensorflow/python/debug/lib/session_debug_testlib.py
@@ -711,7 +711,7 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
     # Test node name list lookup of the DebugDumpDir object.
     if test_util.gpu_device_name():
       node_names = dump.nodes(
-          device_name="/job:localhost/replica:0/task:0/gpu:0")
+          device_name="/job:localhost/replica:0/task:0/device:GPU:0")
     else:
       node_names = dump.nodes()
     self.assertTrue(u_name in node_names)
diff --git a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
index 575d74bbf09..3d18d7727ab 100644
--- a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
+++ b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
@@ -402,7 +402,7 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
 
   def testRuntimeErrorBeforeGraphExecutionIsRaised(self):
     # Use an impossible device name to cause an error before graph execution.
-    with ops.device("/gpu:1337"):
+    with ops.device("/device:GPU:1337"):
       w = variables.Variable([1.0] * 10, name="w")
 
     wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index 37da89c4848..44ab1a622e8 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -1468,11 +1468,11 @@ class _LazyBuilder(object):
   We're trying to use the following `_FeatureColumn`s:
 
   ```python
-    bucketized_age = fc.bucketized_column(fc.numeric_column("age"), ...)
-    keywords = fc.categorical_column_with_hash_buckets("keywords", ...)
-    age_X_keywords = fc.crossed_column([bucketized_age, "keywords"])
-    ... = linear_model(features,
-                            [bucketized_age, keywords, age_X_keywords]
+  bucketized_age = fc.bucketized_column(fc.numeric_column("age"), ...)
+  keywords = fc.categorical_column_with_hash_buckets("keywords", ...)
+  age_X_keywords = fc.crossed_column([bucketized_age, "keywords"])
+  ... = linear_model(features,
+                     [bucketized_age, keywords, age_X_keywords])
   ```
 
   If we transform each column independently, then we'll get duplication of
diff --git a/tensorflow/python/framework/device_test.py b/tensorflow/python/framework/device_test.py
index e6dc3c80637..0859e956ffd 100644
--- a/tensorflow/python/framework/device_test.py
+++ b/tensorflow/python/framework/device_test.py
@@ -79,17 +79,17 @@ class DeviceTest(test_util.TensorFlowTestCase):
     self.assertEquals("/replica:1/task:0/device:CPU:0", d.to_string())
     d.parse_from_string("/replica:1/task:0/device:CPU:0")
     self.assertEquals("/replica:1/task:0/device:CPU:0", d.to_string())
-    d.parse_from_string("/job:muu/gpu:2")
+    d.parse_from_string("/job:muu/device:GPU:2")
     self.assertEquals("/job:muu/device:GPU:2", d.to_string())
     with self.assertRaises(Exception) as e:
-      d.parse_from_string("/job:muu/gpu:2/cpu:0")
+      d.parse_from_string("/job:muu/device:GPU:2/cpu:0")
     self.assertTrue("Cannot specify multiple device" in str(e.exception))
 
   def testFromString(self):
     d = device.DeviceSpec.from_string("/job:foo/replica:0")
     self.assertEquals("/job:foo/replica:0", d.to_string())
     with self.assertRaises(Exception) as e:
-      d = device.DeviceSpec.from_string("/job:muu/gpu:2/cpu:0")
+      d = device.DeviceSpec.from_string("/job:muu/device:GPU:2/cpu:0")
     self.assertTrue("Cannot specify multiple device" in str(e.exception))
 
     d = device.DeviceSpec.from_string("/job:foo/replica:0/task:3/cpu:*")
@@ -102,13 +102,13 @@ class DeviceTest(test_util.TensorFlowTestCase):
   def testMerge(self):
     d = device.DeviceSpec.from_string("/job:foo/replica:0")
     self.assertEquals("/job:foo/replica:0", d.to_string())
-    d.merge_from(device.DeviceSpec.from_string("/task:1/gpu:2"))
+    d.merge_from(device.DeviceSpec.from_string("/task:1/device:GPU:2"))
     self.assertEquals("/job:foo/replica:0/task:1/device:GPU:2", d.to_string())
 
     d = device.DeviceSpec()
     d.merge_from(device.DeviceSpec.from_string("/task:1/cpu:0"))
     self.assertEquals("/task:1/device:CPU:0", d.to_string())
-    d.merge_from(device.DeviceSpec.from_string("/job:boo/gpu:0"))
+    d.merge_from(device.DeviceSpec.from_string("/job:boo/device:GPU:0"))
     self.assertEquals("/job:boo/task:1/device:GPU:0", d.to_string())
     d.merge_from(device.DeviceSpec.from_string("/job:muu/cpu:2"))
     self.assertEquals("/job:muu/task:1/device:CPU:2", d.to_string())
@@ -134,10 +134,10 @@ class DeviceTest(test_util.TensorFlowTestCase):
 
     self.assertEqual("/job:foo/replica:0/task:0/device:GPU:0",
                      device.canonical_name(
-                         "/job:foo/replica:0/task:0/gpu:0"))
+                         "/job:foo/replica:0/task:0/device:GPU:0"))
     self.assertEqual("/job:foo/replica:0/task:0/device:GPU:0",
                      device.canonical_name(
-                         "/gpu:0/task:0/replica:0/job:foo"))
+                         "/device:GPU:0/task:0/replica:0/job:foo"))
 
   def testCheckValid(self):
     device.check_valid("/job:foo/replica:0")
@@ -155,7 +155,7 @@ class DeviceTest(test_util.TensorFlowTestCase):
     self.assertTrue("Unknown attribute: 'bar'" in str(e.exception))
 
     with self.assertRaises(Exception) as e:
-      device.check_valid("/cpu:0/gpu:2")
+      device.check_valid("/cpu:0/device:GPU:2")
     self.assertTrue("Cannot specify multiple device" in str(e.exception))
 
 
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index 34295d8c200..7220f85dc41 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -584,7 +584,7 @@ class _FuncGraph(ops.Graph):
   _FuncGraph overrides ops.Graph's create_op() so that we can keep
   track of all inputs into every op created inside the function.  If
   any input is from other graphs, we keep track of it in self.capture
-  and substitue the input with a place holder.
+  and substitute the input with a place holder.
 
   Each captured input's corresponding place holder is converted into a
   function argument and the caller passes in the captured tensor.
diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index c94e05c4ee9..589db9ef4dc 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -505,7 +505,7 @@ class FunctionTest(test.TestCase):
 
       _ = PlusOne(1, name="p1")
       with self.assertRaisesRegexp(ValueError, "Unknown keyword arguments"):
-        _ = PlusOne(1, device="/gpu:0")
+        _ = PlusOne(1, device="/device:GPU:0")
 
   def testFunctionDecorator(self):
 
diff --git a/tensorflow/python/framework/graph_util_test.py b/tensorflow/python/framework/graph_util_test.py
index f6e9bc9dad3..647ed1583a0 100644
--- a/tensorflow/python/framework/graph_util_test.py
+++ b/tensorflow/python/framework/graph_util_test.py
@@ -106,9 +106,9 @@ class DeviceFunctionsTest(test.TestCase):
       var_0 = variables.Variable(0)
       with ops.device(test_device_func_pin_variable_to_cpu):
         var_1 = variables.Variable(1)
-        with ops.device(lambda op: "/gpu:0"):
+        with ops.device(lambda op: "/device:GPU:0"):
           var_2 = variables.Variable(2)
-        with ops.device("/gpu:0"):  # Implicit merging device function.
+        with ops.device("/device:GPU:0"):  # Implicit merging device function.
           var_3 = variables.Variable(3)
 
     self.assertDeviceEqual(var_0.device, None)
diff --git a/tensorflow/python/framework/importer_test.py b/tensorflow/python/framework/importer_test.py
index cfba6af5232..8ce8e76629d 100644
--- a/tensorflow/python/framework/importer_test.py
+++ b/tensorflow/python/framework/importer_test.py
@@ -878,7 +878,7 @@ class ImportGraphDefTest(test.TestCase):
         self.assertEqual(c.device, c4.device)  # worker overrides ps.
 
     with ops.Graph().as_default():
-      with ops.device(device.merge_device("/gpu:0")):
+      with ops.device(device.merge_device("/device:GPU:0")):
         a5, b5, c5 = importer.import_graph_def(
             gdef, return_elements=["a", "b", "c"])
         self.assertEqual("/device:GPU:0", a5.device)
diff --git a/tensorflow/python/framework/meta_graph_test.py b/tensorflow/python/framework/meta_graph_test.py
index 13a92c3c7ec..65abb695991 100644
--- a/tensorflow/python/framework/meta_graph_test.py
+++ b/tensorflow/python/framework/meta_graph_test.py
@@ -550,7 +550,7 @@ class ScopedMetaGraphTest(test.TestCase):
         a = variables.Variable(
             constant_op.constant(
                 1.0, shape=[2, 2]), name="a")
-      with ops.device("/job:ps/replica:0/task:0/gpu:0"):
+      with ops.device("/job:ps/replica:0/task:0/device:GPU:0"):
         b = variables.Variable(
             constant_op.constant(
                 2.0, shape=[2, 2]), name="b")
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 5948e59824c..1b7b9eea121 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -1632,7 +1632,7 @@ class Operation(object):
   def _create_c_op(self, graph, node_def, inputs, control_inputs):
     """Creates a TF_Operation.
 
-    Arguments:
+    Args:
       graph: a `Graph`.
       node_def: `node_def_pb2.NodeDef` for the operation to create.
       inputs: A list of `Tensor`s (corresponding to scalar inputs) and lists of
@@ -1677,7 +1677,7 @@ class Operation(object):
   def _reconstruct_sequence_inputs(self, op_def, inputs, attrs):
     """Regroups a flat list of input tensors into scalar and sequence inputs.
 
-    Arguments:
+    Args:
       op_def: The `op_def_pb2.OpDef` (for knowing the input types)
       inputs: a list of input `Tensor`s to the op.
       attrs: mapping from attr name to `attr_value_pb2.AttrValue` (these define
@@ -3763,7 +3763,7 @@ class Graph(object):
     For example:
 
     ```python
-    with g.device('/gpu:0'):
+    with g.device('/device:GPU:0'):
       # All operations constructed in this context will be placed
       # on GPU 0.
       with g.device(None):
@@ -3773,7 +3773,7 @@ class Graph(object):
     # Defines a function from `Operation` to device string.
     def matmul_on_gpu(n):
       if n.type == "MatMul":
-        return "/gpu:0"
+        return "/device:GPU:0"
       else:
         return "/cpu:0"
 
diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index acb5fa53bf3..4cbb9deed7b 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -1562,26 +1562,26 @@ class ColocationGroupTest(test_util.TensorFlowTestCase):
 
   def testColocationDeviceInteraction(self):
     with ops.device("/cpu:0"):
-      with ops.device("/gpu:0"):
+      with ops.device("/device:GPU:0"):
         a = constant_op.constant([2.0], name="a")
       with ops.colocate_with(a.op):
         # 'b' is created in the scope of /cpu:0, but it is
-        # colocated with 'a', which is on '/gpu:0'.  colocate_with
+        # colocated with 'a', which is on '/device:GPU:0'.  colocate_with
         # overrides devices because it is a stronger constraint.
         b = constant_op.constant(3.0)
     self.assertEqual([b"loc:@a"], b.op.colocation_groups())
     self.assertEqual(a.op.device, b.op.device)
 
   def testColocationCanonicalization(self):
-    with ops.device("/gpu:0"):
+    with ops.device("/device:GPU:0"):
       _ = constant_op.constant(2.0)
-    with ops.device(lambda op: "/gpu:0"):
+    with ops.device(lambda op: "/device:GPU:0"):
       b = constant_op.constant(3.0)
     with ops.get_default_graph().colocate_with(b):
-      with ops.device("/gpu:0"):
+      with ops.device("/device:GPU:0"):
         c = constant_op.constant(4.0)
 
-    # A's device will be /gpu:0
+    # A's device will be /device:GPU:0
     # B's device will be /device:GPU:0
     # C's device will be /device:GPU:0 because it
     # inherits B's device name, after canonicalizing the names.
@@ -1589,10 +1589,10 @@ class ColocationGroupTest(test_util.TensorFlowTestCase):
 
   def testLocationOverrides(self):
     with ops.device("/cpu:0"):
-      with ops.device("/gpu:0"):
+      with ops.device("/device:GPU:0"):
         a = constant_op.constant([2.0], name="a")
         # Note that this colocation is "redundant", since we are
-        # within the scope of "/gpu:0".  However, we would like to
+        # within the scope of "/device:GPU:0".  However, we would like to
         # preserve in the GraphDef that these two ops should be
         # colocated in a portable way.
         with ops.colocate_with(a.op):
@@ -1659,7 +1659,7 @@ class ColocationGroupTest(test_util.TensorFlowTestCase):
     self.assertEqual([b"loc:@a"], b.op.colocation_groups())
 
   def testInconsistentDeviceWithinColocate(self):
-    with ops.device("/gpu:0"):
+    with ops.device("/device:GPU:0"):
       a = constant_op.constant([2.0], name="a")
       with ops.colocate_with(a.op):
         # This is allowed due to legacy but clearly wrong, since we
diff --git a/tensorflow/python/framework/tensor_shape.py b/tensorflow/python/framework/tensor_shape.py
index 66c05335b4f..54ec15ea66d 100644
--- a/tensorflow/python/framework/tensor_shape.py
+++ b/tensorflow/python/framework/tensor_shape.py
@@ -116,11 +116,11 @@ class Dimension(object):
     Dimensions are combined as follows:
 
     ```python
-        Dimension(n)   .merge_with(Dimension(n))    == Dimension(n)
-        Dimension(n)   .merge_with(Dimension(None)) == Dimension(n)
-        Dimension(None).merge_with(Dimension(n))    == Dimension(n)
-        Dimension(None).merge_with(Dimension(None)) == Dimension(None)
-        Dimension(n)   .merge_with(Dimension(m)) raises ValueError for n != m
+    tf.Dimension(n)   .merge_with(tf.Dimension(n))    == tf.Dimension(n)
+    tf.Dimension(n)   .merge_with(tf.Dimension(None)) == tf.Dimension(n)
+    tf.Dimension(None).merge_with(tf.Dimension(n))    == tf.Dimension(n)
+    tf.Dimension(None).merge_with(tf.Dimension(None)) == tf.Dimension(None)
+    tf.Dimension(n)   .merge_with(tf.Dimension(m))  # raises ValueError for n != m
     ```
 
     Args:
@@ -146,10 +146,12 @@ class Dimension(object):
 
     Dimensions are summed as follows:
 
-      Dimension(m)    + Dimension(n)    == Dimension(m + n)
-      Dimension(m)    + Dimension(None) == Dimension(None)
-      Dimension(None) + Dimension(n)    == Dimension(None)
-      Dimension(None) + Dimension(None) == Dimension(None)
+    ```python
+    tf.Dimension(m)    + tf.Dimension(n)    == tf.Dimension(m + n)
+    tf.Dimension(m)    + tf.Dimension(None) == tf.Dimension(None)
+    tf.Dimension(None) + tf.Dimension(n)    == tf.Dimension(None)
+    tf.Dimension(None) + tf.Dimension(None) == tf.Dimension(None)
+    ```
 
     Args:
       other: Another Dimension.
@@ -168,10 +170,12 @@ class Dimension(object):
 
     Dimensions are subtracted as follows:
 
-      Dimension(m)    - Dimension(n)    == Dimension(m - n)
-      Dimension(m)    - Dimension(None) == Dimension(None)
-      Dimension(None) - Dimension(n)    == Dimension(None)
-      Dimension(None) - Dimension(None) == Dimension(None)
+    ```python
+    tf.Dimension(m)    - tf.Dimension(n)    == tf.Dimension(m - n)
+    tf.Dimension(m)    - tf.Dimension(None) == tf.Dimension(None)
+    tf.Dimension(None) - tf.Dimension(n)    == tf.Dimension(None)
+    tf.Dimension(None) - tf.Dimension(None) == tf.Dimension(None)
+    ```
 
     Args:
       other: Another Dimension.
@@ -190,11 +194,11 @@ class Dimension(object):
 
     Dimensions are summed as follows:
 
-    ```
-      Dimension(m)    * Dimension(n)    == Dimension(m * n)
-      Dimension(m)    * Dimension(None) == Dimension(None)
-      Dimension(None) * Dimension(n)    == Dimension(None)
-      Dimension(None) * Dimension(None) == Dimension(None)
+    ```python
+    tf.Dimension(m)    * tf.Dimension(n)    == tf.Dimension(m * n)
+    tf.Dimension(m)    * tf.Dimension(None) == tf.Dimension(None)
+    tf.Dimension(None) * tf.Dimension(n)    == tf.Dimension(None)
+    tf.Dimension(None) * tf.Dimension(None) == tf.Dimension(None)
     ```
 
     Args:
@@ -214,10 +218,12 @@ class Dimension(object):
 
     Dimensions are divided as follows:
 
-      Dimension(m)    // Dimension(n)    == Dimension(m // n)
-      Dimension(m)    // Dimension(None) == Dimension(None)
-      Dimension(None) // Dimension(n)    == Dimension(None)
-      Dimension(None) // Dimension(None) == Dimension(None)
+    ```python
+    tf.Dimension(m)    // tf.Dimension(n)    == tf.Dimension(m // n)
+    tf.Dimension(m)    // tf.Dimension(None) == tf.Dimension(None)
+    tf.Dimension(None) // tf.Dimension(n)    == tf.Dimension(None)
+    tf.Dimension(None) // tf.Dimension(None) == tf.Dimension(None)
+    ```
 
     Args:
       other: Another `Dimension`.
@@ -250,12 +256,14 @@ class Dimension(object):
   def __mod__(self, other):
     """Returns `self` modulo `other.
 
-    Dimension moduli are computed  as follows:
+    Dimension moduli are computed as follows:
 
-      Dimension(m)    % Dimension(n)     == Dimension(m % n)
-      Dimension(m)    % Dimension(None)  == Dimension(None)
-      Dimension(None) % Dimension(n)     == Dimension(None)
-      Dimension(None) %  Dimension(None) == Dimension(None)
+    ```python
+    tf.Dimension(m)    % tf.Dimension(n)    == tf.Dimension(m % n)
+    tf.Dimension(m)    % tf.Dimension(None) == tf.Dimension(None)
+    tf.Dimension(None) % tf.Dimension(n)    == tf.Dimension(None)
+    tf.Dimension(None) % tf.Dimension(None) == tf.Dimension(None)
+    ```
 
     Args:
       other: Another Dimension.
@@ -274,10 +282,12 @@ class Dimension(object):
 
     Dimensions are compared as follows:
 
-      Dimension(m)    < Dimension(n)    == m < n
-      Dimension(m)    < Dimension(None) == None
-      Dimension(None) < Dimension(n)    == None
-      Dimension(None) < Dimension(None) == None
+    ```python
+    (tf.Dimension(m)    < tf.Dimension(n))    == (m < n)
+    (tf.Dimension(m)    < tf.Dimension(None)) == None
+    (tf.Dimension(None) < tf.Dimension(n))    == None
+    (tf.Dimension(None) < tf.Dimension(None)) == None
+    ```
 
     Args:
       other: Another Dimension.
@@ -297,10 +307,12 @@ class Dimension(object):
 
     Dimensions are compared as follows:
 
-      Dimension(m)    <= Dimension(n)    == m <= n
-      Dimension(m)    <= Dimension(None) == None
-      Dimension(None) <= Dimension(n)    == None
-      Dimension(None) <= Dimension(None) == None
+    ```python
+    (tf.Dimension(m)    <= tf.Dimension(n))    == (m <= n)
+    (tf.Dimension(m)    <= tf.Dimension(None)) == None
+    (tf.Dimension(None) <= tf.Dimension(n))    == None
+    (tf.Dimension(None) <= tf.Dimension(None)) == None
+    ```
 
     Args:
       other: Another Dimension.
@@ -320,10 +332,12 @@ class Dimension(object):
 
     Dimensions are compared as follows:
 
-      Dimension(m)    > Dimension(n)    == m > n
-      Dimension(m)    > Dimension(None) == None
-      Dimension(None) > Dimension(n)    == None
-      Dimension(None) > Dimension(None) == None
+    ```python
+    (tf.Dimension(m)    > tf.Dimension(n))    == (m > n)
+    (tf.Dimension(m)    > tf.Dimension(None)) == None
+    (tf.Dimension(None) > tf.Dimension(n))    == None
+    (tf.Dimension(None) > tf.Dimension(None)) == None
+    ```
 
     Args:
       other: Another Dimension.
@@ -343,10 +357,12 @@ class Dimension(object):
 
     Dimensions are compared as follows:
 
-      Dimension(m)    >= Dimension(n)    == m >= n
-      Dimension(m)    >= Dimension(None) == None
-      Dimension(None) >= Dimension(n)    == None
-      Dimension(None) >= Dimension(None) == None
+    ```python
+    (tf.Dimension(m)    >= tf.Dimension(n))    == (m >= n)
+    (tf.Dimension(m)    >= tf.Dimension(None)) == None
+    (tf.Dimension(None) >= tf.Dimension(n))    == None
+    (tf.Dimension(None) >= tf.Dimension(None)) == None
+    ```
 
     Args:
       other: Another Dimension.
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index d9e507d23ce..e159cfa44bd 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -405,7 +405,7 @@ class TensorFlowTestCase(googletest.TestCase):
     trigger the creation of a new session.
 
     Use the `use_gpu` and `force_gpu` options to control where ops are run. If
-    `force_gpu` is True, all ops are pinned to `/gpu:0`. Otherwise, if `use_gpu`
+    `force_gpu` is True, all ops are pinned to `/device:GPU:0`. Otherwise, if `use_gpu`
     is True, TensorFlow tries to run as many ops on the GPU as possible. If both
     `force_gpu and `use_gpu` are False, all ops are pinned to the CPU.
 
@@ -427,7 +427,7 @@ class TensorFlowTestCase(googletest.TestCase):
       config: An optional config_pb2.ConfigProto to use to configure the
         session.
       use_gpu: If True, attempt to run as many ops as possible on GPU.
-      force_gpu: If True, pin all ops to `/gpu:0`.
+      force_gpu: If True, pin all ops to `/device:GPU:0`.
 
     Returns:
       A Session object that should be used as a context manager to surround
@@ -466,11 +466,11 @@ class TensorFlowTestCase(googletest.TestCase):
       sess = self._cached_session
       with sess.graph.as_default(), sess.as_default():
         if force_gpu:
-          # Use the name of an actual device if one is detected, or '/gpu:0'
+          # Use the name of an actual device if one is detected, or '/device:GPU:0'
           # otherwise
           gpu_name = gpu_device_name()
           if not gpu_name:
-            gpu_name = "/gpu:0"
+            gpu_name = "/device:GPU:0"
           with sess.graph.device(gpu_name):
             yield sess
         elif use_gpu:
@@ -481,11 +481,11 @@ class TensorFlowTestCase(googletest.TestCase):
     else:
       with session.Session(graph=graph, config=prepare_config(config)) as sess:
         if force_gpu:
-          # Use the name of an actual device if one is detected, or '/gpu:0'
+          # Use the name of an actual device if one is detected, or '/device:GPU:0'
           # otherwise
           gpu_name = gpu_device_name()
           if not gpu_name:
-            gpu_name = "/gpu:0"
+            gpu_name = "/device:GPU:0"
           with sess.graph.device(gpu_name):
             yield sess
         elif use_gpu:
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 22306e08d71..4dc63166f52 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -134,6 +134,7 @@ cuda_py_test(
         "//tensorflow/python:platform",
     ],
     shard_count = 5,
+    tags = ["no_windows_gpu"],
 )
 
 tf_py_test(
@@ -1444,6 +1445,7 @@ cuda_py_test(
         "//tensorflow/python:linalg_ops",
         "//tensorflow/python:math_ops",
     ],
+    tags = ["no_windows_gpu"],
 )
 
 cuda_py_test(
@@ -1661,6 +1663,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
     ],
     shard_count = 4,
+    tags = ["no_windows_gpu"],
 )
 
 cuda_py_test(
@@ -2640,6 +2643,7 @@ cuda_py_test(
         "//tensorflow/python:linalg_ops",
     ],
     shard_count = 20,
+    tags = ["no_windows_gpu"],
 )
 
 cuda_py_test(
@@ -2725,6 +2729,7 @@ tf_py_test(
         "//tensorflow/python:variables",
     ],
     shard_count = 3,
+    tags = ["no_windows_gpu"],
 )
 
 tf_py_test(
diff --git a/tensorflow/python/kernel_tests/basic_gpu_test.py b/tensorflow/python/kernel_tests/basic_gpu_test.py
index 155aad8bd9a..405651e8ae9 100644
--- a/tensorflow/python/kernel_tests/basic_gpu_test.py
+++ b/tensorflow/python/kernel_tests/basic_gpu_test.py
@@ -238,7 +238,7 @@ class GpuMultiSessionMemoryTest(test_util.TensorFlowTestCase):
     n_iterations = 500
     with session as s:
       data = variables.Variable(1.0)
-      with ops.device('/gpu:0'):
+      with ops.device('/device:GPU:0'):
         random_seed.set_random_seed(1)
         matrix1 = variables.Variable(
             random_ops.truncated_normal([1024, 1]), name='matrix1')
diff --git a/tensorflow/python/kernel_tests/cholesky_op_test.py b/tensorflow/python/kernel_tests/cholesky_op_test.py
index 5369d2d5c49..d783522e820 100644
--- a/tensorflow/python/kernel_tests/cholesky_op_test.py
+++ b/tensorflow/python/kernel_tests/cholesky_op_test.py
@@ -311,7 +311,7 @@ class CholeskyBenchmark(test.Benchmark):
       if test.is_gpu_available(True):
         with ops.Graph().as_default(), \
             session.Session() as sess, \
-            ops.device("/gpu:0"):
+            ops.device("/device:GPU:0"):
           l = linalg_ops.cholesky(data)
           self.run_op_benchmark(
               sess,
@@ -338,11 +338,11 @@ class CholeskyBenchmark(test.Benchmark):
 
     if test.is_gpu_available(True):
       _BenchmarkGrad(
-          MatrixInverseCompositeGrad, "composite_matrix_inverse", "/gpu:0")
+          MatrixInverseCompositeGrad, "composite_matrix_inverse", "/device:GPU:0")
       _BenchmarkGrad(
-          TriAngInvCompositeGrad, "composite_tri_ang_inverse", "/gpu:0")
+          TriAngInvCompositeGrad, "composite_tri_ang_inverse", "/device:GPU:0")
       _BenchmarkGrad(
-          TriAngSolveCompositeGrad, "composite_triangular_solve", "/gpu:0")
+          TriAngSolveCompositeGrad, "composite_triangular_solve", "/device:GPU:0")
 
     _BenchmarkGrad(
         MatrixInverseCompositeGrad, "composite_matrix_inverse", "/cpu:0")
diff --git a/tensorflow/python/kernel_tests/concat_op_test.py b/tensorflow/python/kernel_tests/concat_op_test.py
index aba4224dc62..a5fd3bc3345 100644
--- a/tensorflow/python/kernel_tests/concat_op_test.py
+++ b/tensorflow/python/kernel_tests/concat_op_test.py
@@ -138,6 +138,7 @@ class ConcatOpTest(test.TestCase):
         self.assertAllClose(result[ind], params[p[i]], 0.01)
 
   def testRandom(self):
+    self._testRandom(dtypes.bool)
     self._testRandom(dtypes.float32)
     self._testRandom(dtypes.int16)
     self._testRandom(dtypes.int32)
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index 118643966cc..fdecea1dc10 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -1427,9 +1427,8 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(45, rx.eval())
 
   def _testWhileGrad_ColocateGradients(self, colocate):
-    gpu_dev_name = test.gpu_device_name().lower() if test.is_gpu_available(
-    ) else "/gpu:0"
-    gpu_short_name = gpu_dev_name.split("/")[-1]
+    gpu_dev_name = test.gpu_device_name() if test.is_gpu_available(
+    ) else "/device:GPU:0"
 
     with self.test_session(graph=ops.Graph()) as sess:
       v = constant_op.constant(2.0, name="v")
@@ -1443,19 +1442,19 @@ class ControlFlowTest(test.TestCase):
       r = gradients_impl.gradients(
           loop, v, colocate_gradients_with_ops=colocate)[0]
     r_ops = r.graph.get_operations()
-    r_devices = [(op.name, op.device.lower()) for op in r_ops]
+    r_devices = [(op.name, op.device) for op in r_ops]
 
     self.assertTrue(any("Square" in op.name for op in r_ops))
 
     for (name, dev) in r_devices:
       if not colocate and name.endswith("Square"):
         # Only forward graph contain gpu in Square device
-        self.assertTrue(gpu_short_name in dev)
+        self.assertTrue(gpu_dev_name in dev)
       elif colocate and "Square" in name:
         # Forward and backward graphs contain gpu in Square/Square_grad devices
-        self.assertTrue(gpu_short_name in dev)
+        self.assertTrue(gpu_dev_name in dev)
       else:
-        self.assertFalse(gpu_short_name in dev)
+        self.assertFalse(gpu_dev_name in dev)
     self.assertAllClose(1024.0, sess.run(r))
 
   def testWhileGrad_ColocateGradients(self):
@@ -2431,7 +2430,7 @@ class ControlFlowTest(test.TestCase):
 
       # device set on tensor, default device on graph => default device on dep.
       vdef = variables.Variable([0.0], name="vdef")
-      with ops.device("/job:worker/gpu:1"):
+      with ops.device("/job:worker/device:GPU:1"):
         with_vdef_dep = control_flow_ops.with_dependencies([vdef.initializer],
                                                            vdef)
         # The device is empty, but the colocation constraint is set.
diff --git a/tensorflow/python/kernel_tests/denormal_test.py b/tensorflow/python/kernel_tests/denormal_test.py
index f3b1a8768f5..2d48cd4163c 100644
--- a/tensorflow/python/kernel_tests/denormal_test.py
+++ b/tensorflow/python/kernel_tests/denormal_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
+import platform
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import array_ops
@@ -34,6 +35,10 @@ class DenormalTest(test.TestCase):
       self.assertEqual(tiny, tiny / 16 * 16)
 
   def _flushDenormalsTest(self, use_gpu, dtypes):
+    if platform.machine() == "ppc64le":
+      # Denormal flushing is not checked on ppc64le (POWER); return early.
+      # See https://github.com/tensorflow/tensorflow/issues/11902
+      return
     with self.test_session(use_gpu=use_gpu):
       array_ops.identity(7).eval()
       for dtype in dtypes:
diff --git a/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py b/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py
index b4a5e1f4221..9b9aa98b376 100644
--- a/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py
+++ b/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py
@@ -26,6 +26,7 @@ from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import gradients_impl
 import tensorflow.python.ops.data_flow_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
+from tensorflow.python.framework import dtypes
 
 
 class DynamicStitchTestBase(object):
@@ -216,6 +217,44 @@ class ParallelDynamicStitchTest(DynamicStitchTestBase, test.TestCase):
       for datum, grad in zip(data, sess.run(grads[3:])):
         self.assertAllEqual(7.0 * datum.eval(), grad)
 
+  # GPU version unit tests
+  def testScalarGPU(self):
+    """Stitches two scalars in both index orders, requesting GPU placement."""
+    with self.test_session(use_gpu=True):
+      indices = [constant_op.constant(0), constant_op.constant(1)]
+      data = [constant_op.constant(40.0), constant_op.constant(60.0)]
+      for step in -1, 1:
+        stitched_t = data_flow_ops.dynamic_stitch(indices[::step], data)
+        stitched_val = stitched_t.eval()
+        self.assertAllEqual([40.0, 60.0][::step], stitched_val)
+        # Dimension 0 is set by the largest index value, so static shape
+        # inference can only report a vector of unknown length.
+        self.assertEqual([None], stitched_t.get_shape().as_list())
+
+  def testHigherRankGPU(self):
+    """Checks values, shape and gradients with GPU placement requested."""
+    with self.test_session(use_gpu=True) as sess:
+      indices = [
+          constant_op.constant(6), constant_op.constant([4, 1]),
+          constant_op.constant([[5, 2], [0, 3]])]
+      data = [
+          constant_op.constant([61, 62], dtype=dtypes.float32),
+          constant_op.constant([[41, 42], [11, 12]], dtype=dtypes.float32),
+          constant_op.constant([[[51, 52], [21, 22]], [[1, 2], [31, 32]]],
+                               dtype=dtypes.float32)]
+      stitched_t = data_flow_ops.dynamic_stitch(indices, data)
+      stitched_val = stitched_t.eval()
+      correct = 10 * np.arange(7)[:, None] + [1.0, 2.0]
+      self.assertAllEqual(correct, stitched_val)
+      self.assertEqual([None, 2], stitched_t.get_shape().as_list())
+      # Backprop 7 * output; each data tensor should receive 7 * itself.
+      stitched_grad = 7 * stitched_val
+      grads = gradients_impl.gradients(stitched_t, indices + data,
+                                       stitched_grad)
+      self.assertEqual(grads[:3], [None] * 3)  # Indices have no gradients
+      for datum, grad in zip(data, sess.run(grads[3:])):
+        self.assertAllEqual(7.0 * datum.eval(), grad)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py
index f5fb7e4e03e..da14871c872 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.framework import ops
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
 
@@ -76,7 +77,7 @@ def GetShrunkInceptionMaxPoolShapes(shrink=30):
 class PoolingTest(test.TestCase):
 
   def _VerifyOneType(self, pool_func, input_sizes, ksize, strides, padding,
-                     data_format, data_type, expected, use_gpu):
+                     data_format, data_type, expected, use_gpu, v2):
     """Verifies the output values of the pooling function.
 
     Args:
@@ -103,20 +104,35 @@ class PoolingTest(test.TestCase):
         t = test_util.NHWCToNCHW(t)
         ksize = test_util.NHWCToNCHW(ksize)
         strides = test_util.NHWCToNCHW(strides)
-      t = pool_func(
-          t,
-          ksize=ksize,
-          strides=strides,
-          padding=padding,
-          data_format=data_format)
+      v2 = v2 and data_format != "NCHW"
+      ksize_placeholder = array_ops.placeholder(dtypes.int32, shape=[4])
+      strides_placeholder = array_ops.placeholder(dtypes.int32, shape=[4])
+      if v2:
+        t = pool_func(
+            t,
+            ksize=ksize_placeholder,
+            strides=strides_placeholder,
+            padding=padding,
+            data_format=data_format)
+      else:
+        t = pool_func(
+            t,
+            ksize=ksize,
+            strides=strides,
+            padding=padding,
+            data_format=data_format)
       if data_format == "NCHW":
         t = test_util.NCHWToNHWC(t)
-      actual = t.eval()
+      if v2:
+        actual = t.eval(feed_dict={ksize_placeholder: ksize,
+                                   strides_placeholder: strides})
+      else:
+        actual = t.eval()
+        self.assertShapeEqual(actual, t)
       self.assertAllCloseAccordingToType(expected, actual.flatten())
-      self.assertShapeEqual(actual, t)
 
   def _VerifyOneTest(self, pool_func, input_sizes, ksize, strides, padding,
-                     data_format, expected, use_gpu):
+                     data_format, expected, use_gpu, v2):
     """Verifies the output values of the pooling function.
 
     Args:
@@ -131,14 +147,14 @@ class PoolingTest(test.TestCase):
       use_gpu: Whether we are running on GPU.
     """
     self._VerifyOneType(pool_func, input_sizes, ksize, strides, padding,
-                        data_format, dtypes.float32, expected, use_gpu)
+                        data_format, dtypes.float32, expected, use_gpu, v2)
 
     if not use_gpu or test_util.CudaSupportsHalfMatMulAndConv():
       self._VerifyOneType(pool_func, input_sizes, ksize, strides, padding,
-                          data_format, dtypes.float16, expected, use_gpu)
+                          data_format, dtypes.float16, expected, use_gpu, v2)
 
   def _VerifyValues(self, pool_func, input_sizes, ksize, strides, padding,
-                    expected, use_gpu):
+                    expected, use_gpu, v2=False):
     """Verifies the output values of the pooling function.
 
     Args:
@@ -154,7 +170,7 @@ class PoolingTest(test.TestCase):
     for (data_format, use_gpu_2) in GetTestConfigs():
       if use_gpu_2 == use_gpu:
         self._VerifyOneTest(pool_func, input_sizes, ksize, strides, padding,
-                            data_format, expected, use_gpu)
+                            data_format, expected, use_gpu, v2)
 
   def _testAvgPoolValidPadding(self, use_gpu):
     expected_output = [7.0, 8.0, 9.0]
@@ -325,6 +341,17 @@ class PoolingTest(test.TestCase):
         expected=expected_output,
         use_gpu=use_gpu)
 
+    for v2 in [True, False]:
+      self._VerifyValues(
+          gen_nn_ops._max_pool_v2,
+          input_sizes=[1, 3, 3, 3],
+          ksize=[1, 2, 2, 1],
+          strides=[1, 2, 2, 1],
+          padding="VALID",
+          expected=expected_output,
+          use_gpu=use_gpu,
+          v2=v2)
+
   def _testMaxPoolSamePadding(self, use_gpu):
     expected_output = [13.0, 14.0, 15.0, 16.0, 17.0, 18.0]
     self._VerifyValues(
@@ -336,6 +363,17 @@ class PoolingTest(test.TestCase):
         expected=expected_output,
         use_gpu=use_gpu)
 
+    for v2 in [True, False]:
+      self._VerifyValues(
+          gen_nn_ops._max_pool_v2,
+          input_sizes=[1, 2, 3, 3],
+          ksize=[1, 2, 2, 1],
+          strides=[1, 2, 2, 1],
+          padding="SAME",
+          expected=expected_output,
+          use_gpu=use_gpu,
+          v2=v2)
+
   def _testMaxPoolSamePaddingNonSquareWindow(self, use_gpu):
     # input is:
     # [1.0, 2.0
@@ -354,6 +392,17 @@ class PoolingTest(test.TestCase):
         expected=[2.0, 2.0, 4.0, 4.0],
         use_gpu=use_gpu)
 
+    for v2 in [True, False]:
+      self._VerifyValues(
+          gen_nn_ops._max_pool_v2,
+          input_sizes=[1, 2, 2, 1],
+          ksize=[1, 1, 2, 1],
+          strides=[1, 1, 1, 1],
+          padding="SAME",
+          expected=[2.0, 2.0, 4.0, 4.0],
+          use_gpu=use_gpu,
+          v2=v2)
+
   def _testMaxPoolValidPaddingUnevenStride(self, use_gpu):
     self._VerifyValues(
         nn_ops.max_pool,
@@ -372,6 +421,26 @@ class PoolingTest(test.TestCase):
         expected=[6.0, 7.0, 8.0, 14.0, 15.0, 16.0],
         use_gpu=use_gpu)
 
+    for v2 in [True, False]:
+      self._VerifyValues(
+          gen_nn_ops._max_pool_v2,
+          input_sizes=[1, 4, 4, 1],
+          ksize=[1, 2, 2, 1],
+          strides=[1, 1, 2, 1],
+          padding="VALID",
+          expected=[6.0, 8.0, 10.0, 12.0, 14.0, 16.0],
+          use_gpu=use_gpu,
+          v2=v2)
+      self._VerifyValues(
+          gen_nn_ops._max_pool_v2,
+          input_sizes=[1, 4, 4, 1],
+          ksize=[1, 2, 2, 1],
+          strides=[1, 2, 1, 1],
+          padding="VALID",
+          expected=[6.0, 7.0, 8.0, 14.0, 15.0, 16.0],
+          use_gpu=use_gpu,
+          v2=v2)
+
   def _testMaxPoolSamePaddingPacket4(self, use_gpu):
     expected_output = [
         21.0, 22.0, 23.0, 24.0, 29.0, 30.0, 31.0, 32.0, 53.0, 54.0, 55.0, 56.0,
@@ -386,6 +455,17 @@ class PoolingTest(test.TestCase):
         expected=expected_output,
         use_gpu=use_gpu)
 
+    for v2 in [True, False]:
+      self._VerifyValues(
+          gen_nn_ops._max_pool_v2,
+          input_sizes=[1, 4, 4, 4],
+          ksize=[1, 2, 2, 1],
+          strides=[1, 2, 2, 1],
+          padding="SAME",
+          expected=expected_output,
+          use_gpu=use_gpu,
+          v2=v2)
+
   def _testMaxPoolSamePaddingPacket8(self, use_gpu):
     expected_output = [
         145.0, 146.0, 147.0, 148.0, 149.0, 150.0, 151.0, 152.0, 161.0, 162.0,
@@ -411,6 +491,17 @@ class PoolingTest(test.TestCase):
         expected=expected_output,
         use_gpu=use_gpu)
 
+    for v2 in [True, False]:
+      self._VerifyValues(
+          gen_nn_ops._max_pool_v2,
+          input_sizes=[1, 8, 8, 8],
+          ksize=[1, 3, 3, 1],
+          strides=[1, 2, 2, 1],
+          padding="SAME",
+          expected=expected_output,
+          use_gpu=use_gpu,
+          v2=v2)
+
   def testMaxPooling(self):
     for use_gpu in True, False:
       self._testMaxPoolValidPadding(use_gpu)
@@ -435,6 +526,17 @@ class PoolingTest(test.TestCase):
         expected=[2.0, 4.0, 6.0, 8.0, 10.0],
         use_gpu=False)
 
+    for v2 in [True, False]:
+      self._VerifyValues(
+          gen_nn_ops._max_pool_v2,
+          input_sizes=[1, 1, 1, 10],
+          ksize=[1, 1, 1, 2],
+          strides=[1, 1, 1, 2],
+          padding="SAME",
+          expected=[2.0, 4.0, 6.0, 8.0, 10.0],
+          use_gpu=False,
+          v2=v2)
+
   def testDepthwiseMaxPool2x2DepthWindow3(self):
     # input is:
     #
@@ -450,6 +552,17 @@ class PoolingTest(test.TestCase):
         expected=[3.0, 6.0, 9.0, 12.0, 15.0, 18.0, 21.0, 24.0],
         use_gpu=False)
 
+    for v2 in [True, False]:
+      self._VerifyValues(
+          gen_nn_ops._max_pool_v2,
+          input_sizes=[1, 2, 2, 6],
+          ksize=[1, 1, 1, 3],
+          strides=[1, 1, 1, 3],
+          padding="SAME",
+          expected=[3.0, 6.0, 9.0, 12.0, 15.0, 18.0, 21.0, 24.0],
+          use_gpu=False,
+          v2=v2)
+
   def testKernelSmallerThanStrideValid(self):
     for use_gpu in [True, False]:
       self._VerifyValues(
@@ -461,6 +574,17 @@ class PoolingTest(test.TestCase):
           expected=[9, 12, 30, 33],
           use_gpu=use_gpu)
 
+      for v2 in [True, False]:
+        self._VerifyValues(
+            gen_nn_ops._max_pool_v2,
+            input_sizes=[1, 7, 7, 1],
+            ksize=[1, 2, 2, 1],
+            strides=[1, 3, 3, 1],
+            padding="VALID",
+            expected=[9, 12, 30, 33],
+            use_gpu=use_gpu,
+            v2=v2)
+
       self._VerifyValues(
           nn_ops.avg_pool,
           input_sizes=[1, 7, 7, 1],
@@ -491,6 +615,27 @@ class PoolingTest(test.TestCase):
             expected=[1, 3, 9, 11],
             use_gpu=use_gpu)
 
+      for v2 in [True, False]:
+        self._VerifyValues(
+            gen_nn_ops._max_pool_v2,
+            input_sizes=[1, 3, 3, 1],
+            ksize=[1, 1, 1, 1],
+            strides=[1, 2, 2, 1],
+            padding="SAME",
+            expected=[1, 3, 7, 9],
+            use_gpu=use_gpu,
+            v2=v2)
+
+        self._VerifyValues(
+            gen_nn_ops._max_pool_v2,
+            input_sizes=[1, 4, 4, 1],
+            ksize=[1, 1, 1, 1],
+            strides=[1, 2, 2, 1],
+            padding="SAME",
+            expected=[1, 3, 9, 11],
+            use_gpu=use_gpu,
+            v2=v2)
+
   def _testDepthwiseMaxPoolInvalidConfig(self,
                                          in_size,
                                          ksize,
@@ -812,99 +957,107 @@ class PoolingTest(test.TestCase):
     self.assertLess(err, err_tolerance)
 
   def _testMaxPoolGradValidPadding1_1(self, data_format, use_gpu):
-    self._ConstructAndTestGradient(
-        nn_ops.max_pool,
-        input_sizes=[1, 3, 3, 1],
-        output_sizes=[1, 3, 3, 1],
-        window_rows=1,
-        window_cols=1,
-        row_stride=1,
-        col_stride=1,
-        padding="VALID",
-        data_format=data_format,
-        use_gpu=use_gpu)
+    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+      self._ConstructAndTestGradient(
+          pool_func,
+          input_sizes=[1, 3, 3, 1],
+          output_sizes=[1, 3, 3, 1],
+          window_rows=1,
+          window_cols=1,
+          row_stride=1,
+          col_stride=1,
+          padding="VALID",
+          data_format=data_format,
+          use_gpu=use_gpu)
 
   def _testMaxPoolGradValidPadding2_1_6(self, data_format, use_gpu):
-    self._ConstructAndTestGradient(
-        nn_ops.max_pool,
-        input_sizes=[2, 6, 6, 3],
-        output_sizes=[2, 5, 5, 3],
-        window_rows=2,
-        window_cols=2,
-        row_stride=1,
-        col_stride=1,
-        padding="VALID",
-        data_format=data_format,
-        use_gpu=use_gpu)
+    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+      self._ConstructAndTestGradient(
+          pool_func,
+          input_sizes=[2, 6, 6, 3],
+          output_sizes=[2, 5, 5, 3],
+          window_rows=2,
+          window_cols=2,
+          row_stride=1,
+          col_stride=1,
+          padding="VALID",
+          data_format=data_format,
+          use_gpu=use_gpu)
 
   def _testMaxPoolGradValidPadding2_1_7(self, data_format, use_gpu):
-    self._ConstructAndTestGradient(
-        nn_ops.max_pool,
-        input_sizes=[2, 7, 7, 3],
-        output_sizes=[2, 6, 6, 3],
-        window_rows=2,
-        window_cols=2,
-        row_stride=1,
-        col_stride=1,
-        padding="VALID",
-        data_format=data_format,
-        use_gpu=use_gpu)
+    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+      self._ConstructAndTestGradient(
+          pool_func,
+          input_sizes=[2, 7, 7, 3],
+          output_sizes=[2, 6, 6, 3],
+          window_rows=2,
+          window_cols=2,
+          row_stride=1,
+          col_stride=1,
+          padding="VALID",
+          data_format=data_format,
+          use_gpu=use_gpu)
 
   def _testMaxPoolGradValidPadding2_2(self, data_format, use_gpu):
-    self._ConstructAndTestGradient(
-        nn_ops.max_pool,
-        input_sizes=[2, 2, 2, 3],
-        output_sizes=[2, 1, 1, 3],
-        window_rows=2,
-        window_cols=2,
-        row_stride=2,
-        col_stride=2,
-        padding="VALID",
-        data_format=data_format,
-        use_gpu=use_gpu)
+    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+      self._ConstructAndTestGradient(
+          pool_func,
+          input_sizes=[2, 2, 2, 3],
+          output_sizes=[2, 1, 1, 3],
+          window_rows=2,
+          window_cols=2,
+          row_stride=2,
+          col_stride=2,
+          padding="VALID",
+          data_format=data_format,
+          use_gpu=use_gpu)
 
   def _testMaxPoolGradSamePadding1_1(self, data_format, use_gpu):
-    self._ConstructAndTestGradient(
-        nn_ops.max_pool,
-        input_sizes=[2, 2, 4, 3],
-        output_sizes=[2, 2, 4, 3],
-        window_rows=1,
-        window_cols=1,
-        row_stride=1,
-        col_stride=1,
-        padding="SAME",
-        data_format=data_format,
-        use_gpu=use_gpu)
+    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+      self._ConstructAndTestGradient(
+          pool_func,
+          input_sizes=[2, 2, 4, 3],
+          output_sizes=[2, 2, 4, 3],
+          window_rows=1,
+          window_cols=1,
+          row_stride=1,
+          col_stride=1,
+          padding="SAME",
+          data_format=data_format,
+          use_gpu=use_gpu)
 
   def _testMaxPoolGradSamePadding2_1(self, data_format, use_gpu):
-    self._ConstructAndTestGradient(
-        nn_ops.max_pool,
-        input_sizes=[2, 2, 4, 3],
-        output_sizes=[2, 2, 4, 3],
-        window_rows=2,
-        window_cols=2,
-        row_stride=1,
-        col_stride=1,
-        padding="SAME",
-        data_format=data_format,
-        use_gpu=use_gpu)
+    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+      self._ConstructAndTestGradient(
+          pool_func,
+          input_sizes=[2, 2, 4, 3],
+          output_sizes=[2, 2, 4, 3],
+          window_rows=2,
+          window_cols=2,
+          row_stride=1,
+          col_stride=1,
+          padding="SAME",
+          data_format=data_format,
+          use_gpu=use_gpu)
 
   def _testMaxPoolGradSamePadding2_2(self, data_format, use_gpu):
-    self._ConstructAndTestGradient(
-        nn_ops.max_pool,
-        input_sizes=[2, 2, 4, 3],
-        output_sizes=[2, 1, 2, 3],
-        window_rows=2,
-        window_cols=2,
-        row_stride=2,
-        col_stride=2,
-        padding="SAME",
-        data_format=data_format,
-        use_gpu=use_gpu)
+    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+      self._ConstructAndTestGradient(
+          pool_func,
+          input_sizes=[2, 2, 4, 3],
+          output_sizes=[2, 1, 2, 3],
+          window_rows=2,
+          window_cols=2,
+          row_stride=2,
+          col_stride=2,
+          padding="SAME",
+          data_format=data_format,
+          use_gpu=use_gpu)
 
   def _testMaxPoolGradSamePadding3_1(self, data_format, use_gpu):
-    self._ConstructAndTestGradient(
-        nn_ops.max_pool,
+    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+      self._ConstructAndTestGradient(
+        pool_func,
         input_sizes=[1, 7, 7, 1],
         output_sizes=[1, 7, 7, 1],
         window_rows=3,
@@ -927,7 +1080,7 @@ class PoolingTest(test.TestCase):
       self._testMaxPoolGradSamePadding3_1(data_format, use_gpu)
 
   def _MaxPoolGrad(self, orig_input, orig_output, grad, window_rows,
-                   window_cols, row_stride, col_stride, padding):
+                   window_cols, row_stride, col_stride, padding, v2):
     """Max Pooling Gradient.
 
     Args:
@@ -944,26 +1097,29 @@ class PoolingTest(test.TestCase):
     Returns:
       A Tensor.
     """
-    return gen_nn_ops._max_pool_grad(orig_input, orig_output, grad,
-                                     [1, window_rows, window_cols, 1],
-                                     [1, row_stride, col_stride, 1], padding)
+    pool_func = gen_nn_ops.max_pool_grad_v2 if v2 else gen_nn_ops._max_pool_grad
+    return pool_func(orig_input, orig_output, grad,
+                     [1, window_rows, window_cols, 1],
+                     [1, row_stride, col_stride, 1], padding)
 
   def _testMaxPoolGradDirect(self, input_data, output_backprop,
                              expected_input_backprop, input_sizes, output_sizes,
                              window_rows, window_cols, row_stride, col_stride,
-                             padding, use_gpu):
+                             padding, use_gpu, v2):
+    pool_func = gen_nn_ops._max_pool_v2 if v2 else nn_ops.max_pool
     with self.test_session(use_gpu=use_gpu):
       input_tensor = constant_op.constant(input_data, shape=input_sizes)
-      output_tensor = nn_ops.max_pool(input_tensor,
-                                      [1, window_rows, window_cols, 1],
-                                      [1, row_stride, col_stride, 1], padding)
+      output_tensor = pool_func(input_tensor,
+                                [1, window_rows, window_cols, 1],
+                                [1, row_stride, col_stride, 1], padding)
       output_backprop_tensor = constant_op.constant(
           output_backprop, shape=output_sizes)
 
       input_backprop_tensor = self._MaxPoolGrad(input_tensor, output_tensor,
                                                 output_backprop_tensor,
                                                 window_rows, window_cols,
-                                                row_stride, col_stride, padding)
+                                                row_stride, col_stride,
+                                                padding, v2)
 
       actual_input_backprop = input_backprop_tensor.eval()
       self.assertShapeEqual(actual_input_backprop, input_backprop_tensor)
@@ -988,18 +1144,20 @@ class PoolingTest(test.TestCase):
     ]
 
     for use_gpu in True, False:
-      self._testMaxPoolGradDirect(
-          input_data,
-          output_backprop,
-          expected_input_backprop,
-          input_sizes=[1, 4, 4, 1],
-          output_sizes=[1, 3, 3, 1],
-          window_rows=2,
-          window_cols=2,
-          row_stride=1,
-          col_stride=1,
-          padding="VALID",
-          use_gpu=use_gpu)
+      for v2 in [True, False]:
+        self._testMaxPoolGradDirect(
+            input_data,
+            output_backprop,
+            expected_input_backprop,
+            input_sizes=[1, 4, 4, 1],
+            output_sizes=[1, 3, 3, 1],
+            window_rows=2,
+            window_cols=2,
+            row_stride=1,
+            col_stride=1,
+            padding="VALID",
+            use_gpu=use_gpu,
+            v2=v2)
 
   def _testMaxPoolGradDirect1_2(self):
     input_data = [
@@ -1013,18 +1171,20 @@ class PoolingTest(test.TestCase):
     ]
 
     for use_gpu in True, False:
-      self._testMaxPoolGradDirect(
-          input_data,
-          output_backprop,
-          expected_input_backprop,
-          input_sizes=[1, 4, 4, 1],
-          output_sizes=[1, 3, 3, 1],
-          window_rows=2,
-          window_cols=2,
-          row_stride=1,
-          col_stride=1,
-          padding="VALID",
-          use_gpu=use_gpu)
+      for v2 in [True, False]:
+        self._testMaxPoolGradDirect(
+            input_data,
+            output_backprop,
+            expected_input_backprop,
+            input_sizes=[1, 4, 4, 1],
+            output_sizes=[1, 3, 3, 1],
+            window_rows=2,
+            window_cols=2,
+            row_stride=1,
+            col_stride=1,
+            padding="VALID",
+            use_gpu=use_gpu,
+            v2=v2)
 
   def _testMaxPoolGradDirect1_3(self):
     input_data = [
@@ -1069,18 +1229,20 @@ class PoolingTest(test.TestCase):
     ]
 
     for use_gpu in True, False:
-      self._testMaxPoolGradDirect(
-          input_data,
-          output_backprop,
-          expected_input_backprop,
-          input_sizes=[1, 4, 4, 1],
-          output_sizes=[1, 4, 4, 1],
-          window_rows=3,
-          window_cols=3,
-          row_stride=1,
-          col_stride=1,
-          padding="SAME",
-          use_gpu=use_gpu)
+      for v2 in [True, False]:
+        self._testMaxPoolGradDirect(
+            input_data,
+            output_backprop,
+            expected_input_backprop,
+            input_sizes=[1, 4, 4, 1],
+            output_sizes=[1, 4, 4, 1],
+            window_rows=3,
+            window_cols=3,
+            row_stride=1,
+            col_stride=1,
+            padding="SAME",
+            use_gpu=use_gpu,
+            v2=v2)
 
   def _testMaxPoolGradDirectWithNans2_1(self):
     input_data = [float("nan")] * 16
@@ -1090,18 +1252,20 @@ class PoolingTest(test.TestCase):
         11.0, 12.0, 13.0, 0.0, 15.0, 16.0, 17.0, 0.0, 19.0, 20.0, 21.0, 0.0,
         0.0, 0.0, 0.0, 0.0
     ]
-    self._testMaxPoolGradDirect(
-        input_data,
-        output_backprop,
-        expected_input_backprop_tf_cpu,
-        input_sizes=[1, 4, 4, 1],
-        output_sizes=[1, 3, 3, 1],
-        window_rows=2,
-        window_cols=2,
-        row_stride=1,
-        col_stride=1,
-        padding="VALID",
-        use_gpu=False)
+    for v2 in [True, False]:
+      self._testMaxPoolGradDirect(
+          input_data,
+          output_backprop,
+          expected_input_backprop_tf_cpu,
+          input_sizes=[1, 4, 4, 1],
+          output_sizes=[1, 3, 3, 1],
+          window_rows=2,
+          window_cols=2,
+          row_stride=1,
+          col_stride=1,
+          padding="VALID",
+          use_gpu=False,
+          v2=v2)
 
     if not test.is_gpu_available():
       return
@@ -1112,18 +1276,20 @@ class PoolingTest(test.TestCase):
         0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
         0.0, 0.0
     ]
-    self._testMaxPoolGradDirect(
-        input_data,
-        output_backprop,
-        expected_input_backprop_cudnn,
-        input_sizes=[1, 4, 4, 1],
-        output_sizes=[1, 3, 3, 1],
-        window_rows=2,
-        window_cols=2,
-        row_stride=1,
-        col_stride=1,
-        padding="VALID",
-        use_gpu=True)
+    for v2 in [True, False]:
+      self._testMaxPoolGradDirect(
+          input_data,
+          output_backprop,
+          expected_input_backprop_cudnn,
+          input_sizes=[1, 4, 4, 1],
+          output_sizes=[1, 3, 3, 1],
+          window_rows=2,
+          window_cols=2,
+          row_stride=1,
+          col_stride=1,
+          padding="VALID",
+          use_gpu=True,
+          v2=v2)
 
   def _testMaxPoolGradDirectWithNans2_2(self):
     input_data = [float("nan")] * 16
@@ -1136,18 +1302,20 @@ class PoolingTest(test.TestCase):
         float("nan"), 12.0, 13.0, 0.0, 15.0, float("nan"), 17.0, 0.0, 19.0,
         20.0, float("nan"), 0.0, 0.0, 0.0, 0.0, 0.0
     ]
-    self._testMaxPoolGradDirect(
-        input_data,
-        output_backprop,
-        expected_input_backprop_tf_cpu,
-        input_sizes=[1, 4, 4, 1],
-        output_sizes=[1, 3, 3, 1],
-        window_rows=2,
-        window_cols=2,
-        row_stride=1,
-        col_stride=1,
-        padding="VALID",
-        use_gpu=False)
+    for v2 in [True, False]:
+      self._testMaxPoolGradDirect(
+          input_data,
+          output_backprop,
+          expected_input_backprop_tf_cpu,
+          input_sizes=[1, 4, 4, 1],
+          output_sizes=[1, 3, 3, 1],
+          window_rows=2,
+          window_cols=2,
+          row_stride=1,
+          col_stride=1,
+          padding="VALID",
+          use_gpu=False,
+          v2=v2)
 
     if not test.is_gpu_available():
       return
@@ -1158,18 +1326,20 @@ class PoolingTest(test.TestCase):
         0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
         0.0, 0.0
     ]
-    self._testMaxPoolGradDirect(
-        input_data,
-        output_backprop,
-        expected_input_backprop_cudnn,
-        input_sizes=[1, 4, 4, 1],
-        output_sizes=[1, 3, 3, 1],
-        window_rows=2,
-        window_cols=2,
-        row_stride=1,
-        col_stride=1,
-        padding="VALID",
-        use_gpu=True)
+    for v2 in [True, False]:
+      self._testMaxPoolGradDirect(
+          input_data,
+          output_backprop,
+          expected_input_backprop_cudnn,
+          input_sizes=[1, 4, 4, 1],
+          output_sizes=[1, 3, 3, 1],
+          window_rows=2,
+          window_cols=2,
+          row_stride=1,
+          col_stride=1,
+          padding="VALID",
+          use_gpu=True,
+          v2=v2)
 
   def testMaxPoolGradDirect(self):
     self._testMaxPoolGradDirect1_1()
@@ -1179,108 +1349,116 @@ class PoolingTest(test.TestCase):
     self._testMaxPoolGradDirectWithNans2_2()
 
   def _testMaxPoolGradGradValidPadding1_1(self, data_format, use_gpu):
-    self._ConstructAndTestSecondGradient(
-        nn_ops.max_pool,
-        input_sizes=[1, 3, 3, 1],
-        output_sizes=[1, 3, 3, 1],
-        window_rows=1,
-        window_cols=1,
-        row_stride=1,
-        col_stride=1,
-        padding="VALID",
-        data_format=data_format,
-        use_gpu=use_gpu)
+    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+      self._ConstructAndTestSecondGradient(
+          pool_func,
+          input_sizes=[1, 3, 3, 1],
+          output_sizes=[1, 3, 3, 1],
+          window_rows=1,
+          window_cols=1,
+          row_stride=1,
+          col_stride=1,
+          padding="VALID",
+          data_format=data_format,
+          use_gpu=use_gpu)
 
   def _testMaxPoolGradGradValidPadding2_1_6(self, data_format, use_gpu):
-    self._ConstructAndTestSecondGradient(
-        nn_ops.max_pool,
-        input_sizes=[2, 6, 6, 3],
-        output_sizes=[2, 5, 5, 3],
-        window_rows=2,
-        window_cols=2,
-        row_stride=1,
-        col_stride=1,
-        padding="VALID",
-        data_format=data_format,
-        use_gpu=use_gpu)
+    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+      self._ConstructAndTestSecondGradient(
+          pool_func,
+          input_sizes=[2, 6, 6, 3],
+          output_sizes=[2, 5, 5, 3],
+          window_rows=2,
+          window_cols=2,
+          row_stride=1,
+          col_stride=1,
+          padding="VALID",
+          data_format=data_format,
+          use_gpu=use_gpu)
 
   def _testMaxPoolGradGradValidPadding2_1_7(self, data_format, use_gpu):
-    self._ConstructAndTestSecondGradient(
-        nn_ops.max_pool,
-        input_sizes=[2, 7, 7, 3],
-        output_sizes=[2, 6, 6, 3],
-        window_rows=2,
-        window_cols=2,
-        row_stride=1,
-        col_stride=1,
-        padding="VALID",
-        data_format=data_format,
-        use_gpu=use_gpu)
+    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+      self._ConstructAndTestSecondGradient(
+          pool_func,
+          input_sizes=[2, 7, 7, 3],
+          output_sizes=[2, 6, 6, 3],
+          window_rows=2,
+          window_cols=2,
+          row_stride=1,
+          col_stride=1,
+          padding="VALID",
+          data_format=data_format,
+          use_gpu=use_gpu)
 
   def _testMaxPoolGradGradValidPadding2_2(self, data_format, use_gpu):
-    self._ConstructAndTestSecondGradient(
-        nn_ops.max_pool,
-        input_sizes=[2, 2, 2, 3],
-        output_sizes=[2, 1, 1, 3],
-        window_rows=2,
-        window_cols=2,
-        row_stride=2,
-        col_stride=2,
-        padding="VALID",
-        data_format=data_format,
-        use_gpu=use_gpu)
+    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+      self._ConstructAndTestSecondGradient(
+          pool_func,
+          input_sizes=[2, 2, 2, 3],
+          output_sizes=[2, 1, 1, 3],
+          window_rows=2,
+          window_cols=2,
+          row_stride=2,
+          col_stride=2,
+          padding="VALID",
+          data_format=data_format,
+          use_gpu=use_gpu)
 
   def _testMaxPoolGradGradSamePadding1_1(self, data_format, use_gpu):
-    self._ConstructAndTestSecondGradient(
-        nn_ops.max_pool,
-        input_sizes=[2, 2, 4, 3],
-        output_sizes=[2, 2, 4, 3],
-        window_rows=1,
-        window_cols=1,
-        row_stride=1,
-        col_stride=1,
-        padding="SAME",
-        data_format=data_format,
-        use_gpu=use_gpu)
+    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+      self._ConstructAndTestSecondGradient(
+          pool_func,
+          input_sizes=[2, 2, 4, 3],
+          output_sizes=[2, 2, 4, 3],
+          window_rows=1,
+          window_cols=1,
+          row_stride=1,
+          col_stride=1,
+          padding="SAME",
+          data_format=data_format,
+          use_gpu=use_gpu)
 
   def _testMaxPoolGradGradSamePadding2_1(self, data_format, use_gpu):
-    self._ConstructAndTestSecondGradient(
-        nn_ops.max_pool,
-        input_sizes=[2, 2, 4, 3],
-        output_sizes=[2, 2, 4, 3],
-        window_rows=2,
-        window_cols=2,
-        row_stride=1,
-        col_stride=1,
-        padding="SAME",
-        data_format=data_format,
-        use_gpu=use_gpu)
+    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+      self._ConstructAndTestSecondGradient(
+          pool_func,
+          input_sizes=[2, 2, 4, 3],
+          output_sizes=[2, 2, 4, 3],
+          window_rows=2,
+          window_cols=2,
+          row_stride=1,
+          col_stride=1,
+          padding="SAME",
+          data_format=data_format,
+          use_gpu=use_gpu)
 
   def _testMaxPoolGradGradSamePadding2_2(self, data_format, use_gpu):
-    self._ConstructAndTestSecondGradient(
-        nn_ops.max_pool,
-        input_sizes=[2, 2, 4, 3],
-        output_sizes=[2, 1, 2, 3],
-        window_rows=2,
-        window_cols=2,
-        row_stride=2,
-        col_stride=2,
-        padding="SAME",
-        data_format=data_format,
-        use_gpu=use_gpu)
+    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+      self._ConstructAndTestSecondGradient(
+          pool_func,
+          input_sizes=[2, 2, 4, 3],
+          output_sizes=[2, 1, 2, 3],
+          window_rows=2,
+          window_cols=2,
+          row_stride=2,
+          col_stride=2,
+          padding="SAME",
+          data_format=data_format,
+          use_gpu=use_gpu)
 
   def _testMaxPoolGradGradSamePadding3_1(self, data_format, use_gpu):
-    self._ConstructAndTestSecondGradient(
-        nn_ops.max_pool,
-        input_sizes=[1, 7, 7, 1],
-        output_sizes=[1, 7, 7, 1],
-        window_rows=3,
-        window_cols=3,
-        row_stride=1,
-        col_stride=1,
-        padding="SAME",
-        data_format=data_format,
-        use_gpu=use_gpu)
+    for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
+      self._ConstructAndTestSecondGradient(
+          pool_func,
+          input_sizes=[1, 7, 7, 1],
+          output_sizes=[1, 7, 7, 1],
+          window_rows=3,
+          window_cols=3,
+          row_stride=1,
+          col_stride=1,
+          padding="SAME",
+          data_format=data_format,
+          use_gpu=use_gpu)
 
   def testMaxPoolGradGrad(self):
     for (data_format, use_gpu) in GetTestConfigs():
diff --git a/tensorflow/python/kernel_tests/reshape_op_test.py b/tensorflow/python/kernel_tests/reshape_op_test.py
index 67aeb67d8dd..9d6e7e60a4b 100644
--- a/tensorflow/python/kernel_tests/reshape_op_test.py
+++ b/tensorflow/python/kernel_tests/reshape_op_test.py
@@ -41,6 +41,10 @@ class ReshapeTest(test.TestCase):
     self._testReshape(x, y, False)
     self._testReshape(x, y, True)
 
+  def testBoolBasic(self):
+    x = np.arange(1., 7.).reshape([1, 6]) > 3
+    self._testBothReshape(x, [2, 3])
+
   def testFloatBasic(self):
     x = np.arange(1., 7.).reshape([1, 6]).astype(np.float32)
     self._testBothReshape(x, [2, 3])
diff --git a/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py b/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py
index a0bd178e247..e20c6992525 100644
--- a/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py
@@ -347,7 +347,7 @@ def sparse_tensor_dense_vs_dense_matmul_benchmark(thresh,
           ops_fn = _sparse_tensor_dense_vs_dense_matmul_benchmark_dense(
               x_t, y_t, adjoint_a, adjoint_b)
       else:
-        with ops.device("/gpu:0"):
+        with ops.device("/device:GPU:0"):
           x_t = constant_op.constant(x)
           y_t = constant_op.constant(y)
           ops_fn = _sparse_tensor_dense_vs_dense_matmul_benchmark_dense(
@@ -365,7 +365,7 @@ def sparse_tensor_dense_vs_dense_matmul_benchmark(thresh,
         ops_fn = _sparse_tensor_dense_vs_dense_matmul_benchmark_sparse(
             x_ind, x_val, x_shape, y_t, adjoint_a, adjoint_b)
     else:
-      with ops.device("/gpu:0"):
+      with ops.device("/device:GPU:0"):
         x_ind = constant_op.constant(np.vstack(np.where(x)).astype(np.int64).T)
         x_val = constant_op.constant(x[np.where(x)])
         x_shape = constant_op.constant(np.array(x.shape).astype(np.int64))
diff --git a/tensorflow/python/kernel_tests/stack_op_test.py b/tensorflow/python/kernel_tests/stack_op_test.py
index 95ea3a90473..8e1f3eda7cd 100644
--- a/tensorflow/python/kernel_tests/stack_op_test.py
+++ b/tensorflow/python/kernel_tests/stack_op_test.py
@@ -45,7 +45,7 @@ class StackOpTest(test.TestCase):
     np.random.seed(7)
     with self.test_session(use_gpu=True):
       for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
-        for dtype in [np.float32, np.int32, np.int64]:
+        for dtype in [np.bool, np.float32, np.int32, np.int64]:
           data = np.random.randn(*shape).astype(dtype)
           # Convert [data[0], data[1], ...] separately to tensorflow
           # TODO(irving): Remove list() once we handle maps correctly
@@ -67,7 +67,7 @@ class StackOpTest(test.TestCase):
     np.random.seed(7)
     with self.test_session(use_gpu=True):
       for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
-        for dtype in [np.float32, np.int32, np.int64]:
+        for dtype in [np.bool, np.float32, np.int32, np.int64]:
           data = np.random.randn(*shape).astype(dtype)
           # Pack back into a single tensorflow tensor directly using np array
           c = array_ops.stack(data)
diff --git a/tensorflow/python/kernel_tests/tensordot_op_test.py b/tensorflow/python/kernel_tests/tensordot_op_test.py
index 71230ba0005..f3751572874 100644
--- a/tensorflow/python/kernel_tests/tensordot_op_test.py
+++ b/tensorflow/python/kernel_tests/tensordot_op_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.ops import array_ops
@@ -84,6 +85,22 @@ class TensordotTest(test_lib.TestCase):
                                    b_ph: b,
                                    axes_ph: axes_value})
 
+  # Test case for 11950: flat and nested `axes` must both match np.tensordot.
+  def test_valid_axis(self):
+    for axes_value in [1, 2], [[1], [2]]:
+      with self.test_session() as sess:
+        np_a = np.ones((3,3))
+        np_b = np.array([2, 3, 1])[None, None]
+        np_ans = np.tensordot(np_a, np_b, axes_value)
+
+        tf_a = array_ops.ones((3,3), dtype=dtypes.float32)
+        tf_b = constant_op.constant([2, 3, 1], dtype=dtypes.float32)[None, None]
+        tf_ans = math_ops.tensordot(tf_a, tf_b, axes_value).eval()
+
+        self.assertAllEqual(tf_ans.shape, np_ans.shape)
+        self.assertAllEqual(tf_ans, np_ans)
+
+
   def test_partial_shape_inference(self):
     a = array_ops.placeholder(dtypes.float32)
     b = array_ops.placeholder(dtypes.float32)
diff --git a/tensorflow/python/kernel_tests/unique_op_test.py b/tensorflow/python/kernel_tests/unique_op_test.py
index a1903887c72..a50f53b3cd3 100644
--- a/tensorflow/python/kernel_tests/unique_op_test.py
+++ b/tensorflow/python/kernel_tests/unique_op_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
@@ -37,6 +38,17 @@ class UniqueTest(test.TestCase):
     for i in range(len(x)):
       self.assertEqual(x[i], tf_y[tf_idx[i]])
 
+  def testInt32OutIdxInt64(self):
+    x = np.random.randint(2, high=10, size=7000)
+    with self.test_session() as sess:
+      y, idx = array_ops.unique(x, out_idx=dtypes.int64)
+      tf_y, tf_idx = sess.run([y, idx])
+
+    self.assertEqual(len(x), len(tf_idx))
+    self.assertEqual(len(tf_y), len(np.unique(x)))
+    for i in range(len(x)):
+      self.assertEqual(x[i], tf_y[tf_idx[i]])
+
   def testString(self):
     indx = np.random.randint(65, high=122, size=7000)
     x = [chr(i) for i in indx]
@@ -49,7 +61,6 @@ class UniqueTest(test.TestCase):
     for i in range(len(x)):
       self.assertEqual(x[i], tf_y[tf_idx[i]].decode('ascii'))
 
-
 class UniqueWithCountsTest(test.TestCase):
 
   def testInt32(self):
@@ -65,6 +76,19 @@ class UniqueWithCountsTest(test.TestCase):
     for value, count in zip(tf_y, tf_count):
       self.assertEqual(count, np.sum(x == value))
 
+  def testInt32OutIdxInt64(self):
+    x = np.random.randint(2, high=10, size=7000)
+    with self.test_session() as sess:
+      y, idx, count = array_ops.unique_with_counts(x, out_idx=dtypes.int64)
+      tf_y, tf_idx, tf_count = sess.run([y, idx, count])
+
+    self.assertEqual(len(x), len(tf_idx))
+    self.assertEqual(len(tf_y), len(np.unique(x)))
+    for i in range(len(x)):
+      self.assertEqual(x[i], tf_y[tf_idx[i]])
+    for value, count in zip(tf_y, tf_count):
+      self.assertEqual(count, np.sum(x == value))
+
   def testString(self):
     indx = np.random.randint(65, high=122, size=7000)
     x = [chr(i) for i in indx]
diff --git a/tensorflow/python/kernel_tests/variable_scope_test.py b/tensorflow/python/kernel_tests/variable_scope_test.py
index c140bc2b932..4adc3e521ce 100644
--- a/tensorflow/python/kernel_tests/variable_scope_test.py
+++ b/tensorflow/python/kernel_tests/variable_scope_test.py
@@ -722,7 +722,7 @@ class VariableScopeTest(test.TestCase):
     def device_func(op):
       if op.type in ["Variable", "VariableV2", "VarHandleOp"]:
         varname_type.append((op.name, op.get_attr("dtype")))
-      return "/gpu:0"
+      return "/device:GPU:0"
 
     with g.as_default():
       with ops.device(device_func):
diff --git a/tensorflow/python/kernel_tests/variables_test.py b/tensorflow/python/kernel_tests/variables_test.py
index e812d1f3b64..12421c1dcc3 100644
--- a/tensorflow/python/kernel_tests/variables_test.py
+++ b/tensorflow/python/kernel_tests/variables_test.py
@@ -290,6 +290,21 @@ class VariablesTestCase(test.TestCase):
                        variables.global_variables())
       self.assertEqual([var_x, var_z, var_t], variables.trainable_variables())
 
+  def testCollectionsWithScope(self):
+    with self.test_session():
+      with ops.name_scope("scope_1"):
+        var_x = variables.Variable(2.0)
+      with ops.name_scope("scope_2"):
+        var_y = variables.Variable(2.0)
+
+      self.assertEqual([var_x, var_y], variables.global_variables())
+      self.assertEqual([var_x], variables.global_variables("scope_1"))
+      self.assertEqual([var_y], variables.global_variables("scope_2"))
+
+      self.assertEqual([var_x, var_y], variables.trainable_variables())
+      self.assertEqual([var_x], variables.trainable_variables("scope_1"))
+      self.assertEqual([var_y], variables.trainable_variables("scope_2"))
+
   def testOperators(self):
     with self.test_session():
       var_f = variables.Variable([2.0])
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index 9eea4016ea6..5f21a2bdfa2 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -2119,11 +2119,9 @@ def _unique_layer_name(name):
 
   Example:
 
-  ```
-    >>> _unique_layer_name('dense')
-    dense_1
-    >>> _unique_layer_name('dense')
-    dense_2
+  ```python
+  _unique_layer_name('dense')  # dense_1
+  _unique_layer_name('dense')  # dense_2
   ```
   """
   graph = ops.get_default_graph()
diff --git a/tensorflow/python/layers/utils.py b/tensorflow/python/layers/utils.py
index 5e206c3bf9d..98c287e63e6 100644
--- a/tensorflow/python/layers/utils.py
+++ b/tensorflow/python/layers/utils.py
@@ -198,7 +198,7 @@ def smart_cond(pred, fn1, fn2, name=None):
     Tensors returned by the call to either `fn1` or `fn2`.
 
   Raises:
-    TypeError is fn1 or fn2 is not callable.
+    TypeError: If `fn1` or `fn2` is not callable.
   """
   if not callable(fn1):
     raise TypeError('`fn1` must be callable.')
@@ -226,7 +226,7 @@ def constant_value(pred):
     True or False if `pred` has a constant boolean value, None otherwise.
 
   Raises:
-    TypeError is pred is not a Variable, Tensor or bool.
+    TypeError: If `pred` is not a Variable, Tensor or bool.
   """
   if isinstance(pred, bool):
     pred_value = pred
diff --git a/tensorflow/python/lib/io/file_io_test.py b/tensorflow/python/lib/io/file_io_test.py
index e60b93b84fb..82653901a05 100644
--- a/tensorflow/python/lib/io/file_io_test.py
+++ b/tensorflow/python/lib/io/file_io_test.py
@@ -1,3 +1,4 @@
+# This Python file uses the following encoding: utf-8
 # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -451,6 +452,12 @@ class FileIoTest(test.TestCase):
     lines = f.readlines()
     self.assertSequenceEqual(lines, data)
 
+  def testUTF8StringPath(self):
+    file_path = os.path.join(self._base_dir, "UTF8测试_file")
+    file_io.write_string_to_file(file_path, "testing")
+    with file_io.FileIO(file_path, mode="rb") as f:
+      self.assertEqual(b"testing", f.read())
+
   def testEof(self):
     """Test that reading past EOF does not raise an exception."""
 
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index 9c9b852b76d..0042f929ee7 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -146,14 +146,14 @@ def expand_dims(input, axis=None, name=None, dim=None):
 
   ```python
   # 't' is a tensor of shape [2]
-  shape(expand_dims(t, 0)) ==> [1, 2]
-  shape(expand_dims(t, 1)) ==> [2, 1]
-  shape(expand_dims(t, -1)) ==> [2, 1]
+  tf.shape(tf.expand_dims(t, 0))  # [1, 2]
+  tf.shape(tf.expand_dims(t, 1))  # [2, 1]
+  tf.shape(tf.expand_dims(t, -1))  # [2, 1]
 
   # 't2' is a tensor of shape [2, 3, 5]
-  shape(expand_dims(t2, 0)) ==> [1, 2, 3, 5]
-  shape(expand_dims(t2, 2)) ==> [2, 3, 1, 5]
-  shape(expand_dims(t2, 3)) ==> [2, 3, 5, 1]
+  tf.shape(tf.expand_dims(t2, 0))  # [1, 2, 3, 5]
+  tf.shape(tf.expand_dims(t2, 2))  # [2, 3, 1, 5]
+  tf.shape(tf.expand_dims(t2, 3))  # [2, 3, 5, 1]
   ```
 
   This operation requires that:
@@ -252,8 +252,8 @@ def shape(input, name=None, out_type=dtypes.int32):
   For example:
 
   ```python
-  # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-  shape(t) ==> [2, 2, 3]
+  t = tf.constant([[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]])
+  tf.shape(t)  # [2, 2, 3]
   ```
 
   Args:
@@ -305,8 +305,8 @@ def size(input, name=None, out_type=dtypes.int32):
   For example:
 
   ```python
-  # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]]
-  size(t) ==> 12
+  t = tf.constant([[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]])
+  tf.size(t)  # 12
   ```
 
   Args:
@@ -357,9 +357,9 @@ def rank(input, name=None):
   For example:
 
   ```python
-  # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
   # shape of tensor 't' is [2, 2, 3]
-  rank(t) ==> 3
+  t = tf.constant([[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]])
+  tf.rank(t)  # 3
   ```
 
   **Note**: The rank of a tensor is not the same as the rank of a matrix. The
@@ -424,11 +424,11 @@ def _SliceHelper(tensor, slice_spec, var=None):
   ```python
   # strip leading and trailing 2 elements
   foo = tf.constant([1,2,3,4,5,6])
-  print(foo[2:-2].eval()) # => [3,4]
+  print(foo[2:-2].eval())  # [3,4]
 
   # skip every row and reverse every column
   foo = tf.constant([[1,2,3], [4,5,6], [7,8,9]])
-  print(foo[::2,::-1].eval()) # => [[3,2,1], [9,8,7]]
+  print(foo[::2,::-1].eval())  # [[3,2,1], [9,8,7]]
 
   # Insert another dimension
   foo = tf.constant([[1,2,3], [4,5,6], [7,8,9]])
@@ -439,9 +439,9 @@ def _SliceHelper(tensor, slice_spec, var=None):
 
   # Ellipses (3 equivalent operations)
   foo = tf.constant([[1,2,3], [4,5,6], [7,8,9]])
-  print(foo[tf.newaxis, :, :].eval()) # => [[[1,2,3], [4,5,6], [7,8,9]]]
-  print(foo[tf.newaxis, ...].eval()) # => [[[1,2,3], [4,5,6], [7,8,9]]]
-  print(foo[tf.newaxis].eval()) # => [[[1,2,3], [4,5,6], [7,8,9]]]
+  print(foo[tf.newaxis, :, :].eval())  # [[[1,2,3], [4,5,6], [7,8,9]]]
+  print(foo[tf.newaxis, ...].eval())  # [[[1,2,3], [4,5,6], [7,8,9]]]
+  print(foo[tf.newaxis].eval())  # [[[1,2,3], [4,5,6], [7,8,9]]]
   ```
 
   Notes:
@@ -563,14 +563,14 @@ def slice(input_, begin, size, name=None):
   For example:
 
   ```python
-  # 'input' is [[[1, 1, 1], [2, 2, 2]],
-  #             [[3, 3, 3], [4, 4, 4]],
-  #             [[5, 5, 5], [6, 6, 6]]]
-  tf.slice(input, [1, 0, 0], [1, 1, 3]) ==> [[[3, 3, 3]]]
-  tf.slice(input, [1, 0, 0], [1, 2, 3]) ==> [[[3, 3, 3],
-                                              [4, 4, 4]]]
-  tf.slice(input, [1, 0, 0], [2, 1, 3]) ==> [[[3, 3, 3]],
-                                             [[5, 5, 5]]]
+  t = tf.constant([[[1, 1, 1], [2, 2, 2]],
+                   [[3, 3, 3], [4, 4, 4]],
+                   [[5, 5, 5], [6, 6, 6]]])
+  tf.slice(t, [1, 0, 0], [1, 1, 3])  # [[[3, 3, 3]]]
+  tf.slice(t, [1, 0, 0], [1, 2, 3])  # [[[3, 3, 3],
+                                     #   [4, 4, 4]]]
+  tf.slice(t, [1, 0, 0], [2, 1, 3])  # [[[3, 3, 3]],
+                                     #  [[5, 5, 5]]]
   ```
 
   Args:
@@ -658,14 +658,14 @@ def strided_slice(input_,
 
 
   ```python
-  # 'input' is [[[1, 1, 1], [2, 2, 2]],
-  #             [[3, 3, 3], [4, 4, 4]],
-  #             [[5, 5, 5], [6, 6, 6]]]
-  tf.strided_slice(input, [1, 0, 0], [2, 1, 3], [1, 1, 1]) ==> [[[3, 3, 3]]]
-  tf.strided_slice(input, [1, 0, 0], [2, 2, 3], [1, 1, 1]) ==> [[[3, 3, 3],
-                                                                 [4, 4, 4]]]
-  tf.strided_slice(input, [1, -1, 0], [2, -3, 3], [1, -1, 1]) ==>[[[4, 4, 4],
-                                                                   [3, 3, 3]]]
+  t = tf.constant([[[1, 1, 1], [2, 2, 2]],
+                   [[3, 3, 3], [4, 4, 4]],
+                   [[5, 5, 5], [6, 6, 6]]])
+  tf.strided_slice(t, [1, 0, 0], [2, 1, 3], [1, 1, 1])  # [[[3, 3, 3]]]
+  tf.strided_slice(t, [1, 0, 0], [2, 2, 3], [1, 1, 1])  # [[[3, 3, 3],
+                                                        #   [4, 4, 4]]]
+  tf.strided_slice(t, [1, -1, 0], [2, -3, 3], [1, -1, 1])  # [[[4, 4, 4],
+                                                           #   [3, 3, 3]]]
   ```
 
   Args:
@@ -788,10 +788,10 @@ def parallel_stack(values, name="parallel_stack"):
   For example:
 
   ```python
-  # 'x' is [1, 4]
-  # 'y' is [2, 5]
-  # 'z' is [3, 6]
-  parallel_stack([x, y, z])  # => [[1, 4], [2, 5], [3, 6]]
+  x = tf.constant([1, 4])
+  y = tf.constant([2, 5])
+  z = tf.constant([3, 6])
+  tf.parallel_stack([x, y, z])  # [[1, 4], [2, 5], [3, 6]]
   ```
 
   The difference between `stack` and `parallel_stack` is that `stack` requires
@@ -839,11 +839,11 @@ def stack(values, axis=0, name="stack"):
   For example:
 
   ```python
-  # 'x' is [1, 4]
-  # 'y' is [2, 5]
-  # 'z' is [3, 6]
-  stack([x, y, z])  # => [[1, 4], [2, 5], [3, 6]] (Pack along first dim.)
-  stack([x, y, z], axis=1)  # => [[1, 2, 3], [4, 5, 6]]
+  x = tf.constant([1, 4])
+  y = tf.constant([2, 5])
+  z = tf.constant([3, 6])
+  tf.stack([x, y, z])  # [[1, 4], [2, 5], [3, 6]] (Pack along first dim.)
+  tf.stack([x, y, z], axis=1)  # [[1, 2, 3], [4, 5, 6]]
   ```
 
   This is the opposite of unstack.  The numpy equivalent is
@@ -1043,13 +1043,13 @@ def concat(values, axis, name="concat"):
   ```python
   t1 = [[1, 2, 3], [4, 5, 6]]
   t2 = [[7, 8, 9], [10, 11, 12]]
-  tf.concat([t1, t2], 0) ==> [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]
-  tf.concat([t1, t2], 1) ==> [[1, 2, 3, 7, 8, 9], [4, 5, 6, 10, 11, 12]]
+  tf.concat([t1, t2], 0)  # [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]
+  tf.concat([t1, t2], 1)  # [[1, 2, 3, 7, 8, 9], [4, 5, 6, 10, 11, 12]]
 
   # tensor t3 with shape [2, 3]
   # tensor t4 with shape [2, 3]
-  tf.shape(tf.concat([t3, t4], 0)) ==> [4, 3]
-  tf.shape(tf.concat([t3, t4], 1)) ==> [2, 6]
+  tf.shape(tf.concat([t3, t4], 0))  # [4, 3]
+  tf.shape(tf.concat([t3, t4], 1))  # [2, 6]
   ```
 
   Note: If you are concatenating along a new axis consider using stack.
@@ -1098,7 +1098,7 @@ def boolean_mask(tensor, mask, name="boolean_mask"):
   # 1-D example
   tensor = [0, 1, 2, 3]
   mask = np.array([True, False, True, False])
-  boolean_mask(tensor, mask) ==> [0, 2]
+  tf.boolean_mask(tensor, mask)  # [0, 2]
   ```
 
   In general, `0 < dim(mask) = K <= dim(tensor)`, and `mask`'s shape must match
@@ -1124,7 +1124,7 @@ def boolean_mask(tensor, mask, name="boolean_mask"):
   # 2-D example
   tensor = [[1, 2], [3, 4], [5, 6]]
   mask = np.array([True, False, True])
-  boolean_mask(tensor, mask) ==> [[1, 2], [5, 6]]
+  tf.boolean_mask(tensor, mask)  # [[1, 2], [5, 6]]
   ```
   """
 
@@ -1176,17 +1176,16 @@ def sparse_mask(a, mask_indices, name=None):
   ```python
   # `a` contains slices at indices [12, 26, 37, 45] from a large tensor
   # with shape [1000, 10]
-  a.indices => [12, 26, 37, 45]
-  tf.shape(a.values) => [4, 10]
+  a.indices  # [12, 26, 37, 45]
+  tf.shape(a.values)  # [4, 10]
 
   # `b` will be the subset of `a` slices at its second and third indices, so
   # we want to mask its first and last indices (which are at absolute
   # indices 12, 45)
   b = tf.sparse_mask(a, [12, 45])
 
-  b.indices => [26, 37]
-  tf.shape(b.values) => [2, 10]
-
+  b.indices  # [26, 37]
+  tf.shape(b.values)  # [2, 10]
   ```
 
   Args:
@@ -1222,12 +1221,12 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"):
   # 'value' is a tensor with shape [5, 30]
   # Split 'value' into 3 tensors with sizes [4, 15, 11] along dimension 1
   split0, split1, split2 = tf.split(value, [4, 15, 11], 1)
-  tf.shape(split0) ==> [5, 4]
-  tf.shape(split1) ==> [5, 15]
-  tf.shape(split2) ==> [5, 11]
+  tf.shape(split0)  # [5, 4]
+  tf.shape(split1)  # [5, 15]
+  tf.shape(split2)  # [5, 11]
   # Split 'value' into 3 tensors along dimension 1
   split0, split1, split2 = tf.split(value, num_or_size_splits=3, axis=1)
-  tf.shape(split0) ==> [5, 10]
+  tf.shape(split0)  # [5, 10]
   ```
 
   Args:
@@ -1281,30 +1280,29 @@ def transpose(a, perm=None, name="transpose"):
   For example:
 
   ```python
-  # 'x' is [[1 2 3]
-  #         [4 5 6]]
-  tf.transpose(x) ==> [[1 4]
-                       [2 5]
-                       [3 6]]
+  x = tf.constant([[1, 2, 3], [4, 5, 6]])
+  tf.transpose(x)  # [[1, 4],
+                   #  [2, 5],
+                   #  [3, 6]]
 
   # Equivalently
-  tf.transpose(x, perm=[1, 0]) ==> [[1 4]
-                                    [2 5]
-                                    [3 6]]
+  tf.transpose(x, perm=[1, 0])  # [[1, 4],
+                                #  [2, 5],
+                                #  [3, 6]]
 
   # 'perm' is more useful for n-dimensional tensors, for n > 2
-  # 'x' is   [[[1  2  3]
-  #            [4  5  6]]
-  #           [[7  8  9]
-  #            [10 11 12]]]
-  # Take the transpose of the matrices in dimension-0
-  tf.transpose(x, perm=[0, 2, 1]) ==> [[[1  4]
-                                        [2  5]
-                                        [3  6]]
+  x = tf.constant([[[ 1,  2,  3],
+                    [ 4,  5,  6]],
+                   [[ 7,  8,  9],
+                    [10, 11, 12]]])
 
-                                       [[7 10]
-                                        [8 11]
-                                        [9 12]]]
+  # Take the transpose of the matrices in dimension-0
+  tf.transpose(x, perm=[0, 2, 1])  # [[[1,  4],
+                                   #   [2,  5],
+                                   #   [3,  6]],
+                                   #  [[7, 10],
+                                   #   [8, 11],
+                                   #   [9, 12]]]
   ```
 
   Args:
@@ -1337,12 +1335,10 @@ def matrix_transpose(a, name="matrix_transpose"):
   For example:
 
   ```python
-  # Matrix with no batch dimension.
-  # 'x' is [[1 2 3]
-  #         [4 5 6]]
-  tf.matrix_transpose(x) ==> [[1 4]
-                                   [2 5]
-                                   [3 6]]
+  x = tf.constant([[1, 2, 3], [4, 5, 6]])
+  tf.matrix_transpose(x)  # [[1, 4],
+                          #  [2, 5],
+                          #  [3, 6]]
 
   # Matrix with two batch dimensions.
   # x.shape is [1, 2, 3, 4]
@@ -1352,7 +1348,7 @@ def matrix_transpose(a, name="matrix_transpose"):
   Note that `tf.matmul` provides kwargs allowing for transpose of arguments.
   This is done with minimal cost, and is preferable to using this function. E.g.
 
-  ```
+  ```python
   # Good!  Transpose is taken at minimal additional cost.
   tf.matmul(matrix, b, transpose_b=True)
 
@@ -1405,7 +1401,7 @@ def zeros(shape, dtype=dtypes.float32, name=None):
   For example:
 
   ```python
-  tf.zeros([3, 4], tf.int32) ==> [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]
+  tf.zeros([3, 4], tf.int32)  # [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]
   ```
 
   Args:
@@ -1445,8 +1441,8 @@ def zeros_like(tensor, dtype=None, name=None, optimize=True):
   For example:
 
   ```python
-  # 'tensor' is [[1, 2, 3], [4, 5, 6]]
-  tf.zeros_like(tensor) ==> [[0, 0, 0], [0, 0, 0]]
+  tensor = tf.constant([[1, 2, 3], [4, 5, 6]])
+  tf.zeros_like(tensor)  # [[0, 0, 0], [0, 0, 0]]
   ```
 
   Args:
@@ -1485,8 +1481,8 @@ def ones_like(tensor, dtype=None, name=None, optimize=True):
   For example:
 
   ```python
-  # 'tensor' is [[1, 2, 3], [4, 5, 6]]
-  tf.ones_like(tensor) ==> [[1, 1, 1], [1, 1, 1]]
+  tensor = tf.constant([[1, 2, 3], [4, 5, 6]])
+  tf.ones_like(tensor)  # [[1, 1, 1], [1, 1, 1]]
   ```
 
   Args:
@@ -1521,7 +1517,7 @@ def ones(shape, dtype=dtypes.float32, name=None):
   For example:
 
   ```python
-  tf.ones([2, 3], tf.int32) ==> [[1, 1, 1], [1, 1, 1]]
+  tf.ones([2, 3], tf.int32)  # [[1, 1, 1], [1, 1, 1]]
   ```
 
   Args:
@@ -1668,24 +1664,24 @@ def pad(tensor, paddings, mode="CONSTANT", name=None, constant_values=0):  # pyl
   For example:
 
   ```python
-  # 't' is [[1, 2, 3], [4, 5, 6]].
-  # 'paddings' is [[1, 1,], [2, 2]].
+  t = tf.constant([[1, 2, 3], [4, 5, 6]])
+  paddings = tf.constant([[1, 1], [2, 2]])
   # 'constant_values' is 0.
   # rank of 't' is 2.
-  pad(t, paddings, "CONSTANT") ==> [[0, 0, 0, 0, 0, 0, 0],
-                                    [0, 0, 1, 2, 3, 0, 0],
-                                    [0, 0, 4, 5, 6, 0, 0],
-                                    [0, 0, 0, 0, 0, 0, 0]]
+  tf.pad(t, paddings, "CONSTANT")  # [[0, 0, 0, 0, 0, 0, 0],
+                                   #  [0, 0, 1, 2, 3, 0, 0],
+                                   #  [0, 0, 4, 5, 6, 0, 0],
+                                   #  [0, 0, 0, 0, 0, 0, 0]]
 
-  pad(t, paddings, "REFLECT") ==> [[6, 5, 4, 5, 6, 5, 4],
-                                   [3, 2, 1, 2, 3, 2, 1],
-                                   [6, 5, 4, 5, 6, 5, 4],
-                                   [3, 2, 1, 2, 3, 2, 1]]
+  tf.pad(t, paddings, "REFLECT")  # [[6, 5, 4, 5, 6, 5, 4],
+                                  #  [3, 2, 1, 2, 3, 2, 1],
+                                  #  [6, 5, 4, 5, 6, 5, 4],
+                                  #  [3, 2, 1, 2, 3, 2, 1]]
 
-  pad(t, paddings, "SYMMETRIC") ==> [[2, 1, 1, 2, 3, 3, 2],
-                                     [2, 1, 1, 2, 3, 3, 2],
-                                     [5, 4, 4, 5, 6, 6, 5],
-                                     [5, 4, 4, 5, 6, 6, 5]]
+  tf.pad(t, paddings, "SYMMETRIC")  # [[2, 1, 1, 2, 3, 3, 2],
+                                    #  [2, 1, 1, 2, 3, 3, 2],
+                                    #  [5, 4, 4, 5, 6, 6, 5],
+                                    #  [5, 4, 4, 5, 6, 6, 5]]
   ```
 
   Args:
@@ -1757,19 +1753,15 @@ def meshgrid(*args, **kwargs):
   Calling `X, Y = meshgrid(x, y)` with the tensors
 
   ```python
-    x = [1, 2, 3]
-    y = [4, 5, 6]
-  ```
-
-  results in
-
-  ```python
-    X = [[1, 2, 3],
-         [1, 2, 3],
-         [1, 2, 3]]
-    Y = [[4, 4, 4],
-         [5, 5, 5],
-         [6, 6, 6]]
+  x = [1, 2, 3]
+  y = [4, 5, 6]
+  X, Y = tf.meshgrid(x, y)
+  # X = [[1, 2, 3],
+  #      [1, 2, 3],
+  #      [1, 2, 3]]
+  # Y = [[4, 4, 4],
+  #      [5, 5, 5],
+  #      [6, 6, 6]]
   ```
 
   Args:
@@ -2146,66 +2138,35 @@ def one_hot(indices,
   Note: If a non-numeric data type output is desired (`tf.string`, `tf.bool`,
   etc.), both `on_value` and `off_value` _must_ be provided to `one_hot`.
 
-  Examples
-  =========
-
-  Suppose that
+  For example:
 
   ```python
-    indices = [0, 2, -1, 1]
-    depth = 3
-    on_value = 5.0
-    off_value = 0.0
-    axis = -1
-  ```
+  indices = [0, 1, 2]
+  depth = 3
+  tf.one_hot(indices, depth)  # output: [3 x 3]
+  # [[1., 0., 0.],
+  #  [0., 1., 0.],
+  #  [0., 0., 1.]]
 
-  Then output is `[4 x 3]`:
+  indices = [0, 2, -1, 1]
+  depth = 3
+  tf.one_hot(indices, depth,
+             on_value=5.0, off_value=0.0,
+             axis=-1)  # output: [4 x 3]
+  # [[5.0, 0.0, 0.0],  # one_hot(0)
+  #  [0.0, 0.0, 5.0],  # one_hot(2)
+  #  [0.0, 0.0, 0.0],  # one_hot(-1)
+  #  [0.0, 5.0, 0.0]]  # one_hot(1)
 
-  ```python
-    output =
-    [5.0 0.0 0.0]  // one_hot(0)
-    [0.0 0.0 5.0]  // one_hot(2)
-    [0.0 0.0 0.0]  // one_hot(-1)
-    [0.0 5.0 0.0]  // one_hot(1)
-  ```
-
-  Suppose that
-
-  ```python
-    indices = [[0, 2], [1, -1]]
-    depth = 3
-    on_value = 1.0
-    off_value = 0.0
-    axis = -1
-  ```
-
-  Then output is `[2 x 2 x 3]`:
-
-  ```python
-    output =
-    [
-      [1.0, 0.0, 0.0]  // one_hot(0)
-      [0.0, 0.0, 1.0]  // one_hot(2)
-    ][
-      [0.0, 1.0, 0.0]  // one_hot(1)
-      [0.0, 0.0, 0.0]  // one_hot(-1)
-    ]
-  ```
-
-  Using default values for `on_value` and `off_value`:
-
-  ```python
-    indices = [0, 1, 2]
-    depth = 3
-  ```
-
-  The output will be
-
-  ```python
-    output =
-    [[1., 0., 0.],
-     [0., 1., 0.],
-     [0., 0., 1.]]
+  indices = [[0, 2], [1, -1]]
+  depth = 3
+  tf.one_hot(indices, depth,
+             on_value=1.0, off_value=0.0,
+             axis=-1)  # output: [2 x 2 x 3]
+  # [[[1.0, 0.0, 0.0],   # one_hot(0)
+  #   [0.0, 0.0, 1.0]],  # one_hot(2)
+  #  [[0.0, 1.0, 0.0],   # one_hot(1)
+  #   [0.0, 0.0, 0.0]]]  # one_hot(-1)
   ```
 
   Args:
@@ -2275,10 +2236,9 @@ def sequence_mask(lengths, maxlen=None, dtype=dtypes.bool, name=None):
   Example:
 
   ```python
-  tf.sequence_mask([1, 3, 2], 5) =
-    [[True, False, False, False, False],
-     [True, True, True, False, False],
-     [True, True, False, False, False]]
+  tf.sequence_mask([1, 3, 2], 5)  # [[True, False, False, False, False],
+                                  #  [True, True, True, False, False],
+                                  #  [True, True, False, False, False]]
   ```
 
   Args:
@@ -2336,14 +2296,14 @@ def squeeze(input, axis=None, name=None, squeeze_dims=None):
 
   ```python
   # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
-  shape(squeeze(t))  # => [2, 3]
+  tf.shape(tf.squeeze(t))  # [2, 3]
   ```
 
   Or, to remove specific size 1 dimensions:
 
   ```python
   # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
-  shape(squeeze(t, [2, 4]))  # => [1, 2, 3, 1]
+  tf.shape(tf.squeeze(t, [2, 4]))  # [1, 2, 3, 1]
   ```
 
   Args:
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index 73bd6a63772..3797919f52b 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -1046,10 +1046,10 @@ class ControlFlowState(object):
     Otherwise, they will enter the backprop loop with None. As an example,
     people often write:
 
-           ```
-           v1, _ = tf.while_loop(p, b, [x1, x2])
-           result = gradients(v1, x1)
-           ```
+    ```python
+    v1, _ = tf.while_loop(p, b, [x1, x2])
+    result = gradients(v1, x1)
+    ```
 
     The exit node for x2 is not included by the betweenness analysis. But we
     need to backprop x2 if x2 is involved in computing v1.
@@ -1779,13 +1779,13 @@ def cond(pred, true_fn=None, false_fn=None, strict=False, name=None,
   Example:
 
   ```python
-    x = tf.constant(2)
-    y = tf.constant(5)
-    def f1(): return tf.multiply(x, 17)
-    def f2(): return tf.add(y, 23)
-    r = tf.cond(tf.less(x, y), f1, f2)
-    # r is set to f1().
-    # Operations in f2 (e.g., tf.add) are not executed.
+  x = tf.constant(2)
+  y = tf.constant(5)
+  def f1(): return tf.multiply(x, 17)
+  def f2(): return tf.add(y, 23)
+  r = tf.cond(tf.less(x, y), f1, f2)
+  # r is set to f1().
+  # Operations in f2 (e.g., tf.add) are not executed.
   ```
 
   """
diff --git a/tensorflow/python/ops/data_flow_ops.py b/tensorflow/python/ops/data_flow_ops.py
index fbfbb50d8cb..2a2ef5809d0 100644
--- a/tensorflow/python/ops/data_flow_ops.py
+++ b/tensorflow/python/ops/data_flow_ops.py
@@ -1492,7 +1492,7 @@ class BaseStagingArea(object):
     # Sanity check number of values
     if not len(vals) <= len(self._dtypes):
       raise ValueError("Unexpected number of inputs '%s' vs '%s'" % (
-                          len(values), len(self._dtypes)))
+                          len(vals), len(self._dtypes)))
 
     tensors = []
 
diff --git a/tensorflow/python/ops/distributions/util.py b/tensorflow/python/ops/distributions/util.py
index 63fb87e93c5..59add19a581 100644
--- a/tensorflow/python/ops/distributions/util.py
+++ b/tensorflow/python/ops/distributions/util.py
@@ -586,15 +586,15 @@ def rotate_transpose(x, shift, name="rotate_transpose"):
 
   Example:
 
-    ```python
-    x = ...  # Tensor of shape [1, 2, 3, 4].
-    rotate_transpose(x, -1)  # result shape: [2, 3, 4, 1]
-    rotate_transpose(x, -2)  # result shape: [3, 4, 1, 2]
-    rotate_transpose(x,  1)  # result shape: [4, 1, 2, 3]
-    rotate_transpose(x,  2)  # result shape: [3, 4, 1, 2]
-    rotate_transpose(x, 7) == rotate_transpose(x, 3)
-    rotate_transpose(x, -7) == rotate_transpose(x, -3)
-    ```
+  ```python
+  x = tf.random_normal([1, 2, 3, 4])  # Tensor of shape [1, 2, 3, 4].
+  rotate_transpose(x, -1).shape == [2, 3, 4, 1]
+  rotate_transpose(x, -2).shape == [3, 4, 1, 2]
+  rotate_transpose(x,  1).shape == [4, 1, 2, 3]
+  rotate_transpose(x,  2).shape == [3, 4, 1, 2]
+  rotate_transpose(x,  7).shape == rotate_transpose(x, 3).shape  # [2, 3, 4, 1]
+  rotate_transpose(x, -7).shape == rotate_transpose(x, -3).shape  # [4, 1, 2, 3]
+  ```
 
   Args:
     x: `Tensor`.
@@ -667,10 +667,8 @@ def pick_vector(cond,
   Example:
 
   ```python
-  pick_vector(tf.less(0, 5), tf.range(10, 12), tf.range(15, 18))
-  # result is tensor: [10, 11].
-  pick_vector(tf.less(5, 0), tf.range(10, 12), tf.range(15, 18))
-  # result is tensor: [15, 16, 17].
+  pick_vector(tf.less(0, 5), tf.range(10, 12), tf.range(15, 18))  # [10, 11]
+  pick_vector(tf.less(5, 0), tf.range(10, 12), tf.range(15, 18))  # [15, 16, 17]
   ```
 
   Returns:
@@ -733,10 +731,9 @@ def fill_lower_triangular(x, validate_args=False, name="fill_lower_triangular"):
   Example:
 
   ```python
-  fill_lower_triangular([1, 2, 3, 4, 5, 6])
-  # Returns: [[1, 0, 0],
-  #           [2, 3, 0],
-  #           [4, 5, 6]]
+  fill_lower_triangular([1, 2, 3, 4, 5, 6])  # [[1, 0, 0],
+                                             #  [2, 3, 0],
+                                             #  [4, 5, 6]]
   ```
 
   For comparison, a pure numpy version of this function can be found in
@@ -753,7 +750,7 @@ def fill_lower_triangular(x, validate_args=False, name="fill_lower_triangular"):
     tril: `Tensor` with lower triangular elements filled from `x`.
 
   Raises:
-    ValueError: if shape if `x` has static shape which cannot be mapped to a
+    ValueError: if `x` has a static shape which cannot be mapped to a
       lower triangular matrix.
   """
   # TODO(jvdillon): Replace this code with dedicated op when it exists.
diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py
index aefed34d744..11c204b5b7f 100644
--- a/tensorflow/python/ops/gradients_test.py
+++ b/tensorflow/python/ops/gradients_test.py
@@ -163,20 +163,20 @@ class GradientsTest(test_util.TensorFlowTestCase):
     with ops.Graph().as_default() as g:
       w = constant(1.0, shape=[1, 1])
       x = constant(1.0, shape=[1, 2])
-      with g.device("/gpu:0"):
+      with g.device("/device:GPU:0"):
         wx = math_ops.matmul(w, x)
       gw = gradients.gradients(wx, [w], colocate_gradients_with_ops=True)[0]
     self.assertEqual(gw.op.colocation_groups(), wx.op.colocation_groups())
 
   def testColocateGradientsWithAggregation(self):
     with ops.Graph().as_default() as g:
-      with g.device("/gpu:1"):
+      with g.device("/device:GPU:1"):
         w = constant(1.0, shape=[1, 1])
       x = constant(1.0, shape=[1, 2])
       y = constant(1.0, shape=[1, 2])
       wx = math_ops.matmul(w, x)
       wy = math_ops.matmul(w, y)
-      with g.device("/gpu:0"):
+      with g.device("/device:GPU:0"):
         z = wx + wy
 
       gw1 = gradients.gradients(z, [w], colocate_gradients_with_ops=True)[0]
@@ -187,7 +187,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
 
   def testColocateGradientsWithAggregationInMultipleDevices(self):
     with ops.Graph().as_default() as g:
-      with g.device("/gpu:1"):
+      with g.device("/device:GPU:1"):
         w = constant(1.0, shape=[1, 1])
       x = constant(1.0, shape=[1, 2])
       y = constant(1.0, shape=[1, 2])
@@ -195,7 +195,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
         wx = math_ops.matmul(w, x)
       with g.device("/task:2"):
         wy = math_ops.matmul(w, y)
-      with g.device("/gpu:0"):
+      with g.device("/device:GPU:0"):
         z = wx + wy
 
       gw1 = gradients.gradients(z, [w], colocate_gradients_with_ops=True)[0]
diff --git a/tensorflow/python/ops/hidden_ops.txt b/tensorflow/python/ops/hidden_ops.txt
index ffa5dc4d623..eeaf418c8b3 100644
--- a/tensorflow/python/ops/hidden_ops.txt
+++ b/tensorflow/python/ops/hidden_ops.txt
@@ -302,6 +302,7 @@ BiasAddV1
 Relu6
 AvgPool
 MaxPool
+MaxPoolV2
 Softmax
 LogSoftmax
 FractionalAvgPoolGrad
diff --git a/tensorflow/python/ops/histogram_ops.py b/tensorflow/python/ops/histogram_ops.py
index c145b11191e..c2077d51af9 100644
--- a/tensorflow/python/ops/histogram_ops.py
+++ b/tensorflow/python/ops/histogram_ops.py
@@ -62,7 +62,7 @@ def histogram_fixed_width(values,
   value_range = [0.0, 5.0]
   new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
 
-  with tf.default_session() as sess:
+  with tf.Session() as sess:
     hist = tf.histogram_fixed_width(new_values, value_range, nbins=5)
     variables.global_variables_initializer().run()
     sess.run(hist) => [2, 1, 1, 0, 2]
diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py
index 878d6ea63ae..9eea3c21f89 100644
--- a/tensorflow/python/ops/init_ops.py
+++ b/tensorflow/python/ops/init_ops.py
@@ -65,13 +65,13 @@ class Initializer(object):
 
     Example:
 
-    ```
+    ```python
     initializer = RandomUniform(-1, 1)
     config = initializer.get_config()
     initializer = RandomUniform.from_config(config)
     ```
 
-    Arguments:
+    Args:
       config: A Python dictionary.
         It will typically be the output of `get_config`.
 
@@ -388,7 +388,7 @@ class VarianceScaling(Initializer):
   With `distribution="uniform"`, samples are drawn from a uniform distribution
   within [-limit, limit], with `limit = sqrt(3 * scale / n)`.
 
-  Arguments:
+  Args:
     scale: Scaling factor (positive float).
     mode: One of "fan_in", "fan_out", "fan_avg".
     distribution: Random distribution to use. One of "normal", "uniform".
@@ -570,7 +570,7 @@ def glorot_uniform_initializer(seed=None, dtype=dtypes.float32):
 
   Reference: http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
 
-  Arguments:
+  Args:
     seed: A Python integer. Used to create random seeds. See
       @{tf.set_random_seed}
       for behavior.
@@ -593,7 +593,7 @@ def glorot_normal_initializer(seed=None, dtype=dtypes.float32):
 
   Reference: http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
 
-  Arguments:
+  Args:
     seed: A Python integer. Used to create random seeds. See
       @{tf.set_random_seed}
       for behavior.
@@ -612,7 +612,7 @@ def glorot_normal_initializer(seed=None, dtype=dtypes.float32):
 def _compute_fans(shape):
   """Computes the number of input and output units for a weight shape.
 
-  Arguments:
+  Args:
     shape: Integer shape tuple or TF tensor shape.
 
   Returns:
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 3e91ec06841..7ee095745ae 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -233,9 +233,9 @@ def abs(x, name=None):
   `float32` or `float64` that is the absolute value of each element in `x`. All
   elements in `x` must be complex numbers of the form \\(a + bj\\). The
   absolute value is computed as \\( \sqrt{a^2 + b^2}\\).  For example:
-  ```
-  # tensor 'x' is [[-2.25 + 4.75j], [-3.25 + 5.75j]]
-  tf.complex_abs(x) ==> [5.25594902, 6.60492229]
+  ```python
+  x = tf.constant([[-2.25 + 4.75j], [-3.25 + 5.75j]])
+  tf.abs(x)  # [[5.25594902], [6.60492229]]
   ```
 
   Args:
@@ -524,10 +524,10 @@ def pow(x, y, name=None):
   Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
   corresponding elements in `x` and `y`. For example:
 
-  ```
-  # tensor 'x' is [[2, 2], [3, 3]]
-  # tensor 'y' is [[8, 16], [2, 3]]
-  tf.pow(x, y) ==> [[256, 65536], [9, 27]]
+  ```python
+  x = tf.constant([[2, 2], [3, 3]])
+  y = tf.constant([[8, 16], [2, 3]])
+  tf.pow(x, y)  # [[256, 65536], [9, 27]]
   ```
 
   Args:
@@ -557,10 +557,10 @@ def complex(real, imag, name=None):
 
   For example:
 
-  ```
-  # tensor 'real' is [2.25, 3.25]
-  # tensor `imag` is [4.75, 5.75]
-  tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
+  ```python
+  real = tf.constant([2.25, 3.25])
+  imag = tf.constant([4.75, 5.75])
+  tf.complex(real, imag)  # [2.25 + 4.75j, 3.25 + 5.75j]
   ```
 
   Args:
@@ -597,9 +597,9 @@ def real(input, name=None):
 
   For example:
 
-  ```
-  # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-  tf.real(input) ==> [-2.25, 3.25]
+  ```python
+  x = tf.constant([-2.25 + 4.75j, 3.25 + 5.75j])
+  tf.real(x)  # [-2.25, 3.25]
   ```
 
   If `input` is already real, it is returned unchanged.
@@ -629,9 +629,9 @@ def imag(input, name=None):
 
   For example:
 
-  ```
-  # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-  tf.imag(input) ==> [4.75, 5.75]
+  ```python
+  x = tf.constant([-2.25 + 4.75j, 3.25 + 5.75j])
+  tf.imag(x)  # [4.75, 5.75]
   ```
 
   Args:
@@ -657,8 +657,8 @@ def round(x, name=None):
   For example:
 
   ```python
-  # 'a' is [0.9, 2.5, 2.3, 1.5, -4.5]
-  tf.round(a) ==> [ 1.0, 2.0, 2.0, 2.0, -4.0 ]
+  x = tf.constant([0.9, 2.5, 2.3, 1.5, -4.5])
+  tf.round(x)  # [1.0, 2.0, 2.0, 2.0, -4.0]
   ```
 
   Args:
@@ -684,8 +684,8 @@ def cast(x, dtype, name=None):
   For example:
 
   ```python
-  # tensor `a` is [1.8, 2.2], dtype=tf.float
-  tf.cast(a, tf.int32) ==> [1, 2]  # dtype=tf.int32
+  x = tf.constant([1.8, 2.2], dtype=tf.float32)
+  tf.cast(x, tf.int32)  # [1, 2], dtype=tf.int32
   ```
 
   Args:
@@ -1147,18 +1147,18 @@ def range(start, limit=None, delta=1, dtype=None, name="range"):
   For example:
 
   ```python
-  # 'start' is 3
-  # 'limit' is 18
-  # 'delta' is 3
-  tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15]
+  start = 3
+  limit = 18
+  delta = 3
+  tf.range(start, limit, delta)  # [3, 6, 9, 12, 15]
 
-  # 'start' is 3
-  # 'limit' is 1
-  # 'delta' is -0.5
-  tf.range(start, limit, delta) ==> [3, 2.5, 2, 1.5]
+  start = 3
+  limit = 1
+  delta = -0.5
+  tf.range(start, limit, delta)  # [3, 2.5, 2, 1.5]
 
-  # 'limit' is 5
-  tf.range(limit) ==> [0, 1, 2, 3, 4]
+  limit = 5
+  tf.range(limit)  # [0, 1, 2, 3, 4]
   ```
 
   Args:
@@ -1247,13 +1247,12 @@ def reduce_sum(input_tensor,
   For example:
 
   ```python
-  # 'x' is [[1, 1, 1]
-  #         [1, 1, 1]]
-  tf.reduce_sum(x) ==> 6
-  tf.reduce_sum(x, 0) ==> [2, 2, 2]
-  tf.reduce_sum(x, 1) ==> [3, 3]
-  tf.reduce_sum(x, 1, keep_dims=True) ==> [[3], [3]]
-  tf.reduce_sum(x, [0, 1]) ==> 6
+  x = tf.constant([[1, 1, 1], [1, 1, 1]])
+  tf.reduce_sum(x)  # 6
+  tf.reduce_sum(x, 0)  # [2, 2, 2]
+  tf.reduce_sum(x, 1)  # [3, 3]
+  tf.reduce_sum(x, 1, keep_dims=True)  # [[3], [3]]
+  tf.reduce_sum(x, [0, 1])  # 6
   ```
 
   Args:
@@ -1302,13 +1301,12 @@ def count_nonzero(input_tensor,
   For example:
 
   ```python
-  # 'x' is [[0, 1, 0]
-  #         [1, 1, 0]]
-  tf.count_nonzero(x) ==> 3
-  tf.count_nonzero(x, 0) ==> [1, 2, 0]
-  tf.count_nonzero(x, 1) ==> [1, 2]
-  tf.count_nonzero(x, 1, keep_dims=True) ==> [[1], [2]]
-  tf.count_nonzero(x, [0, 1]) ==> 3
+  x = tf.constant([[0, 1, 0], [1, 1, 0]])
+  tf.count_nonzero(x)  # 3
+  tf.count_nonzero(x, 0)  # [1, 2, 0]
+  tf.count_nonzero(x, 1)  # [1, 2]
+  tf.count_nonzero(x, 1, keep_dims=True)  # [[1], [2]]
+  tf.count_nonzero(x, [0, 1])  # 3
   ```
 
   Args:
@@ -1355,11 +1353,10 @@ def reduce_mean(input_tensor,
   For example:
 
   ```python
-  # 'x' is [[1., 1.]
-  #         [2., 2.]]
-  tf.reduce_mean(x) ==> 1.5
-  tf.reduce_mean(x, 0) ==> [1.5, 1.5]
-  tf.reduce_mean(x, 1) ==> [1.,  2.]
+  x = tf.constant([[1., 1.], [2., 2.]])
+  tf.reduce_mean(x)  # 1.5
+  tf.reduce_mean(x, 0)  # [1.5, 1.5]
+  tf.reduce_mean(x, 1)  # [1.,  2.]
   ```
 
   Args:
@@ -1517,11 +1514,10 @@ def reduce_all(input_tensor,
   For example:
 
   ```python
-  # 'x' is [[True,  True]
-  #         [False, False]]
-  tf.reduce_all(x) ==> False
-  tf.reduce_all(x, 0) ==> [False, False]
-  tf.reduce_all(x, 1) ==> [True, False]
+  x = tf.constant([[True,  True], [False, False]])
+  tf.reduce_all(x)  # False
+  tf.reduce_all(x, 0)  # [False, False]
+  tf.reduce_all(x, 1)  # [True, False]
   ```
 
   Args:
@@ -1565,11 +1561,10 @@ def reduce_any(input_tensor,
   For example:
 
   ```python
-  # 'x' is [[True,  True]
-  #         [False, False]]
-  tf.reduce_any(x) ==> True
-  tf.reduce_any(x, 0) ==> [True, True]
-  tf.reduce_any(x, 1) ==> [True, False]
+  x = tf.constant([[True,  True], [False, False]])
+  tf.reduce_any(x)  # True
+  tf.reduce_any(x, 0)  # [True, True]
+  tf.reduce_any(x, 1)  # [True, False]
   ```
 
   Args:
@@ -1617,13 +1612,12 @@ def reduce_logsumexp(input_tensor,
   For example:
 
   ```python
-  # 'x' is [[0, 0, 0]]
-  #         [0, 0, 0]]
-  tf.reduce_logsumexp(x) ==> log(6)
-  tf.reduce_logsumexp(x, 0) ==> [log(2), log(2), log(2)]
-  tf.reduce_logsumexp(x, 1) ==> [log(3), log(3)]
-  tf.reduce_logsumexp(x, 1, keep_dims=True) ==> [[log(3)], [log(3)]]
-  tf.reduce_logsumexp(x, [0, 1]) ==> log(6)
+  x = tf.constant([[0., 0., 0.], [0., 0., 0.]])
+  tf.reduce_logsumexp(x)  # log(6)
+  tf.reduce_logsumexp(x, 0)  # [log(2), log(2), log(2)]
+  tf.reduce_logsumexp(x, 1)  # [log(3), log(3)]
+  tf.reduce_logsumexp(x, 1, keep_dims=True)  # [[log(3)], [log(3)]]
+  tf.reduce_logsumexp(x, [0, 1])  # log(6)
   ```
 
   Args:
@@ -1639,12 +1633,16 @@ def reduce_logsumexp(input_tensor,
     The reduced tensor.
   """
   with ops.name_scope(name, "ReduceLogSumExp", [input_tensor]) as name:
+    raw_max = reduce_max(
+        input_tensor,
+        axis=axis,
+        reduction_indices=reduction_indices,
+        keep_dims=True)
     my_max = array_ops.stop_gradient(
-        reduce_max(
-            input_tensor,
-            axis=axis,
-            reduction_indices=reduction_indices,
-            keep_dims=True))
+        array_ops.where(
+            gen_math_ops.is_finite(raw_max),
+            raw_max,
+            array_ops.zeros_like(raw_max)))
     result = gen_math_ops.log(
         reduce_sum(
             gen_math_ops.exp(input_tensor - my_max),
@@ -1670,22 +1668,21 @@ def trace(x, name=None):
   For example:
 
   ```python
-  # 'x' is [[1, 2],
-  #         [3, 4]]
-  tf.trace(x) ==> 5
+  x = tf.constant([[1, 2], [3, 4]])
+  tf.trace(x)  # 5
 
-  # 'x' is [[1,2,3],
-  #         [4,5,6],
-  #         [7,8,9]]
-  tf.trace(x) ==> 15
+  x = tf.constant([[1, 2, 3],
+                   [4, 5, 6],
+                   [7, 8, 9]])
+  tf.trace(x)  # 15
 
-  # 'x' is [[[1,2,3],
-  #          [4,5,6],
-  #          [7,8,9]],
-  #         [[-1,-2,-3],
-  #          [-4,-5,-6],
-  #          [-7,-8,-9]]]
-  tf.trace(x) ==> [15,-15]
+  x = tf.constant([[[1, 2, 3],
+                    [4, 5, 6],
+                    [7, 8, 9]],
+                   [[-1, -2, -3],
+                    [-4, -5, -6],
+                    [-7, -8, -9]]])
+  tf.trace(x)  # [15, -15]
   ```
 
   Args:
@@ -1732,35 +1729,46 @@ def matmul(a,
 
   ```python
   # 2-D tensor `a`
-  a = tf.constant([1, 2, 3, 4, 5, 6], shape=[2, 3]) => [[1. 2. 3.]
-                                                        [4. 5. 6.]]
+  # [[1, 2, 3],
+  #  [4, 5, 6]]
+  a = tf.constant([1, 2, 3, 4, 5, 6], shape=[2, 3])
+
   # 2-D tensor `b`
-  b = tf.constant([7, 8, 9, 10, 11, 12], shape=[3, 2]) => [[7. 8.]
-                                                           [9. 10.]
-                                                           [11. 12.]]
-  c = tf.matmul(a, b) => [[58 64]
-                          [139 154]]
+  # [[ 7,  8],
+  #  [ 9, 10],
+  #  [11, 12]]
+  b = tf.constant([7, 8, 9, 10, 11, 12], shape=[3, 2])
+
+  # `a` * `b`
+  # [[ 58,  64],
+  #  [139, 154]]
+  c = tf.matmul(a, b)
 
 
   # 3-D tensor `a`
+  # [[[ 1,  2,  3],
+  #   [ 4,  5,  6]],
+  #  [[ 7,  8,  9],
+  #   [10, 11, 12]]]
   a = tf.constant(np.arange(1, 13, dtype=np.int32),
-                  shape=[2, 2, 3])                  => [[[ 1.  2.  3.]
-                                                         [ 4.  5.  6.]],
-                                                        [[ 7.  8.  9.]
-                                                         [10. 11. 12.]]]
+                  shape=[2, 2, 3])
 
   # 3-D tensor `b`
+  # [[[13, 14],
+  #   [15, 16],
+  #   [17, 18]],
+  #  [[19, 20],
+  #   [21, 22],
+  #   [23, 24]]]
   b = tf.constant(np.arange(13, 25, dtype=np.int32),
-                  shape=[2, 3, 2])                   => [[[13. 14.]
-                                                          [15. 16.]
-                                                          [17. 18.]],
-                                                         [[19. 20.]
-                                                          [21. 22.]
-                                                          [23. 24.]]]
-  c = tf.matmul(a, b) => [[[ 94 100]
-                           [229 244]],
-                          [[508 532]
-                           [697 730]]]
+                  shape=[2, 3, 2])
+
+  # `a` * `b`
+  # [[[ 94, 100],
+  #   [229, 244]],
+  #  [[508, 532],
+  #   [697, 730]]]
+  c = tf.matmul(a, b)
 
   # Since python >= 3.5 the @ operator is supported (see PEP 465).
   # In TensorFlow, it simply calls the `tf.matmul()` function, so the
@@ -1980,13 +1988,13 @@ def accumulate_n(inputs, shape=None, tensor_dtype=None, name=None):
   For example:
 
   ```python
-  # tensor 'a' is [[1, 2], [3, 4]]
-  # tensor `b` is [[5, 0], [0, 6]]
-  tf.accumulate_n([a, b, a]) ==> [[7, 4], [6, 14]]
+  a = tf.constant([[1, 2], [3, 4]])
+  b = tf.constant([[5, 0], [0, 6]])
+  tf.accumulate_n([a, b, a])  # [[7, 4], [6, 14]]
 
   # Explicitly pass shape and type
-  tf.accumulate_n([a, b, a], shape=[2, 2], tensor_dtype=tf.int32)
-    ==> [[7, 4], [6, 14]]
+  tf.accumulate_n([a, b, a], shape=[2, 2], tensor_dtype=tf.int32)  # [[7,  4],
+                                                                   #  [6, 14]]
   ```
 
   Args:
@@ -2151,21 +2159,21 @@ def cumsum(x, axis=0, exclusive=False, reverse=False, name=None):
   element of the input is identical to the first element of the output:
 
   ```python
-  tf.cumsum([a, b, c])  # => [a, a + b, a + b + c]
+  tf.cumsum([a, b, c])  # [a, a + b, a + b + c]
   ```
 
   By setting the `exclusive` kwarg to `True`, an exclusive cumsum is performed
   instead:
 
   ```python
-  tf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]
+  tf.cumsum([a, b, c], exclusive=True)  # [0, a, a + b]
   ```
 
   By setting the `reverse` kwarg to `True`, the cumsum is performed in the
   opposite direction:
 
   ```python
-  tf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]
+  tf.cumsum([a, b, c], reverse=True)  # [a + b + c, b + c, c]
   ```
 
   This is more efficient than using separate `tf.reverse` ops.
@@ -2173,7 +2181,7 @@ def cumsum(x, axis=0, exclusive=False, reverse=False, name=None):
   The `reverse` and `exclusive` kwargs can also be combined:
 
   ```python
-  tf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]
+  tf.cumsum([a, b, c], exclusive=True, reverse=True)  # [b + c, c, 0]
   ```
 
   Args:
@@ -2202,7 +2210,7 @@ def cumprod(x, axis=0, exclusive=False, reverse=False, name=None):
   first element of the input is identical to the first element of the output:
 
   ```python
-  tf.cumprod([a, b, c])  # => [a, a * b, a * b * c]
+  tf.cumprod([a, b, c])  # [a, a * b, a * b * c]
   ```
 
   By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
@@ -2210,21 +2218,21 @@ def cumprod(x, axis=0, exclusive=False, reverse=False, name=None):
   instead:
 
   ```python
-  tf.cumprod([a, b, c], exclusive=True)  # => [1, a, a * b]
+  tf.cumprod([a, b, c], exclusive=True)  # [1, a, a * b]
   ```
 
   By setting the `reverse` kwarg to `True`, the cumprod is performed in the
   opposite direction:
 
   ```python
-  tf.cumprod([a, b, c], reverse=True)  # => [a * b * c, b * c, c]
+  tf.cumprod([a, b, c], reverse=True)  # [a * b * c, b * c, c]
   ```
 
   This is more efficient than using separate `tf.reverse` ops.
   The `reverse` and `exclusive` kwargs can also be combined:
 
   ```python
-  tf.cumprod([a, b, c], exclusive=True, reverse=True)  # => [b * c, c, 1]
+  tf.cumprod([a, b, c], exclusive=True, reverse=True)  # [b * c, c, 1]
   ```
 
   Args:
@@ -2448,6 +2456,10 @@ def tensordot(a, b, axes, name=None):
         raise ValueError("'axes' must be an integer or have length 2.")
       a_axes = axes[0]
       b_axes = axes[1]
+      if isinstance(a_axes, compat.integral_types) and \
+          isinstance(b_axes, compat.integral_types):
+        a_axes = [a_axes]
+        b_axes = [b_axes]
       if len(a_axes) != len(b_axes):
         raise ValueError(
             "Different number of contraction axes 'a' and 'b', %s != %s.",
diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py
index 617d2305bd8..46a57924745 100644
--- a/tensorflow/python/ops/math_ops_test.py
+++ b/tensorflow/python/ops/math_ops_test.py
@@ -134,6 +134,11 @@ class LogSumExpTest(test_util.TensorFlowTestCase):
         y_np = log(np.sum(exp(x_np - max_np))) + max_np
         self.assertAllClose(y_tf_np, y_np)
 
+  def testInfinity(self):
+    """Checks that `reduce_logsumexp` of -inf evaluates to -inf."""
+    with self.test_session(use_gpu=True):
+      self.assertEqual(-np.inf, math_ops.reduce_logsumexp(-np.inf).eval())
+
 
 class RoundTest(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/python/ops/matmul_benchmark.py b/tensorflow/python/ops/matmul_benchmark.py
index b777ace9d0c..f95cf08de1a 100644
--- a/tensorflow/python/ops/matmul_benchmark.py
+++ b/tensorflow/python/ops/matmul_benchmark.py
@@ -47,7 +47,7 @@ def build_graph(device, n, m, k, transpose_a, transpose_b, dtype):
   Returns:
     A matmul operation to run()
   """
-  with ops.device('/%s:0' % device):
+  with ops.device('%s' % device):
     if not transpose_a:
       x = variables.Variable(random_ops.random_uniform([n, m], dtype=dtype))
     else:
@@ -112,7 +112,7 @@ class MatmulBenchmark(test.Benchmark):
     return duration
 
   def run_test_gpu(self, n, m, k, transpose_a, transpose_b, dtype, num_iters):
-    self.run_graph('gpu', n, m, k, transpose_a, transpose_b, num_iters, dtype)
+    self.run_graph(test.gpu_device_name(), n, m, k, transpose_a, transpose_b, num_iters, dtype)
 
   def test_round(self, num_iters):
     dtypes = [np.float32, np.float64]
diff --git a/tensorflow/python/ops/matmul_benchmark_test.py b/tensorflow/python/ops/matmul_benchmark_test.py
index a7914dba787..5a9c0a7a495 100644
--- a/tensorflow/python/ops/matmul_benchmark_test.py
+++ b/tensorflow/python/ops/matmul_benchmark_test.py
@@ -71,37 +71,39 @@ class MatmulBenchmarkTest(googletest.TestCase):
   def _VerifyBuildGraph(self, n, m, k, transpose_a, transpose_b, dtype):
     graph = ops.Graph()
     with graph.as_default():
-      matmul_benchmark.build_graph("gpu", n, m, k, transpose_a, transpose_b,
+      matmul_benchmark.build_graph(googletest.gpu_device_name(), n, m, k, transpose_a, transpose_b,
                                    dtype)
       gd = graph.as_graph_def()
-      self.assertProtoEquals("""
-      node { name: "random_uniform/shape" op: "Const" device: "/device:GPU:0" }
-      node { name: "random_uniform/min" op: "Const" device: "/device:GPU:0" }
-      node { name: "random_uniform/max" op: "Const" device: "/device:GPU:0" }
-      node { name: "random_uniform/RandomUniform" op: "RandomUniform" input: "random_uniform/shape" device: "/device:GPU:0" }
-      node { name: "random_uniform/sub" op: "Sub" input: "random_uniform/max" input: "random_uniform/min" device: "/device:GPU:0" }
-      node { name: "random_uniform/mul" op: "Mul" input: "random_uniform/RandomUniform" input: "random_uniform/sub" device: "/device:GPU:0" }
-      node { name: "random_uniform" op: "Add" input: "random_uniform/mul" input: "random_uniform/min" device: "/device:GPU:0" }
-      node { name: "Variable" op: "VariableV2" device: "/device:GPU:0" }
-      node { name: "Variable/Assign" op: "Assign" input: "Variable" input: "random_uniform" device: "/device:GPU:0" }
-      node { name: "Variable/read" op: "Identity" input: "Variable" device: "/device:GPU:0" }
-      node { name: "random_uniform_1/shape" op: "Const" device: "/device:GPU:0" }
-      node { name: "random_uniform_1/min" op: "Const" device: "/device:GPU:0" }
-      node { name: "random_uniform_1/max" op: "Const" device: "/device:GPU:0" }
-      node { name: "random_uniform_1/RandomUniform" op: "RandomUniform" input: "random_uniform_1/shape" device: "/device:GPU:0" }
-      node { name: "random_uniform_1/sub" op: "Sub" input: "random_uniform_1/max" input: "random_uniform_1/min" device: "/device:GPU:0" }
-      node { name: "random_uniform_1/mul" op: "Mul" input: "random_uniform_1/RandomUniform" input: "random_uniform_1/sub" device: "/device:GPU:0" }
-      node { name: "random_uniform_1" op: "Add" input: "random_uniform_1/mul" input: "random_uniform_1/min" device: "/device:GPU:0" }
-      node { name: "Variable_1" op: "VariableV2" device: "/device:GPU:0" }
-      node { name: "Variable_1/Assign" op: "Assign" input: "Variable_1" input: "random_uniform_1" device: "/device:GPU:0" }
-      node { name: "Variable_1/read" op: "Identity" input: "Variable_1" device: "/device:GPU:0" }
-      node { name: "MatMul" op: "MatMul" input: "Variable/read" input: "Variable_1/read" device: "/device:GPU:0" }
-      node { name: "group_deps" op: "NoOp" input: "^MatMul" device: "/device:GPU:0" }
-                             """, self._StripGraph(gd))
+      dev=googletest.gpu_device_name()
+      proto_expected = """
+      node { name: "random_uniform/shape" op: "Const" device: \""""+ dev +"""\" }
+      node { name: "random_uniform/min" op: "Const" device: \""""+ dev +"""\" }
+      node { name: "random_uniform/max" op: "Const" device: \""""+ dev +"""\" }
+      node { name: "random_uniform/RandomUniform" op: "RandomUniform" input: "random_uniform/shape" device: \""""+ dev +"""\" }
+      node { name: "random_uniform/sub" op: "Sub" input: "random_uniform/max" input: "random_uniform/min" device: \""""+ dev +"""\" }
+      node { name: "random_uniform/mul" op: "Mul" input: "random_uniform/RandomUniform" input: "random_uniform/sub" device: \""""+ dev +"""\" }
+      node { name: "random_uniform" op: "Add" input: "random_uniform/mul" input: "random_uniform/min" device: \""""+ dev +"""\" }
+      node { name: "Variable" op: "VariableV2" device: \""""+ dev +"""\" }
+      node { name: "Variable/Assign" op: "Assign" input: "Variable" input: "random_uniform" device: \""""+ dev +"""\" }
+      node { name: "Variable/read" op: "Identity" input: "Variable" device: \""""+ dev +"""\" }
+      node { name: "random_uniform_1/shape" op: "Const" device: \""""+ dev +"""\" }
+      node { name: "random_uniform_1/min" op: "Const" device: \""""+ dev +"""\" }
+      node { name: "random_uniform_1/max" op: "Const" device: \""""+ dev +"""\" }
+      node { name: "random_uniform_1/RandomUniform" op: "RandomUniform" input: "random_uniform_1/shape" device: \""""+ dev +"""\" }
+      node { name: "random_uniform_1/sub" op: "Sub" input: "random_uniform_1/max" input: "random_uniform_1/min" device: \""""+ dev +"""\" }
+      node { name: "random_uniform_1/mul" op: "Mul" input: "random_uniform_1/RandomUniform" input: "random_uniform_1/sub" device: \""""+ dev +"""\" }
+      node { name: "random_uniform_1" op: "Add" input: "random_uniform_1/mul" input: "random_uniform_1/min" device: \""""+ dev +"""\" }
+      node { name: "Variable_1" op: "VariableV2" device: \""""+ dev +"""\" }
+      node { name: "Variable_1/Assign" op: "Assign" input: "Variable_1" input: "random_uniform_1" device: \""""+ dev +"""\" }
+      node { name: "Variable_1/read" op: "Identity" input: "Variable_1" device: \""""+ dev +"""\" }
+      node { name: "MatMul" op: "MatMul" input: "Variable/read" input: "Variable_1/read" device: \""""+ dev +"""\" }
+      node { name: "group_deps" op: "NoOp" input: "^MatMul" device: \""""+ dev +"""\" }
+                       """
+      self.assertProtoEquals(str(proto_expected), self._StripGraph(gd))
 
   def _VerifyRunGraph(self, n, m, k, transpose_a, transpose_b, dtype):
     benchmark_instance = matmul_benchmark.MatmulBenchmark()
-    duration = benchmark_instance.run_graph("gpu", n, m, k, transpose_a,
+    duration = benchmark_instance.run_graph(googletest.gpu_device_name(), n, m, k, transpose_a,
                                             transpose_b, 1, dtype)
     self.assertTrue(duration > 1e-6)
 
diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py
index 094757dac9f..de302a22712 100644
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@@ -541,6 +541,19 @@ def _MaxPoolGrad(op, grad):
                                    data_format=op.get_attr("data_format"))
 
 
+@ops.RegisterGradient("MaxPoolV2")
+def _MaxPoolGradV2(op, grad):
+  """Gradient for `MaxPoolV2`: a tensor for input 0, `None` for the rest."""
+  input_grad = gen_nn_ops.max_pool_grad_v2(
+      op.inputs[0], op.outputs[0], grad,
+      op.inputs[1],  # ksize
+      op.inputs[2],  # strides
+      padding=op.get_attr("padding"),
+      data_format=op.get_attr("data_format"))
+  # The ksize and strides inputs (inputs 1 and 2) receive no gradient.
+  return input_grad, None, None
+
+
 @ops.RegisterGradient("MaxPoolWithArgmax")
 def _MaxPoolGradWithArgmax(op, grad, unused_argmax_grad):
   return gen_nn_ops._max_pool_grad_with_argmax(op.inputs[0],
@@ -567,6 +580,24 @@ def _MaxPoolGradGrad(op, grad):
               data_format=op.get_attr("data_format")))
 
 
+@ops.RegisterGradient("MaxPoolGradV2")
+def _MaxPoolGradGradV2(op, grad):
+  """Gradient for `MaxPoolGradV2`: zeros for inputs 0 and 1, a tensor for 2."""
+  in0, in1 = op.inputs[0], op.inputs[1]
+  # Zero tensors matching the shape and dtype of the first two inputs.
+  zeros0 = array_ops.zeros(shape=array_ops.shape(in0), dtype=in0.dtype)
+  zeros1 = array_ops.zeros(shape=array_ops.shape(in1), dtype=in1.dtype)
+  grad_grad = gen_nn_ops.max_pool_grad_grad_v2(
+      in0,
+      in1,
+      grad,
+      op.inputs[3],  # ksize
+      op.inputs[4],  # strides
+      padding=op.get_attr("padding"),
+      data_format=op.get_attr("data_format"))
+  return zeros0, zeros1, grad_grad, None, None
+
+
 @ops.RegisterGradient("MaxPoolGradGrad")
 def _MaxPoolGradGradGrad(op, grad):
   return (array_ops.zeros(
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index 98ede2031bc..53b8996c0c5 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -417,7 +417,7 @@ def separable_conv2d(input,
 
   In detail,
 
-      output[b, i, j, k] = sum_{di, dj, q, r]
+      output[b, i, j, k] = sum_{di, dj, q, r}
           input[b, strides[1] * i + di, strides[2] * j + dj, q] *
           depthwise_filter[di, dj, q, r] *
           pointwise_filter[0, 0, q * channel_multiplier + r, k]
diff --git a/tensorflow/python/ops/parsing_ops.py b/tensorflow/python/ops/parsing_ops.py
index 0071e7d868f..06eae123ab2 100644
--- a/tensorflow/python/ops/parsing_ops.py
+++ b/tensorflow/python/ops/parsing_ops.py
@@ -417,7 +417,7 @@ def parse_example(serialized, features, name=None, example_names=None):
 
   then the output will look like:
 
-  ```
+  ```python
   {"ft": SparseTensor(indices=[[0, 0], [0, 1], [2, 0]],
                       values=[1.0, 2.0, 3.0],
                       dense_shape=(3, 2)) }
@@ -426,7 +426,7 @@ def parse_example(serialized, features, name=None, example_names=None):
   If instead a `FixedLenSequenceFeature` with `default_value = -1.0` and
   `shape=[]` is used then the output will look like:
 
-  ```
+  ```python
   {"ft": [[1.0, 2.0], [3.0, -1.0]]}
   ```
 
diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py
index 932727c17dc..b174956e604 100644
--- a/tensorflow/python/ops/rnn.py
+++ b/tensorflow/python/ops/rnn.py
@@ -294,10 +294,6 @@ def _reverse_seq(input_seq, lengths):
     # Join into (time, batch_size, depth)
     s_joined = array_ops.stack(sequence)
 
-    # TODO(schuster, ebrevdo): Remove cast when reverse_sequence takes int32
-    if lengths is not None:
-      lengths = math_ops.to_int64(lengths)
-
     # Reverse along dimension 0
     s_reversed = array_ops.reverse_sequence(s_joined, lengths, 0, 1)
     # Split again into list
diff --git a/tensorflow/python/ops/special_math_ops.py b/tensorflow/python/ops/special_math_ops.py
index b561203bb47..87561cff92a 100644
--- a/tensorflow/python/ops/special_math_ops.py
+++ b/tensorflow/python/ops/special_math_ops.py
@@ -82,7 +82,7 @@ def lbeta(x, name='lbeta'):
     return result
 
 
-def einsum(equation, *inputs):
+def einsum(equation, *inputs, **kwargs):
   """A generalized contraction between tensors of arbitrary dimension.
 
   This function returns a tensor whose elements are defined by `equation`,
@@ -138,6 +138,7 @@ def einsum(equation, *inputs):
       `numpy.einsum`.
     *inputs: the inputs to contract (each one a `Tensor`), whose shapes should
       be consistent with `equation`.
+    name: A name for the operation (optional).
 
   Returns:
     The contracted `Tensor`, with shape determined by `equation`.
@@ -151,70 +152,76 @@ def einsum(equation, *inputs):
         indices in its subscript, or
       - the input shapes are inconsistent along a particular axis.
   """
-  if '...' in equation:
-    raise ValueError('Subscripts with ellipses are not yet supported.')
+  name = kwargs.pop("name", None)
+  if kwargs:
+    raise TypeError("invalid keyword arguments for this function: " +
+                    ", ".join([format(key)
+                               for key in sorted(list(kwargs.keys()))]))
+  with ops.name_scope(name, "einsum", [equation, inputs]) as name:
+    if '...' in equation:
+      raise ValueError('Subscripts with ellipses are not yet supported.')
 
-  match = re.match('([a-z,]+)(->[a-z]*)?', equation)
-  if not match:
-    raise ValueError(
-        'Indices have incorrect format: %s' % equation
-    )
+    match = re.match('([a-z,]+)(->[a-z]*)?', equation)
+    if not match:
+      raise ValueError(
+          'Indices have incorrect format: %s' % equation
+      )
 
-  inputs = list(inputs)
-  input_axis_labels = match.group(1).split(',')
+    inputs = list(inputs)
+    input_axis_labels = match.group(1).split(',')
 
-  if len(inputs) != len(input_axis_labels):
-    raise ValueError('Got %d arguments for equation "%s", expecting %d' % (
-        len(inputs), equation, len(input_axis_labels)))
+    if len(inputs) != len(input_axis_labels):
+      raise ValueError('Got %d arguments for equation "%s", expecting %d' % (
+          len(inputs), equation, len(input_axis_labels)))
 
-  axis_labels = set(''.join(input_axis_labels))
-  if match.group(2):
-    output_axis_labels = match.group(2)[2:]
-  else:
-    # infer the output subscripts if not given, assume alphabetical order
-    indices = ''.join(sorted(axis_labels))
-    counts = {ax: 0 for ax in indices}
-    for axes_ in input_axis_labels:
-      for ax in axes_:
-        counts[ax] += 1
+    axis_labels = set(''.join(input_axis_labels))
+    if match.group(2):
+      output_axis_labels = match.group(2)[2:]
+    else:
+      # infer the output subscripts if not given, assume alphabetical order
+      indices = ''.join(sorted(axis_labels))
+      counts = {ax: 0 for ax in indices}
+      for axes_ in input_axis_labels:
+        for ax in axes_:
+          counts[ax] += 1
 
-    output_axis_labels = ''.join(sorted(
-        ax for ax in indices
-        if counts[ax] == 1
-    ))
+      output_axis_labels = ''.join(sorted(
+          ax for ax in indices
+          if counts[ax] == 1
+      ))
 
-  for a in axis_labels:
-    input_count = sum(1 for s in input_axis_labels if a in s)
-    if input_count > 2 and a not in output_axis_labels:
-      logging.warn(
-          'Falling back to exponential-space implementation of einsum() because'
-          ' index "%s" is summed over more than two inputs.', a)
-      return _exponential_space_einsum(equation, *inputs)
+    for a in axis_labels:
+      input_count = sum(1 for s in input_axis_labels if a in s)
+      if input_count > 2 and a not in output_axis_labels:
+        logging.warn(
+            'Falling back to exponential-space implementation of einsum() because'
+            ' index "%s" is summed over more than two inputs.', a)
+        return _exponential_space_einsum(equation, *inputs)
 
-  temp = inputs[0]
-  temp_axis_labels = input_axis_labels[0]
-  for i in xrange(len(inputs)-1):
-    axes_to_sum = (set(temp_axis_labels) & set(input_axis_labels[i+1])
-                   - set(output_axis_labels))
-    temp, temp_axis_labels = _einsum_reduction(temp,
-                                               temp_axis_labels,
-                                               inputs[i+1],
-                                               input_axis_labels[i+1],
-                                               axes_to_sum)
+    temp = inputs[0]
+    temp_axis_labels = input_axis_labels[0]
+    for i in xrange(len(inputs)-1):
+      axes_to_sum = (set(temp_axis_labels) & set(input_axis_labels[i+1])
+                     - set(output_axis_labels))
+      temp, temp_axis_labels = _einsum_reduction(temp,
+                                                 temp_axis_labels,
+                                                 inputs[i+1],
+                                                 input_axis_labels[i+1],
+                                                 axes_to_sum)
 
-  missing_indices = set(temp_axis_labels) - set(output_axis_labels)
-  if missing_indices:
-    reduction_indices = [i for i, a in enumerate(temp_axis_labels)
-                         if a not in output_axis_labels]
-    temp = math_ops.reduce_sum(temp, reduction_indices=reduction_indices)
-    temp_axis_labels = ''.join(a for a in temp_axis_labels
-                               if a in output_axis_labels)
+    missing_indices = set(temp_axis_labels) - set(output_axis_labels)
+    if missing_indices:
+      reduction_indices = [i for i, a in enumerate(temp_axis_labels)
+                           if a not in output_axis_labels]
+      temp = math_ops.reduce_sum(temp, reduction_indices=reduction_indices)
+      temp_axis_labels = ''.join(a for a in temp_axis_labels
+                                 if a in output_axis_labels)
 
-  if sorted(temp_axis_labels) != sorted(output_axis_labels):
-    raise ValueError('Invalid equation: %s' % equation)
+    if sorted(temp_axis_labels) != sorted(output_axis_labels):
+      raise ValueError('Invalid equation: %s' % equation)
 
-  perm = [temp_axis_labels.index(a) for a in output_axis_labels]
-  return _transpose_if_necessary(temp, perm)
+    perm = [temp_axis_labels.index(a) for a in output_axis_labels]
+    return _transpose_if_necessary(temp, perm)
 
 
 def _einsum_reduction(t0, t0_axis_labels, t1, t1_axis_labels, axes_to_sum):
diff --git a/tensorflow/python/ops/special_math_ops_test.py b/tensorflow/python/ops/special_math_ops_test.py
index 13cd9b7ba44..6581e9f9225 100644
--- a/tensorflow/python/ops/special_math_ops_test.py
+++ b/tensorflow/python/ops/special_math_ops_test.py
@@ -242,6 +242,14 @@ class EinsumTest(test.TestCase):
       with self.assertRaises(ValueError):
         _ = special_math_ops.einsum(axes, *inputs)
 
+  def test_invalid_keyword_arguments(self):
+    lhs = array_ops.placeholder(dtypes.int32, shape=(1, None))
+    rhs = array_ops.placeholder(dtypes.int32, shape=(None, 1))
+    msg = 'invalid keyword arguments for this function: invalid1, invalid2'
+    with self.assertRaisesRegexp(TypeError, msg):  # `name` itself is valid.
+      special_math_ops.einsum('ij,jk->ik', lhs, rhs, name="name",
+                              invalid1="value1", invalid2="value2")
+
   def test_dim_mismatch(self):
     for axes, input_shapes in self.dim_mismatch_cases:
       inputs = [
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index a706193a40b..f0d2b8bf8c9 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -1198,7 +1198,7 @@ class PartitionedVariable(object):
         "assign() has not been implemented for PartitionedVariable.")
 
 
-def global_variables():
+def global_variables(scope=None):
   """Returns global variables.
 
   Global variables are variables that are shared across machines in a
@@ -1210,10 +1210,17 @@ def global_variables():
   An alternative to global variables are local variables. See
   @{tf.local_variables}
 
+  Args:
+    scope: (Optional.) A string. If supplied, the resulting list is filtered
+      to include only items whose `name` attribute matches `scope` using
+      `re.match`. Items without a `name` attribute are never returned if a
+      scope is supplied. The choice of `re.match` means that a `scope` without
+      special tokens filters by prefix.
+
   Returns:
     A list of `Variable` objects.
   """
-  return ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+  return ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES, scope)
 
 
 @deprecated("2017-03-02", "Please use tf.global_variables instead.")
@@ -1222,18 +1229,25 @@ def all_variables():
   return global_variables()
 
 
-def _all_saveable_objects():
+def _all_saveable_objects(scope=None):
   """Returns all variables and `SaveableObject`s that must be checkpointed.
 
+  Args:
+    scope: (Optional.) A string. If supplied, the resulting list is filtered
+      to include only items whose `name` attribute matches `scope` using
+      `re.match`. Items without a `name` attribute are never returned if a
+      scope is supplied. The choice of `re.match` means that a `scope` without
+      special tokens filters by prefix.
+
   Returns:
     A list of `Variable` and `SaveableObject` to be checkpointed
   """
   # TODO(andreasst): make this function public once things are settled.
-  return (ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) +
-          ops.get_collection(ops.GraphKeys.SAVEABLE_OBJECTS))
+  return (ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES, scope) +
+          ops.get_collection(ops.GraphKeys.SAVEABLE_OBJECTS, scope))
 
 
-def local_variables():
+def local_variables(scope=None):
   """Returns local variables.
 
   Local variables - per process variables, usually not saved/restored to
@@ -1247,22 +1261,36 @@ def local_variables():
   An alternative to local variables are global variables. See
   @{tf.global_variables}
 
+  Args:
+    scope: (Optional.) A string. If supplied, the resulting list is filtered
+      to include only items whose `name` attribute matches `scope` using
+      `re.match`. Items without a `name` attribute are never returned if a
+      scope is supplied. The choice of `re.match` means that a `scope` without
+      special tokens filters by prefix.
+
   Returns:
     A list of local `Variable` objects.
   """
-  return ops.get_collection(ops.GraphKeys.LOCAL_VARIABLES)
+  return ops.get_collection(ops.GraphKeys.LOCAL_VARIABLES, scope)
 
 
-def model_variables():
+def model_variables(scope=None):
   """Returns all variables in the MODEL_VARIABLES collection.
 
+  Args:
+    scope: (Optional.) A string. If supplied, the resulting list is filtered
+      to include only items whose `name` attribute matches `scope` using
+      `re.match`. Items without a `name` attribute are never returned if a
+      scope is supplied. The choice of `re.match` means that a `scope` without
+      special tokens filters by prefix.
+
   Returns:
     A list of local Variable objects.
   """
-  return ops.get_collection(ops.GraphKeys.MODEL_VARIABLES)
+  return ops.get_collection(ops.GraphKeys.MODEL_VARIABLES, scope)
 
 
-def trainable_variables():
+def trainable_variables(scope=None):
   """Returns all variables created with `trainable=True`.
 
   When passed `trainable=True`, the `Variable()` constructor automatically
@@ -1270,13 +1298,20 @@ def trainable_variables():
   `GraphKeys.TRAINABLE_VARIABLES`. This convenience function returns the
   contents of that collection.
 
+  Args:
+    scope: (Optional.) A string. If supplied, the resulting list is filtered
+      to include only items whose `name` attribute matches `scope` using
+      `re.match`. Items without a `name` attribute are never returned if a
+      scope is supplied. The choice of `re.match` means that a `scope` without
+      special tokens filters by prefix.
+
   Returns:
     A list of Variable objects.
   """
-  return ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+  return ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES, scope)
 
 
-def moving_average_variables():
+def moving_average_variables(scope=None):
   """Returns all variables that maintain their moving averages.
 
   If an `ExponentialMovingAverage` object is created and the `apply()`
@@ -1284,10 +1319,17 @@ def moving_average_variables():
   be added to the `GraphKeys.MOVING_AVERAGE_VARIABLES` collection.
   This convenience function returns the contents of that collection.
 
+  Args:
+    scope: (Optional.) A string. If supplied, the resulting list is filtered
+      to include only items whose `name` attribute matches `scope` using
+      `re.match`. Items without a `name` attribute are never returned if a
+      scope is supplied. The choice of `re.match` means that a `scope` without
+      special tokens filters by prefix.
+
   Returns:
     A list of Variable objects.
   """
-  return ops.get_collection(ops.GraphKeys.MOVING_AVERAGE_VARIABLES)
+  return ops.get_collection(ops.GraphKeys.MOVING_AVERAGE_VARIABLES, scope)
 
 
 def variables_initializer(var_list, name="init"):
diff --git a/tensorflow/python/profiler/internal/run_metadata_test.py b/tensorflow/python/profiler/internal/run_metadata_test.py
index 62b2314aea0..b758edf87ef 100644
--- a/tensorflow/python/profiler/internal/run_metadata_test.py
+++ b/tensorflow/python/profiler/internal/run_metadata_test.py
@@ -97,21 +97,22 @@ class RunMetadataTest(test.TestCase):
     if not test.is_gpu_available(cuda_only=True):
       return
 
+    gpu_dev = test.gpu_device_name()
     ops.reset_default_graph()
-    with ops.device('/gpu:0'):
+    with ops.device(gpu_dev):
       tfprof_node, run_meta = _run_model()
       self.assertEqual(tfprof_node.children[0].name, 'MatMul')
       self.assertGreater(tfprof_node.children[0].exec_micros, 10)
 
     ret = _extract_node(run_meta, ['MatMul', 'MatMul:MatMul'])
     self.assertEqual(len(ret), 3)
-    self.assertTrue('/job:localhost/replica:0/task:0/gpu:0' in ret)
-    del ret['/job:localhost/replica:0/task:0/gpu:0']
+    self.assertTrue('/job:localhost/replica:0/task:0' + gpu_dev in ret)
+    del ret['/job:localhost/replica:0/task:0' + gpu_dev]
 
     has_all_stream = False
     for k, _ in six.iteritems(ret):
-      self.assertTrue('gpu:0/stream' in k)
-      if 'gpu:0/stream:all' in k:
+      self.assertTrue(gpu_dev + '/stream' in k)
+      if gpu_dev + '/stream:all' in k:
         has_all_stream = True
     self.assertTrue(has_all_stream)
 
@@ -159,24 +160,24 @@ class RunMetadataTest(test.TestCase):
       return
 
     ops.reset_default_graph()
-    with ops.device('/gpu:0'):
+    with ops.device('/device:GPU:0'):
       tfprof_node, run_meta = _run_loop_model()
       # The while-loop caused a node to appear 4 times in scheduling.
       ret = _extract_node(run_meta,
                           'rnn/while/rnn/basic_rnn_cell/basic_rnn_cell/MatMul')
-      self.assertEqual(len(ret['/job:localhost/replica:0/task:0/gpu:0']), 4)
+      self.assertEqual(len(ret['/job:localhost/replica:0/task:0/device:GPU:0']), 4)
 
       total_cpu_execs = 0
-      for node in ret['/job:localhost/replica:0/task:0/gpu:0']:
+      for node in ret['/job:localhost/replica:0/task:0/device:GPU:0']:
         total_cpu_execs += node.op_end_rel_micros
 
       ret = _extract_node(
           run_meta,
           'rnn/while/rnn/basic_rnn_cell/basic_rnn_cell/MatMul:MatMul')
-      self.assertGreaterEqual(len(ret['/gpu:0/stream:all']), 4)
+      self.assertGreaterEqual(len(ret['/device:GPU:0/stream:all']), 4)
 
       total_accelerator_execs = 0
-      for node in ret['/gpu:0/stream:all']:
+      for node in ret['/device:GPU:0/stream:all']:
         total_accelerator_execs += node.op_end_rel_micros
 
       mm_node = lib.SearchTFProfNode(
diff --git a/tensorflow/python/profiler/option_builder.py b/tensorflow/python/profiler/option_builder.py
index e2e022425dd..502fc49bb62 100644
--- a/tensorflow/python/profiler/option_builder.py
+++ b/tensorflow/python/profiler/option_builder.py
@@ -315,7 +315,7 @@ class ProfileOptionBuilder(object):
     """Selectively counting statistics based on node types.
 
     Here, 'types' means the profiler nodes' properties. Profiler by default
-    consider device name (e.g. /job:xx/.../gpu:0) and operation type
+    consider device name (e.g. /job:xx/.../device:GPU:0) and operation type
     (e.g. MatMul) as profiler nodes' properties. User can also associate
     customized 'types' to profiler nodes through OpLogProto proto.
 
diff --git a/tensorflow/python/summary/writer/writer.py b/tensorflow/python/summary/writer/writer.py
index 8ce49d623d8..bd465335724 100644
--- a/tensorflow/python/summary/writer/writer.py
+++ b/tensorflow/python/summary/writer/writer.py
@@ -336,6 +336,14 @@ class FileWriter(SummaryToEventTransformer):
                                    filename_suffix)
     super(FileWriter, self).__init__(event_writer, graph, graph_def)
 
+  def __enter__(self):
+    """Enters a `with` block, returning this writer as the `as` target."""
+    return self
+
+  def __exit__(self, unused_type, unused_value, unused_traceback):
+    """Closes the writer when the `with` block exits, even on an exception."""
+    self.close()
+
   def get_logdir(self):
     """Returns the directory where event file will be written."""
     return self.event_writer.get_logdir()
diff --git a/tensorflow/python/summary/writer/writer_test.py b/tensorflow/python/summary/writer/writer_test.py
index 56629eb9166..9d3e20e408a 100644
--- a/tensorflow/python/summary/writer/writer_test.py
+++ b/tensorflow/python/summary/writer/writer_test.py
@@ -267,6 +267,13 @@ class SummaryWriterTestCase(test.TestCase):
     sw.close()
     self._assertRecent(time_before_close)
 
+  def testWithStatement(self):
+    test_dir = self._CleanTestDir("with_statement")
+    with writer.FileWriter(test_dir) as sw:
+      sw.add_session_log(event_pb2.SessionLog(status=SessionLog.START), 1)
+    event_paths = sorted(glob.glob(os.path.join(test_dir, "event*")))
+    self.assertEqual(1, len(event_paths))  # exactly one event file on disk
+
   # Checks that values returned from session Run() calls are added correctly to
   # summaries.  These are numpy types so we need to check they fit in the
   # protocol buffers correctly.
diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py
index 9075a707a2e..d2caf2aac5e 100644
--- a/tensorflow/python/tools/saved_model_cli.py
+++ b/tensorflow/python/tools/saved_model_cli.py
@@ -646,6 +646,8 @@ def create_parser():
 def main():
   parser = create_parser()
   args = parser.parse_args()
+  if not hasattr(args, 'func'):  # presumably set by the sub-command
+    parser.error("too few arguments")
   args.func(args)
 
 
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index 4f1237f3a21..f5b5c728ff7 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -575,7 +575,7 @@ class Optimizer(object):
       grad: A `Tensor`.
       var: A `Variable` object.
 
-    Return:
+    Returns:
       An `Operation`.
     """
     raise NotImplementedError()
@@ -688,7 +688,7 @@ class Optimizer(object):
       grad: `IndexedSlices`, with no repeated indices.
       var: A `Variable` object.
 
-    Return:
+    Returns:
       An `Operation`.
     """
     raise NotImplementedError()
diff --git a/tensorflow/python/training/sync_replicas_optimizer.py b/tensorflow/python/training/sync_replicas_optimizer.py
index f1830bd3fcf..dcf14408c77 100644
--- a/tensorflow/python/training/sync_replicas_optimizer.py
+++ b/tensorflow/python/training/sync_replicas_optimizer.py
@@ -127,7 +127,7 @@ class SyncReplicasOptimizer(optimizer.Optimizer):
 
   To use SyncReplicasOptimizer with an `Estimator`, you need to send
   sync_replicas_hook while calling the fit.
-  ```
+  ```python
   my_estimator = DNNClassifier(..., optimizer=opt)
   my_estimator.fit(..., hooks=[sync_replicas_hook])
   ```
diff --git a/tensorflow/stream_executor/BUILD b/tensorflow/stream_executor/BUILD
index 00faccced6f..b1d5cbeaf58 100644
--- a/tensorflow/stream_executor/BUILD
+++ b/tensorflow/stream_executor/BUILD
@@ -46,6 +46,10 @@ cc_library(
             exclude = ["cuda/cuda_platform_id.cc"],
         ),
     ),
+    copts = select({
+        "//tensorflow:windows": ["/DNOGDI"],
+        "//conditions:default": [],
+    }),
     linkopts = select({
         "//tensorflow:freebsd": [],
         "//conditions:default": ["-ldl"],
diff --git a/tensorflow/stream_executor/host/host_stream.h b/tensorflow/stream_executor/host/host_stream.h
index 9894d17febc..e22f49b1e67 100644
--- a/tensorflow/stream_executor/host/host_stream.h
+++ b/tensorflow/stream_executor/host/host_stream.h
@@ -48,7 +48,7 @@ class HostStream : public internal::StreamInterface {
 
   mutex mu_;
   int pending_tasks_ GUARDED_BY(mu_) = 0;
-  condition_variable completion_condition_;
+  ConditionVariableForMutex completion_condition_;
 };
 
 }  // namespace host
diff --git a/tensorflow/stream_executor/platform/default/mutex.h b/tensorflow/stream_executor/platform/default/mutex.h
index f28a2c9318e..ac2f123d5c1 100644
--- a/tensorflow/stream_executor/platform/default/mutex.h
+++ b/tensorflow/stream_executor/platform/default/mutex.h
@@ -42,8 +42,10 @@ enum ConditionResult { kCond_Timeout, kCond_MaybeNotified };
 
 #ifdef STREAM_EXECUTOR_USE_SHARED_MUTEX
 typedef std::shared_timed_mutex BaseMutex;
+typedef std::condition_variable_any ConditionVariableForMutex;
 #else
 typedef std::mutex BaseMutex;
+typedef std::condition_variable ConditionVariableForMutex;
 #endif
 
 // A class that wraps around the std::mutex implementation, only adding an
@@ -82,7 +84,7 @@ typedef mutex_lock shared_lock;
 using std::condition_variable;
 
 inline ConditionResult WaitForMilliseconds(mutex_lock* mu,
-                                           condition_variable* cv, int64 ms) {
+                                           ConditionVariableForMutex* cv, int64 ms) {
   std::cv_status s = cv->wait_for(*mu, std::chrono::milliseconds(ms));
   return (s == std::cv_status::timeout) ? kCond_Timeout : kCond_MaybeNotified;
 }
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 902de7bb7d9..30068ec42d7 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -155,6 +155,7 @@ WIN_COPTS = [
     "/Iexternal/gemmlowp",
     "/wd4018", # -Wno-sign-compare
     "/U_HAS_EXCEPTIONS", "/D_HAS_EXCEPTIONS=1", "/EHsc", # -fno-exceptions
+    "/DNOGDI",
 ]
 
 # LINT.IfChange
diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt
index fff201b2df5..7f67701e158 100644
--- a/tensorflow/tools/api/golden/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.pbtxt
@@ -910,7 +910,7 @@ tf_module {
   }
   member_method {
     name: "einsum"
-    argspec: "args=[\'equation\'], varargs=inputs, keywords=None, defaults=None"
+    argspec: "args=[\'equation\'], varargs=inputs, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "encode_base64"
@@ -1070,7 +1070,7 @@ tf_module {
   }
   member_method {
     name: "global_variables"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "global_variables_initializer"
@@ -1226,7 +1226,7 @@ tf_module {
   }
   member_method {
     name: "local_variables"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "local_variables_initializer"
@@ -1346,11 +1346,11 @@ tf_module {
   }
   member_method {
     name: "model_variables"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "moving_average_variables"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "multinomial"
@@ -1950,7 +1950,7 @@ tf_module {
   }
   member_method {
     name: "trainable_variables"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "transpose"
diff --git a/tensorflow/tools/ci_build/Dockerfile.pi b/tensorflow/tools/ci_build/Dockerfile.pi
new file mode 100644
index 00000000000..9d12ededb8b
--- /dev/null
+++ b/tensorflow/tools/ci_build/Dockerfile.pi
@@ -0,0 +1,20 @@
+FROM ubuntu:14.04
+
+MAINTAINER Jan Prach <jendap@google.com>
+
+# Copy and run the install scripts.
+COPY install/*.sh /install/
+RUN /install/install_bootstrap_deb_packages.sh
+RUN add-apt-repository -y ppa:openjdk-r/ppa && \
+    add-apt-repository -y ppa:george-edison55/cmake-3.x
+RUN /install/install_deb_packages.sh
+RUN /install/install_pip_packages.sh
+RUN /install/install_bazel.sh
+RUN /install/install_proto3.sh
+RUN /install/install_buildifier.sh
+RUN /install/install_auditwheel.sh
+RUN /install/install_golang.sh
+RUN /install/install_pi_toolchain.sh
+
+# Set up the master bazelrc configuration file.
+COPY install/.bazelrc /etc/bazel.bazelrc
diff --git a/tensorflow/tools/ci_build/install/install_pi_toolchain.sh b/tensorflow/tools/ci_build/install/install_pi_toolchain.sh
new file mode 100755
index 00000000000..ef30ba58c28
--- /dev/null
+++ b/tensorflow/tools/ci_build/install/install_pi_toolchain.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Sets up apt for armhf cross-compilation on Ubuntu Trusty and installs the
+# armhf Python development libraries plus Bazel from its apt repository.
+# Runs dpkg/apt-get directly, so it must be executed as root.
+
+# Register armhf as a foreign architecture and point apt at ports.ubuntu.com.
+dpkg --add-architecture armhf
+echo 'deb [arch=armhf] http://ports.ubuntu.com/ trusty main restricted universe multiverse' >> /etc/apt/sources.list.d/armhf.list
+echo 'deb [arch=armhf] http://ports.ubuntu.com/ trusty-updates main restricted universe multiverse' >> /etc/apt/sources.list.d/armhf.list
+echo 'deb [arch=armhf] http://ports.ubuntu.com/ trusty-security main restricted universe multiverse' >> /etc/apt/sources.list.d/armhf.list
+echo 'deb [arch=armhf] http://ports.ubuntu.com/ trusty-backports main restricted universe multiverse' >> /etc/apt/sources.list.d/armhf.list
+# Restrict the stock archive.ubuntu.com entries to amd64; armhf comes from ports.
+sed -i 's#deb http://archive.ubuntu.com/ubuntu/#deb [arch=amd64] http://archive.ubuntu.com/ubuntu/#g' /etc/apt/sources.list
+apt-get update
+apt-get install -y libpython-all-dev:armhf
+# Add the Bazel apt repository and its signing key.
+echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list
+curl https://bazel.build/bazel-release.pub.gpg | apt-key add -
+apt-get update
+# Drop any Bazel binary in /usr/local/bin before installing the apt package.
+rm -rf /usr/local/bin/bazel
+apt-get install -y bazel python python-numpy python-dev python-pip
diff --git a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
new file mode 100755
index 00000000000..9e6cfc017e9
--- /dev/null
+++ b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
@@ -0,0 +1,84 @@
+#!/usr/bin/env bash
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+set -e
+
+# By default this builds packages for the Pi Two and Three only, since the NEON support
+# this allows makes calculations many times faster. To support the Pi One or Zero, pass
+# PI_ONE as the first argument to the script, for example:
+# tensorflow/tools/ci_build/pi/build_raspberry_pi.sh PI_ONE
+#
+# To install the cross-compilation support for Python this script needs on Ubuntu Trusty, run
+# something like these steps, after backing up your original /etc/apt/sources.list file:
+#
+# dpkg --add-architecture armhf
+# echo 'deb [arch=armhf] http://ports.ubuntu.com/ trusty main restricted universe multiverse' >> /etc/apt/sources.list.d/armhf.list
+# echo 'deb [arch=armhf] http://ports.ubuntu.com/ trusty-updates main restricted universe multiverse' >> /etc/apt/sources.list.d/armhf.list
+# echo 'deb [arch=armhf] http://ports.ubuntu.com/ trusty-security main restricted universe multiverse' >> /etc/apt/sources.list.d/armhf.list
+# echo 'deb [arch=armhf] http://ports.ubuntu.com/ trusty-backports main restricted universe multiverse' >> /etc/apt/sources.list.d/armhf.list
+# sed -i 's#deb http://archive.ubuntu.com/ubuntu/#deb [arch=amd64] http://archive.ubuntu.com/ubuntu/#g' /etc/apt/sources.list
+# apt-get update
+# apt-get install -y libpython-all-dev:armhf
+#
+# Make sure you have an up to date version of the Bazel build tool installed too.
+
+yes '' | ./configure
+
+# We need to update the Eigen version, because of compiler failures on ARM when
+# using the version currently (Aug 10th 2017) pulled by mainline TensorFlow. We
+# should be able to get rid of this hack once
+# https://github.com/tensorflow/tensorflow/issues/9697 is addressed.
+sed -i 's/f3a22f35b044/d781c1de9834/g' tensorflow/workspace.bzl
+sed -i 's/ca7beac153d4059c02c8fc59816c82d54ea47fe58365e8aded4082ded0b820c4/a34b208da6ec18fa8da963369e166e4a368612c14d956dd2f9d7072904675d9b/g' tensorflow/workspace.bzl
+
+# Fix for curl build problem in 32-bit, see https://stackoverflow.com/questions/35181744/size-of-array-curl-rule-01-is-negative
+sudo sed -i 's/define CURL_SIZEOF_LONG 8/define CURL_SIZEOF_LONG 4/g' /usr/include/curl/curlbuild.h
+sudo sed -i 's/define CURL_SIZEOF_CURL_OFF_T 8/define CURL_SIZEOF_CURL_OFF_T 4/g' /usr/include/curl/curlbuild.h
+
+if [[ $1 == "PI_ONE" ]]; then
+  PI_COPTS="--copt=-march=armv6 --copt=-mfpu=vfp"
+  echo "Building for the Pi One/Zero, with no NEON support"
+else
+  PI_COPTS='--copt=-march=armv7-a --copt=-mfpu=neon-vfpv4
+  --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1
+  --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2
+  --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8'
+  echo "Building for the Pi Two/Three, with NEON acceleration"
+fi
+
+bazel build -c opt ${PI_COPTS} \
+  --copt=-funsafe-math-optimizations --copt=-ftree-vectorize \
+  --copt=-fomit-frame-pointer --cpu=armeabi \
+  --crosstool_top=@local_config_arm_compiler//:toolchain \
+  --verbose_failures \
+  //tensorflow/tools/benchmark:benchmark_model \
+  //tensorflow/tools/pip_package:build_pip_package
+
+OUTDIR=bazel-out/pi
+mkdir -p ${OUTDIR}
+echo "Final outputs will go to ${OUTDIR}"
+
+# Build a universal wheel.
+BDIST_OPTS="--universal" \
+  bazel-bin/tensorflow/tools/pip_package/build_pip_package "${OUTDIR}"
+
+OLD_FN=$(ls "${OUTDIR}" | grep '\.whl$')  # quoted: literal ".whl" suffix
+SUB='s/tensorflow-([^-]+)-([^-]+)-.*/tensorflow-\1-\2-none-any.whl/; print'
+NEW_FN=$(echo "${OLD_FN}" | perl -ne "${SUB}")
+mv "${OUTDIR}/${OLD_FN}" "${OUTDIR}/${NEW_FN}"
+cp bazel-bin/tensorflow/tools/benchmark/benchmark_model "${OUTDIR}"
+
+echo "Output can be found here:"
+find "${OUTDIR}"
diff --git a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
index dff4707cbef..7f7bc06e542 100644
--- a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
@@ -136,9 +136,9 @@ function run_configure_for_gpu_build {
   export TF_NEED_CUDA=1
   export TF_CUDA_VERSION=8.0
   export CUDA_TOOLKIT_PATH="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0"
-  export TF_CUDNN_VERSION=5
+  export TF_CUDNN_VERSION=6.0
   export CUDNN_INSTALL_PATH="C:/tools/cuda"
-  export TF_CUDA_COMPUTE_CAPABILITIES="3.5,5.2"
+  export TF_CUDA_COMPUTE_CAPABILITIES="3.7"
   if [ -z "$TF_ENABLE_XLA" ]; then
     export TF_ENABLE_XLA=0
   fi
@@ -150,6 +150,11 @@ function run_configure_for_gpu_build {
   export TF_NEED_GCP=0
   export TF_NEED_HDFS=0
   export TF_NEED_OPENCL=0
+
+  # TODO(pcloudy): Remove this after TensorFlow uses its own CROSSTOOL
+  # for GPU build on Windows
+  export USE_MSVC_WRAPPER=1
+
   echo "" | ./configure
 }
 
diff --git a/tensorflow/tools/ci_build/windows/bazel/common_env.sh b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
index 05392c27248..3aa034ef6ea 100644
--- a/tensorflow/tools/ci_build/windows/bazel/common_env.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
@@ -33,11 +33,11 @@ mkdir -p "$TMPDIR"
 export BAZEL_SH=${BAZEL_SH:-"C:/tools/msys64/usr/bin/bash"}
 
 # Set Python path for ./configure
-export PYTHON_BIN_PATH="C:/Program Files/Anaconda3/python"
+export PYTHON_BIN_PATH="C:/Program Files/Anaconda3/python.exe"
 export PYTHON_LIB_PATH="C:/Program Files/Anaconda3/lib/site-packages"
 
 # Set Python path for cc_configure.bzl
-export BAZEL_PYTHON="C:/Program Files/Anaconda3/python"
+export BAZEL_PYTHON="C:/Program Files/Anaconda3/python.exe"
 
 # Set Visual Studio path
 export BAZEL_VS="C:/Program Files (x86)/Microsoft Visual Studio 14.0"
diff --git a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
index 7cb81c20f02..e1972a31004 100644
--- a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
@@ -46,7 +46,7 @@ run_configure_for_gpu_build
 
 clean_output_base
 
-bazel build -c opt --config=win-cuda $BUILD_OPTS tensorflow/tools/pip_package:build_pip_package || exit $?
+bazel build -c opt $BUILD_OPTS tensorflow/tools/pip_package:build_pip_package || exit $?
 
 # Create a python test directory to avoid package name conflict
 PY_TEST_DIR="py_test_dir"
@@ -61,8 +61,8 @@ reinstall_tensorflow_pip ${PIP_NAME}
 # Define no_tensorflow_py_deps=true so that every py_test has no deps anymore,
 # which will result testing system installed tensorflow
 # GPU tests are very flaky when running concurrently, so set local_test_jobs=1
-bazel test -c opt --config=win-cuda $BUILD_OPTS -k --test_output=errors \
+bazel test -c opt $BUILD_OPTS -k --test_output=errors \
   --define=no_tensorflow_py_deps=true --test_lang_filters=py \
-  --test_tag_filters=-no_pip,-no_windows,-no_windows_gpu \
-  --build_tag_filters=-no_pip,-no_windows,-no_windows_gpu \
+  --test_tag_filters=-no_pip,-no_windows,-no_windows_gpu,-no_gpu,-no_pip_gpu \
+  --build_tag_filters=-no_pip,-no_windows,-no_windows_gpu,-no_gpu,-no_pip_gpu \
   --local_test_jobs=1 --build_tests_only //${PY_TEST_DIR}/tensorflow/python/...
diff --git a/tensorflow/tools/docker/parameterized_docker_build.sh b/tensorflow/tools/docker/parameterized_docker_build.sh
index b320a6222dd..830e3dcd32e 100755
--- a/tensorflow/tools/docker/parameterized_docker_build.sh
+++ b/tensorflow/tools/docker/parameterized_docker_build.sh
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-# Paramterized build and test for TensorFlow Docker images.
+# Parameterized build and test for TensorFlow Docker images.
 #
 # Usage:
 #   parameterized_docker_build.sh
diff --git a/tensorflow/tools/docs/parser.py b/tensorflow/tools/docs/parser.py
index 563e5be814c..730afdf7cc7 100644
--- a/tensorflow/tools/docs/parser.py
+++ b/tensorflow/tools/docs/parser.py
@@ -1488,7 +1488,7 @@ class _GeneratedFile(object):
 def _get_defined_in(py_object, parser_config):
   """Returns a description of where the passed in python object was defined.
 
-  Arguments:
+  Args:
     py_object: The Python object.
     parser_config: A ParserConfig object.
 
diff --git a/tensorflow/tools/graph_transforms/remove_device_test.cc b/tensorflow/tools/graph_transforms/remove_device_test.cc
index 554c5e35952..17a87cd2366 100644
--- a/tensorflow/tools/graph_transforms/remove_device_test.cc
+++ b/tensorflow/tools/graph_transforms/remove_device_test.cc
@@ -50,7 +50,7 @@ class RemoveDeviceTest : public ::testing::Test {
     add_node2->set_op("Add");
     add_node2->add_input("const_node1");
     add_node2->add_input("const_node2");
-    add_node2->set_device("//gpu:1");
+    add_node2->set_device("//device:GPU:1");
 
     NodeDef* add_node3 = graph_def.add_node();
     add_node3->set_name("add_node3");
diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index 45bb2e9ea55..cdce308a50e 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -57,6 +57,8 @@ function main() {
     fi
     if [[ "$1" == "--gpu" ]]; then
       GPU_BUILD=1
+    elif [[ "$1" == "--gpudirect" ]]; then
+      GPU_FLAG="--project_name tensorflow_gpudirect"
     fi
     shift
 
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 0cac0ee289b..7a5dbbb75e7 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -29,7 +29,7 @@ from setuptools.dist import Distribution
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.3.0-rc1'
+_VERSION = '1.3.0-rc2'
 
 REQUIRED_PACKAGES = [
     'numpy >= 1.11.0',
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index a33b7d2b3a2..b80b9d17026 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -153,11 +153,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   mkl_repository(
       name = "mkl",
       urls = [
-          "http://mirror.bazel.build/github.com/01org/mkl-dnn/releases/download/v0.7/mklml_lnx_2018.0.20170425.tgz",
-          "https://github.com/01org/mkl-dnn/releases/download/v0.7/mklml_lnx_2018.0.20170425.tgz",
+          "http://mirror.bazel.build/github.com/01org/mkl-dnn/releases/download/v0.9/mklml_lnx_2018.0.20170720.tgz",
+          "https://github.com/01org/mkl-dnn/releases/download/v0.9/mklml_lnx_2018.0.20170720.tgz",
       ],
-      sha256 = "3cc2501fb209e1fd0960a5f61c919438f9619c68a644dcebf0fdf69b07460c57",
-      strip_prefix = "mklml_lnx_2018.0.20170425",
+      sha256 = "57ba56c4c243f403ff78f417ff854ef50b9eddf4a610a917b7c95e7fa8553a4b",
+      strip_prefix = "mklml_lnx_2018.0.20170720",
       build_file = str(Label("//third_party/mkl:mkl.BUILD")),
       repository = tf_repo_name,
   )
diff --git a/third_party/sycl/crosstool/CROSSTOOL.tpl b/third_party/sycl/crosstool/CROSSTOOL.tpl
index 2a96cdbf95c..32884d71e78 100755
--- a/third_party/sycl/crosstool/CROSSTOOL.tpl
+++ b/third_party/sycl/crosstool/CROSSTOOL.tpl
@@ -76,6 +76,18 @@ toolchain {
   unfiltered_cxx_flag: "-D__TIMESTAMP__=\"redacted\""
   unfiltered_cxx_flag: "-D__TIME__=\"redacted\""
 
+  compiler_flag: "-fPIE"
+
+  # Keep stack frames for debugging, even in opt mode.
+  compiler_flag: "-fno-omit-frame-pointer"
+
+  # Anticipated future default.
+  linker_flag: "-no-canonical-prefixes"
+  unfiltered_cxx_flag: "-fno-canonical-system-headers"
+
+  # Have gcc return the exit code from ld.
+  linker_flag: "-pass-exit-codes"
+
   # All warnings are enabled. Maybe enable -Werror as well?
   compiler_flag: "-Wall"
 
@@ -105,6 +117,9 @@ toolchain {
     compiler_flag: "-g0"
     compiler_flag: "-O2"
     compiler_flag: "-DNDEBUG"
+    compiler_flag: "-ffunction-sections"
+    compiler_flag: "-fdata-sections"
+    linker_flag: "-Wl,--gc-sections"
   }
 }