Merge changes from github.

Change: 137532946
Xiaoqiang Zheng 2016-10-28 10:29:28 -08:00 committed by TensorFlower Gardener
parent f80ef2d696
commit e2d51a87f0
97 changed files with 1731 additions and 405 deletions

View File

@ -33,10 +33,10 @@ and discussion.**
People who are a little more adventurous can also try our nightly binaries:
* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac1-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac1-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac1-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac1-slave/))
* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
* [Android](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-android/TF_BUILD_CONTAINER_TYPE=ANDROID,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=NO_PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=android-slave/lastSuccessfulBuild/artifact/bazel-out/local_linux/bin/tensorflow/examples/android/tensorflow_demo.apk) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-android/TF_BUILD_CONTAINER_TYPE=ANDROID,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=NO_PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=android-slave/))
#### *Try your first TensorFlow program*

View File

@ -15,6 +15,7 @@ cmake_policy(SET CMP0022 NEW)
# Options
option(tensorflow_VERBOSE "Enable for verbose output" OFF)
option(tensorflow_ENABLE_GPU "Enable GPU support" OFF)
option(tensorflow_ENABLE_SSL_SUPPORT "Enable boringssl support" OFF)
option(tensorflow_ENABLE_GRPC_SUPPORT "Enable gRPC support" ON)
option(tensorflow_BUILD_CC_EXAMPLE "Build the C++ tutorial example" ON)
@ -48,8 +49,13 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
add_definitions(-DEIGEN_AVOID_STL_ARRAY)
if(WIN32)
add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11 -DCOMPILER_MSVC -D__VERSION__=\"MSVC\")
add_definitions(-DWIN32 -DOS_WIN -D_MBCS -DWIN64 -DWIN32_LEAN_AND_MEAN -DNOGDI -DPLATFORM_WINDOWS)
add_definitions(-DTENSORFLOW_USE_EIGEN_THREADPOOL -DEIGEN_HAS_C99_MATH -D_ITERATOR_DEBUG_LEVEL=0)
add_definitions(/bigobj /nologo /EHsc /GF /FC /MP /Gm-)
# Suppress warnings to reduce build log size.
add_definitions(/wd4267 /wd4244 /wd4800 /wd4503 /wd4554 /wd4996 /wd4348 /wd4018)
add_definitions(/wd4099 /wd4146 /wd4267 /wd4305 /wd4307)
add_definitions(/wd4715 /wd4722 /wd4723 /wd4838 /wd4309 /wd4334)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
endif()
@ -80,7 +86,16 @@ set(tensorflow_EXTERNAL_LIBRARIES
${protobuf_STATIC_LIBRARIES}
)
set(tensorflow_EXTERNAL_DEPENDENCIES
zlib_copy_headers_to_destination
gif_copy_headers_to_destination
png_copy_headers_to_destination
jpeg_copy_headers_to_destination
jsoncpp
farmhash_copy_headers_to_destination
highwayhash_copy_headers_to_destination
protobuf
eigen
)
include_directories(
# Source and generated code.
@ -118,19 +133,67 @@ if(UNIX)
list(APPEND tensorflow_EXTERNAL_LIBRARIES ${CMAKE_THREAD_LIBS_INIT} ${CMAKE_DL_LIBS})
endif()
if (tensorflow_ENABLE_GPU)
if (WIN32)
find_package(CUDA 8.0 REQUIRED)
# By default we assume compute capability 3.5 and 5.2. If you change this,
# change it in CUDA_NVCC_FLAGS and cuda_config.h below as well.
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_35,code=\"sm_35,compute_35\";-gencode arch=compute_52,code=\"sm_52,compute_52\")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--include-path ${PROJECT_BINARY_DIR}/$\{build_configuration\};--expt-relaxed-constexpr)
set(CUDA_INCLUDE ${CUDA_TOOLKIT_TARGET_DIR} ${CUDA_TOOLKIT_TARGET_DIR}/extras/CUPTI/include)
include_directories(${CUDA_INCLUDE})
add_definitions(-DGOOGLE_CUDA=1 -DTF_EXTRA_CUDA_CAPABILITIES=3.5,5.2)
# add cudnn
include_directories(${CUDNN_HOME})
set(CUDA_LIBRARIES ${CUDA_LIBRARIES} ${CUDNN_HOME}/lib/x64/cudnn.lib)
# create cuda_config.h
FILE(WRITE ${tensorflow_source_dir}/third_party/gpus/cuda/cuda_config.h
"#ifndef CUDA_CUDA_CONFIG_H_\n"
"#define CUDA_CUDA_CONFIG_H_\n"
"#define TF_CUDA_CAPABILITIES CudaVersion(\"3.5\"),CudaVersion(\"5.2\")\n"
"#define TF_CUDA_VERSION \"64_80\"\n"
"#define TF_CUDNN_VERSION \"64_5\"\n"
"#endif // CUDA_CUDA_CONFIG_H_\n"
)
# TensorFlow assumes in various places that header files live in cuda/include. On Windows
# the CUDA SDK installs them under cuda/<version>/include, so to avoid changing TensorFlow
# we copy a few files to cuda/include.
FILE(COPY
${CUDA_TOOLKIT_TARGET_DIR}/include/cuda.h ${CUDA_TOOLKIT_TARGET_DIR}/include/cuComplex.h
${CUDA_TOOLKIT_TARGET_DIR}/include/cublas_v2.h ${CUDNN_HOME}/include/cudnn.h
${CUDA_TOOLKIT_TARGET_DIR}/include/cufft.h ${CUDA_TOOLKIT_TARGET_DIR}/include/curand.h
DESTINATION ${tensorflow_source_dir}/third_party/gpus/cuda/include
)
include_directories(${tensorflow_source_dir}/third_party/gpus)
# add cuda libraries to tensorflow_EXTERNAL_LIBRARIES
list(APPEND tensorflow_EXTERNAL_LIBRARIES ${CUDA_LIBRARIES})
endif()
endif()
# Let's get to work!
include(tf_core_framework.cmake)
include(tf_tools.cmake)
# NOTE: Disabled until issue #3996 is fixed.
# include(tf_stream_executor.cmake)
if (tensorflow_ENABLE_GPU)
if (WIN32)
include(tf_stream_executor.cmake)
endif()
endif()
include(tf_core_cpu.cmake)
include(tf_models.cmake)
include(tf_core_ops.cmake)
include(tf_core_direct_session.cmake)
include(tf_core_kernels.cmake)
if(tensorflow_ENABLE_GRPC_SUPPORT)
include(tf_core_distributed_runtime.cmake)
endif()
include(tf_cc_ops.cmake)
if(tensorflow_BUILD_CC_EXAMPLE)
include(tf_tutorials.cmake)

View File

@ -15,14 +15,13 @@ Current Status
The CMake files in this directory can build the core TensorFlow runtime, an
example C++ binary, and a PIP package containing the runtime and Python
bindings.
Note: Windows support is in an **alpha** state, and we welcome your feedback.
### Pre-requisites
* CMake version 3.1 up to 3.6
* [Git](http://git-scm.com)
@ -45,21 +44,13 @@ Note: Windows support is in an **alpha** state, and we welcome your feedback.
- [Anaconda 4.1.1 (Python 3.5 64-bit)](https://www.continuum.io/downloads)
- [Git for Windows version 2.9.2.windows.1](https://git-scm.com/download/win)
- [swigwin-3.0.10](http://www.swig.org/download.html)
- [NVidia CUDA Toolkit 8.0](https://developer.nvidia.com/cuda-downloads)
- [NVidia CUDNN 5.1](https://developer.nvidia.com/cudnn)
* Ubuntu 14.04
- Makefile generator
- Docker 1.9.1 (for automated testing)
### Current known limitations
- The Python package supports **Python 3.5 only**, because that is the only
version for which standard Python binaries exist and those binaries are
compatible with the TensorFlow runtime. (On Windows, the standard Python
@ -114,6 +105,17 @@ Step-by-step Windows build
D:\temp> "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\amd64\vcvarsall.bat" D:\temp> "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\amd64\vcvarsall.bat"
``` ```
* When building with GPU support, after installing the CUDNN zip file from NVidia, append its
bin directory to your PATH environment variable.
If TensorFlow fails to find the CUDA DLLs during initialization, check your PATH environment variable:
it should contain the directory of the CUDA DLLs and the directory of the CUDNN DLL.
For example:
```
D:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\bin
D:\local\cuda\bin
```
* We assume that `cmake` and `git` are installed and in your `%PATH%`. If
for example `cmake` is not in your path and it is installed in
`C:\Program Files (x86)\CMake\bin\cmake.exe`, you can add this directory
@ -145,9 +147,14 @@ Step-by-step Windows build
D:\...\build> cmake .. -A x64 -DCMAKE_BUILD_TYPE=Release ^
More? -DSWIG_EXECUTABLE=C:/tools/swigwin-3.0.10/swig.exe ^
More? -DPYTHON_EXECUTABLE=C:/Users/%USERNAME%/AppData/Local/Continuum/Anaconda3/python.exe ^
More? -DPYTHON_LIBRARIES=C:/Users/%USERNAME%/AppData/Local/Continuum/Anaconda3/libs/python35.lib
```
To build with GPU support, add "^" at the end of the last line above and follow it with:
```
More? -Dtensorflow_ENABLE_GPU=ON ^
More? -DCUDNN_HOME="D:\...\cudnn"
```
Note that the `-DCMAKE_BUILD_TYPE=Release` flag must match the build
configuration that you choose when invoking `msbuild`. The known-good
values are `Release` and `RelWithDebInfo`. The `Debug` build type is
@ -184,6 +191,11 @@ Step-by-step Windows build
SSL support (for making secure HTTP requests) in the TensorFlow runtime.
This support is incomplete, and will be used for Google Cloud Storage
support.
* `-Dtensorflow_ENABLE_GPU=(ON|OFF)`. Defaults to `OFF`. Include
GPU support. If GPU support is enabled, you also need to install the CUDA 8.0 Toolkit and CUDNN 5.1.
CMake expects the location of CUDNN in `-DCUDNN_HOME=path_where_you_unzipped_cudnn`.
4. Invoke MSBuild to build TensorFlow.
@ -202,7 +214,6 @@ Step-by-step Windows build
D:\...\build> MSBuild /p:Configuration=Release tf_python_build_pip_package.vcxproj
```
Linux Continuous Integration build
==================================

View File

@ -26,7 +26,7 @@ from setuptools import find_packages, setup, Command
from setuptools.command.install import install as InstallCommandBase
from setuptools.dist import Distribution
_VERSION = '0.11.0rc1-cmake-experimental'
REQUIRED_PACKAGES = [
'numpy >= 1.11.0',

View File

@ -21,13 +21,27 @@ file(GLOB_RECURSE tf_core_cpu_exclude_srcs
"${tensorflow_source_dir}/tensorflow/core/common_runtime/session_factory.cc" "${tensorflow_source_dir}/tensorflow/core/common_runtime/session_factory.cc"
"${tensorflow_source_dir}/tensorflow/core/common_runtime/session_options.cc" "${tensorflow_source_dir}/tensorflow/core/common_runtime/session_options.cc"
) )
list(REMOVE_ITEM tf_core_cpu_srcs ${tf_core_cpu_exclude_srcs}) list(REMOVE_ITEM tf_core_cpu_srcs ${tf_core_cpu_exclude_srcs})
# We need to include stubs for the GPU tracer, which are in the exclude glob. # We need to include stubs for the GPU tracer, which are in the exclude glob.
list(APPEND tf_core_cpu_srcs list(APPEND tf_core_cpu_srcs
"${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu/gpu_tracer.cc" "${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu/gpu_tracer.cc"
"${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu/gpu_tracer.h" "${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu/gpu_tracer.h"
) )
if (tensorflow_ENABLE_GPU)
file(GLOB_RECURSE tf_core_gpu_srcs
"${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu/*.cc"
"${tensorflow_source_dir}/tensorflow/core/platform/default/gpu/cupti_wrapper.cc"
"${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu_device_factory.cc"
)
file(GLOB_RECURSE tf_core_gpu_exclude_srcs
"${tensorflow_source_dir}/tensorflow/core/*test*.cc"
"${tensorflow_source_dir}/tensorflow/core/*test*.cc"
)
list(REMOVE_ITEM tf_core_gpu_srcs ${tf_core_gpu_exclude_srcs})
list(APPEND tf_core_cpu_srcs ${tf_core_gpu_srcs})
endif()
add_library(tf_core_cpu OBJECT ${tf_core_cpu_srcs})
add_dependencies(tf_core_cpu tf_core_framework)

View File

@ -38,9 +38,11 @@ add_executable(grpc_tensorflow_server
$<TARGET_OBJECTS:tf_core_ops>
$<TARGET_OBJECTS:tf_core_direct_session>
$<TARGET_OBJECTS:tf_core_distributed_runtime>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
)
target_link_libraries(grpc_tensorflow_server PUBLIC
tf_protos_cc
${tf_core_gpu_kernels_lib}
${tensorflow_EXTERNAL_LIBRARIES}
)

View File

@ -38,6 +38,7 @@ if(tensorflow_BUILD_CONTRIB_KERNELS)
"${tensorflow_source_dir}/tensorflow/contrib/layers/ops/sparse_feature_cross_op.cc" "${tensorflow_source_dir}/tensorflow/contrib/layers/ops/sparse_feature_cross_op.cc"
"${tensorflow_source_dir}/tensorflow/contrib/metrics/kernels/set_kernels.cc" "${tensorflow_source_dir}/tensorflow/contrib/metrics/kernels/set_kernels.cc"
"${tensorflow_source_dir}/tensorflow/contrib/metrics/ops/set_ops.cc" "${tensorflow_source_dir}/tensorflow/contrib/metrics/ops/set_ops.cc"
"${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/blas_gemm.cc"
"${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/gru_ops.cc" "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/gru_ops.cc"
"${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/lstm_ops.cc" "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/lstm_ops.cc"
"${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/gru_ops.cc" "${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/gru_ops.cc"
@ -83,7 +84,7 @@ list(REMOVE_ITEM tf_core_kernels_srcs ${tf_core_kernels_exclude_srcs})
if(WIN32)
file(GLOB_RECURSE tf_core_kernels_windows_exclude_srcs
# not working on windows yet
"${tensorflow_source_dir}/tensorflow/core/kernels/depthwise_conv_op.cc" # Cannot find symbol: tensorflow::LaunchConv2DOp<struct Eigen::ThreadPoolDevice, double>::launch(...).
"${tensorflow_source_dir}/tensorflow/core/kernels/fact_op.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/immutable_constant_op.cc"
@ -93,14 +94,38 @@ if(WIN32)
"${tensorflow_source_dir}/tensorflow/core/kernels/sparse_matmul_op.h" "${tensorflow_source_dir}/tensorflow/core/kernels/sparse_matmul_op.h"
"${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.h" "${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.h"
"${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.cc" "${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/svd*.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/avgpooling_op.*"
)
list(REMOVE_ITEM tf_core_kernels_srcs ${tf_core_kernels_windows_exclude_srcs})
endif(WIN32)
file(GLOB_RECURSE tf_core_gpu_kernels_srcs
"${tensorflow_source_dir}/tensorflow/core/kernels/*.cu.cc"
"${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/*.cu.cc"
)
if(WIN32)
file(GLOB_RECURSE tf_core_gpu_kernels_exclude_srcs
# not working on windows yet
"${tensorflow_source_dir}/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc"
)
list(REMOVE_ITEM tf_core_gpu_kernels_srcs ${tf_core_gpu_kernels_exclude_srcs})
endif(WIN32)
add_library(tf_core_kernels OBJECT ${tf_core_kernels_srcs})
if(WIN32)
target_compile_options(tf_core_kernels PRIVATE /MP)
if (tensorflow_ENABLE_GPU)
set_source_files_properties(${tf_core_gpu_kernels_srcs} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
set(tf_core_gpu_kernels_lib tf_core_gpu_kernels)
cuda_add_library(${tf_core_gpu_kernels_lib} ${tf_core_gpu_kernels_srcs})
set_target_properties(${tf_core_gpu_kernels_lib}
PROPERTIES DEBUG_POSTFIX ""
COMPILE_FLAGS "${TF_REGULAR_CXX_FLAGS}"
)
add_dependencies(${tf_core_gpu_kernels_lib} tf_core_cpu)
endif()
endif()
add_dependencies(tf_core_kernels tf_core_cpu)

View File

@ -302,12 +302,14 @@ add_library(pywrap_tensorflow SHARED
$<TARGET_OBJECTS:tf_core_direct_session>
$<$<BOOL:${tensorflow_ENABLE_GRPC_SUPPORT}>:$<TARGET_OBJECTS:tf_core_distributed_runtime>>
$<TARGET_OBJECTS:tf_core_kernels>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
)
target_include_directories(pywrap_tensorflow PUBLIC
${PYTHON_INCLUDE_DIR}
${NUMPY_INCLUDE_DIR}
)
target_link_libraries(pywrap_tensorflow
${tf_core_gpu_kernels_lib}
${tensorflow_EXTERNAL_LIBRARIES}
tf_protos_cc
${PYTHON_LIBRARIES}

View File

@ -47,11 +47,17 @@ file(GLOB tf_stream_executor_srcs
"${tensorflow_source_dir}/tensorflow/stream_executor/platform/default/*.h" "${tensorflow_source_dir}/tensorflow/stream_executor/platform/default/*.h"
) )
if (tensorflow_ENABLE_GPU)
file(GLOB tf_stream_executor_gpu_srcs
"${tensorflow_source_dir}/tensorflow/stream_executor/cuda/*.cc"
)
list(APPEND tf_stream_executor_srcs ${tf_stream_executor_gpu_srcs})
endif()
#file(GLOB_RECURSE tf_stream_executor_test_srcs
# "${tensorflow_source_dir}/tensorflow/stream_executor/*_test.cc"
# "${tensorflow_source_dir}/tensorflow/stream_executor/*_test.h"
#)
#
#list(REMOVE_ITEM tf_stream_executor_srcs ${tf_stream_executor_test_srcs})
add_library(tf_stream_executor OBJECT ${tf_stream_executor_srcs})

View File

@ -12,9 +12,11 @@ add_executable(tf_tutorials_example_trainer
$<TARGET_OBJECTS:tf_cc_ops>
$<TARGET_OBJECTS:tf_core_ops>
$<TARGET_OBJECTS:tf_core_direct_session>
$<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
)
target_link_libraries(tf_tutorials_example_trainer PUBLIC
tf_protos_cc
${tf_core_gpu_kernels_lib}
${tensorflow_EXTERNAL_LIBRARIES}
)

View File

@ -942,6 +942,7 @@ def convolution2d_transpose(
kernel_size,
stride=1,
padding='SAME',
data_format=DATA_FORMAT_NHWC,
activation_fn=nn.relu,
normalizer_fn=None,
normalizer_params=None,
@ -961,7 +962,9 @@ def convolution2d_transpose(
second variable called 'biases' is added to the result of the operation.
Args:
inputs: A 4-D `Tensor` of type `float` and shape
`[batch, height, width, in_channels]` for `NHWC` data format or
`[batch, in_channels, height, width]` for `NCHW` data format.
num_outputs: integer, the number of output filters.
kernel_size: a list of length 2 holding the [kernel_height, kernel_width]
of the filters. Can be an int if both values are the same.
@ -969,6 +972,7 @@ def convolution2d_transpose(
Can be an int if both strides are the same. Note that presently
both strides must have the same value.
padding: one of 'VALID' or 'SAME'.
data_format: A string. `NHWC` (default) and `NCHW` are supported.
activation_fn: activation function, set to None to skip it and maintain
a linear activation.
normalizer_fn: normalization function to use instead of `biases`. If
@ -993,14 +997,23 @@ def convolution2d_transpose(
Raises:
ValueError: if 'kernel_size' is not a list of length 2.
ValueError: if `data_format` is neither `NHWC` nor `NCHW`.
ValueError: if `C` dimension of `inputs` is None.
""" """
with variable_scope.variable_scope( with variable_scope.variable_scope(
scope, 'Conv2d_transpose', [inputs], reuse=reuse) as sc: scope, 'Conv2d_transpose', [inputs], reuse=reuse) as sc:
if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
raise ValueError('data_format has to be either NCHW or NHWC.')
dtype = inputs.dtype.base_dtype
kernel_h, kernel_w = utils.two_element_tuple(kernel_size)
stride_h, stride_w = utils.two_element_tuple(stride)
if data_format == DATA_FORMAT_NCHW:
c_axis, h_axis, w_axis = 1, 2, 3
else:
h_axis, w_axis, c_axis = 1, 2, 3
num_filters_in = inputs.get_shape()[c_axis].value
if num_filters_in is None:
raise ValueError('`C` dimension of `inputs` must be known but is None.')
weights_shape = [kernel_h, kernel_w, num_outputs, num_filters_in]
weights_collections = utils.get_variable_collections(
variables_collections, 'weights')
@ -1015,7 +1028,7 @@ def convolution2d_transpose(
inputs_shape = array_ops.shape(inputs)
batch_size = inputs_shape[0]
height, width = inputs_shape[h_axis], inputs_shape[w_axis]
def get_deconv_dim(dim_size, stride_size, kernel_size, padding):
if isinstance(dim_size, ops.Tensor):
@ -1031,17 +1044,25 @@ def convolution2d_transpose(
out_height = get_deconv_dim(height, stride_h, kernel_h, padding)
out_width = get_deconv_dim(width, stride_w, kernel_w, padding)
if data_format == DATA_FORMAT_NHWC:
output_shape = [batch_size, out_height, out_width, num_outputs]
strides = [1, stride_h, stride_w, 1]
else:
output_shape = [batch_size, num_outputs, out_height, out_width]
strides = [1, 1, stride_h, stride_w]
output_shape = array_ops.pack(output_shape)
outputs = nn.conv2d_transpose(inputs, weights, output_shape,
strides,
padding=padding,
data_format=data_format)
# Infer the static output shape:
out_shape = inputs.get_shape().as_list()
out_shape[c_axis] = num_outputs
out_shape[h_axis] = get_deconv_dim(out_shape[h_axis], stride_h, kernel_h, padding)
out_shape[w_axis] = get_deconv_dim(out_shape[w_axis], stride_w, kernel_w, padding)
outputs.set_shape(out_shape)
if normalizer_fn is not None:
@ -1057,7 +1078,7 @@ def convolution2d_transpose(
initializer=biases_initializer,
regularizer=biases_regularizer,
collections=biases_collections)
outputs = nn.bias_add(outputs, biases, data_format=data_format)
if activation_fn is not None:
outputs = activation_fn(outputs)
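The hunk above threads the new `data_format` argument through the weight shape, output shape, strides, and `bias_add`. A minimal usage sketch (mine, not part of the commit), mirroring the NCHW test cases in the next file; it assumes a GPU is available, since the underlying kernels only accept `NCHW` on GPU:

```python
# Hypothetical usage sketch for the new data_format argument.
import tensorflow as tf

# [batch, channels, height, width] input for the NCHW layout.
images = tf.random_uniform([5, 3, 10, 12], seed=1)
output = tf.contrib.layers.conv2d_transpose(
    images, num_outputs=32, kernel_size=[3, 3], stride=1,
    padding='SAME', data_format='NCHW')
# The static shape is now inferred along the right axes: [5, 32, 10, 12].
```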

View File

@ -588,6 +588,175 @@ class ConvolutionTest(tf.test.TestCase):
class Convolution2dTransposeTests(tf.test.TestCase):
def testInvalidDataFormat(self):
height, width = 7, 9
with self.test_session():
images = tf.random_uniform((5, height, width, 3), seed=1)
with self.assertRaisesRegexp(
ValueError, 'data_format has to be either NCHW or NHWC.'):
tf.contrib.layers.convolution2d_transpose(
images, 32, 3, data_format='CHWN')
def testOutputSizeWithStrideOneSamePaddingNCHW(self):
# `NCHW` data format is only supported for `GPU` device.
if tf.test.is_gpu_available():
with self.test_session(use_gpu=True) as sess:
num_filters = 32
input_size = [5, 3, 10, 12]
expected_size = [5, num_filters, 10, 12]
images = tf.random_uniform(input_size, seed=1)
output = tf.contrib.layers.conv2d_transpose(
images, num_filters, [3, 3], stride=1,
padding='SAME', data_format='NCHW')
self.assertEqual(output.op.name, 'Conv2d_transpose/Relu')
sess.run(tf.initialize_all_variables())
self.assertListEqual(list(output.eval().shape), expected_size)
def testOutputSizeWithStrideOneValidPaddingNCHW(self):
if tf.test.is_gpu_available():
with self.test_session(use_gpu=True) as sess:
num_filters = 32
input_size = [5, 3, 10, 12]
expected_size = [5, num_filters, 12, 14]
images = tf.random_uniform(input_size, seed=1)
output = tf.contrib.layers.conv2d_transpose(
images, num_filters, [3, 3], stride=1,
padding='VALID', data_format='NCHW')
self.assertEqual(output.op.name, 'Conv2d_transpose/Relu')
sess.run(tf.initialize_all_variables())
self.assertListEqual(list(output.eval().shape), expected_size)
def testOutputSizeWithStrideTwoValidPaddingNCHW(self):
if tf.test.is_gpu_available():
with self.test_session(use_gpu=True) as sess:
num_filters = 32
input_size = [5, 3, 9, 11]
expected_size = [5, num_filters, 19, 23]
images = tf.random_uniform(input_size, seed=1)
output = tf.contrib.layers.conv2d_transpose(
images, num_filters, [3, 3], stride=[2, 2],
padding='VALID', data_format='NCHW')
self.assertEqual(output.op.name, 'Conv2d_transpose/Relu')
self.assertListEqual(list(output.get_shape().as_list()), expected_size)
sess.run(tf.initialize_all_variables())
self.assertListEqual(list(output.eval().shape), expected_size)
def testOutputSizeWith1x1StrideTwoSamePaddingNCHW(self):
if tf.test.is_gpu_available():
with self.test_session(use_gpu=True) as sess:
num_filters = 1
input_size = [1, 1, 1, 1]
expected_size = [1, num_filters, 2, 2]
images = tf.random_uniform(input_size, seed=1)
output = tf.contrib.layers.conv2d_transpose(
images, num_filters, [2, 2], stride=[2, 2],
padding='SAME', data_format='NCHW')
self.assertListEqual(list(output.get_shape().as_list()), expected_size)
sess.run(tf.initialize_all_variables())
self.assertEqual(output.op.name, 'Conv2d_transpose/Relu')
self.assertListEqual(list(output.eval().shape), expected_size)
def testOutputSizeWith1x1StrideTwoValidPaddingNCHW(self):
if tf.test.is_gpu_available():
with self.test_session(use_gpu=True) as sess:
num_filters = 1
input_size = [1, 1, 1, 1]
expected_size = [1, num_filters, 2, 2]
images = tf.random_uniform(input_size, seed=1)
output = tf.contrib.layers.conv2d_transpose(
images, num_filters, [2, 2], stride=[2, 2],
padding='VALID', data_format='NCHW')
sess.run(tf.initialize_all_variables())
self.assertEqual(output.op.name, 'Conv2d_transpose/Relu')
self.assertListEqual(list(output.eval().shape), expected_size)
def testOutputSizeWith2x2StrideTwoSamePaddingNCHW(self):
if tf.test.is_gpu_available():
with self.test_session(use_gpu=True) as sess:
num_filters = 1
input_size = [1, 1, 2, 2]
expected_size = [1, num_filters, 4, 4]
images = tf.random_uniform(input_size, seed=1)
output = tf.contrib.layers.conv2d_transpose(
images, num_filters, [2, 2], stride=[2, 2],
padding='SAME', data_format='NCHW')
sess.run(tf.initialize_all_variables())
self.assertEqual(output.op.name, 'Conv2d_transpose/Relu')
self.assertListEqual(list(output.eval().shape), expected_size)
def testOutputSizeWith2x2StrideTwoValidPaddingNCHW(self):
if tf.test.is_gpu_available():
with self.test_session(use_gpu=True) as sess:
num_filters = 1
input_size = [1, 1, 2, 2]
expected_size = [1, num_filters, 4, 4]
images = tf.random_uniform(input_size, seed=1)
output = tf.contrib.layers.conv2d_transpose(
images, num_filters, [2, 2], stride=[2, 2],
padding='VALID', data_format='NCHW')
sess.run(tf.initialize_all_variables())
self.assertEqual(output.op.name, 'Conv2d_transpose/Relu')
self.assertListEqual(list(output.eval().shape), expected_size)
def testOutputSizeWithStride2x1NCHW(self):
if tf.test.is_gpu_available():
with self.test_session(use_gpu=True) as sess:
num_filters = 1
input_size = [1, 1, 3, 2]
expected_size = [1, num_filters, 6, 5]
images = tf.random_uniform(input_size, seed=1)
output = tf.contrib.layers.conv2d_transpose(
images, num_filters, [2, 4], stride=[2, 1],
padding='VALID', data_format='NCHW')
sess.run(tf.initialize_all_variables())
self.assertEqual(output.op.name, 'Conv2d_transpose/Relu')
self.assertListEqual(list(output.eval().shape), expected_size)
def testOutputSizeWithStride2x4NCHW(self):
if tf.test.is_gpu_available():
with self.test_session(use_gpu=True) as sess:
num_filters = 1
input_size = [1, 1, 3, 2]
expected_size = [1, num_filters, 6, 8]
images = tf.random_uniform(input_size, seed=1)
output = tf.contrib.layers.conv2d_transpose(
images, num_filters, [2, 4], stride=[2, 4],
padding='VALID', data_format='NCHW')
sess.run(tf.initialize_all_variables())
self.assertEqual(output.op.name, 'Conv2d_transpose/Relu')
self.assertListEqual(list(output.eval().shape), expected_size)
def testOutputSizeWithStride2x5NCHW(self):
if tf.test.is_gpu_available():
with self.test_session(use_gpu=True) as sess:
num_filters = 1
input_size = [1, 1, 3, 2]
expected_size = [1, num_filters, 6, 10]
images = tf.random_uniform(input_size, seed=1)
output = tf.contrib.layers.conv2d_transpose(
images, num_filters, [2, 4], stride=[2, 5],
padding='VALID', data_format='NCHW')
sess.run(tf.initialize_all_variables())
self.assertEqual(output.op.name, 'Conv2d_transpose/Relu')
self.assertListEqual(list(output.eval().shape), expected_size)
def testOutputSizeWithStrideOneSamePadding(self):
num_filters = 32
input_size = [5, 10, 12, 3]
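The expected sizes in these tests follow the `get_deconv_dim` rule from the layer above. A small standalone check of that arithmetic (my sketch, not from the test file):

```python
# Sketch of the output-size arithmetic the NCHW tests exercise:
# SAME padding: out = in * stride; VALID: out = in * stride + max(kernel - stride, 0).
def deconv_dim(dim_size, stride, kernel, padding):
    dim_size *= stride
    if padding == 'VALID':
        dim_size += max(kernel - stride, 0)
    return dim_size

assert deconv_dim(9, 2, 3, 'VALID') == 19   # height in testOutputSizeWithStrideTwoValidPaddingNCHW
assert deconv_dim(11, 2, 3, 'VALID') == 23  # width in the same test
assert deconv_dim(2, 1, 4, 'VALID') == 5    # width in testOutputSizeWithStride2x1NCHW
```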

View File

@ -244,7 +244,7 @@ class GraphIOTest(tf.test.TestCase):
session.run(tf.initialize_local_variables())
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(session, coord=coord)
self.assertAllEqual(session.run(inputs), [b"ABC"])
self.assertAllEqual(session.run(inputs), [b"DEF"])
@ -253,6 +253,7 @@ class GraphIOTest(tf.test.TestCase):
session.run(inputs)
coord.request_stop()
coord.join(threads)
def test_read_keyed_batch_features_mutual_exclusive_args(self):
filename = self._create_temp_file("abcde")
@ -307,6 +308,7 @@ class GraphIOTest(tf.test.TestCase):
coord.request_stop()
coord.join(threads)
parsed_records = [item for sublist in [d["sequence"] for d in data] parsed_records = [item for sublist in [d["sequence"] for d in data]
for item in sublist]
# Check that the number of records matches expected and all records
@ -331,7 +333,7 @@ class GraphIOTest(tf.test.TestCase):
session.run(tf.initialize_local_variables())
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(session, coord=coord)
self.assertEqual("%s:1" % name, inputs.name)
file_name_queue_name = "%s/file_name_queue" % name
@ -352,6 +354,7 @@ class GraphIOTest(tf.test.TestCase):
session.run(inputs)
coord.request_stop()
coord.join(threads)
def test_read_text_lines_multifile_with_shared_queue(self):
gfile.Glob = self._orig_glob
@ -375,7 +378,7 @@ class GraphIOTest(tf.test.TestCase):
session.run(tf.initialize_local_variables())
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(session, coord=coord)
self.assertEqual("%s:1" % name, inputs.name)
shared_file_name_queue_name = "%s/file_name_queue" % name
@ -398,6 +401,7 @@ class GraphIOTest(tf.test.TestCase):
session.run(inputs)
coord.request_stop()
coord.join(threads)
def _get_qr(self, name):
for qr in ops.get_collection(ops.GraphKeys.QUEUE_RUNNERS):
session.run(tf.initialize_local_variables())
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(session, coord=coord)
self.assertAllEqual(session.run(inputs), [b"A", b"B", b"C"])
self.assertAllEqual(session.run(inputs), [b"D", b"E"])
@ -498,6 +502,7 @@ class GraphIOTest(tf.test.TestCase):
session.run(inputs)
coord.request_stop()
coord.join(threads)
def test_keyed_read_text_lines(self):
gfile.Glob = self._orig_glob
@ -517,7 +522,7 @@ class GraphIOTest(tf.test.TestCase):
session.run(tf.initialize_local_variables())
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(session, coord=coord)
self.assertAllEqual(session.run([keys, inputs]),
[[filename.encode("utf-8") + b":1"], [b"ABC"]])
@ -529,6 +534,7 @@ class GraphIOTest(tf.test.TestCase):
session.run(inputs)
coord.request_stop()
coord.join(threads)
def test_keyed_parse_json(self):
gfile.Glob = self._orig_glob
@ -557,7 +563,7 @@ class GraphIOTest(tf.test.TestCase):
session.run(tf.initialize_local_variables())
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(session, coord=coord)
key, age = session.run([keys, inputs["age"]])
self.assertAllEqual(age, [[0]])
@ -572,6 +578,7 @@ class GraphIOTest(tf.test.TestCase):
session.run(inputs)
coord.request_stop()
coord.join(threads)
if __name__ == "__main__":
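The change repeated throughout this file captures the threads returned by `start_queue_runners` so the coordinator can join them before each test exits. A minimal sketch of the pattern (my example, not from the test suite):

```python
# Sketch of the coordinator/queue-runner shutdown pattern the tests adopt.
import tensorflow as tf

with tf.Session() as session:
    queue = tf.train.string_input_producer(["a", "b"], num_epochs=1)
    session.run(tf.initialize_local_variables())  # num_epochs uses a local variable
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(session, coord=coord)
    try:
        print(session.run(queue.dequeue()))
    finally:
        coord.request_stop()
        coord.join(threads)  # without this, runner threads can outlive the test
```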

View File

@ -21,6 +21,7 @@ from __future__ import print_function
import os
import random
import six
import tempfile
import numpy as np
@ -63,8 +64,8 @@ class ExportTest(tf.test.TestCase):
# Only the written checkpoints are exported.
self.assertTrue(tf.gfile.Exists(export_dir + '00000001/export'))
self.assertTrue(tf.gfile.Exists(export_dir + '00000010/export'))
self.assertEquals(export_monitor.last_export_dir,
six.b(os.path.join(export_dir, '00000010')))
# Validate the signature
signature = self._get_default_signature(export_dir + '00000010/export.meta')
self.assertTrue(signature.HasField('regression_signature'))
@ -86,8 +87,8 @@ class ExportTest(tf.test.TestCase):
# Only the written checkpoints are exported.
self.assertTrue(tf.gfile.Exists(export_dir + '00000001/export'))
self.assertTrue(tf.gfile.Exists(export_dir + '00000010/export'))
self.assertEquals(export_monitor.last_export_dir,
six.b(os.path.join(export_dir, '00000010')))
# Validate the signature
signature = self._get_default_signature(export_dir + '00000010/export.meta')
self.assertTrue(signature.HasField('generic_signature'))
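`ExportMonitor.last_export_dir` is a byte string here, so the expected path is normalized with `six.b`. A quick illustration (mine) of what that wrapper does:

```python
# six.b() turns a text literal into bytes on Python 3 (and leaves it a
# plain str on Python 2), so the comparison works under both interpreters.
import os
import six

expected = six.b(os.path.join('/tmp/export', '00000010'))
assert expected == b'/tmp/export/00000010'
```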

View File

@ -351,6 +351,10 @@ class BFCAllocator : public VisitableAllocator {
inline int Log2FloorNonZero(uint64 n) {
#if defined(__GNUC__)
return 63 ^ __builtin_clzll(n);
#elif defined(PLATFORM_WINDOWS)
unsigned long index;
_BitScanReverse64(&index, n);
return index;
#else
int r = 0;
while (n > 0) {
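Both branches return the index of the highest set bit, i.e. floor(log2(n)): GCC via `63 ^ __builtin_clzll(n)` and the new MSVC branch via `_BitScanReverse64`. A quick Python sanity check of that identity (my sketch):

```python
# For 64-bit n > 0: clzll(n) == 64 - n.bit_length(), and
# 63 ^ clzll(n) == n.bit_length() - 1 == floor(log2(n)).
for n in (1, 2, 3, 255, 256, 2**63, 2**64 - 1):
    clz = 64 - n.bit_length()
    assert 63 ^ clz == n.bit_length() - 1
```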

View File

@ -873,7 +873,9 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
if (visible_device_list.empty()) {
visible_gpu_order.resize(gpu_manager->VisibleDeviceCount());
// By default, visible to virtual mapping is unchanged.
int deviceNo = 0;
std::generate(visible_gpu_order.begin(), visible_gpu_order.end(),
[&deviceNo]{ return deviceNo++; });
} else {
std::vector<string> order_str = str_util::Split(visible_device_list, ',');
for (int i = 0; i < order_str.size(); ++i) {

View File

@ -254,6 +254,10 @@ CUPTIManager *GetCUPTIManager() {
return manager;
}
#ifdef _MSC_VER
#define __thread __declspec(thread)
#endif
// TODO(pbar) Move this to platform specific header file?
// Static thread local variable for POD types.
#define TF_STATIC_THREAD_LOCAL_POD(_Type_, _var_) \

View File

@ -16,8 +16,10 @@ limitations under the License.
#include "tensorflow/core/common_runtime/gpu/pool_allocator.h" #include "tensorflow/core/common_runtime/gpu/pool_allocator.h"
#include <errno.h> #include <errno.h>
#ifndef _MSC_VER
#include <strings.h>
#include <sys/mman.h> // for munmap
#endif
#include <map>
#include <utility>

View File

@ -126,7 +126,7 @@ Allocator* ProcessState::GetGPUAllocator(const GPUOptions& options, int gpu_id,
gpu::StreamExecutor* se =
gpu_platform->ExecutorForDevice(gpu_id).ValueOrDie();
int bus_id = se->GetDeviceDescription().numa_node();
if (bus_id >= 0 && bus_id < static_cast<int64>(gpu_visitors_.size())) {
for (auto v : gpu_visitors_[bus_id]) {
gpu_allocators_[gpu_id]->AddAllocVisitor(v);
}

View File

@ -152,7 +152,7 @@ class Allocator {
// allocated by this allocator.
virtual size_t RequestedSize(void* ptr) {
CHECK(false) << "allocator doesn't track sizes";
return size_t(0);
}
// Returns the allocated size of the buffer at 'ptr' if known,

View File

@ -149,6 +149,7 @@ class DeviceBase {
// attributes requested. See allocator.h for more details.
virtual Allocator* GetAllocator(AllocatorAttributes /*attr*/) {
LOG(FATAL) << "GetAllocator() is not implemented.";
return nullptr;
}
// Return the Allocator implementation to use based on the allocator
@ -180,6 +181,8 @@ class DeviceBase {
virtual const DeviceAttributes& attributes() const {
LOG(FATAL) << "Device does not implement attributes()";
static DeviceAttributes dummy;
return dummy;
}
// Materializes the given TensorProto into 'tensor' stored in Device

View File

@ -348,6 +348,15 @@ TEST(Tensor_Float, Reshape) {
}
TEST(Tensor_Scalar, Basics) {
{
Tensor t(DT_BOOL, TensorShape({}));
EXPECT_EQ(1, t.NumElements());
auto Tt = t.scalar<bool>();
EXPECT_EQ(1, Tt.size());
EXPECT_EQ(0, Tt.rank());
t.scalar<bool>()() = true;
EXPECT_TRUE(Tt());
}
{
Tensor t(DT_FLOAT, TensorShape({}));
EXPECT_EQ(1, t.NumElements());

View File

@ -16,6 +16,7 @@ limitations under the License.
#if GOOGLE_CUDA
#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
namespace tensorflow {
namespace functor {
@ -31,6 +32,28 @@ struct SelectFunctor<GPUDevice, T> {
}
};
template <typename T>
struct SelectScalarFunctor<GPUDevice, T> {
void operator()(const GPUDevice& d, typename TTypes<T>::Flat out,
typename TTypes<bool>::ConstScalar cond,
typename TTypes<T>::ConstFlat then_flat,
typename TTypes<T>::ConstFlat else_flat) {
#if !defined(EIGEN_HAS_INDEX_LIST)
Eigen::array<int, 1> rank1{1};
#else
Eigen::IndexList<Eigen::type2index<1>> rank1;
#endif
const int size = then_flat.dimension(0);
Eigen::array<int, 1> broadcast_dims{size};
To32Bit(out).device(d) = cond.reshape(rank1)
.broadcast(broadcast_dims)
.select(then_flat, else_flat);
}
};
template <typename T>
struct BatchSelectFunctor<GPUDevice, T> {
void operator()(const GPUDevice& d,
@ -68,6 +91,7 @@ struct BatchSelectFunctor<GPUDevice, T> {
#define SELECT_FUNCTOR(T) \
template struct SelectFunctor<GPUDevice, T>; \
template struct SelectScalarFunctor<GPUDevice, T>; \
template struct BatchSelectFunctor<GPUDevice, T>;
SELECT_FUNCTOR(Eigen::half);

View File

@ -41,6 +41,11 @@ class SelectOp : public OpKernel {
OP_REQUIRES_OK(ctx, ctx->input("t", &then)); OP_REQUIRES_OK(ctx, ctx->input("t", &then));
OP_REQUIRES_OK(ctx, ctx->input("e", &else_)); OP_REQUIRES_OK(ctx, ctx->input("e", &else_));
if (TensorShapeUtils::IsScalar(cond->shape())){
ComputeScalar(ctx, cond, then, else_);
return;
}
bool broadcasting = (TensorShapeUtils::IsVector(cond->shape()) &&
!TensorShapeUtils::IsVector(then->shape()));
@ -108,6 +113,25 @@ class SelectOp : public OpKernel {
} }
} }
void ComputeScalar(OpKernelContext* ctx, const Tensor* cond,
const Tensor* then, const Tensor* else_) {
OP_REQUIRES(
ctx, then->shape().IsSameSize(else_->shape()),
errors::InvalidArgument(
"'then' and 'else' must have the same size. but received: ",
then->shape().DebugString(), " vs. ",
else_->shape().DebugString()));
Tensor* output = nullptr;
OP_REQUIRES_OK(ctx, ctx->allocate_output(0, then->shape(), &output));
if (output->NumElements() > 0) {
functor::SelectScalarFunctor<Device, T> func;
TTypes<bool>::ConstScalar cond_scalar = cond->scalar<bool>();
func(ctx->eigen_device<Device>(), output->flat<T>(), cond_scalar,
then->flat<T>(), else_->flat<T>());
}
}
private: private:
TF_DISALLOW_COPY_AND_ASSIGN(SelectOp); TF_DISALLOW_COPY_AND_ASSIGN(SelectOp);
}; };
@ -152,6 +176,17 @@ struct SelectFunctor<CPUDevice, T> {
} }
}; };
// CPU Specializations of Select functors with scalar
template <typename T>
struct SelectScalarFunctor<CPUDevice, T> {
void operator()(const CPUDevice& d, typename TTypes<T>::Flat out,
TTypes<bool>::ConstScalar cond,
typename TTypes<T>::ConstFlat then_flat,
typename TTypes<T>::ConstFlat else_flat) {
out.device(d) = cond() ? then_flat : else_flat;
}
};
template <typename T> template <typename T>
struct BatchSelectFunctor<CPUDevice, T> { struct BatchSelectFunctor<CPUDevice, T> {
void operator()(const CPUDevice& d, void operator()(const CPUDevice& d,

View File

@ -719,6 +719,14 @@ struct SelectFunctor {
                  typename TTypes<T>::ConstFlat else_flat);
};

template <typename Device, typename T>
struct SelectScalarFunctor {
  void operator()(const Device& d, typename TTypes<T>::Flat out,
                  typename TTypes<bool>::ConstScalar cond,
                  typename TTypes<T>::ConstFlat then_flat,
                  typename TTypes<T>::ConstFlat else_flat);
};

template <typename Device, typename T>
struct BatchSelectFunctor {
  void operator()(const Device& d,

View File

@ -21,7 +21,11 @@ limitations under the License.
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"

#if !defined(_MSC_VER)
#define UNROLL _Pragma("unroll")
#else
#define UNROLL
#endif

namespace tensorflow {

View File

@ -25,8 +25,25 @@ limitations under the License.
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/types.h"

#if GOOGLE_CUDA
#include "tensorflow/core/platform/stream_executor.h"
#endif  // GOOGLE_CUDA

namespace tensorflow {

#if GOOGLE_CUDA
namespace {
template <typename Scalar>
perftools::gputools::DeviceMemory<Scalar> AsDeviceMemory(
    const Scalar* cuda_memory) {
  perftools::gputools::DeviceMemoryBase wrapped(
      const_cast<Scalar*>(cuda_memory));
  perftools::gputools::DeviceMemory<Scalar> typed(wrapped);
  return typed;
}
}  // namespace
#endif  // GOOGLE_CUDA

template <class Scalar>
class MatrixTriangularSolveOp : public LinearAlgebraOp<Scalar> {
 public:
@ -60,7 +77,9 @@ class MatrixTriangularSolveOp : public LinearAlgebraOp<Scalar> {
  int64 GetCostPerUnit(const TensorShapes& input_matrix_shapes) const final {
    double rows = static_cast<double>(input_matrix_shapes[0].dim_size(0));
    double num_rhss = static_cast<double>(input_matrix_shapes[1].dim_size(1));
    double cost = rows * rows * num_rhss *
                  (Eigen::TensorOpCost::AddCost<Scalar>() +
                   Eigen::TensorOpCost::MulCost<Scalar>());
    return cost >= static_cast<double>(kint64max) ? kint64max
                                                  : static_cast<int64>(cost);
  }
@ -103,6 +122,121 @@ class MatrixTriangularSolveOp : public LinearAlgebraOp<Scalar> {
  TF_DISALLOW_COPY_AND_ASSIGN(MatrixTriangularSolveOp);
};
#ifdef GOOGLE_CUDA
template <class Scalar>
class MatrixTriangularSolveOpGPU : public LinearAlgebraOp<Scalar> {
public:
typedef LinearAlgebraOp<Scalar> Base;
explicit MatrixTriangularSolveOpGPU(OpKernelConstruction* context)
: Base(context), lower_(true), adjoint_(false) {
OP_REQUIRES_OK(context, context->GetAttr("lower", &lower_));
OP_REQUIRES_OK(context, context->GetAttr("adjoint", &adjoint_));
}
using TensorShapes = typename Base::TensorShapes;
using Matrix = typename Base::Matrix;
using MatrixMap = typename Base::MatrixMap;
using MatrixMaps = typename Base::MatrixMaps;
using ConstMatrixMap = typename Base::ConstMatrixMap;
using ConstMatrixMaps = typename Base::ConstMatrixMaps;
virtual void ValidateInputMatrixShapes(
OpKernelContext* context,
const TensorShapes& input_matrix_shapes) const final {
Base::ValidateSquareSolver(context, input_matrix_shapes);
}
TensorShapes GetOutputMatrixShapes(
const TensorShapes& input_matrix_shapes) const final {
return TensorShapes({TensorShape({input_matrix_shapes[0].dim_size(1),
input_matrix_shapes[1].dim_size(1)})});
}
int64 GetCostPerUnit(const TensorShapes& input_matrix_shapes) const final {
double rows = static_cast<double>(input_matrix_shapes[0].dim_size(0));
double num_rhss = static_cast<double>(input_matrix_shapes[1].dim_size(1));
double cost = rows * rows * num_rhss *
(Eigen::TensorOpCost::AddCost<Scalar>() +
Eigen::TensorOpCost::MulCost<Scalar>());
return cost >= static_cast<double>(kint64max) ? kint64max
: static_cast<int64>(cost);
}
void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs,
MatrixMaps* outputs) final {
const ConstMatrixMap& matrix = inputs[0];
const ConstMatrixMap& rhs = inputs[1];
MatrixMap& output = outputs->at(0);
if (matrix.rows() == 0 || rhs.cols() == 0) {
// To be consistent with the MatrixInverse op, we define the solution for
// an empty set of equations as the empty matrix.
return;
}
auto matrix_ptr = AsDeviceMemory(matrix.data());
auto rhs_ptr = AsDeviceMemory(rhs.data());
auto out_ptr = AsDeviceMemory(output.data());
auto* stream = context->op_device_context()->stream();
uint64 rhs_elems = rhs.rows() * rhs.cols();
bool copy_status =
stream->ThenMemcpyD2D(&out_ptr, rhs_ptr, sizeof(Scalar) * rhs_elems)
.ok();
if (!copy_status) {
context->SetStatus(
errors::Internal("Failed to copy rhs into output before solve"));
}
// Cublas does
// output = matrix \ rhs
// where matrix, rhs and output are assumed to be in column major.
// We want the output to be in row-major, so we can compute
// output' = rhs' / matrix' (' stands for transpose)
// Upper/lower needs to be swapped for this.
perftools::gputools::blas::UpperLower upper_lower_matrix;
perftools::gputools::blas::Transpose transpose_matrix;
if (lower_) {
upper_lower_matrix = perftools::gputools::blas::UpperLower::kUpper;
} else {
upper_lower_matrix = perftools::gputools::blas::UpperLower::kLower;
}
if (adjoint_) {
transpose_matrix = perftools::gputools::blas::Transpose::kTranspose;
} else {
transpose_matrix = perftools::gputools::blas::Transpose::kNoTranspose;
}
uint64 leading_dim_matrix = matrix.cols();
uint64 leading_dim_output = output.cols();
uint64 colmajor_rows = output.cols();
uint64 colmajor_cols = output.rows();
bool blas_launch_status =
stream
->ThenBlasTrsm(perftools::gputools::blas::Side::kRight /*side*/,
upper_lower_matrix /*uplo*/,
transpose_matrix /*trans*/,
perftools::gputools::blas::Diagonal::kNonUnit /*diag*/,
colmajor_rows /*m*/, colmajor_cols /*n*/,
Scalar(1.0) /*alpha*/,
matrix_ptr, leading_dim_matrix /*lda*/,
&out_ptr, leading_dim_output /*ldb*/)
.ok();
if (!blas_launch_status) {
context->SetStatus(errors::Internal("Blas TRSM launch failed"));
}
}
private:
bool lower_;
bool adjoint_;
TF_DISALLOW_COPY_AND_ASSIGN(MatrixTriangularSolveOpGPU);
};
#endif // GOOGLE_CUDA
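A short derivation (not in the commit) of the layout trick ComputeMatrix relies on: cuBLAS consumes column-major buffers, so the row-major TensorFlow buffers holding the triangular matrix A and the right-hand sides B are read by cuBLAS as their transposes, and the solve is therefore phrased from the right:

```latex
% trsm with side = Right solves  X A = B  for X. Feeding it the row-major
% buffers (which cuBLAS sees as transposes) computes
\[
  X^{\top} A^{\top} = B^{\top} \iff A X = B ,
\]
% and since transposition exchanges the lower and upper triangles, the
% uplo flag is swapped whenever no adjoint was requested.
```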
REGISTER_LINALG_OP("MatrixTriangularSolve", (MatrixTriangularSolveOp<float>), REGISTER_LINALG_OP("MatrixTriangularSolve", (MatrixTriangularSolveOp<float>),
float); float);
REGISTER_LINALG_OP("MatrixTriangularSolve", (MatrixTriangularSolveOp<double>), REGISTER_LINALG_OP("MatrixTriangularSolve", (MatrixTriangularSolveOp<double>),
@ -112,4 +246,30 @@ REGISTER_LINALG_OP("BatchMatrixTriangularSolve",
REGISTER_LINALG_OP("BatchMatrixTriangularSolve", REGISTER_LINALG_OP("BatchMatrixTriangularSolve",
(MatrixTriangularSolveOp<double>), double); (MatrixTriangularSolveOp<double>), double);
#ifdef GOOGLE_CUDA
REGISTER_KERNEL_BUILDER(
Name("MatrixTriangularSolve")
.Device(DEVICE_GPU)
.TypeConstraint<float>("T"),
MatrixTriangularSolveOpGPU<float>);
REGISTER_KERNEL_BUILDER(
Name("MatrixTriangularSolve")
.Device(DEVICE_GPU)
.TypeConstraint<double>("T"),
MatrixTriangularSolveOpGPU<double>);
REGISTER_KERNEL_BUILDER(
Name("BatchMatrixTriangularSolve")
.Device(DEVICE_GPU)
.TypeConstraint<float>("T"),
MatrixTriangularSolveOpGPU<float>);
REGISTER_KERNEL_BUILDER(
Name("BatchMatrixTriangularSolve")
.Device(DEVICE_GPU)
.TypeConstraint<double>("T"),
MatrixTriangularSolveOpGPU<double>);
#endif //GOOGLE_CUDA
} // namespace tensorflow } // namespace tensorflow

View File

@ -115,10 +115,12 @@ class AllSampler : public RangeSampler {
  int64 Sample(random::SimplePhilox* rnd) const override {
    LOG(FATAL) << "Should not be called";
    return 0;
  }

  float Probability(int64 value) const override {
    LOG(FATAL) << "Should not be called";
    return 0;
  }

  void SampleBatchGetExpectedCountAvoid(

View File

@ -55,7 +55,10 @@ string JoinPathImpl(std::initializer_list<StringPiece> paths) {
// the first part of the output.
std::pair<StringPiece, StringPiece> SplitPath(StringPiece path) {
  auto pos = path.rfind('/');
#ifdef PLATFORM_WINDOWS
  if (pos == StringPiece::npos)
    pos = path.rfind('\\');
#endif
  // Handle the case with no '/' in 'path'.
  if (pos == StringPiece::npos)
    return std::make_pair(StringPiece(path.data(), 0), path);
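For intuition, the effect of the extra branch on a few hypothetical inputs (illustrative only; note that a path mixing both separators still splits at the last '/'):

```cpp
// SplitPath("a/b/c")    -> ("a/b", "c")     // unchanged POSIX behaviour
// SplitPath("a\\b\\c")  -> ("a\\b", "c")    // now handled on Windows
// SplitPath("noslash")  -> ("", "noslash")  // no separator of either kind
```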

View File

@ -913,7 +913,8 @@ REGISTER_OP("Select")
    .SetShapeFn([](InferenceContext* c) {
      // The inputs 'then' and 'else' must have the same shape.
      ShapeHandle data = c->input(1);
      ShapeHandle other = c->input(2);
      TF_RETURN_IF_ERROR(c->Merge(data, other, &data));

      // The input 'cond' must either have the same shape as 'then' and
      // 'else', or be a vector if 'then' and 'else' are at least vectors.
@ -929,30 +930,49 @@ REGISTER_OP("Select")
      const int32 cond_rank = c->Rank(cond);
      const int32 data_rank = c->Rank(data);

      if (cond_rank == 0) {
        // 'cond' is a scalar: 't' and 'e' can have any shape.
        c->set_output(0, data);
        return Status::OK();
      }

      if (cond_rank != 1) {
        // If 'cond' is neither a scalar nor a vector, its shape must match
        // 'then' and 'else'.
        TF_RETURN_IF_ERROR(c->Merge(data, cond, &data));
        c->set_output(0, data);
        return Status::OK();
      }

      if (data_rank == 0) {
        // If 'then' and 'else' are scalars, 'cond' must be as well.
        TF_RETURN_IF_ERROR(c->Merge(data, cond, &data));
        c->set_output(0, data);
        return Status::OK();
      }

      if (cond_rank == 1) {
        // If 'cond' is a vector and 'then' is not a scalar, the first
        // dimension of 'then' and 'else' must match the size of 'cond'.
        TF_RETURN_IF_ERROR(c->Merge(cond, c->Vector(c->Dim(data, 0)), &cond));
        c->set_output(0, data);
        return Status::OK();
      }

      c->set_output(0, data);
      return Status::OK();
    })
    .Doc(R"doc(
Selects elements from `t` or `e`, depending on `condition`.

The `t` and `e` tensors must all have the same shape, and the
output will also have that shape.

The `condition` tensor must be a scalar if `t` and `e` are scalars.
If `t` and `e` are vectors or higher rank, then `condition` must be either a
scalar, a vector with size matching the first dimension of `t`, or must have
the same shape as `t`.

The `condition` tensor acts as a mask that chooses, based on the value at each
element, whether the corresponding element / row in the output should be

View File

@ -188,7 +188,10 @@ TEST(MathOpsTest, Select_ShapeFn) {
  ShapeInferenceTestOp op("Select");
  INFER_OK(op, "?;?;?", "in1|in2");

  // scalar case
  INFER_OK(op, "[];[1];?", "in1");
  INFER_OK(op, "[];?;?", "in1|in2");
  INFER_OK(op, "[1];?;?",
           "in1|in2");  // When cond is vector, t/e may not match it.
  INFER_OK(op, "[1,2];?;?", "in1|in2?");
@ -200,8 +203,8 @@ TEST(MathOpsTest, Select_ShapeFn) {
  INFER_OK(op, "?;[1,2];?", "in1");
  INFER_OK(op, "?;?;[1,2]", "in2");
  INFER_ERROR("Shapes must be equal rank, but are 0 and 1", op, "[1];[];?");
  INFER_ERROR("Shapes must be equal rank, but are 1 and 2", op, "[];[1];[1,2]");
  INFER_ERROR("Shapes must be equal rank, but are 1 and 2", op, "[1,2];[1];?");
  INFER_OK(op, "[2];[?];[?]", "in1|in2");

View File

@ -20,9 +20,11 @@ limitations under the License.
#include <stddef.h>
#include <stdint.h>

#if defined(WIN32)
#include "extras/CUPTI/include/cupti.h"
#else
#include "cuda/extras/CUPTI/include/cupti.h"
#endif

namespace perftools {
namespace gputools {
namespace profiler {

View File

@ -261,6 +261,14 @@ class Env {
  virtual Status GetSymbolFromLibrary(void* handle, const char* symbol_name,
                                      void** symbol) = 0;

  // \brief Builds the name of a dynamic library.
  //
  // "name" should be the name of the library.
  // "version" should be the version of the library, or empty.
  // Returns a name that LoadLibrary() can use.
  virtual string FormatLibraryFileName(const string& name,
                                       const string& version) = 0;

 private:
  std::unique_ptr<FileSystemRegistry> file_system_registry_;
  TF_DISALLOW_COPY_AND_ASSIGN(Env);
@ -318,7 +326,10 @@ class EnvWrapper : public Env {
                              void** symbol) override {
    return target_->GetSymbolFromLibrary(handle, symbol_name, symbol);
  }
  string FormatLibraryFileName(const string& name,
                               const string& version) override {
    return target_->FormatLibraryFileName(name, version);
  }

 private:
  Env* target_;
};
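A sketch of how the new method composes with the existing loading API (hypothetical library and symbol names, error handling reduced to checks):

```cpp
#include "tensorflow/core/platform/env.h"

// Hypothetical: build a platform-appropriate file name, load the library,
// and resolve a symbol from it. On Windows this probes "myop.dll"; other
// platforms format the name according to their own conventions.
void LoadMyOpLibrary() {
  tensorflow::Env* env = tensorflow::Env::Default();
  tensorflow::string filename = env->FormatLibraryFileName("myop", "");
  void* handle = nullptr;
  TF_CHECK_OK(env->LoadLibrary(filename.c_str(), &handle));
  void* symbol = nullptr;
  TF_CHECK_OK(env->GetSymbolFromLibrary(handle, "MyOpInit", &symbol));
}
```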

View File

@ -25,8 +25,6 @@ namespace internal {

Status LoadLibrary(const char* library_filename, void** handle);
Status GetSymbolFromLibrary(void* handle, const char* symbol_name,
                            void** symbol);
string FormatLibraryFileName(const string& name, const string& version);

}  // namespace internal

View File

@ -20,7 +20,8 @@ limitations under the License.
// mobile.
#if !defined(PLATFORM_POSIX) && !defined(PLATFORM_GOOGLE) && \
    !defined(PLATFORM_POSIX_ANDROID) && !defined(PLATFORM_GOOGLE_ANDROID) && \
    !defined(PLATFORM_WINDOWS)

// Choose which platform we are on.
#if defined(ANDROID) || defined(__ANDROID__)
View File

@ -119,6 +119,10 @@ class PosixEnv : public Env {
    return tensorflow::internal::GetSymbolFromLibrary(handle, symbol_name,
                                                      symbol);
  }

  string FormatLibraryFileName(const string& name, const string& version) {
    return tensorflow::internal::FormatLibraryFileName(name, version);
  }
};

}  // namespace

View File

@ -22,7 +22,7 @@ limitations under the License.
#if defined(PLATFORM_GOOGLE)
#include "tensorflow/core/platform/google/stacktrace.h"
#elif defined(PLATFORM_POSIX) || defined(PLATFORM_POSIX_ANDROID) || \
    defined(PLATFORM_GOOGLE_ANDROID) || defined(PLATFORM_WINDOWS)
#include "tensorflow/core/platform/default/stacktrace.h"
#else
#error Define the appropriate PLATFORM_<foo> macro for this platform

View File

@ -26,6 +26,7 @@ limitations under the License.
#include <thread>
#include <vector>
#include <string>

#include "tensorflow/core/lib/core/error_codes.pb.h"
#include "tensorflow/core/platform/load_library.h"
@ -52,7 +53,20 @@ class StdThread : public Thread {

class WindowsEnv : public Env {
 public:
  WindowsEnv()
      : GetSystemTimePreciseAsFileTime_(NULL) {
    // GetSystemTimePreciseAsFileTime function is only available in the latest
    // versions of Windows. For that reason, we try to look it up in
    // kernel32.dll at runtime and use an alternative option if the function
    // is not available.
    HMODULE module = GetModuleHandle("kernel32.dll");
    if (module != NULL) {
      auto func = (FnGetSystemTimePreciseAsFileTime)GetProcAddress(
          module, "GetSystemTimePreciseAsFileTime");
      GetSystemTimePreciseAsFileTime_ = func;
    }
  }

  ~WindowsEnv() override {
    LOG(FATAL) << "Env::Default() must not be destroyed";
  }
@ -62,11 +76,32 @@ class WindowsEnv : public Env {
  }

  uint64 NowMicros() override {
    if (GetSystemTimePreciseAsFileTime_ != NULL) {
      // GetSystemTimePreciseAsFileTime function is only available in latest
      // versions of Windows, so we need to check for its existence here.
      // All std::chrono clocks on Windows proved to return
      // values that may repeat, which is not good enough for some uses.
      constexpr int64_t kUnixEpochStartTicks = 116444736000000000i64;
      constexpr int64_t kFtToMicroSec = 10;
      // This interface needs to return system time and not
      // just any microseconds because it is often used as an argument
      // to TimedWait() on condition variable.
      FILETIME system_time;
      GetSystemTimePreciseAsFileTime_(&system_time);

      LARGE_INTEGER li;
      li.LowPart = system_time.dwLowDateTime;
      li.HighPart = system_time.dwHighDateTime;
      // Subtract unix epoch start
      li.QuadPart -= kUnixEpochStartTicks;
      // Convert to microsecs
      li.QuadPart /= kFtToMicroSec;
      return li.QuadPart;
    }
    using namespace std::chrono;
    return duration_cast<microseconds>(
        system_clock::now().time_since_epoch()).count();
  }
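The magic epoch constant can be sanity-checked with a little calendar arithmetic; a standalone check (not part of the commit):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // 1601-01-01 to 1970-01-01 spans 369 years, 89 of them leap years
  // (92 multiples of 4, minus the non-leap centuries 1700, 1800, 1900).
  const int64_t days = 369 * 365 + 89;       // 134774 days
  const int64_t seconds = days * 86400;      // 11644473600 seconds
  const int64_t ticks = seconds * 10000000;  // 100-ns FILETIME ticks
  assert(ticks == 116444736000000000LL);     // == kUnixEpochStartTicks
  return 0;
}
```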
  void SleepForMicroseconds(int64 micros) override { Sleep(micros / 1000); }
@ -94,19 +129,53 @@ class WindowsEnv : public Env {
    });
  }

  Status LoadLibrary(const char* library_filename, void** handle) override {
    std::string file_name = library_filename;
    std::replace(file_name.begin(), file_name.end(), '/', '\\');
    HMODULE hModule = LoadLibraryEx(file_name.c_str(), NULL,
                                    LOAD_WITH_ALTERED_SEARCH_PATH);
    if (!hModule) {
      return errors::NotFound(file_name + " not found");
    }
    *handle = hModule;
    return Status::OK();
  }

  Status GetSymbolFromLibrary(void* handle, const char* symbol_name,
                              void** symbol) override {
    FARPROC found_symbol = GetProcAddress((HMODULE)handle, symbol_name);
    if (found_symbol == NULL) {
      return errors::NotFound(std::string(symbol_name) + " not found");
    }
    *symbol = (void**)found_symbol;
    return Status::OK();
  }

  string FormatLibraryFileName(const string& name,
                               const string& version) override {
    string filename;
    if (version.size() == 0) {
      filename = name + ".dll";
    } else {
      filename = name + version + ".dll";
    }
    return filename;
  }

 private:
  typedef VOID(WINAPI* FnGetSystemTimePreciseAsFileTime)(LPFILETIME);
  FnGetSystemTimePreciseAsFileTime GetSystemTimePreciseAsFileTime_;
};

}  // namespace

REGISTER_FILE_SYSTEM("", WindowsFileSystem);
REGISTER_FILE_SYSTEM("file", LocalWinFileSystem);

Env* Env::Default() {
  static Env* default_env = new WindowsEnv;
  return default_env;

View File

@ -0,0 +1,33 @@
/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/platform/windows/error.h"
namespace tensorflow {
namespace internal {
std::string GetWindowsErrorMessage(DWORD err) {
LPSTR buffer = NULL;
DWORD flags = FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM |
FORMAT_MESSAGE_IGNORE_INSERTS;
FormatMessageA(flags, NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
reinterpret_cast<LPSTR>(&buffer), 0, NULL);
std::string message = buffer;
LocalFree(buffer);
return message;
}
} // namespace internal
} // namespace tensorflow

View File

@ -0,0 +1,32 @@
/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CORE_PLATFORM_WINDOWS_ERROR_H_
#define TENSORFLOW_CORE_PLATFORM_WINDOWS_ERROR_H_
#include <string>
#include <Windows.h>
namespace tensorflow {
namespace internal {
std::string GetWindowsErrorMessage(DWORD err);
}  // namespace internal
}  // namespace tensorflow
#endif // TENSORFLOW_CORE_PLATFORM_WINDOWS_ERROR_H_

View File

@ -15,25 +15,27 @@ limitations under the License.
#include "tensorflow/core/platform/net.h"

#include <cerrno>
#include <cstdlib>
#include <unordered_set>

#include <sys/types.h>
#include <winsock2.h>

#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/windows/error.h"

#undef ERROR

#pragma comment(lib, "Ws2_32.lib")

namespace tensorflow {
namespace internal {
namespace {

bool IsPortAvailable(int* port, bool is_tcp) {
  const int protocol = is_tcp ? IPPROTO_TCP : 0;
  SOCKET sock = socket(AF_INET, is_tcp ? SOCK_STREAM : SOCK_DGRAM, protocol);

  struct sockaddr_in addr;
  int addr_len = static_cast<int>(sizeof(addr));
@ -41,17 +43,20 @@ bool IsPortAvailable(int* port, bool is_tcp) {
  CHECK_GE(*port, 0);
  CHECK_LE(*port, 65535);
  if (sock == INVALID_SOCKET) {
    LOG(ERROR) << "socket() failed: "
               << GetWindowsErrorMessage(WSAGetLastError());
    return false;
  }

  // SO_REUSEADDR lets us start up a server immediately after it exits.
  const int one = 1;
  int result = setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
                          reinterpret_cast<const char*>(&one), sizeof(one));
  if (result == SOCKET_ERROR) {
    LOG(ERROR) << "setsockopt() failed: "
               << GetWindowsErrorMessage(WSAGetLastError());
    closesocket(sock);
    return false;
  }
@ -59,18 +64,23 @@ bool IsPortAvailable(int* port, bool is_tcp) {
  addr.sin_family = AF_INET;
  addr.sin_addr.s_addr = INADDR_ANY;
  addr.sin_port = htons((uint16_t)*port);
  result = bind(sock, (struct sockaddr*)&addr, sizeof(addr));
  if (result == SOCKET_ERROR) {
    LOG(WARNING) << "bind(port=" << *port << ") failed: "
                 << GetWindowsErrorMessage(WSAGetLastError());
    closesocket(sock);
    return false;
  }

  // Get the bound port number.
  result = getsockname(sock, (struct sockaddr*)&addr, &addr_len);
  if (result == SOCKET_ERROR) {
    LOG(WARNING) << "getsockname() failed: "
                 << GetWindowsErrorMessage(WSAGetLastError());
    closesocket(sock);
    return false;
  }

  CHECK_LE(addr_len, sizeof(addr));
  actual_port = ntohs(addr.sin_port);
  CHECK_GT(actual_port, 0);
@ -79,7 +89,8 @@ bool IsPortAvailable(int* port, bool is_tcp) {
  } else {
    CHECK_EQ(*port, actual_port);
  }

  closesocket(sock);
  return true;
}
@ -89,6 +100,12 @@ const int kMaximumTrials = 1000;

}  // namespace

int PickUnusedPortOrDie() {
  WSADATA wsaData;
  if (WSAStartup(MAKEWORD(2, 2), &wsaData) != NO_ERROR) {
    LOG(ERROR) << "Error at WSAStartup()";
    return false;
  }

  static std::unordered_set<int> chosen_ports;

  // Type of port to first pick in the next iteration.
@ -121,6 +138,7 @@ int PickUnusedPortOrDie() {
  }
  chosen_ports.insert(port);
  WSACleanup();
  return port;
}

View File

@ -19,8 +19,8 @@ limitations under the License.
#ifdef SNAPPY
#include <snappy.h>
#endif

#include <WinSock2.h>
#include <Windows.h>

#include "tensorflow/core/platform/cpu_info.h"
#include "tensorflow/core/platform/demangle.h"
@ -37,10 +37,13 @@ namespace port {

void InitMain(const char* usage, int* argc, char*** argv) {}

string Hostname() {
  char name[1024];
  DWORD name_size = sizeof(name);
  name[0] = 0;
  if (::GetComputerNameA(name, &name_size)) {
    name[name_size] = 0;
  }
  return name;
}

int NumSchedulableCPUs() {

View File

@ -30,6 +30,7 @@ limitations under the License.
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/posix/error.h"
#include "tensorflow/core/platform/windows/error.h"
#include "tensorflow/core/platform/windows/windows_file_system.h"

// TODO(mrry): Prevent this Windows.h #define from leaking out of our headers.
@ -39,19 +40,71 @@ namespace tensorflow {

namespace {
// RAII helpers for HANDLEs
const auto CloseHandleFunc = [](HANDLE h) { ::CloseHandle(h); };
typedef std::unique_ptr<void, decltype(CloseHandleFunc)> UniqueCloseHandlePtr;
inline Status IOErrorFromWindowsError(const string& context, DWORD err) {
return IOError(
context + string(" : ") + internal::GetWindowsErrorMessage(err), err);
}
// PLEASE NOTE: hfile is expected to be an async handle
// (i.e. opened with FILE_FLAG_OVERLAPPED)
SSIZE_T pread(HANDLE hfile, char* src, size_t num_bytes, uint64_t offset) {
assert(num_bytes <= std::numeric_limits<DWORD>::max());
OVERLAPPED overlapped = {0};
ULARGE_INTEGER offset_union;
offset_union.QuadPart = offset;
overlapped.Offset = offset_union.LowPart;
overlapped.OffsetHigh = offset_union.HighPart;
overlapped.hEvent = ::CreateEvent(NULL, TRUE, FALSE, NULL);
if (NULL == overlapped.hEvent) {
return -1;
}
SSIZE_T result = 0;
unsigned long bytes_read = 0;
DWORD last_error = ERROR_SUCCESS;
BOOL read_result = ::ReadFile(hfile, src, static_cast<DWORD>(num_bytes),
&bytes_read, &overlapped);
if ((FALSE == read_result) &&
((last_error = GetLastError()) != ERROR_IO_PENDING)) {
result = (last_error == ERROR_HANDLE_EOF) ? 0 : -1;
} else {
if (ERROR_IO_PENDING == last_error) { // Otherwise bytes_read already has the result.
BOOL overlapped_result = ::GetOverlappedResult(hfile, &overlapped,
&bytes_read, TRUE);
if (FALSE == overlapped_result) {
result = (::GetLastError() == ERROR_HANDLE_EOF) ? 0 : -1;
} else {
result = bytes_read;
}
}
}
::CloseHandle(overlapped.hEvent);
return result;
}
// read() based random-access
class WindowsRandomAccessFile : public RandomAccessFile {
 private:
  string filename_;
  HANDLE hfile_;

 public:
  WindowsRandomAccessFile(const string& fname, HANDLE hfile)
      : filename_(fname), hfile_(hfile) {}

  ~WindowsRandomAccessFile() override {
    if (hfile_ != NULL && hfile_ != INVALID_HANDLE_VALUE) {
      ::CloseHandle(hfile_);
    }
  }
@ -59,13 +112,10 @@ class WindowsRandomAccessFile : public RandomAccessFile {
              char* scratch) const override {
    Status s;
    char* dst = scratch;
    while (n > 0 && s.ok()) {
      SSIZE_T r = pread(hfile_, dst, n, offset);
      if (r > 0) {
        offset += r;
        dst += r;
        n -= r;
      } else if (r == 0) {
@ -84,104 +134,246 @@ class WindowsRandomAccessFile : public RandomAccessFile {
class WindowsWritableFile : public WritableFile {
 private:
  string filename_;
  HANDLE hfile_;

 public:
  WindowsWritableFile(const string& fname, HANDLE hFile)
      : filename_(fname), hfile_(hFile) {}

  ~WindowsWritableFile() override {
    if (hfile_ != NULL && hfile_ != INVALID_HANDLE_VALUE) {
      WindowsWritableFile::Close();
    }
  }

  Status Append(const StringPiece& data) override {
    DWORD bytes_written = 0;
    DWORD data_size = static_cast<DWORD>(data.size());
    BOOL write_result = ::WriteFile(hfile_, data.data(), data_size,
                                    &bytes_written, NULL);
    if (FALSE == write_result) {
      return IOErrorFromWindowsError(
          "Failed to WriteFile: " + filename_, ::GetLastError());
    }
    assert(size_t(bytes_written) == data.size());
    return Status::OK();
  }

  Status Close() override {
    assert(INVALID_HANDLE_VALUE != hfile_);

    Status result = Flush();
    if (!result.ok()) {
      return result;
    }

    if (FALSE == ::CloseHandle(hfile_)) {
      return IOErrorFromWindowsError(
          "CloseHandle failed for: " + filename_, ::GetLastError());
    }

    hfile_ = INVALID_HANDLE_VALUE;
    return Status::OK();
  }

  Status Flush() override {
    if (FALSE == ::FlushFileBuffers(hfile_)) {
      return IOErrorFromWindowsError(
          "FlushFileBuffers failed for: " + filename_, ::GetLastError());
    }
    return Status::OK();
  }

  Status Sync() override {
    return Flush();
  }
};
class WinReadOnlyMemoryRegion : public ReadOnlyMemoryRegion {
private:
const std::string filename_;
HANDLE hfile_;
HANDLE hmap_;
const void* const address_;
const uint64 length_;
public:
WinReadOnlyMemoryRegion(const std::string& filename, HANDLE hfile,
HANDLE hmap, const void* address, uint64 length)
: filename_(filename), hfile_(hfile), hmap_(hmap), address_(address),
length_(length) {}
~WinReadOnlyMemoryRegion() {
BOOL ret = ::UnmapViewOfFile(address_);
assert(ret);
ret = ::CloseHandle(hmap_);
assert(ret);
ret = ::CloseHandle(hfile_);
assert(ret);
}
const void* data() override { return address_; }
uint64 length() override { return length_; }
};
}  // namespace
Status WindowsFileSystem::NewRandomAccessFile(
    const string& fname, std::unique_ptr<RandomAccessFile>* result) {
  string translated_fname = TranslateName(fname);
  result->reset();

  // Open the file for read-only random access.
  // Random access is to disable read-ahead, as the system reads too much data.
  // Open in async mode which makes Windows allow more parallelism even
  // if we need to do sync I/O on top of it.
  DWORD file_flags = FILE_ATTRIBUTE_READONLY | FILE_FLAG_RANDOM_ACCESS |
                     FILE_FLAG_OVERLAPPED;
  // Shared access is necessary for tests to pass;
  // almost all tests would work with a possible exception of fault_injection.
  DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
  HANDLE hfile = ::CreateFileA(translated_fname.c_str(), GENERIC_READ,
                               share_mode, NULL, OPEN_EXISTING, file_flags,
                               NULL);

  if (INVALID_HANDLE_VALUE == hfile) {
    string context = "NewRandomAccessFile failed to Create/Open: " + fname;
    return IOErrorFromWindowsError(context, ::GetLastError());
  }

  result->reset(new WindowsRandomAccessFile(translated_fname, hfile));
  return Status::OK();
}
Status WindowsFileSystem::NewWritableFile(
    const string& fname, std::unique_ptr<WritableFile>* result) {
  string translated_fname = TranslateName(fname);
  result->reset();

  DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
  HANDLE hfile = ::CreateFileA(translated_fname.c_str(), GENERIC_WRITE,
                               share_mode, NULL, CREATE_ALWAYS,
                               FILE_ATTRIBUTE_NORMAL, NULL);

  if (INVALID_HANDLE_VALUE == hfile) {
    string context = "Failed to create a NewWritableFile: " + fname;
    return IOErrorFromWindowsError(context, ::GetLastError());
  }

  result->reset(new WindowsWritableFile(translated_fname, hfile));
  return Status::OK();
}
Status WindowsFileSystem::NewAppendableFile(
    const string& fname, std::unique_ptr<WritableFile>* result) {
  string translated_fname = TranslateName(fname);
  result->reset();

  DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
  HANDLE hfile = ::CreateFileA(translated_fname.c_str(), GENERIC_WRITE,
                               share_mode, NULL, OPEN_ALWAYS,
                               FILE_ATTRIBUTE_NORMAL, NULL);

  if (INVALID_HANDLE_VALUE == hfile) {
    string context = "Failed to create a NewAppendableFile: " + fname;
    return IOErrorFromWindowsError(context, ::GetLastError());
  }

  UniqueCloseHandlePtr file_guard(hfile, CloseHandleFunc);

  DWORD file_ptr = ::SetFilePointer(hfile, NULL, NULL, FILE_END);
  if (INVALID_SET_FILE_POINTER == file_ptr) {
    string context = "Failed to create a NewAppendableFile: " + fname;
    return IOErrorFromWindowsError(context, ::GetLastError());
  }

  result->reset(new WindowsWritableFile(translated_fname, hfile));
  file_guard.release();
  return Status::OK();
}
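The three factory methods map the classic stdio modes onto Win32 creation dispositions; in brief (standard CreateFile semantics, summarized here for reference):

```cpp
// OPEN_EXISTING  - NewRandomAccessFile: fail if the file is missing ("r")
// CREATE_ALWAYS  - NewWritableFile: create, or truncate an existing file ("w")
// OPEN_ALWAYS    - NewAppendableFile: open or create ("a"); the following
//                  SetFilePointer(..., FILE_END) supplies the append position
```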
Status WindowsFileSystem::NewReadOnlyMemoryRegionFromFile(
    const string& fname, std::unique_ptr<ReadOnlyMemoryRegion>* result) {
  string translated_fname = TranslateName(fname);
  result->reset();
  Status s = Status::OK();
// Open the file for read-only random access
DWORD file_flags = FILE_ATTRIBUTE_READONLY | FILE_FLAG_RANDOM_ACCESS;
// Open in async mode which makes Windows allow more parallelism even
// if we need to do sync I/O on top of it.
file_flags |= FILE_FLAG_OVERLAPPED;
DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
HANDLE hfile = ::CreateFileA(translated_fname.c_str(), GENERIC_READ,
share_mode, NULL, OPEN_EXISTING, file_flags,
NULL);
if (INVALID_HANDLE_VALUE == hfile) {
return IOErrorFromWindowsError(
"NewReadOnlyMemoryRegionFromFile failed to Create/Open: " + fname,
::GetLastError());
}
UniqueCloseHandlePtr file_guard(hfile, CloseHandleFunc);
// Use mmap when virtual address-space is plentiful.
uint64_t file_size;
s = GetFileSize(translated_fname, &file_size);
if (s.ok()) {
// Will not map empty files
if (file_size == 0) {
return IOError(
"NewReadOnlyMemoryRegionFromFile failed to map empty file: " + fname,
EINVAL);
}
HANDLE hmap = ::CreateFileMappingA(hfile, NULL, PAGE_READONLY,
0, // Whole file at its present length
0,
NULL); // Mapping name
if (!hmap) {
string context = "Failed to create file mapping for "
"NewReadOnlyMemoryRegionFromFile: " + fname;
return IOErrorFromWindowsError(context, ::GetLastError());
}
UniqueCloseHandlePtr map_guard(hmap, CloseHandleFunc);
const void* mapped_region = ::MapViewOfFileEx(
hmap, FILE_MAP_READ,
0, // High DWORD of access start
0, // Low DWORD
file_size,
NULL); // Let the OS choose the mapping
if (!mapped_region) {
string context = "Failed to MapViewOfFile for "
"NewReadOnlyMemoryRegionFromFile: " + fname;
return IOErrorFromWindowsError(context, ::GetLastError());
}
result->reset(new WinReadOnlyMemoryRegion(fname, hfile, hmap,
mapped_region, file_size));
map_guard.release();
file_guard.release();
}
return s;
}
bool WindowsFileSystem::FileExists(const string& fname) {
  constexpr int kOk = 0;
  return _access(TranslateName(fname).c_str(), kOk) == 0;
}
Status WindowsFileSystem::GetChildren(const string& dir,
@ -189,27 +381,39 @@ Status WindowsFileSystem::GetChildren(const string& dir,
  string translated_dir = TranslateName(dir);
  result->clear();

  string pattern = translated_dir;
  if (!pattern.empty() && pattern.back() != '\\' && pattern.back() != '/') {
    pattern += "\\*";
  } else {
    pattern += '*';
  }

  WIN32_FIND_DATA find_data;
  HANDLE find_handle = ::FindFirstFileA(pattern.c_str(), &find_data);
  if (find_handle == INVALID_HANDLE_VALUE) {
    string context = "FindFirstFile failed for: " + translated_dir;
    return IOErrorFromWindowsError(context, ::GetLastError());
  }

  do {
    const StringPiece basename = find_data.cFileName;
    if (basename != "." && basename != "..") {
      result->push_back(find_data.cFileName);
    }
  } while (::FindNextFileA(find_handle, &find_data));

  if (!::FindClose(find_handle)) {
    string context = "FindClose failed for: " + translated_dir;
    return IOErrorFromWindowsError(context, ::GetLastError());
  }

  return Status::OK();
}
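FindFirstFileA enumerates by wildcard pattern rather than by directory handle, which is why a trailing `\*` is appended; for a couple of hypothetical inputs:

```cpp
// "C:\\data"   -> pattern "C:\\data\\*"  (separator inserted first)
// "C:\\data\\" -> pattern "C:\\data\\*"  (separator already present)
// The "." and ".." entries produced by the scan are filtered out above.
```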
Status WindowsFileSystem::DeleteFile(const string& fname) {
  Status result;
  if (unlink(TranslateName(fname).c_str()) != 0) {
    result = IOError("Failed to delete a file: " + fname, errno);
  }
  return result;
}
@ -217,7 +421,7 @@ Status WindowsFileSystem::DeleteFile(const string& fname) {
Status WindowsFileSystem::CreateDir(const string& name) {
  Status result;
  if (_mkdir(TranslateName(name).c_str()) != 0) {
    result = IOError("Failed to create a directory: " + name, errno);
  }
  return result;
}
@ -225,42 +429,52 @@ Status WindowsFileSystem::CreateDir(const string& name) {
Status WindowsFileSystem::DeleteDir(const string& name) {
  Status result;
  if (_rmdir(TranslateName(name).c_str()) != 0) {
    result = IOError("Failed to remove a directory: " + name, errno);
  }
  return result;
}

Status WindowsFileSystem::GetFileSize(const string& fname, uint64* size) {
  string translated_fname = TranslateName(fname);
  Status result;
  WIN32_FILE_ATTRIBUTE_DATA attrs;
  if (TRUE == ::GetFileAttributesExA(translated_fname.c_str(),
                                     GetFileExInfoStandard, &attrs)) {
    ULARGE_INTEGER file_size;
    file_size.HighPart = attrs.nFileSizeHigh;
    file_size.LowPart = attrs.nFileSizeLow;
    *size = file_size.QuadPart;
  } else {
    string context = "Can not get size for: " + fname;
    result = IOErrorFromWindowsError(context, ::GetLastError());
  }
  return result;
}
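The ULARGE_INTEGER union above simply splices two 32-bit halves into a 64-bit byte count; an equivalent shift-based sketch (illustrative helper, not from the commit):

```cpp
#include <cstdint>

// Recombine the nFileSizeHigh/nFileSizeLow pair reported by
// GetFileAttributesExA into a single 64-bit size.
uint64_t CombineFileSize(uint32_t high, uint32_t low) {
  return (static_cast<uint64_t>(high) << 32) | low;
}
```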
Status WindowsFileSystem::RenameFile(const string& src, const string& target) {
  Status result;
  // rename() is not capable of replacing the existing file as on Linux,
  // so use the OS API directly.
  if (!::MoveFileExA(TranslateName(src).c_str(), TranslateName(target).c_str(),
                     MOVEFILE_REPLACE_EXISTING)) {
    string context(strings::StrCat("Failed to rename: ", src, " to: ", target));
    result = IOErrorFromWindowsError(context, ::GetLastError());
  }
  return result;
}

Status WindowsFileSystem::Stat(const string& fname, FileStatistics* stat) {
  Status result;
  struct _stat sbuf;
  if (_stat(TranslateName(fname).c_str(), &sbuf) != 0) {
    result = IOError(fname, errno);
  } else {
    stat->mtime_nsec = sbuf.st_mtime * 1e9;
    stat->length = sbuf.st_size;
    stat->is_directory = PathIsDirectory(TranslateName(fname).c_str());
  }
  return result;
}
}  // namespace tensorflow

View File

@ -64,7 +64,14 @@ class WindowsFileSystem : public FileSystem {
  }
};

class LocalWinFileSystem : public WindowsFileSystem {
 public:
  string TranslateName(const string& name) const override {
    StringPiece scheme, host, path;
    ParseURI(name, &scheme, &host, &path);
    return path.ToString();
  }
};

}  // namespace tensorflow

View File

@ -20,7 +20,7 @@ limitations under the License.
#define TF_MAJOR_VERSION 0
#define TF_MINOR_VERSION 11
#define TF_PATCH_VERSION 0rc1

// TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
// "-beta", "-rc", "-rc.1")

View File

@ -21,7 +21,7 @@ Some examples use the `pandas` library for data processing (`sudo pip install pa
* [Deep Neural Network with Customized Decay Function](iris_custom_decay_dnn.py)

## Specialized Models
* [Building a Random Forest Model](random_forest_mnist.py)
* [Building a Wide & Deep Model](wide_n_deep_tutorial.py)
* [Building a Residual Network Model](resnet.py)

View File

@ -84,7 +84,6 @@ py_test(
    args = [
        "--fake_data",
        "--max_steps=10",
    ],
    main = "fully_connected_feed.py",
    srcs_version = "PY2AND3",

View File

@ -117,7 +117,7 @@ def run_training():
  """Train MNIST for a number of steps."""
  # Get the sets of images and labels for training, validation, and
  # test on MNIST.
  data_sets = input_data.read_data_sets(FLAGS.input_data_dir, FLAGS.fake_data)

  # Tell TensorFlow that the model will be built into the default Graph.
  with tf.Graph().as_default():
@ -146,13 +146,13 @@ def run_training():
    init = tf.initialize_all_variables()

    # Create a saver for writing training checkpoints.
    saver = tf.train.Saver(write_version=tf.train.SaverDef.V2)

    # Create a session for running Ops on the Graph.
    sess = tf.Session()

    # Instantiate a SummaryWriter to output summaries and the Graph.
    summary_writer = tf.train.SummaryWriter(FLAGS.log_dir, sess.graph)

    # And then after everything is built:
@ -190,7 +190,7 @@ def run_training():
      # Save a checkpoint and evaluate the model periodically.
      if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_file = os.path.join(FLAGS.log_dir, 'model.ckpt')
        saver.save(sess, checkpoint_file, global_step=step)
        # Evaluate against the training set.
        print('Training Data Eval:')
@ -216,6 +216,9 @@ def run_training():

def main(_):
  if tf.gfile.Exists(FLAGS.log_dir):
    tf.gfile.DeleteRecursively(FLAGS.log_dir)
  tf.gfile.MakeDirs(FLAGS.log_dir)
  run_training()
@ -252,10 +255,16 @@ if __name__ == '__main__':
      help='Batch size. Must divide evenly into the dataset sizes.'
  )
  parser.add_argument(
      '--input_data_dir',
      type=str,
      default='/tmp/tensorflow/mnist/input_data',
      help='Directory to put the input data.'
  )
  parser.add_argument(
      '--log_dir',
      type=str,
      default='/tmp/tensorflow/mnist/logs/fully_connected_feed',
      help='Directory to put the log data.'
  )
  parser.add_argument(
      '--fake_data',
View File

@ -72,7 +72,7 @@ def main(_):

if __name__ == '__main__':
  parser = argparse.ArgumentParser()
  parser.add_argument('--data_dir', type=str, default='/tmp/tensorflow/mnist/input_data',
                      help='Directory for storing input data')
  FLAGS, unparsed = parser.parse_known_args()
  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)

View File

@ -137,9 +137,9 @@ def train():
  # Merge all the summaries and write them out to /tmp/mnist_logs (by default)
  merged = tf.summary.merge_all()
  train_writer = tf.train.SummaryWriter(FLAGS.log_dir + '/train',
                                        sess.graph)
  test_writer = tf.train.SummaryWriter(FLAGS.log_dir + '/test')
  tf.initialize_all_variables().run()

  # Train the model, and also write summaries.
@ -180,9 +180,9 @@ def train():

def main(_):
  if tf.gfile.Exists(FLAGS.log_dir):
    tf.gfile.DeleteRecursively(FLAGS.log_dir)
  tf.gfile.MakeDirs(FLAGS.log_dir)
  train()
@ -197,10 +197,9 @@ if __name__ == '__main__':
                      help='Initial learning rate')
  parser.add_argument('--dropout', type=float, default=0.9,
                      help='Keep probability for training dropout.')
  parser.add_argument('--data_dir', type=str, default='/tmp/tensorflow/mnist/input_data',
                      help='Directory for storing input data')
  parser.add_argument('--log_dir', type=str, default='/tmp/tensorflow/mnist/logs/mnist_with_summaries',
                      help='Summaries log directory')
  FLAGS, unparsed = parser.parse_known_args()
  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)

View File

@ -11,8 +11,8 @@ the full softmax loss.

At inference time, you can compute full softmax probabilities with the
expression `tf.nn.softmax(tf.matmul(inputs, tf.transpose(weights)) + biases)`.

See our
[Candidate Sampling Algorithms Reference](../../extras/candidate_sampling.pdf)

Also see Section 3 of [Jean et al., 2014](http://arxiv.org/abs/1412.2007)
([pdf](http://arxiv.org/pdf/1412.2007.pdf)) for the math.

View File

@ -17,7 +17,7 @@ for k in 0..in_channels-1
                        filter[di, dj, k, q]

Must have `strides[0] = strides[3] = 1`. For the most common case of the same
horizontal and vertical strides, `strides = [1, stride, stride, 1]`.

##### Args:

View File

@ -42,8 +42,7 @@ with an otherwise unused class.
where a sampled class equals one of the target classes. If set to where a sampled class equals one of the target classes. If set to
`True`, this is a "Sampled Logistic" loss instead of NCE, and we are `True`, this is a "Sampled Logistic" loss instead of NCE, and we are
learning to generate log-odds instead of log probabilities. See learning to generate log-odds instead of log probabilities. See
our [Candidate Sampling Algorithms Reference] our [Candidate Sampling Algorithms Reference](../../extras/candidate_sampling.pdf).
(../../extras/candidate_sampling.pdf).
Default is False. Default is False.
* <b>`partition_strategy`</b>: A string specifying the partitioning strategy, relevant * <b>`partition_strategy`</b>: A string specifying the partitioning strategy, relevant
if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported. if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.

View File

@ -11,8 +11,8 @@ each component is divided by the weighted, squared sum of inputs within
sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2) sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
output = input / (bias + alpha * sqr_sum) ** beta output = input / (bias + alpha * sqr_sum) ** beta
For details, see [Krizhevsky et al., ImageNet classification with deep For details, see
convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks). [Krizhevsky et al., ImageNet classification with deep convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
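A plain-NumPy sketch of the formula above may help; this is illustrative only (the library kernel is implemented in C++), and the default parameter values are assumptions:

```python
import numpy as np

def lrn(x, depth_radius=5, bias=1.0, alpha=1.0, beta=0.5):
  # x: [batch, height, width, depth]; the window is clamped at the depth edges.
  out = np.empty_like(x)
  depth = x.shape[3]
  for d in range(depth):
    lo, hi = max(0, d - depth_radius), min(depth, d + depth_radius + 1)
    sqr_sum = np.sum(x[..., lo:hi] ** 2, axis=-1)
    out[..., d] = x[..., d] / (bias + alpha * sqr_sum) ** beta
  return out
```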
##### Args: ##### Args:

View File

@ -22,7 +22,7 @@ In detail, with the default NHWC format,
filter[di, dj, q, k] filter[di, dj, q, k]
Must have `strides[0] = strides[3] = 1`. For the most common case of the same Must have `strides[0] = strides[3] = 1`. For the most common case of the same
horizontal and vertices strides, `strides = [1, stride, stride, 1]`. horizontal and vertical strides, `strides = [1, stride, stride, 1]`.
##### Args: ##### Args:

View File

@ -63,37 +63,37 @@ Then, select the correct binary to install:
```bash ```bash
# Ubuntu/Linux 64-bit, CPU only, Python 2.7 # Ubuntu/Linux 64-bit, CPU only, Python 2.7
$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc0-cp27-none-linux_x86_64.whl $ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl
# Ubuntu/Linux 64-bit, GPU enabled, Python 2.7 # Ubuntu/Linux 64-bit, GPU enabled, Python 2.7
# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Install from sources" below. # Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Installing from sources" below.
$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc0-cp27-none-linux_x86_64.whl $ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl
# Mac OS X, CPU only, Python 2.7: # Mac OS X, CPU only, Python 2.7:
$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc0-py2-none-any.whl $ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc1-py2-none-any.whl
# Mac OS X, GPU enabled, Python 2.7: # Mac OS X, GPU enabled, Python 2.7:
$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc0-py2-none-any.whl $ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc1-py2-none-any.whl
# Ubuntu/Linux 64-bit, CPU only, Python 3.4 # Ubuntu/Linux 64-bit, CPU only, Python 3.4
$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc0-cp34-cp34m-linux_x86_64.whl $ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc1-cp34-cp34m-linux_x86_64.whl
# Ubuntu/Linux 64-bit, GPU enabled, Python 3.4 # Ubuntu/Linux 64-bit, GPU enabled, Python 3.4
# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Install from sources" below. # Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Installing from sources" below.
$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc0-cp34-cp34m-linux_x86_64.whl $ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc1-cp34-cp34m-linux_x86_64.whl
# Ubuntu/Linux 64-bit, CPU only, Python 3.5 # Ubuntu/Linux 64-bit, CPU only, Python 3.5
$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc0-cp35-cp35m-linux_x86_64.whl $ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc1-cp35-cp35m-linux_x86_64.whl
# Ubuntu/Linux 64-bit, GPU enabled, Python 3.5 # Ubuntu/Linux 64-bit, GPU enabled, Python 3.5
# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Install from sources" below. # Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Installing from sources" below.
$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc0-cp35-cp35m-linux_x86_64.whl $ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc1-cp35-cp35m-linux_x86_64.whl
# Mac OS X, CPU only, Python 3.4 or 3.5: # Mac OS X, CPU only, Python 3.4 or 3.5:
$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc0-py3-none-any.whl $ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc1-py3-none-any.whl
# Mac OS X, GPU enabled, Python 3.4 or 3.5: # Mac OS X, GPU enabled, Python 3.4 or 3.5:
$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc0-py3-none-any.whl $ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc1-py3-none-any.whl
``` ```
Install TensorFlow: Install TensorFlow:
@ -159,37 +159,37 @@ Now, install TensorFlow just as you would for a regular Pip installation. First
```bash ```bash
# Ubuntu/Linux 64-bit, CPU only, Python 2.7 # Ubuntu/Linux 64-bit, CPU only, Python 2.7
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc0-cp27-none-linux_x86_64.whl (tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl
# Ubuntu/Linux 64-bit, GPU enabled, Python 2.7 # Ubuntu/Linux 64-bit, GPU enabled, Python 2.7
# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Install from sources" below. # Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Installing from sources" below.
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc0-cp27-none-linux_x86_64.whl (tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl
# Mac OS X, CPU only, Python 2.7: # Mac OS X, CPU only, Python 2.7:
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc0-py2-none-any.whl (tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc1-py2-none-any.whl
# Mac OS X, GPU enabled, Python 2.7: # Mac OS X, GPU enabled, Python 2.7:
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc0-py2-none-any.whl (tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc1-py2-none-any.whl
# Ubuntu/Linux 64-bit, CPU only, Python 3.4 # Ubuntu/Linux 64-bit, CPU only, Python 3.4
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc0-cp34-cp34m-linux_x86_64.whl (tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc1-cp34-cp34m-linux_x86_64.whl
# Ubuntu/Linux 64-bit, GPU enabled, Python 3.4 # Ubuntu/Linux 64-bit, GPU enabled, Python 3.4
# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Install from sources" below. # Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Installing from sources" below.
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc0-cp34-cp34m-linux_x86_64.whl (tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc1-cp34-cp34m-linux_x86_64.whl
# Ubuntu/Linux 64-bit, CPU only, Python 3.5 # Ubuntu/Linux 64-bit, CPU only, Python 3.5
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc0-cp35-cp35m-linux_x86_64.whl (tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc1-cp35-cp35m-linux_x86_64.whl
# Ubuntu/Linux 64-bit, GPU enabled, Python 3.5 # Ubuntu/Linux 64-bit, GPU enabled, Python 3.5
# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Install from sources" below. # Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Installing from sources" below.
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc0-cp35-cp35m-linux_x86_64.whl (tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc1-cp35-cp35m-linux_x86_64.whl
# Mac OS X, CPU only, Python 3.4 or 3.5: # Mac OS X, CPU only, Python 3.4 or 3.5:
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc0-py3-none-any.whl (tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc1-py3-none-any.whl
# Mac OS X, GPU enabled, Python 3.4 or 3.5: # Mac OS X, GPU enabled, Python 3.4 or 3.5:
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc0-py3-none-any.whl (tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc1-py3-none-any.whl
``` ```
Finally install TensorFlow: Finally install TensorFlow:
@ -298,37 +298,37 @@ select the correct binary to install:
```bash ```bash
# Ubuntu/Linux 64-bit, CPU only, Python 2.7 # Ubuntu/Linux 64-bit, CPU only, Python 2.7
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc0-cp27-none-linux_x86_64.whl (tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl
# Ubuntu/Linux 64-bit, GPU enabled, Python 2.7 # Ubuntu/Linux 64-bit, GPU enabled, Python 2.7
# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Install from sources" below. # Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Installing from sources" below.
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc0-cp27-none-linux_x86_64.whl (tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl
# Mac OS X, CPU only, Python 2.7: # Mac OS X, CPU only, Python 2.7:
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc0-py2-none-any.whl (tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc1-py2-none-any.whl
# Mac OS X, GPU enabled, Python 2.7: # Mac OS X, GPU enabled, Python 2.7:
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc0-py2-none-any.whl (tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc1-py2-none-any.whl
# Ubuntu/Linux 64-bit, CPU only, Python 3.4 # Ubuntu/Linux 64-bit, CPU only, Python 3.4
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc0-cp34-cp34m-linux_x86_64.whl (tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc1-cp34-cp34m-linux_x86_64.whl
# Ubuntu/Linux 64-bit, GPU enabled, Python 3.4 # Ubuntu/Linux 64-bit, GPU enabled, Python 3.4
# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Install from sources" below. # Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Installing from sources" below.
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc0-cp34-cp34m-linux_x86_64.whl (tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc1-cp34-cp34m-linux_x86_64.whl
# Ubuntu/Linux 64-bit, CPU only, Python 3.5 # Ubuntu/Linux 64-bit, CPU only, Python 3.5
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc0-cp35-cp35m-linux_x86_64.whl (tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0rc1-cp35-cp35m-linux_x86_64.whl
# Ubuntu/Linux 64-bit, GPU enabled, Python 3.5 # Ubuntu/Linux 64-bit, GPU enabled, Python 3.5
# Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Install from sources" below. # Requires CUDA toolkit 7.5 and CuDNN v5. For other versions, see "Installing from sources" below.
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc0-cp35-cp35m-linux_x86_64.whl (tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.11.0rc1-cp35-cp35m-linux_x86_64.whl
# Mac OS X, CPU only, Python 3.4 or 3.5: # Mac OS X, CPU only, Python 3.4 or 3.5:
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc0-py3-none-any.whl (tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc1-py3-none-any.whl
# Mac OS X, GPU enabled, Python 3.4 or 3.5: # Mac OS X, GPU enabled, Python 3.4 or 3.5:
(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc0-py3-none-any.whl (tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc1-py3-none-any.whl
``` ```
Finally install TensorFlow: Finally install TensorFlow:
@ -396,13 +396,13 @@ code.
code. code.
We also have tags with `latest` replaced by a released version (e.g., We also have tags with `latest` replaced by a released version (e.g.,
`0.11.0-gpu`). `0.11.0rc1-gpu`).
With Docker the installation is as follows: With Docker the installation is as follows:
* Install Docker on your machine. * Install Docker on your machine.
* Create a [Docker * Create a [Docker
group](http://docs.docker.com/engine/installation/ubuntulinux/#create-a-docker-group) group](https://docs.docker.com/engine/installation/linux/ubuntulinux/#/create-a-docker-group)
to allow launching containers without `sudo`. to allow launching containers without `sudo`.
* Launch a Docker container with the TensorFlow image. The image * Launch a Docker container with the TensorFlow image. The image
gets downloaded automatically on first launch. gets downloaded automatically on first launch.
@ -780,7 +780,7 @@ $ bazel build -c opt --config=cuda //tensorflow/tools/pip_package:build_pip_pack
$ bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg $ bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg
# The name of the .whl file will depend on your platform. # The name of the .whl file will depend on your platform.
$ sudo pip install /tmp/tensorflow_pkg/tensorflow-0.11.0rc0-py2-none-any.whl $ sudo pip install /tmp/tensorflow_pkg/tensorflow-0.11.0rc1-py2-none-any.whl
``` ```
## Setting up TensorFlow for Development ## Setting up TensorFlow for Development

View File

@ -222,12 +222,12 @@ To define a feature column for a categorical feature, we can create a
feature values of a column and there are only a few of them, you can use feature values of a column and there are only a few of them, you can use
`sparse_column_with_keys`. Each key in the list will get assigned an `sparse_column_with_keys`. Each key in the list will get assigned an
auto-incremental ID starting from 0. For example, for the `gender` column we can auto-incremental ID starting from 0. For example, for the `gender` column we can
assign the feature string "female" to an integer ID of 0 and "male" to 1 by assign the feature string "Female" to an integer ID of 0 and "Male" to 1 by
doing: doing:
```python ```python
gender = tf.contrib.layers.sparse_column_with_keys( gender = tf.contrib.layers.sparse_column_with_keys(
column_name="gender", keys=["female", "male"]) column_name="gender", keys=["Female", "Male"])
``` ```
What if we don't know the set of possible values in advance? Not a problem. We What if we don't know the set of possible values in advance? Not a problem. We

View File

@ -16,7 +16,8 @@ large-scale regression and classification problems with sparse input features
you're interested in learning more about how Wide & Deep Learning works, please you're interested in learning more about how Wide & Deep Learning works, please
check out our [research paper](http://arxiv.org/abs/1606.07792). check out our [research paper](http://arxiv.org/abs/1606.07792).
![Wide & Deep Spectrum of Models](../../images/wide_n_deep.svg "Wide & Deep") ![Wide & Deep Spectrum of Models]
(../../images/wide_n_deep.svg "Wide & Deep")
The figure above shows a comparison of a wide model (logistic regression with The figure above shows a comparison of a wide model (logistic regression with
sparse features and transformations), a deep model (feed-forward neural network sparse features and transformations), a deep model (feed-forward neural network
@ -85,7 +86,9 @@ part and the deep part of the model.
import tensorflow as tf import tensorflow as tf
# Categorical base columns. # Categorical base columns.
gender = tf.contrib.layers.sparse_column_with_keys(column_name="gender", keys=["female", "male"]) gender = tf.contrib.layers.sparse_column_with_keys(column_name="gender", keys=["Female", "Male"])
race = tf.contrib.layers.sparse_column_with_keys(column_name="race", keys=[
"Amer-Indian-Eskimo", "Asian-Pac-Islander", "Black", "Other", "White"])
education = tf.contrib.layers.sparse_column_with_hash_bucket("education", hash_bucket_size=1000) education = tf.contrib.layers.sparse_column_with_hash_bucket("education", hash_bucket_size=1000)
relationship = tf.contrib.layers.sparse_column_with_hash_bucket("relationship", hash_bucket_size=100) relationship = tf.contrib.layers.sparse_column_with_hash_bucket("relationship", hash_bucket_size=100)
workclass = tf.contrib.layers.sparse_column_with_hash_bucket("workclass", hash_bucket_size=100) workclass = tf.contrib.layers.sparse_column_with_hash_bucket("workclass", hash_bucket_size=100)

View File

@ -391,4 +391,5 @@ def maybe_download_and_extract():
print() print()
statinfo = os.stat(filepath) statinfo = os.stat(filepath)
print('Successfully downloaded', filename, statinfo.st_size, 'bytes.') print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')
tarfile.open(filepath, 'r:gz').extractall(dest_directory)
tarfile.open(filepath, 'r:gz').extractall(dest_directory)

View File

@ -339,7 +339,7 @@ def main(_):
tf.scalar_summary("Validation Loss", mvalid.cost) tf.scalar_summary("Validation Loss", mvalid.cost)
with tf.name_scope("Test"): with tf.name_scope("Test"):
test_input = PTBInput(config=config, data=test_data, name="TestInput") test_input = PTBInput(config=eval_config, data=test_data, name="TestInput")
with tf.variable_scope("Model", reuse=True, initializer=initializer): with tf.variable_scope("Model", reuse=True, initializer=initializer):
mtest = PTBModel(is_training=False, config=eval_config, mtest = PTBModel(is_training=False, config=eval_config,
input_=test_input) input_=test_input)
@ -347,7 +347,7 @@ def main(_):
sv = tf.train.Supervisor(logdir=FLAGS.save_path) sv = tf.train.Supervisor(logdir=FLAGS.save_path)
with sv.managed_session() as session: with sv.managed_session() as session:
for i in range(config.max_max_epoch): for i in range(config.max_max_epoch):
lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0) lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)
m.assign_lr(session, config.learning_rate * lr_decay) m.assign_lr(session, config.learning_rate * lr_decay)
print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr))) print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))

View File

@ -213,7 +213,7 @@ tf_py_test(
additional_deps = ["//tensorflow:tensorflow_py"], additional_deps = ["//tensorflow:tensorflow_py"],
) )
tf_py_test( cuda_py_test(
name = "matrix_triangular_solve_op_test", name = "matrix_triangular_solve_op_test",
size = "small", size = "small",
srcs = ["matrix_triangular_solve_op_test.py"], srcs = ["matrix_triangular_solve_op_test.py"],

View File

@ -21,6 +21,7 @@ from __future__ import print_function
import numpy as np import numpy as np
from six.moves import xrange # pylint: disable=redefined-builtin from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf import tensorflow as tf
from tensorflow.python.client import device_lib
class Conv2DTransposeTest(tf.test.TestCase): class Conv2DTransposeTest(tf.test.TestCase):
@ -157,6 +158,119 @@ class Conv2DTransposeTest(tf.test.TestCase):
err_tolerance = 0.0005 err_tolerance = 0.0005
self.assertLess(err, err_tolerance) self.assertLess(err, err_tolerance)
def testConv2DTransposeSingleStrideNCHW(self):
# `NCHW` data format is only supported on `GPU` devices.
if tf.test.is_gpu_available():
with self.test_session(use_gpu=True):
strides = [1, 1, 1, 1]
# Input, output: [batch, depth, height, width]
x_shape = [2, 3, 6, 4]
y_shape = [2, 2, 6, 4]
# Filter: [kernel_height, kernel_width, output_depth, input_depth]
f_shape = [3, 3, 2, 3]
x = tf.constant(1.0, shape=x_shape, name="x", dtype=tf.float32)
f = tf.constant(1.0, shape=f_shape, name="filter", dtype=tf.float32)
output = tf.nn.conv2d_transpose(x, f, y_shape, strides=strides,
padding="SAME", data_format='NCHW')
value = output.eval()
for n in xrange(x_shape[0]):
for k in xrange(f_shape[2]):
for w in xrange(y_shape[3]):
for h in xrange(y_shape[2]):
target = 4 * 3.0
h_in = h > 0 and h < y_shape[2] - 1
w_in = w > 0 and w < y_shape[3] - 1
if h_in and w_in:
target += 5 * 3.0
elif h_in or w_in:
target += 2 * 3.0
self.assertAllClose(target, value[n, k, h, w])
def testConv2DTransposeSameNCHW(self):
# `NCHW` data format is only supported on `GPU` devices.
if tf.test.is_gpu_available():
with self.test_session(use_gpu=True):
strides = [1, 1, 2, 2]
# Input, output: [batch, depth, height, width]
x_shape = [2, 3, 6, 4]
y_shape = [2, 2, 12, 8]
# Filter: [kernel_height, kernel_width, output_depth, input_depth]
f_shape = [3, 3, 2, 3]
x = tf.constant(1.0, shape=x_shape, name="x", dtype=tf.float32)
f = tf.constant(1.0, shape=f_shape, name="filter", dtype=tf.float32)
output = tf.nn.conv2d_transpose(x, f, y_shape, strides=strides,
padding="SAME", data_format='NCHW')
value = output.eval()
for n in xrange(x_shape[0]):
for k in xrange(f_shape[2]):
for w in xrange(y_shape[3]):
for h in xrange(y_shape[2]):
target = 3.0
# We add a case for locations divisible by the stride.
h_in = h % strides[2] == 0 and h > 0 and h < y_shape[2] - 1
w_in = w % strides[3] == 0 and w > 0 and w < y_shape[3] - 1
if h_in and w_in:
target += 9.0
elif h_in or w_in:
target += 3.0
self.assertAllClose(target, value[n, k, h, w])
def testConv2DTransposeValidNCHW(self):
# `NCHW` data format is only supported on `GPU` devices.
if tf.test.is_gpu_available():
with self.test_session(use_gpu=True):
strides = [1, 1, 2, 2]
# Input, output: [batch, depth, height, width]
x_shape = [2, 3, 6, 4]
y_shape = [2, 2, 13, 9]
# Filter: [kernel_height, kernel_width, output_depth, input_depth]
f_shape = [3, 3, 2, 3]
x = tf.constant(1.0, shape=x_shape, name="x", dtype=tf.float32)
f = tf.constant(1.0, shape=f_shape, name="filter", dtype=tf.float32)
output = tf.nn.conv2d_transpose(x, f, y_shape, strides=strides,
padding="VALID", data_format='NCHW')
value = output.eval()
cache_values = np.zeros(y_shape, dtype=np.float32)
# The amount of padding added
pad = 1
for n in xrange(x_shape[0]):
for k in xrange(f_shape[2]):
for w in xrange(pad, y_shape[3] - pad):
for h in xrange(pad, y_shape[2] - pad):
target = 3.0
# We add a case for locations divisible by the stride.
h_in = h % strides[2] == 0 and h > pad and h < y_shape[2] - 1 - pad
w_in = w % strides[3] == 0 and w > pad and w < y_shape[3] - 1 - pad
if h_in and w_in:
target += 9.0
elif h_in or w_in:
target += 3.0
cache_values[n, k, h, w] = target
# copy values in the border
cache_values[n, k, :, 0] = cache_values[n, k, :, 1]
cache_values[n, k, :, -1] = cache_values[n, k, :, -2]
cache_values[n, k, 0, :] = cache_values[n, k, 1, :]
cache_values[n, k, -1, :] = cache_values[n, k, -2, :]
self.assertAllClose(cache_values, value)
if __name__ == "__main__": if __name__ == "__main__":
tf.test.main() tf.test.main()

View File

@ -1356,6 +1356,18 @@ class SelectOpTest(tf.test.TestCase):
elif x.dtype == np.float64: elif x.dtype == np.float64:
self.assertAllClose(jacob_t, jacob_n, rtol=1e-5, atol=1e-5) self.assertAllClose(jacob_t, jacob_n, rtol=1e-5, atol=1e-5)
def testScalar(self):
c = True
x = np.random.rand(1, 3, 2) * 100
y = np.random.rand(1, 3, 2) * 100
for t in [np.float16, np.float32, np.float64, np.int32, np.int64,
np.complex64, np.complex128]:
xt = x.astype(t)
yt = y.astype(t)
self._compare(c, xt, yt, use_gpu=False)
if t in [np.float16, np.float32, np.float64]:
self._compare(c, xt, yt, use_gpu=True)
def testBasic(self): def testBasic(self):
c = np.random.randint(0, 2, 6).astype(np.bool).reshape(1, 3, 2) c = np.random.randint(0, 2, 6).astype(np.bool).reshape(1, 3, 2)
x = np.random.rand(1, 3, 2) * 100 x = np.random.rand(1, 3, 2) * 100

View File

@ -24,15 +24,17 @@ import tensorflow as tf
class MatrixTriangularSolveOpTest(tf.test.TestCase): class MatrixTriangularSolveOpTest(tf.test.TestCase):
def _verifySolveAllWays(self, x, y, batch_dims=None): def _verifySolveAllWays(self, x, y, batch_dims=None):
for lower in True, False: for use_gpu in True, False:
for adjoint in True, False: for lower in True, False:
self._verifySolve(x, for adjoint in True, False:
y, self._verifySolve(x,
lower=lower, y,
adjoint=adjoint, lower=lower,
batch_dims=batch_dims) adjoint=adjoint,
batch_dims=batch_dims,
use_gpu=use_gpu)
def _verifySolve(self, x, y, lower=True, adjoint=False, batch_dims=None): def _verifySolve(self, x, y, lower=True, adjoint=False, batch_dims=None, use_gpu=False):
for np_type in [np.float32, np.float64]: for np_type in [np.float32, np.float64]:
a = x.astype(np_type) a = x.astype(np_type)
b = y.astype(np_type) b = y.astype(np_type)
@ -52,7 +54,7 @@ class MatrixTriangularSolveOpTest(tf.test.TestCase):
a_np = np.tile(a_np, batch_dims + [1, 1]) a_np = np.tile(a_np, batch_dims + [1, 1])
b = np.tile(b, batch_dims + [1, 1]) b = np.tile(b, batch_dims + [1, 1])
with self.test_session(): with self.test_session(use_gpu=use_gpu):
tf_ans = tf.matrix_triangular_solve(a, b, lower=lower, adjoint=adjoint) tf_ans = tf.matrix_triangular_solve(a, b, lower=lower, adjoint=adjoint)
out = tf_ans.eval() out = tf_ans.eval()
np_ans = np.linalg.solve(a_np, b) np_ans = np.linalg.solve(a_np, b)

View File

@ -264,6 +264,42 @@ class EluTest(tf.test.TestCase):
print("elu (float64) gradient err = ", err) print("elu (float64) gradient err = ", err)
self.assertLess(err, 1e-6) self.assertLess(err, 1e-6)
def testGradGradFloat32(self):
with self.test_session():
x = tf.constant(
[-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
shape=[2, 5], name="x")
y = tf.nn.elu(x, name="elu")
z = tf.gradients(y, x)
x_init = np.asarray(
[[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
dtype=np.float32, order="F")
err = tf.test.compute_gradient_error(x,
[2, 5],
z[0],
[2, 5],
x_init_value=x_init)
print("elu (float32) gradient of gradient err = ", err)
self.assertLess(err, 1e-4)
def testGradGradFloat64(self):
with self.test_session():
x = tf.constant(
[-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
shape=[2, 5], dtype=tf.float64, name="x")
y = tf.nn.elu(x, name="elu")
z = tf.gradients(y, x)
x_init = np.asarray(
[[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
dtype=np.float64, order="F")
err = tf.test.compute_gradient_error(x,
[2, 5],
z[0],
[2, 5],
x_init_value=x_init)
print("elu (float64) gradient of gradient err = ", err)
self.assertLess(err, 1e-6)
if __name__ == "__main__": if __name__ == "__main__":
tf.test.main() tf.test.main()

View File

@ -1795,7 +1795,7 @@ def cumprod(x, axis=0, exclusive=False, reverse=False, name=None):
performed performed
instead: instead:
```prettyprint ```prettyprint
tf.cumprod([a, b, c], exclusive=True) ==> [0, a, a * b] tf.cumprod([a, b, c], exclusive=True) ==> [1, a, a * b]
``` ```
By setting the `reverse` kwarg to `True`, the cumprod is performed in the By setting the `reverse` kwarg to `True`, the cumprod is performed in the
@ -1807,7 +1807,7 @@ def cumprod(x, axis=0, exclusive=False, reverse=False, name=None):
The `reverse` and `exclusive` kwargs can also be combined: The `reverse` and `exclusive` kwargs can also be combined:
```prettyprint ```prettyprint
tf.cumprod([a, b, c], exclusive=True, reverse=True) ==> [b * c, c, 0] tf.cumprod([a, b, c], exclusive=True, reverse=True) ==> [b * c, c, 1]
``` ```
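With concrete numbers, an illustrative check of the corrected identities above:

```python
import tensorflow as tf

with tf.Session() as sess:
  x = tf.constant([2., 3., 5.])
  print(sess.run(tf.cumprod(x)))                                # [2. 6. 30.]
  print(sess.run(tf.cumprod(x, exclusive=True)))                # [1. 2. 6.]
  print(sess.run(tf.cumprod(x, exclusive=True, reverse=True)))  # [15. 5. 1.]
```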
Args: Args:

View File

@ -25,7 +25,7 @@ from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_ops from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import sparse_ops from tensorflow.python.ops import sparse_ops
from tensorflow.python.ops import gen_nn_ops from tensorflow.python.ops import gen_nn_ops
from tensorflow.python.ops import gen_math_ops
@ops.RegisterGradient("Conv2DBackpropInput") @ops.RegisterGradient("Conv2DBackpropInput")
def _Conv2DBackpropInputGrad(op, grad): def _Conv2DBackpropInputGrad(op, grad):
@ -268,6 +268,14 @@ def _ReluGrad(op, grad):
return gen_nn_ops._relu_grad(grad, op.outputs[0]) return gen_nn_ops._relu_grad(grad, op.outputs[0])
@ops.RegisterGradient("EluGrad")
def _EluGradGrad(op, grad):
x = op.inputs[1]
return (gen_nn_ops._elu_grad(grad, op.outputs[0]),
gen_math_ops.select(x < 0., gen_nn_ops._elu_grad(grad, op.outputs[0] + 1),
array_ops.zeros(shape=array_ops.shape(x), dtype=x.dtype)))
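For reference, the calculus this gradient-of-gradient encodes, as a NumPy sketch rather than the registered op:

```python
import numpy as np

# elu(x)   = x if x > 0 else exp(x) - 1
# elu'(x)  = 1 if x > 0 else exp(x)  (= elu(x) + 1)
# elu''(x) = 0 if x > 0 else exp(x), which is why the select() above
# zeroes the second term wherever x >= 0.
x = np.linspace(-2., 2., 9)
elu = np.where(x > 0, x, np.exp(x) - 1)
d_elu = np.where(x > 0, 1., elu + 1)    # first derivative
dd_elu = np.where(x > 0, 0., elu + 1)   # second derivative
```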
@ops.RegisterGradient("Relu6") @ops.RegisterGradient("Relu6")
def _Relu6Grad(op, grad): def _Relu6Grad(op, grad):
return gen_nn_ops._relu6_grad(grad, op.inputs[0]) return gen_nn_ops._relu6_grad(grad, op.inputs[0])

View File

@ -1010,6 +1010,7 @@ def conv2d_transpose(value,
output_shape, output_shape,
strides, strides,
padding="SAME", padding="SAME",
data_format="NHWC",
name=None): name=None):
"""The transpose of `conv2d`. """The transpose of `conv2d`.
@ -1020,7 +1021,8 @@ def conv2d_transpose(value,
Args: Args:
value: A 4-D `Tensor` of type `float` and shape value: A 4-D `Tensor` of type `float` and shape
`[batch, height, width, in_channels]`. `[batch, height, width, in_channels]` for `NHWC` data format or
`[batch, in_channels, height, width]` for `NCHW` data format.
filter: A 4-D `Tensor` with the same type as `value` and shape filter: A 4-D `Tensor` with the same type as `value` and shape
`[height, width, output_channels, in_channels]`. `filter`'s `[height, width, output_channels, in_channels]`. `filter`'s
`in_channels` dimension must match that of `value`. `in_channels` dimension must match that of `value`.
@ -1030,6 +1032,7 @@ def conv2d_transpose(value,
dimension of the input tensor. dimension of the input tensor.
padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm. padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
See the [comment here](https://www.tensorflow.org/api_docs/python/nn.html#convolution) See the [comment here](https://www.tensorflow.org/api_docs/python/nn.html#convolution)
data_format: A string. 'NHWC' and 'NCHW' are supported.
name: Optional name for the returned tensor. name: Optional name for the returned tensor.
Returns: Returns:
@ -1041,9 +1044,12 @@ def conv2d_transpose(value,
""" """
with ops.name_scope(name, "conv2d_transpose", with ops.name_scope(name, "conv2d_transpose",
[value, filter, output_shape]) as name: [value, filter, output_shape]) as name:
if data_format not in ("NCHW", "NHWC"):
raise ValueError("data_format has to be either NCHW or NHWC.")
value = ops.convert_to_tensor(value, name="value") value = ops.convert_to_tensor(value, name="value")
filter = ops.convert_to_tensor(filter, name="filter") filter = ops.convert_to_tensor(filter, name="filter")
if not value.get_shape()[3].is_compatible_with(filter.get_shape()[3]): axis = 3 if data_format=="NHWC" else 1
if not value.get_shape()[axis].is_compatible_with(filter.get_shape()[3]):
raise ValueError("input channels does not match filter's input channels, " raise ValueError("input channels does not match filter's input channels, "
"{} != {}".format(value.get_shape()[3], filter.get_shape( "{} != {}".format(value.get_shape()[3], filter.get_shape(
)[3])) )[3]))
@ -1055,10 +1061,10 @@ def conv2d_transpose(value,
if isinstance(output_shape, (list, np.ndarray)): if isinstance(output_shape, (list, np.ndarray)):
# output_shape's shape should be == [4] if reached this point. # output_shape's shape should be == [4] if reached this point.
if not filter.get_shape()[2].is_compatible_with(output_shape[3]): if not filter.get_shape()[2].is_compatible_with(output_shape[axis]):
raise ValueError( raise ValueError(
"output_shape does not match filter's output channels, " "output_shape does not match filter's output channels, "
"{} != {}".format(output_shape[3], filter.get_shape()[2])) "{} != {}".format(output_shape[axis], filter.get_shape()[2]))
if padding != "VALID" and padding != "SAME": if padding != "VALID" and padding != "SAME":
raise ValueError("padding must be either VALID or SAME:" raise ValueError("padding must be either VALID or SAME:"
@ -1069,6 +1075,7 @@ def conv2d_transpose(value,
out_backprop=value, out_backprop=value,
strides=strides, strides=strides,
padding=padding, padding=padding,
data_format=data_format,
name=name) name=name)
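An illustrative use of the new argument, with shapes borrowed from the NCHW tests added above (NCHW currently assumes a GPU device):

```python
import tensorflow as tf

x = tf.ones([2, 3, 6, 4])    # NCHW: [batch, in_channels, height, width]
f = tf.ones([3, 3, 2, 3])    # [height, width, output_channels, in_channels]
y = tf.nn.conv2d_transpose(x, f, output_shape=[2, 2, 12, 8],
                           strides=[1, 1, 2, 2], padding="SAME",
                           data_format="NCHW")
```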

View File

@ -68,7 +68,7 @@ def exponential_decay(learning_rate, global_step, decay_steps, decay_rate,
Must be positive. See the decay computation above. Must be positive. See the decay computation above.
decay_rate: A scalar `float32` or `float64` `Tensor` or a decay_rate: A scalar `float32` or `float64` `Tensor` or a
Python number. The decay rate. Python number. The decay rate.
staircase: Boolean. It `True` decay the learning rate at discrete intervals staircase: Boolean. If `True` decay the learning rate at discrete intervals
name: String. Optional name of the operation. Defaults to name: String. Optional name of the operation. Defaults to
'ExponentialDecay'. 'ExponentialDecay'.
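In sketch form, the decay computation those arguments feed is:

```python
# Documented behavior, not the library source:
decayed_learning_rate = learning_rate * decay_rate ** (global_step / decay_steps)
# With staircase=True, global_step / decay_steps is an integer division,
# so the learning rate decays in discrete intervals rather than smoothly.
```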

View File

@ -15,7 +15,10 @@ limitations under the License.
#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
#if !defined(PLATFORM_WINDOWS)
#include <dirent.h> #include <dirent.h>
#endif
#include <limits.h> #include <limits.h>
#include <stddef.h> #include <stddef.h>
#include <stdio.h> #include <stdio.h>
@ -25,11 +28,13 @@ limitations under the License.
#include <IOKit/kext/KextManager.h> #include <IOKit/kext/KextManager.h>
#include <mach-o/dyld.h> #include <mach-o/dyld.h>
#else #else
#if !defined(PLATFORM_WINDOWS)
#include <link.h> #include <link.h>
#include <sys/stat.h>
#include <sys/sysmacros.h> #include <sys/sysmacros.h>
#endif
#include <unistd.h> #include <unistd.h>
#endif
#include <sys/stat.h>
#endif
#include <algorithm> #include <algorithm>
#include <memory> #include <memory>
#include <vector> #include <vector>
@ -135,7 +140,7 @@ void Diagnostician::LogDiagnosticInformation() {
<< "(" << port::Hostname() << ")"; << "(" << port::Hostname() << ")";
} }
CFRelease(kext_infos); CFRelease(kext_infos);
#else #elif !defined(PLATFORM_WINDOWS)
if (access(kDriverVersionPath, F_OK) != 0) { if (access(kDriverVersionPath, F_OK) != 0) {
LOG(INFO) << "kernel driver does not appear to be running on this host " LOG(INFO) << "kernel driver does not appear to be running on this host "
<< "(" << port::Hostname() << "): " << "(" << port::Hostname() << "): "
@ -158,7 +163,7 @@ void Diagnostician::LogDiagnosticInformation() {
/* static */ void Diagnostician::LogDriverVersionInformation() { /* static */ void Diagnostician::LogDriverVersionInformation() {
LOG(INFO) << "hostname: " << port::Hostname(); LOG(INFO) << "hostname: " << port::Hostname();
#ifndef PLATFORM_WINDOWS
if (VLOG_IS_ON(1)) { if (VLOG_IS_ON(1)) {
const char *value = getenv("LD_LIBRARY_PATH"); const char *value = getenv("LD_LIBRARY_PATH");
string library_path = value == nullptr ? "" : value; string library_path = value == nullptr ? "" : value;
@ -180,17 +185,17 @@ void Diagnostician::LogDiagnosticInformation() {
closedir(dir); closedir(dir);
} }
} }
port::StatusOr<DriverVersion> dso_version = FindDsoVersion(); port::StatusOr<DriverVersion> dso_version = FindDsoVersion();
LOG(INFO) << "libcuda reported version is: " LOG(INFO) << "libcuda reported version is: "
<< DriverVersionStatusToString(dso_version); << DriverVersionStatusToString(dso_version);
port::StatusOr<DriverVersion> kernel_version = FindKernelDriverVersion(); port::StatusOr<DriverVersion> kernel_version = FindKernelDriverVersion();
LOG(INFO) << "kernel reported version is: " LOG(INFO) << "kernel reported version is: "
<< DriverVersionStatusToString(kernel_version); << DriverVersionStatusToString(kernel_version);
#endif
// OS X kernel driver does not report version accurately // OS X kernel driver does not report version accurately
#if !defined(__APPLE__) #if !defined(__APPLE__) && !defined(PLATFORM_WINDOWS)
if (kernel_version.ok() && dso_version.ok()) { if (kernel_version.ok() && dso_version.ok()) {
WarnOnDsoKernelMismatch(dso_version, kernel_version); WarnOnDsoKernelMismatch(dso_version, kernel_version);
} }
@ -227,6 +232,7 @@ port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
result = StringToDriverVersion(version); result = StringToDriverVersion(version);
} }
#else #else
#if !defined(PLATFORM_WINDOWS)
// Callback used when iterating through DSOs. Looks for the driver-interfacing // Callback used when iterating through DSOs. Looks for the driver-interfacing
// DSO and yields its version number into the callback data, when found. // DSO and yields its version number into the callback data, when found.
auto iterate_phdr = auto iterate_phdr =
@ -258,6 +264,7 @@ port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
}; };
dl_iterate_phdr(iterate_phdr, &result); dl_iterate_phdr(iterate_phdr, &result);
#endif
#endif #endif
return result; return result;

View File

@ -3200,6 +3200,7 @@ bool CudnnSupport::DoNormalize(
Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor, Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor,
const DeviceMemory<float>& input_data, DeviceMemory<float>* output_data) { const DeviceMemory<float>& input_data, DeviceMemory<float>* output_data) {
LOG(FATAL) << "not yet implemented"; // TODO(leary) LOG(FATAL) << "not yet implemented"; // TODO(leary)
return false;
} }
bool CudnnSupport::DoNormalizeWithDimensions( bool CudnnSupport::DoNormalizeWithDimensions(

View File

@ -19,8 +19,8 @@ limitations under the License.
#include <stdint.h> #include <stdint.h>
#include <stdlib.h> #include <stdlib.h>
#include <set> #include <set>
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
#include "tensorflow/stream_executor/dso_loader.h" #include "tensorflow/stream_executor/dso_loader.h"
#include "tensorflow/stream_executor/lib/casts.h" #include "tensorflow/stream_executor/lib/casts.h"
@ -38,6 +38,14 @@ limitations under the License.
#include "tensorflow/stream_executor/platform/port.h" #include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/lib/inlined_vector.h" #include "tensorflow/stream_executor/lib/inlined_vector.h"
#if defined(PLATFORM_WINDOWS)
// TODO: on Windows, ARRAYSIZE is defined in winnt.h, but including it
// here creates a conflict with cuda.h - for now define it here.
#define ARRAYSIZE(a) \
((sizeof(a) / sizeof(*(a))) / \
static_cast<size_t>(!(sizeof(a) % sizeof(*(a)))))
#endif
bool FLAGS_gpuexec_cuda_driver_inject_init_error = false; bool FLAGS_gpuexec_cuda_driver_inject_init_error = false;
bool FLAGS_gpuexec_cuda_sync_around_driver_calls = false; bool FLAGS_gpuexec_cuda_sync_around_driver_calls = false;
bool FLAGS_gpuexec_cuda_device_0_only = false; bool FLAGS_gpuexec_cuda_device_0_only = false;

View File

@ -18,8 +18,12 @@ limitations under the License.
#if defined(__APPLE__) #if defined(__APPLE__)
#include <mach-o/dyld.h> #include <mach-o/dyld.h>
#endif #endif
#if defined(PLATFORM_WINDOWS)
#include <windows.h>
#define PATH_MAX MAX_PATH
#else
#include <unistd.h> #include <unistd.h>
#endif
#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
#include "tensorflow/stream_executor/cuda/cuda_driver.h" #include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/cuda/cuda_event.h" #include "tensorflow/stream_executor/cuda/cuda_event.h"
@ -204,7 +208,12 @@ static string GetBinaryDir(bool strip_exe) {
_NSGetExecutablePath(unresolved_path, &buffer_size); _NSGetExecutablePath(unresolved_path, &buffer_size);
CHECK_ERR(realpath(unresolved_path, exe_path) ? 1 : -1); CHECK_ERR(realpath(unresolved_path, exe_path) ? 1 : -1);
#else #else
CHECK_ERR(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1)); #if defined(PLATFORM_WINDOWS)
HMODULE hModule = GetModuleHandle(NULL);
GetModuleFileName(hModule, exe_path, MAX_PATH);
#else
CHECK_ERR(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1));
#endif
#endif #endif
// Make sure it's null-terminated: // Make sure it's null-terminated:
exe_path[sizeof(exe_path) - 1] = 0; exe_path[sizeof(exe_path) - 1] = 0;
@ -908,8 +917,10 @@ static int TryToReadNumaNode(const string &pci_bus_id, int device_ordinal) {
// could use the file::* utilities). // could use the file::* utilities).
FILE *file = fopen(filename.c_str(), "r"); FILE *file = fopen(filename.c_str(), "r");
if (file == nullptr) { if (file == nullptr) {
#if !defined(PLATFORM_WINDOWS)
LOG(ERROR) << "could not open file to read NUMA node: " << filename LOG(ERROR) << "could not open file to read NUMA node: " << filename
<< "\nYour kernel may have been built without NUMA support."; << "\nYour kernel may have been built without NUMA support.";
#endif
return kUnknownNumaNode; return kUnknownNumaNode;
} }

View File

@ -15,8 +15,6 @@ limitations under the License.
#include "tensorflow/stream_executor/cuda/cuda_rng.h" #include "tensorflow/stream_executor/cuda/cuda_rng.h"
#include <dlfcn.h>
#include "tensorflow/stream_executor/cuda/cuda_activation.h" #include "tensorflow/stream_executor/cuda/cuda_activation.h"
#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h" #include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
#include "tensorflow/stream_executor/cuda/cuda_helpers.h" #include "tensorflow/stream_executor/cuda/cuda_helpers.h"

View File

@ -18,13 +18,17 @@ limitations under the License.
#include "tensorflow/stream_executor/dso_loader.h" #include "tensorflow/stream_executor/dso_loader.h"
#include <dlfcn.h>
#include <limits.h> #include <limits.h>
#if defined(__APPLE__) #if defined(__APPLE__)
#include <mach-o/dyld.h> #include <mach-o/dyld.h>
#endif #endif
#include <stdlib.h> #include <stdlib.h>
#if defined(PLATFORM_WINDOWS)
#include <windows.h>
#define PATH_MAX MAX_PATH
#else
#include <unistd.h> #include <unistd.h>
#endif
#include <initializer_list> #include <initializer_list>
#include <vector> #include <vector>
@ -45,7 +49,7 @@ string GetCudaVersion() { return TF_CUDA_VERSION; }
string GetCudnnVersion() { return TF_CUDNN_VERSION; } string GetCudnnVersion() { return TF_CUDNN_VERSION; }
/* static */ port::Status DsoLoader::GetCublasDsoHandle(void** dso_handle) { /* static */ port::Status DsoLoader::GetCublasDsoHandle(void** dso_handle) {
return GetDsoHandle(FindDsoPath(tensorflow::internal::FormatLibraryFileName( return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
"cublas", GetCudaVersion()), "cublas", GetCudaVersion()),
GetCudaLibraryDirPath()), GetCudaLibraryDirPath()),
dso_handle); dso_handle);
@ -55,35 +59,42 @@ string GetCudnnVersion() { return TF_CUDNN_VERSION; }
// libcudnn is versioned differently than the other libraries and may have a // libcudnn is versioned differently than the other libraries and may have a
// different version number than other CUDA libraries. See b/22397368 for // different version number than other CUDA libraries. See b/22397368 for
// some details about the complications surrounding this. // some details about the complications surrounding this.
return GetDsoHandle(FindDsoPath(tensorflow::internal::FormatLibraryFileName( return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
"cudnn", GetCudnnVersion()), "cudnn", GetCudnnVersion()),
GetCudaLibraryDirPath()), GetCudaLibraryDirPath()),
dso_handle); dso_handle);
} }
/* static */ port::Status DsoLoader::GetCufftDsoHandle(void** dso_handle) { /* static */ port::Status DsoLoader::GetCufftDsoHandle(void** dso_handle) {
return GetDsoHandle(FindDsoPath(tensorflow::internal::FormatLibraryFileName( return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
"cufft", GetCudaVersion()), "cufft", GetCudaVersion()),
GetCudaLibraryDirPath()), GetCudaLibraryDirPath()),
dso_handle); dso_handle);
} }
/* static */ port::Status DsoLoader::GetCurandDsoHandle(void** dso_handle) { /* static */ port::Status DsoLoader::GetCurandDsoHandle(void** dso_handle) {
return GetDsoHandle(FindDsoPath(tensorflow::internal::FormatLibraryFileName( return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
"curand", GetCudaVersion()), "curand", GetCudaVersion()),
GetCudaLibraryDirPath()), GetCudaLibraryDirPath()),
dso_handle); dso_handle);
} }
/* static */ port::Status DsoLoader::GetLibcudaDsoHandle(void** dso_handle) { /* static */ port::Status DsoLoader::GetLibcudaDsoHandle(void** dso_handle) {
#if defined(PLATFORM_WINDOWS)
return GetDsoHandle( return GetDsoHandle(
FindDsoPath(tensorflow::internal::FormatLibraryFileName("cuda", "1"), FindDsoPath(port::Env::Default()->FormatLibraryFileName("nvcuda", ""),
GetCudaDriverLibraryPath()), GetCudaDriverLibraryPath()),
dso_handle); dso_handle);
#else
return GetDsoHandle(
FindDsoPath(port::Env::Default()->FormatLibraryFileName("cuda", "1"),
GetCudaDriverLibraryPath()),
dso_handle);
#endif
} }
/* static */ port::Status DsoLoader::GetLibcuptiDsoHandle(void** dso_handle) { /* static */ port::Status DsoLoader::GetLibcuptiDsoHandle(void** dso_handle) {
return GetDsoHandle(FindDsoPath(tensorflow::internal::FormatLibraryFileName( return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
"cupti", GetCudaVersion()), "cupti", GetCudaVersion()),
GetCudaCuptiLibraryPath()), GetCudaCuptiLibraryPath()),
dso_handle); dso_handle);
@ -101,8 +112,6 @@ string GetCudnnVersion() { return TF_CUDNN_VERSION; }
return port::Status(port::error::INVALID_ARGUMENT, return port::Status(port::error::INVALID_ARGUMENT,
"Only LoadKind::kLocal is currently supported"); "Only LoadKind::kLocal is currently supported");
} }
int dynload_flags =
RTLD_LAZY | (load_kind == LoadKind::kLocal ? RTLD_LOCAL : RTLD_GLOBAL);
string path_string = path.ToString(); string path_string = path.ToString();
port::Status s = port::Status s =
port::Env::Default()->LoadLibrary(path_string.c_str(), dso_handle); port::Env::Default()->LoadLibrary(path_string.c_str(), dso_handle);
@ -125,6 +134,9 @@ string GetCudnnVersion() { return TF_CUDNN_VERSION; }
char unresolved_path[buffer_size]; char unresolved_path[buffer_size];
_NSGetExecutablePath(unresolved_path, &buffer_size); _NSGetExecutablePath(unresolved_path, &buffer_size);
CHECK_ERR(realpath(unresolved_path, exe_path) ? 1 : -1); CHECK_ERR(realpath(unresolved_path, exe_path) ? 1 : -1);
#elif defined(PLATFORM_WINDOWS)
HMODULE hModule = GetModuleHandle(NULL);
GetModuleFileName(hModule, exe_path, MAX_PATH);
#else #else
CHECK_ERR(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1)); CHECK_ERR(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1));
#endif #endif
@ -159,6 +171,9 @@ static std::vector<string>* CreatePrimordialRpaths() {
} }
/* static */ bool DsoLoader::TrySymbolicDereference(string* candidate) { /* static */ bool DsoLoader::TrySymbolicDereference(string* candidate) {
#if defined(PLATFORM_WINDOWS)
return false;
#else
char buf[PATH_MAX]; char buf[PATH_MAX];
char* result = realpath(candidate->c_str(), buf); char* result = realpath(candidate->c_str(), buf);
if (result == nullptr) { if (result == nullptr) {
@ -168,6 +183,7 @@ static std::vector<string>* CreatePrimordialRpaths() {
<< result << "\""; << result << "\"";
*candidate = result; *candidate = result;
return true; return true;
#endif
} }
/* static */ string DsoLoader::FindDsoPath(port::StringPiece library_name, /* static */ string DsoLoader::FindDsoPath(port::StringPiece library_name,
@ -206,6 +222,8 @@ static std::vector<string>* CreatePrimordialRpaths() {
/* static */ string DsoLoader::GetCudaDriverLibraryPath() { /* static */ string DsoLoader::GetCudaDriverLibraryPath() {
#if defined(__APPLE__) #if defined(__APPLE__)
return "external/local_config_cuda/cuda/driver/lib"; return "external/local_config_cuda/cuda/driver/lib";
#elif defined(PLATFORM_WINDOWS)
return "";
#else #else
return "external/local_config_cuda/cuda/driver/lib64"; return "external/local_config_cuda/cuda/driver/lib64";
#endif #endif

View File

@ -15,8 +15,13 @@ limitations under the License.
#include "tensorflow/stream_executor/lib/process_state.h" #include "tensorflow/stream_executor/lib/process_state.h"
#if defined(PLATFORM_WINDOWS)
#include <direct.h>
#include <stdlib.h>
#include <WinSock2.h>
#else
#include <unistd.h> #include <unistd.h>
#endif
#include <memory> #include <memory>
namespace perftools { namespace perftools {
@ -27,7 +32,7 @@ string Hostname() {
char hostname[1024]; char hostname[1024];
gethostname(hostname, sizeof hostname); gethostname(hostname, sizeof hostname);
hostname[sizeof hostname - 1] = 0; hostname[sizeof hostname - 1] = 0;
return hostname; return std::string(hostname);
} }
bool GetCurrentDirectory(string* dir) { bool GetCurrentDirectory(string* dir) {

View File

@ -16,6 +16,10 @@ limitations under the License.
#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_STATIC_THREADLOCAL_H_ #ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_STATIC_THREADLOCAL_H_
#define TENSORFLOW_STREAM_EXECUTOR_LIB_STATIC_THREADLOCAL_H_ #define TENSORFLOW_STREAM_EXECUTOR_LIB_STATIC_THREADLOCAL_H_
#ifdef _MSC_VER
#define __thread __declspec(thread)
#endif
// For POD types in TLS mode, s_obj_VAR is the thread-local variable. // For POD types in TLS mode, s_obj_VAR is the thread-local variable.
#define SE_STATIC_THREAD_LOCAL_POD(_Type_, _var_) \ #define SE_STATIC_THREAD_LOCAL_POD(_Type_, _var_) \
static __thread _Type_ s_obj_##_var_; \ static __thread _Type_ s_obj_##_var_; \

View File

@ -81,7 +81,7 @@ def ParseEventFilesSpec(logdir):
else: else:
run_name = None run_name = None
path = specification path = specification
if not io_wrapper.IsGCSPath(path): if not (io_wrapper.IsGCSPath(path) or path.startswith('hdfs://')):
path = os.path.realpath(path) path = os.path.realpath(path)
files[path] = run_name files[path] = run_name
return files return files

View File

@ -563,7 +563,7 @@ def _py_wrap_cc_impl(ctx):
for dep in ctx.attr.deps: for dep in ctx.attr.deps:
inputs += dep.cc.transitive_headers inputs += dep.cc.transitive_headers
inputs += ctx.files._swiglib inputs += ctx.files._swiglib
swig_include_dirs = set([f.root.path for f in inputs if f.root.path]) swig_include_dirs = set(_get_repository_roots(ctx, inputs))
swig_include_dirs += sorted([f.dirname for f in ctx.files._swiglib]) swig_include_dirs += sorted([f.dirname for f in ctx.files._swiglib])
args = ["-c++", args = ["-c++",
"-python", "-python",
@ -616,6 +616,35 @@ _py_wrap_cc = rule(
implementation = _py_wrap_cc_impl, implementation = _py_wrap_cc_impl,
) )
def _get_repository_roots(ctx, files):
"""Returns abnormal root directories under which files reside.
When running a ctx.action, source files within the main repository are all
relative to the current directory; however, files that are generated or exist
in remote repositories will have their root directory be a subdirectory,
e.g. bazel-out/local-fastbuild/genfiles/external/jpeg_archive. This function
returns the set of these devious directories, ranked and sorted by popularity
in order to hopefully minimize the number of I/O system calls within the
compiler, because includes have quadratic complexity.
"""
result = {}
for f in files:
root = f.root.path
if root:
if root not in result:
result[root] = 0
result[root] -= 1
work = f.owner.workspace_root
if work:
if root:
root += "/"
root += work
if root:
if root not in result:
result[root] = 0
result[root] -= 1
return [k for v, k in sorted([(v, k) for k, v in result.items()])]
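The return statement is a decorate-sort-undecorate ranking; a plain-Python sketch with hypothetical counts:

```python
# Counts are negated as they accumulate, so an ascending sort puts the
# most popular root first (values here are made up).
counts = {"bazel-out/local-fastbuild/genfiles": -3, "external/jpeg_archive": -1}
ranked = [k for v, k in sorted([(v, k) for k, v in counts.items()])]
# ranked == ["bazel-out/local-fastbuild/genfiles", "external/jpeg_archive"]
```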
# Bazel rule for collecting the header files that a target depends on. # Bazel rule for collecting the header files that a target depends on.
def _transitive_hdrs_impl(ctx): def _transitive_hdrs_impl(ctx):
outputs = set() outputs = set()

View File

@ -47,10 +47,6 @@
# TF_BUILD_BAZEL_CLEAN, if set to any non-empty and non-0 value, directs the # TF_BUILD_BAZEL_CLEAN, if set to any non-empty and non-0 value, directs the
# script to perform bazel clean prior to main build and test steps. # script to perform bazel clean prior to main build and test steps.
# #
# TF_BUILD_SERIAL_INSTALL_TESTS, if set to any non-empty and non-0 value,
# will force the Python install tests to run serially, overriding the
# concurrent testing behavior.
#
# TF_GPU_COUNT, Set the number of GPUs in the system. We run only this many # TF_GPU_COUNT, Set the number of GPUs in the system. We run only this many
# concurrent tests when running GPU tests. # concurrent tests when running GPU tests.
# #
@ -411,21 +407,21 @@ SKIP_COUNTER=0
FAILED_TESTS="" FAILED_TESTS=""
FAILED_TEST_LOGS="" FAILED_TEST_LOGS=""
N_JOBS=$(grep -c ^processor /proc/cpuinfo) if [[ "${IS_GPU}" == "1" ]]; then
if [[ -z ${N_JOBS} ]]; then
# Try the Mac way of getting number of CPUs
N_JOBS=$(sysctl -n hw.ncpu)
fi
if [[ -z ${N_JOBS} ]]; then
N_JOBS=8
echo "Cannot determine the number of processors"
echo "Using default concurrent job counter ${N_JOBS}"
fi
if [[ ! -z "${TF_BUILD_SERIAL_INSTALL_TESTS}" ]] &&
[[ "${TF_BUILD_SERIAL_INSTALL_TESTS}" != "0" ]]; then
N_JOBS=$TF_GPU_COUNT N_JOBS=$TF_GPU_COUNT
else
N_JOBS=$(grep -c ^processor /proc/cpuinfo)
if [[ -z ${N_JOBS} ]]; then
# Try the Mac way of getting number of CPUs
N_JOBS=$(sysctl -n hw.ncpu)
fi
# If still cannot determine the number of CPUs, pick 8.
if [[ -z ${N_JOBS} ]]; then
N_JOBS=8
echo "Cannot determine the number of processors"
echo "Using default concurrent job counter ${N_JOBS}"
fi
fi fi
echo "Running Python tests-on-install with ${N_JOBS} concurrent jobs..." echo "Running Python tests-on-install with ${N_JOBS} concurrent jobs..."
@@ -485,9 +481,14 @@ while true; do
   TEST_LOGS="${TEST_LOGS} ${TEST_LOG}"

   # Launch test asynchronously
-  "${SCRIPT_DIR}/../gpu_build/parallel_gpu_execute.sh" \
-    "${SCRIPT_DIR}/py_test_delegate.sh" \
-    "${PYTHON_BIN_PATH}" "${PY_TEST_DIR}/${TEST_BASENAME}" "${TEST_LOG}" &
+  if [[ "${IS_GPU}" == "1" ]]; then
+    "${SCRIPT_DIR}/../gpu_build/parallel_gpu_execute.sh" \
+      "${SCRIPT_DIR}/py_test_delegate.sh" \
+      "${PYTHON_BIN_PATH}" "${PY_TEST_DIR}/${TEST_BASENAME}" "${TEST_LOG}" &
+  else
+    "${SCRIPT_DIR}/py_test_delegate.sh" \
+      "${PYTHON_BIN_PATH}" "${PY_TEST_DIR}/${TEST_BASENAME}" "${TEST_LOG}" &
+  fi

   if [[ "${TEST_COUNTER}" -ge "${N_PAR_TESTS}" ]]; then
     # Run in exclusive mode

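The effect of routing GPU tests through parallel_gpu_execute.sh is to keep at most one test per GPU in flight while the loop keeps launching jobs in the background. A rough Python analogue (hypothetical; the real wrapper does its own GPU slot assignment) uses a counting semaphore:

import subprocess
import threading

GPU_SLOTS = threading.Semaphore(4)  # e.g. TF_GPU_COUNT=4

def run_gpu_test(cmd):
    with GPU_SLOTS:                       # wait for a free GPU slot
        subprocess.run(cmd, check=False)  # run the delegated test

# Eight tests queued, at most four running at once.
threads = [threading.Thread(target=run_gpu_test, args=(["true"],))
           for _ in range(8)]
for t in threads: t.start()
for t in threads: t.join()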
tensorflow/tools/ci_build/builds/test_tutorials.sh Normal file → Executable file
View File

@@ -146,7 +146,7 @@ test_mnist_with_summaries() {
   run_in_directory "${TEST_DIR}" "${LOG_FILE}" \
     tensorflow/examples/tutorials/mnist/mnist_with_summaries.py \
-    --data_dir="${TUT_TEST_DATA_DIR}/mnist" --summaries_dir="${SUMMARIES_DIR}"
+    --data_dir="${TUT_TEST_DATA_DIR}/mnist" --log_dir="${SUMMARIES_DIR}"

   # Verify final accuracy
   FINAL_ACCURACY=$(grep "Accuracy at step" "${LOG_FILE}" \

View File

@@ -103,10 +103,8 @@ WORKSPACE="${WORKSPACE:-$(upsearch WORKSPACE)}"
 BUILD_TAG="${BUILD_TAG:-tf_ci}"

 # Add extra params for cuda devices and libraries for GPU container.
-if [ "${CONTAINER_TYPE}" == "gpu" ]; then
-  # GPU pip tests-on-install concurrency is limited to the number of GPUs.
-  GPU_EXTRA_PARAMS="${GPU_EXTRA_PARAMS} -e TF_BUILD_SERIAL_INSTALL_TESTS=1"
-else
+# And clear them if we are not building for GPU.
+if [ "${CONTAINER_TYPE}" != "gpu" ]; then
   GPU_EXTRA_PARAMS=""
 fi

View File

@@ -16,7 +16,14 @@
 #
 # Builds the test server for distributed (GRPC) TensorFlow
 #
-# Usage: build_server.sh <docker_image_name> [--test]
+# Usage: build_server.sh <docker_image_name> <whl_url> [--test]
+#
+# Arguments:
+#   docker_image_name: Name of the docker image to build.
+#     E.g.: tensorflow/tf_grpc_test_server:0.11.0rc1
+#
+#   whl_url: URL from which the TensorFlow whl file will be downloaded.
+#     E.g.: https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl
 #
 # The optional flag --test lets the script to use the Dockerfile for the
 # testing GRPC server. Without the flag, the script will build the non-test
@@ -33,22 +40,35 @@ die() {
 }

 # Check arguments
-if [[ $# != 1 ]] && [[ $# != 2 ]]; then
-  die "Usage: $0 <docker_image_name> [--test]"
+if [[ $# -lt 2 ]]; then
+  die "Usage: $0 <docker_image_name> <whl_url> [--test]"
 fi

 DOCKER_IMG_NAME=$1
-shift
+WHL_URL=$2
+shift 2

 # Current script directory
 DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-DOCKER_FILE="${DIR}/server/Dockerfile"
+
+BUILD_DIR=$(mktemp -d)
+echo ""
+echo "Using whl file URL: ${WHL_URL}"
+echo "Building in temporary directory: ${BUILD_DIR}"
+
+cp -r ${DIR}/* "${BUILD_DIR}"/ || \
+  die "Failed to copy files to ${BUILD_DIR}"
+
+DOCKER_FILE="${BUILD_DIR}/server/Dockerfile"
 if [[ $1 == "--test" ]]; then
-  DOCKER_FILE="${DIR}/server/Dockerfile.test"
+  DOCKER_FILE="${BUILD_DIR}/server/Dockerfile.test"
 fi
 echo "Using Docker file: ${DOCKER_FILE}"

+# Download whl file into the build context directory.
+wget -P "${BUILD_DIR}" ${WHL_URL} || \
+  die "Failed to download tensorflow whl file from URL: ${WHL_URL}"
+
 if [[ ! -f "${DOCKER_FILE}" ]]; then
   die "ERROR: Unable to find dockerfile: ${DOCKER_FILE}"
 fi
@@ -56,5 +76,8 @@ echo "Dockerfile: ${DOCKER_FILE}"
 # Call docker build
 docker build --no-cache -t "${DOCKER_IMG_NAME}" \
-  -f "${DOCKER_FILE}" \
-  "${DIR}"
+  -f "${DOCKER_FILE}" "${BUILD_DIR}" || \
+  die "Failed to build docker image: ${DOCKER_IMG_NAME}"
+
+# Clean up docker build context directory.
+rm -rf "${BUILD_DIR}"
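The new flow stages a throwaway docker build context rather than building straight from the source tree. A condensed Python sketch of the same pattern (hypothetical helper; the script above does this in bash with mktemp, cp, and wget):

import os
import shutil
import subprocess
import tempfile
import urllib.request

def build_image(image_name, whl_url, src_dir, dockerfile="server/Dockerfile"):
    build_dir = tempfile.mkdtemp()  # mktemp -d
    try:
        shutil.copytree(src_dir, build_dir, dirs_exist_ok=True)  # cp -r
        urllib.request.urlretrieve(  # wget -P into the context
            whl_url, os.path.join(build_dir, os.path.basename(whl_url)))
        subprocess.check_call(
            ["docker", "build", "--no-cache", "-t", image_name,
             "-f", os.path.join(build_dir, dockerfile), build_dir])
    finally:
        shutil.rmtree(build_dir)  # rm -rf the temporary context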

View File

@@ -34,9 +34,10 @@ RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
     python get-pip.py && \
     rm get-pip.py

-# Install TensorFlow CPU version from nightly build
-RUN pip --no-cache-dir install \
-    https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc0-cp27-none-linux_x86_64.whl
+# Install TensorFlow wheel
+COPY tensorflow-*.whl /
+RUN pip install /tensorflow-*.whl && \
+    rm -f /tensorflow-*.whl

 # Copy files, including the GRPC server binary at
 # server/grpc_tensorflow_server.py

View File

@@ -40,9 +40,10 @@ RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
 # Install python panda for the census wide&deep test
 RUN pip install --upgrade pandas==0.18.1

-# Install TensorFlow CPU version.
-RUN pip --no-cache-dir install \
-    https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc0-cp27-none-linux_x86_64.whl
+# Install TensorFlow wheel
+COPY tensorflow-*.whl /
+RUN pip install /tensorflow-*.whl && \
+    rm -f /tensorflow-*.whl

 # Copy files, including the GRPC server binary at
 # server/grpc_tensorflow_server.py

View File

@@ -33,7 +33,7 @@ RUN pip --no-cache-dir install \
     && \
     python -m ipykernel.kernelspec

-ENV TENSORFLOW_VERSION 0.11.0rc0
+ENV TENSORFLOW_VERSION 0.11.0rc1

 # --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
 # These lines will be edited automatically by parameterized_docker_build.sh. #

View File

@@ -33,7 +33,7 @@ RUN pip --no-cache-dir install \
     && \
     python -m ipykernel.kernelspec

-ENV TENSORFLOW_VERSION 0.11.0rc0
+ENV TENSORFLOW_VERSION 0.11.0rc1

 # --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
 # These lines will be edited automatically by parameterized_docker_build.sh. #

View File

@@ -17,7 +17,7 @@ RUN ./install_google_cloud_sdk.bash --disable-prompts --install-dir=/var/gcloud

 # Install nightly TensorFlow pip
 RUN pip install \
-    https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc0-cp27-none-linux_x86_64.whl
+    https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl

 # Copy test files
 RUN mkdir -p /gcs-smoke/python

View File

@@ -81,7 +81,6 @@ fi
 cat ${LOG_FILE}
 echo ""
-
 # Clean up the newly created tfrecord file in GCS bucket.
 # First, activate gcloud service account
 "${GCLOUD_BIN}" auth activate-service-account \
@@ -96,13 +95,3 @@ fi
 "${GSUTIL_BIN}" rm "${NEW_TFREC_URL}" && \
     echo "Cleaned up new tfrecord file in GCS: ${NEW_TFREC_URL}" || \
     die "FAIL: Unable to clean up new tfrecord file in GCS: ${NEW_TFREC_URL}"
-
-# Also clean up newly created GCS dir.
-NEW_DIR_URL=$(grep "Creating dir" "${LOG_FILE}" | \
-    awk '{print $NF}')
-if [[ -z ${NEW_DIR_URL} ]]; then
-  die "FAIL: Unable to determine the URL to the new directory created in GCS."
-fi
-
-"${GSUTIL_BIN}" rm -r "${NEW_DIR_URL}" && \
-    echo "Cleaned up new directory created in GCS: ${NEW_DIR_URL}" || \
-    die "FAIL: Unable to clean up new directory created in GCS: ${NEW_DIR_URL}"

View File

@@ -35,7 +35,6 @@ flags.DEFINE_integer("num_examples", 10, "Number of examples to generate")

 FLAGS = flags.FLAGS

-
 def create_examples(num_examples, input_mean):
   """Create ExampleProto's containg data."""
   ids = np.arange(num_examples).reshape([num_examples, 1])
@@ -64,12 +63,48 @@ def create_dir_test():
   print("%s directory exists: %s" % (dir_name, dir_exists))

   # List contents of just created directory.
-  starttime = int(round(time.time() * 1000))
   print("Listing directory %s." % dir_name)
+  starttime = int(round(time.time() * 1000))
   print(file_io.list_directory(dir_name))
   elapsed = int(round(time.time() * 1000)) - starttime
   print("Listed directory %s in %s milliseconds" % (dir_name, elapsed))

+  # Delete directory.
+  print("Deleting directory %s." % dir_name)
+  starttime = int(round(time.time() * 1000))
+  file_io.delete_recursively(dir_name)
+  elapsed = int(round(time.time() * 1000)) - starttime
+  print("Deleted directory %s in %s milliseconds" % (dir_name, elapsed))
+
+
+def create_object_test():
+  """Verifies file_io's object manipulation methods ."""
+  starttime = int(round(time.time() * 1000))
+  dir_name = "%s/tf_gcs_test_%s" % (FLAGS.gcs_bucket_url, starttime)
+  print("Creating dir %s." % dir_name)
+  file_io.create_dir(dir_name)
+
+  # Create a file in this directory.
+  file_name = "%s/test_file.txt" % dir_name
+  print("Creating file %s." % file_name)
+  file_io.write_string_to_file(file_name, "test file creation.")
+
+  list_files_pattern = "%s/test_file*.txt" % dir_name
+  print("Getting files matching pattern %s." % list_files_pattern)
+  files_list = file_io.get_matching_files(list_files_pattern)
+  print(files_list)
+
+  assert len(files_list) == 1
+  assert files_list[0] == file_name
+
+  # Cleanup test files.
+  print("Deleting file %s." % file_name)
+  file_io.delete_file(file_name)
+
+  # Delete directory.
+  print("Deleting directory %s." % dir_name)
+  file_io.delete_recursively(dir_name)
+
+
 if __name__ == "__main__":
   # Sanity check on the GCS bucket URL.
   if not FLAGS.gcs_bucket_url or not FLAGS.gcs_bucket_url.startswith("gs://"):
@@ -132,4 +167,5 @@ if __name__ == "__main__":
     print("Successfully caught the expected OutOfRangeError while "
           "reading one more record than is available")
   create_dir_test()
+  create_object_test()
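The new create_object_test exercises the file_io round trip end to end. For a quick local taste of the same calls (hypothetical paths; the real test runs against a gs:// bucket supplied via --gcs_bucket_url):

from tensorflow.python.lib.io import file_io

file_io.create_dir("/tmp/tf_file_io_demo")
file_io.write_string_to_file("/tmp/tf_file_io_demo/test_file.txt", "hello")
print(file_io.get_matching_files("/tmp/tf_file_io_demo/test_file*.txt"))
file_io.delete_recursively("/tmp/tf_file_io_demo")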

View File

@@ -147,7 +147,7 @@ def get_git_version(git_base_path):
   """
   unknown_label = b"unknown"
   try:
-    val = subprocess.check_output(["git", "-C", git_base_path, "describe",
+    val = subprocess.check_output(["git", str("--git-dir="+git_base_path+"/.git"), str("--work-tree="+git_base_path), "describe",
                                    "--long", "--dirty", "--tags"]).strip()
     return val if val else unknown_label
   except subprocess.CalledProcessError:
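The switch away from "git -C" matters because -C was only added in git 1.8.5; the --git-dir/--work-tree pair is accepted by much older installations. A standalone sketch of the equivalent call (hypothetical helper, same flags as the new code):

import subprocess

def describe(repo_path):
    """Return `git describe` output for repo_path, or b"unknown"."""
    try:
        out = subprocess.check_output(
            ["git", "--git-dir=" + repo_path + "/.git",
             "--work-tree=" + repo_path,
             "describe", "--long", "--dirty", "--tags"]).strip()
        return out or b"unknown"
    except (subprocess.CalledProcessError, OSError):
        return b"unknown"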

View File

@@ -107,7 +107,8 @@ function main() {
   mkdir -p ${TMPDIR}/third_party
   pushd ${RUNFILES%org_tensorflow}
   for header in $(find protobuf -name \*.h); do
-    cp --parents "$header" ${TMPDIR}/google;
+    mkdir -p "${TMPDIR}/google/$(dirname ${header})"
+    cp "$header" "${TMPDIR}/google/$(dirname ${header})/"
   done
   popd
   cp -R $RUNFILES/third_party/eigen3 ${TMPDIR}/third_party
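cp --parents is a GNU coreutils extension that BSD/macOS cp lacks, which is presumably why the loop now recreates the directory tree by hand. The same behavior in Python (hypothetical helper):

import os
import shutil

def copy_with_parents(rel_path, dest_root):
    """Copy rel_path under dest_root, preserving its directory structure."""
    dest_dir = os.path.join(dest_root, os.path.dirname(rel_path))
    os.makedirs(dest_dir, exist_ok=True)  # mkdir -p "$(dirname ...)"
    shutil.copy(rel_path, dest_dir)       # cp into the mirrored directory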

View File

@@ -26,7 +26,7 @@ from setuptools import find_packages, setup, Command
 from setuptools.command.install import install as InstallCommandBase
 from setuptools.dist import Distribution

-_VERSION = '0.11.0rc0'
+_VERSION = '0.11.0rc1'

 REQUIRED_PACKAGES = [
     'numpy >= 1.11.0',

tensorflow/tools/swig/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
+swig_path

View File

@@ -98,9 +98,9 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   native.http_archive(
     name = "protobuf",
-    url = "http://github.com/google/protobuf/archive/c2b3e70efd2038a54ef8973771ac58192885125e.tar.gz",
-    sha256 = "eafc1bc4c27970d62effe64ba6610823fdd66711f440d8ca4a168167786a2fcb",
-    strip_prefix = "protobuf-c2b3e70efd2038a54ef8973771ac58192885125e",
+    url = "http://github.com/google/protobuf/archive/008b5a228b37c054f46ba478ccafa5e855cb16db.tar.gz",
+    sha256 = "2737ad055eb8a9bc63ed068e32c4ea280b62d8236578cb4d4120eb5543f759ab",
+    strip_prefix = "protobuf-008b5a228b37c054f46ba478ccafa5e855cb16db",
   )

   native.new_http_archive(

View File

@@ -1,3 +1,6 @@
+#ifdef _WIN32
+#define sleep(seconds) Sleep(1000*seconds)
+#endif  // _WIN32
 #include "unsupported/Eigen/CXX11/Tensor"

 #ifdef _WIN32

View File

@@ -113,29 +113,33 @@ function setup_python {
     echo -e "\n\nERROR: Problem getting python include path. Is distutils installed?"
     exit 1
   fi
-  local python_lib_path
-  # Split python_path into an array of paths, this allows path containing spaces
-  IFS=','
-  python_lib_path=($(python_path))
-  unset IFS
-  echo "Found possible Python library paths:"
-  for x in "${python_lib_path[@]}"; do
-    echo "  $x"
-  done
-  set -- "${python_lib_path[@]}"
-  echo "Please input the desired Python library path to use. Default is ["$1"]"
-  read b || true
-  if [ "$b" == "" ]; then
-    python_lib="$(default_python_path "${python_lib_path[0]}")"
-    echo $python_lib
-  else
-    if test -d "$b" -a -x "$b"; then
-      python_lib="$b"
-    else
-      echo -e "\n\nERROR: The path you have entered does not exist."
-      exit 1
-    fi
-  fi
+
+  if [ -z "$PYTHON_LIB_PATH" ]; then
+    local python_lib_path
+    # Split python_path into an array of paths, this allows path containing spaces
+    IFS=','
+    python_lib_path=($(python_path))
+    unset IFS
+    echo "Found possible Python library paths:"
+    for x in "${python_lib_path[@]}"; do
+      echo "  $x"
+    done
+    set -- "${python_lib_path[@]}"
+    echo "Please input the desired Python library path to use. Default is ["$1"]"
+    read b || true
+    if [ "$b" == "" ]; then
+      PYTHON_LIB_PATH="$(default_python_path "${python_lib_path[0]}")"
+      echo $PYTHON_LIB_PATH
+    else
+      PYTHON_LIB_PATH="$b"
+    fi
+  fi
+
+  if test -d "$PYTHON_LIB_PATH" -a -x "$PYTHON_LIB_PATH"; then
+    python_lib="$PYTHON_LIB_PATH"
+  else
+    echo -e "\n\nERROR: Invalid python library path: ${PYTHON_LIB_PATH}."
+    exit 1
+  fi

   local numpy_include=$("${PYTHON_BIN_PATH}" -c 'from __future__ import print_function; import numpy; print(numpy.get_include());')
   if [ "$numpy_include" == "" ]; then