Update on "[iOS][CI] Disable automatically code signing for TestApp"

## Summary

Since the nightly jobs lack testing phases, we don't really have a way to test the binary before uploading it to AWS. To make this work more solid, we need a way to verify the binary.

Fortunately, the Xcode toolchain offers a way to build an app without the Xcode IDE: the [xcodebuild](https://developer.apple.com/library/archive/technotes/tn2339/_index.html) command. We can now link our binary into a test app and run `xcodebuild` to see whether there are any linking errors; a sketch of such a build command follows the list below. The PRs below have already done some of the preparation work:

- [#26261](https://github.com/pytorch/pytorch/pull/26261)
- [#26632](https://github.com/pytorch/pytorch/pull/26632)
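
As a rough illustration, a command along these lines builds the test app against the prebuilt binary without the Xcode GUI and lets the linker surface any errors. The project path, scheme, and destination are assumptions, not the exact values used in the CI scripts:

```bash
# Minimal sketch, not the actual CI script: build the test app against the
# prebuilt binary and let the linker report any missing or mismatched symbols.
# Project path, scheme, and destination are assumed for illustration.
xcodebuild clean build \
  -project ios/TestApp/TestApp.xcodeproj \
  -scheme TestApp \
  -destination 'platform=iOS Simulator,name=iPhone X' \
  CODE_SIGNING_ALLOWED=NO
```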

The challenge comes when testing the arm64 build, as we don't have a way to code-sign our TestApp. CircleCI has a [tutorial](https://circleci.com/docs/2.0/ios-codesigning/), but it is too complicated to implement. Instead, I figured out an easier way to do it:

1. Disable automatic code signing in Xcode.
2. Export the encoded developer certificate and provisioning profile to the org-context in CircleCI (done).
3. Install the developer certificate into the keychain store on CI machines via Fastlane (a minimal sketch follows this list).
4. Add the testing code to PR jobs and verify the result.
5. Add the testing code to nightly jobs and verify the result.
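
For step 3, the installation boils down to decoding the certificate exported to the org-context and importing it into a CI keychain. The sketch below uses the plain macOS `security` tool rather than a Fastlane lane; the environment variable names, keychain name, and profile name are assumptions:

```bash
# Hedged sketch of step 3: decode the base64-encoded certificate from the
# org-context and install it into a throwaway keychain on the CI machine.
# IOS_CERT_BASE64, IOS_CERT_PASSWORD, etc. are assumed variable names.
echo "${IOS_CERT_BASE64}" | base64 --decode > /tmp/cert.p12
security create-keychain -p "" ci.keychain
security unlock-keychain -p "" ci.keychain
security import /tmp/cert.p12 -k ci.keychain \
  -P "${IOS_CERT_PASSWORD}" -T /usr/bin/codesign
security list-keychains -d user -s ci.keychain
# The provisioning profile only needs to land in the expected directory.
mkdir -p ~/Library/MobileDevice/Provisioning\ Profiles
echo "${IOS_PROVISIONING_PROFILE_BASE64}" | base64 --decode \
  > ~/Library/MobileDevice/Provisioning\ Profiles/TestApp.mobileprovision
```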

## Test Plan

- Both PR jobs and nightly jobs finish successfully.
- `xcodebuild` finishes successfully (sketched below for the arm64 device case).
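
For the arm64 case specifically, the criterion is that a device build signs and links cleanly with the certificate and profile installed above. Roughly something like this, where the team ID, signing identity, and profile specifier are placeholders:

```bash
# Hedged sketch: arm64 device build using the manually installed certificate
# and provisioning profile. Team ID, identity, and profile name are placeholders.
xcodebuild build \
  -project ios/TestApp/TestApp.xcodeproj \
  -scheme TestApp \
  -sdk iphoneos \
  -configuration Release \
  CODE_SIGN_STYLE=Manual \
  DEVELOPMENT_TEAM=ABCDE12345 \
  CODE_SIGN_IDENTITY="iPhone Developer" \
  PROVISIONING_PROFILE_SPECIFIER=TestApp
```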

Differential Revision: [D17844036](https://our.internmc.facebook.com/intern/diff/D17844036)
Commit bd1f58ee48 by Tao Xu, 2019-10-09 16:48:17 -07:00
111 changed files with 6922 additions and 631 deletions

View File

@ -1,5 +1,3 @@
#!/usr/bin/env python3
"""
This module models the tree of configuration variants
for "smoketest" builds.

View File

@ -1,5 +1,3 @@
#!/usr/bin/env python3
from collections import OrderedDict
import cimodel.data.binary_build_data as binary_build_data

View File

@ -1,5 +1,3 @@
#!/usr/bin/env python3
from cimodel.lib.conf_tree import ConfigNode, X, XImportant
from cimodel.lib.conf_tree import Ver

View File

@ -1,5 +1,3 @@
#!/usr/bin/env python3
from collections import OrderedDict
import cimodel.data.dimensions as dimensions

View File

@ -1,6 +1,3 @@
#!/usr/bin/env python3
PHASES = ["build", "test"]
CUDA_VERSIONS = [

View File

@ -1,5 +1,3 @@
#!/usr/bin/env python3
from cimodel.lib.conf_tree import ConfigNode, X, XImportant

View File

@ -1,5 +1,3 @@
#!/usr/bin/env python3
from collections import OrderedDict
from cimodel.data.pytorch_build_data import TopLevelNode, CONFIG_TREE_DATA

View File

@ -1,6 +1,3 @@
#!/usr/bin/env python3
from dataclasses import dataclass, field
from typing import Optional, Dict

View File

@ -1,6 +1,3 @@
#!/usr/bin/env python3
def quote(s):
return sandwich('"', s)

View File

@ -1,6 +1,3 @@
#!/usr/bin/env python3
from collections import OrderedDict

View File

@ -1,5 +1,3 @@
#!/usr/bin/env python3
"""
This module encapsulates dependencies on pygraphviz
"""

View File

@ -53,7 +53,7 @@ sudo apt-get -y install doxygen
# Generate ATen files
pushd "${pt_checkout}"
pip install -r requirements.txt
time GEN_TO_SOURCE=1 python aten/src/ATen/gen.py \
time python aten/src/ATen/gen.py \
-s aten/src/ATen \
-d build/aten/src/ATen \
aten/src/ATen/Declarations.cwrap \

View File

@ -5,10 +5,8 @@ max-line-length = 120
# E501 is not flexible enough, we're using B950 instead
ignore =
E203,E305,E402,E501,E721,E741,F403,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303,
# EXE001 is skipped for now because some files use shebang to determine Python version.
EXE001,
# these ignores are from flake8-bugbear; please fix!
B007,B008,
# these ignores are from flake8-comprehensions; please fix!
C400,C401,C402,C403,C404,C405,C407,C411,
exclude = docs/src,venv,third_party,caffe2,scripts,docs/caffe2,torch/lib/include,torch/lib/tmp_install,build,torch/include,*.pyi
exclude = docs/src,venv,third_party,caffe2,scripts,docs/caffe2,torch/lib/include,torch/lib/tmp_install,build,torch/include,*.pyi,.git

View File

@ -440,6 +440,38 @@ ccache -F 0
# deploy (and add to ~/.bashrc for later)
export PATH="/usr/lib/ccache:$PATH"
```
It is also possible to install `ccache` via `conda` by installing it from the
community-maintained `conda-forge` channel. Here is how to set up `ccache` this
way:
```bash
# install ccache
conda install -c conda-forge ccache
# set up ccache compiler symlinks
mkdir ~/ccache
mkdir ~/ccache/lib
mkdir ~/ccache/cuda
ln -s $CONDA_PREFIX/bin/ccache ~/ccache/lib/cc
ln -s $CONDA_PREFIX/bin/ccache ~/ccache/lib/c++
ln -s $CONDA_PREFIX/bin/ccache ~/ccache/lib/gcc
ln -s $CONDA_PREFIX/bin/ccache ~/ccache/lib/g++
ln -s $CONDA_PREFIX/bin/ccache ~/ccache/cuda/nvcc
# update PATH to reflect symlink locations, consider
# adding this to your .bashrc
export PATH=~/ccache/lib:$PATH
export CUDA_NVCC_EXECUTABLE=~/ccache/cuda/nvcc
# increase ccache cache size to 25 GiB
ccache -M 25Gi
```
To check this is working, do two clean builds of pytorch in a row. The second
build should be substantially and noticeably faster than the first build.
#### Use a faster linker
If you are editing a single file and rebuilding in a tight loop, the time spent
linking will dominate. The system linker available in most Linux distributions

View File

@ -146,10 +146,24 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {}
switch (_st) { \
AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE(at::ScalarType::Half, at::Half, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE(at::ScalarType::ComplexDouble, std::complex<double>, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE(at::ScalarType::ComplexFloat, std::complex<float>, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE(at::ScalarType::ComplexHalf, std::complex<at::Half>, __VA_ARGS__) \
default: \
AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \
} \
}()
#define AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(SCALARTYPE, TYPE, NAME, ...) \
[&] { \
const auto& the_type = TYPE; \
/* don't use TYPE again in case it is an expensive or side-effect op */ \
at::ScalarType _st = ::detail::scalar_type(the_type); \
switch (_st) { \
AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE(at::ScalarType::ComplexDouble, std::complex<double>, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE(at::ScalarType::ComplexFloat, std::complex<float>, __VA_ARGS__) \
AT_PRIVATE_CASE_TYPE(SCALARTYPE, decltype(c10::impl::ScalarTypeToCPPType<SCALARTYPE>::t), __VA_ARGS__) \
default: \
AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \
} \

View File

@ -7,6 +7,7 @@
#include <cmath>
#include <type_traits>
#include <c10/util/BFloat16.h>
#include <c10/util/Complex.h>
#include <c10/macros/Macros.h>
namespace at {
@ -31,6 +32,12 @@ inline C10_HOST_DEVICE bool _isnan(T val) {
#endif
}
template <typename T,
typename std::enable_if<std::is_complex_t<T>::value, int>::type = 0>
inline bool _isnan(T val) {
return std::isnan(std::real(val)) || std::isnan(std::imag(val));
}
inline C10_HOST_DEVICE bool _isnan(at::BFloat16 val) {
return at::_isnan(float(val));
}

View File

@ -40,7 +40,8 @@ namespace impl {
// question is whether or not we have access to all the relevant TLS at this
// point.
static inline TensorTypeId dispatchTypeId(TensorTypeSet ts) {
return (ts - c10::impl::tls_excluded_tensor_type_set()).highestPriorityTypeId();
c10::impl::LocalTensorTypeSet local = c10::impl::tls_local_tensor_type_set();
return ((ts | local.included_) - local.excluded_).highestPriorityTypeId();
}
}

View File

@ -12,6 +12,7 @@
#include <c10/core/ScalarType.h>
#include <c10/util/Exception.h>
#include <ATen/core/LegacyDeviceTypeInit.h>
#include <c10/core/impl/LocalTensorTypeSet.h>
#include <c10/core/TensorImpl.h>
#include <ATen/core/ATenDispatch.h>
#include <ATen/core/TensorBody.h>
@ -47,22 +48,19 @@ class CAFFE2_API LegacyTypeDispatch {
CAFFE2_API LegacyTypeDispatch& globalLegacyTypeDispatch();
// A RAII, thread local (!) guard that has the following effect:
//
// Upon construction: sets NonVariableTypeMode_enabled for the current thread to
// control whether we are in non-Variable-type mode.
//
// Upon destruction: sets NonVariableTypeMode_enabled back to the original value.
// A RAII, thread local (!) guard that will disable dispatch to variable
// handler.
//
// See NOTE [ Treating Variables as non-Variables in type dispatch ] for details.
struct CAFFE2_API AutoNonVariableTypeMode {
AutoNonVariableTypeMode(bool enabled) : prev_mode(NonVariableTypeMode::is_enabled()) {
NonVariableTypeMode::set_enabled(enabled);
// NB: The enabled parameter must ALWAYS be black, as Henry Ford used to say.
// TODO: Eliminate this parameter entirely
AutoNonVariableTypeMode(bool enabled = true) :
guard_(TensorTypeId::VariableTensorId) {
TORCH_INTERNAL_ASSERT(enabled);
}
~AutoNonVariableTypeMode() {
NonVariableTypeMode::set_enabled(prev_mode);
}
bool prev_mode;
c10::impl::ExcludeTensorTypeIdGuard guard_;
};
} // namespace at

View File

@ -7,6 +7,8 @@
#include <ATen/cpu/vec256/vec256_double.h>
#include <ATen/cpu/vec256/vec256_int.h>
#include <ATen/cpu/vec256/vec256_qint.h>
#include <ATen/cpu/vec256/vec256_complex_float.h>
#include <ATen/cpu/vec256/vec256_complex_double.h>
#include <algorithm>
#include <cstddef>

View File

@ -12,6 +12,7 @@
#include <ATen/NumericUtils.h>
#include <c10/util/C++17.h>
#include <c10/util/BFloat16.h>
#include <ATen/native/cpu/zmath.h>
#if defined(__GNUC__)
#define __at_align32__ __attribute__((aligned(32)))
@ -169,12 +170,19 @@ public:
}
return ret;
}
template <typename non_float_t = T,
typename std::enable_if<!std::is_floating_point<non_float_t>::value, int>::type = 0>
Vec256<T> map(T (*f)(const T &)) const {
Vec256<T> ret;
for (int64_t i = 0; i != size(); i++) {
ret[i] = f(values[i]);
}
return ret;
}
template <typename other_t = T,
typename std::enable_if<!std::is_floating_point<other_t>::value && !std::is_complex_t<other_t>::value, int>::type = 0>
Vec256<T> abs() const {
// non_float_t is for SFINAE and clarity. Make sure it is not changed.
static_assert(std::is_same<non_float_t, T>::value, "non_float_t must be T");
return map([](T x) -> T { return x < static_cast<non_float_t>(0) ? -x : x; });
// other_t is for SFINAE and clarity. Make sure it is not changed.
static_assert(std::is_same<other_t, T>::value, "other_t must be T");
return map([](T x) -> T { return x < static_cast<other_t>(0) ? -x : x; });
}
template <typename float_t = T,
typename std::enable_if<std::is_floating_point<float_t>::value, int>::type = 0>
@ -185,6 +193,26 @@ public:
// 0.0) properly.
return map(std::abs);
}
template <typename complex_t = T,
typename std::enable_if<std::is_complex_t<complex_t>::value, int>::type = 0>
Vec256<T> abs() const {
// complex_t is for SFINAE and clarity. Make sure it is not changed.
static_assert(std::is_same<complex_t, T>::value, "complex_t must be T");
// Specifically map() does not perform the type conversion needed by abs.
return map([](T x) { return (T)std::abs(x); });
}
Vec256<T> angle() const {
return *this;
}
Vec256<T> real() const {
return *this;
}
Vec256<T> imag() const {
return *this;
}
Vec256<T> conj() const {
return *this;
}
Vec256<T> acos() const {
return map(std::acos);
}
@ -232,7 +260,7 @@ public:
return map(std::log2);
}
Vec256<T> ceil() const {
return map(std::ceil);
return map(at::native::ceil_impl);
}
Vec256<T> cos() const {
return map(std::cos);
@ -241,7 +269,7 @@ public:
return map(std::cosh);
}
Vec256<T> floor() const {
return map(std::floor);
return map(at::native::floor_impl);
}
Vec256<T> neg() const {
// NB: the trailing return type is needed because we need to coerce the
@ -251,7 +279,7 @@ public:
}
Vec256<T> round() const {
// We do not use std::round because we would like to round midway numbers to the nearest even integer.
return map(std::nearbyint);
return map(at::native::round_impl);
}
Vec256<T> sin() const {
return map(std::sin);
@ -266,7 +294,7 @@ public:
return map(std::tanh);
}
Vec256<T> trunc() const {
return map(std::trunc);
return map(at::native::trunc_impl);
}
Vec256<T> lgamma() const {
return map(std::lgamma);
@ -278,7 +306,7 @@ public:
return map([](T x) { return (T)(1) / x; });
}
Vec256<T> rsqrt() const {
return map([](T x) { return 1 / std::sqrt(x); });
return map([](T x) { return (T)1 / std::sqrt(x); });
}
Vec256<T> pow(const Vec256<T> &exp) const {
Vec256<T> ret;
@ -352,7 +380,9 @@ template <class T> Vec256<T> inline operator||(
// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if
// either input is a NaN.
template <class T> Vec256<T> inline maximum(const Vec256<T> &a, const Vec256<T> &b) {
template <class T,
typename std::enable_if<!std::is_complex_t<T>::value, int>::type = 0>
Vec256<T> inline maximum(const Vec256<T> &a, const Vec256<T> &b) {
Vec256<T> c = Vec256<T>();
for (int i = 0; i != Vec256<T>::size(); i++) {
c[i] = (a[i] > b[i]) ? a[i] : b[i];
@ -366,6 +396,22 @@ template <class T> Vec256<T> inline maximum(const Vec256<T> &a, const Vec256<T>
return c;
}
template <class T,
typename std::enable_if<std::is_complex_t<T>::value, int>::type = 0>
Vec256<T> inline maximum(const Vec256<T> &a, const Vec256<T> &b) {
Vec256<T> c = Vec256<T>();
for (int i = 0; i != Vec256<T>::size(); i++) {
c[i] = (std::abs(a[i]) > std::abs(b[i])) ? a[i] : b[i];
if (_isnan(a[i])) {
// If either input is NaN, propagate a NaN.
// NOTE: The case where b[i] was NaN is handled correctly by the naive
// ternary operator above.
c[i] = a[i];
}
}
return c;
}
template <typename T>
inline T maximum(const T& a, const T& b) {
T c = (a > b) ? a : b;
@ -377,7 +423,9 @@ inline T maximum(const T& a, const T& b) {
// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if
// either input is a NaN.
template <class T> Vec256<T> inline minimum(const Vec256<T> &a, const Vec256<T> &b) {
template <class T,
typename std::enable_if<!std::is_complex_t<T>::value, int>::type = 0>
Vec256<T> inline minimum(const Vec256<T> &a, const Vec256<T> &b) {
Vec256<T> c = Vec256<T>();
for (int i = 0; i != Vec256<T>::size(); i++) {
c[i] = (a[i] < b[i]) ? a[i] : b[i];
@ -391,6 +439,22 @@ template <class T> Vec256<T> inline minimum(const Vec256<T> &a, const Vec256<T>
return c;
}
template <class T,
typename std::enable_if<std::is_complex_t<T>::value, int>::type = 0>
Vec256<T> inline minimum(const Vec256<T> &a, const Vec256<T> &b) {
Vec256<T> c = Vec256<T>();
for (int i = 0; i != Vec256<T>::size(); i++) {
c[i] = (std::abs(a[i]) < std::abs(b[i])) ? a[i] : b[i];
if (_isnan(a[i])) {
// If either input is NaN, propagate a NaN.
// NOTE: The case where b[i] was NaN is handled correctly by the naive
// ternary operator above.
c[i] = a[i];
}
}
return c;
}
template <typename T>
inline T minimum(const T& a, const T& b) {
T c = (a < b) ? a : b;
@ -401,7 +465,9 @@ inline T minimum(const T& a, const T& b) {
}
// To save BC, it will not propagate NaN based on IEEE 754 201X
template <class T> Vec256<T> inline clamp(const Vec256<T> &a, const Vec256<T> &min_vec, const Vec256<T> &max_vec) {
template <class T,
typename std::enable_if<!std::is_complex_t<T>::value, int>::type = 0>
Vec256<T> inline clamp(const Vec256<T> &a, const Vec256<T> &min_vec, const Vec256<T> &max_vec) {
Vec256<T> c = Vec256<T>();
for (int i = 0; i != Vec256<T>::size(); i++) {
c[i] = a[i] < min_vec[i] ? min_vec[i] : (a[i] > max_vec[i] ? max_vec[i] : a[i]);
@ -409,7 +475,19 @@ template <class T> Vec256<T> inline clamp(const Vec256<T> &a, const Vec256<T> &m
return c;
}
template <class T> Vec256<T> inline clamp_max(const Vec256<T> &a, const Vec256<T> &max_vec) {
template <class T,
typename std::enable_if<std::is_complex_t<T>::value, int>::type = 0>
Vec256<T> inline clamp(const Vec256<T> &a, const Vec256<T> &min_vec, const Vec256<T> &max_vec) {
Vec256<T> c = Vec256<T>();
for (int i = 0; i != Vec256<T>::size(); i++) {
c[i] = std::abs(a[i]) < std::abs(min_vec[i]) ? min_vec[i] : (std::abs(a[i]) > std::abs(max_vec[i]) ? max_vec[i] : a[i]);
}
return c;
}
template <class T,
typename std::enable_if<!std::is_complex_t<T>::value, int>::type = 0>
Vec256<T> inline clamp_max(const Vec256<T> &a, const Vec256<T> &max_vec) {
Vec256<T> c = Vec256<T>();
for (int i = 0; i != Vec256<T>::size(); i++) {
c[i] = a[i] > max_vec[i] ? max_vec[i] : a[i];
@ -417,7 +495,19 @@ template <class T> Vec256<T> inline clamp_max(const Vec256<T> &a, const Vec256<T
return c;
}
template <class T> Vec256<T> inline clamp_min(const Vec256<T> &a, const Vec256<T> &min_vec) {
template <class T,
typename std::enable_if<std::is_complex_t<T>::value, int>::type = 0>
Vec256<T> inline clamp_max(const Vec256<T> &a, const Vec256<T> &max_vec) {
Vec256<T> c = Vec256<T>();
for (int i = 0; i != Vec256<T>::size(); i++) {
c[i] = std::abs(a[i]) > std::abs(max_vec[i]) ? max_vec[i] : a[i];
}
return c;
}
template <class T,
typename std::enable_if<!std::is_complex_t<T>::value, int>::type = 0>
Vec256<T> inline clamp_min(const Vec256<T> &a, const Vec256<T> &min_vec) {
Vec256<T> c = Vec256<T>();
for (int i = 0; i != Vec256<T>::size(); i++) {
c[i] = a[i] < min_vec[i] ? min_vec[i] : a[i];
@ -425,6 +515,16 @@ template <class T> Vec256<T> inline clamp_min(const Vec256<T> &a, const Vec256<T
return c;
}
template <class T,
typename std::enable_if<std::is_complex_t<T>::value, int>::type = 0>
Vec256<T> inline clamp_min(const Vec256<T> &a, const Vec256<T> &min_vec) {
Vec256<T> c = Vec256<T>();
for (int i = 0; i != Vec256<T>::size(); i++) {
c[i] = std::abs(a[i]) < std::abs(min_vec[i]) ? min_vec[i] : a[i];
}
return c;
}
#define DEFINE_BITWISE_OP(op) \
template <class T> \
Vec256<T> inline operator op(const Vec256<T> &a, const Vec256<T> &b) { \

View File

@ -0,0 +1,369 @@
#pragma once
#include <ATen/cpu/vec256/intrinsics.h>
#include <ATen/cpu/vec256/vec256_base.h>
#if defined(__AVX__) && !defined(_MSC_VER)
#include <sleef.h>
#endif
namespace at {
namespace vec256 {
// See Note [Acceptable use of anonymous namespace in header]
namespace {
#if defined(__AVX__) && !defined(_MSC_VER)
template <> class Vec256<std::complex<double>> {
private:
__m256d values;
public:
using value_type = std::complex<double>;
static constexpr int size() {
return 2;
}
Vec256() {}
Vec256(__m256d v) : values(v) {}
Vec256(std::complex<double> val) {
double real_value = std::real(val);
double imag_value = std::imag(val);
values = _mm256_setr_pd(real_value, imag_value,
real_value, imag_value);
}
Vec256(std::complex<double> val1, std::complex<double> val2) {
values = _mm256_setr_pd(std::real(val1), std::imag(val1),
std::real(val2), std::imag(val2));
}
operator __m256d() const {
return values;
}
template <int64_t mask>
static Vec256<std::complex<double>> blend(const Vec256<std::complex<double>>& a, const Vec256<std::complex<double>>& b) {
// convert std::complex<V> index mask to V index mask: xy -> xxyy
switch (mask) {
case 0:
return a;
case 1:
return _mm256_blend_pd(a.values, b.values, 0x03);
case 2:
return _mm256_blend_pd(a.values, b.values, 0x0c);
}
return b;
}
static Vec256<std::complex<double>> blendv(const Vec256<std::complex<double>>& a, const Vec256<std::complex<double>>& b,
const Vec256<std::complex<double>>& mask) {
// convert std::complex<V> index mask to V index mask: xy -> xxyy
auto mask_ = _mm256_unpacklo_pd(mask.values, mask.values);
return _mm256_blendv_pd(a.values, b.values, mask_);
}
static Vec256<std::complex<double>> arange(std::complex<double> base = 0., std::complex<double> step = 1.) {
return Vec256<std::complex<double>>(base,
base + step);
}
static Vec256<std::complex<double>> set(const Vec256<std::complex<double>>& a, const Vec256<std::complex<double>>& b,
int64_t count = size()) {
switch (count) {
case 0:
return a;
case 1:
return blend<1>(a, b);
}
return b;
}
static Vec256<std::complex<double>> loadu(const void* ptr, int64_t count = size()) {
if (count == size())
return _mm256_loadu_pd(reinterpret_cast<const double*>(ptr));
__at_align32__ double tmp_values[2*size()];
std::memcpy(
tmp_values,
reinterpret_cast<const double*>(ptr),
count * sizeof(std::complex<double>));
return _mm256_load_pd(tmp_values);
}
void store(void* ptr, int count = size()) const {
if (count == size()) {
_mm256_storeu_pd(reinterpret_cast<double*>(ptr), values);
} else if (count > 0) {
double tmp_values[2*size()];
_mm256_storeu_pd(reinterpret_cast<double*>(tmp_values), values);
std::memcpy(ptr, tmp_values, count * sizeof(std::complex<double>));
}
}
const std::complex<double>& operator[](int idx) const = delete;
std::complex<double>& operator[](int idx) = delete;
Vec256<std::complex<double>> map(std::complex<double> (*f)(const std::complex<double> &)) const {
__at_align32__ std::complex<double> tmp[size()];
store(tmp);
for (int i = 0; i < size(); i++) {
tmp[i] = f(tmp[i]);
}
return loadu(tmp);
}
__m256d abs_2_() const {
auto val_2 = _mm256_mul_pd(values, values); // a*a b*b
return _mm256_hadd_pd(val_2, val_2); // a*a+b*b a*a+b*b
}
__m256d abs_() const {
return _mm256_sqrt_pd(abs_2_()); // abs abs
}
Vec256<std::complex<double>> abs() const {
const __m256d real_mask = _mm256_castsi256_pd(_mm256_setr_epi64x(0xFFFFFFFFFFFFFFFF, 0x0000000000000000,
0xFFFFFFFFFFFFFFFF, 0x0000000000000000));
return _mm256_and_pd(abs_(), real_mask); // abs 0
}
__m256d angle_() const {
//angle = atan2(b/a)
auto b_a = _mm256_permute_pd(values, 0x05); // b a
return Sleef_atan2d4_u10(values, b_a); // 90-angle angle
}
Vec256<std::complex<double>> angle() const {
const __m256d real_mask = _mm256_castsi256_pd(_mm256_setr_epi64x(0xFFFFFFFFFFFFFFFF, 0x0000000000000000,
0xFFFFFFFFFFFFFFFF, 0x0000000000000000));
auto angle = _mm256_permute_pd(angle_(), 0x05); // angle 90-angle
return _mm256_and_pd(angle, real_mask); // angle 0
}
__m256d real_() const {
const __m256d real_mask = _mm256_castsi256_pd(_mm256_setr_epi64x(0xFFFFFFFFFFFFFFFF, 0x0000000000000000,
0xFFFFFFFFFFFFFFFF, 0x0000000000000000));
return _mm256_and_pd(values, real_mask);
}
Vec256<std::complex<double>> real() const {
return real_();
}
__m256d imag_() const {
const __m256d imag_mask = _mm256_castsi256_pd(_mm256_setr_epi64x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF,
0x0000000000000000, 0xFFFFFFFFFFFFFFFF));
return _mm256_and_pd(values, imag_mask);
}
Vec256<std::complex<double>> imag() const {
return _mm256_permute_pd(imag_(), 0x05); //b a
}
__m256d conj_() const {
const __m256d conj_mask = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
return _mm256_mul_pd(values, conj_mask); //a -b
}
Vec256<std::complex<double>> conj() const {
return conj_();
}
Vec256<std::complex<double>> acos() const {
return map(std::acos);
}
Vec256<std::complex<double>> asin() const {
return map(std::asin);
}
Vec256<std::complex<double>> atan() const {
return map(std::atan);
}
Vec256<std::complex<double>> atan2(const Vec256<std::complex<double>> &b) const {
AT_ERROR("not supported for complex numbers");
}
Vec256<std::complex<double>> erf() const {
AT_ERROR("not supported for complex numbers");
}
Vec256<std::complex<double>> erfc() const {
AT_ERROR("not supported for complex numbers");
}
Vec256<std::complex<double>> exp() const {
return map(std::exp);
}
Vec256<std::complex<double>> expm1() const {
AT_ERROR("not supported for complex numbers");
}
Vec256<std::complex<double>> log() const {
return map(std::log);
}
Vec256<std::complex<double>> log2() const {
AT_ERROR("not supported for complex numbers");
}
Vec256<std::complex<double>> log10() const {
return map(std::log10);
}
Vec256<std::complex<double>> log1p() const {
AT_ERROR("not supported for complex numbers");
}
Vec256<std::complex<double>> sin() const {
return map(std::sin);
}
Vec256<std::complex<double>> sinh() const {
return map(std::sinh);
}
Vec256<std::complex<double>> cos() const {
return map(std::cos);
}
Vec256<std::complex<double>> cosh() const {
return map(std::cosh);
}
Vec256<std::complex<double>> ceil() const {
return _mm256_ceil_pd(values);
}
Vec256<std::complex<double>> floor() const {
return _mm256_floor_pd(values);
}
Vec256<std::complex<double>> neg() const {
auto zero = _mm256_setzero_pd();
return _mm256_sub_pd(zero, values);
}
Vec256<std::complex<double>> round() const {
return _mm256_round_pd(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
}
Vec256<std::complex<double>> tan() const {
return map(std::tan);
}
Vec256<std::complex<double>> tanh() const {
return map(std::tanh);
}
Vec256<std::complex<double>> trunc() const {
return _mm256_round_pd(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
}
Vec256<std::complex<double>> sqrt() const {
return map(std::sqrt);
}
Vec256<std::complex<double>> reciprocal() const;
Vec256<std::complex<double>> rsqrt() const {
return map([](const std::complex<double> &x) { return (std::complex<double>)(1)/std::sqrt(x); });
}
Vec256<std::complex<double>> pow(const Vec256<std::complex<double>> &exp) const {
AT_ERROR("not supported for complex numbers");
}
// Comparison using the _CMP_**_OQ predicate.
// `O`: get false if an operand is NaN
// `Q`: do not raise if an operand is NaN
Vec256<std::complex<double>> operator==(const Vec256<std::complex<double>>& other) const {
return _mm256_cmp_pd(values, other.values, _CMP_EQ_OQ);
}
Vec256<std::complex<double>> operator!=(const Vec256<std::complex<double>>& other) const {
return _mm256_cmp_pd(values, other.values, _CMP_NEQ_OQ);
}
Vec256<std::complex<double>> operator<(const Vec256<std::complex<double>>& other) const {
AT_ERROR("not supported for complex numbers");
}
Vec256<std::complex<double>> operator<=(const Vec256<std::complex<double>>& other) const {
AT_ERROR("not supported for complex numbers");
}
Vec256<std::complex<double>> operator>(const Vec256<std::complex<double>>& other) const {
AT_ERROR("not supported for complex numbers");
}
Vec256<std::complex<double>> operator>=(const Vec256<std::complex<double>>& other) const {
AT_ERROR("not supported for complex numbers");
}
};
template <> Vec256<std::complex<double>> inline operator+(const Vec256<std::complex<double>> &a, const Vec256<std::complex<double>> &b) {
return _mm256_add_pd(a, b);
}
template <> Vec256<std::complex<double>> inline operator-(const Vec256<std::complex<double>> &a, const Vec256<std::complex<double>> &b) {
return _mm256_sub_pd(a, b);
}
template <> Vec256<std::complex<double>> inline operator*(const Vec256<std::complex<double>> &a, const Vec256<std::complex<double>> &b) {
//(a + bi) * (c + di) = (ac - bd) + (ad + bc)i
const __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
auto ac_bd = _mm256_mul_pd(a, b); //ac bd
auto d_c = _mm256_permute_pd(b, 0x05); //d c
d_c = _mm256_mul_pd(neg, d_c); //d -c
auto ad_bc = _mm256_mul_pd(a, d_c); //ad -bc
auto ret = _mm256_hsub_pd(ac_bd, ad_bc); //ac - bd ad + bc
return ret;
}
template <> Vec256<std::complex<double>> inline operator/(const Vec256<std::complex<double>> &a, const Vec256<std::complex<double>> &b) {
//re + im*i = (a + bi) / (c + di)
//re = (ac + bd)/abs_2()
//im = (bc - ad)/abs_2()
const __m256d neg = _mm256_setr_pd(-1.0, 1.0, -1.0, 1.0);
auto ac_bd = _mm256_mul_pd(a, b); //ac bd
auto d_c = _mm256_permute_pd(b, 0x05); //d c
d_c = _mm256_mul_pd(neg, d_c); //-d c
auto ad_bc = _mm256_mul_pd(a, d_c); //-ad bc
auto re_im = _mm256_hadd_pd(ac_bd, ad_bc);//ac + bd bc - ad
return _mm256_div_pd(re_im, b.abs_2_());
}
// reciprocal. Implement this here so we can use multiplication.
Vec256<std::complex<double>> Vec256<std::complex<double>>::reciprocal() const{
//re + im*i = (a + bi) / (c + di)
//re = (ac + bd)/abs_2() = c/abs_2()
//im = (bc - ad)/abs_2() = d/abs_2()
const __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
auto c_d = _mm256_mul_pd(neg, values); //c -d
return _mm256_div_pd(c_d, abs_2_());
}
template <>
Vec256<std::complex<double>> inline maximum(const Vec256<std::complex<double>>& a, const Vec256<std::complex<double>>& b) {
auto abs_a = a.abs_2_();
auto abs_b = b.abs_2_();
auto mask = _mm256_cmp_pd(abs_a, abs_b, _CMP_LT_OQ);
auto max = _mm256_blendv_pd(a, b, mask);
// Exploit the fact that all-ones is a NaN.
auto isnan = _mm256_cmp_pd(abs_a, abs_b, _CMP_UNORD_Q);
return _mm256_or_pd(max, isnan);
}
template <>
Vec256<std::complex<double>> inline minimum(const Vec256<std::complex<double>>& a, const Vec256<std::complex<double>>& b) {
auto abs_a = a.abs_2_();
auto abs_b = b.abs_2_();
auto mask = _mm256_cmp_pd(abs_a, abs_b, _CMP_GT_OQ);
auto min = _mm256_blendv_pd(a, b, mask);
// Exploit the fact that all-ones is a NaN.
auto isnan = _mm256_cmp_pd(abs_a, abs_b, _CMP_UNORD_Q);
return _mm256_or_pd(min, isnan);
}
template <>
Vec256<std::complex<double>> inline clamp(const Vec256<std::complex<double>>& a, const Vec256<std::complex<double>>& min, const Vec256<std::complex<double>>& max) {
auto abs_a = a.abs_2_();
auto abs_min = min.abs_2_();
auto max_mask = _mm256_cmp_pd(abs_a, abs_min, _CMP_LT_OQ);
auto abs_max = max.abs_2_();
auto min_mask = _mm256_cmp_pd(abs_a, abs_max, _CMP_GT_OQ);
return _mm256_blendv_pd(_mm256_blendv_pd(a, min, max_mask), max, min_mask);
}
template <>
Vec256<std::complex<double>> inline clamp_min(const Vec256<std::complex<double>>& a, const Vec256<std::complex<double>>& min) {
auto abs_a = a.abs_2_();
auto abs_min = min.abs_2_();
auto max_mask = _mm256_cmp_pd(abs_a, abs_min, _CMP_LT_OQ);
return _mm256_blendv_pd(a, min, max_mask);
}
template <>
Vec256<std::complex<double>> inline clamp_max(const Vec256<std::complex<double>>& a, const Vec256<std::complex<double>>& max) {
auto abs_a = a.abs_2_();
auto abs_max = max.abs_2_();
auto min_mask = _mm256_cmp_pd(abs_a, abs_max, _CMP_GT_OQ);
return _mm256_blendv_pd(a, max, min_mask);
}
template <>
Vec256<std::complex<double>> inline operator&(const Vec256<std::complex<double>>& a, const Vec256<std::complex<double>>& b) {
return _mm256_and_pd(a, b);
}
template <>
Vec256<std::complex<double>> inline operator|(const Vec256<std::complex<double>>& a, const Vec256<std::complex<double>>& b) {
return _mm256_or_pd(a, b);
}
template <>
Vec256<std::complex<double>> inline operator^(const Vec256<std::complex<double>>& a, const Vec256<std::complex<double>>& b) {
return _mm256_xor_pd(a, b);
}
#ifdef __AVX2__
template <> inline Vec256<std::complex<double>> fmadd(const Vec256<std::complex<double>>& a, const Vec256<std::complex<double>>& b, const Vec256<std::complex<double>>& c) {
return a * b + c;
}
#endif
#endif
}}}

View File

@ -0,0 +1,405 @@
#pragma once
#include <ATen/cpu/vec256/intrinsics.h>
#include <ATen/cpu/vec256/vec256_base.h>
#if defined(__AVX__) && !defined(_MSC_VER)
#include <sleef.h>
#endif
namespace at {
namespace vec256 {
// See Note [Acceptable use of anonymous namespace in header]
namespace {
#if defined(__AVX__) && !defined(_MSC_VER)
template <> class Vec256<std::complex<float>> {
private:
__m256 values;
public:
using value_type = std::complex<float>;
static constexpr int size() {
return 4;
}
Vec256() {}
Vec256(__m256 v) : values(v) {}
Vec256(std::complex<float> val) {
float real_value = std::real(val);
float imag_value = std::imag(val);
values = _mm256_setr_ps(real_value, imag_value,
real_value, imag_value,
real_value, imag_value,
real_value, imag_value
);
}
Vec256(std::complex<float> val1, std::complex<float> val2, std::complex<float> val3, std::complex<float> val4) {
values = _mm256_setr_ps(std::real(val1), std::imag(val1),
std::real(val2), std::imag(val2),
std::real(val3), std::imag(val3),
std::real(val4), std::imag(val4)
);
}
operator __m256() const {
return values;
}
template <int64_t mask>
static Vec256<std::complex<float>> blend(const Vec256<std::complex<float>>& a, const Vec256<std::complex<float>>& b) {
// convert std::complex<V> index mask to V index mask: xy -> xxyy
switch (mask) {
case 0:
return a;
case 1:
return _mm256_blend_ps(a.values, b.values, 0x03); //b0000 0001 = b0000 0011
case 2:
return _mm256_blend_ps(a.values, b.values, 0x0C); //b0000 0010 = b0000 1100
case 3:
return _mm256_blend_ps(a.values, b.values, 0x0F); //b0000 0011 = b0000 1111
case 4:
return _mm256_blend_ps(a.values, b.values, 0x30); //b0000 0100 = b0011 0000
case 5:
return _mm256_blend_ps(a.values, b.values, 0x33); //b0000 0101 = b0011 0011
case 6:
return _mm256_blend_ps(a.values, b.values, 0x3C); //b0000 0110 = b0011 1100
case 7:
return _mm256_blend_ps(a.values, b.values, 0x3F); //b0000 0111 = b0011 1111
case 8:
return _mm256_blend_ps(a.values, b.values, 0xC0); //b0000 1000 = b1100 0000
case 9:
return _mm256_blend_ps(a.values, b.values, 0xC3); //b0000 1001 = b1100 0011
case 10:
return _mm256_blend_ps(a.values, b.values, 0xCC); //b0000 1010 = b1100 1100
case 11:
return _mm256_blend_ps(a.values, b.values, 0xCF); //b0000 1011 = b1100 1111
case 12:
return _mm256_blend_ps(a.values, b.values, 0xF0); //b0000 1100 = b1111 0000
case 13:
return _mm256_blend_ps(a.values, b.values, 0xF3); //b0000 1101 = b1111 0011
case 14:
return _mm256_blend_ps(a.values, b.values, 0xFC); //b0000 1110 = b1111 1100
}
return b;
}
static Vec256<std::complex<float>> blendv(const Vec256<std::complex<float>>& a, const Vec256<std::complex<float>>& b,
const Vec256<std::complex<float>>& mask) {
// convert std::complex<V> index mask to V index mask: xy -> xxyy
auto mask_ = _mm256_unpacklo_ps(mask.values, mask.values);
return _mm256_blendv_ps(a.values, b.values, mask_);
}
static Vec256<std::complex<float>> arange(std::complex<float> base = 0., std::complex<float> step = 1.) {
return Vec256<std::complex<float>>(base,
base + step,
base + std::complex<float>(2)*step,
base + std::complex<float>(3)*step);
}
static Vec256<std::complex<float>> set(const Vec256<std::complex<float>>& a, const Vec256<std::complex<float>>& b,
int64_t count = size()) {
switch (count) {
case 0:
return a;
case 1:
return blend<1>(a, b);
case 2:
return blend<3>(a, b);
case 3:
return blend<7>(a, b);
}
return b;
}
static Vec256<std::complex<float>> loadu(const void* ptr, int64_t count = size()) {
if (count == size())
return _mm256_loadu_ps(reinterpret_cast<const float*>(ptr));
__at_align32__ float tmp_values[2*size()];
std::memcpy(
tmp_values,
reinterpret_cast<const float*>(ptr),
count * sizeof(std::complex<float>));
return _mm256_load_ps(tmp_values);
}
void store(void* ptr, int count = size()) const {
if (count == size()) {
_mm256_storeu_ps(reinterpret_cast<float*>(ptr), values);
} else if (count > 0) {
float tmp_values[2*size()];
_mm256_storeu_ps(reinterpret_cast<float*>(tmp_values), values);
std::memcpy(ptr, tmp_values, count * sizeof(std::complex<float>));
}
}
const std::complex<float>& operator[](int idx) const = delete;
std::complex<float>& operator[](int idx) = delete;
Vec256<std::complex<float>> map(std::complex<float> (*f)(const std::complex<float> &)) const {
__at_align32__ std::complex<float> tmp[size()];
store(tmp);
for (int i = 0; i < size(); i++) {
tmp[i] = f(tmp[i]);
}
return loadu(tmp);
}
__m256 abs_2_() const {
auto val_2 = _mm256_mul_ps(values, values); // a*a b*b
return _mm256_hadd_ps(val_2, val_2); // a*a+b*b a*a+b*b
}
__m256 abs_() const {
return _mm256_sqrt_ps(abs_2_()); // abs abs
}
Vec256<std::complex<float>> abs() const {
const __m256 real_mask = _mm256_castsi256_ps(_mm256_setr_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000,
0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000));
return _mm256_and_ps(abs_(), real_mask); // abs 0
}
__m256 angle_() const {
//angle = atan2(b/a)
auto b_a = _mm256_permute_ps(values, 0x55); // b a
return Sleef_atan2f8_u10(values, b_a); // 90-angle angle
}
Vec256<std::complex<float>> angle() const {
const __m256 real_mask = _mm256_castsi256_ps(_mm256_setr_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000,
0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000));
auto angle = _mm256_permute_ps(angle_(), 0x55); // angle 90-angle
return _mm256_and_ps(angle, real_mask); // angle 0
}
__m256 real_() const {
const __m256 real_mask = _mm256_castsi256_ps(_mm256_setr_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000,
0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000));
return _mm256_and_ps(values, real_mask);
}
Vec256<std::complex<float>> real() const {
return real_();
}
__m256 imag_() const {
const __m256 imag_mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF,
0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF));
return _mm256_and_ps(values, imag_mask);
}
Vec256<std::complex<float>> imag() const {
return _mm256_permute_ps(imag_(), 0x55); //b a
}
__m256 conj_() const {
const __m256 conj_mask = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0);
return _mm256_mul_ps(values, conj_mask); //a -b
}
Vec256<std::complex<float>> conj() const {
return conj_();
}
Vec256<std::complex<float>> acos() const {
return map(std::acos);
}
Vec256<std::complex<float>> asin() const {
return map(std::asin);
}
Vec256<std::complex<float>> atan() const {
return map(std::atan);
}
Vec256<std::complex<float>> atan2(const Vec256<std::complex<float>> &b) const {
AT_ERROR("not supported for complex numbers");
}
Vec256<std::complex<float>> erf() const {
AT_ERROR("not supported for complex numbers");
}
Vec256<std::complex<float>> erfc() const {
AT_ERROR("not supported for complex numbers");
}
Vec256<std::complex<float>> exp() const {
return map(std::exp);
}
Vec256<std::complex<float>> expm1() const {
AT_ERROR("not supported for complex numbers");
}
Vec256<std::complex<float>> log() const {
return map(std::log);
}
Vec256<std::complex<float>> log2() const {
AT_ERROR("not supported for complex numbers");
}
Vec256<std::complex<float>> log10() const {
return map(std::log10);
}
Vec256<std::complex<float>> log1p() const {
AT_ERROR("not supported for complex numbers");
}
Vec256<std::complex<float>> sin() const {
return map(std::sin);
}
Vec256<std::complex<float>> sinh() const {
return map(std::sinh);
}
Vec256<std::complex<float>> cos() const {
return map(std::cos);
}
Vec256<std::complex<float>> cosh() const {
return map(std::cosh);
}
Vec256<std::complex<float>> ceil() const {
return _mm256_ceil_ps(values);
}
Vec256<std::complex<float>> floor() const {
return _mm256_floor_ps(values);
}
Vec256<std::complex<float>> neg() const {
auto zero = _mm256_setzero_ps();
return _mm256_sub_ps(zero, values);
}
Vec256<std::complex<float>> round() const {
return _mm256_round_ps(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
}
Vec256<std::complex<float>> tan() const {
return map(std::tan);
}
Vec256<std::complex<float>> tanh() const {
return map(std::tanh);
}
Vec256<std::complex<float>> trunc() const {
return _mm256_round_ps(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
}
Vec256<std::complex<float>> sqrt() const {
return map(std::sqrt);
}
Vec256<std::complex<float>> reciprocal() const;
Vec256<std::complex<float>> rsqrt() const {
return map([](const std::complex<float> &x) { return (std::complex<float>)(1)/std::sqrt(x); });
}
Vec256<std::complex<float>> pow(const Vec256<std::complex<float>> &exp) const {
AT_ERROR("not supported for complex numbers");
}
// Comparison using the _CMP_**_OQ predicate.
// `O`: get false if an operand is NaN
// `Q`: do not raise if an operand is NaN
Vec256<std::complex<float>> operator==(const Vec256<std::complex<float>>& other) const {
return _mm256_cmp_ps(values, other.values, _CMP_EQ_OQ);
}
Vec256<std::complex<float>> operator!=(const Vec256<std::complex<float>>& other) const {
return _mm256_cmp_ps(values, other.values, _CMP_NEQ_OQ);
}
Vec256<std::complex<float>> operator<(const Vec256<std::complex<float>>& other) const {
AT_ERROR("not supported for complex numbers");
}
Vec256<std::complex<float>> operator<=(const Vec256<std::complex<float>>& other) const {
AT_ERROR("not supported for complex numbers");
}
Vec256<std::complex<float>> operator>(const Vec256<std::complex<float>>& other) const {
AT_ERROR("not supported for complex numbers");
}
Vec256<std::complex<float>> operator>=(const Vec256<std::complex<float>>& other) const {
AT_ERROR("not supported for complex numbers");
}
};
template <> Vec256<std::complex<float>> inline operator+(const Vec256<std::complex<float>> &a, const Vec256<std::complex<float>> &b) {
return _mm256_add_ps(a, b);
}
template <> Vec256<std::complex<float>> inline operator-(const Vec256<std::complex<float>> &a, const Vec256<std::complex<float>> &b) {
return _mm256_sub_ps(a, b);
}
template <> Vec256<std::complex<float>> inline operator*(const Vec256<std::complex<float>> &a, const Vec256<std::complex<float>> &b) {
//(a + bi) * (c + di) = (ac - bd) + (ad + bc)i
const __m256 neg = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0);
auto ac_bd = _mm256_mul_ps(a, b); //ac bd
auto d_c = _mm256_permute_ps(b, 0x55); //d c
d_c = _mm256_mul_ps(neg, d_c); //d -c
auto ad_bc = _mm256_mul_ps(a, d_c); //ad -bc
auto ret = _mm256_hsub_ps(ac_bd, ad_bc); //ac - bd ad + bc
return ret;
}
template <> Vec256<std::complex<float>> inline operator/(const Vec256<std::complex<float>> &a, const Vec256<std::complex<float>> &b) {
//re + im*i = (a + bi) / (c + di)
//re = (ac + bd)/abs_2()
//im = (bc - ad)/abs_2()
const __m256 neg = _mm256_setr_ps(-1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0);
auto ac_bd = _mm256_mul_ps(a, b); //ac bd
auto d_c = _mm256_permute_ps(b, 0x05); //d c
d_c = _mm256_mul_ps(neg, d_c); //-d c
auto ad_bc = _mm256_mul_ps(a, d_c); //-ad bc
auto re_im = _mm256_hadd_ps(ac_bd, ad_bc);//ac + bd bc - ad
return _mm256_div_ps(re_im, b.abs_2_());
}
// reciprocal. Implement this here so we can use multiplication.
Vec256<std::complex<float>> Vec256<std::complex<float>>::reciprocal() const {
//re + im*i = (a + bi) / (c + di)
//re = (ac + bd)/abs_2() = c/abs_2()
//im = (bc - ad)/abs_2() = d/abs_2()
const __m256 neg = _mm256_setr_ps(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0);
auto c_d = _mm256_mul_ps(neg, values); //c -d
return _mm256_div_ps(c_d, abs_2_());
}
template <>
Vec256<std::complex<float>> inline maximum(const Vec256<std::complex<float>>& a, const Vec256<std::complex<float>>& b) {
auto abs_a = a.abs_2_();
auto abs_b = b.abs_2_();
auto mask = _mm256_cmp_ps(abs_a, abs_b, _CMP_LT_OQ);
auto max = _mm256_blendv_ps(a, b, mask);
// Exploit the fact that all-ones is a NaN.
auto isnan = _mm256_cmp_ps(abs_a, abs_b, _CMP_UNORD_Q);
return _mm256_or_ps(max, isnan);
}
template <>
Vec256<std::complex<float>> inline minimum(const Vec256<std::complex<float>>& a, const Vec256<std::complex<float>>& b) {
auto abs_a = a.abs_2_();
auto abs_b = b.abs_2_();
auto mask = _mm256_cmp_ps(abs_a, abs_b, _CMP_GT_OQ);
auto min = _mm256_blendv_ps(a, b, mask);
// Exploit the fact that all-ones is a NaN.
auto isnan = _mm256_cmp_ps(abs_a, abs_b, _CMP_UNORD_Q);
return _mm256_or_ps(min, isnan);
}
template <>
Vec256<std::complex<float>> inline clamp(const Vec256<std::complex<float>>& a, const Vec256<std::complex<float>>& min, const Vec256<std::complex<float>>& max) {
auto abs_a = a.abs_2_();
auto abs_min = min.abs_2_();
auto max_mask = _mm256_cmp_ps(abs_a, abs_min, _CMP_LT_OQ);
auto abs_max = max.abs_2_();
auto min_mask = _mm256_cmp_ps(abs_a, abs_max, _CMP_GT_OQ);
return _mm256_blendv_ps(_mm256_blendv_ps(a, min, max_mask), max, min_mask);
}
template <>
Vec256<std::complex<float>> inline clamp_min(const Vec256<std::complex<float>>& a, const Vec256<std::complex<float>>& min) {
auto abs_a = a.abs_2_();
auto abs_min = min.abs_2_();
auto max_mask = _mm256_cmp_ps(abs_a, abs_min, _CMP_LT_OQ);
return _mm256_blendv_ps(a, min, max_mask);
}
template <>
Vec256<std::complex<float>> inline clamp_max(const Vec256<std::complex<float>>& a, const Vec256<std::complex<float>>& max) {
auto abs_a = a.abs_2_();
auto abs_max = max.abs_2_();
auto min_mask = _mm256_cmp_ps(abs_a, abs_max, _CMP_GT_OQ);
return _mm256_blendv_ps(a, max, min_mask);
}
template <>
Vec256<std::complex<float>> inline operator&(const Vec256<std::complex<float>>& a, const Vec256<std::complex<float>>& b) {
return _mm256_and_ps(a, b);
}
template <>
Vec256<std::complex<float>> inline operator|(const Vec256<std::complex<float>>& a, const Vec256<std::complex<float>>& b) {
return _mm256_or_ps(a, b);
}
template <>
Vec256<std::complex<float>> inline operator^(const Vec256<std::complex<float>>& a, const Vec256<std::complex<float>>& b) {
return _mm256_xor_ps(a, b);
}
#ifdef __AVX2__
template <> inline Vec256<std::complex<float>> fmadd(const Vec256<std::complex<float>>& a, const Vec256<std::complex<float>>& b, const Vec256<std::complex<float>>& c) {
return a * b + c;
}
#endif
#endif
}}}

View File

@ -91,6 +91,18 @@ public:
auto mask = _mm256_set1_pd(-0.f);
return _mm256_andnot_pd(mask, values);
}
Vec256<double> angle() const {
return _mm256_set1_pd(0);
}
Vec256<double> real() const {
return *this;
}
Vec256<double> imag() const {
return _mm256_set1_pd(0);
}
Vec256<double> conj() const {
return *this;
}
Vec256<double> acos() const {
return Vec256<double>(Sleef_acosd4_u10(values));
}

View File

@ -99,6 +99,18 @@ public:
auto mask = _mm256_set1_ps(-0.f);
return _mm256_andnot_ps(mask, values);
}
Vec256<float> angle() const {
return _mm256_set1_ps(0);
}
Vec256<float> real() const {
return *this;
}
Vec256<float> imag() const {
return _mm256_set1_ps(0);
}
Vec256<float> conj() const {
return *this;
}
Vec256<float> acos() const {
return Vec256<float>(Sleef_acosf8_u10(values));
}

View File

@ -97,6 +97,19 @@ struct Vec256<int64_t> : public Vec256i {
auto inverse = _mm256_xor_si256(values, is_larger);
return _mm256_sub_epi64(inverse, is_larger);
}
Vec256<int64_t> angle() const {
return _mm256_set1_epi64x(0);
}
Vec256<int64_t> real() const {
return *this;
}
Vec256<int64_t> imag() const {
return _mm256_set1_epi64x(0);
}
Vec256<int64_t> conj() const {
return *this;
}
Vec256<int64_t> frac() const;
Vec256<int64_t> neg() const;
Vec256<int64_t> operator==(const Vec256<int64_t>& other) const {
return _mm256_cmpeq_epi64(values, other.values);
@ -194,6 +207,19 @@ struct Vec256<int32_t> : public Vec256i {
Vec256<int32_t> abs() const {
return _mm256_abs_epi32(values);
}
Vec256<int32_t> angle() const {
return _mm256_set1_epi32(0);
}
Vec256<int32_t> real() const {
return *this;
}
Vec256<int32_t> imag() const {
return _mm256_set1_epi32(0);
}
Vec256<int32_t> conj() const {
return *this;
}
Vec256<int32_t> frac() const;
Vec256<int32_t> neg() const;
Vec256<int32_t> operator==(const Vec256<int32_t>& other) const {
return _mm256_cmpeq_epi32(values, other.values);
@ -380,6 +406,19 @@ struct Vec256<int16_t> : public Vec256i {
Vec256<int16_t> abs() const {
return _mm256_abs_epi16(values);
}
Vec256<int16_t> angle() const {
return _mm256_set1_epi16(0);
}
Vec256<int16_t> real() const {
return *this;
}
Vec256<int16_t> imag() const {
return _mm256_set1_epi16(0);
}
Vec256<int16_t> conj() const {
return *this;
}
Vec256<int16_t> frac() const;
Vec256<int16_t> neg() const;
Vec256<int16_t> operator==(const Vec256<int16_t>& other) const {
return _mm256_cmpeq_epi16(values, other.values);

View File

@ -39,9 +39,10 @@
// https://bugs.launchpad.net/ubuntu/+source/glibc/+bug/1663280. Calling zeroall
// when using AVX/AVX2 code resolves this.
#if defined(__AVX__) && defined(__GLIBC__) && __GLIBC_MINOR__ == 23
#define DL_RUNTIME_BUG(op, type) \
volatile type x = (type)(1); \
x = std::op(x); \
#define DL_RUNTIME_BUG(op, type) \
using value_t = typename at::native::ztype<type>::value_t; \
volatile value_t x = (value_t)(1); \
x = std::op(x); \
_mm256_zeroall();
#else
#define DL_RUNTIME_BUG(op, type)

View File

@ -55,53 +55,67 @@ bool _nnpack_available() {
#include "nnpack.h"
#include <stdlib.h>
#include <ATen/Parallel.h>
#include <thread>
#include "caffe2/utils/threadpool/ThreadPoolMobile.h"
namespace at {
namespace native {
// Stolen from Caffe2
static pthreadpool_t nnpack_threadpool_ = nullptr;
static bool called_nnpack_threadpool_ = false;
static bool init_nnpack() {
static std::once_flag once_;
static bool nnpack_successfully_initialized_ = false;
std::call_once(once_, []() {
const nnp_status nnpack_status = nnp_initialize();
nnpack_successfully_initialized_ = (nnp_status_success == nnpack_status);
pthreadpool_t nnpack_threadpool() {
if (! called_nnpack_threadpool_) {
called_nnpack_threadpool_ = true;
enum nnp_status nnpack_status = nnp_initialize();
if (nnpack_status != nnp_status_success) {
if (nnpack_status == nnp_status_out_of_memory) {
throw std::runtime_error("could not initialize NNPack (out of memory)");
LOG(WARNING) << "Could not initialize NNPACK! Reason: Out of memory.";
} else if (nnpack_status == nnp_status_unsupported_hardware) {
throw std::runtime_error("could not initialize NNPack (unsupported hardware)");
LOG(WARNING) << "Could not initialize NNPACK! Reason: Unsupported hardware.";
} else {
throw std::runtime_error("could not initialize NNPack (unknown error)");
LOG(WARNING) << "Could not initialize NNPACK! Reason: Unknown error!";
}
}
unsigned int threads;
#ifdef INTRA_OP_PARALLEL
threads = at::get_num_threads();
});
return nnpack_successfully_initialized_;
}
static pthreadpool_t nnpack_threadpool() {
// Try initializing a threadpool for NNPACK's use. If we fail to
// successfully initialize an implementation, return nullptr which will
// instruct NNPACK to run single threaded.
#ifdef C10_MOBILE
// If building for mobile, use Caffe 2's mobile-friendly threadpool.
return caffe2::mobile_pthreadpool();
#else
threads = std::thread::hardware_concurrency();
// Otherwise, try using pthreadpool if we manage to initialize it successfully.
static pthreadpool_t nnpack_threadpool_ = nullptr;
static bool called_nnpack_threadpool_ = false;
if (!called_nnpack_threadpool_) {
called_nnpack_threadpool_ = true;
#ifdef INTRA_OP_PARALLEL
const uint32_t threads = at::get_num_threads();
#else
const uint32_t threads = std::thread::hardware_concurrency();
#endif
nnpack_threadpool_ = pthreadpool_create(threads);
if (nnpack_threadpool_ == nullptr) {
throw std::runtime_error("could not initialize NNPack's pthreadpool");
if (!nnpack_threadpool_) {
LOG(WARNING) << "Failed to initialize pthreadpool! Running NNPACK in single-threaded mode.";
}
}
return nnpack_threadpool_;
#endif
}
bool _nnpack_available() {
if (! called_nnpack_threadpool_) {
try {
return nnpack_threadpool() != nullptr;
} catch (std::runtime_error e) {
}
}
return nnpack_threadpool() != nullptr;
return init_nnpack();
}
// Make thread_local for safety in cases where we have multiple threads running

View File

@ -246,7 +246,8 @@ Tensor& mvlgamma_(Tensor& self, int64_t p) {
return at::op##_out(self, self); \
} \
Tensor& _##op##_out_##prefix(Tensor& result, const Tensor& self) { \
checkBackend(#op, result, Backend::device); \
checkDeviceType(#op, result, DeviceType::device); \
checkLayout(#op, result, Layout::Strided); \
auto iter = TensorIterator::unary_op(result, self, \
/*check_mem_overlap=*/true); \
op##_stub(iter.device_type(), iter); \
@ -263,6 +264,10 @@ Tensor& mvlgamma_(Tensor& self, int64_t p) {
IMPLEMENT_UNARY_OP_OUT_INPLACE(op, cuda, CUDA)
IMPLEMENT_UNARY_OP_VEC(abs)
IMPLEMENT_UNARY_OP_VEC(angle)
IMPLEMENT_UNARY_OP_VEC(real)
IMPLEMENT_UNARY_OP_VEC(imag)
IMPLEMENT_UNARY_OP_VEC(conj)
IMPLEMENT_UNARY_OP_VEC(acos)
IMPLEMENT_UNARY_OP_VEC(asin)
IMPLEMENT_UNARY_OP_VEC(atan)
@ -285,6 +290,10 @@ IMPLEMENT_UNARY_OP_VEC(tanh)
IMPLEMENT_UNARY_OP_VEC_CUDA(lgamma)
DEFINE_DISPATCH(abs_stub);
DEFINE_DISPATCH(angle_stub);
DEFINE_DISPATCH(real_stub);
DEFINE_DISPATCH(imag_stub);
DEFINE_DISPATCH(conj_stub);
DEFINE_DISPATCH(acos_stub);
DEFINE_DISPATCH(asin_stub);
DEFINE_DISPATCH(atan_stub);

View File

@ -13,6 +13,10 @@ using unary_fn = void(*)(TensorIterator&);
using unary_fn_with_scalar = void(*)(TensorIterator&, Scalar a);
DECLARE_DISPATCH(unary_fn, abs_stub);
DECLARE_DISPATCH(unary_fn, angle_stub);
DECLARE_DISPATCH(unary_fn, real_stub);
DECLARE_DISPATCH(unary_fn, imag_stub);
DECLARE_DISPATCH(unary_fn, conj_stub);
DECLARE_DISPATCH(unary_fn, acos_stub);
DECLARE_DISPATCH(unary_fn, asin_stub);
DECLARE_DISPATCH(unary_fn, atan_stub);

View File

@ -14,14 +14,13 @@ namespace {
using namespace vec256;
void add_kernel(TensorIterator& iter, Scalar alpha_scalar) {
if (iter.dtype() == ScalarType::Bool || isComplexType(iter.dtype())) {
AT_DISPATCH_COMPLEX_TYPES_AND(kBool, iter.dtype(), "add_cpu/sub_cpu", [&]() {
if (iter.dtype() == ScalarType::Bool) {
using scalar_t = bool;
auto alpha = alpha_scalar.to<scalar_t>();
cpu_kernel(iter,
[=](scalar_t a, scalar_t b) -> scalar_t { return a + alpha * b; });
});
} else {
AT_DISPATCH_ALL_TYPES_AND(kBFloat16, iter.dtype(), "add_cpu/sub_cpu", [&]() {
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, iter.dtype(), "add_cpu/sub_cpu", [&]() {
auto alpha = alpha_scalar.to<scalar_t>();
auto alpha_vec = Vec256<scalar_t>(alpha);
cpu_kernel_vec(iter,
@ -51,13 +50,8 @@ void sub_kernel(TensorIterator& iter, Scalar alpha_scalar) {
void mul_kernel(TensorIterator& iter) {
if (iter.dtype() == ScalarType::Bool) {
cpu_kernel(iter, [=](bool a, bool b) -> bool { return a && b; });
} else if (isComplexType(iter.dtype())) {
AT_DISPATCH_COMPLEX_TYPES(iter.dtype(), "mul_cpu", [&]() {
cpu_kernel(iter,
[=](scalar_t a, scalar_t b) -> scalar_t { return a * b; });
});
} else {
AT_DISPATCH_ALL_TYPES_AND(kBFloat16, iter.dtype(), "mul_cpu", [&]() {
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, iter.dtype(), "mul_cpu", [&]() {
cpu_kernel_vec(iter,
[=](scalar_t a, scalar_t b) -> scalar_t { return a * b; },
[=](Vec256<scalar_t> a, Vec256<scalar_t> b) {
@ -78,9 +72,12 @@ void div_kernel(TensorIterator& iter) {
});
} else if (isComplexType(iter.dtype())) {
AT_DISPATCH_COMPLEX_TYPES(iter.dtype(), "div_cpu", [&]() {
cpu_kernel(iter,
cpu_kernel_vec(iter,
[=](scalar_t a, scalar_t b) __ubsan_ignore_float_divide_by_zero__ -> scalar_t {
return a / b;
},
[=](Vec256<scalar_t> a, Vec256<scalar_t> b) {
return a / b;
});
});
} else {

View File

@ -42,13 +42,13 @@ namespace at { namespace native { namespace {
using namespace vec256;
template <typename traits, std::size_t... I>
template <typename traits, std::size_t... INDEX>
typename traits::ArgsTuple
dereference_impl(char* C10_RESTRICT data[], const int64_t* strides, int64_t i,
c10::guts::index_sequence<I...>) {
c10::guts::index_sequence<INDEX...>) {
return std::make_tuple(
*(typename traits::template arg<I>::type*)
(data[I] + i * strides[I])...);
*(typename traits::template arg<INDEX>::type*)
(data[INDEX] + i * strides[INDEX])...);
}
template <typename traits>
@ -58,19 +58,19 @@ dereference(char* C10_RESTRICT data[], const int64_t* strides, int64_t i) {
return dereference_impl<traits>(data, strides, i, Indices{});
}
template <typename traits, std::size_t... I>
template <typename traits, std::size_t... INDEX>
typename traits::ArgsTuple
dereference_vec_impl(char* C10_RESTRICT data[],
const typename traits::result_type& opt_scalar,
size_t S,
int64_t i,
c10::guts::index_sequence<I...>) {
c10::guts::index_sequence<INDEX...>) {
using Vec = typename traits::result_type;
using scalar_t = typename Vec::value_type;
return std::make_tuple(
S == I + 1 ?
S == INDEX + 1 ?
opt_scalar :
Vec::loadu(data[I] + i * sizeof(scalar_t))...);
Vec::loadu(data[INDEX] + i * sizeof(scalar_t))...);
}
template <typename traits>
@ -171,15 +171,15 @@ static inline void unroll_contiguous_scalar_checks(
cb(0);
}
template <typename traits, typename cb_t, size_t I0, size_t ...I>
template <typename traits, typename cb_t, size_t INDEX0, size_t ...INDEX>
static inline void unroll_contiguous_scalar_checks(
const int64_t* strides,
c10::guts::index_sequence<I0, I...>,
c10::guts::index_sequence<INDEX0, INDEX...>,
const cb_t& cb) {
if (is_contiguous_scalar<traits, I0 + 1>(strides)) {
cb(I0 + 1);
if (is_contiguous_scalar<traits, INDEX0 + 1>(strides)) {
cb(INDEX0 + 1);
} else {
unroll_contiguous_scalar_checks<traits>(strides, c10::guts::index_sequence<I...>{}, cb);
unroll_contiguous_scalar_checks<traits>(strides, c10::guts::index_sequence<INDEX...>{}, cb);
}
}

View File

@ -16,6 +16,7 @@
#include <ATen/native/UnaryOps.h>
#include <ATen/native/cpu/Loops.h>
#include <ATen/native/cpu/zmath.h>
#include <ATen/native/Math.h>
@ -29,10 +30,10 @@ namespace {
using namespace vec256;
static void sigmoid_kernel(TensorIterator& iter) {
AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "sigmoid_cpu", [&]() {
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.dtype(), "sigmoid_cpu", [&]() {
cpu_kernel_vec(
iter,
[=](scalar_t a) -> scalar_t { return (1 / (1 + std::exp((-a)))); },
[=](scalar_t a) -> scalar_t { return ((scalar_t)(1) / ((scalar_t)(1) + std::exp((-a)))); },
[=](Vec256<scalar_t> a) {
a = Vec256<scalar_t>((scalar_t)(0)) - a;
a = a.exp();
@ -53,7 +54,7 @@ uint8_t abs_impl(uint8_t v) {
}
static void abs_kernel(TensorIterator& iter) {
AT_DISPATCH_ALL_TYPES(iter.dtype(), "abs_cpu", [&]() {
AT_DISPATCH_ALL_TYPES_AND_COMPLEX(iter.dtype(), "abs_cpu", [&]() {
cpu_kernel_vec(
iter,
[=](scalar_t a) -> scalar_t { return abs_impl(a); },
@ -61,6 +62,42 @@ static void abs_kernel(TensorIterator& iter) {
});
}
static void angle_kernel(TensorIterator& iter) {
AT_DISPATCH_ALL_TYPES_AND_COMPLEX(iter.dtype(), "angle_cpu", [&]() {
cpu_kernel_vec(
iter,
[=](scalar_t a) -> scalar_t { return angle_impl(a); },
[=](Vec256<scalar_t> a) { return a.angle(); });
});
}
static void real_kernel(TensorIterator& iter) {
AT_DISPATCH_ALL_TYPES_AND_COMPLEX(iter.dtype(), "real_cpu", [&]() {
cpu_kernel_vec(
iter,
[=](scalar_t a) -> scalar_t { return real_impl(a); },
[=](Vec256<scalar_t> a) { return a.real(); });
});
}
static void imag_kernel(TensorIterator& iter) {
AT_DISPATCH_ALL_TYPES_AND_COMPLEX(iter.dtype(), "imag_cpu", [&]() {
cpu_kernel_vec(
iter,
[=](scalar_t a) -> scalar_t { return imag_impl(a); },
[=](Vec256<scalar_t> a) { return a.imag(); });
});
}
static void conj_kernel(TensorIterator& iter) {
AT_DISPATCH_ALL_TYPES_AND_COMPLEX(iter.dtype(), "conj_cpu", [&]() {
cpu_kernel_vec(
iter,
[=](scalar_t a) -> scalar_t { return conj_impl(a); },
[=](Vec256<scalar_t> a) { return a.conj(); });
});
}
static void bitwise_not_kernel(TensorIterator& iter) {
if (iter.dtype() == ScalarType::Bool) {
// Boolean type does not work with ~ (bitwise NOT) in C++. bitwise_not wraps this operation for both Boolean and
@ -100,7 +137,7 @@ static void logical_not_kernel(TensorIterator& iter) {
}
static void reciprocal_kernel(TensorIterator& iter) {
AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "reciprocal_cpu", [&]() {
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.dtype(), "reciprocal_cpu", [&]() {
cpu_kernel_vec(
iter,
[=](scalar_t a) -> scalar_t { return decltype(a)(1.0) / a; },
@ -109,7 +146,7 @@ static void reciprocal_kernel(TensorIterator& iter) {
}
static void neg_kernel(TensorIterator& iter) {
AT_DISPATCH_ALL_TYPES(iter.dtype(), "neg_cpu", [&]() {
AT_DISPATCH_ALL_TYPES_AND_COMPLEX(iter.dtype(), "neg_cpu", [&]() {
cpu_kernel_vec(
iter,
[=](scalar_t a) -> scalar_t { return -a; },
@ -141,7 +178,7 @@ static void sign_kernel(TensorIterator& iter){
}
static void sinh_kernel(TensorIterator& iter) {
AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "sinh_cpu", [&]() {
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.dtype(), "sinh_cpu", [&]() {
cpu_kernel(
iter,
[=](scalar_t a) -> scalar_t { return std::sinh(a); });
@ -149,7 +186,7 @@ static void sinh_kernel(TensorIterator& iter) {
}
static void cosh_kernel(TensorIterator& iter) {
AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "cosh_cpu", [&]() {
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.dtype(), "cosh_cpu", [&]() {
cpu_kernel(
iter,
[=](scalar_t a) -> scalar_t { return std::cosh(a); });
@ -181,33 +218,36 @@ static void polygamma_kernel(TensorIterator& iter, int64_t n) {
}
static void clamp_kernel(TensorIterator& iter, Scalar min_scalar, Scalar max_scalar) {
AT_DISPATCH_ALL_TYPES(iter.dtype(), "clamp_cpu", [&]() {
AT_DISPATCH_ALL_TYPES_AND_COMPLEX(iter.dtype(), "clamp_cpu", [&]() {
ztype<scalar_t>::value_t (*zabs_)(scalar_t) = zabs;
auto min = min_scalar.to<scalar_t>();
auto max = max_scalar.to<scalar_t>();
auto min_vec = Vec256<scalar_t>(min);
auto max_vec = Vec256<scalar_t>(max);
cpu_kernel_vec(iter,
[=](scalar_t a) -> scalar_t { return a < min ? min : (a > max ? max : a); },
[=](scalar_t a) -> scalar_t { return zabs_(a) < zabs_(min) ? min : (zabs_(a) > zabs_(max) ? max : a); },
[=](Vec256<scalar_t> a) { return vec256::clamp(a, min_vec, max_vec); });
});
}
static void clamp_max_kernel(TensorIterator& iter, Scalar max_scalar) {
AT_DISPATCH_ALL_TYPES(iter.dtype(), "clamp_max_cpu", [&]() {
AT_DISPATCH_ALL_TYPES_AND_COMPLEX(iter.dtype(), "clamp_max_cpu", [&]() {
ztype<scalar_t>::value_t (*zabs_)(scalar_t) = zabs;
auto max = max_scalar.to<scalar_t>();
auto max_vec = Vec256<scalar_t>(max);
cpu_kernel_vec(iter,
[=](scalar_t a) -> scalar_t { return a > max ? max : a; },
[=](scalar_t a) -> scalar_t { return zabs_(a) > zabs_(max) ? max : a; },
[=](Vec256<scalar_t> a) { return vec256::clamp_max(a, max_vec); });
});
}
static void clamp_min_kernel(TensorIterator& iter, Scalar min_scalar) {
AT_DISPATCH_ALL_TYPES(iter.dtype(), "clamp_min_cpu", [&]() {
AT_DISPATCH_ALL_TYPES_AND_COMPLEX(iter.dtype(), "clamp_min_cpu", [&]() {
ztype<scalar_t>::value_t (*zabs_)(scalar_t) = zabs;
auto min = min_scalar.to<scalar_t>();
auto min_vec = Vec256<scalar_t>(min);
cpu_kernel_vec(iter,
[=](scalar_t a) -> scalar_t { return a < min ? min : a; },
[=](scalar_t a) -> scalar_t { return zabs_(a) < zabs_(min) ? min : a; },
[=](Vec256<scalar_t> a) { return vec256::clamp_min(a, min_vec); });
});
}
@ -272,7 +312,7 @@ void bernoulli_mkl_kernel(Tensor &self, const double p, Generator* gen) {
#endif
static void rsqrt_kernel(TensorIterator& iter) {
AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "rsqrt_cpu", [&] {
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.dtype(), "rsqrt_cpu", [&] {
cpu_kernel_vec(
iter,
[=](scalar_t a) -> scalar_t {
@ -315,12 +355,47 @@ static void rsqrt_kernel(TensorIterator& iter) {
} \
REGISTER_DISPATCH(op##_stub, &op##_kernel)
#define IMPLEMENT_COMPLEX_KERNEL(dispatchtypes, op) \
static void op##_kernel(TensorIterator& iter) { \
TORCH_INTERNAL_ASSERT(iter.ntensors() == 2); \
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.dtype(), op##_vml_cpu, [&]() {\
iter.serial_for_each( \
[&](char** data_, const int64_t* strides, int64_t n) { \
scalar_t* out_data = reinterpret_cast<scalar_t*>(data_[0]); \
scalar_t* in_data = reinterpret_cast<scalar_t*>(data_[1]); \
int64_t out_stride = strides[0] / sizeof(scalar_t); \
int64_t in_stride = strides[1] / sizeof(scalar_t); \
if (out_stride == 1 && in_stride == 1) { \
vml::v##op(out_data, in_data, n); \
} else { \
static constexpr int64_t WIDTH = 131072 / sizeof(scalar_t); \
for (int64_t i = 0; i < n; i += WIDTH) { \
scalar_t buffer[WIDTH]; \
int64_t width = WIDTH; \
width = std::min(width, n - i); \
for (int64_t j = 0; j < width; j++) \
buffer[j] = in_data[in_stride * (i + j)]; \
vml::v##op(buffer, buffer, width); \
for (int64_t j = 0; j < width; j++) \
out_data[out_stride * (i + j)] = buffer[j]; \
} \
} \
}, \
{0, iter.numel()}); \
}); \
} \
REGISTER_DISPATCH(op##_stub, &op##_kernel)
} // anonymous namespace
REGISTER_DISPATCH(rsqrt_stub, &rsqrt_kernel);
REGISTER_DISPATCH(sigmoid_stub, &sigmoid_kernel);
REGISTER_DISPATCH(bernoulli_mkl_stub, &bernoulli_mkl_kernel);
REGISTER_DISPATCH(abs_stub, &abs_kernel);
REGISTER_DISPATCH(angle_stub, &angle_kernel);
REGISTER_DISPATCH(real_stub, &real_kernel);
REGISTER_DISPATCH(imag_stub, &imag_kernel);
REGISTER_DISPATCH(conj_stub, &conj_kernel);
REGISTER_DISPATCH(bitwise_not_stub, &bitwise_not_kernel);
REGISTER_DISPATCH(logical_not_stub, &logical_not_kernel);
REGISTER_DISPATCH(frac_stub, &frac_kernel);
@ -338,29 +413,29 @@ REGISTER_DISPATCH(clamp_min_stub, &clamp_min_kernel);
// IMPLEMENT_FLOAT_KERNEL(ALL, abs)
IMPLEMENT_FLOAT_KERNEL(FLOATING, acos)
IMPLEMENT_FLOAT_KERNEL(FLOATING, asin)
IMPLEMENT_FLOAT_KERNEL(FLOATING, atan)
IMPLEMENT_FLOAT_KERNEL(FLOATING, ceil)
IMPLEMENT_FLOAT_KERNEL(FLOATING, cos)
IMPLEMENT_COMPLEX_KERNEL(FLOATING, acos)
IMPLEMENT_COMPLEX_KERNEL(FLOATING, asin)
IMPLEMENT_COMPLEX_KERNEL(FLOATING, atan)
IMPLEMENT_COMPLEX_KERNEL(FLOATING, ceil)
IMPLEMENT_COMPLEX_KERNEL(FLOATING, cos)
// IMPLEMENT_FLOAT_KERNEL(FLOATING, cosh)
IMPLEMENT_FLOAT_KERNEL(FLOATING, erf)
IMPLEMENT_FLOAT_KERNEL(FLOATING, erfc)
IMPLEMENT_FLOAT_KERNEL(FLOATING, erfinv)
IMPLEMENT_FLOAT_KERNEL(FLOATING, exp)
IMPLEMENT_COMPLEX_KERNEL(FLOATING, exp)
IMPLEMENT_FLOAT_KERNEL(FLOATING, expm1)
IMPLEMENT_FLOAT_KERNEL(FLOATING, floor)
IMPLEMENT_FLOAT_KERNEL(FLOATING, log)
IMPLEMENT_FLOAT_KERNEL(FLOATING, log10)
IMPLEMENT_COMPLEX_KERNEL(FLOATING, floor)
IMPLEMENT_COMPLEX_KERNEL(FLOATING, log)
IMPLEMENT_COMPLEX_KERNEL(FLOATING, log10)
IMPLEMENT_FLOAT_KERNEL(FLOATING, log1p)
IMPLEMENT_FLOAT_KERNEL(FLOATING, log2)
IMPLEMENT_FLOAT_KERNEL(FLOATING, round)
IMPLEMENT_FLOAT_KERNEL(FLOATING, sin)
IMPLEMENT_COMPLEX_KERNEL(FLOATING, round)
IMPLEMENT_COMPLEX_KERNEL(FLOATING, sin)
// IMPLEMENT_FLOAT_KERNEL(FLOATING, sinh)
IMPLEMENT_FLOAT_KERNEL(FLOATING, sqrt)
IMPLEMENT_FLOAT_KERNEL(FLOATING, tan)
IMPLEMENT_FLOAT_KERNEL(FLOATING, tanh)
IMPLEMENT_FLOAT_KERNEL(FLOATING, trunc)
IMPLEMENT_COMPLEX_KERNEL(FLOATING, sqrt)
IMPLEMENT_COMPLEX_KERNEL(FLOATING, tan)
IMPLEMENT_COMPLEX_KERNEL(FLOATING, tanh)
IMPLEMENT_COMPLEX_KERNEL(FLOATING, trunc)
IMPLEMENT_FLOAT_KERNEL(FLOATING, lgamma)
}} // namespace at::native

View File

@ -0,0 +1,160 @@
#pragma once
// Complex number math operations that act as no-ops for other dtypes.
#include <complex.h>
namespace at { namespace native {
namespace {
template <typename TYPE>
struct ztype {
using value_t = TYPE;
};
template <>
struct ztype<std::complex<double>> {
using value_t = double;
};
template <>
struct ztype<std::complex<float>> {
using value_t = float;
};
template <typename SCALAR_TYPE, typename VALUE_TYPE>
inline VALUE_TYPE zabs (SCALAR_TYPE z) {
return z;
}
template<>
inline float zabs <std::complex<float>> (std::complex<float> z) {
return std::abs(z);
}
template<>
inline double zabs <std::complex<double>> (std::complex<double> z) {
return std::abs(z);
}
template <typename TYPE>
inline TYPE angle_impl (TYPE z) {
return 0;
}
template<>
inline std::complex<float> angle_impl <std::complex<float>> (std::complex<float> z) {
return std::complex<float>(std::arg(z), 0.0);
}
template<>
inline std::complex<double> angle_impl <std::complex<double>> (std::complex<double> z) {
return std::complex<double>(std::arg(z), 0.0);
}
template <typename TYPE>
inline TYPE real_impl (TYPE z) {
return z; //No-Op
}
template<>
inline std::complex<float> real_impl <std::complex<float>> (std::complex<float> z) {
return std::complex<float>(std::real(z), 0.0);
}
template<>
inline std::complex<double> real_impl <std::complex<double>> (std::complex<double> z) {
return std::complex<double>(std::real(z), 0.0);
}
template <typename TYPE>
inline TYPE imag_impl (TYPE z) {
return 0;
}
template<>
inline std::complex<float> imag_impl <std::complex<float>> (std::complex<float> z) {
return std::complex<float>(std::imag(z), 0.0);
}
template<>
inline std::complex<double> imag_impl <std::complex<double>> (std::complex<double> z) {
return std::complex<double>(std::imag(z), 0.0);
}
template <typename TYPE>
inline TYPE conj_impl (TYPE z) {
return z; //No-Op
}
template<>
inline std::complex<float> conj_impl <std::complex<float>> (std::complex<float> z) {
return std::complex<float>(std::real(z), -std::imag(z));
}
template<>
inline std::complex<double> conj_impl <std::complex<double>> (std::complex<double> z) {
return std::complex<double>(std::real(z), -std::imag(z));
}
template <typename TYPE>
inline TYPE ceil_impl (TYPE z) {
return std::ceil(z);
}
template <>
inline std::complex<float> ceil_impl (std::complex<float> z) {
return std::complex<float>(std::ceil(std::real(z)), std::ceil(std::imag(z)));
}
template <>
inline std::complex<double> ceil_impl (std::complex<double> z) {
return std::complex<double>(std::ceil(std::real(z)), std::ceil(std::imag(z)));
}
template <typename TYPE>
inline TYPE floor_impl (TYPE z) {
return std::floor(z);
}
template <>
inline std::complex<float> floor_impl (std::complex<float> z) {
return std::complex<float>(std::floor(std::real(z)), std::floor(std::imag(z)));
}
template <>
inline std::complex<double> floor_impl (std::complex<double> z) {
return std::complex<double>(std::floor(std::real(z)), std::floor(std::imag(z)));
}
template <typename TYPE>
inline TYPE round_impl (TYPE z) {
return std::nearbyint(z);
}
template <>
inline std::complex<float> round_impl (std::complex<float> z) {
return std::complex<float>(std::nearbyint(std::real(z)), std::nearbyint(std::imag(z)));
}
template <>
inline std::complex<double> round_impl (std::complex<double> z) {
return std::complex<double>(std::nearbyint(std::real(z)), std::nearbyint(std::imag(z)));
}
template <typename TYPE>
inline TYPE trunc_impl (TYPE z) {
return std::trunc(z);
}
template <>
inline std::complex<float> trunc_impl (std::complex<float> z) {
return std::complex<float>(std::trunc(std::real(z)), std::trunc(std::imag(z)));
}
template <>
inline std::complex<double> trunc_impl (std::complex<double> z) {
return std::complex<double>(std::trunc(std::real(z)), std::trunc(std::imag(z)));
}
} // end namespace
}} //end at::native
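
The specializations above apply the real-valued operation to each component of a complex number (ceil, floor, round, trunc), reduce a complex value to its magnitude (zabs), or pass real inputs through unchanged. A minimal Python sketch of the same idea, purely for illustration (plain Python complex numbers, not the ATen templates):

    import math

    def zabs(z):
        # Magnitude for complex inputs, identity for real inputs (mirrors the zabs template).
        return abs(z) if isinstance(z, complex) else z

    def floor_impl(z):
        # Apply floor to the real and imaginary parts independently for complex inputs.
        if isinstance(z, complex):
            return complex(math.floor(z.real), math.floor(z.imag))
        return math.floor(z)

    print(zabs(3 + 4j))            # 5.0
    print(floor_impl(1.7 - 2.3j))  # (1-3j)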

View File

@ -196,6 +196,54 @@
CPU: _abs_out_cpu
CUDA: _abs_out_cuda
- func: angle(Tensor self) -> Tensor
variants: function, method
supports_named_tensor: True
named_guard: False
- func: angle.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
named_guard: False
supports_named_tensor: True
dispatch:
CPU: _angle_out_cpu
CUDA: _abs_out_cuda
- func: real(Tensor self) -> Tensor
variants: function, method
named_guard: False
supports_named_tensor: True
- func: real.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
named_guard: False
supports_named_tensor: True
dispatch:
CPU: _real_out_cpu
CUDA: _abs_out_cuda
- func: imag(Tensor self) -> Tensor
variants: function, method
named_guard: False
supports_named_tensor: True
- func: imag.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
named_guard: False
supports_named_tensor: True
dispatch:
CPU: _imag_out_cpu
CUDA: _abs_out_cuda
- func: conj(Tensor self) -> Tensor
variants: function, method
named_guard: False
supports_named_tensor: True
- func: conj.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
named_guard: False
supports_named_tensor: True
dispatch:
CPU: _conj_out_cpu
CUDA: _abs_out_cuda
- func: acos(Tensor self) -> Tensor
use_c10_dispatcher: full
supports_named_tensor: True
@ -3556,12 +3604,6 @@
SparseCUDA: copy_sparse_
requires_tensor: True
- func: numel(Tensor self) -> int
use_c10_dispatcher: full
variants: function, method
device_guard: False
supports_named_tensor: True
- func: unbind.int(Tensor(a) self, int dim=0) -> Tensor(a)[]
use_c10_dispatcher: unboxed_only
variants: function, method
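
The new schema entries in the block above declare angle, real, imag and conj as native functions with both function and method variants, dispatching to the new CPU kernels. A rough sketch of how they can be exercised from Python once bound (assuming a build where complex tensors are constructible; the exact return dtypes of real/imag on complex inputs may differ across versions):

    import torch

    z = torch.tensor([1 + 1j, -2 + 0j])     # complex input
    print(torch.angle(z))                   # element-wise phase
    print(torch.real(z), torch.imag(z))     # component accessors
    print(z.conj())                         # method variant of conj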

View File

@ -69,6 +69,10 @@ inline Tensor from_blob(
return from_blob(data, sizes, detail::defaultStrides(sizes), [](void*) {}, options);
}
inline int64_t numel(const Tensor& tensor) {
return tensor.numel();
}
// function definitions are all static inline because
// they are one-line statically dispatched functions that
// invoke the actual dynamic dispatch on the correct argument

View File

@ -211,6 +211,10 @@ class CAFFE2_API Tensor {
return impl_->numel() * impl_->itemsize();
}
int64_t numel() const {
return impl_->numel();
}
// Length of one array element in bytes. This is the traditional
// Numpy naming.
size_t itemsize() const {

View File

@ -26,10 +26,12 @@ list(APPEND ATen_CPU_TEST_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/weakref_test.cpp
${CMAKE_CURRENT_SOURCE_DIR}/quantized_test.cpp
${CMAKE_CURRENT_SOURCE_DIR}/extension_backend_test.cpp
${CMAKE_CURRENT_SOURCE_DIR}/boxed_fallback_test.cpp
${CMAKE_CURRENT_SOURCE_DIR}/xla_tensor_test.cpp
${CMAKE_CURRENT_SOURCE_DIR}/tensor_iterator_test.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpu_generator_test.cpp
${CMAKE_CURRENT_SOURCE_DIR}/pow_test.cpp)
${CMAKE_CURRENT_SOURCE_DIR}/pow_test.cpp
${CMAKE_CURRENT_SOURCE_DIR}/variant_test.cpp)
list(APPEND ATen_CUDA_TEST_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/cuda_integer_divider_test.cu

View File

@ -0,0 +1,146 @@
#include <gtest/gtest.h>
#include <c10/core/TensorTypeId.h>
#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <ATen/core/op_registration/op_registration.h>
#include <ATen/core/ATenDispatch.h>
#include <torch/csrc/jit/operator.h>
using namespace at;
// This test file gives an example of a simple use case for "wrapper"
// and "mode" style tensor type ids. In both cases, the implementation
// of the wrapper/mode simply passes through the call to underlying JIT
// implementation (so the wrapper/mode doesn't actually do anything),
// but this could be used as a starting point to do more interesting things.
// TODO: This to be rewritten when bwasti sets up direct access to
// JIT data structures
std::shared_ptr<torch::jit::Operator> getOperator(const char* schema_str) {
auto schema = torch::jit::parseSchema(schema_str);
auto s = Symbol::fromQualString(schema.name());
auto operators = torch::jit::getAllOperatorsFor(s);
// Find the exact match
std::shared_ptr<torch::jit::Operator> op;
for (const auto& candidate_op : operators) {
auto candidate_schema = candidate_op->schema();
// NB: this is a VERY slow equality test
if (candidate_schema == schema) {
op = candidate_op;
break;
}
}
TORCH_INTERNAL_ASSERT(op);
return op;
}
// Global counter for ease of testing
static int64_t override_call_count = 0;
// Mode implementation
void generic_mode_fallback(const char* schema_str, torch::jit::Stack* stack) {
override_call_count++;
auto operation = getOperator(schema_str)->getOperation();
c10::impl::ExcludeTensorTypeIdGuard guard(TensorTypeId::TESTING_ONLY_GenericModeTensorId);
auto offset = operation(*stack);
TORCH_INTERNAL_ASSERT(offset == 0);
}
// Wrapper implementation
struct GenericWrapperTensorImpl : public c10::TensorImpl {
explicit GenericWrapperTensorImpl(at::Tensor rep)
: TensorImpl(
c10::TensorTypeSet(c10::TensorTypeId::TESTING_ONLY_GenericWrapperTensorId),
rep.dtype(),
rep.device()
// TODO: propagate size!
)
, rep_(std::move(rep)) {}
at::Tensor rep_;
};
void generic_wrapper_fallback(const char* schema_str, torch::jit::Stack* stack) {
override_call_count++;
auto op = getOperator(schema_str);
auto operation = op->getOperation();
const auto& schema = op->schema();
auto num_arguments = schema.arguments().size();
auto num_returns = schema.returns().size();
// Unwrap all arguments
auto args = torch::jit::pop(*stack, num_arguments);
for (size_t i = 0; i < num_arguments; i++) {
// TODO: Handle tensor list
if (args[i].isTensor()) {
auto* impl = args[i].unsafeToTensorImpl();
if (impl->type_set().has(TensorTypeId::TESTING_ONLY_GenericWrapperTensorId)) {
auto* wrapper = static_cast<GenericWrapperTensorImpl*>(impl);
torch::jit::push(*stack, wrapper->rep_); // no move!
} else {
torch::jit::push(*stack, std::move(args[i]));
}
} else {
torch::jit::push(*stack, std::move(args[i]));
}
}
auto offset = operation(*stack);
// Rewrap outputs
auto rets = torch::jit::pop(*stack, num_returns);
for (size_t i = 0; i < num_returns; i++) {
// TODO: Handle tensor list
if (args[i].isTensor()) {
torch::jit::push(*stack, at::detail::make_tensor<GenericWrapperTensorImpl>(std::move(std::move(args[i]).toTensor())) ); // yes move!
} else {
torch::jit::push(*stack, std::move(args[i]));
}
}
TORCH_INTERNAL_ASSERT(offset == 0);
}
// As the current API does not support unregistering fallback boxed ops,
// settings of these values are PROCESS global. Therefore the environment
// here.
class Environment : public ::testing::Environment {
public:
virtual ~Environment() {}
void SetUp() override {
globalATenDispatch().registerFallbackBoxedOp(TensorTypeId::TESTING_ONLY_GenericWrapperTensorId, &generic_wrapper_fallback);
globalATenDispatch().registerFallbackBoxedOp(TensorTypeId::TESTING_ONLY_GenericModeTensorId, &generic_mode_fallback);
}
void TearDown() override {}
};
::testing::Environment* const env =
::testing::AddGlobalTestEnvironment(new Environment);
// There's a case to be made that a more comprehensive test suite would be able
// to capture many more edge cases. This test suite is just to show that
// basic functionality works.
TEST(BoxedFallbackTest, TestBoxedFallbackWithMode) {
c10::impl::IncludeTensorTypeIdGuard guard(TensorTypeId::TESTING_ONLY_GenericModeTensorId);
override_call_count = 0;
Tensor a = ones({5, 5}, kDouble);
Tensor b = batch_norm(a, {}, {}, {}, {}, true, 0.1, 1e-05, false);
ASSERT_EQ(override_call_count, 2);
}
TEST(BoxedFallbackTest, TestBoxedFallbackWithWrapper) {
override_call_count = 0;
Tensor a = at::detail::make_tensor<GenericWrapperTensorImpl>(ones({5, 5}, kDouble));
Tensor b = batch_norm(a, {}, {}, {}, {}, true, 0.1, 1e-05, false);
ASSERT_EQ(override_call_count, 1);
}
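
The wrapper fallback above pops the operator's arguments off the stack, swaps every wrapped tensor for its underlying rep_, runs the real operation, and rewraps tensor outputs. A toy Python model of that unwrap/call/rewrap flow, for illustration only (it does not use the C++ dispatcher API):

    class Wrapper:
        """Toy stand-in for GenericWrapperTensorImpl: holds an underlying value."""
        def __init__(self, rep):
            self.rep = rep

    def wrapper_fallback(op, *args):
        # Unwrap any wrapped arguments, run the underlying op, rewrap the result.
        unwrapped = [a.rep if isinstance(a, Wrapper) else a for a in args]
        return Wrapper(op(*unwrapped))

    result = wrapper_fallback(lambda x, y: x + y, Wrapper(2), 3)
    print(result.rep)  # 5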

View File

@ -4,6 +4,10 @@
#include <ATen/NativeFunctions.h>
#include <ATen/core/op_registration/op_registration.h>
#include <ATen/core/ATenDispatch.h>
#include <torch/csrc/jit/operator.h>
using namespace at;
static int test_int;

View File

@ -0,0 +1,67 @@
#include <gtest/gtest.h>
#include <c10/util/variant.h>
namespace testns {
namespace enumtype {
// NOTE: We need to provide the default constructor for each struct,
// otherwise Clang 3.8 would complain:
// ```
// error: default initialization of an object of const type 'const enumtype::Enum1'
// without a user-provided default constructor
// ```
struct Enum1 { Enum1() {}; };
struct Enum2 { Enum2() {}; };
struct Enum3 { Enum3() {}; };
} // namespace enumtype
struct enum_name {
std::string operator()(enumtype::Enum1& v) const {
return "Enum1";
}
std::string operator()(enumtype::Enum2& v) const {
return "Enum2";
}
std::string operator()(enumtype::Enum3& v) const {
return "Enum3";
}
};
const enumtype::Enum1 kEnum1;
const enumtype::Enum2 kEnum2;
const enumtype::Enum3 kEnum3;
} // namespace testns
std::string func(c10::variant<testns::enumtype::Enum1, testns::enumtype::Enum2, testns::enumtype::Enum3> v) {
if (c10::get_if<testns::enumtype::Enum1>(&v)) {
return "Enum1";
} else if (c10::get_if<testns::enumtype::Enum2>(&v)) {
return "Enum2";
} else if (c10::get_if<testns::enumtype::Enum3>(&v)) {
return "Enum3";
} else {
return "Unsupported enum";
}
}
TEST(VariantTest, Basic) {
ASSERT_EQ(func(testns::kEnum1), "Enum1");
ASSERT_EQ(func(testns::kEnum2), "Enum2");
ASSERT_EQ(func(testns::kEnum3), "Enum3");
c10::variant<testns::enumtype::Enum1, testns::enumtype::Enum2, testns::enumtype::Enum3> v;
{
v = testns::kEnum1;
ASSERT_EQ(c10::visit(testns::enum_name{}, v), "Enum1");
}
{
v = testns::kEnum2;
ASSERT_EQ(c10::visit(testns::enum_name{}, v), "Enum2");
}
{
v = testns::kEnum3;
ASSERT_EQ(c10::visit(testns::enum_name{}, v), "Enum3");
}
}

View File

@ -298,29 +298,20 @@ static void THTensor_(addmmImpl)(THTensor *r_, THTensor *t, THTensor *m1, THTens
int64_t ldm1_ = (transpose_m1 == 'n' ? m1_->stride((transpose_r == 'n' ? 1 : 0)) : m1_->stride((transpose_r == 'n' ? 0 : 1)));
int64_t ldm2_ = (transpose_m2 == 'n' ? m2_->stride((transpose_r == 'n' ? 1 : 0)) : m2_->stride((transpose_r == 'n' ? 0 : 1)));
// Don't go through GEMM if result is empty matrix, since this is not
// supported by BLAS.
if (m != 0 && n != 0) {
if (k == 0) {
THTensor_(mul)(r__, r__, beta);
} else {
/* do the operation */
THBlas_(gemm)(transpose_m1,
transpose_m2,
m,
n,
k,
alpha,
m1_->data<scalar_t>(),
ldm1_,
m2_->data<scalar_t>(),
ldm2_,
beta,
r__->data<scalar_t>(),
ldr__);
}
}
/* do the operation */
THBlas_(gemm)(transpose_m1,
transpose_m2,
m,
n,
k,
alpha,
m1_->data<scalar_t>(),
ldm1_,
m2_->data<scalar_t>(),
ldm2_,
beta,
r__->data<scalar_t>(),
ldr__);
/* free intermediate variables */
if(free_m1)
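
With the special casing removed, the degenerate shapes are left to the GEMM call: when k == 0 the product term contributes nothing and the result reduces to beta * t, and when m or n is 0 the result is simply empty. A quick Python check of the intended semantics, following the documented addmm formula out = beta * t + alpha * (m1 @ m2):

    import torch

    a = torch.randn(3, 0)   # k == 0
    b = torch.randn(0, 4)
    t = torch.ones(3, 4)

    print(torch.mm(a, b))                              # (3, 4) tensor of zeros
    print(torch.addmm(t, a, b, beta=2.0, alpha=1.0))   # 2 * t, since a @ b is all zeros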

View File

@ -305,20 +305,6 @@ static void THCTensor_(addmmImpl)(THCState *state, THCTensor *r_, THCTensor *t,
}
}
// Special casing for empty matrices
if (r_->size(0) == 0 || r_->size(1) == 0) {
// No multiplication needed for case of empty result matrix.
return;
} else if (m1->size(1) == 0) {
// k == 0
if (ScalarConvert<scalar_t, double>::to(beta) != 0.0) {
THCTensor_(mul)(state, r_, r_, beta);
} else {
THCTensor_(zero)(state, r_);
}
return;
}
/* r_ */
if(r_->stride(0) == 1 &&
r_->stride(1) != 0)

View File

@ -12,6 +12,7 @@ import torch
# needs to be imported after torch
import cpp_extension # noqa
import cpp_extension # noqa
import benchmark_utils
from collections import namedtuple

View File

@ -94,11 +94,17 @@ class TorchBenchmarkBase(object):
""" this is a globally unique name which can be used to
label a specific test
"""
# This is a list of attributes which will not be included
# in the test name.
skip_key_list = ['device']
test_name_str = []
for key in kargs:
value = kargs[key]
test_name_str.append(
key + str(value if type(value) != bool else int(value)))
('' if key in skip_key_list else key)
+ str(value if type(value) != bool else int(value)))
name = (self.module_name() + '_' +
'_'.join(test_name_str)).replace(" ", "")
return name
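
The change above builds the test name by concatenating argument names and values, dropping the key (but keeping the value) for attributes listed in skip_key_list, which currently contains only device. A small standalone sketch of the naming logic, with hypothetical inputs, not tied to the benchmark harness:

    def build_test_name(module_name, skip_key_list=('device',), **kargs):
        # Mirror of the naming scheme: key + value, booleans rendered as 0/1,
        # skipped keys contribute only their value.
        parts = []
        for key, value in kargs.items():
            rendered = str(value if type(value) != bool else int(value))
            parts.append(('' if key in skip_key_list else key) + rendered)
        return (module_name + '_' + '_'.join(parts)).replace(' ', '')

    print(build_test_name('add', M=8, N=16, K=32, device='cpu'))  # add_M8_N16_K32_cpu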

View File

@ -7,6 +7,7 @@ import numpy as np
import itertools
import random
import os
import bisect
"""Performance microbenchmarks's utils.
@ -14,6 +15,8 @@ import os
This module contains utilities for writing microbenchmark tests.
"""
# Here are the reserved keywords in the benchmark suite
_reserved_keywords = {"probs", "total_samples", "tags"}
def shape_to_string(shape):
return ', '.join([str(x) for x in shape])
@ -109,32 +112,159 @@ def cross_product_configs(**configs):
def config_list(**configs):
"""
Take specific inputs from users
For example, given
""" Generate configs based on the list of input shapes.
This function will take input shapes specified in a list from user. Besides
that, all other parameters will be cross producted first and each of the
generated list will be merged with the input shapes list.
Reserved Args:
attr_names(reserved): a list of names for input shapes.
attrs(reserved): a list of values for each input shape.
corss_product: a dictionary of attributes which will be
cross producted with the input shapes.
tags(reserved): a tag used to filter inputs.
Here is an example:
attrs = [
[1, 2],
[4, 5],
]
attr_names = ["M", "N"]
we will generate (({'M': 1}, {'N' : 2}),
({'M': 4}, {'N' : 5}))
],
attr_names = ['M', 'N'],
cross_product_configs={
'device': ['cpu', 'cuda'],
},
we will generate [[{'M': 1}, {'N' : 2}, {'device' : 'cpu'}],
[{'M': 1}, {'N' : 2}, {'device' : 'cuda'}],
[{'M': 4}, {'N' : 5}, {'device' : 'cpu'}],
[{'M': 4}, {'N' : 5}, {'device' : 'cuda'}]]
"""
generated_configs = []
if "attrs" not in configs:
reserved_names = ['attrs', 'attr_names', 'tags']
if any(attr not in configs for attr in reserved_names):
raise ValueError("Missing attrs in configs")
for inputs in configs["attrs"]:
tmp_result = [{configs["attr_names"][i] : input_value}
cross_configs = None
if 'cross_product_configs' in configs:
cross_configs = cross_product_configs(**configs['cross_product_configs'])
for inputs in configs['attrs']:
tmp_result = [{configs['attr_names'][i] : input_value}
for i, input_value in enumerate(inputs)]
# TODO(mingzhe0908):
# If multiple "tags" were provided, do they get concat?
# If a config has both ["short", "medium"], it should match
# both "short" and "medium" tag-filter?
tmp_result.append({"tags" : '_'.join(configs["tags"])})
generated_configs.append(tmp_result)
# If multiple 'tags' were provided, do they get concat?
# If a config has both ['short', 'medium'], it should match
# both 'short' and 'medium' tag-filter?
tmp_result.append({'tags' : '_'.join(configs['tags'])})
if cross_configs:
generated_configs += [tmp_result + list(config) for config in cross_configs]
else:
generated_configs.append(tmp_result)
return generated_configs
def attr_probs(**probs):
""" return the inputs in a dictionary
"""
return probs
class RandomSample(object):
def __init__(self, configs):
self.saved_cum_distribution = {}
self.configs = configs
def _distribution_func(self, key, weights):
""" this is a cumulative distribution function used for random sampling inputs
"""
if key in self.saved_cum_distribution:
return self.saved_cum_distribution[key]
total = sum(weights)
result = []
cumsum = 0
for w in weights:
cumsum += w
result.append(cumsum / total)
self.saved_cum_distribution[key] = result
return result
def _random_sample(self, key, values, weights):
""" given values and weights, this function randomly sample values based their weights
"""
# TODO(mingzhe09088): cache the results to avoid recalculation overhead
assert len(values) == len(weights)
_distribution_func_vals = self._distribution_func(key, weights)
x = random.random()
idx = bisect.bisect(_distribution_func_vals, x)
assert idx <= len(values), "Wrong index value is returned"
# Due to floating-point rounding, the last value in cumsum could be slightly
# smaller than 1, which can lead to (index == len(values)).
if idx == len(values):
idx -= 1
return values[idx]
def get_one_set_of_inputs(self):
tmp_attr_list = []
for key, values in self.configs.items():
if key in _reserved_keywords:
continue
value = self._random_sample(key, values, self.configs["probs"][str(key)])
tmp_results = {key : value}
tmp_attr_list.append(tmp_results)
return (tmp_attr_list)
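# Illustration (standalone sketch, not part of this module): _distribution_func
# builds a normalized cumulative distribution from the weights, and _random_sample
# locates a uniform draw in it with bisect, so heavier-weighted values are chosen
# proportionally more often. The same idea in a few lines:
import bisect as _bisect
import random as _random

def _weighted_choice_example(values, weights):
    # Normalized cumulative sums, e.g. [0.7, 0.2, 0.1] -> [0.7, 0.9, 1.0].
    total = sum(weights)
    cumulative, running = [], 0.0
    for w in weights:
        running += w
        cumulative.append(running / total)
    # A uniform draw in [0, 1) falls into the slot whose width equals the value's
    # weight; min() guards the rare rounding case where the draw lands past the end.
    idx = min(_bisect.bisect(cumulative, _random.random()), len(values) - 1)
    return values[idx]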
def random_sample_configs(**configs):
"""
This function randomly samples <total_samples> values from the given inputs based on
their weights.
Here is an example showing the expected inputs and outputs of this function:
M = [1, 2],
N = [4, 5],
K = [7, 8],
probs = attr_probs(
M = [0.7, 0.2],
N = [0.5, 0.2],
K = [0.6, 0.2],
),
total_samples=10,
this function will generate
[
[{'K': 7}, {'M': 1}, {'N': 4}],
[{'K': 7}, {'M': 2}, {'N': 5}],
[{'K': 8}, {'M': 2}, {'N': 4}],
...
]
Note:
probs is optional. Without it, every value is weighted equally (weight 1). The
weights don't have to be normalized probabilities; the implementation will
normalize them.
TODO (mingzhe09088):
(1): a lambda that accepts or rejects a config as a sample. For example: for matmul
with M, N, and K, this function could get rid of (M * N * K > 1e8) to filter out
very slow benchmarks.
(2): Make sure each sample is unique. If the number of samples is larger than the
total number of combinations, just return the cross product. Otherwise, if the number
of samples is close to the number of cross products, it is numerically safer to
generate the list of configs you don't want and remove them.
"""
if "probs" not in configs:
raise ValueError("probs is missing. Consider adding probs or"
"using other config functions")
configs_attrs_list = []
randomsample = RandomSample(configs)
for i in range(configs["total_samples"]):
tmp_attr_list = randomsample.get_one_set_of_inputs()
tmp_attr_list.append({"tags" : '_'.join(configs["tags"])})
configs_attrs_list.append(tmp_attr_list)
return configs_attrs_list
def op_list(**configs):
"""Generate a list of ops organized in a specific format.
It takes two parameters which are "attr_names" and "attr".

View File

@ -0,0 +1,40 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import operator_benchmark as op_bench
import torch
"""Microbenchmarks for element-wise Add operator. Supports both Caffe2/PyTorch."""
add_short_configs = op_bench.config_list(
attr_names=['M', 'N', 'K'],
attrs=[
[8, 16, 32],
[16, 16, 64],
[64, 64, 128],
],
cross_product_configs={
'device': ['cpu', 'cuda'],
'dtype': [torch.float, torch.float64],
},
tags=['short'],
)
class AddBenchmark(op_bench.TorchBenchmarkBase):
def init(self, M, N, K, device, dtype):
self.input_one = torch.rand(M, N, K, device=device, dtype=dtype, requires_grad=True)
self.input_two = torch.rand(M, N, K, device=device, dtype=dtype)
self.set_module_name('add')
def forward(self):
return torch.add(self.input_one, self.input_two)
op_bench.generate_pt_test(add_short_configs, AddBenchmark)
if __name__ == "__main__":
op_bench.benchmark_runner.main()

View File

@ -0,0 +1,36 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import operator_benchmark as op_bench
import torch
configs = op_bench.random_sample_configs(
M=[1, 2, 3, 4, 5, 6],
N=[7, 8, 9, 10, 11, 12],
K=[13, 14, 15, 16, 17, 18],
# probs saves the weights of each value
probs=op_bench.attr_probs(
M=[0.5, 0.2, 0.1, 0.05, 0.03, 0.1],
N=[0.1, 0.3, 0.4, 0.02, 0.03, 0.04],
K=[0.03, 0.6, 0.04, 0.02, 0.03, 0.01],
),
# this is the number of returned inputs
total_samples=10,
tags=["short"],
)
class AddBenchmark(op_bench.TorchBenchmarkBase):
def init(self, M, N, K):
self.input_one = torch.rand(M, N, K)
self.input_two = torch.rand(M, N, K)
self.set_module_name("add")
def forward(self):
return torch.add(self.input_one, self.input_two)
op_bench.generate_pt_test(configs, AddBenchmark)
if __name__ == "__main__":
op_bench.benchmark_runner.main()

View File

@ -11,33 +11,49 @@ import torch
# Configs for PT add operator
add_long_configs = op_bench.cross_product_configs(
M=[8, 64, 128],
N=range(2, 10, 3),
K=[2 ** x for x in range(0, 3)],
N=range(2, 128, 64),
K=[8 ** x for x in range(0, 3)],
device=['cpu', 'cuda'],
tags=["long"]
)
add_short_configs = op_bench.config_list(
attr_names=["M", "N", "K"],
attrs=[
[64, 64, 64],
[64, 64, 128],
],
attr_names=["M", "N", "K"],
tags=["short"],
cross_product_configs={
'device': ['cpu', 'cuda'],
},
tags=["short"],
)
class AddBenchmark(op_bench.TorchBenchmarkBase):
def init(self, M, N, K):
self.input_one = torch.rand(M, N, K)
self.input_two = torch.rand(M, N, K)
def init(self, M, N, K, device):
self.input_one = torch.rand(M, N, K, device=device, requires_grad=self.auto_set())
self.input_two = torch.rand(M, N, K, device=device, requires_grad=self.auto_set())
self.set_module_name("add")
def forward(self):
return torch.add(self.input_one, self.input_two)
# The generated test names based on add_short_configs will be in the following pattern:
# add_M8_N16_K32_devicecpu
# add_M8_N16_K32_devicecuda
# add_M8_N16_K32_devicecpu_bwdall
# add_M8_N16_K32_devicecpu_bwd1
# add_M8_N16_K32_devicecpu_bwd2
# add_M8_N16_K32_devicecuda_bwdall
# add_M8_N16_K32_devicecuda_bwd1
# add_M8_N16_K32_devicecuda_bwd2
# ...
# Those names can be used to filter tests.
op_bench.generate_pt_test(add_long_configs + add_short_configs, AddBenchmark)
op_bench.generate_pt_gradient_test(add_long_configs + add_short_configs, AddBenchmark)
if __name__ == "__main__":

View File

@ -0,0 +1,41 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import operator_benchmark as op_bench
import torch
"""Microbenchmarks for as_strided operator"""
# Configs for PT as_strided operator
split_short_configs = op_bench.cross_product_configs(
M=[256, 512],
N=[256, 512],
size=[(32, 32), (64, 64)],
stride=[(1, 1), (2, 2)],
storage_offset=[0, 1],
tags=['short']
)
class As_stridedBenchmark(op_bench.TorchBenchmarkBase):
def init(self, M, N, size, stride, storage_offset):
self.input_one = torch.rand(M, N)
self.size = size
self.stride = stride
self.storage_offset = storage_offset
self.set_module_name('as_strided')
def forward(self):
return torch.as_strided(
self.input_one, self.size, self.stride, self.storage_offset)
op_bench.generate_pt_test(split_short_configs, As_stridedBenchmark)
if __name__ == "__main__":
op_bench.benchmark_runner.main()

binaries/bench_gen/bench_gen.py: Normal file → Executable file (0 lines changed)
View File

View File

@ -365,10 +365,6 @@ static inline ScalarType promoteTypes(ScalarType a, ScalarType b) {
if (a == ud || b == ud) {
return ScalarType::Undefined;
}
if (isComplexType(a) || isComplexType(b)) {
AT_ERROR(
"promoteTypes with complex numbers is not handled yet; figure out what the correct rules should be for ", toString(a), " and ", toString(b));
}
// For QInt types, we only allow exact match
if (isQIntType(a) && a == b) {

View File

@ -224,12 +224,4 @@ at::DataPtr PlacementDeleteContext::makeDataPtr(
AutogradMetaInterface::~AutogradMetaInterface() {}
bool NonVariableTypeMode::is_enabled() {
return !impl::tls_variable_is_enabled();
}
void NonVariableTypeMode::set_enabled(bool enabled) {
impl::tls_variable_set_enabled(!enabled);
}
} // namespace c10

View File

@ -9,6 +9,7 @@
#include <c10/core/Storage.h>
#include <c10/core/TensorOptions.h>
#include <c10/core/TensorTypeSet.h>
#include <c10/core/impl/LocalTensorTypeSet.h>
#include <c10/core/CopyBytes.h>
#include <c10/util/Exception.h>
@ -138,11 +139,6 @@ struct C10_API AutogradMetaInterface {
virtual ~AutogradMetaInterface();
};
struct C10_API NonVariableTypeMode {
static bool is_enabled();
static void set_enabled(bool enabled);
};
struct C10_API NamedTensorMetaInterface {
virtual ~NamedTensorMetaInterface() {};
virtual std::unique_ptr<NamedTensorMetaInterface> clone() const {
@ -808,7 +804,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
* True if a tensor is a variable. See Note [Tensor versus Variable in C++]
*/
bool is_variable() const {
return autograd_meta_ != nullptr && !at::NonVariableTypeMode::is_enabled();
return autograd_meta_ != nullptr && !impl::tls_local_tensor_type_set().excluded_.has(TensorTypeId::VariableTensorId);
}
/**

View File

@ -40,6 +40,10 @@ const char* toString(TensorTypeId t) {
return "ComplexCUDATensorId";
case TensorTypeId::VariableTensorId:
return "VariableTensorId";
case TensorTypeId::TESTING_ONLY_GenericModeTensorId:
return "TESTING_ONLY_GenericModeTensorId";
case TensorTypeId::TESTING_ONLY_GenericWrapperTensorId:
return "TESTING_ONLY_GenericWrapperTensorId";
default:
return "UNKNOWN_TENSOR_TYPE_ID";
}

View File

@ -49,6 +49,19 @@ enum class TensorTypeId : uint8_t {
VariableTensorId,
// TESTING: This is intended to be a generic testing tensor type id.
// Don't use it for anything real; its only acceptable use is within a single
// process test. Use it by creating a TensorImpl with this TensorTypeId, and
// then registering operators to operate on this type id.
TESTING_ONLY_GenericWrapperTensorId,
// TESTING: This is intended to be a generic testing tensor type id.
// Don't use it for anything real; its only acceptable use is within a single
// process test. Use it by toggling the mode on and off via
// TESTING_ONLY_tls_generic_mode_set_enabled and then registering operators
// to operate on this type id.
TESTING_ONLY_GenericModeTensorId,
NumTensorIds, // Sentinel
};

View File

@ -8,34 +8,55 @@ namespace impl {
namespace {
/// In the CAFFE2_FB_LIMITED_MOBILE_CAPABILITY build setting,
/// thread_local is not supported. In that case, we don't provide
/// `at::NonVariableTypeMode`.
/// thread_local is not supported.
#ifndef CAFFE2_FB_LIMITED_MOBILE_CAPABILITY
// NB: Zero initialized!
thread_local uint64_t raw_excluded;
// NB: POD, zero initialized!
thread_local PODLocalTensorTypeSet raw_local_tensor_type_set;
#else // defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY)
uint64_t raw_excluded = 0;
static PODLocalTensorTypeSet raw_local_tensor_type_set;
#endif
} // anonymous namespace
LocalTensorTypeSet tls_local_tensor_type_set() {
return raw_local_tensor_type_set;
}
TensorTypeSet tls_excluded_tensor_type_set() {
return TensorTypeSet(TensorTypeSet::RAW, raw_excluded);
// We could have also just snapshotted the entire state. I'm not sure which is
// better; but right now only the guard API is allowed so the two cases are
// not distinguishable.
IncludeTensorTypeIdGuard::IncludeTensorTypeIdGuard(TensorTypeId x)
: tls_(&raw_local_tensor_type_set)
, id_(x)
, prev_state_(tls_->included().has(x)) {
if (!prev_state_) {
tls_->set_included(tls_->included().add(x));
}
}
bool tls_variable_is_enabled() {
return !tls_excluded_tensor_type_set().has(TensorTypeId::VariableTensorId);
IncludeTensorTypeIdGuard::~IncludeTensorTypeIdGuard() {
if (!prev_state_) {
tls_->set_included(tls_->included().remove(id_));
}
}
void tls_variable_set_enabled(bool enabled) {
if (enabled) {
raw_excluded = tls_excluded_tensor_type_set().remove(TensorTypeId::VariableTensorId).raw_repr();
} else {
raw_excluded = tls_excluded_tensor_type_set().add(TensorTypeId::VariableTensorId).raw_repr();
ExcludeTensorTypeIdGuard::ExcludeTensorTypeIdGuard(TensorTypeId x)
: tls_(&raw_local_tensor_type_set)
, id_(x)
, prev_state_(tls_->excluded().has(x)) {
if (!prev_state_) {
tls_->set_excluded(tls_->excluded().add(x));
}
}
ExcludeTensorTypeIdGuard::~ExcludeTensorTypeIdGuard() {
if (!prev_state_) {
tls_->set_excluded(tls_->excluded().remove(id_));
}
}

View File

@ -1,22 +1,80 @@
#pragma once
#include <c10/core/TensorTypeSet.h>
// TLS management for TensorTypeSet
// TLS management for TensorTypeSet (the "local" TensorTypeSet(s))
//
// This manages thread-local TensorTypeSet of excluded keys which disqualify
// tensor types from dispatch. Keys which are in this set, even if they appear
// in a list of potential valid keys on a tensor, are not considered for
// dispatch. This is used to, for example, turn off autograd after we have
// handled autograd for a top-level element.
// This manages two thread-local TensorTypeSets:
//
// Originally, I implemented this as storing the inverted set, but
// TLS is defined to be zero-initialized, so this doesn't actually work
// (you want the set to be -1 initialized).
// - The included type set, which adds a tensor type for consideration
// in dispatch. (For example, you might add ProfilingTensorId to
// the included type set to turn on profiling on all tensor operations.)
//
// - The excluded type set, which disqualifies a tensor type from dispatch.
// (For example, after redispatching on variable, we disqualify
// VariableTensorId so we don't attempt to handle variable again.)
// (Exclusion wins over inclusion.)
//
// NB: Originally, I implemented the excluded type set as storing the inverted
// set, but TLS is defined to be zero-initialized, so this doesn't actually work
// (if it's inverted, you want the set to be -1 initialized).
namespace c10 {
namespace impl {
C10_API bool tls_variable_is_enabled();
C10_API void tls_variable_set_enabled(bool enabled);
C10_API TensorTypeSet tls_excluded_tensor_type_set();
// POD version of LocalTensorTypeSet. Declared here just so that
// we can put it in the guards.
struct C10_API PODLocalTensorTypeSet {
uint64_t included_;
uint64_t excluded_;
TensorTypeSet included() const {
return TensorTypeSet(TensorTypeSet::RAW, included_);
}
TensorTypeSet excluded() const {
return TensorTypeSet(TensorTypeSet::RAW, excluded_);
}
void set_included(TensorTypeSet x) {
included_ = x.raw_repr();
}
void set_excluded(TensorTypeSet x) {
excluded_ = x.raw_repr();
}
};
static_assert(std::is_pod<PODLocalTensorTypeSet>::value, "PODLocalTensorTypeSet must be a POD type.");
struct C10_API LocalTensorTypeSet {
/* implicit */ LocalTensorTypeSet(PODLocalTensorTypeSet x)
: included_(x.included()), excluded_(x.excluded()) {}
TensorTypeSet included_;
TensorTypeSet excluded_;
};
C10_API LocalTensorTypeSet tls_local_tensor_type_set();
class C10_API IncludeTensorTypeIdGuard {
public:
IncludeTensorTypeIdGuard(TensorTypeId);
~IncludeTensorTypeIdGuard();
private:
// A little micro-optimization to save us from tls_get_addr call
// on destruction
PODLocalTensorTypeSet* tls_;
TensorTypeId id_;
bool prev_state_;
};
class C10_API ExcludeTensorTypeIdGuard {
public:
ExcludeTensorTypeIdGuard(TensorTypeId);
~ExcludeTensorTypeIdGuard();
private:
// A little micro-optimization to save us from tls_get_addr call
// on destruction
PODLocalTensorTypeSet* tls_;
TensorTypeId id_;
bool prev_state_;
};
}} // namespace c10::impl
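
The header above keeps two thread-local sets: an included set that adds type ids for consideration in dispatch and an excluded set that removes them, with exclusion winning, plus RAII guards that add an id on construction and restore the previous state on destruction. A toy Python model of those semantics, for illustration only (not the C10 API):

    class LocalTypeSet:
        def __init__(self):
            self.included = set()
            self.excluded = set()

        def considered(self, type_id):
            # Exclusion wins over inclusion.
            return type_id in self.included and type_id not in self.excluded

    class IncludeGuard:
        """Adds an id to the included set; restores the previous state on exit."""
        def __init__(self, tls, type_id):
            self.tls, self.type_id = tls, type_id
            self.was_present = type_id in tls.included

        def __enter__(self):
            self.tls.included.add(self.type_id)

        def __exit__(self, *exc):
            if not self.was_present:
                self.tls.included.discard(self.type_id)

    tls = LocalTypeSet()
    with IncludeGuard(tls, 'GenericModeTensorId'):
        print(tls.considered('GenericModeTensorId'))  # True
    print(tls.considered('GenericModeTensorId'))      # False (state restored)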

c10/util/Complex.h: new file (17 lines)
View File

@ -0,0 +1,17 @@
#pragma once
#include <complex>
namespace std {
template <typename T> struct is_complex_t : public std::false_type {};
template <typename T> struct is_complex_t<std::complex<T>> : public std::true_type {};
template <>
class numeric_limits<std::complex<float>> : public numeric_limits<float> {};
template <>
class numeric_limits<std::complex<double>> : public numeric_limits<double> {};
} // namespace std

View File

@ -388,13 +388,23 @@
return UINT64_MAX >> (64 - N);
}
// Ignore the false warning "Arithmetic overflow" for MSVC
#ifdef _MSC_VER
# pragma warning(push)
# pragma warning(disable : 4146)
#endif
/// Gets the minimum value for a N-bit signed integer.
inline int64_t minIntN(int64_t N) {
assert(N > 0 && N <= 64 && "integer width out of range");
return -(UINT64_C(1)<<(N-1));
return -(UINT64_C(1) << (N - 1));
}
#ifdef _MSC_VER
# pragma warning(pop)
#endif
/// Gets the maximum value for a N-bit signed integer.
inline int64_t maxIntN(int64_t N) {
assert(N > 0 && N <= 64 && "integer width out of range");

c10/util/variant.h: new file (2854 lines)

File diff suppressed because it is too large

View File

@ -1,4 +1,3 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# PyTorch documentation build configuration file, created by

View File

@ -1,4 +1,3 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# PyTorch documentation build configuration file, created by

View File

@ -53,6 +53,8 @@ PyTorch is an optimized tensor library for deep learning using GPUs and CPUs.
torch.utils.model_zoo <model_zoo>
torch.utils.tensorboard <tensorboard>
type_info
named_tensor
name_inference
torch.__config__ <__config__>
.. toctree::

View File

@ -20,7 +20,7 @@ process and loaded in a process where there is no Python dependency.
We provide tools to incrementally transition a model from a pure Python program
to a TorchScript program that can be run independently from Python, such as in a standalone C++ program.
This makes it possible to train models in PyTorch using familiar tools in Python and then export
the model via TorchScript to a production environment where Python programs may be disadvantageous.
the model via TorchScript to a production environment where Python programs may be disadvantageous
for performance and multi-threading reasons.
For a gentle introduction to TorchScript, see the `Introduction to TorchScript <https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html>`_ tutorial.
@ -34,6 +34,9 @@ Creating TorchScript Code
.. autoclass:: ScriptModule()
:members:
.. autoclass:: ScriptFunction()
.. autofunction:: script(obj)
.. autofunction:: trace(func, example_inputs, optimize=None, check_trace=True, check_inputs=None, check_tolerance=1e-5)
@ -154,9 +157,9 @@ methods, and classes that it encounters. Once you call ``torch.jit.script``,
compilation is "opt-out", rather than "opt-in".
2. ``torch.jit.script(nn_module_instance)`` is now the preferred way to create
``ScriptModule``\s, instead of inheriting from ``torch.jit.ScriptModule``.
:class:`ScriptModule`\s, instead of inheriting from ``torch.jit.ScriptModule``.
These changes combine to provide a simpler, easier-to-use API for converting
your ``nn.Module``\s into ``ScriptModule``\s, ready to be optimized and executed in a
your ``nn.Module``\s into :class:`ScriptModule`\s, ready to be optimized and executed in a
non-Python environment.
The new usage looks like this:
@ -207,7 +210,7 @@ Modules
and :func:`@torch.jit.unused<torch.jit.unused>` for details.
When passed to the :func:`torch.jit.script <torch.jit.script>` function, a ``torch.nn.Module``\'s data is
copied to a ``ScriptModule`` and the TorchScript compiler compiles the module.
copied to a :class:`ScriptModule` and the TorchScript compiler compiles the module.
The module's ``forward`` is compiled by default. Methods called from ``forward`` are
lazily compiled in the order they are used in ``forward``, as well as any
``@torch.jit.export`` methods.
@ -248,6 +251,9 @@ Attributes
The TorchScript compiler needs to know the types of `module attributes`_. Most types
can be inferred from the value of the member. Empty lists and dicts cannot have their
types inferred and must have their types annotated with `PEP 526-style <https://www.python.org/dev/peps/pep-0526/#class-and-instance-variable-annotations>`_ class annotations.
If a type cannot be inferred and is not explicitly annotated, it will not be added as an attribute
to the resulting :class:`ScriptModule`.
Old API:
@ -304,7 +310,7 @@ If you are stuck on Python 2 and cannot use the class annotation syntax, you can
Constants
~~~~~~~~~
The ``Final`` type constructor can be used to mark members as `constant`_. If members are not marked constant, they will be copied to the resulting ``ScriptModule`` as an attribute. Using ``Final`` opens opportunities for optimization if the value is known to be fixed and gives additional type safety.
The ``Final`` type constructor can be used to mark members as `constant`_. If members are not marked constant, they will be copied to the resulting :class:`ScriptModule` as an attribute. Using ``Final`` opens opportunities for optimization if the value is known to be fixed and gives additional type safety.
Old API:
@ -1187,9 +1193,11 @@ The ``torch.nn.Parameter`` wrapper and ``register_buffer`` can be used to assign
tensors to a module. Other values assigned to a module that is compiled
will be added to the compiled module if their types can be inferred. All `types`_
available in TorchScript can be used as module attributes. Tensor attributes are
semantically the same as buffers. The type of empty containers and ``None``
semantically the same as buffers. The type of empty lists and dictionaries and ``None``
values cannot be inferred and must be specified via
`PEP 526-style <https://www.python.org/dev/peps/pep-0526/#class-and-instance-variable-annotations>`_ class annotations.
If a type cannot be inferred and is not explicitly annotated, it will not be added as an attribute
to the resulting :class:`ScriptModule`.
Example:
@ -1198,7 +1206,7 @@ Example:
from typing import List, Dict
class Foo(nn.Module):
# `words` is initialzed as an empty list, so its type must be specified
# `words` is initialized as an empty list, so its type must be specified
words: List[str]
# The type could potentially be inferred if `a_dict` (below) was not
@ -1284,13 +1292,13 @@ Disable JIT for Debugging
traced_fn(torch.rand(3, 4))
Debugging this script with ``pdb`` works except for when we invoke the :func:`@torch.jit.script <torch.jit.script>`
function. We can globally disable JIT, so that we can call the ``@torch.jit.script``
function. We can globally disable JIT, so that we can call the :func:`@torch.jit.script <torch.jit.script>`
function as a normal Python function and not compile it. If the above script
is called ``disable_jit_example.py``, we can invoke it like so::
$ PYTORCH_JIT=0 python disable_jit_example.py
and we will be able to step into the ``@torch.jit.script`` function as a normal Python
and we will be able to step into the :func:`@torch.jit.script <torch.jit.script>` function as a normal Python
function. To disable the TorchScript compiler for a specific function, see
:func:`@torch.jit.ignore <torch.jit.ignore>`.
@ -1298,7 +1306,7 @@ Disable JIT for Debugging
Inspecting Code
^^^^^^^^^^^^^^^
TorchScript provides a code pretty-printer for all ``ScriptModule`` instances. This
TorchScript provides a code pretty-printer for all :class:`ScriptModule` instances. This
pretty-printer gives an interpretation of the script method's code as valid
Python syntax. For example:
@ -1322,11 +1330,11 @@ Python syntax. For example:
...
A ``ScriptModule`` with a single ``forward`` method will have an attribute
``code``, which you can use to inspect the ``ScriptModule``'s code.
If the ``ScriptModule`` has more than one method, you will need to access
A :class:`ScriptModule` with a single ``forward`` method will have an attribute
``code``, which you can use to inspect the :class:`ScriptModule`'s code.
If the :class:`ScriptModule` has more than one method, you will need to access
``.code`` on the method itself and not the module. We can inspect the
code of a method named ``bar`` on a ScriptModule by accessing ``.bar.code``.
code of a method named ``foo`` on a ScriptModule by accessing ``.foo.code``.
The example above produces this output: ::
def foo(len: int) -> Tensor:
@ -1419,7 +1427,7 @@ operators are formatted to reflect their equivalent source code forms
to facilitate easy debugging.
Graphs can be inspected as shown to confirm that the computation described
by a ``ScriptModule`` is correct, in both automated and manual fashion, as
by a :class:`ScriptModule` is correct, in both automated and manual fashion, as
described below.
@ -1638,7 +1646,7 @@ best practices?
the correct device information.
Q: How do I store attributes on a ``ScriptModule``?
Q: How do I store attributes on a :class:`ScriptModule`?
Say we have a model like:
@ -1658,7 +1666,7 @@ Q: How do I store attributes on a ``ScriptModule``?
If ``Model`` is instantiated it will result in a compilation error
since the compiler doesn't know about ``x``. There are 4 ways to inform the
compiler of attributes on ``ScriptModule``:
compiler of attributes on :class:`ScriptModule`:
1. ``nn.Parameter`` - Values wrapped in ``nn.Parameter`` will work as they
do on ``nn.Module``\s

View File

@ -0,0 +1,468 @@
.. currentmodule:: torch
.. _name_inference_reference-doc:
Named Tensors operator coverage
===============================
Please read :ref:`named_tensors-doc` first for an introduction to named tensors.
This document is a reference for *name inference*, a process that defines how
named tensors:
1. use names to provide additional automatic runtime correctness checks
2. propagate names from input tensors to output tensors
Below is a list of all operations that are supported with named tensors
and their associated name inference rules.
If you don't see an operation listed here, but it would help your use case, please
`search if an issue has already been filed <https://github.com/pytorch/pytorch/issues?q=is%3Aopen+is%3Aissue+label%3A%22module%3A+named+tensor%22>`_ and if not, `file one <https://github.com/pytorch/pytorch/issues/new/choose>`_.
.. warning::
The named tensor API is experimental and subject to change.
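
Before the table, a short illustration of the two rules that appear most often below: keeps_input_names (unary ops propagate the input's names) and unifies_names_from_inputs (binary ops check that names match and propagate the unified names). This is a sketch assuming the experimental names= factory argument:

    import torch

    x = torch.randn(3, 4, names=('N', 'C'))
    print(x.abs().names)   # ('N', 'C')  -- keeps_input_names
    print((x + x).names)   # ('N', 'C')  -- unifies_names_from_inputs

    y = torch.randn(3, 4, names=('N', 'D'))
    # x + y raises an error: 'C' and 'D' cannot be unified.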
.. csv-table:: Supported Operations
:header: API, Name inference rule
:widths: 20, 20
":meth:`Tensor.abs`, :func:`torch.abs`",:ref:`keeps_input_names-doc`
:meth:`Tensor.abs_`,:ref:`keeps_input_names-doc`
":meth:`Tensor.acos`, :func:`torch.acos`",:ref:`keeps_input_names-doc`
:meth:`Tensor.acos_`,:ref:`keeps_input_names-doc`
":meth:`Tensor.add`, :func:`torch.add`",:ref:`unifies_names_from_inputs-doc`
:meth:`Tensor.add_`,:ref:`unifies_names_from_inputs-doc`
":meth:`Tensor.addmm`, :func:`torch.addmm`",:ref:`contracts_away_dims-doc`
:meth:`Tensor.addmm_`,:ref:`contracts_away_dims-doc`
":meth:`Tensor.addmv`, :func:`torch.addmv`",:ref:`contracts_away_dims-doc`
:meth:`Tensor.addmv_`,:ref:`contracts_away_dims-doc`
:meth:`Tensor.align_as`,See documentation
:meth:`Tensor.align_to`,See documentation
":meth:`Tensor.all`, :func:`torch.all`",None
":meth:`Tensor.any`, :func:`torch.any`",None
":meth:`Tensor.asin`, :func:`torch.asin`",:ref:`keeps_input_names-doc`
:meth:`Tensor.asin_`,:ref:`keeps_input_names-doc`
":meth:`Tensor.atan`, :func:`torch.atan`",:ref:`keeps_input_names-doc`
":meth:`Tensor.atan2`, :func:`torch.atan2`",:ref:`unifies_names_from_inputs-doc`
:meth:`Tensor.atan2_`,:ref:`unifies_names_from_inputs-doc`
:meth:`Tensor.atan_`,:ref:`keeps_input_names-doc`
":meth:`Tensor.bernoulli`, :func:`torch.bernoulli`",:ref:`keeps_input_names-doc`
:meth:`Tensor.bernoulli_`,None
:meth:`Tensor.bfloat16`,:ref:`keeps_input_names-doc`
":meth:`Tensor.bitwise_not`, :func:`torch.bitwise_not`",:ref:`keeps_input_names-doc`
:meth:`Tensor.bitwise_not_`,None
":meth:`Tensor.bmm`, :func:`torch.bmm`",:ref:`contracts_away_dims-doc`
:meth:`Tensor.bool`,:ref:`keeps_input_names-doc`
:meth:`Tensor.byte`,:ref:`keeps_input_names-doc`
:func:`torch.cat`,:ref:`unifies_names_from_inputs-doc`
:meth:`Tensor.cauchy_`,None
":meth:`Tensor.ceil`, :func:`torch.ceil`",:ref:`keeps_input_names-doc`
:meth:`Tensor.ceil_`,None
:meth:`Tensor.char`,:ref:`keeps_input_names-doc`
":meth:`Tensor.chunk`, :func:`torch.chunk`",:ref:`keeps_input_names-doc`
":meth:`Tensor.clamp`, :func:`torch.clamp`",:ref:`keeps_input_names-doc`
:meth:`Tensor.clamp_`,None
:meth:`Tensor.copy_`,:ref:`out_function_semantics-doc`
":meth:`Tensor.cos`, :func:`torch.cos`",:ref:`keeps_input_names-doc`
:meth:`Tensor.cos_`,None
":meth:`Tensor.cosh`, :func:`torch.cosh`",:ref:`keeps_input_names-doc`
:meth:`Tensor.cosh_`,None
:meth:`Tensor.cpu`,:ref:`keeps_input_names-doc`
:meth:`Tensor.cuda`,:ref:`keeps_input_names-doc`
":meth:`Tensor.cumprod`, :func:`torch.cumprod`",:ref:`removes_dimensions-doc`
":meth:`Tensor.cumsum`, :func:`torch.cumsum`",:ref:`removes_dimensions-doc`
:meth:`Tensor.data_ptr`,None
":meth:`Tensor.detach`, :func:`torch.detach`",:ref:`keeps_input_names-doc`
:meth:`Tensor.detach_`,None
":attr:`Tensor.device`, :func:`torch.device`",None
":meth:`Tensor.digamma`, :func:`torch.digamma`",:ref:`keeps_input_names-doc`
:meth:`Tensor.digamma_`,None
:meth:`Tensor.dim`,None
":meth:`Tensor.div`, :func:`torch.div`",:ref:`unifies_names_from_inputs-doc`
:meth:`Tensor.div_`,:ref:`unifies_names_from_inputs-doc`
":meth:`Tensor.dot`, :func:`torch.dot`",None
:meth:`Tensor.double`,:ref:`keeps_input_names-doc`
:meth:`Tensor.element_size`,None
:func:`torch.empty`,:ref:`factory-doc`
:func:`torch.empty_like`,:ref:`factory-doc`
":meth:`Tensor.eq`, :func:`torch.eq`",:ref:`unifies_names_from_inputs-doc`
":meth:`Tensor.erf`, :func:`torch.erf`",:ref:`keeps_input_names-doc`
:meth:`Tensor.erf_`,None
":meth:`Tensor.erfc`, :func:`torch.erfc`",:ref:`keeps_input_names-doc`
:meth:`Tensor.erfc_`,None
":meth:`Tensor.erfinv`, :func:`torch.erfinv`",:ref:`keeps_input_names-doc`
:meth:`Tensor.erfinv_`,None
":meth:`Tensor.exp`, :func:`torch.exp`",:ref:`keeps_input_names-doc`
:meth:`Tensor.exp_`,None
:meth:`Tensor.expand`,:ref:`keeps_input_names-doc`
":meth:`Tensor.expm1`, :func:`torch.expm1`",:ref:`keeps_input_names-doc`
:meth:`Tensor.expm1_`,None
:meth:`Tensor.exponential_`,None
:meth:`Tensor.fill_`,None
":meth:`Tensor.flatten`, :func:`torch.flatten`",See documentation
:meth:`Tensor.float`,:ref:`keeps_input_names-doc`
":meth:`Tensor.floor`, :func:`torch.floor`",:ref:`keeps_input_names-doc`
:meth:`Tensor.floor_`,None
":meth:`Tensor.frac`, :func:`torch.frac`",:ref:`keeps_input_names-doc`
:meth:`Tensor.frac_`,None
":meth:`Tensor.ge`, :func:`torch.ge`",:ref:`unifies_names_from_inputs-doc`
":meth:`Tensor.get_device`, :func:`torch.get_device`",None
:attr:`Tensor.grad`,None
":meth:`Tensor.gt`, :func:`torch.gt`",:ref:`unifies_names_from_inputs-doc`
:meth:`Tensor.half`,:ref:`keeps_input_names-doc`
:meth:`Tensor.has_names`,See documentation
":meth:`Tensor.index_fill`, :func:`torch.index_fill`",:ref:`keeps_input_names-doc`
:meth:`Tensor.index_fill_`,None
:meth:`Tensor.int`,:ref:`keeps_input_names-doc`
:meth:`Tensor.is_contiguous`,None
:attr:`Tensor.is_cuda`,None
":meth:`Tensor.is_floating_point`, :func:`torch.is_floating_point`",None
:attr:`Tensor.is_leaf`,None
:meth:`Tensor.is_pinned`,None
:meth:`Tensor.is_shared`,None
":meth:`Tensor.is_signed`, :func:`torch.is_signed`",None
:attr:`Tensor.is_sparse`,None
:func:`torch.is_tensor`,None
:meth:`Tensor.item`,None
":meth:`Tensor.kthvalue`, :func:`torch.kthvalue`",:ref:`removes_dimensions-doc`
":meth:`Tensor.le`, :func:`torch.le`",:ref:`unifies_names_from_inputs-doc`
":meth:`Tensor.log`, :func:`torch.log`",:ref:`keeps_input_names-doc`
":meth:`Tensor.log10`, :func:`torch.log10`",:ref:`keeps_input_names-doc`
:meth:`Tensor.log10_`,None
":meth:`Tensor.log1p`, :func:`torch.log1p`",:ref:`keeps_input_names-doc`
:meth:`Tensor.log1p_`,None
":meth:`Tensor.log2`, :func:`torch.log2`",:ref:`keeps_input_names-doc`
:meth:`Tensor.log2_`,None
:meth:`Tensor.log_`,None
:meth:`Tensor.log_normal_`,None
":meth:`Tensor.logical_not`, :func:`torch.logical_not`",:ref:`keeps_input_names-doc`
:meth:`Tensor.logical_not_`,None
":meth:`Tensor.logsumexp`, :func:`torch.logsumexp`",:ref:`removes_dimensions-doc`
:meth:`Tensor.long`,:ref:`keeps_input_names-doc`
":meth:`Tensor.lt`, :func:`torch.lt`",:ref:`unifies_names_from_inputs-doc`
:func:`torch.manual_seed`,None
":meth:`Tensor.masked_fill`, :func:`torch.masked_fill`",:ref:`keeps_input_names-doc`
:meth:`Tensor.masked_fill_`,None
":meth:`Tensor.masked_select`, :func:`torch.masked_select`",Aligns mask up to input and then unifies_names_from_input_tensors
":meth:`Tensor.matmul`, :func:`torch.matmul`",:ref:`contracts_away_dims-doc`
":meth:`Tensor.mean`, :func:`torch.mean`",:ref:`removes_dimensions-doc`
":meth:`Tensor.median`, :func:`torch.median`",:ref:`removes_dimensions-doc`
":meth:`Tensor.mm`, :func:`torch.mm`",:ref:`contracts_away_dims-doc`
":meth:`Tensor.mode`, :func:`torch.mode`",:ref:`removes_dimensions-doc`
":meth:`Tensor.mul`, :func:`torch.mul`",:ref:`unifies_names_from_inputs-doc`
:meth:`Tensor.mul_`,:ref:`unifies_names_from_inputs-doc`
":meth:`Tensor.mv`, :func:`torch.mv`",:ref:`contracts_away_dims-doc`
:attr:`Tensor.names`,See documentation
":meth:`Tensor.narrow`, :func:`torch.narrow`",:ref:`keeps_input_names-doc`
:attr:`Tensor.ndim`,None
:meth:`Tensor.ndimension`,None
":meth:`Tensor.ne`, :func:`torch.ne`",:ref:`unifies_names_from_inputs-doc`
":meth:`Tensor.neg`, :func:`torch.neg`",:ref:`keeps_input_names-doc`
:meth:`Tensor.neg_`,None
:func:`torch.normal`,:ref:`keeps_input_names-doc`
:meth:`Tensor.normal_`,None
":meth:`Tensor.numel`, :func:`torch.numel`",None
:func:`torch.ones`,:ref:`factory-doc`
":meth:`Tensor.pow`, :func:`torch.pow`",:ref:`unifies_names_from_inputs-doc`
:meth:`Tensor.pow_`,None
":meth:`Tensor.prod`, :func:`torch.prod`",:ref:`removes_dimensions-doc`
:func:`torch.rand`,:ref:`factory-doc`
:func:`torch.randn`,:ref:`factory-doc`
:meth:`Tensor.random_`,None
":meth:`Tensor.reciprocal`, :func:`torch.reciprocal`",:ref:`keeps_input_names-doc`
:meth:`Tensor.reciprocal_`,None
:meth:`Tensor.refine_names`,See documentation
:meth:`Tensor.register_hook`,None
:meth:`Tensor.rename`,See documentation
:meth:`Tensor.rename_`,See documentation
:attr:`Tensor.requires_grad`,None
:meth:`Tensor.requires_grad_`,None
:meth:`Tensor.resize_`,Only allow resizes that do not change shape
:meth:`Tensor.resize_as_`,Only allow resizes that do not change shape
":meth:`Tensor.round`, :func:`torch.round`",:ref:`keeps_input_names-doc`
:meth:`Tensor.round_`,None
":meth:`Tensor.rsqrt`, :func:`torch.rsqrt`",:ref:`keeps_input_names-doc`
:meth:`Tensor.rsqrt_`,None
":meth:`Tensor.select`, :func:`torch.select`",:ref:`removes_dimensions-doc`
:meth:`Tensor.short`,:ref:`keeps_input_names-doc`
":meth:`Tensor.sigmoid`, :func:`torch.sigmoid`",:ref:`keeps_input_names-doc`
:meth:`Tensor.sigmoid_`,None
":meth:`Tensor.sign`, :func:`torch.sign`",:ref:`keeps_input_names-doc`
:meth:`Tensor.sign_`,None
":meth:`Tensor.sin`, :func:`torch.sin`",:ref:`keeps_input_names-doc`
:meth:`Tensor.sin_`,None
":meth:`Tensor.sinh`, :func:`torch.sinh`",:ref:`keeps_input_names-doc`
:meth:`Tensor.sinh_`,None
:meth:`Tensor.size`,None
":meth:`Tensor.split`, :func:`torch.split`",:ref:`keeps_input_names-doc`
":meth:`Tensor.sqrt`, :func:`torch.sqrt`",:ref:`keeps_input_names-doc`
:meth:`Tensor.sqrt_`,None
":meth:`Tensor.squeeze`, :func:`torch.squeeze`",:ref:`removes_dimensions-doc`
":meth:`Tensor.std`, :func:`torch.std`",:ref:`removes_dimensions-doc`
:func:`torch.std_mean`,:ref:`removes_dimensions-doc`
:meth:`Tensor.stride`,None
":meth:`Tensor.sub`, :func:`torch.sub`",:ref:`unifies_names_from_inputs-doc`
:meth:`Tensor.sub_`,:ref:`unifies_names_from_inputs-doc`
":meth:`Tensor.sum`, :func:`torch.sum`",:ref:`removes_dimensions-doc`
":meth:`Tensor.tan`, :func:`torch.tan`",:ref:`keeps_input_names-doc`
:meth:`Tensor.tan_`,None
":meth:`Tensor.tanh`, :func:`torch.tanh`",:ref:`keeps_input_names-doc`
:meth:`Tensor.tanh_`,None
:func:`torch.tensor`,:ref:`factory-doc`
:meth:`Tensor.to`,:ref:`keeps_input_names-doc`
":meth:`Tensor.topk`, :func:`torch.topk`",:ref:`removes_dimensions-doc`
":meth:`Tensor.transpose`, :func:`torch.transpose`",:ref:`permutes_dimensions-doc`
":meth:`Tensor.trunc`, :func:`torch.trunc`",:ref:`keeps_input_names-doc`
:meth:`Tensor.trunc_`,None
:meth:`Tensor.type`,None
:meth:`Tensor.type_as`,:ref:`keeps_input_names-doc`
":meth:`Tensor.unbind`, :func:`torch.unbind`",:ref:`removes_dimensions-doc`
:meth:`Tensor.unflatten`,See documentation
:meth:`Tensor.uniform_`,None
":meth:`Tensor.var`, :func:`torch.var`",:ref:`removes_dimensions-doc`
:func:`torch.var_mean`,:ref:`removes_dimensions-doc`
:meth:`Tensor.zero_`,None
:func:`torch.zeros`,:ref:`factory-doc`
.. _keeps_input_names-doc:
Keeps input names
^^^^^^^^^^^^^^^^^
All pointwise unary functions follow this rule as well as some other unary functions.
- Check names: None
- Propagate names: input tensor's names are propagated to the output.
::
>>> x = torch.randn(3, 3, names=('N', 'C'))
>>> x.abs().names
('N', 'C')
.. _removes_dimensions-doc:
Removes dimensions
^^^^^^^^^^^^^^^^^^
All reduction ops like :meth:`~Tensor.sum` remove dimensions by reducing
over the desired dimensions. Other operations like :meth:`~Tensor.select` and
:meth:`~Tensor.squeeze` remove dimensions.
Wherever one can pass an integer dimension index to an operator, one can also pass
a dimension name. Functions that take lists of dimension indices can also take in a
list of dimension names.
- Check names: If :attr:`dim` or :attr:`dims` is passed in as a list of names,
check that those names exist in :attr:`self`.
- Propagate names: If the dimensions of the input tensor specified by :attr:`dim`
or :attr:`dims` are not present in the output tensor, then the corresponding names
of those dimensions do not appear in ``output.names``.
::
>>> x = torch.randn(1, 3, 3, 3, names=('N', 'C', 'H', 'W'))
>>> x.squeeze('N').names
('C', 'H', 'W')
>>> x = torch.randn(3, 3, 3, 3, names=('N', 'C', 'H', 'W'))
>>> x.sum(['N', 'C']).names
('H', 'W')
# Reduction ops with keepdim=True don't actually remove dimensions.
>>> x = torch.randn(3, 3, 3, 3, names=('N', 'C', 'H', 'W'))
>>> x.sum(['N', 'C'], keepdim=True).names
('N', 'C', 'H', 'W')
.. _unifies_names_from_inputs-doc:
Unifies names from inputs
^^^^^^^^^^^^^^^^^^^^^^^^^
All binary arithmetic ops follow this rule. Operations that broadcast still
broadcast positionally from the right to preserve compatibility with unnamed
tensors. To perform explicit broadcasting by names, use :meth:`Tensor.align_as`.
- Check names: All names must match positionally from the right. i.e., in
``tensor + other``, ``match(tensor.names[i], other.names[i])`` must be true for all
``i`` in ``(-min(tensor.dim(), other.dim()) + 1, -1]``.
- Check names: Furthermore, all named dimensions must be aligned from the right.
During matching, if we match a named dimension ``A`` with an unnamed dimension
``None``, then ``A`` must not appear in the tensor with the unnamed dimension.
- Propagate names: unify pairs of names from the right from both tensors to
produce output names.
For example,
::
# tensor: Tensor[ N, None]
# other: Tensor[None, C]
>>> tensor = torch.randn(3, 3, names=('N', None))
>>> other = torch.randn(3, 3, names=(None, 'C'))
>>> (tensor + other).names
('N', 'C')
Check names:
- ``match(tensor.names[-1], other.names[-1])`` is ``True``
- ``match(tensor.names[-2], other.names[-2])`` is ``True``
- Because we matched ``None`` in :attr:`tensor` with ``'C'``,
check to make sure ``'C'`` doesn't exist in :attr:`tensor` (it does not).
- Check to make sure ``'N'`` doesn't exist in :attr:`other` (it does not).
Finally, the output names are computed with
``[unify('N', None), unify(None, 'C')] = ['N', 'C']``
More examples::
# Dimensions don't match from the right:
# tensor: Tensor[N, C]
# other: Tensor[ N]
>>> tensor = torch.randn(3, 3, names=('N', 'C'))
>>> other = torch.randn(3, names=('N',))
>>> (tensor + other).names
RuntimeError: Error when attempting to broadcast dims ['N', 'C'] and dims
['N']: dim 'C' and dim 'N' are at the same position from the right but do
not match.
# Dimensions aren't aligned when matching tensor.names[-1] and other.names[-1]:
# tensor: Tensor[N, None]
# other: Tensor[ N]
>>> tensor = torch.randn(3, 3, names=('N', None))
>>> other = torch.randn(3, names=('N',))
>>> (tensor + other).names
RuntimeError: Misaligned dims when attempting to broadcast dims ['N'] and
dims ['N', None]: dim 'N' appears in a different position from the right
across both lists.
.. note::
In both of the last examples, it is possible to align the tensors by names
and then perform the addition. Use :meth:`Tensor.align_as` to align
tensors by name or :meth:`Tensor.align_to` to align tensors to a custom
dimension ordering.
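For instance, a minimal sketch of repairing the first failing example above:
align ``other`` to ``tensor`` before adding; the output names then follow from
the unification rule described earlier.
::
>>> tensor = torch.randn(3, 3, names=('N', 'C'))
>>> other = torch.randn(3, names=('N',))
>>> (tensor + other.align_as(tensor)).names
('N', 'C')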
.. _permutes_dimensions-doc:
Permutes dimensions
^^^^^^^^^^^^^^^^^^^
Some operations, like :meth:`Tensor.t()`, permute the order of dimensions. Dimension names
are attached to individual dimensions so they get permuted as well.
If the operator takes in positional index :attr:`dim`, it is also able to take a dimension
name as :attr:`dim`.
- Check names: If :attr:`dim` is passed as a name, check that it exists in the tensor.
- Propagate names: Permute dimension names in the same way as the dimensions that are
being permuted.
::
>>> x = torch.randn(3, 3, names=('N', 'C'))
>>> x.transpose('N', 'C').names
('C', 'N')
.. _contracts_away_dims-doc:
Contracts away dims
^^^^^^^^^^^^^^^^^^^
Matrix multiply functions follow some variant of this. Let's go through
:func:`torch.mm` first and then generalize the rule for batch matrix multiplication.
For ``torch.mm(tensor, other)``:
- Check names: None
- Propagate names: result names are ``(tensor.names[-2], other.names[-1])``.
::
>>> x = torch.randn(3, 3, names=('N', 'D'))
>>> y = torch.randn(3, 3, names=('in', 'out'))
>>> x.mm(y).names
('N', 'out')
Inherently, a matrix multiplication performs a dot product over two dimensions,
collapsing them. When two tensors are matrix-multiplied, the contracted dimensions
disappear and do not show up in the output tensor.
:func:`torch.mv`, :func:`torch.dot` work in a similar way: name inference does not
check input names and removes the dimensions that are involved in the dot product:
::
>>> x = torch.randn(3, 3, names=('N', 'D'))
>>> y = torch.randn(3, names=('something',))
>>> x.mv(y).names
('N',)
Now, let's take a look at ``torch.matmul(tensor, other)``. Assume that ``tensor.dim() >= 2``
and ``other.dim() >= 2``.
- Check names: Check that the batch dimensions of the inputs are aligned and broadcastable.
See :ref:`unifies_names_from_inputs-doc` for what it means for the inputs to be aligned.
- Propagate names: result names are obtained by unifying the batch dimensions and removing
the contracted dimensions:
``unify(tensor.names[:-2], other.names[:-2]) + (tensor.names[-2], other.names[-1])``.
Examples::
# Batch matrix multiply of matrices Tensor['C', 'D'] and Tensor['E', 'F'].
# 'A', 'B' are batch dimensions.
>>> x = torch.randn(3, 3, 3, 3, names=('A', 'B', 'C', 'D'))
>>> y = torch.randn(3, 3, 3, names=('B', 'E', 'F'))
>>> torch.matmul(x, y).names
('A', 'B', 'C', 'F')
Finally, there are fused ``add`` versions of many matmul functions, e.g., :func:`addmm`
and :func:`addmv`. These are treated as composing name inference for :func:`mm` with
name inference for :func:`add`.
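As an illustrative sketch (not an exhaustive specification), composing the two rules
this way for :func:`torch.addmm` with a hypothetical unnamed bias vector would give:
::
>>> bias = torch.randn(3)
>>> x = torch.randn(3, 3, names=('N', 'D'))
>>> y = torch.randn(3, 3, names=('in', 'out'))
>>> torch.addmm(bias, x, y).names
('N', 'out')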
.. _factory-doc:
Factory functions
^^^^^^^^^^^^^^^^^
Factory functions now take a new :attr:`names` argument that associates a name
with each dimension.
::
>>> torch.zeros(2, 3, names=('N', 'C'))
tensor([[0., 0., 0.],
[0., 0., 0.]], names=('N', 'C'))
.. _out_function_semantics-doc:
out function and in-place variants
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
A tensor specified as an ``out=`` tensor has the following behavior:
- If it has no named dimensions, then the names computed from the operation
get propagated to it.
- If it has any named dimensions, then the names computed from the operation
must be exactly equal to the existing names. Otherwise, the operation errors.
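For example, a minimal sketch of the first case, assuming an operator that supports
named tensors (here :func:`torch.abs`, whose result keeps the input names):
::
>>> x = torch.randn(3, 3, names=('N', 'C'))
>>> out = torch.empty(3, 3)
>>> torch.abs(x, out=out)
>>> out.names
('N', 'C')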
All in-place methods modify inputs to have names equal to the computed names
from name inference. For example,
::
>>> x = torch.randn(3, 3)
>>> y = torch.randn(3, 3, names=('N', 'C'))
>>> x.names
(None, None)
>>> x += y
>>> x.names
('N', 'C')

View File

@ -0,0 +1,319 @@
.. currentmodule:: torch
.. _named_tensors-doc:
Named Tensors
=============
Named Tensors aim to make tensors easier to use by allowing users to associate
explicit names with tensor dimensions. In most cases, operations that take
dimension parameters will accept dimension names, avoiding the need to track
dimensions by position. In addition, named tensors use names to automatically
check that APIs are being used correctly at runtime, providing extra safety.
Names can also be used to rearrange dimensions, for example, to support
"broadcasting by name" rather than "broadcasting by position".
.. warning::
The named tensor API is experimental and subject to change.
Creating named tensors
----------------------
Factory functions now take a new :attr:`names` argument that associates a name
with each dimension.
::
>>> torch.zeros(2, 3, names=('N', 'C'))
tensor([[0., 0., 0.],
[0., 0., 0.]], names=('N', 'C'))
Named dimensions, like regular Tensor dimensions, are ordered.
``tensor.names[i]`` is the name of dimension ``i`` of ``tensor``.
The following factory functions support named tensors:
- :func:`torch.empty`
- :func:`torch.rand`
- :func:`torch.randn`
- :func:`torch.ones`
- :func:`torch.tensor`
- :func:`torch.zeros`
Named dimensions
----------------
See :attr:`~Tensor.names` for restrictions on tensor names.
Use :attr:`~Tensor.names` to access the dimension names of a tensor and
:meth:`~Tensor.rename` to rename named dimensions.
::
>>> imgs = torch.randn(1, 2, 2, 3, names=('N', 'C', 'H', 'W'))
>>> imgs.names
('N', 'C', 'H', 'W')
>>> renamed_imgs = imgs.rename(H='height', W='width')
>>> renamed_imgs.names
('N', 'C', 'height', 'width')
Named tensors can coexist with unnamed tensors; named tensors are instances of
:class:`torch.Tensor`. Unnamed tensors have ``None``-named dimensions. Named
tensors do not require all dimensions to be named.
::
>>> imgs = torch.randn(1, 2, 2, 3, names=(None, 'C', 'H', 'W'))
>>> imgs.names
(None, 'C', 'H', 'W')
Name propagation semantics
--------------------------
Named tensors use names to automatically check that APIs are being called
correctly at runtime. This occurs in a process called *name inference*.
More formally, name inference consists of the following two steps:
- **Check names**: an operator may perform automatic checks at runtime, verifying
that certain dimension names match.
- **Propagate names**: name inference propagates names to output tensors.
All operations that support named tensors propagate names.
::
>>> x = torch.randn(3, 3, names=('N', 'C'))
>>> x.abs().names
('N', 'C')
.. _match_semantics-doc:
match semantics
^^^^^^^^^^^^^^^
Two names *match* if they are equal (string equality) or if at least one is ``None``.
Nones are essentially a special "wildcard" name.
``unify(A, B)`` determines which of the names ``A`` and ``B`` to propagate to the outputs.
It returns the more *specific* of the two names, if they match. If the names do not match,
then it errors.
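Conceptually, ``match`` and ``unify`` behave like the following Python sketch.
This is illustrative only and is not the actual implementation.
::
def match(a, b):
    # Two names match if they are equal or if at least one is None.
    return a is None or b is None or a == b

def unify(a, b):
    # Return the more specific of two matching names; error otherwise.
    if not match(a, b):
        raise RuntimeError("names {} and {} do not match".format(a, b))
    return a if b is None else b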
.. note::
In practice, when working with named tensors, one should avoid having unnamed
dimensions because their handling can be complicated. It is recommended to lift
all unnamed dimensions to be named dimensions by using :meth:`~Tensor.refine_names`.
Basic name inference rules
^^^^^^^^^^^^^^^^^^^^^^^^^^
Let's see how ``match`` and ``unify`` are used in name inference in the case of
adding two one-dim tensors with no broadcasting.
::
x = torch.randn(3, names=('X',))
y = torch.randn(3)
z = torch.randn(3, names=('Z',))
**Check names**: check that the names of the two tensors *match*.
For the following examples:
::
>>> # x + y # match('X', None) is True
>>> # x + z # match('X', 'Z') is False
>>> # x + x # match('X', 'X') is True
>>> x + z
Error when attempting to broadcast dims ['X'] and dims ['Z']: dim 'X' and dim 'Z' are at the same position from the right but do not match.
**Propagate names**: *unify* the names to select which one to propagate.
In the case of ``x + y``, ``unify('X', None) = 'X'`` because ``'X'`` is more
specific than ``None``.
::
>>> (x + y).names
('X',)
>>> (x + x).names
('X',)
For a comprehensive list of name inference rules, see :ref:`name_inference_reference-doc`.
Here are two common operations that may be useful to go over:
- Binary arithmetic ops: :ref:`unifies_names_from_inputs-doc`
- Matrix multiplication ops: :ref:`contracts_away_dims-doc`
Explicit alignment by names
---------------------------
Use :meth:`~Tensor.align_as` or :meth:`~Tensor.align_to` to align tensor dimensions
by name to a specified ordering. This is useful for performing "broadcasting by names".
::
# This function is agnostic to the dimension ordering of `input`,
# as long as it has a `C` dimension somewhere.
def scale_channels(input, scale):
scale = scale.refine_names('C')
return input * scale.align_as(input)
>>> num_channels = 3
>>> scale = torch.randn(num_channels, names=('C',))
>>> imgs = torch.rand(3, 3, 3, num_channels, names=('N', 'H', 'W', 'C'))
>>> more_imgs = torch.rand(3, num_channels, 3, 3, names=('N', 'C', 'H', 'W'))
>>> videos = torch.randn(3, num_channels, 3, 3, 3, names=('N', 'C', 'H', 'W', 'D'))
>>> scale_channels(imgs, scale)
>>> scale_channels(more_imgs, scale)
>>> scale_channels(videos, scale)
Manipulating dimensions
-----------------------
Use :meth:`~Tensor.align_to` to permute large amounts of dimensions without
mentioning all of them, as is required by :meth:`~Tensor.permute`.
::
>>> tensor = torch.randn(2, 2, 2, 2, 2, 2)
>>> named_tensor = tensor.refine_names('A', 'B', 'C', 'D', 'E', 'F')
# Move the F (dim 5) and E dimension (dim 4) to the front while keeping
# the rest in the same order
>>> tensor.permute(5, 4, 0, 1, 2, 3)
>>> named_tensor.align_to('F', 'E', ...) # Use '...' instead in Python 2
Use :meth:`~Tensor.flatten` and :meth:`~Tensor.unflatten` to flatten and unflatten
dimensions, respectively. These methods are more verbose than :meth:`~Tensor.view`
and :meth:`~Tensor.reshape`, but have more semantic meaning to someone reading the code.
::
>>> imgs = torch.randn(32, 3, 128, 128)
>>> named_imgs = imgs.refine_names('N', 'C', 'H', 'W')
>>> flat_imgs = imgs.view(32, -1)
>>> named_flat_imgs = named_imgs.flatten(['C', 'H', 'W'], 'features')
>>> named_flat_imgs.names
('N', 'features')
>>> unflattened_imgs = imgs.view(32, 3, 128, 128)
>>> unflattened_named_imgs = named_flat_imgs.unflatten(
'features', [('C', 3), ('H', 128), ('W', 128)])
.. _named_tensors_autograd-doc:
Autograd support
----------------
Autograd currently supports named tensors in a limited manner: autograd ignores
names on all tensors. Gradient computation is still correct but we lose the
safety that names give us.
::
>>> x = torch.randn(3, names=('D',))
>>> weight = torch.randn(3, names=('D',), requires_grad=True)
>>> loss = (x - weight).abs()
>>> grad_loss = torch.randn(3)
>>> loss.backward(grad_loss)
>>> weight.grad # Unnamed for now. Will be named in the future
tensor([-1.8107, -0.6357, 0.0783])
>>> weight.grad.zero_()
>>> grad_loss = grad_loss.refine_names('C')
>>> loss = (x - weight).abs()
# Ideally we'd check that the names of loss and grad_loss match but we don't yet.
>>> loss.backward(grad_loss)
>>> weight.grad
tensor([-1.8107, -0.6357, 0.0783])
Currently supported operations and subsystems
---------------------------------------------
Operators
^^^^^^^^^
See :ref:`name_inference_reference-doc` for a full list of the supported torch and
tensor operations. We do not yet support the following, which are not covered by the link:
- indexing, advanced indexing.
For ``torch.nn.functional`` operators, we support the following (a brief usage sketch follows this list):
- :func:`torch.nn.functional.relu`
- :func:`torch.nn.functional.softmax`
- :func:`torch.nn.functional.log_softmax`
- :func:`torch.nn.functional.tanh`
- :func:`torch.nn.functional.sigmoid`
- :func:`torch.nn.functional.dropout`
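For instance, here is a brief usage sketch on a named tensor; passing a dimension
name for ``dim`` is assumed to work here based on the general rule that names are
accepted wherever integer dimension indices are.
::
>>> import torch.nn.functional as F
>>> imgs = torch.randn(2, 3, 4, 4, names=('N', 'C', 'H', 'W'))
>>> F.softmax(imgs, dim='C').names
('N', 'C', 'H', 'W')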
Subsystems
^^^^^^^^^^
Autograd is supported, see :ref:`named_tensors_autograd-doc`.
Because gradients are currently unnamed, optimizers may work but are untested.
NN modules are currently unsupported. This can lead to the following when calling
modules with named tensor inputs:
- NN module parameters are unnamed, so outputs may be partially named.
- NN module forward passes have code that doesn't support named tensors and will
error out appropriately.
We also do not support the following subsystems, though some may work out
of the box:
- distributions
- serialization (:func:`torch.load`, :func:`torch.save`)
- multiprocessing
- JIT
- distributed
- ONNX
If any of these would help your use case, please
`search if an issue has already been filed <https://github.com/pytorch/pytorch/issues?q=is%3Aopen+is%3Aissue+label%3A%22module%3A+named+tensor%22>`_
and if not, `file one <https://github.com/pytorch/pytorch/issues/new/choose>`_.
Named tensor API reference
--------------------------
This section contains the documentation for named tensor-specific APIs.
For a comprehensive reference for how names are propagated through other PyTorch
operators, see :ref:`name_inference_reference-doc`.
.. class:: Tensor()
:noindex:
.. autoattribute:: names
.. automethod:: rename
.. automethod:: rename_
.. automethod:: refine_names
.. automethod:: align_as
.. automethod:: align_to
.. automethod:: unflatten
.. py:method:: flatten(dims, out_dim) -> Tensor
Flattens :attr:`dims` into a single dimension with name :attr:`out_dim`.
All of :attr:`dims` must be consecutive in order in the :attr:`self` tensor,
but not necessarily contiguous in memory.
Examples::
>>> imgs = torch.randn(32, 3, 128, 128, names=('N', 'C', 'H', 'W'))
>>> flat_imgs = imgs.flatten(['C', 'H', 'W'], 'features')
>>> flat_imgs.names, flat_imgs.shape
(('N', 'features'), torch.Size([32, 49152]))
.. warning::
The named tensor API is experimental and subject to change.

View File

@ -170,6 +170,7 @@ view of a storage and defines numeric operations on it.
.. automethod:: addr
.. automethod:: addr_
.. automethod:: allclose
.. automethod:: angle
.. automethod:: apply_
.. automethod:: argmax
.. automethod:: argmin
@ -206,6 +207,7 @@ view of a storage and defines numeric operations on it.
.. automethod:: clone
.. automethod:: contiguous
.. automethod:: copy_
.. automethod:: conj
.. automethod:: cos
.. automethod:: cos_
.. automethod:: cosh
@ -276,6 +278,7 @@ view of a storage and defines numeric operations on it.
.. automethod:: hardshrink
.. automethod:: histc
.. automethod:: ifft
.. automethod:: imag
.. automethod:: index_add_
.. automethod:: index_add
.. automethod:: index_copy_
@ -381,6 +384,7 @@ view of a storage and defines numeric operations on it.
.. automethod:: register_hook
.. automethod:: remainder
.. automethod:: remainder_
.. automethod:: real
.. automethod:: renorm
.. automethod:: renorm_
.. automethod:: repeat

View File

@ -188,12 +188,14 @@ Pointwise Ops
.. autofunction:: add
.. autofunction:: addcdiv
.. autofunction:: addcmul
.. autofunction:: angle
.. autofunction:: asin
.. autofunction:: atan
.. autofunction:: atan2
.. autofunction:: bitwise_not
.. autofunction:: ceil
.. autofunction:: clamp
.. autofunction:: conj
.. autofunction:: cos
.. autofunction:: cosh
.. autofunction:: div
@ -206,6 +208,7 @@ Pointwise Ops
.. autofunction:: floor
.. autofunction:: fmod
.. autofunction:: frac
.. autofunction:: imag
.. autofunction:: lerp
.. autofunction:: log
.. autofunction:: log10
@ -217,6 +220,7 @@ Pointwise Ops
.. autofunction:: mvlgamma
.. autofunction:: neg
.. autofunction:: pow
.. autofunction:: real
.. autofunction:: reciprocal
.. autofunction:: remainder
.. autofunction:: round

View File

@ -56,18 +56,6 @@ module_tests = [
desc='no_bias',
reference_fn=lambda i, p, _: torch.mm(i, p[0].t())
),
dict(
module_name='Linear',
constructor_args=(10, 8),
input_size=(0, 10),
desc='zero_batch',
),
dict(
module_name='Linear',
constructor_args=(10, 8, False),
input_size=(0, 10),
desc='zero_batch_no_bias',
),
dict(
module_name='Threshold',
constructor_args=(2., 1.),

View File

@ -161,6 +161,17 @@ TEST_F(FunctionalTest, HingeEmbeddingLoss) {
ASSERT_TRUE(output.allclose(expected));
}
TEST_F(FunctionalTest, MultiMarginLoss) {
auto weight = torch::tensor({0.3, 0.3, 0.4}, torch::kFloat);
auto input = torch::tensor({{0.2, 0.2, 0.6}, {0.1, 0.8, 0.1}, {0.9, 0.09, 0.01}}, torch::requires_grad());
auto target = torch::tensor({2, 1, 0}, torch::kLong);
auto output = F::multi_margin_loss(
input, target, MultiMarginLossOptions().margin(2).weight(weight));
auto expected = torch::tensor({0.305556}, torch::kFloat);
ASSERT_TRUE(output.allclose(expected, 1e-04));
}
TEST_F(FunctionalTest, CosineEmbeddingLoss) {
auto input1 = torch::tensor({{2, 3, 4}, {6, 2, 4}});
auto input2 = torch::tensor({{2, 3, 5}, {9, 12, 0}});
@ -254,6 +265,32 @@ TEST_F(FunctionalTest, ELU) {
}
}
TEST_F(FunctionalTest, SELU) {
{
const double scale = 1.0507009873554804934193349852946;
const double alpha = 1.6732632423543772848170429916717;
for (const auto inplace : {false, true}) {
auto input = torch::randn({5, 5});
auto expected = scale *
(torch::max(torch::zeros_like(input), input) +
torch::min(
torch::zeros_like(input), alpha * (torch::exp(input) - 1)));
auto output = F::selu(input, inplace);
ASSERT_TRUE(output.allclose(expected));
if (inplace) {
ASSERT_TRUE(input.allclose(expected));
}
}
}
{
auto input = torch::arange(0, 9, torch::kDouble).view({3, 3});
auto output = F::selu(input);
auto expected = F::selu(input, false);
ASSERT_TRUE(output.allclose(expected));
}
}
TEST_F(FunctionalTest, Hardshrink) {
const auto size = 3;
for (const auto lambda : {-4.2, -1.0, -0.42, 0.0, 0.42, 1.0, 4.2, 42.42}) {
@ -371,3 +408,14 @@ TEST_F(FunctionalTest, LogSigmoid) {
auto y_exp = torch::log(torch::ones_like(x)/(torch::ones_like(x) + torch::exp(torch::neg(x))));
ASSERT_TRUE(torch::allclose(y, y_exp, 1e-4, 1e-7));
}
TEST_F(FunctionalTest, Softmax) {
auto input = torch::arange(10, torch::kFloat).reshape({2, 5});
auto output = F::softmax(input, /*dim=*/1);
auto sum = torch::sum(torch::exp(input), 1);
for (int i = 0; i < 2; i++) {
auto expected = torch::exp(input[i]) / sum[i];
ASSERT_TRUE(torch::allclose(output[i], expected));
}
}

View File

@ -282,7 +282,7 @@ TEST_F(ModuleListTest, PrettyPrintModuleList) {
" (1): torch::nn::Conv2d(input_channels=1, output_channels=2, kernel_size=[3, 3], stride=[1, 1])\n"
" (2): torch::nn::Dropout(rate=0.5)\n"
" (3): torch::nn::BatchNorm(features=5, eps=1e-05, momentum=0.1, affine=true, stateful=true)\n"
" (4): torch::nn::Embedding(count=4, dimension=10)\n"
" (4): torch::nn::Embedding(num_embeddings=4, embedding_dim=10)\n"
" (5): torch::nn::LSTM(input_size=4, hidden_size=5, layers=1, dropout=0)\n"
")");
}

View File

@ -800,6 +800,21 @@ TEST_F(ModulesTest, EmbeddingList) {
ASSERT_EQ(y.size(2), 4);
}
TEST_F(ModulesTest, EmbeddingFromPretrained) {
auto weight = torch::tensor({{1., 2.3, 3.}, {4., 5.1, 6.3}});
Embedding embedding = torch::nn::Embedding::from_pretrained(weight);
auto input = torch::tensor({1}, torch::kLong);
ASSERT_TRUE(torch::allclose(embedding(input), torch::tensor({4.0000, 5.1000, 6.3000})));
}
TEST_F(ModulesTest, EmbeddingBagFromPretrained) {
auto weight = torch::tensor({{1., 2.3, 3.}, {4., 5.1, 6.3}});
EmbeddingBag embeddingbag = torch::nn::EmbeddingBag::from_pretrained(weight);
auto input = torch::zeros({{1, 2}}, torch::kLong);
input[0] = torch::tensor({1, 0});
ASSERT_TRUE(torch::allclose(embeddingbag(input), torch::tensor({2.5000, 3.7000, 4.6500})));
}
TEST_F(ModulesTest, Dropout) {
Dropout dropout(0.5);
torch::Tensor x = torch::ones(100, torch::requires_grad());
@ -979,6 +994,20 @@ TEST_F(ModulesTest, HingeEmbeddingLoss) {
ASSERT_EQ(input.sizes(), input.grad().sizes());
}
TEST_F(ModulesTest, MultiMarginLoss) {
auto weight = torch::tensor({0.3, 0.3, 0.4}, torch::kFloat);
MultiMarginLoss loss(MultiMarginLossOptions().margin(2).weight(weight));
auto input = torch::tensor({{0.2, 0.2, 0.6}, {0.1, 0.8, 0.1}, {0.9, 0.09, 0.01}}, torch::requires_grad());
auto target = torch::tensor({2, 1, 0}, torch::kLong);
auto output = loss->forward(input, target);
auto expected = torch::tensor({0.305556}, torch::kFloat);
auto s = output.sum();
s.backward();
ASSERT_TRUE(output.allclose(expected, 1e-04));
ASSERT_EQ(input.sizes(), input.grad().sizes());
}
TEST_F(ModulesTest, CosineEmbeddingLoss) {
CosineEmbeddingLoss cos(CosineEmbeddingLossOptions().margin(0.5));
auto input1 = torch::tensor({{2, 3, 4}, {6, 2, 4}}, torch::requires_grad());
@ -1040,6 +1069,23 @@ TEST_F(ModulesTest, ELU) {
}
}
TEST_F(ModulesTest, SELU) {
SELU model;
auto input = torch::randn({5, 5}, torch::requires_grad());
auto output = model->forward(input);
const double scale = 1.0507009873554804934193349852946;
const double alpha = 1.6732632423543772848170429916717;
auto zero = torch::zeros_like(input);
auto expected = scale *
(torch::max(zero, input) +
torch::min(zero, alpha * (torch::exp(input) - 1)));
auto s = output.sum();
s.backward();
ASSERT_EQ(s.ndimension(), 0);
ASSERT_TRUE(output.allclose(expected));
}
TEST_F(ModulesTest, Hardshrink) {
const auto size = 3;
for (const auto lambda : {-4.2, -1.0, -0.42, 0.0, 0.42, 1.0, 4.2, 42.42}) {
@ -1131,6 +1177,18 @@ TEST_F(ModulesTest, LogSigmoid) {
ASSERT_TRUE(torch::allclose(y, y_exp, 1e-4, 1e-7));
}
TEST_F(ModulesTest, Softmax) {
Softmax m(/*dim=*/1);
auto input = torch::arange(10, torch::kFloat).reshape({2, 5});
auto output = m(input);
auto sum = torch::sum(torch::exp(input), 1);
for (int i = 0; i < 2; i++) {
auto expected = torch::exp(input[i]) / sum[i];
ASSERT_TRUE(torch::allclose(output[i], expected));
}
}
TEST_F(ModulesTest, PrettyPrintIdentity) {
ASSERT_EQ(c10::str(Identity()), "torch::nn::Identity()");
}
@ -1290,8 +1348,29 @@ TEST_F(ModulesTest, PrettyPrintBatchNorm) {
TEST_F(ModulesTest, PrettyPrintEmbedding) {
ASSERT_EQ(
c10::str(Embedding(10, 2)),
"torch::nn::Embedding(count=10, dimension=2)");
c10::str(Embedding(EmbeddingOptions(10, 2))),
"torch::nn::Embedding(num_embeddings=10, embedding_dim=2)");
ASSERT_EQ(
c10::str(Embedding(EmbeddingOptions(10, 2).padding_idx(3).max_norm(2))),
"torch::nn::Embedding(num_embeddings=10, embedding_dim=2, padding_idx=3, max_norm=2)");
ASSERT_EQ(
c10::str(Embedding(EmbeddingOptions(10, 2).padding_idx(3).max_norm(2).norm_type(2.5).scale_grad_by_freq(true).sparse(true))),
"torch::nn::Embedding(num_embeddings=10, embedding_dim=2, padding_idx=3, max_norm=2, norm_type=2.5, scale_grad_by_freq=true, sparse=true)");
}
TEST_F(ModulesTest, PrettyPrintEmbeddingBag) {
ASSERT_EQ(
c10::str(EmbeddingBag(EmbeddingBagOptions(10, 2))),
"torch::nn::EmbeddingBag(num_embeddings=10, embedding_dim=2)");
ASSERT_EQ(
c10::str(EmbeddingBag(EmbeddingBagOptions(10, 2).max_norm(2))),
"torch::nn::EmbeddingBag(num_embeddings=10, embedding_dim=2, max_norm=2)");
ASSERT_EQ(
c10::str(EmbeddingBag(EmbeddingBagOptions(10, 2).max_norm(2).norm_type(2.5).scale_grad_by_freq(true).sparse(true))),
"torch::nn::EmbeddingBag(num_embeddings=10, embedding_dim=2, max_norm=2, norm_type=2.5, scale_grad_by_freq=true, sparse=true)");
ASSERT_EQ(
c10::str(EmbeddingBag(EmbeddingBagOptions(10, 2).max_norm(2).norm_type(2.5).scale_grad_by_freq(true).sparse(true).mode("sum"))),
"torch::nn::EmbeddingBag(num_embeddings=10, embedding_dim=2, max_norm=2, norm_type=2.5, scale_grad_by_freq=true, sparse=true, mode=sum)");
}
TEST_F(ModulesTest, PrettyPrintHingeEmbeddingLoss) {
@ -1339,7 +1418,7 @@ TEST_F(ModulesTest, PrettyPrintNestedModel) {
TestModule()
: torch::nn::Module("TestModule"),
fc(register_module("fc", torch::nn::Linear(4, 5))),
table(register_module("table", torch::nn::Embedding(10, 2))),
table(register_module("table", torch::nn::Embedding(EmbeddingOptions(10, 2)))),
inner(register_module("inner", std::make_shared<InnerTestModule>())) {
}
@ -1352,10 +1431,10 @@ TEST_F(ModulesTest, PrettyPrintNestedModel) {
c10::str(TestModule{}),
"TestModule(\n"
" (fc): torch::nn::Linear(in=4, out=5, with_bias=true)\n"
" (table): torch::nn::Embedding(count=10, dimension=2)\n"
" (table): torch::nn::Embedding(num_embeddings=10, embedding_dim=2)\n"
" (inner): InnerTestModule(\n"
" (fc): torch::nn::Linear(in=3, out=4, with_bias=true)\n"
" (table): torch::nn::Embedding(count=10, dimension=2)\n"
" (table): torch::nn::Embedding(num_embeddings=10, embedding_dim=2)\n"
" )\n"
")");
}
@ -1366,6 +1445,12 @@ TEST_F(ModulesTest, PrettyPrintELU) {
"torch::nn::ELU(alpha=42.42, inplace=true)");
}
TEST_F(ModulesTest, PrettyPrintSELU) {
ASSERT_EQ(c10::str(SELU()), "torch::nn::SELU()");
ASSERT_EQ(c10::str(SELU(SELUOptions().inplace(true))),
"torch::nn::SELU(inplace=true)");
}
TEST_F(ModulesTest, PrettyPrintHardshrink) {
ASSERT_EQ(c10::str(Hardshrink()), "torch::nn::Hardshrink(0.5)");
ASSERT_EQ(c10::str(Hardshrink(HardshrinkOptions().lambda(42.42))),
@ -1391,3 +1476,7 @@ TEST_F(ModulesTest, PrettyPrintLeakyReLU) {
TEST_F(ModulesTest, PrettyPrintLogSigmoid) {
ASSERT_EQ(c10::str(LogSigmoid()), "torch::nn::LogSigmoid()");
}
TEST_F(ModulesTest, PrettyPrintSoftmax) {
ASSERT_EQ(c10::str(Softmax(SoftmaxOptions(1))), "torch::nn::Softmax(dim=1)");
}

View File

@ -393,7 +393,7 @@ TEST_F(SequentialTest, PrettyPrintSequential) {
" (1): torch::nn::Conv2d(input_channels=1, output_channels=2, kernel_size=[3, 3], stride=[1, 1])\n"
" (2): torch::nn::Dropout(rate=0.5)\n"
" (3): torch::nn::BatchNorm(features=5, eps=1e-05, momentum=0.1, affine=true, stateful=true)\n"
" (4): torch::nn::Embedding(count=4, dimension=10)\n"
" (4): torch::nn::Embedding(num_embeddings=4, embedding_dim=10)\n"
" (5): torch::nn::LSTM(input_size=4, hidden_size=5, layers=1, dropout=0)\n"
")");
@ -412,7 +412,7 @@ TEST_F(SequentialTest, PrettyPrintSequential) {
" (conv2d): torch::nn::Conv2d(input_channels=1, output_channels=2, kernel_size=[3, 3], stride=[1, 1])\n"
" (dropout): torch::nn::Dropout(rate=0.5)\n"
" (batchnorm): torch::nn::BatchNorm(features=5, eps=1e-05, momentum=0.1, affine=true, stateful=true)\n"
" (embedding): torch::nn::Embedding(count=4, dimension=10)\n"
" (embedding): torch::nn::Embedding(num_embeddings=4, embedding_dim=10)\n"
" (lstm): torch::nn::LSTM(input_size=4, hidden_size=5, layers=1, dropout=0)\n"
")");
}

View File

@ -1,4 +1,3 @@
#!/usr/bin/env python3
from __future__ import absolute_import, division, print_function, unicode_literals
import concurrent.futures

0
test/test_dist_autograd_fork.py Normal file → Executable file
View File

0
test/test_dist_autograd_spawn.py Normal file → Executable file
View File

View File

@ -75,8 +75,13 @@ class TestDocCoverage(unittest.TestCase):
def test_tensor(self):
in_rst = self.parse_rst('tensors.rst', r2)
whitelist = {
'names', 'unflatten', 'align_as', 'rename_', 'refine_names', 'align_to',
'has_names', 'rename',
}
classes = [torch.FloatTensor, torch.LongTensor, torch.ByteTensor]
has_docstring = set(x for c in classes for x in dir(c) if not x.startswith('_') and getattr(c, x).__doc__)
has_docstring -= whitelist
self.assertEqual(
has_docstring, in_rst,
textwrap.dedent('''

View File

@ -3400,6 +3400,7 @@ def foo(x):
cu.define(full)
def test_namedtuple_python(self):
global MyTuple, MyMod # see [local resolution in python]
MyTuple = namedtuple('MyTuple', ['a'])
@torch.jit.unused
@ -15000,6 +15001,7 @@ a")
self.checkScript(fn, ())
def test_named_tuple_redefine(self):
global _1, _2
_1 = namedtuple('GoogLeNetOutputs', ['logits', 'aux_logits2', 'aux_logits1'])
_2 = namedtuple('GoogLeNetOutputs', ['different'])
@ -15010,6 +15012,7 @@ a")
return x
def test_named_tuple_py2(self):
global _GoogLeNetOutputs # see [local resolution in python]
_GoogLeNetOutputs = namedtuple('GoogLeNetOutputs', ['logits', 'aux_logits2', 'aux_logits1'])
@torch.jit.script
@ -15024,6 +15027,7 @@ a")
self.assertEqual(out.aux_logits1, vals[2])
def test_named_tuple_good_error(self):
global _GoogLeNetOutputs # see [local resolution in python]
_GoogLeNetOutputs = namedtuple('GoogLeNetOutputs', ['logits', 'aux_logits2', 'aux_logits1'])
@torch.jit.script
@ -19370,6 +19374,7 @@ class TestClassType(JitTestCase):
self.attr = x
def test_class_type_as_param(self):
global FooTest # see [local resolution in python]
@torch.jit.script # noqa: B903
class FooTest(object):
def __init__(self, x):
@ -19512,6 +19517,7 @@ class TestClassType(JitTestCase):
self.assertEqual(2 * input, output)
def test_python_interop(self):
global Foo # see [local resolution in python]
@torch.jit.script # noqa: B903
class Foo(object):
def __init__(self, x, y):
@ -19538,6 +19544,7 @@ class TestClassType(JitTestCase):
self.assertEqual(y, f2.y)
def test_class_specialization(self):
global Foo # see [local resolution in python]
@torch.jit.script # noqa: B903
class Foo(object):
def __init__(self, x, y):
@ -19562,6 +19569,7 @@ class TestClassType(JitTestCase):
FileCheck().check_count("Double(*, *) = prim::GetAttr", 4).run(graphstr)
def test_class_sorting(self):
global Foo # see [local resolution in python]
@torch.jit.script # noqa: B903
class Foo(object):
def __init__(self, x):
@ -19675,6 +19683,7 @@ class TestClassType(JitTestCase):
self.assertEqual(3 * input, output)
def test_interface(self):
global Foo, Bar, OneTwo, OneTwoThree, OneTwoWrong, NotMember, NotMember2
@torch.jit.script
class Foo(object):
def __init__(self):
@ -19836,6 +19845,7 @@ class TestClassType(JitTestCase):
# NamedTuple inheritance errors
def test_overloaded_fn(self):
global Foo, MyClass # see [local resolution in python]
@torch.jit.script
class Foo(object):
def __init__(self, x):
@ -19991,6 +20001,7 @@ class TestClassType(JitTestCase):
return Foo(torch.tensor(1)) + Foo(torch.tensor(1))
def test_cast_overloads(self):
global Foo # see [local resolution in python]
@torch.jit.script
class Foo(object):
def __init__(self, val):

View File

@ -110,6 +110,8 @@ class TestScriptPy3(JitTestCase):
FileCheck().check_not('TupleConstruct').run(foo.graph)
def test_named_tuple_type_annotation(self):
global MyCoolNamedTuple # see [local resolution in python]
class MyCoolNamedTuple(NamedTuple):
a : int
b : float

View File

@ -901,8 +901,6 @@ class TestQuantizedOps(TestCase):
self.assertEqual(Y, qY.dequantize())
"""Tests the correctness of the quantized equal op."""
@unittest.skip("temporarily disable until failures are fixed. " +
"See https://github.com/pytorch/pytorch/issues/26279")
@given(X=hu.tensor(shapes=hu.array_shapes(1, 5, 1, 5),
qparams=hu.qparams()),
X2=hu.tensor(shapes=hu.array_shapes(1, 5, 1, 5),
@ -949,6 +947,8 @@ class TestQuantizedOps(TestCase):
return False
if qX.shape != qX2.shape:
return False
if qX.dtype != qX2.dtype:
return False
if qX.qscheme() == torch.per_tensor_affine:
if qX.q_scale() != qX2.q_scale():
return False

0
test/test_rpc_fork.py Normal file → Executable file
View File

0
test/test_rpc_spawn.py Normal file → Executable file
View File

View File

@ -245,14 +245,6 @@ class _TestTorchMixin(object):
'to_dense',
'sparse_resize_',
'sparse_resize_and_clear_',
'align_to', # BUILD_NAMEDTENSOR only
'align_as', # BUILD_NAMEDTENSOR only
'rename', # BUILD_NAMEDTENSOR only
'rename_', # BUILD_NAMEDTENSOR only
'has_names', # BUILD_NAMEDTENSOR only
'rename', # BUILD_NAMEDTENSOR only
'refine_names', # BUILD_NAMEDTENSOR only
'unflatten', # BUILD_NAMEDTENSOR only
)
test_namespace(torch.nn)
test_namespace(torch.nn.functional, 'assert_int_or_pair', 'feature_alpha_dropout')

View File

@ -61,7 +61,15 @@ def get_all_examples():
This function grabs (hopefully all) examples from the torch documentation
strings and puts them in one nonsensical module returned as a string.
"""
blacklist = {"_np"}
blacklist = {
"_np",
"refine_names",
"rename",
"names",
"align_as",
"align_to",
"unflatten",
}
allexamples = ""
example_file_lines = [

0
tools/amd_build/build_amd.py Normal file → Executable file
View File

View File

@ -430,6 +430,23 @@ static PyObject * THPVariable_get_device(PyObject* self_, PyObject* args, PyObje
END_HANDLE_TH_ERRORS
}
static PyObject * THPVariable_numel(PyObject* self_, PyObject* args, PyObject* kwargs)
{
HANDLE_TH_ERRORS
static PythonArgParser parser({
"numel(Tensor input)",
}, /*traceable=*/false);
ParsedArgs<1> parsed_args;
auto r = parser.parse(args, kwargs, parsed_args);
if (r.idx == 0) {
return wrap(r.tensor(0).numel());
}
Py_RETURN_NONE;
END_HANDLE_TH_ERRORS
}
// generated methods start here
${py_methods}
@ -448,6 +465,7 @@ static PyMethodDef torch_functions[] = {
{"spmm", (PyCFunction)(void(*)(void))THPVariable_mm, METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL},
{"tensor", (PyCFunction)(void(*)(void))THPVariable_tensor, METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL},
{"get_device", (PyCFunction)(void(*)(void))THPVariable_get_device, METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL},
{"numel", (PyCFunction)(void(*)(void))THPVariable_numel, METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL},
${py_method_defs}
{NULL}
};

View File

@ -181,6 +181,14 @@ static PyObject * THPVariable_dim(PyObject* self, PyObject* args)
END_HANDLE_TH_ERRORS
}
static PyObject * THPVariable_numel(PyObject* self, PyObject* args)
{
HANDLE_TH_ERRORS
auto& self_ = reinterpret_cast<THPVariable*>(self)->cdata;
return THPUtils_packInt64(self_.numel());
END_HANDLE_TH_ERRORS
}
static Tensor dispatch_contiguous(const Tensor & self, at::MemoryFormat memory_format) {
AutoNoGIL no_gil;
OptionalDeviceGuard device_guard(device_of(self));
@ -781,6 +789,7 @@ PyMethodDef variable_methods[] = {
{"new_ones", (PyCFunction)(void(*)(void))THPVariable_new_ones, METH_VARARGS | METH_KEYWORDS, NULL},
{"new_tensor", (PyCFunction)(void(*)(void))THPVariable_new_tensor, METH_VARARGS | METH_KEYWORDS, NULL},
{"nonzero", (PyCFunction)(void(*)(void))THPVariable_nonzero, METH_VARARGS | METH_KEYWORDS, NULL},
{"numel", (PyCFunction)THPVariable_numel, METH_NOARGS, NULL},
{"numpy", (PyCFunction)THPVariable_numpy, METH_NOARGS, NULL},
{"record_stream", (PyCFunction)THPVariable_record_stream, METH_O, NULL},
{"requires_grad_", (PyCFunction)(void(*)(void))THPVariable_requires_grad_, METH_VARARGS | METH_KEYWORDS, NULL},

0
tools/clang_format.py Normal file → Executable file
View File

0
tools/clang_tidy.py Normal file → Executable file
View File

View File

@ -375,6 +375,7 @@ def gen_jit_dispatch(declarations, out, template_path, disable_autograd=False):
return [sorted(g, key=declkey) for g in grouped_decls]
# We need to add methods implemented manually in TensorImpl
# TODO: This seems to claim sizes() returns an int64_t. Really?
tensor_impl_methods = [{
'name': name,
'api_name': name,
@ -382,7 +383,7 @@ def gen_jit_dispatch(declarations, out, template_path, disable_autograd=False):
'method_of': ['Tensor'],
'arguments': [{'name': 'self', 'simple_type': 'Tensor'}],
'returns': [{'name': 'result', 'type': 'int64_t', 'dynamic_type': 'int64_t', 'simple_type': 'int64_t'}],
} for name in ['sizes', 'strides', 'dim']]
} for name in ['sizes', 'strides', 'dim', 'numel']]
aten_decls = load_aten_declarations(declarations) + tensor_impl_methods
jit_decls = [d for d in aten_decls if is_jit_op(d)]

View File

@ -416,6 +416,7 @@ def gen_pyi(declarations_path, out):
'set_flush_denormal': ['def set_flush_denormal(mode: _bool) -> _bool: ...'],
'get_default_dtype': ['def get_default_dtype() -> _dtype: ...'],
'from_numpy': ['def from_numpy(ndarray) -> Tensor: ...'],
'numel': ['def numel(self: Tensor) -> _int: ...'],
'clamp': ["def clamp(self, min: _float=-inf, max: _float=inf,"
" *, out: Optional[Tensor]=None) -> Tensor: ..."],
'as_tensor': ["def as_tensor(data: Any, dtype: _dtype=None, device: Optional[_device]=None) -> Tensor: ..."],
@ -501,6 +502,7 @@ def gen_pyi(declarations_path, out):
'requires_grad_': ['def requires_grad_(self, mode: _bool=True) -> Tensor: ...'],
'element_size': ['def element_size(self) -> _int: ...'],
'dim': ['def dim(self) -> _int: ...'],
'numel': ['def numel(self) -> _int: ...'],
'ndimension': ['def ndimension(self) -> _int: ...'],
'nelement': ['def nelement(self) -> _int: ...'],
'cuda': ['def cuda(self, device: Optional[_device]=None, non_blocking: _bool=False) -> Tensor: ...'],

View File

@ -16,25 +16,25 @@ from torch._utils_internal import get_source_lines_and_file
boolean_dispatched = weakref.WeakKeyDictionary() # noqa: T484
def createResolutionCallback(frames_up=0):
def createResolutionCallbackFromFrame(frames_up=0):
"""
Creates a function which, given a string variable name,
returns the value of the variable in the scope of the caller of
the function which called createResolutionCallback (by default).
the function which called createResolutionCallbackFromFrame (by default).
This is used to enable access in-scope Python variables inside
TorchScript fragments.
frames_up is number of additional frames to go up on the stack.
The default value is 0, which correspond to the frame of the caller
of createResolutionCallback. Also for example, if frames_up is set
to 1, then the frame of the caller's caller of createResolutionCallback
of createResolutionCallbackFromFrame. Also for example, if frames_up is set
to 1, then the frame of the caller's caller of createResolutionCallbackFromFrame
will be taken.
For example, the following program prints 2::
def bar():
cb = createResolutionCallback(1)
cb = createResolutionCallbackFromFrame(1)
print(cb("foo"))
def baz():
@ -75,6 +75,48 @@ def get_closure(fn):
return captures
# [local resolution in python]
# Depending on where a variable is defined, and where it is used, we may
# or may not be able to recover its value when recursively compiling a
# script function. Remember in the general case, a module or function is
# first defined and then later scripted. This means we do not have a
# chance to capture the active frames when the function is defined. Hence any
# name resolution has to happen later on the created closure. The way
# python captures type annotations restricts what we can recover. The
# follow example illustrates the different cases:
#
# class MyGlobalClass:
# ...
# def my_local_scope():
# @torch.jit.script
# class MyClass:
# ...
# @torch.jit.script
# class MyClassUsedAsVar:
# ...
# def eg(x: MyClass, y: MyGlobalClass):
# a_local_capture : Foo
# return MyClassUsedAsVar(x)
#
# MyGlobalClass is defined in the __globals__ dictionary of function
# 'eg', so it is always recoverable. my_local_scope introduces a new local
# variable scope in the function. Classes defined here are only visible as
# local variables. For the case of MyClassUsedAsVar, it is captured
# because it is used as a variable inside the body of the function, and we
# can resolve it using the captures returned from `get_closure`. However,
# the type annotations are not captured by the closure. In Python
# 3.0--3.9, the _value_ of MyClass and MyGlobalClass will be available as
# annotations on `eg`, but starting in Python 4.0, they will be represented as
# strings and no longer present. Furthermore, since the body of `eg` does
# not reference those names, they do not appear in the list of closed over
# variables. In Python 2.x, type annotations are in comments, leading to a
# similar situation where their definitions are not available. We anticipate
# that most users will not run into this issue because their modules and
# functions will be defined at a global scope like MyGlobalClass. In cases
# where they are not, it is possible to work around issues by declaring the
# values global in the function.
def createResolutionCallbackFromClosure(fn):
"""
@ -178,11 +220,12 @@ class FunctionModifiers(object):
def export(fn):
"""
This decorator indicates that a method is used as an entry point into a
``ScriptModule`` and should be compiled. ``forward`` implicitly is assumbed to be an
entry point, so it does not need this decorator. Functions and methods
called from ``forward`` are compiled as they are seen, so they do not need
this decorator either.
This decorator indicates that a method on an ``nn.Module`` is used as an entry point into a
:class:`ScriptModule` and should be compiled.
``forward`` implicitly is assumed to be an entry point, so it does not need this decorator.
Functions and methods called from ``forward`` are compiled as they are seen
by the compiler, so they do not need this decorator either.
Example (using ``@torch.jit.export`` on a method):

View File

@ -282,6 +282,53 @@ addr_(beta=1, alpha=1, vec1, vec2) -> Tensor
In-place version of :meth:`~Tensor.addr`
""")
add_docstr_all('align_as',
r"""
align_as(other) -> Tensor
Permutes the dimensions of the :attr:`self` tensor to match the dimension order
in the :attr:`other` tensor, adding size-one dims for any new names.
This operation is useful for explicit broadcasting by names (see examples).
All of the dims of :attr:`self` must be named in order to use this method.
The resulting tensor is a view on the original tensor.
All dimension names of :attr:`self` must be present in ``other.names``.
:attr:`other` may contain named dimensions that are not in ``self.names``;
the output tensor has a size-one dimension for each of those new names.
To align a tensor to a specific order, use :meth:`~Tensor.align_to`.
Examples::
# Example 1: Applying a mask
>>> mask = torch.randint(2, [127, 128], dtype=torch.bool).refine_names('W', 'H')
>>> imgs = torch.randn(32, 128, 127, 3, names=('N', 'H', 'W', 'C'))
>>> imgs.masked_fill_(mask.align_as(imgs), 0)
# Example 2: Applying a per-channel-scale
def scale_channels(input, scale):
scale = scale.refine_names('C')
return input * scale.align_as(input)
>>> num_channels = 3
>>> scale = torch.randn(num_channels, names=('C',))
>>> imgs = torch.rand(32, 128, 128, num_channels, names=('N', 'H', 'W', 'C'))
>>> more_imgs = torch.rand(32, num_channels, 128, 128, names=('N', 'C', 'H', 'W'))
>>> videos = torch.randn(3, num_channels, 128, 128, 128, names=('N', 'C', 'H', 'W', 'D'))
# scale_channels is agnostic to the dimension order of the input
>>> scale_channels(imgs, scale)
>>> scale_channels(more_imgs, scale)
>>> scale_channels(videos, scale)
.. warning::
The named tensor API is experimental and subject to change.
""")
add_docstr_all('all',
r"""
.. function:: all() -> bool
@ -332,6 +379,13 @@ allclose(other, rtol=1e-05, atol=1e-08, equal_nan=False) -> Tensor
See :func:`torch.allclose`
""")
add_docstr_all('angle',
r"""
angle() -> Tensor
See :func:`torch.angle`
""")
add_docstr_all('any',
r"""
.. function:: any() -> bool
@ -637,6 +691,13 @@ Args:
cases, this argument has no effect.
""")
add_docstr_all('conj',
r"""
conj() -> Tensor
See :func:`torch.conj`
""")
add_docstr_all('cos',
r"""
cos() -> Tensor
@ -999,6 +1060,13 @@ flip(dims) -> Tensor
See :func:`torch.flip`
""")
add_docstr_all('real',
r"""
real() -> Tensor
See :func:`torch.real`
""")
add_docstr_all('roll',
r"""
roll(shifts, dims) -> Tensor
@ -1095,6 +1163,13 @@ ger(vec2) -> Tensor
See :func:`torch.ger`
""")
add_docstr_all('imag',
r"""
imag() -> Tensor
See :func:`torch.imag`
""")
add_docstr_all('indices',
r"""
indices() -> Tensor
@ -1154,6 +1229,11 @@ gt_(other) -> Tensor
In-place version of :meth:`~Tensor.gt`
""")
add_docstr_all('has_names',
r"""
Is ``True`` if any of this tensor's dimensions are named. Otherwise, is ``False``.
""")
add_docstr_all('hardshrink',
r"""
hardshrink(lambd=0.5) -> Tensor
@ -3320,6 +3400,24 @@ Example::
""")
add_docstr_all('names',
r"""
Stores names for each of this tensor's dimensions.
``names[idx]`` corresponds to the name of tensor dimension ``idx``.
Names are either a string if the dimension is named or ``None`` if the
dimension is unnamed.
Dimension names may contain characters or underscore. Furthermore, a dimension
name must be a valid Python variable name (i.e., does not start with underscore).
Tensors may not have two named dimensions with the same name.
.. warning::
The named tensor API is experimental and subject to change.
""")
add_docstr_all('is_cuda',
r"""
Is ``True`` if the Tensor is stored on the GPU, ``False`` otherwise.

View File

@ -473,6 +473,25 @@ Example::
True
""")
add_docstr(torch.angle,
r"""
angle(input, out=None) -> Tensor
Computes the element-wise angle (in radians) of the given :attr:`input` tensor.
.. math::
\text{out}_{i} = angle(\text{input}_{i})
""" + r"""
Args:
{input}
{out}
Example::
>>> torch.angle(torch.tensor([-1 + 1j, -2 + 2j, 3 - 3j]))*180/3.14159
tensor([ 135., 135., -45.])
""".format(**common_args))
add_docstr(torch.as_strided,
r"""
as_strided(input, size, stride, storage_offset=0) -> Tensor
@ -953,6 +972,25 @@ Example::
tensor([-0., -1., -1., 1.])
""".format(**common_args))
add_docstr(torch.real,
r"""
real(input, out=None) -> Tensor
Computes the element-wise real part of the given :attr:`input` tensor.
.. math::
\text{out}_{i} = real(\text{input}_{i})
""" + r"""
Args:
{input}
{out}
Example::
>>> torch.real(torch.tensor([-1 + 1j, -2 + 2j, 3 - 3j]))
tensor([ -1, -2, 3])
""".format(**common_args))
add_docstr(torch.reciprocal,
r"""
reciprocal(input, out=None) -> Tensor
@ -1205,6 +1243,25 @@ Example::
tensor([ 0.5000, -0.4702, -0.4599, 0.5000])
""".format(**common_args))
add_docstr(torch.conj,
r"""
conj(input, out=None) -> Tensor
Computes the element-wise conjugate of the given :attr:`input` tensor.
.. math::
\text{out}_{i} = conj(\text{input}_{i})
""" + r"""
Args:
{input}
{out}
Example::
>>> torch.conj(torch.tensor([-1 + 1j, -2 + 2j, 3 - 3j]))
tensor([-1 - 1j, -2 - 2j, 3 + 3j])
""".format(**common_args))
add_docstr(torch.cos,
r"""
cos(input, out=None) -> Tensor
@ -2260,6 +2317,25 @@ Example::
tensor([ 0., 2., 1., 0.])
""".format(**common_args))
add_docstr(torch.imag,
r"""
imag(input, out=None) -> Tensor
Computes the element-wise imaginary part of the given :attr:`input` tensor.
.. math::
\text{out}_{i} = imag(\text{input}_{i})
""" + r"""
Args:
{input}
{out}
Example::
>>> torch.imag(torch.tensor([-1 + 1j, -2 + 2j, 3 - 3j]))
tensor([ 1, 2, -3])
""".format(**common_args))
add_docstr(torch.index_select,
r"""
index_select(input, dim, index, out=None) -> Tensor

View File

@ -123,7 +123,7 @@ static PyObject* THPDTypeInfo_bits(THPDTypeInfo* self, void*) {
}
static PyObject* THPFInfo_eps(THPFInfo* self, void*) {
return AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
return AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(at::kHalf,
self->type, "epsilon", [] {
return PyFloat_FromDouble(
std::numeric_limits<
@ -132,14 +132,14 @@ static PyObject* THPFInfo_eps(THPFInfo* self, void*) {
}
static PyObject* THPFInfo_max(THPFInfo* self, void*) {
return AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self->type, "max", [] {
return AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(at::kHalf, self->type, "max", [] {
return PyFloat_FromDouble(
std::numeric_limits<at::scalar_value_type<scalar_t>::type>::max());
});
}
static PyObject* THPFInfo_min(THPFInfo* self, void*) {
return AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self->type, "min", [] {
return AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(at::kHalf, self->type, "min", [] {
return PyFloat_FromDouble(
std::numeric_limits<at::scalar_value_type<scalar_t>::type>::lowest());
});
@ -170,7 +170,7 @@ static PyObject* THPIInfo_min(THPFInfo* self, void*) {
}
static PyObject* THPFInfo_tiny(THPFInfo* self, void*) {
return AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self->type, "min", [] {
return AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(at::kHalf, self->type, "min", [] {
return PyFloat_FromDouble(
std::numeric_limits<at::scalar_value_type<scalar_t>::type>::min());
});

View File

@ -7,7 +7,7 @@ namespace torch {
namespace nn{
namespace functional {
inline Tensor elu(Tensor& input, const ELUOptions& options) {
inline Tensor elu(Tensor& input, const ELUOptions& options = {}) {
if (options.inplace()) {
return torch::elu_(input, options.alpha());
} else {
@ -15,12 +15,20 @@ inline Tensor elu(Tensor& input, const ELUOptions& options) {
}
}
inline Tensor selu(Tensor& input, const SELUOptions& options = {}) {
if (options.inplace()) {
return torch::selu_(input);
} else {
return torch::selu(input);
}
}
inline Tensor hardshrink(const Tensor& input,
const HardshrinkOptions& options) {
const HardshrinkOptions& options = {}) {
return torch::hardshrink(input, options.lambda());
}
inline Tensor hardtanh(Tensor& input, const HardtanhOptions& options) {
inline Tensor hardtanh(Tensor& input, const HardtanhOptions& options = {}) {
if (options.inplace()) {
return torch::hardtanh_(input, options.min_val(), options.max_val());
} else {
@ -28,7 +36,7 @@ inline Tensor hardtanh(Tensor& input, const HardtanhOptions& options) {
}
}
inline Tensor leaky_relu(Tensor& input, const LeakyReLUOptions& options) {
inline Tensor leaky_relu(Tensor& input, const LeakyReLUOptions& options = {}) {
if (options.inplace()) {
return torch::leaky_relu_(input, options.negative_slope());
} else {
@ -40,6 +48,20 @@ inline Tensor logsigmoid(const Tensor& input) {
return torch::log_sigmoid(input);
}
inline Tensor softmax(const Tensor& input, const SoftmaxOptions& options,
c10::optional<torch::Dtype> dtype = c10::nullopt) {
int64_t dim = options.dim();
Tensor ret;
if (dtype == c10::nullopt) {
ret = input.softmax(dim);
} else {
ret = input.softmax(dim, dtype);
}
return ret;
}
} // namespace functional
} // namespace nn
} // namespace torch
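With the `= {}` defaults and the new `selu`/`softmax` wrappers, the functional API can now be called with just a tensor for the common case. A hedged usage sketch; it assumes `<torch/torch.h>` pulls this functional header in at this commit (otherwise include it directly) and that the option types live in `torch::nn` as elsewhere in the C++ frontend:

```cpp
#include <torch/torch.h>
#include <iostream>

namespace F = torch::nn::functional;

int main() {
  auto x = torch::randn({2, 4});
  auto a = F::elu(x);          // ELUOptions{}: alpha = 1.0, not in-place
  auto b = F::selu(x);         // new functional wrapper around torch::selu
  auto c = F::hardtanh(x);     // HardtanhOptions{} defaults
  auto d = F::leaky_relu(x);   // LeakyReLUOptions{} defaults
  // dim has no default, so the Softmax options must still be passed explicitly.
  auto e = F::softmax(x, torch::nn::SoftmaxOptions(/*dim=*/1));
  std::cout << e.sum(1) << std::endl;  // each row of the softmax output sums to ~1
  return 0;
}
```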

View File

@ -17,6 +17,25 @@ inline Tensor hinge_embedding_loss(
options.reduction());
}
inline Tensor multi_margin_loss(
const Tensor& input,
const Tensor& target,
const MultiMarginLossOptions& options = {}) {
TORCH_CHECK(options.p() == 1 || options.p() == 2, "only p == 1 and p == 2 supported");
if (options.weight().defined()) {
TORCH_CHECK(options.weight().dim() == 1, "weight must be one-dimensional");
}
return torch::multi_margin_loss(
input,
target,
options.p(),
options.margin(),
options.weight(),
options.reduction()
);
}
inline Tensor cosine_embedding_loss(
const Tensor& input1,
const Tensor& input2,
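The new `F::multi_margin_loss` simply unpacks `MultiMarginLossOptions` into the existing `torch::multi_margin_loss` op after the two checks above. A rough sketch of a call with default options, assuming the defaults mirror the Python API (`p = 1`, `margin = 1.0`, mean reduction):

```cpp
#include <torch/torch.h>
#include <iostream>

namespace F = torch::nn::functional;

int main() {
  auto input  = torch::randn({3, 5}, torch::requires_grad());
  auto target = torch::randint(/*low=*/0, /*high=*/5, {3}, torch::kLong);
  auto loss = F::multi_margin_loss(input, target);  // MultiMarginLossOptions{} (assumed p = 1)
  loss.backward();
  std::cout << loss.item<float>() << std::endl;
  return 0;
}
```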

View File

@ -16,8 +16,7 @@ namespace nn {
/// about the exact behavior of this module.
class TORCH_API ELUImpl : public torch::nn::Cloneable<ELUImpl> {
public:
ELUImpl() : ELUImpl(ELUOptions()) {}
explicit ELUImpl(const ELUOptions& options_);
explicit ELUImpl(const ELUOptions& options_ = {});
Tensor forward(Tensor& input);
@ -32,6 +31,28 @@ class TORCH_API ELUImpl : public torch::nn::Cloneable<ELUImpl> {
TORCH_MODULE(ELU);
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ SELU ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/// Applies the selu function element-wise.
/// See https://pytorch.org/docs/master/nn.html#torch.nn.SELU to learn
/// about the exact behavior of this module.
class TORCH_API SELUImpl : public torch::nn::Cloneable<SELUImpl> {
public:
explicit SELUImpl(const SELUOptions& options_ = {});
Tensor forward(Tensor& input);
void reset() override;
/// Pretty prints the `SELU` module into the given `stream`.
void pretty_print(std::ostream& stream) const override;
/// The options with which this `Module` was constructed.
SELUOptions options;
};
TORCH_MODULE(SELU);
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Hardshrink ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/// Applies the hard shrinkage function element-wise.
@ -39,8 +60,7 @@ TORCH_MODULE(ELU);
/// about the exact behavior of this module.
class TORCH_API HardshrinkImpl : public torch::nn::Cloneable<HardshrinkImpl> {
public:
HardshrinkImpl() : HardshrinkImpl(HardshrinkOptions()) {}
explicit HardshrinkImpl(const HardshrinkOptions& options_);
explicit HardshrinkImpl(const HardshrinkOptions& options_ = {});
Tensor forward(const Tensor& input);
@ -62,8 +82,7 @@ TORCH_MODULE(Hardshrink);
/// about the exact behavior of this module.
class TORCH_API HardtanhImpl : public torch::nn::Cloneable<HardtanhImpl> {
public:
HardtanhImpl() : HardtanhImpl(HardtanhOptions()) {}
explicit HardtanhImpl(const HardtanhOptions& options_);
explicit HardtanhImpl(const HardtanhOptions& options_ = {});
Tensor forward(Tensor& input);
@ -85,8 +104,7 @@ TORCH_MODULE(Hardtanh);
/// about the exact behavior of this module.
class TORCH_API LeakyReLUImpl : public torch::nn::Cloneable<LeakyReLUImpl> {
public:
LeakyReLUImpl() : LeakyReLUImpl(LeakyReLUOptions()) {}
explicit LeakyReLUImpl(const LeakyReLUOptions& options_);
explicit LeakyReLUImpl(const LeakyReLUOptions& options_ = {});
Tensor forward(Tensor& input);
@ -108,8 +126,6 @@ TORCH_MODULE(LeakyReLU);
/// about the exact behavior of this module.
class TORCH_API LogSigmoidImpl : public torch::nn::Cloneable<LogSigmoidImpl> {
public:
LogSigmoidImpl() {}
Tensor forward(const Tensor& input);
void reset() override;
@ -120,5 +136,28 @@ class TORCH_API LogSigmoidImpl : public torch::nn::Cloneable<LogSigmoidImpl> {
TORCH_MODULE(LogSigmoid);
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Softmax ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/// Applies the Softmax function.
/// See https://pytorch.org/docs/master/nn.html#torch.nn.Softmax to learn
/// about the exact behavior of this module.
class TORCH_API SoftmaxImpl : public torch::nn::Cloneable<SoftmaxImpl> {
public:
explicit SoftmaxImpl(int64_t dim) : SoftmaxImpl(SoftmaxOptions(dim)) {}
explicit SoftmaxImpl(const SoftmaxOptions& options_);
Tensor forward(const Tensor& input);
void reset() override;
/// Pretty prints the `Softmax` module into the given `stream`.
void pretty_print(std::ostream& stream) const override;
SoftmaxOptions options;
};
TORCH_MODULE(Softmax);
} // namespace nn
} // namespace torch
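On the module side, `SELU` and `Softmax` are new, and the existing activation modules become default-constructible because their options now have default arguments. A minimal sketch of the resulting API, assuming the option field names used by the functional code above (`min_val`/`max_val`, etc.):

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  // Modules added in this diff.
  torch::nn::SELU selu;                   // SELUOptions{} by default
  torch::nn::Softmax softmax(/*dim=*/1);  // dim is mandatory, forwarded to SoftmaxOptions
  // Existing modules can now be built from defaults or chained option setters.
  torch::nn::Hardtanh hardtanh(torch::nn::HardtanhOptions().min_val(-0.5).max_val(0.5));

  auto x = torch::randn({2, 4});
  auto y = softmax->forward(selu->forward(x));
  auto z = hardtanh->forward(x);
  std::cout << y.sum(1) << "\n" << z.sizes() << std::endl;  // rows of y sum to ~1; z is [2, 4]
  return 0;
}
```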

View File

@ -12,19 +12,57 @@ namespace nn {
/// Options for the `Embedding` module.
struct TORCH_API EmbeddingOptions {
EmbeddingOptions(int64_t count, int64_t dimension);
/// The number of embeddings (number of rows in the table).
TORCH_ARG(int64_t, count);
/// The size of each embedding vector (number of columns in the table).
TORCH_ARG(int64_t, dimension);
EmbeddingOptions(int64_t num_embeddings, int64_t embedding_dim) :
num_embeddings_(num_embeddings), embedding_dim_(embedding_dim) {};
/// The size of the dictionary of embeddings.
TORCH_ARG(int64_t, num_embeddings);
/// The size of each embedding vector.
TORCH_ARG(int64_t, embedding_dim);
/// If given, pads the output with the embedding vector at `padding_idx` (initialized to zeros) whenever it encounters the index.
TORCH_ARG(c10::optional<int64_t>, padding_idx) = c10::nullopt;
/// If given, each embedding vector with norm larger than `max_norm` is renormalized to have norm `max_norm`.
TORCH_ARG(c10::optional<float>, max_norm) = c10::nullopt;
/// The p of the p-norm to compute for the `max_norm` option. Default ``2``.
TORCH_ARG(float, norm_type) = 2.;
/// If given, this will scale gradients by the inverse of frequency of the words in the mini-batch. Default ``False``.
TORCH_ARG(bool, scale_grad_by_freq) = false;
/// If ``True``, gradient w.r.t. `weight` matrix will be a sparse tensor.
TORCH_ARG(bool, sparse) = false;
/// The learnable weights of the module of shape (num_embeddings, embedding_dim)
TORCH_ARG(torch::Tensor, _weight) = Tensor();
};
/// Options for the `EmbeddingBag` module.
struct TORCH_API EmbeddingBagOptions {
EmbeddingBagOptions(int64_t num_embeddings, int64_t embedding_dim) :
num_embeddings_(num_embeddings), embedding_dim_(embedding_dim) {};
/// The size of the dictionary of embeddings.
TORCH_ARG(int64_t, num_embeddings);
/// The size of each embedding vector.
TORCH_ARG(int64_t, embedding_dim);
/// If given, each embedding vector with norm larger than `max_norm` is renormalized to have norm `max_norm`.
TORCH_ARG(c10::optional<float>, max_norm) = c10::nullopt;
/// The p of the p-norm to compute for the `max_norm` option. Default ``2``.
TORCH_ARG(float, norm_type) = 2.;
/// If given, this will scale gradients by the inverse of frequency of the words in the mini-batch. Default ``False``.
/// Note: this option is not supported when ``mode="max"``.
TORCH_ARG(bool, scale_grad_by_freq) = false;
/// ``"sum"``, ``"mean"`` or ``"max"``. Specifies the way to reduce the bag. ``"sum"`` computes the weighted sum, taking `per_sample_weights`
/// into consideration. ``"mean"`` computes the average of the values in the bag, ``"max"`` computes the max value over each bag.
TORCH_ARG(string, mode) = "mean";
/// If ``True``, gradient w.r.t. `weight` matrix will be a sparse tensor.
/// Note: this option is not supported when ``mode="max"``.
TORCH_ARG(bool, sparse) = false;
/// The learnable weights of the module of shape (num_embeddings, embedding_dim)
TORCH_ARG(torch::Tensor, _weight) = Tensor();
};
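Both option structs follow the usual `TORCH_ARG` builder pattern: the two required sizes go to the constructor, everything else is a chainable setter. An illustrative sketch using the field names declared above:

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  // Required arguments in the constructor, optional fields chained afterwards.
  auto emb_opts = torch::nn::EmbeddingOptions(/*num_embeddings=*/10, /*embedding_dim=*/4)
                      .padding_idx(0)
                      .max_norm(1.0)
                      .sparse(true);

  auto bag_opts = torch::nn::EmbeddingBagOptions(/*num_embeddings=*/10, /*embedding_dim=*/4)
                      .mode("sum")
                      .scale_grad_by_freq(true);

  std::cout << emb_opts.num_embeddings() << " x " << emb_opts.embedding_dim()
            << ", bag mode: " << bag_opts.mode() << std::endl;
  return 0;
}
```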
/// Performs a lookup in a fixed size embedding table.
class TORCH_API EmbeddingImpl : public torch::nn::Cloneable<EmbeddingImpl> {
public:
EmbeddingImpl(int64_t count, int64_t dimension)
: EmbeddingImpl(EmbeddingOptions(count, dimension)) {}
explicit EmbeddingImpl(EmbeddingOptions options);
EmbeddingImpl(int64_t num_embeddings, int64_t embedding_dim)
: EmbeddingImpl(EmbeddingOptions(num_embeddings, embedding_dim)) {}
explicit EmbeddingImpl(const EmbeddingOptions& options_);
void reset() override;
@ -47,7 +85,64 @@ class TORCH_API EmbeddingImpl : public torch::nn::Cloneable<EmbeddingImpl> {
/// See the documentation for `EmbeddingImpl` class to learn what methods it
/// provides, or the documentation for `ModuleHolder` to learn about PyTorch's
/// module storage semantics.
TORCH_MODULE(Embedding);
class Embedding : public torch::nn::ModuleHolder<EmbeddingImpl> {
public:
using torch::nn::ModuleHolder<EmbeddingImpl>::ModuleHolder;
static Embedding from_pretrained(const torch::Tensor& embeddings, c10::optional<EmbeddingOptions> options = c10::nullopt, bool freeze = true) {
TORCH_CHECK(embeddings.dim() == 2, "Embeddings parameter is expected to be 2-dimensional");
if (options != c10::nullopt) {
TORCH_CHECK((*options).num_embeddings() == embeddings.size(0), "Expects options.num_embeddings to be ", embeddings.size(0) , "but found ", (*options).num_embeddings());
TORCH_CHECK((*options).embedding_dim() == embeddings.size(1), "Expects options.embeddings_dim to be ", embeddings.size(1) , "but found ", (*options).embedding_dim());
} else {
options = EmbeddingOptions(embeddings.size(0), embeddings.size(1));
}
Embedding embedding((*options)._weight(embeddings));
embedding->weight.set_requires_grad(!freeze);
return embedding;
}
};
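`Embedding::from_pretrained` mirrors the Python classmethod: it insists on a 2-D weight matrix, validates or infers the options, seeds `_weight`, and freezes the table unless `freeze` is false. A rough usage sketch:

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  // Pretrained table: 4 embeddings of dimension 3.
  auto weights = torch::randn({4, 3});

  // num_embeddings / embedding_dim are inferred from the weight shape.
  auto embedding = torch::nn::Embedding::from_pretrained(weights);
  // embedding->weight.requires_grad() is false, since freeze defaults to true.

  auto indices = torch::randint(/*low=*/0, /*high=*/4, {2}, torch::kLong);
  auto out = embedding->forward(indices);  // shape [2, 3]
  std::cout << out.sizes() << std::endl;
  return 0;
}
```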
class TORCH_API EmbeddingBagImpl : public torch::nn::Cloneable<EmbeddingBagImpl> {
public:
EmbeddingBagImpl(int64_t num_embeddings, int64_t embedding_dim)
: EmbeddingBagImpl(EmbeddingBagOptions(num_embeddings, embedding_dim)) {}
explicit EmbeddingBagImpl(const EmbeddingBagOptions& options_);
void reset() override;
/// Pretty prints the `EmbeddingBag` module into the given `stream`.
void pretty_print(std::ostream& stream) const override;
torch::Tensor forward(const Tensor& input, const torch::Tensor& offsets = torch::Tensor(),
const torch::Tensor& per_sample_weights = torch::Tensor());
/// The `Options` used to configure this `EmbeddingBag` module.
EmbeddingBagOptions options;
/// The embedding table.
Tensor weight;
};
/// A `ModuleHolder` subclass for `EmbeddingBagImpl`.
/// See the documentation for `EmbeddingBagImpl` class to learn what methods it
/// provides, or the documentation for `ModuleHolder` to learn about PyTorch's
/// module storage semantics.
class EmbeddingBag : public torch::nn::ModuleHolder<EmbeddingBagImpl> {
public:
using torch::nn::ModuleHolder<EmbeddingBagImpl>::ModuleHolder;
static EmbeddingBag from_pretrained(const torch::Tensor& embeddings, c10::optional<EmbeddingBagOptions> options = c10::nullopt, bool freeze = true) {
TORCH_CHECK(embeddings.dim() == 2, "Embeddings parameter is expected to be 2-dimensional");
if (options != c10::nullopt) {
TORCH_CHECK((*options).num_embeddings() == embeddings.size(0), "Expects options.num_embeddings to be ", embeddings.size(0) , "but found ", (*options).num_embeddings());
TORCH_CHECK((*options).embedding_dim() == embeddings.size(1), "Expects options.embeddings_dim to be ", embeddings.size(1) , "but found ", (*options).embedding_dim());
} else {
options = EmbeddingBagOptions(embeddings.size(0), embeddings.size(1));
}
EmbeddingBag embeddingbag((*options)._weight(embeddings));
embeddingbag->weight.set_requires_grad(!freeze);
return embeddingbag;
}
};
} // namespace nn
} // namespace torch
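`EmbeddingBag` follows the same pattern but reduces over bags: `forward` takes a flat index tensor plus an `offsets` tensor marking where each bag starts. A hedged sketch:

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  torch::nn::EmbeddingBag bag(
      torch::nn::EmbeddingBagOptions(/*num_embeddings=*/10, /*embedding_dim=*/3).mode("sum"));

  // Two bags over five indices: bag 0 = input[0:3], bag 1 = input[3:5].
  auto input   = torch::randint(/*low=*/0, /*high=*/10, {5}, torch::kLong);
  auto offsets = torch::arange(/*start=*/0, /*end=*/6, /*step=*/3, torch::kLong);  // [0, 3]
  auto out = bag->forward(input, offsets);  // shape [2, 3]
  std::cout << out.sizes() << std::endl;
  return 0;
}
```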

View File

@ -63,6 +63,33 @@ TORCH_MODULE(HingeEmbeddingLoss);
// ============================================================================
/// Creates a criterion that optimizes a multi-class classification hinge
/// loss (margin-based loss) between input :math:`x` (a 2D mini-batch `Tensor`) and
/// output :math:`y` (which is a 1D tensor of target class indices,
/// :math:`0 \leq y \leq \text{x.size}(1)-1`):
struct TORCH_API MultiMarginLossImpl : Module {
explicit MultiMarginLossImpl(
const MultiMarginLossOptions& options_ = {});
void reset();
/// Pretty prints the `MultiMarginLoss` module into the given `stream`.
void pretty_print(std::ostream& stream) const override;
Tensor forward(const Tensor& input, const Tensor& target);
/// The options with which this `Module` was constructed.
MultiMarginLossOptions options;
};
/// A `ModuleHolder` subclass for `MultiMarginLossImpl`.
/// See the documentation for `MultiMarginLossImpl` class to learn what
/// methods it provides, or the documentation for `ModuleHolder` to learn about
/// PyTorch's module storage semantics.
TORCH_MODULE(MultiMarginLoss);
// ============================================================================
/// Creates a criterion that measures the loss given input tensors
/// `input1`, `input2`, and a `Tensor` label `target` with values 1 or
/// -1. This is used for measuring whether two inputs are similar or
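Finally, the `MultiMarginLoss` module shown above is a thin stateful wrapper over the same call as the functional form. A short sketch, again assuming the unlisted `MultiMarginLossOptions` defaults match the Python API:

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  torch::nn::MultiMarginLoss criterion;  // MultiMarginLossOptions{} (assumed p = 1, margin = 1.0)
  auto input  = torch::randn({3, 5}, torch::requires_grad());
  auto target = torch::randint(/*low=*/0, /*high=*/5, {3}, torch::kLong);
  auto loss = criterion->forward(input, target);
  loss.backward();
  std::cout << loss.item<float>() << std::endl;
  return 0;
}
```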

Some files were not shown because too many files have changed in this diff Show More