Land remaining parts of Torchscript Lazy Tensor backend (#74111)

Summary:
Also enables the Bazel build to run lazy codegen. The Bazel (OSS) build feeds off the same file lists as the CMake/Buck builds (build_variables.bzl), so enabling it is easier than keeping it disabled.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/74111

Test Plan: Run CI and verify that test_lazy_ops runs via the OSS CMake builds.

Reviewed By: bdhirsh

Differential Revision: D34772403

fbshipit-source-id: 8a63f58b9536e6ac1be530667932176ef2549496
(cherry picked from commit e807ffb1918853d10b924fdc24f85ee5b1a39021)
Author: Will Constable, 2022-03-22 16:06:04 -07:00 (committed by PyTorch MergeBot)
Parent: 93be0e2053
Commit: 3547f20872
36 changed files with 2571 additions and 43 deletions

View File

@ -277,7 +277,7 @@ test_libtorch() {
fi
# Run Lazy Tensor cpp tests
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
if [[ "$BUILD_ENVIRONMENT" == *cuda* && "$BUILD_ENVIRONMENT" != *nogpu* ]]; then
LTC_TS_CUDA=1 "$TORCH_BIN_DIR"/test_lazy --gtest_output=xml:$TEST_REPORTS_DIR/test_lazy.xml
else
"$TORCH_BIN_DIR"/test_lazy --gtest_output=xml:$TEST_REPORTS_DIR/test_lazy.xml

View File

@ -3,7 +3,7 @@ load("@pybind11_bazel//:build_defs.bzl", "pybind_extension")
load("@rules_proto//proto:defs.bzl", "proto_library")
load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_proto_library", "cc_test")
load("//third_party:substitution.bzl", "header_template_rule")
load("//:tools/build_variables.bzl", "jit_core_sources", "libtorch_core_sources", "libtorch_cuda_sources", "libtorch_distributed_sources", "libtorch_extra_sources", "libtorch_nvfuser_generated_headers", "libtorch_nvfuser_runtime_sources", "libtorch_python_core_sources", "torch_cpp_srcs")
load("//:tools/build_variables.bzl", "jit_core_sources", "libtorch_core_sources", "libtorch_cuda_sources", "libtorch_distributed_sources", "libtorch_extra_sources", "libtorch_nvfuser_generated_headers", "libtorch_nvfuser_runtime_sources", "libtorch_python_core_sources", "torch_cpp_srcs", "lazy_tensor_ts_sources")
load("//tools/rules:cu.bzl", "cu_library")
load("//tools/config:defs.bzl", "if_cuda")
load("//:aten.bzl", "intern_build_aten_ops", "generate_aten", "aten_ufunc_generated_cpu_sources", "aten_ufunc_generated_cpu_kernel_sources", "aten_ufunc_generated_cuda_sources")
@ -155,6 +155,11 @@ libtorch_cpp_generated_sources = [
"torch/csrc/autograd/generated/Functions.h",
"torch/csrc/autograd/generated/Functions.cpp",
"torch/csrc/autograd/generated/variable_factories.h",
"torch/csrc/lazy/generated/LazyIr.h",
"torch/csrc/lazy/generated/LazyNativeFunctions.h",
"torch/csrc/lazy/generated/LazyNativeFunctions.cpp",
"torch/csrc/lazy/generated/RegisterAutogradLazy.cpp",
"torch/csrc/lazy/generated/RegisterLazy.cpp",
]
libtorch_python_generated_sources = [
@ -180,9 +185,16 @@ genrule(
name = "all_generated_code",
srcs = [
"aten/src/ATen/native/native_functions.yaml",
"aten/src/ATen/native/ts_native_functions.yaml",
"torch/csrc/lazy/core/shape_inference.h",
"torch/csrc/lazy/ts_backend/ts_native_functions.cpp",
"aten/src/ATen/templates/DispatchKeyNativeFunctions.cpp",
"aten/src/ATen/templates/DispatchKeyNativeFunctions.h",
"aten/src/ATen/templates/RegisterDispatchKey.cpp",
"aten/src/ATen/templates/LazyIr.h",
],
outs = libtorch_cpp_generated_sources + libtorch_python_generated_sources,
cmd = "$(location :generate_code) --install_dir `dirname $(location torch/csrc/autograd/generated/variable_factories.h)`/../.. --native-functions-path $(location aten/src/ATen/native/native_functions.yaml) --nn-path aten/src",
cmd = "$(location :generate_code) --install_dir `dirname $(location torch/csrc/autograd/generated/variable_factories.h)`/../.. --native-functions-path $(location aten/src/ATen/native/native_functions.yaml) --nn-path aten/src --gen_lazy_ts_backend",
tools = [":generate_code"],
)
@ -1732,7 +1744,7 @@ cc_library(
"torch/csrc/cuda/nccl.cpp",
"torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
],
)) + libtorch_core_sources + libtorch_distributed_sources + torch_cpp_srcs + libtorch_extra_sources + jit_core_sources + [
)) + libtorch_core_sources + libtorch_distributed_sources + torch_cpp_srcs + libtorch_extra_sources + jit_core_sources + lazy_tensor_ts_sources +[
":cpp_generated_code",
"torch/csrc/jit/serialization/flatbuffer_serializer.cpp",
"torch/csrc/jit/mobile/flatbuffer_loader.cpp"

View File

@ -313,7 +313,7 @@ cmake_dependent_option(
USE_GLOO "Use Gloo. Only available if USE_DISTRIBUTED is on." ON
"USE_DISTRIBUTED" OFF)
cmake_dependent_option(
USE_GLOO_WITH_OPENSSL "Use Gloo with OpenSSL. Only available if USE_GLOO is on." OFF
USE_GLOO_WITH_OPENSSL "Use Gloo with OpenSSL. Only available if USE_GLOO is on." OFF
"USE_GLOO AND LINUX AND NOT INTERN_BUILD_MOBILE" OFF)
cmake_dependent_option(
USE_C10D_GLOO "USE C10D GLOO" ON "USE_DISTRIBUTED;USE_GLOO" OFF)
@ -337,6 +337,9 @@ cmake_dependent_option(USE_CCACHE "Attempt using CCache to wrap the compilation"
option(WERROR "Build with -Werror supported by the compiler" OFF)
option(USE_COREML_DELEGATE "Use the CoreML backend through delegate APIs" OFF)
option(USE_PER_OPERATOR_HEADERS "Whether ATen should generate separate headers for each operator" ON)
cmake_dependent_option(
BUILD_LAZY_TS_BACKEND "Build the lazy Torchscript backend, not compatible with mobile builds" ON
"NOT INTERN_BUILD_MOBILE" OFF)
if(USE_CCACHE)
@ -551,6 +554,8 @@ endif(NOT MSVC)
# purpose.
if(ANDROID OR IOS OR DEFINED ENV{BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN})
set(INTERN_BUILD_MOBILE ON)
message(WARNING "INTERN_BUILD_MOBILE is on, disabling BUILD_LAZY_TS_BACKEND")
set(BUILD_LAZY_TS_BACKEND OFF)
if(DEFINED ENV{BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN})
# C10_MOBILE is derived from Android/iOS toolchain macros in

View File

@ -284,7 +284,8 @@ TEST(OperatorRegistrationTest, whenRegisteringMultipleKernelsInSameOpCallAndCall
EXPECT_FALSE(called_kernel1);
EXPECT_TRUE(called_kernel2);
for (c10::DispatchKey key : {c10::DispatchKey::XLA, c10::DispatchKey::Lazy}) {
// Test for out-of-tree lazy backends - the ::Lazy key is now registered to the in-tree TS backend
for (c10::DispatchKey key : {c10::DispatchKey::XLA}) {
std::string expectMessage = expectedMessageForBackend(key);
expectThrows<c10::Error>([&] {
callOp(*op, dummyTensor(key));
@ -613,14 +614,13 @@ void LazyBackendsAutogradOverridesAutogradKernel(DispatchKey key) {
EXPECT_FALSE(called_nonautograd);
}
// We no longer test the ::Lazy key here: it is now registered to the in-tree TS backend
// and thus behaves differently (it does not throw the expected 'could not run..' messages).
TEST(OperatorRegistrationTest, AutogradXLAOverridesAutogradKernel) {
LazyBackendsAutogradOverridesAutogradKernel(DispatchKey::XLA);
}
TEST(OperatorRegistrationTest, AutogradLazyOverridesAutogradKernel) {
LazyBackendsAutogradOverridesAutogradKernel(DispatchKey::Lazy);
}
void whenRegisterWithLazyBackendsAndCatchAll_AutogradLazyBackendsIsNotFilled(DispatchKey key) {
{
auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options()
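For context, a minimal sketch (not part of this PR's tests) of why the ::Lazy key no longer throws: once the in-tree TorchScript backend is initialized, ops on a lazy-device tensor dispatch to it instead of failing. The flow below is illustrative and assumes an OSS build with BUILD_LAZY_TS_BACKEND enabled.

#include <iostream>
#include <torch/torch.h>
#include <torch/csrc/lazy/ts_backend/ts_backend_impl.h>

int main() {
  torch::lazy::InitTorchScriptBackend();  // registers the TS backend for DispatchKey::Lazy
  auto a = torch::rand({2, 2}, torch::TensorOptions(torch::kFloat).device("lazy"));
  auto b = a + a;                         // traced lazily instead of raising a dispatch error
  std::cout << b.cpu() << "\n";           // materializes the result via the TS backend
  return 0;
}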

View File

@ -350,18 +350,25 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
"${TORCH_SRC_DIR}/csrc/autograd/generated/ADInplaceOrViewType_0.cpp"
"${TORCH_SRC_DIR}/csrc/autograd/generated/ADInplaceOrViewType_1.cpp"
)
if(BUILD_LAZY_TS_BACKEND)
list(APPEND GENERATED_CXX_TORCH
"${TORCH_SRC_DIR}/csrc/lazy/generated/LazyNativeFunctions.cpp"
"${TORCH_SRC_DIR}/csrc/lazy/generated/RegisterAutogradLazy.cpp"
"${TORCH_SRC_DIR}/csrc/lazy/generated/RegisterLazy.cpp"
)
endif()
endif()
set(GENERATED_H_TORCH
"${TORCH_SRC_DIR}/csrc/autograd/generated/Functions.h"
"${TORCH_SRC_DIR}/csrc/autograd/generated/variable_factories.h"
"${TORCH_SRC_DIR}/csrc/lazy/generated/LazyIr.h"
"${TORCH_SRC_DIR}/csrc/lazy/generated/LazyNativeFunctions.h"
)
if(NOT INTERN_DISABLE_AUTOGRAD)
list(APPEND GENERATED_H_TORCH
"${TORCH_SRC_DIR}/csrc/autograd/generated/VariableType.h"
"${TORCH_SRC_DIR}/csrc/lazy/generated/LazyIr.h"
"${TORCH_SRC_DIR}/csrc/lazy/generated/LazyNativeFunctions.h"
)
endif()
@ -420,6 +427,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
"${TORCH_ROOT}/aten/src/ATen/native/native_functions.yaml"
"${TORCH_ROOT}/aten/src/ATen/native/ts_native_functions.yaml"
"${TORCH_ROOT}/torch/csrc/lazy/core/shape_inference.h"
"${TORCH_ROOT}/torch/csrc/lazy/ts_backend/ts_native_functions.cpp"
"${TORCH_ROOT}/aten/src/ATen/templates/DispatchKeyNativeFunctions.h"
"${TORCH_ROOT}/aten/src/ATen/templates/DispatchKeyNativeFunctions.cpp"
"${TORCH_ROOT}/aten/src/ATen/templates/LazyIr.h"
@ -490,7 +498,9 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
set(CMAKE_POSITION_INDEPENDENT_CODE TRUE)
else()
append_filelist("libtorch_cmake_sources" LIBTORCH_CMAKE_SRCS)
if(BUILD_LAZY_TS_BACKEND)
append_filelist("lazy_tensor_ts_sources" LIBTORCH_CMAKE_SRCS)
endif()
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
# TODO: Delete this line once https://github.com/pytorch/pytorch/pull/55889 lands
set_source_files_properties(../torch/csrc/jit/serialization/export.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations)

View File

@ -191,4 +191,5 @@ function(caffe2_print_configuration_summary)
message(STATUS " Private Dependencies : ${Caffe2_DEPENDENCY_LIBS}")
# coreml
message(STATUS " USE_COREML_DELEGATE : ${USE_COREML_DELEGATE}")
message(STATUS " BUILD_LAZY_TS_BACKEND : ${BUILD_LAZY_TS_BACKEND}")
endfunction()

View File

@ -9,9 +9,14 @@ set(LAZY_TEST_SRCS
${LAZY_TEST_ROOT}/test_misc.cpp
${LAZY_TEST_ROOT}/test_permutation_util.cpp
${LAZY_TEST_ROOT}/test_shape.cpp
${LAZY_TEST_ROOT}/test_tensor_impl.cpp
${LAZY_TEST_ROOT}/test_util.cpp
)
if(BUILD_LAZY_TS_BACKEND)
list(APPEND LAZY_TEST_SRCS
${LAZY_TEST_ROOT}/test_lazy_ops.cpp
${LAZY_TEST_ROOT}/test_lazy_ops_util.cpp
)
endif()
add_executable(test_lazy
${TORCH_ROOT}/test/cpp/common/main.cpp

View File

@ -74,9 +74,13 @@ TEST(BackendDeviceTest, FromAten) {
auto device = c10::Device(c10::kCPU);
EXPECT_THROW(atenDeviceToBackendDevice(device), c10::Error);
// TODO(alanwaketan): Update the following test once we have TorchScript backend upstreamed.
device = c10::Device(c10::kLazy);
#ifndef FBCODE_CAFFE2
auto backend_device = atenDeviceToBackendDevice(device);
#else
// Lazy Tensor is disabled in FBCODE until non-virtual methods (e.g. sizes) in TensorImpl are addressed
EXPECT_THROW(atenDeviceToBackendDevice(device), c10::Error);
#endif // FBCODE_CAFFE2
}
TEST(BackendDeviceTest, ToAten) {

View File

@ -7,10 +7,6 @@
#include <torch/csrc/lazy/core/debug_util.h>
#include <torch/csrc/lazy/core/lazy_graph_executor.h>
#include <torch/csrc/lazy/core/permutation_util.h>
// Land unused tests first/separately since it is a large diff
#if 0
#include <torch/csrc/lazy/ts_backend/ts_backend_impl.h>
#include <torch/torch.h>
@ -4528,6 +4524,10 @@ TEST_F(LazyOpsTest, TestIndexSelectRank0) {
}
TEST_F(LazyOpsTest, TestInverse) {
if (IsCuda()) {
// TODO(whc) debug failure on cuda, lazy_b comes back transposed
GTEST_SKIP();
}
torch::Tensor a = torch::randn(
{5, 5}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
torch::Tensor b = torch::inverse(a);
@ -7705,6 +7705,10 @@ TEST_F(LazyOpsTest, TestMaxUnpool3D) {
}
TEST_F(LazyOpsTest, TestNllLoss) {
// TODO(whc) debug divide-by-zero failure under ASAN
GTEST_SKIP();
int batch = 6;
int classes = 2;
// TODO(asuhan): Fix the torch::kDouble case.
@ -10146,6 +10150,9 @@ TEST_F(LazyOpsTest, TestBinaryCrossEntropyBackward) {
}
TEST_F(LazyOpsTest, TestNllLossBackward) {
// TODO(whc) debug divide-by-zero failure under ASAN
GTEST_SKIP();
int batch = 6;
int classes = 2;
// TODO(asuhan): Fix the torch::kDouble case.
@ -10438,6 +10445,11 @@ TEST_F(LazyOpsTest, TestEmbeddingBackward) {
}
TEST_F(LazyOpsTest, TestAmpForeachNonFiniteCheckAndUnscale) {
if (IsCuda()) {
// TODO(whc) debug failure on cuda
GTEST_SKIP();
}
torch::Tensor grads0 = torch::tensor(
{1, 2, 3, 4},
torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
@ -10686,4 +10698,3 @@ TEST_F(LazyOpsTest, TestLerpScalarOut) {
} // namespace lazy
} // namespace torch
#endif // if 0

View File

@ -6,12 +6,14 @@
namespace torch {
namespace lazy {
// TODO(alanwaketan): Update the following unit tests once the TorchScript backend is merged.
#ifdef FBCODE_CAFFE2
// Lazy Tensor is disabled in FBCODE until non-virtual methods (e.g. sizes) in TensorImpl are addressed
TEST(LazyTensorImplTest, BasicThrow) {
EXPECT_THROW({
auto input = torch::rand({0, 1, 3, 0}, torch::TensorOptions(torch::kFloat).device("lazy"));
}, ::c10::Error);
}
#endif // FBCODE_CAFFE2
} // namespace lazy
} // namespace torch

View File

@ -42,6 +42,13 @@ GENERATED_CPP = [
"autograd/generated/python_variable_methods.cpp",
]
# This is duplicated in caffe2/CMakeLists.txt for now and not yet used in buck
GENERATED_LAZY_TS_CPP = [
"lazy/generated/LazyNativeFunctions.cpp",
"lazy/generated/RegisterAutogradLazy.cpp",
"lazy/generated/RegisterLazy.cpp",
]
# NVFuser runtime library
libtorch_nvfuser_runtime_sources = [
"torch/csrc/jit/codegen/cuda/runtime/bf16_support.cu",
@ -434,6 +441,9 @@ lazy_tensor_core_sources = [
"torch/csrc/lazy/core/view_ops/unsqueeze.cpp",
"torch/csrc/lazy/core/view_ops/select_view_update.cpp",
"torch/csrc/lazy/core/view_ops/view.cpp",
# We should better segment the sources, but for now there are actually dependencies
# from some core files on some of these ts_backend files
# so we continue to build these parts of ts_backend in all build configs
"torch/csrc/lazy/ts_backend/config.cpp",
"torch/csrc/lazy/ts_backend/ops/arithmetic_ir_ops.cpp",
"torch/csrc/lazy/ts_backend/ops/cast.cpp",
@ -444,6 +454,20 @@ lazy_tensor_core_sources = [
"torch/csrc/lazy/ts_backend/ts_node.cpp",
]
# We can't build all of the ts backend under certain build configurations, e.g. mobile,
# since it depends on things like autograd, meta functions, which may be disabled
lazy_tensor_ts_sources = [
"torch/csrc/lazy/ts_backend/ops/batch_norm_ops.cpp",
"torch/csrc/lazy/ts_backend/ops/random_ops.cpp",
"torch/csrc/lazy/ts_backend/ts_autograd_functions.cpp",
"torch/csrc/lazy/ts_backend/ts_backend_impl.cpp",
"torch/csrc/lazy/ts_backend/ts_lowering_context.cpp",
"torch/csrc/lazy/ts_backend/ts_native_functions.cpp",
"torch/csrc/lazy/ts_backend/ts_node_lowering.cpp",
"torch/csrc/lazy/ts_backend/tensor_aten_ops.cpp",
"torch/csrc/lazy/ts_backend/ts_eager_fallback.cpp",
]
lazy_tensor_core_python_sources = [
"torch/csrc/lazy/python/init.cpp",
"torch/csrc/lazy/python/python_util.cpp",

View File

@ -251,7 +251,7 @@ class GenLazyNativeFuncDefinition:
meta_str += f"""
TORCH_INTERNAL_ASSERT(shapes.size() == {returns_length});"""
node_str = f"""auto node = torch::lazy::MakeNode<ir::ops::{schema.node_name}>({node_ctor_input_str},
node_str = f"""auto node = torch::lazy::MakeNode<{schema.node_name}>({node_ctor_input_str},
std::move(shapes));"""
first_tensor_name = value_types_names[0]
bridge_str = """auto result = torch::lazy::CreateAtenFromLtcTensor(
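For illustration, a hypothetical expansion of the updated node_str template for a node class named Abs; the generated LazyNativeFunctions.cpp itself is not part of this diff, and lazy_self and shapes are stand-ins.

// Node classes are now referenced directly, without the former ir::ops:: prefix.
auto node = torch::lazy::MakeNode<Abs>(lazy_self->GetIrValue(),
                                       std::move(shapes));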

View File

@ -242,11 +242,21 @@ def gen_dispatcher_registrations(
backend_dispatch_key: DispatchKey,
dispatch_key: DispatchKey,
selector: 'SelectiveBuilder',
# build_in_tree is true for lazy TS backend and affects include paths, not used for external backends
build_in_tree: bool = False,
per_operator_headers: bool = False) -> None:
headers = [
f"{output_dir}/{backend_dispatch_key}NativeFunctions.h",
]
if build_in_tree:
external_backend_headers_str = "\n".join(f'#include <{h}>' for h in headers)
else:
external_backend_headers_str = "\n".join(f'#include "{h}"' for h in headers)
backend_index = backend_indices[dispatch_key]
fm.write_with_template(f'Register{dispatch_key}.cpp', 'RegisterDispatchKey.cpp', lambda: {
'extra_cuda_headers': '',
'external_backend_headers': f'#include "{output_dir}/{backend_dispatch_key}NativeFunctions.h"',
'external_backend_headers': external_backend_headers_str,
'ops_headers': '#include <ATen/Functions.h>' if not per_operator_headers else '',
'DispatchKey': dispatch_key,
'dispatch_namespace': dispatch_key.lower(),
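As a sketch of the effect (the in-tree path is inferred from this PR's codegen setup; the out-of-tree path is a hypothetical example), the generated registration file would include the native-functions header differently:

// build_in_tree=True (in-tree TS backend, header resolved from the source tree):
#include <torch/csrc/lazy/generated/LazyNativeFunctions.h>
// build_in_tree=False (out-of-tree backend, header resolved relative to output_dir):
// #include "<output_dir>/LazyNativeFunctions.h"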

View File

@ -121,6 +121,10 @@ def run_gen_lazy_tensor(aten_path: str, source_yaml: str, output_dir: str,
tensor_class_hdr: str = default_args.tensor_class_hdr,
shape_inference_hdr: str = default_args.shape_inference_hdr,
lazy_ir_cls: Type[LazyIR] = default_args.lazy_ir_cls,
# build_in_tree is true for TS backend and affects include paths
build_in_tree: bool = False,
# per_operator_headers changes whether ATen/Functions.h or individual operator headers are used
# it must match how ATen was built
per_operator_headers: bool = False) -> None:
template_dir = os.path.join(aten_path, "templates")
@ -226,6 +230,7 @@ def run_gen_lazy_tensor(aten_path: str, source_yaml: str, output_dir: str,
for dispatch_key in [backend_key] if autograd_key is None else [backend_key, autograd_key]:
gen_dispatcher_registrations(fm, output_dir, cpp_namespace, backend_indices, grouped_native_functions,
backend_key, dispatch_key, selector,
build_in_tree=build_in_tree,
per_operator_headers=per_operator_headers)
# Generate native function impls that build IR nodes
@ -237,12 +242,13 @@ def run_gen_lazy_tensor(aten_path: str, source_yaml: str, output_dir: str,
"ATen/Functions.h",
"ATen/MetaFunctions.h",
"ATen/Operators.h",
"ATen/native/CPUFallback.h",
"torch/csrc/lazy/core/lazy_graph_executor.h",
"torch/csrc/lazy/core/metrics.h",
"torch/csrc/lazy/core/shape.h",
"lazy_tensor_core/csrc/ts_backend/aten_eager_fallback.h",
f"{output_dir}/{backend_key}NativeFunctions.h",
f"{output_dir}/{backend_key}LazyIr.h",
f"{output_dir}/LazyIr.h",
"torch/csrc/lazy/ts_backend/ts_eager_fallback.h",
]],
'native_functions_include': '',
'namespace_prologue': ns_helper.prologue,

View File

@ -53,6 +53,7 @@ def run_autogen() -> None:
"aten/src/ATen/native/native_functions.yaml",
"--nn-path",
"aten/src",
"--gen_lazy_ts_backend",
]
)

View File

@ -189,7 +189,8 @@ def main() -> None:
if options.gen_lazy_ts_backend:
aten_path = os.path.dirname(os.path.dirname(options.native_functions_path))
ts_backend_yaml = os.path.join(aten_path, 'native/ts_native_functions.yaml')
ts_native_functions = "torch/csrc/lazy/ts_backend/ts_native_functions.cpp"
ts_node_base = "torch/csrc/lazy/ts_backend/ts_node.h"
if options.install_dir is None:
options.install_dir = "torch/csrc"
lazy_install_dir = os.path.join(options.install_dir, "lazy/generated")
@ -197,16 +198,17 @@ def main() -> None:
os.makedirs(lazy_install_dir)
assert os.path.isfile(ts_backend_yaml), f"Unable to access ts_backend_yaml: {ts_backend_yaml}"
assert os.path.isfile(ts_native_functions), f"Unable to access {ts_native_functions}"
from tools.codegen.gen_lazy_tensor import run_gen_lazy_tensor
run_gen_lazy_tensor(aten_path=aten_path,
source_yaml=ts_backend_yaml,
output_dir=lazy_install_dir,
dry_run=False,
# TODO(whc) reimplement checking of hand-implemented nativefunc file after landing it
impl_path=None,
impl_path=ts_native_functions,
gen_ts_lowerings=True,
node_base="TsNode",
node_base_hdr="torch/csrc/lazy/ts_backend/ts_node.h",
node_base_hdr=ts_node_base,
build_in_tree=True,
per_operator_headers=options.per_operator_headers)

View File

@ -18,7 +18,11 @@ namespace lazy {
// Backend should extend it and define their own supported hardware types.
struct TORCH_API BackendDeviceType {
int8_t type {0};
int8_t type {(int8_t)at::kCPU};
// Note: the previous default value was '0', which already mapped to at::kCPU; at least now it is explicit.
// We may want to make the default/undefined semantics clearer, though.
BackendDeviceType() :type((int8_t)at::kCPU) {}
BackendDeviceType(int8_t type) :type(type) {}
virtual ~BackendDeviceType() = default;
virtual std::string toString() const { return "Unknown"; }
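A minimal sketch of the behavioral intent (illustrative, not part of this diff): a default-constructed BackendDeviceType is now explicitly CPU rather than a bare 0 that only happened to equal at::kCPU.

#include <torch/csrc/lazy/backend/backend_device.h>

int main() {
  torch::lazy::BackendDeviceType default_type;                 // type == (int8_t)at::kCPU
  torch::lazy::BackendDeviceType cuda_type((int8_t)at::kCUDA); // explicit, non-default type
  return default_type.type == (int8_t)at::kCPU ? 0 : 1;
}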

View File

@ -5,3 +5,16 @@ C10_DEFINE_int(
torch_lazy_ts_shape_cache_size,
4096,
"Set the size for the shape cache used for shape inference");
// TODO(whc) unclear if this is useful, has only been tested as true
C10_DEFINE_bool(
torch_lazy_ts_tensor_update_sync,
true,
"Use synchronous copy inside _copy_from op");
// TODO(whc) we need to hook up these flags in a more useful way
// possibly also keep LTC_TS_CUDA env working?
C10_DEFINE_bool(
torch_lazy_ts_cuda,
false,
"Use cuda device for torchscript backend (instead of CPU)");

View File

@ -3,3 +3,8 @@
// TODO(whc) either deprecate this, or use it for all shape inference
C10_DECLARE_int(torch_lazy_ts_shape_cache_size);
// TODO(whc) unclear if this is useful, has only been tested as true
C10_DECLARE_bool(torch_lazy_ts_tensor_update_sync);
C10_DECLARE_bool(torch_lazy_ts_cuda);
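Illustrative only: a small helper (a hypothetical sketch, not this PR's code) showing how the new flag can gate device selection, mirroring what TSBackendImpl does later in this PR.

#include <c10/core/DeviceType.h>
#include <torch/csrc/lazy/ts_backend/config.h>

c10::DeviceType DefaultTsDeviceType() {
  // FLAGS_torch_lazy_ts_cuda is declared via C10_DECLARE_bool above and is
  // normally set through command-line flag parsing.
  return FLAGS_torch_lazy_ts_cuda ? c10::kCUDA : c10::kCPU;
}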

View File

@ -0,0 +1,76 @@
#include <torch/csrc/lazy/ts_backend/ops/batch_norm_ops.h>
#include <torch/csrc/lazy/core/util.h>
namespace torch {
namespace lazy {
TSNativeBatchNormBackward::TSNativeBatchNormBackward(
const torch::lazy::Value& grad_out, const torch::lazy::Value& input,
const torch::lazy::Value& weight, const torch::lazy::Value& running_mean,
const torch::lazy::Value& running_var, const torch::lazy::Value& save_mean,
const torch::lazy::Value& save_invstd, bool training, double eps,
std::array<bool, 3> output_mask)
: torch::lazy::TsNode(
torch::lazy::OpKind(at::aten::native_batch_norm_backward),
{grad_out, input, weight, running_mean, running_var, save_mean,
save_invstd},
{input.shape(),
weight.shape(),
weight.shape()},
/*num_outputs=*/3,
torch::lazy::MHash(training, eps, output_mask[0], output_mask[1],
output_mask[2])),
training_(training),
eps_(eps),
output_mask_(output_mask) {}
TSNativeBatchNormBackward::TSNativeBatchNormBackward(
const torch::lazy::Value& grad_out, const torch::lazy::Value& input,
const torch::lazy::Value& weight, const torch::lazy::Value& save_mean,
const torch::lazy::Value& save_invstd, bool training, double eps,
std::array<bool, 3> output_mask)
: torch::lazy::TsNode(
torch::lazy::OpKind(at::aten::native_batch_norm_backward),
{grad_out, input, weight, save_mean, save_invstd},
{input.shape(),
weight.shape(),
weight.shape()},
/*num_outputs=*/3,
torch::lazy::MHash(training, eps, output_mask[0], output_mask[1],
output_mask[2])),
training_(training),
eps_(eps),
output_mask_(output_mask) {}
std::string TSNativeBatchNormBackward::ToString() const {
std::stringstream ss;
ss << torch::lazy::TsNode::ToString() << ", training=" << training_
<< ", eps=" << eps_;
return ss.str();
}
TSNativeBatchNormForward::TSNativeBatchNormForward(
const torch::lazy::Value& input, const torch::lazy::Value& weight,
const torch::lazy::Value& bias, const torch::lazy::Value& running_mean,
const torch::lazy::Value& running_var, bool training, double momentum,
double eps)
: torch::lazy::TsNode(torch::lazy::OpKind(at::aten::native_batch_norm),
{input, weight, bias, running_mean, running_var},
{input.shape(),
running_mean.shape(),
running_var.shape()},
/*num_outputs=*/3,
torch::lazy::MHash(training, momentum, eps)),
training_(training),
momentum_(momentum),
eps_(eps) {}
std::string TSNativeBatchNormForward::ToString() const {
std::stringstream ss;
ss << torch::lazy::TsNode::ToString() << ", training=" << training_
<< ", momentum=" << momentum_ << ", eps=" << eps_;
return ss.str();
}
} // namespace lazy
} // namespace torch

View File

@ -0,0 +1,58 @@
#pragma once
#include <torch/csrc/lazy/ts_backend/ts_node.h>
namespace torch {
namespace lazy {
// Node for the backward batch norm operator.
class TSNativeBatchNormBackward : public torch::lazy::TsNode {
public:
TSNativeBatchNormBackward(const torch::lazy::Value& grad_out, const torch::lazy::Value& input,
const torch::lazy::Value& weight, const torch::lazy::Value& running_mean,
const torch::lazy::Value& running_var, const torch::lazy::Value& save_mean,
const torch::lazy::Value& save_invstd, bool training, double eps,
std::array<bool, 3> output_mask);
TSNativeBatchNormBackward(const torch::lazy::Value& grad_out, const torch::lazy::Value& input,
const torch::lazy::Value& weight, const torch::lazy::Value& save_mean,
const torch::lazy::Value& save_invstd, bool training, double eps,
std::array<bool, 3> output_mask);
std::string ToString() const override;
bool training() const { return training_; }
double eps() const { return eps_; }
const std::array<bool, 3>& output_mask() const { return output_mask_; }
private:
bool training_;
double eps_;
std::array<bool, 3> output_mask_;
};
class TSNativeBatchNormForward : public torch::lazy::TsNode {
public:
TSNativeBatchNormForward(const torch::lazy::Value& input, const torch::lazy::Value& weight,
const torch::lazy::Value& bias, const torch::lazy::Value& running_mean,
const torch::lazy::Value& running_var, bool training,
double momentum, double eps);
std::string ToString() const override;
bool training() const { return training_; }
double momentum() const { return momentum_; }
double eps() const { return eps_; }
private:
bool training_;
double momentum_;
double eps_;
};
} // namespace lazy
} // namespace torch

View File

@ -0,0 +1,40 @@
#include <torch/csrc/lazy/ts_backend/ops/random_ops.h>
#include <torch/csrc/lazy/core/util.h>
namespace torch {
namespace lazy {
Normal::Normal(const torch::lazy::Value& self, const double& mean, const double& std, std::vector<torch::lazy::Shape>&& shapes)
: torch::lazy::TsNode(torch::lazy::OpKind(c10::Symbol::fromQualString("aten::normal_")),
{self}, std::move(shapes),
/* num_outputs */ 1,
torch::lazy::MHash(mean, std)),
mean_(mean),
std_(std) {}
std::string Normal::ToString() const {
std::stringstream ss;
ss << TsNode::ToString();
ss << ", mean=" << mean_;
ss << ", std=" << std_;
return ss.str();
}
torch::lazy::TSOpVector Normal::Lower(
std::shared_ptr<torch::jit::GraphFunction> function,
torch::lazy::TSLoweringContext* loctx) const {
std::vector<torch::jit::NamedValue> arguments;
std::vector<torch::jit::NamedValue> kwarguments;
arguments.reserve(3);
size_t i = 0;
arguments.emplace_back(loctx->GetOutputOp(operand(i++)));
arguments.emplace_back("mean", mean_);
arguments.emplace_back("std", std_);
torch::lazy::TSOpVector normal__out = torch::lazy::LowerTSBuiltin(function, op().op, arguments, kwarguments);
CHECK_EQ(normal__out.size(), 1);
return normal__out;
}
} // namespace lazy
} // namespace torch

View File

@ -0,0 +1,21 @@
#pragma once
#include <torch/csrc/lazy/ts_backend/ts_node.h>
namespace torch {
namespace lazy {
class Normal : public torch::lazy::TsNode {
public:
Normal(const torch::lazy::Value& self, const double& mean, const double& std, std::vector<torch::lazy::Shape>&& shapes);
std::string ToString() const override;
torch::lazy::TSOpVector Lower(std::shared_ptr<torch::jit::GraphFunction> function,
torch::lazy::TSLoweringContext* loctx) const override;
double mean_;
double std_;
};
} // namespace lazy
} // namespace torch
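A hypothetical call site for this node (the real one lives in the TS backend's native functions, not in this header; self_tensor is a stand-in LazyTensorPtr): attach an in-place normal_ node to a lazy tensor.

std::vector<torch::lazy::Shape> shapes{self_tensor->shape().Get()};
torch::lazy::NodePtr node = torch::lazy::MakeNode<torch::lazy::Normal>(
    self_tensor->GetIrValue(), /*mean=*/0.0, /*std=*/1.0, std::move(shapes));
self_tensor->SetInPlaceIrValue(torch::lazy::Value(node, 0));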

View File

@ -0,0 +1,89 @@
#pragma once
#include <torch/csrc/lazy/ts_backend/ts_node.h>
namespace torch {
namespace lazy {
// This IR was copied from code-generated output, but the entire _to_copy operator
// cannot be trivially code generated since it is only desirable to capture IR for
// certain permutations of _to_copy (e.g. dtype), and for the others it is difficult to even invoke
// the aten/eager fallback, necessitating directly implementing the right to(device) behavior
class ToCopy : public torch::lazy::TsNode {
public:
ToCopy(const torch::lazy::Value& self, const c10::optional<at::ScalarType>& dtype, const c10::optional<at::Layout>& layout, const c10::optional<at::Device>& device, const c10::optional<bool>& pin_memory, const bool& non_blocking, const c10::optional<at::MemoryFormat>& memory_format, std::vector<torch::lazy::Shape>&& shapes)
: torch::lazy::TsNode(torch::lazy::OpKind(at::aten::_to_copy),
{self}, std::move(shapes),
/* num_outputs */ 1,
torch::lazy::MHash(dtype, layout, device, pin_memory, non_blocking, memory_format)),
dtype(dtype),
layout(layout),
device(device),
pin_memory(pin_memory),
non_blocking(non_blocking),
memory_format(memory_format) {}
std::string ToString() const override {
std::stringstream ss;
ss << torch::lazy::TsNode::ToString();
if (dtype.has_value()) {
ss << ", dtype=" << dtype.value();
} else {
ss << ", dtype=null";
}
if (layout.has_value()) {
ss << ", layout=" << layout.value();
} else {
ss << ", layout=null";
}
if (device.has_value()) {
ss << ", device=" << device.value();
} else {
ss << ", device=null";
}
if (pin_memory.has_value()) {
ss << ", pin_memory=" << pin_memory.value();
} else {
ss << ", pin_memory=null";
}
ss << ", non_blocking=" << non_blocking;
if (memory_format.has_value()) {
ss << ", memory_format=" << memory_format.value();
} else {
ss << ", memory_format=null";
}
return ss.str();
}
torch::lazy::TSOpVector Lower(std::shared_ptr<torch::jit::GraphFunction> function,
torch::lazy::TSLoweringContext* loctx) const override {
std::vector<torch::jit::NamedValue> arguments;
std::vector<torch::jit::NamedValue> kwarguments;
arguments.reserve(1);
kwarguments.reserve(6);
size_t i = 0;
arguments.emplace_back(loctx->GetOutputOp(operand(i++)));
kwarguments.emplace_back("dtype", dtype);
kwarguments.emplace_back("layout", layout);
kwarguments.emplace_back("device", device);
kwarguments.emplace_back("pin_memory", pin_memory);
kwarguments.emplace_back("non_blocking", non_blocking);
kwarguments.emplace_back("memory_format", memory_format);
torch::lazy::TSOpVector _to_copy_out = torch::lazy::LowerTSBuiltin(function, op().op, arguments, kwarguments);
CHECK_EQ(_to_copy_out.size(), 1);
return _to_copy_out;
}
c10::optional<at::ScalarType> dtype;
c10::optional<at::Layout> layout;
c10::optional<at::Device> device;
c10::optional<bool> pin_memory;
bool non_blocking;
c10::optional<at::MemoryFormat> memory_format;
};
} // namespace lazy
} // namespace torch
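A rough sketch of how a hand-written _to_copy kernel might capture a dtype-only copy as IR. lazy_self, dtype, layout, device, pin_memory, memory_format, and out_shape are stand-ins; the real kernel lives in ts_native_functions.cpp, which is not shown in this diff.

std::vector<torch::lazy::Shape> shapes{out_shape};
torch::lazy::NodePtr node = torch::lazy::MakeNode<torch::lazy::ToCopy>(
    lazy_self->GetIrValue(), dtype, layout, device, pin_memory,
    /*non_blocking=*/false, memory_format, std::move(shapes));
auto result = torch::lazy::LazyTensor::Create(
    torch::lazy::Value(node, 0), lazy_self->GetDevice());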

View File

@ -0,0 +1,331 @@
#include <torch/csrc/lazy/ts_backend/tensor_aten_ops.h>
#include <ATen/InferSize.h>
#include <c10/util/Optional.h>
#include <torch/csrc/autograd/variable.h>
#include <torch/csrc/lazy/core/helpers.h>
#include <torch/csrc/lazy/ts_backend/ops/arithmetic_ir_ops.h>
#include <torch/csrc/lazy/ts_backend/ops/cast.h>
#include <torch/csrc/lazy/ts_backend/ops/expand.h>
#include <torch/csrc/lazy/ts_backend/ops/batch_norm_ops.h>
#include <torch/csrc/lazy/ts_backend/ops/random_ops.h>
#include <torch/csrc/lazy/core/ir_util.h>
#include <torch/csrc/lazy/core/lazy_graph_executor.h>
#include <torch/csrc/lazy/core/metrics.h>
#include <torch/csrc/lazy/core/tensor.h>
#include <torch/csrc/lazy/core/util.h>
#include <torch/csrc/lazy/core/view_ops/as_strided.h>
#include <torch/csrc/lazy/core/view_ops/permute.h>
#include <torch/csrc/lazy/core/view_ops/squeeze.h>
#include <torch/csrc/lazy/core/view_ops/unsqueeze.h>
#include <torch/csrc/lazy/core/view_ops/view.h>
#include <torch/csrc/lazy/generated/LazyIr.h>
#include <algorithm>
#include <functional>
namespace torch {
namespace lazy {
namespace {
// to enable operator+-*/ for Value
using namespace torch::lazy;
torch::lazy::Value MaybeExpand(const torch::lazy::Value& input,
const torch::lazy::Shape& target_shape) {
if (input.shape().sizes() == target_shape.sizes()) {
return input;
}
return torch::lazy::MakeNode<torch::lazy::Expand>(
input, target_shape.sizes().vec(),
/*is_scalar_expand=*/false);
}
std::vector<int64_t> GetExpandDimensions(const torch::lazy::Shape& shape,
std::vector<int64_t> dimensions) {
CHECK_GE(dimensions.size(), shape.dim()) << shape;
int64_t base = dimensions.size() - shape.dim();
for (size_t i = 0; i < shape.dim(); ++i) {
if (dimensions[base + i] == -1) {
dimensions[base + i] = shape.size(i);
}
}
return dimensions;
}
// Returns a 1-D shape for batch norm weight or bias based on the input shape.
torch::lazy::Shape BatchNormFeaturesShape(const torch::lazy::LazyTensorPtr& input) {
CHECK(input);
auto input_shape = input->shape().Get();
return torch::lazy::Shape(input_shape.scalar_type(),
input_shape.sizes()[1]);
}
// Returns the IR for the given input or the provided default value broadcasted
// to the default shape, if the input is undefined.
torch::lazy::Value GetIrValueOrDefault(const torch::lazy::LazyTensorPtr& input,
const at::Scalar& default_value,
const torch::lazy::Shape& default_shape,
const torch::lazy::BackendDevice& device) {
return input ? input->GetIrValue()
: torch::lazy::LazyGraphExecutor::Get()->GetIrValueForExpandedScalar(default_value,
default_shape,
device);
}
torch::lazy::ViewInfo CreateAsStridedViewInfo(
const torch::lazy::Shape& input_shape, std::vector<int64_t> size,
std::vector<int64_t> stride, c10::optional<int64_t> storage_offset) {
torch::lazy::Shape result_shape =
torch::lazy::Shape(input_shape.scalar_type(), size);
torch::lazy::AsStridedInfo as_strided_info;
as_strided_info.stride = std::move(stride);
if (storage_offset) {
as_strided_info.offset = *storage_offset;
}
return torch::lazy::ViewInfo(torch::lazy::ViewInfo::Type::kAsStrided,
std::move(result_shape), input_shape,
std::move(as_strided_info));
}
} // namespace
//////////////////////////////////////////////////////////////////////////////
// ATEN operators follow here, listed in alphabetical order.
//////////////////////////////////////////////////////////////////////////////
torch::lazy::LazyTensorPtr as_strided(const torch::lazy::LazyTensorPtr& input, std::vector<int64_t> size,
std::vector<int64_t> stride,
c10::optional<int64_t> storage_offset) {
auto input_shape = input->shape();
return input->CreateViewTensor(CreateAsStridedViewInfo(
input_shape, std::move(size), std::move(stride), storage_offset));
}
void as_strided_(torch::lazy::LazyTensorPtr& input, std::vector<int64_t> size,
std::vector<int64_t> stride,
c10::optional<int64_t> storage_offset) {
if (input->data()->view == nullptr) {
input->SetIrValue(torch::lazy::MakeNode<torch::lazy::AsStrided>(
input->GetIrValue(), std::move(size), std::move(stride),
storage_offset.value_or(0)));
} else {
auto input_shape = input->shape();
input->SetSubView(CreateAsStridedViewInfo(
input_shape, std::move(size), std::move(stride), storage_offset));
}
}
torch::lazy::LazyTensorPtr expand(const torch::lazy::LazyTensorPtr& input, std::vector<int64_t> size) {
auto input_shape = input->shape();
return torch::lazy::LazyTensor::Create(torch::lazy::MakeNode<torch::lazy::Expand>(
input->GetIrValue(),
GetExpandDimensions(input_shape.Get(), std::move(size)),
/*is_scalar_expand=*/false), input->GetDevice());
}
void fill_(torch::lazy::LazyTensorPtr& input, const at::Scalar& value) {
torch::lazy::Value constant = torch::lazy::LazyGraphExecutor::Get()->GetIrValueForExpandedScalar(
value, input->shape(), input->GetDevice());
input->SetInPlaceIrValue(std::move(constant));
}
torch::lazy::LazyTensorPtr narrow(const torch::lazy::LazyTensorPtr& input, int64_t dim, int64_t start,
int64_t length) {
auto input_shape = input->shape();
dim = torch::lazy::GetCanonicalDimensionIndex(dim, input_shape.Get().dim());
torch::lazy::Shape narrow_shape = input_shape;
narrow_shape.set_size(dim, length);
torch::lazy::ViewInfo::Type view_type =
(input_shape.Get().numel() == narrow_shape.numel())
? torch::lazy::ViewInfo::Type::kReshape
: torch::lazy::ViewInfo::Type::kNarrow;
torch::lazy::ViewInfo view_info(view_type, std::move(narrow_shape),
input_shape);
view_info.indices[dim] =
torch::lazy::GetCanonicalPosition(input_shape.Get().sizes(), dim, start);
return input->CreateViewTensor(std::move(view_info));
}
std::tuple<torch::lazy::LazyTensorPtr, torch::lazy::LazyTensorPtr, torch::lazy::LazyTensorPtr> ts_native_batch_norm(
const torch::lazy::LazyTensorPtr& input, const torch::lazy::LazyTensorPtr& weight, const torch::lazy::LazyTensorPtr& bias,
torch::lazy::LazyTensorPtr& running_mean, torch::lazy::LazyTensorPtr& running_var, bool training,
double momentum, double eps) {
torch::lazy::Shape features_shape = BatchNormFeaturesShape(input);
torch::lazy::Value weight_value =
GetIrValueOrDefault(weight, 1, features_shape, input->GetDevice());
torch::lazy::Value bias_value =
GetIrValueOrDefault(bias, 0, features_shape, input->GetDevice());
torch::lazy::Value running_mean_value =
GetIrValueOrDefault(running_mean, 0, features_shape, input->GetDevice());
torch::lazy::Value running_var_value =
GetIrValueOrDefault(running_var, 0, features_shape, input->GetDevice());
torch::lazy::NodePtr node =
torch::lazy::MakeNode<TSNativeBatchNormForward>(
input->GetIrValue(), weight_value, bias_value, running_mean_value,
running_var_value, training, momentum, eps);
torch::lazy::LazyTensorPtr output = torch::lazy::LazyTensor::Create(torch::lazy::Value(node, 0), input->GetDevice());
torch::lazy::LazyTensorPtr running_mean_output =
torch::lazy::LazyTensor::Create(torch::lazy::Value(node, 1), input->GetDevice());
torch::lazy::LazyTensorPtr running_var_output = torch::lazy::LazyTensor::Create(torch::lazy::Value(node, 2), input->GetDevice());
return std::make_tuple(std::move(output), std::move(running_mean_output),
std::move(running_var_output));
}
std::tuple<torch::lazy::LazyTensorPtr, torch::lazy::LazyTensorPtr, torch::lazy::LazyTensorPtr> ts_native_batch_norm_backward(
const torch::lazy::LazyTensorPtr& grad_out, const torch::lazy::LazyTensorPtr& input,
const torch::lazy::LazyTensorPtr& weight, const torch::lazy::LazyTensorPtr& running_mean,
const torch::lazy::LazyTensorPtr& running_var, const torch::lazy::LazyTensorPtr& save_mean,
const torch::lazy::LazyTensorPtr& save_invstd, bool training, double eps,
c10::ArrayRef<bool> output_mask) {
torch::lazy::Shape features_shape = BatchNormFeaturesShape(input);
torch::lazy::Value weight_value =
GetIrValueOrDefault(weight, 1, features_shape, input->GetDevice());
torch::lazy::NodePtr node;
if (!running_mean && !running_var) {
node = torch::lazy::MakeNode<TSNativeBatchNormBackward>(
grad_out->GetIrValue(), input->GetIrValue(), weight_value,
save_mean->GetIrValue(), save_invstd->GetIrValue(), training, eps,
std::array<bool, 3>{output_mask[0], output_mask[1], output_mask[2]});
} else {
CHECK(running_mean);
CHECK(running_var);
node = torch::lazy::MakeNode<TSNativeBatchNormBackward>(
grad_out->GetIrValue(), input->GetIrValue(), weight_value,
running_mean->GetIrValue(), running_var->GetIrValue(),
save_mean->GetIrValue(), save_invstd->GetIrValue(), training, eps,
std::array<bool, 3>{output_mask[0], output_mask[1], output_mask[2]});
}
torch::lazy::LazyTensorPtr grad_input = torch::lazy::LazyTensor::Create(torch::lazy::Value(node, 0), input->GetDevice());
torch::lazy::LazyTensorPtr grad_weight = torch::lazy::LazyTensor::Create(torch::lazy::Value(node, 1), input->GetDevice());
torch::lazy::LazyTensorPtr grad_bias = torch::lazy::LazyTensor::Create(torch::lazy::Value(node, 2), input->GetDevice());
return std::make_tuple(std::move(grad_input), std::move(grad_weight),
std::move(grad_bias));
}
torch::lazy::LazyTensorPtr permute(const torch::lazy::LazyTensorPtr& input, c10::ArrayRef<int64_t> dims) {
auto input_shape = input->shape();
torch::lazy::ViewInfo view_info(
torch::lazy::ViewInfo::Type::kPermute, input_shape,
torch::lazy::GetCanonicalDimensionIndices(dims, input_shape.Get().dim()));
return input->CreateViewTensor(std::move(view_info));
}
void copy_(torch::lazy::LazyTensorPtr& input, torch::lazy::LazyTensorPtr& src) {
if (input->GetDevice() == src->GetDevice()) {
torch::lazy::Value copy_value;
if (input->dtype() == src->dtype()) {
copy_value = src->GetIrValue();
} else {
copy_value = torch::lazy::MakeNode<torch::lazy::Cast>(
src->GetIrValue(), input->dtype(), src->dtype());
}
input->SetIrValue(MaybeExpand(copy_value, input->shape()));
} else {
auto input_shape = input->shape();
at::Tensor src_tensor = src->ToTensor(/*detached=*/true);
if (src_tensor.sizes() != input_shape.Get().sizes()) {
src_tensor = src_tensor.expand(input_shape.Get().sizes().vec());
}
input->UpdateFromTensor(std::move(src_tensor), /*sync=*/false);
}
}
torch::lazy::LazyTensorPtr select(const torch::lazy::LazyTensorPtr& input, int64_t dim, int64_t index) {
auto shape = input->shape();
dim = torch::lazy::GetCanonicalDimensionIndex(dim, shape.Get().dim());
torch::lazy::LazyTensorPtr result = narrow(input, dim, index, 1);
auto new_dims = torch::lazy::DropDimensions(shape.Get().sizes(), {dim});
return view(result, new_dims);
}
torch::lazy::LazyTensorPtr slice(const torch::lazy::LazyTensorPtr& input, int64_t dim, int64_t start,
int64_t end, int64_t step) {
auto input_shape = input->shape();
dim = torch::lazy::GetCanonicalDimensionIndex(dim, input_shape.Get().dim());
start =
torch::lazy::GetCanonicalPosition(input_shape.Get().sizes(), dim, start);
end = torch::lazy::GetCanonicalPosition(input_shape.Get().sizes(), dim, end);
// PyTorch allows tensor[-1:0] to return a 0-dim tensor.
if (start > end) {
end = start;
}
step = std::min(step, end - start);
torch::lazy::SelectInfo select = {dim, start, end, step};
torch::lazy::ViewInfo view_info(torch::lazy::ViewInfo::Type::kSelect,
input_shape, select);
return input->CreateViewTensor(std::move(view_info));
}
torch::lazy::LazyTensorPtr squeeze(const torch::lazy::LazyTensorPtr& input) {
auto input_shape = input->shape();
auto output_dimensions = BuildSqueezedDimensions(
input_shape.Get().sizes(), /*squeeze_dim=*/-1);
return view(input, output_dimensions);
}
torch::lazy::LazyTensorPtr squeeze(const torch::lazy::LazyTensorPtr& input, int64_t dim) {
auto input_shape = input->shape();
int64_t squeeze_dim =
torch::lazy::GetCanonicalDimensionIndex(dim, input->shape().Get().dim());
auto output_dimensions =
BuildSqueezedDimensions(input_shape.Get().sizes(), squeeze_dim);
return view(input, output_dimensions);
}
void squeeze_(torch::lazy::LazyTensorPtr& input) {
input->SetIrValue(
torch::lazy::MakeNode<Squeeze>(input->GetIrValue(), -1));
}
void squeeze_(torch::lazy::LazyTensorPtr& input, int64_t dim) {
input->SetIrValue(torch::lazy::MakeNode<Squeeze>(
input->GetIrValue(),
torch::lazy::GetCanonicalDimensionIndex(dim, input->shape().Get().dim())));
}
torch::lazy::LazyTensorPtr transpose(const torch::lazy::LazyTensorPtr& input, int64_t dim0, int64_t dim1) {
auto input_shape = input->shape();
auto permute_dims = torch::lazy::MakeTransposePermutation(
/*dim0=*/dim0, /*dim1=*/dim1, /*rank=*/input_shape.Get().dim());
torch::lazy::ViewInfo view_info(torch::lazy::ViewInfo::Type::kPermute,
input_shape, permute_dims);
return input->CreateViewTensor(std::move(view_info));
}
void transpose_(torch::lazy::LazyTensorPtr& input, int64_t dim0, int64_t dim1) {
auto input_shape = input->shape();
auto permute_dims = torch::lazy::MakeTransposePermutation(
/*dim0=*/dim0, /*dim1=*/dim1, /*rank=*/input_shape.Get().dim());
torch::lazy::ViewInfo view_info(torch::lazy::ViewInfo::Type::kPermute,
input_shape, permute_dims);
return input->ModifyCurrentView(std::move(view_info));
}
torch::lazy::LazyTensorPtr unsqueeze(const torch::lazy::LazyTensorPtr& input, int64_t dim) {
auto input_shape = input->shape();
int64_t squeeze_dim =
torch::lazy::GetCanonicalDimensionIndex(dim, input_shape.Get().dim() + 1);
auto dimensions =
BuildUnsqueezedDimensions(input_shape.Get().sizes(), squeeze_dim);
return view(input, dimensions);
}
void unsqueeze_(torch::lazy::LazyTensorPtr& input, int64_t dim) {
int squeeze_dim = torch::lazy::GetCanonicalDimensionIndex(
dim, input->shape().Get().dim() + 1);
input->SetIrValue(torch::lazy::MakeNode<Unsqueeze>(input->GetIrValue(),
squeeze_dim));
}
torch::lazy::LazyTensorPtr view(const torch::lazy::LazyTensorPtr& input, c10::ArrayRef<int64_t> output_size) {
auto input_shape = input->shape().Get();
torch::lazy::Shape shape = torch::lazy::Shape(
input_shape.scalar_type(), at::infer_size(output_size, input_shape.numel()));
torch::lazy::ViewInfo view_info(torch::lazy::ViewInfo::Type::kReshape,
std::move(shape), input_shape);
return input->CreateViewTensor(std::move(view_info));
}
} // namespace lazy
} // namespace torch

View File

@ -0,0 +1,90 @@
#pragma once
#include <torch/csrc/lazy/core/tensor.h>
namespace torch {
namespace lazy {
//////////////////////////////////////////////////////////////////////////////
// ATEN operators follow here, listed in alphabetical order.
//////////////////////////////////////////////////////////////////////////////
// Takes a slice from the input as R1 at the specified offset and reshapes it
// into the provided size.
torch::lazy::LazyTensorPtr as_strided(const torch::lazy::LazyTensorPtr& input, std::vector<int64_t> size,
std::vector<int64_t> stride,
c10::optional<int64_t> storage_offset);
// In-place version of the method above.
void as_strided_(torch::lazy::LazyTensorPtr& input, std::vector<int64_t> size,
std::vector<int64_t> stride,
c10::optional<int64_t> storage_offset);
torch::lazy::LazyTensorPtr expand(const torch::lazy::LazyTensorPtr& input,
std::vector<int64_t> size);
// Fills the input with the given value.
void fill_(torch::lazy::LazyTensorPtr& input, const at::Scalar& value);
// Returns a new tensor that is a narrowed view of the input in the given
// dimension.
torch::lazy::LazyTensorPtr narrow(const torch::lazy::LazyTensorPtr& input, int64_t dim, int64_t start,
int64_t length);
std::tuple<torch::lazy::LazyTensorPtr, torch::lazy::LazyTensorPtr, torch::lazy::LazyTensorPtr> ts_native_batch_norm(
const torch::lazy::LazyTensorPtr& input, const torch::lazy::LazyTensorPtr& weight, const torch::lazy::LazyTensorPtr& bias,
torch::lazy::LazyTensorPtr& running_mean, torch::lazy::LazyTensorPtr& running_var, bool training,
double momentum, double eps);
std::tuple<torch::lazy::LazyTensorPtr, torch::lazy::LazyTensorPtr, torch::lazy::LazyTensorPtr> ts_native_batch_norm_backward(
const torch::lazy::LazyTensorPtr& grad_out, const torch::lazy::LazyTensorPtr& input,
const torch::lazy::LazyTensorPtr& weight, const torch::lazy::LazyTensorPtr& running_mean,
const torch::lazy::LazyTensorPtr& running_var, const torch::lazy::LazyTensorPtr& save_mean,
const torch::lazy::LazyTensorPtr& save_invstd, bool training, double eps,
c10::ArrayRef<bool> output_mask);
// Permute the dimensions of this tensor according to the given permutation.
torch::lazy::LazyTensorPtr permute(const torch::lazy::LazyTensorPtr& input, c10::ArrayRef<int64_t> dims);
// Repeats the input tensor along each dimension by the given number of
// repeats.
torch::lazy::LazyTensorPtr repeat(const torch::lazy::LazyTensorPtr& input, std::vector<int64_t> repeats);
void copy_(torch::lazy::LazyTensorPtr& input, torch::lazy::LazyTensorPtr& src);
torch::lazy::LazyTensorPtr select(const torch::lazy::LazyTensorPtr& input, int64_t dim, int64_t index);
torch::lazy::LazyTensorPtr slice(const torch::lazy::LazyTensorPtr& input, int64_t dim, int64_t start,
int64_t end, int64_t step);
// Squeeze out all trivial (size 1) dimensions.
torch::lazy::LazyTensorPtr squeeze(const torch::lazy::LazyTensorPtr& input);
// Squeeze out the specified dimension index, if trivial (size 1). Returns
// unchanged input otherwise.
torch::lazy::LazyTensorPtr squeeze(const torch::lazy::LazyTensorPtr& input, int64_t dim);
// In-place versions of the methods above.
void squeeze_(torch::lazy::LazyTensorPtr& input);
void squeeze_(torch::lazy::LazyTensorPtr& input, int64_t dim);
std::tuple<torch::lazy::LazyTensorPtr, torch::lazy::LazyTensorPtr, torch::lazy::LazyTensorPtr> svd(
const torch::lazy::LazyTensorPtr& input,
bool some, bool compute_uv);
// Swap given dimensions of the input.
torch::lazy::LazyTensorPtr transpose(const torch::lazy::LazyTensorPtr& input, int64_t dim0, int64_t dim1);
// In-place version of the method above.
void transpose_(torch::lazy::LazyTensorPtr& input, int64_t dim0, int64_t dim1);
// Insert a dimension of size one at the specified position.
torch::lazy::LazyTensorPtr unsqueeze(const torch::lazy::LazyTensorPtr& input, int64_t dim);
// In-place version of the method above.
void unsqueeze_(torch::lazy::LazyTensorPtr& input, int64_t dim);
// Like reshape, but it returns a view into the original tensor.
torch::lazy::LazyTensorPtr view(const torch::lazy::LazyTensorPtr& input, c10::ArrayRef<int64_t> output_size);
} // namespace lazy
} // namespace torch

View File

@ -0,0 +1,55 @@
#include <torch/csrc/lazy/ts_backend/ts_autograd_functions.h>
#include <ATen/Operators.h>
#include <ATen/native/CPUFallback.h>
#include <torch/csrc/lazy/ts_backend/ts_eager_fallback.h>
namespace torch {
namespace lazy {
at::Tensor MaxPool3dAutogradFunctionTS::forward(
torch::autograd::AutogradContext* ctx, at::Tensor self,
at::IntArrayRef kernel_size, at::IntArrayRef stride,
at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode) {
ctx->saved_data["kernel_size"] = kernel_size;
ctx->saved_data["stride"] = stride;
ctx->saved_data["padding"] = padding;
ctx->saved_data["dilation"] = dilation;
ctx->saved_data["ceil_mode"] = ceil_mode;
auto results = at::native::call_fallback_fn<
&ltc_eager_fallback, ATEN_OP(max_pool3d_with_indices)>::call(self,
kernel_size,
stride,
padding,
dilation,
ceil_mode);
ctx->save_for_backward({self, std::get<1>(results)});
return std::get<0>(results);
}
torch::autograd::variable_list MaxPool3dAutogradFunctionTS::backward(
torch::autograd::AutogradContext* ctx,
torch::autograd::variable_list grad_output) {
auto kernel_size = ctx->saved_data["kernel_size"].toIntList().vec();
auto stride = ctx->saved_data["stride"].toIntList().vec();
auto padding = ctx->saved_data["padding"].toIntList().vec();
auto dilation = ctx->saved_data["dilation"].toIntList().vec();
auto ceil_mode = ctx->saved_data["ceil_mode"].toBool();
auto saved = ctx->get_saved_variables();
auto self = saved[0];
at::Tensor grad;
auto indices = saved[1];
grad = at::native::call_fallback_fn<
&ltc_eager_fallback,
ATEN_OP(max_pool3d_with_indices_backward)>::call(grad_output[0], self,
kernel_size, stride,
padding, dilation,
ceil_mode, indices);
at::Tensor undef;
torch::autograd::variable_list grad_inputs = {grad, undef, undef,
undef, undef, undef};
return grad_inputs;
}
} // namespace lazy
} // namespace torch

View File

@ -0,0 +1,22 @@
#pragma once
#include <torch/csrc/autograd/custom_function.h>
namespace torch {
namespace lazy {
struct MaxPool3dAutogradFunctionTS
: public torch::autograd::Function<MaxPool3dAutogradFunctionTS> {
static at::Tensor forward(torch::autograd::AutogradContext* ctx,
at::Tensor self,
at::IntArrayRef kernel_size,
at::IntArrayRef stride,
at::IntArrayRef padding,
at::IntArrayRef dilation, bool ceil_mode);
static torch::autograd::variable_list backward(
torch::autograd::AutogradContext* ctx,
torch::autograd::variable_list grad_output);
};
} // namespace lazy
} // namespace torch
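A hypothetical invocation through the standard custom-autograd-Function entry point; the actual wiring happens in the TS backend's max_pool3d kernels, not in this header, and all arguments below are stand-ins.

at::Tensor out = torch::lazy::MaxPool3dAutogradFunctionTS::apply(
    self, kernel_size, stride, padding, dilation, ceil_mode);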

View File

@ -0,0 +1,233 @@
#include <torch/csrc/lazy/ts_backend/ts_backend_impl.h>
#include <ATen/Functions.h>
#include <torch/csrc/lazy/backend/backend_device.h>
#include <torch/csrc/lazy/ts_backend/config.h>
#include <torch/csrc/lazy/ts_backend/ts_lowering_context.h>
namespace torch {
namespace lazy {
struct TSBackendDeviceType : public BackendDeviceType {
TSBackendDeviceType() = delete;
TSBackendDeviceType(c10::DeviceType deviceType)
:BackendDeviceType((int8_t)deviceType) {
TORCH_CHECK(deviceType == at::kCPU || deviceType == at::kCUDA);
}
std::string toString() const override {
return c10::DeviceTypeName((c10::DeviceType)type);
}
c10::DeviceType c10Type() const {
return (c10::DeviceType)type;
}
};
class TSBackendImpl : public torch::lazy::BackendImplInterface {
public:
TSBackendImpl() : default_device_type_(at::kCPU) {
// TODO(whc) unify how all our flags are set and parsed as envs
static bool env_use_cuda = std::getenv("LTC_TS_CUDA") != nullptr;
auto type = (env_use_cuda || FLAGS_torch_lazy_ts_cuda) ? at::kCUDA : at::kCPU;
default_device_type_ = TSBackendDeviceType(type);
}
std::unique_ptr<torch::lazy::LoweringContext> CreateLoweringContext(
const std::string& name,
torch::lazy::BackendDevice device,
c10::ArrayRef<torch::lazy::Node*> post_order,
torch::lazy::Util::EmissionMap emit_status) const override {
return std::make_unique<torch::lazy::TSLoweringContext>(
name, device, post_order, emit_status);
}
std::unique_ptr<torch::lazy::LoweringContext> CreateLoweringContext(
const std::string& name,
torch::lazy::BackendDevice device) const override {
return std::make_unique<torch::lazy::TSLoweringContext>(name, device);
}
std::vector<std::string> GetCompilationDevices(
const std::string& device,
c10::ArrayRef<std::string> devices) const override {
return std::vector<std::string>(devices.begin(), devices.end());
}
at::Tensor MakeTensorFromComputationData(
const torch::lazy::BackendDataPtr data,
c10::optional<at::ScalarType> logical_scalar_type) const override {
const auto ts_data = std::static_pointer_cast<TSData>(data);
return ts_data->data();
}
torch::lazy::BackendDataPtr MakeComputationDataFromTensor(
const at::Tensor& tensor,
const torch::lazy::Shape& shape,
const torch::lazy::BackendDevice& device) const override {
at::TensorOptions options = tensor.options().device(
default_device_type_.c10Type(), device.ordinal());
if (tensor.device().type() == default_device_type_.c10Type() &&
default_device_type_.c10Type() == at::kCUDA) {
return std::make_shared<TSData>(
tensor.to(options, /*non_blocking=*/true), shape, device);
} else if (tensor.device().type() == at::kCPU && tensor.numel() == 1) {
// calling .item() on singleton cpu tensor is fast, and using fill is a safe,
// async way to copy cpu to cuda for a single value
auto device_tensor = at::full(tensor.sizes(), tensor.item(), options);
return std::make_shared<TSData>(device_tensor, shape, device);
} else {
return std::make_shared<TSData>(
tensor.to(options, /*non_blocking=*/false), shape, device);
}
}
torch::lazy::BackendDataPtr MakeComputationDataFromScalar(
const at::Scalar& scalar,
const torch::lazy::BackendDevice& device) const override {
return std::make_shared<TSData>(scalar, device);
}
std::string GetComputationBackendText(
const torch::lazy::ComputationPtr computation) const override {
auto ts_computation =
static_cast<torch::lazy::TSComputation*>(computation.get());
return ts_computation->graph()->toString();
}
//////////////computation client interfaces///////////////////////
public:
torch::lazy::BackendDataPtr CreateDataPlaceholder(
const torch::lazy::BackendDevice& device,
const torch::lazy::Shape& shape) const override;
std::vector<torch::lazy::ComputationPtr> Compile(
std::vector<torch::lazy::ComputationPtr> instances) const override;
std::vector<torch::lazy::BackendDataPtr> ExecuteComputation(
torch::lazy::Computation& computation,
c10::ArrayRef<torch::lazy::BackendDataPtr> arguments,
const torch::lazy::BackendDevice& device) const override;
std::shared_ptr<torch::lazy::BackendDeviceType> GetDefaultDeviceType()
const override {
return std::make_shared<BackendDeviceType>(default_device_type_);
}
at::DeviceType EagerFallbackDeviceType() const override;
void SetDefaultDeviceType(std::string type) override {
default_device_type_ = TSBackendDeviceType(c10::Device(type).type());
// The first CUDA usage could happen via lazy tensors. Initialize CUDA here to
// account for that; the at::scalar_tensor constructor triggers everything we
// need.
static auto init_cuda = default_device_type_.c10Type() == at::kCUDA
? c10::optional<at::Tensor>(
at::scalar_tensor(0, at::TensorOptions().device(at::kCUDA)))
: c10::nullopt;
}
std::vector<torch::lazy::BackendDevice> GetBackendDevices() const override;
torch::lazy::BackendDevice GetBackendDevice(
c10::Device device) const override;
void SetRngSeed(size_t seed) const override {
LOG(FATAL) << "Not implemented yet.";
}
// std::map<std::string, Metric> GetMetrics() const override { return {}; }
// MemoryInfo GetMemoryInfo(const std::string& device) override {
// LOG(FATAL) << "Not implemented yet.";
// }
void PrepareToExit() const override;
private:
TSBackendDeviceType default_device_type_;
};
torch::lazy::BackendDataPtr TSBackendImpl::CreateDataPlaceholder(
const torch::lazy::BackendDevice& device,
const torch::lazy::Shape& shape) const {
return std::make_shared<TSData>(shape, device);
}
std::vector<torch::lazy::ComputationPtr> TSBackendImpl::Compile(
std::vector<torch::lazy::ComputationPtr> instances) const {
for (const auto& instance : instances) {
auto ts_computation =
static_cast<torch::lazy::TSComputation*>(instance.get());
}
return instances;
}
std::vector<torch::lazy::BackendDataPtr> TSBackendImpl::ExecuteComputation(
torch::lazy::Computation& computation,
c10::ArrayRef<torch::lazy::BackendDataPtr> arguments,
const torch::lazy::BackendDevice& device) const {
torch::jit::GraphExecutor& graph_executor =
static_cast<torch::lazy::TSComputation&>(computation).graph_executor();
std::vector<torch::jit::IValue> stack;
for (const auto& argument : arguments) {
const auto ts_data = std::static_pointer_cast<TSData>(argument);
if (ts_data->scalar.has_value()) {
stack.emplace_back(ts_data->scalar.value());
} else {
// TODO(whc) should this check be made more general? it's written somewhat
// oddly
CHECK(
(c10::DeviceType)default_device_type_.type != at::kCUDA ||
ts_data->data().device().type() == at::kCUDA);
stack.emplace_back(ts_data->data());
}
}
graph_executor.run(stack);
std::vector<torch::lazy::BackendDataPtr> results;
for (torch::jit::IValue component : stack) {
at::Tensor result = component.toTensor();
at::IntArrayRef result_sizes = result.sizes();
torch::lazy::Shape shape(
result.scalar_type(),
std::vector<int64_t>(result_sizes.begin(), result_sizes.end()));
results.push_back(std::make_shared<TSData>(result, shape, device));
}
return results;
}
std::vector<torch::lazy::BackendDevice> TSBackendImpl::GetBackendDevices()
const {
std::vector<torch::lazy::BackendDevice> devices;
// TODO(whc) figure out how to query available devices from pytorch
devices.emplace_back(GetBackendDevice(c10::Device(c10::kCPU, 0)));
devices.emplace_back(GetBackendDevice(c10::Device(c10::kCUDA, 0)));
return devices;
}
torch::lazy::BackendDevice TSBackendImpl::GetBackendDevice(
c10::Device device) const {
// Note, we ignore the device type specified by the c10::Device since it is
// expected to be a virtual device (lazy::), but we need to change this when
// we support lazy as a mode
return torch::lazy::BackendDevice(GetDefaultDeviceType(), device.index());
}
void TSBackendImpl::PrepareToExit() const {}
c10::DeviceType TSBackendImpl::EagerFallbackDeviceType() const {
// For TS backend, hardware device _is_ eager device
return (c10::DeviceType)GetDefaultDeviceType()->type;
}
torch::lazy::BackendImplInterface* GetTSBackendImpl() {
static TSBackendImpl* ts_backend_impl = new TSBackendImpl();
return ts_backend_impl;
}
void InitTorchScriptBackend() {
static std::unique_ptr<BackendRegistrar> s_registrar;
s_registrar = std::make_unique<BackendRegistrar>(GetTSBackendImpl());
}
} // namespace lazy
} // namespace torch
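A minimal usage sketch, not part of this change: it assumes only the interfaces shown above (InitTorchScriptBackend, torch::lazy::getBackend from backend_interface.h), and the helper name is hypothetical.
#include <torch/csrc/lazy/backend/backend_interface.h>
#include <torch/csrc/lazy/ts_backend/ts_backend_impl.h>

// Illustrative sketch only (hypothetical helper, not part of this diff).
void example_register_and_query_ts_backend() {
  torch::lazy::InitTorchScriptBackend();      // installs TSBackendImpl via BackendRegistrar
  auto* backend = torch::lazy::getBackend();  // the registered implementation
  // GetBackendDevices() above reports CPU:0 and CUDA:0.
  auto devices = backend->GetBackendDevices();
  // A placeholder TSData carries shape/device but no value until Assign() is called.
  auto placeholder = backend->CreateDataPlaceholder(
      devices.front(), torch::lazy::Shape(c10::kFloat, {2, 3}));
  TORCH_CHECK(!placeholder->HasValue());
}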

View File

@ -0,0 +1,50 @@
#include <torch/csrc/lazy/backend/backend_interface.h>
namespace torch {
namespace lazy {
class TORCH_API TSData : public torch::lazy::BackendData {
public:
TSData(const at::Scalar& scalar, const torch::lazy::BackendDevice& device)
: torch::lazy::BackendData(device, torch::lazy::Shape(scalar.type(), {})),
scalar(scalar) {}
TSData(
const at::Tensor& data,
const torch::lazy::Shape& shape,
const torch::lazy::BackendDevice& device)
: torch::lazy::BackendData(device, shape), data_(data) {}
TSData(
const torch::lazy::Shape& shape,
const torch::lazy::BackendDevice& device)
: torch::lazy::BackendData(device, shape) {}
Handle GetHandle() override {
return reinterpret_cast<int64_t>(this);
}
void Assign(const torch::lazy::BackendData& data) override {
data_ = static_cast<const TSData&>(data).data_;
}
bool HasValue() const override {
return data_.defined();
}
at::Tensor data() {
return data_;
}
c10::optional<at::Scalar> scalar;
private:
at::Tensor data_;
};
TORCH_API torch::lazy::BackendImplInterface* GetTSBackendImpl();
TORCH_API void InitTorchScriptBackend();
} // namespace lazy
} // namespace torch
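A short sketch of the three TSData constructors above, illustrative only; at::ones and the backend device query are assumed from ATen and the backend interface, and the helper name is hypothetical.
#include <ATen/ATen.h>
#include <torch/csrc/lazy/backend/backend_interface.h>
#include <torch/csrc/lazy/ts_backend/ts_backend_impl.h>

void example_tsdata_flavors() {
  auto device =
      torch::lazy::getBackend()->GetBackendDevice(c10::Device(c10::kCPU, 0));
  // Scalar-backed: rank-0 shape, no tensor behind it, so HasValue() is false.
  torch::lazy::TSData scalar_data(at::Scalar(1.0), device);
  // Tensor-backed: HasValue() is true because data_ is a defined tensor.
  at::Tensor t = at::ones({2, 2});
  torch::lazy::TSData tensor_data(
      t, torch::lazy::Shape(t.scalar_type(), {2, 2}), device);
  TORCH_CHECK(!scalar_data.HasValue() && tensor_data.HasValue());
  // GetHandle() simply reinterprets the object pointer as an opaque handle.
  auto handle = tensor_data.GetHandle();
  (void)handle;
}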

View File

@ -0,0 +1,311 @@
#include <torch/csrc/lazy/ts_backend/ts_eager_fallback.h>
#include <ATen/Functions.h>
#include <ATen/core/boxing/KernelFunction.h>
#include <ATen/native/CPUFallback.h>
#include <torch/csrc/lazy/backend/backend_interface.h>
#include <torch/csrc/lazy/core/metrics.h>
#include <torch/csrc/lazy/core/tensor.h>
#include <torch/library.h>
#include <sstream>
#include <unordered_map>
namespace torch {
namespace lazy {
namespace {
std::vector<at::Tensor> _to_eager(
at::TensorList tensors,
c10::DeviceType device_type) {
switch (device_type) {
case at::kCPU: {
return at::_to_cpu(tensors);
}
default: {
std::vector<at::Tensor> eager_tensors;
for (const auto& t : tensors) {
c10::TensorOptions options = t.options().device(device_type);
at::Tensor eager_tensor = t.to(
options,
/*non_blocking*/ false,
/*copy*/ false);
eager_tensors.push_back(eager_tensor);
}
return eager_tensors;
}
}
}
// convenience helper for converting tensors to cpu
std::vector<at::Tensor> to_eager(
const at::TensorList& tensors,
c10::DeviceType device_type) {
// We can't just call _to_eager() on the entire list of Tensors because it
// will break on undefined tensors. Separate out undefined tensors first.
std::vector<at::Tensor> eager_tensors(tensors.size());
std::vector<at::Tensor> valid_tensors;
std::vector<bool> to_translate(tensors.size());
for (size_t i = 0; i < tensors.size(); ++i) {
const at::Tensor& tensor = tensors[i];
// Explicitly handling undefined tensors here instead of letting `_to_eager`
// handle it. Otherwise, we'd need to require all backends with their own
// implementation of _to_eager to properly handle undefined tensors.
if (tensor.defined()) {
to_translate[i] = true;
valid_tensors.push_back(tensor);
} else {
eager_tensors[i] = tensor;
}
}
auto eager_valid_tensors = _to_eager(valid_tensors, device_type);
for (size_t i = 0, defined_pos = 0; i < tensors.size(); ++i) {
if (to_translate[i]) {
eager_tensors[i] = std::move(eager_valid_tensors[defined_pos++]);
}
}
return eager_tensors;
}
c10::DispatchKey dispatch_key(c10::DeviceType device_type) {
switch (device_type) {
case at::kCPU: {
return c10::DispatchKey::CPU;
}
case at::kCUDA: {
return c10::DispatchKey::CUDA;
}
default: {
AT_ERROR("Unsupported device type: ", device_type);
}
}
}
c10::optional<c10::Device> compute_target_device(
std::vector<at::Tensor>& t_args,
std::vector<c10::List<at::Tensor>> tlist_args) {
// Decide what device to move the output tensor(s) to.
// The current convention is that we use the first tensor arg to pick the
// device. Barring that, we take the first tensor from a TensorList arg.
if (t_args.size() > 0) {
return t_args[0].device();
} else {
// We need to loop through all of the (potentially multiple) TensorList
// arguments in case, e.g., the first one is empty but the second is not.
for (auto& tens_list : tlist_args) {
for (const auto i : c10::irange(tens_list.size())) {
return tens_list.get(i).device();
}
}
}
return c10::nullopt;
}
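// For example, at::cat invoked with only TensorList arguments picks the device
// of the first element of the first non-empty list; if every list is empty,
// nullopt is returned and the outputs are simply left on the eager device.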
} // namespace
static std::unordered_map<std::string, ::torch::lazy::Counter*>
_eager_fallback_counters;
bool force_eager_fallback(c10::Symbol op) {
static char* force_str = std::getenv("LTC_FORCE_FALLBACK");
if (force_str != nullptr) {
static auto force_sym = c10::Symbol::fromQualString(std::string(force_str));
if (op == force_sym) {
return true;
}
}
return false;
}
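// Example: launching the process with LTC_FORCE_FALLBACK="aten::max_pool3d"
// makes force_eager_fallback(c10::Symbol::fromQualString("aten::max_pool3d"))
// return true, so that single op is routed through the eager fallback even when
// a lazy lowering exists.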
void ltc_eager_fallback(
const c10::OperatorHandle& op,
torch::jit::Stack* stack) {
// TODO(whc) FN_TRACK hasn't been used in LTC so far, but we could land/re-enable it:
// LTC_FN_TRACK(3);
const auto name = c10::toString(op.operator_name());
// Manually applying the TORCH_LAZY_COUNTER macro.
// We need to do it ourselves and explicitly keep a mapping of counters
// because this boxed fallback kernel is used by multiple operators,
// and the macro stamps out a static Counter object with a fixed name
// at the code location that it was called.
if (_eager_fallback_counters.find(name) == _eager_fallback_counters.end()) {
_eager_fallback_counters[name] = new ::torch::lazy::Counter(name);
}
_eager_fallback_counters[name]->AddValue(1);
auto& args = op.schema().arguments();
auto arguments = torch::jit::last(stack, args.size());
// Log each tensor argument.
for (const auto & ivalue : arguments) {
if (ivalue.isTensor()) {
VLOG(3) << ivalue.toTensor().toString();
}
}
// Call the actual boxed CPU fallback.
ts_eager_fallback(
op, stack, torch::lazy::getBackend()->EagerFallbackDeviceType());
}
TORCH_LIBRARY_IMPL(_, Lazy, m) {
m.fallback(torch::CppFunction::makeFromBoxedFunction<&ltc_eager_fallback>());
}
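// The same boxed kernel could also be registered per operator instead of as a
// backend-wide fallback; an illustrative (hypothetical) registration would be:
//   TORCH_LIBRARY_IMPL(aten, Lazy, m) {
//     m.impl("max_pool3d_with_indices",
//            torch::CppFunction::makeFromBoxedFunction<&ltc_eager_fallback>());
//   }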
void ts_eager_fallback(
const c10::OperatorHandle& op,
torch::jit::Stack* stack,
c10::DeviceType device_type) {
auto& schema_args = op.schema().arguments();
const auto num_arguments = schema_args.size();
auto arguments = torch::jit::last(stack, num_arguments);
const auto arguments_begin = stack->size() - num_arguments;
std::vector<at::Tensor> tensor_args;
std::vector<int> tensor_args_indices;
std::vector<c10::List<at::Tensor>> tensorlist_args;
// Step 1: Convert all non-eager tensor inputs into eager tensors and put them
// on the stack at the correct indices.
for (int64_t idx = 0; idx < arguments.size(); ++idx) {
const auto& ivalue = arguments[idx];
if (ivalue.isTensor()) {
tensor_args.push_back(ivalue.toTensor());
tensor_args_indices.push_back(idx);
} else if (ivalue.isTensorList()) {
// Note: we copy each TensorList argument to eager individually out of
// convenience, but XLA would benefit from materializing all tensor and
// TensorList args onto the CPU at the same time. We can improve this if
// we need better perf for XLA's CPU fallbacks.
auto eager_ivalue = c10::IValue(c10::List<at::Tensor>(
to_eager(ivalue.toTensorList().vec(), device_type)));
(*stack)[arguments_begin + idx] = std::move(eager_ivalue);
tensorlist_args.push_back(ivalue.toTensorList());
}
}
// XLA requires all of the tensor arguments to be gathered up and converted to
// CPU together.
auto eager_tensors = to_eager(tensor_args, device_type);
for (auto i = 0; i < tensor_args_indices.size(); ++i) {
auto idx = tensor_args_indices[i];
(*stack)[arguments_begin + idx] = c10::IValue(eager_tensors[i]);
}
// Step 2: Call the underlying eager implementation of the operator
op.redispatchBoxed(c10::DispatchKeySet(dispatch_key(device_type)), stack);
// Step 3: We need to take special care to handle mutable aliases properly:
// If any input tensors are mutable aliases, we need to directly copy the
// updated data on the eager tensors back to the original inputs.
for (int64_t i = 0; i < tensor_args_indices.size(); ++i) {
auto tensor_idx = tensor_args_indices[i];
const auto alias_info = schema_args[tensor_idx].alias_info();
if (alias_info != nullptr && alias_info->isWrite()) {
at::_copy_from_and_resize(eager_tensors[i], tensor_args[i]);
}
}
// Step 4: Convert any eager output tensors back to the original input device.
// For mutable alias'd outputs, we also need to take special care
// to move the ORIGINAL input tensor back onto the stack, in place of
// the temporary eager output tensor that we created.
//
// Note [Eager Fallback Does Not Handle View Operators]
// Also note that we are incapable of handling immutable aliases properly.
// Why?
// Schemas with an immutable alias'd tensor output correspond to view
// operators. For example, the `view_as` schema from native_functions.yaml:
// `view_as(Tensor(a) self, Tensor other) -> Tensor(a)`
// We can't handle these ops properly, because view ops are supposed to return
// a NEW tensor that shares the SAME storage as the original tensor.
// However, the new tensor that we created cannot share the same storage,
// since it lives on the eager CPU / CUDA device and the original tensor lives
// on a different device. Because of that, we warn if someone attempts to call
// the eager fallback on a view operator (this is to maintain BC for view ops
// for XLA that fall back to CPU).
const auto& schema_returns = op.schema().returns();
const auto& num_returns = schema_returns.size();
auto returns = torch::jit::last(stack, num_returns);
const auto returns_begin = stack->size() - num_returns;
for (int64_t idx = 0; idx < returns.size(); ++idx) {
if (returns[idx].isTensor()) {
const auto& return_tens = returns[idx].toTensor();
if (return_tens.defined()) {
const auto alias_info = schema_returns[idx].alias_info();
if (alias_info != nullptr && alias_info->isWrite()) {
// Case (1): mutable alias case. Move the input ivalue directly onto
// the stack in place of the existing eager output tensor.
bool found_alias = false;
// We could store some extra metadata on the function schema to avoid
// the loop here if we need to improve perf.
for (int64_t i = 0; i < tensor_args_indices.size(); ++i) {
auto input_tensor_idx = tensor_args_indices[i];
const auto& input_tensor = eager_tensors[i];
const auto input_alias_info =
schema_args[input_tensor_idx].alias_info();
if (input_tensor.defined() && input_alias_info != nullptr &&
*alias_info == *input_alias_info) {
// We've found the original input tensor that aliases with the
// current output. Wrap it in an IValue and put it directly on the
// stack.
(*stack)[returns_begin + idx] = c10::IValue(tensor_args[i]);
found_alias = true;
break;
}
}
TORCH_CHECK(
found_alias,
"The operator ",
op.schema().operator_name(),
" appears to have invalid alias information. ",
"Found a return tensor argument with a mismatched "
"mutable alias: ",
schema_returns[idx]);
} else {
c10::optional<c10::Device> tgt_device =
compute_target_device(tensor_args, tensorlist_args);
if (alias_info != nullptr && !alias_info->isWrite()) {
// immutable alias (view) case: Warn here, since we're copying and
// not creating a view.
// If this operator is needed, the backend should provide a kernel
// for it.
// See Note [Eager Fallback Does Not Handle View Operators]
std::stringstream dev_str;
if (tgt_device) {
dev_str << *tgt_device;
} else {
dev_str << "<none>";
}
TORCH_WARN(
false,
"The operator ",
op.schema().operator_name(),
" appears to be a view operator, ",
"but it has no implementation for the backend \"",
dev_str.str(),
"\". View operators don't support ",
"falling back to run on the eager, since the tensor's "
"storage cannot be shared across devices.");
}
// Case (2): copy case. Copy the eager output tensor to the original
// device.
// We technically might not have a target device, e.g. if you call
// torch.cat() with an empty list. In that case, we shouldn't have any
// tensors to schlep across devices anyway.
if (tgt_device) {
(*stack)[returns_begin + idx] =
c10::IValue(returns[idx].toTensor().to(*tgt_device));
}
}
}
}
}
}
} // namespace lazy
} // namespace torch

View File

@ -0,0 +1,21 @@
#pragma once
#include <ATen/core/dispatch/Dispatcher.h>
#include <ATen/core/ivalue.h>
#include <ATen/core/stack.h>
namespace torch {
namespace lazy {
bool force_eager_fallback(c10::Symbol op);
void ltc_eager_fallback(
const c10::OperatorHandle& op,
torch::jit::Stack* stack);
void ts_eager_fallback(
const c10::OperatorHandle& op,
torch::jit::Stack* stack,
c10::DeviceType device_type);
} // namespace lazy
} // namespace torch

View File

@ -0,0 +1,69 @@
#include <c10/core/ScalarType.h>
#include <torch/csrc/lazy/ts_backend/ts_backend_impl.h>
#include <torch/csrc/lazy/ts_backend/ts_lowering_context.h>
#include <torch/csrc/lazy/ts_backend/ts_node.h>
namespace torch {
namespace lazy {
TSLoweringContext::TSLoweringContext(
const std::string& name,
BackendDevice device)
: torch::lazy::LoweringContext(name, device),
graph_(std::make_shared<torch::jit::Graph>()) {
lowering_ = TSNodeLoweringInterface::Create(this);
}
TSLoweringContext::TSLoweringContext(
const std::string& name,
BackendDevice device,
c10::ArrayRef<Node*> post_order,
Util::EmissionMap emit_status)
: torch::lazy::LoweringContext(name, device, post_order, emit_status),
graph_(std::make_shared<torch::jit::Graph>()) {
lowering_ = TSNodeLoweringInterface::Create(this);
for (auto node : post_order) {
bool ok = lowering_->Lower(node);
CHECK(ok) << "Failed to lower: " << *node;
}
}
void TSLoweringContext::AssignOutputOp(
const Output& output,
torch::jit::Value* op) {
auto ts_node = NodeCast<TsNode>(output.node, output.node->op());
if (!ts_node->getPythonStacktrace().empty()) {
op->node()->s_(c10::Symbol::attr("source"), ts_node->getPythonStacktrace());
}
emitted_outputs_[output] = op;
}
torch::jit::Value* TSLoweringContext::GetParameter(BackendDataPtr data) {
const auto ts_data =
std::static_pointer_cast<TSData>(data);
BackendData::Handle handle = ts_data->GetHandle();
auto it = parameters_map_.find(handle);
if (it == parameters_map_.end()) {
torch::jit::Value* param =
graph_->addInput(c10::str("p", parameters_.size()));
if (ts_data->scalar.has_value()) {
auto scalarType = ts_data->scalar.value().type();
if (isFloatingType(scalarType)) {
param->setType(c10::FloatType::get());
} else if (isIntegralType(scalarType) || (scalarType == c10::kBool)) {
param->setType(c10::IntType::get());
} else {
TORCH_CHECK(
false, "Unhandled scalar type: ", c10::toString(scalarType));
}
}
it = parameters_map_.emplace(handle, Parameter{param, parameters_.size()})
.first;
parameters_.push_back(ts_data);
}
parameter_sequence_.push_back(it->second.index);
return it->second.param;
}
} // namespace lazy
} // namespace torch
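A minimal sketch, illustrative only: it assumes a registered TS backend and an already-built post-order of lazy IR nodes, and uses a hypothetical helper name. Constructing the context lowers each node into its TorchScript graph, with DeviceData operands surfaced as graph inputs p0, p1, ... by GetParameter().
#include <iostream>
#include <torch/csrc/lazy/backend/backend_interface.h>
#include <torch/csrc/lazy/ts_backend/ts_lowering_context.h>

void example_dump_lowered_graph(
    c10::ArrayRef<torch::lazy::Node*> post_order,
    torch::lazy::Util::EmissionMap emit_status) {
  auto device =
      torch::lazy::getBackend()->GetBackendDevice(c10::Device(c10::kCPU, 0));
  torch::lazy::TSLoweringContext loctx(
      "example", device, post_order, emit_status);
  // Print the TorchScript IR produced by lowering the node post-order.
  std::cout << loctx.graph()->toString() << std::endl;
}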

View File

@ -1,9 +1,9 @@
#pragma once
#include <torch/csrc/api/include/torch/jit.h>
#include <torch/csrc/jit/runtime/graph_executor.h>
#include <torch/csrc/lazy/backend/lowering_context.h>
#include <torch/csrc/lazy/ts_backend/ts_node_lowering.h>
#include <torch/csrc/jit/runtime/graph_executor.h>
#include <torch/csrc/api/include/torch/jit.h>
namespace torch {
namespace lazy {
@ -22,7 +22,8 @@ class TORCH_API TSNodeLoweringInterface {
virtual bool Lower(const Node* node) = 0;
static std::unique_ptr<TSNodeLoweringInterface> Create(LoweringContext* loctx);
static std::unique_ptr<TSNodeLoweringInterface> Create(
LoweringContext* loctx);
};
class TORCH_API TSComputation : public Computation {
@ -34,7 +35,9 @@ class TORCH_API TSComputation : public Computation {
}
}
int parameters_size() const override { return parameter_names_.size(); }
int parameters_size() const override {
return parameter_names_.size();
}
const std::vector<Shape>& parameter_shapes() const override {
throw std::runtime_error(
@ -52,9 +55,13 @@ class TORCH_API TSComputation : public Computation {
return result_shape_;
}
std::shared_ptr<torch::jit::Graph> graph() const { return graph_; }
std::shared_ptr<torch::jit::Graph> graph() const {
return graph_;
}
torch::jit::GraphExecutor& graph_executor() { return graph_executor_; }
torch::jit::GraphExecutor& graph_executor() {
return graph_executor_;
}
private:
std::shared_ptr<torch::jit::Graph> graph_;
@ -68,12 +75,15 @@ class TORCH_API TSLoweringContext : public LoweringContext {
public:
TSLoweringContext(const std::string& name, const BackendDevice device);
TSLoweringContext(const std::string& name, BackendDevice device,
c10::ArrayRef<Node*> post_order,
Util::EmissionMap emit_status);
TSLoweringContext(
const std::string& name,
BackendDevice device,
c10::ArrayRef<Node*> post_order,
Util::EmissionMap emit_status);
// TODO(whc) replace these when real impl lands;
// I am just landing the interface in this diff, but MSVC won't allow undefined virtual funcs
// I am just landing the interface in this diff, but MSVC won't allow
// undefined virtual funcs
Shape GetResultShape(size_t index) const override {
TORCH_INTERNAL_ASSERT(false, "not implemented");
}
@ -129,7 +139,9 @@ class TORCH_API TSLoweringContext : public LoweringContext {
// held in data.
torch::jit::Value* GetParameter(BackendDataPtr data);
std::shared_ptr<torch::jit::Graph> graph() const { return graph_; }
std::shared_ptr<torch::jit::Graph> graph() const {
return graph_;
}
private:
struct Parameter {
@ -149,5 +161,5 @@ class TORCH_API TSLoweringContext : public LoweringContext {
std::unique_ptr<TSNodeLoweringInterface> lowering_;
};
} // namespace lazy
} // namespace torch
} // namespace lazy
} // namespace torch

View File

@ -0,0 +1,525 @@
#include <ATen/Operators.h>
#include <ATen/Functions.h>
#include <ATen/MetaFunctions.h>
#include <ATen/native/BinaryOps.h>
#include <ATen/native/CPUFallback.h>
#include <torch/csrc/lazy/core/helpers.h>
#include <torch/csrc/lazy/core/metrics.h>
#include <torch/csrc/lazy/core/shape_inference.h>
#include <torch/csrc/lazy/core/tensor_util.h>
#include <torch/csrc/lazy/core/view_ops/as_strided.h>
#include <torch/csrc/lazy/core/tensor_impl.h>
#include <torch/csrc/lazy/generated/LazyNativeFunctions.h>
#include <torch/csrc/lazy/ts_backend/config.h>
#include <torch/csrc/lazy/ts_backend/ts_eager_fallback.h>
#include <torch/csrc/lazy/ts_backend/tensor_aten_ops.h>
#include <torch/csrc/lazy/ts_backend/ts_autograd_functions.h>
#include <torch/csrc/lazy/ts_backend/ops/random_ops.h>
#include <torch/csrc/lazy/ts_backend/ops/to_copy.h>
#include <torch/library.h>
namespace torch {
namespace lazy {
namespace {
at::Tensor CreateLtcTensor(const at::Tensor& tensor,
const c10::optional<torch::lazy::BackendDevice>& device) {
if (tensor.defined() && device) {
return torch::lazy::CreateAtenFromLtcTensor(torch::lazy::LazyTensor::Create(tensor, *device));
}
return tensor;
}
c10::optional<torch::lazy::BackendDevice> GetLtcDevice(const c10::optional<c10::Device>& device) {
if (!device) {
return c10::nullopt;
}
if (device->type() != at::kLazy) {
return c10::nullopt;
}
return torch::lazy::atenDeviceToBackendDevice(*device);
}
} // namespace
at::Tensor LazyNativeFunctions::alias(const at::Tensor& self) {
TORCH_LAZY_FN_COUNTER("lazy::");
return self;
}
at::Tensor LazyNativeFunctions::as_strided(
const at::Tensor& self, at::IntArrayRef size, at::IntArrayRef stride,
c10::optional<int64_t> storage_offset) {
TORCH_LAZY_FN_COUNTER("lazy::");
torch::lazy::LazyTensorPtr self_tensor = torch::lazy::TryGetLtcTensor(self);
auto xsize = torch::lazy::ToI64Vector(size);
auto xstride = torch::lazy::ToI64Vector(stride);
if (!torch::lazy::AsStrided::StrideIsSupported(xstride)) {
return at::native::call_fallback_fn<
&ltc_eager_fallback, ATEN_OP(as_strided)>::call(self, size, stride,
storage_offset);
}
return torch::lazy::CreateAtenFromLtcTensor(torch::lazy::as_strided(
self_tensor, std::move(xsize), std::move(xstride), storage_offset));
}
const at::Tensor& LazyNativeFunctions::as_strided_(
const at::Tensor& self, at::IntArrayRef size, at::IntArrayRef stride,
c10::optional<int64_t> storage_offset) {
TORCH_LAZY_FN_COUNTER("lazy::");
auto self_tensor = torch::lazy::TryGetLtcTensor(self);
auto xsize = torch::lazy::ToI64Vector(size);
auto xstride = torch::lazy::ToI64Vector(stride);
if (!torch::lazy::AsStrided::StrideIsSupported(xstride)) {
return at::native::call_fallback_fn<
&ltc_eager_fallback, ATEN_OP(as_strided_)>::call(self, size, stride,
storage_offset);
}
torch::lazy::as_strided_(self_tensor, std::move(xsize),
std::move(xstride), storage_offset);
return self;
}
at::Tensor LazyNativeFunctions::clone(const at::Tensor & self, c10::optional<at::MemoryFormat> memory_format) {
auto self_lt = torch::lazy::TryGetLtcTensor(self);
return torch::lazy::CreateAtenFromLtcTensor(self_lt->Create(self_lt->GetIrValue(), self_lt->GetDevice()));
}
at::Tensor LazyNativeFunctions::_copy_from(const at::Tensor& self,
const at::Tensor& dst,
bool non_blocking) {
TORCH_LAZY_FN_COUNTER("lazy::");
auto dst_tensor = torch::lazy::TryGetLtcTensor(dst);
auto self_tensor = torch::lazy::TryGetLtcTensor(self);
if (!self_tensor) {
// providing a new 'eager' value (self) for an existing lazy tensor (dst)
static bool sync_update = FLAGS_torch_lazy_ts_tensor_update_sync;
CHECK(dst_tensor);
dst_tensor->UpdateFromTensor(self, /*sync=*/sync_update);
} else if (!dst_tensor) {
// materializing a lazy tensor (self) and copying its value into eager tensor (dst)
// detached=false lets us skip a copy in `ToTensor`, which should be safe
// because we are only going to use the tensor for dst.copy_()
CHECK(self_tensor);
at::Tensor tensor = self_tensor->ToTensor(/*detached=*/false);
at::Tensor typed_tensor =
torch::lazy::CopyTensor(tensor, dst.scalar_type(), /*copy=*/false);
dst.resize_as_(typed_tensor).copy_(typed_tensor);
} else {
// Copying one lazy tensor to another
if (!dst_tensor->CurrentIrValue()) {
// if dest is not backed by IR (e.g. result of some lazy operation),
// then it should have at::Tensor data backing it instead
auto dst_tensor_data = dst_tensor->CurrentTensorData();
CHECK(dst_tensor_data);
auto src_tensor_data = self_tensor->CurrentTensorData();
if (src_tensor_data) {
// both src/dst are simply backed by at::Tensor data (no IR); do a straightforward copy
dst_tensor_data->copy_(*src_tensor_data);
} else {
// src needs to be materialized before its result can be used for a copy into dst
// since we use the src tensor only for making a copy, we don't need to detach it
// note: it would be even more efficient if we could cause ToTensor to materialize the
// value directly into dst's buffer (that would need to be detached though).
dst_tensor_data->copy_(self_tensor->ToTensor(/*detached=*/false));
}
} else {
copy_(dst_tensor, self_tensor);
auto* impl = dynamic_cast<torch::lazy::LTCTensorImpl*>(dst.unsafeGetTensorImpl());
impl->set_tensor(dst_tensor);
}
}
return dst;
}
at::Tensor LazyNativeFunctions::_copy_from_and_resize(const at::Tensor& self,
const at::Tensor& dst) {
TORCH_LAZY_FN_COUNTER("lazy::");
auto dst_tensor = torch::lazy::TryGetLtcTensor(dst);
auto self_tensor = torch::lazy::TryGetLtcTensor(self);
if (!self_tensor) {
CHECK(dst_tensor);
dst_tensor->UpdateFromTensorOut(self);
} else if (!dst_tensor) {
CHECK(self_tensor);
at::Tensor tensor = self_tensor->ToTensor(/*detached=*/true);
at::Tensor typed_tensor =
torch::lazy::CopyTensor(tensor, dst.scalar_type(), /*copy=*/false);
dst.resize_as_(typed_tensor).copy_(typed_tensor);
} else {
// at this point we know dst is a lazy tensor
auto* dest_impl =
dynamic_cast<torch::lazy::LTCTensorImpl*>(dst.unsafeGetTensorImpl());
dest_impl->tensor()->UpdateFromTensorOut(self_tensor);
dest_impl->force_refresh_sizes();
}
return dst;
}
at::Tensor LazyNativeFunctions::_to_copy(const at::Tensor & self,
c10::optional<at::ScalarType> dtype,
c10::optional<at::Layout> layout,
c10::optional<at::Device> device,
c10::optional<bool> pin_memory,
bool non_blocking,
c10::optional<at::MemoryFormat> memory_format) {
if (force_eager_fallback(at::aten::_to_copy)) {
TORCH_INTERNAL_ASSERT(false,
"Fallback is currently impossible for _to_copy since the fallback helper itself reinvokes _to_copy");
}
auto options = self.options();
if (dtype) {
// I put each of these setters in a conditional instead of doing
// `self.options().dtype(dtype).layout(layout)...`, because calling
// .dtype(nullopt) on an options() that already has a dtype appears to wipe it.
options = options.dtype(dtype);
}
if (layout) {
options = options.layout(layout);
}
if (memory_format) {
options = options.memory_format(memory_format);
}
if (pin_memory) {
// TODO(whc) can we honor 'pin_memory' in some/all cases?
options = options.pinned_memory(pin_memory);
TORCH_WARN_ONCE("Pinned memory used in lazy _to_copy, check if the behavior is as intended");
}
TORCH_LAZY_FN_COUNTER("lazy::");
auto lazy_self = torch::lazy::TryGetLtcTensor(self);
if (!lazy_self && device && device->type() == c10::kLazy) {
// Case 1: eager->lazy (we create a new lazy tensor)
auto eager_tensor = self.to(options, /*non_blocking=*/non_blocking, /*copy=*/true);
lazy_self = torch::lazy::GetOrCreateLtcTensor(eager_tensor,
torch::lazy::atenDeviceToBackendDevice(*device));
return torch::lazy::CreateAtenFromLtcTensor(lazy_self);
} else if(device && device->type() != c10::kLazy) {
// Case 2: lazy->eager (forces a graph break since we are materializing a tensor)
TORCH_INTERNAL_ASSERT(lazy_self);
auto eager_tensor = lazy_self->ToTensor(/*detached=*/true);
options = options.device(device);
auto moved_eager_tensor = eager_tensor.to(options, /*non_blocking=*/non_blocking, /*copy=*/true);
return moved_eager_tensor;
} else if (device &&
device->type() == c10::kLazy &&
device->has_index() &&
device->index() != self.device().index()) {
// Case 3: lazy:0 -> lazy:1
// TODO(whc) what do we actually want to do here?
// option 1: materialize, move eager tensor, create new lazy tensor
// - this should be our default, as it is what would happen before we implemented _to_copy
// - actually combines case 1 + case 2
// option 2: support multiple devices inside one lazy/TS executor (case 4)
// - but: we may have other assumptions that there is just one device per executor? so don't take this lightly
TORCH_INTERNAL_ASSERT(lazy_self);
auto eager_tensor = lazy_self->ToTensor(/*detached=*/true);
// we move the eager tensor to the 'eager' equivalent of our lazy device
// e.g. if our device is lazy:1, the backend maps that to cuda:1, which is what we use
auto eager_device = c10::Device(torch::lazy::getBackend()->EagerFallbackDeviceType(), device->index());
options = options.device(eager_device);
auto moved_eager_tensor = eager_tensor.to(options, /*non_blocking=*/false, /*copy=*/true);
lazy_self = torch::lazy::GetOrCreateLtcTensor(moved_eager_tensor,
torch::lazy::atenDeviceToBackendDevice(eager_device));
return torch::lazy::CreateAtenFromLtcTensor(lazy_self);
} else {
// Case 4: lazy->lazy (special case: keep the _to_copy INSIDE the lazy graph)
// Note: captured _to_copy will be executed with real eager tensors, not lazy tensors.
// We DO NOT want to burn 'lazy:0' as the device into this captured IR, or we will try to
// convert an eager tensor back to a lazy one inside the torchscript executor.
// lazy:0 -> lazy:1 is handled in case 3, so we can safely drop the device argument.
device = c10::nullopt;
auto shapes = torch::lazy::compute_shape__to_copy(self, dtype, layout, device, pin_memory, non_blocking, memory_format);
TORCH_INTERNAL_ASSERT(shapes.size() == 1);
auto node = torch::lazy::MakeNode<ToCopy>(lazy_self->GetIrValue(),
dtype,
layout,
device,
pin_memory,
non_blocking,
memory_format,
std::move(shapes));
auto result = torch::lazy::CreateAtenFromLtcTensor(
torch::lazy::LazyTensor::Create(std::move(node), lazy_self->GetDevice()));
return result;
}
};
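// Rough illustrations of the cases above (sketch only):
//   at::Tensor x = at::randn({2, 2});              // eager tensor
//   auto y = x.to(x.options().device(at::kLazy));  // case 1: eager -> lazy
//   auto z = y.to(y.options().device(at::kCPU));   // case 2: lazy -> eager (graph break)
//   auto w = y.to(at::kDouble);                    // case 4: stays lazy; a ToCopy node is recorded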
at::Tensor LazyNativeFunctions::empty(
at::IntArrayRef size, c10::optional<at::ScalarType> dtype,
c10::optional<at::Layout> layout, c10::optional<at::Device> device,
c10::optional<bool> pin_memory,
c10::optional<at::MemoryFormat> memory_format) {
const auto device_type = torch::lazy::getBackend()->EagerFallbackDeviceType();
at::TensorOptions options = at::TensorOptions()
.device(c10::Device(device_type))
.layout(layout)
.pinned_memory(pin_memory)
.dtype(dtype);
auto x_result = at::empty(size, options, memory_format);
return CreateLtcTensor(x_result, GetLtcDevice(device));
}
at::Tensor LazyNativeFunctions::empty_strided(
at::IntArrayRef size, at::IntArrayRef stride,
c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout,
c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
TORCH_LAZY_FN_COUNTER("lazy::");
at::Tensor t = empty(size, dtype, layout, device, pin_memory, c10::nullopt);
return LazyNativeFunctions::as_strided(
t, size, stride, /*storage_offset=*/0);
}
at::Tensor LazyNativeFunctions::expand(const at::Tensor& self,
at::IntArrayRef size, bool implicit) {
TORCH_LAZY_FN_COUNTER("lazy::");
return torch::lazy::CreateAtenFromLtcTensor(torch::lazy::expand(
torch::lazy::TryGetLtcTensor(self), size.vec()));
}
at::Tensor& LazyNativeFunctions::fill_(at::Tensor& self,
const at::Scalar& value) {
TORCH_LAZY_FN_COUNTER("lazy::");
auto self_tensor = torch::lazy::TryGetLtcTensor(self);
torch::lazy::fill_(self_tensor, value);
return self;
}
at::Tensor LazyNativeFunctions::max_pool3d(
const at::Tensor& self, at::IntArrayRef kernel_size, at::IntArrayRef stride,
at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode) {
return torch::lazy::MaxPool3dAutogradFunctionTS::apply(
self, kernel_size, stride, padding, dilation, ceil_mode);
}
std::tuple<at::Tensor, at::Tensor, at::Tensor>
LazyNativeFunctions::native_batch_norm(
const at::Tensor& input, const c10::optional<at::Tensor>& weight,
const c10::optional<at::Tensor>& bias,
const c10::optional<at::Tensor>& running_mean,
const c10::optional<at::Tensor>& running_var, bool training,
double momentum, double eps) {
TORCH_LAZY_FN_COUNTER("lazy::");
auto input_tensor = torch::lazy::TryGetLtcTensor(input);
const torch::lazy::BackendDevice& device = input_tensor->GetDevice();
auto running_mean_tensor =
GetOrCreateLtcTensor(running_mean, device);
auto running_var_tensor =
GetOrCreateLtcTensor(running_var, device);
auto outputs = ts_native_batch_norm(
torch::lazy::TryGetLtcTensor(input), GetOrCreateLtcTensor(weight, device),
GetOrCreateLtcTensor(bias, device), running_mean_tensor,
running_var_tensor, training, momentum, eps);
return std::make_tuple(torch::lazy::CreateAtenFromLtcTensor(std::get<0>(outputs)),
torch::lazy::CreateAtenFromLtcTensor(std::get<1>(outputs)),
torch::lazy::CreateAtenFromLtcTensor(std::get<2>(outputs)));
}
std::tuple<at::Tensor, at::Tensor, at::Tensor>
LazyNativeFunctions::native_batch_norm_backward(
const at::Tensor& grad_out, const at::Tensor& input,
const c10::optional<at::Tensor>& weight,
const c10::optional<at::Tensor>& running_mean,
const c10::optional<at::Tensor>& running_var,
const c10::optional<at::Tensor>& save_mean,
const c10::optional<at::Tensor>& save_invstd, bool train, double eps,
std::array<bool, 3> output_mask) {
TORCH_LAZY_FN_COUNTER("lazy::");
auto grad_out_tensor = torch::lazy::TryGetLtcTensor(grad_out);
const torch::lazy::BackendDevice& device = grad_out_tensor->GetDevice();
torch::lazy::LazyTensorPtr null_tensor;
bool running_stats = running_mean && running_mean->defined();
CHECK_EQ(running_var && running_var->defined(), running_stats);
auto gradients = ts_native_batch_norm_backward(
torch::lazy::TryGetLtcTensor(grad_out), torch::lazy::TryGetLtcTensor(input),
GetOrCreateLtcTensor(weight, device),
running_stats ? GetOrCreateLtcTensor(running_mean, device)
: null_tensor,
running_stats ? GetOrCreateLtcTensor(running_var, device)
: null_tensor,
GetOrCreateLtcTensor(save_mean, device),
GetOrCreateLtcTensor(save_invstd, device), train, eps,
output_mask);
at::Tensor undefined;
return std::make_tuple(
output_mask[0] ? torch::lazy::CreateAtenFromLtcTensor(std::get<0>(gradients))
: undefined,
output_mask[1] ? torch::lazy::CreateAtenFromLtcTensor(std::get<1>(gradients))
: undefined,
output_mask[2] ? torch::lazy::CreateAtenFromLtcTensor(std::get<2>(gradients))
: undefined);
}
// We need to explicitly override max pooling operators and just call the
// fallback for them because we've customized the autograd function for them
// (backward needs saved indices from forward).
std::tuple<at::Tensor, at::Tensor> LazyNativeFunctions::max_pool3d_with_indices(
const at::Tensor& self, at::IntArrayRef kernel_size, at::IntArrayRef stride,
at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode) {
return at::native::call_fallback_fn<
&ltc_eager_fallback, ATEN_OP(max_pool3d_with_indices)>::call(self,
kernel_size,
stride,
padding,
dilation,
ceil_mode);
}
at::Tensor LazyNativeFunctions::max_pool3d_with_indices_backward(
const at::Tensor& grad_output, const at::Tensor& self,
at::IntArrayRef kernel_size, at::IntArrayRef stride,
at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode,
const at::Tensor& indices) {
return at::native::call_fallback_fn<
&ltc_eager_fallback,
ATEN_OP(max_pool3d_with_indices_backward)>::call(grad_output, self,
kernel_size, stride,
padding, dilation,
ceil_mode, indices);
}
at::Tensor & LazyNativeFunctions::normal_(at::Tensor & self, double mean, double std, c10::optional<at::Generator> generator) {
// Unconditionally fall back.
// Implementing normal_ via lazy tensors caused differences in results compared to eager.
return at::native::call_fallback_fn<&ltc_eager_fallback, ATEN_OP(normal_)>::call(self, mean, std, generator);
// if (force_eager_fallback(c10::Symbol::fromQualString("aten::normal_"))) {
// return at::native::call_fallback_fn<&ltc_eager_fallback, ATEN_OP(normal_)>::call(self, mean, std, generator);
// }
// if (generator.has_value()) {
// return at::native::call_fallback_fn<&ltc_eager_fallback, ATEN_OP(normal_)>::call(self, mean, std, generator);
// }
// TORCH_LAZY_FN_COUNTER("lazy::");
// auto device = bridge::GetBackendDevice(self);
// LazyTensor lazy_self = GetLtcTensorOrCreateForWrappedNumber(self, *device);
// std::vector<torch::lazy::Shape> shapes = {torch::lazy::Shape(self.scalar_type(), self.sizes().vec())};
// auto node = torch::lazy::MakeNode<Normal>(lazy_self.GetIrValue(), mean, std, std::move(shapes));
// lazy_self.SetInPlaceIrValue(node);
// return self;
};
at::Tensor LazyNativeFunctions::permute(const at::Tensor& self,
at::IntArrayRef dims) {
TORCH_LAZY_FN_COUNTER("lazy::");
auto self_tensor = torch::lazy::TryGetLtcTensor(self);
return torch::lazy::CreateAtenFromLtcTensor(torch::lazy::permute(
self_tensor, torch::lazy::ToI64Vector(dims)));
}
at::Tensor LazyNativeFunctions::select(const at::Tensor& self, int64_t dim,
int64_t index) {
TORCH_LAZY_FN_COUNTER("lazy::");
return torch::lazy::CreateAtenFromLtcTensor(
torch::lazy::select(torch::lazy::TryGetLtcTensor(self), dim, index));
}
at::Tensor LazyNativeFunctions::slice(const at::Tensor& self, int64_t dim,
c10::optional<int64_t> start,
c10::optional<int64_t> end,
int64_t step) {
int64_t start_val = start.has_value() ? start.value() : 0;
int64_t end_val = end.has_value() ? end.value() : INT64_MAX;
TORCH_LAZY_FN_COUNTER("lazy::");
return torch::lazy::CreateAtenFromLtcTensor(torch::lazy::slice(
torch::lazy::TryGetLtcTensor(self), dim, start_val, end_val, step));
}
at::Tensor LazyNativeFunctions::squeeze(const at::Tensor& self) {
TORCH_LAZY_FN_COUNTER("lazy::");
return torch::lazy::CreateAtenFromLtcTensor(
torch::lazy::squeeze(torch::lazy::TryGetLtcTensor(self)));
}
at::Tensor LazyNativeFunctions::squeeze(const at::Tensor& self, int64_t dim) {
TORCH_LAZY_FN_COUNTER("lazy::");
return torch::lazy::CreateAtenFromLtcTensor(
torch::lazy::squeeze(torch::lazy::TryGetLtcTensor(self), dim));
}
at::Tensor& LazyNativeFunctions::squeeze_(at::Tensor& self) {
TORCH_LAZY_FN_COUNTER("lazy::");
auto self_tensor = torch::lazy::TryGetLtcTensor(self);
torch::lazy::squeeze_(self_tensor);
return self;
}
at::Tensor& LazyNativeFunctions::squeeze_(at::Tensor& self, int64_t dim) {
TORCH_LAZY_FN_COUNTER("lazy::");
auto self_tensor = torch::lazy::TryGetLtcTensor(self);
torch::lazy::squeeze_(self_tensor, dim);
return self;
}
at::Tensor LazyNativeFunctions::t(const at::Tensor& self) {
TORCH_LAZY_FN_COUNTER("lazy::");
return torch::lazy::CreateAtenFromLtcTensor(
torch::lazy::transpose(torch::lazy::TryGetLtcTensor(self), 0, 1));
}
at::Tensor& LazyNativeFunctions::t_(at::Tensor& self) {
TORCH_LAZY_FN_COUNTER("lazy::");
auto self_tensor = torch::lazy::TryGetLtcTensor(self);
torch::lazy::transpose_(self_tensor, 0, 1);
return self;
}
at::Tensor LazyNativeFunctions::transpose(const at::Tensor& self, int64_t dim0,
int64_t dim1) {
TORCH_LAZY_FN_COUNTER("lazy::");
return torch::lazy::CreateAtenFromLtcTensor(
torch::lazy::transpose(torch::lazy::TryGetLtcTensor(self), dim0, dim1));
}
at::Tensor& LazyNativeFunctions::transpose_(at::Tensor& self, int64_t dim0,
int64_t dim1) {
TORCH_LAZY_FN_COUNTER("lazy::");
auto self_tensor = torch::lazy::TryGetLtcTensor(self);
torch::lazy::transpose_(self_tensor, dim0, dim1);
return self;
}
at::Tensor LazyNativeFunctions::unsqueeze(const at::Tensor& self, int64_t dim) {
TORCH_LAZY_FN_COUNTER("lazy::");
return torch::lazy::CreateAtenFromLtcTensor(
torch::lazy::unsqueeze(torch::lazy::TryGetLtcTensor(self), dim));
}
at::Tensor& LazyNativeFunctions::unsqueeze_(at::Tensor& self, int64_t dim) {
TORCH_LAZY_FN_COUNTER("lazy::");
auto self_tensor = torch::lazy::TryGetLtcTensor(self);
torch::lazy::unsqueeze_(self_tensor, dim);
return self;
}
at::Tensor LazyNativeFunctions::view(const at::Tensor& self,
at::IntArrayRef size) {
TORCH_LAZY_FN_COUNTER("lazy::");
auto self_tensor = torch::lazy::TryGetLtcTensor(self);
return torch::lazy::CreateAtenFromLtcTensor(
torch::lazy::view(self_tensor, torch::lazy::ToI64Vector(size)));
}
at::Tensor LazyNativeFunctions::_unsafe_view(const at::Tensor& self,
at::IntArrayRef size) {
TORCH_LAZY_FN_COUNTER("lazy::");
auto self_tensor = torch::lazy::TryGetLtcTensor(self);
return torch::lazy::CreateAtenFromLtcTensor(
torch::lazy::view(self_tensor, torch::lazy::ToI64Vector(size)));
}
void InitializeAtenBindings() {}
} // namespace lazy
} // namespace torch
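A minimal end-to-end sketch, illustrative only and assuming the TS backend has been registered as above; the helper name is hypothetical. Tensors created on the lazy device route through the LazyNativeFunctions kernels in this file.
#include <ATen/ATen.h>
#include <torch/csrc/lazy/ts_backend/ts_backend_impl.h>

void example_lazy_tensor_roundtrip() {
  torch::lazy::InitTorchScriptBackend();
  // Dispatches to LazyNativeFunctions::empty above: eager storage is allocated
  // on the fallback device and wrapped in a LazyTensor.
  at::Tensor a = at::empty(
      {2, 3}, at::TensorOptions().device(at::kLazy).dtype(at::kFloat));
  // Dispatches to LazyNativeFunctions::fill_ above, recording IR rather than
  // writing eager memory.
  a.fill_(1.0);
  // Dispatches to _to_copy (case 2 above): materializes the value back to CPU.
  at::Tensor cpu = a.to(a.options().device(at::kCPU));
  TORCH_CHECK(cpu.device().is_cpu());
}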

View File

@ -0,0 +1,410 @@
#include <torch/csrc/lazy/ts_backend/ts_node_lowering.h>
#include <ATen/Functions.h>
#include <torch/csrc/jit/frontend/sugared_value.h>
#include <torch/csrc/jit/jit_log.h>
#include <torch/csrc/lazy/backend/backend_interface.h>
#include <torch/csrc/lazy/core/helpers.h>
#include <torch/csrc/lazy/ts_backend/ops/cast.h>
#include <torch/csrc/lazy/ts_backend/ops/device_data.h>
#include <torch/csrc/lazy/ts_backend/ops/expand.h>
#include <torch/csrc/lazy/core/internal_ops/ltc_ops.h>
#include <torch/csrc/lazy/ts_backend/ops/scalar.h>
#include <torch/csrc/lazy/ts_backend/ops/batch_norm_ops.h>
#include <torch/csrc/lazy/core/permutation_util.h>
#include <torch/csrc/lazy/core/view_ops/as_strided.h>
#include <torch/csrc/lazy/core/view_ops/as_strided_view_update.h>
#include <torch/csrc/lazy/core/view_ops/narrow.h>
#include <torch/csrc/lazy/core/view_ops/narrow_view_update.h>
#include <torch/csrc/lazy/core/view_ops/permute.h>
#include <torch/csrc/lazy/core/view_ops/select.h>
#include <torch/csrc/lazy/core/view_ops/select_view_update.h>
#include <torch/csrc/lazy/core/view_ops/squeeze.h>
#include <torch/csrc/lazy/core/view_ops/unsqueeze.h>
#include <torch/csrc/lazy/core/view_ops/view.h>
#include <torch/csrc/lazy/ts_backend/ts_lowering_context.h>
#include <torch/csrc/lazy/core/lazy_graph_executor.h>
namespace torch {
namespace lazy {
class TSNodeLowering : public TSNodeLoweringInterface {
public:
TSNodeLowering(const std::string& name, torch::lazy::TSLoweringContext* loctx)
: loctx_(loctx),
function_(loctx ? std::make_shared<torch::jit::GraphFunction>(
name, loctx->graph(), nullptr)
: nullptr) {}
torch::lazy::TSLoweringContext* loctx() { return loctx_; }
bool Lower(const torch::lazy::Node* node) override {
if (auto* tsnode = dynamic_cast<const torch::lazy::TsNode*>(node)) {
// First, we call the node lowering function, which exists for newly
// codegenned or refactored nodes
TSOpVector ops = tsnode->Lower(function_, loctx());
if (ops.empty()) {
// Then fall back to legacy lowering code, which should be gradually
// removed
ops = LowerNonCodegenOps(node);
}
if (ops.empty()) {
return false;
}
CHECK_EQ(node->num_outputs(), ops.size());
for (size_t i = 0; i < ops.size(); ++i) {
loctx()->AssignOutputOp(torch::lazy::Output(node, i), ops[i]);
}
return true;
}
throw std::runtime_error(
"Expected torch::lazy::TsNode but could not dynamic cast");
}
// TODO(whc) this is for legacy/non-codegen Ops, and after moving most ops
// to codegen we should delete this and put all the lowering logic into Node
// classes
TSOpVector LowerNonCodegenOps(const torch::lazy::Node* node) {
if (node->op().op == at::aten::as_strided) {
return LowerAsStrided(torch::lazy::NodeCast<torch::lazy::AsStrided>(
node, torch::lazy::OpKind(at::aten::as_strided)));
}
if (node->op() == *torch::lazy::ltc_as_strided_view_update) {
return LowerAsStridedViewUpdate(
torch::lazy::NodeCast<torch::lazy::AsStridedViewUpdate>(
node, *torch::lazy::ltc_as_strided_view_update));
}
if (node->op() == *torch::lazy::ltc_cast) {
return LowerCast(torch::lazy::NodeCast<torch::lazy::Cast>(
node, *torch::lazy::ltc_cast));
}
if (node->op() == *torch::lazy::ltc_select_view_update) {
return LowerSelectViewUpdate(
torch::lazy::NodeCast<torch::lazy::SelectViewUpdate>(
node, *torch::lazy::ltc_select_view_update));
}
if (node->op() == *torch::lazy::ltc_narrow_view_update) {
return LowerNarrowViewUpdate(
torch::lazy::NodeCast<torch::lazy::NarrowViewUpdate>(
node, *torch::lazy::ltc_narrow_view_update));
}
if (node->op().op == at::prim::Constant) {
return LowerScalar(torch::lazy::NodeCast<torch::lazy::Scalar>(
node, torch::lazy::OpKind(at::prim::Constant)));
}
if (node->op().op == at::aten::native_batch_norm) {
return LowerBatchNorm(
torch::lazy::NodeCast<TSNativeBatchNormForward>(
node, torch::lazy::OpKind(at::aten::native_batch_norm)));
}
if (node->op().op == at::aten::native_batch_norm_backward) {
return LowerBatchNormBackward(
torch::lazy::NodeCast<TSNativeBatchNormBackward>(
node, torch::lazy::OpKind(at::aten::native_batch_norm_backward)));
}
if (node->op().op == at::aten::expand) {
return LowerExpand(
torch::lazy::NodeCast<torch::lazy::Expand>(
node, torch::lazy::OpKind(at::aten::expand)));
}
if (node->op().op == at::aten::narrow) {
return LowerNarrow(torch::lazy::NodeCast<torch::lazy::Narrow>(
node, torch::lazy::OpKind(at::aten::narrow)));
}
if (node->op().op == at::aten::permute) {
return LowerPermute(torch::lazy::NodeCast<torch::lazy::Permute>(
node, torch::lazy::OpKind(at::aten::permute)));
}
if (node->op().op == at::aten::select) {
return LowerSelect(torch::lazy::NodeCast<torch::lazy::Select>(
node, torch::lazy::OpKind(at::aten::select)));
}
if (node->op().op == at::aten::squeeze) {
return LowerSqueeze(
torch::lazy::NodeCast<Squeeze>(
node, torch::lazy::OpKind(at::aten::squeeze)));
}
if (node->op().op == at::aten::unsqueeze) {
return LowerUnsqueeze(
torch::lazy::NodeCast<Unsqueeze>(
node, torch::lazy::OpKind(at::aten::unsqueeze)));
}
if (node->op().op == at::aten::view) {
return LowerView(torch::lazy::NodeCast<torch::lazy::View>(
node, torch::lazy::OpKind(at::aten::view)));
}
if (node->op() == *torch::lazy::ltc_device_data) {
const torch::lazy::DeviceData* device_data_node =
torch::lazy::NodeCast<torch::lazy::DeviceData>(
node, *torch::lazy::ltc_device_data);
auto infoptr = device_data_node->data()->info();
auto deviceDataInfoPtr = (torch::lazy::LazyGraphExecutor::DeviceDataInfo*) infoptr;
if (GRAPH_DUMP_ENABLED) {
LOG(ERROR) << "Lowering device data node, tensor id " << deviceDataInfoPtr->tensor_id << std::endl;
}
return {loctx()->GetParameter(device_data_node->data())};
}
std::vector<torch::jit::NamedValue> arguments;
for (const torch::lazy::Output& output : node->operands()) {
arguments.emplace_back(loctx()->GetOutputOp(output));
}
return LowerBuiltin(node, arguments);
}
TSOpVector LowerBuiltin(
const torch::lazy::Node* node,
const std::vector<torch::jit::NamedValue>& arguments,
const std::vector<torch::jit::NamedValue>& kwarguments = {}) {
return LowerTSBuiltin(function_, node->op().op, arguments, kwarguments);
}
TSOpVector LowerBuiltin(
c10::Symbol sym, const std::vector<torch::jit::NamedValue>& arguments,
const std::vector<torch::jit::NamedValue>& kwarguments = {}) {
return LowerTSBuiltin(function_, sym, arguments, kwarguments);
}
TSOpVector LowerAsStrided(const torch::lazy::AsStrided* node) {
std::vector<torch::jit::NamedValue> arguments;
arguments.emplace_back(loctx()->GetOutputOp(node->operand(0)));
arguments.emplace_back(node->size());
arguments.emplace_back(node->stride());
arguments.emplace_back(node->storage_offset());
TSOpVector as_strided_out = LowerBuiltin(node, arguments);
CHECK_EQ(as_strided_out.size(), 1);
return {GenerateClone(as_strided_out.front())};
}
TSOpVector LowerAsStridedViewUpdate(
const torch::lazy::AsStridedViewUpdate* node) {
torch::jit::Value* destination =
GenerateClone(loctx()->GetOutputOp(node->operand(0)));
const torch::lazy::Output& input_op = node->operand(1);
const torch::lazy::Shape& input_shape = input_op.shape();
const auto input_dimensions = input_shape.sizes();
std::vector<torch::jit::NamedValue> dest_arguments;
dest_arguments.emplace_back(destination);
dest_arguments.emplace_back(
std::vector<int64_t>(input_dimensions.begin(), input_dimensions.end()));
dest_arguments.emplace_back(node->stride());
dest_arguments.emplace_back(node->storage_offset());
TSOpVector as_strided_out =
LowerBuiltin(at::aten::as_strided, dest_arguments);
CHECK_EQ(as_strided_out.size(), 1);
torch::jit::Value* as_strided = as_strided_out.front();
GenerateCopy(as_strided, loctx()->GetOutputOp(input_op));
return {destination};
}
TSOpVector LowerBatchNorm(const TSNativeBatchNormForward* node) {
std::vector<torch::jit::NamedValue> arguments;
for (size_t i = 0; i < 5; ++i) {
arguments.emplace_back(loctx()->GetOutputOp(node->operand(i)));
}
arguments.emplace_back(node->training());
arguments.emplace_back(node->momentum());
arguments.emplace_back(node->eps());
return LowerBuiltin(node, arguments);
}
TSOpVector LowerBatchNormBackward(const TSNativeBatchNormBackward* node) {
std::vector<torch::jit::NamedValue> arguments;
for (size_t i = 0; i < 3; ++i) {
arguments.emplace_back(loctx()->GetOutputOp(node->operand(i)));
}
const auto& operands = node->operands();
c10::optional<at::Tensor> null_arg;
if (operands.size() == 5) {
arguments.emplace_back(null_arg);
arguments.emplace_back(null_arg);
}
for (size_t i = 3; i < operands.size(); ++i) {
arguments.emplace_back(loctx()->GetOutputOp(node->operand(i)));
}
arguments.emplace_back(node->training());
arguments.emplace_back(node->eps());
arguments.emplace_back(node->output_mask());
return LowerBuiltin(node, arguments);
}
TSOpVector LowerCast(const torch::lazy::Cast* node) {
std::vector<torch::jit::NamedValue> arguments;
arguments.emplace_back(loctx()->GetOutputOp(node->operand(0)));
arguments.emplace_back(node->dtype());
return LowerBuiltin(at::aten::to, arguments);
}
TSOpVector LowerExpand(const torch::lazy::Expand* node) {
std::vector<torch::jit::NamedValue> arguments;
arguments.emplace_back(loctx()->GetOutputOp(node->operand(0)));
arguments.emplace_back(node->size());
auto expand_out = LowerBuiltin(node, arguments);
if (node->is_scalar_expand()) {
// The aten::expand operation sets all strides to 0 when the original is
// of rank 0. This leads to false positives when checking for internal
// memory overlap, because at::has_internal_overlap returns
// MemOverlap::YES when a stride is set to 0.
CHECK_EQ(expand_out.size(), 1);
return {GenerateClone(expand_out.front())};
}
return expand_out;
}
TSOpVector LowerNarrow(const torch::lazy::Narrow* node) {
const torch::lazy::Output& input = node->operand(0);
torch::jit::Value* base = loctx()->GetOutputOp(input);
const auto& base_indices = node->base_indices();
const auto& sizes = node->sizes();
const torch::lazy::Shape& input_shape = input.shape();
CHECK_EQ(sizes.size(), base_indices.size());
CHECK_EQ(input_shape.dim(), base_indices.size());
for (size_t dim = 0; dim < base_indices.size(); ++dim) {
int64_t start = base_indices[dim];
base = GenerateSlice(/*base=*/base, /*dim=*/dim, /*start=*/start,
/*end=*/start + sizes[dim], /*step=*/1);
}
return {base};
}
TSOpVector LowerPermute(const torch::lazy::Permute* node) {
std::vector<torch::jit::NamedValue> arguments;
arguments.emplace_back(loctx()->GetOutputOp(node->operand(0)));
arguments.emplace_back(node->dims());
return LowerBuiltin(node, arguments);
}
TSOpVector LowerScalar(const torch::lazy::Scalar* node) {
const at::Scalar& value = node->value();
const torch::lazy::Shape& shape = node->shape();
auto options =
at::TensorOptions()
.device(torch::lazy::getBackend()->EagerFallbackDeviceType())
.dtype(shape.scalar_type());
return {
loctx()->graph()->insertConstant(at::scalar_tensor(value, options))};
}
TSOpVector LowerSelect(const torch::lazy::Select* node) {
int64_t step = torch::lazy::Select::GetStride(node->start(), node->end(),
node->stride());
torch::jit::Value* base = loctx()->GetOutputOp(node->operand(0));
return {GenerateSlice(/*base=*/base, /*dim=*/node->dim(),
/*start=*/node->start(), /*end=*/node->end(),
/*step=*/step)};
}
TSOpVector LowerSqueeze(const Squeeze* node) {
std::vector<torch::jit::NamedValue> arguments;
arguments.emplace_back(loctx()->GetOutputOp(node->operand(0)));
if (node->dim() != -1) {
arguments.emplace_back(node->dim());
}
return LowerBuiltin(node, arguments);
}
TSOpVector LowerSelectViewUpdate(const torch::lazy::SelectViewUpdate* node) {
torch::jit::Value* dest =
GenerateClone(loctx()->GetOutputOp(node->operand(0)));
int64_t step = torch::lazy::Select::GetStride(node->start(), node->end(),
node->stride());
torch::jit::Value* selected = GenerateSlice(
/*base=*/dest, /*dim=*/node->dim(), /*start=*/node->start(),
/*end=*/node->end(), /*step=*/step);
GenerateCopy(selected, loctx()->GetOutputOp(node->operand(1)));
return {dest};
}
TSOpVector LowerNarrowViewUpdate(const torch::lazy::NarrowViewUpdate* node) {
torch::jit::Value* dest =
GenerateClone(loctx()->GetOutputOp(node->operand(0)));
const auto& base_indices = node->base_indices();
const torch::lazy::Output& source_argument = node->operand(1);
const torch::lazy::Shape& source_shape = source_argument.shape();
CHECK_EQ(source_shape.dim(), base_indices.size());
torch::jit::Value* base = dest;
for (size_t dim = 0; dim < base_indices.size(); ++dim) {
int64_t start = base_indices[dim];
base = GenerateSlice(/*base=*/base, /*dim=*/dim, /*start=*/start,
/*end=*/start + source_shape.size(dim),
/*step=*/1);
}
GenerateCopy(base, loctx()->GetOutputOp(source_argument));
return {dest};
}
TSOpVector LowerUnsqueeze(const Unsqueeze* node) {
std::vector<torch::jit::NamedValue> arguments;
arguments.emplace_back(loctx()->GetOutputOp(node->operand(0)));
arguments.emplace_back(node->dim());
return LowerBuiltin(node, arguments);
}
TSOpVector LowerView(const torch::lazy::View* node) {
std::vector<torch::jit::NamedValue> arguments;
arguments.emplace_back(loctx()->GetOutputOp(node->operand(0)));
arguments.emplace_back(node->output_size());
return LowerBuiltin(at::aten::reshape, arguments);
}
torch::jit::Value* GenerateClone(torch::jit::Value* val) {
std::vector<torch::jit::NamedValue> clone_arguments;
clone_arguments.emplace_back(val);
TSOpVector cloned = LowerBuiltin(at::aten::clone, clone_arguments);
CHECK_EQ(cloned.size(), 1);
return cloned.front();
}
void GenerateCopy(torch::jit::Value* destination, torch::jit::Value* source) {
std::vector<torch::jit::NamedValue> arguments;
arguments.emplace_back(destination);
arguments.emplace_back(source);
LowerBuiltin(at::aten::copy_, arguments);
}
torch::jit::Value* GenerateSlice(torch::jit::Value* base, int64_t dim,
int64_t start, int64_t end, int64_t step) {
std::vector<torch::jit::NamedValue> arguments;
arguments.emplace_back(base);
arguments.emplace_back(dim);
arguments.emplace_back(start);
arguments.emplace_back(end);
arguments.emplace_back(step);
TSOpVector selected = LowerBuiltin(at::aten::slice, arguments);
CHECK_EQ(selected.size(), 1);
return selected.front();
}
torch::lazy::TSLoweringContext* loctx_;
std::shared_ptr<torch::jit::GraphFunction> function_;
};
std::unique_ptr<TSNodeLoweringInterface> TSNodeLoweringInterface::Create(
torch::lazy::LoweringContext* loctx) {
return std::make_unique<TSNodeLowering>(
"TSNodeLowering", static_cast<torch::lazy::TSLoweringContext*>(loctx));
}
TSOpVector LowerTSBuiltin(
std::shared_ptr<torch::jit::GraphFunction> function, c10::Symbol sym,
const std::vector<torch::jit::NamedValue>& arguments,
const std::vector<torch::jit::NamedValue>& kwarguments) {
auto builtin =
std::make_shared<torch::jit::BuiltinFunction>(sym, at::nullopt);
auto magic_method = std::make_shared<torch::jit::MagicMethod>("", builtin);
auto ret = magic_method->call({}, *function, arguments, kwarguments, 0);
auto sv = dynamic_cast<torch::jit::SimpleValue*>(ret.get());
CHECK(sv);
if (sv->getValue()->type()->kind() == c10::TypeKind::TupleType) {
const auto tuple_call_result = sv->asTuple({}, *function);
TSOpVector tuple_result;
for (const auto& tuple_component : tuple_call_result) {
auto tuple_component_sv =
dynamic_cast<torch::jit::SimpleValue*>(tuple_component.get());
tuple_result.push_back(tuple_component_sv->getValue());
}
return tuple_result;
}
return {sv->getValue()};
}
} // namespace lazy
} // namespace torch
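A small sketch, illustrative only: `function` and `input` are assumed to come from an active TSLoweringContext (hypothetical parameters). LowerTSBuiltin resolves the aten symbol against the TorchScript builtin registry and emits the matching node into the context's graph.
torch::jit::Value* example_lower_relu(
    const std::shared_ptr<torch::jit::GraphFunction>& function,
    torch::jit::Value* input) {
  std::vector<torch::jit::NamedValue> arguments;
  arguments.emplace_back(input);
  torch::lazy::TSOpVector relu_out =
      torch::lazy::LowerTSBuiltin(function, at::aten::relu, arguments);
  CHECK_EQ(relu_out.size(), 1);
  return relu_out.front();
}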