mirror of https://github.com/llvm/torch-mlir
Delete old PyTorch 1.3 type dispatch oriented code paths.
* We aren't quite at e2e parity, but we aren't going back and the old path is bit-rotted. (pull/115/head)

parent e359167562
commit 47ac80491c
@@ -1,48 +0,0 @@
FROM nvcr.io/nvidia/pytorch:19.10-py3
MAINTAINER Stephen Neuendorffer <stephenn@xilinx.com>

#
# get the basics
#
USER root
ENV DEBIAN_FRONTEND=noninteractive

RUN apt-get update
RUN apt-get install software-properties-common --assume-yes
RUN apt-get install wget curl unzip libxml2-dev --assume-yes
RUN apt-get install autoconf libtool g++ g++-multilib --assume-yes
RUN apt-get install build-essential python3 cmake git gitk --assume-yes
RUN apt-get install clang-8 lld-8 ninja-build --assume-yes
RUN apt-get install libncurses5-dev --assume-yes

RUN /opt/conda/bin/conda install matplotlib pybind11
#torchvision

ENV LD_LIBRARY_PATH "${LD_LIBRARY_PATH}:/opt/conda/lib"
# Rebuild pytorch

WORKDIR /opt/pytorch/pytorch

# this is the recommended rebuild command from NVIDIA
# with the cleanup of the build area omitted.
RUN TORCH_CUDA_ARCH_LIST="5.2 6.0 6.1 7.0 7.5+PTX" \
    CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \
    NCCL_INCLUDE_DIR="/usr/include/" \
    NCCL_LIB_DIR="/usr/lib/" \
    python setup.py install

WORKDIR /workspace

# Additional packages for building npcomp
RUN apt-get install clang-10 lld-10 --assume-yes
RUN conda install -c gaiar nnpack

# Make it possible to symbolize stack traces in crashes.
RUN ln -s /usr/bin/llvm-symbolizer-10 /usr/bin/llvm-symbolizer

# Additional env for building npcomp and running tests.
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/cuda-10.1/compat/lib.real"
ENV CC=clang-10
ENV CXX=clang++-10
ENV CXXFLAGS "-I/opt/conda/include"
ENV LDFLAGS "-fuse-ld=/usr/bin/ld.lld-10 -L/opt/conda/lib"
@@ -1,60 +0,0 @@
# Deprecated PyTorch 1.3 based build

These instructions are retained for the transition. Refer to the top-level README for up-to-date instructions.

### PyTorch 1.3 - ATen pseudo-device type dispatch

The currently functional approach to PyTorch integration uses an ATen pseudo
device for program capture. It is activated by including the PyTorch cmake
path and setting `-DNPCOMP_ENABLE_TORCH_TYPE_DISPATCH=ON`. This approach has a
very fragile dependency on a specific PyTorch revision in the ~1.3 era and
currently must be built via the docker image in `docker/pytorch-1.3`.

We are migrating to newer approaches that build with more recent PyTorch
versions, but these are not yet functional (see below).

Docker container setup:

```shell
# One of the maintainers does periodically push new images. To use one of these,
# skip the build step and use:
# BUILD_IMAGE_TAG="stellaraccident/npcomp:build-pytorch-1.3"
# Since we are not planning to support this branch long term, this process is
# entirely ad-hoc at present and geared for project maintainers and build bots
# to be able to make progress.
# See https://hub.docker.com/repository/docker/stellaraccident/npcomp
BUILD_IMAGE_TAG="local/npcomp:build-pytorch-1.3"

# Build the docker image (rebuilds PyTorch, so takes quite some time).
docker build docker/pytorch-1.3 --tag $BUILD_IMAGE_TAG

# Docker workflow (or use your own preferences).
# Create a volume for npcomp build artifacts.
docker volume create npcomp-pytorch-1.3-build

# Run the container, mounting /npcomp to the source directory and the volume
# above to the /build directory. The source directory is mounted read-only to
# avoid the container putting root-owned files there.
# Replace `$HOME/src/mlir-npcomp` with an appropriate path to where the project
# is checked out.
docker run \
  --mount type=bind,source=$HOME/src/mlir-npcomp,target=/npcomp,readonly \
  --mount source=npcomp-pytorch-1.3-build,target=/build \
  --rm -it $BUILD_IMAGE_TAG /bin/bash
```

```shell
# From within the docker image.
# Install MLIR and configure the project.
cd /npcomp
BUILD_DIR=/build ./build_tools/install_mlir.sh
BUILD_DIR=/build ./build_tools/cmake_configure.sh \
  -DCMAKE_PREFIX_PATH=/opt/conda/lib/python3.6/site-packages/torch/share/cmake \
  -DNPCOMP_ENABLE_TORCH_TYPE_DISPATCH=ON

# Build.
cd /build
ninja
ninja check-npcomp
ninja check-frontends-pytorch
```
@@ -1,24 +1,7 @@
#-------------------------------------------------------------------------------
# Options and settings
#-------------------------------------------------------------------------------

option(NPCOMP_ENABLE_TORCH_TYPE_DISPATCH "Enables the legacy ATen Type dispatch code path" OFF)
if(NPCOMP_ENABLE_TORCH_TYPE_DISPATCH)
  add_compile_definitions(NPCOMP_ENABLE_TORCH_TYPE_DISPATCH)
  message(STATUS "Legacy Torch type dispatch mechanism enabled")
endif()

#-------------------------------------------------------------------------------
# Subdirectories
#-------------------------------------------------------------------------------

# TODO: This sub-directory does not need to be gated on the type dispatch
# mechanism, but it presently has some dependencies on an older pytorch version
# and is being excluded until those can be resolved.
if(NPCOMP_ENABLE_TORCH_TYPE_DISPATCH)
  add_subdirectory(lib)
endif()

add_subdirectory(csrc)
add_subdirectory(python)
add_subdirectory(test)
@@ -1,8 +1,4 @@
if(NPCOMP_ENABLE_TORCH_TYPE_DISPATCH)
  add_subdirectory(type_dispatch)
else()
  add_subdirectory(c10_dispatch)
endif()
add_subdirectory(builder)

include(NpcompPython)
@@ -16,25 +12,14 @@ include_directories(
)
link_directories("${TORCH_INSTALL_PREFIX}/lib")

set(torch_mlir_optional_libraries)
if(NPCOMP_ENABLE_TORCH_TYPE_DISPATCH)
  list(APPEND torch_mlir_optional_libraries
    npcomp_torch_type_dispatch_bindings
  )
else()
  list(APPEND torch_mlir_optional_libraries
    npcomp_torch_c10_dispatch_bindings
  )
endif()

add_library(NPCOMPTorchMLIRExt SHARED
  init_python_bindings.cpp
)
target_link_libraries(NPCOMPTorchMLIRExt
  ${TORCH_LIBRARIES}
  ${PYTHON_LIBRARIES}
  ${torch_mlir_optional_libraries}
  torch_python
  npcomp_torch_builder_bindings

  # NPCOMP shared library.
  NPCOMP
@@ -7,7 +7,7 @@ include_directories(
  ${PYTHON_INCLUDE_DIRS}
)
link_directories("${TORCH_INSTALL_PREFIX}/lib")
add_library(npcomp_torch_c10_dispatch_bindings
add_library(npcomp_torch_builder_bindings
  acap_dispatch.cpp
  debug.cpp
  func_builder.cpp
@@ -15,7 +15,7 @@ add_library(npcomp_torch_c10_dispatch_bindings
  python_bindings.cpp
)

target_link_libraries(npcomp_torch_c10_dispatch_bindings
target_link_libraries(npcomp_torch_builder_bindings
  ${TORCH_LIBRARIES}
  ${PYTHON_LIBRARIES}
  torch_python
@@ -11,8 +8,8 @@
//
//===----------------------------------------------------------------------===//

#ifndef NPCOMP_FRONTENDS_PYTORCH_CSRC_C10_DISPATCH_ACAP_DISPATCH_H
#define NPCOMP_FRONTENDS_PYTORCH_CSRC_C10_DISPATCH_ACAP_DISPATCH_H
#ifndef NPCOMP_FRONTENDS_PYTORCH_CSRC_BUILDER_ACAP_DISPATCH_H
#define NPCOMP_FRONTENDS_PYTORCH_CSRC_BUILDER_ACAP_DISPATCH_H

#include <list>
#include <memory>
@@ -5,6 +5,9 @@
//
//===----------------------------------------------------------------------===//

#ifndef NPCOMP_FRONTENDS_PYTORCH_CSRC_BUILDER_DEBUG_H
#define NPCOMP_FRONTENDS_PYTORCH_CSRC_BUILDER_DEBUG_H

#include <string>

namespace torch_mlir {

@@ -20,3 +23,5 @@ void debugTrace(const std::string &message);
void enableDebugTraceToStderr();

} // namespace torch_mlir

#endif // NPCOMP_FRONTENDS_PYTORCH_CSRC_BUILDER_DEBUG_H
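The two helpers visible in this hunk are small enough that a usage sketch may help orient readers; this is illustrative only and simply calls the functions declared above:

```cpp
#include "debug.h"

// Illustrative only: route trace output to stderr, then emit a message.
void traceExample() {
  torch_mlir::enableDebugTraceToStderr();
  torch_mlir::debugTrace("capturing forward pass");
}
```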
@@ -5,8 +5,8 @@
//
//===----------------------------------------------------------------------===//

#ifndef NPCOMP_FRONTENDS_PYTORCH_CSRC_C10_DISPATCH_FUNC_BUILDER_H
#define NPCOMP_FRONTENDS_PYTORCH_CSRC_C10_DISPATCH_FUNC_BUILDER_H
#ifndef NPCOMP_FRONTENDS_PYTORCH_CSRC_BUILDER_FUNC_BUILDER_H
#define NPCOMP_FRONTENDS_PYTORCH_CSRC_BUILDER_FUNC_BUILDER_H

#include "mlir-c/IR.h"
#include "llvm/ADT/DenseMap.h"

@@ -169,4 +169,4 @@ private:

} // namespace torch_mlir

#endif // NPCOMP_FRONTENDS_PYTORCH_CSRC_C10_DISPATCH_MODULE_BUILDER_H
#endif // NPCOMP_FRONTENDS_PYTORCH_CSRC_BUILDER_FUNC_BUILDER_H
@@ -5,8 +5,8 @@
//
//===----------------------------------------------------------------------===//

#ifndef NPCOMP_FRONTENDS_PYTORCH_CSRC_C10_DISPATCH_MODULE_BUILDER_H
#define NPCOMP_FRONTENDS_PYTORCH_CSRC_C10_DISPATCH_MODULE_BUILDER_H
#ifndef NPCOMP_FRONTENDS_PYTORCH_CSRC_BUILDER_H
#define NPCOMP_FRONTENDS_PYTORCH_CSRC_BUILDER_H

#include "../pybind.h"
@@ -126,7 +126,9 @@ py::list GetRegisteredOps() {
  return results;
}

void InitModuleBindings(py::module &m) {
} // namespace

void torch_mlir::InitBuilderBindings(py::module &m) {
  m.def("debug_trace_to_stderr", &enableDebugTraceToStderr);

  py::class_<AcapController, std::shared_ptr<AcapController>>(m,

@@ -139,8 +141,3 @@ void InitModuleBindings(py::module &m) {
  ModuleBuilder::bind(m);
}

} // namespace

void torch_mlir::InitC10DispatchBindings(py::module &m) {
  InitModuleBindings(m);
}
@@ -148,12 +148,7 @@ void InitModuleBindings(py::module &m) {

void InitBindings(py::module &m) {
  InitModuleBindings(m);

#if defined(NPCOMP_ENABLE_TORCH_TYPE_DISPATCH)
  InitTypeDispatchBindings(m);
#else
  InitC10DispatchBindings(m);
#endif
  InitBuilderBindings(m);
}

} // namespace torch_mlir
@@ -15,13 +15,8 @@ namespace torch_mlir {
// Perform top-level initialization for the module.
void InitBindings(pybind11::module &m);

// Adds bindings related to the type-dispatch program capture mechanism.
// Only defined if NPCOMP_ENABLE_TORCH_TYPE_DISPATCH (optional feature).
void InitTypeDispatchBindings(pybind11::module &m);

// Adds bindings related to the c10-dispatch program capture mechanism.
// Only defined if !NPCOMP_ENABLE_TORCH_TYPE_DISPATCH (default).
void InitC10DispatchBindings(pybind11::module &m);
// Adds bindings related to building modules.
void InitBuilderBindings(pybind11::module &m);

} // namespace torch_mlir
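For orientation, a minimal sketch of how a pybind11 extension module might call the entry point declared above; the module name `_torch_mlir` is an assumption for illustration and is not taken from this commit:

```cpp
#include <pybind11/pybind11.h>
#include "pybind.h"

// Hypothetical module definition: the shared library (NPCOMPTorchMLIRExt,
// built from init_python_bindings.cpp above) exposes one init hook that calls
// InitBindings, which in turn registers the builder bindings.
PYBIND11_MODULE(_torch_mlir, m) { torch_mlir::InitBindings(m); }
```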
@@ -1,31 +0,0 @@
include_directories(
  ${TORCH_INCLUDE_DIRS}
  ${TORCH_INSTALL_PREFIX}/include/TH
  ${TORCH_INSTALL_PREFIX}/include/THC/opt/pytorch/pytorch
  ${CMAKE_CURRENT_SOURCE_DIR}
  ${CMAKE_CURRENT_BINARY_DIR}
  ${PYTHON_INCLUDE_DIRS}
)
link_directories("${TORCH_INSTALL_PREFIX}/lib")
add_library(npcomp_torch_type_dispatch_bindings
  aten_mlir_bridge.cpp
  aten_mlir_type.cpp
  aten_mlir_type_default.cpp
  device.cpp
  ir.cpp
  jit.cpp
  mlir_gen.cpp
  python_bindings.cpp
  tensor.cpp
  tensor_impl.cpp
  torch_util.cpp
)

get_property(mlir_libs GLOBAL PROPERTY MLIR_ALL_LIBS)
target_link_libraries(npcomp_torch_type_dispatch_bindings
  NPCOMPATenDialect
  ${TORCH_LIBRARIES}
  ${mlir_libs}
  ${PYTHON_LIBRARIES}
  torch_python
)
@@ -1,5 +0,0 @@
# Type dispatch MLIR capture interface

This directory contains code related to the legacy ATen "type dispatch"
interface (which uses a large table of virtual functions). It is being
superseded by the c10 dispatcher mechanism.
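To make the contrast concrete, here is a rough sketch (not code from this repository) of the two registration styles. In the legacy path, every captured op is a method on a Type-style class, as in `aten_mlir_type.h` below; with the c10 dispatcher, a kernel is registered against a dispatch key at load time. The `TORCH_LIBRARY_IMPL` registration, the `XLA` key, and the `mlir_add` kernel shown here are illustrative assumptions.

```cpp
#include <ATen/ATen.h>
#include <torch/library.h>

// Legacy type dispatch (schematic): a static override on a Type-style class,
// wired in through a generated table of virtual functions.
//   at::Tensor ATenMLIRType::add(const at::Tensor &self,
//                                const at::Tensor &other, at::Scalar alpha);

// c10 dispatcher (schematic): register a kernel for a dispatch key instead.
at::Tensor mlir_add(const at::Tensor &self, const at::Tensor &other,
                    const at::Scalar &alpha) {
  // A real capture kernel would record the op into the MLIR function being
  // built and then redispatch; this stub only illustrates the registration.
  return self;
}

TORCH_LIBRARY_IMPL(aten, XLA, m) { m.impl("add.Tensor", mlir_add); }
```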
@@ -1,192 +0,0 @@
//===- aten_mlir_bridge.cpp -------------------------------------*- C++ -*-===//
//
// This file is licensed under a pytorch-style license
// See frontends/pytorch/LICENSE for license information.
//
//===----------------------------------------------------------------------===//

// Structured similarly to code from git@github.com:pytorch/xla.git

#include "aten_mlir_bridge.h"

#include <string>
#include <vector>

#include "device.h"
#include "tensor_impl.h"

namespace torch_mlir {
namespace bridge {
namespace {

class AtenMLIRDeviceMapper {
public:
  static AtenMLIRDeviceMapper *Get();

  size_t GetDeviceOrdinal(const Device &device) const {
    auto it = devices_ordinals_.find(device);
    assert(it != devices_ordinals_.end());
    return it->second;
  }

  const Device &GetDeviceFromOrdinal(size_t ordinal) const {
    return devices_.at(ordinal);
  }

private:
  AtenMLIRDeviceMapper() {
    std::vector<std::string> local_devices{"mlir:0", "mlir:1", "mlir:2"};
    for (auto &device_str : local_devices) {
      devices_.emplace_back(device_str);
      devices_ordinals_[devices_.back()] = devices_.size() - 1;
    }
  }

  std::vector<Device> devices_;
  std::map<Device, size_t> devices_ordinals_;
};

AtenMLIRDeviceMapper *AtenMLIRDeviceMapper::Get() {
  static AtenMLIRDeviceMapper *device_mapper = new AtenMLIRDeviceMapper();
  return device_mapper;
}

} // namespace

c10::optional<MLIRTensor> TryGetMLIRTensor(const at::Tensor &tensor) {
  MLIRTensorImpl *impl =
      dynamic_cast<MLIRTensorImpl *>(tensor.unsafeGetTensorImpl());
  if (impl == nullptr) {
    return c10::nullopt;
  }
  return impl->tensor();
}

MLIRTensor GetMLIRTensor(const at::Tensor &tensor) {
  auto xtensor = TryGetMLIRTensor(tensor);
  assert(xtensor && "Input tensor is not an MLIR tensor");
  return *xtensor;
}

MLIRTensor GetOrCreateMLIRTensor(const at::Tensor &tensor,
                                 const Device &device) {
  if (!tensor.defined()) {
    return MLIRTensor();
  }
  auto xtensor = TryGetMLIRTensor(tensor);
  return xtensor ? *xtensor : MLIRTensor::Create(tensor, device);
}

std::vector<at::Tensor> MLIRCreateTensorList(const at::TensorList &tensors) {

  std::vector<at::Tensor> aten_device_tensors(tensors.size());
  std::vector<MLIRTensor> device_tensors;

  std::vector<bool> to_translate(tensors.size());

  for (size_t i = 0; i < tensors.size(); ++i) {
    const at::Tensor &tensor = tensors[i];
    if (tensor.defined()) {
      auto xtensor = TryGetMLIRTensor(tensor);
      if (xtensor) {
        to_translate[i] = true;
        device_tensors.push_back(*xtensor);
      } else {
        aten_device_tensors[i] = tensor;
      }
    }
  }

  for (size_t i = 0, defined_pos = 0; i < tensors.size(); ++i) {
    if (to_translate[i]) {
      aten_device_tensors[i] =
          std::move(device_tensors[defined_pos++].ToTensor());
    }
  }
  return aten_device_tensors;
}

c10::optional<Device> GetMLIRDevice(const at::TensorList &tensors) {
  for (const auto &tensor : tensors) {
    auto device = GetMLIRDevice(tensor);
    if (device) {
      return device;
    }
  }
  return c10::nullopt;
}

c10::optional<Device> GetMLIRDevice(const at::TensorOptions &tensor_options) {
  if (!tensor_options.has_device()) {
    return c10::nullopt;
  }
  return GetMLIRDevice(tensor_options.device());
}

c10::optional<Device> GetMLIRDevice(const c10::Device &device) {
  if (device.type() != at::kXLA) {
    return c10::nullopt;
  }
  return AtenDeviceToMLIRDevice(device);
}

c10::optional<Device> GetMLIRDevice(const at::Tensor &tensor) {
  auto xtensor = TryGetMLIRTensor(tensor);
  if (!xtensor) {
    return c10::nullopt;
  }
  return xtensor->GetDevice();
}

Device AtenDeviceToMLIRDevice(const c10::Device &device) {
  assert(device.type() == at::kXLA);
  int ordinal = device.has_index() ? device.index() : -1;
  if (ordinal < 0) {
    c10::Device current_device = MLIRTensorImpl::GetCurrentAtenDevice();
    if (current_device.has_index()) {
      ordinal = current_device.index();
    }
  }
  if (ordinal < 0) {
    return *GetDefaultDevice();
  }
  return AtenMLIRDeviceMapper::Get()->GetDeviceFromOrdinal(ordinal);
}

c10::Device MLIRDeviceToAtenDevice(const Device &device) {
  // TODO: define our own device and stop hijacking the xla device.
  return c10::Device(at::kXLA,
                     AtenMLIRDeviceMapper::Get()->GetDeviceOrdinal(device));
}

at::Tensor MLIRToAtenTensor(MLIRTensor device_tensor,
                            const at::TensorOptions &tensor_options) {
  if (tensor_options.has_device()) {
    assert(tensor_options.device().type() != at::kXLA);
  }

  at::Tensor tensor = device_tensor.ToTensor();

  // We need to copy the tensor since it is cached within the MLIRTensor, and
  // returning it directly might expose it to in place changes.
  return tensor.to(tensor_options, /*non_blocking=*/false, /*copy=*/true);
}

at::Tensor AtenFromMLIRTensor(MLIRTensor device_tensor) {
  assert(!device_tensor.is_null());
  at::Tensor ret =
      at::Tensor(c10::make_intrusive<MLIRTensorImpl>(std::move(device_tensor)));
  return ret;
}

at::Tensor CreateMLIRTensor(at::Tensor tensor,
                            const c10::optional<Device> &device) {
  if (tensor.defined() && device) {
    MLIRTensor device_tensor = MLIRTensor::Create(std::move(tensor), *device);
    tensor = AtenFromMLIRTensor(device_tensor);
  }
  return tensor;
}

} // namespace bridge
} // namespace torch_mlir
@@ -1,61 +0,0 @@
//===- aten_mlir_bridge.h ---------------------------------------*- C++ -*-===//
//
// This file is licensed under a pytorch-style license
// See frontends/pytorch/LICENSE for license information.
//
//===----------------------------------------------------------------------===//

#pragma once

// Structured similarly to code from git@github.com:pytorch/xla.git

// This file implements a bridge which moves data back and forth from torch
// tensors (at::Tensor) to MLIRTensor, which represents a tensor associated
// with our virtual 'MLIR' device.

#include "device.h"
#include "tensor.h"

#include <ATen/Device.h>
#include <ATen/Functions.h>
#include <ATen/Tensor.h>

namespace torch_mlir {
namespace bridge {

c10::optional<MLIRTensor> TryGetMLIRTensor(const at::Tensor &tensor);

// Return an MLIR tensor that is computed the same way as the given at::Tensor
MLIRTensor GetMLIRTensor(const at::Tensor &tensor);

MLIRTensor GetOrCreateMLIRTensor(const at::Tensor &tensor,
                                 const Device &device);

// Creates a vector of at::Tensor objects extracted from a list of MLIR tensors.
std::vector<at::Tensor> MLIRCreateTensorList(const at::TensorList &tensors);

c10::optional<Device> GetMLIRDevice(const at::TensorList &tensors);

c10::optional<Device> GetMLIRDevice(const at::TensorOptions &tensor_options);

c10::optional<Device> GetMLIRDevice(const c10::Device &device);

c10::optional<Device> GetMLIRDevice(const at::Tensor &tensor);

Device AtenDeviceToMLIRDevice(const c10::Device &device);

c10::Device MLIRDeviceToAtenDevice(const Device &device);

at::Tensor MLIRToAtenTensor(MLIRTensor device_tensor,
                            const at::TensorOptions &tensor_options);

// Create an Aten tensor with MLIR type id from MLIRTensor
at::Tensor AtenFromMLIRTensor(MLIRTensor device_tensor);

// Creates an MLIR tensor holding the data in tensor, on the given device.
at::Tensor CreateMLIRTensor(at::Tensor tensor,
                            const c10::optional<Device> &device);

} // namespace bridge

} // namespace torch_mlir
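As a rough sketch of the round trip this header describes (illustrative only; it assumes `GetDefaultDevice()` from the sibling `device.h`, which is included above):

```cpp
#include <ATen/ATen.h>
#include "aten_mlir_bridge.h"

// Illustrative round trip between an ordinary ATen tensor and the MLIR
// pseudo-device tensor wrapper declared above.
void roundTrip(const at::Tensor &cpu_tensor) {
  using namespace torch_mlir;
  // Wrap the tensor so it is backed by an MLIRTensor on the default device.
  at::Tensor device_tensor =
      bridge::CreateMLIRTensor(cpu_tensor, *GetDefaultDevice());
  // ... ops on device_tensor are captured by the ATenMLIRType overrides ...
  // Unwrap back to a plain ATen tensor with the original options.
  at::Tensor result = bridge::MLIRToAtenTensor(
      bridge::GetMLIRTensor(device_tensor), cpu_tensor.options());
  (void)result;
}
```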
@@ -1,669 +0,0 @@
//===- aten_mlir_type.cpp ---------------------------------------*- C++ -*-===//
//
// This file is licensed under a pytorch-style license
// See frontends/pytorch/LICENSE for license information.
//
//===----------------------------------------------------------------------===//

// Structured similarly to code from git@github.com:pytorch/xla.git

#include "llvm/Support/Debug.h"

#include "aten_mlir_bridge.h"
#include "aten_mlir_type.h"
#include "aten_mlir_type_default.h"
#include "ir.h"
#include "tensor_impl.h"
#include "torch_util.h"

#include <mutex>

#define DEBUG_TYPE "torch_mlir"

namespace torch_mlir {
namespace {

struct MLIROptions {
  MLIROptions(const at::TensorOptions &options,
              c10::optional<Device> device_opt = c10::nullopt,
              c10::optional<at::ScalarType> scalar_type_opt = c10::nullopt)
      : device(std::move(device_opt)), scalar_type(std::move(scalar_type_opt)) {
    if (options.has_device()) {
      device = bridge::AtenDeviceToMLIRDevice(options.device());
    }
    if (options.has_dtype()) {
      scalar_type = c10::typeMetaToScalarType(options.dtype());
    }
  }

  Device get_device() const { return device ? *device : *GetDefaultDevice(); }

  at::ScalarType
  get_scalar_type(at::ScalarType defval = at::ScalarType::Float) const {
    return scalar_type ? *scalar_type : defval;
  }

  c10::optional<Device> device;
  c10::optional<at::ScalarType> scalar_type;
};

std::tuple<MLIRTensor, MLIRTensor>
GetPromotedMLIRTensorsForBinaryOp(const at::Tensor &self,
                                  const at::Tensor &other) {
  // this requires slightly newer than pytorch 1.3.0, disable for now.
  // at::ScalarType dtype = at::result_type(self, other);
  MLIRTensor tensor1 = bridge::GetMLIRTensor(self);
  MLIRTensor tensor2 =
      bridge::GetOrCreateMLIRTensor(other, tensor1.GetDevice());
  // tensor1.SetScalarType(dtype);
  // tensor2.SetScalarType(dtype);
  return std::make_tuple(tensor1, tensor2);
}

void AtenInitialize() {
  RegisterAtenTypeFunctions();
  ir::RegisterAtenIR();
}

} // namespace

void ATenMLIRType::InitializeAtenBindings() {
  static std::once_flag once;
  std::call_once(once, []() { AtenInitialize(); });
}

at::Tensor ATenMLIRType::_adaptive_avg_pool2d(const at::Tensor &self,
                                              at::IntArrayRef output_size) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  return bridge::AtenFromMLIRTensor(MLIRTensor::_adaptive_avg_pool2d(
      bridge::GetMLIRTensor(self), output_size));
}

at::Tensor
ATenMLIRType::_adaptive_avg_pool2d_backward(const at::Tensor &grad_output,
                                            const at::Tensor &self) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  auto input_tensor = bridge::GetMLIRTensor(self);
  auto grad_output_tensor =
      bridge::GetOrCreateMLIRTensor(grad_output, input_tensor.GetDevice());

  return bridge::AtenFromMLIRTensor(MLIRTensor::_adaptive_avg_pool2d_backward(
      grad_output_tensor, input_tensor));
}

at::Tensor ATenMLIRType::add(const at::Tensor &self, const at::Tensor &other,
                             at::Scalar alpha) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  auto tensors = GetPromotedMLIRTensorsForBinaryOp(self, other);
  return bridge::AtenFromMLIRTensor(
      MLIRTensor::add(std::get<0>(tensors), std::get<1>(tensors), alpha));
}

at::Tensor &ATenMLIRType::add_(at::Tensor &self, const at::Tensor &other,
                               at::Scalar alpha) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  auto tensors = GetPromotedMLIRTensorsForBinaryOp(self, other);
  auto result = bridge::AtenFromMLIRTensor(
      MLIRTensor::add_(std::get<0>(tensors), std::get<1>(tensors), alpha));
  MLIRTensorImpl *self_impl =
      dynamic_cast<MLIRTensorImpl *>(self.unsafeGetTensorImpl());
  self_impl->shallow_copy_from(result.getIntrusivePtr());
  return self;
}

at::Tensor ATenMLIRType::addmm(const at::Tensor &self, const at::Tensor &mat1,
                               const at::Tensor &mat2, at::Scalar beta,
                               at::Scalar alpha) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  auto tensor = bridge::GetMLIRTensor(self);
  return bridge::AtenFromMLIRTensor(MLIRTensor::addmm(
      tensor, bridge::GetOrCreateMLIRTensor(mat1, tensor.GetDevice()),
      bridge::GetOrCreateMLIRTensor(mat2, tensor.GetDevice()), beta, alpha));
}

at::Tensor ATenMLIRType::as_strided(const at::Tensor &self,
                                    at::IntArrayRef size,
                                    at::IntArrayRef stride,
                                    c10::optional<int64_t> storage_offset) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  return bridge::AtenFromMLIRTensor(MLIRTensor::as_strided(
      bridge::GetMLIRTensor(self), size, stride, storage_offset));
}

at::Tensor ATenMLIRType::clone(const at::Tensor &self) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");

  return bridge::AtenFromMLIRTensor(
      MLIRTensor::clone(bridge::GetMLIRTensor(self)));
}

at::Tensor &ATenMLIRType::copy_(at::Tensor &self, const at::Tensor &src,
                                bool non_blocking) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");

  auto self_tensor = bridge::TryGetMLIRTensor(self);
  auto src_tensor = bridge::TryGetMLIRTensor(src);

  if (!src_tensor) {
    assert(self_tensor);
    self_tensor->SetTensor(util::CopyTensor(src, self.scalar_type()));
  } else if (!self_tensor) {
    at::Tensor t = src_tensor->ToTensor();
    const_cast<at::Tensor &>(self).unsafeGetTensorImpl()->shallow_copy_from(
        t.getIntrusivePtr());
  } else {
    MLIRTensor::copy_(*self_tensor, *src_tensor);
  }
  return self;
}

at::Tensor ATenMLIRType::_copy_from(const at::Tensor &self,
                                    const at::Tensor &dst, bool non_blocking) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");

  std::vector<at::Tensor> tensors = {self};
  auto device_tensors = bridge::MLIRCreateTensorList(tensors);
  // Hack in an overwrite of a const tensor.
  at::Tensor t = util::CopyTensor(device_tensors.front(), dst.scalar_type());
  const_cast<at::Tensor &>(dst).unsafeGetTensorImpl()->shallow_copy_from(
      t.getIntrusivePtr());
  return dst;
}

std::tuple<at::Tensor, at::Tensor, at::Tensor>
ATenMLIRType::convolution_backward_overrideable(
    const at::Tensor &grad_output, const at::Tensor &input,
    const at::Tensor &weight, at::IntArrayRef stride, at::IntArrayRef padding,
    at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding,
    int64_t groups, std::array<bool, 3> output_mask) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  auto input_tensor = bridge::GetMLIRTensor(input);
  auto weight_tensor =
      bridge::GetOrCreateMLIRTensor(weight, input_tensor.GetDevice());
  auto grad_output_tensor =
      bridge::GetOrCreateMLIRTensor(grad_output, input_tensor.GetDevice());

  auto ret = MLIRTensor::convolution_backward(
      grad_output_tensor, input_tensor, weight_tensor, stride, padding,
      dilation, transposed, output_padding, groups, output_mask);
  return std::make_tuple(bridge::AtenFromMLIRTensor(std::get<0>(ret)),
                         bridge::AtenFromMLIRTensor(std::get<1>(ret)),
                         bridge::AtenFromMLIRTensor(std::get<2>(ret)));
}

at::Tensor ATenMLIRType::convolution_overrideable(
    const at::Tensor &input, const at::Tensor &weight, const at::Tensor &bias,
    at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation,
    bool transposed, at::IntArrayRef output_padding, int64_t groups) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  auto input_tensor = bridge::GetMLIRTensor(input);
  auto weight_tensor =
      bridge::GetOrCreateMLIRTensor(weight, input_tensor.GetDevice());

  auto bias_tensor =
      bias.defined()
          ? bridge::GetOrCreateMLIRTensor(bias, input_tensor.GetDevice())
          : bridge::GetOrCreateMLIRTensor(
                at::zeros(at::IntArrayRef{weight.sizes()[0]}),
                input_tensor.GetDevice());

  return bridge::AtenFromMLIRTensor(MLIRTensor::convolution(
      input_tensor, weight_tensor, bias_tensor, stride, padding, dilation,
      transposed, output_padding, groups));
}

at::Tensor ATenMLIRType::div(const at::Tensor &self, at::Scalar other) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  auto input_tensor = bridge::GetMLIRTensor(self);
  return bridge::AtenFromMLIRTensor(MLIRTensor::div(input_tensor, other));
}

at::Tensor ATenMLIRType::div(const at::Tensor &self, const at::Tensor &other) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  auto tensors = GetPromotedMLIRTensorsForBinaryOp(self, other);
  return bridge::AtenFromMLIRTensor(
      MLIRTensor::div(std::get<0>(tensors), std::get<1>(tensors)));
}

at::Tensor &ATenMLIRType::div_(at::Tensor &self, const at::Tensor &other) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  auto tensors = GetPromotedMLIRTensorsForBinaryOp(self, other);
  auto result = bridge::AtenFromMLIRTensor(
      MLIRTensor::div_(std::get<0>(tensors), std::get<1>(tensors)));
  MLIRTensorImpl *self_impl =
      dynamic_cast<MLIRTensorImpl *>(self.unsafeGetTensorImpl());
  self_impl->shallow_copy_from(result.getIntrusivePtr());
  return self;
}

at::Tensor ATenMLIRType::expand(const at::Tensor &self, at::IntArrayRef size,
                                bool implicit) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  auto input_tensor = bridge::GetMLIRTensor(self);
  return bridge::AtenFromMLIRTensor(
      MLIRTensor::expand(input_tensor, size, implicit));
}

at::Tensor ATenMLIRType::gather(const at::Tensor &self, int64_t dim,
                                const at::Tensor &index, bool sparse_grad) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  auto input_tensor = bridge::GetMLIRTensor(self);
  auto index_tensor =
      bridge::GetOrCreateMLIRTensor(index, input_tensor.GetDevice());
  return bridge::AtenFromMLIRTensor(
      MLIRTensor::gather(input_tensor, dim, index_tensor, sparse_grad));
}

at::Tensor ATenMLIRType::hardtanh(const at::Tensor &self, at::Scalar min_val,
                                  at::Scalar max_val) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  auto input_tensor = bridge::GetMLIRTensor(self);
  auto result = bridge::AtenFromMLIRTensor(
      MLIRTensor::hardtanh(input_tensor, min_val, max_val));
  MLIRTensorImpl *self_impl =
      dynamic_cast<MLIRTensorImpl *>(self.unsafeGetTensorImpl());
  self_impl->shallow_copy_from(result.getIntrusivePtr());
  return self;
}

at::Tensor &ATenMLIRType::hardtanh_(at::Tensor &self, at::Scalar min_val,
                                    at::Scalar max_val) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  auto input_tensor = bridge::GetMLIRTensor(self);
  auto result = bridge::AtenFromMLIRTensor(
      MLIRTensor::hardtanh_(input_tensor, min_val, max_val));
  MLIRTensorImpl *self_impl =
      dynamic_cast<MLIRTensorImpl *>(self.unsafeGetTensorImpl());
  self_impl->shallow_copy_from(result.getIntrusivePtr());
  return self;
}

at::Tensor ATenMLIRType::hardtanh_backward(const at::Tensor &grad_output,
                                           const at::Tensor &self,
                                           at::Scalar min_val,
                                           at::Scalar max_val) {
  auto input_tensor = bridge::GetMLIRTensor(self);
  auto grad_output_tensor =
      bridge::GetOrCreateMLIRTensor(grad_output, input_tensor.GetDevice());
  return bridge::AtenFromMLIRTensor(MLIRTensor::hardtanh_backward(
      grad_output_tensor, input_tensor, min_val, max_val));
}

at::Tensor ATenMLIRType::_log_softmax(const at::Tensor &self, int64_t dim,
                                      bool half_to_float) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  auto input_tensor = bridge::GetMLIRTensor(self);
  return bridge::AtenFromMLIRTensor(
      MLIRTensor::_log_softmax(input_tensor, dim, half_to_float));
}

at::Tensor
ATenMLIRType::_log_softmax_backward_data(const at::Tensor &grad_output,
                                         const at::Tensor &output, int64_t dim,
                                         const at::Tensor &self) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  auto input_tensor = bridge::GetMLIRTensor(self);
  auto output_tensor =
      bridge::GetOrCreateMLIRTensor(output, input_tensor.GetDevice());
  auto grad_output_tensor =
      bridge::GetOrCreateMLIRTensor(grad_output, input_tensor.GetDevice());
  return bridge::AtenFromMLIRTensor(MLIRTensor::_log_softmax_backward_data(
      grad_output_tensor, output_tensor, dim, input_tensor));
}

std::tuple<at::Tensor, at::Tensor> ATenMLIRType::max_pool2d_with_indices(
    const at::Tensor &self, at::IntArrayRef kernel_size, at::IntArrayRef stride,
    at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  auto input_tensor = bridge::GetMLIRTensor(self);
  auto ret = MLIRTensor::max_pool2d_with_indices(
      input_tensor, kernel_size, stride, padding, dilation, ceil_mode);
  return std::make_tuple(bridge::AtenFromMLIRTensor(std::get<0>(ret)),
                         bridge::AtenFromMLIRTensor(std::get<1>(ret)));
}

at::Tensor ATenMLIRType::max_pool2d_with_indices_backward(
    const at::Tensor &grad_output, const at::Tensor &self,
    at::IntArrayRef kernel_size, at::IntArrayRef stride,
    at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode,
    const at::Tensor &indices) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  auto input_tensor = bridge::GetMLIRTensor(self);
  auto grad_output_tensor =
      bridge::GetOrCreateMLIRTensor(grad_output, input_tensor.GetDevice());
  auto indices_tensor =
      bridge::GetOrCreateMLIRTensor(indices, input_tensor.GetDevice());

  return bridge::AtenFromMLIRTensor(
      MLIRTensor::max_pool2d_with_indices_backward(
          grad_output_tensor, input_tensor, kernel_size, stride, padding,
          dilation, ceil_mode, indices_tensor));
}

at::Tensor ATenMLIRType::mean(const at::Tensor &self,
                              c10::optional<at::ScalarType> dtype) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  return bridge::AtenFromMLIRTensor(
      MLIRTensor::mean(bridge::GetMLIRTensor(self), dtype));
}

at::Tensor ATenMLIRType::mean(const at::Tensor &self, at::IntArrayRef dim,
                              bool keepdim,
                              c10::optional<at::ScalarType> dtype) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  return bridge::AtenFromMLIRTensor(
      MLIRTensor::mean(bridge::GetMLIRTensor(self), dim, keepdim, dtype));
}

at::Tensor ATenMLIRType::mm(const at::Tensor &input, const at::Tensor &mat2) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  auto input_tensor = bridge::GetMLIRTensor(input);
  auto mat2_tensor =
      bridge::GetOrCreateMLIRTensor(mat2, input_tensor.GetDevice());
  return bridge::AtenFromMLIRTensor(MLIRTensor::mm(input_tensor, mat2_tensor));
}

at::Tensor ATenMLIRType::mul(const at::Tensor &self, const at::Tensor &other) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  auto tensors = GetPromotedMLIRTensorsForBinaryOp(self, other);
  return bridge::AtenFromMLIRTensor(
      MLIRTensor::mul(std::get<0>(tensors), std::get<1>(tensors)));
}

at::Tensor &ATenMLIRType::mul_(at::Tensor &self, const at::Tensor &other) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  auto tensors = GetPromotedMLIRTensorsForBinaryOp(self, other);
  auto result = bridge::AtenFromMLIRTensor(
      MLIRTensor::mul_(std::get<0>(tensors), std::get<1>(tensors)));
  MLIRTensorImpl *self_impl =
      dynamic_cast<MLIRTensorImpl *>(self.unsafeGetTensorImpl());
  self_impl->shallow_copy_from(result.getIntrusivePtr());
  return self;
}

std::tuple<at::Tensor, at::Tensor, at::Tensor> ATenMLIRType::native_batch_norm(
    const at::Tensor &input, const at::Tensor &weight, const at::Tensor &bias,
    const at::Tensor &running_mean, const at::Tensor &running_var,
    bool training, double momentum, double eps) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  auto input_tensor = bridge::GetMLIRTensor(input);
  auto weight_tensor =
      bridge::GetOrCreateMLIRTensor(weight, input_tensor.GetDevice());
  auto bias_tensor =
      bridge::GetOrCreateMLIRTensor(bias, input_tensor.GetDevice());
  auto running_mean_tensor =
      bridge::GetOrCreateMLIRTensor(running_mean, input_tensor.GetDevice());
  auto running_var_tensor =
      bridge::GetOrCreateMLIRTensor(running_var, input_tensor.GetDevice());

  auto ret = MLIRTensor::native_batch_norm(
      input_tensor, weight_tensor, bias_tensor, running_mean_tensor,
      running_var_tensor, training, momentum, eps);

  return std::make_tuple(bridge::AtenFromMLIRTensor(std::get<0>(ret)),
                         bridge::AtenFromMLIRTensor(std::get<1>(ret)),
                         bridge::AtenFromMLIRTensor(std::get<2>(ret)));
}

std::tuple<at::Tensor, at::Tensor, at::Tensor>
ATenMLIRType::native_batch_norm_backward(
    const at::Tensor &grad_out, const at::Tensor &input,
    const at::Tensor &weight, const at::Tensor &running_mean,
    const at::Tensor &running_var, const at::Tensor &save_mean,
    const at::Tensor &save_invstd, bool train, double eps,
    std::array<bool, 3> output_mask) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  auto input_tensor = bridge::GetMLIRTensor(input);
  auto grad_out_tensor =
      bridge::GetOrCreateMLIRTensor(grad_out, input_tensor.GetDevice());
  auto weight_tensor =
      bridge::GetOrCreateMLIRTensor(weight, input_tensor.GetDevice());
  auto running_mean_tensor =
      bridge::GetOrCreateMLIRTensor(running_mean, input_tensor.GetDevice());
  auto running_var_tensor =
      bridge::GetOrCreateMLIRTensor(running_var, input_tensor.GetDevice());
  auto save_mean_tensor =
      bridge::GetOrCreateMLIRTensor(save_mean, input_tensor.GetDevice());
  auto save_invstd_tensor =
      bridge::GetOrCreateMLIRTensor(save_invstd, input_tensor.GetDevice());

  auto ret = MLIRTensor::native_batch_norm_backward(
      grad_out_tensor, input_tensor, weight_tensor, running_mean_tensor,
      running_var_tensor, save_mean_tensor, save_invstd_tensor, train, eps,
      output_mask);

  return std::make_tuple(bridge::AtenFromMLIRTensor(std::get<0>(ret)),
                         bridge::AtenFromMLIRTensor(std::get<1>(ret)),
                         bridge::AtenFromMLIRTensor(std::get<2>(ret)));
}

at::Tensor ATenMLIRType::neg(const at::Tensor &self) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  auto input_tensor = bridge::GetMLIRTensor(self);
  return bridge::AtenFromMLIRTensor(MLIRTensor::neg(input_tensor));
}

std::tuple<at::Tensor, at::Tensor> ATenMLIRType::nll_loss2d_forward(
    const at::Tensor &self, const at::Tensor &target, const at::Tensor &weight,
    int64_t reduction, int64_t ignore_index) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  auto input_tensor = bridge::GetMLIRTensor(self);
  auto target_tensor =
      bridge::GetOrCreateMLIRTensor(target, input_tensor.GetDevice());

  auto weight_tensor =
      weight.defined()
          ? bridge::GetOrCreateMLIRTensor(weight, input_tensor.GetDevice())
          : bridge::GetOrCreateMLIRTensor(at::ones(self.sizes()[1]),
                                          input_tensor.GetDevice());

  auto ret = MLIRTensor::nll_loss2d_forward(
      input_tensor, target_tensor, weight_tensor, reduction, ignore_index);

  return std::make_tuple(bridge::AtenFromMLIRTensor(std::get<0>(ret)),
                         bridge::AtenFromMLIRTensor(std::get<1>(ret)));
}

at::Tensor ATenMLIRType::nll_loss2d_backward(
    const at::Tensor &grad_output, const at::Tensor &self,
    const at::Tensor &target, const at::Tensor &weight, int64_t reduction,
    int64_t ignore_index, const at::Tensor &total_weight) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  auto input_tensor = bridge::GetMLIRTensor(self);
  auto grad_output_tensor =
      bridge::GetOrCreateMLIRTensor(grad_output, input_tensor.GetDevice());
  auto target_tensor =
      bridge::GetOrCreateMLIRTensor(target, input_tensor.GetDevice());

  auto weight_tensor =
      weight.defined()
          ? bridge::GetOrCreateMLIRTensor(weight, input_tensor.GetDevice())
          : bridge::GetOrCreateMLIRTensor(at::ones(self.sizes()[1]),
                                          input_tensor.GetDevice());
  auto total_weight_tensor =
      bridge::GetOrCreateMLIRTensor(total_weight, input_tensor.GetDevice());

  return bridge::AtenFromMLIRTensor(MLIRTensor::nll_loss2d_backward(
      grad_output_tensor, input_tensor, target_tensor, weight_tensor, reduction,
      ignore_index, total_weight_tensor));
}

std::tuple<at::Tensor, at::Tensor>
ATenMLIRType::nll_loss_forward(const at::Tensor &self, const at::Tensor &target,
                               const at::Tensor &weight, int64_t reduction,
                               int64_t ignore_index) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  auto input_tensor = bridge::GetMLIRTensor(self);
  auto target_tensor =
      bridge::GetOrCreateMLIRTensor(target, input_tensor.GetDevice());

  auto weight_tensor =
      weight.defined()
          ? bridge::GetOrCreateMLIRTensor(weight, input_tensor.GetDevice())
          : bridge::GetOrCreateMLIRTensor(at::ones(self.sizes()[1]),
                                          input_tensor.GetDevice());

  auto ret = MLIRTensor::nll_loss_forward(
      input_tensor, target_tensor, weight_tensor, reduction, ignore_index);

  return std::make_tuple(bridge::AtenFromMLIRTensor(std::get<0>(ret)),
                         bridge::AtenFromMLIRTensor(std::get<1>(ret)));
}

at::Tensor ATenMLIRType::nll_loss_backward(
    const at::Tensor &grad_output, const at::Tensor &self,
    const at::Tensor &target, const at::Tensor &weight, int64_t reduction,
    int64_t ignore_index, const at::Tensor &total_weight) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  auto input_tensor = bridge::GetMLIRTensor(self);
  auto grad_output_tensor =
      bridge::GetOrCreateMLIRTensor(grad_output, input_tensor.GetDevice());
  auto target_tensor =
      bridge::GetOrCreateMLIRTensor(target, input_tensor.GetDevice());

  auto weight_tensor =
      weight.defined()
          ? bridge::GetOrCreateMLIRTensor(weight, input_tensor.GetDevice())
          : bridge::GetOrCreateMLIRTensor(at::ones(self.sizes()[1]),
                                          input_tensor.GetDevice());
  auto total_weight_tensor =
      bridge::GetOrCreateMLIRTensor(total_weight, input_tensor.GetDevice());

  return bridge::AtenFromMLIRTensor(MLIRTensor::nll_loss_backward(
      grad_output_tensor, input_tensor, target_tensor, weight_tensor, reduction,
      ignore_index, total_weight_tensor));
}

at::Tensor ATenMLIRType::relu(const at::Tensor &self) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  return bridge::AtenFromMLIRTensor(
      MLIRTensor::relu(bridge::GetMLIRTensor(self)));
}

at::Tensor &ATenMLIRType::relu_(at::Tensor &self) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  auto input_tensor = bridge::GetMLIRTensor(self);
  auto result = bridge::AtenFromMLIRTensor(MLIRTensor::relu_(input_tensor));
  MLIRTensorImpl *self_impl =
      dynamic_cast<MLIRTensorImpl *>(self.unsafeGetTensorImpl());
  self_impl->shallow_copy_from(result.getIntrusivePtr());
  return self;
}

int64_t ATenMLIRType::size(const at::Tensor &self, int64_t dim) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  return bridge::GetMLIRTensor(self).sizes()[dim];
}

at::Tensor ATenMLIRType::squeeze(const at::Tensor &self, int64_t dim) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  return bridge::AtenFromMLIRTensor(
      MLIRTensor::squeeze(bridge::GetMLIRTensor(self), dim));
}

at::Tensor ATenMLIRType::sub(const at::Tensor &self, const at::Tensor &other,
                             at::Scalar alpha) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  auto tensors = GetPromotedMLIRTensorsForBinaryOp(self, other);
  return bridge::AtenFromMLIRTensor(
      MLIRTensor::sub(std::get<0>(tensors), std::get<1>(tensors), alpha));
}

at::Tensor &ATenMLIRType::sub_(at::Tensor &self, const at::Tensor &other,
                               at::Scalar alpha) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  auto tensors = GetPromotedMLIRTensorsForBinaryOp(self, other);
  auto result = bridge::AtenFromMLIRTensor(
      MLIRTensor::sub_(std::get<0>(tensors), std::get<1>(tensors), alpha));
  MLIRTensorImpl *self_impl =
      dynamic_cast<MLIRTensorImpl *>(self.unsafeGetTensorImpl());
  self_impl->shallow_copy_from(result.getIntrusivePtr());
  return self;
}

at::Tensor ATenMLIRType::sum(const at::Tensor &self, at::IntArrayRef dim,
                             bool keepdim,
                             c10::optional<at::ScalarType> dtype) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  return bridge::AtenFromMLIRTensor(
      MLIRTensor::sum(bridge::GetMLIRTensor(self), dim, keepdim, dtype));
}

at::Tensor ATenMLIRType::t(const at::Tensor &self) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  return bridge::AtenFromMLIRTensor(MLIRTensor::t(bridge::GetMLIRTensor(self)));
}

at::Tensor ATenMLIRType::threshold_backward(const at::Tensor &grad_output,
                                            const at::Tensor &self,
                                            at::Scalar threshold) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  auto input_tensor = bridge::GetMLIRTensor(self);
  auto grad_output_tensor =
      bridge::GetOrCreateMLIRTensor(grad_output, input_tensor.GetDevice());
  return bridge::AtenFromMLIRTensor(MLIRTensor::threshold_backward(
      grad_output_tensor, input_tensor, threshold));
}

at::Tensor ATenMLIRType::to(const at::Tensor &self,
                            const at::TensorOptions &options,
                            bool /* non_blocking */, bool /* copy */) {

  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");

  auto self_tensor = bridge::TryGetMLIRTensor(self);
  if (!self_tensor) {
    assert(options.has_device());
    at::ScalarType dtype = options.has_dtype()
                               ? c10::typeMetaToScalarType(options.dtype())
                               : self.scalar_type();
    MLIRTensor xtensor =
        MLIRTensor::Create(util::CopyTensor(self, dtype),
                           bridge::AtenDeviceToMLIRDevice(options.device()));
    return bridge::AtenFromMLIRTensor(xtensor);
  }
  if (options.has_device() && options.device().type() != at::kXLA) {
    return bridge::MLIRToAtenTensor(*self_tensor, options);
  }
  MLIROptions mlir_options(options, self_tensor->GetDevice(),
                           self_tensor->dtype());
  return bridge::AtenFromMLIRTensor(MLIRTensor::to(
      *self_tensor, mlir_options.device, mlir_options.scalar_type));
}

at::Tensor ATenMLIRType::to(const at::Tensor &self, c10::Device device,
                            at::ScalarType dtype, bool non_blocking,
                            bool copy) {
  return to(self, self.options().device(device).dtype(dtype), non_blocking,
            copy);
}

at::Tensor ATenMLIRType::to(const at::Tensor &self, at::ScalarType dtype,
                            bool non_blocking, bool copy) {
  return to(self, self.options().dtype(dtype), non_blocking, copy);
}

at::Tensor ATenMLIRType::to(const at::Tensor &self, const at::Tensor &other,
                            bool non_blocking, bool copy) {
  return to(self, other.options(), non_blocking, copy);
}

at::Tensor ATenMLIRType::_unsafe_view(const at::Tensor &self,
                                      at::IntArrayRef size) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  return bridge::AtenFromMLIRTensor(
      MLIRTensor::view(bridge::GetMLIRTensor(self), size));
}

at::Tensor ATenMLIRType::unsqueeze(const at::Tensor &self, int64_t dim) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  return bridge::AtenFromMLIRTensor(
      MLIRTensor::unsqueeze(bridge::GetMLIRTensor(self), dim));
}

at::Tensor ATenMLIRType::view(const at::Tensor &self, at::IntArrayRef size) {
  LLVM_DEBUG(llvm::dbgs() << "ATenMLIRType::" << __func__ << "\n");
  return bridge::AtenFromMLIRTensor(
      MLIRTensor::view(bridge::GetMLIRTensor(self), size));
}
} // namespace torch_mlir
@@ -1,212 +0,0 @@
//===- aten_mlir_type.h -----------------------------------------*- C++ -*-===//
//
// This file is licensed under a pytorch-style license
// See frontends/pytorch/LICENSE for license information.
//
//===----------------------------------------------------------------------===//

// Structured similarly to code from git@github.com:pytorch/xla.git

#pragma once

#include <ATen/Tensor.h>

namespace torch_mlir {

// Base ATEN Type class where the MLIR specific overrides should be defined.
class ATenMLIRType {
public:
  static void InitializeAtenBindings();

  //////////////////////////////////////////////////////////////////////////////
  // ATEN API overrides in alphabetical order.
  // Note: The C++ signatures must match the ones listed within the following
  // pytorch folder file:
  //   build/aten/src/ATen/RegistrationDeclarations.h
  /////////////////////////////////////////////////////////////////////////////
  // The static method definitions here have multiple uses. Each function
  // signature here will override the default implementation provided by
  // aten_mlir_type_defaults.h. Most of these overrides are used to construct
  // a small internal IR that can be used for different purposes. Primarily,
  // in this code, the IR will be converted to MLIR. As such there is often a
  // 1:1 correspondence between code here and operations in the ATen MLIR
  // dialect.

  // This file is parsed by gen_aten_dialect.py to generate
  // aten_mlir_type_defaults.*, including the appropriate bindings in that
  // file for all pytorch methods.

  static at::Tensor _adaptive_avg_pool2d(const at::Tensor &self,
                                         at::IntArrayRef output_size);

  static at::Tensor _adaptive_avg_pool2d_backward(const at::Tensor &grad_output,
                                                  const at::Tensor &self);

  static at::Tensor add(const at::Tensor &self, const at::Tensor &other,
                        at::Scalar alpha);

  static at::Tensor &add_(at::Tensor &self, const at::Tensor &other,
                          at::Scalar alpha);

  static at::Tensor addmm(const at::Tensor &self, const at::Tensor &mat1,
                          const at::Tensor &mat2, at::Scalar beta,
                          at::Scalar alpha);

  static at::Tensor as_strided(const at::Tensor &self, at::IntArrayRef size,
                               at::IntArrayRef stride,
                               c10::optional<int64_t> storage_offset);

  static at::Tensor clone(const at::Tensor &self);

  static std::tuple<at::Tensor, at::Tensor, at::Tensor>
  convolution_backward_overrideable(
      const at::Tensor &grad_output, const at::Tensor &input,
      const at::Tensor &weight, at::IntArrayRef stride, at::IntArrayRef padding,
      at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding,
      int64_t groups, std::array<bool, 3> output_mask);

  static at::Tensor convolution_overrideable(
      const at::Tensor &input, const at::Tensor &weight, const at::Tensor &bias,
      at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation,
      bool transposed, at::IntArrayRef output_padding, int64_t groups);

  static at::Tensor &copy_(at::Tensor &self, const at::Tensor &src,
                           bool non_blocking);

  static at::Tensor _copy_from(const at::Tensor &self, const at::Tensor &dst,
                               bool non_blocking);

  static at::Tensor div(const at::Tensor &self, const at::Tensor &other);

  static at::Tensor &div_(at::Tensor &self, const at::Tensor &other);

  static at::Tensor div(const at::Tensor &self, at::Scalar other);

  static at::Tensor expand(const at::Tensor &self, at::IntArrayRef size,
                           bool implicit);

  static at::Tensor gather(const at::Tensor &self, int64_t dim,
                           const at::Tensor &index, bool sparse_grad);

  static at::Tensor hardtanh(const at::Tensor &self, at::Scalar min_val,
                             at::Scalar max_val);

  static at::Tensor &hardtanh_(at::Tensor &self, at::Scalar min_val,
                               at::Scalar max_val);

  static at::Tensor hardtanh_backward(const at::Tensor &grad_output,
                                      const at::Tensor &self,
                                      at::Scalar min_val, at::Scalar max_val);

  static at::Tensor _log_softmax(const at::Tensor &self, int64_t dim,
                                 bool half_to_float);

  static at::Tensor _log_softmax_backward_data(const at::Tensor &grad_output,
                                               const at::Tensor &output,
                                               int64_t dim,
                                               const at::Tensor &self);

  static std::tuple<at::Tensor, at::Tensor>
  max_pool2d_with_indices(const at::Tensor &self, at::IntArrayRef kernel_size,
                          at::IntArrayRef stride, at::IntArrayRef padding,
                          at::IntArrayRef dilation, bool ceil_mode);

  static at::Tensor max_pool2d_with_indices_backward(
      const at::Tensor &grad_output, const at::Tensor &self,
      at::IntArrayRef kernel_size, at::IntArrayRef stride,
      at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode,
      const at::Tensor &indices);

  static at::Tensor mean(const at::Tensor &self,
                         c10::optional<at::ScalarType> dtype);

  static at::Tensor mean(const at::Tensor &self, at::IntArrayRef dim,
                         bool keepdim, c10::optional<at::ScalarType> dtype);

  static at::Tensor mm(const at::Tensor &self, const at::Tensor &mat2);

  static at::Tensor mul(const at::Tensor &self, const at::Tensor &other);

  static at::Tensor &mul_(at::Tensor &self, const at::Tensor &other);

  static std::tuple<at::Tensor, at::Tensor, at::Tensor>
  native_batch_norm(const at::Tensor &input, const at::Tensor &weight,
                    const at::Tensor &bias, const at::Tensor &running_mean,
                    const at::Tensor &running_var, bool training,
                    double momentum, double eps);

  static std::tuple<at::Tensor, at::Tensor, at::Tensor>
  native_batch_norm_backward(const at::Tensor &grad_out,
                             const at::Tensor &input, const at::Tensor &weight,
                             const at::Tensor &running_mean,
                             const at::Tensor &running_var,
                             const at::Tensor &save_mean,
                             const at::Tensor &save_invstd, bool train,
                             double eps, std::array<bool, 3> output_mask);

  static at::Tensor neg(const at::Tensor &self);

  static std::tuple<at::Tensor, at::Tensor>
  nll_loss2d_forward(const at::Tensor &self, const at::Tensor &target,
                     const at::Tensor &weight, int64_t reduction,
                     int64_t ignore_index);

  static at::Tensor nll_loss2d_backward(const at::Tensor &grad_output,
                                        const at::Tensor &self,
                                        const at::Tensor &target,
                                        const at::Tensor &weight,
                                        int64_t reduction, int64_t ignore_index,
                                        const at::Tensor &total_weight);

  static std::tuple<at::Tensor, at::Tensor>
  nll_loss_forward(const at::Tensor &self, const at::Tensor &target,
                   const at::Tensor &weight, int64_t reduction,
                   int64_t ignore_index);

  static at::Tensor nll_loss_backward(const at::Tensor &grad_output,
                                      const at::Tensor &self,
                                      const at::Tensor &target,
                                      const at::Tensor &weight,
                                      int64_t reduction, int64_t ignore_index,
                                      const at::Tensor &total_weight);

  static at::Tensor relu(const at::Tensor &self);

  static at::Tensor &relu_(at::Tensor &self);

  static int64_t size(const at::Tensor &self, int64_t dim);

  static at::Tensor squeeze(const at::Tensor &self, int64_t dim);

  static at::Tensor sub(const at::Tensor &self, const at::Tensor &other,
                        at::Scalar alpha);

  static at::Tensor &sub_(at::Tensor &self, const at::Tensor &other,
                          at::Scalar alpha);

  static at::Tensor sum(const at::Tensor &self, at::IntArrayRef dim,
                        bool keepdim, c10::optional<at::ScalarType> dtype);

  static at::Tensor t(const at::Tensor &self);

  static at::Tensor threshold_backward(const at::Tensor &grad_output,
                                       const at::Tensor &self,
                                       at::Scalar threshold);

  static at::Tensor to(const at::Tensor &self, const at::TensorOptions &options,
                       bool non_blocking, bool copy);
  static at::Tensor to(const at::Tensor &self, c10::Device device,
|
||||
at::ScalarType dtype, bool non_blocking, bool copy);
|
||||
static at::Tensor to(const at::Tensor &self, at::ScalarType dtype,
|
||||
bool non_blocking, bool copy);
|
||||
static at::Tensor to(const at::Tensor &self, const at::Tensor &other,
|
||||
bool non_blocking, bool copy);
|
||||
|
||||
static at::Tensor _unsafe_view(const at::Tensor &self, at::IntArrayRef size);
|
||||
|
||||
static at::Tensor unsqueeze(const at::Tensor &self, int64_t dim);
|
||||
|
||||
static at::Tensor view(const at::Tensor &self, at::IntArrayRef size);
|
||||
};
|
||||
|
||||
} // namespace torch_mlir
|
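As a rough illustration of the pattern these declarations imply (this sketch is not part of the original sources), each dispatched method records a node in the small internal IR instead of computing eagerly. `AddNode` and `ConstantNode` are the node classes declared in `ir.h` later in this change; `GetIrValue` and `MakeMLIRTensor` are assumed helper names used only for illustration:

```cpp
// Hypothetical sketch, assuming GetIrValue/MakeMLIRTensor helpers exist to
// move between at::Tensor and the internal ir::Value representation.
static at::Tensor add(const at::Tensor &self, const at::Tensor &other,
                      at::Scalar alpha) {
  ir::Value self_v = GetIrValue(self);   // assumed helper
  ir::Value other_v = GetIrValue(other); // assumed helper
  ir::Value alpha_v(std::make_shared<ir::ConstantNode>(alpha));
  auto node = std::make_shared<ir::AddNode>(self_v, other_v, alpha_v);
  return MakeMLIRTensor(ir::Value(node)); // assumed helper
}
```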
|
@ -1,67 +0,0 @@
|
|||
//===- device.cpp -----------------------------------------------*- C++ -*-===//
|
||||
//
|
||||
// This file is licensed under a pytorch-style license
|
||||
// See frontends/pytorch/LICENSE for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// Structured similarly to code from git@github.com:pytorch/xla.git
|
||||
|
||||
#include "device.h"
|
||||
|
||||
namespace torch_mlir {
|
||||
namespace {
|
||||
|
||||
std::string DeviceTypeToString(DeviceType hw_type) {
|
||||
switch (hw_type) {
|
||||
case DeviceType::CPU:
|
||||
return "CPU";
|
||||
case DeviceType::MLIR:
|
||||
return "MLIR";
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
void ParseDevice(const std::string &device_spec, Device *device) {
|
||||
if (device_spec.empty()) {
|
||||
return ParseDevice(std::string("mlir:0"), device);
|
||||
}
|
||||
|
||||
if (device_spec[0] == ':') {
|
||||
return ParseDevice(std::string("mlir") + device_spec, device);
|
||||
}
|
||||
|
||||
auto pos = device_spec.find(':');
|
||||
auto devtype = device_spec.substr(0, pos);
|
||||
|
||||
// TODO: error-check the device spec before parsing the ordinal.
|
||||
|
||||
device->ordinal =
|
||||
std::stoi(device_spec.substr(pos + 1, device_spec.size() - pos - 1));
|
||||
if (devtype == "MLIR" || devtype == "mlir") {
|
||||
device->hw_type = DeviceType::MLIR;
|
||||
} else if (devtype == "CPU" || devtype == "cpu") {
|
||||
device->hw_type = DeviceType::CPU;
|
||||
} else {
|
||||
// TODO: report an error for unrecognized device types.
|
||||
device->hw_type = DeviceType::MLIR;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
Device::Device(const std::string &device_spec) {
|
||||
ParseDevice(device_spec, this);
|
||||
}
|
||||
|
||||
std::string Device::ToString() const {
|
||||
return DeviceTypeToString(hw_type) + std::string(":") +
|
||||
std::to_string(ordinal);
|
||||
}
|
||||
|
||||
const Device *GetDefaultDevice() {
|
||||
static const Device *default_device = new Device("");
|
||||
return default_device;
|
||||
}
|
||||
|
||||
} // namespace torch_mlir
|
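A brief usage sketch (not from the original sources) of the device-spec format the parser above accepts:

```cpp
// Hypothetical usage of the device-spec parsing implemented above.
#include "device.h"
#include <iostream>

int main() {
  torch_mlir::Device d1("");      // empty spec defaults to "mlir:0"
  torch_mlir::Device d2(":1");    // bare ordinal expands to "mlir:1"
  torch_mlir::Device d3("CPU:2"); // explicit device type and ordinal
  std::cout << d1 << " " << d2 << " " << d3 << "\n"; // MLIR:0 MLIR:1 CPU:2
  return 0;
}
```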
|
@ -1,59 +0,0 @@
|
|||
//===- device.h -------------------------------------------------*- C++ -*-===//
|
||||
//
|
||||
// This file is licensed under a pytorch-style license
|
||||
// See frontends/pytorch/LICENSE for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// Structured similarly to code from git@github.com:pytorch/xla.git
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
namespace torch_mlir {
|
||||
|
||||
enum class DeviceType { CPU, MLIR };
|
||||
|
||||
/// Model a pytorch device, which determines the location of a buffer in
|
||||
/// pytorch.
|
||||
struct Device {
|
||||
Device() = default;
|
||||
explicit Device(const std::string &device_spec);
|
||||
Device(DeviceType hw_type, int ordinal)
|
||||
: hw_type(hw_type), ordinal(ordinal) {}
|
||||
|
||||
bool operator==(const Device &other) const { return compare(other) == 0; }
|
||||
|
||||
bool operator!=(const Device &other) const { return compare(other) != 0; }
|
||||
|
||||
bool operator<(const Device &rhs) const { return compare(rhs) < 0; }
|
||||
|
||||
int compare(const Device &rhs) const {
|
||||
if (hw_type != rhs.hw_type) {
|
||||
return hw_type < rhs.hw_type ? -1 : +1;
|
||||
}
|
||||
return ordinal < rhs.ordinal ? -1 : (ordinal > rhs.ordinal ? +1 : 0);
|
||||
}
|
||||
|
||||
std::string ToString() const;
|
||||
|
||||
friend std::ostream &operator<<(std::ostream &os, const Device &device) {
|
||||
os << device.ToString();
|
||||
return os;
|
||||
}
|
||||
|
||||
size_t hash() const { return std::hash<std::string>{}(ToString()); }
|
||||
|
||||
DeviceType hw_type = DeviceType::CPU;
|
||||
int ordinal = 0;
|
||||
};
|
||||
|
||||
const Device *GetDefaultDevice();
|
||||
|
||||
static inline const Device &GetDeviceOrDefault(const Device *device) {
|
||||
return device != nullptr ? *device : *GetDefaultDevice();
|
||||
}
|
||||
|
||||
} // namespace torch_mlir
|
|
@ -1,920 +0,0 @@
|
|||
//===- ir.h -----------------------------------------------------*- C++ -*-===//
|
||||
//
|
||||
// This file is licensed under a pytorch-style license
|
||||
// See frontends/pytorch/LICENSE for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma once
|
||||
|
||||
// This file defines an intermediate IR generated from a pytorch model.
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
|
||||
namespace mlir {
|
||||
class OpBuilder;
|
||||
class Value;
|
||||
class Operation;
|
||||
class MLIRContext;
|
||||
} // namespace mlir
|
||||
|
||||
#include <array>
#include <map>
#include <memory>
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include <ATen/Tensor.h>
|
||||
#include <ATen/core/interned_strings.h>
|
||||
#include <c10/core/Scalar.h>
|
||||
#include <c10/util/ArrayRef.h>
|
||||
|
||||
namespace torch_mlir {
|
||||
namespace ir {
|
||||
|
||||
class Node;
|
||||
|
||||
void RegisterAtenIR();
|
||||
|
||||
using NodePtr = std::shared_ptr<Node>;
|
||||
|
||||
struct Value {
|
||||
Value() = default;
|
||||
Value(NodePtr node, size_t index = 0) : node(std::move(node)), index(index) {}
|
||||
|
||||
operator bool() const { return node != nullptr; }
|
||||
|
||||
bool operator==(const Value &rhs) const {
|
||||
return node == rhs.node && index == rhs.index;
|
||||
}
|
||||
|
||||
bool operator<(const Value &rhs) const {
|
||||
if (node == rhs.node)
|
||||
return index < rhs.index;
|
||||
return node < rhs.node;
|
||||
}
|
||||
|
||||
std::vector<int64_t> sizes() const;
|
||||
std::vector<int64_t> strides() const;
|
||||
|
||||
NodePtr node;
|
||||
size_t index = 0;
|
||||
};
|
||||
|
||||
struct OpKind {
|
||||
OpKind() = default;
|
||||
explicit OpKind(c10::Symbol op) : op(std::move(op)) {}
|
||||
|
||||
bool operator==(const OpKind &rhs) const { return op == rhs.op; }
|
||||
bool operator!=(const OpKind &rhs) const { return !operator==(rhs); }
|
||||
bool operator<(const OpKind &rhs) const {
|
||||
return c10::unique_t(op) < c10::unique_t(rhs.op);
|
||||
}
|
||||
|
||||
// size_t hash() const;
|
||||
|
||||
std::string ToString() const { return op.toQualString(); }
|
||||
|
||||
static OpKind Get(const std::string &name) {
|
||||
return OpKind(c10::Symbol::fromQualString(name));
|
||||
}
|
||||
|
||||
c10::Symbol op;
|
||||
};
|
||||
|
||||
inline std::ostream &operator<<(std::ostream &stream, const OpKind &op) {
|
||||
stream << op.ToString();
|
||||
return stream;
|
||||
}
|
||||
|
||||
inline llvm::raw_ostream &operator<<(llvm::raw_ostream &stream,
|
||||
const OpKind &op) {
|
||||
stream << op.ToString();
|
||||
return stream;
|
||||
}
|
||||
|
||||
using OpList = std::vector<Value>;
|
||||
|
||||
class Node {
|
||||
|
||||
public:
|
||||
Node(OpKind op);
|
||||
Node(OpKind op, OpList operands, std::vector<int64_t> sizes);
|
||||
Node(OpKind op, OpList operands, at::IntArrayRef sizes);
|
||||
|
||||
const OpKind &op() const { return op_; }
|
||||
|
||||
virtual std::vector<int64_t> sizes() const { return sizes_[0]; }
|
||||
virtual std::vector<int64_t> sizes(size_t i) const { return sizes_[0]; }
|
||||
|
||||
virtual std::vector<int64_t> strides() const { return strides(sizes()); }
|
||||
virtual std::vector<int64_t> strides(size_t i) const {
|
||||
return strides(sizes(i));
|
||||
}
|
||||
|
||||
OpList &operands() { return operands_; }
|
||||
Value operand(size_t i) const { return operands_.at(i); }
|
||||
|
||||
virtual mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable);
|
||||
|
||||
private:
|
||||
std::vector<int64_t> strides(std::vector<int64_t> sz) const;
|
||||
|
||||
OpKind op_;
|
||||
OpList operands_;
|
||||
std::array<std::vector<int64_t>, 3> sizes_;
|
||||
// std::array<std::vector<int64_t>, 3> strides_;
|
||||
};
|
||||
|
||||
class ConstantNode : public Node {
|
||||
public:
|
||||
ConstantNode(at::Scalar scalar)
|
||||
: Node(OpKind::Get("aten::constant")), scalar(scalar) {}
|
||||
|
||||
ConstantNode(at::IntArrayRef array)
|
||||
: Node(OpKind::Get("aten::constant")), array(array.begin(), array.end()) {
|
||||
}
|
||||
|
||||
ConstantNode(bool bool_)
|
||||
: Node(OpKind::Get("aten::constant")), bool_(bool_) {}
|
||||
|
||||
ConstantNode(int int_) : Node(OpKind::Get("aten::constant")), int_(int_) {}
|
||||
|
||||
ConstantNode(int64_t int_)
|
||||
: Node(OpKind::Get("aten::constant")), int_(int_) {}
|
||||
|
||||
ConstantNode(float float_)
|
||||
: Node(OpKind::Get("aten::constant")), float_(float_) {}
|
||||
|
||||
ConstantNode(double double_)
|
||||
: Node(OpKind::Get("aten::constant")), double_(double_) {}
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
|
||||
std::vector<int64_t> sizes() const override { return {1}; }
|
||||
std::vector<int64_t> sizes(size_t i) const override { return sizes(); }
|
||||
|
||||
private:
|
||||
c10::optional<at::Scalar> scalar;
|
||||
std::vector<int64_t> array;
|
||||
c10::optional<bool> bool_;
|
||||
c10::optional<int> int_;
|
||||
c10::optional<float> float_;
|
||||
c10::optional<double> double_;
|
||||
};
|
||||
|
||||
class AdaptiveAvgPool2dNode : public Node {
|
||||
public:
|
||||
AdaptiveAvgPool2dNode(Value input, at::IntArrayRef kernel_size)
|
||||
: Node(OpKind::Get("aten::_adaptive_avg_pool2d"),
|
||||
OpList{input,
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(kernel_size))},
|
||||
std::vector<int64_t>{input.sizes()[0], input.sizes()[1],
|
||||
kernel_size[0], kernel_size[1]}) {}
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
};
|
||||
|
||||
class AdaptiveAvgPool2dBackwardNode : public Node {
|
||||
public:
|
||||
AdaptiveAvgPool2dBackwardNode(Value grad_output, Value self)
|
||||
: Node(OpKind::Get("aten::_adaptive_avg_pool2d_backward"),
|
||||
OpList{grad_output, self}, self.sizes()) {}
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
};
|
||||
|
||||
class AddNode : public Node {
|
||||
public:
|
||||
AddNode(Value rhs, Value lhs, Value alpha)
|
||||
: Node(OpKind::Get("aten::add"), OpList{rhs, lhs, alpha}, rhs.sizes()){};
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
};
|
||||
|
||||
class AddInPlaceNode : public Node {
|
||||
public:
|
||||
AddInPlaceNode(Value self, Value other, Value alpha)
|
||||
: Node(OpKind::Get("aten::add_"), OpList{self, other, alpha},
|
||||
self.sizes()){};
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
};
|
||||
|
||||
class AddmmNode : public Node {
|
||||
public:
|
||||
AddmmNode(Value input, Value mat1, Value mat2, Value beta, Value alpha)
|
||||
: Node(OpKind::Get("aten::addmm"), OpList{input, mat1, mat2, beta, alpha},
|
||||
std::vector<int64_t>{mat1.sizes()[0], mat2.sizes()[1]}){};
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
};
|
||||
|
||||
class AsStridedNode : public Node {
|
||||
public:
|
||||
AsStridedNode(Value input, at::IntArrayRef size, at::IntArrayRef stride,
|
||||
c10::optional<int64_t> storage_offset)
|
||||
: Node(OpKind::Get("aten::as_strided"),
|
||||
OpList{input, ir::Value(std::make_shared<ir::ConstantNode>(size)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(stride))},
|
||||
input.sizes()),
|
||||
size(size.begin(), size.end()), stride(stride.begin(), stride.end()),
|
||||
storage_offset(storage_offset) {}
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
|
||||
std::vector<int64_t> sizes() const override;
|
||||
std::vector<int64_t> sizes(size_t i) const override { return sizes(); }
|
||||
|
||||
std::vector<int64_t> strides() const override { return stride; }
|
||||
std::vector<int64_t> strides(size_t i) const override { return strides(); }
|
||||
|
||||
std::vector<int64_t> size;
|
||||
std::vector<int64_t> stride;
|
||||
c10::optional<int64_t> storage_offset;
|
||||
};
|
||||
|
||||
class BatchNormNode : public Node {
|
||||
public:
|
||||
BatchNormNode(Value input, Value weight, Value bias, Value running_mean,
|
||||
Value running_var, bool training, double momentum, double eps)
|
||||
: Node(OpKind::Get("aten::native_batch_norm"),
|
||||
OpList{
|
||||
input, weight, bias, running_mean, running_var,
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(training)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>((float)momentum)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>((float)eps))},
|
||||
input.sizes()),
|
||||
training(training), momentum(momentum), eps(eps) {}
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
|
||||
private:
|
||||
bool training;
|
||||
double momentum;
|
||||
double eps;
|
||||
};
|
||||
|
||||
class BatchNormBackwardNode : public Node {
|
||||
public:
|
||||
BatchNormBackwardNode(Value grad_out, Value input, Value weight,
|
||||
Value running_mean, Value running_var, Value save_mean,
|
||||
Value save_invstd, bool train, double eps,
|
||||
std::array<bool, 3> output_mask)
|
||||
: Node(OpKind::Get("aten::native_batch_norm_backward"),
|
||||
OpList{grad_out, input, weight, running_mean, running_var,
|
||||
save_mean, save_invstd,
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(train)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>((float)eps))},
|
||||
input.sizes()),
|
||||
train(train), eps(eps), output_mask(output_mask) {}
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
|
||||
std::vector<int64_t> sizes() const override {
|
||||
assert(0 && "Cannot call sizes() for multiple outputs");
|
||||
}
|
||||
std::vector<int64_t> sizes(size_t i) const override;
|
||||
|
||||
private:
|
||||
bool train;
|
||||
double eps;
|
||||
std::array<bool, 3> output_mask;
|
||||
};
|
||||
|
||||
class Conv2dNode : public Node {
|
||||
public:
|
||||
Conv2dNode(Value input, Value weight, Value bias, at::IntArrayRef stride,
|
||||
at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed,
|
||||
at::IntArrayRef output_padding, int64_t groups)
|
||||
: Node(OpKind::Get("aten::_convolution"),
|
||||
OpList{
|
||||
input, weight, bias,
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(stride)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(padding)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(dilation)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(transposed)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(output_padding)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(groups))},
|
||||
input.sizes()),
|
||||
stride(stride.begin(), stride.end()),
|
||||
padding(padding.begin(), padding.end()),
|
||||
dilation(dilation.begin(), dilation.end()), transposed(transposed),
|
||||
output_padding(output_padding.begin(), output_padding.end()),
|
||||
groups(groups), has_bias(true) {}
|
||||
|
||||
Conv2dNode(Value input, Value weight, at::IntArrayRef stride,
|
||||
at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed,
|
||||
at::IntArrayRef output_padding, int64_t groups)
|
||||
: Node(OpKind::Get("aten::_convolution"),
|
||||
OpList{
|
||||
input, weight,
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(stride)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(padding)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(dilation)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(transposed)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(output_padding)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(groups))},
|
||||
input.sizes()),
|
||||
stride(stride.begin(), stride.end()),
|
||||
padding(padding.begin(), padding.end()),
|
||||
dilation(dilation.begin(), dilation.end()), transposed(transposed),
|
||||
output_padding(output_padding.begin(), output_padding.end()),
|
||||
groups(groups), has_bias(false) {}
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
|
||||
std::vector<int64_t> sizes() const override;
|
||||
std::vector<int64_t> sizes(size_t i) const override { return sizes(); }
|
||||
|
||||
private:
|
||||
std::vector<int64_t> stride;
|
||||
std::vector<int64_t> padding;
|
||||
std::vector<int64_t> dilation;
|
||||
bool transposed;
|
||||
std::vector<int64_t> output_padding;
|
||||
int64_t groups;
|
||||
bool has_bias;
|
||||
};
|
||||
|
||||
class Conv2dBackwardNode : public Node {
|
||||
public:
|
||||
Conv2dBackwardNode(Value grad_output, Value input, Value weight,
|
||||
at::IntArrayRef stride, at::IntArrayRef padding,
|
||||
at::IntArrayRef dilation, bool transposed,
|
||||
at::IntArrayRef output_padding, int64_t groups)
|
||||
: Node(OpKind::Get("aten::_convolution_backward"),
|
||||
OpList{
|
||||
grad_output, input, weight,
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(stride)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(padding)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(dilation)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(transposed)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(output_padding)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(groups))},
|
||||
input.sizes()),
|
||||
stride(stride.begin(), stride.end()),
|
||||
padding(padding.begin(), padding.end()),
|
||||
dilation(dilation.begin(), dilation.end()), transposed(transposed),
|
||||
output_padding(output_padding.begin(), output_padding.end()),
|
||||
groups(groups) {}
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
|
||||
std::vector<int64_t> sizes() const override {
|
||||
assert(0 && "Cannot call sizes() for multiple outputs");
|
||||
}
|
||||
std::vector<int64_t> sizes(size_t i) const override;
|
||||
|
||||
private:
|
||||
std::vector<int64_t> stride;
|
||||
std::vector<int64_t> padding;
|
||||
std::vector<int64_t> dilation;
|
||||
bool transposed;
|
||||
std::vector<int64_t> output_padding;
|
||||
int64_t groups;
|
||||
};
|
||||
|
||||
class DivNode : public Node {
|
||||
public:
|
||||
DivNode(Value rhs, Value lhs)
|
||||
: Node(OpKind::Get("aten::div"), OpList{rhs, lhs}, rhs.sizes()){};
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
};
|
||||
|
||||
class DivInPlaceNode : public Node {
|
||||
public:
|
||||
DivInPlaceNode(Value self, Value other)
|
||||
: Node(OpKind::Get("aten::div_"), OpList{self, other}, self.sizes()){};
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
};
|
||||
|
||||
class ExpandNode : public Node {
|
||||
public:
|
||||
ExpandNode(Value input, at::IntArrayRef size, bool implicit)
|
||||
: Node(OpKind::Get("aten::expand"),
|
||||
OpList{input, ir::Value(std::make_shared<ir::ConstantNode>(size)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(implicit))},
|
||||
input.sizes()),
|
||||
output_size(size.begin(), size.end()), implicit(implicit) {}
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
|
||||
std::vector<int64_t> sizes() const override { return output_size; }
|
||||
std::vector<int64_t> sizes(size_t i) const override { return sizes(); }
|
||||
|
||||
private:
|
||||
std::vector<int64_t> output_size;
|
||||
bool implicit;
|
||||
};
|
||||
|
||||
class GatherNode : public Node {
|
||||
public:
|
||||
GatherNode(Value input, int64_t dim, Value index, bool sparse_grad)
|
||||
: Node(OpKind::Get("aten::gather"),
|
||||
OpList{input, ir::Value(std::make_shared<ir::ConstantNode>(dim)),
|
||||
index,
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(sparse_grad))},
|
||||
input.sizes()) {}
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
};
|
||||
|
||||
class HardtanhNode : public Node {
|
||||
public:
|
||||
HardtanhNode(Value self, Value min_val, Value max_val)
|
||||
: Node(OpKind::Get("aten::hardtanh"), OpList{self, min_val, max_val},
|
||||
self.sizes()){};
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
};
|
||||
|
||||
class HardtanhInPlaceNode : public Node {
|
||||
public:
|
||||
HardtanhInPlaceNode(Value self, Value min_val, Value max_val)
|
||||
: Node(OpKind::Get("aten::hardtanh_"), OpList{self, min_val, max_val},
|
||||
self.sizes()){};
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
};
|
||||
|
||||
class HardtanhBackwardNode : public Node {
|
||||
public:
|
||||
HardtanhBackwardNode(Value grad_output, Value self, Value min_val,
|
||||
Value max_val)
|
||||
: Node(OpKind::Get("aten::hardtanh_backward"),
|
||||
OpList{grad_output, self, min_val, max_val}, self.sizes()){};
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
};
|
||||
|
||||
class LogSoftmaxNode : public Node {
|
||||
public:
|
||||
LogSoftmaxNode(Value input, int64_t dim, bool half_to_float)
|
||||
: Node(OpKind::Get("aten::_log_softmax"),
|
||||
OpList{
|
||||
input, ir::Value(std::make_shared<ir::ConstantNode>(dim)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(half_to_float))},
|
||||
input.sizes()),
|
||||
dim(dim), half_to_float(half_to_float) {}
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
|
||||
private:
|
||||
int64_t dim;
|
||||
bool half_to_float;
|
||||
};
|
||||
|
||||
class LogSoftmaxBackwardNode : public Node {
|
||||
public:
|
||||
LogSoftmaxBackwardNode(Value grad_output, Value output, int64_t dim,
|
||||
Value input)
|
||||
: Node(OpKind::Get("aten::_log_softmax_backward_data"),
|
||||
OpList{grad_output, output,
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(dim)), input},
|
||||
input.sizes()),
|
||||
dim(dim) {}
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
|
||||
private:
|
||||
int64_t dim;
|
||||
};
|
||||
|
||||
class MaxPool2dWithIndicesNode : public Node {
|
||||
public:
|
||||
MaxPool2dWithIndicesNode(Value input, at::IntArrayRef kernel_size,
|
||||
at::IntArrayRef stride, at::IntArrayRef padding,
|
||||
at::IntArrayRef dilation, bool ceil_mode)
|
||||
: Node(OpKind::Get("aten::max_pool2d_with_indices"),
|
||||
OpList{input,
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(kernel_size)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(stride)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(padding)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(dilation)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(ceil_mode))},
|
||||
input.sizes()),
|
||||
kernel_size(kernel_size.begin(), kernel_size.end()),
|
||||
stride(stride.begin(), stride.end()),
|
||||
padding(padding.begin(), padding.end()),
|
||||
dilation(dilation.begin(), dilation.end()), ceil_mode(ceil_mode){};
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
|
||||
std::vector<int64_t> sizes() const override {
|
||||
assert(0 && "Cannot call sizes() for multiple outputs");
|
||||
}
|
||||
std::vector<int64_t> sizes(size_t i) const override;
|
||||
|
||||
private:
|
||||
std::vector<int64_t> kernel_size;
|
||||
std::vector<int64_t> stride;
|
||||
std::vector<int64_t> padding;
|
||||
std::vector<int64_t> dilation;
|
||||
bool ceil_mode;
|
||||
};
|
||||
|
||||
class MaxPool2dWithIndicesBackwardNode : public Node {
|
||||
public:
|
||||
MaxPool2dWithIndicesBackwardNode(Value grad_output, Value input,
|
||||
at::IntArrayRef kernel_size,
|
||||
at::IntArrayRef stride,
|
||||
at::IntArrayRef padding,
|
||||
at::IntArrayRef dilation, bool ceil_mode,
|
||||
Value indices)
|
||||
: Node(OpKind::Get("aten::max_pool2d_with_indices_backward"),
|
||||
OpList{grad_output, input,
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(kernel_size)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(stride)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(padding)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(dilation)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(ceil_mode)),
|
||||
indices},
|
||||
input.sizes()),
|
||||
kernel_size(kernel_size.begin(), kernel_size.end()),
|
||||
stride(stride.begin(), stride.end()),
|
||||
padding(padding.begin(), padding.end()),
|
||||
dilation(dilation.begin(), dilation.end()), ceil_mode(ceil_mode){};
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
|
||||
private:
|
||||
std::vector<int64_t> kernel_size;
|
||||
std::vector<int64_t> stride;
|
||||
std::vector<int64_t> padding;
|
||||
std::vector<int64_t> dilation;
|
||||
bool ceil_mode;
|
||||
};
|
||||
|
||||
class MeanNode : public Node {
|
||||
public:
|
||||
MeanNode(Value input, at::IntArrayRef dim, bool keepdim,
|
||||
c10::optional<at::ScalarType> dtype)
|
||||
: Node(OpKind::Get("aten::mean"),
|
||||
OpList{input, ir::Value(std::make_shared<ir::ConstantNode>(dim)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(keepdim))},
|
||||
input.sizes()),
|
||||
dim(dim.begin(), dim.end()), keepdim(keepdim), dtype(dtype) {}
|
||||
|
||||
MeanNode(Value input, c10::optional<at::ScalarType> dtype)
|
||||
: Node(OpKind::Get("aten::mean"), OpList{input}, input.sizes()),
|
||||
dtype(dtype) {}
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
|
||||
std::vector<int64_t> sizes() const override;
|
||||
std::vector<int64_t> sizes(size_t i) const override { return sizes(); }
|
||||
|
||||
private:
|
||||
std::vector<int64_t> dim;
|
||||
bool keepdim;
|
||||
c10::optional<at::ScalarType> dtype;
|
||||
};
|
||||
|
||||
class MMNode : public Node {
|
||||
public:
|
||||
MMNode(Value input, Value mat2)
|
||||
: Node(OpKind::Get("aten::mm"), OpList{input, mat2},
|
||||
std::vector<int64_t>{input.sizes()[0], mat2.sizes()[1]}){};
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
};
|
||||
|
||||
class MulNode : public Node {
|
||||
public:
|
||||
MulNode(Value rhs, Value lhs)
|
||||
: Node(OpKind::Get("aten::mul"), OpList{rhs, lhs}, rhs.sizes()){};
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
};
|
||||
|
||||
class MulInPlaceNode : public Node {
|
||||
public:
|
||||
MulInPlaceNode(Value self, Value other)
|
||||
: Node(OpKind::Get("aten::mul_"), OpList{self, other}, self.sizes()){};
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
};
|
||||
|
||||
class NegNode : public Node {
|
||||
public:
|
||||
NegNode(Value input)
|
||||
: Node(OpKind::Get("aten::neg"), OpList{input}, input.sizes()){};
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
};
|
||||
|
||||
class NllLoss2dForwardNode : public Node {
|
||||
public:
|
||||
NllLoss2dForwardNode(Value self, Value target, Value weight,
|
||||
int64_t reduction, int64_t ignore_index)
|
||||
: Node(
|
||||
OpKind::Get("aten::nll_loss2d_forward"),
|
||||
OpList{self, target, weight,
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(reduction)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(ignore_index))},
|
||||
1 /*target.sizes()*/),
|
||||
reduction(reduction), ignore_index(ignore_index) {}
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
|
||||
private:
|
||||
int64_t reduction;
|
||||
int64_t ignore_index;
|
||||
};
|
||||
|
||||
class NllLoss2dBackwardNode : public Node {
|
||||
public:
|
||||
NllLoss2dBackwardNode(Value grad_output, Value self, Value target,
|
||||
Value weight, int64_t reduction, int64_t ignore_index,
|
||||
Value total_weight)
|
||||
: Node(OpKind::Get("aten::nll_loss2d_backward"),
|
||||
OpList{grad_output, self, target, weight,
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(reduction)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(ignore_index)),
|
||||
total_weight},
|
||||
self.sizes()),
|
||||
reduction(reduction), ignore_index(ignore_index) {}
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
|
||||
private:
|
||||
int64_t reduction;
|
||||
int64_t ignore_index;
|
||||
};
|
||||
|
||||
class NllLossForwardNode : public Node {
|
||||
public:
|
||||
NllLossForwardNode(Value self, Value target, Value weight, int64_t reduction,
|
||||
int64_t ignore_index)
|
||||
: Node(
|
||||
OpKind::Get("aten::nll_loss_forward"),
|
||||
OpList{self, target, weight,
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(reduction)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(ignore_index))},
|
||||
1 /*target.sizes()*/),
|
||||
reduction(reduction), ignore_index(ignore_index) {}
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
|
||||
private:
|
||||
int64_t reduction;
|
||||
int64_t ignore_index;
|
||||
};
|
||||
|
||||
class NllLossBackwardNode : public Node {
|
||||
public:
|
||||
NllLossBackwardNode(Value grad_output, Value self, Value target, Value weight,
|
||||
int64_t reduction, int64_t ignore_index,
|
||||
Value total_weight)
|
||||
: Node(OpKind::Get("aten::nll_loss_backward"),
|
||||
OpList{grad_output, self, target, weight,
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(reduction)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(ignore_index)),
|
||||
total_weight},
|
||||
self.sizes()),
|
||||
reduction(reduction), ignore_index(ignore_index) {}
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
|
||||
private:
|
||||
int64_t reduction;
|
||||
int64_t ignore_index;
|
||||
};
|
||||
|
||||
class SumNode : public Node {
|
||||
public:
|
||||
SumNode(Value input, at::IntArrayRef dim, bool keepdim,
|
||||
c10::optional<at::ScalarType> dtype)
|
||||
: Node(OpKind::Get("aten::sum"),
|
||||
OpList{input, ir::Value(std::make_shared<ir::ConstantNode>(dim)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(keepdim))},
|
||||
input.sizes()),
|
||||
dim(dim.begin(), dim.end()), keepdim(keepdim), dtype(dtype) {}
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
|
||||
std::vector<int64_t> sizes() const override;
|
||||
std::vector<int64_t> sizes(size_t i) const override { return sizes(); }
|
||||
|
||||
private:
|
||||
std::vector<int64_t> dim;
|
||||
bool keepdim;
|
||||
c10::optional<at::ScalarType> dtype;
|
||||
};
|
||||
|
||||
class ReLUNode : public Node {
|
||||
public:
|
||||
ReLUNode(Value input)
|
||||
: Node(OpKind::Get("aten::relu"), OpList{input}, input.sizes()){};
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
};
|
||||
|
||||
class ReLUInPlaceNode : public Node {
|
||||
public:
|
||||
ReLUInPlaceNode(Value input)
|
||||
: Node(OpKind::Get("aten::relu_"), OpList{input}, input.sizes()){};
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
};
|
||||
|
||||
class ThresholdBackwardNode : public Node {
|
||||
public:
|
||||
ThresholdBackwardNode(Value grad_output, Value input, Value threshold)
|
||||
: Node(OpKind::Get("aten::threshold_backward"),
|
||||
OpList{grad_output, input, threshold}, input.sizes()){};
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
};
|
||||
|
||||
class TransposeNode : public Node {
|
||||
public:
|
||||
TransposeNode(Value input)
|
||||
: Node(OpKind::Get("aten::t"), OpList{input},
|
||||
std::vector<int64_t>{input.sizes()[1], input.sizes()[0]}){};
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
};
|
||||
|
||||
class SizeNode : public Node {
|
||||
public:
|
||||
SizeNode(Value input, int64_t dim)
|
||||
: Node(OpKind::Get("aten::size"),
|
||||
OpList{input, ir::Value(std::make_shared<ir::ConstantNode>(dim))},
|
||||
1),
|
||||
dim(dim) {}
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
|
||||
private:
|
||||
int64_t dim;
|
||||
};
|
||||
|
||||
class SqueezeNode : public Node {
|
||||
public:
|
||||
SqueezeNode(Value input, int64_t dim)
|
||||
: Node(OpKind::Get("aten::squeeze"),
|
||||
OpList{input, ir::Value(std::make_shared<ir::ConstantNode>(dim))},
|
||||
input.sizes()),
|
||||
dim(dim) {}
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
|
||||
std::vector<int64_t> sizes() const override;
|
||||
std::vector<int64_t> sizes(size_t i) const override { return sizes(); }
|
||||
|
||||
private:
|
||||
int64_t dim;
|
||||
};
|
||||
|
||||
class SubNode : public Node {
|
||||
public:
|
||||
SubNode(Value rhs, Value lhs, Value alpha)
|
||||
: Node(OpKind::Get("aten::sub"), OpList{rhs, lhs, alpha}, rhs.sizes()){};
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
};
|
||||
|
||||
class SubInPlaceNode : public Node {
|
||||
public:
|
||||
SubInPlaceNode(Value self, Value other, Value alpha)
|
||||
: Node(OpKind::Get("aten::sub_"), OpList{self, other, alpha},
|
||||
self.sizes()){};
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
};
|
||||
|
||||
class UnsqueezeNode : public Node {
|
||||
public:
|
||||
UnsqueezeNode(Value input, int64_t dim)
|
||||
: Node(OpKind::Get("aten::unsqueeze"),
|
||||
OpList{input, ir::Value(std::make_shared<ir::ConstantNode>(dim))},
|
||||
input.sizes()),
|
||||
dim(dim) {}
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
|
||||
std::vector<int64_t> sizes() const override;
|
||||
std::vector<int64_t> sizes(size_t i) const override { return sizes(); }
|
||||
|
||||
private:
|
||||
int64_t dim;
|
||||
};
|
||||
|
||||
class ViewNode : public Node {
|
||||
public:
|
||||
ViewNode(Value input, at::IntArrayRef size)
|
||||
: Node(OpKind::Get("aten::view"),
|
||||
OpList{input, ir::Value(std::make_shared<ir::ConstantNode>(size))},
|
||||
input.sizes()),
|
||||
view_size(size.begin(), size.end()) {}
|
||||
|
||||
mlir::Operation *
|
||||
genMLIR(std::unique_ptr<mlir::OpBuilder> &builder, mlir::MLIRContext &context,
|
||||
std::map<const ir::Value, mlir::Value> &symbolTable) override;
|
||||
|
||||
std::vector<int64_t> sizes() const override;
|
||||
std::vector<int64_t> sizes(size_t i) const override { return sizes(); }
|
||||
|
||||
private:
|
||||
std::vector<int64_t> view_size;
|
||||
};
|
||||
|
||||
class TorchDataNode : public Node {
|
||||
|
||||
public:
|
||||
TorchDataNode(at::Tensor tensor)
|
||||
: Node(ir::OpKind::Get("aten::torch_data"), {}, tensor.sizes()),
|
||||
tensor_(std::move(tensor)) {}
|
||||
|
||||
at::Tensor tensor() { return tensor_; }
|
||||
|
||||
private:
|
||||
at::Tensor tensor_;
|
||||
};
|
||||
|
||||
} // namespace ir
|
||||
} // namespace torch_mlir
|
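To show how `Value`, `OpKind`, and the concrete node classes above fit together, here is a small hypothetical sketch (not part of the original header) that builds a two-operand add graph from two tensors; only classes declared above are used:

```cpp
// Hypothetical sketch building a tiny graph with the node classes above.
#include "ir.h"

torch_mlir::ir::Value buildAddGraph(const at::Tensor &a, const at::Tensor &b) {
  using namespace torch_mlir::ir;
  // Leaf nodes wrap the concrete input tensors.
  Value lhs(std::make_shared<TorchDataNode>(a));
  Value rhs(std::make_shared<TorchDataNode>(b));
  // Scalars are modeled as constant nodes.
  Value alpha(std::make_shared<ConstantNode>(at::Scalar(1)));
  // AddNode records its operands; its output sizes come from the first one.
  return Value(std::make_shared<AddNode>(lhs, rhs, alpha));
}
```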
|
@ -1,333 +0,0 @@
|
|||
//===- jit.cpp --------------------------------------------------*- C++ -*-===//
|
||||
//
|
||||
// This file is licensed under a pytorch-style license
|
||||
// See frontends/pytorch/LICENSE for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// This file drives the generation and lowering of MLIR, followed by JIT
|
||||
// compiling the resulting LLVM dialect.
|
||||
|
||||
#include "npcomp/Dialect/ATen/ATenDialect.h"
|
||||
#include "npcomp/Dialect/ATen/ATenPasses.h"
|
||||
|
||||
#include "mlir/Conversion/SCFToStandard/SCFToStandard.h"
|
||||
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
|
||||
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
|
||||
#include "mlir/Dialect/StandardOps/IR/Ops.h"
|
||||
#include "mlir/ExecutionEngine/ExecutionEngine.h"
|
||||
#include "mlir/ExecutionEngine/JitRunner.h"
|
||||
#include "mlir/ExecutionEngine/OptUtils.h"
|
||||
#include "mlir/IR/Attributes.h"
|
||||
#include "mlir/IR/Builders.h"
|
||||
#include "mlir/IR/Function.h"
|
||||
#include "mlir/IR/Location.h"
|
||||
#include "mlir/IR/MLIRContext.h"
|
||||
#include "mlir/IR/Module.h"
|
||||
#include "mlir/IR/StandardTypes.h"
|
||||
#include "mlir/IR/Types.h"
|
||||
#include "mlir/IR/Verifier.h"
|
||||
#include "mlir/Pass/Pass.h"
|
||||
#include "mlir/Pass/PassManager.h"
|
||||
#include "mlir/Target/LLVMIR.h"
|
||||
#include "mlir/Transforms/Passes.h"
|
||||
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/TargetSelect.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
|
||||
#include <cstdlib>
#include <dlfcn.h>
|
||||
|
||||
#include "ATen/ArrayRef.h"
|
||||
namespace at {
|
||||
template <typename T> using ArrayRef = c10::ArrayRef<T>;
|
||||
}
|
||||
#include "ATen/Tensor.h"
|
||||
#include <ATen/CPUType.h>
|
||||
|
||||
#include "jit.h"
|
||||
#include "mlir_gen.h"
|
||||
#include "tensor.h"
|
||||
#include "torch_util.h"
|
||||
|
||||
#define DEBUG_TYPE "torch_mlir"
|
||||
|
||||
using namespace mlir;
|
||||
|
||||
namespace torch_mlir {
|
||||
|
||||
namespace {
|
||||
|
||||
int LowerATenDialect(mlir::ModuleOp module) {
|
||||
PassManager pm0(module.getContext());
|
||||
pm0.addPass(mlir::createCSEPass());
|
||||
|
||||
// Lower to function calls.
|
||||
pm0.addPass(mlir::NPCOMP::aten::createATenLoweringPass());
|
||||
pm0.addPass(mlir::NPCOMP::aten::createReturnEliminationPass());
|
||||
|
||||
if (failed(pm0.run(module))) {
|
||||
llvm::errs() << "aten to loops conversion failed ";
|
||||
return 1;
|
||||
}
|
||||
|
||||
PassManager pm1(module.getContext());
|
||||
pm1.addPass(mlir::createLowerAffinePass());
|
||||
pm1.addPass(mlir::createLowerToCFGPass());
|
||||
pm1.addPass(mlir::createCSEPass());
|
||||
|
||||
if (failed(pm1.run(module))) {
|
||||
llvm::errs() << "loops to std conversion failed ";
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int LowerStdDialect(mlir::ModuleOp module) {
|
||||
PassManager pm(module.getContext());
|
||||
|
||||
struct LowerToLLVMOptions options;
|
||||
options.emitCWrappers = true;
|
||||
LLVM_DEBUG(module.print(llvm::outs()));
|
||||
|
||||
pm.addPass(mlir::createLowerToLLVMPass(options));
|
||||
pm.addPass(mlir::createCSEPass());
|
||||
|
||||
LLVM_DEBUG(module.print(llvm::outs()));
|
||||
|
||||
if (failed(pm.run(module))) {
|
||||
llvm::errs() << "std to llvm conversion failed ";
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (!module)
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
template <typename T, int N> struct llvm_tensor_t {
|
||||
T *d;
|
||||
T *aligned;
|
||||
size_t offset;
|
||||
size_t shape[N];
|
||||
size_t stride[N];
|
||||
};
|
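The `llvm_tensor_t` layout above mirrors MLIR's strided memref descriptor (allocated pointer, aligned pointer, offset, sizes, strides), which is what the C-interface wrappers requested via `emitCWrappers` below operate on. A minimal sketch, assuming one rank-2 f32 argument and one rank-2 f32 result, of how the JITed entry point ends up being invoked with packed arguments:

```cpp
// Hypothetical sketch of the packed calling convention used further below:
// each entry of the void** array points at a pointer to a descriptor.
llvm_tensor_t<float, 2> in_desc;  // would be filled in by setupArg
llvm_tensor_t<float, 2> out_desc; // result buffer descriptor
llvm_tensor_t<float, 2> *in_ptr = &in_desc;
llvm_tensor_t<float, 2> *out_ptr = &out_desc;
void *args[2] = {&in_ptr, &out_ptr};
// fptr is the function pointer looked up from the ExecutionEngine below:
// fptr(args);
```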
||||
|
||||
template <typename T, int N> void *setupArg(at::Tensor &t) {
|
||||
llvm_tensor_t<T, N> *arg = new llvm_tensor_t<T, N>;
|
||||
llvm_tensor_t<T, N> **arg_storage = new llvm_tensor_t<T, N> *;
|
||||
*arg_storage = arg;
|
||||
arg->d = arg->aligned = (T *)t.data_ptr();
|
||||
arg->offset = 0;
|
||||
assert(t.dim() == N);
|
||||
for (int j = 0; j < N; j++) {
|
||||
arg->shape[j] = t.sizes()[j];
|
||||
arg->stride[j] = t.stride(j);
|
||||
}
|
||||
return (void *)arg_storage;
|
||||
}
|
||||
|
||||
at::Tensor LowerAndRun(mlir::ModuleOp module,
|
||||
std::vector<at::Tensor> &arguments, const ir::Value &v,
|
||||
mlir::MLIRContext &context) {
|
||||
|
||||
LowerATenDialect(module);
|
||||
LowerStdDialect(module);
|
||||
|
||||
llvm::InitializeNativeTarget();
|
||||
llvm::InitializeNativeTargetAsmPrinter();
|
||||
|
||||
Optional<llvm::CodeGenOpt::Level> jitCodeGenOptLevel =
|
||||
llvm::CodeGenOpt::Level::Aggressive;
|
||||
std::string libpath;
|
||||
if (const char *path = std::getenv("TEST_BUILD_PATH")) {
|
||||
libpath = path;
|
||||
}
|
||||
|
||||
std::vector<std::string> sharedLibs{libpath +
|
||||
"/frontends/pytorch/lib/libaten_ops.so"};
|
||||
llvm::errs() << "Loading " << sharedLibs[0] << "\n";
|
||||
|
||||
llvm::sys::DynamicLibrary::LoadLibraryPermanently(nullptr);
|
||||
|
||||
llvm::SmallVector<llvm::StringRef, 1> libs(sharedLibs.begin(),
|
||||
sharedLibs.end());
|
||||
auto expectedEngine = mlir::ExecutionEngine::create(
|
||||
module, {}, jitCodeGenOptLevel, libs, false, false, false);
|
||||
assert(expectedEngine && "no engine, cannot fly");
|
||||
|
||||
llvm::StringRef entryPoint("_mlir_ciface_graph");
|
||||
auto engine = std::move(*expectedEngine);
|
||||
auto expectedFPtr = engine->lookup(entryPoint);
|
||||
assert(expectedFPtr && "entryPoint missing");
|
||||
|
||||
void (*fptr)(void **) = *expectedFPtr;
|
||||
|
||||
// this array holds pointers to the function arguments
|
||||
void **args = (void **)malloc((arguments.size() + 1) * sizeof(void *));
|
||||
|
||||
// allocate and setup the function arguments
|
||||
for (int i = 0, e = arguments.size(); i < e; i++) {
|
||||
at::Tensor &t = arguments[i];
|
||||
auto dtype = t.dtype();
|
||||
int dim = t.dim();
|
||||
if (dim == 4) {
|
||||
if (dtype == at::kFloat)
|
||||
args[i] = setupArg<float, 4>(t);
|
||||
else if (dtype == at::kLong)
|
||||
args[i] = setupArg<uint64_t, 4>(t);
|
||||
else
|
||||
assert(0);
|
||||
} else if (dim == 3) {
|
||||
if (dtype == at::kFloat)
|
||||
args[i] = setupArg<float, 3>(t);
|
||||
else if (dtype == at::kLong)
|
||||
args[i] = setupArg<uint64_t, 3>(t);
|
||||
else
|
||||
assert(0);
|
||||
} else if (dim == 2) {
|
||||
if (dtype == at::kFloat)
|
||||
args[i] = setupArg<float, 2>(t);
|
||||
else if (dtype == at::kLong)
|
||||
args[i] = setupArg<uint64_t, 2>(t);
|
||||
else
|
||||
assert(0);
|
||||
} else if (dim == 1) {
|
||||
if (dtype == at::kFloat)
|
||||
args[i] = setupArg<float, 1>(t);
|
||||
else if (dtype == at::kLong)
|
||||
args[i] = setupArg<uint64_t, 1>(t);
|
||||
else
|
||||
assert(0);
|
||||
} else {
|
||||
assert(0 && "unhandled dim");
|
||||
}
|
||||
}
|
||||
|
||||
// allocate the result tensors
|
||||
// TODO: num results > 1
|
||||
at::Tensor result = util::Zeros(v.sizes(), at::kFloat);
|
||||
if (result.dim() == 4) {
|
||||
args[arguments.size()] = setupArg<float, 4>(result);
|
||||
} else if (result.dim() == 3) {
|
||||
args[arguments.size()] = setupArg<float, 3>(result);
|
||||
} else if (result.dim() == 2) {
|
||||
args[arguments.size()] = setupArg<float, 2>(result);
|
||||
} else if (result.dim() == 1) {
|
||||
args[arguments.size()] = setupArg<float, 1>(result);
|
||||
} else {
|
||||
assert(0 && "unhandled dim");
|
||||
}
|
||||
|
||||
// call the JITed function
|
||||
fptr(args);
|
||||
|
||||
// free pointers to the results
|
||||
// TODO: num results > 1
|
||||
if (result.dim() == 4) {
|
||||
auto arg_storage =
|
||||
static_cast<llvm_tensor_t<float, 4> **>(args[arguments.size()]);
|
||||
auto arg = *arg_storage;
|
||||
delete arg;
|
||||
delete arg_storage;
|
||||
} else if (result.dim() == 3) {
|
||||
auto arg_storage =
|
||||
static_cast<llvm_tensor_t<float, 3> **>(args[arguments.size()]);
|
||||
auto arg = *arg_storage;
|
||||
delete arg;
|
||||
delete arg_storage;
|
||||
} else if (result.dim() == 2) {
|
||||
auto arg_storage =
|
||||
static_cast<llvm_tensor_t<float, 2> **>(args[arguments.size()]);
|
||||
auto arg = *arg_storage;
|
||||
delete arg;
|
||||
delete arg_storage;
|
||||
} else if (result.dim() == 1) {
|
||||
auto arg_storage =
|
||||
static_cast<llvm_tensor_t<float, 1> **>(args[arguments.size()]);
|
||||
auto arg = *arg_storage;
|
||||
delete arg;
|
||||
delete arg_storage;
|
||||
} else {
|
||||
assert(0 && "unhandled dim");
|
||||
}
|
||||
|
||||
// free pointers to the arguments
|
||||
for (int i = 0, e = arguments.size(); i < e; i++) {
|
||||
at::Tensor &t = arguments[i];
|
||||
int dim = t.dim();
|
||||
if (dim == 4) {
|
||||
auto arg_storage = static_cast<llvm_tensor_t<float, 4> **>(args[i]);
|
||||
auto arg = *arg_storage;
|
||||
delete arg;
|
||||
delete arg_storage;
|
||||
} else if (dim == 3) {
|
||||
auto arg_storage = static_cast<llvm_tensor_t<float, 3> **>(args[i]);
|
||||
auto arg = *arg_storage;
|
||||
delete arg;
|
||||
delete arg_storage;
|
||||
} else if (dim == 2) {
|
||||
auto arg_storage = static_cast<llvm_tensor_t<float, 2> **>(args[i]);
|
||||
auto arg = *arg_storage;
|
||||
delete arg;
|
||||
delete arg_storage;
|
||||
} else if (dim == 1) {
|
||||
auto arg_storage = static_cast<llvm_tensor_t<float, 1> **>(args[i]);
|
||||
auto arg = *arg_storage;
|
||||
delete arg;
|
||||
delete arg_storage;
|
||||
} else {
|
||||
assert(0 && "unhandled dim");
|
||||
}
|
||||
}
|
||||
|
||||
// free the array of void* ptrs
|
||||
free(args);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
at::Tensor JitAndRun(const ir::Value &v, mlir::MLIRContext &context) {
|
||||
|
||||
// generate the MLIR
|
||||
std::vector<ir::Value> vs{v};
|
||||
auto mlir_gen = MLIRGen(context).genModule(vs);
|
||||
mlir::OwningModuleRef module = std::move(std::get<0>(mlir_gen));
|
||||
std::vector<at::Tensor> arguments = std::move(std::get<1>(mlir_gen));
|
||||
|
||||
return LowerAndRun(module.get(), arguments, v, context);
|
||||
}
|
||||
|
||||
at::Tensor JitAndRun(const ir::Value &v) {
|
||||
mlir::MLIRContext context;
|
||||
return JitAndRun(v, context);
|
||||
}
|
||||
|
||||
at::Tensor Interpret(const ir::Value &v) { assert(0 && "unsupported"); }
|
||||
} // anonymous namespace
|
||||
|
||||
// FIXME: Why is this code here and not in tensor.cpp?
|
||||
std::string MLIRTensor::GetMLIR() const {
|
||||
|
||||
// generate the MLIR
|
||||
mlir::MLIRContext context;
|
||||
ir::Value ir_value = CurrentIrValue();
|
||||
if (!ir_value)
|
||||
return "<tensor>";
|
||||
|
||||
std::vector<ir::Value> vs{ir_value};
|
||||
auto mlir_gen = MLIRGen(context).genModule(vs);
|
||||
mlir::OwningModuleRef module = std::move(std::get<0>(mlir_gen));
|
||||
|
||||
std::string aten;
|
||||
llvm::raw_string_ostream ss(aten);
|
||||
module->print(ss);
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
at::Tensor MLIRTensor::CompileAndRun() const {
|
||||
return JitAndRun(CurrentIrValue());
|
||||
}
|
||||
|
||||
} // namespace torch_mlir
|
|
@ -1,16 +0,0 @@
|
|||
//===- jit.h ----------------------------------------------------*- C++ -*-===//
|
||||
//
|
||||
// This file is licensed under a pytorch-style license
|
||||
// See frontends/pytorch/LICENSE for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace torch_mlir {
|
||||
// namespace jit {
|
||||
|
||||
// at::Tensor CompileAndRun(const MLIRTensor &tensor);
|
||||
// at::Tensor JitAndRun(const ir::Value &v);
|
||||
//}
|
||||
} // namespace torch_mlir
|
|
@ -1,214 +0,0 @@
|
|||
//===- mlir_gen.cpp ---------------------------------------------*- C++ -*-===//
|
||||
//
|
||||
// This file is licensed under a pytorch-style license
|
||||
// See frontends/pytorch/LICENSE for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "mlir/Dialect/StandardOps/IR/Ops.h"
|
||||
#include "mlir/IR/Builders.h"
|
||||
#include "mlir/IR/Function.h"
|
||||
#include "mlir/IR/Location.h"
|
||||
#include "mlir/IR/MLIRContext.h"
|
||||
#include "mlir/IR/Module.h"
|
||||
#include "mlir/IR/StandardTypes.h"
|
||||
#include "mlir/IR/Types.h"
|
||||
#include "mlir/IR/Verifier.h"
|
||||
|
||||
#include "llvm/Support/Debug.h"
|
||||
|
||||
#include "npcomp/Dialect/ATen/ATenDialect.h"
|
||||
|
||||
#include "ATen/ArrayRef.h"
|
||||
namespace at {
|
||||
template <typename T> using ArrayRef = c10::ArrayRef<T>;
|
||||
}
|
||||
#include "ATen/Tensor.h"
|
||||
|
||||
#include "ir.h"
|
||||
#include "mlir_gen.h"
|
||||
|
||||
#include <iostream>
#include <set>
|
||||
#include <vector>
|
||||
|
||||
#define DEBUG_TYPE "torch_mlir"
|
||||
|
||||
namespace torch_mlir {
|
||||
|
||||
MLIRGen::MLIRGen(mlir::MLIRContext &context) : context(context) {
|
||||
context.getOrLoadDialect<mlir::NPCOMP::aten::ATenDialect>();
|
||||
context.getOrLoadDialect<mlir::StandardOpsDialect>();
|
||||
}
|
||||
|
||||
std::tuple<mlir::OwningModuleRef, std::vector<at::Tensor>>
|
||||
MLIRGen::genModule(std::vector<ir::Value> &v) {
|
||||
// the module
|
||||
module = mlir::ModuleOp::create(mlir::UnknownLoc::get(&context));
|
||||
|
||||
auto fn = genFunction(v);
|
||||
if (fn) {
|
||||
module->push_back(fn);
|
||||
if (failed(mlir::verify(*module))) {
|
||||
emitError(mlir::UnknownLoc::get(&context), "module verification error");
|
||||
}
|
||||
}
|
||||
return std::make_tuple(std::move(module), arguments);
|
||||
}
|
||||
|
||||
mlir::Value MLIRGen::genValue(const ir::Value &v) {
|
||||
|
||||
if (symbolTable.count(v))
|
||||
return symbolTable[v];
|
||||
|
||||
LLVM_DEBUG(llvm::dbgs() << "genValue node: " << v.node->op() << "\n");
|
||||
|
||||
ir::NodePtr node = v.node;
|
||||
auto loc = mlir::UnknownLoc::get(&context);
|
||||
|
||||
for (auto &operand : node->operands())
|
||||
genValue(operand);
|
||||
|
||||
mlir::Value mlirValue = nullptr;
|
||||
if (opTable.count(v.node)) {
|
||||
mlirValue = opTable[v.node]->getResult(v.index);
|
||||
} else {
|
||||
mlir::Operation *mlirOp = node->genMLIR(builder, context, symbolTable);
|
||||
opTable.insert({v.node, mlirOp});
|
||||
assert(mlirOp && "failed to generate mlir op");
|
||||
mlirValue = mlirOp->getResult(v.index);
|
||||
}
|
||||
|
||||
declareSymbol(v, mlirValue);
|
||||
|
||||
return mlirValue;
|
||||
}
|
||||
|
||||
// generate function parameters for the IR rooted at v
|
||||
void MLIRGen::genParameters(const ir::Value &v, std::set<ir::Value> &visited) {
|
||||
ir::NodePtr node = v.node;
|
||||
if (visited.count(v))
|
||||
return;
|
||||
visited.insert(v);
|
||||
for (const ir::Value &operand : node->operands()) {
|
||||
// if the operand is a leaf
|
||||
if (operand.node->op() == ir::OpKind::Get("aten::torch_data")) {
|
||||
parameters.push_back(operand);
|
||||
} else {
|
||||
genParameters(operand, visited);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
mlir::FuncOp MLIRGen::genFunction(std::vector<ir::Value> &vs) {
|
||||
|
||||
auto loc = mlir::UnknownLoc::get(&context);
|
||||
|
||||
auto gen_tensor_ty = [&](const ir::Value &v) {
|
||||
auto shape = v.sizes();
|
||||
auto tdn = dynamic_cast<ir::TorchDataNode *>(v.node.get());
|
||||
mlir::Type elemTy;
|
||||
if (tdn) {
|
||||
auto dtype = tdn->tensor().dtype();
|
||||
if (dtype == at::kFloat)
|
||||
elemTy = mlir::FloatType::getF32(&context);
|
||||
else if (dtype == at::kDouble)
|
||||
elemTy = mlir::FloatType::getF64(&context);
|
||||
else if (dtype == at::kLong)
|
||||
elemTy = mlir::IntegerType::get(64, &context);
|
||||
else if (dtype == at::kInt)
|
||||
elemTy = mlir::IntegerType::get(32, &context);
|
||||
else if (dtype == at::kShort)
|
||||
elemTy = mlir::IntegerType::get(16, &context);
|
||||
else if (dtype == at::kChar || dtype == at::kByte)
|
||||
elemTy = mlir::IntegerType::get(8, &context);
|
||||
else {
|
||||
std::cout << tdn->tensor().dtype() << "\n";
|
||||
assert(0 && "bad type");
|
||||
}
|
||||
} else {
|
||||
elemTy = mlir::FloatType::getF32(&context);
|
||||
}
|
||||
return mlir::RankedTensorType::get(shape, elemTy);
|
||||
};
|
||||
|
||||
std::set<ir::Value> visited;
|
||||
for (auto &v : vs)
|
||||
genParameters(v, visited);
|
||||
|
||||
std::map<ir::Value, ir::Value> parameter_map;
|
||||
std::vector<ir::Value> unique_parameters;
|
||||
|
||||
for (const ir::Value &p : parameters) {
|
||||
bool found = false;
|
||||
for (const ir::Value &q : unique_parameters) {
|
||||
if (p.node->op() == ir::OpKind::Get("aten::torch_data") &&
|
||||
q.node->op() == ir::OpKind::Get("aten::torch_data")) {
|
||||
auto &ptd = *dynamic_cast<ir::TorchDataNode *>(p.node.get());
|
||||
auto &qtd = *dynamic_cast<ir::TorchDataNode *>(q.node.get());
|
||||
if (ptd.tensor().is_same(qtd.tensor())) {
|
||||
found = true;
|
||||
parameter_map.insert({p, q});
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
unique_parameters.push_back(p);
|
||||
}
|
||||
}
|
||||
|
||||
// collect the argument types and tensors
|
||||
std::vector<mlir::Type> arg_types;
|
||||
for (const ir::Value &p : unique_parameters) {
|
||||
// tensor type for the function signature
|
||||
arg_types.push_back(gen_tensor_ty(p));
|
||||
|
||||
// tensor itself for actually calling the graph
|
||||
auto tdn = dynamic_cast<ir::TorchDataNode *>(p.node.get());
|
||||
arguments.push_back(tdn->tensor());
|
||||
}
|
||||
|
||||
// construct return type
|
||||
std::vector<mlir::Type> ret_types;
|
||||
for (auto &v : vs)
|
||||
ret_types.push_back(gen_tensor_ty(v));
|
||||
|
||||
// create the function type and the function itself
|
||||
auto func_type = mlir::FunctionType::get(arg_types, ret_types, &context);
|
||||
auto function =
|
||||
mlir::FuncOp::create(loc, "graph", func_type, /* attrs = */ {});
|
||||
|
||||
// entry
|
||||
auto &entryBlock = *function.addEntryBlock();
|
||||
|
||||
// Declare all the function arguments in the symbol table.
|
||||
for (const auto &i :
|
||||
llvm::zip(unique_parameters, entryBlock.getArguments())) {
|
||||
declareSymbol(std::get<0>(i), std::get<1>(i));
|
||||
}
|
||||
// Declare all the duplicates from the original
|
||||
// parameter list in the symbol table
|
||||
for (auto &k_v : parameter_map) {
|
||||
assert(symbolTable.count(k_v.second));
|
||||
declareSymbol(k_v.first, symbolTable[k_v.second]);
|
||||
}
|
||||
|
||||
builder = std::make_unique<mlir::OpBuilder>(function.getBody());
|
||||
|
||||
std::vector<mlir::Value> rets;
|
||||
for (auto &v : vs)
|
||||
rets.push_back(genValue(v));
|
||||
|
||||
builder->create<mlir::ReturnOp>(loc, rets);
|
||||
return function;
|
||||
}
|
||||
|
||||
bool MLIRGen::declareSymbol(const ir::Value &irValue, mlir::Value mlirValue) {
|
||||
if (symbolTable.count(irValue)) {
|
||||
return false;
|
||||
}
|
||||
symbolTable.insert({irValue, mlirValue});
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace torch_mlir
|
|
@ -1,45 +0,0 @@
//===- mlir_gen.h -----------------------------------------------*- C++ -*-===//
//
// This file is licensed under a pytorch-style license
// See frontends/pytorch/LICENSE for license information.
//
//===----------------------------------------------------------------------===//

#pragma once

#include "mlir/IR/MLIRContext.h"

#include "ir.h"

namespace torch_mlir {

/// This class generates MLIR from a pytorch graph
class MLIRGen {

public:
  MLIRGen(mlir::MLIRContext &context);

  // Generate an MLIR model that computes the given outputs.
  std::tuple<mlir::OwningModuleRef, std::vector<at::Tensor>>
  genModule(std::vector<ir::Value> &v);

private:
  mlir::Value genValue(const ir::Value &v);

  void genParameters(const ir::Value &v, std::set<ir::Value> &visited);

  mlir::FuncOp genFunction(std::vector<ir::Value> &v);

  bool declareSymbol(const ir::Value &irValue, mlir::Value mlirValue);

private:
  mlir::MLIRContext &context;
  mlir::OwningModuleRef module;
  std::unique_ptr<mlir::OpBuilder> builder;
  std::map<const ir::Value, mlir::Value> symbolTable;
  std::map<const ir::NodePtr, mlir::Operation *> opTable;
  std::vector<ir::Value> parameters;
  std::vector<at::Tensor> arguments;
};

} // namespace torch_mlir
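The genModule entry point above is exercised by MLIRTensor::GetMLIR earlier in this change. The sketch below restates that flow as a standalone helper; `DumpValueAsMLIR` is an illustrative name, not part of the deleted sources.

```cpp
#include <string>
#include <vector>

#include "llvm/Support/raw_ostream.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/IR/Module.h"

#include "ir.h"
#include "mlir_gen.h"

namespace torch_mlir {

// Builds a module for a single traced output value and returns its textual
// form, mirroring what MLIRTensor::GetMLIR does.
std::string DumpValueAsMLIR(const ir::Value &output) {
  mlir::MLIRContext context;
  std::vector<ir::Value> outputs{output};

  // genModule returns the module plus the at::Tensor arguments that feed the
  // generated "graph" function.
  auto result = MLIRGen(context).genModule(outputs);
  mlir::OwningModuleRef module = std::move(std::get<0>(result));

  std::string text;
  llvm::raw_string_ostream os(text);
  module->print(os);
  return os.str();
}

} // namespace torch_mlir
```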
@ -1,137 +0,0 @@
|
|||
//===- init_python_bindings.cpp ---------------------------------*- C++ -*-===//
|
||||
//
|
||||
// This file is licensed under a pytorch-style license
|
||||
// See frontends/pytorch/LICENSE for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// This file implements Python bindings to the MLIR/NPCOMP ATen dialect.
|
||||
// Roughly speaking, it enables something like this:
|
||||
//
|
||||
// dev = torch_mlir.mlir_device()
|
||||
// t0 = torch.randn((4,4), device=dev)
|
||||
// t1 = torch.randn((4,4), device=dev)
|
||||
// t2 = t0 + t1
|
||||
// t2_mlir = torch_mlir.get_mlir( t2 )
|
||||
// t2_cpu = t2.to('cpu')
|
||||
//
|
||||
// In this case t2_cpu contains the result of the computation, and t2_mlir
|
||||
// contains the mlir description of the computation.
|
||||
|
||||
#include "../pybind.h"
|
||||
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/MemoryBuffer.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
|
||||
#include "mlir/Conversion/SCFToStandard/SCFToStandard.h"
|
||||
#include "mlir/IR/MLIRContext.h"
|
||||
#include "mlir/IR/Module.h"
|
||||
#include "mlir/IR/Verifier.h"
|
||||
#include "mlir/Parser.h"
|
||||
#include "mlir/Pass/Pass.h"
|
||||
#include "mlir/Pass/PassManager.h"
|
||||
#include "mlir/Transforms/Passes.h"
|
||||
|
||||
#include "npcomp/Dialect/ATen/ATenDialect.h"
|
||||
#include "npcomp/Dialect/ATen/ATenOpReport.h"
|
||||
#include "npcomp/Dialect/ATen/ATenPasses.h"
|
||||
#include "npcomp/Dialect/ATen/LivenessReport.h"
|
||||
|
||||
namespace py = pybind11;
|
||||
|
||||
// Then ATen headers with workarounds
|
||||
#include "ATen/ArrayRef.h"
|
||||
namespace at {
|
||||
template <typename T> using ArrayRef = c10::ArrayRef<T>;
|
||||
}
|
||||
#include "ATen/SmallVector.h"
|
||||
namespace at {
|
||||
template <typename T, int S> using SmallVector = c10::SmallVector<T, S>;
|
||||
}
|
||||
#include <ATen/Tensor.h>
|
||||
|
||||
// other headers
|
||||
|
||||
#include "aten_mlir_bridge.h"
|
||||
#include "aten_mlir_type.h"
|
||||
#include "init_python_bindings.h"
|
||||
#include "mlir_gen.h"
|
||||
|
||||
#include <string>
|
||||
|
||||
using namespace mlir;
|
||||
|
||||
namespace llvm {
|
||||
extern bool DebugFlag;
|
||||
}
|
||||
|
||||
namespace torch_mlir {
|
||||
namespace {
|
||||
|
||||
mlir::OwningModuleRef LoadModule(mlir::MLIRContext &context, std::string mlir) {
|
||||
|
||||
mlir::OwningModuleRef module;
|
||||
|
||||
std::unique_ptr<llvm::MemoryBuffer> membuf =
|
||||
llvm::MemoryBuffer::getMemBuffer(mlir);
|
||||
|
||||
llvm::SourceMgr sourceMgr;
|
||||
sourceMgr.AddNewSourceBuffer(std::move(membuf), llvm::SMLoc());
|
||||
module = mlir::parseSourceFile(sourceMgr, &context);
|
||||
|
||||
if (!module) {
  llvm::errs() << "Error: can't parse MLIR module\n";
  return nullptr;
}
if (failed(mlir::verify(*module))) {
  llvm::errs() << "Error verifying MLIR module\n";
  return nullptr;
}
return module;
}
|
||||
|
||||
void InitModuleBindings(py::module &m) {
|
||||
m.def("_initialize_aten_bindings",
|
||||
[]() { ATenMLIRType::InitializeAtenBindings(); });
|
||||
m.def("_set_default_device", []() {});
|
||||
|
||||
m.def("_get_mlir", [](std::vector<at::Tensor> &ts) -> std::string {
|
||||
if (ts.size() == 0)
|
||||
return std::string();
|
||||
|
||||
mlir::MLIRContext context;
|
||||
|
||||
// gather IR for all the tensors
|
||||
std::vector<ir::Value> recorded_ir;
|
||||
for (auto &t : ts)
|
||||
if (c10::optional<MLIRTensor> at = bridge::TryGetMLIRTensor(t))
|
||||
recorded_ir.push_back(at->GetIrValue());
|
||||
|
||||
// generate MLIR from IR
|
||||
auto mlir_gen = MLIRGen(context).genModule(recorded_ir);
|
||||
mlir::OwningModuleRef module = std::move(std::get<0>(mlir_gen));
|
||||
|
||||
mlir::PassManager pm(module->getContext());
|
||||
|
||||
pm.addPass(mlir::createCSEPass());
|
||||
pm.addPass(mlir::NPCOMP::aten::createATenLayerNamePass());
|
||||
if (failed(pm.run(*module))) {
|
||||
llvm::errs() << "ATenLayerNamePass failed";
|
||||
return "<error>";
|
||||
}
|
||||
|
||||
// dump MLIR to string and return
|
||||
std::string s;
|
||||
llvm::raw_string_ostream ss(s);
|
||||
module->print(ss);
|
||||
return ss.str();
|
||||
});
|
||||
}
|
||||
} // namespace
|
||||
|
||||
void InitTypeDispatchBindings(py::module &m) { InitModuleBindings(m); }
|
||||
|
||||
} // namespace torch_mlir
|
|
@ -1,613 +0,0 @@
|
|||
//===- tensor.cpp -----------------------------------------------*- C++ -*-===//
|
||||
//
|
||||
// This file is licensed under a pytorch-style license
|
||||
// See frontends/pytorch/LICENSE for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "llvm/Support/Debug.h"
|
||||
|
||||
#include "ATen/ArrayRef.h"
|
||||
namespace at {
|
||||
template <typename T> using ArrayRef = c10::ArrayRef<T>;
|
||||
}
|
||||
#include "ATen/Tensor.h"
|
||||
|
||||
#include "jit.h"
|
||||
#include "tensor.h"
|
||||
|
||||
#include <atomic>
|
||||
|
||||
#define DEBUG_TYPE "torch_mlir"
|
||||
|
||||
namespace torch_mlir {
|
||||
|
||||
MLIRTensor MLIRTensor::Create(const at::Tensor &tensor, const Device &device) {
|
||||
assert(tensor.device().type() == at::kCPU);
|
||||
MLIRTensor device_tensor(tensor, device);
|
||||
return device_tensor;
|
||||
}
|
||||
|
||||
MLIRTensor
|
||||
MLIRTensor::Create(ir::Value ir_value, const Device &device,
|
||||
c10::optional<at::ScalarType> logical_element_type) {
|
||||
MLIRTensor device_tensor(std::move(ir_value), device, logical_element_type);
|
||||
return device_tensor;
|
||||
}
|
||||
|
||||
MLIRTensor::MLIRTensor(const at::Tensor &tensor, const Device &device)
|
||||
: data_(std::make_shared<Data>(tensor, device)) {}
|
||||
|
||||
MLIRTensor::MLIRTensor(ir::Value ir_value, const Device &device,
|
||||
c10::optional<at::ScalarType> logical_element_type)
|
||||
: data_(std::make_shared<Data>(std::move(ir_value), device,
|
||||
logical_element_type)) {}
|
||||
|
||||
MLIRTensor::Data *MLIRTensor::data() const {
|
||||
assert(data_ != nullptr && "Trying to access null data");
|
||||
return data_.get();
|
||||
}
|
||||
|
||||
at::ScalarType MLIRTensor::dtype() const {
|
||||
return data()->logical_element_type ? *data()->logical_element_type
|
||||
: at::ScalarType::Float;
|
||||
}
|
||||
|
||||
const Device &MLIRTensor::GetDevice() const { return data()->device; }
|
||||
|
||||
uint64_t MLIRTensor::GetNextTensorId() {
|
||||
static std::atomic<uint64_t> *id_generator = new std::atomic<uint64_t>(1);
|
||||
return id_generator->fetch_add(1);
|
||||
}
|
||||
|
||||
void MLIRTensor::SetTensorData(at::Tensor tensor_data) {
|
||||
data()->tensor_data = std::move(tensor_data);
|
||||
}
|
||||
|
||||
ir::Value MLIRTensor::GetIrValue() const {
|
||||
ir::Value ir_value = CurrentIrValue();
|
||||
if (ir_value) {
|
||||
return ir_value;
|
||||
}
|
||||
c10::optional<at::Tensor> tensor_data = CurrentTensorData();
|
||||
if (tensor_data) {
|
||||
at::Tensor tensor = *tensor_data;
|
||||
if (!tensor.dim()) {
|
||||
auto dtype = tensor.dtype();
|
||||
if (dtype == at::kFloat) {
|
||||
auto d = tensor.data_ptr<float>();
|
||||
return ir::Value(std::make_shared<ir::ConstantNode>(d[0]));
|
||||
} else if (dtype == at::kDouble) {
|
||||
auto d = tensor.data_ptr<double>();
|
||||
return ir::Value(std::make_shared<ir::ConstantNode>(d[0]));
|
||||
} else if (dtype == at::kLong) {
|
||||
auto d = tensor.data_ptr<int64_t>();
|
||||
return ir::Value(std::make_shared<ir::ConstantNode>(d[0]));
|
||||
} else if (dtype == at::kInt) {
|
||||
auto d = tensor.data_ptr<int32_t>();
|
||||
return ir::Value(std::make_shared<ir::ConstantNode>(d[0]));
|
||||
} else if (dtype == at::kShort) {
|
||||
auto d = tensor.data_ptr<int16_t>();
|
||||
return ir::Value(std::make_shared<ir::ConstantNode>(d[0]));
|
||||
} else if (dtype == at::kChar || dtype == at::kByte) {
|
||||
auto d = tensor.data_ptr<int8_t>();
|
||||
return ir::Value(std::make_shared<ir::ConstantNode>(d[0]));
|
||||
}
|
||||
// fall through to TorchDataNode below
|
||||
}
|
||||
return ir::Value(std::make_shared<ir::TorchDataNode>(*tensor_data));
|
||||
}
|
||||
assert(0 && "Could not create ir value from leaf tensor");
|
||||
return ir::Value();
|
||||
}
|
||||
|
||||
ir::Value MLIRTensor::CurrentIrValue() const { return data()->ir_value; }
|
||||
|
||||
void MLIRTensor::SetIrValue(ir::Value ir_value) {
|
||||
data()->generation += 1;
|
||||
data()->ir_value = std::move(ir_value);
|
||||
}
|
||||
|
||||
c10::optional<at::Tensor> MLIRTensor::CurrentTensorData() const {
|
||||
return data()->tensor_data;
|
||||
}
|
||||
|
||||
void MLIRTensor::SetTensor(at::Tensor tensor) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
SetTensorData(tensor);
|
||||
data()->generation += 1;
|
||||
}
|
||||
|
||||
at::Tensor MLIRTensor::ToTensor() const {
|
||||
c10::optional<at::Tensor> tensor_data = CurrentTensorData();
|
||||
if (!tensor_data)
|
||||
tensor_data = CompileAndRun();
|
||||
assert(tensor_data);
|
||||
return *tensor_data;
|
||||
}
|
||||
|
||||
void MLIRTensor::ShallowCopyTo(MLIRTensor *dest) const {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
|
||||
auto data = CurrentTensorData();
|
||||
if (data)
|
||||
dest->SetTensor(*data);
|
||||
else
|
||||
dest->SetIrValue(CurrentIrValue());
|
||||
|
||||
dest->SetScalarType(dtype());
|
||||
assert(GetDevice() == dest->GetDevice());
|
||||
}
|
||||
|
||||
void MLIRTensor::SetScalarType(
|
||||
c10::optional<at::ScalarType> logical_element_type) {
|
||||
data()->logical_element_type = logical_element_type;
|
||||
}
|
||||
|
||||
std::vector<int64_t> MLIRTensor::sizes() const {
|
||||
if (data()->ir_value) {
|
||||
return data()->ir_value.sizes();
|
||||
}
|
||||
assert(data()->tensor_data && "tensor has no shape information");
|
||||
if (data()->tensor_data) {
|
||||
auto s = data()->tensor_data->sizes();
|
||||
return {s.begin(), s.end()};
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
std::vector<int64_t> MLIRTensor::strides() const {
|
||||
if (data()->ir_value) {
|
||||
return data()->ir_value.strides();
|
||||
}
|
||||
assert(data()->tensor_data && "tensor has no shape information");
|
||||
if (data()->tensor_data) {
|
||||
auto s = data()->tensor_data->strides();
|
||||
return {s.begin(), s.end()};
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::CreateFrom(ir::Value ir_value) const {
|
||||
return Create(std::move(ir_value), GetDevice(), dtype());
|
||||
}
|
||||
|
||||
////////////////////////////////////////////
|
||||
// aten tensor methods
|
||||
////////////////////////////////////////////
|
||||
|
||||
MLIRTensor MLIRTensor::_adaptive_avg_pool2d(const MLIRTensor &self,
|
||||
at::IntArrayRef output_size) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node = std::make_shared<ir::AdaptiveAvgPool2dNode>(
|
||||
self.GetIrValue(), output_size);
|
||||
return self.CreateFrom(node);
|
||||
}
|
||||
|
||||
MLIRTensor
|
||||
MLIRTensor::_adaptive_avg_pool2d_backward(const MLIRTensor &grad_output,
|
||||
const MLIRTensor &self) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node =
|
||||
std::make_shared<ir::AdaptiveAvgPool2dBackwardNode>(
|
||||
grad_output.GetIrValue(), self.GetIrValue());
|
||||
return self.CreateFrom(node);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::add(const MLIRTensor &self, const MLIRTensor &other,
|
||||
at::Scalar alpha) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node = std::make_shared<ir::AddNode>(
|
||||
self.GetIrValue(), other.GetIrValue(),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(alpha)));
|
||||
return self.CreateFrom(node);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::add_(MLIRTensor &self, const MLIRTensor &other,
|
||||
at::Scalar alpha) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node = std::make_shared<ir::AddInPlaceNode>(
|
||||
self.GetIrValue(), other.GetIrValue(),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(alpha)));
|
||||
return self.CreateFrom(node);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::addmm(const MLIRTensor &input, const MLIRTensor &mat1,
|
||||
const MLIRTensor &mat2, at::Scalar beta,
|
||||
at::Scalar alpha) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node = std::make_shared<ir::AddmmNode>(
|
||||
input.GetIrValue(), mat1.GetIrValue(), mat2.GetIrValue(),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(beta)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(alpha)));
|
||||
return input.CreateFrom(node);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::as_strided(const MLIRTensor &input, at::IntArrayRef size,
|
||||
at::IntArrayRef stride,
|
||||
c10::optional<int64_t> storage_offset) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node = std::make_shared<ir::AsStridedNode>(
|
||||
input.GetIrValue(), size, stride, storage_offset);
|
||||
return input.CreateFrom(node);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::clone(const MLIRTensor &input) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
return MLIRTensor::Create(std::move(input.ToTensor()), input.GetDevice());
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::convolution(
|
||||
const MLIRTensor &input, const MLIRTensor &weight, const MLIRTensor &bias,
|
||||
at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation,
|
||||
bool transposed, at::IntArrayRef output_padding, int64_t groups) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node = std::make_shared<ir::Conv2dNode>(
|
||||
input.GetIrValue(), weight.GetIrValue(), bias.GetIrValue(), stride,
|
||||
padding, dilation, transposed, output_padding, groups);
|
||||
return input.CreateFrom(node);
|
||||
}
|
||||
|
||||
std::tuple<MLIRTensor, MLIRTensor, MLIRTensor> MLIRTensor::convolution_backward(
|
||||
const MLIRTensor &grad_output, const MLIRTensor &input,
|
||||
const MLIRTensor &weight, at::IntArrayRef stride, at::IntArrayRef padding,
|
||||
at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding,
|
||||
int64_t groups, std::array<bool, 3> output_mask) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node = std::make_shared<ir::Conv2dBackwardNode>(
|
||||
grad_output.GetIrValue(), input.GetIrValue(), weight.GetIrValue(), stride,
|
||||
padding, dilation, transposed, output_padding, groups /*, output_mask*/);
|
||||
auto result0 = input.CreateFrom(ir::Value(node, 0));
|
||||
auto result1 = input.CreateFrom(ir::Value(node, 1));
|
||||
auto result2 = input.CreateFrom(ir::Value(node, 2));
|
||||
return std::make_tuple(result0, result1, result2);
|
||||
}
|
||||
|
||||
void MLIRTensor::copy_(MLIRTensor &self, MLIRTensor &src) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
src.ShallowCopyTo(&self);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::div(const MLIRTensor &self, at::Scalar other) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node = std::make_shared<ir::DivNode>(
|
||||
self.GetIrValue(), ir::Value(std::make_shared<ir::ConstantNode>(other)));
|
||||
return self.CreateFrom(node);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::div(const MLIRTensor &self, const MLIRTensor &other) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node =
|
||||
std::make_shared<ir::DivNode>(self.GetIrValue(), other.GetIrValue());
|
||||
return self.CreateFrom(node);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::div_(MLIRTensor &self, const MLIRTensor &other) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node = std::make_shared<ir::DivInPlaceNode>(
|
||||
self.GetIrValue(), other.GetIrValue());
|
||||
return self.CreateFrom(node);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::expand(const MLIRTensor &self, at::IntArrayRef size,
|
||||
bool implicit) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node =
|
||||
std::make_shared<ir::ExpandNode>(self.GetIrValue(), size, implicit);
|
||||
return self.CreateFrom(node);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::gather(const MLIRTensor &self, int64_t dim,
|
||||
const MLIRTensor &index, bool sparse_grad) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node = std::make_shared<ir::GatherNode>(
|
||||
self.GetIrValue(), dim, index.GetIrValue(), sparse_grad);
|
||||
return self.CreateFrom(node);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::hardtanh(const MLIRTensor &self, at::Scalar min_val,
|
||||
at::Scalar max_val) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node = std::make_shared<ir::HardtanhNode>(
|
||||
self.GetIrValue(), ir::Value(std::make_shared<ir::ConstantNode>(min_val)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(max_val)));
|
||||
return self.CreateFrom(node);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::hardtanh_(MLIRTensor &self, at::Scalar min_val,
|
||||
at::Scalar max_val) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node = std::make_shared<ir::HardtanhInPlaceNode>(
|
||||
self.GetIrValue(), ir::Value(std::make_shared<ir::ConstantNode>(min_val)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(max_val)));
|
||||
return self.CreateFrom(node);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::hardtanh_backward(const MLIRTensor &grad_output,
|
||||
const MLIRTensor &self,
|
||||
at::Scalar min_val,
|
||||
at::Scalar max_val) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node = std::make_shared<ir::HardtanhBackwardNode>(
|
||||
grad_output.GetIrValue(), self.GetIrValue(),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(min_val)),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(max_val)));
|
||||
return self.CreateFrom(node);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::_log_softmax(const MLIRTensor &input, int64_t dim,
|
||||
bool half_to_float) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node = std::make_shared<ir::LogSoftmaxNode>(
|
||||
input.GetIrValue(), dim, half_to_float);
|
||||
return input.CreateFrom(node);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::_log_softmax_backward_data(const MLIRTensor &grad_output,
|
||||
const MLIRTensor &output,
|
||||
int64_t dim,
|
||||
const MLIRTensor &input) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node = std::make_shared<ir::LogSoftmaxBackwardNode>(
|
||||
grad_output.GetIrValue(), output.GetIrValue(), dim, input.GetIrValue());
|
||||
return input.CreateFrom(node);
|
||||
}
|
||||
|
||||
std::tuple<MLIRTensor, MLIRTensor> MLIRTensor::max_pool2d_with_indices(
|
||||
const MLIRTensor &input, at::IntArrayRef kernel_size,
|
||||
at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation,
|
||||
bool ceil_mode) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node =
|
||||
std::make_shared<ir::MaxPool2dWithIndicesNode>(
|
||||
input.GetIrValue(), kernel_size, stride, padding, dilation,
|
||||
ceil_mode);
|
||||
auto result0 = input.CreateFrom(ir::Value(node, 0));
|
||||
auto result1 = input.CreateFrom(ir::Value(node, 1));
|
||||
return std::make_tuple(result0, result1);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::max_pool2d_with_indices_backward(
|
||||
const MLIRTensor &grad_output, const MLIRTensor &input,
|
||||
at::IntArrayRef kernel_size, at::IntArrayRef stride,
|
||||
at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode,
|
||||
const MLIRTensor &indices) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node =
|
||||
std::make_shared<ir::MaxPool2dWithIndicesBackwardNode>(
|
||||
grad_output.GetIrValue(), input.GetIrValue(), kernel_size, stride,
|
||||
padding, dilation, ceil_mode, indices.GetIrValue());
|
||||
return input.CreateFrom(node);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::mean(const MLIRTensor &input,
|
||||
c10::optional<at::ScalarType> dtype) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node =
|
||||
std::make_shared<ir::MeanNode>(input.GetIrValue(), dtype);
|
||||
return input.CreateFrom(node);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::mean(const MLIRTensor &input, at::IntArrayRef dim,
|
||||
bool keepdim, c10::optional<at::ScalarType> dtype) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node =
|
||||
std::make_shared<ir::MeanNode>(input.GetIrValue(), dim, keepdim, dtype);
|
||||
return input.CreateFrom(node);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::mm(const MLIRTensor &input, const MLIRTensor &mat1) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node =
|
||||
std::make_shared<ir::MMNode>(input.GetIrValue(), mat1.GetIrValue());
|
||||
return input.CreateFrom(node);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::mul(const MLIRTensor &self, const MLIRTensor &other) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node =
|
||||
std::make_shared<ir::MulNode>(self.GetIrValue(), other.GetIrValue());
|
||||
return self.CreateFrom(node);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::mul_(MLIRTensor &self, const MLIRTensor &other) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node = std::make_shared<ir::MulInPlaceNode>(
|
||||
self.GetIrValue(), other.GetIrValue());
|
||||
return self.CreateFrom(node);
|
||||
}
|
||||
|
||||
std::tuple<MLIRTensor, MLIRTensor, MLIRTensor> MLIRTensor::native_batch_norm(
|
||||
const MLIRTensor &self, const MLIRTensor &weight, const MLIRTensor &bias,
|
||||
const MLIRTensor &running_mean, const MLIRTensor &running_var,
|
||||
bool training, double momentum, double eps) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node = std::make_shared<ir::BatchNormNode>(
|
||||
self.GetIrValue(), weight.GetIrValue(), bias.GetIrValue(),
|
||||
running_mean.GetIrValue(), running_var.GetIrValue(), training, momentum,
|
||||
eps);
|
||||
auto result0 = self.CreateFrom(ir::Value(node, 0));
|
||||
auto result1 = self.CreateFrom(ir::Value(node, 1));
|
||||
auto result2 = self.CreateFrom(ir::Value(node, 2));
|
||||
return std::make_tuple(result0, result1, result2);
|
||||
}
|
||||
|
||||
std::tuple<MLIRTensor, MLIRTensor, MLIRTensor>
|
||||
MLIRTensor::native_batch_norm_backward(
|
||||
const MLIRTensor &grad_out, const MLIRTensor &input,
|
||||
const MLIRTensor &weight, const MLIRTensor &running_mean,
|
||||
const MLIRTensor &running_var, const MLIRTensor &save_mean,
|
||||
const MLIRTensor &save_invstd, bool train, double eps,
|
||||
std::array<bool, 3> output_mask) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node = std::make_shared<ir::BatchNormBackwardNode>(
|
||||
grad_out.GetIrValue(), input.GetIrValue(), weight.GetIrValue(),
|
||||
running_mean.GetIrValue(), running_var.GetIrValue(),
|
||||
save_mean.GetIrValue(), save_invstd.GetIrValue(), train, eps,
|
||||
output_mask);
|
||||
auto result0 = input.CreateFrom(ir::Value(node, 0));
|
||||
auto result1 = input.CreateFrom(ir::Value(node, 1));
|
||||
auto result2 = input.CreateFrom(ir::Value(node, 2));
|
||||
return std::make_tuple(result0, result1, result2);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::neg(const MLIRTensor &input) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node =
|
||||
std::make_shared<ir::NegNode>(input.GetIrValue());
|
||||
return input.CreateFrom(node);
|
||||
}
|
||||
|
||||
std::tuple<MLIRTensor, MLIRTensor>
|
||||
MLIRTensor::nll_loss2d_forward(const MLIRTensor &self, const MLIRTensor &target,
|
||||
const MLIRTensor &weight, int64_t reduction,
|
||||
int64_t ignore_index) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node = std::make_shared<ir::NllLoss2dForwardNode>(
|
||||
self.GetIrValue(), target.GetIrValue(), weight.GetIrValue(), reduction,
|
||||
ignore_index);
|
||||
auto result0 = self.CreateFrom(ir::Value(node, 0));
|
||||
auto result1 = self.CreateFrom(ir::Value(node, 1));
|
||||
return std::make_tuple(result0, result1);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::nll_loss2d_backward(
|
||||
const MLIRTensor &grad_output, const MLIRTensor &self,
|
||||
const MLIRTensor &target, const MLIRTensor &weight, int64_t reduction,
|
||||
int64_t ignore_index, const MLIRTensor &total_weight) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node = std::make_shared<ir::NllLoss2dBackwardNode>(
|
||||
grad_output.GetIrValue(), self.GetIrValue(), target.GetIrValue(),
|
||||
weight.GetIrValue(), reduction, ignore_index, total_weight.GetIrValue());
|
||||
return self.CreateFrom(node);
|
||||
}
|
||||
|
||||
std::tuple<MLIRTensor, MLIRTensor>
|
||||
MLIRTensor::nll_loss_forward(const MLIRTensor &self, const MLIRTensor &target,
|
||||
const MLIRTensor &weight, int64_t reduction,
|
||||
int64_t ignore_index) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node = std::make_shared<ir::NllLossForwardNode>(
|
||||
self.GetIrValue(), target.GetIrValue(), weight.GetIrValue(), reduction,
|
||||
ignore_index);
|
||||
auto result0 = self.CreateFrom(ir::Value(node, 0));
|
||||
auto result1 = self.CreateFrom(ir::Value(node, 1));
|
||||
return std::make_tuple(result0, result1);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::nll_loss_backward(
|
||||
const MLIRTensor &grad_output, const MLIRTensor &self,
|
||||
const MLIRTensor &target, const MLIRTensor &weight, int64_t reduction,
|
||||
int64_t ignore_index, const MLIRTensor &total_weight) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node = std::make_shared<ir::NllLossBackwardNode>(
|
||||
grad_output.GetIrValue(), self.GetIrValue(), target.GetIrValue(),
|
||||
weight.GetIrValue(), reduction, ignore_index, total_weight.GetIrValue());
|
||||
return self.CreateFrom(node);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::sum(const MLIRTensor &input, at::IntArrayRef dim,
|
||||
bool keepdim, c10::optional<at::ScalarType> dtype) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node =
|
||||
std::make_shared<ir::SumNode>(input.GetIrValue(), dim, keepdim, dtype);
|
||||
return input.CreateFrom(node);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::relu(const MLIRTensor &input) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node =
|
||||
std::make_shared<ir::ReLUNode>(input.GetIrValue());
|
||||
return input.CreateFrom(node);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::relu_(MLIRTensor &input) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node =
|
||||
std::make_shared<ir::ReLUInPlaceNode>(input.GetIrValue());
|
||||
return input.CreateFrom(node);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::size(const MLIRTensor &input, int64_t dim) {
|
||||
assert(0);
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node =
|
||||
std::make_shared<ir::SizeNode>(input.GetIrValue(), dim);
|
||||
return input.CreateFrom(node);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::squeeze(const MLIRTensor &input, int64_t dim) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node =
|
||||
std::make_shared<ir::SqueezeNode>(input.GetIrValue(), dim);
|
||||
return input.CreateFrom(node);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::sub(const MLIRTensor &self, const MLIRTensor &other,
|
||||
at::Scalar alpha) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node = std::make_shared<ir::SubNode>(
|
||||
self.GetIrValue(), other.GetIrValue(),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(alpha)));
|
||||
return self.CreateFrom(node);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::sub_(MLIRTensor &self, const MLIRTensor &other,
|
||||
at::Scalar alpha) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node = std::make_shared<ir::SubInPlaceNode>(
|
||||
self.GetIrValue(), other.GetIrValue(),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(alpha)));
|
||||
return self.CreateFrom(node);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::t(const MLIRTensor &input) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node =
|
||||
std::make_shared<ir::TransposeNode>(input.GetIrValue());
|
||||
return input.CreateFrom(node);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::threshold_backward(const MLIRTensor &grad_output,
|
||||
const MLIRTensor &input,
|
||||
at::Scalar threshold) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node = std::make_shared<ir::ThresholdBackwardNode>(
|
||||
grad_output.GetIrValue(), input.GetIrValue(),
|
||||
ir::Value(std::make_shared<ir::ConstantNode>(threshold)));
|
||||
return input.CreateFrom(node);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::to(MLIRTensor &input, c10::optional<Device> device,
|
||||
c10::optional<at::ScalarType> scalar_type) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
if (!device) {
|
||||
device = input.GetDevice();
|
||||
}
|
||||
if (!scalar_type) {
|
||||
scalar_type = input.dtype();
|
||||
}
|
||||
|
||||
MLIRTensor new_tensor = Create(input.ToTensor(), *device);
|
||||
|
||||
if (input.dtype() != *scalar_type) {
|
||||
new_tensor.SetScalarType(*scalar_type);
|
||||
}
|
||||
return new_tensor;
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::unsqueeze(const MLIRTensor &input, int64_t dim) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node =
|
||||
std::make_shared<ir::UnsqueezeNode>(input.GetIrValue(), dim);
|
||||
return input.CreateFrom(node);
|
||||
}
|
||||
|
||||
MLIRTensor MLIRTensor::view(const MLIRTensor &input, at::IntArrayRef size) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "MLIRTensor::" << __func__ << "\n");
|
||||
std::shared_ptr<ir::Node> node =
|
||||
std::make_shared<ir::ViewNode>(input.GetIrValue(), size);
|
||||
return input.CreateFrom(node);
|
||||
}
|
||||
|
||||
} // namespace torch_mlir
|
|
@ -1,275 +0,0 @@
|
|||
//===- tensor.h -------------------------------------------------*- C++ -*-===//
|
||||
//
|
||||
// This file is licensed under a pytorch-style license
|
||||
// See frontends/pytorch/LICENSE for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "device.h"
|
||||
#include "ir.h"
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#include <ATen/Tensor.h>
|
||||
#include <c10/util/ArrayRef.h>
|
||||
|
||||
namespace torch_mlir {
|
||||
|
||||
class MLIRTensor {
|
||||
struct Data;
|
||||
|
||||
public:
|
||||
static MLIRTensor Create(const at::Tensor &tensor, const Device &device);
|
||||
static MLIRTensor Create(ir::Value ir_value, const Device &device,
|
||||
c10::optional<at::ScalarType> logical_element_type);
|
||||
|
||||
MLIRTensor() = default;
|
||||
|
||||
bool is_null() const { return data_ptr() == nullptr; }
|
||||
|
||||
void ShallowCopyTo(MLIRTensor *dest) const;
|
||||
|
||||
void SetTensor(at::Tensor tensor);
|
||||
void SetIrValue(ir::Value ir_value);
|
||||
|
||||
at::ScalarType dtype() const;
|
||||
|
||||
// Set logical_element_type which is visible to upstream PyTorch.
|
||||
void SetScalarType(c10::optional<at::ScalarType> logical_element_type);
|
||||
|
||||
std::vector<int64_t> sizes() const;
|
||||
std::vector<int64_t> strides() const;
|
||||
|
||||
at::Tensor ToTensor() const;
|
||||
|
||||
const Device &GetDevice() const;
|
||||
|
||||
size_t generation() const { return data()->generation; }
|
||||
|
||||
std::string GetMLIR() const;
|
||||
|
||||
// Retrieves the IR Node representing this MLIRTensor. One will be created if
|
||||
// missing. Note that although this is a const API, it actually changes the
|
||||
// internal state of the object.
|
||||
ir::Value GetIrValue() const;
|
||||
|
||||
at::Tensor CompileAndRun() const;
|
||||
|
||||
uint64_t id() const { return data()->unique_id; }
|
||||
|
||||
private:
|
||||
struct Data {
|
||||
Data(at::Tensor tensor_data, const Device &device)
|
||||
: logical_element_type(tensor_data.scalar_type()),
|
||||
tensor_data(std::move(tensor_data)), device(device),
|
||||
unique_id(GetNextTensorId()) {}
|
||||
|
||||
Data(ir::Value ir_value, const Device &device,
|
||||
c10::optional<at::ScalarType> logical_element_type)
|
||||
: logical_element_type(logical_element_type),
|
||||
ir_value(std::move(ir_value)), device(device),
|
||||
unique_id(GetNextTensorId()) {}
|
||||
|
||||
~Data(){};
|
||||
|
||||
c10::optional<at::ScalarType> logical_element_type;
|
||||
c10::optional<at::Tensor> tensor_data;
|
||||
ir::Value ir_value;
|
||||
|
||||
const Device device;
|
||||
const uint64_t unique_id = 0;
|
||||
size_t generation = 1;
|
||||
};
|
||||
|
||||
MLIRTensor(const at::Tensor &tensor, const Device &device);
|
||||
|
||||
MLIRTensor(ir::Value ir_value, const Device &device,
|
||||
c10::optional<at::ScalarType> logical_element_type = c10::nullopt);
|
||||
|
||||
void SetTensorData(at::Tensor tensor_data);
|
||||
|
||||
c10::optional<at::Tensor> CurrentTensorData() const;
|
||||
|
||||
// Retrieves the current IR Node, or nullptr in case no active IR Node is
|
||||
// available.
|
||||
ir::Value CurrentIrValue() const;
|
||||
|
||||
Data *data() const;
|
||||
|
||||
std::shared_ptr<Data> data_ptr() const { return data_; }
|
||||
|
||||
MLIRTensor CreateFrom(ir::Value ir_value) const;
|
||||
|
||||
static uint64_t GetNextTensorId();
|
||||
|
||||
std::shared_ptr<Data> data_;
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// ATEN operators follows here, listed in alphabetical order.
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
public:
|
||||
static MLIRTensor _adaptive_avg_pool2d(const MLIRTensor &self,
|
||||
at::IntArrayRef output_size);
|
||||
|
||||
static MLIRTensor _adaptive_avg_pool2d_backward(const MLIRTensor &grad_output,
|
||||
const MLIRTensor &self);
|
||||
|
||||
static MLIRTensor add(const MLIRTensor &input, const MLIRTensor &other,
|
||||
at::Scalar alpha);
|
||||
|
||||
static MLIRTensor add_(MLIRTensor &input, const MLIRTensor &other,
|
||||
at::Scalar alpha);
|
||||
|
||||
static MLIRTensor addmm(const MLIRTensor &input, const MLIRTensor &mat1,
|
||||
const MLIRTensor &mat2, at::Scalar beta,
|
||||
at::Scalar alpha);
|
||||
|
||||
static MLIRTensor as_strided(const MLIRTensor &self, at::IntArrayRef size,
|
||||
at::IntArrayRef stride,
|
||||
c10::optional<int64_t> storage_offset);
|
||||
|
||||
static MLIRTensor clone(const MLIRTensor &self);
|
||||
|
||||
static MLIRTensor convolution(const MLIRTensor &input,
|
||||
const MLIRTensor &weight,
|
||||
const MLIRTensor &bias, at::IntArrayRef stride,
|
||||
at::IntArrayRef padding,
|
||||
at::IntArrayRef dilation, bool transposed,
|
||||
at::IntArrayRef output_padding, int64_t groups);
|
||||
|
||||
static std::tuple<MLIRTensor, MLIRTensor, MLIRTensor>
|
||||
convolution_backward(const MLIRTensor &grad_output, const MLIRTensor &input,
|
||||
const MLIRTensor &weight, at::IntArrayRef stride,
|
||||
at::IntArrayRef padding, at::IntArrayRef dilation,
|
||||
bool transposed, at::IntArrayRef output_padding,
|
||||
int64_t groups, std::array<bool, 3> output_mask);
|
||||
|
||||
static void copy_(MLIRTensor &input, MLIRTensor &src);
|
||||
|
||||
static MLIRTensor div(const MLIRTensor &self, at::Scalar other);
|
||||
|
||||
static MLIRTensor div(const MLIRTensor &self, const MLIRTensor &other);
|
||||
|
||||
static MLIRTensor div_(MLIRTensor &self, const MLIRTensor &other);
|
||||
|
||||
static MLIRTensor expand(const MLIRTensor &self, at::IntArrayRef size,
|
||||
bool implicit);
|
||||
|
||||
static MLIRTensor gather(const MLIRTensor &self, int64_t dim,
|
||||
const MLIRTensor &index, bool sparse_grad);
|
||||
|
||||
static MLIRTensor hardtanh(const MLIRTensor &self, at::Scalar min_val,
|
||||
at::Scalar max_val);
|
||||
|
||||
static MLIRTensor hardtanh_(MLIRTensor &self, at::Scalar min_val,
|
||||
at::Scalar max_val);
|
||||
|
||||
static MLIRTensor hardtanh_backward(const MLIRTensor &grad_output,
|
||||
const MLIRTensor &self,
|
||||
at::Scalar min_val, at::Scalar max_val);
|
||||
|
||||
static MLIRTensor _log_softmax(const MLIRTensor &input, int64_t dim,
|
||||
bool half_to_float);
|
||||
|
||||
static MLIRTensor _log_softmax_backward_data(const MLIRTensor &grad_output,
|
||||
const MLIRTensor &output,
|
||||
int64_t dim,
|
||||
const MLIRTensor &self);
|
||||
|
||||
static std::tuple<MLIRTensor, MLIRTensor>
|
||||
max_pool2d_with_indices(const MLIRTensor &input, at::IntArrayRef kernel_size,
|
||||
at::IntArrayRef stride, at::IntArrayRef padding,
|
||||
at::IntArrayRef dilation, bool ceil_mode);
|
||||
|
||||
static MLIRTensor max_pool2d_with_indices_backward(
|
||||
const MLIRTensor &grad_output, const MLIRTensor &self,
|
||||
at::IntArrayRef kernel_size, at::IntArrayRef stride,
|
||||
at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode,
|
||||
const MLIRTensor &indices);
|
||||
|
||||
static MLIRTensor mean(const MLIRTensor &input,
|
||||
c10::optional<at::ScalarType> dtype);
|
||||
|
||||
static MLIRTensor mean(const MLIRTensor &input, at::IntArrayRef dim,
|
||||
bool keepdim, c10::optional<at::ScalarType> dtype);
|
||||
|
||||
static MLIRTensor mm(const MLIRTensor &input, const MLIRTensor &mat1);
|
||||
|
||||
static MLIRTensor mul(const MLIRTensor &self, const MLIRTensor &other);
|
||||
|
||||
static MLIRTensor mul_(MLIRTensor &self, const MLIRTensor &other);
|
||||
|
||||
static std::tuple<MLIRTensor, MLIRTensor, MLIRTensor>
|
||||
native_batch_norm(const MLIRTensor &input, const MLIRTensor &weight,
|
||||
const MLIRTensor &bias, const MLIRTensor &running_mean,
|
||||
const MLIRTensor &running_var, bool training,
|
||||
double momentum, double eps);
|
||||
|
||||
static std::tuple<MLIRTensor, MLIRTensor, MLIRTensor>
|
||||
native_batch_norm_backward(const MLIRTensor &grad_out,
|
||||
const MLIRTensor &input, const MLIRTensor &weight,
|
||||
const MLIRTensor &running_mean,
|
||||
const MLIRTensor &running_var,
|
||||
const MLIRTensor &save_mean,
|
||||
const MLIRTensor &save_invstd, bool train,
|
||||
double eps, std::array<bool, 3> output_mask);
|
||||
|
||||
static MLIRTensor neg(const MLIRTensor &input);
|
||||
|
||||
static std::tuple<MLIRTensor, MLIRTensor>
|
||||
nll_loss2d_forward(const MLIRTensor &self, const MLIRTensor &target,
|
||||
const MLIRTensor &weight, int64_t reduction,
|
||||
int64_t ignore_index);
|
||||
|
||||
static MLIRTensor nll_loss2d_backward(const MLIRTensor &grad_output,
|
||||
const MLIRTensor &self,
|
||||
const MLIRTensor &target,
|
||||
const MLIRTensor &weight,
|
||||
int64_t reduction, int64_t ignore_index,
|
||||
const MLIRTensor &total_weight);
|
||||
|
||||
static std::tuple<MLIRTensor, MLIRTensor>
|
||||
nll_loss_forward(const MLIRTensor &self, const MLIRTensor &target,
|
||||
const MLIRTensor &weight, int64_t reduction,
|
||||
int64_t ignore_index);
|
||||
|
||||
static MLIRTensor nll_loss_backward(const MLIRTensor &grad_output,
|
||||
const MLIRTensor &self,
|
||||
const MLIRTensor &target,
|
||||
const MLIRTensor &weight,
|
||||
int64_t reduction, int64_t ignore_index,
|
||||
const MLIRTensor &total_weight);
|
||||
|
||||
static MLIRTensor size(const MLIRTensor &self, int64_t dim);
|
||||
|
||||
static MLIRTensor squeeze(const MLIRTensor &self, int64_t dim);
|
||||
|
||||
static MLIRTensor sub(const MLIRTensor &input, const MLIRTensor &other,
|
||||
at::Scalar alpha);
|
||||
|
||||
static MLIRTensor sub_(MLIRTensor &input, const MLIRTensor &other,
|
||||
at::Scalar alpha);
|
||||
|
||||
static MLIRTensor sum(const MLIRTensor &self, at::IntArrayRef dim,
|
||||
bool keepdim, c10::optional<at::ScalarType> dtype);
|
||||
|
||||
static MLIRTensor relu(const MLIRTensor &input);
|
||||
|
||||
static MLIRTensor relu_(MLIRTensor &input);
|
||||
|
||||
static MLIRTensor t(const MLIRTensor &input);
|
||||
|
||||
static MLIRTensor threshold_backward(const MLIRTensor &grad_output,
|
||||
const MLIRTensor &self,
|
||||
at::Scalar threshold);
|
||||
|
||||
static MLIRTensor to(MLIRTensor &input, c10::optional<Device> device,
|
||||
c10::optional<at::ScalarType> scalar_type);
|
||||
|
||||
static MLIRTensor unsqueeze(const MLIRTensor &self, int64_t dim);
|
||||
|
||||
static MLIRTensor view(const MLIRTensor &input, at::IntArrayRef size);
|
||||
};
|
||||
} // namespace torch_mlir
|
|
@ -1,156 +0,0 @@
|
|||
//===- tensor_impl.cpp ------------------------------------------*- C++ -*-===//
|
||||
//
|
||||
// This file is licensed under a pytorch-style license
|
||||
// See frontends/pytorch/LICENSE for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "tensor_impl.h"
|
||||
#include "aten_mlir_bridge.h"
|
||||
|
||||
#include <c10/core/impl/DeviceGuardImplInterface.h>
|
||||
#include <c10/macros/Macros.h>
|
||||
|
||||
namespace torch_mlir {
|
||||
namespace {
|
||||
|
||||
thread_local c10::Device g_current_device(at::DeviceType::XLA, 0);
|
||||
|
||||
struct MLIRGuardImpl : public c10::impl::DeviceGuardImplInterface {
|
||||
at::DeviceType type() const override { return at::DeviceType::XLA; }
|
||||
|
||||
c10::Device exchangeDevice(c10::Device device) const override {
|
||||
std::swap(g_current_device, device);
|
||||
return device;
|
||||
}
|
||||
|
||||
c10::Device getDevice() const override { return g_current_device; }
|
||||
|
||||
void setDevice(c10::Device device) const override {
|
||||
g_current_device = device;
|
||||
}
|
||||
|
||||
void uncheckedSetDevice(c10::Device device) const noexcept override {
|
||||
g_current_device = device;
|
||||
}
|
||||
|
||||
c10::Stream getStream(c10::Device device) const noexcept override {
|
||||
return c10::Stream(c10::Stream::DEFAULT, device);
|
||||
}
|
||||
|
||||
c10::Stream exchangeStream(c10::Stream s) const noexcept override {
|
||||
return c10::Stream(c10::Stream::DEFAULT, g_current_device);
|
||||
}
|
||||
|
||||
c10::DeviceIndex deviceCount() const noexcept override { return 0; }
|
||||
};
|
||||
|
||||
C10_REGISTER_GUARD_IMPL(XLA, MLIRGuardImpl);
|
||||
|
||||
} // namespace
|
||||
|
||||
MLIRTensorImpl::MLIRTensorImpl(MLIRTensor tensor)
|
||||
: c10::TensorImpl(c10::XLATensorId(), GetTypeMeta(tensor),
|
||||
bridge::MLIRDeviceToAtenDevice(tensor.GetDevice())),
|
||||
tensor_(std::move(tensor)) {}
|
||||
|
||||
c10::intrusive_ptr<c10::TensorImpl> MLIRTensorImpl::shallow_copy_and_detach(
|
||||
const c10::VariableVersion &version_counter,
|
||||
bool allow_tensor_metadata_change) const {
|
||||
// std::cout << "MLIRTensorImpl::" << __func__ << std::endl;
|
||||
auto impl = c10::make_intrusive<MLIRTensorImpl>(tensor_);
|
||||
copy_tensor_metadata(
|
||||
/*src_impl=*/this,
|
||||
/*dest_impl=*/impl.get(),
|
||||
/*version_counter=*/version_counter,
|
||||
/*allow_tensor_metadata_change=*/allow_tensor_metadata_change);
|
||||
return impl;
|
||||
}
|
||||
|
||||
void MLIRTensorImpl::shallow_copy_from(
|
||||
const c10::intrusive_ptr<TensorImpl> &impl) {
|
||||
// std::cout << "MLIRTensorImpl::" << __func__ << std::endl;
|
||||
MLIRTensorImpl *tensor_impl = dynamic_cast<MLIRTensorImpl *>(impl.get());
|
||||
copy_tensor_metadata(
|
||||
/*src_impl=*/tensor_impl,
|
||||
/*dest_impl=*/this,
|
||||
/*version_counter=*/version_counter(),
|
||||
/*allow_tensor_metadata_change=*/allow_tensor_metadata_change());
|
||||
tensor_impl->tensor_.ShallowCopyTo(&tensor_);
|
||||
generation_ = 0;
|
||||
}
|
||||
|
||||
at::IntArrayRef MLIRTensorImpl::sizes() const {
|
||||
const_cast<MLIRTensorImpl *>(this)->SetupSizeProperties();
|
||||
return c10::TensorImpl::sizes();
|
||||
}
|
||||
|
||||
at::IntArrayRef MLIRTensorImpl::strides() const {
|
||||
const_cast<MLIRTensorImpl *>(this)->SetupSizeProperties();
|
||||
return c10::TensorImpl::strides();
|
||||
}
|
||||
|
||||
int64_t MLIRTensorImpl::dim() const {
|
||||
const_cast<MLIRTensorImpl *>(this)->SetupSizeProperties();
|
||||
return c10::TensorImpl::dim();
|
||||
}
|
||||
|
||||
int64_t MLIRTensorImpl::numel() const {
|
||||
const_cast<MLIRTensorImpl *>(this)->SetupSizeProperties();
|
||||
return c10::TensorImpl::numel();
|
||||
}
|
||||
|
||||
bool MLIRTensorImpl::is_contiguous(at::MemoryFormat memory_format) const {
|
||||
// Only check that the storage is already contiguous.
|
||||
assert(is_contiguous_ && "Non-contiguous storage for MLIR tensor");
|
||||
return true;
|
||||
}
|
||||
|
||||
int64_t MLIRTensorImpl::size(int64_t d) const {
|
||||
const_cast<MLIRTensorImpl *>(this)->SetupSizeProperties();
|
||||
return c10::TensorImpl::size(d);
|
||||
}
|
||||
|
||||
void MLIRTensorImpl::SetupSizeProperties() {
|
||||
size_t generation = tensor_.generation();
|
||||
if (generation != generation_) {
|
||||
// Fill up the basic dimension data members which the base class
|
||||
// implementation uses in its APIs.
|
||||
auto sizes = tensor_.sizes();
|
||||
auto strides = tensor_.strides();
|
||||
|
||||
strides_.clear();
|
||||
sizes_.clear();
|
||||
numel_ = 1;
|
||||
|
||||
for (auto t : llvm::zip(sizes, strides)) {
|
||||
auto size = std::get<0>(t);
|
||||
sizes_.push_back(size);
|
||||
strides_.push_back(std::get<1>(t));
|
||||
numel_ *= size;
|
||||
}
|
||||
|
||||
generation_ = generation;
|
||||
}
|
||||
}
|
||||
|
||||
caffe2::TypeMeta MLIRTensorImpl::GetTypeMeta(const MLIRTensor &tensor) {
|
||||
return c10::scalarTypeToTypeMeta(tensor.dtype());
|
||||
}
|
||||
|
||||
c10::Device MLIRTensorImpl::GetCurrentAtenDevice() { return g_current_device; }
|
||||
|
||||
c10::Device MLIRTensorImpl::SetCurrentAtenDevice(c10::Device device) {
|
||||
std::swap(g_current_device, device);
|
||||
return device;
|
||||
}
|
||||
|
||||
void MLIRTensorImpl::AtenInitialize() {}
|
||||
|
||||
const at::Storage &MLIRTensorImpl::storage() const {
|
||||
assert(0 && "MLIR tensors do not have storage");
|
||||
}
|
||||
|
||||
bool MLIRTensorImpl::has_storage() const { return false; }
|
||||
|
||||
} // namespace torch_mlir
|
|
@ -1,60 +0,0 @@
//===- tensor_impl.h --------------------------------------------*- C++ -*-===//
//
// This file is licensed under a pytorch-style license
// See frontends/pytorch/LICENSE for license information.
//
//===----------------------------------------------------------------------===//

#pragma once

#include "tensor.h"

#include <ATen/Tensor.h>
#include <c10/core/Storage.h>
#include <c10/core/TensorImpl.h>

namespace torch_mlir {

class MLIRTensorImpl : public c10::TensorImpl {
public:
  explicit MLIRTensorImpl(MLIRTensor tensor);

  MLIRTensor &tensor() { return tensor_; }

  c10::intrusive_ptr<TensorImpl>
  shallow_copy_and_detach(const c10::VariableVersion &version_counter,
                          bool allow_tensor_metadata_change) const override;

  void shallow_copy_from(const c10::intrusive_ptr<TensorImpl> &impl) override;

  at::IntArrayRef sizes() const override;

  at::IntArrayRef strides() const override;

  int64_t dim() const override;

  int64_t numel() const override;

  bool is_contiguous(at::MemoryFormat memory_format) const override;

  int64_t size(int64_t d) const override;

  static c10::Device GetCurrentAtenDevice();

  static c10::Device SetCurrentAtenDevice(c10::Device device);

  static void AtenInitialize();

  const at::Storage &storage() const override;

  bool has_storage() const override;

private:
  static caffe2::TypeMeta GetTypeMeta(const MLIRTensor &tensor);

  void SetupSizeProperties();

  MLIRTensor tensor_;
  size_t generation_ = 0;
};
} // namespace torch_mlir
@ -1,44 +0,0 @@
//===- torch_util.cpp -------------------------------------------*- C++ -*-===//
//
// This file is licensed under a pytorch-style license
// See frontends/pytorch/LICENSE for license information.
//
//===----------------------------------------------------------------------===//

#include "torch_util.h"

#include <ATen/Functions.h>
#include <ATen/Tensor.h>

namespace torch_mlir {
namespace util {

at::Tensor Zeros(at::IntArrayRef sizes, at::ScalarType type) {
  return at::zeros(sizes, type);
}

at::Tensor CopyTensor(const at::Tensor &ref) {
  return ref.to(ref.options(), /*non_blocking=*/false, /*copy=*/true);
}

// Same as above, with an additional cast.
at::Tensor CopyTensor(const at::Tensor &ref, at::ScalarType dest_type) {
  return ref.to(ref.options().dtype(dest_type), /*non_blocking=*/false,
                /*copy=*/true);
}

at::ScalarType GetScalarType(at::Scalar scalar) {
  if (scalar.isFloatingPoint()) {
    return at::kDouble;
  } else if (scalar.isIntegral(/*includeBool=*/false)) {
    return at::kLong;
  } else if (scalar.isBoolean()) {
    return at::kBool;
  } else if (scalar.isComplex()) {
    return at::kComplexDouble;
  }
  assert(0 && "Unknown type for scalar");
}

} // namespace util
} // namespace torch_mlir
@ -1,34 +0,0 @@
//===- torch_util.h ---------------------------------------------*- C++ -*-===//
//
// This file is licensed under a pytorch-style license
// See frontends/pytorch/LICENSE for license information.
//
//===----------------------------------------------------------------------===//

#pragma once

#include <ATen/Tensor.h>
#include <c10/core/ScalarType.h>
#include <c10/util/Optional.h>

namespace torch_mlir {
namespace util {

at::Tensor Zeros(at::IntArrayRef sizes, at::ScalarType type);

// Makes a deep copy of an ATen tensor.
at::Tensor CopyTensor(const at::Tensor &ref);

// Same as above, with an additional cast.
at::Tensor CopyTensor(const at::Tensor &ref, at::ScalarType dest_type);

// Returns the at::ScalarType corresponding to an at::Scalar.
at::ScalarType GetScalarType(at::Scalar scalar);

template <typename T, typename S>
T OptionalOr(const c10::optional<S> &value, T defval) {
  return value ? static_cast<T>(*value) : defval;
}

} // namespace util
} // namespace torch_mlir
@ -1,10 +0,0 @@
include_directories(
  ${TORCH_INCLUDE_DIRS}
)
add_library(aten_ops SHARED
  aten_ops.cpp
)

target_link_libraries(aten_ops
  ${TORCH_LIBRARIES}
)
@ -1,772 +0,0 @@
|
|||
//===- aten_ops.cpp ---------------------------------------------*- C++ -*-===//
|
||||
//
|
||||
// This file is licensed under a pytorch-style license
|
||||
// See frontends/pytorch/LICENSE for license information.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// This file implements a C library that is targeted by MLIR code generation
|
||||
// from the ATen dialect. This library is intended to support a functional
|
||||
// proof of concept rather than being optimized for high performance. Most of the
|
||||
// functions are implemented by calling back into the torch libraries.
|
||||
|
||||
#include <assert.h>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
|
||||
#include <ATen/ATen.h>
|
||||
#include <torch/torch.h>
|
||||
|
||||
#include <nnpack.h>
|
||||
#include <ATen/CPUType.h>
|
||||
|
||||
namespace {
|
||||
|
||||
template <typename T, int N> struct tensor_t {
|
||||
T *d;
|
||||
T *aligned;
|
||||
size_t offset;
|
||||
size_t shape[N];
|
||||
size_t stride[N];
|
||||
|
||||
size_t index(size_t n, size_t channel, size_t row, size_t col) const {
|
||||
size_t channels = shape[1];
|
||||
size_t height = shape[2];
|
||||
size_t width = shape[3];
|
||||
return n * height * width * channels + channel * height * width +
|
||||
row * width + col;
|
||||
}
|
||||
|
||||
tensor_t() {
|
||||
d = aligned = nullptr;
|
||||
offset = 0;
|
||||
for (int i = 0; i < N; i++)
|
||||
shape[i] = stride[i] = 0;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, int N>
|
||||
std::vector<int64_t> translate_shape(tensor_t<T, N> *t) {
|
||||
std::vector<int64_t> shape;
|
||||
for (int i = 0; i < N; i++) {
|
||||
shape.push_back(t->shape[i]);
|
||||
// std::cout << i << " shape " << t->shape[i] << std::endl;
|
||||
}
|
||||
return shape;
|
||||
}
|
||||
|
||||
template <typename T, int N>
|
||||
std::vector<int64_t> translate_stride(tensor_t<T, N> *t) {
|
||||
std::vector<int64_t> stride;
|
||||
for (int i = 0; i < N; i++) {
|
||||
stride.push_back(t->stride[i]);
|
||||
// std::cout << i << " stride " << t->stride[i] << std::endl;
|
||||
}
|
||||
return stride;
|
||||
}
|
||||
|
||||
template <int N> void dumpTensor(std::ostream &o, tensor_t<float, N> *t) {
|
||||
o << "Shape:";
|
||||
for (int i = 0; i < N; i++)
|
||||
o << t->shape[i] << " ";
|
||||
o << "Stride:";
|
||||
for (int i = 0; i < N; i++)
|
||||
o << t->stride[i] << " ";
|
||||
o << "\n";
|
||||
}
|
||||
|
||||
template <typename T, int N>
|
||||
at::Tensor to_torch(tensor_t<T, N> *t,
|
||||
const at::TensorOptions &options = at::TensorOptions()) {
|
||||
// std::cout << "to_torch\n";
|
||||
return torch::from_blob((void *)t->d, translate_shape(t), translate_stride(t),
|
||||
options);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void mm_out(tensor_t<T, 2> *a, tensor_t<T, 2> *b, tensor_t<T, 2> *r);
|
||||
|
||||
template <typename T, int N>
|
||||
void add_out(tensor_t<T, N> *a, tensor_t<T, N> *b, T alpha, tensor_t<T, N> *r) {
|
||||
at::Tensor torch_a = to_torch(a);
|
||||
at::Tensor torch_b = to_torch(b);
|
||||
at::Tensor result = at::native::add(torch_a, torch_b, alpha).clone();
|
||||
|
||||
memcpy(r->d, result.data_ptr(), result.numel() * sizeof(T));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void addmm_out(tensor_t<T, 1> *a, tensor_t<T, 2> *b, tensor_t<T, 2> *c,
|
||||
int32_t alpha, int32_t beta, tensor_t<T, 2> *r) {
|
||||
at::Tensor torch_a = to_torch(a);
|
||||
at::Tensor torch_b = to_torch(b);
|
||||
at::Tensor torch_c = to_torch(c);
|
||||
at::Tensor result =
|
||||
at::native::addmm(torch_a, torch_b, torch_c, alpha, beta).clone();
|
||||
|
||||
memcpy(r->d, result.data_ptr(), result.numel() * sizeof(T));
|
||||
}
|
||||
|
||||
template <typename T, int N, int M>
|
||||
void as_strided_out(tensor_t<float, M> *a,
|
||||
/*size*/ int32_t sz0, int32_t sz1, int32_t sz2, int32_t sz3,
|
||||
/*stride*/ int32_t sd0, int32_t sd1, int32_t sd2,
|
||||
int32_t sd3, int32_t offset, tensor_t<T, N> *r) {
|
||||
at::Tensor input = to_torch(a);
|
||||
|
||||
std::vector<int64_t> size;
|
||||
std::vector<int64_t> stride;
|
||||
c10::optional<int64_t> storage_offset;
|
||||
|
||||
if (offset != 0)
|
||||
storage_offset = offset;
|
||||
if (N > 0) {
|
||||
size.push_back(sz0);
|
||||
stride.push_back(sd0);
|
||||
}
|
||||
if (N > 1) {
|
||||
size.push_back(sz1);
|
||||
stride.push_back(sd1);
|
||||
}
|
||||
if (N > 2) {
|
||||
size.push_back(sz2);
|
||||
stride.push_back(sd2);
|
||||
}
|
||||
if (N > 3) {
|
||||
size.push_back(sz3);
|
||||
stride.push_back(sd3);
|
||||
}
|
||||
|
||||
std::vector<int64_t> sizeRef{size};
|
||||
std::vector<int64_t> strideRef{stride};
|
||||
|
||||
// for (int i = 0; i<N; i++)
|
||||
// std::cout << "STRIDE " << i << " " << stride[i] << std::endl;
|
||||
at::Tensor result =
|
||||
at::native::as_strided_tensorimpl(input, size, stride, storage_offset)
|
||||
.clone();
|
||||
|
||||
memcpy(r->d, result.data_ptr(), result.numel() * sizeof(T));
|
||||
}
|
||||
|
||||
// FIXME: stride, padding, dilation, output_padding should be IntArrayRef
|
||||
template <typename T>
|
||||
void conv2d_out(tensor_t<T, 4> *t, tensor_t<T, 4> *weight, tensor_t<T, 1> *bias,
|
||||
int32_t stride, int32_t pad, int32_t dilation,
|
||||
tensor_t<T, 4> *r) {
|
||||
at::Tensor torch_t = to_torch(t);
|
||||
at::Tensor torch_w = to_torch(weight);
|
||||
at::Tensor torch_b = to_torch(bias);
|
||||
int64_t groups = 1;
|
||||
|
||||
at::Tensor result = at::native::conv2d(torch_t, torch_w, torch_b, stride, pad,
|
||||
dilation, groups)
|
||||
.clone();
|
||||
|
||||
memcpy(r->d, result.data_ptr(), result.numel() * sizeof(T));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void conv2d_backward_out(tensor_t<T, 4> *grad_output, tensor_t<T, 4> *input,
|
||||
tensor_t<T, 4> *weight, int32_t stride, int32_t pad,
|
||||
int32_t dilation, tensor_t<T, 4> *r0,
|
||||
tensor_t<T, 4> *r1, tensor_t<T, 1> *r2) {
|
||||
const at::Tensor &arg_grad = to_torch(grad_output);
|
||||
const at::Tensor &arg_input = to_torch(input);
|
||||
const at::Tensor &arg_weight = to_torch(weight);
|
||||
|
||||
std::vector<int64_t> p{pad, pad};
|
||||
std::vector<int64_t> s{stride, stride};
|
||||
std::vector<int64_t> d{dilation, dilation};
|
||||
|
||||
std::array<bool, 3> output_mask{true, true, true};
|
||||
|
||||
std::tuple<at::Tensor, at::Tensor, at::Tensor> grads =
|
||||
at::native::mkldnn_convolution_backward(arg_input, arg_grad, arg_weight,
|
||||
p, s, d, 1, output_mask);
|
||||
|
||||
auto result0 = std::get<0>(grads);
|
||||
auto result1 = std::get<1>(grads);
|
||||
auto result2 = std::get<2>(grads);
|
||||
|
||||
memcpy(r0->d, result0.data_ptr(), result0.numel() * sizeof(T));
|
||||
memcpy(r1->d, result1.data_ptr(), result1.numel() * sizeof(T));
|
||||
memcpy(r2->d, result2.data_ptr(), result2.numel() * sizeof(T));
|
||||
}
|
||||
|
||||
template <typename T, int N>
|
||||
void log_softmax_out(tensor_t<T, N> *t, int32_t dim, bool half_to_float,
|
||||
tensor_t<T, N> *r) {
|
||||
at::Tensor input = to_torch(t);
|
||||
at::Tensor result = at::native::log_softmax_cpu(input, dim, half_to_float);
|
||||
memcpy(r->d, result.data_ptr(), result.numel() * sizeof(T));
|
||||
}
|
||||
|
||||
template <typename T, int N>
|
||||
void log_softmax_backward_data_out(tensor_t<T, N> *a, tensor_t<T, N> *b,
|
||||
int32_t c, tensor_t<T, N> *d,
|
||||
tensor_t<T, N> *r) {
|
||||
at::Tensor inputA = to_torch(a);
|
||||
at::Tensor inputB = to_torch(b);
|
||||
at::Tensor inputD = to_torch(d);
|
||||
|
||||
at::Tensor result =
|
||||
at::native::log_softmax_backward_cpu(inputA, inputB, c, inputD);
|
||||
memcpy(r->d, result.data_ptr(), result.numel() * sizeof(T));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void max_pool2d_with_indices_out(tensor_t<T, 4> *t, int32_t c, int32_t d,
|
||||
int32_t e, int32_t f, bool ceil_mode,
|
||||
tensor_t<T, 4> *r0, tensor_t<int64_t, 4> *r1) {
|
||||
at::Tensor input = to_torch(t);
|
||||
|
||||
std::vector<int64_t> kernel{c, c};
|
||||
std::vector<int64_t> stride{d, d};
|
||||
std::vector<int64_t> padding{e, e};
|
||||
std::vector<int64_t> dilation{f, f};
|
||||
|
||||
auto result = at::native::max_pool2d_with_indices_cpu(
|
||||
input, kernel, stride, padding, dilation, ceil_mode);
|
||||
at::Tensor outTensor = std::get<0>(result);
|
||||
at::Tensor idxTensor = std::get<1>(result);
|
||||
memcpy(r0->d, outTensor.data_ptr(), outTensor.numel() * sizeof(T));
|
||||
memcpy(r1->d, idxTensor.data_ptr(), idxTensor.numel() * sizeof(int64_t));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void max_pool2d_with_indices_backward_out(tensor_t<T, 4> *a, tensor_t<T, 4> *b,
|
||||
int32_t c, int32_t d, int32_t e,
|
||||
int32_t f, bool g,
|
||||
tensor_t<int64_t, 4> *h,
|
||||
tensor_t<T, 4> *r) {
|
||||
const at::Tensor &inputA = to_torch(a);
|
||||
const at::Tensor &inputB = to_torch(b);
|
||||
at::TensorOptions options(at::ScalarType::Long);
|
||||
const at::Tensor &inputH = to_torch(h, options);
|
||||
|
||||
std::vector<int64_t> kernel{c, c};
|
||||
std::vector<int64_t> stride{d, d};
|
||||
std::vector<int64_t> padding{e, e};
|
||||
std::vector<int64_t> dilation{f, f};
|
||||
|
||||
at::Tensor result = at::native::max_pool2d_with_indices_backward_cpu(
|
||||
inputA, inputB, kernel, stride, padding, dilation, g, inputH);
|
||||
memcpy(r->d, result.data_ptr(), result.numel() * sizeof(T));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void mm_out(tensor_t<T, 2> *a, tensor_t<T, 2> *b, tensor_t<T, 2> *r) {
|
||||
at::Tensor inputA = to_torch(a);
|
||||
at::Tensor inputB = to_torch(b);
|
||||
|
||||
at::Tensor result = inputA.matmul(inputB);
|
||||
memcpy(r->d, result.data_ptr(), result.numel() * sizeof(T));
|
||||
}
|
||||
|
||||
template <typename T, int N>
|
||||
void mul_out(tensor_t<T, N> *a, tensor_t<T, N> *b, tensor_t<T, N> *r) {
|
||||
at::Tensor inputA = to_torch(a);
|
||||
at::Tensor inputB = to_torch(b);
|
||||
|
||||
at::Tensor result = at::native::mul(inputA, inputB);
|
||||
memcpy(r->d, result.data_ptr(), result.numel() * sizeof(T));
|
||||
}
|
||||
|
||||
template <typename T, int N>
|
||||
void relu_out(tensor_t<T, N> *a, tensor_t<T, N> *r) {
|
||||
at::Tensor inputA = to_torch(a);
|
||||
|
||||
at::Tensor result = at::native::relu(inputA);
|
||||
memcpy(r->d, result.data_ptr(), result.numel() * sizeof(T));
|
||||
}
|
||||
|
||||
template <typename T> void t_out(tensor_t<T, 2> *a, tensor_t<T, 2> *r) {
|
||||
size_t h = a->shape[0];
|
||||
size_t w = a->shape[1];
|
||||
|
||||
for (size_t i = 0; i < h; i++)
|
||||
for (size_t j = 0; j < w; j++)
|
||||
r->d[j * h + i] = a->d[i * w + j];
|
||||
}
|
||||
|
||||
template <typename T, int N>
|
||||
void threshold_backward_out(tensor_t<T, N> *a, tensor_t<T, N> *b, int32_t c,
|
||||
tensor_t<T, N> *r) {
|
||||
at::Tensor inputA = to_torch(a);
|
||||
at::Tensor inputB = to_torch(b);
|
||||
|
||||
at::Tensor result = at::native::threshold_backward(inputA, inputB, c);
|
||||
memcpy(r->d, result.data_ptr(), result.numel() * sizeof(T));
|
||||
}
|
||||
|
||||
template <typename T, int N, int M>
|
||||
void view_out(tensor_t<T, M> *a, int32_t b, int32_t c, int32_t d, int32_t e,
|
||||
tensor_t<T, N> *r) {
|
||||
tensor_t<T, N> result;
|
||||
size_t numel = 1;
|
||||
for (size_t d = 0; d < M; d++)
|
||||
numel *= a->shape[d];
|
||||
|
||||
if (N == 1)
|
||||
c = d = e = 1;
|
||||
if (N == 2)
|
||||
d = e = 1;
|
||||
if (N == 3)
|
||||
e = 1;
|
||||
|
||||
int inferred = 0;
|
||||
if (b == -1)
|
||||
inferred++;
|
||||
if (c == -1)
|
||||
inferred++;
|
||||
if (d == -1)
|
||||
inferred++;
|
||||
if (e == -1)
|
||||
inferred++;
|
||||
assert(inferred <= 1 &&
|
||||
"aten.view Error: only one dimension can be inferred");
|
||||
|
||||
if (b == -1)
|
||||
b = numel / (c * d * e);
|
||||
if (c == -1)
|
||||
c = numel / (b * d * e);
|
||||
if (d == -1)
|
||||
d = numel / (b * c * e);
|
||||
if (e == -1)
|
||||
e = numel / (b * c * d);
|
||||
|
||||
if (N > 0)
|
||||
r->shape[0] = b;
|
||||
if (N > 1)
|
||||
r->shape[1] = c;
|
||||
if (N > 2)
|
||||
r->shape[2] = d;
|
||||
if (N > 3)
|
||||
r->shape[3] = e;
|
||||
|
||||
memcpy(r->d, a->d, numel * sizeof(T));
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
extern "C" {
|
||||
|
||||
// add_out
|
||||
|
||||
void _mlir_ciface_add_1F32_1F32_1F32_out(tensor_t<float, 1> *a,
|
||||
tensor_t<float, 1> *b, int32_t i,
|
||||
tensor_t<float, 1> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
add_out<float, 1>(a, b, i, r);
|
||||
}
|
||||
|
||||
void _mlir_ciface_add_2F32_2F32_2F32_out(tensor_t<float, 2> *a,
|
||||
tensor_t<float, 2> *b, int32_t i,
|
||||
tensor_t<float, 2> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
add_out<float, 2>(a, b, i, r);
|
||||
}
|
||||
|
||||
void _mlir_ciface_add_3F32_3F32_3F32_out(tensor_t<float, 3> *a,
|
||||
tensor_t<float, 3> *b, int32_t i,
|
||||
tensor_t<float, 3> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
add_out<float, 3>(a, b, i, r);
|
||||
}
|
||||
|
||||
void _mlir_ciface_add_4F32_4F32_4F32_out(tensor_t<float, 4> *a,
|
||||
tensor_t<float, 4> *b, int32_t i,
|
||||
tensor_t<float, 4> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
add_out<float, 4>(a, b, i, r);
|
||||
}
|
||||
|
||||
// addmm_out
|
||||
|
||||
void _mlir_ciface_addmm_2F32_1F32_2F32_2F32_out(tensor_t<float, 1> *a,
|
||||
tensor_t<float, 2> *b,
|
||||
tensor_t<float, 2> *c,
|
||||
int32_t alpha, int32_t beta,
|
||||
tensor_t<float, 2> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
addmm_out<float>(a, b, c, alpha, beta, r);
|
||||
}
|
||||
|
||||
// as_strided_out
|
||||
|
||||
void _mlir_ciface_as_strided_1F32_1F32_out(tensor_t<float, 1> *a,
|
||||
/*size*/ int32_t sz0, int32_t sz1,
|
||||
int32_t sz2, int32_t sz3,
|
||||
/*stride*/ int32_t sd0, int32_t sd1,
|
||||
int32_t sd2, int32_t sd3,
|
||||
int32_t offset,
|
||||
tensor_t<float, 1> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
as_strided_out<float, 1, 1>(a, sz0, sz1, sz2, sz3, sd0, sd1, sd2, sd3, offset,
|
||||
r);
|
||||
}
|
||||
|
||||
void _mlir_ciface_as_strided_4F32_2F32_out(tensor_t<float, 2> *a,
|
||||
/*size*/ int32_t sz0, int32_t sz1,
|
||||
int32_t sz2, int32_t sz3,
|
||||
/*stride*/ int32_t sd0, int32_t sd1,
|
||||
int32_t sd2, int32_t sd3,
|
||||
int32_t offset,
|
||||
tensor_t<float, 4> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
// std::cout << sz0 << " "
|
||||
// << sz1 << " "
|
||||
// << sz2 << " "
|
||||
// << sz3 << "\n";
|
||||
// std::cout << sd0 << " "
|
||||
// << sd1 << " "
|
||||
// << sd2 << " "
|
||||
// << sd3 << "\n";
|
||||
as_strided_out<float, 4, 2>(a, sz0, sz1, sz2, sz3, sd0, sd1, sd2, sd3, offset,
|
||||
r);
|
||||
}
|
||||
|
||||
// conv2d_out
|
||||
|
||||
void _mlir_ciface_conv2d_4F32_4F32_4F32_1F32_out(
|
||||
tensor_t<float, 4> *t, tensor_t<float, 4> *weight, tensor_t<float, 1> *bias,
|
||||
int32_t stride, int32_t padding, int32_t dilation, tensor_t<float, 4> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
conv2d_out<float>(t, weight, bias, stride, padding, dilation, r);
|
||||
}
|
||||
|
||||
void _mlir_ciface_conv2d_relu_4F32_4F32_4F32_1F32_out(
|
||||
tensor_t<float, 4> *t, tensor_t<float, 4> *weight, tensor_t<float, 1> *bias,
|
||||
int32_t stride, int32_t padding, int32_t dilation, tensor_t<float, 4> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
conv2d_out<float>(t, weight, bias, stride, padding, dilation, r);
|
||||
relu_out<float, 4>(r, r);
|
||||
}
|
||||
|
||||
// conv2d_backward_out
|
||||
|
||||
void _mlir_ciface_conv2d_backward_4F32_4F32_1F32_4F32_4F32_4F32_out(
|
||||
tensor_t<float, 4> *grad_output, tensor_t<float, 4> *t,
|
||||
tensor_t<float, 4> *weight, int32_t stride, int32_t padding,
|
||||
int32_t dilation, tensor_t<float, 4> *r0, tensor_t<float, 4> *r1,
|
||||
tensor_t<float, 1> *r2) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
conv2d_backward_out<float>(grad_output, t, weight, stride, padding, dilation,
|
||||
r0, r1, r2);
|
||||
}
|
||||
|
||||
// div
|
||||
float *div_0F32_0F32_0F32(float *a, float *b) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
float *ret = (float *)malloc(sizeof(float));
|
||||
*ret = *a / *b;
|
||||
return ret;
|
||||
}
|
||||
|
||||
// log_softmax_out
|
||||
|
||||
void _mlir_ciface_log_softmax_1F32_1F32_out(tensor_t<float, 1> *t, int32_t dim,
|
||||
bool half_to_float,
|
||||
tensor_t<float, 1> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
log_softmax_out<float, 1>(t, dim, half_to_float, r);
|
||||
}
|
||||
void _mlir_ciface_log_softmax_2F32_2F32_out(tensor_t<float, 2> *t, int32_t dim,
|
||||
bool half_to_float,
|
||||
tensor_t<float, 2> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
log_softmax_out<float, 2>(t, dim, half_to_float, r);
|
||||
}
|
||||
void _mlir_ciface_log_softmax_3F32_3F32_out(tensor_t<float, 3> *t, int32_t dim,
|
||||
bool half_to_float,
|
||||
tensor_t<float, 3> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
log_softmax_out<float, 3>(t, dim, half_to_float, r);
|
||||
}
|
||||
void _mlir_ciface_log_softmax_4F32_4F32_out(tensor_t<float, 4> *t, int32_t dim,
|
||||
bool half_to_float,
|
||||
tensor_t<float, 4> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
log_softmax_out<float, 4>(t, dim, half_to_float, r);
|
||||
}
|
||||
|
||||
// log_softmax_backward_data_out
|
||||
|
||||
void _mlir_ciface_log_softmax_backward_data_2F32_2F32_2F32_2F32_out(
|
||||
tensor_t<float, 2> *a, tensor_t<float, 2> *b, int32_t c,
|
||||
tensor_t<float, 2> *d, tensor_t<float, 2> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
log_softmax_backward_data_out<float, 2>(a, b, c, d, r);
|
||||
}
|
||||
|
||||
void _mlir_ciface_log_softmax_backward_data_4F32_4F32_4F32_4F32_out(
|
||||
tensor_t<float, 4> *a, tensor_t<float, 4> *b, int32_t c,
|
||||
tensor_t<float, 4> *d, tensor_t<float, 4> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
log_softmax_backward_data_out<float, 4>(a, b, c, d, r);
|
||||
}
|
||||
|
||||
// max_pool2d_out
|
||||
|
||||
void _mlir_ciface_max_pool2d_with_indices_4F32_4I64_4F32_out(
|
||||
tensor_t<float, 4> *t, int32_t kernel, int32_t pad, int32_t stride,
|
||||
int32_t dilation, bool ceil_mode, tensor_t<float, 4> *r0,
|
||||
tensor_t<int64_t, 4> *r1) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
max_pool2d_with_indices_out<float>(t, kernel, pad, stride, dilation,
|
||||
ceil_mode, r0, r1);
|
||||
}
|
||||
|
||||
// max_pool2d backward_out
|
||||
|
||||
void _mlir_ciface_max_pool2d_with_indices_backward_4F32_4F32_4F32_4I64_out(
|
||||
tensor_t<float, 4> *a, tensor_t<float, 4> *b, int32_t c, int32_t d,
|
||||
int32_t e, int32_t f, bool g, tensor_t<int64_t, 4> *h,
|
||||
tensor_t<float, 4> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
max_pool2d_with_indices_backward_out<float>(a, b, c, d, e, f, g, h, r);
|
||||
}
|
||||
|
||||
// mm_out
|
||||
|
||||
void _mlir_ciface_mm_2F32_2F32_2F32_out(tensor_t<float, 2> *a,
|
||||
tensor_t<float, 2> *b,
|
||||
tensor_t<float, 2> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
mm_out<float>(a, b, r);
|
||||
}
|
||||
|
||||
// mul_out
|
||||
|
||||
void _mlir_ciface_mul_1F32_1F32_1F32_out(tensor_t<float, 1> *a,
|
||||
tensor_t<float, 1> *b,
|
||||
tensor_t<float, 1> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
mul_out<float, 1>(a, b, r);
|
||||
}
|
||||
|
||||
void _mlir_ciface_mul_2F32_2F32_2F32_out(tensor_t<float, 2> *a,
|
||||
tensor_t<float, 2> *b,
|
||||
tensor_t<float, 2> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
mul_out<float, 2>(a, b, r);
|
||||
}
|
||||
|
||||
void _mlir_ciface_mul_3F32_3F32_3F32_out(tensor_t<float, 3> *a,
|
||||
tensor_t<float, 3> *b,
|
||||
tensor_t<float, 3> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
mul_out<float, 3>(a, b, r);
|
||||
}
|
||||
|
||||
void _mlir_ciface_mul_4F32_4F32_4F32_out(tensor_t<float, 4> *a,
|
||||
tensor_t<float, 4> *b,
|
||||
tensor_t<float, 4> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
mul_out<float, 4>(a, b, r);
|
||||
}
|
||||
|
||||
// nll_loss2d_forward_out
|
||||
|
||||
void _mlir_ciface_nll_loss2d_forward_1F32_1F32_4F32_3I64_1F32_out(
|
||||
tensor_t<float, 4> *a, tensor_t<uint64_t, 3> *b, tensor_t<float, 1> *c,
|
||||
int64_t d, int64_t e, tensor_t<float, 1> *r0, tensor_t<float, 1> *r1) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
using T = float;
|
||||
at::Tensor inputA = to_torch(a);
|
||||
at::TensorOptions options(at::ScalarType::Long);
|
||||
at::Tensor inputB = to_torch(b, options);
|
||||
at::Tensor inputC = to_torch(c);
|
||||
|
||||
std::tuple<at::Tensor, at::Tensor> result =
|
||||
at::CPUType::nll_loss2d_forward(inputA, inputB, inputC, d, e);
|
||||
|
||||
at::Tensor result0 = std::get<0>(result);
|
||||
at::Tensor result1 = std::get<1>(result);
|
||||
memcpy(r0->d, result0.data_ptr(), result0.numel() * sizeof(T));
|
||||
memcpy(r1->d, result1.data_ptr(), result1.numel() * sizeof(T));
|
||||
}
|
||||
|
||||
// nll_loss2d_backward_out
|
||||
|
||||
void _mlir_ciface_nll_loss2d_backward_4F32_1F32_4F32_3I64_1F32_1F32_out(
|
||||
tensor_t<float, 1> *a, tensor_t<float, 4> *b, tensor_t<uint64_t, 3> *c,
|
||||
tensor_t<float, 1> *d, int32_t e, int32_t f, tensor_t<float, 1> *g,
|
||||
tensor_t<float, 4> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
using T = float;
|
||||
at::Tensor inputA = to_torch(a);
|
||||
at::Tensor inputB = to_torch(b);
|
||||
at::TensorOptions options(at::ScalarType::Long);
|
||||
at::Tensor inputC = to_torch(c, options);
|
||||
at::Tensor inputD = to_torch(d);
|
||||
at::Tensor inputG = to_torch(g);
|
||||
|
||||
at::Tensor result = at::CPUType::nll_loss2d_backward(inputA, inputB, inputC,
|
||||
inputD, e, f, inputG);
|
||||
memcpy(r->d, result.data_ptr(), result.numel() * sizeof(T));
|
||||
}
|
||||
|
||||
void _mlir_ciface_nll_loss_backward_2F32_1F32_2F32_1I64_1F32_1F32_out(
|
||||
tensor_t<float, 1> *a, tensor_t<float, 2> *b, tensor_t<uint64_t, 1> *c,
|
||||
tensor_t<float, 1> *d, int32_t e, int32_t f, tensor_t<float, 1> *g,
|
||||
tensor_t<float, 2> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
using T = float;
|
||||
at::Tensor inputA = to_torch(a);
|
||||
at::Tensor inputB = to_torch(b);
|
||||
at::TensorOptions options(at::ScalarType::Long);
|
||||
at::Tensor inputC = to_torch(c, options);
|
||||
at::Tensor inputD = to_torch(d);
|
||||
at::Tensor inputG = to_torch(g);
|
||||
|
||||
at::Tensor result = at::CPUType::nll_loss_backward(inputA, inputB, inputC,
|
||||
inputD, e, f, inputG);
|
||||
|
||||
memcpy(r->d, result.data_ptr(), result.numel() * sizeof(T));
|
||||
}
|
||||
|
||||
// nll_loss_forward_out
|
||||
|
||||
void _mlir_ciface_nll_loss_forward_1F32_1F32_2F32_1I64_1F32_out(
|
||||
tensor_t<float, 2> *a, tensor_t<uint64_t, 1> *b, tensor_t<float, 1> *c,
|
||||
int64_t d, int64_t e, tensor_t<float, 1> *r0, tensor_t<float, 1> *r1) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
using T = float;
|
||||
at::Tensor inputA = to_torch(a);
|
||||
at::TensorOptions options(at::ScalarType::Long);
|
||||
at::Tensor inputB = to_torch(b, options);
|
||||
at::Tensor inputC = to_torch(c);
|
||||
|
||||
std::tuple<at::Tensor, at::Tensor> result =
|
||||
at::CPUType::nll_loss_forward(inputA, inputB, inputC, d, e);
|
||||
|
||||
at::Tensor result0 = std::get<0>(result);
|
||||
at::Tensor result1 = std::get<1>(result);
|
||||
|
||||
memcpy(r0->d, result0.data_ptr(), result0.numel() * sizeof(T));
|
||||
memcpy(r1->d, result1.data_ptr(), result1.numel() * sizeof(T));
|
||||
}
|
||||
|
||||
// relu_out
|
||||
|
||||
void _mlir_ciface_relu_1F32_1F32_out(tensor_t<float, 1> *a,
|
||||
tensor_t<float, 1> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
relu_out<float, 1>(a, r);
|
||||
}
|
||||
|
||||
void _mlir_ciface_relu_2F32_2F32_out(tensor_t<float, 2> *a,
|
||||
tensor_t<float, 2> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
relu_out<float, 2>(a, r);
|
||||
}
|
||||
|
||||
void _mlir_ciface_relu_3F32_3F32_out(tensor_t<float, 3> *a,
|
||||
tensor_t<float, 3> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
relu_out<float, 3>(a, r);
|
||||
}
|
||||
|
||||
void _mlir_ciface_relu_4F32_4F32_out(tensor_t<float, 4> *a,
|
||||
tensor_t<float, 4> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
relu_out<float, 4>(a, r);
|
||||
}
|
||||
|
||||
// t_out
|
||||
|
||||
void _mlir_ciface_t_2F32_2F32_out(tensor_t<float, 2> *a,
|
||||
tensor_t<float, 2> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
t_out<float>(a, r);
|
||||
}
|
||||
|
||||
// threshold_backward_out
|
||||
|
||||
void _mlir_ciface_threshold_backward_1F32_1F32_1F32_out(tensor_t<float, 1> *a,
|
||||
tensor_t<float, 1> *b,
|
||||
int32_t c,
|
||||
tensor_t<float, 1> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
threshold_backward_out<float, 1>(a, b, c, r);
|
||||
}
|
||||
|
||||
void _mlir_ciface_threshold_backward_2F32_2F32_2F32_out(tensor_t<float, 2> *a,
|
||||
tensor_t<float, 2> *b,
|
||||
int32_t c,
|
||||
tensor_t<float, 2> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
threshold_backward_out<float, 2>(a, b, c, r);
|
||||
}
|
||||
|
||||
void _mlir_ciface_threshold_backward_3F32_3F32_3F32_out(tensor_t<float, 3> *a,
|
||||
tensor_t<float, 3> *b,
|
||||
int32_t c,
|
||||
tensor_t<float, 3> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
threshold_backward_out<float, 3>(a, b, c, r);
|
||||
}
|
||||
|
||||
void _mlir_ciface_threshold_backward_4F32_4F32_4F32_out(tensor_t<float, 4> *a,
|
||||
tensor_t<float, 4> *b,
|
||||
int32_t c,
|
||||
tensor_t<float, 4> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
threshold_backward_out<float, 4>(a, b, c, r);
|
||||
}
|
||||
|
||||
// view_out
|
||||
|
||||
void _mlir_ciface_view_1F32_4F32_out(tensor_t<float, 4> *a, int32_t b,
|
||||
int32_t c, int32_t d, int32_t e,
|
||||
tensor_t<float, 1> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
view_out<float, 1, 4>(a, b, c, d, e, r);
|
||||
}
|
||||
|
||||
void _mlir_ciface_view_1F32_3F32_out(tensor_t<float, 3> *a, int32_t b,
|
||||
int32_t c, int32_t d, int32_t e,
|
||||
tensor_t<float, 1> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
view_out<float, 1, 3>(a, b, c, d, e, r);
|
||||
}
|
||||
|
||||
void _mlir_ciface_view_1F32_2F32_out(tensor_t<float, 2> *a, int32_t b,
|
||||
int32_t c, int32_t d, int32_t e,
|
||||
tensor_t<float, 1> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
view_out<float, 1, 2>(a, b, c, d, e, r);
|
||||
}
|
||||
|
||||
void _mlir_ciface_view_2F32_4F32_out(tensor_t<float, 4> *a, int32_t b,
|
||||
int32_t c, int32_t d, int32_t e,
|
||||
tensor_t<float, 2> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
view_out<float, 2, 4>(a, b, c, d, e, r);
|
||||
}
|
||||
|
||||
void _mlir_ciface_view_4F32_1F32_out(tensor_t<float, 1> *a, int32_t b,
|
||||
int32_t c, int32_t d, int32_t e,
|
||||
tensor_t<float, 4> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
view_out<float, 4, 1>(a, b, c, d, e, r);
|
||||
}
|
||||
|
||||
void _mlir_ciface_view_4F32_2F32_out(tensor_t<float, 2> *a, int32_t b,
|
||||
int32_t c, int32_t d, int32_t e,
|
||||
tensor_t<float, 4> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
view_out<float, 4, 2>(a, b, c, d, e, r);
|
||||
}
|
||||
|
||||
void _mlir_ciface_view_4F32_3F32_out(tensor_t<float, 3> *a, int32_t b,
|
||||
int32_t c, int32_t d, int32_t e,
|
||||
tensor_t<float, 4> *r) {
|
||||
// std::cout << "aten_ops " << __func__ << "\n";
|
||||
view_out<float, 4, 3>(a, b, c, d, e, r);
|
||||
}
|
||||
}
|
|
@ -1,3 +0,0 @@
# TODO: Enable these tests for the new c10 dispatch code path with pt > 1.3
if config.enable_c10_dispatch:
  config.unsupported = True
@ -1,78 +0,0 @@
|
|||
# -*- Python -*-
|
||||
# This file is licensed under a pytorch-style license
|
||||
# See frontends/pytorch/LICENSE for license information.
|
||||
|
||||
import unittest
|
||||
from unittest import TestCase
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
import npcomp.frontends.pytorch as torch_mlir
|
||||
|
||||
import inspect
|
||||
|
||||
# RUN: %PYTHON %s | FileCheck %s
|
||||
|
||||
class ResA(nn.Module):
|
||||
def __init__(self, channels):
|
||||
C = int(channels)
|
||||
C2 = int(channels/2)
|
||||
super(ResA, self).__init__()
|
||||
self.model = nn.Sequential(# A1
|
||||
nn.BatchNorm2d(C),
|
||||
nn.ReLU(),
|
||||
nn.Conv2d(C,C2,1,stride=1,padding=0,dilation=1,groups=1,bias=True),
|
||||
# B1
|
||||
nn.BatchNorm2d(C2),
|
||||
nn.ReLU(),
|
||||
nn.Conv2d(C2,C2,3,stride=1,padding=1,dilation=1,groups=1,bias=True),
|
||||
# C1
|
||||
nn.BatchNorm2d(C2),
|
||||
nn.ReLU(),
|
||||
nn.Conv2d(C2,C,1,stride=1,padding=0,dilation=1,groups=1,bias=True))
|
||||
def forward(self, x):
|
||||
res = self.model.forward(x)
|
||||
return x + res
|
||||
|
||||
# Prints `str` prefixed by the current test function name so we can use it in
|
||||
# FileCheck label directives.
|
||||
# This is achieved by inspecting the stack and getting the parent name.
|
||||
def printWithCurrentFunctionName(s):
|
||||
# stack[1] is the caller, i.e. "_test_model"
|
||||
# stack[2] is the caller's caller, e.g. "test_conv_1"
|
||||
print(inspect.stack()[2][3], s)
|
||||
|
||||
class TestMLIRExport(unittest.TestCase):
|
||||
def setUp(self):
|
||||
pass
|
||||
|
||||
def _test_model(self, model, model_args):
|
||||
result = model(model_args)
|
||||
|
||||
mlir = torch_mlir.get_mlir(result)
|
||||
printWithCurrentFunctionName (mlir)
|
||||
return True
|
||||
|
||||
def test_ResA_16(self):
|
||||
dev = torch_mlir.mlir_device()
|
||||
model = ResA(16).to(dev)
|
||||
passed = self._test_model(model, torch.ones((1,16,128,128), device=dev))
|
||||
# CHECK-LABEL: test_ResA_16
|
||||
# CHECK: [[V0:%[a-zA-Z0-9]+]], %{{.*}}, %{{.*}} = "aten.native_batch_norm"({{.*}}) {layer_name = "L0-native_batch_norm-0"}
|
||||
# CHECK: [[V1:%[a-zA-Z0-9]+]] = "aten.relu"([[V0]]) {layer_name = "L1-relu-0"}
|
||||
# CHECK: [[V2:%[a-zA-Z0-9]+]] = "aten.convolution_overrideable"([[V1]], {{.*}}) {layer_name = "L2-convolution_overrideable-0"}
|
||||
# CHECK: [[V3:%[a-zA-Z0-9_]+]], %{{.*}}, %{{.*}} = "aten.native_batch_norm"([[V2]]{{.*}}) {layer_name = "L3-native_batch_norm-1"}
|
||||
# CHECK: [[V4:%[a-zA-Z0-9]+]] = "aten.relu"([[V3]]) {layer_name = "L4-relu-1"}
|
||||
# CHECK: [[V5:%[a-zA-Z0-9]+]] = "aten.convolution_overrideable"([[V4]],{{.*}}) {layer_name = "L5-convolution_overrideable-1"}
|
||||
# CHECK: [[V6:%[a-zA-Z0-9_]+]], %{{.*}}, %{{.*}} = "aten.native_batch_norm"([[V5]],{{.*}}) {layer_name = "L6-native_batch_norm-2"}
|
||||
# CHECK: [[V7:%[a-zA-Z0-9]+]] = "aten.relu"([[V6]]) {layer_name = "L7-relu-2"}
|
||||
# CHECK: [[V8:%[a-zA-Z0-9]+]] = "aten.convolution_overrideable"([[V7]],{{.*}}) {layer_name = "L8-convolution_overrideable-2"}
|
||||
# CHECK: {{.*}} = "aten.add"(%arg0, [[V8]], {{.*}}) {layer_name = "L9-add-0"}
|
||||
self.assertTrue(passed)
|
||||
|
||||
verbose = False
|
||||
if __name__ == '__main__':
|
||||
verbose = True
|
||||
unittest.main()
|
|
@ -1,26 +0,0 @@
|
|||
# -*- Python -*-
|
||||
# This file is licensed under a pytorch-style license
|
||||
# See frontends/pytorch/LICENSE for license information.
|
||||
|
||||
import torch
|
||||
import npcomp.frontends.pytorch as torch_mlir
|
||||
|
||||
# RUN: %PYTHON %s | FileCheck %s
|
||||
|
||||
dev = torch_mlir.mlir_device()
|
||||
t0 = torch.randn((1,2,3,4), device=dev)
|
||||
t1 = torch.randn((1,2,3,4), device=dev)
|
||||
t2 = torch.randn((1,2,3,4), device=dev)
|
||||
|
||||
t3 = t0 + t1 + t2
|
||||
|
||||
#
|
||||
# Generate and check the MLIR for the result tensor
|
||||
#
|
||||
t3_mlir = torch_mlir.get_mlir( t3 )
|
||||
|
||||
# CHECK-LABEL: test_export_add3
|
||||
# CHECK: %1 = "aten.add"(%arg0, %arg1, %0) {layer_name = "L0-add-0"} : (tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>, i32) -> tensor<1x2x3x4xf32>
|
||||
# CHECK: %2 = "aten.add"(%1, %arg2, %0) {layer_name = "L1-add-1"} : (tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>, i32) -> tensor<1x2x3x4xf32>
|
||||
print("test_export_add3")
|
||||
print(t3_mlir)
|
|
@ -1,19 +0,0 @@
|
|||
# -*- Python -*-
|
||||
# This file is licensed under a pytorch-style license
|
||||
# See frontends/pytorch/LICENSE for license information.
|
||||
|
||||
import torch
|
||||
import npcomp.frontends.pytorch as torch_mlir
|
||||
|
||||
# RUN: %PYTHON %s | FileCheck %s
|
||||
|
||||
dev = torch_mlir.mlir_device()
|
||||
|
||||
model = torch.nn.BatchNorm2d(123).to(dev)
|
||||
result = model(torch.ones(42,123,4,5).to(dev))
|
||||
|
||||
# CHECK-LABEL: test_export_batchnorm
|
||||
# CHECK: aten.native_batch_norm
|
||||
mlir = torch_mlir.get_mlir( result )
|
||||
print("test_export_batchnorm")
|
||||
print(mlir)
|
|
@ -1,49 +0,0 @@
|
|||
# -*- Python -*-
|
||||
# This file is licensed under a pytorch-style license
|
||||
# See frontends/pytorch/LICENSE for license information.
|
||||
|
||||
import torch
|
||||
import npcomp.frontends.pytorch as torch_mlir
|
||||
|
||||
# RUN: %PYTHON %s | FileCheck %s
|
||||
|
||||
dev = torch_mlir.mlir_device()
|
||||
|
||||
N = 3
|
||||
Cin = 16
|
||||
Cout = 4
|
||||
w = 10
|
||||
h = 10
|
||||
|
||||
model = torch.nn.Conv2d(Cin, Cout, (3,3))
|
||||
ref_model = torch.nn.Conv2d(Cin, Cout, (3,3))
|
||||
|
||||
ref_model.weight.data = model.weight.clone()
|
||||
ref_model.bias.data = model.bias.clone()
|
||||
|
||||
model = model.to(dev)
|
||||
|
||||
softmax = torch.nn.LogSoftmax(dim=1)
|
||||
loss = torch.nn.NLLLoss()
|
||||
|
||||
tensor = torch.randn(N, Cin, h, w, device=dev)
|
||||
result = model(tensor)
|
||||
|
||||
# CHECK-LABEL: test_export_conv2d
|
||||
# CHECK: aten.convolution_overrideable
|
||||
print("test_export_conv2d")
|
||||
print(torch_mlir.get_mlir( result ))
|
||||
|
||||
target = torch.empty(N, 8, 8, dtype=torch.long).random_(0, Cout)
|
||||
ref_target = target.clone()
|
||||
target = target.to(dev)
|
||||
|
||||
test_loss = loss( softmax(result), target )
|
||||
test_loss.backward()
|
||||
|
||||
# CHECK-LABEL: test_export_conv2d_back
|
||||
# CHECK: aten.convolution_overrideable
|
||||
# CHECK: aten._log_softmax
|
||||
# CHECK: aten.nll_loss2d_forward
|
||||
print("test_export_conv2d_back")
|
||||
print(torch_mlir.get_mlir( test_loss ))
|
|
@ -1,24 +0,0 @@
|
|||
# -*- Python -*-
|
||||
# This file is licensed under a pytorch-style license
|
||||
# See frontends/pytorch/LICENSE for license information.
|
||||
|
||||
import torch
|
||||
import npcomp.frontends.pytorch as torch_mlir
|
||||
|
||||
# RUN: %PYTHON %s | FileCheck %s
|
||||
|
||||
dev = torch_mlir.mlir_device()
|
||||
|
||||
t0 = torch.randn(4, device=dev)
|
||||
t1 = torch.randn(4, device=dev)
|
||||
t2 = torch.randn(4, device=dev)
|
||||
|
||||
t4 = t0 + t1 + t2
|
||||
t5 = t4 + t1
|
||||
t6 = t5 + t4
|
||||
|
||||
# CHECK-LABEL: test_multi_out
|
||||
# CHECK: return %2, %3, %4 : tensor<4xf32>, tensor<4xf32>, tensor<4xf32>
|
||||
mlir = torch_mlir.get_mlir([t4, t5, t6])
|
||||
print ("test_multi_out")
|
||||
print (mlir)
|
|
@ -1,25 +0,0 @@
|
|||
# -*- Python -*-
|
||||
# This file is licensed under a pytorch-style license
|
||||
# See frontends/pytorch/LICENSE for license information.
|
||||
|
||||
import torch
|
||||
import npcomp.frontends.pytorch as torch_mlir
|
||||
import torchvision.models as models
|
||||
|
||||
# RUN: %PYTHON %s | FileCheck %s
|
||||
|
||||
dev = torch_mlir.mlir_device()
|
||||
|
||||
model = models.resnet18().to(dev)
|
||||
model.training = False
|
||||
|
||||
tensor = torch.randn(32,3,32,32).to(dev)
|
||||
result = model(tensor)
|
||||
|
||||
mlir = torch_mlir.get_mlir( result )
|
||||
|
||||
# for now we just check the output shape
|
||||
# CHECK-LABEL: test_export_resnet18
|
||||
# CHECK: return %{{.*}} : tensor<32x1000xf32>
|
||||
print("test_export_resnet18")
|
||||
print(mlir)
|
|
@ -1,24 +0,0 @@
|
|||
# -*- Python -*-
|
||||
# This file is licensed under a pytorch-style license
|
||||
# See frontends/pytorch/LICENSE for license information.
|
||||
|
||||
import torch
|
||||
import npcomp.frontends.pytorch as torch_mlir
|
||||
import torchvision.models as models
|
||||
|
||||
# RUN: %PYTHON %s | FileCheck %s
|
||||
|
||||
dev = torch_mlir.mlir_device()
|
||||
|
||||
model = models.vgg11_bn().to(dev)
|
||||
model.training = False
|
||||
|
||||
result = model(torch.ones(32,3,32,32).to(dev))
|
||||
|
||||
mlir = torch_mlir.get_mlir( result )
|
||||
|
||||
# for now we just check the output shape
|
||||
# CHECK-LABEL: test_export_vgg11
|
||||
# CHECK: return %{{.*}} : tensor<32x1000xf32>
|
||||
print("test_export_vgg11")
|
||||
print(mlir)
|
|
@ -1,27 +0,0 @@
|
|||
# -*- Python -*-
|
||||
# This file is licensed under a pytorch-style license
|
||||
# See frontends/pytorch/LICENSE for license information.
|
||||
|
||||
import torch
|
||||
import npcomp.frontends.pytorch as torch_mlir
|
||||
import npcomp.frontends.pytorch.test as test
|
||||
|
||||
# RUN: %PYTHON %s | FileCheck %s
|
||||
|
||||
dev = torch_mlir.mlir_device()
|
||||
t0 = torch.randn((4,4), device=dev)
|
||||
t1 = torch.randn((4,4), device=dev)
|
||||
|
||||
t2 = t0 + t1
|
||||
|
||||
#
|
||||
# Check the result tensor against the CPU
|
||||
#
|
||||
t0_cpu = t0.to('cpu')
|
||||
t1_cpu = t1.to('cpu')
|
||||
t2_cpu = t2.to('cpu')
|
||||
|
||||
print (t0_cpu, " +\n", t1_cpu, " =\n", t2_cpu)
|
||||
|
||||
# CHECK: PASS! add2 check
|
||||
test.compare(t2, t0_cpu + t1_cpu, "add2")
|
|
@ -1,29 +0,0 @@
|
|||
# -*- Python -*-
|
||||
# This file is licensed under a pytorch-style license
|
||||
# See frontends/pytorch/LICENSE for license information.
|
||||
|
||||
import torch
|
||||
import npcomp.frontends.pytorch as torch_mlir
|
||||
import npcomp.frontends.pytorch.test as test
|
||||
|
||||
# RUN: %PYTHON %s | FileCheck %s
|
||||
|
||||
dev = torch_mlir.mlir_device()
|
||||
t0 = torch.randn((1,2,3,4), device=dev)
|
||||
t1 = torch.randn((1,2,3,4), device=dev)
|
||||
t2 = torch.randn((1,2,3,4), device=dev)
|
||||
|
||||
t3 = t0 + t1 + t2
|
||||
|
||||
#
|
||||
# Check the result tensor against the CPU
|
||||
#
|
||||
t0_cpu = t0.to('cpu')
|
||||
t1_cpu = t1.to('cpu')
|
||||
t2_cpu = t2.to('cpu')
|
||||
t3_cpu = t3.to('cpu')
|
||||
|
||||
print (t0_cpu, " +\n", t1_cpu, " +\n", t2_cpu, " =\n", t3_cpu)
|
||||
|
||||
# CHECK: PASS!
|
||||
test.compare(t3, t0_cpu + t1_cpu + t2_cpu, "add3")
|
|
@ -1,42 +0,0 @@
|
|||
# -*- Python -*-
|
||||
# This file is licensed under a pytorch-style license
|
||||
# See frontends/pytorch/LICENSE for license information.
|
||||
|
||||
import torch
|
||||
import npcomp.frontends.pytorch as torch_mlir
|
||||
import npcomp.frontends.pytorch.test as test
|
||||
|
||||
# RUN: %PYTHON %s | FileCheck %s
|
||||
|
||||
dev = torch_mlir.mlir_device()
|
||||
t0 = torch.randn((4,16,4), device=dev)
|
||||
t1 = torch.randn((4,16,4), device=dev)
|
||||
|
||||
t3 = torch.randn((4,64), device=dev)
|
||||
t4 = torch.randn((4,64), device=dev)
|
||||
|
||||
t2 = t0 + t1
|
||||
t5 = t3 + t4
|
||||
|
||||
t6 = t5.view((4,4,4,4))
|
||||
t7 = t2.view((4,4,4,4))
|
||||
|
||||
t8 = t6 + t7
|
||||
|
||||
t0_cpu = t0.to('cpu')
|
||||
t1_cpu = t1.to('cpu')
|
||||
|
||||
# CHECK: PASS! add_views_0 check
|
||||
test.compare(t2, t0_cpu + t1_cpu, "add_views_0")
|
||||
|
||||
t3_cpu = t3.to('cpu')
|
||||
t4_cpu = t4.to('cpu')
|
||||
|
||||
# CHECK: PASS! add_views_1 check
|
||||
test.compare(t5, t3_cpu + t4_cpu, "add_views_1")
|
||||
|
||||
t6_cpu = t6.to('cpu')
|
||||
t7_cpu = t7.to('cpu')
|
||||
|
||||
# CHECK: PASS! add_views_2 check
|
||||
test.compare(t8, t6_cpu + t7_cpu, "add_views_2")
|
|
@ -1,43 +0,0 @@
|
|||
# -*- Python -*-
|
||||
# This file is licensed under a pytorch-style license
|
||||
# See frontends/pytorch/LICENSE for license information.
|
||||
|
||||
import torch
|
||||
import npcomp.frontends.pytorch as torch_mlir
|
||||
import npcomp.frontends.pytorch.test as test
|
||||
|
||||
# RUN: %PYTHON %s | FileCheck %s
|
||||
|
||||
dev = torch_mlir.mlir_device()
|
||||
|
||||
x = torch.rand((3,64,8,8), device=dev)
|
||||
y = x*x
|
||||
print (y.stride())
|
||||
|
||||
dim = [64,24,24]
|
||||
dim = [4,4,4]
|
||||
N = 2
|
||||
count = dim[0]*dim[1]*dim[2]
|
||||
sizes = (N,dim[0],dim[1],dim[2])
|
||||
strides = (1,dim[1]*dim[2],dim[2],1)
|
||||
print(count)
|
||||
t0 = torch.randn((N,count), device=dev)
|
||||
t0_like = torch.randn((N,count))
|
||||
|
||||
|
||||
t1 = t0.as_strided(sizes, strides)
|
||||
t1_ref = t0.to('cpu').as_strided(sizes, strides)
|
||||
t1_like = t0_like.as_strided(sizes, strides)
|
||||
|
||||
t1_ref = t1_ref.clone()
|
||||
|
||||
# check that the IR has recorded the
|
||||
# stride properly before invoking JIT
|
||||
# CHECK: PASS! stride check
|
||||
test.compare_eq(t1.stride(), t1_like.stride(), "stride")
|
||||
|
||||
# CHECK: PASS! as_stride check
|
||||
test.compare(t1_ref, t1, "as_stride")
|
||||
|
||||
# CHECK: PASS! as_stride stride check
|
||||
test.compare_eq(t1_ref.stride(), t1.to("cpu").stride(), "as_stride stride")
|
|
@ -1,17 +0,0 @@
|
|||
# -*- Python -*-
|
||||
# This file is licensed under a pytorch-style license
|
||||
# See frontends/pytorch/LICENSE for license information.
|
||||
|
||||
import torch
|
||||
import npcomp.frontends.pytorch as torch_mlir
|
||||
import npcomp.frontends.pytorch.test as test
|
||||
|
||||
# RUN: %PYTHON %s | FileCheck %s
|
||||
|
||||
model = torch.nn.Conv2d(2,16,7,stride=[2,2], padding=[3,3],
|
||||
dilation=1, groups=1, bias=True)
|
||||
|
||||
tensor = torch.randn((1,2,128,128))
|
||||
|
||||
# CHECK: PASS! fwd check
|
||||
test.check_ref(model, tensor)
|
|
@ -1,46 +0,0 @@
|
|||
# -*- Python -*-
|
||||
# This file is licensed under a pytorch-style license
|
||||
# See frontends/pytorch/LICENSE for license information.
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import npcomp.frontends.pytorch as torch_mlir
|
||||
import npcomp.frontends.pytorch.test as test
|
||||
|
||||
# RUN: %PYTHON %s | FileCheck %s
|
||||
|
||||
dev = torch_mlir.mlir_device()
|
||||
|
||||
N = 3
|
||||
Cin = 16
|
||||
Cout = 4
|
||||
w = 10
|
||||
h = 10
|
||||
|
||||
class Net(nn.Module):
|
||||
def __init__(self):
|
||||
super(Net, self).__init__()
|
||||
self.conv1 = nn.Conv2d(Cin, Cout, (3,3))
|
||||
|
||||
def forward(self, x):
|
||||
x = self.conv1(x)
|
||||
output = F.log_softmax(x, dim=1)
|
||||
return output
|
||||
|
||||
model = Net()
|
||||
tensor = torch.randn(N, Cin, h, w)
|
||||
|
||||
# CHECK: PASS! fwd check
|
||||
fwd_path = test.check_ref(model, tensor)
|
||||
|
||||
loss = torch.nn.NLLLoss()
|
||||
target = torch.empty(N, 8, 8, dtype=torch.long).random_(0, Cout)
|
||||
|
||||
# CHECK: PASS! back check
|
||||
test.check_back(fwd_path, target, loss)
|
||||
|
||||
# CHECK: PASS! weight_grad check
|
||||
test.compare(model.conv1.weight.grad, fwd_path[0].conv1.weight.grad, "weight_grad")
|
||||
# CHECK: PASS! bias_grad check
|
||||
test.compare(model.conv1.bias.grad, fwd_path[0].conv1.bias.grad, "bias_grad")
|
|
@ -1,60 +0,0 @@
|
|||
# -*- Python -*-
|
||||
# This file is licensed under a pytorch-style license
|
||||
# See frontends/pytorch/LICENSE for license information.
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torch.optim as optim
|
||||
|
||||
import npcomp.frontends.pytorch as torch_mlir
|
||||
import npcomp.frontends.pytorch.test as test
|
||||
|
||||
# RUN: %PYTHON %s | FileCheck %s
|
||||
|
||||
class Net(nn.Module):
|
||||
def __init__(self):
|
||||
super(Net, self).__init__()
|
||||
self.conv1 = nn.Conv2d(1, 32, 3, 1, bias=True)
|
||||
self.conv2 = nn.Conv2d(32, 64, 3, 1, bias=True)
|
||||
#self.maxpool2d = nn.MaxPool2d(2,2)
|
||||
self.fc1 = nn.Linear(9216*4, 128, bias=True)
|
||||
self.fc2 = nn.Linear(128, 10, bias=True)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.conv1(x)
|
||||
x = F.relu(x)
|
||||
x = self.conv2(x)
|
||||
#x = self.maxpool2d(x)
|
||||
x = x.view((64,9216*4))
|
||||
x = self.fc1(x)
|
||||
x = F.relu(x)
|
||||
x = self.fc2(x)
|
||||
output = F.log_softmax(x, dim=1)
|
||||
return output
|
||||
|
||||
def main():
|
||||
model = Net()
|
||||
tensor = torch.randn((64, 1, 28, 28), requires_grad=True)
|
||||
|
||||
# CHECK: PASS! fwd check
|
||||
fwd_path = test.check_fwd(model, tensor)
|
||||
|
||||
target = torch.ones((64), dtype=torch.long)
|
||||
loss = F.nll_loss
|
||||
|
||||
# CHECK: PASS! back check
|
||||
test.check_back(fwd_path, target, loss)
|
||||
|
||||
# CHECK: PASS! weight_grad check
|
||||
test.compare(model.conv2.weight.grad,
|
||||
fwd_path[0].conv2.weight.grad, "weight_grad")
|
||||
# CHECK: PASS! bias_grad check
|
||||
test.compare(model.conv2.bias.grad,
|
||||
fwd_path[0].conv2.bias.grad, "bias_grad")
|
||||
# CHECK: PASS! fc1_weight_grad check
|
||||
test.compare(model.fc1.weight.grad,
|
||||
fwd_path[0].fc1.weight.grad, "fc1_weight_grad")
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
|
@ -1,53 +0,0 @@
|
|||
# -*- Python -*-
|
||||
# This file is licensed under a pytorch-style license
|
||||
# See frontends/pytorch/LICENSE for license information.
|
||||
|
||||
from __future__ import print_function
|
||||
import argparse
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torch.optim as optim
|
||||
from torchvision import datasets, transforms
|
||||
from torch.optim.lr_scheduler import StepLR
|
||||
|
||||
import npcomp.frontends.pytorch as torch_mlir
|
||||
import npcomp.frontends.pytorch.test as test
|
||||
|
||||
# RUN: %PYTHON %s | FileCheck %s
|
||||
|
||||
class Net(nn.Module):
|
||||
def __init__(self):
|
||||
super(Net, self).__init__()
|
||||
self.conv1 = nn.Conv2d(1, 32, 3, 1)
|
||||
self.conv2 = nn.Conv2d(32, 64, 3, 1)
|
||||
self.maxpool2d = nn.MaxPool2d(2,2)
|
||||
#self.dropout1 = nn.Dropout2d(0.25)
|
||||
#self.dropout2 = nn.Dropout2d(0.5)
|
||||
self.fc1 = nn.Linear(9216, 128)
|
||||
self.fc2 = nn.Linear(128, 10)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.conv1(x)
|
||||
x = F.relu(x)
|
||||
x = self.conv2(x)
|
||||
x = self.maxpool2d(x)
|
||||
#x = self.dropout1(x)
|
||||
x = x.view((4,9216))
|
||||
x = self.fc1(x)
|
||||
x = F.relu(x)
|
||||
#x = self.dropout2(x)
|
||||
x = self.fc2(x)
|
||||
output = F.log_softmax(x, dim=1)
|
||||
return output
|
||||
|
||||
|
||||
def main():
|
||||
model = Net()
|
||||
tensor = torch.randn((4, 1, 28, 28))
|
||||
|
||||
# CHECK: PASS! fwd check
|
||||
fwd_path = test.check_fwd(model, tensor)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
|
@ -1,17 +0,0 @@
|
|||
# -*- Python -*-
|
||||
# This file is licensed under a pytorch-style license
|
||||
# See frontends/pytorch/LICENSE for license information.
|
||||
|
||||
import torch
|
||||
import npcomp.frontends.pytorch as torch_mlir
|
||||
import npcomp.frontends.pytorch.test as test
|
||||
|
||||
# RUN: %PYTHON %s | FileCheck %s
|
||||
|
||||
dev = torch_mlir.mlir_device()
|
||||
|
||||
model = torch.nn.Linear(1024,16).to(dev)
|
||||
tensor = torch.randn(4,1024).to(dev)
|
||||
|
||||
# CHECK: PASS! fwd check
|
||||
fwd_path = test.check_fwd(model, tensor)
|
|
@ -1,15 +0,0 @@
|
|||
# -*- Python -*-
|
||||
# This file is licensed under a pytorch-style license
|
||||
# See frontends/pytorch/LICENSE for license information.
|
||||
|
||||
import torch
|
||||
import npcomp.frontends.pytorch as torch_mlir
|
||||
import npcomp.frontends.pytorch.test as test
|
||||
|
||||
# RUN: %PYTHON %s | FileCheck %s
|
||||
|
||||
model = torch.nn.LogSoftmax(dim=0)
|
||||
tensor = torch.ones(1,2,3,4)
|
||||
|
||||
# CHECK: PASS! fwd check
|
||||
fwd_path = test.check_fwd(model, tensor)
|
|
@ -1,18 +0,0 @@
|
|||
# -*- Python -*-
|
||||
# This file is licensed under a pytorch-style license
|
||||
# See frontends/pytorch/LICENSE for license information.
|
||||
|
||||
import torch
|
||||
import npcomp.frontends.pytorch as torch_mlir
|
||||
import npcomp.frontends.pytorch.test as test
|
||||
|
||||
# RUN: %PYTHON %s | FileCheck %s
|
||||
|
||||
model = torch.nn.MaxPool2d(kernel_size=(3,3), stride=(2,2), padding=(1,1),
|
||||
dilation=1, return_indices=False, ceil_mode=False)
|
||||
|
||||
tensor = torch.randn(1,32,16,16)
|
||||
|
||||
# CHECK: PASS! fwd check
|
||||
fwd_path = test.check_fwd(model, tensor)
|
||||
|
|
@ -1,49 +0,0 @@
|
|||
# -*- Python -*-
|
||||
# This file is licensed under a pytorch-style license
|
||||
# See frontends/pytorch/LICENSE for license information.
|
||||
|
||||
from __future__ import print_function
|
||||
import argparse
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torch.optim as optim
|
||||
from torchvision import datasets, transforms
|
||||
from torch.optim.lr_scheduler import StepLR
|
||||
|
||||
import npcomp.frontends.pytorch as torch_mlir
|
||||
import npcomp.frontends.pytorch.test as test
|
||||
|
||||
# RUN: %PYTHON %s | FileCheck %s
|
||||
|
||||
class Net(nn.Module):
|
||||
def __init__(self):
|
||||
super(Net, self).__init__()
|
||||
self.fc1 = nn.Linear(28*28, 50)
|
||||
self.fc2 = nn.Linear(50, 50)
|
||||
self.fc3 = nn.Linear(50, 10)
|
||||
|
||||
def forward(self, x):
|
||||
x = x.view(-1, 28*28)
|
||||
x = F.relu(self.fc1(x))
|
||||
x = F.relu(self.fc2(x))
|
||||
return F.log_softmax(self.fc3(x), dim=1)
|
||||
|
||||
def main():
|
||||
device = torch_mlir.mlir_device()
|
||||
model = Net()
|
||||
tensor = torch.randn((64, 1, 28, 28),requires_grad=True)
|
||||
# CHECK: PASS! fwd check
|
||||
fwd_path = test.check_ref(model, tensor)
|
||||
|
||||
target = torch.ones((64), dtype=torch.long)
|
||||
loss = F.nll_loss
|
||||
|
||||
# CHECK: PASS! back check
|
||||
test.check_back(fwd_path, target, loss)
|
||||
|
||||
# CHECK: PASS! fc1_weight_grad check
|
||||
test.compare(model.fc1.weight.grad, fwd_path[0].fc1.weight.grad, "fc1_weight_grad")
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
|
@ -1,32 +0,0 @@
|
|||
# -*- Python -*-
|
||||
# This file is licensed under a pytorch-style license
|
||||
# See frontends/pytorch/LICENSE for license information.
|
||||
|
||||
import torch
|
||||
import npcomp.frontends.pytorch as torch_mlir
|
||||
import npcomp.frontends.pytorch.test as test
|
||||
|
||||
# RUN: %PYTHON %s | FileCheck %s
|
||||
|
||||
dev = torch_mlir.mlir_device()
|
||||
|
||||
t0 = torch.randn((3,13), device=dev)
|
||||
t1 = torch.randn((13,5), device=dev)
|
||||
print(t0.to('cpu'), t1.to('cpu'))
|
||||
print(torch.mm(t0.to('cpu'), t1.to('cpu')))
|
||||
|
||||
t2 = torch.mm(t0, t1)
|
||||
|
||||
#
|
||||
# Check the result tensor against the CPU
|
||||
#
|
||||
t0_cpu = t0.to('cpu')
|
||||
t1_cpu = t1.to('cpu')
|
||||
t2_cpu = t2.to('cpu')
|
||||
|
||||
print (t0_cpu, " *\n", t1_cpu, " =\n", t2_cpu)
|
||||
|
||||
ref_tensor = torch.mm(t0_cpu, t1_cpu)
|
||||
# CHECK: PASS! mm check
|
||||
test.compare(t2, ref_tensor, "mm")
|
||||
|
|
@ -1,26 +0,0 @@
|
|||
# -*- Python -*-
|
||||
# This file is licensed under a pytorch-style license
|
||||
# See frontends/pytorch/LICENSE for license information.
|
||||
|
||||
import torch
|
||||
import npcomp.frontends.pytorch as torch_mlir
|
||||
import npcomp.frontends.pytorch.test as test
|
||||
|
||||
# RUN: %PYTHON %s | FileCheck %s
|
||||
|
||||
dev = torch_mlir.mlir_device()
|
||||
t0 = torch.randn((4,4), device=dev)
|
||||
t1 = torch.randn((4,4), device=dev)
|
||||
|
||||
t2 = t0 * t1
|
||||
#
|
||||
# Check the result tensor against the CPU
|
||||
#
|
||||
t0_cpu = t0.to('cpu')
|
||||
t1_cpu = t1.to('cpu')
|
||||
t2_cpu = t2.to('cpu')
|
||||
|
||||
print (t0_cpu, " *\n", t1_cpu, " =\n", t2_cpu)
|
||||
|
||||
# CHECK: PASS! mul2 check
|
||||
test.compare(t2, t0_cpu * t1_cpu, "mul2")
|
|
@ -1,21 +0,0 @@
|
|||
# -*- Python -*-
|
||||
# This file is licensed under a pytorch-style license
|
||||
# See frontends/pytorch/LICENSE for license information.
|
||||
|
||||
import torch
|
||||
import npcomp.frontends.pytorch as torch_mlir
|
||||
import npcomp.frontends.pytorch.test as test
|
||||
|
||||
# RUN: %PYTHON %s | FileCheck %s
|
||||
|
||||
model = torch.nn.LogSoftmax(dim=1)
|
||||
tensor = torch.randn(3,5,requires_grad=True)
|
||||
|
||||
# CHECK: PASS! fwd check
|
||||
fwd_path = test.check_fwd(model, tensor)
|
||||
|
||||
target = torch.tensor([1, 0, 4])
|
||||
loss = torch.nn.NLLLoss()
|
||||
|
||||
# CHECK: PASS! back check
|
||||
test.check_back(fwd_path, target, loss)
|
|
@ -1,15 +0,0 @@
|
|||
# -*- Python -*-
|
||||
# This file is licensed under a pytorch-style license
|
||||
# See frontends/pytorch/LICENSE for license information.
|
||||
|
||||
import torch
|
||||
import npcomp.frontends.pytorch as torch_mlir
|
||||
import npcomp.frontends.pytorch.test as test
|
||||
|
||||
# RUN: %PYTHON %s | FileCheck %s
|
||||
|
||||
model = torch.nn.ReLU()
|
||||
tensor = torch.randn(10)
|
||||
|
||||
# CHECK: PASS! fwd check
|
||||
fwd_path = test.check_ref(model, tensor)
|
|
@ -1,18 +0,0 @@
|
|||
# -*- Python -*-
|
||||
# This file is licensed under a pytorch-style license
|
||||
# See frontends/pytorch/LICENSE for license information.
|
||||
|
||||
import torch
|
||||
import npcomp.frontends.pytorch as torch_mlir
|
||||
import npcomp.frontends.pytorch.test as test
|
||||
|
||||
# RUN: %PYTHON %s | FileCheck %s
|
||||
|
||||
dev = torch_mlir.mlir_device()
|
||||
|
||||
tensor = torch.randn(2,3).to(dev)
|
||||
result = tensor.t()
|
||||
|
||||
ref_result = tensor.to('cpu').t()
|
||||
# CHECK: PASS! transpose check
|
||||
test.compare(ref_result, result, "transpose")
|
|
@@ -1,31 +0,0 @@
# -*- Python -*-
# This file is licensed under a pytorch-style license
# See frontends/pytorch/LICENSE for license information.

import torch
import npcomp.frontends.pytorch as torch_mlir

# RUN: %PYTHON %s | FileCheck %s

dev = torch_mlir.mlir_device()

model = torch.nn.Conv2d(2,16,7,stride=[2,2], padding=[3,3], dilation=1, groups=1, bias=True).to(dev)

tensor = torch.randn((1,2,128,128), device=dev)
result = model(tensor)

mlir = torch_mlir.get_mlir(result)
report = torch_mlir.op_report(mlir)

# CHECK-LABEL: "L0-convolution_overrideable-0"
# CHECK-NEXT: "activation_in": 32768
# CHECK-NEXT: "activation_out": 65536
# CHECK-NEXT: "ops:+": 65536
# CHECK-NEXT: "ops:MAC": 6422528
# CHECK-NEXT: "parameters_in": 1584
# CHECK-NEXT: "reads": 34352
# CHECK-NEXT: "writes": 65536
for k, v in report.items():
    print("\"{}\"".format(k))
    for k, v in v.items():
        print("\"{}\": {}".format(k, v))
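The CHECK values in this deleted conv2d report test follow directly from the layer configuration (2→16 channels, 7×7 kernel, stride 2, padding 3 on a 1×2×128×128 input). A quick back-of-the-envelope reproduction, added here for reference only:

```python
# Reproduce the expected op_report numbers by hand
# (plain Python, not part of the deleted test file).
in_c, out_c, k, stride, pad = 2, 16, 7, 2, 3
h = 128
out_h = (h + 2 * pad - k) // stride + 1           # 64

activation_in  = in_c * h * h                     # 32768
activation_out = out_c * out_h * out_h            # 65536
parameters_in  = out_c * in_c * k * k + out_c     # 1584 (weights + bias)
ops_add        = activation_out                   # 65536, presumably one bias add per output element
macs           = activation_out * in_c * k * k    # 6422528
reads          = activation_in + parameters_in    # 34352
writes         = activation_out                   # 65536

print(activation_in, activation_out, parameters_in, ops_add, macs, reads, writes)
```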
@@ -1,107 +0,0 @@
# -*- Python -*-
# This file is licensed under a pytorch-style license
# See frontends/pytorch/LICENSE for license information.

import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR

import npcomp.frontends.pytorch as torch_mlir
import json

# RUN: %PYTHON %s | FileCheck %s

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 8, 3, padding=1)
        self.conv2 = nn.Conv2d(8, 16, 3, padding=0)
        self.maxpool1 = nn.MaxPool2d(2,2)
        self.maxpool2 = nn.MaxPool2d(2,2)
        self.fc1 = nn.Linear(576, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 8)

    def forward(self, x):
        x = self.conv1(x)
        print(x.shape)
        x = F.relu(x)
        print(x.shape)
        x = self.maxpool1(x)
        print(x.shape)

        x = self.conv2(x)
        print(x.shape)
        x = F.relu(x)
        print(x.shape)
        x = self.maxpool2(x)
        print(x.shape)
        x = x.view(8, 6*6*16)

        x = self.fc1(x)
        x = F.relu(x)

        x = self.fc2(x)
        x = F.relu(x)

        x = self.fc3(x)
        output = F.log_softmax(x, dim=1)

        return output

def main():

    test_status = "PASS!"

    # CHECK-LABEL: test_op_report_vgg_style_lenet
    # CHECK: PASS!
    print("test_op_report_vgg_style_lenet")

    device = torch_mlir.mlir_device()

    model = Net().to(device)
    ref_tensor = torch.randn((8, 1, 30, 30))
    tensor = ref_tensor.clone().to(device)

    result = model(tensor)
    target = torch.ones((8), dtype=torch.long).to(device)
    loss = F.nll_loss(result, target)
    loss.backward()

    mlir0 = torch_mlir.get_mlir(model.conv1.weight.grad)
    print(mlir0)
    report = torch_mlir.op_report(mlir0)
    print(report)

    report_dict = report
    expected = 32
    if (len(report_dict) != expected):
        print("### ERROR: Expecting", expected, "items in the report, but got ", len(report_dict))
        test_status = "FAIL!"

    # Every item should have a read and a write
    for key, value in report_dict.items():
        if not 'reads' in value:
            print(f"### ERROR: {key} does not contain the required reads field")
            test_status = "FAIL!"
        if not 'writes' in value:
            print(f"### ERROR: {key} does not contain the required writes field")
            test_status = "FAIL!"
        if "convolution" in key:
            if not 'ops:MAC' in value:
                print(f"### ERROR: convolution {key} does not contain the required MAC field")
                test_status = "FAIL!"
        if "mm" in key:
            if not 'ops:MAC' in value:
                print(f"### ERROR: mm {key} does not contain the required MAC field")
                test_status = "FAIL!"

    print(test_status)

if __name__ == '__main__':
    main()
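The `view(8, 6*6*16)` and `nn.Linear(576, 128)` sizes in this deleted LeNet-style test are consistent with its 30×30 input. A short shape walk-through, added for reference only and using the standard Conv2d/MaxPool2d output-size formulas:

```python
# Shape walk-through for the Net above (reference only, not part of the deleted file).
def conv_out(n, k, pad=0, stride=1):
    return (n + 2 * pad - k) // stride + 1

h = 30
h = conv_out(h, 3, pad=1)   # conv1:    30 -> 30
h = h // 2                  # maxpool1: 30 -> 15
h = conv_out(h, 3, pad=0)   # conv2:    15 -> 13
h = h // 2                  # maxpool2: 13 -> 6
print(16 * h * h)           # 576, matching view(8, 6*6*16) and nn.Linear(576, 128)
```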
@@ -1,2 +0,0 @@
if not config.enable_c10_dispatch:
    config.unsupported = True
@@ -34,7 +34,6 @@ config.llvm_host_triple = '@LLVM_HOST_TRIPLE@'
config.host_arch = "@HOST_ARCH@"
config.npcomp_src_root = "@CMAKE_SOURCE_DIR@"
config.npcomp_obj_root = "@CMAKE_BINARY_DIR@"
config.enable_c10_dispatch = not @NPCOMP_ENABLE_TORCH_TYPE_DISPATCH@

# Support substitution of the tools_dir with user parameters. This is
# used when we can't determine the tool dir at configuration time.
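The two deleted lit fragments worked together: the generated `lit.site.cfg` derived `enable_c10_dispatch` from the CMake option, and the directory-local config marked these tests unsupported when it was off. A schematic sketch; the concrete value substituted for the `@...@` placeholder is an assumption:

```python
# Schematic of the deleted lit gating (values are illustrative, not the real substitution).
NPCOMP_ENABLE_TORCH_TYPE_DISPATCH = True      # -DNPCOMP_ENABLE_TORCH_TYPE_DISPATCH=ON
enable_c10_dispatch = not NPCOMP_ENABLE_TORCH_TYPE_DISPATCH

# The directory-local lit config then skipped these tests whenever the
# old type-dispatch build was active:
unsupported = not enable_c10_dispatch
```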