mirror of https://github.com/llvm/torch-mlir

Add TMTensor dialect to torch-mlir

This is intended to explore support for non-structured ops that can't be modeled by the Linalg dialect. `tm_tensor.scan` and `tm_tensor.scatter` are added as the first such ops. The dialect should aim to be upstreamed in the future.

parent cd21dda867
commit 869daf3c22
@@ -53,8 +53,9 @@ jobs:
     -DPython3_EXECUTABLE=$(which python) \
     -DLLVM_ENABLE_ASSERTIONS=ON \
     -DLLVM_ENABLE_PROJECTS=mlir \
-    -DLLVM_EXTERNAL_PROJECTS=torch-mlir \
+    -DLLVM_EXTERNAL_PROJECTS="torch-mlir;torch-mlir-dialects" \
     -DLLVM_EXTERNAL_TORCH_MLIR_SOURCE_DIR="$GITHUB_WORKSPACE" \
+    -DLLVM_EXTERNAL_TORCH_MLIR_DIALECTS_SOURCE_DIR="${GITHUB_WORKSPACE}/external/llvm-external-projects/torch-mlir-dialects" \
     -DMLIR_ENABLE_BINDINGS_PYTHON=ON \
     -DLLVM_TARGETS_TO_BUILD=host
   ninja check-torch-mlir-all
@@ -25,6 +25,22 @@ project(torch-mlir LANGUAGES CXX C)
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_CXX_STANDARD 14)
+
+macro(torch_mlir_add_llvm_external_project name identifier location)
+  message(STATUS "Adding LLVM external project ${name} (${identifier}) -> ${location}")
+  if(NOT EXISTS "${location}/CMakeLists.txt")
+    message(FATAL_ERROR "External project location ${location} is not valid")
+  endif()
+  list(APPEND LLVM_EXTERNAL_PROJECTS ${name})
+  list(REMOVE_DUPLICATES LLVM_EXTERNAL_PROJECTS)
+  set(LLVM_EXTERNAL_${identifier}_SOURCE_DIR ${location} CACHE STRING "" FORCE)
+  set(LLVM_EXTERNAL_PROJECTS ${LLVM_EXTERNAL_PROJECTS} CACHE STRING "" FORCE)
+endmacro()
+
+torch_mlir_add_llvm_external_project(
+  torch-mlir-dialects
+  TORCH_MLIR_DIALECTS
+  ${CMAKE_CURRENT_SOURCE_DIR}/external/llvm-external-projects/torch-mlir-dialects)
+
 if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
   # Out-of-tree build
@@ -129,6 +145,7 @@ add_subdirectory(tools)
 add_custom_target(check-torch-mlir-all)
 add_dependencies(check-torch-mlir-all
   check-torch-mlir
+  check-torch-mlir-dialects
 )

 if(MLIR_ENABLE_BINDINGS_PYTHON)
@@ -62,8 +62,9 @@ cmake -GNinja -Bbuild \
   -DCMAKE_CXX_COMPILER=clang++ \
   -DPython3_FIND_VIRTUALENV=ONLY \
   -DLLVM_ENABLE_PROJECTS=mlir \
-  -DLLVM_EXTERNAL_PROJECTS=torch-mlir \
+  -DLLVM_EXTERNAL_PROJECTS="torch-mlir;torch-mlir-dialects" \
   -DLLVM_EXTERNAL_TORCH_MLIR_SOURCE_DIR=`pwd` \
+  -DLLVM_EXTERNAL_TORCH_MLIR_DIALECTS_SOURCE_DIR=`pwd`/external/llvm-external-projects/torch-mlir-dialects \
   -DMLIR_ENABLE_BINDINGS_PYTHON=ON \
   -DLLVM_TARGETS_TO_BUILD=host \
   external/llvm-project/llvm
@@ -19,8 +19,9 @@ cmake -GNinja -B"$build_dir" "$llvm_project_dir/llvm" \
   -DCMAKE_BUILD_TYPE=Release \
   -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
   -DLLVM_ENABLE_PROJECTS=mlir \
-  -DLLVM_EXTERNAL_PROJECTS=torch-mlir \
+  -DLLVM_EXTERNAL_PROJECTS="torch-mlir;torch-mlir-dialects" \
   -DLLVM_EXTERNAL_TORCH_MLIR_SOURCE_DIR="$project_dir" \
+  -DLLVM_EXTERNAL_TORCH_MLIR_DIALECTS_SOURCE_DIR="${project_dir}/external/llvm-external-projects/torch-mlir-dialects" \
   -DMLIR_ENABLE_BINDINGS_PYTHON=ON \
   -DLLVM_ENABLE_ASSERTIONS=ON \
   -DLLVM_TARGETS_TO_BUILD=host
@@ -0,0 +1,54 @@
if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
  message(FATAL_ERROR
    "This project is intended to be built as part of LLVM via "
    "-DLLVM_EXTERNAL_PROJECTS=torch-mlir-dialects "
    "-DLLVM_EXTERNAL_TORCH_MLIR_DIALECTS_SOURCE_DIR=${CMAKE_CURRENT_SOURCE_DIR}")
endif()

option(MLIR_ENABLE_BINDINGS_PYTHON "Enables MLIR Python Bindings" OFF)

set(TORCH_MLIR_DIALECTS_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
set(TORCH_MLIR_DIALECTS_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}")
message(STATUS "Building torch-mlir-dialects project at ${TORCH_MLIR_DIALECTS_SOURCE_DIR} (into ${TORCH_MLIR_DIALECTS_BINARY_DIR})")

# TODO: Fix this upstream so that global include directories are not needed.
set(MLIR_MAIN_SRC_DIR ${LLVM_MAIN_SRC_DIR}/../mlir)
set(MLIR_INCLUDE_DIR ${LLVM_MAIN_SRC_DIR}/../mlir/include)
set(MLIR_GENERATED_INCLUDE_DIR ${LLVM_BINARY_DIR}/tools/mlir/include)

# TODO: Needed for tablegen. Remove.
include_directories(SYSTEM ${MLIR_INCLUDE_DIR})
include_directories(SYSTEM ${MLIR_GENERATED_INCLUDE_DIR})
include_directories(SYSTEM ${TORCH_MLIR_DIALECTS_SOURCE_DIR}/include)

function(torch_mlir_dialects_target_includes target)
  set(_dirs
    $<BUILD_INTERFACE:${MLIR_INCLUDE_DIR}>
    $<BUILD_INTERFACE:${MLIR_GENERATED_INCLUDE_DIR}>
    $<BUILD_INTERFACE:${TORCH_MLIR_DIALECTS_SOURCE_DIR}/include>
    $<BUILD_INTERFACE:${TORCH_MLIR_DIALECTS_BINARY_DIR}/include>
  )
  # In LLVM parlance, the actual target may just be an interface and may not
  # be responsible for actually compiling anything. The corresponding obj.
  # target, when present, is just used for compilation and does not
  # contribute to the interface properties.
  # TODO: Normalize this upstream.
  target_include_directories(${target} PUBLIC ${_dirs})
  if(TARGET obj.${target})
    target_include_directories(obj.${target} PRIVATE ${_dirs})
  endif()
endfunction()

# Configure CMake and tablegen.
list(APPEND CMAKE_MODULE_PATH ${MLIR_MAIN_SRC_DIR}/cmake/modules)
list(APPEND CMAKE_MODULE_PATH ${LLVM_MAIN_SRC_DIR}/cmake)
set(MLIR_TABLEGEN_EXE mlir-tblgen)

include(TableGen)
include(AddLLVM)
include(AddMLIR)

add_subdirectory(include)
add_subdirectory(lib)
add_subdirectory(tools)
add_subdirectory(test)
@@ -0,0 +1 @@
add_subdirectory(torch-mlir-dialects)
@@ -0,0 +1 @@
add_subdirectory(Dialect)
@@ -0,0 +1 @@
add_subdirectory(TMTensor)
@@ -0,0 +1,2 @@
add_subdirectory(IR)
add_subdirectory(Transforms)
@@ -0,0 +1,33 @@
function(_add_interfaces)
  set(LLVM_TARGET_DEFINITIONS TMTensorInterfaces.td)
  mlir_tablegen(TMTensorOpInterfaces.h.inc -gen-op-interface-decls)
  mlir_tablegen(TMTensorOpInterfaces.cpp.inc -gen-op-interface-defs)
  mlir_tablegen(TMTensorTypeInterfaces.h.inc -gen-type-interface-decls)
  mlir_tablegen(TMTensorTypeInterfaces.cpp.inc -gen-type-interface-defs)
  add_public_tablegen_target(TorchMLIRTMTensorInterfacesIncGen)
  add_dependencies(TorchMLIRTMTensorOpsIncGen TorchMLIRTMTensorInterfacesIncGen)
endfunction()

function(_add_scalar_loop_op_interface)
  set(LLVM_TARGET_DEFINITIONS ScalarLoopOpInterface.td)
  mlir_tablegen(ScalarLoopOpInterface.h.inc -gen-op-interface-decls)
  mlir_tablegen(ScalarLoopOpInterface.cpp.inc -gen-op-interface-defs)
  add_public_tablegen_target(TorchMLIRTMTensorScalarLoopOpInterfaceIncGen)
  add_dependencies(TorchMLIRTMTensorOpsIncGen TorchMLIRTMTensorScalarLoopOpInterfaceIncGen)
endfunction()

function(_add_dialect)
  set(LLVM_TARGET_DEFINITIONS TMTensorOps.td)
  mlir_tablegen(TMTensorOps.h.inc -gen-op-decls)
  mlir_tablegen(TMTensorOps.cpp.inc -gen-op-defs)
  mlir_tablegen(TMTensorTypes.h.inc -gen-typedef-decls)
  mlir_tablegen(TMTensorTypes.cpp.inc -gen-typedef-defs)
  mlir_tablegen(TMTensorDialect.h.inc -gen-dialect-decls -dialect=tm_tensor)
  mlir_tablegen(TMTensorDialect.cpp.inc -gen-dialect-defs -dialect=tm_tensor)
  add_public_tablegen_target(TorchMLIRTMTensorOpsIncGen)
  add_dependencies(mlir-headers TorchMLIRTMTensorOpsIncGen)
endfunction()

_add_dialect()
_add_interfaces()
_add_scalar_loop_op_interface()
@@ -0,0 +1,29 @@
//===------------------------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Also available under a BSD-style license. See LICENSE.
//
//===----------------------------------------------------------------------===//

#ifndef TORCH_MLIR_DIALECTS_DIALECT_TMTENSOR_IR_SCALARLOOPOPINTERFACE_H_
#define TORCH_MLIR_DIALECTS_DIALECT_TMTENSOR_IR_SCALARLOOPOPINTERFACE_H_

#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Operation.h"
#include "mlir/Interfaces/ViewLikeInterface.h"
#include "mlir/Support/LLVM.h"

/// Include the ODS generated interface header files.
#include "torch-mlir-dialects/Dialect/TMTensor/IR/ScalarLoopOpInterface.h.inc"

namespace mlir {
namespace torch {
namespace TMTensor {} // namespace TMTensor
} // namespace torch
} // namespace mlir

#endif // TORCH_MLIR_DIALECTS_DIALECT_TMTENSOR_IR_SCALARLOOPOPINTERFACE_H_
@@ -0,0 +1,79 @@
//===-------------------------------------------------------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Also available under a BSD-style license. See LICENSE.
//
//===----------------------------------------------------------------------===//

#ifndef TORCH_MLIR_DIALECT_TMTENSOR_SCALARLOOPOPINTERFACE
#define TORCH_MLIR_DIALECT_TMTENSOR_SCALARLOOPOPINTERFACE

include "mlir/IR/OpBase.td"

def ScalarLoopOpInterface : OpInterface<"ScalarLoopOpInterface"> {
  let description = [{
    Interface that allows operations to expose the information needed to
    lower them to for loops.
  }];
  let cppNamespace = "::mlir::torch::TMTensor";
  let methods = [
    InterfaceMethod<
      /*desc=*/[{
        Returns the destination operands. For ops with `memref`
        operands, these are the result buffers. For ops with `tensor`
        operands, these are the operands that contain the initial
        value for the result. They are "tied" to the result
        buffers. For example, for `LinalgOp`/`TMTensor` ops, these are
        the `outs` parameters; for `tensor.insert_slice`, it is
        the `dest` parameter.
      }],
      /*retTy=*/"SmallVector<Value>",
      /*methodName=*/"getDestinationOperands",
      /*args=*/(ins "OpBuilder &":$b),
      /*methodBody=*/"",
      /*defaultImplementation=*/"return ValueRange{};"
    >,
    InterfaceMethod<
      /*desc=*/[{
        Returns a list of `StringRef`s that describe the number of
        loops and the iterator types of the operation. The list is
        expected to use
        `getParallelIteratorTypeName()`/`getReductionIteratorTypeName()`
        from MLIR Structured Op Utils.
      }],
      /*retTy=*/"SmallVector<StringRef>",
      /*methodName=*/"getLoopIteratorTypes"
    >,
    InterfaceMethod<
      /*desc=*/[{
        Returns a list of ranges that describe the loop bounds and
        step for the loops of the operation.
      }],
      /*retTy=*/"SmallVector<Range>",
      /*methodName=*/"getIterationDomain",
      /*args=*/(ins "OpBuilder &":$b)
    >,
    InterfaceMethod<
      /*desc=*/[{
        Generates the loop body implementation. Assumes that all the parallel
        and reduction loops have been created and that the insertion point of
        the builder is set to the innermost loop. This method emits the IR for
        the loop body.
      }],
      /*retTy=*/"LogicalResult",
      /*methodName=*/"generateScalarImplementation",
      /*args=*/(ins
        "OpBuilder &":$b,
        "Location ":$loc,
        "ValueRange ":$ivs),
      /*methodBody=*/"",
      /*defaultImplementation=*/[{
        return failure();
      }]
    >
  ];
}

#endif // TORCH_MLIR_DIALECT_TMTENSOR_SCALARLOOPOPINTERFACE
@@ -0,0 +1,59 @@
//===-------------------------------------------------------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Also available under a BSD-style license. See LICENSE.
//
//===----------------------------------------------------------------------===//

#ifndef TORCH_MLIR_DIALECT_TMTENSOR_BASE
#define TORCH_MLIR_DIALECT_TMTENSOR_BASE

include "mlir/IR/OpBase.td"

//===----------------------------------------------------------------------===//
// Dialect definition
//===----------------------------------------------------------------------===//

def TMTensor_Dialect : Dialect {
  let name = "tm_tensor";
  let cppNamespace = "::mlir::torch::TMTensor";
  let description = [{
    The tm_tensor (tm = torch-mlir) dialect is a temporary staging ground in
    the torch-mlir project for a set of widely-accepted tensor compute
    operations that are not well-served by existing representations in
    upstream MLIR. These ops are currently heavily inspired by the linalg_ext
    dialect (which itself is heavily inspired by the structured ops of the
    linalg dialect). But while linalg_ext is meant to power specific codegen
    transformations, the tm_tensor dialect is a much more pure "interface
    dialect" agnostic to any particular set of transformations applied to
    the operations. We simply require a way to name the specified operations
    for interchange between projects, without taking strong opinions on the
    mechanics of transformations.

    The dialect does include interfaces to generate scalar reference code for
    the operations, which simultaneously provides a precise definition of
    their semantics and aids in producing executable reference
    implementations of the operations.

    The goal of this dialect is to eventually either be upstreamed or to be
    subsumed by functionality included in upstream MLIR. It should also be
    kept consistent with the linalg_ext dialect unless there is a good reason
    not to.
  }];
  let hasCanonicalizer = 1;
}

//===----------------------------------------------------------------------===//
// Type definitions
//===----------------------------------------------------------------------===//

class RankedTensorOrMemRefOf<list<Type> allowedTypes> :
  ShapedContainerType<allowedTypes,
    Or<[IsMemRefTypePred, And<[IsTensorTypePred, HasRankPred]>]>,
    "ranked tensor or memref", "::mlir::ShapedType">;

def AnyRankedTensorOrMemRefType : RankedTensorOrMemRefOf<[AnyType]>;

#endif // TORCH_MLIR_DIALECT_TMTENSOR_BASE
@@ -0,0 +1,20 @@
//===------------------------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Also available under a BSD-style license. See LICENSE.
//
//===----------------------------------------------------------------------===//

#ifndef TORCH_MLIR_DIALECTS_DIALECT_TMTENSOR_IR_TMTENSORDIALECT_H_
#define TORCH_MLIR_DIALECTS_DIALECT_TMTENSOR_IR_TMTENSORDIALECT_H_

#include "mlir/IR/Dialect.h"
#include "mlir/IR/OpDefinition.h"

// clang-format off: must be included after all LLVM/MLIR headers
#include "torch-mlir-dialects/Dialect/TMTensor/IR/TMTensorDialect.h.inc" // IWYU pragma: keep
// clang-format on

#endif // TORCH_MLIR_DIALECTS_DIALECT_TMTENSOR_IR_TMTENSORDIALECT_H_
@@ -0,0 +1,42 @@
//===------------------------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Also available under a BSD-style license. See LICENSE.
//
//===----------------------------------------------------------------------===//

#ifndef TORCH_MLIR_DIALECTS_DIALECT_TMTENSOR_IR_TMTENSORINTERFACES_H_
#define TORCH_MLIR_DIALECTS_DIALECT_TMTENSOR_IR_TMTENSORINTERFACES_H_

#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/Support/LLVM.h"

namespace mlir {
namespace torch {
namespace TMTensor {
class TMTensorOp;

/// OpOperand vector that implicitly converts to a Value vector.
struct OpOperandVector : public SmallVector<OpOperand *> {
  operator SmallVector<Value>();
};

namespace detail {
LogicalResult verifyTMTensorOpInterface(Operation *op);
} // namespace detail

#include "torch-mlir-dialects/Dialect/TMTensor/IR/TMTensorOps.h.inc" // IWYU pragma: export

/// Include the generated interface declarations.
#include "torch-mlir-dialects/Dialect/TMTensor/IR/TMTensorOpInterfaces.h.inc" // IWYU pragma: export

} // namespace TMTensor
} // namespace torch
} // namespace mlir

#endif // TORCH_MLIR_DIALECTS_DIALECT_TMTENSOR_IR_TMTENSORINTERFACES_H_
@@ -0,0 +1,493 @@
//===-------------------------------------------------------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Also available under a BSD-style license. See LICENSE.
//
//===----------------------------------------------------------------------===//

#ifndef TORCH_MLIR_DIALECT_TMTENSOR_INTERFACES
#define TORCH_MLIR_DIALECT_TMTENSOR_INTERFACES

include "torch-mlir-dialects/Dialect/TMTensor/IR/TMTensorBase.td"

// The interface is a subset of LinalgStructuredInterface.
def TMTensorInterface : OpInterface<"TMTensorOp"> {
  let methods = [
    //===------------------------------------------------------------------===//
    // Num input/output arguments handling.
    //===------------------------------------------------------------------===//
    // `inputs` must be defined by each op that wants to implement the
    // LinalgStructuredInterface.
    InterfaceMethod<
      /*desc=*/[{
        Return the input shape operands.
      }],
      /*retTy=*/"ValueRange",
      /*methodName=*/"inputs",
      /*args=*/(ins)
    >,
    // These special methods rely on `inputs` and `outputs` being defined by
    // each op that wants to implement the LinalgStructuredInterface.
    InterfaceMethod<
      /*desc=*/[{
        Return the number of inputs.
      }],
      /*retTy=*/"int64_t",
      /*methodName=*/"getNumInputs",
      /*args=*/(ins),
      /*methodBody=*/"",
      /*defaultImplementation=*/[{
        return $_op.inputs().size();
      }]
    >,
    // `outputs` must be defined by each op that wants to implement the
    // LinalgStructuredInterface.
    InterfaceMethod<
      /*desc=*/[{
        Return the output shape operands.
      }],
      /*retTy=*/"ValueRange",
      /*methodName=*/"outputs",
      /*args=*/(ins)
    >,
    InterfaceMethod<
      /*desc=*/[{
        Return the number of outputs.
      }],
      /*retTy=*/"int64_t",
      /*methodName=*/"getNumOutputs",
      /*args=*/(ins),
      /*methodBody=*/"",
      /*defaultImplementation=*/[{
        return $_op.outputs().size();
      }]
    >,
    InterfaceMethod<
      /*desc=*/[{
        Return the number of inputs and outputs.
      }],
      /*retTy=*/"int64_t",
      /*methodName=*/"getNumInputsAndOutputs",
      /*args=*/(ins),
      /*methodBody=*/"",
      /*defaultImplementation=*/[{
        return getNumInputs() + getNumOutputs();
      }]
    >,
    //===------------------------------------------------------------------===//
    // Input operands handling.
    //===------------------------------------------------------------------===//
    InterfaceMethod<
      /*desc=*/[{
        Return the input operands.
      }],
      /*retTy=*/"OpOperandVector",
      /*methodName=*/"getInputOperands",
      /*args=*/(ins),
      /*methodBody=*/"",
      /*defaultImplementation=*/[{
        int64_t numInputs = getNumInputs();
        OpOperandVector result;
        result.reserve(numInputs);
        llvm::transform(
            this->getOperation()->getOpOperands().take_front(numInputs),
            std::back_inserter(result),
            [](OpOperand &opOperand) { return &opOperand; });
        return result;
      }]
    >,
    InterfaceMethod<
      /*desc=*/[{
        Return the `i`-th input operand.
      }],
      /*retTy=*/"OpOperand*",
      /*methodName=*/"getInputOperand",
      /*args=*/(ins "int64_t":$i),
      /*methodBody=*/"",
      /*defaultImplementation=*/[{
        assert(i >= 0 && i < getNumInputs());
        return &this->getOperation()->getOpOperand(i);
      }]
    >,
    InterfaceMethod<
      /*desc=*/[{
        Return the subset of input operands that are of buffer type.
      }],
      /*retTy=*/"OpOperandVector",
      /*methodName=*/"getInputBufferOperands",
      /*args=*/(ins),
      /*methodBody=*/"",
      /*defaultImplementation=*/[{
        OpOperandVector result;
        result.reserve(getNumInputs());
        llvm::copy_if(getInputOperands(),
            std::back_inserter(result),
            [](OpOperand *opOperand) {
              return opOperand->get().getType().template isa<MemRefType>();
            });
        return result;
      }]
    >,
    InterfaceMethod<
      /*desc=*/[{
        Return the subset of input operands that are of tensor type.
      }],
      /*retTy=*/"OpOperandVector",
      /*methodName=*/"getInputTensorOperands",
      /*args=*/(ins),
      /*methodBody=*/"",
      /*defaultImplementation=*/[{
        OpOperandVector result;
        result.reserve(getNumInputs());
        llvm::copy_if(getInputOperands(),
            std::back_inserter(result),
            [](OpOperand *opOperand) {
              return opOperand->get().getType().template isa<RankedTensorType>();
            });
        return result;
      }]
    >,
    //===------------------------------------------------------------------===//
    // Output operands handling.
    //===------------------------------------------------------------------===//
    InterfaceMethod<
      /*desc=*/[{
        Return the output operands.
      }],
      /*retTy=*/"OpOperandVector",
      /*methodName=*/"getOutputOperands",
      /*args=*/(ins),
      /*methodBody=*/"",
      /*defaultImplementation=*/[{
        int64_t numOutputs = getNumOutputs();
        OpOperandVector result;
        result.reserve(numOutputs);
        llvm::transform(
            this->getOperation()->getOpOperands()
                .drop_front(getNumInputs())
                .take_front(numOutputs),
            std::back_inserter(result),
            [](OpOperand &opOperand) { return &opOperand; });
        return result;
      }]
    >,
    InterfaceMethod<
      /*desc=*/[{
        Return the `i`-th output operand.
      }],
      /*retTy=*/"OpOperand*",
      /*methodName=*/"getOutputOperand",
      /*args=*/(ins "int64_t":$i),
      /*methodBody=*/"",
      /*defaultImplementation=*/[{
        assert(i >= 0 && i < getNumOutputs());
        return &this->getOperation()->getOpOperand(getNumInputs() + i);
      }]
    >,
    InterfaceMethod<
      /*desc=*/[{
        Return the subset of output operands that are of buffer type.
      }],
      /*retTy=*/"OpOperandVector",
      /*methodName=*/"getOutputBufferOperands",
      /*args=*/(ins),
      /*methodBody=*/"",
      /*defaultImplementation=*/[{
        OpOperandVector result;
        result.reserve(getNumOutputs());
        llvm::copy_if(getOutputOperands(),
            std::back_inserter(result),
            [](OpOperand *opOperand) {
              return opOperand->get().getType().template isa<MemRefType>();
            });
        return result;
      }]
    >,
    InterfaceMethod<
      /*desc=*/[{
        Return the subset of output operands that are of tensor type.
      }],
      /*retTy=*/"OpOperandVector",
      /*methodName=*/"getOutputTensorOperands",
      /*args=*/(ins),
      /*methodBody=*/"",
      /*defaultImplementation=*/[{
        OpOperandVector result;
        result.reserve(getNumOutputs());
        llvm::copy_if(getOutputOperands(),
            std::back_inserter(result),
            [](OpOperand *opOperand) {
              return opOperand->get().getType().template isa<RankedTensorType>();
            });
        return result;
      }]
    >,
    InterfaceMethod<
      /*desc=*/[{
        Return the types of the subset of output operands that are of buffer type.
      }],
      /*retTy=*/"SmallVector<MemRefType>",
      /*methodName=*/"getOutputBufferTypes",
      /*args=*/(ins),
      /*methodBody=*/"",
      /*defaultImplementation=*/[{
        SmallVector<MemRefType> result;
        result.reserve(getNumOutputs());
        llvm::transform(getOutputBufferOperands(),
            std::back_inserter(result),
            [](OpOperand *opOperands) {
              return opOperands->get().getType().cast<MemRefType>();
            });
        return result;
      }]
    >,
    InterfaceMethod<
      /*desc=*/[{
        Return the types of the subset of output operands that are of tensor type.
      }],
      /*retTy=*/"SmallVector<RankedTensorType>",
      /*methodName=*/"getOutputTensorTypes",
      /*args=*/(ins),
      /*methodBody=*/"",
      /*defaultImplementation=*/[{
        SmallVector<RankedTensorType> result;
        result.reserve(getNumOutputs());
        llvm::transform(getOutputTensorOperands(),
            std::back_inserter(result),
            [](OpOperand *opOperands) {
              return opOperands->get().getType().cast<RankedTensorType>();
            });
        return result;
      }]
    >,
    //===------------------------------------------------------------------===//
    // Input and Output arguments handling.
    //===------------------------------------------------------------------===//
    InterfaceMethod<
      /*desc=*/[{
        Return the range over input and output operands.
      }],
      /*retTy=*/"OpOperandVector",
      /*methodName=*/"getInputAndOutputOperands",
      /*args=*/(ins),
      /*methodBody=*/"",
      /*defaultImplementation=*/[{
        int64_t numInputsAndOutputs = getNumInputsAndOutputs();
        OpOperandVector result;
        result.reserve(numInputsAndOutputs);
        llvm::transform(
            this->getOperation()->getOpOperands()
                .take_front(numInputsAndOutputs),
            std::back_inserter(result),
            [](OpOperand &opOperand) { return &opOperand; });
        return result;
      }]
    >,
    InterfaceMethod<
      /*desc=*/[{
        Return true if the payload uses the value loaded from `opOperand`. This
        is useful to avoid loading from "write-only" memory that may be
        uninitialized, as well as properly cloning "read-write" operands.
      }],
      /*retTy=*/"bool",
      /*methodName=*/"payloadUsesValueFromOperand",
      /*args=*/(ins "OpOperand *":$opOperand),
      /*methodBody=*/"",
      /*defaultImplementation=*/[{
        unsigned bbArgNumber = opOperand->getOperandNumber();
        // Safeguard against the named linalg ops that are manually defined and
        // that only support buffer semantics: we should not be there.
        // Such ops have an empty regionBuilder and are not constructed with a
        // region for now. In the future they are slated to disappear.
        assert(this->getOperation()->getNumRegions() == 1 && "unexpected "
               "missing region (calling `payloadUsesValueFromOperand` on "
               "manually defined named Linalg op?)");
        Block &block = this->getOperation()->getRegion(0).front();
        // Init tensors have uses.
        return !block.getArgument(bbArgNumber).use_empty();
      }]
    >,
    InterfaceMethod<
      /*desc=*/[{
        Return true if `opOperand` is an input tensor.
      }],
      /*retTy=*/"bool",
      /*methodName=*/"isInputTensor",
      /*args=*/(ins "OpOperand *":$opOperand),
      /*methodBody=*/"",
      /*defaultImplementation=*/[{
        if (!opOperand->get().getType().template isa<RankedTensorType>())
          return false;
        if (opOperand->getOperandNumber() < $_op.getNumInputs())
          return true;
        return false;
      }]
    >,
    InterfaceMethod<
      /*desc=*/[{
        Return true if `opOperand` is an output tensor.
      }],
      /*retTy=*/"bool",
      /*methodName=*/"isOutputTensor",
      /*args=*/(ins "OpOperand *":$opOperand),
      /*methodBody=*/"",
      /*defaultImplementation=*/[{
        if (!opOperand->get().getType().template isa<RankedTensorType>())
          return false;
        if (opOperand->getOperandNumber() >= $_op.getNumInputs())
          return true;
        return false;
      }]
    >,
    InterfaceMethod<
      /*desc=*/[{
        Return true if `opOperand` is an init tensor. This is true when it is
        an output tensor operand whose value is used in the payload region.
      }],
      /*retTy=*/"bool",
      /*methodName=*/"isInitTensor",
      /*args=*/(ins "OpOperand *":$opOperand),
      /*methodBody=*/"",
      /*defaultImplementation=*/[{
        if (!$_op.isOutputTensor(opOperand))
          return false;
        return payloadUsesValueFromOperand(opOperand);
      }]
    >,
    InterfaceMethod<
      /*desc=*/[{
        Return the `opOperand` rank or zero for scalars.
      }],
      /*retTy=*/"int64_t",
      /*methodName=*/"getRank",
      /*args=*/(ins "OpOperand*":$opOperand),
      /*methodBody=*/"",
      /*defaultImplementation=*/[{
        assert(opOperand->getOwner() == this->getOperation());
        if (auto shapedType =
                opOperand->get().getType().template dyn_cast<ShapedType>())
          return shapedType.getRank();
        return 0;
      }]
    >,
    InterfaceMethod<
      /*desc=*/[{
        Return the `opOperand` shape or an empty vector for scalars.
      }],
      /*retTy=*/"ArrayRef<int64_t>",
      /*methodName=*/"getShape",
      /*args=*/(ins "OpOperand*":$opOperand),
      /*methodBody=*/"",
      /*defaultImplementation=*/[{
        assert(opOperand->getOwner() == this->getOperation());
        if (auto shapedType =
                opOperand->get().getType().template dyn_cast<ShapedType>())
          return shapedType.getShape();
        return {};
      }]
    >,
    InterfaceMethod<
      /*desc=*/[{
        Return true if the `opOperand` is a scalar value.
      }],
      /*retTy=*/"bool",
      /*methodName=*/"isScalar",
      /*args=*/(ins "OpOperand*":$opOperand),
      /*methodBody=*/"",
      /*defaultImplementation=*/[{
        assert(opOperand->getOwner() == this->getOperation());
        return !opOperand->get().getType().template isa<ShapedType>();
      }]
    >,
    //===------------------------------------------------------------------===//
    // Other interface methods.
    //===------------------------------------------------------------------===//
    InterfaceMethod<
      /*desc=*/[{
        Return whether the op has only MemRef input and outputs.
      }],
      /*retTy=*/"bool",
      /*methodName=*/"hasBufferSemantics",
      /*args=*/(ins),
      /*methodBody=*/"",
      /*defaultImplementation=*/[{
        return this->getOperation()->getNumResults() == 0 &&
               llvm::all_of(getInputOperands(), [&](OpOperand *opOperand) {
                 return isScalar(opOperand) ||
                        opOperand->get().getType().template isa<MemRefType>();
               }) &&
               llvm::all_of(getOutputOperands(), [](OpOperand *opOperand) {
                 return opOperand->get().getType().template isa<MemRefType>();
               });
      }]
    >,
    InterfaceMethod<
      /*desc=*/[{
        Return whether the op has only RankedTensor input and outputs.
      }],
      /*retTy=*/"bool",
      /*methodName=*/"hasTensorSemantics",
      /*args=*/(ins),
      /*methodBody=*/"",
      /*defaultImplementation=*/[{
        return
            llvm::all_of(getInputOperands(), [&](OpOperand *opOperand) {
              return isScalar(opOperand) ||
                     opOperand->get().getType().template isa<RankedTensorType>();
            }) &&
            llvm::all_of(getOutputOperands(), [](OpOperand *opOperand) {
              return opOperand->get().getType().template isa<RankedTensorType>();
            });
      }]
    >,
    //===------------------------------------------------------------------===//
    // Other static interface methods.
    //===------------------------------------------------------------------===//
    InterfaceMethod<
      /*desc=*/[{
        Clone the current operation with the given location and operands. This
        is used to abstract away the optional underlying region creation. This
        does not change the balance between input, output_buffer and
        init_tensors operands.
      }],
      /*retTy=*/"Operation *",
      /*methodName=*/"clone",
      (ins "OpBuilder &":$b, "Location":$loc, "TypeRange":$resultTypes,
           "ValueRange":$operands),
      [{
        BlockAndValueMapping bvm;
        OperationState state(
            loc, ConcreteOp::getOperationName(), operands, resultTypes,
            $_op->getAttrs());
        for (Region &r : $_op->getRegions())
          r.cloneInto(state.addRegion(), bvm);
        return b.createOperation(state);
      }]
    >
  ];

  let extraClassDeclaration = [{
    //========================================================================//
    // Helper functions to mutate the `operand_segment_sizes` attribute.
    // These are useful when cloning and changing operand types.
    //========================================================================//
    void setNumInputs(unsigned num) { setOperandSegmentAt(0, num); }
    void setNumOutputBuffers(unsigned num) { setOperandSegmentAt(1, num); }

    private:
    void setOperandSegmentAt(unsigned idx, unsigned val) {
      auto attr = (*this)->getAttr("operand_segment_sizes")
                      .cast<DenseIntElementsAttr>();
      unsigned i = 0;
      auto newAttr = attr.mapValues(IntegerType::get(getContext(), 32),
          [&](const APInt &v) { return (i++ == idx) ? APInt(32, val) : v; });
      getOperation()->setAttr("operand_segment_sizes", newAttr);
    }
  }];

  let verify = [{ return detail::verifyTMTensorOpInterface($_op); }];
}

#endif // TORCH_MLIR_DIALECT_TMTENSOR_INTERFACES
@@ -0,0 +1,23 @@
//===------------------------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Also available under a BSD-style license. See LICENSE.
//
//===----------------------------------------------------------------------===//

#ifndef TORCH_MLIR_DIALECTS_DIALECT_TMTENSOR_IR_SCALARLOOPOPINTERFACE_H_
#define TORCH_MLIR_DIALECTS_DIALECT_TMTENSOR_IR_SCALARLOOPOPINTERFACE_H_

#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Operation.h"
#include "mlir/Interfaces/ViewLikeInterface.h"
#include "mlir/Support/LLVM.h"

/// Include the ODS generated interface header files.
#include "torch-mlir-dialects/Dialect/TMTensor/IR/ScalarLoopOpInterface.h.inc"

#endif // TORCH_MLIR_DIALECTS_DIALECT_TMTENSOR_IR_SCALARLOOPOPINTERFACE_H_
@@ -0,0 +1,41 @@
//===------------------------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Also available under a BSD-style license. See LICENSE.
//
//===----------------------------------------------------------------------===//

#ifndef TORCH_MLIR_DIALECTS_DIALECT_TMTENSOR_IR_TMTENSOROPS_H_
#define TORCH_MLIR_DIALECTS_DIALECT_TMTENSOR_IR_TMTENSOROPS_H_

#include "mlir/IR/Attributes.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Dialect.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/Interfaces/ControlFlowInterfaces.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "torch-mlir-dialects/Dialect/TMTensor/IR/ScalarLoopOpInterface.h"
#include "torch-mlir-dialects/Dialect/TMTensor/IR/TMTensorInterfaces.h"

namespace mlir {
namespace torch {
namespace TMTensor {

/// Returns a `memref.dim` or `tensor.dim` operation to get the shape of `v` at
/// `dim`.
Value getDimValue(OpBuilder &builder, Location loc, Value v, int64_t dim);

/// Returns a `memref.dim` or `tensor.dim` operation to get the shape of `v` at
/// `dim`. If the shape is constant, returns the shape as an `IntegerAttr`.
OpFoldResult getDim(OpBuilder &builder, Location loc, Value v, int64_t dim);

} // namespace TMTensor
} // namespace torch
} // namespace mlir

#define GET_OP_CLASSES
#include "torch-mlir-dialects/Dialect/TMTensor/IR/TMTensorOps.h.inc" // IWYU pragma: export

#endif // TORCH_MLIR_DIALECTS_DIALECT_TMTENSOR_IR_TMTENSOROPS_H_
@@ -0,0 +1,205 @@
//===-------------------------------------------------------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Also available under a BSD-style license. See LICENSE.
//
//===----------------------------------------------------------------------===//

#ifndef TORCH_MLIR_DIALECT_TMTENSOR_OPS
#define TORCH_MLIR_DIALECT_TMTENSOR_OPS

include "torch-mlir-dialects/Dialect/TMTensor/IR/TMTensorBase.td"
include "torch-mlir-dialects/Dialect/TMTensor/IR/TMTensorInterfaces.td"
include "torch-mlir-dialects/Dialect/TMTensor/IR/ScalarLoopOpInterface.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/Interfaces/ControlFlowInterfaces.td"

//===----------------------------------------------------------------------===//
// Base class.
//===----------------------------------------------------------------------===//

class TMTensor_PureOp<string mnemonic, list<OpTrait> traits = []> :
    Op<TMTensor_Dialect, mnemonic, traits> {
}

class TMTensor_Op<string mnemonic, list<OpTrait> traits = []> :
    TMTensor_PureOp<mnemonic, !listconcat(traits,
        [AttrSizedOperandSegments,
         DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
         TMTensorInterface,
         SingleBlockImplicitTerminator<"::mlir::torch::TMTensor::YieldOp">
        ])> {
  let verifier = [{ return verify$cppClass(*this); }];
  let printer = [{ return print$cppClass(p, *this); }];
  let parser = [{ return parse$cppClass(parser, result); }];
  code extraTMTensorOpClassDeclaration = [{
    SmallVector<Value> getDestinationOperands(OpBuilder &b) {
      SmallVector<Value> dest(outputs().begin(), outputs().end());
      return dest;
    }
  }];
}

//===----------------------------------------------------------------------===//
// Non-structured ops
//===----------------------------------------------------------------------===//

def TMTensor_ScanOp : TMTensor_Op<"scan",
    [DeclareOpInterfaceMethods<ScalarLoopOpInterface,
        ["generateScalarImplementation"]>]> {
  let summary = "Scan operator";
  let description = [{
    Computes the inclusive/exclusive scan along a given dimension.
  }];

  let arguments = (ins Variadic<AnyShaped>:$inputs,
                       Variadic<AnyShaped>:$outputs,
                       I64Attr:$dimension,
                       BoolAttr:$inclusive
  );

  let builders = [
    OpBuilder<(ins "ValueRange":$inputs, "ValueRange":$outputs,
        CArg<"int64_t", "0">:$dimension, CArg<"bool", "true">:$inclusive)>
  ];

  let results = (outs Variadic<AnyRankedTensor>:$results);
  let regions = (region AnyRegion:$region);
  let hasFolder = 1;
  let assemblyFormat = [{
    `dimension` `(` $dimension `)`
    `inclusive` `(` $inclusive `)`
    attr-dict
    `ins` `(` $inputs `:` type($inputs) `)`
    `outs` `(` $outputs `:` type($outputs) `)`
    $region (`->` type($results)^)?
  }];

  let extraClassDeclaration = extraTMTensorOpClassDeclaration # [{
    Value input() {
      return getInputOperand(0)->get();
    }
    Value accumulator() {
      return getOutputOperand(1)->get();
    }
    Value output() {
      return getOutputOperand(0)->get();
    }
    ShapedType getOperandType() {
      return input().getType().cast<ShapedType>();
    }
    int64_t getOperandRank() {
      return getOperandType().getRank();
    }
  }];
}
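// For illustration only: a hypothetical use of the scan op, assembled from
// the assemblyFormat above. The shapes and the arith.addi payload are
// invented for this sketch, not taken from the commit.
//
//   %0:2 = tm_tensor.scan dimension(0) inclusive(true)
//       ins(%input : tensor<128xi32>)
//       outs(%out, %acc : tensor<128xi32>, tensor<i32>) {
//     ^bb0(%arg0: i32, %arg1: i32):
//       // Combine the running value with the next element.
//       %sum = arith.addi %arg0, %arg1 : i32
//       tm_tensor.yield %sum : i32
//   } -> tensor<128xi32>, tensor<i32>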

def TMTensor_ScatterOp : TMTensor_Op<"scatter",
    [DeclareOpInterfaceMethods<ScalarLoopOpInterface,
        ["generateScalarImplementation"]>]> {
  let summary = "Scatter operator";
  let description = [{
    Based on XLA operation semantics, takes two `inputs` (`updates` and
    `indices`) and one `outputs` value (`original`). The operation updates
    the values at the slices specified by `indices` by combining the
    current value with the value in `updates` using the computation
    specified in `region`. The `region` specifies a binary operation
    of signature (T, T) -> T, where `T` is the element type of
    `updates` (and `original`). The first argument corresponds to the
    value to be updated (i.e. from `updates`), and the second to the
    current value (i.e. the value from `original`).

    `indices` is a 2D tensor/memref type. The first dim is the number of
    updates, and the second dim is the index depth. The index depth should
    always be static.

    The first dims of `updates` and `indices` are identical, since they
    represent the number of updates.

    The rank of the `original`/`result` is `index_depth + rank(%updates) - 1`.
    The first `index_depth` indices are derived from `indices`, and the shape
    of an update value must match the remaining shape of `original`.

    The shape definitions follow the TensorFlow operation, except that this op
    forces batch dims to be 1D. See more information in
    https://www.tensorflow.org/api_docs/python/tf/tensor_scatter_nd_update
  }];
  let arguments = (ins
      Variadic<AnyRankedTensorOrMemRefType>:$inputs,
      Variadic<AnyRankedTensorOrMemRefType>:$outputs
  );
  let results = (outs Variadic<AnyRankedTensor>:$results);
  let regions = (region AnyRegion:$region);
  let assemblyFormat = [{
    attr-dict (`ins` `(` $inputs^ `:` type($inputs) `)`)?
    `outs` `(` $outputs `:` type($outputs) `)`
    $region (`->` type($results)^)?
  }];
  let extraClassDeclaration = extraTMTensorOpClassDeclaration # [{

    int64_t getIndexDepth() {
      return getInputOperand(1)
          ->get()
          .getType()
          .cast<ShapedType>()
          .getShape()
          .back();
    }

    Value updates() {
      return getInputOperand(0)->get();
    }

    ShapedType getUpdateType() {
      return updates().getType().cast<ShapedType>();
    }

    Value indices() {
      return getInputOperand(1)->get();
    }

    ShapedType getIndicesType() {
      return indices().getType().cast<ShapedType>();
    }

    Value original() {
      return getOutputOperand(0)->get();
    }

    ShapedType getOriginalType() {
      return original().getType().cast<ShapedType>();
    }

    int64_t getUpdateSliceRank() {
      return updates().getType().cast<ShapedType>().getRank() - 1;
    }

    bool isScalarUpdate() {
      return getUpdateSliceRank() == 0;
    }
  }];
}
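// Again purely illustrative and hypothetical: a scatter that overwrites the
// slices of %original selected by %indices, following the
// (update, current) -> T region convention described above. All values and
// shapes here are invented.
//
//   %0 = tm_tensor.scatter
//       ins(%updates, %indices : tensor<4xi32>, tensor<4x1xi32>)
//       outs(%original : tensor<8xi32>) {
//     ^bb0(%update: i32, %current: i32):
//       // Keep the update value, ignoring the current one.
//       tm_tensor.yield %update : i32
//   } -> tensor<8xi32>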

//===----------------------------------------------------------------------===//
// Pure ops
//===----------------------------------------------------------------------===//

def TMTensor_YieldOp : TMTensor_PureOp<"yield", [NoSideEffect, ReturnLike, Terminator]> {
  let summary = "TMTensor yield op";
  let description = [{
    `tm_tensor.yield` is a special terminator operation for blocks inside
    regions in `tm_tensor` ops.
  }];

  let arguments = (ins Variadic<AnyType>:$operands);

  let builders = [
    OpBuilder<(ins), [{ /* nothing to do */ }]>,
  ];

  let assemblyFormat = "attr-dict ($operands^ `:` type($operands))?";
}

#endif // TORCH_MLIR_DIALECT_TMTENSOR_OPS
@@ -0,0 +1,5 @@
set(LLVM_TARGET_DEFINITIONS Passes.td)
mlir_tablegen(Passes.h.inc -gen-pass-decls)
mlir_tablegen(Passes.capi.h.inc -gen-pass-capi-header)
mlir_tablegen(Passes.capi.cpp.inc -gen-pass-capi-impl)
add_public_tablegen_target(TorchMLIRTMTensorTransformsPassesIncGen)
@@ -0,0 +1,26 @@
//===- PassDetail.h - TMTensor Pass class details --------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Also available under a BSD-style license. See LICENSE.
//
//===----------------------------------------------------------------------===//

#ifndef TORCH_MLIR_DIALECTS_DIALECT_TMTENSOR_TRANSFORMS_PASS_DETAIL_H_
#define TORCH_MLIR_DIALECTS_DIALECT_TMTENSOR_TRANSFORMS_PASS_DETAIL_H_

#include "mlir/Pass/Pass.h"

namespace mlir {
namespace torch {
namespace TMTensor {

#define GEN_PASS_CLASSES
#include "torch-mlir-dialects/Dialect/TMTensor/Transforms/Passes.h.inc" // IWYU pragma: keep

} // namespace TMTensor
} // namespace torch
} // namespace mlir

#endif // TORCH_MLIR_DIALECTS_DIALECT_TMTENSOR_TRANSFORMS_PASS_DETAIL_H_
@@ -0,0 +1,27 @@
//===------------------------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Also available under a BSD-style license. See LICENSE.
//
//===----------------------------------------------------------------------===//

#ifndef TORCH_MLIR_DIALECTS_DIALECT_TMTENSOR_TRANSFORMS_PASSES_H_
#define TORCH_MLIR_DIALECTS_DIALECT_TMTENSOR_TRANSFORMS_PASSES_H_

#include "mlir/Pass/Pass.h"

namespace mlir {
namespace torch {
namespace TMTensor {

std::unique_ptr<OperationPass<FuncOp>> createTMTensorToLoopsPass();

void registerPasses();

} // namespace TMTensor
} // namespace torch
} // namespace mlir

#endif // TORCH_MLIR_DIALECTS_DIALECT_TMTENSOR_TRANSFORMS_PASSES_H_
@@ -0,0 +1,21 @@
//===-------------------------------------------------------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Also available under a BSD-style license. See LICENSE.
//
//===----------------------------------------------------------------------===//

#ifndef TORCH_MLIR_DIALECT_TMTENSOR_PASSES
#define TORCH_MLIR_DIALECT_TMTENSOR_PASSES

include "mlir/Pass/PassBase.td"

def TMTensorToLoops :
    Pass<"torch-mlir-tm-tensor-to-loops", "FuncOp"> {
  let summary = "Convert TMTensor ops to loops and Linalg ops.";
  let constructor = "mlir::torch::TMTensor::createTMTensorToLoopsPass()";
}

#endif // TORCH_MLIR_DIALECT_TMTENSOR_PASSES
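As a rough sketch of the kind of IR this pass emits through
`generateScalarImplementation` — a hand-written approximation for a 1-D
inclusive scan with buffer semantics, assuming arguments %in and %out of type
memref<128xi32>; this is not verified pass output:

    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c128 = arith.constant 128 : index
    scf.for %i = %c0 to %c128 step %c1 {
      %first = arith.cmpi eq, %i, %c0 : index
      scf.if %first {
        // The first element passes through unchanged.
        %v = memref.load %in[%i] : memref<128xi32>
        memref.store %v, %out[%i] : memref<128xi32>
      } else {
        // Combine the previous partial result with the current element.
        %iprev = arith.subi %i, %c1 : index
        %prev = memref.load %out[%iprev] : memref<128xi32>
        %v = memref.load %in[%i] : memref<128xi32>
        %sum = arith.addi %prev, %v : i32
        memref.store %sum, %out[%i] : memref<128xi32>
      }
    }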
@@ -0,0 +1 @@
add_subdirectory(Dialect)
@@ -0,0 +1 @@
add_subdirectory(TMTensor)
@@ -0,0 +1,2 @@
add_subdirectory(IR)
add_subdirectory(Transforms)
@@ -0,0 +1,29 @@
add_mlir_library(TorchMLIRTMTensorDialect
  TMTensorDialect.cpp
  TMTensorInterfaces.cpp
  TMTensorOps.cpp
  ScalarLoopOpInterface.cpp

  ADDITIONAL_HEADER_DIRS
  ${TORCH_MLIR_DIALECTS_SOURCE_DIR}/include

  DEPENDS
  TorchMLIRTMTensorOpsIncGen

  LINK_LIBS PUBLIC
  MLIRAffine
  MLIRDialectUtils
  MLIRIR
  MLIRLinalg
  MLIRMath
  MLIRMemRef
  MLIRPass
  MLIRSideEffectInterfaces
  MLIRSupport
  MLIRSCF
  MLIRStandard
  MLIRTensor
  MLIRViewLikeInterface
)

torch_mlir_dialects_target_includes(TorchMLIRTMTensorDialect)
@@ -0,0 +1,24 @@
//===------------------------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Also available under a BSD-style license. See LICENSE.
//
//===----------------------------------------------------------------------===//

#include "torch-mlir-dialects/Dialect/TMTensor/IR/ScalarLoopOpInterface.h"

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "torch-mlir-tiled-op-interface"

using namespace mlir;
using namespace mlir::torch::TMTensor;

#include "torch-mlir-dialects/Dialect/TMTensor/IR/ScalarLoopOpInterface.cpp.inc"
@@ -0,0 +1,30 @@
//===------------------------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Also available under a BSD-style license. See LICENSE.
//
//===----------------------------------------------------------------------===//

#include "torch-mlir-dialects/Dialect/TMTensor/IR/TMTensorDialect.h"
#include "torch-mlir-dialects/Dialect/TMTensor/IR/TMTensorOps.h"

#include "mlir/IR/Attributes.h"
#include "mlir/IR/DialectImplementation.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/OpImplementation.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/SourceMgr.h"

using namespace mlir;
using namespace mlir::torch::TMTensor;

void TMTensorDialect::initialize() {
#define GET_OP_LIST
  addOperations<
#include "torch-mlir-dialects/Dialect/TMTensor/IR/TMTensorOps.cpp.inc"
      >();
}

#include "torch-mlir-dialects/Dialect/TMTensor/IR/TMTensorDialect.cpp.inc"
@@ -0,0 +1,54 @@
//===------------------------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Also available under a BSD-style license. See LICENSE.
//
//===----------------------------------------------------------------------===//

#include "torch-mlir-dialects/Dialect/TMTensor/IR/TMTensorInterfaces.h"

using namespace mlir;
using namespace mlir::torch;
using namespace mlir::torch::TMTensor;

OpOperandVector::operator SmallVector<Value>() {
  SmallVector<Value> result;
  result.reserve(this->size());
  llvm::transform(*this, std::back_inserter(result),
                  [](OpOperand *opOperand) { return opOperand->get(); });
  return result;
}

LogicalResult
mlir::torch::TMTensor::detail::verifyTMTensorOpInterface(Operation *op) {
  TMTensorOp tmTensorOp = cast<TMTensorOp>(op);
  if (op->getNumResults()) {
    if (!tmTensorOp.hasTensorSemantics()) {
      return tmTensorOp.emitOpError(
          "expected inputs and outputs to be RankedTensorType or scalar");
    }

    if (op->getNumResults() != tmTensorOp.outputs().size()) {
      return tmTensorOp.emitOpError(
          "expected number of outputs to be same as the number of results");
    }
    for (auto en : llvm::enumerate(op->getResultTypes())) {
      Type outputType = tmTensorOp.outputs()[en.index()].getType();
      if (en.value() != outputType) {
        return tmTensorOp.emitOpError("expected type of `outs` operand #")
               << en.index() << " " << outputType
               << " to be same as result type " << en.value();
      }
    }
  } else {
    if (!tmTensorOp.hasBufferSemantics()) {
      return tmTensorOp.emitOpError(
          "expected inputs and outputs to be MemRefType or scalar");
    }
  }
  return success();
}

#include "torch-mlir-dialects/Dialect/TMTensor/IR/TMTensorOpInterfaces.cpp.inc" // IWYU pragma: export
@ -0,0 +1,483 @@
//===------------------------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Also available under a BSD-style license. See LICENSE.
//
//===----------------------------------------------------------------------===//

#include "torch-mlir-dialects/Dialect/TMTensor/IR/TMTensorOps.h"

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Dialect/StandardOps/Utils/Utils.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Diagnostics.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/OperationSupport.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/TypeUtilities.h"
#include "mlir/IR/Value.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Support/LogicalResult.h"
#include "torch-mlir-dialects/Dialect/TMTensor/IR/TMTensorDialect.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/SMLoc.h"

using namespace mlir;
using namespace mlir::torch;
using namespace mlir::torch::TMTensor;

//===----------------------------------------------------------------------===//
// Utils.
//===----------------------------------------------------------------------===//

static void getEffectsImpl(
    SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
        &effects,
    ValueRange results, ValueRange inputBuffers, ValueRange outputBuffers) {
  for (Value value : results) {
    effects.emplace_back(MemoryEffects::Allocate::get(), value,
                         SideEffects::DefaultResource::get());
  }
  for (Value value : inputBuffers) {
    effects.emplace_back(MemoryEffects::Read::get(), value,
                         SideEffects::DefaultResource::get());
  }
  for (Value value : outputBuffers) {
    effects.emplace_back(MemoryEffects::Read::get(), value,
                         SideEffects::DefaultResource::get());
    effects.emplace_back(MemoryEffects::Write::get(), value,
                         SideEffects::DefaultResource::get());
  }
}

Value TMTensor::getDimValue(OpBuilder &builder, Location loc, Value v,
                            int64_t dim) {
  return TypeSwitch<Type, Value>(v.getType())
      .Case<RankedTensorType>([&](RankedTensorType t) -> Value {
        return builder.create<tensor::DimOp>(loc, v, dim);
      })
      .Case<MemRefType>([&](MemRefType t) -> Value {
        return builder.create<memref::DimOp>(loc, v, dim);
      })
      .Default([&](Type t) { return Value(); });
}

OpFoldResult TMTensor::getDim(OpBuilder &builder, Location loc, Value v,
                              int64_t dim) {
  auto t = v.getType().cast<ShapedType>();
  if (t.isDynamicDim(dim)) {
    return getDimValue(builder, loc, v, dim);
  }
  return builder.getI64IntegerAttr(t.getDimSize(dim));
}
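
// For example, querying dim 0 of a value of type tensor<4x?xf32> folds to the
// constant attribute 4, while dim 1 materializes a tensor.dim op.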

//===----------------------------------------------------------------------===//
// ScanOp
//===----------------------------------------------------------------------===//

static LogicalResult verifyScanOp(ScanOp op) {
  if (op.getNumInputs() != 1) {
    return op.emitOpError("expected one input operand");
  }
  if (op.getNumOutputs() != 2) {
    return op.emitOpError("expected two output operands");
  }
  if (!op.input().getType().isa<ShapedType>()) {
    return op.emitOpError("expected first input to be of shaped type");
  }
  auto accumulatorType = op.accumulator().getType().cast<ShapedType>();
  auto inputType = op.input().getType().cast<ShapedType>();
  auto outputType = op.output().getType().cast<ShapedType>();
  ArrayRef<int64_t> inputShapes = inputType.getShape();
  ArrayRef<int64_t> outputShapes = outputType.getShape();
  if (accumulatorType.getElementType() != inputType.getElementType()) {
    return op.emitOpError(
        "expected input/accumulator element types to be identical");
  }
  ArrayRef<int64_t> accumulatorShape = accumulatorType.getShape();
  int64_t accumulatorRank = accumulatorType.getRank();
  if (accumulatorRank != inputType.getRank() - 1) {
    return op.emitOpError(
        "expected accumulator rank to be equal to input rank - 1");
  }
  SmallVector<int64_t> expectedAccumulatorShape;
  for (size_t i = 0; i < (size_t)inputType.getRank(); i++) {
    if (i != op.dimension())
      expectedAccumulatorShape.push_back(inputShapes[i]);
  }
  if (llvm::any_of(llvm::zip(expectedAccumulatorShape, accumulatorShape),
                   [](std::tuple<int64_t, int64_t> s) {
                     return std::get<0>(s) != ShapedType::kDynamicSize &&
                            std::get<1>(s) != ShapedType::kDynamicSize &&
                            std::get<0>(s) != std::get<1>(s);
                   })) {
    return op.emitOpError("incompatible input/accumulator shapes");
  }
  if (inputType.getElementType() != outputType.getElementType()) {
    return op.emitOpError(
        "expected input/output element types to be identical");
  }
  if (inputShapes.size() != outputShapes.size()) {
    return op.emitOpError("expected input/output to have identical ranks");
  }
  if (llvm::any_of(llvm::zip(inputShapes, outputShapes),
                   [](std::tuple<int64_t, int64_t> s) {
                     return std::get<0>(s) != ShapedType::kDynamicSize &&
                            std::get<1>(s) != ShapedType::kDynamicSize &&
                            std::get<0>(s) != std::get<1>(s);
                   })) {
    return op.emitOpError("incompatible input/output shapes");
  }
  return success();
}

SmallVector<Range> ScanOp::getIterationDomain(OpBuilder &builder) {
  int64_t operandRank = getOperandRank();
  SmallVector<Range> loopBounds(operandRank);
  Location loc = getLoc();
  Value zero = builder.create<arith::ConstantIndexOp>(loc, 0);
  Value one = builder.create<arith::ConstantIndexOp>(loc, 1);
  Value source = input();
  for (auto dim : llvm::seq<int64_t>(0, operandRank)) {
    loopBounds[dim].offset = zero;
    loopBounds[dim].size = getDimValue(builder, loc, source, dim);
    loopBounds[dim].stride = one;
  }
  return loopBounds;
}

SmallVector<StringRef> ScanOp::getLoopIteratorTypes() {
  SmallVector<StringRef> iteratorTypes(getOperandRank(),
                                       getParallelIteratorTypeName());
  iteratorTypes[dimension()] = getReductionIteratorTypeName();
  return iteratorTypes;
}

// Generates a naive scalar implementation of scan for a given operator f.
// For inclusive,
//     output[0] = input[0]
//     output[i] = f(output[i-1], input[i])
//
// For exclusive,
//     output[0] = 0
//     output[i] = f(output[i-1], input[i-1])
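//
// For example, an inclusive scan of [1, 2, 3] under addition produces
// [1, 3, 6], while the exclusive variant (with the accumulator initialized
// to 0) produces [0, 1, 3].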

LogicalResult ScanOp::generateScalarImplementation(OpBuilder &b, Location loc,
                                                   ValueRange ivs) {
  SmallVector<Value> indices, scanBlkArgs;
  indices.append(ivs.begin(), ivs.end());
  Value zero = b.create<arith::ConstantIndexOp>(loc, 0);
  Value one = b.create<arith::ConstantIndexOp>(loc, 1);
  uint64_t scanDim = dimension();
  Value cond = b.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq,
                                       indices[scanDim], zero);
  bool isInclusive = inclusive();
  SmallVector<Value> accIndices;
  for (size_t i = 0; i < indices.size(); i++) {
    if (i != scanDim)
      accIndices.push_back(indices[i]);
  }

  auto scfIf = b.create<scf::IfOp>(
      loc, TypeRange{}, cond,
      [&](OpBuilder &b, Location loc) {
        if (isInclusive) {
          auto value = b.create<memref::LoadOp>(loc, input(), indices);
          b.create<memref::StoreOp>(loc, value, output(), indices);
        } else {
          auto value = b.create<memref::LoadOp>(loc, accumulator(), accIndices);
          b.create<memref::StoreOp>(loc, value, output(), indices);
        }
        b.create<scf::YieldOp>(loc);
      },
      [&](OpBuilder &b, Location loc) {
        SmallVector<Value> indices(ivs.begin(), ivs.end());
        Value iv = indices[scanDim];
        Value ivMinusOne = b.create<arith::SubIOp>(loc, iv, one);
        indices[scanDim] = ivMinusOne;
        scanBlkArgs.push_back(b.create<memref::LoadOp>(loc, output(), indices));
        Value i0;
        if (!isInclusive)
          i0 = b.create<memref::LoadOp>(loc, input(), indices);
        indices[scanDim] = iv;
        if (isInclusive)
          i0 = b.create<memref::LoadOp>(loc, input(), indices);
        scanBlkArgs.push_back(i0);
      });

  auto &srcBlock = region().front();
  Region &region = scfIf.getElseRegion();
  BlockAndValueMapping bvm;
  {
    OpBuilder::InsertionGuard guard(b);
    auto &block = region.front();
    b.setInsertionPointToEnd(&block);
    for (auto it : llvm::zip(srcBlock.getArguments(), scanBlkArgs)) {
      bvm.map(std::get<0>(it), std::get<1>(it));
    }
    for (auto &blockOp : srcBlock.without_terminator()) {
      b.clone(blockOp, bvm);
    }
    b.create<memref::StoreOp>(
        loc, bvm.lookupOrDefault(srcBlock.getTerminator()->getOperand(0)),
        output(), indices);
    b.create<memref::StoreOp>(
        loc, bvm.lookupOrDefault(srcBlock.getTerminator()->getOperand(0)),
        accumulator(), accIndices);
    b.create<scf::YieldOp>(loc);
  }
  return success();
}

static LogicalResult foldMemRefCast(Operation *op) {
  bool folded = false;
  for (OpOperand &operand : op->getOpOperands()) {
    auto castOp = operand.get().getDefiningOp<memref::CastOp>();
    if (castOp && memref::CastOp::canFoldIntoConsumerOp(castOp)) {
      operand.set(castOp.getOperand());
      folded = true;
    }
  }
  return success(folded);
}

LogicalResult ScanOp::fold(ArrayRef<Attribute>,
                           SmallVectorImpl<OpFoldResult> &) {
  return foldMemRefCast(*this);
}

//===----------------------------------------------------------------------===//
// ScatterOp
//===----------------------------------------------------------------------===//
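//
// tm_tensor.scatter writes each row of `updates` into (or combines it with)
// the slice of `original` addressed by the matching row of `indices`. For
// example, with original = [0, 0, 0, 0], indices = [[3], [1]], and
// updates = [7, 9], a region that simply yields the update value produces
// [0, 9, 0, 7].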
static LogicalResult verifyScatterOp(ScatterOp op) {
  if (op.inputs().size() != 2) {
    return op.emitOpError("expected two input operands");
  }
  if (op.outputs().size() != 1) {
    return op.emitOpError("expected one output operand");
  }
  auto checkDimensionsMatch = [&](ShapedType t1, ShapedType t2, unsigned dim) {
    return t1.getShape()[dim] == t2.getShape()[dim];
  };

  auto indicesType = op.getIndicesType();
  if (indicesType.getRank() != 2 ||
      !indicesType.getElementType().isInteger(32)) {
    return op.emitOpError(
        "expected indices to be of rank 2 with i32 element type");
  }
  auto indexDepth = op.getIndexDepth();
  if (indexDepth == ShapedType::kDynamicSize) {
    return op.emitOpError("expected index depth to be static");
  }

  // The first dimension of the indices should match the first dimension of the
  // output. They indicate the number of updates.
  auto updateType = op.getUpdateType();
  if (updateType.getRank() < 1) {
    return op.emitOpError("expected update value to be at least rank 1");
  }
  if (!checkDimensionsMatch(indicesType, updateType, 0)) {
    return op.emitOpError(
        "mismatch in shape of indices and update value at dim#0");
  }
  auto originalType = op.getOriginalType();
  // indexDepth + update dims should match the original dims. The first dim of
  // update is the number of updates.
  if (originalType.getRank() != indexDepth + updateType.getRank() - 1) {
    return op.emitOpError(
        "mismatch in rank of update value, index depth and original value");
  }
  for (auto dim : llvm::seq<unsigned>(indexDepth, originalType.getRank())) {
    // Offset by one because the first dim is the number of updates.
    if (updateType.getDimSize(1 + dim - indexDepth) !=
        originalType.getDimSize(dim)) {
      return op.emitOpError("mismatch in shape of update value dim#")
             << (1 + dim - indexDepth) << " and original value at dim#" << dim;
    }
  }
  Region &region = op.region();
  Block *body = &region.front();
  if (body->getNumArguments() != 2) {
    return op.emitOpError("expected region to have two arguments");
  }
  Type arg0Type = body->getArgument(0).getType();
  Type arg1Type = body->getArgument(1).getType();
  if (!arg0Type.isIntOrFloat() || !arg1Type.isIntOrFloat()) {
    return op.emitOpError(
        "expected region to have scalar arguments of integer or float types");
  }
  if (arg0Type != updateType.getElementType()) {
    return op.emitOpError("mismatch in argument 0 of region ")
           << arg0Type << " and element type of update value "
           << updateType.getElementType();
  }
  if (arg1Type != originalType.getElementType()) {
    return op.emitOpError("mismatch in argument 1 of region ")
           << arg1Type << " and element type of original value "
           << originalType.getElementType();
  }
  if (arg0Type != arg1Type) {
    return op.emitOpError("mismatch in region argument types ")
           << arg0Type << " and " << arg1Type;
  }
  auto yieldOp = cast<TMTensor::YieldOp>(body->getTerminator());
  if (yieldOp->getNumOperands() != 1) {
    return yieldOp.emitOpError("expected region to yield a single value");
  }
  auto yieldedType = yieldOp->getOperand(0).getType();
  if (yieldedType != arg0Type) {
    return yieldOp.emitOpError("mismatch in type of yielded value ")
           << yieldedType << " and argument of the region " << arg0Type;
  }
  return success();
}

SmallVector<StringRef> ScatterOp::getLoopIteratorTypes() {
  SmallVector<StringRef> iteratorTypes(getUpdateType().getRank(),
                                       getParallelIteratorTypeName());
  return iteratorTypes;
}

SmallVector<Range> ScatterOp::getIterationDomain(OpBuilder &builder) {
  Location loc = getLoc();
  Value zero = builder.create<arith::ConstantIndexOp>(loc, 0);
  Value one = builder.create<arith::ConstantIndexOp>(loc, 1);
  SmallVector<Range> ranges;
  for (auto dim : llvm::seq<int64_t>(0, getUpdateType().getRank())) {
    Value ub = getDimValue(builder, loc, updates(), dim);
    ranges.emplace_back(Range{zero, ub, one});
  }
  return ranges;
}

LogicalResult ScatterOp::generateScalarImplementation(OpBuilder &b,
                                                      Location loc,
                                                      ValueRange ivs) {
  auto indexDepth = getIndexDepth();
  Value update = b.create<memref::LoadOp>(loc, updates(), ivs);
  SmallVector<Value> starts;
  SmallVector<Value> loadIndices;
  loadIndices.push_back(ivs.front());
  loadIndices.push_back(Value());
  for (auto i : llvm::seq<unsigned>(0, indexDepth)) {
    loadIndices.back() = b.create<arith::ConstantIndexOp>(loc, i);
    Value idx = b.create<memref::LoadOp>(loc, indices(), loadIndices);
    starts.push_back(b.create<arith::IndexCastOp>(loc, b.getIndexType(), idx));
  }
  starts.append(std::next(ivs.begin()), ivs.end());
  Value init = b.create<memref::LoadOp>(loc, original(), starts);

  BlockAndValueMapping bvm;
  Block &block = region().front();
  bvm.map(block.getArgument(0), update);
  bvm.map(block.getArgument(1), init);
  for (auto &blockOp : block.without_terminator()) {
    b.clone(blockOp, bvm);
  }
  // The last op is the tm_tensor.yield op. Store its operand to the
  // destination.
  b.create<memref::StoreOp>(
      loc, bvm.lookupOrDefault(block.getTerminator()->getOperand(0)),
      original(), starts);
  return success();
}

#define DEFINE_OP_GET_EFFECTS(OP_NAME)                                        \
  void OP_NAME::getEffects(                                                   \
      SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>     \
          &effects) {                                                         \
    SmallVector<Value> inputBuffers = getInputBufferOperands();               \
    SmallVector<Value> outputBuffers = getOutputBufferOperands();             \
    getEffectsImpl(effects, getOperation()->getResults(), inputBuffers,       \
                   outputBuffers);                                            \
  }

DEFINE_OP_GET_EFFECTS(ScanOp)
DEFINE_OP_GET_EFFECTS(ScatterOp)

namespace {
/// This is derived from mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp without any
/// changes.
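///
/// For example, it lets the canonicalizer rewrite
///   %0 = tensor.cast %arg : tensor<128xi32> to tensor<?xi32>
///   %1 = tm_tensor.scan ... ins(%0 : tensor<?xi32>) ...
/// so that the scan consumes %arg with its static type directly, casting the
/// results back when needed (see the canonicalize test below).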
struct FoldTensorCastOp : public OpInterfaceRewritePattern<TMTensorOp> {
  using OpInterfaceRewritePattern<TMTensorOp>::OpInterfaceRewritePattern;

  LogicalResult matchAndRewrite(TMTensorOp op,
                                PatternRewriter &rewriter) const override {
    // If no operand comes from a tensor::CastOp that can be folded, fail.
    bool hasTensorCastOperand =
        llvm::any_of(op.getInputAndOutputOperands(), [&](OpOperand *opOperand) {
          if (opOperand->get().isa<BlockArgument>())
            return false;
          auto castOp = opOperand->get().getDefiningOp<tensor::CastOp>();
          return castOp && canFoldIntoConsumerOp(castOp);
        });
    if (!hasTensorCastOperand)
      return failure();

    SmallVector<Type, 4> newResultTypes;
    newResultTypes.reserve(op->getNumResults());
    SmallVector<Value, 4> newOperands;
    newOperands.reserve(op->getNumOperands());
    // Inputs may fold.
    for (OpOperand *opOperand : op.getInputOperands()) {
      auto tensorCastOp = opOperand->get().getDefiningOp<tensor::CastOp>();
      newOperands.push_back(canFoldIntoConsumerOp(tensorCastOp)
                                ? tensorCastOp.source()
                                : opOperand->get());
    }
    // Init tensors may fold, in which case the resultType must also change.
    for (OpOperand *opOperand : op.getOutputOperands()) {
      auto tensorCastOp = opOperand->get().getDefiningOp<tensor::CastOp>();
      bool fold = canFoldIntoConsumerOp(tensorCastOp);
      newOperands.push_back(fold ? tensorCastOp.getOperand()
                                 : opOperand->get());
      newResultTypes.push_back(newOperands.back().getType());
    }
    // Clone op.
    Operation *newOp =
        op.clone(rewriter, op->getLoc(), newResultTypes, newOperands);
    SmallVector<Value, 4> replacements;
    replacements.reserve(newOp->getNumResults());
    for (auto result : llvm::zip(op->getResults(), newOp->getResults())) {
      Value oldResult = std::get<0>(result);
      Value newResult = std::get<1>(result);
      if (newResult.getType() != oldResult.getType()) {
        replacements.push_back(rewriter.create<tensor::CastOp>(
            op->getLoc(), oldResult.getType(), newResult));
      } else {
        replacements.push_back(newResult);
      }
    }
    rewriter.replaceOp(op, replacements);

    return success();
  }
};
} // namespace

//===----------------------------------------------------------------------===//
// TMTensorDialect
//===----------------------------------------------------------------------===//

void TMTensorDialect::getCanonicalizationPatterns(
    RewritePatternSet &results) const {
  results.add<FoldTensorCastOp>(getContext());
}

#define GET_OP_CLASSES
#include "torch-mlir-dialects/Dialect/TMTensor/IR/TMTensorOps.cpp.inc"
@ -0,0 +1,22 @@
add_mlir_library(TorchMLIRTMTensorPasses
  ConvertToLoops.cpp
  Passes.cpp

  DEPENDS
  TorchMLIRTMTensorTransformsPassesIncGen

  LINK_LIBS PUBLIC
  TorchMLIRTMTensorDialect
  MLIRAffine
  MLIRIR
  MLIRLinalg
  MLIRLinalgTransforms
  MLIRMath
  MLIRMemRef
  MLIRPass
  MLIRSCF
  MLIRStandard
  MLIRSupport
  MLIRTensor
  MLIRTransforms
)
@ -0,0 +1,117 @@
//===------------------------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Also available under a BSD-style license. See LICENSE.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "torch-mlir-dialects/Dialect/TMTensor/IR/TMTensorDialect.h"
#include "torch-mlir-dialects/Dialect/TMTensor/IR/TMTensorOps.h"
#include "torch-mlir-dialects/Dialect/TMTensor/Transforms/PassDetail.h"
#include "torch-mlir-dialects/Dialect/TMTensor/Transforms/Passes.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

using namespace mlir;
using namespace mlir::torch::TMTensor;

/// Recursive method that lowers the `ScalarLoopOpInterface` op to scalar
/// loops, one dimension of the iteration domain at a time.
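///
/// For a 2-D iteration domain this produces a nest of the form
///   scf.for %i = %lb0 to %ub0 step %s0 {
///     scf.for %j = %lb1 to %ub1 step %s1 {
///       <scalar implementation at (%i, %j)>
///     }
///   }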
static LogicalResult lowerToLoopsImpl(OpBuilder &builder,
                                      ScalarLoopOpInterface scalarLoopOp,
                                      ArrayRef<Range> loopRanges,
                                      unsigned loopDepth,
                                      SmallVectorImpl<Value> &ivs) {
  Location loc = scalarLoopOp.getLoc();
  if (loopDepth == loopRanges.size()) {
    return scalarLoopOp.generateScalarImplementation(builder, loc, ivs);
  }
  LogicalResult status = success();
  builder.create<scf::ForOp>(
      loc, loopRanges[loopDepth].offset, loopRanges[loopDepth].size,
      loopRanges[loopDepth].stride, ValueRange{},
      [&](OpBuilder &b, Location loc, Value iv, ValueRange args) {
        ivs.push_back(iv);
        status =
            lowerToLoopsImpl(b, scalarLoopOp, loopRanges, loopDepth + 1, ivs);
        b.create<scf::YieldOp>(loc);
      });
  return status;
}

/// Main entry point for lowering a `ScalarLoopOpInterface` op to loops.
static LogicalResult lowerToLoops(OpBuilder &builder,
                                  ScalarLoopOpInterface scalarLoopOp) {
  SmallVector<Range> loopBounds = scalarLoopOp.getIterationDomain(builder);
  SmallVector<Value> ivs;
  return lowerToLoopsImpl(builder, scalarLoopOp, loopBounds, 0, ivs);
}

/// Pattern rewriter hook to lower a `ScalarLoopOpInterface` to loops.
namespace {
struct ScalarLoopOpInterfaceLowerToLoopsPattern : public RewritePattern {
  ScalarLoopOpInterfaceLowerToLoopsPattern(MLIRContext *context,
                                           PatternBenefit benefit = 1)
      : RewritePattern(MatchAnyOpTypeTag(), benefit, context) {}

  LogicalResult matchAndRewrite(Operation *op,
                                PatternRewriter &rewriter) const override {
    auto scalarLoopOp = dyn_cast<ScalarLoopOpInterface>(op);
    if (!scalarLoopOp) {
      return failure();
    }
    // Ops with tensor results cannot be lowered to loops; the op must have
    // buffer (memref) semantics.
    if (llvm::any_of(scalarLoopOp->getResults(),
                     [&](Value v) { return v.getType().isa<ShapedType>(); })) {
      return rewriter.notifyMatchFailure(
          scalarLoopOp, "lower to loops needs to have buffer semantics");
    }
    if (failed(lowerToLoops(rewriter, scalarLoopOp))) {
      return failure();
    }
    rewriter.eraseOp(op);
    return success();
  }
};
} // namespace

//===----------------------------------------------------------------------===//
// Pass
//===----------------------------------------------------------------------===//

namespace {
struct TMTensorToLoopsPass : public TMTensorToLoopsBase<TMTensorToLoopsPass> {
  void getDependentDialects(DialectRegistry &registry) const override {
    registry.insert<linalg::LinalgDialect, StandardOpsDialect,
                    mlir::arith::ArithmeticDialect, math::MathDialect,
                    memref::MemRefDialect, scf::SCFDialect>();
  }

  void runOnOperation() override {
    MLIRContext *context = &getContext();

    RewritePatternSet patterns(context);
    patterns.insert<ScalarLoopOpInterfaceLowerToLoopsPattern>(context);
    if (failed(applyPatternsAndFoldGreedily(getOperation(),
                                            std::move(patterns)))) {
      return signalPassFailure();
    }
  }
};
} // namespace

std::unique_ptr<OperationPass<FuncOp>>
torch::TMTensor::createTMTensorToLoopsPass() {
  return std::make_unique<TMTensorToLoopsPass>();
}
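
// The pass is exposed as -torch-mlir-tm-tensor-to-loops and can be exercised
// with, e.g.:
//   torch-mlir-dialects-opt -torch-mlir-tm-tensor-to-loops input.mlir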
@ -0,0 +1,33 @@
//===------------------------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Also available under a BSD-style license. See LICENSE.
//
//===----------------------------------------------------------------------===//

#include "torch-mlir-dialects/Dialect/TMTensor/Transforms/Passes.h"

#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassRegistry.h"
#include "mlir/Transforms/Passes.h"

using namespace mlir;

namespace mlir {
namespace torch {
namespace TMTensor {

namespace detail {
#define GEN_PASS_REGISTRATION
#include "torch-mlir-dialects/Dialect/TMTensor/Transforms/Passes.h.inc" // IWYU pragma: export
} // namespace detail

} // namespace TMTensor
} // namespace torch
} // namespace mlir

void torch::TMTensor::registerPasses() {
  torch::TMTensor::detail::registerPasses();
}
@ -0,0 +1,19 @@
configure_lit_site_cfg(
  ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in
  ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py
  MAIN_CONFIG
  ${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py
)

set(TORCH_MLIR_DIALECTS_TEST_DEPENDS
  FileCheck count not
  torch-mlir-dialects-opt
)

add_lit_testsuite(check-torch-mlir-dialects "Running the torch-mlir-dialects regression tests"
  ${CMAKE_CURRENT_BINARY_DIR}
  DEPENDS ${TORCH_MLIR_DIALECTS_TEST_DEPENDS}
)
set_target_properties(check-torch-mlir-dialects PROPERTIES FOLDER "Tests")

add_lit_testsuites(TORCH_MLIR_DIALECTS ${CMAKE_CURRENT_SOURCE_DIR} DEPENDS ${TORCH_MLIR_DIALECTS_TEST_DEPENDS})
@ -0,0 +1,69 @@
# -*- Python -*-
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
# Also available under a BSD-style license. See LICENSE.

import os
import platform
import re
import subprocess
import tempfile

import lit.formats
import lit.util

from lit.llvm import llvm_config
from lit.llvm.subst import ToolSubst
from lit.llvm.subst import FindTool

# Configuration file for the 'lit' test runner.

# name: The name of this test suite.
config.name = 'TORCH_MLIR_DIALECTS'

config.test_format = lit.formats.ShTest(not llvm_config.use_lit_shell)

# suffixes: A list of file extensions to treat as test files.
config.suffixes = ['.mlir', '.py']

config.substitutions.append(('%PATH%', config.environment['PATH']))
config.substitutions.append(('%shlibext', config.llvm_shlib_ext))
config.substitutions.append(
    ('%resources_dir', os.path.join(config.torch_mlir_dialects_obj_root,
                                    'resources')))

llvm_config.with_system_environment(['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP'])

#llvm_config.use_default_substitutions()

# excludes: A list of directories to exclude from the testsuite. The 'Inputs'
# subdirectories contain auxiliary inputs for various tests in their parent
# directories.
config.excludes = [
    'Inputs', 'Examples', 'CMakeLists.txt', 'README.txt', 'LICENSE.txt',
    'lit.cfg.py', 'lit.site.cfg.py'
]

# test_source_root: The root path where tests are located.
config.test_source_root = os.path.dirname(__file__)

# test_exec_root: The root path where tests should be run.
config.test_exec_root = os.path.join(config.torch_mlir_dialects_obj_root, 'test')
config.standalone_tools_dir = os.path.join(config.torch_mlir_dialects_obj_root, 'bin')

# Tweak the PATH to include the tools dir.
llvm_config.with_environment('PATH', config.llvm_tools_dir, append_path=True)

tool_dirs = [config.llvm_tools_dir]
tools = [
    ToolSubst('%PYTHON', config.python_executable, unresolved='ignore'),
]

llvm_config.add_tool_substitutions(tools, tool_dirs)
@ -0,0 +1,26 @@
# -*- Python -*-
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
# Also available under a BSD-style license. See LICENSE.

@LIT_SITE_CFG_IN_HEADER@

import sys

config.torch_mlir_dialects_obj_root = "@TORCH_MLIR_DIALECTS_BINARY_DIR@"
config.llvm_src_root = "@LLVM_SOURCE_DIR@"
config.llvm_obj_root = "@LLVM_BINARY_DIR@"
config.llvm_tools_dir = "@LLVM_TOOLS_DIR@"
config.llvm_lib_dir = "@LLVM_LIBS_DIR@"
config.llvm_shlib_dir = "@SHLIBDIR@"
config.llvm_shlib_ext = "@SHLIBEXT@"
config.llvm_exe_ext = "@EXEEXT@"
config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@"
config.python_executable = sys.executable

import lit.llvm
lit.llvm.initialize(lit_config, config)

# Let the main config do the real work.
lit_config.load_config(config, "@TORCH_MLIR_DIALECTS_SOURCE_DIR@/test/lit.cfg.py")
@ -0,0 +1,24 @@
// RUN: torch-mlir-dialects-opt -canonicalize -split-input-file %s | FileCheck %s

// CHECK-LABEL: func @tensor.cast(
func @tensor.cast(%arg0: tensor<128xi32>) -> tensor<128xi32> {
  %init = linalg.init_tensor [128] : tensor<128xi32>
  %c0 = linalg.init_tensor [] : tensor<i32>

  %casted_arg0 = tensor.cast %arg0 : tensor<128xi32> to tensor<?xi32>
  %casted_init = tensor.cast %init : tensor<128xi32> to tensor<?xi32>
  // CHECK:      tm_tensor.scan
  // CHECK-SAME:   ins(%{{[a-zA-Z0-9]*}} : tensor<128xi32>)
  // CHECK-SAME:   outs(%{{[a-zA-Z0-9]*}}, %{{[a-zA-Z0-9]*}} : tensor<128xi32>, tensor<i32>)
  %0, %1 = tm_tensor.scan dimension(0) inclusive(true)
             ins(%casted_arg0 : tensor<?xi32>)
             outs(%casted_init, %c0 : tensor<?xi32>, tensor<i32>) {
    ^bb0(%barg0 : i32, %barg1 : i32, %barg2 : i32):
      %sum = arith.addi %barg0, %barg1 : i32
      tm_tensor.yield %sum : i32
  } -> tensor<?xi32>, tensor<i32>

  %2 = tensor.cast %0 : tensor<?xi32> to tensor<128xi32>

  return %2 : tensor<128xi32>
}
@ -0,0 +1,332 @@
// RUN: torch-mlir-dialects-opt -split-input-file -torch-mlir-tm-tensor-to-loops %s | FileCheck %s

func @scan_1d_inclusive(%0: memref<128xi32>, %1: memref<128xi32>) {
  %c0 = memref.alloc() : memref<i32>
  tm_tensor.scan dimension(0) inclusive(true)
    ins(%0 : memref<128xi32>) outs(%1, %c0 : memref<128xi32>, memref<i32>) {
    ^bb0(%arg0 : i32, %arg1 : i32):
      %sum = arith.addi %arg0, %arg1 : i32
      tm_tensor.yield %sum : i32
  }
  return
}
// CHECK-LABEL: func @scan_1d_inclusive
// CHECK-SAME:    %[[BUFI:[a-zA-Z0-9]+]]
// CHECK-SAME:    %[[BUFO:[a-zA-Z0-9]+]]
// CHECK-DAG:     %[[C128:.+]] = arith.constant 128 : index
// CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG:     %[[ACC:.+]] = memref.alloc() : memref<i32>
// CHECK:         scf.for %[[ARG1:.+]] = %[[C0]] to %[[C128]] step %[[C1]]
// CHECK:           %[[COND:.+]] = arith.cmpi eq, %[[ARG1]], %[[C0]] : index
// CHECK:           scf.if %[[COND]] {
// CHECK:             %[[V1:.+]] = memref.load %[[BUFI]][%[[ARG1]]]
// CHECK:             memref.store %[[V1]], %[[BUFO]][%[[ARG1]]]
// CHECK:           } else {
// CHECK:             %[[T1:.+]] = arith.subi %[[ARG1]], %[[C1]] : index
// CHECK:             %[[V2:.+]] = memref.load %[[BUFO]][%[[T1]]]
// CHECK:             %[[V3:.+]] = memref.load %[[BUFI]][%[[ARG1]]]
// CHECK:             %[[V4:.+]] = arith.addi %[[V2]], %[[V3]] : i32
// CHECK:             memref.store %[[V4]], %[[BUFO]][%[[ARG1]]]
// CHECK:             memref.store %[[V4]], %[[ACC]][]
// CHECK:           }

// -----

func @scan_1d_exclusive(%0: memref<128xi32>, %1: memref<128xi32>) {
  %c0 = memref.alloc() : memref<i32>
  tm_tensor.scan dimension(0) inclusive(false)
    ins(%0 : memref<128xi32>) outs(%1, %c0 : memref<128xi32>, memref<i32>) {
    ^bb0(%arg0 : i32, %arg1 : i32):
      %sum = arith.addi %arg0, %arg1 : i32
      tm_tensor.yield %sum : i32
  }
  return
}
// CHECK-LABEL: func @scan_1d_exclusive
// CHECK-SAME:    %[[BUFI:[a-zA-Z0-9]+]]
// CHECK-SAME:    %[[BUFO:[a-zA-Z0-9]+]]
// CHECK-DAG:     %[[C128:.+]] = arith.constant 128 : index
// CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG:     %[[ACC:.+]] = memref.alloc() : memref<i32>
// CHECK:         scf.for %[[ARG1:.+]] = %[[C0]] to %[[C128]] step %[[C1]]
// CHECK:           %[[COND:.+]] = arith.cmpi eq, %[[ARG1]], %[[C0]] : index
// CHECK:           scf.if %[[COND]] {
// CHECK:             %[[V0:.+]] = memref.load %[[ACC]][] : memref<i32>
// CHECK:             memref.store %[[V0]], %[[BUFO]][%[[ARG1]]]
// CHECK:           } else {
// CHECK:             %[[T1:.+]] = arith.subi %[[ARG1]], %[[C1]] : index
// CHECK:             %[[V2:.+]] = memref.load %[[BUFO]][%[[T1]]]
// CHECK:             %[[V3:.+]] = memref.load %[[BUFI]][%[[T1]]]
// CHECK:             %[[V4:.+]] = arith.addi %[[V2]], %[[V3]] : i32
// CHECK:             memref.store %[[V4]], %[[BUFO]][%[[ARG1]]]
// CHECK:             memref.store %[[V4]], %[[ACC]][]
// CHECK:           }

// -----

func @scan_2d(%0: memref<16x32xi32>, %1: memref<16x32xi32>) {
  %t0 = memref.alloc() : memref<32xi32>
  tm_tensor.scan dimension(0) inclusive(true)
    ins(%0 : memref<16x32xi32>) outs(%1, %t0 : memref<16x32xi32>, memref<32xi32>) {
    ^bb0(%arg0 : i32, %arg1 : i32):
      %sum = arith.addi %arg0, %arg1 : i32
      tm_tensor.yield %sum : i32
  }
  return
}
// CHECK-LABEL: func @scan_2d
// CHECK-SAME:    %[[BUFI:[a-zA-Z0-9]+]]
// CHECK-SAME:    %[[BUFO:[a-zA-Z0-9]+]]
// CHECK-DAG:     %[[C16:.+]] = arith.constant 16 : index
// CHECK-DAG:     %[[C32:.+]] = arith.constant 32 : index
// CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG:     %[[ACC:.+]] = memref.alloc() : memref<32xi32>
// CHECK:         scf.for %[[ARG1:.+]] = %[[C0]] to %[[C16]] step %[[C1]]
// CHECK:           scf.for %[[ARG2:.+]] = %[[C0]] to %[[C32]] step %[[C1]]
// CHECK:             %[[COND:.+]] = arith.cmpi eq, %[[ARG1]], %[[C0]] : index
// CHECK:             scf.if %[[COND]] {
// CHECK:               %[[V1:.+]] = memref.load %[[BUFI]][%[[ARG1]], %[[ARG2]]]
// CHECK:               memref.store %[[V1]], %[[BUFO]][%[[ARG1]], %[[ARG2]]]
// CHECK:             } else {
// CHECK:               %[[T1:.+]] = arith.subi %[[ARG1]], %[[C1]] : index
// CHECK:               %[[V2:.+]] = memref.load %[[BUFO]][%[[T1]], %[[ARG2]]]
// CHECK:               %[[V3:.+]] = memref.load %[[BUFI]][%[[ARG1]], %[[ARG2]]]
// CHECK:               %[[V4:.+]] = arith.addi %[[V2]], %[[V3]] : i32
// CHECK:               memref.store %[[V4]], %[[BUFO]][%[[ARG1]], %[[ARG2]]]
// CHECK:               memref.store %[[V4]], %[[ACC]][%[[ARG2]]]
// CHECK:             }

// -----

func @scatter_update_scalar_1D(
    %original: memref<8xi32>, %indices: memref<3x1xi32>,
    %updates: memref<3xi32>) {
  tm_tensor.scatter
    ins(%updates, %indices : memref<3xi32>, memref<3x1xi32>)
    outs(%original : memref<8xi32>) {
    ^bb0(%arg0: i32, %arg1: i32):  // no predecessors
      tm_tensor.yield %arg0 : i32
  }
  return
}
// CHECK-LABEL: func @scatter_update_scalar_1D
// CHECK-SAME:    %[[ORIGINAL:[a-zA-Z0-9]+]]
// CHECK-SAME:    %[[INDICES:[a-zA-Z0-9]+]]
// CHECK-SAME:    %[[UPDATES:[a-zA-Z0-9]+]]
// CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG:     %[[C3:.+]] = arith.constant 3 : index
// CHECK:         scf.for %[[I:.+]] = %[[C0]] to %[[C3]] step %[[C1]] {
// CHECK:           %[[T1:.+]] = memref.load %[[UPDATES]][%[[I]]] : memref<3xi32>
// CHECK:           %[[T2:.+]] = memref.load %[[INDICES]][%[[I]], %[[C0]]] : memref<3x1xi32>
// CHECK:           %[[IDX:.+]] = arith.index_cast %[[T2]] : i32 to index
// CHECK:           memref.store %[[T1]], %[[ORIGINAL]][%[[IDX]]]

// -----

func @scatter_add_scalar_2D(
    %original: memref<4x3xi32>, %indices: memref<3x2xi32>,
    %updates: memref<3xi32>) {
  tm_tensor.scatter
    ins(%updates, %indices : memref<3xi32>, memref<3x2xi32>)
    outs(%original : memref<4x3xi32>) {
    ^bb0(%arg0: i32, %arg1: i32):  // no predecessors
      %0 = arith.addi %arg1, %arg0 : i32
      tm_tensor.yield %0 : i32
  }
  return
}
// CHECK-LABEL: func @scatter_add_scalar_2D
// CHECK-SAME:    %[[ORIGINAL:[a-zA-Z0-9]+]]
// CHECK-SAME:    %[[INDICES:[a-zA-Z0-9]+]]
// CHECK-SAME:    %[[UPDATES:[a-zA-Z0-9]+]]
// CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG:     %[[C3:.+]] = arith.constant 3 : index
// CHECK:         scf.for %[[I:.+]] = %[[C0]] to %[[C3]] step %[[C1]] {
// CHECK:           %[[T1:.+]] = memref.load %[[UPDATES]][%[[I]]] : memref<3xi32>
// CHECK:           %[[T2:.+]] = memref.load %[[INDICES]][%[[I]], %[[C0]]] : memref<3x2xi32>
// CHECK:           %[[IDX1:.+]] = arith.index_cast %[[T2]] : i32 to index
// CHECK:           %[[T3:.+]] = memref.load %[[INDICES]][%[[I]], %[[C1]]] : memref<3x2xi32>
// CHECK:           %[[IDX2:.+]] = arith.index_cast %[[T3]] : i32 to index
// CHECK:           %[[ORI:.+]] = memref.load %[[ORIGINAL]][%[[IDX1]], %[[IDX2]]] : memref<4x3xi32>
// CHECK:           %[[ADD:.+]] = arith.addi %[[ORI]], %[[T1]] : i32
// CHECK:           memref.store %[[ADD]], %[[ORIGINAL]][%[[IDX1]], %[[IDX2]]]

// -----

func @scatter_update_slice_2D(
    %original: memref<4x3xi32>, %indices: memref<2x1xi32>,
    %updates: memref<2x3xi32>) {
  tm_tensor.scatter
    ins(%updates, %indices : memref<2x3xi32>, memref<2x1xi32>)
    outs(%original : memref<4x3xi32>) {
    ^bb0(%arg0: i32, %arg1: i32):  // no predecessors
      tm_tensor.yield %arg0 : i32
  }
  return
}
// CHECK:       func @scatter_update_slice_2D
// CHECK-SAME:    %[[ORIGINAL:[a-zA-Z0-9]+]]
// CHECK-SAME:    %[[INDICES:[a-zA-Z0-9]+]]
// CHECK-SAME:    %[[UPDATES:[a-zA-Z0-9]+]]
// CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG:     %[[C2:.+]] = arith.constant 2 : index
// CHECK-DAG:     %[[C3:.+]] = arith.constant 3 : index
// CHECK:         scf.for %[[I:.+]] = %[[C0]] to %[[C2]] step %[[C1]] {
// CHECK:           scf.for %[[J:.+]] = %[[C0]] to %[[C3]] step %[[C1]] {
// CHECK:             %[[UPDATE:.+]] = memref.load %[[UPDATES]][%[[I]], %[[J]]]
// CHECK:             %[[INDEX:.+]] = memref.load %[[INDICES]][%[[I]], %[[C0]]]
// CHECK:             %[[LOC:.+]] = arith.index_cast %[[INDEX]] : i32 to index
// CHECK:             memref.store %[[UPDATE]], %[[ORIGINAL]][%[[LOC]], %[[J]]]
// CHECK:           }
// CHECK:         }

// -----

func @scatter_add_scalar_1D(
    %original: memref<8xi32>, %indices: memref<3x1xi32>,
    %updates: memref<3xi32>) {
  tm_tensor.scatter
    ins(%updates, %indices : memref<3xi32>, memref<3x1xi32>)
    outs(%original : memref<8xi32>) {
    ^bb0(%arg0: i32, %arg1: i32):  // no predecessors
      %0 = arith.addi %arg1, %arg0 : i32
      tm_tensor.yield %0 : i32
  }
  return
}
// CHECK-LABEL: func @scatter_add_scalar_1D
// CHECK-SAME:    %[[ORIGINAL:[a-zA-Z0-9]+]]
// CHECK-SAME:    %[[INDICES:[a-zA-Z0-9]+]]
// CHECK-SAME:    %[[UPDATES:[a-zA-Z0-9]+]]
// CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG:     %[[C3:.+]] = arith.constant 3 : index
// CHECK:         scf.for %[[I:.+]] = %[[C0]] to %[[C3]] step %[[C1]] {
// CHECK:           %[[T1:.+]] = memref.load %[[UPDATES]][%[[I]]] : memref<3xi32>
// CHECK:           %[[T2:.+]] = memref.load %[[INDICES]][%[[I]], %[[C0]]] : memref<3x1xi32>
// CHECK:           %[[IDX:.+]] = arith.index_cast %[[T2]] : i32 to index
// CHECK:           %[[ORI:.+]] = memref.load %[[ORIGINAL]][%[[IDX]]] : memref<8xi32>
// CHECK:           %[[ADD:.+]] = arith.addi %[[ORI]], %[[T1]] : i32
// CHECK:           memref.store %[[ADD]], %[[ORIGINAL]][%[[IDX]]]

// -----

func @scatter_add_slice_2D(
    %original: memref<4x3xi32>, %indices: memref<2x1xi32>,
    %updates: memref<2x3xi32>) {
  tm_tensor.scatter
    ins(%updates, %indices : memref<2x3xi32>, memref<2x1xi32>)
    outs(%original : memref<4x3xi32>) {
    ^bb0(%arg0: i32, %arg1: i32):  // no predecessors
      %0 = arith.addi %arg1, %arg0 : i32
      tm_tensor.yield %0 : i32
  }
  return
}
// CHECK:       func @scatter_add_slice_2D
// CHECK-SAME:    %[[ORIGINAL:[a-zA-Z0-9]+]]
// CHECK-SAME:    %[[INDICES:[a-zA-Z0-9]+]]
// CHECK-SAME:    %[[UPDATES:[a-zA-Z0-9]+]]
// CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG:     %[[C2:.+]] = arith.constant 2 : index
// CHECK-DAG:     %[[C3:.+]] = arith.constant 3 : index
// CHECK:         scf.for %[[I:.+]] = %[[C0]] to %[[C2]] step %[[C1]] {
// CHECK:           scf.for %[[J:.+]] = %[[C0]] to %[[C3]] step %[[C1]] {
// CHECK:             %[[UPDATEVAL:.+]] = memref.load %[[UPDATES]][%[[I]], %[[J]]]
// CHECK:             %[[INDEXVAL:.+]] = memref.load %[[INDICES]][%[[I]], %[[C0]]]
// CHECK:             %[[INDEX:.+]] = arith.index_cast %[[INDEXVAL]] : i32 to index
// CHECK:             %[[ORIGINALVAL:.+]] = memref.load %[[ORIGINAL]][%[[INDEX]], %[[J]]]
// CHECK:             %[[STOREVAL:.+]] = arith.addi %[[ORIGINALVAL]], %[[UPDATEVAL]]
// CHECK:             memref.store %[[STOREVAL]], %[[ORIGINAL]][%[[INDEX]], %[[J]]]

// -----

func @scatter_update_scalar_dynamic_1D(
    %original: memref<?xi32>, %indices: memref<?x1xi32>,
    %updates: memref<?xi32>) {
  tm_tensor.scatter
    ins(%updates, %indices : memref<?xi32>, memref<?x1xi32>)
    outs(%original : memref<?xi32>) {
    ^bb0(%arg0: i32, %arg1: i32):  // no predecessors
      tm_tensor.yield %arg0 : i32
  }
  return
}
// CHECK-LABEL: func @scatter_update_scalar_dynamic_1D
// CHECK-SAME:    %[[ORIGINAL:[a-zA-Z0-9]+]]
// CHECK-SAME:    %[[INDICES:[a-zA-Z0-9]+]]
// CHECK-SAME:    %[[UPDATES:[a-zA-Z0-9]+]]
// CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG:     %[[UB:.+]] = memref.dim %[[UPDATES]], %[[C0]] : memref<?xi32>
// CHECK:         scf.for %[[I:.+]] = %[[C0]] to %[[UB]] step %[[C1]] {
// CHECK:           %[[T1:.+]] = memref.load %[[UPDATES]][%[[I]]] : memref<?xi32>
// CHECK:           %[[T2:.+]] = memref.load %[[INDICES]][%[[I]], %[[C0]]] : memref<?x1xi32>
// CHECK:           %[[IDX:.+]] = arith.index_cast %[[T2]] : i32 to index
// CHECK:           memref.store %[[T1]], %[[ORIGINAL]][%[[IDX]]]

// -----

func @scatter_add_scalar_dynamic_2D(
    %original: memref<?x?xi32>, %indices: memref<?x2xi32>,
    %updates: memref<?xi32>) {
  tm_tensor.scatter
    ins(%updates, %indices : memref<?xi32>, memref<?x2xi32>)
    outs(%original : memref<?x?xi32>) {
    ^bb0(%arg0: i32, %arg1: i32):  // no predecessors
      %0 = arith.addi %arg1, %arg0 : i32
      tm_tensor.yield %0 : i32
  }
  return
}
// CHECK-LABEL: func @scatter_add_scalar_dynamic_2D
// CHECK-SAME:    %[[ORIGINAL:[a-zA-Z0-9]+]]
// CHECK-SAME:    %[[INDICES:[a-zA-Z0-9]+]]
// CHECK-SAME:    %[[UPDATES:[a-zA-Z0-9]+]]
// CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG:     %[[UB:.+]] = memref.dim %[[UPDATES]], %[[C0]] : memref<?xi32>
// CHECK:         scf.for %[[I:.+]] = %[[C0]] to %[[UB]] step %[[C1]] {
// CHECK:           %[[T1:.+]] = memref.load %[[UPDATES]][%[[I]]] : memref<?xi32>
// CHECK:           %[[T2:.+]] = memref.load %[[INDICES]][%[[I]], %[[C0]]] : memref<?x2xi32>
// CHECK:           %[[IDX1:.+]] = arith.index_cast %[[T2]] : i32 to index
// CHECK:           %[[T3:.+]] = memref.load %[[INDICES]][%[[I]], %[[C1]]] : memref<?x2xi32>
// CHECK:           %[[IDX2:.+]] = arith.index_cast %[[T3]] : i32 to index
// CHECK:           %[[ORI:.+]] = memref.load %[[ORIGINAL]][%[[IDX1]], %[[IDX2]]] : memref<?x?xi32>
// CHECK:           %[[ADD:.+]] = arith.addi %[[ORI]], %[[T1]] : i32
// CHECK:           memref.store %[[ADD]], %[[ORIGINAL]][%[[IDX1]], %[[IDX2]]]

// -----

func @scatter_update_slice_dynamic_2D(
    %original: memref<?x?xi32>, %indices: memref<?x1xi32>,
    %updates: memref<?x?xi32>) {
  tm_tensor.scatter
    ins(%updates, %indices : memref<?x?xi32>, memref<?x1xi32>)
    outs(%original : memref<?x?xi32>) {
    ^bb0(%arg0: i32, %arg1: i32):  // no predecessors
      tm_tensor.yield %arg0 : i32
  }
  return
}
// CHECK:       func @scatter_update_slice_dynamic_2D
// CHECK-SAME:    %[[ORIGINAL:[a-zA-Z0-9]+]]
// CHECK-SAME:    %[[INDICES:[a-zA-Z0-9]+]]
// CHECK-SAME:    %[[UPDATES:[a-zA-Z0-9]+]]
// CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG:     %[[UB1:.+]] = memref.dim %[[UPDATES]], %[[C0]] : memref<?x?xi32>
// CHECK-DAG:     %[[UB2:.+]] = memref.dim %[[UPDATES]], %[[C1]] : memref<?x?xi32>
// CHECK:         scf.for %[[I:.+]] = %[[C0]] to %[[UB1]] step %[[C1]] {
// CHECK:           scf.for %[[J:.+]] = %[[C0]] to %[[UB2]] step %[[C1]] {
// CHECK:             %[[UPDATEVAL:.+]] = memref.load %[[UPDATES]][%[[I]], %[[J]]]
// CHECK:             %[[INDEXVAL:.+]] = memref.load %[[INDICES]][%[[I]], %[[C0]]]
// CHECK:             %[[INDEX:.+]] = arith.index_cast %[[INDEXVAL]] : i32 to index
// CHECK:             memref.store %[[UPDATEVAL]], %[[ORIGINAL]][%[[INDEX]], %[[J]]]
@ -0,0 +1 @@
add_subdirectory(torch-mlir-dialects-opt)
@ -0,0 +1,22 @@
set(LIBS
  MLIRArithmetic
  MLIRDialect
  MLIRLinalg
  MLIRMemRef
  MLIROptLib
  MLIRSCF
  MLIRSCFTransforms
  MLIRStandard
  MLIRTensor
  MLIRTransforms
  TorchMLIRTMTensorDialect
  TorchMLIRTMTensorPasses
)

add_llvm_tool(torch-mlir-dialects-opt
  torch-mlir-dialects-opt.cpp

  DEPENDS
  ${LIBS}
)
target_link_libraries(torch-mlir-dialects-opt PRIVATE ${LIBS})
@ -0,0 +1,49 @@
//===------------------------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Also available under a BSD-style license. See LICENSE.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/Passes.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/IR/AsmState.h"
#include "mlir/IR/Dialect.h"
#include "mlir/Support/MlirOptMain.h"
#include "mlir/Transforms/Passes.h"
#include "torch-mlir-dialects/Dialect/TMTensor/IR/ScalarLoopOpInterface.h"
#include "torch-mlir-dialects/Dialect/TMTensor/IR/TMTensorDialect.h"
#include "torch-mlir-dialects/Dialect/TMTensor/Transforms/Passes.h"

using namespace mlir;

int main(int argc, char **argv) {
  registerAsmPrinterCLOptions();
  registerMLIRContextCLOptions();

  registerTransformsPasses();
  registerSCFPasses();

  // Local passes.
  mlir::torch::TMTensor::registerPasses();

  DialectRegistry registry;
  registry.insert<
      // Local dialects
      mlir::torch::TMTensor::TMTensorDialect,
      // Upstream dialects
      mlir::arith::ArithmeticDialect, mlir::linalg::LinalgDialect,
      mlir::memref::MemRefDialect, mlir::StandardOpsDialect,
      mlir::scf::SCFDialect, mlir::tensor::TensorDialect>();

  return mlir::asMainReturnCode(
      mlir::MlirOptMain(argc, argv, "MLIR modular optimizer driver\n", registry,
                        /*preloadDialectsInContext=*/false));
}
@ -19,6 +19,9 @@ add_mlir_library(TorchMLIRInitAll
  TorchMLIRTorchDialect
  TorchMLIRTorchConversionPasses

  TorchMLIRTMTensorPasses
  TorchMLIRTMTensorDialect

  TorchMLIRConversionPasses
  TorchMLIRRefBackend
)
@ -10,6 +10,8 @@
#include "torch-mlir/InitAll.h"

#include "mlir/IR/Dialect.h"
#include "torch-mlir-dialects/Dialect/TMTensor/IR/TMTensorDialect.h"
#include "torch-mlir-dialects/Dialect/TMTensor/Transforms/Passes.h"
#include "torch-mlir/Conversion/Passes.h"
#include "torch-mlir/Dialect/Torch/IR/TorchDialect.h"
#include "torch-mlir/Dialect/Torch/Transforms/Passes.h"
@ -20,6 +22,7 @@
void mlir::torch::registerAllDialects(mlir::DialectRegistry &registry) {
  registry.insert<mlir::torch::Torch::TorchDialect>();
  registry.insert<mlir::torch::TorchConversion::TorchConversionDialect>();
  registry.insert<mlir::torch::TMTensor::TMTensorDialect>();
}

void mlir::torch::registerAllPasses() {
@ -28,4 +31,5 @@ void mlir::torch::registerAllPasses() {

  mlir::torch::registerConversionPasses();
  mlir::torch::RefBackend::registerRefBackendPasses();
  mlir::torch::TMTensor::registerPasses();
}
setup.py
@ -62,8 +62,9 @@ class CMakeBuild(build_py):
f"-DLLVM_TARGETS_TO_BUILD=host",
f"-DMLIR_ENABLE_BINDINGS_PYTHON=ON",
f"-DLLVM_ENABLE_PROJECTS=mlir",
f"-DLLVM_EXTERNAL_PROJECTS=torch-mlir",
f"-DLLVM_EXTERNAL_PROJECTS=torch-mlir;torch-mlir-dialects",
f"-DLLVM_EXTERNAL_TORCH_MLIR_SOURCE_DIR={src_dir}",
f"-DLLVM_EXTERNAL_TORCH_MLIR_DIALECTS_SOURCE_DIR={src_dir}/external/llvm-external-projects/torch-mlir-dialects",
# Optimization options for building wheels.
f"-DCMAKE_VISIBILITY_INLINES_HIDDEN=ON",
f"-DCMAKE_C_VISIBILITY_PRESET=hidden",