Lower to the upstream memref ABI.

Specifically, we use unranked memrefs which get passed as a fixed-size set of arguments/returns. One big caveat about this is that returning results isn't going to work. See TODO in LowerTensorLoadOp. This is far from enough runtime-wise, but it starts to demarcate a plausible layering. Notice for example how this removes the runtime-dependence from LowerRankedShapes. Eventually, we want to have an `npcomp_rt` or `npcomp_hal` dialect with its own set of runtime types that will supercede this. See comments in LowerTensorLoadOp for more direction about where this is going to evolve.
2020-05-15 16:33:01 -07:00 · 2020-05-15 16:33:01 -07:00 · 993338a12d
parent 7687a6d8d2
commit 993338a12d
8 changed files with 231 additions and 41 deletions
--- a/include/npcomp/E2E/E2E.h
+++ b/include/npcomp/E2E/E2E.h
@ -15,6 +15,11 @@
 namespace mlir {
 namespace NPCOMP {
 // Look in createE2ELoweringPipeline for more information about how these
 // passes fit together.
 //
 // Pass summaries are in Passes.td.
 std::unique_ptr<OperationPass<FuncOp>> createLowerBroadcastToToLoopsPass();
 std::unique_ptr<OperationPass<FuncOp>>
@ -29,6 +34,8 @@ std::unique_ptr<OperationPass<FuncOp>> createLowerLinalgLoopDimOpsPass();
 std::unique_ptr<OperationPass<FuncOp>> createLowerRankedShapesPass();
 std::unique_ptr<OperationPass<FuncOp>> createLowerToMemRefABIPass();
 void createLowerToHybridTensorMemRefPipeline(OpPassManager &pm);
 // The main pipeline that encapsulates the full E2E lowering.
--- a/include/npcomp/E2E/Passes.td
+++ b/include/npcomp/E2E/Passes.td
@ -43,4 +43,9 @@ def LowerRankedShapes : Pass<"lower-ranked-shapes", "FuncOp"> {
  let constructor = "mlir::NPCOMP::createLowerRankedShapesPass()";
 }
 def LowerToMemRefABI : Pass<"lower-to-memref-abi", "FuncOp"> {
  let summary = "Lower tensors at ABI boundaries to memref";
  let constructor = "mlir::NPCOMP::createLowerToMemRefABIPass()";
 }
 #endif // NPCOMP_E2E_PASSES
--- a/lib/E2E/CMakeLists.txt
+++ b/lib/E2E/CMakeLists.txt
@ -2,6 +2,7 @@ add_mlir_library(NPCOMPE2E
  E2E.cpp
  LowerRankedShapes.cpp
  LowerToHybridTensorMemRef.cpp
  LowerToMemRefABI.cpp
  ADDITIONAL_HEADER_DIRS
  ${PROJECT_SRC_DIR}/include/npcomp/E2E
--- a/lib/E2E/E2E.cpp
+++ b/lib/E2E/E2E.cpp
@ -218,7 +218,12 @@ class LowerLinalgLoopDimOps
    OwningRewritePatternList patterns;
    patterns.insert<LowerLinalgLoopDimOp>(context);
    ConversionTarget target(*context);
-    target.addIllegalOp<DimOp>();
+    target.addDynamicallyLegalOp<DimOp>([](DimOp op) -> bool {
      // TODO: We only need this because we use `dim` ops for the memref
      // ABI. Once we layer that out into our own runtime types, we can
      // remove this.
      return op.getOperand().getDefiningOp<tcp::AllocMemRefOp>();
    });
    target.addLegalOp<tcp::GetExtentOp>();
    if (failed(applyPartialConversion(func, target, patterns))) {
      return signalPassFailure();
@ -291,12 +296,22 @@ void mlir::NPCOMP::createE2ELoweringPipeline(OpPassManager &pm) {
  // tensor_load/tensor_store ops.
  pm.addPass(createResolveTensorLoadStoreOpsPass());
-  // At this point, the IR is in a form where the interiors of islands
+  // At this point, the IR is in a form where there are no tensor ops
-  // don't have tensor ops (except tensor_store's of arguments and
+  // (except tensor_store's of arguments and tensor_load's of returns).
  // tensor_load's of returns).
  //
  // This is a reasonable representation for doing buffer assignment.
  // TODO: Do buffer assignment here.
  // We need to finalize the removal of tensors from the program. To do
  // that, we need to interface with a runtime ABI.
  // We currently use a canonicalized version of upstream MLIR's memref
  // ABI, where we canonically use unranked memref's for all
  // arguments/returns (which makes the C-level ABI very predictable).
  //
  // TODO: This pass is very tentative. See comments on LowerTensorLoadOp
  // for where we need to take it.
  pm.addPass(createLowerToMemRefABIPass());
  // TODO: Might want a different kind of island to better represent this.
  // This island op would explicitly capture all tensors as inputs, and it
  // would establish a more formalized ABI with the interior of the body
--- a/lib/E2E/LowerRankedShapes.cpp
+++ b/lib/E2E/LowerRankedShapes.cpp
@ -17,38 +17,6 @@
 using namespace mlir;
 using namespace mlir::NPCOMP;
 // Lowers ShapeOfOp's (which at this point should only operating on tensors
 // that need to have a full runtime-reified representation) to low-level
 // runtime interfaces.
 //
 // This is the "root" ranked shape lowering which creates the first
 // ShapeFromExtentsOp which is needed to start the whole ranked conversion
 // process.
 //
 // TODO: Move this ABI-specific lowering to a separate pass that only does
 // that and make this pass require an invariant something like "a 'root'
 // set of tcp::ShapeFromExtentsOp exist".
 namespace {
 class LowerRootRankedShape : public OpRewritePattern<shape::ShapeOfOp> {
 public:
  using OpRewritePattern::OpRewritePattern;
  LogicalResult matchAndRewrite(shape::ShapeOfOp op,
                                PatternRewriter &rewriter) const override {
    auto tensor = op.getOperand();
    auto type = tensor.getType().dyn_cast<RankedTensorType>();
    if (!type)
      return rewriter.notifyMatchFailure(op, "not a ranked tensor");
    SmallVector<Value, 6> extents;
    for (int i = 0, e = type.getRank(); i < e; i++) {
      extents.push_back(rewriter.create<tcp::RtGetTensorExtentOp>(
          op.getLoc(), tensor, rewriter.getI64IntegerAttr(i)));
    }
    rewriter.replaceOpWithNewOp<tcp::ShapeFromExtentsOp>(op, extents);
    return success();
  }
 };
 } // namespace
 // This has to be a "conversion pattern" since the `operands` argument
 // gives access to the post-conversion operands from earlier ops.
 namespace {
@ -137,6 +105,9 @@ public:
 // operands to the `tcp.shape_from_extents` op) and produce a
 // `tcp.shape_from_extents` op.
 //
 // We expect that previous passes have inserted a "root" set of
 // tcp::ShapeFromExtentsOp's that allow this process to get started.
 //
 // We then use this to resolve get_extent ops by using a rewrite
 // `get_extent(from_extents(x1,x2,x3), N) -> xN`, which should apply in
 // maximally many places due to the above invariant.
@ -163,7 +134,6 @@ class LowerRankedShapes : public LowerRankedShapesBase<LowerRankedShapes> {
    auto *context = &getContext();
    OwningRewritePatternList patterns;
    patterns.insert<LowerRootRankedShape>(context);
    patterns.insert<LowerShapeBroadcastOp>(context);
    patterns.insert<LowerShapeGetExtentOp>(context);
    ConversionTarget target(*context);
--- a/lib/E2E/LowerToMemRefABI.cpp
+++ b/lib/E2E/LowerToMemRefABI.cpp
@ -0,0 +1,163 @@
 //===----------------------------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 #include "npcomp/E2E/E2E.h"
 #include "PassDetail.h"
 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"
 #include "mlir/Dialect/Shape/IR/Shape.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/StandardTypes.h"
 #include "mlir/IR/Verifier.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "npcomp/Dialect/TCP/IR/TCPOps.h"
 using namespace mlir;
 using namespace mlir::NPCOMP;
 namespace {
 class LowerTensorStoreOp : public OpConversionPattern<TensorStoreOp> {
 public:
  using OpConversionPattern::OpConversionPattern;
  LogicalResult
  matchAndRewrite(TensorStoreOp op, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override {
    TensorStoreOp::OperandAdaptor adaptor(operands);
    // The tensor has been converted to an unranked memref. We need to cast
    // it to the original memref type and copy it to the destination.
    //
    // TODO: Can we have a conversion infrastructure that doesn't have
    // patterns that doesn't couple type conversions and the patterns. That
    // is, patterns should be "context free" and locally expand to always
    // valid IR without relying on some side-channel TypeConverter to do
    // something else to make the IR valid.
    auto memref = rewriter.create<MemRefCastOp>(
        op.getLoc(), op.memref().getType(), adaptor.tensor());
    rewriter.replaceOpWithNewOp<linalg::CopyOp>(op, memref, adaptor.memref());
    return success();
  }
 };
 } // namespace
 namespace {
 class LowerTensorLoadOp : public OpConversionPattern<TensorLoadOp> {
 public:
  using OpConversionPattern::OpConversionPattern;
  LogicalResult
  matchAndRewrite(TensorLoadOp op, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override {
    TensorLoadOp::OperandAdaptor adaptor(operands);
    auto type = UnrankedMemRefType::get(op.getType().getElementType(), 0);
    // TODO: This won't work. The LLVM unranked memref calling convention
    // doesn't allow returning an unranked memref becuase it lowers it to
    // 'int64 rank, void *descriptor' but in this case the descriptor will
    // likely be on the stack, so when returning the descriptor pointer it
    // will be use-after-return.
    //
    // We could directly emit LLVM IR mallocing the memref struct on the
    // heap or do a conversion to out params and require a preallocated
    // memref out descriptor (perhaps preallocated to a fixed upper bound
    // rank).
    //
    // But a more holistic approach seems needed:
    // 1. Use custom npcomp runtime types at function boundaries. These can
    // be approximately like IREE's !hal.buffer_view, namely a type-erased,
    // shape-erased, ref-counted multidimensional array of dense primitive
    // types. (Something like Py_buffer from the python buffer protocol is
    // another potential inspiration)
    //   - [IREE HAL buffer view](https://github.com/google/iree/blob/634136f03c144ad3acd2f28cd87785b0b6b572ac/iree/hal/api_detail.h#L26)
    //   - [Python buffer protocol](https://docs.python.org/3/c-api/buffer.html)
    // 2. Use a custom LLVM conversion that creates the memref types.
    // For example, have an op
    // ```
    // npcomp_rt.to_memref %buf_view : !npcomp_rt.buffer_view -> memref<?xf32>
    // ```
    // with a custom LLVM lowering that expands to all the right stuff.
    rewriter.replaceOpWithNewOp<MemRefCastOp>(op, type, adaptor.memref());
    return success();
  }
 };
 } // namespace
 namespace {
 class LowerShapeOfOp : public OpConversionPattern<shape::ShapeOfOp> {
 public:
  using OpConversionPattern::OpConversionPattern;
  LogicalResult
  matchAndRewrite(shape::ShapeOfOp op, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override {
    shape::ShapeOfOp::OperandAdaptor adaptor(operands);
    auto tensorType = op.arg().getType().cast<RankedTensorType>();
    auto rankedMemRefType =
        MemRefType::get(tensorType.getShape(), tensorType.getElementType());
    auto rankedMemRef = rewriter.create<MemRefCastOp>(
        op.getLoc(), rankedMemRefType, adaptor.arg());
    SmallVector<Value, 6> extents;
    for (int i = 0, e = tensorType.getRank(); i < e; i++)
      extents.push_back(rewriter.create<DimOp>(op.getLoc(), rankedMemRef, i));
    rewriter.replaceOpWithNewOp<tcp::ShapeFromExtentsOp>(op, extents);
    return success();
  }
 };
 } // namespace
 namespace {
 // This pass lowers tensor types to a calling convention where all tensors
 // are passed as UnrankedMemRefType. This allows the current StandardToLLVM
 // lowering to return them as `size_t rank, void *descriptor` which is easy
 // to bridge across a fixed C ABI. (otherwise it specializes the signature
 // to the memref rank, which is very difficult to interoperate with).
 class LowerToMemRefABI : public LowerToMemRefABIBase<LowerToMemRefABI> {
  void runOnOperation() {
    auto func = getOperation();
    auto *context = &getContext();
    TypeConverter converter;
    converter.addConversion([](TensorType type) {
      return UnrankedMemRefType::get(type.getElementType(), /*memorySpace=*/0);
    });
    // Mark UnrankedMemRefType as "legal". This is the awkward way of doing
    // that.
    // TODO: Commenting this out causes a seemingly unrelated crash.
    // Redesign MLIR's type conversion system to have a clearer mental
    // model and not be so flaky.
    converter.addConversion([](UnrankedMemRefType type) { return type; });
    OwningRewritePatternList patterns;
    ConversionTarget target(*context);
    populateFuncOpTypeConversionPattern(patterns, context, converter);
    target.addDynamicallyLegalOp<mlir::FuncOp>([&](mlir::FuncOp op) {
      return converter.isSignatureLegal(op.getType());
    });
    patterns.insert<LowerTensorStoreOp>(context);
    target.addIllegalOp<TensorStoreOp>();
    target.addLegalOp<DimOp>();
    target.addLegalOp<MemRefCastOp>();
    target.addLegalOp<linalg::CopyOp>();
    patterns.insert<LowerTensorLoadOp>(context);
    target.addIllegalOp<TensorLoadOp>();
    patterns.insert<LowerShapeOfOp>(context);
    target.addIllegalOp<shape::ShapeOfOp>();
    target.addLegalOp<tcp::ShapeFromExtentsOp>();
    if (failed(applyPartialConversion(func, target, patterns))) {
      return signalPassFailure();
    }
  }
 };
 } // namespace
 std::unique_ptr<OperationPass<FuncOp>>
 mlir::NPCOMP::createLowerToMemRefABIPass() {
  return std::make_unique<LowerToMemRefABI>();
 }
--- a/test/E2E/lower-ranked-shapes.mlir
+++ b/test/E2E/lower-ranked-shapes.mlir
@ -2,12 +2,11 @@
 // CHECK-LABEL: func @broadcast_rank2_rank1
-func @broadcast_rank2_rank1(%arg0: tensor<?x?xf32>, %arg1: tensor<?xf32>) -> (index, index) {
+func @broadcast_rank2_rank1(%arg0: index, %arg1: index, %arg2: index) -> (index, index) {
  // CHECK-NOT: shape.shape_of
  // CHECK-NOT: shape.broadcast
  // CHECK-NOT: tcp.get_extent
-  %0 = shape.shape_of %arg0 : tensor<?x?xf32>
+  %0 = "tcp.shape_from_extents"(%arg0, %arg1) : (index, index) -> !shape.shape
-  %1 = shape.shape_of %arg1 : tensor<?xf32>
+  %1 = "tcp.shape_from_extents"(%arg2) : (index) -> !shape.shape
  %2 = "shape.broadcast"(%0, %1) : (!shape.shape, !shape.shape) -> !shape.shape
  %e0 = tcp.get_extent %2, 0
  %e1 = tcp.get_extent %2, 1
--- a/test/E2E/lower-to-memref-abi.mlir
+++ b/test/E2E/lower-to-memref-abi.mlir
@ -0,0 +1,30 @@
 // RUN: npcomp-opt -lower-to-memref-abi <%s | FileCheck %s --dump-input=fail
 // CHECK-LABEL: func @identity
 func @identity(%arg0: tensor<?xf32>) -> tensor<?xf32> {
  // CHECK: return %arg0 : memref<*xf32>
  return %arg0 : tensor<?xf32>
 }
 // CHECK-LABEL:   func @basic(
 // CHECK-SAME:                %[[VAL_1:.*]]: memref<*xf32>) -> memref<*xf32> {
 func @basic(%arg0: tensor<?xf32>) -> tensor<?xf32> {
  // CHECK: %[[VAL_2:.*]] = memref_cast %[[VAL_1]] : memref<*xf32> to memref<?xf32>
  // CHECK: %[[VAL_3:.*]] = dim %[[VAL_2]], 0 : memref<?xf32>
  // CHECK: %[[VAL_4:.*]] = "tcp.shape_from_extents"(%[[VAL_3]]) : (index) -> !shape.shape
  %shape = shape.shape_of %arg0 : tensor<?xf32>
  // CHECK: %[[VAL_5:.*]] = tcp.alloc_memref %[[VAL_4]] : memref<?xf32>
  %memref = tcp.alloc_memref %shape : memref<?xf32>
  // CHECK: %[[VAL_6:.*]] = memref_cast %[[VAL_1]] : memref<*xf32> to memref<?xf32>
  // CHECK: linalg.copy(%[[VAL_6]], %[[VAL_5]]) : memref<?xf32>, memref<?xf32>
  tensor_store %arg0, %memref : memref<?xf32>
  // CHECK: %[[VAL_7:.*]] = memref_cast %[[VAL_5]] : memref<?xf32> to memref<*xf32>
  %ret = tensor_load %memref : memref<?xf32>
  // CHECK: return %[[VAL_7]] : memref<*xf32>
  return %ret: tensor<?xf32>
 }