torch-mlir/lib/Dialect/TMTensor/Transforms/Bufferize.cpp

//===- Bufferize.cpp - Bufferization of tmtensor ops ------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/Func/Transforms/Passes.h"
#include "mlir/Dialect/Linalg/Utils/Utils.h"
#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/BuiltinDialect.h"
#include "mlir/IR/Operation.h"
#include "mlir/Pass/Pass.h"
#include "torch-mlir-dialects/Dialect/TMTensor/IR/TMTensorDialect.h"
#include "torch-mlir-dialects/Dialect/TMTensor/IR/TMTensorOps.h"
#include "torch-mlir-dialects/Dialect/TMTensor/Transforms/PassDetail.h"
#include "torch-mlir-dialects/Dialect/TMTensor/Transforms/Passes.h"

using namespace ::mlir;
using namespace ::mlir::torch::TMTensor;

/// Allocates a new buffer with the same sizes and element type as `memref` and
/// copies the contents of `memref` into it.
///
/// \param loc    Location attached to the ops created here.
/// \param memref Value to duplicate; it must be of `MemRefType`.
/// \param b      Builder used to create the `memref.alloc` and `memref.copy`.
/// \returns The result of the newly created `memref.alloc`.
static Value cloneMemref(Location loc, Value memref, OpBuilder &b) {
  // Use the free-function cast, consistent with the `dyn_cast<>` used elsewhere
  // in this file; the member form `Type::cast<>()` is deprecated in MLIR.
  auto memrefType = cast<MemRefType>(memref.getType());
  auto alloc = b.create<memref::AllocOp>(
      loc, memref::getMixedSizes(b, loc, memref), memrefType.getElementType());
  b.create<memref::CopyOp>(loc, memref, alloc);
  return alloc;
}

/// Creates one output buffer per result of `tmtensorOp` and appends the buffers
/// to `resultBuffers` in result order.
///
/// For each result, in order of preference:
///   1. If the op reports that its payload uses the value of the tied output
///      operand, the matching entry of `outputs` is cloned (alloc + copy).
///   2. Otherwise, if the result shape is static, a buffer of that static
///      memref type is allocated.
///   3. Otherwise, a buffer is allocated using the sizes queried from the
///      matching entry of `outputs`.
///
/// \param loc           Location attached to the ops created here.
/// \param tmtensorOp    Op whose results need buffers.
/// \param outputs       Output operands, indexed by result number. They are
///                      handed to memref helpers, so they are expected to be
///                      memref values.
/// \param resultBuffers Receives the created buffers.
/// \param b             Builder used to create the allocations and copies.
/// \returns `failure()` after emitting an op error if a result type is not a
///          `RankedTensorType`; `success()` otherwise.
static LogicalResult
allocateBuffersForResults(Location loc, TMTensorOp tmtensorOp,
                          ValueRange outputs,
                          SmallVectorImpl<Value> &resultBuffers, OpBuilder &b) {
  // Results and outputs are paired up by index below.
  assert(tmtensorOp.getNumOutputs() == tmtensorOp->getNumResults());
  for (const auto &en : llvm::enumerate(tmtensorOp->getResultTypes())) {
    size_t resultIndex = en.index();
    Type resultType = en.value();

    auto tensorType = dyn_cast<RankedTensorType>(resultType);
    if (!tensorType) {
      return tmtensorOp.emitOpError()
             << "tensor to buffer conversion expects ranked tensor results";
    }
    auto memrefType =
        MemRefType::get(tensorType.getShape(), tensorType.getElementType());
    Value outputBuffer = outputs[resultIndex];

    // Clone output buffers whose value is actually used.
    OpOperand *tiedOpOperand = tmtensorOp.getOutputOperand(resultIndex);
    if (tmtensorOp.payloadUsesValueFromOperand(tiedOpOperand)) {
      resultBuffers.push_back(cloneMemref(loc, outputBuffer, b));
      continue;
    }

    // Allocate buffers for statically-shaped results.
    if (memrefType.hasStaticShape()) {
      resultBuffers.push_back(b.create<memref::AllocOp>(loc, memrefType));
      continue;
    }

    // Dynamically-shaped result: take the sizes from the output operand.
    resultBuffers.push_back(b.create<memref::AllocOp>(
        loc, memref::getMixedSizes(b, loc, outputBuffer),
        memrefType.getElementType()));
  }
  return success();
}

/// Builds the buffer-based counterpart of `tmtensorOp`: a clone of the op,
/// created through `rewriter` at the original op's location, whose operand
/// list is `inputs` followed by `outputs`.
static TMTensorOp createTMTensorOpOnBuffers(ConversionPatternRewriter &rewriter,
                                            TMTensorOp tmtensorOp,
                                            ValueRange inputs,
                                            ValueRange outputs) {
  // Operand order: all inputs first, then all outputs.
  SmallVector<Value, 8> bufferOperands;
  bufferOperands.append(inputs.begin(), inputs.end());
  bufferOperands.append(outputs.begin(), outputs.end());

  // The third argument is left empty; presumably it is the result-type list
  // and the buffer form yields no results (confirm against TMTensorOp::clone).
  Operation *bufferOp =
      tmtensorOp.clone(rewriter, tmtensorOp->getLoc(), {}, bufferOperands);
  return cast<TMTensorOp>(bufferOp);
}

/// Conversion pattern that applies to every op implementing the TMTensorOp
/// interface, so that no per-op pattern has to be instantiated.
///
/// The rewrite allocates a buffer for each result, re-creates the op on
/// buffers, and replaces the original op's results with those buffers.
class BufferizeAnyTMTensorOp : public OpInterfaceConversionPattern<TMTensorOp> {
public:
  using OpInterfaceConversionPattern<TMTensorOp>::OpInterfaceConversionPattern;

  LogicalResult
  matchAndRewrite(TMTensorOp op, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const final {
    // `operands` is split at the input count: inputs first, outputs after.
    const auto numInputs = op.getNumInputs();
    const auto *splitPoint = operands.begin() + numInputs;
    SmallVector<Value> outputOperands(splitPoint, operands.end());

    SmallVector<Value, 2> resultBuffers;
    LogicalResult allocated = allocateBuffersForResults(
        op.getLoc(), op, outputOperands, resultBuffers, rewriter);
    if (failed(allocated)) {
      return op.emitOpError()
             << "Failed to allocate buffers for tensor results.";
    }

    SmallVector<Value> inputOperands(operands.begin(), splitPoint);
    createTMTensorOpOnBuffers(rewriter, op, inputOperands, resultBuffers);

    // The new buffers stand in for the results of the original op.
    rewriter.replaceOp(op, resultBuffers);
    return success();
  }
};

namespace {
/// Pass that rewrites TMTensor ops with tensor-typed operands or results into
/// their buffer-based form, using a partial dialect conversion.
struct TMTensorBufferizePass
    : public TMTensorBufferizeBase<TMTensorBufferizePass> {
  /// Registers the dialects this pass depends on.
  void getDependentDialects(DialectRegistry &registry) const override {
    registry.insert<bufferization::BufferizationDialect>();
    registry.insert<memref::MemRefDialect>();
    registry.insert<torch::TMTensor::TMTensorDialect>();
  }

  /// Runs the partial conversion on the current operation and signals pass
  /// failure if the conversion fails.
  void runOnOperation() override {
    MLIRContext *ctx = &getContext();
    bufferization::BufferizeTypeConverter typeConverter;

    ConversionTarget target(*ctx);
    // Ops from the Arith, Func, MemRef and Tensor dialects are always legal.
    target.addLegalDialect<arith::ArithDialect, func::FuncDialect,
                           memref::MemRefDialect, tensor::TensorDialect>();
    // A TMTensor op is legal only if the type converter reports it as legal.
    target.addDynamicallyLegalDialect<TMTensorDialect>(
        [&typeConverter](Operation *op) { return typeConverter.isLegal(op); });

    RewritePatternSet patterns(ctx);
    patterns.add<BufferizeAnyTMTensorOp>(typeConverter, ctx);

    LogicalResult converted =
        applyPartialConversion(getOperation(), target, std::move(patterns));
    if (failed(converted))
      signalPassFailure();
  }
};
} // namespace

std::unique_ptr<OperationPass<func::FuncOp>>
torch::TMTensor::createTMTensorBufferizePass() {
  return std::make_unique<TMTensorBufferizePass>();
}
Add bufferization pass for TMTensor ops The pass is mostly borrowed from the BufferizeAnyLinalgOp pass in mlir upstream with some minor changes. At a high level, it's a naive partial bufferization pass which allocates new buffers for all the output tensors. The initial value of an output buffer is copied from the original buffer if there are uses of the original value. One difference from the linalg bufferization pass is the way to tell if the loop body uses the init value of the output operand. For TMTensor ops, it differs from op to op because the payload region doesn't represent the entire loop body. 2022-02-26 07:04:33 +08:00			`//===- Bufferize.cpp - Bufferization of tmtensor ops ------------------===//`
			`//`
			`// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.`
			`// See https://llvm.org/LICENSE.txt for license information.`
			`// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception`
			`//`
			`//===----------------------------------------------------------------------===//`

			`#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h"`
build: update llvm tag to 6f46ff37 (#1448) Summary of changes: - Updated references to the Arith dialect (https://reviews.llvm.org/D134762) - Switched to prefixed accessors for MemRef dialect (https://reviews.llvm.org/D134995) - Fixed warnings about signed/unsigned comparisons, ignored return values, and unused variables 2022-10-05 21:28:06 +08:00			`#include "mlir/Dialect/Arith/IR/Arith.h"`
			`#include "mlir/Dialect/Arith/Utils/Utils.h"`
Add bufferization pass for TMTensor ops The pass is mostly borrowed from the BufferizeAnyLinalgOp pass in mlir upstream with some minor changes. At a high level, it's a naive partial bufferization pass which allocate new buffers for all the output tensors. The initial value of an output buffer is copied from the original buffer if there are uses of the original value. One difference from linalg bufferization pass is the way to tell if the loop body uses the init value of output operand. For TMTensor ops, it differs from op to op because the payload region doesn't represent the entire loop body. 2022-02-26 07:04:33 +08:00			`#include "mlir/Dialect/Bufferization/IR/Bufferization.h"`
Bump LLVM at 8361c5da30588d3d4a48eae648f53be1feb5cfad 2022-03-16 18:44:23 +08:00			`#include "mlir/Dialect/Func/IR/FuncOps.h"`
			`#include "mlir/Dialect/Func/Transforms/Passes.h"`
Add bufferization pass for TMTensor ops The pass is mostly borrowed from the BufferizeAnyLinalgOp pass in mlir upstream with some minor changes. At a high level, it's a naive partial bufferization pass which allocate new buffers for all the output tensors. The initial value of an output buffer is copied from the original buffer if there are uses of the original value. One difference from linalg bufferization pass is the way to tell if the loop body uses the init value of output operand. For TMTensor ops, it differs from op to op because the payload region doesn't represent the entire loop body. 2022-02-26 07:04:33 +08:00			`#include "mlir/Dialect/Linalg/Utils/Utils.h"`
			`#include "mlir/Dialect/Math/IR/Math.h"`
llvm: bump tag to e1318078 (#781) The updated LLVM code includes a patch to create bfloat16 array attributes, thus enabling a different patch to torch-mlir to flesh out support for the bfloat16 type. 2022-04-27 03:27:51 +08:00			`#include "mlir/Dialect/MemRef/IR/MemRef.h"`
Add bufferization pass for TMTensor ops The pass is mostly borrowed from the BufferizeAnyLinalgOp pass in mlir upstream with some minor changes. At a high level, it's a naive partial bufferization pass which allocate new buffers for all the output tensors. The initial value of an output buffer is copied from the original buffer if there are uses of the original value. One difference from linalg bufferization pass is the way to tell if the loop body uses the init value of output operand. For TMTensor ops, it differs from op to op because the payload region doesn't represent the entire loop body. 2022-02-26 07:04:33 +08:00			`#include "mlir/Dialect/Tensor/IR/Tensor.h"`
			`#include "mlir/Dialect/Vector/IR/VectorOps.h"`
			`#include "mlir/IR/BuiltinDialect.h"`
			`#include "mlir/IR/Operation.h"`
			`#include "mlir/Pass/Pass.h"`
			`#include "torch-mlir-dialects/Dialect/TMTensor/IR/TMTensorDialect.h"`
			`#include "torch-mlir-dialects/Dialect/TMTensor/IR/TMTensorOps.h"`
			`#include "torch-mlir-dialects/Dialect/TMTensor/Transforms/PassDetail.h"`
			`#include "torch-mlir-dialects/Dialect/TMTensor/Transforms/Passes.h"`

			`using namespace ::mlir;`
			`using namespace ::mlir::torch::TMTensor;`

			`static Value cloneMemref(Location loc, Value memref, OpBuilder &b) {`
			`auto memrefType = memref.getType().cast<MemRefType>();`
			`auto alloc = b.create<memref::AllocOp>(`
build: update llvm tag to 4592543a01609fe - update llvm tag to 4592543a01609feb4b3c19e81a9d54743e15e329 - mhlo now points to f6615343fdab2c74bebd23c78366cf097f9a72df Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com> 2023-07-22 00:06:05 +08:00			`loc, memref::getMixedSizes(b, loc, memref), memrefType.getElementType());`
Add bufferization pass for TMTensor ops The pass is mostly borrowed from the BufferizeAnyLinalgOp pass in mlir upstream with some minor changes. At a high level, it's a naive partial bufferization pass which allocate new buffers for all the output tensors. The initial value of an output buffer is copied from the original buffer if there are uses of the original value. One difference from linalg bufferization pass is the way to tell if the loop body uses the init value of output operand. For TMTensor ops, it differs from op to op because the payload region doesn't represent the entire loop body. 2022-02-26 07:04:33 +08:00			`b.create<memref::CopyOp>(loc, memref, alloc);`
			`return alloc;`
			`}`

			`static LogicalResult`
			`allocateBuffersForResults(Location loc, TMTensorOp tmtensorOp,`
			`ValueRange outputs,`
			`SmallVectorImpl<Value> &resultBuffers, OpBuilder &b) {`
			`// Lazily compute loopRanges.`
			`SmallVector<Range, 4> loopRanges;`

			`// Allocate a buffer for every tensor result.`
			`assert(tmtensorOp.getNumOutputs() == tmtensorOp->getNumResults());`
			`for (const auto &en : llvm::enumerate(tmtensorOp->getResultTypes())) {`
			`size_t resultIndex = en.index();`
			`Type resultType = en.value();`

Fix deprecated uses of cast/dyn_cast/dyn_cast_or_null/isa (#3130) We should prefer functional style as the method style is deprecated https://github.com/llvm/mlir-www/blob/main/website/content/deprecation/_index.md#deprecated (https://mlir.llvm.org/deprecation/) 2024-04-11 21:47:35 +08:00			`auto tensorType = dyn_cast<RankedTensorType>(resultType);`
Add bufferization pass for TMTensor ops The pass is mostly borrowed from the BufferizeAnyLinalgOp pass in mlir upstream with some minor changes. At a high level, it's a naive partial bufferization pass which allocate new buffers for all the output tensors. The initial value of an output buffer is copied from the original buffer if there are uses of the original value. One difference from linalg bufferization pass is the way to tell if the loop body uses the init value of output operand. For TMTensor ops, it differs from op to op because the payload region doesn't represent the entire loop body. 2022-02-26 07:04:33 +08:00			`if (tensorType == nullptr) {`
			`tmtensorOp.emitOpError()`
			`<< "tensor to buffer conversion expects ranked tensor results";`
			`return failure();`
			`}`
			`auto tensorShape = tensorType.getShape();`
			`auto memrefType = MemRefType::get(tensorShape, tensorType.getElementType());`
			`Value resultTensor = outputs[resultIndex];`

			`// Clone output buffers whose value is actually used.`
			`OpOperand *tiedOpOperand = tmtensorOp.getOutputOperand(resultIndex);`
			`if (tmtensorOp.payloadUsesValueFromOperand(tiedOpOperand)) {`
			`resultBuffers.push_back(cloneMemref(loc, resultTensor, b));`
			`continue;`
			`}`

			`// Allocate buffers for statically-shaped results.`
			`if (memrefType.hasStaticShape()) {`
			`resultBuffers.push_back(b.create<memref::AllocOp>(loc, memrefType));`
			`continue;`
			`}`

			`resultBuffers.push_back(b.create<memref::AllocOp>(`
build: update llvm tag to 4592543a01609fe - update llvm tag to 4592543a01609feb4b3c19e81a9d54743e15e329 - mhlo now points to f6615343fdab2c74bebd23c78366cf097f9a72df Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com> 2023-07-22 00:06:05 +08:00			`loc, memref::getMixedSizes(b, loc, resultTensor),`
			`memrefType.getElementType()));`
Add bufferization pass for TMTensor ops The pass is mostly borrowed from the BufferizeAnyLinalgOp pass in mlir upstream with some minor changes. At a high level, it's a naive partial bufferization pass which allocate new buffers for all the output tensors. The initial value of an output buffer is copied from the original buffer if there are uses of the original value. One difference from linalg bufferization pass is the way to tell if the loop body uses the init value of output operand. For TMTensor ops, it differs from op to op because the payload region doesn't represent the entire loop body. 2022-02-26 07:04:33 +08:00			`}`
			`return success();`
			`}`

			`/// Create TMTensor op on buffers given the original tensor-based operation and`
			`/// the buffers for the outputs.`
			`static TMTensorOp createTMTensorOpOnBuffers(ConversionPatternRewriter &rewriter,`
			`TMTensorOp tmtensorOp,`
			`ValueRange inputs,`
			`ValueRange outputs) {`
			`SmallVector<Value, 8> newOperands = inputs;`
			`newOperands.append(outputs.begin(), outputs.end());`
Clang format refresh (#2812) After noticing a number of commits with unrelated formatting changes, I think something was changed with clang-format at one point and we're seeing a number of unrelated changes. Doing a refresh can help avoid this. The changes made here came from ``` find lib -iname *.h -o -iname *.cpp | xargs clang-format -i --style=llvm find include -iname *.h -o -iname *.cpp | xargs clang-format -i --style=llvm find projects -iname *.h -o -iname *.cpp | xargs clang-format -i --style=llvm ``` 2024-01-30 01:59:33 +08:00			`return cast<TMTensorOp>(`
			`tmtensorOp.clone(rewriter, tmtensorOp->getLoc(), {}, newOperands));`
Add bufferization pass for TMTensor ops The pass is mostly borrowed from the BufferizeAnyLinalgOp pass in mlir upstream with some minor changes. At a high level, it's a naive partial bufferization pass which allocate new buffers for all the output tensors. The initial value of an output buffer is copied from the original buffer if there are uses of the original value. One difference from linalg bufferization pass is the way to tell if the loop body uses the init value of output operand. For TMTensor ops, it differs from op to op because the payload region doesn't represent the entire loop body. 2022-02-26 07:04:33 +08:00			`}`

			`/// Generic conversion pattern that matches any TMTensorOp. This avoids template`
			`/// instantiating one pattern for each TMTensorOp.`
			`class BufferizeAnyTMTensorOp : public OpInterfaceConversionPattern<TMTensorOp> {`
			`public:`
			`using OpInterfaceConversionPattern<TMTensorOp>::OpInterfaceConversionPattern;`

			`LogicalResult`
			`matchAndRewrite(TMTensorOp op, ArrayRef<Value> operands,`
			`ConversionPatternRewriter &rewriter) const final {`
			`Location loc = op.getLoc();`
			`SmallVector<Value, 2> newOutputBuffers;`

			`SmallVector<Value> outputs(operands.begin() + op.getNumInputs(),`
			`operands.end());`
			`if (failed(allocateBuffersForResults(loc, op, outputs, newOutputBuffers,`
			`rewriter))) {`
			`return op.emitOpError()`
			`<< "Failed to allocate buffers for tensor results.";`
			`}`

			`SmallVector<Value> inputs(operands.begin(),`
			`operands.begin() + op.getNumInputs());`
			`createTMTensorOpOnBuffers(rewriter, op, inputs, newOutputBuffers);`
			`// Replace the results of the old op with the new output buffers.`
			`rewriter.replaceOp(op, newOutputBuffers);`
			`return success();`
			`}`
			`};`

			`namespace {`
			`/// Converts TMTensor operations that work on tensor-type operands or results to`
			`/// work on buffers.`
			`struct TMTensorBufferizePass`
			`: public TMTensorBufferizeBase<TMTensorBufferizePass> {`
			`void getDependentDialects(DialectRegistry &registry) const override {`
			`registry.insert<bufferization::BufferizationDialect, memref::MemRefDialect,`
			`torch::TMTensor::TMTensorDialect>();`
			`}`

			`void runOnOperation() override {`
			`MLIRContext &context = getContext();`
			`ConversionTarget target(context);`
			`bufferization::BufferizeTypeConverter typeConverter;`

			`// Mark all Standard operations legal.`
build: update llvm tag to 6f46ff37 (#1448) Summary of changes: - Updated references to the Arith dialect (https://reviews.llvm.org/D134762) - Switched to prefixed accessors for MemRef dialect (https://reviews.llvm.org/D134995) - Fixed warnings about signed/unsigned comparisons, ignored return values, and unused variables 2022-10-05 21:28:06 +08:00			`target.addLegalDialect<arith::ArithDialect, func::FuncDialect,`
Bump LLVM at 8361c5da30588d3d4a48eae648f53be1feb5cfad 2022-03-16 18:44:23 +08:00			`memref::MemRefDialect, tensor::TensorDialect>();`
Add bufferization pass for TMTensor ops The pass is mostly borrowed from the BufferizeAnyLinalgOp pass in mlir upstream with some minor changes. At a high level, it's a naive partial bufferization pass which allocate new buffers for all the output tensors. The initial value of an output buffer is copied from the original buffer if there are uses of the original value. One difference from linalg bufferization pass is the way to tell if the loop body uses the init value of output operand. For TMTensor ops, it differs from op to op because the payload region doesn't represent the entire loop body. 2022-02-26 07:04:33 +08:00
			`// Mark all TMTensor operations illegal as long as they work on tensors.`
			`auto isLegalOperation = [&](Operation *op) {`
			`return typeConverter.isLegal(op);`
			`};`
			`target.addDynamicallyLegalDialect<TMTensorDialect>(isLegalOperation);`
			`RewritePatternSet patterns(&context);`
			`patterns.add<BufferizeAnyTMTensorOp>(typeConverter, patterns.getContext());`
			`if (failed(applyPartialConversion(getOperation(), target,`
			`std::move(patterns))))`
			`signalPassFailure();`
			`}`
			`};`
			`} // namespace`

llvm: bump tag to e1318078 (#781) The updated LLVM code includes a patch to create bfloat16 array attributes, thus enabling a different patch to torch-mlir to flesh out support for the bfloat16 type. 2022-04-27 03:27:51 +08:00			`std::unique_ptr<OperationPass<func::FuncOp>>`
Add bufferization pass for TMTensor ops The pass is mostly borrowed from the BufferizeAnyLinalgOp pass in mlir upstream with some minor changes. At a high level, it's a naive partial bufferization pass which allocate new buffers for all the output tensors. The initial value of an output buffer is copied from the original buffer if there are uses of the original value. One difference from linalg bufferization pass is the way to tell if the loop body uses the init value of output operand. For TMTensor ops, it differs from op to op because the payload region doesn't represent the entire loop body. 2022-02-26 07:04:33 +08:00			`torch::TMTensor::createTMTensorBufferizePass() {`
			`return std::make_unique<TMTensorBufferizePass>();`
			`}`