torch-mlir/lib/Conversion/TorchToLinalg/Random.cpp

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Also available under a BSD-style license. See LICENSE.
//
//===----------------------------------------------------------------------===//

#include "torch-mlir/Conversion/TorchToLinalg/TorchToLinalg.h"

#include "../PassDetail.h"
#include "PopulatePatterns.h"
#include "Utils.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/IR/Matchers.h"
#include "torch-mlir/Conversion/Utils/Utils.h"
#include "torch-mlir/Dialect/Torch/IR/TorchDialect.h"
#include "torch-mlir/Dialect/Torch/IR/TorchOps.h"
#include "torch-mlir/Dialect/Torch/Utils/TorchUpstream.h"
#include "torch-mlir/Dialect/Torch/Utils/Utils.h"
#include "torch-mlir/Dialect/TorchConversion/IR/TorchConversionOps.h"

using namespace mlir;
using namespace mlir::torch;
using namespace mlir::torch::Torch;

namespace {
// TODO: Dropout should probably be handled in DecomposeComplexOps instead of
// here.
/// Converts `aten.dropout` when it is known to be a no-op.
///
/// The pattern only applies when the `train` operand is a constant bool equal
/// to false; in that case the op is replaced by a `tensor.cast` of the
/// (already converted) input to the converted result type. Any other case is
/// reported as a match failure so that another pattern or pass can handle it.
class ConvertAtenDropoutOp : public OpConversionPattern<AtenDropoutOp> {
public:
  using OpConversionPattern::OpConversionPattern;
  LogicalResult
  matchAndRewrite(AtenDropoutOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    if (failed(verifyLinalgCompatibleTypes(op, rewriter)))
      return failure();

    // `train` must be statically known to decide whether dropout is a no-op.
    bool train;
    if (!matchPattern(op.train(), m_TorchConstantBool(&train)))
      return rewriter.notifyMatchFailure(op,
                                         "Expected train to be constant bool.");

    // Training-mode dropout is not lowered here. Emit a diagnostic instead of
    // failing silently so the reason shows up in conversion debug output.
    if (train)
      return rewriter.notifyMatchFailure(
          op, "Unimplemented: dropout with train=true is not supported here");

    // With train == false the result is just the input, possibly with a
    // different static shape, hence a cast to the converted result type.
    auto resultType = getTypeConverter()
                          ->convertType(op->getResult(0).getType())
                          .cast<RankedTensorType>();
    rewriter.replaceOpWithNewOp<tensor::CastOp>(op, resultType,
                                                adaptor.input());
    return success();
  }
};
} // namespace


namespace {
/// Lowers `aten.uniform` to a `linalg.generic` that fills a tensor with the
/// sizes of `self` with pseudo-random values scaled into the range described
/// by the `from` and `to` operands.
///
/// Each element is computed independently from a seed produced by
/// `TorchConversion::GetNextSeedOp` and the element's own indices, using a
/// linear congruential generator (LCG) style recurrence. The pattern fails to
/// match when the element type is not a float type or when a non-None
/// generator is supplied.
///
/// The scaling arithmetic is carried out in f64 and only the final sample is
/// converted to the result element type. Doing the arithmetic directly in a
/// narrow element type (e.g. f16) would underflow the 2^-64 scaling constant
/// to zero and overflow the conversion of the 64-bit integer state.
class ConvertAtenUniformOp : public OpConversionPattern<AtenUniformOp> {
public:
  using OpConversionPattern::OpConversionPattern;
  LogicalResult
  matchAndRewrite(AtenUniformOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    if (failed(verifyLinalgCompatibleTypes(op, rewriter)))
      return failure();
    Location loc = op.getLoc();
    Value self = adaptor.self();
    Value from = adaptor.from();
    Value to = adaptor.to();
    Value generator = adaptor.generator();
    RankedTensorType resultType = self.getType().cast<RankedTensorType>();
    Type elemTy = resultType.getElementType();
    Type f64Ty = rewriter.getF64Type();

    if (!elemTy.isa<mlir::FloatType>())
      return rewriter.notifyMatchFailure(op, "This op only support float type");

    if (!generator.getType().isa<Torch::NoneType>())
      return rewriter.notifyMatchFailure(
          op, "The generator has to be None because only global default "
              "generator is supported");

    // Build the core formula of LCG Algorithm that makes use of element index:
    // For output matrix with rank N:
    // temp1 = (cast(I64, index(D.0)) + seed) * multiplier + incrementStep
    // ...
    // tempN = (cast(I64, index(D.(N-1))) + tempN-1) * multiplier + incr
    // Refer to https://reviews.llvm.org/D101364.
    // The value of multiplier and incrementStep are referenced from
    // https://en.wikipedia.org/wiki/Linear_congruential_generator for 2^64.
    Value multiplier = rewriter.create<arith::ConstantOp>(
        loc, rewriter.getI64IntegerAttr(6364136223846793005));
    Value incrementStep = rewriter.create<arith::ConstantOp>(
        loc, rewriter.getI64IntegerAttr(1442695040888963407));
    // Tn = (index + Tn-1) * multiplier + incrementStep
    auto getNextTemp = [&](OpBuilder &b, Value index, Value temp) -> Value {
      Value castIndex =
          b.create<arith::IndexCastOp>(loc, b.getI64Type(), index);
      Value add = b.create<arith::AddIOp>(loc, castIndex, temp);
      Value mult = b.create<arith::MulIOp>(loc, add, multiplier);
      return b.create<arith::AddIOp>(loc, mult, incrementStep);
    };

    // Get initial seed, min and max used by `linalg.generic` compute payload.
    // min/max are kept in f64 so that the scaling below does not lose range or
    // precision when the result element type is narrower than f64.
    Value initialSeed = rewriter.create<TorchConversion::GetNextSeedOp>(loc);
    Value min = convertScalarToDtype(rewriter, loc, from, f64Ty);
    Value max = convertScalarToDtype(rewriter, loc, to, f64Ty);

    // Construct the `linalg.generic` op. It has no inputs and a single output
    // indexed by the identity map, with every dimension parallel.
    int64_t resultRank = resultType.getRank();
    SmallVector<AffineMap, 1> indexingMaps(
        1, rewriter.getMultiDimIdentityMap(resultRank));
    SmallVector<utils::IteratorType> iteratorTypes(
        resultRank, utils::IteratorType::parallel);
    SmallVector<Value> sizes = getTensorSizes(rewriter, loc, self);
    Value initTensor =
        rewriter.create<tensor::EmptyOp>(loc, getAsOpFoldResult(sizes), elemTy);
    Value uniformRes =
        rewriter
            .create<linalg::GenericOp>(
                loc, initTensor.getType(), /*inputs=*/ValueRange{},
                /*outputs=*/initTensor, indexingMaps, iteratorTypes,
                [&](OpBuilder &b, Location loc, ValueRange args) {
                  // Fold every loop index into the running LCG state.
                  Value temp = initialSeed;
                  for (int64_t i = 0; i < resultRank; i++) {
                    Value index = b.create<linalg::IndexOp>(loc, i);
                    temp = getNextTemp(b, index, temp);
                  }
                  // scale = (max - min) * const(F64,  5.4210108E-20)
                  // which is derived from rand(min,max) =
                  // rand()/(RAND_MAX/(max-min)) where RAND_MAX = 2^64 - 1
                  Value epsilon = b.create<arith::ConstantOp>(
                      loc, b.getFloatAttr(f64Ty, 5.4210108E-20));
                  Value range = b.create<arith::SubFOp>(loc, max, min);
                  Value scale = b.create<arith::MulFOp>(loc, range, epsilon);

                  // sample = cast(F64, tempN) * scale + min
                  Value updateFloat =
                      b.create<arith::UIToFPOp>(loc, f64Ty, temp);
                  Value updateScaled =
                      b.create<arith::MulFOp>(loc, updateFloat, scale);
                  Value uniformSample =
                      b.create<arith::AddFOp>(loc, updateScaled, min);

                  // Only now narrow (or keep) the sample to the element type
                  // of the result tensor.
                  Value res =
                      convertScalarToDtype(b, loc, uniformSample, elemTy);
                  b.create<linalg::YieldOp>(loc, res);
                })
            .getResult(0);

    // The generic op's result type comes from `self`; cast to the type the
    // type converter expects for the original op's result.
    Type newResultType = getTypeConverter()->convertType(op.getType());
    rewriter.replaceOpWithNewOp<tensor::CastOp>(op, newResultType, uniformRes);
    return success();
  }
};
} // namespace


/// Registers the conversion patterns defined in this file and marks the ops
/// they handle as illegal on `target`, so the conversion must rewrite them.
void mlir::torch::torch_to_linalg::populateRandomPatternsAndLegality(
    TypeConverter &typeConverter, RewritePatternSet &patterns,
    ConversionTarget &target) {
  // Ops that must not survive the conversion.
  target.addIllegalOp<AtenDropoutOp, AtenUniformOp>();

  // Patterns are added in the same order as before: dropout, then uniform.
  MLIRContext *ctx = patterns.getContext();
  patterns.add<ConvertAtenDropoutOp, ConvertAtenUniformOp>(typeConverter, ctx);
}
Split up TorchToLinalg.cpp This helps keep things organized and also exposes more parallelism to the build system. It seems though that most of the compile time is actually spent in the headers though, so the wall time doesn't decrease as much as I had hoped (and now that the headers are being included multiple times, the cpu time actually increases a lot, sadly -- will try to dig into this). 2022-03-11 01:54:13 +08:00			`//===----------------------------------------------------------------------===//`
			`//`
			`// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.`
			`// See https://llvm.org/LICENSE.txt for license information.`
			`// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception`
			`// Also available under a BSD-style license. See LICENSE.`
			`//`
			`//===----------------------------------------------------------------------===//`

			`#include "torch-mlir/Conversion/TorchToLinalg/TorchToLinalg.h"`

			`#include "../PassDetail.h"`
			`#include "PopulatePatterns.h"`
			`#include "Utils.h"`
build: update llvm tag to 6f46ff37 (#1448) Summary of changes: - Updated references to the Arith dialect (https://reviews.llvm.org/D134762) - Switched to prefixed accessors for MemRef dialect (https://reviews.llvm.org/D134995) - Fixed warnings about signed/unsigned comparisons, ignored return values, and unused variables 2022-10-05 21:28:06 +08:00			`#include "mlir/Dialect/Arith/IR/Arith.h"`
Split up TorchToLinalg.cpp This helps keep things organized and also exposes more parallelism to the build system. It seems though that most of the compile time is actually spent in the headers though, so the wall time doesn't decrease as much as I had hoped (and now that the headers are being included multiple times, the cpu time actually increases a lot, sadly -- will try to dig into this). 2022-03-11 01:54:13 +08:00			`#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"`
			`#include "mlir/Dialect/Linalg/IR/Linalg.h"`
			`#include "mlir/Dialect/Tensor/IR/Tensor.h"`
			`#include "mlir/IR/Matchers.h"`
			`#include "torch-mlir/Conversion/Utils/Utils.h"`
			`#include "torch-mlir/Dialect/Torch/IR/TorchDialect.h"`
			`#include "torch-mlir/Dialect/Torch/IR/TorchOps.h"`
			`#include "torch-mlir/Dialect/Torch/Utils/TorchUpstream.h"`
			`#include "torch-mlir/Dialect/Torch/Utils/Utils.h"`
			`#include "torch-mlir/Dialect/TorchConversion/IR/TorchConversionOps.h"`

			`using namespace mlir;`
			`using namespace mlir::torch;`
			`using namespace mlir::torch::Torch;`

			`namespace {`
			`// TODO: Dropout should probably be handled in DecomposeComplexOps instead of`
			`// here.`
			`class ConvertAtenDropoutOp : public OpConversionPattern<AtenDropoutOp> {`
			`public:`
			`using OpConversionPattern::OpConversionPattern;`
			`LogicalResult`
			`matchAndRewrite(AtenDropoutOp op, OpAdaptor adaptor,`
			`ConversionPatternRewriter &rewriter) const override {`
			`if (failed(verifyLinalgCompatibleTypes(op, rewriter)))`
			`return failure();`

			`bool train;`
			`if (!matchPattern(op.train(), m_TorchConstantBool(&train)))`
			`return rewriter.notifyMatchFailure(op,`
			`"Expected train to be constant bool.");`

			`if (train)`
			`return failure();`
			`auto resultType = getTypeConverter()`
			`->convertType(op->getResult(0).getType())`
			`.cast<RankedTensorType>();`
			`rewriter.replaceOpWithNewOp<tensor::CastOp>(op, resultType,`
			`adaptor.input());`
			`return success();`
			`}`
			`};`
			`} // namespace`


			`namespace {`
Remove all but one of valsem ops + move fill.Scalar to elementwise (#1531) This commit removes almost all of the valsem ops, since the value semantics version of the ops now exist in PyTorch. The only op missing is `aten.bernoulli_.float`. In addition, this commit also simplifies the implementation of `aten.fill.Scalar` by moving it to the pattern that converts elementwise ops. 2022-10-28 23:06:11 +08:00			`class ConvertAtenUniformOp : public OpConversionPattern<AtenUniformOp> {`
Split up TorchToLinalg.cpp This helps keep things organized and also exposes more parallelism to the build system. It seems though that most of the compile time is actually spent in the headers though, so the wall time doesn't decrease as much as I had hoped (and now that the headers are being included multiple times, the cpu time actually increases a lot, sadly -- will try to dig into this). 2022-03-11 01:54:13 +08:00			`public:`
			`using OpConversionPattern::OpConversionPattern;`
			`LogicalResult`
Remove all but one of valsem ops + move fill.Scalar to elementwise (#1531) This commit removes almost all of the valsem ops, since the value semantics version of the ops now exist in PyTorch. The only op missing is `aten.bernoulli_.float`. In addition, this commit also simplifies the implementation of `aten.fill.Scalar` by moving it to the pattern that converts elementwise ops. 2022-10-28 23:06:11 +08:00			`matchAndRewrite(AtenUniformOp op, OpAdaptor adaptor,`
Split up TorchToLinalg.cpp This helps keep things organized and also exposes more parallelism to the build system. It seems though that most of the compile time is actually spent in the headers though, so the wall time doesn't decrease as much as I had hoped (and now that the headers are being included multiple times, the cpu time actually increases a lot, sadly -- will try to dig into this). 2022-03-11 01:54:13 +08:00			`ConversionPatternRewriter &rewriter) const override {`
			`if (failed(verifyLinalgCompatibleTypes(op, rewriter)))`
			`return failure();`
			`Location loc = op.getLoc();`
			`Value self = adaptor.self();`
			`Value from = adaptor.from();`
			`Value to = adaptor.to();`
			`Value generator = adaptor.generator();`
			`RankedTensorType resultType = self.getType().cast<RankedTensorType>();`
			`Type elemTy = resultType.getElementType();`

			`if (!elemTy.isa<mlir::FloatType>())`
			`return rewriter.notifyMatchFailure(op, "This op only support float type");`

			`if (!generator.getType().isa<Torch::NoneType>())`
			`return rewriter.notifyMatchFailure(`
			`op, "The generator has to ben None because only global default "`
			`"generator is supported");`

			`// Build the core formula of LCG Algorithm that makes use of element index:`
			`// For output matrix with rank N:`
			`// temp1 = (cast(I64, index(D.0)) + seed) * multiplier + incrementStep`
			`// ...`
			`// tempN = (cast(I64, index(D.(N))) + tempN-1) * multiplier + incr`
			`// Refer to https://reviews.llvm.org/D101364.`
			`// The value of multiplier and incrementStep are referenced from`
			`// https://en.wikipedia.org/wiki/Linear_congruential_generator for 2^64.`
			`Value multiplier = rewriter.create<arith::ConstantOp>(`
			`loc, rewriter.getI64IntegerAttr(6364136223846793005));`
			`Value incrementStep = rewriter.create<arith::ConstantOp>(`
			`loc, rewriter.getI64IntegerAttr(1442695040888963407));`
			`// Tn = (index + Tn-1) * multiplier + incrementStep`
			`auto getNextTemp = [&](OpBuilder &b, Value index, Value temp) {`
			`Value castIndex =`
			`b.create<arith::IndexCastOp>(loc, b.getI64Type(), index);`
			`Value add = b.create<arith::AddIOp>(loc, castIndex, temp);`
			`Value mult = b.create<arith::MulIOp>(loc, add, multiplier);`
			`return b.create<arith::AddIOp>(loc, mult, incrementStep);`
			`};`

			// Get initial seed, min and max used by `linalg.generic` compute payload.
			`Value initialSeed = rewriter.create<TorchConversion::GetNextSeedOp>(loc);`
			`Value min = convertScalarToDtype(rewriter, loc, from, elemTy);`
			`Value max = convertScalarToDtype(rewriter, loc, to, elemTy);`

			// Construct the `linalg.generic` op.
			`auto resultRank = resultType.getRank();`
			`SmallVector<AffineMap, 1> indexingMaps(`
			`1, rewriter.getMultiDimIdentityMap(resultRank));`
llvm: update tag to e864ac6945 (#1600) Summary of changes: 1. Replace `string` iterator types by `IteratorType` enum. (https://github.com/llvm/llvm-project/commit/e6598b053dc71845423132ac0803b1ecd0b71d45) 2. Update `includes` wrt new directory layout of MLIR HLO codebase. (https://github.com/tensorflow/mlir-hlo/commit/9fd8d251a811f1bae0fde20744aef3e1817a11e8) 3. Update tags llvm: e864ac694540342d5e59f59c525c5082f2594fb8 MHLO: eab364ba2a66bd0613efb94f8a738c1c97aaee92 Signed-Off-by: Gaurav Shukla <gaurav@nod-labs.com> Signed-off-by: Gaurav Shukla <gaurav@nod-labs.com> 2022-11-17 06:40:36 +08:00			`SmallVector<utils::IteratorType> iteratorTypes(`
			`resultRank, utils::IteratorType::parallel);`
Split up TorchToLinalg.cpp This helps keep things organized and also exposes more parallelism to the build system. It seems though that most of the compile time is actually spent in the headers though, so the wall time doesn't decrease as much as I had hoped (and now that the headers are being included multiple times, the cpu time actually increases a lot, sadly -- will try to dig into this). 2022-03-11 01:54:13 +08:00			`SmallVector<Value> sizes = getTensorSizes(rewriter, loc, self);`
			`Value initTensor =`
build: update llvm tag to 4546397e (#1502) This commit makes the following changes needed to update bump LLVM: - Replace `linalg.init_tensor` with `tensor.empty` (see: https://reviews.llvm.org/D135129) - Replace `NoSideEffect` with `Pure` (see https://reviews.llvm.org/D135505) - Replace `body` region accessor for `ReduceOp` and `ReduceWindowOp` with `getBody` - Fix incorrect use of `tosa::ReduceSumOp` in `AtenNativeLayerNormOp` conversion pattern. The result type of `tosa::ReduceSumOp` must have the same rank as the input type. (see: https://www.mlplatform.org/tosa/tosa_spec.html#_reduce_sum) Co-authored-by: Ashay Rane <ashay@users.noreply.github.com> Co-authored-by: Ashay Rane <ashay@users.noreply.github.com> 2022-10-18 12:22:53 +08:00			`rewriter.create<tensor::EmptyOp>(loc, getAsOpFoldResult(sizes), elemTy);`
Split up TorchToLinalg.cpp This helps keep things organized and also exposes more parallelism to the build system. It seems though that most of the compile time is actually spent in the headers though, so the wall time doesn't decrease as much as I had hoped (and now that the headers are being included multiple times, the cpu time actually increases a lot, sadly -- will try to dig into this). 2022-03-11 01:54:13 +08:00			`Value uniformRes =`
			`rewriter`
			`.create<linalg::GenericOp>(`
			`loc, initTensor.getType(), /inputs=/ValueRange{},`
			`/outputs=/initTensor, indexingMaps, iteratorTypes,`
			`[&](OpBuilder &b, Location loc, ValueRange args) {`
			`Value temp = initialSeed;`
			`for (int i = 0; i < resultRank; i++) {`
			`Value index = b.create<linalg::IndexOp>(loc, i);`
			`temp = getNextTemp(b, index, temp);`
			`}`
			`// scale = (max - min) * const(F64, 5.4210108E-20)`
			`// which is derived from rand(min,max) =`
			`// rand()/(RAND_MAX/(max-min)) where RAND_MAX = 2^64 - 1`
			`Value epsilon = b.create<arith::ConstantOp>(`
			`loc, b.getFloatAttr(min.getType(), 5.4210108E-20));`
			`Value range = b.create<arith::SubFOp>(loc, max, min);`
			`Value scale = b.create<arith::MulFOp>(loc, range, epsilon);`

			`// res = cast(F64, tempN) * scale + min`
			`Value updateFloat =`
			`b.create<arith::UIToFPOp>(loc, elemTy, temp);`
			`Value updateScaled =`
			`b.create<arith::MulFOp>(loc, updateFloat, scale);`
			`Value res = b.create<arith::AddFOp>(loc, updateScaled, min);`
			`b.create<linalg::YieldOp>(loc, res);`
			`})`
			`.getResult(0);`

			`Type newResultType = getTypeConverter()->convertType(op.getType());`
			`rewriter.replaceOpWithNewOp<tensor::CastOp>(op, newResultType, uniformRes);`
			`return success();`
			`}`
			`};`
			`} // namespace`


			`void mlir::torch::torch_to_linalg::populateRandomPatternsAndLegality(`
			`TypeConverter &typeConverter, RewritePatternSet &patterns,`
			`ConversionTarget &target) {`
			`MLIRContext *context = patterns.getContext();`
			`target.addIllegalOp<AtenDropoutOp>();`
			`patterns.add<ConvertAtenDropoutOp>(typeConverter, context);`
Remove all but one of valsem ops + move fill.Scalar to elementwise (#1531) This commit removes almost all of the valsem ops, since the value semantics version of the ops now exist in PyTorch. The only op missing is `aten.bernoulli_.float`. In addition, this commit also simplifies the implementation of `aten.fill.Scalar` by moving it to the pattern that converts elementwise ops. 2022-10-28 23:06:11 +08:00			`target.addIllegalOp<AtenUniformOp>();`
			`patterns.add<ConvertAtenUniformOp>(typeConverter, context);`
Split up TorchToLinalg.cpp This helps keep things organized and also exposes more parallelism to the build system. It seems though that most of the compile time is actually spent in the headers though, so the wall time doesn't decrease as much as I had hoped (and now that the headers are being included multiple times, the cpu time actually increases a lot, sadly -- will try to dig into this). 2022-03-11 01:54:13 +08:00			`}`