//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Also available under a BSD-style license. See LICENSE.
//
//===----------------------------------------------------------------------===//

#include "torch-mlir/Conversion/TorchToLinalg/TorchToLinalg.h"

#include "../PassDetail.h"
#include "PopulatePatterns.h"
#include "Utils.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/IR/Matchers.h"
#include "torch-mlir/Conversion/Utils/Utils.h"
#include "torch-mlir/Dialect/Torch/IR/TorchDialect.h"
#include "torch-mlir/Dialect/Torch/IR/TorchOps.h"
#include "torch-mlir/Dialect/Torch/Utils/TorchUpstream.h"
#include "torch-mlir/Dialect/Torch/Utils/Utils.h"
#include "torch-mlir/Dialect/TorchConversion/IR/TorchConversionOps.h"

using namespace mlir;
using namespace mlir::torch;
using namespace mlir::torch::Torch;

namespace {
// TODO: Dropout should probably be handled in DecomposeComplexOps instead of
// here.
class ConvertAtenDropoutOp : public OpConversionPattern<AtenDropoutOp> {
public:
  using OpConversionPattern::OpConversionPattern;
  LogicalResult
  matchAndRewrite(AtenDropoutOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    if (failed(verifyLinalgCompatibleTypes(op, rewriter)))
      return failure();

    bool train;
    if (!matchPattern(op.train(), m_TorchConstantBool(&train)))
      return rewriter.notifyMatchFailure(
          op, "Expected train to be a constant bool.");
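
    // Training-mode dropout requires random masking, which this lowering does
    // not implement, so bail out and leave the op to other patterns. In
    // inference mode dropout is the identity function, so the op reduces to a
    // cast of the input to the converted result type.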
    if (train)
      return failure();
    auto resultType = getTypeConverter()
                          ->convertType(op->getResult(0).getType())
                          .cast<RankedTensorType>();
    rewriter.replaceOpWithNewOp<tensor::CastOp>(op, resultType,
                                                adaptor.input());
    return success();
  }
};
} // namespace
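
// Lower `ValsemVariantAtenUniformOp` to a `linalg.generic` op whose payload
// fills each element with a pseudo-random value drawn uniformly from
// [from, to). The randomness comes from a linear congruential generator (LCG)
// that starts from the global default generator's next seed and mixes in the
// element's index, so each element receives a distinct value.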
namespace {
class ConvertValsemVariantAtenUniformOp
    : public OpConversionPattern<ValsemVariantAtenUniformOp> {
public:
  using OpConversionPattern::OpConversionPattern;
  LogicalResult
  matchAndRewrite(ValsemVariantAtenUniformOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    if (failed(verifyLinalgCompatibleTypes(op, rewriter)))
      return failure();
    Location loc = op.getLoc();
    Value self = adaptor.self();
    Value from = adaptor.from();
    Value to = adaptor.to();
    Value generator = adaptor.generator();
    RankedTensorType resultType = self.getType().cast<RankedTensorType>();
    Type elemTy = resultType.getElementType();

    if (!elemTy.isa<mlir::FloatType>())
      return rewriter.notifyMatchFailure(op,
                                         "This op only supports float type");

    if (!generator.getType().isa<Torch::NoneType>())
      return rewriter.notifyMatchFailure(
          op, "The generator has to be None because only the global default "
              "generator is supported");

    // Build the core formula of the LCG algorithm, which mixes in the element
    // index. For an output tensor of rank N:
    //   temp1 = (cast(i64, index(D.0)) + seed) * multiplier + incrementStep
    //   ...
    //   tempN = (cast(i64, index(D.(N-1))) + temp(N-1)) * multiplier
    //           + incrementStep
    // Refer to https://reviews.llvm.org/D101364.
    // The values of multiplier and incrementStep are taken from
    // https://en.wikipedia.org/wiki/Linear_congruential_generator for m = 2^64.
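    // For example, for a rank-2 result the payload computes
    //   temp1 = (index0 + seed)  * multiplier + incrementStep
    //   temp2 = (index1 + temp1) * multiplier + incrementStep
    // and temp2 is the pseudo-random state for element (index0, index1).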
|
|
|
|
Value multiplier = rewriter.create<arith::ConstantOp>(
|
|
|
|
loc, rewriter.getI64IntegerAttr(6364136223846793005));
|
|
|
|
Value incrementStep = rewriter.create<arith::ConstantOp>(
|
|
|
|
loc, rewriter.getI64IntegerAttr(1442695040888963407));
|
|
|
|
// Tn = (index + Tn-1) * multiplier + incrementStep
|
|
|
|
auto getNextTemp = [&](OpBuilder &b, Value index, Value temp) {
|
|
|
|
Value castIndex =
|
|
|
|
b.create<arith::IndexCastOp>(loc, b.getI64Type(), index);
|
|
|
|
Value add = b.create<arith::AddIOp>(loc, castIndex, temp);
|
|
|
|
Value mult = b.create<arith::MulIOp>(loc, add, multiplier);
|
|
|
|
return b.create<arith::AddIOp>(loc, mult, incrementStep);
|
|
|
|
};

    // Get initial seed, min and max used by `linalg.generic` compute payload.
    Value initialSeed = rewriter.create<TorchConversion::GetNextSeedOp>(loc);
    Value min = convertScalarToDtype(rewriter, loc, from, elemTy);
    Value max = convertScalarToDtype(rewriter, loc, to, elemTy);

    // Construct the `linalg.generic` op.
    auto resultRank = resultType.getRank();
    SmallVector<AffineMap, 1> indexingMaps(
        1, rewriter.getMultiDimIdentityMap(resultRank));
    SmallVector<StringRef> iteratorTypes(resultRank,
                                         getParallelIteratorTypeName());
    SmallVector<Value> sizes = getTensorSizes(rewriter, loc, self);
    Value initTensor =
        rewriter.create<linalg::InitTensorOp>(loc, sizes, elemTy);
    Value uniformRes =
        rewriter
            .create<linalg::GenericOp>(
                loc, initTensor.getType(), /*inputs=*/ValueRange{},
                /*outputs=*/initTensor, indexingMaps, iteratorTypes,
                [&](OpBuilder &b, Location loc, ValueRange args) {
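                  // Chain the LCG through every dimension index so that each
                  // element of the output gets its own pseudo-random i64
                  // state.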
                  Value temp = initialSeed;
                  for (int i = 0; i < resultRank; i++) {
                    Value index = b.create<linalg::IndexOp>(loc, i);
                    temp = getNextTemp(b, index, temp);
                  }
                  // scale = (max - min) * const(F64, 5.4210108E-20),
                  // which is derived from rand(min, max) =
                  // rand() / (RAND_MAX / (max - min)) where RAND_MAX = 2^64 - 1.
                  Value epsilon = b.create<arith::ConstantOp>(
                      loc, b.getFloatAttr(min.getType(), 5.4210108E-20));
                  Value range = b.create<arith::SubFOp>(loc, max, min);
                  Value scale = b.create<arith::MulFOp>(loc, range, epsilon);

                  // res = cast(F64, tempN) * scale + min
                  Value updateFloat =
                      b.create<arith::UIToFPOp>(loc, elemTy, temp);
                  Value updateScaled =
                      b.create<arith::MulFOp>(loc, updateFloat, scale);
                  Value res = b.create<arith::AddFOp>(loc, updateScaled, min);
                  b.create<linalg::YieldOp>(loc, res);
                })
            .getResult(0);
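
    // The shape of the generated tensor is computed from `self` at runtime, so
    // its type may have dynamic dimensions; cast it to the converted result
    // type before replacing the op.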
    Type newResultType = getTypeConverter()->convertType(op.getType());
    rewriter.replaceOpWithNewOp<tensor::CastOp>(op, newResultType, uniformRes);
    return success();
  }
};
} // namespace
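
// Register the rewrite patterns defined above and mark the corresponding ops
// illegal so the conversion driver must convert every occurrence.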
void mlir::torch::torch_to_linalg::populateRandomPatternsAndLegality(
    TypeConverter &typeConverter, RewritePatternSet &patterns,
    ConversionTarget &target) {
  MLIRContext *context = patterns.getContext();
  target.addIllegalOp<AtenDropoutOp>();
  patterns.add<ConvertAtenDropoutOp>(typeConverter, context);
  target.addIllegalOp<ValsemVariantAtenUniformOp>();
  patterns.add<ConvertValsemVariantAtenUniformOp>(typeConverter, context);
}