//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Also available under a BSD-style license. See LICENSE.
//
//===----------------------------------------------------------------------===//

#include "torch-mlir/Conversion/TorchToLinalg/TorchToLinalg.h"

#include "../PassDetail.h"
#include "PopulatePatterns.h"
#include "Utils.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/IR/Matchers.h"
#include "torch-mlir/Conversion/Utils/Utils.h"
#include "torch-mlir/Dialect/Torch/IR/TorchDialect.h"
#include "torch-mlir/Dialect/Torch/IR/TorchOps.h"
#include "torch-mlir/Dialect/Torch/Utils/TorchUpstream.h"
#include "torch-mlir/Dialect/Torch/Utils/Utils.h"

using namespace mlir;
using namespace mlir::torch;
using namespace mlir::torch::Torch;

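// Shared payload builder for the gather-style lowerings below (aten.gather and
// aten.embedding). It computes the extraction indices for the current output
// element, emits runtime asserts that `index` lies in [0, input.sizes[dim]),
// and yields the extracted element.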
static void createLinalgPayloadCalculationForGatherOps(
    OpBuilder &b, Location loc, Value input, int64_t inputRank, Value index,
    int64_t dim, int64_t outputRank) {
  SmallVector<Value> indices;
  for (int i = 0; i < inputRank; i++) {
    if (i == dim) {
      indices.push_back(castIntToIndex(b, loc, index));
    } else {
      // `outputRank` might be larger than `inputRank`. `linalg::IndexOp` takes
      // the dimension of the *output*, so add `inputDimOffset` to map input
      // dimensions greater than the given `dim` to the corresponding output
      // dimension.
      int64_t inputDimOffset = i < dim ? 0 : outputRank - inputRank;
      indices.push_back(b.create<linalg::IndexOp>(loc, i + inputDimOffset));
    }
  }

  // Assert index < input.sizes[dim]
  Value indexLTInputDim = b.create<arith::CmpIOp>(
      loc, arith::CmpIPredicate::slt, castIntToIndex(b, loc, index),
      getDimOp(b, loc, input, dim));
  b.create<cf::AssertOp>(
      loc, indexLTInputDim,
      b.getStringAttr("index must be smaller than dim size"));

  // Assert index >= 0
  Value cst0 = b.create<arith::ConstantOp>(loc, b.getZeroAttr(index.getType()));
  Value indexGEThanZero =
      b.create<arith::CmpIOp>(loc, arith::CmpIPredicate::sge, index, cst0);
  b.create<cf::AssertOp>(loc, indexGEThanZero,
                         b.getStringAttr("index must be larger or equal to 0"));

  Value extract = b.create<tensor::ExtractOp>(loc, input, indices);
  b.create<linalg::YieldOp>(loc, extract);
}

namespace {
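// Converts aten.gather to a linalg.generic that iterates over the index tensor
// (same rank as the result) and, for each output element, extracts
// self[..., index[...], ...] along `dim` using the shared gather payload
// above.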
class ConvertAtenGatherOp : public OpConversionPattern<AtenGatherOp> {
public:
  using OpConversionPattern::OpConversionPattern;
  LogicalResult
  matchAndRewrite(AtenGatherOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    if (failed(verifyLinalgCompatibleTypes(op, rewriter)))
      return failure();
    Location loc = op->getLoc();

    Value dimValue = op.dim();
    int64_t dim;
    if (!matchPattern(dimValue, m_TorchConstantInt(&dim)))
      return op.emitError("unimplemented: dim is not constant");

    Value indices = adaptor.index();
    Value self = adaptor.self();
    RankedTensorType newResultTy =
        getTypeConverter()->convertType(op.getType()).cast<RankedTensorType>();
    int64_t rank = newResultTy.getRank();

    SmallVector<Value> sizes = getTensorSizes(rewriter, loc, indices);
    Value result = createZeroInitTensor(rewriter, loc, sizes,
                                        newResultTy.getElementType());

    SmallVector<AffineMap, 2> affineMaps(
        2, rewriter.getMultiDimIdentityMap(rank));
    SmallVector<StringRef> iteratorTypes(rank, getParallelIteratorTypeName());
    auto genericOp = rewriter
                         .create<linalg::GenericOp>(
                             loc, result.getType(), indices, result, affineMaps,
                             iteratorTypes,
                             [&](OpBuilder &b, Location loc, ValueRange args) {
                               auto index = args[0];
                               createLinalgPayloadCalculationForGatherOps(
                                   b, loc, self, rank, index, dim, rank);
                             })
                         .getResult(0);
    rewriter.replaceOpWithNewOp<tensor::CastOp>(op, newResultTy, genericOp);
    return success();
  }
};
} // namespace

namespace {
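// Converts aten.embedding: `weight` must be a rank-2 tensor, and the result
// has shape indices.sizes + [embeddingDim], with
// result[..., d] = weight[indices[...], d]. The row lookup reuses the shared
// gather payload with dim = 0.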
class ConvertAtenEmbeddingOp : public OpConversionPattern<AtenEmbeddingOp> {
public:
  using OpConversionPattern::OpConversionPattern;
  LogicalResult
  matchAndRewrite(AtenEmbeddingOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    if (failed(verifyLinalgCompatibleTypes(op, rewriter)))
      return failure();
    Location loc = op->getLoc();
    Value weight = adaptor.weight();
    Value indices = adaptor.indices();
    RankedTensorType newResultType =
        typeConverter->convertType(op.getType()).cast<RankedTensorType>();

    auto weightTy = weight.getType().cast<RankedTensorType>();
    if (weightTy.getRank() != 2)
      return rewriter.notifyMatchFailure(op, "weight must be rank 2");
    Value embeddingDim = getDimOp(rewriter, loc, weight, 1);
    Type elemTy = weightTy.getElementType();

    SmallVector<Value> sizes = getTensorSizes(rewriter, loc, indices);
    sizes.push_back(embeddingDim);
    int64_t resultRank = sizes.size();

    auto indicesTy = indices.getType().cast<RankedTensorType>();
    int64_t indicesRank = indicesTy.getRank();
    SmallVector<AffineExpr> indicesExprs;
    for (int i = 0; i < indicesRank; i++)
      indicesExprs.push_back(rewriter.getAffineDimExpr(i));
    auto indicesAffineMap = AffineMap::get(
        /*dimCount=*/resultRank,
        /*symbolCount=*/0, indicesExprs, op->getContext());
    SmallVector<AffineMap, 2> indexingMaps = {
        indicesAffineMap,
        rewriter.getMultiDimIdentityMap(resultRank),
    };
    SmallVector<StringRef> iteratorTypes(sizes.size(),
                                         getParallelIteratorTypeName());
    Value initTensor =
        rewriter.create<linalg::InitTensorOp>(loc, sizes, elemTy);
    Value embeddingResult =
        rewriter
            .create<linalg::GenericOp>(
                loc, initTensor.getType(), indices, initTensor,
                /*indexingMaps=*/indexingMaps, /*iteratorTypes=*/iteratorTypes,
                [&](OpBuilder &b, Location loc, ValueRange args) {
                  Value index = args[0];
                  createLinalgPayloadCalculationForGatherOps(
                      b, loc, weight, weightTy.getRank(), index, /*dim=*/0,
                      resultRank);
                })
            .getResult(0);
    rewriter.replaceOpWithNewOp<tensor::CastOp>(op, newResultType,
                                                embeddingResult);
    return success();
  }
};
} // namespace

namespace {
// Example: given an input tensor of size [4, 5, 6] initialized with some
// random values, a 1-d index tensor [0, 2] of size [2], and an integer
// argument dim = 1, the output tensor will have size [4, 2, 6]. The approach
// is as follows:
//
// for i in range(input.size[0])
//    for j in range(index.size[0])
//       for k in range(input.size[2])
//          indexValue = index[j]
//          output[i,j,k] = input[i,indexValue,k]
class ConvertAtenIndexSelectOp
    : public OpConversionPattern<AtenIndexSelectOp> {
public:
  using OpConversionPattern::OpConversionPattern;
  LogicalResult
  matchAndRewrite(AtenIndexSelectOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    if (failed(verifyLinalgCompatibleTypes(op, rewriter)))
      return failure();

    Location loc = op.getLoc();
    Value input = adaptor.self();
    Value indices = adaptor.index();
    RankedTensorType inputType = input.getType().cast<RankedTensorType>();
    RankedTensorType resultType = getTypeConverter()
                                      ->convertType(op->getResult(0).getType())
                                      .cast<RankedTensorType>();
    Type elementType = resultType.getElementType();
    unsigned inputRank = inputType.getRank();

    int64_t dimInt;
    if (!matchPattern(op.dim(), m_TorchConstantInt(&dimInt)))
      return op->emitError("unimplemented: dim is not constant");

    SmallVector<Value> resultShape = getTensorSizes(rewriter, loc, input);
    resultShape[dimInt] = getTensorSizes(rewriter, loc, indices)[0];
    Value initTensor =
        rewriter.create<linalg::InitTensorOp>(loc, resultShape, elementType);

    SmallVector<AffineExpr> resultExpr;
    AffineExpr indicesExpr = rewriter.getAffineDimExpr(dimInt);
    SmallVector<StringRef> iteratorTypes;

    for (unsigned i = 0; i < inputRank; i++) {
      resultExpr.push_back(rewriter.getAffineDimExpr(i));
      iteratorTypes.push_back(getParallelIteratorTypeName());
    }

    auto indexingMaps = AffineMap::inferFromExprList({indicesExpr, resultExpr});

    Value finalRes =
        rewriter
            .create<linalg::GenericOp>(
                loc, initTensor.getType(), ValueRange{indices}, initTensor,
                /*indexingMaps=*/indexingMaps,
                /*iteratorTypes=*/iteratorTypes,
                [&](OpBuilder &b, Location loc, ValueRange args) {
                  Value index = rewriter.create<arith::IndexCastOp>(
                      loc, rewriter.getIndexType(), args[0]);
                  SmallVector<Value> indexTarget;
                  for (unsigned i = 0; i < inputRank; i++)
                    indexTarget.push_back(b.create<linalg::IndexOp>(loc, i));
                  indexTarget[dimInt] = index;
                  Value extractedElement =
                      b.create<tensor::ExtractOp>(loc, input, indexTarget);
                  b.create<linalg::YieldOp>(loc, extractedElement);
                })
            .getResult(0);

    rewriter.replaceOpWithNewOp<tensor::CastOp>(op, resultType, finalRes);
    return success();
  }
};
} // namespace

namespace {
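// Converts aten.index.Tensor for the restricted case handled below: a single,
// non-None index tensor that indexes dimension 0 of the input. The result is
//
//   output[i0, ..., im, j1, ..., jn-1] = input[index[i0, ..., im], j1, ..., jn-1]
//
// where the index tensor has rank m+1 and the input has rank n.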
class ConvertAtenIndexTensorOp : public OpConversionPattern<AtenIndexTensorOp> {
public:
  using OpConversionPattern::OpConversionPattern;
  LogicalResult
  matchAndRewrite(AtenIndexTensorOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    if (failed(verifyLinalgCompatibleTypes(op, rewriter)))
      return failure();

    Location loc = op.getLoc();
    Value input = adaptor.self();
    Value indices = op.indices();
    SmallVector<Value> indicesTuple;
    if (!getListConstructElements(indices, indicesTuple)) {
      return rewriter.notifyMatchFailure(
          op, "unimplemented: the indices list is not from a list construct");
    }
    if (indicesTuple.size() != 1) {
      return rewriter.notifyMatchFailure(
          op, "unimplemented: only one index tensor is supported");
    }

    SmallVector<Value> indicesVal =
        getTypeConvertedValues(rewriter, loc, getTypeConverter(), indicesTuple);
    Value indexTensor = indicesVal[0];
    if (failed(checkNotNone(rewriter, op, indexTensor))) {
      return rewriter.notifyMatchFailure(
          op, "unimplemented: index tensor must not be None");
    }

    RankedTensorType inputType = input.getType().cast<RankedTensorType>();
    RankedTensorType indexTensorType =
        indexTensor.getType().cast<RankedTensorType>();
    RankedTensorType resultType = getTypeConverter()
                                      ->convertType(op->getResult(0).getType())
                                      .cast<RankedTensorType>();
    Type elementType = resultType.getElementType();
    int inputRank = inputType.getRank();
    int indexTensorRank = indexTensorType.getRank();

    // This result shape calculation assumes that there is only one
    // index tensor and that it is indexing the first dimension of the
    // input tensor. The calculation for arbitrary inputs is much more complex.
    SmallVector<Value> resultShape;
    for (auto i : llvm::seq(0, indexTensorRank)) {
      resultShape.push_back(getDimOp(rewriter, loc, indexTensor, i));
    }
    for (auto i : llvm::seq(1, inputRank)) {
      resultShape.push_back(getDimOp(rewriter, loc, input, i));
    }
    int resultRank = resultShape.size();

    Value initTensor =
        rewriter.create<linalg::InitTensorOp>(loc, resultShape, elementType);
    SmallVector<AffineExpr> indicesExpr, resultExpr;
    SmallVector<StringRef> iteratorTypes;

    for (auto i : llvm::seq(0, indexTensorRank))
      indicesExpr.push_back(rewriter.getAffineDimExpr(i));
    for (auto i : llvm::seq(0, resultRank)) {
      resultExpr.push_back(rewriter.getAffineDimExpr(i));
      iteratorTypes.push_back(getParallelIteratorTypeName());
    }
    auto indexingMaps = AffineMap::inferFromExprList({indicesExpr, resultExpr});

    Value finalRes =
        rewriter
            .create<linalg::GenericOp>(
                loc, initTensor.getType(), indexTensor, initTensor,
                indexingMaps, iteratorTypes,
                [&](OpBuilder &b, Location loc, ValueRange args) {
                  SmallVector<Value> extractionIndices{
                      castIntToIndex(b, loc, args[0])};
                  for (auto i : llvm::seq(1, inputRank)) {
                    extractionIndices.push_back(b.create<linalg::IndexOp>(
                        loc, i + indexTensorRank - 1));
                  }
                  Value extractedElement = b.create<tensor::ExtractOp>(
                      loc, input, extractionIndices);
                  b.create<linalg::YieldOp>(loc, extractedElement);
                })
            .getResult(0);

    rewriter.replaceOpWithNewOp<tensor::CastOp>(op, resultType, finalRes);
    return success();
  }
};
} // namespace

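// Pattern registration: marks the ops handled above as illegal and adds the
// corresponding conversion patterns.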
void mlir::torch::torch_to_linalg::
    populateIndirectDataMovementPatternsAndLegality(
        TypeConverter &typeConverter, RewritePatternSet &patterns,
        ConversionTarget &target) {
  MLIRContext *context = patterns.getContext();
  target.addIllegalOp<AtenGatherOp>();
  patterns.add<ConvertAtenGatherOp>(typeConverter, context);
  target.addIllegalOp<AtenEmbeddingOp>();
  patterns.add<ConvertAtenEmbeddingOp>(typeConverter, context);
  target.addIllegalOp<AtenIndexSelectOp>();
  patterns.add<ConvertAtenIndexSelectOp>(typeConverter, context);
  target.addIllegalOp<AtenIndexTensorOp>();
  patterns.add<ConvertAtenIndexTensorOp>(typeConverter, context);
}