//===----------------------------------------------------------------------===// //// // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // Also available under a BSD-style license. See LICENSE. // //===----------------------------------------------------------------------===// #include "torch-mlir/Conversion/TorchToTosa/TorchToTosa.h" #include "../PassDetail.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Tosa/IR/TosaOps.h" #include "mlir/IR/Matchers.h" #include "mlir/Transforms/DialectConversion.h" #include "torch-mlir/Conversion/TorchToTosa/TosaLegalizeCommon.h" #include "torch-mlir/Conversion/TorchToTosa/TosaLegalizeUtils.h" #include "torch-mlir/Conversion/Utils/Utils.h" #include "torch-mlir/Dialect/Torch/IR/TorchDialect.h" #include "torch-mlir/Dialect/Torch/IR/TorchOps.h" #include "torch-mlir/Dialect/Torch/IR/TorchTypes.h" #include "torch-mlir/Dialect/Torch/Utils/Utils.h" #include "torch-mlir/Dialect/TorchConversion/Transforms/BackendTypeConversion.h" #include "llvm/ADT/TypeSwitch.h" #include #include #include using namespace mlir; using namespace mlir::torch; using namespace mlir::torch::Torch; namespace { // These legalizations are for unary ops with only for floating point datatypes. // There is no supported quantized integer mode for these. template class ConvertAtenUnaryFPOnlyOp : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; using OpAdaptor = typename AtenOpT::Adaptor; LogicalResult matchAndRewrite(AtenOpT op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { Value self = adaptor.getSelf(); auto selfTy = cast(self.getType()); if (!selfTy) return rewriter.notifyMatchFailure(op, "Only Tensor types supported in TOSA"); if (isa(selfTy.getElementType())) { rewriter.replaceOpWithNewOp( op, OpConversionPattern::getTypeConverter()->convertType( op.getType()), self); return success(); } else { return rewriter.notifyMatchFailure( op, "Only floating-point datatype legalization supported"); } } }; // These unary op legalizations are identical for floating-point // or quantized types template class ConvertAtenUnaryOp : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; using OpAdaptor = typename AtenOpT::Adaptor; LogicalResult matchAndRewrite(AtenOpT op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { rewriter.replaceOpWithNewOp( op, OpConversionPattern::getTypeConverter()->convertType( op.getType()), adaptor.getSelf()); return success(); } }; // These binary op legalizations are identical for floating-point // or quantized types template class ConvertAtenBinaryOp : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; using OpAdaptor = typename AtenOpT::Adaptor; LogicalResult matchAndRewrite(AtenOpT op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { Value lhs = adaptor.getSelf(); auto lhsTy = cast(lhs.getType()); Value rhs = adaptor.getOther(); auto rhsTy = cast(rhs.getType()); if (!lhsTy || !rhsTy) return rewriter.notifyMatchFailure(op, "Only Tensor types supported in TOSA"); auto outTy = cast( OpConversionPattern::getTypeConverter()->convertType( op.getType())); Value binaryOp; // TOSA ArithmeticRightShiftOp has a round parameter. if constexpr (std::is_same()) { binaryOp = rewriter.create(op->getLoc(), outTy, lhs, rhs, /*round=*/false); } else { binaryOp = tosa::createBinaryOpAndCast(rewriter, op, outTy, lhs, rhs); } rewriter.replaceOp(op, binaryOp); return success(); } }; template static bool isInValidRange(bool isFloat, const double &doubleValue, bool isInt, const int64_t &intValue) { if (isFloat) { return (doubleValue >= static_cast(std::numeric_limits::min())) && (doubleValue <= static_cast(std::numeric_limits::max())); } else if (isInt) { return (intValue >= static_cast(std::numeric_limits::min())) && (intValue <= static_cast(std::numeric_limits::max())); } return false; } // FIXME: This will eventually go into a Tosa*Utils file. LogicalResult torchScalarToTosaTensor(ConversionPatternRewriter &rewriter, Operation *op, Value torchScalarValue, Value &tosaTensor, Type dtype, llvm::ArrayRef dshape) { // Retrieve a const float or int value but create the out Tensor with dtype. double doubleValue; auto isFloat = matchPattern(torchScalarValue, m_TorchConstantFloat(&doubleValue)); int64_t intValue; auto isInt = matchPattern(torchScalarValue, m_TorchConstantInt(&intValue)); if (!isFloat && !isInt) return rewriter.notifyMatchFailure(op, "Unable to extract the scalar constant"); int64_t numElem = 1; for (int64_t dim : dshape) numElem *= dim; if (isa(dtype)) { tosaTensor = tosa::getConstTensor( rewriter, op, SmallVector(numElem, (isFloat ? doubleValue : intValue)), dshape, dtype) .value(); } else if (auto intType = dyn_cast(dtype)) { auto width = intType.getWidth(); if (width != 1 && width != 8 && width != 32 && width != 64) return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) { diag << "Unsupported integer type: " << intType; }); if (width == 1) { if (!isInValidRange(isFloat, doubleValue, isInt, intValue)) { return rewriter.notifyMatchFailure( op, "Supplied value of scalar constant exceeds limits " "of destination type"); } bool d = isFloat ? static_cast(doubleValue) : static_cast(intValue); tosaTensor = tosa::getConstTensor( rewriter, op, SmallVector(numElem, d), dshape) .value(); } else if (width == 8) { if (!isInValidRange(isFloat, doubleValue, isInt, intValue)) { return rewriter.notifyMatchFailure( op, "Supplied value of scalar constant exceeds limits " "of destination type"); } int8_t d = isFloat ? static_cast(doubleValue) : static_cast(intValue); tosaTensor = tosa::getConstTensor( rewriter, op, SmallVector(numElem, d), dshape) .value(); } else if (width == 32) { if (!isInValidRange(isFloat, doubleValue, isInt, intValue)) { return rewriter.notifyMatchFailure( op, "Supplied value of scalar constant exceeds limits " "of destination type"); } int32_t d = isFloat ? static_cast(doubleValue) : static_cast(intValue); tosaTensor = tosa::getConstTensor( rewriter, op, SmallVector(numElem, d), dshape) .value(); } else if (width == 64) { if (!isInValidRange(isFloat, doubleValue, isInt, intValue)) { return rewriter.notifyMatchFailure( op, "Supplied value of scalar constant exceeds limits " "of destination type"); } int64_t d = (isFloat ? static_cast(doubleValue) : intValue); tosaTensor = tosa::getConstTensor( rewriter, op, SmallVector(numElem, d), dshape) .value(); } } else { return rewriter.notifyMatchFailure(op, "Usupported element type"); } return success(); } LogicalResult torchAlphaToTosaTensor(ConversionPatternRewriter &rewriter, Operation *op, Value alphaScalar, Value &alphaTensor, Type dtype, bool checkForUnity) { if (succeeded(torchScalarToTosaTensor(rewriter, op, alphaScalar, alphaTensor, dtype, {}))) return success(); // `alpha` has not been specified. int64_t alphaValue; if (!matchPattern(alphaScalar, m_TorchConstantInt(&alphaValue))) return rewriter.notifyMatchFailure( op, "Currently only scalar constants are supported for " "alpha in TOSA operation"); // When no alpha has been specified, this must be 1. if (checkForUnity && alphaValue != 1) return rewriter.notifyMatchFailure(op, "Unsupported integer value for alpha"); alphaTensor = tosa::getConstTensor( rewriter, op, {static_cast(alphaValue)}, {}, dtype) .value(); return success(); } // These binary op legalizations are specific to add/sub which have an // alpha multiplier. template class ConvertAtenAddSubOp : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; using OpAdaptor = typename AtenOpT::Adaptor; LogicalResult matchAndRewrite(AtenOpT op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { // left : tensor: tensor // right : scalar: i32/i64/f32 // tensor: tensor // alpha : scalar: i32/i64/f32 // output: tensor: tensor Value lhs = adaptor.getSelf(); auto lhsType = dyn_cast(lhs.getType()); Value rhs = adaptor.getOther(); auto rhsType = dyn_cast(rhs.getType()); if (!lhsType) return rewriter.notifyMatchFailure(op, "Only Tensor types supported in TOSA"); if (auto lhsElemTy = dyn_cast(lhsType.getElementType())) { if (lhsElemTy.getWidth() > 64) return rewriter.notifyMatchFailure( op, "Integers with widths greater than 64 are not supported"); } // Get output type: tensor auto outType = cast( OpConversionPattern::getTypeConverter()->convertType( op.getType())); Type outElemTy = outType.getElementType(); if (!outElemTy.isIntOrFloat()) { return rewriter.notifyMatchFailure( op, "Only floating-point or integer datatype legalization supported"); } Type rhsAlphaMulElemType; if (isa(outElemTy)) { rhsAlphaMulElemType = outElemTy; } else { // if output type is 64, input type should also be 32 rhsAlphaMulElemType = rewriter.getIntegerType(32); } // if right is scalar, rhgType==None, which need to be manually cast to // TensorType else right is tensor, rhsType==tensor Value rhsAsTensor; if (!rhsType) { if (failed(torchScalarToTosaTensor(rewriter, op, op.getOther(), rhsAsTensor, rhsAlphaMulElemType, {}))) return rewriter.notifyMatchFailure( op, "Currently only scalar constants are supported for " "conversion in TOSA operation"); } else if (rhsType.getElementType() != rhsAlphaMulElemType) { // right is tensor, rhsType == tensor // right must be cast to same type as the alpha, so MulOp success rhs = rewriter.create( op->getLoc(), RankedTensorType::get(rhsType.getShape(), rhsAlphaMulElemType), rhs); // reinitialize right value type to tensor rhsType = dyn_cast(rhs.getType()); } auto rhsTensor = rhsType ? rhs : rhsAsTensor; // Handle scalar value alpha. // It should be either f32/i32 Value alphaTensor; if (failed(torchAlphaToTosaTensor(rewriter, op.getOperation(), op.getAlpha(), alphaTensor, rhsAlphaMulElemType, /*checkForUnity=*/false))) { return rewriter.notifyMatchFailure( op, "Currently only scalar constants are supported for " "alpha in conversion to TOSA operation"); } auto mulAlphaOp = tosa::createMulOpAndCast( rewriter, op, rhsType ? rhsType : RankedTensorType::get({}, rhsAlphaMulElemType), rhsTensor, alphaTensor, /*shift=*/0); if (outElemTy.isInteger(64)) { // Tosa doesn't support 64-bit elementwise addition and subtraction. // if outElemTy tensor, mulTensor must be tensor, // left value could be tensor type, cast left value to // tensor type auto addOrSubi64Op = tosa::createBinaryOpAndCast( rewriter, op, RankedTensorType::get(outType.getShape(), rhsAlphaMulElemType), lhs, mulAlphaOp); // cast tensor back to tensor rewriter.replaceOpWithNewOp(op, outType, addOrSubi64Op); return success(); } auto binaryOp = tosa::createBinaryOpAndCast(rewriter, op, outType, lhs, mulAlphaOp); rewriter.replaceOp(op, binaryOp.getResult()); return success(); } }; // namespace // Binary op legalizations for comparator ops. template class ConvertAtenCompareOp : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; using OpAdaptor = typename AtenOpT::Adaptor; LogicalResult matchAndRewrite(AtenOpT op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { Value lhs = adaptor.getSelf(); auto lhsTy = dyn_cast(lhs.getType()); Value rhs = adaptor.getOther(); auto rhsTy = dyn_cast(rhs.getType()); if (!lhsTy) return rewriter.notifyMatchFailure(op, "Only Tensor types supported in TOSA"); auto lhsElemTy = lhsTy.getElementType(); if (!lhsElemTy.isIntOrFloat()) return rewriter.notifyMatchFailure( op, "Only floating-point or integer datatype legalization supported"); // For bitwise operators, only integer datatype legalization is supported constexpr bool isBitwiseOp = std::is_same() || std::is_same() || std::is_same() || std::is_same(); if (isa(lhsElemTy) && isBitwiseOp) { return rewriter.notifyMatchFailure(op, "For bitwise operators, only integer " "datatype legalization is supported"); } Value rhsAsTensor; if (!rhsTy) { if (failed(torchScalarToTosaTensor(rewriter, op, op.getOther(), rhsAsTensor, lhsElemTy, {}))) return rewriter.notifyMatchFailure( op, "Currently only scalar constants are supported for " "conversion in TOSA operation"); } auto rhsTensor = rhsTy ? rhs : rhsAsTensor; // There is no Lesser operator in TOSA. constexpr auto swapLhsRhs = (std::is_same() || std::is_same() || std::is_same() || std::is_same()); // Promote lhs and rhs dtypes for bitwise operators. TensorType resultTy = cast( OpConversionPattern::getTypeConverter()->convertType( op.getType())); if (isBitwiseOp) { lhs = tosa::promoteType(rewriter, lhs, resultTy); rhsTensor = tosa::promoteType(rewriter, rhsTensor, resultTy); } auto resultOp = rewriter.create(op.getLoc(), resultTy, (swapLhsRhs ? rhsTensor : lhs), (swapLhsRhs ? lhs : rhsTensor)); // There is no NE operator in TOSA. if constexpr (std::is_same() || std::is_same()) { rewriter.replaceOpWithNewOp(op, resultTy, resultOp.getResult()); } else { rewriter.replaceOp(op, resultOp.getResult()); } return success(); } }; // Binary op legalizations for Mul variants. template class ConvertAtenMulOp : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; using OpAdaptor = typename AtenOpT::Adaptor; LogicalResult matchAndRewrite(AtenOpT op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { Value lhs = adaptor.getSelf(); auto lhsType = dyn_cast(lhs.getType()); if (!lhsType) return rewriter.notifyMatchFailure(op, "Only Tensor types supported in TOSA"); auto outType = cast( OpConversionPattern::getTypeConverter()->convertType( op.getType())); Type outElemTy = outType.getElementType(); if (!outElemTy.isIntOrFloat()) return rewriter.notifyMatchFailure( op, "Only floating-point or integer datatype legalization supported"); Value rhsTensor; if constexpr (std::is_same()) { rhsTensor = lhs; } else { Value rhsAsTensor; Value rhs = adaptor.getOther(); auto rhsType = dyn_cast(rhs.getType()); if (!rhsType) { if (failed(torchScalarToTosaTensor(rewriter, op, op.getOther(), rhsAsTensor, outElemTy, {}))) { return rewriter.notifyMatchFailure( op, "Currently only scalar constants are supported for " "conversion in TOSA operation"); } } rhsTensor = rhsType ? rhs : rhsAsTensor; } if (isa(outElemTy) || isa(outElemTy)) { auto outType = cast( OpConversionPattern::getTypeConverter()->convertType( op.getType())); auto mulOp = tosa::createMulOpAndCast(rewriter, op, outType, lhs, rhsTensor, /*shift=*/0); rewriter.replaceOp(op, mulOp.getResult()); return success(); } // Quantized multiplication may need to rescale inputs. return rewriter.notifyMatchFailure( op, "Only floating-point or integer datatype " "legalization currently supported"); } }; // Function to perform division with trunc rounding mode (rounding result // towards zero) for float type inputs. // This function takes in the division result between lhs and rhs rather // than takes in the original lhs and rhs tensors as parameters. Value truncFloatDivWithDivResult(PatternRewriter &rewriter, Operation *op, TensorType outType, Value divResult) { // To implement trunc mode for float inputs, multiply the floored abs // of the tensor with the elementwise signedness of the tensor. // div_result = lhs / rhs // trunc_val = floor(abs(div_result)) * sign(div_result) auto zero = tosa::getConstTensor(rewriter, op, 0, {}, outType.getElementType()) .value(); auto one = tosa::getConstTensor(rewriter, op, 1, {}, outType.getElementType()) .value(); auto minusOne = tosa::getConstTensor(rewriter, op, -1, {}, outType.getElementType()) .value(); auto cond = rewriter.create( op->getLoc(), RankedTensorType::get(outType.getShape(), rewriter.getIntegerType(1)), divResult, zero); auto selectOp = rewriter.create(op->getLoc(), outType, cond, one, minusOne); auto absDivResult = rewriter.create(op->getLoc(), outType, divResult); auto flooredAbsDivResult = rewriter.create(op->getLoc(), outType, absDivResult); Value result = tosa::createMulOpAndCast(rewriter, op, outType, flooredAbsDivResult, selectOp, /*shift=*/0) .getResult(); return result; } // Function to perform division with trunc rounding mode (rounding result // towards zero) for float type inputs Value truncFloatDiv(PatternRewriter &rewriter, Operation *op, TensorType outType, Value lhs, Value rhs) { rhs = tosa::promoteType(rewriter, rhs, outType); auto rhsRcp = rewriter.create(op->getLoc(), rhs.getType(), rhs); auto divResult = tosa::createMulOpAndCast(rewriter, op, outType, lhs, rhsRcp, /*shift=*/0); return truncFloatDivWithDivResult(rewriter, op, outType, divResult); } // Function to perform division with floor rounding mode (rounding result // down) for integer type inputs. Value floorIntDiv(PatternRewriter &rewriter, Operation *op, TensorType outType, Value lhs, Value rhs) { // To implement floor mode int input, utilize tosa::IntDivOp (trunc div // result) with the following formula elementwise: // floor_val = trunc_val - ((trunc_val * rhs != lhs) // && (sign(lhs) != sign(rhs))) // TOSA IntDiv requires inputs to be i32 auto i32Type = RankedTensorType::get(outType.getShape(), rewriter.getIntegerType(32)); lhs = tosa::promoteType(rewriter, lhs, i32Type); rhs = tosa::promoteType(rewriter, rhs, i32Type); auto intDivOp = rewriter.create(op->getLoc(), i32Type, lhs, rhs); auto zero = tosa::getConstTensor(rewriter, op, 0, {}).value(); auto one = tosa::getConstTensor(rewriter, op, 1, {}).value(); auto boolType = RankedTensorType::get(outType.getShape(), rewriter.getIntegerType(1)); auto lhsMulRhs = rewriter.create(op->getLoc(), i32Type, lhs, rhs, /*shift=*/0); auto lhsRhsDifferentSign = rewriter.create(op->getLoc(), boolType, zero, lhsMulRhs); auto truncMulRhs = rewriter.create(op->getLoc(), i32Type, intDivOp, rhs, /*shift=*/0); auto truncMulRhsEqualLhs = rewriter.create(op->getLoc(), boolType, truncMulRhs, lhs); auto truncMulRhsNotEqualLhs = rewriter.create( op->getLoc(), boolType, truncMulRhsEqualLhs); auto truncMinusOne = rewriter.create(op->getLoc(), i32Type, intDivOp, one); auto cond = rewriter.create( op->getLoc(), boolType, lhsRhsDifferentSign, truncMulRhsNotEqualLhs); auto selectOp = rewriter.create(op->getLoc(), i32Type, cond, truncMinusOne, intDivOp); Value result = tosa::promoteType(rewriter, selectOp, outType); return result; } template class ConvertAtenDivOp : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; using OpAdaptor = typename AtenOpT::Adaptor; LogicalResult matchAndRewrite(AtenOpT op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { Value lhs = adaptor.getSelf(); auto lhsTy = dyn_cast(lhs.getType()); Value rhs = adaptor.getOther(); auto rhsTy = dyn_cast(rhs.getType()); if (!lhsTy) return rewriter.notifyMatchFailure(op, "Only Tensor types supported in TOSA"); auto lhsElemTy = lhsTy.getElementType(); if (!lhsElemTy.isIntOrFloat()) return rewriter.notifyMatchFailure( op, "Only floating-point or integer datatype legalization supported"); Value rhsAsTensor; if (!rhsTy) { if (failed(torchScalarToTosaTensor(rewriter, op, op.getOther(), rhsAsTensor, lhsElemTy, {}))) return rewriter.notifyMatchFailure( op, "Currently only scalar constants are supported for " "conversion in TOSA operation"); } auto rhsTensor = rhsTy ? rhs : rhsAsTensor; auto outType = cast( OpConversionPattern::getTypeConverter()->convertType( op.getType())); // Get rounding mode for aten.div.Tensor_mode std::string roundMode; if constexpr (std::is_same() || std::is_same()) { if (!matchPattern(op.getRoundingMode(), m_TorchConstantStr(roundMode))) return rewriter.notifyMatchFailure( op, "Non-const rounding mode parameter unsupported"); } Value result; if (isa(outType.getElementType())) { // The input to the reciprocal is an integer sometimes, and we may need // to promote it to a floating point. Per TOSA specification, the input // types can only be floating point for tosa::ReciprocalOp. rhsTensor = tosa::promoteType(rewriter, rhsTensor, outType); auto rhsRcp = rewriter.create( op->getLoc(), rhsTensor.getType(), rhsTensor); auto divResult = tosa::createMulOpAndCast(rewriter, op, outType, lhs, rhsRcp, /*shift=*/0); // Round result based on rounding mode if (roundMode.compare("floor") == 0) { // "floor": rounds the results of the division down. Equivalent to // floor division in Python (the // operator). auto floorOp = rewriter.create(op->getLoc(), outType, divResult); result = floorOp.getResult(); } else if (roundMode.compare("trunc") == 0) { // "trunc": rounds the results of the division towards zero. Equivalent // to C-style integer division. result = truncFloatDivWithDivResult(rewriter, op, outType, divResult); } else { // None: No rounding mode result = divResult.getResult(); } } else { if (roundMode.compare("floor") == 0) { // "floor": rounds the results of the division down. Equivalent to floor // division in Python (the // operator). result = floorIntDiv(rewriter, op, outType, lhs, rhsTensor); } else { // "trunc": rounds the results of the division towards zero. Equivalent // to C-style integer division. // None: no rounding mode. // TOSA IntDiv requires inputs to be i32 auto i32Type = RankedTensorType::get(outType.getShape(), rewriter.getIntegerType(32)); lhs = tosa::promoteType(rewriter, lhs, i32Type); rhsTensor = tosa::promoteType(rewriter, rhsTensor, i32Type); auto intDivOp = rewriter.create(op->getLoc(), i32Type, lhs, rhsTensor); result = tosa::promoteType(rewriter, intDivOp, outType); } } rewriter.replaceOp(op, {result}); return success(); } }; // This defines a template to construct ops whose legalizations are // specialized. template class ConvertAtenOp : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; using OpAdaptor = typename AtenOpT::Adaptor; LogicalResult matchAndRewrite(AtenOpT op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override; }; template class ConvertAtenActivationFunctionOp : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; using OpAdaptor = typename AtenOpT::Adaptor; LogicalResult matchAndRewrite(AtenOpT op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { Value self = adaptor.getSelf(); auto selfTy = cast(self.getType()); if (!selfTy) return rewriter.notifyMatchFailure(op, "Only Tensor types supported"); if (!isa(selfTy.getElementType())) return rewriter.notifyMatchFailure( op, "Only floating-point datatype legalization currently supported"); rewriter.replaceOpWithNewOp( op, this->getTypeConverter()->convertType(op.getType()), self); return success(); } }; template <> LogicalResult ConvertAtenOp::matchAndRewrite( AtenReluOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { Value self = adaptor.getSelf(); auto selfTy = cast(self.getType()); // Maps to tosa.clamp which has both int and fp limits. int64_t clampMin = 0; Value clampIn = self; if (!selfTy) { return rewriter.notifyMatchFailure(op, "Only Tensor types supported in TOSA"); } // Rescale the clampIn for quantized types. TBD if (!isa(selfTy.getElementType())) { return rewriter.notifyMatchFailure( op, "Only floating-point datatype legalization currently supported"); } rewriter.replaceOpWithNewOp( op, getTypeConverter()->convertType(op.getType()), clampIn, rewriter.getI64IntegerAttr(clampMin), rewriter.getI64IntegerAttr(std::numeric_limits::max()), rewriter.getF32FloatAttr(0.0f), rewriter.getF32FloatAttr(std::numeric_limits::max())); return success(); } template <> LogicalResult ConvertAtenOp::matchAndRewrite( AtenLeakyReluOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { Value self = adaptor.getSelf(); auto selfTy = cast(self.getType()); if (!isa(selfTy.getElementType())) { return rewriter.notifyMatchFailure( op, "Only floating-point datatype legalization currently supported"); } Value alphaScalar = op.getNegativeSlope(); Value alphaTensor; if (failed(torchScalarToTosaTensor(rewriter, op.getOperation(), alphaScalar, alphaTensor, selfTy.getElementType(), {}))) return rewriter.notifyMatchFailure( op, "Negative slope needs to be a scalar constant for conversion to " "TOSA LeakyReLU operation"); auto zero = tosa::getConstTensor(rewriter, op, 0, {}, selfTy.getElementType()) .value(); auto cond = rewriter.create( op->getLoc(), RankedTensorType::get(selfTy.getShape(), rewriter.getIntegerType(1)), self, zero); auto mulTensor = rewriter.create( op->getLoc(), getTypeConverter()->convertType(op.getType()), self, alphaTensor, /*shift=*/0); rewriter.replaceOpWithNewOp( op, getTypeConverter()->convertType(op.getType()), cond, self, mulTensor); return success(); } using ReductionConvFunc = std::optional (*)(PatternRewriter &, Operation *, RankedTensorType, Value, ElementsAttr, bool); // They all constitute a common form invoking the appropriate // converion function in TosaLegalizeCommon.cpp template class ConvertAtenReductionOp : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; using OpAdaptor = typename AtenOpT::Adaptor; // Each variant must implement corresponding parameter parsing options virtual LogicalResult readReduceDimsAndKeepDims( AtenOpT op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, ElementsAttr &reduceDimsAttr, bool &keepDims) const { return rewriter.notifyMatchFailure( op, "Unimplemented reduce_dims and keep_dims parsing function"); } // Common rewriter for all reduction ops, calls the specific implementation of // readReduceDimsAndKeepDims() needed for the op variant. LogicalResult matchAndRewrite(AtenOpT op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { Value self = adaptor.getSelf(); auto selfTy = cast(self.getType()); if (!selfTy) return rewriter.notifyMatchFailure(op, "Only Tensor types supported in TOSA"); auto outputTy = cast( OpConversionPattern::getTypeConverter()->convertType( op.getType())); if (!outputTy) return rewriter.notifyMatchFailure( op, "Only ranked tensor type outputs permitted for reduce_mean"); auto selfElemTy = selfTy.getElementType(); if (!selfElemTy.isIntOrFloat()) return rewriter.notifyMatchFailure( op, "Only floating-point or integer datatype legalization supported"); // TOSA ReduceAll and ReduceAny ops only accept bool input if constexpr (std::is_same() || std::is_same() || std::is_same() || std::is_same()) { self = tosa::promoteType( rewriter, self, RankedTensorType::get(selfTy.getShape(), rewriter.getIntegerType(1))); } // Handle dtype output and bool elem type for ReduceSum and ReduceProd ops if constexpr (std::is_same() || std::is_same() || std::is_same() || std::is_same()) { auto dtype = op.getDtype(); int64_t dtypeInt; if (!isa(dtype.getType())) { if (!matchPattern(dtype, m_TorchConstantInt(&dtypeInt))) return rewriter.notifyMatchFailure(op, "dtype is not a constant int"); FailureOr maybeDtypeType = getTypeForScalarType( op.getContext(), (torch_upstream::ScalarType)dtypeInt); if (failed(maybeDtypeType)) { return rewriter.notifyMatchFailure(op, "dtype is undefined"); } else { Type dtypeType = maybeDtypeType.value(); if (isa(dtypeType)) dtypeType = rewriter.getIntegerType(dtypeType.getIntOrFloatBitWidth()); self = tosa::promoteType( rewriter, self, RankedTensorType::get(selfTy.getShape(), dtypeType)); } } else { if (selfElemTy.isInteger(1)) self = tosa::promoteType(rewriter, self, outputTy); } } ElementsAttr reduceDimsAttr; bool keepDims; if (failed(readReduceDimsAndKeepDims(op, adaptor, rewriter, reduceDimsAttr, keepDims))) return failure(); std::optional result = ConversionFuncT(rewriter, op, outputTy, self, reduceDimsAttr, keepDims); if (!result) return failure(); rewriter.replaceOp(op, {result.value()}); return success(); } }; // This reduction op legalization template handles op variants that have // explicit reduce_dims dimensions (provided as a list) and keep_dims // parameters. template class ConvertAtenMultipleDimsReductionOp : public ConvertAtenReductionOp { using ConvertAtenReductionOp::ConvertAtenReductionOp; using OpAdaptor = typename AtenOpT::Adaptor; LogicalResult readReduceDimsAndKeepDims(AtenOpT op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, ElementsAttr &reduceDimsAttr, bool &keepDims) const override { int64_t inputRank = cast(adaptor.getSelf().getType()).getRank(); SmallVector reduceDims; // If dim list is none, all dimensions are reduced if (!matchPattern(op.getDim(), m_TorchListOfConstantInts(reduceDims))) { for (int64_t i = 0; i < inputRank; i++) reduceDims.push_back(i); } int64_t N = reduceDims.size(); for (unsigned i = 0; i < N; i++) { reduceDims[i] = toPositiveDim(reduceDims[i], inputRank); if (!isValidDim(reduceDims[i], inputRank)) return rewriter.notifyMatchFailure(op, "reduce dim is statically invalid"); } auto reduceDimsType = RankedTensorType::get({N}, rewriter.getI64Type()); reduceDimsAttr = DenseIntElementsAttr::get(reduceDimsType, llvm::ArrayRef(reduceDims)); keepDims = false; if (!matchPattern(op.getKeepdim(), m_TorchConstantBool(&keepDims))) return rewriter.notifyMatchFailure( op, "non-const keepdim parameter unsupported"); return success(); } }; // This reduction op legalization template handles op variants that reduce in // only one explicit dim which is provided as a number (rather than a list), and // a keep_dims parameter. template class ConvertAtenOneDimReductionOp : public ConvertAtenReductionOp { using ConvertAtenReductionOp::ConvertAtenReductionOp; using OpAdaptor = typename AtenOpT::Adaptor; LogicalResult readReduceDimsAndKeepDims(AtenOpT op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, ElementsAttr &reduceDimsAttr, bool &keepDims) const override { int64_t reduceDim; if (!matchPattern(op.getDim(), m_TorchConstantInt(&reduceDim))) return rewriter.notifyMatchFailure(op, "non-const dim parameter unsupported"); int64_t inputRank = cast(adaptor.getSelf().getType()).getRank(); reduceDim = toPositiveDim(reduceDim, inputRank); if (!isValidDim(reduceDim, inputRank)) return rewriter.notifyMatchFailure(op, "dim is statically invalid"); auto reduceDimsType = RankedTensorType::get({1}, rewriter.getI64Type()); reduceDimsAttr = DenseIntElementsAttr::get(reduceDimsType, llvm::ArrayRef({reduceDim})); keepDims = false; if (!matchPattern(op.getKeepdim(), m_TorchConstantBool(&keepDims))) return rewriter.notifyMatchFailure( op, "non-const keepdim parameter unsupported"); return success(); } }; // This reduction op legalization template handles op variants that reduce all // dims does not keep dims. template class ConvertAtenAllDimsReductionOp : public ConvertAtenReductionOp { public: using ConvertAtenReductionOp::ConvertAtenReductionOp; using OpAdaptor = typename AtenOpT::Adaptor; LogicalResult readReduceDimsAndKeepDims(AtenOpT op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, ElementsAttr &reduceDimsAttr, bool &keepDims) const override { auto self = adaptor.getSelf(); auto selfTy = cast(self.getType()); // Select all dims to reduce SmallVector reduceDims; for (int64_t i = 0; i < selfTy.getRank(); i++) reduceDims.push_back(i); int64_t N = selfTy.getRank(); auto reduceDimsType = RankedTensorType::get({N}, rewriter.getI64Type()); reduceDimsAttr = DenseIntElementsAttr::get(reduceDimsType, llvm::ArrayRef(reduceDims)); keepDims = false; return success(); } }; template <> LogicalResult ConvertAtenOp::matchAndRewrite( AtenArgmaxOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { Value self = adaptor.getSelf(); auto selfTy = cast(self.getType()); if (!selfTy) return rewriter.notifyMatchFailure( op, "Only ranked tensor types supported in TOSA argmax"); int64_t reduceDim; if (!matchPattern(op.getDim(), m_TorchConstantInt(&reduceDim))) { // NoneType indicates reduce on all dims reduceDim = -1; } else { int64_t inputRank = selfTy.getRank(); reduceDim = toPositiveDim(reduceDim, inputRank); if (!isValidDim(reduceDim, inputRank)) return rewriter.notifyMatchFailure(op, "reduce dim is statically invalid"); } bool keepDim = false; if (!matchPattern(op.getKeepdim(), m_TorchConstantBool(&keepDim))) return rewriter.notifyMatchFailure( op, "non-const keepdim parameter unsupported"); auto resultTy = cast( getTypeConverter()->convertType(op.getResult().getType())); auto outputETy = resultTy.getElementType(); // Create a single instance of tosa.argmax. // Multiple dims require chained construct. auto buildArgmax = [&](int64_t reduceDim, Value input) -> Value { auto inputTy = cast(input.getType()); auto inputShape = makeShapeTorchCompatible(inputTy.getShape()); SmallVector outputShapeArr = {}; int32_t i = 0; for (auto &dim : inputShape) { if (i++ != reduceDim) { outputShapeArr.push_back(dim); } else { if (keepDim) outputShapeArr.push_back(1); } } // Tosa argmax output is i32, while Torch backend mandates i64. auto outputReduceTy = RankedTensorType::get( makeShapeLLVMCompatible(ArrayRef(outputShapeArr)), rewriter.getI32Type()); auto reduceDimAttr = rewriter.getIntegerAttr(rewriter.getI64Type(), reduceDim); return rewriter .create(op->getLoc(), getTypeConverter()->convertType(outputReduceTy), input, reduceDimAttr) .getResult(); }; // Convert the final index to i64 for backend finalization, However, i64 // is not a defined type for tosa.cast, so using arith.extsi instead. auto castToInt64 = [&](Value result) -> LogicalResult { auto resTy = cast(result.getType()); if (!resTy) return rewriter.notifyMatchFailure(op, "Argmax: Result is not a shaped type"); auto resShape = makeShapeTorchCompatible(resTy.getShape()); auto outTy = RankedTensorType::get(makeShapeLLVMCompatible(resShape), outputETy); rewriter.replaceOpWithNewOp( op, getTypeConverter()->convertType(outTy), result); return success(); }; if (reduceDim == -1) { // reducing on all dims Value input = self; for (int dim = 0; dim < selfTy.getRank(); dim++) { // progressively reduce each 0-th dim input = buildArgmax(0, input); } return castToInt64(input); } else { return castToInt64(buildArgmax(reduceDim, self)); } return success(); } template class ConvertAtenSqueezeOp : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; using OpAdaptor = typename AtenOpT::Adaptor; // Each variant must implement corresponding parameter parsing options virtual LogicalResult generateSqueezedShape(AtenOpT op, RankedTensorType selfTy, ConversionPatternRewriter &rewriter, SmallVector &squeezedShape) const { return rewriter.notifyMatchFailure( op, "Unimplemented dim/dim-list parsing function"); } // Common rewriter for all squeeze ops, calls the specific implementation of // generateSqueezedShape() needed for the op variant. LogicalResult matchAndRewrite(AtenOpT op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { Value self = adaptor.getSelf(); auto selfTy = cast(self.getType()); if (!selfTy) return rewriter.notifyMatchFailure( op, "Only ranked tensor types supported in TOSA argmax"); SmallVector newOutputShape; if (failed(generateSqueezedShape(op, selfTy, rewriter, newOutputShape))) return rewriter.notifyMatchFailure(op, "Squeeze could not compute new shape"); auto resultTy = cast( OpConversionPattern::getTypeConverter()->convertType( op.getResult().getType())); auto resultElemTy = resultTy.getElementType(); auto newOutputTy = RankedTensorType::get( makeShapeLLVMCompatible(newOutputShape), resultElemTy); auto reshapeOp = rewriter.create( op->getLoc(), OpConversionPattern::getTypeConverter()->convertType( newOutputTy), self, rewriter.getDenseI64ArrayAttr(newOutputShape)); rewriter.replaceOpWithNewOp( op, OpConversionPattern::getTypeConverter()->convertType( newOutputTy), reshapeOp); return success(); } }; template class ConvertAtenSqueezeOneDimOp : public ConvertAtenSqueezeOp { using ConvertAtenSqueezeOp::ConvertAtenSqueezeOp; using OpAdaptor = typename AtenOpT::Adaptor; LogicalResult generateSqueezedShape(AtenOpT op, RankedTensorType selfTy, ConversionPatternRewriter &rewriter, SmallVector &squeezedShape) const override { int64_t squeezeDim; if (!matchPattern(op.getDim(), m_TorchConstantInt(&squeezeDim))) return rewriter.notifyMatchFailure(op, "non-const dim parameter unsupported"); // Handle negative dim if (squeezeDim < 0) squeezeDim = squeezeDim + selfTy.getRank(); auto selfShape = makeShapeTorchCompatible(selfTy.getShape()); // Only dims statically known to have size=1 are reduced. // Dynamic dims are treated as unknowns and will not be squeezed // even if dim parameter says it should be. uint32_t dimNum = 0; for (auto &dim : selfShape) { if (dim != 1 || squeezeDim != dimNum) squeezedShape.push_back(dim); dimNum++; } return success(); } }; template class ConvertAtenSqueezeAllDimsOp : public ConvertAtenSqueezeOp { using ConvertAtenSqueezeOp::ConvertAtenSqueezeOp; using OpAdaptor = typename AtenOpT::Adaptor; LogicalResult generateSqueezedShape(AtenOpT op, RankedTensorType selfTy, ConversionPatternRewriter &rewriter, SmallVector &squeezedShape) const override { auto selfShape = makeShapeTorchCompatible(selfTy.getShape()); // Dims that may dynamically resolve to 1 are not reduced here. Only // compile-time resolvable dims are handled here. for (auto &dim : selfShape) { if (dim != 1) squeezedShape.push_back(dim); } return success(); } }; template class ConvertAtenPowOp : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; using OpAdaptor = typename AtenOpT::Adaptor; LogicalResult matchAndRewrite(AtenOpT op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { auto outType = cast(this->getTypeConverter()->convertType(op.getType())); Value selfTensor; if constexpr (std::is_same()) { Value selfScalar = op.getSelf(); if (failed(torchScalarToTosaTensor(rewriter, op, selfScalar, selfTensor, outType.getElementType(), {}))) return rewriter.notifyMatchFailure( op, "Currently only scalar constants are supported for " "conversion in TOSA PowScalar operation"); } else { selfTensor = adaptor.getSelf(); auto selfTy = cast(selfTensor.getType()); if (!selfTy) return rewriter.notifyMatchFailure( op, "Only ranked tensor types supported in TOSA Pow"); if (!isa(selfTy.getElementType())) return rewriter.notifyMatchFailure( op, "Only floating-point datatype legalization supported"); } Value expTensor; if constexpr (std::is_same()) { Value expScalar = op.getExponent(); if (failed(torchScalarToTosaTensor(rewriter, op, expScalar, expTensor, outType.getElementType(), {}))) return rewriter.notifyMatchFailure( op, "Currently only scalar constants are supported for " "conversion in TOSA Pow operation"); } else { expTensor = adaptor.getExponent(); auto expTy = cast(expTensor.getType()); if (!expTy) return rewriter.notifyMatchFailure( op, "Only ranked tensor types supported in TOSA Pow"); } auto powOp = tosa::createBinaryOpAndCast( rewriter, op, outType, selfTensor, expTensor); rewriter.replaceOp(op, powOp.getResult()); return success(); } }; // Perform the basic n-dim matmul operation encompassing the handling of // broadcasting and dynamic shape propagation. // All PyTorch ops that leverage matrix multiplication will derive this and // implement their specialized input processing (e.g transpose), and output // processing, e.g. GEMM or fully connected bias handling. template class ConvertAtenMatmulBaseOp : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; using OpAdaptor = typename AtenOpT::Adaptor; // Each variant must implement corresponding parameter parsing options. // Maintain separate input read functions for each variant because it is not // necessarily true with all variants that the first two operands are the lhs // and rhs. virtual LogicalResult readMatMulInputs(AtenOpT op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Value &lhs, Value &rhs) const { return rewriter.notifyMatchFailure( op, "Unimplemented matrix multiplication variant input parsing function"); } LogicalResult performMatmul(AtenOpT op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Value &lhs, Value &rhs, Value &output) const { auto lhsTy = cast(lhs.getType()); auto rhsTy = cast(rhs.getType()); auto lhsRank = lhsTy.getRank(); auto rhsRank = rhsTy.getRank(); auto lhsShape = makeShapeTorchCompatible(lhsTy.getShape()); auto rhsShape = makeShapeTorchCompatible(rhsTy.getShape()); auto lhsElemTy = lhsTy.getElementType(); auto rhsElemTy = rhsTy.getElementType(); if (lhsElemTy != rhsElemTy) return rewriter.notifyMatchFailure(op, "Matmul: input datatypes mismatched"); // Legalization constructs may offer input shapes but expect output shapes // to be inferred, e.g. // func @forward(%arg0: !torch.vtensor<[14,19],f32>, // %arg1: !torch.vtensor<[19,28],f32>) -> // !torch.vtensor<[?,?],f32> // This is tricky with matmul, since TOSA matmul is on 3D inputs. // This means the need to reshape potentially both inputs and outputs, // and reshape to unknown shape is undefined. auto maxInputRank = lhsRank > rhsRank ? lhsRank : rhsRank; // If performing dot product on vectors, the RHS is synthetically transposed if (maxInputRank == 1) maxInputRank++; // Obtaining the rank broadcasted shapes of tensors makes it easier to // construct the input and output reshaping logic. auto getRankBroadcastedShape = [&](Value tensor, bool isRHS) -> SmallVector { auto tensorTy = cast(tensor.getType()); auto tensorShape = makeShapeTorchCompatible(tensorTy.getShape()); auto tensorRank = tensorTy.getRank(); SmallVector bcastedShape; auto bcastDims = maxInputRank - tensorRank; if (isRHS && (tensorRank == 1) && bcastDims) { // RHS with rank1 is special. It be synthetically transposed to dim[:-2] for (int32_t i = 0; i < bcastDims - 1; i++) bcastedShape.push_back(1); bcastedShape.push_back(tensorShape[0]); bcastedShape.push_back(1); } else { if (bcastDims > 0) { // rank broadcast for (uint32_t i = 0; i < bcastDims; i++) bcastedShape.push_back(1); } for (auto &dim : tensorShape) bcastedShape.push_back(dim); } return bcastedShape; }; // Step: Rank broadcast the two inputs. auto lhsBroadcastedShape = getRankBroadcastedShape(lhs, false); auto lhsBroadcastedTy = RankedTensorType::get( makeShapeLLVMCompatible(lhsBroadcastedShape), lhsElemTy); auto rhsBroadcastedShape = getRankBroadcastedShape(rhs, true); auto rhsBroadcastedTy = RankedTensorType::get( makeShapeLLVMCompatible(rhsBroadcastedShape), rhsElemTy); auto rankBroadcastedLhs = lhsRank == maxInputRank ? lhs : rewriter.create( op->getLoc(), OpConversionPattern::getTypeConverter()->convertType( lhsBroadcastedTy), lhs, rewriter.getDenseI64ArrayAttr(lhsBroadcastedShape)); auto rankBroadcastedRhs = rhsRank == maxInputRank ? rhs : rewriter.create( op->getLoc(), OpConversionPattern::getTypeConverter()->convertType( rhsBroadcastedTy), rhs, rewriter.getDenseI64ArrayAttr(rhsBroadcastedShape)); // TOSA matmul is performed on two 3D inputs and generates a 3D output. // Lower ranked tensors are dim-1 reshaped up to 3D auto reshapeUpTo3DTensor = [&](Value tensor) -> Value { auto tensorTy = cast(tensor.getType()); auto rank = tensorTy.getRank(); assert(rank <= 3 && "reshapeUpTo3D tensor must receive rank <= 3"); if (rank == 3) return tensor; auto shape = makeShapeTorchCompatible(tensorTy.getShape()); SmallVector newShape({1, 1, 1}); if (rank == 2) { // batchsize = 1 newShape[1] = shape[0]; newShape[2] = shape[1]; } else { // rank 1 newShape[2] = shape[0]; } auto newType = RankedTensorType::get(makeShapeLLVMCompatible(newShape), tensorTy.getElementType()); return rewriter.create( op->getLoc(), OpConversionPattern::getTypeConverter()->convertType( newType), tensor, rewriter.getDenseI64ArrayAttr(newShape)); }; // Where broadcasting is required in one or more batch dims, the following // is done. // Where all batch dims are involved in broadcasting: // Given A: 3x1x5x6 and B: 1x4x6x7 // 1. Reshape A to 1x15x6 (squeeze all batchdims into dim1) // 2. Transpose B to 6x1x4x7, Reshape to 1x6x28 // 3. tosa.Matmul 1x15x6 1x6x28 = 1x15x28 // 4. Reshape out to 3x5x4x7, Transpose to 3x4x5x7 // Where there are batch dimensions that are broadcast and not, the // treatment is to have dim0 correspond to product of all non-broadcast // dimsizes: // Given A: 4x8x16x32 B: 8x32x17 // 1. Reshape A to 8x64x32 (squeeze all unbroadcasted dims into dim0, // broadcasted dims into dim1) // 2. No transpose or reshape of B as its batchdims are not broadcast to. // 3. tosa.Matmul 8x64x32 8x32x17 = 8x64x17 // 4. Reshape to 8x4x16x17, Transpose to 4x8x16x17 // Check if we need to perform the broadcast on batch dim // Not needed if max rank < 3, or if maxrank == 3 and dim[0] matches auto needsBatchDimBroadcast = [&]() -> bool { if (maxInputRank < 3) { return false; } else { if (maxInputRank == 3 && lhsBroadcastedShape[0] == rhsBroadcastedShape[0]) { return false; } return true; } }; auto performBatchDimBroadcast = needsBatchDimBroadcast(); // Inputs to the tosa.matmul Value matmulLhs, matmulRhs; using TensorShape_t = struct { int64_t dim; int64_t shape; }; // Transpose needs to done if transposeDims are not non-monotonically // increasing. E.g. [0, 1, 2, 3]: No transpose [1, 0, 2, 3]: Transpose dim0 // and dim1 The order need not be sequential, since one or more dims may // have been removed due to broadcasting. auto isTransposeRequired = [](SmallVector transposedDims) -> bool { int32_t lastDim = -1; for (auto &dim : transposedDims) { if (lastDim > dim) return true; lastDim = dim; } return false; }; SmallVector batchElems, lhsSqueezedElems, rhsSqueezedElems; if (!performBatchDimBroadcast) { // Simple with no broadcasting artifacts. Just reshape up to 3D matmulLhs = reshapeUpTo3DTensor(rankBroadcastedLhs); matmulRhs = reshapeUpTo3DTensor(rankBroadcastedRhs); } else { // In this case, either or both input matrices involve broadcasting on // their batch dimensions. For example: // 4x5x6, 1x6x7 -> 4x5x7 // 4x1x5x6, 1x3x6x7 -> 4x3x5x7 // Though maxInputRank is necessarily >=3 here, individual matrices may be // lower rank. // E.g. 3x4x5x6, 6 -> 3x4x5 // These are the accumulated products of the shape of each dim: // 1. common dimensions: upper dimensions (dims other than two rightmost) // whose shapes are the same for both LHS and RHS. // 2. LHS squeezed dimensions: all dimensions of LHS that involve // broadcasting in either direction, plus the LHS[-2] shape // 3. RHS squeezed dimensions: all dimensions of RHS that involve // broadcasting in either direction, plus the RHS[-1] shape int64_t commonValue = 1, lhsSqueezedValue = 1, rhsSqueezedValue = 1; // For both LHS and RHS, the dimensions are separated into the common, // squeezed and remaining dim. E.g. given // LHS = 3x4x5x6 // RHS = 1x4x6x7 // common = {{dim=1, shape=4}} // lhs squeezed = {{dim=0, shape=3}, // {dim=2, shape=5}} // rhs squeezed = {{dim=0, shape=1}, // {dim=2, shape=7}} // The matmul dim is LHS[-1] and RHS[-2], i.e. 6. // Once this is obtained, LHS and RHS are expressed as: // LHS = {common, lhs_squeezed, matmul_dim} // RHS = {common, matmul_dim, rhs_squeezed} // The matmul is then performed to obtain output: // matmul_out = {common, lhs_squeezed, rhs_squeezed} // Finally, we reshape to 'unsqueeze' the LHS and RHS parts and transpose // them back to their correct positions. SmallVector transposedLhsShape; SmallVector transposedLhsDims; // Step: generate the common dim/shape information bool hasDynamicDims = false; for (uint32_t dim = 0; dim < maxInputRank - 2; dim++) { bool isDynamicDim = ShapedType::isDynamic(lhsBroadcastedShape[dim]); hasDynamicDims |= isDynamicDim; if (isDynamicDim || lhsBroadcastedShape[dim] == rhsBroadcastedShape[dim]) { commonValue *= lhsBroadcastedShape[dim]; batchElems.push_back({dim, lhsBroadcastedShape[dim]}); } } commonValue = commonValue < 0 ? kUnknownSize : commonValue; // TODO: Handle the case when there are dynamic batch dimensions. if (hasDynamicDims) commonValue = kUnknownSize; // Step: generate the LHS squeezed dim/shape information. for (uint32_t dim = 0; dim < maxInputRank - 2; dim++) { bool isDynamicDim = ShapedType::isDynamic(lhsBroadcastedShape[dim]); if (!isDynamicDim && lhsBroadcastedShape[dim] != rhsBroadcastedShape[dim]) { lhsSqueezedValue *= lhsBroadcastedShape[dim]; lhsSqueezedElems.push_back({dim, lhsBroadcastedShape[dim]}); } } // including LHS[-2] lhsSqueezedElems.push_back( {maxInputRank - 2, lhsBroadcastedShape[maxInputRank - 2]}); lhsSqueezedValue *= lhsBroadcastedShape[maxInputRank - 2]; lhsSqueezedValue = lhsSqueezedValue < 0 ? kUnknownSize : lhsSqueezedValue; // Step: Create the tosa.transpose array. If this array has a // non-monotonic series of dims, perform transpose. // First the common_elems for (uint32_t i = 0; i < batchElems.size(); i++) { transposedLhsShape.push_back(batchElems[i].shape); transposedLhsDims.push_back(batchElems[i].dim); } // then the lhs_squeezed elems for (uint32_t i = 0; i < lhsSqueezedElems.size(); i++) { transposedLhsShape.push_back(lhsSqueezedElems[i].shape); transposedLhsDims.push_back(lhsSqueezedElems[i].dim); } // then the final dim transposedLhsDims.push_back(maxInputRank - 1); transposedLhsShape.push_back(lhsBroadcastedShape[maxInputRank - 1]); bool lhsNeedsTranspose = isTransposeRequired(transposedLhsDims); auto lhsReshapeInput = rankBroadcastedLhs; if (lhsNeedsTranspose) { auto transposedLhsType = RankedTensorType::get( makeShapeLLVMCompatible(transposedLhsShape), rhsElemTy); std::optional transposedLhsDimsConst = tosa::getConstTensor( rewriter, op, /*vec=*/transposedLhsDims, /*shape=*/{static_cast(transposedLhsDims.size())}); lhsReshapeInput = rewriter .create( op->getLoc(), OpConversionPattern::getTypeConverter() ->convertType(transposedLhsType), rankBroadcastedLhs, transposedLhsDimsConst.value()) .getResult(); } // LHS = {common, lhs_squeezed, matmul_dim} SmallVector newLhsShape( {1, 1, lhsBroadcastedShape[maxInputRank - 1]}); newLhsShape[0] = commonValue; newLhsShape[1] = hasDynamicDims ? kUnknownSize : lhsSqueezedValue; auto newLhsType = RankedTensorType::get( makeShapeLLVMCompatible(newLhsShape), lhsElemTy); matmulLhs = rewriter.create( op->getLoc(), OpConversionPattern::getTypeConverter()->convertType( newLhsType), lhsReshapeInput, rewriter.getDenseI64ArrayAttr(newLhsShape)); SmallVector transposedRhsShape; SmallVector transposedRhsDims; // Step: Create the RHS transpose sequence // RHS = {common, matmul_dim, rhs_squeezed} // first the common_dims for (uint32_t i = 0; i < batchElems.size(); i++) { transposedRhsShape.push_back(batchElems[i].shape); transposedRhsDims.push_back(batchElems[i].dim); } // The matmul_dim of RHS transposedRhsDims.push_back(maxInputRank - 2); transposedRhsShape.push_back(rhsBroadcastedShape[maxInputRank - 2]); // finally all the rhs_squeeze dims hasDynamicDims = false; for (uint32_t dim = 0; dim < maxInputRank - 2; dim++) { bool isDynamicDim = ShapedType::isDynamic(rhsBroadcastedShape[dim]); hasDynamicDims |= isDynamicDim; if (!isDynamicDim && rhsBroadcastedShape[dim] != lhsBroadcastedShape[dim]) { rhsSqueezedElems.push_back({dim, rhsBroadcastedShape[dim]}); rhsSqueezedValue *= rhsBroadcastedShape[dim]; } } rhsSqueezedElems.push_back( {maxInputRank - 1, rhsBroadcastedShape[maxInputRank - 1]}); rhsSqueezedValue *= rhsBroadcastedShape[maxInputRank - 1]; for (uint32_t i = 0; i < rhsSqueezedElems.size(); i++) { transposedRhsShape.push_back(rhsSqueezedElems[i].shape); transposedRhsDims.push_back(rhsSqueezedElems[i].dim); } auto transposedRhsType = RankedTensorType::get( makeShapeLLVMCompatible(transposedRhsShape), rhsElemTy); if (hasDynamicDims) rhsSqueezedValue = kUnknownSize; SmallVector newRhsShape( {commonValue < 0 ? kUnknownSize : commonValue, rhsBroadcastedShape[maxInputRank - 2], rhsSqueezedValue}); auto newRhsType = RankedTensorType::get( makeShapeLLVMCompatible(newRhsShape), rhsElemTy); bool rhsNeedsTranspose = isTransposeRequired(transposedRhsDims); auto transposedRhsValue = rankBroadcastedRhs; if (rhsNeedsTranspose) { std::optional transposedRhsDimsConst = tosa::getConstTensor( rewriter, op, /*vec=*/transposedRhsDims, /*shape=*/{static_cast(transposedRhsDims.size())}); transposedRhsValue = rewriter .create( op->getLoc(), OpConversionPattern::getTypeConverter() ->convertType(transposedRhsType), rankBroadcastedRhs, transposedRhsDimsConst.value()) .getResult(); } // reshape matmulRhs = rewriter.create( op->getLoc(), OpConversionPattern::getTypeConverter()->convertType( newRhsType), transposedRhsValue, rewriter.getDenseI64ArrayAttr(newRhsShape)); } auto matmulLhsShape = makeShapeTorchCompatible( cast(matmulLhs.getType()).getShape()); auto matmulRhsShape = makeShapeTorchCompatible( cast(matmulRhs.getType()).getShape()); // The reshape/transpose should ensure the tosa.matmul always has same // batch size for either matrix. If if shapes are dynamic, they'll be // appropriately handled. assert(matmulLhsShape[0] == matmulRhsShape[0] && "tosa.matmul needs same batchsize on LHS and RHS"); SmallVector matmulOutputShape( {matmulLhsShape[0], matmulLhsShape[1], matmulRhsShape[2]}); Type outputElemTy; if (isa(lhsElemTy)) { outputElemTy = lhsElemTy; } else { // qint8 emits i32 matmul output outputElemTy = rewriter.getIntegerType(32); } auto mmOutputTy = RankedTensorType::get( makeShapeLLVMCompatible(matmulOutputShape), outputElemTy); auto mmOpResult = rewriter .create( op->getLoc(), OpConversionPattern::getTypeConverter()->convertType( mmOutputTy), matmulLhs, matmulRhs) .getResult(); // Perform the reshape to output shape. This is always required unless max // input rank=3 and there was no broadcasting, in which case the tosa.matmul // output itself is correctly shaped. bool performOpReshape = !(maxInputRank == 3 && !performBatchDimBroadcast); if (performOpReshape) { // Since the output shape may be unknown, we construct it // independently and reshape. Otherwise reshape may be expressed for // an unknown to-be-inferred output shape. The final tensor.cast // reshapes the known shape to the desired output shape. auto computeOpShape = [&](SmallVector &reshapedOpShape, SmallVector &transposedOpDims, SmallVector &transposedOpShapes) { if (maxInputRank == 1) return; if (maxInputRank == 2) { if (lhsRank == 2) reshapedOpShape.push_back(lhsShape[0]); if (rhsRank == 2) reshapedOpShape.push_back(rhsShape[1]); return; } // Step: Construct the output transpose/reshape information // First the common_dims for (uint32_t i = 0; i < batchElems.size(); i++) { reshapedOpShape.push_back(batchElems[i].shape); transposedOpDims.push_back(batchElems[i].dim); } // Then the LHS squeezed dims for (uint32_t i = 0; i < lhsSqueezedElems.size() - 1; i++) { // Only dims that don't broadcast - broadcasting ones come from the // other input. if (lhsSqueezedElems[i].shape != 1) { reshapedOpShape.push_back(lhsSqueezedElems[i].shape); transposedOpDims.push_back(lhsSqueezedElems[i].dim); } } // The last squeezed dim is lhs[-2] which needs to be // checked separately for broadcasting if (lhsRank > 1) { reshapedOpShape.push_back(lhsBroadcastedShape[maxInputRank - 2]); transposedOpDims.push_back(maxInputRank - 2); } // then the RHS squeezed dims except rhs[-1] which is handled like // lhs[-2] for (uint32_t i = 0; i < rhsSqueezedElems.size() - 1; i++) { if (rhsSqueezedElems[i].shape != 1) { reshapedOpShape.push_back(rhsSqueezedElems[i].shape); transposedOpDims.push_back(rhsSqueezedElems[i].dim); } } // rhs[-1] if (rhsRank > 1) { reshapedOpShape.push_back(rhsBroadcastedShape[maxInputRank - 1]); transposedOpDims.push_back(maxInputRank - 1); } // The transposition order is the inverse of what we actually want, // inversing should fix this: llvm::SmallVector inverseTransposeDims(transposedOpDims.size()); for (int i = 0, s = transposedOpDims.size(); i < s; ++i) inverseTransposeDims[transposedOpDims[i]] = i; transposedOpDims = inverseTransposeDims; // Final transposed output shape construction for (uint32_t i = 0; i < maxInputRank - 2; i++) { if (lhsBroadcastedTy.isDynamicDim(i)) { transposedOpShapes.push_back(kUnknownSize); } else { if (lhsBroadcastedShape[i] == rhsBroadcastedShape[i]) { transposedOpShapes.push_back(lhsBroadcastedShape[i]); } else { transposedOpShapes.push_back(lhsBroadcastedShape[i] == 1 ? rhsBroadcastedShape[i] : lhsBroadcastedShape[i]); } } } if (lhsRank > 1) transposedOpShapes.push_back(lhsBroadcastedShape[maxInputRank - 2]); if (rhsRank > 1) transposedOpShapes.push_back(rhsBroadcastedShape[maxInputRank - 1]); return; }; SmallVector reshapedOpShape, transposedOpShape; SmallVector transposedOpDims; computeOpShape(reshapedOpShape, transposedOpDims, transposedOpShape); bool opNeedsTranspose = isTransposeRequired(transposedOpDims); // Perform reshape auto reshapedOpType = RankedTensorType::get( makeShapeLLVMCompatible(reshapedOpShape), outputElemTy); auto reshapedOp = rewriter.create( op->getLoc(), OpConversionPattern::getTypeConverter()->convertType( reshapedOpType), mmOpResult, rewriter.getDenseI64ArrayAttr(reshapedOpShape)); if (opNeedsTranspose) { std::optional transposedOpShapeConst = tosa::getConstTensor( rewriter, op, /*vec=*/transposedOpDims, /*shape=*/{static_cast(transposedOpDims.size())}); auto transposedOpType = RankedTensorType::get( makeShapeLLVMCompatible(transposedOpShape), outputElemTy); output = rewriter .create( op->getLoc(), OpConversionPattern::getTypeConverter() ->convertType(transposedOpType), reshapedOp.getResult(), transposedOpShapeConst.value()) .getResult(); } else { output = reshapedOp.getResult(); } } else { output = mmOpResult; } return success(); } // The default version just reads two inputs, computes output and returns it. // Other versions may add a bias, apply GEMM-style alpha/beta scaling etc. virtual LogicalResult matchAndRewrite(AtenOpT op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { Value lhs, rhs; if (failed(readMatMulInputs(op, adaptor, rewriter, lhs, rhs))) return rewriter.notifyMatchFailure(op, "Failed to read matmul inputs"); Value output; if (failed(performMatmul(op, adaptor, rewriter, lhs, rhs, output))) return rewriter.notifyMatchFailure(op, "Failed to perform matmul operation"); rewriter.replaceOpWithNewOp( op, cast( OpConversionPattern::getTypeConverter()->convertType( op.getType())), output); return success(); } }; // Legalizes the torch.matmul op for general n-dim matmul. template class ConvertAtenMatMulOp : public ConvertAtenMatmulBaseOp { public: using ConvertAtenMatmulBaseOp::ConvertAtenMatmulBaseOp; using OpAdaptor = typename AtenOpT::Adaptor; LogicalResult readMatMulInputs(AtenOpT op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Value &lhs, Value &rhs) const override { lhs = adaptor.getSelf(); auto lhsTy = cast(lhs.getType()); rhs = adaptor.getOther(); auto rhsTy = cast(rhs.getType()); if (!lhsTy || !rhsTy) return rewriter.notifyMatchFailure( op, "Only ranked tensor types supported in TOSA matmul"); return success(); } }; // Implements handling of aten.mm and aten.bmm ops. template class ConvertAtenMmOp : public ConvertAtenMatmulBaseOp { public: using ConvertAtenMatmulBaseOp::ConvertAtenMatmulBaseOp; using OpAdaptor = typename AtenOpT::Adaptor; LogicalResult readMatMulInputs(AtenOpT op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Value &lhs, Value &rhs) const override { lhs = adaptor.getSelf(); auto lhsTy = cast(lhs.getType()); rhs = adaptor.getMat2(); auto rhsTy = cast(rhs.getType()); if (!lhsTy || !rhsTy) return rewriter.notifyMatchFailure( op, "Only ranked tensor types supported in TOSA matmul"); auto lhsRank = lhsTy.getRank(); auto rhsRank = rhsTy.getRank(); if (isa(op)) { // Mm takes two 2D tensors. if (lhsRank != 2 || rhsRank != 2) return op.emitError("aten.mm called but matrix rank != 2"); } else if (isa(op)) { // Bmm takes two 3D tensors. if (lhsRank != 3 || rhsRank != 3) return op.emitError("aten.bmm called but matrix rank != 3"); } return success(); } }; // Implements handling of aten.linear op. template class ConvertAtenLinearOp : public ConvertAtenMatmulBaseOp { public: using ConvertAtenMatmulBaseOp::ConvertAtenMatmulBaseOp; using OpAdaptor = typename AtenOpT::Adaptor; LogicalResult readMatMulInputs(AtenOpT op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter, Value &lhs, Value &rhs) const override { lhs = adaptor.getInput(); auto lhsTy = cast(lhs.getType()); rhs = adaptor.getWeight(); auto rhsTy = cast(rhs.getType()); if (!lhsTy || !rhsTy) return rewriter.notifyMatchFailure( op, "Only ranked tensor types supported in TOSA matmul"); auto lhsRank = lhsTy.getRank(); auto rhsRank = rhsTy.getRank(); if (lhsRank != 2 && lhsRank != 3) return op.emitError("aten.Linear called but input rank not 2 or 3"); if (rhsRank != 2 && rhsRank != 3) return op.emitError("aten.Linear called but weight rank not 2 or 3"); // Protection against crash due to unguarded code in TOSA->LinAlg. // TODO: This should be handled in TOSA->LinAlg instead. if (!lhsTy.hasStaticShape() || !rhsTy.hasStaticShape()) return rewriter.notifyMatchFailure( op, "aten.Linear needs statically shaped input"); return success(); } // Override the default rewriter to perform RHS transpose and bias addition as // well. LogicalResult matchAndRewrite(AtenOpT op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { Value lhs, rhs; if (failed(readMatMulInputs(op, adaptor, rewriter, lhs, rhs))) return rewriter.notifyMatchFailure(op, "Failed to read matmul inputs"); // The aten.Linear op has a bias tensor that is added to the matmul output. auto bias = adaptor.getBias(); auto biasTy = bias.getType(); // TOSA does not mandate that elementwise op tensors need to be ranked. if (!isa(biasTy) && !isa(biasTy)) return rewriter.notifyMatchFailure( op, "Only tensor types supported in GEMM to TOSA for bias tensor"); // RHS must have its last two dims transposed prior to matrix // multiplication. auto rhsTy = cast(rhs.getType()); auto rhsRank = rhsTy.getRank(); auto rhsShape = makeShapeTorchCompatible(rhsTy.getShape()); auto rhsElemTy = rhsTy.getElementType(); // Create a non-const shape array to transpose dims. SmallVector transposedRhsShape; for (auto &shape : rhsShape) transposedRhsShape.push_back(shape); SmallVector transposedRhsDims; for (int32_t i = 0; i < rhsRank; i++) transposedRhsDims.push_back(i); // Swap the last two dims. std::swap(transposedRhsShape[rhsRank - 1], transposedRhsShape[rhsRank - 2]); std::swap(transposedRhsDims[rhsRank - 1], transposedRhsDims[rhsRank - 2]); std::optional transposedRhsShapeConst = tosa::getConstTensor( rewriter, op, /*vec=*/transposedRhsDims, /*shape=*/{static_cast(transposedRhsDims.size())}); auto transposedRhsType = RankedTensorType::get( makeShapeLLVMCompatible(transposedRhsShape), rhsElemTy); rhs = rewriter.create( op->getLoc(), OpConversionPattern::getTypeConverter()->convertType( transposedRhsType), rhs, transposedRhsShapeConst.value()); Value matmulOutput; if (failed( this->performMatmul(op, adaptor, rewriter, lhs, rhs, matmulOutput))) return rewriter.notifyMatchFailure(op, "Failed to perform matmul operation"); Value matmulPlusBias = matmulOutput; if (!isa(biasTy)) { // Bias addition broadcasts to the matmul output shape. matmulPlusBias = rewriter .create(op->getLoc(), matmulOutput.getType(), matmulOutput, bias) .getResult(); } rewriter.replaceOpWithNewOp( op, cast( OpConversionPattern::getTypeConverter()->convertType( op.getType())), matmulPlusBias); return success(); } }; template <> LogicalResult ConvertAtenOp::matchAndRewrite( AtenRsubScalarOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { auto self = adaptor.getSelf(); auto otherScalar = op.getOther(); auto alphaScalar = op.getAlpha(); auto selfTy = cast(self.getType()); if (!selfTy) return rewriter.notifyMatchFailure( op, "Only ranked tensor types supported in TOSA Rsub"); Value otherTensor, alphaTensor; if (failed(torchScalarToTosaTensor(rewriter, op, otherScalar, otherTensor, selfTy.getElementType(), {}))) return rewriter.notifyMatchFailure( op, "Currently only scalar constants are supported for " "conversion in TOSA Rsub operation"); if (failed(torchAlphaToTosaTensor(rewriter, op.getOperation(), alphaScalar, alphaTensor, selfTy.getElementType(), /*checkForUnity=*/true))) return failure(); auto multTensor = rewriter.create( op->getLoc(), getTypeConverter()->convertType(op.getType()), self, alphaTensor, /*shift=*/0); rewriter.replaceOpWithNewOp( op, getTypeConverter()->convertType(op.getType()), otherTensor, multTensor); return success(); } template <> LogicalResult ConvertAtenOp::matchAndRewrite( AtenConvolutionOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { bool transposed; if (!matchPattern(op.getTransposed(), m_TorchConstantBool(&transposed))) return rewriter.notifyMatchFailure( op, "Unimplemented: non-constant value for transposed not supported"); if (transposed) return rewriter.notifyMatchFailure( op, "Unimplemented: transposed convolution not supported"); auto input = adaptor.getInput(); auto weight = adaptor.getWeight(); auto inputTy = cast(input.getType()); auto weightTy = cast(weight.getType()); auto outputTy = cast(getTypeConverter()->convertType(op.getType())); if (!inputTy || !weightTy || !outputTy) return rewriter.notifyMatchFailure( op, "Input, weight and output to Convolution must be ranked tensors"); auto inputElemTy = inputTy.getElementType(); auto weightElemTy = weightTy.getElementType(); auto inputShape = makeShapeTorchCompatible(inputTy.getShape()); auto weightShape = makeShapeTorchCompatible(weightTy.getShape()); if (inputTy.getRank() != 4) return rewriter.notifyMatchFailure( op, "Unimplemented: only 2D convolutions supported"); if (!weightTy.hasStaticShape()) return rewriter.notifyMatchFailure( op, "Unimplemented: TOSA only supports static weight"); // Bias is optional. TOSA mandates a zero tensor here, so construct one if // required. auto bias = adaptor.getBias(); if (isa(adaptor.getBias().getType())) { // TBD: This is only valid for quantized 8-bit. For 16-bit, the bias (and // accumulator) are 48-bit and not 32-bit, and requires the use of APInt to // define a 48-bit int. if (isa(inputElemTy)) { SmallVector zeroVec(weightShape[0], 0); bias = tosa::getConstTensor( rewriter, op, zeroVec, {static_cast(weightShape[0])}) .value(); } else { SmallVector zeroVec(weightShape[0], 0); bias = tosa::getConstTensor(rewriter, op, zeroVec, {static_cast(weightShape[0])}) .value(); } } else { if (!cast(bias.getType())) return rewriter.notifyMatchFailure( op, "Bias provided but not a ranked tensor"); } auto biasElemTy = isa(inputElemTy) ? inputElemTy : rewriter.getI32Type(); int64_t groups; if (!matchPattern(op.getGroups(), m_TorchConstantInt(&groups))) { return rewriter.notifyMatchFailure(op, "non-const group size unsupported"); } else if (groups != 1 && weightShape[1] != 1) { return rewriter.notifyMatchFailure( op, "group size must be 1 (convolution) or weight.dim(1) must be 1 " "(depthwise convolution)"); } SmallVector stride; if (!matchPattern(adaptor.getStride(), m_TorchListOfConstantInts(stride))) return rewriter.notifyMatchFailure(op, "non-const stride list unsupported"); SmallVector padding_2d; if (!matchPattern(adaptor.getPadding(), m_TorchListOfConstantInts(padding_2d))) return rewriter.notifyMatchFailure(op, "non-const padding list unsupported"); // TOSA uses 4D padding {t, b, l, r} while Torch defines 2D padding {t, l}. // The Torch OFM computation uses 2*pad in each spatial direction, implying // the same t=b and l=r values for TOSA. SmallVector padding( {padding_2d[0], padding_2d[0], padding_2d[1], padding_2d[1]}); SmallVector dilation; if (!matchPattern(adaptor.getDilation(), m_TorchListOfConstantInts(dilation))) return rewriter.notifyMatchFailure(op, "non-const dilation list unsupported"); // TOSA works in NHWC and takes OHWI (conv) / HWIM (depthwise conv) weights. // Perform the necessary transformations. std::optional nchwToNhwcTransposeConst = tosa::getConstTensor(rewriter, op, /*vec=*/{0, 2, 3, 1}, /*shape=*/{static_cast(4)}); SmallVector transposedInputShape( {inputShape[0], inputShape[2], inputShape[3], inputShape[1]}); auto transposedInputType = RankedTensorType::get( makeShapeLLVMCompatible(transposedInputShape), inputElemTy); auto transposedInput = rewriter .create( op->getLoc(), getTypeConverter()->convertType(transposedInputType), input, nchwToNhwcTransposeConst.value()) .getResult(); SmallVector transformedWeightShape; RankedTensorType transformedWeightType; Value transformedWeight; int64_t outputCDim; if (groups == 1) { // full convolution: O(I/G)HW-> OHWI transformedWeightShape = {weightShape[0], weightShape[2], weightShape[3], weightShape[1]}; transformedWeightType = RankedTensorType::get( makeShapeLLVMCompatible(transformedWeightShape), weightElemTy); transformedWeight = rewriter .create( op->getLoc(), getTypeConverter()->convertType(transformedWeightType), weight, nchwToNhwcTransposeConst.value()) .getResult(); outputCDim = transformedWeightShape[0]; } else if (weightShape[1] == 1) { // depthwise convolution: O(I/G)HW-> HWIM) // transpose: O(I/G)HW -> HWO(I/G) std::optional transposeConst = tosa::getConstTensor(rewriter, op, /*vec=*/{2, 3, 0, 1}, /*shape=*/{static_cast(4)}); SmallVector transposedWeightShape = { weightShape[2], weightShape[3], weightShape[0], weightShape[1]}; auto transposedWeightType = RankedTensorType::get( makeShapeLLVMCompatible(transposedWeightShape), weightElemTy); auto transposedWeight = rewriter .create( op->getLoc(), getTypeConverter()->convertType(transposedWeightType), weight, transposeConst.value()) .getResult(); // reshape: HWO(I/G) -> HWIM outputCDim = makeShapeTorchCompatible(outputTy.getShape())[1]; if (outputCDim == kUnknownSize) { return rewriter.notifyMatchFailure( op, "number of output channels must be statically known for " "depthwise convolutions"); } transformedWeightShape = { transposedWeightShape[0], transposedWeightShape[1], groups, outputCDim / groups, }; transformedWeightType = RankedTensorType::get( makeShapeLLVMCompatible(transformedWeightShape), weightElemTy); transformedWeight = rewriter .create( op->getLoc(), getTypeConverter()->convertType(transformedWeightType), transposedWeight, rewriter.getDenseI64ArrayAttr(transformedWeightShape)) .getResult(); } else { llvm_unreachable("Unhandled convolution type"); } int64_t outputHDim, outputWDim; if (inputTy.hasStaticShape()) { int64_t inputHDim = inputShape[2]; int64_t inputWDim = inputShape[3]; int64_t weightHDim = weightShape[2]; int64_t weightWDim = weightShape[3]; outputHDim = (inputHDim + padding[0] + padding[1] - dilation[0] * (weightHDim - 1) - 1) / stride[0] + 1; outputWDim = (inputWDim + padding[2] + padding[3] - dilation[1] * (weightWDim - 1) - 1) / stride[1] + 1; } else { outputHDim = kUnknownSize; outputWDim = kUnknownSize; } // Output shape is NHWC, to be transposed back to NCHW. Output elemTy for // quantized input is i32, which gets rescaled down to quantized output range. SmallVector outputShape = {transposedInputShape[0], outputHDim, outputWDim, outputCDim}; auto convOpTy = RankedTensorType::get(makeShapeLLVMCompatible(outputShape), biasElemTy); Value convOpResult; if (groups == 1) { // full convolution convOpResult = rewriter .create(op->getLoc(), getTypeConverter()->convertType(convOpTy), transposedInput, transformedWeight, bias, rewriter.getDenseI64ArrayAttr(padding), rewriter.getDenseI64ArrayAttr(stride), rewriter.getDenseI64ArrayAttr(dilation)) .getResult(); } else if (weightShape[1] == 1) { // depthwise convolution convOpResult = rewriter .create( op->getLoc(), getTypeConverter()->convertType(convOpTy), transposedInput, transformedWeight, bias, rewriter.getDenseI64ArrayAttr(padding), rewriter.getDenseI64ArrayAttr(stride), rewriter.getDenseI64ArrayAttr(dilation)) .getResult(); } else { llvm_unreachable("Unhandled convolution type"); } std::optional nhwcToNchwTransposeConst = tosa::getConstTensor(rewriter, op, /*vec=*/{0, 3, 1, 2}, /*shape=*/{static_cast(4)}); SmallVector transposedOutputShape( {outputShape[0], outputShape[3], outputShape[1], outputShape[2]}); auto transposedOutputType = RankedTensorType::get( makeShapeLLVMCompatible(transposedOutputShape), biasElemTy); auto transposedOutput = rewriter .create( op->getLoc(), getTypeConverter()->convertType(transposedOutputType), convOpResult, nhwcToNchwTransposeConst.value()) .getResult(); Value rescaledResult = transposedOutput; if (isa(inputElemTy)) { rescaledResult = tosa::buildRescaleOpConvOutput( rewriter, op, transposedOutput, inputTy, weightTy, outputTy); } rewriter.replaceOpWithNewOp( op, getTypeConverter()->convertType(op.getType()), rescaledResult); return success(); } template <> LogicalResult ConvertAtenOp::matchAndRewrite( AtenReshapeOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { auto self = adaptor.getSelf(); auto selfTy = cast(self.getType()); if (!selfTy) return rewriter.notifyMatchFailure( op, "Only ranked tensor types supported in TOSA Reshape"); // Check that at most one dimension is -1 SmallVector newShape; if (!matchPattern(op.getShape(), m_TorchListOfConstantInts(newShape))) return rewriter.notifyMatchFailure( op, "Only constant shape supported in TOSA Reshape"); int auto_sz = 0; for (auto s : newShape) auto_sz += (s == -1 ? 1 : 0); if (auto_sz > 1) return rewriter.notifyMatchFailure( op, "At most one dimension may be specified as -1 to " "automatically calculate its size"); auto newType = RankedTensorType::get(makeShapeLLVMCompatible(newShape), selfTy.getElementType()); rewriter.replaceOpWithNewOp( op, getTypeConverter()->convertType(newType), self, rewriter.getDenseI64ArrayAttr(newShape)); return success(); } Value computeBatchNorm(Operation *op, ConversionPatternRewriter &rewriter, Type outType, Value input, Value variance, Value eps, Value mean, Value weight, Value bias) { // For PyTorch: // scale = gamma = weight // offset = beta = bias // Lowering: // fused batchnorm = (input-mean) * scale * rsqrt(var+epsilon)) + offset // // shape_0 = ones(input.rank) // shape_0[input.rank-1] = input.shape[input.rank-1] // shape_1 = ones(1) // // bmean = reshape(mean, shape_0) // bscale = reshape(scale, shape_0) // boffset= reshape(offset, shape_0) // beps = reshape(epsilon, shape_1) // // op1 = sub(input, bmean) // op2 = add(var, beps) // op3 = rsqrt(op2) // bvar = reshape(op3, shape_0) // op4 = mul(op1, bvar) // op5 = mul(op4, bscale) // op6 = add(op5, boffset) auto op1SubInputMean = rewriter.create(op->getLoc(), outType, input, mean); auto op2AddVarEpsilon = rewriter.create( op->getLoc(), variance.getType(), variance, eps); auto op3RsqrtOp2 = rewriter.create( op->getLoc(), variance.getType(), op2AddVarEpsilon.getResult()); auto op4MulOp1Op3 = rewriter.create(op->getLoc(), outType, op1SubInputMean.getResult(), op3RsqrtOp2.getResult(), 0); auto op5MulOp4Scale = rewriter.create( op->getLoc(), outType, op4MulOp1Op3.getResult(), weight, 0); return rewriter .create(op->getLoc(), outType, op5MulOp4Scale.getResult(), bias) .getResult(); } // This lowering is based on the TensorFlow to TOSA lowering. template <> LogicalResult ConvertAtenOp::matchAndRewrite( AtenBatchNormOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { // Not a ranked tensor output if (!dyn_cast(adaptor.getInput().getType())) return rewriter.notifyMatchFailure( op, "Only ranked tensor types are supported"); auto outType = getTypeConverter()->convertType(op.getType()); // Note: cudnn_enabled is not handled. // FIXME: Handle training and momentum. if (isa(op.getMomentum().getType())) return rewriter.notifyMatchFailure(op, "Unsupported None for momentum"); auto meanType = dyn_cast(adaptor.getRunningMean().getType()); auto varianceType = dyn_cast(adaptor.getRunningVar().getType()); if (!varianceType || !meanType) return rewriter.notifyMatchFailure( op, "Only ranked tensor types are supported"); // Normalization ops perform elementwise ops of a single mean/stdev value // against the feature map and because input is NCHW, the rank-1 value must be // reshaped so it sits on the same dim as 'C'. auto reshapeToNormInputDim = [&](Operation *op, ConversionPatternRewriter &rewriter, const TypeConverter *converter, Type outType, const Value toBcast, Value &result) { RankedTensorType toBcastType = dyn_cast(toBcast.getType()); if (toBcastType.getRank() > 1) return rewriter.notifyMatchFailure(op, "Rank cannot be more than 1"); RankedTensorType outTensorType = cast(outType); SmallVector newShape = { makeShapeTorchCompatible(toBcastType.getShape())[0]}; for (auto i = 2; i < outTensorType.getRank(); ++i) newShape.push_back(1); auto newType = RankedTensorType::get(makeShapeLLVMCompatible(newShape), outTensorType.getElementType()); result = rewriter.create