mirror of https://github.com/llvm/torch-mlir
[tosa] Implement matmul, mm and bmm support (#526)
- Also handles broadcasting n-D tensors and dynamic shapes.

Signed-off-by: Suraj Sudhir <suraj.sudhir@arm.com>
parent 3745f54489
commit 0188ca5498

@@ -678,6 +678,566 @@ LogicalResult ConvertAtenOp<AtenPowTensorScalarOp>::matchAndRewrite(
  return success();
}

// Perform torch matmul, mm and bmm
template <typename AtenOpT>
class ConvertAtenMatMulOp : public OpConversionPattern<AtenOpT> {
public:
  using OpConversionPattern<AtenOpT>::OpConversionPattern;
  using OpAdaptor = typename AtenOpT::Adaptor;
  LogicalResult
  matchAndRewrite(AtenOpT op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    Value lhs = adaptor.self();
    auto lhsTy = lhs.getType().cast<RankedTensorType>();

    // Aten matmul, mm and bmm refer to the second operand by different names.
    Value rhs = adaptor.getOperands()[1];
    auto rhsTy = rhs.getType().cast<RankedTensorType>();

    if (!lhsTy || !rhsTy)
      return op.emitError("Only ranked tensor types supported in TOSA matmul");

    auto lhsRank = lhsTy.getRank();
    auto rhsRank = rhsTy.getRank();

    // Mm takes two 2D tensors.
    if (isa<AtenMmOp>(op)) {
      assert(lhsRank == 2 && rhsRank == 2 &&
             "aten.mm called but matrix rank != 2");
    }

    // Bmm takes two 3D tensors.
    if (isa<AtenBmmOp>(op)) {
      assert(lhsRank == 3 && rhsRank == 3 &&
             "aten.bmm called but matrix rank != 3");
    }

    auto lhsShape = lhsTy.getShape();
    auto rhsShape = rhsTy.getShape();

    auto lhsElemTy = lhsTy.getElementType();
    auto rhsElemTy = rhsTy.getElementType();

    if (lhsElemTy != rhsElemTy)
      return op.emitError("Matmul: input datatypes mismatched");

    // Legalization constructs may offer input shapes but expect output shapes
    // to be inferred, e.g.
    // func @forward(%arg0: !torch.vtensor<[14,19],f32>,
    //               %arg1: !torch.vtensor<[19,28],f32>) ->
    //               !torch.vtensor<[?,?],f32>
    // This is tricky with matmul, since TOSA matmul operates on 3D inputs.
    // This means we potentially need to reshape both inputs and the output,
    // and a reshape to an unknown shape is undefined.

    auto maxInputRank = lhsRank > rhsRank ? lhsRank : rhsRank;
    // If performing a dot product on vectors, the RHS is synthetically
    // transposed.
    if (maxInputRank == 1)
      maxInputRank++;

    // Obtaining the rank-broadcasted shapes of the tensors makes it easier to
    // construct the input and output reshaping logic.
    auto getRankBroadcastedShape = [&](Value tensor,
                                       bool isRHS) -> SmallVector<int64_t> {
      auto tensorTy = tensor.getType().cast<TensorType>();
      auto tensorShape = tensorTy.getShape();
      auto tensorRank = tensorTy.getRank();

      SmallVector<int64_t> bcastedShape;

      auto bcastDims = maxInputRank - tensorRank;

      if (isRHS && (tensorRank == 1) && bcastDims) {
        // A rank-1 RHS is special: it is synthetically transposed so that its
        // only dim lands at position dim[-2].
        for (int32_t i = 0; i < bcastDims - 1; i++)
          bcastedShape.push_back(1);
        bcastedShape.push_back(tensorShape[0]);
        bcastedShape.push_back(1);
      } else {
        if (bcastDims > 0) { // rank broadcast
          for (uint32_t i = 0; i < bcastDims; i++)
            bcastedShape.push_back(1);
        }
        for (auto &dim : tensorShape)
          bcastedShape.push_back(dim);
      }
      return bcastedShape;
    };
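
    // For example, with maxInputRank == 4: a rank-2 LHS of shape [5, 6] is
    // rank-broadcast to [1, 1, 5, 6], while a rank-1 RHS of shape [6] becomes
    // [1, 1, 6, 1] (its only dim placed in the penultimate, i.e. matmul,
    // position).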

    // Step: Rank broadcast the two inputs.
    auto lhsBroadcastedShape = getRankBroadcastedShape(lhs, false);
    auto lhsBroadcastedTy =
        RankedTensorType::get(lhsBroadcastedShape, lhsElemTy);
    auto rhsBroadcastedShape = getRankBroadcastedShape(rhs, true);
    auto rhsBroadcastedTy =
        RankedTensorType::get(rhsBroadcastedShape, rhsElemTy);

    auto rankBroadcastedLhs =
        lhsRank == maxInputRank
            ? lhs
            : rewriter.create<tosa::ReshapeOp>(
                  op->getLoc(),
                  OpConversionPattern<AtenOpT>::getTypeConverter()->convertType(
                      lhsBroadcastedTy),
                  lhs, rewriter.getI64ArrayAttr(lhsBroadcastedShape));

    auto rankBroadcastedRhs =
        rhsRank == maxInputRank
            ? rhs
            : rewriter.create<tosa::ReshapeOp>(
                  op->getLoc(),
                  OpConversionPattern<AtenOpT>::getTypeConverter()->convertType(
                      rhsBroadcastedTy),
                  rhs, rewriter.getI64ArrayAttr(rhsBroadcastedShape));

    // TOSA matmul is performed on two 3D inputs and generates a 3D output.
    // Lower-ranked tensors are dim-1 reshaped up to 3D.
    auto reshapeUpTo3DTensor = [&](Value tensor) -> Value {
      auto tensorTy = tensor.getType().cast<TensorType>();
      auto rank = tensorTy.getRank();

      assert(rank <= 3 && "reshapeUpTo3D tensor must receive rank <= 3");
      if (rank == 3)
        return tensor;

      auto shape = tensorTy.getShape();
      SmallVector<int64_t> newShape({1, 1, 1});

      if (rank == 2) { // batchsize = 1
        newShape[1] = shape[0];
        newShape[2] = shape[1];
      } else { // rank 1
        newShape[2] = shape[0];
      }
      auto newType = RankedTensorType::get(newShape, tensorTy.getElementType());

      return rewriter.create<tosa::ReshapeOp>(
          op->getLoc(),
          OpConversionPattern<AtenOpT>::getTypeConverter()->convertType(
              newType),
          tensor, rewriter.getI64ArrayAttr(newShape));
    };
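
    // E.g. a [5, 6] tensor is reshaped to [1, 5, 6] (batch size 1) and a [6]
    // tensor to [1, 1, 6].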

    // Where broadcasting is required in one or more batch dims, the following
    // is done.
    // Where all batch dims are involved in broadcasting:
    // Given A: 3x1x5x6 and B: 1x4x6x7
    // 1. Reshape A to 1x15x6 (squeeze all batchdims into dim1)
    // 2. Transpose B to 6x1x4x7, Reshape to 1x6x28
    // 3. tosa.Matmul 1x15x6 1x6x28 = 1x15x28
    // 4. Reshape out to 3x5x4x7, Transpose to 3x4x5x7
    // Where only some batch dimensions are broadcast, dim0 is made to
    // correspond to the product of all non-broadcast dim sizes:
    // Given A: 4x8x16x32 B: 8x32x17
    // 1. Reshape A to 8x64x32 (squeeze all unbroadcasted dims into dim0,
    //    broadcasted dims into dim1)
    // 2. No transpose or reshape of B as its batch dims are not broadcast to.
    // 3. tosa.Matmul 8x64x32 8x32x17 = 8x64x17
    // 4. Reshape to 8x4x16x17, Transpose to 4x8x16x17

    // Check whether we need to perform the broadcast on the batch dims.
    // Not needed if max rank < 3, or if max rank == 3 and dim[0] matches.
    auto needsBatchDimBroadcast = [&]() -> bool {
      if (maxInputRank < 3) {
        return false;
      } else {
        if (maxInputRank == 3 &&
            lhsBroadcastedShape[0] == rhsBroadcastedShape[0]) {
          return false;
        }
        return true;
      }
    };
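
    // E.g. 2x5x6 * 2x6x7 needs no batch-dim broadcast (equal batch dims),
    // while 4x1x5x6 * 1x3x6x7 does; any input of rank > 3 always takes the
    // batch-broadcast path.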

    auto performBatchDimBroadcast = needsBatchDimBroadcast();

    // Inputs to the tosa.matmul
    Value matmulLhs, matmulRhs;

    using TensorShape_t = struct {
      int64_t dim;
      int64_t shape;
    };

    // A transpose is needed if transposedDims is not monotonically increasing.
    // E.g. [0, 1, 2, 3]: no transpose; [1, 0, 2, 3]: transpose dim0 and dim1.
    // The order need not be sequential, since one or more dims may have been
    // removed due to broadcasting.
    auto isTransposeRequired = [](SmallVector<int32_t> transposedDims) -> bool {
      int32_t lastDim = -1;
      for (auto &dim : transposedDims) {
        if (lastDim > dim)
          return true;
        lastDim = dim;
      }
      return false;
    };
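
    // E.g. {0, 1, 3} requires no transpose (still increasing even with dim 2
    // removed), while {1, 0, 2, 3} and {0, 2, 1} do.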

    SmallVector<TensorShape_t> commonElems, lhsSqueezedElems, rhsSqueezedElems;

    if (!performBatchDimBroadcast) {
      // Simple with no broadcasting artifacts. Just reshape up to 3D
      matmulLhs = reshapeUpTo3DTensor(rankBroadcastedLhs);
      matmulRhs = reshapeUpTo3DTensor(rankBroadcastedRhs);

    } else {
      // In this case, either or both input matrices involve broadcasting on
      // their batch dimensions. For example:
      // 4x5x6, 1x6x7 -> 4x5x7
      // 4x1x5x6, 1x3x6x7 -> 4x3x5x7
      // Though maxInputRank is necessarily >= 3 here, individual matrices may
      // be lower rank.
      // E.g. 3x4x5x6, 6 -> 3x4x5

      // These are the accumulated products of the shape of each dim:
      // 1. common dimensions: upper dimensions (dims other than the two
      //    rightmost) whose shapes are the same for both LHS and RHS.
      // 2. LHS squeezed dimensions: all dimensions of LHS that involve
      //    broadcasting in either direction, plus the LHS[-2] shape
      // 3. RHS squeezed dimensions: all dimensions of RHS that involve
      //    broadcasting in either direction, plus the RHS[-1] shape
      int64_t commonValue = 1, lhsSqueezedValue = 1, rhsSqueezedValue = 1;

      // For both LHS and RHS, the dimensions are separated into the common,
      // squeezed and remaining dims. E.g. given
      // LHS = 3x4x5x6
      // RHS = 1x4x6x7
      // common = {{dim=1, shape=4}}
      // lhs squeezed = {{dim=0, shape=3},
      //                 {dim=2, shape=5}}
      // rhs squeezed = {{dim=0, shape=1},
      //                 {dim=2, shape=7}}
      // The matmul dim is LHS[-1] and RHS[-2], i.e. 6.
      // Once this is obtained, LHS and RHS are expressed as:
      // LHS = {common, lhs_squeezed, matmul_dim}
      // RHS = {common, matmul_dim, rhs_squeezed}
      // The matmul is then performed to obtain output:
      // matmul_out = {common, lhs_squeezed, rhs_squeezed}
      // Finally, we reshape to 'unsqueeze' the LHS and RHS parts and transpose
      // them back to their correct positions.

      SmallVector<int64_t> transposedLhsShape;
      SmallVector<int32_t> transposedLhsDims;

      // Step: generate the common dim/shape information
      for (uint32_t dim = 0; dim < maxInputRank - 2; dim++) {
        bool isDynamicDim =
            lhsBroadcastedTy.isDynamic(lhsBroadcastedShape[dim]);
        if (isDynamicDim ||
            lhsBroadcastedShape[dim] == rhsBroadcastedShape[dim]) {
          commonValue *= lhsBroadcastedShape[dim];
          commonElems.push_back({dim, lhsBroadcastedShape[dim]});
        }
      }

      // Step: generate the LHS squeezed dim/shape information.
      bool hasDynamicDims = false;
      for (uint32_t dim = 0; dim < maxInputRank - 2; dim++) {
        bool isDynamicDim =
            lhsBroadcastedTy.isDynamic(lhsBroadcastedShape[dim]);
        hasDynamicDims |= isDynamicDim;
        if (!isDynamicDim &&
            lhsBroadcastedShape[dim] != rhsBroadcastedShape[dim]) {
          lhsSqueezedValue *= lhsBroadcastedShape[dim];
          lhsSqueezedElems.push_back({dim, lhsBroadcastedShape[dim]});
        }
      }
      // including LHS[-2]
      lhsSqueezedElems.push_back(
          {maxInputRank - 2, lhsBroadcastedShape[maxInputRank - 2]});
      lhsSqueezedValue *= lhsBroadcastedShape[maxInputRank - 2];

      // Step: Create the tosa.transpose array. If this array has a
      // non-monotonic series of dims, perform transpose.
      // First the common_elems
      for (uint32_t i = 0; i < commonElems.size(); i++) {
        transposedLhsShape.push_back(commonElems[i].shape);
        transposedLhsDims.push_back(commonElems[i].dim);
      }
      // then the lhs_squeezed elems
      for (uint32_t i = 0; i < lhsSqueezedElems.size(); i++) {
        transposedLhsShape.push_back(lhsSqueezedElems[i].shape);
        transposedLhsDims.push_back(lhsSqueezedElems[i].dim);
      }
      // then the final dim
      transposedLhsDims.push_back(maxInputRank - 1);
      transposedLhsShape.push_back(lhsBroadcastedShape[maxInputRank - 1]);

      bool lhsNeedsTranspose = isTransposeRequired(transposedLhsDims);

      auto lhsReshapeInput = rankBroadcastedLhs;

      if (lhsNeedsTranspose) {
        auto transposedLhsType =
            RankedTensorType::get(transposedLhsShape, rhsElemTy);

        llvm::Optional<Value> transposedLhsDimsConst =
            tosa::getConstTensor<int32_t>(
                rewriter, op,
                /*vec=*/transposedLhsDims,
                /*shape=*/{static_cast<int32_t>(transposedLhsDims.size())});

        lhsReshapeInput =
            rewriter
                .create<tosa::TransposeOp>(
                    op->getLoc(),
                    OpConversionPattern<AtenOpT>::getTypeConverter()
                        ->convertType(transposedLhsType),
                    rankBroadcastedLhs, transposedLhsDimsConst.getValue())
                .getResult();
      }

      // LHS = {common, lhs_squeezed, matmul_dim}
      SmallVector<int64_t> newLhsShape(
          {1, 1, lhsBroadcastedShape[maxInputRank - 1]});
      newLhsShape[0] = commonValue;
      newLhsShape[1] =
          hasDynamicDims ? ShapedType::kDynamicSize : lhsSqueezedValue;

      auto newLhsType = RankedTensorType::get(newLhsShape, lhsElemTy);

      matmulLhs = rewriter.create<tosa::ReshapeOp>(
          op->getLoc(),
          OpConversionPattern<AtenOpT>::getTypeConverter()->convertType(
              newLhsType),
          lhsReshapeInput, rewriter.getI64ArrayAttr(newLhsShape));
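
      // Continuing the LHS = 3x4x5x6, RHS = 1x4x6x7 example above:
      // commonValue = 4 and lhsSqueezedValue = 3 * 5 = 15, so the LHS is
      // transposed (perm = [1, 0, 2, 3]) to 4x3x5x6 and then reshaped to the
      // 3D matmulLhs of shape 4x15x6.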

      SmallVector<int64_t> transposedRhsShape;
      SmallVector<int32_t> transposedRhsDims;

      // Step: Create the RHS transpose sequence
      // RHS = {common, matmul_dim, rhs_squeezed}
      // first the common_dims
      for (uint32_t i = 0; i < commonElems.size(); i++) {
        transposedRhsShape.push_back(commonElems[i].shape);
        transposedRhsDims.push_back(commonElems[i].dim);
      }
      // The matmul_dim of RHS
      transposedRhsDims.push_back(maxInputRank - 2);
      transposedRhsShape.push_back(rhsBroadcastedShape[maxInputRank - 2]);
      // finally all the rhs_squeezed dims
      hasDynamicDims = false;
      for (uint32_t dim = 0; dim < maxInputRank - 2; dim++) {
        bool isDynamicDim =
            rhsBroadcastedTy.isDynamic(rhsBroadcastedShape[dim]);
        hasDynamicDims |= isDynamicDim;
        if (!isDynamicDim &&
            rhsBroadcastedShape[dim] != lhsBroadcastedShape[dim]) {
          rhsSqueezedElems.push_back({dim, rhsBroadcastedShape[dim]});
          rhsSqueezedValue *= rhsBroadcastedShape[dim];
        }
      }
      rhsSqueezedElems.push_back(
          {maxInputRank - 1, rhsBroadcastedShape[maxInputRank - 1]});
      rhsSqueezedValue *= rhsBroadcastedShape[maxInputRank - 1];
      for (uint32_t i = 0; i < rhsSqueezedElems.size(); i++) {
        transposedRhsShape.push_back(rhsSqueezedElems[i].shape);
        transposedRhsDims.push_back(rhsSqueezedElems[i].dim);
      }

      auto transposedRhsType =
          RankedTensorType::get(transposedRhsShape, rhsElemTy);

      if (hasDynamicDims)
        rhsSqueezedValue = ShapedType::kDynamicSize;

      SmallVector<int64_t> newRhsShape({commonValue,
                                        rhsBroadcastedShape[maxInputRank - 2],
                                        rhsSqueezedValue});
      auto newRhsType = RankedTensorType::get(newRhsShape, rhsElemTy);

      bool rhsNeedsTranspose = isTransposeRequired(transposedRhsDims);

      auto transposedRhsValue = rankBroadcastedRhs;

      if (rhsNeedsTranspose) {
        llvm::Optional<Value> transposedRhsDimsConst =
            tosa::getConstTensor<int32_t>(
                rewriter, op,
                /*vec=*/transposedRhsDims,
                /*shape=*/{static_cast<int32_t>(transposedRhsDims.size())});

        transposedRhsValue =
            rewriter
                .create<tosa::TransposeOp>(
                    op->getLoc(),
                    OpConversionPattern<AtenOpT>::getTypeConverter()
                        ->convertType(transposedRhsType),
                    rankBroadcastedRhs, transposedRhsDimsConst.getValue())
                .getResult();
      }

      // Reshape the RHS to {common, matmul_dim, rhs_squeezed}.
      matmulRhs = rewriter.create<tosa::ReshapeOp>(
          op->getLoc(),
          OpConversionPattern<AtenOpT>::getTypeConverter()->convertType(
              newRhsType),
          transposedRhsValue, rewriter.getI64ArrayAttr(newRhsShape));
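
      // For the running example: the RHS 1x4x6x7 is transposed
      // (perm = [1, 2, 0, 3]) to 4x6x1x7 and reshaped to 4x6x7, so the
      // tosa.matmul below computes 4x15x6 * 4x6x7 = 4x15x7.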
    }

    auto matmulLhsShape =
        matmulLhs.getType().template cast<RankedTensorType>().getShape();
    auto matmulRhsShape =
        matmulRhs.getType().template cast<RankedTensorType>().getShape();

    // The reshape/transpose should ensure the tosa.matmul always has the same
    // batch size for both matrices. If the shapes are dynamic, they'll be
    // appropriately handled.
    assert(matmulLhsShape[0] == matmulRhsShape[0] &&
           "tosa.matmul needs same batchsize on LHS and RHS");

    SmallVector<int64_t> matmulOutputShape(
        {matmulLhsShape[0], matmulLhsShape[1], matmulRhsShape[2]});
    Type outputElemTy;
    if (lhsElemTy.isa<mlir::FloatType>()) {
      outputElemTy = lhsElemTy;
    } else { // qint8 emits i32 matmul output
      outputElemTy = rewriter.getIntegerType(32);
    }

    auto mmOutputTy = RankedTensorType::get(matmulOutputShape, outputElemTy);
    auto mmOpResult =
        rewriter
            .create<tosa::MatMulOp>(
                op->getLoc(),
                OpConversionPattern<AtenOpT>::getTypeConverter()->convertType(
                    mmOutputTy),
                matmulLhs, matmulRhs)
            .getResult();

    // Perform the reshape to the output shape. This is always required unless
    // both inputs are rank 3, in which case the tosa.matmul output itself is
    // correctly shaped.
    bool performOpReshape = !(lhsRank == 3 && rhsRank == 3);

    auto outputTy = OpConversionPattern<AtenOpT>::getTypeConverter()
                        ->convertType(op.getType())
                        .template cast<RankedTensorType>();

    if (performOpReshape) {
      // Since the output shape may be unknown, we construct it
      // independently and reshape. Otherwise reshape may be expressed for
      // an unknown to-be-inferred output shape. The final tensor.cast
      // reshapes the known shape to the desired output shape.
      auto computeOpShape = [&](SmallVector<int64_t> &reshapedOpShape,
                                SmallVector<int32_t> &transposedOpDims,
                                SmallVector<int64_t> &transposedOpShapes) {
        if (maxInputRank == 1)
          return;

        if (maxInputRank == 2) {
          if (lhsRank == 2)
            reshapedOpShape.push_back(lhsShape[0]);
          if (rhsRank == 2)
            reshapedOpShape.push_back(rhsShape[1]);
          return;
        }

        // Step: Construct the output transpose/reshape information
        // First the common_dims
        for (uint32_t i = 0; i < commonElems.size(); i++) {
          reshapedOpShape.push_back(commonElems[i].shape);
          transposedOpDims.push_back(commonElems[i].dim);
        }

        // Then the LHS squeezed dims
        for (uint32_t i = 0; i < lhsSqueezedElems.size() - 1; i++) {
          // Only dims that don't broadcast - broadcasting ones come from the
          // other input.
          if (lhsSqueezedElems[i].shape != 1) {
            reshapedOpShape.push_back(lhsSqueezedElems[i].shape);
            transposedOpDims.push_back(lhsSqueezedElems[i].dim);
          }
        }
        // The last squeezed dim is lhs[-2], which needs to be
        // checked separately for broadcasting
        if (lhsRank > 1) {
          reshapedOpShape.push_back(lhsBroadcastedShape[maxInputRank - 2]);
          transposedOpDims.push_back(maxInputRank - 2);
        }

        // then the RHS squeezed dims except rhs[-1], which is handled like
        // lhs[-2]
        for (uint32_t i = 0; i < rhsSqueezedElems.size() - 1; i++) {
          if (rhsSqueezedElems[i].shape != 1) {
            reshapedOpShape.push_back(rhsSqueezedElems[i].shape);
            transposedOpDims.push_back(rhsSqueezedElems[i].dim);
          }
        }
        // rhs[-1]
        if (rhsRank > 1) {
          reshapedOpShape.push_back(rhsBroadcastedShape[maxInputRank - 1]);
          transposedOpDims.push_back(maxInputRank - 1);
        }

        // Final transposed output shape construction
        for (uint32_t i = 0; i < maxInputRank - 2; i++) {
          if (lhsBroadcastedTy.isDynamicDim(i)) {
            transposedOpShapes.push_back(ShapedType::kDynamicSize);
          } else {
            if (lhsBroadcastedShape[i] == rhsBroadcastedShape[i]) {
              transposedOpShapes.push_back(lhsBroadcastedShape[i]);
            } else {
              transposedOpShapes.push_back(lhsBroadcastedShape[i] == 1
                                               ? rhsBroadcastedShape[i]
                                               : lhsBroadcastedShape[i]);
            }
          }
        }
        if (lhsRank > 1)
          transposedOpShapes.push_back(lhsBroadcastedShape[maxInputRank - 2]);
        if (rhsRank > 1)
          transposedOpShapes.push_back(rhsBroadcastedShape[maxInputRank - 1]);

        return;
      };
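
      // For the running example (LHS = 3x4x5x6, RHS = 1x4x6x7), this yields
      // reshapedOpShape = [4, 3, 5, 7], transposedOpDims = [1, 0, 2, 3] and
      // transposedOpShape = [3, 4, 5, 7]: the 4x15x7 matmul output is
      // reshaped to 4x3x5x7 and transposed back to the expected 3x4x5x7.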

      SmallVector<int64_t> reshapedOpShape, transposedOpShape;
      SmallVector<int32_t> transposedOpDims;

      computeOpShape(reshapedOpShape, transposedOpDims, transposedOpShape);

      bool opNeedsTranspose = isTransposeRequired(transposedOpDims);

      // Perform reshape
      auto reshapedOpType =
          RankedTensorType::get(reshapedOpShape, outputElemTy);
      auto reshapedOp = rewriter.create<tosa::ReshapeOp>(
          op->getLoc(),
          OpConversionPattern<AtenOpT>::getTypeConverter()->convertType(
              reshapedOpType),
          mmOpResult, rewriter.getI64ArrayAttr(reshapedOpShape));

      if (opNeedsTranspose) {
        llvm::Optional<Value> transposedOpShapeConst =
            tosa::getConstTensor<int32_t>(
                rewriter, op,
                /*vec=*/transposedOpDims,
                /*shape=*/{static_cast<int32_t>(transposedOpDims.size())});

        auto transposedOpType =
            RankedTensorType::get(transposedOpShape, outputElemTy);
        auto transposedOp = rewriter.create<tosa::TransposeOp>(
            op->getLoc(),
            OpConversionPattern<AtenOpT>::getTypeConverter()->convertType(
                transposedOpType),
            reshapedOp.getResult(), transposedOpShapeConst.getValue());

        rewriter.replaceOpWithNewOp<tensor::CastOp>(op, outputTy, transposedOp);
      } else {
        rewriter.replaceOpWithNewOp<tensor::CastOp>(op, outputTy, reshapedOp);
      }
    } else {
      rewriter.replaceOpWithNewOp<tensor::CastOp>(op, outputTy, mmOpResult);
    }

    return success();
  }
};

} // namespace

// -----------------------------------------------------------------------------

@@ -774,6 +1334,14 @@ public:
    INSERT_SQUEEZE_OP_PATTERN(AtenSqueezeDimOp, ConvertAtenSqueezeOneDimOp)
#undef INSERT_SQUEEZE_OP_PATTERN

#define INSERT_MATMUL_ATENOP_PATTERN(AtenOp)                                   \
  target.addIllegalOp<AtenOp>();                                               \
  patterns.add<ConvertAtenMatMulOp<AtenOp>>(typeConverter, context);
    INSERT_MATMUL_ATENOP_PATTERN(AtenMatmulOp);
    INSERT_MATMUL_ATENOP_PATTERN(AtenMmOp);
    INSERT_MATMUL_ATENOP_PATTERN(AtenBmmOp);
#undef INSERT_MATMUL_ATENOP_PATTERN

#define INSERT_ATENOP_PATTERN(AtenOp)                                          \
  target.addIllegalOp<AtenOp>();                                               \
  patterns.add<ConvertAtenOp<AtenOp>>(typeConverter, context);