mirror of https://github.com/llvm/torch-mlir
[tosa] Implement matmul, mm and bmm support (#526)
- Also handles broadcasting n-D tensors, dynamic shapes. Signed-off-by: Suraj Sudhir <suraj.sudhir@arm.com> pull/523/merge snapshot-20220118.214
parent
3745f54489
commit
0188ca5498
|
@ -678,6 +678,566 @@ LogicalResult ConvertAtenOp<AtenPowTensorScalarOp>::matchAndRewrite(
|
||||||
|
|
||||||
return success();
|
return success();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Perform torch matmul, mm and bmm
|
||||||
|
template <typename AtenOpT>
|
||||||
|
class ConvertAtenMatMulOp : public OpConversionPattern<AtenOpT> {
|
||||||
|
public:
|
||||||
|
using OpConversionPattern<AtenOpT>::OpConversionPattern;
|
||||||
|
using OpAdaptor = typename AtenOpT::Adaptor;
|
||||||
|
LogicalResult
|
||||||
|
matchAndRewrite(AtenOpT op, OpAdaptor adaptor,
|
||||||
|
ConversionPatternRewriter &rewriter) const override {
|
||||||
|
Value lhs = adaptor.self();
|
||||||
|
auto lhsTy = lhs.getType().cast<RankedTensorType>();
|
||||||
|
|
||||||
|
// Aten matmul, mm and bmm call operand2 by different names.
|
||||||
|
Value rhs = adaptor.getOperands()[1];
|
||||||
|
auto rhsTy = rhs.getType().cast<RankedTensorType>();
|
||||||
|
|
||||||
|
if (!lhsTy || !rhsTy)
|
||||||
|
return op.emitError("Only ranked tensor types supported in TOSA matmul");
|
||||||
|
|
||||||
|
auto lhsRank = lhsTy.getRank();
|
||||||
|
auto rhsRank = rhsTy.getRank();
|
||||||
|
|
||||||
|
// Mm takes two 2D tensors
|
||||||
|
if (isa<AtenMmOp>(op)) {
|
||||||
|
assert(lhsRank == 2 && rhsRank == 2 &&
|
||||||
|
"aten.mm called but matrix rank != 2");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Bmm takes two 2D tensors
|
||||||
|
if (isa<AtenBmmOp>(op)) {
|
||||||
|
assert(lhsRank == 3 && rhsRank == 3 &&
|
||||||
|
"aten.bmm called but matrix rank != 2");
|
||||||
|
}
|
||||||
|
|
||||||
|
auto lhsShape = lhsTy.getShape();
|
||||||
|
auto rhsShape = rhsTy.getShape();
|
||||||
|
|
||||||
|
auto lhsElemTy = lhsTy.getElementType();
|
||||||
|
auto rhsElemTy = rhsTy.getElementType();
|
||||||
|
|
||||||
|
if (lhsElemTy != rhsElemTy)
|
||||||
|
return op.emitError("Matmul: input datatypes mismatched");
|
||||||
|
|
||||||
|
// Legalization constructs may offer input shapes but expect output shapes
|
||||||
|
// to be inferred, e.g.
|
||||||
|
// func @forward(%arg0: !torch.vtensor<[14,19],f32>,
|
||||||
|
// %arg1: !torch.vtensor<[19,28],f32>) ->
|
||||||
|
// !torch.vtensor<[?,?],f32>
|
||||||
|
// This is tricky with matmul, since TOSA matmul is on 3D inputs.
|
||||||
|
// This means the need to reshape potentially both inputs and outputs,
|
||||||
|
// and reshape to unknown shape is undefined.
|
||||||
|
|
||||||
|
auto maxInputRank = lhsRank > rhsRank ? lhsRank : rhsRank;
|
||||||
|
// If performing dot product on vectors, the RHS is synthetically transposed
|
||||||
|
if (maxInputRank == 1)
|
||||||
|
maxInputRank++;
|
||||||
|
|
||||||
|
// Obtaining the rank broadcasted shapes of tensors makes it easier to
|
||||||
|
// construct the input and output reshaping logic.
|
||||||
|
auto getRankBroadcastedShape = [&](Value tensor,
|
||||||
|
bool isRHS) -> SmallVector<int64_t> {
|
||||||
|
auto tensorTy = tensor.getType().cast<TensorType>();
|
||||||
|
auto tensorShape = tensorTy.getShape();
|
||||||
|
auto tensorRank = tensorTy.getRank();
|
||||||
|
|
||||||
|
SmallVector<int64_t> bcastedShape;
|
||||||
|
|
||||||
|
auto bcastDims = maxInputRank - tensorRank;
|
||||||
|
|
||||||
|
if (isRHS && (tensorRank == 1) && bcastDims) {
|
||||||
|
// RHS with rank1 is special. It be synthetically transposed to dim[:-2]
|
||||||
|
for (int32_t i = 0; i < bcastDims - 1; i++)
|
||||||
|
bcastedShape.push_back(1);
|
||||||
|
bcastedShape.push_back(tensorShape[0]);
|
||||||
|
bcastedShape.push_back(1);
|
||||||
|
} else {
|
||||||
|
if (bcastDims > 0) { // rank broadcast
|
||||||
|
for (uint32_t i = 0; i < bcastDims; i++)
|
||||||
|
bcastedShape.push_back(1);
|
||||||
|
}
|
||||||
|
for (auto &dim : tensorShape)
|
||||||
|
bcastedShape.push_back(dim);
|
||||||
|
}
|
||||||
|
return bcastedShape;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Step: Rank broadcast the two inputs.
|
||||||
|
auto lhsBroadcastedShape = getRankBroadcastedShape(lhs, false);
|
||||||
|
auto lhsBroadcastedTy =
|
||||||
|
RankedTensorType::get(lhsBroadcastedShape, lhsElemTy);
|
||||||
|
auto rhsBroadcastedShape = getRankBroadcastedShape(rhs, true);
|
||||||
|
auto rhsBroadcastedTy =
|
||||||
|
RankedTensorType::get(rhsBroadcastedShape, rhsElemTy);
|
||||||
|
|
||||||
|
auto rankBroadcastedLhs =
|
||||||
|
lhsRank == maxInputRank
|
||||||
|
? lhs
|
||||||
|
: rewriter.create<tosa::ReshapeOp>(
|
||||||
|
op->getLoc(),
|
||||||
|
OpConversionPattern<AtenOpT>::getTypeConverter()->convertType(
|
||||||
|
lhsBroadcastedTy),
|
||||||
|
lhs, rewriter.getI64ArrayAttr(lhsBroadcastedShape));
|
||||||
|
|
||||||
|
auto rankBroadcastedRhs =
|
||||||
|
rhsRank == maxInputRank
|
||||||
|
? rhs
|
||||||
|
: rewriter.create<tosa::ReshapeOp>(
|
||||||
|
op->getLoc(),
|
||||||
|
OpConversionPattern<AtenOpT>::getTypeConverter()->convertType(
|
||||||
|
rhsBroadcastedTy),
|
||||||
|
rhs, rewriter.getI64ArrayAttr(rhsBroadcastedShape));
|
||||||
|
|
||||||
|
// TOSA matmul is performed on two 3D inputs and generates a 3D output.
|
||||||
|
// Lower ranked tensors are dim-1 reshaped up to 3D
|
||||||
|
auto reshapeUpTo3DTensor = [&](Value tensor) -> Value {
|
||||||
|
auto tensorTy = tensor.getType().cast<TensorType>();
|
||||||
|
auto rank = tensorTy.getRank();
|
||||||
|
|
||||||
|
assert(rank <= 3 && "reshapeUpTo3D tensor must receive rank <= 3");
|
||||||
|
if (rank == 3)
|
||||||
|
return tensor;
|
||||||
|
|
||||||
|
auto shape = tensorTy.getShape();
|
||||||
|
SmallVector<int64_t> newShape({1, 1, 1});
|
||||||
|
|
||||||
|
if (rank == 2) { // batchsize = 1
|
||||||
|
newShape[1] = shape[0];
|
||||||
|
newShape[2] = shape[1];
|
||||||
|
} else { // rank 1
|
||||||
|
newShape[2] = shape[0];
|
||||||
|
}
|
||||||
|
auto newType = RankedTensorType::get(newShape, tensorTy.getElementType());
|
||||||
|
|
||||||
|
return rewriter.create<tosa::ReshapeOp>(
|
||||||
|
op->getLoc(),
|
||||||
|
OpConversionPattern<AtenOpT>::getTypeConverter()->convertType(
|
||||||
|
newType),
|
||||||
|
tensor, rewriter.getI64ArrayAttr(newShape));
|
||||||
|
};
|
||||||
|
|
||||||
|
// Where broadcasting is required in one or more batch dims, the following
|
||||||
|
// is done.
|
||||||
|
// Where all batch dims are involved in broadcasting:
|
||||||
|
// Given A: 3x1x5x6 and B: 1x4x6x7
|
||||||
|
// 1. Reshape A to 1x15x6 (squeeze all batchdims into dim1)
|
||||||
|
// 2. Transpose B to 6x1x4x7, Reshape to 1x6x28
|
||||||
|
// 3. tosa.Matmul 1x15x6 1x6x28 = 1x15x28
|
||||||
|
// 4. Reshape out to 3x5x4x7, Transpose to 3x4x5x7
|
||||||
|
// Where there are batch dimensions that are broadcast and not, the
|
||||||
|
// treatment is to have dim0 correspond to product of all non-broadcast
|
||||||
|
// dimsizes:
|
||||||
|
// Given A: 4x8x16x32 B: 8x32x17
|
||||||
|
// 1. Reshape A to 8x64x32 (squeeze all unbroadcasted dims into dim0,
|
||||||
|
// broadcasted dims into dim1)
|
||||||
|
// 2. No transpose or reshape of B as its batchdims are not broadcast to.
|
||||||
|
// 3. tosa.Matmul 8x64x32 8x32x17 = 8x64x17
|
||||||
|
// 4. Reshape to 8x4x16x17, Transpose to 4x8x16x17
|
||||||
|
|
||||||
|
// Check if we need to perform the broadcast on batch dim
|
||||||
|
// Not needed if max rank < 3, or if maxrank == 3 and dim[0] matches
|
||||||
|
auto needsBatchDimBroadcast = [&]() -> bool {
|
||||||
|
if (maxInputRank < 3) {
|
||||||
|
return false;
|
||||||
|
} else {
|
||||||
|
if (maxInputRank == 3 &&
|
||||||
|
lhsBroadcastedShape[0] == rhsBroadcastedShape[0]) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
auto performBatchDimBroadcast = needsBatchDimBroadcast();
|
||||||
|
|
||||||
|
// Inputs to the tosa.matmul
|
||||||
|
Value matmulLhs, matmulRhs;
|
||||||
|
|
||||||
|
using TensorShape_t = struct {
|
||||||
|
int64_t dim;
|
||||||
|
int64_t shape;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Transpose needs to done if transposeDims are not non-monotonically
|
||||||
|
// increasing. E.g. [0, 1, 2, 3]: No transpose [1, 0, 2, 3]: Transpose dim0
|
||||||
|
// and dim1 The order need not be sequential, since one or more dims may
|
||||||
|
// have been removed due to broadcasting.
|
||||||
|
auto isTransposeRequired = [](SmallVector<int32_t> transposedDims) -> bool {
|
||||||
|
int32_t lastDim = -1;
|
||||||
|
for (auto &dim : transposedDims) {
|
||||||
|
if (lastDim > dim)
|
||||||
|
return true;
|
||||||
|
lastDim = dim;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
};
|
||||||
|
|
||||||
|
SmallVector<TensorShape_t> commonElems, lhsSqueezedElems, rhsSqueezedElems;
|
||||||
|
|
||||||
|
if (!performBatchDimBroadcast) {
|
||||||
|
// Simple with no broadcasting artifacts. Just reshape up to 3D
|
||||||
|
matmulLhs = reshapeUpTo3DTensor(rankBroadcastedLhs);
|
||||||
|
matmulRhs = reshapeUpTo3DTensor(rankBroadcastedRhs);
|
||||||
|
|
||||||
|
} else {
|
||||||
|
// In this case, either or both input matrices involve broadcasting on
|
||||||
|
// their batch dimensions. For example:
|
||||||
|
// 4x5x6, 1x6x7 -> 4x5x7
|
||||||
|
// 4x1x5x6, 1x3x6x7 -> 4x3x5x7
|
||||||
|
// Though maxInputRank is necessarily >=3 here, individual matrices may be
|
||||||
|
// lower rank.
|
||||||
|
// E.g. 3x4x5x6, 6 -> 3x4x5
|
||||||
|
|
||||||
|
// These are the accumulated products of the shape of each dim:
|
||||||
|
// 1. common dimensions: upper dimensions (dims other than two rightmost)
|
||||||
|
// whose shapes are the same for both LHS and RHS.
|
||||||
|
// 2. LHS squeezed dimensions: all dimensions of LHS that involve
|
||||||
|
// broadcasting in either direction, plus the LHS[-2] shape
|
||||||
|
// 3. RHS squeezed dimensions: all dimensions of RHS that involve
|
||||||
|
// broadcasting in either direction, plus the RHS[-1] shape
|
||||||
|
int64_t commonValue = 1, lhsSqueezedValue = 1, rhsSqueezedValue = 1;
|
||||||
|
|
||||||
|
// For both LHS and RHS, the dimensions are separated into the common,
|
||||||
|
// squeezed and remaining dim. E.g. given
|
||||||
|
// LHS = 3x4x5x6
|
||||||
|
// RHS = 1x4x6x7
|
||||||
|
// common = {{dim=1, shape=4}}
|
||||||
|
// lhs squeezed = {{dim=0, shape=3},
|
||||||
|
// {dim=2, shape=5}}
|
||||||
|
// rhs squeezed = {{dim=0, shape=1},
|
||||||
|
// {dim=2, shape=7}}
|
||||||
|
// The matmul dim is LHS[-1] and RHS[-2], i.e. 6.
|
||||||
|
// Once this is obtained, LHS and RHS are expressed as:
|
||||||
|
// LHS = {common, lhs_squeezed, matmul_dim}
|
||||||
|
// RHS = {common, matmul_dim, rhs_squeezed}
|
||||||
|
// The matmul is then performed to obtain output:
|
||||||
|
// matmul_out = {common, lhs_squeezed, rhs_squeezed}
|
||||||
|
// Finally, we reshape to 'unsqueeze' the LHS and RHS parts and transpose
|
||||||
|
// them back to their correct positions.
|
||||||
|
|
||||||
|
SmallVector<int64_t> transposedLhsShape;
|
||||||
|
SmallVector<int32_t> transposedLhsDims;
|
||||||
|
|
||||||
|
// Step: generate the common dim/shape information
|
||||||
|
for (uint32_t dim = 0; dim < maxInputRank - 2; dim++) {
|
||||||
|
bool isDynamicDim =
|
||||||
|
lhsBroadcastedTy.isDynamic(lhsBroadcastedShape[dim]);
|
||||||
|
if (isDynamicDim ||
|
||||||
|
lhsBroadcastedShape[dim] == rhsBroadcastedShape[dim]) {
|
||||||
|
commonValue *= lhsBroadcastedShape[dim];
|
||||||
|
commonElems.push_back({dim, lhsBroadcastedShape[dim]});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step: generate the LHS squeezed dim/shape information.
|
||||||
|
bool hasDynamicDims = false;
|
||||||
|
for (uint32_t dim = 0; dim < maxInputRank - 2; dim++) {
|
||||||
|
bool isDynamicDim =
|
||||||
|
lhsBroadcastedTy.isDynamic(lhsBroadcastedShape[dim]);
|
||||||
|
hasDynamicDims |= isDynamicDim;
|
||||||
|
if (!isDynamicDim &&
|
||||||
|
lhsBroadcastedShape[dim] != rhsBroadcastedShape[dim]) {
|
||||||
|
lhsSqueezedValue *= lhsBroadcastedShape[dim];
|
||||||
|
lhsSqueezedElems.push_back({dim, lhsBroadcastedShape[dim]});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// including LHS[-2]
|
||||||
|
lhsSqueezedElems.push_back(
|
||||||
|
{maxInputRank - 2, lhsBroadcastedShape[maxInputRank - 2]});
|
||||||
|
lhsSqueezedValue *= lhsBroadcastedShape[maxInputRank - 2];
|
||||||
|
|
||||||
|
// Step: Create the tosa.transpose array. If this array has a
|
||||||
|
// non-monotonic series of dims, perform transpose.
|
||||||
|
// First the common_elems
|
||||||
|
for (uint32_t i = 0; i < commonElems.size(); i++) {
|
||||||
|
transposedLhsShape.push_back(commonElems[i].shape);
|
||||||
|
transposedLhsDims.push_back(commonElems[i].dim);
|
||||||
|
}
|
||||||
|
// then the lhs_squeezed elems
|
||||||
|
for (uint32_t i = 0; i < lhsSqueezedElems.size(); i++) {
|
||||||
|
transposedLhsShape.push_back(lhsSqueezedElems[i].shape);
|
||||||
|
transposedLhsDims.push_back(lhsSqueezedElems[i].dim);
|
||||||
|
}
|
||||||
|
// then the final dim
|
||||||
|
transposedLhsDims.push_back(maxInputRank - 1);
|
||||||
|
transposedLhsShape.push_back(lhsBroadcastedShape[maxInputRank - 1]);
|
||||||
|
|
||||||
|
bool lhsNeedsTranspose = isTransposeRequired(transposedLhsDims);
|
||||||
|
|
||||||
|
auto lhsReshapeInput = rankBroadcastedLhs;
|
||||||
|
|
||||||
|
if (lhsNeedsTranspose) {
|
||||||
|
auto transposedLhsType =
|
||||||
|
RankedTensorType::get(transposedLhsShape, rhsElemTy);
|
||||||
|
|
||||||
|
llvm::Optional<Value> transposedLhsDimsConst =
|
||||||
|
tosa::getConstTensor<int32_t>(
|
||||||
|
rewriter, op,
|
||||||
|
/*vec=*/transposedLhsDims,
|
||||||
|
/*shape=*/{static_cast<int32_t>(transposedLhsDims.size())});
|
||||||
|
|
||||||
|
lhsReshapeInput =
|
||||||
|
rewriter
|
||||||
|
.create<tosa::TransposeOp>(
|
||||||
|
op->getLoc(),
|
||||||
|
OpConversionPattern<AtenOpT>::getTypeConverter()
|
||||||
|
->convertType(transposedLhsType),
|
||||||
|
rankBroadcastedLhs, transposedLhsDimsConst.getValue())
|
||||||
|
.getResult();
|
||||||
|
}
|
||||||
|
|
||||||
|
// LHS = {common, lhs_squeezed, matmul_dim}
|
||||||
|
SmallVector<int64_t> newLhsShape(
|
||||||
|
{1, 1, lhsBroadcastedShape[maxInputRank - 1]});
|
||||||
|
newLhsShape[0] = commonValue;
|
||||||
|
newLhsShape[1] =
|
||||||
|
hasDynamicDims ? ShapedType::kDynamicSize : lhsSqueezedValue;
|
||||||
|
|
||||||
|
auto newLhsType = RankedTensorType::get(newLhsShape, lhsElemTy);
|
||||||
|
|
||||||
|
matmulLhs = rewriter.create<tosa::ReshapeOp>(
|
||||||
|
op->getLoc(),
|
||||||
|
OpConversionPattern<AtenOpT>::getTypeConverter()->convertType(
|
||||||
|
newLhsType),
|
||||||
|
lhsReshapeInput, rewriter.getI64ArrayAttr(newLhsShape));
|
||||||
|
|
||||||
|
SmallVector<int64_t> transposedRhsShape;
|
||||||
|
SmallVector<int32_t> transposedRhsDims;
|
||||||
|
|
||||||
|
// Step: Create the RHS transpose sequence
|
||||||
|
// RHS = {common, matmul_dim, rhs_squeezed}
|
||||||
|
// first the common_dims
|
||||||
|
for (uint32_t i = 0; i < commonElems.size(); i++) {
|
||||||
|
transposedRhsShape.push_back(commonElems[i].shape);
|
||||||
|
transposedRhsDims.push_back(commonElems[i].dim);
|
||||||
|
}
|
||||||
|
// The matmul_dim of RHS
|
||||||
|
transposedRhsDims.push_back(maxInputRank - 2);
|
||||||
|
transposedRhsShape.push_back(rhsBroadcastedShape[maxInputRank - 2]);
|
||||||
|
// finally all the rhs_squeeze dims
|
||||||
|
hasDynamicDims = false;
|
||||||
|
for (uint32_t dim = 0; dim < maxInputRank - 2; dim++) {
|
||||||
|
bool isDynamicDim =
|
||||||
|
rhsBroadcastedTy.isDynamic(rhsBroadcastedShape[dim]);
|
||||||
|
hasDynamicDims |= isDynamicDim;
|
||||||
|
if (!isDynamicDim &&
|
||||||
|
rhsBroadcastedShape[dim] != lhsBroadcastedShape[dim]) {
|
||||||
|
rhsSqueezedElems.push_back({dim, rhsBroadcastedShape[dim]});
|
||||||
|
rhsSqueezedValue *= rhsBroadcastedShape[dim];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
rhsSqueezedElems.push_back(
|
||||||
|
{maxInputRank - 1, rhsBroadcastedShape[maxInputRank - 1]});
|
||||||
|
rhsSqueezedValue *= rhsBroadcastedShape[maxInputRank - 1];
|
||||||
|
for (uint32_t i = 0; i < rhsSqueezedElems.size(); i++) {
|
||||||
|
transposedRhsShape.push_back(rhsSqueezedElems[i].shape);
|
||||||
|
transposedRhsDims.push_back(rhsSqueezedElems[i].dim);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto transposedRhsType =
|
||||||
|
RankedTensorType::get(transposedRhsShape, rhsElemTy);
|
||||||
|
|
||||||
|
if (hasDynamicDims)
|
||||||
|
rhsSqueezedValue = ShapedType::kDynamicSize;
|
||||||
|
|
||||||
|
SmallVector<int64_t> newRhsShape({commonValue,
|
||||||
|
rhsBroadcastedShape[maxInputRank - 2],
|
||||||
|
rhsSqueezedValue});
|
||||||
|
auto newRhsType = RankedTensorType::get(newRhsShape, rhsElemTy);
|
||||||
|
|
||||||
|
bool rhsNeedsTranspose = isTransposeRequired(transposedRhsDims);
|
||||||
|
|
||||||
|
auto transposedRhsValue = rankBroadcastedRhs;
|
||||||
|
|
||||||
|
if (rhsNeedsTranspose) {
|
||||||
|
llvm::Optional<Value> transposedRhsDimsConst =
|
||||||
|
tosa::getConstTensor<int32_t>(
|
||||||
|
rewriter, op,
|
||||||
|
/*vec=*/transposedRhsDims,
|
||||||
|
/*shape=*/{static_cast<int32_t>(transposedRhsDims.size())});
|
||||||
|
|
||||||
|
transposedRhsValue =
|
||||||
|
rewriter
|
||||||
|
.create<tosa::TransposeOp>(
|
||||||
|
op->getLoc(),
|
||||||
|
OpConversionPattern<AtenOpT>::getTypeConverter()
|
||||||
|
->convertType(transposedRhsType),
|
||||||
|
rankBroadcastedRhs, transposedRhsDimsConst.getValue())
|
||||||
|
.getResult();
|
||||||
|
}
|
||||||
|
|
||||||
|
// reshape
|
||||||
|
matmulRhs = rewriter.create<tosa::ReshapeOp>(
|
||||||
|
op->getLoc(),
|
||||||
|
OpConversionPattern<AtenOpT>::getTypeConverter()->convertType(
|
||||||
|
newRhsType),
|
||||||
|
transposedRhsValue, rewriter.getI64ArrayAttr(newRhsShape));
|
||||||
|
}
|
||||||
|
|
||||||
|
auto matmulLhsShape =
|
||||||
|
matmulLhs.getType().template cast<RankedTensorType>().getShape();
|
||||||
|
auto matmulRhsShape =
|
||||||
|
matmulRhs.getType().template cast<RankedTensorType>().getShape();
|
||||||
|
|
||||||
|
// The reshape/transpose should ensure the tosa.matmul always has same
|
||||||
|
// batch size for either matrix. If if shapes are dynamic, they'll be
|
||||||
|
// appropriately handled.
|
||||||
|
assert(matmulLhsShape[0] == matmulRhsShape[0] &&
|
||||||
|
"tosa.matmul needs same batchsize on LHS and RHS");
|
||||||
|
|
||||||
|
SmallVector<int64_t> matmulOutputShape(
|
||||||
|
{matmulLhsShape[0], matmulLhsShape[1], matmulRhsShape[2]});
|
||||||
|
Type outputElemTy;
|
||||||
|
if (lhsElemTy.isa<mlir::FloatType>()) {
|
||||||
|
outputElemTy = lhsElemTy;
|
||||||
|
} else { // qint8 emits i32 matmul output
|
||||||
|
outputElemTy = rewriter.getIntegerType(32);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto mmOutputTy = RankedTensorType::get(matmulOutputShape, outputElemTy);
|
||||||
|
auto mmOpResult =
|
||||||
|
rewriter
|
||||||
|
.create<tosa::MatMulOp>(
|
||||||
|
op->getLoc(),
|
||||||
|
OpConversionPattern<AtenOpT>::getTypeConverter()->convertType(
|
||||||
|
mmOutputTy),
|
||||||
|
matmulLhs, matmulRhs)
|
||||||
|
.getResult();
|
||||||
|
|
||||||
|
// Perform the reshape to output shape. This is always required unless both
|
||||||
|
// inputs are rank=3, in which case the tosa.matmul output itself is
|
||||||
|
// correctly shaped.
|
||||||
|
bool performOpReshape = !(lhsRank == 3 && rhsRank == 3);
|
||||||
|
|
||||||
|
auto outputTy = OpConversionPattern<AtenOpT>::getTypeConverter()
|
||||||
|
->convertType(op.getType())
|
||||||
|
.template cast<RankedTensorType>();
|
||||||
|
|
||||||
|
if (performOpReshape) {
|
||||||
|
// Since the output shape may be unknown, we construct it
|
||||||
|
// independently and reshape. Otherwise reshape may be expressed for
|
||||||
|
// an unknown to-be-inferred output shape. The final tensor.cast
|
||||||
|
// reshapes the known shape to the desired output shape.
|
||||||
|
auto computeOpShape = [&](SmallVector<int64_t> &reshapedOpShape,
|
||||||
|
SmallVector<int32_t> &transposedOpDims,
|
||||||
|
SmallVector<int64_t> &transposedOpShapes) {
|
||||||
|
if (maxInputRank == 1)
|
||||||
|
return;
|
||||||
|
|
||||||
|
if (maxInputRank == 2) {
|
||||||
|
if (lhsRank == 2)
|
||||||
|
reshapedOpShape.push_back(lhsShape[0]);
|
||||||
|
if (rhsRank == 2)
|
||||||
|
reshapedOpShape.push_back(rhsShape[1]);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step: Construct the output transpose/reshape information
|
||||||
|
// First the common_dims
|
||||||
|
for (uint32_t i = 0; i < commonElems.size(); i++) {
|
||||||
|
reshapedOpShape.push_back(commonElems[i].shape);
|
||||||
|
transposedOpDims.push_back(commonElems[i].dim);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Then the LHS squeezed dims
|
||||||
|
for (uint32_t i = 0; i < lhsSqueezedElems.size() - 1; i++) {
|
||||||
|
// Only dims that don't broadcast - broadcasting ones come from the
|
||||||
|
// other input.
|
||||||
|
if (lhsSqueezedElems[i].shape != 1) {
|
||||||
|
reshapedOpShape.push_back(lhsSqueezedElems[i].shape);
|
||||||
|
transposedOpDims.push_back(lhsSqueezedElems[i].dim);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// The last squeezed dim is lhs[-2] which needs to be
|
||||||
|
// checked separately for broadcasting
|
||||||
|
if (lhsRank > 1) {
|
||||||
|
reshapedOpShape.push_back(lhsBroadcastedShape[maxInputRank - 2]);
|
||||||
|
transposedOpDims.push_back(maxInputRank - 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
// then the RHS squeezed dims except rhs[-1] which is handled like
|
||||||
|
// lhs[-2]
|
||||||
|
for (uint32_t i = 0; i < rhsSqueezedElems.size() - 1; i++) {
|
||||||
|
if (rhsSqueezedElems[i].shape != 1) {
|
||||||
|
reshapedOpShape.push_back(rhsSqueezedElems[i].shape);
|
||||||
|
transposedOpDims.push_back(rhsSqueezedElems[i].dim);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// rhs[-1]
|
||||||
|
if (rhsRank > 1) {
|
||||||
|
reshapedOpShape.push_back(rhsBroadcastedShape[maxInputRank - 1]);
|
||||||
|
transposedOpDims.push_back(maxInputRank - 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Final transposed output shape construction
|
||||||
|
for (uint32_t i = 0; i < maxInputRank - 2; i++) {
|
||||||
|
if (lhsBroadcastedTy.isDynamicDim(i)) {
|
||||||
|
transposedOpShapes.push_back(ShapedType::kDynamicSize);
|
||||||
|
} else {
|
||||||
|
if (lhsBroadcastedShape[i] == rhsBroadcastedShape[i]) {
|
||||||
|
transposedOpShapes.push_back(lhsBroadcastedShape[i]);
|
||||||
|
} else {
|
||||||
|
transposedOpShapes.push_back(lhsBroadcastedShape[i] == 1
|
||||||
|
? rhsBroadcastedShape[i]
|
||||||
|
: lhsBroadcastedShape[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (lhsRank > 1)
|
||||||
|
transposedOpShapes.push_back(lhsBroadcastedShape[maxInputRank - 2]);
|
||||||
|
if (rhsRank > 1)
|
||||||
|
transposedOpShapes.push_back(rhsBroadcastedShape[maxInputRank - 1]);
|
||||||
|
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
|
||||||
|
SmallVector<int64_t> reshapedOpShape, transposedOpShape;
|
||||||
|
SmallVector<int32_t> transposedOpDims;
|
||||||
|
|
||||||
|
computeOpShape(reshapedOpShape, transposedOpDims, transposedOpShape);
|
||||||
|
|
||||||
|
bool opNeedsTranspose = isTransposeRequired(transposedOpDims);
|
||||||
|
|
||||||
|
// Perform reshape
|
||||||
|
auto reshapedOpType =
|
||||||
|
RankedTensorType::get(reshapedOpShape, outputElemTy);
|
||||||
|
auto reshapedOp = rewriter.create<tosa::ReshapeOp>(
|
||||||
|
op->getLoc(),
|
||||||
|
OpConversionPattern<AtenOpT>::getTypeConverter()->convertType(
|
||||||
|
reshapedOpType),
|
||||||
|
mmOpResult, rewriter.getI64ArrayAttr(reshapedOpShape));
|
||||||
|
|
||||||
|
if (opNeedsTranspose) {
|
||||||
|
|
||||||
|
llvm::Optional<Value> transposedOpShapeConst =
|
||||||
|
tosa::getConstTensor<int32_t>(
|
||||||
|
rewriter, op,
|
||||||
|
/*vec=*/transposedOpDims,
|
||||||
|
/*shape=*/{static_cast<int32_t>(transposedOpDims.size())});
|
||||||
|
|
||||||
|
auto transposedOpType =
|
||||||
|
RankedTensorType::get(transposedOpShape, outputElemTy);
|
||||||
|
auto transposedOp = rewriter.create<tosa::TransposeOp>(
|
||||||
|
op->getLoc(),
|
||||||
|
OpConversionPattern<AtenOpT>::getTypeConverter()->convertType(
|
||||||
|
transposedOpType),
|
||||||
|
reshapedOp.getResult(), transposedOpShapeConst.getValue());
|
||||||
|
|
||||||
|
rewriter.replaceOpWithNewOp<tensor::CastOp>(op, outputTy, transposedOp);
|
||||||
|
} else {
|
||||||
|
rewriter.replaceOpWithNewOp<tensor::CastOp>(op, outputTy, reshapedOp);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
rewriter.replaceOpWithNewOp<tensor::CastOp>(op, outputTy, mmOpResult);
|
||||||
|
}
|
||||||
|
|
||||||
|
return success();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
// -----------------------------------------------------------------------------
|
// -----------------------------------------------------------------------------
|
||||||
|
@ -774,6 +1334,14 @@ public:
|
||||||
INSERT_SQUEEZE_OP_PATTERN(AtenSqueezeDimOp, ConvertAtenSqueezeOneDimOp)
|
INSERT_SQUEEZE_OP_PATTERN(AtenSqueezeDimOp, ConvertAtenSqueezeOneDimOp)
|
||||||
#undef INSERT_SQUEEZE_OP_PATTERN
|
#undef INSERT_SQUEEZE_OP_PATTERN
|
||||||
|
|
||||||
|
#define INSERT_MATMUL_ATENOP_PATTERN(AtenOp) \
|
||||||
|
target.addIllegalOp<AtenOp>(); \
|
||||||
|
patterns.add<ConvertAtenMatMulOp<AtenOp>>(typeConverter, context);
|
||||||
|
INSERT_MATMUL_ATENOP_PATTERN(AtenMatmulOp);
|
||||||
|
INSERT_MATMUL_ATENOP_PATTERN(AtenMmOp);
|
||||||
|
INSERT_MATMUL_ATENOP_PATTERN(AtenBmmOp);
|
||||||
|
#undef INSERT_MATMUL_ATEMOP_PATTERN
|
||||||
|
|
||||||
#define INSERT_ATENOP_PATTERN(AtenOp) \
|
#define INSERT_ATENOP_PATTERN(AtenOp) \
|
||||||
target.addIllegalOp<AtenOp>(); \
|
target.addIllegalOp<AtenOp>(); \
|
||||||
patterns.add<ConvertAtenOp<AtenOp>>(typeConverter, context);
|
patterns.add<ConvertAtenOp<AtenOp>>(typeConverter, context);
|
||||||
|
|
Loading…
Reference in New Issue