From 07d0645f640bdc8b09a706150fa1a9a5f85b8147 Mon Sep 17 00:00:00 2001 From: zjgarvey <47986913+zjgarvey@users.noreply.github.com> Date: Tue, 9 Jan 2024 13:14:10 -0600 Subject: [PATCH] [RFC] general support for Adaptive Pooling Ops (#2661) Adaptive pooling ops can only be decomposed into their non-adaptive counterparts in trivial cases. For example, the current decomposition for AtenAdaptiveAvgPool1dOp in DecomposeComplexOps.cpp supports outSize = inSize (i.e., do literally nothing), and outSize = 1 (i.e., do a batched average). The reason adaptive pooling ops are difficult to lower to linalg is that they are not constantly strided. They are computed by taking an input tensor of shape (N, C, Hin), and an output size Hout, and computing the output tensor at position (n, c, h) in the following way: 1. compute st(h) = (h*Hin)//Hout 2. compute en(h) = 1 + ((h+1)*Hin - 1)//Hout 3. apply a computation (max or avg) to the slice: INPUT[n, c, st(h):en(h)] The provided sample implementation (for ConvertAtenAdaptiveAvgPool1dOp) uses tensor.extract to access the input tensor inside the payload of a linalg generic op. This is likely an unattractive use of linalg generic ops, which is why I am asking for some more targeted feedback on the validity of this approach before attempting to support the many other adaptive pooling ops. Specifically: - Is the performance of this implementation bad enough to warrant targeting different dialects entirely? e.g. TMtensor/linalg ext/ etc. - If the provided implementation is of acceptable performance to the community, then is it permissible to remove the Adaptive pooling decompositions from DecomposeComplexOps.cpp? Based on the current structure of the -torch-decompose-complex-ops pass, it does not seem possible to only decompose the adaptive ops in special cases (it seems to get stuck in an infinite loop on a match failure).
I would be happy to instead incorporate the case logic into the conversion directly, and remove the decompositions once they are rendered completely obsolete. As long as this approach is acceptable, I can clean up the implementation with some helper functions, and quickly add support for each of the remaining Adaptive pooling ops. --- lib/Conversion/TorchToLinalg/Pooling.cpp | 237 ++++++++++++++++-- projects/pt1/e2e_testing/xfail_sets.py | 3 + projects/pt1/python/torch_mlir/__init__.py | 2 +- .../torch_mlir_e2e_test/test_suite/pooling.py | 71 +++++- 4 files changed, 280 insertions(+), 33 deletions(-) diff --git a/lib/Conversion/TorchToLinalg/Pooling.cpp b/lib/Conversion/TorchToLinalg/Pooling.cpp index 87419f093..20c03f5ff 100644 --- a/lib/Conversion/TorchToLinalg/Pooling.cpp +++ b/lib/Conversion/TorchToLinalg/Pooling.cpp @@ -90,18 +90,19 @@ static LogicalResult createPoolingOp( SmallVector lowPaddingIncludingNC = {0, 0}; lowPaddingIncludingNC.append(paddingInts); SmallVector highPaddingIncludingNC = lowPaddingIncludingNC; - + if (ceilMode) { for (int64_t i = 0; i < dimensionality; ++i) { highPaddingIncludingNC[i + 2] += strideInts[i]; } } - Value initValue = rewriter.create(loc, cast(initValueAttr)); + Value initValue = + rewriter.create(loc, cast(initValueAttr)); paddedInput = torch_to_linalg::getPaddedTensor( op, rewriter, self, lowPaddingIncludingNC, highPaddingIncludingNC, initValue); - + Value N = getDimOp(rewriter, loc, self, 0); Value C = getDimOp(rewriter, loc, self, 1); @@ -141,7 +142,6 @@ static LogicalResult createPoolingOp( return success(); } - namespace { class ConvertAtenMaxPool2dOp : public OpConversionPattern { public: @@ -163,7 +163,8 @@ public: bool ceilMode; SmallVector kernelSizeIntValues; SmallVector strideInts, paddingInts, dilationInts; - if (!matchPattern(op.getDilation(), m_TorchListOfConstantInts(dilationInts))) + if (!matchPattern(op.getDilation(), + m_TorchListOfConstantInts(dilationInts))) return rewriter.notifyMatchFailure(op, "only 
support constant int dilations"); if (failed(checkAndGetPoolingParameters( @@ -241,7 +242,8 @@ public: bool ceilMode; SmallVector kernelSizeIntValues; SmallVector strideInts, paddingInts, dilationInts; - if (!matchPattern(op.getDilation(), m_TorchListOfConstantInts(dilationInts))) + if (!matchPattern(op.getDilation(), + m_TorchListOfConstantInts(dilationInts))) return rewriter.notifyMatchFailure(op, "only support constant int dilations"); if (failed(checkAndGetPoolingParameters( @@ -372,7 +374,6 @@ public: }; } // namespace - namespace { template class ConvertAtenAvgPoolOp : public OpConversionPattern { @@ -383,7 +384,7 @@ public: ConversionPatternRewriter &rewriter) const override { if (failed(verifyLinalgCompatibleTypes(op, rewriter))) return failure(); - + Location loc = op->getLoc(); const TypeConverter *typeConverter = this->getTypeConverter(); Value self = adaptor.getSelf(); @@ -397,9 +398,9 @@ public: bool ceilMode; SmallVector kernelSizeIntValues; SmallVector strideInts, paddingInts, dilationInts(Dim, 1); - if (failed(checkAndGetPoolingParameters( - op, rewriter, typeConverter, ceilMode, kernelSizeIntValues, - strideInts, paddingInts))) + if (failed(checkAndGetPoolingParameters(op, rewriter, typeConverter, + ceilMode, kernelSizeIntValues, + strideInts, paddingInts))) return rewriter.notifyMatchFailure(op, "invalid pooling parameters"); // TODO: Add support for count_include_pad equal to `False`. @@ -415,20 +416,21 @@ public: // `sumPool` contains the result of sumpool operation over the input. 
Value sumPool, paddedInput; - SmallVector outTensorShape; + SmallVector outTensorShape; if (failed(createPoolingOp( op, rewriter, self, /*supportNonFPInput=*/true, ceilMode, - /*dimensionality=*/Dim, kernelSizeIntValues, strideInts, paddingInts, - dilationInts, rewriter.getZeroAttr(inputElementType), outTensorShape, - paddedInput, sumPool))) + /*dimensionality=*/Dim, kernelSizeIntValues, strideInts, + paddingInts, dilationInts, rewriter.getZeroAttr(inputElementType), + outTensorShape, paddedInput, sumPool))) return rewriter.notifyMatchFailure(op, "unable to compute sumpool"); Value divisor; if constexpr (std::is_same()) { Value kHtimeskW = rewriter.create( loc, kernelSizeIntValues[0], kernelSizeIntValues[1]); - divisor = op.getDivisorOverride().getType().template isa() - ? kHtimeskW - : adaptor.getDivisorOverride(); + divisor = + op.getDivisorOverride().getType().template isa() + ? kHtimeskW + : adaptor.getDivisorOverride(); } else { divisor = kernelSizeIntValues[0]; } @@ -436,9 +438,10 @@ public: Value outputTensor = rewriter.create( loc, getAsOpFoldResult(outTensorShape), resultElementType); - SmallVector indexingMapsAvg(2, rewriter.getMultiDimIdentityMap(Dim+2)); + SmallVector indexingMapsAvg( + 2, rewriter.getMultiDimIdentityMap(Dim + 2)); SmallVector iteratorTypesAvg( - Dim+2, utils::IteratorType::parallel); + Dim + 2, utils::IteratorType::parallel); Value avgPool = rewriter .create( @@ -459,8 +462,188 @@ public: return success(); } }; -} +} // namespace +/* +This section is for lowering adaptive pooling ops, which cannot generally be +decomposed into typical pooling ops. Given an input tensor of rank (N,C,Hin) and +an output spatial size Hout, an element of the output tensor at position (n, c, +h) is computed as follows. + 1. compute st(h) = (h*Hin)//Hout + 2. compute en(h) = 1 + ((h+1)*Hin - 1)//Hout + 3. apply the operation (max or avg) over input[n, c, st(h):en(h)] +This is problematic for linalg ops for a few reasons: + 1. 
The access to the input tensor is not constantly strided + 2. The size of the window itself is not constant: en(h) - st(h) can vary with +h! Although it is a bit like using a hammer to paint, our workaround is to use +tensor.extract to access the elements of the input tensor inside our linalg +generic op's payload. + +Current TODOs: + 1. gather most of the boilerplate out of this op and make it into an +adaptive pooling helper function. + 2. figure out what to do with the conflicting decompositions in +DecomposeComplexOps.cpp + 3. Implement more efficient passes for when the kernel-size, input spatial +dims, and output spatial dims are constant. +*/ + +namespace { +class ConvertAtenAdaptiveAvgPool1dOp + : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(AtenAdaptiveAvgPool1dOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + Location loc = op->getLoc(); + const TypeConverter *typeConverter = getTypeConverter(); + + // get rank of input (same as rank of output) + int64_t rank = + adaptor.getSelf().getType().cast().getRank(); + // input operand should be NCH (i.e. 
rank 3) + if (rank != 3) { + return rewriter.notifyMatchFailure(op, "only supports input type NCH"); + } + + // input tensor and output shape + Value input = adaptor.getSelf(); + Value outputShape = op.getOutputSize(); + SmallVector outShapeVector; + getListConstructElements(outputShape, outShapeVector); + outShapeVector = + getTypeConvertedValues(rewriter, loc, typeConverter, outShapeVector); + Value hIn = getDimOp(rewriter, loc, input, 2); + Value hOut = outShapeVector[0]; + Value hOutIndex = castIntToIndex(rewriter, loc, hOut); + RankedTensorType inputType = input.getType().cast(); + RankedTensorType outputType = + typeConverter->convertType(op.getResult().getType()) + .cast(); + + // get elementType of input tensor + Type elementType = inputType.getElementType(); + + // make an iteration space of size kMax = 1 + ceildiv (hIn - 1) , hOut + Type boolType = rewriter.getI1Type(); + Value kIter; + Value constantOne = + rewriter.create(loc, rewriter.getIndexAttr(1)); + Value hInPlusOne = rewriter.create(loc, hIn, constantOne); + Value kMaxMinusOne = + rewriter.create(loc, hInPlusOne, hOutIndex); + Value kMax = rewriter.create(loc, constantOne, kMaxMinusOne); + kIter = rewriter.create( + loc, getAsOpFoldResult(ValueRange({kMax})), boolType); + + // need to buffer input, else there will possibly be an out of bounds access + // later buffVal = 0 for avg pooling and -inf for max pooling + Value buffVal = rewriter.create( + loc, elementType, rewriter.getFloatAttr(elementType, 0)); + SmallVector lowPadding = {0, 0, 0}; + SmallVector highPadding = {0, 0, 1}; + Value buffInput = torch_to_linalg::getPaddedTensor( + op, rewriter, input, lowPadding, highPadding, buffVal); + + // make a list of outputSizes + SmallVector outputSizes; + for (unsigned i = 0; i < rank - 1; i++) { + outputSizes.push_back(getDimOp(rewriter, loc, input, i)); + } + outputSizes.push_back(hOutIndex); + + // initialize a kernel size tensor (only for avg pooling) + Value kSizeTensor = rewriter.create( + 
loc, getAsOpFoldResult(ValueRange({hOutIndex})), elementType); + + // initialize an output tensor + Value initOutput = + createInitTensor(rewriter, loc, outputSizes, elementType, buffVal); + + // setup indexing maps and iterator types for linalg generic op + // for kIter (d0,d1,d2,d3) -> (d3) + // for output (d0,d1,d2,d3) -> (d0,d1,d2) + // for kSizeTensor (d0,d1,d2,d3) -> (d2) + SmallVector kIterExprs, outputExprs, kSizeTensorExprs; + for (unsigned i = 0; i < 3; i++) { + outputExprs.push_back(rewriter.getAffineDimExpr(i)); + } + kSizeTensorExprs.push_back(rewriter.getAffineDimExpr(2)); + kIterExprs.push_back(rewriter.getAffineDimExpr(3)); + SmallVector indexingMaps = AffineMap::inferFromExprList( + {kIterExprs, outputExprs, kSizeTensorExprs}); + SmallVector iteratorTypes( + 3, utils::IteratorType::parallel); + iteratorTypes.push_back(utils::IteratorType::reduction); + + Value indexOne = rewriter.create(loc, 1); + auto sumPool = rewriter.create( + loc, /*resultTensorTypes=*/ + TypeRange({initOutput.getType(), kSizeTensor.getType()}), + /*inputs=*/ValueRange({kIter}), + /*outputs=*/ValueRange({initOutput, kSizeTensor}), + /*indexingMaps=*/indexingMaps, + /*iteratorTypes=*/iteratorTypes, + [&](OpBuilder &b, Location loc, ValueRange args) { + Value res = args[1]; + Value ind0 = b.create(loc, 0); + Value ind1 = b.create(loc, 1); + Value ind2 = b.create(loc, 2); + Value ind3 = b.create(loc, 3); + // compute start and end indices + // st = s1( s0(ind2 * Hin) // Hout ) + Value s0 = b.create(loc, ind2, hIn); + Value s1 = b.create(loc, s0, hOutIndex); + // en = e4( 1 + e3( e2( e1( e0(ind2 + 1) * hIn ) - 1 ) // hOut ) ) + Value e0 = b.create(loc, ind2, indexOne); + Value e1 = b.create(loc, e0, hIn); + Value e2 = b.create(loc, e1, indexOne); + Value e3 = b.create(loc, e2, hOutIndex); + Value e4 = b.create(loc, indexOne, e3); + // get input element @ st + ind3: + Value wIndex = b.create(loc, s1, ind3); + Value inElt = b.create( + loc, elementType, buffInput, ValueRange({ind0, 
ind1, wIndex})); + // check if we extracted at windex < end index + Value cond = + b.create(loc, arith::CmpIPredicate(6), wIndex, e4); + // if inElt is in bounds, include it in the computation + // else, use buffVal = 0 (for max pool use -infinity) + Value out1 = b.create(loc, cond, inElt, buffVal); + // compute Kernel size: we store this to kwTensor + Value kSize = b.create(loc, e4, s1); + Value kSizeInt = castIndexToInt64(b, loc, kSize); + Value kSizeF = b.create(loc, elementType, kSizeInt); + // accumulate out2 to res = args[1] + Value out2 = b.create(loc, res, out1); + b.create(loc, ValueRange({out2, kSizeF})); + }); + + // make a linalg generic to divide each element by the corresponding + // Kernel Width. This step is only necessary for avg pooling. + SmallVector indexingMaps1 = + AffineMap::inferFromExprList({kSizeTensorExprs, outputExprs}); + SmallVector iteratorTypes1( + 3, utils::IteratorType::parallel); + auto output = rewriter.create( + loc, /*resultTensorTypes=*/initOutput.getType(), + /*inputs=*/sumPool.getResultTensors()[1], + /*outputs=*/sumPool.getResultTensors()[0], + /*indexingMaps=*/indexingMaps1, + /*iteratorTypes=*/iteratorTypes1, + [&](OpBuilder &b, Location loc, ValueRange args) { + Value q = b.create(loc, args[1], args[0]); + b.create(loc, q); + }); + + rewriter.replaceOpWithNewOp(op, outputType, + output.getResultTensors()); + return success(); + } +}; +} // namespace void mlir::torch::torch_to_linalg::populatePoolingPatternsAndLegality( TypeConverter &typeConverter, RewritePatternSet &patterns, @@ -471,8 +654,12 @@ void mlir::torch::torch_to_linalg::populatePoolingPatternsAndLegality( target.addIllegalOp(); patterns.add(typeConverter, context); target.addIllegalOp(); - patterns.add>( - typeConverter, context); - patterns.add>( - typeConverter, context); + patterns + .add>( + typeConverter, context); + patterns + .add>( + typeConverter, context); + target.addIllegalOp(); + patterns.add(typeConverter, context); } diff --git 
a/projects/pt1/e2e_testing/xfail_sets.py b/projects/pt1/e2e_testing/xfail_sets.py index 76f84344b..ddb4865ec 100644 --- a/projects/pt1/e2e_testing/xfail_sets.py +++ b/projects/pt1/e2e_testing/xfail_sets.py @@ -257,6 +257,8 @@ TORCHDYNAMO_XFAIL_SET = { # ERROR: Exception: Unsupported: missing default value for argument 0 in schema for aten.div.Tensor_mode "ElementwiseDivRoundingModeFloorModule_basic", "ElementwiseDivRoundingModeTruncModule_basic", + "AdaptiveAvgPool1dStaticLargerOutput_basic", + "AdaptiveAvgPool1dGeneralDynamic_basic", # ERROR: Exception: Unsupported op: get_attr "NumToTensorFloatModule_basic", @@ -1324,6 +1326,7 @@ MAKE_FX_TOSA_PASS_SET = (TOSA_PASS_SET | { ### Tests additionally passing in make_fx_tosa "AdaptiveAvgPool1dNonUnitOutputSizeStaticModule_basic", "AdaptiveAvgPool1dUnitOutputSizeStaticModule_basic", + "AdaptiveAvgPool1dStaticEvenMultiple_basic", "NativeGroupNormBackwardModule_basic", "SliceWholeTensorModule_basic", "TensorFloatModule_basic", diff --git a/projects/pt1/python/torch_mlir/__init__.py b/projects/pt1/python/torch_mlir/__init__.py index 1cf1aa0e0..c916043c2 100644 --- a/projects/pt1/python/torch_mlir/__init__.py +++ b/projects/pt1/python/torch_mlir/__init__.py @@ -248,7 +248,7 @@ class ExampleArgs: # compiler where each backend can "own" its set of legal ops. 
BACKEND_LEGAL_OPS = { OutputType.TOSA: ['aten.flatten.using_ints', 'aten.native_layer_norm', 'aten.linear'], - OutputType.LINALG_ON_TENSORS: ['aten.flatten.using_ints', ], + OutputType.LINALG_ON_TENSORS: ['aten.flatten.using_ints','aten.adaptive_avg_pool1d'], OutputType.STABLEHLO: [], } diff --git a/projects/pt1/python/torch_mlir_e2e_test/test_suite/pooling.py b/projects/pt1/python/torch_mlir_e2e_test/test_suite/pooling.py index dd18545b0..1c6748538 100644 --- a/projects/pt1/python/torch_mlir_e2e_test/test_suite/pooling.py +++ b/projects/pt1/python/torch_mlir_e2e_test/test_suite/pooling.py @@ -11,7 +11,6 @@ from torch_mlir_e2e_test.annotations import annotate_args, export # ============================================================================== - class AdaptiveAvgPool2dNonUnitOutputSizeStaticModule(torch.nn.Module): def __init__(self): @@ -55,7 +54,6 @@ def AdaptiveAvgPool2dNonUnitOutputSizeDynamicModule_basic( module, tu: TestUtils): module.forward(tu.rand(1, 512, 7, 7)) - class AdaptiveAvgPool2dUnitOutputSizeStaticModule(torch.nn.Module): def __init__(self): @@ -776,12 +774,71 @@ def AvgPool1dStaticModule_basic(module, tu: TestUtils): # ============================================================================== +class AdaptiveAvgPool1dStaticLargerOutput(torch.nn.Module): + + def __init__(self): + super().__init__() + self.aap1d = torch.nn.AdaptiveAvgPool1d(output_size=13) + + @export + @annotate_args([ + None, + ([5, 512, 7], torch.float32, True) + ]) + def forward(self,x): + return self.aap1d(x) + +@register_test_case( + module_factory=lambda: AdaptiveAvgPool1dStaticLargerOutput()) +def AdaptiveAvgPool1dStaticLargerOutput_basic( + module, tu: TestUtils): + module.forward(tu.rand(5, 512, 7)) + +class AdaptiveAvgPool1dStaticEvenMultiple(torch.nn.Module): + + def __init__(self): + super().__init__() + self.aap1d = torch.nn.AdaptiveAvgPool1d(output_size=7) + + @export + @annotate_args([ + None, + ([5, 512, 147], torch.float32, True) + ]) + def 
forward(self,x): + return self.aap1d(x) + +@register_test_case( + module_factory=lambda: AdaptiveAvgPool1dStaticEvenMultiple()) +def AdaptiveAvgPool1dStaticEvenMultiple_basic( + module, tu: TestUtils): + module.forward(tu.rand(5, 512, 147)) + +class AdaptiveAvgPool1dGeneralDynamic(torch.nn.Module): + + def __init__(self): + super().__init__() + self.aap1d = torch.nn.AdaptiveAvgPool1d(output_size=7) + + @export + @annotate_args([ + None, + ([-1,-1,-1], torch.float32, True) + ]) + def forward(self,x): + return self.aap1d(x) + +@register_test_case( + module_factory=lambda: AdaptiveAvgPool1dGeneralDynamic()) +def AdaptiveAvgPool1dGeneralDynamic_basic( + module, tu: TestUtils): + module.forward(tu.rand(1, 512, 10)) class AdaptiveAvgPool1dNonUnitOutputSizeStaticModule(torch.nn.Module): def __init__(self): super().__init__() - self.aap1d = torch.nn.AdaptiveAvgPool1d(7) + self.aap1d = torch.nn.AdaptiveAvgPool1d(output_size=7) @export @annotate_args([ @@ -801,7 +858,7 @@ class AdaptiveAvgPool1dNonUnitOutputSizeDynamicModule(torch.nn.Module): def __init__(self): super().__init__() - self.aap1d = torch.nn.AdaptiveAvgPool1d(7) + self.aap1d = torch.nn.AdaptiveAvgPool1d(output_size=7) @export @annotate_args([ @@ -821,7 +878,7 @@ class AdaptiveAvgPool1dUnitOutputSizeStaticModule(torch.nn.Module): def __init__(self): super().__init__() - self.aap1d = torch.nn.AdaptiveAvgPool1d(1) + self.aap1d = torch.nn.AdaptiveAvgPool1d(output_size=1) @export @annotate_args([ @@ -841,7 +898,7 @@ class AdaptiveAvgPool1dUnitOutputSizeDynamicModule(torch.nn.Module): def __init__(self): super().__init__() - self.aap1d = torch.nn.AdaptiveAvgPool1d(1) + self.aap1d = torch.nn.AdaptiveAvgPool1d(output_size=1) @export @annotate_args([ @@ -855,4 +912,4 @@ class AdaptiveAvgPool1dUnitOutputSizeDynamicModule(torch.nn.Module): module_factory=lambda: AdaptiveAvgPool1dUnitOutputSizeDynamicModule()) def AdaptiveAvgPool1dUnitOutputSizeDynamicModule_basic( module, tu: TestUtils): - module.forward(tu.rand(1, 512, 
7)) \ No newline at end of file + module.forward(tu.rand(1, 512, 7))