From 334633b738480e243b794acf8f36703915ffd2ea Mon Sep 17 00:00:00 2001 From: Matthias Gehre Date: Mon, 12 Aug 2024 14:15:12 +0200 Subject: [PATCH] e2e: Enable generate-runtime-verification pass (#3615) This adds the `generate-runtime-verification` pass into the linalg refbackend, and moves all tests that now abort at runtime into the crash set, sorted by their respective errors. I have fixed one set of errors found that way, which are mismatches between the static dimensions we cast to and the actual dynamic dimensions. This was caused by wrong annotations on the test cases, like in https://github.com/llvm/torch-mlir/pull/3615/files#diff-48bfbf41fcad5fa01b49197d251114f84a2b8de4f1d87ab938a061aedd1419b1R1931 --- projects/pt1/e2e_testing/main.py | 11 +- projects/pt1/e2e_testing/xfail_sets.py | 54 ++++++- .../linalg_on_tensors_backends/refbackend.py | 144 +++++++++--------- .../stablehlo_backends/linalg_on_tensors.py | 5 +- .../test_suite/constant_alloc.py | 2 +- .../test_suite/elementwise.py | 12 +- .../torch_mlir_e2e_test/test_suite/scalar.py | 2 +- test/python/fx_importer/sparse_test.py | 4 +- 8 files changed, 147 insertions(+), 87 deletions(-) diff --git a/projects/pt1/e2e_testing/main.py b/projects/pt1/e2e_testing/main.py index 4d0eb4861..ce767c567 100644 --- a/projects/pt1/e2e_testing/main.py +++ b/projects/pt1/e2e_testing/main.py @@ -43,9 +43,11 @@ from .xfail_sets import ( LINALG_XFAIL_SET, LINALG_CRASHING_SET, MAKE_FX_TOSA_PASS_SET, + MAKE_FX_TOSA_CRASHING_SET, STABLEHLO_PASS_SET, STABLEHLO_CRASHING_SET, TOSA_PASS_SET, + TOSA_CRASHING_SET, LTC_XFAIL_SET, LTC_CRASHING_SET, TORCHDYNAMO_XFAIL_SET, @@ -161,11 +163,11 @@ def main(): elif args.config == "tosa": config = TosaBackendTestConfig(LinalgOnTensorsTosaBackend()) xfail_set = all_test_unique_names - TOSA_PASS_SET - crashing_set = set() + crashing_set = TOSA_CRASHING_SET elif args.config == "make_fx_tosa": config = TosaBackendTestConfig(LinalgOnTensorsTosaBackend(), use_make_fx=True) xfail_set = 
all_test_unique_names - MAKE_FX_TOSA_PASS_SET - crashing_set = set() + crashing_set = MAKE_FX_TOSA_CRASHING_SET elif args.config == "native_torch": config = NativeTorchTestConfig() xfail_set = set() @@ -191,7 +193,10 @@ def main(): xfail_set = FX_IMPORTER_TOSA_XFAIL_SET crashing_set = set() elif args.config == "torchdynamo": - config = TorchDynamoTestConfig(RefBackendLinalgOnTensorsBackend()) + # TODO: Enable runtime verification and extend crashing set. + config = TorchDynamoTestConfig( + RefBackendLinalgOnTensorsBackend(generate_runtime_verification=False) + ) xfail_set = TORCHDYNAMO_XFAIL_SET crashing_set = TORCHDYNAMO_CRASHING_SET elif args.config == "onnx": diff --git a/projects/pt1/e2e_testing/xfail_sets.py b/projects/pt1/e2e_testing/xfail_sets.py index 4b1b260ac..b77c9bf55 100644 --- a/projects/pt1/e2e_testing/xfail_sets.py +++ b/projects/pt1/e2e_testing/xfail_sets.py @@ -34,6 +34,31 @@ LINALG_XFAIL_SET = COMMON_TORCH_MLIR_LOWERING_XFAILS | { } LINALG_CRASHING_SET = { + # Runtime op verification: Out of bounds access + "AtenDiagEmbedNegOffsetDiag_basic", + "AtenDiagEmbedNonDefault4DDiag_basic", + "AtenDiagEmbedOffsetDiag_basic", + "AtenDiagEmbedRevDimDiag_basic", + "AtenEmbeddingBagStaticModule_basic", + "AtenEmbeddingBagSumExample_basic", + "Aten_EmbeddingBagExample_basic", + # Runtime op verification: subview is out-of-bounds of the base memref + "Conv_Transpose1dModule_basic", + "Conv_Transpose1dStaticModule_basic", + "Conv_Transpose2dModule_basic", + "Conv_Transpose2dStaticModule_basic", + "Conv_Transpose3dModule_basic", + "Conv_Transpose3dStaticModule_basic", + "ConvolutionModule2DTransposeStridedStatic_basic", + "ConvolutionModule2DTransposeStrided_basic", + "GridSamplerBasic1_basic", + "GridSamplerBasic2_basic", + "GridSamplerBasic3_basic", + "GridSamplerBasic4_basic", + # Runtime op verification: stride mismatch in memref.cast + "ReduceAllDimEmpty_basic", + "TraceUnsignedIntModule_empty", + "TraceModule_empty", # Crashes due to copy to a smaller 
destination buffer than the source buffer. "SliceCopyStartGreaterThanDimSize_Module_basic", } @@ -476,8 +501,11 @@ FX_IMPORTER_XFAIL_SET = { "WeightNormInterfaceModule_basic", } -FX_IMPORTER_CRASHING_SET = { +FX_IMPORTER_CRASHING_SET = LINALG_CRASHING_SET | { "HBC_basic", + # Runtime op verification: out-of-bounds access + "_SoftmaxModule_basic", + "UpSampleNearest2dDynamicFactor_basic", } FX_IMPORTER_STABLEHLO_XFAIL_SET = { @@ -899,7 +927,6 @@ STABLEHLO_PASS_SET = { "AtenIntBoolOpModule_basic", "AtenIntTensorByteDtypeModule_basic", "AtenIntTensorCharDtypeModule_basic", - "AtenItemFpOpModule_basic", "AtenItemIntOpModule_basic", "AtenMmFloatTypes_basic", "AtenMmIntTypes_basic", @@ -1522,6 +1549,16 @@ STABLEHLO_CRASHING_SET = { "IndexPutWithNoneAndBroadcastModule_basic", "ReduceMaxAlongDimUnsignedInt_basic", "ReduceMinAlongDimUnsignedInt_basic", + # LLVM ERROR: Failed to infer result type(s) + "ElementwiseClampMinTensorFloatModule_basic", + "ElementwiseClampMinTensorIntModule_basic", + "ElementwiseClampTensorFloatModule_basic", + "ElementwiseClampTensorIntModule_basic", +} + +TOSA_CRASHING_SET = { + # Runtime op verification: Out of bounds access + "IndexTensorNegativeIndexModule_basic", } # Write the TOSA set as a "passing" set as it is very early in development @@ -2010,6 +2047,15 @@ TOSA_PASS_SET = { "IndexTensorStaticNonContiguousWithNoneModule_basic", } +MAKE_FX_TOSA_CRASHING_SET = TOSA_CRASHING_SET | { + # Runtime op verification: static result dims in reassoc group do not divide src dim evenly + "FlattenDynamicModule_basic", + "ReshapeDynamicModule_basic", + "ViewFlattenAndExpandModule_basic", + "ViewSizeDimLedAndFollowedByExpandedOnesModule_basic", + "ViewSizeDimLedByExpandedOnesModule_basic", +} + MAKE_FX_TOSA_PASS_SET = ( TOSA_PASS_SET | { @@ -2821,7 +2867,7 @@ if torch_version_for_comparison() < version.parse("2.4.0.dev"): } -ONNX_CRASHING_SET = { +ONNX_CRASHING_SET = LINALG_CRASHING_SET | { "FakeQuantizePerTensorAffineModule_basic", 
"FakeQuantizePerTensorAffineDynamicShapeModule_basic", "ElementwisePreluModule_basic", @@ -2840,6 +2886,8 @@ ONNX_CRASHING_SET = { "StdCorrectionEmptyDimModule_basic", "VarCorrectionEmptyDimModule_basic", "VarDimEmptyDimModule_basic", + # Runtime op verification: rank mismatch in memref.cast + "ViewSizeFromOtherTensor_basic", } FX_IMPORTER_TOSA_XFAIL_SET = { diff --git a/projects/pt1/python/torch_mlir_e2e_test/linalg_on_tensors_backends/refbackend.py b/projects/pt1/python/torch_mlir_e2e_test/linalg_on_tensors_backends/refbackend.py index b87038bae..e089c941f 100644 --- a/projects/pt1/python/torch_mlir_e2e_test/linalg_on_tensors_backends/refbackend.py +++ b/projects/pt1/python/torch_mlir_e2e_test/linalg_on_tensors_backends/refbackend.py @@ -134,82 +134,84 @@ class RefBackendInvoker: return invoke -LOWERING_PIPELINE = ( - "builtin.module(" - + ",".join( - [ - # Apply some optimizations. It would be great if MLIR had more useful - # optimizations that worked out of the box here. - # Note: When measured, this doesn't seem to actually help that much - # for the linalg-on-tensors backend. - # This is likely because if things are naturally fusable we usually already - # emit things in that form from the high level (e.g. single linalg-generic). - # Other backends are likely to benefit more. - "func.func(linalg-generalize-named-ops)", - "func.func(linalg-fuse-elementwise-ops)", - "convert-shape-to-std", - # MLIR Sparsifier mini-pipeline. Note that this is the bare minimum - # to ensure operations on sparse tensors are lowered to loops. - "sparse-assembler{direct-out}", - "sparsification-and-bufferization", - "sparse-storage-specifier-to-llvm", - # Buffer deallocation pass does not know how to handle realloc. - "func.func(expand-realloc)", - # Generalize pad and concat after sparse compiler, as they are handled - # differently when the operations involve sparse operand. 
- "func.func(refback-generalize-tensor-pad)", - "func.func(refback-generalize-tensor-concat)", - # Bufferize. - "func.func(tm-tensor-bufferize)", - "one-shot-bufferize{copy-before-write bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map}", - "refback-mlprogram-bufferize", - "func.func(finalizing-bufferize)", - "func.func(buffer-deallocation)", - # Buffer-deallocation does not work with the inlined code generated - # by sparse tensor dialect. - "inline", # inline sparse helper methods where useful - # Munge to make it ExecutionEngine compatible. - # Specifically, we rewrite calling convention boundaries to be in terms - # of unranked memref, and we rewrite the return to actually be a - # callback that consumes the return (the final munged function always - # returns void at the C level -- we get the return value by providing the - # callback). - "refback-munge-calling-conventions", - # Insert global variable and instruction sequence for getting the next - # global seed used in stateful rng. - # Lower to LLVM - "func.func(tm-tensor-to-loops)", - "func.func(refback-munge-memref-copy)", - "func.func(convert-linalg-to-loops)", - "func.func(lower-affine)", - "convert-scf-to-cf", - "func.func(refback-expand-ops-for-llvm)", - "func.func(arith-expand)", - "func.func(convert-math-to-llvm)", - # Handle some complex mlir::math ops (e.g. atan2) - "convert-math-to-libm", - "expand-strided-metadata", - "finalize-memref-to-llvm", - "lower-affine", - "convert-bufferization-to-memref", - "finalize-memref-to-llvm", - "func.func(convert-arith-to-llvm)", - "convert-vector-to-llvm", - "convert-func-to-llvm", - "convert-cf-to-llvm", - "convert-complex-to-llvm", - "reconcile-unrealized-casts", - ] - ) - + ")" -) +def lowering_pipeline(generate_runtime_verification: bool): + passes = [ + # Apply some optimizations. It would be great if MLIR had more useful + # optimizations that worked out of the box here. 
+ # Note: When measured, this doesn't seem to actually help that much + # for the linalg-on-tensors backend. + # This is likely because if things are naturally fusable we usually already + # emit things in that form from the high level (e.g. single linalg-generic). + # Other backends are likely to benefit more. + "func.func(linalg-generalize-named-ops)", + "func.func(linalg-fuse-elementwise-ops)", + "convert-shape-to-std", + # MLIR Sparsifier mini-pipeline. Note that this is the bare minimum + # to ensure operations on sparse tensors are lowered to loops. + "sparse-assembler{direct-out}", + "sparsification-and-bufferization", + "sparse-storage-specifier-to-llvm", + # Buffer deallocation pass does not know how to handle realloc. + "func.func(expand-realloc)", + # Generalize pad and concat after sparse compiler, as they are handled + # differently when the operations involve sparse operand. + "func.func(refback-generalize-tensor-pad)", + "func.func(refback-generalize-tensor-concat)", + # Bufferize. + "func.func(tm-tensor-bufferize)", + "one-shot-bufferize{copy-before-write bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map}", + "refback-mlprogram-bufferize", + "func.func(finalizing-bufferize)", + "func.func(buffer-deallocation)", + # Buffer-deallocation does not work with the inlined code generated + # by sparse tensor dialect. + "inline", # inline sparse helper methods where useful + # Munge to make it ExecutionEngine compatible. + # Specifically, we rewrite calling convention boundaries to be in terms + # of unranked memref, and we rewrite the return to actually be a + # callback that consumes the return (the final munged function always + # returns void at the C level -- we get the return value by providing the + # callback). + "refback-munge-calling-conventions", + # Insert global variable and instruction sequence for getting the next + # global seed used in stateful rng. 
+ # Lower to LLVM + "func.func(tm-tensor-to-loops)", + "func.func(refback-munge-memref-copy)", + "func.func(convert-linalg-to-loops)", + "func.func(lower-affine)", + "convert-scf-to-cf", + ] + if generate_runtime_verification: + passes += ["generate-runtime-verification"] + passes += [ + "func.func(refback-expand-ops-for-llvm)", + "func.func(arith-expand)", + "func.func(convert-math-to-llvm)", + # Handle some complex mlir::math ops (e.g. atan2) + "convert-math-to-libm", + "expand-strided-metadata", + "finalize-memref-to-llvm", + "lower-affine", + "convert-bufferization-to-memref", + "finalize-memref-to-llvm", + "func.func(convert-arith-to-llvm)", + "convert-vector-to-llvm", + "convert-func-to-llvm", + "convert-cf-to-llvm", + "convert-complex-to-llvm", + "reconcile-unrealized-casts", + ] + + return "builtin.module(" + ",".join(passes) + ")" class RefBackendLinalgOnTensorsBackend(LinalgOnTensorsBackend): """Main entry-point for the reference backend.""" - def __init__(self): + def __init__(self, generate_runtime_verification: bool = True): super().__init__() + self.generate_runtime_verification = generate_runtime_verification def compile(self, imported_module: Module): """Compiles an imported module, with a flat list of functions. 
@@ -226,7 +228,7 @@ class RefBackendLinalgOnTensorsBackend(LinalgOnTensorsBackend): """ run_pipeline_with_repro_report( imported_module, - LOWERING_PIPELINE, + lowering_pipeline(self.generate_runtime_verification), "Lowering Linalg-on-Tensors IR to LLVM with RefBackend", enable_ir_printing=False, ) diff --git a/projects/pt1/python/torch_mlir_e2e_test/stablehlo_backends/linalg_on_tensors.py b/projects/pt1/python/torch_mlir_e2e_test/stablehlo_backends/linalg_on_tensors.py index 25c6405b7..79c743353 100644 --- a/projects/pt1/python/torch_mlir_e2e_test/stablehlo_backends/linalg_on_tensors.py +++ b/projects/pt1/python/torch_mlir_e2e_test/stablehlo_backends/linalg_on_tensors.py @@ -37,7 +37,10 @@ class LinalgOnTensorsStablehloBackend(StablehloBackend): def __init__(self): super().__init__() - self.refbackend = RefBackendLinalgOnTensorsBackend() + # TODO: Enable runtime verification and fix found bugs. + self.refbackend = RefBackendLinalgOnTensorsBackend( + generate_runtime_verification=False + ) def compile(self, imported_module: Module): """Compiles an imported module that satisfied the Stablehlo backend contract. 
diff --git a/projects/pt1/python/torch_mlir_e2e_test/test_suite/constant_alloc.py b/projects/pt1/python/torch_mlir_e2e_test/test_suite/constant_alloc.py index 8ce0a44d7..ab18aeea2 100644 --- a/projects/pt1/python/torch_mlir_e2e_test/test_suite/constant_alloc.py +++ b/projects/pt1/python/torch_mlir_e2e_test/test_suite/constant_alloc.py @@ -1928,7 +1928,7 @@ class EmptyStridedModule(torch.nn.Module): @annotate_args( [ None, - ([2, 3, 4], torch.float32, True), + ([4, 3, 4], torch.float32, True), ] ) def forward(self, a): diff --git a/projects/pt1/python/torch_mlir_e2e_test/test_suite/elementwise.py b/projects/pt1/python/torch_mlir_e2e_test/test_suite/elementwise.py index 82c77fee9..52112948b 100644 --- a/projects/pt1/python/torch_mlir_e2e_test/test_suite/elementwise.py +++ b/projects/pt1/python/torch_mlir_e2e_test/test_suite/elementwise.py @@ -1658,8 +1658,8 @@ class ElementwiseClampTensorFloatModule(torch.nn.Module): [ None, ([-1, -1], torch.float32, True), - ([], torch.float32, True), - ([], torch.float32, True), + ([1], torch.float32, True), + ([1], torch.float32, True), ] ) def forward(self, x, min, max): @@ -1688,8 +1688,8 @@ class ElementwiseClampTensorIntModule(torch.nn.Module): [ None, ([-1, -1], torch.int64, True), - ([], torch.int64, True), - ([], torch.int64, True), + ([1], torch.int64, True), + ([1], torch.int64, True), ] ) def forward(self, x, min, max): @@ -1741,7 +1741,7 @@ class ElementwiseClampMinTensorFloatModule(torch.nn.Module): [ None, ([-1, -1], torch.float32, True), - ([], torch.float32, True), + ([1], torch.float32, True), ] ) def forward(self, x, min): @@ -1765,7 +1765,7 @@ class ElementwiseClampMinTensorIntModule(torch.nn.Module): [ None, ([-1, -1], torch.int64, True), - ([], torch.int64, True), + ([1], torch.int64, True), ] ) def forward(self, x, min): diff --git a/projects/pt1/python/torch_mlir_e2e_test/test_suite/scalar.py b/projects/pt1/python/torch_mlir_e2e_test/test_suite/scalar.py index 28b3a6f36..3157a0fde 100644 --- 
a/projects/pt1/python/torch_mlir_e2e_test/test_suite/scalar.py +++ b/projects/pt1/python/torch_mlir_e2e_test/test_suite/scalar.py @@ -542,7 +542,7 @@ class AtenItemFpOpModule(torch.nn.Module): @annotate_args( [ None, - ([], torch.float, True), + ([1], torch.float, True), ] ) def forward(self, val): diff --git a/test/python/fx_importer/sparse_test.py b/test/python/fx_importer/sparse_test.py index 699d57cb2..089a5eabb 100644 --- a/test/python/fx_importer/sparse_test.py +++ b/test/python/fx_importer/sparse_test.py @@ -175,7 +175,9 @@ def sparse_jit(f, *args, **kwargs): enable_ir_printing=False, ) # Compile with reference Linalg backend. - backend = RefBackendLinalgOnTensorsBackend() + # TODO: runtime verification currently fails with 'rank mismatch' on + # memref.cast. Need to fix the IR first. + backend = RefBackendLinalgOnTensorsBackend(generate_runtime_verification=False) compiled = backend.compile(module) invoker = backend.load(compiled) xargs = []