From c46d48f9f57a16797773f87bf0aa7a1973165b8c Mon Sep 17 00:00:00 2001
From: Sean Silva
Date: Thu, 28 Oct 2021 18:25:53 +0000
Subject: [PATCH] Make error reporting a bit better.

- Split out TOSA in the CI.
- Add summary of unexpected test outcomes. This works better when there
  are many XFAIL'ing tests, as it only prints out the error_str on FAIL,
  not on XFAIL.

Example here:
https://gist.github.com/silvasean/c7886ec7b3d35c21563cb09f7c3407da
---
 .github/workflows/buildAndTest.yml        |  6 +-
 .../torchscript_e2e_test/error_reports.py |  1 +
 .../torchscript/reporting.py              | 62 ++++++++++++-------
 3 files changed, 44 insertions(+), 25 deletions(-)

diff --git a/.github/workflows/buildAndTest.yml b/.github/workflows/buildAndTest.yml
index e8233e94f..615e36a4c 100644
--- a/.github/workflows/buildAndTest.yml
+++ b/.github/workflows/buildAndTest.yml
@@ -58,11 +58,15 @@ jobs:
           -DMLIR_ENABLE_BINDINGS_PYTHON=ON \
           -DLLVM_TARGETS_TO_BUILD=host
         ninja check-torch-mlir-all
-    - name: RefBackend integration tests
+    - name: RefBackend - TorchScript end-to-end tests
       run: |
         cd $GITHUB_WORKSPACE
         export PYTHONPATH="$GITHUB_WORKSPACE/build/tools/torch-mlir/python_packages/torch_mlir"
         python -m e2e_testing.torchscript.main --config=refbackend -v
+    - name: TOSA backend - TorchScript end-to-end tests
+      run: |
+        cd $GITHUB_WORKSPACE
+        export PYTHONPATH="$GITHUB_WORKSPACE/build/tools/torch-mlir/python_packages/torch_mlir"
+        python -m e2e_testing.torchscript.main --config=tosa -v
 
     # TODO: Only build packages in full Release mode.
diff --git a/python/test/torchscript_e2e_test/error_reports.py b/python/test/torchscript_e2e_test/error_reports.py
index b0fc5a392..686522644 100644
--- a/python/test/torchscript_e2e_test/error_reports.py
+++ b/python/test/torchscript_e2e_test/error_reports.py
@@ -14,6 +14,7 @@ from torch_mlir_e2e_test.torchscript.reporting import report_results
 from torch_mlir_e2e_test.torchscript.registry import register_test_case, GLOBAL_TEST_REGISTRY
 from torch_mlir_e2e_test.torchscript.configs import TorchScriptTestConfig
 
+# CHECK: Unexpected outcome summary:
 # CHECK: FAIL - "ErroneousModule_basic"
diff --git a/python/torch_mlir_e2e_test/torchscript/reporting.py b/python/torch_mlir_e2e_test/torchscript/reporting.py
index 45f381375..75ec3e781 100644
--- a/python/torch_mlir_e2e_test/torchscript/reporting.py
+++ b/python/torch_mlir_e2e_test/torchscript/reporting.py
@@ -275,42 +275,56 @@ def report_results(results: List[TestResult],
     Returns True if the run resulted in any unexpected pass/fail behavior.
     Otherwise False.
""" - summary = collections.Counter() + results_by_outcome = collections.defaultdict(list) for result in results: report = SingleTestReport(result, ErrorContext.empty()) expected_failure = result.unique_name in expected_failures if expected_failure: if report.failed: - error_str = '' - if verbose: - error_str = '\n' + textwrap.indent(report.error_str(), - ' ') - print(f'XFAIL - "{result.unique_name}"' + error_str) - summary['XFAIL'] += 1 + print(f'XFAIL - "{result.unique_name}"') + results_by_outcome['XFAIL'].append(result) else: print(f'XPASS - "{result.unique_name}"') - summary['XPASS'] += 1 + results_by_outcome['XPASS'].append(result) else: if not report.failed: print(f'PASS - "{result.unique_name}"') - summary['PASS'] += 1 + results_by_outcome['PASS'].append(result) else: - error_str = '' - if verbose: - error_str = '\n' + textwrap.indent(report.error_str(), - ' ') - print(f'FAIL - "{result.unique_name}"' + error_str) - summary['FAIL'] += 1 + print(f'FAIL - "{result.unique_name}"') + results_by_outcome['FAIL'].append(result) + + OUTCOME_MEANINGS = collections.OrderedDict() + OUTCOME_MEANINGS['PASS'] = 'Passed' + OUTCOME_MEANINGS['FAIL'] = 'Failed' + OUTCOME_MEANINGS['XFAIL'] = 'Expectedly Failed' + OUTCOME_MEANINGS['XPASS'] = 'Unexpectedly Passed' + + had_unexpected_results = len(results_by_outcome['FAIL']) != 0 or len( + results_by_outcome['XPASS']) != 0 + + if had_unexpected_results: + print('\nUnexpected outcome summary:') + + # For FAIL and XPASS (unexpected outcomes), print a summary. + for outcome, results in results_by_outcome.items(): + # PASS and XFAIL are "good"/"successful" outcomes. + if outcome == 'PASS' or outcome == 'XFAIL': + continue + # If there is nothing to report, be quiet. + if len(results) == 0: + continue + print(f'\n****** {OUTCOME_MEANINGS[outcome]} tests - {len(results)} tests') + for result in results: + print(f' {outcome} - "{result.unique_name}"') + # If the test failed, print the error message. + if outcome == 'FAIL' and verbose: + print(textwrap.indent(report.error_str(), ' ' * 8)) # Print a summary for easy scanning. print('\nSummary:') - KEY_MEANINGS = { - 'PASS': 'Passed', - 'FAIL': 'Failed', - 'XFAIL': 'Expectedly Failed', - 'XPASS': 'Unexpectedly Passed', - } + for key in ['PASS', 'FAIL', 'XFAIL', 'XPASS']: - if summary[key]: - print(f' {KEY_MEANINGS[key]}: {summary[key]}') - return summary['FAIL'] != 0 or summary['XPASS'] != 0 + if results_by_outcome[key]: + print(f' {OUTCOME_MEANINGS[key]}: {len(results_by_outcome[key])}') + return had_unexpected_results