From 1f73895f93e03b1804a8a52e82a0c3395b2c1a49 Mon Sep 17 00:00:00 2001
From: Aart Bik
Date: Thu, 27 Jun 2024 19:28:02 -0700
Subject: [PATCH] [torch-mlir] bump to llvm/llvm-project@9b78ddf3b2abfb3e
 (#3491)

This bump triggered an upstream assert. Includes a WAR for #3506.

Also includes several things I needed to do to repro:

* When TORCH_MLIR_TEST_CONCURRENCY=1, test runs will be printed.
* Added TORCH_MLIR_TEST_VERBOSE=1 handling to enable verbose mode
  (useful on CI).

---------

Co-authored-by: Stella Laurenzo
---
 docs/development.md                             | 14 ++++++++++++++
 externals/llvm-project                          |  2 +-
 lib/Dialect/TMTensor/IR/TMTensorOps.cpp         | 13 +++++++------
 lib/Dialect/Torch/IR/TorchOps.cpp               |  6 ++++--
 projects/pt1/e2e_testing/main.py                |  4 ++++
 .../pt1/python/torch_mlir_e2e_test/framework.py | 14 +++++++++++++-
 python/torch_mlir/compiler_utils.py             |  3 +++
 7 files changed, 46 insertions(+), 10 deletions(-)

diff --git a/docs/development.md b/docs/development.md
index 154b398f1..771c4fcbe 100644
--- a/docs/development.md
+++ b/docs/development.md
@@ -429,6 +429,20 @@ cd projects/pt1
 python -m e2e_testing.main -f 'AtenEmbeddingBag'
 ```
 
+The default mode of running tests uses the multi-processing framework and is
+not tolerant of certain types of errors. If encountering native crashes/hangs,
+enable debug variables to run sequentially/in-process with more verbosity:
+
+```
+export TORCH_MLIR_TEST_CONCURRENCY=1
+export TORCH_MLIR_TEST_VERBOSE=1
+```
+
+In this way, you can run under `gdb`, etc. and get useful results. Having env
+vars like these makes them easy to set in GH Actions files. Note that the
+verbose flags are very verbose. Basic sequential progress reports will be
+printed regardless when not running in parallel.
+
 ## Running unit tests.
 
 To run all of the unit tests, run:
diff --git a/externals/llvm-project b/externals/llvm-project
index 5207632f8..9b78ddf3b 160000
--- a/externals/llvm-project
+++ b/externals/llvm-project
@@ -1 +1 @@
-Subproject commit 5207632f8698a2fab0c4cdcdf2f7ad9aaf96e06f
+Subproject commit 9b78ddf3b2abfb3e2063e3dad2a326f5eabc1618
diff --git a/lib/Dialect/TMTensor/IR/TMTensorOps.cpp b/lib/Dialect/TMTensor/IR/TMTensorOps.cpp
index 05258f506..943eda423 100644
--- a/lib/Dialect/TMTensor/IR/TMTensorOps.cpp
+++ b/lib/Dialect/TMTensor/IR/TMTensorOps.cpp
@@ -46,16 +46,17 @@ using namespace mlir::torch::TMTensor;
 static void getEffectsImpl(
     SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
         &effects,
-    ValueRange results, ValueRange inputBuffers, ValueRange outputBuffers) {
-  for (Value value : results) {
+    ResultRange results, ArrayRef<OpOperand *> inputBuffers,
+    ArrayRef<OpOperand *> outputBuffers) {
+  for (OpResult value : results) {
     effects.emplace_back(MemoryEffects::Allocate::get(), value,
                          SideEffects::DefaultResource::get());
   }
-  for (Value value : inputBuffers) {
+  for (OpOperand *value : inputBuffers) {
     effects.emplace_back(MemoryEffects::Read::get(), value,
                          SideEffects::DefaultResource::get());
   }
-  for (Value value : outputBuffers) {
+  for (OpOperand *value : outputBuffers) {
     effects.emplace_back(MemoryEffects::Read::get(), value,
                          SideEffects::DefaultResource::get());
     effects.emplace_back(MemoryEffects::Write::get(), value,
@@ -1121,8 +1122,8 @@ bool TopkOp::payloadUsesValueFromOperand(OpOperand *opOperand) {
   void OP_NAME::getEffects(                                                    \
       SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>      \
           &effects) {                                                          \
-    SmallVector<Value> inputBuffers = getInputBufferOperands();                \
-    SmallVector<Value> outputBuffers = getOutputBufferOperands();              \
+    OpOperandVector inputBuffers = getInputBufferOperands();                   \
+    OpOperandVector outputBuffers = getOutputBufferOperands();                 \
     getEffectsImpl(effects, getOperation()->getResults(), inputBuffers,        \
                    outputBuffers);                                             \
   }
diff --git a/lib/Dialect/Torch/IR/TorchOps.cpp b/lib/Dialect/Torch/IR/TorchOps.cpp
index c37b96c60..b10a0c61f 100644
--- a/lib/Dialect/Torch/IR/TorchOps.cpp
+++ b/lib/Dialect/Torch/IR/TorchOps.cpp
@@ -2810,7 +2810,8 @@ LogicalResult CopyToNonValueTensorOp::inferReturnTypes(
 void CopyToNonValueTensorOp::getEffects(
     SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
         &effects) {
-  effects.emplace_back(MemoryEffects::Allocate::get(), getResult());
+  effects.emplace_back(MemoryEffects::Allocate::get(),
+                       getOperation()->getOpResult(0));
 }
 
 //===----------------------------------------------------------------------===//
@@ -2837,7 +2838,8 @@ LogicalResult CopyToValueTensorOp::inferReturnTypes(
 void CopyToValueTensorOp::getEffects(
     SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
         &effects) {
-  effects.emplace_back(MemoryEffects::Read::get(), getOperand());
+  effects.emplace_back(MemoryEffects::Read::get(),
+                       &getOperation()->getOpOperand(0));
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/projects/pt1/e2e_testing/main.py b/projects/pt1/e2e_testing/main.py
index e9468ee91..4d0eb4861 100644
--- a/projects/pt1/e2e_testing/main.py
+++ b/projects/pt1/e2e_testing/main.py
@@ -7,6 +7,10 @@
 import argparse
 import re
 import sys
+import torch
+
+torch.device("cpu")
+
 from torch_mlir_e2e_test.framework import run_tests
 from torch_mlir_e2e_test.reporting import report_results
 from torch_mlir_e2e_test.registry import GLOBAL_TEST_REGISTRY
diff --git a/projects/pt1/python/torch_mlir_e2e_test/framework.py b/projects/pt1/python/torch_mlir_e2e_test/framework.py
index 42f4b5415..56c2e91ae 100644
--- a/projects/pt1/python/torch_mlir_e2e_test/framework.py
+++ b/projects/pt1/python/torch_mlir_e2e_test/framework.py
@@ -358,6 +358,15 @@ def run_tests(
     if env_concurrency > 0:
         num_processes = min(num_processes, env_concurrency)
 
+    try:
+        env_verbose = os.getenv("TORCH_MLIR_TEST_VERBOSE", "0")
+        if env_verbose is not None:
+            verbose = bool(int(env_verbose))
+    except ValueError as e:
+        raise ValueError(
+            "Bad value for TORCH_MLIR_TEST_VERBOSE env var: " "Expected integer."
+        ) from e
+
     # TODO: We've noticed that on certain 2 core machine parallelizing the tests
     # makes the llvm backend legacy pass manager 20x slower than using a
     # single process. Need to investigate the root cause eventually. This is a
@@ -375,7 +384,10 @@
     # seems to cause a cascade of failures resulting in undecipherable error
     # messages.
     if num_processes == 1 or sequential:
-        return [compile_and_run_test(test, config, verbose) for test in tests]
+        print("Running tests sequentially with progress status")
+        for test in tests:
+            print(f"*** RUNNING TEST: {test.unique_name} ***")
+            compile_and_run_test(test, config, verbose)
 
     # This is needed because autograd does not support crossing process
     # boundaries.
diff --git a/python/torch_mlir/compiler_utils.py b/python/torch_mlir/compiler_utils.py
index 4e5a2f8f8..c1315abd4 100644
--- a/python/torch_mlir/compiler_utils.py
+++ b/python/torch_mlir/compiler_utils.py
@@ -40,6 +40,9 @@ def run_pipeline_with_repro_report(
     )
     # Lower module in place to make it ready for compiler backends.
    with module.context as ctx:
+        # TODO(#3506): Passes can emit errors but not signal failure,
+        # which causes a native assert.
+        ctx.emit_error_diagnostics = True
         pm = PassManager.parse(pipeline)
         if enable_ir_printing:
             ctx.enable_multithreading(False)
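
For trying out the new debug variables, here is a minimal usage sketch based on
the docs/development.md addition above. The `gdb` wrapper and the
`AtenEmbeddingBag` filter (taken from the existing docs example) are
illustrative; any debugger or test filter works the same way:

```
# Force a sequential, in-process test run so a native debugger sees everything.
export TORCH_MLIR_TEST_CONCURRENCY=1
# Opt in to the (very) verbose per-test output; progress lines print regardless.
export TORCH_MLIR_TEST_VERBOSE=1
cd projects/pt1
# Filter to a single e2e test and run it under gdb.
gdb --args python -m e2e_testing.main -f 'AtenEmbeddingBag'
```

With TORCH_MLIR_TEST_CONCURRENCY=1, run_tests() takes the
`num_processes == 1 or sequential` branch patched above and prints a
`*** RUNNING TEST: ... ***` line before each test.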