This LLVM bump triggered an upstream assert. Includes a WAR (workaround) for #3506.

Also includes several things I needed to do to repro:

* When TORCH_MLIR_TEST_CONCURRENCY=1, each test's name is printed as it runs.
* Added TORCH_MLIR_TEST_VERBOSE=1 handling to enable verbose mode
(useful on CI).
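
For example, a typical sequential repro invocation (the test filter is
illustrative):

```
export TORCH_MLIR_TEST_CONCURRENCY=1
export TORCH_MLIR_TEST_VERBOSE=1
python -m e2e_testing.main -f 'AtenEmbeddingBag'
```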

---------

Co-authored-by: Stella Laurenzo <stellaraccident@gmail.com>
pull/3508/head
Aart Bik 2024-06-27 19:28:02 -07:00 committed by GitHub
parent 6d0ca499e6
commit 1f73895f93
7 changed files with 46 additions and 10 deletions


@@ -429,6 +429,20 @@ cd projects/pt1
 python -m e2e_testing.main -f 'AtenEmbeddingBag'
 ```
+
+The default mode of running tests uses the multi-processing framework and is
+not tolerant of certain types of errors. If you encounter native crashes or
+hangs, enable the debug variables to run sequentially/in-process with more verbosity:
+```
+export TORCH_MLIR_TEST_CONCURRENCY=1
+export TORCH_MLIR_TEST_VERBOSE=1
+```
+
+In this way, you can run under `gdb`, etc., and get useful results. Env vars
+like these are also easy to set in GH action files. Note that the verbose
+flags are very verbose; basic sequential progress reports are printed
+regardless when not running in parallel.
+
 ## Running unit tests.
 To run all of the unit tests, run:
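With those variables set, an in-process debugger session might look like the
following sketch (the filter and `gdb` flags are illustrative):

```
export TORCH_MLIR_TEST_CONCURRENCY=1
export TORCH_MLIR_TEST_VERBOSE=1
gdb -ex run --args python -m e2e_testing.main -f 'AtenEmbeddingBag'
```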

@@ -1 +1 @@
-Subproject commit 5207632f8698a2fab0c4cdcdf2f7ad9aaf96e06f
+Subproject commit 9b78ddf3b2abfb3e2063e3dad2a326f5eabc1618


@@ -46,16 +46,17 @@ using namespace mlir::torch::TMTensor;
 static void getEffectsImpl(
     SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
         &effects,
-    ValueRange results, ValueRange inputBuffers, ValueRange outputBuffers) {
-  for (Value value : results) {
+    ResultRange results, ArrayRef<OpOperand *> inputBuffers,
+    ArrayRef<OpOperand *> outputBuffers) {
+  for (OpResult value : results) {
     effects.emplace_back(MemoryEffects::Allocate::get(), value,
                          SideEffects::DefaultResource::get());
   }
-  for (Value value : inputBuffers) {
+  for (OpOperand *value : inputBuffers) {
     effects.emplace_back(MemoryEffects::Read::get(), value,
                          SideEffects::DefaultResource::get());
   }
-  for (Value value : outputBuffers) {
+  for (OpOperand *value : outputBuffers) {
     effects.emplace_back(MemoryEffects::Read::get(), value,
                          SideEffects::DefaultResource::get());
     effects.emplace_back(MemoryEffects::Write::get(), value,
@@ -1121,8 +1122,8 @@ bool TopkOp::payloadUsesValueFromOperand(OpOperand *opOperand) {
   void OP_NAME::getEffects(                                                    \
       SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>      \
           &effects) {                                                          \
-    SmallVector<Value> inputBuffers = getInputBufferOperands();                \
-    SmallVector<Value> outputBuffers = getOutputBufferOperands();              \
+    OpOperandVector inputBuffers = getInputBufferOperands();                   \
+    OpOperandVector outputBuffers = getOutputBufferOperands();                 \
     getEffectsImpl(effects, getOperation()->getResults(), inputBuffers,        \
                    outputBuffers);                                             \
   }


@@ -2810,7 +2810,8 @@ LogicalResult CopyToNonValueTensorOp::inferReturnTypes(
 void CopyToNonValueTensorOp::getEffects(
     SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
         &effects) {
-  effects.emplace_back(MemoryEffects::Allocate::get(), getResult());
+  effects.emplace_back(MemoryEffects::Allocate::get(),
+                       getOperation()->getOpResult(0));
 }
 
 //===----------------------------------------------------------------------===//
@@ -2837,7 +2838,8 @@ LogicalResult CopyToValueTensorOp::inferReturnTypes(
 void CopyToValueTensorOp::getEffects(
     SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
         &effects) {
-  effects.emplace_back(MemoryEffects::Read::get(), getOperand());
+  effects.emplace_back(MemoryEffects::Read::get(),
+                       &getOperation()->getOpOperand(0));
 }
 
 //===----------------------------------------------------------------------===//
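Both C++ changes follow the same upstream MLIR API shift: memory effects are
now attached to an `OpOperand *` or an `OpResult` rather than to a plain
`Value`. A minimal sketch of the pattern for a hypothetical single-operand,
single-result op (`MyOp` is illustrative, not an op in this repo):

```
// Sketch only: assumes the post-bump MLIR SideEffects API, where an
// EffectInstance is keyed on an OpOperand* or OpResult.
void MyOp::getEffects(
    SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
        &effects) {
  // The read is attached to the operand slot, not the SSA value.
  effects.emplace_back(MemoryEffects::Read::get(),
                       &getOperation()->getOpOperand(0),
                       SideEffects::DefaultResource::get());
  // The buffer backing the single result is allocated by this op.
  effects.emplace_back(MemoryEffects::Allocate::get(),
                       getOperation()->getOpResult(0),
                       SideEffects::DefaultResource::get());
}
```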


@@ -7,6 +7,10 @@ import argparse
 import re
 import sys
 
+import torch
+
+torch.device("cpu")
+
 from torch_mlir_e2e_test.framework import run_tests
 from torch_mlir_e2e_test.reporting import report_results
 from torch_mlir_e2e_test.registry import GLOBAL_TEST_REGISTRY


@@ -358,6 +358,15 @@ def run_tests(
     if env_concurrency > 0:
         num_processes = min(num_processes, env_concurrency)
 
+    try:
+        env_verbose = os.getenv("TORCH_MLIR_TEST_VERBOSE", "0")
+        if env_verbose is not None:
+            verbose = bool(int(env_verbose))
+    except ValueError as e:
+        raise ValueError(
+            "Bad value for TORCH_MLIR_TEST_VERBOSE env var: Expected integer."
+        ) from e
+
     # TODO: We've noticed that on certain 2 core machine parallelizing the tests
     # makes the llvm backend legacy pass manager 20x slower than using a
     # single process. Need to investigate the root cause eventually. This is a
@@ -375,7 +384,10 @@ def run_tests(
     # seems to cause a cascade of failures resulting in undecipherable error
     # messages.
     if num_processes == 1 or sequential:
-        return [compile_and_run_test(test, config, verbose) for test in tests]
+        print("Running tests sequentially with progress status")
+        for test in tests:
+            print(f"*** RUNNING TEST: {test.unique_name} ***")
+            compile_and_run_test(test, config, verbose)
 
     # This is needed because autograd does not support crossing process
     # boundaries.
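The parse-an-integer-env-var pattern above mirrors the existing
TORCH_MLIR_TEST_CONCURRENCY handling earlier in `run_tests`; if this grows
further, a small helper could factor it out. A minimal sketch, standard
library only (`env_int` is a hypothetical name, not part of this change):

```
import os


def env_int(name: str, default: int = 0) -> int:
    """Parse an integer-valued environment variable with a clear error."""
    raw = os.getenv(name, str(default))
    try:
        return int(raw)
    except ValueError as e:
        raise ValueError(f"Bad value for {name} env var: Expected integer.") from e


# Usage mirroring the handling in run_tests:
verbose = bool(env_int("TORCH_MLIR_TEST_VERBOSE"))
env_concurrency = env_int("TORCH_MLIR_TEST_CONCURRENCY")
```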


@@ -40,6 +40,9 @@ def run_pipeline_with_repro_report(
     )
     # Lower module in place to make it ready for compiler backends.
     with module.context as ctx:
+        # TODO(#3506): Passes can emit errors but not signal failure,
+        # which causes a native assert.
+        ctx.emit_error_diagnostics = True
         pm = PassManager.parse(pipeline)
         if enable_ir_printing:
             ctx.enable_multithreading(False)
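For context on the workaround, a standalone sketch of what
`emit_error_diagnostics` changes, assuming the upstream MLIR Python bindings
as re-exported by `torch_mlir` (the pipeline and IR here are illustrative):

```
from torch_mlir.ir import Context, Module
from torch_mlir.passmanager import PassManager

with Context() as ctx:
    # Route errors emitted by passes to diagnostic handlers instead of having
    # the bindings capture them, sidestepping the native assert described in
    # #3506 when a pass emits an error without signaling failure.
    ctx.emit_error_diagnostics = True

    def handler(diag):
        print(f"diagnostic: {diag.message}")
        return True  # mark the diagnostic as handled

    # Keep a reference so the handler stays attached for the context's life.
    attached = ctx.attach_diagnostic_handler(handler)
    module = Module.parse("module {}")
    PassManager.parse("builtin.module(canonicalize)").run(module.operation)
```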