diff --git a/.github/workflows/buildAndTest.yml b/.github/workflows/buildAndTest.yml
index c6d345860..08ab86571 100644
--- a/.github/workflows/buildAndTest.yml
+++ b/.github/workflows/buildAndTest.yml
@@ -2,11 +2,13 @@
 name: Build and Test
 
 on:
-  pull_request:
-    branches: [main]
-  push:
-    branches: [main]
+  # Automatic triggers are disabled here; ci.yml runs on pull_request/push.
+  # workflow_dispatch stays enabled because `on:` must list at least one event.
+  # pull_request:
+  #   branches: [main]
+  # push:
+  #   branches: [main]
   workflow_dispatch:
 
 # Ensure that only a single job or workflow using the same
 # concurrency group will run at a time. This would cancel
@@ -30,7 +32,7 @@ jobs:
     strategy:
       fail-fast: true
       matrix:
-        os-arch: [ubuntu-x86_64, macos-arm64, windows-x86_64]
+        os-arch: [macos-arm64, windows-x86_64]
         llvm-build: [in-tree, out-of-tree]
         torch-binary: [ON]
         torch-version: [nightly, stable]
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 000000000..4bb27526b
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,77 @@
+name: CI
+
+on:
+  workflow_dispatch:
+  workflow_call:
+  pull_request:
+    branches: [main]
+  push:
+    branches: [main]
+
+concurrency:
+  # A PR number if a pull request and otherwise the commit hash. This cancels
+  # queued and in-progress runs for the same PR (presubmit) or commit
+  # (postsubmit).
+  group: ci-build-test-cpp-linux-${{ github.event.number || github.sha }}
+  cancel-in-progress: true
+
+jobs:
+  build-test-linux:
+    strategy:
+      fail-fast: true
+      matrix:
+        torch-version: [nightly, stable]
+    name: Build and Test (Linux, torch-${{ matrix.torch-version }}, assertions)
+    runs-on: torch-mlir-cpubuilder-manylinux-x86-64
+    env:
+      CACHE_DIR: ${{ github.workspace }}/.container-cache
+    steps:
+      - name: Configure local git mirrors
+        run: |
+          # Our stock runners have access to certain local git caches. If these
+          # files are available, it will prime the cache and configure git to
+          # use them. Practically, this eliminates network/latency for cloning
+          # llvm.
+          if [[ -x /gitmirror/scripts/trigger_update_mirrors.sh ]]; then
+            /gitmirror/scripts/trigger_update_mirrors.sh
+            /gitmirror/scripts/git_config.sh
+          fi
+      - name: "Checking out repository"
+        uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0
+        with:
+          submodules: true
+
+      - name: Enable cache
+        uses: actions/cache/restore@v3
+        with:
+          path: ${{ env.CACHE_DIR }}
+          key: build-test-cpp-asserts-manylinux-v2-${{ matrix.torch-version }}-${{ github.sha }}
+          restore-keys: |
+            build-test-cpp-asserts-manylinux-v2-${{ matrix.torch-version }}-
+
+      - name: Install python deps (torch-${{ matrix.torch-version }})
+        run: |
+          export cache_dir="${{ env.CACHE_DIR }}"
+          bash build_tools/ci/install_python_deps.sh ${{ matrix.torch-version }}
+
+      - name: Build project
+        run: |
+          export cache_dir="${{ env.CACHE_DIR }}"
+          bash build_tools/ci/build_posix.sh
+
+      - name: Save cache
+        uses: actions/cache/save@v3
+        if: ${{ !cancelled() }}
+        with:
+          path: ${{ env.CACHE_DIR }}
+          key: build-test-cpp-asserts-manylinux-v2-${{ matrix.torch-version }}-${{ github.sha }}
+
+      - name: Test project (torch-${{ matrix.torch-version }})
+        run: |
+          export cache_dir="${{ env.CACHE_DIR }}"
+          bash build_tools/ci/test_posix.sh ${{ matrix.torch-version }}
+
+      - name: Check generated sources (torch-nightly only)
+        if: ${{ matrix.torch-version == 'nightly' }}
+        run: |
+          bash build_tools/ci/check_generated_sources.sh
diff --git a/.github/workflows/releaseSnapshotPackage.yml b/.github/workflows/releaseSnapshotPackage.yml
index 71acd9ac3..8a0ec9144 100644
--- a/.github/workflows/releaseSnapshotPackage.yml
+++ b/.github/workflows/releaseSnapshotPackage.yml
@@ -2,9 +2,8 @@
 name: Release snapshot package
 
 on:
-  schedule:
-    - cron: '0 11 * * *'
-
+  # schedule:
+  #   - cron: '0 11 * * *'
   workflow_dispatch:
 
 jobs:
diff --git a/build_tools/ci/build_posix.sh b/build_tools/ci/build_posix.sh
new file mode 100755
index 000000000..c6e7e168d
--- /dev/null
+++ b/build_tools/ci/build_posix.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+set -eu -o errtrace
+
+this_dir="$(cd "$(dirname "$0")" && pwd)"
+repo_root="$(cd "$this_dir/../.." && pwd)"
+build_dir="$repo_root/build"
+install_dir="$repo_root/install"
+mkdir -p "$build_dir"
+build_dir="$(cd "$build_dir" && pwd)"
+cache_dir="${cache_dir:-}"
+
+# Setup cache dir (defaults to a directory inside the repo when unset).
+if [ -z "${cache_dir}" ]; then
+  cache_dir="${repo_root}/.build-cache"
+  mkdir -p "${cache_dir}"
+  cache_dir="$(cd "${cache_dir}" && pwd)"
+fi
+echo "Caching to ${cache_dir}"
+mkdir -p "${cache_dir}/ccache"
+mkdir -p "${cache_dir}/pip"
+
+python="$(which python)"
+echo "Using python: $python"
+
+export CMAKE_TOOLCHAIN_FILE="$this_dir/linux_default_toolchain.cmake"
+export CC=clang
+export CXX=clang++
+export CCACHE_DIR="${cache_dir}/ccache"
+export CCACHE_MAXSIZE="350M"
+export CMAKE_C_COMPILER_LAUNCHER=ccache
+export CMAKE_CXX_COMPILER_LAUNCHER=ccache
+
+# Clear ccache stats.
+ccache -z
+
+cd "$repo_root"
+
+echo "::group::CMake configure"
+cmake -S "$repo_root/externals/llvm-project/llvm" -B "$build_dir" \
+  -GNinja \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DPython3_EXECUTABLE="$python" \
+  -DLLVM_ENABLE_ASSERTIONS=ON \
+  -DCMAKE_INSTALL_PREFIX="$install_dir" \
+  -DCMAKE_INSTALL_LIBDIR=lib \
+  -DLLVM_ENABLE_PROJECTS=mlir \
+  -DLLVM_EXTERNAL_PROJECTS="torch-mlir" \
+  -DLLVM_EXTERNAL_TORCH_MLIR_SOURCE_DIR="$repo_root" \
+  -DLLVM_TARGETS_TO_BUILD=host \
+  -DMLIR_ENABLE_BINDINGS_PYTHON=ON \
+  -DTORCH_MLIR_ENABLE_LTC=ON
+echo "::endgroup::"
+
+echo "::group::Build"
+cmake --build "$build_dir" --target tools/torch-mlir/all -- -k 0
+echo "::endgroup::"
+
+# Show ccache stats.
+ccache --show-stats
diff --git a/build_tools/ci/check_generated_sources.sh b/build_tools/ci/check_generated_sources.sh
new file mode 100755
index 000000000..719e221d7
--- /dev/null
+++ b/build_tools/ci/check_generated_sources.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+set -eu -o errtrace
+
+this_dir="$(cd "$(dirname "$0")" && pwd)"
+repo_root="$(cd "$this_dir/../.." && pwd)"
+
+function _check_file_not_changed_by() {
+  # Usage: _check_file_not_changed_by <cmd> <file>
+  local cmd="$1"
+  local file="$2"
+  local file_backup="$PWD/$(basename "$file")"
+  local file_new="$PWD/$(basename "$file").new"
+  # Save the original file.
+  cp "$file" "$file_backup"
+  # Run the command to regenerate it; on failure put the original back first.
+  "$cmd" || { mv "$file_backup" "$file"; return 1; }
+  # Save the new generated file.
+  cp "$file" "$file_new"
+  # Restore the original file. We want this function to not change the user's
+  # working tree state.
+  mv "$file_backup" "$file"
+  # We use git-diff as "just a diff program" (no SCM stuff) because it has
+  # nicer output than regular `diff`.
+  if ! git diff --no-index --quiet "$file" "$file_new"; then
+    echo "#######################################################"
+    echo "Generated file '${file}' is not up to date (see diff below)"
+    echo ">>> Please run '${cmd}' to update it <<<"
+    echo "#######################################################"
+    git diff --no-index --color=always "$file" "$file_new"
+    # TODO: Is there a better cleanup strategy that doesn't require duplicating
+    # this inside and outside the `if`?
+    rm "$file_new"
+    return 1
+  fi
+  rm "$file_new"
+}
+
+echo "::group:: Check that update_abstract_interp_lib.sh has been run"
+_check_file_not_changed_by "$repo_root/build_tools/update_abstract_interp_lib.sh" "$repo_root/lib/Dialect/Torch/Transforms/AbstractInterpLibrary.cpp"
+echo "::endgroup::"
+
+echo "::group:: Check that update_torch_ods.sh has been run"
+_check_file_not_changed_by "$repo_root/build_tools/update_torch_ods.sh" "$repo_root/include/torch-mlir/Dialect/Torch/IR/GeneratedTorchOps.td"
+echo "::endgroup::"
diff --git a/build_tools/ci/install_python_deps.sh b/build_tools/ci/install_python_deps.sh
new file mode 100755
index 000000000..6b49689ce
--- /dev/null
+++ b/build_tools/ci/install_python_deps.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+set -eu -o errtrace
+
+this_dir="$(cd "$(dirname "$0")" && pwd)"
+repo_root="$(cd "$this_dir/../.." && pwd)"
+torch_version="${1:-unknown}"
+
+echo "::group::installing llvm python deps"
+python -m pip install --no-cache-dir -r "$repo_root/externals/llvm-project/mlir/python/requirements.txt"
+echo "::endgroup::"
+
+case "$torch_version" in
+  nightly)
+    echo "::group::installing nightly torch"
+    python -m pip install --no-cache-dir -r "$repo_root/requirements.txt"
+    python -m pip install --no-cache-dir -r "$repo_root/torchvision-requirements.txt"
+    echo "::endgroup::"
+    ;;
+  stable)
+    echo "::group::installing stable torch"
+    python -m pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu
+    python -m pip install --no-cache-dir -r "$repo_root/build-requirements.txt"
+    echo "::endgroup::"
+    ;;
+  *)
+    echo "Unrecognized torch version '$torch_version' (specify 'nightly' or 'stable' with cl arg)"
+    exit 1
+    ;;
+esac
+
+echo "::group::installing test requirements"
+python -m pip install --no-cache-dir -r "$repo_root/test-requirements.txt"
+echo "::endgroup::"
diff --git a/build_tools/ci/linux_default_toolchain.cmake b/build_tools/ci/linux_default_toolchain.cmake
new file mode 100644
index 000000000..4e0c36c71
--- /dev/null
+++ b/build_tools/ci/linux_default_toolchain.cmake
@@ -0,0 +1,14 @@
+message(STATUS "Enabling thin archives (static libraries will not be relocatable)")
+set(CMAKE_C_ARCHIVE_APPEND "<CMAKE_AR> qT <TARGET> <LINK_FLAGS> <OBJECTS>")
+set(CMAKE_CXX_ARCHIVE_APPEND "<CMAKE_AR> qT <TARGET> <LINK_FLAGS> <OBJECTS>")
+set(CMAKE_C_ARCHIVE_CREATE "<CMAKE_AR> crT <TARGET> <LINK_FLAGS> <OBJECTS>")
+set(CMAKE_CXX_ARCHIVE_CREATE "<CMAKE_AR> crT <TARGET> <LINK_FLAGS> <OBJECTS>")
+
+set(CMAKE_EXE_LINKER_FLAGS_INIT "-fuse-ld=lld -Wl,--gdb-index")
+set(CMAKE_MODULE_LINKER_FLAGS_INIT "-fuse-ld=lld -Wl,--gdb-index")
+set(CMAKE_SHARED_LINKER_FLAGS_INIT "-fuse-ld=lld -Wl,--gdb-index")
+
+set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -gsplit-dwarf -ggnu-pubnames")
+set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -gsplit-dwarf -ggnu-pubnames")
+set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -gsplit-dwarf -ggnu-pubnames")
+set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -gsplit-dwarf -ggnu-pubnames")
diff --git a/build_tools/ci/test_posix.sh b/build_tools/ci/test_posix.sh
new file mode 100755
index 000000000..8cc68d77b
--- /dev/null
+++ b/build_tools/ci/test_posix.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+set -eu -o errtrace
+
+this_dir="$(cd "$(dirname "$0")" && pwd)"
+repo_root="$(cd "$this_dir/../.." && pwd)"
+torch_version="${1:-unknown}"
+
+export PYTHONPATH="$repo_root/build/tools/torch-mlir/python_packages/torch_mlir:$repo_root/projects/pt1"
+
+echo "::group::Run Linalg e2e integration tests"
+python -m e2e_testing.main --config=linalg -v
+echo "::endgroup::"
+
+echo "::group::Run make_fx + TOSA e2e integration tests"
+python -m e2e_testing.main --config=make_fx_tosa -v
+echo "::endgroup::"
+
+echo "::group::Run TOSA e2e integration tests"
+python -m e2e_testing.main --config=tosa -v
+echo "::endgroup::"
+
+case "$torch_version" in
+  nightly)
+    # Failing with: NotImplementedError:
+    #   Could not run 'aten::empty.memory_format' with arguments from the 'Lazy' backend.
+    # As of 2024-01-07
+    # echo "::group::Run Lazy Tensor Core e2e integration tests"
+    # python -m e2e_testing.main --config=lazy_tensor_core -v
+    # echo "::endgroup::"
+
+    # TODO: There is one failing test in this group on stable. It could
+    # be xfailed vs excluding entirely.
+    echo "::group::Run TorchDynamo e2e integration tests"
+    python -m e2e_testing.main --config=torchdynamo -v
+    echo "::endgroup::"
+    ;;
+  stable)
+    ;;
+  *)
+    echo "Unrecognized torch version '$torch_version' (specify 'nightly' or 'stable' with cl arg)"
+    exit 1
+    ;;
+esac
diff --git a/projects/pt1/python/torch_mlir_e2e_test/framework.py b/projects/pt1/python/torch_mlir_e2e_test/framework.py
index f1fbad2ec..388976256 100644
--- a/projects/pt1/python/torch_mlir_e2e_test/framework.py
+++ b/projects/pt1/python/torch_mlir_e2e_test/framework.py
@@ -24,11 +24,19 @@ import abc
 from typing import Any, Callable, List, NamedTuple, Optional, TypeVar, Union, Dict
 from itertools import repeat
 
+import os
 import sys
 import traceback
 
-import torch
 import multiprocess as mp
+from multiprocess import set_start_method
+try:
+    set_start_method("spawn")
+except RuntimeError:
+    # Child processes can raise here (presumably already set); safe to ignore.
+    pass
+
+import torch
 
 TorchScriptValue = Union[int, float, List['TorchScriptValue'],
                          Dict['TorchScriptValue',
@@ -316,8 +324,16 @@ def compile_and_run_test(test: Test, config: TestConfig, verbose=False) -> Any:
 
 
 def run_tests(tests: List[Test], config: TestConfig, sequential=False, verbose=False) -> List[TestResult]:
-    """Invoke the given `Test`'s with the provided `TestConfig`."""
-    num_processes = min(int(mp.cpu_count() * 1.1), len(tests))
+    """Invoke the given `Test`'s with the provided `TestConfig`."""
+    num_processes = min(int(mp.cpu_count() * 0.8) + 1, len(tests))
+    try:
+        env_concurrency = int(os.getenv("TORCH_MLIR_TEST_CONCURRENCY", "0"))
+    except ValueError as e:
+        raise ValueError("Bad value for TORCH_MLIR_TEST_CONCURRENCY env var: "
+                         "Expected integer.") from e
+    if env_concurrency > 0:
+        num_processes = min(num_processes, env_concurrency)
+
     # TODO: We've noticed that on certain 2 core machine parallelizing the tests
     # makes the llvm backend legacy pass manager 20x slower than using a
     # single process. Need to investigate the root cause eventually. This is a
@@ -344,7 +360,7 @@ def run_tests(tests: List[Test], config: TestConfig, sequential=False, verbose=F
         pool = mp.Pool(num_processes)
         arg_list = zip(tests, repeat(config))
         handles = pool.starmap_async(compile_and_run_test, arg_list)
-        results = handles.get()
+        results = handles.get(timeout=360)
 
         tests_with_results = {result.unique_name for result in results}
         all_tests = {test.unique_name for test in tests}
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index b8f839445..d725aae6c 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -46,6 +46,13 @@ declare_mlir_python_sources(TorchMLIRPythonSources.Tools
     tools/import_onnx/__main__.py
 )
 
+declare_mlir_python_sources(TorchMLIRSiteInitialize
+  ROOT_DIR "${TORCH_MLIR_PYTHON_ROOT_DIR}"
+  ADD_TO_PARENT TorchMLIRPythonSources
+  SOURCES
+    _mlir_libs/_site_initialize_0.py
+)
+
 ################################################################################
 # Extensions
 ################################################################################
@@ -79,6 +86,7 @@ set(_source_components
   MLIRPythonExtension.RegisterEverything
   TorchMLIRPythonSources
   TorchMLIRPythonExtensions
+  TorchMLIRSiteInitialize
 
   # Sources related to optional Torch extension dependent features. Typically
   # empty unless if project features are enabled.
diff --git a/python/torch_mlir/_mlir_libs/_site_initialize_0.py b/python/torch_mlir/_mlir_libs/_site_initialize_0.py
new file mode 100644
index 000000000..3b93b1fa9
--- /dev/null
+++ b/python/torch_mlir/_mlir_libs/_site_initialize_0.py
@@ -0,0 +1,9 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# Also available under a BSD-style license. See LICENSE.
+
+# Multi-threading rarely helps the frontend, and we also run in contexts with
+# a lot of test parallelism, where nproc*nproc threads would put a large load
+# on the system and virtual memory.
+disable_multithreading = True