torch-mlir/lib/Dialect/Torch/Transforms/Passes.cpp

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "npcomp/Dialect/Torch/Transforms/Passes.h"
#include "mlir/Dialect/Linalg/Passes.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/Passes.h"
#include "npcomp/Backend/Common/Passes.h"
#include "npcomp/Conversion/ATenToLinalg/ATenToLinalg.h"
#include "npcomp/Conversion/ATenToTCF/Passes.h"
#include "npcomp/Conversion/TCFToStd/TCFToStd.h"
#include "npcomp/Dialect/ATen/Transforms/Passes.h"
#include "npcomp/Dialect/Numpy/Transforms/Passes.h"

//===----------------------------------------------------------------------===//
// Pass registration
//===----------------------------------------------------------------------===//

namespace {
// Pull in the generated pass-registration code for this dialect's passes.
// GEN_PASS_REGISTRATION must be defined before the include so that the
// generated Passes.h.inc emits its registration section.
// NOTE(review): presumably this is what provides the `registerPasses()` that
// registerTorchPasses() calls below -- confirm against the generated file.
#define GEN_PASS_REGISTRATION
#include "npcomp/Dialect/Torch/Transforms/Passes.h.inc"
} // end namespace

void mlir::NPCOMP::registerTorchPasses() {
  ::registerPasses();
  mlir::PassPipelineRegistration<>(
      "torch-globalize-pipeline", "Globalization pipeline.",
      mlir::NPCOMP::Torch::createGlobalizePipeline);
  mlir::PassPipelineRegistration<>(
      "torchscript-to-npcomp-backend-pipeline",
      "Pipeline lowering torch object graph to npcomp backend format.",
      mlir::NPCOMP::Torch::createLowerObjectGraphPipeline);
  mlir::PassPipelineRegistration<>(
      "torch-globalized-module-to-npcomp-backend-pipeline",
      "Pipeline lowering to npcomp backend form.",
      mlir::NPCOMP::Torch::createLowerToNpcompBackendPipeline);
}

/// Appends the globalization pipeline to `pm`: the preparation pass followed
/// by the object-graph globalization pass.
void mlir::NPCOMP::Torch::createGlobalizePipeline(OpPassManager &pm) {
  // Step 1: prepare the IR for globalization.
  auto preparePass = createPrepareForGlobalizeObjectGraphPass();
  pm.addPass(std::move(preparePass));

  // Step 2: globalize the object graph.
  auto globalizePass = createGlobalizeObjectGraphPass();
  pm.addPass(std::move(globalizePass));
}

/// Appends to `pm` the full pipeline that takes an imported TorchScript
/// object graph down to npcomp backend form (registered as
/// "torchscript-to-npcomp-backend-pipeline").
///
/// The stages, in order, are: symbol DCE, globalization, a second symbol DCE,
/// inlining, calling-convention adjustment, and finally the shared
/// globalized-module-to-backend lowering.
void mlir::NPCOMP::Torch::createLowerObjectGraphPipeline(OpPassManager &pm) {
  // When we import TorchScript IR, we import their entire "compilation unit",
  // which can contain numerous functions unrelated to the current program,
  // which breaks torch-globalize-pipeline; for example, there can be
  // random functions referencing types that haven't been imported
  // as part of the root `torch.nn.Module` we imported. Those will
  // be unreferenced private functions which symbol-dce will clean up nicely.
  pm.addPass(createSymbolDCEPass());
  // Globalize the program. The rest of the compiler assumes a globalized
  // program, which makes all analyses and transforms significantly easier
  // to write.
  // Reuse the standalone globalization pipeline so that this pipeline and
  // "torch-globalize-pipeline" cannot drift apart.
  createGlobalizePipeline(pm);
  // "lower" `torch.global_slot` ops by deleting them if unused, which we
  // currently require because we don't have a lowering path for backends to
  // handle them.
  // Torch usually inserts a few unused global slots so this ends up hitting
  // every single module even if it doesn't have any explicit slots.
  // TODO: Support global slots in backends.
  pm.addPass(createSymbolDCEPass());
  // Currently, our shape inference is not powerful enough to deal with
  // calls, so inline everything.
  // TODO: Improve shape inference.
  pm.addPass(createInlinerPass());
  // Incorporate user annotations and remove signature Python-isms.
  pm.addPass(createAdjustCallingConventionsPass());

  // The remainder is identical to lowering an already-globalized module.
  createLowerToNpcompBackendPipeline(pm);
}

/// Appends to `pm` the pipeline that lowers an already-globalized module to
/// the form npcomp backends expect (registered as
/// "torch-globalized-module-to-npcomp-backend-pipeline").
void mlir::NPCOMP::Torch::createLowerToNpcompBackendPipeline(
    OpPassManager &pm) {
  // Shorthand for scheduling a pass nested on FuncOp under `pm`.
  auto addFuncPass = [&pm](std::unique_ptr<Pass> pass) {
    pm.addNestedPass<FuncOp>(std::move(pass));
  };

  // Stage 1: recognize ATen kernels.
  addFuncPass(aten::createRecognizeKernelsPass());

  // Stage 2: get the bulk of the program onto ranked tensors with known
  // dtype, which is the input form the backend layer is aiming for.

  // Public functions are converted to tensors unconditionally. As the pass is
  // written today, that means pipeline users cannot observe updates to
  // "out params" on their public functions; this is accepted for now.
  pm.addPass(Numpy::createPublicFunctionsToTensorPass());
  // Then the non-ABI-visible arrays.
  addFuncPass(Numpy::createArrayToTensorPass());
  // Shape and dtype refinement. It could run earlier, except that the pass
  // has no transfer functions for array ops yet.
  addFuncPass(Torch::createRefineTypesPass());
  // Push the shape/dtype information just discovered out to the ABI return
  // types; this is ABI-compatible for our backends.
  pm.addPass(Numpy::createRefinePublicReturnPass());
  // Mop up leftover array/tensor conversion remnants.
  addFuncPass(Numpy::createArrayToTensorPass());

  // Stage 3: lower to TCP (+ guards), the input to codegen backends.
  // Most of this should eventually be subsumed by aten->linalg+guards
  // conversions (guard generation will be automated from the linalg Op DSL).
  addFuncPass(createConvertATenToLinalgPass());
  addFuncPass(createConvertATenToTCFPass());
  addFuncPass(createConvertTCFToStdPass());
  addFuncPass(createConvertElementwiseToLinalgPass());

  // Stage 4: check the backend contract. Compilation fails
  // (signalPassFailure) if the IR is not in the expected form.
  pm.addPass(CommonBackend::createVerifyBackendContractPass());
}
Implement GlobalizeObjectGraph transformation. This required restructuring of how we model TorchScript on import. The main difference is that now we split out a `torch.class_type` that holds methods and declarations of the types of each slot. This is more consistent with TorchScript (our previous representation was "denormalized"). Recommended reading order: 1. check out the description of `torch.class_type` in `TorchOps.td` and look at `test/Dialect/Torch/ops.mlir` and `frontends/pytorch/test/module_import/` to familiarize with the new representation. - Just look at the new IR. The diff between the old names and new names is confusing. 2. check out `test/Dialect/Torch/globalize-object-graph*.mlir` and read along with the pass description in `include/npcomp/Dialect/Torch/Transforms/Passes.td` 3. Read the code in `GlobalizeObjectGraph.cpp` and miscellaneous changes in `ivalue_importer.cpp`, `TorchOps.cpp`, etc. 2021-02-18 03:28:51 +08:00			`//===----------------------------------------------------------------------===//`
			`//`
			`// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.`
			`// See https://llvm.org/LICENSE.txt for license information.`
			`// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception`
			`//`
			`//===----------------------------------------------------------------------===//`

			`#include "npcomp/Dialect/Torch/Transforms/Passes.h"`
Miscellaneous changes while trying to work on ResNet18 - Move frontend lowering pipelines to c++ (this helps with reproducing failures in npcomp-opt) - Add debugging printouts when compilation fails on RefBackendTestConfig The experience now when a test fails during MLIR lowering is now like this: ``` NPCOMP TorchScript Object Graph IR -> NPCOMP Backend IR lowering failed with the following diagnostics: failed to legalize operation 'torch.global_slot' Module does not conform to npcomp's backend contract. See dialect conversion legality information above. Error can be reproduced with: $ npcomp-opt -torchscript-to-npcomp-backend-pipeline /tmp/ResNet18Module.mlir ``` And when TorchScript->MLIR import fails it looks like this: ``` PyTorch TorchScript module -> NPCOMP Object Graph IR import failed with the following diagnostics: unhandled prim operation: %18 : int = prim::min(%17) # /usr/local/google/home/silvasean/.local/lib/python3.9/site-packages/torch/nn/functional.py:4532:4 ``` Also, - Add `--filter=<regex>` to e2e test harness to filter tests. - Add a few prim ops that were needed to import ResNet18 - Fix torch.prim.Loop.condition assemblyFormat (it previously would not round-trip in the case of no loop-carried variables) 2021-04-22 06:07:15 +08:00			`#include "mlir/Dialect/Linalg/Passes.h"`
Support multiple instances of a class in GlobalizeObjectGraph. This happens in practice with e.g. ResNet from torchvision (multiple instances of the same BatchNorm class). The key observation is that for this program, and the expected set of programs, we can convert the program to the same globalized form with a bit more static analysis and effort to suitably monomorphize the program. Though what we are doing here is fairly annoying to implement, it saves any nontrivial later pass from having to do similar analyses (or worse). E.g. shape inference would need to be object-graph aware, mutation/lifetime analyses would have to be aware, etc. Additionally, it would make us front-load what it means to have a !torch.nn.Module type on an ABI boundary, which we are just not ready to handle. I'm really, really hoping that in practice we can get away with this, otherwise it's going to be really rough designing a representation (and implementing everything to back it) that is convenient to transform and gracefully scales from full object graph (in the most dynamic case) down to a fixed set of global slots like we have here (in the most static case, which we presume a lot of practical programs fall into). This also involved introducing a `torch-prepare-for-globalize-object-graph` pass that does a minimal set of lowerings to simplify the IR into a more orthogonal and analyzable form, and a `torch-globalize-pipeline` helper. Recommended review order: - updated documentation in Passes.td - new tests in `globalize-object-graph-multiple-instances*.mlir` - implementation of GlobalizeObjectGraph.cpp - PrepareForGlobalizeObjectGraph.cpp + prepare-for-globalize-object-graph.mlir - misc stuff like torch-globalize-pipeline pipeline definition. With this, we can import, globalize, and inline resnet18 from torchvision: https://gist.github.com/silvasean/821586afc19b67d9fb72030b2e0adeb8 2021-03-10 12:33:21 +08:00			`#include "mlir/Pass/PassManager.h"`
Miscellaneous changes while trying to work on ResNet18 - Move frontend lowering pipelines to c++ (this helps with reproducing failures in npcomp-opt) - Add debugging printouts when compilation fails on RefBackendTestConfig The experience now when a test fails during MLIR lowering is now like this: ``` NPCOMP TorchScript Object Graph IR -> NPCOMP Backend IR lowering failed with the following diagnostics: failed to legalize operation 'torch.global_slot' Module does not conform to npcomp's backend contract. See dialect conversion legality information above. Error can be reproduced with: $ npcomp-opt -torchscript-to-npcomp-backend-pipeline /tmp/ResNet18Module.mlir ``` And when TorchScript->MLIR import fails it looks like this: ``` PyTorch TorchScript module -> NPCOMP Object Graph IR import failed with the following diagnostics: unhandled prim operation: %18 : int = prim::min(%17) # /usr/local/google/home/silvasean/.local/lib/python3.9/site-packages/torch/nn/functional.py:4532:4 ``` Also, - Add `--filter=<regex>` to e2e test harness to filter tests. - Add a few prim ops that were needed to import ResNet18 - Fix torch.prim.Loop.condition assemblyFormat (it previously would not round-trip in the case of no loop-carried variables) 2021-04-22 06:07:15 +08:00			`#include "mlir/Transforms/Passes.h"`
			`#include "npcomp/Backend/Common/Passes.h"`
			`#include "npcomp/Conversion/ATenToLinalg/ATenToLinalg.h"`
			`#include "npcomp/Conversion/ATenToTCF/Passes.h"`
			`#include "npcomp/Conversion/TCFToStd/TCFToStd.h"`
			`#include "npcomp/Dialect/ATen/Transforms/Passes.h"`
			`#include "npcomp/Dialect/Numpy/Transforms/Passes.h"`
Implement GlobalizeObjectGraph transformation. This required restructuring of how we model TorchScript on import. The main difference is that now we split out a `torch.class_type` that holds methods and declarations of the types of each slot. This is more consistent with TorchScript (our previous representation was "denormalized"). Recommended reading order: 1. check out the description of `torch.class_type` in `TorchOps.td` and look at `test/Dialect/Torch/ops.mlir` and `frontends/pytorch/test/module_import/` to familiarize with the new representation. - Just look at the new IR. The diff between the old names and new names is confusing. 2. check out `test/Dialect/Torch/globalize-object-graph*.mlir` and read along with the pass description in `include/npcomp/Dialect/Torch/Transforms/Passes.td` 3. Read the code in `GlobalizeObjectGraph.cpp` and miscellaneous changes in `ivalue_importer.cpp`, `TorchOps.cpp`, etc. 2021-02-18 03:28:51 +08:00
			`//===----------------------------------------------------------------------===//`
			`// Pass registration`
			`//===----------------------------------------------------------------------===//`

			`namespace {`
			`#define GEN_PASS_REGISTRATION`
			`#include "npcomp/Dialect/Torch/Transforms/Passes.h.inc"`
			`} // end namespace`

Support multiple instances of a class in GlobalizeObjectGraph. This happens in practice with e.g. ResNet from torchvision (multiple instances of the same BatchNorm class). The key observation is that for this program, and the expected set of programs, we can convert the program to the same globalized form with a bit more static analysis and effort to suitably monomorphize the program. Though what we are doing here is fairly annoying to implement, it saves any nontrivial later pass from having to do similar analyses (or worse). E.g. shape inference would need to be object-graph aware, mutation/lifetime analyses would have to be aware, etc. Additionally, it would make us front-load what it means to have a !torch.nn.Module type on an ABI boundary, which we are just not ready to handle. I'm really, really hoping that in practice we can get away with this, otherwise it's going to be really rough designing a representation (and implementing everything to back it) that is convenient to transform and gracefully scales from full object graph (in the most dynamic case) down to a fixed set of global slots like we have here (in the most static case, which we presume a lot of practical programs fall into). This also involved introducing a `torch-prepare-for-globalize-object-graph` pass that does a minimal set of lowerings to simplify the IR into a more orthogonal and analyzable form, and a `torch-globalize-pipeline` helper. Recommended review order: - updated documentation in Passes.td - new tests in `globalize-object-graph-multiple-instances*.mlir` - implementation of GlobalizeObjectGraph.cpp - PrepareForGlobalizeObjectGraph.cpp + prepare-for-globalize-object-graph.mlir - misc stuff like torch-globalize-pipeline pipeline definition. With this, we can import, globalize, and inline resnet18 from torchvision: https://gist.github.com/silvasean/821586afc19b67d9fb72030b2e0adeb8 2021-03-10 12:33:21 +08:00			`void mlir::NPCOMP::registerTorchPasses() {`
			`::registerPasses();`
			`mlir::PassPipelineRegistration<>(`
			`"torch-globalize-pipeline", "Globalization pipeline.",`
			`mlir::NPCOMP::Torch::createGlobalizePipeline);`
Miscellaneous changes while trying to work on ResNet18 - Move frontend lowering pipelines to c++ (this helps with reproducing failures in npcomp-opt) - Add debugging printouts when compilation fails on RefBackendTestConfig The experience now when a test fails during MLIR lowering is now like this: ``` NPCOMP TorchScript Object Graph IR -> NPCOMP Backend IR lowering failed with the following diagnostics: failed to legalize operation 'torch.global_slot' Module does not conform to npcomp's backend contract. See dialect conversion legality information above. Error can be reproduced with: $ npcomp-opt -torchscript-to-npcomp-backend-pipeline /tmp/ResNet18Module.mlir ``` And when TorchScript->MLIR import fails it looks like this: ``` PyTorch TorchScript module -> NPCOMP Object Graph IR import failed with the following diagnostics: unhandled prim operation: %18 : int = prim::min(%17) # /usr/local/google/home/silvasean/.local/lib/python3.9/site-packages/torch/nn/functional.py:4532:4 ``` Also, - Add `--filter=<regex>` to e2e test harness to filter tests. - Add a few prim ops that were needed to import ResNet18 - Fix torch.prim.Loop.condition assemblyFormat (it previously would not round-trip in the case of no loop-carried variables) 2021-04-22 06:07:15 +08:00			`mlir::PassPipelineRegistration<>(`
			`"torchscript-to-npcomp-backend-pipeline",`
			`"Pipeline lowering torch object graph to npcomp backend format.",`
			`mlir::NPCOMP::Torch::createLowerObjectGraphPipeline);`
			`mlir::PassPipelineRegistration<>(`
			`"torch-globalized-module-to-npcomp-backend-pipeline",`
			`"Pipeline lowering to npcomp backend form.",`
			`mlir::NPCOMP::Torch::createLowerToNpcompBackendPipeline);`
Support multiple instances of a class in GlobalizeObjectGraph. This happens in practice with e.g. ResNet from torchvision (multiple instances of the same BatchNorm class). The key observation is that for this program, and the expected set of programs, we can convert the program to the same globalized form with a bit more static analysis and effort to suitably monomorphize the program. Though what we are doing here is fairly annoying to implement, it saves any nontrivial later pass from having to do similar analyses (or worse). E.g. shape inference would need to be object-graph aware, mutation/lifetime analyses would have to be aware, etc. Additionally, it would make us front-load what it means to have a !torch.nn.Module type on an ABI boundary, which we are just not ready to handle. I'm really, really hoping that in practice we can get away with this, otherwise it's going to be really rough designing a representation (and implementing everything to back it) that is convenient to transform and gracefully scales from full object graph (in the most dynamic case) down to a fixed set of global slots like we have here (in the most static case, which we presume a lot of practical programs fall into). This also involved introducing a `torch-prepare-for-globalize-object-graph` pass that does a minimal set of lowerings to simplify the IR into a more orthogonal and analyzable form, and a `torch-globalize-pipeline` helper. Recommended review order: - updated documentation in Passes.td - new tests in `globalize-object-graph-multiple-instances*.mlir` - implementation of GlobalizeObjectGraph.cpp - PrepareForGlobalizeObjectGraph.cpp + prepare-for-globalize-object-graph.mlir - misc stuff like torch-globalize-pipeline pipeline definition. With this, we can import, globalize, and inline resnet18 from torchvision: https://gist.github.com/silvasean/821586afc19b67d9fb72030b2e0adeb8 2021-03-10 12:33:21 +08:00			`}`

			`void mlir::NPCOMP::Torch::createGlobalizePipeline(OpPassManager &pm) {`
			`pm.addPass(createPrepareForGlobalizeObjectGraphPass());`
			`pm.addPass(createGlobalizeObjectGraphPass());`
			`}`
Miscellaneous changes while trying to work on ResNet18 - Move frontend lowering pipelines to c++ (this helps with reproducing failures in npcomp-opt) - Add debugging printouts when compilation fails on RefBackendTestConfig The experience now when a test fails during MLIR lowering is now like this: ``` NPCOMP TorchScript Object Graph IR -> NPCOMP Backend IR lowering failed with the following diagnostics: failed to legalize operation 'torch.global_slot' Module does not conform to npcomp's backend contract. See dialect conversion legality information above. Error can be reproduced with: $ npcomp-opt -torchscript-to-npcomp-backend-pipeline /tmp/ResNet18Module.mlir ``` And when TorchScript->MLIR import fails it looks like this: ``` PyTorch TorchScript module -> NPCOMP Object Graph IR import failed with the following diagnostics: unhandled prim operation: %18 : int = prim::min(%17) # /usr/local/google/home/silvasean/.local/lib/python3.9/site-packages/torch/nn/functional.py:4532:4 ``` Also, - Add `--filter=<regex>` to e2e test harness to filter tests. - Add a few prim ops that were needed to import ResNet18 - Fix torch.prim.Loop.condition assemblyFormat (it previously would not round-trip in the case of no loop-carried variables) 2021-04-22 06:07:15 +08:00
			`void mlir::NPCOMP::Torch::createLowerObjectGraphPipeline(OpPassManager &pm) {`
			`// When we import TorchScript IR, we import their entire "compilation unit",`
			`// which can contain numerous functions unrelated to the current program,`
			`// which breaks torch-globalization-pipeline; for example, there can be`
			`// random functions referencing types that haven't been imported`
			// as part of the root `torch.nn.Module` we imported. Those will
			`// be unreferenced private functions which symbol-dce will clean up nicely.`
			`pm.addPass(createSymbolDCEPass());`
			`// Globalize the program. The rest of the compiler assumes a globalized`
			`// program, which makes all analyses and transforms significantly easier`
			`// to write.`
			`pm.addPass(createPrepareForGlobalizeObjectGraphPass());`
			`pm.addPass(createGlobalizeObjectGraphPass());`
			// "lower" `torch.global_slot` ops by deleting them if unused, which we
			`// currently require because we don't have a lowering path for backends to`
			`// handle them.`
			`// Torch usually inserts a few unused global slots so this ends up hitting`
			`// every single module even if it doesn't have any explicit slots.`
			`// TODO: Support global slots in backends.`
			`pm.addPass(createSymbolDCEPass());`
			`// Currently, our shape inference is not powerful enough to deal with`
			`// calls, so inline everything.`
			`// TODO: Improve shape inference.`
			`pm.addPass(createInlinerPass());`
			`// Incorporate user annotations and remove signature Python-isms.`
			`pm.addPass(createAdjustCallingConventionsPass());`

			`createLowerToNpcompBackendPipeline(pm);`
			`}`

			`void mlir::NPCOMP::Torch::createLowerToNpcompBackendPipeline(OpPassManager &pm) {`
			`// Recognize ATen kernels.`
			`pm.addNestedPass<FuncOp>(aten::createRecognizeKernelsPass());`

			`// Convert the bulk of the program to ranked tensors with known dtype.`
			`// This is the input to the backend layer that we are aiming for.`

			`// First, unilaterally convert public functions to tensor.`
			`// The way this pass is currently written, this implies that`
			`// as pipeline authors, we are restricting our users to not be able to see`
			`// updates to "out params" on their public functions.`
			`// This is deemed ok for now.`
			`pm.addPass(Numpy::createPublicFunctionsToTensorPass());`
			`// Convert the bulk of non-ABI-visible arrays to tensors.`
			`pm.addNestedPass<FuncOp>(Numpy::createArrayToTensorPass());`
			`// Do shape and dtype refinement.`
			`// We could do it sooner, but the pass currently doesn't have transfer`
			`// functions for array ops.`
			`pm.addNestedPass<FuncOp>(Torch::createRefineTypesPass());`
			`// Propagate to ABI return types the shape/dtype information discovered by`
			`// the previous pass. Doing this is ABI-compatible for our backends.`
			`pm.addPass(Numpy::createRefinePublicReturnPass());`
			`// Clean up a few stray array/tensor conversion remnants.`
			`pm.addNestedPass<FuncOp>(Numpy::createArrayToTensorPass());`

			`// Lower to TCP (+ guards) which is the input to codegen backends.`
			`// Most of this should be subsumed by aten->linalg+guards conversions.`
			`// (the guard generation will be automated from the linalg Op DSL).`
			`pm.addNestedPass<FuncOp>(createConvertATenToLinalgPass());`
			`pm.addNestedPass<FuncOp>(createConvertATenToTCFPass());`
			`pm.addNestedPass<FuncOp>(createConvertTCFToStdPass());`
			`pm.addNestedPass<FuncOp>(createConvertElementwiseToLinalgPass());`

			`// Verify that we have lowered to the form that backends expect.`
			`// This fails compilation (signalPassFailure) if the IR is not in the`
			`// correct form.`
			`pm.addPass(CommonBackend::createVerifyBackendContractPass());`
			`}`