torch-mlir/lib/Dialect/Torch/Transforms/PrepareForGlobalizeObjectGr...

//===- PrepareForGlobalizeObjectGraph.cpp ------------------------*- C++-*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "PassDetail.h"

#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "npcomp/Dialect/Torch/IR/TorchDialect.h"
#include "npcomp/Dialect/Torch/IR/TorchOps.h"
#include "npcomp/Dialect/Torch/Transforms/Passes.h"

using namespace mlir;
using namespace mlir::NPCOMP;
using namespace mlir::NPCOMP::Torch;

namespace {
/// Rewrites a `PrimCallMethodOp` into a direct `CallOp` on the function that
/// implements the called method.
///
/// The class is found by looking up the class name of the receiver's
/// `NnModuleType` in the symbol table. The `MethodOp`s of that `ClassTypeOp`
/// are then scanned for one whose name matches the call, and the function it
/// refers to becomes the callee. All operands of the original op are forwarded
/// unchanged to the new call.
///
/// If the class or the method cannot be resolved, the pattern reports a match
/// failure and leaves the op untouched instead of dereferencing a null
/// operation. The enclosing pass marks `PrimCallMethodOp` illegal in its final
/// conversion check, so an unresolved call still fails the pass.
class ConvertPrimCallMethodToCall : public OpRewritePattern<PrimCallMethodOp> {
public:
  /// `symbolTable` is stored by reference and must outlive this pattern.
  ConvertPrimCallMethodToCall(MLIRContext *context, SymbolTable &symbolTable)
      : OpRewritePattern(context), symbolTable(symbolTable) {}

  LogicalResult matchAndRewrite(PrimCallMethodOp op,
                                PatternRewriter &rewriter) const override {
    auto classType = symbolTable.lookup<ClassTypeOp>(
        op.receiver().getType().cast<NnModuleType>().getClassName());
    if (!classType)
      return rewriter.notifyMatchFailure(
          op, "malformed module -- missing ClassTypeOp");

    // Find the method declaration with the called name and resolve the
    // function symbol it points at.
    FuncOp func;
    for (auto method : classType.getOps<MethodOp>()) {
      if (method.name() == op.name()) {
        func = symbolTable.lookup<FuncOp>(method.function());
        break;
      }
    }
    if (!func)
      return rewriter.notifyMatchFailure(
          op, "malformed module -- cannot resolve function for called method");

    rewriter.replaceOpWithNewOp<CallOp>(op, func, op->getOperands());
    return success();
  }

private:
  /// Symbol table used to resolve class types and method functions.
  SymbolTable &symbolTable;
};
} // namespace

namespace {
/// Removes any `ConstantOp` whose result has no remaining uses.
class EraseUnusedConstantOp : public OpRewritePattern<ConstantOp> {
public:
  using OpRewritePattern::OpRewritePattern;

  LogicalResult matchAndRewrite(ConstantOp op,
                                PatternRewriter &rewriter) const override {
    // A constant that is still referenced must be kept.
    if (!op.use_empty())
      return failure();

    rewriter.eraseOp(op);
    return success();
  }
};
} // namespace

namespace {
/// Pass that rewrites method calls into direct calls, applies the
/// `CallIndirectOp` canonicalizations, drops constants that become unused, and
/// then checks that none of the ops it is meant to eliminate are left behind.
class PrepareForGlobalizeObjectGraphPass
    : public PrepareForGlobalizeObjectGraphBase<
          PrepareForGlobalizeObjectGraphPass> {
  void runOnOperation() override {
    auto root = getOperation();

    // Built before the pattern set: ConvertPrimCallMethodToCall keeps a
    // reference to it.
    SymbolTable methodLookup(root);

    MLIRContext *ctx = &getContext();
    RewritePatternSet rewrites(ctx);
    rewrites.add<ConvertPrimCallMethodToCall>(ctx, methodLookup);
    CallIndirectOp::getCanonicalizationPatterns(rewrites, ctx);
    rewrites.add<EraseUnusedConstantOp>(ctx);

    // The greedy driver is used here because folding a CallIndirectOp leaves
    // its ConstantOp without uses, and the visitation order of the dialect
    // conversion infrastructure does not cope with that.
    // TODO: Do this with the dialect conversion infrastructure to avoid doing
    // folding as part of this. Or avoid folding during greedy pattern
    // application. See: https://llvm.org/PR49502
    const bool rewriteFailed =
        failed(applyPatternsAndFoldGreedily(root, std::move(rewrites)));
    if (rewriteFailed) {
      signalPassFailure();
      return;
    }

    // Verification step: run a full conversion with an empty pattern set so
    // that any op still in a form we do not want is reported as illegal.
    ConversionTarget legality(*context_cast(ctx));
    legality.addIllegalOp<PrimCallMethodOp>();
    legality.addDynamicallyLegalOp<ConstantOp>(
        [](ConstantOp op) { return !op.getType().isa<FunctionType>(); });
    legality.addIllegalOp<CallIndirectOp>();
    legality.markUnknownOpDynamicallyLegal([](Operation *) { return true; });

    RewritePatternSet noPatterns(ctx);
    if (failed(applyFullConversion(root, legality, std::move(noPatterns))))
      signalPassFailure();
  }

  /// Identity helper that makes the dereference in the target construction
  /// explicit.
  static MLIRContext *context_cast(MLIRContext *ctx) { return ctx; }
};
} // namespace

/// Creates a new instance of the prepare-for-globalize-object-graph pass.
std::unique_ptr<OperationPass<ModuleOp>>
mlir::NPCOMP::Torch::createPrepareForGlobalizeObjectGraphPass() {
  std::unique_ptr<OperationPass<ModuleOp>> pass =
      std::make_unique<PrepareForGlobalizeObjectGraphPass>();
  return pass;
}
Support multiple instances of a class in GlobalizeObjectGraph. This happens in practice with e.g. ResNet from torchvision (multiple instances of the same BatchNorm class). The key observation is that for this program, and the expected set of programs, we can convert the program to the same globalized form with a bit more static analysis and effort to suitably monomorphize the program. Though what we are doing here is fairly annoying to implement, it saves any nontrivial later pass from having to do similar analyses (or worse). E.g. shape inference would need to be object-graph aware, mutation/lifetime analyses would have to be aware, etc. Additionally, it would make us front-load what it means to have a !torch.nn.Module type on an ABI boundary, which we are just not ready to handle. I'm really, really hoping that in practice we can get away with this, otherwise it's going to be really rough designing a representation (and implementing everything to back it) that is convenient to transform and gracefully scales from full object graph (in the most dynamic case) down to a fixed set of global slots like we have here (in the most static case, which we presume a lot of practical programs fall into). This also involved introducing a `torch-prepare-for-globalize-object-graph` pass that does a minimal set of lowerings to simplify the IR into a more orthogonal and analyzable form, and a `torch-globalize-pipeline` helper. Recommended review order: - updated documentation in Passes.td - new tests in `globalize-object-graph-multiple-instances*.mlir` - implementation of GlobalizeObjectGraph.cpp - PrepareForGlobalizeObjectGraph.cpp + prepare-for-globalize-object-graph.mlir - misc stuff like torch-globalize-pipeline pipeline definition. 
With this, we can import, globalize, and inline resnet18 from torchvision: https://gist.github.com/silvasean/821586afc19b67d9fb72030b2e0adeb8 2021-03-10 12:33:21 +08:00			`//===- PrepareForGlobalizeObjectGraph.cpp ------------------------*- C++-*-===//`
			`//`
			`// This file is licensed under the Apache License v2.0 with LLVM Exceptions.`
			`// See https://llvm.org/LICENSE.txt for license information.`
			`// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception`
			`//`
			`//===----------------------------------------------------------------------===//`

			`#include "PassDetail.h"`

			`#include "mlir/Dialect/StandardOps/IR/Ops.h"`
			`#include "mlir/IR/BlockAndValueMapping.h"`
			`#include "mlir/IR/Builders.h"`
			`#include "mlir/IR/BuiltinOps.h"`
			`#include "mlir/Transforms/DialectConversion.h"`
			`#include "mlir/Transforms/GreedyPatternRewriteDriver.h"`
			`#include "npcomp/Dialect/Torch/IR/TorchDialect.h"`
			`#include "npcomp/Dialect/Torch/IR/TorchOps.h"`
			`#include "npcomp/Dialect/Torch/Transforms/Passes.h"`

			`using namespace mlir;`
			`using namespace mlir::NPCOMP;`
			`using namespace mlir::NPCOMP::Torch;`

			`namespace {`
			`class ConvertPrimCallMethodToCall : public OpRewritePattern<PrimCallMethodOp> {`
			`public:`
			`ConvertPrimCallMethodToCall(MLIRContext *context, SymbolTable &symbolTable)`
			`: OpRewritePattern(context), symbolTable(symbolTable) {}`
			`LogicalResult matchAndRewrite(PrimCallMethodOp op,`
			`PatternRewriter &rewriter) const override {`
			`auto classType = symbolTable.lookup<ClassTypeOp>(`
			`op.receiver().getType().cast<NnModuleType>().getClassName());`
Fix issue with unused functions in torch::jit::CompilationUnit As described in the code comment: ``` When we import TorchScript IR, we import their entire "compilation unit", which can contain numerous functions unrelated to the current program, which breaks torch-globalization-pipeline; for example, there can be random functions referencing types that haven't been imported as part of the root `torch.nn.Module` we imported. Those will be unreferenced private functions which symbol-dce will clean up nicely. ``` This situation is really easy to hit in jupyter notebooks, where the same cell is evaluated multiple times. That results in the same class name (at the Python level, e.g. class `Foo` in the top-level main module). Internally to PyTorch, it handles this situation by mangling in a unique number to the names of ClassType's and such. When we import the new ClassType's, we see not just the new torch::jit::Function's in the CompilationUnit, but, also all the old ones, which reference ClassType's that are not reachable from the `torch.nn.Module` that we imported. Note: there is no way to avoid importing the whole CompilationUnit (including these old remnants) without doing a fairly complicated call graph reachability analysis of which functions are reachable from the methods of the ClassType's we imported. It turns out that once we are inside MLIR, we model visibility correctly so that `symbol-dce` "Just Works" for this use case. That is to say, this is not a quick hack, but rather seems like a totally palatable long-term solution. 2021-04-14 05:50:58 +08:00			`assert(classType && "malformed module -- missing ClassTypeOp");`
Support multiple instances of a class in GlobalizeObjectGraph. This happens in practice with e.g. ResNet from torchvision (multiple instances of the same BatchNorm class). The key observation is that for this program, and the expected set of programs, we can convert the program to the same globalized form with a bit more static analysis and effort to suitably monomorphize the program. Though what we are doing here is fairly annoying to implement, it saves any nontrivial later pass from having to do similar analyses (or worse). E.g. shape inference would need to be object-graph aware, mutation/lifetime analyses would have to be aware, etc. Additionally, it would make us front-load what it means to have a !torch.nn.Module type on an ABI boundary, which we are just not ready to handle. I'm really, really hoping that in practice we can get away with this, otherwise it's going to be really rough designing a representation (and implementing everything to back it) that is convenient to transform and gracefully scales from full object graph (in the most dynamic case) down to a fixed set of global slots like we have here (in the most static case, which we presume a lot of practical programs fall into). This also involved introducing a `torch-prepare-for-globalize-object-graph` pass that does a minimal set of lowerings to simplify the IR into a more orthogonal and analyzable form, and a `torch-globalize-pipeline` helper. Recommended review order: - updated documentation in Passes.td - new tests in `globalize-object-graph-multiple-instances*.mlir` - implementation of GlobalizeObjectGraph.cpp - PrepareForGlobalizeObjectGraph.cpp + prepare-for-globalize-object-graph.mlir - misc stuff like torch-globalize-pipeline pipeline definition. With this, we can import, globalize, and inline resnet18 from torchvision: https://gist.github.com/silvasean/821586afc19b67d9fb72030b2e0adeb8 2021-03-10 12:33:21 +08:00			`FuncOp func;`
			`for (auto method : classType.getOps<MethodOp>()) {`
			`if (method.name() == op.name()) {`
			`func = symbolTable.lookup<FuncOp>(method.function());`
			`break;`
			`}`
			`}`
			`assert(func);`
			`rewriter.replaceOpWithNewOp<CallOp>(op, func, op->getOperands());`
			`return success();`
			`}`

			`private:`
			`SymbolTable &symbolTable;`
			`};`
			`} // namespace`

			`namespace {`
			`class EraseUnusedConstantOp : public OpRewritePattern<ConstantOp> {`
			`public:`
			`using OpRewritePattern::OpRewritePattern;`
			`LogicalResult matchAndRewrite(ConstantOp op,`
			`PatternRewriter &rewriter) const override {`
			`if (op.use_empty()) {`
			`rewriter.eraseOp(op);`
			`return success();`
			`}`
			`return failure();`
			`}`
			`};`
			`} // namespace`

			`namespace {`
			`class PrepareForGlobalizeObjectGraphPass`
			`: public PrepareForGlobalizeObjectGraphBase<`
			`PrepareForGlobalizeObjectGraphPass> {`
			`void runOnOperation() override {`

			`SymbolTable symbolTable(getOperation());`

			`MLIRContext *context = &getContext();`
Bump llvm-project to 0524a09cc7e1a0797982feacf505825231efbee7 - renames of OwningRewritePatternList -> RewritePatternSet - also `insert` to `add` - RewritePatternSet holds a context now - memref dialect split from std 2021-03-24 05:16:23 +08:00			`RewritePatternSet patterns(context);`
			`patterns.add<ConvertPrimCallMethodToCall>(context, symbolTable);`
Support multiple instances of a class in GlobalizeObjectGraph. This happens in practice with e.g. ResNet from torchvision (multiple instances of the same BatchNorm class). The key observation is that for this program, and the expected set of programs, we can convert the program to the same globalized form with a bit more static analysis and effort to suitably monomorphize the program. Though what we are doing here is fairly annoying to implement, it saves any nontrivial later pass from having to do similar analyses (or worse). E.g. shape inference would need to be object-graph aware, mutation/lifetime analyses would have to be aware, etc. Additionally, it would make us front-load what it means to have a !torch.nn.Module type on an ABI boundary, which we are just not ready to handle. I'm really, really hoping that in practice we can get away with this, otherwise it's going to be really rough designing a representation (and implementing everything to back it) that is convenient to transform and gracefully scales from full object graph (in the most dynamic case) down to a fixed set of global slots like we have here (in the most static case, which we presume a lot of practical programs fall into). This also involved introducing a `torch-prepare-for-globalize-object-graph` pass that does a minimal set of lowerings to simplify the IR into a more orthogonal and analyzable form, and a `torch-globalize-pipeline` helper. Recommended review order: - updated documentation in Passes.td - new tests in `globalize-object-graph-multiple-instances*.mlir` - implementation of GlobalizeObjectGraph.cpp - PrepareForGlobalizeObjectGraph.cpp + prepare-for-globalize-object-graph.mlir - misc stuff like torch-globalize-pipeline pipeline definition. With this, we can import, globalize, and inline resnet18 from torchvision: https://gist.github.com/silvasean/821586afc19b67d9fb72030b2e0adeb8 2021-03-10 12:33:21 +08:00			`CallIndirectOp::getCanonicalizationPatterns(patterns, context);`
Bump llvm-project to 0524a09cc7e1a0797982feacf505825231efbee7 - renames of OwningRewritePatternList -> RewritePatternSet - also `insert` to `add` - RewritePatternSet holds a context now - memref dialect split from std 2021-03-24 05:16:23 +08:00			`patterns.add<EraseUnusedConstantOp>(context);`
Support multiple instances of a class in GlobalizeObjectGraph. This happens in practice with e.g. ResNet from torchvision (multiple instances of the same BatchNorm class). The key observation is that for this program, and the expected set of programs, we can convert the program to the same globalized form with a bit more static analysis and effort to suitably monomorphize the program. Though what we are doing here is fairly annoying to implement, it saves any nontrivial later pass from having to do similar analyses (or worse). E.g. shape inference would need to be object-graph aware, mutation/lifetime analyses would have to be aware, etc. Additionally, it would make us front-load what it means to have a !torch.nn.Module type on an ABI boundary, which we are just not ready to handle. I'm really, really hoping that in practice we can get away with this, otherwise it's going to be really rough designing a representation (and implementing everything to back it) that is convenient to transform and gracefully scales from full object graph (in the most dynamic case) down to a fixed set of global slots like we have here (in the most static case, which we presume a lot of practical programs fall into). This also involved introducing a `torch-prepare-for-globalize-object-graph` pass that does a minimal set of lowerings to simplify the IR into a more orthogonal and analyzable form, and a `torch-globalize-pipeline` helper. Recommended review order: - updated documentation in Passes.td - new tests in `globalize-object-graph-multiple-instances*.mlir` - implementation of GlobalizeObjectGraph.cpp - PrepareForGlobalizeObjectGraph.cpp + prepare-for-globalize-object-graph.mlir - misc stuff like torch-globalize-pipeline pipeline definition. With this, we can import, globalize, and inline resnet18 from torchvision: https://gist.github.com/silvasean/821586afc19b67d9fb72030b2e0adeb8 2021-03-10 12:33:21 +08:00
			`// Use applyPatternsAndFoldGreedily because the CallIndirectOp folding`
			`// makes the ConstantOp unused, which does not work with the visitation`
			`// order of the dialect conversion infrastructure.`
			`// TODO: Do this with the dialect conversion infrastructure to avoid doing`
			`// folding as part of this. Or avoid folding during greedy pattern`
			`// application. See: https://llvm.org/PR49502`
			`if (failed(applyPatternsAndFoldGreedily(getOperation(),`
			`std::move(patterns)))) {`
			`return signalPassFailure();`
			`}`

			`// Do a dummy full conversion to ensure that the program has been converted`
			`// to the form we want.`
			`ConversionTarget target(*context);`
			`target.addIllegalOp<PrimCallMethodOp>();`
			`target.addDynamicallyLegalOp<ConstantOp>([](ConstantOp op) {`
			`return !op.getType().isa<FunctionType>();`
			`});`
			`target.addIllegalOp<CallIndirectOp>();`
			`target.markUnknownOpDynamicallyLegal([](Operation *) { return true; });`

Bump llvm-project to 0524a09cc7e1a0797982feacf505825231efbee7 - renames of OwningRewritePatternList -> RewritePatternSet - also `insert` to `add` - RewritePatternSet holds a context now - memref dialect split from std 2021-03-24 05:16:23 +08:00			`RewritePatternSet dummyPatterns(context);`
Support multiple instances of a class in GlobalizeObjectGraph. This happens in practice with e.g. ResNet from torchvision (multiple instances of the same BatchNorm class). The key observation is that for this program, and the expected set of programs, we can convert the program to the same globalized form with a bit more static analysis and effort to suitably monomorphize the program. Though what we are doing here is fairly annoying to implement, it saves any nontrivial later pass from having to do similar analyses (or worse). E.g. shape inference would need to be object-graph aware, mutation/lifetime analyses would have to be aware, etc. Additionally, it would make us front-load what it means to have a !torch.nn.Module type on an ABI boundary, which we are just not ready to handle. I'm really, really hoping that in practice we can get away with this, otherwise it's going to be really rough designing a representation (and implementing everything to back it) that is convenient to transform and gracefully scales from full object graph (in the most dynamic case) down to a fixed set of global slots like we have here (in the most static case, which we presume a lot of practical programs fall into). This also involved introducing a `torch-prepare-for-globalize-object-graph` pass that does a minimal set of lowerings to simplify the IR into a more orthogonal and analyzable form, and a `torch-globalize-pipeline` helper. Recommended review order: - updated documentation in Passes.td - new tests in `globalize-object-graph-multiple-instances*.mlir` - implementation of GlobalizeObjectGraph.cpp - PrepareForGlobalizeObjectGraph.cpp + prepare-for-globalize-object-graph.mlir - misc stuff like torch-globalize-pipeline pipeline definition. With this, we can import, globalize, and inline resnet18 from torchvision: https://gist.github.com/silvasean/821586afc19b67d9fb72030b2e0adeb8 2021-03-10 12:33:21 +08:00
			`if (failed(applyFullConversion(getOperation(), target,`
			`std::move(dummyPatterns)))) {`
			`return signalPassFailure();`
			`}`
			`}`
			`};`
			`} // namespace`

			`std::unique_ptr<OperationPass<ModuleOp>>`
			`mlir::NPCOMP::Torch::createPrepareForGlobalizeObjectGraphPass() {`
			`return std::make_unique<PrepareForGlobalizeObjectGraphPass>();`
			`}`