2021-09-24 11:59:12 +08:00
|
|
|
// RUN: torch-mlir-opt <%s -convert-torch-to-linalg -split-input-file -verify-diagnostics | FileCheck %s
|
Add TorchToIREE and factor out TorchConversion dialect.
This converts a basic list op (torch.prim.ListConstruct) to the IREE
dialect.
```
def forward(self, x: float):
return [x, x]
```
turns into:
```
builtin.func @forward(%arg0: !torch.float) -> !torch.list<!torch.float> {
%0 = torch.prim.ListConstruct %arg0, %arg0 : (!torch.float, !torch.float) -> !torch.list<!torch.float>
return %0 : !torch.list<!torch.float>
}
```
which turns into:
```
builtin.func @forward(%arg0: f64) -> !iree.list<f64> {
%c1 = constant 1 : index
%c0 = constant 0 : index
%c2 = constant 2 : index
%0 = iree.list.create %c2 : !iree.list<f64>
iree.list.set %0[%c0], %arg0 : !iree.list<f64>, f64
iree.list.set %0[%c1], %arg0 : !iree.list<f64>, f64
return %0 : !iree.list<f64>
}
```
As part of doing this, I realized that it was time to formalize the IR
form that we reach right before running TorchTo{Linalg,Std,...}. We now
call it the "Torch backend contract". We then lower the "Torch backend
contract" to the "npcomp backend contract", which involves the new
TorchConversion (`torch_c`) dialect, which holds ops that need to
operate on both the npcomp backend types (e.g. builtin tensors, i1, IREE
list, etc.) and the `!torch` types.
This made more sense, as I realized that if I didn't factor out
`torch_c` then the Torch dialect would have a dependency on IREE
dialect (we previously didn't notice this was an issue because we only
depended on `builtin` types), which seemed wrong to me.
Recommended review order:
- TorchToIREE.cpp / `TorchToIREE/basic.mlir`
- Look at the new structure of createTorchScriptToNpcompBackendPipeline.
It now lives in TorchConversion/Transforms/Passes.cpp and cleanly
calls into `Torch::createTorchScriptToTorchBackendPipeline` for the
frontend lowering to the Torch backend contract.
- Mechanical change extracting
`torch_c.{to,from}_{i1,i64,f64,builtin_tensor,iree_list}` into a new
TorchConversion dialect, and a few passes specific to the lowering
from the Torch backend contract to the npcomp backend contract.
- Minor fixes to TorchToLinalg.cpp to use unconverted operands (now that
we convert lists as part of operand materialization, we need to use
the original operands). Also added test for AtenMaxPool2dOp and fixed
m_TorchConstantIntList.
- TmpDeleteDeadIREELists pass. Temporary pass for deleting dead IREE lists that
are created as part of operand materialization for conv/max pool/avg pool ops
in TorchToLinalg.
2021-08-12 05:40:08 +08:00
|
|
|
|
Add aten.max_pool3d support to torch-to-linalg (#2735)
Added verification logic to the abstract_interpreter_lib_gen.py
Also made some unit tests
Initially, I thought we could use `linalg::pooling_ndhwc_max` to help
implement this. However, on a 5-dimensional tensor it does the
pooling on dimensions (2, 3, 4), which is not what we want. We want
pooling on dimensions (3, 4, 5).
To achieve this, we would need to lower our code using the `linalg`
dialect.
Turns out the pooling code in `linalg` looks like this.
```
func @max_pooling_ncdhw(%I: memref<?x?x?x?x?xf32>, %K: memref<3xindex>, %O: memref<?x?x?x?x?xf32>,
%strides: memref<3xindex>, %dilations: memref<3xindex>) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%N = memref.dim %I, %c0 : memref<?x?x?x?x?xf32>
%C = memref.dim %I, %c1 : memref<?x?x?x?x?xf32>
%D = memref.dim %I, 2 : memref<?x?x?x?x?xf32>
%H = memref.dim %I, 3 : memref<?x?x?x?x?xf32>
%W = memref.dim %I, 4 : memref<?x?x?x?x?xf32>
%kernel_d = memref.load %K[%c0] : memref<3xindex>
%kernel_h = memref.load %K[%c1] : memref<3xindex>
%kernel_w = memref.load %K[2] : memref<3xindex>
%stride_d = memref.load %strides[%c0] : memref<3xindex>
%stride_h = memref.load %strides[%c1] : memref<3xindex>
%stride_w = memref.load %strides[2] : memref<3xindex>
%dilation_d = memref.load %dilations[%c0] : memref<3xindex>
%dilation_h = memref.load %dilations[%c1] : memref<3xindex>
%dilation_w = memref.load %dilations[2] : memref<3xindex>
linalg.generic {
indexing_maps = [
affine_map<(n, c, d, h, w, kd, kh, kw) -> (n, c, d * %stride_d + kd * %dilation_d, h * %stride_h + kh * %dilation_h, w * %stride_w + kw * %dilation_w)>, // Map for input tensor
affine_map<(n, c, d, h, w, kd, kh, kw) -> (kd, kh, kw)>, // Map for kernel tensor
affine_map<(n, c, d, h, w, kd, kh, kw) -> (n, c, d, h, w)> // Map for output tensor
],
iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"],
doc = "3D Max Pooling NCDHW with Strides, Dilations, and Kernel Size"
} ins(%I, %K : memref<?x?x?x?x?xf32>, memref<3xindex>) outs(%O : memref<?x?x?x?x?xf32>) {
^bb0(%input_elem: f32, %kernel_elem: index, %output_elem: f32):
%max_val = arith.maxf %input_elem, %output_elem : f32
linalg.yield %max_val : f32
}
return
}
```
This was implemented based on its source code with the adjustments
mentioned above:
https://github.com/llvm/llvm-project/blob/4ca1b5e094280ef1af40412e3cfcb62dc3cf15bc/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml#L5647
Issues related to this can be found here
https://github.com/nod-ai/SHARK-Turbine/issues/324
2024-01-19 23:39:46 +08:00
|
|
|
// CHECK-LABEL: func @forward_max_pool2d
|
|
|
|
func.func @forward_max_pool2d(%arg0: !torch.vtensor<[?,?,?,?],f32>) -> !torch.vtensor<[?,?,?,?],f32> {
|
Add TorchToIREE and factor out TorchConversion dialect.
This converts a basic list op (torch.prim.ListConstruct) to the IREE
dialect.
```
def forward(self, x: float):
return [x, x]
```
turns into:
```
builtin.func @forward(%arg0: !torch.float) -> !torch.list<!torch.float> {
%0 = torch.prim.ListConstruct %arg0, %arg0 : (!torch.float, !torch.float) -> !torch.list<!torch.float>
return %0 : !torch.list<!torch.float>
}
```
which turns into:
```
builtin.func @forward(%arg0: f64) -> !iree.list<f64> {
%c1 = constant 1 : index
%c0 = constant 0 : index
%c2 = constant 2 : index
%0 = iree.list.create %c2 : !iree.list<f64>
iree.list.set %0[%c0], %arg0 : !iree.list<f64>, f64
iree.list.set %0[%c1], %arg0 : !iree.list<f64>, f64
return %0 : !iree.list<f64>
}
```
As part of doing this, I realized that it was time to formalize the IR
form that we reach right before running TorchTo{Linalg,Std,...}. We now
call it the "Torch backend contract". We then lower the "Torch backend
contract" to the "npcomp backend contract", which involves the new
TorchConversion (`torch_c`) dialect, which holds ops that need to
operate on both the npcomp backend types (e.g. builtin tensors, i1, IREE
list, etc.) and the `!torch` types.
This made more sense, as I realized that if I didn't factor out
`torch_c` then the Torch dialect would have a dependency on IREE
dialect (we previously didn't notice this was an issue because we only
depended on `builtin` types), which seemed wrong to me.
Recommended review order:
- TorchToIREE.cpp / `TorchToIREE/basic.mlir`
- Look at the new structure of createTorchScriptToNpcompBackendPipeline.
It now lives in TorchConversion/Transforms/Passes.cpp and cleanly
calls into `Torch::createTorchScriptToTorchBackendPipeline` for the
frontend lowering to the Torch backend contract.
- Mechanical change extracting
`torch_c.{to,from}_{i1,i64,f64,builtin_tensor,iree_list}` into a new
TorchConversion dialect, and a few passes specific to the lowering
from the Torch backend contract to the npcomp backend contract.
- Minor fixes to TorchToLinalg.cpp to use unconverted operands (now that
we convert lists as part of operand materialization, we need to use
the original operands). Also added test for AtenMaxPool2dOp and fixed
m_TorchConstantIntList.
- TmpDeleteDeadIREELists pass. Temporary pass for deleting dead IREE lists that
are created as part of operand materialization for conv/max pool/avg pool ops
in TorchToLinalg.
2021-08-12 05:40:08 +08:00
|
|
|
%int1 = torch.constant.int 1
|
|
|
|
%int2 = torch.constant.int 2
|
|
|
|
%int3 = torch.constant.int 3
|
|
|
|
%int4 = torch.constant.int 4
|
|
|
|
%int5 = torch.constant.int 5
|
|
|
|
%int6 = torch.constant.int 6
|
|
|
|
%int7 = torch.constant.int 7
|
|
|
|
%int8 = torch.constant.int 8
|
|
|
|
%false = torch.constant.bool false
|
2022-05-13 20:06:24 +08:00
|
|
|
// CHECK: %[[C1:.*]] = torch_c.to_i64 %int1
|
|
|
|
// CHECK: %[[C2:.*]] = torch_c.to_i64 %int2
|
2023-05-09 00:17:49 +08:00
|
|
|
// CHECK: %[[NEUTRAL:.*]] = arith.constant 0xFF800000 : f32
|
2022-01-26 14:16:30 +08:00
|
|
|
// CHECK: %[[PADDED:.*]] = tensor.pad %{{.*}} low[0, 0, 5, 6] high[0, 0, 5, 6]
|
2022-03-16 18:44:23 +08:00
|
|
|
// CHECK: %[[OUT:.*]] = linalg.fill ins(%[[NEUTRAL]] : f32) outs(%{{.*}} : tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
|
2022-05-13 20:06:24 +08:00
|
|
|
// CHECK: %[[T1:.*]] = arith.index_cast %[[C1]] : i64 to index
|
|
|
|
// CHECK: %[[T2:.*]] = arith.index_cast %[[C2]] : i64 to index
|
2022-10-18 12:22:53 +08:00
|
|
|
// CHECK: %[[INIT:.*]] = tensor.empty(%[[T1]], %[[T2]]) : tensor<?x?xf32>
|
Add TorchToIREE and factor out TorchConversion dialect.
This converts a basic list op (torch.prim.ListConstruct) to the IREE
dialect.
```
def forward(self, x: float):
return [x, x]
```
turns into:
```
builtin.func @forward(%arg0: !torch.float) -> !torch.list<!torch.float> {
%0 = torch.prim.ListConstruct %arg0, %arg0 : (!torch.float, !torch.float) -> !torch.list<!torch.float>
return %0 : !torch.list<!torch.float>
}
```
which turns into:
```
builtin.func @forward(%arg0: f64) -> !iree.list<f64> {
%c1 = constant 1 : index
%c0 = constant 0 : index
%c2 = constant 2 : index
%0 = iree.list.create %c2 : !iree.list<f64>
iree.list.set %0[%c0], %arg0 : !iree.list<f64>, f64
iree.list.set %0[%c1], %arg0 : !iree.list<f64>, f64
return %0 : !iree.list<f64>
}
```
As part of doing this, I realized that it was time to formalize the IR
form that we reach right before running TorchTo{Linalg,Std,...}. We now
call it the "Torch backend contract". We then lower the "Torch backend
contract" to the "npcomp backend contract", which involves the new
TorchConversion (`torch_c`) dialect, which holds ops that need to
operate on both the npcomp backend types (e.g. builtin tensors, i1, IREE
list, etc.) and the `!torch` types.
This made more sense, as I realized that if I didn't factor out
`torch_c` then the Torch dialect would have a dependency on IREE
dialect (we previously didn't notice this was an issue because we only
depended on `builtin` types), which seemed wrong to me.
Recommended review order:
- TorchToIREE.cpp / `TorchToIREE/basic.mlir`
- Look at the new structure of createTorchScriptToNpcompBackendPipeline.
It now lives in TorchConversion/Transforms/Passes.cpp and cleanly
calls into `Torch::createTorchScriptToTorchBackendPipeline` for the
frontend lowering to the Torch backend contract.
- Mechanical change extracting
`torch_c.{to,from}_{i1,i64,f64,builtin_tensor,iree_list}` into a new
TorchConversion dialect, and a few passes specific to the lowering
from the Torch backend contract to the npcomp backend contract.
- Minor fixes to TorchToLinalg.cpp to use unconverted operands (now that
we convert lists as part of operand materialization, we need to use
the original operands). Also added test for AtenMaxPool2dOp and fixed
m_TorchConstantIntList.
- TmpDeleteDeadIREELists pass. Temporary pass for deleting dead IREE lists that
are created as part of operand materialization for conv/max pool/avg pool ops
in TorchToLinalg.
2021-08-12 05:40:08 +08:00
|
|
|
// CHECK: linalg.pooling_nchw_max {dilations = dense<[7, 8]> : vector<2xi64>, strides = dense<[3, 4]> : vector<2xi64>} ins(%[[PADDED]], %[[INIT]] : tensor<?x?x?x?xf32>, tensor<?x?xf32>) outs(%[[OUT]] : tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
|
2022-03-16 07:22:56 +08:00
|
|
|
%kernel_size = torch.prim.ListConstruct %int1, %int2 : (!torch.int, !torch.int) -> !torch.list<int>
|
|
|
|
%stride = torch.prim.ListConstruct %int3, %int4 : (!torch.int, !torch.int) -> !torch.list<int>
|
|
|
|
%padding = torch.prim.ListConstruct %int5, %int6 : (!torch.int, !torch.int) -> !torch.list<int>
|
|
|
|
%dilation = torch.prim.ListConstruct %int7, %int8 : (!torch.int, !torch.int) -> !torch.list<int>
|
|
|
|
%4 = torch.aten.max_pool2d %arg0, %kernel_size, %stride, %padding, %dilation, %false : !torch.vtensor<[?,?,?,?],f32>, !torch.list<int>, !torch.list<int>, !torch.list<int>, !torch.list<int>, !torch.bool -> !torch.vtensor<[?,?,?,?],f32>
|
Add TorchToIREE and factor out TorchConversion dialect.
This converts a basic list op (torch.prim.ListConstruct) to the IREE
dialect.
```
def forward(self, x: float):
return [x, x]
```
turns into:
```
builtin.func @forward(%arg0: !torch.float) -> !torch.list<!torch.float> {
%0 = torch.prim.ListConstruct %arg0, %arg0 : (!torch.float, !torch.float) -> !torch.list<!torch.float>
return %0 : !torch.list<!torch.float>
}
```
which turns into:
```
builtin.func @forward(%arg0: f64) -> !iree.list<f64> {
%c1 = constant 1 : index
%c0 = constant 0 : index
%c2 = constant 2 : index
%0 = iree.list.create %c2 : !iree.list<f64>
iree.list.set %0[%c0], %arg0 : !iree.list<f64>, f64
iree.list.set %0[%c1], %arg0 : !iree.list<f64>, f64
return %0 : !iree.list<f64>
}
```
As part of doing this, I realized that it was time to formalize the IR
form that we reach right before running TorchTo{Linalg,Std,...}. We now
call it the "Torch backend contract". We then lower the "Torch backend
contract" to the "npcomp backend contract", which involves the new
TorchConversion (`torch_c`) dialect, which holds ops that need to
operate on both the npcomp backend types (e.g. builtin tensors, i1, IREE
list, etc.) and the `!torch` types.
This made more sense, as I realized that if I didn't factor out
`torch_c` then the Torch dialect would have a dependency on IREE
dialect (we previously didn't notice this was an issue because we only
depended on `builtin` types), which seemed wrong to me.
Recommended review order:
- TorchToIREE.cpp / `TorchToIREE/basic.mlir`
- Look at the new structure of createTorchScriptToNpcompBackendPipeline.
It now lives in TorchConversion/Transforms/Passes.cpp and cleanly
calls into `Torch::createTorchScriptToTorchBackendPipeline` for the
frontend lowering to the Torch backend contract.
- Mechanical change extracting
`torch_c.{to,from}_{i1,i64,f64,builtin_tensor,iree_list}` into a new
TorchConversion dialect, and a few passes specific to the lowering
from the Torch backend contract to the npcomp backend contract.
- Minor fixes to TorchToLinalg.cpp to use unconverted operands (now that
we convert lists as part of operand materialization, we need to use
the original operands). Also added test for AtenMaxPool2dOp and fixed
m_TorchConstantIntList.
- TmpDeleteDeadIREELists pass. Temporary pass for deleting dead IREE lists that
are created as part of operand materialization for conv/max pool/avg pool ops
in TorchToLinalg.
2021-08-12 05:40:08 +08:00
|
|
|
return %4 : !torch.vtensor<[?,?,?,?],f32>
|
|
|
|
}
|
Add aten.max_pool3d support to torch-to-linalg (#2735)
Added verification logic to the abstract_interpreter_lib_gen.py
Also made some unit tests
Initially, I thought we could use `linalg::pooling_ndhwc_max` to help
implement this. However, on a 5-dimensional tensor it does the
pooling on dimensions (2, 3, 4), which is not what we want. We want
pooling on dimensions (3, 4, 5).
To achieve this, we would need to lower our code using the `linalg`
dialect.
Turns out the pooling code in `linalg` looks like this.
```
func @max_pooling_ncdhw(%I: memref<?x?x?x?x?xf32>, %K: memref<3xindex>, %O: memref<?x?x?x?x?xf32>,
%strides: memref<3xindex>, %dilations: memref<3xindex>) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%N = memref.dim %I, %c0 : memref<?x?x?x?x?xf32>
%C = memref.dim %I, %c1 : memref<?x?x?x?x?xf32>
%D = memref.dim %I, 2 : memref<?x?x?x?x?xf32>
%H = memref.dim %I, 3 : memref<?x?x?x?x?xf32>
%W = memref.dim %I, 4 : memref<?x?x?x?x?xf32>
%kernel_d = memref.load %K[%c0] : memref<3xindex>
%kernel_h = memref.load %K[%c1] : memref<3xindex>
%kernel_w = memref.load %K[2] : memref<3xindex>
%stride_d = memref.load %strides[%c0] : memref<3xindex>
%stride_h = memref.load %strides[%c1] : memref<3xindex>
%stride_w = memref.load %strides[2] : memref<3xindex>
%dilation_d = memref.load %dilations[%c0] : memref<3xindex>
%dilation_h = memref.load %dilations[%c1] : memref<3xindex>
%dilation_w = memref.load %dilations[2] : memref<3xindex>
linalg.generic {
indexing_maps = [
affine_map<(n, c, d, h, w, kd, kh, kw) -> (n, c, d * %stride_d + kd * %dilation_d, h * %stride_h + kh * %dilation_h, w * %stride_w + kw * %dilation_w)>, // Map for input tensor
affine_map<(n, c, d, h, w, kd, kh, kw) -> (kd, kh, kw)>, // Map for kernel tensor
affine_map<(n, c, d, h, w, kd, kh, kw) -> (n, c, d, h, w)> // Map for output tensor
],
iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"],
doc = "3D Max Pooling NCDHW with Strides, Dilations, and Kernel Size"
} ins(%I, %K : memref<?x?x?x?x?xf32>, memref<3xindex>) outs(%O : memref<?x?x?x?x?xf32>) {
^bb0(%input_elem: f32, %kernel_elem: index, %output_elem: f32):
%max_val = arith.maxf %input_elem, %output_elem : f32
linalg.yield %max_val : f32
}
return
}
```
This was implemented based on its source code with the adjustments
mentioned above:
https://github.com/llvm/llvm-project/blob/4ca1b5e094280ef1af40412e3cfcb62dc3cf15bc/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml#L5647
Issues related to this can be found here
https://github.com/nod-ai/SHARK-Turbine/issues/324
2024-01-19 23:39:46 +08:00
|
|
|
|
|
|
|
// -----
|
|
|
|
|
|
|
|
// CHECK: #map = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2 * 2 + d5 * 3, d3 * 2 + d6 * 3, d4 * 2 + d7 * 3)>
|
|
|
|
// CHECK: #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d5, d6, d7)>
|
|
|
|
// CHECK: #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4)>
|
|
|
|
// CHECK-LABEL: func @forward_max_pool3d
|
|
|
|
func.func @forward_max_pool3d(%arg0: !torch.vtensor<[?,?,?,?,?],f32>) -> !torch.vtensor<[?,?,?,?,?],f32> {
|
|
|
|
%kernel_size1 = torch.constant.int 8
|
|
|
|
%kernel_size2 = torch.constant.int 8
|
|
|
|
%kernel_size3 = torch.constant.int 8
|
|
|
|
|
|
|
|
%stride1 = torch.constant.int 2
|
|
|
|
%stride2 = torch.constant.int 2
|
|
|
|
%stride3 = torch.constant.int 2
|
|
|
|
|
|
|
|
%padding1 = torch.constant.int 4
|
|
|
|
%padding2 = torch.constant.int 4
|
|
|
|
%padding3 = torch.constant.int 4
|
|
|
|
|
|
|
|
%dilation1 = torch.constant.int 3
|
|
|
|
%dilation2 = torch.constant.int 3
|
|
|
|
%dilation3 = torch.constant.int 3
|
2024-01-26 06:24:28 +08:00
|
|
|
|
Add aten.max_pool3d support to torch-to-linalg (#2735)
Added verification logic to the abstract_interpreter_lib_gen.py
Also made some unit tests
Initially, I thought we could use `linalg::pooling_ndhwc_max` to help
implement this. However, on a 5-dimensional tensor it does the
pooling on dimensions (2, 3, 4), which is not what we want. We want
pooling on dimensions (3, 4, 5).
To achieve this, we would need to lower our code using the `linalg`
dialect.
Turns out the pooling code in `linalg` looks like this.
```
func @max_pooling_ncdhw(%I: memref<?x?x?x?x?xf32>, %K: memref<3xindex>, %O: memref<?x?x?x?x?xf32>,
%strides: memref<3xindex>, %dilations: memref<3xindex>) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%N = memref.dim %I, %c0 : memref<?x?x?x?x?xf32>
%C = memref.dim %I, %c1 : memref<?x?x?x?x?xf32>
%D = memref.dim %I, 2 : memref<?x?x?x?x?xf32>
%H = memref.dim %I, 3 : memref<?x?x?x?x?xf32>
%W = memref.dim %I, 4 : memref<?x?x?x?x?xf32>
%kernel_d = memref.load %K[%c0] : memref<3xindex>
%kernel_h = memref.load %K[%c1] : memref<3xindex>
%kernel_w = memref.load %K[2] : memref<3xindex>
%stride_d = memref.load %strides[%c0] : memref<3xindex>
%stride_h = memref.load %strides[%c1] : memref<3xindex>
%stride_w = memref.load %strides[2] : memref<3xindex>
%dilation_d = memref.load %dilations[%c0] : memref<3xindex>
%dilation_h = memref.load %dilations[%c1] : memref<3xindex>
%dilation_w = memref.load %dilations[2] : memref<3xindex>
linalg.generic {
indexing_maps = [
affine_map<(n, c, d, h, w, kd, kh, kw) -> (n, c, d * %stride_d + kd * %dilation_d, h * %stride_h + kh * %dilation_h, w * %stride_w + kw * %dilation_w)>, // Map for input tensor
affine_map<(n, c, d, h, w, kd, kh, kw) -> (kd, kh, kw)>, // Map for kernel tensor
affine_map<(n, c, d, h, w, kd, kh, kw) -> (n, c, d, h, w)> // Map for output tensor
],
iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"],
doc = "3D Max Pooling NCDHW with Strides, Dilations, and Kernel Size"
} ins(%I, %K : memref<?x?x?x?x?xf32>, memref<3xindex>) outs(%O : memref<?x?x?x?x?xf32>) {
^bb0(%input_elem: f32, %kernel_elem: index, %output_elem: f32):
%max_val = arith.maxf %input_elem, %output_elem : f32
linalg.yield %max_val : f32
}
return
}
```
This was implemented based on its source code with the adjustments
mentioned above:
https://github.com/llvm/llvm-project/blob/4ca1b5e094280ef1af40412e3cfcb62dc3cf15bc/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml#L5647
Issues related to this can be found here
https://github.com/nod-ai/SHARK-Turbine/issues/324
2024-01-19 23:39:46 +08:00
|
|
|
%false = torch.constant.bool false
|
|
|
|
%kernel_size = torch.prim.ListConstruct %kernel_size1, %kernel_size2, %kernel_size3 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
|
|
|
|
%stride = torch.prim.ListConstruct %stride1, %stride2, %stride3 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
|
|
|
|
%padding = torch.prim.ListConstruct %padding1, %padding2, %padding3 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
|
|
|
|
%dilation = torch.prim.ListConstruct %dilation1, %dilation2, %dilation3 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
|
2024-01-26 06:24:28 +08:00
|
|
|
|
Add aten.max_pool3d support to torch-to-linalg (#2735)
Added verification logic to the abstract_interpreter_lib_gen.py
Also made some unit tests
Initially, I thought we could use `linalg::pooling_ndhwc_max` to help
implement this. However, on a 5-dimensional tensor it does the
pooling on dimensions (2, 3, 4), which is not what we want. We want
pooling on dimensions (3, 4, 5).
To achieve this, we would need to lower our code using the `linalg`
dialect.
Turns out the pooling code in `linalg` looks like this.
```
func @max_pooling_ncdhw(%I: memref<?x?x?x?x?xf32>, %K: memref<3xindex>, %O: memref<?x?x?x?x?xf32>,
%strides: memref<3xindex>, %dilations: memref<3xindex>) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%N = memref.dim %I, %c0 : memref<?x?x?x?x?xf32>
%C = memref.dim %I, %c1 : memref<?x?x?x?x?xf32>
%D = memref.dim %I, 2 : memref<?x?x?x?x?xf32>
%H = memref.dim %I, 3 : memref<?x?x?x?x?xf32>
%W = memref.dim %I, 4 : memref<?x?x?x?x?xf32>
%kernel_d = memref.load %K[%c0] : memref<3xindex>
%kernel_h = memref.load %K[%c1] : memref<3xindex>
%kernel_w = memref.load %K[2] : memref<3xindex>
%stride_d = memref.load %strides[%c0] : memref<3xindex>
%stride_h = memref.load %strides[%c1] : memref<3xindex>
%stride_w = memref.load %strides[2] : memref<3xindex>
%dilation_d = memref.load %dilations[%c0] : memref<3xindex>
%dilation_h = memref.load %dilations[%c1] : memref<3xindex>
%dilation_w = memref.load %dilations[2] : memref<3xindex>
linalg.generic {
indexing_maps = [
affine_map<(n, c, d, h, w, kd, kh, kw) -> (n, c, d * %stride_d + kd * %dilation_d, h * %stride_h + kh * %dilation_h, w * %stride_w + kw * %dilation_w)>, // Map for input tensor
affine_map<(n, c, d, h, w, kd, kh, kw) -> (kd, kh, kw)>, // Map for kernel tensor
affine_map<(n, c, d, h, w, kd, kh, kw) -> (n, c, d, h, w)> // Map for output tensor
],
iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"],
doc = "3D Max Pooling NCDHW with Strides, Dilations, and Kernel Size"
} ins(%I, %K : memref<?x?x?x?x?xf32>, memref<3xindex>) outs(%O : memref<?x?x?x?x?xf32>) {
^bb0(%input_elem: f32, %kernel_elem: index, %output_elem: f32):
%max_val = arith.maxf %input_elem, %output_elem : f32
linalg.yield %max_val : f32
}
return
}
```
This was implemented based on its source code with the adjustments
mentioned above:
https://github.com/llvm/llvm-project/blob/4ca1b5e094280ef1af40412e3cfcb62dc3cf15bc/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml#L5647
Issues related to this can be found here
https://github.com/nod-ai/SHARK-Turbine/issues/324
2024-01-19 23:39:46 +08:00
|
|
|
%4 = torch.aten.max_pool3d %arg0, %kernel_size, %stride, %padding, %dilation, %false : !torch.vtensor<[?,?,?,?,?],f32>, !torch.list<int>, !torch.list<int>, !torch.list<int>, !torch.list<int>, !torch.bool -> !torch.vtensor<[?,?,?,?,?],f32>
|
|
|
|
|
|
|
|
// CHECK: %[[MIN_VALUE:.*]] = arith.constant 0xFF800000 : f32
|
|
|
|
// CHECK: %[[PADDED_INPUT_TENSOR:.*]] = tensor.pad %{{.*}} low[0, 0, 4, 4, 4] high[0, 0, 4, 4, 4] {
|
|
|
|
// CHECK-NEXT: ^bb0(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index, %{{.*}}: index, %{{.*}}: index):
|
|
|
|
// CHECK-NEXT: tensor.yield %[[MIN_VALUE:.*]] : f32
|
|
|
|
// CHECK: } : tensor<?x?x?x?x?xf32> to tensor<?x?x?x?x?xf32>
|
2024-01-26 06:24:28 +08:00
|
|
|
|
Add aten.max_pool3d support to torch-to-linalg (#2735)
Added verification logic to the abstract_interpreter_lib_gen.py
Also made some unit tests
Initially, I thought we could use `linalg::pooling_ndhwc_max` to help
implement this. However, on a 5-dimensional tensor it does the
pooling on dimensions (2, 3, 4), which is not what we want. We want
pooling on dimensions (3, 4, 5).
To achieve this, we would need to lower our code using the `linalg`
dialect.
Turns out the pooling code in `linalg` looks like this.
```
func @max_pooling_ncdhw(%I: memref<?x?x?x?x?xf32>, %K: memref<3xindex>, %O: memref<?x?x?x?x?xf32>,
%strides: memref<3xindex>, %dilations: memref<3xindex>) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%N = memref.dim %I, %c0 : memref<?x?x?x?x?xf32>
%C = memref.dim %I, %c1 : memref<?x?x?x?x?xf32>
%D = memref.dim %I, 2 : memref<?x?x?x?x?xf32>
%H = memref.dim %I, 3 : memref<?x?x?x?x?xf32>
%W = memref.dim %I, 4 : memref<?x?x?x?x?xf32>
%kernel_d = memref.load %K[%c0] : memref<3xindex>
%kernel_h = memref.load %K[%c1] : memref<3xindex>
%kernel_w = memref.load %K[2] : memref<3xindex>
%stride_d = memref.load %strides[%c0] : memref<3xindex>
%stride_h = memref.load %strides[%c1] : memref<3xindex>
%stride_w = memref.load %strides[2] : memref<3xindex>
%dilation_d = memref.load %dilations[%c0] : memref<3xindex>
%dilation_h = memref.load %dilations[%c1] : memref<3xindex>
%dilation_w = memref.load %dilations[2] : memref<3xindex>
linalg.generic {
indexing_maps = [
affine_map<(n, c, d, h, w, kd, kh, kw) -> (n, c, d * %stride_d + kd * %dilation_d, h * %stride_h + kh * %dilation_h, w * %stride_w + kw * %dilation_w)>, // Map for input tensor
affine_map<(n, c, d, h, w, kd, kh, kw) -> (kd, kh, kw)>, // Map for kernel tensor
affine_map<(n, c, d, h, w, kd, kh, kw) -> (n, c, d, h, w)> // Map for output tensor
],
iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"],
doc = "3D Max Pooling NCDHW with Strides, Dilations, and Kernel Size"
} ins(%I, %K : memref<?x?x?x?x?xf32>, memref<3xindex>) outs(%O : memref<?x?x?x?x?xf32>) {
^bb0(%input_elem: f32, %kernel_elem: index, %output_elem: f32):
%max_val = arith.maxf %input_elem, %output_elem : f32
linalg.yield %max_val : f32
}
return
}
```
This was implemented based on its source code with the adjustments
mentioned above:
https://github.com/llvm/llvm-project/blob/4ca1b5e094280ef1af40412e3cfcb62dc3cf15bc/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml#L5647
Issues related to this can be found here
https://github.com/nod-ai/SHARK-Turbine/issues/324
2024-01-19 23:39:46 +08:00
|
|
|
// CHECK: %[[OUTPUT_TENSOR:.*]] = linalg.fill ins(%[[MIN_VALUE:.*]] : f32) outs(%{{.*}} : tensor<?x?x?x?x?xf32>) -> tensor<?x?x?x?x?xf32>
|
|
|
|
// CHECK: %[[MAX_3D_POOL:.*]] = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%[[PADDED_INPUT_TENSOR:.*]], %{{.*}} : tensor<?x?x?x?x?xf32>, tensor<?x?x?xf32>) outs(%[[OUTPUT_TENSOR:.*]] : tensor<?x?x?x?x?xf32>) {
|
|
|
|
// CHECK-NEXT: ^bb0(%[[CURRENT_VALUE:.*]]: f32, %[[KERNEL:.*]]: f32, %[[ACC_OUT:.*]]: f32):
|
|
|
|
// CHECK-NEXT: %[[MAXF:.*]] = arith.maximumf %[[CURRENT_VALUE:.*]], %[[ACC_OUT:.*]] : f32
|
|
|
|
// CHECK-NEXT: linalg.yield %[[MAXF:.*]] : f32
|
|
|
|
// CHECK: } -> tensor<?x?x?x?x?xf32>
|
|
|
|
return %4 : !torch.vtensor<[?,?,?,?,?],f32>
|
|
|
|
}
|