//===- aten_ops.cpp ---------------------------------------------*- C++ -*-===//
//
// This file is licensed under a pytorch-style license
// See frontends/pytorch/LICENSE for license information.
//
//===----------------------------------------------------------------------===//

// This file implements C libraries that are targeted by MLIR code generation
// from the ATen dialect. This library is intended to support a functional
// proof of concept rather than optimized for high performance. Most of the
// functions are implemented by calling back into the torch libraries.

// NOTE: the original include list was not preserved; the headers below are
// those required by the code in this file.
#include <assert.h>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <vector>

#include <ATen/ATen.h>
#include <ATen/CPUType.h>

#include "nnpack.h"

#include <torch/torch.h>

namespace {

// Mirrors the MLIR memref descriptor passed by the generated _mlir_ciface_*
// wrappers: allocated pointer, aligned pointer, offset, sizes, and strides.
template <typename T, int N> struct tensor_t {
  T *d;
  T *aligned;
  size_t offset;
  size_t shape[N];
  size_t stride[N];

  size_t index(size_t n, size_t channel, size_t row, size_t col) const {
    size_t channels = shape[1];
    size_t height = shape[2];
    size_t width = shape[3];
    return n * height * width * channels + channel * height * width +
           row * width + col;
  }

  tensor_t() {
    d = aligned = nullptr;
    offset = 0;
    for (int i = 0; i < N; i++)
      shape[i] = stride[i] = 0;
  }
};

template <typename T, int N>
std::vector<int64_t> translate_shape(tensor_t<T, N> *t) {
  std::vector<int64_t> shape;
  for (int i = 0; i < N; i++) {
    shape.push_back(t->shape[i]);
    // std::cout << i << " shape " << t->shape[i] << std::endl;
  }
  return shape;
}

template <typename T, int N>
std::vector<int64_t> translate_stride(tensor_t<T, N> *t) {
  std::vector<int64_t> stride;
  for (int i = 0; i < N; i++) {
    stride.push_back(t->stride[i]);
    // std::cout << i << " stride " << t->stride[i] << std::endl;
  }
  return stride;
}

template <typename T, int N>
void dumpTensor(std::ostream &o, tensor_t<T, N> *t) {
  o << "Shape:";
  for (int i = 0; i < N; i++)
    o << t->shape[i] << " ";
  o << "Stride:";
  for (int i = 0; i < N; i++)
    o << t->stride[i] << " ";
  o << "\n";
}

// Wrap the raw buffer in an ATen tensor without copying the data.
template <typename T, int N>
at::Tensor to_torch(tensor_t<T, N> *t,
                    const at::TensorOptions &options = at::TensorOptions()) {
  // std::cout << "to_torch\n";
  return torch::from_blob((void *)t->d, translate_shape(t),
                          translate_stride(t), options);
}

template <typename T, int N>
void mm_out(tensor_t<T, N> *a, tensor_t<T, N> *b, tensor_t<T, N> *r);

template <typename T, int N>
void add_out(tensor_t<T, N> *a, tensor_t<T, N> *b, T alpha,
             tensor_t<T, N> *r) {
  at::Tensor torch_a = to_torch(a);
  at::Tensor torch_b = to_torch(b);

  at::Tensor result = at::native::add(torch_a, torch_b, alpha).clone();
  memcpy(r->d, result.data_ptr(), result.numel() * sizeof(T));
}

template <typename T>
void addmm_out(tensor_t<T, 1> *a, tensor_t<T, 2> *b, tensor_t<T, 2> *c,
               int32_t alpha, int32_t beta, tensor_t<T, 2> *r) {
  at::Tensor torch_a = to_torch(a);
  at::Tensor torch_b = to_torch(b);
  at::Tensor torch_c = to_torch(c);

  at::Tensor result =
      at::native::addmm(torch_a, torch_b, torch_c, alpha, beta).clone();
  memcpy(r->d, result.data_ptr(), result.numel() * sizeof(T));
}

template <typename T, int M, int N>
void as_strided_out(tensor_t<T, M> *a,
                    /*size*/ int32_t sz0, int32_t sz1, int32_t sz2, int32_t sz3,
                    /*stride*/ int32_t sd0, int32_t sd1, int32_t sd2,
                    int32_t sd3, int32_t offset, tensor_t<T, N> *r) {
  at::Tensor input = to_torch(a);

  std::vector<int64_t> size;
  std::vector<int64_t> stride;

  c10::optional<int64_t> storage_offset;
  if (offset != 0)
    storage_offset = offset;

  if (N > 0) {
    size.push_back(sz0);
    stride.push_back(sd0);
  }
  if (N > 1) {
    size.push_back(sz1);
    stride.push_back(sd1);
  }
  if (N > 2) {
    size.push_back(sz2);
    stride.push_back(sd2);
  }
  if (N > 3) {
    size.push_back(sz3);
    stride.push_back(sd3);
  }

  std::vector<int64_t> sizeRef{size};
  std::vector<int64_t> strideRef{stride};

  at::Tensor result =
      at::native::as_strided_tensorimpl(input, sizeRef, strideRef,
                                        storage_offset)
          .clone();
  memcpy(r->d, result.data_ptr(), result.numel() * sizeof(T));
}

// FIXME: stride, padding, dilation, output_padding should be IntArrayRef
template <typename T, int N>
void conv2d_out(tensor_t<T, N> *t, tensor_t<T, N> *weight,
                tensor_t<T, 1> *bias, int32_t stride, int32_t pad,
                int32_t dilation, tensor_t<T, N> *r) {
  at::Tensor torch_t = to_torch(t);
  at::Tensor torch_w = to_torch(weight);
  at::Tensor torch_b = to_torch(bias);
  int64_t groups = 1;

  at::Tensor result =
      at::native::conv2d(torch_t, torch_w, torch_b, stride, pad, dilation,
                         groups)
          .clone();
  memcpy(r->d, result.data_ptr(), result.numel() * sizeof(T));
}

template <typename T, int N>
void conv2d_backward_out(tensor_t<T, N> *grad_output, tensor_t<T, N> *input,
                         tensor_t<T, N> *weight, int32_t stride, int32_t pad,
                         int32_t dilation, tensor_t<T, N> *r0,
                         tensor_t<T, N> *r1, tensor_t<T, 1> *r2) {
  const at::Tensor &arg_grad = to_torch(grad_output);
  const at::Tensor &arg_input = to_torch(input);
  const at::Tensor &arg_weight = to_torch(weight);

  std::vector<int64_t> p{pad, pad};
  std::vector<int64_t> s{stride, stride};
  std::vector<int64_t> d{dilation, dilation};

  std::array<bool, 3> output_mask{true, true, true};

  std::tuple<at::Tensor, at::Tensor, at::Tensor> grads =
      at::native::mkldnn_convolution_backward(arg_input, arg_grad, arg_weight,
                                              p, s, d, 1, output_mask);

  auto result0 = std::get<0>(grads);
  auto result1 = std::get<1>(grads);
  auto result2 = std::get<2>(grads);

  memcpy(r0->d, result0.data_ptr(), result0.numel() * sizeof(T));
  memcpy(r1->d, result1.data_ptr(), result1.numel() * sizeof(T));
  memcpy(r2->d, result2.data_ptr(), result2.numel() * sizeof(T));
}

template <typename T, int N>
void log_softmax_out(tensor_t<T, N> *t, int32_t dim, bool half_to_float,
                     tensor_t<T, N> *r) {
  at::Tensor input = to_torch(t);

  at::Tensor result = at::native::log_softmax_cpu(input, dim, half_to_float);
  memcpy(r->d, result.data_ptr(), result.numel() * sizeof(T));
}

template <typename T, int N>
void log_softmax_backward_data_out(tensor_t<T, N> *a, tensor_t<T, N> *b,
                                   int32_t c, tensor_t<T, N> *d,
                                   tensor_t<T, N> *r) {
  at::Tensor inputA = to_torch(a);
  at::Tensor inputB = to_torch(b);
  at::Tensor inputD = to_torch(d);

  at::Tensor result =
      at::native::log_softmax_backward_cpu(inputA, inputB, c, inputD);

  memcpy(r->d, result.data_ptr(), result.numel() * sizeof(T));
}

template <typename T, int N>
void max_pool2d_with_indices_out(tensor_t<T, N> *t, int32_t c, int32_t d,
                                 int32_t e, int32_t f, bool ceil_mode,
                                 tensor_t<T, N> *r0,
                                 tensor_t<int64_t, N> *r1) {
  at::Tensor input = to_torch(t);
  std::vector<int64_t> kernel{c, c};
  std::vector<int64_t> stride{d, d};
  std::vector<int64_t> padding{e, e};
  std::vector<int64_t> dilation{f, f};

  auto result = at::native::max_pool2d_with_indices_cpu(
      input, kernel, stride, padding, dilation, ceil_mode);

  at::Tensor outTensor = std::get<0>(result);
  at::Tensor idxTensor = std::get<1>(result);

  memcpy(r0->d, outTensor.data_ptr(), outTensor.numel() * sizeof(T));
  memcpy(r1->d, idxTensor.data_ptr(), idxTensor.numel() * sizeof(int64_t));
}

template <typename T, int N>
void max_pool2d_with_indices_backward_out(tensor_t<T, N> *a, tensor_t<T, N> *b,
                                          int32_t c, int32_t d, int32_t e,
                                          int32_t f, bool g,
                                          tensor_t<int64_t, N> *h,
                                          tensor_t<T, N> *r) {
  const at::Tensor &inputA = to_torch(a);
  const at::Tensor &inputB = to_torch(b);

  at::TensorOptions options(at::ScalarType::Long);
  const at::Tensor &inputH = to_torch(h, options);

  std::vector<int64_t> kernel{c, c};
  std::vector<int64_t> stride{d, d};
  std::vector<int64_t> padding{e, e};
  std::vector<int64_t> dilation{f, f};

  at::Tensor result = at::native::max_pool2d_with_indices_backward_cpu(
      inputA, inputB, kernel, stride, padding, dilation, g, inputH);

  memcpy(r->d, result.data_ptr(), result.numel() * sizeof(T));
}

template <typename T, int N>
void mm_out(tensor_t<T, N> *a, tensor_t<T, N> *b, tensor_t<T, N> *r) {
  at::Tensor inputA = to_torch(a);
  at::Tensor inputB = to_torch(b);

  at::Tensor result = inputA.matmul(inputB);
  memcpy(r->d, result.data_ptr(), result.numel() * sizeof(T));
}

template <typename T, int N>
void mul_out(tensor_t<T, N> *a, tensor_t<T, N> *b, tensor_t<T, N> *r) {
  at::Tensor inputA = to_torch(a);
  at::Tensor inputB = to_torch(b);

  at::Tensor result = at::native::mul(inputA, inputB);
  memcpy(r->d, result.data_ptr(), result.numel() * sizeof(T));
}
template <typename T, int N>
void relu_out(tensor_t<T, N> *a, tensor_t<T, N> *r) {
  at::Tensor inputA = to_torch(a);

  at::Tensor result = at::native::relu(inputA);
  memcpy(r->d, result.data_ptr(), result.numel() * sizeof(T));
}

template <typename T, int N>
void t_out(tensor_t<T, N> *a, tensor_t<T, N> *r) {
  size_t h = a->shape[0];
  size_t w = a->shape[1];

  for (size_t i = 0; i < h; i++)
    for (size_t j = 0; j < w; j++)
      r->d[j * h + i] = a->d[i * w + j];
}

template <typename T, int N>
void threshold_backward_out(tensor_t<T, N> *a, tensor_t<T, N> *b, int32_t c,
                            tensor_t<T, N> *r) {
  at::Tensor inputA = to_torch(a);
  at::Tensor inputB = to_torch(b);

  at::Tensor result = at::native::threshold_backward(inputA, inputB, c);
  memcpy(r->d, result.data_ptr(), result.numel() * sizeof(T));
}

template <typename T, int M, int N>
void view_out(tensor_t<T, M> *a, int32_t b, int32_t c, int32_t d, int32_t e,
              tensor_t<T, N> *r) {
  size_t numel = 1;
  for (size_t i = 0; i < M; i++)
    numel *= a->shape[i];

  if (N == 1)
    c = d = e = 1;
  if (N == 2)
    d = e = 1;
  if (N == 3)
    e = 1;

  int inferred = 0;
  if (b == -1)
    inferred++;
  if (c == -1)
    inferred++;
  if (d == -1)
    inferred++;
  if (e == -1)
    inferred++;
  assert(inferred <= 1 &&
         "aten.view Error: only one dimension can be inferred");

  if (b == -1)
    b = numel / (c * d * e);
  if (c == -1)
    c = numel / (b * d * e);
  if (d == -1)
    d = numel / (b * c * e);
  if (e == -1)
    e = numel / (b * c * d);

  if (N > 0)
    r->shape[0] = b;
  if (N > 1)
    r->shape[1] = c;
  if (N > 2)
    r->shape[2] = d;
  if (N > 3)
    r->shape[3] = e;

  memcpy(r->d, a->d, numel * sizeof(T));
}

} // namespace

extern "C" {

// add_out
void _mlir_ciface_add_1F32_1F32_1F32_out(tensor_t<float, 1> *a,
                                         tensor_t<float, 1> *b, int32_t i,
                                         tensor_t<float, 1> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  add_out<float, 1>(a, b, i, r);
}

void _mlir_ciface_add_2F32_2F32_2F32_out(tensor_t<float, 2> *a,
                                         tensor_t<float, 2> *b, int32_t i,
                                         tensor_t<float, 2> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  add_out<float, 2>(a, b, i, r);
}

void _mlir_ciface_add_3F32_3F32_3F32_out(tensor_t<float, 3> *a,
                                         tensor_t<float, 3> *b, int32_t i,
                                         tensor_t<float, 3> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  add_out<float, 3>(a, b, i, r);
}

void _mlir_ciface_add_4F32_4F32_4F32_out(tensor_t<float, 4> *a,
                                         tensor_t<float, 4> *b, int32_t i,
                                         tensor_t<float, 4> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  add_out<float, 4>(a, b, i, r);
}

// addmm_out
void _mlir_ciface_addmm_2F32_1F32_2F32_2F32_out(
    tensor_t<float, 1> *a, tensor_t<float, 2> *b, tensor_t<float, 2> *c,
    int32_t alpha, int32_t beta, tensor_t<float, 2> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  addmm_out<float>(a, b, c, alpha, beta, r);
}

// as_strided_out
void _mlir_ciface_as_strided_1F32_1F32_out(
    tensor_t<float, 1> *a,
    /*size*/ int32_t sz0, int32_t sz1, int32_t sz2, int32_t sz3,
    /*stride*/ int32_t sd0, int32_t sd1, int32_t sd2, int32_t sd3,
    int32_t offset, tensor_t<float, 1> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  as_strided_out<float, 1, 1>(a, sz0, sz1, sz2, sz3, sd0, sd1, sd2, sd3,
                              offset, r);
}

void _mlir_ciface_as_strided_4F32_2F32_out(
    tensor_t<float, 2> *a,
    /*size*/ int32_t sz0, int32_t sz1, int32_t sz2, int32_t sz3,
    /*stride*/ int32_t sd0, int32_t sd1, int32_t sd2, int32_t sd3,
    int32_t offset, tensor_t<float, 4> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  // std::cout << sz0 << " " << sz1 << " " << sz2 << " " << sz3 << "\n";
  // std::cout << sd0 << " " << sd1 << " " << sd2 << " " << sd3 << "\n";
  as_strided_out<float, 2, 4>(a, sz0, sz1, sz2, sz3, sd0, sd1, sd2, sd3,
                              offset, r);
}

// conv2d_out
void _mlir_ciface_conv2d_4F32_4F32_4F32_1F32_out(
    tensor_t<float, 4> *t, tensor_t<float, 4> *weight,
    tensor_t<float, 1> *bias, int32_t stride, int32_t padding,
    int32_t dilation, tensor_t<float, 4> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  conv2d_out<float, 4>(t, weight, bias, stride, padding, dilation, r);
}
void _mlir_ciface_conv2d_relu_4F32_4F32_4F32_1F32_out(
    tensor_t<float, 4> *t, tensor_t<float, 4> *weight,
    tensor_t<float, 1> *bias, int32_t stride, int32_t padding,
    int32_t dilation, tensor_t<float, 4> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  conv2d_out<float, 4>(t, weight, bias, stride, padding, dilation, r);
  relu_out<float, 4>(r, r);
}

// conv2d_backward_out
void _mlir_ciface_conv2d_backward_4F32_4F32_1F32_4F32_4F32_4F32_out(
    tensor_t<float, 4> *grad_output, tensor_t<float, 4> *t,
    tensor_t<float, 4> *weight, int32_t stride, int32_t padding,
    int32_t dilation, tensor_t<float, 4> *r0, tensor_t<float, 4> *r1,
    tensor_t<float, 1> *r2) {
  // std::cout << "aten_ops " << __func__ << "\n";
  conv2d_backward_out<float, 4>(grad_output, t, weight, stride, padding,
                                dilation, r0, r1, r2);
}

// div
float *div_0F32_0F32_0F32(float *a, float *b) {
  // std::cout << "aten_ops " << __func__ << "\n";
  float *ret = (float *)malloc(sizeof(float));
  *ret = *a / *b;
  return ret;
}

// log_softmax_out
void _mlir_ciface_log_softmax_1F32_1F32_out(tensor_t<float, 1> *t, int32_t dim,
                                            bool half_to_float,
                                            tensor_t<float, 1> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  log_softmax_out<float, 1>(t, dim, half_to_float, r);
}

void _mlir_ciface_log_softmax_2F32_2F32_out(tensor_t<float, 2> *t, int32_t dim,
                                            bool half_to_float,
                                            tensor_t<float, 2> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  log_softmax_out<float, 2>(t, dim, half_to_float, r);
}

void _mlir_ciface_log_softmax_3F32_3F32_out(tensor_t<float, 3> *t, int32_t dim,
                                            bool half_to_float,
                                            tensor_t<float, 3> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  log_softmax_out<float, 3>(t, dim, half_to_float, r);
}

void _mlir_ciface_log_softmax_4F32_4F32_out(tensor_t<float, 4> *t, int32_t dim,
                                            bool half_to_float,
                                            tensor_t<float, 4> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  log_softmax_out<float, 4>(t, dim, half_to_float, r);
}

// log_softmax_backward_data_out
void _mlir_ciface_log_softmax_backward_data_2F32_2F32_2F32_2F32_out(
    tensor_t<float, 2> *a, tensor_t<float, 2> *b, int32_t c,
    tensor_t<float, 2> *d, tensor_t<float, 2> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  log_softmax_backward_data_out<float, 2>(a, b, c, d, r);
}

void _mlir_ciface_log_softmax_backward_data_4F32_4F32_4F32_4F32_out(
    tensor_t<float, 4> *a, tensor_t<float, 4> *b, int32_t c,
    tensor_t<float, 4> *d, tensor_t<float, 4> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  log_softmax_backward_data_out<float, 4>(a, b, c, d, r);
}

// max_pool2d_out
void _mlir_ciface_max_pool2d_with_indices_4F32_4I64_4F32_out(
    tensor_t<float, 4> *t, int32_t kernel, int32_t pad, int32_t stride,
    int32_t dilation, bool ceil_mode, tensor_t<float, 4> *r0,
    tensor_t<int64_t, 4> *r1) {
  // std::cout << "aten_ops " << __func__ << "\n";
  max_pool2d_with_indices_out<float, 4>(t, kernel, pad, stride, dilation,
                                        ceil_mode, r0, r1);
}

// max_pool2d backward_out
void _mlir_ciface_max_pool2d_with_indices_backward_4F32_4F32_4F32_4I64_out(
    tensor_t<float, 4> *a, tensor_t<float, 4> *b, int32_t c, int32_t d,
    int32_t e, int32_t f, bool g, tensor_t<int64_t, 4> *h,
    tensor_t<float, 4> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  max_pool2d_with_indices_backward_out<float, 4>(a, b, c, d, e, f, g, h, r);
}

// mm_out
void _mlir_ciface_mm_2F32_2F32_2F32_out(tensor_t<float, 2> *a,
                                        tensor_t<float, 2> *b,
                                        tensor_t<float, 2> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  mm_out<float, 2>(a, b, r);
}

// mul_out
void _mlir_ciface_mul_1F32_1F32_1F32_out(tensor_t<float, 1> *a,
                                         tensor_t<float, 1> *b,
                                         tensor_t<float, 1> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  mul_out<float, 1>(a, b, r);
}

void _mlir_ciface_mul_2F32_2F32_2F32_out(tensor_t<float, 2> *a,
                                         tensor_t<float, 2> *b,
                                         tensor_t<float, 2> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  mul_out<float, 2>(a, b, r);
}

void _mlir_ciface_mul_3F32_3F32_3F32_out(tensor_t<float, 3> *a,
                                         tensor_t<float, 3> *b,
                                         tensor_t<float, 3> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  mul_out<float, 3>(a, b, r);
}

void _mlir_ciface_mul_4F32_4F32_4F32_out(tensor_t<float, 4> *a,
                                         tensor_t<float, 4> *b,
                                         tensor_t<float, 4> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  mul_out<float, 4>(a, b, r);
}
// nll_loss2d_forward_out
void _mlir_ciface_nll_loss2d_forward_1F32_1F32_4F32_3I64_1F32_out(
    tensor_t<float, 4> *a, tensor_t<int64_t, 3> *b, tensor_t<float, 1> *c,
    int64_t d, int64_t e, tensor_t<float, 1> *r0, tensor_t<float, 1> *r1) {
  // std::cout << "aten_ops " << __func__ << "\n";
  using T = float;
  at::Tensor inputA = to_torch(a);
  at::TensorOptions options(at::ScalarType::Long);
  at::Tensor inputB = to_torch(b, options);
  at::Tensor inputC = to_torch(c);

  std::tuple<at::Tensor, at::Tensor> result =
      at::CPUType::nll_loss2d_forward(inputA, inputB, inputC, d, e);

  at::Tensor result0 = std::get<0>(result);
  at::Tensor result1 = std::get<1>(result);

  memcpy(r0->d, result0.data_ptr(), result0.numel() * sizeof(T));
  memcpy(r1->d, result1.data_ptr(), result1.numel() * sizeof(T));
}

// nll_loss2d_backward_out
void _mlir_ciface_nll_loss2d_backward_4F32_1F32_4F32_3I64_1F32_1F32_out(
    tensor_t<float, 1> *a, tensor_t<float, 4> *b, tensor_t<int64_t, 3> *c,
    tensor_t<float, 1> *d, int32_t e, int32_t f, tensor_t<float, 1> *g,
    tensor_t<float, 4> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  using T = float;
  at::Tensor inputA = to_torch(a);
  at::Tensor inputB = to_torch(b);
  at::TensorOptions options(at::ScalarType::Long);
  at::Tensor inputC = to_torch(c, options);
  at::Tensor inputD = to_torch(d);
  at::Tensor inputG = to_torch(g);

  at::Tensor result = at::CPUType::nll_loss2d_backward(inputA, inputB, inputC,
                                                        inputD, e, f, inputG);

  memcpy(r->d, result.data_ptr(), result.numel() * sizeof(T));
}

void _mlir_ciface_nll_loss_backward_2F32_1F32_2F32_1I64_1F32_1F32_out(
    tensor_t<float, 1> *a, tensor_t<float, 2> *b, tensor_t<int64_t, 1> *c,
    tensor_t<float, 1> *d, int32_t e, int32_t f, tensor_t<float, 1> *g,
    tensor_t<float, 2> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  using T = float;
  at::Tensor inputA = to_torch(a);
  at::Tensor inputB = to_torch(b);
  at::TensorOptions options(at::ScalarType::Long);
  at::Tensor inputC = to_torch(c, options);
  at::Tensor inputD = to_torch(d);
  at::Tensor inputG = to_torch(g);

  at::Tensor result = at::CPUType::nll_loss_backward(inputA, inputB, inputC,
                                                      inputD, e, f, inputG);

  memcpy(r->d, result.data_ptr(), result.numel() * sizeof(T));
}

// nll_loss_forward_out
void _mlir_ciface_nll_loss_forward_1F32_1F32_2F32_1I64_1F32_out(
    tensor_t<float, 2> *a, tensor_t<int64_t, 1> *b, tensor_t<float, 1> *c,
    int64_t d, int64_t e, tensor_t<float, 1> *r0, tensor_t<float, 1> *r1) {
  // std::cout << "aten_ops " << __func__ << "\n";
  using T = float;
  at::Tensor inputA = to_torch(a);
  at::TensorOptions options(at::ScalarType::Long);
  at::Tensor inputB = to_torch(b, options);
  at::Tensor inputC = to_torch(c);

  std::tuple<at::Tensor, at::Tensor> result =
      at::CPUType::nll_loss_forward(inputA, inputB, inputC, d, e);

  at::Tensor result0 = std::get<0>(result);
  at::Tensor result1 = std::get<1>(result);

  memcpy(r0->d, result0.data_ptr(), result0.numel() * sizeof(T));
  memcpy(r1->d, result1.data_ptr(), result1.numel() * sizeof(T));
}

// relu_out
void _mlir_ciface_relu_1F32_1F32_out(tensor_t<float, 1> *a,
                                     tensor_t<float, 1> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  relu_out<float, 1>(a, r);
}

void _mlir_ciface_relu_2F32_2F32_out(tensor_t<float, 2> *a,
                                     tensor_t<float, 2> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  relu_out<float, 2>(a, r);
}

void _mlir_ciface_relu_3F32_3F32_out(tensor_t<float, 3> *a,
                                     tensor_t<float, 3> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  relu_out<float, 3>(a, r);
}

void _mlir_ciface_relu_4F32_4F32_out(tensor_t<float, 4> *a,
                                     tensor_t<float, 4> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  relu_out<float, 4>(a, r);
}

// t_out
void _mlir_ciface_t_2F32_2F32_out(tensor_t<float, 2> *a,
                                  tensor_t<float, 2> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  t_out<float, 2>(a, r);
}

// threshold_backward_out
void _mlir_ciface_threshold_backward_1F32_1F32_1F32_out(
    tensor_t<float, 1> *a, tensor_t<float, 1> *b, int32_t c,
    tensor_t<float, 1> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  threshold_backward_out<float, 1>(a, b, c, r);
}
void _mlir_ciface_threshold_backward_2F32_2F32_2F32_out(
    tensor_t<float, 2> *a, tensor_t<float, 2> *b, int32_t c,
    tensor_t<float, 2> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  threshold_backward_out<float, 2>(a, b, c, r);
}

void _mlir_ciface_threshold_backward_3F32_3F32_3F32_out(
    tensor_t<float, 3> *a, tensor_t<float, 3> *b, int32_t c,
    tensor_t<float, 3> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  threshold_backward_out<float, 3>(a, b, c, r);
}

void _mlir_ciface_threshold_backward_4F32_4F32_4F32_out(
    tensor_t<float, 4> *a, tensor_t<float, 4> *b, int32_t c,
    tensor_t<float, 4> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  threshold_backward_out<float, 4>(a, b, c, r);
}

// view_out
void _mlir_ciface_view_1F32_4F32_out(tensor_t<float, 4> *a, int32_t b,
                                     int32_t c, int32_t d, int32_t e,
                                     tensor_t<float, 1> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  view_out<float, 4, 1>(a, b, c, d, e, r);
}

void _mlir_ciface_view_1F32_3F32_out(tensor_t<float, 3> *a, int32_t b,
                                     int32_t c, int32_t d, int32_t e,
                                     tensor_t<float, 1> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  view_out<float, 3, 1>(a, b, c, d, e, r);
}

void _mlir_ciface_view_1F32_2F32_out(tensor_t<float, 2> *a, int32_t b,
                                     int32_t c, int32_t d, int32_t e,
                                     tensor_t<float, 1> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  view_out<float, 2, 1>(a, b, c, d, e, r);
}

void _mlir_ciface_view_2F32_4F32_out(tensor_t<float, 4> *a, int32_t b,
                                     int32_t c, int32_t d, int32_t e,
                                     tensor_t<float, 2> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  view_out<float, 4, 2>(a, b, c, d, e, r);
}

void _mlir_ciface_view_4F32_1F32_out(tensor_t<float, 1> *a, int32_t b,
                                     int32_t c, int32_t d, int32_t e,
                                     tensor_t<float, 4> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  view_out<float, 1, 4>(a, b, c, d, e, r);
}

void _mlir_ciface_view_4F32_2F32_out(tensor_t<float, 2> *a, int32_t b,
                                     int32_t c, int32_t d, int32_t e,
                                     tensor_t<float, 4> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  view_out<float, 2, 4>(a, b, c, d, e, r);
}

void _mlir_ciface_view_4F32_3F32_out(tensor_t<float, 3> *a, int32_t b,
                                     int32_t c, int32_t d, int32_t e,
                                     tensor_t<float, 4> *r) {
  // std::cout << "aten_ops " << __func__ << "\n";
  view_out<float, 3, 4>(a, b, c, d, e, r);
}

} // extern "C"
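
//===----------------------------------------------------------------------===//
// Usage sketch (not part of the library proper): a minimal illustration of how
// code lowered through the MLIR C interface is expected to call the wrappers
// above, assuming tensor_t mirrors the memref descriptor layout (allocated
// pointer, aligned pointer, offset, sizes, strides). The ATEN_OPS_EXAMPLE
// guard is hypothetical and not defined by any build; enable it manually to
// compile this snippet into a standalone test.
//===----------------------------------------------------------------------===//
#ifdef ATEN_OPS_EXAMPLE
#include <cstdio>

int main() {
  // Two 2x2 row-major float matrices and a result buffer.
  float a_data[4] = {1, 2, 3, 4};
  float b_data[4] = {5, 6, 7, 8};
  float r_data[4] = {0, 0, 0, 0};

  tensor_t<float, 2> a, b, r;
  a.d = a.aligned = a_data;
  b.d = b.aligned = b_data;
  r.d = r.aligned = r_data;
  a.shape[0] = a.shape[1] = b.shape[0] = b.shape[1] = 2;
  r.shape[0] = r.shape[1] = 2;
  a.stride[0] = b.stride[0] = r.stride[0] = 2;
  a.stride[1] = b.stride[1] = r.stride[1] = 1;

  // Matrix multiply through the C interface; expected result: 19 22 43 50.
  _mlir_ciface_mm_2F32_2F32_2F32_out(&a, &b, &r);

  for (int i = 0; i < 4; i++)
    printf("%f\n", r_data[i]);
  return 0;
}
#endif // ATEN_OPS_EXAMPLE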