Environment#
The hardware environment this article is based on is the Ascend 910B3, and the software environment includes CANN 7.0-RC1, PyTorch 1.11.0, and Ascend PyTorch Adapter v5.0.rc3-pytorch1.11.0. The situation on other CANN and PyTorch versions may differ slightly.
Registration Process#
Adding a Custom Operator in the Ascend PyTorch Adapter#
References:
Add the npu_add_custom function in torch_npu/csrc/aten/npu_native_functions.yaml:
1
2
| custom:
- func: npu_add_custom(Tensor x, Tensor y) -> Tensor # 添加的函数
|
Add the file AddCustomKernelNpu.cpp in torch_npu/csrc/aten/ops/op_api:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
| #include <torch/csrc/autograd/custom_function.h>
#include "torch_npu/csrc/framework/utils/OpAdapter.h"
#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
#include "torch_npu/csrc/aten/ops/op_api/op_api_common.h"
namespace at_npu {
namespace native {
using torch::autograd::Function;
using torch::autograd::AutogradContext;
at::Tensor NPUNativeFunctions::npu_add_custom(const at::Tensor& x, const at::Tensor& y) {
at::Tensor result = OpPreparation::ApplyTensor(x); // 创建输出内存
// calculate the output result of the NPU
EXEC_NPU_CMD(aclnnAddCustom, x, y, result);
return result;
}
} // namespace native
} // namespace at_npu
|
Afterwards, recompile and reinstall torch_npu.
Adding the Custom Operator Implementation in CANN#
References:
First, define the operator description file add_custom.json:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
| [
{
"op": "AddCustom",
"language": "cpp",
"input_desc": [
{
"name": "x",
"param_type": "required",
"format": [
"ND"
],
"type": [
"fp16"
]
},
{
"name": "y",
"param_type": "required",
"format": [
"ND"
],
"type": [
"fp16"
]
}
],
"output_desc": [
{
"name": "z",
"param_type": "required",
"format": [
"ND"
],
"type": [
"fp16"
]
}
]
}
]
|
Run
1
| msopgen gen -i add_custom.json -c ai_core-Ascend910B3 -f pytorch -out . -lan cpp
|
to generate the operator project:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
| AddCustom
├── build.sh
├── cmake
│ ├── config.cmake
│ ├── func.cmake
│ ├── intf.cmake
│ ├── makeself.cmake
│ └── util
├── CMakeLists.txt
├── CMakePresets.json // 修改 ASCEND_CANN_PACKAGE_PATH
├── framework
├── op_host
│ ├── add_custom_tiling.h // 定义 length 和 tiling 相关信息
│ ├── add_custom.cpp // 算子 host 侧实现
│ ├── CMakeLists.txt
├── op_kernel
│ ├── CMakeLists.txt
│ ├── add_custom.cpp // 算子 kernel 侧实现
└── scripts
|
In CMakePresets.json, change ASCEND_CANN_PACKAGE_PATH to the CANN installation path.
The content of op_host/add_custom_tiling.h is as follows (a simple implementation):
1
2
3
4
5
6
7
8
9
| #include "register/tilingdata_base.h"
namespace optiling {
BEGIN_TILING_DATA_DEF(AddCustomTilingData)
TILING_DATA_FIELD_DEF(uint32_t, size); // 定义 tensor size
END_TILING_DATA_DEF;
REGISTER_TILING_DATA_CLASS(AddCustom, AddCustomTilingData)
}
|
In op_host/add_custom.cpp, modify the block_dim used when the operator is invoked:
1
| context->SetBlockDim(20); // 910B3 的 block_dim
|
op_kernel/add_custom.cpp is the concrete implementation of the operator:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
|
#include "kernel_operator.h"
#ifdef __DAV_C220_VEC__
extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) {
GET_TILING_DATA(tiling_data, tiling);
uint32_t M = tiling_data.size; // 从 tiling_data 中获取 tensor size
// ...
}
#else
// 重要:CANN 会尝试不同的 ccec 编译参数以推断算子的类型(VEC、CUBE、MIXED),如果不创建一个 stub 函数将会编译失败
extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) {
pip_barrier(PIPE_ALL);
}
#endif
|
Compilation and Deployment#
1
2
| $ bash build.sh
$ ./custom_opp_euleros_aarch64.run
|
Calling it in PyTorch:
1
2
3
4
5
6
| import torch
import torch_npu
# ...
z = torch.npu_add_custom(x, y) # 由于是运行时编译,第一次运行时需要等待编译
|
Registration Principles#
TODO
References#
TODO