TPU-KERNEL开发参考手册
介绍
TPU架构图
TPU的内存类型
TPU的工作模式
TPU编程模型
Host端
Device端
基础定义
舍入模式
功能函数
数据同步函数
辅助函数
GDMA 操作
基础数据操作
数据类型转换与舍入操作
一元操作
二元操作
浮点二元操作
整型二元操作
比较选择函数
浮点矩阵操作
整型矩阵操作
浮点神经网络操作
定点神经网络操作
激活函数
scatter 和 gather 操作
特殊函数
量化操作
HAU 操作
TPU-KERNEL开发参考手册
索引
索引
A | B | C | D | F | G | L | N | R | S | T | U | V
A
addr_t(C++ type)
addr(C++ member)
ALIGN(C macro)
B
bf16(C++ member)
bfloat16(C++ union)
bits(C++ member), [1]
C
context(C++ member)
D
data_type_t(C++ enum)
dim2::h(C++ member)
dim2::w(C++ member)
dim2(C++ class)
dim4::c(C++ member)
dim4::h(C++ member)
dim4::n(C++ member)
dim4::w(C++ member)
dim4(C++ class)
DIV_UP(C macro)
DT_BFP16(C++ enumerator)
DT_FP16(C++ enumerator)
DT_FP32(C++ enumerator)
DT_INT16(C++ enumerator)
DT_INT32(C++ enumerator)
DT_INT8(C++ enumerator)
DT_UINT16(C++ enumerator)
DT_UINT32(C++ enumerator)
DT_UINT8(C++ enumerator)
F
f16(C++ member)
f32(C++ member)
float16(C++ union)
G
global_addr_t(C++ type)
L
l2_sram_addr_t(C++ type)
local_addr_t(C++ type)
LOCAL_MEM_SIZE(C macro)
N
NPU_NUM(C macro)
R
RM_DOWN(C++ enumerator)
RM_HALF_AWAY_FROM_ZERO(C++ enumerator)
RM_HALF_DOWN(C++ enumerator)
RM_HALF_TO_EVEN(C++ enumerator)
RM_HALF_UP(C++ enumerator)
RM_TOWARDS_ZERO(C++ enumerator)
RM_UP(C++ enumerator)
rounding_mode_t(C++ enum)
S
s16(C++ member)
s32(C++ member)
s8(C++ member)
scalar_t(C++ member)
scalar_t(C++ union)
SCALAR(C++ enumerator)
system_addr_t(C++ type)
T
TENSOR(C++ enumerator)
tpu_aligned_feature_size(C++ function)
tpu_aligned_stride(C++ function)
tpu_bank_index(C++ function)
tpu_bank_num(C++ function)
tpu_bdc_abs(C++ function)
tpu_bdc_and_C(C++ function)
tpu_bdc_and(C++ function)
tpu_bdc_arithmetic_sequence_bcast(C++ function)
tpu_bdc_arithmetic_sequence_distribute(C++ function)
tpu_bdc_arithmetic_sequence_general(C++ function)
tpu_bdc_arithmetic_shift_C(C++ function)
tpu_bdc_arithmetic_shift(C++ function)
tpu_bdc_batch_bcast_h_gather_exception(C++ function)
tpu_bdc_batch_bcast_h_gather(C++ function)
tpu_bdc_batch_bcast_h_scatter(C++ function)
tpu_bdc_batch_bcast_w_gather_exception(C++ function)
tpu_bdc_batch_bcast_w_gather(C++ function)
tpu_bdc_batch_bcast_w_mask_select(C++ function)
tpu_bdc_batch_bcast_w_scatter(C++ function)
tpu_bdc_cast(C++ function)
tpu_bdc_cpy_cross_npu(C++ function)
tpu_bdc_cpy(C++ function)
tpu_bdc_cw_trans(C++ function)
tpu_bdc_equal_C(C++ function)
tpu_bdc_equal_select(C++ function)
tpu_bdc_equal(C++ function)
tpu_bdc_fp32_arccos(C++ function)
tpu_bdc_fp32_arcsin(C++ function)
tpu_bdc_fp32_C_div(C++ function)
tpu_bdc_fp32_cos(C++ function)
tpu_bdc_fp32_cot(C++ function)
tpu_bdc_fp32_dequant(C++ function)
tpu_bdc_fp32_div_C(C++ function)
tpu_bdc_fp32_div(C++ function)
tpu_bdc_fp32_elu(C++ function)
tpu_bdc_fp32_erfc(C++ function)
tpu_bdc_fp32_erf(C++ function)
tpu_bdc_fp32_expm1(C++ function)
tpu_bdc_fp32_exp(C++ function)
tpu_bdc_fp32_gelu_fast(C++ function)
tpu_bdc_fp32_gelu(C++ function)
tpu_bdc_fp32_log1p(C++ function)
tpu_bdc_fp32_logx(C++ function)
tpu_bdc_fp32_log(C++ function)
tpu_bdc_fp32_mac_C(C++ function)
tpu_bdc_fp32_mac(C++ function)
tpu_bdc_fp32_mish(C++ function)
tpu_bdc_fp32_mm_left_const(C++ function)
tpu_bdc_fp32_mm_left_trans(C++ function)
tpu_bdc_fp32_mm(C++ function)
tpu_bdc_fp32_pc_dequant(C++ function)
tpu_bdc_fp32_pc_requant(C++ function)
tpu_bdc_fp32_pow_C(C++ function), [1]
tpu_bdc_fp32_pow(C++ function)
tpu_bdc_fp32_reciprocal(C++ function)
tpu_bdc_fp32_requant(C++ function)
tpu_bdc_fp32_rsqrt(C++ function)
tpu_bdc_fp32_selu(C++ function)
tpu_bdc_fp32_sigmoid(C++ function)
tpu_bdc_fp32_silu(C++ function)
tpu_bdc_fp32_sin(C++ function)
tpu_bdc_fp32_softplus(C++ function)
tpu_bdc_fp32_sqrt(C++ function)
tpu_bdc_fp32_tanh(C++ function)
tpu_bdc_fp32_tan(C++ function)
tpu_bdc_fp32_tunable_C_div(C++ function)
tpu_bdc_fp32_tunable_div_C(C++ function)
tpu_bdc_fp32_tunable_div(C++ function)
tpu_bdc_fp32_tunable_reciprocal(C++ function)
tpu_bdc_fp32_tunable_rsqrt(C++ function)
tpu_bdc_fp32_tunable_sqrt(C++ function)
tpu_bdc_fp32_vc_div(C++ function)
tpu_bdc_fp_add_bias_sqr(C++ function)
tpu_bdc_fp_add_C_sqr(C++ function)
tpu_bdc_fp_add_C(C++ function)
tpu_bdc_fp_add(C++ function)
tpu_bdc_fp_avg_pool2d(C++ function)
tpu_bdc_fp_bias(C++ function)
tpu_bdc_fp_C_sub(C++ function)
tpu_bdc_fp_ceil(C++ function)
tpu_bdc_fp_conv2d_for_deconv2d(C++ function)
tpu_bdc_fp_conv2d_kernel_const(C++ function)
tpu_bdc_fp_conv2d(C++ function)
tpu_bdc_fp_depthwise2d(C++ function)
tpu_bdc_fp_diff_abs_C(C++ function)
tpu_bdc_fp_diff_abs(C++ function)
tpu_bdc_fp_floor(C++ function)
tpu_bdc_fp_hsigmoid(C++ function)
tpu_bdc_fp_hswish(C++ function)
tpu_bdc_fp_ins_avg_pool2d(C++ function)
tpu_bdc_fp_max_pool2d(C++ function)
tpu_bdc_fp_mm_all_trans(C++ function)
tpu_bdc_fp_mm_L_const_all_trans(C++ function)
tpu_bdc_fp_mm_L_const_R_trans(C++ function)
tpu_bdc_fp_mm_L_const(C++ function)
tpu_bdc_fp_mm_R_const_all_trans(C++ function)
tpu_bdc_fp_mm_R_const(C++ function)
tpu_bdc_fp_mm_R_trans(C++ function)
tpu_bdc_fp_mm(C++ function)
tpu_bdc_fp_mul_C(C++ function)
tpu_bdc_fp_mul(C++ function)
tpu_bdc_fp_round(C++ function)
tpu_bdc_fp_scale_bias_C(C++ function)
tpu_bdc_fp_scale_bias(C++ function)
tpu_bdc_fp_scale(C++ function)
tpu_bdc_fp_sub_bias_sqr(C++ function)
tpu_bdc_fp_sub_C_sqr(C++ function)
tpu_bdc_fp_sub_C(C++ function)
tpu_bdc_fp_sub(C++ function)
tpu_bdc_fp_taylor(C++ function)
tpu_bdc_fp_vc_add(C++ function)
tpu_bdc_fp_vc_mul(C++ function)
tpu_bdc_fp_vc_sub(C++ function)
tpu_bdc_greater_C(C++ function)
tpu_bdc_greater_equal_C(C++ function)
tpu_bdc_greater_equal(C++ function)
tpu_bdc_greater_select(C++ function)
tpu_bdc_greater(C++ function)
tpu_bdc_hw_gather_exception(C++ function)
tpu_bdc_hw_gather(C++ function)
tpu_bdc_hw_scatter(C++ function)
tpu_bdc_int8_avg_pool2d(C++ function)
tpu_bdc_int8_mac_C(C++ function)
tpu_bdc_int8_mac(C++ function)
tpu_bdc_int8_max_pool2d(C++ function)
tpu_bdc_int8_mm_L_const(C++ function)
tpu_bdc_int8_mm_L_trans(C++ function)
tpu_bdc_int8_mm(C++ function)
tpu_bdc_int8_pc_zp_mm_all_trans(C++ function)
tpu_bdc_int8_pc_zp_mm_L_const_all_trans(C++ function)
tpu_bdc_int8_pc_zp_mm_L_const_R_trans(C++ function)
tpu_bdc_int8_pc_zp_mm_L_const(C++ function)
tpu_bdc_int8_pc_zp_mm_R_const_all_trans(C++ function)
tpu_bdc_int8_pc_zp_mm_R_const(C++ function)
tpu_bdc_int8_pc_zp_mm_R_trans(C++ function)
tpu_bdc_int8_pc_zp_mm(C++ function)
tpu_bdc_int8_zp_mm_all_trans(C++ function)
tpu_bdc_int8_zp_mm_L_const_all_trans(C++ function)
tpu_bdc_int8_zp_mm_L_const_R_trans(C++ function)
tpu_bdc_int8_zp_mm_L_const(C++ function)
tpu_bdc_int8_zp_mm_R_const_all_trans(C++ function)
tpu_bdc_int8_zp_mm_R_const(C++ function)
tpu_bdc_int8_zp_mm_R_trans(C++ function)
tpu_bdc_int8_zp_mm(C++ function)
tpu_bdc_int_add_C(C++ function)
tpu_bdc_int_add(C++ function)
tpu_bdc_int_C_sub(C++ function)
tpu_bdc_int_dequant(C++ function)
tpu_bdc_int_max_C(C++ function)
tpu_bdc_int_min_C(C++ function)
tpu_bdc_int_mm_L_const(C++ function)
tpu_bdc_int_mm_L_trans(C++ function)
tpu_bdc_int_mm(C++ function)
tpu_bdc_int_mul_C(C++ function)
tpu_bdc_int_mul(C++ function)
tpu_bdc_int_pc_dequant(C++ function)
tpu_bdc_int_pc_requant(C++ function)
tpu_bdc_int_pcs_add_C(C++ function)
tpu_bdc_int_pcs_add(C++ function)
tpu_bdc_int_pcs_C_sub(C++ function)
tpu_bdc_int_pcs_mm_L_const(C++ function)
tpu_bdc_int_pcs_mm_L_trans(C++ function)
tpu_bdc_int_pcs_mm(C++ function)
tpu_bdc_int_pcs_mul_C(C++ function)
tpu_bdc_int_pcs_mul(C++ function)
tpu_bdc_int_pcs_sub_C(C++ function)
tpu_bdc_int_pcs_sub(C++ function)
tpu_bdc_int_requant(C++ function)
tpu_bdc_int_sub_C(C++ function)
tpu_bdc_int_sub(C++ function)
tpu_bdc_int_vc_add(C++ function)
tpu_bdc_int_vc_mul(C++ function)
tpu_bdc_int_vc_sub(C++ function)
tpu_bdc_less_C(C++ function)
tpu_bdc_less_equal_C(C++ function)
tpu_bdc_less_equal(C++ function)
tpu_bdc_less_select(C++ function)
tpu_bdc_less(C++ function)
tpu_bdc_load_fp32_arcsin_coeff(C++ function)
tpu_bdc_load_fp32_cos_coeff(C++ function)
tpu_bdc_load_fp32_erf_coeff(C++ function)
tpu_bdc_load_fp32_exp_coeff(C++ function)
tpu_bdc_load_fp32_exp_table(C++ function)
tpu_bdc_load_fp32_log_coeff(C++ function)
tpu_bdc_load_fp32_sin_coeff(C++ function)
tpu_bdc_load_fp32_tan_coeff(C++ function)
tpu_bdc_logical_shift_C(C++ function)
tpu_bdc_logical_shift(C++ function)
tpu_bdc_max_C(C++ function)
tpu_bdc_maximum_greater_select(C++ function)
tpu_bdc_max(C++ function)
tpu_bdc_min_C(C++ function)
tpu_bdc_minimum_less_select(C++ function)
tpu_bdc_min(C++ function)
tpu_bdc_neg(C++ function)
tpu_bdc_not_equal_C(C++ function)
tpu_bdc_not_equal(C++ function)
tpu_bdc_not(C++ function)
tpu_bdc_npu_bcast(C++ function)
tpu_bdc_or_C(C++ function)
tpu_bdc_or(C++ function)
tpu_bdc_prelu(C++ function)
tpu_bdc_relu(C++ function)
tpu_bdc_set_C(C++ function)
tpu_bdc_sign(C++ function)
tpu_bdc_table_lookup(C++ function)
tpu_bdc_vc_and(C++ function)
tpu_bdc_vc_equal(C++ function)
tpu_bdc_vc_greater_equal(C++ function)
tpu_bdc_vc_greater(C++ function)
tpu_bdc_vc_less_equal(C++ function)
tpu_bdc_vc_less(C++ function)
tpu_bdc_vc_max(C++ function)
tpu_bdc_vc_min(C++ function)
tpu_bdc_vc_not_equal(C++ function)
tpu_bdc_vc_or(C++ function)
tpu_bdc_vc_xor(C++ function)
tpu_bdc_w_gather_exception(C++ function)
tpu_bdc_w_gather(C++ function)
tpu_bdc_w_scatter(C++ function)
tpu_bdc_wc_trans(C++ function)
tpu_bdc_xor_C(C++ function)
tpu_bdc_xor(C++ function)
tpu_channle_num_per_npu(C++ function)
tpu_compact_stride(C++ function)
tpu_continuous_stride(C++ function)
tpu_data_type_bits(C++ function)
tpu_data_type_size(C++ function)
tpu_eu_num(C++ function)
tpu_flush_cache(C++ function)
tpu_gdma_channel_bcast_L2L(C++ function)
tpu_gdma_channel_bcast_S2L(C++ function)
tpu_gdma_compact_L2S(C++ function)
tpu_gdma_compact_nc_trans_L2S(C++ function)
tpu_gdma_compact_nc_trans_S2L(C++ function)
tpu_gdma_compact_S2L(C++ function)
tpu_gdma_compress_RACU_L2S(C++ function), [1]
tpu_gdma_compress_RACU_max_meta_bytes(C++ function)
tpu_gdma_compress_RACU_max_racu_bytes(C++ function)
tpu_gdma_cpy_cw_trans_L2L(C++ function)
tpu_gdma_cpy_cw_trans_L2S(C++ function)
tpu_gdma_cpy_cw_trans_S2L(C++ function)
tpu_gdma_cpy_cw_trans_S2S(C++ function)
tpu_gdma_cpy_L2L(C++ function)
tpu_gdma_cpy_L2S(C++ function)
tpu_gdma_cpy_nc_trans_L2L(C++ function)
tpu_gdma_cpy_nc_trans_L2S(C++ function)
tpu_gdma_cpy_nc_trans_S2L(C++ function)
tpu_gdma_cpy_nc_trans_S2S(C++ function)
tpu_gdma_cpy_S2L(C++ function)
tpu_gdma_cpy_S2S(C++ function)
tpu_gdma_decompress_normal_S2L(C++ function)
tpu_gdma_general_cpy_L2S(C++ function)
tpu_gdma_general_cpy_S2L(C++ function)
tpu_gdma_h_gather_L2L(C++ function)
tpu_gdma_h_gather_L2S(C++ function)
tpu_gdma_h_gather_S2L(C++ function)
tpu_gdma_h_gather_S2S(C++ function)
tpu_gdma_h_scatter_L2L(C++ function)
tpu_gdma_h_scatter_L2S(C++ function)
tpu_gdma_h_scatter_S2L(C++ function)
tpu_gdma_h_scatter_S2S(C++ function)
tpu_gdma_mask_select_L2S(C++ function), [1]
tpu_gdma_matrix_L2S(C++ function)
tpu_gdma_matrix_S2L(C++ function)
tpu_gdma_matrix_trans_L2S(C++ function)
tpu_gdma_matrix_trans_S2L(C++ function)
tpu_gdma_nonzero_L2S(C++ function), [1]
tpu_gdma_reverse_L2L(C++ function)
tpu_gdma_reverse_L2S(C++ function)
tpu_gdma_reverse_S2L(C++ function)
tpu_gdma_reverse_S2S(C++ function)
tpu_gdma_set_C_local(C++ function)
tpu_gdma_set_C_system(C++ function)
tpu_gdma_system_cpy(C++ function)
tpu_gdma_vector_L2S(C++ function)
tpu_gdma_vector_S2L(C++ function)
tpu_global_mem_addr(C++ function)
tpu_hau_line_gather(C++ function)
tpu_hau_sort_natural_index(C++ function)
tpu_hau_sort_specific_index(C++ function)
tpu_hau_sort(C++ function)
tpu_initialize(C++ function)
tpu_invalidate_cache(C++ function)
tpu_is_parallel_state(C++ function)
tpu_kernel_launch_async(C++ function)
tpu_kernel_launch_sync(C++ function)
tpu_kernel_sync(C++ function)
tpu_l2_sram_addr(C++ function)
tpu_l2_sram_get_start_addr(C++ function)
tpu_l2_sram_size(C++ function)
tpu_line_aligned_stride(C++ function)
tpu_local_mem_addr_unified(C++ function)
tpu_local_mem_addr(C++ function)
tpu_local_mem_get_start_addr(C++ function)
tpu_local_mem_size_per_npu(C++ function)
tpu_npu_index(C++ function)
tpu_npu_num(C++ function)
tpu_parallel_end(C++ function)
tpu_parallel_start(C++ function)
tpu_poll(C++ function)
type(C++ member)
U
u16(C++ member)
u32(C++ member)
u8(C++ member)
V
var_context_t(C++ union)
var_type_t(C++ enum)
variable_t(C++ class)
VECTOR(C++ enumerator)