BMNET 前端插件
BMLang是提供用户针对Sophon TPU的上层编程模型,所实现的算法可以在Sophon TPU中运行,详细见 BMLang 编程 。
以下是以exp layer为例介绍基于BMLang开发网络中未支持layer的步骤:
首先需要修改prototxt中bmnetc不支持的layer type的param。bmnetc提供给用户自定义layer的google proto parameters格式如下:
message UserDefinedParameter { repeated float float_value = 1; repeated string string_value = 2; }
修改.prototxt里面的type为Exp的layer param。下面第一段是caffe原版的Exp prototxt格式,第二段是改用user_defined_param之后的格式(三个float_value依次对应base、scale、shift):
layer { name: "exp" type: "Exp" bottom: "log" top: "usr" exp_param { base: 2 scale: 2 shift: 3 } }
layer { name: "exp" type: "Exp" bottom: "log" top: "usr" user_defined_param { float_value: 2 float_value: 2 float_value: 3 } }
然后在imp_bmnetc的include和src里,加入exp layer的.hpp和.cpp代码文件。
#ifndef CAFFE_USER_DEFINED_LAYER_HPP_ #define CAFFE_USER_DEFINED_LAYER_HPP_ #include <vector> #include "bmnetc/blob.hpp" #include "bmnetc/layer.hpp" #include "bmnetc/proto/bmnetc.pb.h" #include "bmnetc/layers/neuron_layer.hpp" namespace bmnetc { /** * @brief Computes @f$ y = \gamma ^ {\alpha x + \beta} @f$, * as specified by the scale @f$ \alpha @f$, shift @f$ \beta @f$, * and base @f$ \gamma @f$. */ template <typename Dtype> class ExpLayer : public NeuronLayer<Dtype> { public: /** * @param param provides UserDefinedParameter UserDefined_param, * with ExpLayer options: */ explicit ExpLayer(const LayerParameter& param) : NeuronLayer<Dtype>(param) {} virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top); virtual inline const char* type() const { return "Exp"; } protected: /** * @param bottom input Blob vector (length 1) * -# @f$ (N \times C \times H \times W) @f$ * the inputs @f$ x @f$ * @param top output Blob vector (length 1) * -# @f$ (N \times C \times H \times W) @f$ * the computed outputs @f$ * y = \gamma ^ {\alpha x + \beta} * @f$ */ virtual void CheckBlobCounts(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top){}; virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top); virtual void layer_deploy(void* p_bmcompiler, const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top); virtual void Forward_bmtpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top); void bmtpu_module(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top); Dtype inner_scale_, outer_scale_; }; } // namespace bmnetc #endif
其中,LayerSetUp需要获取的是UserDefinedParameter,然后根据该UserDefinedParameter设置Exp Layer的变量。
在Forward_bmtpu中,需要set_mode(BMLANG_COMPUTE)(示例代码中对应bmlang::COMPUTE_CPU),表明使用CPU来模拟bmtpu_module的计算,以便调试用BMLang编写的exp layer实现是否正确。
layer_deploy则需要set_mode(BMLANG_COMPILE)(示例代码中对应bmlang::COMPILE_TPU),此时进入compile模式。在运行bmnetc编译caffe model时,会进入layer_deploy函数,并生成可在BMTPU芯片运行的bmodel。
#include <vector> #include "exp_layer.hpp" #include "bmnetc/util/math_functions.hpp" namespace bmnetc { template <typename Dtype> void ExpLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { NeuronLayer<Dtype>::LayerSetUp(bottom, top); const UserDefinedParameter& param = this->layer_param_.user_defined_param(); const Dtype base = param.float_value(0); if (base != Dtype(-1)) { CHECK_GT(base, 0) << "base must be strictly positive."; } // If base == -1, interpret the base as e and set log_base = 1 exactly. // Otherwise, calculate its log UserDefinedlicitly. const Dtype log_base = (base == Dtype(-1)) ? Dtype(1) : log(base); CHECK(!isnan(log_base)) << "NaN result: log(base) = log(" << base << ") = " << log_base; CHECK(!isinf(log_base)) << "Inf result: log(base) = log(" << base << ") = " << log_base; const Dtype input_scale = param.float_value(1); const Dtype input_shift = param.float_value(2); inner_scale_ = log_base * input_scale; outer_scale_ = (input_shift == Dtype(0)) ? Dtype(1) : ( (base != Dtype(-1)) ? 
pow(base, input_shift) : exp(input_shift) ); } template <typename Dtype> void ExpLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { const int count = bottom[0]->count(); const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); if (inner_scale_ == Dtype(1)) { bmnetc_exp(count, bottom_data, top_data); } else { bmnetc_cpu_scale(count, inner_scale_, bottom_data, top_data); bmnetc_exp(count, top_data, top_data); } if (outer_scale_ != Dtype(1)) { bmnetc_scal(count, outer_scale_, top_data); } } template <typename Dtype> void ExpLayer<Dtype>::layer_deploy(void* p_bmcompiler, const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { #if defined(BMCOMPILE) && defined(BMLANG) bmlang::set_mode(bmlang::COMPILE_TPU); bmtpu_module(bottom, top); #endif } template <typename Dtype> void ExpLayer<Dtype>::Forward_bmtpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { #if defined(BMCOMPILE) && defined(BMLANG) bmlang::set_mode(bmlang::COMPUTE_CPU); bmtpu_module(bottom, top); #endif } template <typename Dtype> void ExpLayer<Dtype>::bmtpu_module(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { #if defined(BMCOMPILE) && defined(BMLANG) // Create a bmlang::Tensor for the input bmlang::Tensor bottom_tensor(this->layer_param_.bottom(0), bmlang::FLOAT32, bottom[0]->shape()); // Set data to the created bottom_tensor // If we use bmlang::COMPUTE_CPU or bmlang::BOTH mode for debug, // we must set data from bottom[0] to the bottom_tensor bottom_tensor.set_data((const char*)bottom[0]->cpu_data()); // Create a bmlang::Tensor for the output bmlang::Tensor top_tensor(this->, bmlang::FLOAT32, top[0]->shape()); if (inner_scale_ != Dtype(1)) { // Create a bmlang::Scalar with a constant value "inner_scale_" bmlang::Scalar tmp_inner_scale_((float)inner_scale_, bmlang::FLOAT32); // Create a intermediate bmlang::Tensor bmlang::Tensor mul_tensor("mul_res", 
bmlang::FLOAT32); // mul_tensor = bottom_tensor * tmp_inner_scale_ bmlang::muls(bottom_tensor, tmp_inner_scale_, mul_tensor); // top_tensor = EXP(mul_tensor) bmlang::active(mul_tensor, top_tensor, ACTIVE_EXP); } else { // top_tensor = EXP(bottom_tensor) bmlang::active(bottom_tensor, top_tensor, ACTIVE_EXP); } if (outer_scale_ != Dtype(1)) { // Create a bmlang::Scalar with a constant value "outer_scale_" bmlang::Scalar tmp_outer_scale_((float)outer_scale_, bmlang::FLOAT32); // top_tensor = top_tensor * tmp_outer_scale_ bmlang::muls(top_tensor, tmp_outer_scale_, top_tensor); } // Get data to the top_tensor // If we use bmlang::COMPUTE_CPU or bmlang::BOTH mode for debug, // we must get data from top_tensor to top[0] top_tensor.get_data((char*)top[0]->mutable_cpu_data()); #endif } INSTANTIATE_CLASS(ExpLayer); REGISTER_LAYER_CLASS(Exp); } // namespace bmnetc
完成代码后,在imp_bmnetc下make即可编译,将生成的libimpbmnetc.so覆盖bmcompiler/。或者直接make && make install即可。
启动bmnetc编译该caffe model,其中prototxt为修改后的,.caffemodel为已经用原版.prototxt训练好的。最终生成bmodel,在bmruntime时可将bmodel下发到BMTPU芯片中运行。
BMCPU是提供用户对TPU不能实现的layer进行CPU编程的环境,BMCPU开发用户CPU Layer详细见 BMCPU 插件使用 。
这里介绍如何在bmnetc编程环境下将BMCPU中开发的用户CPU Layer插入到Caffe网络中,并与其他layer一起进行网络级编译,生成bmodel。
这里假设用户已经在BMCPU下开发完了CPU Layer程序,编译、测试通过,并且通过make install安装。
这里以user_exp_layer为例,示例.hpp代码请见imp_bmnetc的example代码。与BMLang开发不同的是,没有Forward_bmtpu和bmtpu_module函数。
user_exp_layer.cpp代码如下,与BMLang开发不同的是,Forward_cpu通过调用bmcpu.h中的bmcpu_user_process来实现,用户需要将user_cpu_param_t设置正确。bmcpu_user_process根据op_type找到用户自己的cpu layer,然后将param传给用户cpu layer并启动计算。
layer_deploy函数用于给bmnetc加入该用户cpu layer到graph中,该函数需要调用bmcompiler的add_user_cpu_layer接口。add_user_cpu_layer需要传入输入输出tensor的信息,以及user_cpu_param_t参数。
template <typename Dtype> void UserExpLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); user_cpu_param_t param; param.op_type = USER_EXP; param.u.exp.inner_scale_ = this->inner_scale_; param.u.exp.outer_scale_ = this->outer_scale_; output_shapes_v.push_back(top[0]->shape()); void* bmcpu_handle = bmcpu_init(); bmcpu_user_process(bmcpu_handle, ¶m, vector<float *>(1, (float *)(const_cast<Dtype *>(bottom_data))), vector<vector<int>>(1, bottom[0]->shape()), vector<float *>(1, (float *)(top_data)), output_shapes_v); bmcpu_uninit(bmcpu_handle); } template <typename Dtype> void UserExpLayer<Dtype>::layer_deploy(void* p_bmcompiler, const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { #if defined(BMCOMPILE) user_cpu_param_t param; param.op_type = USER_EXP; param.u.exp.inner_scale_ = this->inner_scale_; param.u.exp.outer_scale_ = this->outer_scale_; #define LAYER_INPUT_NUM 1 #define LAYER_OUTPUT_NUM 1 /* input shape/dim/name */ int* input_shapes[LAYER_INPUT_NUM]; int input_shape_dims[LAYER_INPUT_NUM]; char *input_names[LAYER_INPUT_NUM]; for (int idx = 0; idx < LAYER_INPUT_NUM; idx++) { input_shapes[idx] = const_cast<int*>(&(bottom[idx]->shape()[0])); input_shape_dims[idx] = (int)(bottom[idx]->shape().size()); input_names[idx] = const_cast<char*>(this->layer_param_.bottom(idx).c_str()); } /* output shape/dim/name */ int* output_shapes[LAYER_OUTPUT_NUM]; int output_shape_dims[LAYER_OUTPUT_NUM]; char *output_names[LAYER_OUTPUT_NUM]; for (int idx = 0; idx < LAYER_OUTPUT_NUM; idx++) { output_shapes[idx] = const_cast<int*>(&(top[idx]->shape()[0])); output_shape_dims[idx] = (int)(top[idx]->shape().size()); output_names[idx] = const_cast<char*>(this->; } add_user_cpu_layer( p_bmcompiler, LAYER_INPUT_NUM, input_shapes, input_shape_dims, input_names, LAYER_OUTPUT_NUM, output_shapes, output_shape_dims, output_names, (void 
*)(¶m), sizeof(param) ); #endif }
完成代码后,在imp_bmnetc下make即可编译,将生成的libimpbmnetc.so覆盖bmcompiler/。或者直接make && make install即可。
启动bmnetc编译该caffe model,其中prototxt为修改后的,.caffemodel为已经用原版.prototxt训练好的。最终生成bmodel,在bmruntime时可将bmodel下发到BMTPU芯片中运行。
BMKernel是提供用户针对Sophon TPU的底层编程模型,所实现的算法可以在Sophon TPU中运行,详细见 OKKernel(原BMKernel)介绍 。
这里介绍如何在bmnetc插件编程环境下将BMKernel开放的用户TPU Layer插入到Caffe网络中,并与其他的layer一起进行网络级编译,生成bmodel。
详细见 编译器自定义TPU层插件 。
完成代码后,在imp_bmnetc下make即可编译,将生成的libimpbmnetc.so覆盖bmcompiler/。或者直接make && make install即可。
启动bmnetc编译该caffe model,其中prototxt为修改后的,.caffemodel为已经用原版.prototxt训练好的。最终生成bmodel,在bmruntime时可将bmodel下发到BMTPU芯片中运行。
BMLang是提供用户针对BMTPU编程的接口,所实现的算法可以在BMTPU中运行,详细见 BMLang 编程 。
在进行BMLang开发前,需要将包含自定义layer实现的mxnet python包安装替换原来的mxnet python包。如果已经安装的mxnet python包中已经包含用户自定义layer的cpu实现,则直接进入第2步。
'''
The following is an example that use bmlang(python) to implement a operation
for SOPHON TPU.

A typical bmlang program for an OP implementation should include
  1. A compute core that is written by bmlang
  2. A register that import the bmlang program to bmnetm compiler
  3. (Optional) A debug module that check the accuracy of bmlang program
'''
import bmlang
import bmnetm

# Maps a layer type / "act_type" attribute to the bmlang function that
# implements the activation.
ACTIVE_TYPE_DICT = {
    'tanh'    : bmlang.tanh,
    'sigmoid' : bmlang.sigmoid,
    'Sigmoid' : bmlang.sigmoid,
    'relu'    : bmlang.relu,
    'exp'     : bmlang.exp,
    'elu'     : bmlang.elu,
    'sqrt'    : bmlang.sqrt,
    'square'  : bmlang.square,
    'rsqrt'   : bmlang.rsqrt,
    'absval'  : bmlang.abs,
    'ln'      : bmlang.ln,
}


def activation_core(bottom_name, top_name, shape, active_func):
    '''
    Activation compute core written with the bmlang API.
    The parameters of this function are defined by the user.

    @param bottom_name  name of the input tensor
    @param top_name     name of the output tensor
    @param shape        shape used for the bmlang input tensor
    @param active_func  bmlang activation function to apply
    '''
    print('in tensor name: ', bottom_name, 'out tensor name: ', top_name)
    print('add active layer, shape is {0}'.format(shape))
    ## create bmlang tensor of activation input
    inp_tensor = bmlang.Tensor(bottom_name, dtype = 'float32', shape = shape)
    ## bmlang compution operation; the output tensor is named top_name
    active_func(inp_tensor, top_name)


def user_activation(layer):
    '''
    Register function that imports the bmlang program into the bmnetm
    compiler. Once written, it must be registered inside
    layer_ext_register() (module layers_ext.layer_ext_register).
    The name of the function can be defined by users, but the parameters
    must be set as follows.

    @param layer  instance of class bmnetm.Layer, contains layer_name,
                  layer_type, in_tensors, out_tensors, params member.
                  layer.in_tensors, layer.out_tensors are lists of
                  bmnetm.Tensor instances, contains name, shape, dims member.
                  layer.params is instance of bmnetm.ParamDict herits from
                  OrderedDict, contains key/values from the node['attr'] info.

    Raises RuntimeError when neither the layer type nor its "act_type"
    attribute names a supported activation.
    '''
    ## In the register, we should firstly parse the params of the OP
    layer_type, params = layer.layer_type, layer.params
    ## The layer type is tried first, then the "act_type" attribute.
    ## .get() keeps a missing "act_type" on the RuntimeError path below
    ## instead of raising KeyError.
    active_func = ACTIVE_TYPE_DICT.get(layer_type)
    if active_func is None:
        active_func = ACTIVE_TYPE_DICT.get(params.get("act_type"))
    if active_func is None:
        raise RuntimeError("Not support activate layer type")
    print('EXT Factory: Bmlang activation is called')

    ## Get information of the input and output tensor
    in_tensors, out_tensors = layer.in_tensors, layer.out_tensors
    bottom_name = in_tensors[0].name
    top_name = out_tensors[0].name
    tensor_shape = out_tensors[0].shape

    ## Then set the bmlang model to COMPILE
    ## Now we set the mode of bmlang to compile with tpu
    ## This mean compile this OP through bmcompiler
    bmlang.set_mode('tpu')

    ## At last, call the compute core written through bmlang
    ## Compile the activation core
    activation_core(bottom_name, top_name, tensor_shape, active_func)


def bmlang_active_debug():
    '''
    BMLang debug example (outline only, not implemented).
    '''
    ## 1. call mxnet active cpu compuation
    ## 2. bmlang.set_mode(bmlang.BMLANG_COMPILE)
    ## 3. call activation_core()
    ## 4. compare results from 1 and 3 above
    pass
import bmnetm
from layers_ext.activate_layer import *


def layer_ext_register():
    """Register the layers implemented with bmlang in the bmnetm compiler."""
    ## Each entry is ('node_op_name', bmlang_register_func); layers that are
    ## written by bmlang can be added here.
    ext_layers = (
        ('Activation', user_activation),
    )
    for node_op_name, register_func in ext_layers:
        bmnetm.register(node_op_name, register_func)
其中'Activation'为该OP的名称,对应.json文件中的"op": "Activation"
{ "op": "Activation", "name": "relu4", "attrs": {"act_type": "relu"}, "inputs": [[19, 0, 0]] }
以下是以lenet为例来介绍,在使用bmnetm compile接口前,需要执行之前的注册程序layer_ext_register()。
import bmnetm

## Network description, e.g. r'/path/to/xxx-symbol.json'
model_json = r'../../../nnmodel/mxnet_models/lenet/lenet-symbol.json'
## Trained weights, e.g. r'/path/to/xxxx-xxxx.params'
weight_file = r'../../../nnmodel/mxnet_models/lenet/lenet-0100.params'
## Output directory, e.g. r'./xxx'
output_dir = r'./compilation'
## Target chip
chip = r'BM1684'
## Input shapes
input_shapes = [[1, 1, 28, 28]]
## Network name
network_name = r'lenet'

## If user writes user-defined layers through bmlang, these layers must be
## registered before compiling.
## If there is no user-defined layer written through bmlang, delete the
## following two lines.
from layers_ext.layer_ext_register import layer_ext_register
layer_ext_register()

## Launch bmnetm compilation
bmnetm.compile(model_json, weight_file, output_dir, chip, input_shapes,
               network_name)