Demo Examples¶
Product Quantizer Algorithm¶
The following is a concrete walkthrough of accelerating the part of the Product Quantizer algorithm that needs speeding up: a BMLang program is written and compiled into a bmodel, and a runtime program then loads that bmodel to run the accelerated computation on the chip. The goal is to show clearly how BMLang is used in practice.
Source Code¶
Only part of the algorithm is accelerated with BMLang; this demo accelerates ProductQuantizer::search().
// xq and xb_pq are known input data; ncodes = nb = 10000; m_ksub = d = 256; m_subquantizers = pq_length = 32
// k is topk = 100; distances and labels are the top-k output values and indices
void ProductQuantizer::search(const float * xq,
const uint8_t * xb_pq, const size_t ncodes,
uint32_t k, float *distances, uint64_t *labels) const
{
float * dis_tables = new float[m_ksub * m_subquantizers];
ScopeDeleter<float> del(dis_tables);
computeInnerProdTable(xq, dis_tables);
std::multimap<float, int, std::greater<float> > _distances;
int _dataset_length = ncodes;
int _sub_space = m_subquantizers;
for(int i = 0; i < _dataset_length; i++) {
float dist = .0;
for(int s = 0; s < _sub_space; s++) {
dist += dis_tables[s * m_ksub + xb_pq[i * _sub_space + s]];
}
_distances.emplace(dist, i);
}
int kk = 0;
for(auto dist: _distances) {
if(kk < k) {
distances[kk] = dist.first;
labels[kk] = dist.second;
kk++;
} else {
break;
}
}
}
void ProductQuantizer::computeInnerProdTable(const float* x,
float* dis_table) const
{
size_t m;
for (m = 0; m < m_subquantizers; m++) {
vec_inner_products(dis_table + m * m_ksub,
x + m * m_dsub, get_centroids(m, 0),
m_dsub, m_ksub);
}
}
void vec_inner_products(float* ip, const float* x, const float* y,
size_t d, size_t ny)
{
for (size_t i = 0; i < ny; i++) {
ip[i] = 0;
for(int j = 0; j < d; j++) {
ip[i] += x[j] * y[j];
}
y += d;
}
}
// m_dsub = 8
// the m_codes data is known; it is the same data as codes.data() in the BMLang code below
const float* ProductQuantizer::get_centroids(size_t m, size_t i) const
{
return &m_codes[(m * m_ksub + i) * m_dsub];
}
BMLang Programming¶
First, the algorithm above is expressed with BMLang; compiling the BMLang program produces the bmodel that the runtime will use.
void product_quant_search()
{
// create input tensor
bmlang::Tensor tensor_0("query", bmlang::FLOAT32, 1, pq_length, 1, 8);
tensor_0.set_data((char*)xq);
bmlang::Tensor tensor_1("m_centroid", bmlang::FLOAT32, 1, pq_length, d, 8);
tensor_1.set_data((char*)codes.data());
tensor_1.set_tensor_type(bmlang::COEFF_TENSOR);
// computeInnerProdTable
bmlang::Tensor tensor_dis_inner("dis_inner", bmlang::FLOAT32);
bmlang::mul(tensor_0, tensor_1, tensor_dis_inner);
bmlang::ReduceParam p_reduce_param;
p_reduce_param.method = bmlang::REDUCE_SUM;
p_reduce_param.axis_num = 1;
p_reduce_param.axis_list[0] = 3;
bmlang::Tensor tensor_dis_table("dis_table", bmlang::FLOAT32);
bmlang::reduce(tensor_dis_inner, tensor_dis_table, p_reduce_param);
/*
// get tensor data
std::unique_ptr<float[]> dis_table_data(new float[tensor_dis_table.count()]);
tensor_dis_table.get_data((char*)dis_table_data.get());
*/
std::vector<bmlang::Tensor*> codes_tensor;
std::vector<bmlang::Tensor*> dis_table_split;
for (int i = 0; i < pq_length; i++) {
codes_tensor.push_back(new bmlang::Tensor(bmlang::UINT8, nb));
codes_tensor[i]->set_tensor_type(bmlang::COEFF_TENSOR);
codes_tensor[i]->set_data((char*)(xb_pq_trans.get()) + i * nb);
dis_table_split.push_back(new bmlang::Tensor(bmlang::FLOAT32));
}
// split table
bmlang::split(tensor_dis_table, dis_table_split, 1, pq_length);
bmlang::Tensor tensor_with_gallery(bmlang::FLOAT32);
for (int i = 0; i < pq_length; i++) {
bmlang::Tensor lut_res(bmlang::FLOAT32);
bmlang::lut(*(codes_tensor[i]), *(dis_table_split[i]), lut_res);
if (i == 0)
bmlang::copy(lut_res, tensor_with_gallery);
else
bmlang::add(tensor_with_gallery, lut_res, tensor_with_gallery);
}
// topk
bmlang::TopkParam pq_topk;
pq_topk.axis = 0;
pq_topk.k = topk;
pq_topk.descending = true;
bmlang::Tensor tensor_index("output_index", bmlang::INT32);
bmlang::Tensor tensor_score("output_score", bmlang::FLOAT32);
bmlang::topk(tensor_with_gallery, tensor_score, tensor_index, pq_topk);
std::cout << "topk output count: " << tensor_index.count() << std::endl;
#ifdef COMPILE_CHECK
std::vector<bmlang::Tensor*> inp_tensor;
std::vector<bmlang::Tensor*> ref_tensor;
inp_tensor.push_back(&tensor_0);
ref_tensor.push_back(&tensor_index);
ref_tensor.push_back(&tensor_score);
bmlang::compile_with_check("product_quant_search", inp_tensor, ref_tensor, 2, 0, false);
#else
bmlang::compile("product_quant_search", 2, 1, false);
#endif
for (auto o : codes_tensor) delete o;
for (auto o : dis_table_split) delete o;
}
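One detail worth noting: each codes_tensor[i] is filled from xb_pq_trans with nb contiguous bytes per subquantizer, whereas ProductQuantizer::search() reads xb_pq in an [ncodes][pq_length] layout. The PQ codes therefore have to be transposed to a [pq_length][nb] layout before being bound to the coefficient tensors. xb_pq_trans itself is not shown in the demo; a minimal sketch of how it could be prepared (assuming xb_pq, nb and pq_length are in scope, and <memory>/<cstdint> are included) is:
// Hypothetical preparation of xb_pq_trans: transpose the PQ codes from
// [nb][pq_length] (one row per database vector, as used in search()) to
// [pq_length][nb] (one row per subquantizer, as expected by codes_tensor[i]).
std::unique_ptr<uint8_t[]> xb_pq_trans(new uint8_t[(size_t)pq_length * nb]);
for (int i = 0; i < nb; i++) {
    for (int s = 0; s < pq_length; s++) {
        xb_pq_trans[(size_t)s * nb + i] = xb_pq[(size_t)i * pq_length + s];
    }
}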
Runtime Programming¶
The bmodel compiled from the BMLang program in the previous step is loaded and run on the chip to obtain the acceleration.
void ProductQuantizer::search_accelerate(
const float * xq, const uint8_t* codes,
const int ncodes, const int pq_length,
float *distances, uint64_t *labels) const
{
// 1. request device
bm_handle_t bm_handle;
bm_dev_request(&bm_handle, 0);
// 2. create runtime
void* p_bmrt = bmrt_create(bm_handle);
// 3. load bmodel
// char* bmodel = path_of_your_bmodel
bmrt_load_bmodel(p_bmrt, bmodel);
const char** net_names = nullptr;
bmrt_get_network_names(p_bmrt, &net_names);
const bm_net_info_t* net_info = bmrt_get_network_info(p_bmrt, net_names[0]);
// 4. initialize input tensor shape, data type and data
int input_num = net_info->input_num;
bm_tensor_t* input_tensors = new bm_tensor_t[input_num];
bmrt_tensor(&input_tensors[0], p_bmrt, BM_FLOAT32, {4, {1, pq_length, 1, 8}});
bm_memcpy_s2d_partial(bm_handle, input_tensors[0].device_mem,
(void*)xq, bmrt_tensor_bytesize(&input_tensors[0]));
int output_num = net_info->output_num;
assert(output_num == 2);
bm_tensor_t* output_tensors = new bm_tensor_t[output_num];
// 5. net inference
bmrt_launch_tensor(p_bmrt, net_names[0], input_tensors, input_num,
output_tensors, output_num);
// make sure inference has finished
assert(bm_thread_sync(bm_handle) == BM_SUCCESS);
// 6. get output data
// BM1684 does not support uint64_t, so int32 is used instead
int index_shape_count = bmrt_shape_count(&(output_tensors[1].shape));
int* output_index = new int[index_shape_count];
bm_memcpy_d2s_partial(bm_handle, output_index, output_tensors[1].device_mem,
bmrt_tensor_bytesize(&output_tensors[1]));
bm_memcpy_d2s_partial(bm_handle, distances, output_tensors[0].device_mem,
bmrt_tensor_bytesize(&output_tensors[0]));
for (int i = 0; i < index_shape_count; i++) {
labels[i] = static_cast<uint64_t>(output_index[i]);
}
// 7. free device memory
free(net_names);
for (int i = 0; i < output_num; i++) {
bm_free_device(bm_handle, output_tensors[i].device_mem);
}
for (int i = 0; i < input_num; i++) {
bm_free_device(bm_handle, input_tensors[i].device_mem);
}
delete [] output_index;
delete [] input_tensors;
delete [] output_tensors;
// 8. destroy runtime and free device
bmrt_destroy(p_bmrt);
bm_dev_free(bm_handle);
}
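For reference, a minimal calling sketch is shown below. The names pq, xq and xb_pq are assumptions for illustration, and the top-k buffer size must match the topk that was compiled into the bmodel (100 in this demo):
// Hypothetical call site for the accelerated search (names assumed, not from the demo).
const int topk = 100;                 // must match the k baked into the bmodel
std::vector<float> distances(topk);
std::vector<uint64_t> labels(topk);
pq.search_accelerate(xq, xb_pq, /*ncodes=*/10000, /*pq_length=*/32,
                     distances.data(), labels.data());
// distances[0..topk-1] and labels[0..topk-1] now hold the top-k scores and indices.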
BMLang Implementation of Softmax¶
The following is the BMLang C++ implementation of Softmax:
TfLiteStatus EvalFloatSoftmax(
TfLiteContext* context,
const TfLiteTensor* input,
TfLiteTensor* output,
std::vector<int> input_shape) {
int num_dims = NumDimensions(input);
int n = input_shape[0];
int h = input_shape[1];
int w = input_shape[2];
int c = input_shape[3];
bmlang::Tensor src(input->name, ToBmlangDtype(kTfLiteFloat32), {n, h, w, c});
src.set_data(input->data.raw);
bmlang::Tensor dst(output->name, ToBmlangDtype(kTfLiteFloat32));
bmlang::Tensor reduce_max(ToBmlangDtype(kTfLiteFloat32));
bmlang::ReduceParam param;
param.method = bmlang::REDUCE_MAX;
param.axis_list[0] = num_dims - 1;
param.axis_num = 1;
bmlang::reduce(src, reduce_max, param);
bmlang::Tensor broadcast1(ToBmlangDtype(kTfLiteFloat32));
std::vector<int> times(num_dims, 1);
times[num_dims-1] = c;
bmlang::broadcast(reduce_max, broadcast1, times);
bmlang::Tensor tensor_diff(ToBmlangDtype(kTfLiteFloat32));
bmlang::sub(src, broadcast1, tensor_diff);
bmlang::Tensor tensor_exp(ToBmlangDtype(kTfLiteFloat32));
bmlang::active(tensor_diff, tensor_exp, bmlang::ACTIVE_EXP);
bmlang::Tensor reduce_sum(ToBmlangDtype(kTfLiteFloat32));
param.method = bmlang::REDUCE_SUM;
param.axis_list[0] = num_dims - 1;
param.axis_num = 1;
bmlang::reduce(tensor_exp, reduce_sum, param);
bmlang::Tensor broadcast2(ToBmlangDtype(kTfLiteFloat32));
bmlang::broadcast(reduce_sum, broadcast2, times);
bmlang::div(tensor_exp, broadcast2, dst);
dst.get_data(output->data.raw);
return kTfLiteOk;
}
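The op sequence above is the numerically stable formulation of softmax along the last axis: softmax(x)_i = exp(x_i - max(x)) / sum_k exp(x_k - max(x)). Subtracting the per-row maximum (reduce_max, broadcast, sub) keeps the exponentials from overflowing, and the common factor cancels between numerator and denominator, so the result equals the plain definition. The two broadcast steps expand the reduced tensors back to the input shape along the last axis so that sub and div operate elementwise.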