Demo Examples

Product Quantizer Algorithm

The following walks through a concrete implementation of accelerating the performance-critical part of the Product Quantizer algorithm: BMLang programming is first used to generate a bmodel, and runtime programming then loads that bmodel to run the computation on the chip. The goal is to show more clearly how BMLang is used in practice.

Source code

We use BMLang to accelerate only part of the algorithm; this demo accelerates ProductQuantizer::search().

// xq and xb_pq are known input data; ncodes = nb = 10000; m_ksub = d = 256; m_subquantizers = pq_length = 32
// k is the topk value (topk = 100); distances and labels are the topk output values and indices respectively

void ProductQuantizer::search(const float * xq,
 const uint8_t * xb_pq, const size_t ncodes,
 uint32_t k, float *distances, uint64_t *labels) const
{
  float * dis_tables = new float[m_ksub * m_subquantizers];
  ScopeDeleter<float> del(dis_tables);

  computeInnerProdTable(xq, dis_tables);

  std::multimap<float, int, std::greater<float> > _distances;
  int _dataset_length = ncodes;
  int _sub_space = m_subquantizers;
  for(int i = 0; i < _dataset_length; i++) {
    float dist = .0;
    for(int s = 0; s < _sub_space; s++) {
      dist += dis_tables[s * m_ksub + xb_pq[i * _sub_space + s]];
    }
    _distances.emplace(dist, i);
  }

  int kk = 0;
  for(auto dist: _distances) {
    if(kk < k) {
      distances[kk] = dist.first;
      labels[kk] = dist.second;
      kk++;
    } else {
      break;
    }
  }
}

void ProductQuantizer::computeInnerProdTable(const float* x,
    float* dis_table) const
{
  size_t m;
  for (m = 0; m < m_subquantizers; m++) {
    vec_inner_products(dis_table + m * m_ksub,
        x + m * m_dsub, get_centroids(m, 0),
        m_dsub, m_ksub);
  }
}

void vec_inner_products(float* ip, const float* x, const float* y,
    size_t d, size_t ny)
{
  for (size_t i = 0; i < ny; i++) {
    ip[i] = 0;
    for(int j = 0; j < d; j++) {
      ip[i] += x[j] * y[j];
    }
    y += d;
  }
}

// m_dsub = 8
// m_codes is known data, identical to codes.data() used in the BMLang program below
const float* ProductQuantizer::get_centroids(size_t m, size_t i) const
{
  return &m_codes[(m * m_ksub + i) * m_dsub];
}
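
For orientation, here is a hypothetical call site showing how search() would be invoked with the sizes stated in the comments above; the pq object and the xq/xb_pq buffers are placeholders and are not part of the original demo.

// Hypothetical call site (not part of the demo); sizes match the comments above:
// nb = 10000 database vectors, pq_length = 32 subquantizers, topk = 100.
std::vector<float>    xq(32 * 8);          // one query, d = pq_length * m_dsub = 256 floats
std::vector<uint8_t>  xb_pq(10000 * 32);   // PQ codes, ncodes x pq_length
std::vector<float>    distances(100);
std::vector<uint64_t> labels(100);
// assuming an initialized ProductQuantizer instance `pq`
pq.search(xq.data(), xb_pq.data(), 10000, 100, distances.data(), labels.data());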

BMLang programming

First, the algorithm above is expressed with BMLang; compiling it produces the bmodel that the runtime will load.

void product_quant_search()
{
  // create input tensor
  bmlang::Tensor tensor_0("query", bmlang::FLOAT32, 1, pq_length, 1, 8);
  tensor_0.set_data((char*)xq);

  bmlang::Tensor tensor_1("m_centroid", bmlang::FLOAT32, 1, pq_length, d, 8);
  tensor_1.set_data((char*)codes.data());
  tensor_1.set_tensor_type(bmlang::COEFF_TENSOR);

  // computeInnerProdTable
  bmlang::Tensor tensor_dis_inner("dis_inner", bmlang::FLOAT32);
  bmlang::mul(tensor_0, tensor_1, tensor_dis_inner);

  bmlang::ReduceParam p_reduce_param;
  p_reduce_param.method = bmlang::REDUCE_SUM;
  p_reduce_param.axis_num = 1;
  p_reduce_param.axis_list[0] = 3;
  bmlang::Tensor tensor_dis_table("dis_table", bmlang::FLOAT32);
  bmlang::reduce(tensor_dis_inner, tensor_dis_table, p_reduce_param);
  /*
    // get tensor data
    std::unique_ptr<float> dis_table_data(new float[tensor_dis_table.count()]);
    tensor_dis_table.get_data((char*)dis_table_data.get());
  */

  std::vector<bmlang::Tensor*> codes_tensor;

  std::vector<bmlang::Tensor*> dis_table_split;
  for (int i = 0; i < pq_length; i++) {
    codes_tensor.push_back(new bmlang::Tensor(bmlang::UINT8, nb));
    codes_tensor[i]->set_tensor_type(bmlang::COEFF_TENSOR);
    codes_tensor[i]->set_data((char*)(xb_pq_trans.get()) + i * nb);
    dis_table_split.push_back(new bmlang::Tensor(bmlang::FLOAT32));
  }

  // split table
  bmlang::split(tensor_dis_table, dis_table_split, 1, pq_length);

  bmlang::Tensor tensor_with_gallery(bmlang::FLOAT32);
  for (int i = 0; i < pq_length; i++) {
    bmlang::Tensor lut_res(bmlang::FLOAT32);
    bmlang::lut(*(codes_tensor[i]), *(dis_table_split[i]), lut_res);
    if (i == 0)
      bmlang::copy(lut_res, tensor_with_gallery);
    else
      bmlang::add(tensor_with_gallery, lut_res, tensor_with_gallery);
  }

  // topk
  bmlang::TopkParam pq_topk;
  pq_topk.axis = 0;
  pq_topk.k = topk;
  pq_topk.descending = true;

  bmlang::Tensor tensor_index("output_index", bmlang::INT32);
  bmlang::Tensor tensor_score("output_score", bmlang::FLOAT32);
  bmlang::topk(tensor_with_gallery, tensor_score, tensor_index, pq_topk);
  std::cout << "topk output count: " << tensor_index.count() << std::endl;

#ifdef COMPILE_CHECK
  std::vector<bmlang::Tensor*> inp_tensor;
  std::vector<bmlang::Tensor*> ref_tensor;
  inp_tensor.push_back(&tensor_0);
  ref_tensor.push_back(&tensor_index);
  ref_tensor.push_back(&tensor_score);
  bmlang::compile_with_check("product_quant_search", inp_tensor, ref_tensor, 2, 0, false);
#else
  bmlang::compile("product_quant_search", 2, 1, false);
#endif

  for (auto o : codes_tensor) delete o;
  for (auto o : dis_table_split) delete o;
}
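
product_quant_search() reads xb_pq_trans, a per-subquantizer transpose of the PQ codes that is not shown above. Below is a minimal sketch of how it might be prepared, assuming xb_pq is stored row-major as ncodes x pq_length; this layout is inferred from the set_data() offsets and is not given in the original.

// Sketch: transpose xb_pq (nb x pq_length, row-major) into xb_pq_trans
// (pq_length x nb) so that the codes of each subquantizer are contiguous
// and can feed one bmlang::lut per subquantizer.
std::unique_ptr<uint8_t[]> xb_pq_trans(new uint8_t[(size_t)nb * pq_length]);
for (int s = 0; s < pq_length; s++) {
  for (int i = 0; i < nb; i++) {
    xb_pq_trans[(size_t)s * nb + i] = xb_pq[(size_t)i * pq_length + s];
  }
}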

Runtime programming

Load the bmodel compiled from the BMLang program in the previous step and run inference on the chip to accelerate the search.

void ProductQuantizer::search_accelerate(
 const float * xq, const uint8_t* codes,
 const int ncodes, const int pq_length,
 float *distances, uint64_t *labels) const
{
  // 1. require device
  bm_handle_t bm_handle;
  bm_dev_request(&bm_handle, 0);

  // 2. create runtime
  void* p_bmrt = bmrt_create(bm_handle);

  // 3. load bmodel
  // char* bmodel = path_of_your_bmodel
  bmrt_load_bmodel(p_bmrt, bmodel);

  const char** net_names = nullptr;
  bmrt_get_network_names(p_bmrt, &net_names);
  const bm_net_info_t* net_info = bmrt_get_network_info(p_bmrt, net_names[0]);

  // 4. initial input tensor shape, data_type and data
  int input_num = net_info->input_num;
  bm_tensor_t* input_tensors = new bm_tensor_t[input_num];
  bmrt_tensor(&input_tensors[0], p_bmrt, BM_FLOAT32, {4, {1, pq_length, 1, 8}});
  bm_memcpy_s2d_partial(bm_handle, input_tensors[0].device_mem,
                        (void*)xq, bmrt_tensor_bytesize(&input_tensors[0]));

  int output_num = net_info->output_num;
  assert(output_num == 2);
  bm_tensor_t* output_tensors = new bm_tensor_t[output_num];

  // 5. net inference
  bmrt_launch_tensor(p_bmrt, net_names[0], input_tensors, input_num,
                     output_tensors, output_num);
  // make sure inference has finished
  assert(bm_thread_sync(bm_handle) == BM_SUCCESS);

  // 6. get output data
  // BM1684 can't support uint64_t, use int32 instead
  int index_shape_count = bmrt_shape_count(&(output_tensors[1].shape));
  int* output_index = new int[index_shape_count];
  bm_memcpy_d2s_partial(bm_handle, output_index, output_tensors[1].device_mem,
                        bmrt_tensor_bytesize(&output_tensors[1]));
  bm_memcpy_d2s_partial(bm_handle, distances, output_tensors[0].device_mem,
                        bmrt_tensor_bytesize(&output_tensors[0]));

  for (int i = 0; i < index_shape_count; i++) {
    labels[i] = static_cast<uint64_t>(output_index[i]);
  }

  // 7. free device memory
  free(net_names);
  for (int i = 0; i < output_num; i++) {
    bm_free_device(bm_handle, output_tensors[i].device_mem);
  }
  for (int i = 0; i < input_num; i++) {
    bm_free_device(bm_handle, input_tensors[i].device_mem);
  }
  delete [] output_index;
  delete [] input_tensors;
  delete [] output_tensors;

  // 8. destroy runtime and free device
  bmrt_destroy(p_bmrt);
  bm_dev_free(bm_handle);
}
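
The runtime code above omits its preamble; assuming the standard BMLib/BMRuntime SDK header names, a minimal set of includes would look like the following.

// Assumed includes for the runtime code above (standard SDK header names).
#include <cassert>
#include <cstdint>
#include "bmlib_runtime.h"        // bm_dev_request, bm_memcpy_s2d/d2s_partial, bm_thread_sync
#include "bmruntime_interface.h"  // bmrt_create, bmrt_load_bmodel, bmrt_tensor, bmrt_launch_tensor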

BMLang Implementation of Softmax
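
The graph built below computes Softmax along the last axis in its numerically stable form; the operator chain reduce_max → sub → active(ACTIVE_EXP) → reduce_sum → div corresponds to

$$\mathrm{softmax}(x)_i = \frac{e^{\,x_i - \max_j x_j}}{\sum_k e^{\,x_k - \max_j x_j}}$$

where the maximum and the sum are taken over the last axis, and both reduction results are broadcast back to the input shape before the elementwise sub and div.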

The BMLang C++ programming of Softmax is as follows:

TfLiteStatus EvalFloatSoftmax(
    TfLiteContext* context,
    const TfLiteTensor* input,
    TfLiteTensor* output,
    std::vector<int> input_shape) {
  int num_dims = NumDimensions(input);

  int n = input_shape[0];
  int h = input_shape[1];
  int w = input_shape[2];
  int c = input_shape[3];

  bmlang::Tensor src(input->name, ToBmlangDtype(kTfLiteFloat32), {n, h, w, c});
  src.set_data(input->data.raw);

  bmlang::Tensor dst(output->name, ToBmlangDtype(kTfLiteFloat32));

  bmlang::Tensor reduce_max(ToBmlangDtype(kTfLiteFloat32));
  bmlang::ReduceParam param;
  param.method = bmlang::REDUCE_MAX;
  param.axis_list[0] = num_dims - 1;
  param.axis_num = 1;
  bmlang::reduce(src, reduce_max, param);

  bmlang::Tensor broadcast1(ToBmlangDtype(kTfLiteFloat32));
  std::vector<int> times(num_dims, 1);
  times[num_dims-1] = c;

  bmlang::broadcast(reduce_max, broadcast1, times);

  bmlang::Tensor tensor_diff(ToBmlangDtype(kTfLiteFloat32));
  bmlang::sub(src, broadcast1, tensor_diff);

  bmlang::Tensor tensor_exp(ToBmlangDtype(kTfLiteFloat32));
  bmlang::active(tensor_diff, tensor_exp, bmlang::ACTIVE_EXP);

  bmlang::Tensor reduce_sum(ToBmlangDtype(kTfLiteFloat32));
  param.method = bmlang::REDUCE_SUM;
  param.axis_list[0] = num_dims - 1;
  param.axis_num = 1;
  bmlang::reduce(tensor_exp, reduce_sum, param);

  bmlang::Tensor broadcast2(ToBmlangDtype(kTfLiteFloat32));
  bmlang::broadcast(reduce_sum, broadcast2, times);

  bmlang::div(tensor_exp, broadcast2, dst);

  dst.get_data(output->data.raw);

  return kTfLiteOk;
}