Have a nice day
I have a detector model written in Python, and I`m trying to convert it into C++ and then compile it.
This detector model has been exported in Onnx format, and everything is fine when I`m trying to infer or run in Python. In the following, I put this model class that I use it.
class Detector:
def __init__(self, model_file: str, nms_thresh: float = .4, det_thresh: float = .5, min_res: float = 20,
input_size: Tuple[int, int] = (480, 640)) -> None:
self.session = ort.InferenceSession(model_file, None,providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
self.center_cache = {}
self.nms_thresh = nms_thresh
self.det_thresh = det_thresh
self.min_res = min_res
self.input_name = self.session.get_inputs()[0].name
self.input_size = input_size
self.output_names = [o.name for o in self.session.get_outputs()]
self.fmc = 3
self.input_mean = 127.5
self.input_std = 128.0
self._feat_stride_fpn = [8, 16, 32]
self._num_anchors = 2
def forward(self, img: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
scores_list = []
bboxes_list = []
kps_list = []
blob = cv2.dnn.blobFromImage(img, 1.0 / self.input_std, tuple(img.shape[0:2][::-1]),
(self.input_mean, self.input_mean, self.input_mean), swapRB=True)
net_outs = self.session.run(self.output_names, {self.input_name: blob})
input_height = blob.shape[2]
input_width = blob.shape[3]
fmc = self.fmc
for idx, stride in enumerate(self._feat_stride_fpn):
scores = net_outs[idx]
bbox_predicts = net_outs[idx + fmc]
bbox_predicts = bbox_predicts * stride
kps_predicts = net_outs[idx + fmc * 2] * stride
height = input_height // stride
width = input_width // stride
key = (height, width, stride)
if key in self.center_cache:
anchor_centers = self.center_cache[key]
else:
anchor_centers = np.stack(np.mgrid[:height, :width][::-1], axis=-1).astype(np.float32)
anchor_centers = (anchor_centers * stride).reshape((-1, 2))
if self._num_anchors > 1:
anchor_centers = np.stack([anchor_centers] * self._num_anchors, axis=1).reshape((-1, 2))
if len(self.center_cache) < 100:
self.center_cache[key] = anchor_centers
# with size of anchors
scores = scores[:len(anchor_centers)]
bbox_predicts = bbox_predicts[:len(anchor_centers)]
kps_predicts = kps_predicts[:len(anchor_centers)]
pos_indices = np.where(scores >= self.det_thresh)[0]
bboxes = distance2bbox(anchor_centers, bbox_predicts)
pos_scores = scores[pos_indices]
pos_bboxes = bboxes[pos_indices]
scores_list.append(pos_scores)
bboxes_list.append(pos_bboxes)
key_points = distance2kps(anchor_centers, kps_predicts)
key_points = key_points.reshape((key_points.shape[0], -1, 2))
pos_key_points = key_points[pos_indices]
kps_list.append(pos_key_points)
scores = np.vstack(scores_list)
bboxes_list = np.vstack(bboxes_list)
kps_list = np.vstack(kps_list)
# cv2.imshow('blob', np.transpose(blob[0], [1, 2, 0]))
return scores, bboxes_list, kps_list
def detect(self, img: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
ref_size = img.shape[:2]
img = cv2.resize(img, self.input_size[::-1])
res_size = img.shape[:2]
det_scale = (ref_size[0] / res_size[0], ref_size[1] / res_size[1])
scores, boxes, points = self.forward(img)
order = scores.ravel().argsort()[::-1]
# Order
scores = scores[order, :]
boxes = boxes[order, :]
points = points[order, :]
# Scale
boxes[:, [0, 2]] *= det_scale[1]
boxes[:, [1, 3]] *= det_scale[0]
points[:, :, [0]] *= det_scale[1]
points[:, :, [1]] *= det_scale[0]
keep_indices = self.nms(boxes, scores)
# Keep
scores = scores[keep_indices, :]
boxes = boxes[keep_indices, :]
points = points[keep_indices, :]
# Keep min resolution
if len(boxes) > 0:
wh = boxes[:, [2, 3]] - boxes[:, [0, 1]]
wh = wh[:, 0] * wh[:, 1]
min_res_idx = np.where(wh > self.min_res)[0]
scores = scores[min_res_idx, :]
boxes = boxes[min_res_idx, :]
points = points[min_res_idx, :]
return scores, boxes, points
# cv2.imshow('blob', np.transpose(blob[0], (1, 2, 0)))
def nms(self, boxes: np.ndarray, scores: np.ndarray):
thresh = self.nms_thresh
x1 = boxes[:, 0]
y1 = boxes[:, 1]
x2 = boxes[:, 2]
y2 = boxes[:, 3]
areas = (x2 - x1 + 1) * (y2 - y1 + 1)
order = scores.ravel().argsort()[::-1]
keep = []
while order.size > 0:
i = order[0]
keep.append(i)
xx1 = np.maximum(x1[i], x1[order[1:]])
yy1 = np.maximum(y1[i], y1[order[1:]])
xx2 = np.minimum(x2[i], x2[order[1:]])
yy2 = np.minimum(y2[i], y2[order[1:]])
w = np.maximum(0.0, xx2 - xx1 + 1)
h = np.maximum(0.0, yy2 - yy1 + 1)
inter = w * h
ovr = inter / (areas[i] + areas[order[1:]] - inter)
inds = np.where(ovr <= thresh)[0]
order = order[inds + 1]
return keep
Unfortunately, I have converted this class into C++ version, and everything looks fine but I cannot get the same result on our dataset.
Assume I put a video into these two versions, all objects in the C++ version have a very close to zero probability and I cannot use the same confidence threshold for these two versions
// #include <retina.h>
#include <chrono>
#include <onnxruntime/onnxruntime_cxx_api.h>
#include <opencv2/opencv.hpp>
#include <opencv2/dnn.hpp>
#include <iostream>
#include <vector>
#include <string>
#include <cuda_runtime_api.h>
#include <memory>
#include <numeric>
#include <cmath>
#include <map>
// #include <gst/gst.h>
// #include "gstnvdsmeta.h"
// #include "nvds_yml_parser.h"
// #include "nvds_version.h"
struct AnchorBox{
float cx;
float cy;
float s_kx;
float s_ky;
};
struct Bbox{
float x1;
float y1;
float x2;
float y2;
};
struct ObjectMeta
{
Bbox bbox;
float score;
};
template <typename T> T vectorProduct(const std::vector<T>& v){
return accumulate(v.begin(),v.end(),1,std::multiplies<T>());
}
void nms(std::vector<ObjectMeta>& input_boxes,std::vector<ObjectMeta>& output_boxes, float nms_threshold){
std::sort(input_boxes.begin(), input_boxes.end(),
[](const ObjectMeta &a, const ObjectMeta &b)
{ return a.score > b.score; });
const unsigned int box_num = input_boxes.size();
std::vector<int> merged(box_num, 0);
unsigned int count = 0;
for (unsigned int i = 0; i < box_num; ++i)
{
if (merged[i]) continue;
std::vector<ObjectMeta> buf;
buf.push_back(input_boxes[i]);
merged[i] = 1;
for (unsigned int j = i + 1; j < box_num; ++j)
{
if (merged[j]) continue;
float area1 = (input_boxes[i].bbox.x2 - input_boxes[i].bbox.x1+1) * (input_boxes[i].bbox.y2 - input_boxes[i].bbox.y1 + 1);
float area2 = (input_boxes[j].bbox.x2 - input_boxes[j].bbox.x1+1) * (input_boxes[j].bbox.y2 - input_boxes[j].bbox.y1 + 1);
float xx1 = input_boxes[i].bbox.x1 >input_boxes[j].bbox.x1?input_boxes[i].bbox.x1:input_boxes[j].bbox.x1;
float yy1 = input_boxes[i].bbox.y1 >input_boxes[j].bbox.y1?input_boxes[i].bbox.y1:input_boxes[j].bbox.y1;
float xx2 = input_boxes[i].bbox.x2 >input_boxes[j].bbox.x2?input_boxes[i].bbox.x2:input_boxes[j].bbox.x2;
float yy2 = input_boxes[i].bbox.y2 >input_boxes[j].bbox.y2?input_boxes[i].bbox.y2:input_boxes[j].bbox.y2;
float wb = xx2-xx1+1;
float hb =yy2-yy1+1;
float inner = (wb>0?wb:0)*(hb>0?hb:0);
float iou = inner / (area1+area2-inner);
if (iou > nms_threshold)
{
merged[j] = 1;
buf.push_back(input_boxes[j]);
}
}
output_boxes.push_back(buf[0]);
// keep top k
// count += 1;
// if (count >= topk)
// break;
}
}
Bbox distance2bbox(const AnchorBox &anchor,const Bbox& distance){
return {
anchor.cx - distance.x1,
anchor.cy - distance.y1,
anchor.cx + distance.x2,
anchor.cy + distance.y2
};
}
template<typename T> std::ostream& operator<<(std::ostream& os, const std::vector<T>& v){
os<<"[";
for(int i=0;i<v.size();i++){
os<<v[i];
if(i>100){
break;
}
if(i!=v.size()-1){
os<<", ";
}
}
os<<"]";
return os;
}
template<typename T> std::ostream& operator<<(std::ostream& os, const std::vector<T>*v){
os<<"[";
for(int i=0;i<v->size();i++){
if(i<20){
os<<v+i;
if(i!=v->size()-1){
os<<", ";
}
}else{
break;
}
}
os<<"]";
return os;
}
// Types
typedef std::vector<int64_t> vInt64Shape;
// Define some constant variables
const static std::string appname{"FatechV"};
const static bool useCuda{true};
static int deviceId{-1};
const static unsigned short int MAJOR_VER{0};
const static unsigned short int MINOR_VER{0};
const static unsigned short int PATCH_VER{1};
// Detector specification
const static std::string DetModelPath{"/home/ixion/Projects/fatechCTL/services/face/data/models/Face/det_10g.onnx"};
const static int fmc{3};
const static std::vector<int> feat_stride_fpn{8,16,32};
const static int anchors = 2;
const static std::vector<int> inputDims{640,480};
const static float input_mean{127.5};
const static float input_std{128.0};
const static float conf_thresh = 0.5;
int main(int argc, char *argv[]){
std::cout<<appname<<" "<<MAJOR_VER<<"."<<MINOR_VER<<"."<<PATCH_VER<<std::endl;
Ort::SessionOptions session_options;
Ort::Env env{ORT_LOGGING_LEVEL_ERROR};
if(useCuda){
cudaGetDevice(&deviceId);
cudaDeviceProp cudaProp;
cudaGetDeviceProperties(&cudaProp,deviceId);
std::cout<<cudaProp.name<<std::endl;
OrtCUDAProviderOptions cuda_option;
cuda_option.device_id = deviceId;
cuda_option.cudnn_conv_algo_search = OrtCudnnConvAlgoSearchExhaustive;
cuda_option.gpu_mem_limit = static_cast<int>(SIZE_MAX*1024*1024);
cuda_option.arena_extend_strategy = 1;
cuda_option.has_user_compute_stream = 1;
cuda_option.default_memory_arena_cfg = nullptr;
session_options.AppendExecutionProvider_CUDA(cuda_option);
session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
}
// Start Detector Session
std::unique_ptr<Ort::Session> session;
std::vector<const char*> inputNames{"input.1"};
vInt64Shape inputShape{1,3,inputDims[0],inputDims[1]};
std::vector<const char*> outputNames{"448", "471", "494", "451", "474", "497", "454", "477", "500"};
vInt64Shape outputShape;
session = std::make_unique<Ort::Session>(env,DetModelPath.c_str(),session_options);
Ort::AllocatorWithDefaultOptions allocator;
outputShape = session->GetOutputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape();
std::cout<<session->GetOutputCount()<<std::endl;
// std::vector<std::vector<float>> outputTensorsValue(session->GetOutputCount());
// std::vector<Ort::Value> outputTensors;
for(int i=0;i<session->GetOutputCount();i++){
outputShape = session->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape();
// auto outTensorSize = vectorProduct(outputShape);
// std::vector<float> outValue (outTensorSize);
// outputTensorsValue.push_back(outValue);
// Ort::MemoryInfo memoryInfo = Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtArenaAllocator,OrtMemType::OrtMemTypeDefault);
// outputTensors.push_back(Ort::Value::CreateTensor<float>(memoryInfo,outValue.data(),outTensorSize,outputShape.data(),outputShape.size()))
std::cout<<i+1<<" "<<outputShape<<" "<<vectorProduct(outputShape)<<std::endl;
}
std::map<int,std::vector<AnchorBox>> cached_anchors;
// Create anchors
for(int k=0;k<feat_stride_fpn.size();k++){
int stride = feat_stride_fpn[k];
int heightR = std::ceil(inputDims[1]/stride);
int widthR = std::ceil(inputDims[0]/stride);
std::vector<AnchorBox> anchors_boxes;
for(int na=0;na<anchors;na++){
for(int h=0;h<heightR;h++){
for(int w=0;w<widthR;w++){
float cx = ((float)w+0.5f) * (float)stride;
float cy = ((float)h+0.5f) * (float)stride;
anchors_boxes.push_back(AnchorBox{cx,cy});
}
}
}
std::cout<<anchors_boxes.size()<<std::endl;
cached_anchors[stride] = anchors_boxes;
}
// End Detector Session
const std::string file_name = "/home/ixion/Projects/fatechCTL/services/face/data/videos/Cathy_ONeil.mp4";
cv::namedWindow("Main",cv::WINDOW_AUTOSIZE);
cv::VideoCapture vision(file_name);
while (vision.isOpened()){
cv::Mat frame;
cv::Mat fframe;
bool ret = vision.read(frame);
if(frame.empty()){
break;
}
frame.copyTo(fframe);
cv::Size input_size{inputDims[1],inputDims[0]};
cv::resize(fframe, fframe, input_size);
size_t inputTensorSize = vectorProduct(inputShape);
std::vector<float> inputTensorValue(inputTensorSize);
// Frame specs
double scalefactor = 1/input_std;
// std::cout<<"Scale factor "<<scalefactor<<std::endl;
cv::Scalar means {input_mean,input_mean,input_mean};
cv::Mat matf;
frame.convertTo(matf, CV_32FC3);
cv::cvtColor(matf,matf,cv::COLOR_BGR2RGB);
matf = (matf - input_mean) * scalefactor;
cv::Mat resize_mat_ref;
cv::resize(matf, resize_mat_ref, input_size);
std::cout<<resize_mat_ref.rows<<" "<<resize_mat_ref.cols<<std::endl;
std::vector<cv::Mat> mat_channels;
cv::split(resize_mat_ref, mat_channels);
for (unsigned int i = 0; i < 3; ++i){
std::memcpy(inputTensorValue.data() + i * (inputShape[3] * inputShape[2]),
mat_channels.at(i).data,inputShape[2] * inputShape[3] * sizeof(float));
}
// Blob images
// cv::Mat blobImage = cv::dnn::blobFromImage(frame,scalefactor,input_size,means,true);
// std::cout<<"Channels "<<blobImage.channels()<<std::endl;
// std::cout<<"Type "<<blobImage.type() == CV_32FC(blobImage.channels())<<std::endl;
// auto kk = blobImage.data;
// for(int z=0;z<inputTensorSize;z++){
// inputTensorValue[z] = blobImage.at<float>(z);
// }
// for(int c=0;c<3;c++){
// for(int h=0;h<480;h++){
// for(int w=0;w<640;w++){
// inputTensorValue[c*480*640+h*640+w] = (float)kk[h*640*3+w*3+c];
// }
// }
// }
// inputTensorValue.assign(blobImage.begin<float>(),blobImage.end<float>());
// std::cout<<inputTensorValue<<std::endl;
std::vector<Ort::Value> inputTensors;
Ort::MemoryInfo memoryInfo = Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtArenaAllocator,OrtMemType::OrtMemTypeDefault);
inputTensors.push_back(std::move(Ort::Value::CreateTensor<float>(memoryInfo,inputTensorValue.data(),inputTensorSize,inputShape.data(),inputShape.size())));
// std::cout<<inputTensors.front().IsTensor()<<std::endl;
std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
std::vector<Ort::Value> outputTensors = session->Run(Ort::RunOptions{nullptr},inputNames.data(),inputTensors.data(),size_t(1),outputNames.data(),size_t(9));
assert(outputTensors.size()==9 && outputTensors.front().IsTensor());
std::vector<float> confVector;
std::vector<std::vector<float>> bboxVector;
std::vector<std::vector<float>> kpsVector;
std::vector<ObjectMeta> metas;
for(int i=0;i<feat_stride_fpn.size();i++){
auto confSize = vectorProduct(outputTensors[i].GetTensorTypeAndShapeInfo().GetShape());
auto bboxSize = vectorProduct(outputTensors[i+fmc].GetTensorTypeAndShapeInfo().GetShape());
auto kpsSize = vectorProduct(outputTensors[i+fmc*2].GetTensorTypeAndShapeInfo().GetShape());
auto confShape = outputTensors[i].GetTensorTypeAndShapeInfo().GetShape();
auto bboxShape = outputTensors[i+fmc].GetTensorTypeAndShapeInfo().GetShape();
auto kpsShape = outputTensors[i+fmc*2].GetTensorTypeAndShapeInfo().GetShape();
int stride = feat_stride_fpn[i];
auto anchorStride = cached_anchors[stride];
// std::cout<<std::floor(inputDims[0]/stride)<<" "<<std::floor(inputDims[1]/stride)<<std::endl;
for(int j=0;j<confSize;j++){
// std::cout<<j<<std::endl;
float score = outputTensors.at(i).At<float>({j,0});
if(score<conf_thresh){
continue;
}
ObjectMeta obj;
obj.score = score;
Bbox bbox {
outputTensors.at(i+fmc).At<float>({j,0})*stride,
outputTensors.at(i+fmc).At<float>({j,1})*stride,
outputTensors.at(i+fmc).At<float>({j,2})*stride,
outputTensors.at(i+fmc).At<float>({j,3})*stride,
};
obj.bbox = distance2bbox(anchorStride[j],bbox);
metas.push_back(obj);
// std::vector<float> kps;
// for(int bi=0;bi<10;bi++){
// kps.push_back(bbox_pred[10*j+bi]*stride);
// }
// kpsVector.push_back(kps);
}
}
std::vector<ObjectMeta> outputMetas;
nms(metas,outputMetas,.4);
std::cout<<outputMetas.size()<<std::endl;
for(auto obj:outputMetas){
std::cout<<"["<<obj.score<<", "<<obj.bbox.x1<<", "<<obj.bbox.y1<<", "<<obj.bbox.x2<<", "<<obj.bbox.y2<<"]\n";
cv::rectangle(fframe,cv::Point(obj.bbox.x1,obj.bbox.y1),cv::Point(obj.bbox.x2,obj.bbox.y2),cv::Scalar(0, 255, 0));
}
// std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
// std::cout<<"Inference Time: "<<std::chrono::duration_cast<std::chrono::milliseconds>(end-begin).count()<<"ms"<<std::endl;
// std::cout<<inputTensorValue.size()<<std::endl;
cv::imshow("Main",resize_mat_ref);
if(cv::waitKey(10)==27){
break;
}
}
return 0;
}
Also, I checked the processing steps and there is no problem and it looks fine.
Please let me know if there is any mistake:))