Different result on using onnxruntime c++ and python

Have a nice day

I have a detector model written in Python, and I`m trying to convert it into C++ and then compile it.

This detector model has been exported in Onnx format, and everything is fine when I`m trying to infer or run in Python. In the following, I put this model class that I use it.

class Detector:
    def __init__(self, model_file: str, nms_thresh: float = .4, det_thresh: float = .5, min_res: float = 20,
                 input_size: Tuple[int, int] = (480, 640)) -> None:
        self.session = ort.InferenceSession(model_file, None,providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
        self.center_cache = {}
        self.nms_thresh = nms_thresh
        self.det_thresh = det_thresh
        self.min_res = min_res

        self.input_name = self.session.get_inputs()[0].name
        self.input_size = input_size
        self.output_names = [o.name for o in self.session.get_outputs()]
        self.fmc = 3
        self.input_mean = 127.5
        self.input_std = 128.0
        self._feat_stride_fpn = [8, 16, 32]
        self._num_anchors = 2

    def forward(self, img: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        scores_list = []
        bboxes_list = []
        kps_list = []

        blob = cv2.dnn.blobFromImage(img, 1.0 / self.input_std, tuple(img.shape[0:2][::-1]),
                                     (self.input_mean, self.input_mean, self.input_mean), swapRB=True)
        net_outs = self.session.run(self.output_names, {self.input_name: blob})
        input_height = blob.shape[2]
        input_width = blob.shape[3]
        fmc = self.fmc

        for idx, stride in enumerate(self._feat_stride_fpn):
            scores = net_outs[idx]
            bbox_predicts = net_outs[idx + fmc]
            bbox_predicts = bbox_predicts * stride
            kps_predicts = net_outs[idx + fmc * 2] * stride
            height = input_height // stride
            width = input_width // stride
            key = (height, width, stride)
            if key in self.center_cache:
                anchor_centers = self.center_cache[key]
            else:
                anchor_centers = np.stack(np.mgrid[:height, :width][::-1], axis=-1).astype(np.float32)
                anchor_centers = (anchor_centers * stride).reshape((-1, 2))
                if self._num_anchors > 1:
                    anchor_centers = np.stack([anchor_centers] * self._num_anchors, axis=1).reshape((-1, 2))
                if len(self.center_cache) < 100:
                    self.center_cache[key] = anchor_centers
            # with size of anchors
            scores = scores[:len(anchor_centers)]
            bbox_predicts = bbox_predicts[:len(anchor_centers)]
            kps_predicts = kps_predicts[:len(anchor_centers)]
            pos_indices = np.where(scores >= self.det_thresh)[0]
            bboxes = distance2bbox(anchor_centers, bbox_predicts)
            pos_scores = scores[pos_indices]
            pos_bboxes = bboxes[pos_indices]
            scores_list.append(pos_scores)
            bboxes_list.append(pos_bboxes)
            key_points = distance2kps(anchor_centers, kps_predicts)
            key_points = key_points.reshape((key_points.shape[0], -1, 2))
            pos_key_points = key_points[pos_indices]
            kps_list.append(pos_key_points)

        scores = np.vstack(scores_list)
        bboxes_list = np.vstack(bboxes_list)
        kps_list = np.vstack(kps_list)

        # cv2.imshow('blob', np.transpose(blob[0], [1, 2, 0]))

        return scores, bboxes_list, kps_list

    def detect(self, img: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:

        ref_size = img.shape[:2]
        img = cv2.resize(img, self.input_size[::-1])
        res_size = img.shape[:2]
        det_scale = (ref_size[0] / res_size[0], ref_size[1] / res_size[1])
        scores, boxes, points = self.forward(img)
        order = scores.ravel().argsort()[::-1]

        # Order
        scores = scores[order, :]
        boxes = boxes[order, :]
        points = points[order, :]

        # Scale
        boxes[:, [0, 2]] *= det_scale[1]
        boxes[:, [1, 3]] *= det_scale[0]
        points[:, :, [0]] *= det_scale[1]
        points[:, :, [1]] *= det_scale[0]

        keep_indices = self.nms(boxes, scores)

        # Keep
        scores = scores[keep_indices, :]
        boxes = boxes[keep_indices, :]
        points = points[keep_indices, :]

        # Keep min resolution
        if len(boxes) > 0:
            wh = boxes[:, [2, 3]] - boxes[:, [0, 1]]
            wh = wh[:, 0] * wh[:, 1]
            min_res_idx = np.where(wh > self.min_res)[0]
            scores = scores[min_res_idx, :]
            boxes = boxes[min_res_idx, :]
            points = points[min_res_idx, :]

        return scores, boxes, points
        # cv2.imshow('blob', np.transpose(blob[0], (1, 2, 0)))

    def nms(self, boxes: np.ndarray, scores: np.ndarray):
        thresh = self.nms_thresh
        x1 = boxes[:, 0]
        y1 = boxes[:, 1]
        x2 = boxes[:, 2]
        y2 = boxes[:, 3]

        areas = (x2 - x1 + 1) * (y2 - y1 + 1)
        order = scores.ravel().argsort()[::-1]
        keep = []
        while order.size > 0:
            i = order[0]
            keep.append(i)
            xx1 = np.maximum(x1[i], x1[order[1:]])
            yy1 = np.maximum(y1[i], y1[order[1:]])
            xx2 = np.minimum(x2[i], x2[order[1:]])
            yy2 = np.minimum(y2[i], y2[order[1:]])

            w = np.maximum(0.0, xx2 - xx1 + 1)
            h = np.maximum(0.0, yy2 - yy1 + 1)
            inter = w * h
            ovr = inter / (areas[i] + areas[order[1:]] - inter)

            inds = np.where(ovr <= thresh)[0]
            order = order[inds + 1]

        return keep

Unfortunately, I have converted this class into C++ version, and everything looks fine but I cannot get the same result on our dataset.

Assume I put a video into these two versions, all objects in the C++ version have a very close to zero probability and I cannot use the same confidence threshold for these two versions

// #include <retina.h>
#include <chrono>
#include <onnxruntime/onnxruntime_cxx_api.h>
#include <opencv2/opencv.hpp>
#include <opencv2/dnn.hpp>
#include <iostream>
#include <vector>
#include <string>
#include <cuda_runtime_api.h>
#include <memory>
#include <numeric>
#include <cmath>
#include <map>

// #include <gst/gst.h>
// #include "gstnvdsmeta.h"
// #include "nvds_yml_parser.h"
// #include "nvds_version.h"




struct AnchorBox{
    float cx;
    float cy;
    float s_kx;
    float s_ky;
};

struct Bbox{
    float x1;
    float y1;
    float x2;
    float y2;
};

struct ObjectMeta
{
    Bbox bbox;
    float score;
};

template <typename T> T vectorProduct(const std::vector<T>& v){
    return accumulate(v.begin(),v.end(),1,std::multiplies<T>());
}

void nms(std::vector<ObjectMeta>& input_boxes,std::vector<ObjectMeta>& output_boxes, float nms_threshold){
    
    std::sort(input_boxes.begin(), input_boxes.end(),
            [](const ObjectMeta &a, const ObjectMeta &b)
            { return a.score > b.score; });

    const unsigned int box_num = input_boxes.size();
    std::vector<int> merged(box_num, 0);

    unsigned int count = 0;
    for (unsigned int i = 0; i < box_num; ++i)
    {
        if (merged[i]) continue;
        std::vector<ObjectMeta> buf;

        buf.push_back(input_boxes[i]);
        merged[i] = 1;

        for (unsigned int j = i + 1; j < box_num; ++j)
        {
            if (merged[j]) continue;

            float area1 = (input_boxes[i].bbox.x2 - input_boxes[i].bbox.x1+1) * (input_boxes[i].bbox.y2 - input_boxes[i].bbox.y1 + 1);
            float area2 = (input_boxes[j].bbox.x2 - input_boxes[j].bbox.x1+1) * (input_boxes[j].bbox.y2 - input_boxes[j].bbox.y1 + 1);

            float xx1 = input_boxes[i].bbox.x1 >input_boxes[j].bbox.x1?input_boxes[i].bbox.x1:input_boxes[j].bbox.x1;
            float yy1 = input_boxes[i].bbox.y1 >input_boxes[j].bbox.y1?input_boxes[i].bbox.y1:input_boxes[j].bbox.y1;
            float xx2 = input_boxes[i].bbox.x2 >input_boxes[j].bbox.x2?input_boxes[i].bbox.x2:input_boxes[j].bbox.x2;
            float yy2 = input_boxes[i].bbox.y2 >input_boxes[j].bbox.y2?input_boxes[i].bbox.y2:input_boxes[j].bbox.y2;

            float wb = xx2-xx1+1;
            float hb =yy2-yy1+1;

            float inner = (wb>0?wb:0)*(hb>0?hb:0);

            float iou = inner / (area1+area2-inner);

            if (iou > nms_threshold)
            {
                merged[j] = 1;
                buf.push_back(input_boxes[j]);
            }

        }
        output_boxes.push_back(buf[0]);

        // keep top k
        // count += 1;
        // if (count >= topk)
        // break;
    }
}

Bbox distance2bbox(const AnchorBox &anchor,const Bbox& distance){
    return {
        anchor.cx - distance.x1,
        anchor.cy - distance.y1,
        anchor.cx + distance.x2,
        anchor.cy + distance.y2
    };
}

template<typename T> std::ostream& operator<<(std::ostream& os, const std::vector<T>& v){
    os<<"[";
    for(int i=0;i<v.size();i++){
        os<<v[i];
        if(i>100){
            break;
        }
        if(i!=v.size()-1){
            os<<", ";
        }
    }
    os<<"]";
    return os;
}


template<typename T> std::ostream& operator<<(std::ostream& os, const std::vector<T>*v){
    os<<"[";
    for(int i=0;i<v->size();i++){
        if(i<20){
            os<<v+i;
            if(i!=v->size()-1){
                os<<", ";
            }
        }else{
            break;
        }
    }
    os<<"]";
    return os;
}

// Types
typedef std::vector<int64_t> vInt64Shape;

// Define some constant variables
const static std::string appname{"FatechV"};
const static bool useCuda{true};
static int deviceId{-1};
const static unsigned short int MAJOR_VER{0};
const static unsigned short int MINOR_VER{0};
const static unsigned short int PATCH_VER{1};

// Detector specification
const static std::string DetModelPath{"/home/ixion/Projects/fatechCTL/services/face/data/models/Face/det_10g.onnx"};
const static int fmc{3};
const static std::vector<int> feat_stride_fpn{8,16,32};
const static int anchors = 2;
const static std::vector<int> inputDims{640,480};
const static float input_mean{127.5};
const static float input_std{128.0};
const static float conf_thresh = 0.5;


int main(int argc, char *argv[]){
    std::cout<<appname<<" "<<MAJOR_VER<<"."<<MINOR_VER<<"."<<PATCH_VER<<std::endl;

    Ort::SessionOptions session_options;
    Ort::Env env{ORT_LOGGING_LEVEL_ERROR};
    if(useCuda){
        cudaGetDevice(&deviceId);
        cudaDeviceProp cudaProp;
        cudaGetDeviceProperties(&cudaProp,deviceId);
        std::cout<<cudaProp.name<<std::endl;

        OrtCUDAProviderOptions cuda_option;
        cuda_option.device_id = deviceId;
        cuda_option.cudnn_conv_algo_search = OrtCudnnConvAlgoSearchExhaustive;
        cuda_option.gpu_mem_limit = static_cast<int>(SIZE_MAX*1024*1024);
        cuda_option.arena_extend_strategy = 1;
        cuda_option.has_user_compute_stream = 1;
        cuda_option.default_memory_arena_cfg = nullptr;

        session_options.AppendExecutionProvider_CUDA(cuda_option);
        session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
    }


    // Start Detector Session
    std::unique_ptr<Ort::Session> session;
    std::vector<const char*> inputNames{"input.1"};
    vInt64Shape inputShape{1,3,inputDims[0],inputDims[1]};
    std::vector<const char*> outputNames{"448", "471", "494", "451", "474", "497", "454", "477", "500"};
    vInt64Shape outputShape;

    session = std::make_unique<Ort::Session>(env,DetModelPath.c_str(),session_options);

    Ort::AllocatorWithDefaultOptions allocator;

    outputShape = session->GetOutputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape();

    std::cout<<session->GetOutputCount()<<std::endl;

    // std::vector<std::vector<float>> outputTensorsValue(session->GetOutputCount());
    // std::vector<Ort::Value> outputTensors;

    for(int i=0;i<session->GetOutputCount();i++){
        outputShape = session->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape();
        // auto outTensorSize = vectorProduct(outputShape);
        // std::vector<float> outValue (outTensorSize);
        // outputTensorsValue.push_back(outValue);
        // Ort::MemoryInfo memoryInfo = Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtArenaAllocator,OrtMemType::OrtMemTypeDefault);
        // outputTensors.push_back(Ort::Value::CreateTensor<float>(memoryInfo,outValue.data(),outTensorSize,outputShape.data(),outputShape.size()))
        std::cout<<i+1<<" "<<outputShape<<" "<<vectorProduct(outputShape)<<std::endl;
    }

    std::map<int,std::vector<AnchorBox>> cached_anchors;
    // Create anchors
    for(int k=0;k<feat_stride_fpn.size();k++){
        int stride = feat_stride_fpn[k];
        int heightR = std::ceil(inputDims[1]/stride);
        int widthR = std::ceil(inputDims[0]/stride);


        std::vector<AnchorBox> anchors_boxes;
        for(int na=0;na<anchors;na++){
            for(int h=0;h<heightR;h++){
                for(int w=0;w<widthR;w++){
                float cx = ((float)w+0.5f) * (float)stride;
                float cy = ((float)h+0.5f) * (float)stride;
                    anchors_boxes.push_back(AnchorBox{cx,cy});
                }
            }
        }
        std::cout<<anchors_boxes.size()<<std::endl;
        cached_anchors[stride] = anchors_boxes;
    }

    // End Detector Session

    
    const std::string file_name = "/home/ixion/Projects/fatechCTL/services/face/data/videos/Cathy_ONeil.mp4";
    cv::namedWindow("Main",cv::WINDOW_AUTOSIZE);
    cv::VideoCapture vision(file_name);
    while (vision.isOpened()){
        cv::Mat frame;
        cv::Mat fframe;
        bool ret = vision.read(frame);
        if(frame.empty()){
            break;
        }
        frame.copyTo(fframe);
        cv::Size input_size{inputDims[1],inputDims[0]};
        cv::resize(fframe, fframe, input_size);
        

        size_t inputTensorSize = vectorProduct(inputShape);
        std::vector<float> inputTensorValue(inputTensorSize);
        
        // Frame specs
        double scalefactor = 1/input_std;
        // std::cout<<"Scale factor "<<scalefactor<<std::endl;
        cv::Scalar means {input_mean,input_mean,input_mean};
        

        cv::Mat matf;
        frame.convertTo(matf, CV_32FC3);
        cv::cvtColor(matf,matf,cv::COLOR_BGR2RGB);
        matf = (matf - input_mean) * scalefactor;

        cv::Mat resize_mat_ref;
        cv::resize(matf, resize_mat_ref, input_size);
        std::cout<<resize_mat_ref.rows<<" "<<resize_mat_ref.cols<<std::endl;
        std::vector<cv::Mat> mat_channels;
        cv::split(resize_mat_ref, mat_channels);
        for (unsigned int i = 0; i < 3; ++i){
            std::memcpy(inputTensorValue.data() + i * (inputShape[3] * inputShape[2]),
                        mat_channels.at(i).data,inputShape[2] * inputShape[3] * sizeof(float));
        }
            
        // Blob images
        // cv::Mat blobImage = cv::dnn::blobFromImage(frame,scalefactor,input_size,means,true);

        // std::cout<<"Channels "<<blobImage.channels()<<std::endl;

        // std::cout<<"Type "<<blobImage.type() == CV_32FC(blobImage.channels())<<std::endl;
        

        

        // auto kk = blobImage.data;

        // for(int z=0;z<inputTensorSize;z++){
        //     inputTensorValue[z] = blobImage.at<float>(z);
        // }

        // for(int c=0;c<3;c++){
        //     for(int h=0;h<480;h++){
        //         for(int w=0;w<640;w++){
        //             inputTensorValue[c*480*640+h*640+w] = (float)kk[h*640*3+w*3+c];
        //         }
        //     }

        // }

        // inputTensorValue.assign(blobImage.begin<float>(),blobImage.end<float>());
        // std::cout<<inputTensorValue<<std::endl;


        std::vector<Ort::Value> inputTensors;
        Ort::MemoryInfo memoryInfo = Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtArenaAllocator,OrtMemType::OrtMemTypeDefault);
        inputTensors.push_back(std::move(Ort::Value::CreateTensor<float>(memoryInfo,inputTensorValue.data(),inputTensorSize,inputShape.data(),inputShape.size())));

        // std::cout<<inputTensors.front().IsTensor()<<std::endl;

        std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
        std::vector<Ort::Value> outputTensors = session->Run(Ort::RunOptions{nullptr},inputNames.data(),inputTensors.data(),size_t(1),outputNames.data(),size_t(9));
        assert(outputTensors.size()==9 && outputTensors.front().IsTensor());
        std::vector<float> confVector;
        std::vector<std::vector<float>> bboxVector;
        std::vector<std::vector<float>> kpsVector;

        std::vector<ObjectMeta> metas;

        for(int i=0;i<feat_stride_fpn.size();i++){

            auto confSize = vectorProduct(outputTensors[i].GetTensorTypeAndShapeInfo().GetShape());
            auto bboxSize = vectorProduct(outputTensors[i+fmc].GetTensorTypeAndShapeInfo().GetShape());
            auto kpsSize = vectorProduct(outputTensors[i+fmc*2].GetTensorTypeAndShapeInfo().GetShape());

            auto confShape = outputTensors[i].GetTensorTypeAndShapeInfo().GetShape();
            auto bboxShape = outputTensors[i+fmc].GetTensorTypeAndShapeInfo().GetShape();
            auto kpsShape = outputTensors[i+fmc*2].GetTensorTypeAndShapeInfo().GetShape();

            int stride = feat_stride_fpn[i];
            auto anchorStride = cached_anchors[stride];
            // std::cout<<std::floor(inputDims[0]/stride)<<" "<<std::floor(inputDims[1]/stride)<<std::endl;
            for(int j=0;j<confSize;j++){
                // std::cout<<j<<std::endl;
                
                float score = outputTensors.at(i).At<float>({j,0});
                if(score<conf_thresh){
                    continue;
                }
                ObjectMeta obj;
                obj.score = score;

                Bbox bbox {
                    outputTensors.at(i+fmc).At<float>({j,0})*stride,
                    outputTensors.at(i+fmc).At<float>({j,1})*stride,
                    outputTensors.at(i+fmc).At<float>({j,2})*stride,
                    outputTensors.at(i+fmc).At<float>({j,3})*stride,
                };

                obj.bbox = distance2bbox(anchorStride[j],bbox);
                

                metas.push_back(obj);
                // std::vector<float> kps;
                // for(int bi=0;bi<10;bi++){
                //     kps.push_back(bbox_pred[10*j+bi]*stride);
                // }
                // kpsVector.push_back(kps);


            }


        }

        std::vector<ObjectMeta> outputMetas;

        nms(metas,outputMetas,.4);
        std::cout<<outputMetas.size()<<std::endl;

        for(auto obj:outputMetas){
            std::cout<<"["<<obj.score<<", "<<obj.bbox.x1<<", "<<obj.bbox.y1<<", "<<obj.bbox.x2<<", "<<obj.bbox.y2<<"]\n";
            cv::rectangle(fframe,cv::Point(obj.bbox.x1,obj.bbox.y1),cv::Point(obj.bbox.x2,obj.bbox.y2),cv::Scalar(0, 255, 0));
        }

        // std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();

        // std::cout<<"Inference Time: "<<std::chrono::duration_cast<std::chrono::milliseconds>(end-begin).count()<<"ms"<<std::endl;
        // std::cout<<inputTensorValue.size()<<std::endl;
        cv::imshow("Main",resize_mat_ref);

        if(cv::waitKey(10)==27){
            break;
        }
    }


    return 0;
}

Also, I checked the processing steps and there is no problem and it looks fine.

Please let me know if there is any mistake:))

Source link