opencv dnn模块
最近在研究opencv的dnn模块,简单的做一个记录,以备后面使用。
dnn简介
dnn是opencv中的深度神经网络模块,支持运行网络,但是不支持训练,实现图像场景中的图像分类,图像检测和图像分割,并且支持主流的深度学习框架。
dnn尝试
最近一端时间尝试了tensorflow,darknet还有caffe。调通了darknet的yolov3还有caffe的ssd_mobilnet检测网络,但是都是基于cpu的,所以速度可想而知,yolov3速度大约在1~2fps,ssd-mobilnet大约在9fps。
googlenet模型是从caffe官方github仓库里面下的,从网上下载了一张雨伞图片,测试还可可以,准确率蛮高的。
资料来源
代码主要是在github上,还有各个博客上找的。其实opencv也提供了相关demo,网上的代码也是基于demo改的,安装完opencv之后,在source/samples/dnn里面提供了相关代码,可以参考。
问题记录
- 首先是tensorflow下的ssd网络,如果调用opencv里面的权重是可以跑通的,但是自己生成的却不行,但是网上有大佬用自己训练的模型跑通了。
- 使用opencl加速,代码里有一个参数可以进行加速,但是使用之后就报错,从网上查了一下
- opencv对显卡支持不好 -- github opencv issue
- opencv只支持intel显卡-- 这个是前几年的文章看见的,现在支持了。
总结
- 个人感觉darknet是用c写的,caffe用c++写的,比较容易调通,调用过程基本上没有遇到困难,但是tensorflow和pytorch使用python写的,调用却不太顺利,是不是和语言有关系呢?
- dnn模块软肋
- 只支持部分网络
- gpu加速还是个问题,可能还不太成熟吧,博客上看见的也是用cpu调通的,但是速度这么慢,落地不行呀。
##caffe-ssd mobilnet opencv3.4.1
#include<iostream>
#include<opencv2/opencv.hpp>
#include<opencv2/dnn.hpp>
using namespace std;
using namespace cv;
using namespace cv::dnn;
class Object
{
public:
Object();
Object(int index, float confidence, String name, Rect rect);
~Object();
public:
int index;
String name;
float confidence;
Rect rect;
private:
};
Object::Object() {
}
Object::Object(int index, float confidence, String name, Rect rect) {
this->index = index;
this->confidence = confidence;
this->name = name;
this->rect = rect;
}
Object::~Object() {
}
//----------------------------全局常量----------------------------------
//配置好protxt文件,网络结构描述文件
//配置好caffemodel文件,训练好的网络权重
const String PROTOTX_FILE = "no_bn.prototxt";
const String CAFFE_MODEL_FILE = "no_bn.caffemodel";
const String classNames[] = { "background", "jingling", "t16", "xiao_spark", "yu_mavic"};
const float CONF_THRESH = 0.5;
int main() {
//------------------------实例化网络----------------------------
Net mobileNetSSD = readNetFromCaffe(PROTOTX_FILE, CAFFE_MODEL_FILE);
if (mobileNetSSD.empty()) {
cerr << "加载网络失败!" << endl;
return -1;
}
TickMeter t;
VideoCapture video;
video.open("scr.mp4");
Mat srcImg;
video >> srcImg;
while (!srcImg.empty())
{
clock_t start_time,end_time;
start_time = clock();
//----------------------设置网络输入-----------------------
//将二维图像转换为CNN输入的张量Tensor,作为网络的输入
mobileNetSSD.setInput(blobFromImage(srcImg, 1.0 / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), true, false));
t.start();
//--------------------CNN网络前向计算----------------------
Mat netOut = mobileNetSSD.forward();
t.stop();
//----------------------解析计算结果-----------------------
vector<Object> detectObjects;
Mat detectionResult(netOut.size[2], netOut.size[3], CV_32F, netOut.ptr<float>());
for (int i = 0; i < detectionResult.rows; i++) {
//目标类别的索引
int objectIndex = detectionResult.at<float>(i, 1);
//检测结果置信度
float confidence = detectionResult.at<float>(i, 2);
//根据置信度阈值过滤掉置信度较小的目标
if (confidence<CONF_THRESH) {
continue;
}
//反归一化,得到图像坐标
int xLeftUp = static_cast<int>(detectionResult.at<float>(i, 3)*srcImg.cols);
int yLeftUp = static_cast<int>(detectionResult.at<float>(i, 4)*srcImg.rows);
int xRightBottom = static_cast<int>(detectionResult.at<float>(i, 5)*srcImg.cols);
int yRightBottom = static_cast<int>(detectionResult.at<float>(i, 6)*srcImg.rows);
//矩形框
Rect rect(Point{ xLeftUp,yLeftUp }, Point{ xRightBottom,yRightBottom });
//保存结果
detectObjects.push_back(Object{ objectIndex,confidence,classNames[objectIndex],rect });
}
//------------------------显示结果-----------------------------------
int count = 0;
for (auto& i : detectObjects) {
rectangle(srcImg, i.rect, Scalar(0, 255, 255), 2);
putText(srcImg, i.name, i.rect.tl(), 1, 1.8, Scalar(255, 0, 0), 2);
cout << "第" << count << "个目标:" << i.name << "\t" << i.rect << "\t" << i.confidence << endl;
count++;
}
end_time = clock();
imshow("MobileNet-SSD", srcImg);
waitKey(10);
cout << "FPS:" << CLOCKS_PER_SEC/(double)(end_time - start_time) << endl;
video >> srcImg;
}
waitKey(0);
}
### opencv4.1 darknet-yolov3
#include <opencv2/opencv.hpp>
#include <opencv2/dnn.hpp>
#include <fstream>
#include <iostream>
#include <algorithm>
#include <cstdlib>
using namespace std;
using namespace cv;
using namespace cv::dnn;
void image_detection();
void video_detection();
float confidenceThreshold = 0.25;
String yolo_cfg = "yolov3-voc.cfg";
String yolo_model = "yolov3-voc_f.weights";
int main(int argc, char** argv)
{
//image_detection();
video_detection();
}
void image_detection() {
Net net = readNetFromDarknet(yolo_cfg, yolo_model);
net.setPreferableBackend(DNN_BACKEND_OPENCV);
net.setPreferableTarget(DNN_TARGET_CPU);
//net.setPreferableBackend(DNN_BACKEND_OPENCV); //启动GPU加速
//net.setPreferableTarget(DNN_TARGET_OPENCL); //启动GPU加速
std::vector<String> outNames = net.getUnconnectedOutLayersNames();
for (int i = 0; i < outNames.size(); i++) {
printf("output layer name : %s\n", outNames[i].c_str());
}
vector<string> classNamesVec;
//ifstream classNamesFile("object_detection_classes_yolov3.txt");
ifstream classNamesFile("myData.names");
if (classNamesFile.is_open())
{
string className = "";
while (std::getline(classNamesFile, className))
classNamesVec.push_back(className);
}
clock_t start_time;
start_time = clock();
// 加载图像
Mat frame = imread("scr.jpg");
//Mat frame = imread("cat.jpg");
Mat inputBlob = blobFromImage(frame, 1 / 255.F, Size(416, 416), Scalar(), true, false);
net.setInput(inputBlob);
// 检测
std::vector<Mat> outs;
net.forward(outs, outNames);
vector<double> layersTimings;
double freq = getTickFrequency() / 1000;
double time = net.getPerfProfile(layersTimings) / freq;
ostringstream ss;
cout << "detection time: " << time << " ms";
//putText(frame, ss.str(), Point(20, 20), 0, 0.5, Scalar(0, 0, 255));
vector<Rect> boxes;
vector<int> classIds;
vector<float> confidences;
for (size_t i = 0; i<outs.size(); ++i)
{
// Network produces output blob with a shape NxC where N is a number of
// detected objects and C is a number of classes + 4 where the first 4
// numbers are [center_x, center_y, width, height]
float* data = (float*)outs[i].data;
for (int j = 0; j < outs[i].rows; ++j, data += outs[i].cols)
{
Mat scores = outs[i].row(j).colRange(5, outs[i].cols);
Point classIdPoint;
double confidence;
minMaxLoc(scores, 0, &confidence, 0, &classIdPoint);
if (confidence > confidenceThreshold)
{
int centerX = (int)(data[0] * frame.cols);
int centerY = (int)(data[1] * frame.rows);
int width = (int)(data[2] * frame.cols);
int height = (int)(data[3] * frame.rows);
int left = centerX - width / 2;
int top = centerY - height / 2;
classIds.push_back(classIdPoint.x);
confidences.push_back((float)confidence);
boxes.push_back(Rect(left, top, width, height));
}
}
}
vector<int> indices;
NMSBoxes(boxes, confidences, 0.5, 0.2, indices);
for (size_t i = 0; i < indices.size(); ++i)
{
int idx = indices[i];
Rect box = boxes[idx];
String className = classNamesVec[classIds[idx]] ;
putText(frame, className.c_str(), box.tl(), FONT_HERSHEY_SIMPLEX, 1, Scalar(255, 0, 0), 2, 8);
rectangle(frame, box, Scalar(0, 0, 255), 2, 8, 0);
}
cout <<"运行时间:" << ((double)(clock() - start_time) / CLOCKS_PER_SEC) * 1000 << "ms" << "\n" << endl;
imshow("YOLOv3-Detections", frame);
waitKey(0);
return;
}
void video_detection() {
Net net = readNetFromDarknet(yolo_cfg, yolo_model);
net.setPreferableBackend(DNN_BACKEND_OPENCV);
net.setPreferableTarget(DNN_TARGET_CPU);
//net.setPreferableBackend(DNN_BACKEND_OPENCV); //启动GPU加速
//net.setPreferableTarget(DNN_TARGET_OPENCL); //启动GPU加速
if (net.empty())
{
printf("Could not load net...\n");
return;
}
std::vector<String> outNames = net.getUnconnectedOutLayersNames();
for (int i = 0; i < outNames.size(); i++) {
printf("output layer name : %s\n", outNames[i].c_str());
}
vector<string> classNamesVec;
ifstream classNamesFile("myData.names");
if (classNamesFile.is_open())
{
string className = "";
while (std::getline(classNamesFile, className))
classNamesVec.push_back(className);
}
VideoCapture video_read;
video_read.open("scr2.mp4");
Mat frame;
video_read >> frame;
/*
VideoCapture capture;
capture.open("D:/vcprojects/images/fbb.avi");
if (!capture.isOpened()) {
printf("could not open the camera...\n");
return;
}
*/
while (!frame.empty())
{
//加载图像
//if (frame.empty())
// if (frame.channels() == 4)
// cvtColor(frame, frame, COLOR_BGRA2BGR);
Mat inputBlob = blobFromImage(frame, 1 / 255.F, Size(416, 416), Scalar(), true, false);
net.setInput(inputBlob, "data");
//Mat detectionMat = net.forward("detection_out");
//vector<double> layersTimings;
//double freq = getTickFrequency() / 1000;
//double time = net.getPerfProfile(layersTimings) / freq;
//ostringstream ss;
//ss << "FPS: " << 1000 / time << " ; time: " << time << " ms";
//putText(frame, ss.str(), Point(20, 20), 0, 0.5, Scalar(0, 0, 255));
// 检测
std::vector<Mat> outs;
net.forward(outs, outNames);
vector<double> layersTimings;
double freq = getTickFrequency() / 1000;
double time = net.getPerfProfile(layersTimings) / freq;
ostringstream ss;
ss << "detection time: " << time << " ms";
putText(frame, ss.str(), Point(20, 20), 0, 0.5, Scalar(0, 0, 255));
vector<Rect> boxes;
vector<int> classIds;
vector<float> confidences;
for (size_t i = 0; i < outs.size(); ++i)
{
// Network produces output blob with a shape NxC where N is a number of
// detected objects and C is a number of classes + 4 where the first 4
// numbers are [center_x, center_y, width, height]
float* data = (float*)outs[i].data;
for (int j = 0; j < outs[i].rows; ++j, data += outs[i].cols)
{
Mat scores = outs[i].row(j).colRange(5, outs[i].cols);
Point classIdPoint;
double confidence;
minMaxLoc(scores, 0, &confidence, 0, &classIdPoint);
if (confidence > confidenceThreshold)
{
int centerX = (int)(data[0] * frame.cols);
int centerY = (int)(data[1] * frame.rows);
int width = (int)(data[2] * frame.cols);
int height = (int)(data[3] * frame.rows);
int left = centerX - width / 2;
int top = centerY - height / 2;
classIds.push_back(classIdPoint.x);
confidences.push_back((float)confidence);
boxes.push_back(Rect(left, top, width, height));
}
}
}
vector<int> indices;
NMSBoxes(boxes, confidences, 0.5, 0.2, indices);
for (size_t i = 0; i < indices.size(); ++i)
{
int idx = indices[i];
Rect box = boxes[idx];
String className = classNamesVec[classIds[idx]] + std::to_string(confidences[idx]);
putText(frame, className.c_str(), box.tl(), FONT_HERSHEY_SIMPLEX, 1, Scalar(255, 0, 0), 2, 8);
rectangle(frame, box, Scalar(0, 0, 255), 2, 8, 0);
}
/*for (int i = 0; i < detectionMat.rows; i++)
{
const int probability_index = 5;
const int probability_size = detectionMat.cols - probability_index;
float *prob_array_ptr = &detectionMat.at<float>(i, probability_index);
size_t objectClass = max_element(prob_array_ptr, prob_array_ptr + probability_size) - prob_array_ptr;
float confidence = detectionMat.at<float>(i, (int)objectClass + probability_index);
if (confidence > confidenceThreshold)
{
float x = detectionMat.at<float>(i, 0);
float y = detectionMat.at<float>(i, 1);
float width = detectionMat.at<float>(i, 2);
float height = detectionMat.at<float>(i, 3);
int xLeftBottom = static_cast<int>((x - width / 2) * frame.cols);
int yLeftBottom = static_cast<int>((y - height / 2) * frame.rows);
int xRightTop = static_cast<int>((x + width / 2) * frame.cols);
int yRightTop = static_cast<int>((y + height / 2) * frame.rows);
Rect object(xLeftBottom, yLeftBottom,
xRightTop - xLeftBottom,
yRightTop - yLeftBottom);
rectangle(frame, object, Scalar(0, 255, 0));
if (objectClass < classNamesVec.size())
{
ss.str("");
ss << confidence;
String conf(ss.str());
String label = String(classNamesVec[objectClass]) + ": " + conf;
int baseLine = 0;
Size labelSize = getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
rectangle(frame, Rect(Point(xLeftBottom, yLeftBottom),
Size(labelSize.width, labelSize.height + baseLine)),
Scalar(255, 255, 255), -1);
putText(frame, label, Point(xLeftBottom, yLeftBottom + labelSize.height),
FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 0, 0));
}
}
}*/
imshow("YOLOv3: Detections", frame);
video_read >> frame;
if (waitKey(1) >= 0) break;
}
}
#opencv3.4.1 caffe googlenet
#include<opencv2\opencv.hpp>
#include<vector>
#include<string>
using namespace cv;
using namespace dnn;
using namespace std;
String model_file = "bvlc_googlenet.caffemodel";
String model_txtfile = "deploy.prototxt";
String labels_file = "synset_words.txt";
vector<String>readLabels();
int main(int arc, char** argv) {
Mat src = imread("scr1.jpg");
namedWindow("input", CV_WINDOW_AUTOSIZE);
imshow("input", src);
//¶ÁÈ¡Ä£Ð͵ÄÀà±ð£šÎıŸ£©
vector<String> labels = readLabels();
//¶ÁÈ¡google_netµÄÄ£ÐͺÍÃèÊöÎÄŒþ
Net net = readNetFromCaffe(model_txtfile, model_file);
if (net.empty()) {
printf("read caffee model data failure\n");
return -1;
}
//œ«ÍŒÏñתΪgoogle_netÍøÂçÊäÈëµÄ¶ÔÏó£¬ÓÉÃèÊöÎÄŒþ¿ÉÖª£¬ÍŒÏñ³ßŽçͳһΪ224*224
Mat inputBlob = blobFromImage(src, 1.0, Size(224, 224), Scalar(104, 117, 123));
//œøÐÐÇ°ÏòŽ«²¥£¬ÓÉÃèÊöÎÄŒþ¿ÉÖª£¬µÚÒ»²ãÓÃÁË10žöŸí»ý²ã£¬ÌáÈ¡ÍŒÏñ10ÖÖ²»Í¬µÄÌØÕ÷
Mat prob;
for (int i = 0; i < 10; i++) {
net.setInput(inputBlob, "data");
prob = net.forward("prob");//×îºóÒ»²ãµÄÊä³öΪ¡°prob¡±
}
//Êä³ö
//µÃµœµÄžÅÂÊֵΪ1ÐÐ1000ÁеÄ
Mat promat = prob.reshape(1, 1);
Point classLoc;
double classProb;
minMaxLoc(promat, NULL, &classProb, NULL, &classLoc);
printf("current image classification: %s,probablity %f\n", labels.at(classLoc.x).c_str(), classProb);
putText(src, labels.at(classLoc.x), Point(20, 20), FONT_HERSHEY_COMPLEX, 1.0, Scalar(0, 0, 255), 2);
imshow("output", src);
waitKey(0);
return 0;
}
//¶ÁÈ¡Ä£Ð͵ÄÀà±ð£šÎıŸ£©
vector<String>readLabels() {
vector<String>classNames;
ifstream fp(labels_file);//Žò¿ªÎÄŒþ
if (!fp.is_open()) {//ÎÄŒþûŽò¿ª
printf("could not open the file ");
exit(-1);
}
string name;
while (!fp.eof()) {//ÎÄŒþû¶Áµœœáβ
getline(fp, name);//µÃµœÃ¿Ò»ÐУ¬·ÅµœnameÖÐ
if (name.length()) {//·Ç¿ÕÐÐ
classNames.push_back(name.substr(name.find(' ') + 1));//
}
}
fp.close();
return classNames;
}