上一回分析到,第一阶段的主要任务是改写detect函数,那么本次就是先将detect函数修改成C语言版本!
?
先上代码。下列代码中仍有几个变量没有改写,包括blockCacheFlags、blockCache、grad、qangle(均为Mat类型)。对这些Mat类型的矩阵,考虑直接以cuda的PtrStep类型传入,或者用二维数组传入。
?
void mynormalizeBlockHistogram(float* _hist, size_t blockHistogramSize, double Threshold) { float* hist = &_hist[0]; size_t i, sz = blockHistogramSize; float sum = 0; for( i = 0; i < sz; i++ ) sum += hist[i]*hist[i]; float scale = 1.f/(std::sqrt(sum)+sz*0.1f), thresh = (float)Threshold; for( i = 0, sum = 0; i < sz; i++ ) { hist[i] = std::min(hist[i]*scale, thresh); sum += hist[i]*hist[i]; } scale = 1.f/(std::sqrt(sum)+1e-3f); for( i = 0; i < sz; i++ ) hist[i] *= scale; } const float* mygetBlock(int*ymaxCache, int blockHistogramSize, int blockCache_h, int cacheStride_w, int cacheStride_h, int pt_x, int pt_y, int imgoffset_x, int imgoffset_y, float* buf, bool useCache, Mat_<uchar> blockCacheFlags, Mat_<float> blockCache, int count1, int count2, int count4, Mat grad, Mat qangle, size_t* gradOf, size_t* qangleOf, int* histOf1, int* histOf2, int* histOf3, int* histOf4, float* histWeights1, float* histWeights2, float* histWeights3, float* histWeights4, float* gradweight ) {//blockCacheFlags, blockCache待处理 float* blockHist = buf; //int blockSize_w = 16, blockSize_h = 16; pt_x += imgoffset_x; pt_y += imgoffset_y; //cout << "1" << endl; if( useCache ) { //int cacheSize_w = (grad_w - blockSize_w)/cacheStride_w + 1; //int cacheSize_h = (winSize_h/cacheStride_h) + 1; int cacheIdx_x = pt_x/cacheStride_w; int cacheIdx_y = (pt_y/cacheStride_h) % blockCache_h; if( pt_y != ymaxCache[cacheIdx_y] ) { Mat_<uchar> cacheRow = blockCacheFlags.row(cacheIdx_y); //cout << cacheIdx_y << endl; cacheRow = (uchar)0; ymaxCache[cacheIdx_y] = pt_y; } //cout << "2" << endl; blockHist = &blockCache[cacheIdx_y][cacheIdx_x*blockHistogramSize]; uchar& computedFlag = blockCacheFlags(cacheIdx_y, cacheIdx_x); if( computedFlag != 0 ) return blockHist; computedFlag = (uchar)1; // set it at once, before actual computing } int k, C1 = count1, C2 = count2, C4 = count4; const float* gradPtr = (const float*)(grad.data + grad.step*pt_y) + pt_x*2; const uchar* qanglePtr = qangle.data + qangle.step*pt_y + pt_x*2; 
for( k = 0; k < blockHistogramSize; k++ ) blockHist[k] = 0.f; for( k = 0; k < C1; k++ ) { //const PixData& pk = _pixData[k]; const float* a = gradPtr + gradOf[k]; float w = gradweight[k]*histWeights1[k]; const uchar* h = qanglePtr + qangleOf[k]; int h0 = h[0], h1 = h[1]; float* hist = blockHist + histOf1[k]; float t0 = hist[h0] + a[0]*w; float t1 = hist[h1] + a[1]*w; hist[h0] = t0; hist[h1] = t1; } //cout << "3" << endl; for( ; k < C2; k++ ) { //const PixData& pk = _pixData[k]; const float* a = gradPtr + gradOf[k]; float w, t0, t1, a0 = a[0], a1 = a[1]; const uchar* h = qanglePtr + qangleOf[k]; int h0 = h[0], h1 = h[1]; float* hist = blockHist + histOf1[k]; w = gradweight[k] * histWeights1[k]; t0 = hist[h0] + a0*w; t1 = hist[h1] + a1*w; hist[h0] = t0; hist[h1] = t1; hist = blockHist + histOf2[k]; w = gradweight[k]*histWeights2[k]; t0 = hist[h0] + a0*w; t1 = hist[h1] + a1*w; hist[h0] = t0; hist[h1] = t1; } //cout << "4" << endl; for( ; k < C4; k++ ) { //const PixData& pk = _pixData[k]; const float* a = gradPtr + gradOf[k]; float w, t0, t1, a0 = a[0], a1 = a[1]; const uchar* h = qanglePtr + qangleOf[k]; int h0 = h[0], h1 = h[1]; float* hist = blockHist + histOf1[k]; w = gradweight[k]*histWeights1[k]; t0 = hist[h0] + a0*w; t1 = hist[h1] + a1*w; hist[h0] = t0; hist[h1] = t1; hist = blockHist + histOf2[k]; w = gradweight[k]*histWeights2[k]; t0 = hist[h0] + a0*w; t1 = hist[h1] + a1*w; hist[h0] = t0; hist[h1] = t1; hist = blockHist + histOf3[k]; w = gradweight[k]*histWeights3[k]; t0 = hist[h0] + a0*w; t1 = hist[h1] + a1*w; hist[h0] = t0; hist[h1] = t1; hist = blockHist + histOf4[k]; w = gradweight[k]*histWeights4[k]; t0 = hist[h0] + a0*w; t1 = hist[h1] + a1*w; hist[h0] = t0; hist[h1] = t1; } //cout << "5" << endl; mynormalizeBlockHistogram(blockHist, blockHistogramSize, 0.2); /*for(int i = 0; i < blockHistogramSize; i ++) { cout << blockHist[i] << " "; } cout << endl; */ return blockHist; } void HOGDescriptor::detect(const Mat& img, vector<Point>& hits, vector<double>& 
weights, double hitThreshold, Size winStride, Size padding, const vector<Point>& locations) const { hits.clear(); if( svmDetector.empty() ) return; if( winStride == Size() ) winStride = cellSize; Size cacheStride(gcd(winStride.width, blockStride.width), gcd(winStride.height, blockStride.height)); size_t nwindows = locations.size(); padding.width = (int)alignSize(std::max(padding.width, 0), cacheStride.width); padding.height = (int)alignSize(std::max(padding.height, 0), cacheStride.height); Size paddedImgSize(img.cols + padding.width*2, img.rows + padding.height*2); double start = (double)getTickCount(); //time!!! HOGCache cache(this, img, padding, padding, nwindows == 0, cacheStride); double t = ((double)getTickCount() - start)/getTickFrequency(); if( !nwindows ) nwindows = cache.windowsInImage(paddedImgSize, winStride).area(); const HOGCache::BlockData* blockData = &cache.blockData[0]; int nblocks = cache.nblocks.area(); int blockHistogramSize = cache.blockHistogramSize; size_t dsize = getDescriptorSize(); double rho = svmDetector.size() > dsize ? 
svmDetector[dsize] : 0; vector<float> blockHist(blockHistogramSize); //*****define para***** int paddedImgSize_w = paddedImgSize.width, paddedImgSize_h = paddedImgSize.height; int winSize_w = winSize.width, winSize_h = winSize.height; int winStride_w = winStride.width, winStride_h = winStride.height; int count = svmDetector.size(); float* svmDetect = (float*)malloc(sizeof(float) * count); for (int i = 0; i < count;i++) { svmDetect[i] = svmDetector[i]; } count = cache.blockData.size(); int* blockdata_histOfs = (int*)malloc(sizeof(int) * count); int* blockdata_imgOffsetx = (int*)malloc(sizeof(int) * count); int* blockdata_imgOffsety = (int*)malloc(sizeof(int) * count); for (int i = 0; i < count;i++) { blockdata_histOfs[i] = cache.blockData[i].histOfs; blockdata_imgOffsetx[i] = cache.blockData[i].imgOffset.x; blockdata_imgOffsety[i] = cache.blockData[i].imgOffset.y; } count = cache.pixData.size(); size_t* gradOf = (size_t*)malloc(sizeof(size_t) * count); size_t* qangleOf = (size_t*)malloc(sizeof(size_t) * count); int* histOf1 = (int*)malloc(sizeof(int) * count); int* histOf2 = (int*)malloc(sizeof(int) * count); int* histOf3 = (int*)malloc(sizeof(int) * count); int* histOf4 = (int*)malloc(sizeof(int) * count); float* histWeights1 = (float*)malloc(sizeof(float) * count); float* histWeights2 = (float*)malloc(sizeof(float) * count); float* histWeights3 = (float*)malloc(sizeof(float) * count); float* histWeights4 = (float*)malloc(sizeof(float) * count); float* gradweight = (float*)malloc(sizeof(float) * count); for(int i = 0; i < count; i ++) { gradOf[i] = cache.pixData[i].gradOfs; qangleOf[i] = cache.pixData[i].qangleOfs; histOf1[i] = cache.pixData[i].histOfs[0]; histOf2[i] = cache.pixData[i].histOfs[1]; histOf3[i] = cache.pixData[i].histOfs[2]; histOf4[i] = cache.pixData[i].histOfs[3]; histWeights1[i] = cache.pixData[i].histWeights[0]; histWeights2[i] = cache.pixData[i].histWeights[1]; histWeights3[i] = cache.pixData[i].histWeights[2]; histWeights4[i] = 
cache.pixData[i].histWeights[3]; gradweight[i] = cache.pixData[i].gradWeight; } count = nwindows * nblocks; float* myweights = (float*)malloc(sizeof(float) * count); int* hits_x = (int*)malloc(sizeof(int) * count); int* hits_y = (int*)malloc(sizeof(int) * count); count = cache.ymaxCached.size(); int* ymaxCache = (int*)malloc(sizeof(int) * count); for(int i = 0; i < count; i ++) { ymaxCache[i] = cache.ymaxCached[i]; } count = blockHistogramSize; float* block_Hist = (float*)malloc(sizeof(float) * count); for(int i = 0; i < count; i ++) { block_Hist[i] = blockHist[i]; } //********************* for( size_t i = 0; i < nwindows; i++ ) { //****************************改***************************** int pt0_x, pt0_y; //***getwindows*** int nwindowsX = (paddedImgSize_w - winSize_w)/winStride_w + 1; int y = (int)i / nwindowsX; int x = (int)i - nwindowsX*y; pt0_x = x*winStride_w; pt0_y = y*winStride_h; //**************** double s = rho; const float* svmVec = &svmDetect[0]; int j, k; for( j = 0; j < nblocks; j++, svmVec += blockHistogramSize ) { int pt_x = pt0_x + blockdata_imgOffsetx[j]; int pt_y = pt0_y + blockdata_imgOffsety[j]; if(pt_x < 0 || pt_y < 0) { //cout << pt_x << endl; //cout << pt_y << endl; } const float* vec = mygetBlock(ymaxCache, blockHistogramSize, cache.blockCache.rows, cache.cacheStride.width, cache.cacheStride.height, pt_x, pt_y, cache.imgoffset.x, cache.imgoffset.y, block_Hist, true, cache.blockCacheFlags, cache.blockCache, cache.count1, cache.count2, cache.count4, cache.grad, cache.qangle, gradOf, qangleOf, histOf1, histOf2, histOf3, histOf4, histWeights1, histWeights2, histWeights3, histWeights4, gradweight); for( k = 0; k <= blockHistogramSize - 4; k += 4 ) s += vec[k]*svmVec[k] + vec[k+1]*svmVec[k+1] + vec[k+2]*svmVec[k+2] + vec[k+3]*svmVec[k+3]; for( ; k < blockHistogramSize; k++ ) s += vec[k]*svmVec[k]; } //cout << s << endl; if( s >= hitThreshold ) { Point pt0; pt0.x = pt0_x; pt0.y = pt0_y; hits.push_back(pt0); weights.push_back(s); hits_x[i] = 
pt0_x; hits_y[i] = pt0_y; myweights[i] = s; //cout << pt0_x << "+" << pt0_y << "+" << s << endl; } else { hits_x[i] = -1; hits_y[i] = -1; myweights[i] = -1; //cout << "pass" << endl; } } }
?
?
跟源码进行比对,不难发现,主要改变的原因来自于mygetBlock函数不再是HOGCache的成员函数了,而是自己定义的普通函数,所以cache对象中的成员变量mygetBlock是没办法直接获得的,所以要通过传参的形式传入函数。这就是detect函数中有了很多malloc的原因。
?
这里注意几个地方:
(1). const float* gradPtr = (const float*)(grad.data + grad.step*pt_y) + pt_x*2;
     const uchar* qanglePtr = qangle.data + qangle.step*pt_y + pt_x*2;
这里得到的实际上是一个指向像素通道数据的指针:它指向第pt_y行、第pt_x个像素的起始位置(每个像素占两个通道,所以元素偏移是pt_x*2)。gradPtr[0]为该像素的第一通道元素,gradPtr[1]为第二通道元素,因为grad和qangle均为两通道矩阵!
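(2).这里有一个bug,至今没有想得非常清楚:malloc了往往是要free的,但是前面提到过,这里使用了parallel_for_函数,利用了CPU的并行计算,也就是同时有很多个detect在进行;当第一个detect完成后,若free了变量,其他detect将无法运行,这里将出现内存错误!如何解决这个问题,还有待研究!(注:这些缓冲区都是每次detect调用内部malloc出来的局部指针,各次调用互不共享,按理在本次调用结束时free并不会影响其他并行的detect;更稳妥的做法是改用std::vector等RAII容器,让内存在函数返回时自动释放。)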
原文:http://lps-683.iteye.com/blog/2286597