上一回分析到,第一阶段的主要任务是改写detect函数,那么本次就先将detect函数修改成C语言版本!

先上代码。下列代码中仍有几个变量尚未改写,包括blockCacheFlags、blockCache(Mat_类型)以及grad、qangle(Mat类型)。对于这些Mat类型的矩阵,考虑直接以CUDA的PtrStep类型传入,或者用二维数组传入。

// Normalizes a block histogram in place in two passes:
//   1. scale every bin by 1/(L2 norm + 0.1*size) and clip it to Threshold;
//   2. rescale the clipped bins by 1/(their L2 norm + 1e-3).
//
// _hist              - histogram bins, modified in place
// blockHistogramSize - number of bins in _hist
// Threshold          - upper bound applied to each bin after the first scaling
void mynormalizeBlockHistogram(float* _hist, size_t blockHistogramSize, double Threshold)
{
    float* const first = _hist;
    float* const last = _hist + blockHistogramSize;
    float* bin;

    // Energy of the raw histogram.
    float energy = 0;
    for( bin = first; bin != last; ++bin )
        energy += (*bin)*(*bin);

    // First pass: scale, clip, and measure the energy of the clipped values.
    const float clip = (float)Threshold;
    float scale = 1.f/(std::sqrt(energy) + blockHistogramSize*0.1f);
    energy = 0;
    for( bin = first; bin != last; ++bin )
    {
        *bin = std::min((*bin)*scale, clip);
        energy += (*bin)*(*bin);
    }

    // Second pass: renormalize; the small constant keeps the divisor non-zero.
    scale = 1.f/(std::sqrt(energy) + 1e-3f);
    for( bin = first; bin != last; ++bin )
        *bin *= scale;
}
const float* mygetBlock(int*ymaxCache, int blockHistogramSize, int blockCache_h, int cacheStride_w, int cacheStride_h, int pt_x, int pt_y, int imgoffset_x, int imgoffset_y, float* buf, bool useCache, Mat_<uchar> blockCacheFlags, Mat_<float> blockCache, int count1, int count2, int count4, Mat grad, Mat qangle,
size_t* gradOf, size_t* qangleOf, int* histOf1, int* histOf2, int* histOf3, int* histOf4,
float* histWeights1, float* histWeights2, float* histWeights3, float* histWeights4, float* gradweight )
{//blockCacheFlags, blockCache待处理
float* blockHist = buf;
//int blockSize_w = 16, blockSize_h = 16;
pt_x += imgoffset_x;
pt_y += imgoffset_y;
//cout << "1" << endl;
if( useCache )
{
//int cacheSize_w = (grad_w - blockSize_w)/cacheStride_w + 1;
//int cacheSize_h = (winSize_h/cacheStride_h) + 1;
int cacheIdx_x = pt_x/cacheStride_w;
int cacheIdx_y = (pt_y/cacheStride_h) % blockCache_h;
if( pt_y != ymaxCache[cacheIdx_y] )
{
Mat_<uchar> cacheRow = blockCacheFlags.row(cacheIdx_y);
//cout << cacheIdx_y << endl;
cacheRow = (uchar)0;
ymaxCache[cacheIdx_y] = pt_y;
}
//cout << "2" << endl;
blockHist = &blockCache[cacheIdx_y][cacheIdx_x*blockHistogramSize];
uchar& computedFlag = blockCacheFlags(cacheIdx_y, cacheIdx_x);
if( computedFlag != 0 )
return blockHist;
computedFlag = (uchar)1; // set it at once, before actual computing
}
int k, C1 = count1, C2 = count2, C4 = count4;
const float* gradPtr = (const float*)(grad.data + grad.step*pt_y) + pt_x*2;
const uchar* qanglePtr = qangle.data + qangle.step*pt_y + pt_x*2;
for( k = 0; k < blockHistogramSize; k++ )
blockHist[k] = 0.f;
for( k = 0; k < C1; k++ )
{
//const PixData& pk = _pixData[k];
const float* a = gradPtr + gradOf[k];
float w = gradweight[k]*histWeights1[k];
const uchar* h = qanglePtr + qangleOf[k];
int h0 = h[0], h1 = h[1];
float* hist = blockHist + histOf1[k];
float t0 = hist[h0] + a[0]*w;
float t1 = hist[h1] + a[1]*w;
hist[h0] = t0; hist[h1] = t1;
}
//cout << "3" << endl;
for( ; k < C2; k++ )
{
//const PixData& pk = _pixData[k];
const float* a = gradPtr + gradOf[k];
float w, t0, t1, a0 = a[0], a1 = a[1];
const uchar* h = qanglePtr + qangleOf[k];
int h0 = h[0], h1 = h[1];
float* hist = blockHist + histOf1[k];
w = gradweight[k] * histWeights1[k];
t0 = hist[h0] + a0*w;
t1 = hist[h1] + a1*w;
hist[h0] = t0; hist[h1] = t1;
hist = blockHist + histOf2[k];
w = gradweight[k]*histWeights2[k];
t0 = hist[h0] + a0*w;
t1 = hist[h1] + a1*w;
hist[h0] = t0; hist[h1] = t1;
}
//cout << "4" << endl;
for( ; k < C4; k++ )
{
//const PixData& pk = _pixData[k];
const float* a = gradPtr + gradOf[k];
float w, t0, t1, a0 = a[0], a1 = a[1];
const uchar* h = qanglePtr + qangleOf[k];
int h0 = h[0], h1 = h[1];
float* hist = blockHist + histOf1[k];
w = gradweight[k]*histWeights1[k];
t0 = hist[h0] + a0*w;
t1 = hist[h1] + a1*w;
hist[h0] = t0; hist[h1] = t1;
hist = blockHist + histOf2[k];
w = gradweight[k]*histWeights2[k];
t0 = hist[h0] + a0*w;
t1 = hist[h1] + a1*w;
hist[h0] = t0; hist[h1] = t1;
hist = blockHist + histOf3[k];
w = gradweight[k]*histWeights3[k];
t0 = hist[h0] + a0*w;
t1 = hist[h1] + a1*w;
hist[h0] = t0; hist[h1] = t1;
hist = blockHist + histOf4[k];
w = gradweight[k]*histWeights4[k];
t0 = hist[h0] + a0*w;
t1 = hist[h1] + a1*w;
hist[h0] = t0; hist[h1] = t1;
}
//cout << "5" << endl;
mynormalizeBlockHistogram(blockHist, blockHistogramSize, 0.2);
/*for(int i = 0; i < blockHistogramSize; i ++)
{
cout << blockHist[i] << " ";
}
cout << endl;
*/
return blockHist;
}
void HOGDescriptor::detect(const Mat& img,
vector<Point>& hits, vector<double>& weights, double hitThreshold,
Size winStride, Size padding, const vector<Point>& locations) const
{
hits.clear();
if( svmDetector.empty() )
return;
if( winStride == Size() )
winStride = cellSize;
Size cacheStride(gcd(winStride.width, blockStride.width),
gcd(winStride.height, blockStride.height));
size_t nwindows = locations.size();
padding.width = (int)alignSize(std::max(padding.width, 0), cacheStride.width);
padding.height = (int)alignSize(std::max(padding.height, 0), cacheStride.height);
Size paddedImgSize(img.cols + padding.width*2, img.rows + padding.height*2);
double start = (double)getTickCount(); //time!!!
HOGCache cache(this, img, padding, padding, nwindows == 0, cacheStride);
double t = ((double)getTickCount() - start)/getTickFrequency();
if( !nwindows )
nwindows = cache.windowsInImage(paddedImgSize, winStride).area();
const HOGCache::BlockData* blockData = &cache.blockData[0];
int nblocks = cache.nblocks.area();
int blockHistogramSize = cache.blockHistogramSize;
size_t dsize = getDescriptorSize();
double rho = svmDetector.size() > dsize ? svmDetector[dsize] : 0;
vector<float> blockHist(blockHistogramSize);
//*****define para*****
int paddedImgSize_w = paddedImgSize.width, paddedImgSize_h = paddedImgSize.height;
int winSize_w = winSize.width, winSize_h = winSize.height;
int winStride_w = winStride.width, winStride_h = winStride.height;
int count = svmDetector.size();
float* svmDetect = (float*)malloc(sizeof(float) * count);
for (int i = 0; i < count;i++)
{
svmDetect[i] = svmDetector[i];
}
count = cache.blockData.size();
int* blockdata_histOfs = (int*)malloc(sizeof(int) * count);
int* blockdata_imgOffsetx = (int*)malloc(sizeof(int) * count);
int* blockdata_imgOffsety = (int*)malloc(sizeof(int) * count);
for (int i = 0; i < count;i++)
{
blockdata_histOfs[i] = cache.blockData[i].histOfs;
blockdata_imgOffsetx[i] = cache.blockData[i].imgOffset.x;
blockdata_imgOffsety[i] = cache.blockData[i].imgOffset.y;
}
count = cache.pixData.size();
size_t* gradOf = (size_t*)malloc(sizeof(size_t) * count);
size_t* qangleOf = (size_t*)malloc(sizeof(size_t) * count);
int* histOf1 = (int*)malloc(sizeof(int) * count);
int* histOf2 = (int*)malloc(sizeof(int) * count);
int* histOf3 = (int*)malloc(sizeof(int) * count);
int* histOf4 = (int*)malloc(sizeof(int) * count);
float* histWeights1 = (float*)malloc(sizeof(float) * count);
float* histWeights2 = (float*)malloc(sizeof(float) * count);
float* histWeights3 = (float*)malloc(sizeof(float) * count);
float* histWeights4 = (float*)malloc(sizeof(float) * count);
float* gradweight = (float*)malloc(sizeof(float) * count);
for(int i = 0; i < count; i ++)
{
gradOf[i] = cache.pixData[i].gradOfs;
qangleOf[i] = cache.pixData[i].qangleOfs;
histOf1[i] = cache.pixData[i].histOfs[0];
histOf2[i] = cache.pixData[i].histOfs[1];
histOf3[i] = cache.pixData[i].histOfs[2];
histOf4[i] = cache.pixData[i].histOfs[3];
histWeights1[i] = cache.pixData[i].histWeights[0];
histWeights2[i] = cache.pixData[i].histWeights[1];
histWeights3[i] = cache.pixData[i].histWeights[2];
histWeights4[i] = cache.pixData[i].histWeights[3];
gradweight[i] = cache.pixData[i].gradWeight;
}
count = nwindows * nblocks;
float* myweights = (float*)malloc(sizeof(float) * count);
int* hits_x = (int*)malloc(sizeof(int) * count);
int* hits_y = (int*)malloc(sizeof(int) * count);
count = cache.ymaxCached.size();
int* ymaxCache = (int*)malloc(sizeof(int) * count);
for(int i = 0; i < count; i ++)
{
ymaxCache[i] = cache.ymaxCached[i];
}
count = blockHistogramSize;
float* block_Hist = (float*)malloc(sizeof(float) * count);
for(int i = 0; i < count; i ++)
{
block_Hist[i] = blockHist[i];
}
//*********************
for( size_t i = 0; i < nwindows; i++ )
{
//****************************改*****************************
int pt0_x, pt0_y;
//***getwindows***
int nwindowsX = (paddedImgSize_w - winSize_w)/winStride_w + 1;
int y = (int)i / nwindowsX;
int x = (int)i - nwindowsX*y;
pt0_x = x*winStride_w;
pt0_y = y*winStride_h;
//****************
double s = rho;
const float* svmVec = &svmDetect[0];
int j, k;
for( j = 0; j < nblocks; j++, svmVec += blockHistogramSize )
{
int pt_x = pt0_x + blockdata_imgOffsetx[j];
int pt_y = pt0_y + blockdata_imgOffsety[j];
if(pt_x < 0 || pt_y < 0)
{
//cout << pt_x << endl;
//cout << pt_y << endl;
}
const float* vec = mygetBlock(ymaxCache, blockHistogramSize, cache.blockCache.rows, cache.cacheStride.width, cache.cacheStride.height, pt_x, pt_y, cache.imgoffset.x, cache.imgoffset.y, block_Hist, true, cache.blockCacheFlags, cache.blockCache, cache.count1, cache.count2, cache.count4, cache.grad, cache.qangle,
gradOf, qangleOf, histOf1, histOf2, histOf3, histOf4, histWeights1, histWeights2, histWeights3, histWeights4, gradweight);
for( k = 0; k <= blockHistogramSize - 4; k += 4 )
s += vec[k]*svmVec[k] + vec[k+1]*svmVec[k+1] +
vec[k+2]*svmVec[k+2] + vec[k+3]*svmVec[k+3];
for( ; k < blockHistogramSize; k++ )
s += vec[k]*svmVec[k];
}
//cout << s << endl;
if( s >= hitThreshold )
{
Point pt0;
pt0.x = pt0_x;
pt0.y = pt0_y;
hits.push_back(pt0);
weights.push_back(s);
hits_x[i] = pt0_x;
hits_y[i] = pt0_y;
myweights[i] = s;
//cout << pt0_x << "+" << pt0_y << "+" << s << endl;
}
else
{
hits_x[i] = -1;
hits_y[i] = -1;
myweights[i] = -1;
//cout << "pass" << endl;
}
}
}

跟源码进行比对,不难发现,主要的改动来自于mygetBlock函数不再是HOGCache的成员函数,而是自己定义的普通函数,所以mygetBlock没办法直接访问cache对象中的成员变量,只能通过传参的形式把它们传入函数。这就是detect函数中出现很多malloc的原因。

这里注意几个地方:
(1). const float* gradPtr = (const float*)(grad.data + grad.step*pt_y) + pt_x*2;
     const uchar* qanglePtr = qangle.data + qangle.step*pt_y + pt_x*2;
这里实际上得到的是指向第pt_y行、第pt_x个像素的指针(每个像素占两个通道,所以元素偏移量是pt_x*2):gradPtr[0]为该像素的第一通道元素,gradPtr[1]为第二通道元素,因为grad和qangle均为两通道矩阵!
(2).这里有一个bug,至今没有想得非常清楚:malloc出来的内存通常是要free的,但前面提到过,这里使用了parallel_for_函数,利用CPU进行并行计算,也就是同时有很多个detect在运行。当第一个detect完成后,若free了这些变量,其他detect将无法运行,这里会出现内存错误!(注:这些缓冲区都是detect内部malloc出来、由局部指针持有的,每次调用各有一份,所以上述解释未必成立,内存错误也可能另有原因,例如数组越界访问。)如何解决这个问题,还有待研究!
原文:http://lps-683.iteye.com/blog/2286597