lps_683 阅读(61) 评论(0)

上一回分析到,第一阶段的主要任务是改写detect函数,那么本次就是先将detect函数修改成C语言版本!

 

先上代码。下列代码中仍有几个变量尚未改写,包括blockCacheFlags、blockCache、grad、qangle(均为Mat类型);对这些Mat类型的矩阵,考虑直接以cuda的PtrStep类型传入,或者改用二维数组传入。

 

// Normalises one block histogram in place, in three passes:
//   1. scale by 1 / (L2 norm + 0.1 * blockHistogramSize),
//   2. clip every bin to at most Threshold,
//   3. rescale by 1 / (L2 norm of the clipped values + 1e-3).
// _hist               : blockHistogramSize floats, overwritten with the result
// blockHistogramSize  : number of bins in the block histogram
// Threshold           : upper clipping limit applied in pass 2 (narrowed to float)
void mynormalizeBlockHistogram(float* _hist, size_t blockHistogramSize, double Threshold)
{
    float* const first = _hist;
    float* const last = _hist + blockHistogramSize;
    const float clipLimit = (float)Threshold;

    // Energy of the raw histogram.
    float energy = 0;
    for( float* bin = first; bin != last; ++bin )
        energy += (*bin)*(*bin);

    // The size-dependent term keeps the divisor away from zero for empty blocks.
    const float firstScale = 1.f/(std::sqrt(energy) + blockHistogramSize*0.1f);

    // Scale, clip, and gather the energy of the clipped values in one sweep.
    energy = 0;
    for( float* bin = first; bin != last; ++bin )
    {
        *bin = std::min((*bin)*firstScale, clipLimit);
        energy += (*bin)*(*bin);
    }

    // Final renormalisation of the clipped histogram.
    const float secondScale = 1.f/(std::sqrt(energy) + 1e-3f);
    for( float* bin = first; bin != last; ++bin )
        *bin *= secondScale;
}




const float* mygetBlock(int*ymaxCache, int blockHistogramSize, int blockCache_h, int cacheStride_w, int cacheStride_h, int pt_x, int pt_y, int imgoffset_x, int imgoffset_y, float* buf, bool useCache, Mat_<uchar> blockCacheFlags, Mat_<float> blockCache, int count1, int count2, int count4, Mat grad, Mat qangle,
						size_t* gradOf, size_t* qangleOf, int* histOf1, int* histOf2, int* histOf3, int* histOf4, 
						float* histWeights1, float* histWeights2, float* histWeights3, float* histWeights4, float* gradweight )
{//blockCacheFlags, blockCache待处理
	  float* blockHist = buf;
	  //int blockSize_w = 16, blockSize_h = 16;
	  pt_x += imgoffset_x;
	  pt_y += imgoffset_y;
	  //cout << "1" << endl; 
	   if( useCache )
	   {
		   //int cacheSize_w = (grad_w - blockSize_w)/cacheStride_w + 1;
		   //int cacheSize_h = (winSize_h/cacheStride_h) + 1; 
		   int cacheIdx_x = pt_x/cacheStride_w;
		   int cacheIdx_y = (pt_y/cacheStride_h) % blockCache_h;
		   if( pt_y != ymaxCache[cacheIdx_y] )
		   {
            Mat_<uchar> cacheRow = blockCacheFlags.row(cacheIdx_y);
			//cout << cacheIdx_y << endl;
            cacheRow = (uchar)0;
            ymaxCache[cacheIdx_y] = pt_y;
		    }
		   //cout << "2" << endl; 
        blockHist = &blockCache[cacheIdx_y][cacheIdx_x*blockHistogramSize];
        uchar& computedFlag = blockCacheFlags(cacheIdx_y, cacheIdx_x);
        if( computedFlag != 0 )
            return blockHist;
        computedFlag = (uchar)1; // set it at once, before actual computing
	   }

	    int k, C1 = count1, C2 = count2, C4 = count4;
		const float* gradPtr = (const float*)(grad.data + grad.step*pt_y) + pt_x*2;
		const uchar* qanglePtr = qangle.data + qangle.step*pt_y + pt_x*2;

	    for( k = 0; k < blockHistogramSize; k++ )
			 blockHist[k] = 0.f;
		for( k = 0; k < C1; k++ )
		{
			//const PixData& pk = _pixData[k];
			const float* a = gradPtr + gradOf[k];
			float w = gradweight[k]*histWeights1[k];
			const uchar* h = qanglePtr + qangleOf[k];
			int h0 = h[0], h1 = h[1];
			float* hist = blockHist + histOf1[k];
			float t0 = hist[h0] + a[0]*w;
			float t1 = hist[h1] + a[1]*w;
			hist[h0] = t0; hist[h1] = t1;
		}
		//cout << "3" << endl;
		
    for( ; k < C2; k++ )
    {
        //const PixData& pk = _pixData[k];
        const float* a = gradPtr + gradOf[k];
        float w, t0, t1, a0 = a[0], a1 = a[1];
        const uchar* h = qanglePtr + qangleOf[k];
        int h0 = h[0], h1 = h[1];

        float* hist = blockHist + histOf1[k];
        w = gradweight[k] * histWeights1[k];
        t0 = hist[h0] + a0*w;
        t1 = hist[h1] + a1*w;
        hist[h0] = t0; hist[h1] = t1;

        hist = blockHist + histOf2[k];
        w = gradweight[k]*histWeights2[k];
        t0 = hist[h0] + a0*w;
        t1 = hist[h1] + a1*w;
        hist[h0] = t0; hist[h1] = t1;
    }
	//cout << "4" << endl;
	for( ; k < C4; k++ )
    {
        //const PixData& pk = _pixData[k];
        const float* a = gradPtr + gradOf[k];
        float w, t0, t1, a0 = a[0], a1 = a[1];
        const uchar* h = qanglePtr + qangleOf[k];
        int h0 = h[0], h1 = h[1];

        float* hist = blockHist + histOf1[k];
        w = gradweight[k]*histWeights1[k];
        t0 = hist[h0] + a0*w;
        t1 = hist[h1] + a1*w;
        hist[h0] = t0; hist[h1] = t1;

        hist = blockHist + histOf2[k];
        w = gradweight[k]*histWeights2[k];
        t0 = hist[h0] + a0*w;
        t1 = hist[h1] + a1*w;
        hist[h0] = t0; hist[h1] = t1;

        hist = blockHist + histOf3[k];
        w = gradweight[k]*histWeights3[k];
        t0 = hist[h0] + a0*w;
        t1 = hist[h1] + a1*w;
        hist[h0] = t0; hist[h1] = t1;

        hist = blockHist + histOf4[k];
        w = gradweight[k]*histWeights4[k];
        t0 = hist[h0] + a0*w;
        t1 = hist[h1] + a1*w;
        hist[h0] = t0; hist[h1] = t1;
    }
	//cout << "5" << endl; 
	    mynormalizeBlockHistogram(blockHist, blockHistogramSize, 0.2);

	/*for(int i = 0; i < blockHistogramSize; i ++)
	{
		cout << blockHist[i] << " ";
	}
	cout << endl;
	*/

	  return blockHist;
}


void HOGDescriptor::detect(const Mat& img,
    vector<Point>& hits, vector<double>& weights, double hitThreshold,
    Size winStride, Size padding, const vector<Point>& locations) const
{
    hits.clear();
    if( svmDetector.empty() )
        return;

    if( winStride == Size() )
        winStride = cellSize;
    Size cacheStride(gcd(winStride.width, blockStride.width),
                     gcd(winStride.height, blockStride.height));
    size_t nwindows = locations.size();
    padding.width = (int)alignSize(std::max(padding.width, 0), cacheStride.width);
    padding.height = (int)alignSize(std::max(padding.height, 0), cacheStride.height);
    Size paddedImgSize(img.cols + padding.width*2, img.rows + padding.height*2);
	double start = (double)getTickCount();   //time!!!
    HOGCache cache(this, img, padding, padding, nwindows == 0, cacheStride);
	double t = ((double)getTickCount() - start)/getTickFrequency();  

    if( !nwindows )
        nwindows = cache.windowsInImage(paddedImgSize, winStride).area();

    const HOGCache::BlockData* blockData = &cache.blockData[0];

    int nblocks = cache.nblocks.area();
    int blockHistogramSize = cache.blockHistogramSize;
    size_t dsize = getDescriptorSize();

    double rho = svmDetector.size() > dsize ? svmDetector[dsize] : 0;
    vector<float> blockHist(blockHistogramSize);
	
	//*****define para*****
	int paddedImgSize_w = paddedImgSize.width, paddedImgSize_h = paddedImgSize.height;
	int winSize_w = winSize.width, winSize_h = winSize.height;
	int winStride_w = winStride.width, winStride_h = winStride.height;
	int count = svmDetector.size();
	float* svmDetect = (float*)malloc(sizeof(float) * count);
    for (int i = 0; i < count;i++)
    {
		svmDetect[i] = svmDetector[i];
    }
	count = cache.blockData.size();
	int* blockdata_histOfs = (int*)malloc(sizeof(int) * count);
	int* blockdata_imgOffsetx = (int*)malloc(sizeof(int) * count); 
	int* blockdata_imgOffsety = (int*)malloc(sizeof(int) * count);
    for (int i = 0; i < count;i++)
    {
		blockdata_histOfs[i] = cache.blockData[i].histOfs;
		blockdata_imgOffsetx[i] = cache.blockData[i].imgOffset.x;
		blockdata_imgOffsety[i] = cache.blockData[i].imgOffset.y;
    }
	count = cache.pixData.size();
	size_t* gradOf = (size_t*)malloc(sizeof(size_t) * count);
	size_t* qangleOf = (size_t*)malloc(sizeof(size_t) * count);
	int* histOf1 = (int*)malloc(sizeof(int) * count);
	int* histOf2 = (int*)malloc(sizeof(int) * count);
	int* histOf3 = (int*)malloc(sizeof(int) * count);
	int* histOf4 = (int*)malloc(sizeof(int) * count);
	float* histWeights1 = (float*)malloc(sizeof(float) * count);
	float* histWeights2 = (float*)malloc(sizeof(float) * count);
	float* histWeights3 = (float*)malloc(sizeof(float) * count);
	float* histWeights4 = (float*)malloc(sizeof(float) * count);
	float* gradweight = (float*)malloc(sizeof(float) * count);
	for(int i = 0; i < count; i ++)
	{
		gradOf[i] = cache.pixData[i].gradOfs;
		qangleOf[i] = cache.pixData[i].qangleOfs;
		histOf1[i] = cache.pixData[i].histOfs[0];
		histOf2[i] = cache.pixData[i].histOfs[1];
		histOf3[i] = cache.pixData[i].histOfs[2];
		histOf4[i] = cache.pixData[i].histOfs[3];
		histWeights1[i] = cache.pixData[i].histWeights[0];
		histWeights2[i] = cache.pixData[i].histWeights[1];
		histWeights3[i] = cache.pixData[i].histWeights[2];
		histWeights4[i] = cache.pixData[i].histWeights[3];
		gradweight[i] = cache.pixData[i].gradWeight;
	}



	count = nwindows * nblocks;
	float* myweights = (float*)malloc(sizeof(float) * count);
	int* hits_x = (int*)malloc(sizeof(int) * count);
	int* hits_y = (int*)malloc(sizeof(int) * count); 
	count = cache.ymaxCached.size();
	int* ymaxCache = (int*)malloc(sizeof(int) * count);
	for(int i = 0; i < count; i ++)
	{
		ymaxCache[i] = cache.ymaxCached[i];
	}
	count = blockHistogramSize;
	float* block_Hist = (float*)malloc(sizeof(float) * count);
	for(int i = 0; i < count; i ++)
	{
		block_Hist[i] = blockHist[i];
	}
	//*********************


    for( size_t i = 0; i < nwindows; i++ )
    {
		
		//****************************改*****************************
		 int pt0_x, pt0_y;
		//***getwindows***
		 int nwindowsX = (paddedImgSize_w - winSize_w)/winStride_w + 1;
		 int y = (int)i / nwindowsX;
		 int x = (int)i - nwindowsX*y;
		 pt0_x = x*winStride_w;
		 pt0_y = y*winStride_h;
		//****************
		 double s = rho;
         const float* svmVec = &svmDetect[0];
         int j, k;
		 for( j = 0; j < nblocks; j++, svmVec += blockHistogramSize )
		 {
			 int pt_x = pt0_x + blockdata_imgOffsetx[j];
			 int pt_y = pt0_y + blockdata_imgOffsety[j];
			 if(pt_x < 0 || pt_y < 0)
			 {
				//cout << pt_x << endl;
				//cout << pt_y << endl;
			 }

			 const float* vec = mygetBlock(ymaxCache, blockHistogramSize, cache.blockCache.rows, cache.cacheStride.width, cache.cacheStride.height, pt_x, pt_y, cache.imgoffset.x, cache.imgoffset.y, block_Hist, true, cache.blockCacheFlags, cache.blockCache, cache.count1, cache.count2, cache.count4, cache.grad, cache.qangle,
				 gradOf, qangleOf, histOf1, histOf2, histOf3, histOf4, histWeights1, histWeights2, histWeights3, histWeights4, gradweight);
			
			  for( k = 0; k <= blockHistogramSize - 4; k += 4 )
                s += vec[k]*svmVec[k] + vec[k+1]*svmVec[k+1] +
                    vec[k+2]*svmVec[k+2] + vec[k+3]*svmVec[k+3];
              for( ; k < blockHistogramSize; k++ )
                s += vec[k]*svmVec[k];
		  }
			  //cout << s << endl;
			  if( s >= hitThreshold )
			    {
					Point pt0;
					pt0.x = pt0_x;
					pt0.y = pt0_y;
					 hits.push_back(pt0);
					 weights.push_back(s);
					hits_x[i] = pt0_x;
					hits_y[i] = pt0_y;
					myweights[i] = s;
					//cout << pt0_x << "+" << pt0_y << "+" << s << endl;
				}
			  else
				{
					hits_x[i] = -1;
					hits_y[i] = -1;
					myweights[i] = -1;
					//cout << "pass" << endl;
				}
		 }
}

 

 

跟源码进行比对,不难发现,主要的改动来自于:mygetBlock函数不再是HOGCache的成员函数,而是自己定义的普通函数,因此它无法直接访问cache对象中的成员变量,只能通过传参的形式把这些变量传入函数。这就是detect函数中有了很多malloc的原因。

 

这里注意几个地方:

(1).  const float* gradPtr = (const float*)(grad.data + grad.step*pt_y) + pt_x*2;
            const uchar* qanglePtr = qangle.data + qangle.step*pt_y + pt_x*2;

这里实际上返回的是一个通道指针,是第pt_y行第pt_x*2列的元素,若gradPtr[0]则为第一通道元素,[1]则为第二通道元素,因为grad和qangle均为两通道矩阵!

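(2).这里有一个bug,至今没有想得非常清楚:malloc了的内存往往是要free的,但是前面有提到过,这里使用了parallel_for_函数,利用了CPU的并行计算,也就是同时有很多个detect在进行;当第一个detect完成后,若free了变量,那么其他detect将无法运行,这里将出现内存错误!如何解决这个问题,还有待研究!(补充说明:每次detect调用所malloc的缓冲区都只属于该次调用,在函数末尾free本身不应影响其他并行的detect;如果free时才报内存错误,很可能说明此前已经发生了越界读写、破坏了堆,例如窗口坐标加上imgoffset之后超出了grad或blockCache的范围,应先排查这类越界。)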