上周尝试用 OpenCL 求极大值,在网上查到的资料大多是求和,也就是所谓的 reduction(归约)算法。不过两者思路是一样的:只需把两两相加换成两两取较大值即可。
CPP:
int err = 0; unsigned long int nNumCount = 102400000; int nLocalSize = 256; int nGroupSize = 102400; int nGroup = nGroupSize / nLocalSize; int* pArray = new int[nNumCount]; unsigned long int nReal = 0; int nStart = GetTickCount(); for (int i=0;i<nNumCount;++i) { pArray[i] = i*2; nReal += pArray[i]; } cout<<GetTickCount() - nStart<<endl; cl_mem clmemArray = clCreateBuffer(context,CL_MEM_READ_WRITE,sizeof(int) * nNumCount,NULL,NULL); err = clEnqueueWriteBuffer(queue,clmemArray,CL_TRUE,0,sizeof(int)*nNumCount,pArray,0,0,0); cl_mem clmemRes = clCreateBuffer(context,CL_MEM_READ_WRITE,sizeof(int) * nGroup,NULL,NULL); nStart = GetTickCount(); err = clSetKernelArg(m_KerCalcRay,0,sizeof(cl_mem),&clmemArray); err = clSetKernelArg(m_KerCalcRay,1,sizeof(cl_mem),&clmemRes); err = clSetKernelArg(m_KerCalcRay,2,sizeof(int)*nLocalSize,0); err = clSetKernelArg(m_KerCalcRay,3,sizeof(int),&nNumCount); size_t localws[1] = {nLocalSize}; size_t globalws[1] = {nGroupSize}; err = clEnqueueNDRangeKernel(queue,m_KerCalcRay,1,NULL,globalws,localws,0,NULL,NULL); clFinish(queue); int* pRes = new int[nGroup]; err = clEnqueueReadBuffer(queue,clmemRes,CL_TRUE,0,sizeof(int)*nGroup,pRes,0,0,0); clFinish(queue); unsigned long int nRes = 0; for(int i=0;i<nGroup;++i) { nRes += pRes[i]; }
assert(nRes == nReal);
kernel:
/*
 * ReduceSum - sum an int array, producing one partial sum per work-group.
 *
 * num    : input array of at least nCount ints.
 * res    : output array; work-group g writes its partial sum to res[g], so it
 *          needs one int per work-group.
 * pData  : __local scratch buffer with at least get_local_size(0) ints.
 * nCount : number of elements of num to sum; values <= 0 yield all-zero
 *          partial sums.
 *
 * The caller adds up res[0 .. number_of_groups-1] to get the final total.
 * Sums are accumulated in 32-bit int.
 */
__kernel void ReduceSum(__global int* num, __global int* res, __local int* pData, int nCount)
{
    const unsigned int tid = get_local_id(0);
    const unsigned int bid = get_group_id(0);
    unsigned int gid = get_global_id(0);
    const unsigned int localSize = get_local_size(0);
    const unsigned int globalSize = get_global_size(0);

    /* A negative count would turn into a huge unsigned bound in the
       comparison below and read out of range; treat it as an empty input. */
    const unsigned int count = (nCount > 0) ? (unsigned int)nCount : 0u;

    /* Each work-item sums the elements gid, gid + globalSize, ... */
    int nRes = 0;
    while (gid < count)
    {
        nRes += num[gid];
        gid += globalSize;
    }

    pData[tid] = nRes;
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Tree reduction in local memory. `active` is the number of live slots;
       each round folds the upper part onto the lower ceil(active / 2) slots.
       Rounding the half up keeps the middle element when `active` is odd, so
       the result is correct for any work-group size. For power-of-two sizes
       this performs the same additions as plain halving. `active` is the
       same for every work-item, so all of them reach each barrier. */
    for (unsigned int active = localSize; active > 1; )
    {
        const unsigned int half = (active + 1) >> 1;
        if (tid < active - half)
        {
            pData[tid] += pData[tid + half];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        active = half;
    }

    if (tid == 0)
    {
        res[bid] = pData[0];
    }
}
Reduction 求和的做法如下:假设有 8 个数依次存放在下标 0 到 7 中,第一轮将下标 0 和 4、1 和 5、2 和 6、3 和 7 两两相加,结果分别放到下标 0、1、2、3 中(每一轮结束后用 barrier(CLK_LOCAL_MEM_FENCE) 同步一次)。第二轮将下标 0 和 2、1 和 3 相加,结果放到下标 0、1 中。如此往复,最终结果就放到下标 0 中。
另:我试过用循环展开来减少同步次数,不过效率提升微乎其微。
原文:http://www.cnblogs.com/slean/p/3770094.html