? 照着书上的代码,写了几个一步归约的计算,只计算一步,将原数组归约到不超过 1024 个工作项
● 代码
1 // kernel.cl 2 __kernel void reduce01(__global uint* input, __global uint* output, __local uint* sdata) 3 { 4 const unsigned int tid = get_local_id(0), blockSize = get_local_size(0); 5 unsigned int s; 6 7 sdata[tid] = input[get_global_id(0)]; 8 barrier(CLK_LOCAL_MEM_FENCE); 9 10 // 三种写法,用一种就够 11 // 1、模法,问题: % 运算很慢 12 for (s = 1; s < blockSize; s <<= 1) 13 { 14 if (tid % (2 * s) == 0) 15 sdata[tid] += sdata[tid + s]; 16 barrier(CLK_LOCAL_MEM_FENCE); 17 } 18 // 2、间隔缩短法,问题:首次迭代只用一半的工作项,之后每次迭代活跃的工作项持续减少 19 for (s = blockSize / 2; s > 0; s >>= 1) 20 { 21 if (tid < s) 22 sdata[tid] += sdata[tid + s]; 23 barrier(CLK_LOCAL_MEM_FENCE); 24 } 25 // 3、间隔增长法,问题:当间隔等于某几个数的时候会产生 26 unsigned int index; 27 for (s = 1; s < blockSize; s <<= 1) 28 { 29 if ((index = 2 * s * tid) < blockSize) 30 sdata[index] += sdata[index + s]; 31 barrier(CLK_LOCAL_MEM_FENCE); 32 } 33 34 if (tid == 0) 35 output[get_group_id(0)] = sdata[0]; 36 } 37 38 __kernel void reduce02(__global uint* input, __global uint* output, __local uint* sdata) 39 { 40 const unsigned int tid = get_local_id(0), bid = get_group_id(0), blockSize = get_local_size(0); 41 const unsigned int index = bid * (blockSize * 2) + tid; 42 unsigned int s; 43 44 sdata[tid] = input[index] + input[index + blockSize];// 读入局部内存时就进行一次归约 45 barrier(CLK_LOCAL_MEM_FENCE); 46 47 // 两种写法,用一种就够 48 // 1、不手动展开循环,仍然有工作项浪费的问题 49 for (s = blockSize / 2; s > 0; s >>= 1) 50 { 51 if (tid < s) 52 sdata[tid] += sdata[tid + s]; 53 barrier(CLK_LOCAL_MEM_FENCE); 54 } 55 // 2、手动展开最后的循环 56 for (s = blockSize / 2; s > 32; s >>= 1)// BUG:如果从 64 开始手工归约,在这一行有且仅有一个工作项会算出 640 = 512 + 128 来,其他行却没问题 57 { 58 if (tid < s) 59 sdata[tid] += sdata[tid + s]; 60 barrier(CLK_LOCAL_MEM_FENCE); 61 } 62 if (tid < 32) // 手动展开最后的归约,注意同步,书中源代码中没有同步,计算结果是错的 63 { 64 if (blockSize >= 64) 65 sdata[tid] += sdata[tid + 32]; 66 barrier(CLK_LOCAL_MEM_FENCE); 67 if (blockSize >= 32) 68 sdata[tid] += sdata[tid + 16]; 69 barrier(CLK_LOCAL_MEM_FENCE); 70 if (blockSize >= 16) 71 sdata[tid] += sdata[tid + 8]; 72 barrier(CLK_LOCAL_MEM_FENCE); 73 if (blockSize >= 8) 74 sdata[tid] += sdata[tid + 4]; 75 barrier(CLK_LOCAL_MEM_FENCE); 76 if (blockSize >= 4) 77 sdata[tid] += sdata[tid + 2]; 78 barrier(CLK_LOCAL_MEM_FENCE); 79 if (blockSize >= 2) 80 sdata[tid] += sdata[tid + 1]; 81 barrier(CLK_LOCAL_MEM_FENCE); 82 } 83 84 if (tid == 0) 85 output[bid] = sdata[0]; 86 }
1 // main.c 2 #include <stdio.h> 3 #include <stdlib.h> 4 #include <cl.h> 5 6 #define BLOCK_SIZE 256 // 工作组内最大工作项数为 1024 7 #define DATA_SIZE (BLOCK_SIZE * 1024) // 一维最大工作组数为1024 8 9 const char *sourceText = "D:/Code/OpenCL/OpenCLProjectTemp/OpenCLProjectTemp/kernel.cl"; 10 11 int readText(const char* kernelPath, char **pcode)// 读取文本文件放入 pcode,返回字符串长度 12 { 13 FILE *fp; 14 int size; 15 //printf("<readText> File: %s\n", kernelPath); 16 fopen_s(&fp, kernelPath, "rb"); 17 if (!fp) 18 { 19 printf("Open kernel file failed\n"); 20 getchar(); 21 exit(-1); 22 } 23 if (fseek(fp, 0, SEEK_END) != 0) 24 { 25 printf("Seek end of file failed\n"); 26 getchar(); 27 exit(-1); 28 } 29 if ((size = ftell(fp)) < 0) 30 { 31 printf("Get file position failed\n"); 32 getchar(); 33 exit(-1); 34 } 35 rewind(fp); 36 if ((*pcode = (char *)malloc(size + 1)) == NULL) 37 { 38 printf("Allocate space failed\n"); 39 getchar(); 40 exit(-1); 41 } 42 fread(*pcode, 1, size, fp); 43 (*pcode)[size] = ‘\0‘; 44 fclose(fp); 45 return size + 1; 46 } 47 48 int main() 49 { 50 cl_int status; 51 cl_uint nPlatform; 52 clGetPlatformIDs(0, NULL, &nPlatform); 53 cl_platform_id *listPlatform = (cl_platform_id*)malloc(nPlatform * sizeof(cl_platform_id)); 54 clGetPlatformIDs(nPlatform, listPlatform, NULL); 55 cl_uint nDevice; 56 clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, 0, NULL, &nDevice); 57 cl_device_id *listDevice = (cl_device_id*)malloc(nDevice * sizeof(cl_device_id)); 58 clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_ALL, nDevice, listDevice, NULL); 59 cl_context context = clCreateContext(NULL, nDevice, listDevice, NULL, NULL, &status); 60 cl_command_queue queue = clCreateCommandQueue(context, listDevice[0], CL_QUEUE_PROFILING_ENABLE, &status); 61 62 //const unsigned int nGroup = DATA_SIZE / BLOCK_SIZE; // reduce01 使用 63 const unsigned int nGroup = DATA_SIZE / BLOCK_SIZE / 2; // reduce02 使用 64 int *hostA = (cl_int*)malloc(sizeof(cl_int) * DATA_SIZE); 65 int *hostB = (cl_int*)malloc(sizeof(cl_int) * nGroup); 66 int i; 67 unsigned long refSum; 68 srand(97); 69 for (i = 0, refSum = 0L; i < DATA_SIZE; refSum += (hostA[i++] = 1));// rand())); 70 memset(hostB, 0, sizeof(int) * nGroup); 71 cl_mem deviceA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_int) * DATA_SIZE, hostA, &status); 72 cl_mem deviceB = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_int) * nGroup, NULL, &status); 73 74 char *code; 75 size_t codeLength = readText(sourceText, &code); 76 cl_program program = clCreateProgramWithSource(context, 1, (const char**)&code, &codeLength, &status); 77 status = clBuildProgram(program, nDevice, listDevice, NULL, NULL, NULL); 78 if (status) 79 { 80 char info[10000]; 81 clGetProgramBuildInfo(program, listDevice[0], CL_PROGRAM_BUILD_LOG, 10000, info, NULL); 82 printf("\n%s\n", info); 83 } 84 //cl_kernel kernel = clCreateKernel(program, "reduce01", &status); 85 cl_kernel kernel = clCreateKernel(program, "reduce02", &status); 86 87 clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&deviceA); 88 clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&deviceB); 89 clSetKernelArg(kernel, 2, BLOCK_SIZE * sizeof(cl_int), NULL); 90 91 size_t globalSize = DATA_SIZE, localSize = BLOCK_SIZE; 92 cl_event ev; 93 //cl_ulong startTime, endTime; 94 status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, &ev); 95 clFinish(queue); 96 //clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, NULL); // 不启用计时,因为一趟归约时间太短 97 //clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &endTime, NULL); 98 //printf("Time:%lu.%lu\n", (endTime - startTime) / 1000000000, (endTime - startTime) % 1000000000); 99 100 clEnqueueReadBuffer(queue, deviceB, CL_TRUE, 0, sizeof(cl_int) * nGroup, hostB, 0, NULL, NULL); 101 for (i = 0; i < nGroup; refSum -= hostB[i++]); 102 printf("Result %s.\n", (refSum == 0) ? "correct" : "error"); 103 104 free(hostA); 105 free(hostB); 106 free(code); 107 free(listPlatform); 108 free(listDevice); 109 clReleaseContext(context); 110 clReleaseCommandQueue(queue); 111 clReleaseProgram(program); 112 clReleaseKernel(kernel); 113 clReleaseEvent(ev); 114 clReleaseMemObject(deviceA); 115 clReleaseMemObject(deviceB); 116 getchar(); 117 return 0; 118 }
● 输出结果
Result correct.
原文:https://www.cnblogs.com/cuancuancuanhao/p/9148224.html