双调排序
参考:《opencl实战》
// 获取设备
// 获取设备 cl_device_id create_device() { cl_platform_id platform; cl_device_id dev; int err; /* Identify a platform */ err = clGetPlatformIDs(1, &platform, NULL); if(err < 0) { perror("Couldn‘t identify a platform"); exit(1); } /* Access a device */ err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL); if(err == CL_DEVICE_NOT_FOUND) { err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL); } if(err < 0) { perror("Couldn‘t access any devices"); exit(1); } return dev; }
// 创建并编译cl_program
1 // 创建并编译cl_program 2 cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) { 3 4 cl_program program; 5 FILE *program_handle; 6 char *program_buffer, *program_log; 7 size_t program_size, log_size; 8 int err; 9 10 /* Read program file and place content into buffer */ 11 program_handle = fopen(filename, "r"); 12 if(program_handle == NULL) { 13 perror("Couldn‘t find the program file"); 14 exit(1); 15 } 16 fseek(program_handle, 0, SEEK_END); 17 program_size = ftell(program_handle); 18 rewind(program_handle); 19 program_buffer = (char*)malloc(program_size + 1); 20 program_buffer[program_size] = ‘\0‘; 21 fread(program_buffer, sizeof(char), program_size, program_handle); 22 fclose(program_handle); 23 24 /* Create program from file */ 25 program = clCreateProgramWithSource(ctx, 1, 26 (const char**)&program_buffer, &program_size, &err); 27 if(err < 0) { 28 perror("Couldn‘t create the program"); 29 exit(1); 30 } 31 free(program_buffer); 32 33 /* Build program */ 34 err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); 35 if(err < 0) { 36 37 /* Find size of log and print to std output */ 38 clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 39 0, NULL, &log_size); 40 program_log = (char*) malloc(log_size + 1); 41 program_log[log_size] = ‘\0‘; 42 clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 43 log_size + 1, program_log, NULL); 44 printf("%s\n", program_log); 45 free(program_log); 46 exit(1); 47 } 48 49 return program; 50 }
// main
#禁止不安全的错误警告 #define _CRT_SECURE_NO_WARNINGS #define PROGRAM_FILE "bsort.cl" #define BSORT_INIT "bsort_init" #define BSORT_STAGE_0 "bsort_stage_0" #define BSORT_STAGE_N "bsort_stage_n" #define BSORT_MERGE "bsort_merge" #define BSORT_MERGE_LAST "bsort_merge_last" /* Ascending: 0, Descending: -1 */ #define DIRECTION 0 #define NUM_FLOATS 1048576 #include <math.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <time.h> #ifdef MAC #include <OpenCL/cl.h> #else #include <CL/cl.h> #endif /* Find a GPU or CPU associated with the first available platform */ cl_device_id create_device() ; /* Create program from a file and compile it */ cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) ; int main() { /* Host/device data structures */ cl_device_id device; cl_context context; cl_command_queue queue; cl_program program; cl_kernel kernel_init, kernel_stage_0, kernel_stage_n, kernel_merge, kernel_merge_last; cl_int i, err, check, direction; /* Data and buffers */ float data[NUM_FLOATS]; cl_mem data_buffer; cl_uint stage, high_stage, num_stages; size_t local_size, global_size; /* Initialize data */ srand(time(NULL)); for(i=0; i<NUM_FLOATS; i++) { data[i] = rand(); } /* Create a device and context */ device = create_device(); context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); if(err < 0) { perror("Couldn‘t create a context"); exit(1); } /* Build the program */ program = build_program(context, device, PROGRAM_FILE); /* Create kernels */ kernel_init = clCreateKernel(program, BSORT_INIT, &err); if(err < 0) { perror("Couldn‘t create the initial kernel"); exit(1); }; kernel_stage_0 = clCreateKernel(program, BSORT_STAGE_0, &err); if(err < 0) { perror("Couldn‘t create the stage_0 kernel"); exit(1); }; kernel_stage_n = clCreateKernel(program, BSORT_STAGE_N, &err); if(err < 0) { perror("Couldn‘t create the stage_n kernel"); exit(1); }; kernel_merge = clCreateKernel(program, BSORT_MERGE, &err); if(err < 0) { perror("Couldn‘t create the merge kernel"); exit(1); }; kernel_merge_last = clCreateKernel(program, BSORT_MERGE_LAST, &err); if(err < 0) { perror("Couldn‘t create the merge_last kernel"); exit(1); }; /* Determine maximum work-group size */ // 获取工作组中工作项的 数量限制 err = clGetKernelWorkGroupInfo(kernel_init, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local_size), &local_size, NULL); if(err < 0) { perror("Couldn‘t find the maximum work-group size"); exit(1); }; local_size = (int)pow(2, trunc(log2(local_size))); //函数 TRUNC 直接去除数字的小数部分 /* Create buffer */ data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(data), data, &err); if(err < 0) { perror("Couldn‘t create a buffer"); exit(1); }; /* Create kernel argument */ err = clSetKernelArg(kernel_init, 0, sizeof(cl_mem), &data_buffer); err |= clSetKernelArg(kernel_stage_0, 0, sizeof(cl_mem), &data_buffer); err |= clSetKernelArg(kernel_stage_n, 0, sizeof(cl_mem), &data_buffer); err |= clSetKernelArg(kernel_merge, 0, sizeof(cl_mem), &data_buffer); err |= clSetKernelArg(kernel_merge_last, 0, sizeof(cl_mem), &data_buffer); if(err < 0) { printf("Couldn‘t set a kernel argument"); exit(1); }; /* Create kernel argument */ err = clSetKernelArg(kernel_init, 1, 8*local_size*sizeof(float), NULL); err |= clSetKernelArg(kernel_stage_0, 1, 8*local_size*sizeof(float), NULL); err |= clSetKernelArg(kernel_stage_n, 1, 8*local_size*sizeof(float), NULL); err |= clSetKernelArg(kernel_merge, 1, 8*local_size*sizeof(float), NULL); err |= clSetKernelArg(kernel_merge_last, 1, 8*local_size*sizeof(float), NULL); if(err < 0) { printf("Couldn‘t set a kernel argument"); exit(1); }; /* Create a command queue */ // 创建命令队列 queue = clCreateCommandQueue(context, device, 0, &err); if(err < 0) { perror("Couldn‘t create a command queue"); exit(1); }; /* Enqueue initial sorting kernel */ global_size = NUM_FLOATS/8; if(global_size < local_size) { local_size = global_size; } err = clEnqueueNDRangeKernel(queue, kernel_init, 1, NULL, &global_size, &local_size, 0, NULL, NULL); if(err < 0) { perror("Couldn‘t enqueue the kernel"); exit(1); } /* Execute further stages */ num_stages = global_size/local_size; for(high_stage = 2; high_stage < num_stages; high_stage <<= 1) { err = clSetKernelArg(kernel_stage_0, 2, sizeof(int), &high_stage); err |= clSetKernelArg(kernel_stage_n, 3, sizeof(int), &high_stage); if(err < 0) { printf("Couldn‘t set a kernel argument"); exit(1); }; for(stage = high_stage; stage > 1; stage >>= 1) { err = clSetKernelArg(kernel_stage_n, 2, sizeof(int), &stage); if(err < 0) { printf("Couldn‘t set a kernel argument"); exit(1); }; err = clEnqueueNDRangeKernel(queue, kernel_stage_n, 1, NULL, &global_size, &local_size, 0, NULL, NULL); if(err < 0) { perror("Couldn‘t enqueue the kernel"); exit(1); } } err = clEnqueueNDRangeKernel(queue, kernel_stage_0, 1, NULL, &global_size, &local_size, 0, NULL, NULL); if(err < 0) { perror("Couldn‘t enqueue the kernel"); exit(1); } } /* Set the sort direction */ direction = DIRECTION; err = clSetKernelArg(kernel_merge, 3, sizeof(int), &direction); err |= clSetKernelArg(kernel_merge_last, 2, sizeof(int), &direction); if(err < 0) { printf("Couldn‘t set a kernel argument"); exit(1); }; /* Perform the bitonic merge */ for(stage = num_stages; stage > 1; stage >>= 1) { err = clSetKernelArg(kernel_merge, 2, sizeof(int), &stage); if(err < 0) { printf("Couldn‘t set a kernel argument"); exit(1); }; err = clEnqueueNDRangeKernel(queue, kernel_merge, 1, NULL, &global_size, &local_size, 0, NULL, NULL); if(err < 0) { perror("Couldn‘t enqueue the kernel"); exit(1); } } err = clEnqueueNDRangeKernel(queue, kernel_merge_last, 1, NULL, &global_size, &local_size, 0, NULL, NULL); if(err < 0) { perror("Couldn‘t enqueue the kernel"); exit(1); } /* Read the result */ err = clEnqueueReadBuffer(queue, data_buffer, CL_TRUE, 0, sizeof(data), &data, 0, NULL, NULL); if(err < 0) { perror("Couldn‘t read the buffer"); exit(1); } check = 1; /* Check ascending sort */ if(direction == 0) { for(i=1; i<NUM_FLOATS; i++) { if(data[i] < data[i-1]) { check = 0; break; } } } /* Check descending sort */ if(direction == -1) { for(i=1; i<NUM_FLOATS; i++) { if(data[i] > data[i-1]) { check = 0; break; } } } /* Display check result */ printf("Local size: %zu\n", local_size); printf("Global size: %zu\n", global_size); if(check) printf("Bitonic sort succeeded.\n"); else printf("Bitonic sort failed.\n"); /* Deallocate resources */ clReleaseMemObject(data_buffer); clReleaseKernel(kernel_init); clReleaseKernel(kernel_stage_0); clReleaseKernel(kernel_stage_n); clReleaseKernel(kernel_merge); clReleaseKernel(kernel_merge_last); clReleaseCommandQueue(queue); clReleaseProgram(program); clReleaseContext(context); return 0; }
/* Sort elements within a vector */ #define VECTOR_SORT(input, dir) \ comp = input < shuffle(input, mask2) ^ dir; input = shuffle(input, as_uint4(comp * 2 + add2)); comp = input < shuffle(input, mask1) ^ dir; input = shuffle(input, as_uint4(comp + add1)); #define VECTOR_SWAP(input1, input2, dir) \ temp = input1; comp = (input1 < input2 ^ dir) * 4 + add3; input1 = shuffle2(input1, input2, as_uint4(comp)); input2 = shuffle2(input2, temp, as_uint4(comp)); /* Perform initial sort */ __kernel void bsort_init(__global float4 *g_data, __local float4 *l_data) { int dir; uint id, global_start, size, stride; float4 input1, input2, temp; int4 comp; uint4 mask1 = (uint4)(1, 0, 3, 2); uint4 mask2 = (uint4)(2, 3, 0, 1); uint4 mask3 = (uint4)(3, 2, 1, 0); int4 add1 = (int4)(1, 1, 3, 3); int4 add2 = (int4)(2, 3, 2, 3); int4 add3 = (int4)(1, 2, 2, 3); id = get_local_id(0) * 2; global_start = get_group_id(0) * get_local_size(0) * 2 + id; input1 = g_data[global_start]; input2 = g_data[global_start+1]; /* Sort input 1 - ascending */ comp = input1 < shuffle(input1, mask1); input1 = shuffle(input1, as_uint4(comp + add1)); comp = input1 < shuffle(input1, mask2); input1 = shuffle(input1, as_uint4(comp * 2 + add2)); comp = input1 < shuffle(input1, mask3); input1 = shuffle(input1, as_uint4(comp + add3)); /* Sort input 2 - descending */ comp = input2 > shuffle(input2, mask1); input2 = shuffle(input2, as_uint4(comp + add1)); comp = input2 > shuffle(input2, mask2); input2 = shuffle(input2, as_uint4(comp * 2 + add2)); comp = input2 > shuffle(input2, mask3); input2 = shuffle(input2, as_uint4(comp + add3)); /* Swap corresponding elements of input 1 and 2 */ add3 = (int4)(4, 5, 6, 7); dir = get_local_id(0) % 2 * -1; temp = input1; comp = (input1 < input2 ^ dir) * 4 + add3; input1 = shuffle2(input1, input2, as_uint4(comp)); input2 = shuffle2(input2, temp, as_uint4(comp)); /* Sort data and store in local memory */ VECTOR_SORT(input1, dir); VECTOR_SORT(input2, dir); l_data[id] = input1; l_data[id+1] = input2; /* Create bitonic set */ for(size = 2; size < get_local_size(0); size <<= 1) { dir = (get_local_id(0)/size & 1) * -1; for(stride = size; stride > 1; stride >>= 1) { barrier(CLK_LOCAL_MEM_FENCE); id = get_local_id(0) + (get_local_id(0)/stride)*stride; VECTOR_SWAP(l_data[id], l_data[id + stride], dir) } barrier(CLK_LOCAL_MEM_FENCE); id = get_local_id(0) * 2; input1 = l_data[id]; input2 = l_data[id+1]; temp = input1; comp = (input1 < input2 ^ dir) * 4 + add3; input1 = shuffle2(input1, input2, as_uint4(comp)); input2 = shuffle2(input2, temp, as_uint4(comp)); VECTOR_SORT(input1, dir); VECTOR_SORT(input2, dir); l_data[id] = input1; l_data[id+1] = input2; } /* Perform bitonic merge */ dir = (get_group_id(0) % 2) * -1; for(stride = get_local_size(0); stride > 1; stride >>= 1) { barrier(CLK_LOCAL_MEM_FENCE); id = get_local_id(0) + (get_local_id(0)/stride)*stride; VECTOR_SWAP(l_data[id], l_data[id + stride], dir) } barrier(CLK_LOCAL_MEM_FENCE); /* Perform final sort */ id = get_local_id(0) * 2; input1 = l_data[id]; input2 = l_data[id+1]; temp = input1; comp = (input1 < input2 ^ dir) * 4 + add3; input1 = shuffle2(input1, input2, as_uint4(comp)); input2 = shuffle2(input2, temp, as_uint4(comp)); VECTOR_SORT(input1, dir); VECTOR_SORT(input2, dir); g_data[global_start] = input1; g_data[global_start+1] = input2; } /* Perform lowest stage of the bitonic sort */ __kernel void bsort_stage_0(__global float4 *g_data, __local float4 *l_data, uint high_stage) { int dir; uint id, global_start, stride; float4 input1, input2, temp; int4 comp; uint4 mask1 = (uint4)(1, 0, 3, 2); uint4 mask2 = (uint4)(2, 3, 0, 1); uint4 mask3 = (uint4)(3, 2, 1, 0); int4 add1 = (int4)(1, 1, 3, 3); int4 add2 = (int4)(2, 3, 2, 3); int4 add3 = (int4)(4, 5, 6, 7); /* Determine data location in global memory */ id = get_local_id(0); dir = (get_group_id(0)/high_stage & 1) * -1; global_start = get_group_id(0) * get_local_size(0) * 2 + id; /* Perform initial swap */ input1 = g_data[global_start]; input2 = g_data[global_start + get_local_size(0)]; comp = (input1 < input2 ^ dir) * 4 + add3; l_data[id] = shuffle2(input1, input2, as_uint4(comp)); l_data[id + get_local_size(0)] = shuffle2(input2, input1, as_uint4(comp)); /* Perform bitonic merge */ for(stride = get_local_size(0)/2; stride > 1; stride >>= 1) { barrier(CLK_LOCAL_MEM_FENCE); id = get_local_id(0) + (get_local_id(0)/stride)*stride; VECTOR_SWAP(l_data[id], l_data[id + stride], dir) } barrier(CLK_LOCAL_MEM_FENCE); /* Perform final sort */ id = get_local_id(0) * 2; input1 = l_data[id]; input2 = l_data[id+1]; temp = input1; comp = (input1 < input2 ^ dir) * 4 + add3; input1 = shuffle2(input1, input2, as_uint4(comp)); input2 = shuffle2(input2, temp, as_uint4(comp)); VECTOR_SORT(input1, dir); VECTOR_SORT(input2, dir); /* Store output in global memory */ g_data[global_start + get_local_id(0)] = input1; g_data[global_start + get_local_id(0) + 1] = input2; } /* Perform successive stages of the bitonic sort */ __kernel void bsort_stage_n(__global float4 *g_data, __local float4 *l_data, uint stage, uint high_stage) { int dir; float4 input1, input2; int4 comp, add; uint global_start, global_offset; add = (int4)(4, 5, 6, 7); /* Determine location of data in global memory */ dir = (get_group_id(0)/high_stage & 1) * -1; global_start = (get_group_id(0) + (get_group_id(0)/stage)*stage) * get_local_size(0) + get_local_id(0); global_offset = stage * get_local_size(0); /* Perform swap */ input1 = g_data[global_start]; input2 = g_data[global_start + global_offset]; comp = (input1 < input2 ^ dir) * 4 + add; g_data[global_start] = shuffle2(input1, input2, as_uint4(comp)); g_data[global_start + global_offset] = shuffle2(input2, input1, as_uint4(comp)); } /* Sort the bitonic set */ __kernel void bsort_merge(__global float4 *g_data, __local float4 *l_data, uint stage, int dir) { float4 input1, input2; int4 comp, add; uint global_start, global_offset; add = (int4)(4, 5, 6, 7); /* Determine location of data in global memory */ global_start = (get_group_id(0) + (get_group_id(0)/stage)*stage) * get_local_size(0) + get_local_id(0); global_offset = stage * get_local_size(0); /* Perform swap */ input1 = g_data[global_start]; input2 = g_data[global_start + global_offset]; comp = (input1 < input2 ^ dir) * 4 + add; g_data[global_start] = shuffle2(input1, input2, as_uint4(comp)); g_data[global_start + global_offset] = shuffle2(input2, input1, as_uint4(comp)); } /* Perform final step of the bitonic merge */ __kernel void bsort_merge_last(__global float4 *g_data, __local float4 *l_data, int dir) { uint id, global_start, stride; float4 input1, input2, temp; int4 comp; uint4 mask1 = (uint4)(1, 0, 3, 2); uint4 mask2 = (uint4)(2, 3, 0, 1); uint4 mask3 = (uint4)(3, 2, 1, 0); int4 add1 = (int4)(1, 1, 3, 3); int4 add2 = (int4)(2, 3, 2, 3); int4 add3 = (int4)(4, 5, 6, 7); /* Determine location of data in global memory */ id = get_local_id(0); global_start = get_group_id(0) * get_local_size(0) * 2 + id; /* Perform initial swap */ input1 = g_data[global_start]; input2 = g_data[global_start + get_local_size(0)]; comp = (input1 < input2 ^ dir) * 4 + add3; l_data[id] = shuffle2(input1, input2, as_uint4(comp)); l_data[id + get_local_size(0)] = shuffle2(input2, input1, as_uint4(comp)); /* Perform bitonic merge */ for(stride = get_local_size(0)/2; stride > 1; stride >>= 1) { barrier(CLK_LOCAL_MEM_FENCE); id = get_local_id(0) + (get_local_id(0)/stride)*stride; VECTOR_SWAP(l_data[id], l_data[id + stride], dir) } barrier(CLK_LOCAL_MEM_FENCE); /* Perform final sort */ id = get_local_id(0) * 2; input1 = l_data[id]; input2 = l_data[id+1]; temp = input1; comp = (input1 < input2 ^ dir) * 4 + add3; input1 = shuffle2(input1, input2, as_uint4(comp)); input2 = shuffle2(input2, temp, as_uint4(comp)); VECTOR_SORT(input1, dir); VECTOR_SORT(input2, dir); /* Store the result to global memory */ g_data[global_start + get_local_id(0)] = input1; g_data[global_start + get_local_id(0) + 1] = input2; }
原文:https://www.cnblogs.com/feihu-h/p/12107714.html