opencl(二十五)----双调排序

时间：2020-01-05 19:11:39 阅读：59 评论：0 收藏：0 [点我收藏+]

双调排序

参考：《opencl实战》

主机程序：

// 获取设备

// 获取设备
cl_device_id create_device() {

   cl_platform_id platform;
   cl_device_id dev;
   int err;

   /* Identify a platform */
   err = clGetPlatformIDs(1, &platform, NULL);
   if(err < 0) {
      perror("Couldn‘t identify a platform");
      exit(1);
   } 

   /* Access a device */
   err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
   if(err == CL_DEVICE_NOT_FOUND) {
      err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL);
   }
   if(err < 0) {
      perror("Couldn‘t access any devices");
      exit(1);   
   }

   return dev;
}

// 创建并编译cl_program

 1 // 创建并编译cl_program
 2 cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {
 3 
 4    cl_program program;
 5    FILE *program_handle;
 6    char *program_buffer, *program_log;
 7    size_t program_size, log_size;
 8    int err;
 9 
10    /* Read program file and place content into buffer */
11    program_handle = fopen(filename, "r");
12    if(program_handle == NULL) {
13       perror("Couldn‘t find the program file");
14       exit(1);
15    }
16    fseek(program_handle, 0, SEEK_END);
17    program_size = ftell(program_handle);
18    rewind(program_handle);
19    program_buffer = (char*)malloc(program_size + 1);
20    program_buffer[program_size] = ‘\0‘;
21    fread(program_buffer, sizeof(char), program_size, program_handle);
22    fclose(program_handle);
23 
24    /* Create program from file */
25    program = clCreateProgramWithSource(ctx, 1, 
26       (const char**)&program_buffer, &program_size, &err);
27    if(err < 0) {
28       perror("Couldn‘t create the program");
29       exit(1);
30    }
31    free(program_buffer);
32 
33    /* Build program */
34    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
35    if(err < 0) {
36 
37       /* Find size of log and print to std output */
38       clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 
39             0, NULL, &log_size);
40       program_log = (char*) malloc(log_size + 1);
41       program_log[log_size] = ‘\0‘;
42       clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 
43             log_size + 1, program_log, NULL);
44       printf("%s\n", program_log);
45       free(program_log);
46       exit(1);
47    }
48 
49    return program;
50 }

View Code

// main

#禁止不安全的错误警告
#define _CRT_SECURE_NO_WARNINGS  
#define PROGRAM_FILE       "bsort.cl"
#define BSORT_INIT         "bsort_init"
#define BSORT_STAGE_0      "bsort_stage_0"
#define BSORT_STAGE_N      "bsort_stage_n"
#define BSORT_MERGE        "bsort_merge"
#define BSORT_MERGE_LAST   "bsort_merge_last"

/* Ascending: 0, Descending: -1 */
#define DIRECTION 0
#define NUM_FLOATS 1048576

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>       

#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif

/* Find a GPU or CPU associated with the first available platform */
cl_device_id create_device() ;

/* Create program from a file and compile it */
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) ;
int main() {

   /* Host/device data structures */
   cl_device_id device;
   cl_context context;
   cl_command_queue queue;
   cl_program program;
   cl_kernel kernel_init, kernel_stage_0, kernel_stage_n, kernel_merge,
         kernel_merge_last;
   cl_int i, err, check, direction;

   /* Data and buffers */
   float data[NUM_FLOATS];
   cl_mem data_buffer;
   cl_uint stage, high_stage, num_stages;
   size_t local_size, global_size;

   /* Initialize data */
   srand(time(NULL));
   for(i=0; i<NUM_FLOATS; i++) {
      data[i] = rand();
   }

   /* Create a device and context */
   device = create_device();
   context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
   if(err < 0) {
      perror("Couldn‘t create a context");
      exit(1);   
   }

   /* Build the program */
   program = build_program(context, device, PROGRAM_FILE);

   /* Create kernels */
   kernel_init = clCreateKernel(program, BSORT_INIT, &err);
   if(err < 0) {
      perror("Couldn‘t create the initial kernel");
      exit(1);   
   };
   kernel_stage_0 = clCreateKernel(program, BSORT_STAGE_0, &err);
   if(err < 0) {
      perror("Couldn‘t create the stage_0 kernel");
      exit(1);   
   };
   kernel_stage_n = clCreateKernel(program, BSORT_STAGE_N, &err);
   if(err < 0) {
      perror("Couldn‘t create the stage_n kernel");
      exit(1);   
   };
   kernel_merge = clCreateKernel(program, BSORT_MERGE, &err);
   if(err < 0) {
      perror("Couldn‘t create the merge kernel");
      exit(1);   
   };
   kernel_merge_last = clCreateKernel(program, BSORT_MERGE_LAST, &err);
   if(err < 0) {
      perror("Couldn‘t create the merge_last kernel");
      exit(1);   
   };

   /* Determine maximum work-group size */
   // 获取工作组中工作项的 数量限制
   err = clGetKernelWorkGroupInfo(kernel_init, device, CL_KERNEL_WORK_GROUP_SIZE,
      sizeof(local_size), &local_size, NULL);
   if(err < 0) {
      perror("Couldn‘t find the maximum work-group size");
      exit(1);   
   };
   local_size = (int)pow(2, trunc(log2(local_size))); //函数 TRUNC 直接去除数字的小数部分


   /* Create buffer */
   data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
         CL_MEM_COPY_HOST_PTR, sizeof(data), data, &err);
   if(err < 0) {
      perror("Couldn‘t create a buffer");
      exit(1);   
   };

   /* Create kernel argument */
   err = clSetKernelArg(kernel_init, 0, sizeof(cl_mem), &data_buffer);
   err |= clSetKernelArg(kernel_stage_0, 0, sizeof(cl_mem), &data_buffer);
   err |= clSetKernelArg(kernel_stage_n, 0, sizeof(cl_mem), &data_buffer);
   err |= clSetKernelArg(kernel_merge, 0, sizeof(cl_mem), &data_buffer);
   err |= clSetKernelArg(kernel_merge_last, 0, sizeof(cl_mem), &data_buffer);
   if(err < 0) {
      printf("Couldn‘t set a kernel argument");
      exit(1);
   };

   /* Create kernel argument */
   err = clSetKernelArg(kernel_init, 1, 8*local_size*sizeof(float), NULL);
   err |= clSetKernelArg(kernel_stage_0, 1, 8*local_size*sizeof(float), NULL);
   err |= clSetKernelArg(kernel_stage_n, 1, 8*local_size*sizeof(float), NULL);
   err |= clSetKernelArg(kernel_merge, 1, 8*local_size*sizeof(float), NULL);
   err |= clSetKernelArg(kernel_merge_last, 1, 8*local_size*sizeof(float), NULL);
   if(err < 0) {
      printf("Couldn‘t set a kernel argument");
      exit(1);
   };

   /* Create a command queue */
   // 创建命令队列
   queue = clCreateCommandQueue(context, device, 0, &err);
   if(err < 0) {
      perror("Couldn‘t create a command queue");
      exit(1);   
   };

   /* Enqueue initial sorting kernel */
   global_size = NUM_FLOATS/8;
   if(global_size < local_size) {
      local_size = global_size;
   }
   err = clEnqueueNDRangeKernel(queue, kernel_init, 1, NULL, &global_size, 
         &local_size, 0, NULL, NULL); 
   if(err < 0) {
      perror("Couldn‘t enqueue the kernel");
      exit(1);   
   }

   /* Execute further stages */
   num_stages = global_size/local_size;
   for(high_stage = 2; high_stage < num_stages; high_stage <<= 1) {

      err = clSetKernelArg(kernel_stage_0, 2, sizeof(int), &high_stage);      
      err |= clSetKernelArg(kernel_stage_n, 3, sizeof(int), &high_stage);
      if(err < 0) {
         printf("Couldn‘t set a kernel argument");
         exit(1);
      };

      for(stage = high_stage; stage > 1; stage >>= 1) {

         err = clSetKernelArg(kernel_stage_n, 2, sizeof(int), &stage);
         if(err < 0) {
            printf("Couldn‘t set a kernel argument");
            exit(1);
         };

         err = clEnqueueNDRangeKernel(queue, kernel_stage_n, 1, NULL, 
               &global_size, &local_size, 0, NULL, NULL); 
         if(err < 0) {
            perror("Couldn‘t enqueue the kernel");
            exit(1);   
         }
      }

      err = clEnqueueNDRangeKernel(queue, kernel_stage_0, 1, NULL, 
            &global_size, &local_size, 0, NULL, NULL); 
      if(err < 0) {
         perror("Couldn‘t enqueue the kernel");
         exit(1);   
      }
   }

   /* Set the sort direction */
   direction = DIRECTION;
   err = clSetKernelArg(kernel_merge, 3, sizeof(int), &direction);
   err |= clSetKernelArg(kernel_merge_last, 2, sizeof(int), &direction);
   if(err < 0) {
      printf("Couldn‘t set a kernel argument");
      exit(1);
   };

   /* Perform the bitonic merge */
   for(stage = num_stages; stage > 1; stage >>= 1) {

      err = clSetKernelArg(kernel_merge, 2, sizeof(int), &stage);
      if(err < 0) {
         printf("Couldn‘t set a kernel argument");
         exit(1);
      };

      err = clEnqueueNDRangeKernel(queue, kernel_merge, 1, NULL, 
            &global_size, &local_size, 0, NULL, NULL); 
      if(err < 0) {
         perror("Couldn‘t enqueue the kernel");
         exit(1);   
      }
   }
   err = clEnqueueNDRangeKernel(queue, kernel_merge_last, 1, NULL, 
         &global_size, &local_size, 0, NULL, NULL); 
   if(err < 0) {
      perror("Couldn‘t enqueue the kernel");
      exit(1);   
   }

   /* Read the result */
   err = clEnqueueReadBuffer(queue, data_buffer, CL_TRUE, 0, 
      sizeof(data), &data, 0, NULL, NULL);
   if(err < 0) {
      perror("Couldn‘t read the buffer");
      exit(1);   
   }

   check = 1;

   /* Check ascending sort */
   if(direction == 0) {
      for(i=1; i<NUM_FLOATS; i++) {
         if(data[i] < data[i-1]) {
            check = 0;
            break;
         }
      }
   }
   /* Check descending sort */
   if(direction == -1) {
      for(i=1; i<NUM_FLOATS; i++) {
         if(data[i] > data[i-1]) {
            check = 0;
            break;
         }
      }
   }

   /* Display check result */
   printf("Local size: %zu\n", local_size);
   printf("Global size: %zu\n", global_size);
   if(check)
      printf("Bitonic sort succeeded.\n");
   else
      printf("Bitonic sort failed.\n");

   /* Deallocate resources */
   clReleaseMemObject(data_buffer);
   clReleaseKernel(kernel_init);
   clReleaseKernel(kernel_stage_0);
   clReleaseKernel(kernel_stage_n);
   clReleaseKernel(kernel_merge);
   clReleaseKernel(kernel_merge_last);
   clReleaseCommandQueue(queue);
   clReleaseProgram(program);
   clReleaseContext(context);
   return 0;
}

核函数：

/* Sort elements within a vector */
#define VECTOR_SORT(input, dir)                                   \
   comp = input < shuffle(input, mask2) ^ dir;                       input = shuffle(input, as_uint4(comp * 2 + add2));                comp = input < shuffle(input, mask1) ^ dir;                       input = shuffle(input, as_uint4(comp + add1));                 
#define VECTOR_SWAP(input1, input2, dir)                          \
   temp = input1;                                                    comp = (input1 < input2 ^ dir) * 4 + add3;                        input1 = shuffle2(input1, input2, as_uint4(comp));                input2 = shuffle2(input2, temp, as_uint4(comp));               
/* Perform initial sort */
__kernel void bsort_init(__global float4 *g_data, __local float4 *l_data) {

   int dir;
   uint id, global_start, size, stride;
   float4 input1, input2, temp;
   int4 comp;

   uint4 mask1 = (uint4)(1, 0, 3, 2);
   uint4 mask2 = (uint4)(2, 3, 0, 1);
   uint4 mask3 = (uint4)(3, 2, 1, 0);

   int4 add1 = (int4)(1, 1, 3, 3);
   int4 add2 = (int4)(2, 3, 2, 3);
   int4 add3 = (int4)(1, 2, 2, 3);

   id = get_local_id(0) * 2;
   global_start = get_group_id(0) * get_local_size(0) * 2 + id;

   input1 = g_data[global_start]; 
   input2 = g_data[global_start+1];

   /* Sort input 1 - ascending */
   comp = input1 < shuffle(input1, mask1);
   input1 = shuffle(input1, as_uint4(comp + add1));
   comp = input1 < shuffle(input1, mask2);
   input1 = shuffle(input1, as_uint4(comp * 2 + add2));
   comp = input1 < shuffle(input1, mask3);
   input1 = shuffle(input1, as_uint4(comp + add3));

   /* Sort input 2 - descending */
   comp = input2 > shuffle(input2, mask1);
   input2 = shuffle(input2, as_uint4(comp + add1));
   comp = input2 > shuffle(input2, mask2);
   input2 = shuffle(input2, as_uint4(comp * 2 + add2));
   comp = input2 > shuffle(input2, mask3);
   input2 = shuffle(input2, as_uint4(comp + add3));     

   /* Swap corresponding elements of input 1 and 2 */
   add3 = (int4)(4, 5, 6, 7);
   dir = get_local_id(0) % 2 * -1;
   temp = input1;
   comp = (input1 < input2 ^ dir) * 4 + add3;
   input1 = shuffle2(input1, input2, as_uint4(comp));
   input2 = shuffle2(input2, temp, as_uint4(comp));

   /* Sort data and store in local memory */
   VECTOR_SORT(input1, dir);
   VECTOR_SORT(input2, dir);
   l_data[id] = input1;
   l_data[id+1] = input2;

   /* Create bitonic set */
   for(size = 2; size < get_local_size(0); size <<= 1) {
      dir = (get_local_id(0)/size & 1) * -1;

      for(stride = size; stride > 1; stride >>= 1) {
         barrier(CLK_LOCAL_MEM_FENCE);
         id = get_local_id(0) + (get_local_id(0)/stride)*stride;
         VECTOR_SWAP(l_data[id], l_data[id + stride], dir)
      }

      barrier(CLK_LOCAL_MEM_FENCE);
      id = get_local_id(0) * 2;
      input1 = l_data[id]; input2 = l_data[id+1];
      temp = input1;
      comp = (input1 < input2 ^ dir) * 4 + add3;
      input1 = shuffle2(input1, input2, as_uint4(comp));
      input2 = shuffle2(input2, temp, as_uint4(comp));
      VECTOR_SORT(input1, dir);
      VECTOR_SORT(input2, dir);
      l_data[id] = input1;
      l_data[id+1] = input2;
   }

   /* Perform bitonic merge */
   dir = (get_group_id(0) % 2) * -1;
   for(stride = get_local_size(0); stride > 1; stride >>= 1) {
      barrier(CLK_LOCAL_MEM_FENCE);
      id = get_local_id(0) + (get_local_id(0)/stride)*stride;
      VECTOR_SWAP(l_data[id], l_data[id + stride], dir)
   }
   barrier(CLK_LOCAL_MEM_FENCE);

   /* Perform final sort */
   id = get_local_id(0) * 2;
   input1 = l_data[id]; input2 = l_data[id+1];
   temp = input1;
   comp = (input1 < input2 ^ dir) * 4 + add3;
   input1 = shuffle2(input1, input2, as_uint4(comp));
   input2 = shuffle2(input2, temp, as_uint4(comp));
   VECTOR_SORT(input1, dir);
   VECTOR_SORT(input2, dir);
   g_data[global_start] = input1;
   g_data[global_start+1] = input2;
}

/* Perform lowest stage of the bitonic sort */
__kernel void bsort_stage_0(__global float4 *g_data, __local float4 *l_data, 
                            uint high_stage) {

   int dir;
   uint id, global_start, stride;
   float4 input1, input2, temp;
   int4 comp;

   uint4 mask1 = (uint4)(1, 0, 3, 2);
   uint4 mask2 = (uint4)(2, 3, 0, 1);
   uint4 mask3 = (uint4)(3, 2, 1, 0);

   int4 add1 = (int4)(1, 1, 3, 3);
   int4 add2 = (int4)(2, 3, 2, 3);
   int4 add3 = (int4)(4, 5, 6, 7);

   /* Determine data location in global memory */
   id = get_local_id(0);
   dir = (get_group_id(0)/high_stage & 1) * -1;
   global_start = get_group_id(0) * get_local_size(0) * 2 + id;

   /* Perform initial swap */
   input1 = g_data[global_start];
   input2 = g_data[global_start + get_local_size(0)];
   comp = (input1 < input2 ^ dir) * 4 + add3;
   l_data[id] = shuffle2(input1, input2, as_uint4(comp));
   l_data[id + get_local_size(0)] = shuffle2(input2, input1, as_uint4(comp));

   /* Perform bitonic merge */
   for(stride = get_local_size(0)/2; stride > 1; stride >>= 1) {
      barrier(CLK_LOCAL_MEM_FENCE);
      id = get_local_id(0) + (get_local_id(0)/stride)*stride;
      VECTOR_SWAP(l_data[id], l_data[id + stride], dir)
   }
   barrier(CLK_LOCAL_MEM_FENCE);

   /* Perform final sort */
   id = get_local_id(0) * 2;
   input1 = l_data[id]; input2 = l_data[id+1];
   temp = input1;
   comp = (input1 < input2 ^ dir) * 4 + add3;
   input1 = shuffle2(input1, input2, as_uint4(comp));
   input2 = shuffle2(input2, temp, as_uint4(comp));
   VECTOR_SORT(input1, dir);
   VECTOR_SORT(input2, dir);

   /* Store output in global memory */
   g_data[global_start + get_local_id(0)] = input1;
   g_data[global_start + get_local_id(0) + 1] = input2;
}

/* Perform successive stages of the bitonic sort */
__kernel void bsort_stage_n(__global float4 *g_data, __local float4 *l_data, 
                            uint stage, uint high_stage) {

   int dir;
   float4 input1, input2;
   int4 comp, add;
   uint global_start, global_offset;

   add = (int4)(4, 5, 6, 7);

   /* Determine location of data in global memory */
   dir = (get_group_id(0)/high_stage & 1) * -1;
   global_start = (get_group_id(0) + (get_group_id(0)/stage)*stage) *
                   get_local_size(0) + get_local_id(0);
   global_offset = stage * get_local_size(0);

   /* Perform swap */
   input1 = g_data[global_start];
   input2 = g_data[global_start + global_offset];
   comp = (input1 < input2 ^ dir) * 4 + add;
   g_data[global_start] = shuffle2(input1, input2, as_uint4(comp));
   g_data[global_start + global_offset] = shuffle2(input2, input1, as_uint4(comp));
}

/* Sort the bitonic set */
__kernel void bsort_merge(__global float4 *g_data, __local float4 *l_data, uint stage, int dir) {

   float4 input1, input2;
   int4 comp, add;
   uint global_start, global_offset;

   add = (int4)(4, 5, 6, 7);

   /* Determine location of data in global memory */
   global_start = (get_group_id(0) + (get_group_id(0)/stage)*stage) *
                   get_local_size(0) + get_local_id(0);
   global_offset = stage * get_local_size(0);

   /* Perform swap */
   input1 = g_data[global_start];
   input2 = g_data[global_start + global_offset];
   comp = (input1 < input2 ^ dir) * 4 + add;
   g_data[global_start] = shuffle2(input1, input2, as_uint4(comp));
   g_data[global_start + global_offset] = shuffle2(input2, input1, as_uint4(comp));
}

/* Perform final step of the bitonic merge */
__kernel void bsort_merge_last(__global float4 *g_data, __local float4 *l_data, int dir) {

   uint id, global_start, stride;
   float4 input1, input2, temp;
   int4 comp;

   uint4 mask1 = (uint4)(1, 0, 3, 2);
   uint4 mask2 = (uint4)(2, 3, 0, 1);
   uint4 mask3 = (uint4)(3, 2, 1, 0);

   int4 add1 = (int4)(1, 1, 3, 3);
   int4 add2 = (int4)(2, 3, 2, 3);
   int4 add3 = (int4)(4, 5, 6, 7);

   /* Determine location of data in global memory */
   id = get_local_id(0);
   global_start = get_group_id(0) * get_local_size(0) * 2 + id;

   /* Perform initial swap */
   input1 = g_data[global_start];
   input2 = g_data[global_start + get_local_size(0)];
   comp = (input1 < input2 ^ dir) * 4 + add3;
   l_data[id] = shuffle2(input1, input2, as_uint4(comp));
   l_data[id + get_local_size(0)] = shuffle2(input2, input1, as_uint4(comp));

   /* Perform bitonic merge */
   for(stride = get_local_size(0)/2; stride > 1; stride >>= 1) {
      barrier(CLK_LOCAL_MEM_FENCE);
      id = get_local_id(0) + (get_local_id(0)/stride)*stride;
      VECTOR_SWAP(l_data[id], l_data[id + stride], dir)
   }
   barrier(CLK_LOCAL_MEM_FENCE);

   /* Perform final sort */
   id = get_local_id(0) * 2;
   input1 = l_data[id]; input2 = l_data[id+1];
   temp = input1;
   comp = (input1 < input2 ^ dir) * 4 + add3;
   input1 = shuffle2(input1, input2, as_uint4(comp));
   input2 = shuffle2(input2, temp, as_uint4(comp));
   VECTOR_SORT(input1, dir);
   VECTOR_SORT(input2, dir);

   /* Store the result to global memory */
   g_data[global_start + get_local_id(0)] = input1;
   g_data[global_start + get_local_id(0) + 1] = input2;
}

opencl(二十五)----双调排序

原文：https://www.cnblogs.com/feihu-h/p/12107714.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年09月23日 (328)
2021年09月24日 (313)
2021年09月17日 (191)
2021年09月15日 (369)
2021年09月16日 (411)
2021年09月13日 (439)
2021年09月11日 (398)
2021年09月12日 (393)
2021年09月10日 (160)
2021年09月08日 (222)