// kernel __kernel void transpose(__global float4 *g_mat, __local float4 *l_mat, uint size) { __global float4 *src, *dst; /* Determine row and column location */ int col = get_global_id(0); int row = 0; while(col >= size) { col -= size--; row++; } col += row; size += row; /* Read source block into local memory */ src = g_mat + row * size * 4 + col; l_mat += get_local_id(0)*8; l_mat[0] = src[0]; l_mat[1] = src[size]; l_mat[2] = src[2*size]; l_mat[3] = src[3*size]; /* Process block on diagonal */ if(row == col) { src[0] = (float4)(l_mat[0].x, l_mat[1].x, l_mat[2].x, l_mat[3].x); src[size] = (float4)(l_mat[0].y, l_mat[1].y, l_mat[2].y, l_mat[3].y); src[2*size] = (float4)(l_mat[0].z, l_mat[1].z, l_mat[2].z, l_mat[3].z); src[3*size] = (float4)(l_mat[0].w, l_mat[1].w, l_mat[2].w, l_mat[3].w); } /* Process block off diagonal */ else { /* Read destination block into local memory */ dst = g_mat + col * size * 4 + row; l_mat[4] = dst[0]; l_mat[5] = dst[size]; l_mat[6] = dst[2*size]; l_mat[7] = dst[3*size]; /* Set elements of destination block */ dst[0] = (float4)(l_mat[0].x, l_mat[1].x, l_mat[2].x, l_mat[3].x); dst[size] = (float4)(l_mat[0].y, l_mat[1].y, l_mat[2].y, l_mat[3].y); dst[2*size] = (float4)(l_mat[0].z, l_mat[1].z, l_mat[2].z, l_mat[3].z); dst[3*size] = (float4)(l_mat[0].w, l_mat[1].w, l_mat[2].w, l_mat[3].w); /* Set elements of source block */ src[0] = (float4)(l_mat[4].x, l_mat[5].x, l_mat[6].x, l_mat[7].x); src[size] = (float4)(l_mat[4].y, l_mat[5].y, l_mat[6].y, l_mat[7].y); src[2*size] = (float4)(l_mat[4].z, l_mat[5].z, l_mat[6].z, l_mat[7].z); src[3*size] = (float4)(l_mat[4].w, l_mat[5].w, l_mat[6].w, l_mat[7].w); } }
// 主机程序
#define _CRT_SECURE_NO_WARNINGS #define PROGRAM_FILE "transpose.cl" #define KERNEL_FUNC "transpose" #define MATRIX_DIM 64 #include <stdio.h> #include <stdlib.h> #include <string.h> #ifdef MAC #include <OpenCL/cl.h> #else #include <CL/cl.h> #endif /* Find a GPU or CPU associated with the first available platform */ cl_device_id create_device() { cl_platform_id platform; cl_device_id dev; int err; /* Identify a platform */ err = clGetPlatformIDs(1, &platform, NULL); if(err < 0) { perror("Couldn‘t identify a platform"); exit(1); } /* Access a device */ err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL); if(err == CL_DEVICE_NOT_FOUND) { err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL); } if(err < 0) { perror("Couldn‘t access any devices"); exit(1); } return dev; } /* Create program from a file and compile it */ cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) { cl_program program; FILE *program_handle; char *program_buffer, *program_log; size_t program_size, log_size; int err; /* Read program file and place content into buffer */ program_handle = fopen(filename, "r"); if(program_handle == NULL) { perror("Couldn‘t find the program file"); exit(1); } fseek(program_handle, 0, SEEK_END); program_size = ftell(program_handle); rewind(program_handle); program_buffer = (char*)malloc(program_size + 1); program_buffer[program_size] = ‘\0‘; fread(program_buffer, sizeof(char), program_size, program_handle); fclose(program_handle); /* Create program from file */ program = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err); if(err < 0) { perror("Couldn‘t create the program"); exit(1); } free(program_buffer); /* Build program */ err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if(err < 0) { /* Find size of log and print to std output */ clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); program_log = (char*) malloc(log_size + 1); program_log[log_size] = ‘\0‘; clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL); printf("%s\n", program_log); free(program_log); exit(1); } return program; } int main() { /* Host/device data structures */ cl_device_id device; cl_context context; cl_command_queue queue; cl_program program; cl_kernel kernel; size_t global_size; cl_ulong mem_size; int i, j, err, check; /* Data and buffers */ cl_uint matrix_dim; float data[MATRIX_DIM][MATRIX_DIM]; cl_mem data_buffer; /* Initialize data */ for(i=0; i<MATRIX_DIM; i++) { for(j=0; j<MATRIX_DIM; j++) { data[i][j] = 1.0f * i * MATRIX_DIM + j; } } /* Create a device and context */ device = create_device(); context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); if(err < 0) { perror("Couldn‘t create a context"); exit(1); } /* Build the program */ program = build_program(context, device, PROGRAM_FILE); /* Create a kernel */ kernel = clCreateKernel(program, KERNEL_FUNC, &err); if(err < 0) { perror("Couldn‘t create a kernel"); exit(1); }; /* Create buffer to hold matrix */ data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(data), data, &err); if(err < 0) { perror("Couldn‘t create a buffer"); exit(1); }; /* Determine execution parameters */ global_size = (MATRIX_DIM/4 * (MATRIX_DIM/4 + 1))/2; clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(mem_size), &mem_size, NULL); /* Create kernel arguments */ matrix_dim = MATRIX_DIM/4; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_buffer); err |= clSetKernelArg(kernel, 1, (size_t)mem_size, NULL); err |= clSetKernelArg(kernel, 2, sizeof(matrix_dim), &matrix_dim); if(err < 0) { printf("Couldn‘t set a kernel argument"); exit(1); }; /* Create a command queue */ queue = clCreateCommandQueue(context, device, 0, &err); if(err < 0) { perror("Couldn‘t create a command queue"); exit(1); }; /* Enqueue kernel */ err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, NULL, 0, NULL, NULL); if(err < 0) { perror("Couldn‘t enqueue the kernel"); printf("Error: %d\n", err); exit(1); } /* Read the result */ err = clEnqueueReadBuffer(queue, data_buffer, CL_TRUE, 0, sizeof(data), data, 0, NULL, NULL); if(err < 0) { perror("Couldn‘t read the buffer"); exit(1); } /* Check data */ check = 1; for(i=0; i<MATRIX_DIM; i++) { for(j=0; j<MATRIX_DIM; j++) { if(data[i][j] != 1.0*j*MATRIX_DIM+i) { check = 0; break; } } } if(check) printf("Transpose check succeeded.\n"); else printf("Transpose check failed.\n"); /* Deallocate resources */ clReleaseMemObject(data_buffer); clReleaseKernel(kernel); clReleaseCommandQueue(queue); clReleaseProgram(program); clReleaseContext(context); return 0; }
原文:https://www.cnblogs.com/feihu-h/p/12107384.html