本案例讲述使用OpenCL计算矩阵乘法:C = A * B 。
设A、B、C分别是大小为N*P、P*M和N*M的矩阵,那么顺序实现的C代码可以如下所示:
// C Function
/*
 * Sequential reference implementation of the matrix product C = A * B.
 * All three matrices are stored contiguously in row-major order.
 *
 *   Ndim  number of rows of A and of C
 *   Mdim  number of columns of B and of C
 *   Pdim  number of columns of A and rows of B (the dimension summed over)
 *   A     Ndim x Pdim input;  element (i, k) is A[i*Pdim + k]
 *   B     Pdim x Mdim input;  element (k, j) is B[k*Mdim + j]
 *   C     Ndim x Mdim output; element (i, j) is C[i*Mdim + j], overwritten
 *
 * If Ndim or Mdim is not positive nothing is written; if only Pdim is not
 * positive every element of C is set to 0.
 */
void mat_mul(int Ndim, int Mdim, int Pdim,
             const float *A, const float *B, float *C)
{
    for (int i = 0; i < Ndim; i++) {
        for (int j = 0; j < Mdim; j++) {
            /* Dot product of row i of A with column j of B. */
            float tmp = 0.0f;
            for (int k = 0; k < Pdim; k++) {
                tmp += A[i*Pdim + k] * B[k*Mdim + j];
            }
            C[i*Mdim + j] = tmp;
        }
    }
}
// OpenCL Kernel Function
/*
 * Computes the matrix product C = A * B, one work-item per element of C.
 * The kernel is launched over a 2-D index space: global id 0 selects the
 * row i of C, global id 1 selects the column j of C.
 *
 *   Ndim  number of rows of A and of C
 *   Mdim  number of columns of B and of C
 *   Pdim  number of columns of A and rows of B (the dimension summed over)
 *   A     Ndim x Pdim input in global memory;  element (i, k) is A[i*Pdim + k]
 *   B     Pdim x Mdim input in global memory;  element (k, j) is B[k*Mdim + j]
 *   C     Ndim x Mdim output in global memory; element (i, j) is C[i*Mdim + j]
 */
__kernel void HelloOpenCL(const int Ndim, const int Mdim, const int Pdim,
                          __global const float *A,
                          __global const float *B,
                          __global float *C)
{
    const int i = get_global_id(0);
    const int j = get_global_id(1);

    /* Work-items whose ids fall outside C must not touch memory. */
    if ((i < Ndim) && (j < Mdim)) {
        /* Single-precision literal: a bare 0.0 is a double constant. */
        float tmp = 0.0f;
        for (int k = 0; k < Pdim; k++) {
            tmp += A[i*Pdim + k] * B[k*Mdim + j];
        }
        C[i*Mdim + j] = tmp;
    }
}

为每个工作项分配一个要计算的乘法矩阵的元素。将针对i,j的外层循环删除,替换为函数调用,查找这两维中对应工作项的全局ID。要特别当心,必须保证得到的工作项ID在矩阵C的范围内。这三个矩阵都留在全局内存中。
下面是以《基于CUDA的OpenCL开发环境搭建与入门程序示例》中的main.cpp宿主机代码为基础的补丁文件。其中测量运行时间的部分如下:首先,在clCreateCommandQueue()函数中设置CL_QUEUE_PROFILING_ENABLE标志;然后,在clEnqueueNDRangeKernel()函数中设置事件对象;最后,通过clGetEventProfilingInfo()函数获取命令入队时间和命令执行结束时间。注意:时间的单位是纳秒,在最后打印时转换为秒显示。
--- /root/Desktop/main.cpp +++ /root/Desktop/main_new.cpp @@ -143,8 +143,10 @@ } // 4. Choose the first device - commandQueue = clCreateCommandQueue(context, - devices[0], 0, NULL); + commandQueue = clCreateCommandQueue(context, + devices[0], + CL_QUEUE_PROFILING_ENABLE, + NULL); if (commandQueue == NULL) { perror("Failed to create commandQueue for device 0."); exit(1); @@ -183,14 +185,33 @@ /******** 第四部分 创建内核和内存对象 ********/ - #define ARRAY_SIZE 10 + const int Ndim = 3; + const int Mdim = 4; + const int Pdim = 5; + + int szA = Ndim * Pdim; + int szB = Pdim * Mdim; + int szC = Ndim * Mdim; cl_kernel kernel = 0; cl_mem memObjects[3] = {0, 0, 0}; - float a[ARRAY_SIZE]; - float b[ARRAY_SIZE]; - float result[ARRAY_SIZE]; + float *A; + float *B; + float *C; + + A = (float *)malloc(szA * sizeof(float)); + B = (float *)malloc(szB * sizeof(float)); + C = (float *)malloc(szC * sizeof(float)); + + int i, j; + + for (i = 0; i < szA; i++) + A[i] = i + 1; + + for (i = 0; i < szB; i++) + B[i] = i + 1; + // 8. Create the kernel kernel = clCreateKernel(program, "HelloOpenCL", NULL); @@ -200,23 +221,18 @@ } // 9. Create memory objects - for (int i = 0; i < ARRAY_SIZE; i++) { - a[i] = (float)i + 1; - b[i] = (float)i + 1; - } - memObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, - sizeof(float) * ARRAY_SIZE, - a, NULL); + sizeof(float) * szA, + A, NULL); memObjects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, - sizeof(float) * ARRAY_SIZE, - b, NULL); + sizeof(float) * szB, + B, NULL); memObjects[2] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, - sizeof(float) * ARRAY_SIZE, - result, NULL); + sizeof(float) * szC, + C, NULL); if (memObjects[0] == NULL || memObjects[1] == NULL || memObjects[2] == NULL) { perror("Error in clCreateBuffer.\n"); @@ -225,48 +241,98 @@ /******** 第五部分 执行内核 ********/ - size_t globalWorkSize[1] = { ARRAY_SIZE }; - size_t localWorkSize[1] = { 1 }; // 10. 
Set the kernel arguments - errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]); - errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]); - errNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]); + errNum = clSetKernelArg(kernel, 0, sizeof(int), &Ndim); + errNum |= clSetKernelArg(kernel, 1, sizeof(int), &Mdim); + errNum |= clSetKernelArg(kernel, 2, sizeof(int), &Pdim); + errNum |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &memObjects[0]); + errNum |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &memObjects[1]); + errNum |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &memObjects[2]); if (errNum != CL_SUCCESS) { perror("Error in clSetKernelArg.\n"); exit(1); } // 11. Queue the kernel up for execution across the array - errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, - globalWorkSize, localWorkSize, - 0, NULL, NULL); + size_t global[2]; + cl_event prof_event; + cl_ulong ev_start_time = (cl_ulong)0; + cl_ulong ev_end_time = (cl_ulong)0; + double rum_time; + + global[0] = (size_t)Ndim; + global[1] = (size_t)Mdim; + + errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, + global, NULL, 0, NULL, &prof_event); if (errNum != CL_SUCCESS) { perror("Error in clEnqueueNDRangeKernel.\n"); exit(1); } + clFinish(commandQueue); + errNum = clWaitForEvents(1, &prof_event); + if (errNum != CL_SUCCESS) { + perror("Error in clWaitForEvents.\n"); + exit(1); + } + + errNum = clGetEventProfilingInfo(prof_event, + CL_PROFILING_COMMAND_QUEUED, + sizeof(cl_ulong), + &ev_start_time, + NULL); + + errNum |= clGetEventProfilingInfo(prof_event, + CL_PROFILING_COMMAND_END, + sizeof(cl_ulong), + &ev_end_time, + NULL); + + if (errNum != CL_SUCCESS) { + perror("Error in clGetEventProfilingInfo.\n"); + while(1); + exit(1); + } + // 12. 
Read the output buffer back to the Host errNum = clEnqueueReadBuffer(commandQueue, memObjects[2], CL_TRUE, 0, - ARRAY_SIZE * sizeof(float), result, + sizeof(float) * szC, C, 0, NULL, NULL); if (errNum != CL_SUCCESS) { perror("Error in clEnqueueReadBuffer.\n"); exit(1); } + rum_time = (double)(ev_end_time - ev_start_time); + /******** 第六部分 测试结果 ********/ - printf("\nTest: a * b = c\n\n"); - - printf("Input numbers:\n"); - for (int i = 0; i < ARRAY_SIZE; i++) - printf("a[%d] = %f, b[%d] = %f\n", i, a[i], i, b[i]); - - printf("\nOutput numbers:\n"); - for (int i = 0; i < ARRAY_SIZE; i++) - printf("a[%d] * b[%d] = %f\n", i, i, result[i]); + + printf("\nArray A:\n"); + for (i = 0; i < Ndim; i++) { + for (j = 0; j < Pdim; j++) + printf("%.3f\t", A[i*Pdim + j]); + printf("\n"); + } + + printf("\nArray B:\n"); + for (i = 0; i < Pdim; i++) { + for (j = 0; j < Mdim; j++) + printf("%.3f\t", B[i*Mdim + j]); + printf("\n"); + } + + printf("\nArray C:\n"); + for (i = 0; i < Ndim; i++) { + for (j = 0; j < Mdim; j++) + printf("%.3f\t", C[i*Mdim + j]); + printf("\n"); + } + + printf("\n\nRunning Time: %f s\n", rum_time*1.0e-9); while(1);
(1). N = 3,M = 4,P = 5。
(2). N = 1000,M = 1000,P = 1000。
原文:http://blog.csdn.net/cloud_desktop/article/details/19822025