对一个大小为 10 万(100000)个 double 元素的数组重复赋值 1 万(10000)次,
分别用 GPU 多线程核函数(每个线程写一个元素)和 GPU 单线程核函数(一个线程循环写完整个数组)实现。
结果发现,多线程版本(1)耗时 50.0 ms,而单线程版本(2)耗时 117630.0 ms,相差两千多倍。程序输出如下:
1begin
1 main Time to generate: 50.0 ms
2begin
2 main Time to generate: 117630.0 ms
// Micro-benchmark: fill a device array of doubles with 1 a fixed number of
// times, once with a many-thread launch (kernel) and once with a single GPU
// thread (kernel2), and print the elapsed time of each variant.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define MAX_BLOCKS_PER_GRID 65535
#define MAX_BLOCK_ROWS 255
#define MAX_BLOCK_COLS 255
#define MAX_THREADS_PER_BLOCK 1024
#define MAX_THREAD_ROWS 32
#define MAX_ThREAD_COLS 32

// Abort with a readable message if a CUDA runtime call fails, so that a
// failed allocation or launch cannot silently produce a bogus timing.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Parallel fill: every thread writes 1 to the elements it owns.
//
// dev_array  - device buffer holding at least array_size doubles.
// array_size - number of elements to write.
//
// Expects a 1D grid of 1D blocks. Each thread starts at its flat global
// index and advances by the total number of launched threads, so the kernel
// never writes at or beyond array_size, and still covers the whole array
// when the host clamps the grid to MAX_BLOCKS_PER_GRID.
__global__ void kernel(double* dev_array, int array_size)
{
    // 64-bit indices so that index + stride cannot overflow for large sizes.
    long long stride = (long long)gridDim.x * blockDim.x;
    for (long long tid = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         tid < array_size;
         tid += stride)
    {
        dev_array[tid] = 1;
        //dev_array[0] = 1;
    }
}

// Serial fill: the calling thread writes 1 to every element in turn.
//
// dev_array  - device buffer holding at least array_size doubles.
// array_size - number of elements to write.
//
// main() launches this as <<<1,1>>>; with more threads, every thread would
// redundantly write the whole array.
__global__ void kernel2(double* dev_array, int array_size)
{
    for (int i = 0; i < array_size; i++)
    {
        dev_array[i] = 1;
        //dev_array[0] = 1;
    }
}

// Converts a clock() interval to milliseconds.
static float elapsedMs(clock_t start, clock_t stop)
{
    return (float)(stop - start) / (float)CLOCKS_PER_SEC * 1000.0f;
}

// Runs both benchmark variants and prints their timings.
// Returns 0 on success; exits with EXIT_FAILURE on any CUDA error.
int main()
{
    int rounds = 10000;
    clock_t start, stop;
    float elapsedTime;
    int array_size = 100000;

    /*
    double* array = new double[array_size];
    printf("0begin\n");
    start = clock();
    for(int i=0;i<rounds;i++)
    {
        for(int i=0;i<array_size;i++)
        {
            array[i] = 1;
        }
    }
    stop= clock();
    elapsedTime = (float)(stop - start) / (float)CLOCKS_PER_SEC * 1000.0f;
    printf( "0 main Time to generate: %3.1f ms\n", elapsedTime );
    delete []array;
    */

    double* dev_array = NULL;
    CUDA_CHECK(cudaMalloc(&dev_array, array_size * sizeof(double)));

    // Launch configuration: at most MAX_THREADS_PER_BLOCK threads per block,
    // enough blocks to cover the array (ceil-div), clamped to the grid limit.
    int threads = MAX_THREADS_PER_BLOCK;
    if (threads > array_size)
    {
        threads = array_size;
    }
    int blocks = (array_size + threads - 1) / threads;
    if (blocks > MAX_BLOCKS_PER_GRID)
    {
        blocks = MAX_BLOCKS_PER_GRID;
    }

    // Variant 1: many threads share the fill.
    printf("1begin\n");
    start = clock();
    for (int i = 0; i < rounds; i++)
    {
        kernel<<<blocks, threads>>>(dev_array, array_size);
    }
    // Launches are asynchronous: surface any launch error, then wait for all
    // queued kernels to finish before stopping the timer.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
    stop = clock();
    elapsedTime = elapsedMs(start, stop);
    printf( "1 main Time to generate: %3.1f ms\n", elapsedTime );

    // Variant 2: a single GPU thread fills the whole array serially.
    printf("2begin\n");
    start = clock();
    for (int i = 0; i < rounds; i++)
    {
        kernel2<<<1, 1>>>(dev_array, array_size);
    }
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
    stop = clock();
    elapsedTime = elapsedMs(start, stop);
    printf( "2 main Time to generate: %3.1f ms\n", elapsedTime );

    CUDA_CHECK(cudaFree(dev_array));
    return 0;
}
原文:http://blog.csdn.net/linger2012liu/article/details/26271921