#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "cublas_v2.h"
#define BLOCK_SIZE 16
cudaError_t multiCuda(float *c, float *a, float *b, unsigned int aH, unsigned int aW, unsigned int bH, unsigned int bW);
__global__ void multiKernel(float *c, float *a, float*b, unsigned int aW, unsigned int bW)
{
//saved in register
int xBlock = blockIdx.x;
int yBlock = blockIdx.y;
int xThread = threadIdx.x;
int yThread = threadIdx.y;
unsigned int aWidth = aW;
unsigned int bWidth = bW;
float Cvalue= 0;
for(int i=0; i< aWidth/BLOCK_SIZE; ++i)
{
__shared__ int aSub[BLOCK_SIZE][BLOCK_SIZE];
__shared__ int bSub[BLOCK_SIZE][BLOCK_SIZE];
aSub[yThread][xThread] = a[(yBlock*blockDim.y + yThread)*aWidth + i*blockDim.x + xThread];
bSub[yThread][xThread] = b[(i*blockDim.y + yThread)*bWidth + xBlock*blockDim.x + xThread];
__syncthreads();
for(int e=0; e<BLOCK_SIZE; ++e)
{
Cvalue += aSub[yThread][e]*bSub[e][xThread];
}
__syncthreads();
}
int cIndex = (yBlock*blockDim.y + yThread)*bWidth + xBlock*blockDim.x + xThread;
c[cIndex] = Cvalue;
}
__global__ void multiKernel_NoLoop(float *c, float *a, float*b, unsigned int aW, unsigned int bW)
{
int xBlock = blockIdx.x;
int yBlock = blockIdx.y;
int xThread = threadIdx.x;
int yThread = threadIdx.y;
unsigned int aWidth = aW;
unsigned int bWidth = bW;
float Cvalue= 0;
for(int i=0; i< aWidth/BLOCK_SIZE; ++i)
{
__shared__ int aSub[BLOCK_SIZE][BLOCK_SIZE];
__shared__ int bSub[BLOCK_SIZE][BLOCK_SIZE];
aSub[yThread][xThread] = a[(yBlock*blockDim.y + yThread)*aWidth + i*blockDim.x + xThread];
bSub[yThread][xThread] = b[(i*blockDim.y + yThread)*bWidth + xBlock*blockDim.x + xThread];
__syncthreads();
Cvalue += aSub[yThread][0]*bSub[0][xThread] + aSub[yThread][1]*bSub[1][xThread] + \
aSub[yThread][2]*bSub[2][xThread] + aSub[yThread][3]*bSub[3][xThread] + \
aSub[yThread][4]*bSub[4][xThread] + aSub[yThread][5]*bSub[5][xThread] + \
aSub[yThread][6]*bSub[6][xThread] + aSub[yThread][7]*bSub[7][xThread] + \
aSub[yThread][8]*bSub[8][xThread] + aSub[yThread][9]*bSub[9][xThread] + \
aSub[yThread][10]*bSub[10][xThread] + aSub[yThread][11]*bSub[11][xThread] + \
aSub[yThread][12]*bSub[12][xThread] + aSub[yThread][13]*bSub[13][xThread] + \
aSub[yThread][14]*bSub[14][xThread] + aSub[yThread][15]*bSub[15][xThread] ;
__syncthreads();
}
int cIndex = (yBlock*blockDim.y + yThread)*bWidth + xBlock*blockDim.x + xThread;
c[cIndex] = Cvalue;
}
cudaError_t multiWithcuBlase(float *c, float *a, float *b, unsigned int aH, unsigned int aW, unsigned int bH, unsigned int bW);
void multiCPU(float *c, float *a, float *b, unsigned int aH, unsigned int aW, unsigned int bH, unsigned int bW);
int main()
{
const unsigned int aH = 320;
const unsigned int aW = 320;
const unsigned int bW = 320;
const unsigned int bH = aW;
const unsigned int cH = aH;
const unsigned int cW = bW;
float *cpu_a, *cpu_b,*cpu_c;
cpu_a = (float*)malloc(aH*aW*sizeof(float));
cpu_b = (float*)malloc(bH*bW*sizeof(float));
cpu_c = (float*)malloc(cH*cW*sizeof(float));
for(int y=0; y<aH; ++y)
{
for(int x =0; x<aW; ++x)
{
int index = y*aW + x;
cpu_a[index] = (float)(x<y?x:y);
}
}
for(int y=0; y<bH; ++y)
{
for(int x =0; x<bW; ++x)
{
int index = y*bW + x;
cpu_b[index] = (float)(x<y?x:y);
}
}
cudaError_t cudaStatus = multiCuda(cpu_c, cpu_a, cpu_b, aH, aW, bH, bW);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "multiCuda failed!");
return 1;
}
/*
for(int y=0; y<cH; ++y)
{
for(int x =0; x<cW; ++x)
{
if(x==1&&y==1)
{
int index = y*cW + x;
printf("c(1,1)=%.1f\n",cpu_c[index]);
}
}
}
*/
cudaStatus = multiWithcuBlase(cpu_c, cpu_a, cpu_b, aH, aW, bH, bW);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "multiCuda failed!");
return 1;
}
/*
for(int y=0; y<cH; ++y)
{
for(int x =0; x<cW; ++x)
{
if(x==1&&y==1)
{
int index = y*cW + x;
printf("c(1,1)=%.1f\n",cpu_c[index]);
}
}
}
*/
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
float start,end;
start = clock();
multiCPU(cpu_c, cpu_a, cpu_b, aH, aW, bH, bW);
end = clock();
printf("CPU runtime is %f msec\n ",end - start);
free(cpu_a);
free(cpu_b);
free(cpu_c);
getchar();
return 0;
}
cudaError_t multiCuda(float *c, float *a, float *b, unsigned int aH, unsigned int aW, unsigned int bH, unsigned int bW)
{
float *gpu_a = 0;
float *gpu_b = 0;
float *gpu_c = 0;
cudaError_t cudaStatus;
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
size_t size_a = aH*aW*sizeof(float);
cudaStatus = cudaMalloc((void**)&gpu_a, size_a);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
size_t size_b = bH*bW*sizeof(float);
cudaStatus = cudaMalloc((void**)&gpu_b, size_b);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
size_t size_c = aH*bW*sizeof(float);
cudaStatus = cudaMalloc((void**)&gpu_c,size_c);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMemcpy(gpu_a, a, size_a, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(gpu_b, b,size_b, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
dim3 blocks(BLOCK_SIZE,BLOCK_SIZE);
dim3 grids(bW/BLOCK_SIZE,aH/BLOCK_SIZE);
cudaEvent_t start;
cudaStatus = cudaEventCreate(&start);
if (cudaStatus != cudaSuccess){
fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(cudaStatus));
goto Error;
}
cudaEvent_t stop;
cudaStatus = cudaEventCreate(&stop);
if (cudaStatus != cudaSuccess){
fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(cudaStatus));
goto Error;
}
cudaStatus = cudaEventRecord(start, NULL);
if (cudaStatus != cudaSuccess){
fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(cudaStatus));
goto Error;
}
multiKernel<<<grids,blocks>>>(gpu_c,gpu_a,gpu_b,aW,bW);
cudaStatus = cudaEventRecord(stop, NULL);
if (cudaStatus != cudaSuccess){
fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(cudaStatus));
goto Error;
}
cudaStatus = cudaEventSynchronize(stop);
if (cudaStatus != cudaSuccess){
fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(cudaStatus));
goto Error;
}
float msecTotal = 0.0f;
cudaStatus = cudaEventElapsedTime(&msecTotal, start, stop);
if (cudaStatus != cudaSuccess){
fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(cudaStatus));
goto Error;
}
printf("HS__ GPU runtime is %f msec\n",msecTotal);
/*******************************************************/
cudaEvent_t start1;
cudaStatus = cudaEventCreate(&start1);
if (cudaStatus != cudaSuccess){
fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(cudaStatus));
goto Error;
}
cudaEvent_t stop1;
cudaStatus = cudaEventCreate(&stop1);
if (cudaStatus != cudaSuccess){
fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(cudaStatus));
goto Error;
}
cudaStatus = cudaEventRecord(start1, NULL);
if (cudaStatus != cudaSuccess){
fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(cudaStatus));
goto Error;
}
multiKernel_NoLoop<<<grids,blocks>>>(gpu_c,gpu_a,gpu_b,aW,bW);
cudaStatus = cudaEventRecord(stop1, NULL);
if (cudaStatus != cudaSuccess){
fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(cudaStatus));
goto Error;
}
cudaStatus = cudaEventSynchronize(stop1);
if (cudaStatus != cudaSuccess){
fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(cudaStatus));
goto Error;
}
float msecTotal1 = 0.0f;
cudaStatus = cudaEventElapsedTime(&msecTotal1, start1, stop1);
if (cudaStatus != cudaSuccess){
fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(cudaStatus));
goto Error;
}
printf("HS__NoLoop GPU runtime is %f msec\n",msecTotal1);
/***********************************************************/
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
cudaStatus = cudaMemcpy(c, gpu_c, size_c, cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(gpu_a);
cudaFree(gpu_b);
cudaFree(gpu_c);
return cudaStatus;
}
void inline checkError(cublasStatus_t status, const char *msg)
{
if (status != CUBLAS_STATUS_SUCCESS)
{
printf("%s", msg);
exit(EXIT_FAILURE);
}
}
cudaError_t multiWithcuBlase(float *c, float *a, float *b, unsigned int aH, unsigned int aW, unsigned int bH, unsigned int bW)
{
float *gpu_a = 0;
float *gpu_b = 0;
float *gpu_c = 0;
cudaError_t cudaStatus;
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
size_t size_a = aH*aW*sizeof(float);
cudaStatus = cudaMalloc((void**)&gpu_a, size_a);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
size_t size_b = bH*bW*sizeof(float);
cudaStatus = cudaMalloc((void**)&gpu_b, size_b);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
size_t size_c = aH*bW*sizeof(float);
cudaStatus = cudaMalloc((void**)&gpu_c,size_c);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMemcpy(gpu_a, a, size_a, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(gpu_b, b,size_b, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
dim3 blocks(BLOCK_SIZE,BLOCK_SIZE);
dim3 grids(bW/BLOCK_SIZE,aH/BLOCK_SIZE);
//printf("Computing result using CUBLAS...\n");
cublasHandle_t handle;
cublasStatus_t ret;
ret = cublasCreate(&handle);
if (ret != CUBLAS_STATUS_SUCCESS)
{
printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
goto Error;
}
const float alpha = 1.0f;
const float beta = 0.0f;
cudaEvent_t start;
cudaStatus = cudaEventCreate(&start);
if (cudaStatus != cudaSuccess){
fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(cudaStatus));
goto Error;
}
cudaEvent_t stop;
cudaStatus = cudaEventCreate(&stop);
if (cudaStatus != cudaSuccess){
fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(cudaStatus));
goto Error;
}
cudaStatus = cudaEventRecord(start, NULL);
if (cudaStatus != cudaSuccess){
fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(cudaStatus));
goto Error;
}
ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, aH, bW, aW, &alpha, gpu_a, aH, gpu_b, bH, &beta, gpu_c, aH);
cudaStatus = cudaEventRecord(stop, NULL);
if (cudaStatus != cudaSuccess){
fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(cudaStatus));
goto Error;
}
cudaStatus = cudaEventSynchronize(stop);
if (cudaStatus != cudaSuccess){
fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(cudaStatus));
goto Error;
}
float msecTotal = 0.0f;
cudaStatus = cudaEventElapsedTime(&msecTotal, start, stop);
if (cudaStatus != cudaSuccess){
fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(cudaStatus));
goto Error;
}
printf("cuBlas__ GPU runtime is %f msec\n",msecTotal);
cudaStatus = cudaMemcpy(c, gpu_c, size_c, cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
checkError(cublasDestroy(handle), "cublasDestroy() error!\n");
Error:
cudaFree(gpu_a);
cudaFree(gpu_b);
cudaFree(gpu_c);
return cudaStatus;
}
void multiCPU(float *c, float *a, float *b, unsigned int aH, unsigned int aW, unsigned int bH, unsigned int bW)
{
for(int y=0; y<aH; ++y)
{
for(int x =0; x<bW; ++x)
{
int index = y*bW + x;
c[index] = 0.0f;
for(int i=0; i<aW; ++i)
{
c[index] += a[y*aW+i]*b[i*bW + x];
}
}
}
}
原文:http://www.cnblogs.com/huangshan/p/3916918.html