本章介绍了纹理内存的使用,并给出了热传导的两个例子,分别使用了一维纹理和二维纹理。
热传导(使用一维纹理)
#include <stdio.h>
#include <stdlib.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "D:\Code\CUDA\book\common\book.h"
#include "D:\Code\CUDA\book\common\cpu_anim.h"

#define DIM 1024
#define PI 3.1415926535897932f
#define MAX_TEMP 1.0f
#define MIN_TEMP 0.0001f
#define SPEED 0.25f

// Evaluates a CUDA runtime call and aborts with file/line context when it
// does not return cudaSuccess, so failures are not silently ignored.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cudaCheckErr_ = (call);                                \
        if (cudaCheckErr_ != cudaSuccess) {                                \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(cudaCheckErr_));                    \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Texture references must be declared at file scope.
// NOTE(review): texture references are a legacy API (deprecated, and removed
// in CUDA 12), so this listing needs an older toolkit to build.
texture<float> texConstSrc;
texture<float> texIn;
texture<float> texOut;

// State shared between main() and the animation callbacks.
struct DataBlock
{
    unsigned char *output_bitmap;   // device buffer handed to float_to_color
    float *dev_inSrc;               // device temperature grid (ping)
    float *dev_outSrc;              // device temperature grid (pong)
    float *dev_constSrc;            // device grid of fixed-temperature cells
    CPUAnimBitmap *bitmap;
    cudaEvent_t start, stop;
    float totalTime;                // accumulated milliseconds over all frames
    float frames;                   // number of frames rendered so far
};

// One diffusion step: each cell moves toward the average of its four
// neighbours. Expects a 2D launch covering a DIM x DIM grid.
//   dst    - grid that receives the updated temperatures
//   dstOut - true: read the previous state through texIn,
//            false: read it through texOut
__global__ void blend_kernel(float *dst, bool dstOut)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x >= DIM || y >= DIM)
        return;                     // guard threads that fall outside the grid
    // Row stride is DIM so it agrees with the neighbour offsets below.
    int offset = x + y * DIM;

    // Linear indices of the four neighbours; at an edge the missing neighbour
    // is replaced by the cell itself.
    int left = offset - 1;
    int right = offset + 1;
    int top = offset - DIM;
    int bottom = offset + DIM;
    if (x == 0)
        left++;
    if (x == DIM - 1)
        right--;
    if (y == 0)
        top += DIM;
    if (y == DIM - 1)
        bottom -= DIM;

    float t, l, c, r, b;
    if (dstOut)
    {
        t = tex1Dfetch(texIn, top);
        l = tex1Dfetch(texIn, left);
        c = tex1Dfetch(texIn, offset);
        r = tex1Dfetch(texIn, right);
        b = tex1Dfetch(texIn, bottom);
    }
    else
    {
        t = tex1Dfetch(texOut, top);
        l = tex1Dfetch(texOut, left);
        c = tex1Dfetch(texOut, offset);
        r = tex1Dfetch(texOut, right);
        b = tex1Dfetch(texOut, bottom);
    }

    dst[offset] = c + SPEED * (t + b + r + l - 4 * c);
}

// Overwrites iptr with the fixed-temperature cells: every non-zero entry of
// the constant grid (read through texConstSrc) replaces the matching cell.
// Expects a 2D launch covering a DIM x DIM grid.
__global__ void copy_const_kernel(float *iptr)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x >= DIM || y >= DIM)
        return;
    int offset = x + y * DIM;

    float c = tex1Dfetch(texConstSrc, offset);
    if (c != 0)
        iptr[offset] = c;
}

// Per-frame callback: advances the simulation by 90 steps, converts the
// result to pixels, copies it into the host bitmap and prints the running
// average frame time.
//   d     - simulation state
//   ticks - frame counter supplied by the animation loop (unused)
void anim_gpu(DataBlock *d, int ticks)
{
    (void)ticks;
    CUDA_CHECK(cudaEventRecord(d->start, 0));

    dim3 blocks(DIM / 16, DIM / 16);
    dim3 threads(16, 16);
    CPUAnimBitmap *bitmap = d->bitmap;

    // Selects the input grid of the current step: true means dev_inSrc,
    // false means dev_outSrc. (volatile is kept from the original listing.)
    volatile bool dstOut = true;
    for (int i = 0; i < 90; i++)
    {
        float *in, *out;
        if (dstOut)
        {
            in = d->dev_inSrc;
            out = d->dev_outSrc;
        }
        else
        {
            in = d->dev_outSrc;
            out = d->dev_inSrc;
        }

        copy_const_kernel<<<blocks, threads>>>(in);
        CUDA_CHECK(cudaGetLastError());
        blend_kernel<<<blocks, threads>>>(out, dstOut);
        CUDA_CHECK(cudaGetLastError());
        dstOut = !dstOut;
    }
    // The step count is even, so the last step wrote into dev_inSrc.
    // float_to_color is not defined in this listing (presumably it comes from
    // book.h -- confirm).
    float_to_color<<<blocks, threads>>>(d->output_bitmap, d->dev_inSrc);
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaMemcpy(bitmap->get_ptr(), d->output_bitmap,
                          bitmap->image_size(), cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaEventRecord(d->stop, 0));
    CUDA_CHECK(cudaEventSynchronize(d->stop));
    float elapsedTime;
    CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, d->start, d->stop));
    d->totalTime += elapsedTime;
    ++d->frames;
    printf("Average Time per frame: %3.1f ms\n", d->totalTime / d->frames);
}

// Exit callback: unbinds the textures and releases every device allocation
// and event created in main().
void anim_exit(DataBlock *d)
{
    CUDA_CHECK(cudaUnbindTexture(texIn));
    CUDA_CHECK(cudaUnbindTexture(texOut));
    CUDA_CHECK(cudaUnbindTexture(texConstSrc));
    CUDA_CHECK(cudaFree(d->dev_inSrc));
    CUDA_CHECK(cudaFree(d->dev_outSrc));
    CUDA_CHECK(cudaFree(d->dev_constSrc));
    CUDA_CHECK(cudaFree(d->output_bitmap));   // was leaked in the original listing

    CUDA_CHECK(cudaEventDestroy(d->start));
    CUDA_CHECK(cudaEventDestroy(d->stop));
}

int main(void)
{
    DataBlock data;
    CPUAnimBitmap bitmap(DIM, DIM, &data);
    data.bitmap = &bitmap;
    data.totalTime = 0;
    data.frames = 0;
    CUDA_CHECK(cudaEventCreate(&data.start));
    CUDA_CHECK(cudaEventCreate(&data.stop));

    // The bitmap's byte size is reused as the byte size of the float grids.
    // NOTE(review): this assumes image_size() == DIM * DIM * sizeof(float)
    // (i.e. 4 bytes per pixel) -- confirm against cpu_anim.h.
    int imageSize = bitmap.image_size();

    CUDA_CHECK(cudaMalloc((void**)&data.output_bitmap, imageSize));

    CUDA_CHECK(cudaMalloc((void**)&data.dev_inSrc, imageSize));
    CUDA_CHECK(cudaMalloc((void**)&data.dev_outSrc, imageSize));
    CUDA_CHECK(cudaMalloc((void**)&data.dev_constSrc, imageSize));

    // Bind each device buffer to the texture reference declared above.
    CUDA_CHECK(cudaBindTexture(NULL, texConstSrc, data.dev_constSrc, imageSize));
    CUDA_CHECK(cudaBindTexture(NULL, texIn, data.dev_inSrc, imageSize));
    CUDA_CHECK(cudaBindTexture(NULL, texOut, data.dev_outSrc, imageSize));

    float *temp = (float*)malloc(imageSize);
    if (temp == NULL)
    {
        fprintf(stderr, "host allocation of %d bytes failed\n", imageSize);
        return 1;
    }

    // Fixed-temperature cells: one square held at MAX_TEMP and one held at
    // MIN_TEMP; 0 means "not fixed".
    for (int i = 0; i < DIM * DIM; i++)
    {
        temp[i] = 0;
        int x = i % DIM;
        int y = i / DIM;
        if ((x >= 181) && (x < 281) && (y >= 462) && (y < 562))
            temp[i] = MAX_TEMP;
        if ((x >= 462) && (x < 562) && (y >= 462) && (y < 562))
            temp[i] = MIN_TEMP;
    }
    CUDA_CHECK(cudaMemcpy(data.dev_constSrc, temp, imageSize, cudaMemcpyHostToDevice));

    // Initial temperature field: 0.5 everywhere plus one square at MAX_TEMP.
    for (int i = 0; i < DIM * DIM; i++)
    {
        temp[i] = 0.5f;
        int x = i % DIM;
        int y = i / DIM;
        if ((x >= 718) && (x < 818) && (y >= 462) && (y < 562))
            temp[i] = MAX_TEMP;
    }
    CUDA_CHECK(cudaMemcpy(data.dev_inSrc, temp, imageSize, cudaMemcpyHostToDevice));

    free(temp);

    bitmap.anim_and_exit((void(*)(void*, int))anim_gpu, (void(*)(void*))anim_exit);

    getchar();
    return 0;   // the original `return;` is ill-formed in a function returning int
}
● 输出结果(左侧为恒高温,中间为恒低温,右侧为初始高温点)
● 使用一维纹理内存的过程浓缩一下就变成了以下过程
// Minimal recipe for reading a linear device buffer through a 1D texture
// reference.
texture<float> texSrc;  // texture references must be declared at file scope

float *dev_Src;
cudaMalloc((void**)&dev_Src, sizeof(float) * DIM);  // allocate the device buffer
// Bind the buffer to the texture. The last argument is the size in bytes of
// the bound region; the original passed NULL (i.e. 0) here.
cudaBindTexture(NULL, texSrc, dev_Src, sizeof(float) * DIM);

float *temp = (float *)malloc(sizeof(float) * DIM);  // host staging buffer
// Initialize data in temp

cudaMemcpy(dev_Src, temp, sizeof(float) * DIM, cudaMemcpyHostToDevice);
free(temp);  // release the staging buffer only after the copy has read it

// Do something

cudaUnbindTexture(texSrc);  // unbind the texture, then free the device buffer
cudaFree(dev_Src);
● 访问纹理内存不用中括号下标,而是使用 tex1Dfetch() 函数
// Inside a kernel: flatten the 2D thread coordinates into a linear index and
// read the element through the texture with tex1Dfetch instead of [].
const int col = blockIdx.x * blockDim.x + threadIdx.x;
const int row = blockIdx.y * blockDim.y + threadIdx.y;
const int idx = row * (blockDim.x * gridDim.x) + col;
float c = tex1Dfetch(texSrc, idx);
热传导(使用二维纹理)
#include <stdio.h>
#include <stdlib.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "D:\Code\CUDA\book\common\book.h"
#include "D:\Code\CUDA\book\common\cpu_anim.h"

#define DIM 1024
#define PI 3.1415926535897932f
#define MAX_TEMP 1.0f
#define MIN_TEMP 0.0001f
#define SPEED 0.25f

// Evaluates a CUDA runtime call and aborts with file/line context when it
// does not return cudaSuccess, so failures are not silently ignored.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cudaCheckErr_ = (call);                                \
        if (cudaCheckErr_ != cudaSuccess) {                                \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(cudaCheckErr_));                    \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// 2D texture references, declared at file scope.
// NOTE(review): texture references are a legacy API (deprecated, and removed
// in CUDA 12), so this listing needs an older toolkit to build.
texture<float, 2> texConstSrc;
texture<float, 2> texIn;
texture<float, 2> texOut;

// State shared between main() and the animation callbacks.
struct DataBlock
{
    unsigned char *output_bitmap;   // device buffer handed to float_to_color
    float *dev_inSrc;               // device temperature grid (ping)
    float *dev_outSrc;              // device temperature grid (pong)
    float *dev_constSrc;            // device grid of fixed-temperature cells
    CPUAnimBitmap *bitmap;
    cudaEvent_t start, stop;
    float totalTime;                // accumulated milliseconds over all frames
    float frames;                   // number of frames rendered so far
};

// One diffusion step: each cell moves toward the average of its four
// neighbours. Expects a 2D launch covering a DIM x DIM grid.
//   dst    - grid that receives the updated temperatures
//   dstOut - true: read the previous state through texIn,
//            false: read it through texOut
__global__ void blend_kernel(float *dst, bool dstOut)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x >= DIM || y >= DIM)
        return;                     // guard threads that fall outside the grid
    int offset = x + y * DIM;       // dst is a dense DIM x DIM array

    // No manual edge handling: coordinates one step outside the grid are left
    // to the texture unit's addressing mode (clamp-to-edge unless changed).
    float t, l, c, r, b;
    if (dstOut)
    {
        t = tex2D(texIn, x, y - 1);
        l = tex2D(texIn, x - 1, y);
        c = tex2D(texIn, x, y);
        r = tex2D(texIn, x + 1, y);
        b = tex2D(texIn, x, y + 1);
    }
    else
    {
        t = tex2D(texOut, x, y - 1);
        l = tex2D(texOut, x - 1, y);
        c = tex2D(texOut, x, y);
        r = tex2D(texOut, x + 1, y);
        b = tex2D(texOut, x, y + 1);
    }
    dst[offset] = c + SPEED * (t + b + r + l - 4 * c);
}

// Overwrites iptr with the fixed-temperature cells: every non-zero entry of
// the constant grid (read through texConstSrc) replaces the matching cell.
// Expects a 2D launch covering a DIM x DIM grid.
__global__ void copy_const_kernel(float *iptr)
{
    // map from threadIdx/blockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x >= DIM || y >= DIM)
        return;
    int offset = x + y * DIM;

    float c = tex2D(texConstSrc, x, y);
    if (c != 0)
        iptr[offset] = c;
}

// Per-frame callback: advances the simulation by 90 steps, converts the
// result to pixels, copies it into the host bitmap and prints the running
// average frame time.
//   d     - simulation state
//   ticks - frame counter supplied by the animation loop (unused)
void anim_gpu(DataBlock *d, int ticks)
{
    (void)ticks;
    CUDA_CHECK(cudaEventRecord(d->start, 0));

    dim3 blocks(DIM / 16, DIM / 16);
    dim3 threads(16, 16);
    CPUAnimBitmap *bitmap = d->bitmap;

    // Selects the input grid of the current step: true means dev_inSrc,
    // false means dev_outSrc. (volatile is kept from the original listing.)
    volatile bool dstOut = true;
    for (int i = 0; i < 90; i++)
    {
        float *in, *out;
        if (dstOut)
        {
            in = d->dev_inSrc;
            out = d->dev_outSrc;
        }
        else
        {
            out = d->dev_inSrc;
            in = d->dev_outSrc;
        }
        copy_const_kernel<<<blocks, threads>>>(in);
        CUDA_CHECK(cudaGetLastError());
        blend_kernel<<<blocks, threads>>>(out, dstOut);
        CUDA_CHECK(cudaGetLastError());
        dstOut = !dstOut;
    }
    // The step count is even, so the last step wrote into dev_inSrc.
    // float_to_color is not defined in this listing (presumably it comes from
    // book.h -- confirm).
    float_to_color<<<blocks, threads>>>(d->output_bitmap, d->dev_inSrc);
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaMemcpy(bitmap->get_ptr(), d->output_bitmap,
                          bitmap->image_size(), cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaEventRecord(d->stop, 0));
    CUDA_CHECK(cudaEventSynchronize(d->stop));

    float elapsedTime;
    CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, d->start, d->stop));
    d->totalTime += elapsedTime;
    ++d->frames;
    printf("Average Time per frame: %3.1f ms\n", d->totalTime / d->frames);
}

// Exit callback: unbinds the textures and releases every device allocation
// and event created in main().
void anim_exit(DataBlock *d)
{
    CUDA_CHECK(cudaUnbindTexture(texIn));
    CUDA_CHECK(cudaUnbindTexture(texOut));
    CUDA_CHECK(cudaUnbindTexture(texConstSrc));
    CUDA_CHECK(cudaFree(d->dev_inSrc));
    CUDA_CHECK(cudaFree(d->dev_outSrc));
    CUDA_CHECK(cudaFree(d->dev_constSrc));
    CUDA_CHECK(cudaFree(d->output_bitmap));   // was leaked in the original listing

    CUDA_CHECK(cudaEventDestroy(d->start));
    CUDA_CHECK(cudaEventDestroy(d->stop));
}

int main(void)
{
    DataBlock data;
    CPUAnimBitmap bitmap(DIM, DIM, &data);
    data.bitmap = &bitmap;
    data.totalTime = 0;
    data.frames = 0;
    CUDA_CHECK(cudaEventCreate(&data.start));
    CUDA_CHECK(cudaEventCreate(&data.stop));

    // The bitmap's byte size is reused as the byte size of the float grids.
    // NOTE(review): this assumes image_size() == DIM * DIM * sizeof(float)
    // (i.e. 4 bytes per pixel) -- confirm against cpu_anim.h.
    int imageSize = bitmap.image_size();

    CUDA_CHECK(cudaMalloc((void**)&data.output_bitmap, imageSize));

    CUDA_CHECK(cudaMalloc((void**)&data.dev_inSrc, imageSize));
    CUDA_CHECK(cudaMalloc((void**)&data.dev_outSrc, imageSize));
    CUDA_CHECK(cudaMalloc((void**)&data.dev_constSrc, imageSize));

    // Bind each buffer as a DIM x DIM 2D texture; the final argument is the
    // pitch, i.e. the size in bytes of one row.
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
    CUDA_CHECK(cudaBindTexture2D(NULL, texConstSrc, data.dev_constSrc, desc,
                                 DIM, DIM, sizeof(float) * DIM));
    CUDA_CHECK(cudaBindTexture2D(NULL, texIn, data.dev_inSrc, desc,
                                 DIM, DIM, sizeof(float) * DIM));
    CUDA_CHECK(cudaBindTexture2D(NULL, texOut, data.dev_outSrc, desc,
                                 DIM, DIM, sizeof(float) * DIM));

    float *temp = (float*)malloc(imageSize);
    if (temp == NULL)
    {
        fprintf(stderr, "host allocation of %d bytes failed\n", imageSize);
        return 1;
    }

    // Fixed-temperature cells: one square held at MAX_TEMP and one held at
    // MIN_TEMP; 0 means "not fixed".
    for (int i = 0; i < DIM * DIM; i++)
    {
        temp[i] = 0;
        int x = i % DIM;
        int y = i / DIM;
        if ((x >= 181) && (x < 281) && (y >= 462) && (y < 562))
            temp[i] = MAX_TEMP;
        if ((x >= 462) && (x < 562) && (y >= 462) && (y < 562))
            temp[i] = MIN_TEMP;
    }
    CUDA_CHECK(cudaMemcpy(data.dev_constSrc, temp, imageSize, cudaMemcpyHostToDevice));

    // Initial temperature field: 0.5 everywhere plus one square at MAX_TEMP.
    for (int i = 0; i < DIM * DIM; i++)
    {
        temp[i] = 0.5f;
        int x = i % DIM;
        int y = i / DIM;
        if ((x >= 718) && (x < 818) && (y >= 462) && (y < 562))
            temp[i] = MAX_TEMP;
    }
    CUDA_CHECK(cudaMemcpy(data.dev_inSrc, temp, imageSize, cudaMemcpyHostToDevice));
    free(temp);

    bitmap.anim_and_exit((void(*)(void*, int))anim_gpu, (void(*)(void*))anim_exit);

    getchar();
    return 0;
}
● 输出结果与一维纹理的情况相同,速度上没有明显差别
● 使用二维纹理内存的过程浓缩一下就变成了以下过程
// Minimal recipe for reading a DIM x DIM device buffer through a 2D texture
// reference.
texture<float, 2> texSrc;  // texture references must be declared at file scope

float *dev_Src;
// Allocate DIM x DIM floats. The size is in bytes, so it needs the
// sizeof(float) factor the original omitted.
cudaMalloc((void**)&dev_Src, sizeof(float) * DIM * DIM);
cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
// The final argument is the pitch: the size in bytes of ONE row, not of the
// whole image.
cudaBindTexture2D(NULL, texSrc, dev_Src, desc, DIM, DIM, sizeof(float) * DIM);

float *temp = (float*)malloc(sizeof(float) * DIM * DIM);  // host staging buffer
// Initialize data in temp

cudaMemcpy(dev_Src, temp, sizeof(float) * DIM * DIM, cudaMemcpyHostToDevice);
free(temp);  // release the staging buffer only after the copy has read it

// Do something

cudaUnbindTexture(texSrc);  // unbind the texture, then free the device buffer
cudaFree(dev_Src);
● 访问纹理内存不用中括号下标,而是使用 tex2D() 函数
// Inside a kernel: use the 2D thread coordinates directly as texture
// coordinates and read the element with tex2D instead of [].
const int col = blockIdx.x * blockDim.x + threadIdx.x;
const int row = blockIdx.y * blockDim.y + threadIdx.y;
float c = tex2D(texSrc, col, row);
原文:http://www.cnblogs.com/cuancuancuanhao/p/7641428.html