CUDA纹理内存的数据实际上仍存放在显存(全局内存)中,但读取时会经过专用的纹理缓存,该缓存针对具有二维空间局部性的访问模式做了优化;因此在处理图像数据这类相邻线程访问相邻像素的场景下,通过纹理读取通常比直接读取全局内存更快,使用纹理内存是一个提升性能的好方法。
贴一段自己写的简单的实现两幅图像加权和的代码,使用纹理内存实现。
输入:两幅图 lena, moon
输出:两幅图像加权和
#include <opencv2/opencv.hpp>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <string>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

using namespace std;
using namespace cv;

// Abort with a readable message if a CUDA runtime call fails. Without this,
// an early failure (e.g. allocation) silently poisons every later call.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(err_));                               \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

// Texture references for the two BGRA input images. Reads return float4 with
// each 8-bit channel mapped to [0, 1] (cudaReadModeNormalizedFloat).
// NOTE(review): the texture-reference API is deprecated in recent CUDA
// toolkits; porting to texture objects (cudaTextureObject_t) is the long-term
// fix but is outside the scope of this change.
texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> refTex1;
texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> refTex2;
// CUDA arrays that back the two textures.
cudaArray* cuArray1;
cudaArray* cuArray2;
// Channel format of one texel: four unsigned 8-bit components.
cudaChannelFormatDesc cuDesc = cudaCreateChannelDesc<uchar4>();

// Blend two normalized [0, 1] channel values and convert the result to an
// 8-bit value with round-to-nearest and clamping (plain truncation can turn
// v/255*255 into v-1 because of float rounding).
__device__ __forceinline__ uchar blendToByte(float a, float b, float alpha)
{
    const float v = (alpha * a + (1.0f - alpha) * b) * 255.0f + 0.5f;
    return (uchar)fminf(fmaxf(v, 0.0f), 255.0f);
}

// Writes the 50/50 weighted sum of the images bound to refTex1 and refTex2.
//
// Launch layout: 2D grid of 2D blocks, one thread per output pixel; threads
// outside imgWidth x imgHeight do nothing. No shared memory is used.
//
// pDstImgData : device buffer of imgHeight * imgWidth * channels bytes,
//               tightly packed, row-major.
// channels    : bytes per output pixel; must be >= 4 because four components
//               (B, G, R, A) are written per pixel.
__global__ void weightAddKerkel(uchar *pDstImgData, int imgHeight, int imgWidth, int channels)
{
    const int tidx = blockDim.x * blockIdx.x + threadIdx.x;
    const int tidy = blockDim.y * blockIdx.y + threadIdx.y;

    if (tidx < imgWidth && tidy < imgHeight)
    {
        // With unnormalized coordinates, texel centers sit at (i + 0.5).
        // Sampling at the bare integer coordinate with linear filtering
        // would average the 2x2 neighbourhood and blur/shift the image.
        const float u = tidx + 0.5f;
        const float v = tidy + 0.5f;
        const float4 lenaBGR = tex2D(refTex1, u, v);
        const float4 moonBGR = tex2D(refTex2, u, v);

        const size_t idx = ((size_t)tidy * imgWidth + tidx) * channels;
        const float alpha = 0.5f;
        pDstImgData[idx + 0] = blendToByte(lenaBGR.x, moonBGR.x, alpha);
        pDstImgData[idx + 1] = blendToByte(lenaBGR.y, moonBGR.y, alpha);
        pDstImgData[idx + 2] = blendToByte(lenaBGR.z, moonBGR.z, alpha);
        pDstImgData[idx + 3] = 0;
    }
}

// Loads data/lena.jpg and data/moon.jpg, blends them on the GPU through
// texture fetches, and shows the result in a window.
// Returns EXIT_SUCCESS, or EXIT_FAILURE if an image cannot be loaded or the
// two images differ in size.
int main()
{
    Mat Lena = imread("data/lena.jpg");
    Mat moon = imread("data/moon.jpg");
    if (Lena.empty() || moon.empty())
    {
        cerr << "Failed to load data/lena.jpg or data/moon.jpg" << endl;
        return EXIT_FAILURE;
    }
    // Both textures are allocated with Lena's dimensions, so a smaller
    // second image would be read out of bounds during the upload.
    if (Lena.size() != moon.size())
    {
        cerr << "Input images must have the same size" << endl;
        return EXIT_FAILURE;
    }

    // Expand to 4 channels so each pixel matches the uchar4 texel format.
    cvtColor(Lena, Lena, CV_BGR2BGRA);
    cvtColor(moon, moon, CV_BGR2BGRA);
    const int imgWidth = Lena.cols;
    const int imgHeight = Lena.rows;
    const int channels = Lena.channels();
    const size_t rowBytes = (size_t)imgWidth * channels * sizeof(uchar);
    const size_t imgBytes = rowBytes * imgHeight;

    // Texture sampling configuration: clamp at the borders, pixel
    // (unnormalized) coordinates, bilinear filtering.
    refTex1.addressMode[0] = cudaAddressModeClamp;
    refTex1.addressMode[1] = cudaAddressModeClamp;
    refTex1.normalized = false;
    refTex1.filterMode = cudaFilterModeLinear;

    refTex2.addressMode[0] = cudaAddressModeClamp;
    refTex2.addressMode[1] = cudaAddressModeClamp;
    refTex2.normalized = false;
    refTex2.filterMode = cudaFilterModeLinear;

    // Allocate the backing arrays and upload the images. The 2D copy takes
    // the host row pitch from Mat::step, so padded rows are handled too.
    CUDA_CHECK(cudaMallocArray(&cuArray1, &cuDesc, imgWidth, imgHeight));
    CUDA_CHECK(cudaMallocArray(&cuArray2, &cuDesc, imgWidth, imgHeight));
    CUDA_CHECK(cudaMemcpy2DToArray(cuArray1, 0, 0, Lena.data, Lena.step,
                                   rowBytes, imgHeight, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy2DToArray(cuArray2, 0, 0, moon.data, moon.step,
                                   rowBytes, imgHeight, cudaMemcpyHostToDevice));

    // Bind the arrays to the texture references read by the kernel.
    CUDA_CHECK(cudaBindTextureToArray(refTex1, cuArray1));
    CUDA_CHECK(cudaBindTextureToArray(refTex2, cuArray2));

    // Output image on the host and its device-side buffer.
    Mat dstImg = Mat::zeros(imgHeight, imgWidth, CV_8UC4);
    uchar *pDstImgData = NULL;
    CUDA_CHECK(cudaMalloc(&pDstImgData, imgBytes));

    // One thread per pixel; ceil-divide so partial tiles are covered.
    dim3 block(8, 8);
    dim3 grid((imgWidth + block.x - 1) / block.x, (imgHeight + block.y - 1) / block.y);
    weightAddKerkel<<<grid, block, 0>>>(pDstImgData, imgHeight, imgWidth, channels);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

    // Copy the result back to the host.
    CUDA_CHECK(cudaMemcpy(dstImg.data, pDstImgData, imgBytes, cudaMemcpyDeviceToHost));

    // Release GPU resources before blocking on the UI.
    CUDA_CHECK(cudaUnbindTexture(refTex1));
    CUDA_CHECK(cudaUnbindTexture(refTex2));
    CUDA_CHECK(cudaFreeArray(cuArray1));
    CUDA_CHECK(cudaFreeArray(cuArray2));
    CUDA_CHECK(cudaFree(pDstImgData));

    // Display the blended image until a key is pressed.
    namedWindow("show");
    imshow("show", dstImg);
    waitKey(0);
    return EXIT_SUCCESS;
}
原文:http://www.cnblogs.com/riddick/p/7892663.html