#include "stdafx.h"
#include <stdlib.h>
#include <windows.h>
#include <xmmintrin.h>
/*
 * Element-wise addition dest[i] = srcA[i] + srcB[i] for i in [0, n),
 * processing four floats per iteration with SSE.
 *
 * srcA, srcB, dest: float buffers of at least n elements. All three must be
 *                   16-byte aligned, because the aligned load/store
 *                   intrinsics are used.
 * n:                number of elements; values <= 0 are a no-op.
 *
 * Elements beyond the last full group of four are handled by a scalar loop,
 * so n does not have to be a multiple of 4.
 */
void sse_add(float *srcA, float *srcB, float *dest, int n)
{
    if (n <= 0)
    {
        return;
    }

    /* Largest multiple of 4 that is <= n: the part handled 4-wide. */
    const int vec_end = n & ~3;
    int i = 0;

    for (; i < vec_end; i += 4)
    {
        const __m128 a = _mm_load_ps(srcA + i);
        const __m128 b = _mm_load_ps(srcB + i);
        _mm_store_ps(dest + i, _mm_add_ps(a, b));
    }

    /* Scalar tail: the remaining n % 4 elements. */
    for (; i < n; i++)
    {
        dest[i] = srcA[i] + srcB[i];
    }
}
/*
 * Scalar reference implementation: writes srcA[i] + srcB[i] into dest[i]
 * for every i in [0, n), one element at a time, in increasing index order.
 * Does nothing when n <= 0.
 */
void normal_add(float *srcA, float *srcB, float *dest, int n)
{
    const float *lhs = srcA;
    const float *rhs = srcB;
    float *out = dest;
    int remaining = n;

    while (remaining > 0)
    {
        *out = *lhs + *rhs;
        ++out;
        ++lhs;
        ++rhs;
        --remaining;
    }
}
/*
 * Benchmark driver: runs `count` passes of normal_add and then `count`
 * passes of sse_add over the same `size`-element float buffers, printing the
 * elapsed wall-clock seconds for each.
 *
 * Returns 0 on success, 1 if any buffer could not be allocated.
 */
int main(int argc, _TCHAR* argv[])
{
    DWORD timeStart = 0, timeEnd = 0;
    const int size = 10000, count = 10000;

    (void)argc;
    (void)argv;

    // Allocate 16-byte aligned memory (sse_add is given these buffers and
    // accesses them 16 bytes at a time).
    float *srcA = (float*)_mm_malloc(sizeof(float) * size, 16);
    float *srcB = (float*)_mm_malloc(sizeof(float) * size, 16);
    float *dest = (float*)_mm_malloc(sizeof(float) * size, 16);
    if (srcA == NULL || srcB == NULL || dest == NULL)
    {
        fprintf(stderr, "failed to allocate %d-element aligned buffers\n", size);
        _mm_free(srcA);
        _mm_free(srcB);
        _mm_free(dest);
        return 1;
    }

    // Initialize: both inputs hold 0, 1, 2, ... as floats.
    for (int i = 0; i < size; i++)
    {
        srcA[i] = (float)i;
        srcB[i] = (float)i;
    }

    // Scalar addition
    timeStart = GetTickCount();
    for (int i = 0; i < count; i++)
    {
        normal_add(srcA, srcB, dest, size);
    }
    timeEnd = GetTickCount();
    // GetTickCount reports milliseconds; scale to seconds for display.
    printf("normal test...time ---> %f \n", (timeEnd - timeStart) * 0.001);

    // SSE addition
    timeStart = GetTickCount();
    for (int i = 0; i < count; i++)
    {
        sse_add(srcA, srcB, dest, size);
    }
    timeEnd = GetTickCount();
    printf("sse test...time ---> %f \n", (timeEnd - timeStart) * 0.001);

    // Release memory
    _mm_free(srcA);
    _mm_free(srcB);
    _mm_free(dest);
    system("pause");
    return 0;
}
// Note from the original article: the program above was compiled with
// VS 2005 in Release mode; comparing the two printed times, the article
// reports that addition using SSE instructions is clearly more efficient.
// Source: http://blog.csdn.net/grafx/article/details/20001589