以下仅实现了第一部分的SIMD的功能。
(注意:你仅仅需要对内层的循环体进行优化。)
在优化的过程中,你可能会用到以下内蕴函数:
d) 一些有益的提示
i. SIMD版本的代码与未优化的代码的计算结果保持一致;
ii. SIMD版本的代码应该要比未优化的代码快(快多少,记录下性能加速比,并且分析为什么你认为它是正确的结果)
在你的实验报告中提供你的代码、运行结果和你的思考过程,并上传一个简短的带解说的演示视频(只给出代码截图和实验结果,但却不提供分析过程的不给分。)
实验给出的源代码common.h
:
#ifndef COMMON_H
#define COMMON_H
#include <x86intrin.h>
#define NUM_ELEMS ((1 << 15) + 10)
#define OUTER_ITERATIONS (1 << 15)
/* 不要修改这个函数 */
long long int sum(unsigned int vals[NUM_ELEMS]) {
clock_t start = clock();
long long int sum = 0;
for(unsigned int w = 0; w < OUTER_ITERATIONS; w++) {
for(unsigned int i = 0; i < NUM_ELEMS; i++) {
if(vals[i] >= 128) {
sum += vals[i];
}
}
}
clock_t end = clock();
printf("Time taken: %f s\n", (double)(end - start) / CLOCKS_PER_SEC);
return sum;
}
/* 不要修改这个函数 */
long long int sum_unrolled(unsigned int vals[NUM_ELEMS]) {
clock_t start = clock();
long long int sum = 0;
for(unsigned int w = 0; w < OUTER_ITERATIONS; w++) {
for(unsigned int i = 0; i < NUM_ELEMS / 4 * 4; i += 4) {
if(vals[i] >= 128) sum += vals[i];
if(vals[i + 1] >= 128) sum += vals[i + 1];
if(vals[i + 2] >= 128) sum += vals[i + 2];
if(vals[i + 3] >= 128) sum += vals[i + 3];
}
//This is what we call the TAIL CASE
//For when NUM_ELEMS isn‘t a multiple of 4
//NONTRIVIAL FACT: NUM_ELEMS / 4 * 4 is the largest multiple of 4 less than NUM_ELEMS
for(unsigned int i = NUM_ELEMS / 4 * 4; i < NUM_ELEMS; i++) {
if (vals[i] >= 128) {
sum += vals[i];
}
}
}
clock_t end = clock();
printf("Time taken: %f s\n", (double)(end - start) / CLOCKS_PER_SEC);
return sum;
}
long long int sum_simd(unsigned int vals[NUM_ELEMS]) {
clock_t start = clock();
//这句代码会为你生成一个含有若干个127的向量
//思考题:为什么你需要它?
__m128i _127 = _mm_set1_epi32(127);
long long int result = 0;// 将最终计算的结果保存在这里
//不要修改此行之上的任何代码!!!
for(unsigned int w = 0; w < OUTER_ITERATIONS; w++) {
/* 你的代码从这里开始 */
/* 你的代码在这里结束 */
}
clock_t end = clock();
printf("Time taken: %f s\n", (double)(end - start) / CLOCKS_PER_SEC);
return result;
}
/* 不要修改这个函数 */
int int_comparator(const void* a, const void* b) {
if(*(unsigned int*)a == *(unsigned int*)b) return 0;
else if(*(unsigned int*)a < *(unsigned int*)b) return -1;
else return 1;
}
#endif
randomized.cpp
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include "common.h"
/* ***不要修改这个文件!只能修改common.h的内容!*** */
int main(int argc, char* argv[]) {
printf("Let‘s generate a randomized array.\n");
unsigned int vals[NUM_ELEMS];
long long int reference;
long long int simd;
long long int simdu;
for(unsigned int i = 0; i < NUM_ELEMS; i++) vals[i] = rand() % 256;
printf("Starting randomized sum.\n");
printf("Sum: %lld\n", reference = sum(vals));
printf("Starting randomized unrolled sum.\n");
printf("Sum: %lld\n", sum_unrolled(vals));
printf("Starting randomized SIMD sum.\n");
printf("Sum: %lld\n", simd = sum_simd(vals));
if (simd != reference) {
printf("OH NO! SIMD sum %lld doesn‘t match reference sum %lld!\n", simd, reference);
}
}
common.h
我实现的代码,如下:
#ifndef COMMON_H
#define COMMON_H
#include <x86intrin.h>
#include <stdint.h>
#define NUM_ELEMS ((1 << 15) + 10)
#define OUTER_ITERATIONS (1 << 15)
/* 不要修改这个函数 */
long long int sum(unsigned int vals[NUM_ELEMS]) {
clock_t start = clock();
long long int sum = 0;
for(unsigned int w = 0; w < OUTER_ITERATIONS; w++) {
for(unsigned int i = 0; i < NUM_ELEMS; i++) {
if(vals[i] >= 128) {
sum += vals[i];
}
}
}
clock_t end = clock();
printf("Time taken: %f s\n", (double)(end - start) / CLOCKS_PER_SEC);
return sum;
}
/* 不要修改这个函数 */
long long int sum_unrolled(unsigned int vals[NUM_ELEMS]) {
clock_t start = clock();
long long int sum = 0;
for(unsigned int w = 0; w < OUTER_ITERATIONS; w++) {
for(unsigned int i = 0; i < NUM_ELEMS / 4 * 4; i += 4) {
if(vals[i] >= 128) sum += vals[i];
if(vals[i + 1] >= 128) sum += vals[i + 1];
if(vals[i + 2] >= 128) sum += vals[i + 2];
if(vals[i + 3] >= 128) sum += vals[i + 3];
}
//This is what we call the TAIL CASE
//For when NUM_ELEMS isn‘t a multiple of 4
//NONTRIVIAL FACT: NUM_ELEMS / 4 * 4 is the
// largest multiple of 4 less than NUM_ELEMS
for(unsigned int i = NUM_ELEMS / 4 * 4; i < NUM_ELEMS; i++) {
if (vals[i] >= 128) {
sum += vals[i];
}
}
}
clock_t end = clock();
printf("Time taken: %f s\n", (double)(end - start) / CLOCKS_PER_SEC);
return sum;
}
long long int sum_simd(unsigned int vals[NUM_ELEMS]) {
clock_t start = clock();
//这句代码会为你生成一个含有若干个127的向量
//思考题:为什么你需要它?
//用来实现比较对应的128, (>127) == (>=128)
__m128i _127 = _mm_set1_epi32(127);
//用来比较数字是否>=128
long long int result = 0;// 将最终计算的结果保存在这里
//不要修改此行之上的任何代码!!!
__m128i p = _mm_setzero_si128( );//用来保存从数组中读取的数据
for(unsigned int w = 0; w < OUTER_ITERATIONS; w++) {
/* 你的代码从这里开始 */
__m128i sum = _mm_setzero_si128( );//需要sum的值
for(unsigned int i = 0; i < NUM_ELEMS / 4 * 4; i += 4){
__m128i* h = (__m128i*)(vals+i);//地址强制类型转换
p = _mm_loadu_si128( h );//从指针中获取值,获取128位,即4Byte,恰好是vals[0-3]
__m128i flag = _mm_setzero_si128( );
flag = _mm_cmpgt_epi32( p , _127 ); //大于127时应该是oxfffff,
//一下子比较四个向量, 4Byte的每一bits都是应该是十进制的-1,而但小于时则会出现ox0000,
//所以就是0, 在flag中的四个部分中分别比较后的结果会变成四个部分保存,
//所以只存在-1和0的结果.如果都大于,(-1,-1,-1,-1).
//这样子的话,-1时是每一bis都是1, 那么我们取大于的时候,
//直接用flag和我们的p来做and与计算, 那么如果大于,直接保留不变,
//小于的话是0,与完后变成0,相当于跳过计算,不影响我们的计算结果.
__m128i hi = _mm_setzero_si128( );
hi = _mm_and_si128( p, flag );
sum = _mm_add_epi32(sum, hi);
}
//int 一般为32位, 4*32 = 128
int32_t *k = (int *)∑//将_m128i转换为4个int型的数组 (
result = result + k[0] + k[1] + k[2] + k[3];//将其全部加入result
for(unsigned int i = NUM_ELEMS / 4 * 4; i < NUM_ELEMS; i++) {
if (vals[i] >= 128) {
result += vals[i];
}
}
//result *= OUTER_ITERATIONS;
/* 你的代码在这里结束 */
}
clock_t end = clock();
printf("Time taken: %f s\n", (double)(end - start) / CLOCKS_PER_SEC);
return result;
}
/* 不要修改这个函数 */
int int_comparator(const void* a, const void* b) {
if(*(unsigned int*)a == *(unsigned int*)b) return 0;
else if(*(unsigned int*)a < *(unsigned int*)b) return -1;
else return 1;
}
#endif
原文:https://www.cnblogs.com/billyme/p/14089218.html