Arm Neon Usages
Details of each Neon intrinsic can be looked up at https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics?search=
1. Load from a data pointer into a Neon register / store from a Neon register to a data pointer.
input_s16x8 = vld1q_s16(pIn);
vst1_s8(pOut, output_s8x8);
2. find sign/abs/max
sign_s16x8 = vshrq_n_s16(input_s16x8, 16);
abs_input_s16x8 = vqabsq_s16(input_s16x8); // saturating: |INT16_MIN| = |-32768| saturates to 32767 (for int8: |-128| -> 127)
abs_input_s16x8 = vabsq_s16(input_s16x8); // without saturation (|-32768| wraps back to -32768)
c_s16x8 = vmaxq_s16(a_s16x8, b_s16x8); // find max{a[i], b[i]}
c_s16x8 = vpmaxq_s16(a_s16x8, b_s16x8); // pairwise max: low half = max{a[2i], a[2i+1]}, high half = max{b[2i], b[2i+1]}
3. count leading zero bits
clz_s16x8 = vclzq_s16(abs_max_s16x8);
4. split/combine vectors
abs_temp_s16x4_h = vget_high_s16(abs_temp_s16x8);
abs_temp_s16x4_l = vget_low_s16(abs_temp_s16x8);
abs_max_s16x8 = vcombine_s16(abs_temp_s16x4, abs_temp_s16x4);
5. converting vector (s16x8 <-> s8x8)
idx_s8x8 = vmovn_s16(idx_s16x8); //narrow move
bias_s16x8 = vmovl_s8(bias_s8x8); // long move
6. reinterpretation (int<->uint)
idx_s16x8 = vreinterpretq_s16_u16(idx_u16x8);
7. Lookup table (frequently used for data shuffling)
int8_t lut[8]= {1, 2, 3, 4, 5, 6, 7, 0};
data_shuffle_idx_s8x8 = vld1_s8(lut);
// rotate left by one element: the first element goes to the tail, the others move forward one slot
data_shuffle_s8x8 = vtbl1_s8(data_s8x8, data_shuffle_idx_s8x8);
8. comparison
if (a > 4)
b = 49;
else if (a>1)
b = 90;
else
b = 340;
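greater_than_4_u8x8 = vcgt_s8(a_s8x8, 4_s8x8); // each lane is 0xFF where a > 4, otherwise 0x00
greater_than_1_u8x8 = vcgt_s8(a_s8x8, 1_s8x8); // each lane is 0xFF where a > 1, otherwise 0x00
idx_u8x8 = vadd_u8(greater_than_4_u8x8, greater_than_1_u8x8); // NOTE: the sum is 0x00 / 0xFF / 0xFE, not 0 / 1 / 2; negate it (or AND each mask with 1 first) to get a usable table index
int8_t lut[8]= {49, 90, 340, 0, 0, 0, 0, 0}; // NOTE: 340 does not fit in int8_t, and the entries must be ordered by index (the number of true comparisons)
lut_s8x8 = vld1_s8(lut);
b_s8x8 = vtbl1_s8(lut_s8x8, idx_u8x8); // out-of-range indexes (>= 8) yield 0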
9. bit selection
result_s16x8 = vbslq_s16(dest_s16x8, src1_s16x8, src2_s16x8);
0050(dest): 0000 0000 0101 0000 (dest acts as the mask: a 0 bit selects src2, a 1 bit selects src1)
ffaa(src1): 1111 1111 1010 1010
0x7(src2): 0000 0000 0000 0111
result: 0000 0000 0000 0111
10. shift and insert (has a similar effect to bit selection)
dbg_s16x8 = vsriq_n_s16(input_s16x8[0], input_s16x8[1], 5);
input_s16x8[0]: 0050 ffffff81 ffffff86 0022 ffffffc5 008c 0078 0047
input_s16x8[1]: ffffffaa 001a ffffff98 0068 0091 0067 ffffff75 ffffff9c
dbg_s16x8: 07fd fffff800 fffffffc 0003 fffff804 0003 07fb 07fc
input0: 0000 0000 0101 0000 (0x0050)
input1: 1111 1111 1010 1010 (0xffaa)
input1 after right shift 5: 0000 0111 1111 1101 (unsigned shift)
output after combine input0: 0000 0111 1111 1101(0x07fd)
input0: 1111 1111 1100 0101 (0xffc5)
input1: 0000 0000 1001 0001 (0x0091)
input1 after right shift 5: 0000 0000 0000 0100
output after combine input0: 1111 1000 0000 0100(0xf804) takes the top 5 bits from input0 and the remaining bits from the shifted input1
dbg_s16x8 = vsliq_n_s16(input_s16x8[0], input_s16x8[1], 5);
input_s16x8[0]: 0050 ffffff81 ffffff86 0022 ffffffc5 008c 0078 0047
input_s16x8[1]: ffffffaa 001a ffffff98 0068 0091 0067 ffffff75 ffffff9c
dbg_s16x8: fffff550 0341 fffff306 0d02 1225 0cec ffffeeb8 fffff387
input0: 0000 0000 0101 0000 (0x50)
input1: 1111 1111 1010 1010 (0xffaa)
input1 after left shift 5: 1111 0101 0100 0000
output: 1111 0101 0101 0000(0xf550)
11. element transposition/interleave
C_s8x8x2[0] = vzip_s8(B0_s8x8[0], B1_s8x8[0]); // interleave elements
C_s8x8x2[0] = vtrn_s8(B0_s8x8[0], B1_s8x8[0]); // transpose elements
C_s8x8x2[0] = vuzp_s8(B0_s8x8[0], B1_s8x8[0]); // de-interleave elements
B0_s8x8[0]: ffffff80 ffffffa6 00 ffffffe0 ffffffe0 0f 30 fffffffb
B1_s8x8[0]: 68 fffffffa ffffff98 ffffffff fffffffe 00 ffffffb2 00
C0_s8x8[0].val[0] after zip: ffffff80 68 ffffffa6 fffffffa 00 ffffff98 ffffffe0 ffffffff
C0_s8x8[0].val[1] after zip: ffffffe0 fffffffe 0f 00 30 ffffffb2 fffffffb 00
C0_s8x8[0].val[0] after transposition: ffffff80 68 00 ffffff98 ffffffe0 fffffffe 30 ffffffb2
C0_s8x8[0].val[1] after transposition: ffffffa6 fffffffa ffffffe0 ffffffff 0f 00 fffffffb 00
// uzp is the inverse of the zip interleave: if the following output is used as the input to zip(), the original data is recovered.
C0_s8x8[0].val[0] after uzp: ffffff80 00 ffffffe0 30 68 ffffff98 fffffffe ffffffb2
C0_s8x8[0].val[1] after uzp: ffffffa6 ffffffe0 0f fffffffb fffffffa ffffffff 00 00
Examples:
We want to take the low 9 bits of each element in input_s16x8 and pack these eight 9-bit values into 9 bytes.
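input_s16x8 (8 elements; field widths in bits, each element is 7 unused high bits followed by the 9 low bits to keep):
| 7 | 9 | 7 | 9 | 7 | 9 | 7 | 9 | 7 | 9 | 7 | 9 | 7 | 9 | 7 | 9 |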
Extract the 9-bit value from each of the 8 elements:
| 9bits | 9bits | 9bits | 9bits | 9bits | 9bits | 9bits | 9bits |
Pack the eight 9-bit values into 9 bytes:
| byte | byte | byte | byte | byte | byte | byte | byte | byte |
uint16_t bitSel_head[8] = {0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff};
uint16_t bitSel_tail[8] = {0x1fe, 0x1fc, 0x1f8, 0x1f0, 0x1e0, 0x1c0, 0x180, 0x100};
int16_t bitshift_tail[8] = {-1, -2, -3, -4, -5, -6, -7, -8};
int16_t bitshift_head[8] = {7, 6, 5, 4, 3, 2, 1, 0};
int8_t data_shuffle[8] = {1, 2, 3, 4, 5, 6, 7, 0};
uint8_t bitSel_combine[8] = {0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff};
bitSel_head_u16x8 = vld1q_u16(bitSel_head);
bitSel_tail_u16x8 = vld1q_u16(bitSel_tail);
bitShift_tail_s16x8 = vld1q_s16(bitshift_tail);
bitShift_head_s16x8 = vld1q_s16(bitshift_head);
data_shuffle_s8x8 = vld1_s8(data_shuffle);
bitSel_combine_u8x8 = vld1_u8(bitSel_combine);
head_s16x8[0] = vbslq_s16(bitSel_head_u16x8, input_s16x8[0], zero_s16x8);
tail_s16x8[0] = vbslq_s16(bitSel_tail_u16x8, input_s16x8[0], zero_s16x8);
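head_s16x8 (field widths in bits per element, MSB first: unused | tail part | head part; head keeps the last field):
element 0: | 7 | 8 | 1 |
element 1: | 7 | 7 | 2 |
element 2: | 7 | 6 | 3 |
element 3: | 7 | 5 | 4 |
element 4: | 7 | 4 | 5 |
element 5: | 7 | 3 | 6 |
element 6: | 7 | 2 | 7 |
element 7: | 7 | 1 | 8 |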
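tail_s16x8 (field widths in bits per element, MSB first: unused | tail part | head part; tail keeps the middle field):
element 0: | 7 | 8 | 1 |
element 1: | 7 | 7 | 2 |
element 2: | 7 | 6 | 3 |
element 3: | 7 | 5 | 4 |
element 4: | 7 | 4 | 5 |
element 5: | 7 | 3 | 6 |
element 6: | 7 | 2 | 7 |
element 7: | 7 | 1 | 8 |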
head_s16x8[0] = vshlq_s16(head_s16x8[0], bitShift_head_s16x8);
tail_s16x8[0] = vshlq_s16(tail_s16x8[0], bitShift_tail_s16x8);
head_s8x8[0] = vmovn_s16(head_s16x8[0]);
tail_s8x8[0] = vmovn_s16(tail_s16x8[0]);
vst1_lane_s8(pOut+1, tail_s8x8[0], 0);
tail_shuffle_s8x8[0] = vtbl1_s8(tail_s8x8[0], data_shuffle_s8x8);
output_s8x8[0] = vbsl_s8(bitSel_combine_u8x8, head_s8x8[0], tail_shuffle_s8x8[0]);
vst1_s8(pOut+2, output_s8x8[0]);
Original article: https://www.cnblogs.com/biggerjun2015/p/11754435.html