Convolution RGB with SIMD
Convolution RGB with SIMD
Prerequisites
1
SIMD
1. Naive Code
All measurements use a 5119 × 5119 image.
Processing time: 305ms
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
// Naive 5x5 box filter: each interior pixel is replaced by the mean of the 25 pixels
// in the window centred on it. The first two and last two rows/columns are skipped,
// so the +/-2 window offsets never index outside rows x cols.
// NOTE(review): the "+ 1" / "+ 2" byte offsets presume three one-byte channels per
// pixel; the R/G/B suffixes are only names here (this code does not fix the channel order).
uint8_t* pU8SrcBuf = (uint8_t*)imgSrc.data;
uint8_t* pU8DstBuf = (uint8_t*)imgDst.data;
// NOTE(review): never read in this listing; the name looks like a typo of "StepRows".
int32_t i32SteopRows = imgSrc.step;
int32_t i32PixelBytes = imgSrc.elemSize();
for(int i32Row = 2; i32Row < imgSrc.rows - 2; ++i32Row)
{
// Per-channel cursors positioned on column 2 of the current row
// (row offset = row index * bytes per row, column offset = 2 pixels).
uint8_t* pU8SrcBufRowR = pU8SrcBuf + i32Row * imgSrc.step + 2 * i32PixelBytes;
uint8_t* pU8SrcBufRowG = pU8SrcBuf + i32Row * imgSrc.step + 2 * i32PixelBytes + 1;
uint8_t* pU8SrcBufRowB = pU8SrcBuf + i32Row * imgSrc.step + 2 * i32PixelBytes + 2;
uint8_t* pU8DstBufRowR = pU8DstBuf + i32Row * imgDst.step + 2 * i32PixelBytes;
uint8_t* pU8DstBufRowG = pU8DstBuf + i32Row * imgDst.step + 2 * i32PixelBytes + 1;
uint8_t* pU8DstBufRowB = pU8DstBuf + i32Row * imgDst.step + 2 * i32PixelBytes + 2;
for(int i32Col = 2; i32Col < imgSrc.cols - 2; ++i32Col)
{
int i32SumR = 0;
int i32SumG = 0;
int i32SumB = 0;
// Accumulate the 5x5 neighbourhood: i is the row offset, j the column offset.
for(int i = -2; i <= 2; ++i)
{
for(int j = -2; j <= 2; ++j)
{
i32SumR += *(pU8SrcBufRowR + j * i32PixelBytes + (i * imgSrc.step));
i32SumG += *(pU8SrcBufRowG + j * i32PixelBytes + (i * imgSrc.step));
i32SumB += *(pU8SrcBufRowB + j * i32PixelBytes + (i * imgSrc.step));
}
}
// Integer mean of the 25 taps, narrowed back to one byte per channel.
*pU8DstBufRowR = uint8_t(i32SumR / 25);
*pU8DstBufRowG = uint8_t(i32SumG / 25);
*pU8DstBufRowB = uint8_t(i32SumB / 25);
// Advance every cursor to the next pixel of the row.
pU8SrcBufRowR += i32PixelBytes;
pU8SrcBufRowG += i32PixelBytes;
pU8SrcBufRowB += i32PixelBytes;
pU8DstBufRowR += i32PixelBytes;
pU8DstBufRowG += i32PixelBytes;
pU8DstBufRowB += i32PixelBytes;
}
}
Optimization Points
Cache locality
Approach: access memory contiguously along each row, not down the columns.
1
2
3
4
5
6
7
8
9
// Excerpt of the 5x5 window loops: j (the column offset) is the innermost loop, so
// consecutive reads move by i32PixelBytes within one row before i moves to the next row.
for(int i = -2; i <= 2; ++i)
{
for(int j = -2; j <= 2; ++j)
{
i32SumR += *(pU8SrcBufRowR + j * i32PixelBytes + (i * imgSrc.step));
i32SumG += *(pU8SrcBufRowG + j * i32PixelBytes + (i * imgSrc.step));
i32SumB += *(pU8SrcBufRowB + j * i32PixelBytes + (i * imgSrc.step));
}
}
Avoid the cost of repeated (double) pointer dereferencing
1
2
3
4
5
6
// Excerpt: per-channel start addresses for the current row, already offset past the
// two skipped pixels on the left (2 * i32PixelBytes) and by the channel byte (+0/+1/+2).
uint8_t* pU8SrcBufRowR = pU8SrcBuf + i32Row * imgSrc.step + 2 * i32PixelBytes;
uint8_t* pU8SrcBufRowG = pU8SrcBuf + i32Row * imgSrc.step + 2 * i32PixelBytes + 1;
uint8_t* pU8SrcBufRowB = pU8SrcBuf + i32Row * imgSrc.step + 2 * i32PixelBytes + 2;
uint8_t* pU8DstBufRowR = pU8DstBuf + i32Row * imgDst.step + 2 * i32PixelBytes;
uint8_t* pU8DstBufRowG = pU8DstBuf + i32Row * imgDst.step + 2 * i32PixelBytes + 1;
uint8_t* pU8DstBufRowB = pU8DstBuf + i32Row * imgDst.step + 2 * i32PixelBytes + 2;
Don’t make duplicate cost
1
2
3
// Excerpt: integer mean of the 25 accumulated taps, narrowed back to one byte per channel.
*pU8DstBufRowR = uint8_t(i32SumR / 25);
*pU8DstBufRowG = uint8_t(i32SumG / 25);
*pU8DstBufRowB = uint8_t(i32SumB / 25);
or
1
// 2621 == floor(65536 / 25). Because 2621 * 25 == 65525 < 65536, this yields one less
// than i32Sum / 25 whenever i32Sum is an exact multiple of 25 (e.g. 25 -> 0, 6375 -> 254).
(i32Sum * 2621) >> 16;
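Division is far more expensive than a multiply followed by a shift. Note, however, that the result is only approximate: 2621 = ⌊65536 / 25⌋ slightly underestimates 1/25, so the result can be one less than the true quotient (always so when the sum is an exact multiple of 25).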
2. Naive code: an attempted improvement that made it slower
Processing time: 480ms
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
// Hand-unrolled variant of the naive 5x5 box filter: the 25 taps per channel are
// written out explicitly instead of being generated by two inner loops.
// Each interior pixel becomes the integer mean of its 5x5 neighbourhood; the first
// two and last two rows/columns are skipped.
// NOTE(review): the "+ 1" / "+ 2" byte offsets presume three one-byte channels per pixel.
uint8_t* pU8SrcBuf = (uint8_t*)imgSrc.data;
uint8_t* pU8DstBuf = (uint8_t*)imgDst.data;
// Row strides kept as signed values so that negative row offsets (-2 * stride) are
// plain signed arithmetic instead of relying on unsigned wraparound.
int32_t i32SrcStepRows = imgSrc.step;
int32_t i32DstStepRows = imgDst.step;
int32_t i32PixelBytes = imgSrc.elemSize();
for(int i32Row = 2; i32Row < imgSrc.rows - 2; ++i32Row)
{
    int32_t i32OffsetX = 2 * i32PixelBytes;
    // Source and destination use their own stride; they are only equal when both
    // images share the same row pitch.
    int32_t i32SrcOffsetY = i32Row * i32SrcStepRows;
    int32_t i32DstOffsetY = i32Row * i32DstStepRows;
    uint8_t* pU8SrcBufRowR = pU8SrcBuf + i32SrcOffsetY + i32OffsetX;
    uint8_t* pU8SrcBufRowG = pU8SrcBuf + i32SrcOffsetY + i32OffsetX + 1;
    uint8_t* pU8SrcBufRowB = pU8SrcBuf + i32SrcOffsetY + i32OffsetX + 2;
    uint8_t* pU8DstBufRowR = pU8DstBuf + i32DstOffsetY + i32OffsetX;
    uint8_t* pU8DstBufRowG = pU8DstBuf + i32DstOffsetY + i32OffsetX + 1;
    uint8_t* pU8DstBufRowB = pU8DstBuf + i32DstOffsetY + i32OffsetX + 2;
    for(int i32Col = 2; i32Col < imgSrc.cols - 2; ++i32Col)
    {
        int i32SumR = 0;
        int i32SumG = 0;
        int i32SumB = 0;

        // Window row -2
        int i32CurRow = -2 * i32SrcStepRows;
        i32SumR += *(pU8SrcBufRowR + -2 * i32PixelBytes + i32CurRow);
        i32SumG += *(pU8SrcBufRowG + -2 * i32PixelBytes + i32CurRow);
        i32SumB += *(pU8SrcBufRowB + -2 * i32PixelBytes + i32CurRow);
        i32SumR += *(pU8SrcBufRowR + -1 * i32PixelBytes + i32CurRow);
        i32SumG += *(pU8SrcBufRowG + -1 * i32PixelBytes + i32CurRow);
        i32SumB += *(pU8SrcBufRowB + -1 * i32PixelBytes + i32CurRow);
        i32SumR += *(pU8SrcBufRowR + i32CurRow);
        i32SumG += *(pU8SrcBufRowG + i32CurRow);
        i32SumB += *(pU8SrcBufRowB + i32CurRow);
        i32SumR += *(pU8SrcBufRowR + i32PixelBytes + i32CurRow);
        i32SumG += *(pU8SrcBufRowG + i32PixelBytes + i32CurRow);
        i32SumB += *(pU8SrcBufRowB + i32PixelBytes + i32CurRow);
        i32SumR += *(pU8SrcBufRowR + 2 * i32PixelBytes + i32CurRow);
        i32SumG += *(pU8SrcBufRowG + 2 * i32PixelBytes + i32CurRow);
        i32SumB += *(pU8SrcBufRowB + 2 * i32PixelBytes + i32CurRow);

        // Window row -1
        i32CurRow = -1 * i32SrcStepRows;
        i32SumR += *(pU8SrcBufRowR + -2 * i32PixelBytes + i32CurRow);
        i32SumG += *(pU8SrcBufRowG + -2 * i32PixelBytes + i32CurRow);
        i32SumB += *(pU8SrcBufRowB + -2 * i32PixelBytes + i32CurRow);
        i32SumR += *(pU8SrcBufRowR + -1 * i32PixelBytes + i32CurRow);
        i32SumG += *(pU8SrcBufRowG + -1 * i32PixelBytes + i32CurRow);
        i32SumB += *(pU8SrcBufRowB + -1 * i32PixelBytes + i32CurRow);
        i32SumR += *(pU8SrcBufRowR + i32CurRow);
        i32SumG += *(pU8SrcBufRowG + i32CurRow);
        i32SumB += *(pU8SrcBufRowB + i32CurRow);
        i32SumR += *(pU8SrcBufRowR + i32PixelBytes + i32CurRow);
        i32SumG += *(pU8SrcBufRowG + i32PixelBytes + i32CurRow);
        i32SumB += *(pU8SrcBufRowB + i32PixelBytes + i32CurRow);
        i32SumR += *(pU8SrcBufRowR + 2 * i32PixelBytes + i32CurRow);
        i32SumG += *(pU8SrcBufRowG + 2 * i32PixelBytes + i32CurRow);
        i32SumB += *(pU8SrcBufRowB + 2 * i32PixelBytes + i32CurRow);

        // Window row 0 (the pixel's own row)
        i32CurRow = 0;
        i32SumR += *(pU8SrcBufRowR + -2 * i32PixelBytes + i32CurRow);
        i32SumG += *(pU8SrcBufRowG + -2 * i32PixelBytes + i32CurRow);
        i32SumB += *(pU8SrcBufRowB + -2 * i32PixelBytes + i32CurRow);
        i32SumR += *(pU8SrcBufRowR + -1 * i32PixelBytes + i32CurRow);
        i32SumG += *(pU8SrcBufRowG + -1 * i32PixelBytes + i32CurRow);
        i32SumB += *(pU8SrcBufRowB + -1 * i32PixelBytes + i32CurRow);
        i32SumR += *(pU8SrcBufRowR + i32CurRow);
        i32SumG += *(pU8SrcBufRowG + i32CurRow);
        i32SumB += *(pU8SrcBufRowB + i32CurRow);
        i32SumR += *(pU8SrcBufRowR + i32PixelBytes + i32CurRow);
        i32SumG += *(pU8SrcBufRowG + i32PixelBytes + i32CurRow);
        i32SumB += *(pU8SrcBufRowB + i32PixelBytes + i32CurRow);
        i32SumR += *(pU8SrcBufRowR + 2 * i32PixelBytes + i32CurRow);
        i32SumG += *(pU8SrcBufRowG + 2 * i32PixelBytes + i32CurRow);
        i32SumB += *(pU8SrcBufRowB + 2 * i32PixelBytes + i32CurRow);

        // Window row +1
        i32CurRow = 1 * i32SrcStepRows;
        i32SumR += *(pU8SrcBufRowR + -2 * i32PixelBytes + i32CurRow);
        i32SumG += *(pU8SrcBufRowG + -2 * i32PixelBytes + i32CurRow);
        i32SumB += *(pU8SrcBufRowB + -2 * i32PixelBytes + i32CurRow);
        i32SumR += *(pU8SrcBufRowR + -1 * i32PixelBytes + i32CurRow);
        i32SumG += *(pU8SrcBufRowG + -1 * i32PixelBytes + i32CurRow);
        i32SumB += *(pU8SrcBufRowB + -1 * i32PixelBytes + i32CurRow);
        i32SumR += *(pU8SrcBufRowR + i32CurRow);
        i32SumG += *(pU8SrcBufRowG + i32CurRow);
        i32SumB += *(pU8SrcBufRowB + i32CurRow);
        i32SumR += *(pU8SrcBufRowR + i32PixelBytes + i32CurRow);
        i32SumG += *(pU8SrcBufRowG + i32PixelBytes + i32CurRow);
        i32SumB += *(pU8SrcBufRowB + i32PixelBytes + i32CurRow);
        i32SumR += *(pU8SrcBufRowR + 2 * i32PixelBytes + i32CurRow);
        i32SumG += *(pU8SrcBufRowG + 2 * i32PixelBytes + i32CurRow);
        i32SumB += *(pU8SrcBufRowB + 2 * i32PixelBytes + i32CurRow);

        // Window row +2
        i32CurRow = 2 * i32SrcStepRows;
        i32SumR += *(pU8SrcBufRowR + -2 * i32PixelBytes + i32CurRow);
        i32SumG += *(pU8SrcBufRowG + -2 * i32PixelBytes + i32CurRow);
        i32SumB += *(pU8SrcBufRowB + -2 * i32PixelBytes + i32CurRow);
        i32SumR += *(pU8SrcBufRowR + -1 * i32PixelBytes + i32CurRow);
        i32SumG += *(pU8SrcBufRowG + -1 * i32PixelBytes + i32CurRow);
        i32SumB += *(pU8SrcBufRowB + -1 * i32PixelBytes + i32CurRow);
        i32SumR += *(pU8SrcBufRowR + i32CurRow);
        i32SumG += *(pU8SrcBufRowG + i32CurRow);
        i32SumB += *(pU8SrcBufRowB + i32CurRow);
        i32SumR += *(pU8SrcBufRowR + i32PixelBytes + i32CurRow);
        i32SumG += *(pU8SrcBufRowG + i32PixelBytes + i32CurRow);
        i32SumB += *(pU8SrcBufRowB + i32PixelBytes + i32CurRow);
        i32SumR += *(pU8SrcBufRowR + 2 * i32PixelBytes + i32CurRow);
        i32SumG += *(pU8SrcBufRowG + 2 * i32PixelBytes + i32CurRow);
        i32SumB += *(pU8SrcBufRowB + 2 * i32PixelBytes + i32CurRow);

        // Integer mean of the 25 taps, narrowed back to one byte per channel.
        *pU8DstBufRowR = uint8_t(i32SumR / 25);
        *pU8DstBufRowG = uint8_t(i32SumG / 25);
        *pU8DstBufRowB = uint8_t(i32SumB / 25);

        // Advance every cursor to the next pixel of the row.
        pU8SrcBufRowR += i32PixelBytes;
        pU8SrcBufRowG += i32PixelBytes;
        pU8SrcBufRowB += i32PixelBytes;
        pU8DstBufRowR += i32PixelBytes;
        pU8DstBufRowG += i32PixelBytes;
        pU8DstBufRowB += i32PixelBytes;
    }
}
Optimization Points (But Getting worse)
Loop unrolling
1
2
3
i32SumR += ...
i32SumR += ...
i32SumR += ...
The accumulation forms a dependency chain. Unrolling the loops by hand, however, made things worse here because it interferes with the compiler's own optimizations (for example, its prefetching). The unrolled code also uses so many variables that there are not enough registers, unlike the plain loop.
CPUs generally favour regular access patterns, easily predicted branches, and code that keeps the number of live variables small enough to stay in registers.
3. SIMD
Processing time: 40ms
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
// 5x5 box filter, AVX2 version. Because the mean is taken per byte position (each
// output byte averages the source bytes 3 bytes apart horizontally), 32 consecutive
// bytes of a row can be processed at once regardless of which channel they belong to.
// NOTE(review): the hard-coded 3 presumes three one-byte channels per pixel.
uint8_t* pU8SrcBuf = (uint8_t*)imgSrc.data;
uint8_t* pU8DstBuf = (uint8_t*)imgDst.data;
// Signed row stride so that negative row offsets (i * stride, i < 0) are plain
// signed arithmetic instead of relying on unsigned wraparound.
int32_t i32StepRows = imgSrc.step;
int32_t i32PixelBytes = imgSrc.elemSize();
__m256i m256iZero = _mm256_setzero_si256();
__m256i m256iDiv = _mm256_set1_epi16(25);
for(int i32Row = 2; i32Row < imgSrc.rows - 2; ++i32Row)
{
    uint8_t* pU8SrcBufRow = pU8SrcBuf + i32Row * imgSrc.step + 2 * i32PixelBytes;
    uint8_t* pU8DstBufRow = pU8DstBuf + i32Row * imgDst.step + 2 * i32PixelBytes;
    // Byte offset one past the last interior byte of the row (pixel cols-3, third byte).
    const int i32ColEnd = (imgSrc.cols - 2) * 3;
    int i32Col = 2 * 3;
    // Vector part: only while a full 32-byte block fits inside the interior, so the
    // store never touches the border and the widest load (j == 2, +6 bytes) ends
    // exactly at the end of the row at the latest.
    for(; i32Col + 32 <= i32ColEnd; i32Col += 32)
    {
        // 16-bit accumulators: 25 * 255 = 6375 does not fit in a byte.
        __m256i m256iSumLo = _mm256_setzero_si256();
        __m256i m256iSumHi = _mm256_setzero_si256();
        for(int i = -2; i <= 2; ++i)
        {
            for(int j = -2; j <= 2; ++j)
            {
                __m256i m256iSet0 = _mm256_loadu_si256((__m256i*)(pU8SrcBufRow + i * i32StepRows + j * 3));
                // Zero-extend bytes to 16 bit. Both unpacks work per 128-bit lane;
                // the final packus uses the same lane layout, so byte order is restored.
                __m256i m256iLo = _mm256_unpacklo_epi8(m256iSet0, m256iZero);
                __m256i m256iHi = _mm256_unpackhi_epi8(m256iSet0, m256iZero);
                m256iSumLo = _mm256_add_epi16(m256iSumLo, m256iLo);
                m256iSumHi = _mm256_add_epi16(m256iSumHi, m256iHi);
            }
        }
        // NOTE(review): _mm256_div_epi16 belongs to Intel's SVML intrinsic set rather
        // than being a single AVX2 instruction; availability depends on the compiler.
        m256iSumLo = _mm256_div_epi16(m256iSumLo, m256iDiv);
        m256iSumHi = _mm256_div_epi16(m256iSumHi, m256iDiv);
        __m256i m256Combine = _mm256_packus_epi16(m256iSumLo, m256iSumHi);
        _mm256_storeu_si256((__m256i*)pU8DstBufRow ,m256Combine);
        pU8SrcBufRow += 32;
        pU8DstBufRow += 32;
    }
    // Scalar tail: the remaining (fewer than 32) interior bytes of the row, so the
    // output covers the same region as the naive version.
    for(; i32Col < i32ColEnd; ++i32Col)
    {
        int i32Sum = 0;
        for(int i = -2; i <= 2; ++i)
        {
            for(int j = -2; j <= 2; ++j)
            {
                i32Sum += *(pU8SrcBufRow + i * i32StepRows + j * 3);
            }
        }
        *pU8DstBufRow = uint8_t(i32Sum / 25);
        ++pU8SrcBufRow;
        ++pU8DstBufRow;
    }
}
Optimization Points
SIMD
A convolution sums many input values. To avoid overflow, widen the 8-bit data to 16 bits (splitting each register into a low and a high half), do the arithmetic, and then pack the result back to 8 bits.
1
2
3
4
5
6
7
8
_mm256_setzero_si256() // latency 1
_mm256_loadu_si256() // latency 7
_mm256_unpacklo_epi8() // latency 1
_mm256_unpackhi_epi8() // latency 1
_mm256_add_epi16() // latency 1
_mm256_div_epi16() // no fixed latency: SVML routine (an instruction sequence), not a single instruction
_mm256_packus_epi16() // latency 3
_mm256_storeu_si256() // latency 1
- _mm256_setzero_si256
1
dst[MAX:0] := 0
- _mm256_loadu_si256()
1
2
dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0
- _mm256_unpacklo_epi8()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) {
dst[7:0] := src1[7:0]
dst[15:8] := src2[7:0]
dst[23:16] := src1[15:8]
dst[31:24] := src2[15:8]
dst[39:32] := src1[23:16]
dst[47:40] := src2[23:16]
dst[55:48] := src1[31:24]
dst[63:56] := src2[31:24]
dst[71:64] := src1[39:32]
dst[79:72] := src2[39:32]
dst[87:80] := src1[47:40]
dst[95:88] := src2[47:40]
dst[103:96] := src1[55:48]
dst[111:104] := src2[55:48]
dst[119:112] := src1[63:56]
dst[127:120] := src2[63:56]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128])
dst[MAX:256] := 0
- _mm256_unpackhi_epi8()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) {
dst[7:0] := src1[71:64]
dst[15:8] := src2[71:64]
dst[23:16] := src1[79:72]
dst[31:24] := src2[79:72]
dst[39:32] := src1[87:80]
dst[47:40] := src2[87:80]
dst[55:48] := src1[95:88]
dst[63:56] := src2[95:88]
dst[71:64] := src1[103:96]
dst[79:72] := src2[103:96]
dst[87:80] := src1[111:104]
dst[95:88] := src2[111:104]
dst[103:96] := src1[119:112]
dst[111:104] := src2[119:112]
dst[119:112] := src1[127:120]
dst[127:120] := src2[127:120]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128])
dst[MAX:256] := 0
- _mm256_add_epi16()
1
2
3
4
5
FOR j := 0 to 15
i := j*16
dst[i+15:i] := a[i+15:i] + b[i+15:i]
ENDFOR
dst[MAX:256] := 0
- _mm256_div_epi16()
1
2
3
4
5
6
7
8
FOR j := 0 to 15
i := 16*j
IF b[i+15:i] == 0
#DE
FI
dst[i+15:i] := Truncate16(a[i+15:i] / b[i+15:i])
ENDFOR
dst[MAX:256] := 0
- _mm256_packus_epi16()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
dst[7:0] := SaturateU8(a[15:0])
dst[15:8] := SaturateU8(a[31:16])
dst[23:16] := SaturateU8(a[47:32])
dst[31:24] := SaturateU8(a[63:48])
dst[39:32] := SaturateU8(a[79:64])
dst[47:40] := SaturateU8(a[95:80])
dst[55:48] := SaturateU8(a[111:96])
dst[63:56] := SaturateU8(a[127:112])
dst[71:64] := SaturateU8(b[15:0])
dst[79:72] := SaturateU8(b[31:16])
dst[87:80] := SaturateU8(b[47:32])
dst[95:88] := SaturateU8(b[63:48])
dst[103:96] := SaturateU8(b[79:64])
dst[111:104] := SaturateU8(b[95:80])
dst[119:112] := SaturateU8(b[111:96])
dst[127:120] := SaturateU8(b[127:112])
dst[135:128] := SaturateU8(a[143:128])
dst[143:136] := SaturateU8(a[159:144])
dst[151:144] := SaturateU8(a[175:160])
dst[159:152] := SaturateU8(a[191:176])
dst[167:160] := SaturateU8(a[207:192])
dst[175:168] := SaturateU8(a[223:208])
dst[183:176] := SaturateU8(a[239:224])
dst[191:184] := SaturateU8(a[255:240])
dst[199:192] := SaturateU8(b[143:128])
dst[207:200] := SaturateU8(b[159:144])
dst[215:208] := SaturateU8(b[175:160])
dst[223:216] := SaturateU8(b[191:176])
dst[231:224] := SaturateU8(b[207:192])
dst[239:232] := SaturateU8(b[223:208])
dst[247:240] := SaturateU8(b[239:224])
dst[255:248] := SaturateU8(b[255:240])
dst[MAX:256] := 0
- _mm256_storeu_si256()
1
MEM[mem_addr+255:mem_addr] := a[255:0]
4. SIMD, replacing the division with a multiply and shift
Processing time: 31ms
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
// 5x5 box filter, AVX2 version with the division by 25 replaced by a multiply-high
// and a shift. 32 consecutive bytes of a row are processed at once; each output byte
// is the mean of the source bytes 3 bytes apart horizontally over 5 rows.
// NOTE(review): the hard-coded 3 presumes three one-byte channels per pixel.
uint8_t* pU8SrcBuf = (uint8_t*)imgSrc.data;
uint8_t* pU8DstBuf = (uint8_t*)imgDst.data;
// Signed row stride so that negative row offsets (i * stride, i < 0) are plain
// signed arithmetic instead of relying on unsigned wraparound.
int32_t i32StepRows = imgSrc.step;
int32_t i32PixelBytes = imgSrc.elemSize();
__m256i m256iZero = _mm256_setzero_si256();
// Exact division by 25: floor(x / 25) == (x * 5243) >> 17 for every x < 43690,
// where 5243 == ceil(2^17 / 25). The sums here never exceed 25 * 255 = 6375.
// (The simpler (x * 2621) >> 16 is one too small for every exact multiple of 25,
// which darkens uniform areas by 1.)
__m256i m256iDivMagic = _mm256_set1_epi16(5243);
for(int i32Row = 2; i32Row < imgSrc.rows - 2; ++i32Row)
{
    uint8_t* pU8SrcBufRow = pU8SrcBuf + i32Row * imgSrc.step + 2 * i32PixelBytes;
    uint8_t* pU8DstBufRow = pU8DstBuf + i32Row * imgDst.step + 2 * i32PixelBytes;
    // Byte offset one past the last interior byte of the row (pixel cols-3, third byte).
    const int i32ColEnd = (imgSrc.cols - 2) * 3;
    int i32Col = 2 * 3;
    // Vector part: only while a full 32-byte block fits inside the interior, so the
    // store never touches the border and the widest load (j == 2, +6 bytes) ends
    // exactly at the end of the row at the latest.
    for(; i32Col + 32 <= i32ColEnd; i32Col += 32)
    {
        // 16-bit accumulators: 25 * 255 = 6375 does not fit in a byte.
        __m256i m256iSumLo = _mm256_setzero_si256();
        __m256i m256iSumHi = _mm256_setzero_si256();
        for(int i = -2; i <= 2; ++i)
        {
            for(int j = -2; j <= 2; ++j)
            {
                __m256i m256iSet0 = _mm256_loadu_si256((__m256i*)(pU8SrcBufRow + i * i32StepRows + j * 3));
                // Zero-extend bytes to 16 bit. Both unpacks work per 128-bit lane;
                // the final packus uses the same lane layout, so byte order is restored.
                __m256i m256iLo = _mm256_unpacklo_epi8(m256iSet0, m256iZero);
                __m256i m256iHi = _mm256_unpackhi_epi8(m256iSet0, m256iZero);
                m256iSumLo = _mm256_add_epi16(m256iSumLo, m256iLo);
                m256iSumHi = _mm256_add_epi16(m256iSumHi, m256iHi);
            }
        }
        // mulhi yields (x * 5243) >> 16; one more right shift completes the >> 17.
        m256iSumLo = _mm256_srli_epi16(_mm256_mulhi_epu16(m256iSumLo, m256iDivMagic), 1);
        m256iSumHi = _mm256_srli_epi16(_mm256_mulhi_epu16(m256iSumHi, m256iDivMagic), 1);
        __m256i m256Combine = _mm256_packus_epi16(m256iSumLo, m256iSumHi);
        _mm256_storeu_si256((__m256i*)pU8DstBufRow ,m256Combine);
        pU8SrcBufRow += 32;
        pU8DstBufRow += 32;
    }
    // Scalar tail: the remaining (fewer than 32) interior bytes of the row, so the
    // output covers the same region as the naive version.
    for(; i32Col < i32ColEnd; ++i32Col)
    {
        int i32Sum = 0;
        for(int i = -2; i <= 2; ++i)
        {
            for(int j = -2; j <= 2; ++j)
            {
                i32Sum += *(pU8SrcBufRow + i * i32StepRows + j * 3);
            }
        }
        *pU8DstBufRow = uint8_t(i32Sum / 25);
        ++pU8SrcBufRow;
        ++pU8DstBufRow;
    }
}
5. SIMD, another approach (not faster, but instructive)
Processing time: 104ms
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
// 5x5 box filter, AVX2 version that first de-interleaves 32 pixels (96 bytes) into
// separate B, G and R planes, accumulates each plane, divides, and re-interleaves
// the result into the packed 3-bytes-per-pixel layout before storing.
// NOTE(review): the layout comments assume B,G,R byte order and three one-byte
// channels per pixel, as the masks below do.
uint8_t* pU8SrcBuf = (uint8_t*)imgSrc.data;
uint8_t* pU8DstBuf = (uint8_t*)imgDst.data;
// Signed row stride so that negative row offsets (i * stride, i < 0) are plain
// signed arithmetic instead of relying on unsigned wraparound.
int32_t i32StepRows = imgSrc.step;
int32_t i32PixelBytes = imgSrc.elemSize();
__m256i m256iZero = _mm256_setzero_si256();
__m256i m256iDiv = _mm256_set1_epi16(25);
// Shuffle masks picking one channel out of a 16-byte chunk. Three chunk phases
// exist (a chunk starts with B, G or R), hence three masks per channel; -1 zeroes
// the output byte. Each mask is repeated for both 128-bit lanes.
__m256i m256iMaskB0 = _mm256_setr_epi8(0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                       0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i m256iMaskB1 = _mm256_setr_epi8(2, 5, 8, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                       2, 5, 8, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i m256iMaskB2 = _mm256_setr_epi8(1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                       1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i m256iMaskG0 = _mm256_setr_epi8(1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                       1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i m256iMaskG1 = _mm256_setr_epi8(0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                       0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i m256iMaskG2 = _mm256_setr_epi8(2, 5, 8, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                       2, 5, 8, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i m256iMaskR0 = _mm256_setr_epi8(2, 5, 8, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                       2, 5, 8, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i m256iMaskR1 = _mm256_setr_epi8(1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                       1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i m256iMaskR2 = _mm256_setr_epi8(0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                       0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
for(int i32Row = 2; i32Row < imgSrc.rows - 2; ++i32Row)
{
    uint8_t* pU8SrcBufRow = pU8SrcBuf + i32Row * imgSrc.step + 2 * i32PixelBytes;
    uint8_t* pU8DstBufRow = pU8DstBuf + i32Row * imgDst.step + 2 * i32PixelBytes;
    // Byte offset one past the last interior byte of the row (pixel cols-3, third byte).
    const int i32ColEnd = (imgSrc.cols - 2) * 3;
    int i32Col = 2 * 3;
    // Vector part: only while a full 96-byte block (32 pixels) fits inside the
    // interior, so the stores never touch the border and the widest load
    // (+64 bytes, j == 2) ends exactly at the end of the row at the latest.
    for(; i32Col + 96 <= i32ColEnd; i32Col += 96)
    {
        // 16-bit accumulators per channel: 25 * 255 = 6375 does not fit in a byte.
        __m256i m256iSumBLo = _mm256_setzero_si256();
        __m256i m256iSumBHi = _mm256_setzero_si256();
        __m256i m256iSumGLo = _mm256_setzero_si256();
        __m256i m256iSumGHi = _mm256_setzero_si256();
        __m256i m256iSumRLo = _mm256_setzero_si256();
        __m256i m256iSumRHi = _mm256_setzero_si256();
        for(int i = -2; i <= 2; ++i)
        {
            for(int j = -2; j <= 2; ++j)
            {
                // B0 G0 R0 B1 G1 R1 B2 G2 R2 B3 G3 R3 B4 G4 R4 B5 - L0
                // G5 R5 B6 G6 R6 B7 G7 R7 B8 G8 R8 B9 G9 R9 B10 G10 - L1
                // R10 B11 G11 R11 B12 G12 R12 B13 G13 R13 B14 G14 R14 B15 G15 R15 - L2
                // B16 G16 R16 B17 G17 R17 B18 G18 R18 B19 G19 R19 B20 G20 R20 B21 - L3
                // G21 R21 B22 G22 R22 B23 G23 R23 B24 G24 R24 B25 G25 R25 B26 G26 - L4
                // R26 B27 G27 R27 B28 G28 R28 B29 G29 R29 B30 G30 R30 B31 G31 R31 - L5
                __m256i m256iSet0 = _mm256_loadu_si256((__m256i*)(pU8SrcBufRow + i * i32StepRows + j * 3));      // L0 | L1
                __m256i m256iSet1 = _mm256_loadu_si256((__m256i*)(pU8SrcBufRow + 32 + i * i32StepRows + j * 3)); // L2 | L3
                __m256i m256iSet2 = _mm256_loadu_si256((__m256i*)(pU8SrcBufRow + 64 + i * i32StepRows + j * 3)); // L4 | L5
                // Pair up the 16-byte chunks that share the same channel phase so one
                // per-lane shuffle mask serves both lanes.
                __m256i m256iL0L3Src = _mm256_permute2x128_si256(m256iSet0, m256iSet1, 0x30); // L0 | L3
                __m256i m256iL1L4Src = _mm256_permute2x128_si256(m256iSet0, m256iSet2, 0x21); // L1 | L4
                __m256i m256iL2L5Src = _mm256_permute2x128_si256(m256iSet1, m256iSet2, 0x30); // L2 | L5
                __m256i m256iB;
                {
                    __m256i m256iL0L3 = _mm256_shuffle_epi8(m256iL0L3Src, m256iMaskB0); // B0 ~ B5 | B16 ~ B21
                    __m256i m256iL1L4 = _mm256_shuffle_epi8(m256iL1L4Src, m256iMaskB1); // B6 ~ B10 | B22 ~ B26
                    __m256i m256iL2L5 = _mm256_shuffle_epi8(m256iL2L5Src, m256iMaskB2); // B11 ~ B15 | B27 ~ B31
                    // Move each group behind the previous one: 6 values, then 6 + 5 = 11.
                    m256iL1L4 = _mm256_slli_si256(m256iL1L4, 6);
                    m256iL2L5 = _mm256_slli_si256(m256iL2L5, 11);
                    m256iB = _mm256_or_si256(m256iL0L3, _mm256_or_si256(m256iL1L4, m256iL2L5));
                }
                __m256i m256iG;
                {
                    __m256i m256iL0L3 = _mm256_shuffle_epi8(m256iL0L3Src, m256iMaskG0); // G0 ~ G4 | G16 ~ G20
                    __m256i m256iL1L4 = _mm256_shuffle_epi8(m256iL1L4Src, m256iMaskG1); // G5 ~ G10 | G21 ~ G26
                    __m256i m256iL2L5 = _mm256_shuffle_epi8(m256iL2L5Src, m256iMaskG2); // G11 ~ G15 | G27 ~ G31
                    // 5 values, then 5 + 6 = 11.
                    m256iL1L4 = _mm256_slli_si256(m256iL1L4, 5);
                    m256iL2L5 = _mm256_slli_si256(m256iL2L5, 11);
                    m256iG = _mm256_or_si256(m256iL0L3, _mm256_or_si256(m256iL1L4, m256iL2L5));
                }
                __m256i m256iR;
                {
                    __m256i m256iL0L3 = _mm256_shuffle_epi8(m256iL0L3Src, m256iMaskR0); // R0 ~ R4 | R16 ~ R20
                    __m256i m256iL1L4 = _mm256_shuffle_epi8(m256iL1L4Src, m256iMaskR1); // R5 ~ R9 | R21 ~ R25
                    __m256i m256iL2L5 = _mm256_shuffle_epi8(m256iL2L5Src, m256iMaskR2); // R10 ~ R15 | R26 ~ R31
                    // 5 values, then 5 + 5 = 10.
                    m256iL1L4 = _mm256_slli_si256(m256iL1L4, 5);
                    m256iL2L5 = _mm256_slli_si256(m256iL2L5, 10);
                    m256iR = _mm256_or_si256(m256iL0L3, _mm256_or_si256(m256iL1L4, m256iL2L5));
                }
                // Zero-extend each plane to 16 bit (per lane) and accumulate.
                __m256i m256iBLo = _mm256_unpacklo_epi8(m256iB, m256iZero);
                __m256i m256iBHi = _mm256_unpackhi_epi8(m256iB, m256iZero);
                m256iSumBLo = _mm256_add_epi16(m256iSumBLo, m256iBLo);
                m256iSumBHi = _mm256_add_epi16(m256iSumBHi, m256iBHi);
                __m256i m256iGLo = _mm256_unpacklo_epi8(m256iG, m256iZero);
                __m256i m256iGHi = _mm256_unpackhi_epi8(m256iG, m256iZero);
                m256iSumGLo = _mm256_add_epi16(m256iSumGLo, m256iGLo);
                m256iSumGHi = _mm256_add_epi16(m256iSumGHi, m256iGHi);
                __m256i m256iRLo = _mm256_unpacklo_epi8(m256iR, m256iZero);
                __m256i m256iRHi = _mm256_unpackhi_epi8(m256iR, m256iZero);
                m256iSumRLo = _mm256_add_epi16(m256iSumRLo, m256iRLo);
                m256iSumRHi = _mm256_add_epi16(m256iSumRHi, m256iRHi);
            }
        }
        // NOTE(review): _mm256_div_epi16 belongs to Intel's SVML intrinsic set rather
        // than being a single AVX2 instruction; availability depends on the compiler.
        m256iSumBLo = _mm256_div_epi16(m256iSumBLo, m256iDiv);
        m256iSumBHi = _mm256_div_epi16(m256iSumBHi, m256iDiv);
        m256iSumGLo = _mm256_div_epi16(m256iSumGLo, m256iDiv);
        m256iSumGHi = _mm256_div_epi16(m256iSumGHi, m256iDiv);
        m256iSumRLo = _mm256_div_epi16(m256iSumRLo, m256iDiv);
        m256iSumRHi = _mm256_div_epi16(m256iSumRHi, m256iDiv);
        __m256i m256CombineB = _mm256_packus_epi16(m256iSumBLo, m256iSumBHi);
        __m256i m256CombineG = _mm256_packus_epi16(m256iSumGLo, m256iSumGHi);
        __m256i m256CombineR = _mm256_packus_epi16(m256iSumRLo, m256iSumRHi);
        // B = [B0~B15 | B16~B31]
        // The byte shuffle only selects within a 128-bit lane, so each plane is split
        // into its two 16-value halves and re-interleaved with 128-bit shuffles.
        __m128i m128iBLo = _mm256_castsi256_si128(m256CombineB);
        __m128i m128iGLo = _mm256_castsi256_si128(m256CombineG);
        __m128i m128iRLo = _mm256_castsi256_si128(m256CombineR);
        __m128i m128iBHi = _mm256_extracti128_si256(m256CombineB, 1);
        __m128i m128iGHi = _mm256_extracti128_si256(m256CombineG, 1);
        __m128i m128iRHi = _mm256_extracti128_si256(m256CombineR, 1);
        auto OR3 = [](__m128i a, __m128i b, __m128i c)
        {
            return _mm_or_si128(a, _mm_or_si128(b, c));
        };
        // Block k rebuilds the 16 output bytes of chunk Lk: every source contributes
        // its bytes at the right positions and zeros (-1) elsewhere, then the three are OR-ed.
        __m128i m128iBlock0 = OR3(
            _mm_shuffle_epi8(m128iBLo, _mm_setr_epi8(0,-1,-1, 1,-1,-1, 2,-1,-1, 3,-1,-1, 4,-1,-1, 5)),
            _mm_shuffle_epi8(m128iGLo, _mm_setr_epi8(-1,0,-1, -1,1,-1, -1,2,-1, -1,3,-1, -1,4,-1, -1)),
            _mm_shuffle_epi8(m128iRLo, _mm_setr_epi8(-1,-1,0, -1,-1,1, -1,-1,2, -1,-1,3, -1,-1,4, -1)));
        __m128i m128iBlock1 = OR3(
            _mm_shuffle_epi8(m128iBLo, _mm_setr_epi8(-1,-1,6, -1,-1,7, -1,-1,8, -1,-1,9, -1,-1,10, -1)),
            _mm_shuffle_epi8(m128iGLo, _mm_setr_epi8(5,-1,-1, 6,-1,-1, 7,-1,-1, 8,-1,-1, 9,-1,-1, 10)),
            _mm_shuffle_epi8(m128iRLo, _mm_setr_epi8(-1,5,-1, -1,6,-1, -1,7,-1, -1,8,-1, -1,9,-1, -1)));
        __m128i m128iBlock2 = OR3(
            _mm_shuffle_epi8(m128iBLo, _mm_setr_epi8(-1,11,-1, -1,12,-1, -1,13,-1, -1,14,-1, -1,15,-1, -1)),
            _mm_shuffle_epi8(m128iGLo, _mm_setr_epi8(-1,-1,11, -1,-1,12, -1,-1,13, -1,-1,14, -1,-1,15, -1)),
            _mm_shuffle_epi8(m128iRLo, _mm_setr_epi8(10,-1,-1, 11,-1,-1, 12,-1,-1, 13,-1,-1, 14,-1,-1, 15)));
        __m128i m128iBlock3 = OR3(
            _mm_shuffle_epi8(m128iBHi, _mm_setr_epi8(0,-1,-1, 1,-1,-1, 2,-1,-1, 3,-1,-1, 4,-1,-1, 5)),
            _mm_shuffle_epi8(m128iGHi, _mm_setr_epi8(-1,0,-1, -1,1,-1, -1,2,-1, -1,3,-1, -1,4,-1, -1)),
            _mm_shuffle_epi8(m128iRHi, _mm_setr_epi8(-1,-1,0, -1,-1,1, -1,-1,2, -1,-1,3, -1,-1,4, -1)));
        __m128i m128iBlock4 = OR3(
            _mm_shuffle_epi8(m128iBHi, _mm_setr_epi8(-1,-1,6, -1,-1,7, -1,-1,8, -1,-1,9, -1,-1,10, -1)),
            _mm_shuffle_epi8(m128iGHi, _mm_setr_epi8(5,-1,-1, 6,-1,-1, 7,-1,-1, 8,-1,-1, 9,-1,-1, 10)),
            _mm_shuffle_epi8(m128iRHi, _mm_setr_epi8(-1,5,-1, -1,6,-1, -1,7,-1, -1,8,-1, -1,9,-1, -1)));
        __m128i m128iBlock5 = OR3(
            _mm_shuffle_epi8(m128iBHi, _mm_setr_epi8(-1,11,-1, -1,12,-1, -1,13,-1, -1,14,-1, -1,15,-1, -1)),
            _mm_shuffle_epi8(m128iGHi, _mm_setr_epi8(-1,-1,11, -1,-1,12, -1,-1,13, -1,-1,14, -1,-1,15, -1)),
            _mm_shuffle_epi8(m128iRHi, _mm_setr_epi8(10,-1,-1, 11,-1,-1, 12,-1,-1, 13,-1,-1, 14,-1,-1, 15)));
        // [L0 | L1], [L2 | L3], [L4 | L5]  (_mm256_set_m128i takes the high half first)
        __m256i m256iSection0 = _mm256_set_m128i(m128iBlock1, m128iBlock0);
        __m256i m256iSection1 = _mm256_set_m128i(m128iBlock3, m128iBlock2);
        __m256i m256iSection2 = _mm256_set_m128i(m128iBlock5, m128iBlock4);
        _mm256_storeu_si256((__m256i*)(pU8DstBufRow + 0), m256iSection0);
        _mm256_storeu_si256((__m256i*)(pU8DstBufRow + 32), m256iSection1);
        _mm256_storeu_si256((__m256i*)(pU8DstBufRow + 64), m256iSection2);
        pU8SrcBufRow += 96;
        pU8DstBufRow += 96;
    }
    // Scalar tail: the remaining (fewer than 96) interior bytes of the row. The mean
    // is taken per byte position, so no channel separation is needed here.
    for(; i32Col < i32ColEnd; ++i32Col)
    {
        int i32Sum = 0;
        for(int i = -2; i <= 2; ++i)
        {
            for(int j = -2; j <= 2; ++j)
            {
                i32Sum += *(pU8SrcBufRow + i * i32StepRows + j * 3);
            }
        }
        *pU8DstBufRow = uint8_t(i32Sum / 25);
        ++pU8SrcBufRow;
        ++pU8DstBufRow;
    }
}
Processing Order
BGR Order
1
2
3
4
5
6
L0: B0 G0 R0 B1 G1 R1 B2 G2 R2 B3 G3 R3 B4 G4 R4 B5
L1: G5 R5 B6 G6 R6 B7 G7 R7 B8 G8 R8 B9 G9 R9 B10 G10
L2: R10 B11 G11 R11 B12 G12 R12 B13 G13 R13 B14 G14 R14 B15 G15 R15
L3: B16 G16 R16 B17 G17 R17 B18 G18 R18 B19 G19 R19 B20 G20 R20 B21
L4: G21 R21 B22 G22 R22 B23 G23 R23 B24 G24 R24 B25 G25 R25 B26 G26
L5: R26 B27 G27 R27 B28 G28 R28 B29 G29 R29 B30 G30 R30 B31 G31 R31
Pair up the chunks that share the same channel order: _mm256_permute2x128_si256
1
2
3
L0 | L3
L1 | L4
L2 | L5
extract each B / G / R: _mm256_shuffle_epi8
1
2
3
4
5
6
7
8
9
10
11
L0|L3: B0 ~ B5 | B16 ~ B21
L1|L4: B6 ~ B10 | B22 ~ B26
L2|L5: B11 ~ B15 | B27 ~ B31
L0|L3: G0 ~ G4 | G16 ~ G20
L1|L4: G5 ~ G10 | G21 ~ G26
L2|L5: G11 ~ G15 | G27 ~ G31
L0|L3: R0 ~ R4 | R16 ~ R20
L1|L4: R5 ~ R9 | R21 ~ R25
L2|L5: R10 ~ R15 | R26 ~ R31
B0 ~ B31: _mm256_slli_si256, _mm256_or_si256
1
2
3
4
5
L0|L3: B0 ~ B5 | B16 ~ B21
L1|L4: B6 ~ B10 | B22 ~ B26 << 6 bytes
L2|L5: B11 ~ B15 | B27 ~ B31 << 11 bytes
-> B0 ~ B31
Separating B0~B15 | B16~B31: _mm256_castsi256_si128, _mm256_extracti128_si256
Extract and Combine: _mm_shuffle_epi8, _mm_setr_epi8, _mm_or_si128
1
2
3
4
5
B0 B1 ~
G0 G1 ~
R0 R1 ~
B0 G0 R0 B1 G1 R1 ----
This split is needed because the byte shuffle cannot select a byte with an index above 15: it only selects within each 128-bit lane.
1
2
3
pos: 0 ...... 15 | 16 ....... 31
| | | |
extract idx 0 ..... 15 0 ....... 15
Optimization Points
SIMD
1
2
3
4
5
6
7
_mm256_permute2x128_si256() // latency 3
_mm256_shuffle_epi8() // latency 1
_mm256_slli_si256() // latency 1
_mm256_or_si256() // latency 1
_mm256_castsi256_si128() // latency 0 (cast only, generates no instruction)
_mm256_extracti128_si256() // latency 3
_mm256_set_m128i() // latency 3
- _mm256_permute2x128_si256()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
DEFINE SELECT4(src1, src2, control) {
CASE(control[1:0]) OF
0: tmp[127:0] := src1[127:0]
1: tmp[127:0] := src1[255:128]
2: tmp[127:0] := src2[127:0]
3: tmp[127:0] := src2[255:128]
ESAC
IF control[3]
tmp[127:0] := 0
FI
RETURN tmp[127:0]
}
dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0])
dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4])
dst[MAX:256] := 0
- _mm256_shuffle_epi8()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
FOR j := 0 to 15
i := j*8
IF b[i+7] == 1
dst[i+7:i] := 0
ELSE
index[3:0] := b[i+3:i]
dst[i+7:i] := a[index*8+7:index*8]
FI
IF b[128+i+7] == 1
dst[128+i+7:128+i] := 0
ELSE
index[3:0] := b[128+i+3:128+i]
dst[128+i+7:128+i] := a[128+index*8+7:128+index*8]
FI
ENDFOR
dst[MAX:256] := 0
- _mm256_slli_si256()
1
2
3
4
5
6
7
tmp := imm8[7:0]
IF tmp > 15
tmp := 16
FI
dst[127:0] := a[127:0] << (tmp*8)
dst[255:128] := a[255:128] << (tmp*8)
dst[MAX:256] := 0
- _mm256_or_si256()
1
2
dst[255:0] := (a[255:0] OR b[255:0])
dst[MAX:256] := 0
- _mm256_castsi256_si128()
1
- _mm256_extracti128_si256()
1
2
3
4
5
CASE imm8[0] OF
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
ESAC
dst[MAX:128] := 0
- _mm256_set_m128i()
1
2
3
dst[127:0] := lo[127:0]
dst[255:128] := hi[127:0]
dst[MAX:256] := 0
6. Practice
The code below does not produce the same result as section 5 (SIMD); let's find the issue.
Hint: the shuffle cannot address indices 16 ~ 31.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
// Practice listing: re-interleaves the three planes m256CombineB/G/R (each laid out
// as [x0~x15 | x16~x31]) into packed B,G,R bytes using 256-bit shuffles directly.
// NOTE(review): per the _mm256_shuffle_epi8 semantics quoted in this article, only
// the low 4 bits of each mask byte are used and each 128-bit half selects from its
// own half of the source. A mask value v in the lower half therefore reads element
// (v & 15), and in the upper half element 16 + (v & 15).
// B0 G0 R0 B1 G1 R1 B2 G2 R2 B3 G3 R3 B4 G4 R4 B5 - L0
// G5 R5 B6 G6 R6 B7 G7 R7 B8 G8 R8 B9 G9 R9 B10 G10 - L1
__m256i m256iSection0;
{
// NOTE(review): the upper half asks for elements 5..10, but the upper half of each
// source holds elements 16..31, so it actually receives elements 21..26.
__m256i m256iMask0 = _mm256_setr_epi8(0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5,
-1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1);
__m256i m256iMask1 = _mm256_setr_epi8(-1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1,
5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10);
__m256i m256iMask2 = _mm256_setr_epi8(-1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1,
-1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1);
__m256i m256i0 = _mm256_shuffle_epi8(m256CombineB, m256iMask0);
__m256i m256i1 = _mm256_shuffle_epi8(m256CombineG, m256iMask1);
__m256i m256i2 = _mm256_shuffle_epi8(m256CombineR, m256iMask2);
m256iSection0 = _mm256_or_si256(m256i0, _mm256_or_si256(m256i1, m256i2));
}
// R10 B11 G11 R11 B12 G12 R12 B13 G13 R13 B14 G14 R14 B15 G15 R15 - L2
// B16 G16 R16 B17 G17 R17 B18 G18 R18 B19 G19 R19 B20 G20 R20 B21 - L3
__m256i m256iSection1;
{
// NOTE(review): here the lower half wants elements 10..15 and the upper half
// 16..21; (v & 15) within the matching half happens to select exactly those.
__m256i m256iMask0 = _mm256_setr_epi8(10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15,
-1, -1, 16, -1, -1, 17, -1, -1, 18, -1, -1, 19, -1, -1, 20, -1);
__m256i m256iMask1 = _mm256_setr_epi8(-1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1,
16, -1, -1, 17, -1, -1, 18, -1, -1, 19, -1, -1, 20, -1, -1, 21);
__m256i m256iMask2 = _mm256_setr_epi8(-1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1,
-1, 16, -1, -1, 17, -1, -1, 18, -1, -1, 19, -1, -1, 20, -1, -1);
__m256i m256i0 = _mm256_shuffle_epi8(m256CombineR, m256iMask0);
__m256i m256i1 = _mm256_shuffle_epi8(m256CombineB, m256iMask1);
__m256i m256i2 = _mm256_shuffle_epi8(m256CombineG, m256iMask2);
m256iSection1 = _mm256_or_si256(m256i0, _mm256_or_si256(m256i1, m256i2));
}
// G21 R21 B22 G22 R22 B23 G23 R23 B24 G24 R24 B25 G25 R25 B26 G26 - L4
// R26 B27 G27 R27 B28 G28 R28 B29 G29 R29 B30 G30 R30 B31 G31 R31 - L5
__m256i m256iSection2;
{
// NOTE(review): the lower half asks for elements 21..26, but the lower half of each
// source holds elements 0..15, so it actually receives elements 5..10.
__m256i m256iMask0 = _mm256_setr_epi8(21, -1, -1, 22, -1, -1, 23, -1, -1, 24, -1, -1, 25, -1, -1, 26,
-1, -1, 27, -1, -1, 28, -1, -1, 29, -1, -1, 30, -1, -1, 31, -1);
__m256i m256iMask1 = _mm256_setr_epi8(-1, 21, -1, -1, 22, -1, -1, 23, -1, -1, 24, -1, -1, 25, -1, -1,
26, -1, -1, 27, -1, -1, 28, -1, -1, 29, -1, -1, 30, -1, -1, 31);
__m256i m256iMask2 = _mm256_setr_epi8(-1, -1, 22, -1, -1, 23, -1, -1, 24, -1, -1, 25, -1, -1, 26, -1,
-1, 27, -1, -1, 28, -1, -1, 29, -1, -1, 30, -1, -1, 31, -1, -1);
__m256i m256i0 = _mm256_shuffle_epi8(m256CombineG, m256iMask0);
__m256i m256i1 = _mm256_shuffle_epi8(m256CombineR, m256iMask1);
__m256i m256i2 = _mm256_shuffle_epi8(m256CombineB, m256iMask2);
m256iSection2 = _mm256_or_si256(m256i0, _mm256_or_si256(m256i1, m256i2));
}
// Store the three 32-byte sections back to back (96 output bytes).
_mm256_storeu_si256((__m256i*)pU8DstBufRow, m256iSection0);
_mm256_storeu_si256((__m256i*)(pU8DstBufRow + 32), m256iSection1);
_mm256_storeu_si256((__m256i*)(pU8DstBufRow + 64), m256iSection2);