/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_CORE_SSE_UTILS_HPP__
#define __OPENCV_CORE_SSE_UTILS_HPP__

#ifndef __cplusplus
#  error sse_utils.hpp header must be compiled as C++
#endif

#if CV_SSE2

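// The _mm_deinterleave_* helpers below gather the channels of interleaved pixel
// data that has been loaded into consecutive SSE registers, using a butterfly of
// _mm_unpacklo_*/_mm_unpackhi_* steps; the matching _mm_interleave_* helpers
// perform the inverse. All of them operate in place on their register arguments.
//
// A minimal usage sketch (hypothetical caller code, not part of this header),
// assuming src points to 32 interleaved R,G byte pairs:
//
//   __m128i v_r0 = _mm_loadu_si128((const __m128i *)(src));      // R0 G0 .. R7 G7
//   __m128i v_r1 = _mm_loadu_si128((const __m128i *)(src + 16)); // R8 G8 .. R15 G15
//   __m128i v_g0 = _mm_loadu_si128((const __m128i *)(src + 32)); // R16 G16 .. R23 G23
//   __m128i v_g1 = _mm_loadu_si128((const __m128i *)(src + 48)); // R24 G24 .. R31 G31
//   _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1);
//   // v_r0/v_r1 now hold R0..R31 and v_g0/v_g1 hold G0..G31.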
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_g1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_g1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk2);
    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk2);
    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk3);
    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk3);

    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk3);
    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk3);

    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk2);
    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk2);
    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk3);
    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk3);

    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk2);
    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk2);
    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk3);
    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk3);
}

inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                  __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g1);
    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g1);
    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b0);
    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b0);
    __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_b1);
    __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_b1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk5);
    __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk5);

    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk5);
    __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk5);

    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk3);
    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk3);
    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk4);
    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk4);
    __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk5);
    __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk5);

    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk3);
    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk3);
    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk4);
    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk4);
    v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk5);
    v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk5);
}

inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                  __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_b0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_b0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b1);
    __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_a0);
    __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_a0);
    __m128i layer1_chunk6 = _mm_unpacklo_epi8(v_g1, v_a1);
    __m128i layer1_chunk7 = _mm_unpackhi_epi8(v_g1, v_a1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk6 = _mm_unpacklo_epi8(layer1_chunk3, layer1_chunk7);
    __m128i layer2_chunk7 = _mm_unpackhi_epi8(layer1_chunk3, layer1_chunk7);

    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk6 = _mm_unpacklo_epi8(layer2_chunk3, layer2_chunk7);
    __m128i layer3_chunk7 = _mm_unpackhi_epi8(layer2_chunk3, layer2_chunk7);

    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk4);
    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk4);
    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk5);
    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk5);
    __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk6);
    __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk6);
    __m128i layer4_chunk6 = _mm_unpacklo_epi8(layer3_chunk3, layer3_chunk7);
    __m128i layer4_chunk7 = _mm_unpackhi_epi8(layer3_chunk3, layer3_chunk7);

    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk4);
    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk4);
    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk5);
    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk5);
    v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk6);
    v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk6);
    v_a0 = _mm_unpacklo_epi8(layer4_chunk3, layer4_chunk7);
    v_a1 = _mm_unpackhi_epi8(layer4_chunk3, layer4_chunk7);
}

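// The _mm_interleave_epi8 helpers run the unpack butterfly above in reverse: each
// step splits a register pair into its even bytes (mask) and odd bytes (shift by 8)
// and packs them back to 8-bit lanes with _mm_packus_epi16, which is the exact
// inverse of an unpacklo/unpackhi pair, so planar input comes out interleaved.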
inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i v_mask = _mm_set1_epi16(0x00ff);

    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));

    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));

    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));

    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));

    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
}

inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i v_mask = _mm_set1_epi16(0x00ff);

    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));

    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));

    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));

    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));

    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
    v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
}

inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i v_mask = _mm_set1_epi16(0x00ff);

    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer4_chunk6 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));
    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
    __m128i layer4_chunk7 = _mm_packus_epi16(_mm_srli_epi16(v_a0, 8), _mm_srli_epi16(v_a1, 8));

    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
    __m128i layer3_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));
    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_and_si128(layer4_chunk6, v_mask), _mm_and_si128(layer4_chunk7, v_mask));
    __m128i layer3_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk6, 8), _mm_srli_epi16(layer4_chunk7, 8));

    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));
    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
    __m128i layer2_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk6, 8), _mm_srli_epi16(layer3_chunk7, 8));

    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));
    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
    __m128i layer1_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk6, 8), _mm_srli_epi16(layer2_chunk7, 8));

    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
    v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_a0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
    v_g1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
    v_a1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk6, 8), _mm_srli_epi16(layer1_chunk7, 8));
}

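// 16-bit variants of the deinterleave helpers: the same unpack butterfly as the
// epi8 versions above, but on 16-bit lanes (8 elements per register), so one fewer
// unpack layer is needed.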
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_g1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_g1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk2);
    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk2);
    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk3);
    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk3);

    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk3);
    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk3);

    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk2);
    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk2);
    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk3);
    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk3);
}

inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                   __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g1);
    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g1);
    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b0);
    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b0);
    __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_b1);
    __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_b1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk5);
    __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk5);

    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk5);
    __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk5);

    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk3);
    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk3);
    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk4);
    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk4);
    v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk5);
    v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk5);
}

inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                   __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_b0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_b0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b1);
    __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_a0);
    __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_a0);
    __m128i layer1_chunk6 = _mm_unpacklo_epi16(v_g1, v_a1);
    __m128i layer1_chunk7 = _mm_unpackhi_epi16(v_g1, v_a1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk6 = _mm_unpacklo_epi16(layer1_chunk3, layer1_chunk7);
    __m128i layer2_chunk7 = _mm_unpackhi_epi16(layer1_chunk3, layer1_chunk7);

    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk6 = _mm_unpacklo_epi16(layer2_chunk3, layer2_chunk7);
    __m128i layer3_chunk7 = _mm_unpackhi_epi16(layer2_chunk3, layer2_chunk7);

    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk4);
    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk4);
    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk5);
    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk5);
    v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk6);
    v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk6);
    v_a0 = _mm_unpacklo_epi16(layer3_chunk3, layer3_chunk7);
    v_a1 = _mm_unpackhi_epi16(layer3_chunk3, layer3_chunk7);
}

#if CV_SSE4_1

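// 16-bit interleave helpers. They mirror _mm_interleave_epi8, but packing 32-bit
// lanes back to 16 bits needs _mm_packus_epi32, which is an SSE4.1 instruction,
// hence the extra guard.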
inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i v_mask = _mm_set1_epi32(0x0000ffff);

    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));

    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));

    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));

    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
}

inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                 __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i v_mask = _mm_set1_epi32(0x0000ffff);

    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));

    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));

    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));

    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
    v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
}

inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                 __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i v_mask = _mm_set1_epi32(0x0000ffff);

    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer3_chunk6 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));
    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
    __m128i layer3_chunk7 = _mm_packus_epi32(_mm_srli_epi32(v_a0, 16), _mm_srli_epi32(v_a1, 16));

    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));
    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
    __m128i layer2_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk6, 16), _mm_srli_epi32(layer3_chunk7, 16));

    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));
    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
    __m128i layer1_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk6, 16), _mm_srli_epi32(layer2_chunk7, 16));

    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
    v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_a0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
    v_g1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
    v_a1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk6, 16), _mm_srli_epi32(layer1_chunk7, 16));
}

#endif // CV_SSE4_1

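// Single-precision float variants: the same deinterleave butterfly built from
// _mm_unpacklo_ps/_mm_unpackhi_ps, with 4 floats per register.
//
// A minimal usage sketch (hypothetical caller code, not part of this header),
// assuming ptr points to 8 interleaved (x, y) float pairs:
//
//   __m128 v_x0 = _mm_loadu_ps(ptr);      // x0 y0 x1 y1
//   __m128 v_x1 = _mm_loadu_ps(ptr + 4);  // x2 y2 x3 y3
//   __m128 v_y0 = _mm_loadu_ps(ptr + 8);  // x4 y4 x5 y5
//   __m128 v_y1 = _mm_loadu_ps(ptr + 12); // x6 y6 x7 y7
//   _mm_deinterleave_ps(v_x0, v_x1, v_y0, v_y1);
//   // v_x0 = x0..x3, v_x1 = x4..x7, v_y0 = y0..y3, v_y1 = y4..y7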
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
{
    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g0);
    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g0);
    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_g1);
    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_g1);

    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk2);
    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk2);
    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk3);
    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk3);

    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk2);
    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk2);
    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk3);
    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk3);
}

inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
                                __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
{
    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g1);
    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g1);
    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b0);
    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b0);
    __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_b1);
    __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_b1);

    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk3);
    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk3);
    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk4);
    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk4);
    __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk5);
    __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk5);

    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk3);
    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk3);
    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk4);
    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk4);
    v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk5);
    v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk5);
}

inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
                                __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
{
    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_b0);
    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_b0);
    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b1);
    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b1);
    __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_a0);
    __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_a0);
    __m128 layer1_chunk6 = _mm_unpacklo_ps(v_g1, v_a1);
    __m128 layer1_chunk7 = _mm_unpackhi_ps(v_g1, v_a1);

    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk4);
    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk4);
    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk5);
    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk5);
    __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk6);
    __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk6);
    __m128 layer2_chunk6 = _mm_unpacklo_ps(layer1_chunk3, layer1_chunk7);
    __m128 layer2_chunk7 = _mm_unpackhi_ps(layer1_chunk3, layer1_chunk7);

    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk4);
    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk4);
    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk5);
    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk5);
    v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk6);
    v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk6);
    v_a0 = _mm_unpacklo_ps(layer2_chunk3, layer2_chunk7);
    v_a1 = _mm_unpackhi_ps(layer2_chunk3, layer2_chunk7);
}

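// Float interleave helpers: instead of mask/shift/pack, each reverse step selects
// the even elements (mask_lo) and the odd elements (mask_hi) of a register pair
// with _mm_shuffle_ps.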
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
{
    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
    __m128 layer2_chunk2 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
    __m128 layer2_chunk3 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);

    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);

    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
    v_g0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
    v_g1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
}

inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
                              __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
{
    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
    __m128 layer2_chunk3 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
    __m128 layer2_chunk4 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
    __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
    __m128 layer2_chunk5 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);

    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
    __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
    __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);

    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
    v_g1 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
    v_b0 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
    v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
    v_b1 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
}

inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
                              __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
{
    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
    __m128 layer2_chunk4 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
    __m128 layer2_chunk5 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
    __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
    __m128 layer2_chunk6 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);
    __m128 layer2_chunk3 = _mm_shuffle_ps(v_a0, v_a1, mask_lo);
    __m128 layer2_chunk7 = _mm_shuffle_ps(v_a0, v_a1, mask_hi);

    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
    __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
    __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
    __m128 layer1_chunk6 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);
    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_lo);
    __m128 layer1_chunk7 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_hi);

    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
    v_b0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
    v_b1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
    v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
    v_a0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
    v_g1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_lo);
    v_a1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_hi);
}

#endif // CV_SSE2

#endif //__OPENCV_CORE_SSE_UTILS_HPP__