/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_CORE_SSE_UTILS_HPP__
#define __OPENCV_CORE_SSE_UTILS_HPP__

#ifndef __cplusplus
#  error sse_utils.hpp header must be compiled as C++
#endif

#include "opencv2/core/cvdef.h"

#if CV_SSE2
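
// Deinterleaves 32 (r, g) byte pairs into planar form. On input the four
// registers are expected to hold 64 consecutive interleaved bytes
// (r0 g0 r1 g1 ...), loaded in the order v_r0, v_r1, v_g0, v_g1; on output
// v_r0:v_r1 hold the 32 r bytes and v_g0:v_g1 the 32 g bytes. A minimal usage
// sketch (`ptr`, a uchar pointer with 64 valid bytes, is assumed here purely
// for illustration):
//
//     __m128i v_r0 = _mm_loadu_si128((const __m128i *)(ptr));
//     __m128i v_r1 = _mm_loadu_si128((const __m128i *)(ptr + 16));
//     __m128i v_g0 = _mm_loadu_si128((const __m128i *)(ptr + 32));
//     __m128i v_g1 = _mm_loadu_si128((const __m128i *)(ptr + 48));
//     _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1);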
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_g1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_g1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk2);
    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk2);
    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk3);
    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk3);

    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk3);
    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk3);

    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk2);
    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk2);
    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk3);
    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk3);

    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk2);
    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk2);
    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk3);
    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk3);
}
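
// 3-channel variant: the six registers are expected to hold 96 consecutive
// bytes of interleaved (r, g, b) triples, loaded in the order v_r0, v_r1,
// v_g0, v_g1, v_b0, v_b1; on output each register pair holds one planar channel.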
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                  __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g1);
    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g1);
    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b0);
    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b0);
    __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_b1);
    __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_b1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk5);
    __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk5);

    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk5);
    __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk5);

    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk3);
    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk3);
    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk4);
    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk4);
    __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk5);
    __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk5);

    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk3);
    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk3);
    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk4);
    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk4);
    v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk5);
    v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk5);
}
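
// 4-channel variant: the eight registers are expected to hold 128 consecutive
// bytes of interleaved (r, g, b, a) quadruples; on output each register pair
// holds one planar channel.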
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                  __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_b0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_b0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b1);
    __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_a0);
    __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_a0);
    __m128i layer1_chunk6 = _mm_unpacklo_epi8(v_g1, v_a1);
    __m128i layer1_chunk7 = _mm_unpackhi_epi8(v_g1, v_a1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk6 = _mm_unpacklo_epi8(layer1_chunk3, layer1_chunk7);
    __m128i layer2_chunk7 = _mm_unpackhi_epi8(layer1_chunk3, layer1_chunk7);

    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk6 = _mm_unpacklo_epi8(layer2_chunk3, layer2_chunk7);
    __m128i layer3_chunk7 = _mm_unpackhi_epi8(layer2_chunk3, layer2_chunk7);

    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk4);
    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk4);
    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk5);
    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk5);
    __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk6);
    __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk6);
    __m128i layer4_chunk6 = _mm_unpacklo_epi8(layer3_chunk3, layer3_chunk7);
    __m128i layer4_chunk7 = _mm_unpackhi_epi8(layer3_chunk3, layer3_chunk7);

    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk4);
    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk4);
    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk5);
    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk5);
    v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk6);
    v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk6);
    v_a0 = _mm_unpacklo_epi8(layer4_chunk3, layer4_chunk7);
    v_a1 = _mm_unpackhi_epi8(layer4_chunk3, layer4_chunk7);
}
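
// Inverse of the 2-channel deinterleave: turns planar r (v_r0:v_r1) and
// g (v_g0:v_g1) into interleaved (r, g) pairs, ready to be stored in the order
// v_r0, v_r1, v_g0, v_g1. Each layer splits even/odd lanes with a mask and
// _mm_packus_epi16, mirroring the unpack network above. A usage sketch
// (`dst`, a uchar pointer with room for 64 bytes, is assumed for illustration):
//
//     _mm_interleave_epi8(v_r0, v_r1, v_g0, v_g1);
//     _mm_storeu_si128((__m128i *)(dst),      v_r0);
//     _mm_storeu_si128((__m128i *)(dst + 16), v_r1);
//     _mm_storeu_si128((__m128i *)(dst + 32), v_g0);
//     _mm_storeu_si128((__m128i *)(dst + 48), v_g1);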
inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i v_mask = _mm_set1_epi16(0x00ff);

    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));

    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));

    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));

    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));

    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
}
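
// 3-channel interleave: converts planar r, g, b register pairs into 96 bytes
// of interleaved (r, g, b) triples across v_r0, v_r1, v_g0, v_g1, v_b0, v_b1.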
inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i v_mask = _mm_set1_epi16(0x00ff);

    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));

    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));

    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));

    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));

    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
    v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
}
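
// 4-channel interleave: converts planar r, g, b, a register pairs into 128
// bytes of interleaved (r, g, b, a) quadruples.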
inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i v_mask = _mm_set1_epi16(0x00ff);

    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer4_chunk6 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));
    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
    __m128i layer4_chunk7 = _mm_packus_epi16(_mm_srli_epi16(v_a0, 8), _mm_srli_epi16(v_a1, 8));

    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
    __m128i layer3_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));
    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_and_si128(layer4_chunk6, v_mask), _mm_and_si128(layer4_chunk7, v_mask));
    __m128i layer3_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk6, 8), _mm_srli_epi16(layer4_chunk7, 8));

    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));
    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
    __m128i layer2_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk6, 8), _mm_srli_epi16(layer3_chunk7, 8));

    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));
    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
    __m128i layer1_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk6, 8), _mm_srli_epi16(layer2_chunk7, 8));

    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
    v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_a0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
    v_g1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
    v_a1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk6, 8), _mm_srli_epi16(layer1_chunk7, 8));
}
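
// 16-bit analog of the 8-bit deinterleave: the four registers hold 32
// consecutive interleaved ushorts (16 (r, g) pairs); output is planar. One
// unpack layer fewer is needed since each register holds 8 elements, not 16.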
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_g1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_g1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk2);
    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk2);
    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk3);
    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk3);

    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk3);
    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk3);

    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk2);
    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk2);
    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk3);
    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk3);
}
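
// 3-channel 16-bit deinterleave: six registers holding 48 consecutive
// interleaved ushorts ((r, g, b) triples) become three planar register pairs.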
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                   __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g1);
    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g1);
    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b0);
    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b0);
    __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_b1);
    __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_b1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk5);
    __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk5);

    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk5);
    __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk5);

    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk3);
    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk3);
    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk4);
    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk4);
    v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk5);
    v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk5);
}
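
// 4-channel 16-bit deinterleave: eight registers holding 64 consecutive
// interleaved ushorts ((r, g, b, a) quadruples) become four planar register
// pairs.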
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                   __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_b0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_b0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b1);
    __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_a0);
    __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_a0);
    __m128i layer1_chunk6 = _mm_unpacklo_epi16(v_g1, v_a1);
    __m128i layer1_chunk7 = _mm_unpackhi_epi16(v_g1, v_a1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk6 = _mm_unpacklo_epi16(layer1_chunk3, layer1_chunk7);
    __m128i layer2_chunk7 = _mm_unpackhi_epi16(layer1_chunk3, layer1_chunk7);

    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk6 = _mm_unpacklo_epi16(layer2_chunk3, layer2_chunk7);
    __m128i layer3_chunk7 = _mm_unpackhi_epi16(layer2_chunk3, layer2_chunk7);

    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk4);
    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk4);
    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk5);
    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk5);
    v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk6);
    v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk6);
    v_a0 = _mm_unpacklo_epi16(layer3_chunk3, layer3_chunk7);
    v_a1 = _mm_unpackhi_epi16(layer3_chunk3, layer3_chunk7);
}

#if CV_SSE4_1
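
// The 16-bit interleave relies on _mm_packus_epi32, which is SSE4.1-only,
// hence the guard above (the unpack-based deinterleave needs only SSE2).
// Converts planar r and g register pairs into 32 interleaved ushorts, to be
// stored in the order v_r0, v_r1, v_g0, v_g1.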
inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i v_mask = _mm_set1_epi32(0x0000ffff);

    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));

    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));

    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));

    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
}
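
// 3-channel 16-bit interleave: planar r, g, b register pairs become 48
// interleaved ushorts across the six registers.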
inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                 __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i v_mask = _mm_set1_epi32(0x0000ffff);

    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));

    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));

    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));

    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
    v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
}
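
// 4-channel 16-bit interleave: planar r, g, b, a register pairs become 64
// interleaved ushorts across the eight registers.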
inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                 __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i v_mask = _mm_set1_epi32(0x0000ffff);

    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer3_chunk6 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));
    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
    __m128i layer3_chunk7 = _mm_packus_epi32(_mm_srli_epi32(v_a0, 16), _mm_srli_epi32(v_a1, 16));

    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));
    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
    __m128i layer2_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk6, 16), _mm_srli_epi32(layer3_chunk7, 16));

    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));
    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
    __m128i layer1_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk6, 16), _mm_srli_epi32(layer2_chunk7, 16));

    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
    v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_a0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
    v_g1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
    v_a1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk6, 16), _mm_srli_epi32(layer1_chunk7, 16));
}

#endif // CV_SSE4_1
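
// Float deinterleave: the four registers hold 16 consecutive interleaved
// floats (8 (r, g) pairs); output is planar. A minimal usage sketch (`ptr`,
// a float pointer with 16 valid elements, is assumed here for illustration):
//
//     __m128 v_r0 = _mm_loadu_ps(ptr);
//     __m128 v_r1 = _mm_loadu_ps(ptr + 4);
//     __m128 v_g0 = _mm_loadu_ps(ptr + 8);
//     __m128 v_g1 = _mm_loadu_ps(ptr + 12);
//     _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1);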
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
{
    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g0);
    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g0);
    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_g1);
    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_g1);

    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk2);
    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk2);
    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk3);
    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk3);

    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk2);
    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk2);
    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk3);
    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk3);
}
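
// 3-channel float deinterleave: six registers holding 24 consecutive
// interleaved floats ((r, g, b) triples) become three planar register pairs.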
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
                                __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
{
    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g1);
    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g1);
    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b0);
    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b0);
    __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_b1);
    __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_b1);

    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk3);
    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk3);
    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk4);
    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk4);
    __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk5);
    __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk5);

    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk3);
    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk3);
    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk4);
    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk4);
    v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk5);
    v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk5);
}
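
// 4-channel float deinterleave: eight registers holding 32 consecutive
// interleaved floats ((r, g, b, a) quadruples) become four planar register
// pairs.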
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
                                __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
{
    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_b0);
    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_b0);
    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b1);
    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b1);
    __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_a0);
    __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_a0);
    __m128 layer1_chunk6 = _mm_unpacklo_ps(v_g1, v_a1);
    __m128 layer1_chunk7 = _mm_unpackhi_ps(v_g1, v_a1);

    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk4);
    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk4);
    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk5);
    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk5);
    __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk6);
    __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk6);
    __m128 layer2_chunk6 = _mm_unpacklo_ps(layer1_chunk3, layer1_chunk7);
    __m128 layer2_chunk7 = _mm_unpackhi_ps(layer1_chunk3, layer1_chunk7);

    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk4);
    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk4);
    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk5);
    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk5);
    v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk6);
    v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk6);
    v_a0 = _mm_unpacklo_ps(layer2_chunk3, layer2_chunk7);
    v_a1 = _mm_unpackhi_ps(layer2_chunk3, layer2_chunk7);
}
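
// Float interleave: gathers even/odd lanes with _mm_shuffle_ps (mask_lo picks
// elements 0 and 2 of each operand, mask_hi elements 1 and 3) instead of the
// mask-and-pack trick used for the integer versions. Planar r and g register
// pairs become 16 interleaved floats, to be stored in the order v_r0, v_r1,
// v_g0, v_g1.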
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
{
    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
    __m128 layer2_chunk2 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
    __m128 layer2_chunk3 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);

    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);

    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
    v_g0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
    v_g1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
}
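
// 3-channel float interleave: planar r, g, b register pairs become 24
// interleaved floats across the six registers.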
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
                              __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
{
    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
    __m128 layer2_chunk3 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
    __m128 layer2_chunk4 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
    __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
    __m128 layer2_chunk5 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);

    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
    __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
    __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);

    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
    v_g1 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
    v_b0 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
    v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
    v_b1 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
}
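
// 4-channel float interleave: planar r, g, b, a register pairs become 32
// interleaved floats across the eight registers.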
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
                              __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
{
    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
    __m128 layer2_chunk4 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
    __m128 layer2_chunk5 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
    __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
    __m128 layer2_chunk6 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);
    __m128 layer2_chunk3 = _mm_shuffle_ps(v_a0, v_a1, mask_lo);
    __m128 layer2_chunk7 = _mm_shuffle_ps(v_a0, v_a1, mask_hi);

    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
    __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
    __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
    __m128 layer1_chunk6 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);
    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_lo);
    __m128 layer1_chunk7 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_hi);

    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
    v_b0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
    v_b1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
    v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
    v_a0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
    v_g1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_lo);
    v_a1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_hi);
}

#endif // CV_SSE2

#endif //__OPENCV_CORE_SSE_UTILS_HPP__