/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_HAL_SSE_HPP__
#define __OPENCV_HAL_SSE_HPP__

#define CV_SIMD128 1
#define CV_SIMD128_64F 1

namespace cv
{

struct v_uint8x16
{
    typedef uchar lane_type;
    enum { nlanes = 16 };

    v_uint8x16() {}
    explicit v_uint8x16(__m128i v) : val(v) {}
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    {
        val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
                            (char)v4, (char)v5, (char)v6, (char)v7,
                            (char)v8, (char)v9, (char)v10, (char)v11,
                            (char)v12, (char)v13, (char)v14, (char)v15);
    }
    uchar get0() const
    {
        return (uchar)_mm_cvtsi128_si32(val);
    }

    __m128i val;
};

struct v_int8x16
{
    typedef schar lane_type;
    enum { nlanes = 16 };

    v_int8x16() {}
    explicit v_int8x16(__m128i v) : val(v) {}
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    {
        val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
                            (char)v4, (char)v5, (char)v6, (char)v7,
                            (char)v8, (char)v9, (char)v10, (char)v11,
                            (char)v12, (char)v13, (char)v14, (char)v15);
    }
    schar get0() const
    {
        return (schar)_mm_cvtsi128_si32(val);
    }

    __m128i val;
};

struct v_uint16x8
{
    typedef ushort lane_type;
    enum { nlanes = 8 };

    v_uint16x8() {}
    explicit v_uint16x8(__m128i v) : val(v) {}
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    {
        val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
                             (short)v4, (short)v5, (short)v6, (short)v7);
    }
    ushort get0() const
    {
        return (ushort)_mm_cvtsi128_si32(val);
    }

    __m128i val;
};

struct v_int16x8
{
    typedef short lane_type;
    enum { nlanes = 8 };

    v_int16x8() {}
    explicit v_int16x8(__m128i v) : val(v) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
                             (short)v4, (short)v5, (short)v6, (short)v7);
    }
    short get0() const
    {
        return (short)_mm_cvtsi128_si32(val);
    }
    __m128i val;
};

struct v_uint32x4
{
    typedef unsigned lane_type;
    enum { nlanes = 4 };

    v_uint32x4() {}
    explicit v_uint32x4(__m128i v) : val(v) {}
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    {
        val = _mm_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3);
    }
    unsigned get0() const
    {
        return (unsigned)_mm_cvtsi128_si32(val);
    }
    __m128i val;
};

struct v_int32x4
{
    typedef int lane_type;
    enum { nlanes = 4 };

    v_int32x4() {}
    explicit v_int32x4(__m128i v) : val(v) {}
    v_int32x4(int v0, int v1, int v2, int v3)
    {
        val = _mm_setr_epi32(v0, v1, v2, v3);
    }
    int get0() const
    {
        return _mm_cvtsi128_si32(val);
    }
    __m128i val;
};

struct v_float32x4
{
    typedef float lane_type;
    enum { nlanes = 4 };

    v_float32x4() {}
    explicit v_float32x4(__m128 v) : val(v) {}
    v_float32x4(float v0, float v1, float v2, float v3)
    {
        val = _mm_setr_ps(v0, v1, v2, v3);
    }
    float get0() const
    {
        return _mm_cvtss_f32(val);
    }
    __m128 val;
};

struct v_uint64x2
{
    typedef uint64 lane_type;
    enum { nlanes = 2 };

    v_uint64x2() {}
    explicit v_uint64x2(__m128i v) : val(v) {}
    v_uint64x2(uint64 v0, uint64 v1)
    {
        val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
    }
    uint64 get0() const
    {
        int a = _mm_cvtsi128_si32(val);
        int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
        return (unsigned)a | ((uint64)(unsigned)b << 32);
    }
    __m128i val;
};

struct v_int64x2
{
    typedef int64 lane_type;
    enum { nlanes = 2 };

    v_int64x2() {}
    explicit v_int64x2(__m128i v) : val(v) {}
    v_int64x2(int64 v0, int64 v1)
    {
        val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
    }
    int64 get0() const
    {
        int a = _mm_cvtsi128_si32(val);
        int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
        return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
    }
    __m128i val;
};

struct v_float64x2
{
    typedef double lane_type;
    enum { nlanes = 2 };

    v_float64x2() {}
    explicit v_float64x2(__m128d v) : val(v) {}
    v_float64x2(double v0, double v1)
    {
        val = _mm_setr_pd(v0, v1);
    }
    double get0() const
    {
        return _mm_cvtsd_f64(val);
    }
    __m128d val;
};

#define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \
inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
{ return _Tpvec(cast(a.val)); }

OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, char, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, char, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint16x8, ushort, u16, si128, epi16, short, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int16x8, short, s16, si128, epi16, short, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint32x4, unsigned, u32, si128, epi32, int, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int32x4, int, s32, si128, epi32, int, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_float32x4, float, f32, ps, ps, float, _mm_castsi128_ps)
OPENCV_HAL_IMPL_SSE_INITVEC(v_float64x2, double, f64, pd, pd, double, _mm_castsi128_pd)

inline v_uint64x2 v_setzero_u64() { return v_uint64x2(_mm_setzero_si128()); }
inline v_int64x2 v_setzero_s64() { return v_int64x2(_mm_setzero_si128()); }
inline v_uint64x2 v_setall_u64(uint64 val) { return v_uint64x2(val, val); }
inline v_int64x2 v_setall_s64(int64 val) { return v_int64x2(val, val); }

template<typename _Tpvec> inline
v_uint64x2 v_reinterpret_as_u64(const _Tpvec& a) { return v_uint64x2(a.val); }
template<typename _Tpvec> inline
v_int64x2 v_reinterpret_as_s64(const _Tpvec& a) { return v_int64x2(a.val); }
inline v_float32x4 v_reinterpret_as_f32(const v_uint64x2& a)
{ return v_float32x4(_mm_castsi128_ps(a.val)); }
inline v_float32x4 v_reinterpret_as_f32(const v_int64x2& a)
{ return v_float32x4(_mm_castsi128_ps(a.val)); }
inline v_float64x2 v_reinterpret_as_f64(const v_uint64x2& a)
{ return v_float64x2(_mm_castsi128_pd(a.val)); }
inline v_float64x2 v_reinterpret_as_f64(const v_int64x2& a)
{ return v_float64x2(_mm_castsi128_pd(a.val)); }

#define OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(_Tpvec, suffix) \
inline _Tpvec v_reinterpret_as_##suffix(const v_float32x4& a) \
{ return _Tpvec(_mm_castps_si128(a.val)); } \
inline _Tpvec v_reinterpret_as_##suffix(const v_float64x2& a) \
{ return _Tpvec(_mm_castpd_si128(a.val)); }

OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint8x16, u8)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int8x16, s8)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint16x8, u16)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int16x8, s16)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint32x4, u32)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int32x4, s32)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint64x2, u64)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int64x2, s64)
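
// Illustrative note (not part of the original header): the initializers and reinterpret
// casts above are typically combined to build constants and bit masks, e.g. an
// absolute-value mask for 32-bit floats:
//   v_float32x4 absmask = v_reinterpret_as_f32(v_setall_s32(0x7fffffff));
// Every lane of absmask.val then holds 0x7fffffff; AND-ing a float vector with it clears
// the sign bits without touching the magnitude.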

//////////////// PACK ///////////////
inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
{
    __m128i delta = _mm_set1_epi16(255);
    return v_uint8x16(_mm_packus_epi16(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta)),
                                       _mm_subs_epu16(b.val, _mm_subs_epu16(b.val, delta))));
}
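
// A note on the clamping trick above: SSE2 has no unsigned 16-bit minimum, so
// _mm_subs_epu16(x, _mm_subs_epu16(x, 255)) computes x - max(x - 255, 0) == min(x, 255)
// with saturating arithmetic, after which _mm_packus_epi16 is safe. For example, a lane
// holding 300 becomes 300 - (300 - 255) = 255, while 100 stays 100.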

inline void v_pack_store(uchar* ptr, const v_uint16x8& a)
{
    __m128i delta = _mm_set1_epi16(255);
    __m128i a1 = _mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta));
    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
}

inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b)
{ return v_uint8x16(_mm_packus_epi16(a.val, b.val)); }

inline void v_pack_u_store(uchar* ptr, const v_int16x8& a)
{ _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a.val, a.val)); }

template<int n> inline
v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
{
    // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    return v_uint8x16(_mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(a.val, delta), n),
                                       _mm_srli_epi16(_mm_adds_epu16(b.val, delta), n)));
}

template<int n> inline
void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
{
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    __m128i a1 = _mm_srli_epi16(_mm_adds_epu16(a.val, delta), n);
    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
}

template<int n> inline
v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
{
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    return v_uint8x16(_mm_packus_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
                                       _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
}

template<int n> inline
void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
{
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
}

inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)
{ return v_int8x16(_mm_packs_epi16(a.val, b.val)); }

inline void v_pack_store(schar* ptr, const v_int16x8& a)
{ _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); }

template<int n> inline
v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
{
    // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    return v_int8x16(_mm_packs_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
                                     _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
}
template<int n> inline
void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
{
    // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a1, a1));
}


// bit-wise "mask ? a : b"
inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b)
{
    return _mm_xor_si128(b, _mm_and_si128(_mm_xor_si128(a, b), mask));
}
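
// Illustrative sketch (assumed usage, not part of the API): with a per-lane mask of
// all-ones or all-zeros, b ^ ((a ^ b) & mask) evaluates to a where the mask is set and
// to b elsewhere, e.g.
//   __m128i m  = _mm_cmpgt_epi32(x, y);     // all-ones where x > y
//   __m128i mx = v_select_si128(m, x, y);   // per-lane maximum of x and y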

inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
{
    __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32);
    __m128i b1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, b.val), maxval32, b.val), delta32);
    __m128i r = _mm_packs_epi32(a1, b1);
    return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
}

inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
{
    __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32);
    __m128i r = _mm_packs_epi32(a1, a1);
    _mm_storel_epi64((__m128i*)ptr, _mm_sub_epi16(r, _mm_set1_epi16(-32768)));
}

template<int n> inline
v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
    __m128i b1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(b.val, delta), n), delta32);
    return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(a1, b1), _mm_set1_epi16(-32768)));
}

template<int n> inline
void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
    __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
    _mm_storel_epi64((__m128i*)ptr, a2);
}

inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
{
    __m128i delta32 = _mm_set1_epi32(32768);
    __m128i r = _mm_packs_epi32(_mm_sub_epi32(a.val, delta32), _mm_sub_epi32(b.val, delta32));
    return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
}

inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
{
    __m128i delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(a.val, delta32);
    __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
    _mm_storel_epi64((__m128i*)ptr, r);
}

template<int n> inline
void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
    __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
    _mm_storel_epi64((__m128i*)ptr, a2);
}

inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b)
{ return v_int16x8(_mm_packs_epi32(a.val, b.val)); }

inline void v_pack_store(short* ptr, const v_int32x4& a)
{
    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a.val, a.val));
}

template<int n> inline
v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1));
    return v_int16x8(_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n),
                                     _mm_srai_epi32(_mm_add_epi32(b.val, delta), n)));
}

template<int n> inline
void v_rshr_pack_store(short* ptr, const v_int32x4& a)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1));
    __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n);
    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1));
}


// [a0 0 | b0 0]  [a1 0 | b1 0]
inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
{
    __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0.lo b0.lo a0.hi b0.hi
    __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // a1.lo b1.lo a1.hi b1.hi
    return v_uint32x4(_mm_unpacklo_epi32(v0, v1)); // a0 a1 b0 b1
}

inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
{
    __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
    _mm_storel_epi64((__m128i*)ptr, a1);
}

// [a0 0 | b0 0]  [a1 0 | b1 0]
inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
{
    __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0.lo b0.lo a0.hi b0.hi
    __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // a1.lo b1.lo a1.hi b1.hi
    return v_int32x4(_mm_unpacklo_epi32(v0, v1)); // a0 a1 b0 b1
}

inline void v_pack_store(int* ptr, const v_int64x2& a)
{
    __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
    _mm_storel_epi64((__m128i*)ptr, a1);
}

template<int n> inline
v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
{
    uint64 delta = (uint64)1 << (n-1);
    v_uint64x2 delta2(delta, delta);
    __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
    __m128i b1 = _mm_srli_epi64(_mm_add_epi64(b.val, delta2.val), n);
    __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0.lo b0.lo a0.hi b0.hi
    __m128i v1 = _mm_unpackhi_epi32(a1, b1); // a1.lo b1.lo a1.hi b1.hi
    return v_uint32x4(_mm_unpacklo_epi32(v0, v1)); // a0 a1 b0 b1
}

template<int n> inline
void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
{
    uint64 delta = (uint64)1 << (n-1);
    v_uint64x2 delta2(delta, delta);
    __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
    __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
    _mm_storel_epi64((__m128i*)ptr, a2);
}

inline __m128i v_sign_epi64(__m128i a)
{
    return _mm_shuffle_epi32(_mm_srai_epi32(a, 31), _MM_SHUFFLE(3, 3, 1, 1)); // x m0 | x m1
}

inline __m128i v_srai_epi64(__m128i a, int imm)
{
    __m128i smask = v_sign_epi64(a);
    return _mm_xor_si128(_mm_srli_epi64(_mm_xor_si128(a, smask), imm), smask);
}
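
// Note on the emulation above: SSE2 has no 64-bit arithmetic shift, so it relies on the
// identity arithmetic_shift(x, n) == ((x ^ s) >> n) ^ s with s the full sign mask of x;
// flipping a negative value, shifting in zeros and flipping back effectively shifts in
// copies of the sign bit. For example, x = -2, n = 1: s = ~0, x ^ s = 1, 1 >> 1 = 0,
// and 0 ^ s = -1, which is the expected -2 >> 1.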

template<int n> inline
v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
{
    int64 delta = (int64)1 << (n-1);
    v_int64x2 delta2(delta, delta);
    __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
    __m128i b1 = v_srai_epi64(_mm_add_epi64(b.val, delta2.val), n);
    __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0.lo b0.lo a0.hi b0.hi
    __m128i v1 = _mm_unpackhi_epi32(a1, b1); // a1.lo b1.lo a1.hi b1.hi
    return v_int32x4(_mm_unpacklo_epi32(v0, v1)); // a0 a1 b0 b1
}

template<int n> inline
void v_rshr_pack_store(int* ptr, const v_int64x2& a)
{
    int64 delta = (int64)1 << (n-1);
    v_int64x2 delta2(delta, delta);
    __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
    __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
    _mm_storel_epi64((__m128i*)ptr, a2);
}

inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
    __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
    __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);
    __m128 v3 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(3, 3, 3, 3)), m3.val);

    return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, v3)));
}
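
// Illustrative note (not part of the original header): v_matmul computes
//   result = v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3,
// i.e. m0..m3 act as the columns of a 4x4 matrix and v holds the coefficients; each
// _mm_shuffle_ps above broadcasts one lane of v before the per-column multiply, and the
// final adds accumulate the four partial products.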


#define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
    { \
        return _Tpvec(intrin(a.val, b.val)); \
    } \
    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
    { \
        a.val = intrin(a.val, b.val); \
        return a; \
    }

OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint16x8, _mm_mullo_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int16x8, _mm_mullo_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint64x2, _mm_add_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64)

inline v_uint32x4 operator * (const v_uint32x4& a, const v_uint32x4& b)
{
    __m128i c0 = _mm_mul_epu32(a.val, b.val);
    __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
    __m128i d0 = _mm_unpacklo_epi32(c0, c1);
    __m128i d1 = _mm_unpackhi_epi32(c0, c1);
    return v_uint32x4(_mm_unpacklo_epi64(d0, d1));
}
inline v_int32x4 operator * (const v_int32x4& a, const v_int32x4& b)
{
    __m128i c0 = _mm_mul_epu32(a.val, b.val);
    __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
    __m128i d0 = _mm_unpacklo_epi32(c0, c1);
    __m128i d1 = _mm_unpackhi_epi32(c0, c1);
    return v_int32x4(_mm_unpacklo_epi64(d0, d1));
}
inline v_uint32x4& operator *= (v_uint32x4& a, const v_uint32x4& b)
{
    a = a * b;
    return a;
}
inline v_int32x4& operator *= (v_int32x4& a, const v_int32x4& b)
{
    a = a * b;
    return a;
}

inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
                         v_int32x4& c, v_int32x4& d)
{
    __m128i v0 = _mm_mullo_epi16(a.val, b.val);
    __m128i v1 = _mm_mulhi_epi16(a.val, b.val);
    c.val = _mm_unpacklo_epi16(v0, v1);
    d.val = _mm_unpackhi_epi16(v0, v1);
}

inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
                         v_uint32x4& c, v_uint32x4& d)
{
    __m128i v0 = _mm_mullo_epi16(a.val, b.val);
    __m128i v1 = _mm_mulhi_epu16(a.val, b.val);
    c.val = _mm_unpacklo_epi16(v0, v1);
    d.val = _mm_unpackhi_epi16(v0, v1);
}

inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
                         v_uint64x2& c, v_uint64x2& d)
{
    __m128i c0 = _mm_mul_epu32(a.val, b.val);
    __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
    c.val = _mm_unpacklo_epi64(c0, c1);
    d.val = _mm_unpackhi_epi64(c0, c1);
}

inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
    return v_int32x4(_mm_madd_epi16(a.val, b.val));
}
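
// _mm_madd_epi16 multiplies the 16-bit lanes and adds adjacent pairs, so the result lanes
// of v_dotprod are
//   (a0*b0 + a1*b1, a2*b2 + a3*b3, a4*b4 + a5*b5, a6*b6 + a7*b7),
// each sum stored as a 32-bit integer.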

#define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \
    OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
    OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \
    OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \
    inline _Tpvec operator ~ (const _Tpvec& a) \
    { \
        return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \
    }

OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint8x16, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int8x16, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint16x8, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int16x8, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint32x4, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int32x4, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint64x2, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int64x2, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float32x4, ps, _mm_castsi128_ps(_mm_set1_epi32(-1)))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float64x2, pd, _mm_castsi128_pd(_mm_set1_epi32(-1)))

inline v_float32x4 v_sqrt(const v_float32x4& x)
{ return v_float32x4(_mm_sqrt_ps(x.val)); }

inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    static const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f);
    __m128 t = x.val;
    __m128 h = _mm_mul_ps(t, _0_5);
    t = _mm_rsqrt_ps(t);
    t = _mm_mul_ps(t, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t, t), h)));
    return v_float32x4(t);
}
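
// The refinement above is one Newton-Raphson step on the ~12-bit _mm_rsqrt_ps estimate:
// with y ~ 1/sqrt(x), y' = y * (1.5 - 0.5 * x * y * y) roughly doubles the number of
// correct bits, which is why the constants 0.5f and 1.5f appear here.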

inline v_float64x2 v_sqrt(const v_float64x2& x)
{ return v_float64x2(_mm_sqrt_pd(x.val)); }

inline v_float64x2 v_invsqrt(const v_float64x2& x)
{
    static const __m128d v_1 = _mm_set1_pd(1.);
    return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val)));
}

inline v_float32x4 v_abs(const v_float32x4& x)
{ return v_float32x4(_mm_and_ps(x.val, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); }
inline v_float64x2 v_abs(const v_float64x2& x)
{
    return v_float64x2(_mm_and_pd(x.val,
        _mm_castsi128_pd(_mm_srli_epi64(_mm_set1_epi32(-1), 1))));
}

// TODO: exp, log, sin, cos

#define OPENCV_HAL_IMPL_SSE_BIN_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
}

OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_min, _mm_min_epu8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_max, _mm_max_epu8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_min, _mm_min_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_max, _mm_max_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_min, _mm_min_ps)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_max, _mm_max_ps)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_min, _mm_min_pd)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_max, _mm_max_pd)

inline v_int8x16 v_min(const v_int8x16& a, const v_int8x16& b)
{
    __m128i delta = _mm_set1_epi8((char)-128);
    return v_int8x16(_mm_xor_si128(delta, _mm_min_epu8(_mm_xor_si128(a.val, delta),
                                                       _mm_xor_si128(b.val, delta))));
}
inline v_int8x16 v_max(const v_int8x16& a, const v_int8x16& b)
{
    __m128i delta = _mm_set1_epi8((char)-128);
    return v_int8x16(_mm_xor_si128(delta, _mm_max_epu8(_mm_xor_si128(a.val, delta),
                                                       _mm_xor_si128(b.val, delta))));
}
inline v_uint16x8 v_min(const v_uint16x8& a, const v_uint16x8& b)
{
    return v_uint16x8(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, b.val)));
}
inline v_uint16x8 v_max(const v_uint16x8& a, const v_uint16x8& b)
{
    return v_uint16x8(_mm_adds_epu16(_mm_subs_epu16(a.val, b.val), b.val));
}
inline v_uint32x4 v_min(const v_uint32x4& a, const v_uint32x4& b)
{
    __m128i delta = _mm_set1_epi32((int)0x80000000);
    __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
    return v_uint32x4(v_select_si128(mask, b.val, a.val));
}
inline v_uint32x4 v_max(const v_uint32x4& a, const v_uint32x4& b)
{
    __m128i delta = _mm_set1_epi32((int)0x80000000);
    __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
    return v_uint32x4(v_select_si128(mask, a.val, b.val));
}
inline v_int32x4 v_min(const v_int32x4& a, const v_int32x4& b)
{
    return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), b.val, a.val));
}
inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b)
{
    return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), a.val, b.val));
}
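
// Illustrative note: SSE2 only provides signed comparisons, so v_min/v_max for unsigned
// 32-bit lanes XOR both operands with 0x80000000 before _mm_cmpgt_epi32; this maps
// unsigned order onto signed order (e.g. 0xFFFFFFFFu becomes 0x7FFFFFFF, the largest
// signed value, while 1u becomes a large negative value). The signed 8-bit versions use
// the same bias in the other direction (XOR with 0x80) so they can reuse the unsigned
// _mm_min_epu8/_mm_max_epu8 instructions.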

#define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \
inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
{ return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i not_mask = _mm_set1_epi32(-1); \
    return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
} \
inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
{ return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    __m128i not_mask = _mm_set1_epi32(-1); \
    return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
} \
inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i smask = _mm_set1_##suffix(sbit); \
    return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \
} \
inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i smask = _mm_set1_##suffix(sbit); \
    return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \
} \
inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i smask = _mm_set1_##suffix(sbit); \
    __m128i not_mask = _mm_set1_epi32(-1); \
    __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \
    return _Tpuvec(_mm_xor_si128(res, not_mask)); \
} \
inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i smask = _mm_set1_##suffix(sbit); \
    __m128i not_mask = _mm_set1_epi32(-1); \
    __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \
    return _Tpuvec(_mm_xor_si128(res, not_mask)); \
} \
inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \
} \
inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \
} \
inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    __m128i not_mask = _mm_set1_epi32(-1); \
    return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \
} \
inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    __m128i not_mask = _mm_set1_epi32(-1); \
    return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \
}

OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint8x16, v_int8x16, epi8, (char)-128)
OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768)
OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000)

#define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); }

OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps)
OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd)

OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_add_wrap, _mm_add_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_sub_wrap, _mm_sub_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16)

#define OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(_Tpuvec, _Tpsvec, bits, smask32) \
inline _Tpuvec v_absdiff(const _Tpuvec& a, const _Tpuvec& b) \
{ \
    return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a.val, b.val), _mm_subs_epu##bits(b.val, a.val))); \
} \
inline _Tpuvec v_absdiff(const _Tpsvec& a, const _Tpsvec& b) \
{ \
    __m128i smask = _mm_set1_epi32(smask32); \
    __m128i a1 = _mm_xor_si128(a.val, smask); \
    __m128i b1 = _mm_xor_si128(b.val, smask); \
    return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a1, b1), _mm_subs_epu##bits(b1, a1))); \
}

OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (int)0x80808080)
OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (int)0x80008000)

#define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \
inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpreg absmask = _mm_castsi128_##suffix(absmask_vec); \
    return _Tpvec(_mm_and_##suffix(_mm_sub_##suffix(a.val, b.val), absmask)); \
} \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \
    return _Tpvec(_mm_sqrt_##suffix(res)); \
} \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \
    return _Tpvec(res); \
} \
inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
{ \
    return _Tpvec(_mm_add_##suffix(_mm_mul_##suffix(a.val, b.val), c.val)); \
}

OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff))
OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1))

#define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
{ \
    return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
} \
inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
{ \
    return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
} \
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
{ \
    return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
} \
inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
{ \
    return _Tpsvec(srai(a.val, imm)); \
} \
template<int imm> \
inline _Tpuvec v_shl(const _Tpuvec& a) \
{ \
    return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
} \
template<int imm> \
inline _Tpsvec v_shl(const _Tpsvec& a) \
{ \
    return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
} \
template<int imm> \
inline _Tpuvec v_shr(const _Tpuvec& a) \
{ \
    return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
} \
template<int imm> \
inline _Tpsvec v_shr(const _Tpsvec& a) \
{ \
    return _Tpsvec(srai(a.val, imm)); \
}

OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint16x8, v_int16x8, epi16, _mm_srai_epi16)
OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32, _mm_srai_epi32)
OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint64x2, v_int64x2, epi64, v_srai_epi64)

#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(_mm_load_si128((const __m128i*)ptr)); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ \
    return _Tpvec(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
                                     _mm_loadl_epi64((const __m128i*)ptr1))); \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ _mm_storeu_si128((__m128i*)ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ _mm_store_si128((__m128i*)ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ _mm_storel_epi64((__m128i*)ptr, a.val); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a.val, a.val)); }

OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint8x16, uchar)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int8x16, schar)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint16x8, ushort)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int16x8, short)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint32x4, unsigned)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int32x4, int)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint64x2, uint64)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int64x2, int64)

#define OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(_mm_loadu_##suffix(ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(_mm_load_##suffix(ptr)); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ \
    return _Tpvec(_mm_castsi128_##suffix( \
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
                           _mm_loadl_epi64((const __m128i*)ptr1)))); \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ _mm_storeu_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ _mm_store_##suffix(ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ _mm_storel_epi64((__m128i*)ptr, _mm_cast##suffix##_si128(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ \
    __m128i a1 = _mm_cast##suffix##_si128(a.val); \
    _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a1, a1)); \
}

OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)

#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    scalartype CV_DECL_ALIGNED(16) buf[4]; \
    v_store_aligned(buf, a); \
    scalartype s0 = scalar_func(buf[0], buf[1]); \
    scalartype s1 = scalar_func(buf[2], buf[3]); \
    return scalar_func(s0, s1); \
}

OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, sum, OPENCV_HAL_ADD)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, sum, OPENCV_HAL_ADD)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, max, std::max)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, sum, OPENCV_HAL_ADD)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min)
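
// Illustrative sketch (assumed usage, not part of the API): the reductions above collapse
// a whole register to a single scalar by spilling to an aligned buffer, e.g.
//   float s = v_reduce_sum(v_float32x4(1.f, 2.f, 3.f, 4.f));   // s == 10.f
//   int   m = v_reduce_max(v_int32x4(-1, 7, 3, 5));            // m == 7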

#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \
inline int v_signmask(const _Tpvec& a) \
{ \
    return and_op(_mm_movemask_##suffix(pack_op(a.val)), signmask); \
} \
inline bool v_check_all(const _Tpvec& a) \
{ return and_op(_mm_movemask_##suffix(a.val), allmask) == allmask; } \
inline bool v_check_any(const _Tpvec& a) \
{ return and_op(_mm_movemask_##suffix(a.val), allmask) != 0; }

#define OPENCV_HAL_PACKS(a) _mm_packs_epi16(a, a)
inline __m128i v_packq_epi32(__m128i a)
{
    __m128i b = _mm_packs_epi32(a, a);
    return _mm_packs_epi16(b, b);
}

OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 15, 15)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 3, 3)

#define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, suffix) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(_mm_xor_##suffix(b.val, _mm_and_##suffix(_mm_xor_##suffix(b.val, a.val), mask.val))); \
}

OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps)
OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd)

#define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpuvec, _Tpwuvec, _Tpu, _Tpsvec, _Tpwsvec, _Tps, suffix, wsuffix, shift) \
inline void v_expand(const _Tpuvec& a, _Tpwuvec& b0, _Tpwuvec& b1) \
{ \
    __m128i z = _mm_setzero_si128(); \
    b0.val = _mm_unpacklo_##suffix(a.val, z); \
    b1.val = _mm_unpackhi_##suffix(a.val, z); \
} \
inline _Tpwuvec v_load_expand(const _Tpu* ptr) \
{ \
    __m128i z = _mm_setzero_si128(); \
    return _Tpwuvec(_mm_unpacklo_##suffix(_mm_loadl_epi64((const __m128i*)ptr), z)); \
} \
inline void v_expand(const _Tpsvec& a, _Tpwsvec& b0, _Tpwsvec& b1) \
{ \
    b0.val = _mm_srai_##wsuffix(_mm_unpacklo_##suffix(a.val, a.val), shift); \
    b1.val = _mm_srai_##wsuffix(_mm_unpackhi_##suffix(a.val, a.val), shift); \
} \
inline _Tpwsvec v_load_expand(const _Tps* ptr) \
{ \
    __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \
    return _Tpwsvec(_mm_srai_##wsuffix(_mm_unpacklo_##suffix(a, a), shift)); \
}

OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, v_int8x16, v_int16x8, schar, epi8, epi16, 8)
OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, v_int16x8, v_int32x4, short, epi16, epi32, 16)

inline void v_expand(const v_uint32x4& a, v_uint64x2& b0, v_uint64x2& b1)
{
    __m128i z = _mm_setzero_si128();
    b0.val = _mm_unpacklo_epi32(a.val, z);
    b1.val = _mm_unpackhi_epi32(a.val, z);
}
inline v_uint64x2 v_load_expand(const unsigned* ptr)
{
    __m128i z = _mm_setzero_si128();
    return v_uint64x2(_mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i*)ptr), z));
}
inline void v_expand(const v_int32x4& a, v_int64x2& b0, v_int64x2& b1)
{
    __m128i s = _mm_srai_epi32(a.val, 31);
    b0.val = _mm_unpacklo_epi32(a.val, s);
    b1.val = _mm_unpackhi_epi32(a.val, s);
}
inline v_int64x2 v_load_expand(const int* ptr)
{
    __m128i a = _mm_loadl_epi64((const __m128i*)ptr);
    __m128i s = _mm_srai_epi32(a, 31);
    return v_int64x2(_mm_unpacklo_epi32(a, s));
}

inline v_uint32x4 v_load_expand_q(const uchar* ptr)
{
    __m128i z = _mm_setzero_si128();
    __m128i a = _mm_cvtsi32_si128(*(const int*)ptr);
    return v_uint32x4(_mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z));
}

inline v_int32x4 v_load_expand_q(const schar* ptr)
{
    __m128i a = _mm_cvtsi32_si128(*(const int*)ptr);
    a = _mm_unpacklo_epi8(a, a);
    a = _mm_unpacklo_epi8(a, a);
    return v_int32x4(_mm_srai_epi32(a, 24));
}

#define OPENCV_HAL_IMPL_SSE_UNPACKS(_Tpvec, suffix, cast_from, cast_to) \
inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
{ \
    b0.val = _mm_unpacklo_##suffix(a0.val, a1.val); \
    b1.val = _mm_unpackhi_##suffix(a0.val, a1.val); \
} \
inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
{ \
    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
    return _Tpvec(cast_to(_mm_unpacklo_epi64(a1, b1))); \
} \
inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
{ \
    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
    return _Tpvec(cast_to(_mm_unpackhi_epi64(a1, b1))); \
} \
inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
{ \
    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
    c.val = cast_to(_mm_unpacklo_epi64(a1, b1)); \
    d.val = cast_to(_mm_unpackhi_epi64(a1, b1)); \
}

OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_int16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd)

inline v_int32x4 v_round(const v_float32x4& a)
{ return v_int32x4(_mm_cvtps_epi32(a.val)); }

inline v_int32x4 v_floor(const v_float32x4& a)
{
    __m128i a1 = _mm_cvtps_epi32(a.val);
    __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(_mm_cvtepi32_ps(a1), a.val));
    return v_int32x4(_mm_add_epi32(a1, mask));
}

inline v_int32x4 v_ceil(const v_float32x4& a)
{
    __m128i a1 = _mm_cvtps_epi32(a.val);
    __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(a.val, _mm_cvtepi32_ps(a1)));
    return v_int32x4(_mm_sub_epi32(a1, mask));
}
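
// In v_floor/v_ceil above, _mm_cvtps_epi32 rounds to nearest; the comparison then yields
// all-ones (-1) exactly in the lanes where the rounded value overshot (for floor) or
// undershot (for ceil) the input, so adding the mask subtracts 1 and subtracting it adds 1.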
1168 
v_trunc(const v_float32x4 & a)1169 inline v_int32x4 v_trunc(const v_float32x4& a)
1170 { return v_int32x4(_mm_cvttps_epi32(a.val)); }
1171 
v_round(const v_float64x2 & a)1172 inline v_int32x4 v_round(const v_float64x2& a)
1173 { return v_int32x4(_mm_cvtpd_epi32(a.val)); }
1174 
v_floor(const v_float64x2 & a)1175 inline v_int32x4 v_floor(const v_float64x2& a)
1176 {
1177     __m128i a1 = _mm_cvtpd_epi32(a.val);
1178     __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(_mm_cvtepi32_pd(a1), a.val));
1179     mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
1180     return v_int32x4(_mm_add_epi32(a1, mask));
1181 }
1182 
v_ceil(const v_float64x2 & a)1183 inline v_int32x4 v_ceil(const v_float64x2& a)
1184 {
1185     __m128i a1 = _mm_cvtpd_epi32(a.val);
1186     __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(a.val, _mm_cvtepi32_pd(a1)));
1187     mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
1188     return v_int32x4(_mm_sub_epi32(a1, mask));
1189 }
1190 
v_trunc(const v_float64x2 & a)1191 inline v_int32x4 v_trunc(const v_float64x2& a)
1192 { return v_int32x4(_mm_cvttpd_epi32(a.val)); }

#define OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
                           const _Tpvec& a2, const _Tpvec& a3, \
                           _Tpvec& b0, _Tpvec& b1, \
                           _Tpvec& b2, _Tpvec& b3) \
{ \
    __m128i t0 = cast_from(_mm_unpacklo_##suffix(a0.val, a1.val)); \
    __m128i t1 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); \
    __m128i t2 = cast_from(_mm_unpackhi_##suffix(a0.val, a1.val)); \
    __m128i t3 = cast_from(_mm_unpackhi_##suffix(a2.val, a3.val)); \
\
    b0.val = cast_to(_mm_unpacklo_epi64(t0, t1)); \
    b1.val = cast_to(_mm_unpackhi_epi64(t0, t1)); \
    b2.val = cast_to(_mm_unpacklo_epi64(t2, t3)); \
    b3.val = cast_to(_mm_unpackhi_epi64(t2, t3)); \
}

OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
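
// v_transpose4x4 is the usual two-stage 4x4 SIMD transpose: the first unpacklo/unpackhi
// pass interleaves 32-bit lanes of the row pairs, the second pass merges 64-bit halves
// into the transposed rows. All inputs are read into t0..t3 before any output is
// written, so calling it with outputs aliasing the inputs is fine, e.g. (sketch only):
//
//     v_uint32x4 r0, r1, r2, r3;                        // rows of a 4x4 block
//     v_transpose4x4(r0, r1, r2, r3, r0, r1, r2, r3);   // rows become columns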

// adapted from sse_utils.hpp
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
{
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));
    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 32));

    __m128i t10 = _mm_unpacklo_epi8(t00, _mm_unpackhi_epi64(t01, t01));
    __m128i t11 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t00, t00), t02);
    __m128i t12 = _mm_unpacklo_epi8(t01, _mm_unpackhi_epi64(t02, t02));

    __m128i t20 = _mm_unpacklo_epi8(t10, _mm_unpackhi_epi64(t11, t11));
    __m128i t21 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t10, t10), t12);
    __m128i t22 = _mm_unpacklo_epi8(t11, _mm_unpackhi_epi64(t12, t12));

    __m128i t30 = _mm_unpacklo_epi8(t20, _mm_unpackhi_epi64(t21, t21));
    __m128i t31 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t20, t20), t22);
    __m128i t32 = _mm_unpacklo_epi8(t21, _mm_unpackhi_epi64(t22, t22));

    a.val = _mm_unpacklo_epi8(t30, _mm_unpackhi_epi64(t31, t31));
    b.val = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t30, t30), t32);
    c.val = _mm_unpacklo_epi8(t31, _mm_unpackhi_epi64(t32, t32));
}
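
// The 3-channel byte deinterleave above consumes 48 bytes laid out as
// a0 b0 c0 a1 b1 c1 ... and applies four rounds of _mm_unpacklo_epi8 against
// 64-bit-swapped copies; each round regroups same-channel bytes more tightly until
// every output register holds one full channel. A hedged usage sketch (pointer name
// and data layout are the caller's responsibility, e.g. packed BGR pixels):
//
//     // const uchar* bgr = ...;               // at least 48 readable bytes
//     // v_uint8x16 b, g, r;
//     // v_load_deinterleave(bgr, b, g, r);    // b = b0..b15, g = g0..g15, r = r0..r15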

inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
{
    __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1 ...
    __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
    __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 32)); // a8 b8 c8 d8 ...
    __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 48)); // a12 b12 c12 d12 ...

    __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ...
    __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ...
    __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ...
    __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b6 b14 ...

    u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ...
    u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ...
    u2 = _mm_unpackhi_epi8(v0, v2); // a1 a5 a9 a13 ...
    u3 = _mm_unpackhi_epi8(v1, v3); // a3 a7 a11 a15 ...

    v0 = _mm_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ...
    v1 = _mm_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ...
    v2 = _mm_unpackhi_epi8(u0, u1); // c0 c2 c4 c6 ...
    v3 = _mm_unpackhi_epi8(u2, u3); // c1 c3 c5 c7 ...

    a.val = _mm_unpacklo_epi8(v0, v1);
    b.val = _mm_unpackhi_epi8(v0, v1);
    c.val = _mm_unpacklo_epi8(v2, v3);
    d.val = _mm_unpackhi_epi8(v2, v3);
}
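
// With four byte channels the deinterleave is a plain radix-2 shuffle network: three
// unpack rounds regroup the 64 loaded bytes by element stride 8, 4 and 2 (see the lane
// comments above), and the final round merges the even- and odd-indexed halves into
// one register per channel.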

inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
{
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 8));
    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 16));

    __m128i t10 = _mm_unpacklo_epi16(t00, _mm_unpackhi_epi64(t01, t01));
    __m128i t11 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t00, t00), t02);
    __m128i t12 = _mm_unpacklo_epi16(t01, _mm_unpackhi_epi64(t02, t02));

    __m128i t20 = _mm_unpacklo_epi16(t10, _mm_unpackhi_epi64(t11, t11));
    __m128i t21 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t10, t10), t12);
    __m128i t22 = _mm_unpacklo_epi16(t11, _mm_unpackhi_epi64(t12, t12));

    a.val = _mm_unpacklo_epi16(t20, _mm_unpackhi_epi64(t21, t21));
    b.val = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t20, t20), t22);
    c.val = _mm_unpacklo_epi16(t21, _mm_unpackhi_epi64(t22, t22));
}

inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
{
    __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1
    __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 8)); // a2 b2 c2 d2 ...
    __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
    __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 24)); // a6 b6 c6 d6 ...

    __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 a4 b0 b4 ...
    __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a1 a5 b1 b5 ...
    __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a2 a6 b2 b6 ...
    __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a3 a7 b3 b7 ...

    u0 = _mm_unpacklo_epi16(v0, v2); // a0 a2 a4 a6 ...
    u1 = _mm_unpacklo_epi16(v1, v3); // a1 a3 a5 a7 ...
    u2 = _mm_unpackhi_epi16(v0, v2); // c0 c2 c4 c6 ...
    u3 = _mm_unpackhi_epi16(v1, v3); // c1 c3 c5 c7 ...

    a.val = _mm_unpacklo_epi16(u0, u1);
    b.val = _mm_unpackhi_epi16(u0, u1);
    c.val = _mm_unpacklo_epi16(u2, u3);
    d.val = _mm_unpackhi_epi16(u2, u3);
}

inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
{
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 4));
    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 8));

    __m128i t10 = _mm_unpacklo_epi32(t00, _mm_unpackhi_epi64(t01, t01));
    __m128i t11 = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t00, t00), t02);
    __m128i t12 = _mm_unpacklo_epi32(t01, _mm_unpackhi_epi64(t02, t02));

    a.val = _mm_unpacklo_epi32(t10, _mm_unpackhi_epi64(t11, t11));
    b.val = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t10, t10), t12);
    c.val = _mm_unpacklo_epi32(t11, _mm_unpackhi_epi64(t12, t12));
}

inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
{
    v_uint32x4 u0(_mm_loadu_si128((const __m128i*)ptr));        // a0 b0 c0 d0
    v_uint32x4 u1(_mm_loadu_si128((const __m128i*)(ptr + 4)));  // a1 b1 c1 d1
    v_uint32x4 u2(_mm_loadu_si128((const __m128i*)(ptr + 8)));  // a2 b2 c2 d2
    v_uint32x4 u3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3

    v_transpose4x4(u0, u1, u2, u3, a, b, c, d);
}
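
// For 32-bit lanes, deinterleaving four channels of a 4x4 block is exactly a 4x4
// transpose, so the function simply loads the four rows and forwards to
// v_transpose4x4. Illustrative sketch (names are placeholders):
//
//     // const unsigned* xyzw = ...;            // 16 packed values: x0 y0 z0 w0 x1 ...
//     // v_uint32x4 x, y, z, w;
//     // v_load_deinterleave(xyzw, x, y, z, w);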

inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                const v_uint8x16& c )
{
    __m128i z = _mm_setzero_si128();
    __m128i ab0 = _mm_unpacklo_epi8(a.val, b.val);
    __m128i ab1 = _mm_unpackhi_epi8(a.val, b.val);
    __m128i c0 = _mm_unpacklo_epi8(c.val, z);
    __m128i c1 = _mm_unpackhi_epi8(c.val, z);

    __m128i p00 = _mm_unpacklo_epi16(ab0, c0);
    __m128i p01 = _mm_unpackhi_epi16(ab0, c0);
    __m128i p02 = _mm_unpacklo_epi16(ab1, c1);
    __m128i p03 = _mm_unpackhi_epi16(ab1, c1);

    __m128i p10 = _mm_unpacklo_epi32(p00, p01);
    __m128i p11 = _mm_unpackhi_epi32(p00, p01);
    __m128i p12 = _mm_unpacklo_epi32(p02, p03);
    __m128i p13 = _mm_unpackhi_epi32(p02, p03);

    __m128i p20 = _mm_unpacklo_epi64(p10, p11);
    __m128i p21 = _mm_unpackhi_epi64(p10, p11);
    __m128i p22 = _mm_unpacklo_epi64(p12, p13);
    __m128i p23 = _mm_unpackhi_epi64(p12, p13);

    p20 = _mm_slli_si128(p20, 1);
    p22 = _mm_slli_si128(p22, 1);

    __m128i p30 = _mm_slli_epi64(_mm_unpacklo_epi32(p20, p21), 8);
    __m128i p31 = _mm_srli_epi64(_mm_unpackhi_epi32(p20, p21), 8);
    __m128i p32 = _mm_slli_epi64(_mm_unpacklo_epi32(p22, p23), 8);
    __m128i p33 = _mm_srli_epi64(_mm_unpackhi_epi32(p22, p23), 8);

    __m128i p40 = _mm_unpacklo_epi64(p30, p31);
    __m128i p41 = _mm_unpackhi_epi64(p30, p31);
    __m128i p42 = _mm_unpacklo_epi64(p32, p33);
    __m128i p43 = _mm_unpackhi_epi64(p32, p33);

    __m128i v0 = _mm_or_si128(_mm_srli_si128(p40, 2), _mm_slli_si128(p41, 10));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(p41, 6), _mm_slli_si128(p42, 6));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(p42, 10), _mm_slli_si128(p43, 2));

    _mm_storeu_si128((__m128i*)(ptr), v0);
    _mm_storeu_si128((__m128i*)(ptr + 16), v1);
    _mm_storeu_si128((__m128i*)(ptr + 32), v2);
}
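
// The 3-channel byte store goes the other way: a and b are interleaved directly, c is
// widened against a zero register, and the unpack pyramid assembles 4-byte "a b c 0"
// groups in order. The _mm_slli_si128/_mm_slli_epi64/_mm_srli_epi64 steps then squeeze
// out the zero padding, and the final shift/OR pairs splice the packed triplets into
// three contiguous 16-byte stores (48 bytes total).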

inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                const v_uint8x16& c, const v_uint8x16& d)
{
    // a0 a1 a2 a3 ....
    // b0 b1 b2 b3 ....
    // c0 c1 c2 c3 ....
    // d0 d1 d2 d3 ....
    __m128i u0 = _mm_unpacklo_epi8(a.val, c.val); // a0 c0 a1 c1 ...
    __m128i u1 = _mm_unpackhi_epi8(a.val, c.val); // a8 c8 a9 c9 ...
    __m128i u2 = _mm_unpacklo_epi8(b.val, d.val); // b0 d0 b1 d1 ...
    __m128i u3 = _mm_unpackhi_epi8(b.val, d.val); // b8 d8 b9 d9 ...

    __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 b0 c0 d0 ...
    __m128i v1 = _mm_unpacklo_epi8(u1, u3); // a8 b8 c8 d8 ...
    __m128i v2 = _mm_unpackhi_epi8(u0, u2); // a4 b4 c4 d4 ...
    __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a12 b12 c12 d12 ...

    _mm_storeu_si128((__m128i*)ptr, v0);
    _mm_storeu_si128((__m128i*)(ptr + 16), v2);
    _mm_storeu_si128((__m128i*)(ptr + 32), v1);
    _mm_storeu_si128((__m128i*)(ptr + 48), v3);
}
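
// Interleaving four byte channels needs only two unpack stages (a with c and b with d,
// then the two intermediates with each other). The stores are intentionally issued as
// v0, v2, v1, v3, which is the order of the element groups 0-3, 4-7, 8-11, 12-15.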

inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
                                const v_uint16x8& b,
                                const v_uint16x8& c )
{
    __m128i z = _mm_setzero_si128();
    __m128i ab0 = _mm_unpacklo_epi16(a.val, b.val);
    __m128i ab1 = _mm_unpackhi_epi16(a.val, b.val);
    __m128i c0 = _mm_unpacklo_epi16(c.val, z);
    __m128i c1 = _mm_unpackhi_epi16(c.val, z);

    __m128i p10 = _mm_unpacklo_epi32(ab0, c0);
    __m128i p11 = _mm_unpackhi_epi32(ab0, c0);
    __m128i p12 = _mm_unpacklo_epi32(ab1, c1);
    __m128i p13 = _mm_unpackhi_epi32(ab1, c1);

    __m128i p20 = _mm_unpacklo_epi64(p10, p11);
    __m128i p21 = _mm_unpackhi_epi64(p10, p11);
    __m128i p22 = _mm_unpacklo_epi64(p12, p13);
    __m128i p23 = _mm_unpackhi_epi64(p12, p13);

    p20 = _mm_slli_si128(p20, 2);
    p22 = _mm_slli_si128(p22, 2);

    __m128i p30 = _mm_unpacklo_epi64(p20, p21);
    __m128i p31 = _mm_unpackhi_epi64(p20, p21);
    __m128i p32 = _mm_unpacklo_epi64(p22, p23);
    __m128i p33 = _mm_unpackhi_epi64(p22, p23);

    __m128i v0 = _mm_or_si128(_mm_srli_si128(p30, 2), _mm_slli_si128(p31, 10));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(p31, 6), _mm_slli_si128(p32, 6));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(p32, 10), _mm_slli_si128(p33, 2));

    _mm_storeu_si128((__m128i*)(ptr), v0);
    _mm_storeu_si128((__m128i*)(ptr + 8), v1);
    _mm_storeu_si128((__m128i*)(ptr + 16), v2);
}

inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
                                const v_uint16x8& c, const v_uint16x8& d)
{
    // a0 a1 a2 a3 ....
    // b0 b1 b2 b3 ....
    // c0 c1 c2 c3 ....
    // d0 d1 d2 d3 ....
    __m128i u0 = _mm_unpacklo_epi16(a.val, c.val); // a0 c0 a1 c1 ...
    __m128i u1 = _mm_unpackhi_epi16(a.val, c.val); // a4 c4 a5 c5 ...
    __m128i u2 = _mm_unpacklo_epi16(b.val, d.val); // b0 d0 b1 d1 ...
    __m128i u3 = _mm_unpackhi_epi16(b.val, d.val); // b4 d4 b5 d5 ...

    __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 b0 c0 d0 ...
    __m128i v1 = _mm_unpacklo_epi16(u1, u3); // a4 b4 c4 d4 ...
    __m128i v2 = _mm_unpackhi_epi16(u0, u2); // a2 b2 c2 d2 ...
    __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a6 b6 c6 d6 ...

    _mm_storeu_si128((__m128i*)ptr, v0);
    _mm_storeu_si128((__m128i*)(ptr + 8), v2);
    _mm_storeu_si128((__m128i*)(ptr + 16), v1);
    _mm_storeu_si128((__m128i*)(ptr + 24), v3);
}

inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                                const v_uint32x4& c )
{
    v_uint32x4 z = v_setzero_u32(), u0, u1, u2, u3;
    v_transpose4x4(a, b, c, z, u0, u1, u2, u3);

    __m128i v0 = _mm_or_si128(u0.val, _mm_slli_si128(u1.val, 12));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(u1.val, 4), _mm_slli_si128(u2.val, 8));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(u2.val, 8), _mm_slli_si128(u3.val, 4));

    _mm_storeu_si128((__m128i*)ptr, v0);
    _mm_storeu_si128((__m128i*)(ptr + 4), v1);
    _mm_storeu_si128((__m128i*)(ptr + 8), v2);
}

inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                               const v_uint32x4& c, const v_uint32x4& d)
{
    v_uint32x4 t0, t1, t2, t3;
    v_transpose4x4(a, b, c, d, t0, t1, t2, t3);
    v_store(ptr, t0);
    v_store(ptr + 4, t1);
    v_store(ptr + 8, t2);
    v_store(ptr + 12, t3);
}
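
// For 32-bit lanes the 4-channel interleave is again just a 4x4 transpose followed by
// four plain stores, and the 3-channel version above reuses the same transpose with a
// zero fourth row before shifting the padding lane away. Hedged usage sketch (pointer
// and register names are placeholders):
//
//     // unsigned* rgba = ...;                    // room for 16 values
//     // v_uint32x4 r, g, b, a;                   // one register per plane
//     // v_store_interleave(rgba, r, g, b, a);    // r0 g0 b0 a0 r1 g1 b1 a1 ...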

#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
                                 _Tpvec& b0, _Tpvec& c0 ) \
{ \
    _Tpuvec a1, b1, c1; \
    v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1); \
    a0 = v_reinterpret_as_##suffix(a1); \
    b0 = v_reinterpret_as_##suffix(b1); \
    c0 = v_reinterpret_as_##suffix(c1); \
} \
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
                                 _Tpvec& b0, _Tpvec& c0, _Tpvec& d0 ) \
{ \
    _Tpuvec a1, b1, c1, d1; \
    v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1, d1); \
    a0 = v_reinterpret_as_##suffix(a1); \
    b0 = v_reinterpret_as_##suffix(b1); \
    c0 = v_reinterpret_as_##suffix(c1); \
    d0 = v_reinterpret_as_##suffix(d1); \
} \
inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, \
                               const _Tpvec& b0, const _Tpvec& c0 ) \
{ \
    _Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \
    _Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \
    _Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \
    v_store_interleave((_Tpu*)ptr, a1, b1, c1); \
} \
inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, const _Tpvec& b0, \
                               const _Tpvec& c0, const _Tpvec& d0 ) \
{ \
    _Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \
    _Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \
    _Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \
    _Tpuvec d1 = v_reinterpret_as_##usuffix(d0); \
    v_store_interleave((_Tpu*)ptr, a1, b1, c1, d1); \
}

OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float32x4, float, f32, v_uint32x4, unsigned, u32)
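
// The signed and float element types reuse the unsigned kernels above via bitwise
// reinterpretation: de/interleaving only moves lanes around, so casting to the
// same-sized unsigned vector before and after the call is exact and costs nothing at
// run time.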

inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{
    return v_float32x4(_mm_cvtepi32_ps(a.val));
}

inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{
    return v_float32x4(_mm_cvtpd_ps(a.val));
}

inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
    return v_float64x2(_mm_cvtepi32_pd(a.val));
}

inline v_float64x2 v_cvt_f64(const v_float32x4& a)
{
    return v_float64x2(_mm_cvtps_pd(a.val));
}
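
// Lane semantics of the conversions: v_cvt_f32(v_float64x2) narrows the two doubles
// into the low two float lanes and zeroes the upper two (_mm_cvtpd_ps), while
// v_cvt_f64(v_int32x4) and v_cvt_f64(v_float32x4) widen only the two low lanes of the
// source (_mm_cvtepi32_pd / _mm_cvtps_pd).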

}

#endif