/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_HAL_SSE_HPP__
#define __OPENCV_HAL_SSE_HPP__

#define CV_SIMD128 1
#define CV_SIMD128_64F 1

namespace cv
{

struct v_uint8x16
{
    typedef uchar lane_type;
    enum { nlanes = 16 };

    v_uint8x16() {}
    explicit v_uint8x16(__m128i v) : val(v) {}
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    {
        val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
                            (char)v4, (char)v5, (char)v6, (char)v7,
                            (char)v8, (char)v9, (char)v10, (char)v11,
                            (char)v12, (char)v13, (char)v14, (char)v15);
    }
    uchar get0() const
    {
        return (uchar)_mm_cvtsi128_si32(val);
    }

    __m128i val;
};

struct v_int8x16
{
    typedef schar lane_type;
    enum { nlanes = 16 };

    v_int8x16() {}
    explicit v_int8x16(__m128i v) : val(v) {}
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    {
        val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
                            (char)v4, (char)v5, (char)v6, (char)v7,
                            (char)v8, (char)v9, (char)v10, (char)v11,
                            (char)v12, (char)v13, (char)v14, (char)v15);
    }
    schar get0() const
    {
        return (schar)_mm_cvtsi128_si32(val);
    }

    __m128i val;
};

struct v_uint16x8
{
    typedef ushort lane_type;
    enum { nlanes = 8 };

    v_uint16x8() {}
    explicit v_uint16x8(__m128i v) : val(v) {}
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    {
        val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
                             (short)v4, (short)v5, (short)v6, (short)v7);
    }
    ushort get0() const
    {
        return (ushort)_mm_cvtsi128_si32(val);
    }

    __m128i val;
};

struct v_int16x8
{
    typedef short lane_type;
    enum { nlanes = 8 };

    v_int16x8() {}
    explicit v_int16x8(__m128i v) : val(v) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
                             (short)v4, (short)v5, (short)v6, (short)v7);
    }
    short get0() const
    {
        return (short)_mm_cvtsi128_si32(val);
    }
    __m128i val;
};

struct v_uint32x4
{
    typedef unsigned lane_type;
    enum { nlanes = 4 };

    v_uint32x4() {}
    explicit v_uint32x4(__m128i v) : val(v) {}
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    {
        val = _mm_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3);
    }
    unsigned get0() const
    {
        return (unsigned)_mm_cvtsi128_si32(val);
    }
    __m128i val;
};

struct v_int32x4
{
    typedef int lane_type;
    enum { nlanes = 4 };

    v_int32x4() {}
    explicit v_int32x4(__m128i v) : val(v) {}
    v_int32x4(int v0, int v1, int v2, int v3)
    {
        val = _mm_setr_epi32(v0, v1, v2, v3);
    }
    int get0() const
    {
        return _mm_cvtsi128_si32(val);
    }
    __m128i val;
};

struct v_float32x4
{
    typedef float lane_type;
    enum { nlanes = 4 };

    v_float32x4() {}
    explicit v_float32x4(__m128 v) : val(v) {}
    v_float32x4(float v0, float v1, float v2, float v3)
    {
        val = _mm_setr_ps(v0, v1, v2, v3);
    }
    float get0() const
    {
        return _mm_cvtss_f32(val);
    }
    __m128 val;
};

struct v_uint64x2
{
    typedef uint64 lane_type;
    enum { nlanes = 2 };

    v_uint64x2() {}
    explicit v_uint64x2(__m128i v) : val(v) {}
    v_uint64x2(uint64 v0, uint64 v1)
    {
        val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
    }
    uint64 get0() const
    {
        int a = _mm_cvtsi128_si32(val);
        int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
        return (unsigned)a | ((uint64)(unsigned)b << 32);
    }
    __m128i val;
};

struct v_int64x2
{
    typedef int64 lane_type;
    enum { nlanes = 2 };

    v_int64x2() {}
    explicit v_int64x2(__m128i v) : val(v) {}
    v_int64x2(int64 v0, int64 v1)
    {
        val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
    }
    int64 get0() const
    {
        int a = _mm_cvtsi128_si32(val);
        int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
        return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
    }
    __m128i val;
};

struct v_float64x2
{
    typedef double lane_type;
    enum { nlanes = 2 };

    v_float64x2() {}
    explicit v_float64x2(__m128d v) : val(v) {}
    v_float64x2(double v0, double v1)
    {
        val = _mm_setr_pd(v0, v1);
    }
    double get0() const
    {
        return _mm_cvtsd_f64(val);
    }
    __m128d val;
};
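
// Illustrative usage sketch (not part of the HAL API; the sample values are assumptions):
// each v_* struct above wraps a single 128-bit SSE register, so lanes are filled with the
// element-wise constructor (or by wrapping a raw register) and the lowest lane is read
// back with get0().
//
//     v_float32x4 f(1.f, 2.f, 3.f, 4.f);   // __m128 holding {1, 2, 3, 4}
//     float first = f.get0();              // 1.f, the lowest lane
//     v_uint16x8 u(_mm_set1_epi16(7));     // explicit wrap of a raw __m128i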

#define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \
inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
{ return _Tpvec(cast(a.val)); }

OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, char, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, char, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint16x8, ushort, u16, si128, epi16, short, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int16x8, short, s16, si128, epi16, short, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint32x4, unsigned, u32, si128, epi32, int, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int32x4, int, s32, si128, epi32, int, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_float32x4, float, f32, ps, ps, float, _mm_castsi128_ps)
OPENCV_HAL_IMPL_SSE_INITVEC(v_float64x2, double, f64, pd, pd, double, _mm_castsi128_pd)

inline v_uint64x2 v_setzero_u64() { return v_uint64x2(_mm_setzero_si128()); }
inline v_int64x2 v_setzero_s64() { return v_int64x2(_mm_setzero_si128()); }
inline v_uint64x2 v_setall_u64(uint64 val) { return v_uint64x2(val, val); }
inline v_int64x2 v_setall_s64(int64 val) { return v_int64x2(val, val); }

template<typename _Tpvec> inline
v_uint64x2 v_reinterpret_as_u64(const _Tpvec& a) { return v_uint64x2(a.val); }
template<typename _Tpvec> inline
v_int64x2 v_reinterpret_as_s64(const _Tpvec& a) { return v_int64x2(a.val); }
inline v_float32x4 v_reinterpret_as_f32(const v_uint64x2& a)
{ return v_float32x4(_mm_castsi128_ps(a.val)); }
inline v_float32x4 v_reinterpret_as_f32(const v_int64x2& a)
{ return v_float32x4(_mm_castsi128_ps(a.val)); }
inline v_float64x2 v_reinterpret_as_f64(const v_uint64x2& a)
{ return v_float64x2(_mm_castsi128_pd(a.val)); }
inline v_float64x2 v_reinterpret_as_f64(const v_int64x2& a)
{ return v_float64x2(_mm_castsi128_pd(a.val)); }

#define OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(_Tpvec, suffix) \
inline _Tpvec v_reinterpret_as_##suffix(const v_float32x4& a) \
{ return _Tpvec(_mm_castps_si128(a.val)); } \
inline _Tpvec v_reinterpret_as_##suffix(const v_float64x2& a) \
{ return _Tpvec(_mm_castpd_si128(a.val)); }

OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint8x16, u8)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int8x16, s8)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint16x8, u16)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int16x8, s16)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint32x4, u32)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int32x4, s32)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint64x2, u64)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int64x2, s64)

//////////////// PACK ///////////////
inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
{
    __m128i delta = _mm_set1_epi16(255);
    return v_uint8x16(_mm_packus_epi16(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta)),
                                       _mm_subs_epu16(b.val, _mm_subs_epu16(b.val, delta))));
}

inline void v_pack_store(uchar* ptr, const v_uint16x8& a)
{
    __m128i delta = _mm_set1_epi16(255);
    __m128i a1 = _mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta));
    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
}

inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b)
{ return v_uint8x16(_mm_packus_epi16(a.val, b.val)); }

inline void v_pack_u_store(uchar* ptr, const v_int16x8& a)
{ _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a.val, a.val)); }
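
// Illustrative usage sketch (sample values are assumptions): v_pack narrows two vectors
// to half-width lanes with saturation; v_pack_u narrows signed lanes to unsigned ones,
// clamping negatives to zero.
//
//     v_uint16x8 a = v_setall_u16(300), b = v_setall_u16(5);
//     v_uint8x16 p = v_pack(a, b);      // first 8 lanes saturate to 255, last 8 hold 5
//     v_int16x8 s = v_setall_s16(-3);
//     v_uint8x16 q = v_pack_u(s, s);    // all lanes clamp to 0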

template<int n> inline
v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
{
    // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    return v_uint8x16(_mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(a.val, delta), n),
                                       _mm_srli_epi16(_mm_adds_epu16(b.val, delta), n)));
}

template<int n> inline
void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
{
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    __m128i a1 = _mm_srli_epi16(_mm_adds_epu16(a.val, delta), n);
    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
}

template<int n> inline
v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
{
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    return v_uint8x16(_mm_packus_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
                                       _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
}

template<int n> inline
void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
{
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
}

inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)
{ return v_int8x16(_mm_packs_epi16(a.val, b.val)); }

inline void v_pack_store(schar* ptr, const v_int16x8& a)
{ _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); }

template<int n> inline
v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
{
    // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    return v_int8x16(_mm_packs_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
                                     _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
}
template<int n> inline
void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
{
    // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a1, a1));
}


// bit-wise "mask ? a : b"
inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b)
{
    return _mm_xor_si128(b, _mm_and_si128(_mm_xor_si128(a, b), mask));
}

inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
{
    __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32);
    __m128i b1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, b.val), maxval32, b.val), delta32);
    __m128i r = _mm_packs_epi32(a1, b1);
    return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
}

inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
{
    __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32);
    __m128i r = _mm_packs_epi32(a1, a1);
    _mm_storel_epi64((__m128i*)ptr, _mm_sub_epi16(r, _mm_set1_epi16(-32768)));
}

template<int n> inline
v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
    __m128i b1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(b.val, delta), n), delta32);
    return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(a1, b1), _mm_set1_epi16(-32768)));
}

template<int n> inline
void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
    __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
    _mm_storel_epi64((__m128i*)ptr, a2);
}

inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
{
    __m128i delta32 = _mm_set1_epi32(32768);
    __m128i r = _mm_packs_epi32(_mm_sub_epi32(a.val, delta32), _mm_sub_epi32(b.val, delta32));
    return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
}

inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
{
    __m128i delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(a.val, delta32);
    __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
    _mm_storel_epi64((__m128i*)ptr, r);
}

template<int n> inline
void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
    __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
    _mm_storel_epi64((__m128i*)ptr, a2);
}

inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b)
{ return v_int16x8(_mm_packs_epi32(a.val, b.val)); }

inline void v_pack_store(short* ptr, const v_int32x4& a)
{
    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a.val, a.val));
}

template<int n> inline
v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1));
    return v_int16x8(_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n),
                                     _mm_srai_epi32(_mm_add_epi32(b.val, delta), n)));
}

template<int n> inline
void v_rshr_pack_store(short* ptr, const v_int32x4& a)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1));
    __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n);
    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1));
}


// [a0 0 | b0 0]  [a1 0 | b1 0]
inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
{
    __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0
    __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0
    return v_uint32x4(_mm_unpacklo_epi64(v0, v1));
}

inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
{
    __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
    _mm_storel_epi64((__m128i*)ptr, a1);
}

// [a0 0 | b0 0]  [a1 0 | b1 0]
inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
{
    __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0
    __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0
    return v_int32x4(_mm_unpacklo_epi64(v0, v1));
}

inline void v_pack_store(int* ptr, const v_int64x2& a)
{
    __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
    _mm_storel_epi64((__m128i*)ptr, a1);
}

template<int n> inline
v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
{
    uint64 delta = (uint64)1 << (n-1);
    v_uint64x2 delta2(delta, delta);
    __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
    __m128i b1 = _mm_srli_epi64(_mm_add_epi64(b.val, delta2.val), n);
    __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0
    __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0
    return v_uint32x4(_mm_unpacklo_epi64(v0, v1));
}

template<int n> inline
void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
{
    uint64 delta = (uint64)1 << (n-1);
    v_uint64x2 delta2(delta, delta);
    __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
    __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
    _mm_storel_epi64((__m128i*)ptr, a2);
}

inline __m128i v_sign_epi64(__m128i a)
{
    return _mm_shuffle_epi32(_mm_srai_epi32(a, 31), _MM_SHUFFLE(3, 3, 1, 1)); // x m0 | x m1
}

inline __m128i v_srai_epi64(__m128i a, int imm)
{
    __m128i smask = v_sign_epi64(a);
    return _mm_xor_si128(_mm_srli_epi64(_mm_xor_si128(a, smask), imm), smask);
}

template<int n> inline
v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
{
    int64 delta = (int64)1 << (n-1);
    v_int64x2 delta2(delta, delta);
    __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
    __m128i b1 = v_srai_epi64(_mm_add_epi64(b.val, delta2.val), n);
    __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0
    __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0
    return v_int32x4(_mm_unpacklo_epi64(v0, v1));
}

template<int n> inline
void v_rshr_pack_store(int* ptr, const v_int64x2& a)
{
    int64 delta = (int64)1 << (n-1);
    v_int64x2 delta2(delta, delta);
    __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
    __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
    _mm_storel_epi64((__m128i*)ptr, a2);
}

inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
    __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
    __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);
    __m128 v3 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(3, 3, 3, 3)), m3.val);

    return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, v3)));
}
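
// Illustrative usage sketch (sample values are assumptions): v_matmul broadcasts each
// lane of v and accumulates res = m0*v[0] + m1*v[1] + m2*v[2] + m3*v[3], i.e. a 4x4
// matrix-vector product when m0..m3 hold the matrix columns.
//
//     v_float32x4 x(1.f, 0.f, 0.f, 0.f);
//     v_float32x4 c0(1.f, 2.f, 3.f, 4.f);
//     v_float32x4 z = v_setzero_f32();
//     v_float32x4 y = v_matmul(x, c0, z, z, z);   // y == c0, since only x[0] is non-zero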


#define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
    a.val = intrin(a.val, b.val); \
    return a; \
}

OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint16x8, _mm_mullo_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int16x8, _mm_mullo_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint64x2, _mm_add_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64)

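// Descriptive note: SSE2 has no 32-bit lane-wise multiply (_mm_mullo_epi32 is SSE4.1),
// so the operators below emulate it with _mm_mul_epu32, which multiplies the even lanes
// (0 and 2) of each operand. The odd lanes are shifted down first, and the two partial
// results are re-interleaved to restore the original lane order; the low 32 bits of each
// 64-bit product are kept, which is exactly the lane-wise a*b for both signed and
// unsigned inputs.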
inline v_uint32x4 operator * (const v_uint32x4& a, const v_uint32x4& b)
{
    __m128i c0 = _mm_mul_epu32(a.val, b.val);
    __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
    __m128i d0 = _mm_unpacklo_epi32(c0, c1);
    __m128i d1 = _mm_unpackhi_epi32(c0, c1);
    return v_uint32x4(_mm_unpacklo_epi64(d0, d1));
}
inline v_int32x4 operator * (const v_int32x4& a, const v_int32x4& b)
{
    __m128i c0 = _mm_mul_epu32(a.val, b.val);
    __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
    __m128i d0 = _mm_unpacklo_epi32(c0, c1);
    __m128i d1 = _mm_unpackhi_epi32(c0, c1);
    return v_int32x4(_mm_unpacklo_epi64(d0, d1));
}
inline v_uint32x4& operator *= (v_uint32x4& a, const v_uint32x4& b)
{
    a = a * b;
    return a;
}
inline v_int32x4& operator *= (v_int32x4& a, const v_int32x4& b)
{
    a = a * b;
    return a;
}

inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
                         v_int32x4& c, v_int32x4& d)
{
    __m128i v0 = _mm_mullo_epi16(a.val, b.val);
    __m128i v1 = _mm_mulhi_epi16(a.val, b.val);
    c.val = _mm_unpacklo_epi16(v0, v1);
    d.val = _mm_unpackhi_epi16(v0, v1);
}

inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
                         v_uint32x4& c, v_uint32x4& d)
{
    __m128i v0 = _mm_mullo_epi16(a.val, b.val);
    __m128i v1 = _mm_mulhi_epu16(a.val, b.val);
    c.val = _mm_unpacklo_epi16(v0, v1);
    d.val = _mm_unpackhi_epi16(v0, v1);
}

inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
                         v_uint64x2& c, v_uint64x2& d)
{
    __m128i c0 = _mm_mul_epu32(a.val, b.val);
    __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
    c.val = _mm_unpacklo_epi64(c0, c1);
    d.val = _mm_unpackhi_epi64(c0, c1);
}

inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
    return v_int32x4(_mm_madd_epi16(a.val, b.val));
}
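
// Illustrative usage sketch (sample values are assumptions): _mm_madd_epi16 multiplies
// the eight 16-bit lane pairs and adds adjacent products, so each 32-bit output lane
// is a[2i]*b[2i] + a[2i+1]*b[2i+1].
//
//     v_int16x8 a(1, 2, 3, 4, 5, 6, 7, 8);
//     v_int16x8 b = v_setall_s16(1);
//     v_int32x4 d = v_dotprod(a, b);   // {1+2, 3+4, 5+6, 7+8} = {3, 7, 11, 15}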

#define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \
OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \
OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ \
    return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \
}

OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint8x16, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int8x16, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint16x8, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int16x8, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint32x4, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int32x4, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint64x2, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int64x2, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float32x4, ps, _mm_castsi128_ps(_mm_set1_epi32(-1)))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float64x2, pd, _mm_castsi128_pd(_mm_set1_epi32(-1)))

inline v_float32x4 v_sqrt(const v_float32x4& x)
{ return v_float32x4(_mm_sqrt_ps(x.val)); }

inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    static const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f);
    __m128 t = x.val;
    __m128 h = _mm_mul_ps(t, _0_5);
    t = _mm_rsqrt_ps(t);
    t = _mm_mul_ps(t, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t, t), h)));
    return v_float32x4(t);
}
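
// Descriptive note: _mm_rsqrt_ps alone provides only an approximation (about 12 bits of
// precision), so v_invsqrt above refines it with one Newton-Raphson step,
// t = t * (1.5 - 0.5*x*t*t), which roughly doubles the number of correct bits while
// remaining much cheaper than a full divide + sqrt.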

inline v_float64x2 v_sqrt(const v_float64x2& x)
{ return v_float64x2(_mm_sqrt_pd(x.val)); }

inline v_float64x2 v_invsqrt(const v_float64x2& x)
{
    static const __m128d v_1 = _mm_set1_pd(1.);
    return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val)));
}

inline v_float32x4 v_abs(const v_float32x4& x)
{ return v_float32x4(_mm_and_ps(x.val, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); }
inline v_float64x2 v_abs(const v_float64x2& x)
{
    return v_float64x2(_mm_and_pd(x.val,
        _mm_castsi128_pd(_mm_srli_epi64(_mm_set1_epi32(-1), 1))));
}

// TODO: exp, log, sin, cos

#define OPENCV_HAL_IMPL_SSE_BIN_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
}

OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_min, _mm_min_epu8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_max, _mm_max_epu8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_min, _mm_min_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_max, _mm_max_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_min, _mm_min_ps)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_max, _mm_max_ps)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_min, _mm_min_pd)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_max, _mm_max_pd)

inline v_int8x16 v_min(const v_int8x16& a, const v_int8x16& b)
{
    __m128i delta = _mm_set1_epi8((char)-128);
    return v_int8x16(_mm_xor_si128(delta, _mm_min_epu8(_mm_xor_si128(a.val, delta),
                                                       _mm_xor_si128(b.val, delta))));
}
inline v_int8x16 v_max(const v_int8x16& a, const v_int8x16& b)
{
    __m128i delta = _mm_set1_epi8((char)-128);
    return v_int8x16(_mm_xor_si128(delta, _mm_max_epu8(_mm_xor_si128(a.val, delta),
                                                       _mm_xor_si128(b.val, delta))));
}
inline v_uint16x8 v_min(const v_uint16x8& a, const v_uint16x8& b)
{
    return v_uint16x8(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, b.val)));
}
inline v_uint16x8 v_max(const v_uint16x8& a, const v_uint16x8& b)
{
    return v_uint16x8(_mm_adds_epu16(_mm_subs_epu16(a.val, b.val), b.val));
}
inline v_uint32x4 v_min(const v_uint32x4& a, const v_uint32x4& b)
{
    __m128i delta = _mm_set1_epi32((int)0x80000000);
    __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
    return v_uint32x4(v_select_si128(mask, b.val, a.val));
}
inline v_uint32x4 v_max(const v_uint32x4& a, const v_uint32x4& b)
{
    __m128i delta = _mm_set1_epi32((int)0x80000000);
    __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
    return v_uint32x4(v_select_si128(mask, a.val, b.val));
}
inline v_int32x4 v_min(const v_int32x4& a, const v_int32x4& b)
{
    return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), b.val, a.val));
}
inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b)
{
    return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), a.val, b.val));
}

#define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \
inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
{ return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i not_mask = _mm_set1_epi32(-1); \
    return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
} \
inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
{ return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    __m128i not_mask = _mm_set1_epi32(-1); \
    return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
} \
inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i smask = _mm_set1_##suffix(sbit); \
    return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \
} \
inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i smask = _mm_set1_##suffix(sbit); \
    return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \
} \
inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i smask = _mm_set1_##suffix(sbit); \
    __m128i not_mask = _mm_set1_epi32(-1); \
    __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \
    return _Tpuvec(_mm_xor_si128(res, not_mask)); \
} \
inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i smask = _mm_set1_##suffix(sbit); \
    __m128i not_mask = _mm_set1_epi32(-1); \
    __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \
    return _Tpuvec(_mm_xor_si128(res, not_mask)); \
} \
inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \
} \
inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \
} \
inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    __m128i not_mask = _mm_set1_epi32(-1); \
    return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \
} \
inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    __m128i not_mask = _mm_set1_epi32(-1); \
    return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \
}

OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint8x16, v_int8x16, epi8, (char)-128)
OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768)
OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000)

#define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); }

OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps)
OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd)

OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_add_wrap, _mm_add_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_sub_wrap, _mm_sub_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16)

#define OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(_Tpuvec, _Tpsvec, bits, smask32) \
inline _Tpuvec v_absdiff(const _Tpuvec& a, const _Tpuvec& b) \
{ \
    return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a.val, b.val), _mm_subs_epu##bits(b.val, a.val))); \
} \
inline _Tpuvec v_absdiff(const _Tpsvec& a, const _Tpsvec& b) \
{ \
    __m128i smask = _mm_set1_epi32(smask32); \
    __m128i a1 = _mm_xor_si128(a.val, smask); \
    __m128i b1 = _mm_xor_si128(b.val, smask); \
    return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a1, b1), _mm_subs_epu##bits(b1, a1))); \
}

OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (int)0x80808080)
OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (int)0x80008000)

#define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \
inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpreg absmask = _mm_castsi128_##suffix(absmask_vec); \
    return _Tpvec(_mm_and_##suffix(_mm_sub_##suffix(a.val, b.val), absmask)); \
} \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \
    return _Tpvec(_mm_sqrt_##suffix(res)); \
} \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \
    return _Tpvec(res); \
} \
inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
{ \
    return _Tpvec(_mm_add_##suffix(_mm_mul_##suffix(a.val, b.val), c.val)); \
}

OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff))
OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1))

#define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
{ \
    return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
} \
inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
{ \
    return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
} \
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
{ \
    return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
} \
inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
{ \
    return _Tpsvec(srai(a.val, imm)); \
} \
template<int imm> \
inline _Tpuvec v_shl(const _Tpuvec& a) \
{ \
    return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
} \
template<int imm> \
inline _Tpsvec v_shl(const _Tpsvec& a) \
{ \
    return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
} \
template<int imm> \
inline _Tpuvec v_shr(const _Tpuvec& a) \
{ \
    return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
} \
template<int imm> \
inline _Tpsvec v_shr(const _Tpsvec& a) \
{ \
    return _Tpsvec(srai(a.val, imm)); \
}

OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint16x8, v_int16x8, epi16, _mm_srai_epi16)
OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32, _mm_srai_epi32)
OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint64x2, v_int64x2, epi64, v_srai_epi64)

#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(_mm_load_si128((const __m128i*)ptr)); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ \
    return _Tpvec(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
                                     _mm_loadl_epi64((const __m128i*)ptr1))); \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ _mm_storeu_si128((__m128i*)ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ _mm_store_si128((__m128i*)ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ _mm_storel_epi64((__m128i*)ptr, a.val); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a.val, a.val)); }

OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint8x16, uchar)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int8x16, schar)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint16x8, ushort)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int16x8, short)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint32x4, unsigned)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int32x4, int)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint64x2, uint64)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int64x2, int64)

#define OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(_mm_loadu_##suffix(ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(_mm_load_##suffix(ptr)); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ \
    return _Tpvec(_mm_castsi128_##suffix( \
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
                           _mm_loadl_epi64((const __m128i*)ptr1)))); \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ _mm_storeu_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ _mm_store_##suffix(ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ _mm_storel_epi64((__m128i*)ptr, _mm_cast##suffix##_si128(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ \
    __m128i a1 = _mm_cast##suffix##_si128(a.val); \
    _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a1, a1)); \
}

OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)

#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    scalartype CV_DECL_ALIGNED(16) buf[4]; \
    v_store_aligned(buf, a); \
    scalartype s0 = scalar_func(buf[0], buf[1]); \
    scalartype s1 = scalar_func(buf[2], buf[3]); \
    return scalar_func(s0, s1); \
}

OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, sum, OPENCV_HAL_ADD)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, sum, OPENCV_HAL_ADD)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, max, std::max)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, sum, OPENCV_HAL_ADD)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min)
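
// Illustrative usage sketch (sample values are assumptions): the reductions above store
// the vector into an aligned scratch buffer and fold it with the scalar functor.
//
//     v_int32x4 v(1, 2, 3, 4);
//     int s = v_reduce_sum(v);   // 10
//     int m = v_reduce_max(v);   // 4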

#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \
inline int v_signmask(const _Tpvec& a) \
{ \
    return and_op(_mm_movemask_##suffix(pack_op(a.val)), signmask); \
} \
inline bool v_check_all(const _Tpvec& a) \
{ return and_op(_mm_movemask_##suffix(a.val), allmask) == allmask; } \
inline bool v_check_any(const _Tpvec& a) \
{ return and_op(_mm_movemask_##suffix(a.val), allmask) != 0; }

#define OPENCV_HAL_PACKS(a) _mm_packs_epi16(a, a)
inline __m128i v_packq_epi32(__m128i a)
{
    __m128i b = _mm_packs_epi32(a, a);
    return _mm_packs_epi16(b, b);
}

OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 15, 15)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 3, 3)

#define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, suffix) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(_mm_xor_##suffix(b.val, _mm_and_##suffix(_mm_xor_##suffix(b.val, a.val), mask.val))); \
}

OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps)
OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd)

#define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpuvec, _Tpwuvec, _Tpu, _Tpsvec, _Tpwsvec, _Tps, suffix, wsuffix, shift) \
inline void v_expand(const _Tpuvec& a, _Tpwuvec& b0, _Tpwuvec& b1) \
{ \
    __m128i z = _mm_setzero_si128(); \
    b0.val = _mm_unpacklo_##suffix(a.val, z); \
    b1.val = _mm_unpackhi_##suffix(a.val, z); \
} \
inline _Tpwuvec v_load_expand(const _Tpu* ptr) \
{ \
    __m128i z = _mm_setzero_si128(); \
    return _Tpwuvec(_mm_unpacklo_##suffix(_mm_loadl_epi64((const __m128i*)ptr), z)); \
} \
inline void v_expand(const _Tpsvec& a, _Tpwsvec& b0, _Tpwsvec& b1) \
{ \
    b0.val = _mm_srai_##wsuffix(_mm_unpacklo_##suffix(a.val, a.val), shift); \
    b1.val = _mm_srai_##wsuffix(_mm_unpackhi_##suffix(a.val, a.val), shift); \
} \
inline _Tpwsvec v_load_expand(const _Tps* ptr) \
{ \
    __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \
    return _Tpwsvec(_mm_srai_##wsuffix(_mm_unpacklo_##suffix(a, a), shift)); \
}

OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, v_int8x16, v_int16x8, schar, epi8, epi16, 8)
OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, v_int16x8, v_int32x4, short, epi16, epi32, 16)

inline void v_expand(const v_uint32x4& a, v_uint64x2& b0, v_uint64x2& b1)
{
    __m128i z = _mm_setzero_si128();
    b0.val = _mm_unpacklo_epi32(a.val, z);
    b1.val = _mm_unpackhi_epi32(a.val, z);
}
inline v_uint64x2 v_load_expand(const unsigned* ptr)
{
    __m128i z = _mm_setzero_si128();
    return v_uint64x2(_mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i*)ptr), z));
}
inline void v_expand(const v_int32x4& a, v_int64x2& b0, v_int64x2& b1)
{
    __m128i s = _mm_srai_epi32(a.val, 31);
    b0.val = _mm_unpacklo_epi32(a.val, s);
    b1.val = _mm_unpackhi_epi32(a.val, s);
}
inline v_int64x2 v_load_expand(const int* ptr)
{
    __m128i a = _mm_loadl_epi64((const __m128i*)ptr);
    __m128i s = _mm_srai_epi32(a, 31);
    return v_int64x2(_mm_unpacklo_epi32(a, s));
}

inline v_uint32x4 v_load_expand_q(const uchar* ptr)
{
    __m128i z = _mm_setzero_si128();
    __m128i a = _mm_cvtsi32_si128(*(const int*)ptr);
    return v_uint32x4(_mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z));
}

inline v_int32x4 v_load_expand_q(const schar* ptr)
{
    __m128i a = _mm_cvtsi32_si128(*(const int*)ptr);
    a = _mm_unpacklo_epi8(a, a);
    a = _mm_unpacklo_epi8(a, a);
    return v_int32x4(_mm_srai_epi32(a, 24));
}
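
// Illustrative usage sketch (sample values are assumptions): v_expand widens one vector
// into two vectors of twice the lane width (zero-extended for unsigned types,
// sign-extended for signed ones), while v_load_expand/v_load_expand_q widen while
// loading half/quarter vectors.
//
//     v_int16x8 s = v_setall_s16(-1);
//     v_int32x4 lo, hi;
//     v_expand(s, lo, hi);   // both halves hold -1 in every 32-bit lane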

#define OPENCV_HAL_IMPL_SSE_UNPACKS(_Tpvec, suffix, cast_from, cast_to) \
inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
{ \
    b0.val = _mm_unpacklo_##suffix(a0.val, a1.val); \
    b1.val = _mm_unpackhi_##suffix(a0.val, a1.val); \
} \
inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
{ \
    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
    return _Tpvec(cast_to(_mm_unpacklo_epi64(a1, b1))); \
} \
inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
{ \
    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
    return _Tpvec(cast_to(_mm_unpackhi_epi64(a1, b1))); \
} \
inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
{ \
    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
    c.val = cast_to(_mm_unpacklo_epi64(a1, b1)); \
    d.val = cast_to(_mm_unpackhi_epi64(a1, b1)); \
}

OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_int16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd)

inline v_int32x4 v_round(const v_float32x4& a)
{ return v_int32x4(_mm_cvtps_epi32(a.val)); }

inline v_int32x4 v_floor(const v_float32x4& a)
{
    __m128i a1 = _mm_cvtps_epi32(a.val);
    __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(_mm_cvtepi32_ps(a1), a.val));
    return v_int32x4(_mm_add_epi32(a1, mask));
}

inline v_int32x4 v_ceil(const v_float32x4& a)
{
    __m128i a1 = _mm_cvtps_epi32(a.val);
    __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(a.val, _mm_cvtepi32_ps(a1)));
    return v_int32x4(_mm_sub_epi32(a1, mask));
}

inline v_int32x4 v_trunc(const v_float32x4& a)
{ return v_int32x4(_mm_cvttps_epi32(a.val)); }

inline v_int32x4 v_round(const v_float64x2& a)
{ return v_int32x4(_mm_cvtpd_epi32(a.val)); }

inline v_int32x4 v_floor(const v_float64x2& a)
{
    __m128i a1 = _mm_cvtpd_epi32(a.val);
    __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(_mm_cvtepi32_pd(a1), a.val));
    mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
    return v_int32x4(_mm_add_epi32(a1, mask));
}

inline v_int32x4 v_ceil(const v_float64x2& a)
{
    __m128i a1 = _mm_cvtpd_epi32(a.val);
    __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(a.val, _mm_cvtepi32_pd(a1)));
    mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
    return v_int32x4(_mm_sub_epi32(a1, mask));
}

inline v_int32x4 v_trunc(const v_float64x2& a)
{ return v_int32x4(_mm_cvttpd_epi32(a.val)); }
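
// Illustrative usage sketch (sample values are assumptions): v_round converts with the
// current rounding mode (round-to-nearest-even by default), v_floor/v_ceil correct the
// rounded result with a comparison mask, and v_trunc converts with truncation toward zero.
//
//     v_float32x4 f(1.5f, -1.5f, 2.7f, -2.7f);
//     v_int32x4 r = v_round(f);   // {2, -2, 3, -3}
//     v_int32x4 fl = v_floor(f);  // {1, -2, 2, -3}
//     v_int32x4 tr = v_trunc(f);  // {1, -1, 2, -2}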

#define OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
                           const _Tpvec& a2, const _Tpvec& a3, \
                           _Tpvec& b0, _Tpvec& b1, \
                           _Tpvec& b2, _Tpvec& b3) \
{ \
    __m128i t0 = cast_from(_mm_unpacklo_##suffix(a0.val, a1.val)); \
    __m128i t1 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); \
    __m128i t2 = cast_from(_mm_unpackhi_##suffix(a0.val, a1.val)); \
    __m128i t3 = cast_from(_mm_unpackhi_##suffix(a2.val, a3.val)); \
\
    b0.val = cast_to(_mm_unpacklo_epi64(t0, t1)); \
    b1.val = cast_to(_mm_unpackhi_epi64(t0, t1)); \
    b2.val = cast_to(_mm_unpacklo_epi64(t2, t3)); \
    b3.val = cast_to(_mm_unpackhi_epi64(t2, t3)); \
}

OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)

// adopted from sse_utils.hpp
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
{
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));
    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 32));

    __m128i t10 = _mm_unpacklo_epi8(t00, _mm_unpackhi_epi64(t01, t01));
    __m128i t11 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t00, t00), t02);
    __m128i t12 = _mm_unpacklo_epi8(t01, _mm_unpackhi_epi64(t02, t02));

    __m128i t20 = _mm_unpacklo_epi8(t10, _mm_unpackhi_epi64(t11, t11));
    __m128i t21 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t10, t10), t12);
    __m128i t22 = _mm_unpacklo_epi8(t11, _mm_unpackhi_epi64(t12, t12));

    __m128i t30 = _mm_unpacklo_epi8(t20, _mm_unpackhi_epi64(t21, t21));
    __m128i t31 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t20, t20), t22);
    __m128i t32 = _mm_unpacklo_epi8(t21, _mm_unpackhi_epi64(t22, t22));

    a.val = _mm_unpacklo_epi8(t30, _mm_unpackhi_epi64(t31, t31));
    b.val = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t30, t30), t32);
    c.val = _mm_unpacklo_epi8(t31, _mm_unpackhi_epi64(t32, t32));
}
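
// Illustrative usage sketch (the buffer name is an assumption): deinterleaving 48 packed
// bytes of a 3-channel row, e.g. B G R B G R ..., into one plane per channel.
//
//     const uchar* row = ...;          // at least 48 readable bytes
//     v_uint8x16 bch, gch, rch;
//     v_load_deinterleave(row, bch, gch, rch);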

inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
{
    __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1 ...
    __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
    __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 32)); // a8 b8 c8 d8 ...
    __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 48)); // a12 b12 c12 d12 ...

    __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ...
    __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ...
    __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ...
    __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b6 b14 ...

    u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ...
    u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ...
    u2 = _mm_unpackhi_epi8(v0, v2); // a1 a5 a9 a13 ...
    u3 = _mm_unpackhi_epi8(v1, v3); // a3 a7 a11 a15 ...

    v0 = _mm_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ...
    v1 = _mm_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ...
    v2 = _mm_unpackhi_epi8(u0, u1); // c0 c2 c4 c6 ...
    v3 = _mm_unpackhi_epi8(u2, u3); // c1 c3 c5 c7 ...

    a.val = _mm_unpacklo_epi8(v0, v1);
    b.val = _mm_unpackhi_epi8(v0, v1);
    c.val = _mm_unpacklo_epi8(v2, v3);
    d.val = _mm_unpackhi_epi8(v2, v3);
1265 }

inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
{
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 8));
    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 16));

    __m128i t10 = _mm_unpacklo_epi16(t00, _mm_unpackhi_epi64(t01, t01));
    __m128i t11 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t00, t00), t02);
    __m128i t12 = _mm_unpacklo_epi16(t01, _mm_unpackhi_epi64(t02, t02));

    __m128i t20 = _mm_unpacklo_epi16(t10, _mm_unpackhi_epi64(t11, t11));
    __m128i t21 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t10, t10), t12);
    __m128i t22 = _mm_unpacklo_epi16(t11, _mm_unpackhi_epi64(t12, t12));

    a.val = _mm_unpacklo_epi16(t20, _mm_unpackhi_epi64(t21, t21));
    b.val = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t20, t20), t22);
    c.val = _mm_unpacklo_epi16(t21, _mm_unpackhi_epi64(t22, t22));
}

inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
{
    __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1
    __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 8)); // a2 b2 c2 d2 ...
    __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
    __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 24)); // a6 b6 c6 d6 ...

    __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 a4 b0 b4 ...
    __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a1 a5 b1 b5 ...
    __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a2 a6 b2 b6 ...
    __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a3 a7 b3 b7 ...

    u0 = _mm_unpacklo_epi16(v0, v2); // a0 a2 a4 a6 ...
    u1 = _mm_unpacklo_epi16(v1, v3); // a1 a3 a5 a7 ...
    u2 = _mm_unpackhi_epi16(v0, v2); // c0 c2 c4 c6 ...
    u3 = _mm_unpackhi_epi16(v1, v3); // c1 c3 c5 c7 ...

    a.val = _mm_unpacklo_epi16(u0, u1);
    b.val = _mm_unpackhi_epi16(u0, u1);
    c.val = _mm_unpacklo_epi16(u2, u3);
    d.val = _mm_unpackhi_epi16(u2, u3);
}

inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
{
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 4));
    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 8));

    __m128i t10 = _mm_unpacklo_epi32(t00, _mm_unpackhi_epi64(t01, t01));
    __m128i t11 = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t00, t00), t02);
    __m128i t12 = _mm_unpacklo_epi32(t01, _mm_unpackhi_epi64(t02, t02));

    a.val = _mm_unpacklo_epi32(t10, _mm_unpackhi_epi64(t11, t11));
    b.val = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t10, t10), t12);
    c.val = _mm_unpacklo_epi32(t11, _mm_unpackhi_epi64(t12, t12));
}

inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
{
    v_uint32x4 u0(_mm_loadu_si128((const __m128i*)ptr));        // a0 b0 c0 d0
    v_uint32x4 u1(_mm_loadu_si128((const __m128i*)(ptr + 4)));  // a1 b1 c1 d1
    v_uint32x4 u2(_mm_loadu_si128((const __m128i*)(ptr + 8)));  // a2 b2 c2 d2
    v_uint32x4 u3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3

    v_transpose4x4(u0, u1, u2, u3, a, b, c, d);
}

inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                const v_uint8x16& c )
{
    __m128i z = _mm_setzero_si128();
    __m128i ab0 = _mm_unpacklo_epi8(a.val, b.val);
    __m128i ab1 = _mm_unpackhi_epi8(a.val, b.val);
    __m128i c0 = _mm_unpacklo_epi8(c.val, z);
    __m128i c1 = _mm_unpackhi_epi8(c.val, z);

    __m128i p00 = _mm_unpacklo_epi16(ab0, c0);
    __m128i p01 = _mm_unpackhi_epi16(ab0, c0);
    __m128i p02 = _mm_unpacklo_epi16(ab1, c1);
    __m128i p03 = _mm_unpackhi_epi16(ab1, c1);

    __m128i p10 = _mm_unpacklo_epi32(p00, p01);
    __m128i p11 = _mm_unpackhi_epi32(p00, p01);
    __m128i p12 = _mm_unpacklo_epi32(p02, p03);
    __m128i p13 = _mm_unpackhi_epi32(p02, p03);

    __m128i p20 = _mm_unpacklo_epi64(p10, p11);
    __m128i p21 = _mm_unpackhi_epi64(p10, p11);
    __m128i p22 = _mm_unpacklo_epi64(p12, p13);
    __m128i p23 = _mm_unpackhi_epi64(p12, p13);

    p20 = _mm_slli_si128(p20, 1);
    p22 = _mm_slli_si128(p22, 1);

    __m128i p30 = _mm_slli_epi64(_mm_unpacklo_epi32(p20, p21), 8);
    __m128i p31 = _mm_srli_epi64(_mm_unpackhi_epi32(p20, p21), 8);
    __m128i p32 = _mm_slli_epi64(_mm_unpacklo_epi32(p22, p23), 8);
    __m128i p33 = _mm_srli_epi64(_mm_unpackhi_epi32(p22, p23), 8);

    __m128i p40 = _mm_unpacklo_epi64(p30, p31);
    __m128i p41 = _mm_unpackhi_epi64(p30, p31);
    __m128i p42 = _mm_unpacklo_epi64(p32, p33);
    __m128i p43 = _mm_unpackhi_epi64(p32, p33);

    __m128i v0 = _mm_or_si128(_mm_srli_si128(p40, 2), _mm_slli_si128(p41, 10));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(p41, 6), _mm_slli_si128(p42, 6));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(p42, 10), _mm_slli_si128(p43, 2));

    _mm_storeu_si128((__m128i*)(ptr), v0);
    _mm_storeu_si128((__m128i*)(ptr + 16), v1);
    _mm_storeu_si128((__m128i*)(ptr + 32), v2);
}
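
// Illustrative usage (not part of the original header): writing three separate
// planes back as packed BGR, here replicating a single gray plane into all three
// channels. The helper name hal_example_gray_to_bgr is an assumption made only
// for this example.
inline void hal_example_gray_to_bgr(const uchar* gray, uchar* bgr)
{
    v_uint8x16 g = v_load(gray);      // 16 gray values
    v_store_interleave(bgr, g, g, g); // writes 48 bytes: g g g | g g g | ...
}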

inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                const v_uint8x16& c, const v_uint8x16& d)
{
    // a0 a1 a2 a3 ....
    // b0 b1 b2 b3 ....
    // c0 c1 c2 c3 ....
    // d0 d1 d2 d3 ....
    __m128i u0 = _mm_unpacklo_epi8(a.val, c.val); // a0 c0 a1 c1 ...
    __m128i u1 = _mm_unpackhi_epi8(a.val, c.val); // a8 c8 a9 c9 ...
    __m128i u2 = _mm_unpacklo_epi8(b.val, d.val); // b0 d0 b1 d1 ...
    __m128i u3 = _mm_unpackhi_epi8(b.val, d.val); // b8 d8 b9 d9 ...

    __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 b0 c0 d0 ...
    __m128i v1 = _mm_unpacklo_epi8(u1, u3); // a8 b8 c8 d8 ...
    __m128i v2 = _mm_unpackhi_epi8(u0, u2); // a4 b4 c4 d4 ...
    __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a12 b12 c12 d12 ...

    _mm_storeu_si128((__m128i*)ptr, v0);
    _mm_storeu_si128((__m128i*)(ptr + 16), v2);
    _mm_storeu_si128((__m128i*)(ptr + 32), v1);
    _mm_storeu_si128((__m128i*)(ptr + 48), v3);
}

inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
                                const v_uint16x8& b,
                                const v_uint16x8& c )
{
    __m128i z = _mm_setzero_si128();
    __m128i ab0 = _mm_unpacklo_epi16(a.val, b.val);
    __m128i ab1 = _mm_unpackhi_epi16(a.val, b.val);
    __m128i c0 = _mm_unpacklo_epi16(c.val, z);
    __m128i c1 = _mm_unpackhi_epi16(c.val, z);

    __m128i p10 = _mm_unpacklo_epi32(ab0, c0);
    __m128i p11 = _mm_unpackhi_epi32(ab0, c0);
    __m128i p12 = _mm_unpacklo_epi32(ab1, c1);
    __m128i p13 = _mm_unpackhi_epi32(ab1, c1);

    __m128i p20 = _mm_unpacklo_epi64(p10, p11);
    __m128i p21 = _mm_unpackhi_epi64(p10, p11);
    __m128i p22 = _mm_unpacklo_epi64(p12, p13);
    __m128i p23 = _mm_unpackhi_epi64(p12, p13);

    p20 = _mm_slli_si128(p20, 2);
    p22 = _mm_slli_si128(p22, 2);

    __m128i p30 = _mm_unpacklo_epi64(p20, p21);
    __m128i p31 = _mm_unpackhi_epi64(p20, p21);
    __m128i p32 = _mm_unpacklo_epi64(p22, p23);
    __m128i p33 = _mm_unpackhi_epi64(p22, p23);

    __m128i v0 = _mm_or_si128(_mm_srli_si128(p30, 2), _mm_slli_si128(p31, 10));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(p31, 6), _mm_slli_si128(p32, 6));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(p32, 10), _mm_slli_si128(p33, 2));

    _mm_storeu_si128((__m128i*)(ptr), v0);
    _mm_storeu_si128((__m128i*)(ptr + 8), v1);
    _mm_storeu_si128((__m128i*)(ptr + 16), v2);
}

inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
                                const v_uint16x8& c, const v_uint16x8& d)
{
    // a0 a1 a2 a3 ....
    // b0 b1 b2 b3 ....
    // c0 c1 c2 c3 ....
    // d0 d1 d2 d3 ....
    __m128i u0 = _mm_unpacklo_epi16(a.val, c.val); // a0 c0 a1 c1 ...
    __m128i u1 = _mm_unpackhi_epi16(a.val, c.val); // a4 c4 a5 c5 ...
    __m128i u2 = _mm_unpacklo_epi16(b.val, d.val); // b0 d0 b1 d1 ...
    __m128i u3 = _mm_unpackhi_epi16(b.val, d.val); // b4 d4 b5 d5 ...

    __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 b0 c0 d0 ...
    __m128i v1 = _mm_unpacklo_epi16(u1, u3); // a4 b4 c4 d4 ...
    __m128i v2 = _mm_unpackhi_epi16(u0, u2); // a2 b2 c2 d2 ...
    __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a6 b6 c6 d6 ...

    _mm_storeu_si128((__m128i*)ptr, v0);
    _mm_storeu_si128((__m128i*)(ptr + 8), v2);
    _mm_storeu_si128((__m128i*)(ptr + 16), v1);
    _mm_storeu_si128((__m128i*)(ptr + 24), v3);
}

inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                                const v_uint32x4& c )
{
    v_uint32x4 z = v_setzero_u32(), u0, u1, u2, u3;
    v_transpose4x4(a, b, c, z, u0, u1, u2, u3);

    __m128i v0 = _mm_or_si128(u0.val, _mm_slli_si128(u1.val, 12));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(u1.val, 4), _mm_slli_si128(u2.val, 8));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(u2.val, 8), _mm_slli_si128(u3.val, 4));

    _mm_storeu_si128((__m128i*)ptr, v0);
    _mm_storeu_si128((__m128i*)(ptr + 4), v1);
    _mm_storeu_si128((__m128i*)(ptr + 8), v2);
}

inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                               const v_uint32x4& c, const v_uint32x4& d)
{
    v_uint32x4 t0, t1, t2, t3;
    v_transpose4x4(a, b, c, d, t0, t1, t2, t3);
    v_store(ptr, t0);
    v_store(ptr + 4, t1);
    v_store(ptr + 8, t2);
    v_store(ptr + 12, t3);
}

// The wrappers below provide interleaved load/store for signed integer and float
// vectors by reinterpreting them as the matching unsigned type and reusing the
// unsigned implementations above.
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
                                 _Tpvec& b0, _Tpvec& c0 ) \
{ \
    _Tpuvec a1, b1, c1; \
    v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1); \
    a0 = v_reinterpret_as_##suffix(a1); \
    b0 = v_reinterpret_as_##suffix(b1); \
    c0 = v_reinterpret_as_##suffix(c1); \
} \
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
                                 _Tpvec& b0, _Tpvec& c0, _Tpvec& d0 ) \
{ \
    _Tpuvec a1, b1, c1, d1; \
    v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1, d1); \
    a0 = v_reinterpret_as_##suffix(a1); \
    b0 = v_reinterpret_as_##suffix(b1); \
    c0 = v_reinterpret_as_##suffix(c1); \
    d0 = v_reinterpret_as_##suffix(d1); \
} \
inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, \
                                const _Tpvec& b0, const _Tpvec& c0 ) \
{ \
    _Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \
    _Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \
    _Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \
    v_store_interleave((_Tpu*)ptr, a1, b1, c1); \
} \
inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, const _Tpvec& b0, \
                                const _Tpvec& c0, const _Tpvec& d0 ) \
{ \
    _Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \
    _Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \
    _Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \
    _Tpuvec d1 = v_reinterpret_as_##usuffix(d0); \
    v_store_interleave((_Tpu*)ptr, a1, b1, c1, d1); \
}

OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float32x4, float, f32, v_uint32x4, unsigned, u32)
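
// Illustrative usage (not part of the original header): thanks to the wrappers
// above, the same deinterleave routines work for float lanes, e.g. splitting an
// array of packed xyz float triplets into coordinate planes. The helper name
// hal_example_split_xyz is an assumption made only for this example.
inline void hal_example_split_xyz(const float* xyz, float* xs, float* ys, float* zs)
{
    v_float32x4 x, y, z;
    v_load_deinterleave(xyz, x, y, z); // reads 12 floats: x0 y0 z0 x1 y1 z1 ...
    v_store(xs, x);
    v_store(ys, y);
    v_store(zs, z);
}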

inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{
    // converts all four int32 lanes to float32
    return v_float32x4(_mm_cvtepi32_ps(a.val));
}

inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{
    // converts both double lanes; the two upper float lanes are set to zero
    return v_float32x4(_mm_cvtpd_ps(a.val));
}

inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
    // converts the two lower int32 lanes to double
    return v_float64x2(_mm_cvtepi32_pd(a.val));
}

inline v_float64x2 v_cvt_f64(const v_float32x4& a)
{
    // converts the two lower float lanes to double
    return v_float64x2(_mm_cvtps_pd(a.val));
}
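
// Illustrative usage (not part of the original header): widening four int32 lanes
// to float32, and the two lower lanes to float64. The helper name
// hal_example_convert is an assumption made only for this example.
inline void hal_example_convert(const v_int32x4& a, v_float32x4& f, v_float64x2& d_lo)
{
    f = v_cvt_f32(a);    // all four lanes, rounded to float
    d_lo = v_cvt_f64(a); // low two lanes, converted exactly to double
}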

}

#endif