1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
8 //
9 //
10 // License Agreement
11 // For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16 // Copyright (C) 2015, Itseez Inc., all rights reserved.
17 // Third party copyrights are property of their respective owners.
18 //
19 // Redistribution and use in source and binary forms, with or without modification,
20 // are permitted provided that the following conditions are met:
21 //
22 // * Redistribution's of source code must retain the above copyright notice,
23 // this list of conditions and the following disclaimer.
24 //
25 // * Redistribution's in binary form must reproduce the above copyright notice,
26 // this list of conditions and the following disclaimer in the documentation
27 // and/or other materials provided with the distribution.
28 //
29 // * The name of the copyright holders may not be used to endorse or promote products
30 // derived from this software without specific prior written permission.
31 //
32 // This software is provided by the copyright holders and contributors "as is" and
33 // any express or implied warranties, including, but not limited to, the implied
34 // warranties of merchantability and fitness for a particular purpose are disclaimed.
35 // In no event shall the Intel Corporation or contributors be liable for any direct,
36 // indirect, incidental, special, exemplary, or consequential damages
37 // (including, but not limited to, procurement of substitute goods or services;
38 // loss of use, data, or profits; or business interruption) however caused
39 // and on any theory of liability, whether in contract, strict liability,
40 // or tort (including negligence or otherwise) arising in any way out of
41 // the use of this software, even if advised of the possibility of such damage.
42 //
43 //M*/
44
45 #ifndef __OPENCV_HAL_INTRIN_CPP_HPP__
46 #define __OPENCV_HAL_INTRIN_CPP_HPP__
47
48 namespace cv
49 {
50
51 template<typename _Tp, int n> struct v_reg
52 {
53 typedef _Tp lane_type;
54 typedef v_reg<typename V_TypeTraits<_Tp>::int_type, n> int_vec;
55 typedef v_reg<typename V_TypeTraits<_Tp>::abs_type, n> abs_vec;
56 enum { nlanes = n };
57
v_regcv::v_reg58 explicit v_reg(const _Tp* ptr) { for( int i = 0; i < n; i++ ) s[i] = ptr[i]; }
v_regcv::v_reg59 v_reg(_Tp s0, _Tp s1) { s[0] = s0; s[1] = s1; }
v_regcv::v_reg60 v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) { s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; }
v_regcv::v_reg61 v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
62 _Tp s4, _Tp s5, _Tp s6, _Tp s7)
63 {
64 s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
65 s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
66 }
v_regcv::v_reg67 v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
68 _Tp s4, _Tp s5, _Tp s6, _Tp s7,
69 _Tp s8, _Tp s9, _Tp s10, _Tp s11,
70 _Tp s12, _Tp s13, _Tp s14, _Tp s15)
71 {
72 s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
73 s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
74 s[8] = s8; s[9] = s9; s[10] = s10; s[11] = s11;
75 s[12] = s12; s[13] = s13; s[14] = s14; s[15] = s15;
76 }
77
v_regcv::v_reg78 v_reg() {}
v_regcv::v_reg79 v_reg(const v_reg<_Tp, n> & r)
80 {
81 for( int i = 0; i < n; i++ )
82 s[i] = r.s[i];
83 }
84
getcv::v_reg85 _Tp get(const int i) const { return s[i]; }
get0cv::v_reg86 _Tp get0() const { return s[0]; }
highcv::v_reg87 v_reg<_Tp, n> high() const
88 {
89 v_reg<_Tp, n> c;
90 int i;
91 for( i = 0; i < n/2; i++ )
92 {
93 c.s[i] = s[i+(n/2)];
94 c.s[i+(n/2)] = 0;
95 }
96 return c;
97 }
98
zerocv::v_reg99 static v_reg<_Tp, n> zero()
100 {
101 v_reg<_Tp, n> c;
102 for( int i = 0; i < n; i++ )
103 c.s[i] = (_Tp)0;
104 return c;
105 }
106
allcv::v_reg107 static v_reg<_Tp, n> all(_Tp s)
108 {
109 v_reg<_Tp, n> c;
110 for( int i = 0; i < n; i++ )
111 c.s[i] = s;
112 return c;
113 }
114
reinterpret_ascv::v_reg115 template<typename _Tp2, int n2> v_reg<_Tp2, n2> reinterpret_as() const
116 {
117 size_t bytes = std::min(sizeof(_Tp2)*n2, sizeof(_Tp)*n);
118 v_reg<_Tp2, n2> c;
119 memcpy(&c.s[0], &s[0], bytes);
120 return c;
121 }
122
123 _Tp s[n];
124 };
125
126 #define OPENCV_HAL_IMPL_BIN_OP(bin_op) \
127 template<typename _Tp, int n> inline v_reg<_Tp, n> \
128 operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
129 { \
130 v_reg<_Tp, n> c; \
131 for( int i = 0; i < n; i++ ) \
132 c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
133 return c; \
134 } \
135 template<typename _Tp, int n> inline v_reg<_Tp, n>& \
136 operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
137 { \
138 for( int i = 0; i < n; i++ ) \
139 a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
140 return a; \
141 }
142
143 OPENCV_HAL_IMPL_BIN_OP(+)
144 OPENCV_HAL_IMPL_BIN_OP(-)
145 OPENCV_HAL_IMPL_BIN_OP(*)
146 OPENCV_HAL_IMPL_BIN_OP(/)
147
148 #define OPENCV_HAL_IMPL_BIT_OP(bit_op) \
149 template<typename _Tp, int n> inline v_reg<_Tp, n> operator bit_op \
150 (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
151 { \
152 v_reg<_Tp, n> c; \
153 typedef typename V_TypeTraits<_Tp>::int_type itype; \
154 for( int i = 0; i < n; i++ ) \
155 c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
156 V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
157 return c; \
158 } \
159 template<typename _Tp, int n> inline v_reg<_Tp, n>& operator \
160 bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
161 { \
162 typedef typename V_TypeTraits<_Tp>::int_type itype; \
163 for( int i = 0; i < n; i++ ) \
164 a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
165 V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
166 return a; \
167 }
168
169 OPENCV_HAL_IMPL_BIT_OP(&)
170 OPENCV_HAL_IMPL_BIT_OP(|)
171 OPENCV_HAL_IMPL_BIT_OP(^)
172
/**
 * @brief Lane-wise bitwise NOT.
 *
 * Each lane is converted with reinterpret_int, complemented, and converted
 * back with reinterpret_from_int.
 */
template<typename _Tp, int n> inline v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a)
{
    typedef V_TypeTraits<_Tp> Traits;
    v_reg<_Tp, n> result;
    for( int lane = 0; lane < n; ++lane )
    {
        result.s[lane] = Traits::reinterpret_from_int(~Traits::reinterpret_int(a.s[lane]));
    }
    return result;
}
180
181 #define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \
182 template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \
183 { \
184 v_reg<_Tp2, n> c; \
185 for( int i = 0; i < n; i++ ) \
186 c.s[i] = cfunc(a.s[i]); \
187 return c; \
188 }
189
OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt,std::sqrt,_Tp)190 OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp)
191 OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
192 OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
193 OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
194 OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
195 OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs,
196 typename V_TypeTraits<_Tp>::abs_type)
197 OPENCV_HAL_IMPL_MATH_FUNC(v_round, cvRound, int)
198 OPENCV_HAL_IMPL_MATH_FUNC(v_floor, cvFloor, int)
199 OPENCV_HAL_IMPL_MATH_FUNC(v_ceil, cvCeil, int)
200 OPENCV_HAL_IMPL_MATH_FUNC(v_trunc, int, int)
201
202 #define OPENCV_HAL_IMPL_MINMAX_FUNC(func, hfunc, cfunc) \
203 template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
204 { \
205 v_reg<_Tp, n> c; \
206 for( int i = 0; i < n; i++ ) \
207 c.s[i] = cfunc(a.s[i], b.s[i]); \
208 return c; \
209 } \
210 template<typename _Tp, int n> inline _Tp hfunc(const v_reg<_Tp, n>& a) \
211 { \
212 _Tp c = a.s[0]; \
213 for( int i = 1; i < n; i++ ) \
214 c = cfunc(c, a.s[i]); \
215 return c; \
216 }
217
218 OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, v_reduce_min, std::min)
219 OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, v_reduce_max, std::max)
220
221 template<typename _Tp, int n>
222 inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
223 v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval )
224 {
225 for( int i = 0; i < n; i++ )
226 {
227 minval.s[i] = std::min(a.s[i], b.s[i]);
228 maxval.s[i] = std::max(a.s[i], b.s[i]);
229 }
230 }
231
232
233 #define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \
234 template<typename _Tp, int n> \
235 inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
236 { \
237 typedef typename V_TypeTraits<_Tp>::int_type itype; \
238 v_reg<_Tp, n> c; \
239 for( int i = 0; i < n; i++ ) \
240 c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)-(int)(a.s[i] cmp_op b.s[i])); \
241 return c; \
242 }
243
244 OPENCV_HAL_IMPL_CMP_OP(<)
245 OPENCV_HAL_IMPL_CMP_OP(>)
246 OPENCV_HAL_IMPL_CMP_OP(<=)
247 OPENCV_HAL_IMPL_CMP_OP(>=)
248 OPENCV_HAL_IMPL_CMP_OP(==)
249 OPENCV_HAL_IMPL_CMP_OP(!=)
250
251 #define OPENCV_HAL_IMPL_ADD_SUB_OP(func, bin_op, cast_op, _Tp2) \
252 template<typename _Tp, int n> \
253 inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
254 { \
255 typedef _Tp2 rtype; \
256 v_reg<rtype, n> c; \
257 for( int i = 0; i < n; i++ ) \
258 c.s[i] = cast_op(a.s[i] bin_op b.s[i]); \
259 return c; \
260 }
261
262 OPENCV_HAL_IMPL_ADD_SUB_OP(v_add_wrap, +, (_Tp), _Tp)
263 OPENCV_HAL_IMPL_ADD_SUB_OP(v_sub_wrap, -, (_Tp), _Tp)
264 OPENCV_HAL_IMPL_ADD_SUB_OP(v_absdiff, -, (rtype)std::abs, typename V_TypeTraits<_Tp>::abs_type)
265
266 template<typename _Tp, int n>
v_invsqrt(const v_reg<_Tp,n> & a)267 inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a)
268 {
269 v_reg<_Tp, n> c;
270 for( int i = 0; i < n; i++ )
271 c.s[i] = 1.f/std::sqrt(a.s[i]);
272 return c;
273 }
274
275 template<typename _Tp, int n>
v_magnitude(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b)276 inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
277 {
278 v_reg<_Tp, n> c;
279 for( int i = 0; i < n; i++ )
280 c.s[i] = std::sqrt(a.s[i]*a.s[i] + b.s[i]*b.s[i]);
281 return c;
282 }
283
284
285 template<typename _Tp, int n>
v_sqr_magnitude(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b)286 inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
287 {
288 v_reg<_Tp, n> c;
289 for( int i = 0; i < n; i++ )
290 c.s[i] = a.s[i]*a.s[i] + b.s[i]*b.s[i];
291 return c;
292 }
293
294 template<typename _Tp, int n>
v_muladd(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b,const v_reg<_Tp,n> & c)295 inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
296 const v_reg<_Tp, n>& c)
297 {
298 v_reg<_Tp, n> d;
299 for( int i = 0; i < n; i++ )
300 d.s[i] = a.s[i]*b.s[i] + c.s[i];
301 return d;
302 }
303
304 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_dotprod(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b)305 v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
306 {
307 typedef typename V_TypeTraits<_Tp>::w_type w_type;
308 v_reg<w_type, n/2> c;
309 for( int i = 0; i < (n/2); i++ )
310 c.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1];
311 return c;
312 }
313
v_mul_expand(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b,v_reg<typename V_TypeTraits<_Tp>::w_type,n/2> & c,v_reg<typename V_TypeTraits<_Tp>::w_type,n/2> & d)314 template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
315 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c,
316 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& d)
317 {
318 typedef typename V_TypeTraits<_Tp>::w_type w_type;
319 for( int i = 0; i < (n/2); i++ )
320 {
321 c.s[i] = (w_type)a.s[i]*b.s[i]*2;
322 d.s[i] = (w_type)a.s[i+(n/2)]*b.s[i+(n/2)];
323 }
324 }
325
v_hsum(const v_reg<_Tp,n> & a,v_reg<typename V_TypeTraits<_Tp>::w_type,n/2> & c)326 template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
327 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c)
328 {
329 typedef typename V_TypeTraits<_Tp>::w_type w_type;
330 for( int i = 0; i < (n/2); i++ )
331 {
332 c.s[i] = (w_type)a.s[i*2] + a.s[i*2+1];
333 }
334 }
335
336 #define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \
337 template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
338 { \
339 v_reg<_Tp, n> c; \
340 for( int i = 0; i < n; i++ ) \
341 c.s[i] = (_Tp)(a.s[i] shift_op imm); \
342 return c; \
343 }
344
345 OPENCV_HAL_IMPL_SHIFT_OP(<<)
346 OPENCV_HAL_IMPL_SHIFT_OP(>>)
347
v_reduce_sum(const v_reg<_Tp,n> & a)348 template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a)
349 {
350 typename V_TypeTraits<_Tp>::sum_type c = a.s[0];
351 for( int i = 1; i < n; i++ )
352 c += a.s[i];
353 return c;
354 }
355
v_signmask(const v_reg<_Tp,n> & a)356 template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a)
357 {
358 int mask = 0;
359 for( int i = 0; i < n; i++ )
360 mask |= (V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0) << i;
361 return mask;
362 }
363
v_check_all(const v_reg<_Tp,n> & a)364 template<typename _Tp, int n> inline bool v_check_all(const v_reg<_Tp, n>& a)
365 {
366 for( int i = 0; i < n; i++ )
367 if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) >= 0 )
368 return false;
369 return true;
370 }
371
v_check_any(const v_reg<_Tp,n> & a)372 template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a)
373 {
374 for( int i = 0; i < n; i++ )
375 if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0 )
376 return true;
377 return false;
378 }
379
/**
 * @brief Per-lane blend of two registers under a mask.
 *
 * @param mask  selector; a lane counts as set when reinterpret_int(mask[i])
 *              is negative (this is what the comparison operators produce
 *              for a true result)
 * @param a     values taken where the mask lane is set
 * @param b     values taken where the mask lane is not set
 * @return      register with result[i] = mask[i] ? a[i] : b[i]
 *
 * The operands used to be swapped (a set lane picked b), which contradicted
 * the mask ? a : b convention of the parameter order.
 */
template<typename _Tp, int n> inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask,
                                                           const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
    v_reg<_Tp, n> c;
    for( int i = 0; i < n; i++ )
        c.s[i] = V_TypeTraits<_Tp>::reinterpret_int(mask.s[i]) < 0 ? a.s[i] : b.s[i];
    return c;
}
388
v_expand(const v_reg<_Tp,n> & a,v_reg<typename V_TypeTraits<_Tp>::w_type,n/2> & b0,v_reg<typename V_TypeTraits<_Tp>::w_type,n/2> & b1)389 template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
390 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b0,
391 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b1)
392 {
393 for( int i = 0; i < (n/2); i++ )
394 {
395 b0.s[i] = a.s[i];
396 b1.s[i] = a.s[i+(n/2)];
397 }
398 }
399
400 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::int_type, n>
v_reinterpret_as_int(const v_reg<_Tp,n> & a)401 v_reinterpret_as_int(const v_reg<_Tp, n>& a)
402 {
403 v_reg<typename V_TypeTraits<_Tp>::int_type, n> c;
404 for( int i = 0; i < n; i++ )
405 c.s[i] = V_TypeTraits<_Tp>::reinterpret_int(a.s[i]);
406 return c;
407 }
408
409 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::uint_type, n>
v_reinterpret_as_uint(const v_reg<_Tp,n> & a)410 v_reinterpret_as_uint(const v_reg<_Tp, n>& a)
411 {
412 v_reg<typename V_TypeTraits<_Tp>::uint_type, n> c;
413 for( int i = 0; i < n; i++ )
414 c.s[i] = V_TypeTraits<_Tp>::reinterpret_uint(a.s[i]);
415 return c;
416 }
417
v_zip(const v_reg<_Tp,n> & a0,const v_reg<_Tp,n> & a1,v_reg<_Tp,n> & b0,v_reg<_Tp,n> & b1)418 template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
419 v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 )
420 {
421 int i;
422 for( i = 0; i < n/2; i++ )
423 {
424 b0.s[i*2] = a0.s[i];
425 b0.s[i*2+1] = a1.s[i];
426 }
427 for( ; i < n; i++ )
428 {
429 b1.s[i*2-n] = a0.s[i];
430 b1.s[i*2-n+1] = a1.s[i];
431 }
432 }
433
v_load(const _Tp * ptr)434 template<typename _Tp, int n> inline v_reg<_Tp, n> v_load(const _Tp* ptr)
435 {
436 return v_reg<_Tp, n>(ptr);
437 }
438
v_load_aligned(const _Tp * ptr)439 template<typename _Tp, int n> inline v_reg<_Tp, n> v_load_aligned(const _Tp* ptr)
440 {
441 return v_reg<_Tp, n>(ptr);
442 }
443
v_load_halves(const _Tp * loptr,const _Tp * hiptr)444 template<typename _Tp, int n> inline void v_load_halves(const _Tp* loptr, const _Tp* hiptr)
445 {
446 v_reg<_Tp, n> c;
447 for( int i = 0; i < n/2; i++ )
448 {
449 c.s[i] = loptr[i];
450 c.s[i+n/2] = hiptr[i];
451 }
452 return c;
453 }
454
v_load_expand(const _Tp * ptr)455 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n> v_load_expand(const _Tp* ptr)
456 {
457 typedef typename V_TypeTraits<_Tp>::w_type w_type;
458 v_reg<w_type, n> c;
459 for( int i = 0; i < n; i++ )
460 {
461 c.s[i] = ptr[i];
462 }
463 return c;
464 }
465
466 template<typename _Tp, int n> inline v_reg<typename
v_load_expand_q(const _Tp * ptr)467 V_TypeTraits<typename V_TypeTraits<_Tp>::w_type>::w_type, n> v_load_expand_q(const _Tp* ptr)
468 {
469 typedef typename V_TypeTraits<typename V_TypeTraits<_Tp>::w_type>::w_type w_type;
470 v_reg<w_type, n> c;
471 for( int i = 0; i < n; i++ )
472 {
473 c.s[i] = ptr[i];
474 }
475 return c;
476 }
477
v_load_deinterleave(const _Tp * ptr,v_reg<_Tp,n> & a,v_reg<_Tp,n> & b,v_reg<_Tp,n> & c)478 template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
479 v_reg<_Tp, n>& b, v_reg<_Tp, n>& c)
480 {
481 int i, i3;
482 for( i = i3 = 0; i < n; i++, i3 += 3 )
483 {
484 a.s[i] = ptr[i3];
485 b.s[i] = ptr[i3+1];
486 c.s[i] = ptr[i3+2];
487 }
488 }
489
490 template<typename _Tp, int n>
v_load_deinterleave(const _Tp * ptr,v_reg<_Tp,n> & a,v_reg<_Tp,n> & b,v_reg<_Tp,n> & c,v_reg<_Tp,n> & d)491 inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
492 v_reg<_Tp, n>& b, v_reg<_Tp, n>& c,
493 v_reg<_Tp, n>& d)
494 {
495 int i, i4;
496 for( i = i4 = 0; i < n; i++, i4 += 4 )
497 {
498 a.s[i] = ptr[i4];
499 b.s[i] = ptr[i4+1];
500 c.s[i] = ptr[i4+2];
501 d.s[i] = ptr[i4+3];
502 }
503 }
504
505 template<typename _Tp, int n>
v_store_interleave(_Tp * ptr,const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b,const v_reg<_Tp,n> & c)506 inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
507 const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c)
508 {
509 int i, i3;
510 for( i = i3 = 0; i < n; i++, i3 += 3 )
511 {
512 ptr[i3] = a.s[i];
513 ptr[i3+1] = b.s[i];
514 ptr[i3+2] = c.s[i];
515 }
516 }
517
v_store_interleave(_Tp * ptr,const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b,const v_reg<_Tp,n> & c,const v_reg<_Tp,n> & d)518 template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
519 const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
520 const v_reg<_Tp, n>& d)
521 {
522 int i, i4;
523 for( i = i4 = 0; i < n; i++, i4 += 4 )
524 {
525 ptr[i4] = a.s[i];
526 ptr[i4+1] = b.s[i];
527 ptr[i4+2] = c.s[i];
528 ptr[i4+3] = d.s[i];
529 }
530 }
531
532 template<typename _Tp, int n>
v_store(_Tp * ptr,const v_reg<_Tp,n> & a)533 inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a)
534 {
535 for( int i = 0; i < n; i++ )
536 ptr[i] = a.s[i];
537 }
538
539 template<typename _Tp, int n>
v_store_low(_Tp * ptr,const v_reg<_Tp,n> & a)540 inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a)
541 {
542 for( int i = 0; i < (n/2); i++ )
543 ptr[i] = a.s[i];
544 }
545
546 template<typename _Tp, int n>
v_store_high(_Tp * ptr,const v_reg<_Tp,n> & a)547 inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a)
548 {
549 for( int i = 0; i < (n/2); i++ )
550 ptr[i] = a.s[i+(n/2)];
551 }
552
553 template<typename _Tp, int n>
v_store_aligned(_Tp * ptr,const v_reg<_Tp,n> & a)554 inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
555 {
556 for( int i = 0; i < n; i++ )
557 ptr[i] = a.s[i];
558 }
559
560 template<typename _Tp, int n>
v_combine_low(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b)561 inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
562 {
563 v_reg<_Tp, n> c;
564 for( int i = 0; i < (n/2); i++ )
565 {
566 c.s[i] = a.s[i];
567 c.s[i+(n/2)] = b.s[i];
568 }
569 }
570
571 template<typename _Tp, int n>
v_combine_high(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b)572 inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
573 {
574 v_reg<_Tp, n> c;
575 for( int i = 0; i < (n/2); i++ )
576 {
577 c.s[i] = a.s[i+(n/2)];
578 c.s[i+(n/2)] = b.s[i+(n/2)];
579 }
580 }
581
582 template<typename _Tp, int n>
v_recombine(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b,v_reg<_Tp,n> & low,v_reg<_Tp,n> & high)583 inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
584 v_reg<_Tp, n>& low, v_reg<_Tp, n>& high)
585 {
586 for( int i = 0; i < (n/2); i++ )
587 {
588 low.s[i] = a.s[i];
589 low.s[i+(n/2)] = b.s[i];
590 high.s[i] = a.s[i+(n/2)];
591 high.s[i+(n/2)] = b.s[i+(n/2)];
592 }
593 }
594
/** @brief Converts float lanes to int by applying cvRound to each lane. */
template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
{
    v_reg<int, n> result;
    for( int lane = 0; lane < n; ++lane )
    {
        result.s[lane] = cvRound(a.s[lane]);
    }
    return result;
}
602
/** @brief Converts float lanes to int by applying cvFloor to each lane. */
template<int n> inline v_reg<int, n> v_floor(const v_reg<float, n>& a)
{
    v_reg<int, n> result;
    for( int lane = 0; lane < n; ++lane )
    {
        result.s[lane] = cvFloor(a.s[lane]);
    }
    return result;
}
610
/** @brief Converts float lanes to int by applying cvCeil to each lane. */
template<int n> inline v_reg<int, n> v_ceil(const v_reg<float, n>& a)
{
    v_reg<int, n> result;
    for( int lane = 0; lane < n; ++lane )
    {
        result.s[lane] = cvCeil(a.s[lane]);
    }
    return result;
}
618
/** @brief Converts float lanes to int with a plain (int) cast of each lane. */
template<int n> inline v_reg<int, n> v_trunc(const v_reg<float, n>& a)
{
    v_reg<int, n> result;
    for( int lane = 0; lane < n; ++lane )
    {
        result.s[lane] = (int)(a.s[lane]);
    }
    return result;
}
626
/**
 * @brief Converts double lanes to int by applying cvRound to each lane.
 *
 * The result has 2*n int lanes: the lower n hold the converted values and the
 * upper n are set to 0.
 */
template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a)
{
    v_reg<int, n*2> result;
    for( int lane = 0; lane < n; ++lane )
        result.s[lane] = cvRound(a.s[lane]);
    for( int lane = n; lane < n*2; ++lane )
        result.s[lane] = 0;
    return result;
}
637
/**
 * @brief Converts double lanes to int by applying cvFloor to each lane.
 *
 * The result has 2*n int lanes: the lower n hold the converted values and the
 * upper n are set to 0.
 *
 * The local result used to be declared with only n lanes, which both wrote
 * past the end of its storage and did not match the declared return type.
 */
template<int n> inline v_reg<int, n*2> v_floor(const v_reg<double, n>& a)
{
    v_reg<int, n*2> c;
    for( int i = 0; i < n; i++ )
    {
        c.s[i] = cvFloor(a.s[i]);
        c.s[i+n] = 0;
    }
    return c;
}
648
/**
 * @brief Converts double lanes to int by applying cvCeil to each lane.
 *
 * The result has 2*n int lanes: the lower n hold the converted values and the
 * upper n are set to 0.
 *
 * The local result used to be declared with only n lanes, which both wrote
 * past the end of its storage and did not match the declared return type.
 */
template<int n> inline v_reg<int, n*2> v_ceil(const v_reg<double, n>& a)
{
    v_reg<int, n*2> c;
    for( int i = 0; i < n; i++ )
    {
        c.s[i] = cvCeil(a.s[i]);
        c.s[i+n] = 0;
    }
    return c;
}
659
/**
 * @brief Converts double lanes to int by truncation (plain (int) cast).
 *
 * The result has 2*n int lanes: the lower n hold the converted values and the
 * upper n are set to 0.
 *
 * Two defects are fixed here: the function called cvCeil instead of
 * truncating (unlike the float overload, which uses an (int) cast), and the
 * local result was declared with only n lanes, overrunning its storage and
 * mismatching the declared return type.
 */
template<int n> inline v_reg<int, n*2> v_trunc(const v_reg<double, n>& a)
{
    v_reg<int, n*2> c;
    for( int i = 0; i < n; i++ )
    {
        c.s[i] = (int)(a.s[i]);
        c.s[i+n] = 0;
    }
    return c;
}
670
/** @brief Converts every int lane to float with a (float) cast. */
template<int n> inline v_reg<float, n> v_cvt_f32(const v_reg<int, n>& a)
{
    v_reg<float, n> result;
    for( int lane = 0; lane < n; ++lane )
    {
        result.s[lane] = (float)a.s[lane];
    }
    return result;
}
678
/**
 * @brief Converts the lower n lanes of a 2*n-lane int register to double.
 *
 * The upper n input lanes are not read.
 */
template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<int, n*2>& a)
{
    v_reg<double, n> result;
    for( int lane = 0; lane < n; ++lane )
    {
        result.s[lane] = (double)a.s[lane];
    }
    return result;
}
686
/**
 * @brief Converts the lower n lanes of a 2*n-lane float register to double.
 *
 * The upper n input lanes are not read.
 */
template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<float, n*2>& a)
{
    v_reg<double, n> result;
    for( int lane = 0; lane < n; ++lane )
    {
        result.s[lane] = (double)a.s[lane];
    }
    return result;
}
694
695 template<typename _Tp>
v_transpose4x4(v_reg<_Tp,4> & a0,const v_reg<_Tp,4> & a1,const v_reg<_Tp,4> & a2,const v_reg<_Tp,4> & a3,v_reg<_Tp,4> & b0,v_reg<_Tp,4> & b1,v_reg<_Tp,4> & b2,v_reg<_Tp,4> & b3)696 inline void v_transpose4x4( v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1,
697 const v_reg<_Tp, 4>& a2, const v_reg<_Tp, 4>& a3,
698 v_reg<_Tp, 4>& b0, v_reg<_Tp, 4>& b1,
699 v_reg<_Tp, 4>& b2, v_reg<_Tp, 4>& b3 )
700 {
701 b0 = v_reg<_Tp, 4>(a0.s[0], a1.s[0], a2.s[0], a3.s[0]);
702 b1 = v_reg<_Tp, 4>(a0.s[1], a1.s[1], a2.s[1], a3.s[1]);
703 b2 = v_reg<_Tp, 4>(a0.s[2], a1.s[2], a2.s[2], a3.s[2]);
704 b3 = v_reg<_Tp, 4>(a0.s[3], a1.s[3], a2.s[3], a3.s[3]);
705 }
706
707 typedef v_reg<uchar, 16> v_uint8x16;
708 typedef v_reg<schar, 16> v_int8x16;
709 typedef v_reg<ushort, 8> v_uint16x8;
710 typedef v_reg<short, 8> v_int16x8;
711 typedef v_reg<unsigned, 4> v_uint32x4;
712 typedef v_reg<int, 4> v_int32x4;
713 typedef v_reg<float, 4> v_float32x4;
714 typedef v_reg<float, 8> v_float32x8;
715 typedef v_reg<double, 2> v_float64x2;
716 typedef v_reg<uint64, 2> v_uint64x2;
717 typedef v_reg<int64, 2> v_int64x2;
718
719 #define OPENCV_HAL_IMPL_C_INIT(_Tpvec, _Tp, suffix) \
720 inline _Tpvec v_setzero_##suffix() { return _Tpvec::zero(); } \
721 inline _Tpvec v_setall_##suffix(_Tp val) { return _Tpvec::all(val); } \
722 template<typename _Tp0, int n0> inline _Tpvec \
723 v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \
724 { return a.template reinterpret_as<_Tp, _Tpvec::nlanes>(a); }
725
OPENCV_HAL_IMPL_C_INIT(v_uint8x16,uchar,u8)726 OPENCV_HAL_IMPL_C_INIT(v_uint8x16, uchar, u8)
727 OPENCV_HAL_IMPL_C_INIT(v_int8x16, schar, s8)
728 OPENCV_HAL_IMPL_C_INIT(v_uint16x8, ushort, u16)
729 OPENCV_HAL_IMPL_C_INIT(v_int16x8, short, s16)
730 OPENCV_HAL_IMPL_C_INIT(v_uint32x4, unsigned, u32)
731 OPENCV_HAL_IMPL_C_INIT(v_int32x4, int, s32)
732 OPENCV_HAL_IMPL_C_INIT(v_float32x4, float, f32)
733 OPENCV_HAL_IMPL_C_INIT(v_float64x2, double, f64)
734 OPENCV_HAL_IMPL_C_INIT(v_uint64x2, uint64, u64)
735 OPENCV_HAL_IMPL_C_INIT(v_uint64x2, int64, s64)
736
737 #define OPENCV_HAL_IMPL_C_SHIFT(_Tpvec, _Tp) \
738 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
739 { return a << n; } \
740 template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
741 { return a >> n; } \
742 template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
743 { \
744 _Tpvec c; \
745 for( int i = 0; i < _Tpvec::nlanes; i++ ) \
746 c.s[i] = (_Tp)((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
747 return c; \
748 }
749
750 OPENCV_HAL_IMPL_C_SHIFT(v_uint16x8, ushort)
751 OPENCV_HAL_IMPL_C_SHIFT(v_int16x8, short)
752 OPENCV_HAL_IMPL_C_SHIFT(v_uint32x4, unsigned)
753 OPENCV_HAL_IMPL_C_SHIFT(v_int32x4, int)
754 OPENCV_HAL_IMPL_C_SHIFT(v_uint64x2, uint64)
755 OPENCV_HAL_IMPL_C_SHIFT(v_int64x2, int64)
756
757
758 #define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \
759 inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
760 { \
761 _Tpnvec c; \
762 for( int i = 0; i < _Tpvec::nlanes; i++ ) \
763 { \
764 c.s[i] = saturate_cast<_Tpn>(a.s[i]); \
765 c.s[i+_Tpvec::nlanes] = saturate_cast<_Tpn>(b.s[i]); \
766 } \
767 return c; \
768 } \
769 template<int n> inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
770 { \
771 _Tpnvec c; \
772 for( int i = 0; i < _Tpvec::nlanes; i++ ) \
773 { \
774 c.s[i] = saturate_cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
775 c.s[i+_Tpvec::nlanes] = saturate_cast<_Tpn>((b.s[i] + ((_Tp)1 << (n - 1))) >> n); \
776 } \
777 return c; \
778 } \
779 inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
780 { \
781 for( int i = 0; i < _Tpvec::nlanes; i++ ) \
782 ptr[i] = saturate_cast<_Tpn>(a.s[i]); \
783 } \
784 template<int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
785 { \
786 for( int i = 0; i < _Tpvec::nlanes; i++ ) \
787 ptr[i] = saturate_cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
788 }
789
790 OPENCV_HAL_IMPL_C_PACK(v_uint16x8, ushort, v_uint8x16, uchar, pack)
791 OPENCV_HAL_IMPL_C_PACK(v_int16x8, short, v_int8x16, schar, pack)
792 OPENCV_HAL_IMPL_C_PACK(v_int16x8, short, v_uint8x16, uchar, pack_u)
793 OPENCV_HAL_IMPL_C_PACK(v_uint32x4, unsigned, v_uint16x8, ushort, pack)
794 OPENCV_HAL_IMPL_C_PACK(v_int32x4, int, v_int16x8, short, pack)
795 OPENCV_HAL_IMPL_C_PACK(v_int32x4, int, v_uint16x8, ushort, pack_u)
796 OPENCV_HAL_IMPL_C_PACK(v_uint64x2, uint64, v_uint32x4, unsigned, pack)
797 OPENCV_HAL_IMPL_C_PACK(v_int64x2, int64, v_int32x4, int, pack)
798
799 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
800 const v_float32x4& m1, const v_float32x4& m2,
801 const v_float32x4& m3)
802 {
803 return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + v.s[3]*m3.s[0],
804 v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + v.s[3]*m3.s[1],
805 v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + v.s[3]*m3.s[2],
806 v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + v.s[3]*m3.s[3]);
807 }
808
809 }
810
811 #endif
812