1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                          License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16 // Copyright (C) 2015, Itseez Inc., all rights reserved.
17 // Third party copyrights are property of their respective owners.
18 //
19 // Redistribution and use in source and binary forms, with or without modification,
20 // are permitted provided that the following conditions are met:
21 //
22 //   * Redistribution's of source code must retain the above copyright notice,
23 //     this list of conditions and the following disclaimer.
24 //
25 //   * Redistribution's in binary form must reproduce the above copyright notice,
26 //     this list of conditions and the following disclaimer in the documentation
27 //     and/or other materials provided with the distribution.
28 //
29 //   * The name of the copyright holders may not be used to endorse or promote products
30 //     derived from this software without specific prior written permission.
31 //
32 // This software is provided by the copyright holders and contributors "as is" and
33 // any express or implied warranties, including, but not limited to, the implied
34 // warranties of merchantability and fitness for a particular purpose are disclaimed.
35 // In no event shall the Intel Corporation or contributors be liable for any direct,
36 // indirect, incidental, special, exemplary, or consequential damages
37 // (including, but not limited to, procurement of substitute goods or services;
38 // loss of use, data, or profits; or business interruption) however caused
39 // and on any theory of liability, whether in contract, strict liability,
40 // or tort (including negligence or otherwise) arising in any way out of
41 // the use of this software, even if advised of the possibility of such damage.
42 //
43 //M*/
44 
45 #ifndef __OPENCV_HAL_INTRIN_CPP_HPP__
46 #define __OPENCV_HAL_INTRIN_CPP_HPP__
47 
48 namespace cv
49 {
50 
51 template<typename _Tp, int n> struct v_reg
52 {
53     typedef _Tp lane_type;
54     typedef v_reg<typename V_TypeTraits<_Tp>::int_type, n> int_vec;
55     typedef v_reg<typename V_TypeTraits<_Tp>::abs_type, n> abs_vec;
56     enum { nlanes = n };
57 
v_regcv::v_reg58     explicit v_reg(const _Tp* ptr) { for( int i = 0; i < n; i++ ) s[i] = ptr[i]; }
v_regcv::v_reg59     v_reg(_Tp s0, _Tp s1) { s[0] = s0; s[1] = s1; }
v_regcv::v_reg60     v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) { s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; }
v_regcv::v_reg61     v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
62            _Tp s4, _Tp s5, _Tp s6, _Tp s7)
63     {
64         s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
65         s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
66     }
v_regcv::v_reg67     v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
68            _Tp s4, _Tp s5, _Tp s6, _Tp s7,
69            _Tp s8, _Tp s9, _Tp s10, _Tp s11,
70            _Tp s12, _Tp s13, _Tp s14, _Tp s15)
71     {
72         s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
73         s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
74         s[8] = s8; s[9] = s9; s[10] = s10; s[11] = s11;
75         s[12] = s12; s[13] = s13; s[14] = s14; s[15] = s15;
76     }
77 
v_regcv::v_reg78     v_reg() {}
v_regcv::v_reg79     v_reg(const v_reg<_Tp, n> & r)
80     {
81         for( int i = 0; i < n; i++ )
82             s[i] = r.s[i];
83     }
84 
getcv::v_reg85     _Tp get(const int i) const { return s[i]; }
get0cv::v_reg86     _Tp get0() const { return s[0]; }
highcv::v_reg87     v_reg<_Tp, n> high() const
88     {
89         v_reg<_Tp, n> c;
90         int i;
91         for( i = 0; i < n/2; i++ )
92         {
93             c.s[i] = s[i+(n/2)];
94             c.s[i+(n/2)] = 0;
95         }
96         return c;
97     }
98 
zerocv::v_reg99     static v_reg<_Tp, n> zero()
100     {
101         v_reg<_Tp, n> c;
102         for( int i = 0; i < n; i++ )
103             c.s[i] = (_Tp)0;
104         return c;
105     }
106 
allcv::v_reg107     static v_reg<_Tp, n> all(_Tp s)
108     {
109         v_reg<_Tp, n> c;
110         for( int i = 0; i < n; i++ )
111             c.s[i] = s;
112         return c;
113     }
114 
reinterpret_ascv::v_reg115     template<typename _Tp2, int n2> v_reg<_Tp2, n2> reinterpret_as() const
116     {
117         size_t bytes = std::min(sizeof(_Tp2)*n2, sizeof(_Tp)*n);
118         v_reg<_Tp2, n2> c;
119         memcpy(&c.s[0], &s[0], bytes);
120         return c;
121     }
122 
123     _Tp s[n];
124 };
125 
126 #define OPENCV_HAL_IMPL_BIN_OP(bin_op) \
127 template<typename _Tp, int n> inline v_reg<_Tp, n> \
128     operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
129 { \
130     v_reg<_Tp, n> c; \
131     for( int i = 0; i < n; i++ ) \
132         c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
133     return c; \
134 } \
135 template<typename _Tp, int n> inline v_reg<_Tp, n>& \
136     operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
137 { \
138     for( int i = 0; i < n; i++ ) \
139         a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
140     return a; \
141 }
142 
143 OPENCV_HAL_IMPL_BIN_OP(+)
144 OPENCV_HAL_IMPL_BIN_OP(-)
145 OPENCV_HAL_IMPL_BIN_OP(*)
146 OPENCV_HAL_IMPL_BIN_OP(/)
147 
148 #define OPENCV_HAL_IMPL_BIT_OP(bit_op) \
149 template<typename _Tp, int n> inline v_reg<_Tp, n> operator bit_op \
150     (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
151 { \
152     v_reg<_Tp, n> c; \
153     typedef typename V_TypeTraits<_Tp>::int_type itype; \
154     for( int i = 0; i < n; i++ ) \
155         c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
156                                                         V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
157     return c; \
158 } \
159 template<typename _Tp, int n> inline v_reg<_Tp, n>& operator \
160     bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
161 { \
162     typedef typename V_TypeTraits<_Tp>::int_type itype; \
163     for( int i = 0; i < n; i++ ) \
164         a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
165                                                         V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
166     return a; \
167 }
168 
169 OPENCV_HAL_IMPL_BIT_OP(&)
170 OPENCV_HAL_IMPL_BIT_OP(|)
171 OPENCV_HAL_IMPL_BIT_OP(^)
172 
// Lane-wise bitwise NOT, performed on the integer reinterpretation of each
// lane and converted back with reinterpret_from_int.
template<typename _Tp, int n> inline v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a)
{
    typedef V_TypeTraits<_Tp> Traits;
    v_reg<_Tp, n> res;
    for( int lane = 0; lane < n; ++lane )
    {
        res.s[lane] = Traits::reinterpret_from_int(~Traits::reinterpret_int(a.s[lane]));
    }
    return res;
}
180 
181 #define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \
182 template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \
183 { \
184     v_reg<_Tp2, n> c; \
185     for( int i = 0; i < n; i++ ) \
186         c.s[i] = cfunc(a.s[i]); \
187     return c; \
188 }
189 
OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt,std::sqrt,_Tp)190 OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp)
191 OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
192 OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
193 OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
194 OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
195 OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs,
196                           typename V_TypeTraits<_Tp>::abs_type)
197 OPENCV_HAL_IMPL_MATH_FUNC(v_round, cvRound, int)
198 OPENCV_HAL_IMPL_MATH_FUNC(v_floor, cvFloor, int)
199 OPENCV_HAL_IMPL_MATH_FUNC(v_ceil, cvCeil, int)
200 OPENCV_HAL_IMPL_MATH_FUNC(v_trunc, int, int)
201 
202 #define OPENCV_HAL_IMPL_MINMAX_FUNC(func, hfunc, cfunc) \
203 template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
204 { \
205     v_reg<_Tp, n> c; \
206     for( int i = 0; i < n; i++ ) \
207         c.s[i] = cfunc(a.s[i], b.s[i]); \
208     return c; \
209 } \
210 template<typename _Tp, int n> inline _Tp hfunc(const v_reg<_Tp, n>& a) \
211 { \
212     _Tp c = a.s[0]; \
213     for( int i = 1; i < n; i++ ) \
214         c = cfunc(c, a.s[i]); \
215     return c; \
216 }
217 
218 OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, v_reduce_min, std::min)
219 OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, v_reduce_max, std::max)
220 
221 template<typename _Tp, int n>
222 inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
223                       v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval )
224 {
225     for( int i = 0; i < n; i++ )
226     {
227         minval.s[i] = std::min(a.s[i], b.s[i]);
228         maxval.s[i] = std::max(a.s[i], b.s[i]);
229     }
230 }
231 
232 
233 #define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \
234 template<typename _Tp, int n> \
235 inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
236 { \
237     typedef typename V_TypeTraits<_Tp>::int_type itype; \
238     v_reg<_Tp, n> c; \
239     for( int i = 0; i < n; i++ ) \
240         c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)-(int)(a.s[i] cmp_op b.s[i])); \
241     return c; \
242 }
243 
244 OPENCV_HAL_IMPL_CMP_OP(<)
245 OPENCV_HAL_IMPL_CMP_OP(>)
246 OPENCV_HAL_IMPL_CMP_OP(<=)
247 OPENCV_HAL_IMPL_CMP_OP(>=)
248 OPENCV_HAL_IMPL_CMP_OP(==)
249 OPENCV_HAL_IMPL_CMP_OP(!=)
250 
251 #define OPENCV_HAL_IMPL_ADD_SUB_OP(func, bin_op, cast_op, _Tp2) \
252 template<typename _Tp, int n> \
253 inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
254 { \
255     typedef _Tp2 rtype; \
256     v_reg<rtype, n> c; \
257     for( int i = 0; i < n; i++ ) \
258         c.s[i] = cast_op(a.s[i] bin_op b.s[i]); \
259     return c; \
260 }
261 
262 OPENCV_HAL_IMPL_ADD_SUB_OP(v_add_wrap, +, (_Tp), _Tp)
263 OPENCV_HAL_IMPL_ADD_SUB_OP(v_sub_wrap, -, (_Tp), _Tp)
264 OPENCV_HAL_IMPL_ADD_SUB_OP(v_absdiff, -, (rtype)std::abs, typename V_TypeTraits<_Tp>::abs_type)
265 
266 template<typename _Tp, int n>
v_invsqrt(const v_reg<_Tp,n> & a)267 inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a)
268 {
269     v_reg<_Tp, n> c;
270     for( int i = 0; i < n; i++ )
271         c.s[i] = 1.f/std::sqrt(a.s[i]);
272     return c;
273 }
274 
275 template<typename _Tp, int n>
v_magnitude(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b)276 inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
277 {
278     v_reg<_Tp, n> c;
279     for( int i = 0; i < n; i++ )
280         c.s[i] = std::sqrt(a.s[i]*a.s[i] + b.s[i]*b.s[i]);
281     return c;
282 }
283 
284 
285 template<typename _Tp, int n>
v_sqr_magnitude(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b)286 inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
287 {
288     v_reg<_Tp, n> c;
289     for( int i = 0; i < n; i++ )
290         c.s[i] = a.s[i]*a.s[i] + b.s[i]*b.s[i];
291     return c;
292 }
293 
294 template<typename _Tp, int n>
v_muladd(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b,const v_reg<_Tp,n> & c)295 inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
296                               const v_reg<_Tp, n>& c)
297 {
298     v_reg<_Tp, n> d;
299     for( int i = 0; i < n; i++ )
300         d.s[i] = a.s[i]*b.s[i] + c.s[i];
301     return d;
302 }
303 
304 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_dotprod(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b)305     v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
306 {
307     typedef typename V_TypeTraits<_Tp>::w_type w_type;
308     v_reg<w_type, n/2> c;
309     for( int i = 0; i < (n/2); i++ )
310         c.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1];
311     return c;
312 }
313 
v_mul_expand(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b,v_reg<typename V_TypeTraits<_Tp>::w_type,n/2> & c,v_reg<typename V_TypeTraits<_Tp>::w_type,n/2> & d)314 template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
315                                                        v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c,
316                                                        v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& d)
317 {
318     typedef typename V_TypeTraits<_Tp>::w_type w_type;
319     for( int i = 0; i < (n/2); i++ )
320     {
321         c.s[i] = (w_type)a.s[i]*b.s[i]*2;
322         d.s[i] = (w_type)a.s[i+(n/2)]*b.s[i+(n/2)];
323     }
324 }
325 
v_hsum(const v_reg<_Tp,n> & a,v_reg<typename V_TypeTraits<_Tp>::w_type,n/2> & c)326 template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
327                                                  v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c)
328 {
329     typedef typename V_TypeTraits<_Tp>::w_type w_type;
330     for( int i = 0; i < (n/2); i++ )
331     {
332         c.s[i] = (w_type)a.s[i*2] + a.s[i*2+1];
333     }
334 }
335 
336 #define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \
337 template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
338 { \
339     v_reg<_Tp, n> c; \
340     for( int i = 0; i < n; i++ ) \
341         c.s[i] = (_Tp)(a.s[i] shift_op imm); \
342     return c; \
343 }
344 
345 OPENCV_HAL_IMPL_SHIFT_OP(<<)
346 OPENCV_HAL_IMPL_SHIFT_OP(>>)
347 
v_reduce_sum(const v_reg<_Tp,n> & a)348 template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a)
349 {
350     typename V_TypeTraits<_Tp>::sum_type c = a.s[0];
351     for( int i = 1; i < n; i++ )
352         c += a.s[i];
353     return c;
354 }
355 
v_signmask(const v_reg<_Tp,n> & a)356 template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a)
357 {
358     int mask = 0;
359     for( int i = 0; i < n; i++ )
360         mask |= (V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0) << i;
361     return mask;
362 }
363 
v_check_all(const v_reg<_Tp,n> & a)364 template<typename _Tp, int n> inline bool v_check_all(const v_reg<_Tp, n>& a)
365 {
366     for( int i = 0; i < n; i++ )
367         if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) >= 0 )
368             return false;
369     return true;
370 }
371 
v_check_any(const v_reg<_Tp,n> & a)372 template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a)
373 {
374     for( int i = 0; i < n; i++ )
375         if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0 )
376             return true;
377     return false;
378 }
379 
// Lane-wise select: where the mask lane is set (its integer
// reinterpretation is negative, as produced by the comparison operators in
// this header) the result takes the lane from a, otherwise from b.
// Fix: the operands were swapped, so b was returned where the mask was set.
// NOTE(review): the intended a-when-set contract follows the OpenCV
// universal-intrinsics documentation for v_select - confirm against the
// SIMD back ends.
template<typename _Tp, int n> inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask,
                                                           const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
    v_reg<_Tp, n> c;
    for( int i = 0; i < n; i++ )
        c.s[i] = V_TypeTraits<_Tp>::reinterpret_int(mask.s[i]) < 0 ? a.s[i] : b.s[i];
    return c;
}
388 
v_expand(const v_reg<_Tp,n> & a,v_reg<typename V_TypeTraits<_Tp>::w_type,n/2> & b0,v_reg<typename V_TypeTraits<_Tp>::w_type,n/2> & b1)389 template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
390                             v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b0,
391                             v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b1)
392 {
393     for( int i = 0; i < (n/2); i++ )
394     {
395         b0.s[i] = a.s[i];
396         b1.s[i] = a.s[i+(n/2)];
397     }
398 }
399 
400 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::int_type, n>
v_reinterpret_as_int(const v_reg<_Tp,n> & a)401     v_reinterpret_as_int(const v_reg<_Tp, n>& a)
402 {
403     v_reg<typename V_TypeTraits<_Tp>::int_type, n> c;
404     for( int i = 0; i < n; i++ )
405         c.s[i] = V_TypeTraits<_Tp>::reinterpret_int(a.s[i]);
406     return c;
407 }
408 
409 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::uint_type, n>
v_reinterpret_as_uint(const v_reg<_Tp,n> & a)410     v_reinterpret_as_uint(const v_reg<_Tp, n>& a)
411 {
412     v_reg<typename V_TypeTraits<_Tp>::uint_type, n> c;
413     for( int i = 0; i < n; i++ )
414         c.s[i] = V_TypeTraits<_Tp>::reinterpret_uint(a.s[i]);
415     return c;
416 }
417 
v_zip(const v_reg<_Tp,n> & a0,const v_reg<_Tp,n> & a1,v_reg<_Tp,n> & b0,v_reg<_Tp,n> & b1)418 template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
419                                                v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 )
420 {
421     int i;
422     for( i = 0; i < n/2; i++ )
423     {
424         b0.s[i*2] = a0.s[i];
425         b0.s[i*2+1] = a1.s[i];
426     }
427     for( ; i < n; i++ )
428     {
429         b1.s[i*2-n] = a0.s[i];
430         b1.s[i*2-n+1] = a1.s[i];
431     }
432 }
433 
v_load(const _Tp * ptr)434 template<typename _Tp, int n> inline v_reg<_Tp, n> v_load(const _Tp* ptr)
435 {
436     return v_reg<_Tp, n>(ptr);
437 }
438 
v_load_aligned(const _Tp * ptr)439 template<typename _Tp, int n> inline v_reg<_Tp, n> v_load_aligned(const _Tp* ptr)
440 {
441     return v_reg<_Tp, n>(ptr);
442 }
443 
v_load_halves(const _Tp * loptr,const _Tp * hiptr)444 template<typename _Tp, int n> inline void v_load_halves(const _Tp* loptr, const _Tp* hiptr)
445 {
446     v_reg<_Tp, n> c;
447     for( int i = 0; i < n/2; i++ )
448     {
449         c.s[i] = loptr[i];
450         c.s[i+n/2] = hiptr[i];
451     }
452     return c;
453 }
454 
v_load_expand(const _Tp * ptr)455 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n> v_load_expand(const _Tp* ptr)
456 {
457     typedef typename V_TypeTraits<_Tp>::w_type w_type;
458     v_reg<w_type, n> c;
459     for( int i = 0; i < n; i++ )
460     {
461         c.s[i] = ptr[i];
462     }
463     return c;
464 }
465 
466 template<typename _Tp, int n> inline v_reg<typename
v_load_expand_q(const _Tp * ptr)467     V_TypeTraits<typename V_TypeTraits<_Tp>::w_type>::w_type, n> v_load_expand_q(const _Tp* ptr)
468 {
469     typedef typename V_TypeTraits<typename V_TypeTraits<_Tp>::w_type>::w_type w_type;
470     v_reg<w_type, n> c;
471     for( int i = 0; i < n; i++ )
472     {
473         c.s[i] = ptr[i];
474     }
475     return c;
476 }
477 
v_load_deinterleave(const _Tp * ptr,v_reg<_Tp,n> & a,v_reg<_Tp,n> & b,v_reg<_Tp,n> & c)478 template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
479                                                             v_reg<_Tp, n>& b, v_reg<_Tp, n>& c)
480 {
481     int i, i3;
482     for( i = i3 = 0; i < n; i++, i3 += 3 )
483     {
484         a.s[i] = ptr[i3];
485         b.s[i] = ptr[i3+1];
486         c.s[i] = ptr[i3+2];
487     }
488 }
489 
490 template<typename _Tp, int n>
v_load_deinterleave(const _Tp * ptr,v_reg<_Tp,n> & a,v_reg<_Tp,n> & b,v_reg<_Tp,n> & c,v_reg<_Tp,n> & d)491 inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
492                                 v_reg<_Tp, n>& b, v_reg<_Tp, n>& c,
493                                 v_reg<_Tp, n>& d)
494 {
495     int i, i4;
496     for( i = i4 = 0; i < n; i++, i4 += 4 )
497     {
498         a.s[i] = ptr[i4];
499         b.s[i] = ptr[i4+1];
500         c.s[i] = ptr[i4+2];
501         d.s[i] = ptr[i4+3];
502     }
503 }
504 
505 template<typename _Tp, int n>
v_store_interleave(_Tp * ptr,const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b,const v_reg<_Tp,n> & c)506 inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
507                                 const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c)
508 {
509     int i, i3;
510     for( i = i3 = 0; i < n; i++, i3 += 3 )
511     {
512         ptr[i3] = a.s[i];
513         ptr[i3+1] = b.s[i];
514         ptr[i3+2] = c.s[i];
515     }
516 }
517 
v_store_interleave(_Tp * ptr,const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b,const v_reg<_Tp,n> & c,const v_reg<_Tp,n> & d)518 template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
519                                                             const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
520                                                             const v_reg<_Tp, n>& d)
521 {
522     int i, i4;
523     for( i = i4 = 0; i < n; i++, i4 += 4 )
524     {
525         ptr[i4] = a.s[i];
526         ptr[i4+1] = b.s[i];
527         ptr[i4+2] = c.s[i];
528         ptr[i4+3] = d.s[i];
529     }
530 }
531 
532 template<typename _Tp, int n>
v_store(_Tp * ptr,const v_reg<_Tp,n> & a)533 inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a)
534 {
535     for( int i = 0; i < n; i++ )
536         ptr[i] = a.s[i];
537 }
538 
539 template<typename _Tp, int n>
v_store_low(_Tp * ptr,const v_reg<_Tp,n> & a)540 inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a)
541 {
542     for( int i = 0; i < (n/2); i++ )
543         ptr[i] = a.s[i];
544 }
545 
546 template<typename _Tp, int n>
v_store_high(_Tp * ptr,const v_reg<_Tp,n> & a)547 inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a)
548 {
549     for( int i = 0; i < (n/2); i++ )
550         ptr[i] = a.s[i+(n/2)];
551 }
552 
553 template<typename _Tp, int n>
v_store_aligned(_Tp * ptr,const v_reg<_Tp,n> & a)554 inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
555 {
556     for( int i = 0; i < n; i++ )
557         ptr[i] = a.s[i];
558 }
559 
560 template<typename _Tp, int n>
v_combine_low(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b)561 inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
562 {
563     v_reg<_Tp, n> c;
564     for( int i = 0; i < (n/2); i++ )
565     {
566         c.s[i] = a.s[i];
567         c.s[i+(n/2)] = b.s[i];
568     }
569 }
570 
571 template<typename _Tp, int n>
v_combine_high(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b)572 inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
573 {
574     v_reg<_Tp, n> c;
575     for( int i = 0; i < (n/2); i++ )
576     {
577         c.s[i] = a.s[i+(n/2)];
578         c.s[i+(n/2)] = b.s[i+(n/2)];
579     }
580 }
581 
582 template<typename _Tp, int n>
v_recombine(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b,v_reg<_Tp,n> & low,v_reg<_Tp,n> & high)583 inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
584                         v_reg<_Tp, n>& low, v_reg<_Tp, n>& high)
585 {
586     for( int i = 0; i < (n/2); i++ )
587     {
588         low.s[i] = a.s[i];
589         low.s[i+(n/2)] = b.s[i];
590         high.s[i] = a.s[i+(n/2)];
591         high.s[i+(n/2)] = b.s[i+(n/2)];
592     }
593 }
594 
// float -> int conversion of every lane using cvRound.
template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
{
    v_reg<int, n> res;
    for( int lane = 0; lane < n; ++lane )
    {
        res.s[lane] = cvRound(a.s[lane]);
    }
    return res;
}
602 
// float -> int conversion of every lane using cvFloor.
template<int n> inline v_reg<int, n> v_floor(const v_reg<float, n>& a)
{
    v_reg<int, n> res;
    for( int lane = 0; lane < n; ++lane )
    {
        res.s[lane] = cvFloor(a.s[lane]);
    }
    return res;
}
610 
// float -> int conversion of every lane using cvCeil.
template<int n> inline v_reg<int, n> v_ceil(const v_reg<float, n>& a)
{
    v_reg<int, n> res;
    for( int lane = 0; lane < n; ++lane )
    {
        res.s[lane] = cvCeil(a.s[lane]);
    }
    return res;
}
618 
// float -> int conversion of every lane using a plain (int) cast.
template<int n> inline v_reg<int, n> v_trunc(const v_reg<float, n>& a)
{
    v_reg<int, n> res;
    for( int lane = 0; lane < n; ++lane )
    {
        res.s[lane] = (int)(a.s[lane]);
    }
    return res;
}
626 
// double -> int conversion using cvRound. The result has 2*n int lanes:
// the first n hold the converted values, the remaining n are set to 0.
template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a)
{
    v_reg<int, n*2> res;
    int lane = 0;
    while( lane < n )
    {
        res.s[lane] = cvRound(a.s[lane]);
        res.s[lane+n] = 0;
        ++lane;
    }
    return res;
}
637 
// double -> int conversion using cvFloor. The result has 2*n int lanes:
// the first n hold the converted values, the remaining n are set to 0.
// Fix: the local result was declared with only n lanes, so the writes to
// c.s[i+n] were out of bounds and the value did not match the declared
// return type.
template<int n> inline v_reg<int, n*2> v_floor(const v_reg<double, n>& a)
{
    v_reg<int, n*2> c;
    for( int i = 0; i < n; i++ )
    {
        c.s[i] = cvFloor(a.s[i]);
        c.s[i+n] = 0;
    }
    return c;
}
648 
// double -> int conversion using cvCeil. The result has 2*n int lanes:
// the first n hold the converted values, the remaining n are set to 0.
// Fix: the local result was declared with only n lanes, so the writes to
// c.s[i+n] were out of bounds and the value did not match the declared
// return type.
template<int n> inline v_reg<int, n*2> v_ceil(const v_reg<double, n>& a)
{
    v_reg<int, n*2> c;
    for( int i = 0; i < n; i++ )
    {
        c.s[i] = cvCeil(a.s[i]);
        c.s[i+n] = 0;
    }
    return c;
}
659 
// double -> int conversion by truncation (plain (int) cast, matching the
// float overload of v_trunc). The result has 2*n int lanes: the first n
// hold the converted values, the remaining n are set to 0.
// Fixes: (1) the lanes were converted with cvCeil instead of being
// truncated; (2) the local result was declared with only n lanes, so the
// writes to c.s[i+n] were out of bounds and the value did not match the
// declared return type.
template<int n> inline v_reg<int, n*2> v_trunc(const v_reg<double, n>& a)
{
    v_reg<int, n*2> c;
    for( int i = 0; i < n; i++ )
    {
        c.s[i] = (int)(a.s[i]);
        c.s[i+n] = 0;
    }
    return c;
}
670 
// int -> float conversion of every lane.
template<int n> inline v_reg<float, n> v_cvt_f32(const v_reg<int, n>& a)
{
    v_reg<float, n> res;
    for( int lane = 0; lane < n; ++lane )
    {
        res.s[lane] = (float)a.s[lane];
    }
    return res;
}
678 
// int -> double conversion of the first n lanes of a 2*n-lane register;
// the upper n input lanes are not read.
// NOTE(review): n only appears as "n*2" in the parameter type, so it
// presumably cannot be deduced and has to be given explicitly - confirm
// against callers.
template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<int, n*2>& a)
{
    v_reg<double, n> res;
    for( int lane = 0; lane < n; ++lane )
    {
        res.s[lane] = (double)a.s[lane];
    }
    return res;
}
686 
// float -> double conversion of the first n lanes of a 2*n-lane register;
// the upper n input lanes are not read.
// NOTE(review): n only appears as "n*2" in the parameter type, so it
// presumably cannot be deduced and has to be given explicitly - confirm
// against callers.
template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<float, n*2>& a)
{
    v_reg<double, n> res;
    for( int lane = 0; lane < n; ++lane )
    {
        res.s[lane] = (double)a.s[lane];
    }
    return res;
}
694 
695 template<typename _Tp>
v_transpose4x4(v_reg<_Tp,4> & a0,const v_reg<_Tp,4> & a1,const v_reg<_Tp,4> & a2,const v_reg<_Tp,4> & a3,v_reg<_Tp,4> & b0,v_reg<_Tp,4> & b1,v_reg<_Tp,4> & b2,v_reg<_Tp,4> & b3)696 inline void v_transpose4x4( v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1,
697                             const v_reg<_Tp, 4>& a2, const v_reg<_Tp, 4>& a3,
698                             v_reg<_Tp, 4>& b0, v_reg<_Tp, 4>& b1,
699                             v_reg<_Tp, 4>& b2, v_reg<_Tp, 4>& b3 )
700 {
701     b0 = v_reg<_Tp, 4>(a0.s[0], a1.s[0], a2.s[0], a3.s[0]);
702     b1 = v_reg<_Tp, 4>(a0.s[1], a1.s[1], a2.s[1], a3.s[1]);
703     b2 = v_reg<_Tp, 4>(a0.s[2], a1.s[2], a2.s[2], a3.s[2]);
704     b3 = v_reg<_Tp, 4>(a0.s[3], a1.s[3], a2.s[3], a3.s[3]);
705 }
706 
707 typedef v_reg<uchar, 16> v_uint8x16;
708 typedef v_reg<schar, 16> v_int8x16;
709 typedef v_reg<ushort, 8> v_uint16x8;
710 typedef v_reg<short, 8> v_int16x8;
711 typedef v_reg<unsigned, 4> v_uint32x4;
712 typedef v_reg<int, 4> v_int32x4;
713 typedef v_reg<float, 4> v_float32x4;
714 typedef v_reg<float, 8> v_float32x8;
715 typedef v_reg<double, 2> v_float64x2;
716 typedef v_reg<uint64, 2> v_uint64x2;
717 typedef v_reg<int64, 2> v_int64x2;
718 
719 #define OPENCV_HAL_IMPL_C_INIT(_Tpvec, _Tp, suffix) \
720 inline _Tpvec v_setzero_##suffix() { return _Tpvec::zero(); } \
721 inline _Tpvec v_setall_##suffix(_Tp val) { return _Tpvec::all(val); } \
722 template<typename _Tp0, int n0> inline _Tpvec \
723     v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \
724 { return a.template reinterpret_as<_Tp, _Tpvec::nlanes>(a); }
725 
OPENCV_HAL_IMPL_C_INIT(v_uint8x16,uchar,u8)726 OPENCV_HAL_IMPL_C_INIT(v_uint8x16, uchar, u8)
727 OPENCV_HAL_IMPL_C_INIT(v_int8x16, schar, s8)
728 OPENCV_HAL_IMPL_C_INIT(v_uint16x8, ushort, u16)
729 OPENCV_HAL_IMPL_C_INIT(v_int16x8, short, s16)
730 OPENCV_HAL_IMPL_C_INIT(v_uint32x4, unsigned, u32)
731 OPENCV_HAL_IMPL_C_INIT(v_int32x4, int, s32)
732 OPENCV_HAL_IMPL_C_INIT(v_float32x4, float, f32)
733 OPENCV_HAL_IMPL_C_INIT(v_float64x2, double, f64)
734 OPENCV_HAL_IMPL_C_INIT(v_uint64x2, uint64, u64)
735 OPENCV_HAL_IMPL_C_INIT(v_uint64x2, int64, s64)
736 
737 #define OPENCV_HAL_IMPL_C_SHIFT(_Tpvec, _Tp) \
738 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
739 { return a << n; } \
740 template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
741 { return a >> n; } \
742 template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
743 { \
744     _Tpvec c; \
745     for( int i = 0; i < _Tpvec::nlanes; i++ ) \
746         c.s[i] = (_Tp)((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
747     return c; \
748 }
749 
750 OPENCV_HAL_IMPL_C_SHIFT(v_uint16x8, ushort)
751 OPENCV_HAL_IMPL_C_SHIFT(v_int16x8, short)
752 OPENCV_HAL_IMPL_C_SHIFT(v_uint32x4, unsigned)
753 OPENCV_HAL_IMPL_C_SHIFT(v_int32x4, int)
754 OPENCV_HAL_IMPL_C_SHIFT(v_uint64x2, uint64)
755 OPENCV_HAL_IMPL_C_SHIFT(v_int64x2, int64)
756 
757 
758 #define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \
759 inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
760 { \
761     _Tpnvec c; \
762     for( int i = 0; i < _Tpvec::nlanes; i++ ) \
763     { \
764         c.s[i] = saturate_cast<_Tpn>(a.s[i]); \
765         c.s[i+_Tpvec::nlanes] = saturate_cast<_Tpn>(b.s[i]); \
766     } \
767     return c; \
768 } \
769 template<int n> inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
770 { \
771     _Tpnvec c; \
772     for( int i = 0; i < _Tpvec::nlanes; i++ ) \
773     { \
774         c.s[i] = saturate_cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
775         c.s[i+_Tpvec::nlanes] = saturate_cast<_Tpn>((b.s[i] + ((_Tp)1 << (n - 1))) >> n); \
776     } \
777     return c; \
778 } \
779 inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
780 { \
781     for( int i = 0; i < _Tpvec::nlanes; i++ ) \
782         ptr[i] = saturate_cast<_Tpn>(a.s[i]); \
783 } \
784 template<int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
785 { \
786     for( int i = 0; i < _Tpvec::nlanes; i++ ) \
787         ptr[i] = saturate_cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
788 }
789 
790 OPENCV_HAL_IMPL_C_PACK(v_uint16x8, ushort, v_uint8x16, uchar, pack)
791 OPENCV_HAL_IMPL_C_PACK(v_int16x8, short, v_int8x16, schar, pack)
792 OPENCV_HAL_IMPL_C_PACK(v_int16x8, short, v_uint8x16, uchar, pack_u)
793 OPENCV_HAL_IMPL_C_PACK(v_uint32x4, unsigned, v_uint16x8, ushort, pack)
794 OPENCV_HAL_IMPL_C_PACK(v_int32x4, int, v_int16x8, short, pack)
795 OPENCV_HAL_IMPL_C_PACK(v_int32x4, int, v_uint16x8, ushort, pack_u)
796 OPENCV_HAL_IMPL_C_PACK(v_uint64x2, uint64, v_uint32x4, unsigned, pack)
797 OPENCV_HAL_IMPL_C_PACK(v_int64x2, int64, v_int32x4, int, pack)
798 
799 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
800                             const v_float32x4& m1, const v_float32x4& m2,
801                             const v_float32x4& m3)
802 {
803     return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + v.s[3]*m3.s[0],
804                        v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + v.s[3]*m3.s[1],
805                        v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + v.s[3]*m3.s[2],
806                        v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + v.s[3]*m3.s[3]);
807 }
808 
809 }
810 
811 #endif
812