1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                          License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16 // Copyright (C) 2015, Itseez Inc., all rights reserved.
17 // Third party copyrights are property of their respective owners.
18 //
19 // Redistribution and use in source and binary forms, with or without modification,
20 // are permitted provided that the following conditions are met:
21 //
22 //   * Redistribution's of source code must retain the above copyright notice,
23 //     this list of conditions and the following disclaimer.
24 //
25 //   * Redistribution's in binary form must reproduce the above copyright notice,
26 //     this list of conditions and the following disclaimer in the documentation
27 //     and/or other materials provided with the distribution.
28 //
29 //   * The name of the copyright holders may not be used to endorse or promote products
30 //     derived from this software without specific prior written permission.
31 //
32 // This software is provided by the copyright holders and contributors "as is" and
33 // any express or implied warranties, including, but not limited to, the implied
34 // warranties of merchantability and fitness for a particular purpose are disclaimed.
35 // In no event shall the Intel Corporation or contributors be liable for any direct,
36 // indirect, incidental, special, exemplary, or consequential damages
37 // (including, but not limited to, procurement of substitute goods or services;
38 // loss of use, data, or profits; or business interruption) however caused
39 // and on any theory of liability, whether in contract, strict liability,
40 // or tort (including negligence or otherwise) arising in any way out of
41 // the use of this software, even if advised of the possibility of such damage.
42 //
43 //M*/
44 
45 #ifndef __OPENCV_HAL_INTRIN_NEON_HPP__
46 #define __OPENCV_HAL_INTRIN_NEON_HPP__
47 
48 namespace cv
49 {
50 
51 #define CV_SIMD128 1
52 
53 struct v_uint8x16
54 {
55     typedef uchar lane_type;
56     enum { nlanes = 16 };
57 
v_uint8x16cv::v_uint8x1658     v_uint8x16() {}
v_uint8x16cv::v_uint8x1659     explicit v_uint8x16(uint8x16_t v) : val(v) {}
v_uint8x16cv::v_uint8x1660     v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
61                uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
62     {
63         uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
64         val = vld1q_u8(v);
65     }
get0cv::v_uint8x1666     uchar get0() const
67     {
68         return vgetq_lane_u8(val, 0);
69     }
70 
71     uint8x16_t val;
72 };
73 
74 struct v_int8x16
75 {
76     typedef schar lane_type;
77     enum { nlanes = 16 };
78 
v_int8x16cv::v_int8x1679     v_int8x16() {}
v_int8x16cv::v_int8x1680     explicit v_int8x16(int8x16_t v) : val(v) {}
v_int8x16cv::v_int8x1681     v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
82                schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
83     {
84         schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
85         val = vld1q_s8(v);
86     }
get0cv::v_int8x1687     schar get0() const
88     {
89         return vgetq_lane_s8(val, 0);
90     }
91 
92     int8x16_t val;
93 };
94 
95 struct v_uint16x8
96 {
97     typedef ushort lane_type;
98     enum { nlanes = 8 };
99 
v_uint16x8cv::v_uint16x8100     v_uint16x8() {}
v_uint16x8cv::v_uint16x8101     explicit v_uint16x8(uint16x8_t v) : val(v) {}
v_uint16x8cv::v_uint16x8102     v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
103     {
104         ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
105         val = vld1q_u16(v);
106     }
get0cv::v_uint16x8107     ushort get0() const
108     {
109         return vgetq_lane_u16(val, 0);
110     }
111 
112     uint16x8_t val;
113 };
114 
115 struct v_int16x8
116 {
117     typedef short lane_type;
118     enum { nlanes = 8 };
119 
v_int16x8cv::v_int16x8120     v_int16x8() {}
v_int16x8cv::v_int16x8121     explicit v_int16x8(int16x8_t v) : val(v) {}
v_int16x8cv::v_int16x8122     v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
123     {
124         short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
125         val = vld1q_s16(v);
126     }
get0cv::v_int16x8127     short get0() const
128     {
129         return vgetq_lane_s16(val, 0);
130     }
131 
132     int16x8_t val;
133 };
134 
135 struct v_uint32x4
136 {
137     typedef unsigned lane_type;
138     enum { nlanes = 4 };
139 
v_uint32x4cv::v_uint32x4140     v_uint32x4() {}
v_uint32x4cv::v_uint32x4141     explicit v_uint32x4(uint32x4_t v) : val(v) {}
v_uint32x4cv::v_uint32x4142     v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
143     {
144         unsigned v[] = {v0, v1, v2, v3};
145         val = vld1q_u32(v);
146     }
get0cv::v_uint32x4147     unsigned get0() const
148     {
149         return vgetq_lane_u32(val, 0);
150     }
151 
152     uint32x4_t val;
153 };
154 
155 struct v_int32x4
156 {
157     typedef int lane_type;
158     enum { nlanes = 4 };
159 
v_int32x4cv::v_int32x4160     v_int32x4() {}
v_int32x4cv::v_int32x4161     explicit v_int32x4(int32x4_t v) : val(v) {}
v_int32x4cv::v_int32x4162     v_int32x4(int v0, int v1, int v2, int v3)
163     {
164         int v[] = {v0, v1, v2, v3};
165         val = vld1q_s32(v);
166     }
get0cv::v_int32x4167     int get0() const
168     {
169         return vgetq_lane_s32(val, 0);
170     }
171     int32x4_t val;
172 };
173 
174 struct v_float32x4
175 {
176     typedef float lane_type;
177     enum { nlanes = 4 };
178 
v_float32x4cv::v_float32x4179     v_float32x4() {}
v_float32x4cv::v_float32x4180     explicit v_float32x4(float32x4_t v) : val(v) {}
v_float32x4cv::v_float32x4181     v_float32x4(float v0, float v1, float v2, float v3)
182     {
183         float v[] = {v0, v1, v2, v3};
184         val = vld1q_f32(v);
185     }
get0cv::v_float32x4186     float get0() const
187     {
188         return vgetq_lane_f32(val, 0);
189     }
190     float32x4_t val;
191 };
192 
193 struct v_uint64x2
194 {
195     typedef uint64 lane_type;
196     enum { nlanes = 2 };
197 
v_uint64x2cv::v_uint64x2198     v_uint64x2() {}
v_uint64x2cv::v_uint64x2199     explicit v_uint64x2(uint64x2_t v) : val(v) {}
v_uint64x2cv::v_uint64x2200     v_uint64x2(unsigned v0, unsigned v1)
201     {
202         uint64 v[] = {v0, v1};
203         val = vld1q_u64(v);
204     }
get0cv::v_uint64x2205     uint64 get0() const
206     {
207         return vgetq_lane_u64(val, 0);
208     }
209     uint64x2_t val;
210 };
211 
212 struct v_int64x2
213 {
214     typedef int64 lane_type;
215     enum { nlanes = 2 };
216 
v_int64x2cv::v_int64x2217     v_int64x2() {}
v_int64x2cv::v_int64x2218     explicit v_int64x2(int64x2_t v) : val(v) {}
v_int64x2cv::v_int64x2219     v_int64x2(int v0, int v1)
220     {
221         int64 v[] = {v0, v1};
222         val = vld1q_s64(v);
223     }
get0cv::v_int64x2224     int64 get0() const
225     {
226         return vgetq_lane_s64(val, 0);
227     }
228     int64x2_t val;
229 };
230 
231 #define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
232 inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
233 inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
234 inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \
235 inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \
236 inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \
237 inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(vreinterpretq_u16_##suffix(v.val)); } \
238 inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpretq_s16_##suffix(v.val)); } \
239 inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(vreinterpretq_u32_##suffix(v.val)); } \
240 inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(vreinterpretq_s32_##suffix(v.val)); } \
241 inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(vreinterpretq_u64_##suffix(v.val)); } \
242 inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(vreinterpretq_s64_##suffix(v.val)); } \
243 inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(vreinterpretq_f32_##suffix(v.val)); }
244 
OPENCV_HAL_IMPL_NEON_INIT(uint8x16,uchar,u8)245 OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, u8)
246 OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, s8)
247 OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, u16)
248 OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, s16)
249 OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32)
250 OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32)
251 OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, u64)
252 OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, s64)
253 OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32)
254 
255 #define OPENCV_HAL_IMPL_NEON_PACK(_Tpvec, _Tp, hreg, suffix, _Tpwvec, wsuffix, pack, op) \
256 inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
257 { \
258     hreg a1 = vqmov##op##_##wsuffix(a.val), b1 = vqmov##op##_##wsuffix(b.val); \
259     return _Tpvec(vcombine_##suffix(a1, b1)); \
260 } \
261 inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
262 { \
263     hreg a1 = vqmov##op##_##wsuffix(a.val); \
264     vst1_##suffix(ptr, a1); \
265 } \
266 template<int n> inline \
267 _Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
268 { \
269     hreg a1 = vqrshr##op##_n_##wsuffix(a.val, n); \
270     hreg b1 = vqrshr##op##_n_##wsuffix(b.val, n); \
271     return _Tpvec(vcombine_##suffix(a1, b1)); \
272 } \
273 template<int n> inline \
274 void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
275 { \
276     hreg a1 = vqrshr##op##_n_##wsuffix(a.val, n); \
277     vst1_##suffix(ptr, a1); \
278 }
279 
280 OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_uint16x8, u16, pack, n)
281 OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, s16, pack_u, un)
282 OPENCV_HAL_IMPL_NEON_PACK(v_int8x16, schar, int8x8_t, s8, v_int16x8, s16, pack, n)
283 OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, u32, pack, n)
284 OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, s32, pack_u, un)
285 OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, s32, pack, n)
286 OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, u64, pack, n)
287 OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, s64, pack, n)
288 
289 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
290                             const v_float32x4& m1, const v_float32x4& m2,
291                             const v_float32x4& m3)
292 {
293     float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
294     float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
295     res = vmlaq_lane_f32(res, m1.val, vl, 1);
296     res = vmlaq_lane_f32(res, m2.val, vh, 0);
297     res = vmlaq_lane_f32(res, m3.val, vh, 1);
298     return v_float32x4(res);
299 }
300 
301 #define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \
302 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
303 { \
304     return _Tpvec(intrin(a.val, b.val)); \
305 } \
306 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
307 { \
308     a.val = intrin(a.val, b.val); \
309     return a; \
310 }
311 
312 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint8x16, vqaddq_u8)
313 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint8x16, vqsubq_u8)
314 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8)
315 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8)
316 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16)
317 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16)
318 OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint16x8, vmulq_u16)
319 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16)
320 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16)
321 OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int16x8, vmulq_s16)
322 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32)
323 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32)
324 OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32)
325 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint32x4, vaddq_u32)
326 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint32x4, vsubq_u32)
327 OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint32x4, vmulq_u32)
328 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float32x4, vaddq_f32)
329 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float32x4, vsubq_f32)
330 OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float32x4, vmulq_f32)
331 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int64x2, vaddq_s64)
332 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int64x2, vsubq_s64)
333 OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint64x2, vaddq_u64)
334 OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint64x2, vsubq_u64)
335 
// Per-lane division a / b, computed as a * (1/b). The reciprocal starts from
// the hardware estimate and is refined by two vrecps-based steps.
inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
{
    float32x4_t inv_b = vrecpeq_f32(b.val);                  // initial estimate
    inv_b = vmulq_f32(vrecpsq_f32(b.val, inv_b), inv_b);     // refinement 1
    inv_b = vmulq_f32(vrecpsq_f32(b.val, inv_b), inv_b);     // refinement 2
    const float32x4_t quotient = vmulq_f32(a.val, inv_b);
    return v_float32x4(quotient);
}
operator /=(v_float32x4 & a,const v_float32x4 & b)343 inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
344 {
345     float32x4_t reciprocal = vrecpeq_f32(b.val);
346     reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
347     reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
348     a.val = vmulq_f32(a.val, reciprocal);
349     return a;
350 }
351 
v_mul_expand(const v_int16x8 & a,const v_int16x8 & b,v_int32x4 & c,v_int32x4 & d)352 inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
353                          v_int32x4& c, v_int32x4& d)
354 {
355     c.val = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
356     d.val = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
357 }
358 
v_mul_expand(const v_uint16x8 & a,const v_uint16x8 & b,v_uint32x4 & c,v_uint32x4 & d)359 inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
360                          v_uint32x4& c, v_uint32x4& d)
361 {
362     c.val = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
363     d.val = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
364 }
365 
v_mul_expand(const v_uint32x4 & a,const v_uint32x4 & b,v_uint64x2 & c,v_uint64x2 & d)366 inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
367                          v_uint64x2& c, v_uint64x2& d)
368 {
369     c.val = vmull_u32(vget_low_u32(a.val), vget_low_u32(b.val));
370     d.val = vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val));
371 }
372 
v_dotprod(const v_int16x8 & a,const v_int16x8 & b)373 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
374 {
375     int32x4_t c = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
376     int32x4_t d = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
377     int32x4x2_t cd = vtrnq_s32(c, d);
378     return v_int32x4(vaddq_s32(cd.val[0], cd.val[1]));
379 }
380 
381 #define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
382     OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
383     OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
384     OPENCV_HAL_IMPL_NEON_BIN_OP(^, _Tpvec, veorq_##suffix) \
385     inline _Tpvec operator ~ (const _Tpvec& a) \
386     { \
387         return _Tpvec(vreinterpretq_##suffix##_u8(vmvnq_u8(vreinterpretq_u8_##suffix(a.val)))); \
388     }
389 
390 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint8x16, u8)
391 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int8x16, s8)
392 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint16x8, u16)
393 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int16x8, s16)
394 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint32x4, u32)
395 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int32x4, s32)
396 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64)
397 OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64)
398 
399 #define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \
400 inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
401 { \
402     return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \
403 } \
404 inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
405 { \
406     a.val = vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val))); \
407     return a; \
408 }
409 
410 OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(&, vandq_s32)
411 OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(|, vorrq_s32)
412 OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(^, veorq_s32)
413 
414 inline v_float32x4 operator ~ (const v_float32x4& a)
415 {
416     return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val))));
417 }
418 
// Per-lane square root, computed as x * rsqrt(x). The reciprocal square root
// starts from the hardware estimate and is refined by two vrsqrts-based steps.
inline v_float32x4 v_sqrt(const v_float32x4& x)
{
    // Clamp the argument of the estimate to at least FLT_MIN; the final
    // multiplication still uses the unclamped input.
    const float32x4_t clamped = vmaxq_f32(x.val, vdupq_n_f32(FLT_MIN));

    float32x4_t rsqrt = vrsqrteq_f32(clamped);
    rsqrt = vmulq_f32(vrsqrtsq_f32(vmulq_f32(clamped, rsqrt), rsqrt), rsqrt);
    rsqrt = vmulq_f32(vrsqrtsq_f32(vmulq_f32(clamped, rsqrt), rsqrt), rsqrt);

    const float32x4_t root = vmulq_f32(x.val, rsqrt);
    return v_float32x4(root);
}
427 
// Per-lane reciprocal square root: hardware estimate refined by two
// vrsqrts-based steps. The input is used as-is (no clamping).
inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    float32x4_t rsqrt = vrsqrteq_f32(x.val);
    rsqrt = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, rsqrt), rsqrt), rsqrt);
    rsqrt = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x.val, rsqrt), rsqrt), rsqrt);
    return v_float32x4(rsqrt);
}
435 
v_abs(v_float32x4 x)436 inline v_float32x4 v_abs(v_float32x4 x)
437 { return v_float32x4(vabsq_f32(x.val)); }
438 
439 // TODO: exp, log, sin, cos
440 
441 #define OPENCV_HAL_IMPL_NEON_BIN_FUNC(_Tpvec, func, intrin) \
442 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
443 { \
444     return _Tpvec(intrin(a.val, b.val)); \
445 }
446 
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16,v_min,vminq_u8)447 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_min, vminq_u8)
448 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_max, vmaxq_u8)
449 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_min, vminq_s8)
450 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_max, vmaxq_s8)
451 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_min, vminq_u16)
452 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_max, vmaxq_u16)
453 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_min, vminq_s16)
454 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_max, vmaxq_s16)
455 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_min, vminq_u32)
456 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_max, vmaxq_u32)
457 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_min, vminq_s32)
458 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_max, vmaxq_s32)
459 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_min, vminq_f32)
460 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_max, vmaxq_f32)
461 
462 
463 #define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \
464 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
465 { return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \
466 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
467 { return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \
468 inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
469 { return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \
470 inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
471 { return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \
472 inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
473 { return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \
474 inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
475 { return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); }
476 
477 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8)
478 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, vreinterpretq_s8_u8, s8, u8)
479 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16)
480 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16)
481 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32)
482 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32)
483 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32)
484 
// v_add_wrap / v_sub_wrap for 8- and 16-bit lanes. These map to the plain
// vaddq/vsubq intrinsics, in contrast to operator+ / operator- for the same
// types, which are generated from the saturating vqaddq/vqsubq intrinsics.
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_add_wrap, vaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16)
493 
// v_absdiff: per-lane absolute difference via the vabdq intrinsics.
// Only unsigned integer and float vectors are covered here.
// TODO: absdiff for signed integers
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32)
499 
500 inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
501 {
502     v_float32x4 x(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
503     return v_sqrt(x);
504 }
505 
// Per-lane squared magnitude a*a + b*b.
inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    const float32x4_t a_sq = vmulq_f32(a.val, a.val);
    const float32x4_t sum_sq = vmlaq_f32(a_sq, b.val, b.val);
    return v_float32x4(sum_sq);
}
510 
// Per-lane multiply-accumulate: returns a*b + c.
inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
    // vmlaq_f32 takes the addend first, then the two factors.
    const float32x4_t acc = vmlaq_f32(c.val, a.val, b.val);
    return v_float32x4(acc);
}
515 
516 // trade efficiency for convenience
517 #define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
518 inline _Tpvec operator << (const _Tpvec& a, int n) \
519 { return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \
520 inline _Tpvec operator >> (const _Tpvec& a, int n) \
521 { return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } \
522 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
523 { return _Tpvec(vshlq_n_##suffix(a.val, n)); } \
524 template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
525 { return _Tpvec(vshrq_n_##suffix(a.val, n)); } \
526 template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
527 { return _Tpvec(vrshrq_n_##suffix(a.val, n)); }
528 
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16,u8,schar,s8)529 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8)
530 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8)
531 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint16x8, u16, short, s16)
532 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int16x8, s16, short, s16)
533 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint32x4, u32, int, s32)
534 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, s32, int, s32)
535 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint64x2, u64, int64, s64)
536 OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64)
537 
538 #define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
539 inline _Tpvec v_load(const _Tp* ptr) \
540 { return _Tpvec(vld1q_##suffix(ptr)); } \
541 inline _Tpvec v_load_aligned(const _Tp* ptr) \
542 { return _Tpvec(vld1q_##suffix(ptr)); } \
543 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
544 { return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \
545 inline void v_store(_Tp* ptr, const _Tpvec& a) \
546 { vst1q_##suffix(ptr, a.val); } \
547 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
548 { vst1q_##suffix(ptr, a.val); } \
549 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
550 { vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
551 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
552 { vst1_##suffix(ptr, vget_high_##suffix(a.val)); }
553 
554 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, u8)
555 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, s8)
556 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16)
557 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16)
558 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32)
559 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32)
560 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
561 
562 #define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
563 inline scalartype v_reduce_##func(const _Tpvec& a) \
564 { \
565     scalartype CV_DECL_ALIGNED(16) buf[4]; \
566     v_store_aligned(buf, a); \
567     scalartype s0 = scalar_func(buf[0], buf[1]); \
568     scalartype s1 = scalar_func(buf[2], buf[3]); \
569     return scalar_func(s0, s1); \
570 }
571 
572 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, sum, OPENCV_HAL_ADD)
573 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
574 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
575 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, sum, OPENCV_HAL_ADD)
576 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, max, std::max)
577 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, min, std::min)
578 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, sum, OPENCV_HAL_ADD)
579 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, max, std::max)
580 OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, min, std::min)
581 
582 inline int v_signmask(const v_uint8x16& a)
583 {
584     int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100));
585     uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0));
586     uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));
587     return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8);
588 }
v_signmask(const v_int8x16 & a)589 inline int v_signmask(const v_int8x16& a)
590 { return v_signmask(v_reinterpret_as_u8(a)); }
591 
v_signmask(const v_uint16x8 & a)592 inline int v_signmask(const v_uint16x8& a)
593 {
594     int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000));
595     uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0));
596     uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0));
597     return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4);
598 }
v_signmask(const v_int16x8 & a)599 inline int v_signmask(const v_int16x8& a)
600 { return v_signmask(v_reinterpret_as_u16(a)); }
601 
v_signmask(const v_uint32x4 & a)602 inline int v_signmask(const v_uint32x4& a)
603 {
604     int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000));
605     uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0));
606     uint64x2_t v1 = vpaddlq_u32(v0);
607     return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2);
608 }
v_signmask(const v_int32x4 & a)609 inline int v_signmask(const v_int32x4& a)
610 { return v_signmask(v_reinterpret_as_u32(a)); }
v_signmask(const v_float32x4 & a)611 inline int v_signmask(const v_float32x4& a)
612 { return v_signmask(v_reinterpret_as_u32(a)); }
613 
614 #define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
615 inline bool v_check_all(const v_##_Tpvec& a) \
616 { \
617     _Tpvec##_t v0 = vshrq_n_##suffix(vmvnq_##suffix(a.val), shift); \
618     uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
619     return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) == 0; \
620 } \
621 inline bool v_check_any(const v_##_Tpvec& a) \
622 { \
623     _Tpvec##_t v0 = vshrq_n_##suffix(a.val, shift); \
624     uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
625     return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; \
626 }
627 
628 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7)
629 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15)
630 OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint32x4, u32, 31)
631 
v_check_all(const v_int8x16 & a)632 inline bool v_check_all(const v_int8x16& a)
633 { return v_check_all(v_reinterpret_as_u8(a)); }
v_check_all(const v_int16x8 & a)634 inline bool v_check_all(const v_int16x8& a)
635 { return v_check_all(v_reinterpret_as_u16(a)); }
v_check_all(const v_int32x4 & a)636 inline bool v_check_all(const v_int32x4& a)
637 { return v_check_all(v_reinterpret_as_u32(a)); }
v_check_all(const v_float32x4 & a)638 inline bool v_check_all(const v_float32x4& a)
639 { return v_check_all(v_reinterpret_as_u32(a)); }
640 
v_check_any(const v_int8x16 & a)641 inline bool v_check_any(const v_int8x16& a)
642 { return v_check_all(v_reinterpret_as_u8(a)); }
v_check_any(const v_int16x8 & a)643 inline bool v_check_any(const v_int16x8& a)
644 { return v_check_all(v_reinterpret_as_u16(a)); }
v_check_any(const v_int32x4 & a)645 inline bool v_check_any(const v_int32x4& a)
646 { return v_check_all(v_reinterpret_as_u32(a)); }
v_check_any(const v_float32x4 & a)647 inline bool v_check_any(const v_float32x4& a)
648 { return v_check_all(v_reinterpret_as_u32(a)); }
649 
650 #define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) \
651 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
652 { \
653     return _Tpvec(vbslq_##suffix(vreinterpretq_##usuffix##_##suffix(mask.val), a.val, b.val)); \
654 }
655 
OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16,u8,u8)656 OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16, u8, u8)
657 OPENCV_HAL_IMPL_NEON_SELECT(v_int8x16, s8, u8)
658 OPENCV_HAL_IMPL_NEON_SELECT(v_uint16x8, u16, u16)
659 OPENCV_HAL_IMPL_NEON_SELECT(v_int16x8, s16, u16)
660 OPENCV_HAL_IMPL_NEON_SELECT(v_uint32x4, u32, u32)
661 OPENCV_HAL_IMPL_NEON_SELECT(v_int32x4, s32, u32)
662 OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32)
663 
664 #define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
665 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
666 { \
667     b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
668     b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \
669 } \
670 inline _Tpwvec v_load_expand(const _Tp* ptr) \
671 { \
672     return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
673 }
674 
675 OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8)
676 OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8)
677 OPENCV_HAL_IMPL_NEON_EXPAND(v_uint16x8, v_uint32x4, ushort, u16)
678 OPENCV_HAL_IMPL_NEON_EXPAND(v_int16x8, v_int32x4, short, s16)
679 
680 inline v_uint32x4 v_load_expand_q(const uchar* ptr)
681 {
682     uint8x8_t v0 = vcreate_u8(*(unsigned*)ptr);
683     uint16x4_t v1 = vget_low_u16(vmovl_u8(v0));
684     return v_uint32x4(vmovl_u16(v1));
685 }
686 
v_load_expand_q(const schar * ptr)687 inline v_int32x4 v_load_expand_q(const schar* ptr)
688 {
689     int8x8_t v0 = vcreate_s8(*(unsigned*)ptr);
690     int16x4_t v1 = vget_low_s16(vmovl_s8(v0));
691     return v_int32x4(vmovl_s16(v1));
692 }
693 
// Generates the lane-shuffling helpers for one 128-bit vector type.
//   _Tpvec - NEON type stem without the v_ prefix and _t suffix (e.g. uint8x16)
//   suffix - NEON intrinsic suffix for that type (e.g. u8)
// v_zip          : interleaves a0 and a1 with vzipq; b0 and b1 receive the two result vectors.
// v_combine_low  : returns { low half of a, low half of b }.
// v_combine_high : returns { high half of a, high half of b }.
// v_recombine    : c = { low half of a, low half of b }, d = { high half of a, high half of b }.
#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
{ \
    _Tpvec##x2_t p = vzipq_##suffix(a0.val, a1.val); \
    b0.val = p.val[0]; \
    b1.val = p.val[1]; \
} \
inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
} \
inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
} \
inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
    /* Compute both results before storing either one, so the call stays */ \
    /* correct when c or d refers to the same object as a or b. */ \
    _Tpvec##_t lo = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
    _Tpvec##_t hi = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
    c.val = lo; \
    d.val = hi; \
}
714 
// v_zip / v_combine_low / v_combine_high / v_recombine overloads for the
// 8/16/32-bit integer and float32 vector types.
OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_UNPACKS(int8x16, s8)
OPENCV_HAL_IMPL_NEON_UNPACKS(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_UNPACKS(int16x8, s16)
OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32)
OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
722 
723 inline v_int32x4 v_round(const v_float32x4& a)
724 {
725     static const int32x4_t v_sign = vdupq_n_s32(1 << 31),
726         v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
727 
728     int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val)));
729     return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition))));
730 }
731 
v_floor(const v_float32x4 & a)732 inline v_int32x4 v_floor(const v_float32x4& a)
733 {
734     int32x4_t a1 = vcvtq_s32_f32(a.val);
735     uint32x4_t mask = vcgtq_f32(vcvtq_f32_s32(a1), a.val);
736     return v_int32x4(vaddq_s32(a1, vreinterpretq_s32_u32(mask)));
737 }
738 
v_ceil(const v_float32x4 & a)739 inline v_int32x4 v_ceil(const v_float32x4& a)
740 {
741     int32x4_t a1 = vcvtq_s32_f32(a.val);
742     uint32x4_t mask = vcgtq_f32(a.val, vcvtq_f32_s32(a1));
743     return v_int32x4(vsubq_s32(a1, vreinterpretq_s32_u32(mask)));
744 }
745 
v_trunc(const v_float32x4 & a)746 inline v_int32x4 v_trunc(const v_float32x4& a)
747 { return v_int32x4(vcvtq_s32_f32(a.val)); }
748 
// Generates transpose4x4 for one 4-lane vector type.
//   _Tpvec - NEON type stem without the v_ prefix and _t suffix (e.g. uint32x4)
//   suffix - NEON intrinsic suffix for that type (e.g. u32)
// Treats a0..a3 as the rows of a 4x4 matrix and writes its columns to b0..b3.
// Both vtrnq results are computed before any output is written.
// NOTE(review): the other helpers in this file carry a v_ prefix; confirm whether
// callers expect this function to be named v_transpose4x4.
#define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
inline void transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
                         const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
                         v_##_Tpvec& b0, v_##_Tpvec& b1, \
                         v_##_Tpvec& b2, v_##_Tpvec& b3) \
{ \
    /* m00 m01 m02 m03 */ \
    /* m10 m11 m12 m13 */ \
    /* m20 m21 m22 m23 */ \
    /* m30 m31 m32 m33 */ \
    _Tpvec##x2_t t0 = vtrnq_##suffix(a0.val, a1.val); \
    _Tpvec##x2_t t1 = vtrnq_##suffix(a2.val, a3.val); \
    /* m00 m10 m02 m12 */ \
    /* m01 m11 m03 m13 */ \
    /* m20 m30 m22 m32 */ \
    /* m21 m31 m23 m33 */ \
    b0.val = vcombine_##suffix(vget_low_##suffix(t0.val[0]), vget_low_##suffix(t1.val[0])); \
    b1.val = vcombine_##suffix(vget_low_##suffix(t0.val[1]), vget_low_##suffix(t1.val[1])); \
    b2.val = vcombine_##suffix(vget_high_##suffix(t0.val[0]), vget_high_##suffix(t1.val[0])); \
    b3.val = vcombine_##suffix(vget_high_##suffix(t0.val[1]), vget_high_##suffix(t1.val[1])); \
}
770 
// transpose4x4 overloads for the three 4-lane (32-bit element) vector types.
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
774 
// Generates the interleaved load/store helpers for one 128-bit vector type.
//   _Tpvec - NEON type stem without the v_ prefix and _t suffix (e.g. uint8x16)
//   _Tp    - scalar element type of the memory buffer
//   suffix - NEON intrinsic suffix for that type (e.g. u8)
// v_load_deinterleave : loads with vld3q / vld4q and hands the resulting vectors
//                       to a, b, c (and d) in order.
// v_store_interleave  : packs a, b, c (and d) into an x3 / x4 structure and stores
//                       it with vst3q / vst4q.
#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
{ \
    _Tpvec##x3_t v = vld3q_##suffix(ptr); \
    a.val = v.val[0]; \
    b.val = v.val[1]; \
    c.val = v.val[2]; \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
                                v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
    _Tpvec##x4_t v = vld4q_##suffix(ptr); \
    a.val = v.val[0]; \
    b.val = v.val[1]; \
    c.val = v.val[2]; \
    d.val = v.val[3]; \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, const v_##_Tpvec& c) \
{ \
    _Tpvec##x3_t v; \
    v.val[0] = a.val; \
    v.val[1] = b.val; \
    v.val[2] = c.val; \
    vst3q_##suffix(ptr, v); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                               const v_##_Tpvec& c, const v_##_Tpvec& d) \
{ \
    _Tpvec##x4_t v; \
    v.val[0] = a.val; \
    v.val[1] = b.val; \
    v.val[2] = c.val; \
    v.val[3] = d.val; \
    vst4q_##suffix(ptr, v); \
}
810 
// v_load_deinterleave / v_store_interleave overloads (3- and 4-channel) for the
// 8/16/32-bit integer and float32 vector types.
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32)
818 
819 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
820 {
821     return v_float32x4(vcvtq_f32_s32(a.val));
822 }
823 
824 }
825 
826 #endif
827