1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
8 //
9 //
10 // License Agreement
11 // For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16 // Copyright (C) 2015, Itseez Inc., all rights reserved.
17 // Third party copyrights are property of their respective owners.
18 //
19 // Redistribution and use in source and binary forms, with or without modification,
20 // are permitted provided that the following conditions are met:
21 //
22 // * Redistribution's of source code must retain the above copyright notice,
23 // this list of conditions and the following disclaimer.
24 //
25 // * Redistribution's in binary form must reproduce the above copyright notice,
26 // this list of conditions and the following disclaimer in the documentation
27 // and/or other materials provided with the distribution.
28 //
29 // * The name of the copyright holders may not be used to endorse or promote products
30 // derived from this software without specific prior written permission.
31 //
32 // This software is provided by the copyright holders and contributors "as is" and
33 // any express or implied warranties, including, but not limited to, the implied
34 // warranties of merchantability and fitness for a particular purpose are disclaimed.
35 // In no event shall the Intel Corporation or contributors be liable for any direct,
36 // indirect, incidental, special, exemplary, or consequential damages
37 // (including, but not limited to, procurement of substitute goods or services;
38 // loss of use, data, or profits; or business interruption) however caused
39 // and on any theory of liability, whether in contract, strict liability,
40 // or tort (including negligence or otherwise) arising in any way out of
41 // the use of this software, even if advised of the possibility of such damage.
42 //
43 //M*/
44
45 #ifndef __OPENCV_HAL_INTRIN_NEON_HPP__
46 #define __OPENCV_HAL_INTRIN_NEON_HPP__
47
48 namespace cv
49 {
50
51 #define CV_SIMD128 1
52
53 struct v_uint8x16
54 {
55 typedef uchar lane_type;
56 enum { nlanes = 16 };
57
v_uint8x16cv::v_uint8x1658 v_uint8x16() {}
v_uint8x16cv::v_uint8x1659 explicit v_uint8x16(uint8x16_t v) : val(v) {}
v_uint8x16cv::v_uint8x1660 v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
61 uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
62 {
63 uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
64 val = vld1q_u8(v);
65 }
get0cv::v_uint8x1666 uchar get0() const
67 {
68 return vgetq_lane_u8(val, 0);
69 }
70
71 uint8x16_t val;
72 };
73
74 struct v_int8x16
75 {
76 typedef schar lane_type;
77 enum { nlanes = 16 };
78
v_int8x16cv::v_int8x1679 v_int8x16() {}
v_int8x16cv::v_int8x1680 explicit v_int8x16(int8x16_t v) : val(v) {}
v_int8x16cv::v_int8x1681 v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
82 schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
83 {
84 schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
85 val = vld1q_s8(v);
86 }
get0cv::v_int8x1687 schar get0() const
88 {
89 return vgetq_lane_s8(val, 0);
90 }
91
92 int8x16_t val;
93 };
94
95 struct v_uint16x8
96 {
97 typedef ushort lane_type;
98 enum { nlanes = 8 };
99
v_uint16x8cv::v_uint16x8100 v_uint16x8() {}
v_uint16x8cv::v_uint16x8101 explicit v_uint16x8(uint16x8_t v) : val(v) {}
v_uint16x8cv::v_uint16x8102 v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
103 {
104 ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
105 val = vld1q_u16(v);
106 }
get0cv::v_uint16x8107 ushort get0() const
108 {
109 return vgetq_lane_u16(val, 0);
110 }
111
112 uint16x8_t val;
113 };
114
115 struct v_int16x8
116 {
117 typedef short lane_type;
118 enum { nlanes = 8 };
119
v_int16x8cv::v_int16x8120 v_int16x8() {}
v_int16x8cv::v_int16x8121 explicit v_int16x8(int16x8_t v) : val(v) {}
v_int16x8cv::v_int16x8122 v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
123 {
124 short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
125 val = vld1q_s16(v);
126 }
get0cv::v_int16x8127 short get0() const
128 {
129 return vgetq_lane_s16(val, 0);
130 }
131
132 int16x8_t val;
133 };
134
135 struct v_uint32x4
136 {
137 typedef unsigned lane_type;
138 enum { nlanes = 4 };
139
v_uint32x4cv::v_uint32x4140 v_uint32x4() {}
v_uint32x4cv::v_uint32x4141 explicit v_uint32x4(uint32x4_t v) : val(v) {}
v_uint32x4cv::v_uint32x4142 v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
143 {
144 unsigned v[] = {v0, v1, v2, v3};
145 val = vld1q_u32(v);
146 }
get0cv::v_uint32x4147 unsigned get0() const
148 {
149 return vgetq_lane_u32(val, 0);
150 }
151
152 uint32x4_t val;
153 };
154
155 struct v_int32x4
156 {
157 typedef int lane_type;
158 enum { nlanes = 4 };
159
v_int32x4cv::v_int32x4160 v_int32x4() {}
v_int32x4cv::v_int32x4161 explicit v_int32x4(int32x4_t v) : val(v) {}
v_int32x4cv::v_int32x4162 v_int32x4(int v0, int v1, int v2, int v3)
163 {
164 int v[] = {v0, v1, v2, v3};
165 val = vld1q_s32(v);
166 }
get0cv::v_int32x4167 int get0() const
168 {
169 return vgetq_lane_s32(val, 0);
170 }
171 int32x4_t val;
172 };
173
174 struct v_float32x4
175 {
176 typedef float lane_type;
177 enum { nlanes = 4 };
178
v_float32x4cv::v_float32x4179 v_float32x4() {}
v_float32x4cv::v_float32x4180 explicit v_float32x4(float32x4_t v) : val(v) {}
v_float32x4cv::v_float32x4181 v_float32x4(float v0, float v1, float v2, float v3)
182 {
183 float v[] = {v0, v1, v2, v3};
184 val = vld1q_f32(v);
185 }
get0cv::v_float32x4186 float get0() const
187 {
188 return vgetq_lane_f32(val, 0);
189 }
190 float32x4_t val;
191 };
192
193 struct v_uint64x2
194 {
195 typedef uint64 lane_type;
196 enum { nlanes = 2 };
197
v_uint64x2cv::v_uint64x2198 v_uint64x2() {}
v_uint64x2cv::v_uint64x2199 explicit v_uint64x2(uint64x2_t v) : val(v) {}
v_uint64x2cv::v_uint64x2200 v_uint64x2(unsigned v0, unsigned v1)
201 {
202 uint64 v[] = {v0, v1};
203 val = vld1q_u64(v);
204 }
get0cv::v_uint64x2205 uint64 get0() const
206 {
207 return vgetq_lane_u64(val, 0);
208 }
209 uint64x2_t val;
210 };
211
212 struct v_int64x2
213 {
214 typedef int64 lane_type;
215 enum { nlanes = 2 };
216
v_int64x2cv::v_int64x2217 v_int64x2() {}
v_int64x2cv::v_int64x2218 explicit v_int64x2(int64x2_t v) : val(v) {}
v_int64x2cv::v_int64x2219 v_int64x2(int v0, int v1)
220 {
221 int64 v[] = {v0, v1};
222 val = vld1q_s64(v);
223 }
get0cv::v_int64x2224 int64 get0() const
225 {
226 return vgetq_lane_s64(val, 0);
227 }
228 int64x2_t val;
229 };
230
// Generates the initialisation and bit-cast helpers for the vector type
// v_<_Tpv> (lane type _Tp, NEON intrinsic suffix `suffix`):
//  - v_setzero_<suffix>()  : every lane set to zero;
//  - v_setall_<suffix>(v)  : every lane set to v;
//  - an identity vreinterpretq_<suffix>_<suffix>() overload, which the
//    v_reinterpret_as_<suffix>() function generated below relies on when the
//    source and destination lane types are the same;
//  - v_reinterpret_as_{u8,s8,u16,s16,u32,s32,u64,s64,f32}(): reinterpret the
//    register bits as each of the other vector types (no value conversion).
#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \
inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \
inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \
inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(vreinterpretq_u16_##suffix(v.val)); } \
inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpretq_s16_##suffix(v.val)); } \
inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(vreinterpretq_u32_##suffix(v.val)); } \
inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(vreinterpretq_s32_##suffix(v.val)); } \
inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(vreinterpretq_u64_##suffix(v.val)); } \
inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(vreinterpretq_s64_##suffix(v.val)); } \
inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(vreinterpretq_f32_##suffix(v.val)); }

// Instantiate the helpers for every vector type declared above.
OPENCV_HAL_IMPL_NEON_INIT(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_INIT(int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_INIT(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_INIT(int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_INIT(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INIT(int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_INIT(float32x4, float, f32)
254
// Generates the narrowing ("pack") operations from the wide vector type
// _Tpwvec (intrinsic suffix `wsuffix`) to the narrow vector type _Tpvec
// (lane type _Tp, intrinsic suffix `suffix`, 64-bit half-register type hreg).
// `pack` is the function-name stem (pack / pack_u) and `op` selects the
// narrowing intrinsic family: `n` -> vqmovn / vqrshrn_n,
// `un` -> vqmovun / vqrshrun_n (signed input, unsigned saturated output).
//  - v_<pack>(a, b)            : saturating narrow of a (low half of the
//                                result) and b (high half);
//  - v_<pack>_store(ptr, a)    : saturating narrow of a, stored as one
//                                64-bit half register at ptr;
//  - v_rshr_<pack><n>(a, b)    : as v_<pack>, after a rounding right shift
//                                by the compile-time constant n;
//  - v_rshr_<pack>_store<n>(ptr, a): as v_<pack>_store, with the same shift.
#define OPENCV_HAL_IMPL_NEON_PACK(_Tpvec, _Tp, hreg, suffix, _Tpwvec, wsuffix, pack, op) \
inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
    hreg a1 = vqmov##op##_##wsuffix(a.val), b1 = vqmov##op##_##wsuffix(b.val); \
    return _Tpvec(vcombine_##suffix(a1, b1)); \
} \
inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
    hreg a1 = vqmov##op##_##wsuffix(a.val); \
    vst1_##suffix(ptr, a1); \
} \
template<int n> inline \
_Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \
{ \
    hreg a1 = vqrshr##op##_n_##wsuffix(a.val, n); \
    hreg b1 = vqrshr##op##_n_##wsuffix(b.val, n); \
    return _Tpvec(vcombine_##suffix(a1, b1)); \
} \
template<int n> inline \
void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \
{ \
    hreg a1 = vqrshr##op##_n_##wsuffix(a.val, n); \
    vst1_##suffix(ptr, a1); \
}

// pack   : same-signedness narrowing; pack_u : signed -> unsigned narrowing.
OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_uint16x8, u16, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_uint8x16, uchar, uint8x8_t, u8, v_int16x8, s16, pack_u, un)
OPENCV_HAL_IMPL_NEON_PACK(v_int8x16, schar, int8x8_t, s8, v_int16x8, s16, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_uint32x4, u32, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_uint16x8, ushort, uint16x4_t, u16, v_int32x4, s32, pack_u, un)
OPENCV_HAL_IMPL_NEON_PACK(v_int16x8, short, int16x4_t, s16, v_int32x4, s32, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_uint32x4, unsigned, uint32x2_t, u32, v_uint64x2, u64, pack, n)
OPENCV_HAL_IMPL_NEON_PACK(v_int32x4, int, int32x2_t, s32, v_int64x2, s64, pack, n)
288
289 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
290 const v_float32x4& m1, const v_float32x4& m2,
291 const v_float32x4& m3)
292 {
293 float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
294 float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
295 res = vmlaq_lane_f32(res, m1.val, vl, 1);
296 res = vmlaq_lane_f32(res, m2.val, vh, 0);
297 res = vmlaq_lane_f32(res, m3.val, vh, 1);
298 return v_float32x4(res);
299 }
300
// Generates the binary operator `bin_op` and its compound-assignment form
// `bin_op=` for _Tpvec, both implemented by the two-operand NEON intrinsic
// `intrin` applied to the wrapped registers.
#define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
    a.val = intrin(a.val, b.val); \
    return a; \
}

// 8- and 16-bit + and - use the saturating intrinsics (vqadd / vqsub);
// 32-bit, 64-bit and float types use the plain (wrapping / IEEE) ones.
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint8x16, vqaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint8x16, vqsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint16x8, vmulq_u16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int16x8, vmulq_s16)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint32x4, vaddq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint32x4, vsubq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint32x4, vmulq_u32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float32x4, vaddq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float32x4, vsubq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float32x4, vmulq_f32)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int64x2, vaddq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int64x2, vsubq_s64)
OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint64x2, vaddq_u64)
OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint64x2, vsubq_u64)
335
// Lane-wise a / b, computed as a * (1 / b): the reciprocal starts from the
// hardware estimate and is refined by two Newton-Raphson steps.
inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
{
    const float32x4_t divisor = b.val;
    float32x4_t inv = vrecpeq_f32(divisor);
    for (int step = 0; step < 2; ++step)
    {
        // vrecpsq_f32 yields the correction factor (2 - divisor * inv).
        const float32x4_t correction = vrecpsq_f32(divisor, inv);
        inv = vmulq_f32(correction, inv);
    }
    return v_float32x4(vmulq_f32(a.val, inv));
}
operator /=(v_float32x4 & a,const v_float32x4 & b)343 inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
344 {
345 float32x4_t reciprocal = vrecpeq_f32(b.val);
346 reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
347 reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal);
348 a.val = vmulq_f32(a.val, reciprocal);
349 return a;
350 }
351
v_mul_expand(const v_int16x8 & a,const v_int16x8 & b,v_int32x4 & c,v_int32x4 & d)352 inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
353 v_int32x4& c, v_int32x4& d)
354 {
355 c.val = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
356 d.val = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
357 }
358
v_mul_expand(const v_uint16x8 & a,const v_uint16x8 & b,v_uint32x4 & c,v_uint32x4 & d)359 inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
360 v_uint32x4& c, v_uint32x4& d)
361 {
362 c.val = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
363 d.val = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
364 }
365
v_mul_expand(const v_uint32x4 & a,const v_uint32x4 & b,v_uint64x2 & c,v_uint64x2 & d)366 inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
367 v_uint64x2& c, v_uint64x2& d)
368 {
369 c.val = vmull_u32(vget_low_u32(a.val), vget_low_u32(b.val));
370 d.val = vmull_u32(vget_high_u32(a.val), vget_high_u32(b.val));
371 }
372
v_dotprod(const v_int16x8 & a,const v_int16x8 & b)373 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
374 {
375 int32x4_t c = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
376 int32x4_t d = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
377 int32x4x2_t cd = vtrnq_s32(c, d);
378 return v_int32x4(vaddq_s32(cd.val[0], cd.val[1]));
379 }
380
// Generates the bitwise operators for an integer vector type _Tpvec with
// intrinsic suffix `suffix`: &, |, ^ (plus their compound-assignment forms,
// via OPENCV_HAL_IMPL_NEON_BIN_OP) and unary ~. The complement is computed
// on a u8 view of the register (vmvnq_u8) and reinterpreted back, which works
// for every lane width, including the 64-bit types.
#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
    OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
    OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
    OPENCV_HAL_IMPL_NEON_BIN_OP(^, _Tpvec, veorq_##suffix) \
    inline _Tpvec operator ~ (const _Tpvec& a) \
    { \
        return _Tpvec(vreinterpretq_##suffix##_u8(vmvnq_u8(vreinterpretq_u8_##suffix(a.val)))); \
    }

// Bitwise operators for all integer vector types.
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint8x16, u8)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int8x16, s8)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint16x8, u16)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int16x8, s16)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint32x4, u32)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int32x4, s32)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64)
OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64)
398
// Generates a bitwise operator `bin_op` (and `bin_op=`) for v_float32x4.
// The float lanes are reinterpreted as s32, combined with the integer
// intrinsic `intrin`, and the resulting bits are reinterpreted back as
// floats; no numeric conversion takes place.
#define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
{ \
    return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \
} \
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
{ \
    a.val = vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val))); \
    return a; \
}

// Bit-pattern AND / OR / XOR on float vectors.
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(&, vandq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(|, vorrq_s32)
OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(^, veorq_s32)
413
414 inline v_float32x4 operator ~ (const v_float32x4& a)
415 {
416 return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val))));
417 }
418
// Lane-wise square root, computed as x * (1 / sqrt(x)). The reciprocal
// square root starts from the hardware estimate and is refined by two
// Newton-Raphson steps.
inline v_float32x4 v_sqrt(const v_float32x4& x)
{
    // Clamp the estimate's input to FLT_MIN so that a zero lane produces a
    // finite reciprocal root; the final multiply still uses the unclamped x,
    // so such a lane comes out as 0.
    const float32x4_t clamped = vmaxq_f32(x.val, vdupq_n_f32(FLT_MIN));
    float32x4_t rsqrt = vrsqrteq_f32(clamped);
    for (int step = 0; step < 2; ++step)
    {
        const float32x4_t correction = vrsqrtsq_f32(vmulq_f32(clamped, rsqrt), rsqrt);
        rsqrt = vmulq_f32(correction, rsqrt);
    }
    return v_float32x4(vmulq_f32(x.val, rsqrt));
}
427
// Lane-wise reciprocal square root 1 / sqrt(x): hardware estimate refined
// by two Newton-Raphson steps. Unlike v_sqrt, the input is not clamped.
inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    const float32x4_t src = x.val;
    float32x4_t rsqrt = vrsqrteq_f32(src);
    for (int step = 0; step < 2; ++step)
    {
        const float32x4_t correction = vrsqrtsq_f32(vmulq_f32(src, rsqrt), rsqrt);
        rsqrt = vmulq_f32(correction, rsqrt);
    }
    return v_float32x4(rsqrt);
}
435
v_abs(v_float32x4 x)436 inline v_float32x4 v_abs(v_float32x4 x)
437 { return v_float32x4(vabsq_f32(x.val)); }
438
439 // TODO: exp, log, sin, cos
440
// Generates a named two-operand function `func` for _Tpvec that forwards the
// wrapped registers to the NEON intrinsic `intrin` and wraps the result.
#define OPENCV_HAL_IMPL_NEON_BIN_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
}

// Lane-wise minimum / maximum for the 8-, 16- and 32-bit integer types and
// for float.
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_min, vminq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_max, vmaxq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_min, vminq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_max, vmaxq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_min, vminq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_max, vmaxq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_min, vminq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_max, vmaxq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_min, vminq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_max, vmaxq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_min, vminq_s32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int32x4, v_max, vmaxq_s32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_min, vminq_f32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_max, vmaxq_f32)
461
462
// Generates the six lane-wise comparison operators (==, !=, <, >, <=, >=)
// for _Tpvec. The NEON compare intrinsics return an unsigned mask vector
// (all bits set in a lane where the comparison holds, zero otherwise);
// `cast` converts that mask to _Tpvec's own register type. `suffix` selects
// the compare intrinsic and `not_suffix` the unsigned type used to invert the
// equality mask for !=.
// NOTE(review): OPENCV_HAL_NOP is defined outside this file; it is presumably
// an identity macro, used where the mask already has the right type.
#define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); }

// Despite the macro's name it is also instantiated for v_float32x4.
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int8x16, vreinterpretq_s8_u8, s8, u8)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint16x8, OPENCV_HAL_NOP, u16, u16)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32)
OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32)
484
// Non-saturating (modular) addition and subtraction for the 8- and 16-bit
// types, whose operator+ / operator- use the saturating intrinsics instead.
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_add_wrap, vaddq_s16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_sub_wrap, vsubq_s8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_sub_wrap, vsubq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int16x8, v_sub_wrap, vsubq_s16)
493
// Lane-wise absolute difference |a - b| for the unsigned integer types and
// for float.
// TODO: absdiff for signed integers
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_absdiff, vabdq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_absdiff, vabdq_u16)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint32x4, v_absdiff, vabdq_u32)
OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float32x4, v_absdiff, vabdq_f32)
499
500 inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
501 {
502 v_float32x4 x(vmlaq_f32(vmulq_f32(a.val, a.val), b.val, b.val));
503 return v_sqrt(x);
504 }
505
// Lane-wise squared magnitude a*a + b*b.
inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    const float32x4_t a_squared = vmulq_f32(a.val, a.val);
    const float32x4_t sum_of_squares = vmlaq_f32(a_squared, b.val, b.val);
    return v_float32x4(sum_of_squares);
}
510
// Lane-wise multiply-accumulate: a * b + c.
inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
    // vmlaq_f32 takes the accumulator as its first operand.
    const float32x4_t result = vmlaq_f32(c.val, a.val, b.val);
    return v_float32x4(result);
}
515
// Generates the shift operations for _Tpvec (intrinsic suffix `suffix`);
// _Tps / ssuffix name the signed lane type of the same width, which is what
// vshlq expects for its per-lane shift counts.
//  - operator << / >> (a, int n): run-time shift count. Both broadcast the
//    count and use vshlq (a negative count shifts right), i.e. they
//    trade efficiency for convenience compared to the immediate forms;
//  - v_shl<n>, v_shr<n>: shifts by the compile-time constant n;
//  - v_rshr<n>: right shift by n with rounding (vrshrq_n).
#define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
{ return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec(vshlq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ return _Tpvec(vshrq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); }

// Shift operations for all integer vector types.
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint16x8, u16, short, s16)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int16x8, s16, short, s16)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint32x4, u32, int, s32)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int32x4, s32, int, s32)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint64x2, u64, int64, s64)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64)
537
// Generates the memory load/store functions for _Tpvec (lane type _Tp,
// intrinsic suffix `suffix`):
//  - v_load / v_load_aligned   : load a full 128-bit register from ptr
//                                (both use the same vld1q intrinsic here);
//  - v_load_halves(ptr0, ptr1) : low 64 bits from ptr0, high 64 bits from ptr1;
//  - v_store / v_store_aligned : store the full register to ptr
//                                (both use the same vst1q intrinsic here);
//  - v_store_low / v_store_high: store only the low / high 64-bit half to ptr.
#define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ vst1q_##suffix(ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_low_##suffix(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ vst1_##suffix(ptr, vget_high_##suffix(a.val)); }

// Load/store for the 8-, 16- and 32-bit types; the 64-bit vector types are
// not instantiated here.
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
561
// Generates v_reduce_<func>(a) for a 4-lane vector type: the register is
// spilled to a 16-byte-aligned scalar buffer and the lanes are combined
// pairwise with scalar_func, as
//   scalar_func(scalar_func(a0, a1), scalar_func(a2, a3)).
// NOTE(review): OPENCV_HAL_ADD is defined outside this file; presumably it
// expands to the sum of its two arguments.
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    scalartype CV_DECL_ALIGNED(16) buf[4]; \
    v_store_aligned(buf, a); \
    scalartype s0 = scalar_func(buf[0], buf[1]); \
    scalartype s1 = scalar_func(buf[2], buf[3]); \
    return scalar_func(s0, s1); \
}

// Horizontal sum / max / min for the 32-bit vector types.
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, sum, OPENCV_HAL_ADD)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, sum, OPENCV_HAL_ADD)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, max, std::max)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_int32x4, int, min, std::min)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, sum, OPENCV_HAL_ADD)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, max, std::max)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float, min, std::min)
581
582 inline int v_signmask(const v_uint8x16& a)
583 {
584 int8x8_t m0 = vcreate_s8(CV_BIG_UINT(0x0706050403020100));
585 uint8x16_t v0 = vshlq_u8(vshrq_n_u8(a.val, 7), vcombine_s8(m0, m0));
586 uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v0)));
587 return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 8);
588 }
v_signmask(const v_int8x16 & a)589 inline int v_signmask(const v_int8x16& a)
590 { return v_signmask(v_reinterpret_as_u8(a)); }
591
v_signmask(const v_uint16x8 & a)592 inline int v_signmask(const v_uint16x8& a)
593 {
594 int16x4_t m0 = vcreate_s16(CV_BIG_UINT(0x0003000200010000));
595 uint16x8_t v0 = vshlq_u16(vshrq_n_u16(a.val, 15), vcombine_s16(m0, m0));
596 uint64x2_t v1 = vpaddlq_u32(vpaddlq_u16(v0));
597 return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 4);
598 }
v_signmask(const v_int16x8 & a)599 inline int v_signmask(const v_int16x8& a)
600 { return v_signmask(v_reinterpret_as_u16(a)); }
601
v_signmask(const v_uint32x4 & a)602 inline int v_signmask(const v_uint32x4& a)
603 {
604 int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000));
605 uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0));
606 uint64x2_t v1 = vpaddlq_u32(v0);
607 return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2);
608 }
v_signmask(const v_int32x4 & a)609 inline int v_signmask(const v_int32x4& a)
610 { return v_signmask(v_reinterpret_as_u32(a)); }
v_signmask(const v_float32x4 & a)611 inline int v_signmask(const v_float32x4& a)
612 { return v_signmask(v_reinterpret_as_u32(a)); }
613
// Generates v_check_all / v_check_any for the unsigned vector type
// v_<_Tpvec>. `shift` is (lane width - 1), so the right shift isolates the
// most significant bit of every lane.
//  - v_check_all(a): true when the top bit of every lane is set (the lanes
//    are inverted first, so any clear top bit leaves a non-zero residue);
//  - v_check_any(a): true when the top bit of at least one lane is set.
// The whole register is tested at once by OR-ing its two 64-bit halves.
#define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
inline bool v_check_all(const v_##_Tpvec& a) \
{ \
    _Tpvec##_t v0 = vshrq_n_##suffix(vmvnq_##suffix(a.val), shift); \
    uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
    return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) == 0; \
} \
inline bool v_check_any(const v_##_Tpvec& a) \
{ \
    _Tpvec##_t v0 = vshrq_n_##suffix(a.val, shift); \
    uint64x2_t v1 = vreinterpretq_u64_##suffix(v0); \
    return (vgetq_lane_u64(v1, 0) | vgetq_lane_u64(v1, 1)) != 0; \
}

// Unsigned instantiations; signed and float vectors go through thin
// reinterpreting wrappers defined separately.
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7)
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15)
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint32x4, u32, 31)
631
v_check_all(const v_int8x16 & a)632 inline bool v_check_all(const v_int8x16& a)
633 { return v_check_all(v_reinterpret_as_u8(a)); }
v_check_all(const v_int16x8 & a)634 inline bool v_check_all(const v_int16x8& a)
635 { return v_check_all(v_reinterpret_as_u16(a)); }
v_check_all(const v_int32x4 & a)636 inline bool v_check_all(const v_int32x4& a)
637 { return v_check_all(v_reinterpret_as_u32(a)); }
v_check_all(const v_float32x4 & a)638 inline bool v_check_all(const v_float32x4& a)
639 { return v_check_all(v_reinterpret_as_u32(a)); }
640
v_check_any(const v_int8x16 & a)641 inline bool v_check_any(const v_int8x16& a)
642 { return v_check_all(v_reinterpret_as_u8(a)); }
v_check_any(const v_int16x8 & a)643 inline bool v_check_any(const v_int16x8& a)
644 { return v_check_all(v_reinterpret_as_u16(a)); }
v_check_any(const v_int32x4 & a)645 inline bool v_check_any(const v_int32x4& a)
646 { return v_check_all(v_reinterpret_as_u32(a)); }
v_check_any(const v_float32x4 & a)647 inline bool v_check_any(const v_float32x4& a)
648 { return v_check_all(v_reinterpret_as_u32(a)); }
649
// Generates v_select(mask, a, b) for _Tpvec using the NEON bitwise-select
// intrinsic vbslq: each result bit comes from a where the corresponding mask
// bit is set and from b where it is clear. The mask is reinterpreted to the
// unsigned type (`usuffix`) that vbslq expects.
#define OPENCV_HAL_IMPL_NEON_SELECT(_Tpvec, suffix, usuffix) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(vbslq_##suffix(vreinterpretq_##usuffix##_##suffix(mask.val), a.val, b.val)); \
}

// v_select for the 8-, 16- and 32-bit integer types and for float.
OPENCV_HAL_IMPL_NEON_SELECT(v_uint8x16, u8, u8)
OPENCV_HAL_IMPL_NEON_SELECT(v_int8x16, s8, u8)
OPENCV_HAL_IMPL_NEON_SELECT(v_uint16x8, u16, u16)
OPENCV_HAL_IMPL_NEON_SELECT(v_int16x8, s16, u16)
OPENCV_HAL_IMPL_NEON_SELECT(v_uint32x4, u32, u32)
OPENCV_HAL_IMPL_NEON_SELECT(v_int32x4, s32, u32)
OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32)
663
// Generates the widening operations from _Tpvec (lane type _Tp, intrinsic
// suffix `suffix`) to the vector type _Tpwvec with lanes twice as wide:
//  - v_expand(a, b0, b1): b0 receives the widened low half of a, b1 the
//    widened high half;
//  - v_load_expand(ptr) : loads one 64-bit half register from ptr and widens
//    it to a full vector.
#define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
{ \
    b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
    b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \
} \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ \
    return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
}

// 8 -> 16 bit and 16 -> 32 bit widening, signed and unsigned.
OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8)
OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8)
OPENCV_HAL_IMPL_NEON_EXPAND(v_uint16x8, v_uint32x4, ushort, u16)
OPENCV_HAL_IMPL_NEON_EXPAND(v_int16x8, v_int32x4, short, s16)
679
680 inline v_uint32x4 v_load_expand_q(const uchar* ptr)
681 {
682 uint8x8_t v0 = vcreate_u8(*(unsigned*)ptr);
683 uint16x4_t v1 = vget_low_u16(vmovl_u8(v0));
684 return v_uint32x4(vmovl_u16(v1));
685 }
686
v_load_expand_q(const schar * ptr)687 inline v_int32x4 v_load_expand_q(const schar* ptr)
688 {
689 int8x8_t v0 = vcreate_s8(*(unsigned*)ptr);
690 int16x4_t v1 = vget_low_s16(vmovl_s8(v0));
691 return v_int32x4(vmovl_s16(v1));
692 }
693
// Generates the lane-rearrangement helpers for v_<_Tpvec>:
//  - v_zip(a0, a1, b0, b1)   : interleaves the lanes of a0 and a1 (vzipq);
//                              b0 gets the first half of the interleaved
//                              sequence, b1 the second half;
//  - v_combine_low(a, b)     : {low half of a, low half of b};
//  - v_combine_high(a, b)    : {high half of a, high half of b};
//  - v_recombine(a, b, c, d) : c = v_combine_low(a, b),
//                              d = v_combine_high(a, b).
#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
{ \
    _Tpvec##x2_t p = vzipq_##suffix(a0.val, a1.val); \
    b0.val = p.val[0]; \
    b1.val = p.val[1]; \
} \
inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    return v_##_Tpvec(vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val))); \
} \
inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    return v_##_Tpvec(vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val))); \
} \
inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
    c.val = vcombine_##suffix(vget_low_##suffix(a.val), vget_low_##suffix(b.val)); \
    d.val = vcombine_##suffix(vget_high_##suffix(a.val), vget_high_##suffix(b.val)); \
}

// Rearrangement helpers for the 8-, 16- and 32-bit integer types and float.
OPENCV_HAL_IMPL_NEON_UNPACKS(uint8x16, u8)
OPENCV_HAL_IMPL_NEON_UNPACKS(int8x16, s8)
OPENCV_HAL_IMPL_NEON_UNPACKS(uint16x8, u16)
OPENCV_HAL_IMPL_NEON_UNPACKS(int16x8, s16)
OPENCV_HAL_IMPL_NEON_UNPACKS(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_UNPACKS(int32x4, s32)
OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
722
723 inline v_int32x4 v_round(const v_float32x4& a)
724 {
725 static const int32x4_t v_sign = vdupq_n_s32(1 << 31),
726 v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
727
728 int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val)));
729 return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition))));
730 }
731
v_floor(const v_float32x4 & a)732 inline v_int32x4 v_floor(const v_float32x4& a)
733 {
734 int32x4_t a1 = vcvtq_s32_f32(a.val);
735 uint32x4_t mask = vcgtq_f32(vcvtq_f32_s32(a1), a.val);
736 return v_int32x4(vaddq_s32(a1, vreinterpretq_s32_u32(mask)));
737 }
738
v_ceil(const v_float32x4 & a)739 inline v_int32x4 v_ceil(const v_float32x4& a)
740 {
741 int32x4_t a1 = vcvtq_s32_f32(a.val);
742 uint32x4_t mask = vcgtq_f32(a.val, vcvtq_f32_s32(a1));
743 return v_int32x4(vsubq_s32(a1, vreinterpretq_s32_u32(mask)));
744 }
745
v_trunc(const v_float32x4 & a)746 inline v_int32x4 v_trunc(const v_float32x4& a)
747 { return v_int32x4(vcvtq_s32_f32(a.val)); }
748
// Generates transpose4x4() for the 4-lane vector type v_<_Tpvec>: the rows
// a0..a3 of a 4x4 matrix are transposed into b0..b3. Two vtrnq operations
// transpose the 2x2 blocks within each row pair; the 64-bit halves are then
// recombined to finish the transpose. The matrix comments inside the macro
// show the register contents before and after the vtrnq step.
#define OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(_Tpvec, suffix) \
inline void transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
                         const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
                         v_##_Tpvec& b0, v_##_Tpvec& b1, \
                         v_##_Tpvec& b2, v_##_Tpvec& b3) \
{ \
    /* m00 m01 m02 m03 */ \
    /* m10 m11 m12 m13 */ \
    /* m20 m21 m22 m23 */ \
    /* m30 m31 m32 m33 */ \
    _Tpvec##x2_t t0 = vtrnq_##suffix(a0.val, a1.val); \
    _Tpvec##x2_t t1 = vtrnq_##suffix(a2.val, a3.val); \
    /* m00 m10 m02 m12 */ \
    /* m01 m11 m03 m13 */ \
    /* m20 m30 m22 m32 */ \
    /* m21 m31 m23 m33 */ \
    b0.val = vcombine_##suffix(vget_low_##suffix(t0.val[0]), vget_low_##suffix(t1.val[0])); \
    b1.val = vcombine_##suffix(vget_low_##suffix(t0.val[1]), vget_low_##suffix(t1.val[1])); \
    b2.val = vcombine_##suffix(vget_high_##suffix(t0.val[0]), vget_high_##suffix(t1.val[0])); \
    b3.val = vcombine_##suffix(vget_high_##suffix(t0.val[1]), vget_high_##suffix(t1.val[1])); \
}

// 4x4 transpose for the 32-bit vector types.
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(uint32x4, u32)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
774
// Generates the interleaved (multi-channel) load/store functions for
// v_<_Tpvec> with lane type _Tp, built on the NEON structure load/store
// intrinsics vld3q / vld4q / vst3q / vst4q:
//  - v_load_deinterleave(ptr, a, b, c[, d]): reads 3 (or 4) registers' worth
//    of interleaved elements from ptr and splits them by channel, so that a
//    receives elements 0, 3, 6, ... (or 0, 4, 8, ...), b the next channel,
//    and so on;
//  - v_store_interleave(ptr, a, b, c[, d]) : the inverse operation; writes
//    the channels back to ptr in interleaved order.
#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
{ \
    _Tpvec##x3_t v = vld3q_##suffix(ptr); \
    a.val = v.val[0]; \
    b.val = v.val[1]; \
    c.val = v.val[2]; \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
                                v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
    _Tpvec##x4_t v = vld4q_##suffix(ptr); \
    a.val = v.val[0]; \
    b.val = v.val[1]; \
    c.val = v.val[2]; \
    d.val = v.val[3]; \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, const v_##_Tpvec& c) \
{ \
    _Tpvec##x3_t v; \
    v.val[0] = a.val; \
    v.val[1] = b.val; \
    v.val[2] = c.val; \
    vst3q_##suffix(ptr, v); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
                               const v_##_Tpvec& c, const v_##_Tpvec& d) \
{ \
    _Tpvec##x4_t v; \
    v.val[0] = a.val; \
    v.val[1] = b.val; \
    v.val[2] = c.val; \
    v.val[3] = d.val; \
    vst4q_##suffix(ptr, v); \
}

// Interleaved load/store for the 8-, 16- and 32-bit integer types and float.
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32)
818
819 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
820 {
821 return v_float32x4(vcvtq_f32_s32(a.val));
822 }
823
824 }
825
826 #endif
827