1 /*
2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <arm_neon.h>
12 
13 #include "./vpx_config.h"
14 #include "./vpx_dsp_rtcd.h"
15 
16 #include "vpx/vpx_integer.h"
17 #include "vpx_dsp/arm/mem_neon.h"
18 #include "vpx_dsp/arm/sum_neon.h"
19 
uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride,
                         const uint8_t *ref_ptr, int ref_stride) {
  /* Pack the four 4-byte rows of each block into a single 16-byte vector. */
  const uint8x16_t src = load_unaligned_u8q(src_ptr, src_stride);
  const uint8x16_t ref = load_unaligned_u8q(ref_ptr, ref_stride);
  /* Widening absolute-difference, then accumulate the second vector half. */
  uint16x8_t sad = vabdl_u8(vget_low_u8(src), vget_low_u8(ref));
  sad = vabal_u8(sad, vget_high_u8(src), vget_high_u8(ref));
  return vget_lane_u32(horizontal_add_uint16x8(sad), 0);
}
28 
uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride,
                             const uint8_t *ref_ptr, int ref_stride,
                             const uint8_t *second_pred) {
  /* Pack the four 4-byte rows of each block into a single 16-byte vector. */
  const uint8x16_t src = load_unaligned_u8q(src_ptr, src_stride);
  const uint8x16_t ref = load_unaligned_u8q(ref_ptr, ref_stride);
  const uint8x16_t pred = vld1q_u8(second_pred);
  /* SAD is taken against the rounded average of ref and the second
   * predictor, matching the compound-prediction definition. */
  const uint8x16_t avg = vrhaddq_u8(ref, pred);
  uint16x8_t sad = vabdl_u8(vget_low_u8(src), vget_low_u8(avg));
  sad = vabal_u8(sad, vget_high_u8(src), vget_high_u8(avg));
  return vget_lane_u32(horizontal_add_uint16x8(sad), 0);
}
40 
uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride,
                         const uint8_t *ref_ptr, int ref_stride) {
  uint16x8_t sad = vdupq_n_u16(0);
  int step;

  /* Two passes, each consuming four 4-byte rows packed into one vector. */
  for (step = 0; step < 2; ++step) {
    const uint8x16_t src = load_unaligned_u8q(src_ptr, src_stride);
    const uint8x16_t ref = load_unaligned_u8q(ref_ptr, ref_stride);
    sad = vabal_u8(sad, vget_low_u8(src), vget_low_u8(ref));
    sad = vabal_u8(sad, vget_high_u8(src), vget_high_u8(ref));
    src_ptr += 4 * src_stride;
    ref_ptr += 4 * ref_stride;
  }

  return vget_lane_u32(horizontal_add_uint16x8(sad), 0);
}
56 
uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride,
                             const uint8_t *ref_ptr, int ref_stride,
                             const uint8_t *second_pred) {
  uint16x8_t sad = vdupq_n_u16(0);
  int step;

  /* Two passes, each consuming four 4-byte rows packed into one vector.
   * second_pred is a contiguous 4x8 block, so it advances 16 bytes/pass. */
  for (step = 0; step < 2; ++step) {
    const uint8x16_t src = load_unaligned_u8q(src_ptr, src_stride);
    const uint8x16_t ref = load_unaligned_u8q(ref_ptr, ref_stride);
    const uint8x16_t pred = vld1q_u8(second_pred);
    const uint8x16_t avg = vrhaddq_u8(ref, pred);
    sad = vabal_u8(sad, vget_low_u8(src), vget_low_u8(avg));
    sad = vabal_u8(sad, vget_high_u8(src), vget_high_u8(avg));
    src_ptr += 4 * src_stride;
    ref_ptr += 4 * ref_stride;
    second_pred += 16;
  }

  return vget_lane_u32(horizontal_add_uint16x8(sad), 0);
}
76 
/* Accumulate the per-lane SAD of an 8-wide block over `height` rows.
 * Each u16 lane receives at most height * 255; callers use height <= 16. */
static INLINE uint16x8_t sad8x(const uint8_t *src_ptr, int src_stride,
                               const uint8_t *ref_ptr, int ref_stride,
                               const int height) {
  uint16x8_t sum = vdupq_n_u16(0);
  int rows_left = height;

  while (rows_left-- > 0) {
    const uint8x8_t src = vld1_u8(src_ptr);
    const uint8x8_t ref = vld1_u8(ref_ptr);
    sum = vabal_u8(sum, src, ref);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
  }
  return sum;
}
92 
/* Accumulate the per-lane SAD of an 8-wide block against the rounded
 * average of ref and a contiguous second predictor, over `height` rows. */
static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride,
                                   const uint8_t *ref_ptr, int ref_stride,
                                   const uint8_t *second_pred,
                                   const int height) {
  uint16x8_t sum = vdupq_n_u16(0);
  int rows_left = height;

  while (rows_left-- > 0) {
    const uint8x8_t src = vld1_u8(src_ptr);
    const uint8x8_t ref = vld1_u8(ref_ptr);
    const uint8x8_t pred = vld1_u8(second_pred);
    const uint8x8_t avg = vrhadd_u8(ref, pred);
    sum = vabal_u8(sum, src, avg);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 8; /* second_pred rows are packed back-to-back. */
  }
  return sum;
}
112 
/* Instantiate vpx_sad8x<n>_neon() and vpx_sad8x<n>_avg_neon() for block
 * height n: run the 8-wide row accumulator, then reduce the uint16x8
 * partial sums to the scalar SAD. */
#define sad8xN(n)                                                              \
  uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride,         \
                               const uint8_t *ref_ptr, int ref_stride) {       \
    const uint16x8_t abs = sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \
    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                     \
  }                                                                            \
                                                                               \
  uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src_ptr, int src_stride,     \
                                   const uint8_t *ref_ptr, int ref_stride,     \
                                   const uint8_t *second_pred) {               \
    const uint16x8_t abs =                                                     \
        sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n);   \
    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                     \
  }

/* 8-wide SAD kernels for block heights 4, 8 and 16. */
sad8xN(4);
sad8xN(8);
sad8xN(16);
131 
/* Accumulate the per-lane SAD of a 16-wide block over `height` rows.
 * Each u16 lane covers 2 bytes per row, so lanes stay well in range for
 * the heights used here (<= 32). */
static INLINE uint16x8_t sad16x(const uint8_t *src_ptr, int src_stride,
                                const uint8_t *ref_ptr, int ref_stride,
                                const int height) {
  uint16x8_t sum = vdupq_n_u16(0);
  int row;

  for (row = 0; row < height; ++row) {
    const uint8x16_t src = vld1q_u8(src_ptr);
    const uint8x16_t ref = vld1q_u8(ref_ptr);
    sum = vabal_u8(sum, vget_low_u8(src), vget_low_u8(ref));
    sum = vabal_u8(sum, vget_high_u8(src), vget_high_u8(ref));
    src_ptr += src_stride;
    ref_ptr += ref_stride;
  }
  return sum;
}
148 
/* Accumulate the per-lane SAD of a 16-wide block against the rounded
 * average of ref and a contiguous second predictor, over `height` rows. */
static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    const uint8_t *second_pred,
                                    const int height) {
  uint16x8_t sum = vdupq_n_u16(0);
  int row;

  for (row = 0; row < height; ++row) {
    const uint8x16_t src = vld1q_u8(src_ptr);
    const uint8x16_t ref = vld1q_u8(ref_ptr);
    const uint8x16_t pred = vld1q_u8(second_pred);
    const uint8x16_t avg = vrhaddq_u8(ref, pred);
    sum = vabal_u8(sum, vget_low_u8(src), vget_low_u8(avg));
    sum = vabal_u8(sum, vget_high_u8(src), vget_high_u8(avg));
    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 16; /* second_pred rows are packed back-to-back. */
  }
  return sum;
}
169 
/* Instantiate vpx_sad16x<n>_neon() and vpx_sad16x<n>_avg_neon() for block
 * height n: run the 16-wide row accumulator, then reduce the uint16x8
 * partial sums to the scalar SAD. */
#define sad16xN(n)                                                            \
  uint32_t vpx_sad16x##n##_neon(const uint8_t *src_ptr, int src_stride,       \
                                const uint8_t *ref_ptr, int ref_stride) {     \
    const uint16x8_t abs =                                                    \
        sad16x(src_ptr, src_stride, ref_ptr, ref_stride, n);                  \
    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                    \
  }                                                                           \
                                                                              \
  uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src_ptr, int src_stride,   \
                                    const uint8_t *ref_ptr, int ref_stride,   \
                                    const uint8_t *second_pred) {             \
    const uint16x8_t abs =                                                    \
        sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                    \
  }

/* 16-wide SAD kernels for block heights 8, 16 and 32. */
sad16xN(8);
sad16xN(16);
sad16xN(32);
189 
/* Accumulate the per-lane SAD of a 32-wide block over `height` rows.
 * Each u16 lane covers 4 bytes per row: 4 * 64 * 255 = 65280, so the
 * accumulator cannot overflow for the heights used here (<= 64). */
static INLINE uint16x8_t sad32x(const uint8_t *src_ptr, int src_stride,
                                const uint8_t *ref_ptr, int ref_stride,
                                const int height) {
  uint16x8_t sum = vdupq_n_u16(0);
  int row;

  for (row = 0; row < height; ++row) {
    const uint8x16_t src_lo = vld1q_u8(src_ptr);
    const uint8x16_t src_hi = vld1q_u8(src_ptr + 16);
    const uint8x16_t ref_lo = vld1q_u8(ref_ptr);
    const uint8x16_t ref_hi = vld1q_u8(ref_ptr + 16);
    sum = vabal_u8(sum, vget_low_u8(src_lo), vget_low_u8(ref_lo));
    sum = vabal_u8(sum, vget_high_u8(src_lo), vget_high_u8(ref_lo));
    sum = vabal_u8(sum, vget_low_u8(src_hi), vget_low_u8(ref_hi));
    sum = vabal_u8(sum, vget_high_u8(src_hi), vget_high_u8(ref_hi));
    src_ptr += src_stride;
    ref_ptr += ref_stride;
  }
  return sum;
}
210 
/* Accumulate the per-lane SAD of a 32-wide block against the rounded
 * average of ref and a contiguous second predictor, over `height` rows. */
static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    const uint8_t *second_pred,
                                    const int height) {
  uint16x8_t sum = vdupq_n_u16(0);
  int row;

  for (row = 0; row < height; ++row) {
    const uint8x16_t src_lo = vld1q_u8(src_ptr);
    const uint8x16_t src_hi = vld1q_u8(src_ptr + 16);
    const uint8x16_t ref_lo = vld1q_u8(ref_ptr);
    const uint8x16_t ref_hi = vld1q_u8(ref_ptr + 16);
    const uint8x16_t pred_lo = vld1q_u8(second_pred);
    const uint8x16_t pred_hi = vld1q_u8(second_pred + 16);
    const uint8x16_t avg_lo = vrhaddq_u8(ref_lo, pred_lo);
    const uint8x16_t avg_hi = vrhaddq_u8(ref_hi, pred_hi);
    sum = vabal_u8(sum, vget_low_u8(src_lo), vget_low_u8(avg_lo));
    sum = vabal_u8(sum, vget_high_u8(src_lo), vget_high_u8(avg_lo));
    sum = vabal_u8(sum, vget_low_u8(src_hi), vget_low_u8(avg_hi));
    sum = vabal_u8(sum, vget_high_u8(src_hi), vget_high_u8(avg_hi));
    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 32; /* second_pred rows are packed back-to-back. */
  }
  return sum;
}
237 
/* Instantiate vpx_sad32x<n>_neon() and vpx_sad32x<n>_avg_neon() for block
 * height n: run the 32-wide row accumulator, then reduce the uint16x8
 * partial sums to the scalar SAD. */
#define sad32xN(n)                                                            \
  uint32_t vpx_sad32x##n##_neon(const uint8_t *src_ptr, int src_stride,       \
                                const uint8_t *ref_ptr, int ref_stride) {     \
    const uint16x8_t abs =                                                    \
        sad32x(src_ptr, src_stride, ref_ptr, ref_stride, n);                  \
    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                    \
  }                                                                           \
                                                                              \
  uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src_ptr, int src_stride,   \
                                    const uint8_t *ref_ptr, int ref_stride,   \
                                    const uint8_t *second_pred) {             \
    const uint16x8_t abs =                                                    \
        sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                    \
  }

/* 32-wide SAD kernels for block heights 16, 32 and 64. */
sad32xN(16);
sad32xN(32);
sad32xN(64);
257 
/* Accumulate the SAD of a 64-wide block over `height` rows.
 * Two uint16x8 accumulators each cover a 32-byte half of the row so every
 * u16 lane sees at most 4 * 64 * 255 = 65280 and cannot overflow; the pair
 * is then widened and merged into a uint32x4 result. */
static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride,
                                const uint8_t *ref_ptr, int ref_stride,
                                const int height) {
  uint16x8_t sum_lo = vdupq_n_u16(0);
  uint16x8_t sum_hi = vdupq_n_u16(0);
  uint32x4_t total;
  int row;

  for (row = 0; row < height; ++row) {
    const uint8x16_t src_0 = vld1q_u8(src_ptr);
    const uint8x16_t src_1 = vld1q_u8(src_ptr + 16);
    const uint8x16_t src_2 = vld1q_u8(src_ptr + 32);
    const uint8x16_t src_3 = vld1q_u8(src_ptr + 48);
    const uint8x16_t ref_0 = vld1q_u8(ref_ptr);
    const uint8x16_t ref_1 = vld1q_u8(ref_ptr + 16);
    const uint8x16_t ref_2 = vld1q_u8(ref_ptr + 32);
    const uint8x16_t ref_3 = vld1q_u8(ref_ptr + 48);
    sum_lo = vabal_u8(sum_lo, vget_low_u8(src_0), vget_low_u8(ref_0));
    sum_lo = vabal_u8(sum_lo, vget_high_u8(src_0), vget_high_u8(ref_0));
    sum_lo = vabal_u8(sum_lo, vget_low_u8(src_1), vget_low_u8(ref_1));
    sum_lo = vabal_u8(sum_lo, vget_high_u8(src_1), vget_high_u8(ref_1));
    sum_hi = vabal_u8(sum_hi, vget_low_u8(src_2), vget_low_u8(ref_2));
    sum_hi = vabal_u8(sum_hi, vget_high_u8(src_2), vget_high_u8(ref_2));
    sum_hi = vabal_u8(sum_hi, vget_low_u8(src_3), vget_low_u8(ref_3));
    sum_hi = vabal_u8(sum_hi, vget_high_u8(src_3), vget_high_u8(ref_3));
    src_ptr += src_stride;
    ref_ptr += ref_stride;
  }

  /* Pairwise-widen one accumulator, then fold the other into it. */
  total = vpaddlq_u16(sum_lo);
  total = vpadalq_u16(total, sum_hi);
  return total;
}
291 
/* Accumulate the SAD of a 64-wide block against the rounded average of ref
 * and a contiguous second predictor, over `height` rows. Uses the same
 * split uint16x8 accumulator scheme as sad64x() to avoid lane overflow. */
static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    const uint8_t *second_pred,
                                    const int height) {
  uint16x8_t sum_lo = vdupq_n_u16(0);
  uint16x8_t sum_hi = vdupq_n_u16(0);
  uint32x4_t total;
  int row;

  for (row = 0; row < height; ++row) {
    const uint8x16_t src_0 = vld1q_u8(src_ptr);
    const uint8x16_t src_1 = vld1q_u8(src_ptr + 16);
    const uint8x16_t src_2 = vld1q_u8(src_ptr + 32);
    const uint8x16_t src_3 = vld1q_u8(src_ptr + 48);
    const uint8x16_t ref_0 = vld1q_u8(ref_ptr);
    const uint8x16_t ref_1 = vld1q_u8(ref_ptr + 16);
    const uint8x16_t ref_2 = vld1q_u8(ref_ptr + 32);
    const uint8x16_t ref_3 = vld1q_u8(ref_ptr + 48);
    const uint8x16_t pred_0 = vld1q_u8(second_pred);
    const uint8x16_t pred_1 = vld1q_u8(second_pred + 16);
    const uint8x16_t pred_2 = vld1q_u8(second_pred + 32);
    const uint8x16_t pred_3 = vld1q_u8(second_pred + 48);
    const uint8x16_t avg_0 = vrhaddq_u8(ref_0, pred_0);
    const uint8x16_t avg_1 = vrhaddq_u8(ref_1, pred_1);
    const uint8x16_t avg_2 = vrhaddq_u8(ref_2, pred_2);
    const uint8x16_t avg_3 = vrhaddq_u8(ref_3, pred_3);
    sum_lo = vabal_u8(sum_lo, vget_low_u8(src_0), vget_low_u8(avg_0));
    sum_lo = vabal_u8(sum_lo, vget_high_u8(src_0), vget_high_u8(avg_0));
    sum_lo = vabal_u8(sum_lo, vget_low_u8(src_1), vget_low_u8(avg_1));
    sum_lo = vabal_u8(sum_lo, vget_high_u8(src_1), vget_high_u8(avg_1));
    sum_hi = vabal_u8(sum_hi, vget_low_u8(src_2), vget_low_u8(avg_2));
    sum_hi = vabal_u8(sum_hi, vget_high_u8(src_2), vget_high_u8(avg_2));
    sum_hi = vabal_u8(sum_hi, vget_low_u8(src_3), vget_low_u8(avg_3));
    sum_hi = vabal_u8(sum_hi, vget_high_u8(src_3), vget_high_u8(avg_3));
    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 64; /* second_pred rows are packed back-to-back. */
  }

  /* Pairwise-widen one accumulator, then fold the other into it. */
  total = vpaddlq_u16(sum_lo);
  total = vpadalq_u16(total, sum_hi);
  return total;
}
335 
/* Instantiate vpx_sad64x<n>_neon() and vpx_sad64x<n>_avg_neon() for block
 * height n. The 64-wide accumulators already return uint32x4, so the
 * reduction uses horizontal_add_uint32x4(). */
#define sad64xN(n)                                                            \
  uint32_t vpx_sad64x##n##_neon(const uint8_t *src_ptr, int src_stride,       \
                                const uint8_t *ref_ptr, int ref_stride) {     \
    const uint32x4_t abs =                                                    \
        sad64x(src_ptr, src_stride, ref_ptr, ref_stride, n);                  \
    return vget_lane_u32(horizontal_add_uint32x4(abs), 0);                    \
  }                                                                           \
                                                                              \
  uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src_ptr, int src_stride,   \
                                    const uint8_t *ref_ptr, int ref_stride,   \
                                    const uint8_t *second_pred) {             \
    const uint32x4_t abs =                                                    \
        sad64x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
    return vget_lane_u32(horizontal_add_uint32x4(abs), 0);                    \
  }

/* 64-wide SAD kernels for block heights 32 and 64. */
sad64xN(32);
sad64xN(64);
354