/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"

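// Reduce the two 16-bit partial-sum vectors to a single scalar: widen each
// vector's lanes to 32 bits while folding it in half (vaddl_u16), combine the
// two results, then fold pairwise through 64 bits down to one 32-bit total.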
static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
                                                    const uint16x8_t vec_hi) {
  const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo),
                                        vget_high_u16(vec_lo));
  const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi),
                                        vget_high_u16(vec_hi));
  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
}

// Calculate the absolute differences of the 64 bytes in vec_src_00,
// vec_src_16, vec_src_32 and vec_src_48 against the 64 bytes at ref.
// Accumulate the partial sums in vec_sum_ref_lo and vec_sum_ref_hi.
static void sad_neon_64(const uint8x16_t vec_src_00,
                        const uint8x16_t vec_src_16,
                        const uint8x16_t vec_src_32,
                        const uint8x16_t vec_src_48,
                        const uint8_t *ref,
                        uint16x8_t *vec_sum_ref_lo,
                        uint16x8_t *vec_sum_ref_hi) {
  const uint8x16_t vec_ref_00 = vld1q_u8(ref);
  const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
  const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
  const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);

  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
                             vget_low_u8(vec_ref_00));
  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
                             vget_high_u8(vec_ref_00));
  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
                             vget_low_u8(vec_ref_16));
  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
                             vget_high_u8(vec_ref_16));
  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32),
                             vget_low_u8(vec_ref_32));
  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32),
                             vget_high_u8(vec_ref_32));
  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48),
                             vget_low_u8(vec_ref_48));
  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48),
                             vget_high_u8(vec_ref_48));
}

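// For reference, a scalar sketch of what one sad_neon_64 call adds across the
// lanes of the two accumulators (vabal_u8 is an accumulating, widening
// absolute difference):
//
//   for (j = 0; j < 64; ++j) sum += abs((int)src_row[j] - (int)ref[j]);
//
// where src_row stands for the 64 source bytes held in the four vec_src_*
// vectors.
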
// Calculate the absolute differences of the 32 bytes in vec_src_00 and
// vec_src_16 against the 32 bytes at ref. Accumulate the partial sums in
// vec_sum_ref_lo and vec_sum_ref_hi.
static void sad_neon_32(const uint8x16_t vec_src_00,
                        const uint8x16_t vec_src_16,
                        const uint8_t *ref,
                        uint16x8_t *vec_sum_ref_lo,
                        uint16x8_t *vec_sum_ref_hi) {
  const uint8x16_t vec_ref_00 = vld1q_u8(ref);
  const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);

  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
                             vget_low_u8(vec_ref_00));
  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
                             vget_high_u8(vec_ref_00));
  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
                             vget_low_u8(vec_ref_16));
  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
                             vget_high_u8(vec_ref_16));
}

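// Compute the SAD of one 64x64 source block against four 64x64 reference
// blocks, writing one sum per reference to res[0..3]. Each 16-bit accumulator
// lane receives four absolute differences per row, so it peaks at
// 4 * 64 * 255 = 65280 and cannot overflow before the widening reduction.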
void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride,
                          const uint8_t *const ref[4], int ref_stride,
                          uint32_t *res) {
  int i;
  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
  const uint8_t *ref0, *ref1, *ref2, *ref3;
  ref0 = ref[0];
  ref1 = ref[1];
  ref2 = ref[2];
  ref3 = ref[3];

  for (i = 0; i < 64; ++i) {
    const uint8x16_t vec_src_00 = vld1q_u8(src);
    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
    const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
    const uint8x16_t vec_src_48 = vld1q_u8(src + 48);

    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0,
                &vec_sum_ref0_lo, &vec_sum_ref0_hi);
    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1,
                &vec_sum_ref1_lo, &vec_sum_ref1_hi);
    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2,
                &vec_sum_ref2_lo, &vec_sum_ref2_hi);
    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3,
                &vec_sum_ref3_lo, &vec_sum_ref3_hi);

    src += src_stride;
    ref0 += ref_stride;
    ref1 += ref_stride;
    ref2 += ref_stride;
    ref3 += ref_stride;
  }

  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
}

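// Compute the SAD of one 32x32 source block against four 32x32 reference
// blocks. Per-lane 16-bit accumulation peaks at 2 * 32 * 255 = 16320, well
// within uint16_t range.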
void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride,
                          const uint8_t *const ref[4], int ref_stride,
                          uint32_t *res) {
  int i;
  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
  const uint8_t *ref0, *ref1, *ref2, *ref3;
  ref0 = ref[0];
  ref1 = ref[1];
  ref2 = ref[2];
  ref3 = ref[3];

  for (i = 0; i < 32; ++i) {
    const uint8x16_t vec_src_00 = vld1q_u8(src);
    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);

    sad_neon_32(vec_src_00, vec_src_16, ref0,
                &vec_sum_ref0_lo, &vec_sum_ref0_hi);
    sad_neon_32(vec_src_00, vec_src_16, ref1,
                &vec_sum_ref1_lo, &vec_sum_ref1_hi);
    sad_neon_32(vec_src_00, vec_src_16, ref2,
                &vec_sum_ref2_lo, &vec_sum_ref2_hi);
    sad_neon_32(vec_src_00, vec_src_16, ref3,
                &vec_sum_ref3_lo, &vec_sum_ref3_hi);

    src += src_stride;
    ref0 += ref_stride;
    ref1 += ref_stride;
    ref2 += ref_stride;
    ref3 += ref_stride;
  }

  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
}

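// Compute the SAD of one 16x16 source block against four 16x16 reference
// blocks. Each row contributes one absolute difference per lane, so a lane
// accumulates at most 16 * 255 = 4080.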
void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride,
                          const uint8_t *const ref[4], int ref_stride,
                          uint32_t *res) {
  int i;
  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
  const uint8_t *ref0, *ref1, *ref2, *ref3;
  ref0 = ref[0];
  ref1 = ref[1];
  ref2 = ref[2];
  ref3 = ref[3];

  for (i = 0; i < 16; ++i) {
    const uint8x16_t vec_src = vld1q_u8(src);
    const uint8x16_t vec_ref0 = vld1q_u8(ref0);
    const uint8x16_t vec_ref1 = vld1q_u8(ref1);
    const uint8x16_t vec_ref2 = vld1q_u8(ref2);
    const uint8x16_t vec_ref3 = vld1q_u8(ref3);

    vec_sum_ref0_lo = vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src),
                               vget_low_u8(vec_ref0));
    vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src),
                               vget_high_u8(vec_ref0));
    vec_sum_ref1_lo = vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src),
                               vget_low_u8(vec_ref1));
    vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src),
                               vget_high_u8(vec_ref1));
    vec_sum_ref2_lo = vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src),
                               vget_low_u8(vec_ref2));
    vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src),
                               vget_high_u8(vec_ref2));
    vec_sum_ref3_lo = vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src),
                               vget_low_u8(vec_ref3));
    vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src),
                               vget_high_u8(vec_ref3));

    src += src_stride;
    ref0 += ref_stride;
    ref1 += ref_stride;
    ref2 += ref_stride;
    ref3 += ref_stride;
  }

  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
}
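
// Illustrative usage (a sketch, not part of the library): a motion search can
// score four candidate positions in one call. The frame buffer, stride, and
// candidate offsets below are hypothetical.
//
//   uint8_t src[16 * 16];          /* source block, stride 16 */
//   uint8_t frame[720 * 1280];     /* reconstructed frame, stride 1280 */
//   const uint8_t *cand[4] = { frame, frame + 1, frame + 1280, frame + 1281 };
//   uint32_t sad[4];
//   vpx_sad16x16x4d_neon(src, 16, cand, 1280, sad);
//   /* sad[i] now holds the 16x16 SAD of src against cand[i]. */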