/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"

#include "vpx/vpx_integer.h"

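/* SAD for an 8x16 block: one 8-byte row per iteration, absolute
 * differences widened and accumulated in 16-bit lanes, then reduced
 * to a single 32-bit sum. */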
unsigned int vpx_sad8x16_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
    uint8x8_t d0, d8;
    uint16x8_t q12;
    uint32x4_t q1;
    uint64x2_t q3;
    uint32x2_t d5;
    int i;

    d0 = vld1_u8(src_ptr);
    src_ptr += src_stride;
    d8 = vld1_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabdl_u8(d0, d8);

    for (i = 0; i < 15; i++) {
        d0 = vld1_u8(src_ptr);
        src_ptr += src_stride;
        d8 = vld1_u8(ref_ptr);
        ref_ptr += ref_stride;
        q12 = vabal_u8(q12, d0, d8);
    }

    q1 = vpaddlq_u16(q12);
    q3 = vpaddlq_u32(q1);
    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                  vreinterpret_u32_u64(vget_high_u64(q3)));

    return vget_lane_u32(d5, 0);
}

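/* SAD for a 4x4 block. Each vld1_u8 reads a full 8 bytes per row, but
 * only the first four 16-bit lanes of the accumulator are summed at the
 * end, so the extra columns do not affect the result. */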
unsigned int vpx_sad4x4_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
    uint8x8_t d0, d8;
    uint16x8_t q12;
    uint32x2_t d1;
    uint64x1_t d3;
    int i;

    d0 = vld1_u8(src_ptr);
    src_ptr += src_stride;
    d8 = vld1_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabdl_u8(d0, d8);

    for (i = 0; i < 3; i++) {
        d0 = vld1_u8(src_ptr);
        src_ptr += src_stride;
        d8 = vld1_u8(ref_ptr);
        ref_ptr += ref_stride;
        q12 = vabal_u8(q12, d0, d8);
    }

    d1 = vpaddl_u16(vget_low_u16(q12));
    d3 = vpaddl_u32(d1);

    return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
}

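/* SAD for a 16x8 block: each 16-byte row is split into low and high
 * halves that accumulate into separate 16-bit vectors, which are added
 * together before the final horizontal reduction. */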
unsigned int vpx_sad16x8_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
    uint8x16_t q0, q4;
    uint16x8_t q12, q13;
    uint32x4_t q1;
    uint64x2_t q3;
    uint32x2_t d5;
    int i;

    q0 = vld1q_u8(src_ptr);
    src_ptr += src_stride;
    q4 = vld1q_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
    q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));

    for (i = 0; i < 7; i++) {
        q0 = vld1q_u8(src_ptr);
        src_ptr += src_stride;
        q4 = vld1q_u8(ref_ptr);
        ref_ptr += ref_stride;
        q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
        q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
    }

    q12 = vaddq_u16(q12, q13);
    q1 = vpaddlq_u16(q12);
    q3 = vpaddlq_u32(q1);
    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                  vreinterpret_u32_u64(vget_high_u64(q3)));

    return vget_lane_u32(d5, 0);
}

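/* Reduce two 16-bit accumulators to a single 32-bit sum, widening to
 * 32 bits before combining them; adding the accumulators in 16 bits
 * could overflow for the 64x64 case. */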
static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
                                                    const uint16x8_t vec_hi) {
  const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo),
                                        vget_high_u16(vec_lo));
  const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi),
                                        vget_high_u16(vec_hi));
  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
}
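/* Reduce a single 16-bit accumulator to a 32-bit sum via pairwise
 * widening adds. */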
static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
  const uint32x4_t a = vpaddlq_u16(vec_16x8);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
}

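/* SAD for a 64x64 block: four 16-byte loads per row; the low halves
 * accumulate into vec_accum_lo and the high halves into vec_accum_hi. */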
unsigned int vpx_sad64x64_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
  uint16x8_t vec_accum_hi = vdupq_n_u16(0);
  for (i = 0; i < 64; ++i) {
    const uint8x16_t vec_src_00 = vld1q_u8(src);
    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
    const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
    const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
    const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
    const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
    src += src_stride;
    ref += ref_stride;
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
                            vget_low_u8(vec_ref_00));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
                            vget_high_u8(vec_ref_00));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
                            vget_low_u8(vec_ref_16));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
                            vget_high_u8(vec_ref_16));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32),
                            vget_low_u8(vec_ref_32));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32),
                            vget_high_u8(vec_ref_32));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48),
                            vget_low_u8(vec_ref_48));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48),
                            vget_high_u8(vec_ref_48));
  }
  return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi);
}

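/* SAD for a 32x32 block: two 16-byte loads per row, split between the
 * two accumulators. */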
unsigned int vpx_sad32x32_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
  uint16x8_t vec_accum_hi = vdupq_n_u16(0);

  for (i = 0; i < 32; ++i) {
    const uint8x16_t vec_src_00 = vld1q_u8(src);
    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
    src += src_stride;
    ref += ref_stride;
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
                            vget_low_u8(vec_ref_00));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
                            vget_high_u8(vec_ref_00));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
                            vget_low_u8(vec_ref_16));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
                            vget_high_u8(vec_ref_16));
  }
  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
}

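/* SAD for a 16x16 block: one 16-byte load per row. */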
unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
  uint16x8_t vec_accum_hi = vdupq_n_u16(0);

  for (i = 0; i < 16; ++i) {
    const uint8x16_t vec_src = vld1q_u8(src);
    const uint8x16_t vec_ref = vld1q_u8(ref);
    src += src_stride;
    ref += ref_stride;
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src),
                            vget_low_u8(vec_ref));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src),
                            vget_high_u8(vec_ref));
  }
  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
}

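/* SAD for an 8x8 block: one 8-byte load per row, accumulated into a
 * single 16-bit vector. */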
unsigned int vpx_sad8x8_neon(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum = vdupq_n_u16(0);

  for (i = 0; i < 8; ++i) {
    const uint8x8_t vec_src = vld1_u8(src);
    const uint8x8_t vec_ref = vld1_u8(ref);
    src += src_stride;
    ref += ref_stride;
    vec_accum = vabal_u8(vec_accum, vec_src, vec_ref);
  }
  return horizontal_add_16x8(vec_accum);
}