/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#ifdef _MSC_VER
#define __builtin_prefetch(x)
#endif
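
/* Each function below computes the variance of a W x H block:
 *     sum = SUM(src[i] - ref[i])
 *     sse = SUM((src[i] - ref[i]) * (src[i] - ref[i]))
 *     variance = sse - (sum * sum) / (W * H)
 * The running sum of differences is accumulated in q8s32 and the running
 * sum of squared differences in q9s32/q10s32; the final division by the
 * block size is done with a right shift, since W * H is always a power
 * of two here.
 */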

unsigned int vp8_variance16x16_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

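    /* Each iteration loads two rows of 16 pixels from src and ref, widens
     * the differences to 16 bits, and accumulates the row sums into q8s32
     * and the squared differences into q9s32/q10s32. */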
    for (i = 0; i < 8; i++) {
        q0u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        q1u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        __builtin_prefetch(src_ptr);

        q2u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q3u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        __builtin_prefetch(ref_ptr);

        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

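    /* Horizontal reduction: q9s32 + q10s32 holds the sum of squared
     * differences and q8s32 the sum of differences. Store the SSE and
     * return variance = sse - (sum * sum) / 256 for the 16x16 block. */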
    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}

unsigned int vp8_variance16x8_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    for (i = 0; i < 4; i++) {  // variance16x8_neon_loop
        q0u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        q1u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        __builtin_prefetch(src_ptr);

        q2u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q3u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        __builtin_prefetch(ref_ptr);

        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

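    /* Same reduction as above; the 16x8 block has 128 pixels, so the
     * sum-squared term is divided by 128 (right shift by 7). */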
    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}

unsigned int vp8_variance8x16_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    uint8x8_t d0u8, d2u8, d4u8, d6u8;
    int16x4_t d22s16, d23s16, d24s16, d25s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint16x8_t q11u16, q12u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

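    /* 8-pixel-wide rows fit in 64-bit d registers, so vld1_u8() is used
     * and each iteration processes two rows of 8 pixels. */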
    for (i = 0; i < 8; i++) {  // variance8x16_neon_loop
        d0u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        d2u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        __builtin_prefetch(src_ptr);

        d4u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        d6u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        __builtin_prefetch(ref_ptr);

        q11u16 = vsubl_u8(d0u8, d4u8);
        q12u16 = vsubl_u8(d2u8, d6u8);

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
    }

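    /* 8x16 block: 128 pixels, so divide the sum-squared term by 128. */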
    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}

unsigned int vp8_variance8x8_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

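    /* Each iteration loads four 8-pixel rows from src and ref, covering
     * the whole 8x8 block in two iterations. */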
    for (i = 0; i < 2; i++) {  // variance8x8_neon_loop
        d0u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        d1u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        d2u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        d3u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;

        d4u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        d5u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        d6u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        d7u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;

        q11u16 = vsubl_u8(d0u8, d4u8);
        q12u16 = vsubl_u8(d1u8, d5u8);
        q13u16 = vsubl_u8(d2u8, d6u8);
        q14u16 = vsubl_u8(d3u8, d7u8);

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

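    /* 8x8 block: 64 pixels, so divide the sum-squared term by 64. */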
    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 6);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}