1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <arm_neon.h>
12
13 #include "./vpx_config.h"
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx/vpx_integer.h"
16
// Reduce two vectors of eight 16-bit partial sums to a single scalar total.
//
// Every intermediate lane is at least 32 bits wide. Sixteen 16-bit values add
// up to at most 16 * 65535, so the total always fits in the 32-bit result.
static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
                                                    const uint16x8_t vec_hi) {
  // Widening add of the low and high halves of each input: 8 x u16 -> 4 x u32.
  const uint32x4_t folded_lo =
      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
  const uint32x4_t folded_hi =
      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
  // Merge both inputs, then add adjacent lanes pairwise: 4 x u32 -> 2 x u64.
  const uint64x2_t pair_sums = vpaddlq_u32(vaddq_u32(folded_lo, folded_hi));
  // View each 64-bit partial sum as two 32-bit lanes, add the two views lane
  // by lane, and read the total from lane 0.
  const uint32x2_t first_pair = vreinterpret_u32_u64(vget_low_u64(pair_sums));
  const uint32x2_t second_pair =
      vreinterpret_u32_u64(vget_high_u64(pair_sums));
  return vget_lane_u32(vadd_u32(first_pair, second_pair), 0);
}
29
// Accumulate the absolute differences between one 64-byte source row, passed
// in as vec_src_00, vec_src_16, vec_src_32 and vec_src_48, and the 64 bytes
// starting at ref. Differences from the low eight bytes of every 16-byte
// group are added to *vec_sum_ref_lo, those from the high eight bytes to
// *vec_sum_ref_hi.
static void sad_neon_64(const uint8x16_t vec_src_00,
                        const uint8x16_t vec_src_16,
                        const uint8x16_t vec_src_32,
                        const uint8x16_t vec_src_48,
                        const uint8_t *ref,
                        uint16x8_t *vec_sum_ref_lo,
                        uint16x8_t *vec_sum_ref_hi) {
  // Fetch the complete reference row before touching either accumulator.
  const uint8x16_t ref_00 = vld1q_u8(ref);
  const uint8x16_t ref_16 = vld1q_u8(ref + 16);
  const uint8x16_t ref_32 = vld1q_u8(ref + 32);
  const uint8x16_t ref_48 = vld1q_u8(ref + 48);
  uint16x8_t acc;

  // Low halves: widen |src - ref| to 16 bits and add it to the running sum.
  acc = *vec_sum_ref_lo;
  acc = vabal_u8(acc, vget_low_u8(vec_src_00), vget_low_u8(ref_00));
  acc = vabal_u8(acc, vget_low_u8(vec_src_16), vget_low_u8(ref_16));
  acc = vabal_u8(acc, vget_low_u8(vec_src_32), vget_low_u8(ref_32));
  acc = vabal_u8(acc, vget_low_u8(vec_src_48), vget_low_u8(ref_48));
  *vec_sum_ref_lo = acc;

  // High halves, accumulated the same way into the second running sum.
  acc = *vec_sum_ref_hi;
  acc = vabal_u8(acc, vget_high_u8(vec_src_00), vget_high_u8(ref_00));
  acc = vabal_u8(acc, vget_high_u8(vec_src_16), vget_high_u8(ref_16));
  acc = vabal_u8(acc, vget_high_u8(vec_src_32), vget_high_u8(ref_32));
  acc = vabal_u8(acc, vget_high_u8(vec_src_48), vget_high_u8(ref_48));
  *vec_sum_ref_hi = acc;
}
62
// Accumulate the absolute differences between one 32-byte source row, passed
// in as vec_src_00 and vec_src_16, and the 32 bytes starting at ref.
// Differences from the low eight bytes of each 16-byte group are added to
// *vec_sum_ref_lo, those from the high eight bytes to *vec_sum_ref_hi.
static void sad_neon_32(const uint8x16_t vec_src_00,
                        const uint8x16_t vec_src_16,
                        const uint8_t *ref,
                        uint16x8_t *vec_sum_ref_lo,
                        uint16x8_t *vec_sum_ref_hi) {
  // Fetch the complete reference row before touching either accumulator.
  const uint8x16_t ref_00 = vld1q_u8(ref);
  const uint8x16_t ref_16 = vld1q_u8(ref + 16);
  uint16x8_t acc;

  // Low halves: widen |src - ref| to 16 bits and add it to the running sum.
  acc = *vec_sum_ref_lo;
  acc = vabal_u8(acc, vget_low_u8(vec_src_00), vget_low_u8(ref_00));
  acc = vabal_u8(acc, vget_low_u8(vec_src_16), vget_low_u8(ref_16));
  *vec_sum_ref_lo = acc;

  // High halves, accumulated the same way into the second running sum.
  acc = *vec_sum_ref_hi;
  acc = vabal_u8(acc, vget_high_u8(vec_src_00), vget_high_u8(ref_00));
  acc = vabal_u8(acc, vget_high_u8(vec_src_16), vget_high_u8(ref_16));
  *vec_sum_ref_hi = acc;
}
82
// Compute the sum of absolute differences (SAD) between one 64x64 source
// block and four 64x64 reference blocks.
//
//   src, src_stride: first byte of the source block and the byte step from
//                    one row to the next.
//   ref, ref_stride: ref[0..3] point at the first byte of each reference
//                    block; all four share the same byte step between rows.
//   res:             res[k] receives the SAD of the source against ref[k].
//
// Each 16-bit accumulator lane gains at most 4 * 255 per row, i.e. at most
// 65280 over the 64 rows, so the lanes cannot wrap before the final
// reduction.
void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride,
                          const uint8_t* const ref[4], int ref_stride,
                          uint32_t *res) {
  const uint16x8_t zero = vdupq_n_u16(0);
  uint16x8_t sum_lo[4];
  uint16x8_t sum_hi[4];
  const uint8_t *ref_row[4];
  int rows_left;

  // One lo/hi accumulator pair and one row cursor per reference block.
  sum_lo[0] = zero;
  sum_hi[0] = zero;
  sum_lo[1] = zero;
  sum_hi[1] = zero;
  sum_lo[2] = zero;
  sum_hi[2] = zero;
  sum_lo[3] = zero;
  sum_hi[3] = zero;
  ref_row[0] = ref[0];
  ref_row[1] = ref[1];
  ref_row[2] = ref[2];
  ref_row[3] = ref[3];

  for (rows_left = 64; rows_left > 0; --rows_left) {
    // The source row is loaded once and compared against all four references.
    const uint8x16_t src_00 = vld1q_u8(src);
    const uint8x16_t src_16 = vld1q_u8(src + 16);
    const uint8x16_t src_32 = vld1q_u8(src + 32);
    const uint8x16_t src_48 = vld1q_u8(src + 48);

    sad_neon_64(src_00, src_16, src_32, src_48, ref_row[0],
                &sum_lo[0], &sum_hi[0]);
    sad_neon_64(src_00, src_16, src_32, src_48, ref_row[1],
                &sum_lo[1], &sum_hi[1]);
    sad_neon_64(src_00, src_16, src_32, src_48, ref_row[2],
                &sum_lo[2], &sum_hi[2]);
    sad_neon_64(src_00, src_16, src_32, src_48, ref_row[3],
                &sum_lo[3], &sum_hi[3]);

    // Step every cursor down to the next row.
    src += src_stride;
    ref_row[0] += ref_stride;
    ref_row[1] += ref_stride;
    ref_row[2] += ref_stride;
    ref_row[3] += ref_stride;
  }

  // Collapse each accumulator pair into its scalar SAD.
  res[0] = horizontal_long_add_16x8(sum_lo[0], sum_hi[0]);
  res[1] = horizontal_long_add_16x8(sum_lo[1], sum_hi[1]);
  res[2] = horizontal_long_add_16x8(sum_lo[2], sum_hi[2]);
  res[3] = horizontal_long_add_16x8(sum_lo[3], sum_hi[3]);
}
128
// Compute the sum of absolute differences (SAD) between one 32x32 source
// block and four 32x32 reference blocks.
//
//   src, src_stride: first byte of the source block and the byte step from
//                    one row to the next.
//   ref, ref_stride: ref[0..3] point at the first byte of each reference
//                    block; all four share the same byte step between rows.
//   res:             res[k] receives the SAD of the source against ref[k].
//
// Each 16-bit accumulator lane gains at most 2 * 255 per row, i.e. at most
// 16320 over the 32 rows, so the lanes cannot wrap before the final
// reduction.
void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride,
                          const uint8_t* const ref[4], int ref_stride,
                          uint32_t *res) {
  const uint16x8_t zero = vdupq_n_u16(0);
  uint16x8_t sum_lo[4];
  uint16x8_t sum_hi[4];
  const uint8_t *ref_row[4];
  int rows_left;

  // One lo/hi accumulator pair and one row cursor per reference block.
  sum_lo[0] = zero;
  sum_hi[0] = zero;
  sum_lo[1] = zero;
  sum_hi[1] = zero;
  sum_lo[2] = zero;
  sum_hi[2] = zero;
  sum_lo[3] = zero;
  sum_hi[3] = zero;
  ref_row[0] = ref[0];
  ref_row[1] = ref[1];
  ref_row[2] = ref[2];
  ref_row[3] = ref[3];

  for (rows_left = 32; rows_left > 0; --rows_left) {
    // The source row is loaded once and compared against all four references.
    const uint8x16_t src_00 = vld1q_u8(src);
    const uint8x16_t src_16 = vld1q_u8(src + 16);

    sad_neon_32(src_00, src_16, ref_row[0], &sum_lo[0], &sum_hi[0]);
    sad_neon_32(src_00, src_16, ref_row[1], &sum_lo[1], &sum_hi[1]);
    sad_neon_32(src_00, src_16, ref_row[2], &sum_lo[2], &sum_hi[2]);
    sad_neon_32(src_00, src_16, ref_row[3], &sum_lo[3], &sum_hi[3]);

    // Step every cursor down to the next row.
    src += src_stride;
    ref_row[0] += ref_stride;
    ref_row[1] += ref_stride;
    ref_row[2] += ref_stride;
    ref_row[3] += ref_stride;
  }

  // Collapse each accumulator pair into its scalar SAD.
  res[0] = horizontal_long_add_16x8(sum_lo[0], sum_hi[0]);
  res[1] = horizontal_long_add_16x8(sum_lo[1], sum_hi[1]);
  res[2] = horizontal_long_add_16x8(sum_lo[2], sum_hi[2]);
  res[3] = horizontal_long_add_16x8(sum_lo[3], sum_hi[3]);
}
172
// Compute the sum of absolute differences (SAD) between one 16x16 source
// block and four 16x16 reference blocks.
//
//   src, src_stride: first byte of the source block and the byte step from
//                    one row to the next.
//   ref, ref_stride: ref[0..3] point at the first byte of each reference
//                    block; all four share the same byte step between rows.
//   res:             res[k] receives the SAD of the source against ref[k].
//
// Each 16-bit accumulator lane gains at most 255 per row, i.e. at most 4080
// over the 16 rows, so the lanes cannot wrap before the final reduction.
void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride,
                          const uint8_t* const ref[4], int ref_stride,
                          uint32_t *res) {
  const uint16x8_t zero = vdupq_n_u16(0);
  uint16x8_t sum_lo[4];
  uint16x8_t sum_hi[4];
  const uint8_t *ref_row[4];
  int rows_left;

  // One lo/hi accumulator pair and one row cursor per reference block.
  sum_lo[0] = zero;
  sum_hi[0] = zero;
  sum_lo[1] = zero;
  sum_hi[1] = zero;
  sum_lo[2] = zero;
  sum_hi[2] = zero;
  sum_lo[3] = zero;
  sum_hi[3] = zero;
  ref_row[0] = ref[0];
  ref_row[1] = ref[1];
  ref_row[2] = ref[2];
  ref_row[3] = ref[3];

  for (rows_left = 16; rows_left > 0; --rows_left) {
    // Load the source row and the matching row of every reference block.
    const uint8x16_t src_row = vld1q_u8(src);
    const uint8x16_t ref_row0 = vld1q_u8(ref_row[0]);
    const uint8x16_t ref_row1 = vld1q_u8(ref_row[1]);
    const uint8x16_t ref_row2 = vld1q_u8(ref_row[2]);
    const uint8x16_t ref_row3 = vld1q_u8(ref_row[3]);
    // The source halves are shared by all four comparisons.
    const uint8x8_t src_lo = vget_low_u8(src_row);
    const uint8x8_t src_hi = vget_high_u8(src_row);

    // Widen |src - ref| to 16 bits and add it to the running sums.
    sum_lo[0] = vabal_u8(sum_lo[0], src_lo, vget_low_u8(ref_row0));
    sum_hi[0] = vabal_u8(sum_hi[0], src_hi, vget_high_u8(ref_row0));
    sum_lo[1] = vabal_u8(sum_lo[1], src_lo, vget_low_u8(ref_row1));
    sum_hi[1] = vabal_u8(sum_hi[1], src_hi, vget_high_u8(ref_row1));
    sum_lo[2] = vabal_u8(sum_lo[2], src_lo, vget_low_u8(ref_row2));
    sum_hi[2] = vabal_u8(sum_hi[2], src_hi, vget_high_u8(ref_row2));
    sum_lo[3] = vabal_u8(sum_lo[3], src_lo, vget_low_u8(ref_row3));
    sum_hi[3] = vabal_u8(sum_hi[3], src_hi, vget_high_u8(ref_row3));

    // Step every cursor down to the next row.
    src += src_stride;
    ref_row[0] += ref_stride;
    ref_row[1] += ref_stride;
    ref_row[2] += ref_stride;
    ref_row[3] += ref_stride;
  }

  // Collapse each accumulator pair into its scalar SAD.
  res[0] = horizontal_long_add_16x8(sum_lo[0], sum_hi[0]);
  res[1] = horizontal_long_add_16x8(sum_lo[1], sum_hi[1]);
  res[2] = horizontal_long_add_16x8(sum_lo[2], sum_hi[2]);
  res[3] = horizontal_long_add_16x8(sum_lo[3], sum_hi[3]);
}
227