1 /*
2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <arm_neon.h>
12 
13 #include "./vpx_config.h"
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx/vpx_integer.h"
16 
17 //------------------------------------------------------------------------------
18 // DC 4x4
19 
20 // 'do_above' and 'do_left' facilitate branch removal when inlined.
// Writes one DC value to every pixel of the 4x4 block at 'dst'. The value is
// the rounded mean of the first four 'above' and/or 'left' samples, as
// selected by 'do_above' / 'do_left'; with neither selected it is 0x80.
// vld1_u8 reads 8 bytes from each enabled edge even though only the first
// four contribute to the result.
// NOTE(review): the uint32_t* cast on the store presumably relies on 'dst'
// rows being 4-byte aligned - confirm against callers.
static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride,
                          const uint8_t *above, const uint8_t *left,
                          int do_above, int do_left) {
  uint16x8_t top_total;   // only assigned when do_above is set
  uint16x8_t left_total;  // only assigned when do_left is set
  uint8x8_t mean;
  int row;

  if (do_above) {
    // Two pairwise adds leave above[0] + ... + above[3] in lane 0.
    const uint16x4_t pairs = vpaddl_u8(vld1_u8(above));
    const uint16x4_t quads = vpadd_u16(pairs, pairs);
    top_total = vcombine_u16(quads, quads);
  }

  if (do_left) {
    // Same reduction for left[0] + ... + left[3].
    const uint16x4_t pairs = vpaddl_u8(vld1_u8(left));
    const uint16x4_t quads = vpadd_u16(pairs, pairs);
    left_total = vcombine_u16(quads, quads);
  }

  // Rounding shift: divide by the number of samples that were summed.
  if (do_above) {
    mean = do_left ? vrshrn_n_u16(vaddq_u16(left_total, top_total), 3)
                   : vrshrn_n_u16(top_total, 2);
  } else if (do_left) {
    mean = vrshrn_n_u16(left_total, 2);
  } else {
    mean = vdup_n_u8(0x80);
  }

  // Broadcast lane 0 and write four bytes per row.
  mean = vdup_lane_u8(mean, 0);
  for (row = 0; row < 4; ++row) {
    vst1_lane_u32((uint32_t *)(dst + row * stride), vreinterpret_u32_u8(mean),
                  0);
  }
}
61 
// 4x4 DC predictor that averages both the 'above' row and the 'left' column.
void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const int do_above = 1;
  const int do_left = 1;
  dc_4x4(dst, stride, above, left, do_above, do_left);
}
66 
// 4x4 DC predictor that averages the 'left' column only; 'above' is ignored
// and NULL is handed to the helper in its place.
void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const int do_above = 0;
  const int do_left = 1;
  (void)above;
  dc_4x4(dst, stride, NULL, left, do_above, do_left);
}
72 
// 4x4 DC predictor that averages the 'above' row only; 'left' is ignored and
// NULL is handed to the helper in its place.
void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const int do_above = 1;
  const int do_left = 0;
  (void)left;
  dc_4x4(dst, stride, above, NULL, do_above, do_left);
}
78 
// 4x4 DC predictor used when neither edge is consulted: the helper fills the
// block with 0x80. Both edge pointers are ignored.
void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const int do_above = 0;
  const int do_left = 0;
  (void)left;
  (void)above;
  dc_4x4(dst, stride, NULL, NULL, do_above, do_left);
}
85 
86 //------------------------------------------------------------------------------
87 // DC 8x8
88 
89 // 'do_above' and 'do_left' facilitate branch removal when inlined.
// Writes one DC value to every pixel of the 8x8 block at 'dst'. The value is
// the rounded mean of the eight 'above' and/or eight 'left' samples, as
// selected by 'do_above' / 'do_left'; with neither selected it is 0x80.
static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride,
                          const uint8_t *above, const uint8_t *left,
                          int do_above, int do_left) {
  uint16x8_t top_total;   // only assigned when do_above is set
  uint16x8_t left_total;  // only assigned when do_left is set
  uint8x8_t mean;
  int row;

  if (do_above) {
    // Three pairwise adds leave above[0] + ... + above[7] in every lane.
    const uint16x4_t pairs = vpaddl_u8(vld1_u8(above));
    const uint16x4_t quads = vpadd_u16(pairs, pairs);
    const uint16x4_t total = vpadd_u16(quads, quads);
    top_total = vcombine_u16(total, total);
  }

  if (do_left) {
    // Same reduction for left[0] + ... + left[7].
    const uint16x4_t pairs = vpaddl_u8(vld1_u8(left));
    const uint16x4_t quads = vpadd_u16(pairs, pairs);
    const uint16x4_t total = vpadd_u16(quads, quads);
    left_total = vcombine_u16(total, total);
  }

  // Rounding shift: divide by the number of samples that were summed.
  if (do_above) {
    mean = do_left ? vrshrn_n_u16(vaddq_u16(left_total, top_total), 4)
                   : vrshrn_n_u16(top_total, 3);
  } else if (do_left) {
    mean = vrshrn_n_u16(left_total, 3);
  } else {
    mean = vdup_n_u8(0x80);
  }

  mean = vdup_lane_u8(mean, 0);
  for (row = 0; row < 8; ++row) {
    // Byte-typed store: unlike a store through a uint32_t* cast, this implies
    // no alignment stricter than uint8_t for 'dst'.
    vst1_u8(dst + row * stride, mean);
  }
}
132 
// 8x8 DC predictor that averages both the 'above' row and the 'left' column.
void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const int do_above = 1;
  const int do_left = 1;
  dc_8x8(dst, stride, above, left, do_above, do_left);
}
137 
// 8x8 DC predictor that averages the 'left' column only; 'above' is ignored
// and NULL is handed to the helper in its place.
void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const int do_above = 0;
  const int do_left = 1;
  (void)above;
  dc_8x8(dst, stride, NULL, left, do_above, do_left);
}
143 
// 8x8 DC predictor that averages the 'above' row only; 'left' is ignored and
// NULL is handed to the helper in its place.
void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const int do_above = 1;
  const int do_left = 0;
  (void)left;
  dc_8x8(dst, stride, above, NULL, do_above, do_left);
}
149 
// 8x8 DC predictor used when neither edge is consulted: the helper fills the
// block with 0x80. Both edge pointers are ignored.
void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const int do_above = 0;
  const int do_left = 0;
  (void)left;
  (void)above;
  dc_8x8(dst, stride, NULL, NULL, do_above, do_left);
}
156 
157 //------------------------------------------------------------------------------
158 // DC 16x16
159 
160 // 'do_above' and 'do_left' facilitate branch removal when inlined.
// Writes one DC value to every pixel of the 16x16 block at 'dst'. The value
// is the rounded mean of the sixteen 'above' and/or sixteen 'left' samples,
// as selected by 'do_above' / 'do_left'; with neither selected it is 0x80.
static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left,
                            int do_above, int do_left) {
  uint16x8_t top_total;   // only assigned when do_above is set
  uint16x8_t left_total;  // only assigned when do_left is set
  uint8x8_t mean;
  uint8x16_t fill;
  int row;

  if (do_above) {
    // Widen to eight pair sums, fold the halves together, then two pairwise
    // adds leave the sum of all sixteen samples in every lane.
    const uint16x8_t pairs = vpaddlq_u8(vld1q_u8(above));
    const uint16x4_t folded = vadd_u16(vget_low_u16(pairs),
                                       vget_high_u16(pairs));
    const uint16x4_t halves = vpadd_u16(folded, folded);
    const uint16x4_t total = vpadd_u16(halves, halves);
    top_total = vcombine_u16(total, total);
  }

  if (do_left) {
    // Same reduction for the left column.
    const uint16x8_t pairs = vpaddlq_u8(vld1q_u8(left));
    const uint16x4_t folded = vadd_u16(vget_low_u16(pairs),
                                       vget_high_u16(pairs));
    const uint16x4_t halves = vpadd_u16(folded, folded);
    const uint16x4_t total = vpadd_u16(halves, halves);
    left_total = vcombine_u16(total, total);
  }

  // Rounding shift: divide by the number of samples that were summed.
  if (do_above) {
    mean = do_left ? vrshrn_n_u16(vaddq_u16(left_total, top_total), 5)
                   : vrshrn_n_u16(top_total, 4);
  } else if (do_left) {
    mean = vrshrn_n_u16(left_total, 4);
  } else {
    mean = vdup_n_u8(0x80);
  }

  fill = vdupq_lane_u8(mean, 0);
  for (row = 0; row < 16; ++row) {
    vst1q_u8(dst + row * stride, fill);
  }
}
205 
// 16x16 DC predictor that averages both the 'above' row and the 'left'
// column.
void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const int do_above = 1;
  const int do_left = 1;
  dc_16x16(dst, stride, above, left, do_above, do_left);
}
210 
// 16x16 DC predictor that averages the 'left' column only; 'above' is ignored
// and NULL is handed to the helper in its place.
void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  const int do_above = 0;
  const int do_left = 1;
  (void)above;
  dc_16x16(dst, stride, NULL, left, do_above, do_left);
}
217 
// 16x16 DC predictor that averages the 'above' row only; 'left' is ignored
// and NULL is handed to the helper in its place.
void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const int do_above = 1;
  const int do_left = 0;
  (void)left;
  dc_16x16(dst, stride, above, NULL, do_above, do_left);
}
224 
// 16x16 DC predictor used when neither edge is consulted: the helper fills
// the block with 0x80. Both edge pointers are ignored.
void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const int do_above = 0;
  const int do_left = 0;
  (void)left;
  (void)above;
  dc_16x16(dst, stride, NULL, NULL, do_above, do_left);
}
232 
233 //------------------------------------------------------------------------------
234 // DC 32x32
235 
236 // 'do_above' and 'do_left' facilitate branch removal when inlined.
// Writes one DC value to every pixel of the 32x32 block at 'dst'. The value
// is the rounded mean of the 32 'above' and/or 32 'left' samples, as selected
// by 'do_above' / 'do_left'; with neither selected it is 0x80.
static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left,
                            int do_above, int do_left) {
  uint16x8_t top_total;   // only assigned when do_above is set
  uint16x8_t left_total;  // only assigned when do_left is set
  uint8x8_t mean;
  uint8x16_t fill;
  int row;

  if (do_above) {
    // Pair-sum each 16-byte half, add the halves, fold to four lanes, then
    // two pairwise adds leave the sum of all 32 samples in every lane.
    const uint16x8_t pairs_lo = vpaddlq_u8(vld1q_u8(above));
    const uint16x8_t pairs_hi = vpaddlq_u8(vld1q_u8(above + 16));
    const uint16x8_t pairs = vaddq_u16(pairs_lo, pairs_hi);
    const uint16x4_t folded = vadd_u16(vget_low_u16(pairs),
                                       vget_high_u16(pairs));
    const uint16x4_t halves = vpadd_u16(folded, folded);
    const uint16x4_t total = vpadd_u16(halves, halves);
    top_total = vcombine_u16(total, total);
  }

  if (do_left) {
    // Same reduction for the left column.
    const uint16x8_t pairs_lo = vpaddlq_u8(vld1q_u8(left));
    const uint16x8_t pairs_hi = vpaddlq_u8(vld1q_u8(left + 16));
    const uint16x8_t pairs = vaddq_u16(pairs_lo, pairs_hi);
    const uint16x4_t folded = vadd_u16(vget_low_u16(pairs),
                                       vget_high_u16(pairs));
    const uint16x4_t halves = vpadd_u16(folded, folded);
    const uint16x4_t total = vpadd_u16(halves, halves);
    left_total = vcombine_u16(total, total);
  }

  // Rounding shift: divide by the number of samples that were summed.
  if (do_above) {
    mean = do_left ? vrshrn_n_u16(vaddq_u16(left_total, top_total), 6)
                   : vrshrn_n_u16(top_total, 5);
  } else if (do_left) {
    mean = vrshrn_n_u16(left_total, 5);
  } else {
    mean = vdup_n_u8(0x80);
  }

  // Each 32-pixel row takes two 16-byte stores.
  fill = vdupq_lane_u8(mean, 0);
  for (row = 0; row < 32; ++row) {
    uint8_t *const out = dst + row * stride;
    vst1q_u8(out, fill);
    vst1q_u8(out + 16, fill);
  }
}
288 
// 32x32 DC predictor that averages both the 'above' row and the 'left'
// column.
void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const int do_above = 1;
  const int do_left = 1;
  dc_32x32(dst, stride, above, left, do_above, do_left);
}
293 
// 32x32 DC predictor that averages the 'left' column only; 'above' is ignored
// and NULL is handed to the helper in its place.
void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  const int do_above = 0;
  const int do_left = 1;
  (void)above;
  dc_32x32(dst, stride, NULL, left, do_above, do_left);
}
300 
// 32x32 DC predictor that averages the 'above' row only; 'left' is ignored
// and NULL is handed to the helper in its place.
void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const int do_above = 1;
  const int do_left = 0;
  (void)left;
  dc_32x32(dst, stride, above, NULL, do_above, do_left);
}
307 
// 32x32 DC predictor used when neither edge is consulted: the helper fills
// the block with 0x80. Both edge pointers are ignored.
void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const int do_above = 0;
  const int do_left = 0;
  (void)left;
  (void)above;
  dc_32x32(dst, stride, NULL, NULL, do_above, do_left);
}
315 
316 // -----------------------------------------------------------------------------
317 
// 4x4 D45 predictor. Reads above[0..7], applies a 3-tap smoothing
// (floor-average of taps 0 and 2, then rounding-average with tap 1) and
// writes successive 4-byte windows of the result, one per row. The
// bottom-right pixel is then overwritten with above[7]. 'left' is unused.
// NOTE(review): the uint32_t* cast on the stores presumably relies on 'dst'
// rows being 4-byte aligned - confirm against callers.
void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x8_t top = vld1_u8(above);  // A B C D E F G H
  const uint64x1_t top64 = vreinterpret_u64_u8(top);
  // Shifting the 64-bit view yields B C D E F G H 0 and C D E F G H 0 0.
  const uint8x8_t top_next = vreinterpret_u8_u64(vshr_n_u64(top64, 8));
  const uint8x8_t top_next2 = vreinterpret_u8_u64(vshr_n_u64(top64, 16));
  const uint8x8_t smoothed = vrhadd_u8(vhadd_u8(top, top_next2), top_next);
  uint64x1_t window = vreinterpret_u64_u8(smoothed);
  int row;
  (void)left;

  // Row r holds smoothed[r .. r + 3]; advance the window one byte per row.
  for (row = 0; row < 4; ++row) {
    vst1_lane_u32((uint32_t *)(dst + row * stride),
                  vreinterpret_u32_u64(window), 0);
    window = vshr_n_u64(window, 8);
  }
  dst[3 * stride + 3] = above[7];
}
340 
// 8x8 D45 predictor. Smooths above[0..7] with a 3-tap filter, replicating
// above[7] past the end of the row, then emits the result shifted one sample
// further per row (the last lane is replicated on each shift). 'left' is
// unused.
void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  // Table-lookup indices: shift by one / two lanes, clamping at lane 7.
  static const uint8_t kShiftByOne[8] = { 1, 2, 3, 4, 5, 6, 7, 7 };
  static const uint8_t kShiftByTwo[8] = { 2, 3, 4, 5, 6, 7, 7, 7 };
  const uint8x8_t shift1 = vld1_u8(kShiftByOne);
  const uint8x8_t shift2 = vld1_u8(kShiftByTwo);
  const uint8x8_t top = vld1_u8(above);
  const uint8x8_t top_next = vtbl1_u8(top, shift1);
  const uint8x8_t top_next2 = vtbl1_u8(top, shift2);
  uint8x8_t line = vrhadd_u8(vhadd_u8(top, top_next2), top_next);
  int row;
  (void)left;

  vst1_u8(dst, line);
  for (row = 1; row < 8; ++row) {
    line = vtbl1_u8(line, shift1);
    vst1_u8(dst + row * stride, line);
  }
}
360 
// 16x16 D45 predictor. Smooths above[0..15] with a 3-tap filter, using
// above[15] for the samples past the end of the row, then emits the result
// shifted one sample further per row with above[15] shifted in on the right.
// 'left' is unused.
void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
  const uint8x16_t top = vld1q_u8(above);
  const uint8x16_t pad = vld1q_dup_u8(above + 15);  // above[15] in all lanes
  const uint8x16_t top_next = vextq_u8(top, pad, 1);
  const uint8x16_t top_next2 = vextq_u8(top, pad, 2);
  uint8x16_t line = vrhaddq_u8(vhaddq_u8(top, top_next2), top_next);
  int row;
  (void)left;

  vst1q_u8(dst, line);
  for (row = 1; row < 16; ++row) {
    line = vextq_u8(line, pad, 1);
    vst1q_u8(dst + row * stride, line);
  }
}
377 
378 // -----------------------------------------------------------------------------
379 
// 4x4 D135 predictor. Builds the edge vector L K J I X A B C (D), where
// X = above[-1], A..D = above[0..3] and I..L = left[0..3], applies a 3-tap
// smoothing along it, and writes 4-byte windows of the result: the bottom
// row starts at lane 0 and each row above starts one lane later.
// NOTE(review): the uint32_t* casts on 'left' and 'dst' presumably rely on
// 4-byte alignment - confirm against callers.
void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  // X A B C D . . . -> move X A B C into the upper four lanes.
  const uint8x8_t top = vld1_u8(above - 1);
  const uint64x1_t top_hi = vshl_n_u64(vreinterpret_u64_u8(top), 32);
  // I J K L in the lower four lanes, byte-reversed to L K J I.
  const uint32x2_t left32 =
      vld1_lane_u32((const uint32_t *)left, vdup_n_u32(0), 0);
  const uint64x1_t left_rev =
      vreinterpret_u64_u8(vrev32_u8(vreinterpret_u8_u32(left32)));
  // tap0 = L K J I X A B C
  const uint64x1_t edge = vorr_u64(left_rev, top_hi);
  const uint8x8_t tap0 = vreinterpret_u8_u64(edge);
  // tap1 = K J I X A B C 0
  const uint8x8_t tap1 = vreinterpret_u8_u64(vshr_n_u64(edge, 8));
  // tap2 = J I X A B C D 0 (D inserted into lane 6)
  const uint8x8_t tap2 = vset_lane_u8(
      vget_lane_u8(top, 4), vreinterpret_u8_u64(vshr_n_u64(edge, 16)), 6);
  const uint8x8_t smoothed = vrhadd_u8(vhadd_u8(tap2, tap0), tap1);
  const uint64x1_t smoothed64 = vreinterpret_u64_u8(smoothed);
  const uint32x2_t row3 = vreinterpret_u32_u8(smoothed);
  const uint32x2_t row2 = vreinterpret_u32_u64(vshr_n_u64(smoothed64, 8));
  const uint32x2_t row1 = vreinterpret_u32_u64(vshr_n_u64(smoothed64, 16));
  const uint32x2_t row0 = vreinterpret_u32_u64(vshr_n_u64(smoothed64, 24));

  vst1_lane_u32((uint32_t *)dst, row0, 0);
  vst1_lane_u32((uint32_t *)(dst + stride), row1, 0);
  vst1_lane_u32((uint32_t *)(dst + 2 * stride), row2, 0);
  vst1_lane_u32((uint32_t *)(dst + 3 * stride), row3, 0);
}
407 
408 #if !HAVE_NEON_ASM
409 
// 4x4 vertical predictor: copies above[0..3] into each of the four rows.
// 'left' is unused.
// NOTE(review): the uint32_t* casts presumably rely on 'above' and 'dst'
// rows being 4-byte aligned - confirm against callers.
void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const uint32x2_t top =
      vld1_lane_u32((const uint32_t *)above, vdup_n_u32(0), 0);
  int row;
  (void)left;

  for (row = 0; row < 4; ++row) {
    vst1_lane_u32((uint32_t *)(dst + row * stride), top, 0);
  }
}
420 
// 8x8 vertical predictor: copies above[0..7] into each of the eight rows.
// 'left' is unused.
void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const uint8x8_t top = vld1_u8(above);
  int row;
  (void)left;

  for (row = 0; row < 8; ++row) {
    vst1_u8(dst + row * stride, top);
  }
}
431 
// 16x16 vertical predictor: copies above[0..15] into each of the sixteen
// rows. 'left' is unused.
void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t top = vld1q_u8(above);
  int row;
  (void)left;

  for (row = 0; row < 16; ++row) {
    vst1q_u8(dst + row * stride, top);
  }
}
442 
// 32x32 vertical predictor: copies above[0..31] into each of the 32 rows,
// as two 16-byte stores per row. 'left' is unused.
void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t top_lo = vld1q_u8(above);
  const uint8x16_t top_hi = vld1q_u8(above + 16);
  int row;
  (void)left;

  for (row = 0; row < 32; ++row) {
    uint8_t *const out = dst + row * stride;
    vst1q_u8(out, top_lo);
    vst1q_u8(out + 16, top_hi);
  }
}
457 
// 4x4 horizontal predictor: row r is filled with left[r]. 'above' is unused.
// NOTE(review): the uint32_t* casts presumably rely on 'left' and 'dst' rows
// being 4-byte aligned - confirm against callers.
void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const uint32x2_t left32 =
      vld1_lane_u32((const uint32_t *)left, vdup_n_u32(0), 0);
  const uint8x8_t col = vreinterpret_u8_u32(left32);
  (void)above;

  // The lane index must be a compile-time constant, so rows are unrolled.
  vst1_lane_u32((uint32_t *)(dst + 0 * stride),
                vreinterpret_u32_u8(vdup_lane_u8(col, 0)), 0);
  vst1_lane_u32((uint32_t *)(dst + 1 * stride),
                vreinterpret_u32_u8(vdup_lane_u8(col, 1)), 0);
  vst1_lane_u32((uint32_t *)(dst + 2 * stride),
                vreinterpret_u32_u8(vdup_lane_u8(col, 2)), 0);
  vst1_lane_u32((uint32_t *)(dst + 3 * stride),
                vreinterpret_u32_u8(vdup_lane_u8(col, 3)), 0);
}
478 
// 8x8 horizontal predictor: row r is filled with left[r]. 'above' is unused.
void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  // Byte-typed load: unlike a load through a uint64_t* cast, this implies no
  // alignment stricter than uint8_t for 'left'.
  const uint8x8_t col = vld1_u8(left);
  (void)above;

  // The lane index must be a compile-time constant, so rows are unrolled.
  vst1_u8(dst + 0 * stride, vdup_lane_u8(col, 0));
  vst1_u8(dst + 1 * stride, vdup_lane_u8(col, 1));
  vst1_u8(dst + 2 * stride, vdup_lane_u8(col, 2));
  vst1_u8(dst + 3 * stride, vdup_lane_u8(col, 3));
  vst1_u8(dst + 4 * stride, vdup_lane_u8(col, 4));
  vst1_u8(dst + 5 * stride, vdup_lane_u8(col, 5));
  vst1_u8(dst + 6 * stride, vdup_lane_u8(col, 6));
  vst1_u8(dst + 7 * stride, vdup_lane_u8(col, 7));
}
511 
// 16x16 horizontal predictor: row r is filled with left[r]. 'above' is
// unused. The sixteen left samples are handled as two 8-lane halves, eight
// rows per pass.
void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t col = vld1q_u8(left);
  uint8x8_t half = vget_low_u8(col);
  int pass;
  (void)above;

  for (pass = 0; pass < 2; ++pass) {
    // The lane index must be a compile-time constant, so rows are unrolled.
    vst1q_u8(dst + 0 * stride, vdupq_lane_u8(half, 0));
    vst1q_u8(dst + 1 * stride, vdupq_lane_u8(half, 1));
    vst1q_u8(dst + 2 * stride, vdupq_lane_u8(half, 2));
    vst1q_u8(dst + 3 * stride, vdupq_lane_u8(half, 3));
    vst1q_u8(dst + 4 * stride, vdupq_lane_u8(half, 4));
    vst1q_u8(dst + 5 * stride, vdupq_lane_u8(half, 5));
    vst1q_u8(dst + 6 * stride, vdupq_lane_u8(half, 6));
    vst1q_u8(dst + 7 * stride, vdupq_lane_u8(half, 7));
    dst += 8 * stride;
    half = vget_high_u8(col);
  }
}
549 
// 32x32 horizontal predictor: row r is filled with left[r], written as two
// 16-byte stores. 'above' is unused. The 32 left samples are consumed in two
// 16-byte groups, each split into two 8-lane halves of eight rows.
void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  int group;
  (void)above;

  for (group = 0; group < 2; ++group) {
    const uint8x16_t col = vld1q_u8(left + 16 * group);
    uint8x8_t half = vget_low_u8(col);
    int pass;

    for (pass = 0; pass < 2; ++pass) {
      uint8x16_t px;

      // The lane index must be a compile-time constant, so rows are unrolled.
      px = vdupq_lane_u8(half, 0);
      vst1q_u8(dst + 0 * stride, px);
      vst1q_u8(dst + 0 * stride + 16, px);
      px = vdupq_lane_u8(half, 1);
      vst1q_u8(dst + 1 * stride, px);
      vst1q_u8(dst + 1 * stride + 16, px);
      px = vdupq_lane_u8(half, 2);
      vst1q_u8(dst + 2 * stride, px);
      vst1q_u8(dst + 2 * stride + 16, px);
      px = vdupq_lane_u8(half, 3);
      vst1q_u8(dst + 3 * stride, px);
      vst1q_u8(dst + 3 * stride + 16, px);
      px = vdupq_lane_u8(half, 4);
      vst1q_u8(dst + 4 * stride, px);
      vst1q_u8(dst + 4 * stride + 16, px);
      px = vdupq_lane_u8(half, 5);
      vst1q_u8(dst + 5 * stride, px);
      vst1q_u8(dst + 5 * stride + 16, px);
      px = vdupq_lane_u8(half, 6);
      vst1q_u8(dst + 6 * stride, px);
      vst1q_u8(dst + 6 * stride + 16, px);
      px = vdupq_lane_u8(half, 7);
      vst1q_u8(dst + 7 * stride, px);
      vst1q_u8(dst + 7 * stride + 16, px);

      dst += 8 * stride;
      half = vget_high_u8(col);
    }
  }
}
597 
// 4x4 TM predictor: dst[r][c] = left[r] + above[c] - above[-1], saturated to
// the 0..255 range.
// NOTE(review): the uint32_t* casts presumably rely on 'above' and 'dst' rows
// being 4-byte aligned - confirm against callers.
void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x8_t top_left = vld1_dup_u8(above - 1);
  const uint32x2_t top32 =
      vld1_lane_u32((const uint32_t *)above, vdup_n_u32(0), 0);
  // above[c] - above[-1]; the widened difference is used as a signed value.
  const int16x8_t delta =
      vreinterpretq_s16_u16(vsubl_u8(vreinterpret_u8_u32(top32), top_left));
  int row;

  for (row = 0; row < 4; ++row) {
    const int16x8_t sum = vaddq_s16(vdupq_n_s16((int16_t)left[row]), delta);
    // vqmovun_s16 narrows with unsigned saturation.
    vst1_lane_u32((uint32_t *)(dst + row * stride),
                  vreinterpret_u32_u8(vqmovun_s16(sum)), 0);
  }
}
617 
// 8x8 TM predictor: dst[r][c] = left[r] + above[c] - above[-1], saturated to
// the 0..255 range. The eight left samples are widened once and handled as
// two 4-lane halves, four rows per pass.
void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x8_t top_left = vld1_dup_u8(above - 1);
  // above[c] - above[-1]; the widened difference is used as a signed value.
  const int16x8_t delta =
      vreinterpretq_s16_u16(vsubl_u8(vld1_u8(above), top_left));
  const int16x8_t left16 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(left)));
  int16x4_t half = vget_low_s16(left16);
  int pass;

  for (pass = 0; pass < 2; ++pass) {
    // Byte-typed stores: unlike stores through a uint64_t* cast, these imply
    // no alignment stricter than uint8_t for 'dst'. vqmovun_s16 narrows with
    // unsigned saturation. Lane indices must be compile-time constants.
    vst1_u8(dst + 0 * stride,
            vqmovun_s16(vaddq_s16(delta, vdupq_lane_s16(half, 0))));
    vst1_u8(dst + 1 * stride,
            vqmovun_s16(vaddq_s16(delta, vdupq_lane_s16(half, 1))));
    vst1_u8(dst + 2 * stride,
            vqmovun_s16(vaddq_s16(delta, vdupq_lane_s16(half, 2))));
    vst1_u8(dst + 3 * stride,
            vqmovun_s16(vaddq_s16(delta, vdupq_lane_s16(half, 3))));
    dst += 4 * stride;
    half = vget_high_s16(left16);
  }
}
659 
// 16x16 TM predictor: dst[r][c] = left[r] + above[c] - above[-1], saturated
// to the 0..255 range. The left column is consumed eight samples at a time;
// each group is widened and handled as two 4-lane halves, four rows per pass.
void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t top_left = vld1q_dup_u8(above - 1);
  const uint8x16_t top = vld1q_u8(above);
  // above[c] - above[-1] for columns 0..7 and 8..15; the widened differences
  // are used as signed values.
  const int16x8_t delta_lo = vreinterpretq_s16_u16(
      vsubl_u8(vget_low_u8(top), vget_low_u8(top_left)));
  const int16x8_t delta_hi = vreinterpretq_s16_u16(
      vsubl_u8(vget_high_u8(top), vget_high_u8(top_left)));
  int group;

  for (group = 0; group < 2; ++group) {
    const int16x8_t left16 =
        vreinterpretq_s16_u16(vmovl_u8(vld1_u8(left + 8 * group)));
    int16x4_t half = vget_low_s16(left16);
    int pass;

    for (pass = 0; pass < 2; ++pass) {
      // Lane indices must be compile-time constants, so rows are unrolled.
      const int16x8_t l0 = vdupq_lane_s16(half, 0);
      const int16x8_t l1 = vdupq_lane_s16(half, 1);
      const int16x8_t l2 = vdupq_lane_s16(half, 2);
      const int16x8_t l3 = vdupq_lane_s16(half, 3);

      // One byte-typed 16-byte store per row: unlike stores through a
      // uint64_t* cast, this implies no alignment stricter than uint8_t for
      // 'dst'. vqmovun_s16 narrows with unsigned saturation.
      vst1q_u8(dst + 0 * stride,
               vcombine_u8(vqmovun_s16(vaddq_s16(l0, delta_lo)),
                           vqmovun_s16(vaddq_s16(l0, delta_hi))));
      vst1q_u8(dst + 1 * stride,
               vcombine_u8(vqmovun_s16(vaddq_s16(l1, delta_lo)),
                           vqmovun_s16(vaddq_s16(l1, delta_hi))));
      vst1q_u8(dst + 2 * stride,
               vcombine_u8(vqmovun_s16(vaddq_s16(l2, delta_lo)),
                           vqmovun_s16(vaddq_s16(l2, delta_hi))));
      vst1q_u8(dst + 3 * stride,
               vcombine_u8(vqmovun_s16(vaddq_s16(l3, delta_lo)),
                           vqmovun_s16(vaddq_s16(l3, delta_hi))));

      dst += 4 * stride;
      half = vget_high_s16(left16);
    }
  }
}
722 
// Emits one 32-pixel row of a TM prediction and advances '*dst' to the next
// row.
//
// 'left_dup' holds the row's left-border pixel broadcast to all eight lanes.
// 'diff0'..'diff3' hold (above[c] - top_left) for columns 0-7, 8-15, 16-23 and
// 24-31. Each sum is narrowed with unsigned saturation, which clamps the
// result to [0, 255].
//
// The stores use vst1q_u8() on the byte pointer itself so that no alignment
// beyond that of uint8_t is implied for 'dst'.
static void tm_32x32_write_row(uint8_t **dst, ptrdiff_t stride,
                               const int16x8_t left_dup,
                               const int16x8_t diff0, const int16x8_t diff1,
                               const int16x8_t diff2, const int16x8_t diff3) {
  const uint8x8_t p0 = vqmovun_s16(vaddq_s16(left_dup, diff0));
  const uint8x8_t p1 = vqmovun_s16(vaddq_s16(left_dup, diff1));
  const uint8x8_t p2 = vqmovun_s16(vaddq_s16(left_dup, diff2));
  const uint8x8_t p3 = vqmovun_s16(vaddq_s16(left_dup, diff3));

  vst1q_u8(*dst, vcombine_u8(p0, p1));
  vst1q_u8(*dst + 16, vcombine_u8(p2, p3));
  *dst += stride;
}

// TM ("true motion") intra predictor for a 32x32 block:
//   dst[r * stride + c] = clamp(left[r] + above[c] - above[-1], 0, 255)
//
// dst    - top-left pixel of the destination block; 32 rows of 32 bytes are
//          written, 'stride' bytes apart.
// stride - distance in bytes between consecutive destination rows.
// above  - the 32 pixels above the block; above[-1] (the top-left neighbor)
//          is read as well.
// left   - the 32 pixels to the left of the block.
void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t top_left = vld1q_dup_u8(above - 1);
  const uint8x16_t above_lo = vld1q_u8(above);
  const uint8x16_t above_hi = vld1q_u8(above + 16);
  // The widening subtract wraps modulo 2^16; reinterpreting the result as
  // signed yields the true difference, which lies in [-255, 255].
  const int16x8_t diff0 = vreinterpretq_s16_u16(
      vsubl_u8(vget_low_u8(above_lo), vget_low_u8(top_left)));
  const int16x8_t diff1 = vreinterpretq_s16_u16(
      vsubl_u8(vget_high_u8(above_lo), vget_high_u8(top_left)));
  const int16x8_t diff2 = vreinterpretq_s16_u16(
      vsubl_u8(vget_low_u8(above_hi), vget_low_u8(top_left)));
  const int16x8_t diff3 = vreinterpretq_s16_u16(
      vsubl_u8(vget_high_u8(above_hi), vget_high_u8(top_left)));
  int k;

  // Each iteration consumes eight left-border pixels and emits eight rows.
  for (k = 0; k < 4; k++, left += 8) {
    const int16x8_t left_s16 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(left)));
    const int16x4_t left_lo = vget_low_s16(left_s16);
    const int16x4_t left_hi = vget_high_s16(left_s16);

    // The lane index of vdupq_lane_s16() must be a compile-time constant, so
    // the eight rows are spelled out rather than looped over.
    tm_32x32_write_row(&dst, stride, vdupq_lane_s16(left_lo, 0), diff0, diff1,
                       diff2, diff3);
    tm_32x32_write_row(&dst, stride, vdupq_lane_s16(left_lo, 1), diff0, diff1,
                       diff2, diff3);
    tm_32x32_write_row(&dst, stride, vdupq_lane_s16(left_lo, 2), diff0, diff1,
                       diff2, diff3);
    tm_32x32_write_row(&dst, stride, vdupq_lane_s16(left_lo, 3), diff0, diff1,
                       diff2, diff3);
    tm_32x32_write_row(&dst, stride, vdupq_lane_s16(left_hi, 0), diff0, diff1,
                       diff2, diff3);
    tm_32x32_write_row(&dst, stride, vdupq_lane_s16(left_hi, 1), diff0, diff1,
                       diff2, diff3);
    tm_32x32_write_row(&dst, stride, vdupq_lane_s16(left_hi, 2), diff0, diff1,
                       diff2, diff3);
    tm_32x32_write_row(&dst, stride, vdupq_lane_s16(left_hi, 3), diff0, diff1,
                       diff2, diff3);
  }
}
822 #endif  // !HAVE_NEON_ASM
823