1 /*
2  *  Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #ifndef AOM_AV1_COMMON_ARM_MEM_NEON_H_
12 #define AOM_AV1_COMMON_ARM_MEM_NEON_H_
13 
14 #include <arm_neon.h>
15 #include <string.h>
16 
/* Stores two rows of 8 bytes each: s0 at s and s1 at s + p.
   p is the distance between the two rows, in bytes (s is a uint8_t pointer). */
static INLINE void store_row2_u8_8x8(uint8_t *s, int p, const uint8x8_t s0,
                                     const uint8x8_t s1) {
  vst1_u8(s, s0);
  // s is a by-value copy, so advancing it after the last row had no effect
  // and only formed an extra pointer that might lie outside the buffer.
  vst1_u8(s + p, s1);
}
24 
/* Loads 4 bytes from s into 32-bit lane `lane` of the uint8x8_t that s0
   points to; the other 32-bit lane keeps its previous contents.
   The lane index must be an immediate, so this has to stay a macro.
   The source is only read, so it is accessed through a const-qualified
   pointer; this keeps the macro usable on const buffers without dropping
   the qualifier.
   NOTE(review): the access goes through a uint32_t pointer, which presumably
   requires s to be suitably aligned for uint32_t - confirm at call sites. */
#define load_u8_4x1(s, s0, lane)                                        \
  do {                                                                  \
    *(s0) = vreinterpret_u8_u32(vld1_lane_u32(                          \
        (const uint32_t *)(s), vreinterpret_u32_u8(*(s0)), (lane)));    \
  } while (0)
32 
/* Reads eight rows of 8 bytes each. Row k is taken from s + k * p (p is the
   row pitch in bytes) and written to *s0 (k = 0) through *s7 (k = 7). */
static INLINE void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5,
                               uint8x8_t *const s6, uint8x8_t *const s7) {
  *s0 = vld1_u8(s);
  *s1 = vld1_u8(s + p);
  *s2 = vld1_u8(s + 2 * p);
  *s3 = vld1_u8(s + 3 * p);
  *s4 = vld1_u8(s + 4 * p);
  *s5 = vld1_u8(s + 5 * p);
  *s6 = vld1_u8(s + 6 * p);
  *s7 = vld1_u8(s + 7 * p);
}
54 
/* Reads four rows into *s0..*s3, row k coming from s + k * p (p in bytes).
   Note that each row is a full 16-byte vector (vld1q_u8), i.e. the data
   actually read is four rows of 16 bytes. */
static INLINE void load_u8_8x16(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3) {
  *s0 = vld1q_u8(s);
  *s1 = vld1q_u8(s + p);
  *s2 = vld1q_u8(s + 2 * p);
  *s3 = vld1q_u8(s + 3 * p);
}
66 
/* Reads four rows of 8 bytes each into *s0..*s3; row k is read from
   s + k * p, with p being the row pitch in bytes. */
static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3) {
  *s0 = vld1_u8(s);
  *s1 = vld1_u8(s + p);
  *s2 = vld1_u8(s + 2 * p);
  *s3 = vld1_u8(s + 3 * p);
}
78 
/* Reads four rows of 4 uint16_t values each into *s0..*s3. Row k is read
   from s + k * p, where p is the row pitch in uint16_t elements. */
static INLINE void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3) {
  *s0 = vld1_u16(s);
  *s1 = vld1_u16(s + p);
  *s2 = vld1_u16(s + 2 * p);
  // The pointer is not advanced past the last row: the result was never
  // used and could point beyond the end of the source buffer.
  *s3 = vld1_u16(s + 3 * p);
}
91 
/* Reads four rows of 8 uint16_t values each into *s0..*s3. Row k is read
   from s + k * p, where p is the row pitch in uint16_t elements. */
static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2, uint16x8_t *const s3) {
  *s0 = vld1q_u16(s);
  *s1 = vld1q_u16(s + p);
  *s2 = vld1q_u16(s + 2 * p);
  // The pointer is not advanced past the last row: the result was never
  // used and could point beyond the end of the source buffer.
  *s3 = vld1q_u16(s + 3 * p);
}
104 
/* Reads eight rows of 4 int16_t values each. Row k comes from s + k * p
   (p is the row pitch in int16_t elements) and goes to *s0 .. *s7. */
static INLINE void load_s16_4x8(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4, int16x4_t *const s5,
                                int16x4_t *const s6, int16x4_t *const s7) {
  *s0 = vld1_s16(s);
  *s1 = vld1_s16(s + p);
  *s2 = vld1_s16(s + 2 * p);
  *s3 = vld1_s16(s + 3 * p);
  *s4 = vld1_s16(s + 4 * p);
  *s5 = vld1_s16(s + 5 * p);
  *s6 = vld1_s16(s + 6 * p);
  *s7 = vld1_s16(s + 7 * p);
}
126 
/* Reads four rows of 4 int16_t values each into *s0..*s3; row k is read
   from s + k * p, with p counted in int16_t elements. */
static INLINE void load_s16_4x4(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3) {
  *s0 = vld1_s16(s);
  *s1 = vld1_s16(s + p);
  *s2 = vld1_s16(s + 2 * p);
  *s3 = vld1_s16(s + 3 * p);
}
138 
/* Writes 32-bit lane `lane` (4 bytes) of the uint8x8_t value s0 to s.
   The lane index has to be an immediate operand of the intrinsic, which is
   why this is a macro rather than a function.
   NOTE(review): the store goes through a uint32_t pointer, which presumably
   requires s to be suitably aligned for uint32_t - confirm at call sites. */
#define store_u8_4x1(s, s0, lane)                                       \
  do {                                                                  \
    vst1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8((s0)), (lane));  \
  } while (0)
145 
/* Writes eight rows of 8 bytes each: vector sK goes to s + K * p, where p is
   the row pitch in bytes. */
static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1, const uint8x8_t s2,
                                const uint8x8_t s3, const uint8x8_t s4,
                                const uint8x8_t s5, const uint8x8_t s6,
                                const uint8x8_t s7) {
  vst1_u8(s, s0);
  vst1_u8(s + p, s1);
  vst1_u8(s + 2 * p, s2);
  vst1_u8(s + 3 * p, s3);
  vst1_u8(s + 4 * p, s4);
  vst1_u8(s + 5 * p, s5);
  vst1_u8(s + 6 * p, s6);
  vst1_u8(s + 7 * p, s7);
}
167 
/* Writes four rows of 8 bytes each: vector sK goes to s + K * p, where p is
   the row pitch in bytes. */
static INLINE void store_u8_8x4(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1, const uint8x8_t s2,
                                const uint8x8_t s3) {
  vst1_u8(s, s0);
  vst1_u8(s + p, s1);
  vst1_u8(s + 2 * p, s2);
  vst1_u8(s + 3 * p, s3);
}
179 
/* Writes four rows: vector sK goes to s + K * p, where p is the row pitch in
   bytes. Note that each row is a full 16-byte vector (vst1q_u8), i.e. the
   data actually written is four rows of 16 bytes. */
static INLINE void store_u8_8x16(uint8_t *s, ptrdiff_t p, const uint8x16_t s0,
                                 const uint8x16_t s1, const uint8x16_t s2,
                                 const uint8x16_t s3) {
  vst1q_u8(s, s0);
  vst1q_u8(s + p, s1);
  vst1q_u8(s + 2 * p, s2);
  vst1q_u8(s + 3 * p, s3);
}
191 
/* Writes eight rows of 8 uint16_t values each: vector sK goes to
   s + K * dst_stride, with dst_stride counted in uint16_t elements. */
static INLINE void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3,
                                 const uint16x8_t s4, const uint16x8_t s5,
                                 const uint16x8_t s6, const uint16x8_t s7) {
  vst1q_u16(s, s0);
  vst1q_u16(s + dst_stride, s1);
  vst1q_u16(s + 2 * dst_stride, s2);
  vst1q_u16(s + 3 * dst_stride, s3);
  vst1q_u16(s + 4 * dst_stride, s4);
  vst1q_u16(s + 5 * dst_stride, s5);
  vst1q_u16(s + 6 * dst_stride, s6);
  vst1q_u16(s + 7 * dst_stride, s7);
}
213 
/* Writes four rows of 4 uint16_t values each: vector sK goes to
   s + K * dst_stride, with dst_stride counted in uint16_t elements. */
static INLINE void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x4_t s0, const uint16x4_t s1,
                                 const uint16x4_t s2, const uint16x4_t s3) {
  vst1_u16(s, s0);
  vst1_u16(s + dst_stride, s1);
  vst1_u16(s + 2 * dst_stride, s2);
  vst1_u16(s + 3 * dst_stride, s3);
}
225 
/* Writes four rows of 8 uint16_t values each: vector sK goes to
   s + K * dst_stride, with dst_stride counted in uint16_t elements. */
static INLINE void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3) {
  vst1q_u16(s, s0);
  vst1q_u16(s + dst_stride, s1);
  vst1q_u16(s + 2 * dst_stride, s2);
  vst1q_u16(s + 3 * dst_stride, s3);
}
237 
/* Writes eight rows of 8 int16_t values each: vector sK goes to
   s + K * dst_stride, with dst_stride counted in int16_t elements. */
static INLINE void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1,
                                 const int16x8_t s2, const int16x8_t s3,
                                 const int16x8_t s4, const int16x8_t s5,
                                 const int16x8_t s6, const int16x8_t s7) {
  vst1q_s16(s, s0);
  vst1q_s16(s + dst_stride, s1);
  vst1q_s16(s + 2 * dst_stride, s2);
  vst1q_s16(s + 3 * dst_stride, s3);
  vst1q_s16(s + 4 * dst_stride, s4);
  vst1q_s16(s + 5 * dst_stride, s5);
  vst1q_s16(s + 6 * dst_stride, s6);
  vst1q_s16(s + 7 * dst_stride, s7);
}
259 
/* Writes four rows of 4 int16_t values each: vector sK goes to
   s + K * dst_stride, with dst_stride counted in int16_t elements. */
static INLINE void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x4_t s0, const int16x4_t s1,
                                 const int16x4_t s2, const int16x4_t s3) {
  vst1_s16(s, s0);
  vst1_s16(s + dst_stride, s1);
  vst1_s16(s + 2 * dst_stride, s2);
  vst1_s16(s + 3 * dst_stride, s3);
}
271 
/* Writes four rows of 8 int16_t values each: vector sK goes to
   s + K * dst_stride, with dst_stride counted in int16_t elements. */
static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1,
                                 const int16x8_t s2, const int16x8_t s3) {
  vst1q_s16(s, s0);
  vst1q_s16(s + dst_stride, s1);
  vst1q_s16(s + 2 * dst_stride, s2);
  vst1q_s16(s + 3 * dst_stride, s3);
}
283 
/* Reads eight rows of 8 int16_t values each. Row k comes from s + k * p
   (p is the row pitch in int16_t elements) and goes to *s0 .. *s7. */
static INLINE void load_s16_8x8(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4, int16x8_t *const s5,
                                int16x8_t *const s6, int16x8_t *const s7) {
  *s0 = vld1q_s16(s);
  *s1 = vld1q_s16(s + p);
  *s2 = vld1q_s16(s + 2 * p);
  *s3 = vld1q_s16(s + 3 * p);
  *s4 = vld1q_s16(s + 4 * p);
  *s5 = vld1q_s16(s + 5 * p);
  *s6 = vld1q_s16(s + 6 * p);
  *s7 = vld1q_s16(s + 7 * p);
}
305 
/* Reads four rows of 8 int16_t values each into *s0..*s3; row k is read
   from s + k * p, with p counted in int16_t elements. */
static INLINE void load_s16_8x4(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3) {
  *s0 = vld1q_s16(s);
  *s1 = vld1q_s16(s + p);
  *s2 = vld1q_s16(s + 2 * p);
  *s3 = vld1q_s16(s + 3 * p);
}
317 
/* Gathers eight rows of 4 bytes each, packing two rows per output vector:
   rows 0/1 fill lanes 0/1 of *tu0, rows 2/3 fill *tu1, rows 4/5 fill *tu2
   and rows 6/7 fill *tu3. Row k is read from buf + k * stride (stride in
   bytes). Each row is copied with memcpy into a temporary, so buf does not
   have to be aligned for 32-bit access. */
static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride,
                                         uint32x2_t *tu0, uint32x2_t *tu1,
                                         uint32x2_t *tu2, uint32x2_t *tu3) {
  const ptrdiff_t pitch = stride;
  uint32_t row;

  memcpy(&row, buf, sizeof(row));
  *tu0 = vset_lane_u32(row, *tu0, 0);
  memcpy(&row, buf + pitch, sizeof(row));
  *tu0 = vset_lane_u32(row, *tu0, 1);

  memcpy(&row, buf + 2 * pitch, sizeof(row));
  *tu1 = vset_lane_u32(row, *tu1, 0);
  memcpy(&row, buf + 3 * pitch, sizeof(row));
  *tu1 = vset_lane_u32(row, *tu1, 1);

  memcpy(&row, buf + 4 * pitch, sizeof(row));
  *tu2 = vset_lane_u32(row, *tu2, 0);
  memcpy(&row, buf + 5 * pitch, sizeof(row));
  *tu2 = vset_lane_u32(row, *tu2, 1);

  memcpy(&row, buf + 6 * pitch, sizeof(row));
  *tu3 = vset_lane_u32(row, *tu3, 0);
  memcpy(&row, buf + 7 * pitch, sizeof(row));
  *tu3 = vset_lane_u32(row, *tu3, 1);
}
347 
/* Gathers four rows of 4 bytes each, two rows per output vector: rows 0/1
   fill lanes 0/1 of *tu0 and rows 2/3 fill lanes 0/1 of *tu1. Row k is read
   from buf + k * stride (stride in bytes) via memcpy, so buf does not have
   to be aligned for 32-bit access. */
static INLINE void load_unaligned_u8_4x4(const uint8_t *buf, int stride,
                                         uint32x2_t *tu0, uint32x2_t *tu1) {
  const ptrdiff_t pitch = stride;
  uint32_t row;

  memcpy(&row, buf, sizeof(row));
  *tu0 = vset_lane_u32(row, *tu0, 0);
  memcpy(&row, buf + pitch, sizeof(row));
  *tu0 = vset_lane_u32(row, *tu0, 1);

  memcpy(&row, buf + 2 * pitch, sizeof(row));
  *tu1 = vset_lane_u32(row, *tu1, 0);
  memcpy(&row, buf + 3 * pitch, sizeof(row));
  *tu1 = vset_lane_u32(row, *tu1, 1);
}
364 
/* Copies 4 bytes from buf into lane 0 of *tu0; lane 1 of *tu0 keeps its
   previous value. The bytes are fetched with memcpy, so buf does not have to
   be aligned for 32-bit access.
   stride is not needed for a single row; the parameter is kept so existing
   callers continue to compile. */
static INLINE void load_unaligned_u8_4x1(const uint8_t *buf, int stride,
                                         uint32x2_t *tu0) {
  uint32_t row;

  // Only one row is read, so there is no next row to step to.
  (void)stride;
  memcpy(&row, buf, sizeof(row));
  *tu0 = vset_lane_u32(row, *tu0, 0);
}
373 
/* Gathers two rows of 4 bytes each into *tu0: the row at buf goes to lane 0
   and the row at buf + stride (stride in bytes) goes to lane 1. The bytes
   are fetched with memcpy, so buf does not have to be aligned for 32-bit
   access. */
static INLINE void load_unaligned_u8_4x2(const uint8_t *buf, int stride,
                                         uint32x2_t *tu0) {
  uint32_t row;

  memcpy(&row, buf, sizeof(row));
  *tu0 = vset_lane_u32(row, *tu0, 0);
  // The pointer is not advanced past the second row: that value was never
  // used and could point beyond the end of the source buffer.
  memcpy(&row, buf + stride, sizeof(row));
  *tu0 = vset_lane_u32(row, *tu0, 1);
}
385 
/* Writes 32-bit lane `lane` (4 bytes) of the uint8x8_t value src to dst.
   The lane is extracted into a temporary and copied with memcpy, so dst does
   not have to be aligned for 32-bit access. The lane index has to be an
   immediate, which is why this is a macro.
   The temporary has a macro-specific name so that it cannot capture a caller
   variable: with a temporary simply called `a`, passing a vector or pointer
   named `a` would silently refer to the temporary instead. */
#define store_unaligned_u8_4x1(dst, src, lane)                          \
  do {                                                                  \
    const uint32_t store_unaligned_u8_4x1_word_ =                       \
        vget_lane_u32(vreinterpret_u32_u8((src)), (lane));              \
    memcpy((dst), &store_unaligned_u8_4x1_word_,                        \
           sizeof(store_unaligned_u8_4x1_word_));                       \
  } while (0)
394 
/* Gathers two rows of 2 bytes each into *tu0: the row at buf goes to 16-bit
   lane 0 and the row at buf + stride (stride in bytes) goes to lane 1.
   Lanes 2 and 3 of *tu0 keep their previous values. The bytes are fetched
   with memcpy, so buf does not have to be aligned for 16-bit access. */
static INLINE void load_unaligned_u8_2x2(const uint8_t *buf, int stride,
                                         uint16x4_t *tu0) {
  uint16_t row;

  memcpy(&row, buf, sizeof(row));
  *tu0 = vset_lane_u16(row, *tu0, 0);
  // The pointer is not advanced past the second row: that value was never
  // used and could point beyond the end of the source buffer.
  memcpy(&row, buf + stride, sizeof(row));
  *tu0 = vset_lane_u16(row, *tu0, 1);
}
406 
/* Reads eight rows of 16 bytes each. Row k is taken from s + k * p (p is the
   row pitch in bytes) and written to *s0 (k = 0) through *s7 (k = 7). */
static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3,
                                uint8x16_t *const s4, uint8x16_t *const s5,
                                uint8x16_t *const s6, uint8x16_t *const s7) {
  *s0 = vld1q_u8(s);
  *s1 = vld1q_u8(s + p);
  *s2 = vld1q_u8(s + 2 * p);
  *s3 = vld1q_u8(s + 3 * p);
  *s4 = vld1q_u8(s + 4 * p);
  *s5 = vld1q_u8(s + 5 * p);
  *s6 = vld1q_u8(s + 6 * p);
  *s7 = vld1q_u8(s + 7 * p);
}
428 
/* Reads four rows of 16 bytes each into *s0..*s3; row k is read from
   s + k * p, with p being the row pitch in bytes. */
static INLINE void load_u8_16x4(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3) {
  *s0 = vld1q_u8(s);
  *s1 = vld1q_u8(s + p);
  *s2 = vld1q_u8(s + 2 * p);
  *s3 = vld1q_u8(s + 3 * p);
}
440 
/* Gathers four rows of 4 uint16_t values (8 bytes) each, two rows per output
   vector: rows 0/1 fill 64-bit lanes 0/1 of *tu0 and rows 2/3 fill lanes 0/1
   of *tu1. Consecutive rows are stride uint16_t elements apart. Each row is
   copied with memcpy into a temporary, so buf does not have to be aligned
   for 64-bit access. */
static INLINE void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride,
                                          uint64x2_t *tu0, uint64x2_t *tu1) {
  uint64_t row_bits;

  // Row 0.
  memcpy(&row_bits, buf, sizeof(row_bits));
  *tu0 = vsetq_lane_u64(row_bits, *tu0, 0);
  buf += stride;

  // Row 1.
  memcpy(&row_bits, buf, sizeof(row_bits));
  *tu0 = vsetq_lane_u64(row_bits, *tu0, 1);
  buf += stride;

  // Row 2.
  memcpy(&row_bits, buf, sizeof(row_bits));
  *tu1 = vsetq_lane_u64(row_bits, *tu1, 0);
  buf += stride;

  // Row 3.
  memcpy(&row_bits, buf, sizeof(row_bits));
  *tu1 = vsetq_lane_u64(row_bits, *tu1, 1);
}
457 
/* Reads four rows of 4 int32_t values each into *s1..*s4 (note the outputs
   are numbered from 1). Row k is read from s + k * p, where p is the row
   pitch in int32_t elements.
   The source is only read, so it is taken as a pointer to const; callers
   passing a plain int32_t pointer are unaffected. */
static INLINE void load_s32_4x4(const int32_t *s, int32_t p, int32x4_t *s1,
                                int32x4_t *s2, int32x4_t *s3, int32x4_t *s4) {
  const ptrdiff_t pitch = p;

  *s1 = vld1q_s32(s);
  *s2 = vld1q_s32(s + pitch);
  *s3 = vld1q_s32(s + 2 * pitch);
  *s4 = vld1q_s32(s + 3 * pitch);
}
468 
/* Writes four rows of 4 int32_t values each: s1 goes to s, s2 to s + p,
   s3 to s + 2 * p and s4 to s + 3 * p, with p counted in int32_t elements. */
static INLINE void store_s32_4x4(int32_t *s, int32_t p, int32x4_t s1,
                                 int32x4_t s2, int32x4_t s3, int32x4_t s4) {
  const ptrdiff_t pitch = p;

  vst1q_s32(s, s1);
  vst1q_s32(s + pitch, s2);
  vst1q_s32(s + 2 * pitch, s3);
  vst1q_s32(s + 3 * pitch, s4);
}
479 
/* Reads four rows of 4 uint32_t values each into *s1..*s4 (note the outputs
   are numbered from 1). Row k is read from s + k * p, where p is the row
   pitch in uint32_t elements.
   The source is only read, so it is taken as a pointer to const; callers
   passing a plain uint32_t pointer are unaffected. */
static INLINE void load_u32_4x4(const uint32_t *s, int32_t p, uint32x4_t *s1,
                                uint32x4_t *s2, uint32x4_t *s3,
                                uint32x4_t *s4) {
  const ptrdiff_t pitch = p;

  *s1 = vld1q_u32(s);
  *s2 = vld1q_u32(s + pitch);
  *s3 = vld1q_u32(s + 2 * pitch);
  *s4 = vld1q_u32(s + 3 * pitch);
}
491 
/* Writes four rows of 4 uint32_t values each: s1 goes to s, s2 to s + p,
   s3 to s + 2 * p and s4 to s + 3 * p, with p counted in uint32_t
   elements. */
static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1,
                                 uint32x4_t s2, uint32x4_t s3, uint32x4_t s4) {
  const ptrdiff_t pitch = p;

  vst1q_u32(s, s1);
  vst1q_u32(s + pitch, s2);
  vst1q_u32(s + 2 * pitch, s3);
  vst1q_u32(s + 3 * pitch, s4);
}
502 
503 #endif  // AOM_AV1_COMMON_ARM_MEM_NEON_H_
504