1@ This file was created from a .asm file
2@  using the ads2gas.pl script.
3	.equ DO1STROUNDING, 0
4	.syntax unified
5@
6@  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
7@
8@  Use of this source code is governed by a BSD-style license
9@  that can be found in the LICENSE file in the root of the source
10@  tree. An additional intellectual property rights grant can be found
11@  in the file PATENTS.  All contributing project authors may
12@  be found in the AUTHORS file in the root of the source tree.
13@
14
15    .global vpx_v_predictor_4x4_neon
16	.type vpx_v_predictor_4x4_neon, function
17    .global vpx_v_predictor_8x8_neon
18	.type vpx_v_predictor_8x8_neon, function
19    .global vpx_v_predictor_16x16_neon
20	.type vpx_v_predictor_16x16_neon, function
21    .global vpx_v_predictor_32x32_neon
22	.type vpx_v_predictor_32x32_neon, function
23    .global vpx_h_predictor_4x4_neon
24	.type vpx_h_predictor_4x4_neon, function
25    .global vpx_h_predictor_8x8_neon
26	.type vpx_h_predictor_8x8_neon, function
27    .global vpx_h_predictor_16x16_neon
28	.type vpx_h_predictor_16x16_neon, function
29    .global vpx_h_predictor_32x32_neon
30	.type vpx_h_predictor_32x32_neon, function
31    .global vpx_tm_predictor_4x4_neon
32	.type vpx_tm_predictor_4x4_neon, function
33    .global vpx_tm_predictor_8x8_neon
34	.type vpx_tm_predictor_8x8_neon, function
35    .global vpx_tm_predictor_16x16_neon
36	.type vpx_tm_predictor_16x16_neon, function
37    .global vpx_tm_predictor_32x32_neon
38	.type vpx_tm_predictor_32x32_neon, function
39   .arm
40   .eabi_attribute 24, 1 @Tag_ABI_align_needed
41   .eabi_attribute 25, 1 @Tag_ABI_align_preserved
42
43.text
44.p2align 2
45
@ void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
@                               const uint8_t *above,
@                               const uint8_t *left)
@
@ Copies the 4 bytes at `above` into each of 4 destination rows.
@
@ In:    r0 = dst
@        r1 = y_stride (byte distance between consecutive rows)
@        r2 = above
@        r3 = left (not read by this routine)
@ Out:   nothing; r0 ends up advanced by 4 * y_stride
@ Clobb: r0, d0

_vpx_v_predictor_4x4_neon:
	vpx_v_predictor_4x4_neon: @ PROC
    vld1.32             {d0[0]}, [r2]           @ d0[0] = above[0..3]
    .rept 4                                     @ one 32-bit store per row
    vst1.32             {d0[0]}, [r0], r1       @ write row, then dst += y_stride
    .endr
    bx                  lr
	.size vpx_v_predictor_4x4_neon, .-vpx_v_predictor_4x4_neon    @ ENDP                @ |vpx_v_predictor_4x4_neon|
63
@ void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
@                               const uint8_t *above,
@                               const uint8_t *left)
@
@ Copies the 8 bytes at `above` into each of 8 destination rows.
@
@ In:    r0 = dst
@        r1 = y_stride (byte distance between consecutive rows)
@        r2 = above
@        r3 = left (not read by this routine)
@ Out:   nothing; r0 ends up advanced by 8 * y_stride
@ Clobb: r0, d0

_vpx_v_predictor_8x8_neon:
	vpx_v_predictor_8x8_neon: @ PROC
    vld1.8              {d0}, [r2]              @ d0 = above[0..7]
    .rept 8                                     @ one 8-byte store per row
    vst1.8              {d0}, [r0], r1          @ write row, then dst += y_stride
    .endr
    bx                  lr
	.size vpx_v_predictor_8x8_neon, .-vpx_v_predictor_8x8_neon    @ ENDP                @ |vpx_v_predictor_8x8_neon|
85
@ void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
@                                 const uint8_t *above,
@                                 const uint8_t *left)
@
@ Copies the 16 bytes at `above` into each of 16 destination rows.
@
@ In:    r0 = dst
@        r1 = y_stride (byte distance between consecutive rows)
@        r2 = above
@        r3 = left (not read by this routine)
@ Out:   nothing; r0 ends up advanced by 16 * y_stride
@ Clobb: r0, q0

_vpx_v_predictor_16x16_neon:
	vpx_v_predictor_16x16_neon: @ PROC
    vld1.8              {q0}, [r2]              @ q0 = above[0..15]
    .rept 16                                    @ one 16-byte store per row
    vst1.8              {q0}, [r0], r1          @ write row, then dst += y_stride
    .endr
    bx                  lr
	.size vpx_v_predictor_16x16_neon, .-vpx_v_predictor_16x16_neon    @ ENDP                @ |vpx_v_predictor_16x16_neon|
115
@ void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
@                                 const uint8_t *above,
@                                 const uint8_t *left)
@
@ Copies the 32 bytes at `above` into each of 32 destination rows.
@
@ In:    r0 = dst
@        r1 = y_stride (byte distance between consecutive rows)
@        r2 = above (the register is reused as the pass counter afterwards)
@        r3 = left (not read by this routine)
@ Out:   nothing; r0 ends up advanced by 32 * y_stride
@ Clobb: r0, r2, q0, q1, flags

_vpx_v_predictor_32x32_neon:
	vpx_v_predictor_32x32_neon: @ PROC
    vld1.8              {d0-d3}, [r2]           @ q0:q1 = above[0..31]
    mov                 r2, #2                  @ 2 passes x 16 rows = 32 rows
.Lv32_pass:
    .rept 16
    vst1.8              {d0-d3}, [r0], r1       @ write row, then dst += y_stride
    .endr
    subs                r2, r2, #1
    bgt                 .Lv32_pass
    bx                  lr
	.size vpx_v_predictor_32x32_neon, .-vpx_v_predictor_32x32_neon    @ ENDP                @ |vpx_v_predictor_32x32_neon|
149
@ void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
@                               const uint8_t *above,
@                               const uint8_t *left)
@
@ Fills row r (r = 0..3) of the 4x4 destination with the byte left[r].
@
@ In:    r0 = dst
@        r1 = y_stride (byte distance between consecutive rows)
@        r2 = above (not read by this routine)
@        r3 = left
@ Out:   nothing; r0 ends up advanced by 4 * y_stride
@ Clobb: r0, d0, d1

_vpx_h_predictor_4x4_neon:
	vpx_h_predictor_4x4_neon: @ PROC
    vld1.32             {d1[0]}, [r3]           @ byte lanes 0..3 of d1 = left[0..3]
    .irp row, 0, 1, 2, 3
    vdup.8              d0, d1[\row]            @ replicate this row's left pixel
    vst1.32             {d0[0]}, [r0], r1       @ store 4 copies, then dst += y_stride
    .endr
    bx                  lr
	.size vpx_h_predictor_4x4_neon, .-vpx_h_predictor_4x4_neon    @ ENDP                @ |vpx_h_predictor_4x4_neon|
171
@ void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
@                               const uint8_t *above,
@                               const uint8_t *left)
@
@ Fills row r (r = 0..7) of the 8x8 destination with the byte left[r].
@
@ In:    r0 = dst
@        r1 = y_stride (byte distance between consecutive rows)
@        r2 = above (not read by this routine)
@        r3 = left
@ Out:   nothing; r0 ends up advanced by 8 * y_stride
@ Clobb: r0, d0, d1

_vpx_h_predictor_8x8_neon:
	vpx_h_predictor_8x8_neon: @ PROC
    vld1.64             {d1}, [r3]              @ d1 = left[0..7]
    .irp row, 0, 1, 2, 3, 4, 5, 6, 7
    vdup.8              d0, d1[\row]            @ replicate this row's left pixel
    vst1.64             {d0}, [r0], r1          @ store 8 copies, then dst += y_stride
    .endr
    bx                  lr
	.size vpx_h_predictor_8x8_neon, .-vpx_h_predictor_8x8_neon    @ ENDP                @ |vpx_h_predictor_8x8_neon|
201
@ void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
@                                 const uint8_t *above,
@                                 const uint8_t *left)
@
@ Fills row r (r = 0..15) of the 16x16 destination with the byte left[r].
@
@ In:    r0 = dst
@        r1 = y_stride (byte distance between consecutive rows)
@        r2 = above (not read by this routine)
@        r3 = left
@ Out:   nothing; r0 ends up advanced by 16 * y_stride
@ Clobb: r0, q0, q1

_vpx_h_predictor_16x16_neon:
	vpx_h_predictor_16x16_neon: @ PROC
    vld1.8              {q1}, [r3]              @ d2 = left[0..7], d3 = left[8..15]

    @ Rows 0..7 take their pixel from d2.
    .irp row, 0, 1, 2, 3, 4, 5, 6, 7
    vdup.8              q0, d2[\row]
    vst1.8              {q0}, [r0], r1
    .endr

    @ Rows 8..15 take their pixel from d3.
    .irp row, 0, 1, 2, 3, 4, 5, 6, 7
    vdup.8              q0, d3[\row]
    vst1.8              {q0}, [r0], r1
    .endr

    bx                  lr
	.size vpx_h_predictor_16x16_neon, .-vpx_h_predictor_16x16_neon    @ ENDP                @ |vpx_h_predictor_16x16_neon|
247
@ void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
@                                 const uint8_t *above,
@                                 const uint8_t *left)
@
@ Fills row r (r = 0..31) of the 32x32 destination with the byte left[r].
@ Each row is written as two 16-byte stores of the same replicated vector.
@
@ In:    r0 = dst
@        r1 = y_stride (byte distance between consecutive rows)
@        r2 = above (not read; the register is reused as the pass counter)
@        r3 = left
@ Out:   nothing; r0 ends up advanced by 32 * y_stride
@ Clobb: r0, r1, r2, r3, q0, q1, flags

_vpx_h_predictor_32x32_neon:
	vpx_h_predictor_32x32_neon: @ PROC
    @ The first store of a row post-increments dst by 16, so the second one
    @ only has to add the remaining (y_stride - 16) to reach the next row.
    sub                 r1, r1, #16
    mov                 r2, #2                  @ 2 passes x 16 rows = 32 rows
.Lh32_pass:
    vld1.8              {q1}, [r3]!             @ d2:d3 = next 16 left pixels

    @ First 8 rows of this pass: pixels from d2.
    .irp row, 0, 1, 2, 3, 4, 5, 6, 7
    vdup.8              q0, d2[\row]
    vst1.8              {q0}, [r0]!             @ columns 0..15
    vst1.8              {q0}, [r0], r1          @ columns 16..31, then next row
    .endr

    @ Last 8 rows of this pass: pixels from d3.
    .irp row, 0, 1, 2, 3, 4, 5, 6, 7
    vdup.8              q0, d3[\row]
    vst1.8              {q0}, [r0]!             @ columns 0..15
    vst1.8              {q0}, [r0], r1          @ columns 16..31, then next row
    .endr

    subs                r2, r2, #1
    bgt                 .Lh32_pass
    bx                  lr
	.size vpx_h_predictor_32x32_neon, .-vpx_h_predictor_32x32_neon    @ ENDP                @ |vpx_h_predictor_32x32_neon|
314
@ void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
@                                const uint8_t *above,
@                                const uint8_t *left)
@
@ For every row r and column c in 0..3:
@     dst[r * y_stride + c] = clamp_to_u8(left[r] + above[c] - above[-1])
@
@ In:    r0 = dst
@        r1 = y_stride (byte distance between consecutive rows)
@        r2 = above (the byte at above[-1] is read as well)
@        r3 = left
@ Out:   nothing; r0 ends up advanced by 4 * y_stride
@ Clobb: r0, r3, r12, q0-q3

_vpx_tm_predictor_4x4_neon:
	vpx_tm_predictor_4x4_neon: @ PROC
    @ d2[0] = above[0..3]; only these four lanes matter because just the low
    @ 32 bits of each result are ever stored.
    vld1.32             {d2[0]}, [r2]

    @ d0 = above[-1] replicated into every byte lane
    sub                 r12, r2, #1
    vld1.8              {d0[]}, [r12]

    @ q3 = above - top_left, widened to 16-bit lanes
    vsubl.u8            q3, d2, d0

    @ Rows 0 and 1: replicate left[r], widen, add the delta, saturate to u8
    vld1.8              {d2[]}, [r3]!
    vld1.8              {d4[]}, [r3]!
    vmovl.u8            q1, d2
    vmovl.u8            q2, d4
    vadd.s16            q1, q1, q3
    vadd.s16            q2, q2, q3
    vqmovun.s16         d0, q1
    vqmovun.s16         d1, q2
    vst1.32             {d0[0]}, [r0], r1
    vst1.32             {d1[0]}, [r0], r1

    @ Rows 2 and 3: same sequence; the final load needs no pointer writeback
    vld1.8              {d2[]}, [r3]!
    vld1.8              {d4[]}, [r3]
    vmovl.u8            q1, d2
    vmovl.u8            q2, d4
    vadd.s16            q1, q1, q3
    vadd.s16            q2, q2, q3
    vqmovun.s16         d0, q1
    vqmovun.s16         d1, q2
    vst1.32             {d0[0]}, [r0], r1
    vst1.32             {d1[0]}, [r0], r1
    bx                  lr
	.size vpx_tm_predictor_4x4_neon, .-vpx_tm_predictor_4x4_neon    @ ENDP                @ |vpx_tm_predictor_4x4_neon|
361
@ void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
@                                const uint8_t *above,
@                                const uint8_t *left)
@
@ For every row r and column c in 0..7:
@     dst[r * y_stride + c] = clamp_to_u8(left[r] + above[c] - above[-1])
@
@ In:    r0 = dst
@        r1 = y_stride (byte distance between consecutive rows)
@        r2 = above (the byte at above[-1] is read as well)
@        r3 = left
@ Out:   nothing; r0 ends up advanced by 8 * y_stride
@ Clobb: r0, r12, q0-q3, q8-q10, d30

_vpx_tm_predictor_8x8_neon:
	vpx_tm_predictor_8x8_neon: @ PROC
    @ d0 = above[-1] replicated into every byte lane
    sub                 r12, r2, #1
    vld1.8              {d0[]}, [r12]

    @ d2 = above[0..7], d30 = left[0..7]
    vld1.64             {d2}, [r2]
    vld1.8              {d30}, [r3]

    @ q3 = above - top_left (16-bit lanes); q10 = left widened to 16 bit,
    @ so d20 holds left[0..3] and d21 holds left[4..7]
    vsubl.u8            q3, d2, d0
    vmovl.u8            q10, d30

    @ Four rows per expansion: first from d20, then from d21.
    .irp lhalf, d20, d21
    vdup.16             q0, \lhalf[0]           @ replicate each row's left pixel
    vdup.16             q1, \lhalf[1]
    vdup.16             q8, \lhalf[2]
    vdup.16             q9, \lhalf[3]

    vadd.s16            q0, q3, q0              @ left[r] + (above - top_left)
    vadd.s16            q1, q3, q1
    vadd.s16            q8, q3, q8
    vadd.s16            q9, q3, q9

    vqmovun.s16         d0, q0                  @ saturate to unsigned 8 bit
    vqmovun.s16         d1, q1
    vqmovun.s16         d2, q8
    vqmovun.s16         d3, q9

    vst1.64             {d0}, [r0], r1
    vst1.64             {d1}, [r0], r1
    vst1.64             {d2}, [r0], r1
    vst1.64             {d3}, [r0], r1
    .endr

    bx                  lr
	.size vpx_tm_predictor_8x8_neon, .-vpx_tm_predictor_8x8_neon    @ ENDP                @ |vpx_tm_predictor_8x8_neon|
434
@ void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
@                                  const uint8_t *above,
@                                  const uint8_t *left)
@
@ For every row r and column c in 0..15:
@     dst[r * y_stride + c] = clamp_to_u8(left[r] + above[c] - above[-1])
@
@ In:    r0 = dst
@        r1 = y_stride (byte distance between consecutive rows)
@        r2 = above (the byte at above[-1] is read as well; the register is
@             reused as the pass counter afterwards)
@        r3 = left
@ Out:   nothing; r0 ends up advanced by 16 * y_stride
@ Clobb: r0, r2, r3, r12, q0-q3, q8-q11, flags
@
@ Exactly left[0..15] is read: each pass fetches its own 8 left pixels at
@ the top of the loop, so nothing is loaded beyond the 16-byte left column.

_vpx_tm_predictor_16x16_neon:
	vpx_tm_predictor_16x16_neon: @ PROC
    @ d0 = above[-1] replicated into every byte lane
    sub                 r12, r2, #1
    vld1.8              {d0[]}, [r12]

    @ q1 = above[0..15]
    vld1.8              {q1}, [r2]

    @ q2 = above[0..7] - top_left, q3 = above[8..15] - top_left (16-bit lanes)
    vsubl.u8            q2, d2, d0
    vsubl.u8            q3, d3, d0

    @ 8 rows per pass, 2 passes.
    mov                 r2, #2

loop_16x16_neon:
    @ Load this pass's 8 left pixels and widen them to 16 bit:
    @ d20 = left[0..3], d21 = left[4..7] of the current group.
    @ Loading at the top of the pass rather than preloading at the bottom
    @ keeps the final pass from reading 8 bytes beyond left[15].
    vld1.8              {d18}, [r3]!
    vmovl.u8            q10, d18

    @ Rows 0 and 1 of this pass.
    vdup.16             q0, d20[0]
    vdup.16             q8, d20[1]
    vadd.s16            q1, q0, q2
    vadd.s16            q0, q0, q3
    vadd.s16            q11, q8, q2
    vadd.s16            q8, q8, q3
    vqmovun.s16         d2, q1
    vqmovun.s16         d3, q0
    vqmovun.s16         d22, q11
    vqmovun.s16         d23, q8
    vdup.16             q0, d20[2]                  @ replicate left for the next 2 rows
    vdup.16             q8, d20[3]
    vst1.64             {d2,d3}, [r0], r1
    vst1.64             {d22,d23}, [r0], r1

    @ Rows 2 and 3 of this pass.
    vadd.s16            q1, q0, q2
    vadd.s16            q0, q0, q3
    vadd.s16            q11, q8, q2
    vadd.s16            q8, q8, q3
    vqmovun.s16         d2, q1
    vqmovun.s16         d3, q0
    vqmovun.s16         d22, q11
    vqmovun.s16         d23, q8
    vdup.16             q0, d21[0]                  @ replicate left for the next 2 rows
    vdup.16             q8, d21[1]
    vst1.64             {d2,d3}, [r0], r1
    vst1.64             {d22,d23}, [r0], r1

    @ Rows 4 and 5 of this pass.
    vadd.s16            q1, q0, q2
    vadd.s16            q0, q0, q3
    vadd.s16            q11, q8, q2
    vadd.s16            q8, q8, q3
    vqmovun.s16         d2, q1
    vqmovun.s16         d3, q0
    vqmovun.s16         d22, q11
    vqmovun.s16         d23, q8
    vdup.16             q0, d21[2]                  @ replicate left for the next 2 rows
    vdup.16             q8, d21[3]
    vst1.64             {d2,d3}, [r0], r1
    vst1.64             {d22,d23}, [r0], r1

    @ Rows 6 and 7 of this pass.
    vadd.s16            q1, q0, q2
    vadd.s16            q0, q0, q3
    vadd.s16            q11, q8, q2
    vadd.s16            q8, q8, q3
    vqmovun.s16         d2, q1
    vqmovun.s16         d3, q0
    vqmovun.s16         d22, q11
    vqmovun.s16         d23, q8
    vst1.64             {d2,d3}, [r0], r1
    vst1.64             {d22,d23}, [r0], r1

    subs                r2, r2, #1
    bgt                 loop_16x16_neon

    bx                  lr
	.size vpx_tm_predictor_16x16_neon, .-vpx_tm_predictor_16x16_neon    @ ENDP                @ |vpx_tm_predictor_16x16_neon|
528
@ void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
@                                  const uint8_t *above,
@                                  const uint8_t *left)
@
@ For every row r and column c in 0..31:
@     dst[r * y_stride + c] = clamp_to_u8(left[r] + above[c] - above[-1])
@
@ In:    r0 = dst
@        r1 = y_stride (byte distance between consecutive rows)
@        r2 = above (the byte at above[-1] is read as well; the register is
@             reused as the pass counter afterwards)
@        r3 = left
@ Out:   nothing; r0 ends up advanced by 32 * y_stride
@ Clobb: r0, r2, r3, r12, q0-q3, q8-q15, flags
@
@ Exactly left[0..31] is read: each pass fetches its own 8 left pixels at
@ the top of the loop, so nothing is loaded beyond the 32-byte left column.

_vpx_tm_predictor_32x32_neon:
	vpx_tm_predictor_32x32_neon: @ PROC
    @ d0 = above[-1] replicated into every byte lane
    sub                 r12, r2, #1
    vld1.8              {d0[]}, [r12]

    @ q1 = above[0..15], q2 = above[16..31]
    vld1.8              {q1}, [r2]!
    vld1.8              {q2}, [r2]

    @ q8..q11 = above - top_left for columns 0-7, 8-15, 16-23, 24-31
    @ (16-bit lanes)
    vsubl.u8            q8, d2, d0
    vsubl.u8            q9, d3, d0
    vsubl.u8            q10, d4, d0
    vsubl.u8            q11, d5, d0

    @ 8 rows per pass, 4 passes.
    mov                 r2, #4

loop_32x32_neon:
    @ Load this pass's 8 left pixels and widen them to 16 bit:
    @ d6 = left[0..3], d7 = left[4..7] of the current group.
    @ d0 is free here (top_left, or the previous rows' output, is already
    @ consumed). Loading at the top of the pass rather than preloading at
    @ the bottom keeps the final pass from reading 8 bytes beyond left[31].
    vld1.8              {d0}, [r3]!
    vmovl.u8            q3, d0

    @ Rows 0 and 1 of this pass.
    vdup.16             q0, d6[0]
    vdup.16             q2, d6[1]
    vadd.s16            q12, q0, q8
    vadd.s16            q13, q0, q9
    vadd.s16            q14, q0, q10
    vadd.s16            q15, q0, q11
    vqmovun.s16         d0, q12
    vqmovun.s16         d1, q13
    vadd.s16            q12, q2, q8
    vadd.s16            q13, q2, q9
    vqmovun.s16         d2, q14
    vqmovun.s16         d3, q15
    vadd.s16            q14, q2, q10
    vadd.s16            q15, q2, q11
    vst1.64             {d0-d3}, [r0], r1
    vqmovun.s16         d24, q12
    vqmovun.s16         d25, q13
    vqmovun.s16         d26, q14
    vqmovun.s16         d27, q15
    vdup.16             q1, d6[2]                   @ replicate left for the next 2 rows
    vdup.16             q2, d6[3]
    vst1.64             {d24-d27}, [r0], r1

    @ Rows 2 and 3 of this pass.
    vadd.s16            q12, q1, q8
    vadd.s16            q13, q1, q9
    vadd.s16            q14, q1, q10
    vadd.s16            q15, q1, q11
    vqmovun.s16         d0, q12
    vqmovun.s16         d1, q13
    vadd.s16            q12, q2, q8
    vadd.s16            q13, q2, q9
    vqmovun.s16         d2, q14
    vqmovun.s16         d3, q15
    vadd.s16            q14, q2, q10
    vadd.s16            q15, q2, q11
    vst1.64             {d0-d3}, [r0], r1
    vqmovun.s16         d24, q12
    vqmovun.s16         d25, q13
    vqmovun.s16         d26, q14
    vqmovun.s16         d27, q15
    vdup.16             q0, d7[0]                   @ replicate left for the next 2 rows
    vdup.16             q2, d7[1]
    vst1.64             {d24-d27}, [r0], r1

    @ Rows 4 and 5 of this pass.
    vadd.s16            q12, q0, q8
    vadd.s16            q13, q0, q9
    vadd.s16            q14, q0, q10
    vadd.s16            q15, q0, q11
    vqmovun.s16         d0, q12
    vqmovun.s16         d1, q13
    vadd.s16            q12, q2, q8
    vadd.s16            q13, q2, q9
    vqmovun.s16         d2, q14
    vqmovun.s16         d3, q15
    vadd.s16            q14, q2, q10
    vadd.s16            q15, q2, q11
    vst1.64             {d0-d3}, [r0], r1
    vqmovun.s16         d24, q12
    vqmovun.s16         d25, q13
    vqmovun.s16         d26, q14
    vqmovun.s16         d27, q15
    vdup.16             q0, d7[2]                   @ replicate left for the next 2 rows
    vdup.16             q2, d7[3]
    vst1.64             {d24-d27}, [r0], r1

    @ Rows 6 and 7 of this pass.
    vadd.s16            q12, q0, q8
    vadd.s16            q13, q0, q9
    vadd.s16            q14, q0, q10
    vadd.s16            q15, q0, q11
    vqmovun.s16         d0, q12
    vqmovun.s16         d1, q13
    vadd.s16            q12, q2, q8
    vadd.s16            q13, q2, q9
    vqmovun.s16         d2, q14
    vqmovun.s16         d3, q15
    vadd.s16            q14, q2, q10
    vadd.s16            q15, q2, q11
    vst1.64             {d0-d3}, [r0], r1
    vqmovun.s16         d24, q12
    vqmovun.s16         d25, q13
    vqmovun.s16         d26, q14
    vqmovun.s16         d27, q15
    vst1.64             {d24-d27}, [r0], r1

    subs                r2, r2, #1
    bgt                 loop_32x32_neon

    bx                  lr
	.size vpx_tm_predictor_32x32_neon, .-vpx_tm_predictor_32x32_neon    @ ENDP                @ |vpx_tm_predictor_32x32_neon|
658
659	.section	.note.GNU-stack,"",%progbits
660