1@ This file was created from a .asm file
2@  using the ads2gas.pl script.
3	.syntax unified
4@
5@  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
6@
7@  Use of this source code is governed by a BSD-style license
8@  that can be found in the LICENSE file in the root of the source
9@  tree. An additional intellectual property rights grant can be found
10@  in the file PATENTS.  All contributing project authors may
11@  be found in the AUTHORS file in the root of the source tree.
12@
13
14    .global vpx_v_predictor_4x4_neon
15	.type vpx_v_predictor_4x4_neon, function
16    .global vpx_v_predictor_8x8_neon
17	.type vpx_v_predictor_8x8_neon, function
18    .global vpx_v_predictor_16x16_neon
19	.type vpx_v_predictor_16x16_neon, function
20    .global vpx_v_predictor_32x32_neon
21	.type vpx_v_predictor_32x32_neon, function
22    .global vpx_h_predictor_4x4_neon
23	.type vpx_h_predictor_4x4_neon, function
24    .global vpx_h_predictor_8x8_neon
25	.type vpx_h_predictor_8x8_neon, function
26    .global vpx_h_predictor_16x16_neon
27	.type vpx_h_predictor_16x16_neon, function
28    .global vpx_h_predictor_32x32_neon
29	.type vpx_h_predictor_32x32_neon, function
30    .global vpx_tm_predictor_4x4_neon
31	.type vpx_tm_predictor_4x4_neon, function
32    .global vpx_tm_predictor_8x8_neon
33	.type vpx_tm_predictor_8x8_neon, function
34    .global vpx_tm_predictor_16x16_neon
35	.type vpx_tm_predictor_16x16_neon, function
36    .global vpx_tm_predictor_32x32_neon
37	.type vpx_tm_predictor_32x32_neon, function
38   .arm
39   .eabi_attribute 24, 1 @Tag_ABI_align_needed
40   .eabi_attribute 25, 1 @Tag_ABI_align_preserved
41
42.text
43.p2align 2
44
@ void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
@                               const uint8_t *above,
@                               const uint8_t *left)
@
@ Vertical prediction, 4x4 block: the four bytes at above[0..3] are written
@ to each of four rows, starting at dst and y_stride bytes apart.
@
@ In:       r0 = dst, r1 = y_stride, r2 = above, r3 = left (not read)
@ Clobbers: r0 (advanced by 4 * y_stride), low 32 bits of d0

_vpx_v_predictor_4x4_neon:
	vpx_v_predictor_4x4_neon: @ PROC
    vld1.32             {d0[0]}, [r2]           @ d0[0] = above[0..3]
    .rept 4                                     @ one 4-byte store per row
    vst1.32             {d0[0]}, [r0], r1       @ write row, dst += y_stride
    .endr
    bx                  lr
	.size vpx_v_predictor_4x4_neon, .-vpx_v_predictor_4x4_neon    @ ENDP                @ |vpx_v_predictor_4x4_neon|
62
@ void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
@                               const uint8_t *above,
@                               const uint8_t *left)
@
@ Vertical prediction, 8x8 block: the eight bytes at above[0..7] are written
@ to each of eight rows, starting at dst and y_stride bytes apart.
@
@ In:       r0 = dst, r1 = y_stride, r2 = above, r3 = left (not read)
@ Clobbers: r0 (advanced by 8 * y_stride), d0

_vpx_v_predictor_8x8_neon:
	vpx_v_predictor_8x8_neon: @ PROC
    vld1.8              {d0}, [r2]              @ d0 = above[0..7]
    .rept 8                                     @ one 8-byte store per row
    vst1.8              {d0}, [r0], r1          @ write row, dst += y_stride
    .endr
    bx                  lr
	.size vpx_v_predictor_8x8_neon, .-vpx_v_predictor_8x8_neon    @ ENDP                @ |vpx_v_predictor_8x8_neon|
84
@ void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
@                                 const uint8_t *above,
@                                 const uint8_t *left)
@
@ Vertical prediction, 16x16 block: the sixteen bytes at above[0..15] are
@ written to each of sixteen rows, starting at dst and y_stride bytes apart.
@
@ In:       r0 = dst, r1 = y_stride, r2 = above, r3 = left (not read)
@ Clobbers: r0 (advanced by 16 * y_stride), q0

_vpx_v_predictor_16x16_neon:
	vpx_v_predictor_16x16_neon: @ PROC
    vld1.8              {q0}, [r2]              @ q0 = above[0..15]
    .rept 16                                    @ one 16-byte store per row
    vst1.8              {q0}, [r0], r1          @ write row, dst += y_stride
    .endr
    bx                  lr
	.size vpx_v_predictor_16x16_neon, .-vpx_v_predictor_16x16_neon    @ ENDP                @ |vpx_v_predictor_16x16_neon|
114
@ void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
@                                 const uint8_t *above,
@                                 const uint8_t *left)
@
@ Vertical prediction, 32x32 block: the 32 bytes at above[0..31] are written
@ to each of 32 rows, starting at dst and y_stride bytes apart.
@
@ In:       r0 = dst, r1 = y_stride, r2 = above, r3 = left (not read)
@ Clobbers: r0 (advanced by 32 * y_stride), r2, q0, q1, flags

_vpx_v_predictor_32x32_neon:
	vpx_v_predictor_32x32_neon: @ PROC
    vld1.8              {q0, q1}, [r2]          @ q0:q1 = above[0..31]
    mov                 r2, #2                  @ r2 is reused as pass counter: 2 passes x 16 rows
1:
    .rept 16                                    @ one 32-byte store per row
    vst1.8              {q0, q1}, [r0], r1      @ write row, dst += y_stride
    .endr
    subs                r2, r2, #1
    bgt                 1b
    bx                  lr
	.size vpx_v_predictor_32x32_neon, .-vpx_v_predictor_32x32_neon    @ ENDP                @ |vpx_v_predictor_32x32_neon|
148
@ void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
@                               const uint8_t *above,
@                               const uint8_t *left)
@
@ Horizontal prediction, 4x4 block: row i of the output is filled with the
@ byte left[i]. Rows start at dst and are y_stride bytes apart.
@
@ In:       r0 = dst, r1 = y_stride, r2 = above (not read), r3 = left
@ Clobbers: r0 (advanced by 4 * y_stride), d0, low 32 bits of d1

_vpx_h_predictor_4x4_neon:
	vpx_h_predictor_4x4_neon: @ PROC
    vld1.32             {d1[0]}, [r3]           @ byte lanes 0..3 of d1 = left[0..3]
    .irp lane, 0, 1, 2, 3
    vdup.8              d0, d1[\lane]           @ replicate left[lane] across d0
    vst1.32             {d0[0]}, [r0], r1       @ write 4 bytes, dst += y_stride
    .endr
    bx                  lr
	.size vpx_h_predictor_4x4_neon, .-vpx_h_predictor_4x4_neon    @ ENDP                @ |vpx_h_predictor_4x4_neon|
170
@ void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
@                               const uint8_t *above,
@                               const uint8_t *left)
@
@ Horizontal prediction, 8x8 block: row i of the output is filled with the
@ byte left[i]. Rows start at dst and are y_stride bytes apart.
@
@ In:       r0 = dst, r1 = y_stride, r2 = above (not read), r3 = left
@ Clobbers: r0 (advanced by 8 * y_stride), d0, d1

_vpx_h_predictor_8x8_neon:
	vpx_h_predictor_8x8_neon: @ PROC
    vld1.64             {d1}, [r3]              @ d1 = left[0..7]
    .irp lane, 0, 1, 2, 3, 4, 5, 6, 7
    vdup.8              d0, d1[\lane]           @ replicate left[lane] across d0
    vst1.64             {d0}, [r0], r1          @ write 8 bytes, dst += y_stride
    .endr
    bx                  lr
	.size vpx_h_predictor_8x8_neon, .-vpx_h_predictor_8x8_neon    @ ENDP                @ |vpx_h_predictor_8x8_neon|
200
@ void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
@                                 const uint8_t *above,
@                                 const uint8_t *left)
@
@ Horizontal prediction, 16x16 block: row i of the output is filled with the
@ byte left[i]. Rows start at dst and are y_stride bytes apart.
@
@ In:       r0 = dst, r1 = y_stride, r2 = above (not read), r3 = left
@ Clobbers: r0 (advanced by 16 * y_stride), q0, q1

_vpx_h_predictor_16x16_neon:
	vpx_h_predictor_16x16_neon: @ PROC
    vld1.8              {q1}, [r3]              @ d2 = left[0..7], d3 = left[8..15]

    @ rows 0..7 take their fill byte from d2
    .irp lane, 0, 1, 2, 3, 4, 5, 6, 7
    vdup.8              q0, d2[\lane]
    vst1.8              {q0}, [r0], r1          @ write 16 bytes, dst += y_stride
    .endr

    @ rows 8..15 take their fill byte from d3
    .irp lane, 0, 1, 2, 3, 4, 5, 6, 7
    vdup.8              q0, d3[\lane]
    vst1.8              {q0}, [r0], r1
    .endr
    bx                  lr
	.size vpx_h_predictor_16x16_neon, .-vpx_h_predictor_16x16_neon    @ ENDP                @ |vpx_h_predictor_16x16_neon|
246
@ void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
@                                 const uint8_t *above,
@                                 const uint8_t *left)
@
@ Horizontal prediction, 32x32 block: row i of the output is filled with the
@ byte left[i]. Rows start at dst and are y_stride bytes apart. Each row is
@ written as two 16-byte halves; each of the two passes handles 16 rows.
@
@ In:       r0 = dst, r1 = y_stride, r2 = above (not read), r3 = left
@ Clobbers: r0, r1, r2, r3, q0, q1, flags

_vpx_h_predictor_32x32_neon:
	vpx_h_predictor_32x32_neon: @ PROC
    @ The first half-row store bumps r0 by 16, so the second one only has
    @ to add (y_stride - 16) to land on the next row.
    sub                 r1, r1, #16
    mov                 r2, #2                  @ r2 is reused as pass counter
1:
    vld1.8              {q1}, [r3]!             @ d2:d3 = next 16 left pixels

    @ eight rows filled from d2
    .irp lane, 0, 1, 2, 3, 4, 5, 6, 7
    vdup.8              q0, d2[\lane]
    vst1.8              {q0}, [r0]!             @ columns 0..15
    vst1.8              {q0}, [r0], r1          @ columns 16..31, then next row
    .endr

    @ eight rows filled from d3
    .irp lane, 0, 1, 2, 3, 4, 5, 6, 7
    vdup.8              q0, d3[\lane]
    vst1.8              {q0}, [r0]!
    vst1.8              {q0}, [r0], r1
    .endr

    subs                r2, r2, #1
    bgt                 1b
    bx                  lr
	.size vpx_h_predictor_32x32_neon, .-vpx_h_predictor_32x32_neon    @ ENDP                @ |vpx_h_predictor_32x32_neon|
313
@ void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
@                                const uint8_t *above,
@                                const uint8_t *left)
@
@ TM prediction, 4x4 block:
@   dst[r][c] = saturate_u8(left[r] + above[c] - above[-1])
@ Rows start at dst and are y_stride bytes apart.
@
@ In:       r0 = dst, r1 = y_stride, r2 = above, r3 = left
@ Clobbers: r0, r3, r12, q0, q1, q2, q3

    @ Shared tail for one pair of rows. Expects d2 and d4 to hold the left
    @ pixel of each row replicated across all byte lanes, and q3 to hold
    @ (above - top_left) as 16-bit lanes. Writes both rows, advancing r0.
    .macro tm4x4_emit_two_rows
    vmovl.u8            q1, d2                  @ widen left pixel, row A
    vmovl.u8            q2, d4                  @ widen left pixel, row B
    vadd.s16            q1, q1, q3              @ left + (above - top_left)
    vadd.s16            q2, q2, q3
    vqmovun.s16         d0, q1                  @ saturate to 0..255
    vqmovun.s16         d1, q2
    vst1.32             {d0[0]}, [r0], r1       @ only the low 4 bytes are stored
    vst1.32             {d1[0]}, [r0], r1
    .endm

_vpx_tm_predictor_4x4_neon:
	vpx_tm_predictor_4x4_neon: @ PROC
    @ d0 = top_left (above[-1]) replicated across all byte lanes
    sub                 r12, r2, #1
    vld1.8              {d0[]}, [r12]

    @ low 4 bytes of d2 = above[0..3]
    vld1.32             {d2[0]}, [r2]

    @ q3 = above - top_left, widened to 16 bits
    vsubl.u8            q3, d2, d0

    @ rows 0 and 1
    vld1.8              {d2[]}, [r3]!
    vld1.8              {d4[]}, [r3]!
    tm4x4_emit_two_rows

    @ rows 2 and 3 (the last left pixel is read without advancing r3)
    vld1.8              {d2[]}, [r3]!
    vld1.8              {d4[]}, [r3]
    tm4x4_emit_two_rows

    bx                  lr
	.size vpx_tm_predictor_4x4_neon, .-vpx_tm_predictor_4x4_neon    @ ENDP                @ |vpx_tm_predictor_4x4_neon|
    .purgem tm4x4_emit_two_rows
360
@ void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
@                                const uint8_t *above,
@                                const uint8_t *left)
@
@ TM prediction, 8x8 block:
@   dst[r][c] = saturate_u8(left[r] + above[c] - above[-1])
@ Rows start at dst and are y_stride bytes apart.
@
@ In:       r0 = dst, r1 = y_stride, r2 = above, r3 = left
@ Clobbers: r0, r12, q0, q1, q3, q8, q9, q10, d30

_vpx_tm_predictor_8x8_neon:
	vpx_tm_predictor_8x8_neon: @ PROC
    @ d0 = top_left (above[-1]) replicated across all byte lanes
    sub                 r12, r2, #1
    vld1.8              {d0[]}, [r12]

    @ d30 = left[0..7]
    vld1.8              {d30}, [r3]

    @ d2 = above[0..7]
    vld1.64             {d2}, [r2]

    @ q10 = left widened to 16 bits: d20 = rows 0..3, d21 = rows 4..7
    vmovl.u8            q10, d30

    @ q3 = above - top_left, widened to 16 bits
    vsubl.u8            q3, d2, d0

    @ Four rows per expansion: first from d20, then from d21.
    .irp lrows, d20, d21
    vdup.16             q0, \lrows[0]
    vdup.16             q1, \lrows[1]
    vadd.s16            q0, q3, q0              @ left + (above - top_left)
    vadd.s16            q1, q3, q1

    vdup.16             q8, \lrows[2]
    vdup.16             q9, \lrows[3]
    vadd.s16            q8, q3, q8
    vadd.s16            q9, q3, q9

    vqmovun.s16         d0, q0                  @ saturate to 0..255
    vqmovun.s16         d1, q1
    vqmovun.s16         d2, q8
    vqmovun.s16         d3, q9

    vst1.64             {d0}, [r0], r1
    vst1.64             {d1}, [r0], r1
    vst1.64             {d2}, [r0], r1
    vst1.64             {d3}, [r0], r1
    .endr

    bx                  lr
	.size vpx_tm_predictor_8x8_neon, .-vpx_tm_predictor_8x8_neon    @ ENDP                @ |vpx_tm_predictor_8x8_neon|
433
@ void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
@                                  const uint8_t *above,
@                                  const uint8_t *left)
@
@ TM prediction, 16x16 block:
@   dst[r][c] = saturate_u8(left[r] + above[c] - above[-1])
@ Rows start at dst and are y_stride bytes apart. Exactly 16 bytes are read
@ from left: 8 at the start of each of the two passes.
@
@ In:       r0 = dst, r1 = y_stride, r2 = above, r3 = left
@ Clobbers: r0, r2, r3, r12, q0, q1, q2, q3, q8, d18, q10, q11, flags

_vpx_tm_predictor_16x16_neon:
	vpx_tm_predictor_16x16_neon: @ PROC
    @ d0 = top_left (above[-1]) replicated across all byte lanes
    sub                 r12, r2, #1
    vld1.8              {d0[]}, [r12]

    @ q1 = above[0..15]
    vld1.8              {q1}, [r2]

    @ q2 = above[0..7] - top_left, q3 = above[8..15] - top_left (16-bit lanes)
    vsubl.u8            q2, d2, d0
    vsubl.u8            q3, d3, d0

    @ 8 rows per pass, 2 passes. r2 is reused as the pass counter.
    mov                 r2, #2

1:
    @ Fetch the 8 left pixels for this pass and widen them into q10
    @ (d20 = rows 0..3, d21 = rows 4..7). Loading here rather than at the
    @ end of the previous pass keeps the final pass from reading past
    @ left[15].
    vld1.8              {d18}, [r3]!
    vmovl.u8            q10, d18

    @ Rows 0 and 1.
    vdup.16             q0, d20[0]
    vdup.16             q8, d20[1]
    vadd.s16            q1, q0, q2
    vadd.s16            q0, q0, q3
    vadd.s16            q11, q8, q2
    vadd.s16            q8, q8, q3
    vqmovun.s16         d2, q1
    vqmovun.s16         d3, q0
    vqmovun.s16         d22, q11
    vqmovun.s16         d23, q8
    vdup.16             q0, d20[2]                  @ set up left pixels for rows 2 and 3
    vdup.16             q8, d20[3]
    vst1.64             {d2,d3}, [r0], r1
    vst1.64             {d22,d23}, [r0], r1

    @ Rows 2 and 3.
    vadd.s16            q1, q0, q2
    vadd.s16            q0, q0, q3
    vadd.s16            q11, q8, q2
    vadd.s16            q8, q8, q3
    vqmovun.s16         d2, q1
    vqmovun.s16         d3, q0
    vqmovun.s16         d22, q11
    vqmovun.s16         d23, q8
    vdup.16             q0, d21[0]                  @ set up left pixels for rows 4 and 5
    vdup.16             q8, d21[1]
    vst1.64             {d2,d3}, [r0], r1
    vst1.64             {d22,d23}, [r0], r1

    @ Rows 4 and 5.
    vadd.s16            q1, q0, q2
    vadd.s16            q0, q0, q3
    vadd.s16            q11, q8, q2
    vadd.s16            q8, q8, q3
    vqmovun.s16         d2, q1
    vqmovun.s16         d3, q0
    vqmovun.s16         d22, q11
    vqmovun.s16         d23, q8
    vdup.16             q0, d21[2]                  @ set up left pixels for rows 6 and 7
    vdup.16             q8, d21[3]
    vst1.64             {d2,d3}, [r0], r1
    vst1.64             {d22,d23}, [r0], r1

    @ Rows 6 and 7.
    vadd.s16            q1, q0, q2
    vadd.s16            q0, q0, q3
    vadd.s16            q11, q8, q2
    vadd.s16            q8, q8, q3
    vqmovun.s16         d2, q1
    vqmovun.s16         d3, q0
    vqmovun.s16         d22, q11
    vqmovun.s16         d23, q8
    vst1.64             {d2,d3}, [r0], r1
    vst1.64             {d22,d23}, [r0], r1

    subs                r2, r2, #1
    bgt                 1b

    bx                  lr
	.size vpx_tm_predictor_16x16_neon, .-vpx_tm_predictor_16x16_neon    @ ENDP                @ |vpx_tm_predictor_16x16_neon|
527
@ void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
@                                  const uint8_t *above,
@                                  const uint8_t *left)
@
@ TM prediction, 32x32 block:
@   dst[r][c] = saturate_u8(left[r] + above[c] - above[-1])
@ Rows start at dst and are y_stride bytes apart. Exactly 32 bytes are read
@ from left: 8 at the start of each of the four passes.
@
@ In:       r0 = dst, r1 = y_stride, r2 = above, r3 = left
@ Clobbers: r0, r2, r3, r12, q0-q3, q8-q15, flags

_vpx_tm_predictor_32x32_neon:
	vpx_tm_predictor_32x32_neon: @ PROC
    @ d0 = top_left (above[-1]) replicated across all byte lanes
    sub                 r12, r2, #1
    vld1.8              {d0[]}, [r12]

    @ q1 = above[0..15], q2 = above[16..31]
    vld1.8              {q1}, [r2]!
    vld1.8              {q2}, [r2]

    @ q8..q11 = above - top_left as 16-bit lanes, 8 columns per register
    vsubl.u8            q8, d2, d0
    vsubl.u8            q9, d3, d0
    vsubl.u8            q10, d4, d0
    vsubl.u8            q11, d5, d0

    @ 8 rows per pass, 4 passes. r2 is reused as the pass counter.
    mov                 r2, #4

1:
    @ Fetch the 8 left pixels for this pass and widen them into q3
    @ (d6 = rows 0..3, d7 = rows 4..7). Loading here rather than at the
    @ end of the previous pass keeps the final pass from reading past
    @ left[31].
    vld1.8              {d0}, [r3]!
    vmovl.u8            q3, d0

    @ Rows 0 and 1.
    vdup.16             q0, d6[0]
    vdup.16             q2, d6[1]
    vadd.s16            q12, q0, q8
    vadd.s16            q13, q0, q9
    vadd.s16            q14, q0, q10
    vadd.s16            q15, q0, q11
    vqmovun.s16         d0, q12
    vqmovun.s16         d1, q13
    vadd.s16            q12, q2, q8
    vadd.s16            q13, q2, q9
    vqmovun.s16         d2, q14
    vqmovun.s16         d3, q15
    vadd.s16            q14, q2, q10
    vadd.s16            q15, q2, q11
    vst1.64             {d0-d3}, [r0], r1
    vqmovun.s16         d24, q12
    vqmovun.s16         d25, q13
    vqmovun.s16         d26, q14
    vqmovun.s16         d27, q15
    vdup.16             q1, d6[2]                   @ set up left pixels for rows 2 and 3
    vdup.16             q2, d6[3]
    vst1.64             {d24-d27}, [r0], r1

    @ Rows 2 and 3.
    vadd.s16            q12, q1, q8
    vadd.s16            q13, q1, q9
    vadd.s16            q14, q1, q10
    vadd.s16            q15, q1, q11
    vqmovun.s16         d0, q12
    vqmovun.s16         d1, q13
    vadd.s16            q12, q2, q8
    vadd.s16            q13, q2, q9
    vqmovun.s16         d2, q14
    vqmovun.s16         d3, q15
    vadd.s16            q14, q2, q10
    vadd.s16            q15, q2, q11
    vst1.64             {d0-d3}, [r0], r1
    vqmovun.s16         d24, q12
    vqmovun.s16         d25, q13
    vqmovun.s16         d26, q14
    vqmovun.s16         d27, q15
    vdup.16             q0, d7[0]                   @ set up left pixels for rows 4 and 5
    vdup.16             q2, d7[1]
    vst1.64             {d24-d27}, [r0], r1

    @ Rows 4 and 5.
    vadd.s16            q12, q0, q8
    vadd.s16            q13, q0, q9
    vadd.s16            q14, q0, q10
    vadd.s16            q15, q0, q11
    vqmovun.s16         d0, q12
    vqmovun.s16         d1, q13
    vadd.s16            q12, q2, q8
    vadd.s16            q13, q2, q9
    vqmovun.s16         d2, q14
    vqmovun.s16         d3, q15
    vadd.s16            q14, q2, q10
    vadd.s16            q15, q2, q11
    vst1.64             {d0-d3}, [r0], r1
    vqmovun.s16         d24, q12
    vqmovun.s16         d25, q13
    vqmovun.s16         d26, q14
    vqmovun.s16         d27, q15
    vdup.16             q0, d7[2]                   @ set up left pixels for rows 6 and 7
    vdup.16             q2, d7[3]
    vst1.64             {d24-d27}, [r0], r1

    @ Rows 6 and 7.
    vadd.s16            q12, q0, q8
    vadd.s16            q13, q0, q9
    vadd.s16            q14, q0, q10
    vadd.s16            q15, q0, q11
    vqmovun.s16         d0, q12
    vqmovun.s16         d1, q13
    vadd.s16            q12, q2, q8
    vadd.s16            q13, q2, q9
    vqmovun.s16         d2, q14
    vqmovun.s16         d3, q15
    vadd.s16            q14, q2, q10
    vadd.s16            q15, q2, q11
    vst1.64             {d0-d3}, [r0], r1
    vqmovun.s16         d24, q12
    vqmovun.s16         d25, q13
    vqmovun.s16         d26, q14
    vqmovun.s16         d27, q15
    vst1.64             {d24-d27}, [r0], r1

    subs                r2, r2, #1
    bgt                 1b

    bx                  lr
	.size vpx_tm_predictor_32x32_neon, .-vpx_tm_predictor_32x32_neon    @ ENDP                @ |vpx_tm_predictor_32x32_neon|
657
658	.section	.note.GNU-stack,"",%progbits
659