;
;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

;TODO(cd): adjust these constants to be able to use vqdmulh for faster
;          dct_const_round_shift(a * b) within butterfly calculations.
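;          (vqdmulh.s16 computes (2 * a * b) >> 16 with saturation, so it
;          could fold the multiply and round-shift together if the constants
;          were rescaled accordingly.)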
cospi_1_64  EQU 16364
cospi_2_64  EQU 16305
cospi_3_64  EQU 16207
cospi_4_64  EQU 16069
cospi_5_64  EQU 15893
cospi_6_64  EQU 15679
cospi_7_64  EQU 15426
cospi_8_64  EQU 15137
cospi_9_64  EQU 14811
cospi_10_64 EQU 14449
cospi_11_64 EQU 14053
cospi_12_64 EQU 13623
cospi_13_64 EQU 13160
cospi_14_64 EQU 12665
cospi_15_64 EQU 12140
cospi_16_64 EQU 11585
cospi_17_64 EQU 11003
cospi_18_64 EQU 10394
cospi_19_64 EQU  9760
cospi_20_64 EQU  9102
cospi_21_64 EQU  8423
cospi_22_64 EQU  7723
cospi_23_64 EQU  7005
cospi_24_64 EQU  6270
cospi_25_64 EQU  5520
cospi_26_64 EQU  4756
cospi_27_64 EQU  3981
cospi_28_64 EQU  3196
cospi_29_64 EQU  2404
cospi_30_64 EQU  1606
cospi_31_64 EQU   804
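; The constants above are round(cos(k * pi / 64) * (1 << 14)) for k = 1..31,
; i.e. Q14 fixed-point cosines; e.g. cospi_16_64 = round(cos(pi/4) * 16384)
; = 11585.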


    EXPORT  |vp9_idct32x32_1024_add_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

    AREA     Block, CODE, READONLY

    ; --------------------------------------------------------------------------
    ; Load from transposed_buffer
    ;   q14 = transposed_buffer[first_offset]
    ;   q13 = transposed_buffer[second_offset]
    ;   for proper address calculation, the last offset used when manipulating
    ;   transposed_buffer must be passed in. use 0 for first use.
    MACRO
    LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset
    ; address calculation with proper stride and loading
    add r0, #($first_offset  - $prev_offset )*8*2
    vld1.s16        {q14}, [r0]
    add r0, #($second_offset - $first_offset)*8*2
    vld1.s16        {q13}, [r0]
    ; (used) two registers (q14, q13)
    MEND
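    ; A rough C sketch of the relative addressing (only the delta from the
    ; previous offset is added, with rows of eight int16_t coefficients):
    ;   r0 += ($first_offset  - $prev_offset ) * 8 * sizeof(int16_t); q14 = *r0;
    ;   r0 += ($second_offset - $first_offset) * 8 * sizeof(int16_t); q13 = *r0;
    ; LOAD_FROM_OUTPUT and STORE_IN_OUTPUT below use the same delta scheme
    ; with rows of 32 coefficients (hence the *32*2 scaling).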
    ; --------------------------------------------------------------------------
    ; Load from output (used as temporary storage)
    ;   reg1 = output[first_offset]
    ;   reg2 = output[second_offset]
    ;   for proper address calculation, the last offset used when manipulating
    ;   output (whether reading or storing) must be passed in. use 0 for first
    ;   use.
    MACRO
    LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
    ; address calculation with proper stride and loading
    add r1, #($first_offset  - $prev_offset )*32*2
    vld1.s16        {$reg1}, [r1]
    add r1, #($second_offset - $first_offset)*32*2
    vld1.s16        {$reg2}, [r1]
    ; (used) two registers ($reg1, $reg2)
    MEND
    ; --------------------------------------------------------------------------
    ; Store into output (sometimes as temporary storage)
    ;   output[first_offset] = reg1
    ;   output[second_offset] = reg2
    ;   for proper address calculation, the last offset used when manipulating
    ;   output (whether reading or storing) must be passed in. use 0 for first
    ;   use.
    MACRO
    STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
    ; address calculation with proper stride and storing
    add r1, #($first_offset  - $prev_offset )*32*2
    vst1.16 {$reg1}, [r1]
    add r1, #($second_offset - $first_offset)*32*2
    vst1.16 {$reg2}, [r1]
    MEND
    ; --------------------------------------------------------------------------
    ; Combine-add results with current destination content
    ;   q6-q9 contain the results (out[j * 32 + 0-31])
    MACRO
    STORE_COMBINE_CENTER_RESULTS
    ; load dest[j * dest_stride + 0-31]
    vld1.s16        {d8}, [r10], r2
    vld1.s16        {d11}, [r9], r11
    vld1.s16        {d9}, [r10]
    vld1.s16        {d10}, [r9]
    ; ROUND_POWER_OF_TWO
    vrshr.s16       q7, q7, #6
    vrshr.s16       q8, q8, #6
    vrshr.s16       q9, q9, #6
    vrshr.s16       q6, q6, #6
    ; add to dest[j * dest_stride + 0-31]
    vaddw.u8        q7, q7, d9
    vaddw.u8        q8, q8, d10
    vaddw.u8        q9, q9, d11
    vaddw.u8        q6, q6, d8
    ; clip pixel
    vqmovun.s16     d9,  q7
    vqmovun.s16     d10, q8
    vqmovun.s16     d11, q9
    vqmovun.s16     d8,  q6
    ; store back into dest[j * dest_stride + 0-31]
    vst1.16         {d9}, [r10], r11
    vst1.16         {d10}, [r9], r2
    vst1.16         {d8}, [r10]
    vst1.16         {d11}, [r9]
    ; update pointers (by dest_stride * 2)
    sub r9,  r9,  r2, lsl #1
    add r10, r10, r2, lsl #1
    MEND
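    ; In rough C terms, the STORE_COMBINE_* macros perform (a sketch, not the
    ; exact register schedule):
    ;   dest[x] = clip_pixel(dest[x] + ROUND_POWER_OF_TWO(out[x], 6));
    ; where ROUND_POWER_OF_TWO(v, 6) == (v + 32) >> 6 maps to vrshr.s16 #6,
    ; the widening add maps to vaddw.u8, and clip_pixel() (saturate to
    ; [0, 255]) maps to vqmovun.s16.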
    ; --------------------------------------------------------------------------
    ; Combine-add results with current destination content
    ;   q6-q9 contain the results (out[j * 32 + 0-31])
    MACRO
    STORE_COMBINE_CENTER_RESULTS_LAST
    ; load dest[j * dest_stride + 0-31]
    vld1.s16        {d8}, [r10], r2
    vld1.s16        {d11}, [r9], r11
    vld1.s16        {d9}, [r10]
    vld1.s16        {d10}, [r9]
    ; ROUND_POWER_OF_TWO
    vrshr.s16       q7, q7, #6
    vrshr.s16       q8, q8, #6
    vrshr.s16       q9, q9, #6
    vrshr.s16       q6, q6, #6
    ; add to dest[j * dest_stride + 0-31]
    vaddw.u8        q7, q7, d9
    vaddw.u8        q8, q8, d10
    vaddw.u8        q9, q9, d11
    vaddw.u8        q6, q6, d8
    ; clip pixel
    vqmovun.s16     d9,  q7
    vqmovun.s16     d10, q8
    vqmovun.s16     d11, q9
    vqmovun.s16     d8,  q6
    ; store back into dest[j * dest_stride + 0-31]
    vst1.16         {d9}, [r10], r11
    vst1.16         {d10}, [r9], r2
    vst1.16         {d8}, [r10]!
    vst1.16         {d11}, [r9]!
    ; update pointers (by dest_stride * 2)
    sub r9,  r9,  r2, lsl #1
    add r10, r10, r2, lsl #1
    MEND
    ; --------------------------------------------------------------------------
    ; Combine-add results with current destination content
    ;   q4-q7 contain the results (out[j * 32 + 0-31])
    MACRO
    STORE_COMBINE_EXTREME_RESULTS
    ; load dest[j * dest_stride + 0-31]
    vld1.s16        {d4}, [r7], r2
    vld1.s16        {d7}, [r6], r11
    vld1.s16        {d5}, [r7]
    vld1.s16        {d6}, [r6]
    ; ROUND_POWER_OF_TWO
    vrshr.s16       q5, q5, #6
    vrshr.s16       q6, q6, #6
    vrshr.s16       q7, q7, #6
    vrshr.s16       q4, q4, #6
    ; add to dest[j * dest_stride + 0-31]
    vaddw.u8        q5, q5, d5
    vaddw.u8        q6, q6, d6
    vaddw.u8        q7, q7, d7
    vaddw.u8        q4, q4, d4
    ; clip pixel
    vqmovun.s16     d5, q5
    vqmovun.s16     d6, q6
    vqmovun.s16     d7, q7
    vqmovun.s16     d4, q4
    ; store back into dest[j * dest_stride + 0-31]
    vst1.16         {d5}, [r7], r11
    vst1.16         {d6}, [r6], r2
    vst1.16         {d7}, [r6]
    vst1.16         {d4}, [r7]
    ; update pointers (by dest_stride * 2)
    sub r6, r6, r2, lsl #1
    add r7, r7, r2, lsl #1
    MEND
    ; --------------------------------------------------------------------------
    ; Combine-add results with current destination content
    ;   q4-q7 contain the results (out[j * 32 + 0-31])
    MACRO
    STORE_COMBINE_EXTREME_RESULTS_LAST
    ; load dest[j * dest_stride + 0-31]
    vld1.s16        {d4}, [r7], r2
    vld1.s16        {d7}, [r6], r11
    vld1.s16        {d5}, [r7]
    vld1.s16        {d6}, [r6]
    ; ROUND_POWER_OF_TWO
    vrshr.s16       q5, q5, #6
    vrshr.s16       q6, q6, #6
    vrshr.s16       q7, q7, #6
    vrshr.s16       q4, q4, #6
    ; add to dest[j * dest_stride + 0-31]
    vaddw.u8        q5, q5, d5
    vaddw.u8        q6, q6, d6
    vaddw.u8        q7, q7, d7
    vaddw.u8        q4, q4, d4
    ; clip pixel
    vqmovun.s16     d5, q5
    vqmovun.s16     d6, q6
    vqmovun.s16     d7, q7
    vqmovun.s16     d4, q4
    ; store back into dest[j * dest_stride + 0-31]
    vst1.16         {d5}, [r7], r11
    vst1.16         {d6}, [r6], r2
    vst1.16         {d7}, [r6]!
    vst1.16         {d4}, [r7]!
    ; update pointers (by dest_stride * 2)
    sub r6, r6, r2, lsl #1
    add r7, r7, r2, lsl #1
    MEND
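    ; Note: the _LAST variants above differ from the non-_LAST ones only in
    ; the writeback ('!') on the final two stores, which advances the column
    ; pointers by eight pixels for the next band.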
    ; --------------------------------------------------------------------------
    ; Touches q8-q12, q15 (q13-q14 are preserved)
    ; valid output registers are anything but q8-q11
    MACRO
    DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
    ; TODO(cd): have special case to re-use constants when they are similar for
    ;           consecutive butterflies
    ; TODO(cd): have special case when both constants are the same, do the
    ;           additions/subtractions before the multiplies.
    ; generate the constants
    ;   generate scalar constants
    mov             r8,  #$first_constant  & 0xFF00
    mov             r12, #$second_constant & 0xFF00
    add             r8,  #$first_constant  & 0x00FF
    add             r12, #$second_constant & 0x00FF
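    ;   (built in two mov/add steps because an ARM immediate only encodes an
    ;    8-bit value with a rotation, not an arbitrary 16-bit constant)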
    ;   generate vector constants
    vdup.16         d30, r8
    vdup.16         d31, r12
    ; (used) two for inputs (regA-regD), one for constants (q15)
    ; do some multiplications (ordered for maximum latency hiding)
    vmull.s16 q8,  $regC, d30
    vmull.s16 q10, $regA, d31
    vmull.s16 q9,  $regD, d30
    vmull.s16 q11, $regB, d31
    vmull.s16 q12, $regC, d31
    ; (used) five for intermediate (q8-q12), one for constants (q15)
    ; do some additions/subtractions (to free two registers)
    vsub.s32  q8, q8, q10
    vsub.s32  q9, q9, q11
    ; do more multiplications (ordered for maximum latency hiding)
    vmull.s16 q10, $regD, d31
    vmull.s16 q11, $regA, d30
    vmull.s16 q15, $regB, d30
    ; (used) six for intermediate (q8-q12, q15)
    ; do more additions/subtractions
    vadd.s32  q11, q12, q11
    vadd.s32  q10, q10, q15
    ; (used) four for intermediate (q8-q11)
    ; dct_const_round_shift
    vqrshrn.s32 $reg1, q8,  #14
    vqrshrn.s32 $reg2, q9,  #14
    vqrshrn.s32 $reg3, q11, #14
    vqrshrn.s32 $reg4, q10, #14
    ; (used) two q registers for results (i.e. four d registers)
    MEND
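    ; In rough C terms, DO_BUTTERFLY computes the rotation (a sketch of the
    ; math, not of the instruction schedule):
    ;   temp1 = in_cd * $first_constant  - in_ab * $second_constant;
    ;   temp2 = in_cd * $second_constant + in_ab * $first_constant;
    ;   $reg1:$reg2 = dct_const_round_shift(temp1);
    ;   $reg3:$reg4 = dct_const_round_shift(temp2);
    ; where in_cd is $regC:$regD, in_ab is $regA:$regB, and vqrshrn.s32 #14
    ; implements dct_const_round_shift(x) == (x + (1 << 13)) >> 14 with a
    ; saturating narrow.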
    ; --------------------------------------------------------------------------
    ; Touches q8-q12, q15 (q13-q14 are preserved)
    ; valid output registers are anything but q8-q11
    MACRO
    DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
    DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
    MEND
    ; --------------------------------------------------------------------------

;void vp9_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride);
;
;   r0  int16_t *input,
;   r1  uint8_t *dest,
;   r2  int dest_stride)
; loop counters
;   r4  bands loop counter
;   r5  pass loop counter
;   r8  transpose loop counter
; combine-add pointers
;   r6  dest + 31 * dest_stride, descending (30, 29, 28, ...)
;   r7  dest +  0 * dest_stride, ascending  (1, 2, 3, ...)
;   r9  dest + 15 * dest_stride, descending (14, 13, 12, ...)
;   r10 dest + 16 * dest_stride, ascending  (17, 18, 19, ...)
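; A minimal usage sketch from the C side (illustrative only):
;   int16_t coeffs[32 * 32];  /* dequantized coefficients, row-major */
;   vp9_idct32x32_1024_add_neon(coeffs, dest, dest_stride);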

|vp9_idct32x32_1024_add_neon| PROC
    ; This function performs the idct32x32 transform in two passes (the
    ; idct32_pass_loop below).
    ;
    ; Each pass is done by transposing the input and then doing a 1d transform
    ; on the columns. In the first pass, the transposed columns are the
    ; original rows. In the second pass, after the transposition, the columns
    ; are the original columns.
    ; The 1d transform is done by looping over bands of eight columns (the
    ; idct32_bands loop). For each band, the transform input transposition
    ; is done on demand, one band of four 8x8 matrices at a time. The four
    ; matrices are transposed by pairs (the idct32_transpose_pair loop).
    push  {r4-r11}
    vpush {d8-d15}
    ; stack operation
    ; internal buffer that 8 input lines are transposed into before they are
    ; transformed
    ;   int16_t transpose_buffer[32 * 8];
    ;   at sp + [4096, 4607]
    ; results of the first pass (transpose and transform rows)
    ;   int16_t pass1[32 * 32];
    ;   at sp + [0, 2047]
    ; results of the second pass (transpose and transform columns)
    ;   int16_t pass2[32 * 32];
    ;   at sp + [2048, 4095]
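    ;   total: (32*32 + 32*32 + 32*8) * sizeof(int16_t) = 4608 bytes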
    sub sp, sp, #512+2048+2048

    ; r6  = dest + 31 * dest_stride
    ; r7  = dest +  0 * dest_stride
    ; r9  = dest + 15 * dest_stride
    ; r10 = dest + 16 * dest_stride
    rsb r6,  r2, r2, lsl #5
    rsb r9,  r2, r2, lsl #4
    add r10, r1, r2, lsl #4
    mov r7, r1
    add r6, r6, r1
    add r9, r9, r1
    ; r11 = -dest_stride
    neg r11, r2
    ; r3 = input
    mov r3, r0
    ; parameters for first pass
      ; r0 = transpose_buffer[32 * 8]
    add r0, sp, #4096
      ; r1 = pass1[32 * 32]
    mov r1, sp

    mov r5, #0          ; initialize pass loop counter
idct32_pass_loop
    mov r4, #4          ; initialize bands loop counter
idct32_bands_loop
    mov r8, #2          ; initialize transpose loop counter
idct32_transpose_pair_loop
    ; Load two horizontally consecutive 8x8 16bit data matrices. The first one
    ; into q8-q15 and the second one into q0-q7. There is a stride of 64 bytes
    ; per input line, adjusted to 32 because of the two post-increments.
    vld1.s16        {q8},  [r3]!
    vld1.s16        {q0},  [r3]!
    add r3, #32
    vld1.s16        {q9},  [r3]!
    vld1.s16        {q1},  [r3]!
    add r3, #32
    vld1.s16        {q10}, [r3]!
    vld1.s16        {q2},  [r3]!
    add r3, #32
    vld1.s16        {q11}, [r3]!
    vld1.s16        {q3},  [r3]!
    add r3, #32
    vld1.s16        {q12}, [r3]!
    vld1.s16        {q4},  [r3]!
    add r3, #32
    vld1.s16        {q13}, [r3]!
    vld1.s16        {q5},  [r3]!
    add r3, #32
    vld1.s16        {q14}, [r3]!
    vld1.s16        {q6},  [r3]!
    add r3, #32
    vld1.s16        {q15}, [r3]!
    vld1.s16        {q7},  [r3]!

    ; Transpose the two 8x8 16bit data matrices.
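    ; This is the standard hierarchical NEON transpose: vswp exchanges the
    ; off-diagonal 4x4 sub-blocks (64-bit halves), vtrn.32 then transposes
    ; pairs of 32-bit elements and vtrn.16 finishes with adjacent 16-bit
    ; elements.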
    vswp            d17, d24
    vswp            d23, d30
    vswp            d21, d28
    vswp            d19, d26
    vswp            d1,  d8
    vswp            d7,  d14
    vswp            d5,  d12
    vswp            d3,  d10
    vtrn.32         q8,  q10
    vtrn.32         q9,  q11
    vtrn.32         q12, q14
    vtrn.32         q13, q15
    vtrn.32         q0,  q2
    vtrn.32         q1,  q3
    vtrn.32         q4,  q6
    vtrn.32         q5,  q7
    vtrn.16         q8,  q9
    vtrn.16         q10, q11
    vtrn.16         q12, q13
    vtrn.16         q14, q15
    vtrn.16         q0,  q1
    vtrn.16         q2,  q3
    vtrn.16         q4,  q5
    vtrn.16         q6,  q7

    ; Store both matrices after each other. There is a stride of 32, which
    ; adjusts to nothing because of the post-increments.
    vst1.16        {q8},  [r0]!
    vst1.16        {q9},  [r0]!
    vst1.16        {q10}, [r0]!
    vst1.16        {q11}, [r0]!
    vst1.16        {q12}, [r0]!
    vst1.16        {q13}, [r0]!
    vst1.16        {q14}, [r0]!
    vst1.16        {q15}, [r0]!
    vst1.16        {q0},  [r0]!
    vst1.16        {q1},  [r0]!
    vst1.16        {q2},  [r0]!
    vst1.16        {q3},  [r0]!
    vst1.16        {q4},  [r0]!
    vst1.16        {q5},  [r0]!
    vst1.16        {q6},  [r0]!
    vst1.16        {q7},  [r0]!

    ; increment pointers by adjusted stride (not necessary for r0/out)
    ;   go back by 7*32 for the seven lines moved fully by read and add
    ;   go back by 32 for the eighth line, which was only read
    ;   advance by 16*2 to go to the next pair
    sub r3,  r3,  #7*32*2 + 32 - 16*2
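    ;   (net effect: r3 -= 448 bytes, i.e. seven rows of 32 int16_t)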
    ; transpose pair loop processing
    subs r8, r8, #1
    bne idct32_transpose_pair_loop

    ; restore r0/input to its original value
    sub r0, r0, #32*8*2

    ; Instead of doing the transforms stage by stage, it is done by loading
    ; some input values and doing as many stages as possible to minimize the
    ; storing/loading of intermediate results. To fit within registers, the
    ; final coefficients are cut into four blocks:
    ; BLOCK A: 16-19,28-31
    ; BLOCK B: 20-23,24-27
    ; BLOCK C: 8-10,11-15
    ; BLOCK D: 0-3,4-7
    ; Blocks A and C are straight calculation through the various stages. In
    ; block B, further calculations are performed using the results from
    ; block A. In block D, further calculations are performed using the results
    ; from block C and then the final calculations are done using results from
    ; blocks A and B, which have been combined at the end of block B.

    ; --------------------------------------------------------------------------
    ; BLOCK A: 16-19,28-31
    ; --------------------------------------------------------------------------
    ; generate 16,17,30,31
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] *  cospi_1_64;
    ;temp2 = input[1 * 32] *  cospi_1_64 + input[31 * 32] * cospi_31_64;
    ;step1b[16][i] = dct_const_round_shift(temp1);
    ;step1b[31][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 0, 1, 31
    DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64;
    ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64;
    ;step1b[17][i] = dct_const_round_shift(temp1);
    ;step1b[30][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 31, 17, 15
    DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;step2[16] =  step1b[16][i] + step1b[17][i];
    ;step2[17] =  step1b[16][i] - step1b[17][i];
    ;step2[30] = -step1b[30][i] + step1b[31][i];
    ;step2[31] =  step1b[30][i] + step1b[31][i];
    vadd.s16  q4, q0, q1
    vsub.s16  q13, q0, q1
    vadd.s16  q6, q2, q3
    vsub.s16  q14, q2, q3
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64;
    ;temp2 = step1b[30][i] * cospi_4_64  + step1b[17][i] * cospi_28_64;
    ;step3[17] = dct_const_round_shift(temp1);
    ;step3[30] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15
    ; --------------------------------------------------------------------------
    ; generate 18,19,28,29
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64;
    ;temp2 = input[9 * 32] *  cospi_9_64 + input[23 * 32] * cospi_23_64;
    ;step1b[18][i] = dct_const_round_shift(temp1);
    ;step1b[29][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 15, 9, 23
    DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[25 * 32] *  cospi_7_64 - input[7 * 32] * cospi_25_64;
    ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64;
    ;step1b[19][i] = dct_const_round_shift(temp1);
    ;step1b[28][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 23, 25, 7
    DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;step2[18] = -step1b[18][i] + step1b[19][i];
    ;step2[19] =  step1b[18][i] + step1b[19][i];
    ;step2[28] =  step1b[28][i] + step1b[29][i];
    ;step2[29] =  step1b[28][i] - step1b[29][i];
    vsub.s16  q13, q3, q2
    vadd.s16  q3,  q3, q2
    vsub.s16  q14, q1, q0
    vadd.s16  q2,  q1, q0
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = step1b[18][i] * (-cospi_4_64)  - step1b[29][i] * (-cospi_28_64);
    ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64);
    ;step3[29] = dct_const_round_shift(temp1);
    ;step3[18] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1
    ; --------------------------------------------------------------------------
    ; combine 16-19,28-31
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[16] = step1b[16][i] + step1b[19][i];
    ;step1[17] = step1b[17][i] + step1b[18][i];
    ;step1[18] = step1b[17][i] - step1b[18][i];
    ;step1[29] = step1b[30][i] - step1b[29][i];
    ;step1[30] = step1b[30][i] + step1b[29][i];
    ;step1[31] = step1b[31][i] + step1b[28][i];
    vadd.s16  q8,  q4, q2
    vadd.s16  q9,  q5, q0
    vadd.s16  q10, q7, q1
    vadd.s16  q15, q6, q3
    vsub.s16  q13, q5, q0
    vsub.s16  q14, q7, q1
    STORE_IN_OUTPUT 0,  16, 31, q8,  q15
    STORE_IN_OUTPUT 31, 17, 30, q9,  q10
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64;
    ;temp2 = step1b[29][i] * cospi_8_64  + step1b[18][i] * cospi_24_64;
    ;step2[18] = dct_const_round_shift(temp1);
    ;step2[29] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3
    STORE_IN_OUTPUT 30, 29, 18, q1, q0
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[19] = step1b[16][i] - step1b[19][i];
    ;step1[28] = step1b[31][i] - step1b[28][i];
    vsub.s16  q13, q4, q2
    vsub.s16  q14, q6, q3
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64;
    ;temp2 = step1b[28][i] * cospi_8_64  + step1b[19][i] * cospi_24_64;
    ;step2[19] = dct_const_round_shift(temp1);
    ;step2[28] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13
    STORE_IN_OUTPUT 18, 19, 28, q4, q6
    ; --------------------------------------------------------------------------


    ; --------------------------------------------------------------------------
    ; BLOCK B: 20-23,24-27
    ; --------------------------------------------------------------------------
    ; generate 20,21,26,27
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64;
    ;temp2 = input[5 * 32] *  cospi_5_64 + input[27 * 32] * cospi_27_64;
    ;step1b[20][i] = dct_const_round_shift(temp1);
    ;step1b[27][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 7, 5, 27
    DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64;
    ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64;
    ;step1b[21][i] = dct_const_round_shift(temp1);
    ;step1b[26][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 27, 21, 11
    DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;step2[20] =  step1b[20][i] + step1b[21][i];
    ;step2[21] =  step1b[20][i] - step1b[21][i];
    ;step2[26] = -step1b[26][i] + step1b[27][i];
    ;step2[27] =  step1b[26][i] + step1b[27][i];
    vsub.s16  q13, q0, q1
    vadd.s16  q0, q0, q1
    vsub.s16  q14, q2, q3
    vadd.s16  q2, q2, q3
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64;
    ;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64;
    ;step3[21] = dct_const_round_shift(temp1);
    ;step3[26] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; generate 22,23,24,25
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64;
    ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64;
    ;step1b[22][i] = dct_const_round_shift(temp1);
    ;step1b[25][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 11, 13, 19
    DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[29 * 32] *  cospi_3_64 - input[3 * 32] * cospi_29_64;
    ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64;
    ;step1b[23][i] = dct_const_round_shift(temp1);
    ;step1b[24][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 19, 29, 3
    DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;step2[22] = -step1b[22][i] + step1b[23][i];
    ;step2[23] =  step1b[22][i] + step1b[23][i];
    ;step2[24] =  step1b[24][i] + step1b[25][i];
    ;step2[25] =  step1b[24][i] - step1b[25][i];
    vsub.s16  q14, q4, q5
    vadd.s16  q5, q4, q5
    vsub.s16  q13, q6, q7
    vadd.s16  q6, q6, q7
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64);
    ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64);
    ;step3[25] = dct_const_round_shift(temp1);
    ;step3[22] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15
    ; --------------------------------------------------------------------------
    ; combine 20-23,24-27
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[22] = step1b[22][i] + step1b[21][i];
    ;step1[23] = step1b[23][i] + step1b[20][i];
    vadd.s16  q10, q7, q1
    vadd.s16  q11, q5, q0
    ;step1[24] = step1b[24][i] + step1b[27][i];
    ;step1[25] = step1b[25][i] + step1b[26][i];
    vadd.s16  q12, q6, q2
    vadd.s16  q15, q4, q3
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;step3[16] = step1b[16][i] + step1b[23][i];
    ;step3[17] = step1b[17][i] + step1b[22][i];
    ;step3[22] = step1b[17][i] - step1b[22][i];
    ;step3[23] = step1b[16][i] - step1b[23][i];
    LOAD_FROM_OUTPUT 28, 16, 17, q14, q13
    vadd.s16  q8,  q14, q11
    vadd.s16  q9,  q13, q10
    vsub.s16  q13, q13, q10
    vsub.s16  q11, q14, q11
    STORE_IN_OUTPUT 17, 17, 16, q9, q8
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;step3[24] = step1b[31][i] - step1b[24][i];
    ;step3[25] = step1b[30][i] - step1b[25][i];
    ;step3[30] = step1b[30][i] + step1b[25][i];
    ;step3[31] = step1b[31][i] + step1b[24][i];
    LOAD_FROM_OUTPUT 16, 30, 31, q14, q9
    vsub.s16  q8,  q9,  q12
    vadd.s16  q10, q14, q15
    vsub.s16  q14, q14, q15
    vadd.s16  q12, q9,  q12
    STORE_IN_OUTPUT 31, 30, 31, q10, q12
    ; --------------------------------------------------------------------------
    ; TODO(cd) do some register allocation change to remove these push/pop
    vpush {q8}  ; [24]
    vpush {q11} ; [23]
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64;
    ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64;
    ;step1[22] = dct_const_round_shift(temp1);
    ;step1[25] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
    STORE_IN_OUTPUT 31, 25, 22, q14, q13
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64;
    ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64;
    ;step1[23] = dct_const_round_shift(temp1);
    ;step1[24] = dct_const_round_shift(temp2);
    ; TODO(cd) do some register allocation change to remove these push/pop
    vpop  {q13} ; [23]
    vpop  {q14} ; [24]
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
    STORE_IN_OUTPUT 22, 24, 23, q14, q13
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[20] = step1b[23][i] - step1b[20][i];
    ;step1[27] = step1b[24][i] - step1b[27][i];
    vsub.s16  q14, q5, q0
    vsub.s16  q13, q6, q2
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;temp1 = step1b[20][i] * (-cospi_8_64)  - step1b[27][i] * (-cospi_24_64);
    ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64);
    ;step2[27] = dct_const_round_shift(temp1);
    ;step2[20] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[21] = step1b[22][i] - step1b[21][i];
    ;step1[26] = step1b[25][i] - step1b[26][i];
    vsub.s16  q14,  q7, q1
    vsub.s16  q13,  q4, q3
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;temp1 = step1b[21][i] * (-cospi_8_64)  - step1b[26][i] * (-cospi_24_64);
    ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64);
    ;step2[26] = dct_const_round_shift(temp1);
    ;step2[21] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;step3[18] = step1b[18][i] + step1b[21][i];
    ;step3[19] = step1b[19][i] + step1b[20][i];
    ;step3[20] = step1b[19][i] - step1b[20][i];
    ;step3[21] = step1b[18][i] - step1b[21][i];
    LOAD_FROM_OUTPUT 23, 18, 19, q14, q13
    vadd.s16  q8,  q14, q1
    vadd.s16  q9,  q13, q6
    vsub.s16  q13, q13, q6
    vsub.s16  q1,  q14, q1
    STORE_IN_OUTPUT 19, 18, 19, q8, q9
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;step3[27] = step1b[28][i] - step1b[27][i];
    ;step3[28] = step1b[28][i] + step1b[27][i];
    ;step3[29] = step1b[29][i] + step1b[26][i];
    ;step3[26] = step1b[29][i] - step1b[26][i];
    LOAD_FROM_OUTPUT 19, 28, 29, q8, q9
    vsub.s16  q14, q8, q5
    vadd.s16  q10, q8, q5
    vadd.s16  q11, q9, q0
    vsub.s16  q0, q9, q0
    STORE_IN_OUTPUT 29, 28, 29, q10, q11
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64;
    ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64;
    ;step1[20] = dct_const_round_shift(temp1);
    ;step1[27] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
    STORE_IN_OUTPUT 29, 20, 27, q13, q14
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64;
    ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64;
    ;step1[21] = dct_const_round_shift(temp1);
    ;step1[26] = dct_const_round_shift(temp2);
    DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1
    STORE_IN_OUTPUT 27, 21, 26, q1, q0
    ; --------------------------------------------------------------------------


    ; --------------------------------------------------------------------------
    ; BLOCK C: 8-10,11-15
    ; --------------------------------------------------------------------------
    ; generate 8,9,14,15
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64;
    ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64;
    ;step2[8] = dct_const_round_shift(temp1);
    ;step2[15] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 3, 2, 30
    DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64;
    ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64;
    ;step2[9] = dct_const_round_shift(temp1);
    ;step2[14] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 30, 18, 14
    DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;step3[8] = step1b[8][i] + step1b[9][i];
    ;step3[9] = step1b[8][i] - step1b[9][i];
    ;step3[14] = step1b[15][i] - step1b[14][i];
    ;step3[15] = step1b[15][i] + step1b[14][i];
    vsub.s16  q13, q0, q1
    vadd.s16  q0, q0, q1
    vsub.s16  q14, q2, q3
    vadd.s16  q2, q2, q3
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64;
    ;temp2 = step1b[14][i] * cospi_8_64  + step1b[9][i] * cospi_24_64;
    ;step1[9]  = dct_const_round_shift(temp1);
    ;step1[14] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; generate 10,11,12,13
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64;
    ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64;
    ;step2[10] = dct_const_round_shift(temp1);
    ;step2[13] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 14, 10, 22
    DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64;
    ;temp2 = input[26 * 32] * cospi_26_64 + input[6 * 32] * cospi_6_64;
    ;step2[11] = dct_const_round_shift(temp1);
    ;step2[12] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 22, 26, 6
    DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;step3[10] = step1b[11][i] - step1b[10][i];
    ;step3[11] = step1b[11][i] + step1b[10][i];
    ;step3[12] = step1b[12][i] + step1b[13][i];
    ;step3[13] = step1b[12][i] - step1b[13][i];
    vsub.s16  q14, q4, q5
    vadd.s16  q5, q4, q5
    vsub.s16  q13, q6, q7
    vadd.s16  q6, q6, q7
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;temp1 = step1b[10][i] * (-cospi_8_64)  - step1b[13][i] * (-cospi_24_64);
    ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64);
    ;step1[13] = dct_const_round_shift(temp1);
    ;step1[10] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15
    ; --------------------------------------------------------------------------
    ; combine 8-10,11-15
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;step2[8]  = step1b[8][i] + step1b[11][i];
    ;step2[9]  = step1b[9][i] + step1b[10][i];
    ;step2[10] = step1b[9][i] - step1b[10][i];
    vadd.s16  q8,  q0, q5
    vadd.s16  q9,  q1, q7
    vsub.s16  q13, q1, q7
    ;step2[13] = step1b[14][i] - step1b[13][i];
    ;step2[14] = step1b[14][i] + step1b[13][i];
    ;step2[15] = step1b[15][i] + step1b[12][i];
    vsub.s16  q14, q3, q4
    vadd.s16  q10, q3, q4
    vadd.s16  q15, q2, q6
    STORE_IN_OUTPUT 26, 8, 15, q8, q15
    STORE_IN_OUTPUT 15, 9, 14, q9, q10
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64;
    ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64;
    ;step3[10] = dct_const_round_shift(temp1);
    ;step3[13] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
    STORE_IN_OUTPUT 14, 13, 10, q3, q1
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;step2[11] = step1b[8][i] - step1b[11][i];
    ;step2[12] = step1b[15][i] - step1b[12][i];
    vsub.s16  q13, q0, q5
    vsub.s16  q14,  q2, q6
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64;
    ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64;
    ;step3[11] = dct_const_round_shift(temp1);
    ;step3[12] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
    STORE_IN_OUTPUT 10, 11, 12, q1, q3
    ; --------------------------------------------------------------------------


    ; --------------------------------------------------------------------------
    ; BLOCK D: 0-3,4-7
    ; --------------------------------------------------------------------------
    ; generate 4,5,6,7
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64;
    ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64;
    ;step3[4] = dct_const_round_shift(temp1);
    ;step3[7] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 6, 4, 28
    DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = input[20 * 32] * cospi_12_64 - input[12 * 32] * cospi_20_64;
    ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64;
    ;step3[5] = dct_const_round_shift(temp1);
    ;step3[6] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 28, 20, 12
    DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[4] = step1b[4][i] + step1b[5][i];
    ;step1[5] = step1b[4][i] - step1b[5][i];
    ;step1[6] = step1b[7][i] - step1b[6][i];
    ;step1[7] = step1b[7][i] + step1b[6][i];
    vsub.s16  q13, q0, q1
    vadd.s16  q0, q0, q1
    vsub.s16  q14, q2, q3
    vadd.s16  q2, q2, q3
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64;
    ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64;
    ;step2[5] = dct_const_round_shift(temp1);
    ;step2[6] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; generate 0,1,2,3
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64;
    ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64;
    ;step1[1] = dct_const_round_shift(temp1);
    ;step1[0] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 12, 0, 16
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64;
    ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64;
    ;step1[2] = dct_const_round_shift(temp1);
    ;step1[3] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 16, 8, 24
    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;step2[0] = step1b[0][i] + step1b[3][i];
    ;step2[1] = step1b[1][i] + step1b[2][i];
    ;step2[2] = step1b[1][i] - step1b[2][i];
    ;step2[3] = step1b[0][i] - step1b[3][i];
    vadd.s16  q4, q7, q6
    vsub.s16  q7, q7, q6
    vsub.s16  q6, q5, q14
    vadd.s16  q5, q5, q14
    ; --------------------------------------------------------------------------
    ; combine 0-3,4-7
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;step3[0] = step1b[0][i] + step1b[7][i];
    ;step3[1] = step1b[1][i] + step1b[6][i];
    ;step3[2] = step1b[2][i] + step1b[5][i];
    ;step3[3] = step1b[3][i] + step1b[4][i];
    vadd.s16  q8,  q4, q2
    vadd.s16  q9,  q5, q3
    vadd.s16  q10, q6, q1
    vadd.s16  q11, q7, q0
    ;step3[4] = step1b[3][i] - step1b[4][i];
    ;step3[5] = step1b[2][i] - step1b[5][i];
    ;step3[6] = step1b[1][i] - step1b[6][i];
    ;step3[7] = step1b[0][i] - step1b[7][i];
    vsub.s16  q12, q7, q0
    vsub.s16  q13, q6, q1
    vsub.s16  q14, q5, q3
    vsub.s16  q15, q4, q2
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[0] = step1b[0][i] + step1b[15][i];
    ;step1[1] = step1b[1][i] + step1b[14][i];
    ;step1[14] = step1b[1][i] - step1b[14][i];
    ;step1[15] = step1b[0][i] - step1b[15][i];
    LOAD_FROM_OUTPUT 12, 14, 15, q0, q1
    vadd.s16  q2, q8, q1
    vadd.s16  q3, q9, q0
    vsub.s16  q4, q9, q0
    vsub.s16  q5, q8, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[14 * 32] = step1b[14][i] + step1b[17][i];
    ;output[15 * 32] = step1b[15][i] + step1b[16][i];
    ;output[16 * 32] = step1b[15][i] - step1b[16][i];
    ;output[17 * 32] = step1b[14][i] - step1b[17][i];
    LOAD_FROM_OUTPUT 15, 16, 17, q0, q1
    vadd.s16  q8, q4, q1
    vadd.s16  q9, q5, q0
    vsub.s16  q6, q5, q0
    vsub.s16  q7, q4, q1

    cmp r5, #0
    bgt idct32_bands_end_2nd_pass

idct32_bands_end_1st_pass
    STORE_IN_OUTPUT 17, 16, 17, q6, q7
    STORE_IN_OUTPUT 17, 14, 15, q8, q9
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
    ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
    ;output[30 * 32] = step1b[1][i] - step1b[30][i];
    ;output[31 * 32] = step1b[0][i] - step1b[31][i];
    LOAD_FROM_OUTPUT 15, 30, 31, q0, q1
    vadd.s16  q4, q2, q1
    vadd.s16  q5, q3, q0
    vsub.s16  q6, q3, q0
    vsub.s16  q7, q2, q1
    STORE_IN_OUTPUT 31, 30, 31, q6, q7
    STORE_IN_OUTPUT 31,  0,  1, q4, q5
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[2] = step1b[2][i] + step1b[13][i];
    ;step1[3] = step1b[3][i] + step1b[12][i];
    ;step1[12] = step1b[3][i] - step1b[12][i];
    ;step1[13] = step1b[2][i] - step1b[13][i];
    LOAD_FROM_OUTPUT 1, 12, 13, q0, q1
    vadd.s16  q2, q10, q1
    vadd.s16  q3, q11, q0
    vsub.s16  q4, q11, q0
    vsub.s16  q5, q10, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[12 * 32] = step1b[12][i] + step1b[19][i];
    ;output[13 * 32] = step1b[13][i] + step1b[18][i];
    ;output[18 * 32] = step1b[13][i] - step1b[18][i];
    ;output[19 * 32] = step1b[12][i] - step1b[19][i];
    LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
    vadd.s16  q8, q4, q1
    vadd.s16  q9, q5, q0
    vsub.s16  q6, q5, q0
    vsub.s16  q7, q4, q1
    STORE_IN_OUTPUT 19, 18, 19, q6, q7
    STORE_IN_OUTPUT 19, 12, 13, q8, q9
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
    ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
    ;output[28 * 32] = step1b[3][i] - step1b[28][i];
    ;output[29 * 32] = step1b[2][i] - step1b[29][i];
    LOAD_FROM_OUTPUT 13, 28, 29, q0, q1
    vadd.s16  q4, q2, q1
    vadd.s16  q5, q3, q0
    vsub.s16  q6, q3, q0
    vsub.s16  q7, q2, q1
    STORE_IN_OUTPUT 29, 28, 29, q6, q7
    STORE_IN_OUTPUT 29,  2,  3, q4, q5
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[4] = step1b[4][i] + step1b[11][i];
    ;step1[5] = step1b[5][i] + step1b[10][i];
    ;step1[10] = step1b[5][i] - step1b[10][i];
    ;step1[11] = step1b[4][i] - step1b[11][i];
    LOAD_FROM_OUTPUT 3, 10, 11, q0, q1
    vadd.s16  q2, q12, q1
    vadd.s16  q3, q13, q0
    vsub.s16  q4, q13, q0
    vsub.s16  q5, q12, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[10 * 32] = step1b[10][i] + step1b[21][i];
    ;output[11 * 32] = step1b[11][i] + step1b[20][i];
    ;output[20 * 32] = step1b[11][i] - step1b[20][i];
    ;output[21 * 32] = step1b[10][i] - step1b[21][i];
    LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
    vadd.s16  q8, q4, q1
    vadd.s16  q9, q5, q0
    vsub.s16  q6, q5, q0
    vsub.s16  q7, q4, q1
    STORE_IN_OUTPUT 21, 20, 21, q6, q7
    STORE_IN_OUTPUT 21, 10, 11, q8, q9
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
    ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
    ;output[26 * 32] = step1b[5][i] - step1b[26][i];
    ;output[27 * 32] = step1b[4][i] - step1b[27][i];
    LOAD_FROM_OUTPUT 11, 26, 27, q0, q1
    vadd.s16  q4, q2, q1
    vadd.s16  q5, q3, q0
    vsub.s16  q6, q3, q0
    vsub.s16  q7, q2, q1
    STORE_IN_OUTPUT 27, 26, 27, q6, q7
    STORE_IN_OUTPUT 27,  4,  5, q4, q5
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[6] = step1b[6][i] + step1b[9][i];
    ;step1[7] = step1b[7][i] + step1b[8][i];
    ;step1[8] = step1b[7][i] - step1b[8][i];
    ;step1[9] = step1b[6][i] - step1b[9][i];
    LOAD_FROM_OUTPUT 5, 8, 9, q0, q1
    vadd.s16  q2, q14, q1
    vadd.s16  q3, q15, q0
    vsub.s16  q4, q15, q0
    vsub.s16  q5, q14, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
    ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
    ;output[22 * 32] = step1b[9][i] - step1b[22][i];
    ;output[23 * 32] = step1b[8][i] - step1b[23][i];
    LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
    vadd.s16  q8, q4, q1
    vadd.s16  q9, q5, q0
    vsub.s16  q6, q5, q0
    vsub.s16  q7, q4, q1
    STORE_IN_OUTPUT 23, 22, 23, q6, q7
    STORE_IN_OUTPUT 23, 8, 9, q8, q9
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
    ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
    ;output[24 * 32] = step1b[7][i] - step1b[24][i];
    ;output[25 * 32] = step1b[6][i] - step1b[25][i];
    LOAD_FROM_OUTPUT 9, 24, 25, q0, q1
    vadd.s16  q4, q2, q1
    vadd.s16  q5, q3, q0
    vsub.s16  q6, q3, q0
    vsub.s16  q7, q2, q1
    STORE_IN_OUTPUT 25, 24, 25, q6, q7
    STORE_IN_OUTPUT 25,  6,  7, q4, q5

    ; restore r0 by removing the last offset from the last
    ;     operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
    sub r0, r0, #24*8*2
    ; restore r1 by removing the last offset from the last
    ;     operation (STORE_IN_OUTPUT 25,  6,  7) => 7*32*2
    ; advance by 8 columns => 8*2
    sub r1, r1, #7*32*2 - 8*2
    ;   advance by 8 lines (8*32*2)
    ;   go back by the two pairs from the loop (32*2)
    add r3, r3, #8*32*2 - 32*2

    ; bands loop processing
    subs r4, r4, #1
    bne idct32_bands_loop

    ; parameters for second pass
    ; the input of pass2 is the result of pass1. we have to remove the offset
    ;   of 32 columns induced by the above idct32_bands_loop
    sub r3, r1, #32*2
      ; r1 = pass2[32 * 32]
    add r1, sp, #2048

    ; pass loop processing
    add r5, r5, #1
    b idct32_pass_loop

idct32_bands_end_2nd_pass
    STORE_COMBINE_CENTER_RESULTS
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
    ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
    ;output[30 * 32] = step1b[1][i] - step1b[30][i];
    ;output[31 * 32] = step1b[0][i] - step1b[31][i];
    LOAD_FROM_OUTPUT 17, 30, 31, q0, q1
    vadd.s16  q4, q2, q1
    vadd.s16  q5, q3, q0
    vsub.s16  q6, q3, q0
    vsub.s16  q7, q2, q1
    STORE_COMBINE_EXTREME_RESULTS
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[2] = step1b[2][i] + step1b[13][i];
    ;step1[3] = step1b[3][i] + step1b[12][i];
    ;step1[12] = step1b[3][i] - step1b[12][i];
    ;step1[13] = step1b[2][i] - step1b[13][i];
    LOAD_FROM_OUTPUT 31, 12, 13, q0, q1
    vadd.s16  q2, q10, q1
    vadd.s16  q3, q11, q0
    vsub.s16  q4, q11, q0
    vsub.s16  q5, q10, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[12 * 32] = step1b[12][i] + step1b[19][i];
    ;output[13 * 32] = step1b[13][i] + step1b[18][i];
    ;output[18 * 32] = step1b[13][i] - step1b[18][i];
    ;output[19 * 32] = step1b[12][i] - step1b[19][i];
    LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
    vadd.s16  q8, q4, q1
    vadd.s16  q9, q5, q0
    vsub.s16  q6, q5, q0
    vsub.s16  q7, q4, q1
    STORE_COMBINE_CENTER_RESULTS
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
    ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
    ;output[28 * 32] = step1b[3][i] - step1b[28][i];
    ;output[29 * 32] = step1b[2][i] - step1b[29][i];
    LOAD_FROM_OUTPUT 19, 28, 29, q0, q1
    vadd.s16  q4, q2, q1
    vadd.s16  q5, q3, q0
    vsub.s16  q6, q3, q0
    vsub.s16  q7, q2, q1
    STORE_COMBINE_EXTREME_RESULTS
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[4] = step1b[4][i] + step1b[11][i];
    ;step1[5] = step1b[5][i] + step1b[10][i];
    ;step1[10] = step1b[5][i] - step1b[10][i];
    ;step1[11] = step1b[4][i] - step1b[11][i];
    LOAD_FROM_OUTPUT 29, 10, 11, q0, q1
    vadd.s16  q2, q12, q1
    vadd.s16  q3, q13, q0
    vsub.s16  q4, q13, q0
    vsub.s16  q5, q12, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[10 * 32] = step1b[10][i] + step1b[21][i];
    ;output[11 * 32] = step1b[11][i] + step1b[20][i];
    ;output[20 * 32] = step1b[11][i] - step1b[20][i];
    ;output[21 * 32] = step1b[10][i] - step1b[21][i];
    LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
    vadd.s16  q8, q4, q1
    vadd.s16  q9, q5, q0
    vsub.s16  q6, q5, q0
    vsub.s16  q7, q4, q1
    STORE_COMBINE_CENTER_RESULTS
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
    ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
    ;output[26 * 32] = step1b[5][i] - step1b[26][i];
    ;output[27 * 32] = step1b[4][i] - step1b[27][i];
    LOAD_FROM_OUTPUT 21, 26, 27, q0, q1
    vadd.s16  q4, q2, q1
    vadd.s16  q5, q3, q0
    vsub.s16  q6, q3, q0
    vsub.s16  q7, q2, q1
    STORE_COMBINE_EXTREME_RESULTS
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[6] = step1b[6][i] + step1b[9][i];
    ;step1[7] = step1b[7][i] + step1b[8][i];
    ;step1[8] = step1b[7][i] - step1b[8][i];
    ;step1[9] = step1b[6][i] - step1b[9][i];
    LOAD_FROM_OUTPUT 27, 8, 9, q0, q1
    vadd.s16  q2, q14, q1
    vadd.s16  q3, q15, q0
    vsub.s16  q4, q15, q0
    vsub.s16  q5, q14, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
    ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
    ;output[22 * 32] = step1b[9][i] - step1b[22][i];
    ;output[23 * 32] = step1b[8][i] - step1b[23][i];
    LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
    vadd.s16  q8, q4, q1
    vadd.s16  q9, q5, q0
    vsub.s16  q6, q5, q0
    vsub.s16  q7, q4, q1
    STORE_COMBINE_CENTER_RESULTS_LAST
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
    ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
    ;output[24 * 32] = step1b[7][i] - step1b[24][i];
    ;output[25 * 32] = step1b[6][i] - step1b[25][i];
    LOAD_FROM_OUTPUT 23, 24, 25, q0, q1
    vadd.s16  q4, q2, q1
    vadd.s16  q5, q3, q0
    vsub.s16  q6, q3, q0
    vsub.s16  q7, q2, q1
    STORE_COMBINE_EXTREME_RESULTS_LAST
    ; --------------------------------------------------------------------------
    ; restore pointers to their initial indices for next band pass by
    ;     removing/adding dest_stride * 8. The actual increment by eight
    ;     is taken care of within the _LAST macros.
    add r6,  r6,  r2, lsl #3
    add r9,  r9,  r2, lsl #3
    sub r7,  r7,  r2, lsl #3
    sub r10, r10, r2, lsl #3

    ; restore r0 by removing the last offset from the last
    ;     operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
    sub r0, r0, #24*8*2
    ; restore r1 by removing the last offset from the last
    ;     operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2
    ; advance by 8 columns => 8*2
    sub r1, r1, #25*32*2 - 8*2
    ;   advance by 8 lines (8*32*2)
    ;   go back by the two pairs from the loop (32*2)
    add r3, r3, #8*32*2 - 32*2

    ; bands loop processing
    subs r4, r4, #1
    bne idct32_bands_loop

    ; stack operation
    add sp, sp, #512+2048+2048
    vpop {d8-d15}
    pop  {r4-r11}
    bx              lr
    ENDP  ; |vp9_idct32x32_1024_add_neon|
    END