1@/******************************************************************************
2@ *
3@ * Copyright (C) 2015 The Android Open Source Project
4@ *
5@ * Licensed under the Apache License, Version 2.0 (the "License");
6@ * you may not use this file except in compliance with the License.
7@ * You may obtain a copy of the License at:
8@ *
9@ * http://www.apache.org/licenses/LICENSE-2.0
10@ *
11@ * Unless required by applicable law or agreed to in writing, software
12@ * distributed under the License is distributed on an "AS IS" BASIS,
13@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@ * See the License for the specific language governing permissions and
15@ * limitations under the License.
16@ *
17@ *****************************************************************************
18@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19@*/
20@**
21@ *******************************************************************************
22@ * @file
23@ *  ih264_iquant_itrans_recon_a9.s
24@ *
25@ * @brief
26@ *  Contains function definitions for single stage  inverse transform
27@ *
28@ * @author
29@ *  Mohit
30@ *  Harinarayanaan
31@ *
32@ * @par List of Functions:
33@ *  - ih264_iquant_itrans_recon_4x4_a9()
34@ *  - ih264_iquant_itrans_recon_8x8_a9()
35@ *  - ih264_iquant_itrans_recon_chroma_4x4_a9()
36@ *
37@ * @remarks
38@ *  None
39@ *
40@ *******************************************************************************
41@*
42@**
43@ *******************************************************************************
44@ *
45@ * @brief
46@ *  This function performs inverse quant and Inverse transform type Ci4 for 4*4 block
47@ *
48@ * @par Description:
49@ *  Performs inverse transform Ci4 and adds the residue to get the
50@ *  reconstructed block
51@ *
52@ * @param[in] pi2_src
53@ *  Input 4x4 coefficients
54@ *
55@ * @param[in] pu1_pred
56@ *  Prediction 4x4 block
57@ *
58@ * @param[out] pu1_out
59@ *  Output 4x4 block
60@ *
61@ * @param[in] u4_qp_div_6
62@ *     QP / 6, used as the left-shift amount during de-quantisation
63@ *
64@ * @param[in] pu2_weigh_mat
65@ * Pointer to weight matrix
66@ *
67@ * @param[in] pred_strd,
68@ *  Prediction stride
69@ *
70@ * @param[in] out_strd
71@ *  Output Stride
72@ *
73@ * @param[in] pi4_tmp
74@ *  temporary buffer of size 1*16 (not read by this assembly implementation)
75@ *
76@ * @param[in] pu2_iscal_mat
77@ * Pointer to the inverse quantization matrix
78@ *
79@ * @returns  Void
80@ *
81@ * @remarks
82@ *  None
83@ *
84@ *******************************************************************************
85@ *
86@void ih264_iquant_itrans_recon_4x4(WORD16 *pi2_src,
87@                                   UWORD8 *pu1_pred,
88@                                   UWORD8 *pu1_out,
89@                                   WORD32 pred_strd,
90@                                   WORD32 out_strd,
91@                                   const UWORD16 *pu2_iscal_mat,
92@                                   const UWORD16 *pu2_weigh_mat,
93@                                   UWORD32 u4_qp_div_6,
94@                                   WORD32 *pi4_tmp,
95@                                   WORD32 iq_start_idx,
96@                                   WORD16 *pi2_dc_ld_addr)
97@**************Variables Vs Registers*****************************************
98@r0 => *pi2_src
99@r1 => *pu1_pred
100@r2 => *pu1_out
101@r3 =>  pred_strd
102@r4 =>  out_strd
103@r5 =>  *pu2_iscal_mat
104@r6 =>  *pu2_weigh_mat
105@r7 =>  u4_qp_div_6
106@r8 =>  iq_start_idx
107@r10=>  pi2_dc_ld_addr
108.text
109.syntax unified
110.p2align 2
111
112    .global ih264_iquant_itrans_recon_4x4_a9
113
114ih264_iquant_itrans_recon_4x4_a9:
115
@ Overview: de-quantises the 16 coefficients at r0, runs two passes of the
@ 4-point inverse-transform butterfly (with a 4x4 transpose in between), adds
@ four 4-byte prediction rows read from r1 (stride r3) and stores four
@ saturated 4-byte rows to r2 (stride r4).
@ Stack arguments as addressed after the 40-byte stmfd below:
@   [sp, #40] out_strd          [sp, #44] pu2_iscal_mat
@   [sp, #48] pu2_weigh_mat     [sp, #52] u4_qp_div_6
@   [sp, #60] iq_start_idx      [sp, #64] pi2_dc_ld_addr
@ The slot at [sp, #56] (pi4_tmp in the C prototype) is never read here.
116@VLD4.S16 is used because the pointer is incremented by SUB_BLK_WIDTH_4x4
117@If the macro value changes need to change the instruction according to it.
118@De-quantisation applies a left shift by u4_qp_div_6 (vshl.s32) followed by a
119@rounding, saturating, narrowing right shift by 4 (vqrshrn.s32 #4); for small
120@u4_qp_div_6 the combination amounts to a net rounded right shift.
121
122    stmfd         sp!, {r4-r12, r14}    @Save r4-r12 and lr (10 words = 40 bytes)
123    ldr           r7, [sp, #52]         @Loads u4_qp_div_6
124    ldr           r4, [sp, #40]         @Loads out_strd
125    vdup.s32      q15, r7               @Populate the u4_qp_div_6 in Q15
126    ldr           r5, [sp, #44]         @Loads *pu2_iscal_mat
127
128    ldr           r6, [sp, #48]         @Loads *pu2_weigh_mat
129
130    ldr           r8, [sp, #60]         @Loads iq_start_idx
131
132    ldr           r10, [sp, #64]        @Loads pi2_dc_ld_addr
133
134    vpush         {d8-d15}              @Save d8-d15 (callee-saved VFP/NEON registers)
135@=======================DEQUANT FROM HERE===================================
@ The vld4.s16 loads de-interleave by 4: lane k of the n-th listed register
@ receives element 4k+n (i.e. column n if the 4x4 block is stored row-major).
136
137    vld4.s16      {d20, d21, d22, d23}, [r5] @Load pu2_iscal_mat[i], i =0..15 (de-interleaved by 4)
138    vld4.s16      {d26, d27, d28, d29}, [r6] @pu2_weigh_mat[i], i =0..15 (de-interleaved by 4)
139    vmul.s16      q10, q10, q13         @x[i]=(iscal[i] * weigh[i]) for the elements held in d20/d21 (i = 4k, 4k+1)
140    vld4.s16      {d16, d17, d18, d19}, [r0] @pi2_src[i], i =0..15 (de-interleaved by 4)
141
142    vmul.s16      q11, q11, q14         @x[i]=(iscal[i] * weigh[i]) for the elements held in d22/d23 (i = 4k+2, 4k+3)
143
144    subs          r8, r8, #1            @ Z flag is set iff iq_start_idx == 1 (original note: intra case)
145    ldrsheq       r9, [r10]             @ If iq_start_idx == 1: r9 = sign-extended pi2_dc_ld_addr[0]
146
147    vmull.s16     q0, d16, d20          @ Q0  = p[i] = (src[i] * x[i]), widened to 32 bit, i = 4k
148    vmull.s16     q1, d17, d21          @ Q1  = p[i] = (src[i] * x[i]), widened to 32 bit, i = 4k+1
149    vmull.s16     q2, d18, d22          @ Q2  = p[i] = (src[i] * x[i]), widened to 32 bit, i = 4k+2
150    vmull.s16     q3, d19, d23          @ Q3  = p[i] = (src[i] * x[i]), widened to 32 bit, i = 4k+3
151
152    vshl.s32      q0, q0, q15           @ Q0  = q[i] = (p[i] << u4_qp_div_6), i = 4k
153    vshl.s32      q1, q1, q15           @ Q1  = q[i] = (p[i] << u4_qp_div_6), i = 4k+1
154    vshl.s32      q2, q2, q15           @ Q2  = q[i] = (p[i] << u4_qp_div_6), i = 4k+2
155    vshl.s32      q3, q3, q15           @ Q3  = q[i] = (p[i] << u4_qp_div_6), i = 4k+3
156
157    vqrshrn.s32   d0, q0, #0x4          @ D0  = c[i] = sat16((q[i] + 8) >> 4), i = 4k
158    vqrshrn.s32   d1, q1, #0x4          @ D1  = c[i] = sat16((q[i] + 8) >> 4), i = 4k+1
159    vqrshrn.s32   d2, q2, #0x4          @ D2  = c[i] = sat16((q[i] + 8) >> 4), i = 4k+2
160    vqrshrn.s32   d3, q3, #0x4          @ D3  = c[i] = sat16((q[i] + 8) >> 4), i = 4k+3
161
162    vmoveq.16     d0[0], r9             @ If iq_start_idx == 1 (flags still from the subs above): c[0] = r9
163
164@========= PROCESS IDCT FROM HERE =======
165@Steps for Stage 1:
166@------------------
@ In the comments of this stage cN denotes register dN (N = 0..3); every lane
@ performs the same butterfly independently.
167    vld1.32       d30[0], [r1], r3      @I row Load pu1_pred buffer (4 bytes)
168    vadd.s16      d4, d0, d2            @x0 = c0 + c2
169
170    vsub.s16      d5, d0, d2            @x1 = c0 - c2
171
172    vshr.s16      d8, d1, #1            @c1 >> 1
173    vshr.s16      d9, d3, #1            @c3 >> 1
174
175    vsub.s16      d6, d8, d3            @x2 = (c1 >> 1) - c3
176    vadd.s16      d7, d1, d9            @x3 = c1 + (c3 >> 1)
177    vld1.32       d30[1], [r1], r3      @II row Load pu1_pred buffer (4 bytes)
178
179    vswp          d6, d7                @q3 = {x3, x2} so it pairs with q2 = {x0, x1}
180
181    vsub.s16      q6, q2, q3            @d12 = x0 - x3, d13 = x1 - x2
182    vadd.s16      q5, q2, q3            @d10 = x0 + x3, d11 = x1 + x2
183
184    vld1.32       d31[0], [r1], r3      @III row Load pu1_pred buf (4 bytes)
185
186    vswp          d12, d13              @d12 = x1 - x2, d13 = x0 - x3
187@Steps for Stage 2:
188@------------------
@ The four vtrn instructions transpose the 4x4 matrix held in d10..d13; the
@ butterfly is then repeated on the transposed data (tN denotes d(10+N)).
189    vtrn.16       d10, d11
190    vtrn.16       d12, d13
191    vtrn.32       d10, d12
192    vtrn.32       d11, d13
193    vadd.s16      d14, d10, d12         @x0 = t0 + t2
194
195    vsub.s16      d15, d10, d12         @x1 = t0 - t2
196
197    vshr.s16      d18, d11, #1          @t1 >> 1
198    vshr.s16      d19, d13, #1          @t3 >> 1
199
200    vsub.s16      d16, d18, d13         @x2 = (t1 >> 1) - t3
201    vadd.s16      d17, d11, d19         @x3 = t1 + (t3 >> 1)
202
203    vld1.32       d31[1], [r1], r3      @IV row Load pu1_pred buffer (4 bytes)
204    vswp          d16, d17              @q8 = {x3, x2} so it pairs with q7 = {x0, x1}
205
206    vsub.s16      q11, q7, q8           @d22 = x0 - x3, d23 = x1 - x2
207    vadd.s16      q10, q7, q8           @d20 = x0 + x3, d21 = x1 + x2
208
209    vswp          d22, d23              @d22 = x1 - x2, d23 = x0 - x3
210
211    vrshr.s16     q10, q10, #6          @(value + 32) >> 6 for d20/d21
212    vrshr.s16     q11, q11, #6          @(value + 32) >> 6 for d22/d23
213
214    vaddw.u8      q10, q10, d30         @Add zero-extended pred rows I and II
215    vaddw.u8      q11, q11, d31         @Add zero-extended pred rows III and IV
216
217    vqmovun.s16   d0, q10               @Saturate to unsigned 8 bit (rows I, II)
218    vqmovun.s16   d1, q11               @Saturate to unsigned 8 bit (rows III, IV)
219
220    vst1.32       d0[0], [r2], r4       @I row store the value
221    vst1.32       d0[1], [r2], r4       @II row store the value
222    vst1.32       d1[0], [r2], r4       @III row store the value
223    vst1.32       d1[1], [r2]           @IV row store the value
224
225    vpop          {d8-d15}              @Restore d8-d15
226    ldmfd         sp!, {r4-r12, r15}    @Restore r4-r12 and return (saved lr is loaded into pc)
227
228
229@**
230@ *******************************************************************************
231@ *
232@ * @brief
233@ *  This function performs inverse quant and Inverse transform type Ci4 for a 4*4 chroma block
234@ *
235@ * @par Description:
236@ *  Performs inverse transform Ci4 and adds the residue to get the
237@ *  reconstructed block
238@ *
239@ * @param[in] pi2_src
240@ *  Input 4x4 coefficients
241@ *
242@ * @param[in] pu1_pred
243@ *  Prediction 4x4 block
244@ *
245@ * @param[out] pu1_out
246@ *  Output 4x4 block
247@ *
248@ * @param[in] u4_qp_div_6
249@ *     QP
250@ *
251@ * @param[in] pu2_weigh_mat
252@ * Pointer to weight matrix
253@ *
254@ * @param[in] pred_strd,
255@ *  Prediction stride
256@ *
257@ * @param[in] out_strd
258@ *  Output Stride
259@ *
260@ * @param[in] pi4_tmp
261@ *  temporary buffer of size 1*16 (not read by this assembly implementation)
262@ *
263@ * @param[in] pu2_iscal_mat
264@ * Pointer to the inverse quantization matrix
265@ *
266@ * @returns  Void
267@ *
268@ * @remarks
269@ *  None
270@ *
271@ *******************************************************************************
272@ *
273@void ih264_iquant_itrans_recon_chroma_4x4(WORD16 *pi2_src,
274@                                   UWORD8 *pu1_pred,
275@                                   UWORD8 *pu1_out,
276@                                   WORD32 pred_strd,
277@                                   WORD32 out_strd,
278@                                   const UWORD16 *pu2_iscal_mat,
279@                                   const UWORD16 *pu2_weigh_mat,
280@                                   UWORD32 u4_qp_div_6,
281@                                   WORD32 *pi4_tmp,
282@                                   WORD16 *pi2_dc_src)
283@**************Variables Vs Registers*****************************************
284@r0 => *pi2_src
285@r1 => *pu1_pred
286@r2 => *pu1_out
287@r3 =>  pred_strd
288@r4 =>  out_strd
289@r5 =>  *pu2_iscal_mat
290@r6 =>  *pu2_weigh_mat
291@r7 =>  u4_qp_div_6
292
293    .global ih264_iquant_itrans_recon_chroma_4x4_a9
294ih264_iquant_itrans_recon_chroma_4x4_a9:
295
@ Overview: same de-quantisation and two-pass 4-point butterfly as the luma
@ 4x4 routine, but coefficient 0 is always replaced by pi2_dc_src[0] and the
@ prediction/output rows are treated as byte-interleaved: only the
@ even-indexed bytes of each 8-byte row are used from pred and overwritten in
@ out; the odd-indexed output bytes are read back and stored unchanged.
@ Stack arguments as addressed after the 40-byte stmfd below:
@   [sp, #40] out_strd          [sp, #44] pu2_iscal_mat
@   [sp, #48] pu2_weigh_mat     [sp, #52] u4_qp_div_6
@   [sp, #60] pi2_dc_src
@ The slot at [sp, #56] (pi4_tmp in the C prototype) is never read here.
296@VLD4.S16 is used because the pointer is incremented by SUB_BLK_WIDTH_4x4
297@If the macro value changes need to change the instruction according to it.
298@De-quantisation applies a left shift by u4_qp_div_6 (vshl.s32) followed by a
299@rounding, saturating, narrowing right shift by 4 (vqrshrn.s32 #4); for small
300@u4_qp_div_6 the combination amounts to a net rounded right shift.
301
302    stmfd         sp!, {r4-r12, r14}    @Save r4-r12 and lr (10 words = 40 bytes)
303    ldr           r7, [sp, #52]         @Loads u4_qp_div_6
304    ldr           r4, [sp, #40]         @Loads out_strd
305    vdup.s32      q15, r7               @Populate the u4_qp_div_6 in Q15
306    ldr           r5, [sp, #44]         @Loads *pu2_iscal_mat
307    ldr           r6, [sp, #48]         @Loads *pu2_weigh_mat
308    ldr           r8, [sp, #60]         @Loads pi2_dc_src
309
310    vpush         {d8-d15}              @Save d8-d15 (callee-saved VFP/NEON registers)
311@=======================DEQUANT FROM HERE===================================
@ The vld4.s16 loads de-interleave by 4: lane k of the n-th listed register
@ receives element 4k+n (i.e. column n if the 4x4 block is stored row-major).
312
313    vld4.s16      {d20, d21, d22, d23}, [r5] @Load pu2_iscal_mat[i], i =0..15 (de-interleaved by 4)
314    vld4.s16      {d26, d27, d28, d29}, [r6] @pu2_weigh_mat[i], i =0..15 (de-interleaved by 4)
315    vmul.s16      q10, q10, q13         @x[i]=(iscal[i] * weigh[i]) for the elements held in d20/d21 (i = 4k, 4k+1)
316    vld4.s16      {d16, d17, d18, d19}, [r0] @pi2_src[i], i =0..15 (de-interleaved by 4)
317
318    vmul.s16      q11, q11, q14         @x[i]=(iscal[i] * weigh[i]) for the elements held in d22/d23 (i = 4k+2, 4k+3)
319
320    vmull.s16     q0, d16, d20          @ Q0  = p[i] = (src[i] * x[i]), widened to 32 bit, i = 4k
321    vmull.s16     q1, d17, d21          @ Q1  = p[i] = (src[i] * x[i]), widened to 32 bit, i = 4k+1
322    vmull.s16     q2, d18, d22          @ Q2  = p[i] = (src[i] * x[i]), widened to 32 bit, i = 4k+2
323    vmull.s16     q3, d19, d23          @ Q3  = p[i] = (src[i] * x[i]), widened to 32 bit, i = 4k+3
324
325    vshl.s32      q0, q0, q15           @ Q0  = q[i] = (p[i] << u4_qp_div_6), i = 4k
326    vshl.s32      q1, q1, q15           @ Q1  = q[i] = (p[i] << u4_qp_div_6), i = 4k+1
327    vshl.s32      q2, q2, q15           @ Q2  = q[i] = (p[i] << u4_qp_div_6), i = 4k+2
328    vshl.s32      q3, q3, q15           @ Q3  = q[i] = (p[i] << u4_qp_div_6), i = 4k+3
329
330    vqrshrn.s32   d0, q0, #0x4          @ D0  = c[i] = sat16((q[i] + 8) >> 4), i = 4k
331    vqrshrn.s32   d1, q1, #0x4          @ D1  = c[i] = sat16((q[i] + 8) >> 4), i = 4k+1
332    vqrshrn.s32   d2, q2, #0x4          @ D2  = c[i] = sat16((q[i] + 8) >> 4), i = 4k+2
333    vqrshrn.s32   d3, q3, #0x4          @ D3  = c[i] = sat16((q[i] + 8) >> 4), i = 4k+3
334
335    ldrsh         r9, [r8]              @ Loads signed halfword pi2_dc_src[0]
336    vmov.16       d0[0], r9             @ Unconditionally replace c[0] with pi2_dc_src[0] (chroma iq-it)
337
338@========= PROCESS IDCT FROM HERE =======
339@Steps for Stage 1:
340@------------------
@ In the comments of this stage cN denotes register dN (N = 0..3); every lane
@ performs the same butterfly independently.
@ NOTE(review): each vld2.8 below reads 16 bytes from the prediction row but
@ only the first four even-indexed bytes (8 source bytes) are consumed;
@ confirm the prediction buffer is always readable that far.
341    vld2.8        {d28, d29}, [r1], r3  @I row Load pu1_pred buffer: d28 = even-indexed bytes, d29 = odd-indexed bytes
342    vadd.s16      d4, d0, d2            @x0 = c0 + c2
343
344    vsub.s16      d5, d0, d2            @x1 = c0 - c2
345
346    vshr.s16      d8, d1, #1            @c1 >> 1
347    vshr.s16      d9, d3, #1            @c3 >> 1
348
349    vsub.s16      d6, d8, d3            @x2 = (c1 >> 1) - c3
350    vadd.s16      d7, d1, d9            @x3 = c1 + (c3 >> 1)
351    vld2.8        {d29, d30}, [r1], r3  @II row Load pu1_pred buffer: d29 = even-indexed bytes (q15 shift value no longer needed)
352
353    vswp          d6, d7                @q3 = {x3, x2} so it pairs with q2 = {x0, x1}
354
355    vsub.s16      q6, q2, q3            @d12 = x0 - x3, d13 = x1 - x2
356    vtrn.32       d28, d29              @ D28 = {row I even bytes 0..3, row II even bytes 0..3}
357    vadd.s16      q5, q2, q3            @d10 = x0 + x3, d11 = x1 + x2
358
359    vld2.8        {d29, d30}, [r1], r3  @III row Load pu1_pred buf: d29 = even-indexed bytes
360
361    vswp          d12, d13              @d12 = x1 - x2, d13 = x0 - x3
362@Steps for Stage 2:
363@------------------
@ The four vtrn instructions transpose the 4x4 matrix held in d10..d13; the
@ butterfly is then repeated on the transposed data (tN denotes d(10+N)).
364    vtrn.16       d10, d11
365    vtrn.16       d12, d13
366    vtrn.32       d10, d12
367    vtrn.32       d11, d13
368    vadd.s16      d14, d10, d12         @x0 = t0 + t2
369
370    vsub.s16      d15, d10, d12         @x1 = t0 - t2
371
372    vshr.s16      d18, d11, #1          @t1 >> 1
373    vshr.s16      d19, d13, #1          @t3 >> 1
374
375    vsub.s16      d16, d18, d13         @x2 = (t1 >> 1) - t3
376    vadd.s16      d17, d11, d19         @x3 = t1 + (t3 >> 1)
377
378    vld2.8        {d30, d31}, [r1], r3  @IV row Load pu1_pred buffer: d30 = even-indexed bytes
379    vswp          d16, d17              @q8 = {x3, x2} so it pairs with q7 = {x0, x1}
380
381    vsub.s16      q11, q7, q8           @d22 = x0 - x3, d23 = x1 - x2
382    vtrn.32       d29, d30              @ D29 = {row III even bytes 0..3, row IV even bytes 0..3}
383    vadd.s16      q10, q7, q8           @d20 = x0 + x3, d21 = x1 + x2
384
385    vswp          d22, d23              @d22 = x1 - x2, d23 = x0 - x3
386
387    vrshr.s16     q10, q10, #6          @(value + 32) >> 6 for d20/d21
388    vrshr.s16     q11, q11, #6          @(value + 32) >> 6 for d22/d23
389
390    vaddw.u8      q10, q10, d28         @Add zero-extended pred bytes of rows I and II
391    vaddw.u8      q11, q11, d29         @Add zero-extended pred bytes of rows III and IV
392
393    vld1.u8       d0, [r2], r4          @Load current output row I (8 bytes)
394    vld1.u8       d1, [r2], r4          @Load current output row II (8 bytes)
395    vld1.u8       d2, [r2], r4          @Load current output row III (8 bytes)
396    vld1.u8       d3, [r2], r4          @Load current output row IV (8 bytes)
397
398    sub           r2, r2, r4, lsl #2    @Rewind r2 by 4 * out_strd back to output row I
399
400    vqmovun.s16   d20, q10              @Saturate rows I, II to unsigned 8 bit
401    vqmovun.s16   d22, q11              @Saturate rows III, IV to unsigned 8 bit
402
403    vmovl.u8      q10, d20              @Zero-extend back to 16 bit: result sits in the low byte of each lane
404    vmovl.u8      q11, d22              @so that vbit can merge it into the interleaved output rows
405
406    vmov.u16      q14, #0x00ff          @Mask selecting the low byte of every 16-bit lane
407
408    vbit.u8       q0, q10, q14          @Rows I, II: take masked bytes from the result, keep the other output bytes
409    vbit.u8       q1, q11, q14          @Rows III, IV: same merge
410
411    vst1.u8       d0, [r2], r4          @Store output row I (8 bytes)
412    vst1.u8       d1, [r2], r4          @Store output row II (8 bytes)
413    vst1.u8       d2, [r2], r4          @Store output row III (8 bytes)
414    vst1.u8       d3, [r2]              @Store output row IV (8 bytes)
415
416    vpop          {d8-d15}              @Restore d8-d15
417    ldmfd         sp!, {r4-r12, r15}    @Restore r4-r12 and return (saved lr is loaded into pc)
418
419
420@*
421@ *******************************************************************************
422@ *
423@ * @brief
424@ *  This function performs inverse quant and Inverse transform type Ci8 for 8*8 block
425@ *
426@ * @par Description:
427@ *  Performs inverse transform Ci8 and adds the residue to get the
428@ *  reconstructed block
429@ *
430@ * @param[in] pi2_src
431@ *  Input 8x8 coefficients
432@ *
433@ * @param[in] pu1_pred
434@ *  Prediction 8x8 block
435@ *
436@ * @param[out] pu1_out
437@ *  Output 8x8 block
438@ *
439@ * @param[in] u4_qp_div_6
440@ *     QP
441@ *
442@ * @param[in] pu2_weigh_mat
443@ * Pointer to weight matrix
444@ *
445@ * @param[in] pred_strd,
446@ *  Prediction stride
447@ *
448@ * @param[in] out_strd
449@ *  Output Stride
450@ *
451@ *@param[in] pi2_tmp
452@ * temporary buffer of size 1*64
453@ *
454@ * @param[in] pu2_iscal_mat
455@ * Pointer to the inverse quantization matrix
456@ *
457@ * @returns  Void
458@ *
459@ * @remarks
460@ *  None
461@ *
462@ *******************************************************************************
463@ *
464@void ih264_iquant_itrans_recon_8x8(WORD16 *pi2_src,
465@                                   UWORD8 *pu1_pred,
466@                                   UWORD8 *pu1_out,
467@                                   WORD32 pred_strd,
468@                                   WORD32 out_strd,
469@                                   const UWORD16 *pu2_iscal_mat,
470@                                   const UWORD16 *pu2_weigh_mat,
471@                                   UWORD32 u4_qp_div_6,
472@                                   WORD32 *pi4_tmp,
473@                                   WORD32 iq_start_idx)
474@**************Variables Vs Registers*****************************************
475@r0 => *pi2_src
476@r1 => *pu1_pred
477@r2 => *pu1_out
478@r3 =>  pred_strd
479@r4 =>  out_strd
480@r5 =>  *pu2_iscal_mat
481@r6 =>  *pu2_weigh_mat
482@r7 =>  u4_qp_div_6
483
484
485    .global ih264_iquant_itrans_recon_8x8_a9
486ih264_iquant_itrans_recon_8x8_a9:
487
@ Overview: de-quantises 64 coefficients read sequentially from r0
@ (src * iscal * weigh, shifted left by u4_qp_div_6, then a rounding,
@ saturating right shift by 6), transposes them, runs the 8-point butterfly,
@ transposes again, runs the butterfly a second time, applies a rounding
@ right shift by 6, adds eight 8-byte prediction rows read from r1 (stride r3)
@ and stores eight saturated 8-byte rows to r2 (stride r4).
@ Stack arguments as addressed after the 40-byte stmfd below:
@   [sp, #40] out_strd          [sp, #44] pu2_iscal_mat
@   [sp, #48] pu2_weigh_mat     [sp, #52] u4_qp_div_6
@ The pi4_tmp and iq_start_idx arguments of the C prototype are never read.
488    stmfd         sp!, {r4-r12, r14}    @Save r4-r12 and lr (10 words = 40 bytes)
489    ldr           r7, [sp, #52]         @Loads u4_qp_div_6
490    ldr           r4, [sp, #40]         @Loads out_strd
491
492    ldr           r5, [sp, #44]         @Loads *pu2_iscal_mat
493    ldr           r6, [sp, #48]         @Loads *pu2_weigh_mat
494    vdup.s32      q15, r7               @Populate the u4_qp_div_6 in Q15
495    vpush         {d8-d15}              @Save d8-d15 (callee-saved VFP/NEON registers)
496
497idct_8x8_begin:
498
499@========= DEQUANT FROM HERE ===========
@ In this section "dequant values" are read through r5 (pu2_iscal_mat) and
@ "scaling factors" through r6 (pu2_weigh_mat); each q-register load fetches
@ 8 halfwords and post-increments the pointer.
500
501    vld1.32       {q13}, [r5]!          @ Q13 = dequant values row 0
502    vld1.32       {q10}, [r6]!          @ Q10 = scaling factors row 0
503    vld1.32       {q14}, [r5]!          @ Q14 = dequant values row 1
504    vmul.s16      q10, q10, q13         @ Q10 = x[i] = (scale[i] * dequant[i]) where i = 0..7
505    vld1.32       {q11}, [r6]!          @ Q11 = scaling factors row 1
506    vld1.32       {q8}, [r0]!           @ Q8  = Source row 0
507    vmul.s16      q11, q11, q14         @ Q11 = x[i] = (scale[i] * dequant[i]) where i = 8..15
508    vmull.s16     q0, d16, d20          @ Q0  = p[i] = (x[i] * trns_coeff[i]) where i = 0..3
509    vld1.32       {q9}, [r0]!           @ Q9  = Source row 1
510    vmull.s16     q1, d17, d21          @ Q1  = p[i] = (x[i] * trns_coeff[i]) where i = 4..7
511    vmull.s16     q2, d18, d22          @ Q2  = p[i] = (x[i] * trns_coeff[i]) where i = 8..11
512    vld1.32       {q13}, [r6]!          @ Scaling factors row 2
513    vmull.s16     q3, d19, d23          @ Q3  = p[i] = (x[i] * trns_coeff[i]) where i = 12..15
514    vld1.32       {q14}, [r6]!          @ Scaling factors row 3
515    vshl.s32      q0, q0, q15           @ Q0  = q[i] = (p[i] << (qP/6)) where i = 0..3
516    vld1.32       {q10}, [r5]!          @ Q10 = Dequant values row 2
517    vshl.s32      q1, q1, q15           @ Q1  = q[i] = (p[i] << (qP/6)) where i = 4..7
518    vld1.32       {q8}, [r0]!           @ Source Row 2
519    vshl.s32      q2, q2, q15           @ Q2  = q[i] = (p[i] << (qP/6)) where i = 8..11
520    vld1.32       {q11}, [r5]!          @ Q11 = Dequant values row 3
521    vshl.s32      q3, q3, q15           @ Q3  = q[i] = (p[i] << (qP/6)) where i = 12..15
522    vld1.32       {q9}, [r0]!           @ Source Row 3
523    vmul.s16      q10, q10, q13         @ Dequant row2*scale matrix row 2
524    vmul.s16      q11, q11, q14         @ Dequant row 3*scale matrix row 3
525    vld1.32       {q4}, [r6]!           @ Scaling factors row 4
526    vqrshrn.s32   d0, q0, #0x6          @ D0  = c[i] = ((q[i] + 32) >> 6) where i = 0..3
527    vqrshrn.s32   d1, q1, #0x6          @ D1  = c[i] = ((q[i] + 32) >> 6) where i = 4..7
528    vld1.32       {q5}, [r6]!           @ Scaling factors row 5
529    vqrshrn.s32   d2, q2, #0x6          @ D2  = c[i] = ((q[i] + 32) >> 6) where i = 8..11
530    vqrshrn.s32   d3, q3, #0x6          @ D3  = c[i] = ((q[i] + 32) >> 6) where i = 12..15
531    vld1.32       {q13}, [r5]!          @ Q13 = Dequant values row 4
532    vmull.s16     q2, d16, d20          @ p[i] = (x[i] * trns_coeff[i]) where i=16..19
533    vmull.s16     q3, d17, d21          @ p[i] = (x[i] * trns_coeff[i]) where i=20..23
534    vld1.32       {q12}, [r5]!          @ Q12 = Dequant values row 5
535    vmull.s16     q6, d18, d22          @ p[i] = (x[i] * trns_coeff[i]) where i=24..27
536    vmull.s16     q7, d19, d23          @ p[i] = (x[i] * trns_coeff[i]) where i=28..31
537
538    vld1.32       {q14}, [r0]!          @ Source row 4
539    vmul.s16      q10, q4, q13          @ Dequant row4*scale matrix row 4
540    vmul.s16      q11, q5, q12          @ Dequant row5*scale matrix row 5
541    vld1.32       {q9}, [r0]!           @ Source row 5
542    vshl.s32      q2, q2, q15           @ q[i] = (p[i] << (qP/6)) where i = 16..19
543    vshl.s32      q3, q3, q15           @ q[i] = (p[i] << (qP/6)) where i = 20..23
544    vld1.32       {q13}, [r6]!          @ Scaling factors row 6
545    vshl.s32      q6, q6, q15           @ q[i] = (p[i] << (qP/6)) where i = 24..27
546    vshl.s32      q7, q7, q15           @ q[i] = (p[i] << (qP/6)) where i = 28..31
547    vmull.s16     q4, d28, d20          @ i = 32..35
548    vqrshrn.s32   d4, q2, #0x6          @ D4  = c[i] = ((q[i] + 32) >> 6) where i = 16..19
549    vqrshrn.s32   d5, q3, #0x6          @ D5  = c[i] = ((q[i] + 32) >> 6) where i = 20..23
550    vmull.s16     q5, d29, d21          @ i =36..39
551    vld1.32       {q10}, [r5]!          @ Dequant values row 6
552    vqrshrn.s32   d6, q6, #0x6          @ D6  = c[i] = ((q[i] + 32) >> 6) where i = 24..27
553    vqrshrn.s32   d7, q7, #0x6          @ D7  = c[i] = ((q[i] + 32) >> 6) where i = 28..31
554    vld1.32       {q14}, [r6]!          @ Scaling factors row 7
555    vmull.s16     q6, d18, d22          @ i = 40..43
556    vld1.32       {q8}, [r0]!           @ Source row 6
557    vmull.s16     q7, d19, d23          @ i = 44..47
558    vld1.32       {q11}, [r5]!          @ Dequant values row 7
559    vshl.s32      q4, q4, q15           @ q[i] = (p[i] << (qP/6)) where i = 32..35
560    vld1.32       {q9}, [r0]!           @ Source row 7
561    vshl.s32      q5, q5, q15           @ q[i] = (p[i] << (qP/6)) where i = 36..39
562
563    vshl.s32      q6, q6, q15           @ q[i] = (p[i] << (qP/6)) where i = 40..43
564    vshl.s32      q7, q7, q15           @ q[i] = (p[i] << (qP/6)) where i = 44..47
565    vmul.s16      q10, q10, q13         @ Dequant*scaling row 6
566    vmul.s16      q11, q11, q14         @ Dequant*scaling row 7
567    vqrshrn.s32   d8, q4, #0x6          @ D8  = c[i] = ((q[i] + 32) >> 6) where i = 32..35
568    vqrshrn.s32   d9, q5, #0x6          @ D9  = c[i] = ((q[i] + 32) >> 6) where i = 36..39
569    vqrshrn.s32   d10, q6, #0x6         @ D10  = c[i] = ((q[i] + 32) >> 6) where i = 40..43
570    vqrshrn.s32   d11, q7, #0x6         @ D11  = c[i] = ((q[i] + 32) >> 6) where i = 44..47
571    vmull.s16     q6, d16, d20          @ i= 48..51
572    vmull.s16     q7, d17, d21          @ i= 52..55
573    vmull.s16     q8, d18, d22          @ i=56..59
574    vmull.s16     q9, d19, d23          @ i=60..63
575    vshl.s32      q6, q6, q15           @ q[i] = (p[i] << (qP/6)) where i = 48..51
576    vzip.s16      q0, q1                @Transpose
577    vshl.s32      q7, q7, q15           @ q[i] = (p[i] << (qP/6)) where i = 52..55
578    vshl.s32      q8, q8, q15           @ q[i] = (p[i] << (qP/6)) where i = 56..59
579    vzip.s16      q2, q3                @Transpose
580    vshl.s32      q9, q9, q15           @ q[i] = (p[i] << (qP/6)) where i = 60..63 (last use of the shift in Q15)
581    vqrshrn.s32   d12, q6, #0x6         @ D12  = c[i] = ((q[i] + 32) >> 6) where i = 48..51
582    vzip.s16      q4, q5                @Transpose
583    vqrshrn.s32   d13, q7, #0x6         @ D13  = c[i] = ((q[i] + 32) >> 6) where i = 52..55
584    vqrshrn.s32   d14, q8, #0x6         @ D14  = c[i] = ((q[i] + 32) >> 6) where i = 56..59
585    vzip.s32      q0, q2                @Transpose
586    vqrshrn.s32   d15, q9, #0x6         @ D15  = c[i] = ((q[i] + 32) >> 6) where i = 60..63
587
588@========= PROCESS IDCT FROM HERE =======
589
590@Transpose before Stage 1 (continues the vzip sequence started above):
591@------------------
592
593@   TRANSPOSE 8x8 coeffs to actual order
594
595    vzip.s16      q6, q7                @
596
597    vzip.s32      q1, q3                @
598    vzip.s32      q4, q6                @
599    vzip.s32      q5, q7                @
600
601    vswp          d1, d8                @ Q0/Q1 = Row order x0/x1
602    vswp          d3, d10               @ Q2/Q3 = Row order x2/x3
603    vswp          d5, d12               @ Q4/Q5 = Row order x4/x5
604    vswp          d7, d14               @ Q6/Q7 = Row order x6/x7
605
606    vswp          q1, q4                @
607    vshr.s16      q10, q2, #0x1         @ Q10 = Q2 >> 1 (consumed by the y4 computation below)
608    vswp          q3, q6                @
609
610@Steps for Stage 1:
611@------------------
612
613    vadd.s16      q8, q0, q4            @ Q8 = y0
614    vsub.s16      q9, q0, q4            @ Q9 = y2
615
616    vsra.s16      q2, q6, #0x1          @ Q2 = y6
617    vsub.s16      q6, q10, q6           @ Q6 = y4
618
619    vaddl.s16     q12, d14, d2          @ y3 (0-3) 1+7
620    vaddl.s16     q13, d15, d3          @ y3 (4-7) 1+7
621
622    vsubl.s16     q10, d14, d2          @ y5 (0-3) 7-1
623    vsubl.s16     q11, d15, d3          @ y5 (4-7) 7-1
624
625    vadd.s16      q0, q8, q2            @ Q0 = z0
626    vsub.s16      q4, q8, q2            @ Q4 = z6
627
628    vadd.s16      q8, q9, q6            @ Q8 = z2
629    vsub.s16      q2, q9, q6            @ Q2 = z4
630
631    vsubw.s16     q12, q12, d6          @ y3 (0-3) 1+7-3
632    vsubw.s16     q13, q13, d7          @ y3 (4-7) 1+7-3
633
634    vshr.s16      q6, q3, #0x1          @ Q6 = Q3 >> 1
635
636    vaddw.s16     q10, q10, d10         @
637    vaddw.s16     q11, q11, d11         @
638
639    vshr.s16      q9, q5, #0x1          @ Q9 = Q5 >> 1
640
641    vsubw.s16     q12, q12, d12         @
642    vsubw.s16     q13, q13, d13         @
643
644    vaddw.s16     q10, q10, d18         @
645    vaddw.s16     q11, q11, d19         @
646
647    vqmovn.s32    d12, q12              @
648    vaddl.s16     q12, d10, d6          @
649    vqmovn.s32    d13, q13              @ Q6 = y3
650    vaddl.s16     q13, d11, d7          @
651    vqmovn.s32    d18, q10              @
652    vsubl.s16     q10, d10, d6          @
653    vqmovn.s32    d19, q11              @ Q9 = y5
654    vsubl.s16     q11, d11, d7          @
655
656    vshr.s16      q3, q6, #0x2          @
657
658    vsra.s16      q6, q9, #0x2          @ Q6 = z3
659
660    vaddw.s16     q12, q12, d2          @
661    vaddw.s16     q13, q13, d3          @
662
663    vshr.s16      q1, #0x1              @
664
665    vsub.s16      q5, q3, q9            @ Q5 = z5
666
667    vsubw.s16     q10, q10, d14         @
668    vsubw.s16     q11, q11, d15         @
669
670    vshr.s16      q7, #0x1              @
671
672    vaddw.s16     q12, q12, d2          @
673    vaddw.s16     q13, q13, d3          @
674
675    vsubw.s16     q10, q10, d14         @
676    vsubw.s16     q11, q11, d15         @
677
678
679    vqmovn.s32    d14, q12              @
680    vadd.s16      q1, q8, q5            @ Q1 = x1
681    vqmovn.s32    d15, q13              @ Q7 = y7
682    vsub.s16      q3, q8, q5            @ Q3 = x6
683    vqmovn.s32    d18, q10              @
684    vsub.s16      q5, q2, q6            @ Q5 = x5
685    vqmovn.s32    d19, q11              @ Q9 = y1
686    vadd.s16      q2, q2, q6            @ Q2 = x2
687
688    vshr.s16      q12, q9, #0x2         @
689    vsra.s16      q9, q7, #0x2          @ Q9 = z1
690
691    vsub.s16      q11, q7, q12          @ Q11 = z7
692
693    vadd.s16      q6, q4, q9            @ Q6 = x3
694    vsub.s16      q4, q4, q9            @ Q4 = x4
695
696    vsub.s16      q7, q0, q11           @ Q7 = x7
697    vadd.s16      q0, q0, q11           @ Q0 = x0
698
699    vswp.s16      q3, q6                @ Q3 = x3, Q6 = x6
700
701
702@Steps for Stage 2:
703@------------------
704
705@   TRANSPOSE 8x8 coeffs to actual order
706
707    vzip.s16      q0, q1                @
708    vzip.s16      q2, q3                @
709    vzip.s16      q4, q5                @
710    vzip.s16      q6, q7                @
711
712    vzip.s32      q0, q2                @
713    vzip.s32      q1, q3                @
714    vzip.s32      q4, q6                @
715    vzip.s32      q5, q7                @
716
717    vswp          d1, d8                @ Q0/Q1 = Row order x0/x1
718    vswp          d3, d10               @ Q2/Q3 = Row order x2/x3
719    vswp          d5, d12               @ Q4/Q5 = Row order x4/x5
720    vswp          d7, d14               @ Q6/Q7 = Row order x6/x7
721
722    vswp          q1, q4                @
723    vshr.s16      q10, q2, #0x1         @ Q10 = Q2 >> 1 (consumed by the y4 computation below)
724    vswp          q3, q6                @
725
726@Steps for Stage 3:
727@------------------
728
729@Repeat stage 1 again for vertical transform
@ The prediction rows are loaded into d28..d31 in between the arithmetic;
@ d30/d31 alias Q15, whose shift value is no longer needed at this point.
730
731    vadd.s16      q8, q0, q4            @ Q8 = y0
732    vld1.32       d28, [r1], r3         @ D28 = pred row 0 (8 bytes)
733    vsub.s16      q9, q0, q4            @ Q9 = y2
734
735    vsra.s16      q2, q6, #0x1          @ Q2 = y6
736    vsub.s16      q6, q10, q6           @ Q6 = y4
737
738    vaddl.s16     q12, d14, d2          @
739    vld1.32       d29, [r1], r3         @ D29 = pred row 1 (8 bytes)
740    vaddl.s16     q13, d15, d3          @
741
742    vsubl.s16     q10, d14, d2          @
743    vld1.32       d30, [r1], r3         @ D30 = pred row 2 (8 bytes)
744    vsubl.s16     q11, d15, d3          @
745
746    vadd.s16      q0, q8, q2            @ Q0 = z0
747    vld1.32       d31, [r1], r3         @ D31 = pred row 3 (8 bytes)
748    vsub.s16      q4, q8, q2            @ Q4 = z6
749
750    vadd.s16      q8, q9, q6            @ Q8 = z2
751    vsub.s16      q2, q9, q6            @ Q2 = z4
752
753    vsubw.s16     q12, q12, d6          @
754    vsubw.s16     q13, q13, d7          @
755
756    vshr.s16      q6, q3, #0x1          @ Q6 = Q3 >> 1
757
758    vaddw.s16     q10, q10, d10         @
759    vaddw.s16     q11, q11, d11         @
760
761    vshr.s16      q9, q5, #0x1          @ Q9 = Q5 >> 1
762
763    vsubw.s16     q12, q12, d12         @
764    vsubw.s16     q13, q13, d13         @
765
766    vaddw.s16     q10, q10, d18         @
767    vaddw.s16     q11, q11, d19         @
768
769    vqmovn.s32    d12, q12              @
770    vaddl.s16     q12, d10, d6          @
771    vqmovn.s32    d13, q13              @ Q6 = y3
772    vaddl.s16     q13, d11, d7          @
773    vqmovn.s32    d18, q10              @
774    vsubl.s16     q10, d10, d6          @
775    vqmovn.s32    d19, q11              @ Q9 = y5
776    vsubl.s16     q11, d11, d7          @
777
778    vshr.s16      q3, q6, #0x2          @
779
780    vsra.s16      q6, q9, #0x2          @ Q6 = z3
781
782    vaddw.s16     q12, q12, d2          @
783    vaddw.s16     q13, q13, d3          @
784
785    vshr.s16      q1, #0x1              @
786
787    vsub.s16      q5, q3, q9            @ Q5 = z5
788
789    vsubw.s16     q10, q10, d14         @
790    vsubw.s16     q11, q11, d15         @
791
792    vshr.s16      q7, #0x1              @
793
794    vaddw.s16     q12, q12, d2          @
795    vaddw.s16     q13, q13, d3          @
796
797    vsubw.s16     q10, q10, d14         @
798    vsubw.s16     q11, q11, d15         @
799
800    vqmovn.s32    d14, q12              @
801    vadd.s16      q1, q8, q5            @ Q1 = x1
802    vqmovn.s32    d15, q13              @ Q7 = y7
803    vsub.s16      q3, q8, q5            @ Q3 = x6
804    vqmovn.s32    d18, q10              @
805    vsub.s16      q5, q2, q6            @ Q5 = x5
806    vqmovn.s32    d19, q11              @ Q9 = y1
807    vadd.s16      q2, q2, q6            @ Q2 = x2
808
809    vshr.s16      q12, q9, #0x2         @
810    vsra.s16      q9, q7, #0x2          @ Q9 = z1
811
812    vsub.s16      q11, q7, q12          @ Q11 = z7
813
814    vadd.s16      q6, q4, q9            @ Q6 = x3
815    vsub.s16      q4, q4, q9            @ Q4 = x4
816
817    vsub.s16      q7, q0, q11           @ Q7 = x7
818    vadd.s16      q0, q0, q11           @ Q0 = x0
819
820    vswp.s16      q3, q6                @ Q3 <-> Q6
821
@ Final rounding (value + 32) >> 6 of all eight rows, interleaved with the
@ loads of pred rows 4..7 into d16..d19 (Q8/Q9 are free from here on).
822    vrshr.s16     q1, q1, #6            @
823    vld1.32       d16, [r1], r3         @ D16 = pred row 4 (8 bytes)
824    vrshr.s16     q2, q2, #6            @
825    vrshr.s16     q4, q4, #6            @
826    vld1.32       d17, [r1], r3         @ D17 = pred row 5 (8 bytes)
827    vrshr.s16     q5, q5, #6            @
828    vrshr.s16     q7, q7, #6            @
829    vld1.32       d18, [r1], r3         @ D18 = pred row 6 (8 bytes)
830    vrshr.s16     q0, q0, #6            @
831    vrshr.s16     q3, q3, #6            @
832    vld1.32       d19, [r1], r3         @ D19 = pred row 7 (8 bytes)
833    vrshr.s16     q6, q6, #6            @
834
835@ Add the zero-extended prediction, saturate to unsigned 8 bit and store rows 0..7
836
837    vaddw.u8      q0, q0, d28           @ row 0 += pred row 0
838    vaddw.u8      q1, q1, d29           @ row 1 += pred row 1
839    vaddw.u8      q2, q2, d30           @ row 2 += pred row 2
840    vaddw.u8      q3, q3, d31           @ row 3 += pred row 3
841    vqmovun.s16   d0, q0                @ D0 = row 0 saturated to u8
842    vaddw.u8      q4, q4, d16           @ row 4 += pred row 4
843    vqmovun.s16   d1, q1                @ D1 = row 1 saturated to u8
844    vaddw.u8      q5, q5, d17           @ row 5 += pred row 5
845    vqmovun.s16   d2, q2                @ D2 = row 2 saturated to u8
846    vaddw.u8      q6, q6, d18           @ row 6 += pred row 6
847    vqmovun.s16   d3, q3                @ D3 = row 3 saturated to u8
848    vaddw.u8      q7, q7, d19           @ row 7 += pred row 7
849
850    vqmovun.s16   d4, q4                @ D4 = row 4 saturated to u8
851    vst1.32       d0, [r2], r4          @ Store output row 0
852    vqmovun.s16   d5, q5                @ D5 = row 5 saturated to u8
853    vst1.32       d1, [r2], r4          @ Store output row 1
854    vqmovun.s16   d6, q6                @ D6 = row 6 saturated to u8
855    vst1.32       d2, [r2], r4          @ Store output row 2
856    vqmovun.s16   d7, q7                @ D7 = row 7 saturated to u8
857    vst1.32       d3, [r2], r4          @ Store output row 3
858    vst1.32       d4, [r2], r4          @ Store output row 4
859
860    vst1.32       d5, [r2], r4          @ Store output row 5
861
862
863    vst1.32       d6, [r2], r4          @ Store output row 6
864
865
866    vst1.32       d7, [r2], r4          @ Store output row 7 (the final post-increment of r2 is not used afterwards)
867
868idct_8x8_end:
869
870    vpop          {d8-d15}              @Restore d8-d15
871    ldmfd         sp!, {r4-r12, r15}    @Restore r4-r12 and return (saved lr is loaded into pc)
872
873