@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@ *******************************************************************************
@ * @file
@ *  ihevc_itrans_recon_8x8_neon.s
@ *
@ * @brief
@ *  contains function definitions for single stage  inverse transform
@ *
@ * @author
@ *  anand s
@ *
@ * @par list of functions:
@ *  - ihevc_itrans_recon_8x8()
@ *
@ * @remarks
@ *  none
@ *
@ *******************************************************************************
@*/

@/**
@ *******************************************************************************
@ *
@ * @brief
@ *  this function performs inverse transform  and reconstruction for 8x8
@ * input block
@ *
@ * @par description:
@ *  performs inverse transform and adds the prediction  data and clips output
@ * to 8 bit
@ *
@ * @param[in] pi2_src
@ *  input 8x8 coefficients
@ *
@ * @param[in] pi2_tmp
@ *  temporary 8x8 buffer for storing inverse
@ *
@ *  transform
@ *  1st stage output
@ *
@ * @param[in] pu1_pred
@ *  prediction 8x8 block
@ *
@ * @param[out] pu1_dst
@ *  output 8x8 block
@ *
@ * @param[in] src_strd
@ *  input stride
@ *
@ * @param[in] pred_strd
@ *  prediction stride
@ *
@ * @param[in] dst_strd
@ *  output stride
@ *
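@ * @param[in] zero_cols
@ *  zero columns in pi2_src; only the low 8 bits are examined, and a value
@ *  >= 0xf0 selects the path that skips the last four columns
@ *
@ * @param[in] zero_rows
@ *  zero rows in pi2_src; only the low 8 bits are examined, and a value
@ *  >= 0xf0 selects the path that does not load the last four rows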
@ *
@ * @returns  void
@ *
@ * @remarks
@ *  none
@ *
@ *******************************************************************************
@ */

@void ihevc_itrans_recon_8x8(word16 *pi2_src,
@                            word16 *pi2_tmp,
@                            uword8 *pu1_pred,
@                            uword8 *pu1_dst,
@                            word32 src_strd,
@                            word32 pred_strd,
@                            word32 dst_strd,
@                            word32 zero_cols,
@                            word32 zero_rows               )

@**************variables vs registers*************************
@   r0 => *pi2_src
@   r1 => *pi2_tmp
@   r2 => *pu1_pred
@   r3 => *pu1_dst
@   stack => src_strd
@   stack => pred_strd
@   stack => dst_strd
@   stack => zero_cols
@   stack => zero_rows


@ Everything below (the literal word and the routine) is placed in the text section.
.text
.align 4


@ width_x_size_x5 and width_x_size_x2 are not referenced by the code visible in this file.
.set width_x_size_x5 ,   40
.set width_x_size_x2 ,   32
@ Rounding right-shift amounts used by the vqrshrn narrowing instructions:
@ shift_stage1_idct after the first (column) pass, shift_stage2_idct after the second (row) pass.
.set shift_stage1_idct ,   7
.set shift_stage2_idct ,   12

.globl ihevc_itrans_recon_8x8_a9q

@ Coefficient table; defined outside this file.
.extern g_ai2_ihevc_trans_8_transpose

@ Position-independent reference to the coefficient table: the stored word is the
@ distance from (ulbl1 + 8) to the table. The routine loads this word and adds pc at
@ label ulbl1 to recover the table address.
@ NOTE(review): the "- 8" matches the pc read-ahead of ARM state; confirm this file is
@ never assembled as Thumb.
g_ai2_ihevc_trans_8_transpose_addr:
.long g_ai2_ihevc_trans_8_transpose - ulbl1 - 8

.type ihevc_itrans_recon_8x8_a9q, %function

ihevc_itrans_recon_8x8_a9q:
@// 8x8 inverse transform + reconstruction.
@//
@// In:     r0 = pi2_src, r1 = pi2_tmp (not read by this routine), r2 = pu1_pred,
@//         r3 = pu1_dst; src_strd, pred_strd, dst_strd, zero_cols, zero_rows on the stack
@// Out:    8 rows of 8 bytes written through r3 with stride dst_strd
@// Saved:  r4-r12, lr and d8-d15 are pushed on entry and restored on exit
@// Clobb:  r0-r3, q0-q3, q8-q15, flags
@//
@// Flow:   1. first pass on columns 0-3 (short variant when the last four rows are zero)
@//         2. first pass on columns 4-7 (skipped when the last four columns are zero)
@//         3. transpose, second pass, transpose back
@//         4. add prediction, saturate to unsigned 8 bit, store

@// Stack offsets of the arguments after the prologue below:
@// 40 bytes (r4-r12, lr) + 64 bytes (d8-d15) = 104 bytes lie beneath the first stack argument.
.set src_strd_offset ,   104
.set pred_strd_offset ,  108
.set dst_strd_offset ,   112
.set zero_cols_offset ,  116
.set zero_rows_offset ,  120

@//register usage        - loading and until idct of columns
@// cosine constants    -   d0
@// sine constants      -   d1
@// row 0 first half    -   d2      -   y0
@// row 1 first half    -   d6      -   y1
@// row 2 first half    -   d3      -   y2
@// row 3 first half    -   d7      -   y3
@// row 4 first half    -   d10     -   y4
@// row 5 first half    -   d14     -   y5
@// row 6 first half    -   d11     -   y6
@// row 7 first half    -   d15     -   y7

@// row 0 second half   -   d4      -   y0
@// row 1 second half   -   d8      -   y1
@// row 2 second half   -   d5      -   y2
@// row 3 second half   -   d9      -   y3
@// row 4 second half   -   d12     -   y4
@// row 5 second half   -   d16     -   y5
@// row 6 second half   -   d13     -   y6
@// row 7 second half   -   d17     -   y7

    @// step 1 : save callee-saved state, fetch stack arguments, load all constants
    stmfd       sp!,{r4-r12,lr}             @ 10 core registers = 40 bytes
    vpush       {d8-d15}                    @ d8-d15 (q4-q7) are callee-saved and are overwritten below; 64 bytes

    ldr         r8,[sp,#pred_strd_offset]   @ prediction stride
    ldr         r7,[sp,#dst_strd_offset]    @ destination stride
    ldr         r6,[sp,#src_strd_offset]    @ src stride
    ldr         r12,[sp,#zero_cols_offset]  @ zero_cols
    ldr         r11,[sp,#zero_rows_offset]  @ zero_rows
    mov         r6,r6,lsl #1                @ x sizeof(word16)
    add         r9,r0,r6, lsl #1            @ r9 = pi2_src + 2 rows

    add         r10,r6,r6, lsl #1           @ 3 rows

    sub         r10,r10, #8                 @ - 4 cols * sizeof(word16)
    sub         r5,r6, #8                   @ src_strd - 4 cols * sizeof(word16)

@   ldr         r14,=g_imp4d_cxa8_idct_q15
    ldr         r14,g_ai2_ihevc_trans_8_transpose_addr
ulbl1:
    add         r14,r14,pc                  @ stored pc-relative offset + pc => address of the coefficient table
    vld1.16     {d0,d1},[r14]               @//d0,d1 are used for storing the constant data

    @//step 2 load all the input data
    @//step 3 operate first 4 colums at a time

    and         r11,r11,#0xff               @ only the low 8 bits of zero_rows are used
    and         r12,r12,#0xff               @ only the low 8 bits of zero_cols are used

    cmp         r11,#0xf0
    bge         skip_last4_rows             @ value >= 0xf0: rows 4-7 are neither loaded nor used


    @ all eight rows present: loads are interleaved with the first multiplies
    vld1.16     d2,[r0]!
    vld1.16     d3,[r9]!
    vld1.16     d4,[r0],r5
    vmull.s16   q10,d2,d0[0]                @// y0 * cos4(part of c0 and c1)
    vld1.16     d5,[r9],r5
    vmull.s16   q9,d3,d1[2]                 @// y2 * sin2 (q3 is freed by this time)(part of d1)
    vld1.16     d6,[r0]!
    vld1.16     d7,[r9]!
    vmull.s16   q12,d6,d0[1]                @// y1 * cos1(part of b0)
    vld1.16     d8,[r0],r10
    vmull.s16   q13,d6,d0[3]                @// y1 * cos3(part of b1)
    vld1.16     d9,[r9],r10
    vmull.s16   q14,d6,d1[1]                @// y1 * sin3(part of b2)
    vld1.16     d10,[r0]!
    vmull.s16   q15,d6,d1[3]                @// y1 * sin1(part of b3)
    vld1.16     d11,[r9]!
    vmlal.s16   q12,d7,d0[3]                @// y1 * cos1 + y3 * cos3(part of b0)
    vld1.16     d12,[r0],r5
    vmlsl.s16   q13,d7,d1[3]                @// y1 * cos3 - y3 * sin1(part of b1)
    vld1.16     d13,[r9],r5
    vmlsl.s16   q14,d7,d0[1]                @// y1 * sin3 - y3 * cos1(part of b2)
    vld1.16     d14,[r0]!
    vmlsl.s16   q15,d7,d1[1]                @// y1 * sin1 - y3 * sin3(part of b3)
    vld1.16     d15,[r9]!
    vmull.s16   q11,d10,d0[0]               @// y4 * cos4(part of c0 and c1)
    vld1.16     d16,[r0],r10
    vmull.s16   q3,d3,d0[2]                 @// y2 * cos2(part of d0)
    vld1.16     d17,[r9],r10

    @/* this following was activated when alignment is not there */
@// vld1.16     d2,[r0]!
@// vld1.16     d3,[r2]!
@// vld1.16     d4,[r0]!
@// vld1.16     d5,[r2]!
@// vld1.16     d6,[r0]!
@// vld1.16     d7,[r2]!
@// vld1.16     d8,[r0],r3
@// vld1.16     d9,[r2],r3
@// vld1.16     d10,[r0]!
@// vld1.16     d11,[r2]!
@// vld1.16     d12,[r0]!
@// vld1.16     d13,[r2]!
@// vld1.16     d14,[r0]!
@// vld1.16     d15,[r2]!
@// vld1.16     d16,[r0],r3
@// vld1.16     d17,[r2],r3


    vmlal.s16   q12,d14,d1[1]               @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
    vmlsl.s16   q13,d14,d0[1]               @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
    vmlal.s16   q14,d14,d1[3]               @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
    vmlal.s16   q15,d14,d0[3]               @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)

    vmlsl.s16   q9,d11,d0[2]                @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
    vmlal.s16   q3,d11,d1[2]                @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)

    vadd.s32    q5,q10,q11                  @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    vsub.s32    q10,q10,q11                 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

    vmlal.s16   q12,d15,d1[3]               @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of r0,r7)
    vmlsl.s16   q13,d15,d1[1]               @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of r1,r6)
    vmlal.s16   q14,d15,d0[3]               @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of r2,r5)
    vmlsl.s16   q15,d15,d0[1]               @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of r3,r4)

    vadd.s32    q7,q5,q3                    @// a0 = c0 + d0(part of r0,r7)
    vsub.s32    q5,q5,q3                    @// a3 = c0 - d0(part of r3,r4)
    vsub.s32    q11,q10,q9                  @// a2 = c1 - d1(part of r2,r5)
    vadd.s32    q9,q10,q9                   @// a1 = c1 + d1(part of r1,r6)

    vadd.s32    q10,q7,q12                  @// a0 + b0(part of r0)
    vsub.s32    q3,q7,q12                   @// a0 - b0(part of r7)

    vadd.s32    q12,q11,q14                 @// a2 + b2(part of r2)
    vsub.s32    q11,q11,q14                 @// a2 - b2(part of r5)

    vadd.s32    q14,q9,q13                  @// a1 + b1(part of r1)
    vsub.s32    q9,q9,q13                   @// a1 - b1(part of r6)

    vadd.s32    q13,q5,q15                  @// a3 + b3(part of r3)
    vsub.s32    q15,q5,q15                  @// a3 - b3(part of r4)

    vqrshrn.s32 d2,q10,#shift_stage1_idct   @// r0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
    vqrshrn.s32 d15,q3,#shift_stage1_idct   @// r7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
    vqrshrn.s32 d3,q12,#shift_stage1_idct   @// r2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
    vqrshrn.s32 d14,q11,#shift_stage1_idct  @// r5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
    vqrshrn.s32 d6,q14,#shift_stage1_idct   @// r1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
    vqrshrn.s32 d11,q9,#shift_stage1_idct   @// r6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
    vqrshrn.s32 d7,q13,#shift_stage1_idct   @// r3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
    vqrshrn.s32 d10,q15,#shift_stage1_idct  @// r4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)


    b           last4_cols


skip_last4_rows:
    @ rows 4-7 are treated as zero: only rows 0-3 are loaded, so the y4..y7 terms drop out


    vld1.16     d2,[r0]!
    vld1.16     d3,[r9]!
    vld1.16     d4,[r0],r5
    vld1.16     d5,[r9],r5
    vld1.16     d6,[r0]!
    vld1.16     d7,[r9]!
    vld1.16     d8,[r0],r10
    vld1.16     d9,[r9],r10


    vmov.s16    q6,#0                       @ d12,d13 (rows 4 and 6, second half) = 0
    vmov.s16    q8,#0                       @ d16,d17 (rows 5 and 7, second half) = 0


    vmull.s16   q12,d6,d0[1]                @// y1 * cos1(part of b0)
    vmull.s16   q13,d6,d0[3]                @// y1 * cos3(part of b1)
    vmull.s16   q14,d6,d1[1]                @// y1 * sin3(part of b2)
    vmull.s16   q15,d6,d1[3]                @// y1 * sin1(part of b3)

    vmlal.s16   q12,d7,d0[3]                @// y1 * cos1 + y3 * cos3(part of b0)
    vmlsl.s16   q13,d7,d1[3]                @// y1 * cos3 - y3 * sin1(part of b1)
    vmlsl.s16   q14,d7,d0[1]                @// y1 * sin3 - y3 * cos1(part of b2)
    vmlsl.s16   q15,d7,d1[1]                @// y1 * sin1 - y3 * sin3(part of b3)

    vmull.s16   q9,d3,d1[2]                 @// y2 * sin2 (q3 is freed by this time)(part of d1)
    vmull.s16   q3,d3,d0[2]                 @// y2 * cos2(part of d0)

    vmull.s16   q10,d2,d0[0]                @// y0 * cos4(part of c0 and c1)


    vadd.s32    q7,q10,q3                   @// a0 = c0 + d0(part of r0,r7)
    vsub.s32    q5,q10,q3                   @// a3 = c0 - d0(part of r3,r4)
    vsub.s32    q11,q10,q9                  @// a2 = c1 - d1(part of r2,r5)
    vadd.s32    q9,q10,q9                   @// a1 = c1 + d1(part of r1,r6)

    vadd.s32    q10,q7,q12                  @// a0 + b0(part of r0)
    vsub.s32    q3,q7,q12                   @// a0 - b0(part of r7)

    vadd.s32    q12,q11,q14                 @// a2 + b2(part of r2)
    vsub.s32    q11,q11,q14                 @// a2 - b2(part of r5)

    vadd.s32    q14,q9,q13                  @// a1 + b1(part of r1)
    vsub.s32    q9,q9,q13                   @// a1 - b1(part of r6)

    vadd.s32    q13,q5,q15                  @// a3 + b3(part of r3)
    vsub.s32    q15,q5,q15                  @// a3 - b3(part of r4)

    vqrshrn.s32 d2,q10,#shift_stage1_idct   @// r0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
    vqrshrn.s32 d15,q3,#shift_stage1_idct   @// r7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
    vqrshrn.s32 d3,q12,#shift_stage1_idct   @// r2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
    vqrshrn.s32 d14,q11,#shift_stage1_idct  @// r5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
    vqrshrn.s32 d6,q14,#shift_stage1_idct   @// r1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
    vqrshrn.s32 d11,q9,#shift_stage1_idct   @// r6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
    vqrshrn.s32 d7,q13,#shift_stage1_idct   @// r3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
    vqrshrn.s32 d10,q15,#shift_stage1_idct  @// r4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)


last4_cols:
    @ first pass for columns 4-7 (the "second half" registers), unless flagged as zero


    cmp         r12,#0xf0
    bge         skip_last4cols              @ value >= 0xf0: columns 4-7 are not processed

    vmull.s16   q12,d8,d0[1]                @// y1 * cos1(part of b0)
    vmull.s16   q13,d8,d0[3]                @// y1 * cos3(part of b1)
    vmull.s16   q14,d8,d1[1]                @// y1 * sin3(part of b2)
    vmull.s16   q15,d8,d1[3]                @// y1 * sin1(part of b3)

    vmlal.s16   q12,d9,d0[3]                @// y1 * cos1 + y3 * cos3(part of b0)
    vmlsl.s16   q13,d9,d1[3]                @// y1 * cos3 - y3 * sin1(part of b1)
    vmlsl.s16   q14,d9,d0[1]                @// y1 * sin3 - y3 * cos1(part of b2)
    vmlsl.s16   q15,d9,d1[1]                @// y1 * sin1 - y3 * sin3(part of b3)

    vmull.s16   q9,d5,d1[2]                 @// y2 * sin2 (q4 is freed by this time)(part of d1)
    vmull.s16   q4,d5,d0[2]                 @// y2 * cos2(part of d0)

    vmull.s16   q10,d4,d0[0]                @// y0 * cos4(part of c0 and c1)
    vmull.s16   q11,d12,d0[0]               @// y4 * cos4(part of c0 and c1)

    vmlal.s16   q12,d16,d1[1]               @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
    vmlsl.s16   q13,d16,d0[1]               @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
    vmlal.s16   q14,d16,d1[3]               @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
    vmlal.s16   q15,d16,d0[3]               @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)

    vmlsl.s16   q9,d13,d0[2]                @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
    vmlal.s16   q4,d13,d1[2]                @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)

    vadd.s32    q6,q10,q11                  @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    vsub.s32    q10,q10,q11                 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

    vmlal.s16   q12,d17,d1[3]               @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of e0,e7)
    vmlsl.s16   q13,d17,d1[1]               @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of e1,e6)
    vmlal.s16   q14,d17,d0[3]               @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of e2,e5)
    vmlsl.s16   q15,d17,d0[1]               @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of e3,e4)

    vadd.s32    q8,q6,q4                    @// a0 = c0 + d0(part of e0,e7)
    vsub.s32    q6,q6,q4                    @// a3 = c0 - d0(part of e3,e4)
    vsub.s32    q11,q10,q9                  @// a2 = c1 - d1(part of e2,e5)
    vadd.s32    q9,q10,q9                   @// a1 = c1 + d1(part of e1,e6)

    vadd.s32    q10,q8,q12                  @// a0 + b0(part of e0)
    vsub.s32    q4,q8,q12                   @// a0 - b0(part of e7)

    vadd.s32    q12,q11,q14                 @// a2 + b2(part of e2)
    vsub.s32    q11,q11,q14                 @// a2 - b2(part of e5)

    vadd.s32    q14,q9,q13                  @// a1 + b1(part of e1)
    vsub.s32    q9,q9,q13                   @// a1 - b1(part of e6)

    vadd.s32    q13,q6,q15                  @// a3 + b3(part of e3)
    vsub.s32    q15,q6,q15                  @// a3 - b3(part of r4)

    vqrshrn.s32 d4,q10,#shift_stage1_idct   @// r0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
    vqrshrn.s32 d17,q4,#shift_stage1_idct   @// r7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
    vqrshrn.s32 d5,q12,#shift_stage1_idct   @// r2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
    vqrshrn.s32 d16,q11,#shift_stage1_idct  @// r5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
    vqrshrn.s32 d8,q14,#shift_stage1_idct   @// r1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
    vqrshrn.s32 d13,q9,#shift_stage1_idct   @// r6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
    vqrshrn.s32 d9,q13,#shift_stage1_idct   @// r3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
    vqrshrn.s32 d12,q15,#shift_stage1_idct  @// r4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
    b           end_skip_last4cols


skip_last4cols:
    @ columns 4-7 were skipped: only the first and third quadrants are transposed, and the
    @ second pass below uses the y0..y3 terms only


    vtrn.16     q1,q3                       @//[r3,r1],[r2,r0] first quadrant transposing

    vtrn.16     q5,q7                       @//[r7,r5],[r6,r4] third quadrant transposing


    vtrn.32     d6,d7                       @//r0,r1,r2,r3 first quadrant transposing continued.....
    vtrn.32     d2,d3                       @//r0,r1,r2,r3 first quadrant transposing continued.....

    vtrn.32     d10,d11                     @//r4,r5,r6,r7 third quadrant transposing continued.....
    vtrn.32     d14,d15                     @//r4,r5,r6,r7 third quadrant transposing continued.....


    @ second pass, inputs d2/d6/d3/d7
    vmull.s16   q12,d6,d0[1]                @// y1 * cos1(part of b0)
    vmull.s16   q13,d6,d0[3]                @// y1 * cos3(part of b1)
    vmull.s16   q14,d6,d1[1]                @// y1 * sin3(part of b2)
    vmull.s16   q15,d6,d1[3]                @// y1 * sin1(part of b3)

    vmlal.s16   q12,d7,d0[3]                @// y1 * cos1 + y3 * cos3(part of b0)
    vmlsl.s16   q13,d7,d1[3]                @// y1 * cos3 - y3 * sin1(part of b1)
    vmlsl.s16   q14,d7,d0[1]                @// y1 * sin3 - y3 * cos1(part of b2)
    vmlsl.s16   q15,d7,d1[1]                @// y1 * sin1 - y3 * sin3(part of b3)

    vmull.s16   q10,d2,d0[0]                @// y0 * cos4(part of c0 and c1)
@   vmull.s16   q11,d4,d0[0]                    @// y4 * cos4(part of c0 and c1)

    vmull.s16   q9,d3,d1[2]                 @// y2 * sin2 (q3 is freed by this time)(part of d1)
    vmull.s16   q3,d3,d0[2]                 @// y2 * cos2(part of d0)


    vsub.s32    q11,q10,q3                  @// a3 = c0 - d0(part of r3,r4)
    vadd.s32    q2,q10,q3                   @// a0 = c0 + d0(part of r0,r7)


    vadd.s32    q1,q2,q12                   @ a0 + b0

    vsub.s32    q3,q2,q12                   @ a0 - b0

    vadd.s32    q4,q11,q15                  @ a3 + b3

    vsub.s32    q12,q11,q15                 @ a3 - b3

    vqrshrn.s32 d5,q4,#shift_stage2_idct
    vqrshrn.s32 d2,q1,#shift_stage2_idct
    vqrshrn.s32 d9,q3,#shift_stage2_idct
    vqrshrn.s32 d6,q12,#shift_stage2_idct

    vsub.s32    q11,q10,q9                  @// a2 = c1 - d1(part of r2,r5)
    vadd.s32    q9,q10,q9                   @// a1 = c1 + d1(part of r1,r6)


    vadd.s32    q15,q11,q14                 @ a2 + b2

    vsub.s32    q12,q11,q14                 @ a2 - b2

    vadd.s32    q14,q9,q13                  @ a1 + b1

    vsub.s32    q11,q9,q13                  @ a1 - b1
    vqrshrn.s32 d4,q15,#shift_stage2_idct
    vqrshrn.s32 d7,q12,#shift_stage2_idct
    vqrshrn.s32 d3,q14,#shift_stage2_idct
    vqrshrn.s32 d8,q11,#shift_stage2_idct


    @ second pass, inputs d10/d14/d11/d15, interleaved with the transpose of the
    @ results just produced in d2-d9
    vmull.s16   q12,d14,d0[1]               @// y1 * cos1(part of b0)

    vmull.s16   q13,d14,d0[3]               @// y1 * cos3(part of b1)
    vmull.s16   q14,d14,d1[1]               @// y1 * sin3(part of b2)
    vmull.s16   q15,d14,d1[3]               @// y1 * sin1(part of b3)

    vmlal.s16   q12,d15,d0[3]               @// y1 * cos1 + y3 * cos3(part of b0)
    vtrn.16     d2,d3
    vmlsl.s16   q13,d15,d1[3]               @// y1 * cos3 - y3 * sin1(part of b1)
    vtrn.16     d4,d5
    vmlsl.s16   q14,d15,d0[1]               @// y1 * sin3 - y3 * cos1(part of b2)
    vtrn.16     d6,d7
    vmlsl.s16   q15,d15,d1[1]               @// y1 * sin1 - y3 * sin3(part of b3)
    vtrn.16     d8,d9
    vmull.s16   q10,d10,d0[0]               @// y0 * cos4(part of c0 and c1)
    vtrn.32     d2,d4

    vtrn.32     d3,d5
    vmull.s16   q9,d11,d1[2]                @// y2 * sin2 (q7 is freed by this time)(part of d1)
    vtrn.32     d6,d8
    vmull.s16   q7,d11,d0[2]                @// y2 * cos2(part of d0)
    vtrn.32     d7,d9


    add         r4,r2,r8, lsl #1            @ r4 = r2 + pred_strd * 2    => r4 points to 3rd row of pred data


    add         r5,r8,r8, lsl #1            @ r5 = 3 * pred_strd


    add         r0,r3,r7, lsl #1            @ r0 points to 3rd row of dest data


    add         r10,r7,r7, lsl #1           @ r10 = 3 * dst_strd


    vswp        d3,d6


    vswp        d5,d8


    vsub.s32    q11,q10,q7                  @// a3 = c0 - d0(part of r3,r4)
    vadd.s32    q6,q10,q7                   @// a0 = c0 + d0(part of r0,r7)


    vadd.s32    q0,q6,q12                   @ a0 + b0 (q0 is reused: the constants are not read again on this path)


    vsub.s32    q12,q6,q12                  @ a0 - b0


    vadd.s32    q6,q11,q15                  @ a3 + b3


    vsub.s32    q7,q11,q15                  @ a3 - b3

    vqrshrn.s32 d10,q0,#shift_stage2_idct
    vqrshrn.s32 d17,q12,#shift_stage2_idct
    vqrshrn.s32 d13,q6,#shift_stage2_idct
    vqrshrn.s32 d14,q7,#shift_stage2_idct

    vsub.s32    q11,q10,q9                  @// a2 = c1 - d1(part of r2,r5)
    vadd.s32    q9,q10,q9                   @// a1 = c1 + d1(part of r1,r6)


    vadd.s32    q0,q11,q14                  @ a2 + b2


    vsub.s32    q12,q11,q14                 @ a2 - b2


    vadd.s32    q14,q9,q13                  @ a1 + b1


    vsub.s32    q13,q9,q13                  @ a1 - b1
    vld1.8      d18,[r2],r8                 @ pred row 0

    vqrshrn.s32 d12,q0,#shift_stage2_idct
    vld1.8      d20,[r2],r5                 @ pred row 1


    vqrshrn.s32 d15,q12,#shift_stage2_idct
    vld1.8      d19,[r2],r8                 @ pred row 4


    vqrshrn.s32 d11,q14,#shift_stage2_idct
    vld1.8      d22,[r4],r8                 @ pred row 2


    vqrshrn.s32 d16,q13,#shift_stage2_idct
    vld1.8      d21,[r2],r5                 @ pred row 5


    b           pred_buff_addition
end_skip_last4cols:


@/* now the idct of columns is done, transpose so that row idct done efficiently(step5) */
    vtrn.16     q1,q3                       @//[r3,r1],[r2,r0] first quadrant transposing
    vtrn.16     q2,q4                       @//[r3,r1],[r2,r0] second quadrant transposing
    vtrn.16     q5,q7                       @//[r7,r5],[r6,r4] third quadrant transposing
    vtrn.16     q6,q8                       @//[r7,r5],[r6,r4] fourth quadrant transposing

    vtrn.32     d6,d7                       @//r0,r1,r2,r3 first quadrant transposing continued.....
    vtrn.32     d2,d3                       @//r0,r1,r2,r3 first quadrant transposing continued.....
    vtrn.32     d4,d5                       @//r0,r1,r2,r3 second quadrant transposing continued.....
    vtrn.32     d8,d9                       @//r0,r1,r2,r3 second quadrant transposing continued.....
    vtrn.32     d10,d11                     @//r4,r5,r6,r7 third quadrant transposing continued.....
    vtrn.32     d14,d15                     @//r4,r5,r6,r7 third quadrant transposing continued.....
    vtrn.32     d12,d13                     @//r4,r5,r6,r7 fourth quadrant transposing continued.....
    vtrn.32     d16,d17                     @//r4,r5,r6,r7 fourth quadrant transposing continued.....

    @//step6 operate on first four rows and find their idct
    @//register usage        - storing and idct of rows
@// cosine constants    -   d0
@// sine constants      -   d1
@// element 0 first four    -   d2      -   y0
@// element 1 first four    -   d6      -   y1
@// element 2 first four    -   d3      -   y2
@// element 3 first four    -   d7      -   y3
@// element 4 first four    -   d4      -   y4
@// element 5 first four    -   d8      -   y5
@// element 6 first four    -   d5      -   y6
@// element 7 first four    -   d9      -   y7
@// element 0 second four   -   d10     -   y0
@// element 1 second four   -   d14     -   y1
@// element 2 second four   -   d11     -   y2
@// element 3 second four   -   d15     -   y3
@// element 4 second four   -   d12     -   y4
@// element 5 second four   -   d16     -   y5
@// element 6 second four   -   d13     -   y6
@// element 7 second four   -   d17     -   y7

    @// map between first kernel code seq and current
@//     d2  ->  d2
@//     d6  ->  d6
@//     d3  ->  d3
@//     d7  ->  d7
@//     d10 ->  d4
@//     d14 ->  d8
@//     d11 ->  d5
@//     d15 ->  d9
@//     q3  ->  q3
@//     q5  ->  q2
@//     q7  ->  q4

    vmull.s16   q12,d6,d0[1]                @// y1 * cos1(part of b0)
    vmull.s16   q13,d6,d0[3]                @// y1 * cos3(part of b1)
    vmull.s16   q14,d6,d1[1]                @// y1 * sin3(part of b2)
    vmull.s16   q15,d6,d1[3]                @// y1 * sin1(part of b3)

    vmlal.s16   q12,d7,d0[3]                @// y1 * cos1 + y3 * cos3(part of b0)
    vmlsl.s16   q13,d7,d1[3]                @// y1 * cos3 - y3 * sin1(part of b1)
    vmlsl.s16   q14,d7,d0[1]                @// y1 * sin3 - y3 * cos1(part of b2)
    vmlsl.s16   q15,d7,d1[1]                @// y1 * sin1 - y3 * sin3(part of b3)

    vmull.s16   q10,d2,d0[0]                @// y0 * cos4(part of c0 and c1)
    vmull.s16   q11,d4,d0[0]                @// y4 * cos4(part of c0 and c1)

    vmull.s16   q9,d3,d1[2]                 @// y2 * sin2 (q3 is freed by this time)(part of d1)
    vmull.s16   q3,d3,d0[2]                 @// y2 * cos2(part of d0)


    vmlal.s16   q12,d8,d1[1]                @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
    vmlsl.s16   q13,d8,d0[1]                @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
    vmlal.s16   q14,d8,d1[3]                @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
    vmlal.s16   q15,d8,d0[3]                @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)

    vmlsl.s16   q9,d5,d0[2]                 @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
    vmlal.s16   q3,d5,d1[2]                 @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)

    vadd.s32    q1,q10,q11                  @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    vsub.s32    q10,q10,q11                 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

    vmlal.s16   q12,d9,d1[3]                @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of r0,r7)
    vmlsl.s16   q13,d9,d1[1]                @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of r1,r6)
    vmlal.s16   q14,d9,d0[3]                @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of r2,r5)
    vmlsl.s16   q15,d9,d0[1]                @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of r3,r4)

    vsub.s32    q11,q1,q3                   @// a3 = c0 - d0(part of r3,r4)
    vadd.s32    q2,q1,q3                    @// a0 = c0 + d0(part of r0,r7)


    vadd.s32    q1,q2,q12                   @ a0 + b0

    vsub.s32    q3,q2,q12                   @ a0 - b0

    vadd.s32    q4,q11,q15                  @ a3 + b3

    vsub.s32    q12,q11,q15                 @ a3 - b3

    vqrshrn.s32 d5,q4,#shift_stage2_idct
    vqrshrn.s32 d2,q1,#shift_stage2_idct
    vqrshrn.s32 d9,q3,#shift_stage2_idct
    vqrshrn.s32 d6,q12,#shift_stage2_idct

    vsub.s32    q11,q10,q9                  @// a2 = c1 - d1(part of r2,r5)
    vadd.s32    q9,q10,q9                   @// a1 = c1 + d1(part of r1,r6)


    vadd.s32    q15,q11,q14                 @ a2 + b2

    vsub.s32    q12,q11,q14                 @ a2 - b2

    vadd.s32    q14,q9,q13                  @ a1 + b1

    vsub.s32    q11,q9,q13                  @ a1 - b1
    vqrshrn.s32 d4,q15,#shift_stage2_idct
    vqrshrn.s32 d7,q12,#shift_stage2_idct
    vqrshrn.s32 d3,q14,#shift_stage2_idct
    vqrshrn.s32 d8,q11,#shift_stage2_idct


    @ second pass for the "second four" inputs (d10-d17), interleaved with the transpose
    @ of the results just produced in d2-d9 and with the pred/dst address setup
    vmull.s16   q12,d14,d0[1]               @// y1 * cos1(part of b0)

    vmull.s16   q13,d14,d0[3]               @// y1 * cos3(part of b1)
    vmull.s16   q14,d14,d1[1]               @// y1 * sin3(part of b2)
    vmull.s16   q15,d14,d1[3]               @// y1 * sin1(part of b3)

    vmlal.s16   q12,d15,d0[3]               @// y1 * cos1 + y3 * cos3(part of b0)
    vtrn.16     d2,d3
    vmlsl.s16   q13,d15,d1[3]               @// y1 * cos3 - y3 * sin1(part of b1)
    vtrn.16     d4,d5
    vmlsl.s16   q14,d15,d0[1]               @// y1 * sin3 - y3 * cos1(part of b2)
    vtrn.16     d6,d7
    vmlsl.s16   q15,d15,d1[1]               @// y1 * sin1 - y3 * sin3(part of b3)
    vtrn.16     d8,d9
    vmull.s16   q10,d10,d0[0]               @// y0 * cos4(part of c0 and c1)
    vtrn.32     d2,d4
    vmull.s16   q11,d12,d0[0]               @// y4 * cos4(part of c0 and c1)
    vtrn.32     d3,d5
    vmull.s16   q9,d11,d1[2]                @// y2 * sin2 (q7 is freed by this time)(part of d1)
    vtrn.32     d6,d8
    vmull.s16   q7,d11,d0[2]                @// y2 * cos2(part of d0)
    vtrn.32     d7,d9
    vmlal.s16   q12,d16,d1[1]               @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)

    add         r4,r2,r8, lsl #1            @ r4 = r2 + pred_strd * 2    => r4 points to 3rd row of pred data
    vmlsl.s16   q13,d16,d0[1]               @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)

    add         r5,r8,r8, lsl #1            @ r5 = 3 * pred_strd
    vmlal.s16   q14,d16,d1[3]               @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)

    add         r0,r3,r7, lsl #1            @ r0 points to 3rd row of dest data
    vmlal.s16   q15,d16,d0[3]               @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)

    add         r10,r7,r7, lsl #1           @ r10 = 3 * dst_strd
    vmlsl.s16   q9,d13,d0[2]                @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)


    vmlal.s16   q7,d13,d1[2]                @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)

    vadd.s32    q6,q10,q11                  @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    vsub.s32    q10,q10,q11                 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

    vmlal.s16   q12,d17,d1[3]               @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of r0,r7)
    vswp        d3,d6
    vmlsl.s16   q13,d17,d1[1]               @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of r1,r6)

    vswp        d5,d8
    vmlal.s16   q14,d17,d0[3]               @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of r2,r5)
    vmlsl.s16   q15,d17,d0[1]               @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of r3,r4)

    vsub.s32    q11,q6,q7                   @// a3 = c0 - d0(part of r3,r4)
    vadd.s32    q6,q6,q7                    @// a0 = c0 + d0(part of r0,r7)


    vadd.s32    q0,q6,q12                   @ a0 + b0 (q0 is reused: the constants are not read again)


    vsub.s32    q12,q6,q12                  @ a0 - b0


    vadd.s32    q6,q11,q15                  @ a3 + b3


    vsub.s32    q7,q11,q15                  @ a3 - b3

    vqrshrn.s32 d10,q0,#shift_stage2_idct
    vqrshrn.s32 d17,q12,#shift_stage2_idct
    vqrshrn.s32 d13,q6,#shift_stage2_idct
    vqrshrn.s32 d14,q7,#shift_stage2_idct

    vsub.s32    q11,q10,q9                  @// a2 = c1 - d1(part of r2,r5)
    vadd.s32    q9,q10,q9                   @// a1 = c1 + d1(part of r1,r6)


    vadd.s32    q0,q11,q14                  @ a2 + b2


    vsub.s32    q12,q11,q14                 @ a2 - b2


    vadd.s32    q14,q9,q13                  @ a1 + b1


    vsub.s32    q13,q9,q13                  @ a1 - b1
    vld1.8      d18,[r2],r8                 @ pred row 0

    vqrshrn.s32 d12,q0,#shift_stage2_idct
    vld1.8      d20,[r2],r5                 @ pred row 1


    vqrshrn.s32 d15,q12,#shift_stage2_idct
    vld1.8      d19,[r2],r8                 @ pred row 4


    vqrshrn.s32 d11,q14,#shift_stage2_idct
    vld1.8      d22,[r4],r8                 @ pred row 2


    vqrshrn.s32 d16,q13,#shift_stage2_idct
    vld1.8      d21,[r2],r5                 @ pred row 5


pred_buff_addition:
    @ both paths arrive here with pred rows 0,1,2,4,5 already loaded in d18,d20,d22,d19,d21;
    @ the remaining pred rows are loaded while the second group of results is transposed


    vtrn.16     d10,d11
    vld1.8      d24,[r4],r5                 @ pred row 3

    vtrn.16     d12,d13
    vld1.8      d23,[r4],r8                 @ pred row 6

    vaddw.u8    q1,q1,d18                   @ row 0 += pred row 0
    vld1.8      d25,[r4],r5                 @ pred row 7

    vtrn.16     d14,d15
    vaddw.u8    q2,q2,d22                   @ row 2 += pred row 2

    vtrn.16     d16,d17
    vaddw.u8    q3,q3,d20                   @ row 1 += pred row 1

    vtrn.32     d10,d12
    vaddw.u8    q4,q4,d24                   @ row 3 += pred row 3

    vtrn.32     d11,d13
    vtrn.32     d14,d16
    vtrn.32     d15,d17

    vswp        d11,d14
    vswp        d13,d16

@ row values stored in the q register.

@q1 :r0
@q3: r1
@q2: r2
@q4: r3
@q5: r4
@q7: r5
@q6: r6
@q8: r7


@/// adding the prediction buffer


    @ load prediction data


    @adding recon with prediction


    vaddw.u8    q5,q5,d19                   @ row 4 += pred row 4
    vqmovun.s16 d2,q1                       @ saturate row 0 to unsigned 8 bit
    vaddw.u8    q7,q7,d21                   @ row 5 += pred row 5
    vqmovun.s16 d4,q2                       @ saturate row 2
    vaddw.u8    q6,q6,d23                   @ row 6 += pred row 6
    vqmovun.s16 d6,q3                       @ saturate row 1
    vaddw.u8    q8,q8,d25                   @ row 7 += pred row 7
    vqmovun.s16 d8,q4                       @ saturate row 3


    @ r3 walks dst rows 0,1,4,5 and r0 walks dst rows 2,3,6,7
    vst1.8      {d2},[r3],r7
    vqmovun.s16 d10,q5
    vst1.8      {d6},[r3],r10
    vqmovun.s16 d14,q7
    vst1.8      {d4},[r0],r7
    vqmovun.s16 d12,q6
    vst1.8      {d8},[r0],r10
    vqmovun.s16 d16,q8


    vst1.8      {d10},[r3],r7
    vst1.8      {d14},[r3],r10
    vst1.8      {d12},[r0],r7
    vst1.8      {d16},[r0],r10


    vpop        {d8-d15}                    @ restore the callee-saved NEON registers pushed in the prologue
    ldmfd       sp!,{r4-r12,pc}