1@/******************************************************************************
2@ *
3@ * Copyright (C) 2015 The Android Open Source Project
4@ *
5@ * Licensed under the Apache License, Version 2.0 (the "License");
6@ * you may not use this file except in compliance with the License.
7@ * You may obtain a copy of the License at:
8@ *
9@ * http://www.apache.org/licenses/LICENSE-2.0
10@ *
11@ * Unless required by applicable law or agreed to in writing, software
12@ * distributed under the License is distributed on an "AS IS" BASIS,
13@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@ * See the License for the specific language governing permissions and
15@ * limitations under the License.
16@ *
17@ *****************************************************************************
18@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19@*/
20@**
21@ *******************************************************************************
22@ * @file
23@ *  ih264_iquant_itrans_recon_dc_a9.s
24@ *
25@ * @brief
26@ *  Contains function definitions for single stage  inverse transform
27@ *
28@ * @author
29@ *  Mohit
30@ *
31@ * @par List of Functions:
32@ *  - ih264_iquant_itrans_recon_4x4_dc_a9()
33@ *  - ih264_iquant_itrans_recon_8x8_dc_a9()
34@ *  - ih264_iquant_itrans_recon_chroma_4x4_dc_a9()
35@ *
36@ * @remarks
37@ *  None
38@ *
39@ *******************************************************************************
40@*
41@**
42@ *******************************************************************************
43@ *
44@ * @brief
45@ *  This function performs inverse quant and Inverse transform type Ci4 for 4*4 block
46@ *  for dc input pattern only, i.e. only the (0,0) element of the input 4x4 block is
47@ *  non-zero. For complete function, refer ih264_iquant_itrans_recon_a9.s
48@ *
49@ * @par Description:
50@ *  Performs inverse transform Ci4 and adds the residue to get the
51@ *  reconstructed block
52@ *
53@ * @param[in] pi2_src
54@ *  Input 4x4 coefficients
55@ *
56@ * @param[in] pu1_pred
57@ *  Prediction 4x4 block
58@ *
59@ * @param[out] pu1_out
60@ *  Output 4x4 block
61@ *
62@ * @param[in] u4_qp_div_6
63@ *     QP
64@ *
65@ * @param[in] pu2_weigh_mat
66@ * Pointer to weight matrix
67@ *
68@ * @param[in] pred_strd,
69@ *  Prediction stride
70@ *
71@ * @param[in] out_strd
72@ *  Output Stride
73@ *
74@ *@param[in] pi2_tmp
75@ * temporary buffer of size 1*16
76@ *
77@ * @param[in] pu2_iscal_mat
78@ * Pointer to the inverse quantization matrix
79@ *
80@ * @returns  Void
81@ *
82@ * @remarks
83@ *  None
84@ *
85@ *******************************************************************************
86@ *
87@void ih264_iquant_itrans_recon_4x4_dc(WORD16 *pi2_src,
88@                                   UWORD8 *pu1_pred,
89@                                   UWORD8 *pu1_out,
90@                                   WORD32 pred_strd,
91@                                   WORD32 out_strd,
92@                                   const UWORD16 *pu2_iscal_mat,
93@                                   const UWORD16 *pu2_weigh_mat,
94@                                   UWORD32 u4_qp_div_6,
95@                                   WORD32 *pi4_tmp,
96@                                   WORD32 iq_start_idx
97@                                   WORD16 *pi2_dc_ld_addr)
98@**************Variables Vs Registers*****************************************
99@r0 => *pi2_src
100@r1 => *pu1_pred
101@r2 => *pu1_out
102@r3 =>  pred_strd
103@r4 =>  out_strd
104@r5 =>  *pu2_iscal_mat
105@r6 =>  *pu2_weigh_mat
106@r7 =>  u4_qp_div_6
107@r9 =>  iq_start_idx
108@unused =>  pi2_dc_ld_addr
109
110.text
111.syntax unified
112.p2align 2
113
114    .global ih264_iquant_itrans_recon_4x4_dc_a9
115
ih264_iquant_itrans_recon_4x4_dc_a9:

@ DC-only 4x4 reconstruction: out = saturate_u8(pred + dc), where dc is one
@ value derived from pi2_src[0] and added to all 16 prediction pixels.
@
@ Register roles (ARM mode, AAPCS):
@   r0 = pi2_src          r1 = pu1_pred       r2 = pu1_out      r3 = pred_strd
@   r4 = out_strd         r5 = iscal pointer, then pu2_iscal_mat[0]
@   r6 = weigh pointer, then the running dequant / dc value
@   r7 = u4_qp_div_6      r8 = pi2_src[0]     r9 = iq_start_idx
@ pi4_tmp and pi2_dc_ld_addr are never read.
@
@ Dequantisation uses a single code path for every u4_qp_div_6:
@   q0 = ((src0 * iscal0 * weigh0) << u4_qp_div_6) + 8) >> 4
@ i.e. the left shift is always applied first, then the rounding constant 8
@ and an arithmetic right shift by 4.
@
@ Leaf function: lr is never overwritten, so only r4-r9 are saved (24 bytes)
@ and the return is a plain bx lr.

    push          {r4-r9}               @ stack arguments now start at [sp, #24]
    ldr           r5, [sp, #28]         @ r5 = pu2_iscal_mat
    ldr           r6, [sp, #32]         @ r6 = pu2_weigh_mat
    ldrsh         r8, [r0]              @ r8 = pi2_src[0], sign-extended
    ldrh          r6, [r6]              @ r6 = pu2_weigh_mat[0], zero-extended
    ldrh          r5, [r5]              @ r5 = pu2_iscal_mat[0], zero-extended
@=======================DEQUANT FROM HERE===================================
    mul           r6, r6, r5            @ pu2_iscal_mat[0] * pu2_weigh_mat[0]
    ldr           r7, [sp, #36]         @ r7 = u4_qp_div_6
    mul           r6, r6, r8            @ ... * pi2_src[0]
    ldr           r4, [sp, #24]         @ r4 = out_strd
    ldr           r9, [sp, #44]         @ r9 = iq_start_idx

    lsl           r6, r6, r7            @ product << u4_qp_div_6
    add           r6, r6, #8            @ + rounding constant
    asr           r6, r6, #4            @ q0 = (shifted product + 8) >> 4

    cmp           r9, #1                @ iq_start_idx == 1 ?
    moveq         r6, r8                @ yes: drop the dequantised value and use pi2_src[0]
                                        @ as-is (r8 still holds it, so no reload is needed)

    add           r6, r6, #32           @ i_macro = q0 + 32
    asr           r6, r6, #6            @ dc = i_macro >> 6
    vdup.16       q0, r6                @ broadcast dc to eight 16-bit lanes

    vld1.32       {d30[0]}, [r1], r3    @ pred row 0 (4 bytes)
    vld1.32       {d30[1]}, [r1], r3    @ pred row 1
    vld1.32       {d31[0]}, [r1], r3    @ pred row 2
    vld1.32       {d31[1]}, [r1], r3    @ pred row 3

    vaddw.u8      q10, q0, d30          @ rows 0,1: dc + zero-extended pred, 16-bit lanes
    vaddw.u8      q11, q0, d31          @ rows 2,3

    vqmovun.s16   d0, q10               @ saturate to 0..255 and narrow (q0 is dead from here on)
    vqmovun.s16   d1, q11

    vst1.32       {d0[0]}, [r2], r4     @ out row 0
    vst1.32       {d0[1]}, [r2], r4     @ out row 1
    vst1.32       {d1[0]}, [r2], r4     @ out row 2
    vst1.32       {d1[1]}, [r2]         @ out row 3

    pop           {r4-r9}
    bx            lr
167
168
169
170
171@*
172@ *******************************************************************************
173@ *
174@ * @brief
175@ *  This function performs inverse quant and Inverse transform type Ci8 for 8*8 block
176@ *  for dc input pattern only, i.e. only the (0,0) element of the input 8x8 block is
177@ *  non-zero. For complete function, refer ih264_iquant_itrans_recon_a9.s
178@ *
179@ * @par Description:
180@ *  Performs inverse transform Ci8 and adds the residue to get the
181@ *  reconstructed block
182@ *
183@ * @param[in] pi2_src
184@ *  Input 8x8 coefficients
185@ *
186@ * @param[in] pu1_pred
187@ *  Prediction 8x8 block
188@ *
189@ * @param[out] pu1_out
190@ *  Output 8x8 block
191@ *
192@ * @param[in] u4_qp_div_6
193@ *     QP
194@ *
195@ * @param[in] pu2_weigh_mat
196@ * Pointer to weight matrix
197@ *
198@ * @param[in] pred_strd,
199@ *  Prediction stride
200@ *
201@ * @param[in] out_strd
202@ *  Output Stride
203@ *
204@ *@param[in] pi2_tmp
205@ * temporary buffer of size 1*64
206@ *
207@ * @param[in] pu2_iscal_mat
208@ * Pointer to the inverse quantization matrix
209@ *
210@ * @returns  Void
211@ *
212@ * @remarks
213@ *  None
214@ *
215@ *******************************************************************************
216@ *
217@void ih264_iquant_itrans_recon_8x8_dc(WORD16 *pi2_src,
218@                                   UWORD8 *pu1_pred,
219@                                   UWORD8 *pu1_out,
220@                                   WORD32 pred_strd,
221@                                   WORD32 out_strd,
222@                                   const UWORD16 *pu2_iscal_mat,
223@                                   const UWORD16 *pu2_weigh_mat,
224@                                   UWORD32 u4_qp_div_6,
225@                                   WORD32 *pi4_tmp,
226@                                   WORD32 iq_start_idx)
227@**************Variables Vs Registers*****************************************
228@r0 => *pi2_src
229@r1 => *pu1_pred
230@r2 => *pu1_out
231@r3 =>  pred_strd
232@r4 =>  out_strd
233@r5 =>  *pu2_iscal_mat
234@r6 =>  *pu2_weigh_mat
235@r7 =>  u4_qp_div_6
236
237
238    .global ih264_iquant_itrans_recon_8x8_dc_a9
ih264_iquant_itrans_recon_8x8_dc_a9:

@ DC-only 8x8 reconstruction: out = saturate_u8(pred + dc), where dc is one
@ value derived from pi2_src[0] and added to all 64 prediction pixels.
@
@ Register roles (ARM mode, AAPCS):
@   r0 = pi2_src          r1 = pu1_pred       r2 = pu1_out      r3 = pred_strd
@   r4 = out_strd         r5 = iscal pointer, then pu2_iscal_mat[0]
@   r6 = weigh pointer, then the running dequant / dc value
@   r7 = u4_qp_div_6      r8 = pi2_src[0]
@ pi4_tmp and iq_start_idx are never read.
@
@ NEON: q8 = dc, d24-d31 = the eight pred rows, q0-q3 and q9-q12 = 16-bit sums,
@ d0-d7 = narrowed output rows. Only caller-saved NEON registers
@ (d0-d7, d16-d31) are touched, so d8-d15 need no save/restore.

    push          {r4-r8, lr}           @ stack arguments now start at [sp, #24]
    ldr           r5, [sp, #28]         @ r5 = pu2_iscal_mat
    ldr           r6, [sp, #32]         @ r6 = pu2_weigh_mat
    ldrsh         r8, [r0]              @ r8 = pi2_src[0], sign-extended
    ldrh          r6, [r6]              @ r6 = pu2_weigh_mat[0], zero-extended
    ldrh          r5, [r5]              @ r5 = pu2_iscal_mat[0], zero-extended
@=======================DEQUANT FROM HERE===================================
    mul           r6, r6, r5            @ pu2_iscal_mat[0] * pu2_weigh_mat[0]
    ldr           r7, [sp, #36]         @ r7 = u4_qp_div_6
    mul           r6, r6, r8            @ ... * pi2_src[0]
    ldr           r4, [sp, #24]         @ r4 = out_strd

    lsl           r6, r6, r7            @ product << u4_qp_div_6
    add           r6, r6, #32           @ + rounding constant
    asr           r6, r6, #6            @ q0 = (shifted product + 32) >> 6
    add           r6, r6, #32           @ i_macro = q0 + 32
    asr           r6, r6, #6            @ dc = i_macro >> 6
    vdup.16       q8, r6                @ broadcast dc to eight 16-bit lanes

    @ Load the eight 8-byte pred rows, overlapping the widening adds with the loads.
    vld1.8        {d24}, [r1], r3       @ pred row 0
    vld1.8        {d25}, [r1], r3       @ pred row 1
    vld1.8        {d26}, [r1], r3       @ pred row 2
    vaddw.u8      q0, q8, d24           @ row 0: dc + zero-extended pred
    vld1.8        {d27}, [r1], r3       @ pred row 3
    vaddw.u8      q1, q8, d25           @ row 1
    vld1.8        {d28}, [r1], r3       @ pred row 4
    vaddw.u8      q2, q8, d26           @ row 2
    vld1.8        {d29}, [r1], r3       @ pred row 5
    vaddw.u8      q3, q8, d27           @ row 3
    vld1.8        {d30}, [r1], r3       @ pred row 6
    vaddw.u8      q9, q8, d28           @ row 4
    vld1.8        {d31}, [r1], r3       @ pred row 7

    @ Saturate each 16-bit sum to 0..255 and narrow into d0-d7. Every narrow
    @ writes a D register whose previous contents have already been consumed.
    vqmovun.s16   d0, q0                @ out row 0
    vaddw.u8      q10, q8, d29          @ row 5
    vqmovun.s16   d1, q1                @ out row 1
    vaddw.u8      q11, q8, d30          @ row 6
    vqmovun.s16   d2, q2                @ out row 2
    vqmovun.s16   d3, q3                @ out row 3
    vaddw.u8      q12, q8, d31          @ row 7 (q12 aliases d24/d25, both already used)
    vqmovun.s16   d4, q9                @ out row 4
    vqmovun.s16   d5, q10               @ out row 5
    vst1.8        {d0}, [r2], r4        @ store out row 0
    vqmovun.s16   d6, q11               @ out row 6
    vst1.8        {d1}, [r2], r4        @ store out row 1
    vqmovun.s16   d7, q12               @ out row 7
    vst1.8        {d2}, [r2], r4        @ store out row 2
    vst1.8        {d3}, [r2], r4        @ store out row 3
    vst1.8        {d4}, [r2], r4        @ store out row 4
    vst1.8        {d5}, [r2], r4        @ store out row 5
    vst1.8        {d6}, [r2], r4        @ store out row 6
    vst1.8        {d7}, [r2], r4        @ store out row 7

    pop           {r4-r8, pc}
302
303
304@ *
305@ ********************************************************************************
306@ *
307@ * @brief This function reconstructs a 4x4 sub block from quantized residue and
308@ * prediction buffer if only dc value is present for residue
309@ *
310@ * @par Description:
311@ *  The quantized residue is first inverse quantized,
312@ *  This inverse quantized content is added to the prediction buffer to recon-
313@ *  struct the end output
314@ *
315@ * @param[in] pi2_src
316@ *  quantized dc coefficient
317@ *
318@ * @param[in] pu1_pred
319@ *  prediction 4x4 block in interleaved format
320@ *
321@ * @param[in] pred_strd,
322@ *  Prediction buffer stride in interleaved format
323@ *
324@ * @param[in] out_strd
325@ *  recon buffer Stride
326@ *
327@ * @returns none
328@ *
329@ * @remarks none
330@ *
331@ *******************************************************************************
332@ *
333@ void ih264_iquant_itrans_recon_chroma_4x4_dc(WORD16 *pi2_src,
334@                                             UWORD8 *pu1_pred,
335@                                             UWORD8 *pu1_out,
336@                                             WORD32 pred_strd,
337@                                             WORD32 out_strd,
338@                                             const UWORD16 *pu2_iscal_mat,
339@                                             const UWORD16 *pu2_weigh_mat,
340@                                             UWORD32 u4_qp_div_6,
341@                                             WORD16 *pi2_tmp,
342@                                             WORD16 *pi2_dc_src)
@ Register Usage
@ r0 : scratch - first the pi2_dc_src pointer, then out_strd (pi2_src is never read)
@ r1 : pu1_pred row pointer, later a second copy of pu1_out used for the stores
@ r2 : pu1_out row pointer used to read back the existing output rows
@ r3 : pred_strd
@ Neon registers d0-d5, d18-d21 and q15 (d30-d31) are used
@ These are all caller-saved under the AAPCS, so no arm or neon registers are pushed
    .global ih264_iquant_itrans_recon_chroma_4x4_dc_a9
ih264_iquant_itrans_recon_chroma_4x4_dc_a9:

@ DC-only chroma reconstruction on interleaved data. Each row is 8 bytes; only
@ the even-indexed bytes (the plane being reconstructed) are modified, the
@ odd-indexed bytes of pu1_out are written back unchanged.
@ pu2_iscal_mat, pu2_weigh_mat, u4_qp_div_6 and pi2_tmp are never read.

    ldr           r0, [sp, #20]         @ r0 = pi2_dc_src (6th stack argument)
    vld1.16       {d0[]}, [r0]          @ read exactly one halfword, pi2_dc_src[0], into all lanes of d0

    ldr           r0, [sp]              @ r0 = out_strd (1st stack argument)

    vld1.8        {d2}, [r1], r3        @ pred row 0: the 8 interleaved bytes of this block only
    vld1.8        {d3}, [r1], r3        @ pred row 1
    vrshr.s16     d0, d0, #6            @ i_macro = (dc + 32) >> 6
    vld1.8        {d4}, [r1], r3        @ pred row 2
    vld1.8        {d5}, [r1], r3        @ pred row 3

    vdup.16       q0, d0[0]             @ broadcast i_macro to eight 16-bit lanes
    mov           r1, r2                @ keep a copy of pu1_out for the stores

    vuzp.8        d2, d3                @ d2 = even bytes of rows 0 and 1 (d3 = odd bytes, unused)
    vuzp.8        d4, d5                @ d4 = even bytes of rows 2 and 3 (d5 = odd bytes, unused)

    vmov.i16      q15, #0x00ff          @ bit mask selecting the low byte of every halfword

    vld1.8        {d18}, [r2], r0       @ existing out row 0 (8 interleaved bytes)
    vaddw.u8      q1, q0, d2            @ rows 0,1: i_macro + zero-extended pred
    vld1.8        {d19}, [r2], r0       @ existing out row 1
    vaddw.u8      q2, q0, d4            @ rows 2,3
    vld1.8        {d20}, [r2], r0       @ existing out row 2
    vld1.8        {d21}, [r2], r0       @ existing out row 3

    vqmovun.s16   d2, q1                @ saturate to 0..255 and narrow
    vqmovun.s16   d4, q2

    vmovl.u8      q1, d2                @ widen again: result in the low byte, 0 in the high byte
    vmovl.u8      q2, d4

    vbit          q9, q1, q15           @ insert results into the even bytes of out rows 0,1
    vbit          q10, q2, q15          @ same for out rows 2,3

    vst1.8        {d18}, [r1], r0       @ store out row 0
    vst1.8        {d19}, [r1], r0       @ store out row 1
    vst1.8        {d20}, [r1], r0       @ store out row 2
    vst1.8        {d21}, [r1], r0       @ store out row 3

    bx            lr
395
396
397
398
399
400
401
402