1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20///**
21// *******************************************************************************
22// * @file
23// *  ih264_iquant_itrans_recon_dc_av8.s
24// *
25// * @brief
26// *  Contains function definitions for single stage  inverse transform
27// *
28// * @author
29// *  Mohit
30// *
31// * @par List of Functions:
32// *  - ih264_iquant_itrans_recon_4x4_dc_av8()
33// *     - ih264_iquant_itrans_recon_8x8_dc_av8()
34// *  - ih264_iquant_itrans_recon_chroma_4x4_dc_av8()
35// *
36// * @remarks
37// *  None
38// *
39// *******************************************************************************
40//*/
41
42
43.include "ih264_neon_macros.s"
44
45
46///**
47// *******************************************************************************
48// *
49// * @brief
50// *  This function performs inverse quant and Inverse transform type Ci4 for 4*4 block
51// *     for dc input pattern only, i.e. only the (0,0) element of the input 4x4 block is
52// *  non-zero. For complete function, refer ih264_iquant_itrans_recon_a9.s
53// *
54// * @par Description:
55// *  Performs inverse transform Ci4 and adds the residue to get the
56// *  reconstructed block
57// *
58// * @param[in] pi2_src
59// *  Input 4x4 coefficients
60// *
61// * @param[in] pu1_pred
62// *  Prediction 4x4 block
63// *
64// * @param[out] pu1_out
65// *  Output 4x4 block
66// *
67// * @param[in] u4_qp_div_6
68// *     QP
69// *
70// * @param[in] pu2_weigh_mat
71// * Pointer to weight matrix
72// *
73// * @param[in] pred_strd,
74// *  Prediction stride
75// *
76// * @param[in] out_strd
77// *  Output Stride
78// *
79// *@param[in] pi2_tmp
80// * temporary buffer of size 1*16
81// *
82// * @param[in] pu2_iscal_mat
83// * Pointer to the inverse quantization matrix
84// *
85// * @returns  Void
86// *
87// * @remarks
88// *  None
89// *
90// *******************************************************************************
91// */
92//void ih264_iquant_itrans_recon_4x4_dc(WORD16 *pi2_src,
93//                                    UWORD8 *pu1_pred,
94//                                    UWORD8 *pu1_out,
95//                                    WORD32 pred_strd,
96//                                    WORD32 out_strd,
97//                                    const UWORD16 *pu2_iscal_mat,
98//                                    const UWORD16 *pu2_weigh_mat,
99//                                    UWORD32 u4_qp_div_6,
100//                                    WORD32 *pi4_tmp,
//                                    WORD32 iq_start_idx,
//                                    WORD16 *pi2_dc_ld_addr)
103//**************Variables Vs Registers*****************************************
104//x0 => *pi2_src
105//x1 => *pu1_pred
106//x2 => *pu1_out
107//x3 =>  pred_strd
108//x4 =>  out_strd
109//x5 => *pu2_iscal_mat
110//x6 => *pu2_weigh_mat
111//x7 =>  u4_qp_div_6
//stack      =>  pi4_tmp (not read by the function)
//[sp, #8]   =>  iq_start_idx
//[sp, #16]  => *pi2_dc_ld_addr
115
116.text
117.p2align 2
118
119    .global ih264_iquant_itrans_recon_4x4_dc_av8
120ih264_iquant_itrans_recon_4x4_dc_av8:
121
122    ldr       w8, [sp, #8]              //Loads iq_start_idx
123    subs      w8, w8, #1                // if x8 == 1 => intra case , so result of subtraction is zero and z flag is set
124
125    ldr       x10, [sp, #16]            //Load alternate dc address
126    push_v_regs
127    dup       v30.4s, w7                //Populate the u4_qp_div_6 in Q15
128
129
130    bne       donot_use_pi2_dc_ld_addr_luma_dc
131    ld1       {v0.h}[0], [x10]
132donot_use_pi2_dc_ld_addr_luma_dc:
133
134    beq       donot_use_pi2_src_luma_dc
135    ld1       {v0.h}[0], [x5]
136    ld1       {v1.h}[0], [x6]
137    ld1       {v2.h}[0], [x0]
138    mul       v0.4h, v1.4h, v0.4h
139    smull     v0.4s, v0.4h, v2.4h
140    sshl      v0.4s, v0.4s, v30.4s
141    sqrshrn   v0.4h, v0.4s, #4
142donot_use_pi2_src_luma_dc:
143
144
145    dup       v0.8h, v0.h[0]
146    srshr     v0.8h, v0.8h, #6
147
148    ld1       {v1.s}[0], [x1], x3
149    ld1       {v1.s}[1], [x1], x3
150    ld1       {v2.s}[0], [x1], x3
151    ld1       {v2.s}[1], [x1]
152
153    uxtl      v1.8h, v1.8b
154    uxtl      v2.8h, v2.8b
155
156    add       v1.8h, v0.8h, v1.8h
157    add       v2.8h, v0.8h, v2.8h
158
159    sqxtun    v1.8b, v1.8h
160    sqxtun    v2.8b, v2.8h
161
162    st1       {v1.s}[0], [x2], x4
163    st1       {v1.s}[1], [x2], x4
164    st1       {v2.s}[0], [x2], x4
165    st1       {v2.s}[1], [x2]
166    pop_v_regs
167    ret
168
169// /*
170// ********************************************************************************
171// *
// * @brief This function reconstructs a 4x4 sub block from quantized residue and
// * prediction buffer if only dc value is present for residue
174// *
175// * @par Description:
176// *  The quantized residue is first inverse quantized,
177// *  This inverse quantized content is added to the prediction buffer to recon-
178// *  struct the end output
179// *
180// * @param[in] pi2_src
// *  quantized dc coefficient
182// *
183// * @param[in] pu1_pred
184// *  prediction 4x4 block in interleaved format
185// *
186// * @param[in] pred_strd,
187// *  Prediction buffer stride in interleaved format
188// *
189// * @param[in] out_strd
190// *  recon buffer Stride
191// *
192// * @returns none
193// *
194// * @remarks none
195// *
196// *******************************************************************************
197// */
198// void ih264_iquant_itrans_recon_chroma_4x4_dc(WORD16 *pi2_src,
199//                                             UWORD8 *pu1_pred,
200//                                             UWORD8 *pu1_out,
201//                                             WORD32 pred_strd,
202//                                             WORD32 out_strd,
203//                                             const UWORD16 *pu2_iscal_mat,
204//                                             const UWORD16 *pu2_weigh_mat,
205//                                             UWORD32 u4_qp_div_6,
206//                                             WORD16 *pi2_tmp,
207//                                             WORD16 *pi2_dc_src)
208// Register Usage
209// x0 : pi2_src
210// x1 : pu1_pred
211// x2 : pu1_out
212// x3 : pred_strd
213// x4 : out_strd
214// x5 : pu2_iscal_mat
215// x6 : pu2_weigh_mat
216// x7 : u4_qp_div_6
// stack    : pi2_tmp (not read by the function)
// [sp, #8] : pi2_dc_src
// Neon registers v0-v3, v11, v12 and v31 are used
// d11/d12 are callee-saved under AAPCS64, so the body is bracketed by
// push_v_regs / pop_v_regs
221
222
223    .global ih264_iquant_itrans_recon_chroma_4x4_dc_av8
224ih264_iquant_itrans_recon_chroma_4x4_dc_av8:
225
226    ldr       x0, [sp, #8]
227    push_v_regs
228    ld1       {v0.h}[0], [x0]
229    dup       v0.8h, v0.h[0]
230    srshr     v0.8h, v0.8h, #6
231
232
233    //backup pu1_out
234    mov       x0, x2
235
236    //nop       v3.16b                            //dummy for deinterleaving
237    movi      v31.8h, #0x00ff           //mask for interleaving [copy lower 8 bits]
238
239    ld1       {v1.d}[0], [x1], x3
240    ld1       {v1.d}[1], [x1], x3
241    ld1       {v2.d}[0], [x1], x3
242    ld1       {v2.d}[1], [x1], x3
243
244    ld1       {v11.d}[0], [x2], x4      //load pu1_out for interleaving
245    ld1       {v11.d}[1], [x2], x4
246    ld1       {v12.d}[0], [x2], x4
247    ld1       {v12.d}[1], [x2]
248
249    uzp1      v1.16b, v1.16b, v3.16b
250    uzp1      v2.16b, v2.16b, v3.16b
251
252    uaddw     v1.8h, v0.8h, v1.8b
253    uaddw     v2.8h, v0.8h, v2.8b
254
255    sqxtun    v1.8b, v1.8h
256    sqxtun    v2.8b, v2.8h
257
258    uxtl      v1.8h, v1.8b
259    uxtl      v2.8h, v2.8b
260
261    bit       v11.16b, v1.16b, v31.16b
262    bit       v12.16b, v2.16b, v31.16b
263
264    st1       {v11.d}[0], [x0], x4
265    st1       {v11.d}[1], [x0], x4
266    st1       {v12.d}[0], [x0], x4
267    st1       {v12.d}[1], [x0]
268    pop_v_regs
269    ret
270
271///*
272// *******************************************************************************
273// *
274// * //brief
// *  This function performs inverse quant and Inverse transform type Ci8 for 8*8 block
276// *   [Only for Dc coeff]
277// * //par Description:
278// *  Performs inverse transform Ci8 and adds the residue to get the
279// *  reconstructed block
280// *
// * //param[in] pi2_src
// *  Input 8x8 coefficients
// *
// * //param[in] pu1_pred
// *  Prediction 8x8 block
// *
// * //param[out] pu1_out
// *  Output 8x8 block
289// *
290// * //param[in] u4_qp_div_6
291// *     QP
292// *
293// * //param[in] pu2_weigh_mat
294// * Pointer to weight matrix
295// *
296// * //param[in] pred_strd,
297// *  Prediction stride
298// *
299// * //param[in] out_strd
300// *  Output Stride
301// *
302// *//param[in] pi2_tmp
303// * temporary buffer of size 1*64
304// *
305// * //param[in] pu2_iscal_mat
306// * Pointer to the inverse quantization matrix
307// *
308// * //returns  Void
309// *
310// * //remarks
311// *  None
312// *
313// *******************************************************************************
314// */
315//void ih264_iquant_itrans_recon_dc_8x8(WORD16 *pi2_src,
316//                                   UWORD8 *pu1_pred,
317//                                   UWORD8 *pu1_out,
318//                                   WORD32 pred_strd,
319//                                   WORD32 out_strd,
320//                                   const UWORD16 *pu2_iscal_mat,
321//                                   const UWORD16 *pu2_weigh_mat,
322//                                   UWORD32 u4_qp_div_6,
323//                                   WORD32 *pi4_tmp,
//                                   WORD32 iq_start_idx,
325//                                   WORD16 *pi2_dc_ld_addr)
326//**************Variables Vs Registers*****************************************
327//x0       => *pi2_src
328//x1       => *pu1_pred
329//x2       => *pu1_out
330//x3       =>  pred_strd
331//x4       =>  out_strd
332//x5       =>  *pu2_iscal_mat
333//x6       =>  *pu2_weigh_mat
334//x7       =>  u4_qp_div_6
335//NOT USED =>  pi4_tmp
336//NOT USED =>  iq_start_idx
337//NOT USED =>  pi2_dc_ld_addr
338
339    .global ih264_iquant_itrans_recon_8x8_dc_av8
340ih264_iquant_itrans_recon_8x8_dc_av8:
341
342    push_v_regs
343
344    ld1       {v1.h}[0], [x5]
345    ld1       {v2.h}[0], [x6]
346    ld1       {v0.h}[0], [x0]
347    dup       v3.4s, w7
348
349
350    mul       v1.8h, v1.8h, v2.8h
351    smull     v0.4s, v0.4h, v1.4h
352    sshl      v0.4s, v0.4s, v3.4s
353
354    sqrshrn   v0.4h, v0.4s, #6
355    srshr     v0.8h, v0.8h, #6
356    dup       v0.8h, v0.h[0]
357
358    ld1       {v22.8b}, [x1], x3
359    ld1       {v23.8b}, [x1], x3
360    ld1       {v24.8b}, [x1], x3
361    ld1       {v25.8b}, [x1], x3
362    ld1       {v26.8b}, [x1], x3
363    ld1       {v27.8b}, [x1], x3
364    ld1       {v28.8b}, [x1], x3
365    ld1       {v29.8b}, [x1]
366
367    uaddw     v1.8h, v0.8h, v22.8b
368    uaddw     v2.8h, v0.8h, v23.8b
369    uaddw     v3.8h, v0.8h, v24.8b
370    uaddw     v8.8h, v0.8h, v25.8b
371    uaddw     v9.8h, v0.8h, v26.8b
372    uaddw     v10.8h, v0.8h, v27.8b
373    uaddw     v11.8h, v0.8h, v28.8b
374    uaddw     v12.8h, v0.8h, v29.8b
375
376    sqxtun    v1.8b, v1.8h
377    sqxtun    v2.8b, v2.8h
378    sqxtun    v3.8b, v3.8h
379    sqxtun    v8.8b, v8.8h
380    sqxtun    v9.8b, v9.8h
381    sqxtun    v10.8b, v10.8h
382    sqxtun    v11.8b, v11.8h
383    sqxtun    v12.8b, v12.8h
384
385    st1       {v1.8b}, [x2], x4
386    st1       {v2.8b}, [x2], x4
387    st1       {v3.8b}, [x2], x4
388    st1       {v8.8b}, [x2], x4
389    st1       {v9.8b}, [x2], x4
390    st1       {v10.8b}, [x2], x4
391    st1       {v11.8b}, [x2], x4
392    st1       {v12.8b}, [x2]
393
394    pop_v_regs
395    ret
396
397
398