1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20///**
21// *******************************************************************************
22// * @file
23// *  ih264_iquant_itrans_recon_dc_av8.s
24// *
25// * @brief
26// *  Contains function definitions for single stage  inverse transform
27// *
28// * @author
29// *  Mohit
30// *
31// * @par List of Functions:
32// *  - ih264_iquant_itrans_recon_4x4_dc_av8()
33// *     - ih264_iquant_itrans_recon_8x8_dc_av8()
34// *  - ih264_iquant_itrans_recon_chroma_4x4_dc_av8()
35// *
36// * @remarks
37// *  None
38// *
39// *******************************************************************************
40//*/
41
42
43.include "ih264_neon_macros.s"
44
45
46///**
47// *******************************************************************************
48// *
49// * @brief
50// *  This function performs inverse quant and Inverse transform type Ci4 for 4*4 block
51// *     for dc input pattern only, i.e. only the (0,0) element of the input 4x4 block is
52// *  non-zero. For complete function, refer ih264_iquant_itrans_recon_a9.s
53// *
54// * @par Description:
55// *  Performs inverse transform Ci4 and adds the residue to get the
56// *  reconstructed block
57// *
58// * @param[in] pi2_src
59// *  Input 4x4 coefficients
60// *
61// * @param[in] pu1_pred
62// *  Prediction 4x4 block
63// *
64// * @param[out] pu1_out
65// *  Output 4x4 block
66// *
67// * @param[in] u4_qp_div_6
68// *     QP
69// *
70// * @param[in] pu2_weigh_mat
71// * Pointer to weight matrix
72// *
73// * @param[in] pred_strd,
74// *  Prediction stride
75// *
76// * @param[in] out_strd
77// *  Output Stride
78// *
79// *@param[in] pi2_tmp
80// * temporary buffer of size 1*16
81// *
82// * @param[in] pu2_iscal_mat
83// * Pointer to the inverse quantization matrix
84// *
85// * @returns  Void
86// *
87// * @remarks
88// *  None
89// *
90// *******************************************************************************
91// */
92//void ih264_iquant_itrans_recon_4x4_dc(WORD16 *pi2_src,
93//                                    UWORD8 *pu1_pred,
94//                                    UWORD8 *pu1_out,
95//                                    WORD32 pred_strd,
96//                                    WORD32 out_strd,
97//                                    const UWORD16 *pu2_iscal_mat,
98//                                    const UWORD16 *pu2_weigh_mat,
99//                                    UWORD32 u4_qp_div_6,
100//                                    WORD32 *pi4_tmp,
//                                    WORD32 iq_start_idx,
102//                                   WORD16 *pi2_dc_ld_addr)
103//**************Variables Vs Registers*****************************************
104//x0 => *pi2_src
105//x1 => *pu1_pred
106//x2 => *pu1_out
107//w3 =>  pred_strd
108//w4 =>  out_strd
109//x5 => *pu2_iscal_mat
110//x6 => *pu2_weigh_mat
111//w7 =>  u4_qp_div_6
112//   =>  pi4_tmp
113//   =>  iq_start_idx
114//   =>  pi2_dc_ld_addr
115
116.text
117.p2align 2
118
119    .global ih264_iquant_itrans_recon_4x4_dc_av8
120ih264_iquant_itrans_recon_4x4_dc_av8:
121
122    sxtw      x3, w3
123    sxtw      x4, w4
124    ldr       w8, [sp, #8]              //Loads iq_start_idx
125    subs      w8, w8, #1                // if x8 == 1 => intra case , so result of subtraction is zero and z flag is set
126
127    ldr       x10, [sp, #16]            //Load alternate dc address
128    push_v_regs
129    dup       v30.4s, w7                //Populate the u4_qp_div_6 in Q15
130
131
132    bne       donot_use_pi2_dc_ld_addr_luma_dc
133    ld1       {v0.h}[0], [x10]
134donot_use_pi2_dc_ld_addr_luma_dc:
135
136    beq       donot_use_pi2_src_luma_dc
137    ld1       {v0.h}[0], [x5]
138    ld1       {v1.h}[0], [x6]
139    ld1       {v2.h}[0], [x0]
140    mul       v0.4h, v1.4h, v0.4h
141    smull     v0.4s, v0.4h, v2.4h
142    sshl      v0.4s, v0.4s, v30.4s
143    sqrshrn   v0.4h, v0.4s, #4
144donot_use_pi2_src_luma_dc:
145
146
147    dup       v0.8h, v0.h[0]
148    srshr     v0.8h, v0.8h, #6
149
150    ld1       {v1.s}[0], [x1], x3
151    ld1       {v1.s}[1], [x1], x3
152    ld1       {v2.s}[0], [x1], x3
153    ld1       {v2.s}[1], [x1]
154
155    uxtl      v1.8h, v1.8b
156    uxtl      v2.8h, v2.8b
157
158    add       v1.8h, v0.8h, v1.8h
159    add       v2.8h, v0.8h, v2.8h
160
161    sqxtun    v1.8b, v1.8h
162    sqxtun    v2.8b, v2.8h
163
164    st1       {v1.s}[0], [x2], x4
165    st1       {v1.s}[1], [x2], x4
166    st1       {v2.s}[0], [x2], x4
167    st1       {v2.s}[1], [x2]
168    pop_v_regs
169    ret
170
171// /*
172// ********************************************************************************
173// *
// * @brief This function reconstructs a 4x4 sub block from quantized residue and
175// * prediction buffer if only dc value is present for residue
176// *
177// * @par Description:
178// *  The quantized residue is first inverse quantized,
179// *  This inverse quantized content is added to the prediction buffer to recon-
180// *  struct the end output
181// *
182// * @param[in] pi2_src
// *  quantized dc coefficient
184// *
185// * @param[in] pu1_pred
186// *  prediction 4x4 block in interleaved format
187// *
188// * @param[in] pred_strd,
189// *  Prediction buffer stride in interleaved format
190// *
191// * @param[in] out_strd
192// *  recon buffer Stride
193// *
194// * @returns none
195// *
196// * @remarks none
197// *
198// *******************************************************************************
199// */
200// void ih264_iquant_itrans_recon_chroma_4x4_dc(WORD16 *pi2_src,
201//                                             UWORD8 *pu1_pred,
202//                                             UWORD8 *pu1_out,
203//                                             WORD32 pred_strd,
204//                                             WORD32 out_strd,
205//                                             const UWORD16 *pu2_iscal_mat,
206//                                             const UWORD16 *pu2_weigh_mat,
207//                                             UWORD32 u4_qp_div_6,
208//                                             WORD16 *pi2_tmp,
209//                                             WORD16 *pi2_dc_src)
// Register Usage
// x0       : pi2_src          (not read)
// x1       : pu1_pred
// x2       : pu1_out
// w3       : pred_strd
// w4       : out_strd
// x5       : pu2_iscal_mat    (not read)
// x6       : pu2_weigh_mat    (not read)
// w7       : u4_qp_div_6      (not read)
// [sp, #0] : pi2_tmp          (not read)
// [sp, #8] : pi2_dc_src
// Scratch registers written: x1-x4, x9, x10, v0-v2, v16-v18.
// All of them are caller-saved under AAPCS64, so no arm or neon registers
// are pushed.
// NOTE(review): the previous version used v11/v12 and therefore wrapped the
// body in push_v_regs/pop_v_regs (ih264_neon_macros.s, not visible here);
// those macros are presumed to only save/restore SIMD registers - confirm.


    .global ih264_iquant_itrans_recon_chroma_4x4_dc_av8
ih264_iquant_itrans_recon_chroma_4x4_dc_av8:

    ldr       x9, [sp, #8]              // x9 = pi2_dc_src
    sxtw      x3, w3                    // widen the 32-bit strides for post-index addressing
    sxtw      x4, w4

    ld1r      {v0.8h}, [x9]             // broadcast the 16-bit dc value to all lanes
    srshr     v0.8h, v0.8h, #6          // residual = (dc + 32) >> 6

    mov       x10, x2                   // keep pu1_out for the stores; x2 walks the loads

    movi      v18.8h, #0x00ff           // mask selecting the even byte of every byte pair

    // Prediction rows: 8 interleaved bytes each, two rows per vector
    ld1       {v1.d}[0], [x1], x3
    ld1       {v1.d}[1], [x1], x3
    ld1       {v2.d}[0], [x1], x3
    ld1       {v2.d}[1], [x1]

    // Current output rows; their odd bytes must be written back unchanged
    ld1       {v16.d}[0], [x2], x4
    ld1       {v16.d}[1], [x2], x4
    ld1       {v17.d}[0], [x2], x4
    ld1       {v17.d}[1], [x2]

    // Deinterleave: keep the even bytes (low byte of each 16-bit lane).
    // xtn selects the same bytes the former uzp1 placed in the low half,
    // without reading a second, uninitialised source register.
    xtn       v1.8b, v1.8h
    xtn       v2.8b, v2.8h

    uaddw     v1.8h, v0.8h, v1.8b       // residual + zero-extended prediction
    uaddw     v2.8h, v0.8h, v2.8b

    sqxtun    v1.8b, v1.8h              // clamp to [0, 255]
    sqxtun    v2.8b, v2.8h

    uxtl      v1.8h, v1.8b              // move each result back to an even byte position
    uxtl      v2.8h, v2.8b

    bit       v16.16b, v1.16b, v18.16b  // insert results where the mask is set, keep the rest
    bit       v17.16b, v2.16b, v18.16b

    st1       {v16.d}[0], [x10], x4
    st1       {v16.d}[1], [x10], x4
    st1       {v17.d}[0], [x10], x4
    st1       {v17.d}[1], [x10]
    ret
274
///*
// *******************************************************************************
// *
// * @brief
// *  This function performs inverse quant and Inverse transform type Ci8 for 8*8 block
// *   [Only for Dc coeff]
// * @par Description:
// *  Performs inverse transform Ci8 and adds the residue to get the
// *  reconstructed block
// *
// * @param[in] pi2_src
// *  Input 8x8 coefficients
// *
// * @param[in] pu1_pred
// *  Prediction 8x8 block
// *
// * @param[out] pu1_out
// *  Output 8x8 block
// *
// * @param[in] u4_qp_div_6
// *     QP
// *
// * @param[in] pu2_weigh_mat
// * Pointer to weight matrix
// *
// * @param[in] pred_strd,
// *  Prediction stride
// *
// * @param[in] out_strd
// *  Output Stride
// *
// * @param[in] pi2_tmp
// * temporary buffer of size 1*64
// *
// * @param[in] pu2_iscal_mat
// * Pointer to the inverse quantization matrix
// *
// * @returns  Void
// *
// * @remarks
// *  None
// *
// *******************************************************************************
// */
319//void ih264_iquant_itrans_recon_dc_8x8(WORD16 *pi2_src,
320//                                   UWORD8 *pu1_pred,
321//                                   UWORD8 *pu1_out,
322//                                   WORD32 pred_strd,
323//                                   WORD32 out_strd,
324//                                   const UWORD16 *pu2_iscal_mat,
325//                                   const UWORD16 *pu2_weigh_mat,
326//                                   UWORD32 u4_qp_div_6,
327//                                   WORD32 *pi4_tmp,
//                                   WORD32 iq_start_idx,
329//                                   WORD16 *pi2_dc_ld_addr)
330//**************Variables Vs Registers*****************************************
331//x0       => *pi2_src
332//x1       => *pu1_pred
333//x2       => *pu1_out
334//w3       =>  pred_strd
335//w4       =>  out_strd
336//x5       =>  *pu2_iscal_mat
337//x6       =>  *pu2_weigh_mat
338//w7       =>  u4_qp_div_6
339//NOT USED =>  pi4_tmp
340//NOT USED =>  iq_start_idx
341//NOT USED =>  pi2_dc_ld_addr
342
343    .global ih264_iquant_itrans_recon_8x8_dc_av8
344ih264_iquant_itrans_recon_8x8_dc_av8:
345
346    push_v_regs
347    sxtw      x3, w3
348    sxtw      x4, w4
349
350    ld1       {v1.h}[0], [x5]
351    ld1       {v2.h}[0], [x6]
352    ld1       {v0.h}[0], [x0]
353    dup       v3.4s, w7
354
355
356    mul       v1.8h, v1.8h, v2.8h
357    smull     v0.4s, v0.4h, v1.4h
358    sshl      v0.4s, v0.4s, v3.4s
359
360    sqrshrn   v0.4h, v0.4s, #6
361    srshr     v0.8h, v0.8h, #6
362    dup       v0.8h, v0.h[0]
363
364    ld1       {v22.8b}, [x1], x3
365    ld1       {v23.8b}, [x1], x3
366    ld1       {v24.8b}, [x1], x3
367    ld1       {v25.8b}, [x1], x3
368    ld1       {v26.8b}, [x1], x3
369    ld1       {v27.8b}, [x1], x3
370    ld1       {v28.8b}, [x1], x3
371    ld1       {v29.8b}, [x1]
372
373    uaddw     v1.8h, v0.8h, v22.8b
374    uaddw     v2.8h, v0.8h, v23.8b
375    uaddw     v3.8h, v0.8h, v24.8b
376    uaddw     v8.8h, v0.8h, v25.8b
377    uaddw     v9.8h, v0.8h, v26.8b
378    uaddw     v10.8h, v0.8h, v27.8b
379    uaddw     v11.8h, v0.8h, v28.8b
380    uaddw     v12.8h, v0.8h, v29.8b
381
382    sqxtun    v1.8b, v1.8h
383    sqxtun    v2.8b, v2.8h
384    sqxtun    v3.8b, v3.8h
385    sqxtun    v8.8b, v8.8h
386    sqxtun    v9.8b, v9.8h
387    sqxtun    v10.8b, v10.8h
388    sqxtun    v11.8b, v11.8h
389    sqxtun    v12.8b, v12.8h
390
391    st1       {v1.8b}, [x2], x4
392    st1       {v2.8b}, [x2], x4
393    st1       {v3.8b}, [x2], x4
394    st1       {v8.8b}, [x2], x4
395    st1       {v9.8b}, [x2], x4
396    st1       {v10.8b}, [x2], x4
397    st1       {v11.8b}, [x2], x4
398    st1       {v12.8b}, [x2]
399
400    pop_v_regs
401    ret
402
403
404