1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20///*****************************************************************************/
21///**
22//*******************************************************************************
23//* @file
24//*  ih264_resi_trans_quant_av8.c
25//*
26//* @brief
27//*  contains function definitions for residual and forward trans
28//*
29//* @author
30//*  ittiam
31//*
32//* @par list of functions:
33//*    ih264_resi_trans_quant_4x4_av8
34//*    ih264_resi_trans_quant_8x8_av8
35//*    ih264_resi_trans_quant_chroma_4x4_av8
36//* @remarks
37//*  none
38//*
39//*******************************************************************************
40.include "ih264_neon_macros.s"
41.text
42.p2align 2
43//*****************************************************************************
44//*
45//* function name     : ih264_resi_trans_quant_4x4
46//* description       : this function does cf4 of h264
47//*
//* arguments         :   x0 :pointer to src buffer
//                        x1 :pointer to pred buffer
//                        x2 :pointer to dst buffer
//                        x3 :source stride
//                        x4 :pred stride,
//                        x5 :pointer to scaling matrix,
//                        x6 :pointer to threshold matrix (not read by this routine),
//                        x7 :qbits,
//                        stack   rounding factor,
//                                pointer to store nnz
//                                pointer to store non quantized dc value
60// values returned   : none
61//
62// register usage    :
63// stack usage       : 64 bytes
64// cycles            :
65// interruptiaility  : interruptable
66//
67// known limitations
68//   \assumptions    :
69//
70// revision history  :
71//         dd mm yyyy    author(s)   changes
72//         1 12 2013    100633      first version
73//         20 1 2014    100633      changes the api, optimization
74//
75//*****************************************************************************
76
77    .global ih264_resi_trans_quant_4x4_av8
78ih264_resi_trans_quant_4x4_av8:
79
80    //x0     :pointer to src buffer
81    //x1     :pointer to pred buffer
82    //x2     :pointer to dst buffer
83    //x3     :source stride
84    //x4     :pred stride
85    //x5     :dst stride,
86    //x6     :scale matirx,
87    //x7     :threshold matrix
88    //       :qbits
89    //       :round factor
90    //       :nnz
91    //       :pointer to store non quantized dc value
92    push_v_regs
93    //x0     :pointer to src buffer
94    //x1     :pointer to pred buffer
95    //x2     :pointer to dst buffer
96    //x3     :source stride
97    //x4     :pred stride
98    //x5     :scale matirx,
99    //x6     :threshold matrix
100    //x7     :qbits
101    //x8        :round factor
102    //x9        :nnz
103    //x10       :pointer to store non quantized dc value
104
105    ldr       w8, [sp, #64]             //load round factor
106    ldr       x10, [sp, #80]            //load addres for non quant val
107    neg       x7, x7                    //negate the qbit value for usiing lsl
108    ldr       x9, [sp, #72]
109
110    //------------fucntion loading done----------------;
111
112    ld1       {v30.8b}, [x0], x3        //load first 8 pix src  row 1
113    ld1       {v31.8b}, [x1], x4        //load first 8 pix pred row 1
114    ld1       {v28.8b}, [x0], x3        //load first 8 pix src  row 2
115    ld1       {v29.8b}, [x1], x4        //load first 8 pix pred row 2
116    ld1       {v26.8b}, [x0], x3        //load first 8 pix src  row 3
117    ld1       {v27.8b}, [x1], x4        //load first 8 pix pred row 3
118    ld1       {v24.8b}, [x0]            //load first 8 pix src row 4
119    ld1       {v25.8b}, [x1]            //load first 8 pix pred row 4
120
121    usubl     v0.8h, v30.8b, v31.8b     //find residue row 1
122    usubl     v2.8h, v28.8b, v29.8b     //find residue row 2
123    usubl     v4.8h, v26.8b, v27.8b     //find residue row 3
124    usubl     v6.8h, v24.8b, v25.8b     //find residue row 4
125
126    trn1      v1.4h, v0.4h, v2.4h
127    trn2      v3.4h, v0.4h, v2.4h       //t12
128    trn1      v5.4h, v4.4h, v6.4h
129    trn2      v7.4h, v4.4h, v6.4h       //t23
130
131    trn1      v0.2s, v1.2s, v5.2s
132    trn2      v4.2s, v1.2s, v5.2s       //t13
133    trn1      v2.2s, v3.2s, v7.2s
134    trn2      v6.2s, v3.2s, v7.2s       //t14
135
136    add       v8.4h, v0.4h, v6.4h       //x0 = x4+x7
137    add       v9.4h, v2.4h, v4.4h       //x1 = x5+x6
138    sub       v10.4h, v2.4h, v4.4h      //x2 = x5-x6
139    sub       v11.4h, v0.4h, v6.4h      //x3 = x4-x7
140
141    shl       v12.4h, v10.4h, #1        //u_shift(x2,1,shft)
142    shl       v13.4h, v11.4h, #1        //u_shift(x3,1,shft)
143
144    add       v14.4h, v8.4h, v9.4h      //x4 = x0 + x1;
145    sub       v16.4h, v8.4h, v9.4h      //x6 = x0 - x1;
146    add       v15.4h, v13.4h, v10.4h    //x5 = u_shift(x3,1,shft) + x2;
147    sub       v17.4h, v11.4h, v12.4h    //x7 = x3 - u_shift(x2,1,shft);
148
149    //taking transpose again so as to make do vert transform
150    trn1      v0.4h, v14.4h, v15.4h
151    trn2      v1.4h, v14.4h, v15.4h     //t12
152    trn1      v2.4h, v16.4h, v17.4h
153    trn2      v3.4h, v16.4h, v17.4h     //t23
154
155    trn1      v14.2s, v0.2s, v2.2s
156    trn2      v16.2s, v0.2s, v2.2s      //t13
157    trn1      v15.2s, v1.2s, v3.2s
158    trn2      v17.2s, v1.2s, v3.2s      //t24
159
160    //let us do vertical transform
161    //same code as horiz
162    add       v18.4h, v14.4h , v17.4h   //x0 = x4+x7
163    add       v19.4h, v15.4h , v16.4h   //x1 = x5+x6
164    sub       v20.4h, v15.4h , v16.4h   //x2 = x5-x6
165    sub       v21.4h, v14.4h , v17.4h   //x3 = x4-x7
166
167    shl       v22.4h, v20.4h, #1        //u_shift(x2,1,shft)
168    shl       v23.4h, v21.4h, #1        //u_shift(x3,1,shft)
169
170    dup       v8.4s, w8                 //load rounding value row 1
171
172    add       v24.4h, v18.4h , v19.4h   //x5 = x0 + x1;
173    sub       v26.4h, v18.4h , v19.4h   //x7 = x0 - x1;
174    add       v25.4h, v23.4h , v20.4h   //x6 = u_shift(x3,1,shft) + x2;
175    sub       v27.4h, v21.4h , v22.4h   //x8 = x3 - u_shift(x2,1,shft);
176
177    dup       v23.4s, w8                //load round factor values
178
179    st1       {v24.h}[0], [x10]         //store the dc value to alternate dc sddress
180//core tranform is done for 4x8 block 1
181    ld1       {v28.4h-v31.4h}, [x5]     //load the scaling values
182
183    abs       v0.4h, v24.4h             //abs val of row 1
184    abs       v1.4h, v25.4h             //abs val of row 2
185    abs       v2.4h, v26.4h             //abs val of row 3
186    abs       v3.4h, v27.4h             //abs val of row 4
187
188    cmgt      v4.4h, v24.4h, #0
189    cmgt      v5.4h, v25.4h, #0
190    cmgt      v6.4h, v26.4h, #0
191    cmgt      v7.4h, v27.4h, #0
192
193    smull     v0.4s, v0.4h, v28.4h      //multiply and add row 1
194    smull     v1.4s, v1.4h, v29.4h      //multiply and add row 2
195    smull     v2.4s, v2.4h, v30.4h      //multiply and add row 3
196    smull     v3.4s, v3.4h, v31.4h      //multiply and add row 4
197
198    add       v20.4s, v0.4s, v23.4s
199    add       v21.4s, v1.4s, v23.4s
200    add       v22.4s, v2.4s, v23.4s
201    add       v23.4s, v3.4s, v23.4s
202
203    dup       v24.4s, w7
204
205    sshl      v20.4s, v20.4s, v24.4s    //shift row 1
206    sshl      v21.4s, v21.4s, v24.4s    //shift row 2
207    sshl      v22.4s, v22.4s, v24.4s    //shift row 3
208    sshl      v23.4s, v23.4s, v24.4s    //shift row 4
209
210    xtn       v20.4h, v20.4s            //narrow row 1
211    xtn       v21.4h, v21.4s            //narrow row 2
212    xtn       v22.4h, v22.4s            //narrow row 3
213    xtn       v23.4h, v23.4s            //narrow row 4
214
215    neg       v24.8h, v20.8h            //get negative
216    neg       v25.8h, v21.8h            //get negative
217    neg       v26.8h, v22.8h            //get negative
218    neg       v27.8h, v23.8h            //get negative
219
220    //compare with zero for computng nnz
221    cmeq      v0.4h, v20.4h, #0
222    cmeq      v1.4h, v21.4h, #0
223    cmeq      v2.4h, v22.4h, #0
224    cmeq      v3.4h, v23.4h, #0
225
226    bsl       v4.8b, v20.8b, v24.8b     //restore sign of row 1 and 2
227    bsl       v5.8b, v21.8b, v25.8b     //restore sign of row 3 and 4
228    bsl       v6.8b, v22.8b, v26.8b     //restore sign of row 1 and 2
229    bsl       v7.8b, v23.8b, v27.8b     //restore sign of row 3 and 4
230
231    //narrow the comaprison result
232    mov       v0.d[1], v2.d[0]
233    mov       v1.d[1], v3.d[0]
234
235    xtn       v0.8b, v0.8h
236    xtn       v1.8b, v1.8h
237
238    ushr      v0.8b, v0.8b, #7          //i    reduce comaparison bit to a signle bit row 1 and 2 blk  1 and 2 [ keep the value for later use ]
239    ushr      v1.8b, v1.8b, #7          //i    reduce comaparison bit to a signle bit row 1 and 2 blk  1 and 2 [ keep the value for later use ]
240
241    add       v0.8b, v0.8b, v1.8b       //i pair add nnz 1
242    addp      v0.8b, v0.8b, v0.8b       //i pair add nnz 1
243    addp      v0.8b, v0.8b, v0.8b       //i pair add nnz 1
244    addp      v0.8b, v0.8b, v0.8b       //i pair add nnz 1
245
246    st1       {v4.4h-v7.4h}, [x2]       //store blk
247
248    movi      v25.8b, #16               //get max nnz
249    sub       v26.8b, v25.8b , v0.8b    //invert current nnz
250    st1       {v26.b}[0], [x9]          //write nnz
251
252    pop_v_regs
253    ret
254
255
256//*****************************************************************************
257//*
258//* function name     : ih264_resi_trans_quant_chroma_4x4
259//* description       : this function does residue calculation, forward transform
260//*                        and quantization for 4x4 chroma block.
261//*
//* arguments         :   x0 :pointer to src buffer
//                        x1 :pointer to pred buffer
//                        x2 :pointer to dst buffer
//                        x3 :source stride
//                        x4 :pred stride,
//                        x5 :pointer to scaling matrix,
//                        x6 :pointer to threshold matrix (not read by this routine),
//                        x7 :qbits,
//                        stack     rounding factor,
//                                  pointer to store nnz
//                                  pointer to store unquantized dc values
274// values returned   : none
275//
276// register usage    :
277// stack usage       : 64 bytes
278// cycles            :
279// interruptiaility  : interruptable
280//
281// known limitations
282//   \assumptions    :
283//
284// revision history  :
285//         dd mm yyyy    author(s)   changes
286//         11 2 2015    100664      first version
287//         25 2 2015    100633      first av8 version
288//*****************************************************************************
289
290    .global ih264_resi_trans_quant_chroma_4x4_av8
291ih264_resi_trans_quant_chroma_4x4_av8:
292
293    //x0     :pointer to src buffer
294    //x1     :pointer to pred buffer
295    //x2     :pointer to dst buffer
296    //x3     :source stride
297    //stack     :pred stride
298    //          :scale matirx,
299    //          :threshold matrix
300    //          :qbits
301    //          :round factor
302    //          :nnz
303    //          :pu1_dc_alt_addr
304    push_v_regs
305    //x0     :pointer to src buffer
306    //x1     :pointer to pred buffer
307    //x2     :pointer to dst buffer
308    //x3     :source stride
309    //x4     :pred stride
310    //x5     :scale matirx,
311    //x6     :threshold matrix
312    //x7     :qbits
313    //x8        :round factor
314    //x9        :nnz
315    //x10       :pointer to store non quantized dc value
316
317    ldr       w8, [sp, #64]             //load round factor
318    ldr       x10, [sp, #80]            //load addres for non quant val
319    neg       x7, x7                    //negate the qbit value for usiing lsl
320    ldr       x9, [sp, #72]
321    //------------fucntion loading done----------------;
322
323    ld1       {v30.8b}, [x0], x3        //load first 8 pix src  row 1
324    ld1       {v31.8b}, [x1], x4        //load first 8 pix pred row 1
325    ld1       {v28.8b}, [x0], x3        //load first 8 pix src  row 2
326    ld1       {v29.8b}, [x1], x4        //load first 8 pix pred row 2
327    ld1       {v26.8b}, [x0], x3        //load first 8 pix src  row 3
328    ld1       {v27.8b}, [x1], x4        //load first 8 pix pred row 3
329    ld1       {v24.8b}, [x0]            //load first 8 pix src row 4
330    ld1       {v25.8b}, [x1]            //load first 8 pix pred row 4
331
332
333    //deinterleave the loaded values
334    uzp1      v30.8b, v30.8b, v30.8b
335    uzp1      v31.8b, v31.8b, v31.8b
336    uzp1      v28.8b, v28.8b, v28.8b
337    uzp1      v29.8b, v29.8b, v29.8b
338    uzp1      v26.8b, v26.8b, v26.8b
339    uzp1      v27.8b, v27.8b, v27.8b
340    uzp1      v24.8b, v24.8b, v24.8b
341    uzp1      v25.8b, v25.8b, v25.8b
342    //this deinterleaving is the only differnece betweenchrom and luma fucntions
343
344    usubl     v0.8h, v30.8b, v31.8b     //find residue row 1
345    usubl     v2.8h, v28.8b, v29.8b     //find residue row 2
346    usubl     v4.8h, v26.8b, v27.8b     //find residue row 3
347    usubl     v6.8h, v24.8b, v25.8b     //find residue row 4
348
349    trn1      v1.4h, v0.4h, v2.4h
350    trn2      v3.4h, v0.4h, v2.4h       //t12
351    trn1      v5.4h, v4.4h, v6.4h
352    trn2      v7.4h, v4.4h, v6.4h       //t23
353
354    trn1      v0.2s, v1.2s, v5.2s
355    trn2      v4.2s, v1.2s, v5.2s       //t13
356    trn1      v2.2s, v3.2s, v7.2s
357    trn2      v6.2s, v3.2s, v7.2s       //t14
358
359    add       v8.4h, v0.4h, v6.4h       //x0 = x4+x7
360    add       v9.4h, v2.4h, v4.4h       //x1 = x5+x6
361    sub       v10.4h, v2.4h, v4.4h      //x2 = x5-x6
362    sub       v11.4h, v0.4h, v6.4h      //x3 = x4-x7
363
364    shl       v12.4h, v10.4h, #1        //u_shift(x2,1,shft)
365    shl       v13.4h, v11.4h, #1        //u_shift(x3,1,shft)
366
367    add       v14.4h, v8.4h, v9.4h      //x4 = x0 + x1;
368    sub       v16.4h, v8.4h, v9.4h      //x6 = x0 - x1;
369    add       v15.4h, v13.4h, v10.4h    //x5 = u_shift(x3,1,shft) + x2;
370    sub       v17.4h, v11.4h, v12.4h    //x7 = x3 - u_shift(x2,1,shft);
371
372    //taking transpose again so as to make do vert transform
373    trn1      v0.4h, v14.4h, v15.4h
374    trn2      v1.4h, v14.4h, v15.4h     //t12
375    trn1      v2.4h, v16.4h, v17.4h
376    trn2      v3.4h, v16.4h, v17.4h     //t23
377
378    trn1      v14.2s, v0.2s, v2.2s
379    trn2      v16.2s, v0.2s, v2.2s      //t13
380    trn1      v15.2s, v1.2s, v3.2s
381    trn2      v17.2s, v1.2s, v3.2s      //t24
382
383    //let us do vertical transform
384    //same code as horiz
385    add       v18.4h, v14.4h , v17.4h   //x0 = x4+x7
386    add       v19.4h, v15.4h , v16.4h   //x1 = x5+x6
387    sub       v20.4h, v15.4h , v16.4h   //x2 = x5-x6
388    sub       v21.4h, v14.4h , v17.4h   //x3 = x4-x7
389
390    shl       v22.4h, v20.4h, #1        //u_shift(x2,1,shft)
391    shl       v23.4h, v21.4h, #1        //u_shift(x3,1,shft)
392
393    dup       v8.4s, w8                 //load rounding value row 1
394
395    add       v24.4h, v18.4h , v19.4h   //x5 = x0 + x1;
396    sub       v26.4h, v18.4h , v19.4h   //x7 = x0 - x1;
397    add       v25.4h, v23.4h , v20.4h   //x6 = u_shift(x3,1,shft) + x2;
398    sub       v27.4h, v21.4h , v22.4h   //x8 = x3 - u_shift(x2,1,shft);
399
400    dup       v23.4s, w8                //load round factor values
401
402    st1       {v24.h}[0], [x10]         //store the dc value to alternate dc sddress
403//core tranform is done for 4x8 block 1
404    ld1       {v28.4h-v31.4h}, [x5]     //load the scaling values
405
406    abs       v0.4h, v24.4h             //abs val of row 1
407    abs       v1.4h, v25.4h             //abs val of row 2
408    abs       v2.4h, v26.4h             //abs val of row 3
409    abs       v3.4h, v27.4h             //abs val of row 4
410
411    cmgt      v4.4h, v24.4h, #0
412    cmgt      v5.4h, v25.4h, #0
413    cmgt      v6.4h, v26.4h, #0
414    cmgt      v7.4h, v27.4h, #0
415
416    smull     v0.4s, v0.4h, v28.4h      //multiply and add row 1
417    smull     v1.4s, v1.4h, v29.4h      //multiply and add row 2
418    smull     v2.4s, v2.4h, v30.4h      //multiply and add row 3
419    smull     v3.4s, v3.4h, v31.4h      //multiply and add row 4
420
421    add       v20.4s, v0.4s, v23.4s
422    add       v21.4s, v1.4s, v23.4s
423    add       v22.4s, v2.4s, v23.4s
424    add       v23.4s, v3.4s, v23.4s
425
426    dup       v24.4s, w7
427
428    sshl      v20.4s, v20.4s, v24.4s    //shift row 1
429    sshl      v21.4s, v21.4s, v24.4s    //shift row 2
430    sshl      v22.4s, v22.4s, v24.4s    //shift row 3
431    sshl      v23.4s, v23.4s, v24.4s    //shift row 4
432
433    xtn       v20.4h, v20.4s            //narrow row 1
434    xtn       v21.4h, v21.4s            //narrow row 2
435    xtn       v22.4h, v22.4s            //narrow row 3
436    xtn       v23.4h, v23.4s            //narrow row 4
437
438    neg       v24.8h, v20.8h            //get negative
439    neg       v25.8h, v21.8h            //get negative
440    neg       v26.8h, v22.8h            //get negative
441    neg       v27.8h, v23.8h            //get negative
442
443    //compare with zero for computng nnz
444    cmeq      v0.4h, v20.4h, #0
445    cmeq      v1.4h, v21.4h, #0
446    cmeq      v2.4h, v22.4h, #0
447    cmeq      v3.4h, v23.4h, #0
448
449    bsl       v4.8b, v20.8b, v24.8b     //restore sign of row 1 and 2
450    bsl       v5.8b, v21.8b, v25.8b     //restore sign of row 3 and 4
451    bsl       v6.8b, v22.8b, v26.8b     //restore sign of row 1 and 2
452    bsl       v7.8b, v23.8b, v27.8b     //restore sign of row 3 and 4
453
454    //narrow the comaprison result
455    mov       v0.d[1], v2.d[0]
456    mov       v1.d[1], v3.d[0]
457
458    xtn       v0.8b, v0.8h
459    xtn       v1.8b, v1.8h
460
461    ushr      v0.8b, v0.8b, #7          //i    reduce comaparison bit to a signle bit row 1 and 2 blk  1 and 2 [ keep the value for later use ]
462    ushr      v1.8b, v1.8b, #7          //i    reduce comaparison bit to a signle bit row 1 and 2 blk  1 and 2 [ keep the value for later use ]
463
464    add       v0.8b, v0.8b, v1.8b       //i pair add nnz 1
465    addp      v0.8b, v0.8b, v0.8b       //i pair add nnz 1
466    addp      v0.8b, v0.8b, v0.8b       //i pair add nnz 1
467    addp      v0.8b, v0.8b, v0.8b       //i pair add nnz 1
468
469    st1       {v4.4h-v7.4h}, [x2]       //store blk
470
471    movi      v25.8b, #16               //get max nnz
472    sub       v26.8b, v25.8b , v0.8b    //invert current nnz
473    st1       {v26.b}[0], [x9]          //write nnz
474
475    pop_v_regs
476    ret
477
478
479//*****************************************************************************
480//*
481//* function name     : ih264_hadamard_quant_4x4_av8
482//* description       : this function does forward hadamard transform and
483//*                     quantization for luma dc block
484//*
//* arguments         :  x0 :pointer to src buffer
//                       x1 :pointer to dst buffer
//                       x2 :pu2_scale_matrix
//                       x3 :pu2_threshold_matrix
//                       x4 :u4_qbits
//                       x5 :u4_round_factor
//                       x6 :pu1_nnz
492// values returned   : none
493//
494// register usage    :
495// stack usage       : 0 bytes
496// cycles            : around
497// interruptiaility  : interruptable
498//
499// known limitations
500//   \assumptions    :
501//
502// revision history  :
503//         dd mm yyyy    author(s)   changes
504//         20 2 2015    100633      first version
505//
506//*****************************************************************************
507//ih264_hadamard_quant_4x4_av8(word16 *pi2_src, word16 *pi2_dst,
508//                           const uword16 *pu2_scale_matrix,
509//                           const uword16 *pu2_threshold_matrix, uword32 u4_qbits,
510//                           uword32 u4_round_factor,uword8  *pu1_nnz
511//                           )
512    .global ih264_hadamard_quant_4x4_av8
513ih264_hadamard_quant_4x4_av8:
514
515//x0 :pointer to src buffer
516//x1 :pointer to dst buffer
517//x2 :pu2_scale_matrix
518//x3 :pu2_threshold_matrix
519//x4 :u4_qbits
520//x5 :u4_round_factor
521//x6 :pu1_nnz
522
523    push_v_regs
524
525    ld4       {v0.4h-v3.4h}, [x0]       //load 4x4 block
526    ld1       {v30.h}[0], [x2]          //load pu2_scale_matrix[0]
527
528    saddl     v4.4s, v0.4h, v3.4h       //x0 = x4 + x7;
529    saddl     v5.4s, v1.4h, v2.4h       //x1 = x5 + x6;
530    ssubl     v6.4s, v1.4h, v2.4h       //x2 = x5 - x6;
531    ssubl     v7.4s, v0.4h, v3.4h       //x3 = x4 - x7;
532
533    dup       v30.8h, v30.h[0]          //pu2_scale_matrix[0]
534
535    add       v14.4s, v4.4s, v5.4s      //pi2_dst[0] = x0 + x1;
536    add       v15.4s, v7.4s, v6.4s      //pi2_dst[1] = x3 + x2;
537    sub       v16.4s, v4.4s, v5.4s      //pi2_dst[2] = x0 - x1;
538    sub       v17.4s, v7.4s, v6.4s      //pi2_dst[3] = x3 - x2;
539
540    //transpose 4x4 block
541    trn1      v18.4s, v14.4s, v15.4s
542    trn2      v19.4s, v14.4s, v15.4s
543    trn1      v20.4s, v16.4s, v17.4s
544    trn2      v21.4s, v16.4s, v17.4s
545
546    trn1      v14.2d, v18.2d, v20.2d
547    trn2      v16.2d, v18.2d, v20.2d
548    trn1      v15.2d, v19.2d, v21.2d
549    trn2      v17.2d, v19.2d, v21.2d
550    //end transpose
551
552    add       v18.4s, v14.4s, v17.4s    //x0 = x4 + x7;
553    add       v19.4s, v15.4s, v16.4s    //x1 = x5 + x6;
554    sub       v20.4s, v15.4s, v16.4s    //x2 = x5 - x6;
555    sub       v21.4s, v14.4s, v17.4s    //x3 = x4 - x7;
556
557    dup       v14.4s, w5                //round factor
558    dup       v15.4s, v14.s[0]
559    dup       v16.4s, v14.s[0]
560    dup       v17.4s, v14.s[0]
561
562    add       v22.4s, v18.4s, v19.4s    //(x0 + x1)
563    add       v23.4s, v21.4s, v20.4s    //(x3 + x2)
564    sub       v24.4s, v18.4s, v19.4s    //(x0 - x1)
565    sub       v25.4s, v21.4s, v20.4s    //(x3 - x2)
566
567    shrn      v0.4h, v22.4s, #1         //i4_value = (x0 + x1) >> 1;
568    shrn2     v0.8h, v23.4s, #1         //i4_value = (x3 + x2) >> 1;
569    shrn      v1.4h, v24.4s, #1         //i4_value = (x0 - x1) >> 1;
570    shrn2     v1.8h, v25.4s, #1         //i4_value = (x3 - x2) >> 1;
571
572    abs       v2.8h, v0.8h
573    abs       v3.8h, v1.8h
574
575    cmgt      v4.8h, v0.8h, #0          //get the sign row 1,2
576    cmgt      v5.8h, v1.8h, #0
577
578    neg       w4, w4                    //-u4_qbits
579    dup       v22.4s, w4                //load  -u4_qbits
580
581    umlal     v14.4s, v2.4h, v30.4h
582    umlal2    v15.4s, v2.8h, v30.8h
583    umlal     v16.4s, v3.4h, v30.4h
584    umlal2    v17.4s, v3.8h, v30.8h
585
586    ushl      v14.4s, v14.4s, v22.4s
587    ushl      v15.4s, v15.4s, v22.4s
588    ushl      v16.4s, v16.4s, v22.4s
589    ushl      v17.4s, v17.4s, v22.4s
590
591    uqxtn     v14.4h, v14.4s
592    uqxtn2    v14.8h, v15.4s
593    uqxtn     v16.4h, v16.4s
594    uqxtn2    v16.8h, v17.4s
595
596    neg       v15.8h, v14.8h
597    neg       v17.8h, v16.8h
598
599    bsl       v4.16b, v14.16b, v15.16b
600    bsl       v5.16b, v16.16b, v17.16b
601
602    cmeq      v0.8h, v14.8h, #0
603    cmeq      v1.8h, v16.8h, #0
604
605    st1       {v4.8h-v5.8h}, [x1]
606
607    movi      v20.8b, #16
608
609    xtn       v2.8b, v0.8h
610    xtn       v3.8b, v1.8h
611
612    ushr      v2.8b, v2.8b, #7
613    ushr      v3.8b, v3.8b, #7
614
615    add       v2.8b, v2.8b, v3.8b
616    addp      v2.8b, v2.8b, v2.8b
617    addp      v2.8b, v2.8b, v2.8b
618    addp      v2.8b, v2.8b, v2.8b
619    sub       v20.8b, v20.8b, v2.8b
620    st1       {v20.b}[0], [x6]
621
622    pop_v_regs
623    ret
624
625
626//*****************************************************************************
627//*
628//* function name     : ih264_hadamard_quant_2x2_uv
629//* description       : this function does forward hadamard transform and
630//*                     quantization for dc block of chroma for both planes
631//*
//* arguments         :  x0 :pointer to src buffer
//                       x1 :pointer to dst buffer
//                       x2 :pu2_scale_matrix
//                       x3 :pu2_threshold_matrix
//                       x4 :u4_qbits
//                       x5 :u4_round_factor
//                       x6 :pu1_nnz
639// values returned   : none
640//
641// register usage    :
642// stack usage       : 0 bytes
643// cycles            : around
644// interruptiaility  : interruptable
645//
646// known limitations
647//   \assumptions    :
648//
649// revision history  :
650//         dd mm yyyy    author(s)   changes
651//         20 2 2015    100633      first version
652//
653//*****************************************************************************
654// ih264_hadamard_quant_2x2_uv_av8(word16 *pi2_src, word16 *pi2_dst,
655//                             const uword16 *pu2_scale_matrix,
656//                             const uword16 *pu2_threshold_matrix, uword32 u4_qbits,
657//                             uword32 u4_round_factor,uword8  *pu1_nnz
658//                             )
659
660    .global ih264_hadamard_quant_2x2_uv_av8
661ih264_hadamard_quant_2x2_uv_av8:
662
663    push_v_regs
664
665    ld2       {v0.4h-v1.4h}, [x0]       //load src
666
667    ld1       {v30.h}[0], [x2]          //load pu2_scale_matrix[0]
668    dup       v30.4h, v30.h[0]          //pu2_scale_matrix
669    uxtl      v30.4s, v30.4h            //pu2_scale_matrix
670
671    neg       w4, w4
672    dup       v24.4s, w4                //u4_qbits
673
674    dup       v25.4s, w5                //round fact
675    dup       v26.4s, v25.s[0]
676
677    saddl     v2.4s, v0.4h, v1.4h       //x0 = x4 + x5;, x2 = x6 + x7;
678    ssubl     v3.4s, v0.4h, v1.4h       //x1 = x4 - x5;  x3 = x6 - x7;
679
680    trn1      v4.4s, v2.4s, v3.4s
681    trn2      v5.4s, v2.4s, v3.4s       //q1 -> x0 x1, q2 -> x2 x3
682
683    add       v0.4s, v4.4s , v5.4s      // (x0 + x2) (x1 + x3)  (y0 + y2); (y1 + y3);
684    sub       v1.4s, v4.4s , v5.4s      // (x0 - x2) (x1 - x3)  (y0 - y2); (y1 - y3);
685
686    abs       v2.4s, v0.4s
687    abs       v3.4s, v1.4s
688
689    cmgt      v4.4s, v0.4s, #0          //get the sign row 1,2
690    cmgt      v5.4s, v1.4s, #0
691
692    uqxtn     v4.4h, v4.4s
693    sqxtn2    v4.8h, v5.4s
694
695    mla       v25.4s, v2.4s, v30.4s
696    mla       v26.4s, v3.4s, v30.4s
697
698    ushl      v2.4s, v25.4s, v24.4s     //>>qbit
699    ushl      v3.4s, v26.4s, v24.4s     //>>qbit
700
701    uqxtn     v2.4h, v2.4s
702    uqxtn2    v2.8h, v3.4s
703
704    neg       v5.8h, v2.8h
705
706    bsl       v4.16b, v2.16b, v5.16b    //*sign
707
708    //rearrange such that we get each plane coeffs as continous
709    mov       v5.s[0], v4.s[1]
710    mov       v4.s[1], v4.s[2]
711    mov       v4.s[2], v5.s[0]
712
713    cmeq      v5.8h, v4.8h, #0          //compute nnz
714    xtn       v5.8b, v5.8h              //reduce nnz comparison to 1 bit
715    ushr      v5.8b, v5.8b, #7          //reduce nnz comparison to 1 bit
716    movi      v20.8b, #4                //since we add zeros, we need to subtract from 4 to get nnz
717    addp      v5.8b, v5.8b, v5.8b       //sum up nnz
718    addp      v5.8b, v5.8b, v5.8b       //sum up nnz
719
720    st1       {v4.8h}, [x1]             //store the block
721
722    st1       {v4.8h}, [x1]             //store the block
723    sub       v20.8b, v20.8b, v5.8b     //4- numzeros
724
725    st1       {v20.h}[0], [x6]          //store nnz
726
727    pop_v_regs
728    ret
729
730
731
732