1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20///*****************************************************************************/
21///*                                                                           */
22///*  File Name         : ih264_deblk_luma_av8.s                               */
23///*                                                                           */
24///*  Description       : Contains function definitions for deblocking luma    */
25///*                      edge. Functions are coded in NEON assembly and can   */
26///*                      be compiled using ARM RVDS.                          */
27///*                                                                           */
28///*  List of Functions : ih264_deblk_luma_vert_bs4_av8()                      */
29///*                      ih264_deblk_luma_vert_bslt4_av8()                    */
30///*                      ih264_deblk_luma_horz_bs4_av8()                      */
31///*                      ih264_deblk_luma_horz_bslt4_av8()                    */
32///*                                                                           */
33///*  Issues / Problems : None                                                 */
34///*                                                                           */
35///*  Revision History  :                                                      */
36///*                                                                           */
37///*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
38///*         28 11 2013   Ittiam          Draft                                */
39///*                                                                           */
40///*****************************************************************************/
41
42
43.text
44.p2align 2
45.include "ih264_neon_macros.s"
46
47
48
49///**
50//*******************************************************************************
51//*
52//* @brief
53//*     Performs filtering of a luma block horizontal edge for cases where the
54//*     boundary strength is less than 4
55//*
56//* @par Description:
//*       This operation is described in  Sec. 8.7.2.3 under the title
//*       "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
59//*
60//* @param[in] x0 - pu1_src
61//*  Pointer to the src sample q0
62//*
63//* @param[in] w1 - src_strd
64//*  Source stride
65//*
66//* @param[in] w2 - alpha
67//*  Alpha Value for the boundary
68//*
69//* @param[in] w3 - beta
70//*  Beta Value for the boundary
71//*
72//* @param[in] w4 - u4_bs
73//*    Packed Boundary strength array
74//*
75//* @param[in] x5 - pu1_cliptab
76//*    tc0_table
77//*
78//* @returns
79//*  None
80//*
81//* @remarks
82//*  None
83//*
84//*******************************************************************************
85//*/
86
// -----------------------------------------------------------------------------
// ih264_deblk_luma_horz_bslt4_av8
//
// Normal-strength (bS < 4) filter for 16 luma pixels across a horizontal edge.
// Reads the six 16-byte rows p2, p1, p0, q0, q1, q2 around pu1_src and writes
// back rows p1, p0, q0 and q1.
//
// In:    x0 = pu1_src (row q0)      w1 = src_strd (sign-extended below)
//        w2 = alpha                 w3 = beta
//        w4 = u4_bs, four packed bS bytes; the most significant byte ends up
//             in lane 0 and therefore governs pixels 0-3
//        x5 = pu1_cliptab (only its first 4 bytes are loaded)
// Temps: x6 = pointer to row p1, x7 = store pointer (p0 -> q0 -> q1)
// Vector roles after loading: v10 = p2, v8 = p1, v6 = p0,
//                             v0 = q0, v2 = q1, v4 = q2
//
// The only general-purpose registers touched are x0, x1, x4, x6 and x7, so no
// general-purpose register is spilled. push_v_regs / pop_v_regs are macros from
// ih264_neon_macros.s (not visible here); presumably they save and restore
// d8-d15, which this routine overwrites - confirm against that file.
// -----------------------------------------------------------------------------
    .global ih264_deblk_luma_horz_bslt4_av8

ih264_deblk_luma_horz_bslt4_av8:

    push_v_regs
    sxtw      x1, w1                        // 64-bit signed stride

    // ---- load the six rows and expand bS / C0 ------------------------------
    sub       x0, x0, x1, lsl #1            // x0 = pu1_src - 2*src_strd
    sub       x0, x0, x1                    // x0 -> row p2
    rev       w4, w4                        // byte-reverse bS: old MSB becomes byte 0
    ld1       {v10.8b, v11.8b}, [x0], x1    // v10:v11 = p2, x0 -> p1
    mov       v12.s[0], w4                  // v12.s[0] = reversed bS bytes
    mov       x6, x0                        // x6 = row p1 (used for the store)
    ld1       {v8.8b, v9.8b}, [x0], x1      // v8:v9 = p1, x0 -> p0
    mov       x7, x0                        // x7 = row p0 (used for the stores)
    ld1       {v6.8b, v7.8b}, [x0], x1      // v6:v7 = p0, x0 -> q0
    uxtl      v12.8h, v12.8b                // bS bytes -> 16-bit lanes (lanes 0-3 valid)
    ld1       {v0.8b, v1.8b}, [x0], x1      // v0:v1 = q0, x0 -> q1
    mov       v10.d[1], v11.d[0]            // v10 = p2 (all 16 pixels)
    mov       v8.d[1], v9.d[0]              // v8  = p1
    mov       v6.d[1], v7.d[0]              // v6  = p0
    uabd      v26.16b, v8.16b, v6.16b       // v26 = |p1 - p0|
    ld1       {v2.8b, v3.8b}, [x0], x1      // v2:v3 = q1, x0 -> q2
    mov       v0.d[1], v1.d[0]              // v0  = q0
    mov       v2.d[1], v3.d[0]              // v2  = q1
    uabd      v22.16b, v6.16b, v0.16b       // v22 = |p0 - q0|
    ld1       {v16.s}[0], [x5]              // v16 bytes 0-3 = pu1_cliptab[0..3]
    uabd      v24.16b, v2.16b, v0.16b       // v24 = |q1 - q0|
    ld1       {v4.8b, v5.8b}, [x0], x1      // v4:v5 = q2
    tbl       v14.8b, {v16.16b}, v12.8b     // even bytes = pu1_cliptab[bS]; odd bytes use index 0
    mov       v4.d[1], v5.d[0]              // v4  = q2
    dup       v20.16b, w2                   // v20 = alpha in every byte
    dup       v16.16b, w3                   // v16 = beta in every byte (table no longer needed)
    uxtl      v12.4s, v12.4h                // one 32-bit lane per bS value (4 pixels each)
    uxtl      v14.4s, v14.4h                // one 32-bit lane per C0 value
    uabd      v28.16b, v10.16b, v6.16b      // v28 = Ap = |p2 - p0|
    uabd      v30.16b, v4.16b, v0.16b       // v30 = Aq = |q2 - q0|
    cmgt      v12.4s, v12.4s, #0            // v12 = all-ones where bS > 0
    sli       v14.4s, v14.4s, #8            // copy C0 into byte 1 of each lane

    // ---- filter decision ----------------------------------------------------
    cmhs      v18.16b, v22.16b, v20.16b     // |p0 - q0| >= alpha
    cmhs      v24.16b, v24.16b, v16.16b     // |q1 - q0| >= beta
    cmhs      v26.16b, v26.16b, v16.16b     // |p1 - p0| >= beta
    cmhi      v20.16b, v16.16b, v28.16b     // v20 = (Ap < beta)
    cmhi      v22.16b, v16.16b, v30.16b     // v22 = (Aq < beta)
    sli       v14.4s, v14.4s, #16           // v14 = C0 replicated into all 16 bytes
    orr       v18.16b, v18.16b, v24.16b     // |p0-q0| >= alpha || |q1-q0| >= beta
    usubl     v30.8h, v1.8b, v7.8b          // (q0 - p0), pixels 8-15
    usubl     v24.8h, v0.8b, v6.8b          // (q0 - p0), pixels 0-7
    orr       v18.16b, v18.16b, v26.16b     // v18 = skip mask (any of the three tests true)
    usubl     v28.8h, v8.8b, v2.8b          // (p1 - q1), pixels 0-7
    shl       v26.8h, v30.8h, #2            // (q0 - p0) << 2, pixels 8-15
    shl       v24.8h, v24.8h, #2            // (q0 - p0) << 2, pixels 0-7
    usubl     v30.8h, v9.8b, v3.8b          // (p1 - q1), pixels 8-15
    bic       v12.16b, v12.16b, v18.16b     // v12 = filter mask: bS > 0 and not skipped

    // ---- p0 / q0 delta --------------------------------------------------------
    add       v24.8h, v24.8h, v28.8h        // ((q0 - p0) << 2) + (p1 - q1), pixels 0-7
    add       v26.8h, v26.8h, v30.8h        // same, pixels 8-15
    sub       v18.16b, v14.16b, v20.16b     // C0 + (Ap < beta); the mask is -1 so subtracting adds 1
    urhadd    v16.16b, v6.16b, v0.16b       // v16 = (p0 + q0 + 1) >> 1
    mov       v17.d[0], v16.d[1]            // v17 = upper 8 pixels of that average
    sqrshrn   v24.8b, v24.8h, #3            // i_macro = (sum + 4) >> 3, saturated to s8
    sqrshrn   v25.8b, v26.8h, #3
    mov       v24.d[1], v25.d[0]            // v24 = i_macro for all 16 pixels
    sub       v18.16b, v18.16b, v22.16b     // v18 = C = C0 + (Ap < beta) + (Aq < beta)
    and       v20.16b, v20.16b, v12.16b     // p1 is modified only where Ap < beta and filtering
    and       v22.16b, v22.16b, v12.16b     // q1 is modified only where Aq < beta and filtering
    abs       v26.16b, v24.16b              // |i_macro|

    // ---- p1 / q1 deltas, interleaved with the p0 / q0 path --------------------
    uaddl     v28.8h, v17.8b, v11.8b        // p2 + avg, pixels 8-15
    uaddl     v10.8h, v16.8b, v10.8b        // p2 + avg, pixels 0-7
    uaddl     v30.8h, v17.8b, v5.8b         // q2 + avg, pixels 8-15
    umin      v18.16b, v26.16b, v18.16b     // delta = min(|i_macro|, C)
    ushll     v26.8h, v9.8b, #1             // p1 << 1, pixels 8-15
    uaddl     v4.8h, v16.8b, v4.8b          // q2 + avg, pixels 0-7
    ushll     v16.8h, v8.8b, #1             // p1 << 1, pixels 0-7
    and       v18.16b, v18.16b, v12.16b     // zero delta where the edge is not filtered
    sub       v28.8h, v28.8h, v26.8h        // p2 + avg - (p1 << 1), pixels 8-15
    sub       v10.8h, v10.8h, v16.8h        // same, pixels 0-7
    ushll     v16.8h, v2.8b, #1             // q1 << 1, pixels 0-7
    ushll     v26.8h, v3.8b, #1             // q1 << 1, pixels 8-15
    sqshrn    v29.8b, v28.8h, #1            // >> 1 with signed saturation, pixels 8-15
    sqshrn    v28.8b, v10.8h, #1            // same, pixels 0-7
    mov       v28.d[1], v29.d[0]            // v28 = i_macro_p1
    sub       v4.8h, v4.8h, v16.8h          // q2 + avg - (q1 << 1), pixels 0-7
    sub       v30.8h, v30.8h, v26.8h        // same, pixels 8-15
    neg       v26.16b, v14.16b              // v26 = -C0
    smin      v28.16b, v28.16b, v14.16b     // min(C0, i_macro_p1)
    cmge      v24.16b, v24.16b, #0          // v24 = (i_macro >= 0)
    sqshrn    v31.8b, v30.8h, #1
    sqshrn    v30.8b, v4.8h, #1
    mov       v30.d[1], v31.d[0]            // v30 = i_macro_q1
    smax      v28.16b, v28.16b, v26.16b     // clip i_macro_p1 to [-C0, C0]
    uqadd     v16.16b, v6.16b, v18.16b      // p0 + delta, saturated to 0..255
    uqsub     v6.16b, v6.16b, v18.16b       // p0 - delta, saturated
    smin      v30.16b, v30.16b, v14.16b     // min(C0, i_macro_q1)
    and       v28.16b, v20.16b, v28.16b     // keep the p1 delta only where allowed
    uqadd     v14.16b, v0.16b, v18.16b      // q0 + delta (C0 in v14 is no longer needed)
    uqsub     v0.16b, v0.16b, v18.16b       // q0 - delta
    smax      v30.16b, v30.16b, v26.16b     // clip i_macro_q1 to [-C0, C0]
    bif       v16.16b, v6.16b, v24.16b      // p0' = (i_macro >= 0) ? p0 + delta : p0 - delta
    bif       v0.16b, v14.16b, v24.16b      // q0' = (i_macro >= 0) ? q0 - delta : q0 + delta
    add       v28.16b, v28.16b, v8.16b      // p1' = p1 + clipped delta
    and       v30.16b, v22.16b, v30.16b     // keep the q1 delta only where allowed

    // ---- write back ------------------------------------------------------------
    st1       {v16.16b}, [x7], x1           // store p0', x7 -> q0
    add       v30.16b, v30.16b, v2.16b      // q1' = q1 + clipped delta
    st1       {v0.16b}, [x7], x1            // store q0', x7 -> q1
    st1       {v28.16b}, [x6]               // store p1'
    st1       {v30.16b}, [x7], x1           // store q1'

    pop_v_regs
    ret
201
202
203
204///**
205//*******************************************************************************
206//*
207//* @brief
208//*     Performs filtering of a luma block horizontal edge when the
209//*     boundary strength is set to 4
210//*
211//* @par Description:
212//*       This operation is described in  Sec. 8.7.2.4 under the title
213//*       "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
214//*
215//* @param[in] x0 - pu1_src
216//*  Pointer to the src sample q0
217//*
218//* @param[in] w1 - src_strd
219//*  Source stride
220//*
221//* @param[in] w2 - alpha
222//*  Alpha Value for the boundary
223//*
224//* @param[in] w3 - beta
225//*  Beta Value for the boundary
226//*
227//* @returns
228//*  None
229//*
230//* @remarks
231//*  None
232//*
233//*******************************************************************************
234//*/
235
// -----------------------------------------------------------------------------
// ih264_deblk_luma_horz_bs4_av8
//
// Strong (bS == 4) filter for 16 luma pixels across a horizontal edge.
// Reads the eight 16-byte rows p3..p0 and q0..q3 around pu1_src and writes
// back rows p2, p1, p0, q0, q1 and q2.
//
// In:    x0 = pu1_src (row q0)      w1 = src_strd (sign-extended below)
//        w2 = alpha                 w3 = beta
// Temps: x12 = row p0, x14 = row p1, x3 = row p2, x2 = row p3
//        (x2 / x3 are reused as pointers only after alpha / beta have been
//        broadcast into v0 / v2)
// Masks: v18 = skip mask (pixel must stay unfiltered)
//        v20 = |p0 - q0| < (alpha >> 2) + 2, later and-ed with Ap < beta
//        v22 = Aq < beta && |p0 - q0| < (alpha >> 2) + 2
//
// The only general-purpose registers touched are x0-x3, x12 and x14, so no
// general-purpose register is spilled. push_v_regs / pop_v_regs are macros from
// ih264_neon_macros.s (not visible here); presumably they save and restore
// d8-d15, which this routine overwrites - confirm against that file.
// -----------------------------------------------------------------------------
    .global ih264_deblk_luma_horz_bs4_av8

ih264_deblk_luma_horz_bs4_av8:

    push_v_regs
    sxtw      x1, w1                        // 64-bit signed stride

    // ---- row pointers and thresholds ----------------------------------------
    dup       v0.16b, w2                    // v0 = alpha in every byte
    sub       x12, x0, x1                   // x12 -> p0 = q0 - src_strd
    dup       v2.16b, w3                    // v2 = beta in every byte
    sub       x14, x0, x1, lsl #1           // x14 -> p1 = q0 - 2*src_strd
    sub       x2, x0, x1, lsl #2            // x2  -> p3 = q0 - 4*src_strd
    sub       x3, x14, x1                   // x3  -> p2 = p1 - src_strd

    // ---- load q0, p0, q1, p1 ---------------------------------------------------
    ld1       {v4.8b, v5.8b}, [x0], x1      // v4:v5 = q0, x0 -> q1
    ld1       {v6.8b, v7.8b}, [x12]         // v6:v7 = p0
    ld1       {v8.8b, v9.8b}, [x0], x1      // v8:v9 = q1, x0 -> q2
    ld1       {v10.8b, v11.8b}, [x14]       // v10:v11 = p1
    mov       v4.d[1], v5.d[0]              // v4  = q0 (all 16 pixels)
    mov       v6.d[1], v7.d[0]              // v6  = p0
    mov       v8.d[1], v9.d[0]              // v8  = q1
    mov       v10.d[1], v11.d[0]            // v10 = p1

    // ---- filter decision ----------------------------------------------------
    uabd      v12.16b, v4.16b, v6.16b       // v12 = |q0 - p0|
    uabd      v14.16b, v8.16b, v4.16b       // |q1 - q0|
    uabd      v16.16b, v10.16b, v6.16b      // |p1 - p0|
    cmhs      v18.16b, v12.16b, v0.16b      // |p0 - q0| >= alpha
    cmhs      v14.16b, v14.16b, v2.16b      // |q1 - q0| >= beta
    cmhs      v16.16b, v16.16b, v2.16b      // |p1 - p0| >= beta
    movi      v20.16b, #2
    orr       v18.16b, v18.16b, v14.16b     // |p0-q0| >= alpha || |q1-q0| >= beta
    ld1       {v14.8b, v15.8b}, [x0], x1    // v14:v15 = q2, x0 -> q3
    mov       v14.d[1], v15.d[0]            // v14 = q2
    orr       v18.16b, v18.16b, v16.16b     // v18 = skip mask (any of the three tests true)
    usra      v20.16b, v0.16b, #2           // v20 = (alpha >> 2) + 2
    uabd      v22.16b, v14.16b, v4.16b      // Aq = |q2 - q0|
    uaddl     v24.8h, v4.8b, v6.8b          // p0 + q0, pixels 0-7
    uaddl     v26.8h, v5.8b, v7.8b          // p0 + q0, pixels 8-15
    cmhi      v22.16b, v2.16b, v22.16b      // Aq < beta
    cmhi      v20.16b, v20.16b, v12.16b     // |p0 - q0| < (alpha >> 2) + 2

    // ---- q side: q0', q1', q2' ---------------------------------------------------
    uaddw     v28.8h, v24.8h, v8.8b         // p0 + q0 + q1, pixels 0-7
    uaddw     v30.8h, v26.8h, v9.8b         // p0 + q0 + q1, pixels 8-15
    and       v22.16b, v22.16b, v20.16b     // v22 = Aq < beta && |p0-q0| < (alpha>>2)+2
    // q0' candidate used where v22 is set
    add       v16.8h, v28.8h, v28.8h        // 2*(p0+q0+q1), pixels 0-7
    add       v0.8h, v30.8h, v30.8h         // 2*(p0+q0+q1), pixels 8-15 (alpha no longer needed)
    uaddw     v16.8h, v16.8h, v14.8b        // + q2
    uaddw     v0.8h, v0.8h, v15.8b
    uaddw     v16.8h, v16.8h, v10.8b        // + p1
    uaddw     v0.8h, v0.8h, v11.8b
    rshrn     v12.8b, v16.8h, #3            // (2*(p0+q0+q1) + q2 + p1 + 4) >> 3
    rshrn     v13.8b, v0.8h, #3
    mov       v12.d[1], v13.d[0]            // v12 = q0' (strong)
    // q0" candidate used where v22 is clear
    uaddl     v16.8h, v8.8b, v8.8b          // 2*q1, pixels 0-7
    uaddl     v0.8h, v9.8b, v9.8b           // 2*q1, pixels 8-15
    uaddw     v16.8h, v16.8h, v4.8b         // + q0
    uaddw     v0.8h, v0.8h, v5.8b
    uaddw     v16.8h, v16.8h, v10.8b        // + p1
    uaddw     v0.8h, v0.8h, v11.8b
    rshrn     v16.8b, v16.8h, #2            // (2*q1 + q0 + p1 + 2) >> 2
    rshrn     v17.8b, v0.8h, #2
    mov       v16.d[1], v17.d[0]            // v16 = q0" (weak)
    uaddw     v28.8h, v28.8h, v14.8b        // p0 + q0 + q1 + q2, pixels 0-7
    uaddw     v30.8h, v30.8h, v15.8b        // p0 + q0 + q1 + q2, pixels 8-15
    ld1       {v0.8b, v1.8b}, [x0], x1      // v0:v1 = q3, x0 -> row after q3
    mov       v0.d[1], v1.d[0]              // v0 = q3
    bit       v16.16b, v12.16b, v22.16b     // take q0' where v22 is set, else keep q0"
    sub       x0, x0, x1, lsl #2            // x0 back to row q0
    bic       v22.16b, v22.16b, v18.16b     // v22 &= ~skip: strong-q mask restricted to filtered pixels
    rshrn     v12.8b, v28.8h, #2            // (p0 + q0 + q1 + q2 + 2) >> 2
    rshrn     v13.8b, v30.8h, #2
    mov       v12.d[1], v13.d[0]            // v12 = q1'
    bif       v4.16b, v16.16b, v18.16b      // q0 stays where skipped, else filtered q0
    mov       v5.d[0], v4.d[1]              // upper half into v5 for the two-register store
    uaddl     v16.8h, v14.8b, v0.8b         // q2 + q3, pixels 0-7
    uaddl     v0.8h, v15.8b, v1.8b          // q2 + q3, pixels 8-15
    add       v28.8h, v28.8h, v16.8h        // p0 + q0 + q1 + 2*q2 + q3, pixels 0-7
    st1       {v4.8b, v5.8b}, [x0], x1      // store q0, x0 -> q1
    add       v30.8h, v30.8h, v0.8h         // p0 + q0 + q1 + 2*q2 + q3, pixels 8-15
    add       v28.8h, v28.8h, v16.8h        // p0 + q0 + q1 + 3*q2 + 2*q3, pixels 0-7
    add       v30.8h, v30.8h, v0.8h         // p0 + q0 + q1 + 3*q2 + 2*q3, pixels 8-15
    rshrn     v0.8b, v28.8h, #3             // (p0 + q0 + q1 + 3*q2 + 2*q3 + 4) >> 3
    rshrn     v1.8b, v30.8h, #3
    mov       v0.d[1], v1.d[0]              // v0 = q2'
    ld1       {v30.8b, v31.8b}, [x3]        // v30:v31 = p2
    mov       v30.d[1], v31.d[0]            // v30 = p2
    bif       v12.16b, v8.16b, v22.16b      // q1' where v22 is set, else original q1
    mov       v13.d[0], v12.d[1]            // upper half into v13 for the store
    uabd      v16.16b, v30.16b, v6.16b      // Ap = |p2 - p0|
    uaddw     v24.8h, v24.8h, v10.8b        // p0 + q0 + p1, pixels 0-7
    bif       v0.16b, v14.16b, v22.16b      // q2' where v22 is set, else original q2
    mov       v1.d[0], v0.d[1]              // upper half into v1 for the store
    uaddw     v26.8h, v26.8h, v11.8b        // p0 + q0 + p1, pixels 8-15
    st1       {v12.8b, v13.8b}, [x0], x1    // store q1, x0 -> q2

    // ---- p side: p0', p1', p2' ---------------------------------------------------
    cmhi      v16.16b, v2.16b, v16.16b      // Ap < beta
    add       v28.8h, v24.8h, v24.8h        // 2*(p0+q0+p1), pixels 0-7
    add       v4.8h, v26.8h, v26.8h         // 2*(p0+q0+p1), pixels 8-15 (q0 already stored)
    st1       {v0.8b, v1.8b}, [x0], x1      // store q2
    and       v20.16b, v20.16b, v16.16b     // v20 = Ap < beta && |p0-q0| < (alpha>>2)+2
    uaddw     v28.8h, v28.8h, v30.8b        // + p2
    uaddw     v4.8h, v4.8h, v31.8b
    uaddw     v28.8h, v28.8h, v8.8b         // + q1
    uaddw     v4.8h, v4.8h, v9.8b
    rshrn     v28.8b, v28.8h, #3            // (2*(p0+q0+p1) + p2 + q1 + 4) >> 3
    rshrn     v29.8b, v4.8h, #3
    mov       v28.d[1], v29.d[0]            // v28 = p0' (strong)
    movi      v0.8b, #2                     // byte constant 2 for umlal
    movi      v1.4h, #2                     // halfword constant 2 for mla by element
    uaddl     v2.8h, v6.8b, v8.8b           // p0 + q1, pixels 0-7 (beta no longer needed)
    umlal     v2.8h, v10.8b, v0.8b          // 2*p1 + p0 + q1, pixels 0-7
    uaddl     v16.8h, v7.8b, v9.8b          // p0 + q1, pixels 8-15
    umlal     v16.8h, v11.8b, v0.8b         // 2*p1 + p0 + q1, pixels 8-15
    uaddw     v12.8h, v24.8h, v30.8b        // p0 + q0 + p1 + p2, pixels 0-7
    ld1       {v24.8b, v25.8b}, [x2]        // v24:v25 = p3
    mov       v24.d[1], v25.d[0]            // v24 = p3
    uaddw     v4.8h, v26.8h, v31.8b         // p0 + q0 + p1 + p2, pixels 8-15
    uaddl     v8.8h, v30.8b, v24.8b         // p2 + p3, pixels 0-7 (q1 no longer needed)
    rshrn     v26.8b, v12.8h, #2            // p1' = (p0 + q0 + p1 + p2 + 2) >> 2, pixels 0-7
    rshrn     v2.8b, v2.8h, #2              // p0" = (2*p1 + p0 + q1 + 2) >> 2, pixels 0-7
    rshrn     v27.8b, v4.8h, #2             // p1', pixels 8-15
    rshrn     v3.8b, v16.8h, #2             // p0", pixels 8-15
    mov       v26.d[1], v27.d[0]            // v26 = p1'
    mov       v2.d[1], v3.d[0]              // v2  = p0" (weak)
    uaddl     v16.8h, v31.8b, v25.8b        // p2 + p3, pixels 8-15
    mla       v12.8h, v8.8h, v1.h[0]        // p0 + q0 + p1 + 3*p2 + 2*p3, pixels 0-7
    mla       v4.8h, v16.8h, v1.h[0]        // p0 + q0 + p1 + 3*p2 + 2*p3, pixels 8-15
    bic       v16.16b, v20.16b, v18.16b     // v16 = strong-p mask restricted to filtered pixels
    bit       v2.16b, v28.16b, v20.16b      // take p0' where v20 is set, else keep p0"
    rshrn     v12.8b, v12.8h, #3            // (p0 + q0 + p1 + 3*p2 + 2*p3 + 4) >> 3
    rshrn     v13.8b, v4.8h, #3
    mov       v12.d[1], v13.d[0]            // v12 = p2'
    bif       v6.16b, v2.16b, v18.16b       // p0 stays where skipped, else filtered p0
    bit       v10.16b, v26.16b, v16.16b     // p1' where v16 is set, else original p1
    bit       v30.16b, v12.16b, v16.16b     // p2' where v16 is set, else original p2
    st1       {v6.16b}, [x12]               // store p0
    st1       {v10.16b}, [x14]              // store p1
    st1       {v30.16b}, [x3]               // store p2

    pop_v_regs
    ret
389
390
391
392///**
393//*******************************************************************************
394//*
395//* @brief
396//*     Performs filtering of a luma block vertical edge for cases where the
397//*     boundary strength is less than 4
398//*
399//* @par Description:
//*       This operation is described in  Sec. 8.7.2.3 under the title
//*       "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
402//*
403//* @param[in] x0 - pu1_src
404//*  Pointer to the src sample q0
405//*
406//* @param[in] w1 - src_strd
407//*  Source stride
408//*
409//* @param[in] w2 - alpha
410//*  Alpha Value for the boundary
411//*
412//* @param[in] w3 - beta
413//*  Beta Value for the boundary
414//*
415//* @param[in] w4 - u4_bs
416//*    Packed Boundary strength array
417//*
418//* @param[in] x5 - pu1_cliptab
419//*    tc0_table
420//*
421//* @returns
422//*  None
423//*
424//* @remarks
425//*  None
426//*
427//*******************************************************************************
428//*/
429
430    .global ih264_deblk_luma_vert_bslt4_av8
431
432ih264_deblk_luma_vert_bslt4_av8:
433
434    // STMFD sp!,{x12,x14}
435    push_v_regs
436    stp       x19, x20, [sp, #-16]!
437    sxtw      x1, w1
438
439    sub       x0, x0, #4                //pointer uc_edgePixel-4
440    mov       x12, x4
441    mov       x14, x5
442    mov       x17, x0
443    //loading p3:p2:p1:p0:q0:q1:q2:q3 for every row
444    ld1       {v0.8b}, [x0], x1         //row1
445    ld1       {v2.8b}, [x0], x1         //row2
446    ld1       {v4.8b}, [x0], x1         //row3
447    rev       w12, w12                  //reversing ui_bs
448    ld1       {v6.8b}, [x0], x1         //row4
449    mov       v18.s[0], w12             //d12[0] = ui_Bs
450    ld1       {v16.s}[0], [x14]         //D16[0] contains cliptab
451    ld1       {v8.8b}, [x0], x1         //row5
452    uxtl      v18.8h, v18.8b            //q6 = uc_Bs in each 16 bt scalar
453    ld1       {v10.8b}, [x0], x1        //row6
454    ld1       {v12.8b}, [x0], x1        //row7
455    tbl       v16.8b, {v16.16b}, v18.8b //puc_ClipTab[uc_Bs]
456    ld1       {v14.8b}, [x0], x1        //row8
457    ld1       {v1.8b}, [x0], x1         //row9
458    uxtl      v16.4s, v16.4h            //
459    ld1       {v3.8b}, [x0], x1         //row10
460    ld1       {v5.8b}, [x0], x1         //row11
461    ld1       {v7.8b}, [x0], x1         //row12
462    sli       v16.4s, v16.4s, #8        //
463    ld1       {v9.8b}, [x0], x1         //row13
464    ld1       {v11.8b}, [x0], x1        //row14
465    ld1       {v13.8b}, [x0], x1        //row15
466    sli       v16.4s, v16.4s, #16
467    ld1       {v15.8b}, [x0], x1        //row16
468
469
470    //taking two 8x8 transposes
471    //2X2 transposes
472    trn1      v21.8b, v0.8b, v2.8b
473    trn2      v2.8b, v0.8b, v2.8b       //row1 &2
474    mov       v0.8b, v21.8b
475    trn1      v21.8b, v4.8b, v6.8b
476    trn2      v6.8b, v4.8b, v6.8b       //row3&row4
477    mov       v4.8b, v21.8b
478    trn1      v21.8b, v8.8b, v10.8b
479    trn2      v10.8b, v8.8b, v10.8b     //row5&6
480    mov       v8.8b, v21.8b
481    trn1      v21.8b, v12.8b, v14.8b
482    trn2      v14.8b, v12.8b, v14.8b    //row7 & 8
483    mov       v12.8b, v21.8b
484    trn1      v21.8b, v1.8b, v3.8b
485    trn2      v3.8b, v1.8b, v3.8b       //row9 &10
486    mov       v1.8b, v21.8b
487    trn1      v21.8b, v5.8b, v7.8b
488    trn2      v7.8b, v5.8b, v7.8b       //row11 & 12
489    mov       v5.8b, v21.8b
490    trn1      v21.8b, v9.8b, v11.8b
491    trn2      v11.8b, v9.8b, v11.8b     //row13 &14
492    mov       v9.8b, v21.8b
493    trn1      v21.8b, v13.8b, v15.8b
494    trn2      v15.8b, v13.8b, v15.8b    //row15 & 16
495    mov       v13.8b, v21.8b
496    //4x4 transposes
497    trn1      v21.4h, v2.4h, v6.4h
498    trn2      v6.4h, v2.4h, v6.4h       //row2 & row4
499    mov       v2.8b, v21.8b
500    trn1      v21.4h, v10.4h, v14.4h
501    trn2      v14.4h, v10.4h, v14.4h    //row6 & row8
502    mov       v10.8b, v21.8b
503    trn1      v21.4h, v3.4h, v7.4h
504    trn2      v7.4h, v3.4h, v7.4h       //row10 & 12
505    mov       v3.8b, v21.8b
506    trn1      v21.4h, v11.4h, v15.4h
507    trn2      v15.4h, v11.4h, v15.4h    //row14 & row16
508    mov       v11.8b, v21.8b
509    trn1      v21.2s, v6.2s, v14.2s
510    trn2      v14.2s, v6.2s, v14.2s     //row4 & 8
511    mov       v6.8b, v21.8b
512    trn1      v21.2s, v7.2s, v15.2s
513    trn2      v15.2s, v7.2s, v15.2s     //row 12 & 16
514    mov       v7.8b, v21.8b
515    //now Q3 ->p0 and Q7->q3
516    trn1      v21.4h, v0.4h, v4.4h
517    trn2      v4.4h, v0.4h, v4.4h       //row1 & 3
518    mov       v0.8b, v21.8b
519    trn1      v21.4h, v8.4h, v12.4h
520    trn2      v12.4h, v8.4h, v12.4h     //row 5 & 7
521    mov       v8.8b, v21.8b
522    trn1      v21.4h, v1.4h, v5.4h
523    trn2      v5.4h, v1.4h, v5.4h       //row9 & row11
524    mov       v1.8b, v21.8b
525    trn1      v21.4h, v9.4h, v13.4h
526    trn2      v13.4h, v9.4h, v13.4h     //row13 & row15
527    mov       v9.8b, v21.8b
528    trn1      v21.2s, v0.2s, v8.2s
529    trn2      v8.2s, v0.2s, v8.2s       //row1 & row5
530    mov       v0.8b, v21.8b
531    trn1      v21.2s, v1.2s, v9.2s
532    trn2      v9.2s, v1.2s, v9.2s       //row9 & 13
533    mov       v1.8b, v21.8b
534    //now Q0->p3 & Q4->q0
535    //starting processing as p0 and q0 are now ready
536    trn1      v21.2s, v2.2s, v10.2s
537    trn2      v10.2s, v2.2s, v10.2s     //row2 &6
538    mov       v2.8b, v21.8b
539    mov       v6.d[1] , v7.d[0]
540    mov       v8.d[1] , v9.d[0]
541    urhadd    v20.16b, v6.16b , v8.16b  //((p0 + q0 + 1) >> 1)
542    mov       v21.d[0], v20.d[1]
543    trn1      v31.2s, v3.2s, v11.2s
544    trn2      v11.2s, v3.2s, v11.2s     //row10&row14
545    mov       v3.8b, v31.8b
546    movi      v19.8b, #2
547    mov       v18.d[1], v19.d[0]
548    //now Q1->p2     & Q5->q1
549    trn1      v31.2s, v4.2s, v12.2s
550    trn2      v12.2s, v4.2s, v12.2s     //row3 & 7
551    mov       v4.8b, v31.8b
552    uabd      v22.16b  , v6.16b, v8.16b //ABS(q1 - q0)
553    trn1      v31.2s, v5.2s, v13.2s
554    trn2      v13.2s, v5.2s, v13.2s     //row11 & row15
555    mov       v5.8b, v31.8b
556    mov       v0.d[1] , v1.d[0]
557    mov       v2.d[1] , v3.d[0]
558    mov       v4.d[1] , v5.d[0]
559    mov       v10.d[1] , v11.d[0]
560    mov       v12.d[1] , v13.d[0]
561    mov       v14.d[1] , v15.d[0]
562    uaddl     v24.8h, v20.8b, v2.8b     //(p2 + ((p0 + q0 + 1) >> 1) L
563    //now            Q2->p1,Q6->q2
564    uaddl     v26.8h, v21.8b, v3.8b     //(p2 + ((p0 + q0 + 1) >> 1) H
565    umlsl     v24.8h, v4.8b, v19.8b     //(p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) L
566    umlsl     v26.8h, v5.8b, v19.8b     //(p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) H
567    dup       v28.16b, w2               //alpha
568    cmhs      v22.16b, v22.16b , v28.16b //ABS(p0 - q0) >= Alpha(Alpha <=ABS(p0 - q0))
569    dup       v28.16b, w3               //beta
570    uabd      v30.16b  , v10.16b, v8.16b //ABS(q1 - q0)
571    sqshrn    v24.8b, v24.8h, #1        //((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1) L
572    sqshrn    v25.8b, v26.8h, #1        //((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1) H
573    mov       v24.d[1], v25.d[0]
574    cmhs      v30.16b, v30.16b , v28.16b //ABS(p0 - q0) >= Alpha(Alpha <=ABS(p0 - q0))
575    uabd      v26.16b  , v4.16b, v6.16b //ABS(q1 - q0)
576
577    smin      v24.16b, v24.16b , v16.16b //min(deltap1 ,C0)
578    orr       v22.16b, v22.16b , v30.16b //ABS(q1 - q0) >= Beta ||ABS(p0 - q0) >= Alpha
579    neg       v30.16b, v16.16b          //-C0
580    cmhs      v26.16b, v26.16b , v28.16b //ABS(p0 - q0) >= Alpha(Alpha <=ABS(p0 - q0))
581    smax      v24.16b, v24.16b , v30.16b //max(deltap1,-C0)
582    orr       v22.16b, v22.16b , v26.16b //ABS(p0 - q0) >= Alpha  || ABS(q1 - q0) >= Beta ||    ABS(p1 - p0) >= Beta)
583    uxtl      v26.4s, v18.4h            //ui_bs
584    uaddl     v18.8h, v20.8b, v12.8b    //q2 + ((p0 + q0 + 1) >> 1) L
585    cmeq      v26.4s, v26.4s , #0       //ABS(p0 - q0) >= Alpha(Alpha <=ABS(p0 - q0))
586    usubw     v18.8h, v18.8h , v10.8b   //(q2 + ((p0 + q0 + 1) >> 1) - q1) L
587    uaddl     v20.8h, v21.8b, v13.8b    //q2 + ((p0 + q0 + 1) >> 1) H
588    usubw     v18.8h, v18.8h , v10.8b   //(q2 + ((p0 + q0 + 1) >> 1) - 2*q1)L
589    usubw     v20.8h, v20.8h , v11.8b   //(q2 + ((p0 + q0 + 1) >> 1) - q1) H
590    orr       v26.16b, v26.16b , v22.16b //(ABS(p0 - q0) >= Alpha  || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)) &&(ui_bs)
591    usubw     v20.8h, v20.8h , v11.8b   //(q2 + ((p0 + q0 + 1) >> 1) - 2*q1) H
592    sqshrn    v18.8b, v18.8h, #1        //((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1) L
593    uabd      v22.16b  , v2.16b, v6.16b //ABS(q1 - q0)
594    sqshrn    v19.8b, v20.8h, #1        //((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1) H
595    mov       v18.d[1], v19.d[0]
596    uabd      v20.16b  , v12.16b, v8.16b //ABS(q1 - q0)
597    cmhi      v22.16b, v28.16b , v22.16b //Ap < Beta
598    smin      v18.16b, v18.16b , v16.16b //min(delatq1,C0)
599    cmhi      v20.16b, v28.16b , v20.16b //Aq <Beta
600    usubl     v28.8h, v8.8b, v6.8b      //(q0 - p0) L
601    smax      v18.16b, v18.16b , v30.16b //max(deltaq1,-C0)
602    usubl     v30.8h, v9.8b, v7.8b      //(q0 - p0) H
603    shl       v28.8h, v28.8h, #2        //(q0 - p0)<<2 L
604    sub       v16.16b, v16.16b , v22.16b //C0 + (Ap < Beta)
605    shl       v30.8h, v30.8h, #2        //(q0 - p0) << 2) H
606    uaddw     v28.8h, v28.8h , v4.8b    //((q0 - p0) << 2) + (p1  L
607    uaddw     v30.8h, v30.8h , v5.8b    //((q0 - p0) << 2) + (p1 H
608    usubw     v28.8h, v28.8h , v10.8b   //((q0 - p0) << 2) + (p1 - q1) L
609    usubw     v30.8h, v30.8h , v11.8b   //((q0 - p0) << 2) + (p1 - q1) H
610    bic       v22.16b, v22.16b , v26.16b //final condition for p1
611    rshrn     v28.8b, v28.8h, #3        //delta = ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3); L
612    rshrn     v29.8b, v30.8h, #3        //delta = ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3) H
613    mov       v28.d[1], v29.d[0]
614    sub       v16.16b, v16.16b , v20.16b //C0 + (Ap < Beta) + (Aq < Beta)
615    bic       v20.16b, v20.16b , v26.16b //final condition for q1
616    abs       v30.16b, v28.16b          //abs(delta)
617    and       v24.16b, v24.16b , v22.16b //delatp1
618    and       v18.16b, v18.16b , v20.16b //delta q1
619    umin      v30.16b, v30.16b , v16.16b //min((abs(delta),C)
620    add       v4.16b, v4.16b , v24.16b  //p1+deltap1
621    add       v10.16b, v10.16b , v18.16b //q1+deltaq1
622    mov       v5.d[0], v4.d[1]
623    mov       v11.d[0], v10.d[1]
624    bic       v30.16b, v30.16b , v26.16b //abs(delta) of pixels to be changed only
625    // VCGE.S8 Q14,    Q14,#0                    //sign(delta)
626    cmge      v28.16b, v28.16b , #0
627    uqsub     v22.16b, v6.16b , v30.16b //clip(p0-delta)
628
629    trn1      v21.8b, v0.8b, v2.8b
630    trn2      v2.8b, v0.8b, v2.8b       //row1 &2
631    mov       v0.8b, v21.8b
632    uqadd     v6.16b, v6.16b , v30.16b  //clip(p0+delta)
633
634    trn1      v21.8b, v1.8b, v3.8b
635    trn2      v3.8b, v1.8b, v3.8b       //row9 &10
636    mov       v1.8b, v21.8b
637    uqadd     v24.16b, v8.16b , v30.16b //clip(q0+delta)
638    trn1      v21.8b, v12.8b, v14.8b
639    trn2      v14.8b, v12.8b, v14.8b    //row7 & 8
640    mov       v12.8b, v21.8b
641    uqsub     v8.16b, v8.16b , v30.16b  //clip(q0-delta)
642    trn1      v21.8b, v13.8b, v15.8b
643    trn2      v15.8b, v13.8b, v15.8b    //row15 & 16
644    mov       v13.8b, v21.8b
645    bif       v6.16b, v22.16b , v28.16b //p0
646    bif       v8.16b, v24.16b , v28.16b //q0
647    mov       v7.d[0], v6.d[1]
648    mov       v9.d[0], v8.d[1]
649    trn1      v21.8b, v4.8b, v6.8b
650    trn2      v6.8b, v4.8b, v6.8b       //row3&row4
651    mov       v4.8b, v21.8b
652    trn1      v21.8b, v8.8b, v10.8b
653    trn2      v10.8b, v8.8b, v10.8b     //row5&6
654    mov       v8.8b, v21.8b
655    trn1      v21.8b, v5.8b, v7.8b
656    trn2      v7.8b, v5.8b, v7.8b       //row11 & 12
657    mov       v5.8b, v21.8b
658    trn1      v21.8b, v9.8b, v11.8b
659    trn2      v11.8b, v9.8b, v11.8b     //row13 &14
660    mov       v9.8b, v21.8b
661    trn1      v21.4h, v2.4h, v6.4h
662    trn2      v6.4h, v2.4h, v6.4h       //row2 & row4
663    mov       v2.8b, v21.8b
664    trn1      v21.4h, v10.4h, v14.4h
665    trn2      v14.4h, v10.4h, v14.4h    //row6 & row8
666    mov       v10.8b, v21.8b
667    trn1      v21.4h, v3.4h, v7.4h
668    trn2      v7.4h, v3.4h, v7.4h       //row10 & 12
669    mov       v3.8b, v21.8b
670    trn1      v21.4h, v11.4h, v15.4h
671    trn2      v15.4h, v11.4h, v15.4h    //row14 & row16
672    mov       v11.8b, v21.8b
673    trn1      v21.2s, v6.2s, v14.2s
674    trn2      v14.2s, v6.2s, v14.2s     //row4 & 8
675    mov       v6.8b, v21.8b
676    trn1      v21.2s, v7.2s, v15.2s
677    trn2      v15.2s, v7.2s, v15.2s     //row 12 & 16
678    mov       v7.8b, v21.8b
679    //now Q3 ->p0 and Q7->q3
680    trn1      v21.4h, v0.4h, v4.4h
681    trn2      v4.4h, v0.4h, v4.4h       //row1 & 3
682    mov       v0.8b, v21.8b
683    trn1      v21.4h, v8.4h, v12.4h
684    trn2      v12.4h, v8.4h, v12.4h     //row 5 & 7
685    mov       v8.8b, v21.8b
686    trn1      v21.4h, v1.4h, v5.4h
687    trn2      v5.4h, v1.4h, v5.4h       //row9 & row11
688    mov       v1.8b, v21.8b
689    trn1      v21.4h, v9.4h, v13.4h
690    trn2      v13.4h, v9.4h, v13.4h     //row13 & row15
691    mov       v9.8b, v21.8b
692    sub       x0, x0, x1, lsl#4         //restore pointer
693    trn1      v21.2s, v0.2s, v8.2s
694    trn2      v8.2s, v0.2s, v8.2s       //row1 & row5
695    mov       v0.8b, v21.8b
696    trn1      v21.2s, v1.2s, v9.2s
697    trn2      v9.2s, v1.2s, v9.2s       //row9 & 13
698    mov       v1.8b, v21.8b
699    trn1      v21.2s, v2.2s, v10.2s
700    trn2      v10.2s, v2.2s, v10.2s     //row2 &6
701    mov       v2.8b, v21.8b
702    trn1      v21.2s, v3.2s, v11.2s
703    trn2      v11.2s, v3.2s, v11.2s     //row10&row14
704    mov       v3.8b, v21.8b
705    trn1      v21.2s, v4.2s, v12.2s
706    trn2      v12.2s, v4.2s, v12.2s     //row3 & 7
707    mov       v4.8b, v21.8b
708    trn1      v21.2s, v5.2s, v13.2s
709    trn2      v13.2s, v5.2s, v13.2s     //row11 & row15
710    mov       v5.8b, v21.8b
711    st1       {v0.8b}, [x0], x1         //row1
712    st1       {v2.8b}, [x0], x1         //row2
713    st1       {v4.8b}, [x0], x1         //row3
714    st1       {v6.8b}, [x0], x1         //row4
715    st1       {v8.8b}, [x0], x1         //row5
716    st1       {v10.8b}, [x0], x1        //row6
717    st1       {v12.8b}, [x0], x1        //row7
718    st1       {v14.8b}, [x0], x1        //row8
719    st1       {v1.8b}, [x0], x1         //row9
720    st1       {v3.8b}, [x0], x1         //row10
721    st1       {v5.8b}, [x0], x1         //row11
722    st1       {v7.8b}, [x0], x1         //row12
723    st1       {v9.8b}, [x0], x1         //row13
724    st1       {v11.8b}, [x0], x1        //row14
725    st1       {v13.8b}, [x0], x1        //row15
726    st1       {v15.8b}, [x0], x1        //row16
727
728    // LDMFD sp!,{x12,pc}
729    ldp       x19, x20, [sp], #16
730    pop_v_regs
731    ret
732
733
734
735///**
736//*******************************************************************************
737//*
738//* @brief
739//*     Performs filtering of a luma block vertical edge when the
740//*     boundary strength is set to 4
741//*
742//* @par Description:
743//*       This operation is described in  Sec. 8.7.2.4 under the title
744//*       "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
745//*
746//* @param[in] x0 - pu1_src
747//*  Pointer to the src sample q0
748//*
749//* @param[in] w1 - src_strd
750//*  Source stride
751//*
752//* @param[in] w2 - alpha
753//*  Alpha Value for the boundary
754//*
755//* @param[in] w3 - beta
756//*  Beta Value for the boundary
757//*
758//* @returns
759//*  None
760//*
761//* @remarks
762//*  None
763//*
764//*******************************************************************************
765//*/
766
767    .global ih264_deblk_luma_vert_bs4_av8
768
769ih264_deblk_luma_vert_bs4_av8:
770
771    // STMFD sp!,{x12,x14}
772    push_v_regs
773    stp       x19, x20, [sp, #-16]!
774
775    sub       x0, x0, #4                //pointer uc_edgePixel-4
776    mov       x17, x0
777    //loading p3:p2:p1:p0:q0:q1:q2:q3 for every row
778    ld1       {v0.8b}, [x0], x1         //row1
779    ld1       {v2.8b}, [x0], x1         //row2
780    ld1       {v4.8b}, [x0], x1         //row3
781    ld1       {v6.8b}, [x0], x1         //row4
782    ld1       {v8.8b}, [x0], x1         //row5
783    ld1       {v10.8b}, [x0], x1        //row6
784    ld1       {v12.8b}, [x0], x1        //row7
785    ld1       {v14.8b}, [x0], x1        //row8
786    ld1       {v1.8b}, [x0], x1         //row9
787    ld1       {v3.8b}, [x0], x1         //row10
788    ld1       {v5.8b}, [x0], x1         //row11
789    ld1       {v7.8b}, [x0], x1         //row12
790    ld1       {v9.8b}, [x0], x1         //row13
791    ld1       {v11.8b}, [x0], x1        //row14
792    ld1       {v13.8b}, [x0], x1        //row15
793    ld1       {v15.8b}, [x0], x1        //row16
794
795    //taking two 8x8 transposes
796    //2X2 transposes
797    trn1      v21.8b, v0.8b, v2.8b
798    trn2      v2.8b, v0.8b, v2.8b       //row1 &2
799    mov       v0.8b, v21.8b
800    trn1      v21.8b, v4.8b, v6.8b
801    trn2      v6.8b, v4.8b, v6.8b       //row3&row4
802    mov       v4.8b, v21.8b
803    trn1      v21.8b, v8.8b, v10.8b
804    trn2      v10.8b, v8.8b, v10.8b     //row5&6
805    mov       v8.8b, v21.8b
806    trn1      v21.8b, v12.8b, v14.8b
807    trn2      v14.8b, v12.8b, v14.8b    //row7 & 8
808    mov       v12.8b, v21.8b
809    trn1      v21.8b, v1.8b, v3.8b
810    trn2      v3.8b, v1.8b, v3.8b       //row9 &10
811    mov       v1.8b , v21.8b
812    trn1      v21.8b, v5.8b, v7.8b
813    trn2      v7.8b, v5.8b, v7.8b       //row11 & 12
814    mov       v5.8b , v21.8b
815    trn1      v21.8b, v9.8b, v11.8b
816    trn2      v11.8b, v9.8b, v11.8b     //row13 &14
817    mov       v9.8b , v21.8b
818    trn1      v21.8b, v13.8b, v15.8b
819    trn2      v15.8b, v13.8b, v15.8b    //row15 & 16
820    mov       v13.8b , v21.8b
821    //4x4 transposes
822    trn1      v21.4h, v2.4h, v6.4h
823    trn2      v6.4h, v2.4h, v6.4h       //row2 & row4
824    mov       v2.8b, v21.8b
825    trn1      v21.4h, v10.4h, v14.4h
826    trn2      v14.4h, v10.4h, v14.4h    //row6 & row8
827    mov       v10.8b , v21.8b
828    trn1      v21.4h, v3.4h, v7.4h
829    trn2      v7.4h, v3.4h, v7.4h       //row10 & 12
830    mov       v3.8b, v21.8b
831    trn1      v21.4h, v11.4h, v15.4h
832    trn2      v15.4h, v11.4h, v15.4h    //row14 & row16
833    mov       v11.8b, v21.8b
834    trn1      v21.2s, v6.2s, v14.2s
835    trn2      v14.2s, v6.2s, v14.2s     //row4 & 8
836    mov       v6.8b, v21.8b
837    trn1      v21.2s, v7.2s, v15.2s
838    trn2      v15.2s, v7.2s, v15.2s     //row 12 & 16
839    mov       v7.8b, v21.8b
840    //now Q3 ->p0 and Q7->q3
841    trn1      v21.4h, v0.4h, v4.4h
842    trn2      v4.4h, v0.4h, v4.4h       //row1 & 3
843    mov       v0.8b , v21.8b
844    trn1      v21.4h, v8.4h, v12.4h
845    trn2      v12.4h, v8.4h, v12.4h     //row 5 & 7
846    mov       v8.8b, v21.8b
847    trn1      v21.4h, v1.4h, v5.4h
848    trn2      v5.4h, v1.4h, v5.4h       //row9 & row11
849    mov       v1.8b, v21.8b
850    trn1      v21.4h, v9.4h, v13.4h
851    trn2      v13.4h, v9.4h, v13.4h     //row13 & row15
852    mov       v9.8b , v21.8b
853    trn1      v21.2s, v0.2s, v8.2s
854    trn2      v8.2s, v0.2s, v8.2s       //row1 & row5
855    mov       v0.8b, v21.8b
856    trn1      v21.2s, v1.2s, v9.2s
857    trn2      v9.2s, v1.2s, v9.2s       //row9 & 13
858    mov       v1.8b, v21.8b
859    //now Q0->p3 & Q4->q0
860    //starting processing as p0 and q0 are now ready
861    //now Q1->p2 & Q5->q1
862    mov       v31.d[0], v14.d[0]
863    mov       v31.d[1], v15.d[0]
864    trn1      v21.2s, v4.2s, v12.2s
865    trn2      v12.2s, v4.2s, v12.2s     //row3 & 7
866    mov       v4.8b, v21.8b
867    movi      v28.8h, #2
868    trn1      v21.2s, v5.2s, v13.2s
869    trn2      v13.2s, v5.2s, v13.2s     //row11 & row15
870    mov       v5.8b, v21.8b
871    uaddl     v16.8h, v6.8b, v8.8b      //p0+q0 L
872    trn1      v21.2s, v2.2s, v10.2s
873    trn2      v10.2s, v2.2s, v10.2s     //row2 &6
874    mov       v2.8b, v21.8b
875    uaddl     v18.8h, v7.8b, v9.8b      //p0+q0 H
876    trn1      v21.2s, v3.2s, v11.2s
877    trn2      v11.2s, v3.2s, v11.2s     //row10&row14
878    mov       v3.8b, v21.8b
879    uaddw     v20.8h, v16.8h , v4.8b    //p0+q0+p1 L
880    uaddw     v22.8h, v18.8h , v5.8b    //p0+q0+p1 H
881    uaddl     v24.8h, v2.8b, v10.8b     //p2+q1 L
882    uaddl     v26.8h, v3.8b, v11.8b     //p2+q1 H
883    mla       v24.8h, v20.8h , v28.8h   //p2 + X2(p1) + X2(p0) + X2(q0) + q1 L
884    mla       v26.8h, v22.8h , v28.8h   //p2 + X2(p1) + X2(p0) + X2(q0) + q1 H
885    movi      v28.16b, #2
886    uaddw     v16.8h, v20.8h , v2.8b    //p0+q0+p1+p2 L
887    uaddw     v18.8h, v22.8h , v3.8b    //p0+q0+p1+p2 H
888    dup       v30.16b, w2               //duplicate alpha
889    rshrn     v20.8b, v16.8h, #2        //(p2 + p1 + p0 + q0 + 2) >> 2)L p1'
890    rshrn     v21.8b, v18.8h, #2        //(p2 + p1 + p0 + q0 + 2) >> 2)H p1'
891    mov       v20.d[1] , v21.d[0]
892    mov       v0.d[1] , v1.d[0]
893    mov       v2.d[1] , v3.d[0]
894    mov       v4.d[1] , v5.d[0]
895    mov       v6.d[1] , v7.d[0]
896    mov       v8.d[1] , v9.d[0]
897    mov       v10.d[1] , v11.d[0]
898    mov       v12.d[1] , v13.d[0]
899    mov       v14.d[1] , v15.d[0]
900    uabd      v22.16b  , v6.16b, v8.16b
901    usra      v28.16b, v30.16b, #2      //alpha >>2 +2
902    uabd      v30.16b  , v2.16b, v6.16b
903    rshrn     v24.8b, v24.8h, #3        //((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) L p0'
904    rshrn     v25.8b, v26.8h, #3        //((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) H p0'
905    mov       v24.d[1] , v25.d[0]
906    dup       v26.16b, w3               //beta
907    cmhi      v28.16b, v28.16b , v22.16b //ABS(p0 - q0) <((Alpha >>2) + 2)
908    uaddl     v22.8h, v6.8b, v10.8b     //p0+q1 L
909    cmhi      v14.16b, v26.16b , v30.16b //beta>Ap
910    uaddl     v30.8h, v7.8b, v11.8b     //p0+q1 H
911    uaddw     v22.8h, v22.8h , v4.8b    //p0+q1+p1 L
912    uaddw     v30.8h, v30.8h , v5.8b    //p0+q1+p1 H
913    uaddw     v22.8h, v22.8h , v4.8b    //p0+q1+2*p1 L
914    uaddw     v30.8h, v30.8h , v5.8b    //p0+q1+2*p1 H
915    and       v14.16b, v14.16b , v28.16b //(Ap < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)
916    rshrn     v22.8b, v22.8h, #2        //((X2(p1) + p0 + q1 + 2) >> 2) L p0"
917    rshrn     v23.8b, v30.8h, #2        //((X2(p1) + p0 + q1 + 2) >> 2) H p0"
918    mov       v22.d[1] , v23.d[0]
919    uaddl     v30.8h, v2.8b, v0.8b      //p2+p3 L
920    bif       v24.16b, v22.16b , v14.16b //p0' or p0 "
921    uaddl     v22.8h, v3.8b, v1.8b      //p2+p3 H
922    add       v30.8h, v30.8h , v30.8h   //2*(p2+p3) L
923    add       v22.8h, v22.8h , v22.8h   //2*(p2+p3)H
924    add       v16.8h, v16.8h , v30.8h   //(X2(p3) + X3(p2) + p1 + p0 + q0) L
925    add       v18.8h, v18.8h , v22.8h   //(X2(p3) + X3(p2) + p1 + p0 + q0) H
926    uabd      v30.16b  , v12.16b, v8.16b
927    uabd      v22.16b  , v10.16b, v8.16b
928    rshrn     v16.8b, v16.8h, #3        //((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); L p2'
929    rshrn     v17.8b, v18.8h, #3        //((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); H p2'
930    mov       v16.d[1] , v17.d[0]
931    uabd      v18.16b  , v4.16b, v6.16b
932    cmhi      v30.16b, v26.16b , v30.16b //Aq < Beta
933    cmhs      v22.16b, v22.16b, v26.16b
934    cmhs      v18.16b, v18.16b, v26.16b
935    dup       v26.16b, w2               //duplicate alpha
936    and       v30.16b, v30.16b , v28.16b //(Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
937    uabd      v28.16b  , v6.16b, v8.16b
938    orr       v22.16b, v22.16b , v18.16b //ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta
939    uaddl     v18.8h, v6.8b, v8.8b      //p0+q0 L
940    cmhs      v28.16b, v28.16b, v26.16b
941    uaddl     v26.8h, v7.8b, v9.8b      //p0+q0 H
942    uaddw     v18.8h, v18.8h , v10.8b   //p0+q0+q1 L
943    orr       v22.16b, v22.16b , v28.16b //ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta||ABS(p0 - q0) >= Alpha
944    uaddw     v26.8h, v26.8h , v11.8b   //p0+q0+q1 H
945    bic       v14.16b, v14.16b , v22.16b //final condn for p's
946    movi      v28.16b, #2
947    bif       v6.16b, v24.16b , v22.16b //final p0
948    bit       v2.16b, v16.16b , v14.16b //final p2
949    bif       v20.16b, v4.16b , v14.16b //final p1
950    mov       v7.d[0] , v6.d[1]
951    mov       v3.d[0] , v2.d[1]
952    mov       v21.d[0] , v20.d[1]
953    uaddl     v24.8h, v8.8b, v4.8b      //q0+p1 L
954    umlal     v24.8h, v10.8b, v28.8b    //X2(q1) + q0 + p1 L
955    uaddl     v16.8h, v9.8b, v5.8b      //q0+p1 H
956    umlal     v16.8h, v11.8b, v28.8b    //X2(q1) + q0 + p1 H
957    movi      v28.8h, #2
958    uaddl     v14.8h, v4.8b, v12.8b     //p1+q2 L
959    mla       v14.8h, v18.8h , v28.8h   //p1 + X2(p0) + X2(q0) + X2(q1) + q2L
960    uaddl     v4.8h, v5.8b, v13.8b      //p1+q2H
961    mla       v4.8h, v26.8h , v28.8h    //p1 + X2(p0) + X2(q0) + X2(q1) + q2H
962    rshrn     v24.8b, v24.8h, #2        //(X2(q1) + q0 + p1 + 2) >> 2; L q0'
963    rshrn     v25.8b, v16.8h, #2        //(X2(q1) + q0 + p1 + 2) >> 2; H q0'
964    mov       v24.d[1] , v25.d[0]
965    uaddw     v18.8h, v18.8h , v12.8b   //p0 + q0 + q1 + q2 L
966    uaddw     v26.8h, v26.8h , v13.8b   //p0 + q0 + q1 + q2 H
967    rshrn     v16.8b, v14.8h, #3        //(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 L qo"
968    mov       v14.16b, v31.16b
969    rshrn     v17.8b, v4.8h, #3         //(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 H qo"
970    mov       v16.d[1] , v17.d[0]
971    rshrn     v4.8b, v18.8h, #2         //p0 + q0 + q1 + q2 + 2)>>2 L q1'
972    rshrn     v5.8b, v26.8h, #2         //p0 + q0 + q1 + q2 + 2)>>2 H q1'
973    mov       v4.d[1] , v5.d[0]
974    bit       v24.16b, v16.16b , v30.16b //q0' or q0"
975    bic       v30.16b, v30.16b , v22.16b //final condn for q's
976    trn1      v31.8b, v0.8b, v2.8b
977    trn2      v2.8b, v0.8b, v2.8b       //row1 &2
978    mov       v0.8b, v31.8b
979    bit       v10.16b, v4.16b , v30.16b
980    mov       v11.d[0] , v10.d[1]
981    mov       v25.d[0] , v24.d[1]
982    mov       v31.d[0] , v30.d[1]
983    trn1      v31.8b, v1.8b, v3.8b
984    trn2      v3.8b, v1.8b, v3.8b       //row9 &10
985    mov       v1.8b, v31.8b
986    uaddl     v16.8h, v12.8b, v14.8b    //q2+q3 L
987    trn1      v31.8b, v20.8b, v6.8b
988    trn2      v6.8b, v20.8b, v6.8b      //row3&row4
989    mov       v20.8b , v31.8b
990    uaddl     v4.8h, v13.8b, v15.8b     //q2+q3 H
991    trn1      v31.8b, v21.8b, v7.8b
992    trn2      v7.8b, v21.8b, v7.8b      //row11 & 12
993    mov       v21.8b , v31.8b
994    mla       v18.8h, v16.8h , v28.8h   //X2(q3) + X3(q2) + q1 + q0 + p0 L
995    trn1      v31.4h, v2.4h, v6.4h
996    trn2      v6.4h, v2.4h, v6.4h       //row2 & row4
997    mov       v2.8b, v31.8b
998    mla       v26.8h, v4.8h , v28.8h    //X2(q3) + X3(q2) + q1 + q0 + p0 H
999    trn1      v31.4h, v3.4h, v7.4h
1000    trn2      v7.4h, v3.4h, v7.4h       //row10 & 12
1001    mov       v3.8b , v31.8b
1002    bif       v8.16b, v24.16b , v22.16b //final q0
1003    mov       v9.d[0] , v8.d[1]
1004    trn1      v31.4h, v0.4h, v20.4h
1005    trn2      v20.4h, v0.4h, v20.4h     //row1 & 3
1006    mov       v0.8b , v31.8b
1007    rshrn     v18.8b, v18.8h, #3        //(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3; L
1008    trn1      v31.4h, v1.4h, v21.4h
1009    trn2      v21.4h, v1.4h, v21.4h     //row9 & row11
1010    mov       v1.8b, v31.8b
1011    rshrn     v19.8b, v26.8h, #3        //(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3; H
1012    mov       v18.d[1] , v19.d[0]
1013    trn1      v31.8b, v8.8b, v10.8b
1014    trn2      v10.8b, v8.8b, v10.8b     //row5&6
1015    mov       v8.8b, v31.8b
1016    bit       v12.16b, v18.16b , v30.16b //final q2
1017    mov       v13.d[0] , v12.d[1]
1018    trn1      v31.8b, v9.8b, v11.8b
1019    trn2      v11.8b, v9.8b, v11.8b     //row13 &14
1020    mov       v9.8b, v31.8b
1021    trn1      v31.8b, v12.8b, v14.8b
1022    trn2      v14.8b, v12.8b, v14.8b    //row7 & 8
1023    mov       v12.8b, v31.8b
1024    trn1      v31.8b, v13.8b, v15.8b
1025    trn2      v15.8b, v13.8b, v15.8b    //row15 & 16
1026    mov       v13.8b , v31.8b
1027    trn1      v31.4h, v10.4h, v14.4h
1028    trn2      v14.4h, v10.4h, v14.4h    //row6 & row8
1029    mov       v10.8b, v31.8b
1030    trn1      v31.4h, v11.4h, v15.4h
1031    trn2      v15.4h, v11.4h, v15.4h    //row14 & row16
1032    mov       v11.8b, v31.8b
1033    //now Q3 ->p0 and Q7->q3
1034    trn1      v31.4h, v8.4h, v12.4h
1035    trn2      v12.4h, v8.4h, v12.4h     //row 5 & 7
1036    mov       v8.8b, v31.8b
1037    trn1      v31.4h, v9.4h, v13.4h
1038    trn2      v13.4h, v9.4h, v13.4h     //row13 & row15
1039    mov       v9.8b, v31.8b
1040    sub       x0, x0, x1, lsl#4         //restore pointer
1041    trn1      v31.2s, v6.2s, v14.2s
1042    trn2      v14.2s, v6.2s, v14.2s     //row4 & 8
1043    mov       v6.8b , v31.8b
1044    trn1      v31.2s, v7.2s, v15.2s
1045    trn2      v15.2s, v7.2s, v15.2s     //row 12 & 16
1046    mov       v7.8b, v31.8b
1047    trn1      v31.2s, v0.2s, v8.2s
1048    trn2      v8.2s, v0.2s, v8.2s       //row1 & row5
1049    mov       v0.8b , v31.8b
1050    trn1      v31.2s, v1.2s, v9.2s
1051    trn2      v9.2s, v1.2s, v9.2s       //row9 & 13
1052    mov       v1.8b , v31.8b
1053    trn1      v31.2s, v2.2s, v10.2s
1054    trn2      v10.2s, v2.2s, v10.2s     //row2 &6
1055    mov       v2.8b , v31.8b
1056    trn1      v31.2s, v3.2s, v11.2s
1057    trn2      v11.2s, v3.2s, v11.2s     //row10&row14
1058    mov       v3.8b , v31.8b
1059    trn1      v31.2s, v20.2s, v12.2s
1060    trn2      v12.2s, v20.2s, v12.2s    //row3 & 7
1061    mov       v20.8b , v31.8b
1062    trn1      v31.2s, v21.2s, v13.2s
1063    trn2      v13.2s, v21.2s, v13.2s    //row11 & row15
1064    mov       v21.8b, v31.8b
1065    st1       {v0.8b}, [x0], x1         //row1
1066    st1       {v2.8b}, [x0], x1         //row2
1067    st1       {v20.8b}, [x0], x1        //row3
1068    st1       {v6.8b}, [x0], x1         //row4
1069    st1       {v8.8b}, [x0], x1         //row5
1070    st1       {v10.8b}, [x0], x1        //row6
1071    st1       {v12.8b}, [x0], x1        //row7
1072    st1       {v14.8b}, [x0], x1        //row8
1073    st1       {v1.8b}, [x0], x1         //row9
1074    st1       {v3.8b}, [x0], x1         //row10
1075    st1       {v21.8b}, [x0], x1        //row11
1076    st1       {v7.8b}, [x0], x1         //row12
1077    st1       {v9.8b}, [x0], x1         //row13
1078    st1       {v11.8b}, [x0], x1        //row14
1079    st1       {v13.8b}, [x0], x1        //row15
1080    st1       {v15.8b}, [x0], x1        //row16
1081
1082    // LDMFD sp!,{x12,pc}
1083    ldp       x19, x20, [sp], #16
1084    pop_v_regs
1085    ret
1086
1087
1088