1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20///*****************************************************************************/
21///*                                                                           */
22///*  File Name         : ih264_deblk_chroma_av8.s                              */
23///*                                                                           */
///*  Description       : Contains function definitions for deblocking chroma  */
///*                      edges. Functions are coded in NEON assembly and can  */
26///*                      be compiled using ARM RVDS.                          */
27///*                                                                           */
28///*  List of Functions : ih264_deblk_chroma_vert_bs4_av8()              */
29///*                      ih264_deblk_chroma_vert_bslt4_av8()            */
30///*                      ih264_deblk_chroma_horz_bs4_av8()              */
31///*                      ih264_deblk_chroma_horz_bslt4_av8()            */
32///*  Issues / Problems : None                                                 */
33///*                                                                           */
34///*  Revision History  :                                                      */
35///*                                                                           */
36///*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
37///*         28 11 2013   Ittiam          Draft                                */
38///*****************************************************************************/
39
40
41.text
42.p2align 2
43.include "ih264_neon_macros.s"
44
45///**
46//*******************************************************************************
47//*
48//* @brief
49//*     Performs filtering of a chroma block horizontal edge when the
50//*     boundary strength is set to 4 in high profile
51//*
52//* @par Description:
53//*       This operation is described in  Sec. 8.7.2.4 under the title
54//*       "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
55//*
56//* @param[in] x0 - pu1_src
57//*  Pointer to the src sample q0
58//*
59//* @param[in] x1 - src_strd
60//*  Source stride
61//*
62//* @param[in] x2 - alpha_cb
63//*  Alpha Value for the boundary in U
64//*
65//* @param[in] x3 - beta_cb
66//*  Beta Value for the boundary in U
67//*
//* @param[in] x4 - alpha_cr
//*    Alpha Value for the boundary in V
//*
//* @param[in] x5 - beta_cr
//*    Beta Value for the boundary in V
73//*
74//* @returns
75//*  None
76//*
77//* @remarks
78//*  None
79//*
80//*******************************************************************************
81//*/
82
83    .global ih264_deblk_chroma_horz_bs4_av8
84
ih264_deblk_chroma_horz_bs4_av8:

    // AAPCS64 args: x0 = pu1_src (points at q0 row), x1 = src_strd,
    // w2 = alpha_cb, w3 = beta_cb, w4 = alpha_cr, w5 = beta_cr.
    // U and V thresholds are packed into one 128-bit register each
    // (low half = Cb, high half = Cr) so both planes filter in one pass.
    // STMFD sp!,{x4-x6,x14}            //
    push_v_regs
    stp       x19, x20, [sp, #-16]!
    mov       x6, x5                    //x6 = beta_cr
    mov       x5, x4                    //x5 = alpha_cr
    sub       x0, x0, x1, lsl #1        //x0 = uc_edgePixel pointing to p1 of chroma
    ld2       {v6.8b, v7.8b}, [x0], x1  //D6 = p1u , D7 = p1v
    mov       x4, x0                    //Keeping a backup of the pointer p0 of chroma
    ld2       {v4.8b, v5.8b}, [x0], x1  //D4 = p0u , D5 = p0v
    dup       v20.8b, w2                //D20 contains alpha_cb
    dup       v21.8b, w5                //D21 contains alpha_cr
    mov       v20.d[1], v21.d[0]        //Q10 = (alpha_cr, alpha_cb)
    ld2       {v0.8b, v1.8b}, [x0], x1  //D0 = q0u , D1 = q0v
    uaddl     v8.8h, v6.8b, v0.8b       //
    uaddl     v10.8h, v7.8b, v1.8b      //Q4,Q5 = q0 + p1
    movi      v31.8b, #2                //constant 2 used as the x2 weight in umlal below
    ld2       {v2.8b, v3.8b}, [x0]      //D2 = q1u , D3 = q1v
    mov       v0.d[1], v1.d[0]          //pack U (low) and V (high) halves of q0
    mov       v2.d[1], v3.d[0]          //pack q1
    mov       v4.d[1], v5.d[0]          //pack p0
    mov       v6.d[1], v7.d[0]          //pack p1
    uabd      v26.16b, v6.16b , v4.16b  //Q13 = ABS(p1 - p0)
    umlal     v8.8h, v2.8b, v31.8b      //
    umlal     v10.8h, v3.8b, v31.8b     //Q5,Q4 = (X2(q1U) + q0U + p1U)
    uabd      v22.16b, v4.16b , v0.16b  //Q11 = ABS(p0 - q0)
    uabd      v24.16b, v2.16b , v0.16b  //Q12 = ABS(q1 - q0)
    uaddl     v14.8h, v4.8b, v2.8b      //
    uaddl     v28.8h, v5.8b, v3.8b      //Q14,Q7 = P0 + Q1
    dup       v16.8b, w3                //D16 contains beta_cb
    dup       v17.8b, w6                //D17 contains beta_cr
    mov       v16.d[1], v17.d[0]        //Q8 = (beta_cr, beta_cb)
    umlal     v14.8h, v6.8b, v31.8b     //
    umlal     v28.8h, v7.8b, v31.8b     //Q14,Q7 = (X2(p1U) + p0U + q1U)
    cmhs      v18.16b, v22.16b, v20.16b //ABS(p0 - q0) >= Alpha ?
    cmhs      v24.16b, v24.16b, v16.16b //ABS(q1 - q0) >= Beta ?
    cmhs      v26.16b, v26.16b, v16.16b //ABS(p1 - p0) >= Beta ?
    rshrn     v8.8b, v8.8h, #2          //
    rshrn     v9.8b, v10.8h, #2         //Q4 = (X2(q1U) + q0U + p1U + 2) >> 2
    mov       v8.d[1], v9.d[0]
    orr       v18.16b, v18.16b , v24.16b //Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
    rshrn     v10.8b, v14.8h, #2        //
    rshrn     v11.8b, v28.8h, #2        //Q5 = (X2(p1U) + p0U + q1U + 2) >> 2
    mov       v10.d[1], v11.d[0]
    orr       v18.16b, v18.16b , v26.16b //Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta )
    bit       v10.16b, v4.16b , v18.16b //keep original p0 where the filter condition fails
    bit       v8.16b, v0.16b , v18.16b  //keep original q0 where the filter condition fails
    mov       v11.d[0], v10.d[1]
    mov       v9.d[0], v8.d[1]
    st2       {v10.8b, v11.8b}, [x4], x1 //store filtered p0 row (U,V re-interleaved)
    st2       {v8.8b, v9.8b}, [x4]      //store filtered q0 row (U,V re-interleaved)
    // LDMFD sp!,{x4-x6,pc}                //
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret
141
142
143
144///**
145//*******************************************************************************
146//*
147//* @brief
148//*     Performs filtering of a chroma block vertical edge when the
149//*     boundary strength is set to 4 in high profile
150//*
151//* @par Description:
152//*       This operation is described in  Sec. 8.7.2.4 under the title
153//*       "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
154//*
155//* @param[in] x0 - pu1_src
156//*  Pointer to the src sample q0
157//*
158//* @param[in] x1 - src_strd
159//*  Source stride
160//*
161//* @param[in] x2 - alpha_cb
162//*  Alpha Value for the boundary in U
163//*
164//* @param[in] x3 - beta_cb
165//*  Beta Value for the boundary in U
166//*
//* @param[in] x4 - alpha_cr
//*    Alpha Value for the boundary in V
//*
//* @param[in] x5 - beta_cr
//*    Beta Value for the boundary in V
172//*
173//* @returns
174//*  None
175//*
176//* @remarks
177//*  None
178//*
179//*******************************************************************************
180//*/
181
182    .global ih264_deblk_chroma_vert_bs4_av8
183
ih264_deblk_chroma_vert_bs4_av8:

    // AAPCS64 args: x0 = pu1_src (q0 of row 0), x1 = src_strd,
    // w2 = alpha_cb, w3 = beta_cb, w4 = alpha_cr, w5 = beta_cr.
    // Vertical edge: 8 rows of (p1,p0,q0,q1) UV pairs are gathered with ld4,
    // transposed into per-column registers, filtered, and written back.
    // STMFD sp!,{x4,x5,x12,x14}
    push_v_regs
    stp       x19, x20, [sp, #-16]!

    sub       x0, x0, #4                //point x0 to p1u of row0.
    mov       x12, x0                   //keep a back up of x0 for buffer write

    add       x2, x2, x4, lsl #8        //x2 = (alpha_cr,alpha_cb)
    add       x3, x3, x5, lsl #8        //x3 = (beta_cr,beta_cb)

    //Each ld4 grabs one row's four UV halfwords: (p1u,p1v),(p0u,p0v),(q0u,q0v),(q1u,q1v).
    ld4       {v0.h, v1.h, v2.h, v3.h}[0], [x0], x1
    ld4       {v0.h, v1.h, v2.h, v3.h}[1], [x0], x1
    ld4       {v0.h, v1.h, v2.h, v3.h}[2], [x0], x1
    ld4       {v0.h, v1.h, v2.h, v3.h}[3], [x0], x1

    ld4       {v4.h, v5.h, v6.h, v7.h}[0], [x0], x1
    ld4       {v4.h, v5.h, v6.h, v7.h}[1], [x0], x1
    ld4       {v4.h, v5.h, v6.h, v7.h}[2], [x0], x1
    ld4       {v4.h, v5.h, v6.h, v7.h}[3], [x0], x1

    //Swap registers (via v10 as scratch) so the packing below yields
    //v0/v2/v4/v6 = p1/p0/q0/q1 (rows 0-3 low, rows 4-7 high).
    mov       v10.16b, v2.16b
    mov       v2.16b, v1.16b
    mov       v1.16b, v4.16b
    mov       v4.16b, v10.16b
    mov       v10.16b, v6.16b
    mov       v6.16b, v3.16b
    mov       v3.16b, v5.16b
    mov       v5.16b, v10.16b

    dup       v22.8h, w2                //Q11 = alpha (each 16-bit lane = (alpha_cr<<8)|alpha_cb, so byte lanes alternate Cb,Cr)
    dup       v24.8h, w3                //Q12 = beta (same Cb/Cr byte interleave)
    movi      v31.8b, #2                //constant 2 used as the x2 weight in umlal below

    mov       v0.d[1], v1.d[0]          //pack p1
    mov       v2.d[1], v3.d[0]          //pack p0
    mov       v4.d[1], v5.d[0]          //pack q0
    mov       v6.d[1], v7.d[0]          //pack q1

    uabd      v8.16b, v2.16b , v4.16b   //|p0-q0|
    uabd      v10.16b, v6.16b , v4.16b  //|q1-q0|
    uabd      v12.16b, v0.16b , v2.16b  //|p1-p0|
    uaddl     v14.8h, v2.8b, v6.8b
    uaddl     v16.8h, v3.8b, v7.8b      //(p0 + q1)
    cmhi      v8.16b, v22.16b , v8.16b  //|p0-q0| < alpha ?
    cmhi      v10.16b, v24.16b , v10.16b //|q1-q0| < beta ?
    cmhi      v12.16b, v24.16b , v12.16b //|p1-p0| < beta ?
    umlal     v14.8h, v0.8b, v31.8b
    umlal     v16.8h, v1.8b, v31.8b     //2*p1 + (p0 + q1)
    uaddl     v18.8h, v0.8b, v4.8b
    uaddl     v20.8h, v1.8b, v5.8b      //(p1 + q0)
    and       v8.16b, v8.16b , v10.16b  //|p0-q0| < alpha && |q1-q0| < beta
    umlal     v18.8h, v6.8b, v31.8b
    umlal     v20.8h, v7.8b, v31.8b     //2*q1 + (p1 + q0)

    rshrn     v14.8b, v14.8h, #2
    rshrn     v15.8b, v16.8h, #2        //(2*p1 + (p0 + q1) + 2) >> 2
    mov       v14.d[1], v15.d[0]
    and       v8.16b, v8.16b , v12.16b  //|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
    rshrn     v18.8b, v18.8h, #2
    rshrn     v19.8b, v20.8h, #2        //(2*q1 + (p1 + q0) + 2) >> 2
    mov       v18.d[1], v19.d[0]
    bit       v2.16b, v14.16b , v8.16b  //p0 := filtered value only where all conditions hold
    bit       v4.16b, v18.16b , v8.16b  //q0 := filtered value only where all conditions hold

    //Unpack high halves and undo the earlier register swap so st4
    //re-interleaves the rows in their original column order.
    mov       v1.d[0], v0.d[1]
    mov       v3.d[0], v2.d[1]
    mov       v5.d[0], v4.d[1]
    mov       v7.d[0], v6.d[1]

    mov       v10.16b, v1.16b
    mov       v1.16b, v2.16b
    mov       v2.16b, v4.16b
    mov       v4.16b, v10.16b
    mov       v10.16b, v3.16b
    mov       v3.16b, v6.16b
    mov       v6.16b, v5.16b
    mov       v5.16b, v10.16b

    st4       {v0.h, v1.h, v2.h, v3.h}[0], [x12], x1
    st4       {v0.h, v1.h, v2.h, v3.h}[1], [x12], x1
    st4       {v0.h, v1.h, v2.h, v3.h}[2], [x12], x1
    st4       {v0.h, v1.h, v2.h, v3.h}[3], [x12], x1

    st4       {v4.h, v5.h, v6.h, v7.h}[0], [x12], x1
    st4       {v4.h, v5.h, v6.h, v7.h}[1], [x12], x1
    st4       {v4.h, v5.h, v6.h, v7.h}[2], [x12], x1
    st4       {v4.h, v5.h, v6.h, v7.h}[3], [x12], x1

    // LDMFD sp!,{x4,x5,x12,pc}
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret
278
279
280
281///**
282//*******************************************************************************
283//*
284//* @brief
285//*     Performs filtering of a chroma block horizontal edge for cases where the
286//*     boundary strength is less than 4 in high profile
287//*
288//* @par Description:
//*       This operation is described in  Sec. 8.7.2.3 under the title
//*       "Filtering process for edges for bS less than 4" in ITU T Rec H.264.
291//*
292//* @param[in] x0 - pu1_src
293//*  Pointer to the src sample q0
294//*
295//* @param[in] x1 - src_strd
296//*  Source stride
297//*
298//* @param[in] x2 - alpha_cb
299//*  Alpha Value for the boundary in U
300//*
301//* @param[in] x3 - beta_cb
302//*  Beta Value for the boundary in U
303//*
//* @param[in] x4 - alpha_cr
//*    Alpha Value for the boundary in V
//*
//* @param[in] x5 - beta_cr
//*    Beta Value for the boundary in V
//*
//* @param[in] x6 - u4_bs
//*    Packed Boundary strength array
//*
//* @param[in] x7 - pu1_cliptab_cb
//*    tc0_table for U
//*
//* @param[in] sp(0) - pu1_cliptab_cr
//*    tc0_table for V
318//*
319//* @returns
320//*  None
321//*
322//* @remarks
323//*  None
324//*
325//*******************************************************************************
326//*/
327
328    .global ih264_deblk_chroma_horz_bslt4_av8
329
ih264_deblk_chroma_horz_bslt4_av8:

    // AAPCS64 args: x0 = pu1_src (q0 row), x1 = src_strd, w2 = alpha_cb,
    // w3 = beta_cb, w4 = alpha_cr, w5 = beta_cr, w6 = u4_bs (packed),
    // x7 = pu1_cliptab_cb, and pu1_cliptab_cr on the stack.
    // STMFD sp!,{x4-x9,x14}        //
    push_v_regs
    stp       x19, x20, [sp, #-16]!
    mov       x8, x7                    //x8 = pu1_cliptab_cb
    mov       x7, x6                    //x7 = u4_bs
    ldr       x9, [sp, #80]             //x9 = pu1_cliptab_cr (stack arg; #80 skips push_v_regs + x19/x20 saves)
    sub       x0, x0, x1, lsl #1        //x0 = uc_edgePixelU pointing to p1 of chroma U
    rev       w7, w7                    //reverse bytes so per-pixel strengths line up with lane order
    mov       v12.s[0], w7              //D12[0] = ui_Bs
    ld1       {v16.s}[0], [x8]          //D16[0] contains cliptab_cb
    ld1       {v17.s}[0], [x9]          //D17[0] contains cliptab_cr
    ld2       {v6.8b, v7.8b}, [x0], x1  //Q3=p1
    tbl       v14.8b, {v16.16b}, v12.8b //Retrieving cliptab values for U
    tbl       v28.8b, {v17.16b}, v12.8b //Retrieving cliptab values for V
    uxtl      v12.8h, v12.8b            //Q6 = uc_Bs in each 16 bit scalar
    mov       x6, x0                    //Keeping a backup of the pointer to chroma U P0
    ld2       {v4.8b, v5.8b}, [x0], x1  //Q2=p0
    movi      v30.8b, #1                //constant 1: tC = tC0 + 1
    dup       v20.8b, w2                //D20 contains alpha_cb
    dup       v21.8b, w4                //D21 contains alpha_cr
    mov       v20.d[1], v21.d[0]        //Q10 = (alpha_cr, alpha_cb)
    ld2       {v0.8b, v1.8b}, [x0], x1  //Q0=q0
    uxtl      v14.8h, v14.8b            //
    uxtl      v28.8h, v28.8b            //
    mov       v15.d[0], v28.d[0]        //D14 has cliptab values for U, D15 for V
    mov       v14.d[1], v28.d[0]
    ld2       {v2.8b, v3.8b}, [x0]      //Q1=q1
    usubl     v10.8h, v1.8b, v5.8b      //
    usubl     v8.8h, v0.8b, v4.8b       //Q5,Q4 = (q0 - p0)
    mov       v6.d[1], v7.d[0]          //pack p1 (U low, V high)
    mov       v4.d[1], v5.d[0]          //pack p0
    uabd      v26.16b, v6.16b , v4.16b  //Q13 = ABS(p1 - p0)
    shl       v10.8h, v10.8h, #2        //Q5 = (q0 - p0)<<2
    mov       v0.d[1], v1.d[0]          //pack q0
    uabd      v22.16b, v4.16b , v0.16b  //Q11 = ABS(p0 - q0)
    shl       v8.8h, v8.8h, #2          //Q4 = (q0 - p0)<<2
    mov       v14.d[1], v15.d[0]
    sli       v14.8h, v14.8h, #8        //replicate each clip value into both bytes of its 16-bit lane
    mov       v15.d[0], v14.d[1]
    mov       v2.d[1], v3.d[0]          //pack q1
    uabd      v24.16b, v2.16b , v0.16b  //Q12 = ABS(q1 - q0)
    cmhs      v18.16b, v22.16b, v20.16b //ABS(p0 - q0) >= Alpha ?
    usubl     v20.8h, v6.8b, v2.8b      //Q10 = (p1 - q1)L
    usubl     v6.8h, v7.8b, v3.8b       //Q3 = (p1 - q1)H
    dup       v16.8b, w3                //D16 contains beta_cb
    dup       v17.8b, w5                //D17 contains beta_cr
    mov       v16.d[1], v17.d[0]        //Q8 = (beta_cr, beta_cb)
    add       v8.8h, v8.8h , v20.8h     //
    add       v10.8h, v10.8h , v6.8h    //Q5,Q4 = [ (q0 - p0)<<2 ] + (p1 - q1)
    cmhs      v24.16b, v24.16b, v16.16b //ABS(q1 - q0) >= Beta ?
    cmgt      v12.4h, v12.4h, #0        //u4_bs != 0 ?
    sqrshrn   v8.8b, v8.8h, #3          //
    sqrshrn   v9.8b, v10.8h, #3         //Q4 = i_macro = (((q0 - p0)<<2) + (p1 - q1) + 4)>>3
    mov       v8.d[1], v9.d[0]
    add       v14.8b, v14.8b , v30.8b   //D14 = C = C0+1 for U
    cmhs      v26.16b, v26.16b, v16.16b //ABS(p1 - p0) >= Beta ?
    orr       v18.16b, v18.16b , v24.16b //Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
    abs       v6.16b, v8.16b            //Q4 = ABS (i_macro)
    add       v15.8b, v15.8b , v30.8b   //D15 = C = C0+1 for V
    mov       v14.d[1], v15.d[0]
    mov       v13.8b, v12.8b
    mov       v12.d[1], v13.d[0]        //replicate bS mask to both halves
    orr       v18.16b, v18.16b , v26.16b //Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta )
    umin      v14.16b, v6.16b , v14.16b //Q7 = delta = (ABS(i_macro) > C) ? C : ABS(i_macro)
    bic       v12.16b, v12.16b , v18.16b //final condition
    cmge      v8.16b, v8.16b, #0        //sign mask: i_macro >= 0 ?
    and       v14.16b, v14.16b , v12.16b //Making delta zero in places where values shouldn't be filtered
    uqadd     v16.16b, v4.16b , v14.16b //Q8 = p0 + delta
    uqsub     v4.16b, v4.16b , v14.16b  //Q2 = p0 - delta
    uqadd     v18.16b, v0.16b , v14.16b //Q9 = q0 + delta
    uqsub     v0.16b, v0.16b , v14.16b  //Q0 = q0 - delta
    bif       v16.16b, v4.16b , v8.16b  //Q8 = (i_macro >= 0 ) ? (p0+delta) : (p0-delta)
    bif       v0.16b, v18.16b , v8.16b  //Q0 = (i_macro >= 0 ) ? (q0-delta) : (q0+delta)
    mov       v17.d[0], v16.d[1]
    mov       v1.d[0], v0.d[1]
    st2       {v16.8b, v17.8b}, [x6], x1 //store filtered p0 row (U,V re-interleaved)
    st2       {v0.8b, v1.8b}, [x6]      //store filtered q0 row (U,V re-interleaved)

    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret
413
414
415
416
417///**
418//*******************************************************************************
419//*
420//* @brief
421//*     Performs filtering of a chroma block vertical edge for cases where the
422//*     boundary strength is less than 4 in high profile
423//*
424//* @par Description:
//*       This operation is described in  Sec. 8.7.2.3 under the title
//*       "Filtering process for edges for bS less than 4" in ITU T Rec H.264.
427//*
428//* @param[in] x0 - pu1_src
429//*  Pointer to the src sample q0
430//*
431//* @param[in] x1 - src_strd
432//*  Source stride
433//*
434//* @param[in] x2 - alpha_cb
435//*  Alpha Value for the boundary in U
436//*
437//* @param[in] x3 - beta_cb
438//*  Beta Value for the boundary in U
439//*
//* @param[in] x4 - alpha_cr
//*    Alpha Value for the boundary in V
//*
//* @param[in] x5 - beta_cr
//*    Beta Value for the boundary in V
//*
//* @param[in] x6 - u4_bs
//*    Packed Boundary strength array
//*
//* @param[in] x7 - pu1_cliptab_cb
//*    tc0_table for U
//*
//* @param[in] sp(0) - pu1_cliptab_cr
//*    tc0_table for V
454//*
455//* @returns
456//*  None
457//*
458//* @remarks
459//*  None
460//*
461//*******************************************************************************
462//*/
463
464    .global ih264_deblk_chroma_vert_bslt4_av8
465
ih264_deblk_chroma_vert_bslt4_av8:

    // AAPCS64 args: x0 = pu1_src (q0 of row 0), x1 = src_strd, w2 = alpha_cb,
    // w3 = beta_cb, w4 = alpha_cr, w5 = beta_cr, w6 = u4_bs (packed),
    // x7 = pu1_cliptab_cb, and pu1_cliptab_cr on the stack.
    // STMFD sp!,{x4-x7,x10-x12,x14}
    push_v_regs
    stp       x19, x20, [sp, #-16]!
    mov       x10, x7                   //x10 = pu1_cliptab_cb
    ldr       x11, [sp, #80]            //x11 = pu1_cliptab_cr (stack arg; #80 skips push_v_regs + x19/x20 saves)
    sub       x0, x0, #4                //point x0 to p1u of row0.
    add       x2, x2, x4, lsl #8        //x2 = (alpha_cr,alpha_cb)
    add       x3, x3, x5, lsl #8        //x3 = (beta_cr,beta_cb)
    mov       x12, x0                   //keep a back up of x0 for buffer write
    //Each ld4 grabs one row's four UV halfwords: (p1u,p1v),(p0u,p0v),(q0u,q0v),(q1u,q1v).
    ld4       {v0.h, v1.h, v2.h, v3.h}[0], [x0], x1
    ld4       {v0.h, v1.h, v2.h, v3.h}[1], [x0], x1
    ld4       {v0.h, v1.h, v2.h, v3.h}[2], [x0], x1
    ld4       {v0.h, v1.h, v2.h, v3.h}[3], [x0], x1

    ld4       {v4.h, v5.h, v6.h, v7.h}[0], [x0], x1
    ld4       {v4.h, v5.h, v6.h, v7.h}[1], [x0], x1
    ld4       {v4.h, v5.h, v6.h, v7.h}[2], [x0], x1
    ld4       {v4.h, v5.h, v6.h, v7.h}[3], [x0], x1

    //Swap registers (via v10 as scratch) so the packing below yields
    //v0/v2/v4/v6 = p1/p0/q0/q1 (rows 0-3 low, rows 4-7 high).
    mov       v10.16b, v2.16b
    mov       v2.16b, v1.16b
    mov       v1.16b, v4.16b
    mov       v4.16b, v10.16b
    mov       v10.16b, v6.16b
    mov       v6.16b, v3.16b
    mov       v3.16b, v5.16b
    mov       v5.16b, v10.16b
    dup       v22.8h, w2                //Q11 = alpha (each 16-bit lane = (alpha_cr<<8)|alpha_cb, so byte lanes alternate Cb,Cr)
    mov       v2.d[1], v3.d[0]          //pack p0
    mov       v4.d[1], v5.d[0]          //pack q0
    uabd      v8.16b, v2.16b , v4.16b   //|p0-q0|
    dup       v24.8h, w3                //Q12 = beta (same Cb/Cr byte interleave)
    mov       v25.d[0], v24.d[1]
    mov       v6.d[1], v7.d[0]          //pack q1
    mov       v0.d[1], v1.d[0]          //pack p1
    uabd      v10.16b, v6.16b , v4.16b  //|q1-q0|
    uabd      v12.16b, v0.16b , v2.16b  //|p1-p0|
    cmhi      v8.16b, v22.16b , v8.16b  //|p0-q0| < alpha ?
    usubl     v14.8h, v0.8b, v6.8b
    cmhi      v10.16b, v24.16b , v10.16b //|q1-q0| < beta ?
    usubl     v16.8h, v1.8b, v7.8b      //(p1 - q1)
    cmhi      v12.16b, v24.16b , v12.16b //|p1-p0| < beta ?
    usubl     v18.8h, v4.8b, v2.8b
    and       v8.16b, v8.16b , v10.16b  //|p0-q0| < alpha && |q1-q0| < beta
    usubl     v20.8h, v5.8b, v3.8b      //(q0 - p0)
    movi      v28.8h, #4                //constant 4 for the 4*(q0-p0) term
    ld1       {v24.s}[0], [x10]         //Load ClipTable for U
    ld1       {v25.s}[0], [x11]         //Load ClipTable for V
    rev       w6, w6                    //Blocking strengths (byte-reversed to match lane order)
    and       v8.16b, v8.16b , v12.16b  //|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
    mov       v10.s[0], w6
    mla       v14.8h, v18.8h , v28.8h
    mla       v16.8h, v20.8h , v28.8h   //4*(q0 - p0) + (p1 - q1)
    uxtl      v10.8h, v10.8b
    sli       v10.4h, v10.4h, #8        //replicate each bS byte into both bytes of its 16-bit lane
    tbl       v12.8b, {v24.16b}, v10.8b //tC0 for U
    tbl       v13.8b, {v25.16b}, v10.8b //tC0 for V
    zip1      v31.8b, v12.8b, v13.8b    //interleave U/V tC0 values to match pixel order
    zip2      v13.8b, v12.8b, v13.8b
    mov       v12.8b, v31.8b
    mov       v12.d[1], v13.d[0]
    uxtl      v10.4s, v10.4h
    sli       v10.4s, v10.4s, #16       //spread bS across all four bytes of each 32-bit lane
    movi      v24.16b, #1
    add       v12.16b, v12.16b , v24.16b //tC0 + 1
    cmhs      v10.16b, v10.16b , v24.16b //u4_bs != 0 ?
    and       v8.16b, v8.16b , v10.16b  //|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0
    // Q0 - Q3(inputs),
    // Q4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0),
    // Q6 (tC)
    srshr     v14.8h, v14.8h, #3
    srshr     v16.8h, v16.8h, #3        //(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
    cmgt      v18.8h, v14.8h , #0
    cmgt      v20.8h, v16.8h , #0
    xtn       v18.8b, v18.8h
    xtn       v19.8b, v20.8h            //Q9 = sign(delta)
    mov       v18.d[1], v19.d[0]
    abs       v14.8h, v14.8h
    abs       v16.8h, v16.8h
    xtn       v14.8b, v14.8h
    xtn       v15.8b, v16.8h
    mov       v14.d[1], v15.d[0]
    umin      v14.16b, v14.16b , v12.16b //Q7 = |delta| clipped to tC
    uqadd     v20.16b, v2.16b , v14.16b //p0+|delta|
    uqadd     v22.16b, v4.16b , v14.16b //q0+|delta|
    uqsub     v24.16b, v2.16b , v14.16b //p0-|delta|
    uqsub     v26.16b, v4.16b , v14.16b //q0-|delta|
    bit       v24.16b, v20.16b , v18.16b //p0 + delta
    bit       v22.16b, v26.16b , v18.16b //q0 - delta
    bit       v2.16b, v24.16b , v8.16b  //apply filtered p0 only where all conditions hold
    bit       v4.16b, v22.16b , v8.16b  //apply filtered q0 only where all conditions hold
    //Unpack high halves and undo the earlier register swap so st4
    //re-interleaves the rows in their original column order.
    mov       v1.d[0], v0.d[1]
    mov       v3.d[0], v2.d[1]
    mov       v5.d[0], v4.d[1]
    mov       v7.d[0], v6.d[1]
    mov       v10.16b, v1.16b
    mov       v1.16b, v2.16b
    mov       v2.16b, v4.16b
    mov       v4.16b, v10.16b
    mov       v10.16b, v3.16b
    mov       v3.16b, v6.16b
    mov       v6.16b, v5.16b
    mov       v5.16b, v10.16b
    st4       {v0.h, v1.h, v2.h, v3.h}[0], [x12], x1
    st4       {v0.h, v1.h, v2.h, v3.h}[1], [x12], x1
    st4       {v0.h, v1.h, v2.h, v3.h}[2], [x12], x1
    st4       {v0.h, v1.h, v2.h, v3.h}[3], [x12], x1

    st4       {v4.h, v5.h, v6.h, v7.h}[0], [x12], x1
    st4       {v4.h, v5.h, v6.h, v7.h}[1], [x12], x1
    st4       {v4.h, v5.h, v6.h, v7.h}[2], [x12], x1
    st4       {v4.h, v5.h, v6.h, v7.h}[3], [x12], x1

    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret
584
585
586