1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///*******************************************************************************
19//* @file
//*  ihevc_deblk_luma_horz.s
21//*
22//* @brief
//*  contains the function definition for deblocking of luma samples across a
//*  horizontal edge. the function is hand-written aarch64 (armv8) neon
//*  assembly in gnu assembler syntax.
27//*
28//* @author
29//*  anand s
30//*
//* @par list of functions:
//*  - ihevc_deblk_luma_horz_av8()
//*
34//* @remarks
35//*  none
36//*
37//*******************************************************************************/
38
.text
.align 4


.extern gai4_ihevc_tc_table
.extern gai4_ihevc_beta_table
.globl ihevc_deblk_luma_horz_av8

.type ihevc_deblk_luma_horz_av8, %function

//------------------------------------------------------------------------------
// ihevc_deblk_luma_horz_av8
//
// Deblocks four 8-bit luma columns across one horizontal edge.  "p" rows lie
// above the edge (x0 - k*stride, k = 1..4), "q" rows lie on/below it
// (x0 + k*stride, k = 0..3).
//
// ABI:  AAPCS64, GNU assembler syntax, ELF.
//
// In:   x0        = address of the first of the four q0 samples
//       w1        = stride between rows, in bytes (signed 32-bit)
//       w2        = bs (only bs >> 1 is used, in the tc index)
//       w3, w4    = quant_param_p / quant_param_q (only their sum is used)
//       w5        = beta_offset_div2
//       w6        = tc_offset_div2
//       w7        = filter_flag_p (p rows are written only when this is 1)
//       [sp, #0]  = filter_flag_q (q rows are written only when this is 1)
//
// Out:  no value is returned; filtered samples are stored back to the rows
//       around x0 (up to 3 rows per side for the strong filter, up to 2 rows
//       per side for the normal filter).
//
// Clobbers: x1-x12, x14, v0-v7, v16-v31, condition flags.
//           x19-x22 and d8-d15 are saved on entry and restored on exit.
// Stack:    96 bytes.
//
// Outline:
//   qp_luma   = (quant_param_p + quant_param_q + 1) >> 1
//   beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)
//   tc_indx   = clip3(qp_luma + 2 * (bs >> 1) + (tc_offset_div2 << 1), 0, 53)
//   beta/tc are read from gai4_ihevc_beta_table / gai4_ihevc_tc_table.
//   Nothing is filtered when tc == 0 or when d >= beta.
//   de = 2 selects the strong filter, de = 1 the normal filter.
//------------------------------------------------------------------------------
ihevc_deblk_luma_horz_av8:
    // AAPCS64 leaves bits [63:32] of a 32-bit argument unspecified.  Every
    // integer argument that takes part in 64-bit arithmetic below is therefore
    // sign-extended first (w7 and the stack argument are only read as w regs).
    sxtw        x1, w1                      // stride
    sxtw        x2, w2                      // bs
    sxtw        x3, w3                      // quantisation parameter (one side)
    sxtw        x4, w4                      // quantisation parameter (other side)
    sxtw        x5, w5                      // beta_offset_div2
    sxtw        x6, w6                      // tc_offset_div2

    // SP has to stay 16-byte aligned while it is used as a base register, so
    // registers are always pushed in pairs.  d8 is saved only as the partner
    // of d9; this function never writes v8.
    stp         d8, d9, [sp, #-16]!
    stp         d10, d11, [sp, #-16]!
    stp         d12, d13, [sp, #-16]!
    stp         d14, d15, [sp, #-16]!
    stp         x19, x20, [sp, #-16]!
    stp         x21, x22, [sp, #-16]!

    mov         x21, x7                     // w21 = filter_flag_p
    ldr         w22, [sp, #96]              // w22 = filter_flag_q (first stack argument,
                                            // 96 = bytes pushed by the six stp above)

    // ---- table indices ------------------------------------------------------
    add         x3, x3, x4
    add         x3, x3, #1
    asr         x3, x3, #1                  // x3 = qp_luma
    add         x7, x3, x5, lsl #1          // x7 = qp_luma + 2 * beta_offset_div2
    add         x3, x3, x6, lsl #1          // x3 = qp_luma + 2 * tc_offset_div2
    asr         x2, x2, #1                  // bs >> 1
    add         x3, x3, x2, lsl #1          // x3 += 2 * (bs >> 1)

    mov         x20, #51
    cmp         x7, x20
    csel        x7, x20, x7, gt
    cmp         x7, #0
    csel        x7, xzr, x7, lt             // x7 = beta_indx, clipped to [0, 51]

    mov         x20, #53
    cmp         x3, x20
    csel        x3, x20, x3, gt
    cmp         x3, #0
    csel        x3, xzr, x3, lt             // x3 = tc_indx, clipped to [0, 53]

    adrp        x2, :got:gai4_ihevc_beta_table
    ldr         x2, [x2, #:got_lo12:gai4_ihevc_beta_table]

    adrp        x4, :got:gai4_ihevc_tc_table
    ldr         x4, [x4, #:got_lo12:gai4_ihevc_tc_table]

    ldr         w5, [x2, x7, lsl #2]        // x5 = beta (32-bit table entry)
    ldr         w6, [x4, x3, lsl #2]        // x6 = tc   (32-bit table entry)

    cbz         x6, .Ldeblk_luma_horz_exit  // tc == 0: nothing to filter

    // ---- load rows p2..q2 (four samples each) and derive dp0 / dq0 ----------
    // Each 32-bit row is kept twice: replicated in a vector register for the
    // filters, and (column 0 only) in a general register for the decisions.
    movi        v0.4h, #0x2                 // v0.h[0] = 2, multiplier for the strong filter
    lsl         x7, x6, #1                  // 2 * tc
    add         x14, x1, x1, lsl #1         // 3 * stride
    neg         x19, x14
    ldr         w8, [x0, x19]               // row -3 (p2)
    dup         v1.8b, w7                   // v1 = 2 * tc in every byte lane
    lsl         x19, x1, #1
    neg         x19, x19
    ldr         w10, [x0, x19]              // row -2 (p1)
    dup         v23.2s, w8                  // v23 = p2
    neg         x19, x1
    ldr         w11, [x0, x19]              // row -1 (p0)
    dup         v24.2s, w10                 // v24 = p1
    and         x8, x8, #0xff               // p2, column 0
    ldr         w12, [x0]                   // row 0 (q0)
    dup         v25.2s, w11                 // v25 = p0
    and         x10, x10, #0xff             // p1, column 0
    ldr         w9, [x0, x1]                // row 1 (q1)
    dup         v26.2s, w12                 // v26 = q0
    and         x11, x11, #0xff             // p0, column 0
    lsl         x19, x1, #1
    ldr         w2, [x0, x19]               // row 2 (q2)
    dup         v27.2s, w9                  // v27 = q1
    and         x12, x12, #0xff             // q0, column 0
    dup         v28.2s, w2                  // v28 = q2
    and         x9, x9, #0xff               // q1, column 0
    and         x2, x2, #0xff               // q2, column 0

    add         x12, x12, x2
    subs        x9, x12, x9, lsl #1
    csneg       x9, x9, x9, pl              // x9 = dq0 = abs(q2 - 2 * q1 + q0), column 0

    add         x8, x8, x11
    subs        x8, x8, x10, lsl #1
    csneg       x8, x8, x8, pl              // x8 = dp0 = abs(p2 - 2 * p1 + p0), column 0

    // ---- same second differences for column 3 -------------------------------
    add         x3, x1, x1, lsl #1          // 3 * stride
    add         x14, x0, #3                 // x14 -> q0 sample of column 3

    neg         x19, x3
    ldrb        w2, [x14, x19]              // p2
    lsl         x19, x1, #1
    neg         x19, x19
    ldrb        w10, [x14, x19]             // p1
    neg         x19, x1
    ldrb        w11, [x14, x19]             // p0
    ldrb        w12, [x14]                  // q0
    ldrb        w3, [x14, x1]               // q1
    lsl         x19, x1, #1
    ldrb        w4, [x14, x19]              // q2

    add         x12, x12, x4
    subs        x12, x12, x3, lsl #1
    csneg       x12, x12, x12, pl           // x12 = dq3 = abs(q2 - 2 * q1 + q0), column 3

    add         x2, x2, x11
    subs        x11, x2, x10, lsl #1
    csneg       x11, x11, x11, pl           // x11 = dp3 = abs(p2 - 2 * p1 + p0), column 3

    add         x3, x8, x9                  // x3  = d0 = dp0 + dq0
    add         x4, x11, x12                // x4  = d3 = dp3 + dq3
    add         x14, x8, x11                // x14 = dp = dp0 + dp3
    add         x12, x12, x9                // x12 = dq = dq0 + dq3
    add         x11, x3, x4                 // x11 = d  = d0 + d3

    cmp         x11, x5
    bge         .Ldeblk_luma_horz_exit      // filter only when d < beta

    // ---- strong-filter decision, interleaved with the strong-filter maths ---
    // Strong filtering (de = 2) needs, for column 0 AND column 3:
    //     2 * dN < (beta >> 2)
    //  && abs(q3 - q0) + abs(p0 - p3) < (beta >> 3)
    //  && abs(q0 - p0) < ((5 * tc + 1) >> 1)
    // Any failing test falls back to the normal-filter decision.  The NEON
    // work issued in between is only consumed if all tests pass.
    //
    // Live scalars here: x0, x1, x3 (d0), x4 (d3), x5 (beta), x6 (tc),
    //                    x12 (dq), x14 (dp).
    asr         x10, x5, #2                 // beta >> 2
    uqadd       v30.8b, v26.8b, v1.8b       // q0 + 2 * tc (saturating)
    cmp         x10, x3, lsl #1
    uqsub       v31.8b, v26.8b, v1.8b       // q0 - 2 * tc (saturating)
    ble         .Ldeblk_luma_horz_normal_decision
    add         x10, x1, x1, lsl #1         // 3 * stride
    uaddl       v6.8h, v25.8b, v26.8b       // p0 + q0
    neg         x19, x1
    ldr         w2, [x0, x19, lsl #2]       // row -4 (p3)
    ldrb        w7, [x0, x19]               // p0, column 0
    dup         v22.2s, w2                  // v22 = p3
    uaddw       v7.8h, v6.8h, v27.8b        // p0 + q0 + q1
    ldrb        w3, [x0]                    // q0, column 0 (d0 is no longer needed)
    uqadd       v16.8b, v27.8b, v1.8b       // q1 + 2 * tc
    and         x2, x2, #0xff               // p3, column 0
    mul         v12.8h, v7.8h, v0.h[0]      // 2 * (p0 + q0 + q1)
    ldr         w8, [x0, x10]               // row 3 (q3)
    uaddl       v10.8h, v24.8b, v28.8b      // p1 + q2
    subs        x2, x2, x7
    uqsub       v17.8b, v27.8b, v1.8b       // q1 - 2 * tc
    dup         v29.2s, w8                  // v29 = q3
    and         x8, x8, #0xff               // q3, column 0
    add         v12.8h, v12.8h, v10.8h      // p1 + 2*p0 + 2*q0 + 2*q1 + q2
    csneg       x2, x2, x2, pl              // abs(p3 - p0)
    rshrn       v20.8b, v12.8h, #3          // filtered q0 before clipping
    subs        x8, x8, x3
    csneg       x8, x8, x8, pl              // abs(q3 - q0)
    umin        v18.8b, v20.8b, v30.8b
    add         x8, x8, x2

    cmp         x8, x5, asr #3
    bge         .Ldeblk_luma_horz_normal_decision
    uaddw       v14.8h, v7.8h, v28.8b       // p0 + q0 + q1 + q2
    subs        x7, x3, x7
    umax        v4.8b, v18.8b, v31.8b       // v4 = filtered q0, within q0 +/- 2*tc
    csneg       x7, x7, x7, pl              // abs(q0 - p0)
    uqadd       v30.8b, v28.8b, v1.8b       // q2 + 2 * tc
    add         x10, x6, x6, lsl #2         // 5 * tc
    rshrn       v21.8b, v14.8h, #2          // filtered q1 before clipping
    uqsub       v31.8b, v28.8b, v1.8b       // q2 - 2 * tc
    add         x10, x10, #1
    cmp         x7, x10, asr #1             // against (5 * tc + 1) >> 1
    umin        v18.8b, v21.8b, v16.8b
    bge         .Ldeblk_luma_horz_normal_decision

    // Column 0 passed; repeat the three tests for column 3.
    umax        v5.8b, v18.8b, v17.8b       // v5 = filtered q1, within q1 +/- 2*tc
    asr         x10, x5, #2                 // beta >> 2
    uaddl       v16.8h, v29.8b, v28.8b      // q3 + q2
    cmp         x10, x4, lsl #1
    ble         .Ldeblk_luma_horz_normal_decision

    add         x10, x1, x1, lsl #1         // 3 * stride
    mul         v16.8h, v16.8h, v0.h[0]     // 2 * (q3 + q2)
    add         x4, x0, #3                  // x4 -> q0 sample of column 3 (d3 no longer needed)

    lsl         x19, x1, #2
    neg         x19, x19
    ldrb        w2, [x4, x19]               // p3, column 3
    add         v16.8h, v16.8h, v14.8h      // 2*q3 + 3*q2 + q1 + q0 + p0
    neg         x19, x1
    ldrb        w7, [x4, x19]               // p0, column 3
    rshrn       v19.8b, v16.8h, #3          // filtered q2 before clipping
    ldrb        w3, [x4]                    // q0, column 3
    ldrb        w8, [x4, x10]               // q3, column 3

    subs        x8, x8, x3
    umin        v18.8b, v19.8b, v30.8b
    csneg       x8, x8, x8, pl              // abs(q3 - q0)
    uaddl       v6.8h, v25.8b, v24.8b       // p0 + p1
    subs        x2, x2, x7
    umax        v3.8b, v18.8b, v31.8b       // v3 = filtered q2, within q2 +/- 2*tc
    csneg       x2, x2, x2, pl              // abs(p3 - p0)
    uaddw       v7.8h, v6.8h, v26.8b        // p0 + p1 + q0
    add         x8, x8, x2
    uqadd       v30.8b, v25.8b, v1.8b       // p0 + 2 * tc
    cmp         x8, x5, asr #3
    uqsub       v31.8b, v25.8b, v1.8b       // p0 - 2 * tc
    bge         .Ldeblk_luma_horz_normal_decision
    mul         v12.8h, v7.8h, v0.h[0]      // 2 * (p0 + p1 + q0)
    subs        x7, x3, x7
    uqadd       v16.8b, v24.8b, v1.8b       // p1 + 2 * tc
    csneg       x7, x7, x7, pl              // abs(q0 - p0)
    uaddl       v10.8h, v23.8b, v27.8b      // p2 + q1
    add         x10, x6, x6, lsl #2         // 5 * tc
    uqsub       v17.8b, v24.8b, v1.8b       // p1 - 2 * tc
    add         v12.8h, v12.8h, v10.8h      // p2 + 2*p1 + 2*p0 + 2*q0 + q1
    add         x10, x10, #1
    rshrn       v20.8b, v12.8h, #3          // filtered p0 before clipping
    cmp         x7, x10, asr #1             // against (5 * tc + 1) >> 1
    uaddw       v14.8h, v7.8h, v23.8b       // p2 + p1 + p0 + q0
    bge         .Ldeblk_luma_horz_normal_decision
    umin        v18.8b, v20.8b, v30.8b
    mov         x2, #2                      // de = 2: strong filter
    uqadd       v30.8b, v23.8b, v1.8b       // p2 + 2 * tc
    mov         w4, w21                     // x4 = filter_flag_p
    umax        v2.8b, v18.8b, v31.8b       // v2 = filtered p0, within p0 +/- 2*tc
    mov         w5, w22                     // x5 = filter_flag_q (beta is no longer needed)
    rshrn       v21.8b, v14.8h, #2          // filtered p1 before clipping
    b           .Ldeblk_luma_horz_decided

    // ---- normal-filter decision (de = 1) ------------------------------------
    // dep / deq enable the extra p1 / q1 modification:
    //     tc == 1          -> dep = deq = 0
    //     otherwise        -> dep = (dp < side_thr), deq = (dq < side_thr)
    //                         with side_thr = (beta + (beta >> 1)) >> 3,
    //                         each evaluated only for a side whose flag is set.
.Ldeblk_luma_horz_normal_decision:
    mov         x2, #1                      // de = 1: normal filter
    mov         x11, x5                     // x11 = beta
    mov         w4, w21                     // x4 = filter_flag_p
    mov         w5, w22                     // x5 = filter_flag_q
    mov         x9, #0                      // dep
    mov         x10, #0                     // deq

    cmp         x6, #1
    beq         .Ldeblk_luma_horz_decided

    add         x8, x11, x11, asr #1
    asr         x8, x8, #3                  // x8 = side_thr

    and         x7, x4, x5
    cmp         x7, #1
    beq         .Ldeblk_luma_horz_both_sides
    cbz         x4, .Ldeblk_luma_horz_q_side_only

    cmp         x8, x14                     // p side only: deq stays 0
    cset        x9, gt
    b           .Ldeblk_luma_horz_decided

.Ldeblk_luma_horz_q_side_only:
    cmp         x8, x12                     // dep stays 0
    cset        x10, gt
    b           .Ldeblk_luma_horz_decided

.Ldeblk_luma_horz_both_sides:
    cmp         x8, x14
    cset        x9, gt
    cmp         x8, x12
    cset        x10, gt

    // ---- dispatch -----------------------------------------------------------
    // x0 = source address   x1 = stride   x2 = de   x4 = filter_flag_p
    // x5 = filter_flag_q    x6 = tc       x9 = dep  x10 = deq
    // The two vector instructions below belong to the strong filter; on the
    // normal path their results are overwritten before being read.
.Ldeblk_luma_horz_decided:
    umin        v18.8b, v21.8b, v16.8b      // min(filtered p1, p1 + 2*tc)
    cmp         x2, #1
    uqsub       v31.8b, v23.8b, v1.8b       // p2 - 2 * tc
    beq         .Ldeblk_luma_horz_normal_filter
    uaddl       v7.8h, v23.8b, v22.8b       // p2 + p3
    cmp         x5, #1
    bne         .Ldeblk_luma_horz_strong_p_check

    // Strong filter, q side: rows 0, 1, 2.
    mov         x12, x0
    st1         {v4.s}[0], [x12], x1
    st1         {v5.s}[0], [x12], x1
    st1         {v3.s}[0], [x12]

    // filter_flag_p is tested on every route into the p-side stores, so the p
    // rows stay untouched when it is not set, even if filter_flag_q is also
    // clear (the normal filter below behaves the same way).
.Ldeblk_luma_horz_strong_p_check:
    cmp         x4, #1
    bne         .Ldeblk_luma_horz_exit

    // Strong filter, p side: rows -1, -2, -3.
    umax        v5.8b, v18.8b, v17.8b       // filtered p1, within p1 +/- 2*tc
    mul         v7.8h, v7.8h, v0.h[0]       // 2 * (p2 + p3)
    neg         x11, x1                     // -stride, post-index step
    add         v16.8h, v7.8h, v14.8h       // 2*p3 + 3*p2 + p1 + p0 + q0
    add         x12, x0, x11                // x12 -> row -1
    rshrn       v19.8b, v16.8h, #3          // filtered p2 before clipping
    st1         {v2.s}[0], [x12], x11       // row -1
    umin        v18.8b, v19.8b, v30.8b
    st1         {v5.s}[0], [x12], x11       // row -2
    umax        v3.8b, v18.8b, v31.8b       // filtered p2, within p2 +/- 2*tc
    st1         {v3.s}[0], [x12]            // row -3
    b           .Ldeblk_luma_horz_exit

    // ---- normal filter ------------------------------------------------------
    // v22..v29 hold rows -4..3 as loaded above (v22 and v29 only if the strong
    // decision got that far; they are not read here).
.Ldeblk_luma_horz_normal_filter:
    movi        v0.4h, #0x9
    usubl       v10.8h, v26.8b, v25.8b      // q0 - p0
    mul         v10.8h, v10.8h, v0.h[0]     // 9 * (q0 - p0)

    movi        v0.4h, #0x3
    usubl       v12.8h, v27.8b, v24.8b      // q1 - p1
    mul         v12.8h, v12.8h, v0.h[0]     // 3 * (q1 - p1)

    dup         v30.8b, w6                  // +tc in every lane
    neg         x12, x6
    dup         v31.8b, w12                 // -tc in every lane

    sub         v10.8h, v10.8h, v12.8h
    srshr       v10.8h, v10.8h, #4          // delta = (9*(q0-p0) - 3*(q1-p1) + 8) >> 4

    abs         v7.8h, v10.8h
    xtn         v9.8b, v7.8h                // v9 = abs(delta), narrowed to bytes

    sqxtn       v10.8b, v10.8h              // delta saturated to a signed byte
    smin        v11.8b, v10.8b, v30.8b
    smax        v7.8b, v31.8b, v11.8b       // v7 = clip3(delta, -tc, tc)

    uxtl        v6.8h, v25.8b
    saddw       v4.8h, v6.8h, v7.8b
    sqxtun      v12.8b, v4.8h               // v12 = clip_u8(p0 + delta)
    uxtl        v6.8h, v26.8b
    ssubw       v4.8h, v6.8h, v7.8b
    sqxtun      v13.8b, v4.8h               // v13 = clip_u8(q0 - delta)

    mov         x11, #0xa
    mul         x12, x11, x6
    dup         v2.8b, w12                  // v2 = 10 * tc in every lane
    mov         v18.8b, v24.8b              // row -2 output defaults to the original p1
    dup         v0.8b, w6
    sshr        v0.8b, v0.8b, #1            // v0 = tc >> 1
    neg         v1.8b, v0.8b                // v1 = -(tc >> 1)

    cmp         x4, #1
    bne         .Ldeblk_luma_horz_normal_q
    cmp         x9, #1
    bne         .Ldeblk_luma_horz_normal_store_p

    // dep set: delta_p = clip3((((p2 + p0 + 1) >> 1) - p1 + delta) >> 1,
    //                          -(tc >> 1), tc >> 1)
    uaddl       v14.8h, v23.8b, v25.8b
    rshrn       v14.8b, v14.8h, #1
    usubl       v14.8h, v14.8b, v24.8b
    saddw       v14.8h, v14.8h, v7.8b
    sqshrn      v14.8b, v14.8h, #1
    smin        v15.8b, v14.8b, v0.8b
    smax        v14.8b, v1.8b, v15.8b       // v14 = delta_p

    uxtl        v16.8h, v24.8b
    saddw       v16.8h, v16.8h, v14.8b
    sqxtun      v14.8b, v16.8h              // v14 = clip_u8(p1 + delta_p)

    // Lanes with abs(delta) >= 10 * tc keep the original sample.
    cmhs        v18.8b, v9.8b, v2.8b
    bsl         v18.8b, v24.8b, v14.8b

.Ldeblk_luma_horz_normal_store_p:
    neg         x11, x1                     // -stride, post-index step
    add         x12, x0, x11                // x12 -> row -1
    cmhs        v19.8b, v9.8b, v2.8b
    bsl         v19.8b, v25.8b, v12.8b      // original p0 where abs(delta) >= 10 * tc
    st1         {v19.s}[0], [x12], x11      // row -1
    st1         {v18.s}[0], [x12]           // row -2

.Ldeblk_luma_horz_normal_q:
    cmp         x5, #1
    bne         .Ldeblk_luma_horz_exit
    cmp         x10, #1
    mov         v18.8b, v27.8b              // row 1 output defaults to the original q1
    bne         .Ldeblk_luma_horz_normal_store_q

    // deq set: delta_q = clip3((((q0 + q2 + 1) >> 1) - q1 - delta) >> 1,
    //                          -(tc >> 1), tc >> 1)
    uaddl       v14.8h, v26.8b, v28.8b
    rshrn       v14.8b, v14.8h, #1
    usubl       v14.8h, v14.8b, v27.8b
    ssubw       v14.8h, v14.8h, v7.8b
    sqshrn      v14.8b, v14.8h, #1
    smin        v15.8b, v14.8b, v0.8b
    smax        v14.8b, v1.8b, v15.8b       // v14 = delta_q

    uxtl        v16.8h, v27.8b
    saddw       v16.8h, v16.8h, v14.8b
    sqxtun      v14.8b, v16.8h              // v14 = clip_u8(q1 + delta_q)
    cmhs        v18.8b, v9.8b, v2.8b
    bsl         v18.8b, v27.8b, v14.8b

.Ldeblk_luma_horz_normal_store_q:
    mov         x12, x0
    cmhs        v19.8b, v9.8b, v2.8b
    bsl         v19.8b, v26.8b, v13.8b      // original q0 where abs(delta) >= 10 * tc
    st1         {v19.s}[0], [x12], x1       // row 0
    st1         {v18.s}[0], [x12]           // row 1

    // ---- common epilogue: pop in reverse order of the prologue --------------
.Ldeblk_luma_horz_exit:
    ldp         x21, x22, [sp], #16
    ldp         x19, x20, [sp], #16
    ldp         d14, d15, [sp], #16
    ldp         d12, d13, [sp], #16
    ldp         d10, d11, [sp], #16
    ldp         d8, d9, [sp], #16
    ret

.size ihevc_deblk_luma_horz_av8, . - ihevc_deblk_luma_horz_av8
588
589
590