1@/*****************************************************************************
2@*
3@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4@*
5@* Licensed under the Apache License, Version 2.0 (the "License");
6@* you may not use this file except in compliance with the License.
7@* You may obtain a copy of the License at:
8@*
9@* http://www.apache.org/licenses/LICENSE-2.0
10@*
11@* Unless required by applicable law or agreed to in writing, software
12@* distributed under the License is distributed on an "AS IS" BASIS,
13@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@* See the License for the specific language governing permissions and
15@* limitations under the License.
16@*
17@*****************************************************************************/
18@/*******************************************************************************
19@* @file
20@*  ihevc_deblk_luma_horz.s
21@*
22@* @brief
23@*  contains the function definition for the hevc luma deblocking filter
24@*  applied across a horizontal edge. the function is coded in arm neon
25
26@*  assembly using gnu assembler syntax.
27@*
28@* @author
29@*  anand s
30@*
31@* @par list of functions:
32@*  - ihevc_deblk_luma_horz_a9q()
33@*
34@* @remarks
35@*  none
36@*
37@*******************************************************************************/
38
@ ---------------------------------------------------------------------------
@ Section setup and PC-relative literals for ihevc_deblk_luma_horz_a9q.
@
@ The two lookup tables are defined in another object file. Instead of
@ absolute addresses, each literal below holds the distance from a label
@ inside the function (ulbl1 / ulbl2) to the table. The function loads the
@ literal and executes "add rX,rX,pc" at that label. The "- 8" compensates
@ for pc reading as the address of the executing instruction plus 8, so the
@ literals are only valid when the function is assembled as ARM (not Thumb)
@ code.
@ ---------------------------------------------------------------------------
.text
.align 4

.extern gai4_ihevc_tc_table
.extern gai4_ihevc_beta_table
.globl ihevc_deblk_luma_horz_a9q

@ offset of the tc table, relative to the "add r4,r4,pc" at ulbl1
gai4_ihevc_tc_table_addr:
.long gai4_ihevc_tc_table  - ulbl1 - 8

@ offset of the beta table, relative to the "add r2,r2,pc" at ulbl2
gai4_ihevc_beta_table_addr:
.long gai4_ihevc_beta_table  - ulbl2 - 8
55
@ ---------------------------------------------------------------------------
@ ihevc_deblk_luma_horz_a9q
@
@ Luma deblocking of a 4 pixel wide segment of a horizontal edge.
@ The edge lies between the row at (r0 - r1) and the row at r0. Rows above
@ the edge are called p0 (nearest) .. p3, rows at and below it q0 .. q3.
@
@ Inputs:
@   r0          source address (first pixel of row q0)
@   r1          source stride in bytes
@   r2          bs
@   r3          quant_param_p
@   [sp, #0]    quant_param_q      (offsets as seen by the caller)
@   [sp, #4]    beta_offset_div2
@   [sp, #8]    tc_offset_div2
@   [sp, #12]   filter_flag_p      (compared against 1)
@   [sp, #16]   filter_flag_q      (compared against 1)
@
@ Output:
@   none in registers; up to three rows on each side of the edge are
@   rewritten in place (4 bytes per row).
@
@ Register preservation:
@   r3-r12 and lr are saved and restored. d8-d15 are saved and restored as
@   well, because q4-q7 are used as scratch and the AAPCS requires a callee
@   to preserve d8-d15.
@
@ NOTE(review): rows are read with 32-bit ldr and column 0 is taken from the
@ low byte, so this presumably relies on little-endian byte order and on
@ unaligned word loads being permitted - confirm against the callers.
@ ---------------------------------------------------------------------------

@ Offsets of the stack arguments after the prologue:
@ 11 core registers (44 bytes) + d8-d15 (64 bytes) = 108 bytes pushed.
.equ qp_q_offset,               108
.equ beta_offset_div2_offset,   112
.equ tc_offset_div2_offset,     116
.equ filter_flag_p_offset,      120
.equ filter_flag_q_offset,      124

.type ihevc_deblk_luma_horz_a9q, %function

ihevc_deblk_luma_horz_a9q:
    stmfd       sp!, {r3-r12,lr}
    vpush       {d8-d15}                    @ q4-q7 are clobbered below
    ldr         r4,[sp,#qp_q_offset]        @ quant_param_q
    ldr         r5,[sp,#beta_offset_div2_offset]

    add         r3,r3,r4
    add         r3,r3,#1
    ldr         r6,[sp,#tc_offset_div2_offset]
    asr         r3,r3,#1                    @ r3 = qp_luma
    add         r7,r3,r5,lsl #1             @ r7 = qp_luma + (beta_offset_div2 << 1)
    add         r3,r3,r6,lsl #1             @ r3 = qp_luma + (tc_offset_div2 << 1)
    cmp         r7,#0x33
    movgt       r7,#0x33
    bgt         l1.1532
    cmp         r7,#0x0
    movlt       r7,#0x0                     @ r7 = beta_indx, clipped to 0..51
l1.1532:
    asr         r2,r2,#1

    add         r3,r3,r2,lsl #1             @ add 2 * (bs >> 1)
    cmp         r3,#0x35
    movgt       r3,#0x35
    bgt         l1.1564
    cmp         r3,#0x0
    movlt       r3,#0x0                     @ r3 = tc_indx, clipped to 0..53

    @ qp_luma   = (quant_param_p + quant_param_q + 1) >> 1
    @ beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)
    @ tc_indx   = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)

l1.1564:
    @ form the table addresses from the pc relative literals; each add must
    @ stay directly at its ulbl label because the literal encodes that address
    ldr         r2,gai4_ihevc_beta_table_addr
ulbl2:
    add         r2,r2,pc
    ldr         r4,gai4_ihevc_tc_table_addr
ulbl1:
    add         r4,r4,pc

    ldr         r5,[r2,r7,lsl #2]           @ r5 = beta (32-bit table entry)
    ldr         r6,[r4,r3,lsl #2]           @ r6 = tc   (32-bit table entry)

    cmp         r6,#0
    beq         l1.2404                     @ tc == 0 : nothing to filter

    @ Load rows p2..q2 (4 pixels each) into d23..d28 and, interleaved with
    @ that, isolate the column 0 pixel of each row in a core register.
    vmov.i16    d0,#0x2                     @ constant 2 in every 16-bit lane
    lsl         r7,r6,#1                    @ r7 = 2 * tc
    add         r14,r1,r1,lsl #1            @ r14 = 3 * stride
    ldr         r8,[r0,-r14]                @ row p2
    vdup.8      d1,r7                       @ d1 = 2 * tc in every byte
    ldr         r10,[r0,-r1,lsl #1]         @ row p1
    vdup.32     d23,r8                      @ d23 = p2
    ldr         r11,[r0,-r1]                @ row p0
    vdup.32     d24,r10                     @ d24 = p1
    and         r8,r8,#0xff                 @ p2, column 0
    ldr         r12,[r0,#0]                 @ row q0
    vdup.32     d25,r11                     @ d25 = p0
    and         r10,r10,#0xff               @ p1, column 0
    ldr         r9,[r0,r1]                  @ row q1
    vdup.32     d26,r12                     @ d26 = q0
    and         r11,r11,#0xff               @ p0, column 0
    ldr         r2,[r0,r1,lsl #1]           @ row q2
    vdup.32     d27,r9                      @ d27 = q1
    and         r12,r12,#0xff               @ q0, column 0
    vdup.32     d28,r2                      @ d28 = q2
    and         r9,r9,#0xff                 @ q1, column 0
    and         r2,r2,#0xff                 @ q2, column 0

    add         r12,r12,r2
    subs        r9,r12,r9,lsl #1
    rsbmi       r9,r9,#0                    @ r9 = dq0 = abs(q2 - 2 * q1 + q0), column 0

    add         r8,r8,r11
    subs        r8,r8,r10,lsl #1
    rsbmi       r8,r8,#0                    @ r8 = dp0 = abs(p2 - 2 * p1 + p0), column 0

    @ same second differences for column 3
    add         r3,r1,r1,lsl #1             @ r3 = 3 * stride
    add         r14,r0,#3                   @ r14 -> column 3 of row q0

    ldrb        r2,[r14,-r3]                @ p2
    ldrb        r10,[r14,-r1,lsl #1]        @ p1
    ldrb        r11,[r14,-r1]               @ p0
    ldrb        r12,[r14,#0]                @ q0
    ldrb        r3,[r14,r1]                 @ q1
    ldrb        r4,[r14,r1,lsl #1]          @ q2

    add         r12,r12,r4
    subs        r12,r12,r3,lsl #1
    rsbmi       r12,r12,#0                  @ r12 = dq3 = abs(q2 - 2 * q1 + q0), column 3

    add         r2,r2,r11
    subs        r11,r2,r10,lsl #1
    rsbmi       r11,r11,#0                  @ r11 = dp3 = abs(p2 - 2 * p1 + p0), column 3

    add         r3,r8,r9                    @ r3 = d0 = dp0 + dq0
    add         r4,r11,r12                  @ r4 = d3 = dp3 + dq3

    add         r14,r8,r11                  @ r14 = dp = dp0 + dp3
    add         r12,r12,r9                  @ r12 = dq = dq0 + dq3

    add         r11,r3,r4                   @ r11 = d = d0 + d3

    cmp         r11,r5
    bge         l1.2404                     @ filter only if d < beta

    @ ------------------------------------------------------------------
    @ Strong / weak decision. The strong filter is selected only when, for
    @ column 0 and for column 3,
    @     2 * dN < (beta >> 2)
    @  && abs(q3 - q0) + abs(p0 - p3) < (beta >> 3)
    @  && abs(q0 - p0) < ((5 * tc + 1) >> 1)
    @ Any failing test branches to l1.1840 (weak filter). The strong filter
    @ results are computed in NEON registers in between the scalar tests.
    @
    @ Live here: r0 src, r1 stride, r3 d0, r4 d3, r5 beta, r6 tc,
    @            r12 dq, r14 dp. Scratch: r2, r7, r8, r9, r10, r11.
    @ ------------------------------------------------------------------

    asr         r10,r5,#2                   @ beta >> 2
    vqadd.u8    d30,d26,d1                  @ q0 + 2 * tc (saturated)
    cmp         r10,r3,lsl #1
    vqsub.u8    d31,d26,d1                  @ q0 - 2 * tc (saturated)
    ble         l1.1840                     @ needs 2 * d0 < (beta >> 2)
    add         r10,r1,r1,lsl #1            @ r10 = 3 * stride
    vaddl.u8    q3,d25,d26                  @ p0 + q0
    ldr         r2,[r0,-r1,lsl #2]          @ row p3
    ldrb        r7,[r0,-r1]                 @ p0, column 0
    vdup.32     d22,r2                      @ d22 = p3
    vaddw.u8    q4,q3,d27                   @ p0 + q0 + q1
    ldrb        r3,[r0,#0]                  @ q0, column 0
    vqadd.u8    d16,d27,d1                  @ q1 + 2 * tc
    and         r2,r2,#0xff                 @ p3, column 0
    vmul.i16    q6,q4,d0[0]                 @ 2 * (p0 + q0 + q1)
    ldr         r8,[r0,r10]                 @ row q3
    vaddl.u8    q5,d24,d28                  @ p1 + q2
    subs        r2,r2,r7                    @ p3 - p0 (flags consumed by rsbmi below)
    vqsub.u8    d17,d27,d1                  @ q1 - 2 * tc
    vdup.32     d29,r8                      @ d29 = q3
    and         r8,r8,#0xff                 @ q3, column 0
    vadd.i16    q6,q6,q5                    @ p1 + 2*p0 + 2*q0 + 2*q1 + q2
    rsbmi       r2,r2,#0                    @ abs(p3 - p0)
    vrshrn.i16  d20,q6,#3                   @ strong q0 = (sum + 4) >> 3
    subs        r8,r8,r3
    rsbmi       r8,r8,#0                    @ abs(q3 - q0)
    vmin.u8     d18,d20,d30
    add         r8,r8,r2

    cmp         r8,r5,asr #3
    bge         l1.1840                     @ needs sum < (beta >> 3)
    vaddw.u8    q7,q4,d28                   @ p0 + q0 + q1 + q2
    subs        r7,r3,r7
    vmax.u8     d4,d18,d31                  @ d4 = strong q0, clipped to q0 +/- 2*tc
    rsbmi       r7,r7,#0                    @ abs(q0 - p0)
    vqadd.u8    d30,d28,d1                  @ q2 + 2 * tc
    mov         r10,#5
    vrshrn.i16  d21,q7,#2                   @ strong q1 = (sum + 2) >> 2
    mul         r10,r10,r6
    vqsub.u8    d31,d28,d1                  @ q2 - 2 * tc
    add         r10,r10,#1
    cmp         r7,r10,asr #1
    vmin.u8     d18,d21,d16
    bge         l1.1840                     @ needs abs(q0 - p0) < ((5*tc + 1) >> 1)

    @ column 0 passed; repeat the three tests for column 3

    vmax.u8     d5,d18,d17                  @ d5 = strong q1, clipped to q1 +/- 2*tc
    asr         r10,r5,#2
    vaddl.u8    q8,d29,d28                  @ q3 + q2
    cmp         r10,r4,lsl #1
    ble         l1.1840                     @ needs 2 * d3 < (beta >> 2)

    add         r10,r1,r1,lsl #1            @ r10 = 3 * stride
    vmul.i16    q8,q8,d0[0]                 @ 2 * (q3 + q2)
    add         r4,r0,#3                    @ r4 -> column 3 of row q0

    ldrb        r2,[r4,-r1,lsl #2]          @ p3, column 3
    vadd.i16    q8,q8,q7                    @ 2*q3 + 3*q2 + q1 + q0 + p0
    ldrb        r7,[r4,-r1]                 @ p0, column 3
    vrshrn.i16  d19,q8,#3                   @ strong q2 = (sum + 4) >> 3
    ldrb        r3,[r4,#0]                  @ q0, column 3
    ldrb        r8,[r4,r10]                 @ q3, column 3

    subs        r8,r8,r3
    vmin.u8     d18,d19,d30
    rsbmi       r8,r8,#0                    @ abs(q3 - q0)
    vaddl.u8    q3,d25,d24                  @ p0 + p1
    subs        r2,r2,r7
    vmax.u8     d3,d18,d31                  @ d3 = strong q2, clipped to q2 +/- 2*tc
    rsbmi       r2,r2,#0                    @ abs(p3 - p0)
    vaddw.u8    q4,q3,d26                   @ p0 + p1 + q0
    add         r8,r8,r2
    vqadd.u8    d30,d25,d1                  @ p0 + 2 * tc
    cmp         r8,r5,asr #3
    vqsub.u8    d31,d25,d1                  @ p0 - 2 * tc
    bge         l1.1840
    vmul.i16    q6,q4,d0[0]                 @ 2 * (p0 + p1 + q0)
    subs        r7,r3,r7
    vqadd.u8    d16,d24,d1                  @ p1 + 2 * tc
    rsbmi       r7,r7,#0                    @ abs(q0 - p0)
    vaddl.u8    q5,d23,d27                  @ p2 + q1
    mov         r10,#5
    vqsub.u8    d17,d24,d1                  @ p1 - 2 * tc
    mul         r10,r10,r6
    vadd.i16    q6,q6,q5                    @ p2 + 2*p1 + 2*p0 + 2*q0 + q1
    add         r10,r10,#1
    vrshrn.i16  d20,q6,#3                   @ strong p0 = (sum + 4) >> 3
    cmp         r7,r10,asr #1
    vaddw.u8    q7,q4,d23                   @ p2 + p1 + p0 + q0
    bge         l1.1840
    vmin.u8     d18,d20,d30
    mov         r2,#2                       @ de = 2 : strong filter
    vqadd.u8    d30,d23,d1                  @ p2 + 2 * tc
    ldr         r4,[sp,#filter_flag_p_offset]
    vmax.u8     d2,d18,d31                  @ d2 = strong p0, clipped to p0 +/- 2*tc
    ldr         r5,[sp,#filter_flag_q_offset]
    vrshrn.i16  d21,q7,#2                   @ strong p1 = (sum + 2) >> 2
    b           end_dep_deq_decision_horz

    @ ------------------------------------------------------------------
    @ Weak filter selected.
    @ Live here: r0 src, r1 stride, r5 beta, r6 tc, r12 dq, r14 dp.
    @ Decide whether p1 (dep, r9) and q1 (deq, r10) are modified too:
    @     dep = dp < ((beta + (beta >> 1)) >> 3), only if filter_flag_p
    @     deq = dq < ((beta + (beta >> 1)) >> 3), only if filter_flag_q
    @ and both are 0 when tc == 1.
    @ ------------------------------------------------------------------
l1.1840:
    mov         r2,#1                       @ de = 1 : weak filter

    mov         r11,r5                      @ keep beta, r5 is reused for the flag
    ldr         r4,[sp,#filter_flag_p_offset]
    ldr         r5,[sp,#filter_flag_q_offset]

    cmp         r6,#1
    moveq       r9,#0
    moveq       r10,#0
    beq         end_dep_deq_decision_horz

    and         r7,r4,r5
    cmp         r7,#1
    beq         both_flags_set_horz
    cmp         r4,#0
    beq         set_flag_dep_zero_horz

    @ filter_flag_p is non zero and the two flags are not both 1
    add         r8,r11,r11,asr #1
    mov         r10,#0
    asr         r8,r8,#3
    cmp         r8,r14
    movgt       r9,#1
    movle       r9,#0
    b           end_dep_deq_decision_horz

set_flag_dep_zero_horz:
    @ filter_flag_p is zero
    add         r8,r11,r11,asr #1
    mov         r9,#0
    asr         r8,r8,#3
    cmp         r8,r12
    movgt       r10,#1
    movle       r10,#0
    b           end_dep_deq_decision_horz

both_flags_set_horz:
    add         r8,r11,r11,asr #1
    asr         r8,r8,#3
    cmp         r8,r14
    movgt       r9,#1
    movle       r9,#0
    cmp         r8,r12
    movgt       r10,#1
    movle       r10,#0

end_dep_deq_decision_horz:
    @ r0 = source address    r1 = stride    r2 = de
    @ r4 = filter_flag_p     r5 = filter_flag_q
    @ r6 = tc                r9 = dep       r10 = deq (dep/deq: weak path only)
    @
    @ The two NEON instructions before the branch belong to the strong
    @ filter; on the weak path their results are overwritten before use.
    vmin.u8     d18,d21,d16                 @ min(strong p1, p1 + 2*tc)
    cmp         r2,#1
    vqsub.u8    d31,d23,d1                  @ p2 - 2 * tc
    beq         l1.2408                     @ de == 1 : weak filter
    vaddl.u8    q4,d23,d22                  @ p2 + p3
    cmp         r5,#1

    @ Q side disabled: go to the filter_flag_p test. The branch must not
    @ enter strong_filtering_p directly, otherwise the P side would be
    @ written even when filter_flag_p is not set.
    bne         strong_filtering_p_check

strong_filtering_q:
    mov         r12,r0
    vst1.32     {d4[0]},[r12],r1            @ row q0
    vst1.32     {d5[0]},[r12],r1            @ row q1
    vst1.32     {d3[0]},[r12]               @ row q2
strong_filtering_p_check:
    cmp         r4,#1
    bne         l1.2404
strong_filtering_p:
    vmax.u8     d5,d18,d17                  @ d5 = strong p1, clipped to p1 +/- 2*tc
    mov         r12,r0
    vmul.i16    q4,q4,d0[0]                 @ 2 * (p2 + p3)
    rsb         r11,r1,#0                   @ r11 = -stride
    vadd.i16    q8,q4,q7                    @ 2*p3 + 3*p2 + p1 + p0 + q0
    add         r12,r12,r11                 @ r12 -> row p0
    vrshrn.i16  d19,q8,#3                   @ strong p2 = (sum + 4) >> 3
    vst1.32     {d2[0]},[r12],r11           @ row p0
    vmin.u8     d18,d19,d30
    vst1.32     {d5[0]},[r12],r11           @ row p1
    vmax.u8     d3,d18,d31                  @ d3 = strong p2, clipped to p2 +/- 2*tc
    vst1.32     {d3[0]},[r12]               @ row p2

l1.2404:
    vpop        {d8-d15}
    ldmfd       sp!, {r3-r12,pc}

    @ ------------------------------------------------------------------
    @ Weak filter.
    @ r4 = filter_flag_p, r5 = filter_flag_q, r6 = tc, r9 = dep, r10 = deq
    @ d23 = p2, d24 = p1, d25 = p0, d26 = q0, d27 = q1, d28 = q2
    @ ------------------------------------------------------------------
l1.2408:

    vmov.i16    d0,#0x9

    vsubl.u8    q5,d26,d25                  @ q0 - p0

    vmul.i16    q5,q5,d0[0]                 @ 9 * (q0 - p0)

    vmov.i16    d0,#0x3

    vsubl.u8    q6,d27,d24                  @ q1 - p1
    vmul.i16    q6,q6,d0[0]                 @ 3 * (q1 - p1)

    vdup.8      d30,r6                      @ +tc in every byte

    rsb         r12,r6,#0
    vdup.8      d31,r12                     @ -tc in every byte

    vsub.i16    q5,q5,q6

    vrshr.s16   q5,q5,#4
    @ delta = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4

    vabs.s16    q4,q5
    vmovn.i16   d9,q4                       @ d9 = abs(delta), one byte per pixel

    vqmovn.s16  d10,q5                      @ d10 = delta saturated to 8 bits

    vmin.s8     d11,d10,d30
    vmax.s8     d8,d31,d11                  @ d8 = clip3(delta, -tc, tc)

    vmovl.u8    q3,d25

    vaddw.s8    q2,q3,d8

    vqmovun.s16 d12,q2                      @ d12 = tmp_p0 = clip_u8(p0 + delta)
    vmovl.u8    q3,d26
    vsubw.s8    q2,q3,d8
    vqmovun.s16 d13,q2                      @ d13 = tmp_q0 = clip_u8(q0 - delta)

    mov         r11,#0xa
    mul         r12,r11,r6
    vdup.8      d2,r12                      @ d2 = 10 * tc in every byte
    vmov        d18,d24                     @ default for row p1 : unchanged p1
    vdup.8      d0,r6
    vshr.s8     d0,d0,#1                    @ d0 =  (tc >> 1)
    vneg.s8     d1,d0                       @ d1 = -(tc >> 1)

    cmp         r4,#1
    bne         l1.2724                     @ P side not filtered
    cmp         r9,#1
    bne         l1.2700                     @ dep == 0 : p1 stays as it is

    @ delta_p = clip3((((p2 + p0 + 1) >> 1) - p1 + delta) >> 1, -(tc >> 1), tc >> 1)
    vaddl.u8    q7,d23,d25
    vrshrn.u16  d14,q7,#1
    vsubl.u8    q7,d14,d24
    vaddw.s8    q7,q7,d8
    vqshrn.s16  d14,q7,#1
    vmin.s8     d15,d14,d0
    vmax.s8     d14,d1,d15                  @ d14 = delta_p

    vmovl.u8    q8,d24
    vaddw.s8    q8,q8,d14
    vqmovun.s16 d14,q8                      @ d14 = tmp_p1 = clip_u8(p1 + delta_p)

    @ keep the original pixel wherever abs(delta) >= 10 * tc
    vcge.u8     d18,d9,d2
    vbsl        d18,d24,d14

l1.2700:
    mov         r12,r0
    rsb         r11,r1,#0                   @ r11 = -stride
    add         r12,r12,r11                 @ r12 -> row p0
    vcge.u8     d19,d9,d2
    vbsl        d19,d25,d12                 @ p0 where abs(delta) >= 10*tc, else tmp_p0
    vst1.32     {d19[0]},[r12],r11          @ row p0
    vst1.32     {d18[0]},[r12]              @ row p1
l1.2724:
    cmp         r5,#1
    bne         l1.2404                     @ Q side not filtered
    cmp         r10,#1
    vmov        d18,d27                     @ default for row q1 : unchanged q1
    bne         l1.2852                     @ deq == 0 : q1 stays as it is

    @ delta_q = clip3((((q0 + q2 + 1) >> 1) - q1 - delta) >> 1, -(tc >> 1), tc >> 1)
    vaddl.u8    q7,d26,d28
    vrshrn.u16  d14,q7,#1
    vsubl.u8    q7,d14,d27
    vsubw.s8    q7,q7,d8
    vqshrn.s16  d14,q7,#1
    vmin.s8     d15,d14,d0
    vmax.s8     d14,d1,d15                  @ d14 = delta_q
    vmovl.u8    q8,d27
    vaddw.s8    q8,q8,d14
    vqmovun.s16 d14,q8                      @ d14 = tmp_q1 = clip_u8(q1 + delta_q)
    vcge.u8     d18,d9,d2
    vbsl        d18,d27,d14                 @ q1 where abs(delta) >= 10*tc, else tmp_q1
l1.2852:
    mov         r12,r0
    vcge.u8     d19,d9,d2
    vbsl        d19,d26,d13                 @ q0 where abs(delta) >= 10*tc, else tmp_q0
    vst1.32     {d19[0]},[r12],r1           @ row q0
    vst1.32     {d18[0]},[r12]              @ row q1
    vpop        {d8-d15}
    ldmfd       sp!, {r3-r12,pc}
541
542
543
544