@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_sao_band_offset_luma.s
@*
@* @brief
@*  Contains the function definition for SAO (sample adaptive offset) band
@*  offset filtering of luma samples.
@*  The function is written in ARM NEON assembly (GNU assembler syntax).
@*
@* @author
@*  Parthiban V
@*
@* @par List of Functions:
@*  - ihevc_sao_band_offset_luma_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*/
@void ihevc_sao_band_offset_luma(UWORD8 *pu1_src,
@                           WORD32 src_strd,
@                           UWORD8 *pu1_src_left,
@                           UWORD8 *pu1_src_top,
@                           UWORD8 *pu1_src_top_left,
@                           WORD32 sao_band_pos,
@                           WORD8 *pi1_sao_offset,
@                           WORD32 wd,
@                           WORD32 ht)
@
@**************Variables Vs Registers*****************************************
@r0 =>  *pu1_src
@r1 =>  src_strd
@r2 =>  *pu1_src_left
@r3 =>  *pu1_src_top
@r4 =>  *pu1_src_top_left
@r5 =>  sao_band_pos
@r6 =>  *pi1_sao_offset
@r7 =>  wd
@r8 =>  ht
@-------------------------------------------------------------------------------
@ Stack offsets of the arguments that are passed on the stack, measured from sp
@ after the function prologue has pushed r4-r12,r14 (10 * 4 = 40 bytes) and
@ d8-d15 (8 * 8 = 64 bytes): 40 + 64 = 104 is the first stack argument.
@-------------------------------------------------------------------------------
.equ    pu1_src_top_left_offset,    104
.equ    sao_band_pos_offset,        108
.equ    pi1_sao_offset,             112     @stack offset of the pi1_sao_offset argument (named without the _offset suffix)
.equ    wd_offset,                  116
.equ    ht_offset,                  120

.text
.p2align 2

.extern gu1_table_band_idx
.globl ihevc_sao_band_offset_luma_a9q

@ Position-independent reference to gu1_table_band_idx: the word holds the
@ distance from (ulbl1 + 8) to the table. The function adds pc to it at ulbl1;
@ the "- 8" matches pc reading as the current instruction address + 8, which
@ assumes the code is assembled for ARM (not Thumb) state.
gu1_table_band_idx_addr:
.long gu1_table_band_idx - ulbl1 - 8

@-------------------------------------------------------------------------------
@ ihevc_sao_band_offset_luma_a9q
@
@ Saves the unfiltered border samples of a wd x ht block and then applies a
@ 32-entry band look-up table to the block in place.
@
@ In:    r0 = pu1_src, r1 = src_strd, r2 = pu1_src_left, r3 = pu1_src_top
@        stack: pu1_src_top_left, sao_band_pos, pi1_sao_offset, wd, ht
@ Out:   pu1_src_left[0..ht-1] = last column of the block (before filtering)
@        pu1_src_top_left[0]   = old pu1_src_top[wd - 1]
@        pu1_src_top[0..wd-1]  = last row of the block (before filtering)
@        pu1_src               = filtered in place
@ Saved: r4-r12, r14 and d8-d15 are pushed on entry and restored on exit.
@ Uses without saving: r0, r2, r3, flags, d0-d7, d16-d20, d26-d31.
@
@ Preconditions implied by the loops (each counts down by a fixed step and
@ exits only when the counter reaches exactly zero):
@   - wd is a non-zero multiple of 8
@   - ht is a non-zero multiple of 4
@ Note: 8 bytes are read from pi1_sao_offset although only entries 1..4 are used.
@-------------------------------------------------------------------------------
ihevc_sao_band_offset_luma_a9q:

    STMFD       sp!, {r4-r12, r14}          @save callee-saved core registers and lr (40 bytes)
    vpush       {d8  -  d15}                @save d8-d15 (64 bytes); stack arguments now start at sp + 104

    LDR         r8,[sp,#ht_offset]          @r8 = ht
    LDR         r7,[sp,#wd_offset]          @r7 = wd

    MOV         r9,r8                       @r9 = ht, row counter for SRC_LEFT_LOOP
    LDR         r5,[sp,#sao_band_pos_offset]    @r5 = sao_band_pos
    ADD         r10,r0,r7                   @r10 = pu1_src + wd

    LDR         r4,[sp,#pu1_src_top_left_offset]    @r4 = pu1_src_top_left
    SUB         r10,r10,#1                  @r10 = &pu1_src[wd - 1] (last column of row 0)
    LDR         r14, gu1_table_band_idx_addr    @r14 = distance from (ulbl1 + 8) to gu1_table_band_idx
ulbl1:
    add         r14,r14,pc                  @r14 = &gu1_table_band_idx (pc reads as ulbl1 + 8 here)

@ Copy the last column of the block, top to bottom, into pu1_src_left.
SRC_LEFT_LOOP:
    LDRB        r11,[r10],r1                @r11 = pu1_src[row * src_strd + wd - 1], then step one row down
    SUBS        r9,r9,#1                    @one row done
    STRB        r11,[r2],#1                 @pu1_src_left[row] = r11
    BNE         SRC_LEFT_LOOP

@ The integer set-up for the top/top-left save is interleaved with the loads of
@ the four 8-byte quarters of gu1_table_band_idx into D1-D4.
    ADD         r9,r3,r7                    @r9 = pu1_src_top + wd
    VLD1.8      D1,[r14]!                   @band_table.val[0] = gu1_table_band_idx[0..7]
    LDR         r6,[sp,#pi1_sao_offset]     @r6 = pi1_sao_offset (the .equ symbol is its stack offset)

    LSL         r11,r5,#3                   @r11 = sao_band_pos * 8
    VLD1.8      D2,[r14]!                   @band_table.val[1] = gu1_table_band_idx[8..15]

    LDRB        r10,[r9,#-1]                @r10 = pu1_src_top[wd - 1], read before SRC_TOP_LOOP overwrites it
    VDUP.8      D31,r11                     @band_pos = vdup_n_u8(sao_band_pos * 8)
    SUB         r12,r8,#1                   @r12 = ht - 1

    STRB        r10,[r4]                    @pu1_src_top_left[0] = old pu1_src_top[wd - 1]
    VLD1.8      D3,[r14]!                   @band_table.val[2] = gu1_table_band_idx[16..23]
    MUL         r12,r12,r1                  @r12 = (ht - 1) * src_strd

    ADD         r4,r12,r0                   @r4 = &pu1_src[(ht - 1) * src_strd] (start of the last row)
    VLD1.8      D4,[r14]!                   @band_table.val[3] = gu1_table_band_idx[24..31]
    MOV         r9,r7                       @r9 = wd, column counter for SRC_TOP_LOOP

@ Copy the last row of the block into pu1_src_top, 8 bytes per iteration.
SRC_TOP_LOOP:                               @wd is always multiple of 8
    VLD1.8      D0,[r4]!                    @D0 = pu1_src[(ht - 1) * src_strd + col .. col + 7]
    SUBS        r9,r9,#8                    @8 columns done
    VST1.8      D0,[r3]!                    @pu1_src_top[col .. col + 7] = D0
    BNE         SRC_TOP_LOOP

@ Build the final look-up table: val[k] = table quarter k + band_pos + pi1_sao_offset[k + 1].
@ All additions are modulo 256; the switch below patches lanes that wrapped.
    VLD1.8      D30,[r6]                    @D30 = pi1_sao_offset[0..7] (only lanes 1..4 are used)
    VADD.I8     D5,D1,D31                   @D5 = band_table.val[0] + band_pos

    VDUP.8      D29,D30[1]                  @vdup_n_u8(pi1_sao_offset[1])
    VADD.I8     D6,D2,D31                   @D6 = band_table.val[1] + band_pos

    VDUP.8      D28,D30[2]                  @vdup_n_u8(pi1_sao_offset[2])
    VADD.I8     D7,D3,D31                   @D7 = band_table.val[2] + band_pos

    VDUP.8      D27,D30[3]                  @vdup_n_u8(pi1_sao_offset[3])
    VADD.I8     D8,D4,D31                   @D8 = band_table.val[3] + band_pos

    VDUP.8      D26,D30[4]                  @vdup_n_u8(pi1_sao_offset[4])
    VADD.I8     D1,D5,D29                   @band_table.val[0] = D5 + pi1_sao_offset[1]

    VMOV.I8     D29,#16                     @D29 is reused: vdup_n_u8(16), threshold for the wrap checks
    VADD.I8     D2,D6,D28                   @band_table.val[1] = D6 + pi1_sao_offset[2]

    CMP         r5,#28                      @flags feed both the BLT below and the BNE in case 28
    VADD.I8     D3,D7,D27                   @band_table.val[2] = D7 + pi1_sao_offset[3]

    VADD.I8     D4,D8,D26                   @band_table.val[3] = D8 + pi1_sao_offset[4]
    BLT         SAO_BAND_POS_0              @sao_band_pos < 28: only case 0 can apply

@ switch(sao_band_pos), cases 28..31. Each case computes its compare mask
@ before testing the case value, because the next case reuses that mask:
@   VORR with the mask forces lanes that are <= 16 to 0xFF,
@   VAND with the mask forces lanes that are  > 16 to 0x00.
SAO_BAND_POS_28:                            @case 28

    VCLE.U8     D12,D4,D29                  @D12 = (band_table.val[3] <= 16)

    BNE         SAO_BAND_POS_29             @still the flags of CMP r5,#28 (NEON ops leave them intact)
    VORR.U8     D4,D4,D12                   @band_table.val[3] |= D12
    B           SWITCH_BREAK

SAO_BAND_POS_29:                            @case 29
    CMP         r5,#29
    VCLE.U8     D11,D3,D29                  @D11 = (band_table.val[2] <= 16)

    BNE         SAO_BAND_POS_30
    VORR.U8     D3,D3,D11                   @band_table.val[2] |= D11

    VAND.U8     D4,D4,D12                   @band_table.val[3] &= D12 (mask computed under case 28)
    B           SWITCH_BREAK

SAO_BAND_POS_30:                            @case 30
    CMP         r5,#30
    VCLE.U8     D10,D2,D29                  @D10 = (band_table.val[1] <= 16)

    BNE         SAO_BAND_POS_31
    VORR.U8     D2,D2,D10                   @band_table.val[1] |= D10

    VAND.U8     D3,D3,D11                   @band_table.val[2] &= D11 (mask computed under case 29)
    B           SWITCH_BREAK

SAO_BAND_POS_31:                            @case 31
    CMP         r5,#31
    BNE         SWITCH_BREAK

    VCLE.U8     D9,D1,D29                   @D9 = (band_table.val[0] <= 16)
    VORR.U8     D1,D1,D9                    @band_table.val[0] |= D9

    VAND.U8     D2,D2,D10                   @band_table.val[1] &= D10 (mask computed under case 30)
                                            @falls through; the r5 == 0 test below fails and goes to SWITCH_BREAK

SAO_BAND_POS_0:
    CMP         r5,#0                       @case 0
    BNE         SWITCH_BREAK

    VCLE.U8     D9,D1,D29                   @D9 = (band_table.val[0] <= 16)
    VAND.U8     D1,D1,D9                    @band_table.val[0] &= D9

@ Outer loop: one 8-pixel-wide column strip per pass (r0 = top of the strip).
SWITCH_BREAK:
    MOV         r4,r0                       @r4 = row 0 of the strip (pu1_src_cpy)
    MOV         r11,r8                      @r11 = ht, rows left in this strip
    ADD         r5,r4,r1                    @r5 = row 1 (sao_band_pos is no longer needed)

@ Inner loop: four rows per pass. A pixel whose (pixel - band_pos) is 0..31 is
@ replaced by that table entry; VTBX leaves every other pixel unchanged.
HEIGHT_LOOP:
    ADD         r6,r5,r1                    @r6 = row 2
    VLD1.8      D13,[r4]                    @D13 = 8 pixels of row 0

    ADD         r10,r6,r1                   @r10 = row 3
    VLD1.8      D15,[r5]                    @D15 = 8 pixels of row 1

    VLD1.8      D17,[r6]                    @D17 = 8 pixels of row 2

    VLD1.8      D19,[r10]                   @D19 = 8 pixels of row 3
    VSUB.I8     D14,D13,D31                 @row 0 table index = pixel - band_pos

    VTBX.8      D13,{D1-D4},D14             @row 0: vtbx4_u8(au1_cur_row, band_table, index)
    VSUB.I8     D16,D15,D31                 @row 1 table index

    VTBX.8      D15,{D1-D4},D16             @row 1 look-up
    VSUB.I8     D18,D17,D31                 @row 2 table index

    VTBX.8      D17,{D1-D4},D18             @row 2 look-up
    VSUB.I8     D20,D19,D31                 @row 3 table index

    VTBX.8      D19,{D1-D4},D20             @row 3 look-up
    VST1.8      D13,[r4],r1                 @store row 0 (the post-increment of r4 is overwritten below)

    VST1.8      D15,[r5]                    @store row 1
    SUBS        r11,r11,#4                  @4 rows done; flags are consumed by the BNE below

    VST1.8      D17,[r6],r1                 @store row 2, r6 = row 3

    ADD         r4,r6,r1                    @r4 = row 0 of the next group of four
    VST1.8      D19,[r10]                   @store row 3
    ADD         r5,r4,r1                    @r5 = row 1 of the next group of four

    BNE         HEIGHT_LOOP

    SUBS        r7,r7,#8                    @8 columns done
    ADD         r0,r0,#8                    @advance to the next 8-pixel strip (flags untouched)
    BNE         SWITCH_BREAK

    vpop        {d8  -  d15}                @restore d8-d15
    LDMFD       sp!,{r4-r12,r15}            @restore core registers and return (saved lr is popped into pc)
238
239
240
241
242