1@/*****************************************************************************
2@*
3@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4@*
5@* Licensed under the Apache License, Version 2.0 (the "License");
6@* you may not use this file except in compliance with the License.
7@* You may obtain a copy of the License at:
8@*
9@* http://www.apache.org/licenses/LICENSE-2.0
10@*
11@* Unless required by applicable law or agreed to in writing, software
12@* distributed under the License is distributed on an "AS IS" BASIS,
13@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@* See the License for the specific language governing permissions and
15@* limitations under the License.
16@*
17@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_sao_band_offset_chroma.s
@*
@* @brief
@*  Contains the function definition for SAO band offset filtering of
@* interleaved (U/V) chroma samples. The function is coded in ARM NEON
@* assembly using GNU assembler syntax.
@*
@* @author
@*  Parthiban V
@*
@* @par List of Functions:
@*  - ihevc_sao_band_offset_chroma_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*/
39@void ihevc_sao_band_offset_chroma(UWORD8 *pu1_src,
40@                           WORD32 src_strd,
41@                           UWORD8 *pu1_src_left,
42@                           UWORD8 *pu1_src_top,
43@                           UWORD8 *pu1_src_top_left,
44@                           WORD32 sao_band_pos_u,
45@                           WORD32 sao_band_pos_v,
46@                           WORD8 *pi1_sao_offset_u,
47@                           WORD8 *pi1_sao_offset_v,
48@                           WORD32 wd,
49@                           WORD32 ht)
50@
51@**************Variables Vs Registers*****************************************
52@r0 =>  *pu1_src
53@r1 =>  src_strd
54@r2 =>  *pu1_src_left
55@r3 =>  *pu1_src_top
56@r4 =>  *pu1_src_top_left
57@r5 =>  sao_band_pos_u
58@r6 =>  sao_band_pos_v
59@r7 =>  *pi1_sao_offset_u
60@r8 =>  *pi1_sao_offset_v
61@r9 =>  wd
62@r10=>  ht
63
64.equ    pu1_src_top_left_offset,    104
65.equ    sao_band_pos_u_offset,      108
66.equ    sao_band_pos_v_offset,      112
67.equ    pi1_sao_u_offset,           116
68.equ    pi1_sao_v_offset,           120
69.equ    wd_offset,                  124
70.equ    ht_offset,                  128
71
72.text
73.p2align 2
74
75.extern gu1_table_band_idx
76.globl ihevc_sao_band_offset_chroma_a9q
77
78gu1_table_band_idx_addr_1:
79.long gu1_table_band_idx - ulbl1 - 8
80
81gu1_table_band_idx_addr_2:
82.long gu1_table_band_idx - ulbl2 - 8
83
84ihevc_sao_band_offset_chroma_a9q:
85
86    STMFD       sp!, {r4-r12, r14}          @stack stores the values of the arguments
87    vpush       {d8  -  d15}
88    LDR         r4,[sp,#pu1_src_top_left_offset]    @Loads pu1_src_top_left
89    LDR         r10,[sp,#ht_offset]         @Loads ht
90
91    LDR         r9,[sp,#wd_offset]          @Loads wd
92    MOV         r11,r10                     @Move the ht to r9 for loop counter
93
94    ADD         r12,r0,r9                   @pu1_src[row * src_strd + (wd)]
95    LDR         r14, gu1_table_band_idx_addr_1
96ulbl1:
97    add         r14,r14,pc
98    SUB         r12,r12,#2                  @wd-2
99
100SRC_LEFT_LOOP:
101    LDRH        r5,[r12],r1                 @Load the value
102    SUBS        r11,r11,#1                  @Decrement the loop counter
103    STRH        r5,[r2],#2                  @Store the value in pu1_src_left pointer
104    BNE         SRC_LEFT_LOOP
105
106    LDR         r5,[sp,#sao_band_pos_u_offset]  @Loads sao_band_pos_u
107    VLD1.8      D1,[r14]!                   @band_table_u.val[0]
108    ADD         r12,r3,r9                   @pu1_src_top[wd]
109
110    LDRH        r11,[r12,#-2]
111    VLD1.8      D2,[r14]!                   @band_table_u.val[1]
112    LSL         r6,r5,#3                    @sao_band_pos_u
113
114    STRH        r11,[r4]                    @store to pu1_src_top_left[0]
115    VLD1.8      D3,[r14]!                   @band_table_u.val[2]
116    LDR         r7,[sp,#pi1_sao_u_offset]   @Loads pi1_sao_offset_u
117
118    SUB         r4,r10,#1                   @ht-1
119    VDUP.8      D31,r6                      @band_pos_u
120    MUL         r4,r4,r1                    @ht-1 * src_strd
121
122    ADD         r4,r4,r0                    @pu1_src[(ht - 1) * src_strd]
123    VLD1.8      D4,[r14]!                   @band_table_u.val[3]
124    MOV         r11,r9                      @Move the wd to r9 for loop counter
125
126SRC_TOP_LOOP:                               @wd is always multiple of 8
127    VLD1.8      D0,[r4]!                    @Load pu1_src[(ht - 1) * src_strd + col]
128    SUBS        r11,r11,#8                  @Decrement the loop counter by 8
129    VST1.8      D0,[r3]!                    @Store to pu1_src_top[col]
130    BNE         SRC_TOP_LOOP
131
132    VLD1.8      D30,[r7]                    @pi1_sao_offset_u load
133    VADD.I8     D5,D1,D31                   @band_table_u.val[0] = vadd_u8(band_table_u.val[0], sao_band_pos_u)
134
135    VDUP.8      D29,D30[1]                  @vdup_n_u8(pi1_sao_offset_u[1])
136    VADD.I8     D6,D2,D31                   @band_table_u.val[1] = vadd_u8(band_table_u.val[1], sao_band_pos_u)
137
138    VDUP.8      D28,D30[2]                  @vdup_n_u8(pi1_sao_offset_u[2])
139    VADD.I8     D7,D3,D31                   @band_table_u.val[2] = vadd_u8(band_table_u.val[2], sao_band_pos_u)
140
141    VDUP.8      D27,D30[3]                  @vdup_n_u8(pi1_sao_offset_u[3])
142    VADD.I8     D8,D4,D31                   @band_table_u.val[3] = vadd_u8(band_table_u.val[3], sao_band_pos_u)
143
144    CMP         r5,#28
145    VDUP.8      D26,D30[4]                  @vdup_n_u8(pi1_sao_offset_u[4])
146    LDR         r14, gu1_table_band_idx_addr_2
147ulbl2:
148    add         r14,r14,pc
149
150    VMOV.I8     D30,#16                     @vdup_n_u8(16)
151    VADD.I8     D1,D5,D29                   @band_table_u.val[0] = vadd_u8(band_table_u.val[0], vdup_n_u8(pi1_sao_offset_u[1]))
152
153    VLD1.8      D9,[r14]!                   @band_table_v.val[0]
154    VADD.I8     D2,D6,D28                   @band_table_u.val[1] = vadd_u8(band_table_u.val[1], vdup_n_u8(pi1_sao_offset_u[2]))
155
156    VLD1.8      D10,[r14]!                  @band_table_v.val[1]
157    VADD.I8     D3,D7,D27                   @band_table_u.val[2] = vadd_u8(band_table_u.val[2], vdup_n_u8(pi1_sao_offset_u[3]))
158
159    LDR         r6,[sp,#sao_band_pos_v_offset]  @Loads sao_band_pos_v
160    VADD.I8     D4,D8,D26                   @band_table_u.val[3] = vadd_u8(band_table_u.val[3], vdup_n_u8(pi1_sao_offset_u[4]))
161    LSL         r11,r6,#3                   @sao_band_pos_v
162
163    BLT         SAO_BAND_POS_U_0
164
165SAO_BAND_POS_U_28:                          @case 28
166    VCLE.U8     D13,D4,D30                  @vcle_u8(band_table.val[3], vdup_n_u8(16))
167    BNE         SAO_BAND_POS_U_29
168
169    VORR.U8     D4,D4,D13                   @band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp)
170    B           SWITCH_BREAK_U
171
172SAO_BAND_POS_U_29:                          @case 29
173    CMP         r5,#29
174
175    VCLE.U8     D14,D3,D30                  @vcle_u8(band_table.val[2], vdup_n_u8(16))
176    BNE         SAO_BAND_POS_U_30
177    VORR.U8     D3,D3,D14                   @band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp)
178
179    VAND.U8     D4,D4,D13                   @band_table.val[3] = vand_u8(band_table.val[3], au1_cmp)
180    B           SWITCH_BREAK_U
181
182SAO_BAND_POS_U_30:                          @case 30
183    CMP         r5,#30
184
185    VCLE.U8     D15,D2,D30                  @vcle_u8(band_table.val[1], vdup_n_u8(16))
186    BNE         SAO_BAND_POS_U_31
187    VORR.U8     D2,D2,D15                   @band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp)
188
189    VAND.U8     D3,D3,D14                   @band_table.val[2] = vand_u8(band_table.val[2], au1_cmp)
190
191SAO_BAND_POS_U_31:                          @case 31
192    CMP         r5,#31
193    BNE         SWITCH_BREAK_U
194
195    VCLE.U8     D16,D1,D30                  @vcle_u8(band_table.val[0], vdup_n_u8(16))
196    VORR.U8     D1,D1,D16                   @band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp)
197
198    VAND.U8     D2,D2,D15                   @band_table.val[1] = vand_u8(band_table.val[1], au1_cmp)
199    B           SWITCH_BREAK_U
200
201SAO_BAND_POS_U_0:
202    CMP         r5,#0                       @case 0
203    BNE         SWITCH_BREAK_U
204
205    VCLE.U8     D16,D1,D30                  @vcle_u8(band_table.val[0], vdup_n_u8(16))
206    VAND.U8     D1,D1,D16                   @band_table.val[0] = vand_u8(band_table.val[0], au1_cmp)
207
208SWITCH_BREAK_U:
209    VDUP.8      D30,r11                     @band_pos_v
210    LDR         r8,[sp,#pi1_sao_v_offset]   @Loads pi1_sao_offset_v
211
212    VLD1.8      D11,[r14]!                  @band_table_v.val[2]
213    VADD.I8     D13,D9,D30                  @band_table_v.val[0] = vadd_u8(band_table_v.val[0], band_pos_v)
214
215    VLD1.8      D12,[r14]!                  @band_table_v.val[3]
216    VADD.I8     D14,D10,D30                 @band_table_v.val[1] = vadd_u8(band_table_v.val[1], band_pos_v)
217
218    VLD1.8      D25,[r8]                    @pi1_sao_offset_v load
219    VADD.I8     D15,D11,D30                 @band_table_v.val[2] = vadd_u8(band_table_v.val[2], band_pos_v)
220
221    VDUP.8      D29,D25[1]                  @vdup_n_u8(pi1_sao_offset_v[1])
222    VADD.I8     D16,D12,D30                 @band_table_v.val[3] = vadd_u8(band_table_v.val[3], band_pos_v)
223
224    VDUP.8      D28,D25[2]                  @vdup_n_u8(pi1_sao_offset_v[2])
225    VADD.I8     D9,D13,D29                  @band_table_v.val[0] = vadd_u8(band_table_v.val[0], vdup_n_u8(pi1_sao_offset_v[1]))
226
227    VDUP.8      D27,D25[3]                  @vdup_n_u8(pi1_sao_offset_v[3])
228    VADD.I8     D10,D14,D28                 @band_table_v.val[1] = vadd_u8(band_table_v.val[1], vdup_n_u8(pi1_sao_offset_v[2]))
229
230    VDUP.8      D26,D25[4]                  @vdup_n_u8(pi1_sao_offset_v[4])
231    VADD.I8     D11,D15,D27                 @band_table_v.val[2] = vadd_u8(band_table_v.val[2], vdup_n_u8(pi1_sao_offset_v[3]))
232
233    VMOV.I8     D29,#16                     @vdup_n_u8(16)
234    VADD.I8     D12,D16,D26                 @band_table_v.val[3] = vadd_u8(band_table_v.val[3], vdup_n_u8(pi1_sao_offset_v[4]))
235    AND         r12,r9,#0xf
236
237    CMP         r6,#28
238    BLT         SAO_BAND_POS_V_0
239
240SAO_BAND_POS_V_28:                          @case 28
241    VCLE.U8     D17,D12,D29                 @vcle_u8(band_table.val[3], vdup_n_u8(16))
242    BNE         SAO_BAND_POS_V_29
243    VORR.U8     D12,D12,D17                 @band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp)
244    B           SWITCH_BREAK_V
245
246SAO_BAND_POS_V_29:                          @case 29
247    CMP         r6,#29
248
249    VCLE.U8     D18,D11,D29                 @vcle_u8(band_table.val[2], vdup_n_u8(16))
250    BNE         SAO_BAND_POS_V_30
251    VORR.U8     D11,D11,D18                 @band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp)
252
253    VAND.U8     D12,D12,D17                 @band_table.val[3] = vand_u8(band_table.val[3], au1_cmp)
254    B           SWITCH_BREAK_V
255
256SAO_BAND_POS_V_30:                          @case 30
257    CMP         r6,#30
258
259    VCLE.U8     D19,D10,D29                 @vcle_u8(band_table.val[1], vdup_n_u8(16))
260    BNE         SAO_BAND_POS_V_31
261    VORR.U8     D10,D10,D19                 @band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp)
262
263    VAND.U8     D11,D11,D18                 @band_table.val[2] = vand_u8(band_table.val[2], au1_cmp)
264    B           SWITCH_BREAK_V
265
266SAO_BAND_POS_V_31:                          @case 31
267    CMP         r6,#31
268    BNE         SWITCH_BREAK_V
269
270    VCLE.U8     D20,D9,D29                  @vcle_u8(band_table.val[0], vdup_n_u8(16))
271    VORR.U8     D9,D9,D20                   @band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp)
272
273    VAND.U8     D10,D10,D19                 @band_table.val[1] = vand_u8(band_table.val[1], au1_cmp)
274    B           SWITCH_BREAK_V
275
276SAO_BAND_POS_V_0:
277    CMP         r6,#0                       @case 0
278    BNE         SWITCH_BREAK_V
279
280    VCLE.U8     D20,D9,D29                  @vcle_u8(band_table.val[0], vdup_n_u8(16))
281    VAND.U8     D9,D9,D20                   @band_table.val[0] = vand_u8(band_table.val[0], au1_cmp)
282
283SWITCH_BREAK_V:
284    CMP         r9,#16
285    MOV         r4,r0                       @pu1_src_cpy
286    BLT         WIDTH_RESIDUE
287
288WIDTH_LOOP:                                 @Width is assigned to be multiple of 16
289    MOV         r4,r0                       @pu1_src_cpy
290    MOV         r11,r10                     @move ht
291    ADD         r5,r4,r1
292
293HEIGHT_LOOP:                                @unrolled for 4 rows
294    ADD         r6,r5,r1
295    VLD2.8      {D5,D6},[r4]                @vld1q_u8(pu1_src_cpy)
296    ADD         r7,r6,r1
297
298    VLD2.8      {D13,D14},[r5]              @vld1q_u8(pu1_src_cpy)
299    VSUB.I8     D7,D5,D31                   @vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
300
301    VLD2.8      {D17,D18},[r6]              @vld1q_u8(pu1_src_cpy)
302    VSUB.I8     D8,D6,D30                   @vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
303
304    VLD2.8      {D21,D22},[r7]              @vld1q_u8(pu1_src_cpy)
305    VSUB.I8     D15,D13,D31                 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
306
307    VTBX.8      D5,{D1-D4},D7               @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
308    VSUB.I8     D16,D14,D30                 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
309
310    VTBX.8      D6,{D9-D12},D8              @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
311    VSUB.I8     D19,D17,D31                 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
312
313    VTBX.8      D13,{D1-D4},D15             @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
314    VSUB.I8     D20,D18,D30                 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
315
316    VTBX.8      D14,{D9-D12},D16            @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
317    VSUB.I8     D23,D21,D31                 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
318
319    VST2.8      {D5,D6},[r4]                @vst1q_u8(pu1_src_cpy, au1_cur_row)
320    VSUB.I8     D24,D22,D30                 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
321
322    SUBS        r11,r11,#4                  @Decrement the ht loop count by 4
323    VTBX.8      D17,{D1-D4},D19             @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
324
325    VST2.8      {D13,D14},[r5]              @vst1q_u8(pu1_src_cpy, au1_cur_row)
326
327    VTBX.8      D18,{D9-D12},D20            @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
328    VTBX.8      D21,{D1-D4},D23             @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
329    VTBX.8      D22,{D9-D12},D24            @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
330
331    VST2.8      {D17,D18},[r6],r1           @vst1q_u8(pu1_src_cpy, au1_cur_row)
332
333    ADD         r4,r6,r1
334    VST2.8      {D21,D22},[r7]              @vst1q_u8(pu1_src_cpy, au1_cur_row)
335    ADD         r5,r4,r1
336
337    BNE         HEIGHT_LOOP
338
339    SUB         r9,r9,#16                   @Decrement the width loop by 16
340    ADD         r0,r0,#16
341    CMP         r9,#8
342    BGT         WIDTH_LOOP
343    BLT         END_LOOP
344    MOV         r4,r0                       @pu1_src_cpy
345
346WIDTH_RESIDUE:                              @If width is not multiple of 16
347    ADD         r5,r4,r1
348    VLD2.8      {D5,D6},[r4]                @vld1q_u8(pu1_src_cpy)
349    ADD         r6,r5,r1
350
351    ADD         r7,r6,r1
352    VLD2.8      {D13,D14},[r5]              @vld1q_u8(pu1_src_cpy)
353    VSUB.I8     D7,D5,D31                   @vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
354
355    VLD2.8      {D17,D18},[r6]              @vld1q_u8(pu1_src_cpy)
356    VSUB.I8     D8,D6,D30                   @vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
357
358    VTBX.8      D5,{D1-D4},D7               @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
359    VSUB.I8     D15,D13,D31                 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
360
361    VTBX.8      D6,{D9-D12},D8              @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
362    VSUB.I8     D16,D14,D30                 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
363
364    VLD2.8      {D21,D22},[r7]              @vld1q_u8(pu1_src_cpy)
365    VSUB.I8     D19,D17,D31                 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
366
367    VTBX.8      D13,{D1-D4},D15             @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
368    VSUB.I8     D20,D18,D30                 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
369
370    VTBX.8      D14,{D9-D12},D16            @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
371    VZIP.8      D5,D6
372
373    VTBX.8      D17,{D1-D4},D19             @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
374    VSUB.I8     D23,D21,D31                 @vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
375
376    VST1.8      {D5},[r4]                   @vst1q_u8(pu1_src_cpy, au1_cur_row)
377    VZIP.8      D13,D14
378
379    VTBX.8      D18,{D9-D12},D20            @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
380    VSUB.I8     D24,D22,D30                 @vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
381
382    VST1.8      {D13},[r5]                  @vst1q_u8(pu1_src_cpy, au1_cur_row)
383    SUBS        r10,r10,#4                  @Decrement the ht loop count by 4
384
385    VTBX.8      D21,{D1-D4},D23             @vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
386    VZIP.8      D17,D18
387
388    VTBX.8      D22,{D9-D12},D24            @vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
389    VST1.8      {D17},[r6],r1               @vst1q_u8(pu1_src_cpy, au1_cur_row)
390    VZIP.8      D21,D22
391
392    ADD         r4,r6,r1
393    VST1.8      {D21},[r7]                  @vst1q_u8(pu1_src_cpy, au1_cur_row)
394    ADD         r5,r4,r1
395
396    BNE         WIDTH_RESIDUE
397
398END_LOOP:
399    vpop        {d8  -  d15}
400    LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP
401
402
403
404