1@/*****************************************************************************
2@*
3@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4@*
5@* Licensed under the Apache License, Version 2.0 (the "License");
6@* you may not use this file except in compliance with the License.
7@* You may obtain a copy of the License at:
8@*
9@* http://www.apache.org/licenses/LICENSE-2.0
10@*
11@* Unless required by applicable law or agreed to in writing, software
12@* distributed under the License is distributed on an "AS IS" BASIS,
13@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@* See the License for the specific language governing permissions and
15@* limitations under the License.
16@*
17@*****************************************************************************/
18@/**
19@*******************************************************************************
20@* ,:file
21@*  ihevc_sao_band_offset_chroma.s
22@*
@* ,:brief
@*  Contains the function definition for the chroma SAO (sample adaptive
@* offset) band offset filter. The function is written in ARM NEON assembly
@* using GNU assembler syntax.
27@*
28@* ,:author
29@*  Parthiban V
30@*
@* ,:par List of Functions:
@*  - ihevc_sao_band_offset_chroma_a9q()
@*
34@* ,:remarks
35@*  None
36@*
37@*******************************************************************************
38@*/
39@void ihevc_sao_band_offset_chroma(UWORD8 *pu1_src,
40@                           WORD32 src_strd,
41@                           UWORD8 *pu1_src_left,
42@                           UWORD8 *pu1_src_top,
43@                           UWORD8 *pu1_src_top_left,
44@                           WORD32 sao_band_pos_u,
45@                           WORD32 sao_band_pos_v,
46@                           WORD8 *pi1_sao_offset_u,
47@                           WORD8 *pi1_sao_offset_v,
48@                           WORD32 wd,
49@                           WORD32 ht)
50@
51@**************Variables Vs Registers*****************************************
52@r0 =>  *pu1_src
53@r1 =>  src_strd
54@r2 =>  *pu1_src_left
55@r3 =>  *pu1_src_top
56@r4 =>  *pu1_src_top_left
57@r5 =>  sao_band_pos_u
58@r6 =>  sao_band_pos_v
59@r7 =>  *pi1_sao_offset_u
60@r8 =>  *pi1_sao_offset_v
61@r9 =>  wd
62@r10=>  ht
63
64.text
65.p2align 2
66
67.extern gu1_table_band_idx
68.globl ihevc_sao_band_offset_chroma_a9q
69
@ PC-relative distances from the two "add r14,r14,pc" instructions (labels
@ ulbl1 and ulbl2) to gu1_table_band_idx. In ARM state the PC reads as the
@ address of the current instruction plus 8, hence the "- 8" correction.
gu1_table_band_idx_addr_1:
.long gu1_table_band_idx - ulbl1 - 8

gu1_table_band_idx_addr_2:
.long gu1_table_band_idx - ulbl2 - 8

@ Stack layout after the prologue. The prologue pushes r4-r12,r14 (10 * 4
@ bytes) and d8-d15 (8 * 8 bytes), so the first stack-passed argument sits
@ callee_save_size bytes above sp.
.equ    callee_save_size,           104
.equ    pu1_src_top_left_offset,    callee_save_size + 0
.equ    sao_band_pos_u_offset,      callee_save_size + 4
.equ    sao_band_pos_v_offset,      callee_save_size + 8
.equ    pi1_sao_offset_u_offset,    callee_save_size + 12
.equ    pi1_sao_offset_v_offset,    callee_save_size + 16
.equ    wd_offset,                  callee_save_size + 20
.equ    ht_offset,                  callee_save_size + 24

@-------------------------------------------------------------------------------
@ void ihevc_sao_band_offset_chroma_a9q(UWORD8 *pu1_src, WORD32 src_strd,
@                                       UWORD8 *pu1_src_left, UWORD8 *pu1_src_top,
@                                       UWORD8 *pu1_src_top_left,
@                                       WORD32 sao_band_pos_u, WORD32 sao_band_pos_v,
@                                       WORD8 *pi1_sao_offset_u, WORD8 *pi1_sao_offset_v,
@                                       WORD32 wd, WORD32 ht)
@
@ In:    r0 = pu1_src, r1 = src_strd, r2 = pu1_src_left, r3 = pu1_src_top,
@        remaining arguments on the stack (see the *_offset constants above).
@ Out:   nothing in registers. pu1_src is filtered in place; pu1_src_left,
@        pu1_src_top and pu1_src_top_left receive copies of the unfiltered
@        right column, bottom row and top-right pair respectively.
@ Saves: r4-r12, lr and d8-d15 are saved and restored. d0-d7, d16-d31 and
@        the flags are clobbered.
@
@ What the code does:
@   1. Copies the last two bytes of every row to pu1_src_left, the last two
@      bytes of pu1_src_top to pu1_src_top_left, and the last row of the block
@      to pu1_src_top. All three copies happen before any sample is modified.
@   2. Builds two 32-byte lookup tables (D1-D4 for even bytes, D9-D12 for odd
@      bytes): gu1_table_band_idx + 8 * band_pos, plus offset[1..4] for the
@      four groups of eight entries (8-bit wrap-around adds), followed by a
@      fix-up for band positions 0 and 28..31.
@   3. For every row, deinterleaves even/odd bytes with VLD2, subtracts
@      8 * band_pos and uses the result as a VTBX index. VTBX leaves a sample
@      unchanged when its index is outside 0..31.
@
@ Preconditions implied by the loop structure (not checked): wd is a non-zero
@ multiple of 8 and is used as a byte count of the interleaved row; ht is a
@ non-zero multiple of 4. The loops count down by those steps and only leave
@ on reaching exactly zero.
@-------------------------------------------------------------------------------
ihevc_sao_band_offset_chroma_a9q:

    STMFD       sp!, {r4-r12, r14}          @save core callee-saved registers and lr
    VPUSH       {d8-d15}                    @d8-d15 are callee-saved (AAPCS) and are overwritten below
    LDR         r4,[sp,#pu1_src_top_left_offset] @r4 = pu1_src_top_left
    LDR         r10,[sp,#ht_offset]         @r10 = ht

    LDR         r9,[sp,#wd_offset]          @r9 = wd
    MOV         r11,r10                     @r11 = ht, row counter for SRC_LEFT_LOOP

    ADD         r12,r0,r9                   @r12 = pu1_src + wd
    LDR         r14, gu1_table_band_idx_addr_1
ulbl1:
    add         r14,r14,pc                  @r14 = address of gu1_table_band_idx
    SUB         r12,r12,#2                  @r12 = &pu1_src[wd - 2], last byte pair of row 0

SRC_LEFT_LOOP:                              @save the right-most byte pair of each row
    LDRH        r5,[r12],r1                 @r5 = pair at the end of this row, then step one row down
    SUBS        r11,r11,#1                  @one row done
    STRH        r5,[r2],#2                  @append the pair to pu1_src_left
    BNE         SRC_LEFT_LOOP

    LDR         r5,[sp,#sao_band_pos_u_offset] @r5 = sao_band_pos_u
    VLD1.8      D1,[r14]!                   @band_table_u.val[0]
    ADD         r12,r3,r9                   @r12 = pu1_src_top + wd

    LDRH        r11,[r12,#-2]               @r11 = last byte pair of pu1_src_top
    VLD1.8      D2,[r14]!                   @band_table_u.val[1]
    LSL         r6,r5,#3                    @r6 = sao_band_pos_u * 8

    STRH        r11,[r4]                    @pu1_src_top_left[0..1] = that pair
    VLD1.8      D3,[r14]!                   @band_table_u.val[2]
    LDR         r7,[sp,#pi1_sao_offset_u_offset] @r7 = pi1_sao_offset_u

    SUB         r4,r10,#1                   @ht - 1
    VDUP.8      D31,r6                      @D31 = band_pos_u (sao_band_pos_u * 8 in every lane)
    MUL         r4,r4,r1                    @(ht - 1) * src_strd

    ADD         r4,r4,r0                    @r4 = &pu1_src[(ht - 1) * src_strd], the last row
    VLD1.8      D4,[r14]!                   @band_table_u.val[3]
    MOV         r11,r9                      @r11 = wd, byte counter for SRC_TOP_LOOP

SRC_TOP_LOOP:                               @copy the last row to pu1_src_top, 8 bytes per pass
    VLD1.8      D0,[r4]!                    @load pu1_src[(ht - 1) * src_strd + col]
    SUBS        r11,r11,#8                  @8 bytes done
    VST1.8      D0,[r3]!                    @store to pu1_src_top[col]
    BNE         SRC_TOP_LOOP

    VLD1.8      D30,[r7]                    @load 8 bytes from pi1_sao_offset_u
    VADD.I8     D5,D1,D31                   @band_table_u.val[0] + band_pos_u

    VDUP.8      D29,D30[1]                  @pi1_sao_offset_u[1] in every lane
    VADD.I8     D6,D2,D31                   @band_table_u.val[1] + band_pos_u

    VDUP.8      D28,D30[2]                  @pi1_sao_offset_u[2] in every lane
    VADD.I8     D7,D3,D31                   @band_table_u.val[2] + band_pos_u

    VDUP.8      D27,D30[3]                  @pi1_sao_offset_u[3] in every lane
    VADD.I8     D8,D4,D31                   @band_table_u.val[3] + band_pos_u

    CMP         r5,#28                      @flags for the sao_band_pos_u switch; nothing below changes them before BLT
    VDUP.8      D26,D30[4]                  @pi1_sao_offset_u[4] in every lane
    LDR         r14, gu1_table_band_idx_addr_2
ulbl2:
    add         r14,r14,pc                  @r14 = address of gu1_table_band_idx again, for the second table

    VMOV.I8     D30,#16                     @D30 = 16 in every lane (offsets already extracted above)
    VADD.I8     D1,D5,D29                   @band_table_u.val[0] += pi1_sao_offset_u[1]

    VLD1.8      D9,[r14]!                   @band_table_v.val[0]
    VADD.I8     D2,D6,D28                   @band_table_u.val[1] += pi1_sao_offset_u[2]

    VLD1.8      D10,[r14]!                  @band_table_v.val[1]
    VADD.I8     D3,D7,D27                   @band_table_u.val[2] += pi1_sao_offset_u[3]

    LDR         r6,[sp,#sao_band_pos_v_offset] @r6 = sao_band_pos_v
    VADD.I8     D4,D8,D26                   @band_table_u.val[3] += pi1_sao_offset_u[4]
    LSL         r11,r6,#3                   @r11 = sao_band_pos_v * 8

    BLT         SAO_BAND_POS_U_0            @sao_band_pos_u < 28: only case 0 needs a fix-up

    @ switch (sao_band_pos_u): each case computes its compare mask before the
    @ branch because the following case reuses that mask for its VAND.
SAO_BAND_POS_U_28:                          @case 28
    VCLE.U8     D13,D4,D30                  @D13 = (band_table_u.val[3] <= 16)
    BNE         SAO_BAND_POS_U_29

    VORR.U8     D4,D4,D13                   @band_table_u.val[3] |= mask
    B           SWITCH_BREAK_U

SAO_BAND_POS_U_29:                          @case 29
    CMP         r5,#29

    VCLE.U8     D14,D3,D30                  @D14 = (band_table_u.val[2] <= 16)
    BNE         SAO_BAND_POS_U_30
    VORR.U8     D3,D3,D14                   @band_table_u.val[2] |= mask

    VAND.U8     D4,D4,D13                   @band_table_u.val[3] &= mask computed under case 28
    B           SWITCH_BREAK_U

SAO_BAND_POS_U_30:                          @case 30
    CMP         r5,#30

    VCLE.U8     D15,D2,D30                  @D15 = (band_table_u.val[1] <= 16)
    BNE         SAO_BAND_POS_U_31
    VORR.U8     D2,D2,D15                   @band_table_u.val[1] |= mask

    VAND.U8     D3,D3,D14                   @band_table_u.val[2] &= mask computed under case 29
    B           SWITCH_BREAK_U              @leave the switch, as the V-plane case 30 does

SAO_BAND_POS_U_31:                          @case 31
    CMP         r5,#31
    BNE         SWITCH_BREAK_U

    VCLE.U8     D16,D1,D30                  @D16 = (band_table_u.val[0] <= 16)
    VORR.U8     D1,D1,D16                   @band_table_u.val[0] |= mask

    VAND.U8     D2,D2,D15                   @band_table_u.val[1] &= mask computed under case 30
    B           SWITCH_BREAK_U

SAO_BAND_POS_U_0:
    CMP         r5,#0                       @case 0
    BNE         SWITCH_BREAK_U

    VCLE.U8     D16,D1,D30                  @D16 = (band_table_u.val[0] <= 16)
    VAND.U8     D1,D1,D16                   @band_table_u.val[0] &= mask

SWITCH_BREAK_U:
    VDUP.8      D30,r11                     @D30 = band_pos_v (sao_band_pos_v * 8 in every lane)
    LDR         r8,[sp,#pi1_sao_offset_v_offset] @r8 = pi1_sao_offset_v

    VLD1.8      D11,[r14]!                  @band_table_v.val[2]
    VADD.I8     D13,D9,D30                  @band_table_v.val[0] + band_pos_v

    VLD1.8      D12,[r14]!                  @band_table_v.val[3]
    VADD.I8     D14,D10,D30                 @band_table_v.val[1] + band_pos_v

    VLD1.8      D25,[r8]                    @load 8 bytes from pi1_sao_offset_v
    VADD.I8     D15,D11,D30                 @band_table_v.val[2] + band_pos_v

    VDUP.8      D29,D25[1]                  @pi1_sao_offset_v[1] in every lane
    VADD.I8     D16,D12,D30                 @band_table_v.val[3] + band_pos_v

    VDUP.8      D28,D25[2]                  @pi1_sao_offset_v[2] in every lane
    VADD.I8     D9,D13,D29                  @band_table_v.val[0] += pi1_sao_offset_v[1]

    VDUP.8      D27,D25[3]                  @pi1_sao_offset_v[3] in every lane
    VADD.I8     D10,D14,D28                 @band_table_v.val[1] += pi1_sao_offset_v[2]

    VDUP.8      D26,D25[4]                  @pi1_sao_offset_v[4] in every lane
    VADD.I8     D11,D15,D27                 @band_table_v.val[2] += pi1_sao_offset_v[3]

    VMOV.I8     D29,#16                     @D29 = 16 in every lane
    VADD.I8     D12,D16,D26                 @band_table_v.val[3] += pi1_sao_offset_v[4]

    CMP         r6,#28
    BLT         SAO_BAND_POS_V_0            @sao_band_pos_v < 28: only case 0 needs a fix-up

    @ switch (sao_band_pos_v): same structure as the U switch above.
SAO_BAND_POS_V_28:                          @case 28
    VCLE.U8     D17,D12,D29                 @D17 = (band_table_v.val[3] <= 16)
    BNE         SAO_BAND_POS_V_29
    VORR.U8     D12,D12,D17                 @band_table_v.val[3] |= mask
    B           SWITCH_BREAK_V

SAO_BAND_POS_V_29:                          @case 29
    CMP         r6,#29

    VCLE.U8     D18,D11,D29                 @D18 = (band_table_v.val[2] <= 16)
    BNE         SAO_BAND_POS_V_30
    VORR.U8     D11,D11,D18                 @band_table_v.val[2] |= mask

    VAND.U8     D12,D12,D17                 @band_table_v.val[3] &= mask computed under case 28
    B           SWITCH_BREAK_V

SAO_BAND_POS_V_30:                          @case 30
    CMP         r6,#30

    VCLE.U8     D19,D10,D29                 @D19 = (band_table_v.val[1] <= 16)
    BNE         SAO_BAND_POS_V_31
    VORR.U8     D10,D10,D19                 @band_table_v.val[1] |= mask

    VAND.U8     D11,D11,D18                 @band_table_v.val[2] &= mask computed under case 29
    B           SWITCH_BREAK_V

SAO_BAND_POS_V_31:                          @case 31
    CMP         r6,#31
    BNE         SWITCH_BREAK_V

    VCLE.U8     D20,D9,D29                  @D20 = (band_table_v.val[0] <= 16)
    VORR.U8     D9,D9,D20                   @band_table_v.val[0] |= mask

    VAND.U8     D10,D10,D19                 @band_table_v.val[1] &= mask computed under case 30
    B           SWITCH_BREAK_V

SAO_BAND_POS_V_0:
    CMP         r6,#0                       @case 0
    BNE         SWITCH_BREAK_V

    VCLE.U8     D20,D9,D29                  @D20 = (band_table_v.val[0] <= 16)
    VAND.U8     D9,D9,D20                   @band_table_v.val[0] &= mask

SWITCH_BREAK_V:
    CMP         r9,#16
    MOV         r4,r0                       @r4 = pu1_src_cpy, needed if we go straight to the residue
    BLT         WIDTH_RESIDUE               @wd < 16: only an 8-byte column to process

WIDTH_LOOP:                                 @one pass per 16-byte wide column
    MOV         r4,r0                       @r4 = row 0 of this column (pu1_src_cpy)
    MOV         r11,r10                     @r11 = ht, row counter
    ADD         r5,r4,r1                    @r5 = row 1

HEIGHT_LOOP:                                @unrolled for 4 rows (r4, r5, r6, r7)
    ADD         r6,r5,r1                    @r6 = row 2
    VLD2.8      {D5,D6},[r4]                @row 0: D5 = even bytes, D6 = odd bytes
    ADD         r7,r6,r1                    @r7 = row 3

    VLD2.8      {D13,D14},[r5]              @row 1, deinterleaved
    VSUB.I8     D7,D5,D31                   @row 0 even bytes - band_pos_u

    VLD2.8      {D17,D18},[r6]              @row 2, deinterleaved
    VSUB.I8     D8,D6,D30                   @row 0 odd bytes - band_pos_v

    VLD2.8      {D21,D22},[r7]              @row 3, deinterleaved
    VSUB.I8     D15,D13,D31                 @row 1 even bytes - band_pos_u

    VTBX.8      D5,{D1-D4},D7               @row 0 even: table lookup, out-of-range lanes keep their value
    VSUB.I8     D16,D14,D30                 @row 1 odd bytes - band_pos_v

    VTBX.8      D6,{D9-D12},D8              @row 0 odd: table lookup
    VSUB.I8     D19,D17,D31                 @row 2 even bytes - band_pos_u

    VTBX.8      D13,{D1-D4},D15             @row 1 even: table lookup
    VSUB.I8     D20,D18,D30                 @row 2 odd bytes - band_pos_v

    VTBX.8      D14,{D9-D12},D16            @row 1 odd: table lookup
    VSUB.I8     D23,D21,D31                 @row 3 even bytes - band_pos_u

    VST2.8      {D5,D6},[r4]                @write back row 0, re-interleaved
    VSUB.I8     D24,D22,D30                 @row 3 odd bytes - band_pos_v

    SUBS        r11,r11,#4                  @4 rows done; flags are consumed by the BNE below
    VTBX.8      D17,{D1-D4},D19             @row 2 even: table lookup

    VST2.8      {D13,D14},[r5]              @write back row 1

    VTBX.8      D18,{D9-D12},D20            @row 2 odd: table lookup
    VTBX.8      D21,{D1-D4},D23             @row 3 even: table lookup
    VTBX.8      D22,{D9-D12},D24            @row 3 odd: table lookup

    VST2.8      {D17,D18},[r6],r1           @write back row 2; r6 advances to row 3

    ADD         r4,r6,r1                    @r4 = row 4, i.e. row 0 of the next pass
    VST2.8      {D21,D22},[r7]              @write back row 3
    ADD         r5,r4,r1                    @r5 = row 1 of the next pass

    BNE         HEIGHT_LOOP

    SUB         r9,r9,#16                   @16 bytes of width done
    ADD         r0,r0,#16                   @move to the next column
    CMP         r9,#8
    BGT         WIDTH_LOOP                  @more than 8 bytes left: another full column
    BLT         END_LOOP                    @fewer than 8 bytes left: finished
    MOV         r4,r0                       @exactly 8 bytes left: r4 = pu1_src_cpy for the residue

WIDTH_RESIDUE:                              @final 8-byte wide column, 4 rows per pass
    ADD         r5,r4,r1                    @r5 = row 1
    VLD2.8      {D5,D6},[r4]                @row 0, deinterleaved (only the low half is stored back)
    ADD         r6,r5,r1                    @r6 = row 2

    ADD         r7,r6,r1                    @r7 = row 3
    VLD2.8      {D13,D14},[r5]              @row 1, deinterleaved
    VSUB.I8     D7,D5,D31                   @row 0 even bytes - band_pos_u

    VLD2.8      {D17,D18},[r6]              @row 2, deinterleaved
    VSUB.I8     D8,D6,D30                   @row 0 odd bytes - band_pos_v

    VTBX.8      D5,{D1-D4},D7               @row 0 even: table lookup
    VSUB.I8     D15,D13,D31                 @row 1 even bytes - band_pos_u

    VTBX.8      D6,{D9-D12},D8              @row 0 odd: table lookup
    VSUB.I8     D16,D14,D30                 @row 1 odd bytes - band_pos_v

    VLD2.8      {D21,D22},[r7]              @row 3, deinterleaved
    VSUB.I8     D19,D17,D31                 @row 2 even bytes - band_pos_u

    VTBX.8      D13,{D1-D4},D15             @row 1 even: table lookup
    VSUB.I8     D20,D18,D30                 @row 2 odd bytes - band_pos_v

    VTBX.8      D14,{D9-D12},D16            @row 1 odd: table lookup
    VZIP.8      D5,D6                       @re-interleave row 0; D5 = first 8 bytes

    VTBX.8      D17,{D1-D4},D19             @row 2 even: table lookup
    VSUB.I8     D23,D21,D31                 @row 3 even bytes - band_pos_u

    VST1.8      {D5},[r4]                   @write back 8 bytes of row 0
    VZIP.8      D13,D14                     @re-interleave row 1

    VTBX.8      D18,{D9-D12},D20            @row 2 odd: table lookup
    VSUB.I8     D24,D22,D30                 @row 3 odd bytes - band_pos_v

    VST1.8      {D13},[r5]                  @write back 8 bytes of row 1
    SUBS        r10,r10,#4                  @4 rows done; r10 (ht) itself is the counter here

    VTBX.8      D21,{D1-D4},D23             @row 3 even: table lookup
    VZIP.8      D17,D18                     @re-interleave row 2

    VTBX.8      D22,{D9-D12},D24            @row 3 odd: table lookup
    VST1.8      {D17},[r6],r1               @write back 8 bytes of row 2; r6 advances to row 3
    VZIP.8      D21,D22                     @re-interleave row 3

    ADD         r4,r6,r1                    @r4 = row 0 of the next pass
    VST1.8      {D21},[r7]                  @write back 8 bytes of row 3
    ADD         r5,r4,r1                    @r5 = row 1 of the next pass

    BNE         WIDTH_RESIDUE

END_LOOP:
    VPOP        {d8-d15}                    @restore callee-saved NEON registers
    LDMFD       sp!,{r4-r12,r15}            @restore core registers and return (saved lr -> pc)
391
392
393
394