@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* ,:file
@*  ihevc_sao_band_offset_chroma.s
@*
@* ,:brief
@*  SAO band offset for an interleaved (two byte streams, U and V) chroma
@*  block. Hand-written ARM NEON assembly in GNU as syntax.
@*
@* ,:author
@*  Parthiban V
@*
@* ,:par List of Functions:
@*  - ihevc_sao_band_offset_chroma_a9q
@*
@* ,:remarks
@*  Syntax : GNU as, ARM state (the pc-relative literals below use the
@*           "pc reads as current instruction + 8" rule).
@*  Saves  : r4-r12, lr and d8-d15 on entry, restores them on exit.
@*  Loop termination requirements visible in the code:
@*    - SRC_LEFT_LOOP / HEIGHT_LOOP / WIDTH_RESIDUE count down with SUBS/BNE,
@*      so ht must be a positive multiple of 4.
@*    - SRC_TOP_LOOP counts down by 8 with SUBS/BNE, so wd must be a positive
@*      multiple of 8.
@*  NOTE(review): WIDTH_RESIDUE loads 16 bytes per row with VLD2 but stores
@*  only 8; confirm callers guarantee 8 readable bytes past the row end.
@*  NOTE(review): the overflow/underflow fix-ups compare table entries with 16;
@*  presumably this relies on gu1_table_band_idx holding 0..31 and on small
@*  SAO offsets - confirm against the table definition and the C reference.
@*
@*******************************************************************************
@*/
@void ihevc_sao_band_offset_chroma(UWORD8 *pu1_src,
@                                  WORD32 src_strd,
@                                  UWORD8 *pu1_src_left,
@                                  UWORD8 *pu1_src_top,
@                                  UWORD8 *pu1_src_top_left,
@                                  WORD32 sao_band_pos_u,
@                                  WORD32 sao_band_pos_v,
@                                  WORD8 *pi1_sao_offset_u,
@                                  WORD8 *pi1_sao_offset_v,
@                                  WORD32 wd,
@                                  WORD32 ht)
@
@**************Variables Vs Registers*****************************************
@r0 =>  *pu1_src            (register argument; advanced by 16 per column strip)
@r1 =>  src_strd            (register argument)
@r2 =>  *pu1_src_left       (register argument)
@r3 =>  *pu1_src_top        (register argument)
@r4 =>  *pu1_src_top_left   (stack argument; later reused as row pointer)
@r5 =>  sao_band_pos_u      (stack argument; later reused as row pointer)
@r6 =>  sao_band_pos_v      (stack argument; later reused as row pointer)
@r7 =>  *pi1_sao_offset_u   (stack argument; later reused as row pointer)
@r8 =>  *pi1_sao_offset_v   (stack argument)
@r9 =>  wd                  (stack argument)
@r10=>  ht                  (stack argument)
@r11=>  scratch / loop counter
@r12=>  scratch pointer
@r14=>  pointer walking gu1_table_band_idx
@
@d1-d4   => band_table_u (32-entry lookup table for the even bytes)
@d9-d12  => band_table_v (32-entry lookup table for the odd bytes)
@d31     => band_pos_u = sao_band_pos_u << 3, replicated
@d30     => band_pos_v = sao_band_pos_v << 3, replicated (from SWITCH_BREAK_U on)

@ Offsets of the stack arguments from sp after the prologue:
@ 10 core registers (40 bytes) + d8-d15 (64 bytes) = 104 bytes pushed.
.equ    pu1_src_top_left_offset,    104
.equ    sao_band_pos_u_offset,      108
.equ    sao_band_pos_v_offset,      112
.equ    pi1_sao_u_offset,           116
.equ    pi1_sao_v_offset,           120
.equ    wd_offset,                  124
.equ    ht_offset,                  128

.text
.p2align 2

.extern gu1_table_band_idx
.globl ihevc_sao_band_offset_chroma_a9q

@ Position-independent address literals: each holds the distance from the
@ value pc has at the matching "add r14,r14,pc" (label + 8) to the table.
gu1_table_band_idx_addr_1:
.long gu1_table_band_idx - ulbl1 - 8

gu1_table_band_idx_addr_2:
.long gu1_table_band_idx - ulbl2 - 8

ihevc_sao_band_offset_chroma_a9q:

    STMFD       sp!, {r4-r12, r14}          @save core registers and lr (40 bytes)
    vpush       {d8 - d15}                  @save d8-d15 (64 bytes), used as scratch below
    LDR         r4,[sp,#pu1_src_top_left_offset]    @r4 = pu1_src_top_left
    LDR         r10,[sp,#ht_offset]         @r10 = ht

    LDR         r9,[sp,#wd_offset]          @r9 = wd
    MOV         r11,r10                     @r11 = ht, row counter for SRC_LEFT_LOOP

    ADD         r12,r0,r9                   @r12 = pu1_src + wd
    LDR         r14, gu1_table_band_idx_addr_1
ulbl1:
    add         r14,r14,pc                  @r14 = &gu1_table_band_idx[0]
    SUB         r12,r12,#2                  @r12 = &pu1_src[wd - 2] (last two bytes of row 0)

@ Save the last two bytes of every row into pu1_src_left before the block
@ is modified.
SRC_LEFT_LOOP:
    LDRH        r5,[r12],r1                 @r5 = 2 bytes at the row end; advance one row
    SUBS        r11,r11,#1                  @one row done
    STRH        r5,[r2],#2                  @*pu1_src_left = r5; pu1_src_left += 2
    BNE         SRC_LEFT_LOOP

    LDR         r5,[sp,#sao_band_pos_u_offset]      @r5 = sao_band_pos_u
    VLD1.8      D1,[r14]!                   @band_table_u.val[0] = table bytes 0..7
    ADD         r12,r3,r9                   @r12 = pu1_src_top + wd

    LDRH        r11,[r12,#-2]               @r11 = 2 bytes at pu1_src_top[wd - 2]
    VLD1.8      D2,[r14]!                   @band_table_u.val[1] = table bytes 8..15
    LSL         r6,r5,#3                    @r6 = sao_band_pos_u * 8

    STRH        r11,[r4]                    @pu1_src_top_left[0..1] = those 2 bytes
    VLD1.8      D3,[r14]!                   @band_table_u.val[2] = table bytes 16..23
    LDR         r7,[sp,#pi1_sao_u_offset]   @r7 = pi1_sao_offset_u

    SUB         r4,r10,#1                   @r4 = ht - 1
    VDUP.8      D31,r6                      @D31 = band_pos_u in every lane
    MUL         r4,r4,r1                    @r4 = (ht - 1) * src_strd

    ADD         r4,r4,r0                    @r4 = &pu1_src[(ht - 1) * src_strd]
    VLD1.8      D4,[r14]!                   @band_table_u.val[3] = table bytes 24..31
    MOV         r11,r9                      @r11 = wd, byte counter for SRC_TOP_LOOP

@ Copy the (still unmodified) last row into pu1_src_top, 8 bytes at a time.
SRC_TOP_LOOP:
    VLD1.8      D0,[r4]!                    @D0 = pu1_src[(ht - 1) * src_strd + col .. +7]
    SUBS        r11,r11,#8                  @8 bytes done
    VST1.8      D0,[r3]!                    @pu1_src_top[col .. +7] = D0
    BNE         SRC_TOP_LOOP

@ Build band_table_u = table + band_pos_u + per-band offset (8-bit wrapping
@ adds). Lanes 1..4 of the 8 bytes read from pi1_sao_offset_u are used.
    VLD1.8      D30,[r7]                    @D30 = pi1_sao_offset_u[0..7]
    VADD.I8     D5,D1,D31                   @tmp0 = band_table_u.val[0] + band_pos_u

    VDUP.8      D29,D30[1]                  @D29 = pi1_sao_offset_u[1] replicated
    VADD.I8     D6,D2,D31                   @tmp1 = band_table_u.val[1] + band_pos_u

    VDUP.8      D28,D30[2]                  @D28 = pi1_sao_offset_u[2] replicated
    VADD.I8     D7,D3,D31                   @tmp2 = band_table_u.val[2] + band_pos_u

    VDUP.8      D27,D30[3]                  @D27 = pi1_sao_offset_u[3] replicated
    VADD.I8     D8,D4,D31                   @tmp3 = band_table_u.val[3] + band_pos_u

    CMP         r5,#28                      @flags consumed by BLT / first BNE below
    VDUP.8      D26,D30[4]                  @D26 = pi1_sao_offset_u[4] replicated
    LDR         r14, gu1_table_band_idx_addr_2
ulbl2:
    add         r14,r14,pc                  @r14 = &gu1_table_band_idx[0] again, for V

    VMOV.I8     D30,#16                     @D30 = 16 in every lane (fix-up threshold)
    VADD.I8     D1,D5,D29                   @band_table_u.val[0] = tmp0 + offset_u[1]

    VLD1.8      D9,[r14]!                   @band_table_v.val[0] = table bytes 0..7
    VADD.I8     D2,D6,D28                   @band_table_u.val[1] = tmp1 + offset_u[2]

    VLD1.8      D10,[r14]!                  @band_table_v.val[1] = table bytes 8..15
    VADD.I8     D3,D7,D27                   @band_table_u.val[2] = tmp2 + offset_u[3]

    LDR         r6,[sp,#sao_band_pos_v_offset]      @r6 = sao_band_pos_v
    VADD.I8     D4,D8,D26                   @band_table_u.val[3] = tmp3 + offset_u[4]
    LSL         r11,r6,#3                   @r11 = sao_band_pos_v * 8

@ switch (sao_band_pos_u): fix up table entries whose 8-bit add wrapped.
@ mask = (entry <= 16) ? 0xFF : 0x00; OR with the mask forces wrapped-up
@ entries to 0xFF, AND with the mask forces wrapped-down entries to 0.
@ Each case reuses the mask computed while falling through the previous
@ case label, so the label order must be preserved.
    BLT         SAO_BAND_POS_U_0            @sao_band_pos_u < 28

SAO_BAND_POS_U_28:                          @case 28
    VCLE.U8     D13,D4,D30                  @mask3 = (band_table_u.val[3] <= 16)
    BNE         SAO_BAND_POS_U_29           @still the flags of CMP r5,#28

    VORR.U8     D4,D4,D13                   @band_table_u.val[3] |= mask3
    B           SWITCH_BREAK_U

SAO_BAND_POS_U_29:                          @case 29
    CMP         r5,#29

    VCLE.U8     D14,D3,D30                  @mask2 = (band_table_u.val[2] <= 16)
    BNE         SAO_BAND_POS_U_30
    VORR.U8     D3,D3,D14                   @band_table_u.val[2] |= mask2

    VAND.U8     D4,D4,D13                   @band_table_u.val[3] &= mask3
    B           SWITCH_BREAK_U

SAO_BAND_POS_U_30:                          @case 30
    CMP         r5,#30

    VCLE.U8     D15,D2,D30                  @mask1 = (band_table_u.val[1] <= 16)
    BNE         SAO_BAND_POS_U_31
    VORR.U8     D2,D2,D15                   @band_table_u.val[1] |= mask1

    VAND.U8     D3,D3,D14                   @band_table_u.val[2] &= mask2
    B           SWITCH_BREAK_U              @explicit break, as in every other case

SAO_BAND_POS_U_31:                          @case 31
    CMP         r5,#31
    BNE         SWITCH_BREAK_U

    VCLE.U8     D16,D1,D30                  @mask0 = (band_table_u.val[0] <= 16)
    VORR.U8     D1,D1,D16                   @band_table_u.val[0] |= mask0

    VAND.U8     D2,D2,D15                   @band_table_u.val[1] &= mask1
    B           SWITCH_BREAK_U

SAO_BAND_POS_U_0:
    CMP         r5,#0                       @case 0
    BNE         SWITCH_BREAK_U              @1..27: no fix-up

    VCLE.U8     D16,D1,D30                  @mask0 = (band_table_u.val[0] <= 16)
    VAND.U8     D1,D1,D16                   @band_table_u.val[0] &= mask0

@ Build band_table_v the same way. D13-D16 are free again here and serve as
@ temporaries; D30 now becomes band_pos_v and D29 the threshold.
SWITCH_BREAK_U:
    VDUP.8      D30,r11                     @D30 = band_pos_v in every lane
    LDR         r8,[sp,#pi1_sao_v_offset]   @r8 = pi1_sao_offset_v

    VLD1.8      D11,[r14]!                  @band_table_v.val[2] = table bytes 16..23
    VADD.I8     D13,D9,D30                  @tmp0 = band_table_v.val[0] + band_pos_v

    VLD1.8      D12,[r14]!                  @band_table_v.val[3] = table bytes 24..31
    VADD.I8     D14,D10,D30                 @tmp1 = band_table_v.val[1] + band_pos_v

    VLD1.8      D25,[r8]                    @D25 = pi1_sao_offset_v[0..7]
    VADD.I8     D15,D11,D30                 @tmp2 = band_table_v.val[2] + band_pos_v

    VDUP.8      D29,D25[1]                  @D29 = pi1_sao_offset_v[1] replicated
    VADD.I8     D16,D12,D30                 @tmp3 = band_table_v.val[3] + band_pos_v

    VDUP.8      D28,D25[2]                  @D28 = pi1_sao_offset_v[2] replicated
    VADD.I8     D9,D13,D29                  @band_table_v.val[0] = tmp0 + offset_v[1]

    VDUP.8      D27,D25[3]                  @D27 = pi1_sao_offset_v[3] replicated
    VADD.I8     D10,D14,D28                 @band_table_v.val[1] = tmp1 + offset_v[2]

    VDUP.8      D26,D25[4]                  @D26 = pi1_sao_offset_v[4] replicated
    VADD.I8     D11,D15,D27                 @band_table_v.val[2] = tmp2 + offset_v[3]

    VMOV.I8     D29,#16                     @D29 = 16 in every lane (fix-up threshold)
    VADD.I8     D12,D16,D26                 @band_table_v.val[3] = tmp3 + offset_v[4]

@ switch (sao_band_pos_v): same fix-up scheme as for U, masks in D17-D20.
    CMP         r6,#28
    BLT         SAO_BAND_POS_V_0            @sao_band_pos_v < 28

SAO_BAND_POS_V_28:                          @case 28
    VCLE.U8     D17,D12,D29                 @mask3 = (band_table_v.val[3] <= 16)
    BNE         SAO_BAND_POS_V_29           @still the flags of CMP r6,#28
    VORR.U8     D12,D12,D17                 @band_table_v.val[3] |= mask3
    B           SWITCH_BREAK_V

SAO_BAND_POS_V_29:                          @case 29
    CMP         r6,#29

    VCLE.U8     D18,D11,D29                 @mask2 = (band_table_v.val[2] <= 16)
    BNE         SAO_BAND_POS_V_30
    VORR.U8     D11,D11,D18                 @band_table_v.val[2] |= mask2

    VAND.U8     D12,D12,D17                 @band_table_v.val[3] &= mask3
    B           SWITCH_BREAK_V

SAO_BAND_POS_V_30:                          @case 30
    CMP         r6,#30

    VCLE.U8     D19,D10,D29                 @mask1 = (band_table_v.val[1] <= 16)
    BNE         SAO_BAND_POS_V_31
    VORR.U8     D10,D10,D19                 @band_table_v.val[1] |= mask1

    VAND.U8     D11,D11,D18                 @band_table_v.val[2] &= mask2
    B           SWITCH_BREAK_V

SAO_BAND_POS_V_31:                          @case 31
    CMP         r6,#31
    BNE         SWITCH_BREAK_V

    VCLE.U8     D20,D9,D29                  @mask0 = (band_table_v.val[0] <= 16)
    VORR.U8     D9,D9,D20                   @band_table_v.val[0] |= mask0

    VAND.U8     D10,D10,D19                 @band_table_v.val[1] &= mask1
    B           SWITCH_BREAK_V

SAO_BAND_POS_V_0:
    CMP         r6,#0                       @case 0
    BNE         SWITCH_BREAK_V              @1..27: no fix-up

    VCLE.U8     D20,D9,D29                  @mask0 = (band_table_v.val[0] <= 16)
    VAND.U8     D9,D9,D20                   @band_table_v.val[0] &= mask0

SWITCH_BREAK_V:
    CMP         r9,#16
    MOV         r4,r0                       @r4 = pu1_src (needed if we jump to the residue)
    BLT         WIDTH_RESIDUE               @wd < 16: only an 8-byte strip to do

@ Process the block in strips of 16 bytes (8 U + 8 V per row), 4 rows per
@ HEIGHT_LOOP iteration. r4-r7 point at the four rows being processed.
WIDTH_LOOP:
    MOV         r4,r0                       @r4 = top of the current strip (row 0)
    MOV         r11,r10                     @r11 = ht, row counter
    ADD         r5,r4,r1                    @r5 = row 1

HEIGHT_LOOP:                                @unrolled for 4 rows
    ADD         r6,r5,r1                    @r6 = row 2
    VLD2.8      {D5,D6},[r4]                @row 0: D5 = even bytes (U), D6 = odd bytes (V)
    ADD         r7,r6,r1                    @r7 = row 3

    VLD2.8      {D13,D14},[r5]              @row 1: D13 = U, D14 = V
    VSUB.I8     D7,D5,D31                   @row 0 U index = U - band_pos_u

    VLD2.8      {D17,D18},[r6]              @row 2: D17 = U, D18 = V
    VSUB.I8     D8,D6,D30                   @row 0 V index = V - band_pos_v

    VLD2.8      {D21,D22},[r7]              @row 3: D21 = U, D22 = V
    VSUB.I8     D15,D13,D31                 @row 1 U index

    VTBX.8      D5,{D1-D4},D7               @row 0 U: index < 32 -> band_table_u[index], else unchanged
    VSUB.I8     D16,D14,D30                 @row 1 V index

    VTBX.8      D6,{D9-D12},D8              @row 0 V: index < 32 -> band_table_v[index], else unchanged
    VSUB.I8     D19,D17,D31                 @row 2 U index

    VTBX.8      D13,{D1-D4},D15             @row 1 U lookup
    VSUB.I8     D20,D18,D30                 @row 2 V index

    VTBX.8      D14,{D9-D12},D16            @row 1 V lookup
    VSUB.I8     D23,D21,D31                 @row 3 U index

    VST2.8      {D5,D6},[r4]                @row 0: re-interleave and store 16 bytes
    VSUB.I8     D24,D22,D30                 @row 3 V index

    SUBS        r11,r11,#4                  @4 rows done; flags consumed by BNE below
    VTBX.8      D17,{D1-D4},D19             @row 2 U lookup

    VST2.8      {D13,D14},[r5]              @row 1: re-interleave and store 16 bytes

    VTBX.8      D18,{D9-D12},D20            @row 2 V lookup
    VTBX.8      D21,{D1-D4},D23             @row 3 U lookup
    VTBX.8      D22,{D9-D12},D24            @row 3 V lookup

    VST2.8      {D17,D18},[r6],r1           @row 2: store; r6 += src_strd (now row 3)

    ADD         r4,r6,r1                    @r4 = row 4 (row 0 of the next iteration)
    VST2.8      {D21,D22},[r7]              @row 3: re-interleave and store 16 bytes
    ADD         r5,r4,r1                    @r5 = row 5 (row 1 of the next iteration)

    BNE         HEIGHT_LOOP

    SUB         r9,r9,#16                   @16 bytes of width consumed
    ADD         r0,r0,#16                   @advance to the next strip
    CMP         r9,#8
    BGT         WIDTH_LOOP                  @more than 8 bytes left: another 16-byte strip
    BLT         END_LOOP                    @fewer than 8 bytes left: finished
    MOV         r4,r0                       @exactly 8 left: r4 = top of the residue strip

@ Final 8-byte strip (4 U + 4 V per row), 4 rows per iteration. Each row is
@ loaded as 16 bytes with VLD2, but after VZIP only the first 8 interleaved
@ bytes are stored back. r10 (ht) itself is the row counter here.
WIDTH_RESIDUE:
    ADD         r5,r4,r1                    @r5 = row 1
    VLD2.8      {D5,D6},[r4]                @row 0: D5 = U, D6 = V
    ADD         r6,r5,r1                    @r6 = row 2

    ADD         r7,r6,r1                    @r7 = row 3
    VLD2.8      {D13,D14},[r5]              @row 1: D13 = U, D14 = V
    VSUB.I8     D7,D5,D31                   @row 0 U index = U - band_pos_u

    VLD2.8      {D17,D18},[r6]              @row 2: D17 = U, D18 = V
    VSUB.I8     D8,D6,D30                   @row 0 V index = V - band_pos_v

    VTBX.8      D5,{D1-D4},D7               @row 0 U lookup
    VSUB.I8     D15,D13,D31                 @row 1 U index

    VTBX.8      D6,{D9-D12},D8              @row 0 V lookup
    VSUB.I8     D16,D14,D30                 @row 1 V index

    VLD2.8      {D21,D22},[r7]              @row 3: D21 = U, D22 = V
    VSUB.I8     D19,D17,D31                 @row 2 U index

    VTBX.8      D13,{D1-D4},D15             @row 1 U lookup
    VSUB.I8     D20,D18,D30                 @row 2 V index

    VTBX.8      D14,{D9-D12},D16            @row 1 V lookup
    VZIP.8      D5,D6                       @row 0: D5 = first 8 re-interleaved bytes

    VTBX.8      D17,{D1-D4},D19             @row 2 U lookup
    VSUB.I8     D23,D21,D31                 @row 3 U index

    VST1.8      {D5},[r4]                   @row 0: store 8 bytes
    VZIP.8      D13,D14                     @row 1: D13 = first 8 re-interleaved bytes

    VTBX.8      D18,{D9-D12},D20            @row 2 V lookup
    VSUB.I8     D24,D22,D30                 @row 3 V index

    VST1.8      {D13},[r5]                  @row 1: store 8 bytes
    SUBS        r10,r10,#4                  @4 rows done; flags consumed by BNE below

    VTBX.8      D21,{D1-D4},D23             @row 3 U lookup
    VZIP.8      D17,D18                     @row 2: D17 = first 8 re-interleaved bytes

    VTBX.8      D22,{D9-D12},D24            @row 3 V lookup
    VST1.8      {D17},[r6],r1               @row 2: store 8 bytes; r6 += src_strd (now row 3)
    VZIP.8      D21,D22                     @row 3: D21 = first 8 re-interleaved bytes

    ADD         r4,r6,r1                    @r4 = row 4 (row 0 of the next iteration)
    VST1.8      {D21},[r7]                  @row 3: store 8 bytes
    ADD         r5,r4,r1                    @r5 = row 5 (row 1 of the next iteration)

    BNE         WIDTH_RESIDUE

END_LOOP:
    vpop        {d8 - d15}                  @restore d8-d15
    LDMFD       sp!,{r4-r12,r15}            @restore core registers; saved lr -> pc (return)