@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* ,:file
@*  ihevc_sao_edge_offset_class2_chroma.s
@*
@* ,:brief
@*  Contains function definitions for SAO (sample adaptive offset) edge offset,
@*  class 2 (135 degree diagonal), applied to interleaved chroma (Cb/Cr).
@*  Functions are coded using NEON intrinsics and can be compiled using ARM
@*  RVCT
@*
@* ,:author
@*  Parthiban V
@*
@* ,:par List of Functions:
@*
@*
@* ,:remarks
@*  None
@*
@*******************************************************************************
@*/
@void ihevc_sao_edge_offset_class2_chroma(UWORD8 *pu1_src,
@                                         WORD32 src_strd,
@                                         UWORD8 *pu1_src_left,
@                                         UWORD8 *pu1_src_top,
@                                         UWORD8 *pu1_src_top_left,
@                                         UWORD8 *pu1_src_top_right,
@                                         UWORD8 *pu1_src_bot_left,
@                                         UWORD8 *pu1_avail,
@                                         WORD8 *pi1_sao_offset_u,
@                                         WORD8 *pi1_sao_offset_v,
@                                         WORD32 wd,
@                                         WORD32 ht)
@**************Variables Vs Registers*****************************************
@r0 => *pu1_src
@r1 => src_strd
@r2 => *pu1_src_left
@r3 => *pu1_src_top
@r4 => *pu1_src_top_left
@r5 => *pu1_avail
@r6 => *pi1_sao_offset_u
@r9 => *pi1_sao_offset_v
@r7 => wd
@r8 => ht
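@
@ Reference sketch (not part of the build): a minimal scalar C model of the
@ per-pixel operation this routine vectorises, assuming 8-bit samples and
@ ignoring the pu1_avail boundary handling, the corner pixels and the
@ pu1_src_left/pu1_src_top state updates done below. The helper name
@ sao_class2_chroma_ref() and the local copy of the edge-index table are
@ illustrative only; the real gi1_table_edge_idx lives in the library's
@ common tables.
@
@ static WORD32 sign3(WORD32 x) { return (x > 0) - (x < 0); }
@
@ static void sao_class2_chroma_ref(UWORD8 *pu1_src, WORD32 src_strd,
@                                   WORD8 *pi1_sao_offset_u,
@                                   WORD8 *pi1_sao_offset_v,
@                                   WORD32 wd, WORD32 ht)
@ {
@     /* assumed contents of gi1_table_edge_idx */
@     const WORD8 ai1_edge_idx_tbl[5] = {1, 2, 0, 3, 4};
@     WORD32 row, col;
@     for(row = 1; row < ht - 1; row++)
@     {
@         for(col = 2; col < wd - 2; col++)   /* U at even col, V at odd col */
@         {
@             UWORD8 *p = pu1_src + row * src_strd + col;
@             /* class 2: compare with the top-left and bottom-right neighbours
@                of the same plane; +/-2 horizontally because U and V interleave */
@             WORD32 edge_idx = 2 + sign3(p[0] - p[-src_strd - 2])
@                                 + sign3(p[0] - p[ src_strd + 2]);
@             edge_idx = ai1_edge_idx_tbl[edge_idx];
@             if(0 != edge_idx)
@             {
@                 WORD8 *pi1_offset = (col & 1) ? pi1_sao_offset_v
@                                               : pi1_sao_offset_u;
@                 WORD32 val = p[0] + pi1_offset[edge_idx];
@                 p[0] = (UWORD8)(val < 0 ? 0 : (val > 255 ? 255 : val));
@             }
@         }
@     }
@ }
@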
.text
.syntax unified
.p2align 2

.extern gi1_table_edge_idx
.globl ihevc_sao_edge_offset_class2_chroma_a9q

gi1_table_edge_idx_addr_1:
.long gi1_table_edge_idx - ulbl1 - 8

gi1_table_edge_idx_addr_2:
.long gi1_table_edge_idx - ulbl2 - 8

gi1_table_edge_idx_addr_3:
.long gi1_table_edge_idx - ulbl3 - 8

gi1_table_edge_idx_addr_4:
.long gi1_table_edge_idx - ulbl4 - 8

gi1_table_edge_idx_addr_5:
.long gi1_table_edge_idx - ulbl5 - 8

ihevc_sao_edge_offset_class2_chroma_a9q:


    STMFD sp!,{r4-r12,r14}  @stack stores the values of the arguments

    LDR r7,[sp,#0x40]  @Loads wd
    LDR r8,[sp,#0x44]  @Loads ht
    SUB r9,r7,#2  @wd - 2

    LDR r4,[sp,#0x28]  @Loads pu1_src_top_left
    LDRH r10,[r3,r9]  @pu1_src_top[wd - 2]

    STR r0,[sp,#0x2C]  @Store pu1_src in sp
    MOV r9,r7  @Move width to r9 for loop count

    STR r2,[sp,#0x30]  @Store pu1_src_left in sp
    LDR r5,[sp,#0x34]  @Loads pu1_avail
    LDR r6,[sp,#0x38]  @Loads pi1_sao_offset_u

    STR r3,[sp,#0x38]  @Store pu1_src_top in sp
    SUB sp,sp,#0xD4  @Decrement the stack pointer to store some temp arr values

    STRH r10,[sp]  @u1_src_top_left_tmp = pu1_src_top[wd - 2]
    SUB r10,r8,#1  @ht - 1
    MLA r11,r10,r1,r0  @pu1_src[(ht - 1) * src_strd + col]
    ADD r12,sp,#10  @temp array

AU1_SRC_TOP_LOOP:
    VLD1.8 D0,[r11]!  @pu1_src[(ht - 1) * src_strd + col]
    SUBS r9,r9,#8  @Decrement the loop count by 8
    VST1.8 D0,[r12]!  @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
    BNE AU1_SRC_TOP_LOOP

PU1_AVAIL_4_LOOP_U:
    LDRB r9,[r5,#4]  @pu1_avail[4]
    CMP r9,#0
    LDRB r9,[r0]  @u1_pos_0_0_tmp_u = pu1_src[0]
    LDRB r10,[r0,#1]  @u1_pos_0_0_tmp_v = pu1_src[1]
    BEQ PU1_AVAIL_7_LOOP_U

    LDRB r11,[r4]  @pu1_src_top_left[0]
    ADD r14,r0,r1  @pu1_src + src_strd

    SUB r12,r9,r11  @pu1_src[0] - pu1_src_top_left[0]

    LDRB r14,[r14,#2]  @pu1_src[2 + src_strd]
    CMP r12,#0

    MVNLT r12,#0
    SUB r11,r9,r14  @pu1_src[0] - pu1_src[2 + src_strd]

    MOVGT r12,#1  @SIGN(pu1_src[0] - pu1_src_top_left[0])

    CMP r11,#0
    MVNLT r11,#0
    LDR r14, gi1_table_edge_idx_addr_1  @table pointer
ulbl1:
    add r14,r14,pc
    MOVGT r11,#1  @SIGN(pu1_src[0] - pu1_src[2 + src_strd])

    ADD r11,r12,r11  @SIGN(pu1_src[0] - pu1_src_top_left[0]) + SIGN(pu1_src[0] - pu1_src[2 + src_strd])
    ADD r11,r11,#2  @edge_idx

    LDRSB r12,[r14,r11]  @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP r12,#0  @0 != edge_idx
    BEQ PU1_AVAIL_4_LOOP_V
    LDRSB r11,[r6,r12]  @pi1_sao_offset_u[edge_idx]
    ADD r9,r9,r11  @pu1_src[0] + pi1_sao_offset_u[edge_idx]
    USAT r9,#8,r9  @u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_4_LOOP_V:

    LDRB r11,[r4,#1]  @pu1_src_top_left[1]
    ADD r14,r0,r1  @pu1_src + src_strd

    SUB r12,r10,r11  @pu1_src[1] - pu1_src_top_left[1]
    LDRB r14,[r14,#3]  @pu1_src[3 + src_strd]

    CMP r12,#0
    MVNLT r12,#0
    SUB r11,r10,r14  @pu1_src[1] - pu1_src[3 + src_strd]
    MOVGT r12,#1  @SIGN(pu1_src[1] - pu1_src_top_left[1])

    CMP r11,#0
    MVNLT r11,#0
    LDR r14, gi1_table_edge_idx_addr_2  @table pointer
ulbl2:
    add r14,r14,pc
    MOVGT r11,#1  @SIGN(pu1_src[1] - pu1_src[3 + src_strd])

    ADD r11,r12,r11  @SIGN(pu1_src[1] - pu1_src_top_left[1]) + SIGN(pu1_src[1] - pu1_src[3 + src_strd])
    ADD r11,r11,#2  @edge_idx

    LDRSB r12,[r14,r11]  @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP r12,#0  @0 != edge_idx
    BEQ PU1_AVAIL_7_LOOP_U
    LDR r11,[sp,#0x110]  @Loads pi1_sao_offset_v
    LDRSB r11,[r11,r12]  @pi1_sao_offset_v[edge_idx]
    ADD r10,r10,r11  @pu1_src[1] + pi1_sao_offset_v[edge_idx]
    USAT r10,#8,r10  @u1_pos_0_0_tmp_v = CLIP3(pu1_src[1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_7_LOOP_U:
    STRB r10,[sp,#7]
    STRB r9,[sp,#6]

    LDRB r10,[r5,#7]  @pu1_avail[7]
    CMP r10,#0
    SUB r10,r7,#2  @wd - 2
    SUB r11,r8,#1  @ht - 1
    MLA r12,r11,r1,r10  @wd - 2 + (ht - 1) * src_strd
    ADD r12,r12,r0  @pu1_src[wd - 2 + (ht - 1) * src_strd]
    LDRB r10,[r12]  @u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd]
    LDRB r9,[r12,#1]  @u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd]
    BEQ PU1_AVAIL_3_LOOP

    SUB r11,r12,r1  @pu1_src[(wd - 2 + (ht - 1) * src_strd) - src_strd]
    SUB r11,r11,#2  @pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]
    LDRB r11,[r11]  @Load pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]
    SUB r11,r10,r11  @pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]
    CMP r11,#0
    MVNLT r11,#0
    MOVGT r11,#1  @SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd])

    ADD r14,r12,r1  @pu1_src[(wd - 2 + (ht - 1) * src_strd) + src_strd]
    ADD r14,r14,#2  @pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
    LDRB r14,[r14]  @Load pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
    SUB r14,r10,r14  @pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
    CMP r14,#0
    MVNLT r14,#0
    MOVGT r14,#1  @SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd])

    ADD r11,r11,r14  @Add 2 sign value
    ADD r11,r11,#2  @edge_idx
    LDR r14, gi1_table_edge_idx_addr_3  @table pointer
ulbl3:
    add r14,r14,pc

    LDRSB r14,[r14,r11]  @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP r14,#0
    BEQ PU1_AVAIL_7_LOOP_V
    LDRSB r11,[r6,r14]  @pi1_sao_offset_u[edge_idx]
    ADD r10,r10,r11  @pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx]
    USAT r10,#8,r10  @u1_pos_wd_ht_tmp_u = CLIP3(pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_7_LOOP_V:
    ADD r12,r12,#1
    SUB r11,r12,r1  @pu1_src[(wd - 1 + (ht - 1) * src_strd) - src_strd]
    SUB r11,r11,#2  @pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]
    LDRB r11,[r11]  @Load pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]
    SUB r11,r9,r11  @pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]
    CMP r11,#0
    MVNLT r11,#0
    MOVGT r11,#1  @SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd])

    ADD r14,r12,r1  @pu1_src[(wd - 1 + (ht - 1) * src_strd) + src_strd]
    ADD r14,r14,#2  @pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
    LDRB r14,[r14]  @Load pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
    SUB r14,r9,r14  @pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
    CMP r14,#0
    MVNLT r14,#0
    MOVGT r14,#1  @SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd])

    ADD r11,r11,r14  @Add 2 sign value
    ADD r11,r11,#2  @edge_idx
    LDR r14, gi1_table_edge_idx_addr_4  @table pointer
ulbl4:
    add r14,r14,pc

    LDRSB r12,[r14,r11]  @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP r12,#0
    BEQ PU1_AVAIL_3_LOOP
    LDR r14,[sp,#0x110]  @Loads pi1_sao_offset_v
    LDRSB r11,[r14,r12]  @pi1_sao_offset_v[edge_idx]
    ADD r9,r9,r11  @pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset_v[edge_idx]
    USAT r9,#8,r9  @u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_3_LOOP:
    STRB r10,[sp,#8]
    VMOV.I8 Q0,#2  @const_2 = vdupq_n_s8(2)
    STRB r9,[sp,#9]

    MOV r12,r8  @Move ht
    VMOV.I16 Q1,#0  @const_min_clip = vdupq_n_s16(0)
    MOV r14,r2  @Move pu1_src_left to pu1_src_left_cpy

    LDRB r11,[r5,#3]  @pu1_avail[3]
    VMOV.I16 Q2,#255  @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    CMP r11,#0

    SUBEQ r12,r12,#1  @ht_tmp--
    LDRB r5,[r5,#2]  @pu1_avail[2]

    CMP r5,#0

    ADDEQ r0,r0,r1  @pu1_src += src_strd
    VLD1.8 D6,[r6]  @offset_tbl_u = vld1_s8(pi1_sao_offset_u)
    SUBEQ r12,r12,#1  @ht_tmp--

    LDR r6,[sp,#0x110]  @Loads pi1_sao_offset_v
    ADDEQ r14,r14,#2  @pu1_src_left_cpy += 2

    STR r0,[sp,#2]  @Store pu1_src in sp
    VLD1.8 D7,[r6]  @offset_tbl_v = vld1_s8(pi1_sao_offset_v)
    LDR r2, gi1_table_edge_idx_addr_5  @table pointer
ulbl5:
    add r2,r2,pc

    MOV r6,r7  @move wd to r6 loop_count
    VMOV.S8 Q4,#0xFF  @au1_mask = vdupq_n_s8(-1)
    CMP r7,#16  @Compare wd with 16

    BLT WIDTH_RESIDUE  @If wd < 16 jump to WIDTH_RESIDUE where loop is unrolled for the 8 pixel case
    CMP r8,#4  @Compare ht with 4
    BLE WD_16_HT_4_LOOP  @If ht <= 4 jump to WD_16_HT_4_LOOP
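
@ The three processing paths selected by the compares above (a rough map):
@   WIDTH_LOOP_16   - main path for wd >= 16 and ht > 4; processes 16 bytes
@                     (8 interleaved U/V pairs) per row, with rows software
@                     pipelined in PU1_SRC_LOOP
@   WD_16_HT_4_LOOP - 16 bytes per row without the row pipelining, for ht <= 4
@   WIDTH_RESIDUE   - 8-byte tail, taken when wd < 16 or for the last 8 columns
@ All three build sign_up/sign_down masks, form edge_idx, look up the U and V
@ offsets through VTBL on D6/D7 after a VUZP/VZIP de-interleave, then clip and
@ store the row.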

WIDTH_LOOP_16:
    LDR r5,[sp,#0x108]  @Loads pu1_avail
    LDR r7,[sp,#0x114]  @Loads wd
    CMP r6,r7  @col == wd
    LDRBEQ r8,[r5]  @pu1_avail[0]

    MOVNE r8,#-1
    VMOV.8 D8[0],r8  @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    CMP r6,#16  @if(col == 16)
    VMOV.8 D8[1],r8  @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    BNE SKIP_AU1_MASK_VAL
    LDRB r8,[r5,#1]  @pu1_avail[1]
    VMOV.8 D9[6],r8  @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    VMOV.8 D9[7],r8  @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL:
    LDRB r9,[r5,#2]  @pu1_avail[2]
    VLD1.8 D12,[r0]!  @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8 D13,[r0]  @pu1_cur_row = vld1q_u8(pu1_src)
    SUB r0,#8
    CMP r9,#0

    LDR r4,[sp,#0x118]  @Loads ht
    SUBEQ r8,r0,r1  @pu1_src - src_strd

    LDR r7,[sp,#0x114]  @Loads wd
    MOVNE r8,r3  @pu1_src_top_cpy

    SUB r8,r8,#2  @pu1_src - src_strd - 2
    ADD r3,r3,#16

    ADD r5,sp,#0x4B  @*au1_src_left_tmp
    VLD1.8 D10,[r8]!  @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
    VLD1.8 D11,[r8]  @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
    SUB r8,#8
    SUB r7,r7,r6  @(wd - col)

    ADD r7,r7,#14  @15 + (wd - col)
    VCGT.U8 Q7,Q6,Q5  @vcgtq_u8(pu1_cur_row, pu1_top_row)
    LDR r8,[sp,#0x100]  @Loads *pu1_src

    ADD r7,r8,r7  @pu1_src[0 * src_strd + 15 + (wd - col)]
    VCLT.U8 Q8,Q6,Q5  @vcltq_u8(pu1_cur_row, pu1_top_row)

AU1_SRC_LEFT_LOOP:
    LDRH r8,[r7]  @load the value and increment by src_strd
    SUBS r4,r4,#1  @decrement the loop count

    STRH r8,[r5],#2  @store it in the stack pointer
    ADD r7,r7,r1

    BNE AU1_SRC_LEFT_LOOP

    ADD r8,r0,r1  @I *pu1_src + src_strd
    VSUB.U8 Q7,Q8,Q7  @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV r7,r12  @row count, move ht_tmp to r7

    VLD1.8 D16,[r8]!  @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8 D17,[r8]  @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB r8,#8

    ADD r8,r8,#16  @I
    VMOV.I8 Q9,#0
    LDRH r5,[r8]  @I pu1_src_cpy[src_strd + 16]

    LDR r10,[sp,#0x108]  @I Loads pu1_avail
    VMOV.16 D18[0],r5  @I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    LDRB r10,[r10,#2]  @I pu1_avail[2]

    CMP r10,#0  @I
    VEXT.8 Q9,Q8,Q9,#2  @I pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
    BNE SIGN_UP_CHANGE_DONE  @I

    LDRB r11,[r0]  @I pu1_src_cpy[0]
    SUB r4,r12,r7  @I ht_tmp - row

    LDRB r10,[r0,#1]  @I pu1_src_cpy[1]
    LSL r4,r4,#1  @I (ht_tmp - row) * 2

    ADD r9,r14,r4  @I pu1_src_left_cpy[(ht_tmp - row) * 2]
    LDRB r5,[r9,#-2]  @I load the value

    SUB r8,r11,r5  @I pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    LDRB r5,[r9,#-1]  @I load the value

    CMP r8,#0  @I
    SUB r4,r10,r5  @I pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]

    MVNLT r8,#0  @I
    MOVGT r8,#1  @I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])

    CMP r4,#0  @I
    VMOV.8 D14[0],r8  @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
    MVNLT r4,#0  @I

    MOVGT r4,#1  @I SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    VMOV.8 D14[1],r4  @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)

SIGN_UP_CHANGE_DONE:
    VLD1.8 D30,[r2]  @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    VCGT.U8 Q10,Q6,Q9  @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    VCLT.U8 Q11,Q6,Q9  @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8 Q11,Q11,Q10  @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8 Q9,Q0,Q7  @I edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8 Q9,Q9,Q11  @I edge_idx = vaddq_s8(edge_idx, sign_down)

    VTBL.8 D18,{D30},D18  @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VNEG.S8 Q7,Q11  @I sign_up = vnegq_s8(sign_down)

    VTBL.8 D19,{D30},D19  @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8 Q7,Q7,Q7,#14  @I sign_up = vextq_s8(sign_up, sign_up, 14)

    VMOVL.U8 Q10,D12  @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VAND Q11,Q9,Q4  @I edge_idx = vandq_s8(edge_idx, au1_mask)

    VMOVL.U8 Q9,D13  @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VUZP.8 D22,D23  @I

    VTBL.8 D22,{D6},D22  @I
    VTBL.8 D23,{D7},D23  @I
    VZIP.8 D22,D23  @I

    VMOV Q6,Q8  @I pu1_cur_row = pu1_next_row
    VADDW.S8 Q10,Q10,D22  @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMAX.S16 Q10,Q10,Q1  @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16 Q10,Q10,Q2  @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VADDW.S8 Q9,Q9,D23  @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VMAX.S16 Q9,Q9,Q1  @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    VMIN.U16 Q9,Q9,Q2  @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    SUB r7,r7,#1  @I Decrement the ht_tmp loop count by 1


PU1_SRC_LOOP:
    ADD r8,r0,r1,LSL #1  @II *pu1_src + src_strd
    VMOVN.I16 D20,Q10  @I vmovn_s16(pi2_tmp_cur_row.val[0])
    ADD r11,r8,r1  @III *pu1_src + src_strd

    VLD1.8 D16,[r8]!  @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8 D17,[r8]  @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB r8,#8
    VLD1.8 D30,[r11]!  @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8 D31,[r11]  @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB r11,#8

    ADD r8,r8,#16  @II
    VMOVN.I16 D21,Q9  @I vmovn_s16(pi2_tmp_cur_row.val[1])
    LDRH r5,[r8]  @II pu1_src_cpy[src_strd + 16]

    ADD r11,r11,#16  @III
    VMOV.16 D28[0],r5  @II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    LDRH r4,[r11]  @III pu1_src_cpy[src_strd + 16]

    LDRB r8,[r0,r1]  @II pu1_src_cpy[0]
    VEXT.8 Q14,Q8,Q14,#2  @II pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
    SUB r5,r12,r7  @II ht_tmp - row

    LSL r5,r5,#1  @II (ht_tmp - row) * 2
    VMOV.16 D18[0],r4  @III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    ADD r9,r14,r5  @II pu1_src_left_cpy[(ht_tmp - row) * 2]

    LDRB r11,[r9,#-2]  @II load the value
    VST1.8 {Q10},[r0],r1  @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
    SUB r8,r8,r11  @II pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]

    CMP r8,#0  @II
    VEXT.8 Q9,Q15,Q9,#2  @III pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
    LDRB r11,[r0,#1]  @II pu1_src_cpy[1]

    MVNLT r8,#0  @II
    VCGT.U8 Q11,Q6,Q14  @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    MOVGT r8,#1  @II SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])

    LDRB r5,[r9,#-1]  @II load the value
    VMOV.8 D14[0],r8  @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
    SUB r7,r7,#1  @II Decrement the ht_tmp loop count by 1

    SUB r11,r11,r5  @II pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    VCLT.U8 Q12,Q6,Q14  @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    CMP r11,#0  @II

    MVNLT r11,#0  @II
    VSUB.U8 Q12,Q12,Q11  @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOVGT r11,#1  @II SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])

    LDRB r4,[r0,r1]  @III pu1_src_cpy[0]
    VLD1.8 D22,[r2]  @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    SUB r5,r12,r7  @III ht_tmp - row

    ADD r10,r0,r1
    VMOV.8 D14[1],r11  @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
    LSL r5,r5,#1  @III (ht_tmp - row) * 2

    ADD r9,r14,r5  @III pu1_src_left_cpy[(ht_tmp - row) * 2]
    VADD.I8 Q13,Q0,Q7  @II edge_idx = vaddq_s8(const_2, sign_up)
    LDRB r10,[r10,#1]  @III pu1_src_cpy[1]

    LDRB r5,[r9,#-2]  @III load the value
    VADD.I8 Q13,Q13,Q12  @II edge_idx = vaddq_s8(edge_idx, sign_down)
    SUB r4,r4,r5  @III pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]

    CMP r4,#0  @III
    LDRB r9,[r9,#-1]  @III load the value
    VTBL.8 D26,{D22},D26  @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VNEG.S8 Q7,Q12  @II sign_up = vnegq_s8(sign_down)

    MVNLT r4,#0  @III
    SUB r10,r10,r9  @III pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    VTBL.8 D27,{D22},D27  @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8 Q7,Q7,Q7,#14  @II sign_up = vextq_s8(sign_up, sign_up, 14)

    MOVGT r4,#1  @III SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    VAND Q13,Q13,Q4  @II edge_idx = vandq_s8(edge_idx, au1_mask)
    CMP r10,#0  @III

    VUZP.8 D26,D27  @II
    VMOV.8 d14[0],r4  @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)

    MVNLT r10,#0  @III
    MOVGT r10,#1  @III SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    VTBL.8 D24,{D6},D26  @II
    VCGT.U8 Q10,Q8,Q9  @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    VCLT.U8 Q11,Q8,Q9  @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VTBL.8 D25,{D7},D27  @II
    VSUB.U8 Q11,Q11,Q10  @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VMOV.8 D14[1],r10  @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
    VZIP.8 D24,D25  @II

    VMOVL.U8 Q14,D12  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADD.I8 Q9,Q0,Q7  @III edge_idx = vaddq_s8(const_2, sign_up)

    VLD1.8 D20,[r2]  @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    VADDW.S8 Q14,Q14,D24  @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VADD.I8 Q9,Q9,Q11  @III edge_idx = vaddq_s8(edge_idx, sign_down)
    VMAX.S16 Q14,Q14,Q1  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    VMIN.U16 Q14,Q14,Q2  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    VTBL.8 D18,{D20},D18  @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VNEG.S8 Q7,Q11  @III sign_up = vnegq_s8(sign_down)

    VTBL.8 D19,{D20},D19  @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8 Q7,Q7,Q7,#14  @III sign_up = vextq_s8(sign_up, sign_up, 14)

    VMOVL.U8 Q13,D13  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VAND Q9,Q9,Q4  @III edge_idx = vandq_s8(edge_idx, au1_mask)

    VUZP.8 D18,D19  @III
    VTBL.8 D22,{D6},D18  @III
    VADDW.S8 Q13,Q13,D25  @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMOV Q6,Q15  @III pu1_cur_row = pu1_next_row
    VTBL.8 D23,{D7},D19  @III
    VMAX.S16 Q13,Q13,Q1  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    VMOVL.U8 Q10,D16  @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VMIN.U16 Q13,Q13,Q2  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VZIP.8 D22,D23  @III
    VMOVN.I16 D28,Q14  @II vmovn_s16(pi2_tmp_cur_row.val[0])

    VMOVN.I16 D29,Q13  @II vmovn_s16(pi2_tmp_cur_row.val[1])
    VADDW.S8 Q10,Q10,D22  @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMOVL.U8 Q9,D17  @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VMAX.S16 Q10,Q10,Q1  @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    VMIN.U16 Q10,Q10,Q2  @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    VADDW.S8 Q9,Q9,D23  @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    SUB r7,r7,#1  @III Decrement the ht_tmp loop count by 1
    VMAX.S16 Q9,Q9,Q1  @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    CMP r7,#1

    VST1.8 {Q14},[r0],r1  @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    VMIN.U16 Q9,Q9,Q2  @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    BGT PU1_SRC_LOOP  @If not equal jump to PU1_SRC_LOOP
    BLT INNER_LOOP_DONE

    ADD r8,r0,r1,LSL #1  @*pu1_src + src_strd
    VMOVN.I16 D20,Q10  @III vmovn_s16(pi2_tmp_cur_row.val[0])

    LDRB r11,[r0,r1]  @pu1_src_cpy[0]
    VLD1.8 D16,[r8]!  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8 D17,[r8]  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB r8,#8
    SUB r4,r12,r7  @ht_tmp - row

    ADD r8,r8,#16
    VMOVN.I16 D21,Q9  @III vmovn_s16(pi2_tmp_cur_row.val[1])
    LDRH r5,[r8]  @pu1_src_cpy[src_strd + 16]

    LSL r4,r4,#1  @(ht_tmp - row) * 2
    VMOV.16 D18[0],r5  @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    ADD r9,r14,r4  @pu1_src_left_cpy[(ht_tmp - row) * 2]

    LDRB r5,[r9,#-2]  @load the value
    VEXT.8 Q9,Q8,Q9,#2  @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
    SUB r8,r11,r5  @pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]

    CMP r8,#0
    VST1.8 {Q10},[r0],r1  @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    MVNLT r8,#0

    MOVGT r8,#1  @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    VLD1.8 D30,[r2]  @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)

    LDRB r11,[r0,#1]  @pu1_src_cpy[1]
    VMOV.8 D14[0],r8  @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
    LDRB r5,[r9,#-1]  @load the value

    SUB r4,r11,r5  @pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    VCGT.U8 Q11,Q6,Q9  @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    CMP r4,#0

    MVNLT r4,#0
    VCLT.U8 Q12,Q6,Q9  @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    MOVGT r4,#1  @SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])

    VMOV.8 D14[1],r4  @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
    VSUB.U8 Q12,Q12,Q11  @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8 Q13,Q0,Q7  @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8 Q13,Q13,Q12  @edge_idx = vaddq_s8(edge_idx, sign_down)

    VTBL.8 D26,{D30},D26  @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8 D27,{D30},D27  @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VMOVL.U8 Q10,D12  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VAND Q13,Q13,Q4  @edge_idx = vandq_s8(edge_idx, au1_mask)

    VMOVL.U8 Q9,D13  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VUZP.8 D26,D27

    VTBL.8 D24,{D6},D26
    VTBL.8 D25,{D7},D27
    VZIP.8 D24,D25

    VADDW.S8 Q10,Q10,D24  @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16 Q10,Q10,Q1  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16 Q10,Q10,Q2  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VADDW.S8 Q9,Q9,D25  @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VMAX.S16 Q9,Q9,Q1  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16 Q9,Q9,Q2  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))


INNER_LOOP_DONE:
    LDR r8,[sp,#0x118]  @Loads ht
    VMOVN.I16 D20,Q10  @vmovn_s16(pi2_tmp_cur_row.val[0])
    ADD r5,sp,#0x4B  @*au1_src_left_tmp

    LDR r11,[sp,#0x104]  @Loads *pu1_src_left
    VMOVN.I16 D21,Q9  @vmovn_s16(pi2_tmp_cur_row.val[1])

SRC_LEFT_LOOP:
    LDR r7,[r5],#4  @au1_src_left_tmp[row]
    SUBS r8,r8,#2
    STR r7,[r11],#4  @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE SRC_LEFT_LOOP

    SUBS r6,r6,#16  @Decrement the wd loop count by 16
    VST1.8 {Q10},[r0],r1  @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    CMP r6,#8  @Check whether residue remains

    BLT RE_ASSINING_LOOP  @Jump to re-assigning loop
    LDR r7,[sp,#0x114]  @Loads wd
    LDR r0,[sp,#0x02]  @Loads *pu1_src
    SUB r7,r7,r6
    ADD r0,r0,r7
    BGT WIDTH_LOOP_16  @If not equal jump to width_loop
    BEQ WIDTH_RESIDUE  @If residue remains jump to residue loop


WD_16_HT_4_LOOP:
    LDR r5,[sp,#0x108]  @Loads pu1_avail
    LDR r7,[sp,#0x114]  @Loads wd
    CMP r6,r7  @col == wd
    LDRBEQ r8,[r5]  @pu1_avail[0]

    MOVNE r8,#-1
    VMOV.8 D8[0],r8  @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    VMOV.8 D8[1],r8  @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    CMP r6,#16  @if(col == 16)
    BNE SKIP_AU1_MASK_VAL_WD_16_HT_4
    LDRB r8,[r5,#1]  @pu1_avail[1]
    VMOV.8 D9[6],r8  @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    VMOV.8 D9[7],r8  @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL_WD_16_HT_4:
    LDRB r8,[r5,#2]  @pu1_avail[2]
    CMP r8,#0

    SUBEQ r8,r0,r1  @pu1_src - src_strd
    MOVNE r8,r3  @pu1_src_top_cpy
    SUB r8,r8,#2  @pu1_src - src_strd - 2
    VLD1.8 D10,[r8]!  @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
    VLD1.8 D11,[r8]  @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
    SUB r8,#8

    ADD r3,r3,#16
    ADD r5,sp,#0x4B  @*au1_src_left_tmp
    LDR r4,[sp,#0x118]  @Loads ht
    LDR r7,[sp,#0x114]  @Loads wd
    SUB r7,r7,r6  @(wd - col)
    ADD r7,r7,#14  @15 + (wd - col)
    LDR r8,[sp,#0x100]  @Loads *pu1_src
    ADD r7,r8,r7  @pu1_src[0 * src_strd + 15 + (wd - col)]

AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    LDRH r8,[r7]  @load the value and increment by src_strd
    STRH r8,[r5],#2  @store it in the stack pointer
    ADD r7,r7,r1

    SUBS r4,r4,#1  @decrement the loop count
    BNE AU1_SRC_LEFT_LOOP_WD_16_HT_4

    VLD1.8 D12,[r0]!  @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8 D13,[r0]  @pu1_cur_row = vld1q_u8(pu1_src)
    SUB r0,#8

    VCGT.U8 Q7,Q6,Q5  @vcgtq_u8(pu1_cur_row, pu1_top_row)
    VCLT.U8 Q8,Q6,Q5  @vcltq_u8(pu1_cur_row, pu1_top_row)
    VSUB.U8 Q7,Q8,Q7  @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    VMOV.I8 Q9,#0
    MOV r7,r12  @row count, move ht_tmp to r7

PU1_SRC_LOOP_WD_16_HT_4:
    VMOV.I8 Q9,#0
    ADD r8,r0,r1  @*pu1_src + src_strd
    VLD1.8 D16,[r8]!  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8 D17,[r8]  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB r8,#8

    ADD r8,r8,#16
    LDRH r5,[r8]  @pu1_src_cpy[src_strd + 16]
    VMOV.16 D18[0],r5  @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    VEXT.8 Q9,Q8,Q9,#2  @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)

    CMP r7,r12
    BLT SIGN_UP_CHANGE_WD_16_HT_4
    LDR r5,[sp,#0x108]  @Loads pu1_avail
    LDRB r5,[r5,#2]  @pu1_avail[2]
    CMP r5,#0
    BNE SIGN_UP_CHANGE_DONE_WD_16_HT_4

SIGN_UP_CHANGE_WD_16_HT_4:
    LDRB r8,[r0]  @pu1_src_cpy[0]
    SUB r5,r12,r7  @ht_tmp - row
    LSL r5,r5,#1  @(ht_tmp - row) * 2
    ADD r9,r14,r5  @pu1_src_left_cpy[(ht_tmp - row) * 2]
    LDRB r5,[r9,#-2]  @load the value
    SUB r8,r8,r5  @pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    CMP r8,#0
    MVNLT r8,#0
    MOVGT r8,#1  @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    VMOV.8 d14[0],r8  @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)

    LDRB r8,[r0,#1]  @pu1_src_cpy[1]
    LDRB r5,[r9,#-1]  @load the value
    SUB r8,r8,r5  @pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    CMP r8,#0
    MVNLT r8,#0
    MOVGT r8,#1  @SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    VMOV.8 d14[1],r8  @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)

SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    VCGT.U8 Q11,Q6,Q9  @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    VCLT.U8 Q12,Q6,Q9  @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8 Q12,Q12,Q11  @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8 Q13,Q0,Q7  @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8 Q13,Q13,Q12  @edge_idx = vaddq_s8(edge_idx, sign_down)

    VLD1.8 D22,[r2]  @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    VTBL.8 D26,{D22},D26  @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8 D27,{D22},D27  @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND Q13,Q13,Q4  @edge_idx = vandq_s8(edge_idx, au1_mask)

    VNEG.S8 Q7,Q12  @sign_up = vnegq_s8(sign_down)
    VEXT.8 Q7,Q7,Q7,#14  @sign_up = vextq_s8(sign_up, sign_up, 14)

    VUZP.8 D26,D27
    VTBL.8 D24,{D6},D26
    VTBL.8 D25,{D7},D27
    VZIP.8 D24,D25

    VMOVL.U8 Q14,D12  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADDW.S8 Q14,Q14,D24  @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16 Q14,Q14,Q1  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16 Q14,Q14,Q2  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOVL.U8 Q13,D13  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VADDW.S8 Q13,Q13,D25  @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VMAX.S16 Q13,Q13,Q1  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16 Q13,Q13,Q2  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VMOVN.I16 D28,Q14  @vmovn_s16(pi2_tmp_cur_row.val[0])
    VMOVN.I16 D29,Q13  @vmovn_s16(pi2_tmp_cur_row.val[1])

    VST1.8 {Q14},[r0],r1  @vst1q_u8(pu1_src_cpy, pu1_cur_row)

    VMOV Q6,Q8  @pu1_cur_row = pu1_next_row
    SUBS r7,r7,#1  @Decrement the ht_tmp loop count by 1
    BNE PU1_SRC_LOOP_WD_16_HT_4  @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4

    LDR r8,[sp,#0x118]  @Loads ht
    ADD r5,sp,#0x4B  @*au1_src_left_tmp
    LDR r11,[sp,#0x104]  @Loads *pu1_src_left

SRC_LEFT_LOOP_WD_16_HT_4:
    LDR r7,[r5],#4  @au1_src_left_tmp[row]
    STR r7,[r11],#4  @pu1_src_left[row] = au1_src_left_tmp[row]

    SUBS r8,r8,#2
    BNE SRC_LEFT_LOOP_WD_16_HT_4


    SUBS r6,r6,#16  @Decrement the wd loop count by 16
    BLE RE_ASSINING_LOOP  @Jump to re-assigning loop
    BGT WD_16_HT_4_LOOP


WIDTH_RESIDUE:
    LDR r7,[sp,#0x114]  @Loads wd
    LDR r5,[sp,#0x108]  @Loads pu1_avail
    CMP r6,r7  @wd_residue == wd
    LDRBEQ r8,[r5]  @pu1_avail[0]

    MOVNE r8,#-1
    VMOV.8 d8[0],r8  @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    VMOV.8 d8[1],r8  @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    LDRB r8,[r5,#1]  @pu1_avail[1]
    VMOV.8 d8[6],r8  @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    VMOV.8 d8[7],r8  @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

    LDRB r8,[r5,#2]  @pu1_avail[2]
    CMP r8,#0

    SUBEQ r8,r0,r1  @pu1_src - src_strd
    MOVNE r8,r3
    SUB r8,r8,#2  @pu1_src - src_strd - 2
    VLD1.8 D10,[r8]!  @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
    VLD1.8 D11,[r8]  @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
    SUB r8,#8

    ADD r5,sp,#0x4B  @*au1_src_left_tmp
    LDR r4,[sp,#0x118]  @Loads ht
    LDR r7,[sp,#0x114]  @Loads wd
    LDR r8,[sp,#0x100]  @Loads *pu1_src
    SUB r7,r7,#2  @(wd - 2)
    ADD r7,r8,r7  @pu1_src[0 * src_strd + (wd - 2)]

AU1_SRC_LEFT_LOOP_RESIDUE:
    LDRH r8,[r7]  @load the value and increment by src_strd
    STRH r8,[r5],#2  @store it in the stack pointer
    ADD r7,r7,r1
    SUBS r4,r4,#1  @decrement the loop count
    BNE AU1_SRC_LEFT_LOOP_RESIDUE

    VLD1.8 D12,[r0]!  @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8 D13,[r0]  @pu1_cur_row = vld1q_u8(pu1_src)
    SUB r0,#8

    VCGT.U8 Q7,Q6,Q5  @vcgtq_u8(pu1_cur_row, pu1_top_row)
    VCLT.U8 Q8,Q6,Q5  @vcltq_u8(pu1_cur_row, pu1_top_row)
    VSUB.U8 Q7,Q8,Q7  @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV r7,r12  @row count, move ht_tmp to r7

PU1_SRC_LOOP_RESIDUE:
    VMOV.I8 Q9,#0
    ADD r8,r0,r1  @*pu1_src + src_strd
    VLD1.8 D16,[r8]!  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8 D17,[r8]  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB r8,#8

    ADD r8,r8,#16
    LDRH r5,[r8]  @pu1_src_cpy[src_strd + 16]
    VMOV.16 D18[0],r5  @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    VEXT.8 Q9,Q8,Q9,#2  @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)

    CMP r7,r12
    BLT SIGN_UP_CHANGE_RESIDUE
    LDR r5,[sp,#0x108]  @Loads pu1_avail
    LDRB r5,[r5,#2]  @pu1_avail[2]
    CMP r5,#0
    BNE SIGN_UP_CHANGE_DONE_RESIDUE

SIGN_UP_CHANGE_RESIDUE:
    LDRB r8,[r0]  @pu1_src_cpy[0]
    SUB r5,r12,r7  @ht_tmp - row
    LSL r5,r5,#1  @(ht_tmp - row) * 2
    ADD r9,r14,r5  @pu1_src_left_cpy[(ht_tmp - row) * 2]
    LDRB r5,[r9,#-2]  @load the value
    SUB r8,r8,r5  @pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    CMP r8,#0
    MVNLT r8,#0
    MOVGT r8,#1  @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    VMOV.8 d14[0],r8  @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)

    LDRB r8,[r0,#1]  @pu1_src_cpy[1]
    LDRB r5,[r9,#-1]  @load the value
    SUB r8,r8,r5  @pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    CMP r8,#0
    MVNLT r8,#0
    MOVGT r8,#1  @SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    VMOV.8 d14[1],r8  @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)

SIGN_UP_CHANGE_DONE_RESIDUE:
    VCGT.U8 Q11,Q6,Q9  @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    VCLT.U8 Q12,Q6,Q9  @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8 Q12,Q12,Q11  @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8 Q13,Q0,Q7  @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8 Q13,Q13,Q12  @edge_idx = vaddq_s8(edge_idx, sign_down)

    VLD1.8 D22,[r2]  @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    VTBL.8 D26,{D22},D26  @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8 D27,{D22},D27  @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND Q13,Q13,Q4  @edge_idx = vandq_s8(edge_idx, au1_mask)

    VNEG.S8 Q7,Q12  @sign_up = vnegq_s8(sign_down)
    VEXT.8 Q7,Q7,Q7,#14  @sign_up = vextq_s8(sign_up, sign_up, 14)

    VUZP.8 D26,D27
    VTBL.8 D24,{D6},D26
    VTBL.8 D25,{D7},D27
    VZIP.8 D24,D25

    VMOVL.U8 Q14,D12  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADDW.S8 Q14,Q14,D24  @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16 Q14,Q14,Q1  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16 Q14,Q14,Q2  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOVN.I16 D28,Q14  @vmovn_s16(pi2_tmp_cur_row.val[0])

    VST1.8 {D28},[r0],r1  @vst1_u8(pu1_src_cpy, pu1_cur_row)

    VMOV Q6,Q8  @pu1_cur_row = pu1_next_row
    SUBS r7,r7,#1  @Decrement the ht_tmp loop count by 1
    BNE PU1_SRC_LOOP_RESIDUE  @If not equal jump to PU1_SRC_LOOP_RESIDUE

    LDR r8,[sp,#0x118]  @Loads ht
    LDR r11,[sp,#0x104]  @Loads *pu1_src_left
    ADD r5,sp,#0x4B  @*au1_src_left_tmp

SRC_LEFT_LOOP_RESIDUE:
    LDR r7,[r5],#4  @au1_src_left_tmp[row]
    SUBS r8,r8,#2
    STR r7,[r11],#4  @pu1_src_left[row] = au1_src_left_tmp[row]

    BNE SRC_LEFT_LOOP_RESIDUE


RE_ASSINING_LOOP:
    LDR r8,[sp,#0x118]  @Loads ht

    LDR r0,[sp,#0x100]  @Loads *pu1_src
    SUB r8,r8,#1  @ht - 1

    LDR r7,[sp,#0x114]  @Loads wd

    LDRH r9,[sp,#6]
    MLA r6,r8,r1,r7  @wd + (ht - 1) * src_strd

    STRH r9,[r0]  @pu1_src_org[0] = u1_pos_0_0_tmp_u/v
    ADD r6,r0,r6  @pu1_src[wd + (ht - 1) * src_strd]

    LDRH r9,[sp,#8]
    ADD r12,sp,#10
    STRH r9,[r6,#-2]  @pu1_src_org[wd - 2 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp_u/v

    LDR r4,[sp,#0xFC]  @Loads pu1_src_top_left
    LDRH r10,[sp]  @load u1_src_top_left_tmp from stack pointer
    STRH r10,[r4]  @*pu1_src_top_left = u1_src_top_left_tmp
    LDR r3,[sp,#0x10C]  @Loads pu1_src_top

SRC_TOP_LOOP:
    VLD1.8 D0,[r12]!  @pu1_src_top[col] = au1_src_top_tmp[col]
    SUBS r7,r7,#8  @Decrement the width
    VST1.8 D0,[r3]!  @pu1_src_top[col] = au1_src_top_tmp[col]
    BNE SRC_TOP_LOOP

END_LOOPS:
    ADD sp,sp,#0xD4
    LDMFD sp!,{r4-r12,r15}  @Reload the registers from SP