///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_sao_edge_offset_class3_chroma.s
//*
//* @brief
//*  Contains function definitions for SAO edge offset, class 3, applied to
//*  interleaved chroma data. Functions are coded in AArch64 NEON assembly.
//*
//* @author
//*  Parthiban V
//*
//* @par List of Functions:
//*  - ihevc_sao_edge_offset_class3_chroma_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//void ihevc_sao_edge_offset_class3_chroma(UWORD8 *pu1_src,
//                              WORD32 src_strd,
//                              UWORD8 *pu1_src_left,
//                              UWORD8 *pu1_src_top,
//                              UWORD8 *pu1_src_top_left,
//                              UWORD8 *pu1_src_top_right,
//                              UWORD8 *pu1_src_bot_left,
//                              UWORD8 *pu1_avail,
//                              WORD8 *pi1_sao_offset_u,
//                              WORD8 *pi1_sao_offset_v,
//                              WORD32 wd,
//                              WORD32 ht)
//**************Variables Vs Registers*****************************************
//x0 => *pu1_src
//x1 => src_strd
//x2 => *pu1_src_left
//x3 => *pu1_src_top
//x4 => *pu1_src_top_left
//x5 => *pu1_avail
//x6 => *pi1_sao_offset_u
//x9 => *pi1_sao_offset_v
//x7 => wd
//x8 => ht

.text
.p2align 2
.include "ihevc_neon_macros.s"
.globl gi1_table_edge_idx
.globl ihevc_sao_edge_offset_class3_chroma_av8

ihevc_sao_edge_offset_class3_chroma_av8:

    // STMFD sp!,{x4-x12,x14}            //stack stores the values of the arguments

    ldr x8,[sp,#0]
    ldr x9,[sp,#8]
    ldr w10,[sp,#16]
    ldr w11,[sp,#24]

    stp x19, x20,[sp,#-16]!
    stp x21, x22,[sp,#-16]!
    stp x23, x24,[sp,#-16]!
    stp x25, x26,[sp,#-16]!
    stp x27, x28,[sp,#-16]!
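
///**
//* Editor's note: a minimal scalar C sketch of what this routine computes,
//* added for documentation only. Class 3 pairs each pixel with its
//* upper-right and lower-left neighbours (the 45 degree pattern); for
//* interleaved chroma one horizontal pixel step is 2 bytes. SIGN and CLIP3
//* follow the conventions used in the comments below; picture-boundary
//* handling via pu1_avail[] is simplified away here and is done explicitly
//* in the code that follows.
//*
//*     static WORD32 sign(WORD32 x) { return (x > 0) - (x < 0); }
//*
//*     WORD32 row, col;
//*     for(row = 0; row < ht; row++)
//*         for(col = 0; col < wd; col++)
//*         {
//*             WORD32 cur   = pu1_src[row * src_strd + col];
//*             WORD32 up_rt = pu1_src[(row - 1) * src_strd + col + 2];
//*             WORD32 dn_lt = pu1_src[(row + 1) * src_strd + col - 2];
//*             WORD32 edge_idx = gi1_table_edge_idx[2 + sign(cur - up_rt)
//*                                                    + sign(cur - dn_lt)];
//*             WORD8 *pi1_off = (col & 1) ? pi1_sao_offset_v : pi1_sao_offset_u;
//*             if(0 != edge_idx)
//*                 pu1_src[row * src_strd + col] =
//*                     CLIP3(cur + pi1_off[edge_idx], 0, (1 << bit_depth) - 1);
//*         }
//*/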

    mov x15,x4                          // *pu1_src_top_left
    mov x16,x5                          // *pu1_src_top_right
    mov x17,x6                          // *pu1_src_bot_left
    mov x21,x7                          // *pu1_avail
    mov x22,x8                          // *pi1_sao_offset_u
    mov x23,x9                          // *pi1_sao_offset_v
    mov x24,x10                         // wd
    mov x25,x11                         // ht


    mov w7, w24                         //Loads wd
    mov w8, w25                         //Loads ht
    SUB x9,x7,#2                        //wd - 2

    mov x4, x15                         //Loads pu1_src_top_left
    LDRH w10,[x3,x9]                    //pu1_src_top[wd - 2]

    MOV x9,x7                           //Move width to x9 for loop count

    mov x5, x21                         //Loads pu1_avail
    mov x6, x22                         //Loads pi1_sao_offset_u

    mov x22, x3                         //Save pu1_src_top in x22
    SUB sp,sp,#0xE0                     //Decrement the stack pointer to store some temp arr values

    STRH w10,[sp]                       //u1_src_top_left_tmp = pu1_src_top[wd - 2]
    SUB x10,x8,#1                       //ht-1
    madd x11, x10, x1, x0               //pu1_src[(ht - 1) * src_strd + col]
    ADD x12,sp,#10                      //temp array

AU1_SRC_TOP_LOOP:
    LD1 {v0.8b},[x11],#8                //pu1_src[(ht - 1) * src_strd + col]
    SUBS x9,x9,#8                       //Decrement the loop count by 8
    ST1 {v0.8b},[x12],#8                //au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
    BNE AU1_SRC_TOP_LOOP
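
//The 45 degree pattern touches two picture corners that the vector loops
//cannot see: pu1_src[wd - 2]/[wd - 1] (needs pu1_src_top_right) and the
//bottom-left pixel pair (needs pu1_src_bot_left). They are resolved to
//scalar temporaries first and written back at RE_ASSIGNING_LOOP. Editor's
//sketch of the top-right U case handled below (the V case and the
//pu1_avail[6] bottom-left cases are analogous):
//
//    if(pu1_avail[5])
//    {
//        WORD32 cur = pu1_src[wd - 2];
//        WORD32 edge_idx = gi1_table_edge_idx[2
//                    + SIGN(cur - pu1_src_top_right[0])
//                    + SIGN(cur - pu1_src[wd - 2 - 2 + src_strd])];
//        if(0 != edge_idx)
//            u1_pos_0_0_tmp_u = CLIP3(cur + pi1_sao_offset_u[edge_idx],
//                                     0, (1 << bit_depth) - 1);
//    }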

PU1_AVAIL_5_LOOP_U:
    LDRB w9,[x5,#5]                     //pu1_avail[5]
    CMP x9,#0
    SUB x14,x7,#2                       //[wd - 2]
    LDRB w9,[x0,x14]                    //u1_pos_0_0_tmp_u = pu1_src[wd - 2]
    SUB x11,x7,#1                       //[wd - 1]
    LDRB w10,[x0,x11]                   //u1_pos_0_0_tmp_v = pu1_src[wd - 1]
    BEQ PU1_AVAIL_6_LOOP_U

    mov x11, x16                        //Loads pu1_src_top_right
    LDRB w11,[x11]                      //pu1_src_top_right[0]
    SUB x12,x9,x11                      //pu1_src[wd - 2] - pu1_src_top_right[0]
    CMP x12,#0
    movn x20,#0
    csel x12, x20, x12,LT
    MOV x20,#1
    csel x12, x20, x12,GT               //SIGN(pu1_src[wd - 2] - pu1_src_top_right[0])
    ADD x11,x0,x1                       //pu1_src + src_strd
    SUB x14,x14,#2                      //[wd - 2 - 2]
    LDRB w14,[x11,x14]                  //pu1_src[wd - 2 - 2 + src_strd]
    SUB x11,x9,x14                      //pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]
    CMP x11,#0
    movn x20,#0
    csel x11, x20, x11,LT
    MOV x20,#1
    csel x11, x20, x11,GT               //SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd])
    ADD x11,x12,x11                     //SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) + SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd])
    ADD x11,x11,#2                      //edge_idx
    ADRP x14, :got:gi1_table_edge_idx   //table pointer
    LDR x14, [x14, #:got_lo12:gi1_table_edge_idx]

    LDRSB x12,[x14,x11]                 //edge_idx = gi1_table_edge_idx[edge_idx]
    CMP x12,#0                          //0 != edge_idx
    BEQ PU1_AVAIL_5_LOOP_V
    LDRSB x11,[x6,x12]                  //pi1_sao_offset_u[edge_idx]
    ADD x9,x9,x11                       //pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx]
    mov x20,#255
    cmp x9,x20
    csel x9, x20, x9, ge                //u1_pos_0_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1)
    mov x20,#0
    cmp x9,x20
    csel x9, x20, x9, LT                //u1_pos_0_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_5_LOOP_V:

    mov x11, x16                        //Loads pu1_src_top_right
    LDRB w11,[x11,#1]                   //pu1_src_top_right[1]
    SUB x12,x10,x11                     //pu1_src[wd - 1] - pu1_src_top_right[1]
    CMP x12,#0
    movn x20,#0
    csel x12, x20, x12,LT
    MOV x20,#1
    csel x12, x20, x12,GT               //SIGN(pu1_src[wd - 1] - pu1_src_top_right[1])
    ADD x11,x0,x1                       //pu1_src + src_strd
    SUB x14,x7,#3                       //[wd - 1 - 2]
    LDRB w14,[x11,x14]                  //pu1_src[wd - 1 - 2 + src_strd]
    SUB x11,x10,x14                     //pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]
    CMP x11,#0
    movn x20,#0
    csel x11, x20, x11,LT
    MOV x20,#1
    csel x11, x20, x11,GT               //SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd])
    ADD x11,x12,x11                     //SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) + SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd])
    ADD x11,x11,#2                      //edge_idx
    ADRP x14, :got:gi1_table_edge_idx   //table pointer
    LDR x14, [x14, #:got_lo12:gi1_table_edge_idx]

    LDRSB x12,[x14,x11]                 //edge_idx = gi1_table_edge_idx[edge_idx]
    CMP x12,#0                          //0 != edge_idx
    BEQ PU1_AVAIL_6_LOOP_U
    mov x11, x23                        //Loads pi1_sao_offset_v
    LDRSB x11,[x11,x12]                 //pi1_sao_offset_v[edge_idx]
    ADD x10,x10,x11                     //pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx]
    mov x20,#255
    cmp x10,x20
    csel x10, x20, x10, ge              //u1_pos_0_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
    mov x20,#0
    cmp x10,x20
    csel x10, x20, x10, LT              //u1_pos_0_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_6_LOOP_U:
    STRB w9,[sp,#6]
    STRB w10,[sp,#7]
    mov x26, x0                         //Save pu1_src in x26

    LDRB w10,[x5,#6]                    //pu1_avail[6]
    CMP x10,#0
    SUB x11,x8,#1                       //ht - 1
    madd x12, x11, x1, x0               //pu1_src[(ht - 1) * src_strd]
    LDRB w10,[x12]                      //u1_pos_wd_ht_tmp_u = pu1_src[(ht - 1) * src_strd]
    LDRB w9,[x12,#1]                    //u1_pos_wd_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1]
    BEQ PU1_AVAIL_3_LOOP

    SUB x11,x12,x1                      //pu1_src[(ht - 1) * src_strd - src_strd]
    ADD x11,x11,#2                      //pu1_src[(ht - 1) * src_strd + 2 - src_strd]
    LDRB w11,[x11]                      //Load pu1_src[(ht - 1) * src_strd + 2 - src_strd]
    SUB x11,x10,x11                     //pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd]
    CMP x11,#0
    movn x20,#0
    csel x11, x20, x11,LT
    MOV x20,#1
    csel x11, x20, x11,GT               //SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd])

    mov x14, x17                        //Loads pu1_src_bot_left
    LDRB w14,[x14]                      //Load pu1_src_bot_left[0]
    SUB x14,x10,x14                     //pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]
    CMP x14,#0
    movn x20,#0
    csel x14, x20, x14,LT
    MOV x20,#1
    csel x14, x20, x14,GT               //SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0])

    ADD x11,x11,x14                     //Add the 2 sign values
    ADD x11,x11,#2                      //edge_idx
    ADRP x14, :got:gi1_table_edge_idx   //table pointer
    LDR x14, [x14, #:got_lo12:gi1_table_edge_idx]

    LDRSB x14,[x14,x11]                 //edge_idx = gi1_table_edge_idx[edge_idx]
    CMP x14,#0
    BEQ PU1_AVAIL_6_LOOP_V
    LDRSB x11,[x6,x14]                  //pi1_sao_offset_u[edge_idx]
    ADD x10,x10,x11                     //pu1_src[(ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx]
    mov x20,#255
    cmp x10,x20
    csel x10, x20, x10, ge              //u1_pos_wd_ht_tmp_u = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1)
    mov x20,#0
    cmp x10,x20
    csel x10, x20, x10, LT              //u1_pos_wd_ht_tmp_u = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_6_LOOP_V:
    ADD x12,x12,#1                      //pu1_src[(ht - 1) * src_strd + 1]
    SUB x11,x12,x1                      //pu1_src[(ht - 1) * src_strd + 1 - src_strd]
    ADD x11,x11,#2                      //pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]
    LDRB w11,[x11]                      //Load pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]
    SUB x11,x9,x11                      //pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]
    CMP x11,#0
    movn x20,#0
    csel x11, x20, x11,LT
    MOV x20,#1
    csel x11, x20, x11,GT               //SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd])

    mov x14, x17                        //Loads pu1_src_bot_left
    LDRB w14,[x14,#1]                   //Load pu1_src_bot_left[1]
    SUB x14,x9,x14                      //pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]
    CMP x14,#0
    movn x20,#0
    csel x14, x20, x14,LT
    MOV x20,#1
    csel x14, x20, x14,GT               //SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1])

    ADD x11,x11,x14                     //Add the 2 sign values
    ADD x11,x11,#2                      //edge_idx
    ADRP x14, :got:gi1_table_edge_idx   //table pointer
    LDR x14, [x14, #:got_lo12:gi1_table_edge_idx]

    LDRSB x12,[x14,x11]                 //edge_idx = gi1_table_edge_idx[edge_idx]
    CMP x12,#0
    BEQ PU1_AVAIL_3_LOOP
    mov x14, x23                        //Loads pi1_sao_offset_v
    LDRSB x11,[x14,x12]                 //pi1_sao_offset_v[edge_idx]
    ADD x9,x9,x11                       //pu1_src[(ht - 1) * src_strd + 1] + pi1_sao_offset_v[edge_idx]
    mov x20,#255
    cmp x9,x20
    csel x9, x20, x9, ge                //u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd + 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
    mov x20,#0
    cmp x9,x20
    csel x9, x20, x9, LT                //u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd + 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_3_LOOP:
    STRB w10,[sp,#8]
    STRB w9,[sp,#9]
    mov x27, x2                         //Save pu1_src_left in x27

    MOV x12,x8                          //Move ht
    MOV x14,x2                          //Move pu1_src_left to pu1_src_left_cpy
    LDRB w11,[x5,#3]                    //pu1_avail[3]
    CMP x11,#0
    BNE PU1_AVAIL_2_LOOP
    SUB x12,x12,#1                      //ht_tmp--

PU1_AVAIL_2_LOOP:
    LDRB w5,[x5,#2]                     //pu1_avail[2]
    CMP x5,#0
    BNE PU1_AVAIL_2_LOOP_END

    ADD x0,x0,x1                        //pu1_src += src_strd
    SUB x12,x12,#1                      //ht_tmp--
    ADD x14,x14,#2                      //pu1_src_left_cpy += 2

PU1_AVAIL_2_LOOP_END:
    mov x28, x0                         //Save pu1_src in x28
    movi v0.16b, #2                     //const_2 = vdupq_n_s8(2)
    movi v2.8h, #0                      //const_min_clip = vdupq_n_s16(0)
    movi v4.8h, #255                    //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    LD1 {v6.8b},[x6]                    //offset_tbl_u = vld1_s8(pi1_sao_offset_u)
    mov x6, x23                         //Loads pi1_sao_offset_v
    LD1 {v7.8b},[x6]                    //offset_tbl_v = vld1_s8(pi1_sao_offset_v)
    ADRP x2, :got:gi1_table_edge_idx    //table pointer
    LDR x2, [x2, #:got_lo12:gi1_table_edge_idx]

    //VLD1.8 D6,[x6]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    movi v1.16b, #0xFF                  //au1_mask = vdupq_n_s8(-1)
    MOV x6,x7                           //move wd to x6 loop_count

    CMP x7,#16                          //Compare wd with 16
    BLT WIDTH_RESIDUE                   //If wd < 16 jump to WIDTH_RESIDUE, where the loop is unrolled for the 8-pixel case
    CMP x8,#4                           //Compare ht with 4
    BLE WD_16_HT_4_LOOP                 //If ht <= 4 jump to WD_16_HT_4_LOOP
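
//Core vector recipe shared by the three paths below, written as the NEON
//intrinsics the instruction comments mirror (editor's sketch; variable
//names are illustrative, not code from the C reference):
//
//    cmp_gt    = vcgtq_u8(pu1_cur_row, pu1_next_row_tmp);
//    cmp_lt    = vcltq_u8(pu1_cur_row, pu1_next_row_tmp);
//    sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt));
//    edge_idx  = vaddq_s8(vaddq_s8(const_2, sign_up), sign_down);
//    edge_idx  = vandq_s8(vqtbl1q_s8(edge_idx_tbl,
//                                    vreinterpretq_u8_s8(edge_idx)),
//                         au1_mask);
//
//Because U and V are interleaved, UZP1/UZP2 split edge_idx into U lanes and
//V lanes, TBL looks up offset_tbl_u and offset_tbl_v separately, and
//ZIP1/ZIP2 re-interleave the offsets. The offsets are then widened to 16
//bit, added, clipped to [0, (1 << bit_depth) - 1] and narrowed back. The
//next row reuses sign_up = vextq_s8(vnegq_s8(sign_down), vnegq_s8(sign_down), 2),
//i.e. the negated sign_down shifted by one chroma pair.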

WIDTH_LOOP_16:
    mov w7, w24                         //Loads wd
    CMP x6,x7                           //col == wd
    mov x5, x21                         //Loads pu1_avail

    LDRB w20, [x5]                      //pu1_avail[0]
    csel w8,w20,w8,EQ
    MOV x20,#-1
    csel x8, x20, x8,NE

    mov v1.b[0], w8                     //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    LDRB w11,[x5,#2]                    //pu1_avail[2]

    CMP x6,#16                          //if(col == 16)
    mov v1.b[1], w8                     //au1_mask = vsetq_lane_s8(-1, au1_mask, 1)

    BNE SKIP_AU1_MASK_VAL
    LDRB w8,[x5,#1]                     //pu1_avail[1]
    mov v1.b[14], w8                    //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
    mov v1.b[15], w8                    //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL:
    CMP x11,#0
    LD1 {v5.16b},[x0]                   //pu1_cur_row = vld1q_u8(pu1_src)
    //LD1 {v13.8b},[x0]                 //pu1_cur_row = vld1q_u8(pu1_src)
    //SUB x0, x0,#8
    ADD x5,sp,#0x4B                     //*au1_src_left_tmp

    SUB x20,x0,x1                       //pu1_src - src_strd
    csel x8, x20, x8,EQ
    movi v18.16b, #0
    csel x8, x3, x8,NE

    ADD x8,x8,#2                        //pu1_src - src_strd + 2
    LD1 {v3.16b},[x8]                   //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    //LD1 {v11.8b},[x8]                 //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    //SUB x8, x8,#8
    ADD x3,x3,#16

    mov w4, w25                         //Loads ht
    cmhi v17.16b, v5.16b , v3.16b       //vcgtq_u8(pu1_cur_row, pu1_top_row)
    mov w7, w24                         //Loads wd

    SUB x7,x7,x6                        //(wd - col)
    cmhi v16.16b, v3.16b , v5.16b       //vcltq_u8(pu1_cur_row, pu1_top_row)
    ADD x7,x7,#14                       //14 + (wd - col)

    mov x8, x26                         //Loads *pu1_src
    SUB v17.16b, v16.16b , v17.16b      //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    ADD x7,x8,x7                        //pu1_src[0 * src_strd + 14 + (wd - col)]

AU1_SRC_LEFT_LOOP:
    LDRH w8,[x7]                        //load the value and increment by src_strd
    SUBS x4,x4,#1                       //decrement the loop count

    STRH w8,[x5],#2                     //store it in the stack pointer
    ADD x7,x7,x1
    BNE AU1_SRC_LEFT_LOOP


    MOV x7,x12                          //row count, move ht_tmp to x7
    movi v18.16b, #0                    //I
    ADD x11,x0,x1                       //I *pu1_src + src_strd

    SUB x5,x12,x7                       //I ht_tmp - row
    LD1 {v16.16b},[x11]                 //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //LD1 {v17.8b},[x11]                //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //SUB x11, x11,#8
    ADD x8,x14,x5,LSL #1                //I pu1_src_left_cpy[(ht_tmp - row) * 2]

    LDRH w5,[x8,#2]                     //I
    mov v18.h[7], w5                    //I vsetq_lane_u16 (one U/V pair)
    mov x11, x21                        //I Loads pu1_avail

    LDRB w11,[x11,#2]                   //I pu1_avail[2]
    EXT v18.16b, v18.16b , v16.16b,#14  //I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    CMP x11,#0                          //I
    BNE SIGN_UP_CHANGE_DONE             //I

    LDRB w8,[x0,#14]                    //I pu1_src_cpy[14]
    SUB x5,x0,x1                        //I

    LDRB w11,[x5,#16]                   //I load the value pu1_src_cpy[16 - src_strd]

    LDRB w9,[x0,#15]                    //I pu1_src_cpy[15]
    SUB x8,x8,x11                       //I pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]

    LDRB w10,[x5,#17]                   //I load the value pu1_src_cpy[17 - src_strd]
    CMP x8,#0                           //I

    movn x20,#0
    csel x8, x20, x8,LT                 //I
    SUB x9,x9,x10                       //I pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    MOV x20,#1
    csel x8, x20, x8,GT                 //I SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
    CMP x9,#0                           //I

    movn x20,#0
    csel x9, x20, x9,LT                 //I
    mov v17.b[14], w8                   //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
    MOV x20,#1
    csel x9, x20, x9,GT                 //I SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])

    mov v17.b[15], w9                   //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)

SIGN_UP_CHANGE_DONE:
    LD1 {v28.8b},[x2]                   //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    cmhi v20.16b, v5.16b , v18.16b      //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    cmhi v22.16b, v18.16b , v5.16b      //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    SUB v22.16b, v22.16b , v20.16b      //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    ADD v18.16b, v0.16b , v17.16b       //I edge_idx = vaddq_s8(const_2, sign_up)
    ADD v18.16b, v18.16b , v22.16b      //I edge_idx = vaddq_s8(edge_idx, sign_down)
    TBL v18.16b, {v28.16b},v18.16b      //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    NEG v17.16b, v22.16b                //I sign_up = vnegq_s8(sign_down)

    //TBL v19.8b, {v28.16b},v19.8b      //I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    EXT v17.16b, v17.16b , v17.16b,#2   //I sign_up = vextq_s8(sign_up, sign_up, 2)

    Uxtl v20.8h, v5.8b                  //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    AND v18.16b, v18.16b , v1.16b       //I edge_idx = vandq_s8(edge_idx, au1_mask)
    mov v19.d[0],v18.d[1]

    UZP1 v31.8b, v18.8b, v19.8b
    UZP2 v19.8b, v18.8b, v19.8b         //I
    mov v18.8b,v31.8b
    TBL v22.8b, {v6.16b},v18.8b         //I
    TBL v23.8b, {v7.16b},v19.8b         //I
    ZIP1 v31.8b, v22.8b, v23.8b
    ZIP2 v23.8b, v22.8b, v23.8b         //I
    mov v22.8b,v31.8b

    Uxtl2 v18.8h, v5.16b                //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    SADDW v20.8h, v20.8h , v22.8b       //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    SMAX v20.8h, v20.8h , v2.8h         //I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    UMIN v20.8h, v20.8h , v4.8h         //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    mov v5.16b, v16.16b                 //I pu1_cur_row = pu1_next_row
    SADDW v18.8h, v18.8h , v23.8b       //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    SUB x7,x7,#1                        //I Decrement the ht_tmp loop count by 1
    SMAX v18.8h, v18.8h , v2.8h         //I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    UMIN v18.8h, v18.8h , v4.8h         //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

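//PU1_SRC_LOOP below is software pipelined two rows deep: the //I, //II and
atory //III tags on the instruction comments mark which in-flight row an
//instruction belongs to, so the loads, edge computation and stores of
//adjacent rows overlap instead of executing back to back.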

PU1_SRC_LOOP:
    ADD x11,x0,x1,LSL #1                //II *pu1_src + src_strd
    xtn v20.8b, v20.8h                  //I vmovn_s16(pi2_tmp_cur_row.val[0])
    SUB x5,x12,x7                       //II ht_tmp - row

    ADD x4,x0,x1                        //III *pu1_src + src_strd
    xtn2 v20.16b, v18.8h                //I vmovn_s16(pi2_tmp_cur_row.val[1])
    ADD x8,x14,x5,LSL #1                //II pu1_src_left_cpy[(ht_tmp - row) * 2]

    LDRH w9,[x8,#2]
    LD1 {v16.16b},[x11]                 //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //LD1 {v17.8b},[x11]                //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //SUB x11, x11,#8
    LDRB w10,[x4,#14]                   //II pu1_src_cpy[14]

    LDRB w8,[x4,#15]                    //II pu1_src_cpy[15]
    mov v28.h[7], w9                    //II vsetq_lane_u16 (one U/V pair)
    ADD x4,x11,x1                       //III *pu1_src + src_strd

    LDRB w5,[x0,#17]                    //II load the value pu1_src_cpy[17 - src_strd]
    LD1 {v30.16b},[x4]                  //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //LD1 {v31.8b},[x4]                 //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //SUB x4, x4,#8
    LDRB w11,[x0,#16]                   //II load the value pu1_src_cpy[16 - src_strd]

    SUB x7,x7,#1                        //II Decrement the ht_tmp loop count by 1
    ST1 {v20.16b},[x0],x1               //I vst1q_u8(pu1_src_cpy, pu1_cur_row)
    SUB x10,x10,x11                     //II pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]

    CMP x10,#0                          //II
    EXT v28.16b, v28.16b , v16.16b,#14  //II pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    SUB x8,x8,x5                        //II pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    movn x20,#0
    csel x10, x20, x10,LT               //II
    LD1 {v21.8b},[x2]                   //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    MOV x20,#1
    csel x10, x20, x10,GT               //II SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])

    CMP x8,#0                           //II
    mov v17.b[14], w10                  //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
    movn x20,#0
    csel x8, x20, x8,LT                 //II

    MOV x20,#1
    csel x8, x20, x8,GT                 //II SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
    SUB x10,x12,x7                      //III ht_tmp - row
    mov v17.b[15], w8                   //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
    ADD x11,x14,x10,LSL #1              //III pu1_src_left_cpy[(ht_tmp - row) * 2]

    CMP x7,#1                           //III
    cmhi v22.16b, v5.16b , v28.16b      //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    BNE NEXT_ROW_POINTER_ASSIGNED_2     //III

    mov x5, x21                         //III Loads pu1_avail
    LDRB w5,[x5,#3]                     //III pu1_avail[3]
    CMP x5,#0                           //III
    SUB x20,x4,#4                       //III pu1_src[src_strd - 2]
    csel x11, x20, x11,NE

NEXT_ROW_POINTER_ASSIGNED_2:
    LDRH w5,[x11,#2]                    //III
    cmhi v24.16b, v28.16b , v5.16b      //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    ADD x11,x0,x1                       //III

    LDRB w9,[x11,#14]                   //III pu1_src_cpy[14]
    mov v18.h[7], w5                    //III vsetq_lane_u16 (one U/V pair)
    LDRB w8,[x11,#15]                   //III pu1_src_cpy[15]

    LDRB w11,[x0,#16]                   //III load the value pu1_src_cpy[16 - src_strd]
    SUB v24.16b, v24.16b , v22.16b      //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    LDRB w10,[x0,#17]                   //III load the value pu1_src_cpy[17 - src_strd]

    SUB x9,x9,x11                       //III pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
    EXT v18.16b, v18.16b , v30.16b,#14  //III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    SUB x10,x8,x10                      //III pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    CMP x9,#0                           //III
    ADD v26.16b, v0.16b , v17.16b       //II edge_idx = vaddq_s8(const_2, sign_up)
    movn x20,#0
    csel x9, x20, x9,LT                 //III

    MOV x20,#1
    csel x9, x20, x9,GT                 //III SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
    ADD v26.16b, v26.16b , v24.16b      //II edge_idx = vaddq_s8(edge_idx, sign_down)
    CMP x10,#0                          //III

    NEG v17.16b, v24.16b                //II sign_up = vnegq_s8(sign_down)
    TBL v26.16b, {v21.16b},v26.16b      //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    movn x20,#0
    csel x10, x20, x10,LT               //III
    MOV x20,#1
    csel x10, x20, x10,GT               //III SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])

    EXT v17.16b, v17.16b , v17.16b,#2   //II sign_up = vextq_s8(sign_up, sign_up, 2)
    //TBL v27.8b, {v21.16b},v27.8b      //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    cmhi v22.16b, v16.16b , v18.16b     //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    mov v17.b[14], w9                   //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
    AND v26.16b, v26.16b , v1.16b       //II edge_idx = vandq_s8(edge_idx, au1_mask)
    mov v27.d[0],v26.d[1]

    mov v17.b[15], w10                  //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
    UZP1 v31.8b, v26.8b, v27.8b
    UZP2 v27.8b, v26.8b, v27.8b         //II
    mov v26.8b,v31.8b

    cmhi v20.16b, v18.16b , v16.16b     //III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    TBL v24.8b, {v6.16b},v26.8b         //II
    SUB v22.16b, v20.16b , v22.16b      //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    ADD v18.16b, v0.16b , v17.16b       //III edge_idx = vaddq_s8(const_2, sign_up)
    TBL v25.8b, {v7.16b},v27.8b         //II
    ADD v18.16b, v18.16b , v22.16b      //III edge_idx = vaddq_s8(edge_idx, sign_down)

    LD1 {v20.8b},[x2]                   //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    ZIP1 v31.8b, v24.8b, v25.8b
    ZIP2 v25.8b, v24.8b, v25.8b         //II
    mov v24.8b,v31.8b

    Uxtl v28.8h, v5.8b                  //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    TBL v18.16b, {v20.16b},v18.16b      //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    NEG v17.16b, v22.16b                //III sign_up = vnegq_s8(sign_down)

    SADDW v28.8h, v28.8h , v24.8b       //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    //TBL v19.8b, {v20.16b},v19.8b      //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    EXT v17.16b, v17.16b , v17.16b,#2   //III sign_up = vextq_s8(sign_up, sign_up, 2)

    Uxtl2 v26.8h, v5.16b                //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    AND v18.16b, v18.16b , v1.16b       //III edge_idx = vandq_s8(edge_idx, au1_mask)
    mov v19.d[0],v18.d[1]

    Uxtl v20.8h, v16.8b                 //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    UZP1 v31.8b, v18.8b, v19.8b
    UZP2 v19.8b, v18.8b, v19.8b         //III
    mov v18.8b,v31.8b

    SMAX v28.8h, v28.8h , v2.8h         //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    TBL v22.8b, {v6.16b},v18.8b         //III
    UMIN v28.8h, v28.8h , v4.8h         //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    SADDW v26.8h, v26.8h , v25.8b       //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    TBL v23.8b, {v7.16b},v19.8b         //III
    SMAX v26.8h, v26.8h , v2.8h         //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    Uxtl2 v18.8h, v16.16b               //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    ZIP1 v31.8b, v22.8b, v23.8b
    ZIP2 v23.8b, v22.8b, v23.8b         //III
    mov v22.8b,v31.8b

    xtn v28.8b, v28.8h                  //II vmovn_s16(pi2_tmp_cur_row.val[0])
    SADDW v20.8h, v20.8h , v22.8b       //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    mov v5.16b, v30.16b                 //III pu1_cur_row = pu1_next_row
    UMIN v26.8h, v26.8h , v4.8h         //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    SUB x7,x7,#1                        //III Decrement the ht_tmp loop count by 1
    SMAX v20.8h, v20.8h , v2.8h         //III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    CMP x7,#1                           //III

    xtn2 v28.16b, v26.8h                //II vmovn_s16(pi2_tmp_cur_row.val[1])
    UMIN v20.8h, v20.8h , v4.8h         //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    SADDW v18.8h, v18.8h , v23.8b       //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    SMAX v18.8h, v18.8h , v2.8h         //III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    ST1 {v28.16b},[x0],x1               //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    UMIN v18.8h, v18.8h , v4.8h         //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    BGT PU1_SRC_LOOP                    //If more than one row remains, jump back to PU1_SRC_LOOP
    BLT INNER_LOOP_DONE


    ADD x11,x0,x1,LSL #1                //*pu1_src + src_strd
    xtn v20.8b, v20.8h                  //III vmovn_s16(pi2_tmp_cur_row.val[0])
    SUB x5,x12,x7                       //ht_tmp - row

    ADD x8,x14,x5,LSL #1                //pu1_src_left_cpy[(ht_tmp - row) * 2]
    xtn2 v20.16b, v18.8h                //III vmovn_s16(pi2_tmp_cur_row.val[1])
    CMP x7,#1

    LDRB w4,[x0,#16]                    //load the value pu1_src_cpy[16 - src_strd]
    LD1 {v16.16b},[x11]                 //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //LD1 {v17.8b},[x11]                //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //SUB x11, x11,#8
    LDRB w9,[x0,#17]                    //load the value pu1_src_cpy[17 - src_strd]

    BNE NEXT_ROW_POINTER_ASSIGNED_3
    mov x5, x21                         //Loads pu1_avail
    LDRB w5,[x5,#3]                     //pu1_avail[3]
    CMP x5,#0
    SUB x20,x11,#4                      //pu1_src[src_strd - 2]
    csel x8, x20, x8,NE

NEXT_ROW_POINTER_ASSIGNED_3:
    LDRH w5,[x8,#2]
    ST1 {v20.16b},[x0],x1               //III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    LDRB w8,[x0,#14]                    //pu1_src_cpy[14]

    SUB x8,x8,x4                        //pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
    mov v18.h[7], w5                    //vsetq_lane_u16 (one U/V pair)
    LDRB w10,[x0,#15]                   //pu1_src_cpy[15]

    CMP x8,#0
    EXT v18.16b, v18.16b , v16.16b,#14  //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    SUB x10,x10,x9                      //pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    movn x20,#0
    csel x8, x20, x8,LT
    LD1 {v28.8b},[x2]                   //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    MOV x20,#1
    csel x8, x20, x8,GT                 //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])

    CMP x10,#0
    mov v17.b[14], w8                   //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
    movn x20,#0
    csel x10, x20, x10,LT

    MOV x20,#1
    csel x10, x20, x10,GT               //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
    mov v17.b[15], w10                  //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
    cmhi v20.16b, v5.16b , v18.16b      //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    cmhi v22.16b, v18.16b , v5.16b      //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    SUB v22.16b, v22.16b , v20.16b      //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    ADD v18.16b, v0.16b , v17.16b       //edge_idx = vaddq_s8(const_2, sign_up)
    ADD v18.16b, v18.16b , v22.16b      //edge_idx = vaddq_s8(edge_idx, sign_down)
    TBL v18.16b, {v28.16b},v18.16b      //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    //TBL v19.8b, {v28.16b},v19.8b      //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    AND v18.16b, v18.16b , v1.16b       //edge_idx = vandq_s8(edge_idx, au1_mask)
    mov v19.d[0],v18.d[1]

    Uxtl v20.8h, v5.8b                  //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    UZP1 v31.8b, v18.8b, v19.8b
    UZP2 v19.8b, v18.8b, v19.8b
    mov v18.8b,v31.8b

    TBL v22.8b, {v6.16b},v18.8b
    TBL v23.8b, {v7.16b},v19.8b

    Uxtl2 v18.8h, v5.16b                //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    ZIP1 v31.8b, v22.8b, v23.8b
    ZIP2 v23.8b, v22.8b, v23.8b
    mov v22.8b,v31.8b

    SADDW v20.8h, v20.8h , v22.8b       //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    SMAX v20.8h, v20.8h , v2.8h         //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    UMIN v20.8h, v20.8h , v4.8h         //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    SADDW v18.8h, v18.8h , v23.8b       //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    SMAX v18.8h, v18.8h , v2.8h         //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    UMIN v18.8h, v18.8h , v4.8h         //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))


INNER_LOOP_DONE:

    mov w8, w25                         //Loads ht
    xtn v20.8b, v20.8h                  //III vmovn_s16(pi2_tmp_cur_row.val[0])
    ADD x5,sp,#0x4B                     //*au1_src_left_tmp

    LSL x8,x8,#1
    xtn2 v20.16b, v18.8h                //III vmovn_s16(pi2_tmp_cur_row.val[1])
    mov x11, x27                        //Loads *pu1_src_left

SRC_LEFT_LOOP:
    LDR w7, [x5],#4                     //au1_src_left_tmp[row]
    SUBS x8,x8,#4
    STR w7, [x11],#4                    //pu1_src_left[row] = au1_src_left_tmp[row]
    BNE SRC_LEFT_LOOP

    SUBS x6,x6,#16                      //Decrement the wd loop count by 16
    ST1 {v20.16b},[x0],x1               //III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    CMP x6,#8                           //Check whether residue remains

    BLT RE_ASSIGNING_LOOP               //Jump to re-assigning loop
    mov w7, w24                         //Loads wd
    mov x0, x28                         //Loads *pu1_src
    SUB x7,x7,x6
    ADD x0,x0,x7
    BGT WIDTH_LOOP_16                   //If more 16-wide columns remain, jump to WIDTH_LOOP_16
    BEQ WIDTH_RESIDUE                   //If residue remains jump to residue loop

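//Variant of WIDTH_LOOP_16 for blocks with ht <= 4: the same 16-pixel-wide
//column processing, but without the two-row software pipeline, since there
//are too few rows for the pipelined version to pay off.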
WD_16_HT_4_LOOP:
    mov w7, w24                         //Loads wd

    mov x5, x21                         //Loads pu1_avail
    CMP x6,x7                           //col == wd

    LDRB w20, [x5]                      //pu1_avail[0]
    csel w8,w20,w8,EQ
    MOV x20,#-1
    csel x8, x20, x8,NE
    mov v1.b[0], w8                     //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    CMP x6,#16                          //if(col == 16)
    mov v1.b[1], w8                     //au1_mask = vsetq_lane_s8(-1, au1_mask, 1)

    BNE SKIP_AU1_MASK_VAL_WD_16_HT_4
    LDRB w8,[x5,#1]                     //pu1_avail[1]
    mov v1.b[14], w8                    //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
    mov v1.b[15], w8                    //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL_WD_16_HT_4:
    LDRB w11,[x5,#2]                    //pu1_avail[2]
    SUB x20,x0,x1                       //pu1_src - src_strd
    CMP x11,#0
    csel x8, x20, x8,EQ

    csel x8, x3, x8,NE
    LD1 {v5.16b},[x0]                   //pu1_cur_row = vld1q_u8(pu1_src)
    //LD1 {v13.8b},[x0]                 //pu1_cur_row = vld1q_u8(pu1_src)
    //SUB x0, x0,#8
    ADD x8,x8,#2                        //pu1_src - src_strd + 2

    ADD x3,x3,#16
    LD1 {v3.16b},[x8]                   //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    //LD1 {v11.8b},[x8]                 //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    //SUB x8, x8,#8
    ADD x5,sp,#0x4B                     //*au1_src_left_tmp

    mov w4, w25                         //Loads ht
    cmhi v17.16b, v5.16b , v3.16b       //vcgtq_u8(pu1_cur_row, pu1_top_row)
    mov w7, w24                         //Loads wd

    SUB x7,x7,x6                        //(wd - col)
    cmhi v16.16b, v3.16b , v5.16b       //vcltq_u8(pu1_cur_row, pu1_top_row)
    ADD x7,x7,#14                       //14 + (wd - col)

    mov x8, x26                         //Loads *pu1_src
    SUB v17.16b, v16.16b , v17.16b      //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    ADD x7,x8,x7                        //pu1_src[0 * src_strd + 14 + (wd - col)]

AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    LDRH w8,[x7]                        //load the value and increment by src_strd
    SUBS x4,x4,#1                       //decrement the loop count

    STRH w8,[x5],#2                     //store it in the stack pointer
    ADD x7,x7,x1
    BNE AU1_SRC_LEFT_LOOP_WD_16_HT_4

    movi v18.16b, #0
    MOV x7,x12                          //row count, move ht_tmp to x7

PU1_SRC_LOOP_WD_16_HT_4:
    ADD x9,x0,x1                        //*pu1_src + src_strd

    mov x5, x21                         //Loads pu1_avail
    LD1 {v16.16b},[x9]                  //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //LD1 {v17.8b},[x9]                 //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //SUB x9, x9,#8
    LDRB w5,[x5,#3]                     //pu1_avail[3]

    SUB x11,x12,x7                      //ht_tmp - row
    ADD x8,x14,x11,LSL #1               //pu1_src_left_cpy[(ht_tmp - row) * 2]
    ADD x8,x8,#2                        //pu1_src_left_cpy[(ht_tmp - row + 1) * 2]

    CMP x5,#0
    BEQ NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4
    CMP x7,#1
    SUB x20,x9,#2                       //pu1_src[src_strd - 2]
    csel x8, x20, x8,EQ

NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
    LDRH w5,[x8]
    mov v18.h[7], w5                    //vsetq_lane_u16 (one U/V pair)
    EXT v18.16b, v18.16b , v16.16b,#14  //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)

    CMP x7,x12
    BLT SIGN_UP_CHANGE_WD_16_HT_4
    mov x5, x21                         //Loads pu1_avail
    LDRB w5,[x5,#2]                     //pu1_avail[2]
    CMP x5,#0
    BNE SIGN_UP_CHANGE_DONE_WD_16_HT_4

SIGN_UP_CHANGE_WD_16_HT_4:
    LDRB w8,[x0,#14]                    //pu1_src_cpy[14]
    SUB x9,x0,x1

    LDRB w5,[x9,#16]                    //load the value pu1_src_cpy[16 - src_strd]

    LDRB w10,[x0,#15]                   //pu1_src_cpy[15]
    SUB x8,x8,x5                        //pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]

    LDRB w11,[x9,#17]                   //load the value pu1_src_cpy[17 - src_strd]
    CMP x8,#0

    movn x20,#0
    csel x8, x20, x8,LT
    SUB x10,x10,x11                     //pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    MOV x20,#1
    csel x8, x20, x8,GT                 //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])

    CMP x10,#0
    mov v17.b[14], w8                   //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
    movn x20,#0
    csel x10, x20, x10,LT

    MOV x20,#1
    csel x10, x20, x10,GT               //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
    mov v17.b[15], w10                  //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)

SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    LD1 {v20.8b},[x2]                   //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    cmhi v22.16b, v5.16b , v18.16b      //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    cmhi v24.16b, v18.16b , v5.16b      //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    SUB v24.16b, v24.16b , v22.16b      //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    ADD v26.16b, v0.16b , v17.16b       //edge_idx = vaddq_s8(const_2, sign_up)
    ADD v26.16b, v26.16b , v24.16b      //edge_idx = vaddq_s8(edge_idx, sign_down)

    mov v20.d[1],v20.d[0]
    NEG v17.16b, v24.16b                //sign_up = vnegq_s8(sign_down)
    TBL v26.16b, {v20.16b},v26.16b      //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))

    //TBL v27.8b, {v20.16b},v27.8b      //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    EXT v17.16b, v17.16b , v17.16b,#2   //sign_up = vextq_s8(sign_up, sign_up, 2)

    Uxtl v28.8h, v5.8b                  //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    AND v26.16b, v26.16b , v1.16b       //edge_idx = vandq_s8(edge_idx, au1_mask)
    mov v27.d[0],v26.d[1]

    UZP1 v31.8b, v26.8b, v27.8b
    UZP2 v27.8b, v26.8b, v27.8b
    mov v26.8b,v31.8b
    TBL v24.8b, {v6.16b},v26.8b
    TBL v25.8b, {v7.16b},v27.8b
    ZIP1 v31.8b, v24.8b, v25.8b
    ZIP2 v25.8b, v24.8b, v25.8b
    mov v24.8b,v31.8b

    Uxtl2 v30.8h, v5.16b                //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    SADDW v28.8h, v28.8h , v24.8b       //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    SMAX v28.8h, v28.8h , v2.8h         //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    UMIN v28.8h, v28.8h , v4.8h         //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    mov v5.16b, v16.16b                 //pu1_cur_row = pu1_next_row
    SADDW v30.8h, v30.8h , v25.8b       //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    SMAX v30.8h, v30.8h , v2.8h         //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    UMIN v30.8h, v30.8h , v4.8h         //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    xtn v28.8b, v28.8h                  //vmovn_s16(pi2_tmp_cur_row.val[0])
    xtn2 v28.16b, v30.8h                //vmovn_s16(pi2_tmp_cur_row.val[1])

    SUBS x7,x7,#1                       //Decrement the ht_tmp loop count by 1
    ST1 {v28.16b},[x0],x1               //vst1q_u8(pu1_src_cpy, pu1_cur_row)
    BNE PU1_SRC_LOOP_WD_16_HT_4         //If not equal jump to PU1_SRC_LOOP_WD_16_HT_4

    mov w8, w25                         //Loads ht
    ADD x5,sp,#0x4B                     //*au1_src_left_tmp
    mov x11, x27                        //Loads *pu1_src_left

SRC_LEFT_LOOP_WD_16_HT_4:
    LDR w7, [x5],#4                     //au1_src_left_tmp[row]
    SUBS x8,x8,#2
    STR w7, [x11],#4                    //pu1_src_left[row] = au1_src_left_tmp[row]
    BNE SRC_LEFT_LOOP_WD_16_HT_4

    SUBS x6,x6,#16                      //Decrement the wd loop count by 16
    CMP x6,#8                           //Check whether residue remains
    BLT RE_ASSIGNING_LOOP               //Jump to re-assigning loop
    mov w7, w24                         //Loads wd
    mov x0, x28                         //Loads *pu1_src
    SUB x7,x7,x6
    ADD x0,x0,x7
    BGT WD_16_HT_4_LOOP                 //If more 16-wide columns remain, jump to WD_16_HT_4_LOOP
    BEQ WIDTH_RESIDUE                   //If residue remains jump to residue loop

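//Residue path for the last 8-pixel-wide column (wd not a multiple of 16):
//the same edge computation is done on a 16-byte vector, but only the low
//8 bytes (4 U/V pairs) are stored per row.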
WIDTH_RESIDUE:
    mov w7, w24                         //Loads wd

    mov x5, x21                         //Loads pu1_avail
    CMP x6,x7                           //wd_residue == wd

    LDRB w20, [x5]                      //pu1_avail[0]
    csel w8,w20,w8,EQ

    MOV x20,#-1
    csel x8, x20, x8,NE
    LDRB w11,[x5,#1]                    //pu1_avail[1]

    LDRB w9,[x5,#2]                     //pu1_avail[2]
    mov v1.b[0], w8                     //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    CMP x9,#0

    SUB x20,x0,x1                       //pu1_src - src_strd
    csel x10, x20, x10,EQ
    mov v1.b[1], w8                     //au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
    csel x10, x3, x10,NE

    ADD x10,x10,#2                      //pu1_src - src_strd + 2
    mov v1.b[6], w11                    //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 6)
    ADD x5,sp,#0x4B                     //*au1_src_left_tmp

    mov w4, w25                         //Loads ht
    mov v1.b[7], w11                    //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)
    mov w7, w24                         //Loads wd

    mov x8, x26                         //Loads *pu1_src
    LD1 {v3.16b},[x10]                  //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    //LD1 {v11.8b},[x10]                //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    //SUB x10, x10,#8
    SUB x7,x7,#2                        //(wd - 2)

    ADD x7,x8,x7                        //pu1_src[0 * src_strd + (wd - 2)]

AU1_SRC_LEFT_LOOP_RESIDUE:
    LDRH w8,[x7]                        //load the value and increment by src_strd
    ADD x7,x7,x1
    STRH w8,[x5],#2                     //store it in the stack pointer
    SUBS x4,x4,#1                       //decrement the loop count
    BNE AU1_SRC_LEFT_LOOP_RESIDUE

    LD1 {v5.16b},[x0]                   //pu1_cur_row = vld1q_u8(pu1_src)
    //LD1 {v13.8b},[x0]                 //pu1_cur_row = vld1q_u8(pu1_src)
    //SUB x0, x0,#8

    movi v18.16b, #0
    cmhi v17.16b, v5.16b , v3.16b       //vcgtq_u8(pu1_cur_row, pu1_top_row)

    cmhi v16.16b, v3.16b , v5.16b       //vcltq_u8(pu1_cur_row, pu1_top_row)
    SUB v17.16b, v16.16b , v17.16b      //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV x7,x12                          //row count, move ht_tmp to x7

PU1_SRC_LOOP_RESIDUE:
    ADD x9,x0,x1                        //*pu1_src + src_strd

    SUB x11,x12,x7                      //ht_tmp - row
    LD1 {v16.16b},[x9]                  //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //LD1 {v17.8b},[x9]                 //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //SUB x9, x9,#8
    mov x5, x21                         //Loads pu1_avail

    LDRB w5,[x5,#3]                     //pu1_avail[3]
    ADD x8,x14,x11,LSL #1               //pu1_src_left_cpy[(ht_tmp - row) * 2]

    CMP x5,#0
    ADD x8,x8,#2                        //pu1_src_left_cpy[(ht_tmp - row + 1) * 2]

    BEQ NEXT_ROW_POINTER_ASSIGNED_RESIDUE
    CMP x7,#1
    SUB x20,x9,#2                       //pu1_src[src_strd - 2]
    csel x8, x20, x8,EQ

NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
    LDRB w5,[x8]

    LDRB w8,[x8,#1]
    mov v18.b[14], w5                   //vsetq_lane_u8
    CMP x7,x12

    mov v18.b[15], w8                   //vsetq_lane_u8
    EXT v18.16b, v18.16b , v16.16b,#14  //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)

    BLT SIGN_UP_CHANGE_RESIDUE
    mov x5, x21                         //Loads pu1_avail
    LDRB w5,[x5,#2]                     //pu1_avail[2]
    CMP x5,#0
    BNE SIGN_UP_CHANGE_DONE_RESIDUE

SIGN_UP_CHANGE_RESIDUE:
    LDRB w8,[x0,#14]                    //pu1_src_cpy[14]
    SUB x9,x0,x1

    LDRB w5,[x9,#16]                    //load the value pu1_src_cpy[16 - src_strd]

    LDRB w10,[x0,#15]                   //pu1_src_cpy[15]
    SUB x8,x8,x5                        //pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]

    LDRB w11,[x9,#17]                   //load the value pu1_src_cpy[17 - src_strd]
    CMP x8,#0

    movn x20,#0
    csel x8, x20, x8,LT
    SUB x10,x10,x11                     //pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    MOV x20,#1
    csel x8, x20, x8,GT                 //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])

    CMP x10,#0
    mov v17.b[14], w8                   //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
    movn x20,#0
    csel x10, x20, x10,LT

    MOV x20,#1
    csel x10, x20, x10,GT               //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
    mov v17.b[15], w10                  //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)

SIGN_UP_CHANGE_DONE_RESIDUE:
    LD1 {v20.8b},[x2]                   //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    cmhi v22.16b, v5.16b , v18.16b      //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    cmhi v24.16b, v18.16b , v5.16b      //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    SUB v24.16b, v24.16b , v22.16b      //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    ADD v26.16b, v0.16b , v17.16b       //edge_idx = vaddq_s8(const_2, sign_up)
    ADD v26.16b, v26.16b , v24.16b      //edge_idx = vaddq_s8(edge_idx, sign_down)

    mov v20.d[1],v20.d[0]
    NEG v17.16b, v24.16b                //sign_up = vnegq_s8(sign_down)
    TBL v26.16b, {v20.16b},v26.16b      //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))

    //TBL v27.8b, {v20.16b},v27.8b      //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    EXT v17.16b, v17.16b , v17.16b,#2   //sign_up = vextq_s8(sign_up, sign_up, 2)

    Uxtl v28.8h, v5.8b                  //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    AND v26.16b, v26.16b , v1.16b       //edge_idx = vandq_s8(edge_idx, au1_mask)
    mov v27.d[0],v26.d[1]

    UZP1 v31.8b, v26.8b, v27.8b
    UZP2 v27.8b, v26.8b, v27.8b
    mov v26.8b,v31.8b
    TBL v24.8b, {v6.16b},v26.8b
    TBL v25.8b, {v7.16b},v27.8b
    ZIP1 v31.8b, v24.8b, v25.8b
    ZIP2 v25.8b, v24.8b, v25.8b
    mov v24.8b,v31.8b

    mov v5.16b, v16.16b                 //pu1_cur_row = pu1_next_row
    SADDW v28.8h, v28.8h , v24.8b       //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    SMAX v28.8h, v28.8h , v2.8h         //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    UMIN v28.8h, v28.8h , v4.8h         //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    SUBS x7,x7,#1                       //Decrement the ht_tmp loop count by 1
    xtn v30.8b, v28.8h                  //vmovn_s16(pi2_tmp_cur_row.val[0])

    ST1 {v30.8b},[x0],x1                //vst1q_u8(pu1_src_cpy, pu1_cur_row)

    BNE PU1_SRC_LOOP_RESIDUE            //If not equal jump to PU1_SRC_LOOP_RESIDUE

    mov w8, w25                         //Loads ht
    ADD x5,sp,#0x4B                     //*au1_src_left_tmp

    mov x11, x27                        //Loads *pu1_src_left

SRC_LEFT_LOOP_RESIDUE:
    LDR w7, [x5],#4                     //au1_src_left_tmp[row]
    SUBS x8,x8,#2
    STR w7, [x11],#4                    //pu1_src_left[row] = au1_src_left_tmp[row]
    BNE SRC_LEFT_LOOP_RESIDUE


RE_ASSIGNING_LOOP:
    mov w7, w24                         //Loads wd
    mov w8, w25                         //Loads ht

    mov x0, x26                         //Loads *pu1_src
    SUB x10,x7,#2                       //wd - 2

    LDRH w9,[sp,#6]
    SUB x8,x8,#1                        //ht - 1

    STRH w9,[x0,x10]                    //pu1_src_org[wd - 2] = u1_pos_0_0_tmp_u, pu1_src_org[wd - 1] = u1_pos_0_0_tmp_v
    madd x6, x8, x1, x0                 //pu1_src[(ht - 1) * src_strd]

    mov x4, x15                         //Loads pu1_src_top_left

    LDRH w9,[sp,#8]
    ADD x12,sp,#10

    STRH w9,[x6]                        //pu1_src_org[(ht - 1) * src_strd] = u1_pos_wd_ht_tmp_u, pu1_src_org[(ht - 1) * src_strd + 1] = u1_pos_wd_ht_tmp_v

    LDRH w10,[sp]                       //load u1_src_top_left_tmp from stack pointer
    STRH w10,[x4]                       //*pu1_src_top_left = u1_src_top_left_tmp
    mov x3, x22                         //Loads pu1_src_top

SRC_TOP_LOOP:
    LD1 {v0.8b},[x12],#8                //pu1_src_top[col] = au1_src_top_tmp[col]
    SUBS x7,x7,#8                       //Decrement the width
    ST1 {v0.8b},[x3],#8                 //pu1_src_top[col] = au1_src_top_tmp[col]
    BNE SRC_TOP_LOOP

END_LOOPS:
    ADD sp,sp,#0xE0
    // LDMFD sp!,{x4-x12,x15}           //Reload the registers from SP
    ldp x27, x28,[sp],#16
    ldp x25, x26,[sp],#16
    ldp x23, x24,[sp],#16
    ldp x21, x22,[sp],#16
    ldp x19, x20,[sp],#16

    ret