///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_sao_edge_offset_class2_chroma.s
//*
//* @brief
//*  Contains function definitions for SAO (sample adaptive offset) edge
//*  offset, class 2, for interleaved chroma (Cb/Cr) planes.
25//* Functions are coded using NEON intrinsics and can be compiled using@ ARM 26//* RVCT 27//* 28//* ,:author 29//* Parthiban V 30//* 31//* ,:par List of Functions: 32//* 33//* 34//* ,:remarks 35//* None 36//* 37//******************************************************************************* 38//*/ 39//void ihevc_sao_edge_offset_class2_chroma(UWORD8 *pu1_src, 40// WORD32 src_strd, 41// UWORD8 *pu1_src_left, 42// UWORD8 *pu1_src_top, 43// UWORD8 *pu1_src_top_left, 44// UWORD8 *pu1_src_top_right, 45// UWORD8 *pu1_src_bot_left, 46// UWORD8 *pu1_avail, 47// WORD8 *pi1_sao_offset_u, 48// WORD8 *pi1_sao_offset_v, 49// WORD32 wd, 50// WORD32 ht) 51//**************Variables Vs Registers***************************************** 52//x0 => *pu1_src 53//x1 => src_strd 54//x2 => *pu1_src_left 55//x3 => *pu1_src_top 56//x4 => *pu1_src_top_left 57//x5 => *pu1_avail 58//x6 => *pi1_sao_offset_u 59//x9 => *pi1_sao_offset_v 60//x7 => wd 61//x8=> ht 62 63.text 64.p2align 2 65.include "ihevc_neon_macros.s" 66 67.globl gi1_table_edge_idx 68.globl ihevc_sao_edge_offset_class2_chroma_av8 69 70ihevc_sao_edge_offset_class2_chroma_av8: 71 72 73 // STMFD sp!,{x4-x12,x14} //stack stores the values of the arguments 74 75 ldr x8,[sp,#0] 76 ldr x9,[sp,#8] 77 ldr w10,[sp,#16] 78 ldr w11,[sp,#24] 79 80 81 82 // STMFD sp!, {x4-x12, x14} //stack stores the values of the arguments 83 stp x19, x20,[sp,#-16]! 84 stp x21, x22,[sp,#-16]! 85 stp x23, x24,[sp,#-16]! 86 stp x25, x26,[sp,#-16]! 87 stp x27, x28,[sp,#-16]! 
88 89 mov x15,x4 // *pu1_src_top_left 0x28 90 //mov x16,x5 // *pu1_src_top_right 0x2c 91 mov x17,x6 // *pu1_src_bot_left 0x30 92 mov x21,x7 // *pu1_avail 0x34 93 mov x22,x8 // *pi1_sao_offset_u 0x38 94 mov x23,x9 // *pi1_sao_offset_v 0x3c 95 mov x24,x10 // wd 0x40 96 mov x25,x11 // ht 0x44 97 98 99 mov w7, w24 //Loads wd 100 mov w8, w25 //Loads ht 101 SUB x9,x7,#2 //wd - 2 102 103 mov x4, x15 //Loads pu1_src_top_left 104 LDRH w10,[x3,x9] //pu1_src_top[wd - 2] 105 106 mov x26, x0 //Store pu1_src in sp 107 MOV x9,x7 //Move width to x9 for loop count 108 109 mov x17, x2 //Store pu1_src_bot_left in sp 110 mov x5, x21 //Loads pu1_avail 111 mov x6, x22 //Loads pi1_sao_offset_u 112 113 mov x22, x3 //Store pu1_src_top in sp 114 SUB sp,sp,#0xE0 //Decrement the stack pointer to store some temp arr values 115 116 STRH w10,[sp] //u1_src_top_left_tmp = pu1_src_top[wd - 2] 117 SUB x10,x8,#1 //ht-1 118 madd x11, x10, x1, x0 //pu1_src[(ht - 1) * src_strd + col] 119 ADD x12,sp,#10 //temp array 120 121AU1_SRC_TOP_LOOP: 122 LD1 {v0.8b},[x11],#8 //pu1_src[(ht - 1) * src_strd + col] 123 SUBS x9,x9,#8 //Decrement the loop count by 8 124 ST1 {v0.8b},[x12],#8 //au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col] 125 BNE AU1_SRC_TOP_LOOP 126 127PU1_AVAIL_4_LOOP_U: 128 LDRB w9,[x5,#4] //pu1_avail[4] 129 CMP x9,#0 130 LDRB w9,[x0] //u1_pos_0_0_tmp_u = pu1_src[0] 131 LDRB w10,[x0,#1] //u1_pos_0_0_tmp_v = pu1_src[1] 132 BEQ PU1_AVAIL_7_LOOP_U 133 134 LDRB w11,[x4] //pu1_src_top_left[0] 135 ADD x14,x0,x1 //pu1_src + src_strd 136 137 SUB x12,x9,x11 //pu1_src[0] - pu1_src_top_left[0] 138 139 LDRB w14,[x14,#2] //pu1_src[2 + src_strd] 140 CMP x12,#0 141 142 movn x20,#0 143 csel x12, x20, x12,LT 144 SUB x11,x9,x14 //pu1_src[0] - pu1_src[2 + src_strd] 145 146 MOV x20,#1 147 csel x12, x20, x12,GT //SIGN(pu1_src[0] - pu1_src_top_left[0]) 148 149 CMP x11,#0 150 movn x20,#0 151 csel x11, x20, x11,LT 152 ADRP x14, :got:gi1_table_edge_idx //table pointer 153 LDR x14, [x14, 
#:got_lo12:gi1_table_edge_idx] 154 MOV x20,#1 155 csel x11, x20, x11,GT //SIGN(pu1_src[0] - pu1_src[2 + src_strd]) 156 157 ADD x11,x12,x11 //SIGN(pu1_src[0] - pu1_src_top_left[0]) + SIGN(pu1_src[0] - pu1_src[2 + src_strd]) 158 ADD x11,x11,#2 //edge_idx 159 160 LDRSB x12,[x14,x11] //edge_idx = gi1_table_edge_idx[edge_idx] 161 CMP x12,#0 //0 != edge_idx 162 BEQ PU1_AVAIL_4_LOOP_V 163 LDRSB x11,[x6,x12] //pi1_sao_offset_u[edge_idx] 164 ADD x9,x9,x11 //pu1_src[0] + pi1_sao_offset_u[edge_idx] 165 mov x20,#255 166 cmp x9,x20 167 csel x9, x20, x9, ge //u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1) 168 169PU1_AVAIL_4_LOOP_V: 170 171 LDRB w11,[x4,#1] //pu1_src_top_left[1] 172 ADD x14,x0,x1 //pu1_src + src_strd 173 174 SUB x12,x10,x11 //pu1_src[1] - pu1_src_top_left[1] 175 LDRB w14,[x14,#3] //pu1_src[3 + src_strd] 176 177 CMP x12,#0 178 movn x20,#0 179 csel x12, x20, x12,LT 180 SUB x11,x10,x14 //pu1_src[1] - pu1_src[3 + src_strd] 181 MOV x20,#1 182 csel x12, x20, x12,GT //SIGN(pu1_src[0] - pu1_src_top_left[0]) 183 184 CMP x11,#0 185 movn x20,#0 186 csel x11, x20, x11,LT 187 ADRP x14, :got:gi1_table_edge_idx //table pointer 188 LDR x14, [x14, #:got_lo12:gi1_table_edge_idx] 189 MOV x20,#1 190 csel x11, x20, x11,GT //SIGN(pu1_src[0] - pu1_src[3 + src_strd]) 191 192 ADD x11,x12,x11 //SIGN(pu1_src[0] - pu1_src_top_left[0]) + SIGN(pu1_src[0] - pu1_src[3 + src_strd]) 193 ADD x11,x11,#2 //edge_idx 194 195 LDRSB x12,[x14,x11] //edge_idx = gi1_table_edge_idx[edge_idx] 196 CMP x12,#0 //0 != edge_idx 197 BEQ PU1_AVAIL_7_LOOP_U 198 mov x11, x23 //Loads pi1_sao_offset_v 199 LDRSB x11,[x11,x12] //pi1_sao_offset_v[edge_idx] 200 ADD x10,x10,x11 //pu1_src[0] + pi1_sao_offset_v[edge_idx] 201 mov x20,#255 202 cmp x10,x20 203 csel x10, x20, x10, ge //u1_pos_0_0_tmp_v = CLIP3(pu1_src[0] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1) 204 205PU1_AVAIL_7_LOOP_U: 206 STRB w10,[sp,#7] 207 STRB w9,[sp,#6] 208 209 LDRB w10,[x5,#7] //pu1_avail[7] 210 CMP 
x10,#0 211 SUB x10,x7,#2 //wd - 2 212 SUB x11,x8,#1 //ht - 1 213 madd x12, x11, x1, x10 //wd - 2 + (ht - 1) * src_strd 214 ADD x12,x12,x0 //pu1_src[wd - 2 + (ht - 1) * src_strd] 215 LDRB w10,[x12] //u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd] 216 LDRB w9,[x12,#1] //u1_pos_wd_ht_tmp_v = pu1_src[wd - 2 + (ht - 1) * src_strd] 217 BEQ PU1_AVAIL_3_LOOP 218 219 SUB x11,x12,x1 //pu1_src[(wd - 2 + (ht - 1) * src_strd) - src_strd] 220 SUB x11,x11,#2 //pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd] 221 LDRB w11,[x11] //Load pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd] 222 SUB x11,x10,x11 //pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd- 2 - src_strd] 223 CMP x11,#0 224 movn x20,#0 225 csel x11, x20, x11,LT 226 MOV x20,#1 227 csel x11, x20, x11,GT //SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd- 2 - src_strd]) 228 229 ADD x14,x12,x1 //pu1_src[(wd - 2 + (ht - 1) * src_strd) + src_strd] 230 ADD x14,x14,#2 //pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd] 231 LDRB w14,[x14] //Load pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd] 232 SUB x14,x10,x14 //pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd] 233 CMP x14,#0 234 movn x20,#0 235 csel x14, x20, x14,LT 236 MOV x20,#1 237 csel x14, x20, x14,GT //SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]) 238 239 ADD x11,x11,x14 //Add 2 sign value 240 ADD x11,x11,#2 //edge_idx 241 ADRP x14, :got:gi1_table_edge_idx //table pointer 242 LDR x14, [x14, #:got_lo12:gi1_table_edge_idx] 243 244 LDRSB x14,[x14,x11] //edge_idx = gi1_table_edge_idx[edge_idx] 245 CMP x14,#0 246 BEQ PU1_AVAIL_7_LOOP_V 247 LDRSB x11,[x6,x14] //pi1_sao_offset_u[edge_idx] 248 ADD x10,x10,x11 //pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx] 249 mov x20,#255 250 cmp x10,x20 251 csel x10, x20, x10, ge //u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * 
src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1) 252 253PU1_AVAIL_7_LOOP_V: 254 ADD x12,x12,#1 255 SUB x11,x12,x1 //pu1_src[(wd - 1 + (ht - 1) * src_strd) - src_strd] 256 SUB x11,x11,#2 //pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd] 257 LDRB w11,[x11] //Load pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd] 258 SUB x11,x9,x11 //pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd- 2 - src_strd] 259 CMP x11,#0 260 movn x20,#0 261 csel x11, x20, x11,LT 262 MOV x20,#1 263 csel x11, x20, x11,GT //SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]) 264 265 ADD x14,x12,x1 //pu1_src[(wd - 1 + (ht - 1) * src_strd) + src_strd] 266 ADD x14,x14,#2 //pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd] 267 LDRB w14,[x14] //Load pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd] 268 SUB x14,x9,x14 //pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd] 269 CMP x14,#0 270 movn x20,#0 271 csel x14, x20, x14,LT 272 MOV x20,#1 273 csel x14, x20, x14,GT //SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]) 274 275 ADD x11,x11,x14 //Add 2 sign value 276 ADD x11,x11,#2 //edge_idx 277 ADRP x14, :got:gi1_table_edge_idx //table pointer 278 LDR x14, [x14, #:got_lo12:gi1_table_edge_idx] 279 280 LDRSB x12,[x14,x11] //edge_idx = gi1_table_edge_idx[edge_idx] 281 CMP x12,#0 282 BEQ PU1_AVAIL_3_LOOP 283 mov x14, x23 //Loads pi1_sao_offset_v 284 LDRSB x11,[x14,x12] //pi1_sao_offset_v[edge_idx] 285 ADD x9,x9,x11 //pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx] 286 mov x20,#255 287 cmp x9,x20 288 csel x9, x20, x9, ge //u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1) 289 290PU1_AVAIL_3_LOOP: 291 STRB w10,[sp,#8] 292 movi v0.16b, #2 //const_2 = vdupq_n_s8(2) 293 STRB w9,[sp,#9] 294 295 MOV x12,x8 //Move ht 296 movi v2.8h, #0 
//const_min_clip = vdupq_n_s16(0) 297 MOV x14,x2 //Move pu1_src_left to pu1_src_left_cpy 298 299 LDRB w11,[x5,#3] //pu1_avail[3] 300 movi v4.8h, #255 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1) 301 CMP x11,#0 302 303 SUB x20,x12,#1 //ht_tmp-- 304 csel x12, x20, x12,EQ 305 LDRB w5,[x5,#2] //pu1_avail[2] 306 307 CMP x5,#0 308 309 ADD x20,x0,x1 //pu1_src += src_strd 310 csel x0, x20, x0,EQ 311 LD1 {v6.8b},[x6] //offset_tbl_u = vld1_s8(pi1_sao_offset_u) 312 SUB x20,x12,#1 //ht_tmp-- 313 csel x12, x20, x12,EQ 314 315 mov x6, x23 //Loads pi1_sao_offset_v 316 ADD x20,x14,#2 //pu1_src_left_cpy += 2 317 csel x14, x20, x14,EQ 318 319 mov x27, x0 //Store pu1_src in sp 320 LD1 {v7.8b},[x6] //offset_tbl_v = vld1_s8(pi1_sao_offset_v) 321 ADRP x2, :got:gi1_table_edge_idx //table pointer 322 LDR x2, [x2, #:got_lo12:gi1_table_edge_idx] 323 324 MOV x6,x7 //move wd to x6 loop_count 325 movi v1.16b, #0xFF //au1_mask = vdupq_n_s8(-1) 326 CMP x7,#16 //Compare wd with 16 327 328 BLT WIDTH_RESIDUE //If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case 329 CMP x8,#4 //Compare ht with 4 330 BLE WD_16_HT_4_LOOP //If jump to WD_16_HT_4_LOOP 331 332WIDTH_LOOP_16: 333 mov x5, x21 //Loads pu1_avail 334 mov w7, w24 //Loads wd 335 CMP x6,x7 //col == wd 336 LDRb w20, [x5] //pu1_avail[0] 337 csel w8,w20,w8,EQ 338 339 MOV x20,#-1 340 csel x8, x20, x8,NE 341 mov v1.b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) 342 343 CMP x6,#16 //if(col == 16) 344 mov v1.b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) 345 346 BNE SKIP_AU1_MASK_VAL 347 LDRB w8,[x5,#1] //pu1_avail[1] 348 mov v1.b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) 349 mov v1.b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) 350 351SKIP_AU1_MASK_VAL: 352 LDRB w9,[x5,#2] //pu1_avail[2] 353 LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) 354 //LD1 {v13.8b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) 355 //SUB x0, x0,#8 356 CMP x9,#0 357 358 mov w4, w25 //Loads ht 359 SUB x20,x0,x1 
//pu1_src - src_strd 360 csel x8, x20, x8,EQ 361 362 mov w7, w24 //Loads wd 363 csel x8, x3, x8,NE //pu1_src_top_cpy 364 365 SUB x8,x8,#2 //pu1_src - src_strd - 2 366 ADD x3,x3,#16 367 368 ADD x5,sp,#0x4B //*au1_src_left_tmp 369 LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2) 370 //LD1 {v11.8b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2) 371 //SUB x8, x8,#8 372 SUB x7,x7,x6 //(wd - col) 373 374 ADD x7,x7,#14 //15 + (wd - col) 375 cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) 376 mov x8, x26 //Loads *pu1_src 377 378 ADD x7,x8,x7 //pu1_src[0 * src_strd + 15 + (wd - col)] 379 cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row) 380 381AU1_SRC_LEFT_LOOP: 382 LDRH w8,[x7] //load the value and increment by src_strd 383 SUBS x4,x4,#1 //decrement the loop count 384 385 STRH w8,[x5],#2 //store it in the stack pointer 386 ADD x7,x7,x1 387 388 BNE AU1_SRC_LEFT_LOOP 389 390 ADD x8,x0,x1 //I *pu1_src + src_strd 391 SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) 392 MOV x7,x12 //row count, move ht_tmp to x7 393 394 LD1 {v16.16b},[x8] //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) 395 //LD1 {v17.8b},[x8] //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) 396 //SUB x8, x8,#8 397 398 ADD x8,x8,#16 //I 399 movi v18.16b, #0 400 LDRH w5,[x8] //I pu1_src_cpy[src_strd + 16] 401 402 mov x10, x21 //I Loads pu1_avail 403 mov v18.h[0], w5 //I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0) 404 LDRB w10,[x10,#2] //I pu1_avail[2] 405 406 CMP x10,#0 //I 407 EXT v18.16b, v16.16b , v18.16b,#2 //I pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2) 408 BNE SIGN_UP_CHANGE_DONE //I 409 410 LDRB w11,[x0] //I pu1_src_cpy[0] 411 SUB x4,x12,x7 //I ht_tmp - row 412 413 LDRB w10,[x0,#1] //I pu1_src_cpy[0] 414 LSL x4,x4,#1 //I (ht_tmp - row) * 2 415 416 ADD x9,x14,x4 //I 
pu1_src_left_cpy[(ht_tmp - row) * 2] 417 sub x13,x9,#2 418 LDRB w5,[x13] //I load the value 419 420 SUB x8,x11,x5 //I pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2] 421 sub x13,x9,#1 422 LDRB w5,[x13] //I load the value 423 424 CMP x8,#0 //I 425 SUB x4,x10,x5 //I pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1] 426 427 movn x20,#0 428 csel x8, x20, x8,LT //I 429 MOV x20,#1 430 csel x8, x20, x8,GT //I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]) 431 432 CMP x4,#0 //I 433 mov v17.b[0], w8 //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0) 434 movn x20,#0 435 csel x4, x20, x4,LT //I 436 437 MOV x20,#1 438 csel x4, x20, x4,GT //I SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]) 439 mov v17.b[1], w4 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1) 440 441SIGN_UP_CHANGE_DONE: 442 LD1 {v30.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) 443 cmhi v20.16b, v5.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) 444 445 cmhi v22.16b, v18.16b , v5.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp) 446 SUB v22.16b, v22.16b , v20.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) 447 448 ADD v18.16b, v0.16b , v17.16b //I edge_idx = vaddq_s8(const_2, sign_up) 449 ADD v18.16b, v18.16b , v22.16b //I edge_idx = vaddq_s8(edge_idx, sign_down) 450 451 TBL v18.16b, {v30.16b},v18.16b //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) 452 NEG v17.16b, v22.16b //I sign_up = vnegq_s8(sign_down) 453 454 //TBL v19.8b, {v30.16b},v19.8b //I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) 455 EXT v17.16b, v17.16b , v17.16b,#14 //I sign_up = vextq_s8(sign_up, sign_up, 14) 456 457 Uxtl v20.8h, v5.8b //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) 458 AND v22.16b, v18.16b , v1.16b //I edge_idx = vandq_s8(edge_idx, au1_mask) 459 mov v23.d[0],v22.d[1] 460 461 
Uxtl2 v18.8h, v5.16b //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) 462 UZP1 v31.8b, v22.8b, v23.8b 463 UZP2 v23.8b, v22.8b, v23.8b //I 464 mov v22.8b,v31.8b 465 466 TBL v22.8b, {v6.16b},v22.8b //I 467 TBL v23.8b, {v7.16b},v23.8b //I 468 ZIP1 v31.8b, v22.8b, v23.8b 469 ZIP2 v23.8b, v22.8b, v23.8b //I 470 mov v22.8b,v31.8b 471 472 mov v5.16b, v16.16b //I pu1_cur_row = pu1_next_row 473 SADDW v20.8h, v20.8h , v22.8b //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) 474 475 SMAX v20.8h, v20.8h , v2.8h //I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) 476 UMIN v20.8h, v20.8h , v4.8h //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) 477 478 SADDW v18.8h, v18.8h , v23.8b //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) 479 SMAX v18.8h, v18.8h , v2.8h //I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) 480 481 UMIN v18.8h, v18.8h , v4.8h //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) 482 SUB x7,x7,#1 //I Decrement the ht_tmp loop count by 1 483 484 485PU1_SRC_LOOP: 486 ADD x8,x0,x1,LSL #1 //II *pu1_src + src_strd 487 xtn v20.8b, v20.8h //I vmovn_s16(pi2_tmp_cur_row.val[0]) 488 ADD x11,x8,x1 //III *pu1_src + src_strd 489 490 LD1 {v16.16b},[x8] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) 491 //LD1 {v17.8b},[x8] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) 492 //SUB x8, x8,#8 493 LD1 {v30.16b},[x11] //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) 494 //LD1 {v31.8b},[x11] //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) 495 //SUB x11, x11,#8 496 497 ADD x8,x8,#16 //II 498 xtn2 v20.16b, v18.8h //I vmovn_s16(pi2_tmp_cur_row.val[1]) 499 LDRH w5,[x8] //II pu1_src_cpy[src_strd + 16] 500 501 ADD x11,x11,#16 //III 502 mov v28.h[0], w5 //II pu1_next_row_tmp = 
vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0) 503 LDRH w4,[x11] //III pu1_src_cpy[src_strd + 16] 504 505 LDRB w8,[x0,x1] //II pu1_src_cpy[0] 506 EXT v28.16b, v16.16b , v28.16b,#2 //II pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2) 507 SUB x5,x12,x7 //II ht_tmp - row 508 509 LSL x5,x5,#1 //II (ht_tmp - row) * 2 510 mov v18.h[0], w4 //III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0) 511 ADD x9,x14,x5 //II pu1_src_left_cpy[(ht_tmp - row) * 2] 512 513 sub x13,x9,#2 514 LDRB w11,[x13] //II load the value 515 ST1 { v20.16b},[x0],x1 //I vst1q_u8(pu1_src_cpy, pu1_cur_row) 516 SUB x8,x8,x11 //II pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2] 517 518 CMP x8,#0 //II 519 EXT v18.16b, v30.16b , v18.16b,#2 //III pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2) 520 LDRB w11,[x0,#1] //II pu1_src_cpy[0] 521 522 movn x20,#0 523 csel x8, x20, x8,LT //II 524 cmhi v22.16b, v5.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) 525 MOV x20,#1 526 csel x8, x20, x8,GT //II SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]) 527 528 sub x13,x9,#1 529 LDRB w5,[x13] //II load the value 530 mov v17.b[0], w8 //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0) 531 SUB x7,x7,#1 //II Decrement the ht_tmp loop count by 1 532 533 SUB x11,x11,x5 //II pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1] 534 cmhi v24.16b, v28.16b , v5.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp) 535 CMP x11,#0 //II 536 537 movn x20,#0 538 csel x11, x20, x11,LT //II 539 SUB v24.16b, v24.16b , v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) 540 MOV x20,#1 541 csel x11, x20, x11,GT //II SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]) 542 543 LDRB w4,[x0,x1] //III pu1_src_cpy[0] 544 LD1 {v22.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) 545 SUB x5,x12,x7 //III ht_tmp - row 546 547 ADD 
x10,x0,x1 548 mov v17.b[1], w11 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1) 549 LSL x5,x5,#1 //III (ht_tmp - row) * 2 550 551 ADD x9,x14,x5 //III pu1_src_left_cpy[(ht_tmp - row) * 2] 552 ADD v26.16b, v0.16b , v17.16b //II edge_idx = vaddq_s8(const_2, sign_up) 553 LDRB w10,[x10,#1] //III pu1_src_cpy[0] 554 555 sub x13,x9,#2 556 LDRB w5,[x13] //III load the value 557 ADD v26.16b, v26.16b , v24.16b //II edge_idx = vaddq_s8(edge_idx, sign_down) 558 SUB x4,x4,x5 //III pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2] 559 560 mov v22.d[1],v22.d[0] 561 CMP x4,#0 //III 562 sub x13,x9,#1 563 LDRB w9,[x13] //III load the value 564 TBL v26.16b, {v22.16b},v26.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) 565 NEG v17.16b, v24.16b //II sign_up = vnegq_s8(sign_down) 566 567 movn x20,#0 568 csel x4, x20, x4,LT //III 569 SUB x10,x10,x9 //III pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1] 570 //TBL v27.8b, {v22.16b},v27.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) 571 EXT v17.16b, v17.16b , v17.16b,#14 //II sign_up = vextq_s8(sign_up, sign_up, 14) 572 573 MOV x20,#1 574 csel x4, x20, x4,GT //III SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]) 575 AND v26.16b, v26.16b , v1.16b //II edge_idx = vandq_s8(edge_idx, au1_mask) 576 CMP x10,#0 //III 577 578 mov v27.d[0],v26.d[1] 579 UZP1 v31.8b, v26.8b, v27.8b 580 UZP2 v27.8b, v26.8b, v27.8b //II 581 mov v26.8b,v31.8b 582 mov v17.b[0], w4 //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0) 583 584 movn x20,#0 585 csel x10, x20, x10,LT //III 586 MOV x20,#1 587 csel x10, x20, x10,GT //III SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]) 588 TBL v24.8b, {v6.16b},v26.8b //II 589 cmhi v20.16b, v16.16b , v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) 590 591 cmhi v22.16b, v18.16b , v16.16b //III vcltq_u8(pu1_cur_row, pu1_next_row_tmp) 592 
TBL v25.8b, {v7.16b},v27.8b //II 593 SUB v22.16b, v22.16b , v20.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) 594 595 mov v17.b[1], w10 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1) 596 ZIP1 v31.8b, v24.8b, v25.8b 597 ZIP2 v25.8b, v24.8b, v25.8b //II 598 mov v24.8b,v31.8b 599 600 Uxtl v28.8h, v5.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) 601 ADD v18.16b, v0.16b , v17.16b //III edge_idx = vaddq_s8(const_2, sign_up) 602 603 LD1 {v20.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) 604 SADDW v28.8h, v28.8h , v24.8b //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) 605 606 ADD v18.16b, v18.16b , v22.16b //III edge_idx = vaddq_s8(edge_idx, sign_down) 607 SMAX v28.8h, v28.8h , v2.8h //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) 608 609 UMIN v28.8h, v28.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) 610 TBL v18.16b, {v20.16b},v18.16b //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) 611 NEG v17.16b, v22.16b //III sign_up = vnegq_s8(sign_down) 612 613 //TBL v19.8b, {v20.16b},v19.8b //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) 614 EXT v17.16b, v17.16b , v17.16b,#14 //III sign_up = vextq_s8(sign_up, sign_up, 14) 615 616 Uxtl2 v26.8h, v5.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) 617 AND v18.16b, v18.16b , v1.16b //III edge_idx = vandq_s8(edge_idx, au1_mask) 618 619 mov v19.d[0],v18.d[1] 620 UZP1 v31.8b, v18.8b, v19.8b 621 UZP2 v19.8b, v18.8b, v19.8b //III 622 mov v18.8b,v31.8b 623 TBL v22.8b, {v6.16b},v18.8b //III 624 SADDW v26.8h, v26.8h , v25.8b //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) 625 626 mov v5.16b, v30.16b //III pu1_cur_row = pu1_next_row 627 TBL v23.8b, {v7.16b},v19.8b //III 628 SMAX v26.8h, 
v26.8h , v2.8h //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) 629 630 Uxtl v20.8h, v16.8b //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) 631 UMIN v26.8h, v26.8h , v4.8h //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) 632 633 ZIP1 v31.8b, v22.8b, v23.8b 634 ZIP2 v23.8b, v22.8b, v23.8b //III 635 mov v22.8b,v31.8b 636 xtn v28.8b, v28.8h //II vmovn_s16(pi2_tmp_cur_row.val[0]) 637 638 xtn2 v28.16b, v26.8h //II vmovn_s16(pi2_tmp_cur_row.val[1]) 639 SADDW v20.8h, v20.8h , v22.8b //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) 640 641 Uxtl2 v18.8h, v16.16b //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) 642 SMAX v20.8h, v20.8h , v2.8h //III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) 643 644 UMIN v20.8h, v20.8h , v4.8h //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) 645 SADDW v18.8h, v18.8h , v23.8b //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) 646 647 SUB x7,x7,#1 //III Decrement the ht_tmp loop count by 1 648 SMAX v18.8h, v18.8h , v2.8h //III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) 649 CMP x7,#1 650 651 ST1 { v28.16b},[x0],x1 //II vst1q_u8(pu1_src_cpy, pu1_cur_row) 652 UMIN v18.8h, v18.8h , v4.8h //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) 653 654 BGT PU1_SRC_LOOP //If not equal jump to PU1_SRC_LOOP 655 BLT INNER_LOOP_DONE 656 657 ADD x8,x0,x1,LSL #1 //*pu1_src + src_strd 658 xtn v20.8b, v20.8h //III vmovn_s16(pi2_tmp_cur_row.val[0]) 659 660 LDRB w11,[x0,x1] //pu1_src_cpy[0] 661 LD1 {v16.16b},[x8] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) 662 //LD1 {v17.8b},[x8] //pu1_next_row = vld1q_u8(pu1_src_cpy 
+ src_strd)
    //LD1 {v17.8b},[x8] - dropped 64-bit variant of the next-row load
    //SUB x8, x8,#8
    SUB x4,x12,x7                       //ht_tmp - row
    ADD x8,x8,#16
    xtn2 v20.16b, v18.8h                //III vmovn_s16(pi2_tmp_cur_row.val[1])
    LDRH w5,[x8]                        //pu1_src_cpy[src_strd + 16] (U+V pair past the 16-byte row)
    LSL x4,x4,#1                        //(ht_tmp - row) * 2 (chroma: 2 bytes per column)
    mov v18.h[0], w5                    //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    ADD x9,x14,x4                       //pu1_src_left_cpy[(ht_tmp - row) * 2]
    sub x13,x9,#2
    LDRB w5,[x13]                       //load pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    EXT v18.16b, v16.16b , v18.16b,#2   //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
    SUB x8,x11,x5                       //pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    CMP x8,#0
    ST1 { v20.16b},[x0],x1              //III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    movn x20,#0                         //x20 = -1
    csel x8, x20, x8,LT
    MOV x20,#1
    csel x8, x20, x8,GT                 //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    LD1 {v30.8b},[x2]                   //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    LDRB w11,[x0,#1]                    //pu1_src_cpy[1] (V component of first pixel pair)
    mov v17.b[0], w8                    //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
    sub x13,x9,#1
    LDRB w5,[x13]                       //load pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    SUB x4,x11,x5                       //pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    cmhi v22.16b, v5.16b , v18.16b      //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    CMP x4,#0
    movn x20,#0
    csel x4, x20, x4,LT
    cmhi v24.16b, v18.16b , v5.16b      //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    MOV x20,#1
    csel x4, x20, x4,GT                 //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])
    mov v17.b[1], w4                    //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
    SUB v24.16b, v24.16b , v22.16b      //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    ADD v26.16b, v0.16b , v17.16b       //edge_idx = vaddq_s8(const_2, sign_up)
    ADD v26.16b, v26.16b , v24.16b      //edge_idx = vaddq_s8(edge_idx, sign_down)
    mov v30.d[1],v30.d[0]               //duplicate 8-byte table so TBL on 16b lanes sees valid entries
    TBL v26.16b, {v30.16b},v26.16b      //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    //TBL v27.8b, {v30.16b},v27.8b      //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    Uxtl v20.8h, v5.8b                  //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    AND v26.16b, v26.16b , v1.16b       //edge_idx = vandq_s8(edge_idx, au1_mask)
    mov v27.d[0],v26.d[1]
    Uxtl2 v18.8h, v5.16b                //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    //de-interleave edge indices into U lanes (v26) and V lanes (v27)
    UZP1 v31.8b, v26.8b, v27.8b
    UZP2 v27.8b, v26.8b, v27.8b
    mov v26.8b,v31.8b
    TBL v24.8b, {v6.16b},v26.8b         //U offsets: vtbl1_s8(offset_tbl_u, edge_idx_u)
    TBL v25.8b, {v7.16b},v27.8b         //V offsets: vtbl1_s8(offset_tbl_v, edge_idx_v)
    //re-interleave U/V offsets back into pixel order
    ZIP1 v31.8b, v24.8b, v25.8b
    ZIP2 v25.8b, v24.8b, v25.8b
    mov v24.8b,v31.8b
    SADDW v20.8h, v20.8h , v24.8b       //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    SMAX v20.8h, v20.8h , v2.8h         //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    UMIN v20.8h, v20.8h , v4.8h         //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    SADDW v18.8h, v18.8h , v25.8b       //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    SMAX v18.8h, v18.8h , v2.8h         //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    UMIN v18.8h, v18.8h , v4.8h         //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))


//After the inner height loop: narrow the last processed row, then copy the
//saved left-column history back out to pu1_src_left.
INNER_LOOP_DONE:
    mov w8, w25                         //Loads ht
    xtn v20.8b, v20.8h                  //vmovn_s16(pi2_tmp_cur_row.val[0])
    ADD x5,sp,#0x4B                     //*au1_src_left_tmp
    mov x11, x17                        //Loads *pu1_src_left
    xtn2 v20.16b, v18.8h                //vmovn_s16(pi2_tmp_cur_row.val[1])


SRC_LEFT_LOOP:
    LDR w7, [x5],#4                     //au1_src_left_tmp[row] (two U/V pairs per iteration)
    SUBS x8,x8,#2
    STR w7, [x11],#4                    //pu1_src_left[row] = au1_src_left_tmp[row]
    BNE SRC_LEFT_LOOP

    SUBS x6,x6,#16                      //Decrement the wd loop count by 16
    ST1 { v20.16b},[x0],x1              //vst1q_u8(pu1_src_cpy, pu1_cur_row)
    CMP x6,#8                           //Check whether residue remains
    BLT RE_ASSINING_LOOP                //Jump to re-assigning loop
    mov w7, w24                         //Loads wd
    mov x0, x27                         //Loads *pu1_src
    SUB x7,x7,x6
    ADD x0,x0,x7                        //advance to the next 16-wide column strip
    BGT WIDTH_LOOP_16                   //If not equal jump to width_loop
    BEQ WIDTH_RESIDUE                   //If residue remains jump to residue loop


//Variant of the main loop for blocks 16 wide but short (ht == 4 path).
WD_16_HT_4_LOOP:
    mov x5, x21                         //Loads pu1_avail
    mov w7, w24                         //Loads wd
    CMP x6,x7                           //col == wd
    LDRb w20, [x5]                      //pu1_avail[0]
    csel w8,w20,w8,EQ
    MOV x20,#-1
    csel x8, x20, x8,NE
    mov v1.b[0], w8                     //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    mov v1.b[1], w8                     //au1_mask = vsetq_lane_s8(-1, au1_mask, 1)

    CMP x6,#16                          //if(col == 16)
    BNE SKIP_AU1_MASK_VAL_WD_16_HT_4
    LDRB w8,[x5,#1]                     //pu1_avail[1]
    mov v1.b[14], w8                    //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
    mov v1.b[15], w8                    //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL_WD_16_HT_4:
    LDRB w8,[x5,#2]                     //pu1_avail[2]
    CMP x8,#0
    SUB x20,x0,x1                       //pu1_src - src_strd
    csel x8, x20, x8,EQ
    csel x8, x3, x8,NE                  //pu1_src_top_cpy
    SUB x8,x8,#2                        //pu1_src - src_strd - 2
    LD1 {v3.16b},[x8]                   //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
    //LD1 {v11.8b},[x8]                 //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
    //SUB x8, x8,#8

    ADD x3,x3,#16
    ADD x5,sp,#0x4B                     //*au1_src_left_tmp
    mov w4, w25                         //Loads ht
    mov x7, x24                         //Loads wd
    SUB x7,x7,x6                        //(wd - col)
    ADD x7,x7,#14                       //14 + (wd - col): offset of last U/V pair in this strip
    mov x8, x26                         //Loads *pu1_src
    ADD x7,x8,x7                        //pu1_src[0 * src_strd + 14 + (wd - col)]

//Save the right-edge column of this strip (one U/V pair per row) to the stack.
AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    LDRH w8,[x7]                        //load the U/V pair
    STRH w8,[x5],#2                     //store it in the stack buffer
    ADD x7,x7,x1                        //step down one row
    SUBS x4,x4,#1                       //decrement the loop count
    BNE AU1_SRC_LEFT_LOOP_WD_16_HT_4

    LD1 {v5.16b},[x0]                   //pu1_cur_row = vld1q_u8(pu1_src)
    //LD1 {v13.8b},[x0]                 //pu1_cur_row = vld1q_u8(pu1_src)
    //SUB x0, x0,#8

    cmhi v17.16b, v5.16b , v3.16b       //vcgtq_u8(pu1_cur_row, pu1_top_row)
    cmhi v16.16b, v3.16b , v5.16b       //vcltq_u8(pu1_cur_row, pu1_top_row)
    SUB v17.16b, v16.16b , v17.16b      //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    movi v18.16b, #0
    MOV x7,x12                          //row count, move ht_tmp to x7

PU1_SRC_LOOP_WD_16_HT_4:
    movi v18.16b, #0
    ADD x8,x0,x1                        //*pu1_src + src_strd
    LD1 {v16.16b},[x8]                  //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //LD1 {v17.8b},[x8]                 //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //SUB x8, x8,#8

    ADD x8,x8,#16
    LDRH w5,[x8]                        //pu1_src_cpy[src_strd + 16]
    mov v18.h[0], w5                    //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    EXT v18.16b, v16.16b , v18.16b,#2   //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)

    CMP x7,x12                          //first row of the strip?
    BLT SIGN_UP_CHANGE_WD_16_HT_4
    mov x5, x21                         //Loads pu1_avail
    LDRB w5,[x5,#2]                     //pu1_avail[2]
    CMP x5,#0
    BNE SIGN_UP_CHANGE_DONE_WD_16_HT_4  //top row available: keep sign_up from pu1_top_row

//Recompute sign_up lanes 0/1 from the saved left-column history.
SIGN_UP_CHANGE_WD_16_HT_4:
    LDRB w8,[x0]                        //pu1_src_cpy[0]
    SUB x5,x12,x7                       //ht_tmp - row
    LSL x5,x5,#1                        //(ht_tmp - row) * 2
    ADD x9,x14,x5                       //pu1_src_left_cpy[(ht_tmp - row) * 2]
    sub x13,x9,#2
    LDRB w5,[x13]                       //load pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    SUB x8,x8,x5                        //pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    CMP x8,#0
    movn x20,#0
    csel x8, x20, x8,LT
    MOV x20,#1
    csel x8, x20, x8,GT                 //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    mov v17.b[0], w8                    //sign_up = vsetq_lane_s8(SIGN(...), sign_up, 0)

    LDRB w8,[x0,#1]                     //pu1_src_cpy[1]
    sub x13,x9,#1
    LDRB w5,[x13]                       //load pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    SUB x8,x8,x5                        //pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    CMP x8,#0
    movn x20,#0
    csel x8, x20, x8,LT
    MOV x20,#1
    csel x8, x20, x8,GT                 //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])
    mov v17.b[1], w8                    //sign_up = vsetq_lane_s8(SIGN(...), sign_up, 1)

SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    cmhi v22.16b, v5.16b , v18.16b      //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    cmhi v24.16b, v18.16b , v5.16b      //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    SUB v24.16b, v24.16b , v22.16b      //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    ADD v26.16b, v0.16b , v17.16b       //edge_idx = vaddq_s8(const_2, sign_up)
    ADD v26.16b, v26.16b , v24.16b      //edge_idx = vaddq_s8(edge_idx, sign_down)

    LD1 {v22.8b},[x2]                   //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    TBL v26.16b, {v22.16b},v26.16b      //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    //TBL v27.8b, {v22.16b},v27.8b      //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    AND v26.16b, v26.16b , v1.16b       //edge_idx = vandq_s8(edge_idx, au1_mask)
    mov v27.d[0],v26.d[1]

    NEG v17.16b, v24.16b                //sign_up = vnegq_s8(sign_down)
    EXT v17.16b, v17.16b , v17.16b,#14  //sign_up = vextq_s8(sign_up, sign_up, 14): shift by one U/V pair for the diagonal

    //de-interleave edge indices into U (v26) and V (v27) lanes
    UZP1 v31.8b, v26.8b, v27.8b
    UZP2 v27.8b, v26.8b, v27.8b
    mov v26.8b,v31.8b
    TBL v24.8b, {v6.16b},v26.8b         //U offsets
    TBL v25.8b, {v7.16b},v27.8b         //V offsets
    //re-interleave offsets back into pixel order
    ZIP1 v31.8b, v24.8b, v25.8b
    ZIP2 v25.8b, v24.8b, v25.8b
    mov v24.8b,v31.8b

    Uxtl v28.8h, v5.8b                  //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    SADDW v28.8h, v28.8h , v24.8b       //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    SMAX v28.8h, v28.8h , v2.8h         //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    UMIN v28.8h, v28.8h , v4.8h         //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    Uxtl2 v26.8h, v5.16b                //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    SADDW v26.8h, v26.8h , v25.8b       //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    SMAX v26.8h, v26.8h , v2.8h         //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    UMIN v26.8h, v26.8h , v4.8h         //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    xtn v28.8b, v28.8h                  //vmovn_s16(pi2_tmp_cur_row.val[0])
    xtn2 v28.16b, v26.8h                //vmovn_s16(pi2_tmp_cur_row.val[1])

    ST1 { v28.16b},[x0],x1              //vst1q_u8(pu1_src_cpy, pu1_cur_row)

    mov v5.16b, v16.16b                 //pu1_cur_row = pu1_next_row
    SUBS x7,x7,#1                       //Decrement the ht_tmp loop count by 1
    BNE PU1_SRC_LOOP_WD_16_HT_4         //If not equal jump to PU1_SRC_LOOP_WD_16_HT_4

    mov w8, w25                         //Loads ht
    ADD x5,sp,#0x4B                     //*au1_src_left_tmp
    mov x11, x17                        //Loads *pu1_src_left

SRC_LEFT_LOOP_WD_16_HT_4:
    LDR w7, [x5],#4                     //au1_src_left_tmp[row]
    STR w7, [x11],#4                    //pu1_src_left[row] = au1_src_left_tmp[row]
    SUBS x8,x8,#2
    BNE SRC_LEFT_LOOP_WD_16_HT_4

    SUBS x6,x6,#16                      //Decrement the wd loop count by 16
    BLE RE_ASSINING_LOOP                //Jump to re-assigning loop
    BGT WD_16_HT_4_LOOP


//Handle the final strip narrower than 16 bytes (processes 8 bytes = 4 U/V pairs).
WIDTH_RESIDUE:
    mov w7, w24                         //Loads wd
    mov x5, x21                         //Loads pu1_avail
    CMP x6,x7                           //wd_residue == wd
    LDRb w20, [x5]                      //pu1_avail[0]
    csel w8,w20,w8,EQ
    MOV x20,#-1
    csel x8, x20, x8,NE
    mov v1.b[0], w8                     //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    mov v1.b[1], w8                     //au1_mask = vsetq_lane_s8(-1, au1_mask, 1)

    LDRB w8,[x5,#1]                     //pu1_avail[1]
    mov v1.b[6], w8                     //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 6)
    mov v1.b[7], w8                     //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)

    LDRB w8,[x5,#2]                     //pu1_avail[2]
    CMP x8,#0
    SUB x20,x0,x1                       //pu1_src - src_strd
    csel x8, x20, x8,EQ
    csel x8, x3, x8,NE                  //pu1_src_top_cpy
    SUB x8,x8,#2                        //pu1_src - src_strd - 2
    LD1 {v3.16b},[x8]                   //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
    //LD1 {v11.8b},[x8]                 //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
    //SUB x8, x8,#8

    ADD x5,sp,#0x4B                     //*au1_src_left_tmp
    mov w4, w25                         //Loads ht
    mov w7, w24                         //Loads wd
    mov x8, x26                         //Loads *pu1_src
    SUB x7,x7,#2                        //(wd - 2)
    ADD x7,x8,x7                        //pu1_src[0 * src_strd + (wd - 2)]

//Save the right-edge U/V column of the residue strip to the stack.
AU1_SRC_LEFT_LOOP_RESIDUE:
    LDRH w8,[x7]                        //load the U/V pair
    STRH w8,[x5],#2                     //store it in the stack buffer
    ADD x7,x7,x1                        //step down one row
    SUBS x4,x4,#1                       //decrement the loop count
    BNE AU1_SRC_LEFT_LOOP_RESIDUE

    LD1 {v5.16b},[x0]                   //pu1_cur_row = vld1q_u8(pu1_src)
    //LD1 {v13.8b},[x0]                 //pu1_cur_row = vld1q_u8(pu1_src)
    //SUB x0, x0,#8

    cmhi v17.16b, v5.16b , v3.16b       //vcgtq_u8(pu1_cur_row, pu1_top_row)
    cmhi v16.16b, v3.16b , v5.16b       //vcltq_u8(pu1_cur_row, pu1_top_row)
    SUB v17.16b, v16.16b , v17.16b      //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV x7,x12                          //row count, move ht_tmp to x7

PU1_SRC_LOOP_RESIDUE:
    movi v18.16b, #0
    ADD x8,x0,x1                        //*pu1_src + src_strd
    LD1 {v16.16b},[x8]                  //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //LD1 {v17.8b},[x8]                 //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //SUB x8, x8,#8

    ADD x8,x8,#16
    LDRH w5,[x8]                        //pu1_src_cpy[src_strd + 16]
    mov v18.h[0], w5                    //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    EXT v18.16b, v16.16b , v18.16b,#2   //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)

    CMP x7,x12                          //first row of the strip?
    BLT SIGN_UP_CHANGE_RESIDUE
    mov x5, x21                         //Loads pu1_avail
    LDRB w5,[x5,#2]                     //pu1_avail[2]
    CMP x5,#0
    BNE SIGN_UP_CHANGE_DONE_RESIDUE     //top row available: keep sign_up from pu1_top_row

//Recompute sign_up lanes 0/1 from the saved left-column history.
SIGN_UP_CHANGE_RESIDUE:
    LDRB w8,[x0]                        //pu1_src_cpy[0]
    SUB x5,x12,x7                       //ht_tmp - row
    LSL x5,x5,#1                        //(ht_tmp - row) * 2
    ADD x9,x14,x5                       //pu1_src_left_cpy[(ht_tmp - row) * 2]
    sub x13,x9,#2
    LDRB w5,[x13]                       //load pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    SUB x8,x8,x5                        //pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    CMP x8,#0
    movn x20,#0
    csel x8, x20, x8,LT
    MOV x20,#1
    csel x8, x20, x8,GT                 //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    mov v17.b[0], w8                    //sign_up = vsetq_lane_s8(SIGN(...), sign_up, 0)

    LDRB w8,[x0,#1]                     //pu1_src_cpy[1]
    sub x13,x9,#1
    LDRB w5,[x13]                       //load pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    SUB x8,x8,x5                        //pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    CMP x8,#0
    movn x20,#0
    csel x8, x20, x8,LT
    MOV x20,#1
    csel x8, x20, x8,GT                 //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])
    mov v17.b[1], w8                    //sign_up = vsetq_lane_s8(SIGN(...), sign_up, 1)

SIGN_UP_CHANGE_DONE_RESIDUE:
    cmhi v22.16b, v5.16b , v18.16b      //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    cmhi v24.16b, v18.16b , v5.16b      //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    SUB v24.16b, v24.16b , v22.16b      //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    ADD v26.16b, v0.16b , v17.16b       //edge_idx = vaddq_s8(const_2, sign_up)
    ADD v26.16b, v26.16b , v24.16b      //edge_idx = vaddq_s8(edge_idx, sign_down)

    LD1 {v22.8b},[x2]                   //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    mov v22.d[1],v22.d[0]               //duplicate 8-byte table for the 16-lane TBL
    TBL v26.16b, {v22.16b},v26.16b      //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    //TBL v27.8b, {v22.16b},v27.8b      //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    AND v26.16b, v26.16b , v1.16b       //edge_idx = vandq_s8(edge_idx, au1_mask)
    mov v27.d[0],v26.d[1]

    NEG v17.16b, v24.16b                //sign_up = vnegq_s8(sign_down)
    EXT v17.16b, v17.16b , v17.16b,#14  //sign_up = vextq_s8(sign_up, sign_up, 14): shift by one U/V pair for the diagonal

    //de-interleave edge indices into U (v26) and V (v27) lanes
    UZP1 v31.8b, v26.8b, v27.8b
    UZP2 v27.8b, v26.8b, v27.8b
    mov v26.8b,v31.8b
    TBL v24.8b, {v6.16b},v26.8b         //U offsets
    TBL v25.8b, {v7.16b},v27.8b         //V offsets
    //re-interleave offsets back into pixel order
    ZIP1 v31.8b, v24.8b, v25.8b
    ZIP2 v25.8b, v24.8b, v25.8b
    mov v24.8b,v31.8b

    Uxtl v28.8h, v5.8b                  //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    SADDW v28.8h, v28.8h , v24.8b       //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    SMAX v28.8h, v28.8h , v2.8h         //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    UMIN v28.8h, v28.8h , v4.8h         //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    xtn v28.8b, v28.8h                  //vmovn_s16(pi2_tmp_cur_row.val[0])

    ST1 {v28.8b},[x0],x1                //vst1_u8(pu1_src_cpy, ...): only 8 residue bytes stored

    mov v5.16b, v16.16b                 //pu1_cur_row = pu1_next_row
    SUBS x7,x7,#1                       //Decrement the ht_tmp loop count by 1
    BNE PU1_SRC_LOOP_RESIDUE            //If not equal jump to PU1_SRC_LOOP

    mov w8, w25                         //Loads ht
    mov x11, x17                        //Loads *pu1_src_left
    ADD x5,sp,#0x4B                     //*au1_src_left_tmp

SRC_LEFT_LOOP_RESIDUE:
    LDR w7, [x5],#4                     //au1_src_left_tmp[row]
    SUBS x8,x8,#2
    STR w7, [x11],#4                    //pu1_src_left[row] = au1_src_left_tmp[row]
    BNE SRC_LEFT_LOOP_RESIDUE


//Write back the corner pixels and top row saved before filtering started.
RE_ASSINING_LOOP:
    mov w8, w25                         //Loads ht
    mov x0, x26                         //Loads *pu1_src
    SUB x8,x8,#1                        //ht - 1
    mov w7, w24                         //Loads wd
    LDRH w9,[sp,#6]                     //u1_pos_0_0_tmp (U+V pair)
    madd x6, x8, x1, x7                 //wd + (ht - 1) * src_strd
    STRH w9,[x0]                        //pu1_src_org[0] = u1_pos_0_0_tmp
    ADD x6,x0,x6                        //pu1_src[wd + (ht - 1) * src_strd]
    LDRH w9,[sp,#8]                     //u1_pos_wd_ht_tmp (U+V pair)
    ADD x12,sp,#10                      //au1_src_top_tmp
    sub x13,x6,#2
    STRH w9,[x13]                       //pu1_src_org[wd - 2 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp_u
    mov x4, x15                         //Loads pu1_src_top_left
    LDRH w10,[sp]                       //load u1_src_top_left_tmp from stack pointer
    STRH w10,[x4]                       //*pu1_src_top_left = u1_src_top_left_tmp
    mov x3, x22                         //Loads pu1_src_top

SRC_TOP_LOOP:
    LD1 {v0.8b},[x12],#8                //au1_src_top_tmp[col]
    SUBS x7,x7,#8                       //Decrement the width
    ST1 {v0.8b},[x3],#8                 //pu1_src_top[col] = au1_src_top_tmp[col]
    BNE SRC_TOP_LOOP

//Epilogue: release the local stack frame and restore callee-saved registers
//(x19-x28 per AAPCS64) in reverse order of the prologue pushes.
END_LOOPS:
    ADD sp,sp,#0xE0                     //free the 0xE0-byte local buffer area
    // LDMFD sp!,{x4-x12,x15}           //Reload the registers from SP
    ldp x27, x28,[sp],#16
    ldp x25, x26,[sp],#16
    ldp x23, x24,[sp],#16
    ldp x21, x22,[sp],#16
    ldp x19, x20,[sp],#16

    ret