///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_sao_band_offset_luma.s
//*
//* @brief
//*  Contains the function definition for SAO band offset on luma samples.
//*  The function is written in AArch64 (ARMv8) NEON assembly, GNU assembler
//*  syntax.
//*
//* @author
//*  Parthiban V
//*
//* @par List of Functions:
//*  - ihevc_sao_band_offset_luma_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//void ihevc_sao_band_offset_luma(UWORD8 *pu1_src,
//                           WORD32 src_strd,
//                           UWORD8 *pu1_src_left,
//                           UWORD8 *pu1_src_top,
//                           UWORD8 *pu1_src_top_left,
//                           WORD32 sao_band_pos,
//                           WORD8 *pi1_sao_offset,
//                           WORD32 wd,
//                           WORD32 ht)
//
//**************Variables Vs Registers*****************************************
//x0   => *pu1_src
//w1   => src_strd          (sign-extended into x1 on entry)
//x2   => *pu1_src_left
//x3   => *pu1_src_top
//x4   => *pu1_src_top_left
//w5   => sao_band_pos      (sign-extended into x5 on entry)
//x6   => *pi1_sao_offset
//w7   => wd                (sign-extended into x7 on entry)
//[sp] => ht                (ninth argument, passed on the stack, loaded into w8)
//
//**************Preconditions**************************************************
// The loops below count down by 1 (ht), 8 (wd) and 4 (ht) and only exit when
// the counter reaches exactly zero, so:
//   - wd must be a non-zero multiple of 8
//   - ht must be a non-zero multiple of 4
//
//**************Register usage*************************************************
// Scratch general registers : x4-x6 (re-used as row pointers), x9-x12, x14
// Scratch vector registers  : v0-v7, v13-v31
// Saved/restored on stack   : d8, d13, d14, d15, x19, x20
// Condition flags are clobbered.


.set WIDE_REFERENCE, 0
.set ARCHITECTURE, 5
.set DO1STROUNDING, 0

.include "ihevc_neon_macros.s"

.text
.p2align 2

.globl gu1_table_band_idx
.globl ihevc_sao_band_offset_luma_av8

ihevc_sao_band_offset_luma_av8:

    // STMFD sp!, {x4-x12, x14}                //stack stores the values of the arguments

    LDR         w8,[sp]                     //ht = 9th argument; must be read before sp is moved

    // The 32-bit (WORD32) arguments are used below as 64-bit operands for
    // address arithmetic, loop counters and compares. The procedure call
    // standard leaves the upper 32 bits of such argument registers
    // unspecified, so widen them explicitly before first use.
    sxtw        x1,w1                       //src_strd
    sxtw        x5,w5                       //sao_band_pos
    sxtw        x7,w7                       //wd

    // sp has to stay 16-byte aligned while it is used as a base address, so
    // registers are always pushed in 16-byte pairs.
    stp         d13,d14,[sp,#-16]!
    stp         d8,d15,[sp,#-16]!           // Storing d15 alone using { sub sp,sp,#8; str d15,[sp] } gives a bus error (sp would be misaligned).
                                            // d8 is only a filler to complete the pair; it is not used in the function.
    stp         x19, x20,[sp,#-16]!         // NOTE(review): x19/x20 are not referenced anywhere else in this function

    MOV         x9,x8                       //Move the ht to x9 for loop counter
    ADD         x10,x0,x7                   //pu1_src + wd

    SUB         x10,x10,#1                  //x10 = &pu1_src[wd - 1] (last column of row 0)
    ADRP        x14, :got:gu1_table_band_idx
    LDR         x14, [x14, #:got_lo12:gu1_table_band_idx]   //x14 = address of gu1_table_band_idx (via GOT)

    // Copy the last column of the block into pu1_src_left[0 .. ht-1]
SRC_LEFT_LOOP:
    LDRB        w11,[x10]                   //Load pu1_src[row * src_strd + (wd - 1)]
    add         x10, x10, x1                //Advance to the next row
    SUBS        x9,x9,#1                    //Decrement the loop counter
    STRB        w11,[x2],#1                 //Store the value in pu1_src_left pointer
    BNE         SRC_LEFT_LOOP

    ADD         x9,x3,x7                    //pu1_src_top[wd]
    LD1         {v1.8b},[x14],#8            //band_table.val[0]

    LSL         x11,x5,#3                   //sao_band_pos << 3
    LD1         {v2.8b},[x14],#8            //band_table.val[1]

    LDRB        w10,[x9,#-1]                //pu1_src_top[wd - 1]
    dup         v31.8b,w11                  //band_pos: low byte of (sao_band_pos << 3) in every lane
    SUB         x12,x8,#1                   //ht-1

    STRB        w10,[x4]                    //store to pu1_src_top_left[0]
    LD1         {v3.8b},[x14],#8            //band_table.val[2]
    mul         x12, x12, x1                //ht-1 * src_strd

    ADD         x4,x12,x0                   //pu1_src[(ht - 1) * src_strd]; x4 is re-used as a source pointer from here
    LD1         {v4.8b},[x14],#8            //band_table.val[3]
    MOV         x9,x7                       //Move the wd to x9 for loop counter

    // Copy the last row of the block into pu1_src_top[0 .. wd-1]
SRC_TOP_LOOP:                               //wd is always multiple of 8
    LD1         {v0.8b},[x4],#8             //Load pu1_src[(ht - 1) * src_strd + col]
    SUBS        x9,x9,#8                    //Decrement the loop counter by 8
    ST1         {v0.8b},[x3],#8             //Store to pu1_src_top[col]
    BNE         SRC_TOP_LOOP

    // Build the four 8-byte lookup rows: table + band_pos + per-row offset
    LD1         {v30.8b},[x6]               //pi1_sao_offset load (8 bytes are read; only bytes 1..4 are used)
    ADD         v5.8b,  v1.8b ,  v31.8b     //band_table.val[0] = vadd_u8(band_table.val[0], band_pos)

    dup         v29.8b, v30.b[1]            //vdup_n_u8(pi1_sao_offset[1])
    ADD         v6.8b,  v2.8b ,  v31.8b     //band_table.val[1] = vadd_u8(band_table.val[1], band_pos)

    dup         v28.8b, v30.b[2]            //vdup_n_u8(pi1_sao_offset[2])
    ADD         v7.8b,  v3.8b ,  v31.8b     //band_table.val[2] = vadd_u8(band_table.val[2], band_pos)

    dup         v27.8b, v30.b[3]            //vdup_n_u8(pi1_sao_offset[3])
    ADD         v21.8b,  v4.8b ,  v31.8b    //band_table.val[3] = vadd_u8(band_table.val[3], band_pos)

    dup         v26.8b, v30.b[4]            //vdup_n_u8(pi1_sao_offset[4])
    ADD         v1.8b,  v5.8b ,  v29.8b     //band_table.val[0] = vadd_u8(band_table.val[0], vdup_n_u8(pi1_sao_offset[1]))

    movi        v29.8b, #16                 //vdup_n_u8(16) (v29 is re-used; offset[1] is already consumed)
    ADD         v2.8b,  v6.8b ,  v28.8b     //band_table.val[1] = vadd_u8(band_table.val[1], vdup_n_u8(pi1_sao_offset[2]))

    CMP         x5,#28                      //flags are consumed by BLT below and by BNE in case 28
    ADD         v3.8b,  v7.8b ,  v27.8b     //band_table.val[2] = vadd_u8(band_table.val[2], vdup_n_u8(pi1_sao_offset[3]))

    ADD         v4.8b,  v21.8b ,  v26.8b    //band_table.val[3] = vadd_u8(band_table.val[3], vdup_n_u8(pi1_sao_offset[4]))
    BLT         SAO_BAND_POS_0              //sao_band_pos < 28: only case 0 needs special handling

    // switch (sao_band_pos) for 28..31. Each case computes its compare mask
    // before branching, because the following case re-uses that mask.
SAO_BAND_POS_28:                            //case 28

    cmhs        v25.8b,  v29.8b ,  v4.8b    //vcle_u8(band_table.val[3], vdup_n_u8(16))

    BNE         SAO_BAND_POS_29             //still testing the flags of CMP x5,#28 (NEON ops do not touch flags)
    ORR         v4.8b,  v4.8b ,  v25.8b     //band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp)
    B           SWITCH_BREAK

SAO_BAND_POS_29:                            //case 29
    CMP         x5,#29
    cmhs        v24.8b,  v29.8b ,  v3.8b    //vcle_u8(band_table.val[2], vdup_n_u8(16))

    BNE         SAO_BAND_POS_30
    ORR         v3.8b,  v3.8b ,  v24.8b     //band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp)

    AND         v4.8b,  v4.8b ,  v25.8b     //band_table.val[3] = vand_u8(band_table.val[3], au1_cmp)
    B           SWITCH_BREAK

SAO_BAND_POS_30:                            //case 30
    CMP         x5,#30
    cmhs        v23.8b,  v29.8b ,  v2.8b    //vcle_u8(band_table.val[1], vdup_n_u8(16))

    BNE         SAO_BAND_POS_31
    ORR         v2.8b,  v2.8b ,  v23.8b     //band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp)

    AND         v3.8b,  v3.8b ,  v24.8b     //band_table.val[2] = vand_u8(band_table.val[2], au1_cmp)
    B           SWITCH_BREAK

SAO_BAND_POS_31:                            //case 31
    CMP         x5,#31
    BNE         SWITCH_BREAK

    cmhs        v22.8b,  v29.8b ,  v1.8b    //vcle_u8(band_table.val[0], vdup_n_u8(16))
    ORR         v1.8b,  v1.8b ,  v22.8b     //band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp)

    AND         v2.8b,  v2.8b ,  v23.8b     //band_table.val[1] = vand_u8(band_table.val[1], au1_cmp)
                                            //falls through: the case-0 test below fails for 31 and branches to SWITCH_BREAK

SAO_BAND_POS_0:
    CMP         x5,#0                       //case 0
    BNE         SWITCH_BREAK

    cmhs        v22.8b,  v29.8b ,  v1.8b    //vcle_u8(band_table.val[0], vdup_n_u8(16))
    AND         v1.8b,  v1.8b ,  v22.8b     //band_table.val[0] = vand_u8(band_table.val[0], au1_cmp)

SWITCH_BREAK:

    // Pack the four 8-byte rows into two 16-byte registers so that
    // {v1.16b, v2.16b} forms one 32-entry table for TBX:
    //   v1 = val[0] : val[1],  v2 = val[2] : val[3]
    mov         v1.d[1],v2.d[0]
    mov         v2.d[0],v3.d[0]
    mov         v2.d[1],v4.d[0]

    // Outer loop: one 8-pixel wide column strip per iteration
SWITCH_BREAK_1:

    MOV         x4,x0                       //pu1_src_cpy (row 0 of the strip)
    MOV         x11,x8                      //move ht
    ADD         x5,x4,x1                    //row 1 (x5 is re-used as a pointer; sao_band_pos is no longer needed)

    // Inner loop: four rows per iteration. For every pixel, index = pixel - band_pos;
    // TBX replaces the pixel with table[index] when index < 32 and leaves it
    // unchanged otherwise.
HEIGHT_LOOP:
    ADD         x6,x5,x1                    //row 2 (x6 is re-used as a pointer; pi1_sao_offset is no longer needed)
    LD1         {v13.8b},[x4]               //au1_cur_row = vld1_u8(pu1_src_cpy)            row 0

    ADD         x10,x6,x1                   //row 3
    LD1         {v15.8b},[x5]               //au1_cur_row = vld1_u8(pu1_src_cpy)            row 1

    LD1         {v17.8b},[x6]               //au1_cur_row = vld1_u8(pu1_src_cpy)            row 2

    LD1         {v19.8b},[x10]              //au1_cur_row = vld1_u8(pu1_src_cpy)            row 3
    SUB         v14.8b,  v13.8b ,  v31.8b   //vsub_u8(au1_cur_row, band_pos)

    TBX         v13.8b, {v1.16b- v2.16b},v14.8b //vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
    SUB         v16.8b,  v15.8b ,  v31.8b   //vsub_u8(au1_cur_row, band_pos)

    TBX         v15.8b, {v1.16b- v2.16b},v16.8b //vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
    SUB         v18.8b,  v17.8b ,  v31.8b   //vsub_u8(au1_cur_row, band_pos)

    TBX         v17.8b, {v1.16b- v2.16b},v18.8b //vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
    SUB         v20.8b,  v19.8b ,  v31.8b   //vsub_u8(au1_cur_row, band_pos)

    TBX         v19.8b, {v1.16b- v2.16b},v20.8b //vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
    ST1         {v13.8b},[x4],x1            //vst1_u8(pu1_src_cpy, au1_cur_row)             row 0

    ST1         {v15.8b},[x5]               //vst1_u8(pu1_src_cpy, au1_cur_row)             row 1
    SUBS        x11,x11,#4                  //Decrement the ht loop count by 4

    ST1         {v17.8b},[x6],x1            //vst1_u8(pu1_src_cpy, au1_cur_row)             row 2; x6 now points at row 3

    ADD         x4,x6,x1                    //row 0 of the next group of four rows
    ST1         {v19.8b},[x10]              //vst1_u8(pu1_src_cpy, au1_cur_row)             row 3
    ADD         x5,x4,x1                    //row 1 of the next group of four rows

    BNE         HEIGHT_LOOP                 //uses the flags of SUBS above (ADD/ST1 do not change them)

    SUBS        x7,x7,#8                    //Decrement the width loop by 8
    ADD         x0,x0,#8                    //Move to the next 8-pixel column strip
    BNE         SWITCH_BREAK_1

    // LDMFD sp!,{x4-x12,x15}                  //Reload the registers from SP
    ldp         x19, x20,[sp], #16
    ldp         d8,d15,[sp],#16             // Loading d15 alone using { ldr d15,[sp]; add sp,sp,#8 } gives a bus error (sp would be misaligned).
                                            // d8 is only a filler to complete the pair; it is not used in the function.
    ldp         d13,d14,[sp],#16
    ret