@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* ,:file
@*  ihevc_sao_band_offset_luma.s
@*
@* ,:brief
@*  Contains the function definition for the SAO band offset filter (luma).
@*  The function is hand-written ARM NEON assembly in GNU assembler syntax.
@*
@* ,:author
@*  Parthiban V
@*
@* ,:par List of Functions:
@*  - ihevc_sao_band_offset_luma_a9q()
@*
@* ,:remarks
@*  None
@*
@*******************************************************************************
@*/
@void ihevc_sao_band_offset_luma(UWORD8 *pu1_src,
@                           WORD32 src_strd,
@                           UWORD8 *pu1_src_left,
@                           UWORD8 *pu1_src_top,
@                           UWORD8 *pu1_src_top_left,
@                           WORD32 sao_band_pos,
@                           WORD8 *pi1_sao_offset,
@                           WORD32 wd,
@                           WORD32 ht)
@
@**************Variables Vs Registers*****************************************
@r0 =>  *pu1_src            (register argument)
@r1 =>  src_strd            (register argument)
@r2 =>  *pu1_src_left       (register argument)
@r3 =>  *pu1_src_top        (register argument)
@r4 =>  *pu1_src_top_left   (loaded from the stack)
@r5 =>  sao_band_pos        (loaded from the stack; reused as a row pointer
@                            once the band table has been built)
@r6 =>  *pi1_sao_offset     (loaded from the stack)
@r7 =>  wd                  (loaded from the stack)
@r8 =>  ht                  (loaded from the stack)
@
@**************What the routine does******************************************
@ 1. Copies the last column of the block, pu1_src[row * src_strd + wd - 1],
@    into pu1_src_left[row] for every row.
@ 2. Copies pu1_src_top[wd - 1] into pu1_src_top_left[0].
@ 3. Copies the last row of the block into pu1_src_top[0 .. wd-1].
@    Steps 1-3 run before any pixel is modified, so they capture the
@    unfiltered samples.
@ 4. Builds a 32-byte lookup table in D1-D4:
@        table[8*k + i] = gu1_table_band_idx[8*k + i] + 8*sao_band_pos
@                         + pi1_sao_offset[k + 1]          (k = 0..3, mod 256)
@    and patches the lanes that can wrap around for sao_band_pos 0 and 28..31.
@ 5. For every pixel p, computes idx = p - 8*sao_band_pos (mod 256) and, via
@    VTBX, replaces p with table[idx] when idx < 32; other pixels are left
@    untouched.
@
@**************Assumptions visible in the code********************************
@ - wd is a non-zero multiple of 8 (columns are processed 8 at a time and the
@   loops terminate only when the counter reaches exactly zero).
@ - ht is a non-zero multiple of 4 (rows are processed 4 at a time, same exit
@   condition).
@ - 8 bytes are read from pi1_sao_offset although only entries 1..4 are used.
@ - The pc-relative literal below uses the "+8" pc read offset, i.e. the code
@   is expected to be assembled for ARM state, not Thumb.

@ Stack layout after the prologue: the five stack arguments sit above the
@ saved core registers and the saved NEON registers.
.equ    SAVED_GPR_BYTES,            40      @ r4-r12, r14 : 10 registers * 4 bytes
.equ    SAVED_NEON_BYTES,           64      @ d8-d15      :  8 registers * 8 bytes
.equ    STACK_ARGS_BASE,            (SAVED_GPR_BYTES + SAVED_NEON_BYTES)
.equ    PU1_SRC_TOP_LEFT_OFFSET,    (STACK_ARGS_BASE + 0)
.equ    SAO_BAND_POS_OFFSET,        (STACK_ARGS_BASE + 4)
.equ    PI1_SAO_OFFSET_OFFSET,      (STACK_ARGS_BASE + 8)
.equ    WD_OFFSET,                  (STACK_ARGS_BASE + 12)
.equ    HT_OFFSET,                  (STACK_ARGS_BASE + 16)

.text
.p2align 2

.extern gu1_table_band_idx
.globl ihevc_sao_band_offset_luma_a9q

@ Position-independent offset of gu1_table_band_idx relative to the pc value
@ observed by the "add r14,r14,pc" instruction at ulbl1 (address + 8).
gu1_table_band_idx_addr:
.long gu1_table_band_idx - ulbl1 - 8

ihevc_sao_band_offset_luma_a9q:

    STMFD       sp!, {r4-r12, r14}          @save the core registers used below and the return address
    VPUSH       {d8-d15}                    @d8-d15 are used as scratch below and must be preserved for the caller

    LDR         r8,[sp,#HT_OFFSET]          @Loads ht
    LDR         r7,[sp,#WD_OFFSET]          @Loads wd

    MOV         r9,r8                       @Move the ht to r9 for loop counter
    LDR         r5,[sp,#SAO_BAND_POS_OFFSET] @Loads sao_band_pos
    ADD         r10,r0,r7                   @&pu1_src[wd]

    LDR         r4,[sp,#PU1_SRC_TOP_LEFT_OFFSET] @Loads pu1_src_top_left
    SUB         r10,r10,#1                  @&pu1_src[wd - 1] : last column of row 0
    LDR         r14, gu1_table_band_idx_addr
ulbl1:
    add         r14,r14,pc                  @r14 = &gu1_table_band_idx[0]

@ Save the (unfiltered) last column of the block into pu1_src_left[0 .. ht-1].
SRC_LEFT_LOOP:
    LDRB        r11,[r10],r1                @r11 = pu1_src[row * src_strd + wd - 1], step to next row
    SUBS        r9,r9,#1                    @Decrement the loop counter
    STRB        r11,[r2],#1                 @pu1_src_left[row] = r11
    BNE         SRC_LEFT_LOOP

    ADD         r9,r3,r7                    @&pu1_src_top[wd]
    VLD1.8      D1,[r14]!                   @band_table.val[0] = gu1_table_band_idx[0..7]
    LDR         r6,[sp,#PI1_SAO_OFFSET_OFFSET] @Loads pi1_sao_offset

    LSL         r11,r5,#3                   @sao_band_pos * 8
    VLD1.8      D2,[r14]!                   @band_table.val[1] = gu1_table_band_idx[8..15]

    LDRB        r10,[r9,#-1]                @pu1_src_top[wd - 1], read before pu1_src_top is overwritten
    VDUP.8      D31,r11                     @band_pos = low byte of (sao_band_pos * 8) in every lane
    SUB         r12,r8,#1                   @ht-1

    STRB        r10,[r4]                    @store to pu1_src_top_left[0]
    VLD1.8      D3,[r14]!                   @band_table.val[2] = gu1_table_band_idx[16..23]
    MUL         r12,r12,r1                  @(ht-1) * src_strd

    ADD         r4,r12,r0                   @&pu1_src[(ht - 1) * src_strd] : last row
    VLD1.8      D4,[r14]!                   @band_table.val[3] = gu1_table_band_idx[24..31]
    MOV         r9,r7                       @Move the wd to r9 for loop counter

@ Save the (unfiltered) last row of the block into pu1_src_top[0 .. wd-1].
SRC_TOP_LOOP:                               @wd is always multiple of 8
    VLD1.8      D0,[r4]!                    @Load pu1_src[(ht - 1) * src_strd + col]
    SUBS        r9,r9,#8                    @Decrement the loop counter by 8
    VST1.8      D0,[r3]!                    @Store to pu1_src_top[col]
    BNE         SRC_TOP_LOOP

@ Build the lookup table: table = gu1_table_band_idx + band_pos + per-band offset.
    VLD1.8      D30,[r6]                    @pi1_sao_offset[0..7] (only lanes 1..4 are used)
    VADD.I8     D5,D1,D31                   @band_table.val[0] = vadd_u8(band_table.val[0], band_pos)

    VDUP.8      D29,D30[1]                  @vdup_n_u8(pi1_sao_offset[1])
    VADD.I8     D6,D2,D31                   @band_table.val[1] = vadd_u8(band_table.val[1], band_pos)

    VDUP.8      D28,D30[2]                  @vdup_n_u8(pi1_sao_offset[2])
    VADD.I8     D7,D3,D31                   @band_table.val[2] = vadd_u8(band_table.val[2], band_pos)

    VDUP.8      D27,D30[3]                  @vdup_n_u8(pi1_sao_offset[3])
    VADD.I8     D8,D4,D31                   @band_table.val[3] = vadd_u8(band_table.val[3], band_pos)

    VDUP.8      D26,D30[4]                  @vdup_n_u8(pi1_sao_offset[4])
    VADD.I8     D1,D5,D29                   @band_table.val[0] = vadd_u8(band_table.val[0], vdup_n_u8(pi1_sao_offset[1]))

    VMOV.I8     D29,#16                     @vdup_n_u8(16) : threshold used by the wrap-around fix-ups
    VADD.I8     D2,D6,D28                   @band_table.val[1] = vadd_u8(band_table.val[1], vdup_n_u8(pi1_sao_offset[2]))

    CMP         r5,#28                      @flags stay valid across the NEON instructions below
    VADD.I8     D3,D7,D27                   @band_table.val[2] = vadd_u8(band_table.val[2], vdup_n_u8(pi1_sao_offset[3]))

    VADD.I8     D4,D8,D26                   @band_table.val[3] = vadd_u8(band_table.val[3], vdup_n_u8(pi1_sao_offset[4]))
    BLT         SAO_BAND_POS_0              @sao_band_pos < 28 : only case 0 needs a fix-up

@ switch (sao_band_pos) for 28..31 and 0.
@ au1_cmp = (table lane <= 16) ? 0xFF : 0x00.
@   VORR with au1_cmp forces lanes <= 16 to 0xFF.
@   VAND with au1_cmp forces lanes  > 16 to 0x00.
@ Each compare is issued before its case test, so the mask is already
@ available for the VAND in the following case.
SAO_BAND_POS_28:                            @case 28

    VCLE.U8     D12,D4,D29                  @vcle_u8(band_table.val[3], vdup_n_u8(16))

    BNE         SAO_BAND_POS_29             @uses the flags of CMP r5,#28 above
    VORR.U8     D4,D4,D12                   @band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp)
    B           SWITCH_BREAK

SAO_BAND_POS_29:                            @case 29
    CMP         r5,#29
    VCLE.U8     D11,D3,D29                  @vcle_u8(band_table.val[2], vdup_n_u8(16))

    BNE         SAO_BAND_POS_30
    VORR.U8     D3,D3,D11                   @band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp)

    VAND.U8     D4,D4,D12                   @band_table.val[3] = vand_u8(band_table.val[3], au1_cmp)
    B           SWITCH_BREAK

SAO_BAND_POS_30:                            @case 30
    CMP         r5,#30
    VCLE.U8     D10,D2,D29                  @vcle_u8(band_table.val[1], vdup_n_u8(16))

    BNE         SAO_BAND_POS_31
    VORR.U8     D2,D2,D10                   @band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp)

    VAND.U8     D3,D3,D11                   @band_table.val[2] = vand_u8(band_table.val[2], au1_cmp)
    B           SWITCH_BREAK

SAO_BAND_POS_31:                            @case 31
    CMP         r5,#31
    BNE         SWITCH_BREAK

    VCLE.U8     D9,D1,D29                   @vcle_u8(band_table.val[0], vdup_n_u8(16))
    VORR.U8     D1,D1,D9                    @band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp)

    VAND.U8     D2,D2,D10                   @band_table.val[1] = vand_u8(band_table.val[1], au1_cmp)
                                            @falls through; the test below fails for 31 and exits the switch

SAO_BAND_POS_0:
    CMP         r5,#0                       @case 0
    BNE         SWITCH_BREAK

    VCLE.U8     D9,D1,D29                   @vcle_u8(band_table.val[0], vdup_n_u8(16))
    VAND.U8     D1,D1,D9                    @band_table.val[0] = vand_u8(band_table.val[0], au1_cmp)

@ Outer loop over 8-pixel-wide column strips (re-entered once per strip).
SWITCH_BREAK:
    MOV         r4,r0                       @pu1_src_cpy : row 0 of the current strip
    MOV         r11,r8                      @move ht
    ADD         r5,r4,r1                    @row 1 (sao_band_pos in r5 is no longer needed)

@ Inner loop: filter 4 rows x 8 pixels per iteration.
@ VTBX leaves a lane unchanged when its index is outside the 32-byte table.
HEIGHT_LOOP:
    ADD         r6,r5,r1                    @row 2
    VLD1.8      D13,[r4]                    @au1_cur_row (row 0) = vld1_u8(pu1_src_cpy)

    ADD         r10,r6,r1                   @row 3
    VLD1.8      D15,[r5]                    @au1_cur_row (row 1)

    VLD1.8      D17,[r6]                    @au1_cur_row (row 2)

    VLD1.8      D19,[r10]                   @au1_cur_row (row 3)
    VSUB.I8     D14,D13,D31                 @vsub_u8(au1_cur_row, band_pos)

    VTBX.8      D13,{D1-D4},D14             @vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
    VSUB.I8     D16,D15,D31                 @vsub_u8(au1_cur_row, band_pos)

    VTBX.8      D15,{D1-D4},D16             @vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
    VSUB.I8     D18,D17,D31                 @vsub_u8(au1_cur_row, band_pos)

    VTBX.8      D17,{D1-D4},D18             @vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
    VSUB.I8     D20,D19,D31                 @vsub_u8(au1_cur_row, band_pos)

    VTBX.8      D19,{D1-D4},D20             @vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
    VST1.8      D13,[r4],r1                 @store row 0 (r4 is recomputed below)

    VST1.8      D15,[r5]                    @store row 1
    SUBS        r11,r11,#4                  @Decrement the ht loop count by 4

    VST1.8      D17,[r6],r1                 @store row 2; r6 now points at row 3

    ADD         r4,r6,r1                    @row 0 of the next group of 4 rows
    VST1.8      D19,[r10]                   @store row 3
    ADD         r5,r4,r1                    @row 1 of the next group of 4 rows

    BNE         HEIGHT_LOOP                 @flags from SUBS r11 (VST1/ADD do not alter them)

    SUBS        r7,r7,#8                    @Decrement the width loop by 8
    ADD         r0,r0,#8                    @advance to the next 8-pixel strip (flags unchanged)
    BNE         SWITCH_BREAK

    VPOP        {d8-d15}                    @restore the callee-saved NEON registers
    LDMFD       sp!,{r4-r12,r15}            @restore the core registers and return (pc <- saved r14)