@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_inter_pred_chroma_vert_neon_w16inp_w16out_neon.s
@*
@* @brief
@*  contains function definitions for inter prediction interpolation.
@*  functions are coded using neon assembly instructions and can be compiled
@*  using rvct
@*
@* @author
@*  yogeswaran rs / parthiban
@*
@* @par list of functions:
@*  - ihevc_inter_pred_chroma_vert_w16inp_w16out_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*  chroma interprediction filter for 16bit vertical input and output.
@*
@* @par description:
@*  applies a 4 tap vertical filter with coefficients pointed to by
@*  'pi1_coeff' to the 16 bit elements pointed to by 'pi2_src' and writes the
@*  result to the location pointed to by 'pi2_dst'. the 32 bit filter sum is
@*  downshifted by 6 with signed saturation and stored as a 16 bit number.
@*  this routine applies no other offset to the result. the output is used
@*  as an input to weighted prediction.
@*
@*  for an output sample at row y the taps are applied to source rows
@*  y-1, y, y+1 and y+2, so one row above and two rows below the block are
@*  read. each row that is processed is 2*wd 16 bit samples (4*wd bytes)
@*  wide.
@*
@*  assumptions : the function is optimized considering the fact width and
@*  height are multiple of 2.
@*
@* @param[in] pi2_src
@*  word16 pointer to the source
@*
@* @param[out] pi2_dst
@*  word16 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride (in 16 bit elements, it is doubled to get bytes)
@*
@* @param[in] dst_strd
@*  integer destination stride (in 16 bit elements, it is doubled to get bytes)
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients. 8 bytes are loaded from this
@*  address although only the first 4 are used.
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*  none
@*
@* @remarks
@*  d8-d15 are callee saved in the arm procedure call standard. they are
@*  used here (d12-d15 hold the coefficients, q4 is an accumulator), so they
@*  are pushed on entry and popped on exit.
@*
@*******************************************************************************
@*/
@void ihevc_inter_pred_chroma_vert_w16inp_w16out(word16 *pi2_src,
@                                                 word16 *pi2_dst,
@                                                 word32 src_strd,
@                                                 word32 dst_strd,
@                                                 word8 *pi1_coeff,
@                                                 word32 ht,
@                                                 word32 wd)
@**************variables vs registers*****************************************
@r0          => *pi2_src
@r1          => *pi2_dst
@r2          => src_strd
@r3          => dst_strd
@[sp,#0]     => *pi1_coeff   (offsets as seen by the caller)
@[sp,#4]     => ht
@[sp,#8]     => wd

@ stack layout after the prologue:
@   10 core registers (r4-r12, r14) = 40 bytes
@   8 neon registers  (d8-d15)      = 64 bytes
@ so the first stack argument sits at sp + 104.
.equ    SAVED_REGS_SIZE,    104
.equ    COEFF_OFFSET,       SAVED_REGS_SIZE + 0
.equ    HT_OFFSET,          SAVED_REGS_SIZE + 4
.equ    WD_OFFSET,          SAVED_REGS_SIZE + 8

.text
.align 4

.globl ihevc_inter_pred_chroma_vert_w16inp_w16out_a9q

.type ihevc_inter_pred_chroma_vert_w16inp_w16out_a9q, %function

ihevc_inter_pred_chroma_vert_w16inp_w16out_a9q:

    stmfd       sp!, {r4-r12, r14}          @save core registers and return address
    vpush       {d8-d15}                    @save callee saved neon registers

    ldr         r4, [sp,#COEFF_OFFSET]      @r4 = pi1_coeff
    ldr         r6, [sp,#WD_OFFSET]         @r6 = wd
    lsl         r2,r2,#1                    @r2 = src_strd in bytes
    ldr         r5, [sp,#HT_OFFSET]         @r5 = ht
    vld1.8      {d0},[r4]                   @load 8 coefficient bytes
    sub         r4,r0,r2                    @r4 = pi2_src - one row (first tap row)
    vmovl.s8    q0,d0                       @sign extend coefficients to 16 bit

    tst         r6,#3                       @is wd a multiple of 4 ?
    vdup.16     d12,d0[0]                   @coeff_0
    vdup.16     d13,d0[1]                   @coeff_1
    vdup.16     d14,d0[2]                   @coeff_2
    vdup.16     d15,d0[3]                   @coeff_3

    @ tst does not write the v flag, so a signed condition such as gt would
    @ depend on stale flag state. ne tests the result of tst alone.
    bne         core_loop_ht_2              @wd not a multiple of 4 : 2 row loop

    tst         r5,#3                       @is ht a multiple of 4 ?
    beq         core_loop_ht_4              @yes : 4 row pipelined loop

@ ---------------------------------------------------------------------------
@ 2 row loop : each inner iteration produces 4 samples for 2 output rows
@ ---------------------------------------------------------------------------
core_loop_ht_2:
    lsl         r7,r2,#1                    @r7 = 2 source rows in bytes
    lsl         r3,r3,#1                    @r3 = dst_strd in bytes
    lsl         r9,r6,#2                    @r9 = 4*wd = bytes processed per row
    sub         r6,r3,r6,lsl #1             @r6 = dst row bytes - 2*wd
    sub         r8,r7,r9                    @r8 = 2 source rows - row bytes
    mov         r12,r9                      @r12 = bytes left in this row

inner_loop_ht_2:
    add         r0,r4,r2                    @r0 = second tap row
    vld1.16     {d0},[r4]!                  @row -1, advance 8 bytes
    vmull.s16   q0,d0,d12                   @out0  = row(-1) * coeff_0
    subs        r12,r12,#8                  @8 bytes of the row consumed
    vld1.16     {d2},[r0],r2                @row 0
    vmull.s16   q4,d2,d12                   @out1  = row(0) * coeff_0
    vld1.16     {d3},[r0],r2                @row 1
    vmlal.s16   q0,d2,d13                   @out0 += row(0) * coeff_1
    vld1.16     {d6},[r0],r2                @row 2
    vmlal.s16   q4,d3,d13                   @out1 += row(1) * coeff_1
    vld1.16     {d2},[r0]                   @row 3
    add         r7,r1,r3                    @r7 = second destination row
    vmlal.s16   q0,d3,d14                   @out0 += row(1) * coeff_2
    vmlal.s16   q4,d6,d14                   @out1 += row(2) * coeff_2
    vmlal.s16   q0,d6,d15                   @out0 += row(2) * coeff_3
    vmlal.s16   q4,d2,d15                   @out1 += row(3) * coeff_3
    vqshrn.s32  d0,q0,#6                    @saturating narrow, >> 6
    vqshrn.s32  d30,q4,#6                   @saturating narrow, >> 6
    vst1.32     {d0},[r1]!                  @store output row 0, advance 8 bytes
    vst1.32     {d30},[r7]                  @store output row 1
    bgt         inner_loop_ht_2             @more columns in this row pair

    @inner loop ends
    subs        r5,r5,#2                    @two rows done
    add         r1,r1,r6,lsl #1             @dst : rewind the row, go down 2 rows
    mov         r12,r9                      @reload bytes per row
    add         r4,r4,r8                    @src : rewind the row, go down 2 rows
    bgt         inner_loop_ht_2             @more rows

    b           end_loops                   @done

@ ---------------------------------------------------------------------------
@ 4 row loop, software pipelined as prolog / kernel_4 / epilog.
@ each stage produces a 4 sample wide, 4 row tall tile. stores of a tile are
@ interleaved with the arithmetic of the next one.
@ ---------------------------------------------------------------------------
core_loop_ht_4:
    lsl         r7,r2,#2                    @r7 = 4 source rows in bytes
    lsl         r10,r3,#2                   @r10 = 4*dst_strd (elements)
    mov         r11,r6,lsr #1               @r11 = wd / 2
    sub         lr,r10,r6,lsl #1            @lr = 4*dst_strd - 2*wd, used as lr << 1
    sub         r8,r7,r6,lsl #2             @r8 = 4 source rows - row bytes

    mul         r12,r5,r11                  @r12 = ht * wd / 2 = 4 * number of tiles
    sub         r12,#4                      @one tile is handled by the epilog
    mov         r11,r6,lsl #1               @r11 = 2*wd = samples left in this row
    lsl         r3,r3,#1                    @r3 = dst_strd in bytes

prolog:
    add         r0,r4,r2                    @r0 = second tap row
    vld1.16     {d0},[r4]!                  @row -1, advance 8 bytes
    vld1.16     {d1},[r0],r2                @row 0
    subs        r11,r11,#4                  @4 samples of the row consumed
    vld1.16     {d2},[r0],r2                @row 1
    vmull.s16   q15,d0,d12                  @out0  = row(-1) * coeff_0
    vld1.16     {d3},[r0],r2                @row 2
    vmlal.s16   q15,d1,d13                  @out0 += row(0) * coeff_1
    vmlal.s16   q15,d2,d14                  @out0 += row(1) * coeff_2
    add         r9,r1,r3                    @r9 = second destination row
    vmlal.s16   q15,d3,d15                  @out0 += row(2) * coeff_3

    vld1.16     {d4},[r0],r2                @row 3
    vmull.s16   q14,d1,d12                  @out1  = row(0) * coeff_0
    addle       r4,r4,r8                    @row finished : src down 4 rows
    movle       r11,r6,lsl #1               @row finished : reload samples per row
    vmlal.s16   q14,d2,d13                  @out1 += row(1) * coeff_1
    vmlal.s16   q14,d3,d14                  @out1 += row(2) * coeff_2
    vld1.s16    {d5},[r0],r2                @row 4
    vmlal.s16   q14,d4,d15                  @out1 += row(3) * coeff_3

    vqshrn.s32  d30,q15,#6                  @saturating narrow, >> 6

    vld1.s16    {d6},[r0],r2                @row 5
    vmull.s16   q13,d2,d12                  @out2  = row(1) * coeff_0
    vmlal.s16   q13,d3,d13                  @out2 += row(2) * coeff_1
    vmlal.s16   q13,d4,d14                  @out2 += row(3) * coeff_2
    add         r0,r4,r2                    @next tile : second tap row
    vld1.16     {d0},[r4]!                  @next tile : row -1
    vmlal.s16   q13,d5,d15                  @out2 += row(4) * coeff_3

    vqshrn.s32  d28,q14,#6                  @saturating narrow, >> 6

    vld1.16     {d1},[r0],r2                @next tile : row 0
    vmull.s16   q12,d3,d12                  @out3  = row(2) * coeff_0
    vst1.32     {d30},[r1]!                 @store output row 0, advance 8 bytes
    vmlal.s16   q12,d4,d13                  @out3 += row(3) * coeff_1
    vld1.16     {d2},[r0],r2                @next tile : row 1
    vmlal.s16   q12,d5,d14                  @out3 += row(4) * coeff_2
    vld1.16     {d3},[r0],r2                @next tile : row 2
    vmlal.s16   q12,d6,d15                  @out3 += row(5) * coeff_3
    addle       r1,r1,lr,lsl #1             @row finished : dst down 4 rows

    vqshrn.s32  d26,q13,#6                  @saturating narrow, >> 6
    subs        r12,r12,#4                  @one tile issued

    beq         epilog                      @only the epilog tile is left

kernel_4:
    vmull.s16   q15,d0,d12                  @out0  = row(-1) * coeff_0
    subs        r11,r11,#4                  @4 samples of the row consumed
    vmlal.s16   q15,d1,d13                  @out0 += row(0) * coeff_1
    vst1.32     {d28},[r9],r3               @previous tile : store output row 1
    vmlal.s16   q15,d2,d14                  @out0 += row(1) * coeff_2
    vmlal.s16   q15,d3,d15                  @out0 += row(2) * coeff_3

    vqshrn.s32  d24,q12,#6                  @previous tile : narrow output row 3

    vld1.16     {d4},[r0],r2                @row 3
    vmull.s16   q14,d1,d12                  @out1  = row(0) * coeff_0
    vmlal.s16   q14,d2,d13                  @out1 += row(1) * coeff_1
    vmlal.s16   q14,d3,d14                  @out1 += row(2) * coeff_2
    vmlal.s16   q14,d4,d15                  @out1 += row(3) * coeff_3
    vst1.32     {d26},[r9],r3               @previous tile : store output row 2
    addle       r4,r4,r8                    @row finished : src down 4 rows
    movle       r11,r6,lsl #1               @row finished : reload samples per row

    vqshrn.s32  d30,q15,#6                  @saturating narrow, >> 6

    vld1.s16    {d5},[r0],r2                @row 4
    vmull.s16   q13,d2,d12                  @out2  = row(1) * coeff_0
    vld1.s16    {d6},[r0],r2                @row 5
    vmlal.s16   q13,d3,d13                  @out2 += row(2) * coeff_1
    vst1.32     {d24},[r9]                  @previous tile : store output row 3
    add         r0,r4,r2                    @next tile : second tap row
    vmlal.s16   q13,d4,d14                  @out2 += row(3) * coeff_2
    vld1.16     {d0},[r4]!                  @next tile : row -1
    vmlal.s16   q13,d5,d15                  @out2 += row(4) * coeff_3

    vqshrn.s32  d28,q14,#6                  @saturating narrow, >> 6

    vld1.16     {d1},[r0],r2                @next tile : row 0
    vmull.s16   q12,d3,d12                  @out3  = row(2) * coeff_0
    vld1.16     {d2},[r0],r2                @next tile : row 1
    vmlal.s16   q12,d4,d13                  @out3 += row(3) * coeff_1
    add         r9,r1,r3                    @r9 = second destination row
    vld1.16     {d3},[r0],r2                @next tile : row 2
    vmlal.s16   q12,d5,d14                  @out3 += row(4) * coeff_2

    vst1.32     {d30},[r1]!                 @store output row 0, advance 8 bytes
    vmlal.s16   q12,d6,d15                  @out3 += row(5) * coeff_3

    vqshrn.s32  d26,q13,#6                  @saturating narrow, >> 6
    addle       r1,r1,lr,lsl #1             @row finished : dst down 4 rows

    subs        r12,r12,#4                  @one tile issued

    bgt         kernel_4                    @more tiles before the epilog

epilog:
    vmull.s16   q15,d0,d12                  @out0  = row(-1) * coeff_0
    vst1.32     {d28},[r9],r3               @previous tile : store output row 1
    vmlal.s16   q15,d1,d13                  @out0 += row(0) * coeff_1
    vmlal.s16   q15,d2,d14                  @out0 += row(1) * coeff_2
    vmlal.s16   q15,d3,d15                  @out0 += row(2) * coeff_3

    vqshrn.s32  d24,q12,#6                  @previous tile : narrow output row 3

    vmull.s16   q14,d1,d12                  @out1  = row(0) * coeff_0
    vld1.16     {d4},[r0],r2                @row 3
    vmlal.s16   q14,d2,d13                  @out1 += row(1) * coeff_1
    vst1.32     {d26},[r9],r3               @previous tile : store output row 2
    vmlal.s16   q14,d3,d14                  @out1 += row(2) * coeff_2
    vmlal.s16   q14,d4,d15                  @out1 += row(3) * coeff_3

    vqshrn.s32  d30,q15,#6                  @saturating narrow, >> 6

    vmull.s16   q13,d2,d12                  @out2  = row(1) * coeff_0
    vld1.s16    {d5},[r0],r2                @row 4
    vmlal.s16   q13,d3,d13                  @out2 += row(2) * coeff_1
    vmlal.s16   q13,d4,d14                  @out2 += row(3) * coeff_2
    vmlal.s16   q13,d5,d15                  @out2 += row(4) * coeff_3

    vqshrn.s32  d28,q14,#6                  @saturating narrow, >> 6

    vst1.32     {d24},[r9]                  @previous tile : store output row 3
    vmull.s16   q12,d3,d12                  @out3  = row(2) * coeff_0
    vmlal.s16   q12,d4,d13                  @out3 += row(3) * coeff_1
    add         r9,r1,r3                    @r9 = second destination row
    vld1.s16    {d6},[r0],r2                @row 5
    vmlal.s16   q12,d5,d14                  @out3 += row(4) * coeff_2
    vmlal.s16   q12,d6,d15                  @out3 += row(5) * coeff_3
    vst1.32     {d30},[r1]!                 @store output row 0

    vqshrn.s32  d26,q13,#6                  @saturating narrow, >> 6

    vst1.32     {d28},[r9],r3               @store output row 1

    vqshrn.s32  d24,q12,#6                  @saturating narrow, >> 6
    vst1.32     {d26},[r9],r3               @store output row 2

    vst1.32     {d24},[r9]                  @store output row 3

end_loops:
    vpop        {d8-d15}                    @restore callee saved neon registers
    ldmfd       sp!,{r4-r12,r15}            @restore core registers and return