@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_inter_pred_chroma_copy_w16out_neon.s
@*
@* @brief
@*  contains function definitions for inter prediction interpolation.
@*  functions are coded in arm neon assembly and can be compiled using
@*  rvct
@*
@* @author
@*  yogeswaran rs
@*
@* @par list of functions:
@*  - ihevc_inter_pred_chroma_copy_w16out_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*  chroma inter-prediction filter for copy
@*
@* @par description:
@*  copies the array of width 'wd' and height 'ht' from the location pointed
@*  to by 'src' to the location pointed to by 'dst', widening each sample to
@*  16 bits and scaling it by (1 << 6)
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pi2_dst
@*  word16 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients (unused in the copy path)
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*  none
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_chroma_copy_w16out(uword8 *pu1_src,
@                                         word16 *pi2_dst,
@                                         word32 src_strd,
@                                         word32 dst_strd,
@                                         word8 *pi1_coeff,
@                                         word32 ht,
@                                         word32 wd)
@**************variables vs registers*****************************************
@r0 => *pu1_src
@r1 => *pi2_dst
@r2 =>  src_strd
@r3 =>  dst_strd
@r4 => *pi1_coeff
@r5 =>  ht
@r6 =>  wd
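@/**
@*******************************************************************************
@* illustrative c reference (not part of the original source): a minimal
@* sketch, assuming the 8-bit case in which the output scaling shift is 6
@* (matching the (1 << 6) shifts in the vshl instructions below), of what
@* this routine computes once the neon blocking is stripped away. the helper
@* name ref_chroma_copy_w16out is hypothetical, and the uword8/word16/word32
@* typedefs are the ones referenced in the comments above. chroma is stored
@* interleaved (cb/cr), so a width of 'wd' pixel pairs spans 2 * wd samples;
@* the assembly computes 2*wd into r12 for the same reason.
@*
@*  static void ref_chroma_copy_w16out(uword8 *pu1_src, word16 *pi2_dst,
@*                                     word32 src_strd, word32 dst_strd,
@*                                     word32 ht, word32 wd)
@*  {
@*      word32 row, col;
@*      for(row = 0; row < ht; row++)
@*      {
@*          for(col = 0; col < 2 * wd; col++)
@*          {
@*              /* widen each uword8 sample to word16 and scale by 64 */
@*              pi2_dst[col] = (word16)(pu1_src[col] << 6);
@*          }
@*          pu1_src += src_strd;   /* src_strd counts uword8 samples */
@*          pi2_dst += dst_strd;   /* dst_strd counts word16 samples */
@*      }
@*  }
@*
@*******************************************************************************
@*/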
.text
.align 4

.globl ihevc_inter_pred_chroma_copy_w16out_a9q

.type ihevc_inter_pred_chroma_copy_w16out_a9q, %function

ihevc_inter_pred_chroma_copy_w16out_a9q:

    stmfd       sp!, {r4-r12, r14}      @stack stores the values of the arguments
    ldr         r12,[sp,#48]            @loads wd
    lsl         r12,r12,#1              @r12 = 2*wd (interleaved cb/cr samples)
    ldr         r7,[sp,#44]             @loads ht
    cmp         r7,#0                   @ht condition(ht == 0)
    ble         end_loops               @nothing to do for ht <= 0
    and         r8,r7,#3                @r8 = ht % 4 (leftover rows)
    sub         r9,r7,r8                @r9 = ht rounded down to a multiple of 4
    and         r11,r7,#6
    cmp         r11,#6                  @(ht & 6) == 6 => take the 4-sample-wide path
    beq         loop_ht_6
    tst         r12,#7                  @check whether 2*wd is a multiple of 8
    beq         core_loop_wd_8

loop_ht_6:
    sub         r11,r12,#4              @r11 = 2*wd - 4 (row rewind amount)
    lsls        r6,r3,#1                @r6 = dst_strd in bytes
    cmp         r9,#0
    beq         outer_loop_wd_4_ht_2

outer_loop_wd_4:
    subs        r4,r12,#0               @wd conditional subtract
    ble         end_inner_loop_wd_4

inner_loop_wd_4:
    vld1.8      {d0},[r0]               @vld1_u8(pu1_src_tmp)
    add         r5,r0,r2                @pu1_src + src_strd
    vmovl.u8    q0,d0                   @vmovl_u8(vld1_u8(pu1_src_tmp))
    add         r10,r1,r6               @pi2_dst + dst_strd
    subs        r4,r4,#4                @wd - 4
    vshl.i64    q0,q0,#6                @vshlq_n_s64(temp, 6)
    vld1.8      {d22},[r5],r2           @vld1_u8(pu1_src_tmp)
    add         r0,r0,#4                @pu1_src += 4
    vst1.64     {d0},[r1]               @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    add         r1,r1,#8                @pi2_dst += 4 samples (8 bytes)
    vmovl.u8    q11,d22                 @vmovl_u8(vld1_u8(pu1_src_tmp))
    vld1.8      {d24},[r5],r2           @vld1_u8(pu1_src_tmp)
    vshl.i64    q11,q11,#6              @vshlq_n_s64(temp, 6)
    vmovl.u8    q12,d24                 @vmovl_u8(vld1_u8(pu1_src_tmp))
    vst1.64     {d22},[r10],r6          @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    vshl.i64    q12,q12,#6              @vshlq_n_s64(temp, 6)
    vld1.8      {d26},[r5],r2           @vld1_u8(pu1_src_tmp)
    vst1.64     {d24},[r10],r6          @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    vmovl.u8    q13,d26                 @vmovl_u8(vld1_u8(pu1_src_tmp))
    vshl.i64    q13,q13,#6              @vshlq_n_s64(temp, 6)
    vst1.64     {d26},[r10],r6          @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    bgt         inner_loop_wd_4

end_inner_loop_wd_4:
    subs        r9,r9,#4                @ht - 4
    sub         r0,r5,r11               @rewind pu1_src to the next row block
    sub         r1,r10,r11,lsl #1       @rewind pi2_dst (word16, hence lsl #1)
    bgt         outer_loop_wd_4
    cmp         r8,#0
    bgt         outer_loop_wd_4_ht_2

end_loops:
    ldmfd       sp!,{r4-r12,r15}        @restore the registers from sp and return

outer_loop_wd_4_ht_2:
    subs        r4,r12,#0               @wd conditional subtract
    ble         end_inner_loop_wd_4

inner_loop_wd_4_ht_2:
    vld1.8      {d0},[r0]               @vld1_u8(pu1_src_tmp)
    add         r5,r0,r2                @pu1_src + src_strd
    vmovl.u8    q0,d0                   @vmovl_u8(vld1_u8(pu1_src_tmp))
    add         r10,r1,r6               @pi2_dst + dst_strd
    subs        r4,r4,#4                @wd - 4
    vshl.i64    q0,q0,#6                @vshlq_n_s64(temp, 6)
    vld1.8      {d22},[r5],r2           @vld1_u8(pu1_src_tmp)
    add         r0,r0,#4                @pu1_src += 4
    vst1.64     {d0},[r1]               @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    add         r1,r1,#8                @pi2_dst += 4 samples (8 bytes)
    vmovl.u8    q11,d22                 @vmovl_u8(vld1_u8(pu1_src_tmp))
    vld1.8      {d24},[r5],r2           @vld1_u8(pu1_src_tmp)
    vshl.i64    q11,q11,#6              @vshlq_n_s64(temp, 6)
    vmovl.u8    q12,d24                 @vmovl_u8(vld1_u8(pu1_src_tmp))
    vst1.64     {d22},[r10],r6          @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    bgt         inner_loop_wd_4_ht_2
    b           end_loops

core_loop_wd_8:
    @sub        r11,r12,#8
    lsls        r5,r3,#1                @r5 = dst_strd in bytes
    rsb         r11,r12,r3, lsl #2      @r11 = (dst_strd * 4) - width
    rsb         r8,r12,r2,lsl #2        @r8 = (src_strd * 4) - width
    mov         r4,r12, lsr #3          @r4 = width / 8 (column blocks)
    mov         r7,r9
    mul         r7, r4                  @r7 = rounded ht * (width / 8): loop counter
    sub         r4,r12,#0               @wd conditional check
    sub         r7,r7,#4                @subtract one iteration for the epilog
    cmp         r9,#0
    beq         core_loop_wd_8_ht_2

prolog:
    add         r6,r0,r2                @pu1_src_tmp += src_strd
    add         r10,r1,r5               @pi2_dst + dst_strd
    vld1.8      {d8},[r0]!              @vld1_u8(pu1_src_tmp)
    vld1.8      {d10},[r6],r2           @vld1_u8(pu1_src_tmp)
    vld1.8      {d12},[r6],r2           @vld1_u8(pu1_src_tmp)
    vld1.8      {d14},[r6],r2           @vld1_u8(pu1_src_tmp)
    vmovl.u8    q8,d8                   @vmovl_u8(vld1_u8(pu1_src_tmp))
    vmovl.u8    q9,d10                  @vmovl_u8(vld1_u8(pu1_src_tmp))
    vmovl.u8    q10,d12                 @vmovl_u8(vld1_u8(pu1_src_tmp))
    vmovl.u8    q11,d14                 @vmovl_u8(vld1_u8(pu1_src_tmp))
    subs        r4,r4,#8                @wd decrements by 8
    vshl.i16    q0,q8,#6                @vshlq_n_s16(tmp, 6)
    vshl.i16    q1,q9,#6                @vshlq_n_s16(tmp, 6)
    vshl.i16    q2,q10,#6               @vshlq_n_s16(tmp, 6)
    vshl.i16    q3,q11,#6               @vshlq_n_s16(tmp, 6)
    addle       r0,r0,r8                @move pu1_src to the next row block
    add         r6,r0,r2                @pu1_src_tmp += src_strd
    vld1.8      {d8},[r0]!              @vld1_u8(pu1_src_tmp)
    vld1.8      {d10},[r6],r2           @vld1_u8(pu1_src_tmp)
    vld1.8      {d12},[r6],r2           @vld1_u8(pu1_src_tmp)
    vld1.8      {d14},[r6],r2           @vld1_u8(pu1_src_tmp)

    vst1.16     {d0,d1},[r1]!           @vst1q_s16(pi2_dst_tmp, tmp)
    addle       r1,r1,r11,lsl #1
    suble       r4,r12,#0               @wd conditional check

    subs        r7,r7,#4                @ht - 4

    blt         epilog_end              @jumps to epilog_end
    beq         epilog                  @jumps to epilog

outer_loop_wd_8:

    vst1.16     {d2,d3},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8    q8,d8                   @vmovl_u8(vld1_u8(pu1_src_tmp))

    vst1.16     {d4,d5},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8    q9,d10                  @vmovl_u8(vld1_u8(pu1_src_tmp))

    vst1.16     {d6,d7},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8    q10,d12                 @vmovl_u8(vld1_u8(pu1_src_tmp))

    vmovl.u8    q11,d14                 @vmovl_u8(vld1_u8(pu1_src_tmp))

    subs        r4,r4,#8                @wd decrements by 8
    addle       r0,r0,r8

    add         r6,r0,r2                @pu1_src_tmp += src_strd

    vld1.8      {d8},[r0]!              @vld1_u8(pu1_src_tmp)
    vshl.i16    q0,q8,#6                @vshlq_n_s16(tmp, 6)

    vld1.8      {d10},[r6],r2           @vld1_u8(pu1_src_tmp)
    vshl.i16    q1,q9,#6                @vshlq_n_s16(tmp, 6)

    vld1.8      {d12},[r6],r2           @vld1_u8(pu1_src_tmp)
    vshl.i16    q2,q10,#6               @vshlq_n_s16(tmp, 6)

    vld1.8      {d14},[r6],r2           @vld1_u8(pu1_src_tmp)
    add         r10,r1,r5

    vshl.i16    q3,q11,#6               @vshlq_n_s16(tmp, 6)

    vst1.16     {d0,d1},[r1]!           @vst1q_s16(pi2_dst_tmp, tmp)

    addle       r1,r1,r11,lsl #1
    suble       r4,r12,#0               @wd conditional check

    subs        r7,r7,#4                @ht - 4
    bgt         outer_loop_wd_8

epilog:
    vst1.16     {d2,d3},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8    q8,d8                   @vmovl_u8(vld1_u8(pu1_src_tmp))

    vst1.16     {d4,d5},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8    q9,d10                  @vmovl_u8(vld1_u8(pu1_src_tmp))

    vst1.16     {d6,d7},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8    q10,d12                 @vmovl_u8(vld1_u8(pu1_src_tmp))

    vmovl.u8    q11,d14                 @vmovl_u8(vld1_u8(pu1_src_tmp))
    @add        r6,r0,r2                @pu1_src_tmp += src_strd

    vshl.i16    q0,q8,#6                @vshlq_n_s16(tmp, 6)
    vshl.i16    q1,q9,#6                @vshlq_n_s16(tmp, 6)
    vshl.i16    q2,q10,#6               @vshlq_n_s16(tmp, 6)
    add         r10,r1,r5
    vshl.i16    q3,q11,#6               @vshlq_n_s16(tmp, 6)

    vst1.16     {d0,d1},[r1]!           @vst1q_s16(pi2_dst_tmp, tmp)
epilog_end:
    vst1.16     {d2,d3},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
    vst1.16     {d4,d5},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
    vst1.16     {d6,d7},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
    b           end_loops

core_loop_wd_8_ht_2:
    add         r6,r0,r2                @pu1_src_tmp += src_strd
    add         r10,r1,r5
    vld1.8      {d8},[r0]!              @vld1_u8(pu1_src_tmp)
    vld1.8      {d10},[r6],r2           @vld1_u8(pu1_src_tmp)
    vmovl.u8    q8,d8                   @vmovl_u8(vld1_u8(pu1_src_tmp))
    vmovl.u8    q9,d10                  @vmovl_u8(vld1_u8(pu1_src_tmp))
    subs        r12,r12,#8              @wd decrements by 8
    vshl.i16    q0,q8,#6                @vshlq_n_s16(tmp, 6)
    vshl.i16    q1,q9,#6                @vshlq_n_s16(tmp, 6)
    vst1.16     {d0,d1},[r1]!           @vst1q_s16(pi2_dst_tmp, tmp)
    vst1.16     {d2,d3},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
    bgt         core_loop_wd_8_ht_2

    ldmfd       sp!,{r4-r12,r15}        @restore the registers from sp and return