@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_chroma_horz_neon.s
@*
@* @brief
@*  contains function definition for intra prediction interpolation filters
@*
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*  - ihevc_intra_pred_chroma_horz()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@
@/**
@*******************************************************************************
@*
@* @brief
@*  intra prediction interpolation filter for horizontal chroma variable.
@*
@* @par description:
@*  horizontal intra prediction (mode 10) from reference samples located
@*  at 'pu1_ref' into the tu block location pointed to by 'pu1_dst'. refer
@*  to section 8.4.4.2.6 in the standard (special case)
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] nt
@*  integer transform block size
@*
@* @param[in] mode
@*  integer intraprediction mode
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@void ihevc_intra_pred_chroma_horz(uword8 *pu1_ref,
@                                  word32 src_strd,
@                                  uword8 *pu1_dst,
@                                  word32 dst_strd,
@                                  word32 nt,
@                                  word32 mode)
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 =>  src_strd
@r2 => *pu1_dst
@r3 =>  dst_strd
@
@ NOTE(review): this routine uses q4-q7 (d8-d15), which are callee-saved per
@ the ARM AAPCS. They are now saved/restored with vpush/vpop around the body;
@ the stack offset used to fetch 'nt' accounts for the extra 64 bytes.

.text
.align 4




.globl ihevc_intra_pred_chroma_horz_a9q

.type ihevc_intra_pred_chroma_horz_a9q, %function

ihevc_intra_pred_chroma_horz_a9q:

    stmfd       sp!, {r4-r12, r14}      @ save callee-saved core regs + lr
    vpush       {d8-d15}                @ BUGFIX: save callee-saved NEON regs
                                        @ (q4-q7 are clobbered below)

    ldr         r4, [sp, #104]          @ loads nt (5th arg: 40B core regs + 64B d-regs)

    lsl         r6, r4, #2              @ four_nt

    add         r12, r0, r6             @ r12 = &pu1_ref[four_nt]
    cmp         r4, #4                  @ if nt == 4
    beq         core_loop_4

    cmp         r4, #8                  @ if nt == 8
    beq         core_loop_8

    @cmp        r4, #16                 @ if nt == 16
    @beq        core_loop_16

    sub         r12, r12, #16           @ move to 16th value pointer
    add         r9, r2, #16             @ r9 writes the second 16-byte half of each row

core_loop_16:
    vld1.16     {q0}, [r12]             @ load 16 values. d1[7] will have the 1st value.
    sub         r12, r12, #16
    vld1.16     {q5}, [r12]             @ load 16 values. d1[7] will have the 1st value.

    vdup.16     q1, d1[3]               @ duplicate the i value.

    vdup.16     q2, d1[2]               @ duplicate the ii value.
    vdup.16     q3, d1[1]               @ duplicate the iii value.
    vst1.16     {q1}, [r2], r3          @ store in 1st row 0-16 columns
    vst1.16     {q1}, [r9], r3          @ store in 1st row 16-32 columns

    vdup.16     q4, d1[0]
    vst1.16     {q2}, [r2], r3
    vst1.16     {q2}, [r9], r3

    vdup.16     q1, d0[3]
    vst1.16     {q3}, [r2], r3
    vst1.16     {q3}, [r9], r3

    vdup.16     q2, d0[2]
    vst1.16     {q4}, [r2], r3
    vst1.16     {q4}, [r9], r3

    vdup.16     q3, d0[1]
    vst1.16     {q1}, [r2], r3
    vst1.16     {q1}, [r9], r3

    vdup.16     q4, d0[0]
    vst1.16     {q2}, [r2], r3
    vst1.16     {q2}, [r9], r3

    vdup.16     q1, d11[3]
    vst1.16     {q3}, [r2], r3
    vst1.16     {q3}, [r9], r3

    vdup.16     q2, d11[2]
    vst1.16     {q4}, [r2], r3
    vst1.16     {q4}, [r9], r3

    vdup.16     q3, d11[1]
    vst1.16     {q1}, [r2], r3
    vst1.16     {q1}, [r9], r3

    vdup.16     q4, d11[0]
    vst1.16     {q2}, [r2], r3
    vst1.16     {q2}, [r9], r3

    vdup.16     q1, d10[3]
    vst1.16     {q3}, [r2], r3
    vst1.16     {q3}, [r9], r3

    vdup.16     q2, d10[2]
    vst1.16     {q4}, [r2], r3
    vst1.16     {q4}, [r9], r3

    vdup.16     q3, d10[1]
    vst1.16     {q1}, [r2], r3
    vst1.16     {q1}, [r9], r3
    sub         r12, r12, #16           @ move to 16th value pointer

    vdup.16     q4, d10[0]
    vst1.16     {q2}, [r2], r3
    vst1.16     {q2}, [r9], r3

    subs        r4, r4, #16             @ decrement the loop count by 16
    vst1.16     {q3}, [r2], r3
    vst1.16     {q3}, [r9], r3

    vst1.16     {q4}, [r2], r3
    vst1.16     {q4}, [r9], r3
    bgt         core_loop_16
    vpop        {d8-d15}                @ restore callee-saved NEON regs
    ldmfd       sp!, {r4-r12, r15}      @ restore regs and return (pops pc)
    b           endloop                 @ unreachable (kept from original)

core_loop_8:
    ldrb        lr, [r12], #1           @ pu1_ref[two_nt]
    @vld1.8     {q15}, [r12]            @ pu1_ref[two_nt + 1 + col]

    vdup.8      d28, lr
    sub         r12, r12, #17
    vld1.8      {q0}, [r12]

    sub         r12, r12, #16
    vld1.8      {q15}, [r12]
    vdup.16     q5, d1[3]
    @vmovl.u8   q13, d26

    vdup.16     q1, d1[2]
    @vsubl.u8   q12, d30, d28

    vdup.16     q2, d1[1]
    @vshr.s16   q12, q12, #1

    vdup.16     q3, d1[0]
    @vqadd.s16  q11, q13, q12

    vdup.16     q4, d0[3]
    @vqmovun.s16 d22, q11

    vst1.16     {q5}, [r2], r3

    vdup.16     q5, d0[2]
    @vsubl.u8   q12, d31, d28

    vdup.16     q6, d0[1]
    @vshr.s16   q12, q12, #1

    vdup.16     q7, d0[0]
    @vqadd.s16  q11, q13, q12

    vdup.16     q8, d0[3]               @ NOTE(review): q8 is never stored (dead)
    @vqmovun.s16 d22, q11

    vst1.16     {q1}, [r2], r3
    @sub        r2, r2, #8

    vst1.16     {q2}, [r2], r3

    vst1.16     {q3}, [r2], r3
    vst1.16     {q4}, [r2], r3
    vst1.16     {q5}, [r2], r3

    @vdup.8     q1, d0[2]
    vst1.16     {q6}, [r2], r3

    @vdup.8     q2, d0[1]
    vst1.16     {q7}, [r2], r3

    @vdup.8     q3, d0[0]
    @vst1.8     {q7}, [r2], r3

    @vdup.8     q4, d0[3]
    @vst1.8     {q8}, [r2], r3

    @vdup.8     q5, d0[2]
    @vst1.8     {q1}, [r2], r3

    @vdup.8     q6, d0[1]
    @vst1.8     {q2}, [r2], r3

    @vdup.8     q7, d0[0]
    @vst1.8     {q3}, [r2], r3

    @vst1.8     {q4}, [r2], r3
    @vst1.8     {q5}, [r2], r3
    @vst1.8     {q6}, [r2], r3
    @vst1.8     {q7}, [r2], r3

    vpop        {d8-d15}                @ restore callee-saved NEON regs
    ldmfd       sp!, {r4-r12, r15}      @ restore regs and return (pops pc)
    b           endloop                 @ unreachable (kept from original)


core_loop_4:
    ldrb        lr, [r12]               @ pu1_ref[two_nt]
    add         r12, r12, #1            @ pu1_ref[two_nt + 1]
    @vld1.8     {d30}, [r12]            @ pu1_ref[two_nt + 1 + col]

    sub         r12, r12, #9
    vld1.8      {d0}, [r12]
    sub         r12, r12, #8
    vld1.8      {d30}, [r12]
    vdup.16     d26, d0[3]
    vdup.8      d28, lr

    vdup.16     d3, d0[2]
    vmovl.u8    q13, d26

    vdup.16     d4, d0[1]
    vsubl.u8    q12, d30, d28

    vdup.16     d5, d0[0]
    vshr.s16    q12, q12, #1

    vdup.16     d6, d0[3]
    vqadd.s16   q11, q13, q12

    vdup.16     d7, d0[2]
    vqmovun.s16 d22, q11

    vst1.8      {d6}, [r2], r3
    vst1.8      {d3}, [r2], r3

    vdup.16     d8, d0[1]
    vst1.8      {d4}, [r2], r3
    vst1.8      {d5}, [r2], r3

    vdup.16     d9, d0[0]               @ d8/d9 (q4) clobbered: covered by vpush above
    @vst1.8     {d6}, [r2], r3
    @vst1.8     {d7}, [r2], r3

    @vst1.8     {d8}, [r2], r3
    @vst1.8     {d9}, [r2], r3
    vpop        {d8-d15}                @ restore callee-saved NEON regs
    ldmfd       sp!, {r4-r12, r15}      @ restore regs and return (pops pc)
    b           endloop                 @ unreachable (kept from original)


@ NOTE(review): the block below is unreachable legacy code (its label is
@ commented out and the path above always returns); kept to match the original.
@core_loop_4
    ldrb        lr, [r12]               @ pu1_ref[two_nt]
    add         r12, r12, #1            @ pu1_ref[two_nt + 1]
    vld1.8      {d30}, [r12]            @ pu1_ref[two_nt + 1 + col]

    sub         r12, r12, #5
    vld1.8      {d0}, [r12]
    vdup.8      d28, lr
    vdup.8      d26, d0[3]
    vmovl.u8    q13, d26

    vdup.8      d3, d0[2]
    vsubl.u8    q12, d30, d28

    vdup.8      d4, d0[1]
    vshr.s16    q12, q12, #1

    vdup.8      d5, d0[0]
    vqadd.s16   q11, q13, q12

    vqmovun.s16 d22, q11

    vst1.32     {d22[0]}, [r2], r3
    vst1.32     {d3[0]}, [r2], r3
    vst1.32     {d4[0]}, [r2], r3
    vst1.32     {d5[0]}, [r2], r3

    vpop        {d8-d15}                @ restore callee-saved NEON regs
    ldmfd       sp!, {r4-r12, r15}      @ restore regs and return (pops pc)

endloop: