///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_intra_pred_chroma_horz_neon.s
//*
//* @brief
//*  contains function definition for intra prediction interpolation filters
//*
//*
//* @author
//*  parthiban v
//*
//* @par list of functions:
//*  - ihevc_intra_pred_chroma_horz()
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
//
///**
//*******************************************************************************
//*
//* @brief
//*  intra prediction interpolation filter for horizontal chroma.
//*
//* @par description:
//*  horizontal intra prediction (mode 10) with reference samples location
//*  pointed by 'pu1_ref' to the tu block location pointed by 'pu1_dst'. refer
//*  to section 8.4.4.2.6 in the standard (special case)
//*
//*  every output row is filled with one 2-byte sample pair taken from the
//*  reference array: row r replicates the pair at pu1_ref[4*nt - 2 - 2*r].
//*  (presumably each pair is one interleaved U/V sample - confirm with caller)
//*
//* @param[in] pu1_ref
//*  uword8 pointer to the reference samples
//*
//* @param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride (not read by this routine)
//*
//* @param[in] dst_strd
//*  integer destination stride, in bytes
//*
//* @param[in] nt
//*  integer transform block size; 4 and 8 have dedicated paths, every other
//*  value takes the 16-wide path
//*
//* @param[in] mode
//*  integer intraprediction mode (not read by this routine)
//*
//* @returns
//*
//* @remarks
//*  leaf routine; touches only caller-saved registers
//*  (x3, x4, x9, x12, v0-v7, v16, v18) and does not use the stack
//*
//*******************************************************************************
//*/
//void ihevc_intra_pred_chroma_horz(uword8 *pu1_ref,
//                                  word32 src_strd,
//                                  uword8 *pu1_dst,
//                                  word32 dst_strd,
//                                  word32 nt,
//                                  word32 mode)
//**************variables vs registers*****************************************
//x0 => *pu1_ref
//x1 => src_strd (unused)
//x2 => *pu1_dst
//x3 => dst_strd
//x4 => nt
//x5 => mode (unused)

.text
.align 4
.include "ihevc_neon_macros.s"


.globl ihevc_intra_pred_chroma_horz_av8

.type ihevc_intra_pred_chroma_horz_av8, %function

ihevc_intra_pred_chroma_horz_av8:

    // dst_strd and nt are 32-bit C arguments; widen them before any 64-bit
    // use so stale upper register bits cannot corrupt the address
    // arithmetic or the block-size dispatch.
    sxtw        x3, w3                      // dst_strd
    sxtw        x4, w4                      // nt

    add         x12, x0, x4, lsl #2         // x12 = pu1_ref + 4 * nt

    cmp         x4, #4                      // if nt == 4
    beq         core_loop_4

    cmp         x4, #8                      // if nt == 8
    beq         core_loop_8

    sub         x12, x12, #16               // x12 -> pairs for rows 0..7
    add         x9, x2, #16                 // x9  -> second 16 bytes of each row

    // 16-wide path: each pass writes 16 rows of 32 bytes.
    // x2 writes bytes 0..15 of a row, x9 writes bytes 16..31.
core_loop_16:
    ld1         {v0.8h}, [x12]              // pairs for rows 0..7  (row 0 in lane 7)
    sub         x12, x12, #16
    ld1         {v18.8h}, [x12]             // pairs for rows 8..15 (row 8 in lane 7)
    sub         x12, x12, #16               // step back for a possible next pass

    // rows 0..3
    dup         v1.8h, v0.h[7]
    dup         v2.8h, v0.h[6]
    dup         v3.8h, v0.h[5]
    dup         v4.8h, v0.h[4]
    st1         {v1.8h}, [x2], x3
    st1         {v1.8h}, [x9], x3
    st1         {v2.8h}, [x2], x3
    st1         {v2.8h}, [x9], x3
    st1         {v3.8h}, [x2], x3
    st1         {v3.8h}, [x9], x3
    st1         {v4.8h}, [x2], x3
    st1         {v4.8h}, [x9], x3

    // rows 4..7
    dup         v5.8h, v0.h[3]
    dup         v6.8h, v0.h[2]
    dup         v7.8h, v0.h[1]
    dup         v16.8h, v0.h[0]
    st1         {v5.8h}, [x2], x3
    st1         {v5.8h}, [x9], x3
    st1         {v6.8h}, [x2], x3
    st1         {v6.8h}, [x9], x3
    st1         {v7.8h}, [x2], x3
    st1         {v7.8h}, [x9], x3
    st1         {v16.8h}, [x2], x3
    st1         {v16.8h}, [x9], x3

    // rows 8..11
    dup         v1.8h, v18.h[7]
    dup         v2.8h, v18.h[6]
    dup         v3.8h, v18.h[5]
    dup         v4.8h, v18.h[4]
    st1         {v1.8h}, [x2], x3
    st1         {v1.8h}, [x9], x3
    st1         {v2.8h}, [x2], x3
    st1         {v2.8h}, [x9], x3
    st1         {v3.8h}, [x2], x3
    st1         {v3.8h}, [x9], x3
    st1         {v4.8h}, [x2], x3
    st1         {v4.8h}, [x9], x3

    // rows 12..15
    dup         v5.8h, v18.h[3]
    dup         v6.8h, v18.h[2]
    dup         v7.8h, v18.h[1]
    dup         v16.8h, v18.h[0]
    st1         {v5.8h}, [x2], x3
    st1         {v5.8h}, [x9], x3
    st1         {v6.8h}, [x2], x3
    st1         {v6.8h}, [x9], x3
    st1         {v7.8h}, [x2], x3
    st1         {v7.8h}, [x9], x3
    st1         {v16.8h}, [x2], x3
    st1         {v16.8h}, [x9], x3

    subs        x4, x4, #16                 // 16 rows done
    bgt         core_loop_16

    ret


    // 8-wide path: 8 rows of 16 bytes.
core_loop_8:
    sub         x12, x12, #16               // x12 -> pairs for rows 0..7
    ld1         {v0.16b}, [x12]             // row 0 pair ends up in halfword lane 7

    dup         v1.8h, v0.h[7]
    dup         v2.8h, v0.h[6]
    dup         v3.8h, v0.h[5]
    dup         v4.8h, v0.h[4]
    dup         v5.8h, v0.h[3]
    dup         v6.8h, v0.h[2]
    dup         v7.8h, v0.h[1]
    dup         v16.8h, v0.h[0]

    st1         {v1.8h}, [x2], x3
    st1         {v2.8h}, [x2], x3
    st1         {v3.8h}, [x2], x3
    st1         {v4.8h}, [x2], x3
    st1         {v5.8h}, [x2], x3
    st1         {v6.8h}, [x2], x3
    st1         {v7.8h}, [x2], x3
    st1         {v16.8h}, [x2], x3

    ret


    // 4-wide path: 4 rows of 8 bytes.
core_loop_4:
    sub         x12, x12, #8                // x12 -> pairs for rows 0..3
    ld1         {v0.8b}, [x12]              // row 0 pair ends up in halfword lane 3

    dup         v1.4h, v0.h[3]
    dup         v2.4h, v0.h[2]
    dup         v3.4h, v0.h[1]
    dup         v4.4h, v0.h[0]

    st1         {v1.8b}, [x2], x3
    st1         {v2.8b}, [x2], x3
    st1         {v3.8b}, [x2], x3
    st1         {v4.8b}, [x2], x3

    ret