1@/***************************************************************************** 2@* 3@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 4@* 5@* Licensed under the Apache License, Version 2.0 (the "License"); 6@* you may not use this file except in compliance with the License. 7@* You may obtain a copy of the License at: 8@* 9@* http://www.apache.org/licenses/LICENSE-2.0 10@* 11@* Unless required by applicable law or agreed to in writing, software 12@* distributed under the License is distributed on an "AS IS" BASIS, 13@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14@* See the License for the specific language governing permissions and 15@* limitations under the License. 16@* 17@*****************************************************************************/ 18@/** 19@******************************************************************************* 20@* @file 21@* ihevc_intra_pred_luma_mode2_neon.s 22@* 23@* @brief 24@* contains function definitions for intra prediction dc filtering. 
@*  functions are coded in arm neon assembly using gnu assembler syntax
@*
@* @author
@*  yogeswaran rs
@*
@* @par list of functions:
@*  - ihevc_intra_pred_chroma_mode2_a9q
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*  intra prediction, mode 2, for interleaved two-component (chroma) samples
@*
@* @par description:
@*  every destination row is a run of two-byte sample pairs copied from the
@*  reference array in reversed pair order.  row 0 starts at byte offset
@*  (4 * nt - 4) of pu1_ref and each following row starts one pair (two bytes)
@*  lower; within a row the source address decreases by one pair per
@*  destination pair.
@*
@*  nt == 4 is handled by a dedicated straight-line path (4 rows of 8 bytes).
@*  all other sizes are processed as tiles of 8 rows x 8 pairs (16 bytes).
@*  NOTE(review): the tile loop only terminates when nt is a multiple of 8
@*  (presumably 8 or 16 for chroma) - confirm against the callers.
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the reference samples
@*
@* @param[in] src_strd
@*  integer source stride (not read by this routine)
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] dst_strd
@*  integer destination stride, in bytes
@*
@* @param[in] nt
@*  size of transform block
@*
@* @param[in] mode
@*  prediction mode (not read by this routine)
@*
@* @returns
@*  none
@*
@* @remarks
@*  d8-d15 are callee-saved under the aapcs and are used as scratch here, so
@*  they are pushed on entry and popped on exit.
@*
@*******************************************************************************
@*/

@void ihevc_intra_pred_chroma_mode2_a9q(uword8 *pu1_ref,
@                                       word32 src_strd,
@                                       uword8 *pu1_dst,
@                                       word32 dst_strd,
@                                       word32 nt,
@                                       word32 mode)
@
@**************variables vs registers*****************************************
@r0  => *pu1_ref   (becomes the read pointer for even rows)
@r1  => src_strd   (overwritten; reused as the remaining-tile counter)
@r2  => *pu1_dst   (base address of the tile being stored)
@r3  => dst_strd
@r4  => nt
@r5  => 4 * dst_strd
@r6, r7, r9, r14 => write pointers for rows 0..3 (then 4..7) of a tile
@r8  => -4, post-increment applied to both read pointers
@r10 => read pointer for odd rows (r0 - 2)
@r11 => columns left in the current tile row, store side
@r12 => columns left in the current tile row, load side (one tile ahead)

@stack contents after the two register pushes (40 + 64 bytes):
@   [sp, #104]  nt
@   mode follows nt as per the prototype above; it is never loaded

.equ    nt_offset,      104                 @ 10 core regs (40) + d8-d15 (64)

.text
.align 4

.globl ihevc_intra_pred_chroma_mode2_a9q

.type ihevc_intra_pred_chroma_mode2_a9q, %function

ihevc_intra_pred_chroma_mode2_a9q:

    stmfd       sp!, {r4-r12, r14}          @save the core registers used below
    vpush       {d8-d15}                    @d8-d15 are callee-saved (aapcs)

    ldr         r4,[sp,#nt_offset]          @loads nt
    mov         r8,#-4                      @each read pointer steps back 2 pairs

    cmp         r4,#4
    beq         mode2_4

    add         r0,r0,r4,lsl #2             @r0 = pu1_ref + 4 * nt

    sub         r0,r0,#0x12                 @16-byte window ending at byte 4*nt - 3
    add         r10,r0,#-2                  @odd rows start one pair lower

@ load and reverse the first 8x8 tile; the scalar set-up is interleaved
@ with the neon work.
prologue_cpy_32:

    vld2.8      {d0,d1},[r0],r8             @row 0, split into the two components

    mov         r11,r4                      @store-side column counter = nt
    vrev64.8    d16,d0
    vrev64.8    d17,d1

    vld2.8      {d2,d3},[r10],r8            @row 1
    mov         r6, r2                      @row 0 write pointer

    vld2.8      {d4,d5},[r0],r8             @row 2
    vld2.8      {d6,d7},[r10],r8            @row 3
    lsr         r1, r4, #3                  @r1 = nt / 8

    vld2.8      {d8,d9},[r0],r8             @row 4
    vld2.8      {d10,d11},[r10],r8          @row 5
    vld2.8      {d12,d13},[r0],r8           @row 6
    mul         r1, r4, r1                  @r1 = nt * (nt / 8) = 8 * tile count

    vld2.8      {d14,d15},[r10],r8          @row 7
    add         r7,r6,r3                    @row 1 write pointer

    vrev64.8    d18,d2
    vrev64.8    d19,d3
    lsl         r5, r3, #2                  @r5 = 4 * dst_strd

    vrev64.8    d20,d4
    vrev64.8    d21,d5
    add         r9,r7,r3                    @row 2 write pointer

    vrev64.8    d22,d6
    vrev64.8    d23,d7

    vrev64.8    d24,d8
    vrev64.8    d25,d9

    vrev64.8    d26,d10
    subs        r1,r1,#8                    @one tile accounted for

    vrev64.8    d27,d11

    vrev64.8    d28,d12
    vrev64.8    d29,d13

    vrev64.8    d30,d14
    add         r14,r9,r3                   @row 3 write pointer
    vrev64.8    d31,d15

    beq         epilogue_mode2              @single tile: only the stores remain

    sub         r12,r4,#8                   @load-side column counter

@ each iteration stores the tile held in d16-d31 and loads/reverses the
@ next one.  the conditional instructions up to "suble r2" use the flags
@ from "subs r11"; those after "subs r12" use the flags from that one.
kernel_mode2:

    vst2.8      {d16,d17},[r6],r5           @row 0, r6 -> row 4
    vst2.8      {d18,d19},[r7],r5           @row 1, r7 -> row 5
    subs        r11,r11,#8
    vst2.8      {d20,d21},[r9],r5           @row 2, r9 -> row 6
    vst2.8      {d22,d23},[r14],r5          @row 3, r14 -> row 7
    vst2.8      {d24,d25},[r6],r5           @row 4, r6 -> first row of tile below
    addgt       r2,r2,#16                   @more columns: next tile to the right
    vst2.8      {d26,d27},[r7],r5           @row 5
    vst2.8      {d28,d29},[r9],r5           @row 6
    vst2.8      {d30,d31},[r14],r5          @row 7

    vld2.8      {d0,d1},[r0],r8
    movle       r11,r4                      @tile row finished: reload counter

    vld2.8      {d2,d3},[r10],r8
    vld2.8      {d4,d5},[r0],r8
    addle       r2, r2, r3, lsl #2          @result is replaced by the suble below
    vld2.8      {d6,d7},[r10],r8
    vrev64.8    d16,d0

    vld2.8      {d8,d9},[r0],r8
    vld2.8      {d10,d11},[r10],r8
    suble       r2, r6,#16                  @r6 is 8 rows down: step back 16 bytes
    vld2.8      {d12,d13},[r0],r8
    vrev64.8    d17,d1
    vld2.8      {d14,d15},[r10],r8

    subs        r12,r12,#8
    mov         r6, r2                      @row 0 write pointer of the next tile
    addle       r0, r0, r4,lsl #1           @load side finished a tile row:
    add         r7, r6, r3

    vrev64.8    d18,d2
    suble       r0, r0, #16                 @  net r0 += 2 * nt - 16
    vrev64.8    d19,d3

    vrev64.8    d20,d4
    movle       r12,r4
    vrev64.8    d21,d5

    vrev64.8    d22,d6
    add         r9, r7, r3
    vrev64.8    d23,d7

    vrev64.8    d24,d8
    add         r10,r0,#-2                  @odd-row pointer follows r0
    vrev64.8    d25,d9

    vrev64.8    d26,d10
    subs        r1, r1, #8                  @tiles left to load
    vrev64.8    d27,d11

    vrev64.8    d28,d12
    vrev64.8    d29,d13

    vrev64.8    d30,d14
    add         r14, r9, r3
    vrev64.8    d31,d15

    bne         kernel_mode2

@ store the last tile that was loaded.
epilogue_mode2:

    vst2.8      {d16,d17},[r6],r5
    vst2.8      {d18,d19},[r7],r5
    vst2.8      {d20,d21},[r9],r5
    vst2.8      {d22,d23},[r14],r5
    vst2.8      {d24,d25},[r6],r5
    vst2.8      {d26,d27},[r7],r5
    vst2.8      {d28,d29},[r9],r5
    vst2.8      {d30,d31},[r14],r5

    b           end_func

@ nt == 4: four rows of four pairs.  each vld2 reads 16 bytes but only the
@ first four pairs are used: vshl #32 followed by vrev64 leaves them
@ reversed in the low half, and vzip re-interleaves the two components.
mode2_4:

    lsl         r12,r4,#1
    add         r0,r0,r12
    sub         r0,r0,#2                    @r0 = pu1_ref + 2 * nt - 2

    vld2.8      {d12,d13},[r0],r8           @row 0 source
    vshl.i64    d0,d12,#32
    add         r10,r0,#2                   @r0 already stepped by -4: one pair
                                            @below the row 0 source
    vshl.i64    d1,d13,#32

    vrev64.8    d0,d0
    vld2.8      {d14,d15},[r10],r8          @row 1 source
    vshl.i64    d2,d14,#32

    vrev64.8    d1,d1
    vshl.i64    d3,d15,#32
    vzip.8      d0,d1
    vst1.8      {d0},[r2],r3                @row 0

    vrev64.8    d2,d2
    vld2.8      {d16,d17},[r0],r8           @row 2 source
    vshl.i64    d4,d16,#32
    vrev64.8    d3,d3
    vshl.i64    d5,d17,#32
    vzip.8      d2,d3
    vrev64.8    d4,d4
    vrev64.8    d5,d5
    vst1.8      {d2},[r2],r3                @row 1

    vld2.8      {d18,d19},[r10],r8          @row 3 source
    vshl.i64    d6,d18,#32

    vzip.8      d4,d5
    vshl.i64    d7,d19,#32
    vrev64.8    d6,d6
    vst1.8      {d4},[r2],r3                @row 2

    vrev64.8    d7,d7
    vzip.8      d6,d7
    vst1.8      {d6},[r2],r3                @row 3

end_func:
    vpop        {d8-d15}                    @restore the callee-saved neon regs
    ldmfd       sp!,{r4-r12,r15}            @restore core regs; pc <- saved lr