//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/

///*
////----------------------------------------------------------------------------
//// File Name            : impeg2_format_conv.s
////
//// Description          : This file has the YUV420P to YUV420SP format
////                        conversion implementations for the MPEG2 decoder
////                        on neon platform.
////
//// Reference Document   :
////
//// Revision History     :
////      Date            Author                Detail Description
////     ------------    ----------------    ----------------------------------
////     Jul 07, 2008     Naveen Kumar T        Created
////
////-------------------------------------------------------------------------
//*/

///*
//// ----------------------------------------------------------------------------
//// Include Files
//// ----------------------------------------------------------------------------
//*/
.set log2_16                    ,       4
.set log2_2                     ,       1

.text
.include "impeg2_neon_macros.s"
///*
//// ----------------------------------------------------------------------------
//// Struct/Union Types and Define
//// ----------------------------------------------------------------------------
//*/

///*
//// ----------------------------------------------------------------------------
//// Static Global Data section variables
//// ----------------------------------------------------------------------------
//*/
////--------------------------- NONE --------------------------------------------

///*
//// ----------------------------------------------------------------------------
//// Static Prototype Functions
//// ----------------------------------------------------------------------------
//*/
//// -------------------------- NONE --------------------------------------------

///*
//// ----------------------------------------------------------------------------
//// Exported functions
//// ----------------------------------------------------------------------------
//*/


///*****************************************************************************
//*
//*  Function Name    : impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8()
//*
//*  Description      : This function converts the image from YUV420P color
//*                     space to 420SP color space (UV interleaved).
//*                     The luma plane is copied row by row unless
//*                     convert_uv_only is 1; the U and V planes are always
//*                     interleaved into the destination chroma plane.
//*
//*  Arguments        : x0           pu1_y
//*                     x1           pu1_u
//*                     x2           pu1_v
//*                     x3           pu1_dest_y
//*                     x4           pu1_dest_uv
//*                     x5           u2_height
//*                     x6           u2_width
//*                     x7           u2_stridey
//*                     sp, #80      u2_strideu
//*                     sp, #88      u2_stridev
//*                     sp, #96      u2_dest_stride_y
//*                     sp, #104     u2_dest_stride_uv
//*                     sp, #112     convert_uv_only
//*                     (stack offsets are the ones used after the register
//*                      push at function entry)
//*
//*  Values Returned  : None
//*
//*  Register Usage   : x7, x8, x9, x10, x14, x16, x20, v0, v1
//*
//*  Stack Usage      : 80 Bytes
//*
//*  Interruptibility : Interruptible
//*
//*  Known Limitations
//*       Assumptions: Image Width:     Assumed to be greater than or equal
//*                                     to 16. A row whose width is not a
//*                                     multiple of 16 is finished with one
//*                                     extra copy that overlaps bytes already
//*                                     written.
//*                    Image Height:    Assumed to be even.
//*
//*  Revision History :
//*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
//*         07 06 2010   Varshita        Draft
//*         07 06 2010   Naveen Kr T     Completed
//*
//*****************************************************************************/
.global impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8
impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8:

    //// Register roles
    //  x0  - pu1_y        (luma source, post-incremented)
    //  x1  - pu1_u        (U source, post-incremented)
    //  x2  - pu1_v        (V source, post-incremented)
    //  x3  - pu1_dest_y   (luma destination, post-incremented)
    //  x4  - pu1_dest_uv  (interleaved chroma destination, post-incremented)
    //  x5  - u2_height, later the chroma row counter (height / 2)
    //  x6  - u2_width, later the chroma width (width / 2)
    //  x7  - luma source row increment, later U source row increment
    //  x8  - destination row increment (luma, later chroma)
    //  x9  - V source row increment
    //  x10 - luma row counter
    //  x14 - convert_uv_only
    //  x16 - bytes left in the current row
    //  x20 - scratch for the tail computation

    //// Save registers. The stack-argument offsets below assume this prologue
    //// moves sp down by 80 bytes (16 here plus whatever push_v_regs stores;
    //// push_v_regs is defined in impeg2_neon_macros.s - presumably 64 bytes).
    push_v_regs
    stp             x19, x20, [sp, #-16]!

    //// The height/width arrive as 32-bit values in w5/w6. The upper halves of
    //// the X registers are not guaranteed to be zero, so clear them before
    //// they are used in 64-bit arithmetic.
    uxtw            x5, w5                      //// u2_height
    uxtw            x6, w6                      //// u2_width

    ldr             w14, [sp, #112]             //// Load convert_uv_only

    cmp             w14, #1
    beq             yuv420sp_uv_chroma          //// Chroma only: skip luma copy

    ///* Do the preprocessing before the luma loops start */
    ldr             w8, [sp, #96]               //// Load u2_dest_stride_y (zero-extends into x8)
    uxtw            x7, w7                      //// u2_stridey

    sub             x7, x7, x6                  //// Source increment
    sub             x8, x8, x6                  //// Destination increment

    //// Count luma rows in a copy so that x5 still holds the image height
    //// when the chroma section derives its row count from it.
    mov             x10, x5

yuv420sp_uv_row_loop_y:
    mov             x16, x6                     //// Bytes left in this row

yuv420sp_uv_col_loop_y:
    prfm            pldl1keep, [x0, #128]
    ld1             {v0.8b, v1.8b}, [x0], #16
    st1             {v0.8b, v1.8b}, [x3], #16
    sub             x16, x16, #16
    cmp             x16, #15
    bgt             yuv420sp_uv_col_loop_y

    cbz             x16, yuv420sp_uv_row_loop__y
    ////Width is not a multiple of 16: step both pointers back by
    ////(16 - remaining) bytes so that one more full 16-byte copy ends exactly
    ////at the end of the row.
    ////Ex: if width is 162, the loop above processes 160 pixels; source and
    ////destination are moved back to pixel 146 and 16 bytes are copied.
    sub             x20, x16, #16
    neg             x16, x20                    //// x16 = 16 - remaining
    sub             x0, x0, x16
    sub             x3, x3, x16

    ld1             {v0.8b, v1.8b}, [x0], #16
    st1             {v0.8b, v1.8b}, [x3], #16

yuv420sp_uv_row_loop__y:
    add             x0, x0, x7                  //// Next source row
    add             x3, x3, x8                  //// Next destination row
    subs            x10, x10, #1
    bgt             yuv420sp_uv_row_loop_y

yuv420sp_uv_chroma:
    ldrsw           x7, [sp, #80]               //// Load u2_strideu from stack
    ldrsw           x9, [sp, #88]               //// Load u2_stridev from stack
    ldrsw           x8, [sp, #104]              //// Load u2_dest_stride_uv from stack

    sub             x7, x7, x6, lsr #1          //// U source increment
    sub             x9, x9, x6, lsr #1          //// V source increment
    sub             x8, x8, x6                  //// Destination increment

    lsr             x6, x6, #1                  //// Chroma width  = width / 2
    lsr             x5, x5, #1                  //// Chroma height = height / 2

yuv420sp_uv_row_loop_uv:
    mov             x16, x6                     //// Chroma samples left in this row

yuv420sp_uv_col_loop_uv:
    prfm            pldl1keep, [x1, #128]
    prfm            pldl1keep, [x2, #128]

    ld1             {v0.8b}, [x1], #8           //// 8 U samples
    ld1             {v1.8b}, [x2], #8           //// 8 V samples
    st2             {v0.8b, v1.8b}, [x4], #16   //// Store as U0 V0 U1 V1 ...

    sub             x16, x16, #8
    cmp             x16, #7
    bgt             yuv420sp_uv_col_loop_uv

    cbz             x16, yuv420sp_uv_row_loop__uv
    ////Chroma width is not a multiple of 8: step the U and V pointers back by
    ////(8 - remaining) samples, and the interleaved destination by twice
    ////that, so that one more 8-sample conversion ends exactly at the end of
    ////the row.
    sub             x20, x16, #8
    neg             x16, x20                    //// x16 = 8 - remaining
    sub             x1, x1, x16
    sub             x2, x2, x16
    sub             x4, x4, x16, lsl #1

    ld1             {v0.8b}, [x1], #8
    ld1             {v1.8b}, [x2], #8
    st2             {v0.8b, v1.8b}, [x4], #16

yuv420sp_uv_row_loop__uv:
    add             x1, x1, x7                  //// Next U row
    add             x2, x2, x9                  //// Next V row
    add             x4, x4, x8                  //// Next destination row
    subs            x5, x5, #1
    bgt             yuv420sp_uv_row_loop_uv

    ////Restore the registers saved at entry
    ldp             x19, x20, [sp], #16
    pop_v_regs
    ret





///*****************************************************************************
//*
//*  Function Name    : impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8()
//*
//*  Description      : This function converts the image from YUV420P color
//*                     space to 420SP color space (VU interleaved).
//*                     This function is similar to the function above,
//*                     impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8, with a
//*                     difference in the chroma loads (LD1) - the order of
//*                     the registers is different.
//*
//*  Arguments        : x0           pu1_y
//*                     x1           pu1_u
//*                     x2           pu1_v
//*                     x3           pu1_dest_y
//*                     x4           pu1_dest_uv
//*                     x5           u2_height
//*                     x6           u2_width
//*                     x7           u2_stridey
//*                     sp, #80      u2_strideu
//*                     sp, #88      u2_stridev
//*                     sp, #96      u2_dest_stride_y
//*                     sp, #104     u2_dest_stride_uv
//*                     sp, #112     convert_uv_only
//*
//*  Values Returned  : None
//*
//*  Register Usage   : x8, x14, x16, x20, v0, v1
//*
//*  Stack Usage      : 80 Bytes
//*
//*  Interruptibility : Interruptible
//*
//*  Known Limitations
//*       Assumptions: Image Width:     Assumed to be multiple of 16 and
//*                                     greater than or equal to 16
//*                    Image Height:    Assumed to be even.
//*
//*  Revision History :
//*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
//*         07 06 2010   Varshita        Draft
//*         07 06 2010   Naveen Kr T     Completed
//*
//*****************************************************************************/

.global impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8
impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8:

    //// Converts planar YUV420 to semi-planar YUV420 with the chroma bytes
    //// stored in V,U order. The luma plane is copied unless convert_uv_only
    //// is 1.
    ////
    //// Register roles
    //  x0  - pu1_y        (luma source, post-incremented)
    //  x1  - pu1_u        (U source, post-incremented)
    //  x2  - pu1_v        (V source, post-incremented)
    //  x3  - pu1_dest_y   (luma destination, post-incremented)
    //  x4  - pu1_dest_uv  (interleaved chroma destination, post-incremented)
    //  x5  - u2_height, later the chroma row counter (height / 2)
    //  x6  - u2_width, later the chroma width (width / 2)
    //  x7  - luma source row increment, later U source row increment
    //  x8  - destination row increment (luma, later chroma)
    //  x9  - V source row increment
    //  x10 - luma row counter
    //  x14 - convert_uv_only
    //  x16 - bytes left in the current row
    //  x20 - scratch for the tail computation
    //
    //  Stack arguments, at the offsets used after the register push below:
    //  u2_strideu         - sp, #80
    //  u2_stridev         - sp, #88
    //  u2_dest_stride_y   - sp, #96
    //  u2_dest_stride_uv  - sp, #104
    //  convert_uv_only    - sp, #112

    //// Save registers. The stack-argument offsets assume this prologue moves
    //// sp down by 80 bytes (16 here plus whatever push_v_regs stores;
    //// push_v_regs is defined in impeg2_neon_macros.s - presumably 64 bytes).
    push_v_regs
    stp             x19, x20, [sp, #-16]!

    //// The height/width arrive as 32-bit values in w5/w6. The upper halves of
    //// the X registers are not guaranteed to be zero, so clear them before
    //// they are used in 64-bit arithmetic.
    uxtw            x5, w5                      //// u2_height
    uxtw            x6, w6                      //// u2_width

    ldr             w14, [sp, #112]             //// Load convert_uv_only

    cmp             w14, #1
    beq             yuv420sp_vu_chroma          //// Chroma only: skip luma copy

    ///* Do the preprocessing before the luma loops start */
    ldr             w8, [sp, #96]               //// Load u2_dest_stride_y (zero-extends into x8)
    uxtw            x7, w7                      //// u2_stridey

    sub             x7, x7, x6                  //// Source increment
    sub             x8, x8, x6                  //// Destination increment

    //// Count luma rows in a copy so that x5 still holds the image height
    //// when the chroma section derives its row count from it.
    mov             x10, x5

yuv420sp_vu_row_loop_y:
    mov             x16, x6                     //// Bytes left in this row

yuv420sp_vu_col_loop_y:
    prfm            pldl1keep, [x0, #128]
    ld1             {v0.8b, v1.8b}, [x0], #16
    st1             {v0.8b, v1.8b}, [x3], #16
    sub             x16, x16, #16
    cmp             x16, #15
    bgt             yuv420sp_vu_col_loop_y

    cbz             x16, yuv420sp_vu_row_loop__y
    ////Width is not a multiple of 16: step both pointers back by
    ////(16 - remaining) bytes so that one more full 16-byte copy ends exactly
    ////at the end of the row.
    ////Ex: if width is 162, the loop above processes 160 pixels; source and
    ////destination are moved back to pixel 146 and 16 bytes are copied.
    sub             x20, x16, #16
    neg             x16, x20                    //// x16 = 16 - remaining
    sub             x0, x0, x16
    sub             x3, x3, x16

    ld1             {v0.8b, v1.8b}, [x0], #16
    st1             {v0.8b, v1.8b}, [x3], #16

yuv420sp_vu_row_loop__y:
    add             x0, x0, x7                  //// Next source row
    add             x3, x3, x8                  //// Next destination row
    subs            x10, x10, #1
    bgt             yuv420sp_vu_row_loop_y

yuv420sp_vu_chroma:
    ldrsw           x7, [sp, #80]               //// Load u2_strideu from stack
    ldrsw           x9, [sp, #88]               //// Load u2_stridev from stack
    ldrsw           x8, [sp, #104]              //// Load u2_dest_stride_uv from stack

    sub             x7, x7, x6, lsr #1          //// U source increment
    sub             x9, x9, x6, lsr #1          //// V source increment
    sub             x8, x8, x6                  //// Destination increment

    lsr             x6, x6, #1                  //// Chroma width  = width / 2
    lsr             x5, x5, #1                  //// Chroma height = height / 2

yuv420sp_vu_row_loop_uv:
    mov             x16, x6                     //// Chroma samples left in this row

yuv420sp_vu_col_loop_uv:
    prfm            pldl1keep, [x1, #128]
    prfm            pldl1keep, [x2, #128]
    ld1             {v1.8b}, [x1], #8           //// 8 U samples into the second lane set
    ld1             {v0.8b}, [x2], #8           //// 8 V samples into the first lane set
    st2             {v0.8b, v1.8b}, [x4], #16   //// Store as V0 U0 V1 U1 ...
    sub             x16, x16, #8
    cmp             x16, #7
    bgt             yuv420sp_vu_col_loop_uv

    cbz             x16, yuv420sp_vu_row_loop__uv
    ////Chroma width is not a multiple of 8: step the U and V pointers back by
    ////(8 - remaining) samples, and the interleaved destination by twice
    ////that, so that one more 8-sample conversion ends exactly at the end of
    ////the row.
    sub             x20, x16, #8
    neg             x16, x20                    //// x16 = 8 - remaining
    sub             x1, x1, x16
    sub             x2, x2, x16
    sub             x4, x4, x16, lsl #1

    ld1             {v1.8b}, [x1], #8
    ld1             {v0.8b}, [x2], #8
    st2             {v0.8b, v1.8b}, [x4], #16

yuv420sp_vu_row_loop__uv:
    add             x1, x1, x7                  //// Next U row
    add             x2, x2, x9                  //// Next V row
    add             x4, x4, x8                  //// Next destination row
    subs            x5, x5, #1
    bgt             yuv420sp_vu_row_loop_uv

    ////Restore the registers saved at entry
    ldp             x19, x20, [sp], #16
    pop_v_regs
    ret
