/*
 * ARMv8 NEON optimizations for libjpeg-turbo
 *
 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
 * All rights reserved.
 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
 * Copyright (C) 2013-2014, Linaro Limited
 * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
 * Copyright (C) 2014, D. R. Commander. All rights reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
#endif

.text


#define RESPECT_STRICT_ALIGNMENT 1


/*****************************************************************************/

/* Supplementary macro for setting function attributes */
.macro asm_function fname
#ifdef __APPLE__
    .globl _\fname
_\fname:
#else
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm

/* Transpose elements of a single 128-bit register */
.macro transpose_single x0, x1, xi, xilen, literal
    ins  \xi\xilen[0], \x0\xilen[0]
    ins  \x1\xilen[0], \x0\xilen[1]
    trn1 \x0\literal, \x0\literal, \x1\literal
    trn2 \x1\literal, \xi\literal, \x1\literal
.endm

/* Transpose elements of 2 different registers */
.macro transpose x0, x1, xi, xilen, literal
    mov  \xi\xilen, \x0\xilen
    trn1 \x0\literal, \x0\literal, \x1\literal
    trn2 \x1\literal, \xi\literal, \x1\literal
.endm

/* Transpose a block of 4x4 coefficients in four 64-bit registers */
.macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
    mov  \xi\xilen, \x0\xilen
    trn1 \x0\x0len, \x0\x0len, \x2\x2len
    trn2 \x2\x2len, \xi\x0len, \x2\x2len
    mov  \xi\xilen, \x1\xilen
    trn1 \x1\x1len, \x1\x1len, \x3\x3len
    trn2 \x3\x3len, \xi\x1len, \x3\x3len
.endm

.macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
    mov  \xi\xilen, \x0\xilen
    trn1 \x0\x0len, \x0\x0len, \x1\x1len
    trn2 \x1\x2len, \xi\x0len, \x1\x2len
    mov  \xi\xilen, \x2\xilen
    trn1 \x2\x2len, \x2\x2len, \x3\x3len
    trn2 \x3\x2len, \xi\x1len, \x3\x3len
.endm

.macro transpose_4x4 x0, x1, x2, x3, x5
    transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b
    transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b
.endm
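
/*
 * Note (added for illustration): transpose_4x4 amounts to a plain 4x4
 * transpose of 16-bit elements, built from two trn1/trn2 passes (first on
 * 16-bit lanes, then on 32-bit lane pairs). A minimal C sketch of the
 * intended result ('m' is a hypothetical 4x4 coefficient tile, not a name
 * from this file):
 *
 *   void transpose_4x4_ref(int16_t m[4][4]) {
 *       for (int i = 0; i < 4; i++)
 *           for (int j = i + 1; j < 4; j++) {
 *               int16_t t = m[i][j];
 *               m[i][j] = m[j][i];
 *               m[j][i] = t;
 *           }
 *   }
 */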

#define CENTERJSAMPLE 128

/*****************************************************************************/

/*
 * Perform dequantization and inverse DCT on one block of coefficients.
 *
 * GLOBAL(void)
 * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block,
 *                        JSAMPARRAY output_buf, JDIMENSION output_col)
 */

#define FIX_0_298631336 (2446)
#define FIX_0_390180644 (3196)
#define FIX_0_541196100 (4433)
#define FIX_0_765366865 (6270)
#define FIX_0_899976223 (7373)
#define FIX_1_175875602 (9633)
#define FIX_1_501321110 (12299)
#define FIX_1_847759065 (15137)
#define FIX_1_961570560 (16069)
#define FIX_2_053119869 (16819)
#define FIX_2_562915447 (20995)
#define FIX_3_072711026 (25172)

#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
#define FIX_0_541196100_PLUS_0_765366865  (FIX_0_541196100 + FIX_0_765366865)

/*
 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
 * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
 */
#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \
{ \
  DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
  INT32 q1, q2, q3, q4, q5, q6, q7; \
  INT32 tmp11_plus_tmp2, tmp11_minus_tmp2; \
  \
  /* 1-D iDCT input data */ \
  row0 = xrow0; \
  row1 = xrow1; \
  row2 = xrow2; \
  row3 = xrow3; \
  row4 = xrow4; \
  row5 = xrow5; \
  row6 = xrow6; \
  row7 = xrow7; \
  \
  q5 = row7 + row3; \
  q4 = row5 + row1; \
  q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
       MULTIPLY(q4, FIX_1_175875602); \
  q7 = MULTIPLY(q5, FIX_1_175875602) + \
       MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
  q2 = MULTIPLY(row2, FIX_0_541196100) + \
       MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
  q4 = q6; \
  q3 = ((INT32) row0 - (INT32) row4) << 13; \
  q6 += MULTIPLY(row5, -FIX_2_562915447) + \
        MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
  /* now we can use q1 (reloadable constants have been used up) */ \
  q1 = q3 + q2; \
  q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
        MULTIPLY(row1, -FIX_0_899976223); \
  q5 = q7; \
  q1 = q1 + q6; \
  q7 += MULTIPLY(row7, -FIX_0_899976223) + \
        MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
  \
  /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
  tmp11_plus_tmp2 = q1; \
  row1 = 0; \
  \
  q1 = q1 - q6; \
  q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
        MULTIPLY(row3, -FIX_2_562915447); \
  q1 = q1 - q6; \
  q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
       MULTIPLY(row6, FIX_0_541196100); \
  q3 = q3 - q2; \
  \
  /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
  tmp11_minus_tmp2 = q1; \
  \
  q1 = ((INT32) row0 + (INT32) row4) << 13; \
  q2 = q1 + q6; \
  q1 = q1 - q6; \
  \
  /* pick up the results */ \
  tmp0  = q4; \
  tmp1  = q5; \
  tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
  tmp3  = q7; \
  tmp10 = q2; \
  tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
  tmp12 = q3; \
  tmp13 = q1; \
}
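
/*
 * Note (added): each FIX_x_yyy value above is the real coefficient scaled
 * by 2^13 and rounded, i.e. FIX(x) = round(x * 8192). For example:
 *
 *   FIX_0_541196100 = round(0.541196100 * 8192) = 4433
 *   FIX_1_175875602 = round(1.175875602 * 8192) = 9633
 *
 * which is why the even part of the 1-D transform shifts (row0 +/- row4)
 * left by 13 before mixing it with the multiplied terms.
 */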

#define XFIX_0_899976223                    v0.h[0]
#define XFIX_0_541196100                    v0.h[1]
#define XFIX_2_562915447                    v0.h[2]
#define XFIX_0_298631336_MINUS_0_899976223  v0.h[3]
#define XFIX_1_501321110_MINUS_0_899976223  v1.h[0]
#define XFIX_2_053119869_MINUS_2_562915447  v1.h[1]
#define XFIX_0_541196100_PLUS_0_765366865   v1.h[2]
#define XFIX_1_175875602                    v1.h[3]
#define XFIX_1_175875602_MINUS_0_390180644  v2.h[0]
#define XFIX_0_541196100_MINUS_1_847759065  v2.h[1]
#define XFIX_3_072711026_MINUS_2_562915447  v2.h[2]
#define XFIX_1_175875602_MINUS_1_961570560  v2.h[3]

.balign 16
Ljsimd_idct_islow_neon_consts:
    .short FIX_0_899976223                    /* d0[0] */
    .short FIX_0_541196100                    /* d0[1] */
    .short FIX_2_562915447                    /* d0[2] */
    .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
    .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
    .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
    .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
    .short FIX_1_175875602                    /* d1[3] */
    /* reloadable constants */
    .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
    .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
    .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
    .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */

asm_function jsimd_idct_islow_neon

    DCT_TABLE  .req x0
    COEF_BLOCK .req x1
    OUTPUT_BUF .req x2
    OUTPUT_COL .req x3
    TMP1       .req x0
    TMP2       .req x1
    TMP3       .req x2
    TMP4       .req x15

    ROW0L      .req v16
    ROW0R      .req v17
    ROW1L      .req v18
    ROW1R      .req v19
    ROW2L      .req v20
    ROW2R      .req v21
    ROW3L      .req v22
    ROW3R      .req v23
    ROW4L      .req v24
    ROW4R      .req v25
    ROW5L      .req v26
    ROW5R      .req v27
    ROW6L      .req v28
    ROW6R      .req v29
    ROW7L      .req v30
    ROW7R      .req v31

    /* Save all NEON registers and x15 (32 NEON registers * 8 bytes + 16) */
    sub sp, sp, 272
    str x15, [sp], 16
    adr x15, Ljsimd_idct_islow_neon_consts
    st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
    st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
    st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
    st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
    st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
    st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
    ld1 {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32
    ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
    ld1 {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32
    mul v16.4h, v16.4h, v0.4h
    mul v17.4h, v17.4h, v1.4h
    ins v16.d[1], v17.d[0]  /* 128 bit q8 */
    ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
    mul v18.4h, v18.4h, v2.4h
    mul v19.4h, v19.4h, v3.4h
    ins v18.d[1], v19.d[0]  /* 128 bit q9 */
    ld1 {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32
    mul v20.4h, v20.4h, v4.4h
    mul v21.4h, v21.4h, v5.4h
    ins v20.d[1], v21.d[0]  /* 128 bit q10 */
    ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
    mul v22.4h, v22.4h, v6.4h
    mul v23.4h, v23.4h, v7.4h
    ins v22.d[1], v23.d[0]  /* 128 bit q11 */
    ld1 {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK]
    mul v24.4h, v24.4h, v0.4h
    mul v25.4h, v25.4h, v1.4h
    ins v24.d[1], v25.d[0]  /* 128 bit q12 */
    ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
    mul v28.4h, v28.4h, v4.4h
    mul v29.4h, v29.4h, v5.4h
    ins v28.d[1], v29.d[0]  /* 128 bit q14 */
    mul v26.4h, v26.4h, v2.4h
    mul v27.4h, v27.4h, v3.4h
    ins v26.d[1], v27.d[0]  /* 128 bit q13 */
    ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x15]  /* load constants */
    add x15, x15, #16
    mul v30.4h, v30.4h, v6.4h
    mul v31.4h, v31.4h, v7.4h
    ins v30.d[1], v31.d[0]  /* 128 bit q15 */
    /* Go to the bottom of the stack */
    sub sp, sp, 352
    stp x4, x5, [sp], 16
    st1 {v8.4h, v9.4h, v10.4h, v11.4h}, [sp], 32  /* save NEON registers */
    st1 {v12.4h, v13.4h, v14.4h, v15.4h}, [sp], 32
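
    /*
     * Note (added): pass 1 below interleaves scalar loads of the right
     * 4x8 half's coefficients (the ldp/orr pairs) with the NEON arithmetic
     * for the left half. The ORed result accumulated in x0 lets the code
     * branch to a cheaper path when rows 1-7 of the right half are
     * entirely zero, without a separate scan over the coefficient block.
     */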
    /* 1-D IDCT, pass 1, left 4x8 half */
    add   v4.4h, ROW7L.4h, ROW3L.4h
    add   v5.4h, ROW5L.4h, ROW1L.4h
    smull v12.4s, v4.4h, XFIX_1_175875602_MINUS_1_961570560
    smlal v12.4s, v5.4h, XFIX_1_175875602
    smull v14.4s, v4.4h, XFIX_1_175875602
    /* Check for the zero coefficients in the right 4x8 half */
    smlal v14.4s, v5.4h, XFIX_1_175875602_MINUS_0_390180644
    ssubl v6.4s, ROW0L.4h, ROW4L.4h
    ldp   w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
    smull v4.4s, ROW2L.4h, XFIX_0_541196100
    smlal v4.4s, ROW6L.4h, XFIX_0_541196100_MINUS_1_847759065
    orr   x0, x4, x5
    mov   v8.16b, v12.16b
    smlsl v12.4s, ROW5L.4h, XFIX_2_562915447
    ldp   w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
    smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
    shl   v6.4s, v6.4s, #13
    orr   x0, x0, x4
    smlsl v8.4s, ROW1L.4h, XFIX_0_899976223
    orr   x0, x0, x5
    add   v2.4s, v6.4s, v4.4s
    ldp   w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
    mov   v10.16b, v14.16b
    add   v2.4s, v2.4s, v12.4s
    orr   x0, x0, x4
    smlsl v14.4s, ROW7L.4h, XFIX_0_899976223
    orr   x0, x0, x5
    smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
    rshrn ROW1L.4h, v2.4s, #11
    ldp   w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
    sub   v2.4s, v2.4s, v12.4s
    smlal v10.4s, ROW5L.4h, XFIX_2_053119869_MINUS_2_562915447
    orr   x0, x0, x4
    smlsl v10.4s, ROW3L.4h, XFIX_2_562915447
    orr   x0, x0, x5
    sub   v2.4s, v2.4s, v12.4s
    smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
    ldp   w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
    smlal v12.4s, ROW6L.4h, XFIX_0_541196100
    sub   v6.4s, v6.4s, v4.4s
    orr   x0, x0, x4
    rshrn ROW6L.4h, v2.4s, #11
    orr   x0, x0, x5
    add   v2.4s, v6.4s, v10.4s
    ldp   w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
    sub   v6.4s, v6.4s, v10.4s
    saddl v10.4s, ROW0L.4h, ROW4L.4h
    orr   x0, x0, x4
    rshrn ROW2L.4h, v2.4s, #11
    orr   x0, x0, x5
    rshrn ROW5L.4h, v6.4s, #11
    ldp   w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
    shl   v10.4s, v10.4s, #13
    smlal v8.4s, ROW7L.4h, XFIX_0_298631336_MINUS_0_899976223
    orr   x0, x0, x4
    add   v4.4s, v10.4s, v12.4s
    orr   x0, x0, x5
    cmp   x0, #0  /* orrs instruction removed */
    sub   v2.4s, v10.4s, v12.4s
    add   v12.4s, v4.4s, v14.4s
    ldp   w4, w5, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
    sub   v4.4s, v4.4s, v14.4s
    add   v10.4s, v2.4s, v8.4s
    orr   x0, x4, x5
    sub   v6.4s, v2.4s, v8.4s
    /* pop {x4, x5} */
    sub   sp, sp, 80
    ldp   x4, x5, [sp], 16
    rshrn ROW7L.4h, v4.4s, #11
    rshrn ROW3L.4h, v10.4s, #11
    rshrn ROW0L.4h, v12.4s, #11
    rshrn ROW4L.4h, v6.4s, #11

    b.eq  3f  /* Go to do some special handling for the sparse right 4x8 half */

    /* 1-D IDCT, pass 1, right 4x8 half */
    ld1   {v2.4h}, [x15]  /* reload constants */
    add   v10.4h, ROW7R.4h, ROW3R.4h
    add   v8.4h, ROW5R.4h, ROW1R.4h
    /* Transpose ROW6L <-> ROW7L (v3 available free register) */
    transpose ROW6L, ROW7L, v3, .16b, .4h
    smull v12.4s, v10.4h, XFIX_1_175875602_MINUS_1_961570560
    smlal v12.4s, v8.4h, XFIX_1_175875602
    /* Transpose ROW2L <-> ROW3L (v3 available free register) */
    transpose ROW2L, ROW3L, v3, .16b, .4h
    smull v14.4s, v10.4h, XFIX_1_175875602
    smlal v14.4s, v8.4h, XFIX_1_175875602_MINUS_0_390180644
    /* Transpose ROW0L <-> ROW1L (v3 available free register) */
    transpose ROW0L, ROW1L, v3, .16b, .4h
    ssubl v6.4s, ROW0R.4h, ROW4R.4h
    smull v4.4s, ROW2R.4h, XFIX_0_541196100
    smlal v4.4s, ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065
    /* Transpose ROW4L <-> ROW5L (v3 available free register) */
    transpose ROW4L, ROW5L, v3, .16b, .4h
    mov   v8.16b, v12.16b
    smlsl v12.4s, ROW5R.4h, XFIX_2_562915447
    smlal v12.4s, ROW3R.4h, XFIX_3_072711026_MINUS_2_562915447
    /* Transpose ROW1L <-> ROW3L (v3 available free register) */
    transpose ROW1L, ROW3L, v3, .16b, .2s
    shl   v6.4s, v6.4s, #13
    smlsl v8.4s, ROW1R.4h, XFIX_0_899976223
    /* Transpose ROW4L <-> ROW6L (v3 available free register) */
    transpose ROW4L, ROW6L, v3, .16b, .2s
    add   v2.4s, v6.4s, v4.4s
    mov   v10.16b, v14.16b
    add   v2.4s, v2.4s, v12.4s
    /* Transpose ROW0L <-> ROW2L (v3 available free register) */
    transpose ROW0L, ROW2L, v3, .16b, .2s
    smlsl v14.4s, ROW7R.4h, XFIX_0_899976223
    smlal v14.4s, ROW1R.4h, XFIX_1_501321110_MINUS_0_899976223
    rshrn ROW1R.4h, v2.4s, #11
    /* Transpose ROW5L <-> ROW7L (v3 available free register) */
    transpose ROW5L, ROW7L, v3, .16b, .2s
    sub   v2.4s, v2.4s, v12.4s
    smlal v10.4s, ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447
    smlsl v10.4s, ROW3R.4h, XFIX_2_562915447
    sub   v2.4s, v2.4s, v12.4s
    smull v12.4s, ROW2R.4h, XFIX_0_541196100_PLUS_0_765366865
    smlal v12.4s, ROW6R.4h, XFIX_0_541196100
    sub   v6.4s, v6.4s, v4.4s
    rshrn ROW6R.4h, v2.4s, #11
    add   v2.4s, v6.4s, v10.4s
    sub   v6.4s, v6.4s, v10.4s
    saddl v10.4s, ROW0R.4h, ROW4R.4h
    rshrn ROW2R.4h, v2.4s, #11
    rshrn ROW5R.4h, v6.4s, #11
    shl   v10.4s, v10.4s, #13
    smlal v8.4s, ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223
    add   v4.4s, v10.4s, v12.4s
    sub   v2.4s, v10.4s, v12.4s
    add   v12.4s, v4.4s, v14.4s
    sub   v4.4s, v4.4s, v14.4s
    add   v10.4s, v2.4s, v8.4s
    sub   v6.4s, v2.4s, v8.4s
    rshrn ROW7R.4h, v4.4s, #11
    rshrn ROW3R.4h, v10.4s, #11
    rshrn ROW0R.4h, v12.4s, #11
    rshrn ROW4R.4h, v6.4s, #11
    /* Transpose right 4x8 half */
    transpose ROW6R, ROW7R, v3, .16b, .4h
    transpose ROW2R, ROW3R, v3, .16b, .4h
    transpose ROW0R, ROW1R, v3, .16b, .4h
    transpose ROW4R, ROW5R, v3, .16b, .4h
    transpose ROW1R, ROW3R, v3, .16b, .2s
    transpose ROW4R, ROW6R, v3, .16b, .2s
    transpose ROW0R, ROW2R, v3, .16b, .2s
    transpose ROW5R, ROW7R, v3, .16b, .2s

1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
    ld1   {v2.4h}, [x15]  /* reload constants */
    smull v12.4s, ROW1R.4h, XFIX_1_175875602  /* ROW5L.4h <-> ROW1R.4h */
    smlal v12.4s, ROW1L.4h, XFIX_1_175875602
    smlal v12.4s, ROW3R.4h, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L.4h <-> ROW3R.4h */
    smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560
    smull v14.4s, ROW3R.4h, XFIX_1_175875602  /* ROW7L.4h <-> ROW3R.4h */
    smlal v14.4s, ROW3L.4h, XFIX_1_175875602
    smlal v14.4s, ROW1R.4h, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L.4h <-> ROW1R.4h */
    smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644
    ssubl v6.4s, ROW0L.4h, ROW0R.4h  /* ROW4L.4h <-> ROW0R.4h */
    smull v4.4s, ROW2L.4h, XFIX_0_541196100
    smlal v4.4s, ROW2R.4h, XFIX_0_541196100_MINUS_1_847759065  /* ROW6L.4h <-> ROW2R.4h */
    mov   v8.16b, v12.16b
    smlsl v12.4s, ROW1R.4h, XFIX_2_562915447  /* ROW5L.4h <-> ROW1R.4h */
    smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
    shl   v6.4s, v6.4s, #13
    smlsl v8.4s, ROW1L.4h, XFIX_0_899976223
    add   v2.4s, v6.4s, v4.4s
    mov   v10.16b, v14.16b
    add   v2.4s, v2.4s, v12.4s
    smlsl v14.4s, ROW3R.4h, XFIX_0_899976223  /* ROW7L.4h <-> ROW3R.4h */
    smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
    shrn  ROW1L.4h, v2.4s, #16
    sub   v2.4s, v2.4s, v12.4s
    smlal v10.4s, ROW1R.4h, XFIX_2_053119869_MINUS_2_562915447  /* ROW5L.4h <-> ROW1R.4h */
    smlsl v10.4s, ROW3L.4h, XFIX_2_562915447
    sub   v2.4s, v2.4s, v12.4s
    smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
    smlal v12.4s, ROW2R.4h, XFIX_0_541196100  /* ROW6L.4h <-> ROW2R.4h */
    sub   v6.4s, v6.4s, v4.4s
    shrn  ROW2R.4h, v2.4s, #16  /* ROW6L.4h <-> ROW2R.4h */
    add   v2.4s, v6.4s, v10.4s
    sub   v6.4s, v6.4s, v10.4s
    saddl v10.4s, ROW0L.4h, ROW0R.4h  /* ROW4L.4h <-> ROW0R.4h */
    shrn  ROW2L.4h, v2.4s, #16
    shrn  ROW1R.4h, v6.4s, #16  /* ROW5L.4h <-> ROW1R.4h */
    shl   v10.4s, v10.4s, #13
    smlal v8.4s, ROW3R.4h, XFIX_0_298631336_MINUS_0_899976223  /* ROW7L.4h <-> ROW3R.4h */
    add   v4.4s, v10.4s, v12.4s
    sub   v2.4s, v10.4s, v12.4s
    add   v12.4s, v4.4s, v14.4s
    sub   v4.4s, v4.4s, v14.4s
    add   v10.4s, v2.4s, v8.4s
    sub   v6.4s, v2.4s, v8.4s
    shrn  ROW3R.4h, v4.4s, #16  /* ROW7L.4h <-> ROW3R.4h */
    shrn  ROW3L.4h, v10.4s, #16
    shrn  ROW0L.4h, v12.4s, #16
    shrn  ROW0R.4h, v6.4s, #16  /* ROW4L.4h <-> ROW0R.4h */
    /* 1-D IDCT, pass 2, right 4x8 half */
    ld1   {v2.4h}, [x15]  /* reload constants */
    smull v12.4s, ROW5R.4h, XFIX_1_175875602
    smlal v12.4s, ROW5L.4h, XFIX_1_175875602  /* ROW5L.4h <-> ROW1R.4h */
    smlal v12.4s, ROW7R.4h, XFIX_1_175875602_MINUS_1_961570560
    smlal v12.4s, ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L.4h <-> ROW3R.4h */
    smull v14.4s, ROW7R.4h, XFIX_1_175875602
    smlal v14.4s, ROW7L.4h, XFIX_1_175875602  /* ROW7L.4h <-> ROW3R.4h */
    smlal v14.4s, ROW5R.4h, XFIX_1_175875602_MINUS_0_390180644
    smlal v14.4s, ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L.4h <-> ROW1R.4h */
    ssubl v6.4s, ROW4L.4h, ROW4R.4h  /* ROW4L.4h <-> ROW0R.4h */
    smull v4.4s, ROW6L.4h, XFIX_0_541196100  /* ROW6L.4h <-> ROW2R.4h */
    smlal v4.4s, ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065
    mov   v8.16b, v12.16b
    smlsl v12.4s, ROW5R.4h, XFIX_2_562915447
    smlal v12.4s, ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447  /* ROW7L.4h <-> ROW3R.4h */
    shl   v6.4s, v6.4s, #13
    smlsl v8.4s, ROW5L.4h, XFIX_0_899976223  /* ROW5L.4h <-> ROW1R.4h */
    add   v2.4s, v6.4s, v4.4s
    mov   v10.16b, v14.16b
    add   v2.4s, v2.4s, v12.4s
    smlsl v14.4s, ROW7R.4h, XFIX_0_899976223
    smlal v14.4s, ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223  /* ROW5L.4h <-> ROW1R.4h */
    shrn  ROW5L.4h, v2.4s, #16  /* ROW5L.4h <-> ROW1R.4h */
    sub   v2.4s, v2.4s, v12.4s
    smlal v10.4s, ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447
    smlsl v10.4s, ROW7L.4h, XFIX_2_562915447  /* ROW7L.4h <-> ROW3R.4h */
    sub   v2.4s, v2.4s, v12.4s
    smull v12.4s, ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865  /* ROW6L.4h <-> ROW2R.4h */
    smlal v12.4s, ROW6R.4h, XFIX_0_541196100
    sub   v6.4s, v6.4s, v4.4s
    shrn  ROW6R.4h, v2.4s, #16
    add   v2.4s, v6.4s, v10.4s
    sub   v6.4s, v6.4s, v10.4s
    saddl v10.4s, ROW4L.4h, ROW4R.4h  /* ROW4L.4h <-> ROW0R.4h */
    shrn  ROW6L.4h, v2.4s, #16  /* ROW6L.4h <-> ROW2R.4h */
    shrn  ROW5R.4h, v6.4s, #16
    shl   v10.4s, v10.4s, #13
    smlal v8.4s, ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223
    add   v4.4s, v10.4s, v12.4s
    sub   v2.4s, v10.4s, v12.4s
    add   v12.4s, v4.4s, v14.4s
    sub   v4.4s, v4.4s, v14.4s
    add   v10.4s, v2.4s, v8.4s
    sub   v6.4s, v2.4s, v8.4s
    shrn  ROW7R.4h, v4.4s, #16
    shrn  ROW7L.4h, v10.4s, #16  /* ROW7L.4h <-> ROW3R.4h */
    shrn  ROW4L.4h, v12.4s, #16  /* ROW4L.4h <-> ROW0R.4h */
    shrn  ROW4R.4h, v6.4s, #16

2:  /* Descale to 8-bit and range limit */
    ins v16.d[1], v17.d[0]
    ins v18.d[1], v19.d[0]
    ins v20.d[1], v21.d[0]
    ins v22.d[1], v23.d[0]
    sqrshrn  v16.8b, v16.8h, #2
    sqrshrn2 v16.16b, v18.8h, #2
    sqrshrn  v18.8b, v20.8h, #2
    sqrshrn2 v18.16b, v22.8h, #2

    /* vpop {v8.4h - d15.4h} */  /* restore NEON registers */
    ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [sp], 32
    ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [sp], 32
    ins v24.d[1], v25.d[0]

    sqrshrn v20.8b, v24.8h, #2
    /* Transpose the final 8-bit samples and do signed->unsigned conversion */
    /* trn1 v16.8h, v16.8h, v18.8h */
    transpose v16, v18, v3, .16b, .8h
    ins v26.d[1], v27.d[0]
    ins v28.d[1], v29.d[0]
    ins v30.d[1], v31.d[0]
    sqrshrn2 v20.16b, v26.8h, #2
    sqrshrn  v22.8b, v28.8h, #2
    movi v0.16b, #(CENTERJSAMPLE)
    sqrshrn2 v22.16b, v30.8h, #2
    transpose_single v16, v17, v3, .d, .8b
    transpose_single v18, v19, v3, .d, .8b
    add v16.8b, v16.8b, v0.8b
    add v17.8b, v17.8b, v0.8b
    add v18.8b, v18.8b, v0.8b
    add v19.8b, v19.8b, v0.8b
    transpose v20, v22, v3, .16b, .8h
    /* Store results to the output buffer */
    ldp TMP1, TMP2, [OUTPUT_BUF], 16
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    st1 {v16.8b}, [TMP1]
    transpose_single v20, v21, v3, .d, .8b
    st1 {v17.8b}, [TMP2]
    ldp TMP1, TMP2, [OUTPUT_BUF], 16
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    st1 {v18.8b}, [TMP1]
    add v20.8b, v20.8b, v0.8b
    add v21.8b, v21.8b, v0.8b
    st1 {v19.8b}, [TMP2]
    ldp TMP1, TMP2, [OUTPUT_BUF], 16
    ldp TMP3, TMP4, [OUTPUT_BUF]
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    add TMP3, TMP3, OUTPUT_COL
    add TMP4, TMP4, OUTPUT_COL
    transpose_single v22, v23, v3, .d, .8b
    st1 {v20.8b}, [TMP1]
    add v22.8b, v22.8b, v0.8b
    add v23.8b, v23.8b, v0.8b
    st1 {v21.8b}, [TMP2]
    st1 {v22.8b}, [TMP3]
    st1 {v23.8b}, [TMP4]
    ldr x15, [sp], 16
    ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
    ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
    ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
    ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
    ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
    ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
    blr x30

3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */

    /* Transpose left 4x8 half */
    transpose ROW6L, ROW7L, v3, .16b, .4h
    transpose ROW2L, ROW3L, v3, .16b, .4h
    transpose ROW0L, ROW1L, v3, .16b, .4h
    transpose ROW4L, ROW5L, v3, .16b, .4h
    shl ROW0R.4h, ROW0R.4h, #2  /* PASS1_BITS */
    transpose ROW1L, ROW3L, v3, .16b, .2s
    transpose ROW4L, ROW6L, v3, .16b, .2s
    transpose ROW0L, ROW2L, v3, .16b, .2s
    transpose ROW5L, ROW7L, v3, .16b, .2s
    cmp x0, #0
    b.eq 4f  /* Right 4x8 half has all zeros, go to 'sparse' second pass */
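
    /*
     * Note (added): when only row 0 of the right 4x8 half is non-zero,
     * pass 1 for those columns degenerates to replicating the DC term
     * shifted left by PASS1_BITS (the same shortcut jidctint.c takes when
     * all AC terms of a column are zero); the shl above and the dup
     * broadcasts below implement that shortcut.
     */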
    /* Only row 0 is non-zero for the right 4x8 half */
    dup ROW1R.4h, ROW0R.h[1]
    dup ROW2R.4h, ROW0R.h[2]
    dup ROW3R.4h, ROW0R.h[3]
    dup ROW4R.4h, ROW0R.h[0]
    dup ROW5R.4h, ROW0R.h[1]
    dup ROW6R.4h, ROW0R.h[2]
    dup ROW7R.4h, ROW0R.h[3]
    dup ROW0R.4h, ROW0R.h[0]
    b 1b  /* Go to 'normal' second pass */

4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
    ld1   {v2.4h}, [x15]  /* reload constants */
    smull v12.4s, ROW1L.4h, XFIX_1_175875602
    smlal v12.4s, ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560
    smull v14.4s, ROW3L.4h, XFIX_1_175875602
    smlal v14.4s, ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644
    smull v4.4s, ROW2L.4h, XFIX_0_541196100
    sshll v6.4s, ROW0L.4h, #13
    mov   v8.16b, v12.16b
    smlal v12.4s, ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
    smlsl v8.4s, ROW1L.4h, XFIX_0_899976223
    add   v2.4s, v6.4s, v4.4s
    mov   v10.16b, v14.16b
    smlal v14.4s, ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
    add   v2.4s, v2.4s, v12.4s
    add   v12.4s, v12.4s, v12.4s
    smlsl v10.4s, ROW3L.4h, XFIX_2_562915447
    shrn  ROW1L.4h, v2.4s, #16
    sub   v2.4s, v2.4s, v12.4s
    smull v12.4s, ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
    sub   v6.4s, v6.4s, v4.4s
    shrn  ROW2R.4h, v2.4s, #16  /* ROW6L.4h <-> ROW2R.4h */
    add   v2.4s, v6.4s, v10.4s
    sub   v6.4s, v6.4s, v10.4s
    sshll v10.4s, ROW0L.4h, #13
    shrn  ROW2L.4h, v2.4s, #16
    shrn  ROW1R.4h, v6.4s, #16  /* ROW5L.4h <-> ROW1R.4h */
    add   v4.4s, v10.4s, v12.4s
    sub   v2.4s, v10.4s, v12.4s
    add   v12.4s, v4.4s, v14.4s
    sub   v4.4s, v4.4s, v14.4s
    add   v10.4s, v2.4s, v8.4s
    sub   v6.4s, v2.4s, v8.4s
    shrn  ROW3R.4h, v4.4s, #16  /* ROW7L.4h <-> ROW3R.4h */
    shrn  ROW3L.4h, v10.4s, #16
    shrn  ROW0L.4h, v12.4s, #16
    shrn  ROW0R.4h, v6.4s, #16  /* ROW4L.4h <-> ROW0R.4h */
    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
    ld1   {v2.4h}, [x15]  /* reload constants */
    smull v12.4s, ROW5L.4h, XFIX_1_175875602
    smlal v12.4s, ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560
    smull v14.4s, ROW7L.4h, XFIX_1_175875602
    smlal v14.4s, ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644
    smull v4.4s, ROW6L.4h, XFIX_0_541196100
    sshll v6.4s, ROW4L.4h, #13
    mov   v8.16b, v12.16b
    smlal v12.4s, ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447
    smlsl v8.4s, ROW5L.4h, XFIX_0_899976223
    add   v2.4s, v6.4s, v4.4s
    mov   v10.16b, v14.16b
    smlal v14.4s, ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223
    add   v2.4s, v2.4s, v12.4s
    add   v12.4s, v12.4s, v12.4s
    smlsl v10.4s, ROW7L.4h, XFIX_2_562915447
    shrn  ROW5L.4h, v2.4s, #16  /* ROW5L.4h <-> ROW1R.4h */
    sub   v2.4s, v2.4s, v12.4s
    smull v12.4s, ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865
    sub   v6.4s, v6.4s, v4.4s
    shrn  ROW6R.4h, v2.4s, #16
    add   v2.4s, v6.4s, v10.4s
    sub   v6.4s, v6.4s, v10.4s
    sshll v10.4s, ROW4L.4h, #13
    shrn  ROW6L.4h, v2.4s, #16  /* ROW6L.4h <-> ROW2R.4h */
    shrn  ROW5R.4h, v6.4s, #16
    add   v4.4s, v10.4s, v12.4s
    sub   v2.4s, v10.4s, v12.4s
    add   v12.4s, v4.4s, v14.4s
    sub   v4.4s, v4.4s, v14.4s
    add   v10.4s, v2.4s, v8.4s
    sub   v6.4s, v2.4s, v8.4s
    shrn  ROW7R.4h, v4.4s, #16
    shrn  ROW7L.4h, v10.4s, #16  /* ROW7L.4h <-> ROW3R.4h */
    shrn  ROW4L.4h, v12.4s, #16  /* ROW4L.4h <-> ROW0R.4h */
    shrn  ROW4R.4h, v6.4s, #16
    b 2b  /* Go to epilogue */

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4

    .unreq ROW0L
    .unreq ROW0R
    .unreq ROW1L
    .unreq ROW1R
    .unreq ROW2L
    .unreq ROW2R
    .unreq ROW3L
    .unreq ROW3R
    .unreq ROW4L
    .unreq ROW4R
    .unreq ROW5L
    .unreq ROW5R
    .unreq ROW6L
    .unreq ROW6R
    .unreq ROW7L
    .unreq ROW7R


/*****************************************************************************/

/*
 * jsimd_idct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
 * function from jidctfst.c.
 *
 * Normally a 1-D AAN DCT needs 5 multiplications and 29 additions.
 * But in the ARM NEON case some extra additions are required because the
 * VQDMULH instruction can't handle constants larger than 1. So expressions
 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
 * which introduces an extra addition. Overall, there are 6 extra additions
 * per 1-D IDCT pass, totalling 5 VQDMULH and 35 VADD/VSUB instructions.
 */
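
/*
 * To make the trick above concrete, here is a minimal C model of how
 * "multiply by 1.082392200" is carried out with SQDMULH (added for
 * illustration; 'sqdmulh16' models the per-lane behaviour of the
 * instruction, ignoring saturation at INT16_MIN, and 'mul_1_082392200'
 * is a hypothetical name):
 *
 *   #include <stdint.h>
 *
 *   static int16_t sqdmulh16(int16_t a, int16_t b) {
 *       return (int16_t)(((int32_t)a * b * 2) >> 16);  // a * b / 2^15
 *   }
 *
 *   int16_t mul_1_082392200(int16_t x) {
 *       // XFIX_1_082392200 = 277*128 - 256*128 = 2688; 2688/2^15 = 21/256,
 *       // the fractional part 0.0824 quantized to 8-bit precision
 *       return x + sqdmulh16(x, 2688);                 // x*0.0824 + x
 *   }
 */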

#define XFIX_1_082392200 v0.h[0]
#define XFIX_1_414213562 v0.h[1]
#define XFIX_1_847759065 v0.h[2]
#define XFIX_2_613125930 v0.h[3]

.balign 16
Ljsimd_idct_ifast_neon_consts:
    .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
    .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
    .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
    .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */

asm_function jsimd_idct_ifast_neon

    DCT_TABLE  .req x0
    COEF_BLOCK .req x1
    OUTPUT_BUF .req x2
    OUTPUT_COL .req x3
    TMP1       .req x0
    TMP2       .req x1
    TMP3       .req x2
    TMP4       .req x22
    TMP5       .req x23

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d16    | d17    ( v8.8h  )
     *   1  | d18    | d19    ( v9.8h  )
     *   2  | d20    | d21    ( v10.8h )
     *   3  | d22    | d23    ( v11.8h )
     *   4  | d24    | d25    ( v12.8h )
     *   5  | d26    | d27    ( v13.8h )
     *   6  | d28    | d29    ( v14.8h )
     *   7  | d30    | d31    ( v15.8h )
     */
    /* Save NEON registers used in fast IDCT */
    sub sp, sp, #176
    stp x22, x23, [sp], 16
    adr x23, Ljsimd_idct_ifast_neon_consts
    st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
    st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
    st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
    ld1 {v8.8h, v9.8h}, [COEF_BLOCK], 32
    ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
    ld1 {v10.8h, v11.8h}, [COEF_BLOCK], 32
    mul v8.8h, v8.8h, v0.8h
    ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
    mul v9.8h, v9.8h, v1.8h
    ld1 {v12.8h, v13.8h}, [COEF_BLOCK], 32
    mul v10.8h, v10.8h, v2.8h
    ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
    mul v11.8h, v11.8h, v3.8h
    ld1 {v14.8h, v15.8h}, [COEF_BLOCK], 32
    mul v12.8h, v12.8h, v0.8h
    ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
    mul v14.8h, v14.8h, v2.8h
    mul v13.8h, v13.8h, v1.8h
    ld1 {v0.4h}, [x23]  /* load constants */
    mul v15.8h, v15.8h, v3.8h

    /* 1-D IDCT, pass 1 */
    sub v2.8h, v10.8h, v14.8h
    add v14.8h, v10.8h, v14.8h
    sub v1.8h, v11.8h, v13.8h
    add v13.8h, v11.8h, v13.8h
    sub v5.8h, v9.8h, v15.8h
    add v15.8h, v9.8h, v15.8h
    sqdmulh v4.8h, v2.8h, XFIX_1_414213562
    sqdmulh v6.8h, v1.8h, XFIX_2_613125930
    add v3.8h, v1.8h, v1.8h
    sub v1.8h, v5.8h, v1.8h
    add v10.8h, v2.8h, v4.8h
    sqdmulh v4.8h, v1.8h, XFIX_1_847759065
    sub v2.8h, v15.8h, v13.8h
    add v3.8h, v3.8h, v6.8h
    sqdmulh v6.8h, v2.8h, XFIX_1_414213562
    add v1.8h, v1.8h, v4.8h
    sqdmulh v4.8h, v5.8h, XFIX_1_082392200
    sub v10.8h, v10.8h, v14.8h
    add v2.8h, v2.8h, v6.8h
    sub v6.8h, v8.8h, v12.8h
    add v12.8h, v8.8h, v12.8h
    add v9.8h, v5.8h, v4.8h
    add v5.8h, v6.8h, v10.8h
    sub v10.8h, v6.8h, v10.8h
    add v6.8h, v15.8h, v13.8h
    add v8.8h, v12.8h, v14.8h
    sub v3.8h, v6.8h, v3.8h
    sub v12.8h, v12.8h, v14.8h
    sub v3.8h, v3.8h, v1.8h
    sub v1.8h, v9.8h, v1.8h
    add v2.8h, v3.8h, v2.8h
    sub v15.8h, v8.8h, v6.8h
    add v1.8h, v1.8h, v2.8h
    add v8.8h, v8.8h, v6.8h
    add v14.8h, v5.8h, v3.8h
    sub v9.8h, v5.8h, v3.8h
    sub v13.8h, v10.8h, v2.8h
    add v10.8h, v10.8h, v2.8h
    /* Transpose q8-q9 */
    mov v18.16b, v8.16b
    trn1 v8.8h, v8.8h, v9.8h
    trn2 v9.8h, v18.8h, v9.8h
    sub v11.8h, v12.8h, v1.8h
    /* Transpose q14-q15 */
    mov v18.16b, v14.16b
    trn1 v14.8h, v14.8h, v15.8h
    trn2 v15.8h, v18.8h, v15.8h
    add v12.8h, v12.8h, v1.8h
    /* Transpose q10-q11 */
    mov v18.16b, v10.16b
    trn1 v10.8h, v10.8h, v11.8h
    trn2 v11.8h, v18.8h, v11.8h
    /* Transpose q12-q13 */
    mov v18.16b, v12.16b
    trn1 v12.8h, v12.8h, v13.8h
    trn2 v13.8h, v18.8h, v13.8h
    /* Transpose q9-q11 */
    mov v18.16b, v9.16b
    trn1 v9.4s, v9.4s, v11.4s
    trn2 v11.4s, v18.4s, v11.4s
    /* Transpose q12-q14 */
    mov v18.16b, v12.16b
    trn1 v12.4s, v12.4s, v14.4s
    trn2 v14.4s, v18.4s, v14.4s
    /* Transpose q8-q10 */
    mov v18.16b, v8.16b
    trn1 v8.4s, v8.4s, v10.4s
    trn2 v10.4s, v18.4s, v10.4s
    /* Transpose q13-q15 */
    mov v18.16b, v13.16b
    trn1 v13.4s, v13.4s, v15.4s
    trn2 v15.4s, v18.4s, v15.4s
    /* vswp v14.4h, v10-MSB.4h */
    umov x22, v14.d[0]
    ins v14.d[0], v10.d[1]
    ins v10.d[1], x22
    /* vswp v13.4h, v9-MSB.4h */
    umov x22, v13.d[0]
    ins v13.d[0], v9.d[1]
    ins v9.d[1], x22
    /* 1-D IDCT, pass 2 */
    sub v2.8h, v10.8h, v14.8h
    /* vswp v15.4h, v11-MSB.4h */
    umov x22, v15.d[0]
    ins v15.d[0], v11.d[1]
    ins v11.d[1], x22
    add v14.8h, v10.8h, v14.8h
    /* vswp v12.4h, v8-MSB.4h */
    umov x22, v12.d[0]
    ins v12.d[0], v8.d[1]
    ins v8.d[1], x22
    sub v1.8h, v11.8h, v13.8h
    add v13.8h, v11.8h, v13.8h
    sub v5.8h, v9.8h, v15.8h
    add v15.8h, v9.8h, v15.8h
    sqdmulh v4.8h, v2.8h, XFIX_1_414213562
    sqdmulh v6.8h, v1.8h, XFIX_2_613125930
    add v3.8h, v1.8h, v1.8h
    sub v1.8h, v5.8h, v1.8h
    add v10.8h, v2.8h, v4.8h
    sqdmulh v4.8h, v1.8h, XFIX_1_847759065
    sub v2.8h, v15.8h, v13.8h
    add v3.8h, v3.8h, v6.8h
    sqdmulh v6.8h, v2.8h, XFIX_1_414213562
    add v1.8h, v1.8h, v4.8h
    sqdmulh v4.8h, v5.8h, XFIX_1_082392200
    sub v10.8h, v10.8h, v14.8h
    add v2.8h, v2.8h, v6.8h
    sub v6.8h, v8.8h, v12.8h
    add v12.8h, v8.8h, v12.8h
    add v9.8h, v5.8h, v4.8h
    add v5.8h, v6.8h, v10.8h
    sub v10.8h, v6.8h, v10.8h
    add v6.8h, v15.8h, v13.8h
    add v8.8h, v12.8h, v14.8h
    sub v3.8h, v6.8h, v3.8h
    sub v12.8h, v12.8h, v14.8h
    sub v3.8h, v3.8h, v1.8h
    sub v1.8h, v9.8h, v1.8h
    add v2.8h, v3.8h, v2.8h
    sub v15.8h, v8.8h, v6.8h
    add v1.8h, v1.8h, v2.8h
    add v8.8h, v8.8h, v6.8h
    add v14.8h, v5.8h, v3.8h
    sub v9.8h, v5.8h, v3.8h
    sub v13.8h, v10.8h, v2.8h
    add v10.8h, v10.8h, v2.8h
    sub v11.8h, v12.8h, v1.8h
    add v12.8h, v12.8h, v1.8h
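
    /*
     * Note (added): the #5 shift below is the final IFAST descale
     * (PASS1_BITS + 3 = 5), performed with saturating narrowing; adding
     * 0x80 (CENTERJSAMPLE) afterwards converts the signed results to
     * unsigned samples.
     */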
    /* Descale to 8-bit and range limit */
    movi v0.16b, #0x80
    sqshrn  v8.8b, v8.8h, #5
    sqshrn2 v8.16b, v9.8h, #5
    sqshrn  v9.8b, v10.8h, #5
    sqshrn2 v9.16b, v11.8h, #5
    sqshrn  v10.8b, v12.8h, #5
    sqshrn2 v10.16b, v13.8h, #5
    sqshrn  v11.8b, v14.8h, #5
    sqshrn2 v11.16b, v15.8h, #5
    add v8.16b, v8.16b, v0.16b
    add v9.16b, v9.16b, v0.16b
    add v10.16b, v10.16b, v0.16b
    add v11.16b, v11.16b, v0.16b
    /* Transpose the final 8-bit samples */
    /* Transpose q8-q9 */
    mov v18.16b, v8.16b
    trn1 v8.8h, v8.8h, v9.8h
    trn2 v9.8h, v18.8h, v9.8h
    /* Transpose q10-q11 */
    mov v18.16b, v10.16b
    trn1 v10.8h, v10.8h, v11.8h
    trn2 v11.8h, v18.8h, v11.8h
    /* Transpose q8-q10 */
    mov v18.16b, v8.16b
    trn1 v8.4s, v8.4s, v10.4s
    trn2 v10.4s, v18.4s, v10.4s
    /* Transpose q9-q11 */
    mov v18.16b, v9.16b
    trn1 v9.4s, v9.4s, v11.4s
    trn2 v11.4s, v18.4s, v11.4s
    /* make copy */
    ins v17.d[0], v8.d[1]
    /* Transpose d16-d17-msb */
    mov v18.16b, v8.16b
    trn1 v8.8b, v8.8b, v17.8b
    trn2 v17.8b, v18.8b, v17.8b
    /* make copy */
    ins v19.d[0], v9.d[1]
    mov v18.16b, v9.16b
    trn1 v9.8b, v9.8b, v19.8b
    trn2 v19.8b, v18.8b, v19.8b
    /* Store results to the output buffer */
    ldp TMP1, TMP2, [OUTPUT_BUF], 16
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    st1 {v8.8b}, [TMP1]
    st1 {v17.8b}, [TMP2]
    ldp TMP1, TMP2, [OUTPUT_BUF], 16
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    st1 {v9.8b}, [TMP1]
    /* make copy */
    ins v7.d[0], v10.d[1]
    mov v18.16b, v10.16b
    trn1 v10.8b, v10.8b, v7.8b
    trn2 v7.8b, v18.8b, v7.8b
    st1 {v19.8b}, [TMP2]
    ldp TMP1, TMP2, [OUTPUT_BUF], 16
    ldp TMP4, TMP5, [OUTPUT_BUF], 16
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    add TMP4, TMP4, OUTPUT_COL
    add TMP5, TMP5, OUTPUT_COL
    st1 {v10.8b}, [TMP1]
    /* make copy */
    ins v16.d[0], v11.d[1]
    mov v18.16b, v11.16b
    trn1 v11.8b, v11.8b, v16.8b
    trn2 v16.8b, v18.8b, v16.8b
    st1 {v7.8b}, [TMP2]
    st1 {v11.8b}, [TMP4]
    st1 {v16.8b}, [TMP5]
    sub sp, sp, #176
    ldp x22, x23, [sp], 16
    ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
    ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
    ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
    blr x30

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4


/*****************************************************************************/

/*
 * jsimd_idct_4x4_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 4x4 pixel output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse-DCT, which
 * requires far fewer arithmetic operations and hence should be faster.
 * The primary purpose of this particular NEON optimized function is
 * bit-exact compatibility with jpeg-6b.
 *
 * TODO: slightly better instruction scheduling can be achieved by expanding
 * the idct_helper/transpose_4x4 macros and reordering instructions,
 * but readability will suffer somewhat.
 */

#define CONST_BITS 13

#define FIX_0_211164243 (1730)   /* FIX(0.211164243) */
#define FIX_0_509795579 (4176)   /* FIX(0.509795579) */
#define FIX_0_601344887 (4926)   /* FIX(0.601344887) */
#define FIX_0_720959822 (5906)   /* FIX(0.720959822) */
#define FIX_0_765366865 (6270)   /* FIX(0.765366865) */
#define FIX_0_850430095 (6967)   /* FIX(0.850430095) */
#define FIX_0_899976223 (7373)   /* FIX(0.899976223) */
#define FIX_1_061594337 (8697)   /* FIX(1.061594337) */
#define FIX_1_272758580 (10426)  /* FIX(1.272758580) */
#define FIX_1_451774981 (11893)  /* FIX(1.451774981) */
#define FIX_1_847759065 (15137)  /* FIX(1.847759065) */
#define FIX_2_172734803 (17799)  /* FIX(2.172734803) */
#define FIX_2_562915447 (20995)  /* FIX(2.562915447) */
#define FIX_3_624509785 (29692)  /* FIX(3.624509785) */

.balign 16
Ljsimd_idct_4x4_neon_consts:
    .short FIX_1_847759065      /* v0.h[0] */
    .short -FIX_0_765366865     /* v0.h[1] */
    .short -FIX_0_211164243     /* v0.h[2] */
    .short FIX_1_451774981      /* v0.h[3] */
    .short -FIX_2_172734803     /* d1[0] */
    .short FIX_1_061594337      /* d1[1] */
    .short -FIX_0_509795579     /* d1[2] */
    .short -FIX_0_601344887     /* d1[3] */
    .short FIX_0_899976223      /* v2.h[0] */
    .short FIX_2_562915447      /* v2.h[1] */
    .short 1 << (CONST_BITS+1)  /* v2.h[2] */
    .short 0                    /* v2.h[3] */

.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    smull v28.4s, \x4, v2.h[2]
    smlal v28.4s, \x8, v0.h[0]
    smlal v28.4s, \x14, v0.h[1]

    smull v26.4s, \x16, v1.h[2]
    smlal v26.4s, \x12, v1.h[3]
    smlal v26.4s, \x10, v2.h[0]
    smlal v26.4s, \x6, v2.h[1]

    smull v30.4s, \x4, v2.h[2]
    smlsl v30.4s, \x8, v0.h[0]
    smlsl v30.4s, \x14, v0.h[1]

    smull v24.4s, \x16, v0.h[2]
    smlal v24.4s, \x12, v0.h[3]
    smlal v24.4s, \x10, v1.h[0]
    smlal v24.4s, \x6, v1.h[1]

    add v20.4s, v28.4s, v26.4s
    sub v28.4s, v28.4s, v26.4s

.if \shift > 16
    srshr v20.4s, v20.4s, #\shift
    srshr v28.4s, v28.4s, #\shift
    xtn \y26, v20.4s
    xtn \y29, v28.4s
.else
    rshrn \y26, v20.4s, #\shift
    rshrn \y29, v28.4s, #\shift
.endif

    add v20.4s, v30.4s, v24.4s
    sub v30.4s, v30.4s, v24.4s

.if \shift > 16
    srshr v20.4s, v20.4s, #\shift
    srshr v30.4s, v30.4s, #\shift
    xtn \y27, v20.4s
    xtn \y28, v30.4s
.else
    rshrn \y27, v20.4s, #\shift
    rshrn \y28, v30.4s, #\shift
.endif
.endm
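
/*
 * Note (added): the 'shift' parameter of idct_helper matches the DESCALE
 * amounts in jidctred.c's jpeg_idct_4x4, with CONST_BITS = 13 and
 * PASS1_BITS = 2:
 *
 *   pass 1:  CONST_BITS - PASS1_BITS + 1     = 12
 *   pass 2:  CONST_BITS + PASS1_BITS + 3 + 1 = 19
 *
 * which is why the macro is invoked with shift 12 for the row pass and 19
 * for the column pass, taking the rshrn path for 12 and the srshr + xtn
 * path for 19 (rshrn's immediate is limited to 16).
 */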

asm_function jsimd_idct_4x4_neon

    DCT_TABLE  .req x0
    COEF_BLOCK .req x1
    OUTPUT_BUF .req x2
    OUTPUT_COL .req x3
    TMP1       .req x0
    TMP2       .req x1
    TMP3       .req x2
    TMP4       .req x15

    /* Save all used NEON registers */
    sub sp, sp, 272
    str x15, [sp], 16
    /* Load constants (v3.4h is just used for padding) */
    adr TMP4, Ljsimd_idct_4x4_neon_consts
    st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
    st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
    st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
    st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
    st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
    st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
    ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | v4.4h  | v5.4h
     *   1  | v6.4h  | v7.4h
     *   2  | v8.4h  | v9.4h
     *   3  | v10.4h | v11.4h
     *   4  | -      | -
     *   5  | v12.4h | v13.4h
     *   6  | v14.4h | v15.4h
     *   7  | v16.4h | v17.4h
     */
    ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
    ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
    add COEF_BLOCK, COEF_BLOCK, #16
    ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
    ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
    /* dequantize */
    ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
    mul v4.4h, v4.4h, v18.4h
    mul v5.4h, v5.4h, v19.4h
    ins v4.d[1], v5.d[0]  /* 128 bit q4 */
    ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
    mul v6.4h, v6.4h, v20.4h
    mul v7.4h, v7.4h, v21.4h
    ins v6.d[1], v7.d[0]  /* 128 bit q6 */
    mul v8.4h, v8.4h, v22.4h
    mul v9.4h, v9.4h, v23.4h
    ins v8.d[1], v9.d[0]  /* 128 bit q8 */
    add DCT_TABLE, DCT_TABLE, #16
    ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
    mul v10.4h, v10.4h, v24.4h
    mul v11.4h, v11.4h, v25.4h
    ins v10.d[1], v11.d[0]  /* 128 bit q10 */
    mul v12.4h, v12.4h, v26.4h
    mul v13.4h, v13.4h, v27.4h
    ins v12.d[1], v13.d[0]  /* 128 bit q12 */
    ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
    mul v14.4h, v14.4h, v28.4h
    mul v15.4h, v15.4h, v29.4h
    ins v14.d[1], v15.d[0]  /* 128 bit q14 */
    mul v16.4h, v16.4h, v30.4h
    mul v17.4h, v17.4h, v31.4h
    ins v16.d[1], v17.d[0]  /* 128 bit q16 */

    /* Pass 1 */
    idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, v4.4h, v6.4h, v8.4h, v10.4h
    transpose_4x4 v4, v6, v8, v10, v3
    ins v10.d[1], v11.d[0]
    idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, v5.4h, v7.4h, v9.4h, v11.4h
    transpose_4x4 v5, v7, v9, v11, v3
    ins v10.d[1], v11.d[0]

    /* Pass 2 */
    idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, v26.4h, v27.4h, v28.4h, v29.4h
    transpose_4x4 v26, v27, v28, v29, v3

    /* Range limit */
    movi v30.8h, #0x80
    ins v26.d[1], v27.d[0]
    ins v28.d[1], v29.d[0]
    add v26.8h, v26.8h, v30.8h
    add v28.8h, v28.8h, v30.8h
    sqxtun v26.8b, v26.8h
    sqxtun v27.8b, v28.8h

    /* Store results to the output buffer */
    ldp TMP1, TMP2, [OUTPUT_BUF], 16
    ldp TMP3, TMP4, [OUTPUT_BUF]
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    add TMP3, TMP3, OUTPUT_COL
    add TMP4, TMP4, OUTPUT_COL

#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use many fewer instructions on little-endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses
     */
    st1 {v26.s}[0], [TMP1], 4
    st1 {v27.s}[0], [TMP3], 4
    st1 {v26.s}[1], [TMP2], 4
    st1 {v27.s}[1], [TMP4], 4
#else
    st1 {v26.b}[0], [TMP1], 1
    st1 {v27.b}[0], [TMP3], 1
    st1 {v26.b}[1], [TMP1], 1
    st1 {v27.b}[1], [TMP3], 1
    st1 {v26.b}[2], [TMP1], 1
    st1 {v27.b}[2], [TMP3], 1
    st1 {v26.b}[3], [TMP1], 1
    st1 {v27.b}[3], [TMP3], 1

    st1 {v26.b}[4], [TMP2], 1
    st1 {v27.b}[4], [TMP4], 1
    st1 {v26.b}[5], [TMP2], 1
    st1 {v27.b}[5], [TMP4], 1
    st1 {v26.b}[6], [TMP2], 1
    st1 {v27.b}[6], [TMP4], 1
    st1 {v26.b}[7], [TMP2], 1
    st1 {v27.b}[7], [TMP4], 1
#endif

    /* vpop {v8.4h - v15.4h} ;not available */
    sub sp, sp, #272
    ldr x15, [sp], 16
    ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
    ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
    ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
    ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
    ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
    ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
    blr x30

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4

.purgem idct_helper


/*****************************************************************************/

/*
 * jsimd_idct_2x2_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 2x2 pixel output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse-DCT, which
 * requires far fewer arithmetic operations and hence should be faster.
 * The primary purpose of this particular NEON optimized function is
 * bit-exact compatibility with jpeg-6b.
 */
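
/*
 * Note (added): as the register allocation table below shows, only rows
 * 0, 1, 3, 5 and 7 of the coefficient block contribute to the 2x2 output.
 * A reference sketch of the 1-D helper used by both passes (illustrative
 * C, mirroring idct_helper below and written in terms of jidctred.c's
 * MULTIPLY/DESCALE macros; z0/z1/z3/z5/z7 are one column's coefficients):
 *
 *   INT32 even = (INT32)z0 << 15;
 *   INT32 odd  = MULTIPLY(z1,  FIX_3_624509785) +
 *                MULTIPLY(z3, -FIX_1_272758580) +
 *                MULTIPLY(z5,  FIX_0_850430095) +
 *                MULTIPLY(z7, -FIX_0_720959822);
 *   out0 = DESCALE(even + odd, shift);  // shift = 13 in pass 1, 20 in pass 2
 *   out1 = DESCALE(even - odd, shift);
 */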

.balign 8
Ljsimd_idct_2x2_neon_consts:
    .short -FIX_0_720959822  /* v14[0] */
    .short FIX_0_850430095   /* v14[1] */
    .short -FIX_1_272758580  /* v14[2] */
    .short FIX_3_624509785   /* v14[3] */

.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
    sshll v15.4s, \x4, #15
    smull v26.4s, \x6, v14.h[3]
    smlal v26.4s, \x10, v14.h[2]
    smlal v26.4s, \x12, v14.h[1]
    smlal v26.4s, \x16, v14.h[0]

    add v20.4s, v15.4s, v26.4s
    sub v15.4s, v15.4s, v26.4s

.if \shift > 16
    srshr v20.4s, v20.4s, #\shift
    srshr v15.4s, v15.4s, #\shift
    xtn \y26, v20.4s
    xtn \y27, v15.4s
.else
    rshrn \y26, v20.4s, #\shift
    rshrn \y27, v15.4s, #\shift
.endif
.endm

asm_function jsimd_idct_2x2_neon

    DCT_TABLE  .req x0
    COEF_BLOCK .req x1
    OUTPUT_BUF .req x2
    OUTPUT_COL .req x3
    TMP1       .req x0
    TMP2       .req x15

    /* vpush {v8.4h - v15.4h} ; not available */
    sub sp, sp, 208
    str x15, [sp], 16

    /* Load constants */
    adr TMP2, Ljsimd_idct_2x2_neon_consts
    st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
    st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
    st1 {v21.8b, v22.8b}, [sp], 16
    st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
    st1 {v30.8b, v31.8b}, [sp], 16
    ld1 {v14.4h}, [TMP2]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | v4.4h  | v5.4h
     *   1  | v6.4h  | v7.4h
     *   2  | -      | -
     *   3  | v10.4h | v11.4h
     *   4  | -      | -
     *   5  | v12.4h | v13.4h
     *   6  | -      | -
     *   7  | v16.4h | v17.4h
     */
    ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
    add COEF_BLOCK, COEF_BLOCK, #16
    ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16
    add COEF_BLOCK, COEF_BLOCK, #16
    ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16
    add COEF_BLOCK, COEF_BLOCK, #16
    ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
    /* Dequantize */
    ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
    mul v4.4h, v4.4h, v18.4h
    mul v5.4h, v5.4h, v19.4h
    ins v4.d[1], v5.d[0]
    mul v6.4h, v6.4h, v20.4h
    mul v7.4h, v7.4h, v21.4h
    ins v6.d[1], v7.d[0]
    add DCT_TABLE, DCT_TABLE, #16
    ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16
    mul v10.4h, v10.4h, v24.4h
    mul v11.4h, v11.4h, v25.4h
    ins v10.d[1], v11.d[0]
    add DCT_TABLE, DCT_TABLE, #16
    ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16
    mul v12.4h, v12.4h, v26.4h
    mul v13.4h, v13.4h, v27.4h
    ins v12.d[1], v13.d[0]
    add DCT_TABLE, DCT_TABLE, #16
    ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
    mul v16.4h, v16.4h, v30.4h
    mul v17.4h, v17.4h, v31.4h
    ins v16.d[1], v17.d[0]

    /* Pass 1 */
#if 0
    idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
    transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h
    idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
    transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h
#else
    smull v26.4s, v6.4h, v14.h[3]
    smlal v26.4s, v10.4h, v14.h[2]
    smlal v26.4s, v12.4h, v14.h[1]
    smlal v26.4s, v16.4h, v14.h[0]
    smull v24.4s, v7.4h, v14.h[3]
    smlal v24.4s, v11.4h, v14.h[2]
    smlal v24.4s, v13.4h, v14.h[1]
    smlal v24.4s, v17.4h, v14.h[0]
    sshll v15.4s, v4.4h, #15
    sshll v30.4s, v5.4h, #15
    add v20.4s, v15.4s, v26.4s
    sub v15.4s, v15.4s, v26.4s
    rshrn v4.4h, v20.4s, #13
    rshrn v6.4h, v15.4s, #13
    add v20.4s, v30.4s, v24.4s
    sub v15.4s, v30.4s, v24.4s
    rshrn v5.4h, v20.4s, #13
    rshrn v7.4h, v15.4s, #13
    ins v4.d[1], v5.d[0]
    ins v6.d[1], v7.d[0]
    transpose v4, v6, v3, .16b, .8h
    transpose v6, v10, v3, .16b, .4s
    ins v11.d[0], v10.d[1]
    ins v7.d[0], v6.d[1]
#endif

    /* Pass 2 */
    idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h

    /* Range limit */
    movi v30.8h, #0x80
    ins v26.d[1], v27.d[0]
    add v26.8h, v26.8h, v30.8h
    sqxtun v30.8b, v26.8h
    ins v26.d[0], v30.d[0]
    sqxtun v27.8b, v26.8h

    /* Store results to the output buffer */
    ldp TMP1, TMP2, [OUTPUT_BUF]
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL

    st1 {v26.b}[0], [TMP1], 1
    st1 {v27.b}[4], [TMP1], 1
    st1 {v26.b}[1], [TMP2], 1
    st1 {v27.b}[5], [TMP2], 1

    sub sp, sp, #208
    ldr x15, [sp], 16
    ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
    ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
    ld1 {v21.8b, v22.8b}, [sp], 16
    ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
    ld1 {v30.8b, v31.8b}, [sp], 16
    blr x30

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2

.purgem idct_helper


/*****************************************************************************/

/*
 * jsimd_ycc_extrgb_convert_neon
 * jsimd_ycc_extbgr_convert_neon
 * jsimd_ycc_extrgbx_convert_neon
 * jsimd_ycc_extbgrx_convert_neon
 * jsimd_ycc_extxbgr_convert_neon
 * jsimd_ycc_extxrgb_convert_neon
 *
 * Colorspace conversion YCbCr -> RGB
 */
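
/*
 * Note (added): the conversion implemented below is the usual JFIF one,
 *
 *   R = Y                        + 1.40200 * (Cr - 128)
 *   G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
 *   B = Y + 1.77200 * (Cb - 128)
 *
 * with the coefficients stored as fixed-point constants:
 * 22971 ~= 1.40200 * 2^14, -11277 ~= -0.34414 * 2^15,
 * -23401 ~= -0.71414 * 2^15 and 29033 ~= 1.77200 * 2^14, matching the
 * rshrn #14/#15 descales in the stage-2 macros.
 */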

.macro do_load size
    .if \size == 8
        ld1 {v4.8b}, [U], 8
        ld1 {v5.8b}, [V], 8
        ld1 {v0.8b}, [Y], 8
        prfm pldl1keep, [U, #64]
        prfm pldl1keep, [V, #64]
        prfm pldl1keep, [Y, #64]
    .elseif \size == 4
        ld1 {v4.b}[0], [U], 1
        ld1 {v4.b}[1], [U], 1
        ld1 {v4.b}[2], [U], 1
        ld1 {v4.b}[3], [U], 1
        ld1 {v5.b}[0], [V], 1
        ld1 {v5.b}[1], [V], 1
        ld1 {v5.b}[2], [V], 1
        ld1 {v5.b}[3], [V], 1
        ld1 {v0.b}[0], [Y], 1
        ld1 {v0.b}[1], [Y], 1
        ld1 {v0.b}[2], [Y], 1
        ld1 {v0.b}[3], [Y], 1
    .elseif \size == 2
        ld1 {v4.b}[4], [U], 1
        ld1 {v4.b}[5], [U], 1
        ld1 {v5.b}[4], [V], 1
        ld1 {v5.b}[5], [V], 1
        ld1 {v0.b}[4], [Y], 1
        ld1 {v0.b}[5], [Y], 1
    .elseif \size == 1
        ld1 {v4.b}[6], [U], 1
        ld1 {v5.b}[6], [V], 1
        ld1 {v0.b}[6], [Y], 1
    .else
        .error unsupported macroblock size
    .endif
.endm

.macro do_store bpp, size
    .if \bpp == 24
        .if \size == 8
            st3 {v10.8b, v11.8b, v12.8b}, [RGB], 24
        .elseif \size == 4
            st3 {v10.b, v11.b, v12.b}[0], [RGB], 3
            st3 {v10.b, v11.b, v12.b}[1], [RGB], 3
            st3 {v10.b, v11.b, v12.b}[2], [RGB], 3
            st3 {v10.b, v11.b, v12.b}[3], [RGB], 3
        .elseif \size == 2
            st3 {v10.b, v11.b, v12.b}[4], [RGB], 3
            st3 {v10.b, v11.b, v12.b}[5], [RGB], 3
        .elseif \size == 1
            st3 {v10.b, v11.b, v12.b}[6], [RGB], 3
        .else
            .error unsupported macroblock size
        .endif
    .elseif \bpp == 32
        .if \size == 8
            st4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
        .elseif \size == 4
            st4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
            st4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
            st4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
            st4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
        .elseif \size == 2
            st4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
            st4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
        .elseif \size == 1
            st4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
        .else
            .error unsupported macroblock size
        .endif
    .elseif \bpp == 16
        .if \size == 8
            st1 {v25.8h}, [RGB], 16
        .elseif \size == 4
            st1 {v25.4h}, [RGB], 8
        .elseif \size == 2
            st1 {v25.h}[4], [RGB], 2
            st1 {v25.h}[5], [RGB], 2
        .elseif \size == 1
            st1 {v25.h}[6], [RGB], 2
        .else
            .error unsupported macroblock size
        .endif
    .else
        .error unsupported bpp
    .endif
.endm

.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize, defsize

/*
 * 2-stage pipelined YCbCr->RGB conversion
 */

.macro do_yuv_to_rgb_stage1
    uaddw v6.8h, v2.8h, v4.8b      /* q3 = u - 128 */
    uaddw v8.8h, v2.8h, v5.8b      /* q2 = v - 128 */
    smull v20.4s, v6.4h, v1.h[1]   /* multiply by -11277 */
    smlal v20.4s, v8.4h, v1.h[2]   /* multiply by -23401 */
    smull2 v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2 v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    smull v24.4s, v8.4h, v1.h[0]   /* multiply by 22971 */
    smull2 v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
    smull v28.4s, v6.4h, v1.h[3]   /* multiply by 29033 */
    smull2 v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
.endm

.macro do_yuv_to_rgb_stage2
    rshrn v20.4h, v20.4s, #15
    rshrn2 v20.8h, v22.4s, #15
    rshrn v24.4h, v24.4s, #14
    rshrn2 v24.8h, v26.4s, #14
    rshrn v28.4h, v28.4s, #14
    rshrn2 v28.8h, v30.4s, #14
    uaddw v20.8h, v20.8h, v0.8b
    uaddw v24.8h, v24.8h, v0.8b
    uaddw v28.8h, v28.8h, v0.8b
.if \bpp != 16
    sqxtun v1\g_offs\defsize, v20.8h
    sqxtun v1\r_offs\defsize, v24.8h
    sqxtun v1\b_offs\defsize, v28.8h
.else
    sqshlu v21.8h, v20.8h, #8
    sqshlu v25.8h, v24.8h, #8
    sqshlu v29.8h, v28.8h, #8
    sri v25.8h, v21.8h, #5
    sri v25.8h, v29.8h, #11
.endif
.endm
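
/*
 * Note (added): for \bpp == 16 above, the sqshlu/sri pair packs the three
 * clamped 8-bit channels into RGB565. With r, g, b saturated to [0, 255]
 * and pre-shifted left by 8 (sqshlu #8):
 *
 *   v25 = r << 8                       rrrrrrrr00000000
 *   sri v25, g << 8, #5      -> v25 =  rrrrrgggggggg000  (top 5 bits kept)
 *   sri v25, b << 8, #11     -> v25 =  rrrrrggggggbbbbb
 *
 * i.e. pixel = ((r & 0xF8) << 8) | ((g & 0xFC) << 3) | (b >> 3).
 */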

.macro do_yuv_to_rgb_stage2_store_load_stage1
    rshrn v20.4h, v20.4s, #15
    rshrn v24.4h, v24.4s, #14
    rshrn v28.4h, v28.4s, #14
    ld1 {v4.8b}, [U], 8
    rshrn2 v20.8h, v22.4s, #15
    rshrn2 v24.8h, v26.4s, #14
    rshrn2 v28.8h, v30.4s, #14
    ld1 {v5.8b}, [V], 8
    uaddw v20.8h, v20.8h, v0.8b
    uaddw v24.8h, v24.8h, v0.8b
    uaddw v28.8h, v28.8h, v0.8b
.if \bpp != 16  /**************** rgb24/rgb32 *********************************/
    sqxtun v1\g_offs\defsize, v20.8h
    ld1 {v0.8b}, [Y], 8
    sqxtun v1\r_offs\defsize, v24.8h
    prfm pldl1keep, [U, #64]
    prfm pldl1keep, [V, #64]
    prfm pldl1keep, [Y, #64]
    sqxtun v1\b_offs\defsize, v28.8h
    uaddw v6.8h, v2.8h, v4.8b      /* v6.16b = u - 128 */
    uaddw v8.8h, v2.8h, v5.8b      /* q2 = v - 128 */
    smull v20.4s, v6.4h, v1.h[1]   /* multiply by -11277 */
    smlal v20.4s, v8.4h, v1.h[2]   /* multiply by -23401 */
    smull2 v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2 v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    smull v24.4s, v8.4h, v1.h[0]   /* multiply by 22971 */
    smull2 v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
.else  /**************************** rgb565 ***********************************/
    sqshlu v21.8h, v20.8h, #8
    sqshlu v25.8h, v24.8h, #8
    sqshlu v29.8h, v28.8h, #8
    uaddw v6.8h, v2.8h, v4.8b      /* v6.16b = u - 128 */
    uaddw v8.8h, v2.8h, v5.8b      /* q2 = v - 128 */
    ld1 {v0.8b}, [Y], 8
    smull v20.4s, v6.4h, v1.h[1]   /* multiply by -11277 */
    smlal v20.4s, v8.4h, v1.h[2]   /* multiply by -23401 */
    smull2 v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2 v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    sri v25.8h, v21.8h, #5
    smull v24.4s, v8.4h, v1.h[0]   /* multiply by 22971 */
    smull2 v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
    prfm pldl1keep, [U, #64]
    prfm pldl1keep, [V, #64]
    prfm pldl1keep, [Y, #64]
    sri v25.8h, v29.8h, #11
.endif
    do_store \bpp, 8
    smull v28.4s, v6.4h, v1.h[3]   /* multiply by 29033 */
    smull2 v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
.endm

.macro do_yuv_to_rgb
    do_yuv_to_rgb_stage1
    do_yuv_to_rgb_stage2
.endm
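
/*
 * Note (added): do_yuv_to_rgb_stage2_store_load_stage1 is the steady-state
 * body of the inner loop: it finishes iteration N (stage 2 + store) while
 * already loading the Y/U/V samples and issuing the stage-1 multiplies for
 * iteration N+1, hiding memory latency behind the arithmetic.
 */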

/* Apple gas crashes on adrl, work around that by using adr.
 * But this requires a copy of these constants for each function.
 */

.balign 16
Ljsimd_ycc_\colorid\()_neon_consts:
    .short 0, 0, 0, 0
    .short 22971, -11277, -23401, 29033
    .short -128, -128, -128, -128
    .short -128, -128, -128, -128

asm_function jsimd_ycc_\colorid\()_convert_neon
    OUTPUT_WIDTH .req x0
    INPUT_BUF    .req x1
    INPUT_ROW    .req x2
    OUTPUT_BUF   .req x3
    NUM_ROWS     .req x4

    INPUT_BUF0   .req x5
    INPUT_BUF1   .req x6
    INPUT_BUF2   .req x1

    RGB          .req x7
    Y            .req x8
    U            .req x9
    V            .req x10
    N            .req x15

    sub sp, sp, 336
    str x15, [sp], 16
    /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
    adr x15, Ljsimd_ycc_\colorid\()_neon_consts
    /* Save NEON registers */
    st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
    st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
    st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
    st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
    st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
    st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
    ld1 {v0.4h, v1.4h}, [x15], 16
    ld1 {v2.8h}, [x15]

    /* Save ARM registers and handle input arguments */
    /* push {x4, x5, x6, x7, x8, x9, x10, x30} */
    stp x4, x5, [sp], 16
    stp x6, x7, [sp], 16
    stp x8, x9, [sp], 16
    stp x10, x30, [sp], 16
    ldr INPUT_BUF0, [INPUT_BUF]
    ldr INPUT_BUF1, [INPUT_BUF, #8]
    ldr INPUT_BUF2, [INPUT_BUF, #16]
    .unreq INPUT_BUF

    /* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */
    movi v10.16b, #255
    movi v13.16b, #255

    /* Outer loop over scanlines */
    cmp NUM_ROWS, #1
    b.lt 9f
0:
    lsl x16, INPUT_ROW, #3
    ldr Y, [INPUT_BUF0, x16]
    ldr U, [INPUT_BUF1, x16]
    mov N, OUTPUT_WIDTH
    ldr V, [INPUT_BUF2, x16]
    add INPUT_ROW, INPUT_ROW, #1
    ldr RGB, [OUTPUT_BUF], #8

    /* Inner loop over pixels */
    subs N, N, #8
    b.lt 3f
    do_load 8
    do_yuv_to_rgb_stage1
    subs N, N, #8
    b.lt 2f
1:
    do_yuv_to_rgb_stage2_store_load_stage1
    subs N, N, #8
    b.ge 1b
2:
    do_yuv_to_rgb_stage2
    do_store \bpp, 8
    tst N, #7
    b.eq 8f
3:
    tst N, #4
    b.eq 3f
    do_load 4
3:
    tst N, #2
    b.eq 4f
    do_load 2
4:
    tst N, #1
    b.eq 5f
    do_load 1
5:
    do_yuv_to_rgb
    tst N, #4
    b.eq 6f
    do_store \bpp, 4
6:
    tst N, #2
    b.eq 7f
    do_store \bpp, 2
7:
    tst N, #1
    b.eq 8f
    do_store \bpp, 1
8:
    subs NUM_ROWS, NUM_ROWS, #1
    b.gt 0b
9:
    /* Restore all registers and return */
    sub sp, sp, #336
    ldr x15, [sp], 16
    ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
    ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
    ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
    ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
    ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
    ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
    /* pop {r4, r5, r6, r7, r8, r9, r10, pc} */
    ldp x4, x5, [sp], 16
    ldp x6, x7, [sp], 16
    ldp x8, x9, [sp], 16
    ldp x10, x30, [sp], 16
    br x30
    .unreq OUTPUT_WIDTH
    .unreq INPUT_ROW
    .unreq OUTPUT_BUF
    .unreq NUM_ROWS
    .unreq INPUT_BUF0
    .unreq INPUT_BUF1
    .unreq INPUT_BUF2
    .unreq RGB
    .unreq Y
    .unreq U
    .unreq V
    .unreq N

.purgem do_yuv_to_rgb
.purgem do_yuv_to_rgb_stage1
.purgem do_yuv_to_rgb_stage2
.purgem do_yuv_to_rgb_stage2_store_load_stage1
.endm

/*--------------------------------- id ----- bpp R  rsize G  gsize B  bsize defsize */
generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h,  1, .4h,  2, .4h,  .8b
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h,  1, .4h,  0, .4h,  .8b
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h,  2, .4h,  1, .4h,  .8b
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,  2, .4h,  3, .4h,  .8b
generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h,  0, .4h,  0, .4h,  .8b

.purgem do_load
.purgem do_store