/*
 * ARMv7 NEON optimizations for libjpeg-turbo
 *
 * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
 *                          All Rights Reserved.
 * Author:  Siarhei Siamashka <siarhei.siamashka@nokia.com>
 * Copyright (C) 2014, Siarhei Siamashka.  All Rights Reserved.
 * Copyright (C) 2014, Linaro Limited.  All Rights Reserved.
 * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
 * Copyright (C) 2015-2016, Matthieu Darbois.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
#endif

.text
.fpu neon
.arch armv7a
.object_arch armv4
.arm
.syntax unified


#define RESPECT_STRICT_ALIGNMENT  1


/*****************************************************************************/

/* Supplementary macro for setting function attributes */
.macro asm_function fname
#ifdef __APPLE__
    .globl _\fname
_\fname:
#else
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm

/* Transpose a block of 4x4 coefficients in four 64-bit registers */
.macro transpose_4x4 x0, x1, x2, x3
    vtrn.16         \x0, \x1
    vtrn.16         \x2, \x3
    vtrn.32         \x0, \x2
    vtrn.32         \x1, \x3
.endm
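
/*
 * For reference, the two VTRN.16 + two VTRN.32 steps above perform a full
 * 4x4 transpose of the 16-bit lanes.  With x0..x3 viewed as matrix rows:
 *
 *   before:  x0 = [ a0 a1 a2 a3 ]      after:  x0 = [ a0 b0 c0 d0 ]
 *            x1 = [ b0 b1 b2 b3 ]              x1 = [ a1 b1 c1 d1 ]
 *            x2 = [ c0 c1 c2 c3 ]              x2 = [ a2 b2 c2 d2 ]
 *            x3 = [ d0 d1 d2 d3 ]              x3 = [ a3 b3 c3 d3 ]
 */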


#define CENTERJSAMPLE  128

/*****************************************************************************/

/*
 * Perform dequantization and inverse DCT on one block of coefficients.
 *
 * GLOBAL(void)
 * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block,
 *                        JSAMPARRAY output_buf, JDIMENSION output_col)
 */

#define FIX_0_298631336  (2446)
#define FIX_0_390180644  (3196)
#define FIX_0_541196100  (4433)
#define FIX_0_765366865  (6270)
#define FIX_0_899976223  (7373)
#define FIX_1_175875602  (9633)
#define FIX_1_501321110  (12299)
#define FIX_1_847759065  (15137)
#define FIX_1_961570560  (16069)
#define FIX_2_053119869  (16819)
#define FIX_2_562915447  (20995)
#define FIX_3_072711026  (25172)

#define FIX_1_175875602_MINUS_1_961570560  (FIX_1_175875602 - FIX_1_961570560)
#define FIX_1_175875602_MINUS_0_390180644  (FIX_1_175875602 - FIX_0_390180644)
#define FIX_0_541196100_MINUS_1_847759065  (FIX_0_541196100 - FIX_1_847759065)
#define FIX_3_072711026_MINUS_2_562915447  (FIX_3_072711026 - FIX_2_562915447)
#define FIX_0_298631336_MINUS_0_899976223  (FIX_0_298631336 - FIX_0_899976223)
#define FIX_1_501321110_MINUS_0_899976223  (FIX_1_501321110 - FIX_0_899976223)
#define FIX_2_053119869_MINUS_2_562915447  (FIX_2_053119869 - FIX_2_562915447)
#define FIX_0_541196100_PLUS_0_765366865   (FIX_0_541196100 + FIX_0_765366865)

/*
 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
 * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
 */
#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \
{ \
  DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
  JLONG q1, q2, q3, q4, q5, q6, q7; \
  JLONG tmp11_plus_tmp2, tmp11_minus_tmp2; \
  \
  /* 1-D iDCT input data */ \
  row0 = xrow0; \
  row1 = xrow1; \
  row2 = xrow2; \
  row3 = xrow3; \
  row4 = xrow4; \
  row5 = xrow5; \
  row6 = xrow6; \
  row7 = xrow7; \
  \
  q5 = row7 + row3; \
  q4 = row5 + row1; \
  q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
       MULTIPLY(q4, FIX_1_175875602); \
  q7 = MULTIPLY(q5, FIX_1_175875602) + \
       MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
  q2 = MULTIPLY(row2, FIX_0_541196100) + \
       MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
  q4 = q6; \
  q3 = ((JLONG) row0 - (JLONG) row4) << 13; \
  q6 += MULTIPLY(row5, -FIX_2_562915447) + \
        MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
  /* now we can use q1 (reloadable constants have been used up) */ \
  q1 = q3 + q2; \
  q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
        MULTIPLY(row1, -FIX_0_899976223); \
  q5 = q7; \
  q1 = q1 + q6; \
  q7 += MULTIPLY(row7, -FIX_0_899976223) + \
        MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
  \
  /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
  tmp11_plus_tmp2 = q1; \
  row1 = 0; \
  \
  q1 = q1 - q6; \
  q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
        MULTIPLY(row3, -FIX_2_562915447); \
  q1 = q1 - q6; \
  q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
       MULTIPLY(row6, FIX_0_541196100); \
  q3 = q3 - q2; \
  \
  /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
  tmp11_minus_tmp2 = q1; \
  \
  q1 = ((JLONG) row0 + (JLONG) row4) << 13; \
  q2 = q1 + q6; \
  q1 = q1 - q6; \
  \
  /* pick up the results */ \
  tmp0 = q4; \
  tmp1 = q5; \
  tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
  tmp3 = q7; \
  tmp10 = q2; \
  tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
  tmp12 = q3; \
  tmp13 = q1; \
}
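
/*
 * The FIX_* values above are the usual libjpeg 13-bit fixed-point constants,
 * i.e. roughly FIX(x) = (JLONG)(x * 8192 + 0.5), e.g.
 *
 *   FIX(0.298631336) = 2446,  FIX(1.175875602) = 9633
 *
 * so MULTIPLY(v, FIX(x)) computes v * x scaled up by 2^13, and that extra
 * scaling is removed again by the descaling (right shift) steps further down.
 */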

#define XFIX_0_899976223                    d0[0]
#define XFIX_0_541196100                    d0[1]
#define XFIX_2_562915447                    d0[2]
#define XFIX_0_298631336_MINUS_0_899976223  d0[3]
#define XFIX_1_501321110_MINUS_0_899976223  d1[0]
#define XFIX_2_053119869_MINUS_2_562915447  d1[1]
#define XFIX_0_541196100_PLUS_0_765366865   d1[2]
#define XFIX_1_175875602                    d1[3]
#define XFIX_1_175875602_MINUS_0_390180644  d2[0]
#define XFIX_0_541196100_MINUS_1_847759065  d2[1]
#define XFIX_3_072711026_MINUS_2_562915447  d2[2]
#define XFIX_1_175875602_MINUS_1_961570560  d2[3]

.balign 16
jsimd_idct_islow_neon_consts:
    .short FIX_0_899976223                    /* d0[0] */
    .short FIX_0_541196100                    /* d0[1] */
    .short FIX_2_562915447                    /* d0[2] */
    .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
    .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
    .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
    .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
    .short FIX_1_175875602                    /* d1[3] */
    /* reloadable constants */
    .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
    .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
    .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
    .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */

asm_function jsimd_idct_islow_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req r1
    TMP3            .req r2
    TMP4            .req ip

    ROW0L           .req d16
    ROW0R           .req d17
    ROW1L           .req d18
    ROW1R           .req d19
    ROW2L           .req d20
    ROW2R           .req d21
    ROW3L           .req d22
    ROW3R           .req d23
    ROW4L           .req d24
    ROW4R           .req d25
    ROW5L           .req d26
    ROW5R           .req d27
    ROW6L           .req d28
    ROW6R           .req d29
    ROW7L           .req d30
    ROW7R           .req d31

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d16     | d17     ( q8  )
     *   1 | d18     | d19     ( q9  )
     *   2 | d20     | d21     ( q10 )
     *   3 | d22     | d23     ( q11 )
     *   4 | d24     | d25     ( q12 )
     *   5 | d26     | d27     ( q13 )
     *   6 | d28     | d29     ( q14 )
     *   7 | d30     | d31     ( q15 )
     */
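    /* The interleaved loads and multiplies below are a plain element-wise
     * dequantization; in C terms, roughly:
     *
     *   for (i = 0; i < 64; i++)
     *     row[i] = (DCTELEM)(coef_block[i] * dct_table[i]);
     *
     * except that the products stay in q8-q15 instead of being spilled to a
     * temporary workspace.
     */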
    adr             ip, jsimd_idct_islow_neon_consts
    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    vmul.s16        q8, q8, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q9, q9, q1
    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    vmul.s16        q10, q10, q2
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vmul.s16        q11, q11, q3
    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    vmul.s16        q12, q12, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q14, q14, q2
    vmul.s16        q13, q13, q1
    vld1.16         {d0, d1, d2, d3}, [ip, :128]  /* load constants */
    add             ip, ip, #16
    vmul.s16        q15, q15, q3
    vpush           {d8-d15}  /* save NEON registers */
    /* 1-D IDCT, pass 1, left 4x8 half */
    vadd.s16        d4, ROW7L, ROW3L
    vadd.s16        d5, ROW5L, ROW1L
    vmull.s16       q6, d4, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16       q6, d5, XFIX_1_175875602
    vmull.s16       q7, d4, XFIX_1_175875602
    /* Check for the zero coefficients in the right 4x8 half */
    push            {r4, r5}
    vmlal.s16       q7, d5, XFIX_1_175875602_MINUS_0_390180644
    vsubl.s16       q3, ROW0L, ROW4L
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
    vmull.s16       q2, ROW2L, XFIX_0_541196100
    vmlal.s16       q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
    orr             r0, r4, r5
    vmov            q4, q6
    vmlsl.s16       q6, ROW5L, XFIX_2_562915447
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vshl.s32        q3, q3, #13
    orr             r0, r0, r4
    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
    orr             r0, r0, r5
    vadd.s32        q1, q3, q2
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
    vmov            q5, q7
    vadd.s32        q1, q1, q6
    orr             r0, r0, r4
    vmlsl.s16       q7, ROW7L, XFIX_0_899976223
    orr             r0, r0, r5
    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vrshrn.s32      ROW1L, q1, #11
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
    vsub.s32        q1, q1, q6
    vmlal.s16       q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
    orr             r0, r0, r4
    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
    orr             r0, r0, r5
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
    vmlal.s16       q6, ROW6L, XFIX_0_541196100
    vsub.s32        q3, q3, q2
    orr             r0, r0, r4
    vrshrn.s32      ROW6L, q1, #11
    orr             r0, r0, r5
    vadd.s32        q1, q3, q5
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
    vsub.s32        q3, q3, q5
    vaddl.s16       q5, ROW0L, ROW4L
    orr             r0, r0, r4
    vrshrn.s32      ROW2L, q1, #11
    orr             r0, r0, r5
    vrshrn.s32      ROW5L, q3, #11
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
    vshl.s32        q5, q5, #13
    vmlal.s16       q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
    orr             r0, r0, r4
    vadd.s32        q2, q5, q6
    orrs            r0, r0, r5
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    orr             r0, r4, r5
    vsub.s32        q3, q1, q4
    pop             {r4, r5}
    vrshrn.s32      ROW7L, q2, #11
    vrshrn.s32      ROW3L, q5, #11
    vrshrn.s32      ROW0L, q6, #11
    vrshrn.s32      ROW4L, q3, #11

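    /* The scalar ldrd/orr instructions interleaved above check whether the
     * right 4x8 half of the block (columns 4-7) contains any non-zero
     * coefficients; roughly:
     *
     *   flags <- coef[1*8+4] | coef[1*8+5] | ... | coef[7*8+7]   (rows 1-7)
     *   r0    <- coef[0*8+4] | ... | coef[0*8+7]   (row 0, tested later at 3:)
     *
     * If rows 1-7 of the right half are all zero (common for strongly
     * compressed images), the branch below takes the cheaper path at 3:.
     */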
    beq             3f  /* Go to do some special handling for the sparse
                           right 4x8 half */

    /* 1-D IDCT, pass 1, right 4x8 half */
    vld1.s16        {d2}, [ip, :64]  /* reload constants */
    vadd.s16        d10, ROW7R, ROW3R
    vadd.s16        d8, ROW5R, ROW1R
    /* Transpose left 4x8 half */
    vtrn.16         ROW6L, ROW7L
    vmull.s16       q6, d10, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16       q6, d8, XFIX_1_175875602
    vtrn.16         ROW2L, ROW3L
    vmull.s16       q7, d10, XFIX_1_175875602
    vmlal.s16       q7, d8, XFIX_1_175875602_MINUS_0_390180644
    vtrn.16         ROW0L, ROW1L
    vsubl.s16       q3, ROW0R, ROW4R
    vmull.s16       q2, ROW2R, XFIX_0_541196100
    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
    vtrn.16         ROW4L, ROW5L
    vmov            q4, q6
    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
    vmlal.s16       q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
    vtrn.32         ROW1L, ROW3L
    vshl.s32        q3, q3, #13
    vmlsl.s16       q4, ROW1R, XFIX_0_899976223
    vtrn.32         ROW4L, ROW6L
    vadd.s32        q1, q3, q2
    vmov            q5, q7
    vadd.s32        q1, q1, q6
    vtrn.32         ROW0L, ROW2L
    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
    vmlal.s16       q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
    vrshrn.s32      ROW1R, q1, #11
    vtrn.32         ROW5L, ROW7L
    vsub.s32        q1, q1, q6
    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
    vmlsl.s16       q5, ROW3R, XFIX_2_562915447
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
    vmlal.s16       q6, ROW6R, XFIX_0_541196100
    vsub.s32        q3, q3, q2
    vrshrn.s32      ROW6R, q1, #11
    vadd.s32        q1, q3, q5
    vsub.s32        q3, q3, q5
    vaddl.s16       q5, ROW0R, ROW4R
    vrshrn.s32      ROW2R, q1, #11
    vrshrn.s32      ROW5R, q3, #11
    vshl.s32        q5, q5, #13
    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
    vadd.s32        q2, q5, q6
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    vsub.s32        q3, q1, q4
    vrshrn.s32      ROW7R, q2, #11
    vrshrn.s32      ROW3R, q5, #11
    vrshrn.s32      ROW0R, q6, #11
    vrshrn.s32      ROW4R, q3, #11
    /* Transpose right 4x8 half */
    vtrn.16         ROW6R, ROW7R
    vtrn.16         ROW2R, ROW3R
    vtrn.16         ROW0R, ROW1R
    vtrn.16         ROW4R, ROW5R
    vtrn.32         ROW1R, ROW3R
    vtrn.32         ROW4R, ROW6R
    vtrn.32         ROW0R, ROW2R
    vtrn.32         ROW5R, ROW7R

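    /* A note on descaling: pass 1 above keeps its results scaled up by
     * PASS1_BITS = 2, hence the rounding narrowing shifts by
     * CONST_BITS - PASS1_BITS = 11.  Pass 2 below has to remove
     * CONST_BITS + PASS1_BITS + 3 = 18 bits in total (the values used by
     * jpeg_idct_islow() in jidctint.c); it drops 16 of them with the
     * truncating vshrn #16 narrowings and the remaining 2 with the
     * vqrshrn #2 shifts in the epilogue at 2:.
     */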
1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
    vld1.s16        {d2}, [ip, :64]  /* reload constants */
    vmull.s16       q6, ROW1R, XFIX_1_175875602  /* ROW5L <-> ROW1R */
    vmlal.s16       q6, ROW1L, XFIX_1_175875602
    vmlal.s16       q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16       q7, ROW3R, XFIX_1_175875602  /* ROW7L <-> ROW3R */
    vmlal.s16       q7, ROW3L, XFIX_1_175875602
    vmlal.s16       q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
    vsubl.s16       q3, ROW0L, ROW0R  /* ROW4L <-> ROW0R */
    vmull.s16       q2, ROW2L, XFIX_0_541196100
    vmlal.s16       q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065  /* ROW6L <-> ROW2R */
    vmov            q4, q6
    vmlsl.s16       q6, ROW1R, XFIX_2_562915447  /* ROW5L <-> ROW1R */
    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vshl.s32        q3, q3, #13
    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
    vadd.s32        q1, q3, q2
    vmov            q5, q7
    vadd.s32        q1, q1, q6
    vmlsl.s16       q7, ROW3R, XFIX_0_899976223  /* ROW7L <-> ROW3R */
    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vshrn.s32       ROW1L, q1, #16
    vsub.s32        q1, q1, q6
    vmlal.s16       q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447  /* ROW5L <-> ROW1R */
    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    vmlal.s16       q6, ROW2R, XFIX_0_541196100  /* ROW6L <-> ROW2R */
    vsub.s32        q3, q3, q2
    vshrn.s32       ROW2R, q1, #16  /* ROW6L <-> ROW2R */
    vadd.s32        q1, q3, q5
    vsub.s32        q3, q3, q5
    vaddl.s16       q5, ROW0L, ROW0R  /* ROW4L <-> ROW0R */
    vshrn.s32       ROW2L, q1, #16
    vshrn.s32       ROW1R, q3, #16  /* ROW5L <-> ROW1R */
    vshl.s32        q5, q5, #13
    vmlal.s16       q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223  /* ROW7L <-> ROW3R */
    vadd.s32        q2, q5, q6
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    vsub.s32        q3, q1, q4
    vshrn.s32       ROW3R, q2, #16  /* ROW7L <-> ROW3R */
    vshrn.s32       ROW3L, q5, #16
    vshrn.s32       ROW0L, q6, #16
    vshrn.s32       ROW0R, q3, #16  /* ROW4L <-> ROW0R */
    /* 1-D IDCT, pass 2, right 4x8 half */
    vld1.s16        {d2}, [ip, :64]  /* reload constants */
    vmull.s16       q6, ROW5R, XFIX_1_175875602
    vmlal.s16       q6, ROW5L, XFIX_1_175875602  /* ROW5L <-> ROW1R */
    vmlal.s16       q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
    vmull.s16       q7, ROW7R, XFIX_1_175875602
    vmlal.s16       q7, ROW7L, XFIX_1_175875602  /* ROW7L <-> ROW3R */
    vmlal.s16       q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
    vsubl.s16       q3, ROW4L, ROW4R  /* ROW4L <-> ROW0R */
    vmull.s16       q2, ROW6L, XFIX_0_541196100  /* ROW6L <-> ROW2R */
    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
    vmov            q4, q6
    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447  /* ROW7L <-> ROW3R */
    vshl.s32        q3, q3, #13
    vmlsl.s16       q4, ROW5L, XFIX_0_899976223  /* ROW5L <-> ROW1R */
    vadd.s32        q1, q3, q2
    vmov            q5, q7
    vadd.s32        q1, q1, q6
    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223  /* ROW5L <-> ROW1R */
    vshrn.s32       ROW5L, q1, #16  /* ROW5L <-> ROW1R */
    vsub.s32        q1, q1, q6
    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
    vmlsl.s16       q5, ROW7L, XFIX_2_562915447  /* ROW7L <-> ROW3R */
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865  /* ROW6L <-> ROW2R */
    vmlal.s16       q6, ROW6R, XFIX_0_541196100
    vsub.s32        q3, q3, q2
    vshrn.s32       ROW6R, q1, #16
    vadd.s32        q1, q3, q5
    vsub.s32        q3, q3, q5
    vaddl.s16       q5, ROW4L, ROW4R  /* ROW4L <-> ROW0R */
    vshrn.s32       ROW6L, q1, #16  /* ROW6L <-> ROW2R */
    vshrn.s32       ROW5R, q3, #16
    vshl.s32        q5, q5, #13
    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
    vadd.s32        q2, q5, q6
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    vsub.s32        q3, q1, q4
    vshrn.s32       ROW7R, q2, #16
    vshrn.s32       ROW7L, q5, #16  /* ROW7L <-> ROW3R */
    vshrn.s32       ROW4L, q6, #16  /* ROW4L <-> ROW0R */
    vshrn.s32       ROW4R, q3, #16

2:  /* Descale to 8-bit and range limit */
    vqrshrn.s16     d16, q8, #2
    vqrshrn.s16     d17, q9, #2
    vqrshrn.s16     d18, q10, #2
    vqrshrn.s16     d19, q11, #2
    vpop            {d8-d15}  /* restore NEON registers */
    vqrshrn.s16     d20, q12, #2
    /* Transpose the final 8-bit samples and do signed->unsigned conversion */
    vtrn.16         q8, q9
    vqrshrn.s16     d21, q13, #2
    vqrshrn.s16     d22, q14, #2
    vmov.u8         q0, #(CENTERJSAMPLE)
    vqrshrn.s16     d23, q15, #2
    vtrn.8          d16, d17
    vtrn.8          d18, d19
    vadd.u8         q8, q8, q0
    vadd.u8         q9, q9, q0
    vtrn.16         q10, q11
    /* Store results to the output buffer */
    ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    vst1.8          {d16}, [TMP1]
    vtrn.8          d20, d21
    vst1.8          {d17}, [TMP2]
    ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    vst1.8          {d18}, [TMP1]
    vadd.u8         q10, q10, q0
    vst1.8          {d19}, [TMP2]
    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL
    vtrn.8          d22, d23
    vst1.8          {d20}, [TMP1]
    vadd.u8         q11, q11, q0
    vst1.8          {d21}, [TMP2]
    vst1.8          {d22}, [TMP3]
    vst1.8          {d23}, [TMP4]
    bx              lr

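    /* The epilogue at 2: above walks the output JSAMPARRAY one row pointer
     * at a time; in C terms it is roughly
     *
     *   for (row = 0; row < 8; row++)
     *     memcpy(output_buf[row] + output_col, pixels[row], 8);
     *
     * where pixels[row] stands for the corresponding d16-d23 register after
     * the final descale/transpose (one 8-byte output row per d register).
     */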
3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */

    /* Transpose left 4x8 half */
    vtrn.16         ROW6L, ROW7L
    vtrn.16         ROW2L, ROW3L
    vtrn.16         ROW0L, ROW1L
    vtrn.16         ROW4L, ROW5L
    vshl.s16        ROW0R, ROW0R, #2  /* PASS1_BITS */
    vtrn.32         ROW1L, ROW3L
    vtrn.32         ROW4L, ROW6L
    vtrn.32         ROW0L, ROW2L
    vtrn.32         ROW5L, ROW7L

    cmp             r0, #0
    beq             4f  /* Right 4x8 half has all zeros, go to 'sparse' second
                           pass */

    /* Only row 0 is non-zero for the right 4x8 half */
    vdup.s16        ROW1R, ROW0R[1]
    vdup.s16        ROW2R, ROW0R[2]
    vdup.s16        ROW3R, ROW0R[3]
    vdup.s16        ROW4R, ROW0R[0]
    vdup.s16        ROW5R, ROW0R[1]
    vdup.s16        ROW6R, ROW0R[2]
    vdup.s16        ROW7R, ROW0R[3]
    vdup.s16        ROW0R, ROW0R[0]
    b               1b  /* Go to 'normal' second pass */

4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
    vld1.s16        {d2}, [ip, :64]  /* reload constants */
    vmull.s16       q6, ROW1L, XFIX_1_175875602
    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16       q7, ROW3L, XFIX_1_175875602
    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
    vmull.s16       q2, ROW2L, XFIX_0_541196100
    vshll.s16       q3, ROW0L, #13
    vmov            q4, q6
    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
    vadd.s32        q1, q3, q2
    vmov            q5, q7
    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    vadd.s32        q1, q1, q6
    vadd.s32        q6, q6, q6
    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
    vshrn.s32       ROW1L, q1, #16
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    vsub.s32        q3, q3, q2
    vshrn.s32       ROW2R, q1, #16  /* ROW6L <-> ROW2R */
    vadd.s32        q1, q3, q5
    vsub.s32        q3, q3, q5
    vshll.s16       q5, ROW0L, #13
    vshrn.s32       ROW2L, q1, #16
    vshrn.s32       ROW1R, q3, #16  /* ROW5L <-> ROW1R */
    vadd.s32        q2, q5, q6
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    vsub.s32        q3, q1, q4
    vshrn.s32       ROW3R, q2, #16  /* ROW7L <-> ROW3R */
    vshrn.s32       ROW3L, q5, #16
    vshrn.s32       ROW0L, q6, #16
    vshrn.s32       ROW0R, q3, #16  /* ROW4L <-> ROW0R */
    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
    vld1.s16        {d2}, [ip, :64]  /* reload constants */
    vmull.s16       q6, ROW5L, XFIX_1_175875602
    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
    vmull.s16       q7, ROW7L, XFIX_1_175875602
    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
    vmull.s16       q2, ROW6L, XFIX_0_541196100
    vshll.s16       q3, ROW4L, #13
    vmov            q4, q6
    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
    vmlsl.s16       q4, ROW5L, XFIX_0_899976223
    vadd.s32        q1, q3, q2
    vmov            q5, q7
    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
    vadd.s32        q1, q1, q6
    vadd.s32        q6, q6, q6
    vmlsl.s16       q5, ROW7L, XFIX_2_562915447
    vshrn.s32       ROW5L, q1, #16  /* ROW5L <-> ROW1R */
    vsub.s32        q1, q1, q6
    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
    vsub.s32        q3, q3, q2
    vshrn.s32       ROW6R, q1, #16
    vadd.s32        q1, q3, q5
    vsub.s32        q3, q3, q5
    vshll.s16       q5, ROW4L, #13
    vshrn.s32       ROW6L, q1, #16  /* ROW6L <-> ROW2R */
    vshrn.s32       ROW5R, q3, #16
    vadd.s32        q2, q5, q6
    vsub.s32        q1, q5, q6
    vadd.s32        q6, q2, q7
    vsub.s32        q2, q2, q7
    vadd.s32        q5, q1, q4
    vsub.s32        q3, q1, q4
    vshrn.s32       ROW7R, q2, #16
    vshrn.s32       ROW7L, q5, #16  /* ROW7L <-> ROW3R */
    vshrn.s32       ROW4L, q6, #16  /* ROW4L <-> ROW0R */
    vshrn.s32       ROW4R, q3, #16
    b               2b  /* Go to epilogue */

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4

    .unreq          ROW0L
    .unreq          ROW0R
    .unreq          ROW1L
    .unreq          ROW1R
    .unreq          ROW2L
    .unreq          ROW2R
    .unreq          ROW3L
    .unreq          ROW3R
    .unreq          ROW4L
    .unreq          ROW4R
    .unreq          ROW5L
    .unreq          ROW5R
    .unreq          ROW6L
    .unreq          ROW6R
    .unreq          ROW7L
    .unreq          ROW7R


/*****************************************************************************/

/*
 * jsimd_idct_ifast_neon
 *
 * This function contains a fast, but not so accurate, integer implementation
 * of the inverse DCT (Discrete Cosine Transform). It uses the same
 * calculations and produces exactly the same output as IJG's original
 * 'jpeg_idct_ifast' function from jidctfst.c.
 *
 * Normally a 1-D AAN DCT needs 5 multiplications and 29 additions.
 * With ARM NEON, however, some extra additions are required because the
 * VQDMULH instruction can't handle constants larger than 1. So expressions
 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
 * which introduces an extra addition. Overall, there are 6 extra additions
 * per 1-D IDCT pass, for a total of 5 VQDMULH and 35 VADD/VSUB instructions.
 */
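
/*
 * The constants below follow from that trick: VQDMULH.S16 computes roughly
 * (a * b * 2) >> 16, i.e. it multiplies by b / 2^15, so each multiplier is
 * stored as a Q15 value with the integer part already subtracted out.
 * For example:
 *
 *   XFIX_1_082392200 = 277 * 128 - 256 * 128 = 2688
 *   2688 / 32768 ~= 0.082, hence x * 1.082392200 ~= x + VQDMULH(x, 2688)
 *
 * and similarly x * 2.613125930 ~= 2 * x + VQDMULH(x, XFIX_2_613125930).
 */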

#define XFIX_1_082392200  d0[0]
#define XFIX_1_414213562  d0[1]
#define XFIX_1_847759065  d0[2]
#define XFIX_2_613125930  d0[3]

.balign 16
jsimd_idct_ifast_neon_consts:
    .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
    .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
    .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
    .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */

asm_function jsimd_idct_ifast_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req r1
    TMP3            .req r2
    TMP4            .req ip

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d16     | d17     ( q8  )
     *   1 | d18     | d19     ( q9  )
     *   2 | d20     | d21     ( q10 )
     *   3 | d22     | d23     ( q11 )
     *   4 | d24     | d25     ( q12 )
     *   5 | d26     | d27     ( q13 )
     *   6 | d28     | d29     ( q14 )
     *   7 | d30     | d31     ( q15 )
     */
    adr             ip, jsimd_idct_ifast_neon_consts
    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    vmul.s16        q8, q8, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q9, q9, q1
    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    vmul.s16        q10, q10, q2
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vmul.s16        q11, q11, q3
    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    vmul.s16        q12, q12, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q14, q14, q2
    vmul.s16        q13, q13, q1
    vld1.16         {d0}, [ip, :64]  /* load constants */
    vmul.s16        q15, q15, q3
    vpush           {d8-d13}  /* save NEON registers */
    /* 1-D IDCT, pass 1 */
    vsub.s16        q2, q10, q14
    vadd.s16        q14, q10, q14
    vsub.s16        q1, q11, q13
    vadd.s16        q13, q11, q13
    vsub.s16        q5, q9, q15
    vadd.s16        q15, q9, q15
    vqdmulh.s16     q4, q2, XFIX_1_414213562
    vqdmulh.s16     q6, q1, XFIX_2_613125930
    vadd.s16        q3, q1, q1
    vsub.s16        q1, q5, q1
    vadd.s16        q10, q2, q4
    vqdmulh.s16     q4, q1, XFIX_1_847759065
    vsub.s16        q2, q15, q13
    vadd.s16        q3, q3, q6
    vqdmulh.s16     q6, q2, XFIX_1_414213562
    vadd.s16        q1, q1, q4
    vqdmulh.s16     q4, q5, XFIX_1_082392200
    vsub.s16        q10, q10, q14
    vadd.s16        q2, q2, q6
    vsub.s16        q6, q8, q12
    vadd.s16        q12, q8, q12
    vadd.s16        q9, q5, q4
    vadd.s16        q5, q6, q10
    vsub.s16        q10, q6, q10
    vadd.s16        q6, q15, q13
    vadd.s16        q8, q12, q14
    vsub.s16        q3, q6, q3
    vsub.s16        q12, q12, q14
    vsub.s16        q3, q3, q1
    vsub.s16        q1, q9, q1
    vadd.s16        q2, q3, q2
    vsub.s16        q15, q8, q6
    vadd.s16        q1, q1, q2
    vadd.s16        q8, q8, q6
    vadd.s16        q14, q5, q3
    vsub.s16        q9, q5, q3
    vsub.s16        q13, q10, q2
    vadd.s16        q10, q10, q2
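    /* The vtrn.16/vtrn.32/vswp sequence below (interleaved with the
     * remaining butterfly adds/subs) transposes the 8x8 block of 16-bit
     * values held in q8-q15: vtrn.16 and vtrn.32 exchange the 16- and
     * 32-bit sub-blocks within pairs of rows, and the vswp instructions
     * finish the job by swapping 64-bit halves across register pairs
     * (e.g. d28 <-> d21).
     */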
    /* Transpose */
    vtrn.16         q8, q9
    vsub.s16        q11, q12, q1
    vtrn.16         q14, q15
    vadd.s16        q12, q12, q1
    vtrn.16         q10, q11
    vtrn.16         q12, q13
    vtrn.32         q9, q11
    vtrn.32         q12, q14
    vtrn.32         q8, q10
    vtrn.32         q13, q15
    vswp            d28, d21
    vswp            d26, d19
    /* 1-D IDCT, pass 2 */
    vsub.s16        q2, q10, q14
    vswp            d30, d23
    vadd.s16        q14, q10, q14
    vswp            d24, d17
    vsub.s16        q1, q11, q13
    vadd.s16        q13, q11, q13
    vsub.s16        q5, q9, q15
    vadd.s16        q15, q9, q15
    vqdmulh.s16     q4, q2, XFIX_1_414213562
    vqdmulh.s16     q6, q1, XFIX_2_613125930
    vadd.s16        q3, q1, q1
    vsub.s16        q1, q5, q1
    vadd.s16        q10, q2, q4
    vqdmulh.s16     q4, q1, XFIX_1_847759065
    vsub.s16        q2, q15, q13
    vadd.s16        q3, q3, q6
    vqdmulh.s16     q6, q2, XFIX_1_414213562
    vadd.s16        q1, q1, q4
    vqdmulh.s16     q4, q5, XFIX_1_082392200
    vsub.s16        q10, q10, q14
    vadd.s16        q2, q2, q6
    vsub.s16        q6, q8, q12
    vadd.s16        q12, q8, q12
    vadd.s16        q9, q5, q4
    vadd.s16        q5, q6, q10
    vsub.s16        q10, q6, q10
    vadd.s16        q6, q15, q13
    vadd.s16        q8, q12, q14
    vsub.s16        q3, q6, q3
    vsub.s16        q12, q12, q14
    vsub.s16        q3, q3, q1
    vsub.s16        q1, q9, q1
    vadd.s16        q2, q3, q2
    vsub.s16        q15, q8, q6
    vadd.s16        q1, q1, q2
    vadd.s16        q8, q8, q6
    vadd.s16        q14, q5, q3
    vsub.s16        q9, q5, q3
    vsub.s16        q13, q10, q2
    vpop            {d8-d13}  /* restore NEON registers */
    vadd.s16        q10, q10, q2
    vsub.s16        q11, q12, q1
    vadd.s16        q12, q12, q1
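    /* The saturating narrowing shifts below drop the remaining scaling bits
     * (PASS1_BITS + 3 = 5), and adding CENTERJSAMPLE (0x80) afterwards turns
     * the signed result into unsigned samples; this takes the place of the
     * range_limit[] table lookup used by the C implementation.
     */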
    /* Descale to 8-bit and range limit */
    vmov.u8         q0, #0x80
    vqshrn.s16      d16, q8, #5
    vqshrn.s16      d17, q9, #5
    vqshrn.s16      d18, q10, #5
    vqshrn.s16      d19, q11, #5
    vqshrn.s16      d20, q12, #5
    vqshrn.s16      d21, q13, #5
    vqshrn.s16      d22, q14, #5
    vqshrn.s16      d23, q15, #5
    vadd.u8         q8, q8, q0
    vadd.u8         q9, q9, q0
    vadd.u8         q10, q10, q0
    vadd.u8         q11, q11, q0
    /* Transpose the final 8-bit samples */
    vtrn.16         q8, q9
    vtrn.16         q10, q11
    vtrn.32         q8, q10
    vtrn.32         q9, q11
    vtrn.8          d16, d17
    vtrn.8          d18, d19
    /* Store results to the output buffer */
    ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    vst1.8          {d16}, [TMP1]
    vst1.8          {d17}, [TMP2]
    ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    vst1.8          {d18}, [TMP1]
    vtrn.8          d20, d21
    vst1.8          {d19}, [TMP2]
    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL
    vst1.8          {d20}, [TMP1]
    vtrn.8          d22, d23
    vst1.8          {d21}, [TMP2]
    vst1.8          {d22}, [TMP3]
    vst1.8          {d23}, [TMP4]
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4


/*****************************************************************************/

/*
 * jsimd_idct_4x4_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse DCT, which
 *       requires far fewer arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON optimized function is
 *       bit-exact compatibility with jpeg-6b.
 *
 * TODO: slightly better instruction scheduling can be achieved by expanding
 *       the idct_helper/transpose_4x4 macros and reordering instructions,
 *       but readability will suffer somewhat.
 */

#define CONST_BITS  13

#define FIX_0_211164243  (1730)   /* FIX(0.211164243) */
#define FIX_0_509795579  (4176)   /* FIX(0.509795579) */
#define FIX_0_601344887  (4926)   /* FIX(0.601344887) */
#define FIX_0_720959822  (5906)   /* FIX(0.720959822) */
#define FIX_0_765366865  (6270)   /* FIX(0.765366865) */
#define FIX_0_850430095  (6967)   /* FIX(0.850430095) */
#define FIX_0_899976223  (7373)   /* FIX(0.899976223) */
#define FIX_1_061594337  (8697)   /* FIX(1.061594337) */
#define FIX_1_272758580  (10426)  /* FIX(1.272758580) */
#define FIX_1_451774981  (11893)  /* FIX(1.451774981) */
#define FIX_1_847759065  (15137)  /* FIX(1.847759065) */
#define FIX_2_172734803  (17799)  /* FIX(2.172734803) */
#define FIX_2_562915447  (20995)  /* FIX(2.562915447) */
#define FIX_3_624509785  (29692)  /* FIX(3.624509785) */

.balign 16
jsimd_idct_4x4_neon_consts:
    .short FIX_1_847759065      /* d0[0] */
    .short -FIX_0_765366865     /* d0[1] */
    .short -FIX_0_211164243     /* d0[2] */
    .short FIX_1_451774981      /* d0[3] */
    .short -FIX_2_172734803     /* d1[0] */
    .short FIX_1_061594337      /* d1[1] */
    .short -FIX_0_509795579     /* d1[2] */
    .short -FIX_0_601344887     /* d1[3] */
    .short FIX_0_899976223      /* d2[0] */
    .short FIX_2_562915447      /* d2[1] */
    .short 1 << (CONST_BITS+1)  /* d2[2] */
    .short 0                    /* d2[3] */
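
/* The idct_helper macro below takes the descale amount as a parameter
 * because pass 2 needs a shift that a narrowing shift can't encode:
 * vrshrn.s32 only accepts immediate shifts of 1..16 when narrowing 32-bit
 * lanes to 16-bit, so the \shift > 16 case (19 in pass 2) is split into a
 * plain vrshr.s32 followed by vmovn.s32.
 */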

.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    vmull.s16       q14, \x4, d2[2]
    vmlal.s16       q14, \x8, d0[0]
    vmlal.s16       q14, \x14, d0[1]

    vmull.s16       q13, \x16, d1[2]
    vmlal.s16       q13, \x12, d1[3]
    vmlal.s16       q13, \x10, d2[0]
    vmlal.s16       q13, \x6, d2[1]

    vmull.s16       q15, \x4, d2[2]
    vmlsl.s16       q15, \x8, d0[0]
    vmlsl.s16       q15, \x14, d0[1]

    vmull.s16       q12, \x16, d0[2]
    vmlal.s16       q12, \x12, d0[3]
    vmlal.s16       q12, \x10, d1[0]
    vmlal.s16       q12, \x6, d1[1]

    vadd.s32        q10, q14, q13
    vsub.s32        q14, q14, q13

    .if \shift > 16
        vrshr.s32   q10, q10, #\shift
        vrshr.s32   q14, q14, #\shift
        vmovn.s32   \y26, q10
        vmovn.s32   \y29, q14
    .else
        vrshrn.s32  \y26, q10, #\shift
        vrshrn.s32  \y29, q14, #\shift
    .endif

    vadd.s32        q10, q15, q12
    vsub.s32        q15, q15, q12

    .if \shift > 16
        vrshr.s32   q10, q10, #\shift
        vrshr.s32   q15, q15, #\shift
        vmovn.s32   \y27, q10
        vmovn.s32   \y28, q15
    .else
        vrshrn.s32  \y27, q10, #\shift
        vrshrn.s32  \y28, q15, #\shift
    .endif
.endm

asm_function jsimd_idct_4x4_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req r1
    TMP3            .req r2
    TMP4            .req ip

    vpush           {d8-d15}

    /* Load constants (d3 is just used for padding) */
    adr             TMP4, jsimd_idct_4x4_neon_consts
    vld1.16         {d0, d1, d2, d3}, [TMP4, :128]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d4      | d5
     *   1 | d6      | d7
     *   2 | d8      | d9
     *   3 | d10     | d11
     *   4 | -       | -
     *   5 | d12     | d13
     *   6 | d14     | d15
     *   7 | d16     | d17
     */
    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
    vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16
    vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
    /* dequantize */
    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
    vmul.s16        q2, q2, q9
    vld1.16         {d22, d23, d24, d25}, [DCT_TABLE, :128]!
    vmul.s16        q3, q3, q10
    vmul.s16        q4, q4, q11
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d26, d27, d28, d29}, [DCT_TABLE, :128]!
    vmul.s16        q5, q5, q12
    vmul.s16        q6, q6, q13
    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16        q7, q7, q14
    vmul.s16        q8, q8, q15

    /* Pass 1 */
    idct_helper     d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
    transpose_4x4   d4, d6, d8, d10
    idct_helper     d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
    transpose_4x4   d5, d7, d9, d11

    /* Pass 2 */
    idct_helper     d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
    transpose_4x4   d26, d27, d28, d29

    /* Range limit */
    vmov.u16        q15, #0x80
    vadd.s16        q13, q13, q15
    vadd.s16        q14, q14, q15
    vqmovun.s16     d26, q13
    vqmovun.s16     d27, q14

    /* Store results to the output buffer */
    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL

#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use far fewer instructions on little-endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses
     */
    vst1.32         {d26[0]}, [TMP1]!
    vst1.32         {d27[0]}, [TMP3]!
    vst1.32         {d26[1]}, [TMP2]!
    vst1.32         {d27[1]}, [TMP4]!
#else
    vst1.8          {d26[0]}, [TMP1]!
    vst1.8          {d27[0]}, [TMP3]!
    vst1.8          {d26[1]}, [TMP1]!
    vst1.8          {d27[1]}, [TMP3]!
    vst1.8          {d26[2]}, [TMP1]!
    vst1.8          {d27[2]}, [TMP3]!
    vst1.8          {d26[3]}, [TMP1]!
    vst1.8          {d27[3]}, [TMP3]!

    vst1.8          {d26[4]}, [TMP2]!
    vst1.8          {d27[4]}, [TMP4]!
    vst1.8          {d26[5]}, [TMP2]!
    vst1.8          {d27[5]}, [TMP4]!
    vst1.8          {d26[6]}, [TMP2]!
    vst1.8          {d27[6]}, [TMP4]!
    vst1.8          {d26[7]}, [TMP2]!
    vst1.8          {d27[7]}, [TMP4]!
#endif

    vpop            {d8-d15}
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4

.purgem idct_helper


/*****************************************************************************/

/*
 * jsimd_idct_2x2_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse DCT, which
 *       requires far fewer arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON optimized function is
 *       bit-exact compatibility with jpeg-6b.
 */

.balign 8
jsimd_idct_2x2_neon_consts:
    .short -FIX_0_720959822  /* d0[0] */
    .short FIX_0_850430095   /* d0[1] */
    .short -FIX_1_272758580  /* d0[2] */
    .short FIX_3_624509785   /* d0[3] */

.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
    vshll.s16       q14, \x4, #15
    vmull.s16       q13, \x6, d0[3]
    vmlal.s16       q13, \x10, d0[2]
    vmlal.s16       q13, \x12, d0[1]
    vmlal.s16       q13, \x16, d0[0]

    vadd.s32        q10, q14, q13
    vsub.s32        q14, q14, q13

    .if \shift > 16
        vrshr.s32   q10, q10, #\shift
        vrshr.s32   q14, q14, #\shift
        vmovn.s32   \y26, q10
        vmovn.s32   \y27, q14
    .else
        vrshrn.s32  \y26, q10, #\shift
        vrshrn.s32  \y27, q14, #\shift
    .endif
.endm

asm_function jsimd_idct_2x2_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req ip

    vpush           {d8-d15}

    /* Load constants */
    adr             TMP2, jsimd_idct_2x2_neon_consts
    vld1.16         {d0}, [TMP2, :64]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d4      | d5
     *   1 | d6      | d7
     *   2 | -       | -
     *   3 | d10     | d11
     *   4 | -       | -
     *   5 | d12     | d13
     *   6 | -       | -
     *   7 | d16     | d17
     */
    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16
    vld1.16         {d10, d11}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16
    vld1.16         {d12, d13}, [COEF_BLOCK, :128]!
    add             COEF_BLOCK, COEF_BLOCK, #16
    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
    /* Dequantize */
    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
    vmul.s16        q2, q2, q9
    vmul.s16        q3, q3, q10
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d24, d25}, [DCT_TABLE, :128]!
    vmul.s16        q5, q5, q12
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d26, d27}, [DCT_TABLE, :128]!
    vmul.s16        q6, q6, q13
    add             DCT_TABLE, DCT_TABLE, #16
    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
    vmul.s16        q8, q8, q15

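    /* Pass 1 exists in two forms below: the #if 0 branch is the
     * straightforward version built from idct_helper/transpose_4x4 and is
     * kept for reference, while the #else branch is, in effect, the same
     * computation expanded by hand so that the multiplies and shifts can be
     * scheduled more freely.
     */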
    /* Pass 1 */
#if 0
    idct_helper     d4, d6, d10, d12, d16, 13, d4, d6
    transpose_4x4   d4, d6, d8, d10
    idct_helper     d5, d7, d11, d13, d17, 13, d5, d7
    transpose_4x4   d5, d7, d9, d11
#else
    vmull.s16       q13, d6, d0[3]
    vmlal.s16       q13, d10, d0[2]
    vmlal.s16       q13, d12, d0[1]
    vmlal.s16       q13, d16, d0[0]
    vmull.s16       q12, d7, d0[3]
    vmlal.s16       q12, d11, d0[2]
    vmlal.s16       q12, d13, d0[1]
    vmlal.s16       q12, d17, d0[0]
    vshll.s16       q14, d4, #15
    vshll.s16       q15, d5, #15
    vadd.s32        q10, q14, q13
    vsub.s32        q14, q14, q13
    vrshrn.s32      d4, q10, #13
    vrshrn.s32      d6, q14, #13
    vadd.s32        q10, q15, q12
    vsub.s32        q14, q15, q12
    vrshrn.s32      d5, q10, #13
    vrshrn.s32      d7, q14, #13
    vtrn.16         q2, q3
    vtrn.32         q3, q5
#endif

    /* Pass 2 */
    idct_helper     d4, d6, d10, d7, d11, 20, d26, d27

    /* Range limit */
    vmov.u16        q15, #0x80
    vadd.s16        q13, q13, q15
    vqmovun.s16     d26, q13
    vqmovun.s16     d27, q13

    /* Store results to the output buffer */
    ldmia           OUTPUT_BUF, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL

    vst1.8          {d26[0]}, [TMP1]!
    vst1.8          {d27[4]}, [TMP1]!
    vst1.8          {d26[1]}, [TMP2]!
    vst1.8          {d27[5]}, [TMP2]!

    vpop            {d8-d15}
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2

.purgem idct_helper


/*****************************************************************************/

/*
 * jsimd_ycc_extrgb_convert_neon
 * jsimd_ycc_extbgr_convert_neon
 * jsimd_ycc_extrgbx_convert_neon
 * jsimd_ycc_extbgrx_convert_neon
 * jsimd_ycc_extxbgr_convert_neon
 * jsimd_ycc_extxrgb_convert_neon
 *
 * Colorspace conversion YCbCr -> RGB
 */

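/*
 * The per-function constant tables further below hold the usual JPEG
 * conversion coefficients in fixed point:
 *
 *   R = Y + 1.40200 * (Cr - 128)
 *   G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
 *   B = Y + 1.77200 * (Cb - 128)
 *
 * with 1.40200 and 1.77200 stored scaled by 2^14 (22971, 29033) and the two
 * G coefficients stored scaled by 2^15 (11277, 23401), matching the #14/#15
 * narrowing shifts in the stage 2 macros.
 */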

.macro do_load size
    .if \size == 8
        vld1.8      {d4}, [U, :64]!
        vld1.8      {d5}, [V, :64]!
        vld1.8      {d0}, [Y, :64]!
        pld         [U, #64]
        pld         [V, #64]
        pld         [Y, #64]
    .elseif \size == 4
        vld1.8      {d4[0]}, [U]!
        vld1.8      {d4[1]}, [U]!
        vld1.8      {d4[2]}, [U]!
        vld1.8      {d4[3]}, [U]!
        vld1.8      {d5[0]}, [V]!
        vld1.8      {d5[1]}, [V]!
        vld1.8      {d5[2]}, [V]!
        vld1.8      {d5[3]}, [V]!
        vld1.8      {d0[0]}, [Y]!
        vld1.8      {d0[1]}, [Y]!
        vld1.8      {d0[2]}, [Y]!
        vld1.8      {d0[3]}, [Y]!
    .elseif \size == 2
        vld1.8      {d4[4]}, [U]!
        vld1.8      {d4[5]}, [U]!
        vld1.8      {d5[4]}, [V]!
        vld1.8      {d5[5]}, [V]!
        vld1.8      {d0[4]}, [Y]!
        vld1.8      {d0[5]}, [Y]!
    .elseif \size == 1
        vld1.8      {d4[6]}, [U]!
        vld1.8      {d5[6]}, [V]!
        vld1.8      {d0[6]}, [Y]!
    .else
        .error unsupported macroblock size
    .endif
.endm

.macro do_store bpp, size
    .if \bpp == 24
        .if \size == 8
            vst3.8  {d10, d11, d12}, [RGB]!
        .elseif \size == 4
            vst3.8  {d10[0], d11[0], d12[0]}, [RGB]!
            vst3.8  {d10[1], d11[1], d12[1]}, [RGB]!
            vst3.8  {d10[2], d11[2], d12[2]}, [RGB]!
            vst3.8  {d10[3], d11[3], d12[3]}, [RGB]!
        .elseif \size == 2
            vst3.8  {d10[4], d11[4], d12[4]}, [RGB]!
            vst3.8  {d10[5], d11[5], d12[5]}, [RGB]!
        .elseif \size == 1
            vst3.8  {d10[6], d11[6], d12[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .elseif \bpp == 32
        .if \size == 8
            vst4.8  {d10, d11, d12, d13}, [RGB]!
        .elseif \size == 4
            vst4.8  {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
            vst4.8  {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
            vst4.8  {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
            vst4.8  {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
        .elseif \size == 2
            vst4.8  {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
            vst4.8  {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
        .elseif \size == 1
            vst4.8  {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .elseif \bpp == 16
        .if \size == 8
            vst1.16 {q15}, [RGB]!
        .elseif \size == 4
            vst1.16 {d30}, [RGB]!
        .elseif \size == 2
            vst1.16 {d31[0]}, [RGB]!
            vst1.16 {d31[1]}, [RGB]!
        .elseif \size == 1
            vst1.16 {d31[2]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .else
        .error unsupported bpp
    .endif
.endm

.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs

/*
 * 2-stage pipelined YCbCr->RGB conversion
 */

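/* The conversion is software pipelined: do_yuv_to_rgb_stage1 starts the
 * multiplies for one group of 8 pixels, do_yuv_to_rgb_stage2 finishes them,
 * and do_yuv_to_rgb_stage2_store_load_stage1 overlaps the stage 2/store of
 * group N with the load/stage 1 of group N+1, so memory accesses and the
 * long-latency multiplies can hide behind each other.
 */
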
.macro do_yuv_to_rgb_stage1
    vaddw.u8        q3, q1, d4      /* q3 = u - 128 */
    vaddw.u8        q4, q1, d5      /* q4 = v - 128 */
    vmull.s16       q10, d6, d1[1]  /* multiply by -11277 */
    vmlal.s16       q10, d8, d1[2]  /* multiply by -23401 */
    vmull.s16       q11, d7, d1[1]  /* multiply by -11277 */
    vmlal.s16       q11, d9, d1[2]  /* multiply by -23401 */
    vmull.s16       q12, d8, d1[0]  /* multiply by 22971 */
    vmull.s16       q13, d9, d1[0]  /* multiply by 22971 */
    vmull.s16       q14, d6, d1[3]  /* multiply by 29033 */
    vmull.s16       q15, d7, d1[3]  /* multiply by 29033 */
.endm

.macro do_yuv_to_rgb_stage2
    vrshrn.s32      d20, q10, #15
    vrshrn.s32      d21, q11, #15
    vrshrn.s32      d24, q12, #14
    vrshrn.s32      d25, q13, #14
    vrshrn.s32      d28, q14, #14
    vrshrn.s32      d29, q15, #14
    vaddw.u8        q11, q10, d0
    vaddw.u8        q12, q12, d0
    vaddw.u8        q14, q14, d0
    .if \bpp != 16
        vqmovun.s16 d1\g_offs, q11
        vqmovun.s16 d1\r_offs, q12
        vqmovun.s16 d1\b_offs, q14
    .else  /* rgb565 */
        vqshlu.s16  q13, q11, #8
        vqshlu.s16  q15, q12, #8
        vqshlu.s16  q14, q14, #8
        vsri.u16    q15, q13, #5
        vsri.u16    q15, q14, #11
    .endif
.endm

.macro do_yuv_to_rgb_stage2_store_load_stage1
    /* "do_yuv_to_rgb_stage2" and "store" */
    vrshrn.s32      d20, q10, #15
    /* "load" and "do_yuv_to_rgb_stage1" */
    pld             [U, #64]
    vrshrn.s32      d21, q11, #15
    pld             [V, #64]
    vrshrn.s32      d24, q12, #14
    vrshrn.s32      d25, q13, #14
    vld1.8          {d4}, [U, :64]!
    vrshrn.s32      d28, q14, #14
    vld1.8          {d5}, [V, :64]!
    vrshrn.s32      d29, q15, #14
    vaddw.u8        q3, q1, d4      /* q3 = u - 128 */
    vaddw.u8        q4, q1, d5      /* q4 = v - 128 */
    vaddw.u8        q11, q10, d0
    vmull.s16       q10, d6, d1[1]  /* multiply by -11277 */
    vmlal.s16       q10, d8, d1[2]  /* multiply by -23401 */
    vaddw.u8        q12, q12, d0
    vaddw.u8        q14, q14, d0
    .if \bpp != 16  /**************** rgb24/rgb32 *****************************/
        vqmovun.s16 d1\g_offs, q11
        pld         [Y, #64]
        vqmovun.s16 d1\r_offs, q12
        vld1.8      {d0}, [Y, :64]!
        vqmovun.s16 d1\b_offs, q14
        vmull.s16   q11, d7, d1[1]  /* multiply by -11277 */
        vmlal.s16   q11, d9, d1[2]  /* multiply by -23401 */
        do_store    \bpp, 8
        vmull.s16   q12, d8, d1[0]  /* multiply by 22971 */
        vmull.s16   q13, d9, d1[0]  /* multiply by 22971 */
        vmull.s16   q14, d6, d1[3]  /* multiply by 29033 */
        vmull.s16   q15, d7, d1[3]  /* multiply by 29033 */
    .else  /**************************** rgb565 *******************************/
        vqshlu.s16  q13, q11, #8
        pld         [Y, #64]
        vqshlu.s16  q15, q12, #8
        vqshlu.s16  q14, q14, #8
        vld1.8      {d0}, [Y, :64]!
        vmull.s16   q11, d7, d1[1]
        vmlal.s16   q11, d9, d1[2]
        vsri.u16    q15, q13, #5
        vmull.s16   q12, d8, d1[0]
        vsri.u16    q15, q14, #11
        vmull.s16   q13, d9, d1[0]
        vmull.s16   q14, d6, d1[3]
        do_store    \bpp, 8
        vmull.s16   q15, d7, d1[3]
    .endif
.endm

.macro do_yuv_to_rgb
    do_yuv_to_rgb_stage1
    do_yuv_to_rgb_stage2
.endm

/* Apple gas crashes on adrl, work around that by using adr.
 * But this requires a copy of these constants for each function.
 */

.balign 16
jsimd_ycc_\colorid\()_neon_consts:
    .short 0, 0, 0, 0
    .short 22971, -11277, -23401, 29033
    .short -128, -128, -128, -128
    .short -128, -128, -128, -128

asm_function jsimd_ycc_\colorid\()_convert_neon
    OUTPUT_WIDTH    .req r0
    INPUT_BUF       .req r1
    INPUT_ROW       .req r2
    OUTPUT_BUF      .req r3
    NUM_ROWS        .req r4

    INPUT_BUF0      .req r5
    INPUT_BUF1      .req r6
    INPUT_BUF2      .req INPUT_BUF

    RGB             .req r7
    Y               .req r8
    U               .req r9
    V               .req r10
    N               .req ip

    /* Load constants to d1, d2, d3 (d0 is just used for padding) */
    adr             ip, jsimd_ycc_\colorid\()_neon_consts
    vld1.16         {d0, d1, d2, d3}, [ip, :128]

    /* Save ARM registers and handle input arguments */
    push            {r4, r5, r6, r7, r8, r9, r10, lr}
    ldr             NUM_ROWS, [sp, #(4 * 8)]
    ldr             INPUT_BUF0, [INPUT_BUF]
    ldr             INPUT_BUF1, [INPUT_BUF, #4]
    ldr             INPUT_BUF2, [INPUT_BUF, #8]
    .unreq          INPUT_BUF

    /* Save NEON registers */
    vpush           {d8-d15}

    /* Initially set d10, d11, d12, d13 to 0xFF */
    vmov.u8         q5, #255
    vmov.u8         q6, #255

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    blt             9f
0:
    ldr             Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
    ldr             U, [INPUT_BUF1, INPUT_ROW, lsl #2]
    mov             N, OUTPUT_WIDTH
    ldr             V, [INPUT_BUF2, INPUT_ROW, lsl #2]
    add             INPUT_ROW, INPUT_ROW, #1
    ldr             RGB, [OUTPUT_BUF], #4

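    /* The inner loop below converts 8 pixels per iteration; the leftover
     * (OUTPUT_WIDTH % 8) pixels are handled by the size-4/2/1 variants of
     * do_load/do_store, which fill and store fixed lanes of the same
     * registers so that a single final do_yuv_to_rgb call covers all of
     * them.
     */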
    /* Inner loop over pixels */
    subs            N, N, #8
    blt             3f
    do_load         8
    do_yuv_to_rgb_stage1
    subs            N, N, #8
    blt             2f
1:
    do_yuv_to_rgb_stage2_store_load_stage1
    subs            N, N, #8
    bge             1b
2:
    do_yuv_to_rgb_stage2
    do_store        \bpp, 8
    tst             N, #7
    beq             8f
3:
    tst             N, #4
    beq             3f
    do_load         4
3:
    tst             N, #2
    beq             4f
    do_load         2
4:
    tst             N, #1
    beq             5f
    do_load         1
5:
    do_yuv_to_rgb
    tst             N, #4
    beq             6f
    do_store        \bpp, 4
6:
    tst             N, #2
    beq             7f
    do_store        \bpp, 2
7:
    tst             N, #1
    beq             8f
    do_store        \bpp, 1
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    bgt             0b
9:
    /* Restore all registers and return */
    vpop            {d8-d15}
    pop             {r4, r5, r6, r7, r8, r9, r10, pc}

    .unreq          OUTPUT_WIDTH
    .unreq          INPUT_ROW
    .unreq          OUTPUT_BUF
    .unreq          NUM_ROWS
    .unreq          INPUT_BUF0
    .unreq          INPUT_BUF1
    .unreq          INPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N

.purgem do_yuv_to_rgb
.purgem do_yuv_to_rgb_stage1
.purgem do_yuv_to_rgb_stage2
.purgem do_yuv_to_rgb_stage2_store_load_stage1

.endm

/*--------------------------------- id ----- bpp R  G  B */
generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, 1, 2
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, 1, 0
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, 0, 0

.purgem do_load
.purgem do_store


/*****************************************************************************/

/*
 * jsimd_extrgb_ycc_convert_neon
 * jsimd_extbgr_ycc_convert_neon
 * jsimd_extrgbx_ycc_convert_neon
 * jsimd_extbgrx_ycc_convert_neon
 * jsimd_extxbgr_ycc_convert_neon
 * jsimd_extxrgb_ycc_convert_neon
 *
 * Colorspace conversion RGB -> YCbCr
 */

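/*
 * The constant tables used below encode the usual JPEG forward conversion
 *
 *   Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
 *   Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
 *   Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + 128
 *
 * with the coefficients scaled by 2^16 (19595, 38470, 7471, ...).  The
 * trailing (32767, 128) pairs provide the per-lane starting value
 * 128 * 65536 + 32767 for the Cb/Cr accumulators, i.e. the +128 bias plus a
 * rounding term for the truncating >>16 narrowing in stage 2.
 */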

.macro do_store size
    .if \size == 8
        vst1.8      {d20}, [Y]!
        vst1.8      {d21}, [U]!
        vst1.8      {d22}, [V]!
    .elseif \size == 4
        vst1.8      {d20[0]}, [Y]!
        vst1.8      {d20[1]}, [Y]!
        vst1.8      {d20[2]}, [Y]!
        vst1.8      {d20[3]}, [Y]!
        vst1.8      {d21[0]}, [U]!
        vst1.8      {d21[1]}, [U]!
        vst1.8      {d21[2]}, [U]!
        vst1.8      {d21[3]}, [U]!
        vst1.8      {d22[0]}, [V]!
        vst1.8      {d22[1]}, [V]!
        vst1.8      {d22[2]}, [V]!
        vst1.8      {d22[3]}, [V]!
    .elseif \size == 2
        vst1.8      {d20[4]}, [Y]!
        vst1.8      {d20[5]}, [Y]!
        vst1.8      {d21[4]}, [U]!
        vst1.8      {d21[5]}, [U]!
        vst1.8      {d22[4]}, [V]!
        vst1.8      {d22[5]}, [V]!
    .elseif \size == 1
        vst1.8      {d20[6]}, [Y]!
        vst1.8      {d21[6]}, [U]!
        vst1.8      {d22[6]}, [V]!
    .else
        .error unsupported macroblock size
    .endif
.endm

.macro do_load bpp, size
    .if \bpp == 24
        .if \size == 8
            vld3.8  {d10, d11, d12}, [RGB]!
            pld     [RGB, #128]
        .elseif \size == 4
            vld3.8  {d10[0], d11[0], d12[0]}, [RGB]!
            vld3.8  {d10[1], d11[1], d12[1]}, [RGB]!
            vld3.8  {d10[2], d11[2], d12[2]}, [RGB]!
            vld3.8  {d10[3], d11[3], d12[3]}, [RGB]!
        .elseif \size == 2
            vld3.8  {d10[4], d11[4], d12[4]}, [RGB]!
            vld3.8  {d10[5], d11[5], d12[5]}, [RGB]!
        .elseif \size == 1
            vld3.8  {d10[6], d11[6], d12[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .elseif \bpp == 32
        .if \size == 8
            vld4.8  {d10, d11, d12, d13}, [RGB]!
            pld     [RGB, #128]
        .elseif \size == 4
            vld4.8  {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
            vld4.8  {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
            vld4.8  {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
            vld4.8  {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
        .elseif \size == 2
            vld4.8  {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
            vld4.8  {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
        .elseif \size == 1
            vld4.8  {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
        .else
            .error unsupported macroblock size
        .endif
    .else
        .error unsupported bpp
    .endif
.endm

.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs

/*
 * 2-stage pipelined RGB->YCbCr conversion
 */

.macro do_rgb_to_yuv_stage1
    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
    vmull.u16       q7, d4, d0[0]
    vmlal.u16       q7, d6, d0[1]
    vmlal.u16       q7, d8, d0[2]
    vmull.u16       q8, d5, d0[0]
    vmlal.u16       q8, d7, d0[1]
    vmlal.u16       q8, d9, d0[2]
    vrev64.32       q9, q1
    vrev64.32       q13, q1
    vmlsl.u16       q9, d4, d0[3]
    vmlsl.u16       q9, d6, d1[0]
    vmlal.u16       q9, d8, d1[1]
    vmlsl.u16       q13, d5, d0[3]
    vmlsl.u16       q13, d7, d1[0]
    vmlal.u16       q13, d9, d1[1]
    vrev64.32       q14, q1
    vrev64.32       q15, q1
    vmlal.u16       q14, d4, d1[1]
    vmlsl.u16       q14, d6, d1[2]
    vmlsl.u16       q14, d8, d1[3]
    vmlal.u16       q15, d5, d1[1]
    vmlsl.u16       q15, d7, d1[2]
    vmlsl.u16       q15, d9, d1[3]
.endm

.macro do_rgb_to_yuv_stage2
    vrshrn.u32      d20, q7, #16
    vrshrn.u32      d21, q8, #16
    vshrn.u32       d22, q9, #16
    vshrn.u32       d23, q13, #16
    vshrn.u32       d24, q14, #16
    vshrn.u32       d25, q15, #16
    vmovn.u16       d20, q10  /* d20 = y */
    vmovn.u16       d21, q11  /* d21 = u */
    vmovn.u16       d22, q12  /* d22 = v */
.endm

.macro do_rgb_to_yuv
    do_rgb_to_yuv_stage1
    do_rgb_to_yuv_stage2
.endm

.macro do_rgb_to_yuv_stage2_store_load_stage1
    vrshrn.u32      d20, q7, #16
    vrshrn.u32      d21, q8, #16
    vshrn.u32       d22, q9, #16
    vrev64.32       q9, q1
    vshrn.u32       d23, q13, #16
    vrev64.32       q13, q1
    vshrn.u32       d24, q14, #16
    vshrn.u32       d25, q15, #16
    do_load         \bpp, 8
    vmovn.u16       d20, q10  /* d20 = y */
    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
    vmovn.u16       d21, q11  /* d21 = u */
    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
    vmovn.u16       d22, q12  /* d22 = v */
    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
    vmull.u16       q7, d4, d0[0]
    vmlal.u16       q7, d6, d0[1]
    vmlal.u16       q7, d8, d0[2]
    vst1.8          {d20}, [Y]!
    vmull.u16       q8, d5, d0[0]
    vmlal.u16       q8, d7, d0[1]
    vmlal.u16       q8, d9, d0[2]
    vmlsl.u16       q9, d4, d0[3]
    vmlsl.u16       q9, d6, d1[0]
    vmlal.u16       q9, d8, d1[1]
    vst1.8          {d21}, [U]!
    vmlsl.u16       q13, d5, d0[3]
    vmlsl.u16       q13, d7, d1[0]
    vmlal.u16       q13, d9, d1[1]
    vrev64.32       q14, q1
    vrev64.32       q15, q1
    vmlal.u16       q14, d4, d1[1]
    vmlsl.u16       q14, d6, d1[2]
    vmlsl.u16       q14, d8, d1[3]
    vst1.8          {d22}, [V]!
    vmlal.u16       q15, d5, d1[1]
    vmlsl.u16       q15, d7, d1[2]
    vmlsl.u16       q15, d9, d1[3]
.endm

.balign 16
jsimd_\colorid\()_ycc_neon_consts:
    .short 19595, 38470, 7471, 11059
    .short 21709, 32768, 27439, 5329
    .short 32767, 128, 32767, 128
    .short 32767, 128, 32767, 128

asm_function jsimd_\colorid\()_ycc_convert_neon
    OUTPUT_WIDTH    .req r0
    INPUT_BUF       .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_ROW      .req r3
    NUM_ROWS        .req r4

    OUTPUT_BUF0     .req r5
    OUTPUT_BUF1     .req r6
    OUTPUT_BUF2     .req OUTPUT_BUF

    RGB             .req r7
    Y               .req r8
    U               .req r9
    V               .req r10
    N               .req ip

    /* Load constants to d0, d1, d2, d3 */
    adr             ip, jsimd_\colorid\()_ycc_neon_consts
    vld1.16         {d0, d1, d2, d3}, [ip, :128]

    /* Save ARM registers and handle input arguments */
    push            {r4, r5, r6, r7, r8, r9, r10, lr}
    ldr             NUM_ROWS, [sp, #(4 * 8)]
    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #4]
    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #8]
    .unreq          OUTPUT_BUF

    /* Save NEON registers */
    vpush           {d8-d15}

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    blt             9f
0:
    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
    mov             N, OUTPUT_WIDTH
    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
    add             OUTPUT_ROW, OUTPUT_ROW, #1
    ldr             RGB, [INPUT_BUF], #4

    /* Inner loop over pixels */
    subs            N, N, #8
    blt             3f
    do_load         \bpp, 8
    do_rgb_to_yuv_stage1
    subs            N, N, #8
    blt             2f
1:
    do_rgb_to_yuv_stage2_store_load_stage1
    subs            N, N, #8
    bge             1b
2:
    do_rgb_to_yuv_stage2
    do_store        8
    tst             N, #7
    beq             8f
3:
    tst             N, #4
    beq             3f
    do_load         \bpp, 4
3:
    tst             N, #2
    beq             4f
    do_load         \bpp, 2
4:
    tst             N, #1
    beq             5f
    do_load         \bpp, 1
5:
    do_rgb_to_yuv
    tst             N, #4
    beq             6f
    do_store        4
6:
    tst             N, #2
    beq             7f
    do_store        2
7:
    tst             N, #1
    beq             8f
    do_store        1
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    bgt             0b
9:
    /* Restore all registers and return */
    vpop            {d8-d15}
    pop             {r4, r5, r6, r7, r8, r9, r10, pc}

    .unreq          OUTPUT_WIDTH
    .unreq          OUTPUT_ROW
    .unreq          INPUT_BUF
    .unreq          NUM_ROWS
    .unreq          OUTPUT_BUF0
    .unreq          OUTPUT_BUF1
    .unreq          OUTPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N

.purgem do_rgb_to_yuv
.purgem do_rgb_to_yuv_stage1
.purgem do_rgb_to_yuv_stage2
.purgem do_rgb_to_yuv_stage2_store_load_stage1

.endm

/*--------------------------------- id ----- bpp R  G  B */
generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0
generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3

.purgem do_load
.purgem do_store


/*****************************************************************************/

/*
 * Load data into workspace, applying unsigned->signed conversion
 *
 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
 *       rid of VST1.16 instructions
 */

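/* In C terms, the function below is roughly
 *
 *   for (row = 0; row < 8; row++)
 *     for (col = 0; col < 8; col++)
 *       workspace[row * 8 + col] =
 *         (DCTELEM)sample_data[row][start_col + col] - CENTERJSAMPLE;
 *
 * i.e. eight rows are gathered through the row-pointer array, widened to
 * 16 bits, and re-centered around zero before the forward DCT.
 */
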
asm_function jsimd_convsamp_neon
    SAMPLE_DATA     .req r0
    START_COL       .req r1
    WORKSPACE       .req r2
    TMP1            .req r3
    TMP2            .req r4
    TMP3            .req r5
    TMP4            .req ip

    push            {r4, r5}
    vmov.u8         d0, #128

    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, START_COL
    add             TMP2, TMP2, START_COL
    add             TMP3, TMP3, START_COL
    add             TMP4, TMP4, START_COL
    vld1.8          {d16}, [TMP1]
    vsubl.u8        q8, d16, d0
    vld1.8          {d18}, [TMP2]
    vsubl.u8        q9, d18, d0
    vld1.8          {d20}, [TMP3]
    vsubl.u8        q10, d20, d0
    vld1.8          {d22}, [TMP4]
    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
    vsubl.u8        q11, d22, d0
    vst1.16         {d16, d17, d18, d19}, [WORKSPACE, :128]!
    add             TMP1, TMP1, START_COL
    add             TMP2, TMP2, START_COL
    vst1.16         {d20, d21, d22, d23}, [WORKSPACE, :128]!
    add             TMP3, TMP3, START_COL
    add             TMP4, TMP4, START_COL
    vld1.8          {d24}, [TMP1]
    vsubl.u8        q12, d24, d0
    vld1.8          {d26}, [TMP2]
    vsubl.u8        q13, d26, d0
    vld1.8          {d28}, [TMP3]
    vsubl.u8        q14, d28, d0
    vld1.8          {d30}, [TMP4]
    vsubl.u8        q15, d30, d0
    vst1.16         {d24, d25, d26, d27}, [WORKSPACE, :128]!
    vst1.16         {d28, d29, d30, d31}, [WORKSPACE, :128]!
    pop             {r4, r5}
    bx              lr

    .unreq          SAMPLE_DATA
    .unreq          START_COL
    .unreq          WORKSPACE
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4


/*****************************************************************************/

/*
 * jsimd_fdct_ifast_neon
 *
 * This function contains a fast, but not so accurate, integer implementation
 * of the forward DCT (Discrete Cosine Transform). It uses the same
 * calculations and produces exactly the same output as IJG's original
 * 'jpeg_fdct_ifast' function from jfdctfst.c.
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 *       rid of a bunch of VLD1.16 instructions
 */

#define XFIX_0_382683433  d0[0]
#define XFIX_0_541196100  d0[1]
#define XFIX_0_707106781  d0[2]
#define XFIX_1_306562965  d0[3]

.balign 16
jsimd_fdct_ifast_neon_consts:
    .short (98 * 128)               /* XFIX_0_382683433 */
    .short (139 * 128)              /* XFIX_0_541196100 */
    .short (181 * 128)              /* XFIX_0_707106781 */
    .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */

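/* As in jsimd_idct_ifast_neon above, these constants are Q15 values meant
 * for VQDMULH.S16 (multiply by n / 2^15): e.g. 98 * 128 = 12544 and
 * 12544 / 32768 ~= 0.38268, while XFIX_1_306562965 stores only the
 * fractional part (334 * 128 - 256 * 128), with the integer part added back
 * by a separate VADD.
 */
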
.req r4 1935 TMP3 .req r5 1936 TMP4 .req ip 1937 1938 push {r4, r5} 1939 vmov.u8 d0, #128 1940 1941 ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4} 1942 add TMP1, TMP1, START_COL 1943 add TMP2, TMP2, START_COL 1944 add TMP3, TMP3, START_COL 1945 add TMP4, TMP4, START_COL 1946 vld1.8 {d16}, [TMP1] 1947 vsubl.u8 q8, d16, d0 1948 vld1.8 {d18}, [TMP2] 1949 vsubl.u8 q9, d18, d0 1950 vld1.8 {d20}, [TMP3] 1951 vsubl.u8 q10, d20, d0 1952 vld1.8 {d22}, [TMP4] 1953 ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4} 1954 vsubl.u8 q11, d22, d0 1955 vst1.16 {d16, d17, d18, d19}, [WORKSPACE, :128]! 1956 add TMP1, TMP1, START_COL 1957 add TMP2, TMP2, START_COL 1958 vst1.16 {d20, d21, d22, d23}, [WORKSPACE, :128]! 1959 add TMP3, TMP3, START_COL 1960 add TMP4, TMP4, START_COL 1961 vld1.8 {d24}, [TMP1] 1962 vsubl.u8 q12, d24, d0 1963 vld1.8 {d26}, [TMP2] 1964 vsubl.u8 q13, d26, d0 1965 vld1.8 {d28}, [TMP3] 1966 vsubl.u8 q14, d28, d0 1967 vld1.8 {d30}, [TMP4] 1968 vsubl.u8 q15, d30, d0 1969 vst1.16 {d24, d25, d26, d27}, [WORKSPACE, :128]! 1970 vst1.16 {d28, d29, d30, d31}, [WORKSPACE, :128]! 1971 pop {r4, r5} 1972 bx lr 1973 1974 .unreq SAMPLE_DATA 1975 .unreq START_COL 1976 .unreq WORKSPACE 1977 .unreq TMP1 1978 .unreq TMP2 1979 .unreq TMP3 1980 .unreq TMP4 1981 1982 1983/*****************************************************************************/ 1984 1985/* 1986 * jsimd_fdct_ifast_neon 1987 * 1988 * This function contains a fast, not so accurate integer implementation of 1989 * the forward DCT (Discrete Cosine Transform). It uses the same calculations 1990 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast' 1991 * function from jfdctfst.c 1992 * 1993 * TODO: can be combined with 'jsimd_convsamp_neon' to get 1994 * rid of a bunch of VLD1.16 instructions 1995 */ 1996 1997#define XFIX_0_382683433 d0[0] 1998#define XFIX_0_541196100 d0[1] 1999#define XFIX_0_707106781 d0[2] 2000#define XFIX_1_306562965 d0[3] 2001 2002.balign 16 2003jsimd_fdct_ifast_neon_consts: 2004 .short (98 * 128) /* XFIX_0_382683433 */ 2005 .short (139 * 128) /* XFIX_0_541196100 */ 2006 .short (181 * 128) /* XFIX_0_707106781 */ 2007 .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */ 2008 2009asm_function jsimd_fdct_ifast_neon 2010 2011 DATA .req r0 2012 TMP .req ip 2013 2014 vpush {d8-d15} 2015 2016 /* Load constants */ 2017 adr TMP, jsimd_fdct_ifast_neon_consts 2018 vld1.16 {d0}, [TMP, :64] 2019 2020 /* Load all DATA into NEON registers with the following allocation: 2021 * 0 1 2 3 | 4 5 6 7 2022 * ---------+-------- 2023 * 0 | d16 | d17 | q8 2024 * 1 | d18 | d19 | q9 2025 * 2 | d20 | d21 | q10 2026 * 3 | d22 | d23 | q11 2027 * 4 | d24 | d25 | q12 2028 * 5 | d26 | d27 | q13 2029 * 6 | d28 | d29 | q14 2030 * 7 | d30 | d31 | q15 2031 */ 2032 2033 vld1.16 {d16, d17, d18, d19}, [DATA, :128]! 2034 vld1.16 {d20, d21, d22, d23}, [DATA, :128]! 2035 vld1.16 {d24, d25, d26, d27}, [DATA, :128]! 
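/*
 * For reference: the XFIX_* constants above are the 8-bit 'ifast' multipliers
 * from jfdctfst.c (98, 139, 181 and 334 - 256) scaled by 128.  VQDMULH.S16
 * computes (a * b * 2) >> 16, so with b = k * 128 this reduces to (a * k) >> 8,
 * i.e. the same MULTIPLY() that 'jpeg_fdct_ifast' uses.  A minimal C model of
 * the primitive (the name mulfix is illustrative; saturation is ignored):
 *
 *   static inline int16_t mulfix(int16_t a, int16_t b)   // models VQDMULH.S16
 *   {
 *     return (int16_t)(((int32_t)a * b * 2) >> 16);      // ~= a * b / 32768
 *   }
 *
 * The multiplier 1.306562965 cannot be represented this way (it is >= 1), so
 * the code multiplies by the fractional part (334 - 256) * 128 and adds the
 * remaining 1 * x with a separate VADD.
 */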
    vld1.16         {d28, d29, d30, d31}, [DATA, :128]
    sub             DATA, DATA, #(128 - 32)

    mov             TMP, #2
1:
    /* Transpose */
    vtrn.16         q12, q13
    vtrn.16         q10, q11
    vtrn.16         q8, q9
    vtrn.16         q14, q15
    vtrn.32         q9, q11
    vtrn.32         q13, q15
    vtrn.32         q8, q10
    vtrn.32         q12, q14
    vswp            d30, d23
    vswp            d24, d17
    vswp            d26, d19
    /* 1-D FDCT */
    vadd.s16        q2, q11, q12
    vswp            d28, d21
    vsub.s16        q12, q11, q12
    vsub.s16        q6, q10, q13
    vadd.s16        q10, q10, q13
    vsub.s16        q7, q9, q14
    vadd.s16        q9, q9, q14
    vsub.s16        q1, q8, q15
    vadd.s16        q8, q8, q15
    vsub.s16        q4, q9, q10
    vsub.s16        q5, q8, q2
    vadd.s16        q3, q9, q10
    vadd.s16        q4, q4, q5
    vadd.s16        q2, q8, q2
    vqdmulh.s16     q4, q4, XFIX_0_707106781
    vadd.s16        q11, q12, q6
    vadd.s16        q8, q2, q3
    vsub.s16        q12, q2, q3
    vadd.s16        q3, q6, q7
    vadd.s16        q7, q7, q1
    vqdmulh.s16     q3, q3, XFIX_0_707106781
    vsub.s16        q6, q11, q7
    vadd.s16        q10, q5, q4
    vqdmulh.s16     q6, q6, XFIX_0_382683433
    vsub.s16        q14, q5, q4
    vqdmulh.s16     q11, q11, XFIX_0_541196100
    vqdmulh.s16     q5, q7, XFIX_1_306562965
    vadd.s16        q4, q1, q3
    vsub.s16        q3, q1, q3
    vadd.s16        q7, q7, q6
    vadd.s16        q11, q11, q6
    vadd.s16        q7, q7, q5
    vadd.s16        q13, q3, q11
    vsub.s16        q11, q3, q11
    vadd.s16        q9, q4, q7
    vsub.s16        q15, q4, q7
    subs            TMP, TMP, #1
    bne             1b

    /* store results */
    vst1.16         {d16, d17, d18, d19}, [DATA, :128]!
    vst1.16         {d20, d21, d22, d23}, [DATA, :128]!
    vst1.16         {d24, d25, d26, d27}, [DATA, :128]!
    vst1.16         {d28, d29, d30, d31}, [DATA, :128]

    vpop            {d8-d15}
    bx              lr

    .unreq          DATA
    .unreq          TMP


/*****************************************************************************/

/*
 * GLOBAL(void)
 * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors,
 *                      DCTELEM *workspace);
 *
 * Note: the code uses 2-stage pipelining in order to improve instruction
 *       scheduling and eliminate stalls (this provides ~15% better
 *       performance for this function on both ARM Cortex-A8 and
 *       ARM Cortex-A9 when compared to the non-pipelined variant).
 *       The instructions which belong to the second stage use different
 *       indentation for better readability.
 */
asm_function jsimd_quantize_neon

    COEF_BLOCK      .req r0
    DIVISORS        .req r1
    WORKSPACE       .req r2

    RECIPROCAL      .req DIVISORS
    CORRECTION      .req r3
    SHIFT           .req ip
    LOOP_COUNT      .req r4

    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
    vabs.s16        q12, q0
    add             CORRECTION, DIVISORS, #(64 * 2)
    add             SHIFT, DIVISORS, #(64 * 6)
    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
    vabs.s16        q13, q1
    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
    vadd.u16        q12, q12, q10  /* add correction */
    vadd.u16        q13, q13, q11
    vmull.u16       q10, d24, d16  /* multiply by reciprocal */
    vmull.u16       q11, d25, d17
    vmull.u16       q8, d26, d18
    vmull.u16       q9, d27, d19
    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
    vshrn.u32       d20, q10, #16
    vshrn.u32       d21, q11, #16
    vshrn.u32       d22, q8, #16
    vshrn.u32       d23, q9, #16
    vneg.s16        q12, q12
    vneg.s16        q13, q13
    vshr.s16        q2, q0, #15  /* extract sign */
    vshr.s16        q3, q1, #15
    vshl.u16        q14, q10, q12  /* shift */
    vshl.u16        q15, q11, q13

    push            {r4, r5}
    mov             LOOP_COUNT, #3
1:
    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
      veor.u16        q14, q14, q2  /* restore sign */
    vabs.s16        q12, q0
    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
    vabs.s16        q13, q1
      veor.u16        q15, q15, q3
    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
    vadd.u16        q12, q12, q10  /* add correction */
    vadd.u16        q13, q13, q11
    vmull.u16       q10, d24, d16  /* multiply by reciprocal */
    vmull.u16       q11, d25, d17
    vmull.u16       q8, d26, d18
    vmull.u16       q9, d27, d19
      vsub.u16        q14, q14, q2
    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
      vsub.u16        q15, q15, q3
    vshrn.u32       d20, q10, #16
    vshrn.u32       d21, q11, #16
      vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
    vshrn.u32       d22, q8, #16
    vshrn.u32       d23, q9, #16
    vneg.s16        q12, q12
    vneg.s16        q13, q13
    vshr.s16        q2, q0, #15  /* extract sign */
    vshr.s16        q3, q1, #15
    vshl.u16        q14, q10, q12  /* shift */
    vshl.u16        q15, q11, q13
    subs            LOOP_COUNT, LOOP_COUNT, #1
    bne             1b
    pop             {r4, r5}

      veor.u16        q14, q14, q2  /* restore sign */
      veor.u16        q15, q15, q3
      vsub.u16        q14, q14, q2
      vsub.u16        q15, q15, q3
      vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!

    bx              lr  /* return */

    .unreq          COEF_BLOCK
    .unreq          DIVISORS
    .unreq          WORKSPACE
    .unreq          RECIPROCAL
    .unreq          CORRECTION
    .unreq          SHIFT
    .unreq          LOOP_COUNT

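/*
 * For reference: a minimal C sketch of the per-coefficient work done by
 * jsimd_quantize_neon above, assuming the usual libjpeg-turbo SIMD divisor
 * layout of four consecutive 64-entry tables (reciprocal, correction, scale,
 * shift).  The function and variable names are illustrative only.
 *
 *   static int16_t quantize_one(int16_t c, const uint16_t *divisors, int i)
 *   {
 *     uint16_t recip = divisors[i];            // reciprocal table
 *     uint16_t corr  = divisors[i + 64];       // pre-multiply correction
 *     uint16_t shift = divisors[i + 192];      // post-multiply shift count
 *     int16_t  sign  = c >> 15;                // 0 or -1       (VSHR.S16 #15)
 *     uint16_t mag   = (uint16_t)(c < 0 ? -c : c);             // (VABS.S16)
 *     uint32_t prod  = (uint32_t)(mag + corr) * recip;         // (VMULL.U16)
 *     uint16_t q     = (uint16_t)((prod >> 16) >> shift);      // (VSHRN, VSHL)
 *     return (int16_t)((q ^ sign) - sign);     // reapply sign  (VEOR, VSUB)
 *   }
 *
 * The loop above processes 16 coefficients at a time and overlaps the
 * sign-restore/store of one batch (second stage, extra indentation) with the
 * abs/correct/multiply of the next, which is the 2-stage pipelining described
 * in the header comment.
 */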

/*****************************************************************************/

/*
 * GLOBAL(void)
 * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
 *                                 JDIMENSION downsampled_width,
 *                                 JSAMPARRAY input_data,
 *                                 JSAMPARRAY *output_data_ptr);
 *
 * Note: unaligned writes are the main remaining bottleneck in this code,
 *       and eliminating them could potentially yield a performance
 *       improvement of up to tens of percent on Cortex-A8/Cortex-A9.
 */

/*
 * Upsample 16 source pixels to 32 destination pixels. The new 16 source
 * pixels are loaded into q0. The previous 16 source pixels are in q1. The
 * shifted-by-one source pixels are constructed in q2 by using q0 and q1.
 * Register d28 is used for multiplication by 3. Register q15 is used
 * for adding the +1 bias.
 */
.macro upsample16 OUTPTR, INPTR
    vld1.8          {q0}, [\INPTR]!
    vmovl.u8        q8, d0
    vext.8          q2, q1, q0, #15
    vmovl.u8        q9, d1
    vaddw.u8        q10, q15, d4
    vaddw.u8        q11, q15, d5
    vmlal.u8        q8, d4, d28
    vmlal.u8        q9, d5, d28
    vmlal.u8        q10, d0, d28
    vmlal.u8        q11, d1, d28
    vmov            q1, q0  /* backup source pixels to q1 */
    vrshrn.u16      d6, q8, #2
    vrshrn.u16      d7, q9, #2
    vshrn.u16       d8, q10, #2
    vshrn.u16       d9, q11, #2
    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
.endm

/*
 * Upsample 32 source pixels to 64 destination pixels. Compared to the
 * 'upsample16' macro, the roles of the q0 and q1 registers are swapped for
 * the even and odd groups of 16 pixels, which is why no "vmov q1, q0"
 * instructions are needed. This unrolling also allows loads and stores to
 * be reordered to hide the multiplication latency and reduce stalls.
 */
.macro upsample32 OUTPTR, INPTR
    /* even 16 pixels group */
    vld1.8          {q0}, [\INPTR]!
    vmovl.u8        q8, d0
    vext.8          q2, q1, q0, #15
    vmovl.u8        q9, d1
    vaddw.u8        q10, q15, d4
    vaddw.u8        q11, q15, d5
    vmlal.u8        q8, d4, d28
    vmlal.u8        q9, d5, d28
    vmlal.u8        q10, d0, d28
    vmlal.u8        q11, d1, d28
    /* odd 16 pixels group */
    vld1.8          {q1}, [\INPTR]!
2268 vrshrn.u16 d6, q8, #2 2269 vrshrn.u16 d7, q9, #2 2270 vshrn.u16 d8, q10, #2 2271 vshrn.u16 d9, q11, #2 2272 vmovl.u8 q8, d2 2273 vext.8 q2, q0, q1, #15 2274 vmovl.u8 q9, d3 2275 vaddw.u8 q10, q15, d4 2276 vaddw.u8 q11, q15, d5 2277 vmlal.u8 q8, d4, d28 2278 vmlal.u8 q9, d5, d28 2279 vmlal.u8 q10, d2, d28 2280 vmlal.u8 q11, d3, d28 2281 vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! 2282 vrshrn.u16 d6, q8, #2 2283 vrshrn.u16 d7, q9, #2 2284 vshrn.u16 d8, q10, #2 2285 vshrn.u16 d9, q11, #2 2286 vst2.8 {d6, d7, d8, d9}, [\OUTPTR]! 2287.endm 2288 2289/* 2290 * Upsample a row of WIDTH pixels from INPTR to OUTPTR. 2291 */ 2292.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1 2293 /* special case for the first and last pixels */ 2294 sub \WIDTH, \WIDTH, #1 2295 add \OUTPTR, \OUTPTR, #1 2296 ldrb \TMP1, [\INPTR, \WIDTH] 2297 strb \TMP1, [\OUTPTR, \WIDTH, asl #1] 2298 ldrb \TMP1, [\INPTR], #1 2299 strb \TMP1, [\OUTPTR, #-1] 2300 vmov.8 d3[7], \TMP1 2301 2302 subs \WIDTH, \WIDTH, #32 2303 blt 5f 23040: /* process 32 pixels per iteration */ 2305 upsample32 \OUTPTR, \INPTR 2306 subs \WIDTH, \WIDTH, #32 2307 bge 0b 23085: 2309 adds \WIDTH, \WIDTH, #16 2310 blt 1f 23110: /* process 16 pixels if needed */ 2312 upsample16 \OUTPTR, \INPTR 2313 subs \WIDTH, \WIDTH, #16 23141: 2315 adds \WIDTH, \WIDTH, #16 2316 beq 9f 2317 2318 /* load the remaining 1-15 pixels */ 2319 add \INPTR, \INPTR, \WIDTH 2320 tst \WIDTH, #1 2321 beq 2f 2322 sub \INPTR, \INPTR, #1 2323 vld1.8 {d0[0]}, [\INPTR] 23242: 2325 tst \WIDTH, #2 2326 beq 2f 2327 vext.8 d0, d0, d0, #6 2328 sub \INPTR, \INPTR, #1 2329 vld1.8 {d0[1]}, [\INPTR] 2330 sub \INPTR, \INPTR, #1 2331 vld1.8 {d0[0]}, [\INPTR] 23322: 2333 tst \WIDTH, #4 2334 beq 2f 2335 vrev64.32 d0, d0 2336 sub \INPTR, \INPTR, #1 2337 vld1.8 {d0[3]}, [\INPTR] 2338 sub \INPTR, \INPTR, #1 2339 vld1.8 {d0[2]}, [\INPTR] 2340 sub \INPTR, \INPTR, #1 2341 vld1.8 {d0[1]}, [\INPTR] 2342 sub \INPTR, \INPTR, #1 2343 vld1.8 {d0[0]}, [\INPTR] 23442: 2345 tst \WIDTH, #8 2346 beq 2f 2347 vmov d1, d0 2348 sub \INPTR, \INPTR, #8 2349 vld1.8 {d0}, [\INPTR] 23502: /* upsample the remaining pixels */ 2351 vmovl.u8 q8, d0 2352 vext.8 q2, q1, q0, #15 2353 vmovl.u8 q9, d1 2354 vaddw.u8 q10, q15, d4 2355 vaddw.u8 q11, q15, d5 2356 vmlal.u8 q8, d4, d28 2357 vmlal.u8 q9, d5, d28 2358 vmlal.u8 q10, d0, d28 2359 vmlal.u8 q11, d1, d28 2360 vrshrn.u16 d10, q8, #2 2361 vrshrn.u16 d12, q9, #2 2362 vshrn.u16 d11, q10, #2 2363 vshrn.u16 d13, q11, #2 2364 vzip.8 d10, d11 2365 vzip.8 d12, d13 2366 /* store the remaining pixels */ 2367 tst \WIDTH, #8 2368 beq 2f 2369 vst1.8 {d10, d11}, [\OUTPTR]! 2370 vmov q5, q6 23712: 2372 tst \WIDTH, #4 2373 beq 2f 2374 vst1.8 {d10}, [\OUTPTR]! 2375 vmov d10, d11 23762: 2377 tst \WIDTH, #2 2378 beq 2f 2379 vst1.8 {d10[0]}, [\OUTPTR]! 2380 vst1.8 {d10[1]}, [\OUTPTR]! 2381 vst1.8 {d10[2]}, [\OUTPTR]! 2382 vst1.8 {d10[3]}, [\OUTPTR]! 2383 vext.8 d10, d10, d10, #4 23842: 2385 tst \WIDTH, #1 2386 beq 2f 2387 vst1.8 {d10[0]}, [\OUTPTR]! 2388 vst1.8 {d10[1]}, [\OUTPTR]! 
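/*
 * For reference: the upsample16/upsample32/upsample_row macros implement the
 * same "fancy" (triangle-filter) h2v1 upsampling as h2v1_fancy_upsample() in
 * jdsample.c.  A minimal C sketch of the general case; the first and last
 * output pixels are special-cased, as in the scalar code:
 *
 *   // for 0 < i < width - 1:
 *   out[2 * i]     = (uint8_t)((3 * in[i] + in[i - 1] + 1) >> 2);
 *   out[2 * i + 1] = (uint8_t)((3 * in[i] + in[i + 1] + 2) >> 2);
 *   // out[0] = in[0];  out[2 * width - 1] = in[width - 1];
 *
 * In the NEON code d28 holds the weight 3, the "+2" rounding is provided by
 * VRSHRN #2, and the "+1" bias is pre-added from q15 before a truncating
 * VSHRN #2.
 */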
23892: 23909: 2391.endm 2392 2393asm_function jsimd_h2v1_fancy_upsample_neon 2394 2395 MAX_V_SAMP_FACTOR .req r0 2396 DOWNSAMPLED_WIDTH .req r1 2397 INPUT_DATA .req r2 2398 OUTPUT_DATA_PTR .req r3 2399 OUTPUT_DATA .req OUTPUT_DATA_PTR 2400 2401 OUTPTR .req r4 2402 INPTR .req r5 2403 WIDTH .req ip 2404 TMP .req lr 2405 2406 push {r4, r5, r6, lr} 2407 vpush {d8-d15} 2408 2409 ldr OUTPUT_DATA, [OUTPUT_DATA_PTR] 2410 cmp MAX_V_SAMP_FACTOR, #0 2411 ble 99f 2412 2413 /* initialize constants */ 2414 vmov.u8 d28, #3 2415 vmov.u16 q15, #1 241611: 2417 ldr INPTR, [INPUT_DATA], #4 2418 ldr OUTPTR, [OUTPUT_DATA], #4 2419 mov WIDTH, DOWNSAMPLED_WIDTH 2420 upsample_row OUTPTR, INPTR, WIDTH, TMP 2421 subs MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1 2422 bgt 11b 2423 242499: 2425 vpop {d8-d15} 2426 pop {r4, r5, r6, pc} 2427 2428 .unreq MAX_V_SAMP_FACTOR 2429 .unreq DOWNSAMPLED_WIDTH 2430 .unreq INPUT_DATA 2431 .unreq OUTPUT_DATA_PTR 2432 .unreq OUTPUT_DATA 2433 2434 .unreq OUTPTR 2435 .unreq INPTR 2436 .unreq WIDTH 2437 .unreq TMP 2438 2439.purgem upsample16 2440.purgem upsample32 2441.purgem upsample_row 2442 2443 2444/*****************************************************************************/ 2445 2446/* 2447 * GLOBAL(JOCTET*) 2448 * jsimd_huff_encode_one_block (working_state *state, JOCTET *buffer, 2449 * JCOEFPTR block, int last_dc_val, 2450 * c_derived_tbl *dctbl, c_derived_tbl *actbl) 2451 * 2452 */ 2453 2454.macro emit_byte BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP 2455 sub \PUT_BITS, \PUT_BITS, #0x8 2456 lsr \TMP, \PUT_BUFFER, \PUT_BITS 2457 uxtb \TMP, \TMP 2458 strb \TMP, [\BUFFER, #1]! 2459 cmp \TMP, #0xff 2460 /*it eq*/ 2461 strbeq \ZERO, [\BUFFER, #1]! 2462.endm 2463 2464.macro put_bits PUT_BUFFER, PUT_BITS, CODE, SIZE 2465 /*lsl \PUT_BUFFER, \PUT_BUFFER, \SIZE*/ 2466 add \PUT_BITS, \SIZE 2467 /*orr \PUT_BUFFER, \PUT_BUFFER, \CODE*/ 2468 orr \PUT_BUFFER, \CODE, \PUT_BUFFER, lsl \SIZE 2469.endm 2470 2471.macro checkbuf15 BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP 2472 cmp \PUT_BITS, #0x10 2473 blt 15f 2474 eor \ZERO, \ZERO, \ZERO 2475 emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP 2476 emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP 247715: 2478.endm 2479 2480.balign 16 2481jsimd_huff_encode_one_block_neon_consts: 2482 .byte 0x01 2483 .byte 0x02 2484 .byte 0x04 2485 .byte 0x08 2486 .byte 0x10 2487 .byte 0x20 2488 .byte 0x40 2489 .byte 0x80 2490 2491asm_function jsimd_huff_encode_one_block_neon 2492 push {r4, r5, r6, r7, r8, r9, r10, r11, lr} 2493 add r7, sp, #0x1c 2494 sub r4, sp, #0x40 2495 bfc r4, #0, #5 2496 mov sp, r4 /* align sp on 32 bytes */ 2497 vst1.64 {d8, d9, d10, d11}, [r4, :128]! 
2498 vst1.64 {d12, d13, d14, d15}, [r4, :128] 2499 sub sp, #0x140 /* reserve 320 bytes */ 2500 str r0, [sp, #0x18] /* working state > sp + Ox18 */ 2501 add r4, sp, #0x20 /* r4 = t1 */ 2502 ldr lr, [r7, #0x8] /* lr = dctbl */ 2503 sub r10, r1, #0x1 /* r10=buffer-- */ 2504 ldrsh r1, [r2] 2505 mov r9, #0x10 2506 mov r8, #0x1 2507 adr r5, jsimd_huff_encode_one_block_neon_consts 2508 /* prepare data */ 2509 vld1.8 {d26}, [r5, :64] 2510 veor q8, q8, q8 2511 veor q9, q9, q9 2512 vdup.16 q14, r9 2513 vdup.16 q15, r8 2514 veor q10, q10, q10 2515 veor q11, q11, q11 2516 sub r1, r1, r3 2517 add r9, r2, #0x22 2518 add r8, r2, #0x18 2519 add r3, r2, #0x36 2520 vmov.16 d0[0], r1 2521 vld1.16 {d2[0]}, [r9, :16] 2522 vld1.16 {d4[0]}, [r8, :16] 2523 vld1.16 {d6[0]}, [r3, :16] 2524 add r1, r2, #0x2 2525 add r9, r2, #0x30 2526 add r8, r2, #0x26 2527 add r3, r2, #0x28 2528 vld1.16 {d0[1]}, [r1, :16] 2529 vld1.16 {d2[1]}, [r9, :16] 2530 vld1.16 {d4[1]}, [r8, :16] 2531 vld1.16 {d6[1]}, [r3, :16] 2532 add r1, r2, #0x10 2533 add r9, r2, #0x40 2534 add r8, r2, #0x34 2535 add r3, r2, #0x1a 2536 vld1.16 {d0[2]}, [r1, :16] 2537 vld1.16 {d2[2]}, [r9, :16] 2538 vld1.16 {d4[2]}, [r8, :16] 2539 vld1.16 {d6[2]}, [r3, :16] 2540 add r1, r2, #0x20 2541 add r9, r2, #0x32 2542 add r8, r2, #0x42 2543 add r3, r2, #0xc 2544 vld1.16 {d0[3]}, [r1, :16] 2545 vld1.16 {d2[3]}, [r9, :16] 2546 vld1.16 {d4[3]}, [r8, :16] 2547 vld1.16 {d6[3]}, [r3, :16] 2548 add r1, r2, #0x12 2549 add r9, r2, #0x24 2550 add r8, r2, #0x50 2551 add r3, r2, #0xe 2552 vld1.16 {d1[0]}, [r1, :16] 2553 vld1.16 {d3[0]}, [r9, :16] 2554 vld1.16 {d5[0]}, [r8, :16] 2555 vld1.16 {d7[0]}, [r3, :16] 2556 add r1, r2, #0x4 2557 add r9, r2, #0x16 2558 add r8, r2, #0x60 2559 add r3, r2, #0x1c 2560 vld1.16 {d1[1]}, [r1, :16] 2561 vld1.16 {d3[1]}, [r9, :16] 2562 vld1.16 {d5[1]}, [r8, :16] 2563 vld1.16 {d7[1]}, [r3, :16] 2564 add r1, r2, #0x6 2565 add r9, r2, #0x8 2566 add r8, r2, #0x52 2567 add r3, r2, #0x2a 2568 vld1.16 {d1[2]}, [r1, :16] 2569 vld1.16 {d3[2]}, [r9, :16] 2570 vld1.16 {d5[2]}, [r8, :16] 2571 vld1.16 {d7[2]}, [r3, :16] 2572 add r1, r2, #0x14 2573 add r9, r2, #0xa 2574 add r8, r2, #0x44 2575 add r3, r2, #0x38 2576 vld1.16 {d1[3]}, [r1, :16] 2577 vld1.16 {d3[3]}, [r9, :16] 2578 vld1.16 {d5[3]}, [r8, :16] 2579 vld1.16 {d7[3]}, [r3, :16] 2580 vcgt.s16 q8, q8, q0 2581 vcgt.s16 q9, q9, q1 2582 vcgt.s16 q10, q10, q2 2583 vcgt.s16 q11, q11, q3 2584 vabs.s16 q0, q0 2585 vabs.s16 q1, q1 2586 vabs.s16 q2, q2 2587 vabs.s16 q3, q3 2588 veor q8, q8, q0 2589 veor q9, q9, q1 2590 veor q10, q10, q2 2591 veor q11, q11, q3 2592 add r9, r4, #0x20 2593 add r8, r4, #0x80 2594 add r3, r4, #0xa0 2595 vclz.i16 q0, q0 2596 vclz.i16 q1, q1 2597 vclz.i16 q2, q2 2598 vclz.i16 q3, q3 2599 vsub.i16 q0, q14, q0 2600 vsub.i16 q1, q14, q1 2601 vsub.i16 q2, q14, q2 2602 vsub.i16 q3, q14, q3 2603 vst1.16 {d0, d1, d2, d3}, [r4, :256] 2604 vst1.16 {d4, d5, d6, d7}, [r9, :256] 2605 vshl.s16 q0, q15, q0 2606 vshl.s16 q1, q15, q1 2607 vshl.s16 q2, q15, q2 2608 vshl.s16 q3, q15, q3 2609 vsub.i16 q0, q0, q15 2610 vsub.i16 q1, q1, q15 2611 vsub.i16 q2, q2, q15 2612 vsub.i16 q3, q3, q15 2613 vand q8, q8, q0 2614 vand q9, q9, q1 2615 vand q10, q10, q2 2616 vand q11, q11, q3 2617 vst1.16 {d16, d17, d18, d19}, [r8, :256] 2618 vst1.16 {d20, d21, d22, d23}, [r3, :256] 2619 add r1, r2, #0x46 2620 add r9, r2, #0x3a 2621 add r8, r2, #0x74 2622 add r3, r2, #0x6a 2623 vld1.16 {d8[0]}, [r1, :16] 2624 vld1.16 {d10[0]}, [r9, :16] 2625 vld1.16 {d12[0]}, [r8, :16] 2626 vld1.16 {d14[0]}, [r3, :16] 2627 veor q8, q8, q8 
2628 veor q9, q9, q9 2629 veor q10, q10, q10 2630 veor q11, q11, q11 2631 add r1, r2, #0x54 2632 add r9, r2, #0x2c 2633 add r8, r2, #0x76 2634 add r3, r2, #0x78 2635 vld1.16 {d8[1]}, [r1, :16] 2636 vld1.16 {d10[1]}, [r9, :16] 2637 vld1.16 {d12[1]}, [r8, :16] 2638 vld1.16 {d14[1]}, [r3, :16] 2639 add r1, r2, #0x62 2640 add r9, r2, #0x1e 2641 add r8, r2, #0x68 2642 add r3, r2, #0x7a 2643 vld1.16 {d8[2]}, [r1, :16] 2644 vld1.16 {d10[2]}, [r9, :16] 2645 vld1.16 {d12[2]}, [r8, :16] 2646 vld1.16 {d14[2]}, [r3, :16] 2647 add r1, r2, #0x70 2648 add r9, r2, #0x2e 2649 add r8, r2, #0x5a 2650 add r3, r2, #0x6c 2651 vld1.16 {d8[3]}, [r1, :16] 2652 vld1.16 {d10[3]}, [r9, :16] 2653 vld1.16 {d12[3]}, [r8, :16] 2654 vld1.16 {d14[3]}, [r3, :16] 2655 add r1, r2, #0x72 2656 add r9, r2, #0x3c 2657 add r8, r2, #0x4c 2658 add r3, r2, #0x5e 2659 vld1.16 {d9[0]}, [r1, :16] 2660 vld1.16 {d11[0]}, [r9, :16] 2661 vld1.16 {d13[0]}, [r8, :16] 2662 vld1.16 {d15[0]}, [r3, :16] 2663 add r1, r2, #0x64 2664 add r9, r2, #0x4a 2665 add r8, r2, #0x3e 2666 add r3, r2, #0x6e 2667 vld1.16 {d9[1]}, [r1, :16] 2668 vld1.16 {d11[1]}, [r9, :16] 2669 vld1.16 {d13[1]}, [r8, :16] 2670 vld1.16 {d15[1]}, [r3, :16] 2671 add r1, r2, #0x56 2672 add r9, r2, #0x58 2673 add r8, r2, #0x4e 2674 add r3, r2, #0x7c 2675 vld1.16 {d9[2]}, [r1, :16] 2676 vld1.16 {d11[2]}, [r9, :16] 2677 vld1.16 {d13[2]}, [r8, :16] 2678 vld1.16 {d15[2]}, [r3, :16] 2679 add r1, r2, #0x48 2680 add r9, r2, #0x66 2681 add r8, r2, #0x5c 2682 add r3, r2, #0x7e 2683 vld1.16 {d9[3]}, [r1, :16] 2684 vld1.16 {d11[3]}, [r9, :16] 2685 vld1.16 {d13[3]}, [r8, :16] 2686 vld1.16 {d15[3]}, [r3, :16] 2687 vcgt.s16 q8, q8, q4 2688 vcgt.s16 q9, q9, q5 2689 vcgt.s16 q10, q10, q6 2690 vcgt.s16 q11, q11, q7 2691 vabs.s16 q4, q4 2692 vabs.s16 q5, q5 2693 vabs.s16 q6, q6 2694 vabs.s16 q7, q7 2695 veor q8, q8, q4 2696 veor q9, q9, q5 2697 veor q10, q10, q6 2698 veor q11, q11, q7 2699 add r1, r4, #0x40 2700 add r9, r4, #0x60 2701 add r8, r4, #0xc0 2702 add r3, r4, #0xe0 2703 vclz.i16 q4, q4 2704 vclz.i16 q5, q5 2705 vclz.i16 q6, q6 2706 vclz.i16 q7, q7 2707 vsub.i16 q4, q14, q4 2708 vsub.i16 q5, q14, q5 2709 vsub.i16 q6, q14, q6 2710 vsub.i16 q7, q14, q7 2711 vst1.16 {d8, d9, d10, d11}, [r1, :256] 2712 vst1.16 {d12, d13, d14, d15}, [r9, :256] 2713 vshl.s16 q4, q15, q4 2714 vshl.s16 q5, q15, q5 2715 vshl.s16 q6, q15, q6 2716 vshl.s16 q7, q15, q7 2717 vsub.i16 q4, q4, q15 2718 vsub.i16 q5, q5, q15 2719 vsub.i16 q6, q6, q15 2720 vsub.i16 q7, q7, q15 2721 vand q8, q8, q4 2722 vand q9, q9, q5 2723 vand q10, q10, q6 2724 vand q11, q11, q7 2725 vst1.16 {d16, d17, d18, d19}, [r8, :256] 2726 vst1.16 {d20, d21, d22, d23}, [r3, :256] 2727 ldr r12, [r7, #0xc] /* r12 = actbl */ 2728 add r1, lr, #0x400 /* r1 = dctbl->ehufsi */ 2729 mov r9, r12 /* r9 = actbl */ 2730 add r6, r4, #0x80 /* r6 = t2 */ 2731 ldr r11, [r0, #0x8] /* r11 = put_buffer */ 2732 ldr r4, [r0, #0xc] /* r4 = put_bits */ 2733 ldrh r2, [r6, #-128] /* r2 = nbits */ 2734 ldrh r3, [r6] /* r3 = temp2 & (((JLONG) 1)<<nbits) - 1; */ 2735 ldr r0, [lr, r2, lsl #2] 2736 ldrb r5, [r1, r2] 2737 put_bits r11, r4, r0, r5 2738 checkbuf15 r10, r11, r4, r5, r0 2739 put_bits r11, r4, r3, r2 2740 checkbuf15 r10, r11, r4, r5, r0 2741 mov lr, r6 /* lr = t2 */ 2742 add r5, r9, #0x400 /* r5 = actbl->ehufsi */ 2743 ldrsb r6, [r5, #0xf0] /* r6 = actbl->ehufsi[0xf0] */ 2744 veor q8, q8, q8 2745 vceq.i16 q0, q0, q8 2746 vceq.i16 q1, q1, q8 2747 vceq.i16 q2, q2, q8 2748 vceq.i16 q3, q3, q8 2749 vceq.i16 q4, q4, q8 2750 vceq.i16 q5, q5, q8 2751 vceq.i16 q6, q6, q8 2752 
vceq.i16 q7, q7, q8 2753 vmovn.i16 d0, q0 2754 vmovn.i16 d2, q1 2755 vmovn.i16 d4, q2 2756 vmovn.i16 d6, q3 2757 vmovn.i16 d8, q4 2758 vmovn.i16 d10, q5 2759 vmovn.i16 d12, q6 2760 vmovn.i16 d14, q7 2761 vand d0, d0, d26 2762 vand d2, d2, d26 2763 vand d4, d4, d26 2764 vand d6, d6, d26 2765 vand d8, d8, d26 2766 vand d10, d10, d26 2767 vand d12, d12, d26 2768 vand d14, d14, d26 2769 vpadd.i8 d0, d0, d2 2770 vpadd.i8 d4, d4, d6 2771 vpadd.i8 d8, d8, d10 2772 vpadd.i8 d12, d12, d14 2773 vpadd.i8 d0, d0, d4 2774 vpadd.i8 d8, d8, d12 2775 vpadd.i8 d0, d0, d8 2776 vmov.32 r1, d0[1] 2777 vmov.32 r8, d0[0] 2778 mvn r1, r1 2779 mvn r8, r8 2780 lsrs r1, r1, #0x1 2781 rrx r8, r8 /* shift in last r1 bit while shifting out DC bit */ 2782 rbit r1, r1 /* r1 = index1 */ 2783 rbit r8, r8 /* r8 = index0 */ 2784 ldr r0, [r9, #0x3c0] /* r0 = actbl->ehufco[0xf0] */ 2785 str r1, [sp, #0x14] /* index1 > sp + 0x14 */ 2786 cmp r8, #0x0 2787 beq 6f 27881: 2789 clz r2, r8 2790 add lr, lr, r2, lsl #1 2791 lsl r8, r8, r2 2792 ldrh r1, [lr, #-126] 27932: 2794 cmp r2, #0x10 2795 blt 3f 2796 sub r2, r2, #0x10 2797 put_bits r11, r4, r0, r6 2798 cmp r4, #0x10 2799 blt 2b 2800 eor r3, r3, r3 2801 emit_byte r10, r11, r4, r3, r12 2802 emit_byte r10, r11, r4, r3, r12 2803 b 2b 28043: 2805 add r2, r1, r2, lsl #4 2806 ldrh r3, [lr, #2]! 2807 ldr r12, [r9, r2, lsl #2] 2808 ldrb r2, [r5, r2] 2809 put_bits r11, r4, r12, r2 2810 checkbuf15 r10, r11, r4, r2, r12 2811 put_bits r11, r4, r3, r1 2812 checkbuf15 r10, r11, r4, r2, r12 2813 lsls r8, r8, #0x1 2814 bne 1b 28156: 2816 add r12, sp, #0x20 /* r12 = t1 */ 2817 ldr r8, [sp, #0x14] /* r8 = index1 */ 2818 adds r12, #0xc0 /* r12 = t2 + (DCTSIZE2/2) */ 2819 cmp r8, #0x0 2820 beq 6f 2821 clz r2, r8 2822 sub r12, r12, lr 2823 lsl r8, r8, r2 2824 add r2, r2, r12, lsr #1 2825 add lr, lr, r2, lsl #1 2826 b 7f 28271: 2828 clz r2, r8 2829 add lr, lr, r2, lsl #1 2830 lsl r8, r8, r2 28317: 2832 ldrh r1, [lr, #-126] 28332: 2834 cmp r2, #0x10 2835 blt 3f 2836 sub r2, r2, #0x10 2837 put_bits r11, r4, r0, r6 2838 cmp r4, #0x10 2839 blt 2b 2840 eor r3, r3, r3 2841 emit_byte r10, r11, r4, r3, r12 2842 emit_byte r10, r11, r4, r3, r12 2843 b 2b 28443: 2845 add r2, r1, r2, lsl #4 2846 ldrh r3, [lr, #2]! 2847 ldr r12, [r9, r2, lsl #2] 2848 ldrb r2, [r5, r2] 2849 put_bits r11, r4, r12, r2 2850 checkbuf15 r10, r11, r4, r2, r12 2851 put_bits r11, r4, r3, r1 2852 checkbuf15 r10, r11, r4, r2, r12 2853 lsls r8, r8, #0x1 2854 bne 1b 28556: 2856 add r0, sp, #0x20 2857 add r0, #0xfe 2858 cmp lr, r0 2859 bhs 1f 2860 ldr r1, [r9] 2861 ldrb r0, [r5] 2862 put_bits r11, r4, r1, r0 2863 checkbuf15 r10, r11, r4, r0, r1 28641: 2865 ldr r12, [sp, #0x18] 2866 str r11, [r12, #0x8] 2867 str r4, [r12, #0xc] 2868 add r0, r10, #0x1 2869 add r4, sp, #0x140 2870 vld1.64 {d8, d9, d10, d11}, [r4, :128]! 2871 vld1.64 {d12, d13, d14, d15}, [r4, :128] 2872 sub r4, r7, #0x1c 2873 mov sp, r4 2874 pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} 2875 2876.purgem emit_byte 2877.purgem put_bits 2878.purgem checkbuf15 2879
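/*
 * For reference: a minimal C sketch of the bit-buffer handling performed by
 * the put_bits / checkbuf15 / emit_byte macros above.  The helper name
 * emit_bits and its exact signature are illustrative only; put_buffer,
 * put_bits and buffer correspond to the registers documented in the code.
 *
 *   static uint8_t *emit_bits(uint32_t *put_buffer, int *put_bits,
 *                             uint8_t *buffer, uint32_t code, int size)
 *   {
 *     // put_bits: append 'size' bits of 'code' at the bottom of the buffer
 *     *put_buffer = (*put_buffer << size) | code;
 *     *put_bits += size;
 *     // checkbuf15: once at least 16 bits are pending, flush two bytes,
 *     // MSB first; emit_byte writes a 0x00 stuff byte after any 0xFF
 *     // (JPEG marker escaping)
 *     while (*put_bits >= 16) {
 *       uint8_t b;
 *       *put_bits -= 8;
 *       b = (uint8_t)(*put_buffer >> *put_bits);
 *       *++buffer = b;               // pre-increment store, as in STRB [...]!
 *       if (b == 0xFF)
 *         *++buffer = 0;
 *     }
 *     return buffer;
 *   }
 *
 * As in the assembly, the caller passes 'buffer' pre-decremented by one, so
 * after each write the pointer refers to the last byte actually emitted.
 */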