/*
 * ARMv8 NEON optimizations for libjpeg-turbo
 *
 * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
 *                          All Rights Reserved.
 * Author:  Siarhei Siamashka <siarhei.siamashka@nokia.com>
 * Copyright (C) 2013-2014, Linaro Limited.  All Rights Reserved.
 * Author:  Ragesh Radhakrishnan <ragesh.r@linaro.org>
 * Copyright (C) 2014-2016, D. R. Commander.  All Rights Reserved.
 * Copyright (C) 2015-2016, 2018, Matthieu Darbois.  All Rights Reserved.
 * Copyright (C) 2016, Siarhei Siamashka.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
#endif

#if defined(__APPLE__)
.section __DATA, __const
#else
.section .rodata, "a", %progbits
#endif

#define F_0_298  2446   /* FIX(0.298631336) */
#define F_0_390  3196   /* FIX(0.390180644) */
#define F_0_541  4433   /* FIX(0.541196100) */
#define F_0_765  6270   /* FIX(0.765366865) */
#define F_0_899  7373   /* FIX(0.899976223) */
#define F_1_175  9633   /* FIX(1.175875602) */
#define F_1_501  12299  /* FIX(1.501321110) */
#define F_1_847  15137  /* FIX(1.847759065) */
#define F_1_961  16069  /* FIX(1.961570560) */
#define F_2_053  16819  /* FIX(2.053119869) */
#define F_2_562  20995  /* FIX(2.562915447) */
#define F_3_072  25172  /* FIX(3.072711026) */

.balign 16
Ljsimd_idct_islow_neon_consts:
    .short F_0_298
    .short -F_0_390
    .short F_0_541
    .short F_0_765
    .short -F_0_899
    .short F_1_175
    .short F_1_501
    .short -F_1_847
    .short -F_1_961
    .short F_2_053
    .short -F_2_562
    .short F_3_072
    .short 0  /* padding */
    .short 0
    .short 0
    .short 0

#undef F_0_298
#undef F_0_390
#undef F_0_541
#undef F_0_765
#undef F_0_899
#undef F_1_175
#undef F_1_501
#undef F_1_847
#undef F_1_961
#undef F_2_053
#undef F_2_562
#undef F_3_072


#define XFIX_1_082392200 v0.h[0]
#define XFIX_1_414213562 v0.h[1]
#define XFIX_1_847759065 v0.h[2]
#define XFIX_2_613125930 v0.h[3]

.balign 16
Ljsimd_idct_ifast_neon_consts:
    .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
    .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
    .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
    .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */
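
/*
 * Note on the fixed-point encodings above (explanatory summary; not part of
 * the original algorithm sources):
 *
 *   The islow-style constants use FIX(x) = round(x * 2^13), matching
 *   CONST_BITS = 13 in jidctint.c.  Worked example:
 *
 *     FIX(0.298631336) = 0.298631336 * 8192 = 2446.4...  ->  2446
 *
 *   The ifast-style constants are pre-scaled for SQDMULH, which computes
 *   (a * b * 2) >> 16.  Multipliers >= 1.0 are stored with their integer
 *   part subtracted, because SQDMULH can only apply factors below 1:
 *
 *     round(1.082392200 * 256) * 128 - 256 * 128 = 2688
 *     sqdmulh(a, 2688) = a * 2 * 2688 / 65536 ~= a * 0.0824
 *
 *   and the missing "+ a" (or "+ 2 * a") is restored with ordinary adds.
 */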
#define CONST_BITS 13
#define PASS1_BITS 2

#define FIX_0_211164243 (1730)   /* FIX(0.211164243) */
#define FIX_0_509795579 (4176)   /* FIX(0.509795579) */
#define FIX_0_601344887 (4926)   /* FIX(0.601344887) */
#define FIX_0_720959822 (5906)   /* FIX(0.720959822) */
#define FIX_0_765366865 (6270)   /* FIX(0.765366865) */
#define FIX_0_850430095 (6967)   /* FIX(0.850430095) */
#define FIX_0_899976223 (7373)   /* FIX(0.899976223) */
#define FIX_1_061594337 (8697)   /* FIX(1.061594337) */
#define FIX_1_272758580 (10426)  /* FIX(1.272758580) */
#define FIX_1_451774981 (11893)  /* FIX(1.451774981) */
#define FIX_1_847759065 (15137)  /* FIX(1.847759065) */
#define FIX_2_172734803 (17799)  /* FIX(2.172734803) */
#define FIX_2_562915447 (20995)  /* FIX(2.562915447) */
#define FIX_3_624509785 (29692)  /* FIX(3.624509785) */

.balign 16
Ljsimd_idct_4x4_neon_consts:
    .short FIX_1_847759065        /* v0.h[0] */
    .short -FIX_0_765366865       /* v0.h[1] */
    .short -FIX_0_211164243       /* v0.h[2] */
    .short FIX_1_451774981        /* v0.h[3] */
    .short -FIX_2_172734803       /* v1.h[0] */
    .short FIX_1_061594337        /* v1.h[1] */
    .short -FIX_0_509795579       /* v1.h[2] */
    .short -FIX_0_601344887       /* v1.h[3] */
    .short FIX_0_899976223        /* v2.h[0] */
    .short FIX_2_562915447        /* v2.h[1] */
    .short 1 << (CONST_BITS + 1)  /* v2.h[2] */
    .short 0                      /* v2.h[3] */

.balign 8
Ljsimd_idct_2x2_neon_consts:
    .short -FIX_0_720959822  /* v14.h[0] */
    .short FIX_0_850430095   /* v14.h[1] */
    .short -FIX_1_272758580  /* v14.h[2] */
    .short FIX_3_624509785   /* v14.h[3] */

.balign 16
Ljsimd_ycc_colorid_neon_consts:
    .short 0,      0,      0,      0
    .short 22971, -11277, -23401,  29033
    .short -128,  -128,   -128,   -128
    .short -128,  -128,   -128,   -128

.balign 16
Ljsimd_colorid_ycc_neon_consts:
    .short 19595,  38470,  7471,   11059
    .short 21709,  32768,  27439,  5329
    .short 32767,  128,    32767,  128
    .short 32767,  128,    32767,  128

#define F_0_298  2446   /* FIX(0.298631336) */
#define F_0_390  3196   /* FIX(0.390180644) */
#define F_0_541  4433   /* FIX(0.541196100) */
#define F_0_765  6270   /* FIX(0.765366865) */
#define F_0_899  7373   /* FIX(0.899976223) */
#define F_1_175  9633   /* FIX(1.175875602) */
#define F_1_501  12299  /* FIX(1.501321110) */
#define F_1_847  15137  /* FIX(1.847759065) */
#define F_1_961  16069  /* FIX(1.961570560) */
#define F_2_053  16819  /* FIX(2.053119869) */
#define F_2_562  20995  /* FIX(2.562915447) */
#define F_3_072  25172  /* FIX(3.072711026) */

.balign 16
Ljsimd_fdct_islow_neon_consts:
    .short F_0_298
    .short -F_0_390
    .short F_0_541
    .short F_0_765
    .short -F_0_899
    .short F_1_175
    .short F_1_501
    .short -F_1_847
    .short -F_1_961
    .short F_2_053
    .short -F_2_562
    .short F_3_072
    .short 0  /* padding */
    .short 0
    .short 0
    .short 0

#undef F_0_298
#undef F_0_390
#undef F_0_541
#undef F_0_765
#undef F_0_899
#undef F_1_175
#undef F_1_501
#undef F_1_847
#undef F_1_961
#undef F_2_053
#undef F_2_562
#undef F_3_072

.balign 16
Ljsimd_fdct_ifast_neon_consts:
    .short (98 * 128)               /* XFIX_0_382683433 */
    .short (139 * 128)              /* XFIX_0_541196100 */
    .short (181 * 128)              /* XFIX_0_707106781 */
    .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
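
/*
 * The table that follows supplies TBL lane indices for the h2 downsampling
 * routines.  (Explanatory note; an interpretation based on the index
 * patterns, not taken from the original sources:)  Row "diff N" handles a
 * row whose last 16-byte chunk contains only 16 - N valid pixels.  Its
 * final N indices are clamped to 15 - N, so TBL replicates the last valid
 * column instead of reading past the edge of the row.
 */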
.balign 16
Ljsimd_h2_downsample_neon_consts:
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
          0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F  /* diff 0 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
          0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E  /* diff 1 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
          0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D  /* diff 2 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
          0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C  /* diff 3 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
          0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B  /* diff 4 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
          0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A  /* diff 5 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
          0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09  /* diff 6 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
          0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08  /* diff 7 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
          0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07  /* diff 8 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
          0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06  /* diff 9 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
          0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05  /* diff 10 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
          0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04  /* diff 11 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
          0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03  /* diff 12 */
    .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
          0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02  /* diff 13 */
    .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
          0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01  /* diff 14 */
    .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* diff 15 */
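
/*
 * The tables below are used by jsimd_huff_encode_one_block_neon.
 * (Explanatory note; an interpretation of the tables, not taken from the
 * original sources:)  The 0x01..0x80 rows are per-lane bit masks used to
 * build the bitmap of nonzero coefficients.  The remaining rows are TBL
 * shuffle indices that gather coefficient bytes into zigzag order; index
 * 255 is out of range, so TBL writes zero to that lane.  The
 * "Lx => Ly : n lines OK" comments record which rows of the 8x8 block each
 * shuffle serves.
 */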
Ljsimd_huff_encode_one_block_neon_consts:
    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
    .byte 0,    1,    2,    3,    16,   17,   32,   33, \
          18,   19,   4,    5,    6,    7,    20,   21    /* L0 => L3 : 4 lines OK */
    .byte 34,   35,   48,   49,   255,  255,  50,   51, \
          36,   37,   22,   23,   8,    9,    10,   11    /* L0 => L3 : 4 lines OK */
    .byte 8,    9,    22,   23,   36,   37,   50,   51, \
          255,  255,  255,  255,  255,  255,  52,   53    /* L1 => L4 : 4 lines OK */
    .byte 54,   55,   40,   41,   26,   27,   12,   13, \
          14,   15,   28,   29,   42,   43,   56,   57    /* L0 => L3 : 4 lines OK */
    .byte 6,    7,    20,   21,   34,   35,   48,   49, \
          50,   51,   36,   37,   22,   23,   8,    9     /* L4 => L7 : 4 lines OK */
    .byte 42,   43,   28,   29,   14,   15,   30,   31, \
          44,   45,   58,   59,   255,  255,  255,  255   /* L1 => L4 : 4 lines OK */
    .byte 255,  255,  255,  255,  56,   57,   42,   43, \
          28,   29,   14,   15,   30,   31,   44,   45    /* L3 => L6 : 4 lines OK */
    .byte 26,   27,   40,   41,   42,   43,   28,   29, \
          14,   15,   30,   31,   44,   45,   46,   47    /* L5 => L7 : 3 lines OK */
    .byte 255,  255,  255,  255,  0,    1,    255,  255, \
          255,  255,  255,  255,  255,  255,  255,  255   /* L4 : 1 line OK */
    .byte 255,  255,  255,  255,  255,  255,  255,  255, \
          0,    1,    16,   17,   2,    3,    255,  255   /* L5 => L6 : 2 lines OK */
    .byte 255,  255,  255,  255,  255,  255,  255,  255, \
          255,  255,  255,  255,  8,    9,    22,   23    /* L5 => L6 : 2 lines OK */
    .byte 4,    5,    6,    7,    255,  255,  255,  255, \
          255,  255,  255,  255,  255,  255,  255,  255   /* L7 : 1 line OK */
Ljsimd_huff_encode_one_block_neon_slowtbl_consts:
    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80

.text


#define RESPECT_STRICT_ALIGNMENT 1


/*****************************************************************************/

/* Supplementary macro for setting function attributes */
.macro asm_function fname
#ifdef __APPLE__
    .private_extern _\fname
    .globl _\fname
_\fname:
#else
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm

.macro get_symbol_loc xi, symbol
#ifdef __APPLE__
    adrp \xi, \symbol@PAGE
    add \xi, \xi, \symbol@PAGEOFF
#else
    adrp \xi, \symbol
    add \xi, \xi, :lo12:\symbol
#endif
.endm

/* Transpose elements of a single 128-bit register */
.macro transpose_single x0, x1, xi, xilen, literal
    ins \xi\xilen[0], \x0\xilen[0]
    ins \x1\xilen[0], \x0\xilen[1]
    trn1 \x0\literal, \x0\literal, \x1\literal
    trn2 \x1\literal, \xi\literal, \x1\literal
.endm

/* Transpose elements of 2 different registers */
.macro transpose x0, x1, xi, xilen, literal
    mov \xi\xilen, \x0\xilen
    trn1 \x0\literal, \x0\literal, \x1\literal
    trn2 \x1\literal, \xi\literal, \x1\literal
.endm

/* Transpose a block of 4x4 coefficients in four 64-bit registers */
.macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
    mov \xi\xilen, \x0\xilen
    trn1 \x0\x0len, \x0\x0len, \x2\x2len
    trn2 \x2\x2len, \xi\x0len, \x2\x2len
    mov \xi\xilen, \x1\xilen
    trn1 \x1\x1len, \x1\x1len, \x3\x3len
    trn2 \x3\x3len, \xi\x1len, \x3\x3len
.endm

.macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
    mov \xi\xilen, \x0\xilen
    trn1 \x0\x0len, \x0\x0len, \x1\x1len
    trn2 \x1\x2len, \xi\x0len, \x1\x2len
    mov \xi\xilen, \x2\xilen
    trn1 \x2\x2len, \x2\x2len, \x3\x3len
    trn2 \x3\x2len, \xi\x1len, \x3\x3len
.endm

.macro transpose_4x4 x0, x1, x2, x3, x5
    transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b
    transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b
.endm

.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
    trn1 \t0\().8h, \l0\().8h, \l1\().8h
    trn1 \t1\().8h, \l2\().8h, \l3\().8h
    trn1 \t2\().8h, \l4\().8h, \l5\().8h
    trn1 \t3\().8h, \l6\().8h, \l7\().8h
    trn2 \l1\().8h, \l0\().8h, \l1\().8h
    trn2 \l3\().8h, \l2\().8h, \l3\().8h
    trn2 \l5\().8h, \l4\().8h, \l5\().8h
    trn2 \l7\().8h, \l6\().8h, \l7\().8h

    trn1 \l4\().4s, \t2\().4s, \t3\().4s
    trn2 \t3\().4s, \t2\().4s, \t3\().4s
    trn1 \t2\().4s, \t0\().4s, \t1\().4s
    trn2 \l2\().4s, \t0\().4s, \t1\().4s
    trn1 \t0\().4s, \l1\().4s, \l3\().4s
    trn2 \l3\().4s, \l1\().4s, \l3\().4s
    trn2 \t1\().4s, \l5\().4s, \l7\().4s
    trn1 \l5\().4s, \l5\().4s, \l7\().4s

    trn2 \l6\().2d, \l2\().2d, \t3\().2d
    trn1 \l0\().2d, \t2\().2d, \l4\().2d
    trn1 \l1\().2d, \t0\().2d, \l5\().2d
    trn2 \l7\().2d, \l3\().2d, \t1\().2d
    trn1 \l2\().2d, \l2\().2d, \t3\().2d
    trn2 \l4\().2d, \t2\().2d, \l4\().2d
    trn1 \l3\().2d, \l3\().2d, \t1\().2d
    trn2 \l5\().2d, \t0\().2d, \l5\().2d
.endm
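
/*
 * (Explanatory note, not in the original sources:)  transpose_8x8 performs a
 * full 8x8 transpose of 16-bit elements in three butterfly stages, working
 * at successively coarser granularity:
 *
 *   stage 1: trn1/trn2 on .8h -- interleave 16-bit lanes of row pairs
 *   stage 2: trn1/trn2 on .4s -- interleave adjacent 32-bit lane pairs
 *   stage 3: trn1/trn2 on .2d -- exchange 64-bit halves
 *
 * After the three stages, register \ln holds what was column n of the
 * original block.
 */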

#define CENTERJSAMPLE 128

/*****************************************************************************/

/*
 * Perform dequantization and inverse DCT on one block of coefficients.
 *
 * GLOBAL(void)
 * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
 *                       JSAMPARRAY output_buf, JDIMENSION output_col)
 */

#define CONST_BITS 13
#define PASS1_BITS 2

#define XFIX_P_0_298 v0.h[0]
#define XFIX_N_0_390 v0.h[1]
#define XFIX_P_0_541 v0.h[2]
#define XFIX_P_0_765 v0.h[3]
#define XFIX_N_0_899 v0.h[4]
#define XFIX_P_1_175 v0.h[5]
#define XFIX_P_1_501 v0.h[6]
#define XFIX_N_1_847 v0.h[7]
#define XFIX_N_1_961 v1.h[0]
#define XFIX_P_2_053 v1.h[1]
#define XFIX_N_2_562 v1.h[2]
#define XFIX_P_3_072 v1.h[3]

asm_function jsimd_idct_islow_neon
    DCT_TABLE .req x0
    COEF_BLOCK .req x1
    OUTPUT_BUF .req x2
    OUTPUT_COL .req x3
    TMP1 .req x0
    TMP2 .req x1
    TMP3 .req x9
    TMP4 .req x10
    TMP5 .req x11
    TMP6 .req x12
    TMP7 .req x13
    TMP8 .req x14

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x3, w3

    sub sp, sp, #64
    get_symbol_loc x15, Ljsimd_idct_islow_neon_consts
    mov x10, sp
    st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
    st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
    ld1 {v0.8h, v1.8h}, [x15]
    ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
    ld1 {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
    ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64
    ld1 {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64

    cmeq v16.8h, v3.8h, #0
    cmeq v26.8h, v4.8h, #0
    cmeq v27.8h, v5.8h, #0
    cmeq v28.8h, v6.8h, #0
    cmeq v29.8h, v7.8h, #0
    cmeq v30.8h, v8.8h, #0
    cmeq v31.8h, v9.8h, #0

    and v10.16b, v16.16b, v26.16b
    and v11.16b, v27.16b, v28.16b
    and v12.16b, v29.16b, v30.16b
    and v13.16b, v31.16b, v10.16b
    and v14.16b, v11.16b, v12.16b
    mul v2.8h, v2.8h, v18.8h
    and v15.16b, v13.16b, v14.16b
    shl v10.8h, v2.8h, #(PASS1_BITS)
    sqxtn v16.8b, v15.8h
    mov TMP1, v16.d[0]
    mvn TMP2, TMP1

    cbnz TMP2, 2f
    /* case: all AC coefficients are zero */
    dup v2.2d, v10.d[0]
    dup v6.2d, v10.d[1]
    mov v3.16b, v2.16b
    mov v7.16b, v6.16b
    mov v4.16b, v2.16b
    mov v8.16b, v6.16b
    mov v5.16b, v2.16b
    mov v9.16b, v6.16b
1:
    /* for this transpose, we should organize the data like this:
     * 00, 01, 02, 03, 40, 41, 42, 43
     * 10, 11, 12, 13, 50, 51, 52, 53
     * 20, 21, 22, 23, 60, 61, 62, 63
     * 30, 31, 32, 33, 70, 71, 72, 73
     * 04, 05, 06, 07, 44, 45, 46, 47
     * 14, 15, 16, 17, 54, 55, 56, 57
     * 24, 25, 26, 27, 64, 65, 66, 67
     * 34, 35, 36, 37, 74, 75, 76, 77
     */
    trn1 v28.8h, v2.8h, v3.8h
    trn1 v29.8h, v4.8h, v5.8h
    trn1 v30.8h, v6.8h, v7.8h
    trn1 v31.8h, v8.8h, v9.8h
    trn2 v16.8h, v2.8h, v3.8h
    trn2 v17.8h, v4.8h, v5.8h
    trn2 v18.8h, v6.8h, v7.8h
    trn2 v19.8h, v8.8h, v9.8h
    trn1 v2.4s, v28.4s, v29.4s
    trn1 v6.4s, v30.4s, v31.4s
    trn1 v3.4s, v16.4s, v17.4s
    trn1 v7.4s, v18.4s, v19.4s
    trn2 v4.4s, v28.4s, v29.4s
    trn2 v8.4s, v30.4s, v31.4s
    trn2 v5.4s, v16.4s, v17.4s
    trn2 v9.4s, v18.4s, v19.4s
    /* Even part: reverse the even part of the forward DCT. */
    add v18.8h, v4.8h, v8.8h      /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    add v22.8h, v2.8h, v6.8h      /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull2 v19.4s, v18.8h, XFIX_P_0_541  /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sub v26.8h, v2.8h, v6.8h      /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sshll2 v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    mov v21.16b, v19.16b          /* tmp3 = z1 */
    mov v20.16b, v18.16b          /* tmp3 = z1 */
    smlal2 v19.4s, v8.8h, XFIX_N_1_847   /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    smlal v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    sshll2 v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    smlal2 v21.4s, v4.8h, XFIX_P_0_765   /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    smlal v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    sshll v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    sshll v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    add v2.4s, v22.4s, v20.4s     /* tmp10l tmp10 = tmp0 + tmp3; */
    sub v6.4s, v22.4s, v20.4s     /* tmp13l tmp13 = tmp0 - tmp3; */
    add v8.4s, v26.4s, v18.4s     /* tmp11l tmp11 = tmp1 + tmp2; */
    sub v4.4s, v26.4s, v18.4s     /* tmp12l tmp12 = tmp1 - tmp2; */
    add v28.4s, v23.4s, v21.4s    /* tmp10h tmp10 = tmp0 + tmp3; */
    sub v31.4s, v23.4s, v21.4s    /* tmp13h tmp13 = tmp0 - tmp3; */
    add v29.4s, v27.4s, v19.4s    /* tmp11h tmp11 = tmp1 + tmp2; */
    sub v30.4s, v27.4s, v19.4s    /* tmp12h tmp12 = tmp1 - tmp2; */
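
    /*
     * (Explanatory note, not in the original sources:)  Each 8x16-bit vector
     * is widened to two 4x32-bit halves throughout this function: SMULL/
     * SMLAL handle lanes 0..3 (the .4h view) and SMULL2/SMLAL2 handle lanes
     * 4..7 (the upper half of the .8h view), so most quantities live in a
     * low/high register pair, e.g. v18/v19 above both hold z1.
     */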

    /* Odd part per figure 8; the matrix is unitary and hence its
     * transpose is its inverse.  i0..i3 are y7, y5, y3, y1 respectively.
     */

    add v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */

    smull2 v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull2 v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull2 v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull2 v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull2 v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull2 v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    smull2 v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    smull2 v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    smull2 v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */

    smull v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    smull v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    smull v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    smull v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */

    add v23.4s, v23.4s, v27.4s  /* z3 += z5 */
    add v22.4s, v22.4s, v26.4s  /* z3 += z5 */
    add v25.4s, v25.4s, v27.4s  /* z4 += z5 */
    add v24.4s, v24.4s, v26.4s  /* z4 += z5 */

    add v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
    add v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
    add v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
    add v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
    add v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
    add v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
    add v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
    add v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */

    add v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
    add v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
    add v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
    add v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
    add v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
    add v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
    add v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
    add v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */

    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

    add v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
    add v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
    sub v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
    sub v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
    add v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
    add v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
    sub v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
    sub v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
    add v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
    add v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
    sub v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
    sub v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
    add v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
    add v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
    sub v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
    sub v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */

    shrn v2.4h, v18.4s, #16   /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
    shrn v9.4h, v20.4s, #16   /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
    shrn v3.4h, v22.4s, #16   /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
    shrn v8.4h, v24.4s, #16   /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
    shrn v4.4h, v26.4s, #16   /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
    shrn v7.4h, v28.4s, #16   /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
    shrn v5.4h, v14.4s, #16   /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
    shrn v6.4h, v16.4s, #16   /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
    shrn2 v2.8h, v19.4s, #16  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
    shrn2 v9.8h, v21.4s, #16  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
    shrn2 v3.8h, v23.4s, #16  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
    shrn2 v8.8h, v25.4s, #16  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
    shrn2 v4.8h, v27.4s, #16  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
    shrn2 v7.8h, v29.4s, #16  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
    shrn2 v5.8h, v15.4s, #16  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
    shrn2 v6.8h, v17.4s, #16  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
    movi v0.16b, #(CENTERJSAMPLE)
    /* Prepare pointers (dual-issue with NEON instructions) */
    ldp TMP1, TMP2, [OUTPUT_BUF], 16
    sqrshrn v28.8b, v2.8h, #(CONST_BITS+PASS1_BITS+3-16)
    ldp TMP3, TMP4, [OUTPUT_BUF], 16
    sqrshrn v29.8b, v3.8h, #(CONST_BITS+PASS1_BITS+3-16)
    add TMP1, TMP1, OUTPUT_COL
    sqrshrn v30.8b, v4.8h, #(CONST_BITS+PASS1_BITS+3-16)
    add TMP2, TMP2, OUTPUT_COL
    sqrshrn v31.8b, v5.8h, #(CONST_BITS+PASS1_BITS+3-16)
    add TMP3, TMP3, OUTPUT_COL
    sqrshrn2 v28.16b, v6.8h, #(CONST_BITS+PASS1_BITS+3-16)
    add TMP4, TMP4, OUTPUT_COL
    sqrshrn2 v29.16b, v7.8h, #(CONST_BITS+PASS1_BITS+3-16)
    ldp TMP5, TMP6, [OUTPUT_BUF], 16
    sqrshrn2 v30.16b, v8.8h, #(CONST_BITS+PASS1_BITS+3-16)
    ldp TMP7, TMP8, [OUTPUT_BUF], 16
    sqrshrn2 v31.16b, v9.8h, #(CONST_BITS+PASS1_BITS+3-16)
    add TMP5, TMP5, OUTPUT_COL
    add v16.16b, v28.16b, v0.16b
    add TMP6, TMP6, OUTPUT_COL
    add v18.16b, v29.16b, v0.16b
    add TMP7, TMP7, OUTPUT_COL
    add v20.16b, v30.16b, v0.16b
    add TMP8, TMP8, OUTPUT_COL
    add v22.16b, v31.16b, v0.16b

    /* Transpose the final 8-bit samples */
    trn1 v28.16b, v16.16b, v18.16b
    trn1 v30.16b, v20.16b, v22.16b
    trn2 v29.16b, v16.16b, v18.16b
    trn2 v31.16b, v20.16b, v22.16b

    trn1 v16.8h, v28.8h, v30.8h
    trn2 v18.8h, v28.8h, v30.8h
    trn1 v20.8h, v29.8h, v31.8h
    trn2 v22.8h, v29.8h, v31.8h

    uzp1 v28.4s, v16.4s, v18.4s
    uzp2 v30.4s, v16.4s, v18.4s
    uzp1 v29.4s, v20.4s, v22.4s
    uzp2 v31.4s, v20.4s, v22.4s

    /* Store results to the output buffer */
    st1 {v28.d}[0], [TMP1]
    st1 {v29.d}[0], [TMP2]
    st1 {v28.d}[1], [TMP3]
    st1 {v29.d}[1], [TMP4]
    st1 {v30.d}[0], [TMP5]
    st1 {v31.d}[0], [TMP6]
    st1 {v30.d}[1], [TMP7]
    st1 {v31.d}[1], [TMP8]
    ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
    ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
    blr x30

.balign 16
2:
    mul v3.8h, v3.8h, v19.8h
    mul v4.8h, v4.8h, v20.8h
    mul v5.8h, v5.8h, v21.8h
    add TMP4, xzr, TMP2, LSL #32
    mul v6.8h, v6.8h, v22.8h
    mul v7.8h, v7.8h, v23.8h
    adds TMP3, xzr, TMP2, LSR #32
    mul v8.8h, v8.8h, v24.8h
    mul v9.8h, v9.8h, v25.8h
    b.ne 3f
    /* Right AC coef is zero */
    dup v15.2d, v10.d[1]
    /* Even part: reverse the even part of the forward DCT. */
    add v18.4h, v4.4h, v8.4h    /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    add v22.4h, v2.4h, v6.4h    /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    sub v26.4h, v2.4h, v6.4h    /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sshll v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    mov v20.16b, v18.16b        /* tmp3 = z1 */
    sshll v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    smlal v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    smlal v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    add v2.4s, v22.4s, v20.4s   /* tmp10l tmp10 = tmp0 + tmp3; */
    sub v6.4s, v22.4s, v20.4s   /* tmp13l tmp13 = tmp0 - tmp3; */
    add v8.4s, v26.4s, v18.4s   /* tmp11l tmp11 = tmp1 + tmp2; */
    sub v4.4s, v26.4s, v18.4s   /* tmp12l tmp12 = tmp1 - tmp2; */
688 */ 689 690 add v22.4h, v9.4h, v5.4h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ 691 add v24.4h, v7.4h, v3.4h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ 692 add v18.4h, v9.4h, v3.4h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ 693 add v20.4h, v7.4h, v5.4h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ 694 add v26.4h, v22.4h, v24.4h /* z5 = z3 + z4 */ 695 696 smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ 697 smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ 698 smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ 699 smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ 700 smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ 701 smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */ 702 smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */ 703 smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */ 704 smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */ 705 706 add v22.4s, v22.4s, v26.4s /* z3 += z5 */ 707 add v24.4s, v24.4s, v26.4s /* z4 += z5 */ 708 709 add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */ 710 add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */ 711 add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */ 712 add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */ 713 714 add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */ 715 add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */ 716 add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */ 717 add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */ 718 719 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ 720 721 add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */ 722 sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */ 723 add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */ 724 sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */ 725 add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */ 726 sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */ 727 add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */ 728 sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */ 729 730 rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ 731 rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ 732 rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ 733 rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ 734 rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ 735 rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ 736 rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ 737 rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ 738 mov v6.16b, v15.16b 739 mov v7.16b, v15.16b 740 mov v8.16b, v15.16b 741 mov v9.16b, v15.16b 742 b 1b 743 744.balign 16 7453: 746 cbnz TMP4, 4f 747 /* Left 

.balign 16
3:
    cbnz TMP4, 4f
    /* Left AC coef is zero */
    dup v14.2d, v10.d[0]
    /* Even part: reverse the even part of the forward DCT. */
    add v18.8h, v4.8h, v8.8h    /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    add v22.8h, v2.8h, v6.8h    /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull2 v19.4s, v18.8h, XFIX_P_0_541  /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sub v26.8h, v2.8h, v6.8h    /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    sshll2 v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    mov v21.16b, v19.16b        /* tmp3 = z1 */
    smlal2 v19.4s, v8.8h, XFIX_N_1_847   /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    sshll2 v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    smlal2 v21.4s, v4.8h, XFIX_P_0_765   /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    add v28.4s, v23.4s, v21.4s  /* tmp10h tmp10 = tmp0 + tmp3; */
    sub v31.4s, v23.4s, v21.4s  /* tmp13h tmp13 = tmp0 - tmp3; */
    add v29.4s, v27.4s, v19.4s  /* tmp11h tmp11 = tmp1 + tmp2; */
    sub v30.4s, v27.4s, v19.4s  /* tmp12h tmp12 = tmp1 - tmp2; */

    /* Odd part per figure 8; the matrix is unitary and hence its
     * transpose is its inverse.  i0..i3 are y7, y5, y3, y1 respectively.
     */

    add v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */

    smull2 v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull2 v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull2 v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull2 v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull2 v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull2 v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    smull2 v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    smull2 v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    smull2 v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */

    add v23.4s, v23.4s, v27.4s  /* z3 += z5 */
    add v22.4s, v22.4s, v26.4s  /* z3 += z5 */
    add v25.4s, v25.4s, v27.4s  /* z4 += z5 */
    add v24.4s, v24.4s, v26.4s  /* z4 += z5 */

    add v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
    add v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
    add v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
    add v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */

    add v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
    add v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
    add v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
    add v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */

    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

    add v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
    sub v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
    add v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
    sub v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
    add v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
    sub v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
    add v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
    sub v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */

    mov v2.16b, v14.16b
    mov v3.16b, v14.16b
    mov v4.16b, v14.16b
    mov v5.16b, v14.16b
    rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS)   /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
    rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS)   /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
    rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS)   /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
    rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS)   /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
    rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
    rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
    b 1b

.balign 16
4:
    /* "No" AC coef is zero */
    /* Even part: reverse the even part of the forward DCT. */
    add v18.8h, v4.8h, v8.8h    /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    add v22.8h, v2.8h, v6.8h    /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull2 v19.4s, v18.8h, XFIX_P_0_541  /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sub v26.8h, v2.8h, v6.8h    /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sshll2 v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    mov v21.16b, v19.16b        /* tmp3 = z1 */
    mov v20.16b, v18.16b        /* tmp3 = z1 */
    smlal2 v19.4s, v8.8h, XFIX_N_1_847   /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    smlal v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    sshll2 v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    smlal2 v21.4s, v4.8h, XFIX_P_0_765   /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    smlal v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    sshll v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    sshll v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    add v2.4s, v22.4s, v20.4s   /* tmp10l tmp10 = tmp0 + tmp3; */
    sub v6.4s, v22.4s, v20.4s   /* tmp13l tmp13 = tmp0 - tmp3; */
    add v8.4s, v26.4s, v18.4s   /* tmp11l tmp11 = tmp1 + tmp2; */
    sub v4.4s, v26.4s, v18.4s   /* tmp12l tmp12 = tmp1 - tmp2; */
    add v28.4s, v23.4s, v21.4s  /* tmp10h tmp10 = tmp0 + tmp3; */
    sub v31.4s, v23.4s, v21.4s  /* tmp13h tmp13 = tmp0 - tmp3; */
    add v29.4s, v27.4s, v19.4s  /* tmp11h tmp11 = tmp1 + tmp2; */
    sub v30.4s, v27.4s, v19.4s  /* tmp12h tmp12 = tmp1 - tmp2; */

    /* Odd part per figure 8; the matrix is unitary and hence its
     * transpose is its inverse.  i0..i3 are y7, y5, y3, y1 respectively.
     */

    add v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */

    smull2 v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull2 v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull2 v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull2 v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull2 v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull2 v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    smull2 v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    smull2 v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    smull2 v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */

    smull v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    smull v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    smull v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    smull v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */

    add v23.4s, v23.4s, v27.4s  /* z3 += z5 */
    add v22.4s, v22.4s, v26.4s  /* z3 += z5 */
    add v25.4s, v25.4s, v27.4s  /* z4 += z5 */
    add v24.4s, v24.4s, v26.4s  /* z4 += z5 */

    add v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
    add v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
    add v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
    add v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
    add v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
    add v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
    add v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
    add v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */

    add v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
    add v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
    add v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
    add v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
    add v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
    add v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
    add v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
    add v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */

    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

    add v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
    add v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
    sub v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
    sub v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
    add v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
    add v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
    sub v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
    sub v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
    add v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
    add v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
    sub v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
    sub v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
    add v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
    add v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
    sub v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
    sub v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */

    rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS)   /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
    rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS)   /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
    rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS)   /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
    rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS)   /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
    rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS)   /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
    rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS)   /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
    rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS)   /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
    rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS)   /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
    rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
    rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
    rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
    rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
    rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
    b 1b

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4
    .unreq TMP5
    .unreq TMP6
    .unreq TMP7
    .unreq TMP8

#undef CENTERJSAMPLE
#undef CONST_BITS
#undef PASS1_BITS
#undef XFIX_P_0_298
#undef XFIX_N_0_390
#undef XFIX_P_0_541
#undef XFIX_P_0_765
#undef XFIX_N_0_899
#undef XFIX_P_1_175
#undef XFIX_P_1_501
#undef XFIX_N_1_847
#undef XFIX_N_1_961
#undef XFIX_P_2_053
#undef XFIX_N_2_562
#undef XFIX_P_3_072
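
/*
 * (Explanatory note, not in the original sources:)  The descaling above
 * follows jidctint.c, where DESCALE(x, n) = (x + (1 << (n - 1))) >> n.
 * Pass 1 shifts by CONST_BITS - PASS1_BITS = 11 (the rshrn/rshrn2
 * instructions), while pass 2 needs CONST_BITS + PASS1_BITS + 3 = 18 and is
 * implemented as a truncating shrn #16 followed by a rounding, saturating
 * sqrshrn #2, which also clamps the result to 8 bits before CENTERJSAMPLE
 * is added.
 */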


/*****************************************************************************/

/*
 * jsimd_idct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the inverse DCT (Discrete Cosine Transform).  It uses the same
 * calculations and produces exactly the same output as IJG's original
 * 'jpeg_idct_ifast' function from jidctfst.c.
 *
 * Normally a 1-D AAN DCT needs 5 multiplications and 29 additions, but in
 * the ARM NEON case some extra additions are required because the VQDMULH
 * instruction can't handle constants larger than 1.  Expressions like
 * "x * 1.082392200" therefore have to be converted to
 * "x * 0.082392200 + x", which introduces an extra addition.  Overall,
 * there are 6 extra additions per 1-D IDCT pass, for a total of 5 VQDMULH
 * and 35 VADD/VSUB instructions.
 */

asm_function jsimd_idct_ifast_neon

    DCT_TABLE .req x0
    COEF_BLOCK .req x1
    OUTPUT_BUF .req x2
    OUTPUT_COL .req x3
    TMP1 .req x0
    TMP2 .req x1
    TMP3 .req x9
    TMP4 .req x10
    TMP5 .req x11
    TMP6 .req x12
    TMP7 .req x13
    TMP8 .req x14

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x3, w3

    /* Load and dequantize the coefficients into NEON registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d16    | d17    ( v16.8h )
     *   1  | d18    | d19    ( v17.8h )
     *   2  | d20    | d21    ( v18.8h )
     *   3  | d22    | d23    ( v19.8h )
     *   4  | d24    | d25    ( v20.8h )
     *   5  | d26    | d27    ( v21.8h )
     *   6  | d28    | d29    ( v22.8h )
     *   7  | d30    | d31    ( v23.8h )
     */
    get_symbol_loc TMP5, Ljsimd_idct_ifast_neon_consts
    ld1 {v16.8h, v17.8h}, [COEF_BLOCK], 32
    ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
    ld1 {v18.8h, v19.8h}, [COEF_BLOCK], 32
    mul v16.8h, v16.8h, v0.8h
    ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
    mul v17.8h, v17.8h, v1.8h
    ld1 {v20.8h, v21.8h}, [COEF_BLOCK], 32
    mul v18.8h, v18.8h, v2.8h
    ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
    mul v19.8h, v19.8h, v3.8h
    ld1 {v22.8h, v23.8h}, [COEF_BLOCK], 32
    mul v20.8h, v20.8h, v0.8h
    ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
    mul v22.8h, v22.8h, v2.8h
    mul v21.8h, v21.8h, v1.8h
    ld1 {v0.4h}, [TMP5]         /* load constants */
    mul v23.8h, v23.8h, v3.8h

    /* 1-D IDCT, pass 1 */
    sub v2.8h, v18.8h, v22.8h
    add v22.8h, v18.8h, v22.8h
    sub v1.8h, v19.8h, v21.8h
    add v21.8h, v19.8h, v21.8h
    sub v5.8h, v17.8h, v23.8h
    add v23.8h, v17.8h, v23.8h
    sqdmulh v4.8h, v2.8h, XFIX_1_414213562
    sqdmulh v6.8h, v1.8h, XFIX_2_613125930
    add v3.8h, v1.8h, v1.8h
    sub v1.8h, v5.8h, v1.8h
    add v18.8h, v2.8h, v4.8h
    sqdmulh v4.8h, v1.8h, XFIX_1_847759065
    sub v2.8h, v23.8h, v21.8h
    add v3.8h, v3.8h, v6.8h
    sqdmulh v6.8h, v2.8h, XFIX_1_414213562
    add v1.8h, v1.8h, v4.8h
    sqdmulh v4.8h, v5.8h, XFIX_1_082392200
    sub v18.8h, v18.8h, v22.8h
    add v2.8h, v2.8h, v6.8h
    sub v6.8h, v16.8h, v20.8h
    add v20.8h, v16.8h, v20.8h
    add v17.8h, v5.8h, v4.8h
    add v5.8h, v6.8h, v18.8h
    sub v18.8h, v6.8h, v18.8h
    add v6.8h, v23.8h, v21.8h
    add v16.8h, v20.8h, v22.8h
    sub v3.8h, v6.8h, v3.8h
    sub v20.8h, v20.8h, v22.8h
    sub v3.8h, v3.8h, v1.8h
    sub v1.8h, v17.8h, v1.8h
    add v2.8h, v3.8h, v2.8h
    sub v23.8h, v16.8h, v6.8h
    add v1.8h, v1.8h, v2.8h
    add v16.8h, v16.8h, v6.8h
    add v22.8h, v5.8h, v3.8h
    sub v17.8h, v5.8h, v3.8h
    sub v21.8h, v18.8h, v2.8h
    add v18.8h, v18.8h, v2.8h
    sub v19.8h, v20.8h, v1.8h
    add v20.8h, v20.8h, v1.8h
    transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30, v31
    /* 1-D IDCT, pass 2 */
    sub v2.8h, v18.8h, v22.8h
    add v22.8h, v18.8h, v22.8h
    sub v1.8h, v19.8h, v21.8h
    add v21.8h, v19.8h, v21.8h
    sub v5.8h, v17.8h, v23.8h
    add v23.8h, v17.8h, v23.8h
    sqdmulh v4.8h, v2.8h, XFIX_1_414213562
    sqdmulh v6.8h, v1.8h, XFIX_2_613125930
    add v3.8h, v1.8h, v1.8h
    sub v1.8h, v5.8h, v1.8h
    add v18.8h, v2.8h, v4.8h
    sqdmulh v4.8h, v1.8h, XFIX_1_847759065
    sub v2.8h, v23.8h, v21.8h
    add v3.8h, v3.8h, v6.8h
    sqdmulh v6.8h, v2.8h, XFIX_1_414213562
    add v1.8h, v1.8h, v4.8h
    sqdmulh v4.8h, v5.8h, XFIX_1_082392200
    sub v18.8h, v18.8h, v22.8h
    add v2.8h, v2.8h, v6.8h
    sub v6.8h, v16.8h, v20.8h
    add v20.8h, v16.8h, v20.8h
    add v17.8h, v5.8h, v4.8h
    add v5.8h, v6.8h, v18.8h
    sub v18.8h, v6.8h, v18.8h
    add v6.8h, v23.8h, v21.8h
    add v16.8h, v20.8h, v22.8h
    sub v3.8h, v6.8h, v3.8h
    sub v20.8h, v20.8h, v22.8h
    sub v3.8h, v3.8h, v1.8h
    sub v1.8h, v17.8h, v1.8h
    add v2.8h, v3.8h, v2.8h
    sub v23.8h, v16.8h, v6.8h
    add v1.8h, v1.8h, v2.8h
    add v16.8h, v16.8h, v6.8h
    add v22.8h, v5.8h, v3.8h
    sub v17.8h, v5.8h, v3.8h
    sub v21.8h, v18.8h, v2.8h
    add v18.8h, v18.8h, v2.8h
    sub v19.8h, v20.8h, v1.8h
    add v20.8h, v20.8h, v1.8h
    /* Descale to 8-bit and range limit */
    movi v0.16b, #0x80
    /* Prepare pointers (dual-issue with NEON instructions) */
    ldp TMP1, TMP2, [OUTPUT_BUF], 16
    sqshrn v28.8b, v16.8h, #5
    ldp TMP3, TMP4, [OUTPUT_BUF], 16
    sqshrn v29.8b, v17.8h, #5
    add TMP1, TMP1, OUTPUT_COL
    sqshrn v30.8b, v18.8h, #5
    add TMP2, TMP2, OUTPUT_COL
    sqshrn v31.8b, v19.8h, #5
    add TMP3, TMP3, OUTPUT_COL
    sqshrn2 v28.16b, v20.8h, #5
    add TMP4, TMP4, OUTPUT_COL
    sqshrn2 v29.16b, v21.8h, #5
    ldp TMP5, TMP6, [OUTPUT_BUF], 16
    sqshrn2 v30.16b, v22.8h, #5
    ldp TMP7, TMP8, [OUTPUT_BUF], 16
    sqshrn2 v31.16b, v23.8h, #5
    add TMP5, TMP5, OUTPUT_COL
    add v16.16b, v28.16b, v0.16b
    add TMP6, TMP6, OUTPUT_COL
    add v18.16b, v29.16b, v0.16b
    add TMP7, TMP7, OUTPUT_COL
    add v20.16b, v30.16b, v0.16b
    add TMP8, TMP8, OUTPUT_COL
    add v22.16b, v31.16b, v0.16b

    /* Transpose the final 8-bit samples */
    trn1 v28.16b, v16.16b, v18.16b
    trn1 v30.16b, v20.16b, v22.16b
    trn2 v29.16b, v16.16b, v18.16b
    trn2 v31.16b, v20.16b, v22.16b

    trn1 v16.8h, v28.8h, v30.8h
    trn2 v18.8h, v28.8h, v30.8h
    trn1 v20.8h, v29.8h, v31.8h
    trn2 v22.8h, v29.8h, v31.8h

    uzp1 v28.4s, v16.4s, v18.4s
    uzp2 v30.4s, v16.4s, v18.4s
    uzp1 v29.4s, v20.4s, v22.4s
    uzp2 v31.4s, v20.4s, v22.4s

    /* Store results to the output buffer */
    st1 {v28.d}[0], [TMP1]
    st1 {v29.d}[0], [TMP2]
    st1 {v28.d}[1], [TMP3]
    st1 {v29.d}[1], [TMP4]
    st1 {v30.d}[0], [TMP5]
    st1 {v31.d}[0], [TMP6]
    st1 {v30.d}[1], [TMP7]
    st1 {v31.d}[1], [TMP8]
    blr x30

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4
    .unreq TMP5
    .unreq TMP6
    .unreq TMP7
    .unreq TMP8
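
/*
 * (Explanatory note, not in the original sources:)  The final sqshrn #5
 * above corresponds to the PASS1_BITS + 3 = 5 descale at the end of
 * jpeg_idct_ifast() in jidctfst.c; the saturating narrow plus the addition
 * of CENTERJSAMPLE (the movi #0x80) stands in for the range-limit table
 * used by the C code.
 */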


/*****************************************************************************/

/*
 * jsimd_idct_4x4_neon
 *
 * This function contains inverse-DCT code for getting reduced-size 4x4
 * pixel output from an 8x8 DCT block.  It uses the same calculations and
 * produces exactly the same output as IJG's original 'jpeg_idct_4x4'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse DCT, which
 *       requires far fewer arithmetic operations and hence should be
 *       faster.  The primary purpose of this particular NEON-optimized
 *       function is bit-exact compatibility with jpeg-6b.
 *
 * TODO: Slightly better instruction scheduling could be achieved by
 *       expanding the idct_helper/transpose_4x4 macros and reordering the
 *       instructions, but readability would suffer somewhat.
 */

#define CONST_BITS 13

.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    smull v28.4s, \x4, v2.h[2]
    smlal v28.4s, \x8, v0.h[0]
    smlal v28.4s, \x14, v0.h[1]

    smull v26.4s, \x16, v1.h[2]
    smlal v26.4s, \x12, v1.h[3]
    smlal v26.4s, \x10, v2.h[0]
    smlal v26.4s, \x6, v2.h[1]

    smull v30.4s, \x4, v2.h[2]
    smlsl v30.4s, \x8, v0.h[0]
    smlsl v30.4s, \x14, v0.h[1]

    smull v24.4s, \x16, v0.h[2]
    smlal v24.4s, \x12, v0.h[3]
    smlal v24.4s, \x10, v1.h[0]
    smlal v24.4s, \x6, v1.h[1]

    add v20.4s, v28.4s, v26.4s
    sub v28.4s, v28.4s, v26.4s

    .if \shift > 16
    srshr v20.4s, v20.4s, #\shift
    srshr v28.4s, v28.4s, #\shift
    xtn \y26, v20.4s
    xtn \y29, v28.4s
    .else
    rshrn \y26, v20.4s, #\shift
    rshrn \y29, v28.4s, #\shift
    .endif

    add v20.4s, v30.4s, v24.4s
    sub v30.4s, v30.4s, v24.4s

    .if \shift > 16
    srshr v20.4s, v20.4s, #\shift
    srshr v30.4s, v30.4s, #\shift
    xtn \y27, v20.4s
    xtn \y28, v30.4s
    .else
    rshrn \y27, v20.4s, #\shift
    rshrn \y28, v30.4s, #\shift
    .endif
.endm
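
/*
 * (Explanatory note, not in the original sources:)  The shift parameter of
 * idct_helper mirrors the DESCALE amounts in jpeg_idct_4x4() (jidctred.c):
 * 12 for pass 1 and 19 for pass 2, i.e. the usual islow descales plus one
 * extra bit matching the doubled scaling used by the reduced-size IDCT
 * (note the 1 << (CONST_BITS + 1) entry in the constant table).  Since
 * rshrn can shift by at most 16, the 19-bit case uses srshr followed by a
 * plain xtn narrow.
 */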
asm_function jsimd_idct_4x4_neon

    DCT_TABLE .req x0
    COEF_BLOCK .req x1
    OUTPUT_BUF .req x2
    OUTPUT_COL .req x3
    TMP1 .req x0
    TMP2 .req x1
    TMP3 .req x2
    TMP4 .req x15

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x3, w3

    /* Save all used NEON registers */
    sub sp, sp, 64
    mov x9, sp
    /* Load constants (v3.4h is just used for padding) */
    get_symbol_loc TMP4, Ljsimd_idct_4x4_neon_consts
    st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
    ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | v4.4h  | v5.4h
     *   1  | v6.4h  | v7.4h
     *   2  | v8.4h  | v9.4h
     *   3  | v10.4h | v11.4h
     *   4  | -      | -
     *   5  | v12.4h | v13.4h
     *   6  | v14.4h | v15.4h
     *   7  | v16.4h | v17.4h
     */
    ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
    ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
    add COEF_BLOCK, COEF_BLOCK, #16
    ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
    ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
    /* dequantize */
    ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
    mul v4.4h, v4.4h, v18.4h
    mul v5.4h, v5.4h, v19.4h
    ins v4.d[1], v5.d[0]    /* 128 bit q4 */
    ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
    mul v6.4h, v6.4h, v20.4h
    mul v7.4h, v7.4h, v21.4h
    ins v6.d[1], v7.d[0]    /* 128 bit q6 */
    mul v8.4h, v8.4h, v22.4h
    mul v9.4h, v9.4h, v23.4h
    ins v8.d[1], v9.d[0]    /* 128 bit q8 */
    add DCT_TABLE, DCT_TABLE, #16
    ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
    mul v10.4h, v10.4h, v24.4h
    mul v11.4h, v11.4h, v25.4h
    ins v10.d[1], v11.d[0]  /* 128 bit q10 */
    mul v12.4h, v12.4h, v26.4h
    mul v13.4h, v13.4h, v27.4h
    ins v12.d[1], v13.d[0]  /* 128 bit q12 */
    ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
    mul v14.4h, v14.4h, v28.4h
    mul v15.4h, v15.4h, v29.4h
    ins v14.d[1], v15.d[0]  /* 128 bit q14 */
    mul v16.4h, v16.4h, v30.4h
    mul v17.4h, v17.4h, v31.4h
    ins v16.d[1], v17.d[0]  /* 128 bit q16 */

    /* Pass 1 */
    idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \
                v4.4h, v6.4h, v8.4h, v10.4h
    transpose_4x4 v4, v6, v8, v10, v3
    ins v10.d[1], v11.d[0]
    idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \
                v5.4h, v7.4h, v9.4h, v11.4h
    transpose_4x4 v5, v7, v9, v11, v3
    ins v10.d[1], v11.d[0]

    /* Pass 2 */
    idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \
                v26.4h, v27.4h, v28.4h, v29.4h
    transpose_4x4 v26, v27, v28, v29, v3

    /* Range limit */
    movi v30.8h, #0x80
    ins v26.d[1], v27.d[0]
    ins v28.d[1], v29.d[0]
    add v26.8h, v26.8h, v30.8h
    add v28.8h, v28.8h, v30.8h
    sqxtun v26.8b, v26.8h
    sqxtun v27.8b, v28.8h

    /* Store results to the output buffer */
    ldp TMP1, TMP2, [OUTPUT_BUF], 16
    ldp TMP3, TMP4, [OUTPUT_BUF]
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    add TMP3, TMP3, OUTPUT_COL
    add TMP4, TMP4, OUTPUT_COL
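
    /*
     * (Explanatory note, not in the original sources:)  Each output row is
     * only 4 bytes wide and may be arbitrarily aligned, so the samples are
     * stored lane by lane below.  The word-store path is only viable on
     * little-endian systems that tolerate unaligned accesses, and with
     * RESPECT_STRICT_ALIGNMENT defined to 1 above it is compiled out.
     */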
{v26.b}[3], [TMP1], 1 1381 st1 {v27.b}[3], [TMP3], 1 1382 1383 st1 {v26.b}[4], [TMP2], 1 1384 st1 {v27.b}[4], [TMP4], 1 1385 st1 {v26.b}[5], [TMP2], 1 1386 st1 {v27.b}[5], [TMP4], 1 1387 st1 {v26.b}[6], [TMP2], 1 1388 st1 {v27.b}[6], [TMP4], 1 1389 st1 {v26.b}[7], [TMP2], 1 1390 st1 {v27.b}[7], [TMP4], 1 1391#endif 1392 1393 /* vpop {v8.4h - v15.4h} ;not available */ 1394 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 1395 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 1396 blr x30 1397 1398 .unreq DCT_TABLE 1399 .unreq COEF_BLOCK 1400 .unreq OUTPUT_BUF 1401 .unreq OUTPUT_COL 1402 .unreq TMP1 1403 .unreq TMP2 1404 .unreq TMP3 1405 .unreq TMP4 1406 1407.purgem idct_helper 1408 1409 1410/*****************************************************************************/ 1411 1412/* 1413 * jsimd_idct_2x2_neon 1414 * 1415 * This function contains inverse-DCT code for getting reduced-size 1416 * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations 1417 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2' 1418 * function from jpeg-6b (jidctred.c). 1419 * 1420 * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which 1421 * requires much less arithmetic operations and hence should be faster. 1422 * The primary purpose of this particular NEON optimized function is 1423 * bit exact compatibility with jpeg-6b. 1424 */ 1425 1426.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27 1427 sshll v15.4s, \x4, #15 1428 smull v26.4s, \x6, v14.h[3] 1429 smlal v26.4s, \x10, v14.h[2] 1430 smlal v26.4s, \x12, v14.h[1] 1431 smlal v26.4s, \x16, v14.h[0] 1432 1433 add v20.4s, v15.4s, v26.4s 1434 sub v15.4s, v15.4s, v26.4s 1435 1436 .if \shift > 16 1437 srshr v20.4s, v20.4s, #\shift 1438 srshr v15.4s, v15.4s, #\shift 1439 xtn \y26, v20.4s 1440 xtn \y27, v15.4s 1441 .else 1442 rshrn \y26, v20.4s, #\shift 1443 rshrn \y27, v15.4s, #\shift 1444 .endif 1445.endm 1446 1447asm_function jsimd_idct_2x2_neon 1448 1449 DCT_TABLE .req x0 1450 COEF_BLOCK .req x1 1451 OUTPUT_BUF .req x2 1452 OUTPUT_COL .req x3 1453 TMP1 .req x0 1454 TMP2 .req x15 1455 1456 /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't 1457 guarantee that the upper (unused) 32 bits of x3 are valid. This 1458 instruction ensures that those bits are set to zero. 
*/ 1459 uxtw x3, w3 1460 1461 /* vpush {v8.4h - v15.4h} ; not available */ 1462 sub sp, sp, 64 1463 mov x9, sp 1464 1465 /* Load constants */ 1466 get_symbol_loc TMP2, Ljsimd_idct_2x2_neon_consts 1467 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32 1468 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32 1469 ld1 {v14.4h}, [TMP2] 1470 1471 /* Load all COEF_BLOCK into NEON registers with the following allocation: 1472 * 0 1 2 3 | 4 5 6 7 1473 * ---------+-------- 1474 * 0 | v4.4h | v5.4h 1475 * 1 | v6.4h | v7.4h 1476 * 2 | - | - 1477 * 3 | v10.4h | v11.4h 1478 * 4 | - | - 1479 * 5 | v12.4h | v13.4h 1480 * 6 | - | - 1481 * 7 | v16.4h | v17.4h 1482 */ 1483 ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32 1484 add COEF_BLOCK, COEF_BLOCK, #16 1485 ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16 1486 add COEF_BLOCK, COEF_BLOCK, #16 1487 ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16 1488 add COEF_BLOCK, COEF_BLOCK, #16 1489 ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16 1490 /* Dequantize */ 1491 ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32 1492 mul v4.4h, v4.4h, v18.4h 1493 mul v5.4h, v5.4h, v19.4h 1494 ins v4.d[1], v5.d[0] 1495 mul v6.4h, v6.4h, v20.4h 1496 mul v7.4h, v7.4h, v21.4h 1497 ins v6.d[1], v7.d[0] 1498 add DCT_TABLE, DCT_TABLE, #16 1499 ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16 1500 mul v10.4h, v10.4h, v24.4h 1501 mul v11.4h, v11.4h, v25.4h 1502 ins v10.d[1], v11.d[0] 1503 add DCT_TABLE, DCT_TABLE, #16 1504 ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16 1505 mul v12.4h, v12.4h, v26.4h 1506 mul v13.4h, v13.4h, v27.4h 1507 ins v12.d[1], v13.d[0] 1508 add DCT_TABLE, DCT_TABLE, #16 1509 ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16 1510 mul v16.4h, v16.4h, v30.4h 1511 mul v17.4h, v17.4h, v31.4h 1512 ins v16.d[1], v17.d[0] 1513 1514 /* Pass 1 */ 1515#if 0 1516 idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h 1517 transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h 1518 idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h 1519 transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h 1520#else 1521 smull v26.4s, v6.4h, v14.h[3] 1522 smlal v26.4s, v10.4h, v14.h[2] 1523 smlal v26.4s, v12.4h, v14.h[1] 1524 smlal v26.4s, v16.4h, v14.h[0] 1525 smull v24.4s, v7.4h, v14.h[3] 1526 smlal v24.4s, v11.4h, v14.h[2] 1527 smlal v24.4s, v13.4h, v14.h[1] 1528 smlal v24.4s, v17.4h, v14.h[0] 1529 sshll v15.4s, v4.4h, #15 1530 sshll v30.4s, v5.4h, #15 1531 add v20.4s, v15.4s, v26.4s 1532 sub v15.4s, v15.4s, v26.4s 1533 rshrn v4.4h, v20.4s, #13 1534 rshrn v6.4h, v15.4s, #13 1535 add v20.4s, v30.4s, v24.4s 1536 sub v15.4s, v30.4s, v24.4s 1537 rshrn v5.4h, v20.4s, #13 1538 rshrn v7.4h, v15.4s, #13 1539 ins v4.d[1], v5.d[0] 1540 ins v6.d[1], v7.d[0] 1541 transpose v4, v6, v3, .16b, .8h 1542 transpose v6, v10, v3, .16b, .4s 1543 ins v11.d[0], v10.d[1] 1544 ins v7.d[0], v6.d[1] 1545#endif 1546 1547 /* Pass 2 */ 1548 idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h 1549 1550 /* Range limit */ 1551 movi v30.8h, #0x80 1552 ins v26.d[1], v27.d[0] 1553 add v26.8h, v26.8h, v30.8h 1554 sqxtun v30.8b, v26.8h 1555 ins v26.d[0], v30.d[0] 1556 sqxtun v27.8b, v26.8h 1557 1558 /* Store results to the output buffer */ 1559 ldp TMP1, TMP2, [OUTPUT_BUF] 1560 add TMP1, TMP1, OUTPUT_COL 1561 add TMP2, TMP2, OUTPUT_COL 1562 1563 st1 {v26.b}[0], [TMP1], 1 1564 st1 {v27.b}[4], [TMP1], 1 1565 st1 {v26.b}[1], [TMP2], 1 1566 st1 {v27.b}[5], [TMP2], 1 1567 1568 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 1569 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 1570 blr x30 1571 1572 .unreq DCT_TABLE 1573 .unreq COEF_BLOCK 1574 .unreq OUTPUT_BUF 1575 .unreq 
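/*
 * For reference, the computation performed by jsimd_idct_2x2_neon above can
 * be sketched in scalar C roughly as follows (an illustrative sketch of the
 * jpeg-6b jidctred.c scheme, using this file's FIX_* constants and the same
 * descale shifts as the NEON code; not the exact library implementation):
 *
 *   static void idct_2x2_sketch(const short *coef, const unsigned short *qt,
 *                               unsigned char out[2][2])
 *   {
 *     static const int cols[5] = { 0, 1, 3, 5, 7 };
 *     long ws[2][8] = { { 0 } };
 *
 *     // Pass 1: rows 0, 1, 3, 5, 7 of each needed column reduce to 2 values
 *     for (int i = 0; i < 5; i++) {
 *       int c = cols[i];
 *       long d1 = (long)coef[c +  8] * qt[c +  8];
 *       long d3 = (long)coef[c + 24] * qt[c + 24];
 *       long d5 = (long)coef[c + 40] * qt[c + 40];
 *       long d7 = (long)coef[c + 56] * qt[c + 56];
 *       long dc  = ((long)coef[c] * qt[c]) << 15;
 *       long odd = 29692L * d1 - 10426L * d3    // FIX(3.624509785), FIX(1.272758580)
 *                +  6967L * d5 -  5906L * d7;   // FIX(0.850430095), FIX(0.720959822)
 *       ws[0][c] = (dc + odd + (1L << 12)) >> 13;   // like rshrn #13 above
 *       ws[1][c] = (dc - odd + (1L << 12)) >> 13;
 *     }
 *     // Pass 2: same butterfly across columns, descale by 20, then bias by
 *     // 128 and saturate to [0, 255] (the movi/add/sqxtun steps)
 *     for (int r = 0; r < 2; r++) {
 *       long dc  = ws[r][0] << 15;
 *       long odd = 29692L * ws[r][1] - 10426L * ws[r][3]
 *                +  6967L * ws[r][5] -  5906L * ws[r][7];
 *       long v0 = ((dc + odd + (1L << 19)) >> 20) + 128;
 *       long v1 = ((dc - odd + (1L << 19)) >> 20) + 128;
 *       out[r][0] = (unsigned char)(v0 < 0 ? 0 : v0 > 255 ? 255 : v0);
 *       out[r][1] = (unsigned char)(v1 < 0 ? 0 : v1 > 255 ? 255 : v1);
 *     }
 *   }
 */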
OUTPUT_COL 1576 .unreq TMP1 1577 .unreq TMP2 1578 1579.purgem idct_helper 1580 1581 1582/*****************************************************************************/ 1583 1584/* 1585 * jsimd_ycc_extrgb_convert_neon 1586 * jsimd_ycc_extbgr_convert_neon 1587 * jsimd_ycc_extrgbx_convert_neon 1588 * jsimd_ycc_extbgrx_convert_neon 1589 * jsimd_ycc_extxbgr_convert_neon 1590 * jsimd_ycc_extxrgb_convert_neon 1591 * 1592 * Colorspace conversion YCbCr -> RGB 1593 */ 1594 1595.macro do_load size 1596 .if \size == 8 1597 ld1 {v4.8b}, [U], 8 1598 ld1 {v5.8b}, [V], 8 1599 ld1 {v0.8b}, [Y], 8 1600 prfm pldl1keep, [U, #64] 1601 prfm pldl1keep, [V, #64] 1602 prfm pldl1keep, [Y, #64] 1603 .elseif \size == 4 1604 ld1 {v4.b}[0], [U], 1 1605 ld1 {v4.b}[1], [U], 1 1606 ld1 {v4.b}[2], [U], 1 1607 ld1 {v4.b}[3], [U], 1 1608 ld1 {v5.b}[0], [V], 1 1609 ld1 {v5.b}[1], [V], 1 1610 ld1 {v5.b}[2], [V], 1 1611 ld1 {v5.b}[3], [V], 1 1612 ld1 {v0.b}[0], [Y], 1 1613 ld1 {v0.b}[1], [Y], 1 1614 ld1 {v0.b}[2], [Y], 1 1615 ld1 {v0.b}[3], [Y], 1 1616 .elseif \size == 2 1617 ld1 {v4.b}[4], [U], 1 1618 ld1 {v4.b}[5], [U], 1 1619 ld1 {v5.b}[4], [V], 1 1620 ld1 {v5.b}[5], [V], 1 1621 ld1 {v0.b}[4], [Y], 1 1622 ld1 {v0.b}[5], [Y], 1 1623 .elseif \size == 1 1624 ld1 {v4.b}[6], [U], 1 1625 ld1 {v5.b}[6], [V], 1 1626 ld1 {v0.b}[6], [Y], 1 1627 .else 1628 .error unsupported macroblock size 1629 .endif 1630.endm 1631 1632.macro do_store bpp, size, fast_st3 1633 .if \bpp == 24 1634 .if \size == 8 1635 .if \fast_st3 == 1 1636 st3 {v10.8b, v11.8b, v12.8b}, [RGB], 24 1637 .else 1638 st1 {v10.b}[0], [RGB], #1 1639 st1 {v11.b}[0], [RGB], #1 1640 st1 {v12.b}[0], [RGB], #1 1641 1642 st1 {v10.b}[1], [RGB], #1 1643 st1 {v11.b}[1], [RGB], #1 1644 st1 {v12.b}[1], [RGB], #1 1645 1646 st1 {v10.b}[2], [RGB], #1 1647 st1 {v11.b}[2], [RGB], #1 1648 st1 {v12.b}[2], [RGB], #1 1649 1650 st1 {v10.b}[3], [RGB], #1 1651 st1 {v11.b}[3], [RGB], #1 1652 st1 {v12.b}[3], [RGB], #1 1653 1654 st1 {v10.b}[4], [RGB], #1 1655 st1 {v11.b}[4], [RGB], #1 1656 st1 {v12.b}[4], [RGB], #1 1657 1658 st1 {v10.b}[5], [RGB], #1 1659 st1 {v11.b}[5], [RGB], #1 1660 st1 {v12.b}[5], [RGB], #1 1661 1662 st1 {v10.b}[6], [RGB], #1 1663 st1 {v11.b}[6], [RGB], #1 1664 st1 {v12.b}[6], [RGB], #1 1665 1666 st1 {v10.b}[7], [RGB], #1 1667 st1 {v11.b}[7], [RGB], #1 1668 st1 {v12.b}[7], [RGB], #1 1669 .endif 1670 .elseif \size == 4 1671 st3 {v10.b, v11.b, v12.b}[0], [RGB], 3 1672 st3 {v10.b, v11.b, v12.b}[1], [RGB], 3 1673 st3 {v10.b, v11.b, v12.b}[2], [RGB], 3 1674 st3 {v10.b, v11.b, v12.b}[3], [RGB], 3 1675 .elseif \size == 2 1676 st3 {v10.b, v11.b, v12.b}[4], [RGB], 3 1677 st3 {v10.b, v11.b, v12.b}[5], [RGB], 3 1678 .elseif \size == 1 1679 st3 {v10.b, v11.b, v12.b}[6], [RGB], 3 1680 .else 1681 .error unsupported macroblock size 1682 .endif 1683 .elseif \bpp == 32 1684 .if \size == 8 1685 st4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32 1686 .elseif \size == 4 1687 st4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4 1688 st4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4 1689 st4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4 1690 st4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4 1691 .elseif \size == 2 1692 st4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4 1693 st4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4 1694 .elseif \size == 1 1695 st4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4 1696 .else 1697 .error unsupported macroblock size 1698 .endif 1699 .elseif \bpp == 16 1700 .if \size == 8 1701 st1 {v25.8h}, [RGB], 16 1702 .elseif \size == 4 1703 st1 {v25.4h}, [RGB], 8 1704 .elseif \size == 2 1705 st1 {v25.h}[4], [RGB], 
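/*
 * The do_load/do_store fragments above handle a partial block of N = 1..7
 * remaining pixels by splitting it into 4-, 2- and 1-pixel pieces that land
 * in fixed vector lanes (lanes 0-3, 4-5 and 6, respectively).  In scalar
 * terms the idea is simply (an illustrative sketch):
 *
 *   static void load_tail_sketch(const unsigned char *src, int n,
 *                                unsigned char lanes[8])
 *   {
 *     int i;
 *     if (n & 4) for (i = 0; i < 4; i++) lanes[i]     = *src++;
 *     if (n & 2) for (i = 0; i < 2; i++) lanes[4 + i] = *src++;
 *     if (n & 1)                         lanes[6]     = *src;
 *   }
 */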
2 1706 st1 {v25.h}[5], [RGB], 2 1707 .elseif \size == 1 1708 st1 {v25.h}[6], [RGB], 2 1709 .else 1710 .error unsupported macroblock size 1711 .endif 1712 .else 1713 .error unsupported bpp 1714 .endif 1715.endm 1716 1717.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \ 1718 g_offs, gsize, b_offs, bsize, \ 1719 defsize, fast_st3 1720 1721/* 1722 * 2-stage pipelined YCbCr->RGB conversion 1723 */ 1724 1725.macro do_yuv_to_rgb_stage1 1726 uaddw v6.8h, v2.8h, v4.8b /* q3 = u - 128 */ 1727 uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ 1728 smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */ 1729 smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */ 1730 smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */ 1731 smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */ 1732 smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */ 1733 smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */ 1734 smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */ 1735 smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */ 1736.endm 1737 1738.macro do_yuv_to_rgb_stage2 1739 rshrn v20.4h, v20.4s, #15 1740 rshrn2 v20.8h, v22.4s, #15 1741 rshrn v24.4h, v24.4s, #14 1742 rshrn2 v24.8h, v26.4s, #14 1743 rshrn v28.4h, v28.4s, #14 1744 rshrn2 v28.8h, v30.4s, #14 1745 uaddw v20.8h, v20.8h, v0.8b 1746 uaddw v24.8h, v24.8h, v0.8b 1747 uaddw v28.8h, v28.8h, v0.8b 1748 .if \bpp != 16 1749 sqxtun v1\g_offs\defsize, v20.8h 1750 sqxtun v1\r_offs\defsize, v24.8h 1751 sqxtun v1\b_offs\defsize, v28.8h 1752 .else 1753 sqshlu v21.8h, v20.8h, #8 1754 sqshlu v25.8h, v24.8h, #8 1755 sqshlu v29.8h, v28.8h, #8 1756 sri v25.8h, v21.8h, #5 1757 sri v25.8h, v29.8h, #11 1758 .endif 1759.endm 1760 1761.macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3 1762 rshrn v20.4h, v20.4s, #15 1763 rshrn v24.4h, v24.4s, #14 1764 rshrn v28.4h, v28.4s, #14 1765 ld1 {v4.8b}, [U], 8 1766 rshrn2 v20.8h, v22.4s, #15 1767 rshrn2 v24.8h, v26.4s, #14 1768 rshrn2 v28.8h, v30.4s, #14 1769 ld1 {v5.8b}, [V], 8 1770 uaddw v20.8h, v20.8h, v0.8b 1771 uaddw v24.8h, v24.8h, v0.8b 1772 uaddw v28.8h, v28.8h, v0.8b 1773 .if \bpp != 16 /**************** rgb24/rgb32 ******************************/ 1774 sqxtun v1\g_offs\defsize, v20.8h 1775 ld1 {v0.8b}, [Y], 8 1776 sqxtun v1\r_offs\defsize, v24.8h 1777 prfm pldl1keep, [U, #64] 1778 prfm pldl1keep, [V, #64] 1779 prfm pldl1keep, [Y, #64] 1780 sqxtun v1\b_offs\defsize, v28.8h 1781 uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */ 1782 uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ 1783 smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */ 1784 smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */ 1785 smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */ 1786 smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */ 1787 smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */ 1788 smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */ 1789 .else /**************************** rgb565 ********************************/ 1790 sqshlu v21.8h, v20.8h, #8 1791 sqshlu v25.8h, v24.8h, #8 1792 sqshlu v29.8h, v28.8h, #8 1793 uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */ 1794 uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ 1795 ld1 {v0.8b}, [Y], 8 1796 smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */ 1797 smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */ 1798 smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */ 1799 smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */ 1800 sri v25.8h, v21.8h, #5 1801 smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */ 1802 smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */ 1803 prfm 
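/*
 * The two pipeline stages above compute, per pixel, the usual JFIF
 * YCbCr->RGB equations in 16-bit fixed point, with the constants and shifts
 * visible in the smull/rshrn instructions.  A scalar rendering (illustrative
 * sketch, not the library's actual C path):
 *
 *   static unsigned char clamp255(int x)
 *   {
 *     return (unsigned char)(x < 0 ? 0 : x > 255 ? 255 : x);
 *   }
 *
 *   static void ycc_to_rgb_sketch(int y, int cb, int cr, unsigned char *r,
 *                                 unsigned char *g, unsigned char *b)
 *   {
 *     int u = cb - 128, v = cr - 128;
 *     *r = clamp255(y + ((22971 * v + (1 << 13)) >> 14));   //  1.40200 * Cr
 *     *g = clamp255(y + ((-11277 * u - 23401 * v            // -0.34414 * Cb
 *                         + (1 << 14)) >> 15));             // -0.71414 * Cr
 *     *b = clamp255(y + ((29033 * u + (1 << 13)) >> 14));   //  1.77200 * Cb
 *   }
 *
 * For RGB565 output, the same three 8-bit values are packed as
 * (r & 0xF8) << 8 | (g & 0xFC) << 3 | (b >> 3), which is what the
 * sqshlu/sri #5/#11 sequence assembles in-register.
 */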
pldl1keep, [U, #64] 1804 prfm pldl1keep, [V, #64] 1805 prfm pldl1keep, [Y, #64] 1806 sri v25.8h, v29.8h, #11 1807 .endif 1808 do_store \bpp, 8, \fast_st3 1809 smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */ 1810 smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */ 1811.endm 1812 1813.macro do_yuv_to_rgb 1814 do_yuv_to_rgb_stage1 1815 do_yuv_to_rgb_stage2 1816.endm 1817 1818/* Apple gas crashes on adrl, work around that by using adr. 1819 * But this requires a copy of these constants for each function. 1820 */ 1821 1822.if \fast_st3 == 1 1823asm_function jsimd_ycc_\colorid\()_convert_neon 1824.else 1825asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3 1826.endif 1827 OUTPUT_WIDTH .req w0 1828 INPUT_BUF .req x1 1829 INPUT_ROW .req w2 1830 OUTPUT_BUF .req x3 1831 NUM_ROWS .req w4 1832 1833 INPUT_BUF0 .req x5 1834 INPUT_BUF1 .req x6 1835 INPUT_BUF2 .req x1 1836 1837 RGB .req x7 1838 Y .req x9 1839 U .req x10 1840 V .req x11 1841 N .req w15 1842 1843 sub sp, sp, 64 1844 mov x9, sp 1845 1846 /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */ 1847 get_symbol_loc x15, Ljsimd_ycc_colorid_neon_consts 1848 1849 /* Save NEON registers */ 1850 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32 1851 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32 1852 ld1 {v0.4h, v1.4h}, [x15], 16 1853 ld1 {v2.8h}, [x15] 1854 1855 ldr INPUT_BUF0, [INPUT_BUF] 1856 ldr INPUT_BUF1, [INPUT_BUF, #8] 1857 ldr INPUT_BUF2, [INPUT_BUF, #16] 1858 .unreq INPUT_BUF 1859 1860 /* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */ 1861 movi v10.16b, #255 1862 movi v13.16b, #255 1863 1864 /* Outer loop over scanlines */ 1865 cmp NUM_ROWS, #1 1866 b.lt 9f 18670: 1868 ldr Y, [INPUT_BUF0, INPUT_ROW, uxtw #3] 1869 ldr U, [INPUT_BUF1, INPUT_ROW, uxtw #3] 1870 mov N, OUTPUT_WIDTH 1871 ldr V, [INPUT_BUF2, INPUT_ROW, uxtw #3] 1872 add INPUT_ROW, INPUT_ROW, #1 1873 ldr RGB, [OUTPUT_BUF], #8 1874 1875 /* Inner loop over pixels */ 1876 subs N, N, #8 1877 b.lt 3f 1878 do_load 8 1879 do_yuv_to_rgb_stage1 1880 subs N, N, #8 1881 b.lt 2f 18821: 1883 do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3 1884 subs N, N, #8 1885 b.ge 1b 18862: 1887 do_yuv_to_rgb_stage2 1888 do_store \bpp, 8, \fast_st3 1889 tst N, #7 1890 b.eq 8f 18913: 1892 tst N, #4 1893 b.eq 3f 1894 do_load 4 18953: 1896 tst N, #2 1897 b.eq 4f 1898 do_load 2 18994: 1900 tst N, #1 1901 b.eq 5f 1902 do_load 1 19035: 1904 do_yuv_to_rgb 1905 tst N, #4 1906 b.eq 6f 1907 do_store \bpp, 4, \fast_st3 19086: 1909 tst N, #2 1910 b.eq 7f 1911 do_store \bpp, 2, \fast_st3 19127: 1913 tst N, #1 1914 b.eq 8f 1915 do_store \bpp, 1, \fast_st3 19168: 1917 subs NUM_ROWS, NUM_ROWS, #1 1918 b.gt 0b 19199: 1920 /* Restore all registers and return */ 1921 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 1922 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 1923 br x30 1924 .unreq OUTPUT_WIDTH 1925 .unreq INPUT_ROW 1926 .unreq OUTPUT_BUF 1927 .unreq NUM_ROWS 1928 .unreq INPUT_BUF0 1929 .unreq INPUT_BUF1 1930 .unreq INPUT_BUF2 1931 .unreq RGB 1932 .unreq Y 1933 .unreq U 1934 .unreq V 1935 .unreq N 1936 1937.purgem do_yuv_to_rgb 1938.purgem do_yuv_to_rgb_stage1 1939.purgem do_yuv_to_rgb_stage2 1940.purgem do_yuv_to_rgb_stage2_store_load_stage1 1941 1942.endm 1943 1944/*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize fast_st3*/ 1945generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 1 1946generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 1 1947generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b, 
1 1948generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b, 1 1949generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b, 1 1950generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b, 1 1951generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b, 1 1952 1953generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 0 1954generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 0 1955 1956.purgem do_load 1957.purgem do_store 1958 1959 1960/*****************************************************************************/ 1961 1962/* 1963 * jsimd_extrgb_ycc_convert_neon 1964 * jsimd_extbgr_ycc_convert_neon 1965 * jsimd_extrgbx_ycc_convert_neon 1966 * jsimd_extbgrx_ycc_convert_neon 1967 * jsimd_extxbgr_ycc_convert_neon 1968 * jsimd_extxrgb_ycc_convert_neon 1969 * 1970 * Colorspace conversion RGB -> YCbCr 1971 */ 1972 1973.macro do_store size 1974 .if \size == 8 1975 st1 {v20.8b}, [Y], #8 1976 st1 {v21.8b}, [U], #8 1977 st1 {v22.8b}, [V], #8 1978 .elseif \size == 4 1979 st1 {v20.b}[0], [Y], #1 1980 st1 {v20.b}[1], [Y], #1 1981 st1 {v20.b}[2], [Y], #1 1982 st1 {v20.b}[3], [Y], #1 1983 st1 {v21.b}[0], [U], #1 1984 st1 {v21.b}[1], [U], #1 1985 st1 {v21.b}[2], [U], #1 1986 st1 {v21.b}[3], [U], #1 1987 st1 {v22.b}[0], [V], #1 1988 st1 {v22.b}[1], [V], #1 1989 st1 {v22.b}[2], [V], #1 1990 st1 {v22.b}[3], [V], #1 1991 .elseif \size == 2 1992 st1 {v20.b}[4], [Y], #1 1993 st1 {v20.b}[5], [Y], #1 1994 st1 {v21.b}[4], [U], #1 1995 st1 {v21.b}[5], [U], #1 1996 st1 {v22.b}[4], [V], #1 1997 st1 {v22.b}[5], [V], #1 1998 .elseif \size == 1 1999 st1 {v20.b}[6], [Y], #1 2000 st1 {v21.b}[6], [U], #1 2001 st1 {v22.b}[6], [V], #1 2002 .else 2003 .error unsupported macroblock size 2004 .endif 2005.endm 2006 2007.macro do_load bpp, size, fast_ld3 2008 .if \bpp == 24 2009 .if \size == 8 2010 .if \fast_ld3 == 1 2011 ld3 {v10.8b, v11.8b, v12.8b}, [RGB], #24 2012 .else 2013 ld1 {v10.b}[0], [RGB], #1 2014 ld1 {v11.b}[0], [RGB], #1 2015 ld1 {v12.b}[0], [RGB], #1 2016 2017 ld1 {v10.b}[1], [RGB], #1 2018 ld1 {v11.b}[1], [RGB], #1 2019 ld1 {v12.b}[1], [RGB], #1 2020 2021 ld1 {v10.b}[2], [RGB], #1 2022 ld1 {v11.b}[2], [RGB], #1 2023 ld1 {v12.b}[2], [RGB], #1 2024 2025 ld1 {v10.b}[3], [RGB], #1 2026 ld1 {v11.b}[3], [RGB], #1 2027 ld1 {v12.b}[3], [RGB], #1 2028 2029 ld1 {v10.b}[4], [RGB], #1 2030 ld1 {v11.b}[4], [RGB], #1 2031 ld1 {v12.b}[4], [RGB], #1 2032 2033 ld1 {v10.b}[5], [RGB], #1 2034 ld1 {v11.b}[5], [RGB], #1 2035 ld1 {v12.b}[5], [RGB], #1 2036 2037 ld1 {v10.b}[6], [RGB], #1 2038 ld1 {v11.b}[6], [RGB], #1 2039 ld1 {v12.b}[6], [RGB], #1 2040 2041 ld1 {v10.b}[7], [RGB], #1 2042 ld1 {v11.b}[7], [RGB], #1 2043 ld1 {v12.b}[7], [RGB], #1 2044 .endif 2045 prfm pldl1keep, [RGB, #128] 2046 .elseif \size == 4 2047 ld3 {v10.b, v11.b, v12.b}[0], [RGB], #3 2048 ld3 {v10.b, v11.b, v12.b}[1], [RGB], #3 2049 ld3 {v10.b, v11.b, v12.b}[2], [RGB], #3 2050 ld3 {v10.b, v11.b, v12.b}[3], [RGB], #3 2051 .elseif \size == 2 2052 ld3 {v10.b, v11.b, v12.b}[4], [RGB], #3 2053 ld3 {v10.b, v11.b, v12.b}[5], [RGB], #3 2054 .elseif \size == 1 2055 ld3 {v10.b, v11.b, v12.b}[6], [RGB], #3 2056 .else 2057 .error unsupported macroblock size 2058 .endif 2059 .elseif \bpp == 32 2060 .if \size == 8 2061 ld4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32 2062 prfm pldl1keep, [RGB, #128] 2063 .elseif \size == 4 2064 ld4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4 2065 ld4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4 2066 ld4 {v10.b, v11.b, 
v12.b, v13.b}[2], [RGB], #4 2067 ld4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4 2068 .elseif \size == 2 2069 ld4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4 2070 ld4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4 2071 .elseif \size == 1 2072 ld4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4 2073 .else 2074 .error unsupported macroblock size 2075 .endif 2076 .else 2077 .error unsupported bpp 2078 .endif 2079.endm 2080 2081.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \ 2082 b_offs, fast_ld3 2083 2084/* 2085 * 2-stage pipelined RGB->YCbCr conversion 2086 */ 2087 2088.macro do_rgb_to_yuv_stage1 2089 ushll v4.8h, v1\r_offs\().8b, #0 /* r = v4 */ 2090 ushll v6.8h, v1\g_offs\().8b, #0 /* g = v6 */ 2091 ushll v8.8h, v1\b_offs\().8b, #0 /* b = v8 */ 2092 rev64 v18.4s, v1.4s 2093 rev64 v26.4s, v1.4s 2094 rev64 v28.4s, v1.4s 2095 rev64 v30.4s, v1.4s 2096 umull v14.4s, v4.4h, v0.h[0] 2097 umull2 v16.4s, v4.8h, v0.h[0] 2098 umlsl v18.4s, v4.4h, v0.h[3] 2099 umlsl2 v26.4s, v4.8h, v0.h[3] 2100 umlal v28.4s, v4.4h, v0.h[5] 2101 umlal2 v30.4s, v4.8h, v0.h[5] 2102 umlal v14.4s, v6.4h, v0.h[1] 2103 umlal2 v16.4s, v6.8h, v0.h[1] 2104 umlsl v18.4s, v6.4h, v0.h[4] 2105 umlsl2 v26.4s, v6.8h, v0.h[4] 2106 umlsl v28.4s, v6.4h, v0.h[6] 2107 umlsl2 v30.4s, v6.8h, v0.h[6] 2108 umlal v14.4s, v8.4h, v0.h[2] 2109 umlal2 v16.4s, v8.8h, v0.h[2] 2110 umlal v18.4s, v8.4h, v0.h[5] 2111 umlal2 v26.4s, v8.8h, v0.h[5] 2112 umlsl v28.4s, v8.4h, v0.h[7] 2113 umlsl2 v30.4s, v8.8h, v0.h[7] 2114.endm 2115 2116.macro do_rgb_to_yuv_stage2 2117 rshrn v20.4h, v14.4s, #16 2118 shrn v22.4h, v18.4s, #16 2119 shrn v24.4h, v28.4s, #16 2120 rshrn2 v20.8h, v16.4s, #16 2121 shrn2 v22.8h, v26.4s, #16 2122 shrn2 v24.8h, v30.4s, #16 2123 xtn v20.8b, v20.8h /* v20 = y */ 2124 xtn v21.8b, v22.8h /* v21 = u */ 2125 xtn v22.8b, v24.8h /* v22 = v */ 2126.endm 2127 2128.macro do_rgb_to_yuv 2129 do_rgb_to_yuv_stage1 2130 do_rgb_to_yuv_stage2 2131.endm 2132 2133/* TODO: expand macros and interleave instructions if some in-order 2134 * ARM64 processor actually can dual-issue LOAD/STORE with ALU */ 2135.macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3 2136 do_rgb_to_yuv_stage2 2137 do_load \bpp, 8, \fast_ld3 2138 st1 {v20.8b}, [Y], #8 2139 st1 {v21.8b}, [U], #8 2140 st1 {v22.8b}, [V], #8 2141 do_rgb_to_yuv_stage1 2142.endm 2143 2144 2145.if \fast_ld3 == 1 2146asm_function jsimd_\colorid\()_ycc_convert_neon 2147.else 2148asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3 2149.endif 2150 OUTPUT_WIDTH .req w0 2151 INPUT_BUF .req x1 2152 OUTPUT_BUF .req x2 2153 OUTPUT_ROW .req w3 2154 NUM_ROWS .req w4 2155 2156 OUTPUT_BUF0 .req x5 2157 OUTPUT_BUF1 .req x6 2158 OUTPUT_BUF2 .req x2 /* OUTPUT_BUF */ 2159 2160 RGB .req x7 2161 Y .req x9 2162 U .req x10 2163 V .req x11 2164 N .req w12 2165 2166 /* Load constants to d0, d1, d2, d3 */ 2167 get_symbol_loc x13, Ljsimd_colorid_ycc_neon_consts 2168 2169 ld1 {v0.8h, v1.8h}, [x13] 2170 2171 ldr OUTPUT_BUF0, [OUTPUT_BUF] 2172 ldr OUTPUT_BUF1, [OUTPUT_BUF, #8] 2173 ldr OUTPUT_BUF2, [OUTPUT_BUF, #16] 2174 .unreq OUTPUT_BUF 2175 2176 /* Save NEON registers */ 2177 sub sp, sp, #64 2178 mov x9, sp 2179 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32 2180 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32 2181 2182 /* Outer loop over scanlines */ 2183 cmp NUM_ROWS, #1 2184 b.lt 9f 21850: 2186 ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3] 2187 ldr U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3] 2188 mov N, OUTPUT_WIDTH 2189 ldr V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3] 2190 add OUTPUT_ROW, OUTPUT_ROW, #1 2191 ldr RGB, 
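/*
 * do_rgb_to_yuv_stage1/stage2 above evaluate the JFIF RGB->YCbCr equations
 * in 16-bit fixed point (constants scaled by 65536).  The +128 bias and the
 * rounding of the chroma channels appear to be folded into the accumulators'
 * initial value, in effect (128 << 16) + 32767, so a plain truncating shift
 * (shrn #16) suffices for Cb/Cr while Y uses a rounding rshrn #16.  A scalar
 * rendering (illustrative sketch):
 *
 *   static void rgb_to_ycc_sketch(int r, int g, int b, unsigned char *y,
 *                                 unsigned char *cb, unsigned char *cr)
 *   {
 *     //  Y  =  0.29900 R + 0.58700 G + 0.11400 B
 *     //  Cb = -0.16874 R - 0.33126 G + 0.50000 B + 128
 *     //  Cr =  0.50000 R - 0.41869 G - 0.08131 B + 128
 *     *y  = (unsigned char)((19595 * r + 38470 * g + 7471 * b
 *                            + 32768) >> 16);
 *     *cb = (unsigned char)(((128 << 16) + 32767
 *                            - 11059 * r - 21709 * g + 32768 * b) >> 16);
 *     *cr = (unsigned char)(((128 << 16) + 32767
 *                            + 32768 * r - 27439 * g - 5329 * b) >> 16);
 *   }
 */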
[INPUT_BUF], #8 2192 2193 /* Inner loop over pixels */ 2194 subs N, N, #8 2195 b.lt 3f 2196 do_load \bpp, 8, \fast_ld3 2197 do_rgb_to_yuv_stage1 2198 subs N, N, #8 2199 b.lt 2f 22001: 2201 do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3 2202 subs N, N, #8 2203 b.ge 1b 22042: 2205 do_rgb_to_yuv_stage2 2206 do_store 8 2207 tst N, #7 2208 b.eq 8f 22093: 2210 tbz N, #2, 3f 2211 do_load \bpp, 4, \fast_ld3 22123: 2213 tbz N, #1, 4f 2214 do_load \bpp, 2, \fast_ld3 22154: 2216 tbz N, #0, 5f 2217 do_load \bpp, 1, \fast_ld3 22185: 2219 do_rgb_to_yuv 2220 tbz N, #2, 6f 2221 do_store 4 22226: 2223 tbz N, #1, 7f 2224 do_store 2 22257: 2226 tbz N, #0, 8f 2227 do_store 1 22288: 2229 subs NUM_ROWS, NUM_ROWS, #1 2230 b.gt 0b 22319: 2232 /* Restore all registers and return */ 2233 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 2234 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 2235 br x30 2236 2237 .unreq OUTPUT_WIDTH 2238 .unreq OUTPUT_ROW 2239 .unreq INPUT_BUF 2240 .unreq NUM_ROWS 2241 .unreq OUTPUT_BUF0 2242 .unreq OUTPUT_BUF1 2243 .unreq OUTPUT_BUF2 2244 .unreq RGB 2245 .unreq Y 2246 .unreq U 2247 .unreq V 2248 .unreq N 2249 2250.purgem do_rgb_to_yuv 2251.purgem do_rgb_to_yuv_stage1 2252.purgem do_rgb_to_yuv_stage2 2253.purgem do_rgb_to_yuv_stage2_store_load_stage1 2254 2255.endm 2256 2257/*--------------------------------- id ----- bpp R G B Fast LD3 */ 2258generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2, 1 2259generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0, 1 2260generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1 2261generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1 2262generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1 2263generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3, 1 2264 2265generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2, 0 2266generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0, 0 2267 2268.purgem do_load 2269.purgem do_store 2270 2271 2272/*****************************************************************************/ 2273 2274/* 2275 * Load data into workspace, applying unsigned->signed conversion 2276 * 2277 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get 2278 * rid of VST1.16 instructions 2279 */ 2280 2281asm_function jsimd_convsamp_neon 2282 SAMPLE_DATA .req x0 2283 START_COL .req x1 2284 WORKSPACE .req x2 2285 TMP1 .req x9 2286 TMP2 .req x10 2287 TMP3 .req x11 2288 TMP4 .req x12 2289 TMP5 .req x13 2290 TMP6 .req x14 2291 TMP7 .req x15 2292 TMP8 .req x4 2293 TMPDUP .req w3 2294 2295 /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't 2296 guarantee that the upper (unused) 32 bits of x1 are valid. This 2297 instruction ensures that those bits are set to zero. 
 */
    uxtw            x1, w1

    mov             TMPDUP, #128
    ldp             TMP1, TMP2, [SAMPLE_DATA], 16
    ldp             TMP3, TMP4, [SAMPLE_DATA], 16
    dup             v0.8b, TMPDUP
    add             TMP1, TMP1, START_COL
    add             TMP2, TMP2, START_COL
    ldp             TMP5, TMP6, [SAMPLE_DATA], 16
    add             TMP3, TMP3, START_COL
    add             TMP4, TMP4, START_COL
    ldp             TMP7, TMP8, [SAMPLE_DATA], 16
    add             TMP5, TMP5, START_COL
    add             TMP6, TMP6, START_COL
    ld1             {v16.8b}, [TMP1]
    add             TMP7, TMP7, START_COL
    add             TMP8, TMP8, START_COL
    ld1             {v17.8b}, [TMP2]
    usubl           v16.8h, v16.8b, v0.8b
    ld1             {v18.8b}, [TMP3]
    usubl           v17.8h, v17.8b, v0.8b
    ld1             {v19.8b}, [TMP4]
    usubl           v18.8h, v18.8b, v0.8b
    ld1             {v20.8b}, [TMP5]
    usubl           v19.8h, v19.8b, v0.8b
    ld1             {v21.8b}, [TMP6]
    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64
    usubl           v20.8h, v20.8b, v0.8b
    ld1             {v22.8b}, [TMP7]
    usubl           v21.8h, v21.8b, v0.8b
    ld1             {v23.8b}, [TMP8]
    usubl           v22.8h, v22.8b, v0.8b
    usubl           v23.8h, v23.8b, v0.8b
    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64

    br              x30

    .unreq          SAMPLE_DATA
    .unreq          START_COL
    .unreq          WORKSPACE
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
    .unreq          TMP5
    .unreq          TMP6
    .unreq          TMP7
    .unreq          TMP8
    .unreq          TMPDUP

/*****************************************************************************/

/*
 * jsimd_fdct_islow_neon
 *
 * This function contains a slow-but-accurate integer implementation of the
 * forward DCT (Discrete Cosine Transform). The following code is based
 * directly on IJG's original jfdctint.c; see jfdctint.c for more details.
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 * rid of a bunch of VLD1.16 instructions
 */

#define CONST_BITS  13
#define PASS1_BITS  2

#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
#define DESCALE_P2  (CONST_BITS + PASS1_BITS)

#define XFIX_P_0_298  v0.h[0]
#define XFIX_N_0_390  v0.h[1]
#define XFIX_P_0_541  v0.h[2]
#define XFIX_P_0_765  v0.h[3]
#define XFIX_N_0_899  v0.h[4]
#define XFIX_P_1_175  v0.h[5]
#define XFIX_P_1_501  v0.h[6]
#define XFIX_N_1_847  v0.h[7]
#define XFIX_N_1_961  v1.h[0]
#define XFIX_P_2_053  v1.h[1]
#define XFIX_N_2_562  v1.h[2]
#define XFIX_P_3_072  v1.h[3]

asm_function jsimd_fdct_islow_neon

    DATA            .req x0
    TMP             .req x9

    /* Load constants */
    get_symbol_loc  TMP, Ljsimd_fdct_islow_neon_consts
    ld1             {v0.8h, v1.8h}, [TMP]

    /* Save NEON registers */
    sub             sp, sp, #64
    mov             x10, sp
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32

    /* Load all DATA into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d16    | d17    | v16.8h
     *   1  | d18    | d19    | v17.8h
     *   2  | d20    | d21    | v18.8h
     *   3  | d22    | d23    | v19.8h
     *   4  | d24    | d25    | v20.8h
     *   5  | d26    | d27    | v21.8h
     *   6  | d28    | d29    | v22.8h
     *   7  | d30    | d31    | v23.8h
     */

    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
    sub             DATA, DATA, #64

    /* Transpose */
    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
    /* 1-D FDCT */
    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
    add             v25.8h, v17.8h,
v22.8h /* tmp1 = dataptr[1] + dataptr[6]; */ 2419 sub v30.8h, v17.8h, v22.8h /* tmp6 = dataptr[1] - dataptr[6]; */ 2420 add v26.8h, v18.8h, v21.8h /* tmp2 = dataptr[2] + dataptr[5]; */ 2421 sub v29.8h, v18.8h, v21.8h /* tmp5 = dataptr[2] - dataptr[5]; */ 2422 add v27.8h, v19.8h, v20.8h /* tmp3 = dataptr[3] + dataptr[4]; */ 2423 sub v28.8h, v19.8h, v20.8h /* tmp4 = dataptr[3] - dataptr[4]; */ 2424 2425 /* even part */ 2426 2427 add v8.8h, v24.8h, v27.8h /* tmp10 = tmp0 + tmp3; */ 2428 sub v9.8h, v24.8h, v27.8h /* tmp13 = tmp0 - tmp3; */ 2429 add v10.8h, v25.8h, v26.8h /* tmp11 = tmp1 + tmp2; */ 2430 sub v11.8h, v25.8h, v26.8h /* tmp12 = tmp1 - tmp2; */ 2431 2432 add v16.8h, v8.8h, v10.8h /* tmp10 + tmp11 */ 2433 sub v20.8h, v8.8h, v10.8h /* tmp10 - tmp11 */ 2434 2435 add v18.8h, v11.8h, v9.8h /* tmp12 + tmp13 */ 2436 2437 shl v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */ 2438 shl v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */ 2439 2440 smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ 2441 smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ 2442 mov v22.16b, v18.16b 2443 mov v25.16b, v24.16b 2444 2445 smlal v18.4s, v9.4h, XFIX_P_0_765 /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ 2446 smlal2 v24.4s, v9.8h, XFIX_P_0_765 /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ 2447 smlal v22.4s, v11.4h, XFIX_N_1_847 /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ 2448 smlal2 v25.4s, v11.8h, XFIX_N_1_847 /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ 2449 2450 rshrn v18.4h, v18.4s, #DESCALE_P1 2451 rshrn v22.4h, v22.4s, #DESCALE_P1 2452 rshrn2 v18.8h, v24.4s, #DESCALE_P1 /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */ 2453 rshrn2 v22.8h, v25.4s, #DESCALE_P1 /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */ 2454 2455 /* Odd part */ 2456 2457 add v8.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */ 2458 add v9.8h, v29.8h, v30.8h /* z2 = tmp5 + tmp6; */ 2459 add v10.8h, v28.8h, v30.8h /* z3 = tmp4 + tmp6; */ 2460 add v11.8h, v29.8h, v31.8h /* z4 = tmp5 + tmp7; */ 2461 smull v4.4s, v10.4h, XFIX_P_1_175 /* z5 lo = z3 lo * XFIX_P_1_175 */ 2462 smull2 v5.4s, v10.8h, XFIX_P_1_175 2463 smlal v4.4s, v11.4h, XFIX_P_1_175 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */ 2464 smlal2 v5.4s, v11.8h, XFIX_P_1_175 2465 2466 smull2 v24.4s, v28.8h, XFIX_P_0_298 2467 smull2 v25.4s, v29.8h, XFIX_P_2_053 2468 smull2 v26.4s, v30.8h, XFIX_P_3_072 2469 smull2 v27.4s, v31.8h, XFIX_P_1_501 2470 smull v28.4s, v28.4h, XFIX_P_0_298 /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */ 2471 smull v29.4s, v29.4h, XFIX_P_2_053 /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */ 2472 smull v30.4s, v30.4h, XFIX_P_3_072 /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */ 2473 smull v31.4s, v31.4h, XFIX_P_1_501 /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */ 2474 2475 smull2 v12.4s, v8.8h, XFIX_N_0_899 2476 smull2 v13.4s, v9.8h, XFIX_N_2_562 2477 smull2 v14.4s, v10.8h, XFIX_N_1_961 2478 smull2 v15.4s, v11.8h, XFIX_N_0_390 2479 smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223); */ 2480 smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447); */ 2481 smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560); */ 2482 smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644); */ 2483 2484 add v10.4s, v10.4s, v4.4s /* z3 += z5 */ 2485 add v14.4s, v14.4s, 
v5.4s 2486 add v11.4s, v11.4s, v4.4s /* z4 += z5 */ 2487 add v15.4s, v15.4s, v5.4s 2488 2489 add v28.4s, v28.4s, v8.4s /* tmp4 += z1 */ 2490 add v24.4s, v24.4s, v12.4s 2491 add v29.4s, v29.4s, v9.4s /* tmp5 += z2 */ 2492 add v25.4s, v25.4s, v13.4s 2493 add v30.4s, v30.4s, v10.4s /* tmp6 += z3 */ 2494 add v26.4s, v26.4s, v14.4s 2495 add v31.4s, v31.4s, v11.4s /* tmp7 += z4 */ 2496 add v27.4s, v27.4s, v15.4s 2497 2498 add v28.4s, v28.4s, v10.4s /* tmp4 += z3 */ 2499 add v24.4s, v24.4s, v14.4s 2500 add v29.4s, v29.4s, v11.4s /* tmp5 += z4 */ 2501 add v25.4s, v25.4s, v15.4s 2502 add v30.4s, v30.4s, v9.4s /* tmp6 += z2 */ 2503 add v26.4s, v26.4s, v13.4s 2504 add v31.4s, v31.4s, v8.4s /* tmp7 += z1 */ 2505 add v27.4s, v27.4s, v12.4s 2506 2507 rshrn v23.4h, v28.4s, #DESCALE_P1 2508 rshrn v21.4h, v29.4s, #DESCALE_P1 2509 rshrn v19.4h, v30.4s, #DESCALE_P1 2510 rshrn v17.4h, v31.4s, #DESCALE_P1 2511 rshrn2 v23.8h, v24.4s, #DESCALE_P1 /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */ 2512 rshrn2 v21.8h, v25.4s, #DESCALE_P1 /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */ 2513 rshrn2 v19.8h, v26.4s, #DESCALE_P1 /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */ 2514 rshrn2 v17.8h, v27.4s, #DESCALE_P1 /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */ 2515 2516 /* Transpose */ 2517 transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4 2518 2519 /* 1-D FDCT */ 2520 add v24.8h, v16.8h, v23.8h /* tmp0 = dataptr[0] + dataptr[7]; */ 2521 sub v31.8h, v16.8h, v23.8h /* tmp7 = dataptr[0] - dataptr[7]; */ 2522 add v25.8h, v17.8h, v22.8h /* tmp1 = dataptr[1] + dataptr[6]; */ 2523 sub v30.8h, v17.8h, v22.8h /* tmp6 = dataptr[1] - dataptr[6]; */ 2524 add v26.8h, v18.8h, v21.8h /* tmp2 = dataptr[2] + dataptr[5]; */ 2525 sub v29.8h, v18.8h, v21.8h /* tmp5 = dataptr[2] - dataptr[5]; */ 2526 add v27.8h, v19.8h, v20.8h /* tmp3 = dataptr[3] + dataptr[4]; */ 2527 sub v28.8h, v19.8h, v20.8h /* tmp4 = dataptr[3] - dataptr[4]; */ 2528 2529 /* even part */ 2530 add v8.8h, v24.8h, v27.8h /* tmp10 = tmp0 + tmp3; */ 2531 sub v9.8h, v24.8h, v27.8h /* tmp13 = tmp0 - tmp3; */ 2532 add v10.8h, v25.8h, v26.8h /* tmp11 = tmp1 + tmp2; */ 2533 sub v11.8h, v25.8h, v26.8h /* tmp12 = tmp1 - tmp2; */ 2534 2535 add v16.8h, v8.8h, v10.8h /* tmp10 + tmp11 */ 2536 sub v20.8h, v8.8h, v10.8h /* tmp10 - tmp11 */ 2537 2538 add v18.8h, v11.8h, v9.8h /* tmp12 + tmp13 */ 2539 2540 srshr v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS); */ 2541 srshr v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS); */ 2542 2543 smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ 2544 smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ 2545 mov v22.16b, v18.16b 2546 mov v25.16b, v24.16b 2547 2548 smlal v18.4s, v9.4h, XFIX_P_0_765 /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ 2549 smlal2 v24.4s, v9.8h, XFIX_P_0_765 /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ 2550 smlal v22.4s, v11.4h, XFIX_N_1_847 /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ 2551 smlal2 v25.4s, v11.8h, XFIX_N_1_847 /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ 2552 2553 rshrn v18.4h, v18.4s, #DESCALE_P2 2554 rshrn v22.4h, v22.4s, #DESCALE_P2 2555 rshrn2 v18.8h, v24.4s, #DESCALE_P2 /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */ 2556 rshrn2 v22.8h, v25.4s, #DESCALE_P2 /* dataptr[6] = 
(DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */ 2557 2558 /* Odd part */ 2559 add v8.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */ 2560 add v9.8h, v29.8h, v30.8h /* z2 = tmp5 + tmp6; */ 2561 add v10.8h, v28.8h, v30.8h /* z3 = tmp4 + tmp6; */ 2562 add v11.8h, v29.8h, v31.8h /* z4 = tmp5 + tmp7; */ 2563 2564 smull v4.4s, v10.4h, XFIX_P_1_175 /* z5 lo = z3 lo * XFIX_P_1_175 */ 2565 smull2 v5.4s, v10.8h, XFIX_P_1_175 2566 smlal v4.4s, v11.4h, XFIX_P_1_175 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */ 2567 smlal2 v5.4s, v11.8h, XFIX_P_1_175 2568 2569 smull2 v24.4s, v28.8h, XFIX_P_0_298 2570 smull2 v25.4s, v29.8h, XFIX_P_2_053 2571 smull2 v26.4s, v30.8h, XFIX_P_3_072 2572 smull2 v27.4s, v31.8h, XFIX_P_1_501 2573 smull v28.4s, v28.4h, XFIX_P_0_298 /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */ 2574 smull v29.4s, v29.4h, XFIX_P_2_053 /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */ 2575 smull v30.4s, v30.4h, XFIX_P_3_072 /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */ 2576 smull v31.4s, v31.4h, XFIX_P_1_501 /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */ 2577 2578 smull2 v12.4s, v8.8h, XFIX_N_0_899 2579 smull2 v13.4s, v9.8h, XFIX_N_2_562 2580 smull2 v14.4s, v10.8h, XFIX_N_1_961 2581 smull2 v15.4s, v11.8h, XFIX_N_0_390 2582 smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223); */ 2583 smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447); */ 2584 smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560); */ 2585 smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644); */ 2586 2587 add v10.4s, v10.4s, v4.4s 2588 add v14.4s, v14.4s, v5.4s 2589 add v11.4s, v11.4s, v4.4s 2590 add v15.4s, v15.4s, v5.4s 2591 2592 add v28.4s, v28.4s, v8.4s /* tmp4 += z1 */ 2593 add v24.4s, v24.4s, v12.4s 2594 add v29.4s, v29.4s, v9.4s /* tmp5 += z2 */ 2595 add v25.4s, v25.4s, v13.4s 2596 add v30.4s, v30.4s, v10.4s /* tmp6 += z3 */ 2597 add v26.4s, v26.4s, v14.4s 2598 add v31.4s, v31.4s, v11.4s /* tmp7 += z4 */ 2599 add v27.4s, v27.4s, v15.4s 2600 2601 add v28.4s, v28.4s, v10.4s /* tmp4 += z3 */ 2602 add v24.4s, v24.4s, v14.4s 2603 add v29.4s, v29.4s, v11.4s /* tmp5 += z4 */ 2604 add v25.4s, v25.4s, v15.4s 2605 add v30.4s, v30.4s, v9.4s /* tmp6 += z2 */ 2606 add v26.4s, v26.4s, v13.4s 2607 add v31.4s, v31.4s, v8.4s /* tmp7 += z1 */ 2608 add v27.4s, v27.4s, v12.4s 2609 2610 rshrn v23.4h, v28.4s, #DESCALE_P2 2611 rshrn v21.4h, v29.4s, #DESCALE_P2 2612 rshrn v19.4h, v30.4s, #DESCALE_P2 2613 rshrn v17.4h, v31.4s, #DESCALE_P2 2614 rshrn2 v23.8h, v24.4s, #DESCALE_P2 /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */ 2615 rshrn2 v21.8h, v25.4s, #DESCALE_P2 /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */ 2616 rshrn2 v19.8h, v26.4s, #DESCALE_P2 /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */ 2617 rshrn2 v17.8h, v27.4s, #DESCALE_P2 /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */ 2618 2619 /* store results */ 2620 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64 2621 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] 2622 2623 /* Restore NEON registers */ 2624 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 2625 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 2626 2627 br x30 2628 2629 .unreq DATA 2630 .unreq TMP 2631 2632#undef XFIX_P_0_298 2633#undef XFIX_N_0_390 2634#undef XFIX_P_0_541 2635#undef XFIX_P_0_765 2636#undef XFIX_N_0_899 2637#undef XFIX_P_1_175 2638#undef XFIX_P_1_501 2639#undef XFIX_N_1_847 2640#undef XFIX_N_1_961 2641#undef 
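/*
 * The even-part rotation used in both passes above follows jfdctint.c
 * exactly; in scalar form (a sketch using the libjpeg INT32/DCTELEM types
 * and DESCALE macro quoted in the comments):
 *
 *   INT32 tmp10 = tmp0 + tmp3, tmp13 = tmp0 - tmp3;
 *   INT32 tmp11 = tmp1 + tmp2, tmp12 = tmp1 - tmp2;
 *   INT32 z1 = (tmp12 + tmp13) * 4433;               // FIX(0.541196100)
 *
 *   data[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS);
 *   data[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS);
 *   data[2] = (DCTELEM)DESCALE(z1 + tmp13 * 6270,    // FIX(0.765366865)
 *                              CONST_BITS - PASS1_BITS);
 *   data[6] = (DCTELEM)DESCALE(z1 - tmp12 * 15137,   // FIX(1.847759065)
 *                              CONST_BITS - PASS1_BITS);
 *
 * where DESCALE(x, n) = (x + ((INT32)1 << (n - 1))) >> n, matching the
 * rshrn/rshrn2 #DESCALE_P1 pairs; the second pass descales by
 * CONST_BITS + PASS1_BITS (DESCALE_P2) instead.
 */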
XFIX_P_2_053
#undef XFIX_N_2_562
#undef XFIX_P_3_072


/*****************************************************************************/

/*
 * jsimd_fdct_ifast_neon
 *
 * This function contains a fast but less accurate integer implementation of
 * the forward DCT (Discrete Cosine Transform). It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
 * function from jfdctfst.c.
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 * rid of a bunch of VLD1.16 instructions
 */

#undef XFIX_0_541196100
#define XFIX_0_382683433  v0.h[0]
#define XFIX_0_541196100  v0.h[1]
#define XFIX_0_707106781  v0.h[2]
#define XFIX_1_306562965  v0.h[3]

asm_function jsimd_fdct_ifast_neon

    DATA            .req x0
    TMP             .req x9

    /* Load constants */
    get_symbol_loc  TMP, Ljsimd_fdct_ifast_neon_consts
    ld1             {v0.4h}, [TMP]

    /* Load all DATA into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d16    | d17    | v16.8h
     *   1  | d18    | d19    | v17.8h
     *   2  | d20    | d21    | v18.8h
     *   3  | d22    | d23    | v19.8h
     *   4  | d24    | d25    | v20.8h
     *   5  | d26    | d27    | v21.8h
     *   6  | d28    | d29    | v22.8h
     *   7  | d30    | d31    | v23.8h
     */

    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
    mov             TMP, #2
    sub             DATA, DATA, #64
1:
    /* Transpose */
    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4
    subs            TMP, TMP, #1
    /* 1-D FDCT */
    add             v4.8h, v19.8h, v20.8h
    sub             v20.8h, v19.8h, v20.8h
    sub             v28.8h, v18.8h, v21.8h
    add             v18.8h, v18.8h, v21.8h
    sub             v29.8h, v17.8h, v22.8h
    add             v17.8h, v17.8h, v22.8h
    sub             v21.8h, v16.8h, v23.8h
    add             v16.8h, v16.8h, v23.8h
    sub             v6.8h, v17.8h, v18.8h
    sub             v7.8h, v16.8h, v4.8h
    add             v5.8h, v17.8h, v18.8h
    add             v6.8h, v6.8h, v7.8h
    add             v4.8h, v16.8h, v4.8h
    sqdmulh         v6.8h, v6.8h, XFIX_0_707106781
    add             v19.8h, v20.8h, v28.8h
    add             v16.8h, v4.8h, v5.8h
    sub             v20.8h, v4.8h, v5.8h
    add             v5.8h, v28.8h, v29.8h
    add             v29.8h, v29.8h, v21.8h
    sqdmulh         v5.8h, v5.8h, XFIX_0_707106781
    sub             v28.8h, v19.8h, v29.8h
    add             v18.8h, v7.8h, v6.8h
    sqdmulh         v28.8h, v28.8h, XFIX_0_382683433
    sub             v22.8h, v7.8h, v6.8h
    sqdmulh         v19.8h, v19.8h, XFIX_0_541196100
    sqdmulh         v7.8h, v29.8h, XFIX_1_306562965
    add             v6.8h, v21.8h, v5.8h
    sub             v5.8h, v21.8h, v5.8h
    add             v29.8h, v29.8h, v28.8h
    add             v19.8h, v19.8h, v28.8h
    add             v29.8h, v29.8h, v7.8h
    add             v21.8h, v5.8h, v19.8h
    sub             v19.8h, v5.8h, v19.8h
    add             v17.8h, v6.8h, v29.8h
    sub             v23.8h, v6.8h, v29.8h

    b.ne            1b

    /* store results */
    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]

    br              x30

    .unreq          DATA
    .unreq          TMP
#undef XFIX_0_382683433
#undef XFIX_0_541196100
#undef XFIX_0_707106781
#undef XFIX_1_306562965


/*****************************************************************************/

/*
 * GLOBAL(void)
 * jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
 *                     DCTELEM *workspace);
 *
 */
asm_function jsimd_quantize_neon

    COEF_BLOCK      .req x0
    DIVISORS        .req x1
    WORKSPACE       .req x2

    RECIPROCAL      .req DIVISORS
    CORRECTION      .req x9
    SHIFT           .req x10
    LOOP_COUNT      .req x11
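/*
 * The loop below is, in effect, the following per-coefficient computation
 * (scalar sketch; DIVISORS points at consecutive 64-entry reciprocal,
 * correction, scale and shift tables, of which the scale table is not used
 * here):
 *
 *   static void quantize_sketch(short *coef, const unsigned short *recip,
 *                               const unsigned short *corr,
 *                               const unsigned short *shift, const short *ws)
 *   {
 *     for (int i = 0; i < 64; i++) {
 *       int sign = ws[i] >> 15;                      // 0 or -1  (sshr #15)
 *       unsigned mag = (unsigned)(ws[i] < 0 ? -ws[i] : ws[i]);   // abs
 *       mag = (mag + corr[i]) * recip[i] >> 16;      // * (1/divisor)
 *       mag >>= shift[i];                            // ushl by negated shift
 *       coef[i] = (short)(((int)mag ^ sign) - sign); // restore the sign
 *     }
 *   }
 */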
2767 2768 mov LOOP_COUNT, #2 2769 add CORRECTION, DIVISORS, #(64 * 2) 2770 add SHIFT, DIVISORS, #(64 * 6) 27711: 2772 subs LOOP_COUNT, LOOP_COUNT, #1 2773 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64 2774 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64 2775 abs v20.8h, v0.8h 2776 abs v21.8h, v1.8h 2777 abs v22.8h, v2.8h 2778 abs v23.8h, v3.8h 2779 ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64 2780 add v20.8h, v20.8h, v4.8h /* add correction */ 2781 add v21.8h, v21.8h, v5.8h 2782 add v22.8h, v22.8h, v6.8h 2783 add v23.8h, v23.8h, v7.8h 2784 umull v4.4s, v20.4h, v28.4h /* multiply by reciprocal */ 2785 umull2 v16.4s, v20.8h, v28.8h 2786 umull v5.4s, v21.4h, v29.4h 2787 umull2 v17.4s, v21.8h, v29.8h 2788 umull v6.4s, v22.4h, v30.4h /* multiply by reciprocal */ 2789 umull2 v18.4s, v22.8h, v30.8h 2790 umull v7.4s, v23.4h, v31.4h 2791 umull2 v19.4s, v23.8h, v31.8h 2792 ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64 2793 shrn v4.4h, v4.4s, #16 2794 shrn v5.4h, v5.4s, #16 2795 shrn v6.4h, v6.4s, #16 2796 shrn v7.4h, v7.4s, #16 2797 shrn2 v4.8h, v16.4s, #16 2798 shrn2 v5.8h, v17.4s, #16 2799 shrn2 v6.8h, v18.4s, #16 2800 shrn2 v7.8h, v19.4s, #16 2801 neg v24.8h, v24.8h 2802 neg v25.8h, v25.8h 2803 neg v26.8h, v26.8h 2804 neg v27.8h, v27.8h 2805 sshr v0.8h, v0.8h, #15 /* extract sign */ 2806 sshr v1.8h, v1.8h, #15 2807 sshr v2.8h, v2.8h, #15 2808 sshr v3.8h, v3.8h, #15 2809 ushl v4.8h, v4.8h, v24.8h /* shift */ 2810 ushl v5.8h, v5.8h, v25.8h 2811 ushl v6.8h, v6.8h, v26.8h 2812 ushl v7.8h, v7.8h, v27.8h 2813 2814 eor v4.16b, v4.16b, v0.16b /* restore sign */ 2815 eor v5.16b, v5.16b, v1.16b 2816 eor v6.16b, v6.16b, v2.16b 2817 eor v7.16b, v7.16b, v3.16b 2818 sub v4.8h, v4.8h, v0.8h 2819 sub v5.8h, v5.8h, v1.8h 2820 sub v6.8h, v6.8h, v2.8h 2821 sub v7.8h, v7.8h, v3.8h 2822 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64 2823 2824 b.ne 1b 2825 2826 br x30 /* return */ 2827 2828 .unreq COEF_BLOCK 2829 .unreq DIVISORS 2830 .unreq WORKSPACE 2831 .unreq RECIPROCAL 2832 .unreq CORRECTION 2833 .unreq SHIFT 2834 .unreq LOOP_COUNT 2835 2836 2837/*****************************************************************************/ 2838 2839/* 2840 * Downsample pixel values of a single component. 2841 * This version handles the common case of 2:1 horizontal and 1:1 vertical, 2842 * without smoothing. 
2843 * 2844 * GLOBAL(void) 2845 * jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor, 2846 * JDIMENSION v_samp_factor, 2847 * JDIMENSION width_in_blocks, 2848 * JSAMPARRAY input_data, JSAMPARRAY output_data); 2849 */ 2850 2851asm_function jsimd_h2v1_downsample_neon 2852 IMAGE_WIDTH .req x0 2853 MAX_V_SAMP .req x1 2854 V_SAMP .req x2 2855 BLOCK_WIDTH .req x3 2856 INPUT_DATA .req x4 2857 OUTPUT_DATA .req x5 2858 OUTPTR .req x9 2859 INPTR .req x10 2860 TMP1 .req x11 2861 TMP2 .req x12 2862 TMP3 .req x13 2863 TMPDUP .req w15 2864 2865 mov TMPDUP, #0x10000 2866 lsl TMP2, BLOCK_WIDTH, #4 2867 sub TMP2, TMP2, IMAGE_WIDTH 2868 get_symbol_loc TMP3, Ljsimd_h2_downsample_neon_consts 2869 add TMP3, TMP3, TMP2, lsl #4 2870 dup v16.4s, TMPDUP 2871 ld1 {v18.16b}, [TMP3] 2872 28731: /* row loop */ 2874 ldr INPTR, [INPUT_DATA], #8 2875 ldr OUTPTR, [OUTPUT_DATA], #8 2876 subs TMP1, BLOCK_WIDTH, #1 2877 b.eq 3f 28782: /* columns */ 2879 ld1 {v0.16b}, [INPTR], #16 2880 mov v4.16b, v16.16b 2881 subs TMP1, TMP1, #1 2882 uadalp v4.8h, v0.16b 2883 shrn v6.8b, v4.8h, #1 2884 st1 {v6.8b}, [OUTPTR], #8 2885 b.ne 2b 28863: /* last columns */ 2887 ld1 {v0.16b}, [INPTR] 2888 mov v4.16b, v16.16b 2889 subs V_SAMP, V_SAMP, #1 2890 /* expand right */ 2891 tbl v2.16b, {v0.16b}, v18.16b 2892 uadalp v4.8h, v2.16b 2893 shrn v6.8b, v4.8h, #1 2894 st1 {v6.8b}, [OUTPTR], #8 2895 b.ne 1b 2896 2897 br x30 2898 2899 .unreq IMAGE_WIDTH 2900 .unreq MAX_V_SAMP 2901 .unreq V_SAMP 2902 .unreq BLOCK_WIDTH 2903 .unreq INPUT_DATA 2904 .unreq OUTPUT_DATA 2905 .unreq OUTPTR 2906 .unreq INPTR 2907 .unreq TMP1 2908 .unreq TMP2 2909 .unreq TMP3 2910 .unreq TMPDUP 2911 2912 2913/*****************************************************************************/ 2914 2915/* 2916 * Downsample pixel values of a single component. 2917 * This version handles the common case of 2:1 horizontal and 2:1 vertical, 2918 * without smoothing. 
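/*
 * Scalar model of the downsampler above and of its 2:1-vertical variant
 * below (illustrative sketch): each output sample is the average of two
 * (h2v1) or four (h2v2) input samples, with the alternating bias that
 * libjpeg uses to avoid systematic rounding drift.  The 0x10000 (h2v1) and
 * 0x20001 | 1 (h2v2) words dup'ed across v16 provide exactly that per-lane
 * bias pattern.
 *
 *   static void h2v1_sketch(const unsigned char *in, unsigned char *out,
 *                           int out_w)
 *   {
 *     for (int j = 0; j < out_w; j++)                 // bias = 0,1,0,1,...
 *       out[j] = (unsigned char)((in[2*j] + in[2*j+1] + (j & 1)) >> 1);
 *   }
 *
 *   static void h2v2_sketch(const unsigned char *in0,
 *                           const unsigned char *in1,
 *                           unsigned char *out, int out_w)
 *   {
 *     for (int j = 0; j < out_w; j++)                 // bias = 1,2,1,2,...
 *       out[j] = (unsigned char)((in0[2*j] + in0[2*j+1] +
 *                                 in1[2*j] + in1[2*j+1] + 1 + (j & 1)) >> 2);
 *   }
 *
 * The tbl lookups with Ljsimd_h2_downsample_neon_consts replicate the last
 * valid input byte, so a partial final block behaves as if the row had been
 * padded by edge duplication.
 */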
2919 * 2920 * GLOBAL(void) 2921 * jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor, 2922 * JDIMENSION v_samp_factor, 2923 * JDIMENSION width_in_blocks, 2924 * JSAMPARRAY input_data, JSAMPARRAY output_data); 2925 */ 2926 2927.balign 16 2928asm_function jsimd_h2v2_downsample_neon 2929 IMAGE_WIDTH .req x0 2930 MAX_V_SAMP .req x1 2931 V_SAMP .req x2 2932 BLOCK_WIDTH .req x3 2933 INPUT_DATA .req x4 2934 OUTPUT_DATA .req x5 2935 OUTPTR .req x9 2936 INPTR0 .req x10 2937 INPTR1 .req x14 2938 TMP1 .req x11 2939 TMP2 .req x12 2940 TMP3 .req x13 2941 TMPDUP .req w15 2942 2943 mov TMPDUP, #1 2944 lsl TMP2, BLOCK_WIDTH, #4 2945 lsl TMPDUP, TMPDUP, #17 2946 sub TMP2, TMP2, IMAGE_WIDTH 2947 get_symbol_loc TMP3, Ljsimd_h2_downsample_neon_consts 2948 orr TMPDUP, TMPDUP, #1 2949 add TMP3, TMP3, TMP2, lsl #4 2950 dup v16.4s, TMPDUP 2951 ld1 {v18.16b}, [TMP3] 2952 29531: /* row loop */ 2954 ldr INPTR0, [INPUT_DATA], #8 2955 ldr OUTPTR, [OUTPUT_DATA], #8 2956 ldr INPTR1, [INPUT_DATA], #8 2957 subs TMP1, BLOCK_WIDTH, #1 2958 b.eq 3f 29592: /* columns */ 2960 ld1 {v0.16b}, [INPTR0], #16 2961 ld1 {v1.16b}, [INPTR1], #16 2962 mov v4.16b, v16.16b 2963 subs TMP1, TMP1, #1 2964 uadalp v4.8h, v0.16b 2965 uadalp v4.8h, v1.16b 2966 shrn v6.8b, v4.8h, #2 2967 st1 {v6.8b}, [OUTPTR], #8 2968 b.ne 2b 29693: /* last columns */ 2970 ld1 {v0.16b}, [INPTR0], #16 2971 ld1 {v1.16b}, [INPTR1], #16 2972 mov v4.16b, v16.16b 2973 subs V_SAMP, V_SAMP, #1 2974 /* expand right */ 2975 tbl v2.16b, {v0.16b}, v18.16b 2976 tbl v3.16b, {v1.16b}, v18.16b 2977 uadalp v4.8h, v2.16b 2978 uadalp v4.8h, v3.16b 2979 shrn v6.8b, v4.8h, #2 2980 st1 {v6.8b}, [OUTPTR], #8 2981 b.ne 1b 2982 2983 br x30 2984 2985 .unreq IMAGE_WIDTH 2986 .unreq MAX_V_SAMP 2987 .unreq V_SAMP 2988 .unreq BLOCK_WIDTH 2989 .unreq INPUT_DATA 2990 .unreq OUTPUT_DATA 2991 .unreq OUTPTR 2992 .unreq INPTR0 2993 .unreq INPTR1 2994 .unreq TMP1 2995 .unreq TMP2 2996 .unreq TMP3 2997 .unreq TMPDUP 2998 2999 3000/*****************************************************************************/ 3001 3002/* 3003 * GLOBAL(JOCTET *) 3004 * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer, 3005 * JCOEFPTR block, int last_dc_val, 3006 * c_derived_tbl *dctbl, c_derived_tbl *actbl) 3007 * 3008 */ 3009 3010 BUFFER .req x1 3011 PUT_BUFFER .req x6 3012 PUT_BITS .req x7 3013 PUT_BITSw .req w7 3014 3015.macro emit_byte 3016 sub PUT_BITS, PUT_BITS, #0x8 3017 lsr x19, PUT_BUFFER, PUT_BITS 3018 uxtb w19, w19 3019 strb w19, [BUFFER, #1]! 3020 cmp w19, #0xff 3021 b.ne 14f 3022 strb wzr, [BUFFER, #1]! 
302314: 3024.endm 3025.macro put_bits CODE, SIZE 3026 lsl PUT_BUFFER, PUT_BUFFER, \SIZE 3027 add PUT_BITS, PUT_BITS, \SIZE 3028 orr PUT_BUFFER, PUT_BUFFER, \CODE 3029.endm 3030.macro checkbuf31 3031 cmp PUT_BITS, #0x20 3032 b.lt 31f 3033 emit_byte 3034 emit_byte 3035 emit_byte 3036 emit_byte 303731: 3038.endm 3039.macro checkbuf47 3040 cmp PUT_BITS, #0x30 3041 b.lt 47f 3042 emit_byte 3043 emit_byte 3044 emit_byte 3045 emit_byte 3046 emit_byte 3047 emit_byte 304847: 3049.endm 3050 3051.macro generate_jsimd_huff_encode_one_block fast_tbl 3052 3053.balign 16 3054 3055.if \fast_tbl == 1 3056asm_function jsimd_huff_encode_one_block_neon 3057.else 3058asm_function jsimd_huff_encode_one_block_neon_slowtbl 3059.endif 3060 sub sp, sp, 272 3061 sub BUFFER, BUFFER, #0x1 /* BUFFER=buffer-- */ 3062 /* Save ARM registers */ 3063 stp x19, x20, [sp] 3064.if \fast_tbl == 1 3065 get_symbol_loc x15, Ljsimd_huff_encode_one_block_neon_consts 3066.else 3067 get_symbol_loc x15, Ljsimd_huff_encode_one_block_neon_slowtbl_consts 3068.endif 3069 ldr PUT_BUFFER, [x0, #0x10] 3070 ldr PUT_BITSw, [x0, #0x18] 3071 ldrsh w12, [x2] /* load DC coeff in w12 */ 3072 /* prepare data */ 3073.if \fast_tbl == 1 3074 ld1 {v23.16b}, [x15], #16 3075 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64 3076 ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64 3077 ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64 3078 ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64 3079 ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64 3080 sub w12, w12, w3 /* last_dc_val, not used afterwards */ 3081 /* ZigZag 8x8 */ 3082 tbl v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b 3083 tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b 3084 tbl v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b 3085 tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b 3086 tbl v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b 3087 tbl v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b 3088 tbl v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b 3089 tbl v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b 3090 ins v0.h[0], w12 3091 tbx v1.16b, {v28.16b}, v16.16b 3092 tbx v2.16b, {v29.16b, v30.16b}, v17.16b 3093 tbx v5.16b, {v29.16b, v30.16b}, v18.16b 3094 tbx v6.16b, {v31.16b}, v19.16b 3095.else 3096 add x13, x2, #0x22 3097 sub w12, w12, w3 /* last_dc_val, not used afterwards */ 3098 ld1 {v23.16b}, [x15] 3099 add x14, x2, #0x18 3100 add x3, x2, #0x36 3101 ins v0.h[0], w12 3102 add x9, x2, #0x2 3103 ld1 {v1.h}[0], [x13] 3104 add x15, x2, #0x30 3105 ld1 {v2.h}[0], [x14] 3106 add x19, x2, #0x26 3107 ld1 {v3.h}[0], [x3] 3108 add x20, x2, #0x28 3109 ld1 {v0.h}[1], [x9] 3110 add x12, x2, #0x10 3111 ld1 {v1.h}[1], [x15] 3112 add x13, x2, #0x40 3113 ld1 {v2.h}[1], [x19] 3114 add x14, x2, #0x34 3115 ld1 {v3.h}[1], [x20] 3116 add x3, x2, #0x1a 3117 ld1 {v0.h}[2], [x12] 3118 add x9, x2, #0x20 3119 ld1 {v1.h}[2], [x13] 3120 add x15, x2, #0x32 3121 ld1 {v2.h}[2], [x14] 3122 add x19, x2, #0x42 3123 ld1 {v3.h}[2], [x3] 3124 add x20, x2, #0xc 3125 ld1 {v0.h}[3], [x9] 3126 add x12, x2, #0x12 3127 ld1 {v1.h}[3], [x15] 3128 add x13, x2, #0x24 3129 ld1 {v2.h}[3], [x19] 3130 add x14, x2, #0x50 3131 ld1 {v3.h}[3], [x20] 3132 add x3, x2, #0xe 3133 ld1 {v0.h}[4], [x12] 3134 add x9, x2, #0x4 3135 ld1 {v1.h}[4], [x13] 3136 add x15, x2, #0x16 3137 ld1 {v2.h}[4], [x14] 3138 add x19, x2, #0x60 3139 ld1 {v3.h}[4], [x3] 3140 add x20, x2, #0x1c 3141 ld1 {v0.h}[5], [x9] 3142 add x12, x2, #0x6 3143 ld1 {v1.h}[5], [x15] 3144 add x13, x2, #0x8 3145 ld1 {v2.h}[5], [x19] 3146 add x14, x2, #0x52 
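/*
 * The emit_byte/put_bits/checkbuf macros above implement a 64-bit bit
 * buffer: codes are appended at the least-significant end and whole bytes
 * are flushed from the most-significant end, with a zero byte stuffed after
 * every 0xFF as JPEG requires.  A scalar rendering (illustrative sketch):
 *
 *   typedef struct {
 *     unsigned long long buf;    // PUT_BUFFER
 *     int bits;                  // PUT_BITS
 *     unsigned char *p;          // BUFFER
 *   } bitbuf;
 *
 *   static void put_bits_sketch(bitbuf *s, unsigned code, int size)
 *   {
 *     s->buf = (s->buf << size) | code;
 *     s->bits += size;
 *   }
 *
 *   static void emit_byte_sketch(bitbuf *s)
 *   {
 *     s->bits -= 8;
 *     unsigned char b = (unsigned char)(s->buf >> s->bits);
 *     *s->p++ = b;
 *     if (b == 0xFF)
 *       *s->p++ = 0;             // JPEG byte stuffing
 *   }
 *
 *   static void checkbuf47_sketch(bitbuf *s)   // checkbuf31 is analogous
 *   {
 *     if (s->bits >= 48)         // keep headroom for the next code pair
 *       for (int i = 0; i < 6; i++)
 *         emit_byte_sketch(s);
 *   }
 */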
3147 ld1 {v3.h}[5], [x20] 3148 add x3, x2, #0x2a 3149 ld1 {v0.h}[6], [x12] 3150 add x9, x2, #0x14 3151 ld1 {v1.h}[6], [x13] 3152 add x15, x2, #0xa 3153 ld1 {v2.h}[6], [x14] 3154 add x19, x2, #0x44 3155 ld1 {v3.h}[6], [x3] 3156 add x20, x2, #0x38 3157 ld1 {v0.h}[7], [x9] 3158 add x12, x2, #0x46 3159 ld1 {v1.h}[7], [x15] 3160 add x13, x2, #0x3a 3161 ld1 {v2.h}[7], [x19] 3162 add x14, x2, #0x74 3163 ld1 {v3.h}[7], [x20] 3164 add x3, x2, #0x6a 3165 ld1 {v4.h}[0], [x12] 3166 add x9, x2, #0x54 3167 ld1 {v5.h}[0], [x13] 3168 add x15, x2, #0x2c 3169 ld1 {v6.h}[0], [x14] 3170 add x19, x2, #0x76 3171 ld1 {v7.h}[0], [x3] 3172 add x20, x2, #0x78 3173 ld1 {v4.h}[1], [x9] 3174 add x12, x2, #0x62 3175 ld1 {v5.h}[1], [x15] 3176 add x13, x2, #0x1e 3177 ld1 {v6.h}[1], [x19] 3178 add x14, x2, #0x68 3179 ld1 {v7.h}[1], [x20] 3180 add x3, x2, #0x7a 3181 ld1 {v4.h}[2], [x12] 3182 add x9, x2, #0x70 3183 ld1 {v5.h}[2], [x13] 3184 add x15, x2, #0x2e 3185 ld1 {v6.h}[2], [x14] 3186 add x19, x2, #0x5a 3187 ld1 {v7.h}[2], [x3] 3188 add x20, x2, #0x6c 3189 ld1 {v4.h}[3], [x9] 3190 add x12, x2, #0x72 3191 ld1 {v5.h}[3], [x15] 3192 add x13, x2, #0x3c 3193 ld1 {v6.h}[3], [x19] 3194 add x14, x2, #0x4c 3195 ld1 {v7.h}[3], [x20] 3196 add x3, x2, #0x5e 3197 ld1 {v4.h}[4], [x12] 3198 add x9, x2, #0x64 3199 ld1 {v5.h}[4], [x13] 3200 add x15, x2, #0x4a 3201 ld1 {v6.h}[4], [x14] 3202 add x19, x2, #0x3e 3203 ld1 {v7.h}[4], [x3] 3204 add x20, x2, #0x6e 3205 ld1 {v4.h}[5], [x9] 3206 add x12, x2, #0x56 3207 ld1 {v5.h}[5], [x15] 3208 add x13, x2, #0x58 3209 ld1 {v6.h}[5], [x19] 3210 add x14, x2, #0x4e 3211 ld1 {v7.h}[5], [x20] 3212 add x3, x2, #0x7c 3213 ld1 {v4.h}[6], [x12] 3214 add x9, x2, #0x48 3215 ld1 {v5.h}[6], [x13] 3216 add x15, x2, #0x66 3217 ld1 {v6.h}[6], [x14] 3218 add x19, x2, #0x5c 3219 ld1 {v7.h}[6], [x3] 3220 add x20, x2, #0x7e 3221 ld1 {v4.h}[7], [x9] 3222 ld1 {v5.h}[7], [x15] 3223 ld1 {v6.h}[7], [x19] 3224 ld1 {v7.h}[7], [x20] 3225.endif 3226 cmlt v24.8h, v0.8h, #0 3227 cmlt v25.8h, v1.8h, #0 3228 cmlt v26.8h, v2.8h, #0 3229 cmlt v27.8h, v3.8h, #0 3230 cmlt v28.8h, v4.8h, #0 3231 cmlt v29.8h, v5.8h, #0 3232 cmlt v30.8h, v6.8h, #0 3233 cmlt v31.8h, v7.8h, #0 3234 abs v0.8h, v0.8h 3235 abs v1.8h, v1.8h 3236 abs v2.8h, v2.8h 3237 abs v3.8h, v3.8h 3238 abs v4.8h, v4.8h 3239 abs v5.8h, v5.8h 3240 abs v6.8h, v6.8h 3241 abs v7.8h, v7.8h 3242 eor v24.16b, v24.16b, v0.16b 3243 eor v25.16b, v25.16b, v1.16b 3244 eor v26.16b, v26.16b, v2.16b 3245 eor v27.16b, v27.16b, v3.16b 3246 eor v28.16b, v28.16b, v4.16b 3247 eor v29.16b, v29.16b, v5.16b 3248 eor v30.16b, v30.16b, v6.16b 3249 eor v31.16b, v31.16b, v7.16b 3250 cmeq v16.8h, v0.8h, #0 3251 cmeq v17.8h, v1.8h, #0 3252 cmeq v18.8h, v2.8h, #0 3253 cmeq v19.8h, v3.8h, #0 3254 cmeq v20.8h, v4.8h, #0 3255 cmeq v21.8h, v5.8h, #0 3256 cmeq v22.8h, v6.8h, #0 3257 xtn v16.8b, v16.8h 3258 xtn v18.8b, v18.8h 3259 xtn v20.8b, v20.8h 3260 xtn v22.8b, v22.8h 3261 umov w14, v0.h[0] 3262 xtn2 v16.16b, v17.8h 3263 umov w13, v24.h[0] 3264 xtn2 v18.16b, v19.8h 3265 clz w14, w14 3266 xtn2 v20.16b, v21.8h 3267 lsl w13, w13, w14 3268 cmeq v17.8h, v7.8h, #0 3269 sub w12, w14, #32 3270 xtn2 v22.16b, v17.8h 3271 lsr w13, w13, w14 3272 and v16.16b, v16.16b, v23.16b 3273 neg w12, w12 3274 and v18.16b, v18.16b, v23.16b 3275 add x3, x4, #0x400 /* r1 = dctbl->ehufsi */ 3276 and v20.16b, v20.16b, v23.16b 3277 add x15, sp, #0x90 /* x15 = t2 */ 3278 and v22.16b, v22.16b, v23.16b 3279 ldr w10, [x4, x12, lsl #2] 3280 addp v16.16b, v16.16b, v18.16b 3281 ldrb w11, [x3, x12] 3282 addp v20.16b, v20.16b, v22.16b 3283 
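/*
 * The cmeq/xtn/and/addp cascade above condenses the 64 coefficients into a
 * 64-bit bitmap with one bit per nonzero coefficient (after the mvn/lsr/rbit
 * steps it sits in "index0" order, most-significant bit first).  The
 * encoding loop then finds each zero run with a count-leading-zeros, along
 * these lines (illustrative sketch; __builtin_clzll is the GCC/Clang
 * intrinsic corresponding to the clz instruction):
 *
 *   static void walk_nonzero_sketch(unsigned long long index0)
 *   {
 *     int pos = 0;                          // position in the zigzag order
 *     while (index0) {
 *       int run = __builtin_clzll(index0);  // zeros before the next nonzero
 *       pos += run;
 *       // ...emit a ZRL (0xF0) code for each full run of 16 zeros, then the
 *       // Huffman code for (run & 15, nbits) and the value bits of the
 *       // coefficient at 'pos'...
 *       index0 <<= run;                     // drop the zero run
 *       index0 <<= 1;                       // and the bit just consumed
 *       pos++;
 *     }
 *   }
 */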
    checkbuf47
    addp            v16.16b, v16.16b, v20.16b
    put_bits        x10, x11
    addp            v16.16b, v16.16b, v18.16b
    checkbuf47
    umov            x9, v16.D[0]
    put_bits        x13, x12
    cnt             v17.8b, v16.8b
    mvn             x9, x9
    addv            B18, v17.8b
    add             x4, x5, #0x400            /* x4 = actbl->ehufsi */
    umov            w12, v18.b[0]             /* w12 = number of zero coefficients */
    lsr             x9, x9, #0x1              /* drop the DC bit; keep AC flags only */
    ldr             w13, [x5, #0x3c0]         /* x13 = actbl->ehufco[0xf0] */
    rbit            x9, x9                    /* x9 = index0 */
    ldrb            w14, [x4, #0xf0]          /* x14 = actbl->ehufsi[0xf0] */
    cmp             w12, #(64-8)              /* more than 8 non-zero coefficients? */
    add             x11, sp, #16
    b.lt            4f                        /* then use the vectorized nbits path */
    cbz             x9, 6f                    /* no non-zero AC coefficients */
    st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
    st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
    st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
1:
    clz             x2, x9                    /* x2 = run of zero AC coefficients */
    add             x15, x15, x2, lsl #1
    lsl             x9, x9, x2
    ldrh            w20, [x15, #-126]         /* w20 = |coefficient| */
2:
    cmp             x2, #0x10
    b.lt            3f
    sub             x2, x2, #0x10
    checkbuf47
    put_bits        x13, x14                  /* run >= 16: emit ZRL (0xf0) */
    b               2b
3:
    clz             w20, w20
    ldrh            w3, [x15, #2]!            /* w3 = diff bits */
    sub             w11, w20, #32
    lsl             w3, w3, w20
    neg             w11, w11                  /* w11 = nbits = 32 - clz */
    lsr             w3, w3, w20               /* keep low nbits bits of diff */
    add             x2, x11, x2, lsl #4       /* x2 = (run << 4) + nbits */
    lsl             x9, x9, #0x1
    ldr             w12, [x5, x2, lsl #2]     /* w12 = actbl->ehufco[x2] */
    ldrb            w10, [x4, x2]             /* w10 = actbl->ehufsi[x2] */
    checkbuf31
    put_bits        x12, x10
    put_bits        x3, x11
    cbnz            x9, 1b
    b               6f
4:
    /* More than 8 non-zero coefficients: precompute nbits and the masked
     * diff bits for all 64 coefficients with vector code, then read them
     * back from the stack buffers in the encoding loop below.
     */
    movi            v21.8h, #0x0010
    clz             v0.8h, v0.8h
    clz             v1.8h, v1.8h
    clz             v2.8h, v2.8h
    clz             v3.8h, v3.8h
    clz             v4.8h, v4.8h
    clz             v5.8h, v5.8h
    clz             v6.8h, v6.8h
    clz             v7.8h, v7.8h
    ushl            v24.8h, v24.8h, v0.8h
    ushl            v25.8h, v25.8h, v1.8h
    ushl            v26.8h, v26.8h, v2.8h
    ushl            v27.8h, v27.8h, v3.8h
    ushl            v28.8h, v28.8h, v4.8h
    ushl            v29.8h, v29.8h, v5.8h
    ushl            v30.8h, v30.8h, v6.8h
    ushl            v31.8h, v31.8h, v7.8h
    neg             v0.8h, v0.8h
    neg             v1.8h, v1.8h
    neg             v2.8h, v2.8h
    neg             v3.8h, v3.8h
    neg             v4.8h, v4.8h
    neg             v5.8h, v5.8h
    neg             v6.8h, v6.8h
    neg             v7.8h, v7.8h
    ushl            v24.8h, v24.8h, v0.8h     /* shift back: keep low nbits bits */
    ushl            v25.8h, v25.8h, v1.8h
    ushl            v26.8h, v26.8h, v2.8h
    ushl            v27.8h, v27.8h, v3.8h
    ushl            v28.8h, v28.8h, v4.8h
    ushl            v29.8h, v29.8h, v5.8h
    ushl            v30.8h, v30.8h, v6.8h
    ushl            v31.8h, v31.8h, v7.8h
    add             v0.8h, v21.8h, v0.8h      /* nbits = 16 - clz */
    add             v1.8h, v21.8h, v1.8h
    add             v2.8h, v21.8h, v2.8h
    add             v3.8h, v21.8h, v3.8h
    add             v4.8h, v21.8h, v4.8h
    add             v5.8h, v21.8h, v5.8h
    add             v6.8h, v21.8h, v6.8h
    add             v7.8h, v21.8h, v7.8h
    st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
    st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
    st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
1:
    clz             x2, x9                    /* x2 = run of zero AC coefficients */
    add             x15, x15, x2, lsl #1
    lsl             x9, x9, x2
    ldrh            w11, [x15, #-126]         /* w11 = precomputed nbits */
2:
    cmp             x2, #0x10
    b.lt            3f
    sub             x2, x2, #0x10
    checkbuf47
    put_bits        x13, x14                  /* run >= 16: emit ZRL (0xf0) */
    b               2b
3:
    ldrh            w3, [x15, #2]!            /* w3 = masked diff bits */
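    /* x2 = (run << 4) + nbits is the composite RUN/SIZE symbol used to
     * index actbl->ehufco and actbl->ehufsi below.
     */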
    add             x2, x11, x2, lsl #4
    lsl             x9, x9, #0x1
    ldr             w12, [x5, x2, lsl #2]     /* w12 = actbl->ehufco[x2] */
    ldrb            w10, [x4, x2]             /* w10 = actbl->ehufsi[x2] */
    checkbuf31
    put_bits        x12, x10
    put_bits        x3, x11
    cbnz            x9, 1b
6:
    add             x13, sp, #0x10e
    cmp             x15, x13
    b.hs            1f                        /* last coefficient was non-zero */
    ldr             w12, [x5]                 /* w12 = actbl->ehufco[0] */
    ldrb            w14, [x4]                 /* w14 = actbl->ehufsi[0] */
    checkbuf47
    put_bits        x12, x14                  /* emit EOB */
1:
    str             PUT_BUFFER, [x0, #0x10]
    str             PUT_BITSw, [x0, #0x18]
    ldp             x19, x20, [sp], 16
    add             x0, BUFFER, #0x1
    add             sp, sp, 256
    br              x30

.endm

generate_jsimd_huff_encode_one_block 1
generate_jsimd_huff_encode_one_block 0

    .unreq          BUFFER
    .unreq          PUT_BUFFER
    .unreq          PUT_BITS
    .unreq          PUT_BITSw

.purgem emit_byte
.purgem put_bits
.purgem checkbuf31
.purgem checkbuf47