/*
 * ARMv8 NEON optimizations for libjpeg-turbo
 *
 * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
 *                          All Rights Reserved.
 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
 * Copyright (C) 2013-2014, Linaro Limited.  All Rights Reserved.
 * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
 * Copyright (C) 2014-2016, D. R. Commander.  All Rights Reserved.
 * Copyright (C) 2015-2016, Matthieu Darbois.  All Rights Reserved.
 * Copyright (C) 2016, Siarhei Siamashka.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software.  If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
#endif

.text


#define RESPECT_STRICT_ALIGNMENT 1


/*****************************************************************************/

/* Supplementary macro for setting function attributes */
.macro asm_function fname
#ifdef __APPLE__
    .globl _\fname
_\fname:
#else
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm

/* Transpose elements of a single 128-bit register */
.macro transpose_single x0, x1, xi, xilen, literal
    ins  \xi\xilen[0], \x0\xilen[0]
    ins  \x1\xilen[0], \x0\xilen[1]
    trn1 \x0\literal, \x0\literal, \x1\literal
    trn2 \x1\literal, \xi\literal, \x1\literal
.endm

/* Transpose elements of 2 different registers */
.macro transpose x0, x1, xi, xilen, literal
    mov  \xi\xilen, \x0\xilen
    trn1 \x0\literal, \x0\literal, \x1\literal
    trn2 \x1\literal, \xi\literal, \x1\literal
.endm

/* Transpose a block of 4x4 coefficients in four 64-bit registers */
.macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
    mov  \xi\xilen, \x0\xilen
    trn1 \x0\x0len, \x0\x0len, \x2\x2len
    trn2 \x2\x2len, \xi\x0len, \x2\x2len
    mov  \xi\xilen, \x1\xilen
    trn1 \x1\x1len, \x1\x1len, \x3\x3len
    trn2 \x3\x3len, \xi\x1len, \x3\x3len
.endm

.macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
    mov  \xi\xilen, \x0\xilen
    trn1 \x0\x0len, \x0\x0len, \x1\x1len
    trn2 \x1\x2len, \xi\x0len, \x1\x2len
    mov  \xi\xilen, \x2\xilen
    trn1 \x2\x2len, \x2\x2len, \x3\x3len
    trn2 \x3\x2len, \xi\x1len, \x3\x3len
.endm

.macro transpose_4x4 x0, x1, x2, x3, x5
    transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b
    transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b
.endm
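/*
 * An illustrative note (not part of the original source): transpose_8x8
 * below performs a full 8x8 transpose of 16-bit elements in three rounds
 * of trn1/trn2, first on .8h lanes (swapping adjacent 16-bit elements),
 * then on .4s lanes (32-bit pairs), then on .2d lanes (64-bit halves).
 * The net effect is equivalent to this scalar C sketch (int16_t from
 * <stdint.h>):
 *
 *   void transpose8x8(int16_t m[8][8])
 *   {
 *     for (int i = 0; i < 8; i++)
 *       for (int j = 0; j < i; j++) {
 *         int16_t t = m[i][j];   // swap the pair across the diagonal
 *         m[i][j] = m[j][i];
 *         m[j][i] = t;
 *       }
 *   }
 */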
.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
    trn1 \t0\().8h, \l0\().8h, \l1\().8h
    trn1 \t1\().8h, \l2\().8h, \l3\().8h
    trn1 \t2\().8h, \l4\().8h, \l5\().8h
    trn1 \t3\().8h, \l6\().8h, \l7\().8h
    trn2 \l1\().8h, \l0\().8h, \l1\().8h
    trn2 \l3\().8h, \l2\().8h, \l3\().8h
    trn2 \l5\().8h, \l4\().8h, \l5\().8h
    trn2 \l7\().8h, \l6\().8h, \l7\().8h

    trn1 \l4\().4s, \t2\().4s, \t3\().4s
    trn2 \t3\().4s, \t2\().4s, \t3\().4s
    trn1 \t2\().4s, \t0\().4s, \t1\().4s
    trn2 \l2\().4s, \t0\().4s, \t1\().4s
    trn1 \t0\().4s, \l1\().4s, \l3\().4s
    trn2 \l3\().4s, \l1\().4s, \l3\().4s
    trn2 \t1\().4s, \l5\().4s, \l7\().4s
    trn1 \l5\().4s, \l5\().4s, \l7\().4s

    trn2 \l6\().2d, \l2\().2d, \t3\().2d
    trn1 \l0\().2d, \t2\().2d, \l4\().2d
    trn1 \l1\().2d, \t0\().2d, \l5\().2d
    trn2 \l7\().2d, \l3\().2d, \t1\().2d
    trn1 \l2\().2d, \l2\().2d, \t3\().2d
    trn2 \l4\().2d, \t2\().2d, \l4\().2d
    trn1 \l3\().2d, \l3\().2d, \t1\().2d
    trn2 \l5\().2d, \t0\().2d, \l5\().2d
.endm


#define CENTERJSAMPLE 128

/*****************************************************************************/

/*
 * Perform dequantization and inverse DCT on one block of coefficients.
 *
 * GLOBAL(void)
 * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block,
 *                        JSAMPARRAY output_buf, JDIMENSION output_col)
 */

#define CONST_BITS 13
#define PASS1_BITS 2

#define F_0_298  2446  /* FIX(0.298631336) */
#define F_0_390  3196  /* FIX(0.390180644) */
#define F_0_541  4433  /* FIX(0.541196100) */
#define F_0_765  6270  /* FIX(0.765366865) */
#define F_0_899  7373  /* FIX(0.899976223) */
#define F_1_175  9633  /* FIX(1.175875602) */
#define F_1_501 12299  /* FIX(1.501321110) */
#define F_1_847 15137  /* FIX(1.847759065) */
#define F_1_961 16069  /* FIX(1.961570560) */
#define F_2_053 16819  /* FIX(2.053119869) */
#define F_2_562 20995  /* FIX(2.562915447) */
#define F_3_072 25172  /* FIX(3.072711026) */

.balign 16
Ljsimd_idct_islow_neon_consts:
    .short F_0_298
    .short -F_0_390
    .short F_0_541
    .short F_0_765
    .short -F_0_899
    .short F_1_175
    .short F_1_501
    .short -F_1_847
    .short -F_1_961
    .short F_2_053
    .short -F_2_562
    .short F_3_072
    .short 0  /* padding */
    .short 0
    .short 0
    .short 0

#undef F_0_298
#undef F_0_390
#undef F_0_541
#undef F_0_765
#undef F_0_899
#undef F_1_175
#undef F_1_501
#undef F_1_847
#undef F_1_961
#undef F_2_053
#undef F_2_562
#undef F_3_072

#define XFIX_P_0_298 v0.h[0]
#define XFIX_N_0_390 v0.h[1]
#define XFIX_P_0_541 v0.h[2]
#define XFIX_P_0_765 v0.h[3]
#define XFIX_N_0_899 v0.h[4]
#define XFIX_P_1_175 v0.h[5]
#define XFIX_P_1_501 v0.h[6]
#define XFIX_N_1_847 v0.h[7]
#define XFIX_N_1_961 v1.h[0]
#define XFIX_P_2_053 v1.h[1]
#define XFIX_N_2_562 v1.h[2]
#define XFIX_P_3_072 v1.h[3]
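/*
 * A note on the constants above (explanatory, not in the original source):
 * with CONST_BITS = 13, each table entry is libjpeg's
 * FIX(x) = (INT32) (x * (1 << CONST_BITS) + 0.5), i.e. x in Q13 fixed
 * point.  For example:
 *
 *   FIX(0.298631336) = (INT32) (0.298631336 * 8192 + 0.5) = 2446
 *
 * Entries that appear negated in the table (the XFIX_N_* aliases above)
 * allow the subtractions in the reference code to be expressed with
 * smlal/smlal2 accumulation instead of smlsl.
 */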
asm_function jsimd_idct_islow_neon
    DCT_TABLE  .req x0
    COEF_BLOCK .req x1
    OUTPUT_BUF .req x2
    OUTPUT_COL .req x3
    TMP1       .req x0
    TMP2       .req x1
    TMP3       .req x9
    TMP4       .req x10
    TMP5       .req x11
    TMP6       .req x12
    TMP7       .req x13
    TMP8       .req x14

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x3, w3

    sub sp, sp, #64
    adr x15, Ljsimd_idct_islow_neon_consts
    mov x10, sp
    st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
    st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
    ld1 {v0.8h, v1.8h}, [x15]
    ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
    ld1 {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
    ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64
    ld1 {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64

    cmeq v16.8h, v3.8h, #0
    cmeq v26.8h, v4.8h, #0
    cmeq v27.8h, v5.8h, #0
    cmeq v28.8h, v6.8h, #0
    cmeq v29.8h, v7.8h, #0
    cmeq v30.8h, v8.8h, #0
    cmeq v31.8h, v9.8h, #0

    and v10.16b, v16.16b, v26.16b
    and v11.16b, v27.16b, v28.16b
    and v12.16b, v29.16b, v30.16b
    and v13.16b, v31.16b, v10.16b
    and v14.16b, v11.16b, v12.16b
    mul v2.8h, v2.8h, v18.8h
    and v15.16b, v13.16b, v14.16b
    shl v10.8h, v2.8h, #(PASS1_BITS)
    sqxtn v16.8b, v15.8h  /* one byte per column: 0xFF if all its ACs are zero */
    mov TMP1, v16.d[0]
    mvn TMP2, TMP1        /* nonzero bits mark columns with AC data */

    cbnz TMP2, 2f
    /* case where all AC coefficients are zero */
    dup v2.2d, v10.d[0]
    dup v6.2d, v10.d[1]
    mov v3.16b, v2.16b
    mov v7.16b, v6.16b
    mov v4.16b, v2.16b
    mov v8.16b, v6.16b
    mov v5.16b, v2.16b
    mov v9.16b, v6.16b
1:
    /* for this transpose, we should organise the data like this:
     * 00, 01, 02, 03, 40, 41, 42, 43
     * 10, 11, 12, 13, 50, 51, 52, 53
     * 20, 21, 22, 23, 60, 61, 62, 63
     * 30, 31, 32, 33, 70, 71, 72, 73
     * 04, 05, 06, 07, 44, 45, 46, 47
     * 14, 15, 16, 17, 54, 55, 56, 57
     * 24, 25, 26, 27, 64, 65, 66, 67
     * 34, 35, 36, 37, 74, 75, 76, 77
     */
    trn1 v28.8h, v2.8h, v3.8h
    trn1 v29.8h, v4.8h, v5.8h
    trn1 v30.8h, v6.8h, v7.8h
    trn1 v31.8h, v8.8h, v9.8h
    trn2 v16.8h, v2.8h, v3.8h
    trn2 v17.8h, v4.8h, v5.8h
    trn2 v18.8h, v6.8h, v7.8h
    trn2 v19.8h, v8.8h, v9.8h
    trn1 v2.4s, v28.4s, v29.4s
    trn1 v6.4s, v30.4s, v31.4s
    trn1 v3.4s, v16.4s, v17.4s
    trn1 v7.4s, v18.4s, v19.4s
    trn2 v4.4s, v28.4s, v29.4s
    trn2 v8.4s, v30.4s, v31.4s
    trn2 v5.4s, v16.4s, v17.4s
    trn2 v9.4s, v18.4s, v19.4s
    /* Even part: reverse the even part of the forward DCT. */
    add v18.8h, v4.8h, v8.8h  /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    add v22.8h, v2.8h, v6.8h  /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull2 v19.4s, v18.8h, XFIX_P_0_541  /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sub v26.8h, v2.8h, v6.8h  /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    mov v21.16b, v19.16b                 /* tmp3 = z1 */
    mov v20.16b, v18.16b                 /* tmp3 = z1 */
    smlal2 v19.4s, v8.8h, XFIX_N_1_847   /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    smlal v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    smlal2 v21.4s, v4.8h, XFIX_P_0_765   /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    smlal v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    sshll v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    sshll v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    add v2.4s, v22.4s, v20.4s   /* tmp10l tmp10 = tmp0 + tmp3; */
    sub v6.4s, v22.4s, v20.4s   /* tmp13l tmp13 = tmp0 - tmp3; */
    add v8.4s, v26.4s, v18.4s   /* tmp11l tmp11 = tmp1 + tmp2; */
    sub v4.4s, v26.4s, v18.4s   /* tmp12l tmp12 = tmp1 - tmp2; */
    add v28.4s, v23.4s, v21.4s  /* tmp10h tmp10 = tmp0 + tmp3; */
    sub v31.4s, v23.4s, v21.4s  /* tmp13h tmp13 = tmp0 - tmp3; */
    add v29.4s, v27.4s, v19.4s  /* tmp11h tmp11 = tmp1 + tmp2; */
    sub v30.4s, v27.4s, v19.4s  /* tmp12h tmp12 = tmp1 - tmp2; */

    /* Odd part per figure 8; the matrix is unitary and hence its
     * transpose is its inverse.  i0..i3 are y7, y5, y3, y1 respectively.
     */
    add v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */

    smull2 v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull2 v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull2 v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull2 v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull2 v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull2 v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    smull2 v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    smull2 v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    smull2 v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */

    smull v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    smull v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    smull v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    smull v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */

    add v23.4s, v23.4s, v27.4s  /* z3 += z5 */
    add v22.4s, v22.4s, v26.4s  /* z3 += z5 */
    add v25.4s, v25.4s, v27.4s  /* z4 += z5 */
    add v24.4s, v24.4s, v26.4s  /* z4 += z5 */

    add v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
    add v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
    add v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
    add v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
    add v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
    add v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
    add v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
    add v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */

    add v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
    add v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
    add v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
    add v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
    add v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
    add v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
    add v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
    add v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */

    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

    add v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
    add v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
    sub v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
    sub v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
    add v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
    add v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
    sub v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
    sub v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
    add v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
    add v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
    sub v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
    sub v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
    add v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
    add v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
    sub v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
    sub v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
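    /* Descale note (explanatory, not in the original source): the target
     * shift is CONST_BITS+PASS1_BITS+3 = 18 bits, split into a truncating
     * shrn #16 below and a rounding sqrshrn #2 after it.  Since
     * floor((floor(x >> 16) + 2) >> 2) == floor((x + (1 << 17)) >> 18),
     * the two-step form is bit-identical to a single rounding shift by 18.
     */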
    shrn v2.4h, v18.4s, #16   /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
    shrn v9.4h, v20.4s, #16   /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
    shrn v3.4h, v22.4s, #16   /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
    shrn v8.4h, v24.4s, #16   /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
    shrn v4.4h, v26.4s, #16   /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
    shrn v7.4h, v28.4s, #16   /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
    shrn v5.4h, v14.4s, #16   /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
    shrn v6.4h, v16.4s, #16   /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
    shrn2 v2.8h, v19.4s, #16  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
    shrn2 v9.8h, v21.4s, #16  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
    shrn2 v3.8h, v23.4s, #16  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
    shrn2 v8.8h, v25.4s, #16  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
    shrn2 v4.8h, v27.4s, #16  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
    shrn2 v7.8h, v29.4s, #16  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
    shrn2 v5.8h, v15.4s, #16  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
    shrn2 v6.8h, v17.4s, #16  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
    movi v0.16b, #(CENTERJSAMPLE)
    /* Prepare pointers (dual-issue with NEON instructions) */
    ldp TMP1, TMP2, [OUTPUT_BUF], 16
    sqrshrn v28.8b, v2.8h, #(CONST_BITS+PASS1_BITS+3-16)
    ldp TMP3, TMP4, [OUTPUT_BUF], 16
    sqrshrn v29.8b, v3.8h, #(CONST_BITS+PASS1_BITS+3-16)
    add TMP1, TMP1, OUTPUT_COL
    sqrshrn v30.8b, v4.8h, #(CONST_BITS+PASS1_BITS+3-16)
    add TMP2, TMP2, OUTPUT_COL
    sqrshrn v31.8b, v5.8h, #(CONST_BITS+PASS1_BITS+3-16)
    add TMP3, TMP3, OUTPUT_COL
    sqrshrn2 v28.16b, v6.8h, #(CONST_BITS+PASS1_BITS+3-16)
    add TMP4, TMP4, OUTPUT_COL
    sqrshrn2 v29.16b, v7.8h, #(CONST_BITS+PASS1_BITS+3-16)
    ldp TMP5, TMP6, [OUTPUT_BUF], 16
    sqrshrn2 v30.16b, v8.8h, #(CONST_BITS+PASS1_BITS+3-16)
    ldp TMP7, TMP8, [OUTPUT_BUF], 16
    sqrshrn2 v31.16b, v9.8h, #(CONST_BITS+PASS1_BITS+3-16)
    add TMP5, TMP5, OUTPUT_COL
    add v16.16b, v28.16b, v0.16b
    add TMP6, TMP6, OUTPUT_COL
    add v18.16b, v29.16b, v0.16b
    add TMP7, TMP7, OUTPUT_COL
    add v20.16b, v30.16b, v0.16b
    add TMP8, TMP8, OUTPUT_COL
    add v22.16b, v31.16b, v0.16b
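    /* Note (explanatory, not in the original source): sqrshrn saturated the
     * samples to the signed range -128..127, so the byte-wise addition of
     * CENTERJSAMPLE (128) above re-centers them to unsigned 0..255,
     * combining the descale and range-limit steps of the C code. */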
    /* Transpose the final 8-bit samples */
    trn1 v28.16b, v16.16b, v18.16b
    trn1 v30.16b, v20.16b, v22.16b
    trn2 v29.16b, v16.16b, v18.16b
    trn2 v31.16b, v20.16b, v22.16b

    trn1 v16.8h, v28.8h, v30.8h
    trn2 v18.8h, v28.8h, v30.8h
    trn1 v20.8h, v29.8h, v31.8h
    trn2 v22.8h, v29.8h, v31.8h

    uzp1 v28.4s, v16.4s, v18.4s
    uzp2 v30.4s, v16.4s, v18.4s
    uzp1 v29.4s, v20.4s, v22.4s
    uzp2 v31.4s, v20.4s, v22.4s

    /* Store results to the output buffer */
    st1 {v28.d}[0], [TMP1]
    st1 {v29.d}[0], [TMP2]
    st1 {v28.d}[1], [TMP3]
    st1 {v29.d}[1], [TMP4]
    st1 {v30.d}[0], [TMP5]
    st1 {v31.d}[0], [TMP6]
    st1 {v30.d}[1], [TMP7]
    st1 {v31.d}[1], [TMP8]
    ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
    ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
    blr x30

.balign 16
2:
    mul v3.8h, v3.8h, v19.8h
    mul v4.8h, v4.8h, v20.8h
    mul v5.8h, v5.8h, v21.8h
    add TMP4, xzr, TMP2, LSL #32   /* TMP4 != 0 if the left columns (0-3) have nonzero ACs */
    mul v6.8h, v6.8h, v22.8h
    mul v7.8h, v7.8h, v23.8h
    adds TMP3, xzr, TMP2, LSR #32  /* TMP3 != 0 if the right columns (4-7) have nonzero ACs */
    mul v8.8h, v8.8h, v24.8h
    mul v9.8h, v9.8h, v25.8h
    b.ne 3f
    /* Right AC coefficients are zero */
    dup v15.2d, v10.d[1]
    /* Even part: reverse the even part of the forward DCT. */
    add v18.4h, v4.4h, v8.4h  /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    add v22.4h, v2.4h, v6.4h  /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    sub v26.4h, v2.4h, v6.4h  /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sshll v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    mov v20.16b, v18.16b                 /* tmp3 = z1 */
    sshll v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    smlal v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    smlal v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    add v2.4s, v22.4s, v20.4s  /* tmp10l tmp10 = tmp0 + tmp3; */
    sub v6.4s, v22.4s, v20.4s  /* tmp13l tmp13 = tmp0 - tmp3; */
    add v8.4s, v26.4s, v18.4s  /* tmp11l tmp11 = tmp1 + tmp2; */
    sub v4.4s, v26.4s, v18.4s  /* tmp12l tmp12 = tmp1 - tmp2; */

    /* Odd part per figure 8; the matrix is unitary and hence its
     * transpose is its inverse.  i0..i3 are y7, y5, y3, y1 respectively.
     */
    add v22.4h, v9.4h, v5.4h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add v24.4h, v7.4h, v3.4h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add v18.4h, v9.4h, v3.4h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add v20.4h, v7.4h, v5.4h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add v26.4h, v22.4h, v24.4h  /* z5 = z3 + z4 */

    smull v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    smull v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    smull v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    smull v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */

    add v22.4s, v22.4s, v26.4s  /* z3 += z5 */
    add v24.4s, v24.4s, v26.4s  /* z4 += z5 */

    add v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
    add v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
    add v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
    add v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */

    add v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
    add v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
    add v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
    add v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */

    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

    add v18.4s, v2.4s, v16.4s  /* tmp10 + tmp3 */
    sub v20.4s, v2.4s, v16.4s  /* tmp10 - tmp3 */
    add v22.4s, v8.4s, v14.4s  /* tmp11 + tmp2 */
    sub v24.4s, v8.4s, v14.4s  /* tmp11 - tmp2 */
    add v26.4s, v4.4s, v12.4s  /* tmp12 + tmp1 */
    sub v28.4s, v4.4s, v12.4s  /* tmp12 - tmp1 */
    add v14.4s, v6.4s, v10.4s  /* tmp13 + tmp0 */
    sub v16.4s, v6.4s, v10.4s  /* tmp13 - tmp0 */

    rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS)   /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
    rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS)   /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
    rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS)   /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
    rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS)   /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
    rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
    rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
    mov v6.16b, v15.16b
    mov v7.16b, v15.16b
    mov v8.16b, v15.16b
    mov v9.16b, v15.16b
    b 1b

.balign 16
3:
    cbnz TMP4, 4f
    /* Left AC coefficients are zero */
    dup v14.2d, v10.d[0]
    /* Even part: reverse the even part of the forward DCT. */
    add v18.8h, v4.8h, v8.8h  /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    add v22.8h, v2.8h, v6.8h  /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull2 v19.4s, v18.8h, XFIX_P_0_541  /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sub v26.8h, v2.8h, v6.8h  /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    mov v21.16b, v19.16b                 /* tmp3 = z1 */
    smlal2 v19.4s, v8.8h, XFIX_N_1_847   /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    smlal2 v21.4s, v4.8h, XFIX_P_0_765   /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    add v28.4s, v23.4s, v21.4s  /* tmp10h tmp10 = tmp0 + tmp3; */
    sub v31.4s, v23.4s, v21.4s  /* tmp13h tmp13 = tmp0 - tmp3; */
    add v29.4s, v27.4s, v19.4s  /* tmp11h tmp11 = tmp1 + tmp2; */
    sub v30.4s, v27.4s, v19.4s  /* tmp12h tmp12 = tmp1 - tmp2; */

    /* Odd part per figure 8; the matrix is unitary and hence its
     * transpose is its inverse.  i0..i3 are y7, y5, y3, y1 respectively.
     */

    add v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */

    smull2 v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull2 v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull2 v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull2 v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull2 v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull2 v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    smull2 v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    smull2 v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    smull2 v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */

    add v23.4s, v23.4s, v27.4s  /* z3 += z5 */
    add v22.4s, v22.4s, v26.4s  /* z3 += z5 */
    add v25.4s, v25.4s, v27.4s  /* z4 += z5 */
    add v24.4s, v24.4s, v26.4s  /* z4 += z5 */

    add v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
    add v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
    add v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
    add v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */

    add v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
    add v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
    add v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
    add v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */

    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
    add v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
    sub v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
    add v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
    sub v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
    add v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
    sub v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
    add v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
    sub v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */

    mov v2.16b, v14.16b
    mov v3.16b, v14.16b
    mov v4.16b, v14.16b
    mov v5.16b, v14.16b
    rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS)   /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
    rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS)   /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
    rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS)   /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
    rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS)   /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
    rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
    rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
    b 1b

.balign 16
4:
    /* "No" AC coefficients are zero */
    /* Even part: reverse the even part of the forward DCT. */
    add v18.8h, v4.8h, v8.8h  /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    add v22.8h, v2.8h, v6.8h  /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull2 v19.4s, v18.8h, XFIX_P_0_541  /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sub v26.8h, v2.8h, v6.8h  /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    mov v21.16b, v19.16b                 /* tmp3 = z1 */
    mov v20.16b, v18.16b                 /* tmp3 = z1 */
    smlal2 v19.4s, v8.8h, XFIX_N_1_847   /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    smlal v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    smlal2 v21.4s, v4.8h, XFIX_P_0_765   /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    smlal v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    sshll v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    sshll v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    add v2.4s, v22.4s, v20.4s   /* tmp10l tmp10 = tmp0 + tmp3; */
    sub v6.4s, v22.4s, v20.4s   /* tmp13l tmp13 = tmp0 - tmp3; */
    add v8.4s, v26.4s, v18.4s   /* tmp11l tmp11 = tmp1 + tmp2; */
    sub v4.4s, v26.4s, v18.4s   /* tmp12l tmp12 = tmp1 - tmp2; */
    add v28.4s, v23.4s, v21.4s  /* tmp10h tmp10 = tmp0 + tmp3; */
    sub v31.4s, v23.4s, v21.4s  /* tmp13h tmp13 = tmp0 - tmp3; */
    add v29.4s, v27.4s, v19.4s  /* tmp11h tmp11 = tmp1 + tmp2; */
    sub v30.4s, v27.4s, v19.4s  /* tmp12h tmp12 = tmp1 - tmp2; */

    /* Odd part per figure 8; the matrix is unitary and hence its
     * transpose is its inverse.  i0..i3 are y7, y5, y3, y1 respectively.
     */

    add v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */

    smull2 v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull2 v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull2 v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull2 v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull2 v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull2 v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    smull2 v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    smull2 v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    smull2 v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */

    smull v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    smull v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    smull v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    smull v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */

    add v23.4s, v23.4s, v27.4s  /* z3 += z5 */
    add v22.4s, v22.4s, v26.4s  /* z3 += z5 */
    add v25.4s, v25.4s, v27.4s  /* z4 += z5 */
    add v24.4s, v24.4s, v26.4s  /* z4 += z5 */

    add v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
    add v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
    add v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
    add v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
    add v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
    add v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
    add v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
    add v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */

    add v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
    add v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
    add v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
    add v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
    add v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
    add v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
    add v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
    add v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */

    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

    add v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
    add v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
    sub v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
    sub v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
    add v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
    add v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
    sub v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
    sub v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
    add v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
    add v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
    sub v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
    sub v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
    add v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
    add v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
    sub v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
    sub v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */

    rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS)   /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
    rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS)   /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
    rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS)   /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
    rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS)   /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
    rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS)   /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
    rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS)   /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
    rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS)   /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
    rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS)   /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
    rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
    rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
    rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
    rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
    rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
    b 1b

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4
    .unreq TMP5
    .unreq TMP6
    .unreq TMP7
    .unreq TMP8

#undef CENTERJSAMPLE
#undef CONST_BITS
#undef PASS1_BITS
#undef XFIX_P_0_298
#undef XFIX_N_0_390
#undef XFIX_P_0_541
#undef XFIX_P_0_765
#undef XFIX_N_0_899
#undef XFIX_P_1_175
#undef XFIX_P_1_501
#undef XFIX_N_1_847
#undef XFIX_N_1_961
#undef XFIX_P_2_053
#undef XFIX_N_2_562
#undef XFIX_P_3_072


/*****************************************************************************/

/*
 * jsimd_idct_ifast_neon
 *
 * This function contains a fast, less accurate integer implementation of
 * the inverse DCT (Discrete Cosine Transform).  It uses the same
 * calculations and produces exactly the same output as IJG's original
 * 'jpeg_idct_ifast' function from jidctfst.c.
 *
 * Normally, the 1-D AAN DCT needs 5 multiplications and 29 additions.
 * In the ARM NEON case, however, some extra additions are required because
 * the VQDMULH instruction can't handle constants larger than 1.
 * Expressions like "x * 1.082392200" therefore have to be converted to
 * "x * 0.082392200 + x", which introduces an extra addition.  Overall,
 * there are 6 extra additions per 1-D IDCT pass, for a total of 5 VQDMULH
 * and 35 VADD/VSUB instructions.
 */
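/*
 * An arithmetic sketch of the constant encoding (explanatory, not in the
 * original source): SQDMULH (the A64 mnemonic for VQDMULH) computes
 * (2 * a * b) >> 16, i.e. a multiply by b / 2**15.  The table below
 * therefore stores (c - 1), or (c - 2) for 2.613125930, in Q15, quantized
 * to the same 8-bit precision as the jidctfst.c constants (277/256 for
 * 1.082392200, etc.):
 *
 *   x * 1.082392200 ~= x + sqdmulh(x, (277 - 256) * 128)
 *   x * 2.613125930 ~= 2 * x + sqdmulh(x, (669 - 512) * 128)
 */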
#define XFIX_1_082392200 v0.h[0]
#define XFIX_1_414213562 v0.h[1]
#define XFIX_1_847759065 v0.h[2]
#define XFIX_2_613125930 v0.h[3]

.balign 16
Ljsimd_idct_ifast_neon_consts:
    .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
    .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
    .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
    .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */

asm_function jsimd_idct_ifast_neon

    DCT_TABLE  .req x0
    COEF_BLOCK .req x1
    OUTPUT_BUF .req x2
    OUTPUT_COL .req x3
    TMP1       .req x0
    TMP2       .req x1
    TMP3       .req x9
    TMP4       .req x10
    TMP5       .req x11
    TMP6       .req x12
    TMP7       .req x13
    TMP8       .req x14

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x3, w3

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d16    | d17    ( v16.8h )
     *   1  | d18    | d19    ( v17.8h )
     *   2  | d20    | d21    ( v18.8h )
     *   3  | d22    | d23    ( v19.8h )
     *   4  | d24    | d25    ( v20.8h )
     *   5  | d26    | d27    ( v21.8h )
     *   6  | d28    | d29    ( v22.8h )
     *   7  | d30    | d31    ( v23.8h )
     */
    /* Save NEON registers used in fast IDCT */
    adr TMP5, Ljsimd_idct_ifast_neon_consts
    ld1 {v16.8h, v17.8h}, [COEF_BLOCK], 32
    ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
    ld1 {v18.8h, v19.8h}, [COEF_BLOCK], 32
    mul v16.8h, v16.8h, v0.8h
    ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
    mul v17.8h, v17.8h, v1.8h
    ld1 {v20.8h, v21.8h}, [COEF_BLOCK], 32
    mul v18.8h, v18.8h, v2.8h
    ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
    mul v19.8h, v19.8h, v3.8h
    ld1 {v22.8h, v23.8h}, [COEF_BLOCK], 32
    mul v20.8h, v20.8h, v0.8h
    ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
    mul v22.8h, v22.8h, v2.8h
    mul v21.8h, v21.8h, v1.8h
    ld1 {v0.4h}, [TMP5]  /* load constants */
    mul v23.8h, v23.8h, v3.8h

    /* 1-D IDCT, pass 1 */
    sub v2.8h, v18.8h, v22.8h
    add v22.8h, v18.8h, v22.8h
    sub v1.8h, v19.8h, v21.8h
    add v21.8h, v19.8h, v21.8h
    sub v5.8h, v17.8h, v23.8h
    add v23.8h, v17.8h, v23.8h
    sqdmulh v4.8h, v2.8h, XFIX_1_414213562
    sqdmulh v6.8h, v1.8h, XFIX_2_613125930
    add v3.8h, v1.8h, v1.8h
    sub v1.8h, v5.8h, v1.8h
    add v18.8h, v2.8h, v4.8h
    sqdmulh v4.8h, v1.8h, XFIX_1_847759065
    sub v2.8h, v23.8h, v21.8h
    add v3.8h, v3.8h, v6.8h
    sqdmulh v6.8h, v2.8h, XFIX_1_414213562
    add v1.8h, v1.8h, v4.8h
    sqdmulh v4.8h, v5.8h, XFIX_1_082392200
    sub v18.8h, v18.8h, v22.8h
    add v2.8h, v2.8h, v6.8h
    sub v6.8h, v16.8h, v20.8h
    add v20.8h, v16.8h, v20.8h
    add v17.8h, v5.8h, v4.8h
    add v5.8h, v6.8h, v18.8h
    sub v18.8h, v6.8h, v18.8h
    add v6.8h, v23.8h, v21.8h
    add v16.8h, v20.8h, v22.8h
    sub v3.8h, v6.8h, v3.8h
    sub v20.8h, v20.8h, v22.8h
    sub v3.8h, v3.8h, v1.8h
    sub v1.8h, v17.8h, v1.8h
    add v2.8h, v3.8h, v2.8h
    sub v23.8h, v16.8h, v6.8h
    add v1.8h, v1.8h, v2.8h
    add v16.8h, v16.8h, v6.8h
    add v22.8h, v5.8h, v3.8h
    sub v17.8h, v5.8h, v3.8h
    sub v21.8h, v18.8h, v2.8h
    add v18.8h, v18.8h, v2.8h
    sub v19.8h, v20.8h, v1.8h
    add v20.8h, v20.8h, v1.8h
    transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30, v31
    /* 1-D IDCT, pass 2 */
    sub v2.8h, v18.8h, v22.8h
    add v22.8h, v18.8h, v22.8h
    sub v1.8h, v19.8h, v21.8h
    add v21.8h, v19.8h, v21.8h
    sub v5.8h, v17.8h, v23.8h
    add v23.8h, v17.8h, v23.8h
    sqdmulh v4.8h, v2.8h, XFIX_1_414213562
    sqdmulh v6.8h, v1.8h, XFIX_2_613125930
    add v3.8h, v1.8h, v1.8h
    sub v1.8h, v5.8h, v1.8h
    add v18.8h, v2.8h, v4.8h
    sqdmulh v4.8h, v1.8h, XFIX_1_847759065
    sub v2.8h, v23.8h, v21.8h
    add v3.8h, v3.8h, v6.8h
    sqdmulh v6.8h, v2.8h, XFIX_1_414213562
    add v1.8h, v1.8h, v4.8h
    sqdmulh v4.8h, v5.8h, XFIX_1_082392200
    sub v18.8h, v18.8h, v22.8h
    add v2.8h, v2.8h, v6.8h
    sub v6.8h, v16.8h, v20.8h
    add v20.8h, v16.8h, v20.8h
    add v17.8h, v5.8h, v4.8h
    add v5.8h, v6.8h, v18.8h
    sub v18.8h, v6.8h, v18.8h
    add v6.8h, v23.8h, v21.8h
    add v16.8h, v20.8h, v22.8h
    sub v3.8h, v6.8h, v3.8h
    sub v20.8h, v20.8h, v22.8h
    sub v3.8h, v3.8h, v1.8h
    sub v1.8h, v17.8h, v1.8h
    add v2.8h, v3.8h, v2.8h
    sub v23.8h, v16.8h, v6.8h
    add v1.8h, v1.8h, v2.8h
    add v16.8h, v16.8h, v6.8h
    add v22.8h, v5.8h, v3.8h
    sub v17.8h, v5.8h, v3.8h
    sub v21.8h, v18.8h, v2.8h
    add v18.8h, v18.8h, v2.8h
    sub v19.8h, v20.8h, v1.8h
    add v20.8h, v20.8h, v1.8h
    /* Descale to 8-bit and range limit */
    movi v0.16b, #0x80
    /* Prepare pointers (dual-issue with NEON instructions) */
    ldp TMP1, TMP2, [OUTPUT_BUF], 16
    sqshrn v28.8b, v16.8h, #5
    ldp TMP3, TMP4, [OUTPUT_BUF], 16
    sqshrn v29.8b, v17.8h, #5
    add TMP1, TMP1, OUTPUT_COL
    sqshrn v30.8b, v18.8h, #5
    add TMP2, TMP2, OUTPUT_COL
    sqshrn v31.8b, v19.8h, #5
    add TMP3, TMP3, OUTPUT_COL
    sqshrn2 v28.16b, v20.8h, #5
    add TMP4, TMP4, OUTPUT_COL
    sqshrn2 v29.16b, v21.8h, #5
    ldp TMP5, TMP6, [OUTPUT_BUF], 16
    sqshrn2 v30.16b, v22.8h, #5
    ldp TMP7, TMP8, [OUTPUT_BUF], 16
    sqshrn2 v31.16b, v23.8h, #5
    add TMP5, TMP5, OUTPUT_COL
    add v16.16b, v28.16b, v0.16b
    add TMP6, TMP6, OUTPUT_COL
    add v18.16b, v29.16b, v0.16b
    add TMP7, TMP7, OUTPUT_COL
    add v20.16b, v30.16b, v0.16b
    add TMP8, TMP8, OUTPUT_COL
    add v22.16b, v31.16b, v0.16b

    /* Transpose the final 8-bit samples */
    trn1 v28.16b, v16.16b, v18.16b
    trn1 v30.16b, v20.16b, v22.16b
    trn2 v29.16b, v16.16b, v18.16b
    trn2 v31.16b, v20.16b, v22.16b

    trn1 v16.8h, v28.8h, v30.8h
    trn2 v18.8h, v28.8h, v30.8h
    trn1 v20.8h, v29.8h, v31.8h
    trn2 v22.8h, v29.8h, v31.8h

    uzp1 v28.4s, v16.4s, v18.4s
    uzp2 v30.4s, v16.4s, v18.4s
    uzp1 v29.4s, v20.4s, v22.4s
    uzp2 v31.4s, v20.4s, v22.4s

    /* Store results to the output buffer */
    st1 {v28.d}[0], [TMP1]
    st1 {v29.d}[0], [TMP2]
    st1 {v28.d}[1], [TMP3]
    st1 {v29.d}[1], [TMP4]
    st1 {v30.d}[0], [TMP5]
    st1 {v31.d}[0], [TMP6]
    st1 {v30.d}[1], [TMP7]
    st1 {v31.d}[1], [TMP8]
    blr x30

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4
    .unreq TMP5
    .unreq TMP6
    .unreq TMP7
    .unreq TMP8


/*****************************************************************************/

/*
 * jsimd_idct_4x4_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 4x4 pixel output from an 8x8 DCT block.  It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse DCT,
 * which requires many fewer arithmetic operations and hence should be
 * faster.  The primary purpose of this particular NEON optimized function
 * is bit-exact compatibility with jpeg-6b.
 *
 * TODO: slightly better instruction scheduling could be achieved by
 * expanding the idct_helper/transpose_4x4 macros and reordering
 * instructions, but readability would suffer somewhat.
 */

#define CONST_BITS 13

#define FIX_0_211164243 (1730)   /* FIX(0.211164243) */
#define FIX_0_509795579 (4176)   /* FIX(0.509795579) */
#define FIX_0_601344887 (4926)   /* FIX(0.601344887) */
#define FIX_0_720959822 (5906)   /* FIX(0.720959822) */
#define FIX_0_765366865 (6270)   /* FIX(0.765366865) */
#define FIX_0_850430095 (6967)   /* FIX(0.850430095) */
#define FIX_0_899976223 (7373)   /* FIX(0.899976223) */
#define FIX_1_061594337 (8697)   /* FIX(1.061594337) */
#define FIX_1_272758580 (10426)  /* FIX(1.272758580) */
#define FIX_1_451774981 (11893)  /* FIX(1.451774981) */
#define FIX_1_847759065 (15137)  /* FIX(1.847759065) */
#define FIX_2_172734803 (17799)  /* FIX(2.172734803) */
#define FIX_2_562915447 (20995)  /* FIX(2.562915447) */
#define FIX_3_624509785 (29692)  /* FIX(3.624509785) */

.balign 16
Ljsimd_idct_4x4_neon_consts:
    .short FIX_1_847759065      /* v0.h[0] */
    .short -FIX_0_765366865     /* v0.h[1] */
    .short -FIX_0_211164243     /* v0.h[2] */
    .short FIX_1_451774981      /* v0.h[3] */
    .short -FIX_2_172734803     /* v1.h[0] */
    .short FIX_1_061594337      /* v1.h[1] */
    .short -FIX_0_509795579     /* v1.h[2] */
    .short -FIX_0_601344887     /* v1.h[3] */
    .short FIX_0_899976223      /* v2.h[0] */
    .short FIX_2_562915447      /* v2.h[1] */
    .short 1 << (CONST_BITS+1)  /* v2.h[2] */
    .short 0                    /* v2.h[3] */

.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    smull v28.4s, \x4, v2.h[2]
    smlal v28.4s, \x8, v0.h[0]
    smlal v28.4s, \x14, v0.h[1]

    smull v26.4s, \x16, v1.h[2]
    smlal v26.4s, \x12, v1.h[3]
    smlal v26.4s, \x10, v2.h[0]
    smlal v26.4s, \x6, v2.h[1]

    smull v30.4s, \x4, v2.h[2]
    smlsl v30.4s, \x8, v0.h[0]
    smlsl v30.4s, \x14, v0.h[1]

    smull v24.4s, \x16, v0.h[2]
    smlal v24.4s, \x12, v0.h[3]
    smlal v24.4s, \x10, v1.h[0]
    smlal v24.4s, \x6, v1.h[1]

    add v20.4s, v28.4s, v26.4s
    sub v28.4s, v28.4s, v26.4s

  .if \shift > 16
    srshr v20.4s, v20.4s, #\shift
    srshr v28.4s, v28.4s, #\shift
    xtn \y26, v20.4s
    xtn \y29, v28.4s
  .else
    rshrn \y26, v20.4s, #\shift
    rshrn \y29, v28.4s, #\shift
  .endif

    add v20.4s, v30.4s, v24.4s
    sub v30.4s, v30.4s, v24.4s

  .if \shift > 16
    srshr v20.4s, v20.4s, #\shift
    srshr v30.4s, v30.4s, #\shift
    xtn \y27, v20.4s
    xtn \y28, v30.4s
  .else
    rshrn \y27, v20.4s, #\shift
    rshrn \y28, v30.4s, #\shift
  .endif
.endm
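/*
 * Note on the \shift > 16 cases in idct_helper above (explanatory, not in
 * the original source): rshrn's immediate shift amount is limited to the
 * destination element size (16 for a .4h result), so the \shift == 19
 * descale used in pass 2 is performed as a 32-bit srshr followed by a
 * plain xtn narrow instead.
 */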
asm_function jsimd_idct_4x4_neon

    DCT_TABLE  .req x0
    COEF_BLOCK .req x1
    OUTPUT_BUF .req x2
    OUTPUT_COL .req x3
    TMP1       .req x0
    TMP2       .req x1
    TMP3       .req x2
    TMP4       .req x15

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x3, w3

    /* Save all used NEON registers */
    sub sp, sp, 64
    mov x9, sp
    /* Load constants (v3.4h is just used for padding) */
    adr TMP4, Ljsimd_idct_4x4_neon_consts
    st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
    ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | v4.4h  | v5.4h
     *   1  | v6.4h  | v7.4h
     *   2  | v8.4h  | v9.4h
     *   3  | v10.4h | v11.4h
     *   4  | -      | -
     *   5  | v12.4h | v13.4h
     *   6  | v14.4h | v15.4h
     *   7  | v16.4h | v17.4h
     */
    ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
    ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
    add COEF_BLOCK, COEF_BLOCK, #16
    ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
    ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
    /* dequantize */
    ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
    mul v4.4h, v4.4h, v18.4h
    mul v5.4h, v5.4h, v19.4h
    ins v4.d[1], v5.d[0]  /* 128-bit q4 */
    ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
    mul v6.4h, v6.4h, v20.4h
    mul v7.4h, v7.4h, v21.4h
    ins v6.d[1], v7.d[0]  /* 128-bit q6 */
    mul v8.4h, v8.4h, v22.4h
    mul v9.4h, v9.4h, v23.4h
    ins v8.d[1], v9.d[0]  /* 128-bit q8 */
    add DCT_TABLE, DCT_TABLE, #16
    ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
    mul v10.4h, v10.4h, v24.4h
    mul v11.4h, v11.4h, v25.4h
    ins v10.d[1], v11.d[0]  /* 128-bit q10 */
    mul v12.4h, v12.4h, v26.4h
    mul v13.4h, v13.4h, v27.4h
    ins v12.d[1], v13.d[0]  /* 128-bit q12 */
    ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
    mul v14.4h, v14.4h, v28.4h
    mul v15.4h, v15.4h, v29.4h
    ins v14.d[1], v15.d[0]  /* 128-bit q14 */
    mul v16.4h, v16.4h, v30.4h
    mul v17.4h, v17.4h, v31.4h
    ins v16.d[1], v17.d[0]  /* 128-bit q16 */

    /* Pass 1 */
    idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \
                v4.4h, v6.4h, v8.4h, v10.4h
    transpose_4x4 v4, v6, v8, v10, v3
    ins v10.d[1], v11.d[0]
    idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \
                v5.4h, v7.4h, v9.4h, v11.4h
    transpose_4x4 v5, v7, v9, v11, v3
    ins v10.d[1], v11.d[0]

    /* Pass 2 */
    idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \
                v26.4h, v27.4h, v28.4h, v29.4h
    transpose_4x4 v26, v27, v28, v29, v3

    /* Range limit */
    movi v30.8h, #0x80
    ins v26.d[1], v27.d[0]
    ins v28.d[1], v29.d[0]
    add v26.8h, v26.8h, v30.8h
    add v28.8h, v28.8h, v30.8h
    sqxtun v26.8b, v26.8h
    sqxtun v27.8b, v28.8h

    /* Store results to the output buffer */
    ldp TMP1, TMP2, [OUTPUT_BUF], 16
    ldp TMP3, TMP4, [OUTPUT_BUF]
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    add TMP3, TMP3, OUTPUT_COL
    add TMP4, TMP4, OUTPUT_COL

#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use many fewer instructions on little endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses
     */
    st1 {v26.s}[0], [TMP1], 4
    st1 {v27.s}[0], [TMP3], 4
    st1 {v26.s}[1], [TMP2], 4
    st1 {v27.s}[1], [TMP4], 4
#else
    st1 {v26.b}[0], [TMP1], 1
    st1 {v27.b}[0], [TMP3], 1
    st1 {v26.b}[1], [TMP1], 1
    st1 {v27.b}[1], [TMP3], 1
    st1 {v26.b}[2], [TMP1], 1
    st1 {v27.b}[2], [TMP3], 1
    st1 {v26.b}[3], [TMP1], 1
    st1 {v27.b}[3], [TMP3], 1

    st1 {v26.b}[4], [TMP2], 1
    st1 {v27.b}[4], [TMP4], 1
    st1 {v26.b}[5], [TMP2], 1
    st1 {v27.b}[5], [TMP4], 1
    st1 {v26.b}[6], [TMP2], 1
    st1 {v27.b}[6], [TMP4], 1
    st1 {v26.b}[7], [TMP2], 1
    st1 {v27.b}[7], [TMP4], 1
#endif

    /* vpop {v8.4h - v15.4h}  ;not available */
    ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    blr x30

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4

.purgem idct_helper


/*****************************************************************************/

/*
 * jsimd_idct_2x2_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 2x2 pixel output from an 8x8 DCT block.  It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse DCT,
 * which requires many fewer arithmetic operations and hence should be
 * faster.  The primary purpose of this particular NEON optimized function
 * is bit-exact compatibility with jpeg-6b.
 */

.balign 8
Ljsimd_idct_2x2_neon_consts:
    .short -FIX_0_720959822  /* v14.h[0] */
    .short FIX_0_850430095   /* v14.h[1] */
    .short -FIX_1_272758580  /* v14.h[2] */
    .short FIX_3_624509785   /* v14.h[3] */

.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
    sshll v15.4s, \x4, #15
    smull v26.4s, \x6, v14.h[3]
    smlal v26.4s, \x10, v14.h[2]
    smlal v26.4s, \x12, v14.h[1]
    smlal v26.4s, \x16, v14.h[0]

    add v20.4s, v15.4s, v26.4s
    sub v15.4s, v15.4s, v26.4s

  .if \shift > 16
    srshr v20.4s, v20.4s, #\shift
    srshr v15.4s, v15.4s, #\shift
    xtn \y26, v20.4s
    xtn \y27, v15.4s
  .else
    rshrn \y26, v20.4s, #\shift
    rshrn \y27, v15.4s, #\shift
  .endif
.endm
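/*
 * Note on the 2x2 idct_helper above (explanatory, not in the original
 * source): the sshll #15 corresponds to tmp10 = z1 << (CONST_BITS + 2)
 * in jpeg_idct_2x2 (jidctred.c) with CONST_BITS = 13, and the four
 * smull/smlal terms accumulate that function's single odd-part dot
 * product over coefficients 1, 3, 5, and 7.
 */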
asm_function jsimd_idct_2x2_neon

    DCT_TABLE  .req x0
    COEF_BLOCK .req x1
    OUTPUT_BUF .req x2
    OUTPUT_COL .req x3
    TMP1       .req x0
    TMP2       .req x15

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x3, w3

    /* vpush {v8.4h - v15.4h}  ;not available */
    sub sp, sp, 64
    mov x9, sp

    /* Load constants */
    adr TMP2, Ljsimd_idct_2x2_neon_consts
    st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
    ld1 {v14.4h}, [TMP2]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | v4.4h  | v5.4h
     *   1  | v6.4h  | v7.4h
     *   2  | -      | -
     *   3  | v10.4h | v11.4h
     *   4  | -      | -
     *   5  | v12.4h | v13.4h
     *   6  | -      | -
     *   7  | v16.4h | v17.4h
     */
    ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
    add COEF_BLOCK, COEF_BLOCK, #16
    ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16
    add COEF_BLOCK, COEF_BLOCK, #16
    ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16
    add COEF_BLOCK, COEF_BLOCK, #16
    ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
    /* Dequantize */
    ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
    mul v4.4h, v4.4h, v18.4h
    mul v5.4h, v5.4h, v19.4h
    ins v4.d[1], v5.d[0]
    mul v6.4h, v6.4h, v20.4h
    mul v7.4h, v7.4h, v21.4h
    ins v6.d[1], v7.d[0]
    add DCT_TABLE, DCT_TABLE, #16
    ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16
    mul v10.4h, v10.4h, v24.4h
    mul v11.4h, v11.4h, v25.4h
    ins v10.d[1], v11.d[0]
    add DCT_TABLE, DCT_TABLE, #16
    ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16
    mul v12.4h, v12.4h, v26.4h
    mul v13.4h, v13.4h, v27.4h
    ins v12.d[1], v13.d[0]
    add DCT_TABLE, DCT_TABLE, #16
    ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
    mul v16.4h, v16.4h, v30.4h
    mul v17.4h, v17.4h, v31.4h
    ins v16.d[1], v17.d[0]

    /* Pass 1 */
#if 0
    idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
    transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h
    idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
    transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h
#else
    smull v26.4s, v6.4h, v14.h[3]
    smlal v26.4s, v10.4h, v14.h[2]
    smlal v26.4s, v12.4h, v14.h[1]
    smlal v26.4s, v16.4h, v14.h[0]
    smull v24.4s, v7.4h, v14.h[3]
    smlal v24.4s, v11.4h, v14.h[2]
    smlal v24.4s, v13.4h, v14.h[1]
    smlal v24.4s, v17.4h, v14.h[0]
    sshll v15.4s, v4.4h, #15
    sshll v30.4s, v5.4h, #15
    add v20.4s, v15.4s, v26.4s
    sub v15.4s, v15.4s, v26.4s
    rshrn v4.4h, v20.4s, #13
    rshrn v6.4h, v15.4s, #13
    add v20.4s, v30.4s, v24.4s
    sub v15.4s, v30.4s, v24.4s
    rshrn v5.4h, v20.4s, #13
    rshrn v7.4h, v15.4s, #13
    ins v4.d[1], v5.d[0]
    ins v6.d[1], v7.d[0]
    transpose v4, v6, v3, .16b, .8h
    transpose v6, v10, v3, .16b, .4s
    ins v11.d[0], v10.d[1]
    ins v7.d[0], v6.d[1]
#endif

    /* Pass 2 */
    idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h

    /* Range limit */
    movi v30.8h, #0x80
    ins v26.d[1], v27.d[0]
    add v26.8h, v26.8h, v30.8h
    sqxtun v30.8b, v26.8h
    ins v26.d[0], v30.d[0]
    sqxtun v27.8b, v26.8h

    /* Store results to the output buffer */
    ldp TMP1, TMP2, [OUTPUT_BUF]
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL

    st1 {v26.b}[0], [TMP1], 1
    st1 {v27.b}[4], [TMP1], 1
    st1 {v26.b}[1], [TMP2], 1
    st1 {v27.b}[5], [TMP2], 1

    ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    blr x30

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
1423 .unreq TMP1 1424 .unreq TMP2 1425 1426.purgem idct_helper 1427 1428 1429/*****************************************************************************/ 1430 1431/* 1432 * jsimd_ycc_extrgb_convert_neon 1433 * jsimd_ycc_extbgr_convert_neon 1434 * jsimd_ycc_extrgbx_convert_neon 1435 * jsimd_ycc_extbgrx_convert_neon 1436 * jsimd_ycc_extxbgr_convert_neon 1437 * jsimd_ycc_extxrgb_convert_neon 1438 * 1439 * Colorspace conversion YCbCr -> RGB 1440 */ 1441 1442.macro do_load size 1443 .if \size == 8 1444 ld1 {v4.8b}, [U], 8 1445 ld1 {v5.8b}, [V], 8 1446 ld1 {v0.8b}, [Y], 8 1447 prfm pldl1keep, [U, #64] 1448 prfm pldl1keep, [V, #64] 1449 prfm pldl1keep, [Y, #64] 1450 .elseif \size == 4 1451 ld1 {v4.b}[0], [U], 1 1452 ld1 {v4.b}[1], [U], 1 1453 ld1 {v4.b}[2], [U], 1 1454 ld1 {v4.b}[3], [U], 1 1455 ld1 {v5.b}[0], [V], 1 1456 ld1 {v5.b}[1], [V], 1 1457 ld1 {v5.b}[2], [V], 1 1458 ld1 {v5.b}[3], [V], 1 1459 ld1 {v0.b}[0], [Y], 1 1460 ld1 {v0.b}[1], [Y], 1 1461 ld1 {v0.b}[2], [Y], 1 1462 ld1 {v0.b}[3], [Y], 1 1463 .elseif \size == 2 1464 ld1 {v4.b}[4], [U], 1 1465 ld1 {v4.b}[5], [U], 1 1466 ld1 {v5.b}[4], [V], 1 1467 ld1 {v5.b}[5], [V], 1 1468 ld1 {v0.b}[4], [Y], 1 1469 ld1 {v0.b}[5], [Y], 1 1470 .elseif \size == 1 1471 ld1 {v4.b}[6], [U], 1 1472 ld1 {v5.b}[6], [V], 1 1473 ld1 {v0.b}[6], [Y], 1 1474 .else 1475 .error unsupported macroblock size 1476 .endif 1477.endm 1478 1479.macro do_store bpp, size, fast_st3 1480 .if \bpp == 24 1481 .if \size == 8 1482 .if \fast_st3 == 1 1483 st3 {v10.8b, v11.8b, v12.8b}, [RGB], 24 1484 .else 1485 st1 {v10.b}[0], [RGB], #1 1486 st1 {v11.b}[0], [RGB], #1 1487 st1 {v12.b}[0], [RGB], #1 1488 1489 st1 {v10.b}[1], [RGB], #1 1490 st1 {v11.b}[1], [RGB], #1 1491 st1 {v12.b}[1], [RGB], #1 1492 1493 st1 {v10.b}[2], [RGB], #1 1494 st1 {v11.b}[2], [RGB], #1 1495 st1 {v12.b}[2], [RGB], #1 1496 1497 st1 {v10.b}[3], [RGB], #1 1498 st1 {v11.b}[3], [RGB], #1 1499 st1 {v12.b}[3], [RGB], #1 1500 1501 st1 {v10.b}[4], [RGB], #1 1502 st1 {v11.b}[4], [RGB], #1 1503 st1 {v12.b}[4], [RGB], #1 1504 1505 st1 {v10.b}[5], [RGB], #1 1506 st1 {v11.b}[5], [RGB], #1 1507 st1 {v12.b}[5], [RGB], #1 1508 1509 st1 {v10.b}[6], [RGB], #1 1510 st1 {v11.b}[6], [RGB], #1 1511 st1 {v12.b}[6], [RGB], #1 1512 1513 st1 {v10.b}[7], [RGB], #1 1514 st1 {v11.b}[7], [RGB], #1 1515 st1 {v12.b}[7], [RGB], #1 1516 .endif 1517 .elseif \size == 4 1518 st3 {v10.b, v11.b, v12.b}[0], [RGB], 3 1519 st3 {v10.b, v11.b, v12.b}[1], [RGB], 3 1520 st3 {v10.b, v11.b, v12.b}[2], [RGB], 3 1521 st3 {v10.b, v11.b, v12.b}[3], [RGB], 3 1522 .elseif \size == 2 1523 st3 {v10.b, v11.b, v12.b}[4], [RGB], 3 1524 st3 {v10.b, v11.b, v12.b}[5], [RGB], 3 1525 .elseif \size == 1 1526 st3 {v10.b, v11.b, v12.b}[6], [RGB], 3 1527 .else 1528 .error unsupported macroblock size 1529 .endif 1530 .elseif \bpp == 32 1531 .if \size == 8 1532 st4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32 1533 .elseif \size == 4 1534 st4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4 1535 st4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4 1536 st4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4 1537 st4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4 1538 .elseif \size == 2 1539 st4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4 1540 st4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4 1541 .elseif \size == 1 1542 st4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4 1543 .else 1544 .error unsupported macroblock size 1545 .endif 1546 .elseif \bpp==16 1547 .if \size == 8 1548 st1 {v25.8h}, [RGB], 16 1549 .elseif \size == 4 1550 st1 {v25.4h}, [RGB], 8 1551 .elseif \size == 2 1552 st1 {v25.h}[4], [RGB], 2 1553 st1 
{v25.h}[5], [RGB], 2 1554 .elseif \size == 1 1555 st1 {v25.h}[6], [RGB], 2 1556 .else 1557 .error unsupported macroblock size 1558 .endif 1559 .else 1560 .error unsupported bpp 1561 .endif 1562.endm 1563 1564.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \ 1565 g_offs, gsize, b_offs, bsize, \ 1566 defsize, fast_st3 1567 1568/* 1569 * 2-stage pipelined YCbCr->RGB conversion 1570 */ 1571 1572.macro do_yuv_to_rgb_stage1 1573 uaddw v6.8h, v2.8h, v4.8b /* q3 = u - 128 */ 1574 uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ 1575 smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */ 1576 smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */ 1577 smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */ 1578 smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */ 1579 smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */ 1580 smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */ 1581 smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */ 1582 smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */ 1583.endm 1584 1585.macro do_yuv_to_rgb_stage2 1586 rshrn v20.4h, v20.4s, #15 1587 rshrn2 v20.8h, v22.4s, #15 1588 rshrn v24.4h, v24.4s, #14 1589 rshrn2 v24.8h, v26.4s, #14 1590 rshrn v28.4h, v28.4s, #14 1591 rshrn2 v28.8h, v30.4s, #14 1592 uaddw v20.8h, v20.8h, v0.8b 1593 uaddw v24.8h, v24.8h, v0.8b 1594 uaddw v28.8h, v28.8h, v0.8b 1595 .if \bpp != 16 1596 sqxtun v1\g_offs\defsize, v20.8h 1597 sqxtun v1\r_offs\defsize, v24.8h 1598 sqxtun v1\b_offs\defsize, v28.8h 1599 .else 1600 sqshlu v21.8h, v20.8h, #8 1601 sqshlu v25.8h, v24.8h, #8 1602 sqshlu v29.8h, v28.8h, #8 1603 sri v25.8h, v21.8h, #5 1604 sri v25.8h, v29.8h, #11 1605 .endif 1606.endm 1607 1608.macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3 1609 rshrn v20.4h, v20.4s, #15 1610 rshrn v24.4h, v24.4s, #14 1611 rshrn v28.4h, v28.4s, #14 1612 ld1 {v4.8b}, [U], 8 1613 rshrn2 v20.8h, v22.4s, #15 1614 rshrn2 v24.8h, v26.4s, #14 1615 rshrn2 v28.8h, v30.4s, #14 1616 ld1 {v5.8b}, [V], 8 1617 uaddw v20.8h, v20.8h, v0.8b 1618 uaddw v24.8h, v24.8h, v0.8b 1619 uaddw v28.8h, v28.8h, v0.8b 1620 .if \bpp != 16 /**************** rgb24/rgb32 ******************************/ 1621 sqxtun v1\g_offs\defsize, v20.8h 1622 ld1 {v0.8b}, [Y], 8 1623 sqxtun v1\r_offs\defsize, v24.8h 1624 prfm pldl1keep, [U, #64] 1625 prfm pldl1keep, [V, #64] 1626 prfm pldl1keep, [Y, #64] 1627 sqxtun v1\b_offs\defsize, v28.8h 1628 uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */ 1629 uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ 1630 smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */ 1631 smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */ 1632 smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */ 1633 smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */ 1634 smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */ 1635 smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */ 1636 .else /**************************** rgb565 ********************************/ 1637 sqshlu v21.8h, v20.8h, #8 1638 sqshlu v25.8h, v24.8h, #8 1639 sqshlu v29.8h, v28.8h, #8 1640 uaddw v6.8h, v2.8h, v4.8b /* v6.16b = u - 128 */ 1641 uaddw v8.8h, v2.8h, v5.8b /* q2 = v - 128 */ 1642 ld1 {v0.8b}, [Y], 8 1643 smull v20.4s, v6.4h, v1.h[1] /* multiply by -11277 */ 1644 smlal v20.4s, v8.4h, v1.h[2] /* multiply by -23401 */ 1645 smull2 v22.4s, v6.8h, v1.h[1] /* multiply by -11277 */ 1646 smlal2 v22.4s, v8.8h, v1.h[2] /* multiply by -23401 */ 1647 sri v25.8h, v21.8h, #5 1648 smull v24.4s, v8.4h, v1.h[0] /* multiply by 22971 */ 1649 smull2 v26.4s, v8.8h, v1.h[0] /* multiply by 22971 */ 1650 prfm pldl1keep, [U, 
#64]
1651 prfm pldl1keep, [V, #64]
1652 prfm pldl1keep, [Y, #64]
1653 sri v25.8h, v29.8h, #11
1654 .endif
1655 do_store \bpp, 8, \fast_st3
1656 smull v28.4s, v6.4h, v1.h[3] /* multiply by 29033 */
1657 smull2 v30.4s, v6.8h, v1.h[3] /* multiply by 29033 */
1658.endm
1659
1660.macro do_yuv_to_rgb
1661 do_yuv_to_rgb_stage1
1662 do_yuv_to_rgb_stage2
1663.endm
1664
1665/* Apple gas crashes on adrl; work around that by using adr.
1666 * But this requires a copy of these constants for each function.
1667 */
1668
1669.balign 16
1670.if \fast_st3 == 1
1671Ljsimd_ycc_\colorid\()_neon_consts:
1672.else
1673Ljsimd_ycc_\colorid\()_neon_slowst3_consts:
1674.endif
1675 .short 0, 0, 0, 0 /* v0.4h = padding */
1676 .short 22971, -11277, -23401, 29033 /* v1.4h = Cr->R, Cb->G, Cr->G, Cb->B */
1677 .short -128, -128, -128, -128 /* v2.8h = centering offset for Cb/Cr */
1678 .short -128, -128, -128, -128
1679
1680.if \fast_st3 == 1
1681asm_function jsimd_ycc_\colorid\()_convert_neon
1682.else
1683asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
1684.endif
1685 OUTPUT_WIDTH .req w0
1686 INPUT_BUF .req x1
1687 INPUT_ROW .req w2
1688 OUTPUT_BUF .req x3
1689 NUM_ROWS .req w4
1690
1691 INPUT_BUF0 .req x5
1692 INPUT_BUF1 .req x6
1693 INPUT_BUF2 .req x1
1694
1695 RGB .req x7
1696 Y .req x9
1697 U .req x10
1698 V .req x11
1699 N .req w15
1700
1701 sub sp, sp, 64
1702 mov x9, sp
1703
1704 /* Load constants into v1.4h and v2.8h (v0.4h is just used for padding) */
1705 .if \fast_st3 == 1
1706 adr x15, Ljsimd_ycc_\colorid\()_neon_consts
1707 .else
1708 adr x15, Ljsimd_ycc_\colorid\()_neon_slowst3_consts
1709 .endif
1710
1711 /* Save NEON registers */
1712 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
1713 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
1714 ld1 {v0.4h, v1.4h}, [x15], 16
1715 ld1 {v2.8h}, [x15]
1716
1717 ldr INPUT_BUF0, [INPUT_BUF]
1718 ldr INPUT_BUF1, [INPUT_BUF, #8]
1719 ldr INPUT_BUF2, [INPUT_BUF, #16]
1720 .unreq INPUT_BUF
1721
1722 /* Initially set v10 and v13 to 0xFF (the alpha/X channel for 32 bpp formats) */
1723 movi v10.16b, #255
1724 movi v13.16b, #255
1725
1726 /* Outer loop over scanlines */
1727 cmp NUM_ROWS, #1
1728 b.lt 9f
17290:
1730 ldr Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
1731 ldr U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
1732 mov N, OUTPUT_WIDTH
1733 ldr V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
1734 add INPUT_ROW, INPUT_ROW, #1
1735 ldr RGB, [OUTPUT_BUF], #8
1736
1737 /* Inner loop over pixels */
1738 subs N, N, #8
1739 b.lt 3f
1740 do_load 8
1741 do_yuv_to_rgb_stage1
1742 subs N, N, #8
1743 b.lt 2f
17441:
1745 do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3
1746 subs N, N, #8
1747 b.ge 1b
17482:
1749 do_yuv_to_rgb_stage2
1750 do_store \bpp, 8, \fast_st3
1751 tst N, #7
1752 b.eq 8f
17533:
1754 tst N, #4
1755 b.eq 3f
1756 do_load 4
17573:
1758 tst N, #2
1759 b.eq 4f
1760 do_load 2
17614:
1762 tst N, #1
1763 b.eq 5f
1764 do_load 1
17655:
1766 do_yuv_to_rgb
1767 tst N, #4
1768 b.eq 6f
1769 do_store \bpp, 4, \fast_st3
17706:
1771 tst N, #2
1772 b.eq 7f
1773 do_store \bpp, 2, \fast_st3
17747:
1775 tst N, #1
1776 b.eq 8f
1777 do_store \bpp, 1, \fast_st3
17788:
1779 subs NUM_ROWS, NUM_ROWS, #1
1780 b.gt 0b
17819:
1782 /* Restore all registers and return */
1783 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
1784 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
1785 br x30
1786 .unreq OUTPUT_WIDTH
1787 .unreq INPUT_ROW
1788 .unreq OUTPUT_BUF
1789 .unreq NUM_ROWS
1790 .unreq INPUT_BUF0
1791 .unreq INPUT_BUF1
1792 .unreq INPUT_BUF2
1793 .unreq RGB
1794 .unreq Y
1795 .unreq U
1796 .unreq V
1797 .unreq N
1798
1799.purgem do_yuv_to_rgb
1800.purgem do_yuv_to_rgb_stage1
1801.purgem do_yuv_to_rgb_stage2
1802.purgem
do_yuv_to_rgb_stage2_store_load_stage1 1803 1804.endm 1805 1806/*--------------------------------- id ----- bpp R rsize G gsize B bsize defsize fast_st3*/ 1807generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 1 1808generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 1 1809generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b, 1 1810generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b, 1 1811generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b, 1 1812generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b, 1 1813generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b, 1 1814 1815generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 0 1816generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 0 1817 1818.purgem do_load 1819.purgem do_store 1820 1821 1822/*****************************************************************************/ 1823 1824/* 1825 * jsimd_extrgb_ycc_convert_neon 1826 * jsimd_extbgr_ycc_convert_neon 1827 * jsimd_extrgbx_ycc_convert_neon 1828 * jsimd_extbgrx_ycc_convert_neon 1829 * jsimd_extxbgr_ycc_convert_neon 1830 * jsimd_extxrgb_ycc_convert_neon 1831 * 1832 * Colorspace conversion RGB -> YCbCr 1833 */ 1834 1835.macro do_store size 1836 .if \size == 8 1837 st1 {v20.8b}, [Y], #8 1838 st1 {v21.8b}, [U], #8 1839 st1 {v22.8b}, [V], #8 1840 .elseif \size == 4 1841 st1 {v20.b}[0], [Y], #1 1842 st1 {v20.b}[1], [Y], #1 1843 st1 {v20.b}[2], [Y], #1 1844 st1 {v20.b}[3], [Y], #1 1845 st1 {v21.b}[0], [U], #1 1846 st1 {v21.b}[1], [U], #1 1847 st1 {v21.b}[2], [U], #1 1848 st1 {v21.b}[3], [U], #1 1849 st1 {v22.b}[0], [V], #1 1850 st1 {v22.b}[1], [V], #1 1851 st1 {v22.b}[2], [V], #1 1852 st1 {v22.b}[3], [V], #1 1853 .elseif \size == 2 1854 st1 {v20.b}[4], [Y], #1 1855 st1 {v20.b}[5], [Y], #1 1856 st1 {v21.b}[4], [U], #1 1857 st1 {v21.b}[5], [U], #1 1858 st1 {v22.b}[4], [V], #1 1859 st1 {v22.b}[5], [V], #1 1860 .elseif \size == 1 1861 st1 {v20.b}[6], [Y], #1 1862 st1 {v21.b}[6], [U], #1 1863 st1 {v22.b}[6], [V], #1 1864 .else 1865 .error unsupported macroblock size 1866 .endif 1867.endm 1868 1869.macro do_load bpp, size, fast_ld3 1870 .if \bpp == 24 1871 .if \size == 8 1872 .if \fast_ld3 == 1 1873 ld3 {v10.8b, v11.8b, v12.8b}, [RGB], #24 1874 .else 1875 ld1 {v10.b}[0], [RGB], #1 1876 ld1 {v11.b}[0], [RGB], #1 1877 ld1 {v12.b}[0], [RGB], #1 1878 1879 ld1 {v10.b}[1], [RGB], #1 1880 ld1 {v11.b}[1], [RGB], #1 1881 ld1 {v12.b}[1], [RGB], #1 1882 1883 ld1 {v10.b}[2], [RGB], #1 1884 ld1 {v11.b}[2], [RGB], #1 1885 ld1 {v12.b}[2], [RGB], #1 1886 1887 ld1 {v10.b}[3], [RGB], #1 1888 ld1 {v11.b}[3], [RGB], #1 1889 ld1 {v12.b}[3], [RGB], #1 1890 1891 ld1 {v10.b}[4], [RGB], #1 1892 ld1 {v11.b}[4], [RGB], #1 1893 ld1 {v12.b}[4], [RGB], #1 1894 1895 ld1 {v10.b}[5], [RGB], #1 1896 ld1 {v11.b}[5], [RGB], #1 1897 ld1 {v12.b}[5], [RGB], #1 1898 1899 ld1 {v10.b}[6], [RGB], #1 1900 ld1 {v11.b}[6], [RGB], #1 1901 ld1 {v12.b}[6], [RGB], #1 1902 1903 ld1 {v10.b}[7], [RGB], #1 1904 ld1 {v11.b}[7], [RGB], #1 1905 ld1 {v12.b}[7], [RGB], #1 1906 .endif 1907 prfm pldl1keep, [RGB, #128] 1908 .elseif \size == 4 1909 ld3 {v10.b, v11.b, v12.b}[0], [RGB], #3 1910 ld3 {v10.b, v11.b, v12.b}[1], [RGB], #3 1911 ld3 {v10.b, v11.b, v12.b}[2], [RGB], #3 1912 ld3 {v10.b, v11.b, v12.b}[3], [RGB], #3 1913 .elseif \size == 2 1914 ld3 {v10.b, v11.b, v12.b}[4], [RGB], #3 1915 ld3 {v10.b, v11.b, v12.b}[5], [RGB], #3 1916 .elseif 
\size == 1 1917 ld3 {v10.b, v11.b, v12.b}[6], [RGB], #3 1918 .else 1919 .error unsupported macroblock size 1920 .endif 1921 .elseif \bpp == 32 1922 .if \size == 8 1923 ld4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32 1924 prfm pldl1keep, [RGB, #128] 1925 .elseif \size == 4 1926 ld4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4 1927 ld4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4 1928 ld4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4 1929 ld4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4 1930 .elseif \size == 2 1931 ld4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4 1932 ld4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4 1933 .elseif \size == 1 1934 ld4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4 1935 .else 1936 .error unsupported macroblock size 1937 .endif 1938 .else 1939 .error unsupported bpp 1940 .endif 1941.endm 1942 1943.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \ 1944 b_offs, fast_ld3 1945 1946/* 1947 * 2-stage pipelined RGB->YCbCr conversion 1948 */ 1949 1950.macro do_rgb_to_yuv_stage1 1951 ushll v4.8h, v1\r_offs\().8b, #0 /* r = v4 */ 1952 ushll v6.8h, v1\g_offs\().8b, #0 /* g = v6 */ 1953 ushll v8.8h, v1\b_offs\().8b, #0 /* b = v8 */ 1954 rev64 v18.4s, v1.4s 1955 rev64 v26.4s, v1.4s 1956 rev64 v28.4s, v1.4s 1957 rev64 v30.4s, v1.4s 1958 umull v14.4s, v4.4h, v0.h[0] 1959 umull2 v16.4s, v4.8h, v0.h[0] 1960 umlsl v18.4s, v4.4h, v0.h[3] 1961 umlsl2 v26.4s, v4.8h, v0.h[3] 1962 umlal v28.4s, v4.4h, v0.h[5] 1963 umlal2 v30.4s, v4.8h, v0.h[5] 1964 umlal v14.4s, v6.4h, v0.h[1] 1965 umlal2 v16.4s, v6.8h, v0.h[1] 1966 umlsl v18.4s, v6.4h, v0.h[4] 1967 umlsl2 v26.4s, v6.8h, v0.h[4] 1968 umlsl v28.4s, v6.4h, v0.h[6] 1969 umlsl2 v30.4s, v6.8h, v0.h[6] 1970 umlal v14.4s, v8.4h, v0.h[2] 1971 umlal2 v16.4s, v8.8h, v0.h[2] 1972 umlal v18.4s, v8.4h, v0.h[5] 1973 umlal2 v26.4s, v8.8h, v0.h[5] 1974 umlsl v28.4s, v8.4h, v0.h[7] 1975 umlsl2 v30.4s, v8.8h, v0.h[7] 1976.endm 1977 1978.macro do_rgb_to_yuv_stage2 1979 rshrn v20.4h, v14.4s, #16 1980 shrn v22.4h, v18.4s, #16 1981 shrn v24.4h, v28.4s, #16 1982 rshrn2 v20.8h, v16.4s, #16 1983 shrn2 v22.8h, v26.4s, #16 1984 shrn2 v24.8h, v30.4s, #16 1985 xtn v20.8b, v20.8h /* v20 = y */ 1986 xtn v21.8b, v22.8h /* v21 = u */ 1987 xtn v22.8b, v24.8h /* v22 = v */ 1988.endm 1989 1990.macro do_rgb_to_yuv 1991 do_rgb_to_yuv_stage1 1992 do_rgb_to_yuv_stage2 1993.endm 1994 1995/* TODO: expand macros and interleave instructions if some in-order 1996 * ARM64 processor actually can dual-issue LOAD/STORE with ALU */ 1997.macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3 1998 do_rgb_to_yuv_stage2 1999 do_load \bpp, 8, \fast_ld3 2000 st1 {v20.8b}, [Y], #8 2001 st1 {v21.8b}, [U], #8 2002 st1 {v22.8b}, [V], #8 2003 do_rgb_to_yuv_stage1 2004.endm 2005 2006.balign 16 2007.if \fast_ld3 == 1 2008Ljsimd_\colorid\()_ycc_neon_consts: 2009.else 2010Ljsimd_\colorid\()_ycc_neon_slowld3_consts: 2011.endif 2012 .short 19595, 38470, 7471, 11059 2013 .short 21709, 32768, 27439, 5329 2014 .short 32767, 128, 32767, 128 2015 .short 32767, 128, 32767, 128 2016 2017.if \fast_ld3 == 1 2018asm_function jsimd_\colorid\()_ycc_convert_neon 2019.else 2020asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3 2021.endif 2022 OUTPUT_WIDTH .req w0 2023 INPUT_BUF .req x1 2024 OUTPUT_BUF .req x2 2025 OUTPUT_ROW .req w3 2026 NUM_ROWS .req w4 2027 2028 OUTPUT_BUF0 .req x5 2029 OUTPUT_BUF1 .req x6 2030 OUTPUT_BUF2 .req x2 /* OUTPUT_BUF */ 2031 2032 RGB .req x7 2033 Y .req x9 2034 U .req x10 2035 V .req x11 2036 N .req w12 2037 2038 /* Load constants to d0, d1, d2, d3 */ 2039 .if \fast_ld3 == 
1 2040 adr x13, Ljsimd_\colorid\()_ycc_neon_consts 2041 .else 2042 adr x13, Ljsimd_\colorid\()_ycc_neon_slowld3_consts 2043 .endif 2044 ld1 {v0.8h, v1.8h}, [x13] 2045 2046 ldr OUTPUT_BUF0, [OUTPUT_BUF] 2047 ldr OUTPUT_BUF1, [OUTPUT_BUF, #8] 2048 ldr OUTPUT_BUF2, [OUTPUT_BUF, #16] 2049 .unreq OUTPUT_BUF 2050 2051 /* Save NEON registers */ 2052 sub sp, sp, #64 2053 mov x9, sp 2054 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32 2055 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32 2056 2057 /* Outer loop over scanlines */ 2058 cmp NUM_ROWS, #1 2059 b.lt 9f 20600: 2061 ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3] 2062 ldr U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3] 2063 mov N, OUTPUT_WIDTH 2064 ldr V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3] 2065 add OUTPUT_ROW, OUTPUT_ROW, #1 2066 ldr RGB, [INPUT_BUF], #8 2067 2068 /* Inner loop over pixels */ 2069 subs N, N, #8 2070 b.lt 3f 2071 do_load \bpp, 8, \fast_ld3 2072 do_rgb_to_yuv_stage1 2073 subs N, N, #8 2074 b.lt 2f 20751: 2076 do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3 2077 subs N, N, #8 2078 b.ge 1b 20792: 2080 do_rgb_to_yuv_stage2 2081 do_store 8 2082 tst N, #7 2083 b.eq 8f 20843: 2085 tbz N, #2, 3f 2086 do_load \bpp, 4, \fast_ld3 20873: 2088 tbz N, #1, 4f 2089 do_load \bpp, 2, \fast_ld3 20904: 2091 tbz N, #0, 5f 2092 do_load \bpp, 1, \fast_ld3 20935: 2094 do_rgb_to_yuv 2095 tbz N, #2, 6f 2096 do_store 4 20976: 2098 tbz N, #1, 7f 2099 do_store 2 21007: 2101 tbz N, #0, 8f 2102 do_store 1 21038: 2104 subs NUM_ROWS, NUM_ROWS, #1 2105 b.gt 0b 21069: 2107 /* Restore all registers and return */ 2108 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32 2109 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32 2110 br x30 2111 2112 .unreq OUTPUT_WIDTH 2113 .unreq OUTPUT_ROW 2114 .unreq INPUT_BUF 2115 .unreq NUM_ROWS 2116 .unreq OUTPUT_BUF0 2117 .unreq OUTPUT_BUF1 2118 .unreq OUTPUT_BUF2 2119 .unreq RGB 2120 .unreq Y 2121 .unreq U 2122 .unreq V 2123 .unreq N 2124 2125.purgem do_rgb_to_yuv 2126.purgem do_rgb_to_yuv_stage1 2127.purgem do_rgb_to_yuv_stage2 2128.purgem do_rgb_to_yuv_stage2_store_load_stage1 2129 2130.endm 2131 2132/*--------------------------------- id ----- bpp R G B Fast LD3 */ 2133generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2, 1 2134generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0, 1 2135generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1 2136generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1 2137generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1 2138generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3, 1 2139 2140generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2, 0 2141generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0, 0 2142 2143.purgem do_load 2144.purgem do_store 2145 2146 2147/*****************************************************************************/ 2148 2149/* 2150 * Load data into workspace, applying unsigned->signed conversion 2151 * 2152 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get 2153 * rid of VST1.16 instructions 2154 */ 2155 2156asm_function jsimd_convsamp_neon 2157 SAMPLE_DATA .req x0 2158 START_COL .req x1 2159 WORKSPACE .req x2 2160 TMP1 .req x9 2161 TMP2 .req x10 2162 TMP3 .req x11 2163 TMP4 .req x12 2164 TMP5 .req x13 2165 TMP6 .req x14 2166 TMP7 .req x15 2167 TMP8 .req x4 2168 TMPDUP .req w3 2169 2170 /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't 2171 guarantee that the upper (unused) 32 bits of x1 are valid. This 2172 instruction ensures that those bits are set to zero. 
*/
2173 uxtw x1, w1
2174
2175 mov TMPDUP, #128
2176 ldp TMP1, TMP2, [SAMPLE_DATA], 16
2177 ldp TMP3, TMP4, [SAMPLE_DATA], 16
2178 dup v0.8b, TMPDUP
2179 add TMP1, TMP1, START_COL
2180 add TMP2, TMP2, START_COL
2181 ldp TMP5, TMP6, [SAMPLE_DATA], 16
2182 add TMP3, TMP3, START_COL
2183 add TMP4, TMP4, START_COL
2184 ldp TMP7, TMP8, [SAMPLE_DATA], 16
2185 add TMP5, TMP5, START_COL
2186 add TMP6, TMP6, START_COL
2187 ld1 {v16.8b}, [TMP1]
2188 add TMP7, TMP7, START_COL
2189 add TMP8, TMP8, START_COL
2190 ld1 {v17.8b}, [TMP2]
2191 usubl v16.8h, v16.8b, v0.8b
2192 ld1 {v18.8b}, [TMP3]
2193 usubl v17.8h, v17.8b, v0.8b
2194 ld1 {v19.8b}, [TMP4]
2195 usubl v18.8h, v18.8b, v0.8b
2196 ld1 {v20.8b}, [TMP5]
2197 usubl v19.8h, v19.8b, v0.8b
2198 ld1 {v21.8b}, [TMP6]
2199 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64
2200 usubl v20.8h, v20.8b, v0.8b
2201 ld1 {v22.8b}, [TMP7]
2202 usubl v21.8h, v21.8b, v0.8b
2203 ld1 {v23.8b}, [TMP8]
2204 usubl v22.8h, v22.8b, v0.8b
2205 usubl v23.8h, v23.8b, v0.8b
2206 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64
2207
2208 br x30
2209
2210 .unreq SAMPLE_DATA
2211 .unreq START_COL
2212 .unreq WORKSPACE
2213 .unreq TMP1
2214 .unreq TMP2
2215 .unreq TMP3
2216 .unreq TMP4
2217 .unreq TMP5
2218 .unreq TMP6
2219 .unreq TMP7
2220 .unreq TMP8
2221 .unreq TMPDUP
2222
2223/*****************************************************************************/
2224
2225/*
2226 * jsimd_fdct_islow_neon
2227 *
2228 * This function contains a slow-but-accurate integer implementation of the
2229 * forward DCT (Discrete Cosine Transform). The following code is based
2230 * directly on IJG's original jfdctint.c; see jfdctint.c for
2231 * more details.
2232 *
2233 * TODO: can be combined with 'jsimd_convsamp_neon' to get
2234 * rid of a bunch of VLD1.16 instructions
2235 */
2236
2237#define CONST_BITS 13
2238#define PASS1_BITS 2
2239
2240#define DESCALE_P1 (CONST_BITS-PASS1_BITS)
2241#define DESCALE_P2 (CONST_BITS+PASS1_BITS)
2242
2243#define F_0_298 2446 /* FIX(0.298631336) */
2244#define F_0_390 3196 /* FIX(0.390180644) */
2245#define F_0_541 4433 /* FIX(0.541196100) */
2246#define F_0_765 6270 /* FIX(0.765366865) */
2247#define F_0_899 7373 /* FIX(0.899976223) */
2248#define F_1_175 9633 /* FIX(1.175875602) */
2249#define F_1_501 12299 /* FIX(1.501321110) */
2250#define F_1_847 15137 /* FIX(1.847759065) */
2251#define F_1_961 16069 /* FIX(1.961570560) */
2252#define F_2_053 16819 /* FIX(2.053119869) */
2253#define F_2_562 20995 /* FIX(2.562915447) */
2254#define F_3_072 25172 /* FIX(3.072711026) */
2255
2256.balign 16
2257Ljsimd_fdct_islow_neon_consts:
2258 .short F_0_298
2259 .short -F_0_390
2260 .short F_0_541
2261 .short F_0_765
2262 .short -F_0_899
2263 .short F_1_175
2264 .short F_1_501
2265 .short -F_1_847
2266 .short -F_1_961
2267 .short F_2_053
2268 .short -F_2_562
2269 .short F_3_072
2270 .short 0 /* padding */
2271 .short 0
2272 .short 0
2273 .short 0
2274
2275#undef F_0_298
2276#undef F_0_390
2277#undef F_0_541
2278#undef F_0_765
2279#undef F_0_899
2280#undef F_1_175
2281#undef F_1_501
2282#undef F_1_847
2283#undef F_1_961
2284#undef F_2_053
2285#undef F_2_562
2286#undef F_3_072
2287#define XFIX_P_0_298 v0.h[0]
2288#define XFIX_N_0_390 v0.h[1]
2289#define XFIX_P_0_541 v0.h[2]
2290#define XFIX_P_0_765 v0.h[3]
2291#define XFIX_N_0_899 v0.h[4]
2292#define XFIX_P_1_175 v0.h[5]
2293#define XFIX_P_1_501 v0.h[6]
2294#define XFIX_N_1_847 v0.h[7]
2295#define XFIX_N_1_961 v1.h[0]
2296#define XFIX_P_2_053 v1.h[1]
2297#define XFIX_N_2_562 v1.h[2]
2298#define XFIX_P_3_072
v1.h[3] 2299 2300asm_function jsimd_fdct_islow_neon 2301 2302 DATA .req x0 2303 TMP .req x9 2304 2305 /* Load constants */ 2306 adr TMP, Ljsimd_fdct_islow_neon_consts 2307 ld1 {v0.8h, v1.8h}, [TMP] 2308 2309 /* Save NEON registers */ 2310 sub sp, sp, #64 2311 mov x10, sp 2312 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32 2313 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32 2314 2315 /* Load all DATA into NEON registers with the following allocation: 2316 * 0 1 2 3 | 4 5 6 7 2317 * ---------+-------- 2318 * 0 | d16 | d17 | v16.8h 2319 * 1 | d18 | d19 | v17.8h 2320 * 2 | d20 | d21 | v18.8h 2321 * 3 | d22 | d23 | v19.8h 2322 * 4 | d24 | d25 | v20.8h 2323 * 5 | d26 | d27 | v21.8h 2324 * 6 | d28 | d29 | v22.8h 2325 * 7 | d30 | d31 | v23.8h 2326 */ 2327 2328 ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64 2329 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] 2330 sub DATA, DATA, #64 2331 2332 /* Transpose */ 2333 transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4 2334 /* 1-D FDCT */ 2335 add v24.8h, v16.8h, v23.8h /* tmp0 = dataptr[0] + dataptr[7]; */ 2336 sub v31.8h, v16.8h, v23.8h /* tmp7 = dataptr[0] - dataptr[7]; */ 2337 add v25.8h, v17.8h, v22.8h /* tmp1 = dataptr[1] + dataptr[6]; */ 2338 sub v30.8h, v17.8h, v22.8h /* tmp6 = dataptr[1] - dataptr[6]; */ 2339 add v26.8h, v18.8h, v21.8h /* tmp2 = dataptr[2] + dataptr[5]; */ 2340 sub v29.8h, v18.8h, v21.8h /* tmp5 = dataptr[2] - dataptr[5]; */ 2341 add v27.8h, v19.8h, v20.8h /* tmp3 = dataptr[3] + dataptr[4]; */ 2342 sub v28.8h, v19.8h, v20.8h /* tmp4 = dataptr[3] - dataptr[4]; */ 2343 2344 /* even part */ 2345 2346 add v8.8h, v24.8h, v27.8h /* tmp10 = tmp0 + tmp3; */ 2347 sub v9.8h, v24.8h, v27.8h /* tmp13 = tmp0 - tmp3; */ 2348 add v10.8h, v25.8h, v26.8h /* tmp11 = tmp1 + tmp2; */ 2349 sub v11.8h, v25.8h, v26.8h /* tmp12 = tmp1 - tmp2; */ 2350 2351 add v16.8h, v8.8h, v10.8h /* tmp10 + tmp11 */ 2352 sub v20.8h, v8.8h, v10.8h /* tmp10 - tmp11 */ 2353 2354 add v18.8h, v11.8h, v9.8h /* tmp12 + tmp13 */ 2355 2356 shl v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM) LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */ 2357 shl v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM) LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */ 2358 2359 smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ 2360 smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */ 2361 mov v22.16b, v18.16b 2362 mov v25.16b, v24.16b 2363 2364 smlal v18.4s, v9.4h, XFIX_P_0_765 /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ 2365 smlal2 v24.4s, v9.8h, XFIX_P_0_765 /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */ 2366 smlal v22.4s, v11.4h, XFIX_N_1_847 /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ 2367 smlal2 v25.4s, v11.8h, XFIX_N_1_847 /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */ 2368 2369 rshrn v18.4h, v18.4s, #DESCALE_P1 2370 rshrn v22.4h, v22.4s, #DESCALE_P1 2371 rshrn2 v18.8h, v24.4s, #DESCALE_P1 /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */ 2372 rshrn2 v22.8h, v25.4s, #DESCALE_P1 /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */ 2373 2374 /* Odd part */ 2375 2376 add v8.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */ 2377 add v9.8h, v29.8h, v30.8h /* z2 = tmp5 + tmp6; */ 2378 add v10.8h, v28.8h, v30.8h /* z3 = tmp4 + tmp6; */ 2379 add v11.8h, v29.8h, v31.8h /* z4 = tmp5 + tmp7; */ 2380 smull v4.4s, v10.4h, XFIX_P_1_175 /* z5 lo = z3 lo * XFIX_P_1_175 */ 2381 smull2 v5.4s, v10.8h, XFIX_P_1_175 2382 smlal v4.4s, v11.4h, 
XFIX_P_1_175 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */ 2383 smlal2 v5.4s, v11.8h, XFIX_P_1_175 2384 2385 smull2 v24.4s, v28.8h, XFIX_P_0_298 2386 smull2 v25.4s, v29.8h, XFIX_P_2_053 2387 smull2 v26.4s, v30.8h, XFIX_P_3_072 2388 smull2 v27.4s, v31.8h, XFIX_P_1_501 2389 smull v28.4s, v28.4h, XFIX_P_0_298 /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */ 2390 smull v29.4s, v29.4h, XFIX_P_2_053 /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */ 2391 smull v30.4s, v30.4h, XFIX_P_3_072 /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */ 2392 smull v31.4s, v31.4h, XFIX_P_1_501 /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */ 2393 2394 smull2 v12.4s, v8.8h, XFIX_N_0_899 2395 smull2 v13.4s, v9.8h, XFIX_N_2_562 2396 smull2 v14.4s, v10.8h, XFIX_N_1_961 2397 smull2 v15.4s, v11.8h, XFIX_N_0_390 2398 smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223); */ 2399 smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447); */ 2400 smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560); */ 2401 smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644); */ 2402 2403 add v10.4s, v10.4s, v4.4s /* z3 += z5 */ 2404 add v14.4s, v14.4s, v5.4s 2405 add v11.4s, v11.4s, v4.4s /* z4 += z5 */ 2406 add v15.4s, v15.4s, v5.4s 2407 2408 add v28.4s, v28.4s, v8.4s /* tmp4 += z1 */ 2409 add v24.4s, v24.4s, v12.4s 2410 add v29.4s, v29.4s, v9.4s /* tmp5 += z2 */ 2411 add v25.4s, v25.4s, v13.4s 2412 add v30.4s, v30.4s, v10.4s /* tmp6 += z3 */ 2413 add v26.4s, v26.4s, v14.4s 2414 add v31.4s, v31.4s, v11.4s /* tmp7 += z4 */ 2415 add v27.4s, v27.4s, v15.4s 2416 2417 add v28.4s, v28.4s, v10.4s /* tmp4 += z3 */ 2418 add v24.4s, v24.4s, v14.4s 2419 add v29.4s, v29.4s, v11.4s /* tmp5 += z4 */ 2420 add v25.4s, v25.4s, v15.4s 2421 add v30.4s, v30.4s, v9.4s /* tmp6 += z2 */ 2422 add v26.4s, v26.4s, v13.4s 2423 add v31.4s, v31.4s, v8.4s /* tmp7 += z1 */ 2424 add v27.4s, v27.4s, v12.4s 2425 2426 rshrn v23.4h, v28.4s, #DESCALE_P1 2427 rshrn v21.4h, v29.4s, #DESCALE_P1 2428 rshrn v19.4h, v30.4s, #DESCALE_P1 2429 rshrn v17.4h, v31.4s, #DESCALE_P1 2430 rshrn2 v23.8h, v24.4s, #DESCALE_P1 /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */ 2431 rshrn2 v21.8h, v25.4s, #DESCALE_P1 /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */ 2432 rshrn2 v19.8h, v26.4s, #DESCALE_P1 /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */ 2433 rshrn2 v17.8h, v27.4s, #DESCALE_P1 /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */ 2434 2435 /* Transpose */ 2436 transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4 2437 2438 /* 1-D FDCT */ 2439 add v24.8h, v16.8h, v23.8h /* tmp0 = dataptr[0] + dataptr[7]; */ 2440 sub v31.8h, v16.8h, v23.8h /* tmp7 = dataptr[0] - dataptr[7]; */ 2441 add v25.8h, v17.8h, v22.8h /* tmp1 = dataptr[1] + dataptr[6]; */ 2442 sub v30.8h, v17.8h, v22.8h /* tmp6 = dataptr[1] - dataptr[6]; */ 2443 add v26.8h, v18.8h, v21.8h /* tmp2 = dataptr[2] + dataptr[5]; */ 2444 sub v29.8h, v18.8h, v21.8h /* tmp5 = dataptr[2] - dataptr[5]; */ 2445 add v27.8h, v19.8h, v20.8h /* tmp3 = dataptr[3] + dataptr[4]; */ 2446 sub v28.8h, v19.8h, v20.8h /* tmp4 = dataptr[3] - dataptr[4]; */ 2447 2448 /* even part */ 2449 add v8.8h, v24.8h, v27.8h /* tmp10 = tmp0 + tmp3; */ 2450 sub v9.8h, v24.8h, v27.8h /* tmp13 = tmp0 - tmp3; */ 2451 add v10.8h, v25.8h, v26.8h /* tmp11 = tmp1 + tmp2; */ 2452 sub v11.8h, v25.8h, v26.8h /* tmp12 = tmp1 - tmp2; */ 2453 2454 add v16.8h, v8.8h, v10.8h /* tmp10 + tmp11 
*/
2455 sub v20.8h, v8.8h, v10.8h /* tmp10 - tmp11 */
2456
2457 add v18.8h, v11.8h, v9.8h /* tmp12 + tmp13 */
2458
2459 srshr v16.8h, v16.8h, #PASS1_BITS /* dataptr[0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS); */
2460 srshr v20.8h, v20.8h, #PASS1_BITS /* dataptr[4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS); */
2461
2462 smull2 v24.4s, v18.8h, XFIX_P_0_541 /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
2463 smull v18.4s, v18.4h, XFIX_P_0_541 /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
2464 mov v22.16b, v18.16b
2465 mov v25.16b, v24.16b
2466
2467 smlal v18.4s, v9.4h, XFIX_P_0_765 /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
2468 smlal2 v24.4s, v9.8h, XFIX_P_0_765 /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
2469 smlal v22.4s, v11.4h, XFIX_N_1_847 /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
2470 smlal2 v25.4s, v11.8h, XFIX_N_1_847 /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
2471
2472 rshrn v18.4h, v18.4s, #DESCALE_P2
2473 rshrn v22.4h, v22.4s, #DESCALE_P2
2474 rshrn2 v18.8h, v24.4s, #DESCALE_P2 /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS+PASS1_BITS); */
2475 rshrn2 v22.8h, v25.4s, #DESCALE_P2 /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS+PASS1_BITS); */
2476
2477 /* Odd part */
2478 add v8.8h, v28.8h, v31.8h /* z1 = tmp4 + tmp7; */
2479 add v9.8h, v29.8h, v30.8h /* z2 = tmp5 + tmp6; */
2480 add v10.8h, v28.8h, v30.8h /* z3 = tmp4 + tmp6; */
2481 add v11.8h, v29.8h, v31.8h /* z4 = tmp5 + tmp7; */
2482
2483 smull v4.4s, v10.4h, XFIX_P_1_175 /* z5 lo = z3 lo * XFIX_P_1_175 */
2484 smull2 v5.4s, v10.8h, XFIX_P_1_175
2485 smlal v4.4s, v11.4h, XFIX_P_1_175 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
2486 smlal2 v5.4s, v11.8h, XFIX_P_1_175
2487
2488 smull2 v24.4s, v28.8h, XFIX_P_0_298
2489 smull2 v25.4s, v29.8h, XFIX_P_2_053
2490 smull2 v26.4s, v30.8h, XFIX_P_3_072
2491 smull2 v27.4s, v31.8h, XFIX_P_1_501
2492 smull v28.4s, v28.4h, XFIX_P_0_298 /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
2493 smull v29.4s, v29.4h, XFIX_P_2_053 /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
2494 smull v30.4s, v30.4h, XFIX_P_3_072 /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
2495 smull v31.4s, v31.4h, XFIX_P_1_501 /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
2496
2497 smull2 v12.4s, v8.8h, XFIX_N_0_899
2498 smull2 v13.4s, v9.8h, XFIX_N_2_562
2499 smull2 v14.4s, v10.8h, XFIX_N_1_961
2500 smull2 v15.4s, v11.8h, XFIX_N_0_390
2501 smull v8.4s, v8.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223); */
2502 smull v9.4s, v9.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447); */
2503 smull v10.4s, v10.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560); */
2504 smull v11.4s, v11.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644); */
2505
2506 add v10.4s, v10.4s, v4.4s /* z3 += z5 */
2507 add v14.4s, v14.4s, v5.4s
2508 add v11.4s, v11.4s, v4.4s /* z4 += z5 */
2509 add v15.4s, v15.4s, v5.4s
2510
2511 add v28.4s, v28.4s, v8.4s /* tmp4 += z1 */
2512 add v24.4s, v24.4s, v12.4s
2513 add v29.4s, v29.4s, v9.4s /* tmp5 += z2 */
2514 add v25.4s, v25.4s, v13.4s
2515 add v30.4s, v30.4s, v10.4s /* tmp6 += z3 */
2516 add v26.4s, v26.4s, v14.4s
2517 add v31.4s, v31.4s, v11.4s /* tmp7 += z4 */
2518 add v27.4s, v27.4s, v15.4s
2519
2520 add v28.4s, v28.4s, v10.4s /* tmp4 += z3 */
2521 add v24.4s, v24.4s, v14.4s
2522 add v29.4s, v29.4s, v11.4s /* tmp5 += z4 */
2523 add v25.4s, v25.4s, v15.4s
2524 add v30.4s, v30.4s, v9.4s /* tmp6 += z2 */
2525 add v26.4s, v26.4s, v13.4s
2526 add v31.4s, v31.4s, v8.4s /* tmp7 += z1 */
2527 add v27.4s, v27.4s, v12.4s
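 /* Final descale of pass 2: DESCALE(x, n) in jfdctint.c adds 1 << (n-1)
  * before shifting right by n (round to nearest), which is exactly what
  * the rounding narrowing shifts below implement with
  * n = DESCALE_P2 = CONST_BITS + PASS1_BITS. */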
2528
2529 rshrn v23.4h, v28.4s, #DESCALE_P2
2530 rshrn v21.4h, v29.4s, #DESCALE_P2
2531 rshrn v19.4h, v30.4s, #DESCALE_P2
2532 rshrn v17.4h, v31.4s, #DESCALE_P2
2533 rshrn2 v23.8h, v24.4s, #DESCALE_P2 /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS+PASS1_BITS); */
2534 rshrn2 v21.8h, v25.4s, #DESCALE_P2 /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS+PASS1_BITS); */
2535 rshrn2 v19.8h, v26.4s, #DESCALE_P2 /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS+PASS1_BITS); */
2536 rshrn2 v17.8h, v27.4s, #DESCALE_P2 /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS+PASS1_BITS); */
2537
2538 /* store results */
2539 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
2540 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
2541
2542 /* Restore NEON registers */
2543 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
2544 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
2545
2546 br x30
2547
2548 .unreq DATA
2549 .unreq TMP
2550
2551#undef XFIX_P_0_298
2552#undef XFIX_N_0_390
2553#undef XFIX_P_0_541
2554#undef XFIX_P_0_765
2555#undef XFIX_N_0_899
2556#undef XFIX_P_1_175
2557#undef XFIX_P_1_501
2558#undef XFIX_N_1_847
2559#undef XFIX_N_1_961
2560#undef XFIX_P_2_053
2561#undef XFIX_N_2_562
2562#undef XFIX_P_3_072
2563
2564
2565/*****************************************************************************/
2566
2567/*
2568 * jsimd_fdct_ifast_neon
2569 *
2570 * This function contains a fast, not-so-accurate integer implementation of
2571 * the forward DCT (Discrete Cosine Transform). It uses the same calculations
2572 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
2573 * function from jfdctfst.c.
2574 *
2575 * TODO: can be combined with 'jsimd_convsamp_neon' to get
2576 * rid of a bunch of VLD1.16 instructions
2577 */
2578
2579#undef XFIX_0_541196100
2580#define XFIX_0_382683433 v0.h[0]
2581#define XFIX_0_541196100 v0.h[1]
2582#define XFIX_0_707106781 v0.h[2]
2583#define XFIX_1_306562965 v0.h[3]
2584
2585.balign 16
2586Ljsimd_fdct_ifast_neon_consts:
2587 .short (98 * 128) /* XFIX_0_382683433 */
2588 .short (139 * 128) /* XFIX_0_541196100 */
2589 .short (181 * 128) /* XFIX_0_707106781 */
2590 .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
2591
2592asm_function jsimd_fdct_ifast_neon

2594 DATA .req x0
2595 TMP .req x9

2597 /* Load constants */
2598 adr TMP, Ljsimd_fdct_ifast_neon_consts
2599 ld1 {v0.4h}, [TMP]

2601 /* Load all DATA into NEON registers with the following allocation:
2602 * 0 1 2 3 | 4 5 6 7
2603 * ---------+--------
2604 * 0 | d16 | d17 | v16.8h
2605 * 1 | d18 | d19 | v17.8h
2606 * 2 | d20 | d21 | v18.8h
2607 * 3 | d22 | d23 | v19.8h
2608 * 4 | d24 | d25 | v20.8h
2609 * 5 | d26 | d27 | v21.8h
2610 * 6 | d28 | d29 | v22.8h
2611 * 7 | d30 | d31 | v23.8h
2612 */

2614 ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
2615 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
2616 mov TMP, #2
2617 sub DATA, DATA, #64
26181:
2619 /* Transpose */
2620 transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4
2621 subs TMP, TMP, #1
2622 /* 1-D FDCT */
2623 add v4.8h, v19.8h, v20.8h
2624 sub v20.8h, v19.8h, v20.8h
2625 sub v28.8h, v18.8h, v21.8h
2626 add v18.8h, v18.8h, v21.8h
2627 sub v29.8h, v17.8h, v22.8h
2628 add v17.8h, v17.8h, v22.8h
2629 sub v21.8h, v16.8h, v23.8h
2630 add v16.8h, v16.8h, v23.8h
2631 sub v6.8h, v17.8h, v18.8h
2632 sub v7.8h, v16.8h, v4.8h
2633 add v5.8h, v17.8h, v18.8h
2634 add v6.8h, v6.8h, v7.8h
2635 add v4.8h, v16.8h, v4.8h
2636 sqdmulh v6.8h, v6.8h, XFIX_0_707106781
2637 add v19.8h, v20.8h, v28.8h
2638 add
v16.8h, v4.8h, v5.8h 2639 sub v20.8h, v4.8h, v5.8h 2640 add v5.8h, v28.8h, v29.8h 2641 add v29.8h, v29.8h, v21.8h 2642 sqdmulh v5.8h, v5.8h, XFIX_0_707106781 2643 sub v28.8h, v19.8h, v29.8h 2644 add v18.8h, v7.8h, v6.8h 2645 sqdmulh v28.8h, v28.8h, XFIX_0_382683433 2646 sub v22.8h, v7.8h, v6.8h 2647 sqdmulh v19.8h, v19.8h, XFIX_0_541196100 2648 sqdmulh v7.8h, v29.8h, XFIX_1_306562965 2649 add v6.8h, v21.8h, v5.8h 2650 sub v5.8h, v21.8h, v5.8h 2651 add v29.8h, v29.8h, v28.8h 2652 add v19.8h, v19.8h, v28.8h 2653 add v29.8h, v29.8h, v7.8h 2654 add v21.8h, v5.8h, v19.8h 2655 sub v19.8h, v5.8h, v19.8h 2656 add v17.8h, v6.8h, v29.8h 2657 sub v23.8h, v6.8h, v29.8h 2658 2659 b.ne 1b 2660 2661 /* store results */ 2662 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64 2663 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA] 2664 2665 br x30 2666 2667 .unreq DATA 2668 .unreq TMP 2669#undef XFIX_0_382683433 2670#undef XFIX_0_541196100 2671#undef XFIX_0_707106781 2672#undef XFIX_1_306562965 2673 2674 2675/*****************************************************************************/ 2676 2677/* 2678 * GLOBAL(void) 2679 * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors, 2680 * DCTELEM *workspace); 2681 * 2682 */ 2683asm_function jsimd_quantize_neon 2684 2685 COEF_BLOCK .req x0 2686 DIVISORS .req x1 2687 WORKSPACE .req x2 2688 2689 RECIPROCAL .req DIVISORS 2690 CORRECTION .req x9 2691 SHIFT .req x10 2692 LOOP_COUNT .req x11 2693 2694 mov LOOP_COUNT, #2 2695 add CORRECTION, DIVISORS, #(64 * 2) 2696 add SHIFT, DIVISORS, #(64 * 6) 26971: 2698 subs LOOP_COUNT, LOOP_COUNT, #1 2699 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64 2700 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64 2701 abs v20.8h, v0.8h 2702 abs v21.8h, v1.8h 2703 abs v22.8h, v2.8h 2704 abs v23.8h, v3.8h 2705 ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64 2706 add v20.8h, v20.8h, v4.8h /* add correction */ 2707 add v21.8h, v21.8h, v5.8h 2708 add v22.8h, v22.8h, v6.8h 2709 add v23.8h, v23.8h, v7.8h 2710 umull v4.4s, v20.4h, v28.4h /* multiply by reciprocal */ 2711 umull2 v16.4s, v20.8h, v28.8h 2712 umull v5.4s, v21.4h, v29.4h 2713 umull2 v17.4s, v21.8h, v29.8h 2714 umull v6.4s, v22.4h, v30.4h /* multiply by reciprocal */ 2715 umull2 v18.4s, v22.8h, v30.8h 2716 umull v7.4s, v23.4h, v31.4h 2717 umull2 v19.4s, v23.8h, v31.8h 2718 ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64 2719 shrn v4.4h, v4.4s, #16 2720 shrn v5.4h, v5.4s, #16 2721 shrn v6.4h, v6.4s, #16 2722 shrn v7.4h, v7.4s, #16 2723 shrn2 v4.8h, v16.4s, #16 2724 shrn2 v5.8h, v17.4s, #16 2725 shrn2 v6.8h, v18.4s, #16 2726 shrn2 v7.8h, v19.4s, #16 2727 neg v24.8h, v24.8h 2728 neg v25.8h, v25.8h 2729 neg v26.8h, v26.8h 2730 neg v27.8h, v27.8h 2731 sshr v0.8h, v0.8h, #15 /* extract sign */ 2732 sshr v1.8h, v1.8h, #15 2733 sshr v2.8h, v2.8h, #15 2734 sshr v3.8h, v3.8h, #15 2735 ushl v4.8h, v4.8h, v24.8h /* shift */ 2736 ushl v5.8h, v5.8h, v25.8h 2737 ushl v6.8h, v6.8h, v26.8h 2738 ushl v7.8h, v7.8h, v27.8h 2739 2740 eor v4.16b, v4.16b, v0.16b /* restore sign */ 2741 eor v5.16b, v5.16b, v1.16b 2742 eor v6.16b, v6.16b, v2.16b 2743 eor v7.16b, v7.16b, v3.16b 2744 sub v4.8h, v4.8h, v0.8h 2745 sub v5.8h, v5.8h, v1.8h 2746 sub v6.8h, v6.8h, v2.8h 2747 sub v7.8h, v7.8h, v3.8h 2748 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64 2749 2750 b.ne 1b 2751 2752 br x30 /* return */ 2753 2754 .unreq COEF_BLOCK 2755 .unreq DIVISORS 2756 .unreq WORKSPACE 2757 .unreq RECIPROCAL 2758 .unreq CORRECTION 2759 .unreq SHIFT 2760 .unreq LOOP_COUNT 2761 2762 
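/*
 * A rough scalar model of the quantization flow above (an illustrative
 * sketch of the same sign/magnitude arithmetic, not the canonical
 * libjpeg-turbo C code). RECIPROCAL, CORRECTION and SHIFT point at three
 * 64-entry tables inside DIVISORS (elements 0, 64 and 192, as set up above):
 *
 *   #include <stdint.h>
 *
 *   static int16_t quantize_one(int16_t c, uint16_t recip, uint16_t corr,
 *                               int shift)
 *   {
 *     int16_t sign = c >> 15;                          // sshr #15: 0 or -1
 *     uint16_t mag = (uint16_t)(sign ? -c : c);        // abs
 *     uint32_t prod = (uint32_t)(mag + corr) * recip;  // umull by reciprocal
 *     uint16_t q = (uint16_t)((prod >> 16) >> shift);  // shrn #16, then ushl by -shift
 *     return (int16_t)((q ^ sign) - sign);             // eor/sub restores the sign
 *   }
 */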
2763/*****************************************************************************/ 2764 2765/* 2766 * Downsample pixel values of a single component. 2767 * This version handles the common case of 2:1 horizontal and 1:1 vertical, 2768 * without smoothing. 2769 * 2770 * GLOBAL(void) 2771 * jsimd_h2v1_downsample_neon (JDIMENSION image_width, int max_v_samp_factor, 2772 * JDIMENSION v_samp_factor, 2773 * JDIMENSION width_blocks, JSAMPARRAY input_data, 2774 * JSAMPARRAY output_data); 2775 */ 2776 2777.balign 16 2778Ljsimd_h2_downsample_neon_consts: 2779 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ 2780 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F /* diff 0 */ 2781 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ 2782 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E /* diff 1 */ 2783 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ 2784 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D /* diff 2 */ 2785 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ 2786 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C /* diff 3 */ 2787 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ 2788 0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B /* diff 4 */ 2789 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ 2790 0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A /* diff 5 */ 2791 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ 2792 0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09 /* diff 6 */ 2793 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ 2794 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08 /* diff 7 */ 2795 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ 2796 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07 /* diff 8 */ 2797 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \ 2798 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06 /* diff 9 */ 2799 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \ 2800 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05 /* diff 10 */ 2801 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \ 2802 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04 /* diff 11 */ 2803 .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \ 2804 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 /* diff 12 */ 2805 .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \ 2806 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 /* diff 13 */ 2807 .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \ 2808 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 /* diff 14 */ 2809 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ 2810 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* diff 15 */ 2811 2812asm_function jsimd_h2v1_downsample_neon 2813 IMAGE_WIDTH .req x0 2814 MAX_V_SAMP .req x1 2815 V_SAMP .req x2 2816 BLOCK_WIDTH .req x3 2817 INPUT_DATA .req x4 2818 OUTPUT_DATA .req x5 2819 OUTPTR .req x9 2820 INPTR .req x10 2821 TMP1 .req x11 2822 TMP2 .req x12 2823 TMP3 .req x13 2824 TMPDUP .req w15 2825 2826 mov TMPDUP, #0x10000 2827 lsl TMP2, BLOCK_WIDTH, #4 2828 sub TMP2, TMP2, IMAGE_WIDTH 2829 adr TMP3, Ljsimd_h2_downsample_neon_consts 2830 add TMP3, TMP3, TMP2, lsl #4 2831 dup v16.4s, TMPDUP 2832 ld1 {v18.16b}, [TMP3] 2833 28341: /* row loop */ 2835 ldr INPTR, [INPUT_DATA], #8 2836 ldr OUTPTR, [OUTPUT_DATA], #8 2837 subs TMP1, BLOCK_WIDTH, #1 2838 b.eq 3f 28392: /* columns */ 2840 ld1 {v0.16b}, [INPTR], #16 2841 mov v4.16b, v16.16b 2842 subs TMP1, TMP1, #1 2843 uadalp v4.8h, v0.16b 2844 shrn v6.8b, v4.8h, #1 2845 st1 {v6.8b}, [OUTPTR], #8 2846 b.ne 2b 28473: /* last columns */ 2848 ld1 {v0.16b}, [INPTR] 2849 mov v4.16b, v16.16b 2850 subs V_SAMP, V_SAMP, 
#1 2851 /* expand right */ 2852 tbl v2.16b, {v0.16b}, v18.16b 2853 uadalp v4.8h, v2.16b 2854 shrn v6.8b, v4.8h, #1 2855 st1 {v6.8b}, [OUTPTR], #8 2856 b.ne 1b 2857 2858 br x30 2859 2860 .unreq IMAGE_WIDTH 2861 .unreq MAX_V_SAMP 2862 .unreq V_SAMP 2863 .unreq BLOCK_WIDTH 2864 .unreq INPUT_DATA 2865 .unreq OUTPUT_DATA 2866 .unreq OUTPTR 2867 .unreq INPTR 2868 .unreq TMP1 2869 .unreq TMP2 2870 .unreq TMP3 2871 .unreq TMPDUP 2872 2873 2874/*****************************************************************************/ 2875 2876/* 2877 * Downsample pixel values of a single component. 2878 * This version handles the common case of 2:1 horizontal and 2:1 vertical, 2879 * without smoothing. 2880 * 2881 * GLOBAL(void) 2882 * jsimd_h2v2_downsample_neon (JDIMENSION image_width, int max_v_samp_factor, 2883 * JDIMENSION v_samp_factor, JDIMENSION width_blocks, 2884 * JSAMPARRAY input_data, JSAMPARRAY output_data); 2885 */ 2886 2887.balign 16 2888asm_function jsimd_h2v2_downsample_neon 2889 IMAGE_WIDTH .req x0 2890 MAX_V_SAMP .req x1 2891 V_SAMP .req x2 2892 BLOCK_WIDTH .req x3 2893 INPUT_DATA .req x4 2894 OUTPUT_DATA .req x5 2895 OUTPTR .req x9 2896 INPTR0 .req x10 2897 INPTR1 .req x14 2898 TMP1 .req x11 2899 TMP2 .req x12 2900 TMP3 .req x13 2901 TMPDUP .req w15 2902 2903 mov TMPDUP, #1 2904 lsl TMP2, BLOCK_WIDTH, #4 2905 lsl TMPDUP, TMPDUP, #17 2906 sub TMP2, TMP2, IMAGE_WIDTH 2907 adr TMP3, Ljsimd_h2_downsample_neon_consts 2908 orr TMPDUP, TMPDUP, #1 2909 add TMP3, TMP3, TMP2, lsl #4 2910 dup v16.4s, TMPDUP 2911 ld1 {v18.16b}, [TMP3] 2912 29131: /* row loop */ 2914 ldr INPTR0, [INPUT_DATA], #8 2915 ldr OUTPTR, [OUTPUT_DATA], #8 2916 ldr INPTR1, [INPUT_DATA], #8 2917 subs TMP1, BLOCK_WIDTH, #1 2918 b.eq 3f 29192: /* columns */ 2920 ld1 {v0.16b}, [INPTR0], #16 2921 ld1 {v1.16b}, [INPTR1], #16 2922 mov v4.16b, v16.16b 2923 subs TMP1, TMP1, #1 2924 uadalp v4.8h, v0.16b 2925 uadalp v4.8h, v1.16b 2926 shrn v6.8b, v4.8h, #2 2927 st1 {v6.8b}, [OUTPTR], #8 2928 b.ne 2b 29293: /* last columns */ 2930 ld1 {v0.16b}, [INPTR0], #16 2931 ld1 {v1.16b}, [INPTR1], #16 2932 mov v4.16b, v16.16b 2933 subs V_SAMP, V_SAMP, #1 2934 /* expand right */ 2935 tbl v2.16b, {v0.16b}, v18.16b 2936 tbl v3.16b, {v1.16b}, v18.16b 2937 uadalp v4.8h, v2.16b 2938 uadalp v4.8h, v3.16b 2939 shrn v6.8b, v4.8h, #2 2940 st1 {v6.8b}, [OUTPTR], #8 2941 b.ne 1b 2942 2943 br x30 2944 2945 .unreq IMAGE_WIDTH 2946 .unreq MAX_V_SAMP 2947 .unreq V_SAMP 2948 .unreq BLOCK_WIDTH 2949 .unreq INPUT_DATA 2950 .unreq OUTPUT_DATA 2951 .unreq OUTPTR 2952 .unreq INPTR0 2953 .unreq INPTR1 2954 .unreq TMP1 2955 .unreq TMP2 2956 .unreq TMP3 2957 .unreq TMPDUP 2958 2959 2960/*****************************************************************************/ 2961 2962/* 2963 * GLOBAL(JOCTET*) 2964 * jsimd_huff_encode_one_block (working_state *state, JOCTET *buffer, 2965 * JCOEFPTR block, int last_dc_val, 2966 * c_derived_tbl *dctbl, c_derived_tbl *actbl) 2967 * 2968 */ 2969 2970 BUFFER .req x1 2971 PUT_BUFFER .req x6 2972 PUT_BITS .req x7 2973 PUT_BITSw .req w7 2974 2975.macro emit_byte 2976 sub PUT_BITS, PUT_BITS, #0x8 2977 lsr x19, PUT_BUFFER, PUT_BITS 2978 uxtb w19, w19 2979 strb w19, [BUFFER, #1]! 2980 cmp w19, #0xff 2981 b.ne 14f 2982 strb wzr, [BUFFER, #1]! 
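 /* A 0xFF byte in the entropy-coded segment could be mistaken for a
  * marker prefix, so the store above stuffs a 0x00 byte after it
  * (JPEG byte stuffing, ITU T.81 B.1.1.5). */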
298314: 2984.endm 2985.macro put_bits CODE, SIZE 2986 lsl PUT_BUFFER, PUT_BUFFER, \SIZE 2987 add PUT_BITS, PUT_BITS, \SIZE 2988 orr PUT_BUFFER, PUT_BUFFER, \CODE 2989.endm 2990.macro checkbuf31 2991 cmp PUT_BITS, #0x20 2992 b.lt 31f 2993 emit_byte 2994 emit_byte 2995 emit_byte 2996 emit_byte 299731: 2998.endm 2999.macro checkbuf47 3000 cmp PUT_BITS, #0x30 3001 b.lt 47f 3002 emit_byte 3003 emit_byte 3004 emit_byte 3005 emit_byte 3006 emit_byte 3007 emit_byte 300847: 3009.endm 3010 3011.macro generate_jsimd_huff_encode_one_block fast_tbl 3012 3013.balign 16 3014.if \fast_tbl == 1 3015Ljsimd_huff_encode_one_block_neon_consts: 3016.else 3017Ljsimd_huff_encode_one_block_neon_slowtbl_consts: 3018.endif 3019 .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \ 3020 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 3021.if \fast_tbl == 1 3022 .byte 0, 1, 2, 3, 16, 17, 32, 33, \ 3023 18, 19, 4, 5, 6, 7, 20, 21 /* L0 => L3 : 4 lines OK */ 3024 .byte 34, 35, 48, 49, 255, 255, 50, 51, \ 3025 36, 37, 22, 23, 8, 9, 10, 11 /* L0 => L3 : 4 lines OK */ 3026 .byte 8, 9, 22, 23, 36, 37, 50, 51, \ 3027 255, 255, 255, 255, 255, 255, 52, 53 /* L1 => L4 : 4 lines OK */ 3028 .byte 54, 55, 40, 41, 26, 27, 12, 13, \ 3029 14, 15, 28, 29, 42, 43, 56, 57 /* L0 => L3 : 4 lines OK */ 3030 .byte 6, 7, 20, 21, 34, 35, 48, 49, \ 3031 50, 51, 36, 37, 22, 23, 8, 9 /* L4 => L7 : 4 lines OK */ 3032 .byte 42, 43, 28, 29, 14, 15, 30, 31, \ 3033 44, 45, 58, 59, 255, 255, 255, 255 /* L1 => L4 : 4 lines OK */ 3034 .byte 255, 255, 255, 255, 56, 57, 42, 43, \ 3035 28, 29, 14, 15, 30, 31, 44, 45 /* L3 => L6 : 4 lines OK */ 3036 .byte 26, 27, 40, 41, 42, 43, 28, 29, \ 3037 14, 15, 30, 31, 44, 45, 46, 47 /* L5 => L7 : 3 lines OK */ 3038 .byte 255, 255, 255, 255, 0, 1, 255, 255, \ 3039 255, 255, 255, 255, 255, 255, 255, 255 /* L4 : 1 lines OK */ 3040 .byte 255, 255, 255, 255, 255, 255, 255, 255, \ 3041 0, 1, 16, 17, 2, 3, 255, 255 /* L5 => L6 : 2 lines OK */ 3042 .byte 255, 255, 255, 255, 255, 255, 255, 255, \ 3043 255, 255, 255, 255, 8, 9, 22, 23 /* L5 => L6 : 2 lines OK */ 3044 .byte 4, 5, 6, 7, 255, 255, 255, 255, \ 3045 255, 255, 255, 255, 255, 255, 255, 255 /* L7 : 1 line OK */ 3046.endif 3047 3048.if \fast_tbl == 1 3049asm_function jsimd_huff_encode_one_block_neon 3050.else 3051asm_function jsimd_huff_encode_one_block_neon_slowtbl 3052.endif 3053 sub sp, sp, 272 3054 sub BUFFER, BUFFER, #0x1 /* BUFFER=buffer-- */ 3055 /* Save ARM registers */ 3056 stp x19, x20, [sp] 3057.if \fast_tbl == 1 3058 adr x15, Ljsimd_huff_encode_one_block_neon_consts 3059.else 3060 adr x15, Ljsimd_huff_encode_one_block_neon_slowtbl_consts 3061.endif 3062 ldr PUT_BUFFER, [x0, #0x10] 3063 ldr PUT_BITSw, [x0, #0x18] 3064 ldrsh w12, [x2] /* load DC coeff in w12 */ 3065 /* prepare data */ 3066.if \fast_tbl == 1 3067 ld1 {v23.16b}, [x15], #16 3068 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64 3069 ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64 3070 ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64 3071 ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64 3072 ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64 3073 sub w12, w12, w3 /* last_dc_val, not used afterwards */ 3074 /* ZigZag 8x8 */ 3075 tbl v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b 3076 tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b 3077 tbl v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b 3078 tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b 3079 tbl v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b 3080 tbl v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, 
v5.16b 3081 tbl v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b 3082 tbl v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b 3083 ins v0.h[0], w12 3084 tbx v1.16b, {v28.16b}, v16.16b 3085 tbx v2.16b, {v29.16b, v30.16b}, v17.16b 3086 tbx v5.16b, {v29.16b, v30.16b}, v18.16b 3087 tbx v6.16b, {v31.16b}, v19.16b 3088.else 3089 add x13, x2, #0x22 3090 sub w12, w12, w3 /* last_dc_val, not used afterwards */ 3091 ld1 {v23.16b}, [x15] 3092 add x14, x2, #0x18 3093 add x3, x2, #0x36 3094 ins v0.h[0], w12 3095 add x9, x2, #0x2 3096 ld1 {v1.h}[0], [x13] 3097 add x15, x2, #0x30 3098 ld1 {v2.h}[0], [x14] 3099 add x19, x2, #0x26 3100 ld1 {v3.h}[0], [x3] 3101 add x20, x2, #0x28 3102 ld1 {v0.h}[1], [x9] 3103 add x12, x2, #0x10 3104 ld1 {v1.h}[1], [x15] 3105 add x13, x2, #0x40 3106 ld1 {v2.h}[1], [x19] 3107 add x14, x2, #0x34 3108 ld1 {v3.h}[1], [x20] 3109 add x3, x2, #0x1a 3110 ld1 {v0.h}[2], [x12] 3111 add x9, x2, #0x20 3112 ld1 {v1.h}[2], [x13] 3113 add x15, x2, #0x32 3114 ld1 {v2.h}[2], [x14] 3115 add x19, x2, #0x42 3116 ld1 {v3.h}[2], [x3] 3117 add x20, x2, #0xc 3118 ld1 {v0.h}[3], [x9] 3119 add x12, x2, #0x12 3120 ld1 {v1.h}[3], [x15] 3121 add x13, x2, #0x24 3122 ld1 {v2.h}[3], [x19] 3123 add x14, x2, #0x50 3124 ld1 {v3.h}[3], [x20] 3125 add x3, x2, #0xe 3126 ld1 {v0.h}[4], [x12] 3127 add x9, x2, #0x4 3128 ld1 {v1.h}[4], [x13] 3129 add x15, x2, #0x16 3130 ld1 {v2.h}[4], [x14] 3131 add x19, x2, #0x60 3132 ld1 {v3.h}[4], [x3] 3133 add x20, x2, #0x1c 3134 ld1 {v0.h}[5], [x9] 3135 add x12, x2, #0x6 3136 ld1 {v1.h}[5], [x15] 3137 add x13, x2, #0x8 3138 ld1 {v2.h}[5], [x19] 3139 add x14, x2, #0x52 3140 ld1 {v3.h}[5], [x20] 3141 add x3, x2, #0x2a 3142 ld1 {v0.h}[6], [x12] 3143 add x9, x2, #0x14 3144 ld1 {v1.h}[6], [x13] 3145 add x15, x2, #0xa 3146 ld1 {v2.h}[6], [x14] 3147 add x19, x2, #0x44 3148 ld1 {v3.h}[6], [x3] 3149 add x20, x2, #0x38 3150 ld1 {v0.h}[7], [x9] 3151 add x12, x2, #0x46 3152 ld1 {v1.h}[7], [x15] 3153 add x13, x2, #0x3a 3154 ld1 {v2.h}[7], [x19] 3155 add x14, x2, #0x74 3156 ld1 {v3.h}[7], [x20] 3157 add x3, x2, #0x6a 3158 ld1 {v4.h}[0], [x12] 3159 add x9, x2, #0x54 3160 ld1 {v5.h}[0], [x13] 3161 add x15, x2, #0x2c 3162 ld1 {v6.h}[0], [x14] 3163 add x19, x2, #0x76 3164 ld1 {v7.h}[0], [x3] 3165 add x20, x2, #0x78 3166 ld1 {v4.h}[1], [x9] 3167 add x12, x2, #0x62 3168 ld1 {v5.h}[1], [x15] 3169 add x13, x2, #0x1e 3170 ld1 {v6.h}[1], [x19] 3171 add x14, x2, #0x68 3172 ld1 {v7.h}[1], [x20] 3173 add x3, x2, #0x7a 3174 ld1 {v4.h}[2], [x12] 3175 add x9, x2, #0x70 3176 ld1 {v5.h}[2], [x13] 3177 add x15, x2, #0x2e 3178 ld1 {v6.h}[2], [x14] 3179 add x19, x2, #0x5a 3180 ld1 {v7.h}[2], [x3] 3181 add x20, x2, #0x6c 3182 ld1 {v4.h}[3], [x9] 3183 add x12, x2, #0x72 3184 ld1 {v5.h}[3], [x15] 3185 add x13, x2, #0x3c 3186 ld1 {v6.h}[3], [x19] 3187 add x14, x2, #0x4c 3188 ld1 {v7.h}[3], [x20] 3189 add x3, x2, #0x5e 3190 ld1 {v4.h}[4], [x12] 3191 add x9, x2, #0x64 3192 ld1 {v5.h}[4], [x13] 3193 add x15, x2, #0x4a 3194 ld1 {v6.h}[4], [x14] 3195 add x19, x2, #0x3e 3196 ld1 {v7.h}[4], [x3] 3197 add x20, x2, #0x6e 3198 ld1 {v4.h}[5], [x9] 3199 add x12, x2, #0x56 3200 ld1 {v5.h}[5], [x15] 3201 add x13, x2, #0x58 3202 ld1 {v6.h}[5], [x19] 3203 add x14, x2, #0x4e 3204 ld1 {v7.h}[5], [x20] 3205 add x3, x2, #0x7c 3206 ld1 {v4.h}[6], [x12] 3207 add x9, x2, #0x48 3208 ld1 {v5.h}[6], [x13] 3209 add x15, x2, #0x66 3210 ld1 {v6.h}[6], [x14] 3211 add x19, x2, #0x5c 3212 ld1 {v7.h}[6], [x3] 3213 add x20, x2, #0x7e 3214 ld1 {v4.h}[7], [x9] 3215 ld1 {v5.h}[7], [x15] 3216 ld1 {v6.h}[7], [x19] 3217 ld1 {v7.h}[7], [x20] 3218.endif 3219 
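 /* The cmlt/abs/eor block below converts the coefficients to the
  * sign/magnitude form used by JPEG entropy coding: v0-v7 end up holding
  * |coef| (used later to derive the bit length via CLZ), while v24-v31
  * hold coef for coef >= 0 and the one's complement ~|coef| for coef < 0,
  * i.e. the extra bits that follow each Huffman code (ITU T.81 F.1.2). */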
cmlt v24.8h, v0.8h, #0 3220 cmlt v25.8h, v1.8h, #0 3221 cmlt v26.8h, v2.8h, #0 3222 cmlt v27.8h, v3.8h, #0 3223 cmlt v28.8h, v4.8h, #0 3224 cmlt v29.8h, v5.8h, #0 3225 cmlt v30.8h, v6.8h, #0 3226 cmlt v31.8h, v7.8h, #0 3227 abs v0.8h, v0.8h 3228 abs v1.8h, v1.8h 3229 abs v2.8h, v2.8h 3230 abs v3.8h, v3.8h 3231 abs v4.8h, v4.8h 3232 abs v5.8h, v5.8h 3233 abs v6.8h, v6.8h 3234 abs v7.8h, v7.8h 3235 eor v24.16b, v24.16b, v0.16b 3236 eor v25.16b, v25.16b, v1.16b 3237 eor v26.16b, v26.16b, v2.16b 3238 eor v27.16b, v27.16b, v3.16b 3239 eor v28.16b, v28.16b, v4.16b 3240 eor v29.16b, v29.16b, v5.16b 3241 eor v30.16b, v30.16b, v6.16b 3242 eor v31.16b, v31.16b, v7.16b 3243 cmeq v16.8h, v0.8h, #0 3244 cmeq v17.8h, v1.8h, #0 3245 cmeq v18.8h, v2.8h, #0 3246 cmeq v19.8h, v3.8h, #0 3247 cmeq v20.8h, v4.8h, #0 3248 cmeq v21.8h, v5.8h, #0 3249 cmeq v22.8h, v6.8h, #0 3250 xtn v16.8b, v16.8h 3251 xtn v18.8b, v18.8h 3252 xtn v20.8b, v20.8h 3253 xtn v22.8b, v22.8h 3254 umov w14, v0.h[0] 3255 xtn2 v16.16b, v17.8h 3256 umov w13, v24.h[0] 3257 xtn2 v18.16b, v19.8h 3258 clz w14, w14 3259 xtn2 v20.16b, v21.8h 3260 lsl w13, w13, w14 3261 cmeq v17.8h, v7.8h, #0 3262 sub w12, w14, #32 3263 xtn2 v22.16b, v17.8h 3264 lsr w13, w13, w14 3265 and v16.16b, v16.16b, v23.16b 3266 neg w12, w12 3267 and v18.16b, v18.16b, v23.16b 3268 add x3, x4, #0x400 /* r1 = dctbl->ehufsi */ 3269 and v20.16b, v20.16b, v23.16b 3270 add x15, sp, #0x90 /* x15 = t2 */ 3271 and v22.16b, v22.16b, v23.16b 3272 ldr w10, [x4, x12, lsl #2] 3273 addp v16.16b, v16.16b, v18.16b 3274 ldrb w11, [x3, x12] 3275 addp v20.16b, v20.16b, v22.16b 3276 checkbuf47 3277 addp v16.16b, v16.16b, v20.16b 3278 put_bits x10, x11 3279 addp v16.16b, v16.16b, v18.16b 3280 checkbuf47 3281 umov x9,v16.D[0] 3282 put_bits x13, x12 3283 cnt v17.8b, v16.8b 3284 mvn x9, x9 3285 addv B18, v17.8b 3286 add x4, x5, #0x400 /* x4 = actbl->ehufsi */ 3287 umov w12, v18.b[0] 3288 lsr x9, x9, #0x1 /* clear AC coeff */ 3289 ldr w13, [x5, #0x3c0] /* x13 = actbl->ehufco[0xf0] */ 3290 rbit x9, x9 /* x9 = index0 */ 3291 ldrb w14, [x4, #0xf0] /* x14 = actbl->ehufsi[0xf0] */ 3292 cmp w12, #(64-8) 3293 add x11, sp, #16 3294 b.lt 4f 3295 cbz x9, 6f 3296 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64 3297 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64 3298 st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64 3299 st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64 33001: 3301 clz x2, x9 3302 add x15, x15, x2, lsl #1 3303 lsl x9, x9, x2 3304 ldrh w20, [x15, #-126] 33052: 3306 cmp x2, #0x10 3307 b.lt 3f 3308 sub x2, x2, #0x10 3309 checkbuf47 3310 put_bits x13, x14 3311 b 2b 33123: 3313 clz w20, w20 3314 ldrh w3, [x15, #2]! 
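 /* w20 = clz(|coef|), so nbits = 32 - w20 (computed by the sub/neg below);
  * the lsl/lsr pair by w20 keeps only the low nbits bits of the
  * complement value just loaded into w3. */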
3315 sub w11, w20, #32 3316 lsl w3, w3, w20 3317 neg w11, w11 3318 lsr w3, w3, w20 3319 add x2, x11, x2, lsl #4 3320 lsl x9, x9, #0x1 3321 ldr w12, [x5, x2, lsl #2] 3322 ldrb w10, [x4, x2] 3323 checkbuf31 3324 put_bits x12, x10 3325 put_bits x3, x11 3326 cbnz x9, 1b 3327 b 6f 33284: 3329 movi v21.8h, #0x0010 3330 clz v0.8h, v0.8h 3331 clz v1.8h, v1.8h 3332 clz v2.8h, v2.8h 3333 clz v3.8h, v3.8h 3334 clz v4.8h, v4.8h 3335 clz v5.8h, v5.8h 3336 clz v6.8h, v6.8h 3337 clz v7.8h, v7.8h 3338 ushl v24.8h, v24.8h, v0.8h 3339 ushl v25.8h, v25.8h, v1.8h 3340 ushl v26.8h, v26.8h, v2.8h 3341 ushl v27.8h, v27.8h, v3.8h 3342 ushl v28.8h, v28.8h, v4.8h 3343 ushl v29.8h, v29.8h, v5.8h 3344 ushl v30.8h, v30.8h, v6.8h 3345 ushl v31.8h, v31.8h, v7.8h 3346 neg v0.8h, v0.8h 3347 neg v1.8h, v1.8h 3348 neg v2.8h, v2.8h 3349 neg v3.8h, v3.8h 3350 neg v4.8h, v4.8h 3351 neg v5.8h, v5.8h 3352 neg v6.8h, v6.8h 3353 neg v7.8h, v7.8h 3354 ushl v24.8h, v24.8h, v0.8h 3355 ushl v25.8h, v25.8h, v1.8h 3356 ushl v26.8h, v26.8h, v2.8h 3357 ushl v27.8h, v27.8h, v3.8h 3358 ushl v28.8h, v28.8h, v4.8h 3359 ushl v29.8h, v29.8h, v5.8h 3360 ushl v30.8h, v30.8h, v6.8h 3361 ushl v31.8h, v31.8h, v7.8h 3362 add v0.8h, v21.8h, v0.8h 3363 add v1.8h, v21.8h, v1.8h 3364 add v2.8h, v21.8h, v2.8h 3365 add v3.8h, v21.8h, v3.8h 3366 add v4.8h, v21.8h, v4.8h 3367 add v5.8h, v21.8h, v5.8h 3368 add v6.8h, v21.8h, v6.8h 3369 add v7.8h, v21.8h, v7.8h 3370 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64 3371 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64 3372 st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64 3373 st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64 33741: 3375 clz x2, x9 3376 add x15, x15, x2, lsl #1 3377 lsl x9, x9, x2 3378 ldrh w11, [x15, #-126] 33792: 3380 cmp x2, #0x10 3381 b.lt 3f 3382 sub x2, x2, #0x10 3383 checkbuf47 3384 put_bits x13, x14 3385 b 2b 33863: 3387 ldrh w3, [x15, #2]! 3388 add x2, x11, x2, lsl #4 3389 lsl x9, x9, #0x1 3390 ldr w12, [x5, x2, lsl #2] 3391 ldrb w10, [x4, x2] 3392 checkbuf31 3393 put_bits x12, x10 3394 put_bits x3, x11 3395 cbnz x9, 1b 33966: 3397 add x13, sp, #0x10e 3398 cmp x15, x13 3399 b.hs 1f 3400 ldr w12, [x5] 3401 ldrb w14, [x4] 3402 checkbuf47 3403 put_bits x12, x14 34041: 3405 str PUT_BUFFER, [x0, #0x10] 3406 str PUT_BITSw, [x0, #0x18] 3407 ldp x19, x20, [sp], 16 3408 add x0, BUFFER, #0x1 3409 add sp, sp, 256 3410 br x30 3411 3412.endm 3413 3414generate_jsimd_huff_encode_one_block 1 3415generate_jsimd_huff_encode_one_block 0 3416 3417 .unreq BUFFER 3418 .unreq PUT_BUFFER 3419 .unreq PUT_BITS 3420 .unreq PUT_BITSw 3421 3422.purgem emit_byte 3423.purgem put_bits 3424.purgem checkbuf31 3425.purgem checkbuf47 3426
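/*
 * A rough C model of the bit buffer implemented by the put_bits, emit_byte,
 * checkbuf31 and checkbuf47 macros above (an illustrative sketch, not the
 * canonical libjpeg-turbo C code): codes are shifted into the low end of a
 * 64-bit accumulator and whole bytes are flushed from the high end;
 * checkbuf31/checkbuf47 merely bound the pending bit count (to below
 * 32/48 bits) so that put_bits itself never has to branch.
 *
 *   #include <stdint.h>
 *
 *   typedef struct {
 *     uint64_t put_buffer;  // bit accumulator (PUT_BUFFER / x6)
 *     int put_bits;         // number of valid bits (PUT_BITS / x7)
 *     uint8_t *buffer;      // output pointer (BUFFER / x1), pre-decremented
 *   } bitbuf;
 *
 *   static void put_bits_c(bitbuf *b, uint64_t code, int size)
 *   {
 *     b->put_buffer = (b->put_buffer << size) | code;  // lsl + orr
 *     b->put_bits += size;
 *   }
 *
 *   static void emit_byte_c(bitbuf *b)
 *   {
 *     b->put_bits -= 8;
 *     uint8_t byte = (uint8_t)(b->put_buffer >> b->put_bits);
 *     *(++b->buffer) = byte;       // strb w19, [BUFFER, #1]!
 *     if (byte == 0xFF)
 *       *(++b->buffer) = 0;        // JPEG byte stuffing
 *   }
 *
 *   static void checkbuf47_c(bitbuf *b)  // flush while >= 48 bits pending
 *   {
 *     if (b->put_bits >= 48)
 *       for (int i = 0; i < 6; i++)
 *         emit_byte_c(b);
 *   }
 */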