/*
 * ARMv8 NEON optimizations for libjpeg-turbo
 *
 * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
 *                          All Rights Reserved.
 * Author:  Siarhei Siamashka <siarhei.siamashka@nokia.com>
 * Copyright (C) 2013-2014, Linaro Limited.  All Rights Reserved.
 * Author:  Ragesh Radhakrishnan <ragesh.r@linaro.org>
 * Copyright (C) 2014-2016, D. R. Commander.  All Rights Reserved.
 * Copyright (C) 2015-2016, 2018, Matthieu Darbois.  All Rights Reserved.
 * Copyright (C) 2016, Siarhei Siamashka.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */
29
/* On Linux/ELF, emit an empty .note.GNU-stack section so that the object is
 * flagged as not needing an executable stack. */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
#endif

/* Everything up to the following .text directive is constant data; place it
 * in the platform's read-only data section. */
#if defined(__APPLE__)
.section __DATA,__const
#else
.section .rodata, "a", %progbits
#endif
39
/*
 * Multipliers for the accurate integer ("islow") inverse DCT, in 13-bit fixed
 * point: FIX(x) = x * 2^13, rounded to the nearest integer.
 *
 * The table order matches the XFIX_P_* / XFIX_N_* lane aliases used by
 * jsimd_idct_islow_neon (first eight halfwords -> v0.h[0..7], next four ->
 * v1.h[0..3]); the constants whose alias is named XFIX_N_* are stored negated.
 * The F_* names are private to this table and are #undef'd right after it.
 */
#define F_0_298   2446  /* FIX(0.298631336) */
#define F_0_390   3196  /* FIX(0.390180644) */
#define F_0_541   4433  /* FIX(0.541196100) */
#define F_0_765   6270  /* FIX(0.765366865) */
#define F_0_899   7373  /* FIX(0.899976223) */
#define F_1_175   9633  /* FIX(1.175875602) */
#define F_1_501  12299  /* FIX(1.501321110) */
#define F_1_847  15137  /* FIX(1.847759065) */
#define F_1_961  16069  /* FIX(1.961570560) */
#define F_2_053  16819  /* FIX(2.053119869) */
#define F_2_562  20995  /* FIX(2.562915447) */
#define F_3_072  25172  /* FIX(3.072711026) */

.balign 16
Ljsimd_idct_islow_neon_consts:
  .short F_0_298
  .short -F_0_390
  .short F_0_541
  .short F_0_765
  .short -F_0_899
  .short F_1_175
  .short F_1_501
  .short -F_1_847
  .short -F_1_961
  .short F_2_053
  .short -F_2_562
  .short F_3_072
  /* Pad to 16 halfwords (32 bytes): the IDCT fetches the table with a single
   * two-register load of 8 halfwords each. */
  .short 0          /* padding */
  .short 0
  .short 0
  .short 0

#undef F_0_298
#undef F_0_390
#undef F_0_541
#undef F_0_765
#undef F_0_899
#undef F_1_175
#undef F_1_501
#undef F_1_847
#undef F_1_961
#undef F_2_053
#undef F_2_562
#undef F_3_072
84
85
/*
 * Lane aliases and multipliers for the fast integer ("ifast") inverse DCT.
 *
 * Each table entry is an 8-bit-fraction constant (256 == 1.0) with an integer
 * part (256 or 512) subtracted, and the remainder scaled by 128.  The aliases
 * name the v0 halfword lane that holds the corresponding entry once the table
 * has been loaded into v0.
 * NOTE(review): presumably these feed a doubling high-half multiply, with the
 * subtracted integer part added back separately -- confirm against the ifast
 * IDCT routine.
 */
#define XFIX_1_082392200  v0.h[0]
#define XFIX_1_414213562  v0.h[1]
#define XFIX_1_847759065  v0.h[2]
#define XFIX_2_613125930  v0.h[3]

.balign 16
Ljsimd_idct_ifast_neon_consts:
  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */
97
/*
 * Fixed-point parameters and multipliers shared by the reduced-size (4x4 and
 * 2x2) inverse DCT tables.  FIX(x) = x * 2^CONST_BITS, rounded.
 */
#define CONST_BITS  13
#define PASS1_BITS  2

#define FIX_0_211164243  (1730)   /* FIX(0.211164243) */
#define FIX_0_509795579  (4176)   /* FIX(0.509795579) */
#define FIX_0_601344887  (4926)   /* FIX(0.601344887) */
#define FIX_0_720959822  (5906)   /* FIX(0.720959822) */
#define FIX_0_765366865  (6270)   /* FIX(0.765366865) */
#define FIX_0_850430095  (6967)   /* FIX(0.850430095) */
#define FIX_0_899976223  (7373)   /* FIX(0.899976223) */
#define FIX_1_061594337  (8697)   /* FIX(1.061594337) */
#define FIX_1_272758580  (10426)  /* FIX(1.272758580) */
#define FIX_1_451774981  (11893)  /* FIX(1.451774981) */
#define FIX_1_847759065  (15137)  /* FIX(1.847759065) */
#define FIX_2_172734803  (17799)  /* FIX(2.172734803) */
#define FIX_2_562915447  (20995)  /* FIX(2.562915447) */
#define FIX_3_624509785  (29692)  /* FIX(3.624509785) */

/*
 * Twelve halfwords for the 4x4 IDCT; the trailing comments give the vector
 * lane each entry is expected to occupy after loading.
 * NOTE(review): the "d1[n]" comments use 32-bit NEON register naming;
 * presumably they correspond to v1.h[n] here -- confirm against the load in
 * the 4x4 IDCT routine.
 */
.balign 16
Ljsimd_idct_4x4_neon_consts:
  .short FIX_1_847759065        /* v0.h[0] */
  .short -FIX_0_765366865       /* v0.h[1] */
  .short -FIX_0_211164243       /* v0.h[2] */
  .short FIX_1_451774981        /* v0.h[3] */
  .short -FIX_2_172734803       /* d1[0] */
  .short FIX_1_061594337        /* d1[1] */
  .short -FIX_0_509795579       /* d1[2] */
  .short -FIX_0_601344887       /* d1[3] */
  .short FIX_0_899976223        /* v2.h[0] */
  .short FIX_2_562915447        /* v2.h[1] */
  .short 1 << (CONST_BITS + 1)  /* v2.h[2] */
  .short 0                      /* v2.h[3] */
130
/* Four 13-bit fixed-point multipliers for the 2x2 IDCT; the trailing comments
 * give the lane of v14 each entry is expected to occupy after loading.  The
 * table is 8 bytes long and 8-byte aligned. */
.balign 8
Ljsimd_idct_2x2_neon_consts:
  .short -FIX_0_720959822  /* v14[0] */
  .short FIX_0_850430095   /* v14[1] */
  .short -FIX_1_272758580  /* v14[2] */
  .short FIX_3_624509785   /* v14[3] */
137
/*
 * Constants for the YCbCr -> RGB colour conversion: four rows of four
 * halfwords (32 bytes), 16-byte aligned.
 * NOTE(review): the second row looks like the usual conversion coefficients
 * (1.40200, -0.34414, -0.71414, 1.77200) in mixed 2^14 / 2^15 fixed point,
 * and the -128 rows like the chroma centring offset -- confirm against the
 * conversion routine that loads this table.
 */
.balign 16
Ljsimd_ycc_colorid_neon_consts:
  .short 0,      0,     0,      0
  .short 22971, -11277, -23401, 29033
  .short -128,  -128,   -128,   -128
  .short -128,  -128,   -128,   -128
144
/*
 * Constants for the RGB -> YCbCr colour conversion: four rows of four
 * halfwords (32 bytes), 16-byte aligned.  Some entries (38470, 32768) only
 * fit in an unsigned 16-bit halfword.
 * NOTE(review): the first two rows look like the usual conversion
 * coefficients (0.29900, 0.58700, 0.11400, 0.16874, 0.33126, 0.50000,
 * 0.41869, 0.08131) scaled by 2^16, and the 32767/128 pairs like rounding and
 * centring terms -- confirm against the conversion routine that loads this
 * table.
 */
.balign 16
Ljsimd_colorid_ycc_neon_consts:
  .short 19595, 38470, 7471, 11059
  .short 21709, 32768, 27439, 5329
  .short 32767, 128, 32767, 128
  .short 32767, 128, 32767, 128
151
/*
 * Multipliers for the accurate integer ("islow") forward DCT, in 13-bit fixed
 * point: FIX(x) = x * 2^13, rounded to the nearest integer.  The values and
 * their order are the same as in Ljsimd_idct_islow_neon_consts.  The F_* names
 * are private to this table and are #undef'd right after it.
 */
#define F_0_298   2446  /* FIX(0.298631336) */
#define F_0_390   3196  /* FIX(0.390180644) */
#define F_0_541   4433  /* FIX(0.541196100) */
#define F_0_765   6270  /* FIX(0.765366865) */
#define F_0_899   7373  /* FIX(0.899976223) */
#define F_1_175   9633  /* FIX(1.175875602) */
#define F_1_501  12299  /* FIX(1.501321110) */
#define F_1_847  15137  /* FIX(1.847759065) */
#define F_1_961  16069  /* FIX(1.961570560) */
#define F_2_053  16819  /* FIX(2.053119869) */
#define F_2_562  20995  /* FIX(2.562915447) */
#define F_3_072  25172  /* FIX(3.072711026) */

.balign 16
Ljsimd_fdct_islow_neon_consts:
  .short F_0_298
  .short -F_0_390
  .short F_0_541
  .short F_0_765
  .short -F_0_899
  .short F_1_175
  .short F_1_501
  .short -F_1_847
  .short -F_1_961
  .short F_2_053
  .short -F_2_562
  .short F_3_072
  /* Pad the table to 16 halfwords (32 bytes). */
  .short 0          /* padding */
  .short 0
  .short 0
  .short 0

#undef F_0_298
#undef F_0_390
#undef F_0_541
#undef F_0_765
#undef F_0_899
#undef F_1_175
#undef F_1_501
#undef F_1_847
#undef F_1_961
#undef F_2_053
#undef F_2_562
#undef F_3_072
196
/*
 * Multipliers for the fast integer ("ifast") forward DCT.  Each entry is an
 * 8-bit-fraction constant (256 == 1.0) scaled by 128; the last one has its
 * integer part (256) subtracted first.
 * NOTE(review): presumably the subtracted integer part is added back
 * separately by the routine that uses the table -- confirm there.
 */
.balign 16
Ljsimd_fdct_ifast_neon_consts:
  .short (98 * 128)               /* XFIX_0_382683433 */
  .short (139 * 128)              /* XFIX_0_541196100 */
  .short (181 * 128)              /* XFIX_0_707106781 */
  .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
203
/*
 * Sixteen 16-byte index vectors, one per "diff" value 0..15.  In row k, entry
 * i holds min(i, 15 - k): the first 16 - k entries are the identity mapping
 * and the remaining k entries all repeat index 15 - k.
 * NOTE(review): presumably used as byte-permute (tbl) masks that replicate the
 * last valid sample across the missing tail of a row, with "diff" being the
 * number of missing samples -- confirm against the downsampling routine.
 */
.balign 16
Ljsimd_h2_downsample_neon_consts:
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F  /* diff 0 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E  /* diff 1 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D  /* diff 2 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C  /* diff 3 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B  /* diff 4 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A  /* diff 5 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09  /* diff 6 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08  /* diff 7 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07  /* diff 8 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
        0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06  /* diff 9 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
        0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05  /* diff 10 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
        0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04  /* diff 11 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
        0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03  /* diff 12 */
  .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02  /* diff 13 */
  .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
        0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01  /* diff 14 */
  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* diff 15 */
238
/*
 * Tables for the Huffman block encoder.
 *
 * Ljsimd_huff_encode_one_block_neon_consts: one 16-byte vector of single-bit
 * masks (0x01..0x80, twice) followed by twelve 16-byte vectors of byte
 * indices, where 255 marks an unused position.
 * NOTE(review): the index vectors are presumably tbl/tbx permutation masks
 * that gather 16-bit coefficients into zigzag order, the "Lm => Ln" comments
 * naming the source rows each one draws from -- confirm against the encoder.
 *
 * Ljsimd_huff_encode_one_block_neon_slowtbl_consts: the bit-mask vector alone.
 *
 * Both labels are 16-byte aligned.  The explicit .balign inserts no padding
 * while the preceding table keeps a size that is a multiple of 16 bytes; it
 * only makes the alignment independent of that.
 */
.balign 16
Ljsimd_huff_encode_one_block_neon_consts:
    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
    .byte    0,   1,   2,   3,  16,  17,  32,  33, \
            18,  19,   4,   5,   6,   7,  20,  21  /* L0 => L3 : 4 lines OK */
    .byte   34,  35,  48,  49, 255, 255,  50,  51, \
            36,  37,  22,  23,   8,   9,  10,  11  /* L0 => L3 : 4 lines OK */
    .byte    8,   9,  22,  23,  36,  37,  50,  51, \
           255, 255, 255, 255, 255, 255,  52,  53  /* L1 => L4 : 4 lines OK */
    .byte   54,  55,  40,  41,  26,  27,  12,  13, \
            14,  15,  28,  29,  42,  43,  56,  57  /* L0 => L3 : 4 lines OK */
    .byte    6,   7,  20,  21,  34,  35,  48,  49, \
            50,  51,  36,  37,  22,  23,   8,   9  /* L4 => L7 : 4 lines OK */
    .byte   42,  43,  28,  29,  14,  15,  30,  31, \
            44,  45,  58,  59, 255, 255, 255, 255  /* L1 => L4 : 4 lines OK */
    .byte  255, 255, 255, 255,  56,  57,  42,  43, \
            28,  29,  14,  15,  30,  31,  44,  45  /* L3 => L6 : 4 lines OK */
    .byte   26,  27,  40,  41,  42,  43,  28,  29, \
            14,  15,  30,  31,  44,  45,  46,  47  /* L5 => L7 : 3 lines OK */
    .byte  255, 255, 255, 255,   0,   1, 255, 255, \
           255, 255, 255, 255, 255, 255, 255, 255  /* L4 : 1 lines OK */
    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
             0,   1,  16,  17,   2,   3, 255, 255  /* L5 => L6 : 2 lines OK */
    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
           255, 255, 255, 255,   8,   9,  22,  23  /* L5 => L6 : 2 lines OK */
    .byte    4,   5,   6,   7, 255, 255, 255, 255, \
           255, 255, 255, 255, 255, 255, 255, 255  /* L7 : 1 line OK */
Ljsimd_huff_encode_one_block_neon_slowtbl_consts:
    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
269
/* End of the constant tables; everything below is code. */
.text


/* NOTE(review): no use of this switch is visible in this part of the file;
 * presumably it selects store sequences that avoid unaligned memory accesses
 * -- confirm at its points of use. */
#define RESPECT_STRICT_ALIGNMENT  1
274
275
276/*****************************************************************************/
277
/*
 * Supplementary macro for setting function attributes.
 *
 * Opens the definition of function \fname: emits the symbol-visibility
 * directives followed by the entry label.
 *   - Apple targets: the symbol gets a leading underscore and is declared
 *     .globl and .private_extern.
 *   - Other targets: the symbol is declared .global; on ELF it is additionally
 *     marked .hidden and typed as a function.
 */
.macro asm_function fname
#ifdef __APPLE__
    .private_extern _\fname
    .globl _\fname
_\fname:
#else
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm
293
/*
 * Load the address of \symbol into the 64-bit register \xi, PC-relative:
 * adrp produces the address of the 4 KiB page containing the symbol and the
 * add supplies the low 12 bits.  Apple's assembler spells the two relocations
 * @PAGE / @PAGEOFF, other targets use the plain symbol and :lo12:.
 * Clobbers nothing but \xi.
 */
.macro get_symbol_loc xi, symbol
#ifdef __APPLE__
    adrp            \xi, \symbol@PAGE
    add             \xi, \xi, \symbol@PAGEOFF
#else
    adrp            \xi, \symbol
    add             \xi, \xi, :lo12:\symbol
#endif
.endm
303
/*
 * Transpose elements of single 128 bit registers.
 *
 *   \x0, \x1   registers being transposed (both are overwritten)
 *   \xi        scratch register; only its element 0 is written
 *   \xilen     element suffix used for the two element moves (e.g. .d)
 *   \literal   arrangement suffix used for trn1/trn2
 *
 * Element 0 of \x0 is saved in \xi and element 1 of \x0 is moved into
 * element 0 of \x1; then \x0 = trn1(\x0, \x1) and \x1 = trn2(\xi, \x1).
 * Note that trn2 reads the whole of \xi, of which only element 0 was set
 * here.
 */
.macro transpose_single x0, x1, xi, xilen, literal
    ins             \xi\xilen[0], \x0\xilen[0]
    ins             \x1\xilen[0], \x0\xilen[1]
    trn1            \x0\literal, \x0\literal, \x1\literal
    trn2            \x1\literal, \xi\literal, \x1\literal
.endm
311
/*
 * Transpose elements of 2 different registers.
 *
 *   \x0, \x1   registers being transposed (both are overwritten)
 *   \xi        scratch register, left holding the original \x0
 *   \xilen     arrangement suffix used for the register copy
 *   \literal   arrangement suffix used for trn1/trn2
 *
 * Result: \x0 = trn1(\x0, \x1), \x1 = trn2(original \x0, \x1).  The copy in
 * \xi is needed because trn1 overwrites \x0 before trn2 reads it.
 */
.macro transpose x0, x1, xi, xilen, literal
    mov             \xi\xilen, \x0\xilen
    trn1            \x0\literal, \x0\literal, \x1\literal
    trn2            \x1\literal, \xi\literal, \x1\literal
.endm
318
/*
 * Transpose a block of 4x4 coefficients in four 64-bit registers.
 *
 * This is the second (wider-element) step: it interleaves the register pairs
 * (\x0, \x2) and (\x1, \x3):
 *   \x0 = trn1(\x0, \x2),  \x2 = trn2(original \x0, \x2)
 *   \x1 = trn1(\x1, \x3),  \x3 = trn2(original \x1, \x3)
 *
 *   \xN, \xNlen   register and the arrangement suffix used with it
 *   \xi, \xilen   scratch register and the suffix used to copy into it;
 *                 on exit it holds the original \x1
 */
.macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
    mov             \xi\xilen, \x0\xilen
    trn1            \x0\x0len, \x0\x0len, \x2\x2len
    trn2            \x2\x2len, \xi\x0len, \x2\x2len
    mov             \xi\xilen, \x1\xilen
    trn1            \x1\x1len, \x1\x1len, \x3\x3len
    trn2            \x3\x3len, \xi\x1len, \x3\x3len
.endm
328
/*
 * First (narrow-element) step of the 4x4 transpose: interleaves the adjacent
 * register pairs (\x0, \x1) and (\x2, \x3):
 *   \x0 = trn1(\x0, \x1),  \x1 = trn2(original \x0, \x1)
 *   \x2 = trn1(\x2, \x3),  \x3 = trn2(original \x2, \x3)
 *
 *   \xN, \xNlen   register and the arrangement suffix used with it
 *   \xi, \xilen   scratch register and the suffix used to copy into it;
 *                 on exit it holds the original \x2
 *
 * Every operand carries its own register's suffix (the scratch copy takes
 * the suffix of the register it was copied from), the same convention as
 * transpose_4x4_32.  trn1/trn2 require identical arrangements on all three
 * operands, so each pair must be given matching suffixes.
 */
.macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
    mov             \xi\xilen, \x0\xilen
    trn1            \x0\x0len, \x0\x0len, \x1\x1len
    trn2            \x1\x1len, \xi\x0len, \x1\x1len
    mov             \xi\xilen, \x2\xilen
    trn1            \x2\x2len, \x2\x2len, \x3\x3len
    trn2            \x3\x3len, \xi\x2len, \x3\x3len
.endm
337
/*
 * Transpose a 4x4 block of 16-bit elements held in the low 64 bits of
 * \x0..\x3, in two steps: halfword interleave of adjacent registers (.4h),
 * then word interleave of registers two apart (.2s).
 * \x5 is a scratch register; it is copied into with the .16b arrangement and
 * is clobbered.
 */
.macro transpose_4x4 x0, x1, x2, x3, x5
    transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b
    transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b
.endm
342
/*
 * Transpose an 8x8 matrix of 16-bit elements held in \l0..\l7 (one row per
 * 128-bit register).  On exit \lN holds column N of the input.
 * \t0..\t3 are scratch registers and are clobbered.  Register names are
 * passed without an arrangement suffix (e.g. v2, not v2.8h).
 *
 * The transpose runs in three interleave stages of doubling element width;
 * inside each stage the instruction order matters, because some destinations
 * are also sources of later instructions in the same stage.
 */
.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
    /* Stage 1: interleave halfwords of adjacent rows */
    trn1            \t0\().8h, \l0\().8h, \l1\().8h
    trn1            \t1\().8h, \l2\().8h, \l3\().8h
    trn1            \t2\().8h, \l4\().8h, \l5\().8h
    trn1            \t3\().8h, \l6\().8h, \l7\().8h
    trn2            \l1\().8h, \l0\().8h, \l1\().8h
    trn2            \l3\().8h, \l2\().8h, \l3\().8h
    trn2            \l5\().8h, \l4\().8h, \l5\().8h
    trn2            \l7\().8h, \l6\().8h, \l7\().8h

    /* Stage 2: interleave 32-bit pairs */
    trn1            \l4\().4s, \t2\().4s, \t3\().4s
    trn2            \t3\().4s, \t2\().4s, \t3\().4s
    trn1            \t2\().4s, \t0\().4s, \t1\().4s
    trn2            \l2\().4s, \t0\().4s, \t1\().4s
    trn1            \t0\().4s, \l1\().4s, \l3\().4s
    trn2            \l3\().4s, \l1\().4s, \l3\().4s
    trn2            \t1\().4s, \l5\().4s, \l7\().4s
    trn1            \l5\().4s, \l5\().4s, \l7\().4s

    /* Stage 3: interleave 64-bit halves into the final rows */
    trn2            \l6\().2d, \l2\().2d, \t3\().2d
    trn1            \l0\().2d, \t2\().2d, \l4\().2d
    trn1            \l1\().2d, \t0\().2d, \l5\().2d
    trn2            \l7\().2d, \l3\().2d, \t1\().2d
    trn1            \l2\().2d, \l2\().2d, \t3\().2d
    trn2            \l4\().2d, \t2\().2d, \l4\().2d
    trn1            \l3\().2d, \l3\().2d, \t1\().2d
    trn2            \l5\().2d, \t0\().2d, \l5\().2d
.endm
371
372
/* Offset added to every 8-bit output sample in the IDCT output stage. */
#define CENTERJSAMPLE  128
374
375/*****************************************************************************/
376
377/*
378 * Perform dequantization and inverse DCT on one block of coefficients.
379 *
380 * GLOBAL(void)
381 * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
382 *                       JSAMPARRAY output_buf, JDIMENSION output_col)
383 */
384
/* Fixed-point scaling used by the islow IDCT (same values as defined for the
 * constant tables). */
#define CONST_BITS  13
#define PASS1_BITS  2

/*
 * Lane aliases for the multipliers of Ljsimd_idct_islow_neon_consts once the
 * table has been loaded into v0 (first eight halfwords) and v1 (next four).
 * P marks a constant stored with positive sign, N one stored negated; the
 * order here must match the order of the table entries.
 */
#define XFIX_P_0_298  v0.h[0]
#define XFIX_N_0_390  v0.h[1]
#define XFIX_P_0_541  v0.h[2]
#define XFIX_P_0_765  v0.h[3]
#define XFIX_N_0_899  v0.h[4]
#define XFIX_P_1_175  v0.h[5]
#define XFIX_P_1_501  v0.h[6]
#define XFIX_N_1_847  v0.h[7]
#define XFIX_N_1_961  v1.h[0]
#define XFIX_P_2_053  v1.h[1]
#define XFIX_N_2_562  v1.h[2]
#define XFIX_P_3_072  v1.h[3]
400
401asm_function jsimd_idct_islow_neon
402    DCT_TABLE       .req x0
403    COEF_BLOCK      .req x1
404    OUTPUT_BUF      .req x2
405    OUTPUT_COL      .req x3
406    TMP1            .req x0
407    TMP2            .req x1
408    TMP3            .req x9
409    TMP4            .req x10
410    TMP5            .req x11
411    TMP6            .req x12
412    TMP7            .req x13
413    TMP8            .req x14
414
415    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
416       guarantee that the upper (unused) 32 bits of x3 are valid.  This
417       instruction ensures that those bits are set to zero. */
418    uxtw x3, w3
419
420    sub             sp, sp, #64
421    get_symbol_loc  x15, Ljsimd_idct_islow_neon_consts
422    mov             x10, sp
423    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
424    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
425    ld1             {v0.8h, v1.8h}, [x15]
426    ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
427    ld1             {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
428    ld1             {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64
429    ld1             {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64
430
431    cmeq            v16.8h, v3.8h, #0
432    cmeq            v26.8h, v4.8h, #0
433    cmeq            v27.8h, v5.8h, #0
434    cmeq            v28.8h, v6.8h, #0
435    cmeq            v29.8h, v7.8h, #0
436    cmeq            v30.8h, v8.8h, #0
437    cmeq            v31.8h, v9.8h, #0
438
439    and             v10.16b, v16.16b, v26.16b
440    and             v11.16b, v27.16b, v28.16b
441    and             v12.16b, v29.16b, v30.16b
442    and             v13.16b, v31.16b, v10.16b
443    and             v14.16b, v11.16b, v12.16b
444    mul             v2.8h, v2.8h, v18.8h
445    and             v15.16b, v13.16b, v14.16b
446    shl             v10.8h, v2.8h, #(PASS1_BITS)
447    sqxtn           v16.8b, v15.8h
448    mov             TMP1, v16.d[0]
449    mvn             TMP2, TMP1
450
451    cbnz            TMP2, 2f
452    /* case all AC coeffs are zeros */
453    dup             v2.2d, v10.d[0]
454    dup             v6.2d, v10.d[1]
455    mov             v3.16b, v2.16b
456    mov             v7.16b, v6.16b
457    mov             v4.16b, v2.16b
458    mov             v8.16b, v6.16b
459    mov             v5.16b, v2.16b
460    mov             v9.16b, v6.16b
4611:
462    /* for this transpose, we should organise data like this:
463     * 00, 01, 02, 03, 40, 41, 42, 43
464     * 10, 11, 12, 13, 50, 51, 52, 53
465     * 20, 21, 22, 23, 60, 61, 62, 63
466     * 30, 31, 32, 33, 70, 71, 72, 73
467     * 04, 05, 06, 07, 44, 45, 46, 47
468     * 14, 15, 16, 17, 54, 55, 56, 57
469     * 24, 25, 26, 27, 64, 65, 66, 67
470     * 34, 35, 36, 37, 74, 75, 76, 77
471     */
472    trn1            v28.8h, v2.8h, v3.8h
473    trn1            v29.8h, v4.8h, v5.8h
474    trn1            v30.8h, v6.8h, v7.8h
475    trn1            v31.8h, v8.8h, v9.8h
476    trn2            v16.8h, v2.8h, v3.8h
477    trn2            v17.8h, v4.8h, v5.8h
478    trn2            v18.8h, v6.8h, v7.8h
479    trn2            v19.8h, v8.8h, v9.8h
480    trn1            v2.4s, v28.4s, v29.4s
481    trn1            v6.4s, v30.4s, v31.4s
482    trn1            v3.4s, v16.4s, v17.4s
483    trn1            v7.4s, v18.4s, v19.4s
484    trn2            v4.4s, v28.4s, v29.4s
485    trn2            v8.4s, v30.4s, v31.4s
486    trn2            v5.4s, v16.4s, v17.4s
487    trn2            v9.4s, v18.4s, v19.4s
488    /* Even part: reverse the even part of the forward DCT. */
489    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
490    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
491    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
492    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
493    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
494    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
495    mov             v21.16b, v19.16b               /* tmp3 = z1 */
496    mov             v20.16b, v18.16b               /* tmp3 = z1 */
497    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
498    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
499    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
500    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
501    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
502    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
503    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
504    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
505    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
506    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
507    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
508    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
509    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
510    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
511    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
512
513    /* Odd part per figure 8; the matrix is unitary and hence its
514     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
515     */
516
517    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
518    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
519    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
520    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
521    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
522
523    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
524    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
525    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
526    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
527    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
528    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
529    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
530    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
531    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
532
533    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
534    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
535    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
536    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
537    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
538    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
539    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
540    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
541    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
542
543    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
544    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
545    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
546    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
547
548    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
549    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
550    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
551    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
552    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
553    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
554    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
555    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
556
557    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
558    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
559    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
560    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
561    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
562    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
563    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
564    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
565
566    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
567
568    add             v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
569    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
570    sub             v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
571    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
572    add             v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
573    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
574    sub             v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
575    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
576    add             v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
577    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
578    sub             v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
579    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
580    add             v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
581    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
582    sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
583    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
584
585    shrn            v2.4h, v18.4s, #16  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
586    shrn            v9.4h, v20.4s, #16  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
587    shrn            v3.4h, v22.4s, #16  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
588    shrn            v8.4h, v24.4s, #16  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
589    shrn            v4.4h, v26.4s, #16  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
590    shrn            v7.4h, v28.4s, #16  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
591    shrn            v5.4h, v14.4s, #16  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
592    shrn            v6.4h, v16.4s, #16  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
593    shrn2           v2.8h, v19.4s, #16  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
594    shrn2           v9.8h, v21.4s, #16  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
595    shrn2           v3.8h, v23.4s, #16  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
596    shrn2           v8.8h, v25.4s, #16  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
597    shrn2           v4.8h, v27.4s, #16  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
598    shrn2           v7.8h, v29.4s, #16  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
599    shrn2           v5.8h, v15.4s, #16  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
600    shrn2           v6.8h, v17.4s, #16  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
601    movi            v0.16b, #(CENTERJSAMPLE)
602    /* Prepare pointers (dual-issue with NEON instructions) */
603      ldp             TMP1, TMP2, [OUTPUT_BUF], 16
604    sqrshrn         v28.8b, v2.8h, #(CONST_BITS+PASS1_BITS+3-16)
605      ldp             TMP3, TMP4, [OUTPUT_BUF], 16
606    sqrshrn         v29.8b, v3.8h, #(CONST_BITS+PASS1_BITS+3-16)
607      add             TMP1, TMP1, OUTPUT_COL
608    sqrshrn         v30.8b, v4.8h, #(CONST_BITS+PASS1_BITS+3-16)
609      add             TMP2, TMP2, OUTPUT_COL
610    sqrshrn         v31.8b, v5.8h, #(CONST_BITS+PASS1_BITS+3-16)
611      add             TMP3, TMP3, OUTPUT_COL
612    sqrshrn2        v28.16b, v6.8h, #(CONST_BITS+PASS1_BITS+3-16)
613      add             TMP4, TMP4, OUTPUT_COL
614    sqrshrn2        v29.16b, v7.8h, #(CONST_BITS+PASS1_BITS+3-16)
615      ldp             TMP5, TMP6, [OUTPUT_BUF], 16
616    sqrshrn2        v30.16b, v8.8h, #(CONST_BITS+PASS1_BITS+3-16)
617      ldp             TMP7, TMP8, [OUTPUT_BUF], 16
618    sqrshrn2        v31.16b, v9.8h, #(CONST_BITS+PASS1_BITS+3-16)
619      add             TMP5, TMP5, OUTPUT_COL
620    add             v16.16b, v28.16b, v0.16b
621      add             TMP6, TMP6, OUTPUT_COL
622    add             v18.16b, v29.16b, v0.16b
623      add             TMP7, TMP7, OUTPUT_COL
624    add             v20.16b, v30.16b, v0.16b
625      add             TMP8, TMP8, OUTPUT_COL
626    add             v22.16b, v31.16b, v0.16b
627
628    /* Transpose the final 8-bit samples */
629    trn1            v28.16b, v16.16b, v18.16b
630    trn1            v30.16b, v20.16b, v22.16b
631    trn2            v29.16b, v16.16b, v18.16b
632    trn2            v31.16b, v20.16b, v22.16b
633
634    trn1            v16.8h, v28.8h, v30.8h
635    trn2            v18.8h, v28.8h, v30.8h
636    trn1            v20.8h, v29.8h, v31.8h
637    trn2            v22.8h, v29.8h, v31.8h
638
639    uzp1            v28.4s, v16.4s, v18.4s
640    uzp2            v30.4s, v16.4s, v18.4s
641    uzp1            v29.4s, v20.4s, v22.4s
642    uzp2            v31.4s, v20.4s, v22.4s
643
644    /* Store results to the output buffer */
645    st1             {v28.d}[0], [TMP1]
646    st1             {v29.d}[0], [TMP2]
647    st1             {v28.d}[1], [TMP3]
648    st1             {v29.d}[1], [TMP4]
649    st1             {v30.d}[0], [TMP5]
650    st1             {v31.d}[0], [TMP6]
651    st1             {v30.d}[1], [TMP7]
652    st1             {v31.d}[1], [TMP8]
653    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
654    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
655    blr             x30
656
657.balign 16
6582:
659    mul             v3.8h, v3.8h, v19.8h
660    mul             v4.8h, v4.8h, v20.8h
661    mul             v5.8h, v5.8h, v21.8h
662    add             TMP4, xzr, TMP2, LSL #32
663    mul             v6.8h, v6.8h, v22.8h
664    mul             v7.8h, v7.8h, v23.8h
665    adds            TMP3, xzr, TMP2, LSR #32
666    mul             v8.8h, v8.8h, v24.8h
667    mul             v9.8h, v9.8h, v25.8h
668    b.ne            3f
669    /* Right AC coef is zero */
670    dup             v15.2d, v10.d[1]
671    /* Even part: reverse the even part of the forward DCT. */
672    add             v18.4h, v4.4h, v8.4h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
673    add             v22.4h, v2.4h, v6.4h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
674    sub             v26.4h, v2.4h, v6.4h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
675    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
676    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
677    mov             v20.16b, v18.16b               /* tmp3 = z1 */
678    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
679    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
680    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
681    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
682    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
683    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
684    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
685
686    /* Odd part per figure 8; the matrix is unitary and hence its
687     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
688     */
689
690    add             v22.4h, v9.4h, v5.4h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
691    add             v24.4h, v7.4h, v3.4h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
692    add             v18.4h, v9.4h, v3.4h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
693    add             v20.4h, v7.4h, v5.4h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
694    add             v26.4h, v22.4h, v24.4h  /* z5 = z3 + z4 */
695
696    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
697    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
698    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
699    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
700    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
701    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
702    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
703    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
704    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
705
706    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
707    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
708
709    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
710    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
711    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
712    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
713
714    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
715    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
716    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
717    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
718
719    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
720
721    add             v18.4s, v2.4s, v16.4s  /* tmp10 + tmp3 */
722    sub             v20.4s, v2.4s, v16.4s  /* tmp10 - tmp3 */
723    add             v22.4s, v8.4s, v14.4s  /* tmp11 + tmp2 */
724    sub             v24.4s, v8.4s, v14.4s  /* tmp11 - tmp2 */
725    add             v26.4s, v4.4s, v12.4s  /* tmp12 + tmp1 */
726    sub             v28.4s, v4.4s, v12.4s  /* tmp12 - tmp1 */
727    add             v14.4s, v6.4s, v10.4s  /* tmp13 + tmp0 */
728    sub             v16.4s, v6.4s, v10.4s  /* tmp13 - tmp0 */
729
730    rshrn           v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
731    rshrn           v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
732    rshrn           v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
733    rshrn           v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
734    rshrn2          v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
735    rshrn2          v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
736    rshrn2          v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
737    rshrn2          v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
738    mov             v6.16b, v15.16b
739    mov             v7.16b, v15.16b
740    mov             v8.16b, v15.16b
741    mov             v9.16b, v15.16b
742    b               1b
743
744.balign 16
7453:
746    cbnz            TMP4, 4f
747    /* Left AC coef is zero */
748    dup             v14.2d, v10.d[0]
749    /* Even part: reverse the even part of the forward DCT. */
750    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
751    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
752    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
753    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
754    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
755    mov             v21.16b, v19.16b               /* tmp3 = z1 */
756    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
757    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
758    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
759    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
760    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
761    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
762    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
763
764    /* Odd part per figure 8; the matrix is unitary and hence its
765     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
766     */
767
768    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
769    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
770    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
771    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
772    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
773
774    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
775    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
776    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
777    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
778    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
779    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
780    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
781    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
782    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
783
784    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
785    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
786    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
787    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
788
789    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
790    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
791    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
792    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
793
794    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
795    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
796    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
797    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
798
799    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
800
801    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
802    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
803    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
804    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
805    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
806    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
807    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
808    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
809
810    mov             v2.16b, v14.16b
811    mov             v3.16b, v14.16b
812    mov             v4.16b, v14.16b
813    mov             v5.16b, v14.16b
814    rshrn           v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
815    rshrn           v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
816    rshrn           v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
817    rshrn           v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
818    rshrn2          v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
819    rshrn2          v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
820    rshrn2          v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
821    rshrn2          v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
822    b               1b
823
824.balign 16
8254:
826    /* "No" AC coef is zero */
827    /* Even part: reverse the even part of the forward DCT. */
828    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
829    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
830    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
831    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
832    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
833    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
834    mov             v21.16b, v19.16b               /* tmp3 = z1 */
835    mov             v20.16b, v18.16b               /* tmp3 = z1 */
836    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
837    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
838    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
839    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
840    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
841    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
842    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
843    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
844    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
845    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
846    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
847    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
848    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
849    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
850    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
851
852    /* Odd part per figure 8; the matrix is unitary and hence its
853     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
854     */
855
856    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
857    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
858    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
859    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
860    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
861
862    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
863    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
864    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
865    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
866    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
867    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
868    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
869    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
870    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
871
872    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
873    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
874    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
875    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
876    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
877    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
878    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
879    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
880    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
881
882    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
883    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
884    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
885    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
886
887    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
888    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
889    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
890    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
891    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
892    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
893    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
894    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
895
896    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
897    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
898    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
899    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
900    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
901    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
902    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
903    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
904
905    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
906
907    add             v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
908    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
909    sub             v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
910    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
911    add             v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
912    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
913    sub             v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
914    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
915    add             v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
916    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
917    sub             v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
918    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
919    add             v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
920    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
921    sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
922    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
923
924    rshrn           v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
925    rshrn           v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
926    rshrn           v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
927    rshrn           v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
928    rshrn           v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
929    rshrn           v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
930    rshrn           v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
931    rshrn           v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
932    rshrn2          v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
933    rshrn2          v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
934    rshrn2          v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
935    rshrn2          v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
936    rshrn2          v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
937    rshrn2          v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
938    rshrn2          v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
939    rshrn2          v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
940    b               1b
941
942    .unreq          DCT_TABLE
943    .unreq          COEF_BLOCK
944    .unreq          OUTPUT_BUF
945    .unreq          OUTPUT_COL
946    .unreq          TMP1
947    .unreq          TMP2
948    .unreq          TMP3
949    .unreq          TMP4
950    .unreq          TMP5
951    .unreq          TMP6
952    .unreq          TMP7
953    .unreq          TMP8
954
955#undef CENTERJSAMPLE
956#undef CONST_BITS
957#undef PASS1_BITS
958#undef XFIX_P_0_298
959#undef XFIX_N_0_390
960#undef XFIX_P_0_541
961#undef XFIX_P_0_765
962#undef XFIX_N_0_899
963#undef XFIX_P_1_175
964#undef XFIX_P_1_501
965#undef XFIX_N_1_847
966#undef XFIX_N_1_961
967#undef XFIX_P_2_053
968#undef XFIX_N_2_562
969#undef XFIX_P_3_072
970
971
972/*****************************************************************************/
973
974/*
975 * jsimd_idct_ifast_neon
976 *
977 * This function contains a fast, not so accurate integer implementation of
978 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
979 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
980 * function from jidctfst.c
981 *
 * Normally, a 1-D AAN DCT needs 5 multiplications and 29 additions.
 * The NEON version requires some extra additions, because the SQDMULH
 * instruction (VQDMULH in AArch32) cannot handle constants larger than 1.
 * Expressions such as "x * 1.082392200" therefore have to be converted to
 * "x * 0.082392200 + x", which introduces an extra addition.  Overall, there
 * are 6 extra additions per 1-D IDCT pass, for a total of 5 SQDMULH and
 * 35 ADD/SUB instructions per pass.
988 */
989
asm_function jsimd_idct_ifast_neon

    /*
     * Inputs, as bound by the register aliases below:
     *   x0  DCT_TABLE   - dequantization multipliers (64 halfwords are read)
     *   x1  COEF_BLOCK  - input coefficients (64 halfwords are read)
     *   x2  OUTPUT_BUF  - array of 8 output row pointers
     *   w3  OUTPUT_COL  - offset added to each row pointer
     * Output: 8 bytes are stored through each of the 8 adjusted row pointers.
     *
     * Registers named in this function: x0-x3, x9-x14, v0-v6, v16-v23 and
     * v28-v31 (v28-v31 are also handed to transpose_8x8 as scratch).  None of
     * them is callee-saved under AAPCS64, so nothing is spilled to the stack.
     * NOTE(review): get_symbol_loc and transpose_8x8 are defined elsewhere in
     * this file; confirm that they touch no registers beyond their operands.
     *
     * TMP1/TMP2 share x0/x1 with DCT_TABLE/COEF_BLOCK.  This is safe because
     * both tables are fully loaded before the row pointers are fetched.
     */

    DCT_TABLE       .req x0
    COEF_BLOCK      .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_COL      .req x3
    TMP1            .req x0
    TMP2            .req x1
    TMP3            .req x9
    TMP4            .req x10
    TMP5            .req x11
    TMP6            .req x12
    TMP7            .req x13
    TMP8            .req x14

    /*
     * One 1-D IDCT pass, applied in place to the eight .8h vectors v16-v23.
     * Both passes use exactly this sequence, so it is written once and
     * expanded twice.  v1-v6 are scratch.  The XFIX_* multiplier operands are
     * defined elsewhere in this file.
     * NOTE(review): they presumably select lanes of v0, which is loaded from
     * the constants table before the first expansion - confirm.
     * The macro is local to this function and is purged at the end.
     */
    .macro idct_ifast_1d_pass
    sub             v2.8h, v18.8h, v22.8h
    add             v22.8h, v18.8h, v22.8h
    sub             v1.8h, v19.8h, v21.8h
    add             v21.8h, v19.8h, v21.8h
    sub             v5.8h, v17.8h, v23.8h
    add             v23.8h, v17.8h, v23.8h
    sqdmulh         v4.8h, v2.8h, XFIX_1_414213562
    sqdmulh         v6.8h, v1.8h, XFIX_2_613125930
    add             v3.8h, v1.8h, v1.8h
    sub             v1.8h, v5.8h, v1.8h
    add             v18.8h, v2.8h, v4.8h
    sqdmulh         v4.8h, v1.8h, XFIX_1_847759065
    sub             v2.8h, v23.8h, v21.8h
    add             v3.8h, v3.8h, v6.8h
    sqdmulh         v6.8h, v2.8h, XFIX_1_414213562
    add             v1.8h, v1.8h, v4.8h
    sqdmulh         v4.8h, v5.8h, XFIX_1_082392200
    sub             v18.8h, v18.8h, v22.8h
    add             v2.8h, v2.8h, v6.8h
    sub             v6.8h, v16.8h, v20.8h
    add             v20.8h, v16.8h, v20.8h
    add             v17.8h, v5.8h, v4.8h
    add             v5.8h, v6.8h, v18.8h
    sub             v18.8h, v6.8h, v18.8h
    add             v6.8h, v23.8h, v21.8h
    add             v16.8h, v20.8h, v22.8h
    sub             v3.8h, v6.8h, v3.8h
    sub             v20.8h, v20.8h, v22.8h
    sub             v3.8h, v3.8h, v1.8h
    sub             v1.8h, v17.8h, v1.8h
    add             v2.8h, v3.8h, v2.8h
    sub             v23.8h, v16.8h, v6.8h
    add             v1.8h, v1.8h, v2.8h
    add             v16.8h, v16.8h, v6.8h
    add             v22.8h, v5.8h, v3.8h
    sub             v17.8h, v5.8h, v3.8h
    sub             v21.8h, v18.8h, v2.8h
    add             v18.8h, v18.8h, v2.8h
    sub             v19.8h, v20.8h, v1.8h
    add             v20.8h, v20.8h, v1.8h
    .endm

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x3, w3

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation (one 8 x 16-bit vector per row):
     *       0 1 2 3 4 5 6 7
     *      -----------------
     *   0 | v16.8h
     *   1 | v17.8h
     *   2 | v18.8h
     *   3 | v19.8h
     *   4 | v20.8h
     *   5 | v21.8h
     *   6 | v22.8h
     *   7 | v23.8h
     *
     * Coefficient loads, multiplier loads and the multiplies are interleaved.
     * v0-v3 are reused as multiplier staging registers, so each mul must stay
     * after the load that fills its multiplier and before the next load that
     * overwrites it.
     */
    /* TMP5 = address of the constants table; x11 is free until the row
       pointers are loaded near the end of the function. */
    get_symbol_loc  TMP5, Ljsimd_idct_ifast_neon_consts
    ld1             {v16.8h, v17.8h}, [COEF_BLOCK], 32
    ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
    ld1             {v18.8h, v19.8h}, [COEF_BLOCK], 32
    mul             v16.8h, v16.8h, v0.8h
    ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
    mul             v17.8h, v17.8h, v1.8h
    ld1             {v20.8h, v21.8h}, [COEF_BLOCK], 32
    mul             v18.8h, v18.8h, v2.8h
    ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
    mul             v19.8h, v19.8h, v3.8h
    ld1             {v22.8h, v23.8h}, [COEF_BLOCK], 32
    mul             v20.8h, v20.8h, v0.8h
    ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
    mul             v22.8h, v22.8h, v2.8h
    mul             v21.8h, v21.8h, v1.8h
    /* v0/v1 multipliers are consumed above, so v0 can now take the four
       halfword constants. */
    ld1             {v0.4h}, [TMP5]        /* load constants */
    mul             v23.8h, v23.8h, v3.8h

    /* 1-D IDCT, pass 1 */
    idct_ifast_1d_pass
    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30, v31
    /* 1-D IDCT, pass 2 */
    idct_ifast_1d_pass

    /* Descale to 8-bit and range limit.
     * sqshrn shifts right by 5 and saturates to the signed 8-bit range; the
     * byte-wise add of 0x80 then recentres the result to unsigned samples.
     * v0 no longer needs to hold the multiplier constants at this point. */
    movi            v0.16b, #0x80
      /* Prepare pointers (dual-issue with NEON instructions) */
      ldp             TMP1, TMP2, [OUTPUT_BUF], 16
    sqshrn          v28.8b, v16.8h, #5
      ldp             TMP3, TMP4, [OUTPUT_BUF], 16
    sqshrn          v29.8b, v17.8h, #5
      add             TMP1, TMP1, OUTPUT_COL
    sqshrn          v30.8b, v18.8h, #5
      add             TMP2, TMP2, OUTPUT_COL
    sqshrn          v31.8b, v19.8h, #5
      add             TMP3, TMP3, OUTPUT_COL
    sqshrn2         v28.16b, v20.8h, #5
      add             TMP4, TMP4, OUTPUT_COL
    sqshrn2         v29.16b, v21.8h, #5
      ldp             TMP5, TMP6, [OUTPUT_BUF], 16
    sqshrn2         v30.16b, v22.8h, #5
      ldp             TMP7, TMP8, [OUTPUT_BUF], 16
    sqshrn2         v31.16b, v23.8h, #5
      add             TMP5, TMP5, OUTPUT_COL
    add             v16.16b, v28.16b, v0.16b
      add             TMP6, TMP6, OUTPUT_COL
    add             v18.16b, v29.16b, v0.16b
      add             TMP7, TMP7, OUTPUT_COL
    add             v20.16b, v30.16b, v0.16b
      add             TMP8, TMP8, OUTPUT_COL
    add             v22.16b, v31.16b, v0.16b

    /* Transpose the final 8-bit samples.
     * Afterwards each 64-bit half of v28-v31 holds the 8 bytes of one output
     * row, in the order used by the stores below. */
    trn1            v28.16b, v16.16b, v18.16b
    trn1            v30.16b, v20.16b, v22.16b
    trn2            v29.16b, v16.16b, v18.16b
    trn2            v31.16b, v20.16b, v22.16b

    trn1            v16.8h, v28.8h, v30.8h
    trn2            v18.8h, v28.8h, v30.8h
    trn1            v20.8h, v29.8h, v31.8h
    trn2            v22.8h, v29.8h, v31.8h

    uzp1            v28.4s, v16.4s, v18.4s
    uzp2            v30.4s, v16.4s, v18.4s
    uzp1            v29.4s, v20.4s, v22.4s
    uzp2            v31.4s, v20.4s, v22.4s

    /* Store results to the output buffer (TMP1..TMP8 = rows 0..7) */
    st1             {v28.d}[0], [TMP1]
    st1             {v29.d}[0], [TMP2]
    st1             {v28.d}[1], [TMP3]
    st1             {v29.d}[1], [TMP4]
    st1             {v30.d}[0], [TMP5]
    st1             {v31.d}[0], [TMP6]
    st1             {v30.d}[1], [TMP7]
    st1             {v31.d}[1], [TMP8]
    /* Return to the caller through x30.  A plain `ret` is used because
       `blr x30` is a call-type branch that also overwrites x30 with the
       address of the following instruction. */
    ret

    .purgem         idct_ifast_1d_pass

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
    .unreq          TMP5
    .unreq          TMP6
    .unreq          TMP7
    .unreq          TMP8
1193
1194
1195/*****************************************************************************/
1196
1197/*
1198 * jsimd_idct_4x4_neon
1199 *
 * This function contains inverse-DCT code for producing reduced-size
 * 4x4 pixel output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse DCT, which
 *       requires far fewer arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON-optimized function is
 *       bit-exact compatibility with jpeg-6b.
 *
 * TODO: slightly better instruction scheduling can be achieved by expanding
 *       the idct_helper/transpose_4x4 macros and reordering instructions,
 *       but readability will suffer somewhat.
1213 */
1214
1215#define CONST_BITS  13
1216
/*
 * idct_helper (4x4 IDCT flavour)
 *
 * Runs one 1-D pass of the reduced-size inverse DCT on four lanes at once.
 *
 *   x4, x6, x8, x10, x12, x14, x16   source vectors (.4h, signed 16-bit)
 *   shift                            rounding right shift applied to the
 *                                    32-bit sums before narrowing to 16 bits
 *   y26, y27, y28, y29               destination vectors (.4h)
 *
 * The multiplier constants are read from lanes of v0-v2, which the caller
 * must have loaded beforehand.  v20, v24, v26, v28 and v30 are clobbered.
 * Destinations may alias sources: every source is read before the first
 * destination is written.
 */
.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    /* v28 and v30 are built from the same three products of x4, x8 and x14;
       v28 adds the x8/x14 terms, v30 (below) subtracts them. */
    smull           v28.4s, \x4, v2.h[2]
    smlal           v28.4s, \x8, v0.h[0]
    smlal           v28.4s, \x14, v0.h[1]

    /* v26: first weighted sum of x16, x12, x10 and x6 */
    smull           v26.4s, \x16, v1.h[2]
    smlal           v26.4s, \x12, v1.h[3]
    smlal           v26.4s, \x10, v2.h[0]
    smlal           v26.4s, \x6, v2.h[1]

    smull           v30.4s, \x4, v2.h[2]
    smlsl           v30.4s, \x8, v0.h[0]
    smlsl           v30.4s, \x14, v0.h[1]

    /* v24: second weighted sum of the same four inputs, other constants */
    smull           v24.4s, \x16, v0.h[2]
    smlal           v24.4s, \x12, v0.h[3]
    smlal           v24.4s, \x10, v1.h[0]
    smlal           v24.4s, \x6, v1.h[1]

    /* Butterfly 1: y26 = (v28 + v26) >> shift, y29 = (v28 - v26) >> shift */
    add             v20.4s, v28.4s, v26.4s
    sub             v28.4s, v28.4s, v26.4s

    /* A narrowing rounding shift to 16-bit lanes only encodes shifts of
       1..16, so larger shifts are done as a 32-bit rounding shift followed
       by a plain (truncating) narrow. */
  .if \shift > 16
    srshr           v20.4s, v20.4s, #\shift
    srshr           v28.4s, v28.4s, #\shift
    xtn             \y26, v20.4s
    xtn             \y29, v28.4s
  .else
    rshrn           \y26, v20.4s, #\shift
    rshrn           \y29, v28.4s, #\shift
  .endif

    /* Butterfly 2: y27 = (v30 + v24) >> shift, y28 = (v30 - v24) >> shift */
    add             v20.4s, v30.4s, v24.4s
    sub             v30.4s, v30.4s, v24.4s

  .if \shift > 16
    srshr           v20.4s, v20.4s, #\shift
    srshr           v30.4s, v30.4s, #\shift
    xtn             \y27, v20.4s
    xtn             \y28, v30.4s
  .else
    rshrn           \y27, v20.4s, #\shift
    rshrn           \y28, v30.4s, #\shift
  .endif
.endm
1262
asm_function jsimd_idct_4x4_neon

    /* Arguments */
    DCT_TABLE       .req x0     /* dequantization table (16-bit entries) */
    COEF_BLOCK      .req x1     /* 8x8 block of 16-bit coefficients */
    OUTPUT_BUF      .req x2     /* array of output row pointers */
    OUTPUT_COL      .req x3     /* byte offset added to every row pointer */
    /* Scratch: the argument registers are recycled as output row pointers
       once the arguments themselves have been consumed. */
    TMP1            .req x0
    TMP2            .req x1
    TMP3            .req x2
    TMP4            .req x15

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw            x3, w3

    /* Save the callee-saved NEON registers (low 64 bits of v8-v15) in a
       64-byte area on the stack; x9 walks the area so sp stays put. */
    sub             sp, sp, 64
    mov             x9, sp
    /* Load constants (v3.4h is just used for padding) */
    get_symbol_loc  TMP4, Ljsimd_idct_4x4_neon_consts
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | v4.4h   | v5.4h
     *   1 | v6.4h   | v7.4h
     *   2 | v8.4h   | v9.4h
     *   3 | v10.4h  | v11.4h
     *   4 | -       | -
     *   5 | v12.4h  | v13.4h
     *   6 | v14.4h  | v15.4h
     *   7 | v16.4h  | v17.4h
     */
    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
    ld1             {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
    add             COEF_BLOCK, COEF_BLOCK, #16   /* skip row 4 */
    ld1             {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
    ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16

    /* Dequantize: multiply each coefficient row by the matching row of
       DCT_TABLE, then mirror the high half into d[1] of the low register. */
    ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
    mul             v4.4h, v4.4h, v18.4h
    mul             v5.4h, v5.4h, v19.4h
    ins             v4.d[1], v5.d[0]              /* 128 bit q4 */
    ld1             {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
    mul             v6.4h, v6.4h, v20.4h
    mul             v7.4h, v7.4h, v21.4h
    ins             v6.d[1], v7.d[0]              /* 128 bit q6 */
    mul             v8.4h, v8.4h, v22.4h
    mul             v9.4h, v9.4h, v23.4h
    ins             v8.d[1], v9.d[0]              /* 128 bit q8 */
    add             DCT_TABLE, DCT_TABLE, #16     /* skip row 4 */
    ld1             {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
    mul             v10.4h, v10.4h, v24.4h
    mul             v11.4h, v11.4h, v25.4h
    ins             v10.d[1], v11.d[0]            /* 128 bit q10 */
    mul             v12.4h, v12.4h, v26.4h
    mul             v13.4h, v13.4h, v27.4h
    ins             v12.d[1], v13.d[0]            /* 128 bit q12 */
    ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
    mul             v14.4h, v14.4h, v28.4h
    mul             v15.4h, v15.4h, v29.4h
    ins             v14.d[1], v15.d[0]            /* 128 bit q14 */
    mul             v16.4h, v16.4h, v30.4h
    mul             v17.4h, v17.4h, v31.4h
    ins             v16.d[1], v17.d[0]            /* 128 bit q16 */

    /* Pass 1: run the helper on the left half (columns 0-3), then on the
       right half (columns 4-7), transposing each 4x4 result in place. */
    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \
                    v4.4h, v6.4h, v8.4h, v10.4h
    transpose_4x4   v4, v6, v8, v10, v3
    ins             v10.d[1], v11.d[0]
    idct_helper     v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \
                    v5.4h, v7.4h, v9.4h, v11.4h
    transpose_4x4   v5, v7, v9, v11, v3
    ins             v10.d[1], v11.d[0]

    /* Pass 2: the second group of inputs now comes from the transposed
       right half (v7, v9, v11); results land in v26-v29. */
    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \
                    v26.4h, v27.4h, v28.4h, v29.4h
    transpose_4x4   v26, v27, v28, v29, v3

    /* Range limit: add the 0x80 level shift and saturate to unsigned 8 bit.
       Afterwards v26.8b holds output rows 0 and 1, v27.8b rows 2 and 3. */
    movi            v30.8h, #0x80
    ins             v26.d[1], v27.d[0]
    ins             v28.d[1], v29.d[0]
    add             v26.8h, v26.8h, v30.8h
    add             v28.8h, v28.8h, v30.8h
    sqxtun          v26.8b, v26.8h
    sqxtun          v27.8b, v28.8h

    /* Store results to the output buffer.  TMP3 aliases OUTPUT_BUF, so the
       second ldp is the last use of the row-pointer array. */
    ldp             TMP1, TMP2, [OUTPUT_BUF], 16
    ldp             TMP3, TMP4, [OUTPUT_BUF]
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL

    /* NOTE(review): __ARMEL__ is the 32-bit ARM little-endian macro; AArch64
       toolchains are expected to define __AARCH64EL__ instead, so the first
       branch looks unreachable in an AArch64 build -- confirm before relying
       on it. */
#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use much less instructions on little endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses
     */
    st1             {v26.s}[0], [TMP1], 4
    st1             {v27.s}[0], [TMP3], 4
    st1             {v26.s}[1], [TMP2], 4
    st1             {v27.s}[1], [TMP4], 4
#else
    /* Byte-wise stores: rows 0 (TMP1) and 2 (TMP3) first ... */
    st1             {v26.b}[0], [TMP1], 1
    st1             {v27.b}[0], [TMP3], 1
    st1             {v26.b}[1], [TMP1], 1
    st1             {v27.b}[1], [TMP3], 1
    st1             {v26.b}[2], [TMP1], 1
    st1             {v27.b}[2], [TMP3], 1
    st1             {v26.b}[3], [TMP1], 1
    st1             {v27.b}[3], [TMP3], 1

    /* ... then rows 1 (TMP2) and 3 (TMP4) */
    st1             {v26.b}[4], [TMP2], 1
    st1             {v27.b}[4], [TMP4], 1
    st1             {v26.b}[5], [TMP2], 1
    st1             {v27.b}[5], [TMP4], 1
    st1             {v26.b}[6], [TMP2], 1
    st1             {v27.b}[6], [TMP4], 1
    st1             {v26.b}[7], [TMP2], 1
    st1             {v27.b}[7], [TMP4], 1
#endif

    /* Restore v8-v15; the post-increments release the 64-byte save area */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    /* Return with RET.  BLR x30 is a subroutine call: it would branch to the
       right address but also overwrite the link register, and it is hinted
       to the core as a call rather than as a return. */
    ret

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
1406
1407.purgem idct_helper
1408
1409
1410/*****************************************************************************/
1411
1412/*
1413 * jsimd_idct_2x2_neon
1414 *
 * This function contains inverse-DCT code for producing reduced-size
 * 2x2 pixel output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse DCT, which
 *       requires far fewer arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON-optimized function is
 *       bit-exact compatibility with jpeg-6b.
1424 */
1425
/*
 * idct_helper (2x2 IDCT flavour)
 *
 * Runs one 1-D pass of the reduced-size inverse DCT on four lanes at once.
 *
 *   x4                  source vector (.4h) that is only scaled by 2^15
 *   x6, x10, x12, x16   source vectors (.4h) multiplied by the constants
 *                       in v14.h[3], v14.h[2], v14.h[1] and v14.h[0]
 *   shift               rounding right shift applied before narrowing
 *   y26, y27            destination vectors (.4h): sum and difference
 *
 * v14 must hold the constants on entry.  v15, v20 and v26 are clobbered.
 */
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
    sshll           v15.4s, \x4, #15
    smull           v26.4s, \x6, v14.h[3]
    smlal           v26.4s, \x10, v14.h[2]
    smlal           v26.4s, \x12, v14.h[1]
    smlal           v26.4s, \x16, v14.h[0]

    /* y26 = (v15 + v26) >> shift, y27 = (v15 - v26) >> shift */
    add             v20.4s, v15.4s, v26.4s
    sub             v15.4s, v15.4s, v26.4s

    /* A narrowing rounding shift to 16-bit lanes only encodes shifts of
       1..16, so larger shifts are done as a 32-bit rounding shift followed
       by a plain (truncating) narrow. */
  .if \shift > 16
    srshr           v20.4s, v20.4s, #\shift
    srshr           v15.4s, v15.4s, #\shift
    xtn             \y26, v20.4s
    xtn             \y27, v15.4s
  .else
    rshrn           \y26, v20.4s, #\shift
    rshrn           \y27, v15.4s, #\shift
  .endif
.endm
1446
asm_function jsimd_idct_2x2_neon

    /* Arguments */
    DCT_TABLE       .req x0     /* dequantization table (16-bit entries) */
    COEF_BLOCK      .req x1     /* 8x8 block of 16-bit coefficients */
    OUTPUT_BUF      .req x2     /* array of output row pointers */
    OUTPUT_COL      .req x3     /* byte offset added to every row pointer */
    /* Scratch */
    TMP1            .req x0     /* reuses DCT_TABLE once it is consumed */
    TMP2            .req x15

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw            x3, w3

    /* Save the callee-saved NEON registers (low 64 bits of v8-v15) in a
       64-byte area on the stack; x9 walks the area so sp stays put. */
    sub             sp, sp, 64
    mov             x9, sp

    /* Load constants */
    get_symbol_loc  TMP2, Ljsimd_idct_2x2_neon_consts
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
    ld1             {v14.4h}, [TMP2]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | v4.4h   | v5.4h
     *   1 | v6.4h   | v7.4h
     *   2 | -       | -
     *   3 | v10.4h  | v11.4h
     *   4 | -       | -
     *   5 | v12.4h  | v13.4h
     *   6 | -       | -
     *   7 | v16.4h  | v17.4h
     */
    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
    add             COEF_BLOCK, COEF_BLOCK, #16   /* skip row 2 */
    ld1             {v10.4h, v11.4h}, [COEF_BLOCK], 16
    add             COEF_BLOCK, COEF_BLOCK, #16   /* skip row 4 */
    ld1             {v12.4h, v13.4h}, [COEF_BLOCK], 16
    add             COEF_BLOCK, COEF_BLOCK, #16   /* skip row 6 */
    ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16

    /* Dequantize: the table is walked with the same row skips as the
       coefficient block. */
    ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
    mul             v4.4h, v4.4h, v18.4h
    mul             v5.4h, v5.4h, v19.4h
    ins             v4.d[1], v5.d[0]
    mul             v6.4h, v6.4h, v20.4h
    mul             v7.4h, v7.4h, v21.4h
    ins             v6.d[1], v7.d[0]
    add             DCT_TABLE, DCT_TABLE, #16
    ld1             {v24.4h, v25.4h}, [DCT_TABLE], 16
    mul             v10.4h, v10.4h, v24.4h
    mul             v11.4h, v11.4h, v25.4h
    ins             v10.d[1], v11.d[0]
    add             DCT_TABLE, DCT_TABLE, #16
    ld1             {v26.4h, v27.4h}, [DCT_TABLE], 16
    mul             v12.4h, v12.4h, v26.4h
    mul             v13.4h, v13.4h, v27.4h
    ins             v12.d[1], v13.d[0]
    add             DCT_TABLE, DCT_TABLE, #16
    ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
    mul             v16.4h, v16.4h, v30.4h
    mul             v17.4h, v17.4h, v31.4h
    ins             v16.d[1], v17.d[0]

    /* Pass 1: open-coded, interleaved form of two idct_helper expansions
       with shift 13.  Columns 0-3 accumulate in v26/v15 and produce v4/v6;
       columns 4-7 accumulate in v24/v30 and produce v5/v7. */
    smull           v26.4s, v6.4h, v14.h[3]
    smlal           v26.4s, v10.4h, v14.h[2]
    smlal           v26.4s, v12.4h, v14.h[1]
    smlal           v26.4s, v16.4h, v14.h[0]
    smull           v24.4s, v7.4h, v14.h[3]
    smlal           v24.4s, v11.4h, v14.h[2]
    smlal           v24.4s, v13.4h, v14.h[1]
    smlal           v24.4s, v17.4h, v14.h[0]
    sshll           v15.4s, v4.4h, #15
    sshll           v30.4s, v5.4h, #15
    add             v20.4s, v15.4s, v26.4s
    sub             v15.4s, v15.4s, v26.4s
    rshrn           v4.4h, v20.4s, #13
    rshrn           v6.4h, v15.4s, #13
    add             v20.4s, v30.4s, v24.4s
    sub             v15.4s, v30.4s, v24.4s
    rshrn           v5.4h, v20.4s, #13
    rshrn           v7.4h, v15.4s, #13
    /* Rejoin the halves and regroup the pass-1 results so that pass 2 can
       take its five inputs from v4, v6, v10, v7 and v11. */
    ins             v4.d[1], v5.d[0]
    ins             v6.d[1], v7.d[0]
    transpose       v4, v6, v3, .16b, .8h
    transpose       v6, v10, v3, .16b, .4s
    ins             v11.d[0], v10.d[1]
    ins             v7.d[0], v6.d[1]

    /* Pass 2 */
    idct_helper     v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h

    /* Range limit: add the 0x80 level shift and saturate to unsigned 8 bit.
       The narrowing is done twice so that the first pass-2 output ends up in
       v26.b[0..1] and the second one in v27.b[4..5], which is where the
       stores below pick them up. */
    movi            v30.8h, #0x80
    ins             v26.d[1], v27.d[0]
    add             v26.8h, v26.8h, v30.8h
    sqxtun          v30.8b, v26.8h
    ins             v26.d[0], v30.d[0]
    sqxtun          v27.8b, v26.8h

    /* Store results to the output buffer: two pixels in each of two rows */
    ldp             TMP1, TMP2, [OUTPUT_BUF]
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL

    st1             {v26.b}[0], [TMP1], 1
    st1             {v27.b}[4], [TMP1], 1
    st1             {v26.b}[1], [TMP2], 1
    st1             {v27.b}[5], [TMP2], 1

    /* Restore v8-v15; the post-increments release the 64-byte save area */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    /* Return with RET.  BLR x30 is a subroutine call: it would branch to the
       right address but also overwrite the link register, and it is hinted
       to the core as a call rather than as a return. */
    ret

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
1578
1579.purgem idct_helper
1580
1581
1582/*****************************************************************************/
1583
1584/*
1585 * jsimd_ycc_extrgb_convert_neon
1586 * jsimd_ycc_extbgr_convert_neon
1587 * jsimd_ycc_extrgbx_convert_neon
1588 * jsimd_ycc_extbgrx_convert_neon
1589 * jsimd_ycc_extxbgr_convert_neon
1590 * jsimd_ycc_extxrgb_convert_neon
1591 *
1592 * Colorspace conversion YCbCr -> RGB
1593 */
1594
/*
 * do_load size
 *
 * Fetch "size" samples (8, 4, 2 or 1) from each of the U, V and Y pointers
 * into v4, v5 and v0 respectively, advancing the pointers.  A full group of
 * 8 fills lanes 0-7 and issues prefetches; the partial sizes fill lanes
 * 0-3, 4-5 and 6, so that 4 + 2 + 1 leftovers can share one register.
 */
.macro do_load size
  .if \size == 8
    ld1             {v4.8b}, [U], 8
    ld1             {v5.8b}, [V], 8
    ld1             {v0.8b}, [Y], 8
    prfm            pldl1keep, [U, #64]
    prfm            pldl1keep, [V, #64]
    prfm            pldl1keep, [Y, #64]
  .elseif \size == 4
    .irp lane, 0, 1, 2, 3
    ld1             {v4.b}[\lane], [U], 1
    .endr
    .irp lane, 0, 1, 2, 3
    ld1             {v5.b}[\lane], [V], 1
    .endr
    .irp lane, 0, 1, 2, 3
    ld1             {v0.b}[\lane], [Y], 1
    .endr
  .elseif \size == 2
    .irp lane, 4, 5
    ld1             {v4.b}[\lane], [U], 1
    .endr
    .irp lane, 4, 5
    ld1             {v5.b}[\lane], [V], 1
    .endr
    .irp lane, 4, 5
    ld1             {v0.b}[\lane], [Y], 1
    .endr
  .elseif \size == 1
    ld1             {v4.b}[6], [U], 1
    ld1             {v5.b}[6], [V], 1
    ld1             {v0.b}[6], [Y], 1
  .else
    .error unsupported macroblock size
  .endif
.endm
1631
/*
 * do_store bpp, size, fast_st3
 *
 * Write "size" converted pixels (8, 4, 2 or 1) to the RGB pointer and
 * advance it.
 *   bpp 24: channels come from v10, v11, v12
 *   bpp 32: channels come from v10, v11, v12, v13
 *   bpp 16: packed pixels come from v25
 * Partial sizes use lanes 0-3, 4-5 and 6, mirroring do_load.
 * fast_st3 is only consulted for a full group of 8 pixels at 24 bpp:
 * 1 uses a single interleaving st3, anything else stores byte by byte.
 */
.macro do_store bpp, size, fast_st3
  .if \bpp == 24
    .if \size == 8
      .if \fast_st3 == 1
        st3         {v10.8b, v11.8b, v12.8b}, [RGB], 24
      .else
        .irp lane, 0, 1, 2, 3, 4, 5, 6, 7
        st1         {v10.b}[\lane], [RGB], #1
        st1         {v11.b}[\lane], [RGB], #1
        st1         {v12.b}[\lane], [RGB], #1
        .endr
      .endif
    .elseif \size == 4
      .irp lane, 0, 1, 2, 3
      st3           {v10.b, v11.b, v12.b}[\lane], [RGB], 3
      .endr
    .elseif \size == 2
      .irp lane, 4, 5
      st3           {v10.b, v11.b, v12.b}[\lane], [RGB], 3
      .endr
    .elseif \size == 1
      st3           {v10.b, v11.b, v12.b}[6], [RGB], 3
    .else
     .error unsupported macroblock size
    .endif
  .elseif \bpp == 32
    .if \size == 8
      st4           {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
    .elseif \size == 4
      .irp lane, 0, 1, 2, 3
      st4           {v10.b, v11.b, v12.b, v13.b}[\lane], [RGB], 4
      .endr
    .elseif \size == 2
      .irp lane, 4, 5
      st4           {v10.b, v11.b, v12.b, v13.b}[\lane], [RGB], 4
      .endr
    .elseif \size == 1
      st4           {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
    .else
      .error unsupported macroblock size
    .endif
  .elseif \bpp == 16
    .if \size == 8
      st1           {v25.8h}, [RGB], 16
    .elseif \size == 4
      st1           {v25.4h}, [RGB], 8
    .elseif \size == 2
      .irp lane, 4, 5
      st1           {v25.h}[\lane], [RGB], 2
      .endr
    .elseif \size == 1
      st1           {v25.h}[6], [RGB], 2
    .else
      .error unsupported macroblock size
    .endif
  .else
    .error unsupported bpp
  .endif
.endm
1716
/*
 * generate_jsimd_ycc_rgb_convert_neon
 *
 * Emits one YCbCr -> RGB conversion function.
 *
 *   colorid    name fragment: the function is called
 *              jsimd_ycc_COLORID_convert_neon, with a _slowst3 suffix when
 *              fast_st3 is not 1
 *   bpp        output bits per pixel: 24, 32 or 16 (RGB565)
 *   r_offs, g_offs, b_offs
 *              units digit of the v1x register that receives the channel
 *              (0 selects v10, 1 selects v11, ...); unused when bpp is 16
 *   rsize, gsize, bsize
 *              not referenced by this macro
 *   defsize    arrangement suffix pasted onto those registers
 *   fast_st3   forwarded to do_store (1 = st3, otherwise byte-wise st1)
 *
 * Generated function, register view:
 *   w0 = OUTPUT_WIDTH   pixels per row
 *   x1 = INPUT_BUF      three plane pointers (Y, U, V), each an array of
 *                       row pointers
 *   w2 = INPUT_ROW      index of the first input row
 *   x3 = OUTPUT_BUF     array of output row pointers
 *   w4 = NUM_ROWS       number of rows to convert
 */
.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \
                                           g_offs, gsize, b_offs, bsize, \
                                           defsize, fast_st3

/*
 * 2-stage pipelined YCbCr->RGB conversion
 */

/* Stage 1: remove the chroma bias and form the 32-bit chroma products.
 * v20/v22 feed green, v24/v26 feed red and v28/v30 feed blue (low and high
 * four pixels respectively).
 */
.macro do_yuv_to_rgb_stage1
    uaddw           v6.8h, v2.8h, v4.8b     /* v6 = u - 128 */
    uaddw           v8.8h, v2.8h, v5.8b     /* v8 = v - 128 */
    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
.endm

/* Stage 2: round and narrow the products, add luma (v0), then either
 * saturate each channel to 8 bits (24/32 bpp) or pack RGB565 into v25.
 */
.macro do_yuv_to_rgb_stage2
    rshrn           v20.4h, v20.4s, #15
    rshrn2          v20.8h, v22.4s, #15
    rshrn           v24.4h, v24.4s, #14
    rshrn2          v24.8h, v26.4s, #14
    rshrn           v28.4h, v28.4s, #14
    rshrn2          v28.8h, v30.4s, #14
    uaddw           v20.8h, v20.8h, v0.8b
    uaddw           v24.8h, v24.8h, v0.8b
    uaddw           v28.8h, v28.8h, v0.8b
  .if \bpp != 16
    sqxtun          v1\g_offs\defsize, v20.8h
    sqxtun          v1\r_offs\defsize, v24.8h
    sqxtun          v1\b_offs\defsize, v28.8h
  .else
    /* Saturated 8-bit channels are placed in the top byte of each halfword;
       the two SRIs then slide green and blue in below the top 5 bits of red
       and the top 6 bits of green. */
    sqshlu          v21.8h, v20.8h, #8
    sqshlu          v25.8h, v24.8h, #8
    sqshlu          v29.8h, v28.8h, #8
    sri             v25.8h, v21.8h, #5
    sri             v25.8h, v29.8h, #11
  .endif
.endm

/* Steady-state loop body: stage 2 and the store for the current 8 pixels,
 * interleaved with the loads and stage 1 for the next 8 pixels.
 */
.macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3
    rshrn           v20.4h, v20.4s, #15
    rshrn           v24.4h, v24.4s, #14
    rshrn           v28.4h, v28.4s, #14
    ld1             {v4.8b}, [U], 8
    rshrn2          v20.8h, v22.4s, #15
    rshrn2          v24.8h, v26.4s, #14
    rshrn2          v28.8h, v30.4s, #14
    ld1             {v5.8b}, [V], 8
    uaddw           v20.8h, v20.8h, v0.8b
    uaddw           v24.8h, v24.8h, v0.8b
    uaddw           v28.8h, v28.8h, v0.8b
  .if \bpp != 16  /**************** rgb24/rgb32 ******************************/
    sqxtun          v1\g_offs\defsize, v20.8h
    ld1             {v0.8b}, [Y], 8
    sqxtun          v1\r_offs\defsize, v24.8h
    prfm            pldl1keep, [U, #64]
    prfm            pldl1keep, [V, #64]
    prfm            pldl1keep, [Y, #64]
    sqxtun          v1\b_offs\defsize, v28.8h
    uaddw           v6.8h, v2.8h, v4.8b     /* v6 = u - 128 */
    uaddw           v8.8h, v2.8h, v5.8b     /* v8 = v - 128 */
    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
  .else  /**************************** rgb565 ********************************/
    sqshlu          v21.8h, v20.8h, #8
    sqshlu          v25.8h, v24.8h, #8
    sqshlu          v29.8h, v28.8h, #8
    uaddw           v6.8h, v2.8h, v4.8b     /* v6 = u - 128 */
    uaddw           v8.8h, v2.8h, v5.8b     /* v8 = v - 128 */
    ld1             {v0.8b}, [Y], 8
    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    sri             v25.8h, v21.8h, #5
    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
    prfm            pldl1keep, [U, #64]
    prfm            pldl1keep, [V, #64]
    prfm            pldl1keep, [Y, #64]
    sri             v25.8h, v29.8h, #11
  .endif
    do_store        \bpp, 8, \fast_st3
    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
.endm

/* Non-pipelined conversion of whatever is currently loaded */
.macro do_yuv_to_rgb
    do_yuv_to_rgb_stage1
    do_yuv_to_rgb_stage2
.endm

/* Every generated variant reads the same constant table,
 * Ljsimd_ycc_colorid_neon_consts; the label is a literal name and is not
 * derived from the colorid argument.
 */

.if \fast_st3 == 1
asm_function jsimd_ycc_\colorid\()_convert_neon
.else
asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
.endif
    OUTPUT_WIDTH    .req w0
    INPUT_BUF       .req x1
    INPUT_ROW       .req w2
    OUTPUT_BUF      .req x3
    NUM_ROWS        .req w4

    INPUT_BUF0      .req x5
    INPUT_BUF1      .req x6
    INPUT_BUF2      .req x1

    RGB             .req x7
    Y               .req x9
    U               .req x10
    V               .req x11
    N               .req w15

    /* Reserve 64 bytes for v8-v15.  x9 is only a temporary cursor here; it
       becomes Y once the scanline loop starts. */
    sub             sp, sp, 64
    mov             x9, sp

    /* Load constants to v1.4h and v2.8h (v0.4h is just padding and is
       overwritten by the first Y load) */
    get_symbol_loc x15, Ljsimd_ycc_colorid_neon_consts

    /* Save NEON registers */
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
    ld1             {v0.4h, v1.4h}, [x15], 16
    ld1             {v2.8h}, [x15]

    /* Fetch the three plane pointers.  INPUT_BUF2 aliases INPUT_BUF (x1),
       so it must be loaded last. */
    ldr             INPUT_BUF0, [INPUT_BUF]
    ldr             INPUT_BUF1, [INPUT_BUF, #8]
    ldr             INPUT_BUF2, [INPUT_BUF, #16]
    .unreq          INPUT_BUF

    /* Preset v10 and v13 to 0xFF: for 32-bit pixels, whichever of the two
       is not an R/G/B destination supplies the filler byte. */
    movi            v10.16b, #255
    movi            v13.16b, #255

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    b.lt            9f
0:
    ldr             Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
    ldr             U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
    mov             N, OUTPUT_WIDTH
    ldr             V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
    add             INPUT_ROW, INPUT_ROW, #1
    ldr             RGB, [OUTPUT_BUF], #8

    /* Inner loop over pixels.  N is decremented in steps of 8 and may go
       negative; its low three bits always equal the number of leftover
       pixels, which is what the tst instructions below examine. */
    subs            N, N, #8
    b.lt            3f
    do_load         8
    do_yuv_to_rgb_stage1
    subs            N, N, #8
    b.lt            2f
1:
    do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3
    subs            N, N, #8
    b.ge            1b
2:
    /* Drain the pipeline: finish and store the last full group of 8 */
    do_yuv_to_rgb_stage2
    do_store        \bpp, 8, \fast_st3
    tst             N, #7
    b.eq            8f
3:
    /* Leftovers: load 4, 2 and 1 pixels into disjoint lanes, convert them
       in one go, then store the same lane groups. */
    tst             N, #4
    b.eq            3f
    do_load         4
3:
    tst             N, #2
    b.eq            4f
    do_load         2
4:
    tst             N, #1
    b.eq            5f
    do_load         1
5:
    do_yuv_to_rgb
    tst             N, #4
    b.eq            6f
    do_store        \bpp, 4, \fast_st3
6:
    tst             N, #2
    b.eq            7f
    do_store        \bpp, 2, \fast_st3
7:
    tst             N, #1
    b.eq            8f
    do_store        \bpp, 1, \fast_st3
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    b.gt            0b
9:
    /* Restore all registers and return */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    /* RET instead of BR x30: both branch to the link register, but only RET
       is the architectural subroutine return; BR is a plain indirect
       jump. */
    ret
    .unreq          OUTPUT_WIDTH
    .unreq          INPUT_ROW
    .unreq          OUTPUT_BUF
    .unreq          NUM_ROWS
    .unreq          INPUT_BUF0
    .unreq          INPUT_BUF1
    .unreq          INPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N

.purgem do_yuv_to_rgb
.purgem do_yuv_to_rgb_stage1
.purgem do_yuv_to_rgb_stage2
.purgem do_yuv_to_rgb_stage2_store_load_stage1

.endm
1943
1944/*--------------------------------- id ----- bpp R  rsize G  gsize B  bsize defsize fast_st3*/
1945generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
1946generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
1947generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
1948generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
1949generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h,  2, .4h,  1, .4h,  .8b,    1
1950generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,  2, .4h,  3, .4h,  .8b,    1
1951generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h,  0, .4h,  0, .4h,  .8b,    1
1952
1953generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    0
1954generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    0
1955
1956.purgem do_load
1957.purgem do_store
1958
1959
1960/*****************************************************************************/
1961
1962/*
1963 * jsimd_extrgb_ycc_convert_neon
1964 * jsimd_extbgr_ycc_convert_neon
1965 * jsimd_extrgbx_ycc_convert_neon
1966 * jsimd_extbgrx_ycc_convert_neon
1967 * jsimd_extxbgr_ycc_convert_neon
1968 * jsimd_extxrgb_ycc_convert_neon
1969 *
1970 * Colorspace conversion RGB -> YCbCr
1971 */
1972
/*
 * do_store size
 *
 * Write "size" converted samples (8, 4, 2 or 1) from v20, v21 and v22 to the
 * Y, U and V pointers respectively, advancing the pointers.  A full group of
 * 8 uses lanes 0-7; the partial sizes use lanes 0-3, 4-5 and 6.
 */
.macro do_store size
  .if \size == 8
    st1             {v20.8b}, [Y], #8
    st1             {v21.8b}, [U], #8
    st1             {v22.8b}, [V], #8
  .elseif \size == 4
    .irp lane, 0, 1, 2, 3
    st1             {v20.b}[\lane], [Y], #1
    .endr
    .irp lane, 0, 1, 2, 3
    st1             {v21.b}[\lane], [U], #1
    .endr
    .irp lane, 0, 1, 2, 3
    st1             {v22.b}[\lane], [V], #1
    .endr
  .elseif \size == 2
    .irp lane, 4, 5
    st1             {v20.b}[\lane], [Y], #1
    .endr
    .irp lane, 4, 5
    st1             {v21.b}[\lane], [U], #1
    .endr
    .irp lane, 4, 5
    st1             {v22.b}[\lane], [V], #1
    .endr
  .elseif \size == 1
    st1             {v20.b}[6], [Y], #1
    st1             {v21.b}[6], [U], #1
    st1             {v22.b}[6], [V], #1
  .else
    .error unsupported macroblock size
  .endif
.endm
2006
/*
 * do_load bpp, size, fast_ld3
 *
 * Read interleaved pixels through the RGB pointer and de-interleave them so
 * that byte N of every pixel ends up in register v1N (v10, v11, v12 and, for
 * 32 bpp, v13).  RGB is post-incremented past the pixels consumed.
 *
 *   bpp       24 or 32 (bits per pixel); anything else aborts the assembly
 *   size      8: fill all eight byte lanes, then prefetch [RGB, #128]
 *             4: fill lanes 0-3
 *             2: fill lanes 4-5
 *             1: fill lane 6
 *             anything else aborts the assembly
 *   fast_ld3  only consulted for bpp == 24 with size == 8:
 *             1 uses a single ld3, any other value loads byte by byte
 */
.macro do_load bpp, size, fast_ld3
  .if \bpp == 24
    .if \size == 8
      .if \fast_ld3 == 1
        ld3         {v10.8b, v11.8b, v12.8b}, [RGB], #24
      .else
        /* Same result as the ld3 above, built from 24 single-byte loads:
         * for each lane the three channel bytes are read in memory order. */
        .irp lane, 0, 1, 2, 3, 4, 5, 6, 7
        ld1         {v10.b}[\lane], [RGB], #1
        ld1         {v11.b}[\lane], [RGB], #1
        ld1         {v12.b}[\lane], [RGB], #1
        .endr
      .endif
      prfm          pldl1keep, [RGB, #128]
    .elseif \size == 4
      .irp lane, 0, 1, 2, 3
      ld3           {v10.b, v11.b, v12.b}[\lane], [RGB], #3
      .endr
    .elseif \size == 2
      .irp lane, 4, 5
      ld3           {v10.b, v11.b, v12.b}[\lane], [RGB], #3
      .endr
    .elseif \size == 1
      ld3           {v10.b, v11.b, v12.b}[6], [RGB], #3
    .else
      .error unsupported macroblock size
    .endif
  .elseif \bpp == 32
    .if \size == 8
      ld4           {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32
      prfm          pldl1keep, [RGB, #128]
    .elseif \size == 4
      .irp lane, 0, 1, 2, 3
      ld4           {v10.b, v11.b, v12.b, v13.b}[\lane], [RGB], #4
      .endr
    .elseif \size == 2
      .irp lane, 4, 5
      ld4           {v10.b, v11.b, v12.b, v13.b}[\lane], [RGB], #4
      .endr
    .elseif \size == 1
      ld4           {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4
    .else
      .error unsupported macroblock size
    .endif
  .else
    .error unsupported bpp
  .endif
.endm
2080
/*
 * generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs,
 *                                     fast_ld3
 *
 * Emits one RGB -> YCbCr row-conversion function.
 *
 *   colorid   pixel-format name spliced into the function symbol
 *   bpp       bits per input pixel, forwarded to do_load (24 or 32)
 *   r_offs    single digits 0..3; each is appended to the text "v1" to name
 *   g_offs    the register among v10..v13 (filled by do_load) that holds the
 *   b_offs    red, green and blue bytes respectively
 *   fast_ld3  1: the symbol is jsimd_<colorid>_ycc_convert_neon
 *             otherwise: jsimd_<colorid>_ycc_convert_neon_slowld3
 *             (the value is also forwarded to do_load)
 *
 * Register interface of the generated function, as bound by .req below:
 *   w0  OUTPUT_WIDTH  pixels per row
 *   x1  INPUT_BUF     array of input row pointers, consumed one per row
 *   x2  OUTPUT_BUF    address of three pointers; each is indexed by
 *                     OUTPUT_ROW (8-byte entries) to obtain the Y, U and V
 *                     destination rows
 *   w3  OUTPUT_ROW    index of the first output row
 *   w4  NUM_ROWS      number of rows; nothing is converted when < 1
 *
 * Modifies x1-x7, x9-x13, the flags, and v0, v1, v4, v6, v8, v10-v14, v16,
 * v18, v20-v22, v24, v26, v28, v30.  The low 64 bits of v8-v15 are spilled
 * to the stack on entry and reloaded before returning.
 */
.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \
                                           b_offs, fast_ld3

/*
 * 2-stage pipelined RGB->YCbCr conversion
 */

/* Stage 1: widen the three colour channels to 16 bits and form the 32-bit
 * weighted sums.  Accumulators for the low four / high four pixels:
 *   v14 / v16   narrowed to v20 (y) by stage 2
 *   v18 / v26   narrowed to v21 (u) by stage 2
 *   v28 / v30   narrowed to v22 (v) by stage 2
 * The u and v accumulators are seeded from v1 (with the two 32-bit words of
 * each 64-bit half swapped by rev64); the y accumulators start from the
 * first product.  Weights are the eight 16-bit lanes of v0. */
.macro do_rgb_to_yuv_stage1
    ushll           v4.8h, v1\r_offs\().8b, #0  /* r = v4 */
    ushll           v6.8h, v1\g_offs\().8b, #0  /* g = v6 */
    ushll           v8.8h, v1\b_offs\().8b, #0  /* b = v8 */
    rev64           v18.4s, v1.4s
    rev64           v26.4s, v1.4s
    rev64           v28.4s, v1.4s
    rev64           v30.4s, v1.4s
    umull           v14.4s, v4.4h, v0.h[0]
    umull2          v16.4s, v4.8h, v0.h[0]
    umlsl           v18.4s, v4.4h, v0.h[3]
    umlsl2          v26.4s, v4.8h, v0.h[3]
    umlal           v28.4s, v4.4h, v0.h[5]
    umlal2          v30.4s, v4.8h, v0.h[5]
    umlal           v14.4s, v6.4h, v0.h[1]
    umlal2          v16.4s, v6.8h, v0.h[1]
    umlsl           v18.4s, v6.4h, v0.h[4]
    umlsl2          v26.4s, v6.8h, v0.h[4]
    umlsl           v28.4s, v6.4h, v0.h[6]
    umlsl2          v30.4s, v6.8h, v0.h[6]
    umlal           v14.4s, v8.4h, v0.h[2]
    umlal2          v16.4s, v8.8h, v0.h[2]
    umlal           v18.4s, v8.4h, v0.h[5]
    umlal2          v26.4s, v8.8h, v0.h[5]
    umlsl           v28.4s, v8.4h, v0.h[7]
    umlsl2          v30.4s, v8.8h, v0.h[7]
.endm

/* Stage 2: drop the 16 fraction bits of each sum (rounding for y, plain
 * truncation for u and v), then narrow the 16-bit results to bytes. */
.macro do_rgb_to_yuv_stage2
    rshrn           v20.4h, v14.4s, #16
    shrn            v22.4h, v18.4s, #16
    shrn            v24.4h, v28.4s, #16
    rshrn2          v20.8h, v16.4s, #16
    shrn2           v22.8h, v26.4s, #16
    shrn2           v24.8h, v30.4s, #16
    xtn             v20.8b, v20.8h       /* v20 = y */
    xtn             v21.8b, v22.8h       /* v21 = u */
    xtn             v22.8b, v24.8h       /* v22 = v */
.endm

/* Unpipelined conversion of whatever is currently in the input registers;
 * used for the partial group at the end of a row. */
.macro do_rgb_to_yuv
    do_rgb_to_yuv_stage1
    do_rgb_to_yuv_stage2
.endm

/* TODO: expand macros and interleave instructions if some in-order
 *       ARM64 processor actually can dual-issue LOAD/STORE with ALU */
/* Steady-state loop body: finish group k (stage 2), fetch group k+1, store
 * group k, then start group k+1 (stage 1). */
.macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3
    do_rgb_to_yuv_stage2
    do_load         \bpp, 8, \fast_ld3
    st1             {v20.8b}, [Y], #8
    st1             {v21.8b}, [U], #8
    st1             {v22.8b}, [V], #8
    do_rgb_to_yuv_stage1
.endm


.if \fast_ld3 == 1
asm_function jsimd_\colorid\()_ycc_convert_neon
.else
asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
.endif
    OUTPUT_WIDTH    .req w0
    INPUT_BUF       .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_ROW      .req w3
    NUM_ROWS        .req w4

    OUTPUT_BUF0     .req x5
    OUTPUT_BUF1     .req x6
    OUTPUT_BUF2     .req x2  /* OUTPUT_BUF */

    RGB             .req x7
    Y               .req x9
    U               .req x10
    V               .req x11
    N               .req w12

    /* Load the conversion constants into v0 and v1 */
    get_symbol_loc x13, Ljsimd_colorid_ycc_neon_consts

    ld1             {v0.8h, v1.8h}, [x13]

    /* Fetch the three plane pointers.  The last load overwrites x2, which
     * is why the OUTPUT_BUF alias is retired immediately afterwards. */
    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #8]
    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #16]
    .unreq          OUTPUT_BUF

    /* Save NEON registers (low halves of v8-v15, 64 bytes in total).
     * x9 is only a temporary cursor here; it is reloaded as Y below. */
    sub             sp, sp, #64
    mov             x9, sp
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    b.lt            9f
0:
    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3]
    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3]
    mov             N, OUTPUT_WIDTH
    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3]
    add             OUTPUT_ROW, OUTPUT_ROW, #1
    ldr             RGB, [INPUT_BUF], #8

    /* Inner loop over pixels.  N is kept biased by multiples of 8: its sign
     * says whether another full group of 8 pixels is available, while its
     * low three bits always equal those of the row width. */
    subs            N, N, #8
    b.lt            3f                   /* row shorter than 8 pixels */
    do_load         \bpp, 8, \fast_ld3
    do_rgb_to_yuv_stage1
    subs            N, N, #8
    b.lt            2f                   /* only one full group in the row */
1:
    do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3
    subs            N, N, #8
    b.ge            1b
2:
    /* Drain the pipeline: finish and store the last full group */
    do_rgb_to_yuv_stage2
    do_store        8
    tst             N, #7
    b.eq            8f                   /* width is a multiple of 8 */
3:
    /* Remainder of 1..7 pixels: bits 2, 1 and 0 of N select loads of 4, 2
     * and 1 pixels into lanes 0-3, 4-5 and 6.  Note that the numeric label
     * 3 is defined twice; the "3f" below refers to its second definition. */
    tbz             N, #2, 3f
    do_load         \bpp, 4, \fast_ld3
3:
    tbz             N, #1, 4f
    do_load         \bpp, 2, \fast_ld3
4:
    tbz             N, #0, 5f
    do_load         \bpp, 1, \fast_ld3
5:
    /* All eight lanes are converted; only the lanes loaded above are
     * written back. */
    do_rgb_to_yuv
    tbz             N, #2, 6f
    do_store        4
6:
    tbz             N, #1, 7f
    do_store        2
7:
    tbz             N, #0, 8f
    do_store        1
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    b.gt            0b
9:
    /* Restore all registers and return */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    /* ret (not br x30): ret is the form hinted as a subroutine return */
    ret

    .unreq          OUTPUT_WIDTH
    .unreq          OUTPUT_ROW
    .unreq          INPUT_BUF
    .unreq          NUM_ROWS
    .unreq          OUTPUT_BUF0
    .unreq          OUTPUT_BUF1
    .unreq          OUTPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N

/* The helper macros embed this instantiation's channel offsets and bpp, so
 * they are purged here and redefined by the next instantiation. */
.purgem do_rgb_to_yuv
.purgem do_rgb_to_yuv_stage1
.purgem do_rgb_to_yuv_stage2
.purgem do_rgb_to_yuv_stage2_store_load_stage1

.endm
2256
/* Instantiate one converter per supported input pixel layout.  The R, G and
 * B columns give the byte position of each channel within a pixel (it selects
 * v10..v13 after de-interleaving).  With Fast LD3 = 1 the function is named
 * jsimd_<id>_ycc_convert_neon. */
/*--------------------------------- id ----- bpp R  G  B  Fast LD3 */
generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 1
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 1
generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1
generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1
generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1
generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3, 1

/* Fast LD3 = 0: second copies of the 24-bpp converters, emitted under the
 * names jsimd_<id>_ycc_convert_neon_slowld3, whose full 8-pixel loads use
 * single-byte ld1 instructions instead of one ld3. */
generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 0
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 0

/* The load/store helpers of this section are no longer needed. */
.purgem do_load
.purgem do_store
2270
2271
2272/*****************************************************************************/
2273
/*
 * jsimd_convsamp_neon
 *
 * Load data into workspace, applying unsigned->signed conversion
 *
 * Reads eight row pointers from SAMPLE_DATA, takes the eight bytes found at
 * offset START_COL in each of those rows, subtracts 128 from every byte
 * while widening it to 16 bits, and writes the resulting 8x8 block of 16-bit
 * values (128 bytes, row after row) to WORKSPACE.
 *
 * In:     x0  SAMPLE_DATA  address of eight consecutive row pointers
 *         w1  START_COL    byte offset applied to every row (upper half of
 *                          x1 is ignored)
 *         x2  WORKSPACE    destination for the 64 16-bit values
 * Modifies x0-x4, x9-x15, v0 and v16-v23.
 *
 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
 *       rid of the intermediate stores to WORKSPACE
 */

asm_function jsimd_convsamp_neon
    SAMPLE_DATA     .req x0
    START_COL       .req x1
    WORKSPACE       .req x2
    TMP1            .req x9
    TMP2            .req x10
    TMP3            .req x11
    TMP4            .req x12
    TMP5            .req x13
    TMP6            .req x14
    TMP7            .req x15
    TMP8            .req x4
    TMPDUP          .req w3

    /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x1 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw            x1, w1

    /* v0 = 128 in every byte lane: the bias removed from each sample.
     * Pointer loads, address arithmetic, sample loads and subtractions for
     * the eight rows (TMP1..TMP8 -> v16..v23) are interleaved below. */
    mov             TMPDUP, #128
    ldp             TMP1, TMP2, [SAMPLE_DATA], 16
    ldp             TMP3, TMP4, [SAMPLE_DATA], 16
    dup             v0.8b, TMPDUP
    add             TMP1, TMP1, START_COL
    add             TMP2, TMP2, START_COL
    ldp             TMP5, TMP6, [SAMPLE_DATA], 16
    add             TMP3, TMP3, START_COL
    add             TMP4, TMP4, START_COL
    ldp             TMP7, TMP8, [SAMPLE_DATA], 16
    add             TMP5, TMP5, START_COL
    add             TMP6, TMP6, START_COL
    ld1             {v16.8b}, [TMP1]
    add             TMP7, TMP7, START_COL
    add             TMP8, TMP8, START_COL
    ld1             {v17.8b}, [TMP2]
    usubl           v16.8h, v16.8b, v0.8b
    ld1             {v18.8b}, [TMP3]
    usubl           v17.8h, v17.8b, v0.8b
    ld1             {v19.8b}, [TMP4]
    usubl           v18.8h, v18.8b, v0.8b
    ld1             {v20.8b}, [TMP5]
    usubl           v19.8h, v19.8b, v0.8b
    ld1             {v21.8b}, [TMP6]
    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64  /* rows 0-3 */
    usubl           v20.8h, v20.8b, v0.8b
    ld1             {v22.8b}, [TMP7]
    usubl           v21.8h, v21.8b, v0.8b
    ld1             {v23.8b}, [TMP8]
    usubl           v22.8h, v22.8b, v0.8b
    usubl           v23.8h, v23.8b, v0.8b
    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64  /* rows 4-7 */

    /* ret (not br x30): ret is the form hinted as a subroutine return */
    ret

    .unreq          SAMPLE_DATA
    .unreq          START_COL
    .unreq          WORKSPACE
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
    .unreq          TMP5
    .unreq          TMP6
    .unreq          TMP7
    .unreq          TMP8
    .unreq          TMPDUP
2347
2348/*****************************************************************************/
2349
/*
 * jsimd_fdct_islow_neon
 *
 * This function contains a slow-but-accurate integer implementation of the
 * forward DCT (Discrete Cosine Transform). The following code is based
 * directly on the original IJG jfdctint.c; see jfdctint.c for
 * more details.
 *
 * In:     x0  DATA  address of an 8x8 block of 16-bit values (128 bytes),
 *                   transformed in place
 * Modifies x9, x10 and v0-v5, v8-v31.  The low 64 bits of v8-v15 are spilled
 * to the stack on entry and reloaded before returning.
 *
 * The block is processed in two identical-looking passes, each consisting
 * of a transpose followed by a 1-D transform over all eight vectors; the
 * passes differ only in their final scaling.
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 *       rid of a bunch of 16-bit vector loads
 */

#define CONST_BITS  13
#define PASS1_BITS  2

#define DESCALE_P1  (CONST_BITS - PASS1_BITS)  /* right shift ending pass 1 */
#define DESCALE_P2  (CONST_BITS + PASS1_BITS)  /* right shift ending pass 2 */

/* Lanes of v0/v1 as loaded from Ljsimd_fdct_islow_neon_consts.  _P_ marks a
 * constant used as is, _N_ one that enters the formulas negated. */
#define XFIX_P_0_298  v0.h[0]
#define XFIX_N_0_390  v0.h[1]
#define XFIX_P_0_541  v0.h[2]
#define XFIX_P_0_765  v0.h[3]
#define XFIX_N_0_899  v0.h[4]
#define XFIX_P_1_175  v0.h[5]
#define XFIX_P_1_501  v0.h[6]
#define XFIX_N_1_847  v0.h[7]
#define XFIX_N_1_961  v1.h[0]
#define XFIX_P_2_053  v1.h[1]
#define XFIX_N_2_562  v1.h[2]
#define XFIX_P_3_072  v1.h[3]

asm_function jsimd_fdct_islow_neon

    DATA            .req x0
    TMP             .req x9

    /* Load constants */
    get_symbol_loc  TMP, Ljsimd_fdct_islow_neon_consts
    ld1             {v0.8h, v1.8h}, [TMP]

    /* Save NEON registers (low halves of v8-v15, 64 bytes in total) */
    sub             sp, sp, #64
    mov             x10, sp
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32

    /* Load all DATA into NEON registers with the following allocation
     * (one register per row; columns 0-3 in the low half, 4-7 in the high
     * half):
     *
     *   row 0 | v16.8h
     *   row 1 | v17.8h
     *   row 2 | v18.8h
     *   row 3 | v19.8h
     *   row 4 | v20.8h
     *   row 5 | v21.8h
     *   row 6 | v22.8h
     *   row 7 | v23.8h
     */

    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
    sub             DATA, DATA, #64      /* rewind for the final store */

    /* Transpose */
    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
    /* 1-D FDCT */
    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */

    /* even part */

    add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
    sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
    add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
    sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */

    add             v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
    sub             v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */

    add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */

    shl             v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
    shl             v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */

    /* 32-bit products: "lo" = lanes 0-3, "hi" = lanes 4-7 of the 16-bit
     * source.  The high half is multiplied first because the low-half
     * product overwrites v18. */
    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    mov             v22.16b, v18.16b
    mov             v25.16b, v24.16b

    smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */

    rshrn           v18.4h, v18.4s, #DESCALE_P1
    rshrn           v22.4h, v22.4s, #DESCALE_P1
    rshrn2          v18.8h, v24.4s, #DESCALE_P1  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
    rshrn2          v22.8h, v25.4s, #DESCALE_P1  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */

    /* Odd part */

    add             v8.8h, v28.8h, v31.8h        /* z1 = tmp4 + tmp7; */
    add             v9.8h, v29.8h, v30.8h        /* z2 = tmp5 + tmp6; */
    add             v10.8h, v28.8h, v30.8h       /* z3 = tmp4 + tmp6; */
    add             v11.8h, v29.8h, v31.8h       /* z4 = tmp5 + tmp7; */
    smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
    smull2          v5.4s, v10.8h, XFIX_P_1_175
    smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
    smlal2          v5.4s, v11.8h, XFIX_P_1_175

    /* From here on each quantity lives in a lo/hi register pair:
     * tmp4..tmp7 in v28..v31 / v24..v27, z1..z4 in v8..v11 / v12..v15,
     * z5 in v4 / v5. */
    smull2          v24.4s, v28.8h, XFIX_P_0_298
    smull2          v25.4s, v29.8h, XFIX_P_2_053
    smull2          v26.4s, v30.8h, XFIX_P_3_072
    smull2          v27.4s, v31.8h, XFIX_P_1_501
    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */

    smull2          v12.4s, v8.8h, XFIX_N_0_899
    smull2          v13.4s, v9.8h, XFIX_N_2_562
    smull2          v14.4s, v10.8h, XFIX_N_1_961
    smull2          v15.4s, v11.8h, XFIX_N_0_390
    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644); */

    add             v10.4s, v10.4s, v4.4s  /* z3 += z5 */
    add             v14.4s, v14.4s, v5.4s
    add             v11.4s, v11.4s, v4.4s  /* z4 += z5 */
    add             v15.4s, v15.4s, v5.4s

    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
    add             v24.4s, v24.4s, v12.4s
    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
    add             v25.4s, v25.4s, v13.4s
    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
    add             v26.4s, v26.4s, v14.4s
    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
    add             v27.4s, v27.4s, v15.4s

    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
    add             v24.4s, v24.4s, v14.4s
    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
    add             v25.4s, v25.4s, v15.4s
    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
    add             v26.4s, v26.4s, v13.4s
    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
    add             v27.4s, v27.4s, v12.4s

    rshrn           v23.4h, v28.4s, #DESCALE_P1
    rshrn           v21.4h, v29.4s, #DESCALE_P1
    rshrn           v19.4h, v30.4s, #DESCALE_P1
    rshrn           v17.4h, v31.4s, #DESCALE_P1
    rshrn2          v23.8h, v24.4s, #DESCALE_P1  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
    rshrn2          v21.8h, v25.4s, #DESCALE_P1  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
    rshrn2          v19.8h, v26.4s, #DESCALE_P1  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
    rshrn2          v17.8h, v27.4s, #DESCALE_P1  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */

    /* Second pass: same butterfly as above; the outputs are scaled down by
     * PASS1_BITS more than in the first pass. */

    /* Transpose */
    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4

    /* 1-D FDCT */
    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */

    /* even part */
    add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
    sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
    add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
    sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */

    add             v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
    sub             v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */

    add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */

    srshr           v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS); */
    srshr           v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS); */

    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    mov             v22.16b, v18.16b
    mov             v25.16b, v24.16b

    smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */

    rshrn           v18.4h, v18.4s, #DESCALE_P2
    rshrn           v22.4h, v22.4s, #DESCALE_P2
    rshrn2          v18.8h, v24.4s, #DESCALE_P2  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS+PASS1_BITS); */
    rshrn2          v22.8h, v25.4s, #DESCALE_P2  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS+PASS1_BITS); */

    /* Odd part */
    add             v8.8h, v28.8h, v31.8h   /* z1 = tmp4 + tmp7; */
    add             v9.8h, v29.8h, v30.8h   /* z2 = tmp5 + tmp6; */
    add             v10.8h, v28.8h, v30.8h  /* z3 = tmp4 + tmp6; */
    add             v11.8h, v29.8h, v31.8h  /* z4 = tmp5 + tmp7; */

    smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
    smull2          v5.4s, v10.8h, XFIX_P_1_175
    smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
    smlal2          v5.4s, v11.8h, XFIX_P_1_175

    smull2          v24.4s, v28.8h, XFIX_P_0_298
    smull2          v25.4s, v29.8h, XFIX_P_2_053
    smull2          v26.4s, v30.8h, XFIX_P_3_072
    smull2          v27.4s, v31.8h, XFIX_P_1_501
    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */

    smull2          v12.4s, v8.8h, XFIX_N_0_899
    smull2          v13.4s, v9.8h, XFIX_N_2_562
    smull2          v14.4s, v10.8h, XFIX_N_1_961
    smull2          v15.4s, v11.8h, XFIX_N_0_390
    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644); */

    add             v10.4s, v10.4s, v4.4s   /* z3 += z5 */
    add             v14.4s, v14.4s, v5.4s
    add             v11.4s, v11.4s, v4.4s   /* z4 += z5 */
    add             v15.4s, v15.4s, v5.4s

    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
    add             v24.4s, v24.4s, v12.4s
    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
    add             v25.4s, v25.4s, v13.4s
    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
    add             v26.4s, v26.4s, v14.4s
    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
    add             v27.4s, v27.4s, v15.4s

    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
    add             v24.4s, v24.4s, v14.4s
    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
    add             v25.4s, v25.4s, v15.4s
    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
    add             v26.4s, v26.4s, v13.4s
    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
    add             v27.4s, v27.4s, v12.4s

    rshrn           v23.4h, v28.4s, #DESCALE_P2
    rshrn           v21.4h, v29.4s, #DESCALE_P2
    rshrn           v19.4h, v30.4s, #DESCALE_P2
    rshrn           v17.4h, v31.4s, #DESCALE_P2
    rshrn2          v23.8h, v24.4s, #DESCALE_P2  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS+PASS1_BITS); */
    rshrn2          v21.8h, v25.4s, #DESCALE_P2  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS+PASS1_BITS); */
    rshrn2          v19.8h, v26.4s, #DESCALE_P2  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS+PASS1_BITS); */
    rshrn2          v17.8h, v27.4s, #DESCALE_P2  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS+PASS1_BITS); */

    /* store results */
    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]

    /* Restore NEON registers */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32

    /* ret (not br x30): ret is the form hinted as a subroutine return */
    ret

    .unreq          DATA
    .unreq          TMP

#undef XFIX_P_0_298
#undef XFIX_N_0_390
#undef XFIX_P_0_541
#undef XFIX_P_0_765
#undef XFIX_N_0_899
#undef XFIX_P_1_175
#undef XFIX_P_1_501
#undef XFIX_N_1_847
#undef XFIX_N_1_961
#undef XFIX_P_2_053
#undef XFIX_N_2_562
#undef XFIX_P_3_072
2644
2645
2646/*****************************************************************************/
2647
/*
 * jsimd_fdct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the forward DCT (Discrete Cosine Transform). It uses the same calculations
 * and produces exactly the same output as the original IJG function
 * jpeg_fdct_ifast from jfdctfst.c
 *
 * In:     x0  DATA  address of an 8x8 block of 16-bit values (128 bytes),
 *                   transformed in place
 * Modifies x9, the flags, and v0-v7, v16-v23, v28, v29.  No callee-saved
 * vector register is touched, so nothing is spilled to the stack.
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 *       rid of a bunch of 16-bit vector loads
 */

#undef XFIX_0_541196100
/* Lanes of v0 as loaded from Ljsimd_fdct_ifast_neon_consts */
#define XFIX_0_382683433  v0.h[0]
#define XFIX_0_541196100  v0.h[1]
#define XFIX_0_707106781  v0.h[2]
#define XFIX_1_306562965  v0.h[3]

asm_function jsimd_fdct_ifast_neon

    DATA            .req x0
    TMP             .req x9

    /* Load constants */
    get_symbol_loc  TMP, Ljsimd_fdct_ifast_neon_consts
    ld1             {v0.4h}, [TMP]

    /* Load all DATA into NEON registers with the following allocation
     * (one register per row; columns 0-3 in the low half, 4-7 in the high
     * half):
     *
     *   row 0 | v16.8h
     *   row 1 | v17.8h
     *   row 2 | v18.8h
     *   row 3 | v19.8h
     *   row 4 | v20.8h
     *   row 5 | v21.8h
     *   row 6 | v22.8h
     *   row 7 | v23.8h
     */

    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
    mov             TMP, #2              /* TMP is reused as the pass counter */
    sub             DATA, DATA, #64      /* rewind for the final store */
1:
    /* Transpose */
    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4
    /* Count the pass now; the flags set here are consumed by the b.ne at the
     * bottom of the loop (no instruction in between writes the condition
     * flags). */
    subs            TMP, TMP, #1
    /* 1-D FDCT */
    add             v4.8h, v19.8h, v20.8h
    sub             v20.8h, v19.8h, v20.8h
    sub             v28.8h, v18.8h, v21.8h
    add             v18.8h, v18.8h, v21.8h
    sub             v29.8h, v17.8h, v22.8h
    add             v17.8h, v17.8h, v22.8h
    sub             v21.8h, v16.8h, v23.8h
    add             v16.8h, v16.8h, v23.8h
    sub             v6.8h, v17.8h, v18.8h
    sub             v7.8h, v16.8h, v4.8h
    add             v5.8h, v17.8h, v18.8h
    add             v6.8h, v6.8h, v7.8h
    add             v4.8h, v16.8h, v4.8h
    sqdmulh         v6.8h, v6.8h, XFIX_0_707106781
    add             v19.8h, v20.8h, v28.8h
    add             v16.8h, v4.8h, v5.8h
    sub             v20.8h, v4.8h, v5.8h
    add             v5.8h, v28.8h, v29.8h
    add             v29.8h, v29.8h, v21.8h
    sqdmulh         v5.8h, v5.8h, XFIX_0_707106781
    sub             v28.8h, v19.8h, v29.8h
    add             v18.8h, v7.8h, v6.8h
    sqdmulh         v28.8h, v28.8h, XFIX_0_382683433
    sub             v22.8h, v7.8h, v6.8h
    sqdmulh         v19.8h, v19.8h, XFIX_0_541196100
    sqdmulh         v7.8h, v29.8h, XFIX_1_306562965
    add             v6.8h, v21.8h, v5.8h
    sub             v5.8h, v21.8h, v5.8h
    add             v29.8h, v29.8h, v28.8h
    add             v19.8h, v19.8h, v28.8h
    add             v29.8h, v29.8h, v7.8h
    add             v21.8h, v5.8h, v19.8h
    sub             v19.8h, v5.8h, v19.8h
    add             v17.8h, v6.8h, v29.8h
    sub             v23.8h, v6.8h, v29.8h

    b.ne            1b                   /* run the transpose + 1-D pass twice */

    /* store results */
    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]

    /* ret (not br x30): ret is the form hinted as a subroutine return */
    ret

    .unreq          DATA
    .unreq          TMP
#undef XFIX_0_382683433
#undef XFIX_0_541196100
#undef XFIX_0_707106781
#undef XFIX_1_306562965
2747
2748
2749/*****************************************************************************/
2750
2751/*
2752 * GLOBAL(void)
2753 * jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
2754 *                     DCTELEM *workspace);
2755 *
2756 */
asm_function jsimd_quantize_neon

    /*
     * Register roles:
     *   x0  COEF_BLOCK  destination for the 64 quantized 16-bit coefficients
     *   x1  DIVISORS    base of the divisor tables; reused as RECIPROCAL,
     *                   the cursor over the table that starts at offset 0
     *   x2  WORKSPACE   source of the 64 16-bit DCT coefficients
     *   x9  CORRECTION  cursor over the table at DIVISORS + 64 * 2 bytes
     *   x10 SHIFT       cursor over the table at DIVISORS + 64 * 6 bytes
     *   x11 LOOP_COUNT  two passes, 32 coefficients per pass
     *
     * For each coefficient c:
     *   q = (((abs(c) + correction) * reciprocal) >> 16) >> shift
     * and the sign of c is then reapplied to q.
     *
     * Only caller-saved registers (x9-x11, v0-v7, v16-v31) are modified.
     */
    COEF_BLOCK      .req x0
    DIVISORS        .req x1
    WORKSPACE       .req x2

    RECIPROCAL      .req DIVISORS
    CORRECTION      .req x9
    SHIFT           .req x10
    LOOP_COUNT      .req x11

    mov             LOOP_COUNT, #2
    add             CORRECTION, DIVISORS, #(64 * 2)
    add             SHIFT, DIVISORS, #(64 * 6)
1:
    /* The flags set here are consumed by the b.ne at the bottom of the loop;
     * no instruction in between modifies them. */
    subs            LOOP_COUNT, LOOP_COUNT, #1
    ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64
    ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64
    abs             v20.8h, v0.8h
    abs             v21.8h, v1.8h
    abs             v22.8h, v2.8h
    abs             v23.8h, v3.8h
    ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64
    add             v20.8h, v20.8h, v4.8h  /* add correction */
    add             v21.8h, v21.8h, v5.8h
    add             v22.8h, v22.8h, v6.8h
    add             v23.8h, v23.8h, v7.8h
    umull           v4.4s, v20.4h, v28.4h  /* multiply by reciprocal */
    umull2          v16.4s, v20.8h, v28.8h
    umull           v5.4s, v21.4h, v29.4h
    umull2          v17.4s, v21.8h, v29.8h
    umull           v6.4s, v22.4h, v30.4h  /* multiply by reciprocal */
    umull2          v18.4s, v22.8h, v30.8h
    umull           v7.4s, v23.4h, v31.4h
    umull2          v19.4s, v23.8h, v31.8h
    ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64
    shrn            v4.4h, v4.4s, #16      /* keep the high half of each product */
    shrn            v5.4h, v5.4s, #16
    shrn            v6.4h, v6.4s, #16
    shrn            v7.4h, v7.4s, #16
    shrn2           v4.8h, v16.4s, #16
    shrn2           v5.8h, v17.4s, #16
    shrn2           v6.8h, v18.4s, #16
    shrn2           v7.8h, v19.4s, #16
    neg             v24.8h, v24.8h         /* ushl by a negative count shifts right */
    neg             v25.8h, v25.8h
    neg             v26.8h, v26.8h
    neg             v27.8h, v27.8h
    sshr            v0.8h, v0.8h, #15  /* extract sign */
    sshr            v1.8h, v1.8h, #15
    sshr            v2.8h, v2.8h, #15
    sshr            v3.8h, v3.8h, #15
    ushl            v4.8h, v4.8h, v24.8h  /* shift */
    ushl            v5.8h, v5.8h, v25.8h
    ushl            v6.8h, v6.8h, v26.8h
    ushl            v7.8h, v7.8h, v27.8h

    /* (q ^ mask) - mask negates q where mask is all ones (negative input)
     * and leaves it unchanged where mask is zero. */
    eor             v4.16b, v4.16b, v0.16b  /* restore sign */
    eor             v5.16b, v5.16b, v1.16b
    eor             v6.16b, v6.16b, v2.16b
    eor             v7.16b, v7.16b, v3.16b
    sub             v4.8h, v4.8h, v0.8h
    sub             v5.8h, v5.8h, v1.8h
    sub             v6.8h, v6.8h, v2.8h
    sub             v7.8h, v7.8h, v3.8h
    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64

    b.ne            1b

    /* ret (rather than br x30) marks this branch as a subroutine return */
    ret

    .unreq          COEF_BLOCK
    .unreq          DIVISORS
    .unreq          WORKSPACE
    .unreq          RECIPROCAL
    .unreq          CORRECTION
    .unreq          SHIFT
    .unreq          LOOP_COUNT
2835
2836
2837/*****************************************************************************/
2838
2839/*
2840 * Downsample pixel values of a single component.
2841 * This version handles the common case of 2:1 horizontal and 1:1 vertical,
2842 * without smoothing.
2843 *
2844 * GLOBAL(void)
2845 * jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
2846 *                            JDIMENSION v_samp_factor,
2847 *                            JDIMENSION width_in_blocks,
2848 *                            JSAMPARRAY input_data, JSAMPARRAY output_data);
2849 */
2850
asm_function jsimd_h2v1_downsample_neon
    /*
     * Register roles:
     *   x0  IMAGE_WIDTH   JDIMENSION argument
     *   x1  MAX_V_SAMP    not referenced by this routine
     *   x2  V_SAMP        JDIMENSION argument; counts the rows left to process
     *   x3  BLOCK_WIDTH   JDIMENSION argument; each unit covers 16 input
     *                     samples and yields 8 output samples
     *   x4  INPUT_DATA    array of input row pointers
     *   x5  OUTPUT_DATA   array of output row pointers
     *   x9  OUTPTR        current output position
     *   x10 INPTR         current input position
     *   x11 TMP1          column groups left before the final group
     *   x12 TMP2          16 * BLOCK_WIDTH - IMAGE_WIDTH
     *   x13 TMP3          address of the table entry used for the final group
     *   w15 TMPDUP        scalar source for the rounding-bias vector
     *
     * Each output sample is (a + b + bias) >> 1, where a and b are
     * horizontally adjacent input samples and the bias alternates 0, 1.
     */
    IMAGE_WIDTH     .req x0
    MAX_V_SAMP      .req x1
    V_SAMP          .req x2
    BLOCK_WIDTH     .req x3
    INPUT_DATA      .req x4
    OUTPUT_DATA     .req x5
    OUTPTR          .req x9
    INPTR           .req x10
    TMP1            .req x11
    TMP2            .req x12
    TMP3            .req x13
    TMPDUP          .req w15

    /* The three JDIMENSION arguments are 32-bit values, so the ABI does not
     * guarantee that the upper 32 bits of x0, x2, and x3 are valid.  They are
     * used as full 64-bit registers below, so clear the upper halves first. */
    uxtw            x0, w0                  /* IMAGE_WIDTH */
    uxtw            x2, w2                  /* V_SAMP */
    uxtw            x3, w3                  /* BLOCK_WIDTH */

    mov             TMPDUP, #0x10000
    lsl             TMP2, BLOCK_WIDTH, #4
    sub             TMP2, TMP2, IMAGE_WIDTH
    get_symbol_loc  TMP3, Ljsimd_h2_downsample_neon_consts
    add             TMP3, TMP3, TMP2, lsl #4    /* 16-byte table entries */
    dup             v16.4s, TMPDUP              /* halfword lanes = 0, 1, 0, 1, ... */
    ld1             {v18.16b}, [TMP3]           /* tbl indices for the final group */

1:  /* row loop */
    ldr             INPTR, [INPUT_DATA], #8
    ldr             OUTPTR, [OUTPUT_DATA], #8
    subs            TMP1, BLOCK_WIDTH, #1
    b.eq            3f                          /* only the final group exists */
2:  /* columns */
    ld1             {v0.16b}, [INPTR], #16
    mov             v4.16b, v16.16b             /* start from the bias */
    subs            TMP1, TMP1, #1
    uadalp          v4.8h, v0.16b               /* add each pair of adjacent bytes */
    shrn            v6.8b, v4.8h, #1
    st1             {v6.8b}, [OUTPTR], #8
    b.ne            2b
3:  /* last columns */
    ld1             {v0.16b}, [INPTR]
    mov             v4.16b, v16.16b
    /* Flags set here are consumed by the b.ne that closes the row loop */
    subs            V_SAMP, V_SAMP, #1
    /* expand right */
    tbl             v2.16b, {v0.16b}, v18.16b
    uadalp          v4.8h, v2.16b
    shrn            v6.8b, v4.8h, #1
    st1             {v6.8b}, [OUTPTR], #8
    b.ne            1b

    /* ret (rather than br x30) marks this branch as a subroutine return */
    ret

    .unreq          IMAGE_WIDTH
    .unreq          MAX_V_SAMP
    .unreq          V_SAMP
    .unreq          BLOCK_WIDTH
    .unreq          INPUT_DATA
    .unreq          OUTPUT_DATA
    .unreq          OUTPTR
    .unreq          INPTR
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMPDUP
2911
2912
2913/*****************************************************************************/
2914
2915/*
2916 * Downsample pixel values of a single component.
2917 * This version handles the common case of 2:1 horizontal and 2:1 vertical,
2918 * without smoothing.
2919 *
2920 * GLOBAL(void)
2921 * jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
2922 *                            JDIMENSION v_samp_factor,
2923 *                            JDIMENSION width_in_blocks,
2924 *                            JSAMPARRAY input_data, JSAMPARRAY output_data);
2925 */
2926
.balign 16
asm_function jsimd_h2v2_downsample_neon
    /*
     * Register roles:
     *   x0  IMAGE_WIDTH   JDIMENSION argument
     *   x1  MAX_V_SAMP    not referenced by this routine
     *   x2  V_SAMP        JDIMENSION argument; counts the output rows left
     *   x3  BLOCK_WIDTH   JDIMENSION argument; each unit covers 16 input
     *                     samples per row and yields 8 output samples
     *   x4  INPUT_DATA    array of input row pointers (two consumed per
     *                     output row)
     *   x5  OUTPUT_DATA   array of output row pointers
     *   x9  OUTPTR        current output position
     *   x10 INPTR0        current position in the first input row
     *   x14 INPTR1        current position in the second input row
     *   x11 TMP1          column groups left before the final group
     *   x12 TMP2          16 * BLOCK_WIDTH - IMAGE_WIDTH
     *   x13 TMP3          address of the table entry used for the final group
     *   w15 TMPDUP        scalar source for the rounding-bias vector
     *
     * Each output sample is (a + b + c + d + bias) >> 2, where a..d form a
     * 2x2 group of input samples and the bias alternates 1, 2.
     */
    IMAGE_WIDTH     .req x0
    MAX_V_SAMP      .req x1
    V_SAMP          .req x2
    BLOCK_WIDTH     .req x3
    INPUT_DATA      .req x4
    OUTPUT_DATA     .req x5
    OUTPTR          .req x9
    INPTR0          .req x10
    INPTR1          .req x14
    TMP1            .req x11
    TMP2            .req x12
    TMP3            .req x13
    TMPDUP          .req w15

    /* The three JDIMENSION arguments are 32-bit values, so the ABI does not
     * guarantee that the upper 32 bits of x0, x2, and x3 are valid.  They are
     * used as full 64-bit registers below, so clear the upper halves first. */
    uxtw            x0, w0                  /* IMAGE_WIDTH */
    uxtw            x2, w2                  /* V_SAMP */
    uxtw            x3, w3                  /* BLOCK_WIDTH */

    mov             TMPDUP, #1
    lsl             TMP2, BLOCK_WIDTH, #4
    lsl             TMPDUP, TMPDUP, #17
    sub             TMP2, TMP2, IMAGE_WIDTH
    get_symbol_loc  TMP3, Ljsimd_h2_downsample_neon_consts
    orr             TMPDUP, TMPDUP, #1          /* TMPDUP = 0x00020001 */
    add             TMP3, TMP3, TMP2, lsl #4    /* 16-byte table entries */
    dup             v16.4s, TMPDUP              /* halfword lanes = 1, 2, 1, 2, ... */
    ld1             {v18.16b}, [TMP3]           /* tbl indices for the final group */

1:  /* row loop */
    ldr             INPTR0, [INPUT_DATA], #8
    ldr             OUTPTR, [OUTPUT_DATA], #8
    ldr             INPTR1, [INPUT_DATA], #8
    subs            TMP1, BLOCK_WIDTH, #1
    b.eq            3f                          /* only the final group exists */
2:  /* columns */
    ld1             {v0.16b}, [INPTR0], #16
    ld1             {v1.16b}, [INPTR1], #16
    mov             v4.16b, v16.16b             /* start from the bias */
    subs            TMP1, TMP1, #1
    uadalp          v4.8h, v0.16b               /* add adjacent pairs, first row */
    uadalp          v4.8h, v1.16b               /* add adjacent pairs, second row */
    shrn            v6.8b, v4.8h, #2
    st1             {v6.8b}, [OUTPTR], #8
    b.ne            2b
3:  /* last columns */
    ld1             {v0.16b}, [INPTR0], #16
    ld1             {v1.16b}, [INPTR1], #16
    mov             v4.16b, v16.16b
    /* Flags set here are consumed by the b.ne that closes the row loop */
    subs            V_SAMP, V_SAMP, #1
    /* expand right */
    tbl             v2.16b, {v0.16b}, v18.16b
    tbl             v3.16b, {v1.16b}, v18.16b
    uadalp          v4.8h, v2.16b
    uadalp          v4.8h, v3.16b
    shrn            v6.8b, v4.8h, #2
    st1             {v6.8b}, [OUTPTR], #8
    b.ne            1b

    /* ret (rather than br x30) marks this branch as a subroutine return */
    ret

    .unreq          IMAGE_WIDTH
    .unreq          MAX_V_SAMP
    .unreq          V_SAMP
    .unreq          BLOCK_WIDTH
    .unreq          INPUT_DATA
    .unreq          OUTPUT_DATA
    .unreq          OUTPTR
    .unreq          INPTR0
    .unreq          INPTR1
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMPDUP
2998
2999
3000/*****************************************************************************/
3001
3002/*
3003 * GLOBAL(JOCTET *)
3004 * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
3005 *                             JCOEFPTR block, int last_dc_val,
3006 *                             c_derived_tbl *dctbl, c_derived_tbl *actbl)
3007 *
3008 */
3009
3010    BUFFER          .req x1
3011    PUT_BUFFER      .req x6
3012    PUT_BITS        .req x7
3013    PUT_BITSw       .req w7
3014
/*
 * emit_byte: write the oldest 8 pending bits of PUT_BUFFER to the output.
 * BUFFER is pre-incremented for every byte stored.  A data byte of 0xff is
 * followed by a stuffed zero byte.  Modifies x19 and the condition flags.
 */
.macro emit_byte
    sub             PUT_BITS, PUT_BITS, #8
    lsr             x19, PUT_BUFFER, PUT_BITS
    and             w19, w19, #0xff         /* isolate the byte being emitted */
    cmp             w19, #0xff
    strb            w19, [BUFFER, #1]!      /* the store leaves the flags intact */
    b.ne            14f
    strb            wzr, [BUFFER, #1]!      /* stuff 0x00 after 0xff */
14:
.endm
/*
 * put_bits CODE, SIZE: append the bits in register CODE to PUT_BUFFER and
 * add register SIZE to the pending bit count.  Neither operand may be
 * PUT_BITS or PUT_BUFFER.  Bits of CODE above SIZE must already be zero,
 * because CODE is merged with a plain OR.
 */
.macro put_bits CODE, SIZE
    add             PUT_BITS, PUT_BITS, \SIZE
    lsl             PUT_BUFFER, PUT_BUFFER, \SIZE
    orr             PUT_BUFFER, PUT_BUFFER, \CODE
.endm
/*
 * checkbuf31: once at least 32 bits are pending, flush four bytes so that
 * the accumulator has room for the bits appended next.
 */
.macro checkbuf31
    cmp             PUT_BITS, #32
    b.lt            31f
.rept 4
    emit_byte
.endr
31:
.endm
/*
 * checkbuf47: once at least 48 bits are pending, flush six bytes so that
 * the accumulator has room for the bits appended next.
 */
.macro checkbuf47
    cmp             PUT_BITS, #48
    b.lt            47f
.rept 6
    emit_byte
.endr
47:
.endm
3050
.macro generate_jsimd_huff_encode_one_block fast_tbl

/*
 * Expands to one complete Huffman block encoder.
 *   fast_tbl == 1: jsimd_huff_encode_one_block_neon, which reorders the
 *                  coefficients with TBL/TBX lookups driven by a constant
 *                  table.
 *   otherwise:     jsimd_huff_encode_one_block_neon_slowtbl, which gathers
 *                  every coefficient with its own single-lane load.
 *
 * Arguments (see the prototype in the comment preceding these macros):
 *   x0 = state, x1 = buffer, x2 = block, w3 = last_dc_val,
 *   x4 = dctbl, x5 = actbl
 * Returns the advanced output pointer in x0 and writes the bit accumulator
 * and bit count back to [state, #0x10] and [state, #0x18].
 *
 * Stack frame (272 bytes):
 *   [sp, #0]      saved x19 and x20
 *   [sp, #16]     t1: 64 halfwords
 *   [sp, #0x90]   t2: 64 halfwords
 * x15 walks t2; the matching t1 element lies 128 bytes below it.
 */

.balign 16

.if \fast_tbl == 1
asm_function jsimd_huff_encode_one_block_neon
.else
asm_function jsimd_huff_encode_one_block_neon_slowtbl
.endif
    sub             sp, sp, 272
    sub             BUFFER, BUFFER, #0x1    /* BUFFER=buffer-- */
    /* Save ARM registers */
    stp             x19, x20, [sp]
.if \fast_tbl == 1
    get_symbol_loc  x15, Ljsimd_huff_encode_one_block_neon_consts
.else
    get_symbol_loc  x15, Ljsimd_huff_encode_one_block_neon_slowtbl_consts
.endif
    /* NOTE(review): 0x10 and 0x18 are presumably the offsets of the
     * put_buffer and put_bits fields of working_state -- confirm against
     * the C definition. */
    ldr             PUT_BUFFER, [x0, #0x10]
    ldr             PUT_BITSw, [x0, #0x18]
    ldrsh           w12, [x2]               /* load DC coeff in w12 */
    /* prepare data */
.if \fast_tbl == 1
    ld1             {v23.16b}, [x15], #16
    ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64
    ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64
    ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64
    ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64
    ld1             {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64
    sub             w12, w12, w3      /* last_dc_val, not used afterwards */
    /* ZigZag 8x8 */
    tbl             v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b
    tbl             v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b
    tbl             v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b
    tbl             v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b
    tbl             v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b
    tbl             v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b
    tbl             v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b
    tbl             v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b
    ins             v0.h[0], w12            /* element 0 = DC difference */
    /* tbx only overwrites lanes whose index is in range, so these fill in
     * the lanes that the tbl lookups above could not reach. */
    tbx             v1.16b, {v28.16b}, v16.16b
    tbx             v2.16b, {v29.16b, v30.16b}, v17.16b
    tbx             v5.16b, {v29.16b, v30.16b}, v18.16b
    tbx             v6.16b, {v31.16b}, v19.16b
.else
    /* Gather the block into v0-v7 one halfword at a time.  The address
     * computations (extra indent) are interleaved with the lane loads. */
      add             x13, x2, #0x22
      sub             w12, w12, w3    /* last_dc_val, not used afterwards */
    ld1             {v23.16b}, [x15]
      add             x14, x2, #0x18
      add             x3, x2, #0x36
    ins             v0.h[0], w12
      add             x9, x2, #0x2
    ld1             {v1.h}[0], [x13]
      add             x15, x2, #0x30
    ld1             {v2.h}[0], [x14]
      add             x19, x2, #0x26
    ld1             {v3.h}[0], [x3]
      add             x20, x2, #0x28
    ld1             {v0.h}[1], [x9]
      add             x12, x2, #0x10
    ld1             {v1.h}[1], [x15]
      add             x13, x2, #0x40
    ld1             {v2.h}[1], [x19]
      add             x14, x2, #0x34
    ld1             {v3.h}[1], [x20]
      add             x3, x2, #0x1a
    ld1             {v0.h}[2], [x12]
      add             x9, x2, #0x20
    ld1             {v1.h}[2], [x13]
      add             x15, x2, #0x32
    ld1             {v2.h}[2], [x14]
      add             x19, x2, #0x42
    ld1             {v3.h}[2], [x3]
      add             x20, x2, #0xc
    ld1             {v0.h}[3], [x9]
      add             x12, x2, #0x12
    ld1             {v1.h}[3], [x15]
      add             x13, x2, #0x24
    ld1             {v2.h}[3], [x19]
      add             x14, x2, #0x50
    ld1             {v3.h}[3], [x20]
      add             x3, x2, #0xe
    ld1             {v0.h}[4], [x12]
      add             x9, x2, #0x4
    ld1             {v1.h}[4], [x13]
      add             x15, x2, #0x16
    ld1             {v2.h}[4], [x14]
      add             x19, x2, #0x60
    ld1             {v3.h}[4], [x3]
      add             x20, x2, #0x1c
    ld1             {v0.h}[5], [x9]
      add             x12, x2, #0x6
    ld1             {v1.h}[5], [x15]
      add             x13, x2, #0x8
    ld1             {v2.h}[5], [x19]
      add             x14, x2, #0x52
    ld1             {v3.h}[5], [x20]
      add             x3, x2, #0x2a
    ld1             {v0.h}[6], [x12]
      add             x9, x2, #0x14
    ld1             {v1.h}[6], [x13]
      add             x15, x2, #0xa
    ld1             {v2.h}[6], [x14]
      add             x19, x2, #0x44
    ld1             {v3.h}[6], [x3]
      add             x20, x2, #0x38
    ld1             {v0.h}[7], [x9]
      add             x12, x2, #0x46
    ld1             {v1.h}[7], [x15]
      add             x13, x2, #0x3a
    ld1             {v2.h}[7], [x19]
      add             x14, x2, #0x74
    ld1             {v3.h}[7], [x20]
      add             x3, x2, #0x6a
    ld1             {v4.h}[0], [x12]
      add             x9, x2, #0x54
    ld1             {v5.h}[0], [x13]
      add             x15, x2, #0x2c
    ld1             {v6.h}[0], [x14]
      add             x19, x2, #0x76
    ld1             {v7.h}[0], [x3]
      add             x20, x2, #0x78
    ld1             {v4.h}[1], [x9]
      add             x12, x2, #0x62
    ld1             {v5.h}[1], [x15]
      add             x13, x2, #0x1e
    ld1             {v6.h}[1], [x19]
      add             x14, x2, #0x68
    ld1             {v7.h}[1], [x20]
      add             x3, x2, #0x7a
    ld1             {v4.h}[2], [x12]
      add             x9, x2, #0x70
    ld1             {v5.h}[2], [x13]
      add             x15, x2, #0x2e
    ld1             {v6.h}[2], [x14]
      add             x19, x2, #0x5a
    ld1             {v7.h}[2], [x3]
      add             x20, x2, #0x6c
    ld1             {v4.h}[3], [x9]
      add             x12, x2, #0x72
    ld1             {v5.h}[3], [x15]
      add             x13, x2, #0x3c
    ld1             {v6.h}[3], [x19]
      add             x14, x2, #0x4c
    ld1             {v7.h}[3], [x20]
      add             x3, x2, #0x5e
    ld1             {v4.h}[4], [x12]
      add             x9, x2, #0x64
    ld1             {v5.h}[4], [x13]
      add             x15, x2, #0x4a
    ld1             {v6.h}[4], [x14]
      add             x19, x2, #0x3e
    ld1             {v7.h}[4], [x3]
      add             x20, x2, #0x6e
    ld1             {v4.h}[5], [x9]
      add             x12, x2, #0x56
    ld1             {v5.h}[5], [x15]
      add             x13, x2, #0x58
    ld1             {v6.h}[5], [x19]
      add             x14, x2, #0x4e
    ld1             {v7.h}[5], [x20]
      add             x3, x2, #0x7c
    ld1             {v4.h}[6], [x12]
      add             x9, x2, #0x48
    ld1             {v5.h}[6], [x13]
      add             x15, x2, #0x66
    ld1             {v6.h}[6], [x14]
      add             x19, x2, #0x5c
    ld1             {v7.h}[6], [x3]
      add             x20, x2, #0x7e
    ld1             {v4.h}[7], [x9]
    ld1             {v5.h}[7], [x15]
    ld1             {v6.h}[7], [x19]
    ld1             {v7.h}[7], [x20]
.endif
    /* v24-v31 = sign mask ^ abs(value): abs(value) for non-negative inputs,
     * its bitwise complement for negative ones.  v0-v7 = abs(value). */
    cmlt            v24.8h, v0.8h, #0
    cmlt            v25.8h, v1.8h, #0
    cmlt            v26.8h, v2.8h, #0
    cmlt            v27.8h, v3.8h, #0
    cmlt            v28.8h, v4.8h, #0
    cmlt            v29.8h, v5.8h, #0
    cmlt            v30.8h, v6.8h, #0
    cmlt            v31.8h, v7.8h, #0
    abs             v0.8h, v0.8h
    abs             v1.8h, v1.8h
    abs             v2.8h, v2.8h
    abs             v3.8h, v3.8h
    abs             v4.8h, v4.8h
    abs             v5.8h, v5.8h
    abs             v6.8h, v6.8h
    abs             v7.8h, v7.8h
    eor             v24.16b, v24.16b, v0.16b
    eor             v25.16b, v25.16b, v1.16b
    eor             v26.16b, v26.16b, v2.16b
    eor             v27.16b, v27.16b, v3.16b
    eor             v28.16b, v28.16b, v4.16b
    eor             v29.16b, v29.16b, v5.16b
    eor             v30.16b, v30.16b, v6.16b
    eor             v31.16b, v31.16b, v7.16b
    /* Build a 64-bit map with one bit per coefficient, set where the
     * coefficient is zero.  The scalar instructions (extra indent) encode
     * the DC difference while the vector pipeline builds the map. */
    cmeq            v16.8h, v0.8h, #0
    cmeq            v17.8h, v1.8h, #0
    cmeq            v18.8h, v2.8h, #0
    cmeq            v19.8h, v3.8h, #0
    cmeq            v20.8h, v4.8h, #0
    cmeq            v21.8h, v5.8h, #0
    cmeq            v22.8h, v6.8h, #0
    xtn             v16.8b, v16.8h
    xtn             v18.8b, v18.8h
    xtn             v20.8b, v20.8h
    xtn             v22.8b, v22.8h
      umov            w14, v0.h[0]
    xtn2            v16.16b, v17.8h
      umov            w13, v24.h[0]
    xtn2            v18.16b, v19.8h
      clz             w14, w14
    xtn2            v20.16b, v21.8h
      lsl             w13, w13, w14
    cmeq            v17.8h, v7.8h, #0
      sub             w12, w14, #32
    xtn2            v22.16b, v17.8h
      lsr             w13, w13, w14            /* keep only the low nbits bits */
    and             v16.16b, v16.16b, v23.16b
      neg             w12, w12                 /* w12 = nbits = 32 - clz */
    and             v18.16b, v18.16b, v23.16b
      add             x3, x4, #0x400           /* r1 = dctbl->ehufsi */
    and             v20.16b, v20.16b, v23.16b
      add             x15, sp, #0x90           /* x15 = t2 */
    and             v22.16b, v22.16b, v23.16b
      ldr             w10, [x4, x12, lsl #2]
    addp            v16.16b, v16.16b, v18.16b
      ldrb            w11, [x3, x12]
    addp            v20.16b, v20.16b, v22.16b
      checkbuf47
    addp            v16.16b, v16.16b, v20.16b
      put_bits        x10, x11
    /* Only the low 64 bits of this result are consumed below */
    addp            v16.16b, v16.16b, v18.16b
      checkbuf47
    umov            x9, v16.D[0]
      put_bits        x13, x12
    cnt             v17.8b, v16.8b
      mvn             x9, x9           /* set bits now mark nonzero coefficients */
    addv            B18, v17.8b
      add             x4, x5, #0x400   /* x4 = actbl->ehufsi */
    umov            w12, v18.b[0]      /* w12 = number of zero coefficients */
      lsr             x9, x9, #0x1     /* clear AC coeff */
    ldr             w13, [x5, #0x3c0]  /* x13 = actbl->ehufco[0xf0] */
    rbit            x9, x9             /* x9 = index0 */
    ldrb            w14, [x4, #0xf0]   /* x14 = actbl->ehufsi[0xf0] */
    cmp             w12, #(64-8)
    add             x11, sp, #16       /* x11 = t1 */
    b.lt            4f                 /* fewer than 56 zeros: vector path */
    cbz             x9, 6f             /* no nonzero AC coefficients */
    /* Sparse block: store the raw values and compute each bit count with
     * scalar code as the coefficient is visited. */
    st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
    st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
    st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
1:
    clz             x2, x9                  /* x2 = run of zero coefficients */
    add             x15, x15, x2, lsl #1
    lsl             x9, x9, x2
    ldrh            w20, [x15, #-126]       /* t1 entry of the next nonzero coef */
2:
    cmp             x2, #0x10
    b.lt            3f
    sub             x2, x2, #0x10
    checkbuf47
    put_bits        x13, x14                /* code for a run of 16 zeros */
    b               2b
3:
    clz             w20, w20
    ldrh            w3, [x15, #2]!          /* t2 entry; x15 steps to this coef */
    sub             w11, w20, #32
    lsl             w3, w3, w20
    neg             w11, w11                /* w11 = nbits */
    lsr             w3, w3, w20             /* keep only the low nbits bits */
    add             x2, x11, x2, lsl #4     /* symbol = (run << 4) + nbits */
    lsl             x9, x9, #0x1
    ldr             w12, [x5, x2, lsl #2]
    ldrb            w10, [x4, x2]
    checkbuf31
    put_bits        x12, x10
    put_bits        x3, x11
    cbnz            x9, 1b
    b               6f
4:
    /* Dense block: compute every bit count (16 - clz) and mask every value
     * with vector code before the emit loop starts. */
    movi            v21.8h, #0x0010
    clz             v0.8h, v0.8h
    clz             v1.8h, v1.8h
    clz             v2.8h, v2.8h
    clz             v3.8h, v3.8h
    clz             v4.8h, v4.8h
    clz             v5.8h, v5.8h
    clz             v6.8h, v6.8h
    clz             v7.8h, v7.8h
    ushl            v24.8h, v24.8h, v0.8h
    ushl            v25.8h, v25.8h, v1.8h
    ushl            v26.8h, v26.8h, v2.8h
    ushl            v27.8h, v27.8h, v3.8h
    ushl            v28.8h, v28.8h, v4.8h
    ushl            v29.8h, v29.8h, v5.8h
    ushl            v30.8h, v30.8h, v6.8h
    ushl            v31.8h, v31.8h, v7.8h
    neg             v0.8h, v0.8h
    neg             v1.8h, v1.8h
    neg             v2.8h, v2.8h
    neg             v3.8h, v3.8h
    neg             v4.8h, v4.8h
    neg             v5.8h, v5.8h
    neg             v6.8h, v6.8h
    neg             v7.8h, v7.8h
    ushl            v24.8h, v24.8h, v0.8h
    ushl            v25.8h, v25.8h, v1.8h
    ushl            v26.8h, v26.8h, v2.8h
    ushl            v27.8h, v27.8h, v3.8h
    ushl            v28.8h, v28.8h, v4.8h
    ushl            v29.8h, v29.8h, v5.8h
    ushl            v30.8h, v30.8h, v6.8h
    ushl            v31.8h, v31.8h, v7.8h
    add             v0.8h, v21.8h, v0.8h
    add             v1.8h, v21.8h, v1.8h
    add             v2.8h, v21.8h, v2.8h
    add             v3.8h, v21.8h, v3.8h
    add             v4.8h, v21.8h, v4.8h
    add             v5.8h, v21.8h, v5.8h
    add             v6.8h, v21.8h, v6.8h
    add             v7.8h, v21.8h, v7.8h
    st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
    st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
    st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
1:
    clz             x2, x9                  /* x2 = run of zero coefficients */
    add             x15, x15, x2, lsl #1
    lsl             x9, x9, x2
    ldrh            w11, [x15, #-126]       /* precomputed nbits from t1 */
2:
    cmp             x2, #0x10
    b.lt            3f
    sub             x2, x2, #0x10
    checkbuf47
    put_bits        x13, x14                /* code for a run of 16 zeros */
    b               2b
3:
    ldrh            w3, [x15, #2]!          /* masked value from t2 */
    add             x2, x11, x2, lsl #4     /* symbol = (run << 4) + nbits */
    lsl             x9, x9, #0x1
    ldr             w12, [x5, x2, lsl #2]
    ldrb            w10, [x4, x2]
    checkbuf31
    put_bits        x12, x10
    put_bits        x3, x11
    cbnz            x9, 1b
6:
    /* Unless x15 reached the last element of t2 (sp + 0x90 + 63 * 2), the
     * block ends in zeros, so emit actbl entry 0. */
    add             x13, sp, #0x10e
    cmp             x15, x13
    b.hs            1f
    ldr             w12, [x5]
    ldrb            w14, [x4]
    checkbuf47
    put_bits        x12, x14
1:
    str             PUT_BUFFER, [x0, #0x10]
    str             PUT_BITSw, [x0, #0x18]
    ldp             x19, x20, [sp], 16
    add             x0, BUFFER, #0x1        /* return the next free byte */
    add             sp, sp, 256
    /* ret (rather than br x30) marks this branch as a subroutine return */
    ret

.endm
3420
/* Emit both encoder variants: table-lookup zigzag (1) and per-lane loads (0) */
generate_jsimd_huff_encode_one_block 1
generate_jsimd_huff_encode_one_block 0

/* Drop the helper macros so that their names cannot leak into later code */
.purgem checkbuf47
.purgem checkbuf31
.purgem put_bits
.purgem emit_byte

/* Release the register aliases used by the encoder */
    .unreq          PUT_BITSw
    .unreq          PUT_BITS
    .unreq          PUT_BUFFER
    .unreq          BUFFER
3433