1/*
2 * ARMv8 NEON optimizations for libjpeg-turbo
3 *
4 * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
5 * All Rights Reserved.
6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
7 * Copyright (C) 2013-2014, Linaro Limited.  All Rights Reserved.
8 * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
9 * Copyright (C) 2014-2016, D. R. Commander.  All Rights Reserved.
10 * Copyright (C) 2015-2016, Matthieu Darbois.  All Rights Reserved.
11 * Copyright (C) 2016, Siarhei Siamashka.  All Rights Reserved.
12 *
13 * This software is provided 'as-is', without any express or implied
14 * warranty.  In no event will the authors be held liable for any damages
15 * arising from the use of this software.
16 *
17 * Permission is granted to anyone to use this software for any purpose,
18 * including commercial applications, and to alter it and redistribute it
19 * freely, subject to the following restrictions:
20 *
21 * 1. The origin of this software must not be misrepresented; you must not
22 *    claim that you wrote the original software. If you use this software
23 *    in a product, an acknowledgment in the product documentation would be
24 *    appreciated but is not required.
25 * 2. Altered source versions must be plainly marked as such, and must not be
26 *    misrepresented as being the original software.
27 * 3. This notice may not be removed or altered from any source distribution.
28 */
29
30#if defined(__linux__) && defined(__ELF__)
31.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
32#endif
33
34.text
35
36
37#define RESPECT_STRICT_ALIGNMENT 1
38
39
40/*****************************************************************************/
41
42/* Supplementary macro for setting function attributes */
43.macro asm_function fname
44#ifdef __APPLE__
45    .globl _\fname
46_\fname:
47#else
48    .global \fname
49#ifdef __ELF__
50    .hidden \fname
51    .type \fname, %function
52#endif
53\fname:
54#endif
55.endm
56
57/* Transpose elements of single 128 bit registers */
58.macro transpose_single x0, x1, xi, xilen, literal
59    ins             \xi\xilen[0], \x0\xilen[0]
60    ins             \x1\xilen[0], \x0\xilen[1]
61    trn1            \x0\literal, \x0\literal, \x1\literal
62    trn2            \x1\literal, \xi\literal, \x1\literal
63.endm
64
65/* Transpose elements of 2 differnet registers */
66.macro transpose x0, x1, xi, xilen, literal
67    mov             \xi\xilen, \x0\xilen
68    trn1            \x0\literal, \x0\literal, \x1\literal
69    trn2            \x1\literal, \xi\literal, \x1\literal
70.endm
71
72/* Transpose a block of 4x4 coefficients in four 64-bit registers */
73.macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
74    mov             \xi\xilen, \x0\xilen
75    trn1            \x0\x0len, \x0\x0len, \x2\x2len
76    trn2            \x2\x2len, \xi\x0len, \x2\x2len
77    mov             \xi\xilen, \x1\xilen
78    trn1            \x1\x1len, \x1\x1len, \x3\x3len
79    trn2            \x3\x3len, \xi\x1len, \x3\x3len
80.endm
81
82.macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
83    mov             \xi\xilen, \x0\xilen
84    trn1            \x0\x0len, \x0\x0len, \x1\x1len
85    trn2            \x1\x2len, \xi\x0len, \x1\x2len
86    mov             \xi\xilen, \x2\xilen
87    trn1            \x2\x2len, \x2\x2len, \x3\x3len
88    trn2            \x3\x2len, \xi\x1len, \x3\x3len
89.endm
90
91.macro transpose_4x4 x0, x1, x2, x3, x5
92    transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b
93    transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b
94.endm
95
96.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
97    trn1            \t0\().8h, \l0\().8h, \l1\().8h
98    trn1            \t1\().8h, \l2\().8h, \l3\().8h
99    trn1            \t2\().8h, \l4\().8h, \l5\().8h
100    trn1            \t3\().8h, \l6\().8h, \l7\().8h
101    trn2            \l1\().8h, \l0\().8h, \l1\().8h
102    trn2            \l3\().8h, \l2\().8h, \l3\().8h
103    trn2            \l5\().8h, \l4\().8h, \l5\().8h
104    trn2            \l7\().8h, \l6\().8h, \l7\().8h
105
106    trn1            \l4\().4s, \t2\().4s, \t3\().4s
107    trn2            \t3\().4s, \t2\().4s, \t3\().4s
108    trn1            \t2\().4s, \t0\().4s, \t1\().4s
109    trn2            \l2\().4s, \t0\().4s, \t1\().4s
110    trn1            \t0\().4s, \l1\().4s, \l3\().4s
111    trn2            \l3\().4s, \l1\().4s, \l3\().4s
112    trn2            \t1\().4s, \l5\().4s, \l7\().4s
113    trn1            \l5\().4s, \l5\().4s, \l7\().4s
114
115    trn2            \l6\().2d, \l2\().2d, \t3\().2d
116    trn1            \l0\().2d, \t2\().2d, \l4\().2d
117    trn1            \l1\().2d, \t0\().2d, \l5\().2d
118    trn2            \l7\().2d, \l3\().2d, \t1\().2d
119    trn1            \l2\().2d, \l2\().2d, \t3\().2d
120    trn2            \l4\().2d, \t2\().2d, \l4\().2d
121    trn1            \l3\().2d, \l3\().2d, \t1\().2d
122    trn2            \l5\().2d, \t0\().2d, \l5\().2d
123.endm
124
125
126#define CENTERJSAMPLE 128
127
128/*****************************************************************************/
129
130/*
131 * Perform dequantization and inverse DCT on one block of coefficients.
132 *
133 * GLOBAL(void)
134 * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block,
135 *                        JSAMPARRAY output_buf, JDIMENSION output_col)
136 */
137
138#define CONST_BITS 13
139#define PASS1_BITS 2
140
141#define F_0_298  2446  /* FIX(0.298631336) */
142#define F_0_390  3196  /* FIX(0.390180644) */
143#define F_0_541  4433  /* FIX(0.541196100) */
144#define F_0_765  6270  /* FIX(0.765366865) */
145#define F_0_899  7373  /* FIX(0.899976223) */
146#define F_1_175  9633  /* FIX(1.175875602) */
147#define F_1_501 12299  /* FIX(1.501321110) */
148#define F_1_847 15137  /* FIX(1.847759065) */
149#define F_1_961 16069  /* FIX(1.961570560) */
150#define F_2_053 16819  /* FIX(2.053119869) */
151#define F_2_562 20995  /* FIX(2.562915447) */
152#define F_3_072 25172  /* FIX(3.072711026) */
153
154.balign 16
155Ljsimd_idct_islow_neon_consts:
156  .short F_0_298
157  .short -F_0_390
158  .short F_0_541
159  .short F_0_765
160  .short - F_0_899
161  .short F_1_175
162  .short F_1_501
163  .short - F_1_847
164  .short - F_1_961
165  .short F_2_053
166  .short - F_2_562
167  .short F_3_072
168  .short 0          /* padding */
169  .short 0
170  .short 0
171  .short 0
172
173#undef F_0_298
174#undef F_0_390
175#undef F_0_541
176#undef F_0_765
177#undef F_0_899
178#undef F_1_175
179#undef F_1_501
180#undef F_1_847
181#undef F_1_961
182#undef F_2_053
183#undef F_2_562
184#undef F_3_072
185
186#define XFIX_P_0_298 v0.h[0]
187#define XFIX_N_0_390 v0.h[1]
188#define XFIX_P_0_541 v0.h[2]
189#define XFIX_P_0_765 v0.h[3]
190#define XFIX_N_0_899 v0.h[4]
191#define XFIX_P_1_175 v0.h[5]
192#define XFIX_P_1_501 v0.h[6]
193#define XFIX_N_1_847 v0.h[7]
194#define XFIX_N_1_961 v1.h[0]
195#define XFIX_P_2_053 v1.h[1]
196#define XFIX_N_2_562 v1.h[2]
197#define XFIX_P_3_072 v1.h[3]
198
199asm_function jsimd_idct_islow_neon
200    DCT_TABLE       .req x0
201    COEF_BLOCK      .req x1
202    OUTPUT_BUF      .req x2
203    OUTPUT_COL      .req x3
204    TMP1            .req x0
205    TMP2            .req x1
206    TMP3            .req x9
207    TMP4            .req x10
208    TMP5            .req x11
209    TMP6            .req x12
210    TMP7            .req x13
211    TMP8            .req x14
212
213    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
214       guarantee that the upper (unused) 32 bits of x3 are valid.  This
215       instruction ensures that those bits are set to zero. */
216    uxtw x3, w3
217
218    sub             sp, sp, #64
219    adr             x15, Ljsimd_idct_islow_neon_consts
220    mov             x10, sp
221    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
222    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
223    ld1             {v0.8h, v1.8h}, [x15]
224    ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
225    ld1             {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
226    ld1             {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64
227    ld1             {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64
228
229    cmeq            v16.8h, v3.8h, #0
230    cmeq            v26.8h, v4.8h, #0
231    cmeq            v27.8h, v5.8h, #0
232    cmeq            v28.8h, v6.8h, #0
233    cmeq            v29.8h, v7.8h, #0
234    cmeq            v30.8h, v8.8h, #0
235    cmeq            v31.8h, v9.8h, #0
236
237    and             v10.16b, v16.16b, v26.16b
238    and             v11.16b, v27.16b, v28.16b
239    and             v12.16b, v29.16b, v30.16b
240    and             v13.16b, v31.16b, v10.16b
241    and             v14.16b, v11.16b, v12.16b
242    mul             v2.8h, v2.8h, v18.8h
243    and             v15.16b, v13.16b, v14.16b
244    shl             v10.8h, v2.8h, #(PASS1_BITS)
245    sqxtn           v16.8b, v15.8h
246    mov             TMP1, v16.d[0]
247    mvn             TMP2, TMP1
248
249    cbnz            TMP2, 2f
250    /* case all AC coeffs are zeros */
251    dup             v2.2d, v10.d[0]
252    dup             v6.2d, v10.d[1]
253    mov             v3.16b, v2.16b
254    mov             v7.16b, v6.16b
255    mov             v4.16b, v2.16b
256    mov             v8.16b, v6.16b
257    mov             v5.16b, v2.16b
258    mov             v9.16b, v6.16b
2591:
260    /* for this transpose, we should organise data like this:
261     * 00, 01, 02, 03, 40, 41, 42, 43
262     * 10, 11, 12, 13, 50, 51, 52, 53
263     * 20, 21, 22, 23, 60, 61, 62, 63
264     * 30, 31, 32, 33, 70, 71, 72, 73
265     * 04, 05, 06, 07, 44, 45, 46, 47
266     * 14, 15, 16, 17, 54, 55, 56, 57
267     * 24, 25, 26, 27, 64, 65, 66, 67
268     * 34, 35, 36, 37, 74, 75, 76, 77
269     */
270    trn1            v28.8h, v2.8h, v3.8h
271    trn1            v29.8h, v4.8h, v5.8h
272    trn1            v30.8h, v6.8h, v7.8h
273    trn1            v31.8h, v8.8h, v9.8h
274    trn2            v16.8h, v2.8h, v3.8h
275    trn2            v17.8h, v4.8h, v5.8h
276    trn2            v18.8h, v6.8h, v7.8h
277    trn2            v19.8h, v8.8h, v9.8h
278    trn1            v2.4s, v28.4s, v29.4s
279    trn1            v6.4s, v30.4s, v31.4s
280    trn1            v3.4s, v16.4s, v17.4s
281    trn1            v7.4s, v18.4s, v19.4s
282    trn2            v4.4s, v28.4s, v29.4s
283    trn2            v8.4s, v30.4s, v31.4s
284    trn2            v5.4s, v16.4s, v17.4s
285    trn2            v9.4s, v18.4s, v19.4s
286    /* Even part: reverse the even part of the forward DCT. */
287    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
288    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
289    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
290    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
291    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
292    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
293    mov             v21.16b, v19.16b               /* tmp3 = z1 */
294    mov             v20.16b, v18.16b               /* tmp3 = z1 */
295    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
296    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
297    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
298    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
299    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
300    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
301    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
302    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
303    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
304    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
305    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
306    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
307    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
308    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
309    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
310
311    /* Odd part per figure 8; the matrix is unitary and hence its
312     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
313     */
314
315    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
316    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
317    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
318    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
319    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
320
321    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
322    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
323    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
324    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
325    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
326    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
327    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
328    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
329    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
330
331    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
332    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
333    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
334    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
335    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
336    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
337    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
338    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
339    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
340
341    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
342    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
343    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
344    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
345
346    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
347    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
348    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
349    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
350    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
351    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
352    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
353    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
354
355    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
356    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
357    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
358    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
359    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
360    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
361    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
362    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
363
364    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
365
366    add             v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
367    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
368    sub             v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
369    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
370    add             v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
371    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
372    sub             v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
373    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
374    add             v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
375    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
376    sub             v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
377    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
378    add             v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
379    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
380    sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
381    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
382
383    shrn            v2.4h, v18.4s, #16  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
384    shrn            v9.4h, v20.4s, #16  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
385    shrn            v3.4h, v22.4s, #16  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
386    shrn            v8.4h, v24.4s, #16  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
387    shrn            v4.4h, v26.4s, #16  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
388    shrn            v7.4h, v28.4s, #16  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
389    shrn            v5.4h, v14.4s, #16  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
390    shrn            v6.4h, v16.4s, #16  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
391    shrn2           v2.8h, v19.4s, #16  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
392    shrn2           v9.8h, v21.4s, #16  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
393    shrn2           v3.8h, v23.4s, #16  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
394    shrn2           v8.8h, v25.4s, #16  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
395    shrn2           v4.8h, v27.4s, #16  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
396    shrn2           v7.8h, v29.4s, #16  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
397    shrn2           v5.8h, v15.4s, #16  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
398    shrn2           v6.8h, v17.4s, #16  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
399    movi            v0.16b, #(CENTERJSAMPLE)
400    /* Prepare pointers (dual-issue with NEON instructions) */
401      ldp             TMP1, TMP2, [OUTPUT_BUF], 16
402    sqrshrn         v28.8b, v2.8h, #(CONST_BITS+PASS1_BITS+3-16)
403      ldp             TMP3, TMP4, [OUTPUT_BUF], 16
404    sqrshrn         v29.8b, v3.8h, #(CONST_BITS+PASS1_BITS+3-16)
405      add             TMP1, TMP1, OUTPUT_COL
406    sqrshrn         v30.8b, v4.8h, #(CONST_BITS+PASS1_BITS+3-16)
407      add             TMP2, TMP2, OUTPUT_COL
408    sqrshrn         v31.8b, v5.8h, #(CONST_BITS+PASS1_BITS+3-16)
409      add             TMP3, TMP3, OUTPUT_COL
410    sqrshrn2        v28.16b, v6.8h, #(CONST_BITS+PASS1_BITS+3-16)
411      add             TMP4, TMP4, OUTPUT_COL
412    sqrshrn2        v29.16b, v7.8h, #(CONST_BITS+PASS1_BITS+3-16)
413      ldp             TMP5, TMP6, [OUTPUT_BUF], 16
414    sqrshrn2        v30.16b, v8.8h, #(CONST_BITS+PASS1_BITS+3-16)
415      ldp             TMP7, TMP8, [OUTPUT_BUF], 16
416    sqrshrn2        v31.16b, v9.8h, #(CONST_BITS+PASS1_BITS+3-16)
417      add             TMP5, TMP5, OUTPUT_COL
418    add             v16.16b, v28.16b, v0.16b
419      add             TMP6, TMP6, OUTPUT_COL
420    add             v18.16b, v29.16b, v0.16b
421      add             TMP7, TMP7, OUTPUT_COL
422    add             v20.16b, v30.16b, v0.16b
423      add             TMP8, TMP8, OUTPUT_COL
424    add             v22.16b, v31.16b, v0.16b
425
426    /* Transpose the final 8-bit samples */
427    trn1            v28.16b, v16.16b, v18.16b
428    trn1            v30.16b, v20.16b, v22.16b
429    trn2            v29.16b, v16.16b, v18.16b
430    trn2            v31.16b, v20.16b, v22.16b
431
432    trn1            v16.8h, v28.8h, v30.8h
433    trn2            v18.8h, v28.8h, v30.8h
434    trn1            v20.8h, v29.8h, v31.8h
435    trn2            v22.8h, v29.8h, v31.8h
436
437    uzp1            v28.4s, v16.4s, v18.4s
438    uzp2            v30.4s, v16.4s, v18.4s
439    uzp1            v29.4s, v20.4s, v22.4s
440    uzp2            v31.4s, v20.4s, v22.4s
441
442    /* Store results to the output buffer */
443    st1             {v28.d}[0], [TMP1]
444    st1             {v29.d}[0], [TMP2]
445    st1             {v28.d}[1], [TMP3]
446    st1             {v29.d}[1], [TMP4]
447    st1             {v30.d}[0], [TMP5]
448    st1             {v31.d}[0], [TMP6]
449    st1             {v30.d}[1], [TMP7]
450    st1             {v31.d}[1], [TMP8]
451    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
452    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
453    blr             x30
454
455.balign 16
4562:
457    mul             v3.8h, v3.8h, v19.8h
458    mul             v4.8h, v4.8h, v20.8h
459    mul             v5.8h, v5.8h, v21.8h
460    add             TMP4, xzr, TMP2, LSL #32
461    mul             v6.8h, v6.8h, v22.8h
462    mul             v7.8h, v7.8h, v23.8h
463    adds            TMP3, xzr, TMP2, LSR #32
464    mul             v8.8h, v8.8h, v24.8h
465    mul             v9.8h, v9.8h, v25.8h
466    b.ne            3f
467    /* Right AC coef is zero */
468    dup             v15.2d, v10.d[1]
469    /* Even part: reverse the even part of the forward DCT. */
470    add             v18.4h, v4.4h, v8.4h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
471    add             v22.4h, v2.4h, v6.4h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
472    sub             v26.4h, v2.4h, v6.4h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
473    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
474    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
475    mov             v20.16b, v18.16b               /* tmp3 = z1 */
476    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
477    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
478    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
479    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
480    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
481    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
482    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
483
484    /* Odd part per figure 8; the matrix is unitary and hence its
485     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
486     */
487
488    add             v22.4h, v9.4h, v5.4h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
489    add             v24.4h, v7.4h, v3.4h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
490    add             v18.4h, v9.4h, v3.4h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
491    add             v20.4h, v7.4h, v5.4h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
492    add             v26.4h, v22.4h, v24.4h  /* z5 = z3 + z4 */
493
494    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
495    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
496    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
497    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
498    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
499    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
500    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
501    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
502    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
503
504    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
505    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
506
507    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
508    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
509    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
510    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
511
512    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
513    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
514    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
515    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
516
517    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
518
519    add             v18.4s, v2.4s, v16.4s  /* tmp10 + tmp3 */
520    sub             v20.4s, v2.4s, v16.4s  /* tmp10 - tmp3 */
521    add             v22.4s, v8.4s, v14.4s  /* tmp11 + tmp2 */
522    sub             v24.4s, v8.4s, v14.4s  /* tmp11 - tmp2 */
523    add             v26.4s, v4.4s, v12.4s  /* tmp12 + tmp1 */
524    sub             v28.4s, v4.4s, v12.4s  /* tmp12 - tmp1 */
525    add             v14.4s, v6.4s, v10.4s  /* tmp13 + tmp0 */
526    sub             v16.4s, v6.4s, v10.4s  /* tmp13 - tmp0 */
527
528    rshrn           v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
529    rshrn           v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
530    rshrn           v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
531    rshrn           v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
532    rshrn2          v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
533    rshrn2          v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
534    rshrn2          v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
535    rshrn2          v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
536    mov             v6.16b, v15.16b
537    mov             v7.16b, v15.16b
538    mov             v8.16b, v15.16b
539    mov             v9.16b, v15.16b
540    b               1b
541
542.balign 16
5433:
544    cbnz            TMP4, 4f
545    /* Left AC coef is zero */
546    dup             v14.2d, v10.d[0]
547    /* Even part: reverse the even part of the forward DCT. */
548    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
549    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
550    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
551    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
552    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
553    mov             v21.16b, v19.16b               /* tmp3 = z1 */
554    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
555    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
556    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
557    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
558    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
559    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
560    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
561
562    /* Odd part per figure 8; the matrix is unitary and hence its
563     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
564     */
565
566    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
567    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
568    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
569    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
570    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
571
572    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
573    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
574    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
575    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
576    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
577    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
578    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
579    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
580    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
581
582    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
583    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
584    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
585    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
586
587    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
588    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
589    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
590    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
591
592    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
593    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
594    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
595    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
596
597    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
598
599    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
600    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
601    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
602    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
603    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
604    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
605    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
606    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
607
608    mov             v2.16b, v14.16b
609    mov             v3.16b, v14.16b
610    mov             v4.16b, v14.16b
611    mov             v5.16b, v14.16b
612    rshrn           v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
613    rshrn           v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
614    rshrn           v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
615    rshrn           v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
616    rshrn2          v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
617    rshrn2          v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
618    rshrn2          v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
619    rshrn2          v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
620    b               1b
621
622.balign 16
6234:
624    /* "No" AC coef is zero */
625    /* Even part: reverse the even part of the forward DCT. */
626    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
627    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
628    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
629    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
630    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
631    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
632    mov             v21.16b, v19.16b               /* tmp3 = z1 */
633    mov             v20.16b, v18.16b               /* tmp3 = z1 */
634    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
635    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
636    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
637    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
638    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
639    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
640    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
641    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
642    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
643    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
644    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
645    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
646    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
647    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
648    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
649
650    /* Odd part per figure 8; the matrix is unitary and hence its
651     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
652     */
653
654    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
655    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
656    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
657    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
658    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
659
660    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
661    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
662    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
663    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
664    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
665    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
666    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
667    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
668    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
669
670    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
671    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
672    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
673    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
674    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
675    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
676    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
677    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
678    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
679
680    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
681    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
682    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
683    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
684
685    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
686    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
687    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
688    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
689    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
690    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
691    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
692    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
693
694    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
695    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
696    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
697    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
698    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
699    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
700    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
701    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
702
703    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
704
705    add             v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
706    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
707    sub             v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
708    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
709    add             v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
710    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
711    sub             v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
712    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
713    add             v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
714    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
715    sub             v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
716    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
717    add             v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
718    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
719    sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
720    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
721
722    rshrn           v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
723    rshrn           v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
724    rshrn           v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
725    rshrn           v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
726    rshrn           v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
727    rshrn           v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
728    rshrn           v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
729    rshrn           v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
730    rshrn2          v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
731    rshrn2          v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
732    rshrn2          v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
733    rshrn2          v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
734    rshrn2          v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
735    rshrn2          v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
736    rshrn2          v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
737    rshrn2          v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
738    b               1b
739
740    .unreq          DCT_TABLE
741    .unreq          COEF_BLOCK
742    .unreq          OUTPUT_BUF
743    .unreq          OUTPUT_COL
744    .unreq          TMP1
745    .unreq          TMP2
746    .unreq          TMP3
747    .unreq          TMP4
748    .unreq          TMP5
749    .unreq          TMP6
750    .unreq          TMP7
751    .unreq          TMP8
752
753#undef CENTERJSAMPLE
754#undef CONST_BITS
755#undef PASS1_BITS
756#undef XFIX_P_0_298
757#undef XFIX_N_0_390
758#undef XFIX_P_0_541
759#undef XFIX_P_0_765
760#undef XFIX_N_0_899
761#undef XFIX_P_1_175
762#undef XFIX_P_1_501
763#undef XFIX_N_1_847
764#undef XFIX_N_1_961
765#undef XFIX_P_2_053
766#undef XFIX_N_2_562
767#undef XFIX_P_3_072
768
769
770/*****************************************************************************/
771
772/*
773 * jsimd_idct_ifast_neon
774 *
775 * This function contains a fast, not so accurate integer implementation of
776 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
777 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
778 * function from jidctfst.c
779 *
780 * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
781 * But in ARM NEON case some extra additions are required because VQDMULH
782 * instruction can't handle the constants larger than 1. So the expressions
783 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
784 * which introduces an extra addition. Overall, there are 6 extra additions
785 * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
786 */
787
788#define XFIX_1_082392200 v0.h[0]
789#define XFIX_1_414213562 v0.h[1]
790#define XFIX_1_847759065 v0.h[2]
791#define XFIX_2_613125930 v0.h[3]
792
793.balign 16
794Ljsimd_idct_ifast_neon_consts:
795  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
796  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
797  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
798  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */
799
800asm_function jsimd_idct_ifast_neon
801
802    DCT_TABLE       .req x0
803    COEF_BLOCK      .req x1
804    OUTPUT_BUF      .req x2
805    OUTPUT_COL      .req x3
806    TMP1            .req x0
807    TMP2            .req x1
808    TMP3            .req x9
809    TMP4            .req x10
810    TMP5            .req x11
811    TMP6            .req x12
812    TMP7            .req x13
813    TMP8            .req x14
814
815    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
816       guarantee that the upper (unused) 32 bits of x3 are valid.  This
817       instruction ensures that those bits are set to zero. */
818    uxtw x3, w3
819
820    /* Load and dequantize coefficients into NEON registers
821     * with the following allocation:
822     *       0 1 2 3 | 4 5 6 7
823     *      ---------+--------
824     *   0 | d16     | d17     ( v16.8h )
825     *   1 | d18     | d19     ( v17.8h )
826     *   2 | d20     | d21     ( v18.8h )
827     *   3 | d22     | d23     ( v19.8h )
828     *   4 | d24     | d25     ( v20.8h )
829     *   5 | d26     | d27     ( v21.8h )
830     *   6 | d28     | d29     ( v22.8h )
831     *   7 | d30     | d31     ( v23.8h )
832     */
833    /* Save NEON registers used in fast IDCT */
834    adr             TMP5, Ljsimd_idct_ifast_neon_consts
835    ld1             {v16.8h, v17.8h}, [COEF_BLOCK], 32
836    ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
837    ld1             {v18.8h, v19.8h}, [COEF_BLOCK], 32
838    mul             v16.8h, v16.8h, v0.8h
839    ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
840    mul             v17.8h, v17.8h, v1.8h
841    ld1             {v20.8h, v21.8h}, [COEF_BLOCK], 32
842    mul             v18.8h, v18.8h, v2.8h
843    ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
844    mul             v19.8h, v19.8h, v3.8h
845    ld1             {v22.8h, v23.8h}, [COEF_BLOCK], 32
846    mul             v20.8h, v20.8h, v0.8h
847    ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
848    mul             v22.8h, v22.8h, v2.8h
849    mul             v21.8h, v21.8h, v1.8h
850    ld1             {v0.4h}, [TMP5]        /* load constants */
851    mul             v23.8h, v23.8h, v3.8h
852
853    /* 1-D IDCT, pass 1 */
854    sub             v2.8h, v18.8h, v22.8h
855    add             v22.8h, v18.8h, v22.8h
856    sub             v1.8h, v19.8h, v21.8h
857    add             v21.8h, v19.8h, v21.8h
858    sub             v5.8h, v17.8h, v23.8h
859    add             v23.8h, v17.8h, v23.8h
860    sqdmulh         v4.8h, v2.8h, XFIX_1_414213562
861    sqdmulh         v6.8h, v1.8h, XFIX_2_613125930
862    add             v3.8h, v1.8h, v1.8h
863    sub             v1.8h, v5.8h, v1.8h
864    add             v18.8h, v2.8h, v4.8h
865    sqdmulh         v4.8h, v1.8h, XFIX_1_847759065
866    sub             v2.8h, v23.8h, v21.8h
867    add             v3.8h, v3.8h, v6.8h
868    sqdmulh         v6.8h, v2.8h, XFIX_1_414213562
869    add             v1.8h, v1.8h, v4.8h
870    sqdmulh         v4.8h, v5.8h, XFIX_1_082392200
871    sub             v18.8h, v18.8h, v22.8h
872    add             v2.8h, v2.8h, v6.8h
873    sub             v6.8h, v16.8h, v20.8h
874    add             v20.8h, v16.8h, v20.8h
875    add             v17.8h, v5.8h, v4.8h
876    add             v5.8h, v6.8h, v18.8h
877    sub             v18.8h, v6.8h, v18.8h
878    add             v6.8h, v23.8h, v21.8h
879    add             v16.8h, v20.8h, v22.8h
880    sub             v3.8h, v6.8h, v3.8h
881    sub             v20.8h, v20.8h, v22.8h
882    sub             v3.8h, v3.8h, v1.8h
883    sub             v1.8h, v17.8h, v1.8h
884    add             v2.8h, v3.8h, v2.8h
885    sub             v23.8h, v16.8h, v6.8h
886    add             v1.8h, v1.8h, v2.8h
887    add             v16.8h, v16.8h, v6.8h
888    add             v22.8h, v5.8h, v3.8h
889    sub             v17.8h, v5.8h, v3.8h
890    sub             v21.8h, v18.8h, v2.8h
891    add             v18.8h, v18.8h, v2.8h
892    sub             v19.8h, v20.8h, v1.8h
893    add             v20.8h, v20.8h, v1.8h
894    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30, v31
895    /* 1-D IDCT, pass 2 */
896    sub             v2.8h, v18.8h, v22.8h
897    add             v22.8h, v18.8h, v22.8h
898    sub             v1.8h, v19.8h, v21.8h
899    add             v21.8h, v19.8h, v21.8h
900    sub             v5.8h, v17.8h, v23.8h
901    add             v23.8h, v17.8h, v23.8h
902    sqdmulh         v4.8h, v2.8h, XFIX_1_414213562
903    sqdmulh         v6.8h, v1.8h, XFIX_2_613125930
904    add             v3.8h, v1.8h, v1.8h
905    sub             v1.8h, v5.8h, v1.8h
906    add             v18.8h, v2.8h, v4.8h
907    sqdmulh         v4.8h, v1.8h, XFIX_1_847759065
908    sub             v2.8h, v23.8h, v21.8h
909    add             v3.8h, v3.8h, v6.8h
910    sqdmulh         v6.8h, v2.8h, XFIX_1_414213562
911    add             v1.8h, v1.8h, v4.8h
912    sqdmulh         v4.8h, v5.8h, XFIX_1_082392200
913    sub             v18.8h, v18.8h, v22.8h
914    add             v2.8h, v2.8h, v6.8h
915    sub             v6.8h, v16.8h, v20.8h
916    add             v20.8h, v16.8h, v20.8h
917    add             v17.8h, v5.8h, v4.8h
918    add             v5.8h, v6.8h, v18.8h
919    sub             v18.8h, v6.8h, v18.8h
920    add             v6.8h, v23.8h, v21.8h
921    add             v16.8h, v20.8h, v22.8h
922    sub             v3.8h, v6.8h, v3.8h
923    sub             v20.8h, v20.8h, v22.8h
924    sub             v3.8h, v3.8h, v1.8h
925    sub             v1.8h, v17.8h, v1.8h
926    add             v2.8h, v3.8h, v2.8h
927    sub             v23.8h, v16.8h, v6.8h
928    add             v1.8h, v1.8h, v2.8h
929    add             v16.8h, v16.8h, v6.8h
930    add             v22.8h, v5.8h, v3.8h
931    sub             v17.8h, v5.8h, v3.8h
932    sub             v21.8h, v18.8h, v2.8h
933    add             v18.8h, v18.8h, v2.8h
934    sub             v19.8h, v20.8h, v1.8h
935    add             v20.8h, v20.8h, v1.8h
936    /* Descale to 8-bit and range limit */
937    movi            v0.16b, #0x80
938      /* Prepare pointers (dual-issue with NEON instructions) */
939      ldp             TMP1, TMP2, [OUTPUT_BUF], 16
940    sqshrn          v28.8b, v16.8h, #5
941      ldp             TMP3, TMP4, [OUTPUT_BUF], 16
942    sqshrn          v29.8b, v17.8h, #5
943      add             TMP1, TMP1, OUTPUT_COL
944    sqshrn          v30.8b, v18.8h, #5
945      add             TMP2, TMP2, OUTPUT_COL
946    sqshrn          v31.8b, v19.8h, #5
947      add             TMP3, TMP3, OUTPUT_COL
948    sqshrn2         v28.16b, v20.8h, #5
949      add             TMP4, TMP4, OUTPUT_COL
950    sqshrn2         v29.16b, v21.8h, #5
951      ldp             TMP5, TMP6, [OUTPUT_BUF], 16
952    sqshrn2         v30.16b, v22.8h, #5
953      ldp             TMP7, TMP8, [OUTPUT_BUF], 16
954    sqshrn2         v31.16b, v23.8h, #5
955      add             TMP5, TMP5, OUTPUT_COL
956    add             v16.16b, v28.16b, v0.16b
957      add             TMP6, TMP6, OUTPUT_COL
958    add             v18.16b, v29.16b, v0.16b
959      add             TMP7, TMP7, OUTPUT_COL
960    add             v20.16b, v30.16b, v0.16b
961      add             TMP8, TMP8, OUTPUT_COL
962    add             v22.16b, v31.16b, v0.16b
963
964    /* Transpose the final 8-bit samples */
965    trn1            v28.16b, v16.16b, v18.16b
966    trn1            v30.16b, v20.16b, v22.16b
967    trn2            v29.16b, v16.16b, v18.16b
968    trn2            v31.16b, v20.16b, v22.16b
969
970    trn1            v16.8h, v28.8h, v30.8h
971    trn2            v18.8h, v28.8h, v30.8h
972    trn1            v20.8h, v29.8h, v31.8h
973    trn2            v22.8h, v29.8h, v31.8h
974
975    uzp1            v28.4s, v16.4s, v18.4s
976    uzp2            v30.4s, v16.4s, v18.4s
977    uzp1            v29.4s, v20.4s, v22.4s
978    uzp2            v31.4s, v20.4s, v22.4s
979
980    /* Store results to the output buffer */
981    st1             {v28.d}[0], [TMP1]
982    st1             {v29.d}[0], [TMP2]
983    st1             {v28.d}[1], [TMP3]
984    st1             {v29.d}[1], [TMP4]
985    st1             {v30.d}[0], [TMP5]
986    st1             {v31.d}[0], [TMP6]
987    st1             {v30.d}[1], [TMP7]
988    st1             {v31.d}[1], [TMP8]
989    blr             x30
990
991    .unreq          DCT_TABLE
992    .unreq          COEF_BLOCK
993    .unreq          OUTPUT_BUF
994    .unreq          OUTPUT_COL
995    .unreq          TMP1
996    .unreq          TMP2
997    .unreq          TMP3
998    .unreq          TMP4
999    .unreq          TMP5
1000    .unreq          TMP6
1001    .unreq          TMP7
1002    .unreq          TMP8
1003
1004
1005/*****************************************************************************/
1006
1007/*
1008 * jsimd_idct_4x4_neon
1009 *
1010 * This function contains inverse-DCT code for getting reduced-size
1011 * 4x4 pixels output from an 8x8 DCT block. It uses the same  calculations
1012 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
1013 * function from jpeg-6b (jidctred.c).
1014 *
1015 * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
1016 *       requires much less arithmetic operations and hence should be faster.
1017 *       The primary purpose of this particular NEON optimized function is
1018 *       bit exact compatibility with jpeg-6b.
1019 *
1020 * TODO: a bit better instructions scheduling can be achieved by expanding
1021 *       idct_helper/transpose_4x4 macros and reordering instructions,
1022 *       but readability will suffer somewhat.
1023 */
1024
1025#define CONST_BITS  13
1026
1027#define FIX_0_211164243 (1730)   /* FIX(0.211164243) */
1028#define FIX_0_509795579 (4176)   /* FIX(0.509795579) */
1029#define FIX_0_601344887 (4926)   /* FIX(0.601344887) */
1030#define FIX_0_720959822 (5906)   /* FIX(0.720959822) */
1031#define FIX_0_765366865 (6270)   /* FIX(0.765366865) */
1032#define FIX_0_850430095 (6967)   /* FIX(0.850430095) */
1033#define FIX_0_899976223 (7373)   /* FIX(0.899976223) */
1034#define FIX_1_061594337 (8697)   /* FIX(1.061594337) */
1035#define FIX_1_272758580 (10426)  /* FIX(1.272758580) */
1036#define FIX_1_451774981 (11893)  /* FIX(1.451774981) */
1037#define FIX_1_847759065 (15137)  /* FIX(1.847759065) */
1038#define FIX_2_172734803 (17799)  /* FIX(2.172734803) */
1039#define FIX_2_562915447 (20995)  /* FIX(2.562915447) */
1040#define FIX_3_624509785 (29692)  /* FIX(3.624509785) */
1041
1042.balign 16
1043Ljsimd_idct_4x4_neon_consts:
1044  .short FIX_1_847759065      /* v0.h[0] */
1045  .short -FIX_0_765366865     /* v0.h[1] */
1046  .short -FIX_0_211164243     /* v0.h[2] */
1047  .short FIX_1_451774981      /* v0.h[3] */
1048  .short -FIX_2_172734803     /* d1[0] */
1049  .short FIX_1_061594337      /* d1[1] */
1050  .short -FIX_0_509795579     /* d1[2] */
1051  .short -FIX_0_601344887     /* d1[3] */
1052  .short FIX_0_899976223      /* v2.h[0] */
1053  .short FIX_2_562915447      /* v2.h[1] */
1054  .short 1 << (CONST_BITS+1)  /* v2.h[2] */
1055  .short 0                    /* v2.h[3] */
1056
1057.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
1058    smull           v28.4s, \x4, v2.h[2]
1059    smlal           v28.4s, \x8, v0.h[0]
1060    smlal           v28.4s, \x14, v0.h[1]
1061
1062    smull           v26.4s, \x16, v1.h[2]
1063    smlal           v26.4s, \x12, v1.h[3]
1064    smlal           v26.4s, \x10, v2.h[0]
1065    smlal           v26.4s, \x6, v2.h[1]
1066
1067    smull           v30.4s, \x4, v2.h[2]
1068    smlsl           v30.4s, \x8, v0.h[0]
1069    smlsl           v30.4s, \x14, v0.h[1]
1070
1071    smull           v24.4s, \x16, v0.h[2]
1072    smlal           v24.4s, \x12, v0.h[3]
1073    smlal           v24.4s, \x10, v1.h[0]
1074    smlal           v24.4s, \x6, v1.h[1]
1075
1076    add             v20.4s, v28.4s, v26.4s
1077    sub             v28.4s, v28.4s, v26.4s
1078
1079  .if \shift > 16
1080    srshr           v20.4s, v20.4s, #\shift
1081    srshr           v28.4s, v28.4s, #\shift
1082    xtn             \y26, v20.4s
1083    xtn             \y29, v28.4s
1084  .else
1085    rshrn           \y26, v20.4s, #\shift
1086    rshrn           \y29, v28.4s, #\shift
1087  .endif
1088
1089    add             v20.4s, v30.4s, v24.4s
1090    sub             v30.4s, v30.4s, v24.4s
1091
1092  .if \shift > 16
1093    srshr           v20.4s, v20.4s, #\shift
1094    srshr           v30.4s, v30.4s, #\shift
1095    xtn             \y27, v20.4s
1096    xtn             \y28, v30.4s
1097  .else
1098    rshrn           \y27, v20.4s, #\shift
1099    rshrn           \y28, v30.4s, #\shift
1100  .endif
1101.endm
1102
1103asm_function jsimd_idct_4x4_neon
1104
1105    DCT_TABLE       .req x0
1106    COEF_BLOCK      .req x1
1107    OUTPUT_BUF      .req x2
1108    OUTPUT_COL      .req x3
1109    TMP1            .req x0
1110    TMP2            .req x1
1111    TMP3            .req x2
1112    TMP4            .req x15
1113
1114    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
1115       guarantee that the upper (unused) 32 bits of x3 are valid.  This
1116       instruction ensures that those bits are set to zero. */
1117    uxtw x3, w3
1118
1119    /* Save all used NEON registers */
1120    sub             sp, sp, 64
1121    mov             x9, sp
1122    /* Load constants (v3.4h is just used for padding) */
1123    adr             TMP4, Ljsimd_idct_4x4_neon_consts
1124    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
1125    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
1126    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
1127
1128    /* Load all COEF_BLOCK into NEON registers with the following allocation:
1129     *       0 1 2 3 | 4 5 6 7
1130     *      ---------+--------
1131     *   0 | v4.4h   | v5.4h
1132     *   1 | v6.4h   | v7.4h
1133     *   2 | v8.4h   | v9.4h
1134     *   3 | v10.4h  | v11.4h
1135     *   4 | -       | -
1136     *   5 | v12.4h  | v13.4h
1137     *   6 | v14.4h  | v15.4h
1138     *   7 | v16.4h  | v17.4h
1139     */
1140    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
1141    ld1             {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
1142    add             COEF_BLOCK, COEF_BLOCK, #16
1143    ld1             {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
1144    ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16
1145    /* dequantize */
1146    ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
1147    mul             v4.4h, v4.4h, v18.4h
1148    mul             v5.4h, v5.4h, v19.4h
1149    ins             v4.d[1], v5.d[0]              /* 128 bit q4 */
1150    ld1             {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
1151    mul             v6.4h, v6.4h, v20.4h
1152    mul             v7.4h, v7.4h, v21.4h
1153    ins             v6.d[1], v7.d[0]              /* 128 bit q6 */
1154    mul             v8.4h, v8.4h, v22.4h
1155    mul             v9.4h, v9.4h, v23.4h
1156    ins             v8.d[1], v9.d[0]              /* 128 bit q8 */
1157    add             DCT_TABLE, DCT_TABLE, #16
1158    ld1             {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
1159    mul             v10.4h, v10.4h, v24.4h
1160    mul             v11.4h, v11.4h, v25.4h
1161    ins             v10.d[1], v11.d[0]            /* 128 bit q10 */
1162    mul             v12.4h, v12.4h, v26.4h
1163    mul             v13.4h, v13.4h, v27.4h
1164    ins             v12.d[1], v13.d[0]            /* 128 bit q12 */
1165    ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
1166    mul             v14.4h, v14.4h, v28.4h
1167    mul             v15.4h, v15.4h, v29.4h
1168    ins             v14.d[1], v15.d[0]            /* 128 bit q14 */
1169    mul             v16.4h, v16.4h, v30.4h
1170    mul             v17.4h, v17.4h, v31.4h
1171    ins             v16.d[1], v17.d[0]            /* 128 bit q16 */
1172
1173    /* Pass 1 */
1174    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \
1175                    v4.4h, v6.4h, v8.4h, v10.4h
1176    transpose_4x4   v4, v6, v8, v10, v3
1177    ins             v10.d[1], v11.d[0]
1178    idct_helper     v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \
1179                    v5.4h, v7.4h, v9.4h, v11.4h
1180    transpose_4x4   v5, v7, v9, v11, v3
1181    ins             v10.d[1], v11.d[0]
1182
1183    /* Pass 2 */
1184    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \
1185                    v26.4h, v27.4h, v28.4h, v29.4h
1186    transpose_4x4   v26, v27, v28, v29, v3
1187
1188    /* Range limit */
1189    movi            v30.8h, #0x80
1190    ins             v26.d[1], v27.d[0]
1191    ins             v28.d[1], v29.d[0]
1192    add             v26.8h, v26.8h, v30.8h
1193    add             v28.8h, v28.8h, v30.8h
1194    sqxtun          v26.8b, v26.8h
1195    sqxtun          v27.8b, v28.8h
1196
1197    /* Store results to the output buffer */
1198    ldp             TMP1, TMP2, [OUTPUT_BUF], 16
1199    ldp             TMP3, TMP4, [OUTPUT_BUF]
1200    add             TMP1, TMP1, OUTPUT_COL
1201    add             TMP2, TMP2, OUTPUT_COL
1202    add             TMP3, TMP3, OUTPUT_COL
1203    add             TMP4, TMP4, OUTPUT_COL
1204
1205#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
1206    /* We can use much less instructions on little endian systems if the
1207     * OS kernel is not configured to trap unaligned memory accesses
1208     */
1209    st1             {v26.s}[0], [TMP1], 4
1210    st1             {v27.s}[0], [TMP3], 4
1211    st1             {v26.s}[1], [TMP2], 4
1212    st1             {v27.s}[1], [TMP4], 4
1213#else
1214    st1             {v26.b}[0], [TMP1], 1
1215    st1             {v27.b}[0], [TMP3], 1
1216    st1             {v26.b}[1], [TMP1], 1
1217    st1             {v27.b}[1], [TMP3], 1
1218    st1             {v26.b}[2], [TMP1], 1
1219    st1             {v27.b}[2], [TMP3], 1
1220    st1             {v26.b}[3], [TMP1], 1
1221    st1             {v27.b}[3], [TMP3], 1
1222
1223    st1             {v26.b}[4], [TMP2], 1
1224    st1             {v27.b}[4], [TMP4], 1
1225    st1             {v26.b}[5], [TMP2], 1
1226    st1             {v27.b}[5], [TMP4], 1
1227    st1             {v26.b}[6], [TMP2], 1
1228    st1             {v27.b}[6], [TMP4], 1
1229    st1             {v26.b}[7], [TMP2], 1
1230    st1             {v27.b}[7], [TMP4], 1
1231#endif
1232
1233    /* vpop            {v8.4h - v15.4h}    ;not available */
1234    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
1235    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
1236    blr             x30
1237
1238    .unreq          DCT_TABLE
1239    .unreq          COEF_BLOCK
1240    .unreq          OUTPUT_BUF
1241    .unreq          OUTPUT_COL
1242    .unreq          TMP1
1243    .unreq          TMP2
1244    .unreq          TMP3
1245    .unreq          TMP4
1246
1247.purgem idct_helper
1248
1249
1250/*****************************************************************************/
1251
1252/*
1253 * jsimd_idct_2x2_neon
1254 *
1255 * This function contains inverse-DCT code for getting reduced-size
1256 * 2x2 pixels output from an 8x8 DCT block. It uses the same  calculations
1257 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
1258 * function from jpeg-6b (jidctred.c).
1259 *
1260 * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
1261 *       requires much less arithmetic operations and hence should be faster.
1262 *       The primary purpose of this particular NEON optimized function is
1263 *       bit exact compatibility with jpeg-6b.
1264 */
1265
1266.balign 8
1267Ljsimd_idct_2x2_neon_consts:
1268  .short -FIX_0_720959822  /* v14[0] */
1269  .short FIX_0_850430095   /* v14[1] */
1270  .short -FIX_1_272758580  /* v14[2] */
1271  .short FIX_3_624509785   /* v14[3] */
1272
1273.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
1274    sshll           v15.4s, \x4, #15
1275    smull           v26.4s, \x6, v14.h[3]
1276    smlal           v26.4s, \x10, v14.h[2]
1277    smlal           v26.4s, \x12, v14.h[1]
1278    smlal           v26.4s, \x16, v14.h[0]
1279
1280    add             v20.4s, v15.4s, v26.4s
1281    sub             v15.4s, v15.4s, v26.4s
1282
1283  .if \shift > 16
1284    srshr           v20.4s, v20.4s, #\shift
1285    srshr           v15.4s, v15.4s, #\shift
1286    xtn             \y26, v20.4s
1287    xtn             \y27, v15.4s
1288  .else
1289    rshrn           \y26, v20.4s, #\shift
1290    rshrn           \y27, v15.4s, #\shift
1291  .endif
1292.endm
1293
1294asm_function jsimd_idct_2x2_neon
1295
1296    DCT_TABLE       .req x0
1297    COEF_BLOCK      .req x1
1298    OUTPUT_BUF      .req x2
1299    OUTPUT_COL      .req x3
1300    TMP1            .req x0
1301    TMP2            .req x15
1302
1303    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
1304       guarantee that the upper (unused) 32 bits of x3 are valid.  This
1305       instruction ensures that those bits are set to zero. */
1306    uxtw x3, w3
1307
1308    /* vpush           {v8.4h - v15.4h}            ; not available */
1309    sub             sp, sp, 64
1310    mov             x9, sp
1311
1312    /* Load constants */
1313    adr             TMP2, Ljsimd_idct_2x2_neon_consts
1314    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
1315    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
1316    ld1             {v14.4h}, [TMP2]
1317
1318    /* Load all COEF_BLOCK into NEON registers with the following allocation:
1319     *       0 1 2 3 | 4 5 6 7
1320     *      ---------+--------
1321     *   0 | v4.4h   | v5.4h
1322     *   1 | v6.4h   | v7.4h
1323     *   2 | -       | -
1324     *   3 | v10.4h  | v11.4h
1325     *   4 | -       | -
1326     *   5 | v12.4h  | v13.4h
1327     *   6 | -       | -
1328     *   7 | v16.4h  | v17.4h
1329     */
1330    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
1331    add             COEF_BLOCK, COEF_BLOCK, #16
1332    ld1             {v10.4h, v11.4h}, [COEF_BLOCK], 16
1333    add             COEF_BLOCK, COEF_BLOCK, #16
1334    ld1             {v12.4h, v13.4h}, [COEF_BLOCK], 16
1335    add             COEF_BLOCK, COEF_BLOCK, #16
1336    ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16
1337    /* Dequantize */
1338    ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
1339    mul             v4.4h, v4.4h, v18.4h
1340    mul             v5.4h, v5.4h, v19.4h
1341    ins             v4.d[1], v5.d[0]
1342    mul             v6.4h, v6.4h, v20.4h
1343    mul             v7.4h, v7.4h, v21.4h
1344    ins             v6.d[1], v7.d[0]
1345    add             DCT_TABLE, DCT_TABLE, #16
1346    ld1             {v24.4h, v25.4h}, [DCT_TABLE], 16
1347    mul             v10.4h, v10.4h, v24.4h
1348    mul             v11.4h, v11.4h, v25.4h
1349    ins             v10.d[1], v11.d[0]
1350    add             DCT_TABLE, DCT_TABLE, #16
1351    ld1             {v26.4h, v27.4h}, [DCT_TABLE], 16
1352    mul             v12.4h, v12.4h, v26.4h
1353    mul             v13.4h, v13.4h, v27.4h
1354    ins             v12.d[1], v13.d[0]
1355    add             DCT_TABLE, DCT_TABLE, #16
1356    ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
1357    mul             v16.4h, v16.4h, v30.4h
1358    mul             v17.4h, v17.4h, v31.4h
1359    ins             v16.d[1], v17.d[0]
1360
1361    /* Pass 1 */
1362#if 0
1363    idct_helper     v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
1364    transpose_4x4   v4.4h, v6.4h, v8.4h, v10.4h
1365    idct_helper     v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
1366    transpose_4x4   v5.4h, v7.4h, v9.4h, v11.4h
1367#else
1368    smull           v26.4s, v6.4h, v14.h[3]
1369    smlal           v26.4s, v10.4h, v14.h[2]
1370    smlal           v26.4s, v12.4h, v14.h[1]
1371    smlal           v26.4s, v16.4h, v14.h[0]
1372    smull           v24.4s, v7.4h, v14.h[3]
1373    smlal           v24.4s, v11.4h, v14.h[2]
1374    smlal           v24.4s, v13.4h, v14.h[1]
1375    smlal           v24.4s, v17.4h, v14.h[0]
1376    sshll           v15.4s, v4.4h, #15
1377    sshll           v30.4s, v5.4h, #15
1378    add             v20.4s, v15.4s, v26.4s
1379    sub             v15.4s, v15.4s, v26.4s
1380    rshrn           v4.4h, v20.4s, #13
1381    rshrn           v6.4h, v15.4s, #13
1382    add             v20.4s, v30.4s, v24.4s
1383    sub             v15.4s, v30.4s, v24.4s
1384    rshrn           v5.4h, v20.4s, #13
1385    rshrn           v7.4h, v15.4s, #13
1386    ins             v4.d[1], v5.d[0]
1387    ins             v6.d[1], v7.d[0]
1388    transpose       v4, v6, v3, .16b, .8h
1389    transpose       v6, v10, v3, .16b, .4s
1390    ins             v11.d[0], v10.d[1]
1391    ins             v7.d[0], v6.d[1]
1392#endif
1393
1394    /* Pass 2 */
1395    idct_helper     v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h
1396
1397    /* Range limit */
1398    movi            v30.8h, #0x80
1399    ins             v26.d[1], v27.d[0]
1400    add             v26.8h, v26.8h, v30.8h
1401    sqxtun          v30.8b, v26.8h
1402    ins             v26.d[0], v30.d[0]
1403    sqxtun          v27.8b, v26.8h
1404
1405    /* Store results to the output buffer */
1406    ldp             TMP1, TMP2, [OUTPUT_BUF]
1407    add             TMP1, TMP1, OUTPUT_COL
1408    add             TMP2, TMP2, OUTPUT_COL
1409
1410    st1             {v26.b}[0], [TMP1], 1
1411    st1             {v27.b}[4], [TMP1], 1
1412    st1             {v26.b}[1], [TMP2], 1
1413    st1             {v27.b}[5], [TMP2], 1
1414
1415    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
1416    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
1417    blr             x30
1418
1419    .unreq          DCT_TABLE
1420    .unreq          COEF_BLOCK
1421    .unreq          OUTPUT_BUF
1422    .unreq          OUTPUT_COL
1423    .unreq          TMP1
1424    .unreq          TMP2
1425
1426.purgem idct_helper
1427
1428
1429/*****************************************************************************/
1430
1431/*
1432 * jsimd_ycc_extrgb_convert_neon
1433 * jsimd_ycc_extbgr_convert_neon
1434 * jsimd_ycc_extrgbx_convert_neon
1435 * jsimd_ycc_extbgrx_convert_neon
1436 * jsimd_ycc_extxbgr_convert_neon
1437 * jsimd_ycc_extxrgb_convert_neon
1438 *
1439 * Colorspace conversion YCbCr -> RGB
1440 */
1441
1442.macro do_load size
1443  .if \size == 8
1444    ld1             {v4.8b}, [U], 8
1445    ld1             {v5.8b}, [V], 8
1446    ld1             {v0.8b}, [Y], 8
1447    prfm            pldl1keep, [U, #64]
1448    prfm            pldl1keep, [V, #64]
1449    prfm            pldl1keep, [Y, #64]
1450  .elseif \size == 4
1451    ld1             {v4.b}[0], [U], 1
1452    ld1             {v4.b}[1], [U], 1
1453    ld1             {v4.b}[2], [U], 1
1454    ld1             {v4.b}[3], [U], 1
1455    ld1             {v5.b}[0], [V], 1
1456    ld1             {v5.b}[1], [V], 1
1457    ld1             {v5.b}[2], [V], 1
1458    ld1             {v5.b}[3], [V], 1
1459    ld1             {v0.b}[0], [Y], 1
1460    ld1             {v0.b}[1], [Y], 1
1461    ld1             {v0.b}[2], [Y], 1
1462    ld1             {v0.b}[3], [Y], 1
1463  .elseif \size == 2
1464    ld1             {v4.b}[4], [U], 1
1465    ld1             {v4.b}[5], [U], 1
1466    ld1             {v5.b}[4], [V], 1
1467    ld1             {v5.b}[5], [V], 1
1468    ld1             {v0.b}[4], [Y], 1
1469    ld1             {v0.b}[5], [Y], 1
1470  .elseif \size == 1
1471    ld1             {v4.b}[6], [U], 1
1472    ld1             {v5.b}[6], [V], 1
1473    ld1             {v0.b}[6], [Y], 1
1474  .else
1475    .error unsupported macroblock size
1476  .endif
1477.endm
1478
1479.macro do_store bpp, size, fast_st3
1480  .if \bpp == 24
1481    .if \size == 8
1482      .if \fast_st3 == 1
1483        st3         {v10.8b, v11.8b, v12.8b}, [RGB], 24
1484      .else
1485        st1         {v10.b}[0], [RGB], #1
1486        st1         {v11.b}[0], [RGB], #1
1487        st1         {v12.b}[0], [RGB], #1
1488
1489        st1         {v10.b}[1], [RGB], #1
1490        st1         {v11.b}[1], [RGB], #1
1491        st1         {v12.b}[1], [RGB], #1
1492
1493        st1         {v10.b}[2], [RGB], #1
1494        st1         {v11.b}[2], [RGB], #1
1495        st1         {v12.b}[2], [RGB], #1
1496
1497        st1         {v10.b}[3], [RGB], #1
1498        st1         {v11.b}[3], [RGB], #1
1499        st1         {v12.b}[3], [RGB], #1
1500
1501        st1         {v10.b}[4], [RGB], #1
1502        st1         {v11.b}[4], [RGB], #1
1503        st1         {v12.b}[4], [RGB], #1
1504
1505        st1         {v10.b}[5], [RGB], #1
1506        st1         {v11.b}[5], [RGB], #1
1507        st1         {v12.b}[5], [RGB], #1
1508
1509        st1         {v10.b}[6], [RGB], #1
1510        st1         {v11.b}[6], [RGB], #1
1511        st1         {v12.b}[6], [RGB], #1
1512
1513        st1         {v10.b}[7], [RGB], #1
1514        st1         {v11.b}[7], [RGB], #1
1515        st1         {v12.b}[7], [RGB], #1
1516      .endif
1517    .elseif \size == 4
1518      st3           {v10.b, v11.b, v12.b}[0], [RGB], 3
1519      st3           {v10.b, v11.b, v12.b}[1], [RGB], 3
1520      st3           {v10.b, v11.b, v12.b}[2], [RGB], 3
1521      st3           {v10.b, v11.b, v12.b}[3], [RGB], 3
1522    .elseif \size == 2
1523      st3           {v10.b, v11.b, v12.b}[4], [RGB], 3
1524      st3           {v10.b, v11.b, v12.b}[5], [RGB], 3
1525    .elseif \size == 1
1526      st3           {v10.b, v11.b, v12.b}[6], [RGB], 3
1527    .else
1528     .error unsupported macroblock size
1529    .endif
1530  .elseif \bpp == 32
1531    .if \size == 8
1532      st4           {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
1533    .elseif \size == 4
1534      st4           {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
1535      st4           {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
1536      st4           {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
1537      st4           {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
1538    .elseif \size == 2
1539      st4           {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
1540      st4           {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
1541    .elseif \size == 1
1542      st4           {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
1543    .else
1544      .error unsupported macroblock size
1545    .endif
1546  .elseif \bpp==16
1547    .if \size == 8
1548      st1           {v25.8h}, [RGB], 16
1549    .elseif \size == 4
1550      st1           {v25.4h}, [RGB], 8
1551    .elseif \size == 2
1552      st1           {v25.h}[4], [RGB], 2
1553      st1           {v25.h}[5], [RGB], 2
1554    .elseif \size == 1
1555      st1           {v25.h}[6], [RGB], 2
1556    .else
1557      .error unsupported macroblock size
1558    .endif
1559  .else
1560    .error unsupported bpp
1561  .endif
1562.endm
1563
1564.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \
1565                                           g_offs, gsize, b_offs, bsize, \
1566                                           defsize, fast_st3
1567
1568/*
1569 * 2-stage pipelined YCbCr->RGB conversion
1570 */
1571
1572.macro do_yuv_to_rgb_stage1
1573    uaddw           v6.8h, v2.8h, v4.8b     /* q3 = u - 128 */
1574    uaddw           v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
1575    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
1576    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
1577    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
1578    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
1579    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
1580    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
1581    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
1582    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
1583.endm
1584
1585.macro do_yuv_to_rgb_stage2
1586    rshrn           v20.4h, v20.4s, #15
1587    rshrn2          v20.8h, v22.4s, #15
1588    rshrn           v24.4h, v24.4s, #14
1589    rshrn2          v24.8h, v26.4s, #14
1590    rshrn           v28.4h, v28.4s, #14
1591    rshrn2          v28.8h, v30.4s, #14
1592    uaddw           v20.8h, v20.8h, v0.8b
1593    uaddw           v24.8h, v24.8h, v0.8b
1594    uaddw           v28.8h, v28.8h, v0.8b
1595  .if \bpp != 16
1596    sqxtun          v1\g_offs\defsize, v20.8h
1597    sqxtun          v1\r_offs\defsize, v24.8h
1598    sqxtun          v1\b_offs\defsize, v28.8h
1599  .else
1600    sqshlu          v21.8h, v20.8h, #8
1601    sqshlu          v25.8h, v24.8h, #8
1602    sqshlu          v29.8h, v28.8h, #8
1603    sri             v25.8h, v21.8h, #5
1604    sri             v25.8h, v29.8h, #11
1605  .endif
1606.endm
1607
1608.macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3
1609    rshrn           v20.4h, v20.4s, #15
1610    rshrn           v24.4h, v24.4s, #14
1611    rshrn           v28.4h, v28.4s, #14
1612    ld1             {v4.8b}, [U], 8
1613    rshrn2          v20.8h, v22.4s, #15
1614    rshrn2          v24.8h, v26.4s, #14
1615    rshrn2          v28.8h, v30.4s, #14
1616    ld1             {v5.8b}, [V], 8
1617    uaddw           v20.8h, v20.8h, v0.8b
1618    uaddw           v24.8h, v24.8h, v0.8b
1619    uaddw           v28.8h, v28.8h, v0.8b
1620  .if \bpp != 16  /**************** rgb24/rgb32 ******************************/
1621    sqxtun          v1\g_offs\defsize, v20.8h
1622    ld1             {v0.8b}, [Y], 8
1623    sqxtun          v1\r_offs\defsize, v24.8h
1624    prfm            pldl1keep, [U, #64]
1625    prfm            pldl1keep, [V, #64]
1626    prfm            pldl1keep, [Y, #64]
1627    sqxtun          v1\b_offs\defsize, v28.8h
1628    uaddw           v6.8h, v2.8h, v4.8b     /* v6.16b = u - 128 */
1629    uaddw           v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
1630    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
1631    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
1632    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
1633    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
1634    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
1635    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
1636  .else  /**************************** rgb565 ********************************/
1637    sqshlu          v21.8h, v20.8h, #8
1638    sqshlu          v25.8h, v24.8h, #8
1639    sqshlu          v29.8h, v28.8h, #8
1640    uaddw           v6.8h, v2.8h, v4.8b     /* v6.16b = u - 128 */
1641    uaddw           v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
1642    ld1             {v0.8b}, [Y], 8
1643    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
1644    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
1645    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
1646    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
1647    sri             v25.8h, v21.8h, #5
1648    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
1649    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
1650    prfm            pldl1keep, [U, #64]
1651    prfm            pldl1keep, [V, #64]
1652    prfm            pldl1keep, [Y, #64]
1653    sri             v25.8h, v29.8h, #11
1654  .endif
1655    do_store        \bpp, 8, \fast_st3
1656    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
1657    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
1658.endm
1659
1660.macro do_yuv_to_rgb
1661    do_yuv_to_rgb_stage1
1662    do_yuv_to_rgb_stage2
1663.endm
1664
1665/* Apple gas crashes on adrl, work around that by using adr.
1666 * But this requires a copy of these constants for each function.
1667 */
1668
1669.balign 16
1670.if \fast_st3 == 1
1671Ljsimd_ycc_\colorid\()_neon_consts:
1672.else
1673Ljsimd_ycc_\colorid\()_neon_slowst3_consts:
1674.endif
1675  .short 0,      0,     0,      0
1676  .short 22971, -11277, -23401, 29033
1677  .short -128,  -128,   -128,   -128
1678  .short -128,  -128,   -128,   -128
1679
1680.if \fast_st3 == 1
1681asm_function jsimd_ycc_\colorid\()_convert_neon
1682.else
1683asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
1684.endif
1685    OUTPUT_WIDTH    .req w0
1686    INPUT_BUF       .req x1
1687    INPUT_ROW       .req w2
1688    OUTPUT_BUF      .req x3
1689    NUM_ROWS        .req w4
1690
1691    INPUT_BUF0      .req x5
1692    INPUT_BUF1      .req x6
1693    INPUT_BUF2      .req x1
1694
1695    RGB             .req x7
1696    Y               .req x9
1697    U               .req x10
1698    V               .req x11
1699    N               .req w15
1700
1701    sub             sp, sp, 64
1702    mov             x9, sp
1703
1704    /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
1705    .if \fast_st3 == 1
1706      adr           x15, Ljsimd_ycc_\colorid\()_neon_consts
1707    .else
1708      adr           x15, Ljsimd_ycc_\colorid\()_neon_slowst3_consts
1709    .endif
1710
1711    /* Save NEON registers */
1712    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
1713    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
1714    ld1             {v0.4h, v1.4h}, [x15], 16
1715    ld1             {v2.8h}, [x15]
1716
1717    ldr             INPUT_BUF0, [INPUT_BUF]
1718    ldr             INPUT_BUF1, [INPUT_BUF, #8]
1719    ldr             INPUT_BUF2, [INPUT_BUF, #16]
1720    .unreq          INPUT_BUF
1721
1722    /* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */
1723    movi            v10.16b, #255
1724    movi            v13.16b, #255
1725
1726    /* Outer loop over scanlines */
1727    cmp             NUM_ROWS, #1
1728    b.lt            9f
17290:
1730    ldr             Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
1731    ldr             U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
1732    mov             N, OUTPUT_WIDTH
1733    ldr             V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
1734    add             INPUT_ROW, INPUT_ROW, #1
1735    ldr             RGB, [OUTPUT_BUF], #8
1736
1737    /* Inner loop over pixels */
1738    subs            N, N, #8
1739    b.lt            3f
1740    do_load         8
1741    do_yuv_to_rgb_stage1
1742    subs            N, N, #8
1743    b.lt            2f
17441:
1745    do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3
1746    subs            N, N, #8
1747    b.ge            1b
17482:
1749    do_yuv_to_rgb_stage2
1750    do_store        \bpp, 8, \fast_st3
1751    tst             N, #7
1752    b.eq            8f
17533:
1754    tst             N, #4
1755    b.eq            3f
1756    do_load         4
17573:
1758    tst             N, #2
1759    b.eq            4f
1760    do_load         2
17614:
1762    tst             N, #1
1763    b.eq            5f
1764    do_load         1
17655:
1766    do_yuv_to_rgb
1767    tst             N, #4
1768    b.eq            6f
1769    do_store        \bpp, 4, \fast_st3
17706:
1771    tst             N, #2
1772    b.eq            7f
1773    do_store        \bpp, 2, \fast_st3
17747:
1775    tst             N, #1
1776    b.eq            8f
1777    do_store        \bpp, 1, \fast_st3
17788:
1779    subs            NUM_ROWS, NUM_ROWS, #1
1780    b.gt            0b
17819:
1782    /* Restore all registers and return */
1783    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
1784    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
1785    br              x30
1786    .unreq          OUTPUT_WIDTH
1787    .unreq          INPUT_ROW
1788    .unreq          OUTPUT_BUF
1789    .unreq          NUM_ROWS
1790    .unreq          INPUT_BUF0
1791    .unreq          INPUT_BUF1
1792    .unreq          INPUT_BUF2
1793    .unreq          RGB
1794    .unreq          Y
1795    .unreq          U
1796    .unreq          V
1797    .unreq          N
1798
1799.purgem do_yuv_to_rgb
1800.purgem do_yuv_to_rgb_stage1
1801.purgem do_yuv_to_rgb_stage2
1802.purgem do_yuv_to_rgb_stage2_store_load_stage1
1803
1804.endm
1805
1806/*--------------------------------- id ----- bpp R  rsize G  gsize B  bsize defsize fast_st3*/
1807generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
1808generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
1809generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
1810generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
1811generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h,  2, .4h,  1, .4h,  .8b,    1
1812generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,  2, .4h,  3, .4h,  .8b,    1
1813generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h,  0, .4h,  0, .4h,  .8b,    1
1814
1815generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    0
1816generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    0
1817
1818.purgem do_load
1819.purgem do_store
1820
1821
1822/*****************************************************************************/
1823
1824/*
1825 * jsimd_extrgb_ycc_convert_neon
1826 * jsimd_extbgr_ycc_convert_neon
1827 * jsimd_extrgbx_ycc_convert_neon
1828 * jsimd_extbgrx_ycc_convert_neon
1829 * jsimd_extxbgr_ycc_convert_neon
1830 * jsimd_extxrgb_ycc_convert_neon
1831 *
1832 * Colorspace conversion RGB -> YCbCr
1833 */
1834
1835.macro do_store size
1836  .if \size == 8
1837    st1             {v20.8b}, [Y], #8
1838    st1             {v21.8b}, [U], #8
1839    st1             {v22.8b}, [V], #8
1840  .elseif \size == 4
1841    st1             {v20.b}[0], [Y], #1
1842    st1             {v20.b}[1], [Y], #1
1843    st1             {v20.b}[2], [Y], #1
1844    st1             {v20.b}[3], [Y], #1
1845    st1             {v21.b}[0], [U], #1
1846    st1             {v21.b}[1], [U], #1
1847    st1             {v21.b}[2], [U], #1
1848    st1             {v21.b}[3], [U], #1
1849    st1             {v22.b}[0], [V], #1
1850    st1             {v22.b}[1], [V], #1
1851    st1             {v22.b}[2], [V], #1
1852    st1             {v22.b}[3], [V], #1
1853  .elseif \size == 2
1854    st1             {v20.b}[4], [Y], #1
1855    st1             {v20.b}[5], [Y], #1
1856    st1             {v21.b}[4], [U], #1
1857    st1             {v21.b}[5], [U], #1
1858    st1             {v22.b}[4], [V], #1
1859    st1             {v22.b}[5], [V], #1
1860  .elseif \size == 1
1861    st1             {v20.b}[6], [Y], #1
1862    st1             {v21.b}[6], [U], #1
1863    st1             {v22.b}[6], [V], #1
1864  .else
1865    .error unsupported macroblock size
1866  .endif
1867.endm
1868
1869.macro do_load bpp, size, fast_ld3
1870  .if \bpp == 24
1871    .if \size == 8
1872      .if \fast_ld3 == 1
1873        ld3         {v10.8b, v11.8b, v12.8b}, [RGB], #24
1874      .else
1875        ld1         {v10.b}[0], [RGB], #1
1876        ld1         {v11.b}[0], [RGB], #1
1877        ld1         {v12.b}[0], [RGB], #1
1878
1879        ld1         {v10.b}[1], [RGB], #1
1880        ld1         {v11.b}[1], [RGB], #1
1881        ld1         {v12.b}[1], [RGB], #1
1882
1883        ld1         {v10.b}[2], [RGB], #1
1884        ld1         {v11.b}[2], [RGB], #1
1885        ld1         {v12.b}[2], [RGB], #1
1886
1887        ld1         {v10.b}[3], [RGB], #1
1888        ld1         {v11.b}[3], [RGB], #1
1889        ld1         {v12.b}[3], [RGB], #1
1890
1891        ld1         {v10.b}[4], [RGB], #1
1892        ld1         {v11.b}[4], [RGB], #1
1893        ld1         {v12.b}[4], [RGB], #1
1894
1895        ld1         {v10.b}[5], [RGB], #1
1896        ld1         {v11.b}[5], [RGB], #1
1897        ld1         {v12.b}[5], [RGB], #1
1898
1899        ld1         {v10.b}[6], [RGB], #1
1900        ld1         {v11.b}[6], [RGB], #1
1901        ld1         {v12.b}[6], [RGB], #1
1902
1903        ld1         {v10.b}[7], [RGB], #1
1904        ld1         {v11.b}[7], [RGB], #1
1905        ld1         {v12.b}[7], [RGB], #1
1906      .endif
1907      prfm          pldl1keep, [RGB, #128]
1908    .elseif \size == 4
1909      ld3           {v10.b, v11.b, v12.b}[0], [RGB], #3
1910      ld3           {v10.b, v11.b, v12.b}[1], [RGB], #3
1911      ld3           {v10.b, v11.b, v12.b}[2], [RGB], #3
1912      ld3           {v10.b, v11.b, v12.b}[3], [RGB], #3
1913    .elseif \size == 2
1914      ld3           {v10.b, v11.b, v12.b}[4], [RGB], #3
1915      ld3           {v10.b, v11.b, v12.b}[5], [RGB], #3
1916    .elseif \size == 1
1917      ld3           {v10.b, v11.b, v12.b}[6], [RGB], #3
1918    .else
1919      .error unsupported macroblock size
1920    .endif
1921  .elseif \bpp == 32
1922    .if \size == 8
1923      ld4           {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32
1924      prfm          pldl1keep, [RGB, #128]
1925    .elseif \size == 4
1926      ld4           {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4
1927      ld4           {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4
1928      ld4           {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4
1929      ld4           {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4
1930    .elseif \size == 2
1931      ld4           {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4
1932      ld4           {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4
1933    .elseif \size == 1
1934      ld4           {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4
1935    .else
1936      .error unsupported macroblock size
1937    .endif
1938  .else
1939    .error unsupported bpp
1940  .endif
1941.endm
1942
1943.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \
1944                                           b_offs, fast_ld3
1945
1946/*
1947 * 2-stage pipelined RGB->YCbCr conversion
1948 */
1949
1950.macro do_rgb_to_yuv_stage1
1951    ushll           v4.8h, v1\r_offs\().8b, #0  /* r = v4 */
1952    ushll           v6.8h, v1\g_offs\().8b, #0  /* g = v6 */
1953    ushll           v8.8h, v1\b_offs\().8b, #0  /* b = v8 */
1954    rev64           v18.4s, v1.4s
1955    rev64           v26.4s, v1.4s
1956    rev64           v28.4s, v1.4s
1957    rev64           v30.4s, v1.4s
1958    umull           v14.4s, v4.4h, v0.h[0]
1959    umull2          v16.4s, v4.8h, v0.h[0]
1960    umlsl           v18.4s, v4.4h, v0.h[3]
1961    umlsl2          v26.4s, v4.8h, v0.h[3]
1962    umlal           v28.4s, v4.4h, v0.h[5]
1963    umlal2          v30.4s, v4.8h, v0.h[5]
1964    umlal           v14.4s, v6.4h, v0.h[1]
1965    umlal2          v16.4s, v6.8h, v0.h[1]
1966    umlsl           v18.4s, v6.4h, v0.h[4]
1967    umlsl2          v26.4s, v6.8h, v0.h[4]
1968    umlsl           v28.4s, v6.4h, v0.h[6]
1969    umlsl2          v30.4s, v6.8h, v0.h[6]
1970    umlal           v14.4s, v8.4h, v0.h[2]
1971    umlal2          v16.4s, v8.8h, v0.h[2]
1972    umlal           v18.4s, v8.4h, v0.h[5]
1973    umlal2          v26.4s, v8.8h, v0.h[5]
1974    umlsl           v28.4s, v8.4h, v0.h[7]
1975    umlsl2          v30.4s, v8.8h, v0.h[7]
1976.endm
1977
1978.macro do_rgb_to_yuv_stage2
1979    rshrn           v20.4h, v14.4s, #16
1980    shrn            v22.4h, v18.4s, #16
1981    shrn            v24.4h, v28.4s, #16
1982    rshrn2          v20.8h, v16.4s, #16
1983    shrn2           v22.8h, v26.4s, #16
1984    shrn2           v24.8h, v30.4s, #16
1985    xtn             v20.8b, v20.8h       /* v20 = y */
1986    xtn             v21.8b, v22.8h       /* v21 = u */
1987    xtn             v22.8b, v24.8h       /* v22 = v */
1988.endm
1989
1990.macro do_rgb_to_yuv
1991    do_rgb_to_yuv_stage1
1992    do_rgb_to_yuv_stage2
1993.endm
1994
1995/* TODO: expand macros and interleave instructions if some in-order
1996 *       ARM64 processor actually can dual-issue LOAD/STORE with ALU */
1997.macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3
1998    do_rgb_to_yuv_stage2
1999    do_load         \bpp, 8, \fast_ld3
2000    st1             {v20.8b}, [Y], #8
2001    st1             {v21.8b}, [U], #8
2002    st1             {v22.8b}, [V], #8
2003    do_rgb_to_yuv_stage1
2004.endm
2005
2006.balign 16
2007.if \fast_ld3 == 1
2008Ljsimd_\colorid\()_ycc_neon_consts:
2009.else
2010Ljsimd_\colorid\()_ycc_neon_slowld3_consts:
2011.endif
2012  .short 19595, 38470, 7471, 11059
2013  .short 21709, 32768, 27439, 5329
2014  .short 32767, 128, 32767, 128
2015  .short 32767, 128, 32767, 128
2016
2017.if \fast_ld3 == 1
2018asm_function jsimd_\colorid\()_ycc_convert_neon
2019.else
2020asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
2021.endif
2022    OUTPUT_WIDTH    .req w0
2023    INPUT_BUF       .req x1
2024    OUTPUT_BUF      .req x2
2025    OUTPUT_ROW      .req w3
2026    NUM_ROWS        .req w4
2027
2028    OUTPUT_BUF0     .req x5
2029    OUTPUT_BUF1     .req x6
2030    OUTPUT_BUF2     .req x2  /* OUTPUT_BUF */
2031
2032    RGB             .req x7
2033    Y               .req x9
2034    U               .req x10
2035    V               .req x11
2036    N               .req w12
2037
2038    /* Load constants to d0, d1, d2, d3 */
2039    .if \fast_ld3 == 1
2040      adr           x13, Ljsimd_\colorid\()_ycc_neon_consts
2041    .else
2042      adr           x13, Ljsimd_\colorid\()_ycc_neon_slowld3_consts
2043    .endif
2044    ld1             {v0.8h, v1.8h}, [x13]
2045
2046    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
2047    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #8]
2048    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #16]
2049    .unreq          OUTPUT_BUF
2050
2051    /* Save NEON registers */
2052    sub             sp, sp, #64
2053    mov             x9, sp
2054    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
2055    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
2056
2057    /* Outer loop over scanlines */
2058    cmp             NUM_ROWS, #1
2059    b.lt            9f
20600:
2061    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3]
2062    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3]
2063    mov             N, OUTPUT_WIDTH
2064    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3]
2065    add             OUTPUT_ROW, OUTPUT_ROW, #1
2066    ldr             RGB, [INPUT_BUF], #8
2067
2068    /* Inner loop over pixels */
2069    subs            N, N, #8
2070    b.lt            3f
2071    do_load         \bpp, 8, \fast_ld3
2072    do_rgb_to_yuv_stage1
2073    subs            N, N, #8
2074    b.lt            2f
20751:
2076    do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3
2077    subs            N, N, #8
2078    b.ge            1b
20792:
2080    do_rgb_to_yuv_stage2
2081    do_store        8
2082    tst             N, #7
2083    b.eq            8f
20843:
2085    tbz             N, #2, 3f
2086    do_load         \bpp, 4, \fast_ld3
20873:
2088    tbz             N, #1, 4f
2089    do_load         \bpp, 2, \fast_ld3
20904:
2091    tbz             N, #0, 5f
2092    do_load         \bpp, 1, \fast_ld3
20935:
2094    do_rgb_to_yuv
2095    tbz             N, #2, 6f
2096    do_store        4
20976:
2098    tbz             N, #1, 7f
2099    do_store        2
21007:
2101    tbz             N, #0, 8f
2102    do_store        1
21038:
2104    subs            NUM_ROWS, NUM_ROWS, #1
2105    b.gt            0b
21069:
2107    /* Restore all registers and return */
2108    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
2109    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
2110    br              x30
2111
2112    .unreq          OUTPUT_WIDTH
2113    .unreq          OUTPUT_ROW
2114    .unreq          INPUT_BUF
2115    .unreq          NUM_ROWS
2116    .unreq          OUTPUT_BUF0
2117    .unreq          OUTPUT_BUF1
2118    .unreq          OUTPUT_BUF2
2119    .unreq          RGB
2120    .unreq          Y
2121    .unreq          U
2122    .unreq          V
2123    .unreq          N
2124
2125.purgem do_rgb_to_yuv
2126.purgem do_rgb_to_yuv_stage1
2127.purgem do_rgb_to_yuv_stage2
2128.purgem do_rgb_to_yuv_stage2_store_load_stage1
2129
2130.endm
2131
2132/*--------------------------------- id ----- bpp R  G  B  Fast LD3 */
2133generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 1
2134generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 1
2135generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1
2136generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1
2137generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1
2138generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3, 1
2139
2140generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 0
2141generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 0
2142
2143.purgem do_load
2144.purgem do_store
2145
2146
2147/*****************************************************************************/
2148
2149/*
2150 * Load data into workspace, applying unsigned->signed conversion
2151 *
2152 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
2153 *       rid of VST1.16 instructions
2154 */
2155
2156asm_function jsimd_convsamp_neon
2157    SAMPLE_DATA     .req x0
2158    START_COL       .req x1
2159    WORKSPACE       .req x2
2160    TMP1            .req x9
2161    TMP2            .req x10
2162    TMP3            .req x11
2163    TMP4            .req x12
2164    TMP5            .req x13
2165    TMP6            .req x14
2166    TMP7            .req x15
2167    TMP8            .req x4
2168    TMPDUP          .req w3
2169
2170    /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
2171       guarantee that the upper (unused) 32 bits of x1 are valid.  This
2172       instruction ensures that those bits are set to zero. */
2173    uxtw x1, w1
2174
2175    mov             TMPDUP, #128
2176    ldp             TMP1, TMP2, [SAMPLE_DATA], 16
2177    ldp             TMP3, TMP4, [SAMPLE_DATA], 16
2178    dup             v0.8b, TMPDUP
2179    add             TMP1, TMP1, START_COL
2180    add             TMP2, TMP2, START_COL
2181    ldp             TMP5, TMP6, [SAMPLE_DATA], 16
2182    add             TMP3, TMP3, START_COL
2183    add             TMP4, TMP4, START_COL
2184    ldp             TMP7, TMP8, [SAMPLE_DATA], 16
2185    add             TMP5, TMP5, START_COL
2186    add             TMP6, TMP6, START_COL
2187    ld1             {v16.8b}, [TMP1]
2188    add             TMP7, TMP7, START_COL
2189    add             TMP8, TMP8, START_COL
2190    ld1             {v17.8b}, [TMP2]
2191    usubl           v16.8h, v16.8b, v0.8b
2192    ld1             {v18.8b}, [TMP3]
2193    usubl           v17.8h, v17.8b, v0.8b
2194    ld1             {v19.8b}, [TMP4]
2195    usubl           v18.8h, v18.8b, v0.8b
2196    ld1             {v20.8b}, [TMP5]
2197    usubl           v19.8h, v19.8b, v0.8b
2198    ld1             {v21.8b}, [TMP6]
2199    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64
2200    usubl           v20.8h, v20.8b, v0.8b
2201    ld1             {v22.8b}, [TMP7]
2202    usubl           v21.8h, v21.8b, v0.8b
2203    ld1             {v23.8b}, [TMP8]
2204    usubl           v22.8h, v22.8b, v0.8b
2205    usubl           v23.8h, v23.8b, v0.8b
2206    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64
2207
2208    br              x30
2209
2210    .unreq          SAMPLE_DATA
2211    .unreq          START_COL
2212    .unreq          WORKSPACE
2213    .unreq          TMP1
2214    .unreq          TMP2
2215    .unreq          TMP3
2216    .unreq          TMP4
2217    .unreq          TMP5
2218    .unreq          TMP6
2219    .unreq          TMP7
2220    .unreq          TMP8
2221    .unreq          TMPDUP
2222
2223/*****************************************************************************/
2224
2225/*
2226 * jsimd_fdct_islow_neon
2227 *
2228 * This file contains a slow-but-accurate integer implementation of the
2229 * forward DCT (Discrete Cosine Transform). The following code is based
2230 * directly on the IJG''s original jfdctint.c; see the jfdctint.c for
2231 * more details.
2232 *
2233 * TODO: can be combined with 'jsimd_convsamp_neon' to get
2234 *       rid of a bunch of VLD1.16 instructions
2235 */
2236
2237#define CONST_BITS 13
2238#define PASS1_BITS 2
2239
2240#define DESCALE_P1 (CONST_BITS-PASS1_BITS)
2241#define DESCALE_P2 (CONST_BITS+PASS1_BITS)
2242
2243#define F_0_298  2446  /* FIX(0.298631336) */
2244#define F_0_390  3196  /* FIX(0.390180644) */
2245#define F_0_541  4433  /* FIX(0.541196100) */
2246#define F_0_765  6270  /* FIX(0.765366865) */
2247#define F_0_899  7373  /* FIX(0.899976223) */
2248#define F_1_175  9633  /* FIX(1.175875602) */
2249#define F_1_501 12299  /* FIX(1.501321110) */
2250#define F_1_847 15137  /* FIX(1.847759065) */
2251#define F_1_961 16069  /* FIX(1.961570560) */
2252#define F_2_053 16819  /* FIX(2.053119869) */
2253#define F_2_562 20995  /* FIX(2.562915447) */
2254#define F_3_072 25172  /* FIX(3.072711026) */
2255
2256.balign 16
2257Ljsimd_fdct_islow_neon_consts:
2258  .short F_0_298
2259  .short -F_0_390
2260  .short F_0_541
2261  .short F_0_765
2262  .short - F_0_899
2263  .short F_1_175
2264  .short F_1_501
2265  .short - F_1_847
2266  .short - F_1_961
2267  .short F_2_053
2268  .short - F_2_562
2269  .short F_3_072
2270  .short 0          /* padding */
2271  .short 0
2272  .short 0
2273  .short 0
2274
2275#undef F_0_298
2276#undef F_0_390
2277#undef F_0_541
2278#undef F_0_765
2279#undef F_0_899
2280#undef F_1_175
2281#undef F_1_501
2282#undef F_1_847
2283#undef F_1_961
2284#undef F_2_053
2285#undef F_2_562
2286#undef F_3_072
2287#define XFIX_P_0_298 v0.h[0]
2288#define XFIX_N_0_390 v0.h[1]
2289#define XFIX_P_0_541 v0.h[2]
2290#define XFIX_P_0_765 v0.h[3]
2291#define XFIX_N_0_899 v0.h[4]
2292#define XFIX_P_1_175 v0.h[5]
2293#define XFIX_P_1_501 v0.h[6]
2294#define XFIX_N_1_847 v0.h[7]
2295#define XFIX_N_1_961 v1.h[0]
2296#define XFIX_P_2_053 v1.h[1]
2297#define XFIX_N_2_562 v1.h[2]
2298#define XFIX_P_3_072 v1.h[3]
2299
2300asm_function jsimd_fdct_islow_neon
2301
2302    DATA            .req x0
2303    TMP             .req x9
2304
2305    /* Load constants */
2306    adr             TMP, Ljsimd_fdct_islow_neon_consts
2307    ld1             {v0.8h, v1.8h}, [TMP]
2308
2309    /* Save NEON registers */
2310    sub             sp, sp, #64
2311    mov             x10, sp
2312    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
2313    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32
2314
2315    /* Load all DATA into NEON registers with the following allocation:
2316     *       0 1 2 3 | 4 5 6 7
2317     *      ---------+--------
2318     *   0 | d16     | d17    | v16.8h
2319     *   1 | d18     | d19    | v17.8h
2320     *   2 | d20     | d21    | v18.8h
2321     *   3 | d22     | d23    | v19.8h
2322     *   4 | d24     | d25    | v20.8h
2323     *   5 | d26     | d27    | v21.8h
2324     *   6 | d28     | d29    | v22.8h
2325     *   7 | d30     | d31    | v23.8h
2326     */
2327
2328    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
2329    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
2330    sub             DATA, DATA, #64
2331
2332    /* Transpose */
2333    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
2334    /* 1-D FDCT */
2335    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
2336    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
2337    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
2338    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
2339    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
2340    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
2341    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
2342    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
2343
2344    /* even part */
2345
2346    add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
2347    sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
2348    add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
2349    sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */
2350
2351    add             v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
2352    sub             v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */
2353
2354    add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */
2355
2356    shl             v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM) LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
2357    shl             v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM) LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */
2358
2359    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
2360    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
2361    mov             v22.16b, v18.16b
2362    mov             v25.16b, v24.16b
2363
2364    smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
2365    smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
2366    smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
2367    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
2368
2369    rshrn           v18.4h, v18.4s, #DESCALE_P1
2370    rshrn           v22.4h, v22.4s, #DESCALE_P1
2371    rshrn2          v18.8h, v24.4s, #DESCALE_P1  /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
2372    rshrn2          v22.8h, v25.4s, #DESCALE_P1  /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
2373
2374    /* Odd part */
2375
2376    add             v8.8h, v28.8h, v31.8h        /* z1 = tmp4 + tmp7; */
2377    add             v9.8h, v29.8h, v30.8h        /* z2 = tmp5 + tmp6; */
2378    add             v10.8h, v28.8h, v30.8h       /* z3 = tmp4 + tmp6; */
2379    add             v11.8h, v29.8h, v31.8h       /* z4 = tmp5 + tmp7; */
2380    smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
2381    smull2          v5.4s, v10.8h, XFIX_P_1_175
2382    smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
2383    smlal2          v5.4s, v11.8h, XFIX_P_1_175
2384
2385    smull2          v24.4s, v28.8h, XFIX_P_0_298
2386    smull2          v25.4s, v29.8h, XFIX_P_2_053
2387    smull2          v26.4s, v30.8h, XFIX_P_3_072
2388    smull2          v27.4s, v31.8h, XFIX_P_1_501
2389    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
2390    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
2391    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
2392    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
2393
2394    smull2          v12.4s, v8.8h, XFIX_N_0_899
2395    smull2          v13.4s, v9.8h, XFIX_N_2_562
2396    smull2          v14.4s, v10.8h, XFIX_N_1_961
2397    smull2          v15.4s, v11.8h, XFIX_N_0_390
2398    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, - FIX_0_899976223); */
2399    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, - FIX_2_562915447); */
2400    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560); */
2401    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644); */
2402
2403    add             v10.4s, v10.4s, v4.4s  /* z3 += z5 */
2404    add             v14.4s, v14.4s, v5.4s
2405    add             v11.4s, v11.4s, v4.4s  /* z4 += z5 */
2406    add             v15.4s, v15.4s, v5.4s
2407
2408    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
2409    add             v24.4s, v24.4s, v12.4s
2410    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
2411    add             v25.4s, v25.4s, v13.4s
2412    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
2413    add             v26.4s, v26.4s, v14.4s
2414    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
2415    add             v27.4s, v27.4s, v15.4s
2416
2417    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
2418    add             v24.4s, v24.4s, v14.4s
2419    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
2420    add             v25.4s, v25.4s, v15.4s
2421    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
2422    add             v26.4s, v26.4s, v13.4s
2423    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
2424    add             v27.4s, v27.4s, v12.4s
2425
2426    rshrn           v23.4h, v28.4s, #DESCALE_P1
2427    rshrn           v21.4h, v29.4s, #DESCALE_P1
2428    rshrn           v19.4h, v30.4s, #DESCALE_P1
2429    rshrn           v17.4h, v31.4s, #DESCALE_P1
2430    rshrn2          v23.8h, v24.4s, #DESCALE_P1  /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
2431    rshrn2          v21.8h, v25.4s, #DESCALE_P1  /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
2432    rshrn2          v19.8h, v26.4s, #DESCALE_P1  /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
2433    rshrn2          v17.8h, v27.4s, #DESCALE_P1  /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
2434
2435    /* Transpose */
2436    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
2437
2438    /* 1-D FDCT */
2439    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
2440    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
2441    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
2442    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
2443    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
2444    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
2445    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
2446    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
2447
2448    /* even part */
2449    add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
2450    sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
2451    add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
2452    sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */
2453
2454    add             v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
2455    sub             v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */
2456
2457    add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */
2458
2459    srshr           v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS); */
2460    srshr           v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS); */
2461
2462    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
2463    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
2464    mov             v22.16b, v18.16b
2465    mov             v25.16b, v24.16b
2466
2467    smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
2468    smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
2469    smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
2470    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
2471
2472    rshrn           v18.4h, v18.4s, #DESCALE_P2
2473    rshrn           v22.4h, v22.4s, #DESCALE_P2
2474    rshrn2          v18.8h, v24.4s, #DESCALE_P2  /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
2475    rshrn2          v22.8h, v25.4s, #DESCALE_P2  /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
2476
2477    /* Odd part */
2478    add             v8.8h, v28.8h, v31.8h   /* z1 = tmp4 + tmp7; */
2479    add             v9.8h, v29.8h, v30.8h   /* z2 = tmp5 + tmp6; */
2480    add             v10.8h, v28.8h, v30.8h  /* z3 = tmp4 + tmp6; */
2481    add             v11.8h, v29.8h, v31.8h  /* z4 = tmp5 + tmp7; */
2482
2483    smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
2484    smull2          v5.4s, v10.8h, XFIX_P_1_175
2485    smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
2486    smlal2          v5.4s, v11.8h, XFIX_P_1_175
2487
2488    smull2          v24.4s, v28.8h, XFIX_P_0_298
2489    smull2          v25.4s, v29.8h, XFIX_P_2_053
2490    smull2          v26.4s, v30.8h, XFIX_P_3_072
2491    smull2          v27.4s, v31.8h, XFIX_P_1_501
2492    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
2493    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
2494    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
2495    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
2496
2497    smull2          v12.4s, v8.8h, XFIX_N_0_899
2498    smull2          v13.4s, v9.8h, XFIX_N_2_562
2499    smull2          v14.4s, v10.8h, XFIX_N_1_961
2500    smull2          v15.4s, v11.8h, XFIX_N_0_390
2501    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, - FIX_0_899976223); */
2502    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, - FIX_2_562915447); */
2503    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560); */
2504    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644); */
2505
2506    add             v10.4s, v10.4s, v4.4s
2507    add             v14.4s, v14.4s, v5.4s
2508    add             v11.4s, v11.4s, v4.4s
2509    add             v15.4s, v15.4s, v5.4s
2510
2511    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
2512    add             v24.4s, v24.4s, v12.4s
2513    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
2514    add             v25.4s, v25.4s, v13.4s
2515    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
2516    add             v26.4s, v26.4s, v14.4s
2517    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
2518    add             v27.4s, v27.4s, v15.4s
2519
2520    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
2521    add             v24.4s, v24.4s, v14.4s
2522    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
2523    add             v25.4s, v25.4s, v15.4s
2524    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
2525    add             v26.4s, v26.4s, v13.4s
2526    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
2527    add             v27.4s, v27.4s, v12.4s
2528
2529    rshrn           v23.4h, v28.4s, #DESCALE_P2
2530    rshrn           v21.4h, v29.4s, #DESCALE_P2
2531    rshrn           v19.4h, v30.4s, #DESCALE_P2
2532    rshrn           v17.4h, v31.4s, #DESCALE_P2
2533    rshrn2          v23.8h, v24.4s, #DESCALE_P2  /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
2534    rshrn2          v21.8h, v25.4s, #DESCALE_P2  /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
2535    rshrn2          v19.8h, v26.4s, #DESCALE_P2  /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
2536    rshrn2          v17.8h, v27.4s, #DESCALE_P2  /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
2537
2538    /* store results */
2539    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
2540    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
2541
2542    /* Restore NEON registers */
2543    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
2544    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
2545
2546    br              x30
2547
2548    .unreq          DATA
2549    .unreq          TMP
2550
2551#undef XFIX_P_0_298
2552#undef XFIX_N_0_390
2553#undef XFIX_P_0_541
2554#undef XFIX_P_0_765
2555#undef XFIX_N_0_899
2556#undef XFIX_P_1_175
2557#undef XFIX_P_1_501
2558#undef XFIX_N_1_847
2559#undef XFIX_N_1_961
2560#undef XFIX_P_2_053
2561#undef XFIX_N_2_562
2562#undef XFIX_P_3_072
2563
2564
2565/*****************************************************************************/
2566
2567/*
2568 * jsimd_fdct_ifast_neon
2569 *
2570 * This function contains a fast, not so accurate integer implementation of
2571 * the forward DCT (Discrete Cosine Transform). It uses the same calculations
2572 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
2573 * function from jfdctfst.c
2574 *
2575 * TODO: can be combined with 'jsimd_convsamp_neon' to get
2576 *       rid of a bunch of VLD1.16 instructions
2577 */
2578
2579#undef XFIX_0_541196100
2580#define XFIX_0_382683433 v0.h[0]
2581#define XFIX_0_541196100 v0.h[1]
2582#define XFIX_0_707106781 v0.h[2]
2583#define XFIX_1_306562965 v0.h[3]
2584
2585.balign 16
2586Ljsimd_fdct_ifast_neon_consts:
2587  .short (98 * 128)               /* XFIX_0_382683433 */
2588  .short (139 * 128)              /* XFIX_0_541196100 */
2589  .short (181 * 128)              /* XFIX_0_707106781 */
2590  .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
2591
2592asm_function jsimd_fdct_ifast_neon
2593
2594    DATA            .req x0
2595    TMP             .req x9
2596
2597    /* Load constants */
2598    adr             TMP, Ljsimd_fdct_ifast_neon_consts
2599    ld1             {v0.4h}, [TMP]
2600
2601    /* Load all DATA into NEON registers with the following allocation:
2602     *       0 1 2 3 | 4 5 6 7
2603     *      ---------+--------
2604     *   0 | d16     | d17    | v0.8h
2605     *   1 | d18     | d19    | q9
2606     *   2 | d20     | d21    | q10
2607     *   3 | d22     | d23    | q11
2608     *   4 | d24     | d25    | q12
2609     *   5 | d26     | d27    | q13
2610     *   6 | d28     | d29    | q14
2611     *   7 | d30     | d31    | q15
2612     */
2613
2614    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
2615    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
2616    mov             TMP, #2
2617    sub             DATA, DATA, #64
26181:
2619    /* Transpose */
2620    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4
2621    subs            TMP, TMP, #1
2622    /* 1-D FDCT */
2623    add             v4.8h, v19.8h, v20.8h
2624    sub             v20.8h, v19.8h, v20.8h
2625    sub             v28.8h, v18.8h, v21.8h
2626    add             v18.8h, v18.8h, v21.8h
2627    sub             v29.8h, v17.8h, v22.8h
2628    add             v17.8h, v17.8h, v22.8h
2629    sub             v21.8h, v16.8h, v23.8h
2630    add             v16.8h, v16.8h, v23.8h
2631    sub             v6.8h, v17.8h, v18.8h
2632    sub             v7.8h, v16.8h, v4.8h
2633    add             v5.8h, v17.8h, v18.8h
2634    add             v6.8h, v6.8h, v7.8h
2635    add             v4.8h, v16.8h, v4.8h
2636    sqdmulh         v6.8h, v6.8h, XFIX_0_707106781
2637    add             v19.8h, v20.8h, v28.8h
2638    add             v16.8h, v4.8h, v5.8h
2639    sub             v20.8h, v4.8h, v5.8h
2640    add             v5.8h, v28.8h, v29.8h
2641    add             v29.8h, v29.8h, v21.8h
2642    sqdmulh         v5.8h, v5.8h, XFIX_0_707106781
2643    sub             v28.8h, v19.8h, v29.8h
2644    add             v18.8h, v7.8h, v6.8h
2645    sqdmulh         v28.8h, v28.8h, XFIX_0_382683433
2646    sub             v22.8h, v7.8h, v6.8h
2647    sqdmulh         v19.8h, v19.8h, XFIX_0_541196100
2648    sqdmulh         v7.8h, v29.8h, XFIX_1_306562965
2649    add             v6.8h, v21.8h, v5.8h
2650    sub             v5.8h, v21.8h, v5.8h
2651    add             v29.8h, v29.8h, v28.8h
2652    add             v19.8h, v19.8h, v28.8h
2653    add             v29.8h, v29.8h, v7.8h
2654    add             v21.8h, v5.8h, v19.8h
2655    sub             v19.8h, v5.8h, v19.8h
2656    add             v17.8h, v6.8h, v29.8h
2657    sub             v23.8h, v6.8h, v29.8h
2658
2659    b.ne            1b
2660
2661    /* store results */
2662    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
2663    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
2664
2665    br              x30
2666
2667    .unreq          DATA
2668    .unreq          TMP
2669#undef XFIX_0_382683433
2670#undef XFIX_0_541196100
2671#undef XFIX_0_707106781
2672#undef XFIX_1_306562965
2673
2674
2675/*****************************************************************************/
2676
2677/*
2678 * GLOBAL(void)
2679 * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors,
2680 *                      DCTELEM *workspace);
2681 *
2682 */
2683asm_function jsimd_quantize_neon
2684
2685    COEF_BLOCK      .req x0
2686    DIVISORS        .req x1
2687    WORKSPACE       .req x2
2688
2689    RECIPROCAL      .req DIVISORS
2690    CORRECTION      .req x9
2691    SHIFT           .req x10
2692    LOOP_COUNT      .req x11
2693
2694    mov             LOOP_COUNT, #2
2695    add             CORRECTION, DIVISORS, #(64 * 2)
2696    add             SHIFT, DIVISORS, #(64 * 6)
26971:
2698    subs            LOOP_COUNT, LOOP_COUNT, #1
2699    ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64
2700    ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64
2701    abs             v20.8h, v0.8h
2702    abs             v21.8h, v1.8h
2703    abs             v22.8h, v2.8h
2704    abs             v23.8h, v3.8h
2705    ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64
2706    add             v20.8h, v20.8h, v4.8h  /* add correction */
2707    add             v21.8h, v21.8h, v5.8h
2708    add             v22.8h, v22.8h, v6.8h
2709    add             v23.8h, v23.8h, v7.8h
2710    umull           v4.4s, v20.4h, v28.4h  /* multiply by reciprocal */
2711    umull2          v16.4s, v20.8h, v28.8h
2712    umull           v5.4s, v21.4h, v29.4h
2713    umull2          v17.4s, v21.8h, v29.8h
2714    umull           v6.4s, v22.4h, v30.4h  /* multiply by reciprocal */
2715    umull2          v18.4s, v22.8h, v30.8h
2716    umull           v7.4s, v23.4h, v31.4h
2717    umull2          v19.4s, v23.8h, v31.8h
2718    ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64
2719    shrn            v4.4h, v4.4s, #16
2720    shrn            v5.4h, v5.4s, #16
2721    shrn            v6.4h, v6.4s, #16
2722    shrn            v7.4h, v7.4s, #16
2723    shrn2           v4.8h, v16.4s, #16
2724    shrn2           v5.8h, v17.4s, #16
2725    shrn2           v6.8h, v18.4s, #16
2726    shrn2           v7.8h, v19.4s, #16
2727    neg             v24.8h, v24.8h
2728    neg             v25.8h, v25.8h
2729    neg             v26.8h, v26.8h
2730    neg             v27.8h, v27.8h
2731    sshr            v0.8h, v0.8h, #15  /* extract sign */
2732    sshr            v1.8h, v1.8h, #15
2733    sshr            v2.8h, v2.8h, #15
2734    sshr            v3.8h, v3.8h, #15
2735    ushl            v4.8h, v4.8h, v24.8h  /* shift */
2736    ushl            v5.8h, v5.8h, v25.8h
2737    ushl            v6.8h, v6.8h, v26.8h
2738    ushl            v7.8h, v7.8h, v27.8h
2739
2740    eor             v4.16b, v4.16b, v0.16b  /* restore sign */
2741    eor             v5.16b, v5.16b, v1.16b
2742    eor             v6.16b, v6.16b, v2.16b
2743    eor             v7.16b, v7.16b, v3.16b
2744    sub             v4.8h, v4.8h, v0.8h
2745    sub             v5.8h, v5.8h, v1.8h
2746    sub             v6.8h, v6.8h, v2.8h
2747    sub             v7.8h, v7.8h, v3.8h
2748    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64
2749
2750    b.ne            1b
2751
2752    br              x30  /* return */
2753
2754    .unreq          COEF_BLOCK
2755    .unreq          DIVISORS
2756    .unreq          WORKSPACE
2757    .unreq          RECIPROCAL
2758    .unreq          CORRECTION
2759    .unreq          SHIFT
2760    .unreq          LOOP_COUNT
2761
2762
2763/*****************************************************************************/
2764
2765/*
2766 * Downsample pixel values of a single component.
2767 * This version handles the common case of 2:1 horizontal and 1:1 vertical,
2768 * without smoothing.
2769 *
2770 * GLOBAL(void)
2771 * jsimd_h2v1_downsample_neon (JDIMENSION image_width, int max_v_samp_factor,
2772 *                             JDIMENSION v_samp_factor,
2773 *                             JDIMENSION width_blocks, JSAMPARRAY input_data,
2774 *                             JSAMPARRAY output_data);
2775 */
2776
2777.balign 16
2778Ljsimd_h2_downsample_neon_consts:
2779  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2780        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F  /* diff 0 */
2781  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2782        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E  /* diff 1 */
2783  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2784        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D  /* diff 2 */
2785  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2786        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C  /* diff 3 */
2787  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2788        0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B  /* diff 4 */
2789  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2790        0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A  /* diff 5 */
2791  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2792        0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09  /* diff 6 */
2793  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2794        0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08  /* diff 7 */
2795  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
2796        0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07  /* diff 8 */
2797  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
2798        0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06  /* diff 9 */
2799  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
2800        0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05  /* diff 10 */
2801  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
2802        0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04  /* diff 11 */
2803  .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
2804        0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03  /* diff 12 */
2805  .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
2806        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02  /* diff 13 */
2807  .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
2808        0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01  /* diff 14 */
2809  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
2810        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* diff 15 */
2811
2812asm_function jsimd_h2v1_downsample_neon
2813    IMAGE_WIDTH     .req x0
2814    MAX_V_SAMP      .req x1
2815    V_SAMP          .req x2
2816    BLOCK_WIDTH     .req x3
2817    INPUT_DATA      .req x4
2818    OUTPUT_DATA     .req x5
2819    OUTPTR          .req x9
2820    INPTR           .req x10
2821    TMP1            .req x11
2822    TMP2            .req x12
2823    TMP3            .req x13
2824    TMPDUP          .req w15
2825
2826    mov             TMPDUP, #0x10000
2827    lsl             TMP2, BLOCK_WIDTH, #4
2828    sub             TMP2, TMP2, IMAGE_WIDTH
2829    adr             TMP3, Ljsimd_h2_downsample_neon_consts
2830    add             TMP3, TMP3, TMP2, lsl #4
2831    dup             v16.4s, TMPDUP
2832    ld1             {v18.16b}, [TMP3]
2833
28341:  /* row loop */
2835    ldr             INPTR, [INPUT_DATA], #8
2836    ldr             OUTPTR, [OUTPUT_DATA], #8
2837    subs            TMP1, BLOCK_WIDTH, #1
2838    b.eq            3f
28392:  /* columns */
2840    ld1             {v0.16b}, [INPTR], #16
2841    mov             v4.16b, v16.16b
2842    subs            TMP1, TMP1, #1
2843    uadalp          v4.8h, v0.16b
2844    shrn            v6.8b, v4.8h, #1
2845    st1             {v6.8b}, [OUTPTR], #8
2846    b.ne            2b
28473:  /* last columns */
2848    ld1             {v0.16b}, [INPTR]
2849    mov             v4.16b, v16.16b
2850    subs            V_SAMP, V_SAMP, #1
2851    /* expand right */
2852    tbl             v2.16b, {v0.16b}, v18.16b
2853    uadalp          v4.8h, v2.16b
2854    shrn            v6.8b, v4.8h, #1
2855    st1             {v6.8b}, [OUTPTR], #8
2856    b.ne            1b
2857
2858    br              x30
2859
2860    .unreq          IMAGE_WIDTH
2861    .unreq          MAX_V_SAMP
2862    .unreq          V_SAMP
2863    .unreq          BLOCK_WIDTH
2864    .unreq          INPUT_DATA
2865    .unreq          OUTPUT_DATA
2866    .unreq          OUTPTR
2867    .unreq          INPTR
2868    .unreq          TMP1
2869    .unreq          TMP2
2870    .unreq          TMP3
2871    .unreq          TMPDUP
2872
2873
2874/*****************************************************************************/
2875
2876/*
2877 * Downsample pixel values of a single component.
2878 * This version handles the common case of 2:1 horizontal and 2:1 vertical,
2879 * without smoothing.
2880 *
2881 * GLOBAL(void)
2882 * jsimd_h2v2_downsample_neon (JDIMENSION image_width, int max_v_samp_factor,
2883 *                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
2884 *                             JSAMPARRAY input_data, JSAMPARRAY output_data);
2885 */
2886
2887.balign 16
2888asm_function jsimd_h2v2_downsample_neon
2889    IMAGE_WIDTH     .req x0
2890    MAX_V_SAMP      .req x1
2891    V_SAMP          .req x2
2892    BLOCK_WIDTH     .req x3
2893    INPUT_DATA      .req x4
2894    OUTPUT_DATA     .req x5
2895    OUTPTR          .req x9
2896    INPTR0          .req x10
2897    INPTR1          .req x14
2898    TMP1            .req x11
2899    TMP2            .req x12
2900    TMP3            .req x13
2901    TMPDUP          .req w15
2902
2903    mov             TMPDUP, #1
2904    lsl             TMP2, BLOCK_WIDTH, #4
2905    lsl             TMPDUP, TMPDUP, #17
2906    sub             TMP2, TMP2, IMAGE_WIDTH
2907    adr             TMP3, Ljsimd_h2_downsample_neon_consts
2908    orr             TMPDUP, TMPDUP, #1
2909    add             TMP3, TMP3, TMP2, lsl #4
2910    dup             v16.4s, TMPDUP
2911    ld1             {v18.16b}, [TMP3]
2912
29131:  /* row loop */
2914    ldr             INPTR0, [INPUT_DATA], #8
2915    ldr             OUTPTR, [OUTPUT_DATA], #8
2916    ldr             INPTR1, [INPUT_DATA], #8
2917    subs            TMP1, BLOCK_WIDTH, #1
2918    b.eq            3f
29192:  /* columns */
2920    ld1             {v0.16b}, [INPTR0], #16
2921    ld1             {v1.16b}, [INPTR1], #16
2922    mov             v4.16b, v16.16b
2923    subs            TMP1, TMP1, #1
2924    uadalp          v4.8h, v0.16b
2925    uadalp          v4.8h, v1.16b
2926    shrn            v6.8b, v4.8h, #2
2927    st1             {v6.8b}, [OUTPTR], #8
2928    b.ne            2b
29293:  /* last columns */
2930    ld1             {v0.16b}, [INPTR0], #16
2931    ld1             {v1.16b}, [INPTR1], #16
2932    mov             v4.16b, v16.16b
2933    subs            V_SAMP, V_SAMP, #1
2934    /* expand right */
2935    tbl             v2.16b, {v0.16b}, v18.16b
2936    tbl             v3.16b, {v1.16b}, v18.16b
2937    uadalp          v4.8h, v2.16b
2938    uadalp          v4.8h, v3.16b
2939    shrn            v6.8b, v4.8h, #2
2940    st1             {v6.8b}, [OUTPTR], #8
2941    b.ne            1b
2942
2943    br              x30
2944
2945    .unreq          IMAGE_WIDTH
2946    .unreq          MAX_V_SAMP
2947    .unreq          V_SAMP
2948    .unreq          BLOCK_WIDTH
2949    .unreq          INPUT_DATA
2950    .unreq          OUTPUT_DATA
2951    .unreq          OUTPTR
2952    .unreq          INPTR0
2953    .unreq          INPTR1
2954    .unreq          TMP1
2955    .unreq          TMP2
2956    .unreq          TMP3
2957    .unreq          TMPDUP
2958
2959
2960/*****************************************************************************/
2961
2962/*
2963 * GLOBAL(JOCTET*)
2964 * jsimd_huff_encode_one_block (working_state *state, JOCTET *buffer,
2965 *                              JCOEFPTR block, int last_dc_val,
2966 *                              c_derived_tbl *dctbl, c_derived_tbl *actbl)
2967 *
2968 */
2969
2970    BUFFER          .req x1
2971    PUT_BUFFER      .req x6
2972    PUT_BITS        .req x7
2973    PUT_BITSw       .req w7
2974
2975.macro emit_byte
2976    sub             PUT_BITS, PUT_BITS, #0x8
2977    lsr             x19, PUT_BUFFER, PUT_BITS
2978    uxtb            w19, w19
2979    strb            w19, [BUFFER, #1]!
2980    cmp             w19, #0xff
2981    b.ne            14f
2982    strb            wzr, [BUFFER, #1]!
298314:
2984.endm
2985.macro put_bits CODE, SIZE
2986    lsl             PUT_BUFFER, PUT_BUFFER, \SIZE
2987    add             PUT_BITS, PUT_BITS, \SIZE
2988    orr             PUT_BUFFER, PUT_BUFFER, \CODE
2989.endm
2990.macro checkbuf31
2991    cmp             PUT_BITS, #0x20
2992    b.lt            31f
2993    emit_byte
2994    emit_byte
2995    emit_byte
2996    emit_byte
299731:
2998.endm
2999.macro checkbuf47
3000    cmp             PUT_BITS, #0x30
3001    b.lt            47f
3002    emit_byte
3003    emit_byte
3004    emit_byte
3005    emit_byte
3006    emit_byte
3007    emit_byte
300847:
3009.endm
3010
3011.macro generate_jsimd_huff_encode_one_block fast_tbl
3012
3013.balign 16
3014.if \fast_tbl == 1
3015Ljsimd_huff_encode_one_block_neon_consts:
3016.else
3017Ljsimd_huff_encode_one_block_neon_slowtbl_consts:
3018.endif
3019    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
3020          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
3021.if \fast_tbl == 1
3022    .byte    0,   1,   2,   3,  16,  17,  32,  33, \
3023            18,  19,   4,   5,   6,   7,  20,  21  /* L0 => L3 : 4 lines OK */
3024    .byte   34,  35,  48,  49, 255, 255,  50,  51, \
3025            36,  37,  22,  23,   8,   9,  10,  11  /* L0 => L3 : 4 lines OK */
3026    .byte    8,   9,  22,  23,  36,  37,  50,  51, \
3027           255, 255, 255, 255, 255, 255,  52,  53  /* L1 => L4 : 4 lines OK */
3028    .byte   54,  55,  40,  41,  26,  27,  12,  13, \
3029            14,  15,  28,  29,  42,  43,  56,  57  /* L0 => L3 : 4 lines OK */
3030    .byte    6,   7,  20,  21,  34,  35,  48,  49, \
3031            50,  51,  36,  37,  22,  23,   8,   9  /* L4 => L7 : 4 lines OK */
3032    .byte   42,  43,  28,  29,  14,  15,  30,  31, \
3033            44,  45,  58,  59, 255, 255, 255, 255  /* L1 => L4 : 4 lines OK */
3034    .byte  255, 255, 255, 255,  56,  57,  42,  43, \
3035            28,  29,  14,  15,  30,  31,  44,  45  /* L3 => L6 : 4 lines OK */
3036    .byte   26,  27,  40,  41,  42,  43,  28,  29, \
3037            14,  15,  30,  31,  44,  45,  46,  47  /* L5 => L7 : 3 lines OK */
3038    .byte  255, 255, 255, 255,   0,   1, 255, 255, \
3039           255, 255, 255, 255, 255, 255, 255, 255  /* L4 : 1 lines OK */
3040    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
3041             0,   1,  16,  17,   2,   3, 255, 255  /* L5 => L6 : 2 lines OK */
3042    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
3043           255, 255, 255, 255,   8,   9,  22,  23  /* L5 => L6 : 2 lines OK */
3044    .byte    4,   5,   6,   7, 255, 255, 255, 255, \
3045           255, 255, 255, 255, 255, 255, 255, 255  /* L7 : 1 line OK */
3046.endif
3047
3048.if \fast_tbl == 1
3049asm_function jsimd_huff_encode_one_block_neon
3050.else
3051asm_function jsimd_huff_encode_one_block_neon_slowtbl
3052.endif
3053    sub             sp, sp, 272
3054    sub             BUFFER, BUFFER, #0x1    /* BUFFER=buffer-- */
3055    /* Save ARM registers */
3056    stp             x19, x20, [sp]
3057.if \fast_tbl == 1
3058    adr             x15, Ljsimd_huff_encode_one_block_neon_consts
3059.else
3060    adr             x15, Ljsimd_huff_encode_one_block_neon_slowtbl_consts
3061.endif
3062    ldr             PUT_BUFFER, [x0, #0x10]
3063    ldr             PUT_BITSw, [x0, #0x18]
3064    ldrsh           w12, [x2]               /* load DC coeff in w12 */
3065    /* prepare data */
3066.if \fast_tbl == 1
3067    ld1             {v23.16b}, [x15], #16
3068    ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64
3069    ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64
3070    ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64
3071    ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64
3072    ld1             {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64
3073    sub             w12, w12, w3      /* last_dc_val, not used afterwards */
3074    /* ZigZag 8x8 */
3075    tbl             v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b
3076    tbl             v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b
3077    tbl             v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b
3078    tbl             v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b
3079    tbl             v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b
3080    tbl             v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b
3081    tbl             v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b
3082    tbl             v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b
3083    ins             v0.h[0], w12
3084    tbx             v1.16b, {v28.16b}, v16.16b
3085    tbx             v2.16b, {v29.16b, v30.16b}, v17.16b
3086    tbx             v5.16b, {v29.16b, v30.16b}, v18.16b
3087    tbx             v6.16b, {v31.16b}, v19.16b
3088.else
3089      add             x13, x2, #0x22
3090      sub             w12, w12, w3    /* last_dc_val, not used afterwards */
3091    ld1             {v23.16b}, [x15]
3092      add             x14, x2, #0x18
3093      add             x3, x2, #0x36
3094    ins             v0.h[0], w12
3095      add             x9, x2, #0x2
3096    ld1             {v1.h}[0], [x13]
3097      add             x15, x2, #0x30
3098    ld1             {v2.h}[0], [x14]
3099      add             x19, x2, #0x26
3100    ld1             {v3.h}[0], [x3]
3101      add             x20, x2, #0x28
3102    ld1             {v0.h}[1], [x9]
3103      add             x12, x2, #0x10
3104    ld1             {v1.h}[1], [x15]
3105      add             x13, x2, #0x40
3106    ld1             {v2.h}[1], [x19]
3107      add             x14, x2, #0x34
3108    ld1             {v3.h}[1], [x20]
3109      add             x3, x2, #0x1a
3110    ld1             {v0.h}[2], [x12]
3111      add             x9, x2, #0x20
3112    ld1             {v1.h}[2], [x13]
3113      add             x15, x2, #0x32
3114    ld1             {v2.h}[2], [x14]
3115      add             x19, x2, #0x42
3116    ld1             {v3.h}[2], [x3]
3117      add             x20, x2, #0xc
3118    ld1             {v0.h}[3], [x9]
3119      add             x12, x2, #0x12
3120    ld1             {v1.h}[3], [x15]
3121      add             x13, x2, #0x24
3122    ld1             {v2.h}[3], [x19]
3123      add             x14, x2, #0x50
3124    ld1             {v3.h}[3], [x20]
3125      add             x3, x2, #0xe
3126    ld1             {v0.h}[4], [x12]
3127      add             x9, x2, #0x4
3128    ld1             {v1.h}[4], [x13]
3129      add             x15, x2, #0x16
3130    ld1             {v2.h}[4], [x14]
3131      add             x19, x2, #0x60
3132    ld1             {v3.h}[4], [x3]
3133      add             x20, x2, #0x1c
3134    ld1             {v0.h}[5], [x9]
3135      add             x12, x2, #0x6
3136    ld1             {v1.h}[5], [x15]
3137      add             x13, x2, #0x8
3138    ld1             {v2.h}[5], [x19]
3139      add             x14, x2, #0x52
3140    ld1             {v3.h}[5], [x20]
3141      add             x3, x2, #0x2a
3142    ld1             {v0.h}[6], [x12]
3143      add             x9, x2, #0x14
3144    ld1             {v1.h}[6], [x13]
3145      add             x15, x2, #0xa
3146    ld1             {v2.h}[6], [x14]
3147      add             x19, x2, #0x44
3148    ld1             {v3.h}[6], [x3]
3149      add             x20, x2, #0x38
3150    ld1             {v0.h}[7], [x9]
3151      add             x12, x2, #0x46
3152    ld1             {v1.h}[7], [x15]
3153      add             x13, x2, #0x3a
3154    ld1             {v2.h}[7], [x19]
3155      add             x14, x2, #0x74
3156    ld1             {v3.h}[7], [x20]
3157      add             x3, x2, #0x6a
3158    ld1             {v4.h}[0], [x12]
3159      add             x9, x2, #0x54
3160    ld1             {v5.h}[0], [x13]
3161      add             x15, x2, #0x2c
3162    ld1             {v6.h}[0], [x14]
3163      add             x19, x2, #0x76
3164    ld1             {v7.h}[0], [x3]
3165      add             x20, x2, #0x78
3166    ld1             {v4.h}[1], [x9]
3167      add             x12, x2, #0x62
3168    ld1             {v5.h}[1], [x15]
3169      add             x13, x2, #0x1e
3170    ld1             {v6.h}[1], [x19]
3171      add             x14, x2, #0x68
3172    ld1             {v7.h}[1], [x20]
3173      add             x3, x2, #0x7a
3174    ld1             {v4.h}[2], [x12]
3175      add             x9, x2, #0x70
3176    ld1             {v5.h}[2], [x13]
3177      add             x15, x2, #0x2e
3178    ld1             {v6.h}[2], [x14]
3179      add             x19, x2, #0x5a
3180    ld1             {v7.h}[2], [x3]
3181      add             x20, x2, #0x6c
3182    ld1             {v4.h}[3], [x9]
3183      add             x12, x2, #0x72
3184    ld1             {v5.h}[3], [x15]
3185      add             x13, x2, #0x3c
3186    ld1             {v6.h}[3], [x19]
3187      add             x14, x2, #0x4c
3188    ld1             {v7.h}[3], [x20]
3189      add             x3, x2, #0x5e
3190    ld1             {v4.h}[4], [x12]
3191      add             x9, x2, #0x64
3192    ld1             {v5.h}[4], [x13]
3193      add             x15, x2, #0x4a
3194    ld1             {v6.h}[4], [x14]
3195      add             x19, x2, #0x3e
3196    ld1             {v7.h}[4], [x3]
3197      add             x20, x2, #0x6e
3198    ld1             {v4.h}[5], [x9]
3199      add             x12, x2, #0x56
3200    ld1             {v5.h}[5], [x15]
3201      add             x13, x2, #0x58
3202    ld1             {v6.h}[5], [x19]
3203      add             x14, x2, #0x4e
3204    ld1             {v7.h}[5], [x20]
3205      add             x3, x2, #0x7c
3206    ld1             {v4.h}[6], [x12]
3207      add             x9, x2, #0x48
3208    ld1             {v5.h}[6], [x13]
3209      add             x15, x2, #0x66
3210    ld1             {v6.h}[6], [x14]
3211      add             x19, x2, #0x5c
3212    ld1             {v7.h}[6], [x3]
3213      add             x20, x2, #0x7e
3214    ld1             {v4.h}[7], [x9]
3215    ld1             {v5.h}[7], [x15]
3216    ld1             {v6.h}[7], [x19]
3217    ld1             {v7.h}[7], [x20]
3218.endif
3219    cmlt            v24.8h, v0.8h, #0
3220    cmlt            v25.8h, v1.8h, #0
3221    cmlt            v26.8h, v2.8h, #0
3222    cmlt            v27.8h, v3.8h, #0
3223    cmlt            v28.8h, v4.8h, #0
3224    cmlt            v29.8h, v5.8h, #0
3225    cmlt            v30.8h, v6.8h, #0
3226    cmlt            v31.8h, v7.8h, #0
3227    abs             v0.8h, v0.8h
3228    abs             v1.8h, v1.8h
3229    abs             v2.8h, v2.8h
3230    abs             v3.8h, v3.8h
3231    abs             v4.8h, v4.8h
3232    abs             v5.8h, v5.8h
3233    abs             v6.8h, v6.8h
3234    abs             v7.8h, v7.8h
3235    eor             v24.16b, v24.16b, v0.16b
3236    eor             v25.16b, v25.16b, v1.16b
3237    eor             v26.16b, v26.16b, v2.16b
3238    eor             v27.16b, v27.16b, v3.16b
3239    eor             v28.16b, v28.16b, v4.16b
3240    eor             v29.16b, v29.16b, v5.16b
3241    eor             v30.16b, v30.16b, v6.16b
3242    eor             v31.16b, v31.16b, v7.16b
3243    cmeq            v16.8h, v0.8h, #0
3244    cmeq            v17.8h, v1.8h, #0
3245    cmeq            v18.8h, v2.8h, #0
3246    cmeq            v19.8h, v3.8h, #0
3247    cmeq            v20.8h, v4.8h, #0
3248    cmeq            v21.8h, v5.8h, #0
3249    cmeq            v22.8h, v6.8h, #0
3250    xtn             v16.8b, v16.8h
3251    xtn             v18.8b, v18.8h
3252    xtn             v20.8b, v20.8h
3253    xtn             v22.8b, v22.8h
3254      umov            w14, v0.h[0]
3255    xtn2            v16.16b, v17.8h
3256      umov            w13, v24.h[0]
3257    xtn2            v18.16b, v19.8h
3258      clz             w14, w14
3259    xtn2            v20.16b, v21.8h
3260      lsl             w13, w13, w14
3261    cmeq            v17.8h, v7.8h, #0
3262      sub             w12, w14, #32
3263    xtn2            v22.16b, v17.8h
3264      lsr             w13, w13, w14
3265    and             v16.16b, v16.16b, v23.16b
3266      neg             w12, w12
3267    and             v18.16b, v18.16b, v23.16b
3268      add             x3, x4, #0x400           /* r1 = dctbl->ehufsi */
3269    and             v20.16b, v20.16b, v23.16b
3270      add             x15, sp, #0x90           /* x15 = t2 */
3271    and             v22.16b, v22.16b, v23.16b
3272      ldr             w10, [x4, x12, lsl #2]
3273    addp            v16.16b, v16.16b, v18.16b
3274      ldrb            w11, [x3, x12]
3275    addp            v20.16b, v20.16b, v22.16b
3276      checkbuf47
3277    addp            v16.16b, v16.16b, v20.16b
3278      put_bits        x10, x11
3279    addp            v16.16b, v16.16b, v18.16b
3280      checkbuf47
3281    umov            x9,v16.D[0]
3282      put_bits        x13, x12
3283    cnt             v17.8b, v16.8b
3284      mvn             x9, x9
3285    addv            B18, v17.8b
3286      add             x4, x5, #0x400   /* x4 = actbl->ehufsi */
3287    umov            w12, v18.b[0]
3288      lsr             x9, x9, #0x1     /* clear AC coeff */
3289    ldr             w13, [x5, #0x3c0]  /* x13 = actbl->ehufco[0xf0] */
3290    rbit            x9, x9             /* x9 = index0 */
3291    ldrb            w14, [x4, #0xf0]   /* x14 = actbl->ehufsi[0xf0] */
3292    cmp             w12, #(64-8)
3293    add             x11, sp, #16
3294    b.lt            4f
3295    cbz             x9, 6f
3296    st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
3297    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
3298    st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
3299    st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
33001:
3301    clz             x2, x9
3302    add             x15, x15, x2, lsl #1
3303    lsl             x9, x9, x2
3304    ldrh            w20, [x15, #-126]
33052:
3306    cmp             x2, #0x10
3307    b.lt            3f
3308    sub             x2, x2, #0x10
3309    checkbuf47
3310    put_bits        x13, x14
3311    b               2b
33123:
3313    clz             w20, w20
3314    ldrh            w3, [x15, #2]!
3315    sub             w11, w20, #32
3316    lsl             w3, w3, w20
3317    neg             w11, w11
3318    lsr             w3, w3, w20
3319    add             x2, x11, x2, lsl #4
3320    lsl             x9, x9, #0x1
3321    ldr             w12, [x5, x2, lsl #2]
3322    ldrb            w10, [x4, x2]
3323    checkbuf31
3324    put_bits        x12, x10
3325    put_bits        x3, x11
3326    cbnz            x9, 1b
3327    b               6f
33284:
3329    movi            v21.8h, #0x0010
3330    clz             v0.8h, v0.8h
3331    clz             v1.8h, v1.8h
3332    clz             v2.8h, v2.8h
3333    clz             v3.8h, v3.8h
3334    clz             v4.8h, v4.8h
3335    clz             v5.8h, v5.8h
3336    clz             v6.8h, v6.8h
3337    clz             v7.8h, v7.8h
3338    ushl            v24.8h, v24.8h, v0.8h
3339    ushl            v25.8h, v25.8h, v1.8h
3340    ushl            v26.8h, v26.8h, v2.8h
3341    ushl            v27.8h, v27.8h, v3.8h
3342    ushl            v28.8h, v28.8h, v4.8h
3343    ushl            v29.8h, v29.8h, v5.8h
3344    ushl            v30.8h, v30.8h, v6.8h
3345    ushl            v31.8h, v31.8h, v7.8h
3346    neg             v0.8h, v0.8h
3347    neg             v1.8h, v1.8h
3348    neg             v2.8h, v2.8h
3349    neg             v3.8h, v3.8h
3350    neg             v4.8h, v4.8h
3351    neg             v5.8h, v5.8h
3352    neg             v6.8h, v6.8h
3353    neg             v7.8h, v7.8h
3354    ushl            v24.8h, v24.8h, v0.8h
3355    ushl            v25.8h, v25.8h, v1.8h
3356    ushl            v26.8h, v26.8h, v2.8h
3357    ushl            v27.8h, v27.8h, v3.8h
3358    ushl            v28.8h, v28.8h, v4.8h
3359    ushl            v29.8h, v29.8h, v5.8h
3360    ushl            v30.8h, v30.8h, v6.8h
3361    ushl            v31.8h, v31.8h, v7.8h
3362    add             v0.8h, v21.8h, v0.8h
3363    add             v1.8h, v21.8h, v1.8h
3364    add             v2.8h, v21.8h, v2.8h
3365    add             v3.8h, v21.8h, v3.8h
3366    add             v4.8h, v21.8h, v4.8h
3367    add             v5.8h, v21.8h, v5.8h
3368    add             v6.8h, v21.8h, v6.8h
3369    add             v7.8h, v21.8h, v7.8h
3370    st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
3371    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
3372    st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
3373    st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
33741:
3375    clz             x2, x9
3376    add             x15, x15, x2, lsl #1
3377    lsl             x9, x9, x2
3378    ldrh            w11, [x15, #-126]
33792:
3380    cmp             x2, #0x10
3381    b.lt            3f
3382    sub             x2, x2, #0x10
3383    checkbuf47
3384    put_bits        x13, x14
3385    b               2b
33863:
3387    ldrh            w3, [x15, #2]!
3388    add             x2, x11, x2, lsl #4
3389    lsl             x9, x9, #0x1
3390    ldr             w12, [x5, x2, lsl #2]
3391    ldrb            w10, [x4, x2]
3392    checkbuf31
3393    put_bits        x12, x10
3394    put_bits        x3, x11
3395    cbnz            x9, 1b
33966:
3397    add             x13, sp, #0x10e
3398    cmp             x15, x13
3399    b.hs            1f
3400    ldr             w12, [x5]
3401    ldrb            w14, [x4]
3402    checkbuf47
3403    put_bits        x12, x14
34041:
3405    str             PUT_BUFFER, [x0, #0x10]
3406    str             PUT_BITSw, [x0, #0x18]
3407    ldp             x19, x20, [sp], 16
3408    add             x0, BUFFER, #0x1
3409    add             sp, sp, 256
3410    br              x30
3411
3412.endm
3413
3414generate_jsimd_huff_encode_one_block 1
3415generate_jsimd_huff_encode_one_block 0
3416
3417    .unreq          BUFFER
3418    .unreq          PUT_BUFFER
3419    .unreq          PUT_BITS
3420    .unreq          PUT_BITSw
3421
3422.purgem emit_byte
3423.purgem put_bits
3424.purgem checkbuf31
3425.purgem checkbuf47
3426