1/*
2 * ARMv8 NEON optimizations for libjpeg-turbo
3 *
4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
5 * All rights reserved.
6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
7 * Copyright (C) 2013-2014, Linaro Limited
8 * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
9 * Copyright (C) 2014, D. R. Commander.  All rights reserved.
10 *
11 * This software is provided 'as-is', without any express or implied
12 * warranty.  In no event will the authors be held liable for any damages
13 * arising from the use of this software.
14 *
15 * Permission is granted to anyone to use this software for any purpose,
16 * including commercial applications, and to alter it and redistribute it
17 * freely, subject to the following restrictions:
18 *
19 * 1. The origin of this software must not be misrepresented; you must not
20 *    claim that you wrote the original software. If you use this software
21 *    in a product, an acknowledgment in the product documentation would be
22 *    appreciated but is not required.
23 * 2. Altered source versions must be plainly marked as such, and must not be
24 *    misrepresented as being the original software.
25 * 3. This notice may not be removed or altered from any source distribution.
26 */
27
28#if defined(__linux__) && defined(__ELF__)
29.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
30#endif
31
32.text
33
34
35#define RESPECT_STRICT_ALIGNMENT 1
36
37
38/*****************************************************************************/
39
/* Supplementary macro for setting function attributes.
 * Emits the correct global-symbol boilerplate for the current platform:
 * Mach-O (Apple) exported names carry a leading underscore; on ELF the
 * symbol is additionally hidden (kept out of the dynamic symbol table)
 * and typed as a function. */
.macro asm_function fname
#ifdef __APPLE__
    .globl _\fname
_\fname:
#else
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm
54
/* Transpose elements of single 128 bit registers.
 * \xi is a scratch register; \xilen selects the element size for the
 * ins copies, \literal the arrangement for the trn1/trn2 interleave. */
.macro transpose_single x0,x1,xi,xilen,literal
    ins  \xi\xilen[0],  \x0\xilen[0]  /* scratch = low element of x0 */
    ins  \x1\xilen[0],  \x0\xilen[1]  /* x1 low  = high element of x0 */
    trn1 \x0\literal,   \x0\literal, \x1\literal  /* even elements */
    trn2 \x1\literal,   \xi\literal, \x1\literal  /* odd elements */
.endm
62
/* Transpose elements of 2 different registers.
 * \xi is a scratch register that preserves the original x0 so that
 * trn2 can still read it after trn1 has overwritten x0. */
.macro transpose x0,x1,xi,xilen,literal
    mov  \xi\xilen,     \x0\xilen                 /* save original x0 */
    trn1 \x0\literal,   \x0\literal, \x1\literal  /* x0 = even elements of {x0,x1} */
    trn2 \x1\literal,   \xi\literal, \x1\literal  /* x1 = odd elements of {old x0,x1} */
.endm
69
/* Transpose a block of 4x4 coefficients in four 64-bit registers.
 * 32-bit interleave pass: pairs rows 0<->2 and 1<->3 with trn1/trn2,
 * using \xi as a scratch register to keep the pre-trn1 value alive. */
.macro transpose_4x4_32 x0,x0len x1,x1len x2,x2len x3,x3len,xi,xilen
    mov  \xi\xilen, \x0\xilen                     /* save original x0 */
    trn1 \x0\x0len, \x0\x0len, \x2\x2len
    trn2 \x2\x2len, \xi\x0len, \x2\x2len
    mov  \xi\xilen, \x1\xilen                     /* save original x1 */
    trn1 \x1\x1len, \x1\x1len, \x3\x3len
    trn2 \x3\x3len, \xi\x1len, \x3\x3len
.endm
79
/* 16-bit interleave pass of the 4x4 transpose: pairs rows 0<->1 and
 * 2<->3 with trn1/trn2, using \xi as a scratch register.
 * NOTE(review): the suffix arguments are mixed below (e.g. \x2len on a
 * row-1 operand). This is harmless for the only caller in this file,
 * transpose_4x4, which passes the same arrangement (.4h) for every
 * row — confirm before adding callers with differing arrangements. */
.macro transpose_4x4_16 x0,x0len x1,x1len, x2,x2len, x3,x3len,xi,xilen
    mov  \xi\xilen, \x0\xilen                     /* save original x0 */
    trn1 \x0\x0len, \x0\x0len, \x1\x1len
    trn2 \x1\x2len, \xi\x0len, \x1\x2len
    mov  \xi\xilen, \x2\xilen                     /* save original x2 */
    trn1 \x2\x2len, \x2\x2len, \x3\x3len
    trn2 \x3\x2len, \xi\x1len, \x3\x3len
.endm
88
/* Full 4x4 transpose of four 64-bit halves: a 16-bit (.4h) interleave
 * pass followed by a 32-bit (.2s) interleave pass. \x5 is scratch. */
.macro transpose_4x4 x0, x1, x2, x3,x5
    transpose_4x4_16 \x0,.4h, \x1,.4h, \x2,.4h,\x3,.4h,\x5,.16b
    transpose_4x4_32 \x0,.2s, \x1,.2s, \x2,.2s,\x3,.2s,\x5,.16b
.endm
93
94
95#define CENTERJSAMPLE 128
96
97/*****************************************************************************/
98
99/*
100 * Perform dequantization and inverse DCT on one block of coefficients.
101 *
102 * GLOBAL(void)
103 * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block,
104 *                        JSAMPARRAY output_buf, JDIMENSION output_col)
105 */
106
/* Fixed-point multiplier constants, scaled by 2^13 (CONST_BITS = 13),
 * e.g. FIX_0_298631336 = round(0.298631336 * 8192) = 2446. */
#define FIX_0_298631336  (2446)
#define FIX_0_390180644  (3196)
#define FIX_0_541196100  (4433)
#define FIX_0_765366865  (6270)
#define FIX_0_899976223  (7373)
#define FIX_1_175875602  (9633)
#define FIX_1_501321110  (12299)
#define FIX_1_847759065  (15137)
#define FIX_1_961570560  (16069)
#define FIX_2_053119869  (16819)
#define FIX_2_562915447  (20995)
#define FIX_3_072711026  (25172)

/* Precombined sums/differences of the constants above, so that each can
 * be applied with a single multiply-accumulate instruction. */
#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
#define FIX_0_541196100_PLUS_0_765366865  (FIX_0_541196100 + FIX_0_765366865)
128
/*
 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
 * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
 *
 * Kept as documentation only — it appears unused in this file and
 * describes the algorithm that the NEON code implements. Reads the 1-D
 * input coefficients xrow0..xrow7 and writes its results into the
 * caller-scope variables tmp0..tmp3 and tmp10..tmp13 (before descale).
 */
#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7)   \
{                                                                             \
    DCTELEM row0, row1, row2, row3, row4, row5, row6, row7;                   \
    INT32   q1, q2, q3, q4, q5, q6, q7;                                       \
    INT32   tmp11_plus_tmp2, tmp11_minus_tmp2;                                \
                                                                              \
    /* 1-D iDCT input data */                                                 \
    row0 = xrow0;                                                             \
    row1 = xrow1;                                                             \
    row2 = xrow2;                                                             \
    row3 = xrow3;                                                             \
    row4 = xrow4;                                                             \
    row5 = xrow5;                                                             \
    row6 = xrow6;                                                             \
    row7 = xrow7;                                                             \
                                                                              \
    q5 = row7 + row3;                                                         \
    q4 = row5 + row1;                                                         \
    q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) +                    \
         MULTIPLY(q4, FIX_1_175875602);                                       \
    q7 = MULTIPLY(q5, FIX_1_175875602) +                                      \
         MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644);                     \
    q2 = MULTIPLY(row2, FIX_0_541196100) +                                    \
         MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065);                   \
    q4 = q6;                                                                  \
    q3 = ((INT32) row0 - (INT32) row4) << 13;                                 \
    q6 += MULTIPLY(row5, -FIX_2_562915447) +                                  \
          MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447);                  \
    /* now we can use q1 (reloadable constants have been used up) */          \
    q1 = q3 + q2;                                                             \
    q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) +                 \
          MULTIPLY(row1, -FIX_0_899976223);                                   \
    q5 = q7;                                                                  \
    q1 = q1 + q6;                                                             \
    q7 += MULTIPLY(row7, -FIX_0_899976223) +                                  \
          MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223);                  \
                                                                              \
    /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */        \
    tmp11_plus_tmp2 = q1;                                                     \
    row1 = 0;                                                                 \
                                                                              \
    q1 = q1 - q6;                                                             \
    q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) +                 \
          MULTIPLY(row3, -FIX_2_562915447);                                   \
    q1 = q1 - q6;                                                             \
    q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) +                   \
         MULTIPLY(row6, FIX_0_541196100);                                     \
    q3 = q3 - q2;                                                             \
                                                                              \
    /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */        \
    tmp11_minus_tmp2 = q1;                                                    \
                                                                              \
    q1 = ((INT32) row0 + (INT32) row4) << 13;                                 \
    q2 = q1 + q6;                                                             \
    q1 = q1 - q6;                                                             \
                                                                              \
    /* pick up the results */                                                 \
    tmp0  = q4;                                                               \
    tmp1  = q5;                                                               \
    tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2;                         \
    tmp3  = q7;                                                               \
    tmp10 = q2;                                                               \
    tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2;                         \
    tmp12 = q3;                                                               \
    tmp13 = q1;                                                               \
}
199
/* NEON lane aliases for the constants kept in v0-v2. The lane order
 * must match Ljsimd_idct_islow_neon_consts exactly: v0/v1 hold eight
 * resident constants, v2 the four "reloadable" ones. */
#define XFIX_0_899976223                    v0.h[0]
#define XFIX_0_541196100                    v0.h[1]
#define XFIX_2_562915447                    v0.h[2]
#define XFIX_0_298631336_MINUS_0_899976223  v0.h[3]
#define XFIX_1_501321110_MINUS_0_899976223  v1.h[0]
#define XFIX_2_053119869_MINUS_2_562915447  v1.h[1]
#define XFIX_0_541196100_PLUS_0_765366865   v1.h[2]
#define XFIX_1_175875602                    v1.h[3]
#define XFIX_1_175875602_MINUS_0_390180644  v2.h[0]
#define XFIX_0_541196100_MINUS_1_847759065  v2.h[1]
#define XFIX_3_072711026_MINUS_2_562915447  v2.h[2]
#define XFIX_1_175875602_MINUS_1_961570560  v2.h[3]
212
/* Constant table; the layout must stay in sync with the XFIX_* lane
 * aliases (comments below use the AArch64 v-register lane names). */
.balign 16
Ljsimd_idct_islow_neon_consts:
    .short FIX_0_899976223                    /* v0.h[0] */
    .short FIX_0_541196100                    /* v0.h[1] */
    .short FIX_2_562915447                    /* v0.h[2] */
    .short FIX_0_298631336_MINUS_0_899976223  /* v0.h[3] */
    .short FIX_1_501321110_MINUS_0_899976223  /* v1.h[0] */
    .short FIX_2_053119869_MINUS_2_562915447  /* v1.h[1] */
    .short FIX_0_541196100_PLUS_0_765366865   /* v1.h[2] */
    .short FIX_1_175875602                    /* v1.h[3] */
    /* reloadable constants */
    .short FIX_1_175875602_MINUS_0_390180644  /* v2.h[0] */
    .short FIX_0_541196100_MINUS_1_847759065  /* v2.h[1] */
    .short FIX_3_072711026_MINUS_2_562915447  /* v2.h[2] */
    .short FIX_1_175875602_MINUS_1_961570560  /* v2.h[3] */
228
229asm_function jsimd_idct_islow_neon
230
231    DCT_TABLE       .req x0
232    COEF_BLOCK      .req x1
233    OUTPUT_BUF      .req x2
234    OUTPUT_COL      .req x3
235    TMP1            .req x0
236    TMP2            .req x1
237    TMP3            .req x2
238    TMP4            .req x15
239
240    ROW0L           .req v16
241    ROW0R           .req v17
242    ROW1L           .req v18
243    ROW1R           .req v19
244    ROW2L           .req v20
245    ROW2R           .req v21
246    ROW3L           .req v22
247    ROW3R           .req v23
248    ROW4L           .req v24
249    ROW4R           .req v25
250    ROW5L           .req v26
251    ROW5R           .req v27
252    ROW6L           .req v28
253    ROW6R           .req v29
254    ROW7L           .req v30
255    ROW7R           .req v31
256    /* Save all NEON registers and x15 (32 NEON registers * 8 bytes + 16) */
257    sub             sp, sp, 272
258    str             x15, [sp], 16
259    adr             x15, Ljsimd_idct_islow_neon_consts
260    st1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
261    st1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
262    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
263    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
264    st1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
265    st1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
266    st1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
267    st1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
268    ld1             {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32
269    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
270    ld1             {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32
271    mul             v16.4h, v16.4h, v0.4h
272    mul             v17.4h, v17.4h, v1.4h
273    ins             v16.d[1], v17.d[0]  /* 128 bit q8 */
274    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
275    mul             v18.4h, v18.4h, v2.4h
276    mul             v19.4h, v19.4h, v3.4h
277    ins             v18.d[1], v19.d[0]  /* 128 bit q9 */
278    ld1             {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32
279    mul             v20.4h, v20.4h, v4.4h
280    mul             v21.4h, v21.4h, v5.4h
281    ins             v20.d[1], v21.d[0]  /* 128 bit q10 */
282    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
283    mul             v22.4h, v22.4h, v6.4h
284    mul             v23.4h, v23.4h, v7.4h
285    ins             v22.d[1], v23.d[0]  /* 128 bit q11 */
286    ld1             {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK]
287    mul             v24.4h, v24.4h, v0.4h
288    mul             v25.4h, v25.4h, v1.4h
289    ins             v24.d[1], v25.d[0]  /* 128 bit q12 */
290    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
291    mul             v28.4h, v28.4h, v4.4h
292    mul             v29.4h, v29.4h, v5.4h
293    ins             v28.d[1], v29.d[0]  /* 128 bit q14 */
294    mul             v26.4h, v26.4h, v2.4h
295    mul             v27.4h, v27.4h, v3.4h
296    ins             v26.d[1], v27.d[0]  /* 128 bit q13 */
297    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x15]  /* load constants */
298    add             x15, x15, #16
299    mul             v30.4h, v30.4h, v6.4h
300    mul             v31.4h, v31.4h, v7.4h
301    ins             v30.d[1], v31.d[0]  /* 128 bit q15 */
302    /* Go to the bottom of the stack */
303    sub             sp, sp, 352
304    stp             x4, x5, [sp], 16
305    st1             {v8.4h, v9.4h, v10.4h, v11.4h}, [sp], 32  /* save NEON registers */
306    st1             {v12.4h, v13.4h, v14.4h, v15.4h}, [sp], 32
307    /* 1-D IDCT, pass 1, left 4x8 half */
308    add             v4.4h,    ROW7L.4h, ROW3L.4h
309    add             v5.4h,    ROW5L.4h, ROW1L.4h
310    smull           v12.4s,   v4.4h,    XFIX_1_175875602_MINUS_1_961570560
311    smlal           v12.4s,   v5.4h,    XFIX_1_175875602
312    smull           v14.4s,   v4.4h,    XFIX_1_175875602
313    /* Check for the zero coefficients in the right 4x8 half */
314    smlal           v14.4s,   v5.4h,    XFIX_1_175875602_MINUS_0_390180644
315    ssubl           v6.4s,    ROW0L.4h, ROW4L.4h
316      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
317    smull           v4.4s,    ROW2L.4h, XFIX_0_541196100
318    smlal           v4.4s,    ROW6L.4h, XFIX_0_541196100_MINUS_1_847759065
319      orr           x0,       x4,       x5
320    mov             v8.16b,   v12.16b
321    smlsl           v12.4s,   ROW5L.4h, XFIX_2_562915447
322      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
323    smlal           v12.4s,   ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
324    shl             v6.4s,    v6.4s,    #13
325      orr           x0,       x0,       x4
326    smlsl           v8.4s,    ROW1L.4h, XFIX_0_899976223
327      orr           x0,       x0 ,      x5
328    add             v2.4s,    v6.4s,    v4.4s
329      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
330    mov             v10.16b,  v14.16b
331    add             v2.4s,    v2.4s,    v12.4s
332      orr           x0,       x0,       x4
333    smlsl           v14.4s,   ROW7L.4h, XFIX_0_899976223
334      orr           x0,       x0,       x5
335    smlal           v14.4s,   ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
336    rshrn           ROW1L.4h, v2.4s,    #11
337      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
338    sub             v2.4s,    v2.4s,    v12.4s
339    smlal           v10.4s,   ROW5L.4h, XFIX_2_053119869_MINUS_2_562915447
340      orr           x0,       x0,       x4
341    smlsl           v10.4s,   ROW3L.4h, XFIX_2_562915447
342      orr           x0,       x0,       x5
343    sub             v2.4s,    v2.4s,    v12.4s
344    smull           v12.4s,   ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
345      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
346    smlal           v12.4s,   ROW6L.4h, XFIX_0_541196100
347    sub             v6.4s,    v6.4s,    v4.4s
348      orr           x0,       x0,       x4
349    rshrn           ROW6L.4h, v2.4s,    #11
350      orr           x0,       x0,       x5
351    add             v2.4s,    v6.4s,    v10.4s
352      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
353    sub             v6.4s,    v6.4s,    v10.4s
354    saddl           v10.4s,   ROW0L.4h, ROW4L.4h
355      orr           x0,       x0,       x4
356    rshrn           ROW2L.4h, v2.4s,    #11
357      orr           x0,       x0,       x5
358    rshrn           ROW5L.4h, v6.4s,    #11
359      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
360    shl             v10.4s,   v10.4s,   #13
361    smlal           v8.4s,    ROW7L.4h, XFIX_0_298631336_MINUS_0_899976223
362      orr           x0,       x0,       x4
363    add             v4.4s,    v10.4s,   v12.4s
364      orr           x0,       x0,       x5
365    cmp             x0, #0 /* orrs instruction removed */
366    sub             v2.4s,    v10.4s,   v12.4s
367    add             v12.4s,   v4.4s,    v14.4s
368      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
369    sub             v4.4s,    v4.4s,    v14.4s
370    add             v10.4s,   v2.4s,    v8.4s
371      orr           x0,       x4,       x5
372    sub             v6.4s,    v2.4s,    v8.4s
373      /* pop             {x4, x5} */
374      sub           sp, sp, 80
375      ldp           x4, x5, [sp], 16
376    rshrn           ROW7L.4h, v4.4s,    #11
377    rshrn           ROW3L.4h, v10.4s,   #11
378    rshrn           ROW0L.4h, v12.4s,   #11
379    rshrn           ROW4L.4h, v6.4s,    #11
380
381      b.eq          3f /* Go to do some special handling for the sparse right 4x8 half */
382
383    /* 1-D IDCT, pass 1, right 4x8 half */
384    ld1             {v2.4h},  [x15]    /* reload constants */
385    add             v10.4h,   ROW7R.4h, ROW3R.4h
386    add             v8.4h,    ROW5R.4h, ROW1R.4h
387    /* Transpose ROW6L <-> ROW7L   (v3 available free register) */
388    transpose       ROW6L, ROW7L, v3, .16b, .4h
389    smull           v12.4s,   v10.4h,   XFIX_1_175875602_MINUS_1_961570560
390    smlal           v12.4s,   v8.4h,    XFIX_1_175875602
391    /* Transpose ROW2L <-> ROW3L   (v3 available free register) */
392    transpose       ROW2L, ROW3L, v3, .16b, .4h
393    smull           v14.4s,   v10.4h,   XFIX_1_175875602
394    smlal           v14.4s,   v8.4h,    XFIX_1_175875602_MINUS_0_390180644
395    /* Transpose ROW0L <-> ROW1L   (v3 available free register) */
396    transpose       ROW0L, ROW1L, v3, .16b, .4h
397    ssubl           v6.4s,    ROW0R.4h, ROW4R.4h
398    smull           v4.4s,    ROW2R.4h, XFIX_0_541196100
399    smlal           v4.4s,    ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065
400    /* Transpose ROW4L <-> ROW5L   (v3 available free register) */
401    transpose       ROW4L, ROW5L, v3, .16b, .4h
402    mov             v8.16b,   v12.16b
403    smlsl           v12.4s,   ROW5R.4h, XFIX_2_562915447
404    smlal           v12.4s,   ROW3R.4h, XFIX_3_072711026_MINUS_2_562915447
405    /* Transpose ROW1L <-> ROW3L   (v3 available free register) */
406    transpose       ROW1L, ROW3L, v3, .16b, .2s
407    shl             v6.4s,    v6.4s,    #13
408    smlsl           v8.4s,    ROW1R.4h, XFIX_0_899976223
409    /* Transpose ROW4L <-> ROW6L   (v3 available free register) */
410    transpose       ROW4L, ROW6L, v3, .16b, .2s
411    add             v2.4s,    v6.4s,    v4.4s
412    mov             v10.16b,  v14.16b
413    add             v2.4s,    v2.4s,    v12.4s
414    /* Transpose ROW0L <-> ROW2L   (v3 available free register) */
415    transpose       ROW0L, ROW2L, v3, .16b, .2s
416    smlsl           v14.4s,   ROW7R.4h, XFIX_0_899976223
417    smlal           v14.4s,   ROW1R.4h, XFIX_1_501321110_MINUS_0_899976223
418    rshrn           ROW1R.4h, v2.4s,    #11
419    /* Transpose ROW5L <-> ROW7L   (v3 available free register) */
420    transpose       ROW5L, ROW7L, v3, .16b, .2s
421    sub             v2.4s,    v2.4s,    v12.4s
422    smlal           v10.4s,   ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447
423    smlsl           v10.4s,   ROW3R.4h, XFIX_2_562915447
424    sub             v2.4s,    v2.4s,    v12.4s
425    smull           v12.4s,   ROW2R.4h, XFIX_0_541196100_PLUS_0_765366865
426    smlal           v12.4s,   ROW6R.4h, XFIX_0_541196100
427    sub             v6.4s,    v6.4s,    v4.4s
428    rshrn           ROW6R.4h, v2.4s,    #11
429    add             v2.4s,    v6.4s,    v10.4s
430    sub             v6.4s,    v6.4s,    v10.4s
431    saddl           v10.4s,   ROW0R.4h, ROW4R.4h
432    rshrn           ROW2R.4h, v2.4s,    #11
433    rshrn           ROW5R.4h, v6.4s,    #11
434    shl             v10.4s,   v10.4s,   #13
435    smlal           v8.4s,    ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223
436    add             v4.4s,    v10.4s,   v12.4s
437    sub             v2.4s,    v10.4s,   v12.4s
438    add             v12.4s,   v4.4s,    v14.4s
439    sub             v4.4s,    v4.4s,    v14.4s
440    add             v10.4s,   v2.4s,    v8.4s
441    sub             v6.4s,    v2.4s,    v8.4s
442    rshrn           ROW7R.4h, v4.4s,    #11
443    rshrn           ROW3R.4h, v10.4s,   #11
444    rshrn           ROW0R.4h, v12.4s,   #11
445    rshrn           ROW4R.4h, v6.4s,    #11
446    /* Transpose right 4x8 half */
447    transpose       ROW6R, ROW7R, v3, .16b, .4h
448    transpose       ROW2R, ROW3R, v3, .16b, .4h
449    transpose       ROW0R, ROW1R, v3, .16b, .4h
450    transpose       ROW4R, ROW5R, v3, .16b, .4h
451    transpose       ROW1R, ROW3R, v3, .16b, .2s
452    transpose       ROW4R, ROW6R, v3, .16b, .2s
453    transpose       ROW0R, ROW2R, v3, .16b, .2s
454    transpose       ROW5R, ROW7R, v3, .16b, .2s
455
4561:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
457    ld1             {v2.4h},  [x15]    /* reload constants */
458    smull           v12.4S,   ROW1R.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.4h */
459    smlal           v12.4s,   ROW1L.4h, XFIX_1_175875602
460    smlal           v12.4s,   ROW3R.4h, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L.4h <-> ROW3R.4h */
461    smlal           v12.4s,   ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560
462    smull           v14.4s,   ROW3R.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.4h */
463    smlal           v14.4s,   ROW3L.4h, XFIX_1_175875602
464    smlal           v14.4s,   ROW1R.4h, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L.4h <-> ROW1R.4h */
465    smlal           v14.4s,   ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644
466    ssubl           v6.4s,    ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */
467    smull           v4.4s,    ROW2L.4h, XFIX_0_541196100
468    smlal           v4.4s,    ROW2R.4h, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L.4h <-> ROW2R.4h */
469    mov             v8.16b,   v12.16b
470    smlsl           v12.4s,   ROW1R.4h, XFIX_2_562915447 /* ROW5L.4h <-> ROW1R.4h */
471    smlal           v12.4s,   ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
472    shl             v6.4s,    v6.4s,    #13
473    smlsl           v8.4s,    ROW1L.4h, XFIX_0_899976223
474    add             v2.4s,    v6.4s,    v4.4s
475    mov             v10.16b,  v14.16b
476    add             v2.4s,    v2.4s,    v12.4s
477    smlsl           v14.4s,   ROW3R.4h, XFIX_0_899976223 /* ROW7L.4h <-> ROW3R.4h */
478    smlal           v14.4s,   ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
479    shrn            ROW1L.4h, v2.4s,    #16
480    sub             v2.4s,    v2.4s,    v12.4s
481    smlal           v10.4s,   ROW1R.4h, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L.4h <-> ROW1R.4h */
482    smlsl           v10.4s,   ROW3L.4h, XFIX_2_562915447
483    sub             v2.4s,    v2.4s,    v12.4s
484    smull           v12.4s,   ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
485    smlal           v12.4s,   ROW2R.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.4h */
486    sub             v6.4s,    v6.4s,    v4.4s
487    shrn            ROW2R.4h, v2.4s,    #16 /* ROW6L.4h <-> ROW2R.4h */
488    add             v2.4s,    v6.4s,    v10.4s
489    sub             v6.4s,    v6.4s,    v10.4s
490    saddl           v10.4s,   ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */
491    shrn            ROW2L.4h, v2.4s,    #16
492    shrn            ROW1R.4h, v6.4s,    #16 /* ROW5L.4h <-> ROW1R.4h */
493    shl             v10.4s,   v10.4s,   #13
494    smlal           v8.4s,    ROW3R.4h, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L.4h <-> ROW3R.4h */
495    add             v4.4s,    v10.4s,   v12.4s
496    sub             v2.4s,    v10.4s,   v12.4s
497    add             v12.4s,   v4.4s,    v14.4s
498    sub             v4.4s,    v4.4s,    v14.4s
499    add             v10.4s,   v2.4s,    v8.4s
500    sub             v6.4s,    v2.4s,    v8.4s
501    shrn            ROW3R.4h, v4.4s,    #16 /* ROW7L.4h <-> ROW3R.4h */
502    shrn            ROW3L.4h, v10.4s,   #16
503    shrn            ROW0L.4h, v12.4s,   #16
504    shrn            ROW0R.4h, v6.4s,    #16 /* ROW4L.4h <-> ROW0R.4h */
505    /* 1-D IDCT, pass 2, right 4x8 half */
506    ld1             {v2.4h},  [x15]    /* reload constants */
507    smull           v12.4s,   ROW5R.4h, XFIX_1_175875602
508    smlal           v12.4s,   ROW5L.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.4h */
509    smlal           v12.4s,   ROW7R.4h, XFIX_1_175875602_MINUS_1_961570560
510    smlal           v12.4s,   ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L.4h <-> ROW3R.4h */
511    smull           v14.4s,   ROW7R.4h, XFIX_1_175875602
512    smlal           v14.4s,   ROW7L.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.4h */
513    smlal           v14.4s,   ROW5R.4h, XFIX_1_175875602_MINUS_0_390180644
514    smlal           v14.4s,   ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L.4h <-> ROW1R.4h */
515    ssubl           v6.4s,    ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */
516    smull           v4.4s,    ROW6L.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.4h */
517    smlal           v4.4s,    ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065
518    mov             v8.16b,   v12.16b
519    smlsl           v12.4s,   ROW5R.4h, XFIX_2_562915447
520    smlal           v12.4s,   ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L.4h <-> ROW3R.4h */
521    shl             v6.4s,    v6.4s,    #13
522    smlsl           v8.4s,    ROW5L.4h, XFIX_0_899976223 /* ROW5L.4h <-> ROW1R.4h */
523    add             v2.4s,    v6.4s,    v4.4s
524    mov             v10.16b,  v14.16b
525    add             v2.4s,    v2.4s,    v12.4s
526    smlsl           v14.4s,   ROW7R.4h, XFIX_0_899976223
527    smlal           v14.4s,   ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L.4h <-> ROW1R.4h */
528    shrn            ROW5L.4h, v2.4s,    #16 /* ROW5L.4h <-> ROW1R.4h */
529    sub             v2.4s,    v2.4s,    v12.4s
530    smlal           v10.4s,   ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447
531    smlsl           v10.4s,   ROW7L.4h, XFIX_2_562915447 /* ROW7L.4h <-> ROW3R.4h */
532    sub             v2.4s,    v2.4s,    v12.4s
533    smull           v12.4s,   ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L.4h <-> ROW2R.4h */
534    smlal           v12.4s,   ROW6R.4h, XFIX_0_541196100
535    sub             v6.4s,    v6.4s,    v4.4s
536    shrn            ROW6R.4h, v2.4s,    #16
537    add             v2.4s,    v6.4s,    v10.4s
538    sub             v6.4s,    v6.4s,    v10.4s
539    saddl           v10.4s,   ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */
540    shrn            ROW6L.4h, v2.4s,    #16 /* ROW6L.4h <-> ROW2R.4h */
541    shrn            ROW5R.4h, v6.4s,    #16
542    shl             v10.4s,   v10.4s,   #13
543    smlal           v8.4s,    ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223
544    add             v4.4s,    v10.4s,   v12.4s
545    sub             v2.4s,    v10.4s,   v12.4s
546    add             v12.4s,   v4.4s,    v14.4s
547    sub             v4.4s,    v4.4s,    v14.4s
548    add             v10.4s,   v2.4s,    v8.4s
549    sub             v6.4s,    v2.4s,    v8.4s
550    shrn            ROW7R.4h, v4.4s,    #16
551    shrn            ROW7L.4h, v10.4s,   #16 /* ROW7L.4h <-> ROW3R.4h */
552    shrn            ROW4L.4h, v12.4s,   #16 /* ROW4L.4h <-> ROW0R.4h */
553    shrn            ROW4R.4h, v6.4s,    #16
554
5552:  /* Descale to 8-bit and range limit */
556    ins             v16.d[1], v17.d[0]
557    ins             v18.d[1], v19.d[0]
558    ins             v20.d[1], v21.d[0]
559    ins             v22.d[1], v23.d[0]
560    sqrshrn         v16.8b,   v16.8h,   #2
561    sqrshrn2        v16.16b,  v18.8h,   #2
562    sqrshrn         v18.8b,   v20.8h,   #2
563    sqrshrn2        v18.16b,  v22.8h,   #2
564
565    /* vpop            {v8.4h - d15.4h} */ /* restore NEON registers */
566    ld1             {v8.4h, v9.4h, v10.4h, v11.4h}, [sp], 32
567    ld1             {v12.4h, v13.4h, v14.4h, v15.4h}, [sp], 32
568    ins             v24.d[1], v25.d[0]
569
570    sqrshrn         v20.8b,   v24.8h,   #2
571      /* Transpose the final 8-bit samples and do signed->unsigned conversion */
572    /* trn1            v16.8h,    v16.8h,  v18.8h */
573    transpose       v16, v18, v3, .16b, .8h
574    ins             v26.d[1], v27.d[0]
575    ins             v28.d[1], v29.d[0]
576    ins             v30.d[1], v31.d[0]
577    sqrshrn2        v20.16b,  v26.8h,   #2
578    sqrshrn         v22.8b,   v28.8h,   #2
579    movi            v0.16b,   #(CENTERJSAMPLE)
580    sqrshrn2        v22.16b,  v30.8h,   #2
581    transpose_single v16, v17, v3, .d, .8b
582    transpose_single v18, v19, v3, .d, .8b
583    add             v16.8b,   v16.8b,   v0.8b
584    add             v17.8b,   v17.8b,   v0.8b
585    add             v18.8b,   v18.8b,   v0.8b
586    add             v19.8b,   v19.8b,   v0.8b
587    transpose       v20, v22, v3, .16b, .8h
588    /* Store results to the output buffer */
589    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
590    add             TMP1,     TMP1,     OUTPUT_COL
591    add             TMP2,     TMP2,     OUTPUT_COL
592    st1             {v16.8b}, [TMP1]
593    transpose_single v20, v21, v3, .d, .8b
594    st1             {v17.8b}, [TMP2]
595    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
596    add             TMP1,     TMP1,     OUTPUT_COL
597    add             TMP2,     TMP2,     OUTPUT_COL
598    st1             {v18.8b}, [TMP1]
599    add             v20.8b,   v20.8b,   v0.8b
600    add             v21.8b,   v21.8b,   v0.8b
601    st1             {v19.8b}, [TMP2]
602    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
603    ldp             TMP3,     TMP4,     [OUTPUT_BUF]
604    add             TMP1,     TMP1,     OUTPUT_COL
605    add             TMP2,     TMP2,     OUTPUT_COL
606    add             TMP3,     TMP3,     OUTPUT_COL
607    add             TMP4,     TMP4,     OUTPUT_COL
608    transpose_single v22, v23, v3, .d, .8b
609    st1             {v20.8b}, [TMP1]
610    add             v22.8b,   v22.8b,   v0.8b
611    add             v23.8b,   v23.8b,   v0.8b
612    st1             {v21.8b}, [TMP2]
613    st1             {v22.8b}, [TMP3]
614    st1             {v23.8b}, [TMP4]
615    ldr             x15, [sp], 16
616    ld1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
617    ld1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
618    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
619    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
620    ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
621    ld1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
622    ld1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
623    ld1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
624    blr             x30
625
6263:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
627
628    /* Transpose left 4x8 half */
629    transpose       ROW6L, ROW7L, v3, .16b, .4h
630    transpose       ROW2L, ROW3L, v3, .16b, .4h
631    transpose       ROW0L, ROW1L, v3, .16b, .4h
632    transpose       ROW4L, ROW5L, v3, .16b, .4h
633    shl             ROW0R.4h, ROW0R.4h, #2 /* PASS1_BITS */
634    transpose       ROW1L, ROW3L, v3, .16b, .2s
635    transpose       ROW4L, ROW6L, v3, .16b, .2s
636    transpose       ROW0L, ROW2L, v3, .16b, .2s
637    transpose       ROW5L, ROW7L, v3, .16b, .2s
638    cmp             x0, #0
639    b.eq            4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */
640
641    /* Only row 0 is non-zero for the right 4x8 half  */
642    dup             ROW1R.4h, ROW0R.h[1]
643    dup             ROW2R.4h, ROW0R.h[2]
644    dup             ROW3R.4h, ROW0R.h[3]
645    dup             ROW4R.4h, ROW0R.h[0]
646    dup             ROW5R.4h, ROW0R.h[1]
647    dup             ROW6R.4h, ROW0R.h[2]
648    dup             ROW7R.4h, ROW0R.h[3]
649    dup             ROW0R.4h, ROW0R.h[0]
650    b               1b /* Go to 'normal' second pass */
651
6524:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
653    ld1             {v2.4h},  [x15]    /* reload constants */
654    smull           v12.4s,   ROW1L.4h, XFIX_1_175875602
655    smlal           v12.4s,   ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560
656    smull           v14.4s,   ROW3L.4h, XFIX_1_175875602
657    smlal           v14.4s,   ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644
658    smull           v4.4s,    ROW2L.4h, XFIX_0_541196100
659    sshll           v6.4s,    ROW0L.4h, #13
660    mov             v8.16b,   v12.16b
661    smlal           v12.4s,   ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
662    smlsl           v8.4s,    ROW1L.4h, XFIX_0_899976223
663    add             v2.4s,    v6.4s,    v4.4s
664    mov             v10.16b,  v14.16b
665    smlal           v14.4s,   ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
666    add             v2.4s,    v2.4s,    v12.4s
667    add             v12.4s,   v12.4s,   v12.4s
668    smlsl           v10.4s,   ROW3L.4h, XFIX_2_562915447
669    shrn            ROW1L.4h, v2.4s,    #16
670    sub             v2.4s,    v2.4s,    v12.4s
671    smull           v12.4s,   ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
672    sub             v6.4s,    v6.4s,    v4.4s
673    shrn            ROW2R.4h, v2.4s,    #16 /* ROW6L.4h <-> ROW2R.4h */
674    add             v2.4s,    v6.4s,    v10.4s
675    sub             v6.4s,    v6.4s,    v10.4s
676    sshll           v10.4s,   ROW0L.4h, #13
677    shrn            ROW2L.4h, v2.4s,    #16
678    shrn            ROW1R.4h, v6.4s,    #16 /* ROW5L.4h <-> ROW1R.4h */
679    add             v4.4s,    v10.4s,   v12.4s
680    sub             v2.4s,    v10.4s,   v12.4s
681    add             v12.4s,   v4.4s,    v14.4s
682    sub             v4.4s,    v4.4s,    v14.4s
683    add             v10.4s,   v2.4s,    v8.4s
684    sub             v6.4s,    v2.4s,    v8.4s
685    shrn            ROW3R.4h, v4.4s,    #16 /* ROW7L.4h <-> ROW3R.4h */
686    shrn            ROW3L.4h, v10.4s,   #16
687    shrn            ROW0L.4h, v12.4s,   #16
688    shrn            ROW0R.4h, v6.4s,    #16 /* ROW4L.4h <-> ROW0R.4h */
689    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
690    ld1             {v2.4h},  [x15]    /* reload constants */
691    smull           v12.4s,   ROW5L.4h, XFIX_1_175875602
692    smlal           v12.4s,   ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560
693    smull           v14.4s,   ROW7L.4h, XFIX_1_175875602
694    smlal           v14.4s,   ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644
695    smull           v4.4s,    ROW6L.4h, XFIX_0_541196100
696    sshll           v6.4s,    ROW4L.4h, #13
697    mov             v8.16b,   v12.16b
698    smlal           v12.4s,   ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447
699    smlsl           v8.4s,    ROW5L.4h, XFIX_0_899976223
700    add             v2.4s,    v6.4s,    v4.4s
701    mov             v10.16b,  v14.16b
702    smlal           v14.4s,   ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223
703    add             v2.4s,    v2.4s,    v12.4s
704    add             v12.4s,   v12.4s,   v12.4s
705    smlsl           v10.4s,   ROW7L.4h, XFIX_2_562915447
706    shrn            ROW5L.4h, v2.4s,    #16 /* ROW5L.4h <-> ROW1R.4h */
707    sub             v2.4s,    v2.4s,    v12.4s
708    smull           v12.4s,   ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865
709    sub             v6.4s,    v6.4s,    v4.4s
710    shrn            ROW6R.4h, v2.4s,    #16
711    add             v2.4s,    v6.4s,    v10.4s
712    sub             v6.4s,    v6.4s,    v10.4s
713    sshll           v10.4s,   ROW4L.4h, #13
714    shrn            ROW6L.4h, v2.4s,    #16 /* ROW6L.4h <-> ROW2R.4h */
715    shrn            ROW5R.4h, v6.4s,    #16
716    add             v4.4s,    v10.4s,   v12.4s
717    sub             v2.4s,    v10.4s,   v12.4s
718    add             v12.4s,   v4.4s,    v14.4s
719    sub             v4.4s,    v4.4s,    v14.4s
720    add             v10.4s,   v2.4s,    v8.4s
721    sub             v6.4s,    v2.4s,    v8.4s
722    shrn            ROW7R.4h, v4.4s,    #16
723    shrn            ROW7L.4h, v10.4s,   #16 /* ROW7L.4h <-> ROW3R.4h */
724    shrn            ROW4L.4h, v12.4s,   #16 /* ROW4L.4h <-> ROW0R.4h */
725    shrn            ROW4R.4h, v6.4s,    #16
726    b               2b /* Go to epilogue */
727
728    .unreq          DCT_TABLE
729    .unreq          COEF_BLOCK
730    .unreq          OUTPUT_BUF
731    .unreq          OUTPUT_COL
732    .unreq          TMP1
733    .unreq          TMP2
734    .unreq          TMP3
735    .unreq          TMP4
736
737    .unreq          ROW0L
738    .unreq          ROW0R
739    .unreq          ROW1L
740    .unreq          ROW1R
741    .unreq          ROW2L
742    .unreq          ROW2R
743    .unreq          ROW3L
744    .unreq          ROW3R
745    .unreq          ROW4L
746    .unreq          ROW4R
747    .unreq          ROW5L
748    .unreq          ROW5R
749    .unreq          ROW6L
750    .unreq          ROW6R
751    .unreq          ROW7L
752    .unreq          ROW7R
753
754
755/*****************************************************************************/
756
757/*
758 * jsimd_idct_ifast_neon
759 *
760 * This function contains a fast, not so accurate integer implementation of
761 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
762 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
763 * function from jidctfst.c
764 *
765 * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
766 * But in ARM NEON case some extra additions are required because VQDMULH
767 * instruction can't handle the constants larger than 1. So the expressions
768 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
769 * which introduces an extra addition. Overall, there are 6 extra additions
770 * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
771 */
772
/* Fixed-point multipliers for the fast (AAN) IDCT, consumed via SQDMULH.
 * Per the note above, SQDMULH cannot represent constants >= 1.0, so each
 * value stores only the fractional excess: (N * 128 - 1.0 * 32768) for the
 * first three, and (N * 128 - 2.0 * 32768) for 2.613125930; the integer
 * part is re-added with plain ADDs in the IDCT passes. */
#define XFIX_1_082392200 v0.h[0]
#define XFIX_1_414213562 v0.h[1]
#define XFIX_1_847759065 v0.h[2]
#define XFIX_2_613125930 v0.h[3]

.balign 16
Ljsimd_idct_ifast_neon_consts:
    .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
    .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
    .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
    .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
784
/*
 * jsimd_idct_ifast_neon(dct_table, coef_block, output_buf, output_col)
 *
 * In (AAPCS64): x0 = dct_table (quantization multipliers),
 *               x1 = coef_block (8x8 int16 DCT coefficients),
 *               x2 = output_buf (array of row pointers),
 *               x3 = output_col (byte offset added to each row pointer).
 * Out: none (writes 8x8 unsigned 8-bit samples through output_buf).
 */
asm_function jsimd_idct_ifast_neon

    /* TMP1-TMP3 alias the argument registers: DCT_TABLE and COEF_BLOCK are
     * fully consumed during the load/dequantize phase before TMP1/TMP2 are
     * first used, and OUTPUT_BUF (x2) is only aliased by the unused TMP3. */
    DCT_TABLE       .req x0
    COEF_BLOCK      .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_COL      .req x3
    TMP1            .req x0
    TMP2            .req x1
    TMP3            .req x2
    TMP4            .req x22    /* callee-saved, preserved via stp below */
    TMP5            .req x23    /* callee-saved, preserved via stp below */

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | d16     | d17     ( v8.8h  )
     *   1 | d18     | d19     ( v9.8h  )
     *   2 | d20     | d21     ( v10.8h )
     *   3 | d22     | d23     ( v11.8h )
     *   4 | d24     | d25     ( v12.8h )
     *   5 | d26     | d27     ( v13.8h )
     *   6 | d28     | d29     ( v14.8h )
     *   7 | d30     | d31     ( v15.8h )
     */
    /* Save NEON registers used in fast IDCT */
    /* NOTE(review): the post-indexed stores walk sp back up to its entry
     * value, so for the rest of the function the saved registers live
     * *below* sp.  AArch64 (outside Apple's 128-byte red zone) has no red
     * zone, so an asynchronous signal could clobber this area -- confirm
     * against the upstream fix.  Also, AAPCS64 only requires preserving
     * the low halves of v8-v15; v0-v7/v16-v19 saves here are extra. */
    sub             sp, sp, #176
    stp             x22, x23, [sp], 16
    adr             x23, Ljsimd_idct_ifast_neon_consts
    st1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
    st1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    st1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
    /* Coefficient loads are interleaved with the dequantizing multiplies
     * to hide load latency. */
    ld1             {v8.8h, v9.8h}, [COEF_BLOCK], 32
    ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
    ld1             {v10.8h, v11.8h}, [COEF_BLOCK], 32
    mul             v8.8h,  v8.8h,  v0.8h
    ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
    mul             v9.8h,  v9.8h,  v1.8h
    ld1             {v12.8h, v13.8h}, [COEF_BLOCK], 32
    mul             v10.8h, v10.8h, v2.8h
    ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
    mul             v11.8h, v11.8h, v3.8h
    ld1             {v14.8h, v15.8h}, [COEF_BLOCK], 32
    mul             v12.8h, v12.8h, v0.8h
    ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
    mul             v14.8h, v14.8h, v2.8h
    mul             v13.8h, v13.8h, v1.8h
    ld1             {v0.4h}, [x23]      /* load constants */
    mul             v15.8h, v15.8h, v3.8h

    /* 1-D IDCT, pass 1 */
    sub             v2.8h,    v10.8h,   v14.8h
    add             v14.8h,   v10.8h,   v14.8h
    sub             v1.8h,    v11.8h,   v13.8h
    add             v13.8h,   v11.8h,   v13.8h
    sub             v5.8h,    v9.8h,    v15.8h
    add             v15.8h,   v9.8h,    v15.8h
    sqdmulh         v4.8h,    v2.8h,    XFIX_1_414213562
    sqdmulh         v6.8h,    v1.8h,    XFIX_2_613125930
    add             v3.8h,    v1.8h,    v1.8h
    sub             v1.8h,    v5.8h,    v1.8h
    add             v10.8h,   v2.8h,    v4.8h
    sqdmulh         v4.8h,    v1.8h,    XFIX_1_847759065
    sub             v2.8h,    v15.8h,   v13.8h
    add             v3.8h,    v3.8h,    v6.8h
    sqdmulh         v6.8h,    v2.8h,    XFIX_1_414213562
    add             v1.8h,    v1.8h,    v4.8h
    sqdmulh         v4.8h,    v5.8h,    XFIX_1_082392200
    sub             v10.8h,   v10.8h,   v14.8h
    add             v2.8h,    v2.8h,    v6.8h
    sub             v6.8h,    v8.8h,    v12.8h
    add             v12.8h,   v8.8h,    v12.8h
    add             v9.8h,    v5.8h,    v4.8h
    add             v5.8h,    v6.8h,    v10.8h
    sub             v10.8h,   v6.8h,    v10.8h
    add             v6.8h,    v15.8h,   v13.8h
    add             v8.8h,    v12.8h,   v14.8h
    sub             v3.8h,    v6.8h,    v3.8h
    sub             v12.8h,   v12.8h,   v14.8h
    sub             v3.8h,    v3.8h,    v1.8h
    sub             v1.8h,    v9.8h,    v1.8h
    add             v2.8h,    v3.8h,    v2.8h
    sub             v15.8h,   v8.8h,    v6.8h
    add             v1.8h,    v1.8h,    v2.8h
    add             v8.8h,    v8.8h,    v6.8h
    add             v14.8h,   v5.8h,    v3.8h
    sub             v9.8h,    v5.8h,    v3.8h
    sub             v13.8h,   v10.8h,   v2.8h
    add             v10.8h,   v10.8h,   v2.8h
    /* 8x8 16-bit transpose via trn1/trn2 pairs (no zip/unzip), using v18
     * as the scratch copy, interleaved with the last pass-1 adds/subs. */
    /* Transpose  q8-q9 */
    mov             v18.16b,  v8.16b
    trn1            v8.8h,    v8.8h,    v9.8h
    trn2            v9.8h,    v18.8h,   v9.8h
    sub             v11.8h,   v12.8h,   v1.8h
    /* Transpose  q14-q15 */
    mov             v18.16b,  v14.16b
    trn1            v14.8h,   v14.8h,   v15.8h
    trn2            v15.8h,   v18.8h,   v15.8h
    add             v12.8h,   v12.8h,   v1.8h
    /* Transpose  q10-q11 */
    mov             v18.16b,  v10.16b
    trn1            v10.8h,   v10.8h,   v11.8h
    trn2            v11.8h,   v18.8h,   v11.8h
    /* Transpose  q12-q13 */
    mov             v18.16b,  v12.16b
    trn1            v12.8h,   v12.8h,   v13.8h
    trn2            v13.8h,   v18.8h,   v13.8h
    /* Transpose  q9-q11 */
    mov             v18.16b,  v9.16b
    trn1            v9.4s,    v9.4s,    v11.4s
    trn2            v11.4s,   v18.4s,   v11.4s
    /* Transpose  q12-q14 */
    mov             v18.16b,  v12.16b
    trn1            v12.4s,   v12.4s,   v14.4s
    trn2            v14.4s,   v18.4s,   v14.4s
    /* Transpose  q8-q10 */
    mov             v18.16b,  v8.16b
    trn1            v8.4s,    v8.4s,    v10.4s
    trn2            v10.4s,   v18.4s,   v10.4s
    /* Transpose  q13-q15 */
    mov             v18.16b,  v13.16b
    trn1            v13.4s,   v13.4s,   v15.4s
    trn2            v15.4s,   v18.4s,   v15.4s
    /* 64-bit half swaps (ARMv7 vswp equivalent) go through x22, which is
     * free here because TMP4 is not live yet. */
    /* vswp            v14.4h,   v10-MSB.4h */
    umov            x22, v14.d[0]
    ins             v14.d[0], v10.d[1]
    ins             v10.d[1], x22
    /* vswp            v13.4h,   v9MSB.4h */

    umov            x22, v13.d[0]
    ins             v13.d[0], v9.d[1]
    ins             v9.d[1], x22
    /* 1-D IDCT, pass 2 */
    sub             v2.8h,    v10.8h,   v14.8h
    /* vswp            v15.4h,   v11MSB.4h */
    umov            x22, v15.d[0]
    ins             v15.d[0], v11.d[1]
    ins             v11.d[1], x22
    add             v14.8h,   v10.8h,   v14.8h
    /* vswp            v12.4h,   v8-MSB.4h */
    umov            x22, v12.d[0]
    ins             v12.d[0], v8.d[1]
    ins             v8.d[1],  x22
    /* Pass 2 repeats the pass-1 butterfly/rotation sequence on the
     * transposed data (columns). */
    sub             v1.8h,    v11.8h,   v13.8h
    add             v13.8h,   v11.8h,   v13.8h
    sub             v5.8h,    v9.8h,    v15.8h
    add             v15.8h,   v9.8h,    v15.8h
    sqdmulh         v4.8h,    v2.8h,    XFIX_1_414213562
    sqdmulh         v6.8h,    v1.8h,    XFIX_2_613125930
    add             v3.8h,    v1.8h,    v1.8h
    sub             v1.8h,    v5.8h,    v1.8h
    add             v10.8h,   v2.8h,    v4.8h
    sqdmulh         v4.8h,    v1.8h,    XFIX_1_847759065
    sub             v2.8h,    v15.8h,   v13.8h
    add             v3.8h,    v3.8h,    v6.8h
    sqdmulh         v6.8h,    v2.8h,    XFIX_1_414213562
    add             v1.8h,    v1.8h,    v4.8h
    sqdmulh         v4.8h,    v5.8h,    XFIX_1_082392200
    sub             v10.8h,   v10.8h,   v14.8h
    add             v2.8h,    v2.8h,    v6.8h
    sub             v6.8h,    v8.8h,    v12.8h
    add             v12.8h,   v8.8h,    v12.8h
    add             v9.8h,    v5.8h,    v4.8h
    add             v5.8h,    v6.8h,    v10.8h
    sub             v10.8h,   v6.8h,    v10.8h
    add             v6.8h,    v15.8h,   v13.8h
    add             v8.8h,    v12.8h,   v14.8h
    sub             v3.8h,    v6.8h,    v3.8h
    sub             v12.8h,   v12.8h,   v14.8h
    sub             v3.8h,    v3.8h,    v1.8h
    sub             v1.8h,    v9.8h,    v1.8h
    add             v2.8h,    v3.8h,    v2.8h
    sub             v15.8h,   v8.8h,    v6.8h
    add             v1.8h,    v1.8h,    v2.8h
    add             v8.8h,    v8.8h,    v6.8h
    add             v14.8h,   v5.8h,    v3.8h
    sub             v9.8h,    v5.8h,    v3.8h
    sub             v13.8h,   v10.8h,   v2.8h
    add             v10.8h,   v10.8h,   v2.8h
    sub             v11.8h,   v12.8h,   v1.8h
    add             v12.8h,   v12.8h,   v1.8h
    /* Descale to 8-bit and range limit: saturating narrow by PASS1_BITS+2,
     * then add 0x80 to convert signed samples to unsigned. */
    movi            v0.16b,   #0x80
    sqshrn          v8.8b,    v8.8h,    #5
    sqshrn2         v8.16b,   v9.8h,    #5
    sqshrn          v9.8b,    v10.8h,   #5
    sqshrn2         v9.16b,   v11.8h,   #5
    sqshrn          v10.8b,   v12.8h,   #5
    sqshrn2         v10.16b,  v13.8h,   #5
    sqshrn          v11.8b,   v14.8h,   #5
    sqshrn2         v11.16b,  v15.8h,   #5
    add             v8.16b,   v8.16b,   v0.16b
    add             v9.16b,   v9.16b,   v0.16b
    add             v10.16b,  v10.16b,  v0.16b
    add             v11.16b,  v11.16b,  v0.16b
    /* Transpose the final 8-bit samples */
    /* Transpose  q8-q9 */
    mov             v18.16b,  v8.16b
    trn1            v8.8h,    v8.8h,    v9.8h
    trn2            v9.8h,    v18.8h,   v9.8h
    /* Transpose  q10-q11 */
    mov             v18.16b,  v10.16b
    trn1            v10.8h,   v10.8h,   v11.8h
    trn2            v11.8h,   v18.8h,   v11.8h
    /* Transpose  q8-q10 */
    mov             v18.16b,  v8.16b
    trn1            v8.4s,    v8.4s,    v10.4s
    trn2            v10.4s,   v18.4s,   v10.4s
    /* Transpose  q9-q11 */
    mov             v18.16b,  v9.16b
    trn1            v9.4s,    v9.4s,    v11.4s
    trn2            v11.4s,   v18.4s,   v11.4s
    /* make copy */
    ins             v17.d[0], v8.d[1]
    /* Transpose  d16-d17-msb */
    mov             v18.16b,  v8.16b
    trn1            v8.8b,    v8.8b,    v17.8b
    trn2            v17.8b,   v18.8b,   v17.8b
    /* make copy */
    ins             v19.d[0], v9.d[1]
    mov             v18.16b,  v9.16b
    trn1            v9.8b,    v9.8b,    v19.8b
    trn2            v19.8b,   v18.8b,   v19.8b
    /* Store results to the output buffer: one 8-byte row per pointer from
     * OUTPUT_BUF, each offset by OUTPUT_COL; remaining byte-level
     * transposes are interleaved with the stores. */
    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
    add             TMP1,     TMP1,     OUTPUT_COL
    add             TMP2,     TMP2,     OUTPUT_COL
    st1             {v8.8b},  [TMP1]
    st1             {v17.8b}, [TMP2]
    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
    add             TMP1,     TMP1,     OUTPUT_COL
    add             TMP2,     TMP2,     OUTPUT_COL
    st1             {v9.8b},  [TMP1]
    /* make copy */
    ins             v7.d[0],  v10.d[1]
    mov             v18.16b,  v10.16b
    trn1            v10.8b,   v10.8b,   v7.8b
    trn2            v7.8b,    v18.8b,   v7.8b
    st1             {v19.8b}, [TMP2]
    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
    ldp             TMP4,     TMP5,     [OUTPUT_BUF], 16
    add             TMP1,     TMP1,     OUTPUT_COL
    add             TMP2,     TMP2,     OUTPUT_COL
    add             TMP4,     TMP4,     OUTPUT_COL
    add             TMP5,     TMP5,     OUTPUT_COL
    st1             {v10.8b}, [TMP1]
    /* make copy */
    ins             v16.d[0], v11.d[1]
    mov             v18.16b,  v11.16b
    trn1            v11.8b,   v11.8b,   v16.8b
    trn2            v16.8b,   v18.8b,   v16.8b
    st1             {v7.8b},  [TMP2]
    st1             {v11.8b}, [TMP4]
    st1             {v16.8b}, [TMP5]
    /* Restore saved NEON and callee-saved registers, then return */
    sub             sp, sp, #176
    ldp             x22, x23, [sp], 16
    ld1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
    ld1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
    blr             x30
1049
1050    .unreq          DCT_TABLE
1051    .unreq          COEF_BLOCK
1052    .unreq          OUTPUT_BUF
1053    .unreq          OUTPUT_COL
1054    .unreq          TMP1
1055    .unreq          TMP2
1056    .unreq          TMP3
1057    .unreq          TMP4
1058
1059
1060/*****************************************************************************/
1061
1062/*
1063 * jsimd_idct_4x4_neon
1064 *
1065 * This function contains inverse-DCT code for getting reduced-size
 * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations
1067 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
1068 * function from jpeg-6b (jidctred.c).
1069 *
1070 * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
1071 *       requires much less arithmetic operations and hence should be faster.
1072 *       The primary purpose of this particular NEON optimized function is
1073 *       bit exact compatibility with jpeg-6b.
1074 *
1075 * TODO: a bit better instructions scheduling can be achieved by expanding
1076 *       idct_helper/transpose_4x4 macros and reordering instructions,
1077 *       but readability will suffer somewhat.
1078 */
1079
/* 13-bit fixed-point scale used by the accurate (islow-style) reduced IDCTs */
#define CONST_BITS  13

/* FIX(x) constants: round(x * 2^CONST_BITS), as in jidctred.c */
#define FIX_0_211164243  (1730)  /* FIX(0.211164243) */
#define FIX_0_509795579  (4176)  /* FIX(0.509795579) */
#define FIX_0_601344887  (4926)  /* FIX(0.601344887) */
#define FIX_0_720959822  (5906)  /* FIX(0.720959822) */
#define FIX_0_765366865  (6270)  /* FIX(0.765366865) */
#define FIX_0_850430095  (6967)  /* FIX(0.850430095) */
#define FIX_0_899976223  (7373)  /* FIX(0.899976223) */
#define FIX_1_061594337  (8697)  /* FIX(1.061594337) */
#define FIX_1_272758580  (10426) /* FIX(1.272758580) */
#define FIX_1_451774981  (11893) /* FIX(1.451774981) */
#define FIX_1_847759065  (15137) /* FIX(1.847759065) */
#define FIX_2_172734803  (17799) /* FIX(2.172734803) */
#define FIX_2_562915447  (20995) /* FIX(2.562915447) */
#define FIX_3_624509785  (29692) /* FIX(3.624509785) */

/* Constant table for jsimd_idct_4x4_neon; loaded as v0.4h-v2.4h (v3.4h is
 * padding).  Indexed through v0.h[n]/v1.h[n]/v2.h[n] in idct_helper. */
.balign 16
Ljsimd_idct_4x4_neon_consts:
    .short     FIX_1_847759065     /* v0.h[0] */
    .short     -FIX_0_765366865    /* v0.h[1] */
    .short     -FIX_0_211164243    /* v0.h[2] */
    .short     FIX_1_451774981     /* v0.h[3] */
    .short     -FIX_2_172734803    /* v1.h[0] */
    .short     FIX_1_061594337     /* v1.h[1] */
    .short     -FIX_0_509795579    /* v1.h[2] */
    .short     -FIX_0_601344887    /* v1.h[3] */
    .short     FIX_0_899976223     /* v2.h[0] */
    .short     FIX_2_562915447     /* v2.h[1] */
    .short     1 << (CONST_BITS+1) /* v2.h[2] */
    .short     0                   /* v2.h[3] */
1111
/* One 4-point 1-D IDCT over four coefficient vectors (4 lanes = 4 columns).
 * Inputs \x4..\x16 are .4h coefficient rows; results are rounded, shifted
 * right by \shift, and narrowed into the .4h outputs \y26..\y29.
 * v28/v30 accumulate the even part (DC scaled by v2.h[2] = 1<<(CONST_BITS+1),
 * plus/minus the row-2 rotation); v26/v24 accumulate the odd part.
 * Clobbers v20, v24, v26, v28, v30. */
.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    smull           v28.4s, \x4,    v2.h[2]
    smlal           v28.4s, \x8,    v0.h[0]
    smlal           v28.4s, \x14,   v0.h[1]

    smull           v26.4s, \x16,   v1.h[2]
    smlal           v26.4s, \x12,   v1.h[3]
    smlal           v26.4s, \x10,   v2.h[0]
    smlal           v26.4s, \x6,    v2.h[1]

    smull           v30.4s, \x4,    v2.h[2]
    smlsl           v30.4s, \x8,    v0.h[0]
    smlsl           v30.4s, \x14,   v0.h[1]

    smull           v24.4s, \x16,   v0.h[2]
    smlal           v24.4s, \x12,   v0.h[3]
    smlal           v24.4s, \x10,   v1.h[0]
    smlal           v24.4s, \x6,    v1.h[1]

    add             v20.4s, v28.4s, v26.4s
    sub             v28.4s, v28.4s, v26.4s

/* RSHRN's immediate is limited to the element size (16 here), so shifts
 * larger than 16 must round-shift in 32 bits first and then narrow. */
.if \shift > 16
    srshr           v20.4s, v20.4s, #\shift
    srshr           v28.4s, v28.4s, #\shift
    xtn             \y26,   v20.4s
    xtn             \y29,   v28.4s
.else
    rshrn           \y26,   v20.4s, #\shift
    rshrn           \y29,   v28.4s, #\shift
.endif

    add             v20.4s, v30.4s, v24.4s
    sub             v30.4s, v30.4s, v24.4s

.if \shift > 16
    srshr           v20.4s, v20.4s, #\shift
    srshr           v30.4s, v30.4s, #\shift
    xtn             \y27,   v20.4s
    xtn             \y28,   v30.4s
.else
    rshrn           \y27,   v20.4s, #\shift
    rshrn           \y28,   v30.4s, #\shift
.endif

.endm
1158
/*
 * jsimd_idct_4x4_neon(dct_table, coef_block, output_buf, output_col)
 *
 * In (AAPCS64): x0 = dct_table, x1 = coef_block (8x8 int16 coefficients),
 *               x2 = output_buf (row pointers), x3 = output_col.
 * Out: none (writes a reduced 4x4 block of unsigned 8-bit samples).
 */
asm_function jsimd_idct_4x4_neon

    /* TMP1-TMP3 reuse x0-x2 once the corresponding arguments are consumed;
     * TMP4 (x15) is saved/restored below because x15 is also used as the
     * constants pointer. */
    DCT_TABLE       .req x0
    COEF_BLOCK      .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_COL      .req x3
    TMP1            .req x0
    TMP2            .req x1
    TMP3            .req x2
    TMP4            .req x15

    /* Save all used NEON registers */
    /* NOTE(review): as in the other IDCT routines here, the post-indexed
     * stores return sp to its entry value, leaving the save area *below*
     * sp (no red zone on non-Apple AArch64) -- confirm against upstream. */
    sub             sp, sp, 272
    str             x15, [sp], 16
    /* Load constants (v3.4h is just used for padding) */
    adr             TMP4, Ljsimd_idct_4x4_neon_consts
    st1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
    st1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    st1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
    st1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
    st1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
    st1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | v4.4h   | v5.4h
     *   1 | v6.4h   | v7.4h
     *   2 | v8.4h   | v9.4h
     *   3 | v10.4h  | v11.4h
     *   4 | -       | -
     *   5 | v12.4h  | v13.4h
     *   6 | v14.4h  | v15.4h
     *   7 | v16.4h  | v17.4h
     */
    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
    ld1             {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
    /* skip row 4 -- it is not used by the 4x4 reduced IDCT (see table) */
    add             COEF_BLOCK, COEF_BLOCK, #16
    ld1             {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
    ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16
    /* dequantize */
    ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
    mul             v4.4h, v4.4h, v18.4h
    mul             v5.4h, v5.4h, v19.4h
    ins             v4.d[1], v5.d[0]    /* 128 bit q4 */
    ld1             {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
    mul             v6.4h, v6.4h, v20.4h
    mul             v7.4h, v7.4h, v21.4h
    ins             v6.d[1], v7.d[0]    /* 128 bit q6 */
    mul             v8.4h, v8.4h, v22.4h
    mul             v9.4h, v9.4h, v23.4h
    ins             v8.d[1], v9.d[0]    /* 128 bit q8 */
    /* skip quant table row 4 to match the skipped coefficient row */
    add             DCT_TABLE, DCT_TABLE, #16
    ld1             {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
    mul             v10.4h, v10.4h, v24.4h
    mul             v11.4h, v11.4h, v25.4h
    ins             v10.d[1], v11.d[0]  /* 128 bit q10 */
    mul             v12.4h, v12.4h, v26.4h
    mul             v13.4h, v13.4h, v27.4h
    ins             v12.d[1], v13.d[0]  /* 128 bit q12 */
    ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
    mul             v14.4h, v14.4h, v28.4h
    mul             v15.4h, v15.4h, v29.4h
    ins             v14.d[1], v15.d[0]  /* 128 bit q14 */
    mul             v16.4h, v16.4h, v30.4h
    mul             v17.4h, v17.4h, v31.4h
    ins             v16.d[1], v17.d[0]  /* 128 bit q16 */

    /* Pass 1: columns, descale by CONST_BITS - 1 = 12 */
    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, v4.4h, v6.4h, v8.4h, v10.4h
    transpose_4x4   v4, v6, v8, v10, v3
    ins             v10.d[1], v11.d[0]
    idct_helper     v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, v5.4h, v7.4h, v9.4h, v11.4h
    transpose_4x4   v5, v7, v9, v11, v3
    /* NOTE(review): same ins as after the first transpose; looks
     * asymmetric but matches the historical code -- verify upstream. */
    ins             v10.d[1], v11.d[0]
    /* Pass 2: rows, descale by CONST_BITS + PASS1_BITS + 3 = 19 */
    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, v26.4h, v27.4h, v28.4h, v29.4h
    transpose_4x4   v26, v27, v28, v29, v3

    /* Range limit: add CENTERJSAMPLE bias, then saturate to [0, 255] */
    movi            v30.8h, #0x80
    ins             v26.d[1], v27.d[0]
    ins             v28.d[1], v29.d[0]
    add             v26.8h, v26.8h, v30.8h
    add             v28.8h, v28.8h, v30.8h
    sqxtun          v26.8b, v26.8h
    sqxtun          v27.8b, v28.8h

    /* Store results to the output buffer */
    ldp             TMP1, TMP2, [OUTPUT_BUF], 16
    ldp             TMP3, TMP4, [OUTPUT_BUF]
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL

#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use much less instructions on little endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses
     */
    st1             {v26.s}[0], [TMP1], 4
    st1             {v27.s}[0], [TMP3], 4
    st1             {v26.s}[1], [TMP2], 4
    st1             {v27.s}[1], [TMP4], 4
#else
    /* Byte-at-a-time stores: safe for arbitrarily aligned output rows */
    st1             {v26.b}[0], [TMP1], 1
    st1             {v27.b}[0], [TMP3], 1
    st1             {v26.b}[1], [TMP1], 1
    st1             {v27.b}[1], [TMP3], 1
    st1             {v26.b}[2], [TMP1], 1
    st1             {v27.b}[2], [TMP3], 1
    st1             {v26.b}[3], [TMP1], 1
    st1             {v27.b}[3], [TMP3], 1

    st1             {v26.b}[4], [TMP2], 1
    st1             {v27.b}[4], [TMP4], 1
    st1             {v26.b}[5], [TMP2], 1
    st1             {v27.b}[5], [TMP4], 1
    st1             {v26.b}[6], [TMP2], 1
    st1             {v27.b}[6], [TMP4], 1
    st1             {v26.b}[7], [TMP2], 1
    st1             {v27.b}[7], [TMP4], 1
#endif

    /* vpop            {v8.4h - v15.4h}    ;not available */
    /* Restore saved registers and return */
    sub             sp, sp, #272
    ldr             x15, [sp], 16
    ld1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
    ld1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
    ld1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
    ld1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
    ld1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
    blr             x30

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4

.purgem idct_helper
1309
1310
1311/*****************************************************************************/
1312
1313/*
1314 * jsimd_idct_2x2_neon
1315 *
1316 * This function contains inverse-DCT code for getting reduced-size
1317 * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations
1318 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
1319 * function from jpeg-6b (jidctred.c).
1320 *
1321 * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse-DCT, which
1322 *       requires far fewer arithmetic operations and hence should be faster.
1323 *       The primary purpose of this particular NEON optimized function is
1324 *       bit exact compatibility with jpeg-6b.
1325 */
1326
/* Fixed-point multiplier constants for the reduced 2x2 IDCT.  They are
 * loaded as a single 4x16-bit vector into v14 and indexed as v14.h[0..3]
 * by idct_helper.
 */
.balign 8
Ljsimd_idct_2x2_neon_consts:
    .short     -FIX_0_720959822    /* v14[0] */
    .short     FIX_0_850430095     /* v14[1] */
    .short     -FIX_1_272758580    /* v14[2] */
    .short     FIX_3_624509785     /* v14[3] */
1333
/* One pass of the reduced 2x2 inverse DCT.
 *
 * \x4                  = even input (coefficient row/column 0), widened
 *                        with a << 15 fixed-point scaling
 * \x6 \x10 \x12 \x16   = odd inputs, accumulated with the multipliers
 *                        held in v14 (Ljsimd_idct_2x2_neon_consts)
 * \shift               = rounding right-shift applied to both outputs
 * \y26, \y27           = 4x16-bit destinations: (even + odd) and
 *                        (even - odd) respectively
 *
 * Clobbers v15, v20 and v26.
 */
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
    sshll      v15.4s, \x4,    #15       /* even part, scaled by 2^15 */
    smull      v26.4s, \x6,    v14.h[3]  /* odd part: x6 * FIX_3_624509785 */
    smlal      v26.4s, \x10,   v14.h[2]  /* + x10 * -FIX_1_272758580 */
    smlal      v26.4s, \x12,   v14.h[1]  /* + x12 * FIX_0_850430095 */
    smlal      v26.4s, \x16,   v14.h[0]  /* + x16 * -FIX_0_720959822 */

    add        v20.4s, v15.4s, v26.4s
    sub        v15.4s, v15.4s, v26.4s

.if \shift > 16
    /* rshrn's immediate is limited to 1..16 when narrowing 32 -> 16 bits,
     * so larger shifts must be split into a rounding shift plus a narrow.
     */
    srshr      v20.4s, v20.4s, #\shift
    srshr      v15.4s, v15.4s, #\shift
    xtn        \y26,   v20.4s
    xtn        \y27,   v15.4s
.else
    rshrn      \y26,   v20.4s, #\shift
    rshrn      \y27,   v15.4s, #\shift
.endif

.endm
1355
/*
 * jsimd_idct_2x2_neon(void *dct_table, JCOEFPTR coef_block,
 *                     JSAMPARRAY output_buf, JDIMENSION output_col)
 *
 * 2x2 reduced-size inverse DCT on a quantized 8x8 coefficient block.
 * Argument registers (per the .req aliases below):
 *   x0 = dct_table, x1 = coef_block, x2 = output_buf, x3 = output_col
 */
asm_function jsimd_idct_2x2_neon

    DCT_TABLE       .req x0
    COEF_BLOCK      .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_COL      .req x3
    TMP1            .req x0    /* reuses x0 once DCT_TABLE is consumed */
    TMP2            .req x15

    /* vpush           {v8.4h - v15.4h}            ; not available */
    /* Reserve 208 bytes, then fill the save area bottom-up with
     * post-increment stores so that sp ends back at its entry value.
     * Only the low 64 bits of each vector register are preserved,
     * matching the convention used throughout this file.
     */
    sub             sp, sp, 208
    str             x15, [sp], 16

    /* Load constants */
    adr             TMP2, Ljsimd_idct_2x2_neon_consts
    st1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    st1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
    st1             {v21.8b, v22.8b}, [sp], 16
    st1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
    st1             {v30.8b, v31.8b}, [sp], 16
    ld1             {v14.4h}, [TMP2]   /* v14 = IDCT multiplier constants */

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | v4.4h   | v5.4h
     *   1 | v6.4h   | v7.4h
     *   2 | -       | -
     *   3 | v10.4h  | v11.4h
     *   4 | -       | -
     *   5 | v12.4h  | v13.4h
     *   6 | -       | -
     *   7 | v16.4h  | v17.4h
     */
    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
    add             COEF_BLOCK, COEF_BLOCK, #16   /* skip row 2 */
    ld1             {v10.4h, v11.4h}, [COEF_BLOCK], 16
    add             COEF_BLOCK, COEF_BLOCK, #16   /* skip row 4 */
    ld1             {v12.4h, v13.4h}, [COEF_BLOCK], 16
    add             COEF_BLOCK, COEF_BLOCK, #16   /* skip row 6 */
    ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16
    /* Dequantize: multiply each used row by the matching quant-table row,
     * then pack the row's two halves into one 128-bit register. */
    ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
    mul             v4.4h, v4.4h, v18.4h
    mul             v5.4h, v5.4h, v19.4h
    ins             v4.d[1], v5.d[0]
    mul             v6.4h, v6.4h, v20.4h
    mul             v7.4h, v7.4h, v21.4h
    ins             v6.d[1], v7.d[0]
    add             DCT_TABLE, DCT_TABLE, #16
    ld1             {v24.4h, v25.4h}, [DCT_TABLE], 16
    mul             v10.4h, v10.4h, v24.4h
    mul             v11.4h, v11.4h, v25.4h
    ins             v10.d[1], v11.d[0]
    add             DCT_TABLE, DCT_TABLE, #16
    ld1             {v26.4h, v27.4h}, [DCT_TABLE], 16
    mul             v12.4h, v12.4h, v26.4h
    mul             v13.4h, v13.4h, v27.4h
    ins             v12.d[1], v13.d[0]
    add             DCT_TABLE, DCT_TABLE, #16
    ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
    mul             v16.4h, v16.4h, v30.4h
    mul             v17.4h, v17.4h, v31.4h
    ins             v16.d[1], v17.d[0]

    /* Pass 1 */
#if 0
    /* Straightforward variant, apparently kept as a reference */
    idct_helper     v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
    transpose_4x4   v4.4h, v6.4h, v8.4h,  v10.4h
    idct_helper     v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
    transpose_4x4   v5.4h, v7.4h, v9.4h,  v11.4h
#else
    /* Manually scheduled equivalent of the two idct_helper/transpose
     * pairs above, processing both column groups in one interleaved
     * sequence, followed by a transpose of the packed results. */
    smull           v26.4s, v6.4h,  v14.h[3]
    smlal           v26.4s, v10.4h, v14.h[2]
    smlal           v26.4s, v12.4h, v14.h[1]
    smlal           v26.4s, v16.4h, v14.h[0]
    smull           v24.4s, v7.4h,  v14.h[3]
    smlal           v24.4s, v11.4h, v14.h[2]
    smlal           v24.4s, v13.4h, v14.h[1]
    smlal           v24.4s, v17.4h, v14.h[0]
    sshll           v15.4s, v4.4h,  #15
    sshll           v30.4s, v5.4h,  #15
    add             v20.4s, v15.4s, v26.4s
    sub             v15.4s, v15.4s, v26.4s
    rshrn           v4.4h,  v20.4s, #13
    rshrn           v6.4h,  v15.4s, #13
    add             v20.4s, v30.4s, v24.4s
    sub             v15.4s, v30.4s, v24.4s
    rshrn           v5.4h,  v20.4s, #13
    rshrn           v7.4h,  v15.4s, #13
    ins             v4.d[1], v5.d[0]
    ins             v6.d[1], v7.d[0]
    transpose       v4, v6, v3, .16b, .8h
    transpose       v6, v10, v3, .16b, .4s
    ins             v11.d[0], v10.d[1]
    ins             v7.d[0], v6.d[1]
#endif

    /* Pass 2 */
    idct_helper     v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h

    /* Range limit: add the 0x80 bias and saturate to unsigned 8 bits */
    movi            v30.8h, #0x80
    ins             v26.d[1], v27.d[0]
    add             v26.8h, v26.8h, v30.8h
    sqxtun          v30.8b, v26.8h
    ins             v26.d[0], v30.d[0]
    sqxtun          v27.8b, v26.8h

    /* Store results to the output buffer: 2 rows x 2 bytes, stored
     * bytewise so no output alignment is assumed. */
    ldp             TMP1, TMP2, [OUTPUT_BUF]
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL

    st1             {v26.b}[0], [TMP1], 1
    st1             {v27.b}[4], [TMP1], 1
    st1             {v26.b}[1], [TMP2], 1
    st1             {v27.b}[5], [TMP2], 1

    /* Restore the NEON registers saved above (mirror of the prologue) */
    sub             sp, sp, #208
    ldr             x15, [sp], 16
    ld1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
    ld1             {v21.8b, v22.8b}, [sp], 16
    ld1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
    ld1             {v30.8b, v31.8b}, [sp], 16
    /* `ret` rather than `blr x30`: blr needlessly writes the link register
     * and desynchronizes the CPU's return-address predictor. */
    ret

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2

.purgem idct_helper
1496
1497
1498/*****************************************************************************/
1499
1500/*
1501 * jsimd_ycc_extrgb_convert_neon
1502 * jsimd_ycc_extbgr_convert_neon
1503 * jsimd_ycc_extrgbx_convert_neon
1504 * jsimd_ycc_extbgrx_convert_neon
1505 * jsimd_ycc_extxbgr_convert_neon
1506 * jsimd_ycc_extxrgb_convert_neon
1507 *
1508 * Colorspace conversion YCbCr -> RGB
1509 */
1510
1511
/* Load \size Y, Cb (U) and Cr (V) samples into v0, v4 and v5 respectively,
 * advancing the Y/U/V pointers past the consumed bytes.  The partial sizes
 * fill successive lane groups (4: lanes 0-3, 2: lanes 4-5, 1: lane 6) so a
 * remainder of up to 7 pixels is assembled by chaining 4 + 2 + 1 loads.
 * Only the full-width load prefetches ahead.
 */
.macro do_load size
    .if \size == 8
        ld1  {v4.8b}, [U], 8
        ld1  {v5.8b}, [V], 8
        ld1  {v0.8b}, [Y], 8
        prfm pldl1keep, [U, #64]
        prfm pldl1keep, [V, #64]
        prfm pldl1keep, [Y, #64]
    .elseif \size == 4
        ld1  {v4.b}[0], [U], 1
        ld1  {v4.b}[1], [U], 1
        ld1  {v4.b}[2], [U], 1
        ld1  {v4.b}[3], [U], 1
        ld1  {v5.b}[0], [V], 1
        ld1  {v5.b}[1], [V], 1
        ld1  {v5.b}[2], [V], 1
        ld1  {v5.b}[3], [V], 1
        ld1  {v0.b}[0], [Y], 1
        ld1  {v0.b}[1], [Y], 1
        ld1  {v0.b}[2], [Y], 1
        ld1  {v0.b}[3], [Y], 1
    .elseif \size == 2
        ld1  {v4.b}[4], [U], 1
        ld1  {v4.b}[5], [U], 1
        ld1  {v5.b}[4], [V], 1
        ld1  {v5.b}[5], [V], 1
        ld1  {v0.b}[4], [Y], 1
        ld1  {v0.b}[5], [Y], 1
    .elseif \size == 1
        ld1  {v4.b}[6], [U], 1
        ld1  {v5.b}[6], [V], 1
        ld1  {v0.b}[6], [Y], 1
    .else
        /* GAS's .error directive requires a string argument; the previous
         * unquoted form would itself fail to assemble if ever expanded. */
        .error "unsupported macroblock size"
    .endif
.endm
1548
/* Store \size pixels of \bpp bits each to [RGB], advancing RGB.
 * 24/32 bpp store interleaved component bytes from v10-v12 / v10-v13;
 * 16 bpp (RGB565) stores packed halfwords from v25.  Partial sizes use
 * the same lane grouping as do_load (4: lanes 0-3, 2: lanes 4-5,
 * 1: lane 6).
 */
.macro do_store bpp, size
    .if \bpp == 24
        .if \size == 8
            st3  {v10.8b, v11.8b, v12.8b}, [RGB], 24
        .elseif \size == 4
            st3  {v10.b, v11.b, v12.b}[0], [RGB], 3
            st3  {v10.b, v11.b, v12.b}[1], [RGB], 3
            st3  {v10.b, v11.b, v12.b}[2], [RGB], 3
            st3  {v10.b, v11.b, v12.b}[3], [RGB], 3
        .elseif \size == 2
            st3  {v10.b, v11.b, v12.b}[4], [RGB], 3
            st3  {v10.b, v11.b, v12.b}[5], [RGB], 3
        .elseif \size == 1
            st3  {v10.b, v11.b, v12.b}[6], [RGB], 3
        .else
            /* .error requires a quoted string argument in GAS */
            .error "unsupported macroblock size"
        .endif
    .elseif \bpp == 32
        .if \size == 8
            st4  {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
        .elseif \size == 4
            st4  {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
            st4  {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
            st4  {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
            st4  {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
        .elseif \size == 2
            st4  {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
            st4  {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
        .elseif \size == 1
            st4  {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
        .else
            .error "unsupported macroblock size"
        .endif
    .elseif \bpp == 16
        .if \size == 8
            st1  {v25.8h}, [RGB], 16
        .elseif \size == 4
            st1  {v25.4h}, [RGB], 8
        .elseif \size == 2
            st1  {v25.h}[4], [RGB], 2
            st1  {v25.h}[5], [RGB], 2
        .elseif \size == 1
            st1  {v25.h}[6], [RGB], 2
        .else
            .error "unsupported macroblock size"
        .endif
    .else
        .error "unsupported bpp"
    .endif
.endm
1599
/* Expand one YCbCr -> RGB converter function.
 * \colorid - function-name suffix (extrgb, extbgr, ..., rgb565)
 * \bpp     - output bits per pixel: 24, 32 or 16 (RGB565)
 * \r_offs/\g_offs/\b_offs - which of v10..v13 receives each component
 * \defsize - lane suffix used when narrowing components to bytes (.8b)
 * (\rsize, \gsize and \bsize are accepted but not referenced in this
 * implementation.)
 */
.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize, defsize

/*
 * 2-stage pipelined YCbCr->RGB conversion
 */

/* Stage 1: widen the chroma samples, subtract the 128 bias (v2.8h holds
 * -128) and form the 32-bit fixed-point products. */
.macro do_yuv_to_rgb_stage1
    uaddw        v6.8h, v2.8h, v4.8b     /* v6.8h = u - 128 */
    uaddw        v8.8h, v2.8h, v5.8b     /* v8.8h = v - 128 */
    smull        v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
    smlal        v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
    smull2       v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2       v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    smull        v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
    smull2       v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
    smull        v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
    smull2       v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
.endm

/* Stage 2: round/narrow the products (note the different fixed-point
 * scales: #15 for the G accumulator, #14 for R and B), add the Y samples
 * and saturate to the output format. */
.macro do_yuv_to_rgb_stage2
    rshrn        v20.4h, v20.4s, #15
    rshrn2       v20.8h, v22.4s, #15
    rshrn        v24.4h, v24.4s, #14
    rshrn2       v24.8h, v26.4s, #14
    rshrn        v28.4h, v28.4s, #14
    rshrn2       v28.8h, v30.4s, #14
    uaddw        v20.8h, v20.8h, v0.8b   /* G */
    uaddw        v24.8h, v24.8h, v0.8b   /* R */
    uaddw        v28.8h, v28.8h, v0.8b   /* B */
.if \bpp != 16
    sqxtun       v1\g_offs\defsize, v20.8h
    sqxtun       v1\r_offs\defsize, v24.8h
    sqxtun       v1\b_offs\defsize, v28.8h
.else
    /* RGB565: saturate each component into the top of a 16-bit lane,
     * then merge the 5/6/5 fields into v25 with shift-right-insert */
    sqshlu       v21.8h, v20.8h, #8
    sqshlu       v25.8h, v24.8h, #8
    sqshlu       v29.8h, v28.8h, #8
    sri          v25.8h, v21.8h, #5
    sri          v25.8h, v29.8h, #11
.endif

.endm

/* Fused steady-state iteration of the software pipeline: finish stage 2
 * for the previous 8 pixels and store them, while interleaving the loads,
 * prefetches and stage-1 arithmetic for the next 8 pixels. */
.macro do_yuv_to_rgb_stage2_store_load_stage1
    rshrn        v20.4h, v20.4s, #15
    rshrn        v24.4h, v24.4s, #14
    rshrn        v28.4h, v28.4s, #14
    ld1          {v4.8b}, [U], 8
    rshrn2       v20.8h, v22.4s, #15
    rshrn2       v24.8h, v26.4s, #14
    rshrn2       v28.8h, v30.4s, #14
    ld1          {v5.8b}, [V], 8
    uaddw        v20.8h, v20.8h, v0.8b
    uaddw        v24.8h, v24.8h, v0.8b
    uaddw        v28.8h, v28.8h, v0.8b
.if \bpp != 16 /**************** rgb24/rgb32 *********************************/
    sqxtun       v1\g_offs\defsize, v20.8h
    ld1          {v0.8b}, [Y], 8
    sqxtun       v1\r_offs\defsize, v24.8h
    prfm         pldl1keep, [U, #64]
    prfm         pldl1keep, [V, #64]
    prfm         pldl1keep, [Y, #64]
    sqxtun       v1\b_offs\defsize, v28.8h
    uaddw        v6.8h, v2.8h, v4.8b     /* v6.8h = u - 128 */
    uaddw        v8.8h, v2.8h, v5.8b     /* v8.8h = v - 128 */
    smull        v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
    smlal        v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
    smull2       v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2       v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    smull        v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
    smull2       v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
.else /**************************** rgb565 ***********************************/
    sqshlu       v21.8h, v20.8h, #8
    sqshlu       v25.8h, v24.8h, #8
    sqshlu       v29.8h, v28.8h, #8
    uaddw        v6.8h, v2.8h, v4.8b     /* v6.8h = u - 128 */
    uaddw        v8.8h, v2.8h, v5.8b     /* v8.8h = v - 128 */
    ld1          {v0.8b}, [Y], 8
    smull        v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
    smlal        v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
    smull2       v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2       v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    sri          v25.8h, v21.8h, #5
    smull        v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
    smull2       v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
    prfm         pldl1keep, [U, #64]
    prfm         pldl1keep, [V, #64]
    prfm         pldl1keep, [Y, #64]
    sri          v25.8h, v29.8h, #11
.endif
    do_store     \bpp, 8
    smull        v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
    smull2       v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
.endm

/* Non-pipelined conversion of one group (used for the leftover pixels) */
.macro do_yuv_to_rgb
    do_yuv_to_rgb_stage1
    do_yuv_to_rgb_stage2
.endm

/* Apple gas crashes on adrl, work around that by using adr.
 * But this requires a copy of these constants for each function.
 */

.balign 16
Ljsimd_ycc_\colorid\()_neon_consts:
    .short          0,      0,     0,      0       /* padding -> v0.4h */
    .short          22971, -11277, -23401, 29033   /* multipliers -> v1.4h */
    .short          -128,  -128,   -128,   -128    /* -128 bias -> v2.8h */
    .short          -128,  -128,   -128,   -128

/* jsimd_ycc_<colorid>_convert_neon(JDIMENSION output_width,
 *     JSAMPIMAGE input_buf, JDIMENSION input_row,
 *     JSAMPARRAY output_buf, int num_rows)
 * (argument roles follow the .req aliases below) */
asm_function jsimd_ycc_\colorid\()_convert_neon
    OUTPUT_WIDTH    .req x0
    INPUT_BUF       .req x1
    INPUT_ROW       .req x2
    OUTPUT_BUF      .req x3
    NUM_ROWS        .req x4

    INPUT_BUF0      .req x5
    INPUT_BUF1      .req x6
    INPUT_BUF2      .req x1    /* aliases INPUT_BUF -- must be loaded last */

    RGB             .req x7
    Y               .req x8
    U               .req x9
    V               .req x10
    N               .req x15

    /* Reserve 336 bytes and fill the save area bottom-up with
     * post-increment stores; sp ends back at its entry value. */
    sub             sp, sp, 336
    str             x15, [sp], 16
    /* Point x15 at the constant table: v0.4h gets the zero padding,
     * v1.4h the four multipliers, v2.8h the -128 chroma bias. */
    adr             x15, Ljsimd_ycc_\colorid\()_neon_consts
    /* Save NEON registers (low 64 bits only) */
    st1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
    st1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    st1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
    st1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
    st1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
    st1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
    ld1             {v0.4h, v1.4h}, [x15], 16
    ld1             {v2.8h}, [x15]

    /* Save ARM registers and handle input arguments */
    /* push            {x4, x5, x6, x7, x8, x9, x10, x30} */
    stp             x4, x5, [sp], 16
    stp             x6, x7, [sp], 16
    stp             x8, x9, [sp], 16
    stp             x10, x30, [sp], 16
    ldr             INPUT_BUF0, [INPUT_BUF]
    ldr             INPUT_BUF1, [INPUT_BUF, #8]
    ldr             INPUT_BUF2, [INPUT_BUF, #16]   /* overwrites x1 (INPUT_BUF) */
    .unreq          INPUT_BUF

    /* Fill v10 and v13 with 0xFF; whichever of v10..v13 is not
     * overwritten by an R/G/B component supplies the filler byte for
     * the 32-bpp formats (see do_store). */
    movi            v10.16b, #255
    movi            v13.16b, #255

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    b.lt            9f
0:
    lsl             x16, INPUT_ROW, #3   /* byte offset into the row-pointer arrays */
    ldr             Y, [INPUT_BUF0, x16]
    ldr             U, [INPUT_BUF1, x16]
    mov             N, OUTPUT_WIDTH
    ldr             V, [INPUT_BUF2, x16]
    add             INPUT_ROW, INPUT_ROW, #1
    ldr             RGB, [OUTPUT_BUF], #8

    /* Inner loop over pixels */
    subs            N, N, #8
    b.lt            3f                   /* fewer than 8 pixels: leftovers only */
    do_load         8
    do_yuv_to_rgb_stage1
    subs            N, N, #8
    b.lt            2f                   /* only one full group: drain the pipeline */
1:
    do_yuv_to_rgb_stage2_store_load_stage1
    subs            N, N, #8
    b.ge            1b
2:
    do_yuv_to_rgb_stage2
    do_store        \bpp, 8
    tst             N, #7                /* low 3 bits of N survive the subtractions */
    b.eq            8f
3:
    tst             N, #4
    b.eq            3f
    do_load         4
3:                                       /* local label 3 is deliberately reused */
    tst             N, #2
    b.eq            4f
    do_load         2
4:
    tst             N, #1
    b.eq            5f
    do_load         1
5:
    do_yuv_to_rgb
    tst             N, #4
    b.eq            6f
    do_store        \bpp, 4
6:
    tst             N, #2
    b.eq            7f
    do_store        \bpp, 2
7:
    tst             N, #1
    b.eq            8f
    do_store        \bpp, 1
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    b.gt            0b
9:
    /* Restore all registers and return */
    sub             sp, sp, #336
    ldr             x15, [sp], 16
    ld1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
    ld1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
    ld1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
    ld1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
    ld1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
    /* pop             {r4, r5, r6, r7, r8, r9, r10, pc} */
    ldp             x4, x5, [sp], 16
    ldp             x6, x7, [sp], 16
    ldp             x8, x9, [sp], 16
    ldp             x10, x30, [sp], 16
    br              x30                  /* NOTE(review): `ret` would be preferable here for return prediction */
    .unreq          OUTPUT_WIDTH
    .unreq          INPUT_ROW
    .unreq          OUTPUT_BUF
    .unreq          NUM_ROWS
    .unreq          INPUT_BUF0
    .unreq          INPUT_BUF1
    .unreq          INPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N

.purgem do_yuv_to_rgb
.purgem do_yuv_to_rgb_stage1
.purgem do_yuv_to_rgb_stage2
.purgem do_yuv_to_rgb_stage2_store_load_stage1
.endm
1851
/* Instantiate one converter per supported output pixel format; the R/G/B
 * offsets select which of v10..v13 holds each component in do_store. */
/*--------------------------------- id ----- bpp R  rsize  G  gsize  B  bsize  defsize   */
generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,   1, .4h,   2, .4h,   .8b
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,   1, .4h,   0, .4h,   .8b
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h,   1, .4h,   2, .4h,   .8b
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h,   1, .4h,   0, .4h,   .8b
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h,   2, .4h,   1, .4h,   .8b
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,   2, .4h,   3, .4h,   .8b
generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h,   0, .4h,   0, .4h,   .8b
.purgem do_load
.purgem do_store
1862