/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;

/* Fixed-point precision after vertical pass -- 16-bit data minus 1 sign and 1
 * integer bit (bicubic has a little overshoot).  It would also be possible to
 * add a temporary DC bias to eliminate the sign bit for more precision, but
 * that's extra arithmetic.
 */
.set VERTBITS, 14
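
/* For reference, the bit budget works out as follows: 8-bit pixels multiplied
 * by 16-bit coefficients give 24 significant bits, and the vertical pass
 * narrows by 8 + (16 - VERTBITS) = 10 bits, leaving VERTBITS = 14 fraction
 * bits in each signed 16-bit lane, with the sign bit and one integer bit
 * spare to absorb that overshoot.
 */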

/* The size of the scratch buffer in which we store our vertically convolved
 * intermediates.
 */
.set CHUNKSHIFT, 7       /* 5 tests better for uchar4, but 7 is necessary for ridiculous (10:1) scale factors */
.set CHUNKSIZE, (1 << CHUNKSHIFT)

/* The number of pixels processed in a single iteration of the innermost
 * loop.
 */
.set VECSHIFT, 3
.set VECSIZE, (1<<VECSHIFT)

/* Read four different lines (except at edges where addresses may be clamped,
 * which is why we don't simply take base and stride registers), and multiply
 * and accumulate them by the coefficients in v3[0..3], leaving the results in
 * v12.  This gives eight 16-bit results representing a horizontal line of 2-8
 * input pixels (depending on number of components per pixel) to be fed into
 * the horizontal scaling pass.
 *
 * Input coefficients are 16-bit unsigned fixed-point (although [0] and [3] are
 * known to represent negative values and VMLS is used to implement this).
 * Output is VERTBITS signed fixed-point, which must leave room for a little
 * overshoot beyond the maximum pixel value.
 */
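/* For reference, each 16-bit result of the macro below works out to the
 * following (a C sketch only, not part of the build; yr[] here stands for the
 * 16-bit magnitudes of the y coefficients held in v3, with the signs of the
 * outer taps hard-coded as described above, and saturate_s16() standing in
 * for the saturation performed by sqshrn):
 *
 *     int32_t acc = - srcn[i] * yr[0] + src0[i] * yr[1]
 *                   + src1[i] * yr[2] - src2[i] * yr[3];
 *     out[i] = saturate_s16(acc >> (8 + 16 - VERTBITS));
 */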
.macro vert8, dstlo=v12.4h, dsthi=v12.8h
        ld1         {v8.8b}, [x4], #8
        ld1         {v9.8b}, [x5], #8
        ld1         {v10.8b}, [x6], #8
        ld1         {v11.8b}, [x7], #8
        uxtl        v8.8h, v8.8b
        uxtl        v9.8h, v9.8b
        uxtl        v10.8h, v10.8b
        uxtl        v11.8h, v11.8b
        umull       v12.4s, v9.4h, v3.h[1]
        umull2      v13.4s, v9.8h, v3.h[1]
        umlsl       v12.4s, v8.4h, v3.h[0]
        umlsl2      v13.4s, v8.8h, v3.h[0]
        umlal       v12.4s, v10.4h, v3.h[2]
        umlal2      v13.4s, v10.8h, v3.h[2]
        umlsl       v12.4s, v11.4h, v3.h[3]
        umlsl2      v13.4s, v11.8h, v3.h[3]

        /* Shift by 8 (bits per pixel), plus 16 (the fixed-point multiplies),
         * minus VERTBITS (the number of fraction bits we want to keep from
         * here on).
         */
        sqshrn      \dstlo, v12.4s, #8 + (16 - VERTBITS)
        sqshrn2     \dsthi, v13.4s, #8 + (16 - VERTBITS)
.endm

/* As above, but producing only four 16-bit results, written by default into
 * the high half of v12 (or into the destination given as an argument).
 */
.macro vert4, dst=v12.8h
        ld1         {v8.s}[0], [x4], #4
        ld1         {v9.s}[0], [x5], #4
        ld1         {v10.s}[0], [x6], #4
        ld1         {v11.s}[0], [x7], #4
        uxtl        v8.8h, v8.8b
        uxtl        v9.8h, v9.8b
        uxtl        v10.8h, v10.8b
        uxtl        v11.8h, v11.8b
        umull       v12.4s, v9.4h, v3.h[1]
        umlsl       v12.4s, v8.4h, v3.h[0]
        umlal       v12.4s, v10.4h, v3.h[2]
        umlsl       v12.4s, v11.4h, v3.h[3]
.ifc \dst,v12.8h
        sqshrn2     \dst, v12.4s, #8 + (16 - VERTBITS)
.else
        sqshrn      \dst, v12.4s, #8 + (16 - VERTBITS)
.endif
.endm


/* During horizontal resize, having CHUNKSIZE input pixels available means
 * being able to produce a varying amount of output, depending on the phase of
 * the data.  This function calculates the minimum number of VECSIZE chunks
 * extracted from a CHUNKSIZE window (x1), and the threshold value for when
 * the count will be one higher than that (x0).
 * These work out, conveniently, to be the quotient and remainder from:
 *      (CHUNKSIZE + xinc * VECSIZE - 1) / (xinc * VECSIZE)
 *
 * The two values are packed together in a uint64_t for convenience, and they
 * are in fact used this way as an arithmetic short-cut later on.
 */
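/* Roughly equivalent C, for reference only and not part of the build (xinc is
 * 16.16 fixed point, so CHUNKSIZE is scaled up to match before the division):
 *
 *     uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc) {
 *         uint64_t vecinc = (uint64_t) xinc << VECSHIFT;
 *         uint64_t x = ((uint64_t) CHUNKSIZE << 16) - 1 + vecinc;
 *         uint64_t quot = x / vecinc;
 *         uint64_t rem = x - quot * vecinc;
 *         return (quot << 32) + rem;
 *     }
 */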
/* uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc) */
ENTRY(rsdIntrinsicResize_oscctl_K)
        lsl         x2, x0, #VECSHIFT
        mov         x0, #(CHUNKSIZE << 16) - 1
        add         x0, x0, x2
        udiv        x1, x0, x2
        msub        x0, x1, x2, x0
        add         x0, x0, x1, LSL #32
        ret
END(rsdIntrinsicResize_oscctl_K)

/* Iterate to generate the uchar1, uchar2, and uchar4 versions of the code.
 * For the most part the vertical pass (the outer loop) is the same for all
 * versions.  Exceptions are handled in-line with conditional assembly.
 */
.irp comp, 1, 2, 4
.if \comp == 1
.set COMPONENT_SHIFT, 0
.elseif \comp == 2
.set COMPONENT_SHIFT, 1
.elseif \comp == 4
.set COMPONENT_SHIFT, 2
.else
.error "Unknown component count"
.endif
.set COMPONENT_COUNT, (1 << COMPONENT_SHIFT)
.set LOOP_OUTPUT_SIZE, (VECSIZE * COMPONENT_COUNT)

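/* The scratch buffer holds a double-length chunk plus four extra taps (the
 * width of the cubic window, copied to its far end for overflow address
 * handling), stored as int16_t per component:
 */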
.set BUFFER_SIZE, (CHUNKSIZE * 2 + 4) * COMPONENT_COUNT * 2

/* void rsdIntrinsicResizeB1_K(
 *             uint8_t * restrict dst,          // x0
 *             size_t count,                    // x1
 *             uint32_t xf,                     // x2
 *             uint32_t xinc,                   // x3
 *             uint8_t const * restrict srcn,   // x4
 *             uint8_t const * restrict src0,   // x5
 *             uint8_t const * restrict src1,   // x6
 *             uint8_t const * restrict src2,   // x7
 *             size_t xclip,                    // [sp,#0]  -> [sp,#64] -> x13
 *             size_t avail,                    // [sp,#8]  -> [sp,#72] -> x11
 *             uint64_t osc_ctl,                // [sp,#16] -> [sp,#80] -> x10
 *             int32_t const *yr);              // [sp,#24] -> [sp,#88] -> v4 (copied to v3 for scalar access)
 */
ENTRY(rsdIntrinsicResizeB\comp\()_K)
            sub         x8, sp, #32
            sub         sp, sp, #64
            st1         {v8.1d - v11.1d}, [sp]
            st1         {v12.1d - v15.1d}, [x8]

            /* align the working buffer on the stack to make it easy to use bit
             * twiddling for address calculations.
             */
            sub         x12, sp, #BUFFER_SIZE
            bic         x12, x12, #(1 << (CHUNKSHIFT + 1 + COMPONENT_SHIFT + 1)) - 1

            ldr         x8, [sp,#88]            // yr
            adr         x9, 8f
            ld1         {v4.4s}, [x8]
            ld1         {v5.8h}, [x9]
            sqxtun      v4.4h, v4.4s            // yr
            dup         v6.8h, w2
            dup         v7.8h, w3
            mla         v6.8h, v5.8h, v7.8h     // vxf
            shl         v7.8h, v7.8h, #VECSHIFT // vxinc

            /* Compute starting condition for oscillator used to compute ahead
             * of time how many iterations are possible before needing to
             * refill the working buffer.  This is based on the fixed-point
             * index of the last element in the vector of pixels processed in
             * each iteration, counting up until it would overflow.
             */
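            /* Concretely, this computes x8 = xf + (VECSIZE - 1) * xinc, the
             * 16.16 index of the last element of the first vector.
             */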
            sub         x8, x2, x3
            lsl         x9, x3, #VECSHIFT
            add         x8, x8, x9

            ldr         x10, [sp,#80]           // osc_ctl
            ldp         x13,x11, [sp,#64]       // xclip, avail

            mov         x18, sp
            mov         sp, x12

            /* x4-x7 contain pointers to the four lines of input to be
             * convolved.  These pointers have been clamped vertically and
             * horizontally (which is why it's not a simple row/stride pair),
             * and the xclip argument (now in x13) indicates how many pixels
             * from true the x position of the pointer is.  This value should
             * be 0, 1, or 2 only.
             *
             * Start by placing four pixels worth of input at the far end of
             * the buffer.  As many as two of these may be clipped, so four
             * pixels are fetched, and then the first pixel is duplicated and
             * the data shifted according to xclip.  The source pointers are
             * then also adjusted according to xclip so that subsequent fetches
             * match.
             */
            mov         v3.8b, v4.8b  /* make y coeffs available for vert4 and vert8 macros */
            sub         x14, x12, x13, LSL #(COMPONENT_SHIFT + 1)
            add         x15, x12, #(2 * CHUNKSIZE - 4) * COMPONENT_COUNT * 2
            add         x14, x14, #4 * COMPONENT_COUNT * 2
.if \comp == 1
            vert4       v12.4h
            dup         v11.4h, v12.h[0]
            st1         {v11.4h,v12.4h}, [x12]
            ld1         {v12.4h}, [x14]
            st1         {v12.4h}, [x15]
.elseif \comp == 2
            vert8
            dup         v11.4s, v12.s[0]
            st1         {v11.8h,v12.8h}, [x12]
            ld1         {v12.8h}, [x14]
            st1         {v12.8h}, [x15]
.elseif \comp == 4
            vert8       v14.4h, v14.8h
            vert8       v15.4h, v15.8h
            dup         v12.2d, v14.d[0]
            dup         v13.2d, v14.d[0]
            st1         {v12.8h,v13.8h}, [x12], #32
            st1         {v14.8h,v15.8h}, [x12]
            sub         x12, x12, #32
            ld1         {v11.8h,v12.8h}, [x14]
            st1         {v11.8h,v12.8h}, [x15]
.endif
            /* Count off four pixels into the working buffer.
             */
            sub         x11, x11, #4
            /* Incoming pointers were to the first _legal_ pixel.  Four pixels
             * were read unconditionally, but some may have been discarded by
             * xclip, so we rewind the pointers to compensate.
             */
            sub         x4, x4, x13, LSL #(COMPONENT_SHIFT)
            sub         x5, x5, x13, LSL #(COMPONENT_SHIFT)
            sub         x6, x6, x13, LSL #(COMPONENT_SHIFT)
            sub         x7, x7, x13, LSL #(COMPONENT_SHIFT)

            /* First tap starts where we just pre-filled, at the end of the
             * buffer.
             */
            add         x2, x2, #(CHUNKSIZE * 2 - 4) << 16

            /* Use overflowing arithmetic to implement wraparound array
             * indexing.
             */
            lsl         x2, x2, #(47 - CHUNKSHIFT)
            lsl         x3, x3, #(47 - CHUNKSHIFT)
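            /* In effect the scratch index is recovered later as
             *     idx = x2 >> (63 - CHUNKSHIFT)  ==  (xf >> 16) & (2 * CHUNKSIZE - 1)
             * so any carry out of the integer part simply wraps the index
             * around the double-length scratch buffer.
             */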


            /* Start of outermost loop.
             * Fetch CHUNKSIZE pixels into scratch buffer, then calculate the
             * number of iterations of the inner loop that can be performed and
             * get into that.
             *
             * The fill is complicated by the possibility of running out of
             * input before the scratch buffer is filled.  If this isn't a risk
             * then it's handled by the simple loop at 2:, otherwise the
             * horrible loop at 3:.
             */
1:          mov         v3.8b, v4.8b            /* put y scaling coefficients somewhere handy */
            subs        x11, x11, #CHUNKSIZE
            bge         2f                      /* if at least CHUNKSIZE are available... */
            add         x11, x11, #CHUNKSIZE    /* if they're not... */
            b           4f
            /* ..just sneaking a literal in here after this unconditional branch.. */
8:          .hword      0, 1, 2, 3, 4, 5, 6, 7
            /* basic fill loop, processing 8 bytes at a time until there are
             * fewer than eight bytes available.
             */
3:          vert8
            sub         x11, x11, #8 / COMPONENT_COUNT
            st1         {v12.8h}, [x12], #16
4:          cmp         x11, #8 / COMPONENT_COUNT - 1
            bgt         3b
.if \comp == 4
            blt         3f
            /* The last pixel (four bytes) if necessary */
            vert4
.else
            cmp         x11, #1
            blt         3f
            /* The last pixels if necessary */
            sub         x4, x4, #8
            sub         x5, x5, #8
            sub         x6, x6, #8
            sub         x7, x7, #8
            add         x4, x4, x11, LSL #(COMPONENT_SHIFT)
            add         x5, x5, x11, LSL #(COMPONENT_SHIFT)
            add         x6, x6, x11, LSL #(COMPONENT_SHIFT)
            add         x7, x7, x11, LSL #(COMPONENT_SHIFT)
            vert8
            sub         x11, sp, x11, LSL #(COMPONENT_SHIFT + 1)
            sub         sp, sp, #32
            sub         x11, x11, #16
.if \comp == 1
            dup         v13.8h, v12.h[7]
.elseif \comp == 2
            dup         v13.4s, v12.s[3]
.endif
            st1         {v12.8h,v13.8h}, [sp]
            ld1         {v12.8h}, [x11]
            add         sp, sp, #32
            b           4f
.endif
            /* Keep filling until we get to the end of this chunk of the buffer */
3:
.if \comp == 1
            dup         v12.8h, v12.h[7]
.elseif \comp == 2
            dup         v12.4s, v12.s[3]
.elseif \comp == 4
            dup         v12.2d, v12.d[1]
.endif
4:          st1         {v12.8h}, [x12], #16
            tst         x12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
            bne         3b
            b           4f

.align 4
2:          /* Quickly pull a chunk of data into the working buffer.
             */
            vert8
            st1         {v12.8h}, [x12], #16
            vert8
            st1         {v12.8h}, [x12], #16
            tst         x12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
            bne         2b
            cmp         x11, #0
            bne         3f
4:          /* if we end with 0 pixels left we'll have nothing handy to spread
             * across to the right, so we rewind a bit.
             */
            mov         x11, #1
            sub         x4, x4, #COMPONENT_COUNT
            sub         x5, x5, #COMPONENT_COUNT
            sub         x6, x6, #COMPONENT_COUNT
            sub         x7, x7, #COMPONENT_COUNT
3:          /* copy four taps (width of cubic window) to far end for overflow
             * address handling
             */
            sub         x13, x12, #CHUNKSIZE * COMPONENT_COUNT * 2
            eor         x12, x13, #CHUNKSIZE * COMPONENT_COUNT * 2
.if \comp == 1
            ld1         {v14.4h}, [x13]
.elseif \comp == 2
            ld1         {v14.8h}, [x13]
.elseif \comp == 4
            ld1         {v14.8h,v15.8h}, [x13]
.endif
            add         x13, x12, #CHUNKSIZE * COMPONENT_COUNT * 2
.if \comp == 1
            st1         {v14.4h}, [x13]
.elseif \comp == 2
            st1         {v14.8h}, [x13]
.elseif \comp == 4
            st1         {v14.8h,v15.8h}, [x13]
.endif
            /* The high 32 bits of x10 contain the maximum possible iteration
             * count, but if x8 is greater than the low 32 bits of x10 then the
             * count must be reduced by one for this iteration to avoid reading
             * past the end of the available data.
             */
            sub         x13, x10, x8
            lsr         x13, x13, #32
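            /* i.e. x13 = (x10 >> 32) - (x8 > (x10 & 0xffffffff) ? 1 : 0),
             * relying on the borrow out of the low word (x8 stays well within
             * 32 bits here).
             */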

            madd        x8, x13, x9, x8
            sub         x8, x8, #(CHUNKSIZE << 16)

            /* prefer to count pixels, rather than vectors, to clarify the tail
             * store case on exit.
             */
            lsl         x13, x13, #VECSHIFT
            cmp         x13, x1
            csel        x13, x1, x13, gt

            sub         x1, x1, x13

            lsl         x13, x13, #COMPONENT_SHIFT

            mov         w14, #0x8000
            movi        v30.8h, #3
            dup         v31.8h, w14

            cmp         x13, #0
            bgt         3f
            cmp         x1, #0
            bgt         1b     /* an extreme case where we shouldn't use code in this structure */
            b           9f

            .align 4
2:          /* Inner loop continues here, but starts at 3:, see end of loop
             * below for explanation. */
.if LOOP_OUTPUT_SIZE == 4
            st1         {v8.s}[0], [x0], #4
.elseif LOOP_OUTPUT_SIZE == 8
            st1         {v8.8b}, [x0], #8
.elseif LOOP_OUTPUT_SIZE == 16
            st1         {v8.16b}, [x0], #16
.elseif LOOP_OUTPUT_SIZE == 32
            st1         {v8.16b,v9.16b}, [x0], #32
.endif
            /* Inner loop:  here the four x coefficients for each tap are
             * calculated in vector code, and the addresses are calculated in
             * scalar code, and these calculations are interleaved.
             */
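            /* For reference, with t the fractional x position these are the
             * usual Catmull-Rom bicubic weights (a sketch of the maths only,
             * not a statement about the exact fixed-point rounding below):
             *     w0 = (-t^3 + 2*t^2 - t) / 2
             *     w1 = (3*t^3 - 5*t^2 + 2) / 2
             *     w2 = (-3*t^3 + 4*t^2 + t) / 2
             *     w3 = (t^3 - t^2) / 2
             * v0-v3 below hold these in Q15, with the signs arranged to suit
             * the multiply-accumulate/subtract pattern that follows.
             */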
3:          ushr        v8.8h, v6.8h, #1            // sxf
            lsr         x14, x2, #(63 - CHUNKSHIFT)
            sqrdmulh    v9.8h, v8.8h, v8.8h         // sxf**2
            add         x2, x2, x3
            sqrdmulh    v10.8h, v9.8h, v8.8h        // sxf**3
            lsr         x15, x2, #(63 - CHUNKSHIFT)
            sshll       v11.4s, v9.4h, #2
            sshll2      v12.4s, v9.8h, #2
            add         x2, x2, x3
            smlsl       v11.4s, v10.4h, v30.4h
            smlsl2      v12.4s, v10.8h, v30.8h
            lsr         x16, x2, #(63 - CHUNKSHIFT)

            shadd       v0.8h, v10.8h, v8.8h
            add         x2, x2, x3
            sub         v0.8h, v9.8h, v0.8h
            lsr         x17, x2, #(63 - CHUNKSHIFT)

            saddw       v1.4s, v11.4s, v9.4h
            saddw2      v13.4s, v12.4s, v9.8h
            add         x2, x2, x3
            shrn        v1.4h, v1.4s, #1
            shrn2       v1.8h, v13.4s, #1
            add         x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
            sub         v1.8h, v1.8h, v31.8h
            add         x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)

            saddw       v2.4s, v11.4s, v8.4h
            saddw2      v13.4s, v12.4s, v8.8h
            add         x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
            shrn        v2.4h, v2.4s, #1
            shrn2       v2.8h, v13.4s, #1
            add         x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
            neg         v2.8h, v2.8h

            shsub       v3.8h, v10.8h, v9.8h

            /* increment the x fractional parts (overflow is ignored, as the
             * scalar arithmetic shadows this addition with full precision).
             */
            add         v6.8h, v6.8h, v7.8h

            /* At this point we have four pointers in x14-x17, pointing to the
             * four taps in the scratch buffer that must be convolved together
             * to produce an output pixel (one output pixel per pointer).
             * These pointers usually overlap, but their spacing is irregular
             * so resolving the redundancy through L1 is a pragmatic solution.
             *
             * The scratch buffer is made of signed 16-bit data, holding over
             * some extra precision, and overshoot, from the vertical pass.
             *
             * We also have the 16-bit unsigned fixed-point weights for each
             * of the four taps in v0 - v3.  That's eight pixels worth of
             * coefficients when we have only four pointers, so calculations
             * for four more pixels are interleaved with the fetch and permute
             * code for each variant in the following code.
             *
             * The data arrangement is less than ideal for any pixel format,
             * but permuting loads help to mitigate most of the problems.
             *
             * Note also that the two outside taps of a bicubic are negative,
             * but these coefficients are unsigned.  The sign is hard-coded by
             * use of multiply-and-subtract operations.
             */
.if \comp == 1
            /* The uchar1 case.
             * Issue one lanewise ld4.h to load four consecutive pixels from
             * one pointer (one pixel) into four different registers; then load
             * four consecutive s16 values from the next pointer (pixel) into
             * the next lane of those four registers, etc., so that we finish
             * with v12 - v15 representing the four taps, and each lane
             * representing a separate pixel.
             *
             * The first ld4 uses a splat to avoid any false dependency on
             * the previous state of the register.
             */
            ld4r        {v12.8h,v13.8h,v14.8h,v15.8h}, [x14]
            lsr         x14, x2, #(63 - CHUNKSHIFT)
            add         x2, x2, x3
            ld4         {v12.h,v13.h,v14.h,v15.h}[1], [x15]
            add         x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
            lsr         x15, x2, #(63 - CHUNKSHIFT)
            add         x2, x2, x3
            ld4         {v12.h,v13.h,v14.h,v15.h}[2], [x16]
            add         x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
            lsr         x16, x2, #(63 - CHUNKSHIFT)
            add         x2, x2, x3
            ld4         {v12.h,v13.h,v14.h,v15.h}[3], [x17]
            add         x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
            lsr         x17, x2, #(63 - CHUNKSHIFT)
            add         x2, x2, x3
            ld4         {v12.h,v13.h,v14.h,v15.h}[4], [x14]
            add         x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
            ld4         {v12.h,v13.h,v14.h,v15.h}[5], [x15]
            ld4         {v12.h,v13.h,v14.h,v15.h}[6], [x16]
            ld4         {v12.h,v13.h,v14.h,v15.h}[7], [x17]

            smull       v8.4s, v12.4h, v0.4h
            smull2      v9.4s, v12.8h, v0.8h
            smlsl       v8.4s, v13.4h, v1.4h
            smlsl2      v9.4s, v13.8h, v1.8h
            smlsl       v8.4s, v14.4h, v2.4h
            smlsl2      v9.4s, v14.8h, v2.8h
            smlal       v8.4s, v15.4h, v3.4h
            smlal2      v9.4s, v15.8h, v3.8h

            subs        x13, x13, #LOOP_OUTPUT_SIZE

            sqrshrn     v8.4h, v8.4s, #15
            sqrshrn2    v8.8h, v9.4s, #15

            sqrshrun    v8.8b, v8.8h, #VERTBITS - 8
.elseif \comp == 2
            /* The uchar2 case:
             * This time load pairs of values into adjacent lanes in v12 - v15
             * by aliasing them as u32 data; leaving room for only four pixels,
             * so the process has to be done twice.  This also means that the
             * coefficient registers fail to align with the coefficient data
             * (eight separate pixels), so that has to be doubled-up to match.
             */
            ld4r        {v12.4s,v13.4s,v14.4s,v15.4s}, [x14]
            lsr         x14, x2, #(63 - CHUNKSHIFT)
            add         x2, x2, x3
            ld4         {v12.s,v13.s,v14.s,v15.s}[1], [x15]
            add         x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
            lsr         x15, x2, #(63 - CHUNKSHIFT)
            add         x2, x2, x3
            ld4         {v12.s,v13.s,v14.s,v15.s}[2], [x16]
            add         x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
            lsr         x16, x2, #(63 - CHUNKSHIFT)
            add         x2, x2, x3
            ld4         {v12.s,v13.s,v14.s,v15.s}[3], [x17]
            add         x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
            lsr         x17, x2, #(63 - CHUNKSHIFT)
            add         x2, x2, x3

            /* double-up coefficients to align with component pairs */
            zip1        v16.8h, v0.8h, v0.8h
            add         x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
            zip1        v17.8h, v1.8h, v1.8h
            zip1        v18.8h, v2.8h, v2.8h
            zip1        v19.8h, v3.8h, v3.8h

            smull       v8.4s, v12.4h, v16.4h
            smull2      v9.4s, v12.8h, v16.8h
            smlsl       v8.4s, v13.4h, v17.4h
            smlsl2      v9.4s, v13.8h, v17.8h
            smlsl       v8.4s, v14.4h, v18.4h
            smlsl2      v9.4s, v14.8h, v18.8h
            smlal       v8.4s, v15.4h, v19.4h
            smlal2      v9.4s, v15.8h, v19.8h

            sqrshrn     v8.4h, v8.4s, #15
            sqrshrn2    v8.8h, v9.4s, #15

            ld4r        {v12.4s,v13.4s,v14.4s,v15.4s}, [x14]
            ld4         {v12.s,v13.s,v14.s,v15.s}[1], [x15]
            ld4         {v12.s,v13.s,v14.s,v15.s}[2], [x16]
            ld4         {v12.s,v13.s,v14.s,v15.s}[3], [x17]

            /* double-up coefficients to align with component pairs */
            zip2        v16.8h, v0.8h, v0.8h
            zip2        v17.8h, v1.8h, v1.8h
            zip2        v18.8h, v2.8h, v2.8h
            zip2        v19.8h, v3.8h, v3.8h

            smull       v10.4s, v12.4h, v16.4h
            smull2      v11.4s, v12.8h, v16.8h
            smlsl       v10.4s, v13.4h, v17.4h
            smlsl2      v11.4s, v13.8h, v17.8h
            smlsl       v10.4s, v14.4h, v18.4h
            smlsl2      v11.4s, v14.8h, v18.8h
            smlal       v10.4s, v15.4h, v19.4h
            smlal2      v11.4s, v15.8h, v19.8h

            subs        x13, x13, #LOOP_OUTPUT_SIZE

            sqrshrn     v9.4h, v10.4s, #15
            sqrshrn2    v9.8h, v11.4s, #15

            sqrshrun     v8.8b, v8.8h, #VERTBITS - 8
            sqrshrun2    v8.16b, v9.8h, #VERTBITS - 8
.elseif \comp == 4
            /* The uchar4 case.
             * This case is comparatively painless because four s16s are the
             * smallest addressable unit for a vmul-by-scalar.  Rather than
             * permute the data, simply arrange the multiplies to suit the way
             * the data comes in.  That's a lot of data, though, so things
             * progress in pairs of pixels at a time.
             */
            ld1         {v12.8h,v13.8h}, [x14]
            lsr         x14, x2, #(63 - CHUNKSHIFT)
            add         x2, x2, x3
            ld1         {v14.8h,v15.8h}, [x15]
            add         x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
            lsr         x15, x2, #(63 - CHUNKSHIFT)
            add         x2, x2, x3

            smull       v8.4s, v12.4h, v0.h[0]
            smull       v9.4s, v14.4h, v0.h[1]
            smlsl2      v8.4s, v12.8h, v1.h[0]
            smlsl2      v9.4s, v14.8h, v1.h[1]
            smlsl       v8.4s, v13.4h, v2.h[0]
            smlsl       v9.4s, v15.4h, v2.h[1]
            smlal2      v8.4s, v13.8h, v3.h[0]
            smlal2      v9.4s, v15.8h, v3.h[1]

            /* And two more...  */
            ld1         {v12.8h,v13.8h}, [x16]
            add         x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
            lsr         x16, x2, #(63 - CHUNKSHIFT)
            add         x2, x2, x3
            ld1         {v14.8h,v15.8h}, [x17]
            add         x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
            lsr         x17, x2, #(63 - CHUNKSHIFT)
            add         x2, x2, x3

            sqrshrn     v8.4h, v8.4s, #15
            add         x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
            sqrshrn2    v8.8h, v9.4s, #15

            smull       v10.4s, v12.4h, v0.h[2]
            smull       v11.4s, v14.4h, v0.h[3]
            smlsl2      v10.4s, v12.8h, v1.h[2]
            smlsl2      v11.4s, v14.8h, v1.h[3]
            smlsl       v10.4s, v13.4h, v2.h[2]
            smlsl       v11.4s, v15.4h, v2.h[3]
            smlal2      v10.4s, v13.8h, v3.h[2]
            smlal2      v11.4s, v15.8h, v3.h[3]

            sqrshrn     v9.4h, v10.4s, #15
            sqrshrn2    v9.8h, v11.4s, #15

            sqrshrun     v8.8b, v8.8h, #VERTBITS - 8
            sqrshrun2    v8.16b, v9.8h, #VERTBITS - 8

            /* And two more...  */
            ld1         {v12.8h,v13.8h}, [x14]
            ld1         {v14.8h,v15.8h}, [x15]

            smull       v10.4s, v12.4h, v0.h[4]
            smull       v11.4s, v14.4h, v0.h[5]
            smlsl2      v10.4s, v12.8h, v1.h[4]
            smlsl2      v11.4s, v14.8h, v1.h[5]
            smlsl       v10.4s, v13.4h, v2.h[4]
            smlsl       v11.4s, v15.4h, v2.h[5]
            smlal2      v10.4s, v13.8h, v3.h[4]
            smlal2      v11.4s, v15.8h, v3.h[5]

            /* And two more...  */
            ld1         {v12.8h,v13.8h}, [x16]
            ld1         {v14.8h,v15.8h}, [x17]

            subs        x13, x13, #LOOP_OUTPUT_SIZE

            sqrshrn     v9.4h, v10.4s, #15
            sqrshrn2    v9.8h, v11.4s, #15

            smull       v10.4s, v12.4h, v0.h[6]
            smull       v11.4s, v14.4h, v0.h[7]
            smlsl2      v10.4s, v12.8h, v1.h[6]
            smlsl2      v11.4s, v14.8h, v1.h[7]
            smlsl       v10.4s, v13.4h, v2.h[6]
            smlsl       v11.4s, v15.4h, v2.h[7]
            smlal2      v10.4s, v13.8h, v3.h[6]
            smlal2      v11.4s, v15.8h, v3.h[7]

            sqrshrn     v10.4h, v10.4s, #15
            sqrshrn2    v10.8h, v11.4s, #15

            sqrshrun     v9.8b, v9.8h, #VERTBITS - 8
            sqrshrun2    v9.16b, v10.8h, #VERTBITS - 8
.endif
            bgt         2b      /* continue inner loop */
            /* The inner loop has already been limited to ensure that none of
             * the earlier iterations could overfill the output, so the store
             * appears within the loop but after the conditional branch (at the
             * top).  At the end, provided it won't overfill, perform the final
             * store here.  If it would, then break out to the tricky tail case
             * instead.
             */
            blt         1f
            /* Store the amount of data appropriate to the configuration of the
             * instance being assembled.
             */
.if LOOP_OUTPUT_SIZE == 4
            st1         {v8.s}[0], [x0], #4
.elseif LOOP_OUTPUT_SIZE == 8
            st1         {v8.8b}, [x0], #8
.elseif LOOP_OUTPUT_SIZE == 16
            st1         {v8.16b}, [x0], #16
.elseif LOOP_OUTPUT_SIZE == 32
            st1         {v8.16b,v9.16b}, [x0], #32
.endif
            b           1b              /* resume outer loop */
            /* Partial tail store case:
             * Different versions of the code need different subsets of the
             * following partial stores.  Here the number of components and the
             * size of the chunk of data produced by each inner loop iteration
             * is tested to figure out whether or not each phrase is relevant.
             */
.if 16 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 16
1:          tst         x13, #16
            beq         1f
            st1         {v8.16b}, [x0], #16
            mov         v8.16b, v9.16b
.endif
.if 8 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 8
1:          tst         x13, #8
            beq         1f
            st1         {v8.8b}, [x0], #8
            ext         v8.16b, v8.16b, v8.16b, #8
.endif
.if 4 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 4
1:          tst         x13, #4
            beq         1f
            st1         {v8.s}[0], [x0], #4
            ext         v8.8b, v8.8b, v8.8b, #4
.endif
.if 2 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 2
1:          tst         x13, #2
            beq         1f
            st1         {v8.h}[0], [x0], #2
            ext         v8.8b, v8.8b, v8.8b, #2
.endif
.if 1 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 1
1:          tst         x13, #1
            beq         1f
            st1         {v8.b}[0], [x0], #1
.endif
1:
9:          mov         sp, x18
            ld1         {v8.1d - v11.1d}, [sp], #32
            ld1         {v12.1d - v15.1d}, [sp], #32
            ret
END(rsdIntrinsicResizeB\comp\()_K)
.endr
