1 /*
2  * Copyright 2012 The Android Open Source Project
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "SkBlitRow_opts_arm_neon.h"
9 
10 #include "SkBlitMask.h"
11 #include "SkBlitRow.h"
12 #include "SkColorPriv.h"
13 #include "SkDither.h"
14 #include "SkMathPriv.h"
15 #include "SkUtils.h"
16 
17 #include "SkColor_opts_neon.h"
18 #include <arm_neon.h>
19 
20 #ifdef SK_CPU_ARM64
sk_vld4_u8_arm64_3(const SkPMColor * SK_RESTRICT & src)21 static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) {
22     uint8x8x4_t vsrc;
23     uint8x8_t vsrc_0, vsrc_1, vsrc_2;
24 
25     asm (
26         "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
27         "mov    %[vsrc0].8b, v0.8b             \t\n"
28         "mov    %[vsrc1].8b, v1.8b             \t\n"
29         "mov    %[vsrc2].8b, v2.8b             \t\n"
30         : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
31           [vsrc2] "=w" (vsrc_2), [src] "+&r" (src)
32         : : "v0", "v1", "v2", "v3"
33     );
34 
35     vsrc.val[0] = vsrc_0;
36     vsrc.val[1] = vsrc_1;
37     vsrc.val[2] = vsrc_2;
38 
39     return vsrc;
40 }
41 
sk_vld4_u8_arm64_4(const SkPMColor * SK_RESTRICT & src)42 static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) {
43     uint8x8x4_t vsrc;
44     uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3;
45 
46     asm (
47         "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
48         "mov    %[vsrc0].8b, v0.8b             \t\n"
49         "mov    %[vsrc1].8b, v1.8b             \t\n"
50         "mov    %[vsrc2].8b, v2.8b             \t\n"
51         "mov    %[vsrc3].8b, v3.8b             \t\n"
52         : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
53           [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3),
54           [src] "+&r" (src)
55         : : "v0", "v1", "v2", "v3"
56     );
57 
58     vsrc.val[0] = vsrc_0;
59     vsrc.val[1] = vsrc_1;
60     vsrc.val[2] = vsrc_2;
61     vsrc.val[3] = vsrc_3;
62 
63     return vsrc;
64 }
65 #endif
66 
S32_D565_Opaque_neon(uint16_t * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha,int,int)67 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
68                            const SkPMColor* SK_RESTRICT src, int count,
69                            U8CPU alpha, int /*x*/, int /*y*/) {
70     SkASSERT(255 == alpha);
71 
72     while (count >= 8) {
73         uint8x8x4_t vsrc;
74         uint16x8_t vdst;
75 
76         // Load
77 #ifdef SK_CPU_ARM64
78         vsrc = sk_vld4_u8_arm64_3(src);
79 #else
80         vsrc = vld4_u8((uint8_t*)src);
81         src += 8;
82 #endif
83 
84         // Convert src to 565
85         vdst = SkPixel32ToPixel16_neon8(vsrc);
86 
87         // Store
88         vst1q_u16(dst, vdst);
89 
90         // Prepare next iteration
91         dst += 8;
92         count -= 8;
93     };
94 
95     // Leftovers
96     while (count > 0) {
97         SkPMColor c = *src++;
98         SkPMColorAssert(c);
99         *dst = SkPixel32ToPixel16_ToU16(c);
100         dst++;
101         count--;
102     };
103 }
104 
S32_D565_Blend_neon(uint16_t * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha,int,int)105 void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
106                           const SkPMColor* SK_RESTRICT src, int count,
107                           U8CPU alpha, int /*x*/, int /*y*/) {
108     SkASSERT(255 > alpha);
109 
110     uint16x8_t vmask_blue, vscale;
111 
112     // prepare constants
113     vscale = vdupq_n_u16(SkAlpha255To256(alpha));
114     vmask_blue = vmovq_n_u16(0x1F);
115 
116     while (count >= 8) {
117         uint8x8x4_t vsrc;
118         uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
119         uint16x8_t vres_r, vres_g, vres_b;
120 
121         // Load src
122 #ifdef SK_CPU_ARM64
123         vsrc = sk_vld4_u8_arm64_3(src);
124 #else
125         {
126         register uint8x8_t d0 asm("d0");
127         register uint8x8_t d1 asm("d1");
128         register uint8x8_t d2 asm("d2");
129         register uint8x8_t d3 asm("d3");
130 
131         asm (
132             "vld4.8    {d0-d3},[%[src]]!"
133             : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
134             :
135         );
136         vsrc.val[0] = d0;
137         vsrc.val[1] = d1;
138         vsrc.val[2] = d2;
139         }
140 #endif
141 
142         // Load and unpack dst
143         vdst = vld1q_u16(dst);
144         vdst_g = vshlq_n_u16(vdst, 5);        // shift green to top of lanes
145         vdst_b = vandq_u16(vdst, vmask_blue); // extract blue
146         vdst_r = vshrq_n_u16(vdst, 6+5);      // extract red
147         vdst_g = vshrq_n_u16(vdst_g, 5+5);    // extract green
148 
149         // Shift src to 565 range
150         vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3);
151         vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2);
152         vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3);
153 
154         // Scale src - dst
155         vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r;
156         vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g;
157         vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b;
158 
159         vres_r = vshrq_n_u16(vres_r * vscale, 8);
160         vres_g = vshrq_n_u16(vres_g * vscale, 8);
161         vres_b = vshrq_n_u16(vres_b * vscale, 8);
162 
163         vres_r += vdst_r;
164         vres_g += vdst_g;
165         vres_b += vdst_b;
166 
167         // Combine
168         vres_b = vsliq_n_u16(vres_b, vres_g, 5);    // insert green into blue
169         vres_b = vsliq_n_u16(vres_b, vres_r, 6+5);  // insert red into green/blue
170 
171         // Store
172         vst1q_u16(dst, vres_b);
173         dst += 8;
174         count -= 8;
175     }
176     if (count > 0) {
177         int scale = SkAlpha255To256(alpha);
178         do {
179             SkPMColor c = *src++;
180             SkPMColorAssert(c);
181             uint16_t d = *dst;
182             *dst++ = SkPackRGB16(
183                     SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale),
184                     SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale),
185                     SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale));
186         } while (--count != 0);
187     }
188 }
189 
190 #ifdef SK_CPU_ARM32
S32A_D565_Opaque_neon(uint16_t * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha,int,int)191 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
192                            const SkPMColor* SK_RESTRICT src, int count,
193                            U8CPU alpha, int /*x*/, int /*y*/) {
194     SkASSERT(255 == alpha);
195 
196     if (count >= 8) {
197         uint16_t* SK_RESTRICT keep_dst = 0;
198 
199         asm volatile (
200                       "ands       ip, %[count], #7            \n\t"
201                       "vmov.u8    d31, #1<<7                  \n\t"
202                       "vld1.16    {q12}, [%[dst]]             \n\t"
203                       "vld4.8     {d0-d3}, [%[src]]           \n\t"
204                       // Thumb does not support the standard ARM conditional
205                       // instructions but instead requires the 'it' instruction
206                       // to signal conditional execution
207                       "it eq                                  \n\t"
208                       "moveq      ip, #8                      \n\t"
209                       "mov        %[keep_dst], %[dst]         \n\t"
210 
211                       "add        %[src], %[src], ip, LSL#2   \n\t"
212                       "add        %[dst], %[dst], ip, LSL#1   \n\t"
213                       "subs       %[count], %[count], ip      \n\t"
214                       "b          9f                          \n\t"
215                       // LOOP
216                       "2:                                         \n\t"
217 
218                       "vld1.16    {q12}, [%[dst]]!            \n\t"
219                       "vld4.8     {d0-d3}, [%[src]]!          \n\t"
220                       "vst1.16    {q10}, [%[keep_dst]]        \n\t"
221                       "sub        %[keep_dst], %[dst], #8*2   \n\t"
222                       "subs       %[count], %[count], #8      \n\t"
223                       "9:                                         \n\t"
224                       "pld        [%[dst],#32]                \n\t"
225                       // expand 0565 q12 to 8888 {d4-d7}
226                       "vmovn.u16  d4, q12                     \n\t"
227                       "vshr.u16   q11, q12, #5                \n\t"
228                       "vshr.u16   q10, q12, #6+5              \n\t"
229                       "vmovn.u16  d5, q11                     \n\t"
230                       "vmovn.u16  d6, q10                     \n\t"
231                       "vshl.u8    d4, d4, #3                  \n\t"
232                       "vshl.u8    d5, d5, #2                  \n\t"
233                       "vshl.u8    d6, d6, #3                  \n\t"
234 
235                       "vmovl.u8   q14, d31                    \n\t"
236                       "vmovl.u8   q13, d31                    \n\t"
237                       "vmovl.u8   q12, d31                    \n\t"
238 
239                       // duplicate in 4/2/1 & 8pix vsns
240                       "vmvn.8     d30, d3                     \n\t"
241                       "vmlal.u8   q14, d30, d6                \n\t"
242                       "vmlal.u8   q13, d30, d5                \n\t"
243                       "vmlal.u8   q12, d30, d4                \n\t"
244                       "vshr.u16   q8, q14, #5                 \n\t"
245                       "vshr.u16   q9, q13, #6                 \n\t"
246                       "vaddhn.u16 d6, q14, q8                 \n\t"
247                       "vshr.u16   q8, q12, #5                 \n\t"
248                       "vaddhn.u16 d5, q13, q9                 \n\t"
249                       "vaddhn.u16 d4, q12, q8                 \n\t"
250                       // intentionally don't calculate alpha
251                       // result in d4-d6
252 
253             #ifdef SK_PMCOLOR_IS_RGBA
254                       "vqadd.u8   d6, d6, d0                  \n\t"
255                       "vqadd.u8   d5, d5, d1                  \n\t"
256                       "vqadd.u8   d4, d4, d2                  \n\t"
257             #else
258                       "vqadd.u8   d6, d6, d2                  \n\t"
259                       "vqadd.u8   d5, d5, d1                  \n\t"
260                       "vqadd.u8   d4, d4, d0                  \n\t"
261             #endif
262 
263                       // pack 8888 {d4-d6} to 0565 q10
264                       "vshll.u8   q10, d6, #8                 \n\t"
265                       "vshll.u8   q3, d5, #8                  \n\t"
266                       "vshll.u8   q2, d4, #8                  \n\t"
267                       "vsri.u16   q10, q3, #5                 \n\t"
268                       "vsri.u16   q10, q2, #11                \n\t"
269 
270                       "bne        2b                          \n\t"
271 
272                       "1:                                         \n\t"
273                       "vst1.16      {q10}, [%[keep_dst]]      \n\t"
274                       : [count] "+r" (count)
275                       : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
276                       : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
277                       "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
278                       "d30","d31"
279                       );
280     }
281     else
282     {   // handle count < 8
283         uint16_t* SK_RESTRICT keep_dst = 0;
284 
285         asm volatile (
286                       "vmov.u8    d31, #1<<7                  \n\t"
287                       "mov        %[keep_dst], %[dst]         \n\t"
288 
289                       "tst        %[count], #4                \n\t"
290                       "beq        14f                         \n\t"
291                       "vld1.16    {d25}, [%[dst]]!            \n\t"
292                       "vld1.32    {q1}, [%[src]]!             \n\t"
293 
294                       "14:                                        \n\t"
295                       "tst        %[count], #2                \n\t"
296                       "beq        12f                         \n\t"
297                       "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
298                       "vld1.32    {d1}, [%[src]]!             \n\t"
299 
300                       "12:                                        \n\t"
301                       "tst        %[count], #1                \n\t"
302                       "beq        11f                         \n\t"
303                       "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
304                       "vld1.32    {d0[1]}, [%[src]]!          \n\t"
305 
306                       "11:                                        \n\t"
307                       // unzips achieve the same as a vld4 operation
308                       "vuzp.u16   q0, q1                      \n\t"
309                       "vuzp.u8    d0, d1                      \n\t"
310                       "vuzp.u8    d2, d3                      \n\t"
311                       // expand 0565 q12 to 8888 {d4-d7}
312                       "vmovn.u16  d4, q12                     \n\t"
313                       "vshr.u16   q11, q12, #5                \n\t"
314                       "vshr.u16   q10, q12, #6+5              \n\t"
315                       "vmovn.u16  d5, q11                     \n\t"
316                       "vmovn.u16  d6, q10                     \n\t"
317                       "vshl.u8    d4, d4, #3                  \n\t"
318                       "vshl.u8    d5, d5, #2                  \n\t"
319                       "vshl.u8    d6, d6, #3                  \n\t"
320 
321                       "vmovl.u8   q14, d31                    \n\t"
322                       "vmovl.u8   q13, d31                    \n\t"
323                       "vmovl.u8   q12, d31                    \n\t"
324 
325                       // duplicate in 4/2/1 & 8pix vsns
326                       "vmvn.8     d30, d3                     \n\t"
327                       "vmlal.u8   q14, d30, d6                \n\t"
328                       "vmlal.u8   q13, d30, d5                \n\t"
329                       "vmlal.u8   q12, d30, d4                \n\t"
330                       "vshr.u16   q8, q14, #5                 \n\t"
331                       "vshr.u16   q9, q13, #6                 \n\t"
332                       "vaddhn.u16 d6, q14, q8                 \n\t"
333                       "vshr.u16   q8, q12, #5                 \n\t"
334                       "vaddhn.u16 d5, q13, q9                 \n\t"
335                       "vaddhn.u16 d4, q12, q8                 \n\t"
336                       // intentionally don't calculate alpha
337                       // result in d4-d6
338 
339             #ifdef SK_PMCOLOR_IS_RGBA
340                       "vqadd.u8   d6, d6, d0                  \n\t"
341                       "vqadd.u8   d5, d5, d1                  \n\t"
342                       "vqadd.u8   d4, d4, d2                  \n\t"
343             #else
344                       "vqadd.u8   d6, d6, d2                  \n\t"
345                       "vqadd.u8   d5, d5, d1                  \n\t"
346                       "vqadd.u8   d4, d4, d0                  \n\t"
347             #endif
348 
349                       // pack 8888 {d4-d6} to 0565 q10
350                       "vshll.u8   q10, d6, #8                 \n\t"
351                       "vshll.u8   q3, d5, #8                  \n\t"
352                       "vshll.u8   q2, d4, #8                  \n\t"
353                       "vsri.u16   q10, q3, #5                 \n\t"
354                       "vsri.u16   q10, q2, #11                \n\t"
355 
356                       // store
357                       "tst        %[count], #4                \n\t"
358                       "beq        24f                         \n\t"
359                       "vst1.16    {d21}, [%[keep_dst]]!       \n\t"
360 
361                       "24:                                        \n\t"
362                       "tst        %[count], #2                \n\t"
363                       "beq        22f                         \n\t"
364                       "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"
365 
366                       "22:                                        \n\t"
367                       "tst        %[count], #1                \n\t"
368                       "beq        21f                         \n\t"
369                       "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"
370 
371                       "21:                                        \n\t"
372                       : [count] "+r" (count)
373                       : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
374                       : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
375                       "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
376                       "d30","d31"
377                       );
378     }
379 }
380 
381 #else // #ifdef SK_CPU_ARM32
382 
S32A_D565_Opaque_neon(uint16_t * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha,int,int)383 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
384                            const SkPMColor* SK_RESTRICT src, int count,
385                            U8CPU alpha, int /*x*/, int /*y*/) {
386     SkASSERT(255 == alpha);
387 
388     if (count >= 16) {
389         asm (
390             "movi    v4.8h, #0x80                   \t\n"
391 
392             "1:                                     \t\n"
393             "sub     %w[count], %w[count], #16      \t\n"
394             "ld1     {v16.8h-v17.8h}, [%[dst]]      \t\n"
395             "ld4     {v0.16b-v3.16b}, [%[src]], #64 \t\n"
396             "prfm    pldl1keep, [%[src],#512]       \t\n"
397             "prfm    pldl1keep, [%[dst],#256]       \t\n"
398             "ushr    v20.8h, v17.8h, #5             \t\n"
399             "ushr    v31.8h, v16.8h, #5             \t\n"
400             "xtn     v6.8b, v31.8h                  \t\n"
401             "xtn2    v6.16b, v20.8h                 \t\n"
402             "ushr    v20.8h, v17.8h, #11            \t\n"
403             "shl     v19.16b, v6.16b, #2            \t\n"
404             "ushr    v31.8h, v16.8h, #11            \t\n"
405             "xtn     v22.8b, v31.8h                 \t\n"
406             "xtn2    v22.16b, v20.8h                \t\n"
407             "shl     v18.16b, v22.16b, #3           \t\n"
408             "mvn     v3.16b, v3.16b                 \t\n"
409             "xtn     v16.8b, v16.8h                 \t\n"
410             "mov     v7.16b, v4.16b                 \t\n"
411             "xtn2    v16.16b, v17.8h                \t\n"
412             "umlal   v7.8h, v3.8b, v19.8b           \t\n"
413             "shl     v16.16b, v16.16b, #3           \t\n"
414             "mov     v22.16b, v4.16b                \t\n"
415             "ushr    v24.8h, v7.8h, #6              \t\n"
416             "umlal   v22.8h, v3.8b, v18.8b          \t\n"
417             "ushr    v20.8h, v22.8h, #5             \t\n"
418             "addhn   v20.8b, v22.8h, v20.8h         \t\n"
419             "cmp     %w[count], #16                 \t\n"
420             "mov     v6.16b, v4.16b                 \t\n"
421             "mov     v5.16b, v4.16b                 \t\n"
422             "umlal   v6.8h, v3.8b, v16.8b           \t\n"
423             "umlal2  v5.8h, v3.16b, v19.16b         \t\n"
424             "mov     v17.16b, v4.16b                \t\n"
425             "ushr    v19.8h, v6.8h, #5              \t\n"
426             "umlal2  v17.8h, v3.16b, v18.16b        \t\n"
427             "addhn   v7.8b, v7.8h, v24.8h           \t\n"
428             "ushr    v18.8h, v5.8h, #6              \t\n"
429             "ushr    v21.8h, v17.8h, #5             \t\n"
430             "addhn2  v7.16b, v5.8h, v18.8h          \t\n"
431             "addhn2  v20.16b, v17.8h, v21.8h        \t\n"
432             "mov     v22.16b, v4.16b                \t\n"
433             "addhn   v6.8b, v6.8h, v19.8h           \t\n"
434             "umlal2  v22.8h, v3.16b, v16.16b        \t\n"
435             "ushr    v5.8h, v22.8h, #5              \t\n"
436             "addhn2  v6.16b, v22.8h, v5.8h          \t\n"
437             "uqadd   v7.16b, v1.16b, v7.16b         \t\n"
438 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
439             "uqadd   v20.16b, v2.16b, v20.16b       \t\n"
440             "uqadd   v6.16b, v0.16b, v6.16b         \t\n"
441 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
442             "uqadd   v20.16b, v0.16b, v20.16b       \t\n"
443             "uqadd   v6.16b, v2.16b, v6.16b         \t\n"
444 #else
445 #error "This function only supports BGRA and RGBA."
446 #endif
447             "shll    v22.8h, v20.8b, #8             \t\n"
448             "shll    v5.8h, v7.8b, #8               \t\n"
449             "sri     v22.8h, v5.8h, #5              \t\n"
450             "shll    v17.8h, v6.8b, #8              \t\n"
451             "shll2   v23.8h, v20.16b, #8            \t\n"
452             "shll2   v7.8h, v7.16b, #8              \t\n"
453             "sri     v22.8h, v17.8h, #11            \t\n"
454             "sri     v23.8h, v7.8h, #5              \t\n"
455             "shll2   v6.8h, v6.16b, #8              \t\n"
456             "st1     {v22.8h}, [%[dst]], #16        \t\n"
457             "sri     v23.8h, v6.8h, #11             \t\n"
458             "st1     {v23.8h}, [%[dst]], #16        \t\n"
459             "b.ge    1b                             \t\n"
460             : [dst] "+&r" (dst), [src] "+&r" (src), [count] "+&r" (count)
461             :: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
462                "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
463                "v31"
464         );
465     }
466         // Leftovers
467     if (count > 0) {
468         do {
469             SkPMColor c = *src++;
470             SkPMColorAssert(c);
471             if (c) {
472                 *dst = SkSrcOver32To16(c, *dst);
473             }
474             dst += 1;
475         } while (--count != 0);
476     }
477 }
478 #endif // #ifdef SK_CPU_ARM32
479 
pmcolor_to_expand16(SkPMColor c)480 static uint32_t pmcolor_to_expand16(SkPMColor c) {
481     unsigned r = SkGetPackedR32(c);
482     unsigned g = SkGetPackedG32(c);
483     unsigned b = SkGetPackedB32(c);
484     return (g << 24) | (r << 13) | (b << 2);
485 }
486 
Color32A_D565_neon(uint16_t dst[],SkPMColor src,int count,int x,int y)487 void Color32A_D565_neon(uint16_t dst[], SkPMColor src, int count, int x, int y) {
488     uint32_t src_expand;
489     unsigned scale;
490     uint16x8_t vmask_blue;
491 
492     if (count <= 0) return;
493     SkASSERT(((size_t)dst & 0x01) == 0);
494 
495     /*
496      * This preamble code is in order to make dst aligned to 8 bytes
497      * in the next mutiple bytes read & write access.
498      */
499     src_expand = pmcolor_to_expand16(src);
500     scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
501 
502 #define DST_ALIGN 8
503 
504     /*
505      * preamble_size is in byte, meantime, this blend32_16_row_neon updates 2 bytes at a time.
506      */
507     int preamble_size = (DST_ALIGN - (size_t)dst) & (DST_ALIGN - 1);
508 
509     for (int i = 0; i < preamble_size; i+=2, dst++) {
510         uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
511         *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
512         if (--count == 0)
513             break;
514     }
515 
516     int count16 = 0;
517     count16 = count >> 4;
518     vmask_blue = vmovq_n_u16(SK_B16_MASK);
519 
520     if (count16) {
521         uint16x8_t wide_sr;
522         uint16x8_t wide_sg;
523         uint16x8_t wide_sb;
524         uint16x8_t wide_256_sa;
525 
526         unsigned sr = SkGetPackedR32(src);
527         unsigned sg = SkGetPackedG32(src);
528         unsigned sb = SkGetPackedB32(src);
529         unsigned sa = SkGetPackedA32(src);
530 
531         // Operation: dst_rgb = src_rgb + ((256 - src_a) >> 3) x dst_rgb
532         // sr: 8-bit based, dr: 5-bit based, with dr x ((256-sa)>>3), 5-bit left shifted,
533         //thus, for sr, do 2-bit left shift to match MSB : (8 + 2 = 5 + 5)
534         wide_sr = vshlq_n_u16(vmovl_u8(vdup_n_u8(sr)), 2); // widen and src_red shift
535 
536         // sg: 8-bit based, dg: 6-bit based, with dg x ((256-sa)>>3), 5-bit left shifted,
537         //thus, for sg, do 3-bit left shift to match MSB : (8 + 3 = 6 + 5)
538         wide_sg = vshlq_n_u16(vmovl_u8(vdup_n_u8(sg)), 3); // widen and src_grn shift
539 
540         // sb: 8-bit based, db: 5-bit based, with db x ((256-sa)>>3), 5-bit left shifted,
541         //thus, for sb, do 2-bit left shift to match MSB : (8 + 2 = 5 + 5)
542         wide_sb = vshlq_n_u16(vmovl_u8(vdup_n_u8(sb)), 2); // widen and src blu shift
543 
544         wide_256_sa =
545             vshrq_n_u16(vsubw_u8(vdupq_n_u16(256), vdup_n_u8(sa)), 3); // (256 - sa) >> 3
546 
547         while (count16-- > 0) {
548             uint16x8_t vdst1, vdst1_r, vdst1_g, vdst1_b;
549             uint16x8_t vdst2, vdst2_r, vdst2_g, vdst2_b;
550             vdst1 = vld1q_u16(dst);
551             dst += 8;
552             vdst2 = vld1q_u16(dst);
553             dst -= 8;    //to store dst again.
554 
555             vdst1_g = vshlq_n_u16(vdst1, SK_R16_BITS);                 // shift green to top of lanes
556             vdst1_b = vdst1 & vmask_blue;                              // extract blue
557             vdst1_r = vshrq_n_u16(vdst1, SK_R16_SHIFT);                // extract red
558             vdst1_g = vshrq_n_u16(vdst1_g, SK_R16_BITS + SK_B16_BITS); // extract green
559 
560             vdst2_g = vshlq_n_u16(vdst2, SK_R16_BITS);                 // shift green to top of lanes
561             vdst2_b = vdst2 & vmask_blue;                              // extract blue
562             vdst2_r = vshrq_n_u16(vdst2, SK_R16_SHIFT);                // extract red
563             vdst2_g = vshrq_n_u16(vdst2_g, SK_R16_BITS + SK_B16_BITS); // extract green
564 
565             vdst1_r = vmlaq_u16(wide_sr, wide_256_sa, vdst1_r);        // sr + (256-sa) x dr1
566             vdst1_g = vmlaq_u16(wide_sg, wide_256_sa, vdst1_g);        // sg + (256-sa) x dg1
567             vdst1_b = vmlaq_u16(wide_sb, wide_256_sa, vdst1_b);        // sb + (256-sa) x db1
568 
569             vdst2_r = vmlaq_u16(wide_sr, wide_256_sa, vdst2_r);        // sr + (256-sa) x dr2
570             vdst2_g = vmlaq_u16(wide_sg, wide_256_sa, vdst2_g);        // sg + (256-sa) x dg2
571             vdst2_b = vmlaq_u16(wide_sb, wide_256_sa, vdst2_b);        // sb + (256-sa) x db2
572 
573             vdst1_r = vshrq_n_u16(vdst1_r, 5);                         // 5-bit right shift for 5-bit red
574             vdst1_g = vshrq_n_u16(vdst1_g, 5);                         // 5-bit right shift for 6-bit green
575             vdst1_b = vshrq_n_u16(vdst1_b, 5);                         // 5-bit right shift for 5-bit blue
576 
577             vdst1 = vsliq_n_u16(vdst1_b, vdst1_g, SK_G16_SHIFT);       // insert green into blue
578             vdst1 = vsliq_n_u16(vdst1, vdst1_r, SK_R16_SHIFT);         // insert red into green/blue
579 
580             vdst2_r = vshrq_n_u16(vdst2_r, 5);                         // 5-bit right shift for 5-bit red
581             vdst2_g = vshrq_n_u16(vdst2_g, 5);                         // 5-bit right shift for 6-bit green
582             vdst2_b = vshrq_n_u16(vdst2_b, 5);                         // 5-bit right shift for 5-bit blue
583 
584             vdst2 = vsliq_n_u16(vdst2_b, vdst2_g, SK_G16_SHIFT);       // insert green into blue
585             vdst2 = vsliq_n_u16(vdst2, vdst2_r, SK_R16_SHIFT);         // insert red into green/blue
586 
587             vst1q_u16(dst, vdst1);
588             dst += 8;
589             vst1q_u16(dst, vdst2);
590             dst += 8;
591         }
592     }
593 
594     count &= 0xF;
595     if (count > 0) {
596         do {
597             uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
598             *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
599             dst += 1;
600         } while (--count != 0);
601     }
602 }
603 
SkDiv255Round_neon8(uint16x8_t prod)604 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
605     prod += vdupq_n_u16(128);
606     prod += vshrq_n_u16(prod, 8);
607     return vshrq_n_u16(prod, 8);
608 }
609 
S32A_D565_Blend_neon(uint16_t * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha,int,int)610 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
611                           const SkPMColor* SK_RESTRICT src, int count,
612                           U8CPU alpha, int /*x*/, int /*y*/) {
613    SkASSERT(255 > alpha);
614 
615     /* This code implements a Neon version of S32A_D565_Blend. The results have
616      * a few mismatches compared to the original code. These mismatches never
617      * exceed 1.
618      */
619 
620     if (count >= 8) {
621         uint16x8_t valpha_max, vmask_blue;
622         uint8x8_t valpha;
623 
624         // prepare constants
625         valpha_max = vmovq_n_u16(255);
626         valpha = vdup_n_u8(alpha);
627         vmask_blue = vmovq_n_u16(SK_B16_MASK);
628 
629         do {
630             uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
631             uint16x8_t vres_a, vres_r, vres_g, vres_b;
632             uint8x8x4_t vsrc;
633 
634             // load pixels
635             vdst = vld1q_u16(dst);
636 #ifdef SK_CPU_ARM64
637             vsrc = sk_vld4_u8_arm64_4(src);
638 #elif (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
639             asm (
640                 "vld4.u8 %h[vsrc], [%[src]]!"
641                 : [vsrc] "=w" (vsrc), [src] "+&r" (src)
642                 : :
643             );
644 #else
645             register uint8x8_t d0 asm("d0");
646             register uint8x8_t d1 asm("d1");
647             register uint8x8_t d2 asm("d2");
648             register uint8x8_t d3 asm("d3");
649 
650             asm volatile (
651                 "vld4.u8    {d0-d3},[%[src]]!;"
652                 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
653                   [src] "+&r" (src)
654                 : :
655             );
656             vsrc.val[0] = d0;
657             vsrc.val[1] = d1;
658             vsrc.val[2] = d2;
659             vsrc.val[3] = d3;
660 #endif
661 
662 
663             // deinterleave dst
664             vdst_g = vshlq_n_u16(vdst, SK_R16_BITS);        // shift green to top of lanes
665             vdst_b = vdst & vmask_blue;                     // extract blue
666             vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT);       // extract red
667             vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green
668 
669             // shift src to 565
670             vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
671             vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
672             vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);
673 
674             // calc src * src_scale
675             vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
676             vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
677             vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
678             vres_b = vmull_u8(vsrc.val[NEON_B], valpha);
679 
680             // prepare dst_scale
681             vres_a = SkDiv255Round_neon8(vres_a);
682             vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255
683 
684             // add dst * dst_scale to previous result
685             vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
686             vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
687             vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);
688 
689 #ifdef S32A_D565_BLEND_EXACT
690             // It is possible to get exact results with this but it is slow,
691             // even slower than C code in some cases
692             vres_r = SkDiv255Round_neon8(vres_r);
693             vres_g = SkDiv255Round_neon8(vres_g);
694             vres_b = SkDiv255Round_neon8(vres_b);
695 #else
696             vres_r = vrshrq_n_u16(vres_r, 8);
697             vres_g = vrshrq_n_u16(vres_g, 8);
698             vres_b = vrshrq_n_u16(vres_b, 8);
699 #endif
700             // pack result
701             vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
702             vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue
703 
704             // store
705             vst1q_u16(dst, vres_b);
706             dst += 8;
707             count -= 8;
708         } while (count >= 8);
709     }
710 
711     // leftovers
712     while (count-- > 0) {
713         SkPMColor sc = *src++;
714         if (sc) {
715             uint16_t dc = *dst;
716             unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
717             unsigned dr = (SkPacked32ToR16(sc) * alpha) + (SkGetPackedR16(dc) * dst_scale);
718             unsigned dg = (SkPacked32ToG16(sc) * alpha) + (SkGetPackedG16(dc) * dst_scale);
719             unsigned db = (SkPacked32ToB16(sc) * alpha) + (SkGetPackedB16(dc) * dst_scale);
720             *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
721         }
722         dst += 1;
723     }
724 }
725 
/* Dither matrix for NEON, derived from gDitherMatrix_3Bit_16.
 *
 * The table holds 4 rows of 12 bytes (48 bytes in total). Each row is a
 * 4-entry dither pattern, one value per byte, repeated three times. The
 * repetition lets a single 8-byte load start at offset 0, 1, 2 or 3 from the
 * beginning of a row and still stay inside that row (3 + 8 <= 12).
 *
 * Users in this file index it as gDitherMatrix_Neon[(y & 3) * 12 + (x & 3)].
 */
static const uint8_t gDitherMatrix_Neon[48] = {
    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,

};
738 
S32_D565_Blend_Dither_neon(uint16_t * dst,const SkPMColor * src,int count,U8CPU alpha,int x,int y)739 void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
740                                 int count, U8CPU alpha, int x, int y)
741 {
742 
743     SkASSERT(255 > alpha);
744 
745     // rescale alpha to range 1 - 256
746     int scale = SkAlpha255To256(alpha);
747 
748     if (count >= 8) {
749         /* select row and offset for dither array */
750         const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
751 
752         uint8x8_t vdither = vld1_u8(dstart);         // load dither values
753         uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values
754 
755         int16x8_t vscale = vdupq_n_s16(scale);        // duplicate scale into neon reg
756         uint16x8_t vmask_b = vdupq_n_u16(0x1F);         // set up blue mask
757 
758         do {
759 
760             uint8x8x4_t vsrc;
761             uint8x8_t vsrc_r, vsrc_g, vsrc_b;
762             uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
763             uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
764             uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
765             uint16x8_t vdst;
766             uint16x8_t vdst_r, vdst_g, vdst_b;
767             int16x8_t vres_r, vres_g, vres_b;
768             int8x8_t vres8_r, vres8_g, vres8_b;
769 
770             // Load source and add dither
771 #ifdef SK_CPU_ARM64
772             vsrc = sk_vld4_u8_arm64_3(src);
773 #else
774             {
775             register uint8x8_t d0 asm("d0");
776             register uint8x8_t d1 asm("d1");
777             register uint8x8_t d2 asm("d2");
778             register uint8x8_t d3 asm("d3");
779 
780             asm (
781                 "vld4.8    {d0-d3},[%[src]]! "
782                 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
783                 :
784             );
785             vsrc.val[0] = d0;
786             vsrc.val[1] = d1;
787             vsrc.val[2] = d2;
788             }
789 #endif
790             vsrc_r = vsrc.val[NEON_R];
791             vsrc_g = vsrc.val[NEON_G];
792             vsrc_b = vsrc.val[NEON_B];
793 
794             vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
795             vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
796             vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5
797 
798             vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
799             vsrc_dit_r = vaddl_u8(vsrc_r, vdither);   // add in dither to red and widen
800             vsrc_dit_b = vaddl_u8(vsrc_b, vdither);   // add in dither to blue and widen
801 
802             vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r);  // sub shifted red from result
803             vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g);  // sub shifted green from result
804             vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b);  // sub shifted blue from result
805 
806             vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
807             vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
808             vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);
809 
810             // Load dst and unpack
811             vdst = vld1q_u16(dst);
812             vdst_g = vshrq_n_u16(vdst, 5);                   // shift down to get green
813             vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
814             vdst_b = vandq_u16(vdst, vmask_b);               // mask to get blue
815 
816             // subtract dst from src and widen
817             vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
818             vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
819             vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));
820 
821             // multiply diffs by scale and shift
822             vres_r = vmulq_s16(vres_r, vscale);
823             vres_g = vmulq_s16(vres_g, vscale);
824             vres_b = vmulq_s16(vres_b, vscale);
825 
826             vres8_r = vshrn_n_s16(vres_r, 8);
827             vres8_g = vshrn_n_s16(vres_g, 8);
828             vres8_b = vshrn_n_s16(vres_b, 8);
829 
830             // add dst to result
831             vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
832             vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
833             vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);
834 
835             // put result into 565 format
836             vres_b = vsliq_n_s16(vres_b, vres_g, 5);   // shift up green and insert into blue
837             vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue
838 
839             // Store result
840             vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));
841 
842             // Next iteration
843             dst += 8;
844             count -= 8;
845 
846         } while (count >= 8);
847     }
848 
849     // Leftovers
850     if (count > 0) {
851         int scale = SkAlpha255To256(alpha);
852         DITHER_565_SCAN(y);
853         do {
854             SkPMColor c = *src++;
855             SkPMColorAssert(c);
856 
857             int dither = DITHER_VALUE(x);
858             int sr = SkGetPackedR32(c);
859             int sg = SkGetPackedG32(c);
860             int sb = SkGetPackedB32(c);
861             sr = SkDITHER_R32To565(sr, dither);
862             sg = SkDITHER_G32To565(sg, dither);
863             sb = SkDITHER_B32To565(sb, dither);
864 
865             uint16_t d = *dst;
866             *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
867                                  SkAlphaBlend(sg, SkGetPackedG16(d), scale),
868                                  SkAlphaBlend(sb, SkGetPackedB16(d), scale));
869             DITHER_INC_X(x);
870         } while (--count != 0);
871     }
872 }
873 
S32A_Opaque_BlitRow32_neon(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)874 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
875                                 const SkPMColor* SK_RESTRICT src,
876                                 int count, U8CPU alpha) {
877 
878     SkASSERT(255 == alpha);
879     if (count > 0) {
880 
881 
882     uint8x8_t alpha_mask;
883 
884     static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
885     alpha_mask = vld1_u8(alpha_mask_setup);
886 
887     /* do the NEON unrolled code */
888 #define    UNROLL    4
889     while (count >= UNROLL) {
890         uint8x8_t src_raw, dst_raw, dst_final;
891         uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
892 
893         /* The two prefetches below may make the code slighlty
894          * slower for small values of count but are worth having
895          * in the general case.
896          */
897         __builtin_prefetch(src+32);
898         __builtin_prefetch(dst+32);
899 
900         /* get the source */
901         src_raw = vreinterpret_u8_u32(vld1_u32(src));
902 #if    UNROLL > 2
903         src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
904 #endif
905 
906         /* get and hold the dst too */
907         dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
908 #if    UNROLL > 2
909         dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
910 #endif
911 
912     /* 1st and 2nd bits of the unrolling */
913     {
914         uint8x8_t dst_cooked;
915         uint16x8_t dst_wide;
916         uint8x8_t alpha_narrow;
917         uint16x8_t alpha_wide;
918 
919         /* get the alphas spread out properly */
920         alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
921         alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
922 
923         /* spread the dest */
924         dst_wide = vmovl_u8(dst_raw);
925 
926         /* alpha mul the dest */
927         dst_wide = vmulq_u16 (dst_wide, alpha_wide);
928         dst_cooked = vshrn_n_u16(dst_wide, 8);
929 
930         /* sum -- ignoring any byte lane overflows */
931         dst_final = vadd_u8(src_raw, dst_cooked);
932     }
933 
934 #if    UNROLL > 2
935     /* the 3rd and 4th bits of our unrolling */
936     {
937         uint8x8_t dst_cooked;
938         uint16x8_t dst_wide;
939         uint8x8_t alpha_narrow;
940         uint16x8_t alpha_wide;
941 
942         alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
943         alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
944 
945         /* spread the dest */
946         dst_wide = vmovl_u8(dst_raw_2);
947 
948         /* alpha mul the dest */
949         dst_wide = vmulq_u16 (dst_wide, alpha_wide);
950         dst_cooked = vshrn_n_u16(dst_wide, 8);
951 
952         /* sum -- ignoring any byte lane overflows */
953         dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
954     }
955 #endif
956 
957         vst1_u32(dst, vreinterpret_u32_u8(dst_final));
958 #if    UNROLL > 2
959         vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
960 #endif
961 
962         src += UNROLL;
963         dst += UNROLL;
964         count -= UNROLL;
965     }
966 #undef    UNROLL
967 
968     /* do any residual iterations */
969         while (--count >= 0) {
970             *dst = SkPMSrcOver(*src, *dst);
971             src += 1;
972             dst += 1;
973         }
974     }
975 }
976 
/* Source-over blit of a row of 32-bit pixels (global alpha asserted to be
 * 255) that looks at the alpha of the incoming source pixels and switches
 * between three specialised loops:
 *
 *   ALPHA_0        - source pixels whose top byte is 0: dst is left untouched
 *   ALPHA_255      - source pixels whose top byte is 0xFF: src is copied
 *   ALPHA_1_TO_254 - everything else: NEON blend, 4 pixels per iteration
 *
 * The tests compare whole pixels against ALPHA_TRANS / ALPHA_OPAQ, so they
 * rely on alpha occupying the most significant byte of each SkPMColor.
 *
 * src_end is deliberately placed UNROLL + 1 pixels before the real end of the
 * row so that the fast loops can read a few pixels ahead; TAIL restores the
 * real end and finishes the remaining pixels one by one.
 */
void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {
    SkASSERT(255 == alpha);

    if (count <= 0)
    return;

    /* Use these to check if src is transparent or opaque */
    const unsigned int ALPHA_OPAQ  = 0xFF000000;  // pixel >= this: top byte is 0xFF
    const unsigned int ALPHA_TRANS = 0x00FFFFFF;  // pixel <= this: top byte is 0

#define UNROLL  4
    /* early stop marker for the fast loops, not the real end of the row */
    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
    const SkPMColor* SK_RESTRICT src_temp = src;

    /* set up the NEON variables */
    uint8x8_t alpha_mask;
    /* table-lookup indices that replicate byte 3 / byte 7 over each pixel */
    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);

    uint8x8_t src_raw, dst_raw, dst_final;
    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
    uint8x8_t dst_cooked;
    uint16x8_t dst_wide;
    uint8x8_t alpha_narrow;
    uint16x8_t alpha_wide;

    /* choose the first processing type */
    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    /* fall-thru */

ALPHA_1_TO_254:
    /* general case: blend 4 pixels (two pairs) per iteration */
    do {

        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));

        /* get and hold the dst too */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));


        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);

        /* same steps for the second pair of pixels */
        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);

        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));

        src += UNROLL;
        dst += UNROLL;

        /* If the next two pixels are both transparent or both opaque,
         * leave this loop so one of the specialised loops can take over.
         */
        if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
            break;

    } while(src < src_end);

    if (src >= src_end)
        goto TAIL;

    if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
        goto ALPHA_255;

    /* Not at the end and not opaque, so the break above fired on two
     * transparent pixels. */
    /*fall-thru*/

ALPHA_0:

    /*In this state, we know the current alpha is 0 and
     we optimize for the next alpha also being zero. */
    src_temp = src;  //so we don't have to increment dst every time
    do {
        /* pre-increment: the current pixel is already known to be
         * transparent, so each test looks at the following one */
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
    } while(src < src_end);

    /* skip the same number of destination pixels, leaving them unchanged */
    dst += (src - src_temp);

    /* no longer alpha 0, so determine where to go next. */
    if( src >= src_end)
        goto TAIL;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    else
        goto ALPHA_1_TO_254;

ALPHA_255:
    /* AND-ing four pixels keeps a top byte of 0xFF only if all four are
     * opaque, so one compare covers the whole group */
    while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
        dst[0]=src[0];
        dst[1]=src[1];
        dst[2]=src[2];
        dst[3]=src[3];
        src+=UNROLL;
        dst+=UNROLL;
        if(src >= src_end)
            goto TAIL;
    }

    //Handle remainder.
    /* copy up to three leading opaque pixels of the group that failed the
     * four-pixel test */
    if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
        if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
            if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
        }
    }

    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    else
        goto ALPHA_1_TO_254;

TAIL:
    /* do any residual iterations */
    src_end += UNROLL + 1;  //goto the real end
    while(src != src_end) {
        /* a source pixel of 0 leaves dst unchanged */
        if( *src != 0 ) {
            if( *src >= ALPHA_OPAQ ) {
                *dst = *src;
            }
            else {
                *dst = SkPMSrcOver(*src, *dst);
            }
        }
        src++;
        dst++;
    }

#undef    UNROLL
    return;
}
1149 
1150 /* Neon version of S32_Blend_BlitRow32()
1151  * portable version is in src/core/SkBlitRow_D32.cpp
1152  */
S32_Blend_BlitRow32_neon(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)1153 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
1154                               const SkPMColor* SK_RESTRICT src,
1155                               int count, U8CPU alpha) {
1156     SkASSERT(alpha <= 255);
1157 
1158     if (count <= 0) {
1159         return;
1160     }
1161 
1162     uint16_t src_scale = SkAlpha255To256(alpha);
1163     uint16_t dst_scale = 256 - src_scale;
1164 
1165     while (count >= 2) {
1166         uint8x8_t vsrc, vdst, vres;
1167         uint16x8_t vsrc_wide, vdst_wide;
1168 
1169         /* These commented prefetches are a big win for count
1170          * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
1171          * They also hurt a little (<5%) on an A15
1172          */
1173         //__builtin_prefetch(src+32);
1174         //__builtin_prefetch(dst+32);
1175 
1176         // Load
1177         vsrc = vreinterpret_u8_u32(vld1_u32(src));
1178         vdst = vreinterpret_u8_u32(vld1_u32(dst));
1179 
1180         // Process src
1181         vsrc_wide = vmovl_u8(vsrc);
1182         vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
1183 
1184         // Process dst
1185         vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
1186 
1187         // Combine
1188         vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
1189 
1190         // Store
1191         vst1_u32(dst, vreinterpret_u32_u8(vres));
1192 
1193         src += 2;
1194         dst += 2;
1195         count -= 2;
1196     }
1197 
1198     if (count == 1) {
1199         uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
1200         uint16x8_t vsrc_wide, vdst_wide;
1201 
1202         // Load
1203         vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
1204         vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
1205 
1206         // Process
1207         vsrc_wide = vmovl_u8(vsrc);
1208         vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
1209         vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
1210         vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
1211 
1212         // Store
1213         vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
1214     }
1215 }
1216 
1217 #ifdef SK_CPU_ARM32
S32A_Blend_BlitRow32_neon(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)1218 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
1219                          const SkPMColor* SK_RESTRICT src,
1220                          int count, U8CPU alpha) {
1221 
1222     SkASSERT(255 >= alpha);
1223 
1224     if (count <= 0) {
1225         return;
1226     }
1227 
1228     unsigned alpha256 = SkAlpha255To256(alpha);
1229 
1230     // First deal with odd counts
1231     if (count & 1) {
1232         uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
1233         uint16x8_t vdst_wide, vsrc_wide;
1234         unsigned dst_scale;
1235 
1236         // Load
1237         vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
1238         vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
1239 
1240         // Calc dst_scale
1241         dst_scale = vget_lane_u8(vsrc, 3);
1242         dst_scale *= alpha256;
1243         dst_scale >>= 8;
1244         dst_scale = 256 - dst_scale;
1245 
1246         // Process src
1247         vsrc_wide = vmovl_u8(vsrc);
1248         vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
1249 
1250         // Process dst
1251         vdst_wide = vmovl_u8(vdst);
1252         vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
1253 
1254         // Combine
1255         vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
1256 
1257         vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
1258         dst++;
1259         src++;
1260         count--;
1261     }
1262 
1263     if (count) {
1264         uint8x8_t alpha_mask;
1265         static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
1266         alpha_mask = vld1_u8(alpha_mask_setup);
1267 
1268         do {
1269 
1270             uint8x8_t vsrc, vdst, vres, vsrc_alphas;
1271             uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
1272 
1273             __builtin_prefetch(src+32);
1274             __builtin_prefetch(dst+32);
1275 
1276             // Load
1277             vsrc = vreinterpret_u8_u32(vld1_u32(src));
1278             vdst = vreinterpret_u8_u32(vld1_u32(dst));
1279 
1280             // Prepare src_scale
1281             vsrc_scale = vdupq_n_u16(alpha256);
1282 
1283             // Calc dst_scale
1284             vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
1285             vdst_scale = vmovl_u8(vsrc_alphas);
1286             vdst_scale *= vsrc_scale;
1287             vdst_scale = vshrq_n_u16(vdst_scale, 8);
1288             vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);
1289 
1290             // Process src
1291             vsrc_wide = vmovl_u8(vsrc);
1292             vsrc_wide *= vsrc_scale;
1293 
1294             // Process dst
1295             vdst_wide = vmovl_u8(vdst);
1296             vdst_wide *= vdst_scale;
1297 
1298             // Combine
1299             vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
1300 
1301             vst1_u32(dst, vreinterpret_u32_u8(vres));
1302 
1303             src += 2;
1304             dst += 2;
1305             count -= 2;
1306         } while(count);
1307     }
1308 }
1309 
1310 ///////////////////////////////////////////////////////////////////////////////
1311 
1312 #endif // #ifdef SK_CPU_ARM32
1313 
/* Source-over blit of 32-bit pixels with per-pixel alpha onto an RGB565
 * destination, with ordered dithering. The global alpha is asserted to be 255.
 *
 * dst   - destination row of 565 pixels, read and written in place
 * src   - source row of 32-bit pixels
 * count - number of pixels to process
 * x, y  - coordinates of the first pixel, used to select the dither pattern
 *
 * Groups of 8 pixels are processed with NEON; the remaining pixels go through
 * the scalar loop at the end. In both paths the dither value is scaled by the
 * source alpha (d = dither * (a + 1) >> 8) before it is added to the colour.
 */
void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
                                   const SkPMColor* SK_RESTRICT src,
                                   int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define    UNROLL    8

    if (count >= UNROLL) {

    // 8 dither values for this row, starting at column (x & 3)
    uint8x8_t dbase;
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
    dbase = vld1_u8(dstart);

        do {
        uint8x8x4_t vsrc;
        uint8x8_t sr, sg, sb, sa, d;
        uint16x8_t dst8, scale8, alpha8;
        uint16x8_t dst_r, dst_g, dst_b;

        // Load 8 source pixels split by channel; both variants also advance
        // src by 8 pixels.
#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_4(src);
#else
        {
        // pin the four outputs to d0-d3, the registers named in the asm text
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm ("vld4.8    {d0-d3},[%[src]]! "
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
            :
        );
        vsrc.val[0] = d0;
        vsrc.val[1] = d1;
        vsrc.val[2] = d2;
        vsrc.val[3] = d3;
        }
#endif
        sa = vsrc.val[NEON_A];
        sr = vsrc.val[NEON_R];
        sg = vsrc.val[NEON_G];
        sb = vsrc.val[NEON_B];

        /* calculate 'd', which will be 0..7
         * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
         */
        // alpha8 = dbase + sa * dbase, i.e. dbase * (sa + 1)
        alpha8 = vmovl_u8(dbase);
        alpha8 = vmlal_u8(alpha8, sa, dbase);
        d = vshrn_n_u16(alpha8, 8);    // narrowing too

        // sr = sr - (sr>>5) + d
        /* watching for 8-bit overflow.  d is 0..7; risky range of
         * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
         * safe  as long as we do ((sr-sr>>5) + d)
         */
        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
        sr = vadd_u8(sr, d);

        // sb = sb - (sb>>5) + d
        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
        sb = vadd_u8(sb, d);

        // sg = sg - (sg>>6) + d>>1; similar logic for overflows
        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
        sg = vadd_u8(sg, vshr_n_u8(d,1));

        // need to pick up 8 dst's -- at 16 bits each, 128 bits
        dst8 = vld1q_u16(dst);
        dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
        // shift left to drop the bits above green, then right to isolate it
        dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
        dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT);    // clearing hi bits

        // blend
        scale8 = vsubw_u8(vdupq_n_u16(256), sa);    // 256 - source alpha

        // combine the addq and mul, save 3 insns
        // Each channel becomes (src << k) + dst * ((256 - sa) >> 3); the
        // shift by 5 in the repack step below brings it back to 5/6/5 bits.
        scale8 = vshrq_n_u16(scale8, 3);
        dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
        dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
        dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);

        // repack to store
        dst8 = vshrq_n_u16(dst_b, 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);

        vst1q_u16(dst, dst8);

        dst += UNROLL;
        count -= UNROLL;
        // skip x += UNROLL, since it's unchanged mod-4
        } while (count >= UNROLL);
    }
#undef    UNROLL

    // residuals: fewer than 8 pixels left, handled one at a time
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            // a source pixel of 0 leaves dst unchanged
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // dither and alpha are just temporary variables to work-around
                // an ICE in debug.
                unsigned dither = DITHER_VALUE(x);
                unsigned alpha = SkAlpha255To256(a);
                int d = SkAlphaMul(dither, alpha);

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                // blend all three channels at once in a 32-bit expanded form
                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}
1442 
1443 ///////////////////////////////////////////////////////////////////////////////
1444 
/* Converts a row of 32-bit source pixels to RGB565 with ordered dithering.
 * The destination is overwritten, not blended; the global alpha is asserted
 * to be 255 and the scalar path asserts that every source pixel is opaque.
 *
 * dst   - destination row of 565 pixels (written only)
 * src   - source row of 32-bit pixels
 * count - number of pixels to process
 * x, y  - coordinates of the first pixel, used to select the dither pattern
 *
 * Groups of 8 pixels are processed with NEON; the remaining pixels go through
 * SkDitherRGB32To565() one at a time.
 */
void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define    UNROLL    8
    if (count >= UNROLL) {
    // 8 dither values for this row, starting at column (x & 3)
    uint8x8_t d;
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
    d = vld1_u8(dstart);

    while (count >= UNROLL) {
        uint8x8_t sr, sg, sb;
        uint16x8_t dr, dg, db;
        uint16x8_t dst8;
        uint8x8x4_t vsrc;

        // Load 8 source pixels split by channel; both variants also advance
        // src by 8 pixels.
#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_3(src);
#else
        {
        // pin the four outputs to d0-d3, the registers named in the asm text
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm (
            "vld4.8    {d0-d3},[%[src]]! "
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
            :
        );
        // d3 is loaded by the instruction but not copied: only three
        // channels are used below
        vsrc.val[0] = d0;
        vsrc.val[1] = d1;
        vsrc.val[2] = d2;
        }
#endif
        sr = vsrc.val[NEON_R];
        sg = vsrc.val[NEON_G];
        sb = vsrc.val[NEON_B];

        /* XXX: if we want to prefetch, hide it in the above asm()
         * using the gcc __builtin_prefetch(), the prefetch will
         * fall to the bottom of the loop -- it won't stick up
         * at the top of the loop, just after the vld4.
         */

        // sr = sr - (sr>>5) + d
        // (the add widens to 16 bits, so it cannot overflow a byte lane)
        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
        dr = vaddl_u8(sr, d);

        // sb = sb - (sb>>5) + d
        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
        db = vaddl_u8(sb, d);

        // sg = sg - (sg>>6) + d>>1; similar logic for overflows
        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
        dg = vaddl_u8(sg, vshr_n_u8(d, 1));

        // pack high bits of each into 565 format  (rgb, b is lsb)
        dst8 = vshrq_n_u16(db, 3);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);

        // store it
        vst1q_u16(dst, dst8);

        dst += UNROLL;
        // we don't need to increment src as the asm above has already done it
        count -= UNROLL;
        x += UNROLL;        // probably superfluous
    }
    }
#undef    UNROLL

    // residuals: fewer than 8 pixels left, handled one at a time
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            SkASSERT(SkGetPackedA32(c) == 255);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}
1533 
1534 ///////////////////////////////////////////////////////////////////////////////
1535 
/* NEON row procs for blitting 32-bit source pixels into an RGB565
 * destination. The first four entries are the non-dithered variants and the
 * last four the dithered ones; going by the function names, each group is
 * ordered S32 Opaque, S32 Blend, S32A Opaque, S32A Blend.
 *
 * A nullptr slot means this table supplies no NEON proc for that case.
 * NOTE(review): presumably the caller then falls back to a portable proc and
 * indexes this table by SkBlitRow flags -- confirm against the SkBlitRow
 * factory.
 */
const SkBlitRow::Proc16 sk_blitrow_platform_565_procs_arm_neon[] = {
    // no dither
    S32_D565_Opaque_neon,
    S32_D565_Blend_neon,
    S32A_D565_Opaque_neon,
#if 0
    S32A_D565_Blend_neon,
#else
    // NEON version disabled, see the issue linked below
    nullptr,   // https://code.google.com/p/skia/issues/detail?id=2797
#endif

    // dither
    S32_D565_Opaque_Dither_neon,
    S32_D565_Blend_Dither_neon,
    S32A_D565_Opaque_Dither_neon,
    nullptr,   // S32A_D565_Blend_Dither
};
1553 
/* NEON procs for blitting a single colour into an RGB565 destination.
 * All four slots (named in the trailing comments) use the same routine,
 * Color32A_D565_neon.
 */
const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = {
    Color32A_D565_neon,    // Color32_D565,
    Color32A_D565_neon,    // Color32A_D565,
    Color32A_D565_neon,    // Color32_D565_Dither,
    Color32A_D565_neon,    // Color32A_D565_Dither
};
1560 
/* NEON row procs for blitting 32-bit source pixels into a 32-bit destination,
 * in the order S32_Opaque, S32_Blend, S32A_Opaque, S32A_Blend. A nullptr slot
 * means this table supplies no NEON proc for that case.
 */
const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
    nullptr,   // S32_Opaque,
    S32_Blend_BlitRow32_neon,        // S32_Blend,
    /*
     * We have two choices for S32A_Opaque procs. The one reads the src alpha
     * value and attempts to optimize accordingly.  The optimization is
     * sensitive to the source content and is not a win in all cases. For
     * example, if there are a lot of transitions between the alpha states,
     * the performance will almost certainly be worse.  However, for many
     * common cases the performance is equivalent or better than the standard
     * case where we do not inspect the src alpha.
     */
#if SK_A32_SHIFT == 24
    // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
#else
    S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
#endif
    // the S32A_Blend NEON proc is only compiled for 32-bit ARM
#ifdef SK_CPU_ARM32
    S32A_Blend_BlitRow32_neon        // S32A_Blend
#else
    nullptr
#endif
};
1585