1 
2 #include "SkBlitMask.h"
3 #include "SkColor_opts_neon.h"
4 
D32_A8_Black_neon(void * SK_RESTRICT dst,size_t dstRB,const void * SK_RESTRICT maskPtr,size_t maskRB,SkColor,int width,int height)5 static void D32_A8_Black_neon(void* SK_RESTRICT dst, size_t dstRB,
6                               const void* SK_RESTRICT maskPtr, size_t maskRB,
7                               SkColor, int width, int height) {
8     SkPMColor* SK_RESTRICT device = (SkPMColor*)dst;
9     const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr;
10 
11     maskRB -= width;
12     dstRB -= (width << 2);
13     do {
14         int w = width;
15         while (w >= 8) {
16             uint8x8_t vmask = vld1_u8(mask);
17             uint16x8_t vscale = vsubw_u8(vdupq_n_u16(256), vmask);
18             uint8x8x4_t vdevice = vld4_u8((uint8_t*)device);
19 
20             vdevice = SkAlphaMulQ_neon8(vdevice, vscale);
21             vdevice.val[NEON_A] += vmask;
22 
23             vst4_u8((uint8_t*)device, vdevice);
24 
25             mask += 8;
26             device += 8;
27             w -= 8;
28         }
29         while (w-- > 0) {
30             unsigned aa = *mask++;
31             *device = (aa << SK_A32_SHIFT)
32                         + SkAlphaMulQ(*device, SkAlpha255To256(255 - aa));
33             device += 1;
34         };
35         device = (uint32_t*)((char*)device + dstRB);
36         mask += maskRB;
37     } while (--height != 0);
38 }
39 
40 template <bool isColor>
D32_A8_Opaque_Color_neon(void * SK_RESTRICT dst,size_t dstRB,const void * SK_RESTRICT maskPtr,size_t maskRB,SkColor color,int width,int height)41 static void D32_A8_Opaque_Color_neon(void* SK_RESTRICT dst, size_t dstRB,
42                                      const void* SK_RESTRICT maskPtr, size_t maskRB,
43                                      SkColor color, int width, int height) {
44     SkPMColor pmc = SkPreMultiplyColor(color);
45     SkPMColor* SK_RESTRICT device = (SkPMColor*)dst;
46     const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr;
47     uint8x8x4_t vpmc;
48 
49     maskRB -= width;
50     dstRB -= (width << 2);
51 
52     if (width >= 8) {
53         vpmc.val[NEON_A] = vdup_n_u8(SkGetPackedA32(pmc));
54         vpmc.val[NEON_R] = vdup_n_u8(SkGetPackedR32(pmc));
55         vpmc.val[NEON_G] = vdup_n_u8(SkGetPackedG32(pmc));
56         vpmc.val[NEON_B] = vdup_n_u8(SkGetPackedB32(pmc));
57     }
58     do {
59         int w = width;
60         while (w >= 8) {
61             uint8x8_t vmask = vld1_u8(mask);
62             uint16x8_t vscale, vmask256 = SkAlpha255To256_neon8(vmask);
63             if (isColor) {
64                 vscale = vsubw_u8(vdupq_n_u16(256),
65                             SkAlphaMul_neon8(vpmc.val[NEON_A], vmask256));
66             } else {
67                 vscale = vsubw_u8(vdupq_n_u16(256), vmask);
68             }
69             uint8x8x4_t vdev = vld4_u8((uint8_t*)device);
70 
71             vdev.val[NEON_A] =   SkAlphaMul_neon8(vpmc.val[NEON_A], vmask256)
72                                + SkAlphaMul_neon8(vdev.val[NEON_A], vscale);
73             vdev.val[NEON_R] =   SkAlphaMul_neon8(vpmc.val[NEON_R], vmask256)
74                                + SkAlphaMul_neon8(vdev.val[NEON_R], vscale);
75             vdev.val[NEON_G] =   SkAlphaMul_neon8(vpmc.val[NEON_G], vmask256)
76                                + SkAlphaMul_neon8(vdev.val[NEON_G], vscale);
77             vdev.val[NEON_B] =   SkAlphaMul_neon8(vpmc.val[NEON_B], vmask256)
78                                + SkAlphaMul_neon8(vdev.val[NEON_B], vscale);
79 
80             vst4_u8((uint8_t*)device, vdev);
81 
82             mask += 8;
83             device += 8;
84             w -= 8;
85         }
86 
87         while (w--) {
88             unsigned aa = *mask++;
89             if (isColor) {
90                 *device = SkBlendARGB32(pmc, *device, aa);
91             } else {
92                 *device = SkAlphaMulQ(pmc, SkAlpha255To256(aa))
93                             + SkAlphaMulQ(*device, SkAlpha255To256(255 - aa));
94             }
95             device += 1;
96         };
97 
98         device = (uint32_t*)((char*)device + dstRB);
99         mask += maskRB;
100 
101     } while (--height != 0);
102 }
103 
D32_A8_Opaque_neon(void * SK_RESTRICT dst,size_t dstRB,const void * SK_RESTRICT maskPtr,size_t maskRB,SkColor color,int width,int height)104 static void D32_A8_Opaque_neon(void* SK_RESTRICT dst, size_t dstRB,
105                                const void* SK_RESTRICT maskPtr, size_t maskRB,
106                                SkColor color, int width, int height) {
107     D32_A8_Opaque_Color_neon<false>(dst, dstRB, maskPtr, maskRB, color, width, height);
108 }
109 
D32_A8_Color_neon(void * SK_RESTRICT dst,size_t dstRB,const void * SK_RESTRICT maskPtr,size_t maskRB,SkColor color,int width,int height)110 static void D32_A8_Color_neon(void* SK_RESTRICT dst, size_t dstRB,
111                               const void* SK_RESTRICT maskPtr, size_t maskRB,
112                               SkColor color, int width, int height) {
113     D32_A8_Opaque_Color_neon<true>(dst, dstRB, maskPtr, maskRB, color, width, height);
114 }
115 
D32_A8_Factory_neon(SkColor color)116 SkBlitMask::ColorProc D32_A8_Factory_neon(SkColor color) {
117     if (SK_ColorBLACK == color) {
118         return D32_A8_Black_neon;
119     } else if (0xFF == SkColorGetA(color)) {
120         return D32_A8_Opaque_neon;
121     } else {
122         return D32_A8_Color_neon;
123     }
124 }
125 
126 ////////////////////////////////////////////////////////////////////////////////
127 
SkBlitLCD16OpaqueRow_neon(SkPMColor dst[],const uint16_t src[],SkColor color,int width,SkPMColor opaqueDst)128 void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[],
129                                         SkColor color, int width,
130                                         SkPMColor opaqueDst) {
131     int colR = SkColorGetR(color);
132     int colG = SkColorGetG(color);
133     int colB = SkColorGetB(color);
134 
135     uint8x8_t vcolR, vcolG, vcolB;
136     uint8x8_t vopqDstA, vopqDstR, vopqDstG, vopqDstB;
137 
138     if (width >= 8) {
139         vcolR = vdup_n_u8(colR);
140         vcolG = vdup_n_u8(colG);
141         vcolB = vdup_n_u8(colB);
142         vopqDstA = vdup_n_u8(SkGetPackedA32(opaqueDst));
143         vopqDstR = vdup_n_u8(SkGetPackedR32(opaqueDst));
144         vopqDstG = vdup_n_u8(SkGetPackedG32(opaqueDst));
145         vopqDstB = vdup_n_u8(SkGetPackedB32(opaqueDst));
146     }
147 
148     while (width >= 8) {
149         uint8x8x4_t vdst;
150         uint16x8_t vmask;
151         uint16x8_t vmaskR, vmaskG, vmaskB;
152         uint8x8_t vsel_trans, vsel_opq;
153 
154         vdst = vld4_u8((uint8_t*)dst);
155         vmask = vld1q_u16(src);
156 
157         // Prepare compare masks
158         vsel_trans = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0)));
159         vsel_opq = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0xFFFF)));
160 
161         // Get all the color masks on 5 bits
162         vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
163         vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
164                              SK_B16_BITS + SK_R16_BITS + 1);
165         vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);
166 
167         // Upscale to 0..32
168         vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
169         vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
170         vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);
171 
172         vdst.val[NEON_A] = vbsl_u8(vsel_trans, vdst.val[NEON_A], vdup_n_u8(0xFF));
173         vdst.val[NEON_A] = vbsl_u8(vsel_opq, vopqDstA, vdst.val[NEON_A]);
174 
175         vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR);
176         vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG);
177         vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB);
178 
179         vdst.val[NEON_R] = vbsl_u8(vsel_opq, vopqDstR, vdst.val[NEON_R]);
180         vdst.val[NEON_G] = vbsl_u8(vsel_opq, vopqDstG, vdst.val[NEON_G]);
181         vdst.val[NEON_B] = vbsl_u8(vsel_opq, vopqDstB, vdst.val[NEON_B]);
182 
183         vst4_u8((uint8_t*)dst, vdst);
184 
185         dst += 8;
186         src += 8;
187         width -= 8;
188     }
189 
190     // Leftovers
191     for (int i = 0; i < width; i++) {
192         dst[i] = SkBlendLCD16Opaque(colR, colG, colB, dst[i], src[i],
193                                     opaqueDst);
194     }
195 }
196 
SkBlitLCD16Row_neon(SkPMColor dst[],const uint16_t src[],SkColor color,int width,SkPMColor)197 void SkBlitLCD16Row_neon(SkPMColor dst[], const uint16_t src[],
198                                    SkColor color, int width, SkPMColor) {
199     int colA = SkColorGetA(color);
200     int colR = SkColorGetR(color);
201     int colG = SkColorGetG(color);
202     int colB = SkColorGetB(color);
203 
204     colA = SkAlpha255To256(colA);
205 
206     uint8x8_t vcolR, vcolG, vcolB;
207     uint16x8_t vcolA;
208 
209     if (width >= 8) {
210         vcolA = vdupq_n_u16(colA);
211         vcolR = vdup_n_u8(colR);
212         vcolG = vdup_n_u8(colG);
213         vcolB = vdup_n_u8(colB);
214     }
215 
216     while (width >= 8) {
217         uint8x8x4_t vdst;
218         uint16x8_t vmask;
219         uint16x8_t vmaskR, vmaskG, vmaskB;
220 
221         vdst = vld4_u8((uint8_t*)dst);
222         vmask = vld1q_u16(src);
223 
224         // Get all the color masks on 5 bits
225         vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
226         vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
227                              SK_B16_BITS + SK_R16_BITS + 1);
228         vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);
229 
230         // Upscale to 0..32
231         vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
232         vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
233         vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);
234 
235         vmaskR = vshrq_n_u16(vmaskR * vcolA, 8);
236         vmaskG = vshrq_n_u16(vmaskG * vcolA, 8);
237         vmaskB = vshrq_n_u16(vmaskB * vcolA, 8);
238 
239         vdst.val[NEON_A] = vdup_n_u8(0xFF);
240         vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR);
241         vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG);
242         vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB);
243 
244         vst4_u8((uint8_t*)dst, vdst);
245 
246         dst += 8;
247         src += 8;
248         width -= 8;
249     }
250 
251     for (int i = 0; i < width; i++) {
252         dst[i] = SkBlendLCD16(colA, colR, colG, colB, dst[i], src[i]);
253     }
254 }
255