1 /*
2  * Copyright 2006 The Android Open Source Project
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "Sk4px.h"
9 #include "SkColorData.h"
10 #include "SkCoreBlitters.h"
11 #include "SkShader.h"
12 #include "SkUtils.h"
13 #include "SkXfermodePriv.h"
14 
upscale_31_to_32(int value)15 static inline int upscale_31_to_32(int value) {
16     SkASSERT((unsigned)value <= 31);
17     return value + (value >> 4);
18 }
19 
blend_32(int src,int dst,int scale)20 static inline int blend_32(int src, int dst, int scale) {
21     SkASSERT((unsigned)src <= 0xFF);
22     SkASSERT((unsigned)dst <= 0xFF);
23     SkASSERT((unsigned)scale <= 32);
24     return dst + ((src - dst) * scale >> 5);
25 }
26 
blend_lcd16(int srcA,int srcR,int srcG,int srcB,SkPMColor dst,uint16_t mask)27 static inline SkPMColor blend_lcd16(int srcA, int srcR, int srcG, int srcB,
28                                      SkPMColor dst, uint16_t mask) {
29     if (mask == 0) {
30         return dst;
31     }
32 
33     /*  We want all of these in 5bits, hence the shifts in case one of them
34      *  (green) is 6bits.
35      */
36     int maskR = SkGetPackedR16(mask) >> (SK_R16_BITS - 5);
37     int maskG = SkGetPackedG16(mask) >> (SK_G16_BITS - 5);
38     int maskB = SkGetPackedB16(mask) >> (SK_B16_BITS - 5);
39 
40     // Now upscale them to 0..32, so we can use blend32
41     maskR = upscale_31_to_32(maskR);
42     maskG = upscale_31_to_32(maskG);
43     maskB = upscale_31_to_32(maskB);
44 
45     // srcA has been upscaled to 256 before passed into this function
46     maskR = maskR * srcA >> 8;
47     maskG = maskG * srcA >> 8;
48     maskB = maskB * srcA >> 8;
49 
50     int dstR = SkGetPackedR32(dst);
51     int dstG = SkGetPackedG32(dst);
52     int dstB = SkGetPackedB32(dst);
53 
54     // LCD blitting is only supported if the dst is known/required
55     // to be opaque
56     return SkPackARGB32(0xFF,
57                         blend_32(srcR, dstR, maskR),
58                         blend_32(srcG, dstG, maskG),
59                         blend_32(srcB, dstB, maskB));
60 }
61 
blend_lcd16_opaque(int srcR,int srcG,int srcB,SkPMColor dst,uint16_t mask,SkPMColor opaqueDst)62 static inline SkPMColor blend_lcd16_opaque(int srcR, int srcG, int srcB,
63                                            SkPMColor dst, uint16_t mask,
64                                            SkPMColor opaqueDst) {
65     if (mask == 0) {
66         return dst;
67     }
68 
69     if (0xFFFF == mask) {
70         return opaqueDst;
71     }
72 
73     /*  We want all of these in 5bits, hence the shifts in case one of them
74      *  (green) is 6bits.
75      */
76     int maskR = SkGetPackedR16(mask) >> (SK_R16_BITS - 5);
77     int maskG = SkGetPackedG16(mask) >> (SK_G16_BITS - 5);
78     int maskB = SkGetPackedB16(mask) >> (SK_B16_BITS - 5);
79 
80     // Now upscale them to 0..32, so we can use blend32
81     maskR = upscale_31_to_32(maskR);
82     maskG = upscale_31_to_32(maskG);
83     maskB = upscale_31_to_32(maskB);
84 
85     int dstR = SkGetPackedR32(dst);
86     int dstG = SkGetPackedG32(dst);
87     int dstB = SkGetPackedB32(dst);
88 
89     // LCD blitting is only supported if the dst is known/required
90     // to be opaque
91     return SkPackARGB32(0xFF,
92                         blend_32(srcR, dstR, maskR),
93                         blend_32(srcG, dstG, maskG),
94                         blend_32(srcB, dstB, maskB));
95 }
96 
97 
98 // TODO: rewrite at least the SSE code here.  It's miserable.
99 
100 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
101     #include <emmintrin.h>
102 
    // The following (left) shifts cause the top 5 bits of the mask components to
    // line up with the corresponding components in an SkPMColor.
    // Note that the mask's RGB16 order may differ from the SkPMColor order.
    // Each amount is a preprocessor constant that can be zero, positive or
    // negative; the conditionals below choose the matching operation.
    #define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
    #define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
    #define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

    // Red: identity for a zero shift, left shift for a positive amount, logical
    // right shift by the magnitude for a negative amount. The result is not
    // masked; the users below AND it with (0x1F << SK_R32_SHIFT).
    #if SK_R16x5_R32x5_SHIFT == 0
        #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
    #elif SK_R16x5_R32x5_SHIFT > 0
        #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
    #else
        #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
    #endif

    // Green: same selection as for red, using the green shift amount.
    #if SK_G16x5_G32x5_SHIFT == 0
        #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
    #elif SK_G16x5_G32x5_SHIFT > 0
        #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
    #else
        #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
    #endif

    // Blue: same selection as for red, using the blue shift amount.
    #if SK_B16x5_B32x5_SHIFT == 0
        #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
    #elif SK_B16x5_B32x5_SHIFT > 0
        #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
    #else
        #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
    #endif
133 
    // Blends four destination pixels with one source color under per-pixel
    // LCD16 coverage, with the coverage additionally scaled by the source alpha.
    //   src  : source color widened to 16-bit lanes (layout shown below)
    //   dst  : four packed 32-bit destination pixels
    //   mask : four 16-bit masks, each padded to 32 bits; overwritten in here
    //   srcA : the source alpha replicated into every 16-bit lane
    // Returns the four blended pixels packed back to 8 bits per channel.
    // NOTE(review): the alpha lane of the combined mask is zero, so dst's alpha
    // appears to pass through unchanged here, whereas the scalar blend_lcd16
    // writes 0xFF -- confirm this path is only used with an opaque dst.
    static __m128i blend_lcd16_sse2(__m128i &src, __m128i &dst, __m128i &mask, __m128i &srcA) {
        // In the following comments, the components of src, dst and mask are
        // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
        // by an R, G, B, or A suffix. Components of one of the four pixels that
        // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
        // example is the blue channel of the second destination pixel. Memory
        // layout is shown for an ARGB byte order in a color value.

        // src and srcA store 8-bit values interleaved with zeros.
        // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
        // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
        //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
        // mask stores 16-bit values (compressed three channels) interleaved with zeros.
        // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
        // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
        //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

        // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
        // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
        __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
                                  _mm_set1_epi32(0x1F << SK_R32_SHIFT));

        // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
        __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
                                  _mm_set1_epi32(0x1F << SK_G32_SHIFT));

        // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
        __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
                                  _mm_set1_epi32(0x1F << SK_B32_SHIFT));

        // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
        // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
        // 8-bit position
        // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
        //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
        mask = _mm_or_si128(_mm_or_si128(r, g), b);

        // Interleave R,G,B into the lower byte of word.
        // i.e. split the sixteen 8-bit values from mask into two sets of eight
        // 16-bit values, padded by zero.
        __m128i maskLo, maskHi;
        // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
        maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
        // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
        maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());

        // Upscale from 0..31 to 0..32
        // (allows to replace division by a shift further down)
        // Right-shift each component by 4 and add the result back to that component,
        // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
        maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
        maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));

        // Multiply each component of maskLo and maskHi by srcA
        maskLo = _mm_mullo_epi16(maskLo, srcA);
        maskHi = _mm_mullo_epi16(maskHi, srcA);

        // Right shift mask components by 8 (divide by 256)
        maskLo = _mm_srli_epi16(maskLo, 8);
        maskHi = _mm_srli_epi16(maskHi, 8);

        // Interleave R,G,B into the lower byte of the word
        // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
        __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
        // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
        __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());

        // mask = (src - dst) * mask
        maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
        maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));

        // mask = (src - dst) * mask >> 5
        // (arithmetic shift, because src - dst can be negative)
        maskLo = _mm_srai_epi16(maskLo, 5);
        maskHi = _mm_srai_epi16(maskHi, 5);

        // Add two pixels into result.
        // result = dst + ((src - dst) * mask >> 5)
        __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
        __m128i resultHi = _mm_add_epi16(dstHi, maskHi);

        // Pack into 4 32bit dst pixels.
        // resultLo and resultHi contain eight 16-bit components (two pixels) each.
        // Merge into one SSE register with sixteen 8-bit values (four pixels),
        // clamping to 255 if necessary.
        return _mm_packus_epi16(resultLo, resultHi);
    }
220 
    // Blends four destination pixels with one opaque source color under
    // per-pixel LCD16 coverage.
    //   src  : source color widened to 16-bit lanes (layout shown below)
    //   dst  : four packed 32-bit destination pixels
    //   mask : four 16-bit masks, each padded to 32 bits; overwritten in here
    // Returns the four blended pixels packed back to 8 bits per channel, with
    // the alpha bits of every pixel set.
    static __m128i blend_lcd16_opaque_sse2(__m128i &src, __m128i &dst, __m128i &mask) {
        // In the following comments, the components of src, dst and mask are
        // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
        // by an R, G, B, or A suffix. Components of one of the four pixels that
        // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
        // example is the blue channel of the second destination pixel. Memory
        // layout is shown for an ARGB byte order in a color value.

        // src stores 8-bit values interleaved with zeros.
        // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
        // mask stores 16-bit values (shown as high and low bytes) interleaved with
        // zeros
        // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
        //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

        // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
        // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
        __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
                                  _mm_set1_epi32(0x1F << SK_R32_SHIFT));

        // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
        __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
                                  _mm_set1_epi32(0x1F << SK_G32_SHIFT));

        // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
        __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
                                  _mm_set1_epi32(0x1F << SK_B32_SHIFT));

        // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
        // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
        // 8-bit position
        // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
        //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
        mask = _mm_or_si128(_mm_or_si128(r, g), b);

        // Interleave R,G,B into the lower byte of word.
        // i.e. split the sixteen 8-bit values from mask into two sets of eight
        // 16-bit values, padded by zero.
        __m128i maskLo, maskHi;
        // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
        maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
        // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
        maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());

        // Upscale from 0..31 to 0..32
        // (allows to replace division by a shift further down)
        // Right-shift each component by 4 and add the result back to that component,
        // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
        maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
        maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));

        // Interleave R,G,B into the lower byte of the word
        // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
        __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
        // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
        __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());

        // mask = (src - dst) * mask
        maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
        maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));

        // mask = (src - dst) * mask >> 5
        // (arithmetic shift, because src - dst can be negative)
        maskLo = _mm_srai_epi16(maskLo, 5);
        maskHi = _mm_srai_epi16(maskHi, 5);

        // Add two pixels into result.
        // result = dst + ((src - dst) * mask >> 5)
        __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
        __m128i resultHi = _mm_add_epi16(dstHi, maskHi);

        // Pack into 4 32bit dst pixels and force opaque.
        // resultLo and resultHi contain eight 16-bit components (two pixels) each.
        // Merge into one SSE register with sixteen 8-bit values (four pixels),
        // clamping to 255 if necessary. Set alpha components to 0xFF.
        return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
                            _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
    }
298 
blit_row_lcd16(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor)299     void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[], SkColor src, int width, SkPMColor) {
300         if (width <= 0) {
301             return;
302         }
303 
304         int srcA = SkColorGetA(src);
305         int srcR = SkColorGetR(src);
306         int srcG = SkColorGetG(src);
307         int srcB = SkColorGetB(src);
308 
309         srcA = SkAlpha255To256(srcA);
310 
311         if (width >= 4) {
312             SkASSERT(((size_t)dst & 0x03) == 0);
313             while (((size_t)dst & 0x0F) != 0) {
314                 *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
315                 mask++;
316                 dst++;
317                 width--;
318             }
319 
320             __m128i *d = reinterpret_cast<__m128i*>(dst);
321             // Set alpha to 0xFF and replicate source four times in SSE register.
322             __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
323             // Interleave with zeros to get two sets of four 16-bit values.
324             src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
325             // Set srcA_sse to contain eight copies of srcA, padded with zero.
326             // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
327             __m128i srcA_sse = _mm_set1_epi16(srcA);
328             while (width >= 4) {
329                 // Load four destination pixels into dst_sse.
330                 __m128i dst_sse = _mm_load_si128(d);
331                 // Load four 16-bit masks into lower half of mask_sse.
332                 __m128i mask_sse = _mm_loadl_epi64(
333                                        reinterpret_cast<const __m128i*>(mask));
334 
335                 // Check whether masks are equal to 0 and get the highest bit
336                 // of each byte of result, if masks are all zero, we will get
337                 // pack_cmp to 0xFFFF
338                 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
339                                                  _mm_setzero_si128()));
340 
341                 // if mask pixels are not all zero, we will blend the dst pixels
342                 if (pack_cmp != 0xFFFF) {
343                     // Unpack 4 16bit mask pixels to
344                     // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
345                     //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
346                     mask_sse = _mm_unpacklo_epi16(mask_sse,
347                                                   _mm_setzero_si128());
348 
349                     // Process 4 32bit dst pixels
350                     __m128i result = blend_lcd16_sse2(src_sse, dst_sse, mask_sse, srcA_sse);
351                     _mm_store_si128(d, result);
352                 }
353 
354                 d++;
355                 mask += 4;
356                 width -= 4;
357             }
358 
359             dst = reinterpret_cast<SkPMColor*>(d);
360         }
361 
362         while (width > 0) {
363             *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
364             mask++;
365             dst++;
366             width--;
367         }
368     }
369 
blit_row_lcd16_opaque(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor opaqueDst)370     void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[],
371                                    SkColor src, int width, SkPMColor opaqueDst) {
372         if (width <= 0) {
373             return;
374         }
375 
376         int srcR = SkColorGetR(src);
377         int srcG = SkColorGetG(src);
378         int srcB = SkColorGetB(src);
379 
380         if (width >= 4) {
381             SkASSERT(((size_t)dst & 0x03) == 0);
382             while (((size_t)dst & 0x0F) != 0) {
383                 *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
384                 mask++;
385                 dst++;
386                 width--;
387             }
388 
389             __m128i *d = reinterpret_cast<__m128i*>(dst);
390             // Set alpha to 0xFF and replicate source four times in SSE register.
391             __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
392             // Set srcA_sse to contain eight copies of srcA, padded with zero.
393             // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
394             src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
395             while (width >= 4) {
396                 // Load four destination pixels into dst_sse.
397                 __m128i dst_sse = _mm_load_si128(d);
398                 // Load four 16-bit masks into lower half of mask_sse.
399                 __m128i mask_sse = _mm_loadl_epi64(
400                                        reinterpret_cast<const __m128i*>(mask));
401 
402                 // Check whether masks are equal to 0 and get the highest bit
403                 // of each byte of result, if masks are all zero, we will get
404                 // pack_cmp to 0xFFFF
405                 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
406                                                  _mm_setzero_si128()));
407 
408                 // if mask pixels are not all zero, we will blend the dst pixels
409                 if (pack_cmp != 0xFFFF) {
410                     // Unpack 4 16bit mask pixels to
411                     // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
412                     //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
413                     mask_sse = _mm_unpacklo_epi16(mask_sse,
414                                                   _mm_setzero_si128());
415 
416                     // Process 4 32bit dst pixels
417                     __m128i result = blend_lcd16_opaque_sse2(src_sse, dst_sse, mask_sse);
418                     _mm_store_si128(d, result);
419                 }
420 
421                 d++;
422                 mask += 4;
423                 width -= 4;
424             }
425 
426             dst = reinterpret_cast<SkPMColor*>(d);
427         }
428 
429         while (width > 0) {
430             *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
431             mask++;
432             dst++;
433             width--;
434         }
435     }
436 
437 #elif defined(SK_ARM_HAS_NEON)
438     #include <arm_neon.h>
439 
    // Each channel's bit shift within a 32-bit pixel divided by 8. The row
    // blitters below use these values to index the val[] planes of the
    // uint8x8x4_t loaded with vld4_u8 (val[NEON_A], val[NEON_R], ...).
    #define NEON_A (SK_A32_SHIFT / 8)
    #define NEON_R (SK_R32_SHIFT / 8)
    #define NEON_G (SK_G32_SHIFT / 8)
    #define NEON_B (SK_B32_SHIFT / 8)
444 
blend_32_neon(uint8x8_t src,uint8x8_t dst,uint16x8_t scale)445     static inline uint8x8_t blend_32_neon(uint8x8_t src, uint8x8_t dst, uint16x8_t scale) {
446         int16x8_t src_wide, dst_wide;
447 
448         src_wide = vreinterpretq_s16_u16(vmovl_u8(src));
449         dst_wide = vreinterpretq_s16_u16(vmovl_u8(dst));
450 
451         src_wide = (src_wide - dst_wide) * vreinterpretq_s16_u16(scale);
452 
453         dst_wide += vshrq_n_s16(src_wide, 5);
454 
455         return vmovn_u16(vreinterpretq_u16_s16(dst_wide));
456     }
457 
blit_row_lcd16_opaque(SkPMColor dst[],const uint16_t src[],SkColor color,int width,SkPMColor opaqueDst)458     void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t src[],
459                                SkColor color, int width,
460                                SkPMColor opaqueDst) {
461         int colR = SkColorGetR(color);
462         int colG = SkColorGetG(color);
463         int colB = SkColorGetB(color);
464 
465         uint8x8_t vcolR = vdup_n_u8(colR);
466         uint8x8_t vcolG = vdup_n_u8(colG);
467         uint8x8_t vcolB = vdup_n_u8(colB);
468         uint8x8_t vopqDstA = vdup_n_u8(SkGetPackedA32(opaqueDst));
469         uint8x8_t vopqDstR = vdup_n_u8(SkGetPackedR32(opaqueDst));
470         uint8x8_t vopqDstG = vdup_n_u8(SkGetPackedG32(opaqueDst));
471         uint8x8_t vopqDstB = vdup_n_u8(SkGetPackedB32(opaqueDst));
472 
473         while (width >= 8) {
474             uint8x8x4_t vdst;
475             uint16x8_t vmask;
476             uint16x8_t vmaskR, vmaskG, vmaskB;
477             uint8x8_t vsel_trans, vsel_opq;
478 
479             vdst = vld4_u8((uint8_t*)dst);
480             vmask = vld1q_u16(src);
481 
482             // Prepare compare masks
483             vsel_trans = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0)));
484             vsel_opq = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0xFFFF)));
485 
486             // Get all the color masks on 5 bits
487             vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
488             vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
489                                  SK_B16_BITS + SK_R16_BITS + 1);
490             vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);
491 
492             // Upscale to 0..32
493             vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
494             vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
495             vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);
496 
497             vdst.val[NEON_A] = vbsl_u8(vsel_trans, vdst.val[NEON_A], vdup_n_u8(0xFF));
498             vdst.val[NEON_A] = vbsl_u8(vsel_opq, vopqDstA, vdst.val[NEON_A]);
499 
500             vdst.val[NEON_R] = blend_32_neon(vcolR, vdst.val[NEON_R], vmaskR);
501             vdst.val[NEON_G] = blend_32_neon(vcolG, vdst.val[NEON_G], vmaskG);
502             vdst.val[NEON_B] = blend_32_neon(vcolB, vdst.val[NEON_B], vmaskB);
503 
504             vdst.val[NEON_R] = vbsl_u8(vsel_opq, vopqDstR, vdst.val[NEON_R]);
505             vdst.val[NEON_G] = vbsl_u8(vsel_opq, vopqDstG, vdst.val[NEON_G]);
506             vdst.val[NEON_B] = vbsl_u8(vsel_opq, vopqDstB, vdst.val[NEON_B]);
507 
508             vst4_u8((uint8_t*)dst, vdst);
509 
510             dst += 8;
511             src += 8;
512             width -= 8;
513         }
514 
515         // Leftovers
516         for (int i = 0; i < width; i++) {
517             dst[i] = blend_lcd16_opaque(colR, colG, colB, dst[i], src[i], opaqueDst);
518         }
519     }
520 
blit_row_lcd16(SkPMColor dst[],const uint16_t src[],SkColor color,int width,SkPMColor)521     void blit_row_lcd16(SkPMColor dst[], const uint16_t src[],
522                         SkColor color, int width, SkPMColor) {
523         int colA = SkColorGetA(color);
524         int colR = SkColorGetR(color);
525         int colG = SkColorGetG(color);
526         int colB = SkColorGetB(color);
527 
528         colA = SkAlpha255To256(colA);
529 
530         uint16x8_t vcolA = vdupq_n_u16(colA);
531         uint8x8_t vcolR = vdup_n_u8(colR);
532         uint8x8_t vcolG = vdup_n_u8(colG);
533         uint8x8_t vcolB = vdup_n_u8(colB);
534 
535         while (width >= 8) {
536             uint8x8x4_t vdst;
537             uint16x8_t vmask;
538             uint16x8_t vmaskR, vmaskG, vmaskB;
539 
540             vdst = vld4_u8((uint8_t*)dst);
541             vmask = vld1q_u16(src);
542 
543             // Get all the color masks on 5 bits
544             vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
545             vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
546                                  SK_B16_BITS + SK_R16_BITS + 1);
547             vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);
548 
549             // Upscale to 0..32
550             vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
551             vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
552             vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);
553 
554             vmaskR = vshrq_n_u16(vmaskR * vcolA, 8);
555             vmaskG = vshrq_n_u16(vmaskG * vcolA, 8);
556             vmaskB = vshrq_n_u16(vmaskB * vcolA, 8);
557 
558             vdst.val[NEON_A] = vdup_n_u8(0xFF);
559             vdst.val[NEON_R] = blend_32_neon(vcolR, vdst.val[NEON_R], vmaskR);
560             vdst.val[NEON_G] = blend_32_neon(vcolG, vdst.val[NEON_G], vmaskG);
561             vdst.val[NEON_B] = blend_32_neon(vcolB, vdst.val[NEON_B], vmaskB);
562 
563             vst4_u8((uint8_t*)dst, vdst);
564 
565             dst += 8;
566             src += 8;
567             width -= 8;
568         }
569 
570         for (int i = 0; i < width; i++) {
571             dst[i] = blend_lcd16(colA, colR, colG, colB, dst[i], src[i]);
572         }
573     }
574 
575 #else
576 
blit_row_lcd16(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor)577     static inline void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[],
578                                       SkColor src, int width, SkPMColor) {
579         int srcA = SkColorGetA(src);
580         int srcR = SkColorGetR(src);
581         int srcG = SkColorGetG(src);
582         int srcB = SkColorGetB(src);
583 
584         srcA = SkAlpha255To256(srcA);
585 
586         for (int i = 0; i < width; i++) {
587             dst[i] = blend_lcd16(srcA, srcR, srcG, srcB, dst[i], mask[i]);
588         }
589     }
590 
blit_row_lcd16_opaque(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor opaqueDst)591     static inline void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[],
592                                              SkColor src, int width,
593                                              SkPMColor opaqueDst) {
594         int srcR = SkColorGetR(src);
595         int srcG = SkColorGetG(src);
596         int srcB = SkColorGetB(src);
597 
598         for (int i = 0; i < width; i++) {
599             dst[i] = blend_lcd16_opaque(srcR, srcG, srcB, dst[i], mask[i], opaqueDst);
600         }
601     }
602 
603 #endif
604 
blit_color(const SkPixmap & device,const SkMask & mask,const SkIRect & clip,SkColor color)605 static bool blit_color(const SkPixmap& device,
606                        const SkMask& mask,
607                        const SkIRect& clip,
608                        SkColor color) {
609     int x = clip.fLeft,
610         y = clip.fTop;
611 
612     if (device.colorType() == kN32_SkColorType && mask.fFormat == SkMask::kA8_Format) {
613         SkOpts::blit_mask_d32_a8(device.writable_addr32(x,y), device.rowBytes(),
614                                  (const SkAlpha*)mask.getAddr(x,y), mask.fRowBytes,
615                                  color, clip.width(), clip.height());
616         return true;
617     }
618 
619     if (device.colorType() == kN32_SkColorType && mask.fFormat == SkMask::kLCD16_Format) {
620         auto dstRow  = device.writable_addr32(x,y);
621         auto maskRow = (const uint16_t*)mask.getAddr(x,y);
622 
623         auto blit_row = blit_row_lcd16;
624         SkPMColor opaqueDst = 0;  // ignored unless opaque
625 
626         if (0xff == SkColorGetA(color)) {
627             blit_row  = blit_row_lcd16_opaque;
628             opaqueDst = SkPreMultiplyColor(color);
629         }
630 
631         for (int height = clip.height(); height --> 0; ) {
632             blit_row(dstRow, maskRow, color, clip.width(), opaqueDst);
633 
634             dstRow  = (SkPMColor*)     ((      char*) dstRow + device.rowBytes());
635             maskRow = (const uint16_t*)((const char*)maskRow +  mask.fRowBytes);
636         }
637         return true;
638     }
639 
640     return false;
641 }
642 
643 ///////////////////////////////////////////////////////////////////////////////
644 
SkARGB32_Blit32(const SkPixmap & device,const SkMask & mask,const SkIRect & clip,SkPMColor srcColor)645 static void SkARGB32_Blit32(const SkPixmap& device, const SkMask& mask,
646                             const SkIRect& clip, SkPMColor srcColor) {
647     U8CPU alpha = SkGetPackedA32(srcColor);
648     unsigned flags = SkBlitRow::kSrcPixelAlpha_Flag32;
649     if (alpha != 255) {
650         flags |= SkBlitRow::kGlobalAlpha_Flag32;
651     }
652     SkBlitRow::Proc32 proc = SkBlitRow::Factory32(flags);
653 
654     int x = clip.fLeft;
655     int y = clip.fTop;
656     int width = clip.width();
657     int height = clip.height();
658 
659     SkPMColor* dstRow = device.writable_addr32(x, y);
660     const SkPMColor* srcRow = reinterpret_cast<const SkPMColor*>(mask.getAddr8(x, y));
661 
662     do {
663         proc(dstRow, srcRow, width, alpha);
664         dstRow = (SkPMColor*)((char*)dstRow + device.rowBytes());
665         srcRow = (const SkPMColor*)((const char*)srcRow + mask.fRowBytes);
666     } while (--height != 0);
667 }
668 
669 //////////////////////////////////////////////////////////////////////////////////////
670 
SkARGB32_Blitter(const SkPixmap & device,const SkPaint & paint)671 SkARGB32_Blitter::SkARGB32_Blitter(const SkPixmap& device, const SkPaint& paint)
672         : INHERITED(device) {
673     SkColor color = paint.getColor();
674     fColor = color;
675 
676     fSrcA = SkColorGetA(color);
677     unsigned scale = SkAlpha255To256(fSrcA);
678     fSrcR = SkAlphaMul(SkColorGetR(color), scale);
679     fSrcG = SkAlphaMul(SkColorGetG(color), scale);
680     fSrcB = SkAlphaMul(SkColorGetB(color), scale);
681 
682     fPMColor = SkPackARGB32(fSrcA, fSrcR, fSrcG, fSrcB);
683 }
684 
justAnOpaqueColor(uint32_t * value)685 const SkPixmap* SkARGB32_Blitter::justAnOpaqueColor(uint32_t* value) {
686     if (255 == fSrcA) {
687         *value = fPMColor;
688         return &fDevice;
689     }
690     return nullptr;
691 }
692 
693 #if defined _WIN32  // disable warning : local variable used without having been initialized
694 #pragma warning ( push )
695 #pragma warning ( disable : 4701 )
696 #endif
697 
blitH(int x,int y,int width)698 void SkARGB32_Blitter::blitH(int x, int y, int width) {
699     SkASSERT(x >= 0 && y >= 0 && x + width <= fDevice.width());
700 
701     uint32_t* device = fDevice.writable_addr32(x, y);
702     SkBlitRow::Color32(device, device, width, fPMColor);
703 }
704 
blitAntiH(int x,int y,const SkAlpha antialias[],const int16_t runs[])705 void SkARGB32_Blitter::blitAntiH(int x, int y, const SkAlpha antialias[],
706                                  const int16_t runs[]) {
707     if (fSrcA == 0) {
708         return;
709     }
710 
711     uint32_t    color = fPMColor;
712     uint32_t*   device = fDevice.writable_addr32(x, y);
713     unsigned    opaqueMask = fSrcA; // if fSrcA is 0xFF, then we will catch the fast opaque case
714 
715     for (;;) {
716         int count = runs[0];
717         SkASSERT(count >= 0);
718         if (count <= 0) {
719             return;
720         }
721         unsigned aa = antialias[0];
722         if (aa) {
723             if ((opaqueMask & aa) == 255) {
724                 sk_memset32(device, color, count);
725             } else {
726                 uint32_t sc = SkAlphaMulQ(color, SkAlpha255To256(aa));
727                 SkBlitRow::Color32(device, device, count, sc);
728             }
729         }
730         runs += count;
731         antialias += count;
732         device += count;
733     }
734 }
735 
blitAntiH2(int x,int y,U8CPU a0,U8CPU a1)736 void SkARGB32_Blitter::blitAntiH2(int x, int y, U8CPU a0, U8CPU a1) {
737     uint32_t* device = fDevice.writable_addr32(x, y);
738     SkDEBUGCODE((void)fDevice.writable_addr32(x + 1, y);)
739 
740     device[0] = SkBlendARGB32(fPMColor, device[0], a0);
741     device[1] = SkBlendARGB32(fPMColor, device[1], a1);
742 }
743 
blitAntiV2(int x,int y,U8CPU a0,U8CPU a1)744 void SkARGB32_Blitter::blitAntiV2(int x, int y, U8CPU a0, U8CPU a1) {
745     uint32_t* device = fDevice.writable_addr32(x, y);
746     SkDEBUGCODE((void)fDevice.writable_addr32(x, y + 1);)
747 
748     device[0] = SkBlendARGB32(fPMColor, device[0], a0);
749     device = (uint32_t*)((char*)device + fDevice.rowBytes());
750     device[0] = SkBlendARGB32(fPMColor, device[0], a1);
751 }
752 
753 //////////////////////////////////////////////////////////////////////////////////////
754 
// Stores `color` into each of the eight pixels dst[0..7] whose bit is set in
// `mask`: bit 0x80 selects dst[0], bit 0x01 selects dst[7].
// Every argument is parenthesized so that expression arguments (for example
// `a | b` as the mask or `row + 8` as dst) expand with the intended
// precedence. Arguments are still evaluated several times, so they must not
// have side effects.
#define solid_8_pixels(mask, dst, color)            \
    do {                                            \
        if ((mask) & 0x80) (dst)[0] = (color);      \
        if ((mask) & 0x40) (dst)[1] = (color);      \
        if ((mask) & 0x20) (dst)[2] = (color);      \
        if ((mask) & 0x10) (dst)[3] = (color);      \
        if ((mask) & 0x08) (dst)[4] = (color);      \
        if ((mask) & 0x04) (dst)[5] = (color);      \
        if ((mask) & 0x02) (dst)[6] = (color);      \
        if ((mask) & 0x01) (dst)[7] = (color);      \
    } while (0)
766 
// Configure and include SkBlitBWMaskTemplate.h to produce SkARGB32_BlitBW,
// which is invoked elsewhere in this file as
// SkARGB32_BlitBW(device, mask, clip, color). The extra `color` argument is
// declared by SK_BLITBWMASK_ARGS, and each group of eight mask bits is
// written with solid_8_pixels() into uint32_t pixels obtained through
// writable_addr32.
// NOTE(review): presumably the template #undefs these SK_BLITBWMASK_* macros
// so they can be redefined for the next instantiation — confirm in the header.
#define SK_BLITBWMASK_NAME                  SkARGB32_BlitBW
#define SK_BLITBWMASK_ARGS                  , SkPMColor color
#define SK_BLITBWMASK_BLIT8(mask, dst)      solid_8_pixels(mask, dst, color)
#define SK_BLITBWMASK_GETADDR               writable_addr32
#define SK_BLITBWMASK_DEVTYPE               uint32_t
#include "SkBlitBWMaskTemplate.h"
773 
// For each of the eight pixels dst[0..7] whose bit is set in `mask` (bit 0x80
// selects dst[0], bit 0x01 selects dst[7]), replaces the pixel with
// sc + SkAlphaMulQ(pixel, dst_scale).
// Every argument is parenthesized so that expression arguments expand with
// the intended precedence. Arguments are still evaluated several times, so
// they must not have side effects.
#define blend_8_pixels(mask, dst, sc, dst_scale)                                      \
    do {                                                                              \
        if ((mask) & 0x80) { (dst)[0] = (sc) + SkAlphaMulQ((dst)[0], (dst_scale)); }  \
        if ((mask) & 0x40) { (dst)[1] = (sc) + SkAlphaMulQ((dst)[1], (dst_scale)); }  \
        if ((mask) & 0x20) { (dst)[2] = (sc) + SkAlphaMulQ((dst)[2], (dst_scale)); }  \
        if ((mask) & 0x10) { (dst)[3] = (sc) + SkAlphaMulQ((dst)[3], (dst_scale)); }  \
        if ((mask) & 0x08) { (dst)[4] = (sc) + SkAlphaMulQ((dst)[4], (dst_scale)); }  \
        if ((mask) & 0x04) { (dst)[5] = (sc) + SkAlphaMulQ((dst)[5], (dst_scale)); }  \
        if ((mask) & 0x02) { (dst)[6] = (sc) + SkAlphaMulQ((dst)[6], (dst_scale)); }  \
        if ((mask) & 0x01) { (dst)[7] = (sc) + SkAlphaMulQ((dst)[7], (dst_scale)); }  \
    } while (0)
785 
// Configure and include SkBlitBWMaskTemplate.h a second time to produce
// SkARGB32_BlendBW, which is invoked elsewhere in this file as
// SkARGB32_BlendBW(device, mask, clip, sc, dst_scale). The extra `sc` and
// `dst_scale` arguments are declared by SK_BLITBWMASK_ARGS, and each group of
// eight mask bits is blended with blend_8_pixels() into uint32_t pixels
// obtained through writable_addr32.
#define SK_BLITBWMASK_NAME                  SkARGB32_BlendBW
#define SK_BLITBWMASK_ARGS                  , uint32_t sc, unsigned dst_scale
#define SK_BLITBWMASK_BLIT8(mask, dst)      blend_8_pixels(mask, dst, sc, dst_scale)
#define SK_BLITBWMASK_GETADDR               writable_addr32
#define SK_BLITBWMASK_DEVTYPE               uint32_t
#include "SkBlitBWMaskTemplate.h"
792 
blitMask(const SkMask & mask,const SkIRect & clip)793 void SkARGB32_Blitter::blitMask(const SkMask& mask, const SkIRect& clip) {
794     SkASSERT(mask.fBounds.contains(clip));
795     SkASSERT(fSrcA != 0xFF);
796 
797     if (fSrcA == 0) {
798         return;
799     }
800 
801     if (blit_color(fDevice, mask, clip, fColor)) {
802         return;
803     }
804 
805     switch (mask.fFormat) {
806         case SkMask::kBW_Format:
807             SkARGB32_BlendBW(fDevice, mask, clip, fPMColor, SkAlpha255To256(255 - fSrcA));
808             break;
809         case SkMask::kARGB32_Format:
810             SkARGB32_Blit32(fDevice, mask, clip, fPMColor);
811             break;
812         default:
813             SK_ABORT("Mask format not handled.");
814     }
815 }
816 
blitMask(const SkMask & mask,const SkIRect & clip)817 void SkARGB32_Opaque_Blitter::blitMask(const SkMask& mask,
818                                        const SkIRect& clip) {
819     SkASSERT(mask.fBounds.contains(clip));
820 
821     if (blit_color(fDevice, mask, clip, fColor)) {
822         return;
823     }
824 
825     switch (mask.fFormat) {
826         case SkMask::kBW_Format:
827             SkARGB32_BlitBW(fDevice, mask, clip, fPMColor);
828             break;
829         case SkMask::kARGB32_Format:
830             SkARGB32_Blit32(fDevice, mask, clip, fPMColor);
831             break;
832         default:
833             SK_ABORT("Mask format not handled.");
834     }
835 }
836 
blitAntiH2(int x,int y,U8CPU a0,U8CPU a1)837 void SkARGB32_Opaque_Blitter::blitAntiH2(int x, int y, U8CPU a0, U8CPU a1) {
838     uint32_t* device = fDevice.writable_addr32(x, y);
839     SkDEBUGCODE((void)fDevice.writable_addr32(x + 1, y);)
840 
841     device[0] = SkFastFourByteInterp(fPMColor, device[0], a0);
842     device[1] = SkFastFourByteInterp(fPMColor, device[1], a1);
843 }
844 
blitAntiV2(int x,int y,U8CPU a0,U8CPU a1)845 void SkARGB32_Opaque_Blitter::blitAntiV2(int x, int y, U8CPU a0, U8CPU a1) {
846     uint32_t* device = fDevice.writable_addr32(x, y);
847     SkDEBUGCODE((void)fDevice.writable_addr32(x, y + 1);)
848 
849     device[0] = SkFastFourByteInterp(fPMColor, device[0], a0);
850     device = (uint32_t*)((char*)device + fDevice.rowBytes());
851     device[0] = SkFastFourByteInterp(fPMColor, device[0], a1);
852 }
853 
854 ///////////////////////////////////////////////////////////////////////////////
855 
blitV(int x,int y,int height,SkAlpha alpha)856 void SkARGB32_Blitter::blitV(int x, int y, int height, SkAlpha alpha) {
857     if (alpha == 0 || fSrcA == 0) {
858         return;
859     }
860 
861     uint32_t* device = fDevice.writable_addr32(x, y);
862     uint32_t  color = fPMColor;
863 
864     if (alpha != 255) {
865         color = SkAlphaMulQ(color, SkAlpha255To256(alpha));
866     }
867 
868     unsigned dst_scale = SkAlpha255To256(255 - SkGetPackedA32(color));
869     size_t rowBytes = fDevice.rowBytes();
870     while (--height >= 0) {
871         device[0] = color + SkAlphaMulQ(device[0], dst_scale);
872         device = (uint32_t*)((char*)device + rowBytes);
873     }
874 }
875 
blitRect(int x,int y,int width,int height)876 void SkARGB32_Blitter::blitRect(int x, int y, int width, int height) {
877     SkASSERT(x >= 0 && y >= 0 && x + width <= fDevice.width() && y + height <= fDevice.height());
878 
879     if (fSrcA == 0) {
880         return;
881     }
882 
883     uint32_t*   device = fDevice.writable_addr32(x, y);
884     uint32_t    color = fPMColor;
885     size_t      rowBytes = fDevice.rowBytes();
886 
887     while (--height >= 0) {
888         SkBlitRow::Color32(device, device, width, color);
889         device = (uint32_t*)((char*)device + rowBytes);
890     }
891 }
892 
893 #if defined _WIN32
894 #pragma warning ( pop )
895 #endif
896 
897 ///////////////////////////////////////////////////////////////////////
898 
blitAntiH(int x,int y,const SkAlpha antialias[],const int16_t runs[])899 void SkARGB32_Black_Blitter::blitAntiH(int x, int y, const SkAlpha antialias[],
900                                        const int16_t runs[]) {
901     uint32_t*   device = fDevice.writable_addr32(x, y);
902     SkPMColor   black = (SkPMColor)(SK_A32_MASK << SK_A32_SHIFT);
903 
904     for (;;) {
905         int count = runs[0];
906         SkASSERT(count >= 0);
907         if (count <= 0) {
908             return;
909         }
910         unsigned aa = antialias[0];
911         if (aa) {
912             if (aa == 255) {
913                 sk_memset32(device, black, count);
914             } else {
915                 SkPMColor src = aa << SK_A32_SHIFT;
916                 unsigned dst_scale = 256 - aa;
917                 int n = count;
918                 do {
919                     --n;
920                     device[n] = src + SkAlphaMulQ(device[n], dst_scale);
921                 } while (n > 0);
922             }
923         }
924         runs += count;
925         antialias += count;
926         device += count;
927     }
928 }
929 
blitAntiH2(int x,int y,U8CPU a0,U8CPU a1)930 void SkARGB32_Black_Blitter::blitAntiH2(int x, int y, U8CPU a0, U8CPU a1) {
931     uint32_t* device = fDevice.writable_addr32(x, y);
932     SkDEBUGCODE((void)fDevice.writable_addr32(x + 1, y);)
933 
934     device[0] = (a0 << SK_A32_SHIFT) + SkAlphaMulQ(device[0], 256 - a0);
935     device[1] = (a1 << SK_A32_SHIFT) + SkAlphaMulQ(device[1], 256 - a1);
936 }
937 
blitAntiV2(int x,int y,U8CPU a0,U8CPU a1)938 void SkARGB32_Black_Blitter::blitAntiV2(int x, int y, U8CPU a0, U8CPU a1) {
939     uint32_t* device = fDevice.writable_addr32(x, y);
940     SkDEBUGCODE((void)fDevice.writable_addr32(x, y + 1);)
941 
942     device[0] = (a0 << SK_A32_SHIFT) + SkAlphaMulQ(device[0], 256 - a0);
943     device = (uint32_t*)((char*)device + fDevice.rowBytes());
944     device[0] = (a1 << SK_A32_SHIFT) + SkAlphaMulQ(device[0], 256 - a1);
945 }
946 
947 ///////////////////////////////////////////////////////////////////////////////
948 
949 // Special version of SkBlitRow::Factory32 that knows we're in kSrc_Mode,
950 // instead of kSrcOver_Mode
blend_srcmode(SkPMColor * SK_RESTRICT device,const SkPMColor * SK_RESTRICT span,int count,U8CPU aa)951 static void blend_srcmode(SkPMColor* SK_RESTRICT device,
952                           const SkPMColor* SK_RESTRICT span,
953                           int count, U8CPU aa) {
954     int aa256 = SkAlpha255To256(aa);
955     for (int i = 0; i < count; ++i) {
956         device[i] = SkFourByteInterp256(span[i], device[i], aa256);
957     }
958 }
959 
SkARGB32_Shader_Blitter(const SkPixmap & device,const SkPaint & paint,SkShaderBase::Context * shaderContext)960 SkARGB32_Shader_Blitter::SkARGB32_Shader_Blitter(const SkPixmap& device,
961         const SkPaint& paint, SkShaderBase::Context* shaderContext)
962     : INHERITED(device, paint, shaderContext)
963 {
964     fBuffer = (SkPMColor*)sk_malloc_throw(device.width() * (sizeof(SkPMColor)));
965 
966     fXfermode = SkXfermode::Peek(paint.getBlendMode());
967 
968     int flags = 0;
969     if (!(shaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag)) {
970         flags |= SkBlitRow::kSrcPixelAlpha_Flag32;
971     }
972     // we call this on the output from the shader
973     fProc32 = SkBlitRow::Factory32(flags);
974     // we call this on the output from the shader + alpha from the aa buffer
975     fProc32Blend = SkBlitRow::Factory32(flags | SkBlitRow::kGlobalAlpha_Flag32);
976 
977     fShadeDirectlyIntoDevice = false;
978     if (fXfermode == nullptr) {
979         if (shaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag) {
980             fShadeDirectlyIntoDevice = true;
981         }
982     } else {
983         if (SkBlendMode::kSrc == paint.getBlendMode()) {
984             fShadeDirectlyIntoDevice = true;
985             fProc32Blend = blend_srcmode;
986         }
987     }
988 
989     fConstInY = SkToBool(shaderContext->getFlags() & SkShaderBase::kConstInY32_Flag);
990 }
991 
~SkARGB32_Shader_Blitter()992 SkARGB32_Shader_Blitter::~SkARGB32_Shader_Blitter() {
993     sk_free(fBuffer);
994 }
995 
blitH(int x,int y,int width)996 void SkARGB32_Shader_Blitter::blitH(int x, int y, int width) {
997     SkASSERT(x >= 0 && y >= 0 && x + width <= fDevice.width());
998 
999     uint32_t* device = fDevice.writable_addr32(x, y);
1000 
1001     if (fShadeDirectlyIntoDevice) {
1002         fShaderContext->shadeSpan(x, y, device, width);
1003     } else {
1004         SkPMColor*  span = fBuffer;
1005         fShaderContext->shadeSpan(x, y, span, width);
1006         if (fXfermode) {
1007             fXfermode->xfer32(device, span, width, nullptr);
1008         } else {
1009             fProc32(device, span, width, 255);
1010         }
1011     }
1012 }
1013 
blitRect(int x,int y,int width,int height)1014 void SkARGB32_Shader_Blitter::blitRect(int x, int y, int width, int height) {
1015     SkASSERT(x >= 0 && y >= 0 &&
1016              x + width <= fDevice.width() && y + height <= fDevice.height());
1017 
1018     uint32_t*  device = fDevice.writable_addr32(x, y);
1019     size_t     deviceRB = fDevice.rowBytes();
1020     auto*      shaderContext = fShaderContext;
1021     SkPMColor* span = fBuffer;
1022 
1023     if (fConstInY) {
1024         if (fShadeDirectlyIntoDevice) {
1025             // shade the first row directly into the device
1026             shaderContext->shadeSpan(x, y, device, width);
1027             span = device;
1028             while (--height > 0) {
1029                 device = (uint32_t*)((char*)device + deviceRB);
1030                 memcpy(device, span, width << 2);
1031             }
1032         } else {
1033             shaderContext->shadeSpan(x, y, span, width);
1034             SkXfermode* xfer = fXfermode;
1035             if (xfer) {
1036                 do {
1037                     xfer->xfer32(device, span, width, nullptr);
1038                     y += 1;
1039                     device = (uint32_t*)((char*)device + deviceRB);
1040                 } while (--height > 0);
1041             } else {
1042                 SkBlitRow::Proc32 proc = fProc32;
1043                 do {
1044                     proc(device, span, width, 255);
1045                     y += 1;
1046                     device = (uint32_t*)((char*)device + deviceRB);
1047                 } while (--height > 0);
1048             }
1049         }
1050         return;
1051     }
1052 
1053     if (fShadeDirectlyIntoDevice) {
1054         do {
1055             shaderContext->shadeSpan(x, y, device, width);
1056             y += 1;
1057             device = (uint32_t*)((char*)device + deviceRB);
1058         } while (--height > 0);
1059     } else {
1060         SkXfermode* xfer = fXfermode;
1061         if (xfer) {
1062             do {
1063                 shaderContext->shadeSpan(x, y, span, width);
1064                 xfer->xfer32(device, span, width, nullptr);
1065                 y += 1;
1066                 device = (uint32_t*)((char*)device + deviceRB);
1067             } while (--height > 0);
1068         } else {
1069             SkBlitRow::Proc32 proc = fProc32;
1070             do {
1071                 shaderContext->shadeSpan(x, y, span, width);
1072                 proc(device, span, width, 255);
1073                 y += 1;
1074                 device = (uint32_t*)((char*)device + deviceRB);
1075             } while (--height > 0);
1076         }
1077     }
1078 }
1079 
blitAntiH(int x,int y,const SkAlpha antialias[],const int16_t runs[])1080 void SkARGB32_Shader_Blitter::blitAntiH(int x, int y, const SkAlpha antialias[],
1081                                         const int16_t runs[]) {
1082     SkPMColor* span = fBuffer;
1083     uint32_t*  device = fDevice.writable_addr32(x, y);
1084     auto*      shaderContext = fShaderContext;
1085 
1086     if (fXfermode && !fShadeDirectlyIntoDevice) {
1087         for (;;) {
1088             SkXfermode* xfer = fXfermode;
1089 
1090             int count = *runs;
1091             if (count <= 0)
1092                 break;
1093             int aa = *antialias;
1094             if (aa) {
1095                 shaderContext->shadeSpan(x, y, span, count);
1096                 if (aa == 255) {
1097                     xfer->xfer32(device, span, count, nullptr);
1098                 } else {
1099                     // count is almost always 1
1100                     for (int i = count - 1; i >= 0; --i) {
1101                         xfer->xfer32(&device[i], &span[i], 1, antialias);
1102                     }
1103                 }
1104             }
1105             device += count;
1106             runs += count;
1107             antialias += count;
1108             x += count;
1109         }
1110     } else if (fShadeDirectlyIntoDevice ||
1111                (shaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag)) {
1112         for (;;) {
1113             int count = *runs;
1114             if (count <= 0) {
1115                 break;
1116             }
1117             int aa = *antialias;
1118             if (aa) {
1119                 if (aa == 255) {
1120                     // cool, have the shader draw right into the device
1121                     shaderContext->shadeSpan(x, y, device, count);
1122                 } else {
1123                     shaderContext->shadeSpan(x, y, span, count);
1124                     fProc32Blend(device, span, count, aa);
1125                 }
1126             }
1127             device += count;
1128             runs += count;
1129             antialias += count;
1130             x += count;
1131         }
1132     } else {
1133         for (;;) {
1134             int count = *runs;
1135             if (count <= 0) {
1136                 break;
1137             }
1138             int aa = *antialias;
1139             if (aa) {
1140                 shaderContext->shadeSpan(x, y, span, count);
1141                 if (aa == 255) {
1142                     fProc32(device, span, count, 255);
1143                 } else {
1144                     fProc32Blend(device, span, count, aa);
1145                 }
1146             }
1147             device += count;
1148             runs += count;
1149             antialias += count;
1150             x += count;
1151         }
1152     }
1153 }
1154 
blend_row_A8(SkPMColor * dst,const void * vmask,const SkPMColor * src,int n)1155 static void blend_row_A8(SkPMColor* dst, const void* vmask, const SkPMColor* src, int n) {
1156     auto mask = (const uint8_t*)vmask;
1157 
1158 #ifdef SK_SUPPORT_LEGACY_A8_MASKBLITTER
1159     for (int i = 0; i < n; ++i) {
1160         if (mask[i]) {
1161             dst[i] = SkBlendARGB32(src[i], dst[i], mask[i]);
1162         }
1163     }
1164 #else
1165     Sk4px::MapDstSrcAlpha(n, dst, src, mask, [](const Sk4px& d, const Sk4px& s, const Sk4px& aa) {
1166         const auto s_aa = s.approxMulDiv255(aa);
1167         return s_aa + d.approxMulDiv255(s_aa.alphas().inv());
1168     });
1169 #endif
1170 }
1171 
blend_row_A8_opaque(SkPMColor * dst,const void * vmask,const SkPMColor * src,int n)1172 static void blend_row_A8_opaque(SkPMColor* dst, const void* vmask, const SkPMColor* src, int n) {
1173     auto mask = (const uint8_t*)vmask;
1174 
1175 #ifdef SK_SUPPORT_LEGACY_A8_MASKBLITTER
1176     for (int i = 0; i < n; ++i) {
1177         if (int m = mask[i]) {
1178             m += (m >> 7);
1179             dst[i] = SkAlphaMulQ(src[i], m) + SkAlphaMulQ(dst[i], 256 - m);
1180         }
1181     }
1182 #else
1183     Sk4px::MapDstSrcAlpha(n, dst, src, mask, [](const Sk4px& d, const Sk4px& s, const Sk4px& aa) {
1184         return (s * aa + d * aa.inv()).div255();
1185     });
1186 #endif
1187 }
1188 
blend_row_lcd16(SkPMColor * dst,const void * vmask,const SkPMColor * src,int n)1189 static void blend_row_lcd16(SkPMColor* dst, const void* vmask, const SkPMColor* src, int n) {
1190     auto src_alpha_blend = [](int s, int d, int sa, int m) {
1191         return d + SkAlphaMul(s - SkAlphaMul(sa, d), m);
1192     };
1193 
1194     auto upscale_31_to_255 = [](int v) {
1195         return (v << 3) | (v >> 2);
1196     };
1197 
1198     auto mask = (const uint16_t*)vmask;
1199     for (int i = 0; i < n; ++i) {
1200         uint16_t m = mask[i];
1201         if (0 == m) {
1202             continue;
1203         }
1204 
1205         SkPMColor s = src[i];
1206         SkPMColor d = dst[i];
1207 
1208         int srcA = SkGetPackedA32(s);
1209         int srcR = SkGetPackedR32(s);
1210         int srcG = SkGetPackedG32(s);
1211         int srcB = SkGetPackedB32(s);
1212 
1213         srcA += srcA >> 7;
1214 
1215         // We're ignoring the least significant bit of the green coverage channel here.
1216         int maskR = SkGetPackedR16(m) >> (SK_R16_BITS - 5);
1217         int maskG = SkGetPackedG16(m) >> (SK_G16_BITS - 5);
1218         int maskB = SkGetPackedB16(m) >> (SK_B16_BITS - 5);
1219 
1220         // Scale up to 8-bit coverage to work with SkAlphaMul() in src_alpha_blend().
1221         maskR = upscale_31_to_255(maskR);
1222         maskG = upscale_31_to_255(maskG);
1223         maskB = upscale_31_to_255(maskB);
1224 
1225         // This LCD blit routine only works if the destination is opaque.
1226         dst[i] = SkPackARGB32(0xFF,
1227                               src_alpha_blend(srcR, SkGetPackedR32(d), srcA, maskR),
1228                               src_alpha_blend(srcG, SkGetPackedG32(d), srcA, maskG),
1229                               src_alpha_blend(srcB, SkGetPackedB32(d), srcA, maskB));
1230     }
1231 }
1232 
blend_row_LCD16_opaque(SkPMColor * dst,const void * vmask,const SkPMColor * src,int n)1233 static void blend_row_LCD16_opaque(SkPMColor* dst, const void* vmask, const SkPMColor* src, int n) {
1234     auto mask = (const uint16_t*)vmask;
1235 
1236     for (int i = 0; i < n; ++i) {
1237         uint16_t m = mask[i];
1238         if (0 == m) {
1239             continue;
1240         }
1241 
1242         SkPMColor s = src[i];
1243         SkPMColor d = dst[i];
1244 
1245         int srcR = SkGetPackedR32(s);
1246         int srcG = SkGetPackedG32(s);
1247         int srcB = SkGetPackedB32(s);
1248 
1249         // We're ignoring the least significant bit of the green coverage channel here.
1250         int maskR = SkGetPackedR16(m) >> (SK_R16_BITS - 5);
1251         int maskG = SkGetPackedG16(m) >> (SK_G16_BITS - 5);
1252         int maskB = SkGetPackedB16(m) >> (SK_B16_BITS - 5);
1253 
1254         // Now upscale them to 0..32, so we can use blend_32.
1255         maskR = upscale_31_to_32(maskR);
1256         maskG = upscale_31_to_32(maskG);
1257         maskB = upscale_31_to_32(maskB);
1258 
1259         // This LCD blit routine only works if the destination is opaque.
1260         dst[i] = SkPackARGB32(0xFF,
1261                               blend_32(srcR, SkGetPackedR32(d), maskR),
1262                               blend_32(srcG, SkGetPackedG32(d), maskG),
1263                               blend_32(srcB, SkGetPackedB32(d), maskB));
1264     }
1265 }
1266 
blitMask(const SkMask & mask,const SkIRect & clip)1267 void SkARGB32_Shader_Blitter::blitMask(const SkMask& mask, const SkIRect& clip) {
1268     // we only handle kA8 with an xfermode
1269     if (fXfermode && (SkMask::kA8_Format != mask.fFormat)) {
1270         this->INHERITED::blitMask(mask, clip);
1271         return;
1272     }
1273 
1274     SkASSERT(mask.fBounds.contains(clip));
1275 
1276     void (*blend_row)(SkPMColor*, const void* mask, const SkPMColor*, int) = nullptr;
1277 
1278     if (!fXfermode) {
1279         bool opaque = (fShaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag);
1280 
1281         if (mask.fFormat == SkMask::kA8_Format && opaque) {
1282             blend_row = blend_row_A8_opaque;
1283         } else if (mask.fFormat == SkMask::kA8_Format) {
1284             blend_row = blend_row_A8;
1285         } else if (mask.fFormat == SkMask::kLCD16_Format && opaque) {
1286             blend_row = blend_row_LCD16_opaque;
1287         } else if (mask.fFormat == SkMask::kLCD16_Format) {
1288             blend_row = blend_row_lcd16;
1289         } else {
1290             this->INHERITED::blitMask(mask, clip);
1291             return;
1292         }
1293     }
1294 
1295     const int x = clip.fLeft;
1296     const int width = clip.width();
1297     int y = clip.fTop;
1298     int height = clip.height();
1299 
1300     char* dstRow = (char*)fDevice.writable_addr32(x, y);
1301     const size_t dstRB = fDevice.rowBytes();
1302     const uint8_t* maskRow = (const uint8_t*)mask.getAddr(x, y);
1303     const size_t maskRB = mask.fRowBytes;
1304 
1305     SkPMColor* span = fBuffer;
1306 
1307     if (fXfermode) {
1308         SkASSERT(SkMask::kA8_Format == mask.fFormat);
1309         SkXfermode* xfer = fXfermode;
1310         do {
1311             fShaderContext->shadeSpan(x, y, span, width);
1312             xfer->xfer32(reinterpret_cast<SkPMColor*>(dstRow), span, width, maskRow);
1313             dstRow += dstRB;
1314             maskRow += maskRB;
1315             y += 1;
1316         } while (--height > 0);
1317     } else {
1318         SkASSERT(blend_row);
1319         do {
1320             fShaderContext->shadeSpan(x, y, span, width);
1321             blend_row(reinterpret_cast<SkPMColor*>(dstRow), maskRow, span, width);
1322             dstRow += dstRB;
1323             maskRow += maskRB;
1324             y += 1;
1325         } while (--height > 0);
1326     }
1327 }
1328 
blitV(int x,int y,int height,SkAlpha alpha)1329 void SkARGB32_Shader_Blitter::blitV(int x, int y, int height, SkAlpha alpha) {
1330     SkASSERT(x >= 0 && y >= 0 && y + height <= fDevice.height());
1331 
1332     uint32_t* device = fDevice.writable_addr32(x, y);
1333     size_t    deviceRB = fDevice.rowBytes();
1334 
1335     if (fConstInY) {
1336         SkPMColor c;
1337         fShaderContext->shadeSpan(x, y, &c, 1);
1338 
1339         if (fShadeDirectlyIntoDevice) {
1340             if (255 == alpha) {
1341                 do {
1342                     *device = c;
1343                     device = (uint32_t*)((char*)device + deviceRB);
1344                 } while (--height > 0);
1345             } else {
1346                 do {
1347                     *device = SkFourByteInterp(c, *device, alpha);
1348                     device = (uint32_t*)((char*)device + deviceRB);
1349                 } while (--height > 0);
1350             }
1351         } else {
1352             SkXfermode* xfer = fXfermode;
1353             if (xfer) {
1354                 do {
1355                     xfer->xfer32(device, &c, 1, &alpha);
1356                     device = (uint32_t*)((char*)device + deviceRB);
1357                 } while (--height > 0);
1358             } else {
1359                 SkBlitRow::Proc32 proc = (255 == alpha) ? fProc32 : fProc32Blend;
1360                 do {
1361                     proc(device, &c, 1, alpha);
1362                     device = (uint32_t*)((char*)device + deviceRB);
1363                 } while (--height > 0);
1364             }
1365         }
1366         return;
1367     }
1368 
1369     if (fShadeDirectlyIntoDevice) {
1370         if (255 == alpha) {
1371             do {
1372                 fShaderContext->shadeSpan(x, y, device, 1);
1373                 y += 1;
1374                 device = (uint32_t*)((char*)device + deviceRB);
1375             } while (--height > 0);
1376         } else {
1377             do {
1378                 SkPMColor c;
1379                 fShaderContext->shadeSpan(x, y, &c, 1);
1380                 *device = SkFourByteInterp(c, *device, alpha);
1381                 y += 1;
1382                 device = (uint32_t*)((char*)device + deviceRB);
1383             } while (--height > 0);
1384         }
1385     } else {
1386         SkPMColor* span = fBuffer;
1387         SkXfermode* xfer = fXfermode;
1388         if (xfer) {
1389             do {
1390                 fShaderContext->shadeSpan(x, y, span, 1);
1391                 xfer->xfer32(device, span, 1, &alpha);
1392                 y += 1;
1393                 device = (uint32_t*)((char*)device + deviceRB);
1394             } while (--height > 0);
1395         } else {
1396             SkBlitRow::Proc32 proc = (255 == alpha) ? fProc32 : fProc32Blend;
1397             do {
1398                 fShaderContext->shadeSpan(x, y, span, 1);
1399                 proc(device, span, 1, alpha);
1400                 y += 1;
1401                 device = (uint32_t*)((char*)device + deviceRB);
1402             } while (--height > 0);
1403         }
1404     }
1405 }
1406