• Home
  • History
  • Annotate
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
7 
8 #include <emmintrin.h>
9 #include "SkBitmapProcState_opts_SSE2.h"
10 #include "SkBlitRow_opts_SSE2.h"
11 #include "SkColorPriv.h"
12 #include "SkColor_opts_SSE2.h"
13 #include "SkDither.h"
14 #include "SkMSAN.h"
15 #include "SkUtils.h"
16 
17 /* SSE2 version of S32_Blend_BlitRow32()
18  * portable version is in core/SkBlitRow_D32.cpp
19  */
S32_Blend_BlitRow32_SSE2(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)20 void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
21                               const SkPMColor* SK_RESTRICT src,
22                               int count, U8CPU alpha) {
23     SkASSERT(alpha <= 255);
24     if (count <= 0) {
25         return;
26     }
27 
28     uint32_t src_scale = SkAlpha255To256(alpha);
29 
30     if (count >= 4) {
31         SkASSERT(((size_t)dst & 0x03) == 0);
32         while (((size_t)dst & 0x0F) != 0) {
33             *dst = SkPMLerp(*src, *dst, src_scale);
34             src++;
35             dst++;
36             count--;
37         }
38 
39         const __m128i *s = reinterpret_cast<const __m128i*>(src);
40         __m128i *d = reinterpret_cast<__m128i*>(dst);
41 
42         while (count >= 4) {
43             // Load 4 pixels each of src and dest.
44             __m128i src_pixel = _mm_loadu_si128(s);
45             __m128i dst_pixel = _mm_load_si128(d);
46 
47             __m128i result = SkPMLerp_SSE2(src_pixel, dst_pixel, src_scale);
48             _mm_store_si128(d, result);
49             s++;
50             d++;
51             count -= 4;
52         }
53         src = reinterpret_cast<const SkPMColor*>(s);
54         dst = reinterpret_cast<SkPMColor*>(d);
55     }
56 
57     while (count > 0) {
58         *dst = SkPMLerp(*src, *dst, src_scale);
59         src++;
60         dst++;
61         count--;
62     }
63 }
64 
S32A_Blend_BlitRow32_SSE2(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)65 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
66                                const SkPMColor* SK_RESTRICT src,
67                                int count, U8CPU alpha) {
68     SkASSERT(alpha <= 255);
69     if (count <= 0) {
70         return;
71     }
72 
73     if (count >= 4) {
74         while (((size_t)dst & 0x0F) != 0) {
75             *dst = SkBlendARGB32(*src, *dst, alpha);
76             src++;
77             dst++;
78             count--;
79         }
80 
81         const __m128i *s = reinterpret_cast<const __m128i*>(src);
82         __m128i *d = reinterpret_cast<__m128i*>(dst);
83         while (count >= 4) {
84             // Load 4 pixels each of src and dest.
85             __m128i src_pixel = _mm_loadu_si128(s);
86             __m128i dst_pixel = _mm_load_si128(d);
87 
88             __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha);
89             _mm_store_si128(d, result);
90             s++;
91             d++;
92             count -= 4;
93         }
94         src = reinterpret_cast<const SkPMColor*>(s);
95         dst = reinterpret_cast<SkPMColor*>(d);
96     }
97 
98     while (count > 0) {
99         *dst = SkBlendARGB32(*src, *dst, alpha);
100         src++;
101         dst++;
102         count--;
103     }
104 }
105 
Color32A_D565_SSE2(uint16_t dst[],SkPMColor src,int count,int x,int y)106 void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x, int y) {
107     SkASSERT(count > 0);
108 
109     uint32_t src_expand = (SkGetPackedG32(src) << 24) |
110                           (SkGetPackedR32(src) << 13) |
111                           (SkGetPackedB32(src) << 2);
112     unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
113 
114     // Check if we have enough pixels to run SIMD
115     if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) {
116         __m128i* dst_wide;
117         const __m128i src_R_wide = _mm_set1_epi16(SkGetPackedR32(src) << 2);
118         const __m128i src_G_wide = _mm_set1_epi16(SkGetPackedG32(src) << 3);
119         const __m128i src_B_wide = _mm_set1_epi16(SkGetPackedB32(src) << 2);
120         const __m128i scale_wide = _mm_set1_epi16(scale);
121         const __m128i mask_blue  = _mm_set1_epi16(SK_B16_MASK);
122         const __m128i mask_green = _mm_set1_epi16(SK_G16_MASK << SK_G16_SHIFT);
123 
124         // Align dst to an even 16 byte address (0-7 pixels)
125         while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) {
126             *dst = SkBlend32_RGB16(src_expand, *dst, scale);
127             dst += 1;
128             count--;
129         }
130 
131         dst_wide = reinterpret_cast<__m128i*>(dst);
132         do {
133             // Load eight RGB565 pixels
134             __m128i pixels = _mm_load_si128(dst_wide);
135 
136             // Mask out sub-pixels
137             __m128i pixel_R = _mm_srli_epi16(pixels, SK_R16_SHIFT);
138             __m128i pixel_G = _mm_slli_epi16(pixels, SK_R16_BITS);
139             pixel_G = _mm_srli_epi16(pixel_G, SK_R16_BITS + SK_B16_BITS);
140             __m128i pixel_B = _mm_and_si128(pixels, mask_blue);
141 
142             // Scale with alpha
143             pixel_R = _mm_mullo_epi16(pixel_R, scale_wide);
144             pixel_G = _mm_mullo_epi16(pixel_G, scale_wide);
145             pixel_B = _mm_mullo_epi16(pixel_B, scale_wide);
146 
147             // Add src_X_wide and shift down again
148             pixel_R = _mm_add_epi16(pixel_R, src_R_wide);
149             pixel_R = _mm_srli_epi16(pixel_R, 5);
150             pixel_G = _mm_add_epi16(pixel_G, src_G_wide);
151             pixel_B = _mm_add_epi16(pixel_B, src_B_wide);
152             pixel_B = _mm_srli_epi16(pixel_B, 5);
153 
154             // Combine into RGB565 and store
155             pixel_R = _mm_slli_epi16(pixel_R, SK_R16_SHIFT);
156             pixel_G = _mm_and_si128(pixel_G, mask_green);
157             pixels = _mm_or_si128(pixel_R, pixel_G);
158             pixels = _mm_or_si128(pixels, pixel_B);
159             _mm_store_si128(dst_wide, pixels);
160             count -= 8;
161             dst_wide++;
162         } while (count >= 8);
163 
164         dst = reinterpret_cast<uint16_t*>(dst_wide);
165     }
166 
167     // Small loop to handle remaining pixels.
168     while (count > 0) {
169         *dst = SkBlend32_RGB16(src_expand, *dst, scale);
170         dst += 1;
171         count--;
172     }
173 }
174 
// The following (left) shifts cause the top 5 bits of the mask components to
// line up with the corresponding components in an SkPMColor.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
// A shift amount may come out negative; the SkPacked*16x5ToUnmasked*32x5_SSE2
// macros below then shift right by the negated amount instead.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

#if SK_R16x5_R32x5_SHIFT == 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#elif SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#endif

#if SK_G16x5_G32x5_SHIFT == 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#elif SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#endif

#if SK_B16x5_B32x5_SHIFT == 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#elif SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#endif
205 
SkBlendLCD16_SSE2(__m128i & src,__m128i & dst,__m128i & mask,__m128i & srcA)206 static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
207                                  __m128i &mask, __m128i &srcA) {
208     // In the following comments, the components of src, dst and mask are
209     // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
210     // by an R, G, B, or A suffix. Components of one of the four pixels that
211     // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
212     // example is the blue channel of the second destination pixel. Memory
213     // layout is shown for an ARGB byte order in a color value.
214 
215     // src and srcA store 8-bit values interleaved with zeros.
216     // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
217     // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
218     //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
219     // mask stores 16-bit values (compressed three channels) interleaved with zeros.
220     // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
221     // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
222     //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
223 
224     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
225     // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
226     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
227                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));
228 
229     // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
230     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
231                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));
232 
233     // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
234     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
235                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));
236 
237     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
238     // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
239     // 8-bit position
240     // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
241     //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
242     mask = _mm_or_si128(_mm_or_si128(r, g), b);
243 
244     // Interleave R,G,B into the lower byte of word.
245     // i.e. split the sixteen 8-bit values from mask into two sets of eight
246     // 16-bit values, padded by zero.
247     __m128i maskLo, maskHi;
248     // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
249     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
250     // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
251     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
252 
253     // Upscale from 0..31 to 0..32
254     // (allows to replace division by left-shift further down)
255     // Left-shift each component by 4 and add the result back to that component,
256     // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
257     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
258     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
259 
260     // Multiply each component of maskLo and maskHi by srcA
261     maskLo = _mm_mullo_epi16(maskLo, srcA);
262     maskHi = _mm_mullo_epi16(maskHi, srcA);
263 
264     // Left shift mask components by 8 (divide by 256)
265     maskLo = _mm_srli_epi16(maskLo, 8);
266     maskHi = _mm_srli_epi16(maskHi, 8);
267 
268     // Interleave R,G,B into the lower byte of the word
269     // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
270     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
271     // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
272     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
273 
274     // mask = (src - dst) * mask
275     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
276     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
277 
278     // mask = (src - dst) * mask >> 5
279     maskLo = _mm_srai_epi16(maskLo, 5);
280     maskHi = _mm_srai_epi16(maskHi, 5);
281 
282     // Add two pixels into result.
283     // result = dst + ((src - dst) * mask >> 5)
284     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
285     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
286 
287     // Pack into 4 32bit dst pixels.
288     // resultLo and resultHi contain eight 16-bit components (two pixels) each.
289     // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
290     // clamping to 255 if necessary.
291     return _mm_packus_epi16(resultLo, resultHi);
292 }
293 
SkBlendLCD16Opaque_SSE2(__m128i & src,__m128i & dst,__m128i & mask)294 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
295                                        __m128i &mask) {
296     // In the following comments, the components of src, dst and mask are
297     // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
298     // by an R, G, B, or A suffix. Components of one of the four pixels that
299     // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
300     // example is the blue channel of the second destination pixel. Memory
301     // layout is shown for an ARGB byte order in a color value.
302 
303     // src and srcA store 8-bit values interleaved with zeros.
304     // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
305     // mask stores 16-bit values (shown as high and low bytes) interleaved with
306     // zeros
307     // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
308     //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
309 
310     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
311     // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
312     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
313                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));
314 
315     // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
316     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
317                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));
318 
319     // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
320     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
321                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));
322 
323     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
324     // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
325     // 8-bit position
326     // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
327     //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
328     mask = _mm_or_si128(_mm_or_si128(r, g), b);
329 
330     // Interleave R,G,B into the lower byte of word.
331     // i.e. split the sixteen 8-bit values from mask into two sets of eight
332     // 16-bit values, padded by zero.
333     __m128i maskLo, maskHi;
334     // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
335     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
336     // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
337     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
338 
339     // Upscale from 0..31 to 0..32
340     // (allows to replace division by left-shift further down)
341     // Left-shift each component by 4 and add the result back to that component,
342     // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
343     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
344     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
345 
346     // Interleave R,G,B into the lower byte of the word
347     // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
348     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
349     // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
350     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
351 
352     // mask = (src - dst) * mask
353     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
354     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
355 
356     // mask = (src - dst) * mask >> 5
357     maskLo = _mm_srai_epi16(maskLo, 5);
358     maskHi = _mm_srai_epi16(maskHi, 5);
359 
360     // Add two pixels into result.
361     // result = dst + ((src - dst) * mask >> 5)
362     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
363     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
364 
365     // Pack into 4 32bit dst pixels and force opaque.
366     // resultLo and resultHi contain eight 16-bit components (two pixels) each.
367     // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
368     // clamping to 255 if necessary. Set alpha components to 0xFF.
369     return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
370                         _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
371 }
372 
SkBlitLCD16Row_SSE2(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor)373 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
374                          SkColor src, int width, SkPMColor) {
375     if (width <= 0) {
376         return;
377     }
378 
379     int srcA = SkColorGetA(src);
380     int srcR = SkColorGetR(src);
381     int srcG = SkColorGetG(src);
382     int srcB = SkColorGetB(src);
383 
384     srcA = SkAlpha255To256(srcA);
385 
386     if (width >= 4) {
387         SkASSERT(((size_t)dst & 0x03) == 0);
388         while (((size_t)dst & 0x0F) != 0) {
389             *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
390             mask++;
391             dst++;
392             width--;
393         }
394 
395         __m128i *d = reinterpret_cast<__m128i*>(dst);
396         // Set alpha to 0xFF and replicate source four times in SSE register.
397         __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
398         // Interleave with zeros to get two sets of four 16-bit values.
399         src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
400         // Set srcA_sse to contain eight copies of srcA, padded with zero.
401         // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
402         __m128i srcA_sse = _mm_set1_epi16(srcA);
403         while (width >= 4) {
404             // Load four destination pixels into dst_sse.
405             __m128i dst_sse = _mm_load_si128(d);
406             // Load four 16-bit masks into lower half of mask_sse.
407             __m128i mask_sse = _mm_loadl_epi64(
408                                    reinterpret_cast<const __m128i*>(mask));
409 
410             // Check whether masks are equal to 0 and get the highest bit
411             // of each byte of result, if masks are all zero, we will get
412             // pack_cmp to 0xFFFF
413             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
414                                              _mm_setzero_si128()));
415 
416             // if mask pixels are not all zero, we will blend the dst pixels
417             if (pack_cmp != 0xFFFF) {
418                 // Unpack 4 16bit mask pixels to
419                 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
420                 //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
421                 mask_sse = _mm_unpacklo_epi16(mask_sse,
422                                               _mm_setzero_si128());
423 
424                 // Process 4 32bit dst pixels
425                 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
426                                                    mask_sse, srcA_sse);
427                 _mm_store_si128(d, result);
428             }
429 
430             d++;
431             mask += 4;
432             width -= 4;
433         }
434 
435         dst = reinterpret_cast<SkPMColor*>(d);
436     }
437 
438     while (width > 0) {
439         *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
440         mask++;
441         dst++;
442         width--;
443     }
444 }
445 
SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor opaqueDst)446 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
447                                SkColor src, int width, SkPMColor opaqueDst) {
448     if (width <= 0) {
449         return;
450     }
451 
452     int srcR = SkColorGetR(src);
453     int srcG = SkColorGetG(src);
454     int srcB = SkColorGetB(src);
455 
456     if (width >= 4) {
457         SkASSERT(((size_t)dst & 0x03) == 0);
458         while (((size_t)dst & 0x0F) != 0) {
459             *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
460             mask++;
461             dst++;
462             width--;
463         }
464 
465         __m128i *d = reinterpret_cast<__m128i*>(dst);
466         // Set alpha to 0xFF and replicate source four times in SSE register.
467         __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
468         // Set srcA_sse to contain eight copies of srcA, padded with zero.
469         // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
470         src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
471         while (width >= 4) {
472             // Load four destination pixels into dst_sse.
473             __m128i dst_sse = _mm_load_si128(d);
474             // Load four 16-bit masks into lower half of mask_sse.
475             __m128i mask_sse = _mm_loadl_epi64(
476                                    reinterpret_cast<const __m128i*>(mask));
477 
478             // Check whether masks are equal to 0 and get the highest bit
479             // of each byte of result, if masks are all zero, we will get
480             // pack_cmp to 0xFFFF
481             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
482                                              _mm_setzero_si128()));
483 
484             // if mask pixels are not all zero, we will blend the dst pixels
485             if (pack_cmp != 0xFFFF) {
486                 // Unpack 4 16bit mask pixels to
487                 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
488                 //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
489                 mask_sse = _mm_unpacklo_epi16(mask_sse,
490                                               _mm_setzero_si128());
491 
492                 // Process 4 32bit dst pixels
493                 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
494                                                          mask_sse);
495                 _mm_store_si128(d, result);
496             }
497 
498             d++;
499             mask += 4;
500             width -= 4;
501         }
502 
503         dst = reinterpret_cast<SkPMColor*>(d);
504     }
505 
506     while (width > 0) {
507         *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
508         mask++;
509         dst++;
510         width--;
511     }
512 }
513 
514 /* SSE2 version of S32_D565_Opaque()
515  * portable version is in core/SkBlitRow_D16.cpp
516  */
S32_D565_Opaque_SSE2(uint16_t * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha,int,int)517 void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
518                           const SkPMColor* SK_RESTRICT src, int count,
519                           U8CPU alpha, int /*x*/, int /*y*/) {
520     SkASSERT(255 == alpha);
521 
522     if (count <= 0) {
523         return;
524     }
525 
526     if (count >= 8) {
527         while (((size_t)dst & 0x0F) != 0) {
528             SkPMColor c = *src++;
529             SkPMColorAssert(c);
530 
531             *dst++ = SkPixel32ToPixel16_ToU16(c);
532             count--;
533         }
534 
535         const __m128i* s = reinterpret_cast<const __m128i*>(src);
536         __m128i* d = reinterpret_cast<__m128i*>(dst);
537 
538         while (count >= 8) {
539             // Load 8 pixels of src.
540             __m128i src_pixel1 = _mm_loadu_si128(s++);
541             __m128i src_pixel2 = _mm_loadu_si128(s++);
542 
543             __m128i d_pixel = SkPixel32ToPixel16_ToU16_SSE2(src_pixel1, src_pixel2);
544             _mm_store_si128(d++, d_pixel);
545             count -= 8;
546         }
547         src = reinterpret_cast<const SkPMColor*>(s);
548         dst = reinterpret_cast<uint16_t*>(d);
549     }
550 
551     if (count > 0) {
552         do {
553             SkPMColor c = *src++;
554             SkPMColorAssert(c);
555             *dst++ = SkPixel32ToPixel16_ToU16(c);
556         } while (--count != 0);
557     }
558 }
559 
560 /* SSE2 version of S32A_D565_Opaque()
561  * portable version is in core/SkBlitRow_D16.cpp
562  */
S32A_D565_Opaque_SSE2(uint16_t * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha,int,int)563 void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
564                            const SkPMColor* SK_RESTRICT src,
565                            int count, U8CPU alpha, int /*x*/, int /*y*/) {
566     SkASSERT(255 == alpha);
567 
568     if (count <= 0) {
569         return;
570     }
571 
572     if (count >= 8) {
573         // Make dst 16 bytes alignment
574         while (((size_t)dst & 0x0F) != 0) {
575             SkPMColor c = *src++;
576             if (c) {
577               *dst = SkSrcOver32To16(c, *dst);
578             }
579             dst += 1;
580             count--;
581         }
582 
583         const __m128i* s = reinterpret_cast<const __m128i*>(src);
584         __m128i* d = reinterpret_cast<__m128i*>(dst);
585         __m128i var255 = _mm_set1_epi16(255);
586         __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
587         __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
588         __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
589 
590         while (count >= 8) {
591             // Load 8 pixels of src.
592             __m128i src_pixel1 = _mm_loadu_si128(s++);
593             __m128i src_pixel2 = _mm_loadu_si128(s++);
594 
595             // Check whether src pixels are equal to 0 and get the highest bit
596             // of each byte of result, if src pixels are all zero, src_cmp1 and
597             // src_cmp2 will be 0xFFFF.
598             int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
599                                              _mm_setzero_si128()));
600             int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
601                                              _mm_setzero_si128()));
602             if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
603                 d++;
604                 count -= 8;
605                 continue;
606             }
607 
608             // Load 8 pixels of dst.
609             __m128i dst_pixel = _mm_load_si128(d);
610 
611             // Extract A from src.
612             __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
613             sa1 = _mm_srli_epi32(sa1, 24);
614             __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
615             sa2 = _mm_srli_epi32(sa2, 24);
616             __m128i sa = _mm_packs_epi32(sa1, sa2);
617 
618             // Extract R from src.
619             __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
620             sr1 = _mm_srli_epi32(sr1, 24);
621             __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
622             sr2 = _mm_srli_epi32(sr2, 24);
623             __m128i sr = _mm_packs_epi32(sr1, sr2);
624 
625             // Extract G from src.
626             __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
627             sg1 = _mm_srli_epi32(sg1, 24);
628             __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
629             sg2 = _mm_srli_epi32(sg2, 24);
630             __m128i sg = _mm_packs_epi32(sg1, sg2);
631 
632             // Extract B from src.
633             __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
634             sb1 = _mm_srli_epi32(sb1, 24);
635             __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
636             sb2 = _mm_srli_epi32(sb2, 24);
637             __m128i sb = _mm_packs_epi32(sb1, sb2);
638 
639             // Extract R G B from dst.
640             __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
641             dr = _mm_and_si128(dr, r16_mask);
642             __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
643             dg = _mm_and_si128(dg, g16_mask);
644             __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
645             db = _mm_and_si128(db, b16_mask);
646 
647             __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa
648 
649             // Calculate R G B of result.
650             // Original algorithm is in SkSrcOver32To16().
651             dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS));
652             dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
653             dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS));
654             dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
655             db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS));
656             db = _mm_srli_epi16(db, 8 - SK_B16_BITS);
657 
658             // Pack R G B into 16-bit color.
659             __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
660 
661             // Store 8 16-bit colors in dst.
662             _mm_store_si128(d++, d_pixel);
663             count -= 8;
664         }
665 
666         src = reinterpret_cast<const SkPMColor*>(s);
667         dst = reinterpret_cast<uint16_t*>(d);
668     }
669 
670     if (count > 0) {
671         do {
672             SkPMColor c = *src++;
673             SkPMColorAssert(c);
674             if (c) {
675                 *dst = SkSrcOver32To16(c, *dst);
676             }
677             dst += 1;
678         } while (--count != 0);
679     }
680 }
681 
S32_D565_Opaque_Dither_SSE2(uint16_t * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha,int x,int y)682 void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
683                                  const SkPMColor* SK_RESTRICT src,
684                                  int count, U8CPU alpha, int x, int y) {
685     SkASSERT(255 == alpha);
686 
687     if (count <= 0) {
688         return;
689     }
690 
691     if (count >= 8) {
692         while (((size_t)dst & 0x0F) != 0) {
693             DITHER_565_SCAN(y);
694             SkPMColor c = *src++;
695             SkPMColorAssert(c);
696 
697             unsigned dither = DITHER_VALUE(x);
698             *dst++ = SkDitherRGB32To565(c, dither);
699             DITHER_INC_X(x);
700             count--;
701         }
702 
703         unsigned short dither_value[8];
704         __m128i dither;
705 #ifdef ENABLE_DITHER_MATRIX_4X4
706         const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
707         dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
708         dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
709         dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
710         dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
711 #else
712         const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
713         dither_value[0] = dither_value[4] = (dither_scan
714                                              >> (((x) & 3) << 2)) & 0xF;
715         dither_value[1] = dither_value[5] = (dither_scan
716                                              >> (((x + 1) & 3) << 2)) & 0xF;
717         dither_value[2] = dither_value[6] = (dither_scan
718                                              >> (((x + 2) & 3) << 2)) & 0xF;
719         dither_value[3] = dither_value[7] = (dither_scan
720                                              >> (((x + 3) & 3) << 2)) & 0xF;
721 #endif
722         dither = _mm_loadu_si128((__m128i*) dither_value);
723 
724         const __m128i* s = reinterpret_cast<const __m128i*>(src);
725         __m128i* d = reinterpret_cast<__m128i*>(dst);
726 
727         while (count >= 8) {
728             // Load 8 pixels of src.
729             __m128i src_pixel1 = _mm_loadu_si128(s++);
730             __m128i src_pixel2 = _mm_loadu_si128(s++);
731 
732             // Extract R from src.
733             __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
734             sr1 = _mm_srli_epi32(sr1, 24);
735             __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
736             sr2 = _mm_srli_epi32(sr2, 24);
737             __m128i sr = _mm_packs_epi32(sr1, sr2);
738 
739             // SkDITHER_R32To565(sr, dither)
740             __m128i sr_offset = _mm_srli_epi16(sr, 5);
741             sr = _mm_add_epi16(sr, dither);
742             sr = _mm_sub_epi16(sr, sr_offset);
743             sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);
744 
745             // Extract G from src.
746             __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
747             sg1 = _mm_srli_epi32(sg1, 24);
748             __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
749             sg2 = _mm_srli_epi32(sg2, 24);
750             __m128i sg = _mm_packs_epi32(sg1, sg2);
751 
752             // SkDITHER_R32To565(sg, dither)
753             __m128i sg_offset = _mm_srli_epi16(sg, 6);
754             sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
755             sg = _mm_sub_epi16(sg, sg_offset);
756             sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);
757 
758             // Extract B from src.
759             __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
760             sb1 = _mm_srli_epi32(sb1, 24);
761             __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
762             sb2 = _mm_srli_epi32(sb2, 24);
763             __m128i sb = _mm_packs_epi32(sb1, sb2);
764 
765             // SkDITHER_R32To565(sb, dither)
766             __m128i sb_offset = _mm_srli_epi16(sb, 5);
767             sb = _mm_add_epi16(sb, dither);
768             sb = _mm_sub_epi16(sb, sb_offset);
769             sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);
770 
771             // Pack and store 16-bit dst pixel.
772             __m128i d_pixel = SkPackRGB16_SSE2(sr, sg, sb);
773             _mm_store_si128(d++, d_pixel);
774 
775             count -= 8;
776             x += 8;
777         }
778 
779         src = reinterpret_cast<const SkPMColor*>(s);
780         dst = reinterpret_cast<uint16_t*>(d);
781     }
782 
783     if (count > 0) {
784         DITHER_565_SCAN(y);
785         do {
786             SkPMColor c = *src++;
787             SkPMColorAssert(c);
788 
789             unsigned dither = DITHER_VALUE(x);
790             *dst++ = SkDitherRGB32To565(c, dither);
791             DITHER_INC_X(x);
792         } while (--count != 0);
793     }
794 }
795 
796 /* SSE2 version of S32A_D565_Opaque_Dither()
797  * portable version is in core/SkBlitRow_D16.cpp
798  */
S32A_D565_Opaque_Dither_SSE2(uint16_t * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha,int x,int y)799 void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
800                                   const SkPMColor* SK_RESTRICT src,
801                                   int count, U8CPU alpha, int x, int y) {
802     SkASSERT(255 == alpha);
803 
804     if (count <= 0) {
805         return;
806     }
807 
808     if (count >= 8) {
809         while (((size_t)dst & 0x0F) != 0) {
810             DITHER_565_SCAN(y);
811             SkPMColor c = *src++;
812             SkPMColorAssert(c);
813             if (c) {
814                 unsigned a = SkGetPackedA32(c);
815 
816                 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
817 
818                 unsigned sr = SkGetPackedR32(c);
819                 unsigned sg = SkGetPackedG32(c);
820                 unsigned sb = SkGetPackedB32(c);
821                 sr = SkDITHER_R32_FOR_565(sr, d);
822                 sg = SkDITHER_G32_FOR_565(sg, d);
823                 sb = SkDITHER_B32_FOR_565(sb, d);
824 
825                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
826                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
827                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
828                 // now src and dst expanded are in g:11 r:10 x:1 b:10
829                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
830             }
831             dst += 1;
832             DITHER_INC_X(x);
833             count--;
834         }
835 
836         unsigned short dither_value[8];
837         __m128i dither, dither_cur;
838 #ifdef ENABLE_DITHER_MATRIX_4X4
839         const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
840         dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
841         dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
842         dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
843         dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
844 #else
845         const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
846         dither_value[0] = dither_value[4] = (dither_scan
847                                              >> (((x) & 3) << 2)) & 0xF;
848         dither_value[1] = dither_value[5] = (dither_scan
849                                              >> (((x + 1) & 3) << 2)) & 0xF;
850         dither_value[2] = dither_value[6] = (dither_scan
851                                              >> (((x + 2) & 3) << 2)) & 0xF;
852         dither_value[3] = dither_value[7] = (dither_scan
853                                              >> (((x + 3) & 3) << 2)) & 0xF;
854 #endif
855         dither = _mm_loadu_si128((__m128i*) dither_value);
856 
857         const __m128i* s = reinterpret_cast<const __m128i*>(src);
858         __m128i* d = reinterpret_cast<__m128i*>(dst);
859         __m128i var256 = _mm_set1_epi16(256);
860         __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
861         __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
862         __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
863 
864         while (count >= 8) {
865             // Load 8 pixels of src and dst.
866             __m128i src_pixel1 = _mm_loadu_si128(s++);
867             __m128i src_pixel2 = _mm_loadu_si128(s++);
868             __m128i dst_pixel = _mm_load_si128(d);
869 
870             // Extract A from src.
871             __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
872             sa1 = _mm_srli_epi32(sa1, 24);
873             __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
874             sa2 = _mm_srli_epi32(sa2, 24);
875             __m128i sa = _mm_packs_epi32(sa1, sa2);
876 
877             // Calculate current dither value.
878             dither_cur = _mm_mullo_epi16(dither,
879                                          _mm_add_epi16(sa, _mm_set1_epi16(1)));
880             dither_cur = _mm_srli_epi16(dither_cur, 8);
881 
882             // Extract R from src.
883             __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
884             sr1 = _mm_srli_epi32(sr1, 24);
885             __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
886             sr2 = _mm_srli_epi32(sr2, 24);
887             __m128i sr = _mm_packs_epi32(sr1, sr2);
888 
889             // SkDITHER_R32_FOR_565(sr, d)
890             __m128i sr_offset = _mm_srli_epi16(sr, 5);
891             sr = _mm_add_epi16(sr, dither_cur);
892             sr = _mm_sub_epi16(sr, sr_offset);
893 
894             // Expand sr.
895             sr = _mm_slli_epi16(sr, 2);
896 
897             // Extract G from src.
898             __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
899             sg1 = _mm_srli_epi32(sg1, 24);
900             __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
901             sg2 = _mm_srli_epi32(sg2, 24);
902             __m128i sg = _mm_packs_epi32(sg1, sg2);
903 
904             // sg = SkDITHER_G32_FOR_565(sg, d).
905             __m128i sg_offset = _mm_srli_epi16(sg, 6);
906             sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));
907             sg = _mm_sub_epi16(sg, sg_offset);
908 
909             // Expand sg.
910             sg = _mm_slli_epi16(sg, 3);
911 
912             // Extract B from src.
913             __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
914             sb1 = _mm_srli_epi32(sb1, 24);
915             __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
916             sb2 = _mm_srli_epi32(sb2, 24);
917             __m128i sb = _mm_packs_epi32(sb1, sb2);
918 
919             // sb = SkDITHER_B32_FOR_565(sb, d).
920             __m128i sb_offset = _mm_srli_epi16(sb, 5);
921             sb = _mm_add_epi16(sb, dither_cur);
922             sb = _mm_sub_epi16(sb, sb_offset);
923 
924             // Expand sb.
925             sb = _mm_slli_epi16(sb, 2);
926 
927             // Extract R G B from dst.
928             __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
929             dr = _mm_and_si128(dr, r16_mask);
930             __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
931             dg = _mm_and_si128(dg, g16_mask);
932             __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
933             db = _mm_and_si128(db, b16_mask);
934 
935             // SkAlpha255To256(255 - a) >> 3
936             __m128i isa = _mm_sub_epi16(var256, sa);
937             isa = _mm_srli_epi16(isa, 3);
938 
939             dr = _mm_mullo_epi16(dr, isa);
940             dr = _mm_add_epi16(dr, sr);
941             dr = _mm_srli_epi16(dr, 5);
942 
943             dg = _mm_mullo_epi16(dg, isa);
944             dg = _mm_add_epi16(dg, sg);
945             dg = _mm_srli_epi16(dg, 5);
946 
947             db = _mm_mullo_epi16(db, isa);
948             db = _mm_add_epi16(db, sb);
949             db = _mm_srli_epi16(db, 5);
950 
951             // Package and store dst pixel.
952             __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
953             _mm_store_si128(d++, d_pixel);
954 
955             count -= 8;
956             x += 8;
957         }
958 
959         src = reinterpret_cast<const SkPMColor*>(s);
960         dst = reinterpret_cast<uint16_t*>(d);
961     }
962 
963     if (count > 0) {
964         DITHER_565_SCAN(y);
965         do {
966             SkPMColor c = *src++;
967             SkPMColorAssert(c);
968             if (c) {
969                 unsigned a = SkGetPackedA32(c);
970 
971                 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
972 
973                 unsigned sr = SkGetPackedR32(c);
974                 unsigned sg = SkGetPackedG32(c);
975                 unsigned sb = SkGetPackedB32(c);
976                 sr = SkDITHER_R32_FOR_565(sr, d);
977                 sg = SkDITHER_G32_FOR_565(sg, d);
978                 sb = SkDITHER_B32_FOR_565(sb, d);
979 
980                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
981                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
982                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
983                 // now src and dst expanded are in g:11 r:10 x:1 b:10
984                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
985             }
986             dst += 1;
987             DITHER_INC_X(x);
988         } while (--count != 0);
989     }
990 }
991