1 /*
2  * Copyright 2012 The Android Open Source Project
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include <emmintrin.h>
9 #include "SkBitmapProcState_opts_SSE2.h"
10 #include "SkBlitRow_opts_SSE2.h"
11 #include "SkColorPriv.h"
12 #include "SkColor_opts_SSE2.h"
13 #include "SkDither.h"
14 #include "SkUtils.h"
15 
16 /* SSE2 version of S32_Blend_BlitRow32()
17  * portable version is in core/SkBlitRow_D32.cpp
18  */
S32_Blend_BlitRow32_SSE2(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)19 void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
20                               const SkPMColor* SK_RESTRICT src,
21                               int count, U8CPU alpha) {
22     SkASSERT(alpha <= 255);
23     if (count <= 0) {
24         return;
25     }
26 
27     uint32_t src_scale = SkAlpha255To256(alpha);
28     uint32_t dst_scale = 256 - src_scale;
29 
30     if (count >= 4) {
31         SkASSERT(((size_t)dst & 0x03) == 0);
32         while (((size_t)dst & 0x0F) != 0) {
33             *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
34             src++;
35             dst++;
36             count--;
37         }
38 
39         const __m128i *s = reinterpret_cast<const __m128i*>(src);
40         __m128i *d = reinterpret_cast<__m128i*>(dst);
41 
42         while (count >= 4) {
43             // Load 4 pixels each of src and dest.
44             __m128i src_pixel = _mm_loadu_si128(s);
45             __m128i dst_pixel = _mm_load_si128(d);
46 
47             src_pixel = SkAlphaMulQ_SSE2(src_pixel, src_scale);
48             dst_pixel = SkAlphaMulQ_SSE2(dst_pixel, dst_scale);
49 
50             // Add result
51             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
52             _mm_store_si128(d, result);
53             s++;
54             d++;
55             count -= 4;
56         }
57         src = reinterpret_cast<const SkPMColor*>(s);
58         dst = reinterpret_cast<SkPMColor*>(d);
59     }
60 
61     while (count > 0) {
62         *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
63         src++;
64         dst++;
65         count--;
66     }
67 }
68 
S32A_Opaque_BlitRow32_SSE2(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)69 void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
70                                 const SkPMColor* SK_RESTRICT src,
71                                 int count, U8CPU alpha) {
72     SkASSERT(alpha == 255);
73     if (count <= 0) {
74         return;
75     }
76 
77 #ifdef SK_USE_ACCURATE_BLENDING
78     if (count >= 4) {
79         SkASSERT(((size_t)dst & 0x03) == 0);
80         while (((size_t)dst & 0x0F) != 0) {
81             *dst = SkPMSrcOver(*src, *dst);
82             src++;
83             dst++;
84             count--;
85         }
86 
87         const __m128i *s = reinterpret_cast<const __m128i*>(src);
88         __m128i *d = reinterpret_cast<__m128i*>(dst);
89         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
90         __m128i c_128 = _mm_set1_epi16(128);  // 8 copies of 128 (16-bit)
91         __m128i c_255 = _mm_set1_epi16(255);  // 8 copies of 255 (16-bit)
92         while (count >= 4) {
93             // Load 4 pixels
94             __m128i src_pixel = _mm_loadu_si128(s);
95             __m128i dst_pixel = _mm_load_si128(d);
96 
97             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
98             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
99             // Shift alphas down to lower 8 bits of each quad.
100             __m128i alpha = _mm_srli_epi32(src_pixel, 24);
101 
102             // Copy alpha to upper 3rd byte of each quad
103             alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
104 
105             // Subtract alphas from 255, to get 0..255
106             alpha = _mm_sub_epi16(c_255, alpha);
107 
108             // Multiply by red and blue by src alpha.
109             dst_rb = _mm_mullo_epi16(dst_rb, alpha);
110             // Multiply by alpha and green by src alpha.
111             dst_ag = _mm_mullo_epi16(dst_ag, alpha);
112 
113             // dst_rb_low = (dst_rb >> 8)
114             __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
115             __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
116 
117             // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
118             dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
119             dst_rb = _mm_add_epi16(dst_rb, c_128);
120             dst_rb = _mm_srli_epi16(dst_rb, 8);
121 
122             // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
123             dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
124             dst_ag = _mm_add_epi16(dst_ag, c_128);
125             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
126 
127             // Combine back into RGBA.
128             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
129 
130             // Add result
131             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
132             _mm_store_si128(d, result);
133             s++;
134             d++;
135             count -= 4;
136         }
137         src = reinterpret_cast<const SkPMColor*>(s);
138         dst = reinterpret_cast<SkPMColor*>(d);
139     }
140 
141     while (count > 0) {
142         *dst = SkPMSrcOver(*src, *dst);
143         src++;
144         dst++;
145         count--;
146     }
147 #else
148     int count16 = count / 16;
149     __m128i* dst4 = (__m128i*)dst;
150     const __m128i* src4 = (const __m128i*)src;
151 
152     for (int i = 0; i < count16 * 4; i += 4) {
153         // Load 16 source pixels.
154         __m128i s0 = _mm_loadu_si128(src4+i+0),
155                 s1 = _mm_loadu_si128(src4+i+1),
156                 s2 = _mm_loadu_si128(src4+i+2),
157                 s3 = _mm_loadu_si128(src4+i+3);
158 
159         const __m128i alphaMask = _mm_set1_epi32(0xFF << SK_A32_SHIFT);
160         const __m128i ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
161         __m128i cmp = _mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask), _mm_setzero_si128());
162         if (0xffff == _mm_movemask_epi8(cmp)) {
163             // All 16 source pixels are fully transparent. There's nothing to do!
164             continue;
165         }
166         const __m128i ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
167         cmp = _mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask), alphaMask);
168         if (0xffff == _mm_movemask_epi8(cmp)) {
169             // All 16 source pixels are fully opaque. There's no need to read dst or blend it.
170             _mm_storeu_si128(dst4+i+0, s0);
171             _mm_storeu_si128(dst4+i+1, s1);
172             _mm_storeu_si128(dst4+i+2, s2);
173             _mm_storeu_si128(dst4+i+3, s3);
174             continue;
175         }
176         // The general slow case: do the blend for all 16 pixels.
177         _mm_storeu_si128(dst4+i+0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(dst4+i+0)));
178         _mm_storeu_si128(dst4+i+1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(dst4+i+1)));
179         _mm_storeu_si128(dst4+i+2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(dst4+i+2)));
180         _mm_storeu_si128(dst4+i+3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(dst4+i+3)));
181     }
182 
183     // Wrap up the last <= 15 pixels.
184     SkASSERT(count - (count16*16) <= 15);
185     for (int i = count16*16; i < count; i++) {
186         // This check is not really necessarily, but it prevents pointless autovectorization.
187         if (src[i] & 0xFF000000) {
188             dst[i] = SkPMSrcOver(src[i], dst[i]);
189         }
190     }
191 #endif
192 }
193 
S32A_Blend_BlitRow32_SSE2(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)194 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
195                                const SkPMColor* SK_RESTRICT src,
196                                int count, U8CPU alpha) {
197     SkASSERT(alpha <= 255);
198     if (count <= 0) {
199         return;
200     }
201 
202     if (count >= 4) {
203         while (((size_t)dst & 0x0F) != 0) {
204             *dst = SkBlendARGB32(*src, *dst, alpha);
205             src++;
206             dst++;
207             count--;
208         }
209 
210         const __m128i *s = reinterpret_cast<const __m128i*>(src);
211         __m128i *d = reinterpret_cast<__m128i*>(dst);
212         while (count >= 4) {
213             // Load 4 pixels each of src and dest.
214             __m128i src_pixel = _mm_loadu_si128(s);
215             __m128i dst_pixel = _mm_load_si128(d);
216 
217             __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha);
218             _mm_store_si128(d, result);
219             s++;
220             d++;
221             count -= 4;
222         }
223         src = reinterpret_cast<const SkPMColor*>(s);
224         dst = reinterpret_cast<SkPMColor*>(d);
225     }
226 
227     while (count > 0) {
228         *dst = SkBlendARGB32(*src, *dst, alpha);
229         src++;
230         dst++;
231         count--;
232     }
233 }
234 
Color32A_D565_SSE2(uint16_t dst[],SkPMColor src,int count,int x,int y)235 void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x, int y) {
236     SkASSERT(count > 0);
237 
238     uint32_t src_expand = (SkGetPackedG32(src) << 24) |
239                           (SkGetPackedR32(src) << 13) |
240                           (SkGetPackedB32(src) << 2);
241     unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
242 
243     // Check if we have enough pixels to run SIMD
244     if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) {
245         __m128i* dst_wide;
246         const __m128i src_R_wide = _mm_set1_epi16(SkGetPackedR32(src) << 2);
247         const __m128i src_G_wide = _mm_set1_epi16(SkGetPackedG32(src) << 3);
248         const __m128i src_B_wide = _mm_set1_epi16(SkGetPackedB32(src) << 2);
249         const __m128i scale_wide = _mm_set1_epi16(scale);
250         const __m128i mask_blue  = _mm_set1_epi16(SK_B16_MASK);
251         const __m128i mask_green = _mm_set1_epi16(SK_G16_MASK << SK_G16_SHIFT);
252 
253         // Align dst to an even 16 byte address (0-7 pixels)
254         while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) {
255             *dst = SkBlend32_RGB16(src_expand, *dst, scale);
256             dst += 1;
257             count--;
258         }
259 
260         dst_wide = reinterpret_cast<__m128i*>(dst);
261         do {
262             // Load eight RGB565 pixels
263             __m128i pixels = _mm_load_si128(dst_wide);
264 
265             // Mask out sub-pixels
266             __m128i pixel_R = _mm_srli_epi16(pixels, SK_R16_SHIFT);
267             __m128i pixel_G = _mm_slli_epi16(pixels, SK_R16_BITS);
268             pixel_G = _mm_srli_epi16(pixel_G, SK_R16_BITS + SK_B16_BITS);
269             __m128i pixel_B = _mm_and_si128(pixels, mask_blue);
270 
271             // Scale with alpha
272             pixel_R = _mm_mullo_epi16(pixel_R, scale_wide);
273             pixel_G = _mm_mullo_epi16(pixel_G, scale_wide);
274             pixel_B = _mm_mullo_epi16(pixel_B, scale_wide);
275 
276             // Add src_X_wide and shift down again
277             pixel_R = _mm_add_epi16(pixel_R, src_R_wide);
278             pixel_R = _mm_srli_epi16(pixel_R, 5);
279             pixel_G = _mm_add_epi16(pixel_G, src_G_wide);
280             pixel_B = _mm_add_epi16(pixel_B, src_B_wide);
281             pixel_B = _mm_srli_epi16(pixel_B, 5);
282 
283             // Combine into RGB565 and store
284             pixel_R = _mm_slli_epi16(pixel_R, SK_R16_SHIFT);
285             pixel_G = _mm_and_si128(pixel_G, mask_green);
286             pixels = _mm_or_si128(pixel_R, pixel_G);
287             pixels = _mm_or_si128(pixels, pixel_B);
288             _mm_store_si128(dst_wide, pixels);
289             count -= 8;
290             dst_wide++;
291         } while (count >= 8);
292 
293         dst = reinterpret_cast<uint16_t*>(dst_wide);
294     }
295 
296     // Small loop to handle remaining pixels.
297     while (count > 0) {
298         *dst = SkBlend32_RGB16(src_expand, *dst, scale);
299         dst += 1;
300         count--;
301     }
302 }
303 
SkARGB32_A8_BlitMask_SSE2(void * device,size_t dstRB,const void * maskPtr,size_t maskRB,SkColor origColor,int width,int height)304 void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
305                                size_t maskRB, SkColor origColor,
306                                int width, int height) {
307     SkPMColor color = SkPreMultiplyColor(origColor);
308     size_t dstOffset = dstRB - (width << 2);
309     size_t maskOffset = maskRB - width;
310     SkPMColor* dst = (SkPMColor *)device;
311     const uint8_t* mask = (const uint8_t*)maskPtr;
312     do {
313         int count = width;
314         if (count >= 4) {
315             while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
316                 *dst = SkBlendARGB32(color, *dst, *mask);
317                 mask++;
318                 dst++;
319                 count--;
320             }
321             __m128i *d = reinterpret_cast<__m128i*>(dst);
322             __m128i src_pixel = _mm_set1_epi32(color);
323             while (count >= 4) {
324                 // Load 4 dst pixels
325                 __m128i dst_pixel = _mm_load_si128(d);
326 
327                 // Set the alpha value
328                 __m128i alpha_wide = _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(mask));
329                 alpha_wide = _mm_unpacklo_epi8(alpha_wide, _mm_setzero_si128());
330                 alpha_wide = _mm_unpacklo_epi16(alpha_wide, _mm_setzero_si128());
331 
332                 __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha_wide);
333                 _mm_store_si128(d, result);
334                 // Load the next 4 dst pixels and alphas
335                 mask = mask + 4;
336                 d++;
337                 count -= 4;
338             }
339             dst = reinterpret_cast<SkPMColor*>(d);
340         }
341         while (count > 0) {
342             *dst= SkBlendARGB32(color, *dst, *mask);
343             dst += 1;
344             mask++;
345             count --;
346         }
347         dst = (SkPMColor *)((char*)dst + dstOffset);
348         mask += maskOffset;
349     } while (--height != 0);
350 }
351 
// Shift amounts that bring the top 5 bits of each RGB16 mask component in
// line with the corresponding component of an SkPMColor. A positive value is
// a left shift; a negative value means the component must be shifted right
// by that magnitude instead.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// SkPacked{R,G,B}16x5ToUnmasked{R,G,B}32x5_SSE2(x): apply the shift above to
// every 32-bit lane of x. The result is not masked; callers AND it with the
// 5-bit field they want.

// Red
#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#endif

// Green
#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#endif

// Blue
#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#endif
382 
SkBlendLCD16_SSE2(__m128i & src,__m128i & dst,__m128i & mask,__m128i & srcA)383 static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
384                                  __m128i &mask, __m128i &srcA) {
385     // In the following comments, the components of src, dst and mask are
386     // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
387     // by an R, G, B, or A suffix. Components of one of the four pixels that
388     // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
389     // example is the blue channel of the second destination pixel. Memory
390     // layout is shown for an ARGB byte order in a color value.
391 
392     // src and srcA store 8-bit values interleaved with zeros.
393     // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
394     // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
395     //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
396     // mask stores 16-bit values (compressed three channels) interleaved with zeros.
397     // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
398     // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
399     //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
400 
401     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
402     // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
403     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
404                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));
405 
406     // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
407     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
408                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));
409 
410     // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
411     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
412                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));
413 
414     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
415     // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
416     // 8-bit position
417     // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
418     //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
419     mask = _mm_or_si128(_mm_or_si128(r, g), b);
420 
421     // Interleave R,G,B into the lower byte of word.
422     // i.e. split the sixteen 8-bit values from mask into two sets of eight
423     // 16-bit values, padded by zero.
424     __m128i maskLo, maskHi;
425     // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
426     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
427     // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
428     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
429 
430     // Upscale from 0..31 to 0..32
431     // (allows to replace division by left-shift further down)
432     // Left-shift each component by 4 and add the result back to that component,
433     // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
434     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
435     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
436 
437     // Multiply each component of maskLo and maskHi by srcA
438     maskLo = _mm_mullo_epi16(maskLo, srcA);
439     maskHi = _mm_mullo_epi16(maskHi, srcA);
440 
441     // Left shift mask components by 8 (divide by 256)
442     maskLo = _mm_srli_epi16(maskLo, 8);
443     maskHi = _mm_srli_epi16(maskHi, 8);
444 
445     // Interleave R,G,B into the lower byte of the word
446     // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
447     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
448     // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
449     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
450 
451     // mask = (src - dst) * mask
452     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
453     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
454 
455     // mask = (src - dst) * mask >> 5
456     maskLo = _mm_srai_epi16(maskLo, 5);
457     maskHi = _mm_srai_epi16(maskHi, 5);
458 
459     // Add two pixels into result.
460     // result = dst + ((src - dst) * mask >> 5)
461     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
462     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
463 
464     // Pack into 4 32bit dst pixels.
465     // resultLo and resultHi contain eight 16-bit components (two pixels) each.
466     // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
467     // clamping to 255 if necessary.
468     return _mm_packus_epi16(resultLo, resultHi);
469 }
470 
SkBlendLCD16Opaque_SSE2(__m128i & src,__m128i & dst,__m128i & mask)471 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
472                                        __m128i &mask) {
473     // In the following comments, the components of src, dst and mask are
474     // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
475     // by an R, G, B, or A suffix. Components of one of the four pixels that
476     // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
477     // example is the blue channel of the second destination pixel. Memory
478     // layout is shown for an ARGB byte order in a color value.
479 
480     // src and srcA store 8-bit values interleaved with zeros.
481     // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
482     // mask stores 16-bit values (shown as high and low bytes) interleaved with
483     // zeros
484     // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
485     //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
486 
487     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
488     // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
489     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
490                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));
491 
492     // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
493     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
494                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));
495 
496     // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
497     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
498                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));
499 
500     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
501     // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
502     // 8-bit position
503     // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
504     //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
505     mask = _mm_or_si128(_mm_or_si128(r, g), b);
506 
507     // Interleave R,G,B into the lower byte of word.
508     // i.e. split the sixteen 8-bit values from mask into two sets of eight
509     // 16-bit values, padded by zero.
510     __m128i maskLo, maskHi;
511     // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
512     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
513     // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
514     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
515 
516     // Upscale from 0..31 to 0..32
517     // (allows to replace division by left-shift further down)
518     // Left-shift each component by 4 and add the result back to that component,
519     // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
520     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
521     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
522 
523     // Interleave R,G,B into the lower byte of the word
524     // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
525     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
526     // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
527     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
528 
529     // mask = (src - dst) * mask
530     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
531     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
532 
533     // mask = (src - dst) * mask >> 5
534     maskLo = _mm_srai_epi16(maskLo, 5);
535     maskHi = _mm_srai_epi16(maskHi, 5);
536 
537     // Add two pixels into result.
538     // result = dst + ((src - dst) * mask >> 5)
539     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
540     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
541 
542     // Pack into 4 32bit dst pixels and force opaque.
543     // resultLo and resultHi contain eight 16-bit components (two pixels) each.
544     // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
545     // clamping to 255 if necessary. Set alpha components to 0xFF.
546     return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
547                         _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
548 }
549 
SkBlitLCD16Row_SSE2(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor)550 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
551                          SkColor src, int width, SkPMColor) {
552     if (width <= 0) {
553         return;
554     }
555 
556     int srcA = SkColorGetA(src);
557     int srcR = SkColorGetR(src);
558     int srcG = SkColorGetG(src);
559     int srcB = SkColorGetB(src);
560 
561     srcA = SkAlpha255To256(srcA);
562 
563     if (width >= 4) {
564         SkASSERT(((size_t)dst & 0x03) == 0);
565         while (((size_t)dst & 0x0F) != 0) {
566             *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
567             mask++;
568             dst++;
569             width--;
570         }
571 
572         __m128i *d = reinterpret_cast<__m128i*>(dst);
573         // Set alpha to 0xFF and replicate source four times in SSE register.
574         __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
575         // Interleave with zeros to get two sets of four 16-bit values.
576         src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
577         // Set srcA_sse to contain eight copies of srcA, padded with zero.
578         // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
579         __m128i srcA_sse = _mm_set1_epi16(srcA);
580         while (width >= 4) {
581             // Load four destination pixels into dst_sse.
582             __m128i dst_sse = _mm_load_si128(d);
583             // Load four 16-bit masks into lower half of mask_sse.
584             __m128i mask_sse = _mm_loadl_epi64(
585                                    reinterpret_cast<const __m128i*>(mask));
586 
587             // Check whether masks are equal to 0 and get the highest bit
588             // of each byte of result, if masks are all zero, we will get
589             // pack_cmp to 0xFFFF
590             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
591                                              _mm_setzero_si128()));
592 
593             // if mask pixels are not all zero, we will blend the dst pixels
594             if (pack_cmp != 0xFFFF) {
595                 // Unpack 4 16bit mask pixels to
596                 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
597                 //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
598                 mask_sse = _mm_unpacklo_epi16(mask_sse,
599                                               _mm_setzero_si128());
600 
601                 // Process 4 32bit dst pixels
602                 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
603                                                    mask_sse, srcA_sse);
604                 _mm_store_si128(d, result);
605             }
606 
607             d++;
608             mask += 4;
609             width -= 4;
610         }
611 
612         dst = reinterpret_cast<SkPMColor*>(d);
613     }
614 
615     while (width > 0) {
616         *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
617         mask++;
618         dst++;
619         width--;
620     }
621 }
622 
SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor opaqueDst)623 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
624                                SkColor src, int width, SkPMColor opaqueDst) {
625     if (width <= 0) {
626         return;
627     }
628 
629     int srcR = SkColorGetR(src);
630     int srcG = SkColorGetG(src);
631     int srcB = SkColorGetB(src);
632 
633     if (width >= 4) {
634         SkASSERT(((size_t)dst & 0x03) == 0);
635         while (((size_t)dst & 0x0F) != 0) {
636             *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
637             mask++;
638             dst++;
639             width--;
640         }
641 
642         __m128i *d = reinterpret_cast<__m128i*>(dst);
643         // Set alpha to 0xFF and replicate source four times in SSE register.
644         __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
645         // Set srcA_sse to contain eight copies of srcA, padded with zero.
646         // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
647         src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
648         while (width >= 4) {
649             // Load four destination pixels into dst_sse.
650             __m128i dst_sse = _mm_load_si128(d);
651             // Load four 16-bit masks into lower half of mask_sse.
652             __m128i mask_sse = _mm_loadl_epi64(
653                                    reinterpret_cast<const __m128i*>(mask));
654 
655             // Check whether masks are equal to 0 and get the highest bit
656             // of each byte of result, if masks are all zero, we will get
657             // pack_cmp to 0xFFFF
658             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
659                                              _mm_setzero_si128()));
660 
661             // if mask pixels are not all zero, we will blend the dst pixels
662             if (pack_cmp != 0xFFFF) {
663                 // Unpack 4 16bit mask pixels to
664                 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
665                 //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
666                 mask_sse = _mm_unpacklo_epi16(mask_sse,
667                                               _mm_setzero_si128());
668 
669                 // Process 4 32bit dst pixels
670                 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
671                                                          mask_sse);
672                 _mm_store_si128(d, result);
673             }
674 
675             d++;
676             mask += 4;
677             width -= 4;
678         }
679 
680         dst = reinterpret_cast<SkPMColor*>(d);
681     }
682 
683     while (width > 0) {
684         *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
685         mask++;
686         dst++;
687         width--;
688     }
689 }
690 
691 /* SSE2 version of S32_D565_Opaque()
692  * portable version is in core/SkBlitRow_D16.cpp
693  */
S32_D565_Opaque_SSE2(uint16_t * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha,int,int)694 void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
695                           const SkPMColor* SK_RESTRICT src, int count,
696                           U8CPU alpha, int /*x*/, int /*y*/) {
697     SkASSERT(255 == alpha);
698 
699     if (count <= 0) {
700         return;
701     }
702 
703     if (count >= 8) {
704         while (((size_t)dst & 0x0F) != 0) {
705             SkPMColor c = *src++;
706             SkPMColorAssert(c);
707 
708             *dst++ = SkPixel32ToPixel16_ToU16(c);
709             count--;
710         }
711 
712         const __m128i* s = reinterpret_cast<const __m128i*>(src);
713         __m128i* d = reinterpret_cast<__m128i*>(dst);
714 
715         while (count >= 8) {
716             // Load 8 pixels of src.
717             __m128i src_pixel1 = _mm_loadu_si128(s++);
718             __m128i src_pixel2 = _mm_loadu_si128(s++);
719 
720             __m128i d_pixel = SkPixel32ToPixel16_ToU16_SSE2(src_pixel1, src_pixel2);
721             _mm_store_si128(d++, d_pixel);
722             count -= 8;
723         }
724         src = reinterpret_cast<const SkPMColor*>(s);
725         dst = reinterpret_cast<uint16_t*>(d);
726     }
727 
728     if (count > 0) {
729         do {
730             SkPMColor c = *src++;
731             SkPMColorAssert(c);
732             *dst++ = SkPixel32ToPixel16_ToU16(c);
733         } while (--count != 0);
734     }
735 }
736 
737 /* SSE2 version of S32A_D565_Opaque()
738  * portable version is in core/SkBlitRow_D16.cpp
739  */
S32A_D565_Opaque_SSE2(uint16_t * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha,int,int)740 void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
741                            const SkPMColor* SK_RESTRICT src,
742                            int count, U8CPU alpha, int /*x*/, int /*y*/) {
743     SkASSERT(255 == alpha);
744 
745     if (count <= 0) {
746         return;
747     }
748 
749     if (count >= 8) {
750         // Make dst 16 bytes alignment
751         while (((size_t)dst & 0x0F) != 0) {
752             SkPMColor c = *src++;
753             if (c) {
754               *dst = SkSrcOver32To16(c, *dst);
755             }
756             dst += 1;
757             count--;
758         }
759 
760         const __m128i* s = reinterpret_cast<const __m128i*>(src);
761         __m128i* d = reinterpret_cast<__m128i*>(dst);
762         __m128i var255 = _mm_set1_epi16(255);
763         __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
764         __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
765         __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
766 
767         while (count >= 8) {
768             // Load 8 pixels of src.
769             __m128i src_pixel1 = _mm_loadu_si128(s++);
770             __m128i src_pixel2 = _mm_loadu_si128(s++);
771 
772             // Check whether src pixels are equal to 0 and get the highest bit
773             // of each byte of result, if src pixels are all zero, src_cmp1 and
774             // src_cmp2 will be 0xFFFF.
775             int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
776                                              _mm_setzero_si128()));
777             int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
778                                              _mm_setzero_si128()));
779             if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
780                 d++;
781                 count -= 8;
782                 continue;
783             }
784 
785             // Load 8 pixels of dst.
786             __m128i dst_pixel = _mm_load_si128(d);
787 
788             // Extract A from src.
789             __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
790             sa1 = _mm_srli_epi32(sa1, 24);
791             __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
792             sa2 = _mm_srli_epi32(sa2, 24);
793             __m128i sa = _mm_packs_epi32(sa1, sa2);
794 
795             // Extract R from src.
796             __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
797             sr1 = _mm_srli_epi32(sr1, 24);
798             __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
799             sr2 = _mm_srli_epi32(sr2, 24);
800             __m128i sr = _mm_packs_epi32(sr1, sr2);
801 
802             // Extract G from src.
803             __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
804             sg1 = _mm_srli_epi32(sg1, 24);
805             __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
806             sg2 = _mm_srli_epi32(sg2, 24);
807             __m128i sg = _mm_packs_epi32(sg1, sg2);
808 
809             // Extract B from src.
810             __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
811             sb1 = _mm_srli_epi32(sb1, 24);
812             __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
813             sb2 = _mm_srli_epi32(sb2, 24);
814             __m128i sb = _mm_packs_epi32(sb1, sb2);
815 
816             // Extract R G B from dst.
817             __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
818             dr = _mm_and_si128(dr, r16_mask);
819             __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
820             dg = _mm_and_si128(dg, g16_mask);
821             __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
822             db = _mm_and_si128(db, b16_mask);
823 
824             __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa
825 
826             // Calculate R G B of result.
827             // Original algorithm is in SkSrcOver32To16().
828             dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS));
829             dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
830             dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS));
831             dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
832             db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS));
833             db = _mm_srli_epi16(db, 8 - SK_B16_BITS);
834 
835             // Pack R G B into 16-bit color.
836             __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
837 
838             // Store 8 16-bit colors in dst.
839             _mm_store_si128(d++, d_pixel);
840             count -= 8;
841         }
842 
843         src = reinterpret_cast<const SkPMColor*>(s);
844         dst = reinterpret_cast<uint16_t*>(d);
845     }
846 
847     if (count > 0) {
848         do {
849             SkPMColor c = *src++;
850             SkPMColorAssert(c);
851             if (c) {
852                 *dst = SkSrcOver32To16(c, *dst);
853             }
854             dst += 1;
855         } while (--count != 0);
856     }
857 }
858 
S32_D565_Opaque_Dither_SSE2(uint16_t * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha,int x,int y)859 void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
860                                  const SkPMColor* SK_RESTRICT src,
861                                  int count, U8CPU alpha, int x, int y) {
862     SkASSERT(255 == alpha);
863 
864     if (count <= 0) {
865         return;
866     }
867 
868     if (count >= 8) {
869         while (((size_t)dst & 0x0F) != 0) {
870             DITHER_565_SCAN(y);
871             SkPMColor c = *src++;
872             SkPMColorAssert(c);
873 
874             unsigned dither = DITHER_VALUE(x);
875             *dst++ = SkDitherRGB32To565(c, dither);
876             DITHER_INC_X(x);
877             count--;
878         }
879 
880         unsigned short dither_value[8];
881         __m128i dither;
882 #ifdef ENABLE_DITHER_MATRIX_4X4
883         const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
884         dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
885         dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
886         dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
887         dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
888 #else
889         const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
890         dither_value[0] = dither_value[4] = (dither_scan
891                                              >> (((x) & 3) << 2)) & 0xF;
892         dither_value[1] = dither_value[5] = (dither_scan
893                                              >> (((x + 1) & 3) << 2)) & 0xF;
894         dither_value[2] = dither_value[6] = (dither_scan
895                                              >> (((x + 2) & 3) << 2)) & 0xF;
896         dither_value[3] = dither_value[7] = (dither_scan
897                                              >> (((x + 3) & 3) << 2)) & 0xF;
898 #endif
899         dither = _mm_loadu_si128((__m128i*) dither_value);
900 
901         const __m128i* s = reinterpret_cast<const __m128i*>(src);
902         __m128i* d = reinterpret_cast<__m128i*>(dst);
903 
904         while (count >= 8) {
905             // Load 8 pixels of src.
906             __m128i src_pixel1 = _mm_loadu_si128(s++);
907             __m128i src_pixel2 = _mm_loadu_si128(s++);
908 
909             // Extract R from src.
910             __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
911             sr1 = _mm_srli_epi32(sr1, 24);
912             __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
913             sr2 = _mm_srli_epi32(sr2, 24);
914             __m128i sr = _mm_packs_epi32(sr1, sr2);
915 
916             // SkDITHER_R32To565(sr, dither)
917             __m128i sr_offset = _mm_srli_epi16(sr, 5);
918             sr = _mm_add_epi16(sr, dither);
919             sr = _mm_sub_epi16(sr, sr_offset);
920             sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);
921 
922             // Extract G from src.
923             __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
924             sg1 = _mm_srli_epi32(sg1, 24);
925             __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
926             sg2 = _mm_srli_epi32(sg2, 24);
927             __m128i sg = _mm_packs_epi32(sg1, sg2);
928 
929             // SkDITHER_R32To565(sg, dither)
930             __m128i sg_offset = _mm_srli_epi16(sg, 6);
931             sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
932             sg = _mm_sub_epi16(sg, sg_offset);
933             sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);
934 
935             // Extract B from src.
936             __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
937             sb1 = _mm_srli_epi32(sb1, 24);
938             __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
939             sb2 = _mm_srli_epi32(sb2, 24);
940             __m128i sb = _mm_packs_epi32(sb1, sb2);
941 
942             // SkDITHER_R32To565(sb, dither)
943             __m128i sb_offset = _mm_srli_epi16(sb, 5);
944             sb = _mm_add_epi16(sb, dither);
945             sb = _mm_sub_epi16(sb, sb_offset);
946             sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);
947 
948             // Pack and store 16-bit dst pixel.
949             __m128i d_pixel = SkPackRGB16_SSE2(sr, sg, sb);
950             _mm_store_si128(d++, d_pixel);
951 
952             count -= 8;
953             x += 8;
954         }
955 
956         src = reinterpret_cast<const SkPMColor*>(s);
957         dst = reinterpret_cast<uint16_t*>(d);
958     }
959 
960     if (count > 0) {
961         DITHER_565_SCAN(y);
962         do {
963             SkPMColor c = *src++;
964             SkPMColorAssert(c);
965 
966             unsigned dither = DITHER_VALUE(x);
967             *dst++ = SkDitherRGB32To565(c, dither);
968             DITHER_INC_X(x);
969         } while (--count != 0);
970     }
971 }
972 
973 /* SSE2 version of S32A_D565_Opaque_Dither()
974  * portable version is in core/SkBlitRow_D16.cpp
975  */
S32A_D565_Opaque_Dither_SSE2(uint16_t * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha,int x,int y)976 void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
977                                   const SkPMColor* SK_RESTRICT src,
978                                   int count, U8CPU alpha, int x, int y) {
979     SkASSERT(255 == alpha);
980 
981     if (count <= 0) {
982         return;
983     }
984 
985     if (count >= 8) {
986         while (((size_t)dst & 0x0F) != 0) {
987             DITHER_565_SCAN(y);
988             SkPMColor c = *src++;
989             SkPMColorAssert(c);
990             if (c) {
991                 unsigned a = SkGetPackedA32(c);
992 
993                 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
994 
995                 unsigned sr = SkGetPackedR32(c);
996                 unsigned sg = SkGetPackedG32(c);
997                 unsigned sb = SkGetPackedB32(c);
998                 sr = SkDITHER_R32_FOR_565(sr, d);
999                 sg = SkDITHER_G32_FOR_565(sg, d);
1000                 sb = SkDITHER_B32_FOR_565(sb, d);
1001 
1002                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1003                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1004                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1005                 // now src and dst expanded are in g:11 r:10 x:1 b:10
1006                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1007             }
1008             dst += 1;
1009             DITHER_INC_X(x);
1010             count--;
1011         }
1012 
1013         unsigned short dither_value[8];
1014         __m128i dither, dither_cur;
1015 #ifdef ENABLE_DITHER_MATRIX_4X4
1016         const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
1017         dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
1018         dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
1019         dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
1020         dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
1021 #else
1022         const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
1023         dither_value[0] = dither_value[4] = (dither_scan
1024                                              >> (((x) & 3) << 2)) & 0xF;
1025         dither_value[1] = dither_value[5] = (dither_scan
1026                                              >> (((x + 1) & 3) << 2)) & 0xF;
1027         dither_value[2] = dither_value[6] = (dither_scan
1028                                              >> (((x + 2) & 3) << 2)) & 0xF;
1029         dither_value[3] = dither_value[7] = (dither_scan
1030                                              >> (((x + 3) & 3) << 2)) & 0xF;
1031 #endif
1032         dither = _mm_loadu_si128((__m128i*) dither_value);
1033 
1034         const __m128i* s = reinterpret_cast<const __m128i*>(src);
1035         __m128i* d = reinterpret_cast<__m128i*>(dst);
1036         __m128i var256 = _mm_set1_epi16(256);
1037         __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
1038         __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
1039         __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
1040 
1041         while (count >= 8) {
1042             // Load 8 pixels of src and dst.
1043             __m128i src_pixel1 = _mm_loadu_si128(s++);
1044             __m128i src_pixel2 = _mm_loadu_si128(s++);
1045             __m128i dst_pixel = _mm_load_si128(d);
1046 
1047             // Extract A from src.
1048             __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
1049             sa1 = _mm_srli_epi32(sa1, 24);
1050             __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
1051             sa2 = _mm_srli_epi32(sa2, 24);
1052             __m128i sa = _mm_packs_epi32(sa1, sa2);
1053 
1054             // Calculate current dither value.
1055             dither_cur = _mm_mullo_epi16(dither,
1056                                          _mm_add_epi16(sa, _mm_set1_epi16(1)));
1057             dither_cur = _mm_srli_epi16(dither_cur, 8);
1058 
1059             // Extract R from src.
1060             __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
1061             sr1 = _mm_srli_epi32(sr1, 24);
1062             __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
1063             sr2 = _mm_srli_epi32(sr2, 24);
1064             __m128i sr = _mm_packs_epi32(sr1, sr2);
1065 
1066             // SkDITHER_R32_FOR_565(sr, d)
1067             __m128i sr_offset = _mm_srli_epi16(sr, 5);
1068             sr = _mm_add_epi16(sr, dither_cur);
1069             sr = _mm_sub_epi16(sr, sr_offset);
1070 
1071             // Expand sr.
1072             sr = _mm_slli_epi16(sr, 2);
1073 
1074             // Extract G from src.
1075             __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
1076             sg1 = _mm_srli_epi32(sg1, 24);
1077             __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
1078             sg2 = _mm_srli_epi32(sg2, 24);
1079             __m128i sg = _mm_packs_epi32(sg1, sg2);
1080 
1081             // sg = SkDITHER_G32_FOR_565(sg, d).
1082             __m128i sg_offset = _mm_srli_epi16(sg, 6);
1083             sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));
1084             sg = _mm_sub_epi16(sg, sg_offset);
1085 
1086             // Expand sg.
1087             sg = _mm_slli_epi16(sg, 3);
1088 
1089             // Extract B from src.
1090             __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
1091             sb1 = _mm_srli_epi32(sb1, 24);
1092             __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
1093             sb2 = _mm_srli_epi32(sb2, 24);
1094             __m128i sb = _mm_packs_epi32(sb1, sb2);
1095 
1096             // sb = SkDITHER_B32_FOR_565(sb, d).
1097             __m128i sb_offset = _mm_srli_epi16(sb, 5);
1098             sb = _mm_add_epi16(sb, dither_cur);
1099             sb = _mm_sub_epi16(sb, sb_offset);
1100 
1101             // Expand sb.
1102             sb = _mm_slli_epi16(sb, 2);
1103 
1104             // Extract R G B from dst.
1105             __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
1106             dr = _mm_and_si128(dr, r16_mask);
1107             __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
1108             dg = _mm_and_si128(dg, g16_mask);
1109             __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
1110             db = _mm_and_si128(db, b16_mask);
1111 
1112             // SkAlpha255To256(255 - a) >> 3
1113             __m128i isa = _mm_sub_epi16(var256, sa);
1114             isa = _mm_srli_epi16(isa, 3);
1115 
1116             dr = _mm_mullo_epi16(dr, isa);
1117             dr = _mm_add_epi16(dr, sr);
1118             dr = _mm_srli_epi16(dr, 5);
1119 
1120             dg = _mm_mullo_epi16(dg, isa);
1121             dg = _mm_add_epi16(dg, sg);
1122             dg = _mm_srli_epi16(dg, 5);
1123 
1124             db = _mm_mullo_epi16(db, isa);
1125             db = _mm_add_epi16(db, sb);
1126             db = _mm_srli_epi16(db, 5);
1127 
1128             // Package and store dst pixel.
1129             __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
1130             _mm_store_si128(d++, d_pixel);
1131 
1132             count -= 8;
1133             x += 8;
1134         }
1135 
1136         src = reinterpret_cast<const SkPMColor*>(s);
1137         dst = reinterpret_cast<uint16_t*>(d);
1138     }
1139 
1140     if (count > 0) {
1141         DITHER_565_SCAN(y);
1142         do {
1143             SkPMColor c = *src++;
1144             SkPMColorAssert(c);
1145             if (c) {
1146                 unsigned a = SkGetPackedA32(c);
1147 
1148                 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
1149 
1150                 unsigned sr = SkGetPackedR32(c);
1151                 unsigned sg = SkGetPackedG32(c);
1152                 unsigned sb = SkGetPackedB32(c);
1153                 sr = SkDITHER_R32_FOR_565(sr, d);
1154                 sg = SkDITHER_G32_FOR_565(sg, d);
1155                 sb = SkDITHER_B32_FOR_565(sb, d);
1156 
1157                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1158                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1159                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1160                 // now src and dst expanded are in g:11 r:10 x:1 b:10
1161                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1162             }
1163             dst += 1;
1164             DITHER_INC_X(x);
1165         } while (--count != 0);
1166     }
1167 }
1168