1 /*
2 * Copyright 2012 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #include <emmintrin.h>
9 #include "SkBitmapProcState_opts_SSE2.h"
10 #include "SkBlitRow_opts_SSE2.h"
11 #include "SkColorPriv.h"
12 #include "SkColor_opts_SSE2.h"
13 #include "SkDither.h"
14 #include "SkMSAN.h"
15 #include "SkUtils.h"
16
/* SSE2 version of S32_Blend_BlitRow32()
 * portable version is in core/SkBlitRow_D32.cpp
 *
 * Blends count src pixels onto dst with a constant global alpha:
 *     dst = src * scale + dst * (1 - scale)
 * Pixels are processed one at a time until dst is 16-byte aligned, then
 * four at a time with SSE2, then one at a time for any remainder.
 */
void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);
    if (count <= 0) {
        return;
    }

    // 0..256 fixed-point scales; src_scale + dst_scale == 256.
    uint32_t src_scale = SkAlpha255To256(alpha);
    uint32_t dst_scale = 256 - src_scale;

    if (count >= 4) {
        // dst is pixel (4-byte) aligned, so at most 3 scalar iterations are
        // needed to reach a 16-byte boundary; count stays >= 1 afterwards.
        SkASSERT(((size_t)dst & 0x03) == 0);
        while (((size_t)dst & 0x0F) != 0) {
            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
            src++;
            dst++;
            count--;
        }

        const __m128i *s = reinterpret_cast<const __m128i*>(src);
        __m128i *d = reinterpret_cast<__m128i*>(dst);

        while (count >= 4) {
            // Load 4 pixels each of src and dest. src may be unaligned;
            // dst is known 16-byte aligned from the loop above.
            __m128i src_pixel = _mm_loadu_si128(s);
            __m128i dst_pixel = _mm_load_si128(d);

            src_pixel = SkAlphaMulQ_SSE2(src_pixel, src_scale);
            dst_pixel = SkAlphaMulQ_SSE2(dst_pixel, dst_scale);

            // Add result. Per-byte add cannot carry because the two scaled
            // terms sum to at most 255 in each channel.
            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
            _mm_store_si128(d, result);
            s++;
            d++;
            count -= 4;
        }
        // Fall back to scalar pointers for the tail loop.
        src = reinterpret_cast<const SkPMColor*>(s);
        dst = reinterpret_cast<SkPMColor*>(d);
    }

    // Handle the remaining (<= 3) pixels.
    while (count > 0) {
        *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
        src++;
        dst++;
        count--;
    }
}
69
/* Per-pixel src-over blit: dst = src + dst * (255 - srcA) for each pixel.
 * Two implementations are provided, selected by SK_USE_ACCURATE_BLENDING:
 * a rounding SSE2 path, and a faster path with fully-transparent /
 * fully-opaque fast cases for 16-pixel groups.
 */
void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {
    sk_msan_assert_initialized(src, src+count);

    SkASSERT(alpha == 255);
    if (count <= 0) {
        return;
    }

#ifdef SK_USE_ACCURATE_BLENDING
    if (count >= 4) {
        // dst is pixel-aligned, so at most 3 scalar steps reach 16-byte
        // alignment; count stays >= 1 afterwards.
        SkASSERT(((size_t)dst & 0x03) == 0);
        while (((size_t)dst & 0x0F) != 0) {
            *dst = SkPMSrcOver(*src, *dst);
            src++;
            dst++;
            count--;
        }

        const __m128i *s = reinterpret_cast<const __m128i*>(src);
        __m128i *d = reinterpret_cast<__m128i*>(dst);
        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
        __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit)
        __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit)
        while (count >= 4) {
            // Load 4 pixels
            __m128i src_pixel = _mm_loadu_si128(s);
            __m128i dst_pixel = _mm_load_si128(d);

            // Split dst into 16-bit R/B and A/G lanes so each channel can be
            // multiplied without overflow.
            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
            // Shift alphas down to lower 8 bits of each quad.
            __m128i alpha = _mm_srli_epi32(src_pixel, 24);

            // Copy alpha to upper 3rd byte of each quad
            alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));

            // Subtract alphas from 255, to get 0..255
            alpha = _mm_sub_epi16(c_255, alpha);

            // Multiply by red and blue by src alpha.
            dst_rb = _mm_mullo_epi16(dst_rb, alpha);
            // Multiply by alpha and green by src alpha.
            dst_ag = _mm_mullo_epi16(dst_ag, alpha);

            // dst_rb_low = (dst_rb >> 8)
            __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
            __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);

            // Accurate rounding: x/255 ~= (x + (x >> 8) + 128) >> 8.
            // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
            dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
            dst_rb = _mm_add_epi16(dst_rb, c_128);
            dst_rb = _mm_srli_epi16(dst_rb, 8);

            // dst_ag = (dst_ag + dst_ag_low + 128) & ~rb_mask
            // (the A/G result already sits in the high byte of each lane,
            // so masking replaces the shift-then-reshift).
            dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
            dst_ag = _mm_add_epi16(dst_ag, c_128);
            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);

            // Combine back into RGBA.
            dst_pixel = _mm_or_si128(dst_rb, dst_ag);

            // Add result
            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
            _mm_store_si128(d, result);
            s++;
            d++;
            count -= 4;
        }
        src = reinterpret_cast<const SkPMColor*>(s);
        dst = reinterpret_cast<SkPMColor*>(d);
    }

    // Scalar tail (<= 3 pixels).
    while (count > 0) {
        *dst = SkPMSrcOver(*src, *dst);
        src++;
        dst++;
        count--;
    }
#else
    // Process in groups of 16 pixels (4 SSE registers); i below indexes in
    // __m128i (4-pixel) units, hence the stride of 4 and bound count16 * 4.
    int count16 = count / 16;
    __m128i* dst4 = (__m128i*)dst;
    const __m128i* src4 = (const __m128i*)src;

    for (int i = 0; i < count16 * 4; i += 4) {
        // Load 16 source pixels.
        __m128i s0 = _mm_loadu_si128(src4+i+0),
                s1 = _mm_loadu_si128(src4+i+1),
                s2 = _mm_loadu_si128(src4+i+2),
                s3 = _mm_loadu_si128(src4+i+3);

        const __m128i alphaMask = _mm_set1_epi32(0xFF << SK_A32_SHIFT);
        // OR of all alphas == 0  <=>  every pixel is fully transparent.
        const __m128i ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
        __m128i cmp = _mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask), _mm_setzero_si128());
        if (0xffff == _mm_movemask_epi8(cmp)) {
            // All 16 source pixels are fully transparent. There's nothing to do!
            continue;
        }
        // AND of all alphas == 0xFF  <=>  every pixel is fully opaque.
        const __m128i ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
        cmp = _mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask), alphaMask);
        if (0xffff == _mm_movemask_epi8(cmp)) {
            // All 16 source pixels are fully opaque. There's no need to read dst or blend it.
            _mm_storeu_si128(dst4+i+0, s0);
            _mm_storeu_si128(dst4+i+1, s1);
            _mm_storeu_si128(dst4+i+2, s2);
            _mm_storeu_si128(dst4+i+3, s3);
            continue;
        }
        // The general slow case: do the blend for all 16 pixels.
        _mm_storeu_si128(dst4+i+0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(dst4+i+0)));
        _mm_storeu_si128(dst4+i+1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(dst4+i+1)));
        _mm_storeu_si128(dst4+i+2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(dst4+i+2)));
        _mm_storeu_si128(dst4+i+3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(dst4+i+3)));
    }

    // Wrap up the last <= 15 pixels.
    SkASSERT(count - (count16*16) <= 15);
    for (int i = count16*16; i < count; i++) {
        // This check is not really necessary, but it prevents pointless autovectorization.
        if (src[i] & 0xFF000000) {
            dst[i] = SkPMSrcOver(src[i], dst[i]);
        }
    }
#endif
}
196
S32A_Blend_BlitRow32_SSE2(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)197 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
198 const SkPMColor* SK_RESTRICT src,
199 int count, U8CPU alpha) {
200 SkASSERT(alpha <= 255);
201 if (count <= 0) {
202 return;
203 }
204
205 if (count >= 4) {
206 while (((size_t)dst & 0x0F) != 0) {
207 *dst = SkBlendARGB32(*src, *dst, alpha);
208 src++;
209 dst++;
210 count--;
211 }
212
213 const __m128i *s = reinterpret_cast<const __m128i*>(src);
214 __m128i *d = reinterpret_cast<__m128i*>(dst);
215 while (count >= 4) {
216 // Load 4 pixels each of src and dest.
217 __m128i src_pixel = _mm_loadu_si128(s);
218 __m128i dst_pixel = _mm_load_si128(d);
219
220 __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha);
221 _mm_store_si128(d, result);
222 s++;
223 d++;
224 count -= 4;
225 }
226 src = reinterpret_cast<const SkPMColor*>(s);
227 dst = reinterpret_cast<SkPMColor*>(d);
228 }
229
230 while (count > 0) {
231 *dst = SkBlendARGB32(*src, *dst, alpha);
232 src++;
233 dst++;
234 count--;
235 }
236 }
237
/* Blends a single 32-bit color over a row of RGB565 pixels:
 *     dst = src + dst * (255 - srcA), operating in 565 precision.
 * x and y are dither coordinates in the blitter signature but are unused
 * here. dst is aligned to 16 bytes before the 8-pixel SSE2 loop.
 */
void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x, int y) {
    SkASSERT(count > 0);

    // Pre-expand src into the interleaved layout SkBlend32_RGB16 expects:
    // G in the top byte, R and B spread so a single multiply+shift works.
    uint32_t src_expand = (SkGetPackedG32(src) << 24) |
                          (SkGetPackedR32(src) << 13) |
                          (SkGetPackedB32(src) << 2);
    // Inverse-alpha scale reduced to 5 bits (0..32) for 565 math.
    unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;

    // Check if we have enough pixels to run SIMD: 8 for one vector pass plus
    // however many (0-7) pixels the alignment loop below will consume.
    if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) {
        __m128i* dst_wide;
        // Per-channel src contributions, pre-shifted to align with the
        // scaled dst channels before the final >> 5.
        const __m128i src_R_wide = _mm_set1_epi16(SkGetPackedR32(src) << 2);
        const __m128i src_G_wide = _mm_set1_epi16(SkGetPackedG32(src) << 3);
        const __m128i src_B_wide = _mm_set1_epi16(SkGetPackedB32(src) << 2);
        const __m128i scale_wide = _mm_set1_epi16(scale);
        const __m128i mask_blue = _mm_set1_epi16(SK_B16_MASK);
        const __m128i mask_green = _mm_set1_epi16(SK_G16_MASK << SK_G16_SHIFT);

        // Align dst to an even 16 byte address (0-7 pixels)
        while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) {
            *dst = SkBlend32_RGB16(src_expand, *dst, scale);
            dst += 1;
            count--;
        }

        dst_wide = reinterpret_cast<__m128i*>(dst);
        // The entry check above guarantees count >= 8 here, so do/while is safe.
        do {
            // Load eight RGB565 pixels
            __m128i pixels = _mm_load_si128(dst_wide);

            // Mask out sub-pixels
            __m128i pixel_R = _mm_srli_epi16(pixels, SK_R16_SHIFT);
            __m128i pixel_G = _mm_slli_epi16(pixels, SK_R16_BITS);
            pixel_G = _mm_srli_epi16(pixel_G, SK_R16_BITS + SK_B16_BITS);
            __m128i pixel_B = _mm_and_si128(pixels, mask_blue);

            // Scale with alpha
            pixel_R = _mm_mullo_epi16(pixel_R, scale_wide);
            pixel_G = _mm_mullo_epi16(pixel_G, scale_wide);
            pixel_B = _mm_mullo_epi16(pixel_B, scale_wide);

            // Add src_X_wide and shift down again
            pixel_R = _mm_add_epi16(pixel_R, src_R_wide);
            pixel_R = _mm_srli_epi16(pixel_R, 5);
            pixel_G = _mm_add_epi16(pixel_G, src_G_wide);
            pixel_B = _mm_add_epi16(pixel_B, src_B_wide);
            pixel_B = _mm_srli_epi16(pixel_B, 5);

            // Combine into RGB565 and store. Green keeps its in-place shift
            // and is masked instead of shifted down/up.
            pixel_R = _mm_slli_epi16(pixel_R, SK_R16_SHIFT);
            pixel_G = _mm_and_si128(pixel_G, mask_green);
            pixels = _mm_or_si128(pixel_R, pixel_G);
            pixels = _mm_or_si128(pixels, pixel_B);
            _mm_store_si128(dst_wide, pixels);
            count -= 8;
            dst_wide++;
        } while (count >= 8);

        dst = reinterpret_cast<uint16_t*>(dst_wide);
    }

    // Small loop to handle remaining pixels.
    while (count > 0) {
        *dst = SkBlend32_RGB16(src_expand, *dst, scale);
        dst += 1;
        count--;
    }
}
306
// The following (left) shifts cause the top 5 bits of the mask components to
// line up with the corresponding components in an SkPMColor.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// Each macro below picks the correct shift direction at preprocessing time,
// since _mm_slli/_mm_srli require a non-negative immediate count.
#if SK_R16x5_R32x5_SHIFT == 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#elif SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#endif

#if SK_G16x5_G32x5_SHIFT == 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#elif SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#endif

#if SK_B16x5_B32x5_SHIFT == 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#elif SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#endif
337
// Blends four 32-bit dst pixels against src using four 16-bit LCD (subpixel)
// coverage masks and a global src alpha:
//     result = dst + ((src - dst) * mask * srcA / 256) >> 5  (per channel).
// NOTE: mask is passed by reference and clobbered as scratch.
static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
                                 __m128i &mask, __m128i &srcA) {
    // In the following comments, the components of src, dst and mask are
    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
    // by an R, G, B, or A suffix. Components of one of the four pixels that
    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
    // example is the blue channel of the second destination pixel. Memory
    // layout is shown for an ARGB byte order in a color value.

    // src and srcA store 8-bit values interleaved with zeros.
    // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
    //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
    // mask stores 16-bit values (compressed three channels) interleaved with zeros.
    // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
    __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
                              _mm_set1_epi32(0x1F << SK_R32_SHIFT));

    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
    __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
                              _mm_set1_epi32(0x1F << SK_G32_SHIFT));

    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
    __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
                              _mm_set1_epi32(0x1F << SK_B32_SHIFT));

    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
    // 8-bit position
    // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
    //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
    mask = _mm_or_si128(_mm_or_si128(r, g), b);

    // Interleave R,G,B into the lower byte of word.
    // i.e. split the sixteen 8-bit values from mask into two sets of eight
    // 16-bit values, padded by zero.
    __m128i maskLo, maskHi;
    // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
    // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());

    // Upscale from 0..31 to 0..32
    // (allows to replace division by left-shift further down)
    // Right-shift each component by 4 and add the result back to that
    // component, mapping numbers in the range 0..15 to 0..15, and 16..31 to
    // 17..32
    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));

    // Multiply each component of maskLo and maskHi by srcA
    maskLo = _mm_mullo_epi16(maskLo, srcA);
    maskHi = _mm_mullo_epi16(maskHi, srcA);

    // Right shift mask components by 8 (divide by 256)
    maskLo = _mm_srli_epi16(maskLo, 8);
    maskHi = _mm_srli_epi16(maskHi, 8);

    // Interleave R,G,B into the lower byte of the word
    // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
    // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());

    // mask = (src - dst) * mask
    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));

    // mask = (src - dst) * mask >> 5
    // (arithmetic shift: the product may be negative when dst > src)
    maskLo = _mm_srai_epi16(maskLo, 5);
    maskHi = _mm_srai_epi16(maskHi, 5);

    // Add two pixels into result.
    // result = dst + ((src - dst) * mask >> 5)
    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);

    // Pack into 4 32bit dst pixels.
    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
    // Merge into one SSE register with sixteen 8-bit values (four pixels),
    // clamping to 255 if necessary.
    return _mm_packus_epi16(resultLo, resultHi);
}
425
// Same as SkBlendLCD16_SSE2 but for opaque src (srcA == 255): the srcA
// multiply is skipped and the result alphas are forced to 0xFF.
// NOTE: mask is passed by reference and clobbered as scratch.
static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
                                       __m128i &mask) {
    // In the following comments, the components of src, dst and mask are
    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
    // by an R, G, B, or A suffix. Components of one of the four pixels that
    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
    // example is the blue channel of the second destination pixel. Memory
    // layout is shown for an ARGB byte order in a color value.

    // src and srcA store 8-bit values interleaved with zeros.
    // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    // mask stores 16-bit values (shown as high and low bytes) interleaved with
    // zeros
    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
    __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
                              _mm_set1_epi32(0x1F << SK_R32_SHIFT));

    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
    __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
                              _mm_set1_epi32(0x1F << SK_G32_SHIFT));

    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
    __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
                              _mm_set1_epi32(0x1F << SK_B32_SHIFT));

    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
    // 8-bit position
    // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
    //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
    mask = _mm_or_si128(_mm_or_si128(r, g), b);

    // Interleave R,G,B into the lower byte of word.
    // i.e. split the sixteen 8-bit values from mask into two sets of eight
    // 16-bit values, padded by zero.
    __m128i maskLo, maskHi;
    // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
    // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());

    // Upscale from 0..31 to 0..32
    // (allows to replace division by left-shift further down)
    // Right-shift each component by 4 and add the result back to that
    // component, mapping numbers in the range 0..15 to 0..15, and 16..31 to
    // 17..32
    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));

    // Interleave R,G,B into the lower byte of the word
    // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
    // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());

    // mask = (src - dst) * mask
    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));

    // mask = (src - dst) * mask >> 5
    // (arithmetic shift: the product may be negative when dst > src)
    maskLo = _mm_srai_epi16(maskLo, 5);
    maskHi = _mm_srai_epi16(maskHi, 5);

    // Add two pixels into result.
    // result = dst + ((src - dst) * mask >> 5)
    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);

    // Pack into 4 32bit dst pixels and force opaque.
    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
    // Merge into one SSE register with sixteen 8-bit values (four pixels),
    // clamping to 255 if necessary. Set alpha components to 0xFF.
    return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
                        _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
}
504
// Blits a row of LCD (subpixel) text coverage onto 32-bit dst pixels using
// the color src (with its alpha applied). The unnamed trailing SkPMColor
// parameter (opaqueDst in the opaque variant) is unused here.
void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
                         SkColor src, int width, SkPMColor) {
    if (width <= 0) {
        return;
    }

    int srcA = SkColorGetA(src);
    int srcR = SkColorGetR(src);
    int srcG = SkColorGetG(src);
    int srcB = SkColorGetB(src);

    // Scale alpha up to 0..256 so >>8 works as a divide in the blend.
    srcA = SkAlpha255To256(srcA);

    if (width >= 4) {
        // dst is pixel-aligned, so at most 3 scalar steps reach 16-byte
        // alignment; width stays >= 1 afterwards.
        SkASSERT(((size_t)dst & 0x03) == 0);
        while (((size_t)dst & 0x0F) != 0) {
            *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
            mask++;
            dst++;
            width--;
        }

        __m128i *d = reinterpret_cast<__m128i*>(dst);
        // Set alpha to 0xFF and replicate source four times in SSE register.
        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
        // Interleave with zeros to get two sets of four 16-bit values.
        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
        // Set srcA_sse to contain eight copies of srcA, padded with zero.
        // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
        __m128i srcA_sse = _mm_set1_epi16(srcA);
        while (width >= 4) {
            // Load four destination pixels into dst_sse.
            __m128i dst_sse = _mm_load_si128(d);
            // Load four 16-bit masks into lower half of mask_sse.
            __m128i mask_sse = _mm_loadl_epi64(
                                   reinterpret_cast<const __m128i*>(mask));

            // Check whether masks are equal to 0 and get the highest bit
            // of each byte of result, if masks are all zero, we will get
            // pack_cmp to 0xFFFF
            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
                                             _mm_setzero_si128()));

            // if mask pixels are not all zero, we will blend the dst pixels
            // (all-zero coverage leaves dst untouched, so skip the work)
            if (pack_cmp != 0xFFFF) {
                // Unpack 4 16bit mask pixels to
                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
                mask_sse = _mm_unpacklo_epi16(mask_sse,
                                              _mm_setzero_si128());

                // Process 4 32bit dst pixels
                __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
                                                   mask_sse, srcA_sse);
                _mm_store_si128(d, result);
            }

            d++;
            mask += 4;
            width -= 4;
        }

        dst = reinterpret_cast<SkPMColor*>(d);
    }

    // Scalar tail (<= 3 pixels).
    while (width > 0) {
        *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
        mask++;
        dst++;
        width--;
    }
}
577
// Opaque variant of SkBlitLCD16Row_SSE2: src alpha is assumed 255, so the
// per-pixel srcA multiply is skipped and results are forced opaque.
// opaqueDst is only consulted by the scalar SkBlendLCD16Opaque helper.
void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
                               SkColor src, int width, SkPMColor opaqueDst) {
    if (width <= 0) {
        return;
    }

    int srcR = SkColorGetR(src);
    int srcG = SkColorGetG(src);
    int srcB = SkColorGetB(src);

    if (width >= 4) {
        // dst is pixel-aligned, so at most 3 scalar steps reach 16-byte
        // alignment; width stays >= 1 afterwards.
        SkASSERT(((size_t)dst & 0x03) == 0);
        while (((size_t)dst & 0x0F) != 0) {
            *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
            mask++;
            dst++;
            width--;
        }

        __m128i *d = reinterpret_cast<__m128i*>(dst);
        // Set alpha to 0xFF and replicate source four times in SSE register.
        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
        // Interleave with zeros to get two sets of four 16-bit values.
        // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
        while (width >= 4) {
            // Load four destination pixels into dst_sse.
            __m128i dst_sse = _mm_load_si128(d);
            // Load four 16-bit masks into lower half of mask_sse.
            __m128i mask_sse = _mm_loadl_epi64(
                                   reinterpret_cast<const __m128i*>(mask));

            // Check whether masks are equal to 0 and get the highest bit
            // of each byte of result, if masks are all zero, we will get
            // pack_cmp to 0xFFFF
            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
                                             _mm_setzero_si128()));

            // if mask pixels are not all zero, we will blend the dst pixels
            // (all-zero coverage leaves dst untouched, so skip the work)
            if (pack_cmp != 0xFFFF) {
                // Unpack 4 16bit mask pixels to
                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
                mask_sse = _mm_unpacklo_epi16(mask_sse,
                                              _mm_setzero_si128());

                // Process 4 32bit dst pixels
                __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
                                                         mask_sse);
                _mm_store_si128(d, result);
            }

            d++;
            mask += 4;
            width -= 4;
        }

        dst = reinterpret_cast<SkPMColor*>(d);
    }

    // Scalar tail (<= 3 pixels).
    while (width > 0) {
        *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
        mask++;
        dst++;
        width--;
    }
}
645
646 /* SSE2 version of S32_D565_Opaque()
647 * portable version is in core/SkBlitRow_D16.cpp
648 */
S32_D565_Opaque_SSE2(uint16_t * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha,int,int)649 void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
650 const SkPMColor* SK_RESTRICT src, int count,
651 U8CPU alpha, int /*x*/, int /*y*/) {
652 SkASSERT(255 == alpha);
653
654 if (count <= 0) {
655 return;
656 }
657
658 if (count >= 8) {
659 while (((size_t)dst & 0x0F) != 0) {
660 SkPMColor c = *src++;
661 SkPMColorAssert(c);
662
663 *dst++ = SkPixel32ToPixel16_ToU16(c);
664 count--;
665 }
666
667 const __m128i* s = reinterpret_cast<const __m128i*>(src);
668 __m128i* d = reinterpret_cast<__m128i*>(dst);
669
670 while (count >= 8) {
671 // Load 8 pixels of src.
672 __m128i src_pixel1 = _mm_loadu_si128(s++);
673 __m128i src_pixel2 = _mm_loadu_si128(s++);
674
675 __m128i d_pixel = SkPixel32ToPixel16_ToU16_SSE2(src_pixel1, src_pixel2);
676 _mm_store_si128(d++, d_pixel);
677 count -= 8;
678 }
679 src = reinterpret_cast<const SkPMColor*>(s);
680 dst = reinterpret_cast<uint16_t*>(d);
681 }
682
683 if (count > 0) {
684 do {
685 SkPMColor c = *src++;
686 SkPMColorAssert(c);
687 *dst++ = SkPixel32ToPixel16_ToU16(c);
688 } while (--count != 0);
689 }
690 }
691
/* SSE2 version of S32A_D565_Opaque()
 * portable version is in core/SkBlitRow_D16.cpp
 *
 * Src-over blends 32-bit premultiplied src pixels onto RGB565 dst:
 *     dst565 = src565 + dst565 * (255 - srcA) / 255 (per 565 channel).
 * Scalar pixels until dst is 16-byte aligned, then 8 pixels per iteration,
 * then a scalar tail. Groups of 8 fully-transparent src pixels are skipped.
 */
void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src,
                           int count, U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count <= 0) {
        return;
    }

    if (count >= 8) {
        // Make dst 16 bytes alignment (at most 7 pixels for 2-byte dst).
        while (((size_t)dst & 0x0F) != 0) {
            SkPMColor c = *src++;
            if (c) {
                *dst = SkSrcOver32To16(c, *dst);
            }
            dst += 1;
            count--;
        }

        const __m128i* s = reinterpret_cast<const __m128i*>(src);
        __m128i* d = reinterpret_cast<__m128i*>(dst);
        __m128i var255 = _mm_set1_epi16(255);
        __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
        __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
        __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);

        while (count >= 8) {
            // Load 8 pixels of src.
            __m128i src_pixel1 = _mm_loadu_si128(s++);
            __m128i src_pixel2 = _mm_loadu_si128(s++);

            // Check whether src pixels are equal to 0 and get the highest bit
            // of each byte of result, if src pixels are all zero, src_cmp1 and
            // src_cmp2 will be 0xFFFF.
            int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
                                             _mm_setzero_si128()));
            int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
                                             _mm_setzero_si128()));
            if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
                // All 8 src pixels fully transparent: skip the dst group.
                // (s was already advanced by the two loads above.)
                d++;
                count -= 8;
                continue;
            }

            // Load 8 pixels of dst.
            __m128i dst_pixel = _mm_load_si128(d);

            // Extract A from src. Each channel is isolated by shifting it up
            // to the top byte of its 32-bit lane, then down to the bottom,
            // and the two 4-pixel halves are packed into eight 16-bit lanes.
            __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
            sa1 = _mm_srli_epi32(sa1, 24);
            __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
            sa2 = _mm_srli_epi32(sa2, 24);
            __m128i sa = _mm_packs_epi32(sa1, sa2);

            // Extract R from src.
            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
            sr1 = _mm_srli_epi32(sr1, 24);
            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
            sr2 = _mm_srli_epi32(sr2, 24);
            __m128i sr = _mm_packs_epi32(sr1, sr2);

            // Extract G from src.
            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
            sg1 = _mm_srli_epi32(sg1, 24);
            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
            sg2 = _mm_srli_epi32(sg2, 24);
            __m128i sg = _mm_packs_epi32(sg1, sg2);

            // Extract B from src.
            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
            sb1 = _mm_srli_epi32(sb1, 24);
            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
            sb2 = _mm_srli_epi32(sb2, 24);
            __m128i sb = _mm_packs_epi32(sb1, sb2);

            // Extract R G B from dst.
            __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
            dr = _mm_and_si128(dr, r16_mask);
            __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
            dg = _mm_and_si128(dg, g16_mask);
            __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
            db = _mm_and_si128(db, b16_mask);

            __m128i isa = _mm_sub_epi16(var255, sa); // 255 - sa

            // Calculate R G B of result.
            // Original algorithm is in SkSrcOver32To16().
            dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS));
            dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
            dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS));
            dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
            db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS));
            db = _mm_srli_epi16(db, 8 - SK_B16_BITS);

            // Pack R G B into 16-bit color.
            __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);

            // Store 8 16-bit colors in dst.
            _mm_store_si128(d++, d_pixel);
            count -= 8;
        }

        src = reinterpret_cast<const SkPMColor*>(s);
        dst = reinterpret_cast<uint16_t*>(d);
    }

    // Scalar tail (<= 7 pixels).
    if (count > 0) {
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                *dst = SkSrcOver32To16(c, *dst);
            }
            dst += 1;
        } while (--count != 0);
    }
}
813
/* Converts opaque 32-bit pixels to RGB565 with ordered 4x4 dithering.
 * x, y select the dither phase. The SIMD loop pre-builds an 8-lane dither
 * vector from the scan row; because the dither matrix repeats every 4
 * pixels, the same vector is valid for every group of 8.
 */
void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

    if (count <= 0) {
        return;
    }

    if (count >= 8) {
        // Scalar dither until dst is 16-byte aligned (at most 7 pixels).
        while (((size_t)dst & 0x0F) != 0) {
            DITHER_565_SCAN(y);
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
            count--;
        }

        // Replicate dither values for pixels x, x+1, x+2, x+3 twice across
        // the 8 lanes (the matrix period is 4).
        unsigned short dither_value[8];
        __m128i dither;
#ifdef ENABLE_DITHER_MATRIX_4X4
        const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
        dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
        dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
        dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
        dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
#else
        const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
        dither_value[0] = dither_value[4] = (dither_scan
                                             >> (((x) & 3) << 2)) & 0xF;
        dither_value[1] = dither_value[5] = (dither_scan
                                             >> (((x + 1) & 3) << 2)) & 0xF;
        dither_value[2] = dither_value[6] = (dither_scan
                                             >> (((x + 2) & 3) << 2)) & 0xF;
        dither_value[3] = dither_value[7] = (dither_scan
                                             >> (((x + 3) & 3) << 2)) & 0xF;
#endif
        dither = _mm_loadu_si128((__m128i*) dither_value);

        const __m128i* s = reinterpret_cast<const __m128i*>(src);
        __m128i* d = reinterpret_cast<__m128i*>(dst);

        while (count >= 8) {
            // Load 8 pixels of src.
            __m128i src_pixel1 = _mm_loadu_si128(s++);
            __m128i src_pixel2 = _mm_loadu_si128(s++);

            // Extract R from src (isolated into eight 16-bit lanes).
            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
            sr1 = _mm_srli_epi32(sr1, 24);
            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
            sr2 = _mm_srli_epi32(sr2, 24);
            __m128i sr = _mm_packs_epi32(sr1, sr2);

            // SkDITHER_R32To565(sr, dither): (sr + dither - (sr >> 5)) >> 3
            __m128i sr_offset = _mm_srli_epi16(sr, 5);
            sr = _mm_add_epi16(sr, dither);
            sr = _mm_sub_epi16(sr, sr_offset);
            sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);

            // Extract G from src.
            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
            sg1 = _mm_srli_epi32(sg1, 24);
            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
            sg2 = _mm_srli_epi32(sg2, 24);
            __m128i sg = _mm_packs_epi32(sg1, sg2);

            // SkDITHER_G32To565(sg, dither): green keeps 6 bits, so it uses
            // half the dither value and a >> 6 self-correction.
            __m128i sg_offset = _mm_srli_epi16(sg, 6);
            sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
            sg = _mm_sub_epi16(sg, sg_offset);
            sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);

            // Extract B from src.
            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
            sb1 = _mm_srli_epi32(sb1, 24);
            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
            sb2 = _mm_srli_epi32(sb2, 24);
            __m128i sb = _mm_packs_epi32(sb1, sb2);

            // SkDITHER_B32To565(sb, dither): same form as red.
            __m128i sb_offset = _mm_srli_epi16(sb, 5);
            sb = _mm_add_epi16(sb, dither);
            sb = _mm_sub_epi16(sb, sb_offset);
            sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);

            // Pack and store 16-bit dst pixel.
            __m128i d_pixel = SkPackRGB16_SSE2(sr, sg, sb);
            _mm_store_si128(d++, d_pixel);

            count -= 8;
            // Keep x in phase for the scalar tail below.
            x += 8;
        }

        src = reinterpret_cast<const SkPMColor*>(s);
        dst = reinterpret_cast<uint16_t*>(d);
    }

    // Scalar tail (<= 7 pixels).
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}
927
928 /* SSE2 version of S32A_D565_Opaque_Dither()
929 * portable version is in core/SkBlitRow_D16.cpp
930 */
S32A_D565_Opaque_Dither_SSE2(uint16_t * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha,int x,int y)931 void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
932                                   const SkPMColor* SK_RESTRICT src,
933                                   int count, U8CPU alpha, int x, int y) {
    // Blend a row of 32-bit premultiplied pixels (with per-pixel alpha) over
    // an RGB565 destination, with ordered dithering.  The dither value is
    // scaled by each pixel's alpha, and the destination contribution is
    // weighted by (256 - alpha).  'alpha' (the global alpha) must be 255.
934     SkASSERT(255 == alpha);
935
936     if (count <= 0) {
937         return;
938     }
939
940     if (count >= 8) {
        // Scalar prologue: align dst to 16 bytes for the vector loop's
        // aligned load/store of destination pixels.
941         while (((size_t)dst & 0x0F) != 0) {
942             DITHER_565_SCAN(y);
943             SkPMColor c = *src++;
944             SkPMColorAssert(c);
            // Fully transparent source pixels leave the destination intact.
945             if (c) {
946                 unsigned a = SkGetPackedA32(c);
947
                // Scale the dither value by the pixel's alpha.
948                 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
949
950                 unsigned sr = SkGetPackedR32(c);
951                 unsigned sg = SkGetPackedG32(c);
952                 unsigned sb = SkGetPackedB32(c);
953                 sr = SkDITHER_R32_FOR_565(sr, d);
954                 sg = SkDITHER_G32_FOR_565(sg, d);
955                 sb = SkDITHER_B32_FOR_565(sb, d);
956
957                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
958                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
959                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
960                 // now src and dst expanded are in g:11 r:10 x:1 b:10
961                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
962             }
963             dst += 1;
964             DITHER_INC_X(x);
965             count--;
966         }
967
        // Build the 8-lane dither vector for this row, same layout as in
        // S32_D565_Opaque_Dither_SSE2: four consecutive x values repeated
        // twice (x already reflects the alignment loop above).
968         unsigned short dither_value[8];
969         __m128i dither, dither_cur;
970 #ifdef ENABLE_DITHER_MATRIX_4X4
971         const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
972         dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
973         dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
974         dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
975         dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
976 #else
        // Packed form: each matrix row is a uint16_t of four 4-bit values.
977         const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
978         dither_value[0] = dither_value[4] = (dither_scan
979                                              >> (((x) & 3) << 2)) & 0xF;
980         dither_value[1] = dither_value[5] = (dither_scan
981                                              >> (((x + 1) & 3) << 2)) & 0xF;
982         dither_value[2] = dither_value[6] = (dither_scan
983                                              >> (((x + 2) & 3) << 2)) & 0xF;
984         dither_value[3] = dither_value[7] = (dither_scan
985                                              >> (((x + 3) & 3) << 2)) & 0xF;
986 #endif
987         dither = _mm_loadu_si128((__m128i*) dither_value);
988
989         const __m128i* s = reinterpret_cast<const __m128i*>(src);
990         __m128i* d = reinterpret_cast<__m128i*>(dst);
        // Loop-invariant constants: 256 for the (256 - a) weight, and the
        // per-channel masks used to unpack the 565 destination.
991         __m128i var256 = _mm_set1_epi16(256);
992         __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
993         __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
994         __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
995
996         while (count >= 8) {
997             // Load 8 pixels of src and dst.
998             __m128i src_pixel1 = _mm_loadu_si128(s++);
999             __m128i src_pixel2 = _mm_loadu_si128(s++);
1000             __m128i dst_pixel = _mm_load_si128(d);
1001
1002             // Extract A from src.
            // Shift-left/shift-right isolates the 8-bit channel in each
            // 32-bit lane; pack the two halves into one 8-lane 16-bit
            // vector (values <= 255, so signed saturation never triggers).
1003             __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
1004             sa1 = _mm_srli_epi32(sa1, 24);
1005             __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
1006             sa2 = _mm_srli_epi32(sa2, 24);
1007             __m128i sa = _mm_packs_epi32(sa1, sa2);
1008
1009             // Calculate current dither value.
            // (dither * (a + 1)) >> 8 == SkAlphaMul(dither, SkAlpha255To256(a)),
            // the same alpha-scaled dither as the scalar path.
1010             dither_cur  = _mm_mullo_epi16(dither,
1011                                           _mm_add_epi16(sa, _mm_set1_epi16(1)));
1012             dither_cur = _mm_srli_epi16(dither_cur, 8);
1013
1014             // Extract R from src.
1015             __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
1016             sr1 = _mm_srli_epi32(sr1, 24);
1017             __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
1018             sr2 = _mm_srli_epi32(sr2, 24);
1019             __m128i sr = _mm_packs_epi32(sr1, sr2);
1020
1021             // SkDITHER_R32_FOR_565(sr, d)
            // sr + dither - (sr >> 5): dither bias with overflow correction;
            // the value stays in 8 bits here (no >>3 truncation yet).
1022             __m128i sr_offset = _mm_srli_epi16(sr, 5);
1023             sr = _mm_add_epi16(sr, dither_cur);
1024             sr = _mm_sub_epi16(sr, sr_offset);
1025
1026             // Expand sr.
            // Matches the scalar (sr << 13) in the g:11 r:10 x:1 b:10
            // expanded layout, applied per-channel before the final >> 5.
1027             sr = _mm_slli_epi16(sr, 2);
1028
1029             // Extract G from src.
1030             __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
1031             sg1 = _mm_srli_epi32(sg1, 24);
1032             __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
1033             sg2 = _mm_srli_epi32(sg2, 24);
1034             __m128i sg = _mm_packs_epi32(sg1, sg2);
1035
1036             // sg = SkDITHER_G32_FOR_565(sg, d).
            // Green keeps 6 bits in 565: half the dither (>> 1) and a 6-bit
            // overflow correction (sg >> 6).
1037             __m128i sg_offset = _mm_srli_epi16(sg, 6);
1038             sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));
1039             sg = _mm_sub_epi16(sg, sg_offset);
1040
1041             // Expand sg.
1042             sg = _mm_slli_epi16(sg, 3);
1043
1044             // Extract B from src.
1045             __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
1046             sb1 = _mm_srli_epi32(sb1, 24);
1047             __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
1048             sb2 = _mm_srli_epi32(sb2, 24);
1049             __m128i sb = _mm_packs_epi32(sb1, sb2);
1050
1051             // sb = SkDITHER_B32_FOR_565(sb, d).
1052             __m128i sb_offset = _mm_srli_epi16(sb, 5);
1053             sb = _mm_add_epi16(sb, dither_cur);
1054             sb = _mm_sub_epi16(sb, sb_offset);
1055
1056             // Expand sb.
1057             sb = _mm_slli_epi16(sb, 2);
1058
1059             // Extract R G B from dst.
1060             __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
1061             dr = _mm_and_si128(dr, r16_mask);
1062             __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
1063             dg = _mm_and_si128(dg, g16_mask);
1064             __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
1065             db = _mm_and_si128(db, b16_mask);
1066
1067             // SkAlpha255To256(255 - a) >> 3
            // i.e. (256 - a) >> 3: the destination weight.  Note: unlike the
            // scalar path there is no c == 0 skip here, but for a == 0 the
            // math reduces to dst * 32 >> 5 == dst, so the result matches.
1068             __m128i isa = _mm_sub_epi16(var256, sa);
1069             isa = _mm_srli_epi16(isa, 3);
1070
            // Per channel: (src_expanded + dst * weight) >> 5, the SIMD
            // equivalent of the scalar expanded-blend/SkCompact_rgb_16 path.
1071             dr = _mm_mullo_epi16(dr, isa);
1072             dr = _mm_add_epi16(dr, sr);
1073             dr = _mm_srli_epi16(dr, 5);
1074
1075             dg = _mm_mullo_epi16(dg, isa);
1076             dg = _mm_add_epi16(dg, sg);
1077             dg = _mm_srli_epi16(dg, 5);
1078
1079             db = _mm_mullo_epi16(db, isa);
1080             db = _mm_add_epi16(db, sb);
1081             db = _mm_srli_epi16(db, 5);
1082
1083             // Package and store dst pixel.
1084             __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
1085             _mm_store_si128(d++, d_pixel);
1086
1087             count -= 8;
            // x advances by a multiple of the matrix period (4), so the
            // base dither vector stays valid.
1088             x += 8;
1089         }
1090
        // Resynchronize the scalar pointers for the tail loop below.
1091         src = reinterpret_cast<const SkPMColor*>(s);
1092         dst = reinterpret_cast<uint16_t*>(d);
1093     }
1094
    // Scalar epilogue: handle the remaining (< 8) pixels.
1095     if (count > 0) {
1096         DITHER_565_SCAN(y);
1097         do {
1098             SkPMColor c = *src++;
1099             SkPMColorAssert(c);
1100             if (c) {
1101                 unsigned a = SkGetPackedA32(c);
1102
1103                 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
1104
1105                 unsigned sr = SkGetPackedR32(c);
1106                 unsigned sg = SkGetPackedG32(c);
1107                 unsigned sb = SkGetPackedB32(c);
1108                 sr = SkDITHER_R32_FOR_565(sr, d);
1109                 sg = SkDITHER_G32_FOR_565(sg, d);
1110                 sb = SkDITHER_B32_FOR_565(sb, d);
1111
1112                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1113                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1114                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1115                 // now src and dst expanded are in g:11 r:10 x:1 b:10
1116                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1117             }
1118             dst += 1;
1119             DITHER_INC_X(x);
1120         } while (--count != 0);
1121     }
1122 }
1123