1 /*
2 * Copyright 2012 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
#include <emmintrin.h>
#include <cstring>
#include "SkBitmapProcState_opts_SSE2.h"
#include "SkBlitRow_opts_SSE2.h"
#include "SkColorPriv.h"
#include "SkColor_opts_SSE2.h"
#include "SkDither.h"
#include "SkUtils.h"
15
16 /* SSE2 version of S32_Blend_BlitRow32()
17 * portable version is in core/SkBlitRow_D32.cpp
18 */
S32_Blend_BlitRow32_SSE2(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)19 void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
20 const SkPMColor* SK_RESTRICT src,
21 int count, U8CPU alpha) {
22 SkASSERT(alpha <= 255);
23 if (count <= 0) {
24 return;
25 }
26
27 uint32_t src_scale = SkAlpha255To256(alpha);
28 uint32_t dst_scale = 256 - src_scale;
29
30 if (count >= 4) {
31 SkASSERT(((size_t)dst & 0x03) == 0);
32 while (((size_t)dst & 0x0F) != 0) {
33 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
34 src++;
35 dst++;
36 count--;
37 }
38
39 const __m128i *s = reinterpret_cast<const __m128i*>(src);
40 __m128i *d = reinterpret_cast<__m128i*>(dst);
41
42 while (count >= 4) {
43 // Load 4 pixels each of src and dest.
44 __m128i src_pixel = _mm_loadu_si128(s);
45 __m128i dst_pixel = _mm_load_si128(d);
46
47 src_pixel = SkAlphaMulQ_SSE2(src_pixel, src_scale);
48 dst_pixel = SkAlphaMulQ_SSE2(dst_pixel, dst_scale);
49
50 // Add result
51 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
52 _mm_store_si128(d, result);
53 s++;
54 d++;
55 count -= 4;
56 }
57 src = reinterpret_cast<const SkPMColor*>(s);
58 dst = reinterpret_cast<SkPMColor*>(d);
59 }
60
61 while (count > 0) {
62 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
63 src++;
64 dst++;
65 count--;
66 }
67 }
68
// Blends 'count' premultiplied src pixels over dst using each src pixel's own
// alpha (the global alpha must be 255). Two implementations are compiled:
// an "accurate blending" variant that rounds the scaled dst, and the default
// variant that skips whole groups of fully-transparent / fully-opaque pixels.
void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {
    SkASSERT(alpha == 255);
    if (count <= 0) {
        return;
    }

#ifdef SK_USE_ACCURATE_BLENDING
    if (count >= 4) {
        SkASSERT(((size_t)dst & 0x03) == 0);
        // Scalar-blend until dst is 16-byte aligned for the vector stores.
        while (((size_t)dst & 0x0F) != 0) {
            *dst = SkPMSrcOver(*src, *dst);
            src++;
            dst++;
            count--;
        }

        const __m128i *s = reinterpret_cast<const __m128i*>(src);
        __m128i *d = reinterpret_cast<__m128i*>(dst);
        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
        __m128i c_128 = _mm_set1_epi16(128);  // 8 copies of 128 (16-bit)
        __m128i c_255 = _mm_set1_epi16(255);  // 8 copies of 255 (16-bit)
        while (count >= 4) {
            // Load 4 pixels
            __m128i src_pixel = _mm_loadu_si128(s);
            __m128i dst_pixel = _mm_load_si128(d);

            // Split dst into its R/B bytes and its A/G bytes, each widened
            // to 16 bits, so both halves can be scaled with 16-bit multiplies.
            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
            // Shift alphas down to lower 8 bits of each quad.
            __m128i alpha = _mm_srli_epi32(src_pixel, 24);

            // Copy alpha to upper 3rd byte of each quad
            alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));

            // Subtract alphas from 255, to get 0..255
            alpha = _mm_sub_epi16(c_255, alpha);

            // Multiply by red and blue by src alpha.
            dst_rb = _mm_mullo_epi16(dst_rb, alpha);
            // Multiply by alpha and green by src alpha.
            dst_ag = _mm_mullo_epi16(dst_ag, alpha);

            // dst_rb_low = (dst_rb >> 8)
            __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
            __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);

            // Accurate rounding of x*a/255 as (x*a + (x*a >> 8) + 128) >> 8.
            // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
            dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
            dst_rb = _mm_add_epi16(dst_rb, c_128);
            dst_rb = _mm_srli_epi16(dst_rb, 8);

            // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
            dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
            dst_ag = _mm_add_epi16(dst_ag, c_128);
            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);

            // Combine back into RGBA.
            dst_pixel = _mm_or_si128(dst_rb, dst_ag);

            // Add result
            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
            _mm_store_si128(d, result);
            s++;
            d++;
            count -= 4;
        }
        src = reinterpret_cast<const SkPMColor*>(s);
        dst = reinterpret_cast<SkPMColor*>(d);
    }

    // Scalar tail for the final 0..3 pixels.
    while (count > 0) {
        *dst = SkPMSrcOver(*src, *dst);
        src++;
        dst++;
        count--;
    }
#else
    int count16 = count / 16;
    __m128i* dst4 = (__m128i*)dst;
    const __m128i* src4 = (const __m128i*)src;

    // Process 16 pixels (four XMM registers) per iteration so the
    // transparent/opaque fast paths amortize their checks.
    for (int i = 0; i < count16 * 4; i += 4) {
        // Load 16 source pixels.
        __m128i s0 = _mm_loadu_si128(src4+i+0),
                s1 = _mm_loadu_si128(src4+i+1),
                s2 = _mm_loadu_si128(src4+i+2),
                s3 = _mm_loadu_si128(src4+i+3);

        const __m128i alphaMask = _mm_set1_epi32(0xFF << SK_A32_SHIFT);
        const __m128i ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
        __m128i cmp = _mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask), _mm_setzero_si128());
        if (0xffff == _mm_movemask_epi8(cmp)) {
            // All 16 source pixels are fully transparent. There's nothing to do!
            continue;
        }
        const __m128i ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
        cmp = _mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask), alphaMask);
        if (0xffff == _mm_movemask_epi8(cmp)) {
            // All 16 source pixels are fully opaque. There's no need to read dst or blend it.
            _mm_storeu_si128(dst4+i+0, s0);
            _mm_storeu_si128(dst4+i+1, s1);
            _mm_storeu_si128(dst4+i+2, s2);
            _mm_storeu_si128(dst4+i+3, s3);
            continue;
        }
        // The general slow case: do the blend for all 16 pixels.
        _mm_storeu_si128(dst4+i+0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(dst4+i+0)));
        _mm_storeu_si128(dst4+i+1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(dst4+i+1)));
        _mm_storeu_si128(dst4+i+2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(dst4+i+2)));
        _mm_storeu_si128(dst4+i+3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(dst4+i+3)));
    }

    // Wrap up the last <= 15 pixels.
    SkASSERT(count - (count16*16) <= 15);
    for (int i = count16*16; i < count; i++) {
        // This check is not strictly necessary, but it prevents pointless autovectorization.
        if (src[i] & 0xFF000000) {
            dst[i] = SkPMSrcOver(src[i], dst[i]);
        }
    }
#endif
}
193
S32A_Blend_BlitRow32_SSE2(SkPMColor * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha)194 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
195 const SkPMColor* SK_RESTRICT src,
196 int count, U8CPU alpha) {
197 SkASSERT(alpha <= 255);
198 if (count <= 0) {
199 return;
200 }
201
202 if (count >= 4) {
203 while (((size_t)dst & 0x0F) != 0) {
204 *dst = SkBlendARGB32(*src, *dst, alpha);
205 src++;
206 dst++;
207 count--;
208 }
209
210 const __m128i *s = reinterpret_cast<const __m128i*>(src);
211 __m128i *d = reinterpret_cast<__m128i*>(dst);
212 while (count >= 4) {
213 // Load 4 pixels each of src and dest.
214 __m128i src_pixel = _mm_loadu_si128(s);
215 __m128i dst_pixel = _mm_load_si128(d);
216
217 __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha);
218 _mm_store_si128(d, result);
219 s++;
220 d++;
221 count -= 4;
222 }
223 src = reinterpret_cast<const SkPMColor*>(s);
224 dst = reinterpret_cast<SkPMColor*>(d);
225 }
226
227 while (count > 0) {
228 *dst = SkBlendARGB32(*src, *dst, alpha);
229 src++;
230 dst++;
231 count--;
232 }
233 }
234
Color32A_D565_SSE2(uint16_t dst[],SkPMColor src,int count,int x,int y)235 void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x, int y) {
236 SkASSERT(count > 0);
237
238 uint32_t src_expand = (SkGetPackedG32(src) << 24) |
239 (SkGetPackedR32(src) << 13) |
240 (SkGetPackedB32(src) << 2);
241 unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
242
243 // Check if we have enough pixels to run SIMD
244 if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) {
245 __m128i* dst_wide;
246 const __m128i src_R_wide = _mm_set1_epi16(SkGetPackedR32(src) << 2);
247 const __m128i src_G_wide = _mm_set1_epi16(SkGetPackedG32(src) << 3);
248 const __m128i src_B_wide = _mm_set1_epi16(SkGetPackedB32(src) << 2);
249 const __m128i scale_wide = _mm_set1_epi16(scale);
250 const __m128i mask_blue = _mm_set1_epi16(SK_B16_MASK);
251 const __m128i mask_green = _mm_set1_epi16(SK_G16_MASK << SK_G16_SHIFT);
252
253 // Align dst to an even 16 byte address (0-7 pixels)
254 while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) {
255 *dst = SkBlend32_RGB16(src_expand, *dst, scale);
256 dst += 1;
257 count--;
258 }
259
260 dst_wide = reinterpret_cast<__m128i*>(dst);
261 do {
262 // Load eight RGB565 pixels
263 __m128i pixels = _mm_load_si128(dst_wide);
264
265 // Mask out sub-pixels
266 __m128i pixel_R = _mm_srli_epi16(pixels, SK_R16_SHIFT);
267 __m128i pixel_G = _mm_slli_epi16(pixels, SK_R16_BITS);
268 pixel_G = _mm_srli_epi16(pixel_G, SK_R16_BITS + SK_B16_BITS);
269 __m128i pixel_B = _mm_and_si128(pixels, mask_blue);
270
271 // Scale with alpha
272 pixel_R = _mm_mullo_epi16(pixel_R, scale_wide);
273 pixel_G = _mm_mullo_epi16(pixel_G, scale_wide);
274 pixel_B = _mm_mullo_epi16(pixel_B, scale_wide);
275
276 // Add src_X_wide and shift down again
277 pixel_R = _mm_add_epi16(pixel_R, src_R_wide);
278 pixel_R = _mm_srli_epi16(pixel_R, 5);
279 pixel_G = _mm_add_epi16(pixel_G, src_G_wide);
280 pixel_B = _mm_add_epi16(pixel_B, src_B_wide);
281 pixel_B = _mm_srli_epi16(pixel_B, 5);
282
283 // Combine into RGB565 and store
284 pixel_R = _mm_slli_epi16(pixel_R, SK_R16_SHIFT);
285 pixel_G = _mm_and_si128(pixel_G, mask_green);
286 pixels = _mm_or_si128(pixel_R, pixel_G);
287 pixels = _mm_or_si128(pixels, pixel_B);
288 _mm_store_si128(dst_wide, pixels);
289 count -= 8;
290 dst_wide++;
291 } while (count >= 8);
292
293 dst = reinterpret_cast<uint16_t*>(dst_wide);
294 }
295
296 // Small loop to handle remaining pixels.
297 while (count > 0) {
298 *dst = SkBlend32_RGB16(src_expand, *dst, scale);
299 dst += 1;
300 count--;
301 }
302 }
303
SkARGB32_A8_BlitMask_SSE2(void * device,size_t dstRB,const void * maskPtr,size_t maskRB,SkColor origColor,int width,int height)304 void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
305 size_t maskRB, SkColor origColor,
306 int width, int height) {
307 SkPMColor color = SkPreMultiplyColor(origColor);
308 size_t dstOffset = dstRB - (width << 2);
309 size_t maskOffset = maskRB - width;
310 SkPMColor* dst = (SkPMColor *)device;
311 const uint8_t* mask = (const uint8_t*)maskPtr;
312 do {
313 int count = width;
314 if (count >= 4) {
315 while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
316 *dst = SkBlendARGB32(color, *dst, *mask);
317 mask++;
318 dst++;
319 count--;
320 }
321 __m128i *d = reinterpret_cast<__m128i*>(dst);
322 __m128i src_pixel = _mm_set1_epi32(color);
323 while (count >= 4) {
324 // Load 4 dst pixels
325 __m128i dst_pixel = _mm_load_si128(d);
326
327 // Set the alpha value
328 __m128i alpha_wide = _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(mask));
329 alpha_wide = _mm_unpacklo_epi8(alpha_wide, _mm_setzero_si128());
330 alpha_wide = _mm_unpacklo_epi16(alpha_wide, _mm_setzero_si128());
331
332 __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha_wide);
333 _mm_store_si128(d, result);
334 // Load the next 4 dst pixels and alphas
335 mask = mask + 4;
336 d++;
337 count -= 4;
338 }
339 dst = reinterpret_cast<SkPMColor*>(d);
340 }
341 while (count > 0) {
342 *dst= SkBlendARGB32(color, *dst, *mask);
343 dst += 1;
344 mask++;
345 count --;
346 }
347 dst = (SkPMColor *)((char*)dst + dstOffset);
348 mask += maskOffset;
349 } while (--height != 0);
350 }
351
// The following (left) shifts cause the top 5 bits of the mask components to
// line up with the corresponding components in an SkPMColor.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// Each shift amount above can be negative depending on the platform's channel
// order, and shifting by a negative count is undefined, so the direction is
// selected at preprocessor time: left shift for positive amounts, right shift
// by the negated amount for negative ones, and a no-op for zero.
#if SK_R16x5_R32x5_SHIFT == 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#elif SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#endif

#if SK_G16x5_G32x5_SHIFT == 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#elif SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#endif

#if SK_B16x5_B32x5_SHIFT == 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#elif SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#endif
382
// Blends four 32-bit dst pixels with a solid src color through four 16-bit
// LCD (subpixel) masks, additionally scaled by the source alpha srcA.
// Returns the four blended pixels; 'mask' is clobbered as scratch.
static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
                                 __m128i &mask, __m128i &srcA) {
    // In the following comments, the components of src, dst and mask are
    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
    // by an R, G, B, or A suffix. Components of one of the four pixels that
    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
    // example is the blue channel of the second destination pixel. Memory
    // layout is shown for an ARGB byte order in a color value.

    // src and srcA store 8-bit values interleaved with zeros.
    // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
    //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
    // mask stores 16-bit values (compressed three channels) interleaved with zeros.
    // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
    __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
                              _mm_set1_epi32(0x1F << SK_R32_SHIFT));

    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
    __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
                              _mm_set1_epi32(0x1F << SK_G32_SHIFT));

    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
    __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
                              _mm_set1_epi32(0x1F << SK_B32_SHIFT));

    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
    // 8-bit position
    // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
    //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
    mask = _mm_or_si128(_mm_or_si128(r, g), b);

    // Interleave R,G,B into the lower byte of word.
    // i.e. split the sixteen 8-bit values from mask into two sets of eight
    // 16-bit values, padded by zero.
    __m128i maskLo, maskHi;
    // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
    // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());

    // Upscale from 0..31 to 0..32
    // (allows to replace division by left-shift further down)
    // Add to each component its own value shifted right by 4,
    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));

    // Multiply each component of maskLo and maskHi by srcA
    maskLo = _mm_mullo_epi16(maskLo, srcA);
    maskHi = _mm_mullo_epi16(maskHi, srcA);

    // Shift the mask components right by 8 (divide by 256)
    maskLo = _mm_srli_epi16(maskLo, 8);
    maskHi = _mm_srli_epi16(maskHi, 8);

    // Interleave R,G,B into the lower byte of the word
    // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
    // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());

    // mask = (src - dst) * mask
    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));

    // mask = (src - dst) * mask >> 5
    maskLo = _mm_srai_epi16(maskLo, 5);
    maskHi = _mm_srai_epi16(maskHi, 5);

    // Add two pixels into result.
    // result = dst + ((src - dst) * mask >> 5)
    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);

    // Pack into 4 32bit dst pixels.
    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
    // Merge into one SSE register with sixteen 8-bit values (four pixels),
    // clamping to 255 if necessary.
    return _mm_packus_epi16(resultLo, resultHi);
}
470
// Same as SkBlendLCD16_SSE2 but for a fully opaque source color: the srcA
// scaling step is skipped, and the result's alpha bytes are forced to 0xFF.
static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
                                       __m128i &mask) {
    // In the following comments, the components of src, dst and mask are
    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
    // by an R, G, B, or A suffix. Components of one of the four pixels that
    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
    // example is the blue channel of the second destination pixel. Memory
    // layout is shown for an ARGB byte order in a color value.

    // src and srcA store 8-bit values interleaved with zeros.
    // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    // mask stores 16-bit values (shown as high and low bytes) interleaved with
    // zeros
    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
    __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
                              _mm_set1_epi32(0x1F << SK_R32_SHIFT));

    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
    __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
                              _mm_set1_epi32(0x1F << SK_G32_SHIFT));

    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
    __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
                              _mm_set1_epi32(0x1F << SK_B32_SHIFT));

    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
    // 8-bit position
    // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
    //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
    mask = _mm_or_si128(_mm_or_si128(r, g), b);

    // Interleave R,G,B into the lower byte of word.
    // i.e. split the sixteen 8-bit values from mask into two sets of eight
    // 16-bit values, padded by zero.
    __m128i maskLo, maskHi;
    // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
    // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());

    // Upscale from 0..31 to 0..32
    // (allows to replace division by left-shift further down)
    // Add to each component its own value shifted right by 4,
    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));

    // Interleave R,G,B into the lower byte of the word
    // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
    // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());

    // mask = (src - dst) * mask
    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));

    // mask = (src - dst) * mask >> 5
    maskLo = _mm_srai_epi16(maskLo, 5);
    maskHi = _mm_srai_epi16(maskHi, 5);

    // Add two pixels into result.
    // result = dst + ((src - dst) * mask >> 5)
    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);

    // Pack into 4 32bit dst pixels and force opaque.
    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
    // Merge into one SSE register with sixteen 8-bit values (four pixels),
    // clamping to 255 if necessary. Set alpha components to 0xFF.
    return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
                        _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
}
549
SkBlitLCD16Row_SSE2(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor)550 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
551 SkColor src, int width, SkPMColor) {
552 if (width <= 0) {
553 return;
554 }
555
556 int srcA = SkColorGetA(src);
557 int srcR = SkColorGetR(src);
558 int srcG = SkColorGetG(src);
559 int srcB = SkColorGetB(src);
560
561 srcA = SkAlpha255To256(srcA);
562
563 if (width >= 4) {
564 SkASSERT(((size_t)dst & 0x03) == 0);
565 while (((size_t)dst & 0x0F) != 0) {
566 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
567 mask++;
568 dst++;
569 width--;
570 }
571
572 __m128i *d = reinterpret_cast<__m128i*>(dst);
573 // Set alpha to 0xFF and replicate source four times in SSE register.
574 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
575 // Interleave with zeros to get two sets of four 16-bit values.
576 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
577 // Set srcA_sse to contain eight copies of srcA, padded with zero.
578 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
579 __m128i srcA_sse = _mm_set1_epi16(srcA);
580 while (width >= 4) {
581 // Load four destination pixels into dst_sse.
582 __m128i dst_sse = _mm_load_si128(d);
583 // Load four 16-bit masks into lower half of mask_sse.
584 __m128i mask_sse = _mm_loadl_epi64(
585 reinterpret_cast<const __m128i*>(mask));
586
587 // Check whether masks are equal to 0 and get the highest bit
588 // of each byte of result, if masks are all zero, we will get
589 // pack_cmp to 0xFFFF
590 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
591 _mm_setzero_si128()));
592
593 // if mask pixels are not all zero, we will blend the dst pixels
594 if (pack_cmp != 0xFFFF) {
595 // Unpack 4 16bit mask pixels to
596 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
597 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
598 mask_sse = _mm_unpacklo_epi16(mask_sse,
599 _mm_setzero_si128());
600
601 // Process 4 32bit dst pixels
602 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
603 mask_sse, srcA_sse);
604 _mm_store_si128(d, result);
605 }
606
607 d++;
608 mask += 4;
609 width -= 4;
610 }
611
612 dst = reinterpret_cast<SkPMColor*>(d);
613 }
614
615 while (width > 0) {
616 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
617 mask++;
618 dst++;
619 width--;
620 }
621 }
622
SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor opaqueDst)623 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
624 SkColor src, int width, SkPMColor opaqueDst) {
625 if (width <= 0) {
626 return;
627 }
628
629 int srcR = SkColorGetR(src);
630 int srcG = SkColorGetG(src);
631 int srcB = SkColorGetB(src);
632
633 if (width >= 4) {
634 SkASSERT(((size_t)dst & 0x03) == 0);
635 while (((size_t)dst & 0x0F) != 0) {
636 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
637 mask++;
638 dst++;
639 width--;
640 }
641
642 __m128i *d = reinterpret_cast<__m128i*>(dst);
643 // Set alpha to 0xFF and replicate source four times in SSE register.
644 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
645 // Set srcA_sse to contain eight copies of srcA, padded with zero.
646 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
647 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
648 while (width >= 4) {
649 // Load four destination pixels into dst_sse.
650 __m128i dst_sse = _mm_load_si128(d);
651 // Load four 16-bit masks into lower half of mask_sse.
652 __m128i mask_sse = _mm_loadl_epi64(
653 reinterpret_cast<const __m128i*>(mask));
654
655 // Check whether masks are equal to 0 and get the highest bit
656 // of each byte of result, if masks are all zero, we will get
657 // pack_cmp to 0xFFFF
658 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
659 _mm_setzero_si128()));
660
661 // if mask pixels are not all zero, we will blend the dst pixels
662 if (pack_cmp != 0xFFFF) {
663 // Unpack 4 16bit mask pixels to
664 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
665 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
666 mask_sse = _mm_unpacklo_epi16(mask_sse,
667 _mm_setzero_si128());
668
669 // Process 4 32bit dst pixels
670 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
671 mask_sse);
672 _mm_store_si128(d, result);
673 }
674
675 d++;
676 mask += 4;
677 width -= 4;
678 }
679
680 dst = reinterpret_cast<SkPMColor*>(d);
681 }
682
683 while (width > 0) {
684 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
685 mask++;
686 dst++;
687 width--;
688 }
689 }
690
691 /* SSE2 version of S32_D565_Opaque()
692 * portable version is in core/SkBlitRow_D16.cpp
693 */
S32_D565_Opaque_SSE2(uint16_t * SK_RESTRICT dst,const SkPMColor * SK_RESTRICT src,int count,U8CPU alpha,int,int)694 void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
695 const SkPMColor* SK_RESTRICT src, int count,
696 U8CPU alpha, int /*x*/, int /*y*/) {
697 SkASSERT(255 == alpha);
698
699 if (count <= 0) {
700 return;
701 }
702
703 if (count >= 8) {
704 while (((size_t)dst & 0x0F) != 0) {
705 SkPMColor c = *src++;
706 SkPMColorAssert(c);
707
708 *dst++ = SkPixel32ToPixel16_ToU16(c);
709 count--;
710 }
711
712 const __m128i* s = reinterpret_cast<const __m128i*>(src);
713 __m128i* d = reinterpret_cast<__m128i*>(dst);
714
715 while (count >= 8) {
716 // Load 8 pixels of src.
717 __m128i src_pixel1 = _mm_loadu_si128(s++);
718 __m128i src_pixel2 = _mm_loadu_si128(s++);
719
720 __m128i d_pixel = SkPixel32ToPixel16_ToU16_SSE2(src_pixel1, src_pixel2);
721 _mm_store_si128(d++, d_pixel);
722 count -= 8;
723 }
724 src = reinterpret_cast<const SkPMColor*>(s);
725 dst = reinterpret_cast<uint16_t*>(d);
726 }
727
728 if (count > 0) {
729 do {
730 SkPMColor c = *src++;
731 SkPMColorAssert(c);
732 *dst++ = SkPixel32ToPixel16_ToU16(c);
733 } while (--count != 0);
734 }
735 }
736
/* SSE2 version of S32A_D565_Opaque()
 * portable version is in core/SkBlitRow_D16.cpp
 *
 * Blends 'count' premultiplied 32-bit src pixels over RGB565 dst using each
 * src pixel's own alpha. Groups of 8 fully-transparent src pixels are skipped
 * without touching dst. x and y are unused.
 */
void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src,
                           int count, U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count <= 0) {
        return;
    }

    if (count >= 8) {
        // Make dst 16 bytes alignment
        while (((size_t)dst & 0x0F) != 0) {
            SkPMColor c = *src++;
            if (c) {
                *dst = SkSrcOver32To16(c, *dst);
            }
            dst += 1;
            count--;
        }

        const __m128i* s = reinterpret_cast<const __m128i*>(src);
        __m128i* d = reinterpret_cast<__m128i*>(dst);
        __m128i var255 = _mm_set1_epi16(255);
        __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
        __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
        __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);

        while (count >= 8) {
            // Load 8 pixels of src.
            __m128i src_pixel1 = _mm_loadu_si128(s++);
            __m128i src_pixel2 = _mm_loadu_si128(s++);

            // Check whether src pixels are equal to 0 and get the highest bit
            // of each byte of result, if src pixels are all zero, src_cmp1 and
            // src_cmp2 will be 0xFFFF.
            int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
                                             _mm_setzero_si128()));
            int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
                                             _mm_setzero_si128()));
            if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
                // All 8 src pixels fully transparent: skip the 16 dst bytes.
                d++;
                count -= 8;
                continue;
            }

            // Load 8 pixels of dst (8 x uint16_t == one XMM register).
            __m128i dst_pixel = _mm_load_si128(d);

            // Extract A from src. The left shift discards the channels above
            // A; the right shift drops the channels below it, leaving the
            // 8-bit value in the low byte of each 32-bit lane.
            __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
            sa1 = _mm_srli_epi32(sa1, 24);
            __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
            sa2 = _mm_srli_epi32(sa2, 24);
            // Narrow the eight 32-bit alphas into eight 16-bit lanes.
            __m128i sa = _mm_packs_epi32(sa1, sa2);

            // Extract R from src.
            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
            sr1 = _mm_srli_epi32(sr1, 24);
            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
            sr2 = _mm_srli_epi32(sr2, 24);
            __m128i sr = _mm_packs_epi32(sr1, sr2);

            // Extract G from src.
            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
            sg1 = _mm_srli_epi32(sg1, 24);
            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
            sg2 = _mm_srli_epi32(sg2, 24);
            __m128i sg = _mm_packs_epi32(sg1, sg2);

            // Extract B from src.
            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
            sb1 = _mm_srli_epi32(sb1, 24);
            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
            sb2 = _mm_srli_epi32(sb2, 24);
            __m128i sb = _mm_packs_epi32(sb1, sb2);

            // Extract R G B from dst.
            __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
            dr = _mm_and_si128(dr, r16_mask);
            __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
            dg = _mm_and_si128(dg, g16_mask);
            __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
            db = _mm_and_si128(db, b16_mask);

            __m128i isa = _mm_sub_epi16(var255, sa); // 255 - sa

            // Calculate R G B of result.
            // Original algorithm is in SkSrcOver32To16().
            dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS));
            dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
            dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS));
            dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
            db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS));
            db = _mm_srli_epi16(db, 8 - SK_B16_BITS);

            // Pack R G B into 16-bit color.
            __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);

            // Store 8 16-bit colors in dst.
            _mm_store_si128(d++, d_pixel);
            count -= 8;
        }

        src = reinterpret_cast<const SkPMColor*>(s);
        dst = reinterpret_cast<uint16_t*>(d);
    }

    // Scalar tail for the final 0..7 pixels.
    if (count > 0) {
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                *dst = SkSrcOver32To16(c, *dst);
            }
            dst += 1;
        } while (--count != 0);
    }
}
858
/* SSE2 version of S32_D565_Opaque_Dither()
 * portable version is in core/SkBlitRow_D16.cpp
 *
 * Converts 32-bit premultiplied src pixels to 16-bit 565 dst pixels with
 * an ordered dither.  `alpha` is a global alpha and must be 255 here
 * (the "Opaque" variant); (x, y) give the dither-matrix phase.
 */
void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

    if (count <= 0) {
        return;
    }

    if (count >= 8) {
        // Handle pixels one at a time until dst is 16-byte aligned, so the
        // vector loop below can use aligned stores.
        while (((size_t)dst & 0x0F) != 0) {
            DITHER_565_SCAN(y);
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
            count--;
        }

        // Precompute 8 dither values for this scanline.  The dither matrix
        // repeats every 4 pixels horizontally, so the values for x..x+3 are
        // replicated into both halves of the register; the vector loop
        // advances x by 8 per iteration, which leaves the phase unchanged.
        unsigned short dither_value[8];
        __m128i dither;
#ifdef ENABLE_DITHER_MATRIX_4X4
        const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
        dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
        dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
        dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
        dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
#else
        // Each matrix row is packed as four 4-bit dither values in a uint16.
        const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
        dither_value[0] = dither_value[4] = (dither_scan
                                             >> (((x) & 3) << 2)) & 0xF;
        dither_value[1] = dither_value[5] = (dither_scan
                                             >> (((x + 1) & 3) << 2)) & 0xF;
        dither_value[2] = dither_value[6] = (dither_scan
                                             >> (((x + 2) & 3) << 2)) & 0xF;
        dither_value[3] = dither_value[7] = (dither_scan
                                             >> (((x + 3) & 3) << 2)) & 0xF;
#endif
        dither = _mm_loadu_si128((__m128i*) dither_value);

        const __m128i* s = reinterpret_cast<const __m128i*>(src);
        __m128i* d = reinterpret_cast<__m128i*>(dst);

        while (count >= 8) {
            // Load 8 pixels of src.
            __m128i src_pixel1 = _mm_loadu_si128(s++);
            __m128i src_pixel2 = _mm_loadu_si128(s++);

            // Extract R from src: shift the channel into the top byte of
            // each 32-bit lane, shift it down to the bottom byte, then pack
            // the two registers into eight 16-bit lanes.
            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
            sr1 = _mm_srli_epi32(sr1, 24);
            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
            sr2 = _mm_srli_epi32(sr2, 24);
            __m128i sr = _mm_packs_epi32(sr1, sr2);

            // SkDITHER_R32To565(sr, dither):
            // sr + dither - (sr >> 5), then reduce 8 bits -> 5 bits.
            __m128i sr_offset = _mm_srli_epi16(sr, 5);
            sr = _mm_add_epi16(sr, dither);
            sr = _mm_sub_epi16(sr, sr_offset);
            sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);

            // Extract G from src.
            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
            sg1 = _mm_srli_epi32(sg1, 24);
            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
            sg2 = _mm_srli_epi32(sg2, 24);
            __m128i sg = _mm_packs_epi32(sg1, sg2);

            // Green analogue of the red dither above: green keeps 6 bits in
            // 565, so it uses half the dither value and a >> 6 correction.
            __m128i sg_offset = _mm_srli_epi16(sg, 6);
            sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
            sg = _mm_sub_epi16(sg, sg_offset);
            sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);

            // Extract B from src.
            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
            sb1 = _mm_srli_epi32(sb1, 24);
            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
            sb2 = _mm_srli_epi32(sb2, 24);
            __m128i sb = _mm_packs_epi32(sb1, sb2);

            // Blue dither: same form as red (5 bits in 565).
            __m128i sb_offset = _mm_srli_epi16(sb, 5);
            sb = _mm_add_epi16(sb, dither);
            sb = _mm_sub_epi16(sb, sb_offset);
            sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);

            // Pack and store 8 16-bit dst pixels (aligned store; dst was
            // aligned above).
            __m128i d_pixel = SkPackRGB16_SSE2(sr, sg, sb);
            _mm_store_si128(d++, d_pixel);

            count -= 8;
            x += 8;
        }

        src = reinterpret_cast<const SkPMColor*>(s);
        dst = reinterpret_cast<uint16_t*>(d);
    }

    // Scalar tail for the remaining (fewer than 8) pixels.
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}
972
/* SSE2 version of S32A_D565_Opaque_Dither()
 * portable version is in core/SkBlitRow_D16.cpp
 *
 * Blends 32-bit premultiplied src pixels (with per-pixel alpha) over a
 * 16-bit 565 dst, applying an ordered dither scaled by each pixel's alpha.
 * `alpha` is a global alpha and must be 255 here; (x, y) give the
 * dither-matrix phase.
 */
void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
                                  const SkPMColor* SK_RESTRICT src,
                                  int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

    if (count <= 0) {
        return;
    }

    if (count >= 8) {
        // Handle pixels one at a time until dst is 16-byte aligned, so the
        // vector loop below can use aligned loads/stores on dst.
        while (((size_t)dst & 0x0F) != 0) {
            DITHER_565_SCAN(y);
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // Scale the dither value by this pixel's alpha:
                // d = (dither * (a + 1)) >> 8.
                int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
            count--;
        }

        // Precompute 8 dither values for this scanline.  The dither matrix
        // repeats every 4 pixels horizontally, so the values for x..x+3 are
        // replicated into both halves of the register; the vector loop
        // advances x by 8 per iteration, which leaves the phase unchanged.
        unsigned short dither_value[8];
        __m128i dither, dither_cur;
#ifdef ENABLE_DITHER_MATRIX_4X4
        const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
        dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
        dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
        dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
        dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
#else
        // Each matrix row is packed as four 4-bit dither values in a uint16.
        const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
        dither_value[0] = dither_value[4] = (dither_scan
                                             >> (((x) & 3) << 2)) & 0xF;
        dither_value[1] = dither_value[5] = (dither_scan
                                             >> (((x + 1) & 3) << 2)) & 0xF;
        dither_value[2] = dither_value[6] = (dither_scan
                                             >> (((x + 2) & 3) << 2)) & 0xF;
        dither_value[3] = dither_value[7] = (dither_scan
                                             >> (((x + 3) & 3) << 2)) & 0xF;
#endif
        dither = _mm_loadu_si128((__m128i*) dither_value);

        const __m128i* s = reinterpret_cast<const __m128i*>(src);
        __m128i* d = reinterpret_cast<__m128i*>(dst);
        // Loop-invariant constants: 256 for (256 - a), and the 565 channel
        // masks for extracting dst components.
        __m128i var256 = _mm_set1_epi16(256);
        __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
        __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
        __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);

        while (count >= 8) {
            // Load 8 pixels of src and dst.
            __m128i src_pixel1 = _mm_loadu_si128(s++);
            __m128i src_pixel2 = _mm_loadu_si128(s++);
            __m128i dst_pixel = _mm_load_si128(d);

            // Extract A from src: shift the channel into the top byte of
            // each 32-bit lane, shift it down, and pack into 16-bit lanes.
            __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
            sa1 = _mm_srli_epi32(sa1, 24);
            __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
            sa2 = _mm_srli_epi32(sa2, 24);
            __m128i sa = _mm_packs_epi32(sa1, sa2);

            // Per-pixel dither, scaled by alpha; vector form of
            // SkAlphaMul(dither, SkAlpha255To256(a)) = (dither*(a+1)) >> 8.
            dither_cur = _mm_mullo_epi16(dither,
                                         _mm_add_epi16(sa, _mm_set1_epi16(1)));
            dither_cur = _mm_srli_epi16(dither_cur, 8);

            // Extract R from src.
            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
            sr1 = _mm_srli_epi32(sr1, 24);
            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
            sr2 = _mm_srli_epi32(sr2, 24);
            __m128i sr = _mm_packs_epi32(sr1, sr2);

            // SkDITHER_R32_FOR_565(sr, d): sr + d - (sr >> 5).
            __m128i sr_offset = _mm_srli_epi16(sr, 5);
            sr = _mm_add_epi16(sr, dither_cur);
            sr = _mm_sub_epi16(sr, sr_offset);

            // Expand sr into the 10-bit field position (vector counterpart
            // of the scalar (sr << 13) ... >> 5 in the expanded layout).
            sr = _mm_slli_epi16(sr, 2);

            // Extract G from src.
            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
            sg1 = _mm_srli_epi32(sg1, 24);
            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
            sg2 = _mm_srli_epi32(sg2, 24);
            __m128i sg = _mm_packs_epi32(sg1, sg2);

            // sg = SkDITHER_G32_FOR_565(sg, d): green keeps 6 bits, so it
            // uses half the dither value and a >> 6 correction term.
            __m128i sg_offset = _mm_srli_epi16(sg, 6);
            sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));
            sg = _mm_sub_epi16(sg, sg_offset);

            // Expand sg into its field position.
            sg = _mm_slli_epi16(sg, 3);

            // Extract B from src.
            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
            sb1 = _mm_srli_epi32(sb1, 24);
            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
            sb2 = _mm_srli_epi32(sb2, 24);
            __m128i sb = _mm_packs_epi32(sb1, sb2);

            // sb = SkDITHER_B32_FOR_565(sb, d): same form as red.
            __m128i sb_offset = _mm_srli_epi16(sb, 5);
            sb = _mm_add_epi16(sb, dither_cur);
            sb = _mm_sub_epi16(sb, sb_offset);

            // Expand sb into its field position.
            sb = _mm_slli_epi16(sb, 2);

            // Extract R G B from dst (565 components).
            __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
            dr = _mm_and_si128(dr, r16_mask);
            __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
            dg = _mm_and_si128(dg, g16_mask);
            __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
            db = _mm_and_si128(db, b16_mask);

            // Inverse-alpha scale: SkAlpha255To256(255 - a) >> 3
            // = (256 - a) >> 3.
            __m128i isa = _mm_sub_epi16(var256, sa);
            isa = _mm_srli_epi16(isa, 3);

            // Blend each channel: (src_expanded + dst * isa) >> 5, matching
            // the scalar path above.
            dr = _mm_mullo_epi16(dr, isa);
            dr = _mm_add_epi16(dr, sr);
            dr = _mm_srli_epi16(dr, 5);

            dg = _mm_mullo_epi16(dg, isa);
            dg = _mm_add_epi16(dg, sg);
            dg = _mm_srli_epi16(dg, 5);

            db = _mm_mullo_epi16(db, isa);
            db = _mm_add_epi16(db, sb);
            db = _mm_srli_epi16(db, 5);

            // Pack and store 8 16-bit dst pixels.
            __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
            _mm_store_si128(d++, d_pixel);

            count -= 8;
            x += 8;
        }

        src = reinterpret_cast<const SkPMColor*>(s);
        dst = reinterpret_cast<uint16_t*>(d);
    }

    // Scalar tail for the remaining (fewer than 8) pixels.
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // Scale the dither value by this pixel's alpha.
                int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}
1168