1 /*
2  * Copyright 2016 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #ifndef SkSwizzler_opts_DEFINED
9 #define SkSwizzler_opts_DEFINED
10 
11 #include "SkColorData.h"
12 
13 #include <utility>
14 
15 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
16     #include <immintrin.h>
17 #elif defined(SK_ARM_HAS_NEON)
18     #include <arm_neon.h>
19 #endif
20 
21 namespace SK_OPTS_NS {
22 
// Premultiplies `count` 32-bit pixels from src into dst, keeping channel order.
// Each pixel carries alpha in bits 24-31 and b, g, r in bits 16-23, 8-15, 0-7.
// Every color channel becomes (channel * alpha + 127) / 255; alpha is copied.
static void RGBA_to_rgbA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        const uint32_t px    = src[i];
        const uint32_t alpha = px >> 24;
        const uint32_t blue  = (((px >> 16) & 0xFF) * alpha + 127) / 255;
        const uint32_t green = (((px >>  8) & 0xFF) * alpha + 127) / 255;
        const uint32_t red   = (((px >>  0) & 0xFF) * alpha + 127) / 255;
        dst[i] = (alpha << 24) | (blue << 16) | (green << 8) | red;
    }
}
38 
// Premultiplies `count` 32-bit pixels from src into dst and exchanges the
// channels stored in bits 0-7 and bits 16-23.  Alpha (bits 24-31) is copied
// through; each color channel becomes (channel * alpha + 127) / 255.
static void RGBA_to_bgrA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        const uint32_t px    = src[i];
        const uint32_t alpha = px >> 24;
        const uint32_t hi    = (((px >> 16) & 0xFF) * alpha + 127) / 255;  // source bits 16-23
        const uint32_t mid   = (((px >>  8) & 0xFF) * alpha + 127) / 255;  // source bits  8-15
        const uint32_t lo    = (( px        & 0xFF) * alpha + 127) / 255;  // source bits  0-7
        // The low and high color channels trade places in the output.
        dst[i] = (alpha << 24) | (lo << 16) | (mid << 8) | hi;
    }
}
54 
// Copies `count` 32-bit pixels from src to dst, exchanging the bytes held in
// bits 0-7 and bits 16-23.  Bits 8-15 and 24-31 are left where they are.
static void RGBA_to_BGRA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        const uint32_t px = src[i];
        dst[i] = (px & 0xFF00FF00u)           // alpha and green stay put
               | ((px & 0x00FF0000u) >> 16)   // bits 16-23 move down
               | ((px & 0x000000FFu) << 16);  // bits 0-7 move up
    }
}
67 
// Expands `count` packed 3-byte pixels (bytes r, g, b) into 32-bit pixels
// with r in bits 0-7, g in bits 8-15, b in bits 16-23 and 0xFF in bits 24-31.
static void RGB_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; ++i, src += 3) {
        dst[i] = 0xFF000000u
               | ((uint32_t)src[2] << 16)
               | ((uint32_t)src[1] <<  8)
               |  (uint32_t)src[0];
    }
}
80 
// Expands `count` packed 3-byte pixels (bytes r, g, b) into 32-bit pixels
// with b in bits 0-7, g in bits 8-15, r in bits 16-23 and 0xFF in bits 24-31.
static void RGB_to_BGR1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; ++i, src += 3) {
        dst[i] = 0xFF000000u
               | ((uint32_t)src[0] << 16)
               | ((uint32_t)src[1] <<  8)
               |  (uint32_t)src[2];
    }
}
93 
// Expands `count` single-byte gray values into 32-bit pixels whose three
// low bytes all equal the gray value and whose top byte is 0xFF.
static void gray_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        // Multiplying by 0x010101 replicates the byte into bits 0-7, 8-15, 16-23.
        dst[i] = 0xFF000000u | ((uint32_t)src[i] * 0x010101u);
    }
}
102 
// Expands `count` 2-byte (gray, alpha) pairs into 32-bit pixels: gray is
// replicated into the three low bytes and alpha goes to bits 24-31.
// No premultiplication is applied.
static void grayA_to_RGBA_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; ++i, src += 2) {
        const uint32_t gray  = src[0];
        const uint32_t alpha = src[1];
        dst[i] = (alpha << 24) | (gray * 0x010101u);
    }
}
114 
// Expands `count` 2-byte (gray, alpha) pairs into premultiplied 32-bit pixels:
// gray is scaled to (gray * alpha + 127) / 255, replicated into the three low
// bytes, and alpha goes to bits 24-31.
static void grayA_to_rgbA_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; ++i, src += 2) {
        const uint32_t alpha  = src[1];
        const uint32_t scaled = (src[0] * alpha + 127) / 255;
        dst[i] = (alpha << 24) | (scaled * 0x010101u);
    }
}
127 
// Converts `count` 32-bit inverted-CMYK pixels (c in bits 0-7, m in 8-15,
// y in 16-23, k in 24-31) to opaque pixels with r in bits 0-7, g in 8-15 and
// b in 16-23.  See comments in SkSwizzler.cpp for details on the conversion
// formula; here each of c, m, y is scaled by k as (v * k + 127) / 255.
static void inverted_CMYK_to_RGB1_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        const uint32_t px    = src[i];
        const uint32_t black = px >> 24;
        const uint32_t blue  = (((px >> 16) & 0xFF) * black + 127) / 255;
        const uint32_t green = (((px >>  8) & 0xFF) * black + 127) / 255;
        const uint32_t red   = (( px        & 0xFF) * black + 127) / 255;
        dst[i] = 0xFF000000u | (blue << 16) | (green << 8) | red;
    }
}
144 
// Converts `count` 32-bit inverted-CMYK pixels (c in bits 0-7, m in 8-15,
// y in 16-23, k in 24-31) to opaque pixels with b in bits 0-7, g in 8-15 and
// r in 16-23.  Each of c, m, y is scaled by k as (v * k + 127) / 255.
static void inverted_CMYK_to_BGR1_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        const uint32_t px    = src[i];
        const uint32_t black = px >> 24;
        const uint32_t blue  = (((px >> 16) & 0xFF) * black + 127) / 255;
        const uint32_t green = (((px >>  8) & 0xFF) * black + 127) / 255;
        const uint32_t red   = (( px        & 0xFF) * black + 127) / 255;
        dst[i] = 0xFF000000u | (red << 16) | (green << 8) | blue;
    }
}
160 
161 #if defined(SK_ARM_HAS_NEON)
162 
// Rounded divide by 255, (x + 127) / 255
// Operates on eight 16-bit lanes and returns the eight results narrowed to
// 8 bits.  The only caller in this file, scale(), passes products of two bytes.
static uint8x8_t div255_round(uint16x8_t x) {
    // result = (x + 127) / 255
    // result = (x + 127) / 256 + error1
    //
    // error1 = (x + 127) / (255 * 256)
    // error1 = (x + 127) / (256 * 256) + error2
    //
    // error2 = (x + 127) / (255 * 256 * 256)
    //
    // The maximum value of error2 is too small to matter.  Thus:
    // result = (x + 127) / 256 + (x + 127) / (256 * 256)
    // result = ((x + 127) / 256 + x + 127) / 256
    // result = (((x + 127) >> 8) + x + 127) >> 8
    //
    // Use >>> to represent "rounded right shift" which, conveniently,
    // NEON supports in one instruction.
    // result = ((x >>> 8) + x) >>> 8
    //
    // Note that the second right shift is actually performed as an
    // "add, round, and narrow back to 8-bits" instruction.
    return vraddhn_u16(x, vrshrq_n_u16(x, 8));
}
186 
// Scale a byte by another, (x * y + 127) / 255
// Works lane-wise on eight bytes: vmull_u8 widens each product to 16 bits,
// and div255_round() performs the rounded divide and narrows back to 8 bits.
static uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
    return div255_round(vmull_u8(x, y));
}
191 
// NEON premultiply: scales the three color channels of each pixel by its
// alpha, and, when kSwapRB is set, also exchanges the first and third
// channels.  Works on 8 pixels per iteration; the remaining [0,8) pixels are
// handed to the matching portable routine.
template <bool kSwapRB>
static void premul_should_swapRB(uint32_t* dst, const uint32_t* src, int count) {
    for (; count >= 8; src += 8, dst += 8, count -= 8) {
        // De-interleave 8 pixels into one vector per channel.
        const uint8x8x4_t in = vld4_u8((const uint8_t*) src);
        const uint8x8_t alpha = in.val[3];

        // Premultiply each color channel by alpha.
        const uint8x8_t red   = scale(in.val[0], alpha),
                        green = scale(in.val[1], alpha),
                        blue  = scale(in.val[2], alpha);

        // Re-interleave, choosing the output channel order.
        uint8x8x4_t out;
        if (kSwapRB) {
            out.val[0] = blue;
            out.val[2] = red;
        } else {
            out.val[0] = red;
            out.val[2] = blue;
        }
        out.val[1] = green;
        out.val[3] = alpha;
        vst4_u8((uint8_t*) dst, out);
    }

    // Fewer than 8 pixels remain: finish with scalar code.
    if (kSwapRB) {
        RGBA_to_bgrA_portable(dst, src, count);
    } else {
        RGBA_to_rgbA_portable(dst, src, count);
    }
}
228 
// NEON build: premultiplies `count` pixels from src into dst without
// reordering channels, via premul_should_swapRB<false>.
/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB<false>(dst, src, count);
}
232 
// NEON build: premultiplies `count` pixels from src into dst and swaps the
// r and b channels, via premul_should_swapRB<true>.
/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB<true>(dst, src, count);
}
236 
// NEON build: copies `count` pixels from src to dst with the first and third
// channels exchanged.  Processes 16 pixels per iteration, then at most one
// group of 8, and leaves the final [0,8) pixels to the portable routine.
/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    for (; count >= 16; src += 16, dst += 16, count -= 16) {
        // De-interleave 16 pixels, then store them back with channels 0 and 2 exchanged.
        const uint8x16x4_t in = vld4q_u8((const uint8_t*) src);
        uint8x16x4_t out;
        out.val[0] = in.val[2];
        out.val[1] = in.val[1];
        out.val[2] = in.val[0];
        out.val[3] = in.val[3];
        vst4q_u8((uint8_t*) dst, out);
    }

    if (count >= 8) {
        // Same operation on a single group of 8 pixels.
        const uint8x8x4_t in = vld4_u8((const uint8_t*) src);
        uint8x8x4_t out;
        out.val[0] = in.val[2];
        out.val[1] = in.val[1];
        out.val[2] = in.val[0];
        out.val[3] = in.val[3];
        vst4_u8((uint8_t*) dst, out);
        src += 8;
        dst += 8;
        count -= 8;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}
269 
// NEON build: expands packed 3-byte pixels into 4-byte pixels with an opaque
// (0xFF) fourth channel, exchanging the first and third channels when kSwapRB
// is set.  Processes 16 pixels per iteration, then at most one group of 8,
// and leaves the final [0,8) pixels to the matching portable routine.
template <bool kSwapRB>
static void insert_alpha_should_swaprb(uint32_t dst[], const uint8_t* src, int count) {
    // Source channel indices that feed output channels 0 and 2.
    const int first = kSwapRB ? 2 : 0,
              third = kSwapRB ? 0 : 2;

    for (; count >= 16; src += 16*3, dst += 16, count -= 16) {
        const uint8x16x3_t in = vld3q_u8(src);

        uint8x16x4_t out;
        out.val[0] = in.val[first];
        out.val[1] = in.val[1];
        out.val[2] = in.val[third];
        out.val[3] = vdupq_n_u8(0xFF);
        vst4q_u8((uint8_t*) dst, out);
    }

    if (count >= 8) {
        const uint8x8x3_t in = vld3_u8(src);

        uint8x8x4_t out;
        out.val[0] = in.val[first];
        out.val[1] = in.val[1];
        out.val[2] = in.val[third];
        out.val[3] = vdup_n_u8(0xFF);
        vst4_u8((uint8_t*) dst, out);

        src += 8*3;
        dst += 8;
        count -= 8;
    }

    // Fewer than 8 pixels remain: finish with scalar code.
    if (kSwapRB) {
        RGB_to_BGR1_portable(dst, src, count);
    } else {
        RGB_to_RGB1_portable(dst, src, count);
    }
}
322 
// NEON build: expands `count` 3-byte pixels to 4-byte pixels with opaque
// alpha, keeping channel order, via insert_alpha_should_swaprb<false>.
/*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    insert_alpha_should_swaprb<false>(dst, src, count);
}
326 
// NEON build: expands `count` 3-byte pixels to 4-byte pixels with opaque
// alpha, swapping r and b, via insert_alpha_should_swaprb<true>.
/*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
    insert_alpha_should_swaprb<true>(dst, src, count);
}
330 
// NEON build: expands `count` single-byte gray values into 4-byte pixels
// whose three color channels equal the gray value and whose fourth channel is
// 0xFF.  Processes 16 pixels per iteration, then at most one group of 8, and
// leaves the final [0,8) pixels to gray_to_RGB1_portable.
/*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    for (; count >= 16; src += 16, dst += 16, count -= 16) {
        const uint8x16_t luma = vld1q_u8(src);

        // The interleaving store replicates luma into three channels.
        uint8x16x4_t out;
        out.val[3] = vdupq_n_u8(0xFF);
        out.val[2] = luma;
        out.val[1] = luma;
        out.val[0] = luma;
        vst4q_u8((uint8_t*) dst, out);
    }

    if (count >= 8) {
        const uint8x8_t luma = vld1_u8(src);

        uint8x8x4_t out;
        out.val[3] = vdup_n_u8(0xFF);
        out.val[2] = luma;
        out.val[1] = luma;
        out.val[0] = luma;
        vst4_u8((uint8_t*) dst, out);

        src += 8;
        dst += 8;
        count -= 8;
    }

    gray_to_RGB1_portable(dst, src, count);
}
370 
// NEON build: expands 2-byte (gray, alpha) pairs into 4-byte pixels with gray
// replicated into three channels and alpha in the fourth.  When kPremul is
// set, gray is first scaled by alpha.  Processes 16 pixels per iteration, then
// at most one group of 8, and leaves the final [0,8) pixels to the matching
// portable routine.
template <bool kPremul>
static void expand_grayA(uint32_t dst[], const uint8_t* src, int count) {
    for (; count >= 16; src += 16*2, dst += 16, count -= 16) {
        // De-interleave into a gray vector and an alpha vector.
        const uint8x16x2_t in = vld2q_u8(src);
        const uint8x16_t alpha = in.val[1];
        uint8x16_t gray = in.val[0];

        if (kPremul) {
            // scale() is 8 lanes wide, so handle each half separately.
            const uint8x8_t lo = scale(vget_low_u8(gray),  vget_low_u8(alpha)),
                            hi = scale(vget_high_u8(gray), vget_high_u8(alpha));
            gray = vcombine_u8(lo, hi);
        }

        uint8x16x4_t out;
        out.val[0] = gray;
        out.val[1] = gray;
        out.val[2] = gray;
        out.val[3] = alpha;
        vst4q_u8((uint8_t*) dst, out);
    }

    if (count >= 8) {
        const uint8x8x2_t in = vld2_u8(src);
        const uint8x8_t alpha = in.val[1];
        uint8x8_t gray = in.val[0];

        if (kPremul) {
            gray = scale(gray, alpha);
        }

        uint8x8x4_t out;
        out.val[0] = gray;
        out.val[1] = gray;
        out.val[2] = gray;
        out.val[3] = alpha;
        vst4_u8((uint8_t*) dst, out);

        src += 8*2;
        dst += 8;
        count -= 8;
    }

    // Fewer than 8 pixels remain: finish with scalar code.
    if (kPremul) {
        grayA_to_rgbA_portable(dst, src, count);
    } else {
        grayA_to_RGBA_portable(dst, src, count);
    }
}
424 
// NEON build: expands `count` (gray, alpha) pairs to 4-byte pixels without
// premultiplying, via expand_grayA<false>.
/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
    expand_grayA<false>(dst, src, count);
}
428 
// NEON build: expands `count` (gray, alpha) pairs to 4-byte pixels with gray
// premultiplied by alpha, via expand_grayA<true>.
/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
    expand_grayA<true>(dst, src, count);
}
432 
// Output channel order selected for inverted_cmyk_to(): kRGB1 stores each
// pixel's bytes as r, g, b, 0xFF; kBGR1 stores them as b, g, r, 0xFF.
enum Format { kRGB1, kBGR1 };
// NEON build: converts inverted-CMYK pixels to opaque RGB by scaling each of
// c, m, y by k, writing the result in the channel order chosen by `format`.
// Works on 8 pixels per iteration; the remaining [0,8) pixels are handed to
// the matching portable routine.
template <Format format>
static void inverted_cmyk_to(uint32_t* dst, const uint32_t* src, int count) {
    for (; count >= 8; src += 8, dst += 8, count -= 8) {
        // De-interleave 8 pixels into c, m, y, k vectors.
        const uint8x8x4_t cmyk = vld4_u8((const uint8_t*) src);
        const uint8x8_t black = cmyk.val[3];

        // Scale to r, g, b.
        const uint8x8_t red   = scale(cmyk.val[0], black),
                        green = scale(cmyk.val[1], black),
                        blue  = scale(cmyk.val[2], black);

        // Re-interleave with an opaque fourth channel.
        uint8x8x4_t out;
        if (kBGR1 == format) {
            out.val[0] = blue;
            out.val[2] = red;
        } else {
            out.val[0] = red;
            out.val[2] = blue;
        }
        out.val[1] = green;
        out.val[3] = vdup_n_u8(0xFF);
        vst4_u8((uint8_t*) dst, out);
    }

    // Fewer than 8 pixels remain: finish with scalar code.
    if (kBGR1 == format) {
        inverted_CMYK_to_BGR1_portable(dst, src, count);
    } else {
        inverted_CMYK_to_RGB1_portable(dst, src, count);
    }
}
471 
// NEON build: converts `count` inverted-CMYK pixels to opaque pixels in
// r, g, b byte order, via inverted_cmyk_to<kRGB1>.
/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to<kRGB1>(dst, src, count);
}
475 
// NEON build: converts `count` inverted-CMYK pixels to opaque pixels in
// b, g, r byte order, via inverted_cmyk_to<kBGR1>.
/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to<kBGR1>(dst, src, count);
}
479 
480 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
481 
// Scale a byte by another: computes (x * y + 127) / 255 in each 16-bit lane.
// Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
static __m128i scale(__m128i x, __m128i y) {
    // (p+127)/255 == ((p+128)*257)>>16 for 0 <= p <= 255*255.
    const __m128i product = _mm_mullo_epi16(x, y);
    const __m128i biased  = _mm_add_epi16(product, _mm_set1_epi16(128));

    // mulhi keeps the upper 16 bits of the 32-bit product, i.e. the >>16.
    return _mm_mulhi_epu16(biased, _mm_set1_epi16(257));
}
491 
// SSSE3 premultiply: scales the three color channels of each pixel by its
// alpha and, when kSwapRB is set, also exchanges the first and third channels.
// Handles 8 pixels per loop iteration, then at most one group of 4, and hands
// the remaining [0,4) pixels to the matching portable routine.
template <bool kSwapRB>
static void premul_should_swapRB(uint32_t* dst, const uint32_t* src, int count) {

    // Premultiplies 8 pixels in place: *lo holds pixels 0-3, *hi pixels 4-7.
    auto premul8 = [](__m128i* lo, __m128i* hi) {
        const __m128i zeros = _mm_setzero_si128();
        // Byte shuffle that gathers each channel of 4 pixels into 4 adjacent
        // bytes.  The kSwapRB variant fetches source bytes 2 and 0 in exchanged
        // positions, so the swap happens here and the names r/b below refer to
        // output channels.
        __m128i planar;
        if (kSwapRB) {
            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm_shuffle_epi8(*lo, planar);                      // rrrrgggg bbbbaaaa
        *hi = _mm_shuffle_epi8(*hi, planar);                      // RRRRGGGG BBBBAAAA
        __m128i rg = _mm_unpacklo_epi32(*lo, *hi),                // rrrrRRRR ggggGGGG
                ba = _mm_unpackhi_epi32(*lo, *hi);                // bbbbBBBB aaaaAAAA

        // Unpack to 16-bit planar.
        __m128i r = _mm_unpacklo_epi8(rg, zeros),                 // r_r_r_r_ R_R_R_R_
                g = _mm_unpackhi_epi8(rg, zeros),                 // g_g_g_g_ G_G_G_G_
                b = _mm_unpacklo_epi8(ba, zeros),                 // b_b_b_b_ B_B_B_B_
                a = _mm_unpackhi_epi8(ba, zeros);                 // a_a_a_a_ A_A_A_A_

        // Premultiply!
        r = scale(r, a);
        g = scale(g, a);
        b = scale(b, a);

        // Repack into interlaced pixels.
        rg = _mm_or_si128(r, _mm_slli_epi16(g, 8));               // rgrgrgrg RGRGRGRG
        ba = _mm_or_si128(b, _mm_slli_epi16(a, 8));               // babababa BABABABA
        *lo = _mm_unpacklo_epi16(rg, ba);                         // rgbargba rgbargba
        *hi = _mm_unpackhi_epi16(rg, ba);                         // RGBARGBA RGBARGBA
    };

    // Main loop: two unaligned 4-pixel loads and stores per iteration.
    while (count >= 8) {
        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
                hi = _mm_loadu_si128((const __m128i*) (src + 4));

        premul8(&lo, &hi);

        _mm_storeu_si128((__m128i*) (dst + 0), lo);
        _mm_storeu_si128((__m128i*) (dst + 4), hi);

        src += 8;
        dst += 8;
        count -= 8;
    }

    // One group of 4: the upper half is zero padding whose result is discarded.
    if (count >= 4) {
        __m128i lo = _mm_loadu_si128((const __m128i*) src),
                hi = _mm_setzero_si128();

        premul8(&lo, &hi);

        _mm_storeu_si128((__m128i*) dst, lo);

        src += 4;
        dst += 4;
        count -= 4;
    }

    // Call portable code to finish up the tail of [0,4) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}
559 
// SSSE3 build: premultiplies `count` pixels from src into dst without
// reordering channels, via premul_should_swapRB<false>.
/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB<false>(dst, src, count);
}
563 
// SSSE3 build: premultiplies `count` pixels from src into dst and swaps the
// r and b channels, via premul_should_swapRB<true>.
/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB<true>(dst, src, count);
}
567 
568 /*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
569     const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);
570 
571     while (count >= 4) {
572         __m128i rgba = _mm_loadu_si128((const __m128i*) src);
573         __m128i bgra = _mm_shuffle_epi8(rgba, swapRB);
574         _mm_storeu_si128((__m128i*) dst, bgra);
575 
576         src += 4;
577         dst += 4;
578         count -= 4;
579     }
580 
581     RGBA_to_BGRA_portable(dst, src, count);
582 }
583 
584 template <bool kSwapRB>
585 static void insert_alpha_should_swaprb(uint32_t dst[], const uint8_t* src, int count) {
586     const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
587     __m128i expand;
588     const uint8_t X = 0xFF; // Used a placeholder.  The value of X is irrelevant.
589     if (kSwapRB) {
590         expand = _mm_setr_epi8(2,1,0,X, 5,4,3,X, 8,7,6,X, 11,10,9,X);
591     } else {
592         expand = _mm_setr_epi8(0,1,2,X, 3,4,5,X, 6,7,8,X, 9,10,11,X);
593     }
594 
595     while (count >= 6) {
596         // Load a vector.  While this actually contains 5 pixels plus an
597         // extra component, we will discard all but the first four pixels on
598         // this iteration.
599         __m128i rgb = _mm_loadu_si128((const __m128i*) src);
600 
601         // Expand the first four pixels to RGBX and then mask to RGB(FF).
602         __m128i rgba = _mm_or_si128(_mm_shuffle_epi8(rgb, expand), alphaMask);
603 
604         // Store 4 pixels.
605         _mm_storeu_si128((__m128i*) dst, rgba);
606 
607         src += 4*3;
608         dst += 4;
609         count -= 4;
610     }
611 
612     // Call portable code to finish up the tail of [0,4) pixels.
613     auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
614     proc(dst, src, count);
615 }
616 
// SSSE3 build: expands `count` 3-byte pixels to 4-byte pixels with opaque
// alpha, keeping channel order, via insert_alpha_should_swaprb<false>.
/*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    insert_alpha_should_swaprb<false>(dst, src, count);
}
620 
// SSSE3 build: expands `count` 3-byte pixels to 4-byte pixels with opaque
// alpha, swapping r and b, via insert_alpha_should_swaprb<true>.
/*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
    insert_alpha_should_swaprb<true>(dst, src, count);
}
624 
625 /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
626     const __m128i alphas = _mm_set1_epi8((uint8_t) 0xFF);
627     while (count >= 16) {
628         __m128i grays = _mm_loadu_si128((const __m128i*) src);
629 
630         __m128i gg_lo = _mm_unpacklo_epi8(grays, grays);
631         __m128i gg_hi = _mm_unpackhi_epi8(grays, grays);
632         __m128i ga_lo = _mm_unpacklo_epi8(grays, alphas);
633         __m128i ga_hi = _mm_unpackhi_epi8(grays, alphas);
634 
635         __m128i ggga0 = _mm_unpacklo_epi16(gg_lo, ga_lo);
636         __m128i ggga1 = _mm_unpackhi_epi16(gg_lo, ga_lo);
637         __m128i ggga2 = _mm_unpacklo_epi16(gg_hi, ga_hi);
638         __m128i ggga3 = _mm_unpackhi_epi16(gg_hi, ga_hi);
639 
640         _mm_storeu_si128((__m128i*) (dst +  0), ggga0);
641         _mm_storeu_si128((__m128i*) (dst +  4), ggga1);
642         _mm_storeu_si128((__m128i*) (dst +  8), ggga2);
643         _mm_storeu_si128((__m128i*) (dst + 12), ggga3);
644 
645         src += 16;
646         dst += 16;
647         count -= 16;
648     }
649 
650     gray_to_RGB1_portable(dst, src, count);
651 }
652 
653 /*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
654     while (count >= 8) {
655         __m128i ga = _mm_loadu_si128((const __m128i*) src);
656 
657         __m128i gg = _mm_or_si128(_mm_and_si128(ga, _mm_set1_epi16(0x00FF)),
658                                   _mm_slli_epi16(ga, 8));
659 
660         __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
661         __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
662 
663         _mm_storeu_si128((__m128i*) (dst +  0), ggga_lo);
664         _mm_storeu_si128((__m128i*) (dst +  4), ggga_hi);
665 
666         src += 8*2;
667         dst += 8;
668         count -= 8;
669     }
670 
671     grayA_to_RGBA_portable(dst, src, count);
672 }
673 
674 /*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
675     while (count >= 8) {
676         __m128i grayA = _mm_loadu_si128((const __m128i*) src);
677 
678         __m128i g0 = _mm_and_si128(grayA, _mm_set1_epi16(0x00FF));
679         __m128i a0 = _mm_srli_epi16(grayA, 8);
680 
681         // Premultiply
682         g0 = scale(g0, a0);
683 
684         __m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8));
685         __m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8));
686 
687 
688         __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
689         __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
690 
691         _mm_storeu_si128((__m128i*) (dst +  0), ggga_lo);
692         _mm_storeu_si128((__m128i*) (dst +  4), ggga_hi);
693 
694         src += 8*2;
695         dst += 8;
696         count -= 8;
697     }
698 
699     grayA_to_rgbA_portable(dst, src, count);
700 }
701 
// Output channel order selected for inverted_cmyk_to(): kRGB1 stores each
// pixel's bytes as r, g, b, 0xFF; kBGR1 stores them as b, g, r, 0xFF.
enum Format { kRGB1, kBGR1 };
// SSSE3 build: converts inverted-CMYK pixels to opaque RGB by scaling each of
// c, m, y by k, writing the result in the channel order chosen by `format`.
// Handles 8 pixels per loop iteration, then at most one group of 4, and hands
// the remaining [0,4) pixels to the matching portable routine.
template <Format format>
static void inverted_cmyk_to(uint32_t* dst, const uint32_t* src, int count) {
    // Converts 8 pixels in place: *lo holds pixels 0-3, *hi pixels 4-7.
    auto convert8 = [](__m128i* lo, __m128i* hi) {
        const __m128i zeros = _mm_setzero_si128();
        // Byte shuffle that gathers each channel of 4 pixels into 4 adjacent
        // bytes.  For kBGR1 source bytes 0 and 2 are fetched in exchanged
        // positions, so the channel swap happens here; the lane comments below
        // describe the kRGB1 case.
        __m128i planar;
        if (kBGR1 == format) {
            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm_shuffle_epi8(*lo, planar);                                 // ccccmmmm yyyykkkk
        *hi = _mm_shuffle_epi8(*hi, planar);                                 // CCCCMMMM YYYYKKKK
        __m128i cm = _mm_unpacklo_epi32(*lo, *hi),                           // ccccCCCC mmmmMMMM
                yk = _mm_unpackhi_epi32(*lo, *hi);                           // yyyyYYYY kkkkKKKK

        // Unpack to 16-bit planar.
        __m128i c = _mm_unpacklo_epi8(cm, zeros),                            // c_c_c_c_ C_C_C_C_
                m = _mm_unpackhi_epi8(cm, zeros),                            // m_m_m_m_ M_M_M_M_
                y = _mm_unpacklo_epi8(yk, zeros),                            // y_y_y_y_ Y_Y_Y_Y_
                k = _mm_unpackhi_epi8(yk, zeros);                            // k_k_k_k_ K_K_K_K_

        // Scale to r, g, b.
        __m128i r = scale(c, k),
                g = scale(m, k),
                b = scale(y, k);

        // Repack into interlaced pixels.
        __m128i rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)),                  // rgrgrgrg RGRGRGRG
                ba = _mm_or_si128(b, _mm_set1_epi16((uint16_t) 0xFF00));     // b1b1b1b1 B1B1B1B1
        *lo = _mm_unpacklo_epi16(rg, ba);                                    // rgb1rgb1 rgb1rgb1
        *hi = _mm_unpackhi_epi16(rg, ba);                                    // RGB1RGB1 RGB1RGB1
    };

    // Main loop: two unaligned 4-pixel loads and stores per iteration.
    while (count >= 8) {
        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
                hi = _mm_loadu_si128((const __m128i*) (src + 4));

        convert8(&lo, &hi);

        _mm_storeu_si128((__m128i*) (dst + 0), lo);
        _mm_storeu_si128((__m128i*) (dst + 4), hi);

        src += 8;
        dst += 8;
        count -= 8;
    }

    // One group of 4: the upper half is zero padding whose result is discarded.
    if (count >= 4) {
        __m128i lo = _mm_loadu_si128((const __m128i*) src),
                hi = _mm_setzero_si128();

        convert8(&lo, &hi);

        _mm_storeu_si128((__m128i*) dst, lo);

        src += 4;
        dst += 4;
        count -= 4;
    }

    // Fewer than 4 pixels remain: finish with the matching portable routine.
    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}
768 
// SSSE3 build: converts `count` inverted-CMYK pixels to opaque pixels in
// r, g, b byte order, via inverted_cmyk_to<kRGB1>.
/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to<kRGB1>(dst, src, count);
}
772 
// SSSE3 build: converts `count` inverted-CMYK pixels to opaque pixels in
// b, g, r byte order, via inverted_cmyk_to<kBGR1>.
/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to<kBGR1>(dst, src, count);
}
776 
777 #else
778 
// Fallback when neither NEON nor SSSE3 is available: forwards directly to
// the portable premultiply.
/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    RGBA_to_rgbA_portable(dst, src, count);
}
782 
// Fallback when neither NEON nor SSSE3 is available: forwards directly to
// the portable premultiply-and-swap.
/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    RGBA_to_bgrA_portable(dst, src, count);
}
786 
// Fallback when neither NEON nor SSSE3 is available: forwards directly to
// the portable r/b swap.
/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    RGBA_to_BGRA_portable(dst, src, count);
}
790 
// Fallback when neither NEON nor SSSE3 is available: forwards directly to
// the portable 3-byte to 4-byte expansion.
/*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    RGB_to_RGB1_portable(dst, src, count);
}
794 
// Fallback when neither NEON nor SSSE3 is available: forwards directly to
// the portable 3-byte to 4-byte expansion with r and b swapped.
/*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
    RGB_to_BGR1_portable(dst, src, count);
}
798 
// Fallback when neither NEON nor SSSE3 is available: forwards directly to
// the portable gray expansion.
/*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    gray_to_RGB1_portable(dst, src, count);
}
802 
// Fallback when neither NEON nor SSSE3 is available: forwards directly to
// the portable gray+alpha expansion (no premultiply).
/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
    grayA_to_RGBA_portable(dst, src, count);
}
806 
// Fallback when neither NEON nor SSSE3 is available: forwards directly to
// the portable gray+alpha expansion with premultiply.
/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
    grayA_to_rgbA_portable(dst, src, count);
}
810 
// Fallback when neither NEON nor SSSE3 is available: forwards directly to
// the portable inverted-CMYK to RGB1 conversion.
/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_CMYK_to_RGB1_portable(dst, src, count);
}
814 
// Fallback when neither NEON nor SSSE3 is available: forwards directly to
// the portable inverted-CMYK to BGR1 conversion.
/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_CMYK_to_BGR1_portable(dst, src, count);
}
818 
819 #endif
820 
821 }
822 
823 #endif // SkSwizzler_opts_DEFINED
824