1 /*
2  * Copyright 2016 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #ifndef SkSwizzler_opts_DEFINED
9 #define SkSwizzler_opts_DEFINED
10 
11 #include "SkColorPriv.h"
12 
13 namespace SK_OPTS_NS {
14 
// Premultiplies `count` 32-bit pixels read from vsrc and writes them to dst.
// Alpha is taken from bits 24-31 of each source word; each of the three lower
// bytes c is replaced by (c*alpha + 127) / 255.  Alpha and the byte order are
// left unchanged.
static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* pixels = (const uint32_t*)vsrc;
    for (int n = 0; n < count; ++n) {
        const uint32_t px    = pixels[n];
        const uint32_t alpha = px >> 24;
        // Rounded multiply of each 8-bit channel by alpha/255.
        const uint32_t blue  = (((px >> 16) & 0xFF) * alpha + 127) / 255;
        const uint32_t green = (((px >>  8) & 0xFF) * alpha + 127) / 255;
        const uint32_t red   = (((px >>  0) & 0xFF) * alpha + 127) / 255;
        dst[n] = (alpha << 24) | (blue << 16) | (green << 8) | red;
    }
}
31 
// Premultiplies `count` 32-bit pixels read from vsrc and writes them to dst
// with the lowest and third bytes exchanged.  Alpha (bits 24-31) is copied
// through; every color byte c becomes (c*alpha + 127) / 255 before the swap.
static void RGBA_to_bgrA_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* pixels = (const uint32_t*)vsrc;
    for (int n = 0; n < count; ++n) {
        const uint32_t px    = pixels[n];
        const uint32_t alpha = px >> 24;
        // Rounded multiply of each 8-bit channel by alpha/255.
        const uint32_t blue  = (((px >> 16) & 0xFF) * alpha + 127) / 255;
        const uint32_t green = (((px >>  8) & 0xFF) * alpha + 127) / 255;
        const uint32_t red   = (((px >>  0) & 0xFF) * alpha + 127) / 255;
        // Red moves up to bits 16-23 and blue drops to bits 0-7.
        dst[n] = (alpha << 24) | (red << 16) | (green << 8) | blue;
    }
}
48 
// Copies `count` 32-bit pixels from vsrc to dst, exchanging the byte in bits
// 0-7 with the byte in bits 16-23.  Bits 8-15 and 24-31 are copied unchanged;
// no premultiplication is performed.
static void RGBA_to_BGRA_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* pixels = (const uint32_t*)vsrc;
    for (int n = 0; n < count; ++n) {
        const uint32_t px = pixels[n];
        dst[n] = (px & 0xFF00FF00u)          // alpha and green stay in place
               | ((px & 0x000000FFu) << 16)  // red   -> bits 16-23
               | ((px >> 16) & 0x000000FFu); // blue  -> bits 0-7
    }
}
62 
// Expands `count` packed 3-byte pixels from vsrc into 32-bit words in dst.
// Source byte 0 lands in bits 0-7, byte 1 in bits 8-15, byte 2 in bits 16-23,
// and bits 24-31 are set to 0xFF.
static void RGB_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* rgb = (const uint8_t*)vsrc;
    for (int n = 0; n < count; ++n, rgb += 3) {
        const uint32_t red   = rgb[0];
        const uint32_t green = rgb[1];
        const uint32_t blue  = rgb[2];
        dst[n] = 0xFF000000u | (blue << 16) | (green << 8) | red;
    }
}
76 
// Expands `count` packed 3-byte pixels from vsrc into 32-bit words in dst,
// reversing the channel order: source byte 0 lands in bits 16-23, byte 1 in
// bits 8-15, byte 2 in bits 0-7, and bits 24-31 are set to 0xFF.
static void RGB_to_BGR1_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* rgb = (const uint8_t*)vsrc;
    for (int n = 0; n < count; ++n, rgb += 3) {
        const uint32_t red   = rgb[0];
        const uint32_t green = rgb[1];
        const uint32_t blue  = rgb[2];
        dst[n] = 0xFF000000u | (red << 16) | (green << 8) | blue;
    }
}
90 
// Expands `count` single-byte gray values from vsrc into 32-bit words in dst.
// The gray byte is replicated into the three low bytes and bits 24-31 are set
// to 0xFF.
static void gray_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* gray = (const uint8_t*)vsrc;
    for (int n = 0; n < count; ++n) {
        // Multiplying by 0x010101 copies the byte into bits 0-7, 8-15 and 16-23.
        dst[n] = 0xFF000000u | ((uint32_t)gray[n] * 0x00010101u);
    }
}
100 
// Expands `count` two-byte (gray, alpha) pairs from vsrc into 32-bit words in
// dst.  Gray is replicated into the three low bytes and alpha is placed in
// bits 24-31; the gray value is not premultiplied.
static void grayA_to_RGBA_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* pair = (const uint8_t*)vsrc;
    for (int n = 0; n < count; ++n, pair += 2) {
        const uint32_t gray  = pair[0];
        const uint32_t alpha = pair[1];
        // Multiplying by 0x010101 copies gray into bits 0-7, 8-15 and 16-23.
        dst[n] = (alpha << 24) | (gray * 0x00010101u);
    }
}
113 
// Expands `count` two-byte (gray, alpha) pairs from vsrc into premultiplied
// 32-bit words in dst.  Gray is first scaled to (gray*alpha + 127) / 255, then
// replicated into the three low bytes; alpha is placed in bits 24-31.
static void grayA_to_rgbA_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* pair = (const uint8_t*)vsrc;
    for (int n = 0; n < count; ++n, pair += 2) {
        const uint32_t alpha  = pair[1];
        const uint32_t premul = (pair[0] * alpha + 127) / 255;
        // Multiplying by 0x010101 copies premul into bits 0-7, 8-15 and 16-23.
        dst[n] = (alpha << 24) | (premul * 0x00010101u);
    }
}
127 
// Converts `count` 32-bit inverted-CMYK pixels from vsrc to opaque 32-bit
// pixels in dst.  With k in bits 24-31, y in 16-23, m in 8-15 and c in 0-7,
// each output channel is (channel*k + 127) / 255 in the same byte position,
// and bits 24-31 are set to 0xFF.
// See comments in SkSwizzler.cpp for details on the conversion formula.
static void inverted_CMYK_to_RGB1_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* cmyk = (const uint32_t*)vsrc;
    for (int n = 0; n < count; ++n) {
        const uint32_t px    = cmyk[n];
        const uint32_t key   = px >> 24;
        const uint32_t blue  = (((px >> 16) & 0xFF) * key + 127) / 255;  // from y
        const uint32_t green = (((px >>  8) & 0xFF) * key + 127) / 255;  // from m
        const uint32_t red   = (((px >>  0) & 0xFF) * key + 127) / 255;  // from c
        dst[n] = 0xFF000000u | (blue << 16) | (green << 8) | red;
    }
}
145 
// Converts `count` 32-bit inverted-CMYK pixels from vsrc to opaque 32-bit
// pixels in dst with the first and third channels exchanged.  With k in bits
// 24-31, y in 16-23, m in 8-15 and c in 0-7, each channel is scaled to
// (channel*k + 127) / 255; the c-derived value is written to bits 16-23, the
// y-derived value to bits 0-7, and bits 24-31 are set to 0xFF.
static void inverted_CMYK_to_BGR1_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* cmyk = (const uint32_t*)vsrc;
    for (int n = 0; n < count; ++n) {
        const uint32_t px    = cmyk[n];
        const uint32_t key   = px >> 24;
        const uint32_t blue  = (((px >> 16) & 0xFF) * key + 127) / 255;  // from y
        const uint32_t green = (((px >>  8) & 0xFF) * key + 127) / 255;  // from m
        const uint32_t red   = (((px >>  0) & 0xFF) * key + 127) / 255;  // from c
        dst[n] = 0xFF000000u | (red << 16) | (green << 8) | blue;
    }
}
162 
163 #if defined(SK_ARM_HAS_NEON)
164 
165 // Rounded divide by 255, (x + 127) / 255
div255_round(uint16x8_t x)166 static uint8x8_t div255_round(uint16x8_t x) {
167     // result = (x + 127) / 255
168     // result = (x + 127) / 256 + error1
169     //
170     // error1 = (x + 127) / (255 * 256)
171     // error1 = (x + 127) / (256 * 256) + error2
172     //
173     // error2 = (x + 127) / (255 * 256 * 256)
174     //
175     // The maximum value of error2 is too small to matter.  Thus:
176     // result = (x + 127) / 256 + (x + 127) / (256 * 256)
177     // result = ((x + 127) / 256 + x + 127) / 256
178     // result = ((x + 127) >> 8 + x + 127) >> 8
179     //
180     // Use >>> to represent "rounded right shift" which, conveniently,
181     // NEON supports in one instruction.
182     // result = ((x >>> 8) + x) >>> 8
183     //
184     // Note that the second right shift is actually performed as an
185     // "add, round, and narrow back to 8-bits" instruction.
186     return vraddhn_u16(x, vrshrq_n_u16(x, 8));
187 }
188 
189 // Scale a byte by another, (x * y + 127) / 255
scale(uint8x8_t x,uint8x8_t y)190 static uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
191     return div255_round(vmull_u8(x, y));
192 }
193 
194 template <bool kSwapRB>
premul_should_swapRB(uint32_t * dst,const void * vsrc,int count)195 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
196     auto src = (const uint32_t*)vsrc;
197     while (count >= 8) {
198         // Load 8 pixels.
199         uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
200 
201         uint8x8_t a = rgba.val[3],
202                   b = rgba.val[2],
203                   g = rgba.val[1],
204                   r = rgba.val[0];
205 
206         // Premultiply.
207         b = scale(b, a);
208         g = scale(g, a);
209         r = scale(r, a);
210 
211         // Store 8 premultiplied pixels.
212         if (kSwapRB) {
213             rgba.val[2] = r;
214             rgba.val[1] = g;
215             rgba.val[0] = b;
216         } else {
217             rgba.val[2] = b;
218             rgba.val[1] = g;
219             rgba.val[0] = r;
220         }
221         vst4_u8((uint8_t*) dst, rgba);
222         src += 8;
223         dst += 8;
224         count -= 8;
225     }
226 
227     // Call portable code to finish up the tail of [0,8) pixels.
228     auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
229     proc(dst, src, count);
230 }
231 
RGBA_to_rgbA(uint32_t * dst,const void * src,int count)232 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
233     premul_should_swapRB<false>(dst, src, count);
234 }
235 
RGBA_to_bgrA(uint32_t * dst,const void * src,int count)236 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
237     premul_should_swapRB<true>(dst, src, count);
238 }
239 
RGBA_to_BGRA(uint32_t * dst,const void * vsrc,int count)240 static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
241     auto src = (const uint32_t*)vsrc;
242     while (count >= 16) {
243         // Load 16 pixels.
244         uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src);
245 
246         // Swap r and b.
247         SkTSwap(rgba.val[0], rgba.val[2]);
248 
249         // Store 16 pixels.
250         vst4q_u8((uint8_t*) dst, rgba);
251         src += 16;
252         dst += 16;
253         count -= 16;
254     }
255 
256     if (count >= 8) {
257         // Load 8 pixels.
258         uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
259 
260         // Swap r and b.
261         SkTSwap(rgba.val[0], rgba.val[2]);
262 
263         // Store 8 pixels.
264         vst4_u8((uint8_t*) dst, rgba);
265         src += 8;
266         dst += 8;
267         count -= 8;
268     }
269 
270     RGBA_to_BGRA_portable(dst, src, count);
271 }
272 
273 template <bool kSwapRB>
insert_alpha_should_swaprb(uint32_t dst[],const void * vsrc,int count)274 static void insert_alpha_should_swaprb(uint32_t dst[], const void* vsrc, int count) {
275     const uint8_t* src = (const uint8_t*) vsrc;
276     while (count >= 16) {
277         // Load 16 pixels.
278         uint8x16x3_t rgb = vld3q_u8(src);
279 
280         // Insert an opaque alpha channel and swap if needed.
281         uint8x16x4_t rgba;
282         if (kSwapRB) {
283             rgba.val[0] = rgb.val[2];
284             rgba.val[2] = rgb.val[0];
285         } else {
286             rgba.val[0] = rgb.val[0];
287             rgba.val[2] = rgb.val[2];
288         }
289         rgba.val[1] = rgb.val[1];
290         rgba.val[3] = vdupq_n_u8(0xFF);
291 
292         // Store 16 pixels.
293         vst4q_u8((uint8_t*) dst, rgba);
294         src += 16*3;
295         dst += 16;
296         count -= 16;
297     }
298 
299     if (count >= 8) {
300         // Load 8 pixels.
301         uint8x8x3_t rgb = vld3_u8(src);
302 
303         // Insert an opaque alpha channel and swap if needed.
304         uint8x8x4_t rgba;
305         if (kSwapRB) {
306             rgba.val[0] = rgb.val[2];
307             rgba.val[2] = rgb.val[0];
308         } else {
309             rgba.val[0] = rgb.val[0];
310             rgba.val[2] = rgb.val[2];
311         }
312         rgba.val[1] = rgb.val[1];
313         rgba.val[3] = vdup_n_u8(0xFF);
314 
315         // Store 8 pixels.
316         vst4_u8((uint8_t*) dst, rgba);
317         src += 8*3;
318         dst += 8;
319         count -= 8;
320     }
321 
322     // Call portable code to finish up the tail of [0,8) pixels.
323     auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
324     proc(dst, src, count);
325 }
326 
RGB_to_RGB1(uint32_t dst[],const void * src,int count)327 static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
328     insert_alpha_should_swaprb<false>(dst, src, count);
329 }
330 
RGB_to_BGR1(uint32_t dst[],const void * src,int count)331 static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
332     insert_alpha_should_swaprb<true>(dst, src, count);
333 }
334 
gray_to_RGB1(uint32_t dst[],const void * vsrc,int count)335 static void gray_to_RGB1(uint32_t dst[], const void* vsrc, int count) {
336     const uint8_t* src = (const uint8_t*) vsrc;
337     while (count >= 16) {
338         // Load 16 pixels.
339         uint8x16_t gray = vld1q_u8(src);
340 
341         // Set each of the color channels.
342         uint8x16x4_t rgba;
343         rgba.val[0] = gray;
344         rgba.val[1] = gray;
345         rgba.val[2] = gray;
346         rgba.val[3] = vdupq_n_u8(0xFF);
347 
348         // Store 16 pixels.
349         vst4q_u8((uint8_t*) dst, rgba);
350         src += 16;
351         dst += 16;
352         count -= 16;
353     }
354 
355     if (count >= 8) {
356         // Load 8 pixels.
357         uint8x8_t gray = vld1_u8(src);
358 
359         // Set each of the color channels.
360         uint8x8x4_t rgba;
361         rgba.val[0] = gray;
362         rgba.val[1] = gray;
363         rgba.val[2] = gray;
364         rgba.val[3] = vdup_n_u8(0xFF);
365 
366         // Store 8 pixels.
367         vst4_u8((uint8_t*) dst, rgba);
368         src += 8;
369         dst += 8;
370         count -= 8;
371     }
372 
373     gray_to_RGB1_portable(dst, src, count);
374 }
375 
376 template <bool kPremul>
expand_grayA(uint32_t dst[],const void * vsrc,int count)377 static void expand_grayA(uint32_t dst[], const void* vsrc, int count) {
378     const uint8_t* src = (const uint8_t*) vsrc;
379     while (count >= 16) {
380         // Load 16 pixels.
381         uint8x16x2_t ga = vld2q_u8(src);
382 
383         // Premultiply if requested.
384         if (kPremul) {
385             ga.val[0] = vcombine_u8(
386                     scale(vget_low_u8(ga.val[0]),  vget_low_u8(ga.val[1])),
387                     scale(vget_high_u8(ga.val[0]), vget_high_u8(ga.val[1])));
388         }
389 
390         // Set each of the color channels.
391         uint8x16x4_t rgba;
392         rgba.val[0] = ga.val[0];
393         rgba.val[1] = ga.val[0];
394         rgba.val[2] = ga.val[0];
395         rgba.val[3] = ga.val[1];
396 
397         // Store 16 pixels.
398         vst4q_u8((uint8_t*) dst, rgba);
399         src += 16*2;
400         dst += 16;
401         count -= 16;
402     }
403 
404     if (count >= 8) {
405         // Load 8 pixels.
406         uint8x8x2_t ga = vld2_u8(src);
407 
408         // Premultiply if requested.
409         if (kPremul) {
410             ga.val[0] = scale(ga.val[0], ga.val[1]);
411         }
412 
413         // Set each of the color channels.
414         uint8x8x4_t rgba;
415         rgba.val[0] = ga.val[0];
416         rgba.val[1] = ga.val[0];
417         rgba.val[2] = ga.val[0];
418         rgba.val[3] = ga.val[1];
419 
420         // Store 8 pixels.
421         vst4_u8((uint8_t*) dst, rgba);
422         src += 8*2;
423         dst += 8;
424         count -= 8;
425     }
426 
427     auto proc = kPremul ? grayA_to_rgbA_portable : grayA_to_RGBA_portable;
428     proc(dst, src, count);
429 }
430 
grayA_to_RGBA(uint32_t dst[],const void * src,int count)431 static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {
432     expand_grayA<false>(dst, src, count);
433 }
434 
grayA_to_rgbA(uint32_t dst[],const void * src,int count)435 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
436     expand_grayA<true>(dst, src, count);
437 }
438 
439 enum Format { kRGB1, kBGR1 };
440 template <Format format>
inverted_cmyk_to(uint32_t * dst,const void * vsrc,int count)441 static void inverted_cmyk_to(uint32_t* dst, const void* vsrc, int count) {
442     auto src = (const uint32_t*)vsrc;
443     while (count >= 8) {
444         // Load 8 cmyk pixels.
445         uint8x8x4_t pixels = vld4_u8((const uint8_t*) src);
446 
447         uint8x8_t k = pixels.val[3],
448                   y = pixels.val[2],
449                   m = pixels.val[1],
450                   c = pixels.val[0];
451 
452         // Scale to r, g, b.
453         uint8x8_t b = scale(y, k);
454         uint8x8_t g = scale(m, k);
455         uint8x8_t r = scale(c, k);
456 
457         // Store 8 rgba pixels.
458         if (kBGR1 == format) {
459             pixels.val[3] = vdup_n_u8(0xFF);
460             pixels.val[2] = r;
461             pixels.val[1] = g;
462             pixels.val[0] = b;
463         } else {
464             pixels.val[3] = vdup_n_u8(0xFF);
465             pixels.val[2] = b;
466             pixels.val[1] = g;
467             pixels.val[0] = r;
468         }
469         vst4_u8((uint8_t*) dst, pixels);
470         src += 8;
471         dst += 8;
472         count -= 8;
473     }
474 
475     auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
476     proc(dst, src, count);
477 }
478 
inverted_CMYK_to_RGB1(uint32_t dst[],const void * src,int count)479 static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
480     inverted_cmyk_to<kRGB1>(dst, src, count);
481 }
482 
inverted_CMYK_to_BGR1(uint32_t dst[],const void * src,int count)483 static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
484     inverted_cmyk_to<kBGR1>(dst, src, count);
485 }
486 
487 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
488 
489 // Scale a byte by another.
490 // Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
scale(__m128i x,__m128i y)491 static __m128i scale(__m128i x, __m128i y) {
492     const __m128i _128 = _mm_set1_epi16(128);
493     const __m128i _257 = _mm_set1_epi16(257);
494 
495     // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
496     return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257);
497 }
498 
499 template <bool kSwapRB>
premul_should_swapRB(uint32_t * dst,const void * vsrc,int count)500 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
501     auto src = (const uint32_t*)vsrc;
502 
503     auto premul8 = [](__m128i* lo, __m128i* hi) {
504         const __m128i zeros = _mm_setzero_si128();
505         __m128i planar;
506         if (kSwapRB) {
507             planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
508         } else {
509             planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
510         }
511 
512         // Swizzle the pixels to 8-bit planar.
513         *lo = _mm_shuffle_epi8(*lo, planar);                      // rrrrgggg bbbbaaaa
514         *hi = _mm_shuffle_epi8(*hi, planar);                      // RRRRGGGG BBBBAAAA
515         __m128i rg = _mm_unpacklo_epi32(*lo, *hi),                // rrrrRRRR ggggGGGG
516                 ba = _mm_unpackhi_epi32(*lo, *hi);                // bbbbBBBB aaaaAAAA
517 
518         // Unpack to 16-bit planar.
519         __m128i r = _mm_unpacklo_epi8(rg, zeros),                 // r_r_r_r_ R_R_R_R_
520                 g = _mm_unpackhi_epi8(rg, zeros),                 // g_g_g_g_ G_G_G_G_
521                 b = _mm_unpacklo_epi8(ba, zeros),                 // b_b_b_b_ B_B_B_B_
522                 a = _mm_unpackhi_epi8(ba, zeros);                 // a_a_a_a_ A_A_A_A_
523 
524         // Premultiply!
525         r = scale(r, a);
526         g = scale(g, a);
527         b = scale(b, a);
528 
529         // Repack into interlaced pixels.
530         rg = _mm_or_si128(r, _mm_slli_epi16(g, 8));               // rgrgrgrg RGRGRGRG
531         ba = _mm_or_si128(b, _mm_slli_epi16(a, 8));               // babababa BABABABA
532         *lo = _mm_unpacklo_epi16(rg, ba);                         // rgbargba rgbargba
533         *hi = _mm_unpackhi_epi16(rg, ba);                         // RGBARGBA RGBARGBA
534     };
535 
536     while (count >= 8) {
537         __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
538                 hi = _mm_loadu_si128((const __m128i*) (src + 4));
539 
540         premul8(&lo, &hi);
541 
542         _mm_storeu_si128((__m128i*) (dst + 0), lo);
543         _mm_storeu_si128((__m128i*) (dst + 4), hi);
544 
545         src += 8;
546         dst += 8;
547         count -= 8;
548     }
549 
550     if (count >= 4) {
551         __m128i lo = _mm_loadu_si128((const __m128i*) src),
552                 hi = _mm_setzero_si128();
553 
554         premul8(&lo, &hi);
555 
556         _mm_storeu_si128((__m128i*) dst, lo);
557 
558         src += 4;
559         dst += 4;
560         count -= 4;
561     }
562 
563     // Call portable code to finish up the tail of [0,4) pixels.
564     auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
565     proc(dst, src, count);
566 }
567 
RGBA_to_rgbA(uint32_t * dst,const void * src,int count)568 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
569     premul_should_swapRB<false>(dst, src, count);
570 }
571 
RGBA_to_bgrA(uint32_t * dst,const void * src,int count)572 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
573     premul_should_swapRB<true>(dst, src, count);
574 }
575 
RGBA_to_BGRA(uint32_t * dst,const void * vsrc,int count)576 static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
577     auto src = (const uint32_t*)vsrc;
578     const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);
579 
580     while (count >= 4) {
581         __m128i rgba = _mm_loadu_si128((const __m128i*) src);
582         __m128i bgra = _mm_shuffle_epi8(rgba, swapRB);
583         _mm_storeu_si128((__m128i*) dst, bgra);
584 
585         src += 4;
586         dst += 4;
587         count -= 4;
588     }
589 
590     RGBA_to_BGRA_portable(dst, src, count);
591 }
592 
593 template <bool kSwapRB>
insert_alpha_should_swaprb(uint32_t dst[],const void * vsrc,int count)594 static void insert_alpha_should_swaprb(uint32_t dst[], const void* vsrc, int count) {
595     const uint8_t* src = (const uint8_t*) vsrc;
596 
597     const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
598     __m128i expand;
599     const uint8_t X = 0xFF; // Used a placeholder.  The value of X is irrelevant.
600     if (kSwapRB) {
601         expand = _mm_setr_epi8(2,1,0,X, 5,4,3,X, 8,7,6,X, 11,10,9,X);
602     } else {
603         expand = _mm_setr_epi8(0,1,2,X, 3,4,5,X, 6,7,8,X, 9,10,11,X);
604     }
605 
606     while (count >= 6) {
607         // Load a vector.  While this actually contains 5 pixels plus an
608         // extra component, we will discard all but the first four pixels on
609         // this iteration.
610         __m128i rgb = _mm_loadu_si128((const __m128i*) src);
611 
612         // Expand the first four pixels to RGBX and then mask to RGB(FF).
613         __m128i rgba = _mm_or_si128(_mm_shuffle_epi8(rgb, expand), alphaMask);
614 
615         // Store 4 pixels.
616         _mm_storeu_si128((__m128i*) dst, rgba);
617 
618         src += 4*3;
619         dst += 4;
620         count -= 4;
621     }
622 
623     // Call portable code to finish up the tail of [0,4) pixels.
624     auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
625     proc(dst, src, count);
626 }
627 
RGB_to_RGB1(uint32_t dst[],const void * src,int count)628 static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
629     insert_alpha_should_swaprb<false>(dst, src, count);
630 }
631 
RGB_to_BGR1(uint32_t dst[],const void * src,int count)632 static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
633     insert_alpha_should_swaprb<true>(dst, src, count);
634 }
635 
gray_to_RGB1(uint32_t dst[],const void * vsrc,int count)636 static void gray_to_RGB1(uint32_t dst[], const void* vsrc, int count) {
637     const uint8_t* src = (const uint8_t*) vsrc;
638 
639     const __m128i alphas = _mm_set1_epi8((uint8_t) 0xFF);
640     while (count >= 16) {
641         __m128i grays = _mm_loadu_si128((const __m128i*) src);
642 
643         __m128i gg_lo = _mm_unpacklo_epi8(grays, grays);
644         __m128i gg_hi = _mm_unpackhi_epi8(grays, grays);
645         __m128i ga_lo = _mm_unpacklo_epi8(grays, alphas);
646         __m128i ga_hi = _mm_unpackhi_epi8(grays, alphas);
647 
648         __m128i ggga0 = _mm_unpacklo_epi16(gg_lo, ga_lo);
649         __m128i ggga1 = _mm_unpackhi_epi16(gg_lo, ga_lo);
650         __m128i ggga2 = _mm_unpacklo_epi16(gg_hi, ga_hi);
651         __m128i ggga3 = _mm_unpackhi_epi16(gg_hi, ga_hi);
652 
653         _mm_storeu_si128((__m128i*) (dst +  0), ggga0);
654         _mm_storeu_si128((__m128i*) (dst +  4), ggga1);
655         _mm_storeu_si128((__m128i*) (dst +  8), ggga2);
656         _mm_storeu_si128((__m128i*) (dst + 12), ggga3);
657 
658         src += 16;
659         dst += 16;
660         count -= 16;
661     }
662 
663     gray_to_RGB1_portable(dst, src, count);
664 }
665 
grayA_to_RGBA(uint32_t dst[],const void * vsrc,int count)666 static void grayA_to_RGBA(uint32_t dst[], const void* vsrc, int count) {
667     const uint8_t* src = (const uint8_t*) vsrc;
668     while (count >= 8) {
669         __m128i ga = _mm_loadu_si128((const __m128i*) src);
670 
671         __m128i gg = _mm_or_si128(_mm_and_si128(ga, _mm_set1_epi16(0x00FF)),
672                                   _mm_slli_epi16(ga, 8));
673 
674         __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
675         __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
676 
677         _mm_storeu_si128((__m128i*) (dst +  0), ggga_lo);
678         _mm_storeu_si128((__m128i*) (dst +  4), ggga_hi);
679 
680         src += 8*2;
681         dst += 8;
682         count -= 8;
683     }
684 
685     grayA_to_RGBA_portable(dst, src, count);
686 }
687 
grayA_to_rgbA(uint32_t dst[],const void * vsrc,int count)688 static void grayA_to_rgbA(uint32_t dst[], const void* vsrc, int count) {
689     const uint8_t* src = (const uint8_t*) vsrc;
690     while (count >= 8) {
691         __m128i grayA = _mm_loadu_si128((const __m128i*) src);
692 
693         __m128i g0 = _mm_and_si128(grayA, _mm_set1_epi16(0x00FF));
694         __m128i a0 = _mm_srli_epi16(grayA, 8);
695 
696         // Premultiply
697         g0 = scale(g0, a0);
698 
699         __m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8));
700         __m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8));
701 
702 
703         __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
704         __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
705 
706         _mm_storeu_si128((__m128i*) (dst +  0), ggga_lo);
707         _mm_storeu_si128((__m128i*) (dst +  4), ggga_hi);
708 
709         src += 8*2;
710         dst += 8;
711         count -= 8;
712     }
713 
714     grayA_to_rgbA_portable(dst, src, count);
715 }
716 
717 enum Format { kRGB1, kBGR1 };
718 template <Format format>
inverted_cmyk_to(uint32_t * dst,const void * vsrc,int count)719 static void inverted_cmyk_to(uint32_t* dst, const void* vsrc, int count) {
720     auto src = (const uint32_t*)vsrc;
721 
722     auto convert8 = [](__m128i* lo, __m128i* hi) {
723         const __m128i zeros = _mm_setzero_si128();
724         __m128i planar;
725         if (kBGR1 == format) {
726             planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
727         } else {
728             planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
729         }
730 
731         // Swizzle the pixels to 8-bit planar.
732         *lo = _mm_shuffle_epi8(*lo, planar);                                 // ccccmmmm yyyykkkk
733         *hi = _mm_shuffle_epi8(*hi, planar);                                 // CCCCMMMM YYYYKKKK
734         __m128i cm = _mm_unpacklo_epi32(*lo, *hi),                           // ccccCCCC mmmmMMMM
735                 yk = _mm_unpackhi_epi32(*lo, *hi);                           // yyyyYYYY kkkkKKKK
736 
737         // Unpack to 16-bit planar.
738         __m128i c = _mm_unpacklo_epi8(cm, zeros),                            // c_c_c_c_ C_C_C_C_
739                 m = _mm_unpackhi_epi8(cm, zeros),                            // m_m_m_m_ M_M_M_M_
740                 y = _mm_unpacklo_epi8(yk, zeros),                            // y_y_y_y_ Y_Y_Y_Y_
741                 k = _mm_unpackhi_epi8(yk, zeros);                            // k_k_k_k_ K_K_K_K_
742 
743         // Scale to r, g, b.
744         __m128i r = scale(c, k),
745                 g = scale(m, k),
746                 b = scale(y, k);
747 
748         // Repack into interlaced pixels.
749         __m128i rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)),                  // rgrgrgrg RGRGRGRG
750                 ba = _mm_or_si128(b, _mm_set1_epi16((uint16_t) 0xFF00));     // b1b1b1b1 B1B1B1B1
751         *lo = _mm_unpacklo_epi16(rg, ba);                                    // rgbargba rgbargba
752         *hi = _mm_unpackhi_epi16(rg, ba);                                    // RGB1RGB1 RGB1RGB1
753     };
754 
755     while (count >= 8) {
756         __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
757                 hi = _mm_loadu_si128((const __m128i*) (src + 4));
758 
759         convert8(&lo, &hi);
760 
761         _mm_storeu_si128((__m128i*) (dst + 0), lo);
762         _mm_storeu_si128((__m128i*) (dst + 4), hi);
763 
764         src += 8;
765         dst += 8;
766         count -= 8;
767     }
768 
769     if (count >= 4) {
770         __m128i lo = _mm_loadu_si128((const __m128i*) src),
771                 hi = _mm_setzero_si128();
772 
773         convert8(&lo, &hi);
774 
775         _mm_storeu_si128((__m128i*) dst, lo);
776 
777         src += 4;
778         dst += 4;
779         count -= 4;
780     }
781 
782     auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
783     proc(dst, src, count);
784 }
785 
inverted_CMYK_to_RGB1(uint32_t dst[],const void * src,int count)786 static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
787     inverted_cmyk_to<kRGB1>(dst, src, count);
788 }
789 
inverted_CMYK_to_BGR1(uint32_t dst[],const void * src,int count)790 static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
791     inverted_cmyk_to<kBGR1>(dst, src, count);
792 }
793 
794 #else
795 
RGBA_to_rgbA(uint32_t * dst,const void * src,int count)796 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
797     RGBA_to_rgbA_portable(dst, src, count);
798 }
799 
RGBA_to_bgrA(uint32_t * dst,const void * src,int count)800 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
801     RGBA_to_bgrA_portable(dst, src, count);
802 }
803 
RGBA_to_BGRA(uint32_t * dst,const void * src,int count)804 static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) {
805     RGBA_to_BGRA_portable(dst, src, count);
806 }
807 
RGB_to_RGB1(uint32_t dst[],const void * src,int count)808 static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
809     RGB_to_RGB1_portable(dst, src, count);
810 }
811 
RGB_to_BGR1(uint32_t dst[],const void * src,int count)812 static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
813     RGB_to_BGR1_portable(dst, src, count);
814 }
815 
gray_to_RGB1(uint32_t dst[],const void * src,int count)816 static void gray_to_RGB1(uint32_t dst[], const void* src, int count) {
817     gray_to_RGB1_portable(dst, src, count);
818 }
819 
grayA_to_RGBA(uint32_t dst[],const void * src,int count)820 static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {
821     grayA_to_RGBA_portable(dst, src, count);
822 }
823 
grayA_to_rgbA(uint32_t dst[],const void * src,int count)824 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
825     grayA_to_rgbA_portable(dst, src, count);
826 }
827 
inverted_CMYK_to_RGB1(uint32_t dst[],const void * src,int count)828 static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
829     inverted_CMYK_to_RGB1_portable(dst, src, count);
830 }
831 
inverted_CMYK_to_BGR1(uint32_t dst[],const void * src,int count)832 static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
833     inverted_CMYK_to_BGR1_portable(dst, src, count);
834 }
835 
836 #endif
837 
838 }
839 
840 #endif // SkSwizzler_opts_DEFINED
841