1 /*
2  * Copyright 2016 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #ifndef SkSwizzler_opts_DEFINED
9 #define SkSwizzler_opts_DEFINED
10 
11 #include "SkColorPriv.h"
12 
13 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
14     #include <immintrin.h>
15 #elif defined(SK_ARM_HAS_NEON)
16     #include <arm_neon.h>
17 #endif
18 
19 namespace SK_OPTS_NS {
20 
// Premultiplies count 32-bit pixels.  The top byte of each pixel (alpha) is
// kept, and each of the three lower bytes is replaced by
// (channel * alpha + 127) / 255.  Byte positions are left unchanged.
//   dst   - receives count converted pixels.
//   vsrc  - count source pixels, read as uint32_t values.
//   count - number of pixels; nothing is written when count <= 0.
static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* pixels = (const uint32_t*)vsrc;

    // Rounded multiply of one 8-bit channel by alpha.
    auto premul = [](uint32_t channel, uint32_t alpha) -> uint32_t {
        return (channel * alpha + 127) / 255;
    };

    for (int n = 0; n < count; ++n) {
        const uint32_t px    = pixels[n];
        const uint32_t alpha = px >> 24;
        const uint32_t blue  = premul((px >> 16) & 0xFF, alpha);
        const uint32_t green = premul((px >>  8) & 0xFF, alpha);
        const uint32_t red   = premul((px >>  0) & 0xFF, alpha);
        dst[n] = (alpha << 24) | (blue << 16) | (green << 8) | red;
    }
}
37 
// Premultiplies count 32-bit pixels and exchanges the lowest byte with the
// byte at bits 16..23.  Alpha (the top byte) is kept; each of the other three
// bytes becomes (channel * alpha + 127) / 255 before the exchange.
//   dst   - receives count converted pixels.
//   vsrc  - count source pixels, read as uint32_t values.
//   count - number of pixels; nothing is written when count <= 0.
static void RGBA_to_bgrA_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* pixels = (const uint32_t*)vsrc;

    // Rounded multiply of one 8-bit channel by alpha.
    auto premul = [](uint32_t channel, uint32_t alpha) -> uint32_t {
        return (channel * alpha + 127) / 255;
    };

    for (int n = 0; n < count; ++n) {
        const uint32_t px    = pixels[n];
        const uint32_t alpha = px >> 24;
        const uint32_t blue  = premul((px >> 16) & 0xFF, alpha);
        const uint32_t green = premul((px >>  8) & 0xFF, alpha);
        const uint32_t red   = premul((px >>  0) & 0xFF, alpha);
        // Red and blue trade places relative to the source layout.
        dst[n] = (alpha << 24) | (red << 16) | (green << 8) | blue;
    }
}
54 
// Exchanges the lowest byte of each 32-bit pixel with the byte at bits
// 16..23.  The bytes at bits 8..15 and 24..31 are copied through untouched.
//   dst   - receives count converted pixels.
//   vsrc  - count source pixels, read as uint32_t values.
//   count - number of pixels; nothing is written when count <= 0.
static void RGBA_to_BGRA_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* pixels = (const uint32_t*)vsrc;
    for (int n = 0; n < count; ++n) {
        const uint32_t px       = pixels[n];
        const uint32_t stayPut  = px & 0xFF00FF00u;      // alpha and green
        const uint32_t lowByte  = px & 0x000000FFu;      // red
        const uint32_t thirdByte = (px >> 16) & 0xFFu;   // blue
        dst[n] = stayPut | (lowByte << 16) | thirdByte;
    }
}
68 
// Expands count 3-byte pixels into 32-bit pixels with an opaque (0xFF) top
// byte.  Source byte 0 lands in bits 0..7, byte 1 in bits 8..15 and byte 2 in
// bits 16..23.
//   dst   - receives count 32-bit pixels.
//   vsrc  - 3 * count source bytes.
//   count - number of pixels; nothing is written when count <= 0.
static void RGB_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* bytes = (const uint8_t*)vsrc;
    for (int n = 0; n < count; ++n, bytes += 3) {
        const uint32_t red   = bytes[0];
        const uint32_t green = bytes[1];
        const uint32_t blue  = bytes[2];
        dst[n] = 0xFF000000u | (blue << 16) | (green << 8) | red;
    }
}
82 
// Expands count 3-byte pixels into 32-bit pixels with an opaque (0xFF) top
// byte, reversing the channel order: source byte 0 lands in bits 16..23,
// byte 1 in bits 8..15 and byte 2 in bits 0..7.
//   dst   - receives count 32-bit pixels.
//   vsrc  - 3 * count source bytes.
//   count - number of pixels; nothing is written when count <= 0.
static void RGB_to_BGR1_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* bytes = (const uint8_t*)vsrc;
    for (int n = 0; n < count; ++n, bytes += 3) {
        const uint32_t red   = bytes[0];
        const uint32_t green = bytes[1];
        const uint32_t blue  = bytes[2];
        dst[n] = 0xFF000000u | (red << 16) | (green << 8) | blue;
    }
}
96 
// Expands count 8-bit gray values into 32-bit pixels: the gray byte is
// replicated into the three low bytes and the top byte is set to 0xFF.
//   dst   - receives count 32-bit pixels.
//   vsrc  - count source bytes.
//   count - number of pixels; nothing is written when count <= 0.
static void gray_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* grays = (const uint8_t*)vsrc;
    for (int n = 0; n < count; ++n) {
        const uint32_t gray = grays[n];
        // gray * 0x010101 copies the byte into bits 0..7, 8..15 and 16..23.
        dst[n] = 0xFF000000u | (gray * 0x010101u);
    }
}
106 
// Expands count (gray, alpha) byte pairs into 32-bit pixels: gray is
// replicated into the three low bytes and alpha goes in the top byte.  No
// premultiplication is applied.
//   dst   - receives count 32-bit pixels.
//   vsrc  - 2 * count source bytes, gray first, then alpha.
//   count - number of pixels; nothing is written when count <= 0.
static void grayA_to_RGBA_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* pairs = (const uint8_t*)vsrc;
    for (int n = 0; n < count; ++n, pairs += 2) {
        const uint32_t gray  = pairs[0];
        const uint32_t alpha = pairs[1];
        // gray * 0x010101 copies the byte into bits 0..7, 8..15 and 16..23.
        dst[n] = (alpha << 24) | (gray * 0x010101u);
    }
}
119 
// Expands count (gray, alpha) byte pairs into premultiplied 32-bit pixels:
// gray is first scaled to (gray * alpha + 127) / 255, then replicated into
// the three low bytes, with alpha in the top byte.
//   dst   - receives count 32-bit pixels.
//   vsrc  - 2 * count source bytes, gray first, then alpha.
//   count - number of pixels; nothing is written when count <= 0.
static void grayA_to_rgbA_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* pairs = (const uint8_t*)vsrc;
    for (int n = 0; n < count; ++n, pairs += 2) {
        const uint32_t alpha  = pairs[1];
        const uint32_t scaled = (pairs[0] * alpha + 127) / 255;
        // scaled * 0x010101 copies the byte into bits 0..7, 8..15 and 16..23.
        dst[n] = (alpha << 24) | (scaled * 0x010101u);
    }
}
133 
// Converts count 32-bit inverted-CMYK pixels to opaque 32-bit pixels.  The
// top source byte (k) scales each of the three lower bytes (c, m, y from low
// to high) as (channel * k + 127) / 255; the results keep their byte
// positions and the top output byte is 0xFF.
// See comments in SkSwizzler.cpp for details on the conversion formula.
//   dst   - receives count converted pixels.
//   vsrc  - count source pixels, read as uint32_t values.
//   count - number of pixels; nothing is written when count <= 0.
static void inverted_CMYK_to_RGB1_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* pixels = (const uint32_t*)vsrc;

    // Rounded multiply of one 8-bit channel by k.
    auto scaleByK = [](uint32_t channel, uint32_t k) -> uint32_t {
        return (channel * k + 127) / 255;
    };

    for (int n = 0; n < count; ++n) {
        const uint32_t px    = pixels[n];
        const uint32_t k     = px >> 24;
        const uint32_t blue  = scaleByK((px >> 16) & 0xFF, k);   // from y
        const uint32_t green = scaleByK((px >>  8) & 0xFF, k);   // from m
        const uint32_t red   = scaleByK((px >>  0) & 0xFF, k);   // from c
        dst[n] = 0xFF000000u | (blue << 16) | (green << 8) | red;
    }
}
151 
// Converts count 32-bit inverted-CMYK pixels to opaque 32-bit pixels with the
// lowest and third bytes exchanged.  The top source byte (k) scales each of
// the three lower bytes (c, m, y from low to high) as
// (channel * k + 127) / 255; the c result lands in bits 16..23, the m result
// in bits 8..15, the y result in bits 0..7, and the top output byte is 0xFF.
//   dst   - receives count converted pixels.
//   vsrc  - count source pixels, read as uint32_t values.
//   count - number of pixels; nothing is written when count <= 0.
static void inverted_CMYK_to_BGR1_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* pixels = (const uint32_t*)vsrc;

    // Rounded multiply of one 8-bit channel by k.
    auto scaleByK = [](uint32_t channel, uint32_t k) -> uint32_t {
        return (channel * k + 127) / 255;
    };

    for (int n = 0; n < count; ++n) {
        const uint32_t px    = pixels[n];
        const uint32_t k     = px >> 24;
        const uint32_t blue  = scaleByK((px >> 16) & 0xFF, k);   // from y
        const uint32_t green = scaleByK((px >>  8) & 0xFF, k);   // from m
        const uint32_t red   = scaleByK((px >>  0) & 0xFF, k);   // from c
        dst[n] = 0xFF000000u | (red << 16) | (green << 8) | blue;
    }
}
168 
169 #if defined(SK_ARM_HAS_NEON)
170 
171 // Rounded divide by 255, (x + 127) / 255
div255_round(uint16x8_t x)172 static uint8x8_t div255_round(uint16x8_t x) {
173     // result = (x + 127) / 255
174     // result = (x + 127) / 256 + error1
175     //
176     // error1 = (x + 127) / (255 * 256)
177     // error1 = (x + 127) / (256 * 256) + error2
178     //
179     // error2 = (x + 127) / (255 * 256 * 256)
180     //
181     // The maximum value of error2 is too small to matter.  Thus:
182     // result = (x + 127) / 256 + (x + 127) / (256 * 256)
183     // result = ((x + 127) / 256 + x + 127) / 256
184     // result = ((x + 127) >> 8 + x + 127) >> 8
185     //
186     // Use >>> to represent "rounded right shift" which, conveniently,
187     // NEON supports in one instruction.
188     // result = ((x >>> 8) + x) >>> 8
189     //
190     // Note that the second right shift is actually performed as an
191     // "add, round, and narrow back to 8-bits" instruction.
192     return vraddhn_u16(x, vrshrq_n_u16(x, 8));
193 }
194 
195 // Scale a byte by another, (x * y + 127) / 255
scale(uint8x8_t x,uint8x8_t y)196 static uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
197     return div255_round(vmull_u8(x, y));
198 }
199 
200 template <bool kSwapRB>
premul_should_swapRB(uint32_t * dst,const void * vsrc,int count)201 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
202     auto src = (const uint32_t*)vsrc;
203     while (count >= 8) {
204         // Load 8 pixels.
205         uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
206 
207         uint8x8_t a = rgba.val[3],
208                   b = rgba.val[2],
209                   g = rgba.val[1],
210                   r = rgba.val[0];
211 
212         // Premultiply.
213         b = scale(b, a);
214         g = scale(g, a);
215         r = scale(r, a);
216 
217         // Store 8 premultiplied pixels.
218         if (kSwapRB) {
219             rgba.val[2] = r;
220             rgba.val[1] = g;
221             rgba.val[0] = b;
222         } else {
223             rgba.val[2] = b;
224             rgba.val[1] = g;
225             rgba.val[0] = r;
226         }
227         vst4_u8((uint8_t*) dst, rgba);
228         src += 8;
229         dst += 8;
230         count -= 8;
231     }
232 
233     // Call portable code to finish up the tail of [0,8) pixels.
234     auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
235     proc(dst, src, count);
236 }
237 
RGBA_to_rgbA(uint32_t * dst,const void * src,int count)238 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
239     premul_should_swapRB<false>(dst, src, count);
240 }
241 
RGBA_to_bgrA(uint32_t * dst,const void * src,int count)242 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
243     premul_should_swapRB<true>(dst, src, count);
244 }
245 
RGBA_to_BGRA(uint32_t * dst,const void * vsrc,int count)246 static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
247     auto src = (const uint32_t*)vsrc;
248     while (count >= 16) {
249         // Load 16 pixels.
250         uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src);
251 
252         // Swap r and b.
253         SkTSwap(rgba.val[0], rgba.val[2]);
254 
255         // Store 16 pixels.
256         vst4q_u8((uint8_t*) dst, rgba);
257         src += 16;
258         dst += 16;
259         count -= 16;
260     }
261 
262     if (count >= 8) {
263         // Load 8 pixels.
264         uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
265 
266         // Swap r and b.
267         SkTSwap(rgba.val[0], rgba.val[2]);
268 
269         // Store 8 pixels.
270         vst4_u8((uint8_t*) dst, rgba);
271         src += 8;
272         dst += 8;
273         count -= 8;
274     }
275 
276     RGBA_to_BGRA_portable(dst, src, count);
277 }
278 
279 template <bool kSwapRB>
insert_alpha_should_swaprb(uint32_t dst[],const void * vsrc,int count)280 static void insert_alpha_should_swaprb(uint32_t dst[], const void* vsrc, int count) {
281     const uint8_t* src = (const uint8_t*) vsrc;
282     while (count >= 16) {
283         // Load 16 pixels.
284         uint8x16x3_t rgb = vld3q_u8(src);
285 
286         // Insert an opaque alpha channel and swap if needed.
287         uint8x16x4_t rgba;
288         if (kSwapRB) {
289             rgba.val[0] = rgb.val[2];
290             rgba.val[2] = rgb.val[0];
291         } else {
292             rgba.val[0] = rgb.val[0];
293             rgba.val[2] = rgb.val[2];
294         }
295         rgba.val[1] = rgb.val[1];
296         rgba.val[3] = vdupq_n_u8(0xFF);
297 
298         // Store 16 pixels.
299         vst4q_u8((uint8_t*) dst, rgba);
300         src += 16*3;
301         dst += 16;
302         count -= 16;
303     }
304 
305     if (count >= 8) {
306         // Load 8 pixels.
307         uint8x8x3_t rgb = vld3_u8(src);
308 
309         // Insert an opaque alpha channel and swap if needed.
310         uint8x8x4_t rgba;
311         if (kSwapRB) {
312             rgba.val[0] = rgb.val[2];
313             rgba.val[2] = rgb.val[0];
314         } else {
315             rgba.val[0] = rgb.val[0];
316             rgba.val[2] = rgb.val[2];
317         }
318         rgba.val[1] = rgb.val[1];
319         rgba.val[3] = vdup_n_u8(0xFF);
320 
321         // Store 8 pixels.
322         vst4_u8((uint8_t*) dst, rgba);
323         src += 8*3;
324         dst += 8;
325         count -= 8;
326     }
327 
328     // Call portable code to finish up the tail of [0,8) pixels.
329     auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
330     proc(dst, src, count);
331 }
332 
RGB_to_RGB1(uint32_t dst[],const void * src,int count)333 static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
334     insert_alpha_should_swaprb<false>(dst, src, count);
335 }
336 
RGB_to_BGR1(uint32_t dst[],const void * src,int count)337 static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
338     insert_alpha_should_swaprb<true>(dst, src, count);
339 }
340 
gray_to_RGB1(uint32_t dst[],const void * vsrc,int count)341 static void gray_to_RGB1(uint32_t dst[], const void* vsrc, int count) {
342     const uint8_t* src = (const uint8_t*) vsrc;
343     while (count >= 16) {
344         // Load 16 pixels.
345         uint8x16_t gray = vld1q_u8(src);
346 
347         // Set each of the color channels.
348         uint8x16x4_t rgba;
349         rgba.val[0] = gray;
350         rgba.val[1] = gray;
351         rgba.val[2] = gray;
352         rgba.val[3] = vdupq_n_u8(0xFF);
353 
354         // Store 16 pixels.
355         vst4q_u8((uint8_t*) dst, rgba);
356         src += 16;
357         dst += 16;
358         count -= 16;
359     }
360 
361     if (count >= 8) {
362         // Load 8 pixels.
363         uint8x8_t gray = vld1_u8(src);
364 
365         // Set each of the color channels.
366         uint8x8x4_t rgba;
367         rgba.val[0] = gray;
368         rgba.val[1] = gray;
369         rgba.val[2] = gray;
370         rgba.val[3] = vdup_n_u8(0xFF);
371 
372         // Store 8 pixels.
373         vst4_u8((uint8_t*) dst, rgba);
374         src += 8;
375         dst += 8;
376         count -= 8;
377     }
378 
379     gray_to_RGB1_portable(dst, src, count);
380 }
381 
382 template <bool kPremul>
expand_grayA(uint32_t dst[],const void * vsrc,int count)383 static void expand_grayA(uint32_t dst[], const void* vsrc, int count) {
384     const uint8_t* src = (const uint8_t*) vsrc;
385     while (count >= 16) {
386         // Load 16 pixels.
387         uint8x16x2_t ga = vld2q_u8(src);
388 
389         // Premultiply if requested.
390         if (kPremul) {
391             ga.val[0] = vcombine_u8(
392                     scale(vget_low_u8(ga.val[0]),  vget_low_u8(ga.val[1])),
393                     scale(vget_high_u8(ga.val[0]), vget_high_u8(ga.val[1])));
394         }
395 
396         // Set each of the color channels.
397         uint8x16x4_t rgba;
398         rgba.val[0] = ga.val[0];
399         rgba.val[1] = ga.val[0];
400         rgba.val[2] = ga.val[0];
401         rgba.val[3] = ga.val[1];
402 
403         // Store 16 pixels.
404         vst4q_u8((uint8_t*) dst, rgba);
405         src += 16*2;
406         dst += 16;
407         count -= 16;
408     }
409 
410     if (count >= 8) {
411         // Load 8 pixels.
412         uint8x8x2_t ga = vld2_u8(src);
413 
414         // Premultiply if requested.
415         if (kPremul) {
416             ga.val[0] = scale(ga.val[0], ga.val[1]);
417         }
418 
419         // Set each of the color channels.
420         uint8x8x4_t rgba;
421         rgba.val[0] = ga.val[0];
422         rgba.val[1] = ga.val[0];
423         rgba.val[2] = ga.val[0];
424         rgba.val[3] = ga.val[1];
425 
426         // Store 8 pixels.
427         vst4_u8((uint8_t*) dst, rgba);
428         src += 8*2;
429         dst += 8;
430         count -= 8;
431     }
432 
433     auto proc = kPremul ? grayA_to_rgbA_portable : grayA_to_RGBA_portable;
434     proc(dst, src, count);
435 }
436 
grayA_to_RGBA(uint32_t dst[],const void * src,int count)437 static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {
438     expand_grayA<false>(dst, src, count);
439 }
440 
grayA_to_rgbA(uint32_t dst[],const void * src,int count)441 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
442     expand_grayA<true>(dst, src, count);
443 }
444 
445 enum Format { kRGB1, kBGR1 };
446 template <Format format>
inverted_cmyk_to(uint32_t * dst,const void * vsrc,int count)447 static void inverted_cmyk_to(uint32_t* dst, const void* vsrc, int count) {
448     auto src = (const uint32_t*)vsrc;
449     while (count >= 8) {
450         // Load 8 cmyk pixels.
451         uint8x8x4_t pixels = vld4_u8((const uint8_t*) src);
452 
453         uint8x8_t k = pixels.val[3],
454                   y = pixels.val[2],
455                   m = pixels.val[1],
456                   c = pixels.val[0];
457 
458         // Scale to r, g, b.
459         uint8x8_t b = scale(y, k);
460         uint8x8_t g = scale(m, k);
461         uint8x8_t r = scale(c, k);
462 
463         // Store 8 rgba pixels.
464         if (kBGR1 == format) {
465             pixels.val[3] = vdup_n_u8(0xFF);
466             pixels.val[2] = r;
467             pixels.val[1] = g;
468             pixels.val[0] = b;
469         } else {
470             pixels.val[3] = vdup_n_u8(0xFF);
471             pixels.val[2] = b;
472             pixels.val[1] = g;
473             pixels.val[0] = r;
474         }
475         vst4_u8((uint8_t*) dst, pixels);
476         src += 8;
477         dst += 8;
478         count -= 8;
479     }
480 
481     auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
482     proc(dst, src, count);
483 }
484 
inverted_CMYK_to_RGB1(uint32_t dst[],const void * src,int count)485 static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
486     inverted_cmyk_to<kRGB1>(dst, src, count);
487 }
488 
inverted_CMYK_to_BGR1(uint32_t dst[],const void * src,int count)489 static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
490     inverted_cmyk_to<kBGR1>(dst, src, count);
491 }
492 
493 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
494 
495 // Scale a byte by another.
496 // Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
scale(__m128i x,__m128i y)497 static __m128i scale(__m128i x, __m128i y) {
498     const __m128i _128 = _mm_set1_epi16(128);
499     const __m128i _257 = _mm_set1_epi16(257);
500 
501     // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
502     return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257);
503 }
504 
505 template <bool kSwapRB>
premul_should_swapRB(uint32_t * dst,const void * vsrc,int count)506 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
507     auto src = (const uint32_t*)vsrc;
508 
509     auto premul8 = [](__m128i* lo, __m128i* hi) {
510         const __m128i zeros = _mm_setzero_si128();
511         __m128i planar;
512         if (kSwapRB) {
513             planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
514         } else {
515             planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
516         }
517 
518         // Swizzle the pixels to 8-bit planar.
519         *lo = _mm_shuffle_epi8(*lo, planar);                      // rrrrgggg bbbbaaaa
520         *hi = _mm_shuffle_epi8(*hi, planar);                      // RRRRGGGG BBBBAAAA
521         __m128i rg = _mm_unpacklo_epi32(*lo, *hi),                // rrrrRRRR ggggGGGG
522                 ba = _mm_unpackhi_epi32(*lo, *hi);                // bbbbBBBB aaaaAAAA
523 
524         // Unpack to 16-bit planar.
525         __m128i r = _mm_unpacklo_epi8(rg, zeros),                 // r_r_r_r_ R_R_R_R_
526                 g = _mm_unpackhi_epi8(rg, zeros),                 // g_g_g_g_ G_G_G_G_
527                 b = _mm_unpacklo_epi8(ba, zeros),                 // b_b_b_b_ B_B_B_B_
528                 a = _mm_unpackhi_epi8(ba, zeros);                 // a_a_a_a_ A_A_A_A_
529 
530         // Premultiply!
531         r = scale(r, a);
532         g = scale(g, a);
533         b = scale(b, a);
534 
535         // Repack into interlaced pixels.
536         rg = _mm_or_si128(r, _mm_slli_epi16(g, 8));               // rgrgrgrg RGRGRGRG
537         ba = _mm_or_si128(b, _mm_slli_epi16(a, 8));               // babababa BABABABA
538         *lo = _mm_unpacklo_epi16(rg, ba);                         // rgbargba rgbargba
539         *hi = _mm_unpackhi_epi16(rg, ba);                         // RGBARGBA RGBARGBA
540     };
541 
542     while (count >= 8) {
543         __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
544                 hi = _mm_loadu_si128((const __m128i*) (src + 4));
545 
546         premul8(&lo, &hi);
547 
548         _mm_storeu_si128((__m128i*) (dst + 0), lo);
549         _mm_storeu_si128((__m128i*) (dst + 4), hi);
550 
551         src += 8;
552         dst += 8;
553         count -= 8;
554     }
555 
556     if (count >= 4) {
557         __m128i lo = _mm_loadu_si128((const __m128i*) src),
558                 hi = _mm_setzero_si128();
559 
560         premul8(&lo, &hi);
561 
562         _mm_storeu_si128((__m128i*) dst, lo);
563 
564         src += 4;
565         dst += 4;
566         count -= 4;
567     }
568 
569     // Call portable code to finish up the tail of [0,4) pixels.
570     auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
571     proc(dst, src, count);
572 }
573 
RGBA_to_rgbA(uint32_t * dst,const void * src,int count)574 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
575     premul_should_swapRB<false>(dst, src, count);
576 }
577 
RGBA_to_bgrA(uint32_t * dst,const void * src,int count)578 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
579     premul_should_swapRB<true>(dst, src, count);
580 }
581 
RGBA_to_BGRA(uint32_t * dst,const void * vsrc,int count)582 static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
583     auto src = (const uint32_t*)vsrc;
584     const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);
585 
586     while (count >= 4) {
587         __m128i rgba = _mm_loadu_si128((const __m128i*) src);
588         __m128i bgra = _mm_shuffle_epi8(rgba, swapRB);
589         _mm_storeu_si128((__m128i*) dst, bgra);
590 
591         src += 4;
592         dst += 4;
593         count -= 4;
594     }
595 
596     RGBA_to_BGRA_portable(dst, src, count);
597 }
598 
599 template <bool kSwapRB>
insert_alpha_should_swaprb(uint32_t dst[],const void * vsrc,int count)600 static void insert_alpha_should_swaprb(uint32_t dst[], const void* vsrc, int count) {
601     const uint8_t* src = (const uint8_t*) vsrc;
602 
603     const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
604     __m128i expand;
605     const uint8_t X = 0xFF; // Used a placeholder.  The value of X is irrelevant.
606     if (kSwapRB) {
607         expand = _mm_setr_epi8(2,1,0,X, 5,4,3,X, 8,7,6,X, 11,10,9,X);
608     } else {
609         expand = _mm_setr_epi8(0,1,2,X, 3,4,5,X, 6,7,8,X, 9,10,11,X);
610     }
611 
612     while (count >= 6) {
613         // Load a vector.  While this actually contains 5 pixels plus an
614         // extra component, we will discard all but the first four pixels on
615         // this iteration.
616         __m128i rgb = _mm_loadu_si128((const __m128i*) src);
617 
618         // Expand the first four pixels to RGBX and then mask to RGB(FF).
619         __m128i rgba = _mm_or_si128(_mm_shuffle_epi8(rgb, expand), alphaMask);
620 
621         // Store 4 pixels.
622         _mm_storeu_si128((__m128i*) dst, rgba);
623 
624         src += 4*3;
625         dst += 4;
626         count -= 4;
627     }
628 
629     // Call portable code to finish up the tail of [0,4) pixels.
630     auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
631     proc(dst, src, count);
632 }
633 
RGB_to_RGB1(uint32_t dst[],const void * src,int count)634 static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
635     insert_alpha_should_swaprb<false>(dst, src, count);
636 }
637 
RGB_to_BGR1(uint32_t dst[],const void * src,int count)638 static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
639     insert_alpha_should_swaprb<true>(dst, src, count);
640 }
641 
gray_to_RGB1(uint32_t dst[],const void * vsrc,int count)642 static void gray_to_RGB1(uint32_t dst[], const void* vsrc, int count) {
643     const uint8_t* src = (const uint8_t*) vsrc;
644 
645     const __m128i alphas = _mm_set1_epi8((uint8_t) 0xFF);
646     while (count >= 16) {
647         __m128i grays = _mm_loadu_si128((const __m128i*) src);
648 
649         __m128i gg_lo = _mm_unpacklo_epi8(grays, grays);
650         __m128i gg_hi = _mm_unpackhi_epi8(grays, grays);
651         __m128i ga_lo = _mm_unpacklo_epi8(grays, alphas);
652         __m128i ga_hi = _mm_unpackhi_epi8(grays, alphas);
653 
654         __m128i ggga0 = _mm_unpacklo_epi16(gg_lo, ga_lo);
655         __m128i ggga1 = _mm_unpackhi_epi16(gg_lo, ga_lo);
656         __m128i ggga2 = _mm_unpacklo_epi16(gg_hi, ga_hi);
657         __m128i ggga3 = _mm_unpackhi_epi16(gg_hi, ga_hi);
658 
659         _mm_storeu_si128((__m128i*) (dst +  0), ggga0);
660         _mm_storeu_si128((__m128i*) (dst +  4), ggga1);
661         _mm_storeu_si128((__m128i*) (dst +  8), ggga2);
662         _mm_storeu_si128((__m128i*) (dst + 12), ggga3);
663 
664         src += 16;
665         dst += 16;
666         count -= 16;
667     }
668 
669     gray_to_RGB1_portable(dst, src, count);
670 }
671 
grayA_to_RGBA(uint32_t dst[],const void * vsrc,int count)672 static void grayA_to_RGBA(uint32_t dst[], const void* vsrc, int count) {
673     const uint8_t* src = (const uint8_t*) vsrc;
674     while (count >= 8) {
675         __m128i ga = _mm_loadu_si128((const __m128i*) src);
676 
677         __m128i gg = _mm_or_si128(_mm_and_si128(ga, _mm_set1_epi16(0x00FF)),
678                                   _mm_slli_epi16(ga, 8));
679 
680         __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
681         __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
682 
683         _mm_storeu_si128((__m128i*) (dst +  0), ggga_lo);
684         _mm_storeu_si128((__m128i*) (dst +  4), ggga_hi);
685 
686         src += 8*2;
687         dst += 8;
688         count -= 8;
689     }
690 
691     grayA_to_RGBA_portable(dst, src, count);
692 }
693 
grayA_to_rgbA(uint32_t dst[],const void * vsrc,int count)694 static void grayA_to_rgbA(uint32_t dst[], const void* vsrc, int count) {
695     const uint8_t* src = (const uint8_t*) vsrc;
696     while (count >= 8) {
697         __m128i grayA = _mm_loadu_si128((const __m128i*) src);
698 
699         __m128i g0 = _mm_and_si128(grayA, _mm_set1_epi16(0x00FF));
700         __m128i a0 = _mm_srli_epi16(grayA, 8);
701 
702         // Premultiply
703         g0 = scale(g0, a0);
704 
705         __m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8));
706         __m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8));
707 
708 
709         __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
710         __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
711 
712         _mm_storeu_si128((__m128i*) (dst +  0), ggga_lo);
713         _mm_storeu_si128((__m128i*) (dst +  4), ggga_hi);
714 
715         src += 8*2;
716         dst += 8;
717         count -= 8;
718     }
719 
720     grayA_to_rgbA_portable(dst, src, count);
721 }
722 
723 enum Format { kRGB1, kBGR1 };
724 template <Format format>
inverted_cmyk_to(uint32_t * dst,const void * vsrc,int count)725 static void inverted_cmyk_to(uint32_t* dst, const void* vsrc, int count) {
726     auto src = (const uint32_t*)vsrc;
727 
728     auto convert8 = [](__m128i* lo, __m128i* hi) {
729         const __m128i zeros = _mm_setzero_si128();
730         __m128i planar;
731         if (kBGR1 == format) {
732             planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
733         } else {
734             planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
735         }
736 
737         // Swizzle the pixels to 8-bit planar.
738         *lo = _mm_shuffle_epi8(*lo, planar);                                 // ccccmmmm yyyykkkk
739         *hi = _mm_shuffle_epi8(*hi, planar);                                 // CCCCMMMM YYYYKKKK
740         __m128i cm = _mm_unpacklo_epi32(*lo, *hi),                           // ccccCCCC mmmmMMMM
741                 yk = _mm_unpackhi_epi32(*lo, *hi);                           // yyyyYYYY kkkkKKKK
742 
743         // Unpack to 16-bit planar.
744         __m128i c = _mm_unpacklo_epi8(cm, zeros),                            // c_c_c_c_ C_C_C_C_
745                 m = _mm_unpackhi_epi8(cm, zeros),                            // m_m_m_m_ M_M_M_M_
746                 y = _mm_unpacklo_epi8(yk, zeros),                            // y_y_y_y_ Y_Y_Y_Y_
747                 k = _mm_unpackhi_epi8(yk, zeros);                            // k_k_k_k_ K_K_K_K_
748 
749         // Scale to r, g, b.
750         __m128i r = scale(c, k),
751                 g = scale(m, k),
752                 b = scale(y, k);
753 
754         // Repack into interlaced pixels.
755         __m128i rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)),                  // rgrgrgrg RGRGRGRG
756                 ba = _mm_or_si128(b, _mm_set1_epi16((uint16_t) 0xFF00));     // b1b1b1b1 B1B1B1B1
757         *lo = _mm_unpacklo_epi16(rg, ba);                                    // rgbargba rgbargba
758         *hi = _mm_unpackhi_epi16(rg, ba);                                    // RGB1RGB1 RGB1RGB1
759     };
760 
761     while (count >= 8) {
762         __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
763                 hi = _mm_loadu_si128((const __m128i*) (src + 4));
764 
765         convert8(&lo, &hi);
766 
767         _mm_storeu_si128((__m128i*) (dst + 0), lo);
768         _mm_storeu_si128((__m128i*) (dst + 4), hi);
769 
770         src += 8;
771         dst += 8;
772         count -= 8;
773     }
774 
775     if (count >= 4) {
776         __m128i lo = _mm_loadu_si128((const __m128i*) src),
777                 hi = _mm_setzero_si128();
778 
779         convert8(&lo, &hi);
780 
781         _mm_storeu_si128((__m128i*) dst, lo);
782 
783         src += 4;
784         dst += 4;
785         count -= 4;
786     }
787 
788     auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
789     proc(dst, src, count);
790 }
791 
inverted_CMYK_to_RGB1(uint32_t dst[],const void * src,int count)792 static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
793     inverted_cmyk_to<kRGB1>(dst, src, count);
794 }
795 
inverted_CMYK_to_BGR1(uint32_t dst[],const void * src,int count)796 static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
797     inverted_cmyk_to<kBGR1>(dst, src, count);
798 }
799 
800 #else
801 
RGBA_to_rgbA(uint32_t * dst,const void * src,int count)802 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
803     RGBA_to_rgbA_portable(dst, src, count);
804 }
805 
RGBA_to_bgrA(uint32_t * dst,const void * src,int count)806 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
807     RGBA_to_bgrA_portable(dst, src, count);
808 }
809 
RGBA_to_BGRA(uint32_t * dst,const void * src,int count)810 static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) {
811     RGBA_to_BGRA_portable(dst, src, count);
812 }
813 
RGB_to_RGB1(uint32_t dst[],const void * src,int count)814 static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
815     RGB_to_RGB1_portable(dst, src, count);
816 }
817 
RGB_to_BGR1(uint32_t dst[],const void * src,int count)818 static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
819     RGB_to_BGR1_portable(dst, src, count);
820 }
821 
gray_to_RGB1(uint32_t dst[],const void * src,int count)822 static void gray_to_RGB1(uint32_t dst[], const void* src, int count) {
823     gray_to_RGB1_portable(dst, src, count);
824 }
825 
grayA_to_RGBA(uint32_t dst[],const void * src,int count)826 static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {
827     grayA_to_RGBA_portable(dst, src, count);
828 }
829 
grayA_to_rgbA(uint32_t dst[],const void * src,int count)830 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
831     grayA_to_rgbA_portable(dst, src, count);
832 }
833 
inverted_CMYK_to_RGB1(uint32_t dst[],const void * src,int count)834 static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
835     inverted_CMYK_to_RGB1_portable(dst, src, count);
836 }
837 
inverted_CMYK_to_BGR1(uint32_t dst[],const void * src,int count)838 static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
839     inverted_CMYK_to_BGR1_portable(dst, src, count);
840 }
841 
842 #endif
843 
844 }
845 
846 #endif // SkSwizzler_opts_DEFINED
847