1 /*
2 * Copyright 2016 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #ifndef SkSwizzler_opts_DEFINED
9 #define SkSwizzler_opts_DEFINED
10
11 #include "SkColorPriv.h"
12
13 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
14 #include <immintrin.h>
15 #elif defined(SK_ARM_HAS_NEON)
16 #include <arm_neon.h>
17 #endif
18
19 namespace SK_OPTS_NS {
20
// Premultiply each color channel of 8888 RGBA pixels by their alpha,
// rounding with (x*a + 127) / 255.  Channel order is unchanged.
static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        const uint32_t px = src[i];
        const uint8_t a = (uint8_t)(px >> 24);
        const uint8_t b = (uint8_t)((((px >> 16) & 0xFF) * a + 127) / 255);
        const uint8_t g = (uint8_t)((((px >>  8) & 0xFF) * a + 127) / 255);
        const uint8_t r = (uint8_t)((( px        & 0xFF) * a + 127) / 255);
        dst[i] = (uint32_t)a << 24
               | (uint32_t)b << 16
               | (uint32_t)g <<  8
               | (uint32_t)r <<  0;
    }
}
37
// Premultiply 8888 RGBA pixels by alpha and swap the R and B channels,
// producing premultiplied BGRA.
static void RGBA_to_bgrA_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        const uint32_t px = src[i];
        const uint8_t a = (uint8_t)(px >> 24);
        const uint8_t b = (uint8_t)((((px >> 16) & 0xFF) * a + 127) / 255);
        const uint8_t g = (uint8_t)((((px >>  8) & 0xFF) * a + 127) / 255);
        const uint8_t r = (uint8_t)((( px        & 0xFF) * a + 127) / 255);
        // Same scaling as RGBA_to_rgbA_portable, but r and b trade places.
        dst[i] = (uint32_t)a << 24
               | (uint32_t)r << 16
               | (uint32_t)g <<  8
               | (uint32_t)b <<  0;
    }
}
54
// Swap the R and B channels of 8888 pixels; alpha and green are untouched.
static void RGBA_to_BGRA_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        const uint32_t px = src[i];
        // Keep the A and G bytes in place; exchange the R and B bytes.
        dst[i] = (px & 0xFF00FF00u)
               | ((px & 0x00FF0000u) >> 16)
               | ((px & 0x000000FFu) << 16);
    }
}
68
// Expand tightly-packed 24-bit RGB triples into 8888 pixels with an opaque
// alpha channel, keeping the R,G,B order.
static void RGB_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++, src += 3) {
        dst[i] = 0xFF000000u
               | (uint32_t)src[2] << 16
               | (uint32_t)src[1] <<  8
               | (uint32_t)src[0] <<  0;
    }
}
82
// Expand tightly-packed 24-bit RGB triples into 8888 pixels with an opaque
// alpha channel, swapping to B,G,R order.
static void RGB_to_BGR1_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++, src += 3) {
        dst[i] = 0xFF000000u
               | (uint32_t)src[0] << 16
               | (uint32_t)src[1] <<  8
               | (uint32_t)src[2] <<  0;
    }
}
96
// Replicate each 8-bit gray value into the R, G and B channels of an 8888
// pixel with an opaque alpha channel.
static void gray_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++) {
        const uint32_t g = src[i];
        dst[i] = 0xFF000000u | g << 16 | g << 8 | g;
    }
}
106
// Expand gray+alpha pairs into unpremultiplied 8888 pixels: the gray value
// is replicated into R, G and B, and the alpha is carried through.
static void grayA_to_RGBA_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++, src += 2) {
        const uint32_t g = src[0];
        dst[i] = (uint32_t)src[1] << 24 | g << 16 | g << 8 | g;
    }
}
119
// Expand gray+alpha pairs into premultiplied 8888 pixels: the gray value is
// scaled by alpha with (g*a + 127) / 255 before being replicated.
static void grayA_to_rgbA_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++, src += 2) {
        const uint32_t a = src[1];
        const uint32_t g = (src[0] * a + 127) / 255;
        dst[i] = a << 24 | g << 16 | g << 8 | g;
    }
}
133
// Convert inverted-CMYK pixels (K in the top byte, then Y, M, C) to opaque
// RGBA: r = c*k, g = m*k, b = y*k, each with rounded division by 255.
// See comments in SkSwizzler.cpp for details on the conversion formula.
static void inverted_CMYK_to_RGB1_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        const uint32_t px = src[i];
        const uint8_t k = (uint8_t)(px >> 24);
        const uint8_t b = (uint8_t)((((px >> 16) & 0xFF) * k + 127) / 255);
        const uint8_t g = (uint8_t)((((px >>  8) & 0xFF) * k + 127) / 255);
        const uint8_t r = (uint8_t)((( px        & 0xFF) * k + 127) / 255);
        dst[i] = 0xFF000000u
               | (uint32_t)b << 16
               | (uint32_t)g <<  8
               | (uint32_t)r <<  0;
    }
}
151
// Same conversion as inverted_CMYK_to_RGB1_portable, but the resulting
// pixel is stored in B,G,R order (opaque BGRA).
static void inverted_CMYK_to_BGR1_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        const uint32_t px = src[i];
        const uint8_t k = (uint8_t)(px >> 24);
        const uint8_t b = (uint8_t)((((px >> 16) & 0xFF) * k + 127) / 255);
        const uint8_t g = (uint8_t)((((px >>  8) & 0xFF) * k + 127) / 255);
        const uint8_t r = (uint8_t)((( px        & 0xFF) * k + 127) / 255);
        dst[i] = 0xFF000000u
               | (uint32_t)r << 16
               | (uint32_t)g <<  8
               | (uint32_t)b <<  0;
    }
}
168
169 #if defined(SK_ARM_HAS_NEON)
170
// Rounded divide by 255, (x + 127) / 255
// x holds 16-bit values; callers pass products of two bytes (<= 255*255),
// the range for which the approximation below is exact.
static uint8x8_t div255_round(uint16x8_t x) {
    // result = (x + 127) / 255
    // result = (x + 127) / 256 + error1
    //
    // error1 = (x + 127) / (255 * 256)
    // error1 = (x + 127) / (256 * 256) + error2
    //
    // error2 = (x + 127) / (255 * 256 * 256)
    //
    // The maximum value of error2 is too small to matter.  Thus:
    // result = (x + 127) / 256 + (x + 127) / (256 * 256)
    // result = ((x + 127) / 256 + x + 127) / 256
    // result = ((x + 127) >> 8 + x + 127) >> 8
    //
    // Use >>> to represent "rounded right shift" which, conveniently,
    // NEON supports in one instruction.
    // result = ((x >>> 8) + x) >>> 8
    //
    // Note that the second right shift is actually performed as an
    // "add, round, and narrow back to 8-bits" instruction.
    return vraddhn_u16(x, vrshrq_n_u16(x, 8));
}
194
// Scale a byte by another, (x * y + 127) / 255
// Widens the 8-bit product to 16 bits, then divides by 255 with rounding.
static uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
    return div255_round(vmull_u8(x, y));
}
199
// Premultiply RGBA pixels by alpha, 8 at a time with NEON, optionally
// swapping R and B on the way out.  The [0,8) tail is handled by the
// matching portable routine so results are identical for any count.
template <bool kSwapRB>
static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    while (count >= 8) {
        // Load 8 pixels, de-interleaved into per-channel lanes.
        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);

        uint8x8_t a = rgba.val[3],
                  b = rgba.val[2],
                  g = rgba.val[1],
                  r = rgba.val[0];

        // Premultiply.
        b = scale(b, a);
        g = scale(g, a);
        r = scale(r, a);

        // Store 8 premultiplied pixels.
        if (kSwapRB) {
            rgba.val[2] = r;
            rgba.val[1] = g;
            rgba.val[0] = b;
        } else {
            rgba.val[2] = b;
            rgba.val[1] = g;
            rgba.val[0] = r;
        }
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }

    // Call portable code to finish up the tail of [0,8) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}
237
// NEON entry points: forward to the shared premultiply kernel with the
// R/B-swap behavior selected at compile time.
static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
    premul_should_swapRB<false>(dst, src, count);
}

static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
    premul_should_swapRB<true>(dst, src, count);
}
245
// Swap R and B channels, 16 then 8 pixels at a time with NEON; the
// remaining [0,8) pixels fall through to the portable version.
static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src);

        // Swap r and b.
        SkTSwap(rgba.val[0], rgba.val[2]);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);

        // Swap r and b.
        SkTSwap(rgba.val[0], rgba.val[2]);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}
278
// Expand packed RGB to 8888 with an opaque alpha channel (optionally
// swapping R and B), 16 then 8 pixels at a time with NEON.
template <bool kSwapRB>
static void insert_alpha_should_swaprb(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x3_t rgb = vld3q_u8(src);

        // Insert an opaque alpha channel and swap if needed.
        uint8x16x4_t rgba;
        if (kSwapRB) {
            rgba.val[0] = rgb.val[2];
            rgba.val[2] = rgb.val[0];
        } else {
            rgba.val[0] = rgb.val[0];
            rgba.val[2] = rgb.val[2];
        }
        rgba.val[1] = rgb.val[1];
        rgba.val[3] = vdupq_n_u8(0xFF);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16*3;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x3_t rgb = vld3_u8(src);

        // Insert an opaque alpha channel and swap if needed.
        uint8x8x4_t rgba;
        if (kSwapRB) {
            rgba.val[0] = rgb.val[2];
            rgba.val[2] = rgb.val[0];
        } else {
            rgba.val[0] = rgb.val[0];
            rgba.val[2] = rgb.val[2];
        }
        rgba.val[1] = rgb.val[1];
        rgba.val[3] = vdup_n_u8(0xFF);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8*3;
        dst += 8;
        count -= 8;
    }

    // Call portable code to finish up the tail of [0,8) pixels.
    auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
    proc(dst, src, count);
}
332
// NEON entry points: forward to the shared alpha-insertion kernel with the
// R/B-swap behavior selected at compile time.
static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
    insert_alpha_should_swaprb<false>(dst, src, count);
}

static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
    insert_alpha_should_swaprb<true>(dst, src, count);
}
340
// Replicate gray into R, G and B with an opaque alpha channel, 16 then 8
// pixels at a time with NEON; the [0,8) tail goes to the portable code.
static void gray_to_RGB1(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16_t gray = vld1q_u8(src);

        // Set each of the color channels.
        uint8x16x4_t rgba;
        rgba.val[0] = gray;
        rgba.val[1] = gray;
        rgba.val[2] = gray;
        rgba.val[3] = vdupq_n_u8(0xFF);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8_t gray = vld1_u8(src);

        // Set each of the color channels.
        uint8x8x4_t rgba;
        rgba.val[0] = gray;
        rgba.val[1] = gray;
        rgba.val[2] = gray;
        rgba.val[3] = vdup_n_u8(0xFF);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }

    gray_to_RGB1_portable(dst, src, count);
}
381
// Expand gray+alpha pairs to 8888 pixels, 16 then 8 at a time with NEON.
// When kPremul is set the gray value is first scaled by alpha.
template <bool kPremul>
static void expand_grayA(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x2_t ga = vld2q_u8(src);

        // Premultiply if requested.
        if (kPremul) {
            // scale() works on 8-lane vectors, so premultiply each half.
            ga.val[0] = vcombine_u8(
                    scale(vget_low_u8(ga.val[0]),  vget_low_u8(ga.val[1])),
                    scale(vget_high_u8(ga.val[0]), vget_high_u8(ga.val[1])));
        }

        // Set each of the color channels.
        uint8x16x4_t rgba;
        rgba.val[0] = ga.val[0];
        rgba.val[1] = ga.val[0];
        rgba.val[2] = ga.val[0];
        rgba.val[3] = ga.val[1];

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16*2;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x2_t ga = vld2_u8(src);

        // Premultiply if requested.
        if (kPremul) {
            ga.val[0] = scale(ga.val[0], ga.val[1]);
        }

        // Set each of the color channels.
        uint8x8x4_t rgba;
        rgba.val[0] = ga.val[0];
        rgba.val[1] = ga.val[0];
        rgba.val[2] = ga.val[0];
        rgba.val[3] = ga.val[1];

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8*2;
        dst += 8;
        count -= 8;
    }

    // Call portable code to finish up the tail of [0,8) pixels.
    auto proc = kPremul ? grayA_to_rgbA_portable : grayA_to_RGBA_portable;
    proc(dst, src, count);
}
436
// NEON entry points: forward to the shared gray+alpha expansion kernel,
// with premultiplication selected at compile time.
static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {
    expand_grayA<false>(dst, src, count);
}

static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
    expand_grayA<true>(dst, src, count);
}
444
// Output channel order for the CMYK conversion below.
enum Format { kRGB1, kBGR1 };

// Convert inverted-CMYK pixels to opaque RGB1/BGR1 (r = c*k, g = m*k,
// b = y*k), 8 pixels at a time with NEON; the [0,8) tail is portable.
template <Format format>
static void inverted_cmyk_to(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    while (count >= 8) {
        // Load 8 cmyk pixels.
        uint8x8x4_t pixels = vld4_u8((const uint8_t*) src);

        uint8x8_t k = pixels.val[3],
                  y = pixels.val[2],
                  m = pixels.val[1],
                  c = pixels.val[0];

        // Scale to r, g, b.
        uint8x8_t b = scale(y, k);
        uint8x8_t g = scale(m, k);
        uint8x8_t r = scale(c, k);

        // Store 8 rgba pixels.
        if (kBGR1 == format) {
            pixels.val[3] = vdup_n_u8(0xFF);
            pixels.val[2] = r;
            pixels.val[1] = g;
            pixels.val[0] = b;
        } else {
            pixels.val[3] = vdup_n_u8(0xFF);
            pixels.val[2] = b;
            pixels.val[1] = g;
            pixels.val[0] = r;
        }
        vst4_u8((uint8_t*) dst, pixels);
        src += 8;
        dst += 8;
        count -= 8;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}
484
// NEON entry points: forward to the shared CMYK kernel with the output
// order selected at compile time.
static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
    inverted_cmyk_to<kRGB1>(dst, src, count);
}

static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
    inverted_cmyk_to<kBGR1>(dst, src, count);
}
492
493 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
494
// Scale a byte by another.
// Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
// Returns (x*y + 127) / 255, again in 16-bit lanes.
static __m128i scale(__m128i x, __m128i y) {
    const __m128i _128 = _mm_set1_epi16(128);
    const __m128i _257 = _mm_set1_epi16(257);

    // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
    return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257);
}
504
// Premultiply RGBA pixels by alpha with SSSE3, 8 then 4 at a time,
// optionally swapping R and B.  The [0,4) tail goes to the portable code.
template <bool kSwapRB>
static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;

    // Premultiplies the 8 pixels held across *lo and *hi, in place.
    auto premul8 = [](__m128i* lo, __m128i* hi) {
        const __m128i zeros = _mm_setzero_si128();
        __m128i planar;
        if (kSwapRB) {
            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm_shuffle_epi8(*lo, planar);                      // rrrrgggg bbbbaaaa
        *hi = _mm_shuffle_epi8(*hi, planar);                      // RRRRGGGG BBBBAAAA
        __m128i rg = _mm_unpacklo_epi32(*lo, *hi),                // rrrrRRRR ggggGGGG
                ba = _mm_unpackhi_epi32(*lo, *hi);                // bbbbBBBB aaaaAAAA

        // Unpack to 16-bit planar.
        __m128i r = _mm_unpacklo_epi8(rg, zeros),                 // r_r_r_r_ R_R_R_R_
                g = _mm_unpackhi_epi8(rg, zeros),                 // g_g_g_g_ G_G_G_G_
                b = _mm_unpacklo_epi8(ba, zeros),                 // b_b_b_b_ B_B_B_B_
                a = _mm_unpackhi_epi8(ba, zeros);                 // a_a_a_a_ A_A_A_A_

        // Premultiply!
        r = scale(r, a);
        g = scale(g, a);
        b = scale(b, a);

        // Repack into interlaced pixels.
        rg = _mm_or_si128(r, _mm_slli_epi16(g, 8));               // rgrgrgrg RGRGRGRG
        ba = _mm_or_si128(b, _mm_slli_epi16(a, 8));               // babababa BABABABA
        *lo = _mm_unpacklo_epi16(rg, ba);                         // rgbargba rgbargba
        *hi = _mm_unpackhi_epi16(rg, ba);                         // RGBARGBA RGBARGBA
    };

    while (count >= 8) {
        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
                hi = _mm_loadu_si128((const __m128i*) (src + 4));

        premul8(&lo, &hi);

        _mm_storeu_si128((__m128i*) (dst + 0), lo);
        _mm_storeu_si128((__m128i*) (dst + 4), hi);

        src += 8;
        dst += 8;
        count -= 8;
    }

    if (count >= 4) {
        // Only 4 pixels remain; run the 8-pixel kernel with a zeroed hi half
        // and store just the lo half.
        __m128i lo = _mm_loadu_si128((const __m128i*) src),
                hi = _mm_setzero_si128();

        premul8(&lo, &hi);

        _mm_storeu_si128((__m128i*) dst, lo);

        src += 4;
        dst += 4;
        count -= 4;
    }

    // Call portable code to finish up the tail of [0,4) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}
573
// SSSE3 entry points: forward to the shared premultiply kernel with the
// R/B-swap behavior selected at compile time.
static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
    premul_should_swapRB<false>(dst, src, count);
}

static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
    premul_should_swapRB<true>(dst, src, count);
}
581
// Swap R and B channels, 4 pixels at a time, with a single byte shuffle;
// the [0,4) tail falls through to the portable version.
static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    // Byte indices that exchange bytes 0 and 2 within each 4-byte pixel.
    const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);

    while (count >= 4) {
        __m128i rgba = _mm_loadu_si128((const __m128i*) src);
        __m128i bgra = _mm_shuffle_epi8(rgba, swapRB);
        _mm_storeu_si128((__m128i*) dst, bgra);

        src += 4;
        dst += 4;
        count -= 4;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}
598
// Expand packed RGB to 8888 with an opaque alpha channel (optionally
// swapping R and B), 4 pixels per iteration with SSSE3.
template <bool kSwapRB>
static void insert_alpha_should_swaprb(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;

    const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
    __m128i expand;
    const uint8_t X = 0xFF; // Used as a placeholder.  The value of X is irrelevant.
    if (kSwapRB) {
        expand = _mm_setr_epi8(2,1,0,X, 5,4,3,X, 8,7,6,X, 11,10,9,X);
    } else {
        expand = _mm_setr_epi8(0,1,2,X, 3,4,5,X, 6,7,8,X, 9,10,11,X);
    }

    // count >= 6 guarantees at least 18 bytes remain, so the 16-byte load
    // below never reads past the end of the source buffer.
    while (count >= 6) {
        // Load a vector.  While this actually contains 5 pixels plus an
        // extra component, we will discard all but the first four pixels on
        // this iteration.
        __m128i rgb = _mm_loadu_si128((const __m128i*) src);

        // Expand the first four pixels to RGBX and then mask to RGB(FF).
        __m128i rgba = _mm_or_si128(_mm_shuffle_epi8(rgb, expand), alphaMask);

        // Store 4 pixels.
        _mm_storeu_si128((__m128i*) dst, rgba);

        src += 4*3;
        dst += 4;
        count -= 4;
    }

    // Call portable code to finish up the tail of [0,4) pixels.
    auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
    proc(dst, src, count);
}
633
// SSSE3 entry points: forward to the shared alpha-insertion kernel with
// the R/B-swap behavior selected at compile time.
static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
    insert_alpha_should_swaprb<false>(dst, src, count);
}

static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
    insert_alpha_should_swaprb<true>(dst, src, count);
}
641
// Replicate gray into R, G and B with an opaque alpha channel, 16 pixels at
// a time with SSE2 unpacks; the [0,16) tail goes to the portable code.
static void gray_to_RGB1(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;

    const __m128i alphas = _mm_set1_epi8((uint8_t) 0xFF);
    while (count >= 16) {
        __m128i grays = _mm_loadu_si128((const __m128i*) src);

        // Interleave gray with itself (gg) and with 0xFF (ga), then
        // interleave those 16-bit pairs into full g,g,g,a pixels.
        __m128i gg_lo = _mm_unpacklo_epi8(grays, grays);
        __m128i gg_hi = _mm_unpackhi_epi8(grays, grays);
        __m128i ga_lo = _mm_unpacklo_epi8(grays, alphas);
        __m128i ga_hi = _mm_unpackhi_epi8(grays, alphas);

        __m128i ggga0 = _mm_unpacklo_epi16(gg_lo, ga_lo);
        __m128i ggga1 = _mm_unpackhi_epi16(gg_lo, ga_lo);
        __m128i ggga2 = _mm_unpacklo_epi16(gg_hi, ga_hi);
        __m128i ggga3 = _mm_unpackhi_epi16(gg_hi, ga_hi);

        _mm_storeu_si128((__m128i*) (dst +  0), ggga0);
        _mm_storeu_si128((__m128i*) (dst +  4), ggga1);
        _mm_storeu_si128((__m128i*) (dst +  8), ggga2);
        _mm_storeu_si128((__m128i*) (dst + 12), ggga3);

        src += 16;
        dst += 16;
        count -= 16;
    }

    gray_to_RGB1_portable(dst, src, count);
}
671
// Expand gray+alpha pairs to unpremultiplied 8888, 8 pixels at a time.
static void grayA_to_RGBA(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;
    while (count >= 8) {
        // Each 16-bit lane of ga holds one pixel: gray in the low byte,
        // alpha in the high byte.
        __m128i ga = _mm_loadu_si128((const __m128i*) src);

        // gg: gray copied into both bytes of each lane.
        __m128i gg = _mm_or_si128(_mm_and_si128(ga, _mm_set1_epi16(0x00FF)),
                                  _mm_slli_epi16(ga, 8));

        // Interleave (g,g) with (g,a) to build g,g,g,a pixels.
        __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
        __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);

        _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
        _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);

        src += 8*2;
        dst += 8;
        count -= 8;
    }

    grayA_to_RGBA_portable(dst, src, count);
}
693
// Expand gray+alpha pairs to premultiplied 8888, 8 pixels at a time.
static void grayA_to_rgbA(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;
    while (count >= 8) {
        __m128i grayA = _mm_loadu_si128((const __m128i*) src);

        // Split each 16-bit lane into gray (low byte) and alpha (high byte).
        __m128i g0 = _mm_and_si128(grayA, _mm_set1_epi16(0x00FF));
        __m128i a0 = _mm_srli_epi16(grayA, 8);

        // Premultiply
        g0 = scale(g0, a0);

        // Rebuild lanes as (g,g) and (g,a), then interleave into g,g,g,a.
        __m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8));
        __m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8));


        __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
        __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);

        _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
        _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);

        src += 8*2;
        dst += 8;
        count -= 8;
    }

    grayA_to_rgbA_portable(dst, src, count);
}
722
// Output channel order for the CMYK conversion below.
enum Format { kRGB1, kBGR1 };

// Convert inverted-CMYK pixels to opaque RGB1/BGR1 (r = c*k, g = m*k,
// b = y*k) with SSSE3, 8 then 4 pixels at a time; the [0,4) tail is
// handled by the matching portable routine.
template <Format format>
static void inverted_cmyk_to(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;

    // Converts the 8 pixels held across *lo and *hi, in place.
    auto convert8 = [](__m128i* lo, __m128i* hi) {
        const __m128i zeros = _mm_setzero_si128();
        __m128i planar;
        if (kBGR1 == format) {
            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm_shuffle_epi8(*lo, planar);                      // ccccmmmm yyyykkkk
        *hi = _mm_shuffle_epi8(*hi, planar);                      // CCCCMMMM YYYYKKKK
        __m128i cm = _mm_unpacklo_epi32(*lo, *hi),                // ccccCCCC mmmmMMMM
                yk = _mm_unpackhi_epi32(*lo, *hi);                // yyyyYYYY kkkkKKKK

        // Unpack to 16-bit planar.
        __m128i c = _mm_unpacklo_epi8(cm, zeros),                 // c_c_c_c_ C_C_C_C_
                m = _mm_unpackhi_epi8(cm, zeros),                 // m_m_m_m_ M_M_M_M_
                y = _mm_unpacklo_epi8(yk, zeros),                 // y_y_y_y_ Y_Y_Y_Y_
                k = _mm_unpackhi_epi8(yk, zeros);                 // k_k_k_k_ K_K_K_K_

        // Scale to r, g, b.
        __m128i r = scale(c, k),
                g = scale(m, k),
                b = scale(y, k);

        // Repack into interlaced pixels.
        __m128i rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)),                     // rgrgrgrg RGRGRGRG
                ba = _mm_or_si128(b, _mm_set1_epi16((uint16_t) 0xFF00));        // b1b1b1b1 B1B1B1B1
        *lo = _mm_unpacklo_epi16(rg, ba);                                       // rgbargba rgbargba
        *hi = _mm_unpackhi_epi16(rg, ba);                                       // RGB1RGB1 RGB1RGB1
    };

    while (count >= 8) {
        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
                hi = _mm_loadu_si128((const __m128i*) (src + 4));

        convert8(&lo, &hi);

        _mm_storeu_si128((__m128i*) (dst + 0), lo);
        _mm_storeu_si128((__m128i*) (dst + 4), hi);

        src += 8;
        dst += 8;
        count -= 8;
    }

    if (count >= 4) {
        // Only 4 pixels remain; run the 8-pixel kernel with a zeroed hi half
        // and store just the lo half.
        __m128i lo = _mm_loadu_si128((const __m128i*) src),
                hi = _mm_setzero_si128();

        convert8(&lo, &hi);

        _mm_storeu_si128((__m128i*) dst, lo);

        src += 4;
        dst += 4;
        count -= 4;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}
791
// SSSE3 entry points: forward to the shared CMYK kernel with the output
// order selected at compile time.
static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
    inverted_cmyk_to<kRGB1>(dst, src, count);
}

static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
    inverted_cmyk_to<kBGR1>(dst, src, count);
}
799
800 #else
801
// No SIMD available: every entry point simply forwards to its portable
// implementation.

static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
    RGBA_to_rgbA_portable(dst, src, count);
}

static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
    RGBA_to_bgrA_portable(dst, src, count);
}

static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) {
    RGBA_to_BGRA_portable(dst, src, count);
}

static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
    RGB_to_RGB1_portable(dst, src, count);
}

static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
    RGB_to_BGR1_portable(dst, src, count);
}

static void gray_to_RGB1(uint32_t dst[], const void* src, int count) {
    gray_to_RGB1_portable(dst, src, count);
}

static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {
    grayA_to_RGBA_portable(dst, src, count);
}

static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
    grayA_to_rgbA_portable(dst, src, count);
}

static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
    inverted_CMYK_to_RGB1_portable(dst, src, count);
}

static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
    inverted_CMYK_to_BGR1_portable(dst, src, count);
}
841
842 #endif
843
844 }
845
846 #endif // SkSwizzler_opts_DEFINED
847