1 /*
2 * Copyright 2016 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #ifndef SkSwizzler_opts_DEFINED
9 #define SkSwizzler_opts_DEFINED
10
11 #include "SkColorPriv.h"
12
13 namespace SK_OPTS_NS {
14
// Premultiply 8888 RGBA pixels by alpha, keeping the channel order.
static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint32_t px = src[i];
        uint32_t a =  px >> 24;
        uint32_t b = (px >> 16) & 0xFF;
        uint32_t g = (px >>  8) & 0xFF;
        uint32_t r =  px        & 0xFF;
        // Rounded scale by alpha: (x*a + 127) / 255.
        b = (b*a + 127) / 255;
        g = (g*a + 127) / 255;
        r = (r*a + 127) / 255;
        dst[i] = a << 24 | b << 16 | g << 8 | r;
    }
}
31
// Premultiply 8888 RGBA pixels by alpha and swap the R and B channels.
static void RGBA_to_bgrA_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* src = (const uint32_t*)vsrc;
    for (const uint32_t* end = src + count; src != end; src++, dst++) {
        uint32_t px = *src;
        uint32_t a = px >> 24;
        // Rounded premultiply of each color channel: (x*a + 127) / 255.
        uint32_t b = (((px >> 16) & 0xFF) * a + 127) / 255;
        uint32_t g = (((px >>  8) & 0xFF) * a + 127) / 255;
        uint32_t r = (( px        & 0xFF) * a + 127) / 255;
        // Store with red and blue exchanged.
        *dst = a << 24 | r << 16 | g << 8 | b;
    }
}
48
// Swap the R and B channels of 8888 pixels; alpha and green are untouched.
static void RGBA_to_BGRA_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint32_t px = src[i];
        // Keep alpha and green in place, exchange the red and blue bytes.
        dst[i] = (px & 0xFF00FF00u)
               | ((px & 0x00FF0000u) >> 16)
               | ((px & 0x000000FFu) << 16);
    }
}
62
// Expand packed 24-bit RGB triples to 8888 RGBA with an opaque alpha.
static void RGB_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++, src += 3) {
        dst[i] = 0xFF000000u
               | (uint32_t)src[2] << 16
               | (uint32_t)src[1] <<  8
               | (uint32_t)src[0];
    }
}
76
// Expand packed 24-bit RGB triples to 8888 BGRA with an opaque alpha.
static void RGB_to_BGR1_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++, src += 3) {
        dst[i] = 0xFF000000u
               | (uint32_t)src[0] << 16
               | (uint32_t)src[1] <<  8
               | (uint32_t)src[2];
    }
}
90
// Replicate each 8-bit gray value into R, G, and B with an opaque alpha.
static void gray_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint32_t g = src[i];
        dst[i] = 0xFF000000u | g << 16 | g << 8 | g;
    }
}
100
// Expand gray+alpha pairs to 8888 RGBA (gray replicated, alpha kept as-is).
static void grayA_to_RGBA_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint32_t g = src[2*i + 0];
        uint32_t a = src[2*i + 1];
        dst[i] = a << 24 | g << 16 | g << 8 | g;
    }
}
113
// Expand gray+alpha pairs to premultiplied 8888 RGBA.
static void grayA_to_rgbA_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint32_t g = src[2*i + 0];
        uint32_t a = src[2*i + 1];
        g = (g * a + 127) / 255;  // rounded premultiply
        dst[i] = a << 24 | g << 16 | g << 8 | g;
    }
}
127
// Convert inverted-CMYK pixels to opaque 8888 RGBA.
static void inverted_CMYK_to_RGB1_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint32_t px = src[i];
        uint32_t k =  px >> 24;
        uint32_t y = (px >> 16) & 0xFF;
        uint32_t m = (px >>  8) & 0xFF;
        uint32_t c =  px        & 0xFF;
        // See comments in SkSwizzler.cpp for details on the conversion formula.
        uint32_t b = (y * k + 127) / 255;
        uint32_t g = (m * k + 127) / 255;
        uint32_t r = (c * k + 127) / 255;
        dst[i] = 0xFF000000u | b << 16 | g << 8 | r;
    }
}
145
// Convert inverted-CMYK pixels to opaque 8888 BGRA.
static void inverted_CMYK_to_BGR1_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint32_t px = src[i];
        uint32_t k =  px >> 24;
        uint32_t y = (px >> 16) & 0xFF;
        uint32_t m = (px >>  8) & 0xFF;
        uint32_t c =  px        & 0xFF;
        // Same formula as the RGB1 variant, with R and B swapped on output.
        uint32_t b = (y * k + 127) / 255;
        uint32_t g = (m * k + 127) / 255;
        uint32_t r = (c * k + 127) / 255;
        dst[i] = 0xFF000000u | r << 16 | g << 8 | b;
    }
}
162
163 #if defined(SK_ARM_HAS_NEON)
164
// Rounded divide by 255, (x + 127) / 255
// x holds eight 16-bit lanes, each a product of two bytes (at most 255*255);
// the result narrows each lane back to 8 bits.
static uint8x8_t div255_round(uint16x8_t x) {
    // result = (x + 127) / 255
    // result = (x + 127) / 256 + error1
    //
    // error1 = (x + 127) / (255 * 256)
    // error1 = (x + 127) / (256 * 256) + error2
    //
    // error2 = (x + 127) / (255 * 256 * 256)
    //
    // The maximum value of error2 is too small to matter.  Thus:
    // result = (x + 127) / 256 + (x + 127) / (256 * 256)
    // result = ((x + 127) / 256 + x + 127) / 256
    // result = ((x + 127) >> 8 + x + 127) >> 8
    //
    // Use >>> to represent "rounded right shift" which, conveniently,
    // NEON supports in one instruction.
    // result = ((x >>> 8) + x) >>> 8
    //
    // Note that the second right shift is actually performed as an
    // "add, round, and narrow back to 8-bits" instruction.
    return vraddhn_u16(x, vrshrq_n_u16(x, 8));
}
188
// Scale a byte by another, (x * y + 127) / 255
// Widening 8x8->16 multiply, then rounded divide back down to 8 bits.
static uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
    return div255_round(vmull_u8(x, y));
}
193
// Premultiply RGBA pixels 8 at a time with NEON, optionally swapping the
// R and B channels on output. kSwapRB is a compile-time flag so the branch
// below is resolved at template instantiation.
template <bool kSwapRB>
static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    while (count >= 8) {
        // Load 8 pixels.
        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);

        uint8x8_t a = rgba.val[3],
                  b = rgba.val[2],
                  g = rgba.val[1],
                  r = rgba.val[0];

        // Premultiply.
        b = scale(b, a);
        g = scale(g, a);
        r = scale(r, a);

        // Store 8 premultiplied pixels.
        if (kSwapRB) {
            rgba.val[2] = r;
            rgba.val[1] = g;
            rgba.val[0] = b;
        } else {
            rgba.val[2] = b;
            rgba.val[1] = g;
            rgba.val[0] = r;
        }
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }

    // Call portable code to finish up the tail of [0,8) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}
231
// Premultiply RGBA, keeping channel order (NEON path).
static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
    premul_should_swapRB<false>(dst, src, count);
}
235
// Premultiply RGBA and swap R/B (NEON path).
static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
    premul_should_swapRB<true>(dst, src, count);
}
239
// Swap R and B channels with NEON, 16 pixels at a time, then 8, then the
// [0,8) tail via portable code.
static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src);

        // Swap r and b.
        SkTSwap(rgba.val[0], rgba.val[2]);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);

        // Swap r and b.
        SkTSwap(rgba.val[0], rgba.val[2]);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}
272
// Expand packed RGB triples to 8888 with opaque alpha using NEON, optionally
// swapping R and B. Processes 16, then 8 pixels per iteration; the [0,8)
// tail falls back to portable code.
template <bool kSwapRB>
static void insert_alpha_should_swaprb(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x3_t rgb = vld3q_u8(src);

        // Insert an opaque alpha channel and swap if needed.
        uint8x16x4_t rgba;
        if (kSwapRB) {
            rgba.val[0] = rgb.val[2];
            rgba.val[2] = rgb.val[0];
        } else {
            rgba.val[0] = rgb.val[0];
            rgba.val[2] = rgb.val[2];
        }
        rgba.val[1] = rgb.val[1];
        rgba.val[3] = vdupq_n_u8(0xFF);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16*3;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x3_t rgb = vld3_u8(src);

        // Insert an opaque alpha channel and swap if needed.
        uint8x8x4_t rgba;
        if (kSwapRB) {
            rgba.val[0] = rgb.val[2];
            rgba.val[2] = rgb.val[0];
        } else {
            rgba.val[0] = rgb.val[0];
            rgba.val[2] = rgb.val[2];
        }
        rgba.val[1] = rgb.val[1];
        rgba.val[3] = vdup_n_u8(0xFF);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8*3;
        dst += 8;
        count -= 8;
    }

    // Call portable code to finish up the tail of [0,8) pixels.
    auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
    proc(dst, src, count);
}
326
// RGB -> opaque RGBA (NEON path).
static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
    insert_alpha_should_swaprb<false>(dst, src, count);
}
330
// RGB -> opaque BGRA (NEON path).
static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
    insert_alpha_should_swaprb<true>(dst, src, count);
}
334
// Expand 8-bit gray to opaque 8888 with NEON by replicating the gray value
// into each color channel; the [0,8) tail is handled portably.
static void gray_to_RGB1(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16_t gray = vld1q_u8(src);

        // Set each of the color channels.
        uint8x16x4_t rgba;
        rgba.val[0] = gray;
        rgba.val[1] = gray;
        rgba.val[2] = gray;
        rgba.val[3] = vdupq_n_u8(0xFF);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8_t gray = vld1_u8(src);

        // Set each of the color channels.
        uint8x8x4_t rgba;
        rgba.val[0] = gray;
        rgba.val[1] = gray;
        rgba.val[2] = gray;
        rgba.val[3] = vdup_n_u8(0xFF);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }

    gray_to_RGB1_portable(dst, src, count);
}
375
// Expand gray+alpha pairs to 8888 with NEON, optionally premultiplying the
// gray value by alpha. The [0,8) tail falls back to portable code.
template <bool kPremul>
static void expand_grayA(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x2_t ga = vld2q_u8(src);

        // Premultiply if requested.
        if (kPremul) {
            // scale() works on 8-lane halves, so split, scale, and recombine.
            ga.val[0] = vcombine_u8(
                    scale(vget_low_u8(ga.val[0]),  vget_low_u8(ga.val[1])),
                    scale(vget_high_u8(ga.val[0]), vget_high_u8(ga.val[1])));
        }

        // Set each of the color channels.
        uint8x16x4_t rgba;
        rgba.val[0] = ga.val[0];
        rgba.val[1] = ga.val[0];
        rgba.val[2] = ga.val[0];
        rgba.val[3] = ga.val[1];

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16*2;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x2_t ga = vld2_u8(src);

        // Premultiply if requested.
        if (kPremul) {
            ga.val[0] = scale(ga.val[0], ga.val[1]);
        }

        // Set each of the color channels.
        uint8x8x4_t rgba;
        rgba.val[0] = ga.val[0];
        rgba.val[1] = ga.val[0];
        rgba.val[2] = ga.val[0];
        rgba.val[3] = ga.val[1];

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8*2;
        dst += 8;
        count -= 8;
    }

    auto proc = kPremul ? grayA_to_rgbA_portable : grayA_to_RGBA_portable;
    proc(dst, src, count);
}
430
// Gray+alpha -> unpremultiplied RGBA (NEON path).
static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {
    expand_grayA<false>(dst, src, count);
}
434
// Gray+alpha -> premultiplied RGBA (NEON path).
static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
    expand_grayA<true>(dst, src, count);
}
438
enum Format { kRGB1, kBGR1 };
// Convert 8 inverted-CMYK pixels at a time with NEON into opaque RGB1 or
// BGR1; see SkSwizzler.cpp for the conversion formula. Tail goes portable.
template <Format format>
static void inverted_cmyk_to(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    while (count >= 8) {
        // Load 8 cmyk pixels.
        uint8x8x4_t pixels = vld4_u8((const uint8_t*) src);

        uint8x8_t k = pixels.val[3],
                  y = pixels.val[2],
                  m = pixels.val[1],
                  c = pixels.val[0];

        // Scale to r, g, b.
        uint8x8_t b = scale(y, k);
        uint8x8_t g = scale(m, k);
        uint8x8_t r = scale(c, k);

        // Store 8 rgba pixels.
        if (kBGR1 == format) {
            pixels.val[3] = vdup_n_u8(0xFF);
            pixels.val[2] = r;
            pixels.val[1] = g;
            pixels.val[0] = b;
        } else {
            pixels.val[3] = vdup_n_u8(0xFF);
            pixels.val[2] = b;
            pixels.val[1] = g;
            pixels.val[0] = r;
        }
        vst4_u8((uint8_t*) dst, pixels);
        src += 8;
        dst += 8;
        count -= 8;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}
478
// Inverted CMYK -> opaque RGBA (NEON path).
static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
    inverted_cmyk_to<kRGB1>(dst, src, count);
}
482
// Inverted CMYK -> opaque BGRA (NEON path).
static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
    inverted_cmyk_to<kBGR1>(dst, src, count);
}
486
487 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
488
// Scale a byte by another.
// Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
// Computes the rounded byte product (x*y + 127) / 255 per lane.
static __m128i scale(__m128i x, __m128i y) {
    const __m128i _128 = _mm_set1_epi16(128);
    const __m128i _257 = _mm_set1_epi16(257);

    // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
    return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257);
}
498
// Premultiply RGBA pixels with SSSE3, optionally swapping R and B on output.
// Processes 8, then 4 pixels per iteration; the [0,4) tail goes portable.
template <bool kSwapRB>
static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;

    // Premultiplies two registers (8 pixels) in place.
    auto premul8 = [](__m128i* lo, __m128i* hi) {
        const __m128i zeros = _mm_setzero_si128();
        __m128i planar;
        if (kSwapRB) {
            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm_shuffle_epi8(*lo, planar);        // rrrrgggg bbbbaaaa
        *hi = _mm_shuffle_epi8(*hi, planar);        // RRRRGGGG BBBBAAAA
        __m128i rg = _mm_unpacklo_epi32(*lo, *hi),  // rrrrRRRR ggggGGGG
                ba = _mm_unpackhi_epi32(*lo, *hi);  // bbbbBBBB aaaaAAAA

        // Unpack to 16-bit planar.
        __m128i r = _mm_unpacklo_epi8(rg, zeros),   // r_r_r_r_ R_R_R_R_
                g = _mm_unpackhi_epi8(rg, zeros),   // g_g_g_g_ G_G_G_G_
                b = _mm_unpacklo_epi8(ba, zeros),   // b_b_b_b_ B_B_B_B_
                a = _mm_unpackhi_epi8(ba, zeros);   // a_a_a_a_ A_A_A_A_

        // Premultiply!
        r = scale(r, a);
        g = scale(g, a);
        b = scale(b, a);

        // Repack into interlaced pixels.
        rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)); // rgrgrgrg RGRGRGRG
        ba = _mm_or_si128(b, _mm_slli_epi16(a, 8)); // babababa BABABABA
        *lo = _mm_unpacklo_epi16(rg, ba);           // rgbargba rgbargba
        *hi = _mm_unpackhi_epi16(rg, ba);           // RGBARGBA RGBARGBA
    };

    while (count >= 8) {
        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
                hi = _mm_loadu_si128((const __m128i*) (src + 4));

        premul8(&lo, &hi);

        _mm_storeu_si128((__m128i*) (dst + 0), lo);
        _mm_storeu_si128((__m128i*) (dst + 4), hi);

        src += 8;
        dst += 8;
        count -= 8;
    }

    if (count >= 4) {
        // Only 4 pixels remain; run them through the low register and
        // discard the (zeroed) high half.
        __m128i lo = _mm_loadu_si128((const __m128i*) src),
                hi = _mm_setzero_si128();

        premul8(&lo, &hi);

        _mm_storeu_si128((__m128i*) dst, lo);

        src += 4;
        dst += 4;
        count -= 4;
    }

    // Call portable code to finish up the tail of [0,4) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}
567
// Premultiply RGBA, keeping channel order (SSSE3 path).
static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
    premul_should_swapRB<false>(dst, src, count);
}
571
// Premultiply RGBA and swap R/B (SSSE3 path).
static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
    premul_should_swapRB<true>(dst, src, count);
}
575
// Swap R and B with a single byte shuffle, 4 pixels at a time; the [0,4)
// tail goes through portable code.
static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    // Byte indices that exchange bytes 0 and 2 within each 4-byte pixel.
    const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);

    while (count >= 4) {
        __m128i rgba = _mm_loadu_si128((const __m128i*) src);
        __m128i bgra = _mm_shuffle_epi8(rgba, swapRB);
        _mm_storeu_si128((__m128i*) dst, bgra);

        src += 4;
        dst += 4;
        count -= 4;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}
592
// Expand packed RGB triples to 8888 with opaque alpha using SSSE3, optionally
// swapping R and B. Each iteration consumes 12 bytes (4 pixels) of input.
template <bool kSwapRB>
static void insert_alpha_should_swaprb(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;

    const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
    __m128i expand;
    const uint8_t X = 0xFF; // Used a placeholder.  The value of X is irrelevant.
    if (kSwapRB) {
        expand = _mm_setr_epi8(2,1,0,X, 5,4,3,X, 8,7,6,X, 11,10,9,X);
    } else {
        expand = _mm_setr_epi8(0,1,2,X, 3,4,5,X, 6,7,8,X, 9,10,11,X);
    }

    // Require at least 6 remaining pixels so the 16-byte load below never
    // reads past the end of the source buffer.
    while (count >= 6) {
        // Load a vector.  While this actually contains 5 pixels plus an
        // extra component, we will discard all but the first four pixels on
        // this iteration.
        __m128i rgb = _mm_loadu_si128((const __m128i*) src);

        // Expand the first four pixels to RGBX and then mask to RGB(FF).
        __m128i rgba = _mm_or_si128(_mm_shuffle_epi8(rgb, expand), alphaMask);

        // Store 4 pixels.
        _mm_storeu_si128((__m128i*) dst, rgba);

        src += 4*3;
        dst += 4;
        count -= 4;
    }

    // Call portable code to finish up the tail of [0,4) pixels.
    auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
    proc(dst, src, count);
}
627
// RGB -> opaque RGBA (SSSE3 path).
static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
    insert_alpha_should_swaprb<false>(dst, src, count);
}
631
// RGB -> opaque BGRA (SSSE3 path).
static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
    insert_alpha_should_swaprb<true>(dst, src, count);
}
635
// Expand 8-bit gray to opaque 8888 with SSE2-style unpacks, 16 pixels at a
// time; the [0,16) tail goes portable.
static void gray_to_RGB1(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;

    const __m128i alphas = _mm_set1_epi8((uint8_t) 0xFF);
    while (count >= 16) {
        __m128i grays = _mm_loadu_si128((const __m128i*) src);

        // Interleave gray with gray (gg) and gray with alpha (ga) ...
        __m128i gg_lo = _mm_unpacklo_epi8(grays, grays);
        __m128i gg_hi = _mm_unpackhi_epi8(grays, grays);
        __m128i ga_lo = _mm_unpacklo_epi8(grays, alphas);
        __m128i ga_hi = _mm_unpackhi_epi8(grays, alphas);

        // ... then interleave again to form g,g,g,a pixels.
        __m128i ggga0 = _mm_unpacklo_epi16(gg_lo, ga_lo);
        __m128i ggga1 = _mm_unpackhi_epi16(gg_lo, ga_lo);
        __m128i ggga2 = _mm_unpacklo_epi16(gg_hi, ga_hi);
        __m128i ggga3 = _mm_unpackhi_epi16(gg_hi, ga_hi);

        _mm_storeu_si128((__m128i*) (dst +  0), ggga0);
        _mm_storeu_si128((__m128i*) (dst +  4), ggga1);
        _mm_storeu_si128((__m128i*) (dst +  8), ggga2);
        _mm_storeu_si128((__m128i*) (dst + 12), ggga3);

        src += 16;
        dst += 16;
        count -= 16;
    }

    gray_to_RGB1_portable(dst, src, count);
}
665
// Expand gray+alpha pairs to unpremultiplied 8888, 8 pixels at a time;
// the [0,8) tail goes portable.
static void grayA_to_RGBA(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;
    while (count >= 8) {
        __m128i ga = _mm_loadu_si128((const __m128i*) src);

        // Build gg by pairing each gray byte with itself.
        __m128i gg = _mm_or_si128(_mm_and_si128(ga, _mm_set1_epi16(0x00FF)),
                                  _mm_slli_epi16(ga, 8));

        // Interleave gg with ga to form g,g,g,a pixels.
        __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
        __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);

        _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
        _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);

        src += 8*2;
        dst += 8;
        count -= 8;
    }

    grayA_to_RGBA_portable(dst, src, count);
}
687
// Expand gray+alpha pairs to premultiplied 8888, 8 pixels at a time;
// the [0,8) tail goes portable.
static void grayA_to_rgbA(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;
    while (count >= 8) {
        __m128i grayA = _mm_loadu_si128((const __m128i*) src);

        // Split the interleaved g,a bytes into 16-bit gray and alpha lanes.
        __m128i g0 = _mm_and_si128(grayA, _mm_set1_epi16(0x00FF));
        __m128i a0 = _mm_srli_epi16(grayA, 8);

        // Premultiply
        g0 = scale(g0, a0);

        __m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8));
        __m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8));


        __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
        __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);

        _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
        _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);

        src += 8*2;
        dst += 8;
        count -= 8;
    }

    grayA_to_rgbA_portable(dst, src, count);
}
716
enum Format { kRGB1, kBGR1 };
// Convert inverted-CMYK pixels to opaque RGB1/BGR1 with SSSE3; see
// SkSwizzler.cpp for the conversion formula. Processes 8, then 4 pixels
// per iteration; the [0,4) tail goes portable.
template <Format format>
static void inverted_cmyk_to(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;

    // Converts two registers (8 pixels) in place.
    auto convert8 = [](__m128i* lo, __m128i* hi) {
        const __m128i zeros = _mm_setzero_si128();
        __m128i planar;
        if (kBGR1 == format) {
            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm_shuffle_epi8(*lo, planar);        // ccccmmmm yyyykkkk
        *hi = _mm_shuffle_epi8(*hi, planar);        // CCCCMMMM YYYYKKKK
        __m128i cm = _mm_unpacklo_epi32(*lo, *hi),  // ccccCCCC mmmmMMMM
                yk = _mm_unpackhi_epi32(*lo, *hi);  // yyyyYYYY kkkkKKKK

        // Unpack to 16-bit planar.
        __m128i c = _mm_unpacklo_epi8(cm, zeros),   // c_c_c_c_ C_C_C_C_
                m = _mm_unpackhi_epi8(cm, zeros),   // m_m_m_m_ M_M_M_M_
                y = _mm_unpacklo_epi8(yk, zeros),   // y_y_y_y_ Y_Y_Y_Y_
                k = _mm_unpackhi_epi8(yk, zeros);   // k_k_k_k_ K_K_K_K_

        // Scale to r, g, b.
        __m128i r = scale(c, k),
                g = scale(m, k),
                b = scale(y, k);

        // Repack into interlaced pixels, with alpha forced to 0xFF.
        __m128i rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)),                  // rgrgrgrg RGRGRGRG
                ba = _mm_or_si128(b, _mm_set1_epi16((uint16_t) 0xFF00));     // b1b1b1b1 B1B1B1B1
        *lo = _mm_unpacklo_epi16(rg, ba);                                    // rgbargba rgbargba
        *hi = _mm_unpackhi_epi16(rg, ba);                                    // RGB1RGB1 RGB1RGB1
    };

    while (count >= 8) {
        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
                hi = _mm_loadu_si128((const __m128i*) (src + 4));

        convert8(&lo, &hi);

        _mm_storeu_si128((__m128i*) (dst + 0), lo);
        _mm_storeu_si128((__m128i*) (dst + 4), hi);

        src += 8;
        dst += 8;
        count -= 8;
    }

    if (count >= 4) {
        // Only 4 pixels remain; convert them in the low register and
        // discard the (zeroed) high half.
        __m128i lo = _mm_loadu_si128((const __m128i*) src),
                hi = _mm_setzero_si128();

        convert8(&lo, &hi);

        _mm_storeu_si128((__m128i*) dst, lo);

        src += 4;
        dst += 4;
        count -= 4;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}
785
// Inverted CMYK -> opaque RGBA (SSSE3 path).
static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
    inverted_cmyk_to<kRGB1>(dst, src, count);
}
789
// Inverted CMYK -> opaque BGRA (SSSE3 path).
static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
    inverted_cmyk_to<kBGR1>(dst, src, count);
}
793
794 #else
795
// No SIMD available: forward to the portable implementation.
static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
    RGBA_to_rgbA_portable(dst, src, count);
}
799
// No SIMD available: forward to the portable implementation.
static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
    RGBA_to_bgrA_portable(dst, src, count);
}
803
// No SIMD available: forward to the portable implementation.
static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) {
    RGBA_to_BGRA_portable(dst, src, count);
}
807
// No SIMD available: forward to the portable implementation.
static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
    RGB_to_RGB1_portable(dst, src, count);
}
811
// No SIMD available: forward to the portable implementation.
static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
    RGB_to_BGR1_portable(dst, src, count);
}
815
// No SIMD available: forward to the portable implementation.
static void gray_to_RGB1(uint32_t dst[], const void* src, int count) {
    gray_to_RGB1_portable(dst, src, count);
}
819
// No SIMD available: forward to the portable implementation.
static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {
    grayA_to_RGBA_portable(dst, src, count);
}
823
// No SIMD available: forward to the portable implementation.
static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
    grayA_to_rgbA_portable(dst, src, count);
}
827
// No SIMD available: forward to the portable implementation.
static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
    inverted_CMYK_to_RGB1_portable(dst, src, count);
}
831
// No SIMD available: forward to the portable implementation.
static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
    inverted_CMYK_to_BGR1_portable(dst, src, count);
}
835
836 #endif
837
838 }
839
840 #endif // SkSwizzler_opts_DEFINED
841