/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkSwizzler_opts_DEFINED
#define SkSwizzler_opts_DEFINED

#include "SkColorData.h"

#include <utility>

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    #include <immintrin.h>
#elif defined(SK_ARM_HAS_NEON)
    #include <arm_neon.h>
#endif

namespace SK_OPTS_NS {

// Premultiply 8888 RGBA pixels in place-order: each color channel is scaled
// by alpha using the rounded divide (x*a+127)/255.  Channel order unchanged.
static void RGBA_to_rgbA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t a = src[i] >> 24,
                b = src[i] >> 16,
                g = src[i] >>  8,
                r = src[i] >>  0;
        b = (b*a+127)/255;
        g = (g*a+127)/255;
        r = (r*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)b << 16
               | (uint32_t)g <<  8
               | (uint32_t)r <<  0;
    }
}

// Premultiply and swap R <-> B: 8888 RGBA in, premultiplied BGRA out.
static void RGBA_to_bgrA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t a = src[i] >> 24,
                b = src[i] >> 16,
                g = src[i] >>  8,
                r = src[i] >>  0;
        b = (b*a+127)/255;
        g = (g*a+127)/255;
        r = (r*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)r << 16
               | (uint32_t)g <<  8
               | (uint32_t)b <<  0;
    }
}

// Swap R <-> B without premultiplying: 8888 RGBA in, unpremultiplied BGRA out.
static void RGBA_to_BGRA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t a = src[i] >> 24,
                b = src[i] >> 16,
                g = src[i] >>  8,
                r = src[i] >>  0;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)r << 16
               | (uint32_t)g <<  8
               | (uint32_t)b <<  0;
    }
}

// Expand packed 24-bit RGB (3 bytes per pixel) to 8888 with an opaque alpha.
static void RGB_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t r = src[0],
                g = src[1],
                b = src[2];
        src += 3;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)b    << 16
               | (uint32_t)g    <<  8
               | (uint32_t)r    <<  0;
    }
}

// Expand packed 24-bit RGB to 8888 with opaque alpha, swapping R <-> B.
static void RGB_to_BGR1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t r = src[0],
                g = src[1],
                b = src[2];
        src += 3;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)r    << 16
               | (uint32_t)g    <<  8
               | (uint32_t)b    <<  0;
    }
}

// Replicate an 8-bit gray value into all three color channels, opaque alpha.
static void gray_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        dst[i] = (uint32_t)0xFF    << 24
               | (uint32_t)src[i]  << 16
               | (uint32_t)src[i]  <<  8
               | (uint32_t)src[i]  <<  0;
    }
}

// Expand gray+alpha pairs (2 bytes per pixel) to unpremultiplied 8888.
static void grayA_to_RGBA_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t g = src[0],
                a = src[1];
        src += 2;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)g << 16
               | (uint32_t)g <<  8
               | (uint32_t)g <<  0;
    }
}

// Expand gray+alpha pairs to premultiplied 8888 (gray scaled by alpha).
static void grayA_to_rgbA_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t g = src[0],
                a = src[1];
        src += 2;
        g = (g*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)g << 16
               | (uint32_t)g <<  8
               | (uint32_t)g <<  0;
    }
}

// Convert inverted CMYK (as stored by JPEG/Adobe) to opaque RGBA.
static void inverted_CMYK_to_RGB1_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t k = src[i] >> 24,
                y = src[i] >> 16,
                m = src[i] >>  8,
                c = src[i] >>  0;
        // See comments in SkSwizzler.cpp for details on the conversion formula.
        uint8_t b = (y*k+127)/255,
                g = (m*k+127)/255,
                r = (c*k+127)/255;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t) b   << 16
               | (uint32_t) g   <<  8
               | (uint32_t) r   <<  0;
    }
}

// Same as above, but producing BGRA channel order.
static void inverted_CMYK_to_BGR1_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t k = src[i] >> 24,
                y = src[i] >> 16,
                m = src[i] >>  8,
                c = src[i] >>  0;
        uint8_t b = (y*k+127)/255,
                g = (m*k+127)/255,
                r = (c*k+127)/255;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t) r   << 16
               | (uint32_t) g   <<  8
               | (uint32_t) b   <<  0;
    }
}

#if defined(SK_ARM_HAS_NEON)

// Rounded divide by 255, (x + 127) / 255
static uint8x8_t div255_round(uint16x8_t x) {
    // result = (x + 127) / 255
    // result = (x + 127) / 256 + error1
    //
    // error1 = (x + 127) / (255 * 256)
    // error1 = (x + 127) / (256 * 256) + error2
    //
    // error2 = (x + 127) / (255 * 256 * 256)
    //
    // The maximum value of error2 is too small to matter.  Thus:
    // result = (x + 127) / 256 + (x + 127) / (256 * 256)
    // result = ((x + 127) / 256 + x + 127) / 256
    // result = ((x + 127) >> 8 + x + 127) >> 8
    //
    // Use >>> to represent "rounded right shift" which, conveniently,
    // NEON supports in one instruction.
    // result = ((x >>> 8) + x) >>> 8
    //
    // Note that the second right shift is actually performed as an
    // "add, round, and narrow back to 8-bits" instruction.
    return vraddhn_u16(x, vrshrq_n_u16(x, 8));
}

// Scale a byte by another, (x * y + 127) / 255
static uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
    return div255_round(vmull_u8(x, y));
}

// Shared NEON body for RGBA_to_rgbA / RGBA_to_bgrA: premultiply 8 pixels at
// a time, optionally swapping R and B on store.  Tail handled by portable code.
template <bool kSwapRB>
static void premul_should_swapRB(uint32_t* dst, const uint32_t* src, int count) {
    while (count >= 8) {
        // Load 8 pixels.
        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);

        uint8x8_t a = rgba.val[3],
                  b = rgba.val[2],
                  g = rgba.val[1],
                  r = rgba.val[0];

        // Premultiply.
        b = scale(b, a);
        g = scale(g, a);
        r = scale(r, a);

        // Store 8 premultiplied pixels.
        if (kSwapRB) {
            rgba.val[2] = r;
            rgba.val[1] = g;
            rgba.val[0] = b;
        } else {
            rgba.val[2] = b;
            rgba.val[1] = g;
            rgba.val[0] = r;
        }
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }

    // Call portable code to finish up the tail of [0,8) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}

/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB<false>(dst, src, count);
}

/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB<true>(dst, src, count);
}

// NEON R/B swap without premultiply, 16 then 8 pixels at a time.
/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    using std::swap;
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src);

        // Swap r and b.
        swap(rgba.val[0], rgba.val[2]);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);

        // Swap r and b.
        swap(rgba.val[0], rgba.val[2]);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}

// Shared NEON body for RGB_to_RGB1 / RGB_to_BGR1: expand 3-byte RGB to 8888
// with opaque alpha, optionally swapping R and B.  Tail handled portably.
template <bool kSwapRB>
static void insert_alpha_should_swaprb(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x3_t rgb = vld3q_u8(src);

        // Insert an opaque alpha channel and swap if needed.
        uint8x16x4_t rgba;
        if (kSwapRB) {
            rgba.val[0] = rgb.val[2];
            rgba.val[2] = rgb.val[0];
        } else {
            rgba.val[0] = rgb.val[0];
            rgba.val[2] = rgb.val[2];
        }
        rgba.val[1] = rgb.val[1];
        rgba.val[3] = vdupq_n_u8(0xFF);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16*3;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x3_t rgb = vld3_u8(src);

        // Insert an opaque alpha channel and swap if needed.
        uint8x8x4_t rgba;
        if (kSwapRB) {
            rgba.val[0] = rgb.val[2];
            rgba.val[2] = rgb.val[0];
        } else {
            rgba.val[0] = rgb.val[0];
            rgba.val[2] = rgb.val[2];
        }
        rgba.val[1] = rgb.val[1];
        rgba.val[3] = vdup_n_u8(0xFF);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8*3;
        dst += 8;
        count -= 8;
    }

    // Call portable code to finish up the tail of [0,8) pixels.
    auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
    proc(dst, src, count);
}

/*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    insert_alpha_should_swaprb<false>(dst, src, count);
}

/*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
    insert_alpha_should_swaprb<true>(dst, src, count);
}

// NEON gray expansion: broadcast each gray byte to R, G, B; opaque alpha.
/*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16_t gray = vld1q_u8(src);

        // Set each of the color channels.
        uint8x16x4_t rgba;
        rgba.val[0] = gray;
        rgba.val[1] = gray;
        rgba.val[2] = gray;
        rgba.val[3] = vdupq_n_u8(0xFF);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8_t gray = vld1_u8(src);

        // Set each of the color channels.
        uint8x8x4_t rgba;
        rgba.val[0] = gray;
        rgba.val[1] = gray;
        rgba.val[2] = gray;
        rgba.val[3] = vdup_n_u8(0xFF);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }

    gray_to_RGB1_portable(dst, src, count);
}

// Shared NEON body for grayA_to_RGBA / grayA_to_rgbA: expand gray+alpha to
// 8888, optionally premultiplying the gray channel.  Tail handled portably.
template <bool kPremul>
static void expand_grayA(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x2_t ga = vld2q_u8(src);

        // Premultiply if requested.
        if (kPremul) {
            ga.val[0] = vcombine_u8(
                    scale(vget_low_u8(ga.val[0]),  vget_low_u8(ga.val[1])),
                    scale(vget_high_u8(ga.val[0]), vget_high_u8(ga.val[1])));
        }

        // Set each of the color channels.
        uint8x16x4_t rgba;
        rgba.val[0] = ga.val[0];
        rgba.val[1] = ga.val[0];
        rgba.val[2] = ga.val[0];
        rgba.val[3] = ga.val[1];

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16*2;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x2_t ga = vld2_u8(src);

        // Premultiply if requested.
        if (kPremul) {
            ga.val[0] = scale(ga.val[0], ga.val[1]);
        }

        // Set each of the color channels.
        uint8x8x4_t rgba;
        rgba.val[0] = ga.val[0];
        rgba.val[1] = ga.val[0];
        rgba.val[2] = ga.val[0];
        rgba.val[3] = ga.val[1];

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8*2;
        dst += 8;
        count -= 8;
    }

    auto proc = kPremul ? grayA_to_rgbA_portable : grayA_to_RGBA_portable;
    proc(dst, src, count);
}

/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
    expand_grayA<false>(dst, src, count);
}

/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
    expand_grayA<true>(dst, src, count);
}

enum Format { kRGB1, kBGR1 };

// NEON inverted-CMYK conversion, 8 pixels at a time; tail handled portably.
template <Format format>
static void inverted_cmyk_to(uint32_t* dst, const uint32_t* src, int count) {
    while (count >= 8) {
        // Load 8 cmyk pixels.
        uint8x8x4_t pixels = vld4_u8((const uint8_t*) src);

        uint8x8_t k = pixels.val[3],
                  y = pixels.val[2],
                  m = pixels.val[1],
                  c = pixels.val[0];

        // Scale to r, g, b.
        uint8x8_t b = scale(y, k);
        uint8x8_t g = scale(m, k);
        uint8x8_t r = scale(c, k);

        // Store 8 rgba pixels.
        if (kBGR1 == format) {
            pixels.val[3] = vdup_n_u8(0xFF);
            pixels.val[2] = r;
            pixels.val[1] = g;
            pixels.val[0] = b;
        } else {
            pixels.val[3] = vdup_n_u8(0xFF);
            pixels.val[2] = b;
            pixels.val[1] = g;
            pixels.val[0] = r;
        }
        vst4_u8((uint8_t*) dst, pixels);
        src += 8;
        dst += 8;
        count -= 8;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to<kRGB1>(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to<kBGR1>(dst, src, count);
}

#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

// Scale a byte by another.
// Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
static __m128i scale(__m128i x, __m128i y) {
    const __m128i _128 = _mm_set1_epi16(128);
    const __m128i _257 = _mm_set1_epi16(257);

    // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
    return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257);
}

// Shared SSSE3 body for RGBA_to_rgbA / RGBA_to_bgrA: premultiply 8 pixels
// per iteration (two XMM registers), then 4, with the portable routine
// finishing the [0,4) tail.  R and B are optionally swapped via the shuffle.
template <bool kSwapRB>
static void premul_should_swapRB(uint32_t* dst, const uint32_t* src, int count) {

    auto premul8 = [](__m128i* lo, __m128i* hi) {
        const __m128i zeros = _mm_setzero_si128();
        __m128i planar;
        if (kSwapRB) {
            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm_shuffle_epi8(*lo, planar);                      // rrrrgggg bbbbaaaa
        *hi = _mm_shuffle_epi8(*hi, planar);                      // RRRRGGGG BBBBAAAA
        __m128i rg = _mm_unpacklo_epi32(*lo, *hi),                // rrrrRRRR ggggGGGG
                ba = _mm_unpackhi_epi32(*lo, *hi);                // bbbbBBBB aaaaAAAA

        // Unpack to 16-bit planar.
        __m128i r = _mm_unpacklo_epi8(rg, zeros),                 // r_r_r_r_ R_R_R_R_
                g = _mm_unpackhi_epi8(rg, zeros),                 // g_g_g_g_ G_G_G_G_
                b = _mm_unpacklo_epi8(ba, zeros),                 // b_b_b_b_ B_B_B_B_
                a = _mm_unpackhi_epi8(ba, zeros);                 // a_a_a_a_ A_A_A_A_

        // Premultiply!
        r = scale(r, a);
        g = scale(g, a);
        b = scale(b, a);

        // Repack into interlaced pixels.
        rg = _mm_or_si128(r, _mm_slli_epi16(g, 8));               // rgrgrgrg RGRGRGRG
        ba = _mm_or_si128(b, _mm_slli_epi16(a, 8));               // babababa BABABABA
        *lo = _mm_unpacklo_epi16(rg, ba);                         // rgbargba rgbargba
        *hi = _mm_unpackhi_epi16(rg, ba);                         // RGBARGBA RGBARGBA
    };

    while (count >= 8) {
        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
                hi = _mm_loadu_si128((const __m128i*) (src + 4));

        premul8(&lo, &hi);

        _mm_storeu_si128((__m128i*) (dst + 0), lo);
        _mm_storeu_si128((__m128i*) (dst + 4), hi);

        src += 8;
        dst += 8;
        count -= 8;
    }

    if (count >= 4) {
        __m128i lo = _mm_loadu_si128((const __m128i*) src),
                hi = _mm_setzero_si128();

        premul8(&lo, &hi);

        _mm_storeu_si128((__m128i*) dst, lo);

        src += 4;
        dst += 4;
        count -= 4;
    }

    // Call portable code to finish up the tail of [0,4) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}

/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB<false>(dst, src, count);
}

/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB<true>(dst, src, count);
}

// SSSE3 R/B swap without premultiply: one pshufb per 4 pixels.
/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);

    while (count >= 4) {
        __m128i rgba = _mm_loadu_si128((const __m128i*) src);
        __m128i bgra = _mm_shuffle_epi8(rgba, swapRB);
        _mm_storeu_si128((__m128i*) dst, bgra);

        src += 4;
        dst += 4;
        count -= 4;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}

// Shared SSSE3 body for RGB_to_RGB1 / RGB_to_BGR1: expand 3-byte RGB to 8888
// with opaque alpha, optionally swapping R and B.
template <bool kSwapRB>
static void insert_alpha_should_swaprb(uint32_t dst[], const uint8_t* src, int count) {
    const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
    __m128i expand;
    const uint8_t X = 0xFF; // Used as a placeholder.  The value of X is irrelevant.
    if (kSwapRB) {
        expand = _mm_setr_epi8(2,1,0,X, 5,4,3,X, 8,7,6,X, 11,10,9,X);
    } else {
        expand = _mm_setr_epi8(0,1,2,X, 3,4,5,X, 6,7,8,X, 9,10,11,X);
    }

    // NOTE: the loop consumes 4 pixels (12 bytes) per iteration but loads 16
    // bytes, so it requires count >= 6 to guarantee the load stays in bounds.
    while (count >= 6) {
        // Load a vector.  While this actually contains 5 pixels plus an
        // extra component, we will discard all but the first four pixels on
        // this iteration.
        __m128i rgb = _mm_loadu_si128((const __m128i*) src);

        // Expand the first four pixels to RGBX and then mask to RGB(FF).
        __m128i rgba = _mm_or_si128(_mm_shuffle_epi8(rgb, expand), alphaMask);

        // Store 4 pixels.
        _mm_storeu_si128((__m128i*) dst, rgba);

        src += 4*3;
        dst += 4;
        count -= 4;
    }

    // Call portable code to finish up the tail of [0,4) pixels.
    auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
    proc(dst, src, count);
}

/*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    insert_alpha_should_swaprb<false>(dst, src, count);
}

/*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
    insert_alpha_should_swaprb<true>(dst, src, count);
}

// SSSE3 gray expansion: interleave gray with itself and with 0xFF alphas to
// build GGGA pixels, 16 per iteration.
/*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    const __m128i alphas = _mm_set1_epi8((uint8_t) 0xFF);
    while (count >= 16) {
        __m128i grays = _mm_loadu_si128((const __m128i*) src);

        __m128i gg_lo = _mm_unpacklo_epi8(grays, grays);
        __m128i gg_hi = _mm_unpackhi_epi8(grays, grays);
        __m128i ga_lo = _mm_unpacklo_epi8(grays, alphas);
        __m128i ga_hi = _mm_unpackhi_epi8(grays, alphas);

        __m128i ggga0 = _mm_unpacklo_epi16(gg_lo, ga_lo);
        __m128i ggga1 = _mm_unpackhi_epi16(gg_lo, ga_lo);
        __m128i ggga2 = _mm_unpacklo_epi16(gg_hi, ga_hi);
        __m128i ggga3 = _mm_unpackhi_epi16(gg_hi, ga_hi);

        _mm_storeu_si128((__m128i*) (dst +  0), ggga0);
        _mm_storeu_si128((__m128i*) (dst +  4), ggga1);
        _mm_storeu_si128((__m128i*) (dst +  8), ggga2);
        _mm_storeu_si128((__m128i*) (dst + 12), ggga3);

        src += 16;
        dst += 16;
        count -= 16;
    }

    gray_to_RGB1_portable(dst, src, count);
}

// SSSE3 gray+alpha -> unpremultiplied 8888, 8 pixels per iteration.
/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 8) {
        __m128i ga = _mm_loadu_si128((const __m128i*) src);

        // Duplicate the gray byte into both bytes of each 16-bit lane.
        __m128i gg = _mm_or_si128(_mm_and_si128(ga, _mm_set1_epi16(0x00FF)),
                                  _mm_slli_epi16(ga, 8));

        __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
        __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);

        _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
        _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);

        src += 8*2;
        dst += 8;
        count -= 8;
    }

    grayA_to_RGBA_portable(dst, src, count);
}

// SSSE3 gray+alpha -> premultiplied 8888, 8 pixels per iteration.
/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 8) {
        __m128i grayA = _mm_loadu_si128((const __m128i*) src);

        // Split each 16-bit lane into gray (low byte) and alpha (high byte).
        __m128i g0 = _mm_and_si128(grayA, _mm_set1_epi16(0x00FF));
        __m128i a0 = _mm_srli_epi16(grayA, 8);

        // Premultiply
        g0 = scale(g0, a0);

        __m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8));
        __m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8));

        __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
        __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);

        _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
        _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);

        src += 8*2;
        dst += 8;
        count -= 8;
    }

    grayA_to_rgbA_portable(dst, src, count);
}

enum Format { kRGB1, kBGR1 };

// SSSE3 inverted-CMYK conversion, mirroring the premul8 planar approach:
// 8 pixels per main iteration, then 4, tail handled portably.
template <Format format>
static void inverted_cmyk_to(uint32_t* dst, const uint32_t* src, int count) {
    auto convert8 = [](__m128i* lo, __m128i* hi) {
        const __m128i zeros = _mm_setzero_si128();
        __m128i planar;
        if (kBGR1 == format) {
            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm_shuffle_epi8(*lo, planar);                      // ccccmmmm yyyykkkk
        *hi = _mm_shuffle_epi8(*hi, planar);                      // CCCCMMMM YYYYKKKK
        __m128i cm = _mm_unpacklo_epi32(*lo, *hi),                // ccccCCCC mmmmMMMM
                yk = _mm_unpackhi_epi32(*lo, *hi);                // yyyyYYYY kkkkKKKK

        // Unpack to 16-bit planar.
        __m128i c = _mm_unpacklo_epi8(cm, zeros),                 // c_c_c_c_ C_C_C_C_
                m = _mm_unpackhi_epi8(cm, zeros),                 // m_m_m_m_ M_M_M_M_
                y = _mm_unpacklo_epi8(yk, zeros),                 // y_y_y_y_ Y_Y_Y_Y_
                k = _mm_unpackhi_epi8(yk, zeros);                 // k_k_k_k_ K_K_K_K_

        // Scale to r, g, b.
        __m128i r = scale(c, k),
                g = scale(m, k),
                b = scale(y, k);

        // Repack into interlaced pixels.
        __m128i rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)),                      // rgrgrgrg RGRGRGRG
                ba = _mm_or_si128(b, _mm_set1_epi16((uint16_t) 0xFF00));         // b1b1b1b1 B1B1B1B1
        *lo = _mm_unpacklo_epi16(rg, ba);                                        // rgbargba rgbargba
        *hi = _mm_unpackhi_epi16(rg, ba);                                        // RGB1RGB1 RGB1RGB1
    };

    while (count >= 8) {
        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
                hi = _mm_loadu_si128((const __m128i*) (src + 4));

        convert8(&lo, &hi);

        _mm_storeu_si128((__m128i*) (dst + 0), lo);
        _mm_storeu_si128((__m128i*) (dst + 4), hi);

        src += 8;
        dst += 8;
        count -= 8;
    }

    if (count >= 4) {
        __m128i lo = _mm_loadu_si128((const __m128i*) src),
                hi = _mm_setzero_si128();

        convert8(&lo, &hi);

        _mm_storeu_si128((__m128i*) dst, lo);

        src += 4;
        dst += 4;
        count -= 4;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to<kRGB1>(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to<kBGR1>(dst, src, count);
}

#else
// No SIMD instruction set available: dispatch straight to the portable code.

/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    RGBA_to_rgbA_portable(dst, src, count);
}

/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    RGBA_to_bgrA_portable(dst, src, count);
}

/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    RGBA_to_BGRA_portable(dst, src, count);
}

/*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    RGB_to_RGB1_portable(dst, src, count);
}

/*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
    RGB_to_BGR1_portable(dst, src, count);
}

/*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    gray_to_RGB1_portable(dst, src, count);
}

/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
    grayA_to_RGBA_portable(dst, src, count);
}

/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
    grayA_to_rgbA_portable(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_CMYK_to_RGB1_portable(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_CMYK_to_BGR1_portable(dst, src, count);
}

#endif

}  // namespace SK_OPTS_NS

#endif // SkSwizzler_opts_DEFINED