1/* 2 * Copyright 2016 Google Inc. 3 * 4 * Use of this source code is governed by a BSD-style license that can 5 * be found in the LICENSE file. 6 * 7 */ 8 9// 10// 11// 12 13#include "tile.h" 14#include "block.h" 15#include "styling_types.h" 16#include "atomic_cl.h" 17#include "kernel_cl_12.h" 18 19// 20// 21// 22 23#define SKC_RENDER_SUBGROUP_MASK (SKC_RENDER_SUBGROUP_SIZE - 1) 24 25// 26// 27// 28 29#if ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 1 ) 30#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_1() 31#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 0 32 33#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 2 ) 34#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_2() 35#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 1 36 37#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 4 ) 38#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_4() 39#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 3 40 41#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 8 ) 42#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_8() 43#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 7 44 45#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 16) 46#define SKC_RENDER_SCANLINE_VECTOR_EXPAND() SKC_EXPAND_16() 47#define SKC_RENDER_SCANLINE_VECTOR_EXPAND_I_LAST 15 48#endif 49 50// 51// tile state flag bits 52// 53 54typedef enum skc_tile_flags_e { 55 56 // FLUSH 57 SKC_TILE_FLAGS_FLUSH_FINALIZE = 0x00000001, 58 SKC_TILE_FLAGS_FLUSH_UNWIND = 0x00000002, 59 SKC_TILE_FLAGS_FLUSH_COMPLETE = 0x00000004, 60 61 // OPACITY 62 SKC_TILE_FLAGS_SCATTER_SKIP = 0x00000008, 63 64 // 65 // Note: testing for opacity and skipping scattering is on its way 66 // to becoming a much more programmable option because sometimes we 67 // may be compositing/blending from back-to-front and/or be using 68 // group blend rules that ignore opacity. 69 // 70 // The point is that all of these decisions should be encoded in 71 // styling commands and, as much as possible, removed from the final 72 // group/layer styling traversal render loop. 73 // 74 75} skc_tile_flags_e; 76 77// 78// COVER -- assumes availability of either fp16 or fp32 79// 80 81union skc_tile_cover 82{ 83 struct { 84 SKC_RENDER_TILE_COVER c[SKC_TILE_WIDTH]; 85 } aN; 86 87#ifdef SKC_RENDER_TILE_COVER_VECTOR 88 struct { 89 SKC_RENDER_TILE_COVER_VECTOR c[SKC_RENDER_TILE_COVER_VECTOR_COUNT]; 90 } vN; 91#endif 92}; 93 94// 95// COLOR -- assumes availability of either fp16 or fp32 96// 97 98union skc_tile_color 99{ 100 union { 101 struct { 102 SKC_RENDER_TILE_COLOR r; 103 SKC_RENDER_TILE_COLOR g; 104 SKC_RENDER_TILE_COLOR b; 105 SKC_RENDER_TILE_COLOR a; 106 } rgba[SKC_TILE_WIDTH]; 107 } aN; 108 109#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED 110 union { 111 SKC_RENDER_TILE_COLOR_INTERLEAVED rgba[SKC_TILE_WIDTH]; 112 } iN; 113#endif 114 115#ifdef SKC_RENDER_TILE_COLOR_VECTOR 116 union { 117 SKC_RENDER_TILE_COLOR_VECTOR rgba[SKC_RENDER_TILE_COLOR_VECTOR_COUNT]; 118 } vN; 119#endif 120 121 struct { 122 union { 123 struct { 124 SKC_RENDER_TILE_COLOR r; 125 SKC_RENDER_TILE_COLOR g; 126 }; 127 SKC_RENDER_GRADIENT_FLOAT distance; 128 }; 129 union { 130 struct { 131 SKC_RENDER_TILE_COLOR b; 132 SKC_RENDER_TILE_COLOR a; 133 }; 134 SKC_RENDER_GRADIENT_FLOAT stoplerp; 135 }; 136 } grad[SKC_TILE_WIDTH]; 137}; 138 139// 140// SHARED MEMORY STATE 141// 142 143#define SKC_RENDER_TILE_SMEM_WORDS ((SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT) 144 145#define SKC_RENDER_WIDE_AA_BYTES (SKC_RENDER_TILE_SMEM_WORDS * sizeof(int) / SKC_RENDER_SUBGROUP_SIZE) 146#define SKC_RENDER_WIDE_AA_WIDTH (SKC_RENDER_WIDE_AA_BYTES / sizeof(SKC_RENDER_WIDE_AA)) 147 148// 149// 150// 151 152union skc_subgroup_smem 153{ 154 // 155 // The tiles are stored in column-major / height-major order 156 // 157 // The final column is a guard column that is OK to write to but 158 // will never be read. It simplifies the TTSB scatter but could be 159 // predicated if SMEM is really at a premium. 160 // 161#if ( SKC_RENDER_SUBGROUP_SIZE > 1 ) 162 struct { 163 SKC_ATOMIC_UINT area[SKC_RENDER_TILE_SMEM_WORDS]; // area[w][h] 164 } atomic; 165#endif 166 167 struct { 168 int area[SKC_RENDER_TILE_SMEM_WORDS]; // area[w][h] 169 } aN; 170 171 struct { // assumption is that height = subgroup 172 SKC_RENDER_AREA_V area[SKC_TILE_WIDTH + 1][SKC_RENDER_SUBGROUP_SIZE]; 173 } vN; 174 175 struct { // assumption is that height = subgroup 176 SKC_RENDER_WIDE_AA area[SKC_RENDER_WIDE_AA_WIDTH][SKC_RENDER_SUBGROUP_SIZE]; 177 } wide; 178 179 union skc_styling_cmd cmds[(SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT]; 180 181 half gc [(SKC_TILE_WIDTH + 1) * SKC_TILE_HEIGHT * 2]; 182 183#if 0 184 // 185 // SPILL TO GMEM 186 // 187#if (SKC_REGS_COLOR_S > 0) || (SKC_REGS_COVER_S > 0) 188 struct { 189 190#if (SKC_REGS_COLOR_S > 0) 191 union skc_color_r color[SKC_REGS_COLOR_S][SKC_TILE_HEIGHT][SKC_TILE_WIDTH]; 192#endif 193 194#if (SKC_REGS_COVER_S > 0) 195 union float cover[SKC_REGS_COVER_S][SKC_TILE_HEIGHT][SKC_TILE_WIDTH]; 196#endif 197 198 } regs; 199#endif 200 // 201 // 202 // 203#endif 204}; 205 206// 207// 208// 209 210#if ( SKC_RENDER_SUBGROUP_SIZE == 1 ) 211 212#define skc_subgroup_lane() 0 213 214#else 215 216#define skc_subgroup_lane() get_sub_group_local_id() 217 218#endif 219 220// 221// 222// 223 224typedef skc_uint skc_ttsk_lo_t; 225typedef skc_uint skc_ttsk_hi_t; 226 227typedef skc_uint skc_ttpk_lo_t; 228typedef skc_uint skc_ttpk_hi_t; 229 230typedef skc_uint skc_ttxk_lo_t; 231typedef skc_uint skc_ttxk_hi_t; 232 233typedef skc_uint skc_ttck_lo_t; 234typedef skc_uint skc_ttck_hi_t; 235 236typedef skc_uint2 skc_ttck_t; 237 238typedef skc_int skc_ttxb_t; 239 240// 241// TTCK (32-BIT COMPARE) v1: 242// 243// 0 63 244// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | 245// +----------------------+--------+--------+-------+-----+-----+ 246// | 30 | 1 | 1 | 18 | 7 | 7 | 247// 248// 249// TTCK (32-BIT COMPARE) v2: 250// 251// 0 63 252// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | 253// +----------------------+--------+--------+-------+-----+-----+ 254// | 30 | 1 | 1 | 15 | 9 | 8 | 255// 256// 257// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile: 258// 259// 0 63 260// | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER | X | Y | 261// +----------------------+--------+--------+-------+-----+-----+ 262// | 27 | 1 | 1 | 18 | 9 | 8 | 263// 264 265static 266skc_uint 267skc_ttck_lo_get_ttxb_id(skc_ttck_lo_t const a) 268{ 269 return a & SKC_TTCK_LO_MASK_ID; 270} 271 272static 273skc_layer_id 274skc_ttck_get_layer(skc_ttck_t const a) 275{ 276 // 277 // FIXME -- a union with a ulong and a shift down and mask is 278 // probably faster on some architectures 279 // 280 skc_uint const lo = (a.lo >> SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); 281 skc_uint const hi = (a.hi & SKC_TTCK_HI_MASK_LAYER) << SKC_TTCK_LO_BITS_LAYER; 282 283 return lo | hi; 284} 285 286static 287skc_uint 288skc_ttck_hi_get_x(skc_ttck_hi_t const a) 289{ 290 return SKC_BFE(a,SKC_TTCK_HI_BITS_X,SKC_TTCK_HI_OFFSET_X); 291} 292 293static 294skc_uint 295skc_ttck_hi_get_y(skc_ttck_hi_t const a) 296{ 297 return a >> SKC_TTCK_HI_OFFSET_Y; 298} 299 300static 301skc_bool 302skc_ttck_equal_yxl(skc_ttck_t const a, skc_ttck_t const b) 303{ 304 skc_uint const lo = (a.lo ^ b.lo) & SKC_BITS_TO_MASK_AT(SKC_TTCK_LO_BITS_LAYER,SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); 305 skc_uint const hi = (a.hi ^ b.hi); 306 307 return (lo | hi) == 0; 308} 309 310static 311skc_bool 312skc_ttck_hi_equal_yx(skc_ttck_hi_t const a, skc_ttck_hi_t const b) 313{ 314 return ((a ^ b) & SKC_TTCK_HI_MASK_YX) == 0; 315} 316 317static 318skc_bool 319skc_ttck_lo_is_prefix(skc_ttck_lo_t const a) 320{ 321 return (a & SKC_TTCK_LO_MASK_PREFIX) != 0; 322} 323 324// 325// TILE TRACE SUBPIXEL 326// 327// The subpixels are encoded with either absolute tile coordinates 328// (32-bits) or packed in delta-encoded form form. 329// 330// For 32-bit subpixel packing of a 32x32 tile: 331// 332// A tile X is encoded as: 333// 334// TX : 10 : unsigned min(x0,x1) tile subpixel coordinate. 335// 336// SX : 6 : unsigned subpixel span from min to max x with range 337// [0,32]. The original direction is not captured. Would 338// be nice to capture dx but not necessary right now but 339// could be in the future. <--- SPARE VALUES AVAILABLE 340// 341// A tile Y is encoded as: 342// 343// TY : 10 : unsigned min(y0,y1) tile subpixel coordinate. 344// 345// DY : 6 : signed subpixel delta y1-y0. The range of delta is 346// [-32,32] but horizontal lines are not encoded so [1,32] 347// is mapped to [0,31]. The resulting range [-32,31] fits 348// in 6 bits. 349// 350// TTS: 351// 352// 0 31 353// | TX | SX | TY | DY | 354// +-----+------+-----+------+ 355// | 10 | 6 | 10 | 6 | 356// 357 358static 359SKC_RENDER_TTS_V_BITFIELD 360skc_tts_get_ty_pixel_v(SKC_RENDER_TTS_V const a) 361{ 362 // 363 // extract the whole pixel y coordinate 364 // 365 return SKC_BFE(a, 366 SKC_TTS_BITS_TY - SKC_SUBPIXEL_RESL_Y_LOG2, 367 SKC_TTS_OFFSET_TY + SKC_SUBPIXEL_RESL_Y_LOG2); 368} 369 370static 371SKC_RENDER_TTS_V_BITFIELD 372skc_tts_get_xy_idx_v(SKC_RENDER_TTS_V const a) 373{ 374 // 375 // get the linear array tile index of the pixel 376 // 377 return (((a & SKC_TTS_MASK_TX_PIXEL) 378 379#if (SKC_SUBPIXEL_RESL_X_LOG2 > SKC_TILE_HEIGHT_LOG2) 380 >> (SKC_SUBPIXEL_RESL_X_LOG2 - SKC_TILE_HEIGHT_LOG2) 381#elif (SKC_SUBPIXEL_RESL_X_LOG2 < SKC_TILE_HEIGHT_LOG2) 382 << (SKC_TILE_HEIGHT_LOG2 - SKC_SUBPIXEL_RESL_X_LOG2) 383#endif 384 385 ) | skc_tts_get_ty_pixel_v(a)); 386} 387 388#if 0 389static 390skc_ttx_v_s32_t 391skc_tts_get_dy_v(SKC_RENDER_TTS_V const a) 392{ 393 skc_ttx_v_s32_t const dy = SKC_AS(skc_ttx_v_s32_t)a >> SKC_TTS_OFFSET_DY; 394 395 return (dy + SKC_AS(skc_ttx_v_s32_t)(~a >> 31)); 396} 397#else 398static 399SKC_RENDER_TTS_V_BITFIELD 400skc_tts_get_dy_v(SKC_RENDER_TTS_V const a) 401{ 402 SKC_RENDER_TTS_V_BITFIELD const dy = a >> SKC_TTS_OFFSET_DY; 403 404 return dy - (~a >> 31); 405} 406#endif 407 408static 409SKC_RENDER_TTS_V_BITFIELD 410skc_tts_get_tx_subpixel_v(SKC_RENDER_TTS_V const a) 411{ 412 return a & SKC_BITS_TO_MASK(SKC_SUBPIXEL_RESL_X_LOG2); 413} 414 415static 416SKC_RENDER_TTS_V_BITFIELD 417skc_tts_get_sx_v(SKC_RENDER_TTS_V const a) 418{ 419 return SKC_BFE(a,SKC_TTS_BITS_SX,SKC_TTS_OFFSET_SX); 420} 421 422// 423// 424// 425 426static 427void 428skc_tile_aa_zero(__local union skc_subgroup_smem * SKC_RESTRICT const smem) 429{ 430 // 431 // SIMD / CPU 432 // 433 // & 434 // 435 // SIMT / GPU 436 // 437 // Note that atomic_init() is likely implemented as a simple 438 // assignment so there is no identifiable performance difference on 439 // current targets. 440 // 441 // If such an architecture appears in the future then we'll probably 442 // still want to implement this zero'ing operation as below but 443 // follow with an appropriate fence that occurs before any scatter 444 // operations. 445 // 446 // The baroque expansion below improves performance on Intel GEN by, 447 // presumably, achieving the 64-byte per clock SLM write as well as 448 // minimizing the overall number of SEND() block initializations and 449 // launches. 450 // 451 // Intel GENx has a documented 64 byte per cycle SLM write limit. 452 // So having each lane in an 8 lane subgroup zero-write 8 bytes is 453 // probably a safe bet (Later: benchmarking backs this up!). 454 // 455 // Note there is no reason at this time to unroll this loop. 456 // 457 for (uint ii=0; ii<SKC_RENDER_WIDE_AA_WIDTH; ii++) 458 smem->wide.area[ii][skc_subgroup_lane()] = ( 0 ); 459} 460 461// 462// Note this is going to be vectorizable on most architectures. 463// 464// The return of the key translation feature might complicate things. 465// 466 467static 468void 469skc_scatter_ttpb(__global skc_ttxb_t const * SKC_RESTRICT const ttxb_extent, 470 __local union skc_subgroup_smem * SKC_RESTRICT const smem, 471 skc_block_id_t const pb_id) 472{ 473 skc_uint const offset = pb_id * (SKC_DEVICE_SUBBLOCK_WORDS / SKC_TILE_RATIO) + skc_subgroup_lane(); 474 475#if ( SKC_TILE_RATIO == 1 ) 476 477 SKC_RENDER_TTP_V const ttp_v = ttxb_extent[offset]; 478 479#elif ( SKC_TILE_RATIO == 2 ) 480 481 SKC_RENDER_TTP_V const ttp_v = vload2(offset,ttxb_extent); 482 483#else 484 485#error("tile ratio greater than 2 not supported") 486 487#endif 488 489 // 490 // Note there is no need to use an atomic for this operation on the 491 // current group of target platforms... but this may change if 492 // atomic ops truly go through a different path. 493 // 494 // As noted above, this direct increment is probably faster and can 495 // always be followed by a fence. 496 // 497 // Furthermore, note that the key sorting orders all ttck keys 498 // before ttpk keys. 499 // 500 501 // 502 // FIXME -- if the SMEM store is wider than bank word count then we 503 // might want to odd-even interleave the TTP values if the target 504 // device can't handle 64-bit stores 505 // 506 507 // 508 // skipping per-key translation for now 509 // 510 smem->vN.area[0][skc_subgroup_lane()] += ttp_v << (SKC_SUBPIXEL_RESL_X_LOG2 + 1); 511} 512 513// 514// Note that skc_scatter_ttsb is *not* vectorizable unless the 515// architecture supports a "scatter-add" capability. All relevant 516// GPUs support atomic add on shared/local memory and thus support 517// scatter-add. 518// 519 520static 521void 522skc_scatter_ttsb(__global skc_ttxb_t const * SKC_RESTRICT const ttxb_extent, 523 __local union skc_subgroup_smem * SKC_RESTRICT const smem, 524 skc_block_id_t const sb_id) 525{ 526 skc_uint const offset = sb_id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane(); 527 528 SKC_RENDER_TTS_V const tts_v = ttxb_extent[offset]; 529 530 // 531 // Skipping per-key translation for now 532 // 533 534 // Index into tile 535 // 536 // The tiles are stored in column-major / height-major order 537 // 538 // The final column is a guard column that is OK to write to but 539 // will never be read. It simplifies the TTSB scatter but could be 540 // predicated if SMEM is really at a premium. 541 // 542 543 SKC_RENDER_TTS_V_BITFIELD const xy_idx = skc_tts_get_xy_idx_v(tts_v); 544 545#if 0 546 if (tts_v != SKC_TTS_INVALID) 547 printf("(%08X) = %u\n",tts_v,xy_idx); 548#endif 549 550 // 551 // adjust subpixel range to max y 552 // 553 // range is stored as [-32,31] and when read [0,31] is mapped to 554 // [1,32] because a dy of 0 is not possible. 555 // 556 // more succinctly: if dy >= 0 then ++dy 557 // 558 SKC_RENDER_TTS_V_BITFIELD const dy = skc_tts_get_dy_v(tts_v); 559 560 // 561 // FIXME -- benchmark performance of setting dy to 0 if ttsv.vN is invalid? 562 // 563 564 // this "min(x0) * 2 + dx" is equivalent to "x0 + x1" 565 SKC_RENDER_TTS_V_BITFIELD const widths = skc_tts_get_tx_subpixel_v(tts_v) * 2 + skc_tts_get_sx_v(tts_v); 566 567 // Calculate left and right coverage contribution trapezoids 568 SKC_RENDER_TTS_V_BITFIELD const left = dy * widths; 569 SKC_RENDER_TTS_V_BITFIELD const right = (dy << (SKC_SUBPIXEL_RESL_X_LOG2 + 1)) - left; 570 571 // 572 // Accumulate altitudes and areas 573 // 574 // Optimization: if the device supports an CPU/SIMD vector-add or 575 // GPU/SIMT scatter-add atomic int2 add operation then placing the 576 // ALT and AREA values side-by-side would halve the number of 577 // additions. 578 // 579#if ( SKC_RENDER_SUBGROUP_SIZE == 1 ) 580 // 581 // CPU/SIMD 582 // 583#undef SKC_EXPAND_X 584#define SKC_EXPAND_X(I,S,C,P,A) \ 585 if (tts_v C != SKC_TTS_INVALID) { \ 586 smem->aN.area[SKC_TILE_HEIGHT + xy_idx C] += left C; \ 587 smem->aN.area[ xy_idx C] += right C; \ 588 } 589 590#else 591 // 592 // GPU/SIMT -- IMPLIES SUPPORT FOR ATOMIC SCATTER-ADD 593 // 594#undef SKC_EXPAND_X 595#define SKC_EXPAND_X(I,S,C,P,A) \ 596 if (tts_v C != SKC_TTS_INVALID) { \ 597 SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->atomic.area + \ 598 SKC_TILE_HEIGHT + xy_idx C, \ 599 left C); \ 600 SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->atomic.area + xy_idx C, \ 601 right C); \ 602 } 603#endif 604 605 SKC_RENDER_TTSB_EXPAND(); 606} 607 608// 609// Note that 2048.0 can be represented exactly with fp16... fortuitous! 610// 611 612#define SKC_RENDER_FILL_MAX_AREA (2u * SKC_SUBPIXEL_RESL_X * SKC_SUBPIXEL_RESL_Y) 613#define SKC_RENDER_FILL_MAX_AREA_2 (2u * SKC_RENDER_FILL_MAX_AREA) 614#define SKC_RENDER_FILL_EVEN_ODD_MASK (SKC_RENDER_FILL_MAX_AREA_2 - 1) 615#define SKC_RENDER_FILL_MAX_AREA_RCP_F32 (SKC_RENDER_TILE_COVER)(1.0f / SKC_RENDER_FILL_MAX_AREA) 616 617// 618// 619// 620 621static 622void 623skc_tile_cover_nonzero(__local union skc_subgroup_smem * SKC_RESTRICT const smem, 624 union skc_tile_cover * SKC_RESTRICT const cover, 625 union skc_tile_color * SKC_RESTRICT const color) 626{ 627 SKC_RENDER_ACC_COVER_INT area = 0; 628 629 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) // doesn't help on AVX2 630 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 631 { 632 area += smem->vN.area[ii][skc_subgroup_lane()]; 633 SKC_RENDER_ACC_COVER_UINT const trapabs = abs(area); 634 SKC_RENDER_TILE_COVER const nonzero = SKC_CONVERT(SKC_RENDER_TILE_COVER)(min(trapabs,SKC_RENDER_FILL_MAX_AREA)); 635 636 cover->aN.c[ii] = nonzero * (SKC_RENDER_TILE_COVER)(SKC_RENDER_FILL_MAX_AREA_RCP_F32); 637 } 638} 639 640static 641void 642skc_tile_cover_evenodd(__local union skc_subgroup_smem * SKC_RESTRICT const smem, 643 union skc_tile_cover * SKC_RESTRICT const cover, 644 union skc_tile_color * SKC_RESTRICT const color) 645{ 646 SKC_RENDER_ACC_COVER_INT area = 0; 647 648 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) // doesn't help on AVX2 649 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 650 { 651 area += smem->vN.area[ii][skc_subgroup_lane()]; 652 SKC_RENDER_ACC_COVER_UINT const trapabs = abs(area); 653 SKC_RENDER_ACC_COVER_UINT const reflect = abs(SKC_AS(SKC_RENDER_ACC_COVER_INT)((trapabs & SKC_RENDER_FILL_EVEN_ODD_MASK) - SKC_RENDER_FILL_MAX_AREA)); 654 655 cover->aN.c[ii] = SKC_CONVERT(SKC_RENDER_TILE_COVER)(SKC_RENDER_FILL_MAX_AREA - reflect) * (SKC_RENDER_TILE_COVER)SKC_RENDER_FILL_MAX_AREA_RCP_F32; 656 } 657} 658 659// 660// 661// 662 663static 664void 665skc_tile_color_fill_solid(__global union skc_styling_cmd const * SKC_RESTRICT const commands, 666 uint * SKC_RESTRICT const cmd_next, 667 union skc_tile_color * SKC_RESTRICT const color) 668{ 669 // 670 // rgba = solid fill 671 // 672 __global half const * const rgba_ptr = commands[*cmd_next].f16a2 + 0; 673 674 *cmd_next += 2; 675 676#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) 677 678 SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr); 679 680 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) 681 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 682 color->aN.rgba[ii].r = rg.lo; 683 684 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) 685 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 686 color->aN.rgba[ii].g = rg.hi; 687 688 SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr); 689 690 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) 691 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 692 color->aN.rgba[ii].b = ba.lo; 693 694 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) 695 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 696 color->aN.rgba[ii].a = ba.hi; 697 698#else 699 700 SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr); 701 SKC_RENDER_TILE_COLOR const r = rg.lo; 702 703 // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) 704 for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++) 705 color->vN.rgba[ii].even.even = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(r); 706 707 SKC_RENDER_TILE_COLOR const g = rg.hi; 708 709 // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) 710 for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++) 711 color->vN.rgba[ii].odd.even = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(g); 712 713 SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr); 714 SKC_RENDER_TILE_COLOR const b = ba.lo; 715 716 // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) 717 for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++) 718 color->vN.rgba[ii].even.odd = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(b); 719 720 SKC_RENDER_TILE_COLOR const a = ba.hi; 721 722 // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) 723 for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++) 724 color->vN.rgba[ii].odd.odd = SKC_AS(SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT)(a); 725 726#endif 727} 728 729// 730// Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++" 731// 732// https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/ 733// 734// Lerp in two fma/mad ops: 735// 736// t * b + ((-t) * a + a) 737// 738// Note: OpenCL documents mix() as being implemented as: 739// 740// a + (b - a) * t 741// 742// But this may be a native instruction on some devices. For example, 743// on GEN9 there is an LRP "linear interoplation" function but it 744// doesn't appear to support half floats. 745// 746 747#if 1 748#define SKC_LERP(a,b,t) mad(t,b,mad(-(t),a,a)) 749#else 750#define SKC_LERP(a,b,t) mix(a,b,t) 751#endif 752 753// 754// CPUs have a mock local address space so copying the gradient header 755// is probably not useful. Just read directly from global. 756// 757 758#ifndef SKC_RENDER_GRADIENT_IS_GLOBAL 759#define SKC_RENDER_GRADIENT_SPACE __local 760#else 761#define SKC_RENDER_GRADIENT_SPACE __global 762#endif 763 764// 765// gradient is non-vertical 766// 767// removed the vertical (actually, horizontal) special case 768// 769 770static 771void 772skc_tile_color_fill_gradient_linear_nonvertical(__local union skc_subgroup_smem * SKC_RESTRICT const smem, 773 __global union skc_styling_cmd const * SKC_RESTRICT const commands, 774 uint * SKC_RESTRICT const cmd_next, 775 union skc_tile_color * SKC_RESTRICT const color, 776 skc_ttck_hi_t const ttck_hi) 777{ 778 // 779 // Where is this tile? 780 // 781 // Note that the gradient is being sampled from pixel centers. 782 // 783 SKC_RENDER_GRADIENT_FLOAT const y = 784#undef SKC_EXPAND_X 785#define SKC_EXPAND_X(I,S,C,P,A) I##.5f P 786 (SKC_RENDER_GRADIENT_FLOAT)( SKC_RENDER_SCANLINE_VECTOR_EXPAND() ) + 787 (skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE)); 788 789 float const x = 0.5f + (skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH); 790 791 // 792 // Get starting numerator and denominator 793 // 794 // Note: if gh[0].dx is exactly 0.0f then this is a vertical 795 // gradient and can be handled by a special opcode. 796 // 797 // Note: the mad() ordering is slightly different than the original 798 // CUDA implementation. 799 // 800 union skc_gradient_vector const gv = { vload4(0,&commands[*cmd_next].f32) }; 801 802 *cmd_next += 4; 803 804 float const gv_x_dot = mad(x,gv.dx,gv.p0); 805 SKC_RENDER_GRADIENT_FLOAT const gv_numer = mad(y,gv.dy,gv_x_dot); 806 807 // 808 // Where are columns along gradient vector? 809 // 810 // TODO: Note that the gv_denom isn't multiplied through. 811 // 812 // Please doublecheck this... but I recall that in certain cases 813 // this wipes out some precision and results in minor but noticeable 814 // gradient artifacts. 815 // 816 // All arguments are scalars except gv_numer so a simpler 817 // evaluation might save some flops. 818 // 819 820 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) 821 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 822 color->grad[ii].distance = mad(gv.dx,(float)ii,gv_numer) * gv.denom; 823 824 // 825 // is gradient non-repeating, repeating or reflecting? 826 // 827 switch (commands[(*cmd_next)++].u32) 828 { 829 case SKC_STYLING_GRADIENT_TYPE_LINEAR_NON_REPEATING: 830 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) 831 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 832 color->grad[ii].distance = clamp(color->grad[ii].distance,0.0f,1.0f); 833 break; 834 835 case SKC_STYLING_GRADIENT_TYPE_LINEAR_REPEATING: 836 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) 837 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 838 color->grad[ii].distance -= floor(color->grad[ii].distance); 839 break; 840 841 default: // PXL_STYLING_GRADIENT_TYPE_LINEAR_REFLECTING 842 // 843 // OPTIMIZATION: Can this be done in fewer than ~4 ops? 844 // 845 // Note: OpenCL "rint()" is round-to-nearest-even integer! 846 // 847 // Note: the floor() "round to -inf" op is implemented in the 848 // GEN op 'FRC' so probably don't use trunc() when floor will 849 // suffice. 850 // 851 852 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) 853 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 854 { 855 SKC_RENDER_GRADIENT_FLOAT dist_abs = fabs(color->grad[ii].distance); 856 color->grad[ii].distance = fabs(dist_abs - rint(dist_abs)); 857 } 858 } 859 860 // 861 // initialize "stoplerp" for all columns 862 // 863 uint const slope_count = commands[(*cmd_next)++].u32; 864 uint const gd_n_v1 = commands[(*cmd_next)++].u32; // REMOVE ME 865 866 { 867 float const slope = commands[(*cmd_next)++].f32; 868 869 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) 870 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 871 color->grad[ii].stoplerp = color->grad[ii].distance * slope; 872 } 873 874 // 875 // compute stoplerp for remaining stops 876 // 877 for (int jj=1; jj<slope_count; jj++) 878 { 879 float const floor = (float)jj; 880 float const slope = commands[(*cmd_next)++].f32; 881 882 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) 883 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 884 color->grad[ii].stoplerp = mad(min(0, color->grad[ii].stoplerp - floor),slope,color->grad[ii].stoplerp); 885 } 886 887 // 888 // copy gradient colors to local memory 889 // 890 uint const gd_n = slope_count + 1; 891 892#ifndef SKC_RENDER_GRADIENT_IS_GLOBAL 893 // 894 // copy entire gradient descriptor to local memory 895 // 896 for (uint ii=skc_subgroup_lane(); ii<gd_n*4; ii+=SKC_RENDER_SUBGROUP_SIZE) 897 smem->cmds[ii].u32 = commands[*cmd_next + ii].u32; 898 899 __local half const * const SKC_RESTRICT gc = smem->gc + 0; 900#else 901 // 902 // prefetch entire gradient header 903 // 904 // no noticeable impact on performance 905 // 906 // prefetch(&commands[*cmd_next].u32,gh_words); 907 // 908 __global half const * const SKC_RESTRICT gc = commands[*cmd_next].f16a2 + 0; 909#endif 910 911 // 912 // adjust cmd_next so that V1 structure is consumed -- FIXME 913 // 914 *cmd_next += SKC_GRADIENT_CMD_WORDS_V2_ADJUST(gd_n_v1,gd_n); 915 916 // 917 // lerp between color pair stops 918 // 919 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) 920 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 921 { 922 // 923 // Finally, we have the gradient stop index and the color stop 924 // pair lerp fraction 925 // 926 // Note that if these are vector values then a gather operation 927 // must occur -- there may be platforms (AVX-512?) that can 928 // perform an explicit gather on a vector type but it's not 929 // really expressible in OpenCL except implicitly with a 930 // workgroup of work items. 931 // 932 // *********************** 933 // 934 // FIXME -- USE HERB'S SINGLE FMA LERP 935 // 936 // *********************** 937 // 938 SKC_RENDER_GRADIENT_STOP const gc_stop = SKC_CONVERT(SKC_RENDER_GRADIENT_STOP)(color->grad[ii].stoplerp); 939 SKC_RENDER_GRADIENT_FRAC const gc_frac = SKC_CONVERT(SKC_RENDER_GRADIENT_FRAC)(color->grad[ii].stoplerp - floor(color->grad[ii].stoplerp)); 940 941 { 942 SKC_RENDER_TILE_COLOR lo, hi; 943 944#undef SKC_EXPAND_X 945#define SKC_EXPAND_X(I,S,C,P,A) { \ 946 SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + 0,gc); \ 947 lo C = cc.lo; \ 948 hi C = cc.hi; \ 949 } 950 951 SKC_RENDER_SCANLINE_VECTOR_EXPAND(); 952 953 color->aN.rgba[ii].r = SKC_LERP(lo,hi,gc_frac); 954 } 955 956 // 957 // 958 // 959 { 960 SKC_RENDER_TILE_COLOR lo, hi; 961 962#undef SKC_EXPAND_X 963#define SKC_EXPAND_X(I,S,C,P,A) { \ 964 SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n,gc); \ 965 lo C = cc.lo; \ 966 hi C = cc.hi; \ 967 } 968 969 SKC_RENDER_SCANLINE_VECTOR_EXPAND(); 970 971 color->aN.rgba[ii].g = SKC_LERP(lo,hi,gc_frac); 972 } 973 974 // 975 // 976 // 977 { 978 SKC_RENDER_TILE_COLOR lo, hi; 979 980#undef SKC_EXPAND_X 981#define SKC_EXPAND_X(I,S,C,P,A) { \ 982 SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n*2,gc); \ 983 lo C = cc.lo; \ 984 hi C = cc.hi; \ 985 } 986 987 SKC_RENDER_SCANLINE_VECTOR_EXPAND(); 988 989 color->aN.rgba[ii].b = SKC_LERP(lo,hi,gc_frac); 990 } 991 992 // 993 // 994 // 995 { 996 SKC_RENDER_TILE_COLOR lo, hi; 997 998#undef SKC_EXPAND_X 999#define SKC_EXPAND_X(I,S,C,P,A) { \ 1000 SKC_RENDER_TILE_COLOR_PAIR const cc = SKC_RENDER_TILE_COLOR_PAIR_LOAD(gc_stop C + gd_n*3,gc); \ 1001 lo C = cc.lo; \ 1002 hi C = cc.hi; \ 1003 } 1004 1005 SKC_RENDER_SCANLINE_VECTOR_EXPAND(); 1006 1007 color->aN.rgba[ii].a = SKC_LERP(lo,hi,gc_frac); 1008 } 1009 } 1010} 1011 1012// 1013// 1014// 1015 1016static 1017void 1018skc_tile_blend_over(union skc_tile_color * SKC_RESTRICT const color_acc, 1019 union skc_tile_cover const * SKC_RESTRICT const cover_wip, 1020 union skc_tile_color const * SKC_RESTRICT const color_wip) 1021{ 1022 // 1023 // fralunco = cover.wip * acc.a 1024 // 1025 // acc.r = fralunco * wip.r + acc.r 1026 // acc.g = fralunco * wip.g + acc.g 1027 // acc.b = fralunco * wip.b + acc.b 1028 // acc.a = -fralunco * wip.a + acc.a 1029 // 1030 1031 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) 1032 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 1033 { 1034 SKC_RENDER_TILE_COVER const fralunco = cover_wip->aN.c[ii] * color_acc->aN.rgba[ii].a; 1035 1036 color_acc->aN.rgba[ii].r = mad(+fralunco,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r); 1037 color_acc->aN.rgba[ii].g = mad(+fralunco,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g); 1038 color_acc->aN.rgba[ii].b = mad(+fralunco,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b); 1039 color_acc->aN.rgba[ii].a = mad(-fralunco,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a); 1040 } 1041} 1042 1043// 1044// 1045// 1046 1047static 1048void 1049skc_tile_blend_plus(union skc_tile_color * SKC_RESTRICT const color_acc, 1050 union skc_tile_cover const * SKC_RESTRICT const cover_wip, 1051 union skc_tile_color const * SKC_RESTRICT const color_wip) 1052{ 1053 // 1054 // cover_min = min(cover.wip,a.acc) 1055 // 1056 // r.acc = cover_min * r.wip + r.acc 1057 // g.acc = cover_min * g.wip + g.acc 1058 // b.acc = cover_min * b.wip + b.acc 1059 // a.acc = -cover_min * a.wip + a.acc 1060 // 1061 1062 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) 1063 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 1064 { 1065 SKC_RENDER_TILE_COVER const cover_min = fmin(cover_wip->aN.c[ii],color_acc->aN.rgba[ii].a); 1066 1067 color_acc->aN.rgba[ii].r = mad(+cover_min,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r); 1068 color_acc->aN.rgba[ii].g = mad(+cover_min,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g); 1069 color_acc->aN.rgba[ii].b = mad(+cover_min,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b); 1070 color_acc->aN.rgba[ii].a = mad(-cover_min,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a); 1071 } 1072} 1073 1074// 1075// 1076// 1077 1078static 1079void 1080skc_tile_blend_multiply(union skc_tile_color * SKC_RESTRICT const color_acc, 1081 union skc_tile_cover const * SKC_RESTRICT const cover_wip, 1082 union skc_tile_color const * SKC_RESTRICT const color_wip) 1083{ 1084 // 1085 // r.acc = (cover.wip * r.wip) * r.acc 1086 // g.acc = (cover.wip * g.wip) * g.acc 1087 // b.acc = (cover.wip * b.wip) * b.acc 1088 // a.acc = (cover.wip * a.wip) * (1.0 - a.acc) <-- a.acc is already (1.0 - alpha) 1089 // 1090 1091 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) 1092 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 1093 { 1094 color_acc->aN.rgba[ii].r *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].r; 1095 color_acc->aN.rgba[ii].g *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].g; 1096 color_acc->aN.rgba[ii].b *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].b; 1097 color_acc->aN.rgba[ii].a *= cover_wip->aN.c[ii] * color_wip->aN.rgba[ii].a; 1098 } 1099} 1100 1101// 1102// 1103// 1104 1105static 1106void 1107skc_tile_blend_knockout(union skc_tile_cover * SKC_RESTRICT const cover_acc, 1108 union skc_tile_color * SKC_RESTRICT const color_acc, 1109 union skc_tile_cover const * SKC_RESTRICT const cover_wip, 1110 union skc_tile_color const * SKC_RESTRICT const color_wip) 1111{ 1112 // 1113 // cover.wip.contrib = (1.0 - cover.acc) * cover.wip 1114 // cover.acc = cover.acc + cover.wip.contrib 1115 // 1116 // r.acc = cover.wip.contrib * r.wip + r.acc 1117 // g.acc = cover.wip.contrib * g.wip + g.acc 1118 // b.acc = cover.wip.contrib * b.wip + b.acc 1119 // a.acc = -cover.wip.contrib * a.wip * a.acc 1120 // 1121 1122 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) 1123 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 1124 { 1125 SKC_RENDER_TILE_COVER const contrib = (1 - cover_acc->aN.c[ii]) * cover_wip->aN.c[ii]; 1126 1127 cover_acc->aN.c[ii] += contrib; 1128 1129 color_acc->aN.rgba[ii].r = mad(+contrib,color_wip->aN.rgba[ii].r,color_acc->aN.rgba[ii].r); 1130 color_acc->aN.rgba[ii].g = mad(+contrib,color_wip->aN.rgba[ii].g,color_acc->aN.rgba[ii].g); 1131 color_acc->aN.rgba[ii].b = mad(+contrib,color_wip->aN.rgba[ii].b,color_acc->aN.rgba[ii].b); 1132 color_acc->aN.rgba[ii].a = mad(-contrib,color_wip->aN.rgba[ii].a,color_acc->aN.rgba[ii].a); 1133 } 1134} 1135 1136// 1137// 1138// 1139 1140static 1141void 1142skc_tile_cover_msk_copy_wip(union skc_tile_cover * SKC_RESTRICT const cover_msk, 1143 union skc_tile_cover const * SKC_RESTRICT const cover_wip) 1144{ 1145#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 ) 1146 1147 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) 1148 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 1149 cover_msk->aN.c[ii] = cover_wip->aN.c[ii]; 1150 1151#else 1152 1153 // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) 1154 for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++) 1155 cover_msk->vN.c[ii] = cover_wip->vN.c[ii]; 1156 1157#endif 1158} 1159 1160// 1161// 1162// 1163 1164static 1165void 1166skc_tile_cover_msk_copy_acc(union skc_tile_cover * SKC_RESTRICT const cover_msk, 1167 union skc_tile_cover const * SKC_RESTRICT const cover_acc) 1168{ 1169#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 ) 1170 1171 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) 1172 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 1173 cover_msk->aN.c[ii] = cover_acc->aN.c[ii]; 1174 1175#else 1176 1177 // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNTN))) 1178 for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++) 1179 cover_msk->vN.c[ii] = cover_acc->vN.c[ii]; 1180 1181#endif 1182} 1183 1184// 1185// 1186// 1187 1188static 1189void 1190skc_tile_cover_accumulate(union skc_tile_cover * SKC_RESTRICT const cover_acc, 1191 union skc_tile_cover const * SKC_RESTRICT const cover_wip) 1192{ 1193 // 1194 // cover.wip.contrib = (1.0 - cover.acc) * cover.wip 1195 // cover.acc = cover.acc + cover.wip.contrib 1196 // 1197 1198 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) 1199 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 1200 cover_acc->aN.c[ii] = mad(1 - cover_acc->aN.c[ii],cover_wip->aN.c[ii],cover_acc->aN.c[ii]); 1201} 1202 1203// 1204// 1205// 1206 1207static 1208void 1209skc_tile_cover_wip_mask(union skc_tile_cover * SKC_RESTRICT const cover_wip, 1210 union skc_tile_cover const * SKC_RESTRICT const cover_msk) 1211{ 1212 // 1213 // cover.wip *= cover.msk 1214 // 1215 1216 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) 1217 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 1218 cover_wip->aN.c[ii] *= cover_msk->aN.c[ii]; 1219} 1220 1221// 1222// 1223// 1224 1225static 1226void 1227skc_tile_cover_wip_zero(union skc_tile_cover * SKC_RESTRICT const cover) 1228{ 1229#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) // || defined( SKC_ARCH_GEN9 ) 1230 1231 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) 1232 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 1233 cover->aN.c[ii] = 0; 1234 1235#else 1236 // 1237 // GEN9 compiler underperforms on this 1238 // 1239 1240 // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) 1241 for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++) 1242 cover->vN.c[ii] = 0; 1243 1244#endif 1245} 1246 1247static 1248void 1249skc_tile_cover_acc_zero(union skc_tile_cover * SKC_RESTRICT const cover) 1250{ 1251#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) // || defined( SKC_ARCH_GEN9 ) 1252 1253 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) 1254 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 1255 cover->aN.c[ii] = 0; 1256 1257#else 1258 // 1259 // GEN9 compiler underperforms on this 1260 // 1261 1262 // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) 1263 for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++) 1264 cover->vN.c[ii] = 0; 1265 1266#endif 1267} 1268 1269static 1270void 1271skc_tile_cover_msk_zero(union skc_tile_cover * SKC_RESTRICT const cover) 1272{ 1273#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 ) 1274 1275 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) 1276 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 1277 cover->aN.c[ii] = 0; 1278 1279#else 1280 // 1281 // GEN9 compiler underperforms on this 1282 // 1283 1284 // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) 1285 for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++) 1286 cover->vN.c[ii] = 0; 1287 1288#endif 1289} 1290 1291// 1292// 1293// 1294 1295static 1296void 1297skc_tile_cover_msk_one(union skc_tile_cover * SKC_RESTRICT const cover) 1298{ 1299#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 ) 1300 1301 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) 1302 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 1303 cover->aN.c[ii] = 1; 1304 1305#else 1306 // 1307 // GEN9 compiler underperforms on this 1308 // 1309 1310 // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) 1311 for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++) 1312 cover->vN.c[ii] = SKC_RENDER_TILE_COVER_VECTOR_ONE; 1313 1314#endif 1315} 1316 1317// 1318// 1319// 1320 1321static 1322void 1323skc_tile_cover_msk_invert(union skc_tile_cover * SKC_RESTRICT const cover) 1324{ 1325#if !defined( SKC_RENDER_TILE_COVER_VECTOR ) || defined( SKC_ARCH_GEN9 ) 1326 1327 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) 1328 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 1329 cover->aN.c[ii] = 1 - cover->aN.c[ii]; 1330 1331#else 1332 1333 // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COVER_VECTOR_COUNT))) 1334 for (uint ii=0; ii<SKC_RENDER_TILE_COVER_VECTOR_COUNT; ii++) 1335 cover->vN.c[ii] = 1 - cover->vN.c[ii]; 1336 1337#endif 1338} 1339 1340// 1341// 1342// 1343 1344static 1345void 1346skc_tile_color_wip_zero(union skc_tile_color * SKC_RESTRICT const color) 1347{ 1348#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) || defined( SKC_ARCH_GEN9 ) 1349 1350 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) 1351 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 1352 { 1353 color->aN.rgba[ii].r = 0; 1354 color->aN.rgba[ii].g = 0; 1355 color->aN.rgba[ii].b = 0; 1356 color->aN.rgba[ii].a = 1; 1357 } 1358 1359#else 1360 // 1361 // DISABLED ON GEN9 -- probably a compiler bug 1362 // 1363 // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) 1364 for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++) 1365 color->vN.rgba[ii].even.even = 0; 1366 1367 // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) 1368 for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++) 1369 color->vN.rgba[ii].odd.even = 0; 1370 1371 // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) 1372 for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++) 1373 color->vN.rgba[ii].even.odd = 0; 1374 1375 // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) 1376 for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++) 1377 color->vN.rgba[ii].odd.odd = 1; 1378#endif 1379} 1380 1381static 1382void 1383skc_tile_color_acc_zero(union skc_tile_color * SKC_RESTRICT const color) 1384{ 1385#if !defined( SKC_RENDER_TILE_COLOR_VECTOR ) || defined( SKC_ARCH_GEN9 ) 1386 1387 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) 1388 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 1389 { 1390 color->aN.rgba[ii].r = 0; 1391 color->aN.rgba[ii].g = 0; 1392 color->aN.rgba[ii].b = 0; 1393 color->aN.rgba[ii].a = 1; 1394 } 1395 1396#else 1397 // 1398 // DISABLED ON GEN9 -- probably a compiler bug 1399 // 1400 // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) 1401 for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++) 1402 color->vN.rgba[ii].even.even = 0; 1403 1404 // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) 1405 for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++) 1406 color->vN.rgba[ii].odd.even = 0; 1407 1408 // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) 1409 for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++) 1410 color->vN.rgba[ii].even.odd = 0; 1411 1412 // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT))) 1413 for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++) 1414 color->vN.rgba[ii].odd.odd = 1; 1415#endif 1416} 1417 1418// 1419// 1420// 1421 1422static 1423bool 1424skc_tile_color_test_opacity(union skc_tile_color const * SKC_RESTRICT const color) 1425{ 1426 // 1427 // returns true if tile is opaque 1428 // 1429 // various hacks to test for complete tile opacity 1430 // 1431 // note that front-to-back currently has alpha at 0.0f -- this can 1432 // be harmonized to use a traditional alpha if we want to support 1433 // rendering in either direction 1434 // 1435 // hack -- ADD/MAX/OR all alphas together and test for non-zero 1436 // 1437 SKC_RENDER_TILE_COLOR t = color->aN.rgba[0].a; 1438 1439 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH-1))) 1440 for (uint ii=1; ii<SKC_TILE_WIDTH; ii++) 1441 t += color->aN.rgba[ii].a; 1442 1443#if ( SKC_RENDER_SUBGROUP_SIZE == 1 ) 1444 // 1445 // SIMD 1446 // 1447 return !any(t != ( 0 )); 1448 1449#elif ( SKC_RENDER_SCANLINE_VECTOR_SIZE == 1 ) 1450 // 1451 // SIMT - scalar per lane 1452 // 1453 return !sub_group_any(t != 0); 1454 1455#else 1456 // 1457 // SIMT - vector per lane 1458 // 1459 return !sub_group_any(any(t != ( 0 ))); 1460 1461#endif 1462 1463 // 1464 // TODO: The alternative vector-per-lane implementation below is 1465 // *not* believed to be performant because the terse vector-wide 1466 // test is just hiding a series of comparisons and is likely worse 1467 // than the blind ADD/MAX/OR'ing of all alphas followed by a single 1468 // test. 1469 // 1470#if 0 1471 // 1472 // SIMT - vector per lane 1473 // 1474 1475 // __attribute__((opencl_unroll_hint(SKC_RENDER_TILE_COLOR_VECTOR_COUNT-1))) 1476 for (uint ii=0; ii<SKC_RENDER_TILE_COLOR_VECTOR_COUNT; ii++) 1477 { 1478 if (sub_group_any(any(color->vN.ba[ii].a != ( 0 )))) 1479 return false; 1480 } 1481 1482 return true; 1483#endif 1484} 1485 1486// 1487// 1488// 1489 1490static 1491void 1492skc_tile_background_over(__global union skc_styling_cmd const * SKC_RESTRICT const commands, 1493 uint * SKC_RESTRICT const cmd_next, 1494 union skc_tile_color * SKC_RESTRICT const color) 1495{ 1496 // 1497 // acc.r = acc.a * r + acc.r 1498 // acc.g = acc.a * g + acc.g 1499 // acc.b = acc.a * b + acc.b 1500 // 1501 __global half const * const rgba_ptr = commands[*cmd_next].f16a2 + 0; 1502 1503 *cmd_next += 2; 1504 1505 SKC_RENDER_TILE_COLOR_PAIR const rg = SKC_RENDER_TILE_COLOR_PAIR_LOAD(0,rgba_ptr); 1506 1507 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) 1508 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 1509 color->aN.rgba[ii].r = mad(color->aN.rgba[ii].a,rg.lo,color->aN.rgba[ii].r); 1510 1511 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) 1512 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 1513 color->aN.rgba[ii].g = mad(color->aN.rgba[ii].a,rg.hi,color->aN.rgba[ii].g); 1514 1515 SKC_RENDER_TILE_COLOR_PAIR const ba = SKC_RENDER_TILE_COLOR_PAIR_LOAD(1,rgba_ptr); 1516 1517 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) 1518 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 1519 color->aN.rgba[ii].b = mad(color->aN.rgba[ii].a,ba.lo,color->aN.rgba[ii].b); 1520} 1521 1522// 1523// 1524// 1525 1526// #define SKC_SURFACE_IS_BUFFER 1527#ifdef SKC_SURFACE_IS_BUFFER 1528 1529static 1530void 1531skc_surface_composite_u8_rgba(__global SKC_RENDER_SURFACE_U8_RGBA * SKC_RESTRICT const surface, 1532 skc_uint const surface_pitch, 1533 union skc_tile_color const * SKC_RESTRICT const color, 1534 skc_ttck_hi_t const ttck_hi) 1535{ 1536 // 1537 // NEW MAJOR OPTIMIZATION: 1538 // 1539 // Rotating and rasterizing the original world transform by -90 1540 // degrees and then rendering the scene scene by +90 degrees enables 1541 // all the final surface composite to be perfomed in perfectly 1542 // coalesced wide transactions. 1543 // 1544 // For this reason, linear access to the framebuffer is preferred. 1545 // 1546 // vvvvvvvvvvvv OLD NOTE BELOW vvvvvvvvvvvvv 1547 // 1548 // NOTE THIS IS TRANSPOSED BY 90 DEGREES 1549 // 1550 // INTEL HAS A "BLOCK STORE" FEATURE THAT SOLVES THIS AND TEXTURE 1551 // CACHES ARE ALSO PROBABLY SOMEWHAT FORGIVING. 1552 // 1553 // IT'S EASY TO TRANSPOSE THIS IN SMEM BEFORE STORING BUT IN THIS 1554 // CPU EXAMPLE WE CAN PROBABLY DO WELL BY JUST WRITING OUT SCALARS 1555 // 1556 // FIXME -- NEED TO HARMONIZE BYTE AND COMPONENT COLOR CHANNEL 1557 // ORDERING SO THAT COLOR CHANNELS MATCH 0xAARRGGBBAA ORDER 1558 // 1559 uint const pitch = surface_pitch / SKC_RENDER_SCANLINE_VECTOR_SIZE; 1560 uint const x = skc_ttck_hi_get_x(ttck_hi); 1561 uint const y = skc_ttck_hi_get_y(ttck_hi) ; 1562 uint const base = x * SKC_TILE_WIDTH * pitch + y * (SKC_TILE_HEIGHT / SKC_RENDER_SCANLINE_VECTOR_SIZE) + skc_subgroup_lane(); 1563 1564 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) 1565 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 1566 { 1567 SKC_RENDER_SURFACE_U8_RGBA rgba = ( 0xFF000000 ); 1568 1569 rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].r * 255); 1570 rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].g * 255) << 8; 1571 rgba |= SKC_CONVERT(SKC_RENDER_SURFACE_U8_RGBA)(color->aN.rgba[ii].b * 255) << 16; 1572 1573 surface[base + ii * pitch] = rgba; 1574 1575 // printf("%08v2X\n",rgba); 1576 } 1577} 1578 1579#else 1580 1581static 1582void 1583skc_surface_composite_u8_rgba(__write_only image2d_t surface, 1584 union skc_tile_color const * SKC_RESTRICT const color, 1585 skc_ttck_hi_t const ttck_hi) 1586{ 1587 // 1588 // NEW MAJOR OPTIMIZATION: 1589 // 1590 // Rotating and rasterizing the original world transform by -90 1591 // degrees and then rendering the scene scene by +90 degrees enables 1592 // all the final surface composite to be perfomed in perfectly 1593 // coalesced wide transactions. 1594 // 1595 // For this reason, linear access to the framebuffer is preferred. 1596 // 1597 // vvvvvvvvvvvv OLD NOTE BELOW vvvvvvvvvvvvv 1598 // 1599 // NOTE THIS IS TRANSPOSED BY 90 DEGREES 1600 // 1601 // INTEL HAS A "BLOCK STORE" FEATURE THAT SOLVES THIS AND TEXTURE 1602 // CACHES ARE ALSO PROBABLY SOMEWHAT FORGIVING. 1603 // 1604 // IT'S EASY TO TRANSPOSE THIS IN SMEM BEFORE STORING BUT IN THIS 1605 // CPU EXAMPLE WE CAN PROBABLY DO WELL BY JUST WRITING OUT SCALARS 1606 // 1607 // FIXME -- NEED TO HARMONIZE BYTE AND COMPONENT COLOR CHANNEL 1608 // ORDERING SO THAT COLOR CHANNELS MATCH 0xAARRGGBBAA ORDER 1609 // 1610 1611#if 1 1612 int x = skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH; 1613 int y = skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE); 1614 1615 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) 1616 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 1617 { 1618#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED 1619 1620#undef SKC_EXPAND_X 1621#define SKC_EXPAND_X(I,S,C,P,A) { \ 1622 SKC_RENDER_SURFACE_WRITE(surface, \ 1623 (int2)(x,y+I), \ 1624 color->iN.rgba[ii] A); \ 1625 } 1626 1627#else 1628 1629#undef SKC_EXPAND_X 1630#define SKC_EXPAND_X(I,S,C,P,A) { \ 1631 SKC_RENDER_SURFACE_COLOR const rgba = \ 1632 (SKC_RENDER_SURFACE_COLOR) \ 1633 (color->aN.rgba[ii].r C, \ 1634 color->aN.rgba[ii].g C, \ 1635 color->aN.rgba[ii].b C, \ 1636 1.0); \ 1637 SKC_RENDER_SURFACE_WRITE(surface,(int2)(x,y+I),rgba); \ 1638 } 1639 1640#endif 1641 1642 SKC_RENDER_SCANLINE_VECTOR_EXPAND(); 1643 1644 x += 1; 1645 } 1646#else 1647 int x = skc_ttck_hi_get_y(ttck_hi) * SKC_TILE_HEIGHT + (skc_subgroup_lane() * SKC_RENDER_SCANLINE_VECTOR_SIZE); 1648 int y = skc_ttck_hi_get_x(ttck_hi) * SKC_TILE_WIDTH; 1649 1650 // __attribute__((opencl_unroll_hint(SKC_TILE_WIDTH))) 1651 for (uint ii=0; ii<SKC_TILE_WIDTH; ii++) 1652 { 1653#ifdef SKC_RENDER_TILE_COLOR_INTERLEAVED 1654 1655#undef SKC_EXPAND_X 1656#define SKC_EXPAND_X(I,S,C,P,A) { \ 1657 SKC_RENDER_SURFACE_WRITE(surface, \ 1658 (int2)(x+I,y+ii), \ 1659 color->iN.rgba[ii] A); \ 1660 } 1661 1662#else 1663 1664#undef SKC_EXPAND_X 1665#define SKC_EXPAND_X(I,S,C,P,A) { \ 1666 SKC_RENDER_SURFACE_COLOR const rgba = \ 1667 (SKC_RENDER_SURFACE_COLOR) \ 1668 (color->aN.rgba[ii].r C, \ 1669 color->aN.rgba[ii].g C, \ 1670 color->aN.rgba[ii].b C, \ 1671 1.0); \ 1672 SKC_RENDER_SURFACE_WRITE(surface,(int2)(x+I,y+ii),rgba); \ 1673 } 1674 1675#endif 1676 1677 SKC_RENDER_SCANLINE_VECTOR_EXPAND(); 1678 } 1679 1680#endif 1681} 1682 1683#endif 1684 1685// 1686// 1687// 1688static 1689uint const 1690skc_ttck_lane(uint const ttck_idx) 1691{ 1692 return ttck_idx & SKC_RENDER_SUBGROUP_MASK; 1693} 1694 1695// 1696// RENDER KERNEL 1697// 1698 1699__kernel 1700SKC_RENDER_KERNEL_ATTRIBS 1701void 1702skc_kernel_render(__global union skc_layer_node const * SKC_RESTRICT const layers, 1703 __global struct skc_group_node const * SKC_RESTRICT const groups, 1704 __global union skc_styling_cmd const * SKC_RESTRICT const commands, // FIXME -- rename 1705 1706 __global skc_ttck_t const * SKC_RESTRICT const ttck_keys, // rename: keys 1707 skc_uint const ttck_count, // rename: key_count 1708 1709 __global uint const * SKC_RESTRICT const ttck_offsets, // rename: offsets 1710 skc_uint const tile_count, // rename: offset_count 1711 1712 __global skc_ttxb_t const * SKC_RESTRICT const ttxb_extent, 1713#ifdef SKC_SURFACE_IS_BUFFER 1714 __global void * SKC_RESTRICT const surface, 1715#else 1716 __write_only image2d_t surface, 1717#endif 1718#ifdef SKC_SURFACE_IS_BUFFER 1719 skc_uint const surface_pitch, 1720#endif 1721 uint4 const tile_clip) // rename: clip 1722{ 1723 // 1724 // Each subgroup is responsible for a tile. No extra subgroups are 1725 // launched. 1726 // 1727 // FIXME -- might be better implemented as a "grid stride loop" if 1728 // Intel GEN really has a local memory "quantum" of 4KB which means 1729 // we would need to launch 4 subgroups per workgroup. 1730 // 1731 // Confirmed: GEN8 has 4KB SLM workgroup min while GEN9 is 1KB. 1732 // 1733 1734 // 1735 // declare tile cover and color registers 1736 // 1737 // this used to be a neat unified struct but the Intel GEN compiler 1738 // wasn't cooperating and spilling to private memory even though all 1739 // registers were indexed by constants 1740 // 1741 union skc_tile_color color_wip; 1742 union skc_tile_color color_acc; 1743 1744 union skc_tile_cover cover_wip; 1745 union skc_tile_cover cover_acc; 1746 union skc_tile_cover cover_msk; 1747 1748 // 1749 // which subgroup in the grid is this? 1750 // 1751 // TAKE NOTE: the Intel GEN compiler is recognizing get_group_id(0) 1752 // as a uniform but the alternative calculation used when there are 1753 // multiple subgroups per workgroup is not cooperating and 1754 // driving spillage elsewhere. 1755 // 1756#if ( SKC_RENDER_WORKGROUP_SUBGROUPS == 1 ) 1757 skc_uint const ttck_offset_idx = get_group_id(0); 1758#else 1759 skc_uint const ttck_offset_idx = get_group_id(0) * SKC_RENDER_WORKGROUP_SUBGROUPS + get_sub_group_id(); 1760#endif 1761 1762 // 1763 // load the starting ttck for this offset and get a bound on the max 1764 // number of keys that might be loaded 1765 // 1766 // these are uniform across all subgroup lanes 1767 // 1768 skc_uint ttck_idx = ttck_offsets[ttck_offset_idx]; 1769 1770 // 1771 // FIXME -- SIMD/CPU version should probaby load a 256-bit (4-wide) 1772 // vector of ttck keys 1773 // 1774#ifndef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK 1775 1776 skc_ttck_t ttck = ttck_keys[ttck_idx]; 1777 1778#else 1779 1780 uint const ttck_base = ttck_idx & ~SKC_RENDER_SUBGROUP_MASK; 1781 uint const ttck_lane = ttck_idx & SKC_RENDER_SUBGROUP_MASK; 1782 skc_ttck_t ttck_s = ttck_keys[min(ttck_base+max(get_sub_group_local_id(),ttck_lane),ttck_count-1)] 1783 1784#endif 1785 1786 // 1787 // set up style group/layer state 1788 // 1789 struct skc_styling_group { 1790 union skc_group_range range; 1791 skc_uint depth; 1792 skc_uint id; 1793 } group; 1794 1795 group.range.lo = 0; 1796 group.range.hi = SKC_UINT_MAX; 1797 group.depth = SKC_UINT_MAX; 1798 group.id = SKC_UINT_MAX; 1799 1800 // 1801 // start with clear tile opacity, knockout and flag bits 1802 // 1803 // uint color_acc_opacity = 0; // per lane bit mask -- assumes a PIXEL_TILE_HEIGHT <= 32 1804 // uint cover_acc_knockout = 0; // per lane bit mask -- assumes a PIXEL_TILE_HEIGHT <= 32 1805 // 1806 skc_uint flags = 0; 1807 1808 // 1809 // declare and initialize accumulators 1810 // 1811#if ( SKC_RENDER_WORKGROUP_SUBGROUPS == 1 ) 1812 __local union skc_subgroup_smem smem[1]; 1813#else 1814 __local union skc_subgroup_smem smem_wg[SKC_RENDER_WORKGROUP_SUBGROUPS]; 1815 __local union skc_subgroup_smem * SKC_RESTRICT const smem = smem_wg + get_sub_group_id(); 1816#endif 1817 1818#ifdef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK 1819 // 1820 // select the initial ttck key 1821 // 1822 skc_ttck_t ttck; 1823#if 0 1824 ttck = sub_group_broadcast(ttck_s,ttck_lane); // SHOULD WORK BUT .4454 COMPILER IS BROKEN 1825#else 1826 ttck.lo = sub_group_broadcast(ttck_s.lo,ttck_lane); // EXPLICIT WORKAROUND 1827 ttck.hi = sub_group_broadcast(ttck_s.hi,ttck_lane); 1828#endif 1829 1830#endif 1831 1832 // 1833 // save the first key so we know what tile we're in 1834 // 1835 skc_ttck_t ttck0 = ttck; 1836 1837 // 1838 // evaluate the coarse clip as late as possible 1839 // 1840 skc_uint const ttck_hi_x = skc_ttck_hi_get_x(ttck0.hi); 1841 1842 if ((ttck_hi_x < tile_clip.lo.x) || (ttck_hi_x >= tile_clip.hi.x)) 1843 return; 1844 1845 skc_uint const ttck_hi_y = skc_ttck_hi_get_y(ttck0.hi); 1846 1847 if ((ttck_hi_y < tile_clip.lo.y) || (ttck_hi_y >= tile_clip.hi.y)) 1848 return; 1849 1850#if 0 1851 printf("< %u, %u >\n",ttck_hi_x,ttck_hi_y); 1852#endif 1853 1854 // 1855 // load -> scatter -> flush 1856 // 1857 while (true) 1858 { 1859 // if scattering is disabled then just run through ttck keys 1860 bool const is_scatter_enabled = (flags & SKC_TILE_FLAGS_SCATTER_SKIP) == 0; 1861 1862 // need to clear accumulators before a scatter loop 1863 if (is_scatter_enabled) 1864 { 1865 skc_tile_aa_zero(smem); 1866 } 1867 1868 do { 1869 // skip scattering? 1870 if (is_scatter_enabled) 1871 { 1872 skc_block_id_t const xb_id = skc_ttck_lo_get_ttxb_id(ttck.lo); 1873 1874 if (skc_ttck_lo_is_prefix(ttck.lo)) { 1875 skc_scatter_ttpb(ttxb_extent,smem,xb_id); 1876 } else { 1877 skc_scatter_ttsb(ttxb_extent,smem,xb_id); 1878 } 1879 } 1880 1881 // 1882 // any ttck keys left? 1883 // 1884 if (++ttck_idx >= ttck_count) 1885 { 1886 flags |= SKC_TILE_FLAGS_FLUSH_FINALIZE; 1887 break; 1888 } 1889 1890 // 1891 // process next ttck key 1892 // 1893#ifndef SKC_TARGET_ARCH_COALESCED_LOAD_TTCK 1894 // 1895 // SIMD -- read next key 1896 // 1897 ttck = ttck_keys[ttck_idx]; 1898#else 1899 // 1900 // SIMT -- refresh the ttck_s? 1901 // 1902 uint const ttck_lane_next = ttck_idx & SKC_RENDER_SUBGROUP_MASK; 1903 1904 if (ttck_lane_next == 0) 1905 ttck_s = ttck_keys[min(ttck_idx+get_sub_group_local_id(),ttck_count-1)]; 1906 1907 // 1908 // broadcast next key to entire subgroup 1909 // 1910#if 0 1911 ttck = sub_group_broadcast(ttck_s,ttck_lane_next); // SHOULD WORK BUT .4454 COMPILER IS BROKEN 1912#else 1913 ttck.lo = sub_group_broadcast(ttck_s.lo,ttck_lane_next); // EXPLICIT WORKAROUND 1914 ttck.hi = sub_group_broadcast(ttck_s.hi,ttck_lane_next); 1915#endif 1916#endif 1917 // continue scattering if on same YXL layer 1918 } while (skc_ttck_equal_yxl(ttck0,ttck)); 1919 1920 // finalize if no longer on same YX tile 1921 if (!skc_ttck_hi_equal_yx(ttck0.hi,ttck.hi)) 1922 { 1923 // otherwise, unwind the tile styling and exit 1924 flags |= SKC_TILE_FLAGS_FLUSH_FINALIZE; 1925 } 1926 1927 // 1928 // given: new layer id from ttxk key 1929 // 1930 // load [layer id]{ group id, depth } 1931 // 1932 // if within current group's layer range 1933 // 1934 // if at same depth 1935 // 1936 // load and execute cover>[mask>]color>blend commands 1937 // 1938 // else if not at same depth then move deeper 1939 // 1940 // for all groups in group trail from cur depth to new depth 1941 // enter group, saving and initializing regs as necessary 1942 // increment depth and update layer range 1943 // load and execute cover>[mask>]color>blend commands 1944 // 1945 // else not within layer range 1946 // 1947 // exit current group, restoring regs as necessary 1948 // decrement depth and update layer range 1949 // 1950 // 1951 skc_layer_id const layer_id_new = skc_ttck_get_layer(ttck0); // FIXME -- this was ttck_hi 1952 union skc_layer_node const layer_node_new = layers[layer_id_new]; 1953 1954 // clear flag that controls group/layer traversal 1955 flags &= ~SKC_TILE_FLAGS_FLUSH_COMPLETE; 1956 1957 do { 1958 bool const unwind = (flags & SKC_TILE_FLAGS_FLUSH_UNWIND) != 0; 1959 1960 // 1961 // is layer a child of the current parent group? 1962 // 1963 uint cmd_next = 0; 1964 1965 if (!unwind && (layer_node_new.parent == group.id)) 1966 { 1967 // execute this layer's cmds 1968 cmd_next = layer_node_new.cmds; 1969 1970 // if this is final then configure so groups get unwound, otherwise we're done 1971 flags |= ((flags & SKC_TILE_FLAGS_FLUSH_FINALIZE) 1972 ? SKC_TILE_FLAGS_FLUSH_UNWIND 1973 : SKC_TILE_FLAGS_FLUSH_COMPLETE); 1974 } 1975 else if (!unwind && (layer_id_new >= group.range.lo && layer_id_new <= group.range.hi)) 1976 { 1977 // 1978 // is layer in a child group? 1979 // 1980 union skc_group_parents const gp = groups[layer_node_new.parent].parents; 1981 uint const gn = gp.depth - ++group.depth; 1982 1983 if (gn == 0) 1984 group.id = layer_node_new.parent; 1985 else 1986 group.id = commands[gp.base + gn - 1].parent; 1987 1988 // update group layer range 1989 group.range = groups[group.id].range; 1990 1991 // enter current group 1992 cmd_next = groups[group.id].cmds.enter; 1993 } 1994 else // otherwise, exit this group 1995 { 1996 // enter current group 1997 cmd_next = groups[group.id].cmds.leave; 1998 1999 // decrement group depth 2000 if (--group.depth == SKC_UINT_MAX) 2001 { 2002 flags |= SKC_TILE_FLAGS_FLUSH_COMPLETE; 2003 } 2004 else 2005 { 2006 // get path_base of current group 2007 uint const gnpb = groups[group.id].parents.base; 2008 2009 // get parent of current group 2010 group.id = commands[gnpb].parent; 2011 2012 // update group layer range 2013 group.range = groups[group.id].range; 2014 } 2015 } 2016 2017 // 2018 // execute cmds 2019 // 2020 while (true) 2021 { 2022 union skc_styling_cmd const cmd = commands[cmd_next++]; 2023 2024 switch (cmd.u32 & SKC_STYLING_OPCODE_MASK_OPCODE) 2025 { 2026 case SKC_STYLING_OPCODE_NOOP: 2027 break; 2028 2029 case SKC_STYLING_OPCODE_COVER_NONZERO: 2030 skc_tile_cover_nonzero(smem,&cover_wip,&color_wip); 2031 break; 2032 2033 case SKC_STYLING_OPCODE_COVER_EVENODD: 2034 skc_tile_cover_evenodd(smem,&cover_wip,&color_wip); 2035 break; 2036 2037 case SKC_STYLING_OPCODE_COVER_ACCUMULATE: 2038 skc_tile_cover_accumulate(&cover_acc,&cover_wip); 2039 break; 2040 2041 case SKC_STYLING_OPCODE_COVER_MASK: 2042 skc_tile_cover_wip_mask(&cover_wip,&cover_msk); 2043 break; 2044 2045 case SKC_STYLING_OPCODE_COVER_WIP_ZERO: 2046 skc_tile_cover_wip_zero(&cover_wip); 2047 break; 2048 2049 case SKC_STYLING_OPCODE_COVER_ACC_ZERO: 2050 skc_tile_cover_acc_zero(&cover_acc); 2051 break; 2052 2053 case SKC_STYLING_OPCODE_COVER_MASK_ZERO: 2054 skc_tile_cover_msk_zero(&cover_msk); 2055 break; 2056 2057 case SKC_STYLING_OPCODE_COVER_MASK_ONE: 2058 skc_tile_cover_msk_one(&cover_msk); 2059 break; 2060 2061 case SKC_STYLING_OPCODE_COVER_MASK_INVERT: 2062 skc_tile_cover_msk_invert(&cover_msk); 2063 break; 2064 2065 case SKC_STYLING_OPCODE_COLOR_FILL_SOLID: 2066 skc_tile_color_fill_solid(commands,&cmd_next,&color_wip); 2067 break; 2068 2069 case SKC_STYLING_OPCODE_COLOR_FILL_GRADIENT_LINEAR: 2070 // 2071 // FIXME -- gradients shouldn't be executing so much 2072 // conditional driven code at runtime since we *know* 2073 // the gradient style on the host can just create a 2074 // new styling command to exploit this. 2075 // 2076 // FIXME -- it might be time to try using the GPU's 2077 // sampler on a linear array of half4 vectors -- it 2078 // might outperform the explicit load/lerp routines. 2079 // 2080 // FIXME -- optimizing for vertical gradients (uhhh, 2081 // they're actually horizontal due to the -90 degree 2082 // view transform) is nice but is it worthwhile to 2083 // have this in the kernel? Easy to add it back... 2084 // 2085#if defined( SKC_ARCH_GEN9 ) 2086 // disable gradients due to exessive spillage -- fix later 2087 cmd_next += SKC_GRADIENT_CMD_WORDS_V1(commands[cmd_next+6].u32); 2088#else 2089 skc_tile_color_fill_gradient_linear_nonvertical(smem,commands,&cmd_next,&color_wip,ttck0.hi); 2090#endif 2091 break; 2092 2093 case SKC_STYLING_OPCODE_COLOR_WIP_ZERO: 2094 skc_tile_color_wip_zero(&color_wip); 2095 break; 2096 2097 case SKC_STYLING_OPCODE_COLOR_ACC_ZERO: 2098 skc_tile_color_acc_zero(&color_acc); 2099 break; 2100 2101 case SKC_STYLING_OPCODE_BLEND_OVER: 2102 skc_tile_blend_over(&color_acc,&cover_wip,&color_wip); 2103 break; 2104 2105 case SKC_STYLING_OPCODE_BLEND_PLUS: 2106 skc_tile_blend_plus(&color_acc,&cover_wip,&color_wip); 2107 break; 2108 2109 case SKC_STYLING_OPCODE_BLEND_MULTIPLY: 2110 skc_tile_blend_multiply(&color_acc,&cover_wip,&color_wip); 2111 break; 2112 2113 case SKC_STYLING_OPCODE_BLEND_KNOCKOUT: 2114 skc_tile_blend_knockout(&cover_acc,&color_acc,&cover_wip,&color_wip); 2115 break; 2116 2117 case SKC_STYLING_OPCODE_COVER_WIP_MOVE_TO_MASK: 2118 skc_tile_cover_msk_copy_wip(&cover_msk,&cover_wip); 2119 break; 2120 2121 case SKC_STYLING_OPCODE_COVER_ACC_MOVE_TO_MASK: 2122 skc_tile_cover_msk_copy_acc(&cover_msk,&cover_acc); 2123 break; 2124 2125 case SKC_STYLING_OPCODE_BACKGROUND_OVER: 2126 skc_tile_background_over(commands,&cmd_next,&color_acc); 2127 break; 2128 2129 case SKC_STYLING_OPCODE_SURFACE_COMPOSITE: 2130#ifdef SKC_SURFACE_IS_BUFFER 2131 skc_surface_composite_u8_rgba(surface,surface_pitch,&color_acc,ttck0.hi); 2132#else 2133 skc_surface_composite_u8_rgba(surface, &color_acc,ttck0.hi); 2134#endif 2135 break; 2136 2137 case SKC_STYLING_OPCODE_COLOR_ACC_TEST_OPACITY: 2138 if (skc_tile_color_test_opacity(&color_acc)) 2139 flags |= SKC_TILE_FLAGS_SCATTER_SKIP; 2140 break; 2141 2142 default: 2143 return; // this is an illegal opcode -- trap and die! 2144 } 2145 2146 // 2147 // if sign bit is set then this was final command 2148 // 2149 if (cmd.s32 < 0) 2150 break; 2151 } 2152 2153 // continue as long as tile flush isn't complete 2154 } while ((flags & SKC_TILE_FLAGS_FLUSH_COMPLETE) == 0); 2155 2156 // return if was the final flush 2157 if (flags & SKC_TILE_FLAGS_FLUSH_FINALIZE) 2158 return; 2159 2160 // update wip ttck_hi 2161 ttck0 = ttck; 2162 } 2163} 2164 2165// 2166// 2167// 2168