/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */

//
//
//

#include "tile.h"
#include "block.h"
#include "raster.h"
#include "atomic_cl.h"
#include "raster_builder_cl_12.h"
#include "kernel_cl_12.h"

//
// INPUT:
//
//   TTRK (64-BIT COMPARE)
//
//    0                                        63
//    | TTSB ID |   X  |   Y  | COHORT ID |
//    +---------+------+------+-----------+
//    |    27   |  12  |  12  |     13    |
//
//
//   TTRK (32-BIT COMPARE)
//
//    0                                              63
//    | TTSB ID | N/A |   X  |   Y  | COHORT ID |
//    +---------+-----+------+------+-----------+
//    |    27   |  5  |  12  |  12  |     8     |
//
//
// OUTPUT:
//
//   TTSK v2:
//
//    0                                    63
//    | TTSB ID | PREFIX | N/A  |  X |  Y |
//    +---------+--------+------+----+----+
//    |    27   | 1 (=0) |  12  | 12 | 12 |
//
//
//   TTPK v1:
//
//    0                                          63
//    | TTPB ID | ALL ZEROES | SPAN |  X  |  Y  |
//    +---------+------------+------+-----+-----+
//    |    27   |      1     |  12  |  12 |  12 |
//
//
//   TTPK v2:
//
//    0                                      63
//    | TTPB ID | PREFIX | SPAN |  X  |  Y  |
//    +---------+--------+------+-----+-----+
//    |    27   | 1 (=1) |  12  |  12 |  12 |
//

#define SKC_PREFIX_SUBGROUP_MASK  (SKC_PREFIX_SUBGROUP_SIZE - 1)

//
// smem accumulator
//

union skc_subgroup_accum
{
  struct {
    SKC_ATOMIC_INT       ttp[SKC_TILE_HEIGHT];
  } atomic;

  struct {
    skc_ttp_t            ttp[SKC_TILE_HEIGHT];
  } aN;

  struct {
    SKC_PREFIX_TTP_V     ttp[SKC_PREFIX_SUBGROUP_SIZE];
  } vN;

  struct {
    SKC_PREFIX_SMEM_ZERO ttp[SKC_TILE_HEIGHT / SKC_PREFIX_SMEM_ZERO_WIDTH];
  } zero;
};

//
//
//

struct skc_subgroup_smem
{
  // prefix accumulator
  union skc_subgroup_accum accum;
};

//
//
//

static
skc_uint
skc_subgroup_lane()
{
#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
  return get_sub_group_local_id();
#else
  return 0;
#endif
}

//
//
//

static
SKC_PREFIX_TTS_V_BITFIELD
skc_tts_get_dy(skc_tts_v_t const ttsv)
{
  // tts.dy is packed to fit in range [-32,31] and unpacked to [-32..-1,+1..+32]
  SKC_PREFIX_TTS_V_BITFIELD const dy = ttsv >> SKC_TTS_OFFSET_DY;

  return dy - (~ttsv >> 31);
}

static
SKC_PREFIX_TTS_V_BITFIELD
skc_tts_get_py(skc_tts_v_t const ttsv)
{
  return SKC_BFE(ttsv,SKC_TTS_BITS_TY-SKC_SUBPIXEL_RESL_Y_LOG2,SKC_TTS_OFFSET_TY+SKC_SUBPIXEL_RESL_Y_LOG2);
}
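
//
// Worked example of the dy unpacking above (an illustration, assuming
// the usual arithmetic-shift semantics for the signed TTS type; the
// field offsets come from SKC_TTS_OFFSET_DY and friends, defined
// elsewhere): (~ttsv >> 31) evaluates to -1 when the packed dy field
// is non-negative and to 0 when it is negative, so the subtraction
// maps packed [0..31] to [+1..+32] while leaving [-32..-1] untouched
// -- skipping the useless zero "altitude":
//
//   packed dy :  -32 ... -1    0 ... 31
//   unpacked  :  -32 ... -1   +1 ... +32
//
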
//
//
//

static
void
skc_accum_scatter(__local struct skc_subgroup_smem * const smem, skc_tts_v_t const tts_v)
{
  // get "altitude"
  SKC_PREFIX_TTS_V_BITFIELD dy = skc_tts_get_dy(tts_v);

  // get the y pixel coordinate
  SKC_PREFIX_TTS_V_BITFIELD py = skc_tts_get_py(tts_v);

  //
  // FIXME -- benchmark performance of setting dy to 0 if tts_v is invalid?
  //
  // FIXME -- consider making TTS_INVALID a dy/py/etc. that's a no-op
  //

#if 0
  if (tts_v != SKC_TTS_INVALID)
    printf("< %08X = %u : %d >\n",tts_v,py,dy);
#endif

  //
  // scatter-add the "altitude" to accumulator
  //
#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
  //
  // GPU/SIMT -- IMPLIES SUPPORT FOR ATOMIC SCATTER-ADD
  //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A)                                         \
  if (tts_v C != SKC_TTS_INVALID) {                                     \
    SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->accum.atomic.ttp + py C, dy C); \
  }

#else
  //
  // CPU/SIMD -- ITERATE OVER VECTOR, NO NEED FOR ATOMICS
  //
  // WITH SIMD, ONCE A TTS_INVALID IS DETECTED WE CAN QUIT
  //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A)                 \
  if (tts_v C == SKC_TTS_INVALID)               \
    return;                                     \
  smem->accum.aN.ttp[py C] += dy C;
#endif

  SKC_PREFIX_TTS_VECTOR_INT_EXPAND();
}

//
// The implication here is that if our device configuration has a
// rectangular 1:2 tile then we need a block size of at least 2
// subblocks. The subblock size, of course, needs to match the length
// of the smallest tile side.
//

static
void
skc_accum_flush(__local struct skc_subgroup_smem * const smem,
                __global skc_bp_elem_t           * const bp_elems,
                skc_block_id_t                     const pb_id)
{
  // load the ttp elements
  SKC_PREFIX_TTP_V const ttp_v  = smem->accum.vN.ttp[get_sub_group_local_id()];
  skc_uint         const offset = pb_id * (SKC_DEVICE_SUBBLOCK_WORDS / SKC_TILE_RATIO) + skc_subgroup_lane();

#if   ( SKC_TILE_RATIO == 1 )

  bp_elems[offset] = ttp_v;

#elif ( SKC_TILE_RATIO == 2 )

  vstore2(ttp_v,offset,bp_elems);

#else

#error("tile ratio greater than 2 not supported")

#endif
}

//
//
//

static
void
skc_accum_reset(__local struct skc_subgroup_smem * const smem)
{
  for (uint ii=0; ii<SKC_TILE_HEIGHT / SKC_PREFIX_SMEM_ZERO_WIDTH / SKC_PREFIX_SUBGROUP_SIZE; ii++)
    smem->accum.zero.ttp[ii * SKC_PREFIX_SUBGROUP_SIZE + skc_subgroup_lane()] = ( 0 );
}
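
//
// Accumulator lifecycle, for orientation (a summary of the three
// routines above, not new behavior): skc_accum_reset() zeroes the
// SKC_TILE_HEIGHT "altitudes", skc_accum_scatter() scatter-adds TTS
// values into them, and skc_accum_flush() writes them out to the TTPB
// block or subblock. The four members of the accumulator union alias
// the same storage -- the .zero view just lets the reset loop store
// the widest type the device supports.
//
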
//
// get next sk key
//

static
skc_ttsk_s_t
skc_ttsk_v_get_next(skc_ttsk_v_t * const sk_v,
                    skc_uint     * const sk_next,
                    skc_int      * const rkpk_rem)
{
  // decrement count
  *rkpk_rem -= 1;

#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
  //
  // SIMT with subgroup support is easy
  //
  // SIMT without subgroup support can always emulate with smem
  //
#if 0
  //
  // BUG TICKLED BY FILTHY CODE -- the Intel compiler doesn't properly
  // broadcast a uint2 cast to a long. It was probably bad to do this
  // anyway without a union wrapping the TTSK scalar type.
  //
  // Consider creating a union { ulong; uint2 } at a later date --
  // there's probably no need to ever do this unless it makes broadcast
  // faster, which is unlikely since it will probably be implemented as
  // two 32-bit broadcasts.
  //
  // Additionally, the TTRK and TTXK key bitfield sizes are probably
  // cast in stone and we aren't going to change them no matter what
  // architecture we're on.
  //
  skc_ttsk_s_t sk_s = sub_group_broadcast(SKC_AS(ulong)(*sk_v),(*sk_next)++);
#else
  skc_ttsk_s_t sk_s;

  sk_s.lo = sub_group_broadcast(sk_v->lo,*sk_next);
  sk_s.hi = sub_group_broadcast(sk_v->hi,*sk_next);
  *sk_next += 1;
#endif

#else
  //
  // SIMD will always grab component .s0 and then rotate the vector
  //
  skc_ttsk_s_t const sk_s = ( sk_v->s0 );

  skc_ttsk_v_rotate_down(sk_v);

#endif

  return sk_s;
}

//
//
//

static
skc_raster_yx_s
skc_ttsk_v_first(skc_ttsk_v_t * const sk_v, skc_uint const sk_next)
{
#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
  //
  // SIMT with subgroup support is easy
  //
  // SIMT without subgroup support can always emulate with smem
  //
  skc_raster_yx_s const yx_s = sub_group_broadcast(sk_v->hi,sk_next);

#else
  //
  // SIMD will always grab component .s0 and then rotate the vector
  //
  skc_raster_yx_s const yx_s = ( sk_v->s0.hi );

#endif

  return yx_s;
}

//
// mask off ttsb id
//

static
skc_block_id_s_t
skc_ttsk_s_get_ttsb_id(skc_ttsk_s_t const * const sk_s)
{
  return ( sk_s->lo & SKC_TTXK_LO_MASK_ID );
}

//
// load tts_v as early as possible
//

static
skc_tts_v_t
skc_load_tts(__global skc_bp_elem_t * const bp_elems,
             skc_block_id_s_t         const sb_id)
{
  return ( bp_elems[sb_id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane()] );
}

//
// massage ttrk keys into ttsk keys
//

static
void
skc_ttrk_to_ttsk(skc_ttsk_v_t * const sk_v)
{
  sk_v->lo = sk_v->lo & SKC_TTXK_LO_MASK_ID;      // clear high (N/A) bits
  sk_v->hi = sk_v->hi << SKC_TTRK_HI_BITS_COHORT; // shift cohort away -- zeroes low bits
}
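
//
// Sketch of the massage above for the 32-bit-compare TTRK layout
// (field positions inferred from the diagrams at the top of this
// file; the mask/shift constants are defined elsewhere):
//
//   TTRK.hi :  | COHORT:8 | Y:12 | X:12 |   -- cohort id in the top bits
//
//   after << SKC_TTRK_HI_BITS_COHORT :
//
//   TTSK.hi :  | Y:12 | X:12 | zeroes:8 |
//
// while masking .lo with SKC_TTXK_LO_MASK_ID keeps only the 27-bit
// TTSB ID.
//
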
//
// replenish ttsk keys
//

static
void
skc_ttsk_v_replenish(skc_ttsk_v_t                * const sk_v,
                     skc_uint                    * const sk_next,
                     skc_uint                    * const rks_next,
                     __global skc_ttrk_e_t const * const rks)
{
  // if there are still keys available then return
  if (*sk_next < SKC_PREFIX_TTXK_V_SIZE)
    return;

  //
  // otherwise, replenish sk_v
  //
  // NOTE NOTE NOTE -- we are assuming rks[] extent size is always
  // divisible by TTXK_V_SIZE and therefore loading some keys from the
  // next raster is OK.
  //
  *sk_next   = 0;
  *rks_next += SKC_PREFIX_SUBGROUP_SIZE;
  *sk_v      = rks[*rks_next];

#if 0
  printf("* %08X ( %3u, %3u )\n",
         sk_v->hi,
         (sk_v->hi >> 12) & 0xFFF,
         (sk_v->hi      ) & 0xFFF);
#endif

  skc_ttrk_to_ttsk(sk_v);

#if 0
  printf("! %08X ( %3u, %3u )\n",
         sk_v->hi,
         (sk_v->hi >> 20) & 0xFFF,
         (sk_v->hi >>  8) & 0xFFF);
#endif
}

//
// replenish block ids
//
// note that you can't overrun the block id pool since it's a ring
//

static
void
skc_blocks_replenish(skc_uint                      * const blocks_next,
                     skc_uint                      * const blocks_idx,
                     skc_block_id_v_t              * const blocks,
                     skc_uint                        const bp_mask, // pow2 modulo mask for block pool ring
                     __global skc_block_id_t const * const bp_ids)
{
  *blocks_idx  += SKC_PREFIX_BLOCK_ID_V_SIZE;
  *blocks       = bp_ids[*blocks_idx & bp_mask];
  *blocks_next  = 0;

#if 0
  printf("replenish blocks: %u\n",*blocks);
#endif
}

//
//
//

static
skc_block_id_t
skc_blocks_get_next(skc_uint                      * const blocks_next,
                    skc_uint                      * const blocks_idx,
                    skc_block_id_v_t              * const blocks,
                    skc_uint                        const bp_mask, // pow2 modulo mask for block pool ring
                    __global skc_block_id_t const * const bp_ids)
{
  // replenish?
  if (*blocks_next == SKC_PREFIX_BLOCK_ID_V_SIZE)
    {
      skc_blocks_replenish(blocks_next,blocks_idx,blocks,bp_mask,bp_ids);
    }

#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
  //
  // SIMT
  //
  skc_block_id_t id = sub_group_broadcast(*blocks,*blocks_next);

#else
  //
  // SIMD
  //
  skc_block_id_t id = blocks->s0;

  skc_shuffle_down_1(*blocks);

#endif

  *blocks_next += 1;

  return id;
}

//
// subblock allocator
//

#if ( SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2 )

static
skc_block_id_t
skc_subblocks_get_next_pb_id(skc_block_id_t                * const subblocks,
                             skc_uint                      * const blocks_next,
                             skc_uint                      * const blocks_idx,
                             skc_block_id_v_t              * const blocks,
                             skc_uint                        const bp_mask, // pow2 modulo mask for block pool ring
                             __global skc_block_id_t const * const bp_ids)
{
  if ((*subblocks & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
    {
      *subblocks = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids);
    }

  skc_block_id_t const pb_id = *subblocks;

  *subblocks += SKC_TILE_RATIO; // note this is one or two subblocks

  return pb_id;
}

#endif
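
//
// Usage note (inferred from the callers below): when subblocks are
// smaller than blocks, TTPB ids are carved out of a block
// SKC_TILE_RATIO subblocks at a time, so a 1:2 tile consumes two
// consecutive subblock ids per flush; otherwise whole blocks are
// handed out directly by skc_blocks_get_next().
//
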
//
// append a ttsk key to the work-in-progress node
//

static
void
skc_node_v_append_sk(skc_ttsk_s_t          const * const sk_s,

                     skc_ttxk_v_t                * const xk_v,
                     skc_uint                    * const xk_v_next,
                     skc_uint                    * const xk_v_idx,
                     __global skc_bp_elem_t      * const bp_elems,

                     skc_int                       const rkpk_rem,

                     skc_uint                      * const blocks_next,
                     skc_uint                      * const blocks_idx,
                     skc_block_id_v_t              * const blocks,
                     skc_uint                        const bp_mask,
                     __global skc_block_id_t const * const bp_ids)
{
  //
  // Append an sk key to the in-register xk_v vector
  //
  // If the work-in-progress node in gmem will only have room for one
  // more key then:
  //
  //   - if this was the final SK then write out xk_v and exit
  //
  //   - otherwise, acquire a block id, link it, write out xk_v and
  //     prepare a new node
  //
  // Note that this does *not* try to squeeze a final key into the
  // next node slot. That optimization isn't worth the added
  // down-pipeline complexity.
  //
#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
  //
  // SIMT
  //
  if (get_sub_group_local_id() == (*xk_v_next & SKC_PREFIX_TTXK_V_MASK))
    {
      *xk_v = *sk_s;
    }

  *xk_v_next += 1;

  // are there more keys coming?
  if (rkpk_rem > 0)
    {
      // is the node almost full?
      if (*xk_v_next == SKC_RASTER_NODE_DWORDS - 1)
        {
          skc_block_id_t const id = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids);

          if (get_sub_group_local_id() == SKC_PREFIX_TTXK_V_SIZE - 1)
            {
              xk_v->lo = id;
              xk_v->hi = SKC_UINT_MAX; // this initialization isn't necessary
            }

          // store xk_v (uint2) to bp (uint)
          bp_elems[*xk_v_idx                          ] = xk_v->lo;
          bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE ] = xk_v->hi;
#if 0
          printf("S) %u : %08v2X\n",*xk_v_idx,*xk_v);
#endif
          // reinitialize xk_v
          xk_v->lo = SKC_UINT_MAX;
          xk_v->hi = SKC_UINT_MAX;

          // update node elem idx
          *xk_v_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();

          // reset node count
          *xk_v_next = 0;
        }
      // is xk_v full?
      else if ((*xk_v_next & SKC_PREFIX_TTXK_V_MASK) == 0)
        {
          // store xk_v to bp
          bp_elems[*xk_v_idx                          ] = xk_v->lo;
          bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE ] = xk_v->hi;
#if 0
          printf("s) %u : %08v2X\n",*xk_v_idx,*xk_v);
#endif
          // reinitialize xk_v
          xk_v->lo = SKC_UINT_MAX;
          xk_v->hi = SKC_UINT_MAX;

          // increment node elem idx
          *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2;
        }
    }
  else
    {
      bp_elems[*xk_v_idx                          ] = xk_v->lo;
      bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE ] = xk_v->hi;
#if 0
      printf("z) %u : %08v2X\n",*xk_v_idx,*xk_v);
#endif
      while ((*xk_v_idx & SKC_DEVICE_BLOCK_WORDS_MASK) < SKC_DEVICE_BLOCK_WORDS - SKC_PREFIX_SUBGROUP_SIZE * 2)
        {
          *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2;

          bp_elems[*xk_v_idx                          ] = SKC_UINT_MAX;
          bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE ] = SKC_UINT_MAX;
        }
    }

#else
  //
  // SIMD
  //

#endif
}

//
//
//

static
skc_ttpk_s_t
skc_ttpk_create(skc_raster_yx_s const yx_prev,
                skc_raster_yx_s const yx_next,
                skc_block_id_t  const pb_id)
{
  // - yx_prev is already incremented by one
  // - yx_span is already shifted up at hi.x
  skc_uint const yx_span = yx_next - yx_prev;

  skc_ttpk_s_t pk;

  // turn on the prefix bit | shift the span bits upward
  pk.lo = pb_id | SKC_TTXK_LO_MASK_PREFIX | (yx_span << SKC_TTPK_LO_SHL_YX_SPAN);

  // shift down the high span bits | yx of the tile
  pk.hi = (yx_span >> SKC_TTPK_HI_SHR_YX_SPAN) | yx_prev;

#if 0
  if (get_sub_group_local_id() == 0)
    printf("* %08v2X : %u\n",pk,yx_span);
#endif

  return pk;
}
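
//
// Note on the packing above (inferred from the TTPK v2 diagram at the
// top of this file; the shift constants are defined elsewhere): the
// span's low bits land in pk.lo just above the prefix bit while its
// high bits spill into the bottom of pk.hi below the tile's yx, so
// the 12-bit SPAN field straddles the two 32-bit halves of the key.
//
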
//
// append a ttpk key to the work-in-progress node
//

static
void
skc_node_v_append_pk(skc_ttpk_s_t          const * const pk_s,

                     skc_ttxk_v_t                * const xk_v,
                     skc_uint                    * const xk_v_next,
                     skc_uint                    * const xk_v_idx,
                     __global skc_bp_elem_t      * const bp_elems,

                     skc_uint                      * const blocks_next,
                     skc_uint                      * const blocks_idx,
                     skc_block_id_v_t              * const blocks,
                     skc_uint                        const bp_mask,
                     __global skc_block_id_t const * const bp_ids)
{
  //
  // append a pk key to the in-register xk_v vector
  //
  // if the work-in-progress node in gmem will only have room for one
  // more key then:
  //
  //   - if this was the final key then write out xk_v and exit
  //
  //   - otherwise, acquire a block id, link it, write out xk_v and
  //     prepare a new node
  //
#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
  //
  // SIMT
  //
  if (get_sub_group_local_id() == (*xk_v_next & SKC_PREFIX_TTXK_V_MASK))
    {
      *xk_v = *pk_s;
    }

  *xk_v_next += 1;

  // is the node almost full?
  if (*xk_v_next == SKC_RASTER_NODE_DWORDS - 1)
    {
      skc_block_id_t const id = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids);

      if (get_sub_group_local_id() == SKC_PREFIX_TTXK_V_SIZE - 1)
        {
          xk_v->lo = id;
          xk_v->hi = SKC_UINT_MAX; // this initialization isn't necessary
        }

      // store xk_v to bp
      bp_elems[*xk_v_idx                          ] = xk_v->lo;
      bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE ] = xk_v->hi;
#if 0
      printf("P) %u : %08v2X\n",*xk_v_idx,*xk_v);
#endif
      // reinitialize xk_v
      xk_v->lo = SKC_UINT_MAX;
      xk_v->hi = SKC_UINT_MAX;

      // update node elem idx
      *xk_v_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();

      // reset node count
      *xk_v_next = 0;
    }
  // is xk_v full?
  else if ((*xk_v_next & SKC_PREFIX_TTXK_V_MASK) == 0)
    {
      // store xk_v to bp
      bp_elems[*xk_v_idx                          ] = xk_v->lo;
      bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE ] = xk_v->hi;
#if 0
      printf("p) %u : %08v2X\n",*xk_v_idx,*xk_v);
#endif
      // reinitialize xk_v
      xk_v->lo = SKC_UINT_MAX;
      xk_v->hi = SKC_UINT_MAX;

      // increment node elem idx
      *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2;
    }

#else
  //
  // SIMD
  //
#endif
}

//
// append the first 3 fields of meta info to the raster header
//

static
void
skc_node_v_init_header(skc_ttxk_v_t                           * const xk_v,
                       skc_uint                               * const xk_v_next,
                       union skc_raster_cohort_meta_out const * const meta)
{
#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
  //
  // SIMT
  //
  if (get_sub_group_local_id() < 2)
    {
      *xk_v = ((get_sub_group_local_id() & 1) == 0) ? meta->u32v4.lo : meta->u32v4.hi;
    }

#if 0
  if (get_sub_group_local_id() == 0)
    printf("header: %08v4X\n",meta->u32v4);
#endif

  //
  // increment counter: uint4 + uint4 = uint2 x 4
  //
  *xk_v_next = 2 + 2; // +2 for the uninitialized bounds

#else
  //
  // SIMD
  //

#endif
}
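
//
// Raster structure summary (inferred from the routines above): a
// raster's head block opens with four header dwords -- the cohort
// meta's { blocks, offset, nodes, keys } packed into two dwords plus
// two dwords reserved for the not-yet-computed bounds -- followed by
// TTSK/TTPK keys. When a node is one key away from full, the
// allocator links in a fresh block by writing its id into the node's
// final dword.
//
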
//
//
//

__kernel
SKC_PREFIX_KERNEL_ATTRIBS
void
skc_kernel_prefix(__global skc_uint       const * const bp_atomics,
                  __global skc_block_id_t const * const bp_ids,
                  __global skc_bp_elem_t        * const bp_elems,
                  skc_uint                        const bp_mask, // pow2 modulo mask for block pool ring
                  __global skc_ttrk_e_t   const * const rks,
                  __global skc_block_id_t       * const map,
                  __global skc_uint       const * const metas,
                  skc_uint                        const count)
{
  //
  // declare shared memory block
  //
#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS == 1 )
  __local struct skc_subgroup_smem                  smem[1];
#else
  __local struct skc_subgroup_smem                  smems[SKC_PREFIX_WORKGROUP_SUBGROUPS];
  __local struct skc_subgroup_smem * restrict const smem = smems + get_sub_group_id();
#endif

  //
  // where is this subgroup in the grid?
  //
#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS == 1 )
  skc_uint const sgi = get_group_id(0);
#else
  skc_uint const sgi = get_group_id(0) * SKC_PREFIX_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif

  skc_uint const sgl = get_sub_group_local_id();

  //
  // return if this subgroup is excess
  //
#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS > 1 )
  if (sgi >= count)
    return;
#endif

  //
  // get meta info for this subgroup's raster
  //
  union skc_raster_cohort_meta_out const meta  = { vload4(sgi,metas) };
  skc_uint                         const reads = metas[SKC_RASTER_COHORT_META_OFFSET_READS + sgi];

#if 0
  if (get_sub_group_local_id() == 0)
    printf("%3u : %5u / %5u / %5u / %5u / %u\n",
           sgi,
           meta.blocks,
           meta.offset,
           meta.nodes,
           meta.keys,
           reads);
#endif

  //
  // preload blocks -- align on subgroup
  //
  skc_uint         blocks_idx  = (reads & ~SKC_PREFIX_SUBGROUP_MASK) + skc_subgroup_lane();
  skc_block_id_v_t blocks      = bp_ids[blocks_idx & bp_mask];
  skc_uint         blocks_next = (reads &  SKC_PREFIX_SUBGROUP_MASK);

  //
  // prime xk_v_idx with a block but note that OpenCL vstore_n() will scale the offset
  //
  skc_uint xk_v_idx = sub_group_broadcast(blocks,blocks_next++) * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();

  //
  // initialize raster header -- assumes block is greater than 8 words (4 doublewords)
  //
  skc_ttxk_v_t xk_v = { SKC_UINT_MAX, SKC_UINT_MAX };
  skc_uint     xk_v_next;

  skc_node_v_init_header(&xk_v,&xk_v_next,&meta);

  //
  // no keys -- this is an empty raster!
  //
  if (meta.keys == 0)
    {
      bp_elems[xk_v_idx                          ] = xk_v.lo;
      bp_elems[xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE ] = xk_v.hi;

      while ((xk_v_idx & SKC_DEVICE_BLOCK_WORDS_MASK) < SKC_DEVICE_BLOCK_WORDS - SKC_PREFIX_SUBGROUP_SIZE * 2)
        {
          xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2;

          bp_elems[xk_v_idx                          ] = SKC_UINT_MAX;
          bp_elems[xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE ] = SKC_UINT_MAX;
        }

      return;
    }

  //
  // load TTRK keys and in-place convert to TTSK keys
  //
  skc_uint     rks_next = (meta.offset & ~SKC_PREFIX_SUBGROUP_MASK) + skc_subgroup_lane();
  skc_ttsk_v_t sk_v     = rks[rks_next];
  skc_uint     sk_next  = (meta.offset &  SKC_PREFIX_SUBGROUP_MASK);
  skc_int      rkpk_rem = meta.keys; // signed count of remaining rk+pk keys

#if 0
  printf("* %08X ( %3u, %3u )\n",
         sk_v.hi,
         (sk_v.hi >> 12) & 0xFFF,
         (sk_v.hi      ) & 0xFFF);
#endif

  skc_ttrk_to_ttsk(&sk_v);

#if 0
  printf("! %08X ( %3u, %3u )\n",
         sk_v.hi,
         (sk_v.hi >> 20) & 0xFFF,
         (sk_v.hi >>  8) & 0xFFF);
#endif
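
  //
  // Accounting note (inferred from the scan loop below): meta.keys
  // counts both the TTSK keys read from rks[] and the TTPK keys this
  // kernel emits, so rkpk_rem is decremented once per sk key consumed
  // and once per pk key created and reaches zero exactly at the final
  // key.
  //
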
  //
  // subblocks
  //
#if ( SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2 )
  skc_block_id_t subblocks = 0;
#endif

  //
  // begin "scan" of tiles
  //
  skc_raster_yx_s yx_prev = skc_ttsk_v_first(&sk_v,sk_next);

  //
  // zero the accumulator
  //
  skc_accum_reset(smem);

  while (true)
    {
      // get next rk key
      skc_ttsk_s_t const sk_s = skc_ttsk_v_get_next(&sk_v,&sk_next,&rkpk_rem);

      // load ttsb id
      skc_block_id_s_t const sb_id = skc_ttsk_s_get_ttsb_id(&sk_s);

      // load the tts_v transaction "in flight" as early as possible
      skc_tts_v_t const tts_v = skc_load_tts(bp_elems,sb_id);

#if 0
      printf("{ %08X }\n",tts_v);
#endif

#if 0
      if (get_sub_group_local_id() == 0)
        printf("[ %d, %X ]\n",rkpk_rem,sb_id);
#endif

#if 0
      if (get_sub_group_local_id() == 0)
        printf("@ %08X ( %3u, %3u )\n",sk_s.hi,(sk_s.hi >> 20),(sk_s.hi >> 8) & 0xFFF);
#endif

      //
      // FIXME -- SOME OF THESE COMPARISONS CAN BE PERFORMED AHEAD OF
      // TIME AND SIMD'IZED
      //

      // if the yx's don't match then we're either issuing a ttpk or
      // resetting the accumulator
      if (sk_s.hi != yx_prev)
        {
          // if the y's match then only x changed
          if (((sk_s.hi ^ yx_prev) & SKC_TTXK_HI_MASK_Y) == 0)
            {
              //
              // if the tile is not square then its ratio is 1:2
              //
#if SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2
              skc_block_id_t const pb_id = skc_subblocks_get_next_pb_id(&subblocks,
                                                                        &blocks_next,
                                                                        &blocks_idx,
                                                                        &blocks,
                                                                        bp_mask,
                                                                        bp_ids);
#else
              skc_block_id_t const pb_id = skc_blocks_get_next(&blocks_next,
                                                               &blocks_idx,
                                                               &blocks,
                                                               bp_mask,
                                                               bp_ids);
#endif

              // flush the accumulated ttp vector to the block/subblock at pb_id
              skc_accum_flush(smem,bp_elems,pb_id);

#if 0
              if (get_sub_group_local_id() == 0)
                {
                  printf("%8u : ( %4u, %4u ) -> ( %4u, %4u )\n",
                         pb_id,
                         (yx_prev >> SKC_TTXK_HI_OFFSET_Y),
                         (yx_prev >> SKC_TTXK_HI_OFFSET_X) & 0xFFF,
                         (sk_s.hi >> SKC_TTXK_HI_OFFSET_Y) & 0xFFF,
                         (sk_s.hi >> SKC_TTXK_HI_OFFSET_X) & 0xFFF);
                }
#endif

              //
              // FIXME -- A SIMD-WIDE BLOCK OF TTPK KEYS CAN BE CREATED IN ONE STEP
              //
              rkpk_rem -= 1;

              // create the pk
              skc_ttpk_s_t const pk_s = skc_ttpk_create(yx_prev+SKC_TTXK_HI_ONE_X,sk_s.hi,pb_id);

              // append the pk key to the xk buffer
              skc_node_v_append_pk(&pk_s,

                                   &xk_v,
                                   &xk_v_next,
                                   &xk_v_idx,
                                   bp_elems,

                                   &blocks_next,
                                   &blocks_idx,
                                   &blocks,
                                   bp_mask,
                                   bp_ids);
            }
          else if (rkpk_rem > 0) // we're starting a new tile row
            {
              skc_accum_reset(smem);
            }
        }

      //
      // append sk key to node_v
      //
      // if rkpk_rem is zero then return from the kernel
      //
      skc_node_v_append_sk(&sk_s,

                           &xk_v,
                           &xk_v_next,
                           &xk_v_idx,
                           bp_elems,

                           rkpk_rem,

                           &blocks_next,
                           &blocks_idx,
                           &blocks,
                           bp_mask,
                           bp_ids);

      // we're done if there are no more sk keys
      if (rkpk_rem == 0)
        break;

      // move to the new tile
      yx_prev = sk_s.hi;

      // scatter the tts values into the accumulator
      skc_accum_scatter(smem,tts_v);

      // replenish the sk keys
      skc_ttsk_v_replenish(&sk_v,&sk_next,&rks_next,rks);
    }
}

//
//
//