1/*
2 * Copyright 2017 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can
5 * be found in the LICENSE file.
6 *
7 */
8
9//
10//
11//
12
13#include "tile.h"
14#include "block.h"
15#include "raster.h"
16#include "atomic_cl.h"
17#include "raster_builder_cl_12.h"
18#include "kernel_cl_12.h"
19
20//
21// INPUT:
22//
23//   TTRK (64-BIT COMPARE)
24//
25//    0                                  63
26//    | TTSB ID |   X  |   Y  | COHORT ID |
27//    +---------+------+------+-----------+
28//    |    27   |  12  |  12  |     13    |
29//
30//
31//   TTRK (32-BIT COMPARE)
32//
33//    0                                        63
34//    | TTSB ID | N/A |   X  |   Y  | COHORT ID |
35//    +---------+-----+------+------+-----------+
36//    |    27   |  5  |  12  |  12  |     8     |
37//
38//
39// OUTPUT:
40//
41//   TTSK v2:
42//
43//    0                                     63
44//    | TTSB ID | PREFIX |  N/A |  X |  Y |
45//    +---------+--------+------+----+----+
46//    |    27   | 1 (=0) |  12  | 12 | 12 |
47//
48//
49//   TTPK v1:
50//
51//    0                                        63
52//    | TTPB ID | ALL ZEROES | SPAN |  X  |  Y  |
53//    +---------+------------+------+-----+-----+
54//    |    27   |      1     |  12  | 12  | 12  |
55//
56//
57//   TTPK v2:
58//
59//    0                                       63
60//    | TTPB ID | PREFIX | SPAN |  X  |  Y  |
61//    +---------+--------+------+-----+-----+
62//    |    27   | 1 (=1) |  12  | 12  | 12  |
63//
64
// lane-index modulo mask -- assumes SKC_PREFIX_SUBGROUP_SIZE is a power of two
#define SKC_PREFIX_SUBGROUP_MASK  (SKC_PREFIX_SUBGROUP_SIZE - 1)
66
67//
68// smem accumulator
69//
70
//
// Per-subgroup TTP accumulator viewed through four overlapping layouts
// of the same local-memory storage:
//
//   .atomic -- atomic scatter-add target (SIMT path)
//   .aN     -- plain scalar stores (SIMD path, no atomics needed)
//   .vN     -- vector loads when flushing the accumulator to a block
//   .zero   -- wide stores when zeroing the accumulator
//
union skc_subgroup_accum
{
  struct {
    SKC_ATOMIC_INT        ttp[SKC_TILE_HEIGHT];  // one atomic counter per tile scanline
  } atomic;

  struct {
    skc_ttp_t             ttp[SKC_TILE_HEIGHT];  // scalar view of the same scanline counters
  } aN;

  struct {
    SKC_PREFIX_TTP_V      ttp[SKC_PREFIX_SUBGROUP_SIZE];  // one TTP vector per subgroup lane
  } vN;

  struct {
    SKC_PREFIX_SMEM_ZERO  ttp[SKC_TILE_HEIGHT / SKC_PREFIX_SMEM_ZERO_WIDTH];  // wide-zeroing view
  } zero;
};
89
90//
91//
92//
93
//
// All local (shared) memory owned by one subgroup of this kernel.
//
struct skc_subgroup_smem
{
  // prefix accumulator
  union skc_subgroup_accum accum;
};
99
100//
101//
102//
103
//
// Lane index of the work-item within its subgroup. Degenerates to 0
// when the kernel is configured with a subgroup size of 1 (SIMD/CPU
// build), where there is only one "lane".
//
static
skc_uint
skc_subgroup_lane()
{
#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
  return get_sub_group_local_id();
#else
  return 0;
#endif
}
114
115//
116//
117//
118
//
// Extract the signed "altitude" delta dy from each TTS key in the
// vector.
//
static
SKC_PREFIX_TTS_V_BITFIELD
skc_tts_get_dy(skc_tts_v_t const ttsv)
{
  // tts.dy is packed to fit in range [-32,31] and unpacked to [-32..-1,+1..+32]
  SKC_PREFIX_TTS_V_BITFIELD const dy = ttsv >> SKC_TTS_OFFSET_DY;

  // (~ttsv >> 31) is an arithmetic shift: all-ones (-1) for keys with a
  // clear sign bit and 0 otherwise, so subtracting it bumps non-negative
  // dy values by +1 -- skipping zero when expanding the packed range
  return dy - (~ttsv >> 31);
}
128
//
// Extract the integer y pixel coordinate from each TTS key by taking
// the TY bitfield and dropping its subpixel-resolution low bits.
//
static
SKC_PREFIX_TTS_V_BITFIELD
skc_tts_get_py(skc_tts_v_t const ttsv)
{
  return SKC_BFE(ttsv,SKC_TTS_BITS_TY-SKC_SUBPIXEL_RESL_Y_LOG2,SKC_TTS_OFFSET_TY+SKC_SUBPIXEL_RESL_Y_LOG2);
}
135
136//
137//
138//
139
//
// Scatter-add each TTS key's dy "altitude" into the accumulator row
// selected by its y pixel coordinate. Invalid keys (SKC_TTS_INVALID)
// are skipped. The per-component work is generated by redefining
// SKC_EXPAND_X and invoking the project's expansion macro.
//
static
void
skc_accum_scatter(__local struct skc_subgroup_smem * const smem, skc_tts_v_t const tts_v)
{
  // get "altitude"
  SKC_PREFIX_TTS_V_BITFIELD dy = skc_tts_get_dy(tts_v);

  // get the y pixel coordinate
  SKC_PREFIX_TTS_V_BITFIELD py = skc_tts_get_py(tts_v);

  //
  // FIXME -- benchmark performance of setting dy to 0 if tts_v is invalid?
  //
  // FIXME -- consider making TTS_INVALID a dy/py/etc. that's a no-op
  //

#if 0
  if (tts_v != SKC_TTS_INVALID)
    printf("< %08X = %u : %d >\n",tts_v,py,dy);
#endif

  //
  // scatter-add the "altitude" to accumulator
  //
#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
  //
  // GPU/SIMT -- IMPLIES SUPPORT FOR ATOMIC SCATTER-ADD
  //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A)                                         \
  if (tts_v C != SKC_TTS_INVALID) {                                     \
    SKC_ATOMIC_ADD_LOCAL_RELAXED_SUBGROUP(smem->accum.atomic.ttp + py C, dy C); \
  }

#else
  //
  // CPU/SIMD -- ITERATE OVER VECTOR, NO NEED FOR ATOMICS
  //
  // WITH SIMD, ONCE A TTS_INVALID IS DETECTED WE CAN QUIT
  //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A)                 \
  if (tts_v C == SKC_TTS_INVALID)               \
    return;                                     \
  smem->accum.aN.ttp[py C] = dy C;
#endif

  // expand SKC_EXPAND_X over every component of the TTS vector
  SKC_PREFIX_TTS_VECTOR_INT_EXPAND();
}
189
190//
191// The implication here is that if our device configuration has a
192// rectangular 1:2 tile then we need a block size of at least 2
193// subblocks. The subblock size of course needs to match the length of
194// the smallest tile side.
195//
196
//
// The implication here is that if our device configuration has a
// rectangular 1:2 tile then we need a block size of at least 2
// subblocks. The subblock size of course needs to match the length of
// the smallest tile side.
//
// Flush the accumulated TTP vector for this subgroup into the
// block-pool (sub)block identified by pb_id -- one vector per lane,
// stored with vstore2() when the tile ratio is 1:2.
//
static
void
skc_accum_flush(__local struct skc_subgroup_smem * const smem,
                __global skc_bp_elem_t           * const bp_elems,
                skc_block_id_t                     const pb_id)
{
  // load the ttp elements
  SKC_PREFIX_TTP_V const ttp_v  = smem->accum.vN.ttp[get_sub_group_local_id()];
  skc_uint         const offset = pb_id * (SKC_DEVICE_SUBBLOCK_WORDS / SKC_TILE_RATIO) + skc_subgroup_lane();

#if   ( SKC_TILE_RATIO == 1 )

  bp_elems[offset] = ttp_v;

#elif ( SKC_TILE_RATIO == 2 )

  // vstore2 scales the offset by the vector width
  vstore2(ttp_v,offset,bp_elems);

#else

#error("tile ratio greater than 2 not supported")

#endif
}
221
222//
223//
224//
225
//
// Zero the accumulator using the wide .zero view -- each lane clears a
// strided slice of the tile-height array.
//
static
void
skc_accum_reset(__local struct skc_subgroup_smem * const smem)
{
  for (uint ii=0; ii<SKC_TILE_HEIGHT / SKC_PREFIX_SMEM_ZERO_WIDTH / SKC_PREFIX_SUBGROUP_SIZE; ii++)
    smem->accum.zero.ttp[ii * SKC_PREFIX_SUBGROUP_SIZE + skc_subgroup_lane()] = ( 0 );
}
233
234//
235// get next sk key
236//
237
//
// get next sk key
//
// Broadcasts the next unconsumed TTSK key out of the subgroup-wide key
// vector (SIMT) or grabs component .s0 and rotates the vector (SIMD),
// and decrements the signed rk+pk remaining-key count.
//
static
skc_ttsk_s_t
skc_ttsk_v_get_next(skc_ttsk_v_t * const sk_v,
                    skc_uint     * const sk_next,
                    skc_int      * const rkpk_rem)
{
  // decrement count
  *rkpk_rem -= 1;

#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
  //
  // SIMT with subgroup support is easy
  //
  // SIMT without subgroup support can always emulate with smem
  //
#if 0
  //
  // BUG TICKLED BY FILTHY CODE -- Intel compiler doesn't properly
  // broadcast a uint2 cast to a long. It was probably bad to do this
  // anyway without a union wrapping the TTSK scalar type.
  //
  // Consider creating a union { ulong; uint2 } at a later date --
  // probably no need to ever do this unless it makes broadcast faster
  // which is unlikely since it will probably be implemented as 2
  // 32-bit broadcasts.
  //
  // Additionally, the TTRK and TTXK key bitfield sizes are probably
  // cast in stone and we aren't going to change them no matter
  // architecture we're on.
  //
  skc_ttsk_s_t sk_s = sub_group_broadcast(SKC_AS(ulong)(*sk_v),(*sk_next)++);
#else
  skc_ttsk_s_t sk_s;

  sk_s.lo   = sub_group_broadcast(sk_v->lo,*sk_next);
  sk_s.hi   = sub_group_broadcast(sk_v->hi,*sk_next);
  *sk_next += 1;
#endif

#else
  //
  // SIMD will always grab component .s0 and then rotate the vector
  //
  // FIX: sk_s was previously assigned here without a declaration --
  // the only declaration lived in the SIMT branch above, so the
  // SIMD configuration could not compile.
  skc_ttsk_s_t sk_s = ( sk_v->s0 );

  skc_ttsk_v_rotate_down(sk_v);

#endif

  return sk_s;
}
289
290//
291//
292//
293
//
// Return the .hi word (the tile YX coordinate field) of the key at
// index sk_next without consuming it -- used to prime the yx_prev
// state before the scan loop starts.
//
static
skc_raster_yx_s
skc_ttsk_v_first(skc_ttsk_v_t * const sk_v, skc_uint const sk_next)
{
#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
  //
  // SIMT with subgroup support is easy
  //
  // SIMT without subgroup support can always emulate with smem
  //
  skc_raster_yx_s const yx_s = sub_group_broadcast(sk_v->hi,sk_next);

#else
  //
  // SIMD will always grab component .s0 and then rotate the vector
  //
  skc_raster_yx_s const yx_s = ( sk_v->s0.hi );

#endif

  return yx_s;
}
316
317//
318// mask off ttsb id
319//
320
//
// Mask off everything but the TTSB block id bits in the low word of
// the TTSK key.
//
static
skc_block_id_s_t
skc_ttsk_s_get_ttsb_id(skc_ttsk_s_t const * const sk_s)
{
  skc_block_id_s_t const ttsb_id = sk_s->lo & SKC_TTXK_LO_MASK_ID;

  return ttsb_id;
}
327
328//
329// load tts_v as early as possible
330//
331
//
// Load this lane's TTS word from the subblock sb_id in the block pool.
// Issued as early as possible so the load is "in flight" while other
// work proceeds.
//
static
skc_tts_v_t
skc_load_tts(__global skc_bp_elem_t * const bp_elems,
             skc_block_id_s_t         const sb_id)
{
  skc_uint const elem_idx = sb_id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();

  return bp_elems[elem_idx];
}
339
340//
341// massage ttrk keys into ttsk keys
342//
343
//
// In-place conversion of a TTRK key vector to TTSK form: keep only the
// TTSB id in .lo and shift the cohort id out of .hi (see the key
// diagrams at the top of this file).
//
static
void
skc_ttrk_to_ttsk(skc_ttsk_v_t * const sk_v)
{
  sk_v->lo = sk_v->lo  & SKC_TTXK_LO_MASK_ID;     // clear high (N/A) bits
  sk_v->hi = sk_v->hi << SKC_TTRK_HI_BITS_COHORT; // shift cohort away -- zeroes low bits
}
351
352//
353// replenish ttsk keys
354//
355
//
// Refill the subgroup-wide key vector from the rks[] array once every
// key has been consumed, converting the freshly loaded TTRK keys to
// TTSK form.
//
static
void
skc_ttsk_v_replenish(skc_ttsk_v_t                * const sk_v,
                     skc_uint                    * const sk_next,
                     skc_uint                    * const rks_next,
                     __global skc_ttrk_e_t const * const rks)
{
  // if there are still keys available then return
  if (*sk_next < SKC_PREFIX_TTXK_V_SIZE)
    return;

  //
  // otherwise, replenish sk_v
  //
  // NOTE NOTE NOTE -- we are assuming rks[] extent size is always
  // divisible by TTXK_V_SIZE and therefore loading some keys from the
  // next raster is OK.
  //
  // NOTE(review): the guard tests SKC_PREFIX_TTXK_V_SIZE but the index
  // advances by SKC_PREFIX_SUBGROUP_SIZE -- presumably these are equal
  // in every supported configuration; confirm against the device config.
  //
  *sk_next   = 0;
  *rks_next += SKC_PREFIX_SUBGROUP_SIZE;
  *sk_v      = rks[*rks_next];

#if 0
  printf("* %08X ( %3u, %3u )\n",
         sk_v->hi,
         (sk_v->hi >> 12) & 0xFFF,
         (sk_v->hi      ) & 0xFFF);
#endif

  skc_ttrk_to_ttsk(sk_v);

#if 0
  printf("! %08X ( %3u, %3u )\n",
         sk_v->hi,
         (sk_v->hi >> 20) & 0xFFF,
         (sk_v->hi >>  8) & 0xFFF);
#endif
}
394
395//
396// replenish block ids
397//
398// note that you can't overrun the block id pool since it's a ring
399//
400
//
// Load the next vector of block ids from the block pool ring. The ring
// index is wrapped with the pow2 bp_mask so it can never overrun.
//
static
void
skc_blocks_replenish(skc_uint                      * const blocks_next,
                     skc_uint                      * const blocks_idx,
                     skc_block_id_v_t              * const blocks,
                     skc_uint                        const bp_mask, // pow2 modulo mask for block pool ring
                     __global skc_block_id_t const * const bp_ids)

{
  *blocks_idx += SKC_PREFIX_BLOCK_ID_V_SIZE;
  *blocks      = bp_ids[*blocks_idx & bp_mask];
  *blocks_next = 0;

#if 0
  printf("replenish blocks: %u\n",*blocks);
#endif
}
418
419//
420//
421//
422
//
// Return the next available block id, replenishing the in-register
// block id vector from the ring when it has been exhausted.
//
static
skc_block_id_t
skc_blocks_get_next(skc_uint                      * const blocks_next,
                    skc_uint                      * const blocks_idx,
                    skc_block_id_v_t              * const blocks,
                    skc_uint                        const bp_mask, // pow2 modulo mask for block pool ring
                    __global skc_block_id_t const * const bp_ids)
{
  // replenish?
  if (*blocks_next == SKC_PREFIX_BLOCK_ID_V_SIZE)
    {
      skc_blocks_replenish(blocks_next,blocks_idx,blocks,bp_mask,bp_ids);
    }

#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
  //
  // SIMT -- broadcast the lane holding the next id
  //
  skc_block_id_t id = sub_group_broadcast(*blocks,*blocks_next);

#else
  //
  // SIMD -- take component .s0 and shift the vector down
  //
  skc_block_id_t id = blocks->s0;

  skc_shuffle_down_1(*blocks);

#endif

  *blocks_next += 1;

  return id;
}
457
458//
459// subblock allocator
460//
461
462#if ( SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2 )
463
//
// Subblock allocator: carve TTPB subblock ids out of a whole block,
// acquiring a fresh block whenever the current one is used up. Only
// compiled when subblocks are smaller than blocks.
//
static
skc_block_id_t
skc_subblocks_get_next_pb_id(skc_block_id_t                * const subblocks,
                             skc_uint                      * const blocks_next,
                             skc_uint                      * const blocks_idx,
                             skc_block_id_v_t              * const blocks,
                             skc_uint                        const bp_mask, // pow2 modulo mask for block pool ring
                             __global skc_block_id_t const * const bp_ids)
{
  // on a block boundary? acquire a fresh block
  if ((*subblocks & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
    {
      *subblocks = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids);
    }

  skc_block_id_t const pb_id = *subblocks;

  *subblocks += SKC_TILE_RATIO; // note this is one or two subblocks

  return pb_id;
}
484
485#endif
486
487//
488// append a ttsk key to the work-in-progress node
489//
490
//
// Append a TTSK key to the work-in-progress node. When the gmem node
// fills, a new node block is acquired, linked in the last slot, and
// xk_v is flushed. When rkpk_rem hits zero this is the final key: the
// node is flushed and its tail is padded with invalid keys.
//
static
void
skc_node_v_append_sk(skc_ttsk_s_t            const * const sk_s,

                     skc_ttxk_v_t                  * const xk_v,
                     skc_uint                      * const xk_v_next,
                     skc_uint                      * const xk_v_idx,
                     __global skc_bp_elem_t        * const bp_elems,

                     skc_int                         const rkpk_rem,

                     skc_uint                      * const blocks_next,
                     skc_uint                      * const blocks_idx,
                     skc_block_id_v_t              * const blocks,
                     skc_uint                        const bp_mask,
                     __global skc_block_id_t const * const bp_ids)
{
  //
  // Append an sk key to the in-register xk_v vector
  //
  // If the work-in-progress node in gmem will only have room for one
  // more key then:
  //
  //   - if this was the final SK then write out xk_v and exit
  //
  //   - otherwise, acquire a block id, link it, write out xk_v,
  //     prepare new node
  //
  // Note that this does *not* try to squeeze in a final key into the
  // next node slot.  This optimization isn't worth the added
  // down-pipeline complexity.
  //
#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
  //
  // SIMT
  //
  // only the lane owning the next key slot captures it
  if (get_sub_group_local_id() == (*xk_v_next & SKC_PREFIX_TTXK_V_MASK))
    {
      *xk_v = *sk_s;
    }

  *xk_v_next += 1;

  // are there more keys coming?
  if (rkpk_rem > 0)
    {
      // is the node almost full?
      if (*xk_v_next == SKC_RASTER_NODE_DWORDS - 1)
        {
          // acquire the next node block and link it in the final slot
          skc_block_id_t const id = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids);

          if (get_sub_group_local_id() == SKC_PREFIX_TTXK_V_SIZE - 1)
            {
              xk_v->lo = id;
              xk_v->hi = SKC_UINT_MAX; // this initialization isn't necessary
            }

          // store xk_v (uint2) to bp (uint)
          bp_elems[*xk_v_idx                         ] = xk_v->lo;
          bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi;
#if 0
          printf("S) %u : %08v2X\n",*xk_v_idx,*xk_v);
#endif
          // reinitialize xk_v
          xk_v->lo = SKC_UINT_MAX;
          xk_v->hi = SKC_UINT_MAX;

          // update node elem idx
          *xk_v_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();

          // reset node count
          *xk_v_next = 0;
        }
      // is xk_v full?
      else if ((*xk_v_next & SKC_PREFIX_TTXK_V_MASK) == 0)
        {
          // store xk_v to bp
          bp_elems[*xk_v_idx                         ] = xk_v->lo;
          bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi;
#if 0
          printf("s) %u : %08v2X\n",*xk_v_idx,*xk_v);
#endif
          // reinitialize xk_v
          xk_v->lo = SKC_UINT_MAX;
          xk_v->hi = SKC_UINT_MAX;

          // increment node elem idx
          *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2;
        }
    }
  else
    {
      // final key -- flush xk_v and pad the rest of the node with
      // invalid (SKC_UINT_MAX) keys
      bp_elems[*xk_v_idx                         ] = xk_v->lo;
      bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi;
#if 0
      printf("z) %u : %08v2X\n",*xk_v_idx,*xk_v);
#endif
      while ((*xk_v_idx & SKC_DEVICE_BLOCK_WORDS_MASK) < SKC_DEVICE_BLOCK_WORDS - SKC_PREFIX_SUBGROUP_SIZE * 2)
        {
          *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2;

          bp_elems[*xk_v_idx]                          = SKC_UINT_MAX;
          bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = SKC_UINT_MAX;
        }
    }

#else
  //
  // SIMD -- not implemented
  //

#endif
}
604
605//
606//
607//
608
//
// Build a TTPK key spanning the tiles between yx_prev and yx_next
// (exclusive), pointing at the prefix block pb_id. See the TTPK v2
// layout diagram at the top of this file.
//
static
skc_ttpk_s_t
skc_ttpk_create(skc_raster_yx_s const yx_prev,
                skc_raster_yx_s const yx_next,
                skc_block_id_t  const pb_id)
{
  // - yx_prev is already incremented by one
  // - yx_span is already shifted up at hi.x
  skc_uint const yx_span = yx_next - yx_prev;

  skc_ttpk_s_t pk;

  // turn on prefix bit | shift span bits upward
  pk.lo = pb_id | SKC_TTXK_LO_MASK_PREFIX | (yx_span << SKC_TTPK_LO_SHL_YX_SPAN);

  // shift down high span bits | yx of tile
  pk.hi = (yx_span >> SKC_TTPK_HI_SHR_YX_SPAN) | yx_prev;

#if 0
  if (get_sub_group_local_id() == 0)
    printf("* %08v2X : %u\n",pk,yx_span);
#endif

  return pk;
}
634
635//
636// append a ttpk key to the work-in-progress node
637//
638
//
// Append a TTPK key to the work-in-progress node. Same flush/link
// sequencing as skc_node_v_append_sk but without the final-key case:
// a pk is never the last key appended (an sk always follows).
//
static
void
skc_node_v_append_pk(skc_ttpk_s_t            const * const pk_s,

                     skc_ttxk_v_t                  * const xk_v,
                     skc_uint                      * const xk_v_next,
                     skc_uint                      * const xk_v_idx,
                     __global skc_bp_elem_t        * const bp_elems,

                     skc_uint                      * const blocks_next,
                     skc_uint                      * const blocks_idx,
                     skc_block_id_v_t              * const blocks,
                     skc_uint                        const bp_mask,
                     __global skc_block_id_t const * const bp_ids)
{
  //
  // append a pk key to the in-register xk_v vector
  //
  // if the work-in-progress node in gmem will only have room for one
  // more key then:
  //
  //   - if this was the final SK then write out xk_v and exit
  //
  //   - otherwise, acquire a block id, link it, write out xk_v,
  //     prepare new node
  //
#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
  //
  // SIMT
  //
  // only the lane owning the next key slot captures it
  if (get_sub_group_local_id() == (*xk_v_next & SKC_PREFIX_TTXK_V_MASK))
    {
      *xk_v = *pk_s;
    }

  *xk_v_next += 1;

  // is the node almost full?
  if (*xk_v_next == SKC_RASTER_NODE_DWORDS - 1)
    {
      // acquire the next node block and link it in the final slot
      skc_block_id_t const id = skc_blocks_get_next(blocks_next,blocks_idx,blocks,bp_mask,bp_ids);

      if (get_sub_group_local_id() == SKC_PREFIX_TTXK_V_SIZE - 1)
        {
          xk_v->lo = id;
          xk_v->hi = SKC_UINT_MAX; // this initialization isn't necessary
        }

      // store xk_v to bp
      bp_elems[*xk_v_idx                         ] = xk_v->lo;
      bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi;
#if 0
      printf("P) %u : %08v2X\n",*xk_v_idx,*xk_v);
#endif
      // reinitialize xk_v
      xk_v->lo = SKC_UINT_MAX;
      xk_v->hi = SKC_UINT_MAX;

      // update node elem idx
      *xk_v_idx  = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();

      // reset node count
      *xk_v_next = 0;
    }
  // is xk_v full?
  else if ((*xk_v_next & SKC_PREFIX_TTXK_V_MASK) == 0)
    {
      // store xk_v to bp
      bp_elems[*xk_v_idx                         ] = xk_v->lo;
      bp_elems[*xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v->hi;
#if 0
      printf("p) %u : %08v2X\n",*xk_v_idx,*xk_v);
#endif
      // reinitialize xk_v
      xk_v->lo = SKC_UINT_MAX;
      xk_v->hi = SKC_UINT_MAX;

      // increment node elem idx
      *xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2;
    }

#else
  //
  // SIMD -- not implemented
  //
#endif
}
726
727//
728// append the first 3 fields of meta info to the raster header
729//
730
//
// Write the first fields of the cohort meta info into the raster
// header: lanes 0 and 1 capture meta.u32v4.lo/.hi respectively, and
// the key cursor starts past the header and bounds slots.
//
static
void
skc_node_v_init_header(skc_ttxk_v_t                           * const xk_v,
                       skc_uint                               * const xk_v_next,
                       union skc_raster_cohort_meta_out const * const meta)
{
#if ( SKC_PREFIX_SUBGROUP_SIZE > 1 )
  //
  // SIMT
  //
  if (get_sub_group_local_id() < 2)
    {
      *xk_v = ((get_sub_group_local_id() & 1) == 0) ? meta->u32v4.lo : meta->u32v4.hi;
    }

#if 0
  if (get_sub_group_local_id() == 0)
    printf("header: %08v4X\n",meta->u32v4);
#endif

  //
  // increment counter: uint4 + uint4 = uint2 x 4
  //
  *xk_v_next = 2 + 2; // +2 for uninitialized bounds

#else
  //
  // SIMD -- not implemented
  //

#endif
}
763
764//
765//
766//
767
//
// Prefix kernel: one subgroup per raster. Converts the raster's sorted
// TTRK keys into a linked list of nodes holding TTSK and TTPK keys,
// accumulating per-scanline TTS "altitudes" and flushing them as TTP
// prefix blocks whenever the scan moves to a new tile on the same row.
//
__kernel
SKC_PREFIX_KERNEL_ATTRIBS
void
skc_kernel_prefix(__global skc_uint       const * const bp_atomics,
                  __global skc_block_id_t const * const bp_ids,
                  __global skc_bp_elem_t        * const bp_elems,
                  skc_uint                        const bp_mask, // pow2 modulo mask for block pool ring
                  __global skc_ttrk_e_t   const * const rks,
                  __global skc_block_id_t       * const map,
                  __global skc_uint       const * const metas,
                  skc_uint                        const count)
{
  //
  // declare shared memory block
  //
#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS == 1 )
  __local struct skc_subgroup_smem                  smem[1];
#else
  __local struct skc_subgroup_smem                  smems[SKC_PREFIX_WORKGROUP_SUBGROUPS];
  __local struct skc_subgroup_smem * restrict const smem = smems + get_sub_group_id();
#endif

  //
  // where is this subgroup in the grid?
  //
#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS == 1 )
  skc_uint const sgi = get_group_id(0);
#else
  skc_uint const sgi = get_group_id(0) * SKC_PREFIX_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif

  skc_uint const sgl = get_sub_group_local_id();

  //
  // return if this subgroup is excess
  //
#if ( SKC_PREFIX_WORKGROUP_SUBGROUPS > 1 )
  if (sgi >= count)
    return;
#endif

  //
  // get meta info for this subgroup's raster
  //
  union skc_raster_cohort_meta_out const meta  = { vload4(sgi,metas) };
  skc_uint                         const reads = metas[SKC_RASTER_COHORT_META_OFFSET_READS + sgi];

#if 0
  if (get_sub_group_local_id() == 0)
    printf("%3u : %5u / %5u / %5u / %5u / %u\n",
           sgi,
           meta.blocks,
           meta.offset,
           meta.nodes,
           meta.keys,
           reads);
#endif

  //
  // preload blocks -- align on subgroup
  //
  skc_uint         blocks_idx  = (reads & ~SKC_PREFIX_SUBGROUP_MASK) + skc_subgroup_lane();
  skc_block_id_v_t blocks      = bp_ids[blocks_idx & bp_mask];
  skc_uint         blocks_next = (reads &  SKC_PREFIX_SUBGROUP_MASK);

  //
  // prime xk_v_idx with a block but note that OpenCL vstore_n() will scale the offset
  //
  skc_uint xk_v_idx = sub_group_broadcast(blocks,blocks_next++) * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();

  //
  // initialize raster header -- assumes block is greater than 8 words (4 doublewords)
  //
  skc_ttxk_v_t xk_v = { SKC_UINT_MAX, SKC_UINT_MAX };
  skc_uint     xk_v_next;

  skc_node_v_init_header(&xk_v,&xk_v_next,&meta);

  //
  // no keys -- this is an empty raster!
  //
  // write out the header and pad the rest of the block with invalid
  // keys, then exit
  //
  if (meta.keys == 0)
    {
      bp_elems[xk_v_idx                         ] = xk_v.lo;
      bp_elems[xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = xk_v.hi;

      while ((xk_v_idx & SKC_DEVICE_BLOCK_WORDS_MASK) < SKC_DEVICE_BLOCK_WORDS - SKC_PREFIX_SUBGROUP_SIZE * 2)
        {
          xk_v_idx += SKC_PREFIX_SUBGROUP_SIZE * 2;

          bp_elems[xk_v_idx]                          = SKC_UINT_MAX;
          bp_elems[xk_v_idx+SKC_PREFIX_SUBGROUP_SIZE] = SKC_UINT_MAX;
        }

      return;
    }

  //
  // load TTRK keys and in-place convert to TTSK keys
  //
  skc_uint         rks_next    = (meta.offset & ~SKC_PREFIX_SUBGROUP_MASK) + skc_subgroup_lane();
  skc_ttsk_v_t     sk_v        = rks[rks_next];
  skc_uint         sk_next     = (meta.offset & SKC_PREFIX_SUBGROUP_MASK);
  skc_int          rkpk_rem    = meta.keys; // signed count of remaining rk+pk keys

#if 0
  printf("* %08X ( %3u, %3u )\n",
         sk_v.hi,
         (sk_v.hi >> 12) & 0xFFF,
         (sk_v.hi      ) & 0xFFF);
#endif

  skc_ttrk_to_ttsk(&sk_v);

#if 0
  printf("! %08X ( %3u, %3u )\n",
         sk_v.hi,
         (sk_v.hi >> 20) & 0xFFF,
         (sk_v.hi >>  8) & 0xFFF);
#endif

  //
  // subblocks
  //
#if ( SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2 )
  skc_block_id_t subblocks = 0;
#endif

  //
  // begin "scan" of tiles
  //
  skc_raster_yx_s yx_prev = skc_ttsk_v_first(&sk_v,sk_next);

  //
  // zero the accumulator
  //
  skc_accum_reset(smem);

  while (true)
    {
      // get next rk key
      skc_ttsk_s_t     const sk_s  = skc_ttsk_v_get_next(&sk_v,&sk_next,&rkpk_rem);

      // load ttsb id
      skc_block_id_s_t const sb_id = skc_ttsk_s_get_ttsb_id(&sk_s);

      // load tts_v transaction "in flight" as early as possible
      skc_tts_v_t      const tts_v = skc_load_tts(bp_elems,sb_id);

#if 0
      printf("{ %08X }\n",tts_v);
#endif

#if 0
      if (get_sub_group_local_id() == 0)
        printf("[ %d, %X ]\n",rkpk_rem,sb_id);
#endif

#if 0
      if (get_sub_group_local_id() == 0)
        printf("@ %08X ( %3u, %3u )\n",sk_s.hi,(sk_s.hi >> 20),(sk_s.hi >> 8) & 0xFFF);
#endif

      //
      // FIXME -- SOME OF THESE COMPARISONS CAN BE PERFORMED AHEAD OF
      // TIME AND SIMD'IZED
      //

      // if yx's don't match then we're either issuing a ttpk or
      // resetting the accumulator
      if (sk_s.hi != yx_prev)
        {
          // if yx_next.y == yx_last.y then x changed
          if (((sk_s.hi ^ yx_prev) & SKC_TTXK_HI_MASK_Y) == 0)
            {
              //
              // if the tile is not square then its ratio is 1:2
              //
#if SKC_DEVICE_SUBBLOCK_WORDS_LOG2 < SKC_DEVICE_BLOCK_WORDS_LOG2
              skc_block_id_t const pb_id = skc_subblocks_get_next_pb_id(&subblocks,
                                                                        &blocks_next,
                                                                        &blocks_idx,
                                                                        &blocks,
                                                                        bp_mask,
                                                                        bp_ids);
#else
              skc_block_id_t const pb_id = skc_blocks_get_next(&blocks_next,
                                                               &blocks_idx,
                                                               &blocks,
                                                               bp_mask,
                                                               bp_ids);
#endif

              // flush accumulated ttp vector to block/subblock at ttpb_id
              skc_accum_flush(smem,bp_elems,pb_id);

#if 0
              if (get_sub_group_local_id() == 0)
                {
                  printf("%8u : ( %4u, %4u ) -> ( %4u, %4u )\n",
                         pb_id,
                         (yx_prev >> SKC_TTXK_HI_OFFSET_Y),
                         (yx_prev >> SKC_TTXK_HI_OFFSET_X) & 0xFFF,
                         (sk_s.hi >> SKC_TTXK_HI_OFFSET_Y) & 0xFFF,
                         (sk_s.hi >> SKC_TTXK_HI_OFFSET_X) & 0xFFF);
                }
#endif

              //
              // FIXME -- A SIMD-WIDE BLOCK OF TTPK KEYS CAN BE CREATED IN ONE STEP
              //
              // the pk consumes one key of the rk+pk budget
              rkpk_rem -= 1;

              // create the pk
              skc_ttpk_s_t const pk_s = skc_ttpk_create(yx_prev+SKC_TTXK_HI_ONE_X,sk_s.hi,pb_id);

              // append pk key to xk buffer
              skc_node_v_append_pk(&pk_s,

                                   &xk_v,
                                   &xk_v_next,
                                   &xk_v_idx,
                                   bp_elems,

                                   &blocks_next,
                                   &blocks_idx,
                                   &blocks,
                                   bp_mask,
                                   bp_ids);
            }
          else if (rkpk_rem > 0) // we're starting a new tile row
            {
              skc_accum_reset(smem);
            }
        }

      //
      // append sk key to node_v
      //
      // if rkpk_rem is zero then return from kernel
      //
      skc_node_v_append_sk(&sk_s,

                           &xk_v,
                           &xk_v_next,
                           &xk_v_idx,
                           bp_elems,

                           rkpk_rem,

                           &blocks_next,
                           &blocks_idx,
                           &blocks,
                           bp_mask,
                           bp_ids);

      // we're done if no more sk keys
      if (rkpk_rem == 0)
        break;

      // move to new tile
      yx_prev = sk_s.hi;

      // scatter tts values into accumulator
      skc_accum_scatter(smem,tts_v);

      // replenish sk keys
      skc_ttsk_v_replenish(&sk_v,&sk_next,&rks_next,rks);
    }
}
1038
1039//
1040//
1041//
1042