1/*
2 * Copyright 2017 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can
5 * be found in the LICENSE file.
6 *
7 */
8
9//
10//
11//
12
13#include "tile.h"
14#include "common.h"
15#include "raster.h"
16#include "atomic_cl.h"
17#include "kernel_cl_12.h"
18
19//
20//
21//
22
//
// SUBGROUP_MASK is used below to round a count up to a multiple of
// the subgroup size (assumes the subgroup size is a power of two --
// TODO confirm).  SUBGROUP_LAST is the index of the last lane in a
// subgroup.
//
#define SKC_PLACE_SUBGROUP_MASK      (SKC_PLACE_SUBGROUP_SIZE - 1)
#define SKC_PLACE_SUBGROUP_LAST      (SKC_PLACE_SUBGROUP_SIZE - 1)

//
// Capacity of the shared memory key queues.  The TTSK queue is never
// smaller than one subgroup of entries.
//

#define SKC_PLACE_SMEM_COUNT_TTSK    SKC_MAX_MACRO(SKC_RASTER_NODE_MAX_TTSK,SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_SMEM_COUNT_TTPK    SKC_RASTER_NODE_MAX_TTPK

//
// Number of subgroup-wide rows of dwords in one device block.
//

#define SKC_PLACE_X                  (SKC_DEVICE_BLOCK_DWORDS / SKC_PLACE_SUBGROUP_SIZE)

//
// Select the row expansion macro matching SKC_PLACE_X and record the
// index of the last expanded row.
//
// NOTE(review): there is no fallback branch -- any SKC_PLACE_X other
// than 1, 2, 4, 8 or 16 leaves SKC_PLACE_EXPAND() and
// SKC_PLACE_EXPAND_I_LAST undefined.
//

#if   ( SKC_PLACE_X == 1 )
#define SKC_PLACE_EXPAND()           SKC_EXPAND_1()
#define SKC_PLACE_EXPAND_I_LAST      0

#elif ( SKC_PLACE_X == 2 )
#define SKC_PLACE_EXPAND()           SKC_EXPAND_2()
#define SKC_PLACE_EXPAND_I_LAST      1

#elif ( SKC_PLACE_X == 4 )
#define SKC_PLACE_EXPAND()           SKC_EXPAND_4()
#define SKC_PLACE_EXPAND_I_LAST      3

#elif ( SKC_PLACE_X == 8 )
#define SKC_PLACE_EXPAND()           SKC_EXPAND_8()
#define SKC_PLACE_EXPAND_I_LAST      7

#elif ( SKC_PLACE_X == 16)
#define SKC_PLACE_EXPAND()           SKC_EXPAND_16()
#define SKC_PLACE_EXPAND_I_LAST      15
#endif
63
64//
65// PREFIX STORES THE 64-BIT KEYS WITH TWO 32-BIT SUBGROUP-WIDE
66// COALESCED WRITES.  LO FIRST, FOLLOWED BY HI.
67//
68// THIS SLIGHTLY COMPLICATES LOADING BY THE PLACE KERNEL IF THE
69// KERNELS USE DIFFERENT SUBGROUP SIZES.
70//
71// THE BENEFIT IS THAT THE RASTER RECLAIM KERNEL ONLY HAS TO LOAD THE
72// LO WORD OF THE KEY SINCE IT CONTAINS THE BLOCK ID.
73//
74// NOTE: AT THIS POINT, ONLY INTEL'S HD GRAPHICS ARCHITECTURE UNDER
75// OPENCL SUPPORTS SELECTING A SUBGROUP SIZE (8/16/32). VULKAN MAY
76// ONLY SUPPORT A SUBGROUP SIZE OF 16.
77//
78
//
// Addressing helpers used when loading a block's 64-bit keys:
//
//   SKC_PLACE_STRIDE_H(L)    - dword offset contributed by lane L
//   SKC_PLACE_STRIDE_V_LO(I) - dword offset of the LO words of row I
//   SKC_PLACE_STRIDE_V_HI(I) - dword offset of the HI words of row I
//
// Three variants are selected at compile time depending on how the
// prefix kernel's subgroup size compares to the place kernel's.
//
// Every macro argument is parenthesized so that an expression
// argument (e.g. "r+1") is evaluated as a unit.
//
#if    ( SKC_PREFIX_SUBGROUP_SIZE == SKC_PLACE_SUBGROUP_SIZE )

#define SKC_PLACE_STRIDE_H(L)              (L)
#define SKC_PLACE_STRIDE_V_LO(I)           ((I) * 2 * SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_STRIDE_V_HI(I)           (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE)

#elif  ( SKC_PREFIX_SUBGROUP_SIZE >  SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1

#define SKC_PLACE_SUBGROUP_RATIO           (SKC_PREFIX_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_SUBGROUP_RATIO_MASK      (SKC_PLACE_SUBGROUP_RATIO - 1)
#define SKC_PLACE_SUBGROUP_RATIO_SCALE(I)  (((I) / SKC_PLACE_SUBGROUP_RATIO) * 2 * SKC_PLACE_SUBGROUP_RATIO + ((I) & SKC_PLACE_SUBGROUP_RATIO_MASK))

#define SKC_PLACE_STRIDE_H(L)              (L)
#define SKC_PLACE_STRIDE_V_LO(I)           (SKC_PLACE_SUBGROUP_RATIO_SCALE(I) * SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_STRIDE_V_HI(I)           (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_RATIO * SKC_PLACE_SUBGROUP_SIZE)

#elif  ( SKC_PREFIX_SUBGROUP_SIZE <  SKC_PLACE_SUBGROUP_SIZE ) // same as above when ratio equals 1

#define SKC_PLACE_SUBGROUP_RATIO           (SKC_PLACE_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE)
#define SKC_PLACE_SUBGROUP_RATIO_MASK      (SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO - 1) // equal to prefix subgroup mask

#define SKC_PLACE_STRIDE_H(L)              (((L) & ~SKC_PLACE_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_PLACE_SUBGROUP_RATIO_MASK))
#define SKC_PLACE_STRIDE_V_LO(I)           ((I) * 2 * SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_STRIDE_V_HI(I)           (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO)

#endif
105
106//
107// A COARSE COMPILE-TIME GUARD -- WILL ONLY MATTER WHEN SUBGROUP SIZE
108// IS EQUAL TO THE RASTER HEADER SIZE (CURRENTLY 8)
109//
110
// true when every dword of row (i) lies inside the raster header
#define SKC_PLACE_IS_ALL_HEADER_ROW(i)   (((i)+1) * SKC_PLACE_SUBGROUP_SIZE <= SKC_RASTER_HEAD_DWORDS)

// true when row (i) starts at or beyond the end of the raster header
#define SKC_PLACE_IS_NOT_HEADER_ROW(i)   ( (i)    * SKC_PLACE_SUBGROUP_SIZE >= SKC_RASTER_HEAD_DWORDS)

// true when row (i) is the final row of a device block
#define SKC_PLACE_IS_TRAILING_ROW(i)     (((i)+1) * SKC_PLACE_SUBGROUP_SIZE == SKC_DEVICE_BLOCK_DWORDS)

// NOTE(review): this macro takes only (i) yet its expansion reads an
// identifier named `k` that must already be in scope at the point of
// use.  It expands to the same expression as
// SKC_PLACE_HEADER_LESS_THAN(i,k) below -- confirm whether it is
// still needed.
#define SKC_PLACE_IS_HEADER_ROW_KEY(i)   ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))


//
// Per-lane "is this element's index below (k)?" tests for row (i).
//
// Note: HEADER_LESS_THAN subtracts the header size first and
// purposefully lets the unsigned result wrap to a huge value for
// lanes that fall inside the header, so those lanes compare false.
//
#define SKC_PLACE_HEADER_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))
#define SKC_PLACE_NODE_LESS_THAN(i,k)   ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id()                          < (k))
125
126//
127// TTSK v2:
128//
129//  0                                       63
130//  | TTSB ID | PREFIX |  SPAN   |  X  |  Y  |
131//  +---------+--------+---------+-----+-----+
132//  |    27   | 1 (=0) | 12 (=0) | 12  | 12  |
133//
134//
135// TTPK v2:
136//
137//  0                                    63
138//  | TTPB ID | PREFIX | SPAN |  X  |  Y  |
139//  +---------+--------+------+-----+-----+
140//  |    27   | 1 (=1) |  12  | 12  | 12  |
141//
142//
143
144//
145// TTCK (32-BIT COMPARE) v1:
146//
147//  0                                                           63
148//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
149//  +----------------------+--------+--------+-------+-----+-----+
150//  |          30          |    1   |    1   |   18  |  7  |  7  |
151//
152//
153// TTCK (32-BIT COMPARE) v2:
154//
155//  0                                                           63
156//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
157//  +----------------------+--------+--------+-------+-----+-----+
158//  |          30          |    1   |    1   |   15  |  9  |  8  |
159//
160//
161// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile:
162//
163//  0                                                           63
164//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
165//  +----------------------+--------+--------+-------+-----+-----+
166//  |          27          |    1   |    1   |   18  |  9  |  8  |
167//
168
//
// Per-subgroup shared memory.
//
// Because this is a union, `scratch` shares its storage with the
// beginning of the key queues declared after it: writing `scratch`
// overwrites whatever queue bytes occupy the same location.
//
union skc_subgroup_smem
{
  skc_uint scratch[SKC_PLACE_SUBGROUP_SIZE]; // one scratch slot per subgroup lane

  struct {
    // "lo" words of the queued sk and pk keys
    struct {
      skc_ttsk_lo_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
      skc_ttpk_lo_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
    } lo;

    // "hi" words of the queued sk and pk keys, indexed like "lo"
    struct {
      skc_ttsk_hi_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
      skc_ttpk_hi_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
    } hi;

    // skc_uint span[SKC_PLACE_SMEM_COUNT_TTPK];
  };

};
188
189//
190// scatter scan max
191//
192static
193skc_int_v_t
194skc_scatter_scan_max(__local union skc_subgroup_smem  volatile * const smem,
195                     skc_int_v_t                                 const iss,
196                     skc_int_v_t                                 const ess)
197{
198  //
199  // prefix sums determine which lanes we're going to work on next
200  //
201  skc_pred_v_t const is_scratch_store = (iss > 0) && (ess < SKC_PLACE_SUBGROUP_SIZE);
202  skc_int_v_t  const scratch_idx      = max(ess,0);
203
204  //
205  // SIMT
206  //
207
208  //
209  // zero the volatile smem scratchpad using vector syntax
210  //
211  smem->scratch[get_sub_group_local_id()] = ( 0 );
212
213  //
214  // store source lane at starting lane
215  //
216  if (is_scratch_store) {
217    smem->scratch[scratch_idx] = get_sub_group_local_id();
218  }
219
220  //
221  // propagate lanes to right using max scan
222  //
223  skc_int_v_t const scratch = smem->scratch[get_sub_group_local_id()];
224  skc_int_v_t const source  = sub_group_scan_inclusive_max(scratch);
225
226  return source;
227}
228
229//
230//
231//
232
233static
234skc_bool
235skc_xk_clip(union skc_tile_clip const * const tile_clip,
236            skc_ttxk_t                * const xk)
237{
238  //
239  // clip the sk and pk keys
240  //
241  // if fully clipped then return false
242  //
243  // alternatively -- we can expand all these keys in place
244  //
245  // alternatively -- keep sk and pk keys segregated because sk
246  // represents the vast majority of keys and are easier to process.
247  // don't mess with the fastpath!
248  //
249  return false;
250}
251
252//
253//
254//
255
256static
257skc_ttck_t
258skc_sk_to_ck(__local union skc_subgroup_smem  volatile * const smem,
259             union skc_cmd_place              const    * const cmd,
260             skc_uint                                    const sk_idx)
261{
262  skc_uint const lo = smem->lo.sk[sk_idx]; // assumes prefix bit is 0
263  skc_uint const hi = smem->hi.sk[sk_idx];
264
265  skc_ttck_t ck;
266
267  ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id
268
269  // FIXME -- x and y should already be clipped and shifted
270  skc_uint const x = (cmd->tx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
271  skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;
272
273  ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y;
274
275  return ck;
276}
277
278static
279skc_ttck_t
280skc_pk_to_ck(__local union skc_subgroup_smem  volatile * const smem,
281             union skc_cmd_place              const    * const cmd,
282             skc_uint                                    const pk_idx,
283             skc_uint                                    const dx)
284{
285  skc_uint const lo = smem->lo.pk[pk_idx] & SKC_TTXK_LO_MASK_ID_PREFIX; // assumes prefix bit is 1
286  skc_uint const hi = smem->hi.pk[pk_idx];
287
288  skc_ttck_t ck;
289
290  ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id
291
292  // FIXME -- x and y should already be clipped and shifted
293  skc_uint const x = (cmd->tx + dx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
294  skc_uint const y = (cmd->ty +      SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;
295
296  ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y;
297
298  return ck;
299}
300
301//
302//
303//
304
305static
306void
307skc_ttsk_flush(__global SKC_ATOMIC_UINT         volatile * const place_atomics,
308               __global skc_ttck_t                       * const ck_extent,
309               __local union skc_subgroup_smem  volatile * const smem,
310               union skc_cmd_place              const    * const cmd,
311               skc_uint                         const            sk)
312{
313  //
314  // Pretty sure you can never ever have an sk count equal to 0
315  //
316  skc_uint ck_base = 0;
317
318  // last lane performs the block pool allocation with an atomic increment
319  if (get_sub_group_local_id() == 0) {
320    ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,sk);
321  }
322
323  // broadcast base to all lanes
324  ck_base = sub_group_broadcast(ck_base,0);
325
326  // convert sk keys to ck keys
327  for (skc_uint ii=get_sub_group_local_id(); ii<sk; ii+=SKC_PLACE_SUBGROUP_SIZE)
328    {
329      ck_extent[ck_base+ii] = skc_sk_to_ck(smem,cmd,ii);
330    }
331}
332
333//
334//
335//
336
337static
338skc_int
339skc_ttpk_get_span(__local union skc_subgroup_smem  volatile * const smem,
340                  skc_uint                                    const idx)
341{
342  skc_uint const lo      = smem->lo.pk[idx];
343  skc_uint const hi      = smem->hi.pk[idx];
344
345  skc_uint const span_lo = lo >> SKC_TTXK_LO_OFFSET_SPAN;
346  skc_uint const span_hi = (hi & SKC_BITS_TO_MASK(SKC_TTXK_HI_BITS_SPAN)) << SKC_TTXK_LO_BITS_SPAN;
347
348  return (span_lo | span_hi) + 1;
349}
350
351//
352//
353//
354
355static
356void
357skc_ttpk_flush(__global SKC_ATOMIC_UINT         volatile * const place_atomics,
358               __global skc_ttck_t                       * const ck_extent,
359               __local union skc_subgroup_smem  volatile * const smem,
360               union skc_cmd_place              const    * const cmd,
361               skc_uint                         const            pk)
362{
363  // bail out if pk queue is empty
364  if (pk == 0)
365    return;
366
367#if 0
368  if (get_sub_group_local_id() == 0)
369    printf("%u\n",pk);
370#endif
371
372  //
373  // FIXME -- this nested loop iterates over the queue processing a
374  // subgroup of 64-bit keys at a time.  This is probably not the most
375  // efficient approach so investigate how to store and iterate over a
376  // wider than subgroup (node-sized) queue of keys.
377  //
378
379  // round up so we work with full subgroups
380  skc_uint const pk_ru = (pk + SKC_PLACE_SUBGROUP_SIZE - 1) & ~SKC_PLACE_SUBGROUP_MASK;
381  skc_uint       ii    = 0;
382
383  // nested loop that expands all ttpk keys
384#if (SKC_PLACE_SMEM_COUNT_TTPK > SKC_PLACE_SUBGROUP_SIZE)
385  for (; ii<pk_ru; ii+=SKC_PLACE_SUBGROUP_SIZE)
386#endif
387    {
388      skc_uint idx  = ii + get_sub_group_local_id();
389      skc_int  span = 0;
390
391      // how many tiles does this ttpk span?
392      if (idx < pk)
393        span = skc_ttpk_get_span(smem,idx);
394
395      // we need inclusive, exclusive and total
396      skc_int iss = sub_group_scan_inclusive_add(span);
397      skc_int ess = iss - span;
398      skc_int rem = sub_group_broadcast(iss,SKC_PLACE_SUBGROUP_SIZE-1);
399
400      // printf("%u : %u\n",span,iss);
401      // continue;
402
403      // atomically allocate space for the pk keys
404      skc_uint ck_base = 0;
405
406      // last lane performs the block pool allocation with an atomic increment
407      if (get_sub_group_local_id() == 0) {
408        ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,rem);
409      }
410
411      // broadcast atomically allocated extent base to all lanes
412      skc_uint ck_idx = sub_group_broadcast(ck_base,0) + get_sub_group_local_id();
413
414      //
415      // FIXME -- this loop would probably be faster if the ttpk keys
416      // were held in registers and accessed with shuffles instead of
417      // SMEM loads
418      //
419
420      //
421      // loop until there are no more expanded pk keys
422      //
423      while (true)
424        {
425          skc_int const source = skc_scatter_scan_max(smem,iss,ess);
426          skc_int const dx     = get_sub_group_local_id() - intel_sub_group_shuffle(ess,source);
427
428          // store valid ck keys to gmem
429          if (get_sub_group_local_id() < rem) {
430            ck_extent[ck_idx] = skc_pk_to_ck(smem,cmd,ii+source,dx);
431          }
432
433          // decrement remainder
434          rem -= SKC_PLACE_SUBGROUP_SIZE;
435
436          if (rem <= 0)
437            break;
438
439          // increment/decrement indices
440          ck_idx += SKC_PLACE_SUBGROUP_SIZE;
441          iss    -= SKC_PLACE_SUBGROUP_SIZE;
442          ess    -= SKC_PLACE_SUBGROUP_SIZE;
443        }
444    }
445}
446
447//
448//
449//
450
451static
452skc_uint
453skc_ballot(skc_uint * const xk, skc_uint const is_xk)
454{
455#if 0
456  //
457  // FIXME -- when available, this should use the idiom:
458  //
459  //   ballot() + lane_mask_less_than_or_equal + popcount()
460  //
461  // Supported by:
462  //
463  //   - Vulkan 1.1 / SPIR-V 1.3
464  //   - CUDA
465  //   - AVX2 (SSE*?)
466  //
467#else
468  //
469  // otherwise, emulate with an inclusive scan (yuk)
470  //
471  skc_uint const prefix = sub_group_scan_inclusive_add(is_xk);
472
473  skc_uint const xk_idx = *xk + prefix - is_xk;
474
475  *xk += sub_group_broadcast(prefix,SKC_PLACE_SUBGROUP_LAST);
476
477#if 0
478  printf("< %3u >\n",xk_idx);
479#endif
480
481  return xk_idx;
482#endif
483}
484
485//
486//
487//
//
// PLACE KERNEL
//
// Each subgroup processes one place command: it walks the command's
// raster (a head block followed by `nodes` linked node blocks) and
// converts every sk/pk key it finds into composition keys appended to
// ck_extent.
//
//   bp_elems      : block pool storage holding the raster blocks
//   place_atomics : counter used to reserve slots in ck_extent
//   ck_extent     : destination array of composition keys
//   cmds          : place commands, indexed by subgroup
//   map           : maps a command's raster_h to its head block id
//   clip          : not referenced by the current body
//   count         : not referenced by the current body
//
// NOTE(review): cmd_idx is never compared against `count` here --
// presumably the launch grid is sized to exactly cover the commands;
// verify against the host-side launch code.
//
__kernel
SKC_PLACE_KERNEL_ATTRIBS
void
skc_kernel_place(__global skc_bp_elem_t                * const bp_elems,
                 __global SKC_ATOMIC_UINT     volatile * const place_atomics,
                 __global skc_ttck_t                   * const ck_extent,
                 __global union skc_cmd_place const    * const cmds,
                 __global skc_block_id_t               * const map,
                 skc_uint4                               const clip,
                 skc_uint                                const count)
{
  //
  // declare shared memory block -- one union per subgroup in the workgroup
  //
#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
  __local union skc_subgroup_smem  volatile                smem[1];
#else
  __local union skc_subgroup_smem  volatile                smem_wg[SKC_PLACE_WORKGROUP_SUBGROUPS];
  __local union skc_subgroup_smem  volatile * const smem = smem_wg + get_sub_group_id();
#endif

  //
  // This is a subgroup-centric kernel
  //
  // Which subgroup in the grid is this?
  //
  // TAKE NOTE: the Intel GEN compiler appears to be recognizing
  // get_group_id(0) as a uniform but the alternative calculation used
  // when there are multiple subgroups per workgroup is not
  // cooperating and driving spillage elsewhere.
  //
  // Test the raster's translated bounds against the composition's
  // tile clip
  //
  // There are 3 cases:
  //
  //   - the raster is completely clipped -> return
  //   - the raster is partially  clipped -> all keys must be clipped
  //   - the raster is not        clipped -> no keys are tested
  //
  // NOTE: none of these clip tests are implemented in the code below.
  //
  // There are at least 4 implementations of place and we want to
  // special-case them as much as possible so that, at the least, the
  // fastpath remains fast.
  //
  //  - implement NO CLIP + NO TRANSLATION fastpath -- CAN ATOMICALLY ALLOCATE SK+PK KEYS IN ONE STEP
  //
  //  - implement CLIPPED + NO TRANSLATION path
  //
  //  - implement NO CLIP +    TRANSLATION path
  //
  //  - implement CLIPPED +    TRANSLATION path
  //
  //
  // FIXME/OPTIMIZATION: split scan accumulator into a triple-bin
  // 12:12:8 integer where:
  //
  //  12: ttsk
  //  12: ttpk
  //   8: /dev/null -- clipped or invalid key
  //
  // Three kinds of nodes in a raster's list:
  //
  //  - the head node
  //  - an internal node
  //  - the final node
  //

#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
  skc_uint const cmd_idx = get_group_id(0);
#else
  skc_uint const cmd_idx = get_group_id(0) * SKC_PLACE_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif

  // load command
  union skc_cmd_place const cmd = cmds[cmd_idx];

  // get the raster header from the raster host id -- scalar
  skc_block_id_t            id  = map[cmd.raster_h];

  //
  // load all of the head block ttxk keys into registers
  //
  // each expanded row I becomes a register named h<I> holding this
  // lane's lo/hi word pair
  //
  // FIXME -- this pattern lends itself to using the higher
  // performance Intel GEN block load instructions
  //
  skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                 \
  union skc_raster_node_elem const h##I = {                     \
    .u32v2 = { bp_elems[head_id + SKC_PLACE_STRIDE_V_LO(I)],    \
               bp_elems[head_id + SKC_PLACE_STRIDE_V_HI(I)]  }  \
  };

  SKC_PLACE_EXPAND();

  //
  // load raster header counts -- we only need the "nodes" and "keys"
  // words but the keys we loaded are doublewords.
  //
  // both counts are taken from lane 1 of row 0: "nodes" is its lo
  // word and "keys" is its hi word
  //
  // FIXME -- this can be made portable with compile-time macro expansion
  //
  skc_uint nodes = sub_group_broadcast(h0.u32v2.lo,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES
  skc_uint keys  = sub_group_broadcast(h0.u32v2.hi,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS

  //
  // disabled debug dump of every loaded head element
  //
#if 0
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                 \
  printf("%5u :  %6u : %3u : %08X . %08X - %08X\n",             \
         nodes,keys,                                            \
         I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(),  \
         h##I.u32v2.hi,h##I.u32v2.lo,                           \
         h##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);

  SKC_PLACE_EXPAND();
#endif

  //
#if 0
  if (get_sub_group_local_id() == 0) {
    printf("place: %u / %u / %u\n",head_id,nodes,keys);
  }
#endif

  {
    //
    // classify every key in the header
    //
    // bit I of each mask describes this lane's element in row I:
    //
    // keys: 0 is not a key / 1 is a key
    // skpk: 0 is sk        / 1 is pk
    //
    skc_uint bits_keys = 0;
    skc_uint bits_skpk = 0;

    //
    // calculate bits_keys
    //
    // rows lying entirely inside the header are skipped.  the index
    // is made relative to the end of the header and wraps to a huge
    // unsigned value for header dwords, so they fail the (idx < keys)
    // test.  when more keys remain than the head block holds, the
    // last lane of the trailing row is cleared because that element
    // is read below as the next-node link rather than as a key.
    //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {                              \
      skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS; \
      if (idx < keys) {                                                 \
        bits_keys |= (1u << I);                                         \
      }                                                                 \
      if (SKC_PLACE_IS_TRAILING_ROW(I)) {                               \
        if (keys > SKC_RASTER_HEAD_COUNT_KEYS) {                        \
          if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) {    \
            bits_keys &= ~(1u << I);                                    \
          }                                                             \
        }                                                               \
      }                                                                 \
    }

    SKC_PLACE_EXPAND();

    //
    // blindly calculate bits_skpk -- the prefix bit of each lo word
    // is shifted down into bit I regardless of whether the element is
    // a valid key; invalid elements are masked off with bits_keys
    // further below
    //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {                              \
      bits_skpk |= (h##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
    }

    SKC_PLACE_EXPAND();

#if 0
    printf("%2X : %2X\n",bits_keys,bits_skpk);
#endif

    //
    // next pointer is last element of last row.  save it now because
    // this might be recognized as a subgroup-uniform/scalar.
    //
    id = sub_group_broadcast(SKC_CONCAT(h,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);

    //
    // append SK keys first -- valid keys whose prefix bit is clear
    // are compacted into the sk queue in smem
    //
    skc_uint const bits_sk = bits_keys & ~bits_skpk;
    skc_uint       sk      = 0;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                 \
    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {      \
      skc_uint is_sk  = (bits_sk >> I) & 1;     \
      skc_uint sk_idx = skc_ballot(&sk,is_sk);  \
      if (is_sk) {                              \
        smem->lo.sk[sk_idx] = h##I.xk.lo;       \
        smem->hi.sk[sk_idx] = h##I.xk.hi;       \
      }                                         \
    }

    SKC_PLACE_EXPAND();

    //
    // append PK keys next -- valid keys whose prefix bit is set are
    // compacted into the pk queue in smem
    //
    skc_uint const bits_pk = bits_keys & bits_skpk;
    skc_uint       pk      = 0;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                 \
    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {      \
      skc_uint is_pk  = (bits_pk >> I) & 1;     \
      skc_uint pk_idx = skc_ballot(&pk,is_pk);  \
      if (is_pk) {                              \
        smem->lo.pk[pk_idx] = h##I.xk.lo;       \
        smem->hi.pk[pk_idx] = h##I.xk.hi;       \
      }                                         \
    }

    SKC_PLACE_EXPAND();

#if 0
    printf("%2u * %2u\n",sk,pk);
#endif
    //
    // flush the keys -- sk first, then pk
    //
    skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
    skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);
  }

  //
  // we're done if there was only a head node
  //
  if (nodes == 0)
    return;

  //
  // decrement keys by the number of keys the head block holds
  //
  keys -= SKC_RASTER_HEAD_COUNT_KEYS;

  //
  // otherwise, append keys in trailing nodes to smem
  //
  while (true)
    {
      //
      // load all of the node block ttxk keys into registers
      //
      // each expanded row I becomes a register named n<I>
      //
      // FIXME -- this pattern lends itself to using the higher
      // performance Intel GEN block load instructions
      //
      skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
      union skc_raster_node_elem const n##I = {                         \
        .u32v2 = { bp_elems[node_id + SKC_PLACE_STRIDE_V_LO(I)],        \
                   bp_elems[node_id + SKC_PLACE_STRIDE_V_HI(I)]  }      \
      };

      SKC_PLACE_EXPAND();

#if 0
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
      printf("%5u :  %6u : %3u : %08X . %08X - %08X\n",                 \
             nodes,keys,                                                \
             I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(),      \
             n##I.u32v2.hi,n##I.u32v2.lo,                               \
             n##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);

      SKC_PLACE_EXPAND();
#endif

      //
      // classify every key in the node
      //
      // keys: 0 is not a key / 1 is a key
      // skpk: 0 is sk        / 1 is pk
      //
      skc_uint bits_keys = 0;
      skc_uint bits_skpk = 0;

      //
      // calculate bits_keys
      //
      // unlike the head block there is no header to skip, so the
      // element index is used directly.  when more keys remain than
      // this node holds, the last lane of the trailing row is cleared
      // because that element is read below as the next-node link.
      //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) {                                       \
        skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(); \
        if (idx < keys) {                                               \
          bits_keys |= (1u << I);                                       \
        }                                                               \
        if (SKC_PLACE_IS_TRAILING_ROW(I)) {                             \
          if (keys > SKC_RASTER_NODE_COUNT_KEYS) {                      \
            if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) {  \
              bits_keys &= ~(1u << I);                                  \
            }                                                           \
          }                                                             \
        }                                                               \
      }

      SKC_PLACE_EXPAND();

      //
      // blindly calculate bits_skpk
      //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) {                                       \
        bits_skpk |= (n##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
      }

      SKC_PLACE_EXPAND();

#if 0
      printf("%2X : %2X\n",bits_keys,bits_skpk);
#endif

      //
      // next pointer is last element of last row.  save it now because
      // this might be recognized as a subgroup-uniform/scalar.
      //
      id = sub_group_broadcast(SKC_CONCAT(n,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);

      //
      // append SK keys first
      //
      skc_uint const bits_sk = bits_keys & ~bits_skpk;
      skc_uint       sk      = 0;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) {                       \
        skc_uint is_sk  = (bits_sk >> I) & 1;           \
        skc_uint sk_idx = skc_ballot(&sk,is_sk);        \
        if (is_sk) {                                    \
          smem->lo.sk[sk_idx] = n##I.xk.lo;             \
          smem->hi.sk[sk_idx] = n##I.xk.hi;             \
        }                                               \
      }

      SKC_PLACE_EXPAND();

      //
      // append PK keys next
      //
      skc_uint const bits_pk = bits_keys & bits_skpk;
      skc_uint       pk      = 0;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) {                       \
        skc_uint is_pk  = (bits_pk >> I) & 1;           \
        skc_uint pk_idx = skc_ballot(&pk,is_pk);        \
        if (is_pk) {                                    \
          smem->lo.pk[pk_idx] = n##I.xk.lo;             \
          smem->hi.pk[pk_idx] = n##I.xk.hi;             \
        }                                               \
      }

      SKC_PLACE_EXPAND();

#if 0
    printf("%2u * %2u\n",sk,pk);
#endif
      //
      // flush both queues to the extent after every node -- the
      // queues restart from zero on each iteration, so no high-water
      // mark check is performed here
      //
      skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
      skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);

      //
      // if this was the last node then we're done
      //
      if (--nodes == 0)
        return;

      //
      // otherwise decrement keys by the number of keys a node holds
      //
      keys -= SKC_RASTER_NODE_COUNT_KEYS;
    }
}
868
869//
870//
871//
872