cl_12/kernels/rasterize.cl

/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */

//
//
//

#include "tile.h"
#include "common.h"
#include "atomic_cl.h"
#include "block_pool_cl.h"
#include "raster_builder_cl_12.h"
#include "kernel_cl_12.h"

// #define SKC_ARCH_AVX2
// #define SKC_RASTERIZE_SIMD_USES_SMEM

#define PRINTF_ENABLE       0
#define PRINTF_BLOCK_COUNT  0

//
// NOTE:
//
// ON SIMD DEVICES THE BIN COUNT MUST BE POW2 SO THAT WE CAN LOAD IT
// AS A VECTOR AND PERFORM A SWIZZLE/SHUFFLE
//
// NOTE:
//
// IGNORE FOR NOW ANY AVX2 CODE SNIPPETS.  THEY WILL BE MOVED ASAP.
//
//

#if 0 // SKC_ARCH_AVX2

// #define SKC_RASTERIZE_SUBGROUP_SIZE              1
// #define SKC_RASTERIZE_VECTOR_SIZE_LOG2           3
// #define SKC_RASTERIZE_WORKGROUP_COUNT_SUBGROUP   1

// #define SKC_TTXB_WORDS                           8

// #define SKC_RASTERIZE_FLOAT                      float8
// #define SKC_RASTERIZE_UINT                       uint8
// #define SKC_RASTERIZE_INT                        int8
// #define SKC_RASTERIZE_PREDICATE                  int8

// #define SKC_RASTERIZE_BIN_BLOCK                  uint16
// #define SKC_RASTERIZE_BIN                        uint8

// #define SKC_RASTERIZE_POOL                       uint8
// #define SKC_RASTERIZE_POOL_SCALE                 6

// #define SKC_RASTERIZE_TILE_HASH_X_BITS           1
// #define SKC_RASTERIZE_TILE_HASH_Y_BITS           2

// #define SKC_RASTERIZE_VECTOR_EXPAND()            SKC_EXPAND_8()

#endif

//
// SIMT
//

#define SKC_RASTERIZE_BLOCK_ID_V_SIZE        SKC_RASTERIZE_SUBGROUP_SIZE
#define SKC_RASTERIZE_TTSK_V_SIZE            SKC_RASTERIZE_SUBGROUP_SIZE
#define SKC_RASTERIZE_TTSK_V_MASK            (SKC_RASTERIZE_TTSK_V_SIZE - 1)

//
//
//

#define SKC_RASTERIZE_VECTOR_SIZE            (1 << SKC_RASTERIZE_VECTOR_SIZE_LOG2)
#define SKC_RASTERIZE_ELEMS_PER_SUBGROUP     (SKC_RASTERIZE_SUBGROUP_SIZE * SKC_RASTERIZE_VECTOR_SIZE)

//
//
//

#define SKC_RASTERIZE_YX_INIT                0x7FFF7FFF  // { +32767, +32767 }
#define SKC_RASTERIZE_YX_INVALID             0x80008000  // { -32768, -32768 }

//
//
//

#define SKC_RASTERIZE_TILE_HASH_X_MASK       SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_X_BITS)
#define SKC_RASTERIZE_TILE_HASH_Y_MASK       SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_Y_BITS)
#define SKC_RASTERIZE_TILE_HASH_BITS         (SKC_RASTERIZE_TILE_HASH_X_BITS + SKC_RASTERIZE_TILE_HASH_Y_BITS)
#define SKC_RASTERIZE_TILE_HASH_BIN_COUNT    (1 << SKC_RASTERIZE_TILE_HASH_BITS)
#define SKC_RASTERIZE_TILE_HASH_BIN_BITS     (SKC_RASTERIZE_TILE_HASH_BITS + 1) // FIXME -- LOG2_RU(BIN_COUNT)
#define SKC_RASTERIZE_TILE_HASH_BIN_MASK     SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_BIN_BITS)

//
// Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++"
//
// https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/
//
// Lerp in two fma/mad ops:
//
//    t * b + ((-t) * a + a)
//
// Note: OpenCL documents mix() as being implemented as:
//
//    a + (b - a) * t
//
// But this may be a native instruction on some devices. For example,
// on GEN9 there is an LRP "linear interoplation" opcode but it
// doesn't appear to support half floats.
//
// Feel free to toggle this option and then benchmark and inspect the
// generated code.  We really want the double FMA to be generated when
// there isn't support for a LERP/MIX operation.
//

#if 1
#define SKC_LERP(a,b,t)      mad(t,b,mad(-(t),a,a))
#else
#define SKC_LERP(a,b,t)      mix(a,b,t)
#endif

//
// There is no integer MAD in OpenCL with "don't care" overflow
// semantics.
//
// FIXME -- verify if the platform needs explicit MAD operations even
// if a "--fastmath" option is available at compile time.  It might
// make sense to explicitly use MAD calls if the platform requires it.
//

#if 1
#define SKC_MAD_UINT(a,b,c)  ((a) * (b) + (c))
#else
#define SKC_MAD_UINT(a,b,c)  mad_sat(a,b,c)
#endif

//
//
//

#define SKC_RASTERIZE_SEGMENT(id) (id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane())

//
//
//

union skc_bp_elem
{
  skc_uint              u32;
  skc_tagged_block_id_t tag_id;
  skc_float             coord;
};

//
//
//

struct skc_subgroup_smem
{
  //
  // SIMT subgroup scratchpad for max scan -- also shared with 'winner' member
  //
#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 ) || defined ( SKC_RASTERIZE_SIMD_USES_SMEM )
  struct {
    union {

      skc_uint                winner;

      struct {
        skc_uint              scratch[SKC_RASTERIZE_SUBGROUP_SIZE];
      } aN;

      struct {
        SKC_RASTERIZE_UINT    scratch[SKC_RASTERIZE_SUBGROUP_SIZE];
      } vN;
    };
  } subgroup;
#endif

  //
  // work-in-progress TTSB blocks and associated YX keys
  //
  union {
    struct {
      // FIXME -- some typedefs are valid here
      skc_uint                ttsb [SKC_RASTERIZE_TILE_HASH_BIN_COUNT][SKC_DEVICE_SUBBLOCK_WORDS];
      skc_uint                yx   [SKC_RASTERIZE_TILE_HASH_BIN_COUNT];
      skc_uint                id   [SKC_RASTERIZE_TILE_HASH_BIN_COUNT];
      skc_uint                count[SKC_RASTERIZE_TILE_HASH_BIN_COUNT];
    } aN;
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
    struct {
      SKC_RASTERIZE_BIN_BLOCK ttsb[SKC_RASTERIZE_TILE_HASH_BIN_COUNT];
      SKC_RASTERIZE_BIN       yx;
      SKC_RASTERIZE_BIN       id;
      SKC_RASTERIZE_BIN       count;
    } vN;
#endif
  } bin;
};

//
//
//

#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
#define skc_subgroup_lane()  0
#else
#define skc_subgroup_lane()  get_sub_group_local_id()
#endif

//
//
//

#define SKC_PROJECT(tv,x,y,xp,yp)                                       \
  {                                                                     \
    float const d = native_recip(fma(x,tv->w0,fma(y,tv->w1,1.0f)));     \
    xp *= d;                                                            \
    yp *= d;                                                            \
  }

//
// replenish block ids
//
// note that you can't overrun the block id pool since it's a ring
//

static
void
skc_blocks_replenish(skc_uint                           * const blocks_next,
                     skc_block_id_v_t                   * const blocks,
                     __global SKC_ATOMIC_UINT  volatile * const bp_atomics,
                     skc_uint                             const bp_mask, // pow2 modulo mask for block pool ring
                     __global skc_block_id_t   const    * const bp_ids)
{
  //
  // get a new vector of block ids -- this is kind of a narrow
  // allocation but subblocks help stretch out the pool.
  //
  // FIXME -- there is now plenty of SMEM to allocate a LOT of block ids
  //
  skc_uint bp_idx = 0;

  if (skc_subgroup_lane() == 0)
    {
      bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,
                                                    SKC_RASTERIZE_BLOCK_ID_V_SIZE); // ring_reads
#if 0
      printf("r+: %8u + %u\n",bp_idx,SKC_RASTERIZE_BLOCK_ID_V_SIZE);
#endif
    }

  bp_idx       = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane()) & bp_mask;
  *blocks      = bp_ids[bp_idx];
  *blocks_next = 0;
}

//
//
//

static
skc_block_id_t
skc_blocks_get_next(skc_uint                           * const blocks_next,
                    skc_block_id_v_t                   * const blocks,
                    __global SKC_ATOMIC_UINT  volatile * const bp_atomics,
                    skc_uint                             const bp_mask, // pow2 modulo mask for block pool ring
                    __global skc_block_id_t   const    * const bp_ids)
{
  // replenish?
  if (*blocks_next == SKC_RASTERIZE_BLOCK_ID_V_SIZE)
    {
      skc_blocks_replenish(blocks_next,blocks,bp_atomics,bp_mask,bp_ids);
    }

#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 )
  //
  // SIMT
  //
  skc_block_id_t id = sub_group_broadcast(*blocks,*blocks_next);

#else
  //
  // SIMD
  //
  skc_block_id_t id = blocks->s0;

  skc_shuffle_down_1(*blocks);

#endif

  *blocks_next += 1;

  return id;
}

//
// subblock allocator
//

#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2

static
skc_block_id_t
skc_subblocks_get_next(skc_block_id_t                     * const subblocks,
                       skc_uint                           * const blocks_next,
                       skc_block_id_v_t                   * const blocks,
                       __global SKC_ATOMIC_UINT  volatile * const bp_atomics,
                       skc_uint                             const bp_mask, // pow2 modulo mask for block pool ring
                       __global skc_block_id_t   const    * const bp_ids)
{
  if ((*subblocks & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
    {
      *subblocks = skc_blocks_get_next(blocks_next,blocks,bp_atomics,bp_mask,bp_ids);
    }

  skc_block_id_t const sb_id = *subblocks;

  *subblocks += 1;

#if 0
  if (get_sub_group_local_id() == 0)
    printf("= %u\n",sb_id);
#endif

  return sb_id;
}


#define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_t * const subblocks, skc_block_id_t * const blocks
#define SKC_SUBBLOCKS_BLOCKS_ARGS()  subblocks, blocks

#else

#define SKC_SUBBLOCKS_BLOCKS_PROTO() skc_block_id_t * const blocks
#define SKC_SUBBLOCKS_BLOCKS_ARGS()  blocks

#endif

//
//
//

static
skc_block_id_t
skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_PROTO(),
                  skc_uint                           * const blocks_next,
                  __global SKC_ATOMIC_UINT  volatile * const bp_atomics,
                  skc_uint                             const bp_mask, // pow2 modulo mask for block pool ring
                  __global skc_block_id_t   const    * const bp_ids,
                  __global SKC_ATOMIC_UINT  volatile * const cohort_atomics,
                  skc_ttsk_v_t                       * const sk_v,
                  skc_uint                           * const sk_v_next,
                  __global skc_ttsk_s_t              * const sk_extent,
                  skc_uint                             const new_yx)
{
#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
  skc_block_id_t const new_id = skc_subblocks_get_next(subblocks,
                                                       blocks_next,
                                                       blocks,
                                                       bp_atomics,
                                                       bp_mask,
                                                       bp_ids);
#else
  skc_block_id_t const new_id = skc_blocks_get_next(blocks_next,
                                                    blocks,
                                                    bp_atomics,
                                                    bp_mask, // pow2 modulo mask for block pool ring
                                                    bp_ids);
#endif

  if (get_sub_group_local_id() == (*sk_v_next & SKC_RASTERIZE_TTSK_V_MASK))
    {
      sk_v->lo = new_id;
      sk_v->hi = (sk_v->hi & SKC_TTRK_HI_MASK_COHORT) | new_yx;
#if 0
      printf("@ ( %3u, %3u ) %u\n",
             (new_yx >> 12) & 0xFFF,
             (new_yx      ) & 0xFFF,
             new_id);
#endif
    }

  *sk_v_next += 1;

  if (*sk_v_next == SKC_RASTERIZE_TTSK_V_SIZE)
    {
      *sk_v_next = 0;

      skc_uint sk_idx = 0;

      if (skc_subgroup_lane() == 0)
        {
          sk_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE
            (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_TTSK_V_SIZE);
#if 0
          printf("+ %u\n",sk_idx);
#endif
        }

      sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane();

#if ( SKC_RASTERIZE_SUBGROUP_SIZE > SKC_RASTERIZE_TTSK_V_SIZE )
      if (skc_subgroup_lane() < SKC_RASTERIZE_TTSK_V_SIZE)
#endif
        {
          sk_extent[sk_idx] = *sk_v;
#if 0
          printf("> %u : %v2u\n",sk_idx,*sk_v);
#endif
        }
    }

  return new_id;
}

//
//
//

static
SKC_RASTERIZE_FLOAT
skc_subgroup_scan_inclusive_add_float(SKC_RASTERIZE_FLOAT const v)
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
  // Note that there isn't a built-in horizontal scan for vectors so
  // we'll define some here for various widths.
  //
  // FIXME -- a scalar version might be faster so put in a
  // compile-time switch to selection between implementations
  //

#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
  return v;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
  // 01
  //  0 +
  // --
  // 01
  SKC_RASTERIZE_FLOAT const w = mad(v.s10,(SKC_RASTERIZE_FLOAT)(0,1),v);
  return w;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
  // 0123
  //  012 +
  // ----
  // 0123
  //   01 +
  // ----
  // 0123
  //
  SKC_RASTERIZE_FLOAT const w = mad(v.s3012,(SKC_RASTERIZE_FLOAT)(0,1,1,1),v);
  SKC_RASTERIZE_FLOAT const x = mad(w.s2301,(SKC_RASTERIZE_FLOAT)(0,0,1,1),w);
  return x;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
  // 01234567
  //  0123456 +
  // --------
  // 01234567
  //   012345 +
  // --------
  // 01234567
  //     0123 +
  // --------
  // 01234567
  //
  SKC_RASTERIZE_FLOAT const w = mad(v.s70123456,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1),v);
  SKC_RASTERIZE_FLOAT const x = mad(w.s67012345,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1),w);
  SKC_RASTERIZE_FLOAT const y = mad(x.s45670123,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1),x);
  return y;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
  // 0123456789abcdef
  //  0123456789abcde +
  // ----------------
  // 0123456789abcdef
  //   0123456789abcd +
  // ----------------
  // 0123456789abcdef
  //     0123456789ab +
  // ----------------
  // 0123456789abcdef
  //         01234567 +
  // ----------------
  // 0123456789abcdef
  //
  SKC_RASTERIZE_FLOAT const w = mad(v.sf0123456789abcde,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v);
  SKC_RASTERIZE_FLOAT const x = mad(w.sef0123456789abcd,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w);
  SKC_RASTERIZE_FLOAT const y = mad(x.scdef0123456789ab,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x);
  SKC_RASTERIZE_FLOAT const z = mad(y.s89abcdef01234567,(SKC_RASTERIZE_FLOAT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y);
  return z;

#endif

#else
  //
  // SIMT
  //

  return sub_group_scan_inclusive_add(v);

#endif
}

//
//
//

static
SKC_RASTERIZE_UINT
skc_subgroup_scan_inclusive_add_uint(SKC_RASTERIZE_UINT const v)
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
  // Note that there isn't a built-in horizontal scan for vectors so
  // we'll define some here for various widths.
  //
  // FIXME -- a scalar version might be faster so put in a
  // compile-time switch to selection between implementations
  //

#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
  return v;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
  // 01
  //  0 +
  // --
  // 01
  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s10,(SKC_RASTERIZE_UINT)(0,1),v);
  return w;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
  // 0123
  //  012 +
  // ----
  // 0123
  //   01 +
  // ----
  // 0123
  //
  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s3012,(SKC_RASTERIZE_UINT)(0,1,1,1),v);
  SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s2301,(SKC_RASTERIZE_UINT)(0,0,1,1),w);
  return x;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
  // 01234567
  //  0123456 +
  // --------
  // 01234567
  //   012345 +
  // --------
  // 01234567
  //     0123 +
  // --------
  // 01234567
  //
  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s70123456,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1),v);
  SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s67012345,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1),w);
  SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.s45670123,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1),x);
  return y;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
  // 0123456789abcdef
  //  0123456789abcde +
  // ----------------
  // 0123456789abcdef
  //   0123456789abcd +
  // ----------------
  // 0123456789abcdef
  //     0123456789ab +
  // ----------------
  // 0123456789abcdef
  //         01234567 +
  // ----------------
  // 0123456789abcdef
  //
  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.sf0123456789abcde,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v);
  SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.sef0123456789abcd,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w);
  SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.scdef0123456789ab,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x);
  SKC_RASTERIZE_UINT const z = SKC_MAD_UINT(y.s89abcdef01234567,(SKC_RASTERIZE_UINT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y);
  return z;

#endif

#else
  //
  // SIMT
  //

  return sub_group_scan_inclusive_add(v);

#endif
}

//
//
//

static
SKC_RASTERIZE_UINT
skc_subgroup_scan_inclusive_max(SKC_RASTERIZE_UINT const v)
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
  // Note that there isn't a built-in horizontal scan for vectors so
  // we'll define some here for various widths.
  //
  // FIXME -- a scalar version might be faster so put in a
  // compile-time switch to selection between implementations
  //

#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
  return v;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
  // 01
  // 00 max
  // --
  // 01
  SKC_RASTERIZE_UINT const w = max(v.s00,v);
  return w;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
  // 0123
  // 0012 +
  // ----
  // 0123
  // 0101 +
  // ----
  // 0123
  //
  SKC_RASTERIZE_UINT const w = max(v.s0012,v);
  SKC_RASTERIZE_UINT const x = max(w.s0101,w);
  return x;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
  // 01234567
  // 00123456 +
  // --------
  // 01234567
  // 01012345 +
  // --------
  // 01234567
  // 01230123 +
  // --------
  // 01234567
  //
  SKC_RASTERIZE_UINT const w = max(v.s00123456,v);
  SKC_RASTERIZE_UINT const x = max(w.s01012345,w);
  SKC_RASTERIZE_UINT const y = max(x.s01230123,x);
  return y;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
  // 0123456789abcdef
  // 00123456789abcde +
  // ----------------
  // 0123456789abcdef
  // 010123456789abcd +
  // ----------------
  // 0123456789abcdef
  // 01230123456789ab +
  // ----------------
  // 0123456789abcdef
  // 0123456701234567 +
  // ----------------
  // 0123456789abcdef
  //
  SKC_RASTERIZE_UINT const w = max(v.s00123456789abcde,v);
  SKC_RASTERIZE_UINT const x = max(w.s010123456789abcd,w);
  SKC_RASTERIZE_UINT const y = max(x.s01230123456789ab,x);
  SKC_RASTERIZE_UINT const z = max(y.s0123456701234567,y);
  return z;

#endif

#else
  //
  // SIMT
  //

  return sub_group_scan_inclusive_max(v);

#endif
}

//
//
//

static
float
skc_subgroup_last_float(SKC_RASTERIZE_FLOAT const v)
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
  return v;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
  return v.s1;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
  return v.s3;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
  return v.s7;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
  return v.sf;
#endif

#else
  //
  // SIMT
  //
  return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1);

#endif
}

//
//
//

static
SKC_RASTERIZE_UINT
skc_subgroup_last_uint(SKC_RASTERIZE_UINT const v)
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
  return v;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
  return v.s1;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
  return v.s3;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
  return v.s7;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
  return v.sf;
#endif

#else
  //
  // SIMT
  //
  return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1);

#endif
}

//
//
//

static
float
skc_subgroup_first(SKC_RASTERIZE_FLOAT const v)
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
  return v;
#else
  return v.s0;
#endif

#else
  //
  // SIMT
  //
  return sub_group_broadcast(v,0);

#endif
}

//
//
//

static
SKC_RASTERIZE_FLOAT
skc_subgroup_shuffle(SKC_RASTERIZE_FLOAT const v,
                      SKC_RASTERIZE_UINT  const i)
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
  return v;
#else
  return shuffle(v,i);
#endif

#else
  //
  // SIMT
  //
  return intel_sub_group_shuffle(v,i);

#endif
}

//
//
//

static
SKC_RASTERIZE_FLOAT
skc_subgroup_shuffle_up_1(SKC_RASTERIZE_FLOAT const p, // previous
                          SKC_RASTERIZE_FLOAT const c) // current
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
  // FIXME -- there are alternative formulations here:
  //
  // Option 1:
  //
  //   select(c.rotate(+1),p.rotate(-1),(1,0,0,...))
  //
  // Option 2:
  //
  //   p is a scalar
  //   t    = c.rotate(+1)
  //   t.s0 = p;
  //
  // Option 3: ...
  //
#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
  return p;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
  return shuffle2(p,c,(uint2)(1,2));
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
  return shuffle2(p,c,(uint4)(3,4,5,6));
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
  return shuffle2(p,c,(uint8)(7,8,9,10,11,12,13,14));
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
  return shuffle2(p,c,(uint16)(15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30));
#endif

#else
  //
  // SIMT
  //
  return intel_sub_group_shuffle_up(p,c,1);

#endif
}

//
//
//

static
bool
skc_is_lane_first()
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1)
  //
  // SIMD
  //
  return true;
#else
  //
  // SIMT
  //
  return get_sub_group_local_id() == 0;
#endif
}

//
//
//

static
SKC_RASTERIZE_FLOAT
skc_delta_offset()
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
  return 1;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
  return (SKC_RASTERIZE_FLOAT)( 1, 2 );
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
  return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4 );
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
  return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8 );
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
  return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 );
#endif

#else
  //
  // SIMT
  //
  return 1.0f + get_sub_group_local_id();

#endif

}

//
//
//

static
int
skc_subgroup_any(SKC_RASTERIZE_PREDICATE const p)
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
  return any(p);
#else
  //
  // SIMT
  //
  return sub_group_any(p);
#endif
}

//
//
//

#define SKC_PATH_NODEWORD_IS_LAST(n)  (((n) & SKC_DEVICE_BLOCK_WORDS_MASK) == SKC_DEVICE_BLOCK_WORDS_MASK)

void
skc_segment_next(__global union skc_bp_elem * const bp_elems,
                 skc_uint                   * const nodeword,
                 skc_block_id_t             * const id)
{
  if ((++*id & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
    {
      if (SKC_PATH_NODEWORD_IS_LAST(++*nodeword))
        {
          *nodeword = SKC_TAGGED_BLOCK_ID_GET_ID(bp_elems[*nodeword].tag_id) * SKC_DEVICE_SUBBLOCK_WORDS;
        }

      skc_tagged_block_id_t const tag_id = bp_elems[*nodeword].tag_id;

      *id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
    }
}

//
//
//

static
SKC_RASTERIZE_FLOAT
skc_native_length(SKC_RASTERIZE_FLOAT const x, SKC_RASTERIZE_FLOAT const y)
{
  return native_sqrt(x * x + y * y);
}

//
// Wang's Formula (1985)
//

#define SKC_WANG_PIXEL_RESL   0.25f // <-- this can be tuned

#define SKC_WANG_EPSILON      (SKC_WANG_PIXEL_RESL * SKC_SUBPIXEL_RESL_X_F32)

#define SKC_WANG_CUBIC        ((3.0f * 2.0f) / (8.0f * SKC_WANG_EPSILON))
#define SKC_WANG_QUADRATIC    ((2.0f       ) / (8.0f * SKC_WANG_EPSILON))

#define SKC_WANG_LENGTH(x,y)  skc_native_length(x,y)
#define SKC_WANG_SQRT(x)      native_sqrt(x)

//
//
//

static
SKC_RASTERIZE_FLOAT
skc_wangs_formula_cubic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y,
                        SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y,
                        SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y,
                        SKC_RASTERIZE_FLOAT const t3x, SKC_RASTERIZE_FLOAT const t3y)
{
  //
  // Return the number of evenly spaced (in the parametric sense) line
  // segments that are guaranteed to be within "epsilon" error of the
  // curve.
  //
  // We're then going to take multiples of the reciprocal of this
  // number so that the segmentation can be distributed across the
  // subgroup.
  //
  // Note, this can probably be slightly optimized per architecture
  // but it's probably far from being a hotspot since it's all
  // straight-line unpredicated code.
  //
  // The result is an integer ranging from [1.0,#segments]
  //
  // Note that even if all of the control points are coincident, the
  // max(1.0f) will categorize this as a line of 1 segment.
  //
  // This is what we want!  We want to convert cubics to lines as
  // easily as possible and *then* cull lines that are either
  // horizontal or zero length.
  //
  return max(1.0f,
             ceil(SKC_WANG_SQRT(SKC_WANG_CUBIC *
                                SKC_WANG_LENGTH(max(fabs(t2x - 2.0f * t1x + t0x),
                                                    fabs(t3x - 2.0f * t2x + t1x)),
                                                max(fabs(t2y - 2.0f * t1y + t0y),
                                                    fabs(t3y - 2.0f * t2y + t1y))))));
}

static
SKC_RASTERIZE_FLOAT
skc_wangs_formula_quadratic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y,
                            SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y,
                            SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y)
{
  return max(1.0f,
             ceil(SKC_WANG_SQRT(SKC_WANG_QUADRATIC *
                                SKC_WANG_LENGTH(t2x - 2.0f * t1x + t0x,
                                                t2y - 2.0f * t1y + t0y))));
}

//
// rational curves
//

static
SKC_RASTERIZE_FLOAT
skc_wangs_formula_cubic_rat()
{
  return 0.0f;
}

static
SKC_RASTERIZE_FLOAT
skc_wangs_formula_quad_rat()
{
  return 0.0f;
}

//
// flush any work-in-progress blocks and return unused block ids
//

static
void
skc_finalize(__global SKC_ATOMIC_UINT          volatile * const bp_atomics,
             __global union skc_bp_elem                 * const bp_elems,
             __global uint                              * const bp_ids,
             skc_uint                                     const bp_mask,
             __global SKC_ATOMIC_UINT          volatile * const cohort_atomics,
             skc_block_id_v_t                           * const blocks,
             skc_uint                                     const blocks_next,
             skc_ttsk_v_t                               * const sk_v,
             skc_uint                                     const sk_v_next,
             __global skc_ttsk_s_t                      * const sk_extent,
             __local  struct skc_subgroup_smem volatile * const smem)
{
  //
  // flush non-empty bins
  //
  // FIXME -- accelerate this iteration/search with a subgroup operation
  //
  for (skc_uint ii=0; ii<SKC_RASTERIZE_TILE_HASH_BIN_COUNT; ii++)
    {
      if (smem->bin.aN.count[ii] > 0)
        {
          skc_block_id_v_t const id  = smem->bin.aN.id[ii];
          skc_uint         const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();
          skc_uint         const tts = smem->bin.aN.ttsb[ii][skc_subgroup_lane()];
#if 0
          printf("???????? : [ %10u = %10u : %08X ]\n",id,idx,tts);
#endif
          bp_elems[idx].u32 = tts;
        }

      //
      // FIXME -- vectorize with vstoreN()
      //
    }

  //
  // return remaining block ids back to the pool
  //
  skc_uint const blocks_rem = SKC_RASTERIZE_BLOCK_ID_V_SIZE - blocks_next;

  if (blocks_rem > 0)
    {
      skc_uint bp_idx = 0;

      if (skc_subgroup_lane() == 0)
        {
          bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,blocks_rem);

#if 0
          printf("r-: %8u + %u\n",bp_idx,blocks_rem);
#endif
        }

      bp_idx = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane() - blocks_next) & bp_mask;

      if (skc_subgroup_lane() >= blocks_next)
        {
          bp_ids[bp_idx] = *blocks;
        }
    }

  //
  // flush work-in-progress ryx keys
  //
  if (sk_v_next > 0)
    {
      skc_uint sk_idx = 0;

      if (skc_subgroup_lane() == 0)
        {
          sk_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE
            (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,sk_v_next);
#if 0
          printf("* %u\n",sk_idx);
#endif
        }

      sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane();

      if (skc_subgroup_lane() < sk_v_next)
        {
          sk_extent[sk_idx] = *sk_v;
        }
    }
}

//
// If there are lanes that were unable to append to a bin because
// their hashes collided with a bin's current ryx key then those bins
// must be ejected.
//
// Note that we do not eject "full" bins because lazily waiting for a
// collision results in simpler code.
//

static
void
skc_flush(__global SKC_ATOMIC_UINT          volatile * const bp_atomics,
          __global union skc_bp_elem                 * const bp_elems,
          __global uint                              * const bp_ids,
          skc_uint                                     const bp_mask,
          __global SKC_ATOMIC_UINT          volatile * const cohort_atomics,
          skc_block_id_t                             * const subblocks,
          skc_block_id_v_t                           * const blocks,
          skc_uint                                   * const blocks_next,
          skc_ttsk_v_t                               * const sk_v,
          skc_uint                                   * const sk_v_next,
          __global skc_ttsk_s_t                      * const sk_extent,
          __local  struct skc_subgroup_smem volatile * const smem,
          SKC_RASTERIZE_UINT                           const hash,
          SKC_RASTERIZE_UINT                           const yx,
          SKC_RASTERIZE_PREDICATE                            is_collision) // pass by value
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //

  //
  // FIXME -- this code is now stale with the changes to the
  // subblock/block allocation strategy
  //

  //
  // get local TTSB ID queue count
  //
  skc_uint ttsb_id_count  = smem->pool.count; // scalar

  // init hash bit mask
  skc_uint component_mask = 0;

  for (int cc=0; cc<SKC_RASTERIZE_VECTOR_SIZE; cc++)
    {
      // if no collision continue
      if (((int*)&is_collision)[cc] == 0)
        continue;

      uint const winner        = ((uint*)&hash)[cc];
      uint const component_bit = 1u << winner;

      // if already processed this hash then continue
      if (component_mask & component_bit)
        continue;

      // update component mask
      component_mask |= component_bit;

      //
      // new winner requires ejecting the old TTSB
      //
      if (smem->bin.aN.count[winner] > 0)
        {
          skc_uint const elem_idx = smem->bin.aN.id[winner] * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();

          bp_elems[elem_idx].u32 = smem->bin.aN.ttsb[winner][skc_subgroup_lane()];
        }

        //
        // ensure there is at least one TTSK and TTSB ID
        //
        if (ttsb_id_count == SKC_RASTERIZE_POOL_SIZE)
          {
            //
            // update remaining count
            //
            ttsb_id_count = 0;

            //
            // flush accumulated ttsk_ryx keys
            //
            uint const idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE
              (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_POOL_SIZE); // ttsk_ryx_count

#if 0
            printf("# %u\n",idx);
#endif

            for (uint ii=0; ii<SKC_RASTERIZE_POOL_SIZE; ii+=SKC_RASTERIZE_SUBGROUP_SIZE)
              {
                ttsk_ryx[idx + ii] = skc_make_ttsk_ryx(smem,SKC_CMD_RASTERIZE_GET_COHORT(cmd),ii);
              }

            //
            // allocate more ttsb ids from pool
            //
            uint const id = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+0,SKC_RASTERIZE_POOL_SIZE); // ring_reads

            for (uint ii=0; ii<SKC_RASTERIZE_POOL_SIZE; ii+=SKC_RASTERIZE_SUBGROUP_SIZE)
              smem->pool.aN.id[ii] = bp_ids[id + ii];
          }

      //
      // invalidate the winning block
      //

      //
      // update bin with winning yx, new ttsb id and zero count
      //
      // all lanes are loading/storing from/to the same index
      //
      smem->bin.vN.ttsb [winner] = ( SKC_TTS_INVALID );
      smem->bin.aN.id   [winner] = smem->pool.aN.id[ttsb_id_count];
      smem->bin.aN.yx   [winner] = smem->pool.aN.yx[ttsb_id_count] = ((uint*)&yx)[cc];
      smem->bin.aN.count[winner] = 0;

      //
      // update count
      //
      ttsb_id_count += 1;
    }

  //
  // save count
  //
  smem->pool.count = ttsb_id_count;

#else
  //
  // SIMT
  //

  do {
    //
    // only one lane will win!
    //
    if (is_collision)
      smem->subgroup.winner = hash;

    barrier(CLK_LOCAL_MEM_FENCE);

    //
    // which bin is being ejected?
    //
    skc_uint const winner = smem->subgroup.winner;

    //
    // which colliding hash is taking over the bin?
    //
    SKC_RASTERIZE_PREDICATE const is_winner = is_collision && (hash == winner);

    //
    // all lanes with the same hash will try to store but only one
    // lane will win
    //
    if (is_winner)
      smem->subgroup.winner = yx;

    barrier(CLK_LOCAL_MEM_FENCE);

    //
    // flush this block to the pool
    //
    if (smem->bin.aN.count[winner] > 0)
      {
        skc_block_id_v_t const id  = smem->bin.aN.id[winner];
        skc_uint         const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();
        skc_uint         const tts = smem->bin.aN.ttsb[winner][skc_subgroup_lane()];
#if 0
        printf("%08X : [ %10u = %10u : %08X ]\n",yx,id,idx,tts);
#endif
        bp_elems[idx].u32 = tts;
      }

    //
    // append new ttsk
    //
    skc_uint       const new_yx = smem->subgroup.winner;
    skc_block_id_t const new_id = skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_ARGS(),
                                                    blocks_next,
                                                    bp_atomics,
                                                    bp_mask, // pow2 modulo mask for block pool ring
                                                    bp_ids,
                                                    cohort_atomics,
                                                    sk_v,
                                                    sk_v_next,
                                                    sk_extent,
                                                    new_yx);

#if 0
    if (get_sub_group_local_id() == 0) {
      printf(">>> %9u\n",new_id);
    }
#endif

    //
    // update bin with winning yx, new ttsb id and zero count
    //
    smem->bin.aN.ttsb [winner][skc_subgroup_lane()] = SKC_TTS_INVALID;
    smem->bin.aN.yx   [winner]                      = new_yx;
    smem->bin.aN.id   [winner]                      = new_id;
    smem->bin.aN.count[winner]                      = 0;

    //
    // remove all lanes matching this hash
    //
    is_collision = is_collision && !is_winner;

    //
    // exit if nothing left to do
    //
  } while (sub_group_any(is_collision));

#endif
}

//
// scatter scan max
//
static
SKC_RASTERIZE_UINT
skc_scatter_scan_max(__local struct skc_subgroup_smem volatile * const smem,
                     SKC_RASTERIZE_FLOAT                         const iss,
                     SKC_RASTERIZE_FLOAT                         const ess)
{
  //
  // prefix sums determine which lanes we're going to work on next
  //
  SKC_RASTERIZE_PREDICATE const is_scratch_store = (iss > 0.0f) && (ess < (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP);
  SKC_RASTERIZE_UINT      const scratch_idx      = SKC_CONVERT(SKC_RASTERIZE_UINT)(max(ess,0.0f));

#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
#ifdef SKC_RASTERIZE_SIMD_USES_SMEM
  //
  // SIMD APPROACH 1: SIMT'ISH
  //

  // zero the volatile smem scratchpad using vector syntax
  smem->subgroup.vN.scratch[0] = ( 0 );

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A)                         \
  if (is_scratch_store C)                               \
    smem->subgroup.aN.scratch[scratch_idx C] = I;

  SKC_RASTERIZE_VECTOR_EXPAND();

  // propagate lanes to right using max scan
  SKC_RASTERIZE_UINT const scratch = smem->subgroup.vN.scratch[0];
  SKC_RASTERIZE_UINT const source  = skc_subgroup_scan_inclusive_max(scratch);

#else
  //
  // SIMD APPROACH 2: SCALAR'ISH
  //

  SKC_RASTERIZE_UINT source = ( 0 );

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A)                 \
  if (is_scratch_store C)                       \
    ((uint *)&source)[scratch_idx C] = I;

  SKC_RASTERIZE_VECTOR_EXPAND();

  for (uint ii=1; ii<SKC_RASTERIZE_ELEMS_PER_SUBGROUP; ii++)
    ((uint *)&source)[ii] = max(((uint *)&source)[ii-1],((uint *)&source)[ii]);
#endif

#else
  //
  // SIMT
  //

  //
  // zero the volatile smem scratchpad using vector syntax
  //
  smem->subgroup.vN.scratch[skc_subgroup_lane()] = ( 0 );

  //
  // store source lane at starting lane
  //
  if (is_scratch_store)
    smem->subgroup.aN.scratch[scratch_idx] = skc_subgroup_lane();

  //
  // propagate lanes to right using max scan
  //
  SKC_RASTERIZE_UINT const scratch = smem->subgroup.vN.scratch[skc_subgroup_lane()];
  SKC_RASTERIZE_UINT const source  = skc_subgroup_scan_inclusive_max(scratch);
#endif

  return source;
}

//
// sliver lines into subpixels
//

static
void
skc_sliver(__global SKC_ATOMIC_UINT          volatile * const bp_atomics,
           __global union skc_bp_elem                 * const bp_elems,
           __global uint                              * const bp_ids,
           skc_uint                                     const bp_mask,
           __global SKC_ATOMIC_UINT          volatile * const cohort_atomics,
           skc_block_id_t                             * const subblocks,
           skc_block_id_v_t                           * const blocks,
           skc_uint                                   * const blocks_next,
           skc_ttsk_v_t                               * const sk_v,
           skc_uint                                   * const sk_v_next,
           __global skc_ttsk_s_t                      * const sk_extent,
           __local  struct skc_subgroup_smem volatile * const smem,
           SKC_RASTERIZE_FLOAT                          const l0x,
           SKC_RASTERIZE_FLOAT                          const l0y,
           SKC_RASTERIZE_FLOAT                          const l1x,
           SKC_RASTERIZE_FLOAT                          const l1y)
{
  //
  // Y-SLIVERING
  // -----------
  //
  // immediately sliver all multi-pixel lines in into 1-pixel high
  // lines
  //
  // note this implicitly squelches horizontal lines
  //
  // there is another test for horizontal lines after x-slivering
  // is complete
  //

  //
  // will we need to flip the sign of y_delta ?
  //
  SKC_RASTERIZE_PREDICATE const y_lt   = (l0y <= l1y);
  SKC_RASTERIZE_UINT      const dy_xor = y_lt ? 0 : 0x80000000;

  //
  // save 1/dy
  //
  SKC_RASTERIZE_FLOAT const y_denom = native_recip(l1y - l0y);

  //
  // how many non-horizontal subpixel y-axis slivers are there?
  //
  SKC_RASTERIZE_FLOAT const y_min   = floor(fmin(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN);
  SKC_RASTERIZE_FLOAT const y_max   = ceil (fmax(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN);
  SKC_RASTERIZE_FLOAT const y_base  = y_lt ? y_min : y_max;
  SKC_RASTERIZE_FLOAT       y_segs  = y_max - y_min;

  //
  // inclusive subgroup scan of y_segs
  //
  SKC_RASTERIZE_FLOAT       y_iss   = skc_subgroup_scan_inclusive_add_float(y_segs);
  SKC_RASTERIZE_FLOAT       y_ess   = y_iss - y_segs;
  float                     y_rem   = skc_subgroup_last_float(y_iss);

  //
  // if this is a horizontal line then tweak y_iss so "is_scratch_store" always fails
  //
  if (y_segs == 0.0f)
    y_iss = 0.0f;

#if 0
  printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } (* %5.0f / %5.0f / %5.0f / %5.0f *) }, \n",a0x,a0y,a1x,a1y,y_segs,y_iss,y_ess,y_rem);
#endif

  //
  // these values don't matter on first iteration
  //
  SKC_RASTERIZE_FLOAT n1x_prev = 0;
  SKC_RASTERIZE_FLOAT n1y_prev = 0;

  //
  // loop until done
  //
  while (y_rem > 0.0f)
    {
      //
      // distribute work across lanes
      //
      SKC_RASTERIZE_UINT const y_source = skc_scatter_scan_max(smem,y_iss,y_ess);

      //
      // get line at y_source line
      //
      SKC_RASTERIZE_FLOAT const m0x = skc_subgroup_shuffle(l0x,y_source);
      SKC_RASTERIZE_FLOAT const m0y = skc_subgroup_shuffle(l0y,y_source);
      SKC_RASTERIZE_FLOAT const m1x = skc_subgroup_shuffle(l1x,y_source);
      SKC_RASTERIZE_FLOAT const m1y = skc_subgroup_shuffle(l1y,y_source);

      //
      // every lane will create a 1 pixel tall line "sliver"
      //
      // FIXME -- this gets expanded on SIMD
      //
      // if numerator == 1 then this is the first lane
      // if numerator == s then this is the last  lane
      //
      SKC_RASTERIZE_FLOAT     const y_delta    = skc_delta_offset() - skc_subgroup_shuffle(y_ess,y_source);
      SKC_RASTERIZE_FLOAT     const y_count    = skc_subgroup_shuffle(y_segs,y_source);

      SKC_RASTERIZE_PREDICATE const is_y_first = (y_delta == 1.0f);
      SKC_RASTERIZE_PREDICATE const is_y_last  = (y_delta >= y_count);

      // toggle y_delta sign
      SKC_RASTERIZE_FLOAT     const y_offset   = as_float((as_uint(y_delta) ^ intel_sub_group_shuffle(dy_xor,y_source)));

      //
      // calculate "right" line segment endpoint
      //
      SKC_RASTERIZE_FLOAT       n1y = (y_offset + skc_subgroup_shuffle(y_base,y_source)) * SKC_SUBPIXEL_Y_SCALE_UP;
      SKC_RASTERIZE_FLOAT const n_t = (n1y - m0y) * skc_subgroup_shuffle(y_denom,y_source);
      SKC_RASTERIZE_FLOAT       n1x = round(SKC_LERP(m0x,m1x,n_t));

      //
      // override c1 if this is last point
      //
      n1y = select(n1y,m1y,is_y_last);
      n1x = select(n1x,m1x,is_y_last);

      //
      // shuffle up "left" line segment endpoint
      //
      // NOTE: Intel's shuffle_up is unique with its elegant
      // "previous" argument so don't get used to it
      //
      SKC_RASTERIZE_FLOAT n0y = skc_subgroup_shuffle_up_1(n1y_prev,n1y);
      SKC_RASTERIZE_FLOAT n0x = skc_subgroup_shuffle_up_1(n1x_prev,n1x);

      //
      // override shuffle up if this is the first line segment
      //
      n0y = select(n0y,m0y,is_y_first);
      n0x = select(n0x,m0x,is_y_first);

      //
      // save previous right endpoint
      //
      n1x_prev = n1x;
      n1y_prev = n1y;

      //
      // decrement by subgroup size
      //
      y_iss -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
      y_ess -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
      y_rem -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;

#if 0
      //
      // debug
      //
      if (n0y != n1y) {
        printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",n0x,n0y,n1x,n1y);
      }
#endif

      //
      // X-SLIVERING
      // -----------
      //
      // now sliver 1-pixel high lines into at either vertical or
      // 1-pixel wide lines
      //
      // save original direction and work with increasing x
      //
      SKC_RASTERIZE_PREDICATE const x_lt   = (n0x <= n1x);
      SKC_RASTERIZE_UINT      const dx_xor = x_lt ? 0 : 0x80000000;

      //
      // save 1/dy
      //
      SKC_RASTERIZE_FLOAT const x_denom  = native_recip(n1x - n0x);

      //
      // how many non-horizontal subpixel y-axis slivers are there?
      //
      SKC_RASTERIZE_FLOAT const x_min    = floor(fmin(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN);
      SKC_RASTERIZE_FLOAT const x_max    = ceil (fmax(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN);
      SKC_RASTERIZE_FLOAT const x_base   = x_lt ? x_min : x_max;
      SKC_RASTERIZE_FLOAT const x_segs   = fmax(x_max - x_min,1.0f);

      //
      // inclusive subgroup scan of y_segs
      //
      SKC_RASTERIZE_FLOAT       x_iss    = skc_subgroup_scan_inclusive_add_float(x_segs);
      SKC_RASTERIZE_FLOAT       x_ess    = x_iss - x_segs;
      float                     x_rem    = skc_subgroup_last_float(x_iss);

      //
      // if this is a horizontal line then tweak x_iss so "is_scratch_store" always fails
      //
      //if (x_segs == 0.0f)
      // x_iss = 0.0f;

      //
      // these values don't matter on first iteration
      //
      SKC_RASTERIZE_FLOAT       p1x_prev = 0;
      SKC_RASTERIZE_FLOAT       p1y_prev = 0;

      //
      // loop until done
      //
      while (x_rem > 0)
        {
          //
          // distribute work across lanes
          //
          SKC_RASTERIZE_UINT const x_source = skc_scatter_scan_max(smem,x_iss,x_ess);

          //
          // get line at y_source line
          //
          SKC_RASTERIZE_FLOAT const o0x = skc_subgroup_shuffle(n0x,x_source);
          SKC_RASTERIZE_FLOAT const o0y = skc_subgroup_shuffle(n0y,x_source);
          SKC_RASTERIZE_FLOAT const o1x = skc_subgroup_shuffle(n1x,x_source);
          SKC_RASTERIZE_FLOAT const o1y = skc_subgroup_shuffle(n1y,x_source);

          //
          // every lane will create a 1 pixel tall line "sliver"
          //
          // FIXME -- this gets expanded on SIMD
          //
          // if numerator == 1 then this is the first lane
          // if numerator == s then this is the last  lane
          //
          SKC_RASTERIZE_FLOAT     const x_delta    = skc_delta_offset() - skc_subgroup_shuffle(x_ess,x_source);
          SKC_RASTERIZE_FLOAT     const x_count    = skc_subgroup_shuffle(x_segs,x_source);

          SKC_RASTERIZE_PREDICATE const is_x_first = (x_delta == 1.0f);
          SKC_RASTERIZE_PREDICATE const is_x_last  = (x_delta >= x_count);

          // toggle x_delta sign
          SKC_RASTERIZE_FLOAT     const x_offset   = as_float((as_uint(x_delta) ^ intel_sub_group_shuffle(dx_xor,x_source)));

          //
          // calculate "right" line segment endpoint
          //
          SKC_RASTERIZE_FLOAT       p1x = (x_offset + skc_subgroup_shuffle(x_base,x_source)) * SKC_SUBPIXEL_X_SCALE_UP;
          SKC_RASTERIZE_FLOAT const p_t = (p1x - o0x) * skc_subgroup_shuffle(x_denom,x_source);
          SKC_RASTERIZE_FLOAT       p1y = round(SKC_LERP(o0y,o1y,p_t));

          //
          // override c1 if this is last point
          //
          p1x = select(p1x,o1x,is_x_last);
          p1y = select(p1y,o1y,is_x_last);

          //
          // shuffle up "left" line segment endpoint
          //
          // NOTE: Intel's shuffle_up is unique with its elegant
          // "previous" argument so don't get used to it
          //
          SKC_RASTERIZE_FLOAT p0x = skc_subgroup_shuffle_up_1(p1x_prev,p1x);
          SKC_RASTERIZE_FLOAT p0y = skc_subgroup_shuffle_up_1(p1y_prev,p1y);

          //
          // override shuffle up if this is the first line segment
          //
          p0x = select(p0x,o0x,is_x_first);
          p0y = select(p0y,o0y,is_x_first);

          //
          // save previous right endpoint
          //
          p1x_prev = p1x;
          p1y_prev = p1y;

          //
          // decrement by subgroup size
          //
          x_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
          x_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
          x_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;

          //
          // only non-horizontal subpixel lines are valid
          //
          SKC_RASTERIZE_PREDICATE is_active = (p0y != p1y);

          //
          // if no lanes are active then continue
          //
          // FIXME -- THIS SIMPLE SUB_GROUP_ANY TEST SIGNIFICANTLY
          // IMPACTS PERFORMANCE (+12% ?)
          //
          // IT SHOULDN'T !!!
          //
#if 0
          if (!skc_subgroup_any(is_active))
            continue;
#endif

          //
          // Option 1: use SLM for explicitly managed coalesced stores
          //
          // 1. which tile does this line belong?
          // 2. hash tile coordinates
          // 3. lookup hash
          // 4. if tile matches then SLM append keys
          // 5. if tile doesn't match
          //   a. flush
          //   b. create new TTSK_RYX
          //   c. obtain TTSB block from pool
          //   d. goto 3.
          //

          //
          // Option 2: rely on L1/L2/L3 to mitigate non-coalesced stores
          //
          // 1. which tile does this line belong?
          // 2. hash tile coordinates
          // 3. lookup hash
          // 4. if tile matches then GMEM append keys
          // 5. if tile doesn't match
          //   a. flush (and invalidate empty elems)
          //   b. create new TTSK_RYX
          //   c. obtain TTSB block from pool
          //   d. goto 3.
          //

          //
          // The virtual rasterization surface is very large and
          // signed: +/- ~64K-256K, depending on the architecture.
          //
          // Rasters must be clipped to the virtual surface and,
          // optionally, clipped even further on a per raster
          // basis.
          //

          //
          // Clip to the per-raster clip
          //

          /*

            CLIP HERE

          */

          //
          // Hash the tile coordinates
          //
          // This table lists nominal values for each architecture.
          // We want to choose values that are naturally fit the
          // "width" of the architecture.
          //
          //   SIMD   RANGE   BITS  MAX RANGE  MAX BINS  HASH BITS
          //   ----  -------  ----  ---------  --------  ---------
          //     4   [0,  4]    3    [0,  7]      10      mod(10)  <-- SSE42, ?
          //     8   [0,  8]    4    [0, 15]       8         3     <-- GEN*,AVX*
          //    16   [0, 16]    5    [0, 31]       6      mod(6)   <-- GEN*,?
          //    32   [0, 32]    6    [0, 63]       5      mod(5)   <-- CUDA,PowerVR,Adreno,GEN*
          //    64   [0, 64]    7    [0,127]       4         2     <-- AMD Radeon
          //
          // NOTE: When possible, bias the hash toward using more y
          // bits because of:
          //
          //   1. the 90 degree counter-clockwise rotation that we put
          //      in place to offset the render-time clockwise
          //      rotation
          //
          //   2. the likely presence of left-to-right or
          //      right-to-left glyphs.
          //
          // For power-of-two bins, the hash is easy.
          //
          // For non-power-of-two, we may want to either implement a
          // fast mod (compiler should do this for us... hahahaha) or
          // drop down to the next power-of-two.
          //

          //
          // FIXME -- this snarl is not good -- can probably reduce
          // some of the sign casting but some is there to vectorize a
          // scalar
          //
          SKC_RASTERIZE_INT       const z0y    = SKC_CONVERT(SKC_RASTERIZE_INT)(p0y);
          SKC_RASTERIZE_INT       const z1y    = SKC_CONVERT(SKC_RASTERIZE_INT)(p1y);

          SKC_RASTERIZE_INT       const z0x    = SKC_CONVERT(SKC_RASTERIZE_INT)(p0x);
          SKC_RASTERIZE_INT       const z1x    = SKC_CONVERT(SKC_RASTERIZE_INT)(p1x);

          SKC_RASTERIZE_INT       const min_y  = min(z0y,z1y);
          SKC_RASTERIZE_INT       const max_y  = max(z0y,z1y);

          SKC_RASTERIZE_INT       const tile_y = min_y >> SKC_SUBTILE_RESL_Y_LOG2;

          SKC_RASTERIZE_UINT      const ty     = SKC_AS(SKC_RASTERIZE_UINT)(min_y) & SKC_SUBTILE_MASK_Y;
          SKC_RASTERIZE_INT             dy     = SKC_AS(SKC_RASTERIZE_INT)(z1y - z0y);

          //
          // map [+1,+32] to [ 0,+31]
          // map [-1,-32] to [-1,-32]
          //
          SKC_RASTERIZE_INT             dys    = (dy + (~dy >> 31)) << 26;

          SKC_RASTERIZE_INT       const min_x  = min(z0x,z1x);
          SKC_RASTERIZE_INT       const max_x  = max(z0x,z1x);
          SKC_RASTERIZE_INT       const tile_x = min_x >> SKC_SUBTILE_RESL_X_LOG2;

          SKC_RASTERIZE_UINT      const tx     = SKC_AS(SKC_RASTERIZE_UINT)(min_x) & SKC_SUBTILE_MASK_X;
          SKC_RASTERIZE_UINT      const sx     = SKC_AS(SKC_RASTERIZE_UINT)(max_x - min_x);

          SKC_RASTERIZE_UINT      const tts    = dys | (ty << 16) | (sx << 10) | tx;

          SKC_RASTERIZE_UINT      const hash   = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & SKC_RASTERIZE_TILE_HASH_Y_MASK) << SKC_RASTERIZE_TILE_HASH_X_BITS) |
                                                   (SKC_AS(SKC_RASTERIZE_UINT)(tile_x) & SKC_RASTERIZE_TILE_HASH_X_MASK));

          SKC_RASTERIZE_UINT      const yx     = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & 0xFFF) << 12) | (SKC_AS(SKC_RASTERIZE_UINT)(tile_x) & 0xFFF));

#if 0
          printf("(%3u, %3u)\n",tile_y,tile_x);
#endif

#if 0
          if (is_active)
            printf("( %3u, %3u ) : [ %3u, %3u, %3d, %3d, %3u ]\n",tile_y,tile_x,ty,tx,dy,((int)dys)>>26,sx);
#endif

          //
          // debug
          //
#if 0 // PRINTF_ENABLE

#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A)                                         \
          if (is_active C)                                              \
            printf("{ { %5d, %5d }, { %5d, %5d } (* %2u *) },\n",z0x C,z0y C,z1x C,z1y C,hash C);

          SKC_RASTERIZE_VECTOR_EXPAND();
#else
          if (is_active)
            printf("{ { %5d, %5d }, { %5d, %5d } } (* %2u *),\n",z0x,z0y,z1x,z1y,hash);
#endif

#endif
          //
          // flush all active lanes
          //
          while (true)
            {
              //
              // either gather load or vector load+shuffle the yx keys
              //
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
              SKC_RASTERIZE_BIN       const yx_bin     = smem->bin.vN.yx;
              SKC_RASTERIZE_UINT      const yx_cur     = shuffle(yx_bin,hash);
#else
              SKC_RASTERIZE_UINT      const yx_cur     = smem->bin.aN.yx[hash];
#endif

              //
              // does yx for lane match yx for hash?
              //
              SKC_RASTERIZE_UINT      const active_yx  = is_active ? yx : SKC_RASTERIZE_YX_INVALID;
              SKC_RASTERIZE_PREDICATE const is_match   = (yx_cur == active_yx);

              //
              // OpenCL spec: "When casting a bool to a vector integer
              // data type, the vector components will be set to -1
              // (i.e. all bits set) if the vector bool value is true
              // and 0 otherwise.
              //
#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
              SKC_RASTERIZE_UINT      const h_match    = (SKC_RASTERIZE_UINT)is_match;
#else
              SKC_RASTERIZE_UINT      const h_match    = abs(is_match); // {-1,0} -> {+1,0}
#endif
              //
              // how many new elements for each matching hash bin?
              //
              SKC_RASTERIZE_UINT      const h_shl      = hash * SKC_RASTERIZE_TILE_HASH_BIN_BITS;
              SKC_RASTERIZE_UINT      const h          = h_match << h_shl;

              //
              // prefix sum all of the bins in parallel
              //
              SKC_RASTERIZE_UINT      const h_iss      = skc_subgroup_scan_inclusive_add_uint(h);
              SKC_RASTERIZE_UINT      const h_total    = skc_subgroup_last_uint(h_iss);

              //
              // current bin counts
              //
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
              SKC_RASTERIZE_BIN       const count_bin  = smem->bin.vN.count;
              SKC_RASTERIZE_UINT      const count_cur  = shuffle(count_bin,hash);
#else
              SKC_RASTERIZE_UINT      const count_cur  = smem->bin.aN.count[hash];
#endif

              //
              // calculate where each cache-hit and in-bounds tts should be stored
              //
              SKC_RASTERIZE_UINT      const ttsb_index = (h_iss   >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur - 1;
              SKC_RASTERIZE_UINT      const count_new  = (h_total >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur;

              //
              // which lanes can append to a matching bin?
              //
              SKC_RASTERIZE_PREDICATE const is_append  = is_match && (ttsb_index < SKC_DEVICE_SUBBLOCK_WORDS);

              //
              // scatter append tts elements to bin blocks
              //
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1)
              //
              // SIMD
              //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A)                                         \
              if (is_append C)                                          \
                {                                                       \
                  smem->bin.aN.ttsb [hash C][ttsb_index C] = tts       C; \
                  smem->bin.aN.count[hash C]               = count_new C; \
                }

              SKC_RASTERIZE_VECTOR_EXPAND();
#else
              //
              // SIMT
              //
              if (is_append)
                {
                  smem->bin.aN.ttsb [hash][ttsb_index] = tts;
                  smem->bin.aN.count[hash]             = count_new; // it's ok if this is > SKC_DEVICE_SUBBLOCK_WORDS
                }
#endif
              //
              // try to keep predicate updates SIMD-friendly and
              // outside of predicated code paths -- this is not
              // always how we would normally do things on SIMT but
              // either approach is acceptable
              //

              //
              // mask off lanes/components that successfully appended
              //
              is_active = is_active && !is_append;

              //
              // are there any active lanes left?
              //
              if (!skc_subgroup_any(is_active))
                break;

              //
              // There are active lanes that couldn't be appended to a
              // bin because their hashes collided with the bin's
              // current ryx key then those bins must be ejected.
              //
              // Note that we do not eject "full" bins because lazily
              // waiting for a collision results in simpler code.
              //
              skc_flush(bp_atomics,
                        bp_elems,
                        bp_ids,
                        bp_mask,
                        cohort_atomics,
                        subblocks,
                        blocks,
                        blocks_next,
                        sk_v,
                        sk_v_next,
                        sk_extent,
                        smem,
                        hash,
                        yx,
                        is_active);
            }
        }
    }
}

//
// INITIALIZE SMEM
//
// Note that SIMD/SIMT have nearly the same syntax.
//
static
void
skc_smem_init(__local struct skc_subgroup_smem volatile * const smem)
{
  //
  // initialize smem bins
  //
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
  smem->bin.vN.yx    = ( SKC_RASTERIZE_YX_INIT );
  smem->bin.vN.count = ( 0 );
#else
  //
  // SIMT
  //
  int idx = skc_subgroup_lane();

#if   ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT < SKC_RASTERIZE_ELEMS_PER_SUBGROUP )
  if (idx < SKC_RASTERIZE_TILE_HASH_BIN_COUNT)
#elif ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT > SKC_RASTERIZE_ELEMS_PER_SUBGROUP )
  for (; idx<SKC_RASTERIZE_TILE_HASH_BIN_COUNT; idx+=SKC_RASTERIZE_SUBGROUP_SIZE)
#endif
    {
      smem->bin.aN.yx   [idx] = ( SKC_RASTERIZE_YX_INIT );
      smem->bin.aN.count[idx] = ( 0 );
    }
#endif
}

//
// RASTERIZE CUBIC KERNEL
//

static
void
skc_rasterize_cubics(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
                     __global union skc_bp_elem                * const bp_elems,
                     __global uint                             * const bp_ids,
                     skc_uint                                    const bp_mask,

                     __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
                     __global skc_ttsk_s_t                     * const sk_extent,

                     __local struct skc_subgroup_smem volatile * const smem,

                     skc_uint                                  * const nodeword,
                     skc_block_id_t                            * const id,

                     union skc_transform              const    * const tv,
                     union skc_path_clip              const    * const cv,
                     skc_uint                                    const cohort)
{
  //
  // the initial segment idx and segments-per-block constant determine
  // how many block ids will need to be loaded
  //
  SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c3x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c3y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  //
  // apply transform
  //
  // note that we only care if the end points are rounded to subpixel precision
  //
  // FIXME -- transformation is currently affine-only support perspective later
  //
  // the affine transformation requires 8 FMA + 2 ROUND operations
  //

  SKC_RASTERIZE_FLOAT b0x = c0x * tv->sx  + c0y * tv->shx + tv->tx;
  SKC_RASTERIZE_FLOAT b0y = c0x * tv->shy + c0y * tv->sy  + tv->ty;

  SKC_RASTERIZE_FLOAT t1x = c1x * tv->sx  + c1y * tv->shx + tv->tx;
  SKC_RASTERIZE_FLOAT t1y = c1x * tv->shy + c1y * tv->sy  + tv->ty;

  SKC_RASTERIZE_FLOAT t2x = c2x * tv->sx  + c2y * tv->shx + tv->tx;
  SKC_RASTERIZE_FLOAT t2y = c2x * tv->shy + c2y * tv->sy  + tv->ty;

  SKC_RASTERIZE_FLOAT t3x = c3x * tv->sx  + c3y * tv->shx + tv->tx;
  SKC_RASTERIZE_FLOAT t3y = c3x * tv->shy + c3y * tv->sy  + tv->ty;

  //
  // FIXME -- this is temporary support for projection
  //
  bool const is_affine = (tv->w0 == 0.0f) && (tv->w1 == 0.0f);

  if (!is_affine)
    {
      SKC_PROJECT(tv,c0x,c0y,b0x,b0y);
      SKC_PROJECT(tv,c1x,c1y,t1x,t1y);
      SKC_PROJECT(tv,c2x,c2y,t2x,t2y);
      SKC_PROJECT(tv,c3x,c3y,t3x,t3y);
    }

  b0x = round(b0x);
  b0y = round(b0y);

  t3x = round(t3x);
  t3y = round(t3y);

  //
  //
  //
#if PRINTF_ENABLE

#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A)                                         \
  printf("{ { %.02f, %.02f }, { %.02f, %.02f },"                        \
         "  { %.02f, %.02f }, { %.02f, %.02f } },\n",                   \
         b0x C,b0y C,t1x C,t1y C,                                       \
         t2x C,t2y C,t3x C,t3y C);

  SKC_RASTERIZE_VECTOR_EXPAND();

#else

  printf("{ { %.02f, %.02f }, { %.02f, %.02f }, { %.02f, %.02f }, { %.02f, %.02f } },\n",
         b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y);

#endif

#endif

  //
  // OLD APPROACH
  // ------------
  //
  // The Spinel CUDA rasterizer was significantly more complex and
  // performed a few different tasks that are probably best kept
  // separate.
  //
  // The Spinel rasterizer Bezier held 4-element x and y coordinates
  // in adjacent lanes. This simplified intermingling of single lane
  // 4-coordinate line segments with two-lane cubic Beziers.
  //
  // After transformation of the input segments, the Spinel rasterizer
  // would test cubics for flatness and, if flat, collapse the
  // adjacent lanes into a single line lane and an empty lane.
  //
  // Any lines would then be appended to a line queue.
  //
  // Any cubics would then be subdivided.
  //
  // The reclassification process would be repeated.
  //
  // NEW APPROACH
  // ------------
  //
  // Assume we're only working with cubics in this kernel.
  //
  // Optimization: if the line segment is a special case -- a cusp,
  // has 1+ inflections, or a loop -- it might be beneficial to
  // subdivide the control cage 1+ times in order to separate the
  // flatter segments the high-velocity region(s).
  //
  // This means we want to split using [a,b] formulation to _directly_
  // subdivide producing a new control cage.
  //
  // Wang's Formula is still useful even if we subdivide once or twice
  // as it's so cheap that it might give some useful hints about where
  // the high-velocity sections of curve reside.
  //
  // But it seems like using Wang's and directly flattening to line
  // segments without any subdivision is good enough for the limited
  // set of test cases that I've tried.
  //
  // So... use Wang's Formula to estimate how many line segment are
  // required to properly flatten the cubics.
  //
  // Then use inclusive/exclusive scans to put all the lanes to work:
  //
  //   1. segmenting cubics to line segments
  //
  //   2. slivering line segments into 1-pixel high line segments
  //
  //   3. slivering 1-pixel high line segments into 1-pixel wide line
  //      segments
  //
  // MORE BACKGROUND ON NEW APPROACH
  // -------------------------------
  //
  // Two options for handling line segments:
  //
  // 1. append the line segments onto an SLM array until enough
  //    work has been accrued (Spinel does this)
  //
  // 2. immediately sliver the potentially multi-pixel line
  //    segments into subpixel lines
  //
  // The advantage of (1) is that it guarantees the slivering
  // process will, on average, always be emitting a full subgroup
  // of subpixel lines.
  //
  // The advantage of (2) is that it reduces code complexity and
  // leaves more room for SLM tile bins. The difference between Spinel
  // and Skia Compute is that Wang's Formula guarantees there will be
  // a full subgroup of multi-pixel lines unless this is the final
  // iteration of the warp of multi-pixel lines.
  //
  // Note that wider GPU architectures might benefit from (1) and
  // other work accumulation strategies because it will minimize
  // partial warp workloads in the final iteration of each stage.  It
  // also minimizes the sunk cost of the uniform control logic steps.
  //
  // So let's implement (2) for now...
  //

  //
  // And... begin!
  //
  // Estimate how many line segments are in quad/cubic curve.
  //
  // Wang's Formula will return zero if the control points are
  // collinear but we bump it up to 1.0f.
  //
  SKC_RASTERIZE_FLOAT const s_segs  = skc_wangs_formula_cubic(b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y);

  //
  // if there are free registers then precalculate the reciprocal for
  // each estimated segments since it will never change
  //
  SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs);


  //
  // inclusive add scan of estimated line segments
  // exclusive add scan of estimated line segments
  // total number       of estimated line segments
  //
  SKC_RASTERIZE_FLOAT       s_iss   = skc_subgroup_scan_inclusive_add_float(s_segs);
  SKC_RASTERIZE_FLOAT       s_ess   = s_iss - s_segs;
  float                     s_rem   = skc_subgroup_last_float(s_iss); // scalar

  //
  // Precompute cubic polynomial coefficients from transformed control
  // cage so we can shuffle them in on each iteration of the outer
  // loop and then evaluate the polynomial in Horner form.
  //
  //                            |  1  0  0  0 | | c0 |
  //                            |             | |    |
  //                            | -3  3  0  0 | | c1 |
  //   B(t) = [ 1 t^1 t^2 t^3 ] |             | |    |
  //                            |  3 -6  3  0 | | c2 |
  //                            |             | |    |
  //                            | -1  3 -3  1 | | c3 |
  //
  //
  SKC_RASTERIZE_FLOAT const b1x = mad(-3.0f,b0x,3.0f*t1x);                // 2 - 1 MAD + MUL
  SKC_RASTERIZE_FLOAT const b1y = mad(-3.0f,b0y,3.0f*t1y);                // 2 - 1 MAD + MUL

  SKC_RASTERIZE_FLOAT const b2x = mad(3.0f,b0x,mad(-6.0f,t1x,3.0f*t2x));  // 3 - 2 MAD + MUL
  SKC_RASTERIZE_FLOAT const b2y = mad(3.0f,b0y,mad(-6.0f,t1y,3.0f*t2y));  // 3 - 2 MAD + MUL

  SKC_RASTERIZE_FLOAT const b3x = mad(3.0f,t1x,mad(-3.0f,t2x,t3x)) - b0x; // 3 - 2 MAD + SUB
  SKC_RASTERIZE_FLOAT const b3y = mad(3.0f,t1y,mad(-3.0f,t2y,t3y)) - b0y; // 3 - 2 MAD + SUB

  //
  // these values don't matter on the first iteration
  //
  SKC_RASTERIZE_FLOAT l1x_prev  = 0;
  SKC_RASTERIZE_FLOAT l1y_prev  = 0;

  //
  // allocate and init in-register TTSK keys
  //
  skc_uint     sk_v_next = 0;
  skc_ttsk_v_t sk_v;

  sk_v.hi = cohort;

  //
  // initialize smem
  //
  skc_smem_init(smem);

  //
  // initialize blocks / subblocks
  //
  skc_block_id_v_t blocks;
  skc_uint         blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;

#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
  skc_block_id_t   subblocks   = 0;
#endif

  //
  // loop until done
  //
  while (s_rem > 0)
    {
      //
      // distribute work across lanes
      //
      SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess);

      //
      // every lane has a fraction to work off of
      //
      // FIXME -- this gets expanded on SIMD
      //
      // if delta == 1      then this is the first lane
      // if count == s_segs then this is the last  lane
      //
      SKC_RASTERIZE_FLOAT     const s_delta    = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source);
      SKC_RASTERIZE_FLOAT     const s_count    = skc_subgroup_shuffle(s_segs,s_source);

      SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f);
      SKC_RASTERIZE_PREDICATE const is_s_last  = (s_delta >= s_count);

      //
      // init parametric t
      //
      SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)?

      //
      // if last then override to a hard 1.0f
      //
      s_t    = is_s_last ? 1.0f : s_t;

      //
      // decrement by subgroup size
      //
      s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
      s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
      s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;

      //
      // now every lane knows what to do and the following lines will
      // pump out up to SUBGROUP_SIZE line segments
      //
      // obtain the src vertices through shared or via a shuffle
      //

      //
      // shuffle in the polynomial coefficients their source lane
      //
      SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source);
      SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source);

      SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source);
      SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source);

      SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source);
      SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source);

      SKC_RASTERIZE_FLOAT const s3x = skc_subgroup_shuffle(b3x,s_source);
      SKC_RASTERIZE_FLOAT const s3y = skc_subgroup_shuffle(b3y,s_source);

      //
      // calculate "right" line segment endpoint using Horner form
      //
      SKC_RASTERIZE_FLOAT       l1x = round(mad(mad(mad(s3x,s_t,s2x),s_t,s1x),s_t,s0x)); // 3 MAD + ROUND
      SKC_RASTERIZE_FLOAT       l1y = round(mad(mad(mad(s3y,s_t,s2y),s_t,s1y),s_t,s0y)); // 3 MAD + ROUND

      //
      // shuffle up "left" line segment endpoint
      //
      // NOTE: Intel's shuffle_up is unique with its elegant
      // "previous" argument so don't get used to it
      //
      SKC_RASTERIZE_FLOAT       l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x);
      SKC_RASTERIZE_FLOAT       l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y);

      //
      // save previous right endpoint
      //
      l1x_prev = l1x;
      l1y_prev = l1y;

      //
      // override shuffle up if this is the first line segment
      //
      l0x = select(l0x,s0x,is_s_first);
      l0y = select(l0y,s0y,is_s_first);

      //
      // sliver lines
      //
      skc_sliver(bp_atomics,
                 bp_elems,
                 bp_ids,
                 bp_mask,
                 cohort_atomics,
                 &subblocks,
                 &blocks,
                 &blocks_next,
                 &sk_v,
                 &sk_v_next,
                 sk_extent,
                 smem,
                 l0x,l0y,l1x,l1y);
    }

  //
  // - flush work-in-progress blocks
  // - return unused block ids
  //
  skc_finalize(bp_atomics,
               bp_elems,
               bp_ids,
               bp_mask,
               cohort_atomics,
               &blocks,
               blocks_next,
               &sk_v,
               sk_v_next,
               sk_extent,
               smem);
}

//
// RASTERIZE QUAD KERNEL
//

static
void
skc_rasterize_quads(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
                    __global union skc_bp_elem                * const bp_elems,
                    __global uint                             * const bp_ids,
                    skc_uint                                    const bp_mask,

                    __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
                    __global skc_ttsk_s_t                     * const sk_extent,

                    __local struct skc_subgroup_smem volatile * const smem,

                    skc_uint                                  * const nodeword,
                    skc_block_id_t                            * const id,

                    union skc_transform              const    * const tv,
                    union skc_path_clip              const    * const cv,
                    skc_uint                                    const cohort)
{
  //
  // the initial segment idx and segments-per-block constant determine
  // how many block ids will need to be loaded
  //
  SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  //
  // apply transform
  //
  // note that we only care if the end points are rounded to subpixel precision
  //
  // FIXME -- transformation is currently affine-only support perspective later
  //
  // the affine transformation requires 8 FMA + 2 ROUND operations
  //
  SKC_RASTERIZE_FLOAT b0x = c0x * tv->sx  + c0y * tv->shx + tv->tx;
  SKC_RASTERIZE_FLOAT b0y = c0x * tv->shy + c0y * tv->sy  + tv->ty;

  SKC_RASTERIZE_FLOAT t1x = c1x * tv->sx  + c1y * tv->shx + tv->tx;
  SKC_RASTERIZE_FLOAT t1y = c1x * tv->shy + c1y * tv->sy  + tv->ty;

  SKC_RASTERIZE_FLOAT t2x = c2x * tv->sx  + c2y * tv->shx + tv->tx;
  SKC_RASTERIZE_FLOAT t2y = c2x * tv->shy + c2y * tv->sy  + tv->ty;

  //
  // FIXME -- this is temporary support for projection
  //
  bool const is_affine = (tv->w0 == 0.0f) && (tv->w1 == 0.0f);

  if (!is_affine)
    {
      SKC_PROJECT(tv,c0x,c0y,b0x,b0y);
      SKC_PROJECT(tv,c1x,c1y,t1x,t1y);
      SKC_PROJECT(tv,c2x,c2y,t2x,t2y);
    }

  b0x = round(b0x);
  b0y = round(b0y);

  t2x = round(t2x);
  t2y = round(t2y);

  //
  // Estimate how many line segments are in quad/cubic curve.
  //
  // Wang's Formula will return zero if the control points are
  // collinear but we bump it up to 1.0f.
  //
  SKC_RASTERIZE_FLOAT const s_segs  = skc_wangs_formula_quadratic(b0x,b0y,t1x,t1y,t2x,t2y);

  //
  // if there are free registers then precalculate the reciprocal for
  // each estimated segments since it will never change
  //
  SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs);


  //
  // inclusive add scan of estimated line segments
  // exclusive add scan of estimated line segments
  // total number       of estimated line segments
  //
  SKC_RASTERIZE_FLOAT       s_iss   = skc_subgroup_scan_inclusive_add_float(s_segs);
  SKC_RASTERIZE_FLOAT       s_ess   = s_iss - s_segs;
  float                     s_rem   = skc_subgroup_last_float(s_iss); // scalar

  //
  // Precompute quadratic polynomial coefficients from control cage so
  // we can shuffle them in on each iteration of the outer loop and
  // then evaluate the polynomial in Horner form.
  //

  //                        |  1  0  0  | | c0 |
  //                        |           | |    |
  //   B(t) = [ 1 t^1 t^2 ] | -2  2  0  | | c1 |
  //                        |           | |    |
  //                        |  1 -2  1  | | c2 |
  //
  //
  SKC_RASTERIZE_FLOAT const b1x = mad(-2.0f,b0x,2.0f*t1x); // 2 - 1 MAD + MUL
  SKC_RASTERIZE_FLOAT const b1y = mad(-2.0f,b0y,2.0f*t1y); // 2 - 1 MAD + MUL

  SKC_RASTERIZE_FLOAT const b2x = mad(-2.0f,t1x,b0x+t2x);  // 2 - 1 MAD + ADD
  SKC_RASTERIZE_FLOAT const b2y = mad(-2.0f,t1y,b0y+t2y);  // 2 - 1 MAD + ADD

  //
  // these values don't matter on the first iteration
  //
  SKC_RASTERIZE_FLOAT l1x_prev  = 0;
  SKC_RASTERIZE_FLOAT l1y_prev  = 0;

  //
  // allocate and init in-register TTSK keys
  //
  skc_uint     sk_v_next = 0;
  skc_ttsk_v_t sk_v;

  sk_v.hi = cohort;

  //
  // initialize smem
  //
  skc_smem_init(smem);

  //
  // initialize blocks / subblocks
  //
  skc_block_id_v_t blocks;
  skc_uint         blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;

#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
  skc_block_id_t   subblocks   = 0;
#endif

  //
  // loop until done
  //
  while (s_rem > 0)
    {
      //
      // distribute work across lanes
      //
      SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess);

      //
      // every lane has a fraction to work off of
      //
      // FIXME -- this gets expanded on SIMD
      //
      // if delta == 1      then this is the first lane
      // if count == s_segs then this is the last  lane
      //
      SKC_RASTERIZE_FLOAT     const s_delta    = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source);
      SKC_RASTERIZE_FLOAT     const s_count    = skc_subgroup_shuffle(s_segs,s_source);

      SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f);
      SKC_RASTERIZE_PREDICATE const is_s_last  = (s_delta >= s_count);

      //
      // init parametric t
      //
      SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)?

      //
      // if last then override to a hard 1.0f
      //
      s_t    = is_s_last ? 1.0f : s_t;

      //
      // decrement by subgroup size
      //
      s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
      s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
      s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;

      //
      // now every lane knows what to do and the following lines will
      // pump out up to SUBGROUP_SIZE line segments
      //
      // obtain the src vertices through shared or via a shuffle
      //

      //
      // shuffle in the polynomial coefficients their source lane
      //
      SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source);
      SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source);

      SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source);
      SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source);

      SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source);
      SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source);

      //
      // calculate "right" line segment endpoint using Horner form
      //
      SKC_RASTERIZE_FLOAT       l1x = round(mad(mad(s2x,s_t,s1x),s_t,s0x)); // 2 MAD + ROUND
      SKC_RASTERIZE_FLOAT       l1y = round(mad(mad(s2y,s_t,s1y),s_t,s0y)); // 2 MAD + ROUND

      //
      // shuffle up "left" line segment endpoint
      //
      // NOTE: Intel's shuffle_up is unique with its elegant
      // "previous" argument so don't get used to it
      //
      SKC_RASTERIZE_FLOAT       l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x);
      SKC_RASTERIZE_FLOAT       l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y);

      //
      // save previous right endpoint
      //
      l1x_prev = l1x;
      l1y_prev = l1y;

      //
      // override shuffle up if this is the first line segment
      //
      l0x = select(l0x,s0x,is_s_first);
      l0y = select(l0y,s0y,is_s_first);

      //
      // sliver lines
      //
      skc_sliver(bp_atomics,
                 bp_elems,
                 bp_ids,
                 bp_mask,
                 cohort_atomics,
                 &subblocks,
                 &blocks,
                 &blocks_next,
                 &sk_v,
                 &sk_v_next,
                 sk_extent,
                 smem,
                 l0x,l0y,l1x,l1y);
    }

  //
  // - flush work-in-progress blocks
  // - return unused block ids
  //
  skc_finalize(bp_atomics,
               bp_elems,
               bp_ids,
               bp_mask,
               cohort_atomics,
               &blocks,
               blocks_next,
               &sk_v,
               sk_v_next,
               sk_extent,
               smem);
}

//
// RASTERIZE LINE KERNEL
//

static
void
skc_rasterize_lines(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
                    __global union skc_bp_elem                * const bp_elems,
                    __global uint                             * const bp_ids,
                    skc_uint                                    const bp_mask,

                    __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
                    __global skc_ttsk_s_t                     * const sk_extent,

                    __local struct skc_subgroup_smem volatile * const smem,

                    skc_uint                                  * const nodeword,
                    skc_block_id_t                            * const id,

                    union skc_transform              const    * const tv,
                    union skc_path_clip              const    * const cv,
                    skc_uint                                    const cohort)
{
  //
  // the initial segment idx and segments-per-block constant determine
  // how many block ids will need to be loaded
  //
  SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

#if 0
  printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",c0x,c0y,c1x,c1y);
#endif

  //
  // apply transform
  //
  // note that we only care if the end points are rounded to subpixel precision
  //
  // FIXME -- transformation is currently affine-only
  // FIXME -- support perspective later
  //
  // the affine transformation requires 8 FMA + 4 ROUND operations
  //
  SKC_RASTERIZE_FLOAT l0x = c0x * tv->sx  + c0y * tv->shx + tv->tx;
  SKC_RASTERIZE_FLOAT l0y = c0x * tv->shy + c0y * tv->sy  + tv->ty;

  SKC_RASTERIZE_FLOAT l1x = c1x * tv->sx  + c1y * tv->shx + tv->tx;
  SKC_RASTERIZE_FLOAT l1y = c1x * tv->shy + c1y * tv->sy  + tv->ty;

  //
  // FIXME -- this is temporary support for projection
  //
  bool const is_affine = (tv->w0 == 0.0f) && (tv->w1 == 0.0f);

  if (!is_affine) {
    SKC_PROJECT(tv,c0x,c0y,l0x,l0y);
    SKC_PROJECT(tv,c1x,c1y,l1x,l1y);
  }

  l0x = round(l0x);
  l0y = round(l0y);

  l1x = round(l1x);
  l1y = round(l1y);

#if 0
  printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",l0x,l0y,l1x,l1y);
#endif

  //
  // allocate and init in-register TTSK keys
  //
  skc_uint     sk_v_next = 0;
  skc_ttsk_v_t sk_v;

  sk_v.hi = cohort;

  //
  // initialize smem
  //
  skc_smem_init(smem);

  //
  // initialize blocks / subblocks
  //
  skc_block_id_v_t blocks;
  skc_uint         blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;

#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
  skc_block_id_t   subblocks   = 0;
#endif

  //
  // sliver lines
  //
  skc_sliver(bp_atomics,
             bp_elems,
             bp_ids,
             bp_mask,
             cohort_atomics,
             &subblocks,
             &blocks,
             &blocks_next,
             &sk_v,
             &sk_v_next,
             sk_extent,
             smem,
             l0x,l0y,l1x,l1y);

  //
  // - flush work-in-progress blocks
  // - return unused block ids
  //
  skc_finalize(bp_atomics,
               bp_elems,
               bp_ids,
               bp_mask,
               cohort_atomics,
               &blocks,
               blocks_next,
               &sk_v,
               sk_v_next,
               sk_extent,
               smem);
}

//
//
//

__kernel
SKC_RASTERIZE_KERNEL_ATTRIBS
void
skc_kernel_rasterize_all(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
                         __global union skc_bp_elem                * const bp_elems,
                         __global uint                             * const bp_ids,
                         skc_uint                                    const bp_mask,

                         __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
                         __global skc_ttsk_s_t                     * const sk_extent,

                         __global float8                  const    * const transforms, // FIXME -- __constant
                         __global float4                  const    * const clips,      // FIXME -- __constant
                         __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
                         skc_uint                                    const count)
{
  //
  // declare shared memory block
  //
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
  __local struct skc_subgroup_smem volatile                smem[1];
#else
  __local struct skc_subgroup_smem volatile                smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
  __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
#endif

  //
  // this is a subgroup/warp-centric kernel
  //
  // which subgroup in the grid is this?
  //
  // TAKE NOTE: the Intel GEN compiler appears to be recognizing
  // get_group_id(0) as a uniform but the alternative calculation used
  // when there are multiple subgroups per workgroup is not
  // cooperating and driving spillage elsewhere.
  //
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
  uint const cmd_idx = get_group_id(0);
#else
  uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif

#if 0
  if (get_sub_group_local_id() == 0)
    printf("+cmd_idx = %u\n",cmd_idx);
#endif

  //
  // if worksgroups are multi-subgroup then there may be excess
  // subgroups in the final workgroup
  //
  if (cmd_idx >= count)
    return;

#if 0
  if (get_sub_group_local_id() == 0)
    printf("-cmd_idx = %u\n",cmd_idx);
#endif

  //
  // load a single command for this subgroup
  //
  union skc_cmd_rasterize const cmd = cmds[cmd_idx];

#if 0
  if (get_sub_group_local_id() == 0)
    printf("[ %u ]< %u, %u, %u, %u >\n",
           cmd_idx,
           cmd.nodeword,
           SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd),
           SKC_CMD_RASTERIZE_GET_CLIP(cmd),
           SKC_CMD_RASTERIZE_GET_COHORT(cmd));
#endif

  //
  // get first block node command word and its subblock
  //
  skc_uint              nodeword = cmd.nodeword; // nodeword has word-addressing
  skc_tagged_block_id_t tag_id   = bp_elems[nodeword].tag_id;
  skc_block_id_tag      tag      = SKC_TAGGED_BLOCK_ID_GET_TAG(tag_id);
  skc_block_id_t        id       = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);

  //
  // load transform -- uniform across subgroup
  //
  // v8: { sx shx tx shy sy ty w0 w1 }
  //
  // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
  //
  //   [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
  //
  // Coordinates are scaled to subpixel resolution.  All that matters
  // is that continuity is maintained between end path element
  // endpoints.
  //
  // It's the responsibility of the host to ensure that the transforms
  // are properly scaled either via intitializing a transform stack
  // with the subpixel resolution scaled identity or scaling the
  // transform before its loaded by a rasterization grid.
  //
  // FIXME -- horizontal load might be better than this broadcast load
  //
  union skc_transform const tv     = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
  union skc_path_clip const cv     = { .f32v4 = clips     [SKC_CMD_RASTERIZE_GET_CLIP(cmd)     ] }; // uniform load
  skc_uint            const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted

  switch (tag)
    {
    case SKC_BLOCK_ID_TAG_PATH_LINE:
      skc_rasterize_lines(bp_atomics,
                          bp_elems,
                          bp_ids,
                          bp_mask,
                          cohort_atomics,
                          sk_extent,
                          smem,
                          &nodeword,&id,
                          &tv,&cv,cohort);
      break;

    case SKC_BLOCK_ID_TAG_PATH_QUAD:
      skc_rasterize_quads(bp_atomics,
                          bp_elems,
                          bp_ids,
                          bp_mask,
                          cohort_atomics,
                          sk_extent,
                          smem,
                          &nodeword,&id,
                          &tv,&cv,cohort);
      break;

    case SKC_BLOCK_ID_TAG_PATH_CUBIC:
      skc_rasterize_cubics(bp_atomics,
                           bp_elems,
                           bp_ids,
                           bp_mask,
                           cohort_atomics,
                           sk_extent,
                           smem,
                           &nodeword,&id,
                           &tv,&cv,cohort);
      break;

    case SKC_BLOCK_ID_TAG_PATH_RAT_QUAD:
      break;
    case SKC_BLOCK_ID_TAG_PATH_RAT_CUBIC:
      break;

    default:
      break;
    }
}

//
//
//

__kernel
SKC_RASTERIZE_KERNEL_ATTRIBS
void
skc_kernel_rasterize_lines(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
                           __global union skc_bp_elem                * const bp_elems,
                           __global uint                             * const bp_ids,
                           skc_uint                                    const bp_mask,

                           __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
                           __global skc_ttsk_s_t                     * const sk_extent,

                           __global float8                  const    * const transforms, // FIXME -- __constant
                           __global float4                  const    * const clips,      // FIXME -- __constant
                           __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
                           skc_uint                                    const count)
{
  //
  // declare shared memory block
  //
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
  __local struct skc_subgroup_smem volatile                smem[1];
#else
  __local struct skc_subgroup_smem volatile                smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
  __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
#endif

  //
  // this is a subgroup/warp-centric kernel
  //
  // which subgroup in the grid is this?
  //
  // TAKE NOTE: the Intel GEN compiler appears to be recognizing
  // get_group_id(0) as a uniform but the alternative calculation used
  // when there are multiple subgroups per workgroup is not
  // cooperating and driving spillage elsewhere.
  //
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
  uint const cmd_idx = get_group_id(0);
#else
  uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif

  //
  // if worksgroups are multi-subgroup then there may be excess
  // subgroups in the final workgroup
  //
  if (cmd_idx >= count)
    return;

#if 0
  if (get_sub_group_local_id() == 0)
    printf("cmd_idx = %u\n",cmd_idx);
#endif

  //
  // load a single command for this subgroup
  //
  union skc_cmd_rasterize const cmd = cmds[cmd_idx];

  //
  // get first block node command word and its subblock
  //
  skc_uint              nodeword = cmd.nodeword; // nodeword has word-addressing
  skc_tagged_block_id_t tag_id   = bp_elems[nodeword].tag_id;
  skc_block_id_t        id       = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);

  //
  // load transform -- uniform across subgroup
  //
  // v8: { sx shx tx shy sy ty w0 w1 }
  //
  // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
  //
  //   [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
  //
  // Coordinates are scaled to subpixel resolution.  All that matters
  // is that continuity is maintained between end path element
  // endpoints.
  //
  // It's the responsibility of the host to ensure that the transforms
  // are properly scaled either via intitializing a transform stack
  // with the subpixel resolution scaled identity or scaling the
  // transform before its loaded by a rasterization grid.
  //
  // FIXME -- horizontal load might be better than this broadcast load
  //
  union skc_transform const tv     = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
  union skc_path_clip const cv     = { .f32v4 = clips     [SKC_CMD_RASTERIZE_GET_CLIP(cmd)     ] }; // uniform load
  skc_uint            const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted

  skc_rasterize_lines(bp_atomics,
                      bp_elems,
                      bp_ids,
                      bp_mask,
                      cohort_atomics,
                      sk_extent,
                      smem,
                      &nodeword,&id,
                      &tv,&cv,cohort);
}

//
//
//

//
//
//

__kernel
SKC_RASTERIZE_KERNEL_ATTRIBS
void
skc_kernel_rasterize_quads(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
                           __global union skc_bp_elem                * const bp_elems,
                           __global uint                             * const bp_ids,
                           skc_uint                                    const bp_mask,

                           __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
                           __global skc_ttsk_s_t                     * const sk_extent,

                           __global float8                  const    * const transforms, // FIXME -- __constant
                           __global float4                  const    * const clips,      // FIXME -- __constant
                           __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
                           skc_uint                                    const count)
{
  //
  // declare shared memory block
  //
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
  __local struct skc_subgroup_smem volatile                smem[1];
#else
  __local struct skc_subgroup_smem volatile                smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
  __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
#endif

  //
  // this is a subgroup/warp-centric kernel
  //
  // which subgroup in the grid is this?
  //
  // TAKE NOTE: the Intel GEN compiler appears to be recognizing
  // get_group_id(0) as a uniform but the alternative calculation used
  // when there are multiple subgroups per workgroup is not
  // cooperating and driving spillage elsewhere.
  //
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
  uint const cmd_idx = get_group_id(0);
#else
  uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif

  //
  // if worksgroups are multi-subgroup then there may be excess
  // subgroups in the final workgroup
  //
  if (cmd_idx >= count)
    return;

#if 0
  if (get_sub_group_local_id() == 0)
    printf("cmd_idx = %u\n",cmd_idx);
#endif

  //
  // load a single command for this subgroup
  //
  union skc_cmd_rasterize const cmd = cmds[cmd_idx];

  //
  // get first block node command word and its subblock
  //
  skc_uint              nodeword = cmd.nodeword; // nodeword has word-addressing
  skc_tagged_block_id_t tag_id   = bp_elems[nodeword].tag_id;
  skc_block_id_t        id       = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);

  //
  // load transform -- uniform across subgroup
  //
  // v8: { sx shx tx shy sy ty w0 w1 }
  //
  // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
  //
  //   [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
  //
  // Coordinates are scaled to subpixel resolution.  All that matters
  // is that continuity is maintained between end path element
  // endpoints.
  //
  // It's the responsibility of the host to ensure that the transforms
  // are properly scaled either via intitializing a transform stack
  // with the subpixel resolution scaled identity or scaling the
  // transform before its loaded by a rasterization grid.
  //
  // FIXME -- horizontal load might be better than this broadcast load
  //
  union skc_transform const tv     = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
  union skc_path_clip const cv     = { .f32v4 = clips     [SKC_CMD_RASTERIZE_GET_CLIP(cmd)     ] }; // uniform load
  skc_uint            const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted

  skc_rasterize_quads(bp_atomics,
                      bp_elems,
                      bp_ids,
                      bp_mask,
                      cohort_atomics,
                      sk_extent,
                      smem,
                      &nodeword,&id,
                      &tv,&cv,cohort);
}

//
//
//

__kernel
SKC_RASTERIZE_KERNEL_ATTRIBS
void
skc_kernel_rasterize_cubics(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
                            __global union skc_bp_elem                * const bp_elems,
                            __global uint                             * const bp_ids,
                            skc_uint                                    const bp_mask,

                            __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
                            __global skc_ttsk_s_t                     * const sk_extent,

                            __global float8                  const    * const transforms, // FIXME -- __constant
                            __global float4                  const    * const clips,      // FIXME -- __constant
                            __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
                            skc_uint                                    const count)
{
  //
  // declare shared memory block
  //
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
  __local struct skc_subgroup_smem volatile                smem[1];
#else
  __local struct skc_subgroup_smem volatile                smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
  __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
#endif

  //
  // this is a subgroup/warp-centric kernel
  //
  // which subgroup in the grid is this?
  //
  // TAKE NOTE: the Intel GEN compiler appears to be recognizing
  // get_group_id(0) as a uniform but the alternative calculation used
  // when there are multiple subgroups per workgroup is not
  // cooperating and driving spillage elsewhere.
  //
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
  uint const cmd_idx = get_group_id(0);
#else
  uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif

  //
  // if worksgroups are multi-subgroup then there may be excess
  // subgroups in the final workgroup
  //
  if (cmd_idx >= count)
    return;

#if 0
  if (get_sub_group_local_id() == 0)
    printf("cmd_idx = %u\n",cmd_idx);
#endif

  //
  // load a single command for this subgroup
  //
  union skc_cmd_rasterize const cmd = cmds[cmd_idx];

  //
  // get first block node command word and its subblock
  //
  skc_uint              nodeword = cmd.nodeword; // nodeword has word-addressing
  skc_tagged_block_id_t tag_id   = bp_elems[nodeword].tag_id;
  skc_block_id_t        id       = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);

  //
  // load transform -- uniform across subgroup
  //
  // v8: { sx shx tx shy sy ty w0 w1 }
  //
  // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
  //
  //   [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
  //
  // Coordinates are scaled to subpixel resolution.  All that matters
  // is that continuity is maintained between end path element
  // endpoints.
  //
  // It's the responsibility of the host to ensure that the transforms
  // are properly scaled either via intitializing a transform stack
  // with the subpixel resolution scaled identity or scaling the
  // transform before its loaded by a rasterization grid.
  //
  // FIXME -- horizontal load might be better than this broadcast load
  //
  union skc_transform const tv     = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
  union skc_path_clip const cv     = { .f32v4 = clips     [SKC_CMD_RASTERIZE_GET_CLIP(cmd)     ] }; // uniform load
  skc_uint            const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd); // shifted

  skc_rasterize_cubics(bp_atomics,
                       bp_elems,
                       bp_ids,
                       bp_mask,
                       cohort_atomics,
                       sk_extent,
                       smem,
                       &nodeword,&id,
                       &tv,&cv,cohort);
}

//
//
//

__kernel
SKC_RASTERIZE_KERNEL_ATTRIBS
void
skc_kernel_rasterize_rat_quads(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
                               __global union skc_bp_elem                * const bp_elems,
                               __global uint                             * const bp_ids,
                               skc_uint                                    const bp_mask,

                               __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
                               __global skc_ttsk_s_t                     * const sk_extent,

                               __global float8                  const    * const transforms, // FIXME -- __constant
                               __global float4                  const    * const clips,      // FIXME -- __constant
                               __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
                               skc_uint                                    const count)
{
  ;
}

//
//
//

__kernel
SKC_RASTERIZE_KERNEL_ATTRIBS
void
skc_kernel_rasterize_rat_cubics(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
                                __global union skc_bp_elem                * const bp_elems,
                                __global uint                             * const bp_ids,
                                skc_uint                                    const bp_mask,

                                __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
                                __global skc_ttsk_s_t                     * const sk_extent,

                                __global float8                  const    * const transforms, // FIXME -- __constant
                                __global float4                  const    * const clips,      // FIXME -- __constant
                                __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
                                skc_uint                                    const count)
{
  ;
}

//
//
//