/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 */

//
//
//

#include "tile.h"
#include "common.h"
#include "atomic_cl.h"
#include "block_pool_cl.h"
#include "raster_builder_cl_12.h"
#include "kernel_cl_12.h"

// #define SKC_ARCH_AVX2

// #define SKC_RASTERIZE_SIMD_USES_SMEM

#define PRINTF_ENABLE       0
#define PRINTF_BLOCK_COUNT  0

//
// NOTE:
//
// ON SIMD DEVICES THE BIN COUNT MUST BE POW2 SO THAT WE CAN LOAD IT
// AS A VECTOR AND PERFORM A SWIZZLE/SHUFFLE
//
// NOTE:
//
// IGNORE FOR NOW ANY AVX2 CODE SNIPPETS. THEY WILL BE MOVED ASAP.
//
//

#if 0 // SKC_ARCH_AVX2

// #define SKC_RASTERIZE_SUBGROUP_SIZE             1
// #define SKC_RASTERIZE_VECTOR_SIZE_LOG2          3
// #define SKC_RASTERIZE_WORKGROUP_COUNT_SUBGROUP  1
//
// #define SKC_TTXB_WORDS                          8
//
// #define SKC_RASTERIZE_FLOAT                     float8
// #define SKC_RASTERIZE_UINT                      uint8
// #define SKC_RASTERIZE_INT                       int8
// #define SKC_RASTERIZE_PREDICATE                 int8
//
// #define SKC_RASTERIZE_BIN_BLOCK                 uint16
// #define SKC_RASTERIZE_BIN                       uint8
//
// #define SKC_RASTERIZE_POOL                      uint8
// #define SKC_RASTERIZE_POOL_SCALE                6
//
// #define SKC_RASTERIZE_TILE_HASH_X_BITS          1
// #define SKC_RASTERIZE_TILE_HASH_Y_BITS          2
//
// #define SKC_RASTERIZE_VECTOR_EXPAND()           SKC_EXPAND_8()

#endif

//
// SIMT
//

#define SKC_RASTERIZE_BLOCK_ID_V_SIZE  SKC_RASTERIZE_SUBGROUP_SIZE
#define SKC_RASTERIZE_TTSK_V_SIZE      SKC_RASTERIZE_SUBGROUP_SIZE
#define SKC_RASTERIZE_TTSK_V_MASK      (SKC_RASTERIZE_TTSK_V_SIZE - 1)

//
//
//

#define SKC_RASTERIZE_VECTOR_SIZE         (1 << SKC_RASTERIZE_VECTOR_SIZE_LOG2)
#define SKC_RASTERIZE_ELEMS_PER_SUBGROUP  (SKC_RASTERIZE_SUBGROUP_SIZE * SKC_RASTERIZE_VECTOR_SIZE)

//
//
//

#define SKC_RASTERIZE_YX_INIT     0x7FFF7FFF  // { +32767, +32767 }
#define SKC_RASTERIZE_YX_INVALID  0x80008000  // { -32768, -32768 }

//
//
//

#define SKC_RASTERIZE_TILE_HASH_X_MASK     SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_X_BITS)
#define SKC_RASTERIZE_TILE_HASH_Y_MASK     SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_Y_BITS)
#define SKC_RASTERIZE_TILE_HASH_BITS       (SKC_RASTERIZE_TILE_HASH_X_BITS + SKC_RASTERIZE_TILE_HASH_Y_BITS)
#define SKC_RASTERIZE_TILE_HASH_BIN_COUNT  (1 << SKC_RASTERIZE_TILE_HASH_BITS)
#define SKC_RASTERIZE_TILE_HASH_BIN_BITS   (SKC_RASTERIZE_TILE_HASH_BITS + 1) // FIXME -- LOG2_RU(BIN_COUNT)
#define SKC_RASTERIZE_TILE_HASH_BIN_MASK   SKC_BITS_TO_MASK(SKC_RASTERIZE_TILE_HASH_BIN_BITS)

//
// Norbert Juffa notes: "GPU Pro Tip: Lerp Faster in C++"
//
//   https://devblogs.nvidia.com/parallelforall/lerp-faster-cuda/
//
// Lerp in two fma/mad ops:
//
//   t * b + ((-t) * a + a)
//
// Note: OpenCL documents mix() as being implemented as:
//
//   a + (b - a) * t
//
// But this may be a native instruction on some devices.  For example,
// on GEN9 there is an LRP "linear interpolation" opcode but it
// doesn't appear to support half floats.
//
// Feel free to toggle this option and then benchmark and inspect the
// generated code.  We really want the double FMA to be generated when
// there isn't support for a LERP/MIX operation.
//

#if 1
#define SKC_LERP(a,b,t)  mad(t,b,mad(-(t),a,a))
#else
#define SKC_LERP(a,b,t)  mix(a,b,t)
#endif
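//
// Quick sanity check of the identity (illustrative only, not part of
// the pipeline): expanding the two-op form recovers the mix()
// definition,
//
//   t * b + ((-t) * a + a)  =  a + t * b - t * a  =  a + (b - a) * t
//
// so both SKC_LERP variants agree in exact arithmetic and differ only
// in rounding and in which instruction(s) the compiler emits.
//
#if 0
// hypothetical self-test kernel -- the name and layout are
// illustrative assumptions, not part of this pipeline
__kernel void skc_lerp_check(__global float * const out)
{
  float const a = 2.0f, b = 6.0f, t = 0.25f;

  out[0] = mad(t,b,mad(-t,a,a)); // double-FMA form -- 3.0f
  out[1] = mix(a,b,t);           // built-in form   -- 3.0f
}
#endif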
//
// There is no integer MAD in OpenCL with "don't care" overflow
// semantics.
//
// FIXME -- verify if the platform needs explicit MAD operations even
// if a "--fastmath" option is available at compile time.  It might
// make sense to explicitly use MAD calls if the platform requires it.
//

#if 1
#define SKC_MAD_UINT(a,b,c)  ((a) * (b) + (c))
#else
#define SKC_MAD_UINT(a,b,c)  mad_sat(a,b,c)
#endif

//
//
//

#define SKC_RASTERIZE_SEGMENT(id)  ((id) * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane())

//
//
//

union skc_bp_elem
{
  skc_uint              u32;
  skc_tagged_block_id_t tag_id;
  skc_float             coord;
};

//
//
//

struct skc_subgroup_smem
{
  //
  // SIMT subgroup scratchpad for max scan -- also shared with 'winner' member
  //
#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 ) || defined ( SKC_RASTERIZE_SIMD_USES_SMEM )
  struct {
    union {

      skc_uint winner;

      struct {
        skc_uint           scratch[SKC_RASTERIZE_SUBGROUP_SIZE];
      } aN;

      struct {
        SKC_RASTERIZE_UINT scratch[SKC_RASTERIZE_SUBGROUP_SIZE];
      } vN;

    };
  } subgroup;
#endif

  //
  // work-in-progress TTSB blocks and associated YX keys
  //
  union {

    struct {
      // FIXME -- some typedefs are valid here
      skc_uint ttsb [SKC_RASTERIZE_TILE_HASH_BIN_COUNT][SKC_DEVICE_SUBBLOCK_WORDS];
      skc_uint yx   [SKC_RASTERIZE_TILE_HASH_BIN_COUNT];
      skc_uint id   [SKC_RASTERIZE_TILE_HASH_BIN_COUNT];
      skc_uint count[SKC_RASTERIZE_TILE_HASH_BIN_COUNT];
    } aN;

#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
    struct {
      SKC_RASTERIZE_BIN_BLOCK ttsb[SKC_RASTERIZE_TILE_HASH_BIN_COUNT];
      SKC_RASTERIZE_BIN       yx;
      SKC_RASTERIZE_BIN       id;
      SKC_RASTERIZE_BIN       count;
    } vN;
#endif

  } bin;
};

//
//
//

#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
#define skc_subgroup_lane()  0
#else
#define skc_subgroup_lane()  get_sub_group_local_id()
#endif

//
//
//

#define SKC_PROJECT(tv,x,y,xp,yp)                                       \
  {                                                                     \
    float const d = native_recip(fma(x,tv->w0,fma(y,tv->w1,1.0f)));     \
    xp *= d;                                                            \
    yp *= d;                                                            \
  }

//
// replenish block ids
//
// note that you can't overrun the block id pool since it's a ring
//

static
void
skc_blocks_replenish(skc_uint                          * const blocks_next,
                     skc_block_id_v_t                  * const blocks,
                     __global SKC_ATOMIC_UINT volatile * const bp_atomics,
                     skc_uint                            const bp_mask, // pow2 modulo mask for block pool ring
                     __global skc_block_id_t const     * const bp_ids)
{
  //
  // get a new vector of block ids -- this is kind of a narrow
  // allocation but subblocks help stretch out the pool.
  //
  // FIXME -- there is now plenty of SMEM to allocate a LOT of block ids
  //
  skc_uint bp_idx = 0;

  if (skc_subgroup_lane() == 0)
    {
      bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,
                                                    SKC_RASTERIZE_BLOCK_ID_V_SIZE); // ring_reads
#if 0
      printf("r+: %8u + %u\n",bp_idx,SKC_RASTERIZE_BLOCK_ID_V_SIZE);
#endif
    }

  bp_idx       = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane()) & bp_mask;
  *blocks      = bp_ids[bp_idx];
  *blocks_next = 0;
}
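//
// Illustrative walk-through of the pow2 ring arithmetic above (values
// made up): with a block pool of 8 ids, bp_mask = 7.  If lane 0
// atomically advances the read cursor from 6 and the subgroup is 4
// lanes wide, the lanes load
//
//   lane 0: (6 + 0) & 7 = 6
//   lane 1: (6 + 1) & 7 = 7
//   lane 2: (6 + 2) & 7 = 0   <-- wraps around the ring
//   lane 3: (6 + 3) & 7 = 1
//
// which is why the pool size must be a power of two and why the ring
// can't be overrun -- indices simply wrap.
//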
//
//
//

static
skc_block_id_t
skc_blocks_get_next(skc_uint                          * const blocks_next,
                    skc_block_id_v_t                  * const blocks,
                    __global SKC_ATOMIC_UINT volatile * const bp_atomics,
                    skc_uint                            const bp_mask, // pow2 modulo mask for block pool ring
                    __global skc_block_id_t const     * const bp_ids)
{
  // replenish?
  if (*blocks_next == SKC_RASTERIZE_BLOCK_ID_V_SIZE)
    {
      skc_blocks_replenish(blocks_next,blocks,bp_atomics,bp_mask,bp_ids);
    }

#if ( SKC_RASTERIZE_SUBGROUP_SIZE > 1 )
  //
  // SIMT
  //
  skc_block_id_t id = sub_group_broadcast(*blocks,*blocks_next);

#else
  //
  // SIMD
  //
  skc_block_id_t id = blocks->s0;

  skc_shuffle_down_1(*blocks);

#endif

  *blocks_next += 1;

  return id;
}

//
// subblock allocator
//

#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2

static
skc_block_id_t
skc_subblocks_get_next(skc_block_id_t                    * const subblocks,
                       skc_uint                          * const blocks_next,
                       skc_block_id_v_t                  * const blocks,
                       __global SKC_ATOMIC_UINT volatile * const bp_atomics,
                       skc_uint                            const bp_mask, // pow2 modulo mask for block pool ring
                       __global skc_block_id_t const     * const bp_ids)
{
  if ((*subblocks & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
    {
      *subblocks = skc_blocks_get_next(blocks_next,blocks,bp_atomics,bp_mask,bp_ids);
    }

  skc_block_id_t const sb_id = *subblocks;

  *subblocks += 1;

#if 0
  if (get_sub_group_local_id() == 0)
    printf("= %u\n",sb_id);
#endif

  return sb_id;
}

#define SKC_SUBBLOCKS_BLOCKS_PROTO()  skc_block_id_t * const subblocks, skc_block_id_t * const blocks
#define SKC_SUBBLOCKS_BLOCKS_ARGS()   subblocks, blocks

#else

#define SKC_SUBBLOCKS_BLOCKS_PROTO()  skc_block_id_t * const blocks
#define SKC_SUBBLOCKS_BLOCKS_ARGS()   blocks

#endif

//
//
//

static
skc_block_id_t
skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_PROTO(),
                  skc_uint                          * const blocks_next,
                  __global SKC_ATOMIC_UINT volatile * const bp_atomics,
                  skc_uint                            const bp_mask, // pow2 modulo mask for block pool ring
                  __global skc_block_id_t const     * const bp_ids,
                  __global SKC_ATOMIC_UINT volatile * const cohort_atomics,
                  skc_ttsk_v_t                      * const sk_v,
                  skc_uint                          * const sk_v_next,
                  __global skc_ttsk_s_t             * const sk_extent,
                  skc_uint                            const new_yx)
{
#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
  skc_block_id_t const new_id = skc_subblocks_get_next(subblocks,
                                                       blocks_next,
                                                       blocks,
                                                       bp_atomics,
                                                       bp_mask,
                                                       bp_ids);
#else
  skc_block_id_t const new_id = skc_blocks_get_next(blocks_next,
                                                    blocks,
                                                    bp_atomics,
                                                    bp_mask, // pow2 modulo mask for block pool ring
                                                    bp_ids);
#endif

  if (get_sub_group_local_id() == (*sk_v_next & SKC_RASTERIZE_TTSK_V_MASK))
    {
      sk_v->lo = new_id;
      sk_v->hi = (sk_v->hi & SKC_TTRK_HI_MASK_COHORT) | new_yx;
#if 0
      printf("@ ( %3u, %3u ) %u\n",
             (new_yx >> 12) & 0xFFF,
             (new_yx      ) & 0xFFF,
             new_id);
#endif
    }

  *sk_v_next += 1;

  if (*sk_v_next == SKC_RASTERIZE_TTSK_V_SIZE)
    {
      *sk_v_next = 0;

      skc_uint sk_idx = 0;

      if (skc_subgroup_lane() == 0)
        {
          sk_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE
            (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_TTSK_V_SIZE);
#if 0
          printf("+ %u\n",sk_idx);
#endif
        }

      sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane();

#if ( SKC_RASTERIZE_SUBGROUP_SIZE > SKC_RASTERIZE_TTSK_V_SIZE )
      if (skc_subgroup_lane() < SKC_RASTERIZE_TTSK_V_SIZE)
#endif
        {
          sk_extent[sk_idx] = *sk_v;
#if 0
          printf("> %u : %v2u\n",sk_idx,*sk_v);
#endif
        }
    }

  return new_id;
}
//
//
//

static
SKC_RASTERIZE_FLOAT
skc_subgroup_scan_inclusive_add_float(SKC_RASTERIZE_FLOAT const v)
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
  // Note that there isn't a built-in horizontal scan for vectors so
  // we'll define some here for various widths.
  //
  // FIXME -- a scalar version might be faster so put in a
  // compile-time switch to select between implementations
  //

#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
  return v;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
  // 01
  //  0 +
  // --
  // 01
  SKC_RASTERIZE_FLOAT const w = mad(v.s10,(SKC_RASTERIZE_FLOAT)(0,1),v);
  return w;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
  // 0123
  //  012 +
  // ----
  // 0123
  //   01 +
  // ----
  // 0123
  //
  SKC_RASTERIZE_FLOAT const w = mad(v.s3012,(SKC_RASTERIZE_FLOAT)(0,1,1,1),v);
  SKC_RASTERIZE_FLOAT const x = mad(w.s2301,(SKC_RASTERIZE_FLOAT)(0,0,1,1),w);
  return x;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
  // 01234567
  //  0123456 +
  // --------
  // 01234567
  //   012345 +
  // --------
  // 01234567
  //     0123 +
  // --------
  // 01234567
  //
  SKC_RASTERIZE_FLOAT const w = mad(v.s70123456,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1),v);
  SKC_RASTERIZE_FLOAT const x = mad(w.s67012345,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1),w);
  SKC_RASTERIZE_FLOAT const y = mad(x.s45670123,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1),x);
  return y;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
  // 0123456789abcdef
  //  0123456789abcde +
  // ----------------
  // 0123456789abcdef
  //   0123456789abcd +
  // ----------------
  // 0123456789abcdef
  //     0123456789ab +
  // ----------------
  // 0123456789abcdef
  //         01234567 +
  // ----------------
  // 0123456789abcdef
  //
  SKC_RASTERIZE_FLOAT const w = mad(v.sf0123456789abcde,(SKC_RASTERIZE_FLOAT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v);
  SKC_RASTERIZE_FLOAT const x = mad(w.sef0123456789abcd,(SKC_RASTERIZE_FLOAT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w);
  SKC_RASTERIZE_FLOAT const y = mad(x.scdef0123456789ab,(SKC_RASTERIZE_FLOAT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x);
  SKC_RASTERIZE_FLOAT const z = mad(y.s89abcdef01234567,(SKC_RASTERIZE_FLOAT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y);
  return z;

#endif

#else
  //
  // SIMT
  //
  return sub_group_scan_inclusive_add(v);

#endif
}
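//
// Worked example of the shuffled-MAD scan above for a width-4 vector
// (illustrative values): v = { 1, 2, 3, 4 }
//
//   w = v.s3012 * (0,1,1,1) + v = { 1, 3, 5,  7 }
//   x = w.s2301 * (0,0,1,1) + w = { 1, 3, 6, 10 }
//
// which is the inclusive prefix sum -- each round doubles the reach
// of the partial sums, so log2(width) rounds suffice (a
// Hillis-Steele style scan).
//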
//
//
//

static
SKC_RASTERIZE_UINT
skc_subgroup_scan_inclusive_add_uint(SKC_RASTERIZE_UINT const v)
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
  // Note that there isn't a built-in horizontal scan for vectors so
  // we'll define some here for various widths.
  //
  // FIXME -- a scalar version might be faster so put in a
  // compile-time switch to select between implementations
  //

#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
  return v;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
  // 01
  //  0 +
  // --
  // 01
  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s10,(SKC_RASTERIZE_UINT)(0,1),v);
  return w;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
  // 0123
  //  012 +
  // ----
  // 0123
  //   01 +
  // ----
  // 0123
  //
  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s3012,(SKC_RASTERIZE_UINT)(0,1,1,1),v);
  SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s2301,(SKC_RASTERIZE_UINT)(0,0,1,1),w);
  return x;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
  // 01234567
  //  0123456 +
  // --------
  // 01234567
  //   012345 +
  // --------
  // 01234567
  //     0123 +
  // --------
  // 01234567
  //
  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.s70123456,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1),v);
  SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.s67012345,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1),w);
  SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.s45670123,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1),x);
  return y;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
  // 0123456789abcdef
  //  0123456789abcde +
  // ----------------
  // 0123456789abcdef
  //   0123456789abcd +
  // ----------------
  // 0123456789abcdef
  //     0123456789ab +
  // ----------------
  // 0123456789abcdef
  //         01234567 +
  // ----------------
  // 0123456789abcdef
  //
  SKC_RASTERIZE_UINT const w = SKC_MAD_UINT(v.sf0123456789abcde,(SKC_RASTERIZE_UINT)(0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),v);
  SKC_RASTERIZE_UINT const x = SKC_MAD_UINT(w.sef0123456789abcd,(SKC_RASTERIZE_UINT)(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1),w);
  SKC_RASTERIZE_UINT const y = SKC_MAD_UINT(x.scdef0123456789ab,(SKC_RASTERIZE_UINT)(0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1),x);
  SKC_RASTERIZE_UINT const z = SKC_MAD_UINT(y.s89abcdef01234567,(SKC_RASTERIZE_UINT)(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1),y);
  return z;

#endif

#else
  //
  // SIMT
  //
  return sub_group_scan_inclusive_add(v);

#endif
}
//
//
//

static
SKC_RASTERIZE_UINT
skc_subgroup_scan_inclusive_max(SKC_RASTERIZE_UINT const v)
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
  // Note that there isn't a built-in horizontal scan for vectors so
  // we'll define some here for various widths.
  //
  // FIXME -- a scalar version might be faster so put in a
  // compile-time switch to select between implementations
  //

#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
  return v;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
  // 01
  // 00 max
  // --
  // 01
  SKC_RASTERIZE_UINT const w = max(v.s00,v);
  return w;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
  // 0123
  // 0012 max
  // ----
  // 0123
  // 0101 max
  // ----
  // 0123
  //
  SKC_RASTERIZE_UINT const w = max(v.s0012,v);
  SKC_RASTERIZE_UINT const x = max(w.s0101,w);
  return x;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
  // 01234567
  // 00123456 max
  // --------
  // 01234567
  // 01012345 max
  // --------
  // 01234567
  // 01230123 max
  // --------
  // 01234567
  //
  SKC_RASTERIZE_UINT const w = max(v.s00123456,v);
  SKC_RASTERIZE_UINT const x = max(w.s01012345,w);
  SKC_RASTERIZE_UINT const y = max(x.s01230123,x);
  return y;

#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
  // 0123456789abcdef
  // 00123456789abcde max
  // ----------------
  // 0123456789abcdef
  // 010123456789abcd max
  // ----------------
  // 0123456789abcdef
  // 01230123456789ab max
  // ----------------
  // 0123456789abcdef
  // 0123456701234567 max
  // ----------------
  // 0123456789abcdef
  //
  SKC_RASTERIZE_UINT const w = max(v.s00123456789abcde,v);
  SKC_RASTERIZE_UINT const x = max(w.s010123456789abcd,w);
  SKC_RASTERIZE_UINT const y = max(x.s01230123456789ab,x);
  SKC_RASTERIZE_UINT const z = max(y.s0123456701234567,y);
  return z;

#endif

#else
  //
  // SIMT
  //
  return sub_group_scan_inclusive_max(v);

#endif
}

//
//
//

static
float
skc_subgroup_last_float(SKC_RASTERIZE_FLOAT const v)
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
  return v;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
  return v.s1;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
  return v.s3;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
  return v.s7;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
  return v.sf;
#endif

#else
  //
  // SIMT
  //
  return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1);

#endif
}

//
//
//

static
SKC_RASTERIZE_UINT
skc_subgroup_last_uint(SKC_RASTERIZE_UINT const v)
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
  return v;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
  return v.s1;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
  return v.s3;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
  return v.s7;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
  return v.sf;
#endif

#else
  //
  // SIMT
  //
  return sub_group_broadcast(v,SKC_RASTERIZE_SUBGROUP_SIZE-1);

#endif
}

//
//
//

static
float
skc_subgroup_first(SKC_RASTERIZE_FLOAT const v)
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
  return v;
#else
  return v.s0;
#endif

#else
  //
  // SIMT
  //
  return sub_group_broadcast(v,0);

#endif
}

//
//
//

static
SKC_RASTERIZE_FLOAT
skc_subgroup_shuffle(SKC_RASTERIZE_FLOAT const v,
                     SKC_RASTERIZE_UINT  const i)
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
  return v;
#else
  return shuffle(v,i);
#endif

#else
  //
  // SIMT
  //
  return intel_sub_group_shuffle(v,i);

#endif
}
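//
// Illustrative example of the generic shuffle above (SIMD path,
// width 4, made-up values): v = { 10, 20, 30, 40 } and
// i = { 2, 2, 0, 3 } give
//
//   shuffle(v,i) = { 30, 30, 10, 40 }   -- component k reads v[i_k]
//
// The SIMT path does the same gather across subgroup lanes via
// intel_sub_group_shuffle().
//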
//
//
//

static
SKC_RASTERIZE_FLOAT
skc_subgroup_shuffle_up_1(SKC_RASTERIZE_FLOAT const p, // previous
                          SKC_RASTERIZE_FLOAT const c) // current
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
  // FIXME -- there are alternative formulations here:
  //
  //   Option 1:
  //
  //     select(c.rotate(+1),p.rotate(-1),(1,0,0,...))
  //
  //   Option 2:
  //
  //     p is a scalar
  //     t = c.rotate(+1)
  //     t.s0 = p;
  //
  //   Option 3: ...
  //
#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
  return p;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
  return shuffle2(p,c,(uint2)(1,2));
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
  return shuffle2(p,c,(uint4)(3,4,5,6));
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
  return shuffle2(p,c,(uint8)(7,8,9,10,11,12,13,14));
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
  return shuffle2(p,c,(uint16)(15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30));
#endif

#else
  //
  // SIMT
  //
  return intel_sub_group_shuffle_up(p,c,1);

#endif
}

//
//
//

static
bool
skc_is_lane_first()
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
  return true;
#else
  //
  // SIMT
  //
  return get_sub_group_local_id() == 0;
#endif
}

//
//
//

static
SKC_RASTERIZE_FLOAT
skc_delta_offset()
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
#if   ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
  return 1;
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 1 )
  return (SKC_RASTERIZE_FLOAT)( 1, 2 );
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 2 )
  return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4 );
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 3 )
  return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8 );
#elif ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 4 )
  return (SKC_RASTERIZE_FLOAT)( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 );
#endif

#else
  //
  // SIMT
  //
  return 1.0f + get_sub_group_local_id();

#endif
}

//
//
//

static
int
skc_subgroup_any(SKC_RASTERIZE_PREDICATE const p)
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
  return any(p);
#else
  //
  // SIMT
  //
  return sub_group_any(p);
#endif
}

//
//
//

#define SKC_PATH_NODEWORD_IS_LAST(n)  (((n) & SKC_DEVICE_BLOCK_WORDS_MASK) == SKC_DEVICE_BLOCK_WORDS_MASK)

void
skc_segment_next(__global union skc_bp_elem * const bp_elems,
                 skc_uint                   * const nodeword,
                 skc_block_id_t             * const id)
{
  if ((++*id & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) == 0)
    {
      if (SKC_PATH_NODEWORD_IS_LAST(++*nodeword))
        {
          *nodeword = SKC_TAGGED_BLOCK_ID_GET_ID(bp_elems[*nodeword].tag_id) * SKC_DEVICE_SUBBLOCK_WORDS;
        }

      skc_tagged_block_id_t const tag_id = bp_elems[*nodeword].tag_id;

      *id = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);
    }
}

//
//
//

static
SKC_RASTERIZE_FLOAT
skc_native_length(SKC_RASTERIZE_FLOAT const x, SKC_RASTERIZE_FLOAT const y)
{
  return native_sqrt(x * x + y * y);
}

//
// Wang's Formula (1985)
//

#define SKC_WANG_PIXEL_RESL   0.25f // <-- this can be tuned

#define SKC_WANG_EPSILON      (SKC_WANG_PIXEL_RESL * SKC_SUBPIXEL_RESL_X_F32)

#define SKC_WANG_CUBIC        ((3.0f * 2.0f) / (8.0f * SKC_WANG_EPSILON))
#define SKC_WANG_QUADRATIC    ((2.0f       ) / (8.0f * SKC_WANG_EPSILON))

#define SKC_WANG_LENGTH(x,y)  skc_native_length(x,y)
#define SKC_WANG_SQRT(x)      native_sqrt(x)
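//
// Illustrative numbers (assuming SKC_SUBPIXEL_RESL_X_F32 is 32.0f --
// the actual constant lives in a shared header): SKC_WANG_EPSILON is
// then 0.25 * 32 = 8 subpixels and SKC_WANG_CUBIC is 6/64 = 0.09375.
// A cubic whose largest second-difference vector is (96,96) subpixels
// yields
//
//   ceil(sqrt(0.09375 * length(96,96))) = ceil(sqrt(12.73...)) = 4
//
// so four evenly spaced parametric segments keep the flattening
// within epsilon of the true curve.
//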
//
//
//

static
SKC_RASTERIZE_FLOAT
skc_wangs_formula_cubic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y,
                        SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y,
                        SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y,
                        SKC_RASTERIZE_FLOAT const t3x, SKC_RASTERIZE_FLOAT const t3y)
{
  //
  // Return the number of evenly spaced (in the parametric sense) line
  // segments that are guaranteed to be within "epsilon" error of the
  // curve.
  //
  // We're then going to take multiples of the reciprocal of this
  // number so that the segmentation can be distributed across the
  // subgroup.
  //
  // Note, this can probably be slightly optimized per architecture
  // but it's probably far from being a hotspot since it's all
  // straight-line unpredicated code.
  //
  // The result is an integer ranging from [1.0,#segments]
  //
  // Note that even if all of the control points are coincident, the
  // max(1.0f) will categorize this as a line of 1 segment.
  //
  // This is what we want!  We want to convert cubics to lines as
  // easily as possible and *then* cull lines that are either
  // horizontal or zero length.
  //
  return max(1.0f,
             ceil(SKC_WANG_SQRT(SKC_WANG_CUBIC *
                                SKC_WANG_LENGTH(max(fabs(t2x - 2.0f * t1x + t0x),
                                                    fabs(t3x - 2.0f * t2x + t1x)),
                                                max(fabs(t2y - 2.0f * t1y + t0y),
                                                    fabs(t3y - 2.0f * t2y + t1y))))));
}

static
SKC_RASTERIZE_FLOAT
skc_wangs_formula_quadratic(SKC_RASTERIZE_FLOAT const t0x, SKC_RASTERIZE_FLOAT const t0y,
                            SKC_RASTERIZE_FLOAT const t1x, SKC_RASTERIZE_FLOAT const t1y,
                            SKC_RASTERIZE_FLOAT const t2x, SKC_RASTERIZE_FLOAT const t2y)
{
  return max(1.0f,
             ceil(SKC_WANG_SQRT(SKC_WANG_QUADRATIC *
                                SKC_WANG_LENGTH(t2x - 2.0f * t1x + t0x,
                                                t2y - 2.0f * t1y + t0y))));
}

//
// rational curves
//

static
SKC_RASTERIZE_FLOAT
skc_wangs_formula_cubic_rat()
{
  return 0.0f;
}

static
SKC_RASTERIZE_FLOAT
skc_wangs_formula_quad_rat()
{
  return 0.0f;
}

//
// flush any work-in-progress blocks and return unused block ids
//

static
void
skc_finalize(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
             __global union skc_bp_elem                * const bp_elems,
             __global uint                             * const bp_ids,
             skc_uint                                    const bp_mask,
             __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
             skc_block_id_v_t                          * const blocks,
             skc_uint                                    const blocks_next,
             skc_ttsk_v_t                              * const sk_v,
             skc_uint                                    const sk_v_next,
             __global skc_ttsk_s_t                     * const sk_extent,
             __local struct skc_subgroup_smem volatile * const smem)
{
  //
  // flush non-empty bins
  //
  // FIXME -- accelerate this iteration/search with a subgroup operation
  //
  for (skc_uint ii=0; ii<SKC_RASTERIZE_TILE_HASH_BIN_COUNT; ii++)
    {
      if (smem->bin.aN.count[ii] > 0)
        {
          skc_block_id_v_t const id  = smem->bin.aN.id[ii];
          skc_uint         const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();
          skc_uint         const tts = smem->bin.aN.ttsb[ii][skc_subgroup_lane()];
#if 0
          printf("???????? : [ %10u = %10u : %08X ]\n",id,idx,tts);
#endif
          bp_elems[idx].u32 = tts;
        }

      //
      // FIXME -- vectorize with vstoreN()
      //
    }

  //
  // return remaining block ids back to the pool
  //
  skc_uint const blocks_rem = SKC_RASTERIZE_BLOCK_ID_V_SIZE - blocks_next;

  if (blocks_rem > 0)
    {
      skc_uint bp_idx = 0;

      if (skc_subgroup_lane() == 0)
        {
          bp_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,blocks_rem);
#if 0
          printf("r-: %8u + %u\n",bp_idx,blocks_rem);
#endif
        }

      bp_idx = (sub_group_broadcast(bp_idx,0) + skc_subgroup_lane() - blocks_next) & bp_mask;

      if (skc_subgroup_lane() >= blocks_next)
        {
          bp_ids[bp_idx] = *blocks;
        }
    }

  //
  // flush work-in-progress ryx keys
  //
  if (sk_v_next > 0)
    {
      skc_uint sk_idx = 0;

      if (skc_subgroup_lane() == 0)
        {
          sk_idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE
            (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,sk_v_next);
#if 0
          printf("* %u\n",sk_idx);
#endif
        }

      sk_idx = sub_group_broadcast(sk_idx,0) + skc_subgroup_lane();

      if (skc_subgroup_lane() < sk_v_next)
        {
          sk_extent[sk_idx] = *sk_v;
        }
    }
}
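//
// Illustrative walk-through of the block-id return above (values made
// up): with SKC_RASTERIZE_BLOCK_ID_V_SIZE = 8 and blocks_next = 3,
// five ids are still unconsumed.  Lane 0 reserves 5 ring slots
// starting at some base writes index w, and lanes 3..7 store their
// ids at
//
//   bp_idx = (w + lane - 3) & bp_mask
//
// i.e. lane 3 writes slot w+0, lane 4 writes w+1, ..., lane 7 writes
// w+4, returning exactly the 5 unused ids to the pool.
//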
//
// If there are lanes that were unable to append to a bin because
// their hashes collided with a bin's current ryx key then those bins
// must be ejected.
//
// Note that we do not eject "full" bins because lazily waiting for a
// collision results in simpler code.
//

static
void
skc_flush(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
          __global union skc_bp_elem                * const bp_elems,
          __global uint                             * const bp_ids,
          skc_uint                                    const bp_mask,
          __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
          skc_block_id_t                            * const subblocks,
          skc_block_id_v_t                          * const blocks,
          skc_uint                                  * const blocks_next,
          skc_ttsk_v_t                              * const sk_v,
          skc_uint                                  * const sk_v_next,
          __global skc_ttsk_s_t                     * const sk_extent,
          __local struct skc_subgroup_smem volatile * const smem,
          SKC_RASTERIZE_UINT                          const hash,
          SKC_RASTERIZE_UINT                          const yx,
          SKC_RASTERIZE_PREDICATE                           is_collision) // pass by value
{
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
  // FIXME -- this code is now stale with the changes to the
  // subblock/block allocation strategy
  //

  //
  // get local TTSB ID queue count
  //
  skc_uint ttsb_id_count = smem->pool.count; // scalar

  // init hash bit mask
  skc_uint component_mask = 0;

  for (int cc=0; cc<SKC_RASTERIZE_VECTOR_SIZE; cc++)
    {
      // skip components that aren't colliding
      if (((int*)&is_collision)[cc] == 0)
        continue;

      // which bin is this component ejecting?
      skc_uint const winner = ((uint*)&hash)[cc];

      // only eject each bin once
      if ((component_mask & (1u << winner)) != 0)
        continue;

      component_mask |= (1u << winner);

      //
      // flush this block to the pool
      //
      if (smem->bin.aN.count[winner] > 0)
        {
          skc_uint const elem_idx = smem->bin.aN.id[winner] * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();

          bp_elems[elem_idx].u32 = smem->bin.aN.ttsb[winner][skc_subgroup_lane()];
        }

      //
      // ensure there is at least one TTSK and TTSB ID
      //
      if (ttsb_id_count == SKC_RASTERIZE_POOL_SIZE)
        {
          //
          // update remaining count
          //
          ttsb_id_count = 0;

          //
          // flush accumulated ttsk_ryx keys
          //
          uint const idx = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE
            (cohort_atomics+SKC_RASTER_COHORT_ATOMIC_OFFSET_KEYS,SKC_RASTERIZE_POOL_SIZE); // ttsk_ryx_count
#if 0
          printf("# %u\n",idx);
#endif
          //
          // refill the TTSB ID pool from the block pool ring
          //
          uint const id = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE
            (bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,SKC_RASTERIZE_POOL_SIZE) & bp_mask;

          for (uint ii=0; ii<SKC_RASTERIZE_POOL_SIZE; ii++)
            smem->pool.aN.id[ii] = bp_ids[id + ii];
        }

      //
      // invalidate the winning block
      //

      //
      // update bin with winning yx, new ttsb id and zero count
      //
      // all lanes are loading/storing from/to the same index
      //
      smem->bin.vN.ttsb [winner] = ( SKC_TTS_INVALID );
      smem->bin.aN.id   [winner] = smem->pool.aN.id[ttsb_id_count];
      smem->bin.aN.yx   [winner] = smem->pool.aN.yx[ttsb_id_count] = ((uint*)&yx)[cc];
      smem->bin.aN.count[winner] = 0;

      //
      // update count
      //
      ttsb_id_count += 1;
    }

  //
  // save count
  //
  smem->pool.count = ttsb_id_count;

#else
  //
  // SIMT
  //
  do {
    //
    // only one lane will win!
    //
    if (is_collision)
      smem->subgroup.winner = hash;

    barrier(CLK_LOCAL_MEM_FENCE);

    //
    // which bin is being ejected?
    //
    skc_uint const winner = smem->subgroup.winner;
    //
    // which colliding hash is taking over the bin?
    //
    SKC_RASTERIZE_PREDICATE const is_winner = is_collision && (hash == winner);

    //
    // all lanes with the same hash will try to store but only one
    // lane will win
    //
    if (is_winner)
      smem->subgroup.winner = yx;

    barrier(CLK_LOCAL_MEM_FENCE);

    //
    // flush this block to the pool
    //
    if (smem->bin.aN.count[winner] > 0)
      {
        skc_block_id_v_t const id  = smem->bin.aN.id[winner];
        skc_uint         const idx = id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane();
        skc_uint         const tts = smem->bin.aN.ttsb[winner][skc_subgroup_lane()];
#if 0
        printf("%08X : [ %10u = %10u : %08X ]\n",yx,id,idx,tts);
#endif
        bp_elems[idx].u32 = tts;
      }

    //
    // append new ttsk
    //
    skc_uint       const new_yx = smem->subgroup.winner;
    skc_block_id_t const new_id = skc_ttsk_v_append(SKC_SUBBLOCKS_BLOCKS_ARGS(),
                                                    blocks_next,
                                                    bp_atomics,
                                                    bp_mask, // pow2 modulo mask for block pool ring
                                                    bp_ids,
                                                    cohort_atomics,
                                                    sk_v,
                                                    sk_v_next,
                                                    sk_extent,
                                                    new_yx);
#if 0
    if (get_sub_group_local_id() == 0) {
      printf(">>> %9u\n",new_id);
    }
#endif

    //
    // update bin with winning yx, new ttsb id and zero count
    //
    smem->bin.aN.ttsb [winner][skc_subgroup_lane()] = SKC_TTS_INVALID;
    smem->bin.aN.yx   [winner]                      = new_yx;
    smem->bin.aN.id   [winner]                      = new_id;
    smem->bin.aN.count[winner]                      = 0;

    //
    // remove all lanes matching this hash
    //
    is_collision = is_collision && !is_winner;

    //
    // exit if nothing left to do
    //
  } while (sub_group_any(is_collision));

#endif
}

//
// scatter scan max
//

static
SKC_RASTERIZE_UINT
skc_scatter_scan_max(__local struct skc_subgroup_smem volatile * const smem,
                     SKC_RASTERIZE_FLOAT                         const iss,
                     SKC_RASTERIZE_FLOAT                         const ess)
{
  //
  // prefix sums determine which lanes we're going to work on next
  //
  SKC_RASTERIZE_PREDICATE const is_scratch_store = (iss > 0.0f) && (ess < (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP);
  SKC_RASTERIZE_UINT      const scratch_idx      = SKC_CONVERT(SKC_RASTERIZE_UINT)(max(ess,0.0f));

#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
#ifdef SKC_RASTERIZE_SIMD_USES_SMEM
  //
  // SIMD APPROACH 1: SIMT'ISH
  //

  // zero the volatile smem scratchpad using vector syntax
  smem->subgroup.vN.scratch[0] = ( 0 );

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A)                         \
  if (is_scratch_store C)                               \
    smem->subgroup.aN.scratch[scratch_idx C] = I;

  SKC_RASTERIZE_VECTOR_EXPAND();

  // propagate lanes to right using max scan
  SKC_RASTERIZE_UINT const scratch = smem->subgroup.vN.scratch[0];
  SKC_RASTERIZE_UINT const source  = skc_subgroup_scan_inclusive_max(scratch);

#else
  //
  // SIMD APPROACH 2: SCALAR'ISH
  //

  SKC_RASTERIZE_UINT source = ( 0 );

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A)                 \
  if (is_scratch_store C)                       \
    ((uint *)&source)[scratch_idx C] = I;

  SKC_RASTERIZE_VECTOR_EXPAND();

  // propagate components to the right using a scalar max scan
  for (uint ii=1; ii<SKC_RASTERIZE_VECTOR_SIZE; ii++)
    ((uint *)&source)[ii] = max(((uint *)&source)[ii-1],((uint *)&source)[ii]);
#endif

#else
  //
  // SIMT
  //

  //
  // zero the volatile smem scratchpad using vector syntax
  //
  smem->subgroup.vN.scratch[skc_subgroup_lane()] = ( 0 );

  //
  // store source lane at starting lane
  //
  if (is_scratch_store)
    smem->subgroup.aN.scratch[scratch_idx] = skc_subgroup_lane();

  //
  // propagate lanes to right using max scan
  //
  SKC_RASTERIZE_UINT const scratch = smem->subgroup.vN.scratch[skc_subgroup_lane()];
  SKC_RASTERIZE_UINT const source  = skc_subgroup_scan_inclusive_max(scratch);
#endif

  return source;
}
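//
// Worked example of the scatter/scan trick above (4 lanes,
// illustrative values): per-lane segment counts segs = { 2, 3, 1, 2 }
// give
//
//   iss = { 2, 5, 6, 8 }   (inclusive scan)
//   ess = { 0, 2, 5, 6 }   (exclusive scan)
//
// Lanes with ess < 4 scatter their lane index at scratch[ess]:
//
//   scratch = { 0, 0, 1, 0 }   (lane 0 -> slot 0, lane 1 -> slot 2)
//
// and the max scan propagates owners to the right:
//
//   source  = { 0, 0, 1, 1 }
//
// so lanes 0..1 emit lane 0's two segments and lanes 2..3 emit the
// first two of lane 1's three; the leftover work is picked up on the
// next iteration after the scans are decremented by the subgroup
// size.
//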
//
// sliver lines into subpixels
//

static
void
skc_sliver(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
           __global union skc_bp_elem                * const bp_elems,
           __global uint                             * const bp_ids,
           skc_uint                                    const bp_mask,
           __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
           skc_block_id_t                            * const subblocks,
           skc_block_id_v_t                          * const blocks,
           skc_uint                                  * const blocks_next,
           skc_ttsk_v_t                              * const sk_v,
           skc_uint                                  * const sk_v_next,
           __global skc_ttsk_s_t                     * const sk_extent,
           __local struct skc_subgroup_smem volatile * const smem,
           SKC_RASTERIZE_FLOAT                         const l0x,
           SKC_RASTERIZE_FLOAT                         const l0y,
           SKC_RASTERIZE_FLOAT                         const l1x,
           SKC_RASTERIZE_FLOAT                         const l1y)
{
  //
  // Y-SLIVERING
  // -----------
  //
  // immediately sliver all multi-pixel lines into 1-pixel high
  // lines
  //
  // note this implicitly squelches horizontal lines
  //
  // there is another test for horizontal lines after x-slivering
  // is complete
  //
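  //
  // Illustrative example (assuming 32 subpixels per pixel, i.e.
  // SKC_SUBPIXEL_Y_SCALE_DOWN = 1/32 -- the real constant lives in a
  // shared header): a line from y = 5 to y = 165 subpixels spans
  // floor(5/32) = 0 through ceil(165/32) = 6, so y_segs = 6
  // one-pixel-high slivers are produced, while a horizontal line
  // (l0y == l1y) yields y_segs = 0 and is squelched.
  //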
  //
  // will we need to flip the sign of y_delta ?
  //
  SKC_RASTERIZE_PREDICATE const y_lt   = (l0y <= l1y);
  SKC_RASTERIZE_UINT      const dy_xor = y_lt ? 0 : 0x80000000;

  //
  // save 1/dy
  //
  SKC_RASTERIZE_FLOAT const y_denom = native_recip(l1y - l0y);

  //
  // how many non-horizontal subpixel y-axis slivers are there?
  //
  SKC_RASTERIZE_FLOAT const y_min  = floor(fmin(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN);
  SKC_RASTERIZE_FLOAT const y_max  = ceil (fmax(l0y,l1y) * SKC_SUBPIXEL_Y_SCALE_DOWN);
  SKC_RASTERIZE_FLOAT const y_base = y_lt ? y_min : y_max;
  SKC_RASTERIZE_FLOAT       y_segs = y_max - y_min;

  //
  // inclusive subgroup scan of y_segs
  //
  SKC_RASTERIZE_FLOAT       y_iss = skc_subgroup_scan_inclusive_add_float(y_segs);
  SKC_RASTERIZE_FLOAT       y_ess = y_iss - y_segs;
  float                     y_rem = skc_subgroup_last_float(y_iss);

  //
  // if this is a horizontal line then tweak y_iss so "is_scratch_store" always fails
  //
  if (y_segs == 0.0f)
    y_iss = 0.0f;

#if 0
  printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } (* %5.0f / %5.0f / %5.0f / %5.0f *) },\n",a0x,a0y,a1x,a1y,y_segs,y_iss,y_ess,y_rem);
#endif

  //
  // these values don't matter on the first iteration
  //
  SKC_RASTERIZE_FLOAT n1x_prev = 0;
  SKC_RASTERIZE_FLOAT n1y_prev = 0;

  //
  // loop until done
  //
  while (y_rem > 0.0f)
    {
      //
      // distribute work across lanes
      //
      SKC_RASTERIZE_UINT const y_source = skc_scatter_scan_max(smem,y_iss,y_ess);

      //
      // get line at y_source lane
      //
      SKC_RASTERIZE_FLOAT const m0x = skc_subgroup_shuffle(l0x,y_source);
      SKC_RASTERIZE_FLOAT const m0y = skc_subgroup_shuffle(l0y,y_source);
      SKC_RASTERIZE_FLOAT const m1x = skc_subgroup_shuffle(l1x,y_source);
      SKC_RASTERIZE_FLOAT const m1y = skc_subgroup_shuffle(l1y,y_source);

      //
      // every lane will create a 1 pixel tall line "sliver"
      //
      // FIXME -- this gets expanded on SIMD
      //
      // if numerator == 1 then this is the first lane
      // if numerator == s then this is the last lane
      //
      SKC_RASTERIZE_FLOAT     const y_delta    = skc_delta_offset() - skc_subgroup_shuffle(y_ess,y_source);
      SKC_RASTERIZE_FLOAT     const y_count    = skc_subgroup_shuffle(y_segs,y_source);

      SKC_RASTERIZE_PREDICATE const is_y_first = (y_delta == 1.0f);
      SKC_RASTERIZE_PREDICATE const is_y_last  = (y_delta >= y_count);

      // toggle y_delta sign
      SKC_RASTERIZE_FLOAT     const y_offset   = as_float((as_uint(y_delta) ^ intel_sub_group_shuffle(dy_xor,y_source)));

      //
      // calculate "right" line segment endpoint
      //
      SKC_RASTERIZE_FLOAT       n1y = (y_offset + skc_subgroup_shuffle(y_base,y_source)) * SKC_SUBPIXEL_Y_SCALE_UP;
      SKC_RASTERIZE_FLOAT const n_t = (n1y - m0y) * skc_subgroup_shuffle(y_denom,y_source);
      SKC_RASTERIZE_FLOAT       n1x = round(SKC_LERP(m0x,m1x,n_t));

      //
      // override c1 if this is last point
      //
      n1y = select(n1y,m1y,is_y_last);
      n1x = select(n1x,m1x,is_y_last);

      //
      // shuffle up "left" line segment endpoint
      //
      // NOTE: Intel's shuffle_up is unique with its elegant
      // "previous" argument so don't get used to it
      //
      SKC_RASTERIZE_FLOAT n0y = skc_subgroup_shuffle_up_1(n1y_prev,n1y);
      SKC_RASTERIZE_FLOAT n0x = skc_subgroup_shuffle_up_1(n1x_prev,n1x);

      //
      // override shuffle up if this is the first line segment
      //
      n0y = select(n0y,m0y,is_y_first);
      n0x = select(n0x,m0x,is_y_first);

      //
      // save previous right endpoint
      //
      n1x_prev = n1x;
      n1y_prev = n1y;

      //
      // decrement by subgroup size
      //
      y_iss -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
      y_ess -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
      y_rem -= (float)SKC_RASTERIZE_ELEMS_PER_SUBGROUP;

#if 0
      //
      // debug
      //
      if (n0y != n1y) {
        printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",n0x,n0y,n1x,n1y);
      }
#endif

      //
      // X-SLIVERING
      // -----------
      //
      // now sliver 1-pixel high lines into either vertical or
      // 1-pixel wide lines
      //
      // save original direction and work with increasing x
      //
      SKC_RASTERIZE_PREDICATE const x_lt   = (n0x <= n1x);
      SKC_RASTERIZE_UINT      const dx_xor = x_lt ? 0 : 0x80000000;

      //
      // save 1/dx
      //
      SKC_RASTERIZE_FLOAT const x_denom = native_recip(n1x - n0x);

      //
      // how many non-vertical subpixel x-axis slivers are there?
      //
      SKC_RASTERIZE_FLOAT const x_min  = floor(fmin(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN);
      SKC_RASTERIZE_FLOAT const x_max  = ceil (fmax(n0x,n1x) * SKC_SUBPIXEL_X_SCALE_DOWN);
      SKC_RASTERIZE_FLOAT const x_base = x_lt ? x_min : x_max;
      SKC_RASTERIZE_FLOAT const x_segs = fmax(x_max - x_min,1.0f);

      //
      // inclusive subgroup scan of x_segs
      //
      SKC_RASTERIZE_FLOAT       x_iss = skc_subgroup_scan_inclusive_add_float(x_segs);
      SKC_RASTERIZE_FLOAT       x_ess = x_iss - x_segs;
      float                     x_rem = skc_subgroup_last_float(x_iss);

      //
      // if this is a vertical line then tweak x_iss so "is_scratch_store" always fails
      //
      //if (x_segs == 0.0f)
      //  x_iss = 0.0f;

      //
      // these values don't matter on the first iteration
      //
      SKC_RASTERIZE_FLOAT p1x_prev = 0;
      SKC_RASTERIZE_FLOAT p1y_prev = 0;

      //
      // loop until done
      //
      while (x_rem > 0)
        {
          //
          // distribute work across lanes
          //
          SKC_RASTERIZE_UINT const x_source = skc_scatter_scan_max(smem,x_iss,x_ess);

          //
          // get line at x_source lane
          //
          SKC_RASTERIZE_FLOAT const o0x = skc_subgroup_shuffle(n0x,x_source);
          SKC_RASTERIZE_FLOAT const o0y = skc_subgroup_shuffle(n0y,x_source);
          SKC_RASTERIZE_FLOAT const o1x = skc_subgroup_shuffle(n1x,x_source);
          SKC_RASTERIZE_FLOAT const o1y = skc_subgroup_shuffle(n1y,x_source);

          //
          // every lane will create a 1 pixel wide line "sliver"
          //
          // FIXME -- this gets expanded on SIMD
          //
          // if numerator == 1 then this is the first lane
          // if numerator == s then this is the last lane
          //
          SKC_RASTERIZE_FLOAT     const x_delta    = skc_delta_offset() - skc_subgroup_shuffle(x_ess,x_source);
          SKC_RASTERIZE_FLOAT     const x_count    = skc_subgroup_shuffle(x_segs,x_source);

          SKC_RASTERIZE_PREDICATE const is_x_first = (x_delta == 1.0f);
          SKC_RASTERIZE_PREDICATE const is_x_last  = (x_delta >= x_count);

          // toggle x_delta sign
          SKC_RASTERIZE_FLOAT     const x_offset   = as_float((as_uint(x_delta) ^ intel_sub_group_shuffle(dx_xor,x_source)));

          //
          // calculate "right" line segment endpoint
          //
          SKC_RASTERIZE_FLOAT       p1x = (x_offset + skc_subgroup_shuffle(x_base,x_source)) * SKC_SUBPIXEL_X_SCALE_UP;
          SKC_RASTERIZE_FLOAT const p_t = (p1x - o0x) * skc_subgroup_shuffle(x_denom,x_source);
          SKC_RASTERIZE_FLOAT       p1y = round(SKC_LERP(o0y,o1y,p_t));

          //
          // override c1 if this is last point
          //
          p1x = select(p1x,o1x,is_x_last);
          p1y = select(p1y,o1y,is_x_last);

          //
          // shuffle up "left" line segment endpoint
          //
          // NOTE: Intel's shuffle_up is unique with its elegant
          // "previous" argument so don't get used to it
          //
          SKC_RASTERIZE_FLOAT p0x = skc_subgroup_shuffle_up_1(p1x_prev,p1x);
          SKC_RASTERIZE_FLOAT p0y = skc_subgroup_shuffle_up_1(p1y_prev,p1y);

          //
          // override shuffle up if this is the first line segment
          //
          p0x = select(p0x,o0x,is_x_first);
          p0y = select(p0y,o0y,is_x_first);

          //
          // save previous right endpoint
          //
          p1x_prev = p1x;
          p1y_prev = p1y;
          //
          // decrement by subgroup size
          //
          x_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
          x_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
          x_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;

          //
          // only non-horizontal subpixel lines are valid
          //
          SKC_RASTERIZE_PREDICATE is_active = (p0y != p1y);

          //
          // if no lanes are active then continue
          //
          // FIXME -- THIS SIMPLE SUB_GROUP_ANY TEST SIGNIFICANTLY
          // IMPACTS PERFORMANCE (+12% ?)
          //
          // IT SHOULDN'T !!!
          //
#if 0
          if (!skc_subgroup_any(is_active))
            continue;
#endif

          //
          // Option 1: use SLM for explicitly managed coalesced stores
          //
          //   1. which tile does this line belong to?
          //   2. hash tile coordinates
          //   3. lookup hash
          //   4. if tile matches then SLM append keys
          //   5. if tile doesn't match
          //     a. flush
          //     b. create new TTSK_RYX
          //     c. obtain TTSB block from pool
          //     d. goto 3.
          //

          //
          // Option 2: rely on L1/L2/L3 to mitigate non-coalesced stores
          //
          //   1. which tile does this line belong to?
          //   2. hash tile coordinates
          //   3. lookup hash
          //   4. if tile matches then GMEM append keys
          //   5. if tile doesn't match
          //     a. flush (and invalidate empty elems)
          //     b. create new TTSK_RYX
          //     c. obtain TTSB block from pool
          //     d. goto 3.
          //

          //
          // The virtual rasterization surface is very large and
          // signed: +/- ~64K-256K, depending on the architecture.
          //
          // Rasters must be clipped to the virtual surface and,
          // optionally, clipped even further on a per raster
          // basis.
          //

          //
          // Clip to the per-raster clip
          //

          /* CLIP HERE */

          //
          // Hash the tile coordinates
          //
          // This table lists nominal values for each architecture.
          // We want to choose values that naturally fit the
          // "width" of the architecture.
          //
          //   SIMD  RANGE  BITS  MAX RANGE  MAX BINS  HASH BITS
          //   ----  -----  ----  ---------  --------  ---------
          //      4  [0, 4]    3    [0,  7]        10  mod(10)   <-- SSE42, ?
          //      8  [0, 8]    4    [0, 15]         8  3         <-- GEN*,AVX*
          //     16  [0,16]    5    [0, 31]         6  mod(6)    <-- GEN*,?
          //     32  [0,32]    6    [0, 63]         5  mod(5)    <-- CUDA,PowerVR,Adreno,GEN*
          //     64  [0,64]    7    [0,127]         4  2         <-- AMD Radeon
          //
          // NOTE: When possible, bias the hash toward using more y
          // bits because of:
          //
          //   1. the 90 degree counter-clockwise rotation that we put
          //      in place to offset the render-time clockwise
          //      rotation
          //
          //   2. the likely presence of left-to-right or
          //      right-to-left glyphs.
          //
          // For power-of-two bins, the hash is easy.
          //
          // For non-power-of-two, we may want to either implement a
          // fast mod (the compiler should do this for us... hahahaha)
          // or drop down to the next power-of-two.
          //
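          //
          // Worked example using the X_BITS = 1, Y_BITS = 2 values
          // from the commented-out AVX2 snippet at the top of this
          // file (illustrative -- the live values come from the
          // kernel header): that's 1 << (1+2) = 8 bins, and a line
          // landing in tile (tile_x = 3, tile_y = 5) hashes to
          //
          //   hash = ((5 & 0x3) << 1) | (3 & 0x1) = (1 << 1) | 1 = 3
          //
          // Any other tile whose coordinates collide modulo the masks
          // (e.g. tile_x = 5, tile_y = 1) maps to the same bin and
          // will eventually force an ejection.
          //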
          //
          // FIXME -- this snarl is not good -- can probably reduce
          // some of the sign casting but some is there to vectorize a
          // scalar
          //
          SKC_RASTERIZE_INT  const z0y    = SKC_CONVERT(SKC_RASTERIZE_INT)(p0y);
          SKC_RASTERIZE_INT  const z1y    = SKC_CONVERT(SKC_RASTERIZE_INT)(p1y);

          SKC_RASTERIZE_INT  const z0x    = SKC_CONVERT(SKC_RASTERIZE_INT)(p0x);
          SKC_RASTERIZE_INT  const z1x    = SKC_CONVERT(SKC_RASTERIZE_INT)(p1x);

          SKC_RASTERIZE_INT  const min_y  = min(z0y,z1y);
          SKC_RASTERIZE_INT  const max_y  = max(z0y,z1y);

          SKC_RASTERIZE_INT  const tile_y = min_y >> SKC_SUBTILE_RESL_Y_LOG2;

          SKC_RASTERIZE_UINT const ty     = SKC_AS(SKC_RASTERIZE_UINT)(min_y) & SKC_SUBTILE_MASK_Y;
          SKC_RASTERIZE_INT        dy     = SKC_AS(SKC_RASTERIZE_INT)(z1y - z0y);

          //
          // map [+1,+32] to [ 0,+31]
          // map [-1,-32] to [-1,-32]
          //
          SKC_RASTERIZE_INT        dys    = (dy + (~dy >> 31)) << 26;

          SKC_RASTERIZE_INT  const min_x  = min(z0x,z1x);
          SKC_RASTERIZE_INT  const max_x  = max(z0x,z1x);

          SKC_RASTERIZE_INT  const tile_x = min_x >> SKC_SUBTILE_RESL_X_LOG2;

          SKC_RASTERIZE_UINT const tx     = SKC_AS(SKC_RASTERIZE_UINT)(min_x) & SKC_SUBTILE_MASK_X;
          SKC_RASTERIZE_UINT const sx     = SKC_AS(SKC_RASTERIZE_UINT)(max_x - min_x);

          SKC_RASTERIZE_UINT const tts    = dys | (ty << 16) | (sx << 10) | tx;

          SKC_RASTERIZE_UINT const hash   = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & SKC_RASTERIZE_TILE_HASH_Y_MASK) << SKC_RASTERIZE_TILE_HASH_X_BITS) |
                                             (SKC_AS(SKC_RASTERIZE_UINT)(tile_x)  & SKC_RASTERIZE_TILE_HASH_X_MASK));

          SKC_RASTERIZE_UINT const yx     = (((SKC_AS(SKC_RASTERIZE_UINT)(tile_y) & 0xFFF) << 12) |
                                             (SKC_AS(SKC_RASTERIZE_UINT)(tile_x)  & 0xFFF));

#if 0
          printf("(%3u, %3u)\n",tile_y,tile_x);
#endif

#if 0
          if (is_active)
            printf("( %3u, %3u ) : [ %3u, %3u, %3d, %3d, %3u ]\n",tile_y,tile_x,ty,tx,dy,((int)dys)>>26,sx);
#endif

          //
          // debug
          //
#if 0 // PRINTF_ENABLE

#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A)                                         \
          if (is_active C)                                              \
            printf("{ { %5d, %5d }, { %5d, %5d } (* %2u *) },\n",z0x C,z0y C,z1x C,z1y C,hash C);

          SKC_RASTERIZE_VECTOR_EXPAND();

#else
          if (is_active)
            printf("{ { %5d, %5d }, { %5d, %5d } } (* %2u *),\n",z0x,z0y,z1x,z1y,hash);
#endif

#endif
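          //
          // Illustrative packing of one TTS word from the fields
          // above (values made up, field widths as coded here):
          // ty = 5, sx = 2, tx = 9 and dy = +1.  The biased delta is
          //
          //   dys = (1 + (~1 >> 31)) << 26 = 0 << 26 = 0
          //
          // -- the [+1,+32] -> [0,+31] remap -- so
          //
          //   tts = 0 | (5 << 16) | (2 << 10) | 9 = 0x00050809
          //
          // while a negative dy (e.g. -1) is left unchanged:
          // (-1 + 0) << 26 keeps its sign bits.
          //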
          //
          // flush all active lanes
          //
          while (true)
            {
              //
              // either gather load or vector load+shuffle the yx keys
              //
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
              SKC_RASTERIZE_BIN  const yx_bin = smem->bin.vN.yx;
              SKC_RASTERIZE_UINT const yx_cur = shuffle(yx_bin,hash);
#else
              SKC_RASTERIZE_UINT const yx_cur = smem->bin.aN.yx[hash];
#endif

              //
              // does yx for lane match yx for hash?
              //
              SKC_RASTERIZE_UINT      const active_yx = is_active ? yx : SKC_RASTERIZE_YX_INVALID;
              SKC_RASTERIZE_PREDICATE const is_match  = (yx_cur == active_yx);

              //
              // OpenCL spec: "When casting a bool to a vector integer
              // data type, the vector components will be set to -1
              // (i.e. all bits set) if the vector bool value is true
              // and 0 otherwise."
              //
#if ( SKC_RASTERIZE_VECTOR_SIZE_LOG2 == 0 )
              SKC_RASTERIZE_UINT const h_match = (SKC_RASTERIZE_UINT)is_match;
#else
              SKC_RASTERIZE_UINT const h_match = abs(is_match); // {-1,0} -> {+1,0}
#endif

              //
              // how many new elements for each matching hash bin?
              //
              SKC_RASTERIZE_UINT const h_shl = hash * SKC_RASTERIZE_TILE_HASH_BIN_BITS;
              SKC_RASTERIZE_UINT const h     = h_match << h_shl;

              //
              // prefix sum all of the bins in parallel
              //
              SKC_RASTERIZE_UINT const h_iss   = skc_subgroup_scan_inclusive_add_uint(h);
              SKC_RASTERIZE_UINT const h_total = skc_subgroup_last_uint(h_iss);

              //
              // current bin counts
              //
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
              SKC_RASTERIZE_BIN  const count_bin = smem->bin.vN.count;
              SKC_RASTERIZE_UINT const count_cur = shuffle(count_bin,hash);
#else
              SKC_RASTERIZE_UINT const count_cur = smem->bin.aN.count[hash];
#endif

              //
              // calculate where each cache-hit and in-bounds tts should be stored
              //
              SKC_RASTERIZE_UINT const ttsb_index = (h_iss   >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur - 1;
              SKC_RASTERIZE_UINT const count_new  = (h_total >> h_shl & SKC_RASTERIZE_TILE_HASH_BIN_MASK) + count_cur;

              //
              // which lanes can append to a matching bin?
              //
              SKC_RASTERIZE_PREDICATE const is_append = is_match && (ttsb_index < SKC_DEVICE_SUBBLOCK_WORDS);

              //
              // scatter append tts elements to bin blocks
              //
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
              //
              // SIMD
              //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A)                                           \
              if (is_append C)                                            \
                {                                                         \
                  smem->bin.aN.ttsb [hash C][ttsb_index C] = tts C;       \
                  smem->bin.aN.count[hash C]               = count_new C; \
                }

              SKC_RASTERIZE_VECTOR_EXPAND();
#else
              //
              // SIMT
              //
              if (is_append)
                {
                  smem->bin.aN.ttsb [hash][ttsb_index] = tts;
                  smem->bin.aN.count[hash]             = count_new; // it's ok if this is > SKC_DEVICE_SUBBLOCK_WORDS
                }
#endif

              //
              // try to keep predicate updates SIMD-friendly and
              // outside of predicated code paths -- this is not
              // always how we would normally do things on SIMT but
              // either approach is acceptable
              //

              //
              // mask off lanes/components that successfully appended
              //
              is_active = is_active && !is_append;

              //
              // are there any active lanes left?
              //
              if (!skc_subgroup_any(is_active))
                break;

              //
              // There are active lanes that couldn't be appended to a
              // bin because their hashes collided with the bin's
              // current ryx key, so those bins must be ejected.
              //
              // Note that we do not eject "full" bins because lazily
              // waiting for a collision results in simpler code.
              //
              skc_flush(bp_atomics,
                        bp_elems,
                        bp_ids,
                        bp_mask,
                        cohort_atomics,
                        subblocks,
                        blocks,
                        blocks_next,
                        sk_v,
                        sk_v_next,
                        sk_extent,
                        smem,
                        hash,
                        yx,
                        is_active);
            }
        }
    }
}

//
// INITIALIZE SMEM
//
// Note that SIMD/SIMT have nearly the same syntax.
//
static
void
skc_smem_init(__local struct skc_subgroup_smem volatile * const smem)
{
  //
  // initialize smem bins
  //
#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )
  //
  // SIMD
  //
  smem->bin.vN.yx    = ( SKC_RASTERIZE_YX_INIT );
  smem->bin.vN.count = ( 0 );
#else
  //
  // SIMT
  //
  int idx = skc_subgroup_lane();

#if   ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT < SKC_RASTERIZE_ELEMS_PER_SUBGROUP )
  if (idx < SKC_RASTERIZE_TILE_HASH_BIN_COUNT)
#elif ( SKC_RASTERIZE_TILE_HASH_BIN_COUNT > SKC_RASTERIZE_ELEMS_PER_SUBGROUP )
  for (; idx<SKC_RASTERIZE_TILE_HASH_BIN_COUNT; idx+=SKC_RASTERIZE_SUBGROUP_SIZE)
#endif
    {
      smem->bin.aN.yx   [idx] = ( SKC_RASTERIZE_YX_INIT );
      smem->bin.aN.count[idx] = ( 0 );
    }
#endif
}

//
// RASTERIZE CUBIC KERNEL
//

static
void
skc_rasterize_cubics(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
                     __global union skc_bp_elem                * const bp_elems,
                     __global uint                             * const bp_ids,
                     skc_uint                                    const bp_mask,
                     __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
                     __global skc_ttsk_s_t                     * const sk_extent,
                     __local struct skc_subgroup_smem volatile * const smem,
                     skc_uint                                  * const nodeword,
                     skc_block_id_t                            * const id,
                     union skc_transform const                 * const tv,
                     union skc_path_clip const                 * const cv,
                     skc_uint                                    const cohort)
{
  //
  // the initial segment idx and segments-per-block constant determine
  // how many block ids will need to be loaded
  //
  SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c3x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c3y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  //
  // apply transform
  //
  // note that we only care if the end points are rounded to subpixel precision
  //
  // FIXME -- transformation is currently affine-only
  // FIXME -- support perspective later
  //
  // the affine transformation requires 16 FMA + 4 ROUND operations
  //
  SKC_RASTERIZE_FLOAT b0x = c0x * tv->sx  + c0y * tv->shx + tv->tx;
  SKC_RASTERIZE_FLOAT b0y = c0x * tv->shy + c0y * tv->sy  + tv->ty;

  SKC_RASTERIZE_FLOAT t1x = c1x * tv->sx  + c1y * tv->shx + tv->tx;
  SKC_RASTERIZE_FLOAT t1y = c1x * tv->shy + c1y * tv->sy  + tv->ty;

  SKC_RASTERIZE_FLOAT t2x = c2x * tv->sx  + c2y * tv->shx + tv->tx;
  SKC_RASTERIZE_FLOAT t2y = c2x * tv->shy + c2y * tv->sy  + tv->ty;

  SKC_RASTERIZE_FLOAT t3x = c3x * tv->sx  + c3y * tv->shx + tv->tx;
  SKC_RASTERIZE_FLOAT t3y = c3x * tv->shy + c3y * tv->sy  + tv->ty;

  //
  // FIXME -- this is temporary support for projection
  //
  bool const is_affine = (tv->w0 == 0.0f) && (tv->w1 == 0.0f);

  if (!is_affine)
    {
      SKC_PROJECT(tv,c0x,c0y,b0x,b0y);
      SKC_PROJECT(tv,c1x,c1y,t1x,t1y);
      SKC_PROJECT(tv,c2x,c2y,t2x,t2y);
      SKC_PROJECT(tv,c3x,c3y,t3x,t3y);
    }

  b0x = round(b0x);
  b0y = round(b0y);

  t3x = round(t3x);
  t3y = round(t3y);

  //
  //
  //
#if PRINTF_ENABLE

#if ( SKC_RASTERIZE_SUBGROUP_SIZE == 1 )

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,A)                         \
  printf("{ { %.02f, %.02f }, { %.02f, %.02f },"        \
         " { %.02f, %.02f }, { %.02f, %.02f } },\n",    \
         b0x C,b0y C,t1x C,t1y C,                       \
         t2x C,t2y C,t3x C,t3y C);

  SKC_RASTERIZE_VECTOR_EXPAND();
#else printf("{ { %.02f, %.02f }, { %.02f, %.02f }, { %.02f, %.02f }, { %.02f, %.02f } },\n", b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y); #endif #endif // // OLD APPROACH // ------------ // // The Spinel CUDA rasterizer was significantly more complex and // performed a few different tasks that are probably best kept // separate. // // The Spinel rasterizer Bezier held 4-element x and y coordinates // in adjacent lanes. This simplified intermingling of single lane // 4-coordinate line segments with two-lane cubic Beziers. // // After transformation of the input segments, the Spinel rasterizer // would test cubics for flatness and, if flat, collapse the // adjacent lanes into a single line lane and an empty lane. // // Any lines would then be appended to a line queue. // // Any cubics would then be subdivided. // // The reclassification process would be repeated. // // NEW APPROACH // ------------ // // Assume we're only working with cubics in this kernel. // // Optimization: if the line segment is a special case -- a cusp, // has 1+ inflections, or a loop -- it might be beneficial to // subdivide the control cage 1+ times in order to separate the // flatter segments the high-velocity region(s). // // This means we want to split using [a,b] formulation to _directly_ // subdivide producing a new control cage. // // Wang's Formula is still useful even if we subdivide once or twice // as it's so cheap that it might give some useful hints about where // the high-velocity sections of curve reside. // // But it seems like using Wang's and directly flattening to line // segments without any subdivision is good enough for the limited // set of test cases that I've tried. // // So... use Wang's Formula to estimate how many line segment are // required to properly flatten the cubics. // // Then use inclusive/exclusive scans to put all the lanes to work: // // 1. segmenting cubics to line segments // // 2. slivering line segments into 1-pixel high line segments // // 3. slivering 1-pixel high line segments into 1-pixel wide line // segments // // MORE BACKGROUND ON NEW APPROACH // ------------------------------- // // Two options for handling line segments: // // 1. append the line segments onto an SLM array until enough // work has been accrued (Spinel does this) // // 2. immediately sliver the potentially multi-pixel line // segments into subpixel lines // // The advantage of (1) is that it guarantees the slivering // process will, on average, always be emitting a full subgroup // of subpixel lines. // // The advantage of (2) is that it reduces code complexity and // leaves more room for SLM tile bins. The difference between Spinel // and Skia Compute is that Wang's Formula guarantees there will be // a full subgroup of multi-pixel lines unless this is the final // iteration of the warp of multi-pixel lines. // // Note that wider GPU architectures might benefit from (1) and // other work accumulation strategies because it will minimize // partial warp workloads in the final iteration of each stage. It // also minimizes the sunk cost of the uniform control logic steps. // // So let's implement (2) for now... // // // And... begin! // // Estimate how many line segments are in quad/cubic curve. // // Wang's Formula will return zero if the control points are // collinear but we bump it up to 1.0f. 
  SKC_RASTERIZE_FLOAT const s_segs  = skc_wangs_formula_cubic(b0x,b0y,t1x,t1y,t2x,t2y,t3x,t3y);

  //
  // if there are free registers then precalculate the reciprocal for
  // the estimated segments since it will never change
  //
  SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs);

  //
  // inclusive add scan of estimated line segments
  // exclusive add scan of estimated line segments
  // total number of estimated line segments
  //
  SKC_RASTERIZE_FLOAT       s_iss   = skc_subgroup_scan_inclusive_add_float(s_segs);
  SKC_RASTERIZE_FLOAT       s_ess   = s_iss - s_segs;
  float                     s_rem   = skc_subgroup_last_float(s_iss); // scalar

  //
  // Precompute cubic polynomial coefficients from the transformed
  // control cage so we can shuffle them in on each iteration of the
  // outer loop and then evaluate the polynomial in Horner form.
  //
  //                              |  1  0  0  0 | | c0 |
  //                              |             | |    |
  //                              | -3  3  0  0 | | c1 |
  //   B(t) = [ 1 t^1 t^2 t^3 ] * |             | |    |
  //                              |  3 -6  3  0 | | c2 |
  //                              |             | |    |
  //                              | -1  3 -3  1 | | c3 |
  //
  SKC_RASTERIZE_FLOAT const b1x = mad(-3.0f,b0x,3.0f*t1x);                // 2 - 1 MAD + MUL
  SKC_RASTERIZE_FLOAT const b1y = mad(-3.0f,b0y,3.0f*t1y);                // 2 - 1 MAD + MUL

  SKC_RASTERIZE_FLOAT const b2x = mad(3.0f,b0x,mad(-6.0f,t1x,3.0f*t2x));  // 3 - 2 MAD + MUL
  SKC_RASTERIZE_FLOAT const b2y = mad(3.0f,b0y,mad(-6.0f,t1y,3.0f*t2y));  // 3 - 2 MAD + MUL

  SKC_RASTERIZE_FLOAT const b3x = mad(3.0f,t1x,mad(-3.0f,t2x,t3x)) - b0x; // 3 - 2 MAD + SUB
  SKC_RASTERIZE_FLOAT const b3y = mad(3.0f,t1y,mad(-3.0f,t2y,t3y)) - b0y; // 3 - 2 MAD + SUB
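  //
  // Sanity check of the coefficients (exact algebra; writing c0..c3
  // for the transformed control points b0,t1,t2,t3): substituting
  // b1..b3 back into Horner form
  //
  //   B(t) = ((b3*t + b2)*t + b1)*t + b0
  //
  // expands to the Bernstein cubic
  //
  //   (1-t)^3 c0 + 3(1-t)^2 t c1 + 3(1-t) t^2 c2 + t^3 c3
  //
  // e.g. the t^3 terms collect to (-c0 + 3c1 - 3c2 + c3) == b3, and
  // B(0) = b0 = c0 while B(1) = b0+b1+b2+b3 = c3, so the endpoints
  // are reproduced exactly.
  //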
  //
  // these values don't matter on the first iteration
  //
  SKC_RASTERIZE_FLOAT l1x_prev = 0;
  SKC_RASTERIZE_FLOAT l1y_prev = 0;

  //
  // allocate and init in-register TTSK keys
  //
  skc_uint     sk_v_next = 0;
  skc_ttsk_v_t sk_v;

  sk_v.hi = cohort;

  //
  // initialize smem
  //
  skc_smem_init(smem);

  //
  // initialize blocks / subblocks
  //
  skc_block_id_v_t blocks;
  skc_uint         blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;

#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
  skc_block_id_t   subblocks   = 0;
#endif

  //
  // loop until done
  //
  while (s_rem > 0)
    {
      //
      // distribute work across lanes
      //
      SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess);

      //
      // every lane has a fraction to work off of
      //
      // FIXME -- this gets expanded on SIMD
      //
      // if delta == 1      then this is the first lane
      // if count == s_segs then this is the last lane
      //
      SKC_RASTERIZE_FLOAT     const s_delta    = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source);
      SKC_RASTERIZE_FLOAT     const s_count    = skc_subgroup_shuffle(s_segs,s_source);

      SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f);
      SKC_RASTERIZE_PREDICATE const is_s_last  = (s_delta >= s_count);

      //
      // init parametric t
      //
      SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)?

      //
      // if last then override to a hard 1.0f
      //
      s_t = is_s_last ? 1.0f : s_t;

      //
      // decrement by subgroup size
      //
      s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
      s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
      s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;

      //
      // now every lane knows what to do and the following lines will
      // pump out up to SUBGROUP_SIZE line segments
      //
      // obtain the src vertices through shared or via a shuffle
      //

      //
      // shuffle in the polynomial coefficients from their source lane
      //
      SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source);
      SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source);

      SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source);
      SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source);

      SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source);
      SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source);

      SKC_RASTERIZE_FLOAT const s3x = skc_subgroup_shuffle(b3x,s_source);
      SKC_RASTERIZE_FLOAT const s3y = skc_subgroup_shuffle(b3y,s_source);

      //
      // calculate "right" line segment endpoint using Horner form
      //
      SKC_RASTERIZE_FLOAT l1x = round(mad(mad(mad(s3x,s_t,s2x),s_t,s1x),s_t,s0x)); // 3 MAD + ROUND
      SKC_RASTERIZE_FLOAT l1y = round(mad(mad(mad(s3y,s_t,s2y),s_t,s1y),s_t,s0y)); // 3 MAD + ROUND

      //
      // shuffle up "left" line segment endpoint
      //
      // NOTE: Intel's shuffle_up is unique with its elegant
      // "previous" argument so don't get used to it
      //
      SKC_RASTERIZE_FLOAT l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x);
      SKC_RASTERIZE_FLOAT l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y);

      //
      // save previous right endpoint
      //
      l1x_prev = l1x;
      l1y_prev = l1y;

      //
      // override shuffle up if this is the first line segment
      //
      l0x = select(l0x,s0x,is_s_first);
      l0y = select(l0y,s0y,is_s_first);

      //
      // sliver lines
      //
      skc_sliver(bp_atomics,
                 bp_elems,
                 bp_ids,
                 bp_mask,
                 cohort_atomics,
                 &subblocks,
                 &blocks,
                 &blocks_next,
                 &sk_v,
                 &sk_v_next,
                 sk_extent,
                 smem,
                 l0x,l0y,l1x,l1y);
    }

  //
  // - flush work-in-progress blocks
  // - return unused block ids
  //
  skc_finalize(bp_atomics,
               bp_elems,
               bp_ids,
               bp_mask,
               cohort_atomics,
               &blocks,
               blocks_next,
               &sk_v,
               sk_v_next,
               sk_extent,
               smem);
}

//
// RASTERIZE QUAD KERNEL
//

static
void
skc_rasterize_quads(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
                    __global union skc_bp_elem                * const bp_elems,
                    __global uint                             * const bp_ids,
                    skc_uint                                    const bp_mask,
                    __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
                    __global skc_ttsk_s_t                     * const sk_extent,
                    __local struct skc_subgroup_smem volatile * const smem,
                    skc_uint                                  * const nodeword,
                    skc_block_id_t                            * const id,
                    union skc_transform const                 * const tv,
                    union skc_path_clip const                 * const cv,
                    skc_uint                                    const cohort)
{
  //
  // the initial segment idx and segments-per-block constant determine
  // how many block ids will need to be loaded
  //
  SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c2x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c2y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  //
  // apply transform
  //
  // note that we only care if the end points are rounded to subpixel precision
  //
  // FIXME -- transformation is currently affine-only
  // FIXME -- support perspective later
  //
  //
  // the affine transformation requires 12 FMA + 4 ROUND operations
  //
  SKC_RASTERIZE_FLOAT b0x = c0x * tv->sx  + c0y * tv->shx + tv->tx;
  SKC_RASTERIZE_FLOAT b0y = c0x * tv->shy + c0y * tv->sy  + tv->ty;

  SKC_RASTERIZE_FLOAT t1x = c1x * tv->sx  + c1y * tv->shx + tv->tx;
  SKC_RASTERIZE_FLOAT t1y = c1x * tv->shy + c1y * tv->sy  + tv->ty;

  SKC_RASTERIZE_FLOAT t2x = c2x * tv->sx  + c2y * tv->shx + tv->tx;
  SKC_RASTERIZE_FLOAT t2y = c2x * tv->shy + c2y * tv->sy  + tv->ty;

  //
  // FIXME -- this is temporary support for projection
  //
  bool const is_affine = (tv->w0 == 0.0f) && (tv->w1 == 0.0f);

  if (!is_affine) {
    SKC_PROJECT(tv,c0x,c0y,b0x,b0y);
    SKC_PROJECT(tv,c1x,c1y,t1x,t1y);
    SKC_PROJECT(tv,c2x,c2y,t2x,t2y);
  }

  b0x = round(b0x);
  b0y = round(b0y);

  t2x = round(t2x);
  t2y = round(t2y);

  //
  // Estimate how many line segments are in the quad/cubic curve.
  //
  // Wang's Formula will return zero if the control points are
  // collinear but we bump it up to 1.0f.
  //
  SKC_RASTERIZE_FLOAT const s_segs  = skc_wangs_formula_quadratic(b0x,b0y,t1x,t1y,t2x,t2y);

  //
  // if there are free registers then precalculate the reciprocal of
  // each lane's estimated segment count since it will never change
  //
  SKC_RASTERIZE_FLOAT const s_denom = native_recip(s_segs);

  //
  // inclusive add scan of estimated line segments
  // exclusive add scan of estimated line segments
  // total number of estimated line segments
  //
  SKC_RASTERIZE_FLOAT s_iss = skc_subgroup_scan_inclusive_add_float(s_segs);
  SKC_RASTERIZE_FLOAT s_ess = s_iss - s_segs;
  float               s_rem = skc_subgroup_last_float(s_iss); // scalar

  //
  // Precompute quadratic polynomial coefficients from the control
  // cage so we can shuffle them in on each iteration of the outer
  // loop and then evaluate the polynomial in Horner form.
  //
  //                           |  1   0   0 | | c0 |
  //                           |            | |    |
  //   B(t) = [ 1  t^1  t^2 ]  | -2   2   0 | | c1 |
  //                           |            | |    |
  //                           |  1  -2   1 | | c2 |
  //
  SKC_RASTERIZE_FLOAT const b1x = mad(-2.0f,b0x,2.0f*t1x); // 2 - 1 MAD + MUL
  SKC_RASTERIZE_FLOAT const b1y = mad(-2.0f,b0y,2.0f*t1y); // 2 - 1 MAD + MUL

  SKC_RASTERIZE_FLOAT const b2x = mad(-2.0f,t1x,b0x+t2x);  // 2 - 1 MAD + ADD
  SKC_RASTERIZE_FLOAT const b2y = mad(-2.0f,t1y,b0y+t2y);  // 2 - 1 MAD + ADD

  //
  // these values don't matter on the first iteration
  //
  SKC_RASTERIZE_FLOAT l1x_prev = 0;
  SKC_RASTERIZE_FLOAT l1y_prev = 0;

  //
  // allocate and init in-register TTSK keys
  //
  skc_uint     sk_v_next = 0;
  skc_ttsk_v_t sk_v;

  sk_v.hi = cohort;

  //
  // initialize smem
  //
  skc_smem_init(smem);

  //
  // initialize blocks / subblocks
  //
  skc_block_id_v_t blocks;
  skc_uint         blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;

#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
  skc_block_id_t   subblocks = 0;
#endif

  //
  // loop until done
  //
  while (s_rem > 0)
    {
      //
      // distribute work across lanes
      //
      SKC_RASTERIZE_UINT const s_source = skc_scatter_scan_max(smem,s_iss,s_ess);

      //
      // every lane has a fraction to work off of
      //
      // FIXME -- this gets expanded on SIMD
      //
      // if delta == 1     then this is the first segment
      // if delta == count then this is the last segment
      //
      SKC_RASTERIZE_FLOAT     const s_delta    = skc_delta_offset() - skc_subgroup_shuffle(s_ess,s_source);
      SKC_RASTERIZE_FLOAT     const s_count    = skc_subgroup_shuffle(s_segs,s_source);

      SKC_RASTERIZE_PREDICATE const is_s_first = (s_delta == 1.0f);
      SKC_RASTERIZE_PREDICATE const is_s_last  = (s_delta >= s_count);

      //
      // init parametric t
      //
      SKC_RASTERIZE_FLOAT s_t = s_delta * skc_subgroup_shuffle(s_denom,s_source); // faster than native_recip(s_count)?

      //
      // if last then override to a hard 1.0f
      //
      s_t = is_s_last ? 1.0f : s_t;
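      //
      // Worked example (assuming SUBGROUP_SIZE = 8): if lane 0
      // estimated 3 segments and lane 1 estimated 2, the scatter-scan
      // maps lanes 0..2 to s_source = 0 (s_delta = 1,2,3) and lanes
      // 3..4 to s_source = 1 (s_delta = 1,2).  Each lane then samples
      // its source curve at t = s_delta / s_count, with the last
      // segment snapped to exactly t = 1.0f so adjacent curve
      // endpoints stay watertight.
      //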
      //
      // decrement by subgroup size
      //
      s_iss -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
      s_ess -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;
      s_rem -= SKC_RASTERIZE_ELEMS_PER_SUBGROUP;

      //
      // now every lane knows what to do and the following lines will
      // pump out up to SUBGROUP_SIZE line segments
      //
      // obtain the src vertices through shared memory or via a shuffle
      //

      //
      // shuffle in the polynomial coefficients from their source lane
      //
      SKC_RASTERIZE_FLOAT const s0x = skc_subgroup_shuffle(b0x,s_source);
      SKC_RASTERIZE_FLOAT const s0y = skc_subgroup_shuffle(b0y,s_source);

      SKC_RASTERIZE_FLOAT const s1x = skc_subgroup_shuffle(b1x,s_source);
      SKC_RASTERIZE_FLOAT const s1y = skc_subgroup_shuffle(b1y,s_source);

      SKC_RASTERIZE_FLOAT const s2x = skc_subgroup_shuffle(b2x,s_source);
      SKC_RASTERIZE_FLOAT const s2y = skc_subgroup_shuffle(b2y,s_source);

      //
      // calculate "right" line segment endpoint using Horner form
      //
      SKC_RASTERIZE_FLOAT l1x = round(mad(mad(s2x,s_t,s1x),s_t,s0x)); // 2 MAD + ROUND
      SKC_RASTERIZE_FLOAT l1y = round(mad(mad(s2y,s_t,s1y),s_t,s0y)); // 2 MAD + ROUND

      //
      // shuffle up "left" line segment endpoint
      //
      // NOTE: Intel's shuffle_up is unique with its elegant
      // "previous" argument so don't get used to it
      //
      SKC_RASTERIZE_FLOAT l0x = skc_subgroup_shuffle_up_1(l1x_prev,l1x);
      SKC_RASTERIZE_FLOAT l0y = skc_subgroup_shuffle_up_1(l1y_prev,l1y);

      //
      // save previous right endpoint
      //
      l1x_prev = l1x;
      l1y_prev = l1y;

      //
      // override shuffle up if this is the first line segment
      //
      l0x = select(l0x,s0x,is_s_first);
      l0y = select(l0y,s0y,is_s_first);

      //
      // sliver lines
      //
      skc_sliver(bp_atomics,
                 bp_elems,
                 bp_ids,
                 bp_mask,
                 cohort_atomics,
                 &subblocks,
                 &blocks,
                 &blocks_next,
                 &sk_v,
                 &sk_v_next,
                 sk_extent,
                 smem,
                 l0x,l0y,l1x,l1y);
    }

  //
  // - flush work-in-progress blocks
  // - return unused block ids
  //
  skc_finalize(bp_atomics,
               bp_elems,
               bp_ids,
               bp_mask,
               cohort_atomics,
               &blocks,
               blocks_next,
               &sk_v,
               sk_v_next,
               sk_extent,
               smem);
}

//
// RASTERIZE LINE KERNEL
//

static
void
skc_rasterize_lines(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
                    __global union skc_bp_elem                * const bp_elems,
                    __global uint                             * const bp_ids,
                    skc_uint                                    const bp_mask,

                    __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
                    __global skc_ttsk_s_t                     * const sk_extent,

                    __local struct skc_subgroup_smem volatile * const smem,

                    skc_uint                                  * const nodeword,
                    skc_block_id_t                            * const id,

                    union skc_transform              const    * const tv,
                    union skc_path_clip              const    * const cv,
                    skc_uint                                    const cohort)
{
  //
  // the initial segment idx and segments-per-block constant determine
  // how many block ids will need to be loaded
  //
  SKC_RASTERIZE_FLOAT const c0x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c0y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c1x = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

  skc_segment_next(bp_elems,nodeword,id);

  SKC_RASTERIZE_FLOAT const c1y = bp_elems[SKC_RASTERIZE_SEGMENT(*id)].coord;

#if 0
  printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",c0x,c0y,c1x,c1y);
#endif
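  //
  // NOTE: a scalar sketch of the access pattern above, assuming
  // skc_segment_next() simply advances the (nodeword,id) cursor to
  // the next subblock in the path's block list:
  //
  //   float coord[4]; // x0, y0, x1, y1
  //
  //   for (uint w = 0; w < 4; w++) {
  //     coord[w] = bp_elems[id * SKC_DEVICE_SUBBLOCK_WORDS + skc_subgroup_lane()].coord;
  //     if (w < 3)
  //       skc_segment_next(bp_elems,&nodeword,&id);
  //   }
  //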
  //
  // apply transform
  //
  // note that we only care if the end points are rounded to subpixel precision
  //
  // FIXME -- transformation is currently affine-only
  // FIXME -- support perspective later
  //
  // the affine transformation requires 8 FMA + 4 ROUND operations
  //
  SKC_RASTERIZE_FLOAT l0x = c0x * tv->sx  + c0y * tv->shx + tv->tx;
  SKC_RASTERIZE_FLOAT l0y = c0x * tv->shy + c0y * tv->sy  + tv->ty;

  SKC_RASTERIZE_FLOAT l1x = c1x * tv->sx  + c1y * tv->shx + tv->tx;
  SKC_RASTERIZE_FLOAT l1y = c1x * tv->shy + c1y * tv->sy  + tv->ty;

  //
  // FIXME -- this is temporary support for projection
  //
  bool const is_affine = (tv->w0 == 0.0f) && (tv->w1 == 0.0f);

  if (!is_affine) {
    SKC_PROJECT(tv,c0x,c0y,l0x,l0y);
    SKC_PROJECT(tv,c1x,c1y,l1x,l1y);
  }

  l0x = round(l0x);
  l0y = round(l0y);

  l1x = round(l1x);
  l1y = round(l1y);

#if 0
  printf("{ { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",l0x,l0y,l1x,l1y);
#endif

  //
  // allocate and init in-register TTSK keys
  //
  skc_uint     sk_v_next = 0;
  skc_ttsk_v_t sk_v;

  sk_v.hi = cohort;

  //
  // initialize smem
  //
  skc_smem_init(smem);

  //
  // initialize blocks / subblocks
  //
  skc_block_id_v_t blocks;
  skc_uint         blocks_next = SKC_RASTERIZE_BLOCK_ID_V_SIZE;

#if SKC_DEVICE_BLOCK_WORDS_LOG2 > SKC_DEVICE_SUBBLOCK_WORDS_LOG2
  skc_block_id_t   subblocks = 0;
#endif

  //
  // sliver lines
  //
  skc_sliver(bp_atomics,
             bp_elems,
             bp_ids,
             bp_mask,
             cohort_atomics,
             &subblocks,
             &blocks,
             &blocks_next,
             &sk_v,
             &sk_v_next,
             sk_extent,
             smem,
             l0x,l0y,l1x,l1y);

  //
  // - flush work-in-progress blocks
  // - return unused block ids
  //
  skc_finalize(bp_atomics,
               bp_elems,
               bp_ids,
               bp_mask,
               cohort_atomics,
               &blocks,
               blocks_next,
               &sk_v,
               sk_v_next,
               sk_extent,
               smem);
}

//
//
//

__kernel
SKC_RASTERIZE_KERNEL_ATTRIBS
void
skc_kernel_rasterize_all(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
                         __global union skc_bp_elem                * const bp_elems,
                         __global uint                             * const bp_ids,
                         skc_uint                                    const bp_mask,

                         __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
                         __global skc_ttsk_s_t                     * const sk_extent,

                         __global float8                  const    * const transforms, // FIXME -- __constant
                         __global float4                  const    * const clips,      // FIXME -- __constant
                         __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
                         skc_uint                                    const count)
{
  //
  // declare shared memory block
  //
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
  __local struct skc_subgroup_smem volatile         smem[1];
#else
  __local struct skc_subgroup_smem volatile         smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
  __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
#endif

  //
  // this is a subgroup/warp-centric kernel
  //
  // which subgroup in the grid is this?
  //
  // TAKE NOTE: the Intel GEN compiler appears to recognize
  // get_group_id(0) as a uniform but the alternative calculation used
  // when there are multiple subgroups per workgroup is not
  // cooperating and is driving spillage elsewhere.
  //
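  //
  // For example, assuming SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 4,
  // workgroup 2 / subgroup 1 computes cmd_idx = 2 * 4 + 1 = 9; with
  // one subgroup per workgroup the group id is the command index
  // directly.
  //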
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
  uint const cmd_idx = get_group_id(0);
#else
  uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif

#if 0
  if (get_sub_group_local_id() == 0)
    printf("+cmd_idx = %u\n",cmd_idx);
#endif

  //
  // if workgroups are multi-subgroup then there may be excess
  // subgroups in the final workgroup
  //
  if (cmd_idx >= count)
    return;

#if 0
  if (get_sub_group_local_id() == 0)
    printf("-cmd_idx = %u\n",cmd_idx);
#endif

  //
  // load a single command for this subgroup
  //
  union skc_cmd_rasterize const cmd = cmds[cmd_idx];

#if 0
  if (get_sub_group_local_id() == 0)
    printf("[ %u ]< %u, %u, %u, %u >\n",
           cmd_idx,
           cmd.nodeword,
           SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd),
           SKC_CMD_RASTERIZE_GET_CLIP(cmd),
           SKC_CMD_RASTERIZE_GET_COHORT(cmd));
#endif

  //
  // get first block node command word and its subblock
  //
  skc_uint              nodeword = cmd.nodeword; // nodeword has word-addressing
  skc_tagged_block_id_t tag_id   = bp_elems[nodeword].tag_id;
  skc_block_id_tag      tag      = SKC_TAGGED_BLOCK_ID_GET_TAG(tag_id);
  skc_block_id_t        id       = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);

  //
  // load transform -- uniform across subgroup
  //
  // v8: { sx shx tx shy sy ty w0 w1 }
  //
  // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
  //
  //   [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
  //
  // Coordinates are scaled to subpixel resolution.  All that matters
  // is that continuity is maintained between path element endpoints.
  //
  // It's the responsibility of the host to ensure that the transforms
  // are properly scaled, either via initializing a transform stack
  // with the subpixel-resolution-scaled identity or scaling the
  // transform before it's loaded by a rasterization grid.
  //
  // FIXME -- horizontal load might be better than this broadcast load
  //
  union skc_transform const tv     = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
  union skc_path_clip const cv     = { .f32v4 = clips     [SKC_CMD_RASTERIZE_GET_CLIP(cmd)     ] }; // uniform load
  skc_uint            const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd);                            // shifted

  switch (tag)
    {
    case SKC_BLOCK_ID_TAG_PATH_LINE:
      skc_rasterize_lines(bp_atomics,
                          bp_elems,
                          bp_ids,
                          bp_mask,
                          cohort_atomics,
                          sk_extent,
                          smem,
                          &nodeword,&id,
                          &tv,&cv,cohort);
      break;

    case SKC_BLOCK_ID_TAG_PATH_QUAD:
      skc_rasterize_quads(bp_atomics,
                          bp_elems,
                          bp_ids,
                          bp_mask,
                          cohort_atomics,
                          sk_extent,
                          smem,
                          &nodeword,&id,
                          &tv,&cv,cohort);
      break;

    case SKC_BLOCK_ID_TAG_PATH_CUBIC:
      skc_rasterize_cubics(bp_atomics,
                           bp_elems,
                           bp_ids,
                           bp_mask,
                           cohort_atomics,
                           sk_extent,
                           smem,
                           &nodeword,&id,
                           &tv,&cv,cohort);
      break;

    case SKC_BLOCK_ID_TAG_PATH_RAT_QUAD:
      break;
    case SKC_BLOCK_ID_TAG_PATH_RAT_CUBIC:
      break;

    default:
      break;
    }
}

//
//
//

__kernel
SKC_RASTERIZE_KERNEL_ATTRIBS
void
skc_kernel_rasterize_lines(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
                           __global union skc_bp_elem                * const bp_elems,
                           __global uint                             * const bp_ids,
                           skc_uint                                    const bp_mask,

                           __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
                           __global skc_ttsk_s_t                     * const sk_extent,

                           __global float8                  const    * const transforms, // FIXME -- __constant
                           __global float4                  const    * const clips,      // FIXME -- __constant
                           __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
                           skc_uint                                    const count)
{
  //
  // declare shared memory block
  //
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
  __local struct skc_subgroup_smem volatile         smem[1];
#else
  __local struct skc_subgroup_smem volatile         smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
  __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
#endif
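  //
  // NOTE: this kernel is the line-only specialization of
  // skc_kernel_rasterize_all -- every command in cmds[] is assumed to
  // reference a line path, so the block's tag is never inspected.
  //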
  //
  // this is a subgroup/warp-centric kernel
  //
  // which subgroup in the grid is this?
  //
  // TAKE NOTE: the Intel GEN compiler appears to recognize
  // get_group_id(0) as a uniform but the alternative calculation used
  // when there are multiple subgroups per workgroup is not
  // cooperating and is driving spillage elsewhere.
  //
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
  uint const cmd_idx = get_group_id(0);
#else
  uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif

  //
  // if workgroups are multi-subgroup then there may be excess
  // subgroups in the final workgroup
  //
  if (cmd_idx >= count)
    return;

#if 0
  if (get_sub_group_local_id() == 0)
    printf("cmd_idx = %u\n",cmd_idx);
#endif

  //
  // load a single command for this subgroup
  //
  union skc_cmd_rasterize const cmd = cmds[cmd_idx];

  //
  // get first block node command word and its subblock
  //
  skc_uint              nodeword = cmd.nodeword; // nodeword has word-addressing
  skc_tagged_block_id_t tag_id   = bp_elems[nodeword].tag_id;
  skc_block_id_t        id       = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);

  //
  // load transform -- uniform across subgroup
  //
  // v8: { sx shx tx shy sy ty w0 w1 }
  //
  // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
  //
  //   [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
  //
  // Coordinates are scaled to subpixel resolution.  All that matters
  // is that continuity is maintained between path element endpoints.
  //
  // It's the responsibility of the host to ensure that the transforms
  // are properly scaled, either via initializing a transform stack
  // with the subpixel-resolution-scaled identity or scaling the
  // transform before it's loaded by a rasterization grid.
  //
  // FIXME -- horizontal load might be better than this broadcast load
  //
  union skc_transform const tv     = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
  union skc_path_clip const cv     = { .f32v4 = clips     [SKC_CMD_RASTERIZE_GET_CLIP(cmd)     ] }; // uniform load
  skc_uint            const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd);                            // shifted

  skc_rasterize_lines(bp_atomics,
                      bp_elems,
                      bp_ids,
                      bp_mask,
                      cohort_atomics,
                      sk_extent,
                      smem,
                      &nodeword,&id,
                      &tv,&cv,cohort);
}

//
//
//

__kernel
SKC_RASTERIZE_KERNEL_ATTRIBS
void
skc_kernel_rasterize_quads(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
                           __global union skc_bp_elem                * const bp_elems,
                           __global uint                             * const bp_ids,
                           skc_uint                                    const bp_mask,

                           __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
                           __global skc_ttsk_s_t                     * const sk_extent,

                           __global float8                  const    * const transforms, // FIXME -- __constant
                           __global float4                  const    * const clips,      // FIXME -- __constant
                           __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
                           skc_uint                                    const count)
{
  //
  // declare shared memory block
  //
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
  __local struct skc_subgroup_smem volatile         smem[1];
#else
  __local struct skc_subgroup_smem volatile         smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
  __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
#endif

  //
  // this is a subgroup/warp-centric kernel
  //
  // which subgroup in the grid is this?
  //
  // TAKE NOTE: the Intel GEN compiler appears to recognize
  // get_group_id(0) as a uniform but the alternative calculation used
  // when there are multiple subgroups per workgroup is not
  // cooperating and is driving spillage elsewhere.
  //
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
  uint const cmd_idx = get_group_id(0);
#else
  uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif

  //
  // if workgroups are multi-subgroup then there may be excess
  // subgroups in the final workgroup
  //
  if (cmd_idx >= count)
    return;

#if 0
  if (get_sub_group_local_id() == 0)
    printf("cmd_idx = %u\n",cmd_idx);
#endif

  //
  // load a single command for this subgroup
  //
  union skc_cmd_rasterize const cmd = cmds[cmd_idx];

  //
  // get first block node command word and its subblock
  //
  skc_uint              nodeword = cmd.nodeword; // nodeword has word-addressing
  skc_tagged_block_id_t tag_id   = bp_elems[nodeword].tag_id;
  skc_block_id_t        id       = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);

  //
  // load transform -- uniform across subgroup
  //
  // v8: { sx shx tx shy sy ty w0 w1 }
  //
  // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
  //
  //   [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
  //
  // Coordinates are scaled to subpixel resolution.  All that matters
  // is that continuity is maintained between path element endpoints.
  //
  // It's the responsibility of the host to ensure that the transforms
  // are properly scaled, either via initializing a transform stack
  // with the subpixel-resolution-scaled identity or scaling the
  // transform before it's loaded by a rasterization grid.
  //
  // FIXME -- horizontal load might be better than this broadcast load
  //
  union skc_transform const tv     = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
  union skc_path_clip const cv     = { .f32v4 = clips     [SKC_CMD_RASTERIZE_GET_CLIP(cmd)     ] }; // uniform load
  skc_uint            const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd);                            // shifted

  skc_rasterize_quads(bp_atomics,
                      bp_elems,
                      bp_ids,
                      bp_mask,
                      cohort_atomics,
                      sk_extent,
                      smem,
                      &nodeword,&id,
                      &tv,&cv,cohort);
}

//
//
//

__kernel
SKC_RASTERIZE_KERNEL_ATTRIBS
void
skc_kernel_rasterize_cubics(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
                            __global union skc_bp_elem                * const bp_elems,
                            __global uint                             * const bp_ids,
                            skc_uint                                    const bp_mask,

                            __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
                            __global skc_ttsk_s_t                     * const sk_extent,

                            __global float8                  const    * const transforms, // FIXME -- __constant
                            __global float4                  const    * const clips,      // FIXME -- __constant
                            __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
                            skc_uint                                    const count)
{
  //
  // declare shared memory block
  //
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
  __local struct skc_subgroup_smem volatile         smem[1];
#else
  __local struct skc_subgroup_smem volatile         smem_wg[SKC_RASTERIZE_WORKGROUP_SUBGROUPS];
  __local struct skc_subgroup_smem volatile * const smem = smem_wg + get_sub_group_id();
#endif

  //
  // this is a subgroup/warp-centric kernel
  //
  // which subgroup in the grid is this?
  //
  // TAKE NOTE: the Intel GEN compiler appears to recognize
  // get_group_id(0) as a uniform but the alternative calculation used
  // when there are multiple subgroups per workgroup is not
  // cooperating and is driving spillage elsewhere.
  //
#if ( SKC_RASTERIZE_WORKGROUP_SUBGROUPS == 1 )
  uint const cmd_idx = get_group_id(0);
#else
  uint const cmd_idx = get_group_id(0) * SKC_RASTERIZE_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif

  //
  // if workgroups are multi-subgroup then there may be excess
  // subgroups in the final workgroup
  //
  if (cmd_idx >= count)
    return;

#if 0
  if (get_sub_group_local_id() == 0)
    printf("cmd_idx = %u\n",cmd_idx);
#endif

  //
  // load a single command for this subgroup
  //
  union skc_cmd_rasterize const cmd = cmds[cmd_idx];

  //
  // get first block node command word and its subblock
  //
  skc_uint              nodeword = cmd.nodeword; // nodeword has word-addressing
  skc_tagged_block_id_t tag_id   = bp_elems[nodeword].tag_id;
  skc_block_id_t        id       = SKC_TAGGED_BLOCK_ID_GET_ID(tag_id);

  //
  // load transform -- uniform across subgroup
  //
  // v8: { sx shx tx shy sy ty w0 w1 }
  //
  // NOTE THAT WE'RE SCALING UP THE TRANSFORM BY:
  //
  //   [ SKC_SUBPIXEL_RESL_X_F32, SKC_SUBPIXEL_RESL_Y_F32, 1.0f ]
  //
  // Coordinates are scaled to subpixel resolution.  All that matters
  // is that continuity is maintained between path element endpoints.
  //
  // It's the responsibility of the host to ensure that the transforms
  // are properly scaled, either via initializing a transform stack
  // with the subpixel-resolution-scaled identity or scaling the
  // transform before it's loaded by a rasterization grid.
  //
  // FIXME -- horizontal load might be better than this broadcast load
  //
  union skc_transform const tv     = { .f32v8 = transforms[SKC_CMD_RASTERIZE_GET_TRANSFORM(cmd)] }; // uniform load
  union skc_path_clip const cv     = { .f32v4 = clips     [SKC_CMD_RASTERIZE_GET_CLIP(cmd)     ] }; // uniform load
  skc_uint            const cohort = SKC_CMD_RASTERIZE_MASK_COHORT(cmd);                            // shifted

  skc_rasterize_cubics(bp_atomics,
                       bp_elems,
                       bp_ids,
                       bp_mask,
                       cohort_atomics,
                       sk_extent,
                       smem,
                       &nodeword,&id,
                       &tv,&cv,cohort);
}

//
//
//

__kernel
SKC_RASTERIZE_KERNEL_ATTRIBS
void
skc_kernel_rasterize_rat_quads(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
                               __global union skc_bp_elem                * const bp_elems,
                               __global uint                             * const bp_ids,
                               skc_uint                                    const bp_mask,

                               __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
                               __global skc_ttsk_s_t                     * const sk_extent,

                               __global float8                  const    * const transforms, // FIXME -- __constant
                               __global float4                  const    * const clips,      // FIXME -- __constant
                               __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
                               skc_uint                                    const count)
{
  // FIXME -- rational quad rasterization is not implemented yet
  ;
}

//
//
//

__kernel
SKC_RASTERIZE_KERNEL_ATTRIBS
void
skc_kernel_rasterize_rat_cubics(__global SKC_ATOMIC_UINT         volatile * const bp_atomics,
                                __global union skc_bp_elem                * const bp_elems,
                                __global uint                             * const bp_ids,
                                skc_uint                                    const bp_mask,

                                __global SKC_ATOMIC_UINT         volatile * const cohort_atomics,
                                __global skc_ttsk_s_t                     * const sk_extent,

                                __global float8                  const    * const transforms, // FIXME -- __constant
                                __global float4                  const    * const clips,      // FIXME -- __constant
                                __global union skc_cmd_rasterize const    * const cmds,       // FIXME -- __constant
                                skc_uint                                    const count)
{
  // FIXME -- rational cubic rasterization is not implemented yet
  ;
}

//
//
//
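//
// HOST-SIDE DISPATCH SKETCH (illustrative only -- not part of this
// translation unit, and it assumes the SKC_RASTERIZE_* constants are
// also visible to the host).  Each command is processed by one
// subgroup, so the NDRange is derived from the command count; the
// cmd_idx guard in each kernel absorbs the round-up when workgroups
// hold multiple subgroups.  The names 'cq' and 'rasterize_all_kernel'
// are hypothetical:
//
//   size_t const wgs    = (count + SKC_RASTERIZE_WORKGROUP_SUBGROUPS - 1) /
//                         SKC_RASTERIZE_WORKGROUP_SUBGROUPS;
//   size_t const local  = SKC_RASTERIZE_SUBGROUP_SIZE * SKC_RASTERIZE_WORKGROUP_SUBGROUPS;
//   size_t const global = wgs * local;
//
//   clEnqueueNDRangeKernel(cq,rasterize_all_kernel,1,NULL,&global,&local,0,NULL,NULL);
//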