1 /*
2  * Copyright 2017 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can
5  * be found in the LICENSE file.
6  *
7  */
8 
9 #ifndef SKC_ONCE_DEVICE_CL_12_H
10 #define SKC_ONCE_DEVICE_CL_12_H
11 
12 //
13 // FIXME -- THERE ARE SOME DUPLICATED TYPEDEFS IN THIS FILE
14 //
15 // THESE WILL GO AWAY AS THE TYPING GET POLISHED AND SIMPLIFIED
16 //
17 
18 #include "block.h"
19 
20 //
21 // HOW TO SELECT A SUBBLOCK AND BLOCK SIZES:
22 //
23 // 1) The subblock size should match the natural SIMT/SIMD width of
24 //    the target device.
25 //
26 // 2) Either a square or rectangular (1:2) tile size is chosen.  The
27 //    tile size is usually determined by the amount of SMEM available
28 //    to a render kernel subgroup and desired multiprocessor
29 //    occupancy.
30 //
31 // 3) If the tile is rectangular then the block size must be at least
32 //    twice the size of the subblock size.
33 //
34 // 4) A large block size can decrease allocation overhead but there
35 //    will be diminishing returns as the block size increases.
36 //
37 
38 #define SKC_DEVICE_BLOCK_WORDS_LOG2             6  // CHANGE "WORDS" TO "SIZE" ?
39 #define SKC_DEVICE_SUBBLOCK_WORDS_LOG2          3
40 
41 #define SKC_TILE_WIDTH_LOG2                     SKC_DEVICE_SUBBLOCK_WORDS_LOG2
42 #define SKC_TILE_HEIGHT_LOG2                    (SKC_DEVICE_SUBBLOCK_WORDS_LOG2 + 1)
43 
44 /////////////////////////////////////////////////////////////////
45 //
46 // BLOCK POOL INIT
47 //
48 
49 #define SKC_BP_INIT_IDS_KERNEL_ATTRIBS
50 #define SKC_BP_INIT_ATOMICS_KERNEL_ATTRIBS      __attribute__((reqd_work_group_size(2,1,1)))
51 
52 /////////////////////////////////////////////////////////////////
53 //
54 // PATHS ALLOC
55 //
56 
57 #define SKC_PATHS_ALLOC_KERNEL_ATTRIBS          __attribute__((reqd_work_group_size(1,1,1)))
58 
59 /////////////////////////////////////////////////////////////////
60 //
61 // PATHS COPY
62 //
63 
64 #define SKC_PATHS_COPY_SUBGROUP_SIZE_LOG2       SKC_DEVICE_SUBBLOCK_WORDS_LOG2 // FIXME -- SUBGROUP OR THREADS PER BLOCK?
65 #define SKC_PATHS_COPY_ELEM_WORDS               1
66 #define SKC_PATHS_COPY_ELEM_EXPAND()            SKC_EXPAND_1()
67 
68 #define SKC_PATHS_COPY_KERNEL_ATTRIBS           __attribute__((intel_reqd_sub_group_size(SKC_PATHS_COPY_SUBGROUP_SIZE)))
69 
70 #define SKC_IS_NOT_PATH_HEAD(sg,I)              ((sg) + get_sub_group_local_id() >= SKC_PATH_HEAD_WORDS)
71 
72 typedef skc_uint  skc_paths_copy_elem;
73 typedef skc_uint  skc_pb_idx_v;
74 
75 /////////////////////////////////////////////////////////////////
76 //
77 // FILLS EXPAND
78 //
79 
80 #define SKC_FILLS_EXPAND_SUBGROUP_SIZE_LOG2     SKC_DEVICE_SUBBLOCK_WORDS_LOG2
81 #define SKC_FILLS_EXPAND_ELEM_WORDS             1
82 
83 #define SKC_FILLS_EXPAND_KERNEL_ATTRIBS         __attribute__((intel_reqd_sub_group_size(SKC_FILLS_EXPAND_SUBGROUP_SIZE)))
84 
85 /////////////////////////////////////////////////////////////////
86 //
87 // RASTER ALLOC
88 //
89 // NOTE -- Intel subgroup shuffles aren't supported in SIMD32 which is
90 // why use of the subgroup broadcast produces a compiler error. So a
91 // subgroup of size 16 is this widest we can require.
92 //
93 
94 #define SKC_RASTERS_ALLOC_GROUP_SIZE            16
95 
96 #if (SKC_RASTERS_ALLOC_GROUP_SIZE <= 16)
97 
98 #define SKC_RASTERS_ALLOC_KERNEL_ATTRIBS        __attribute__((intel_reqd_sub_group_size(SKC_RASTERS_ALLOC_GROUP_SIZE)))
99 #define SKC_RASTERS_ALLOC_LOCAL_ID()            get_sub_group_local_id()
100 #define SKC_RASTERS_ALLOC_INCLUSIVE_ADD(v)      sub_group_scan_inclusive_add(v)
101 #define SKC_RASTERS_ALLOC_BROADCAST(v,i)        sub_group_broadcast(v,i)
102 
103 #else
104 
105 #define SKC_RASTERS_ALLOC_KERNEL_ATTRIBS        __attribute__((reqd_work_group_size(SKC_RASTERS_ALLOC_GROUP_SIZE,1,1)))
106 #define SKC_RASTERS_ALLOC_LOCAL_ID()            get_local_id(0)
107 #define SKC_RASTERS_ALLOC_INCLUSIVE_ADD(v)      work_group_scan_inclusive_add(v)
108 #define SKC_RASTERS_ALLOC_BROADCAST(v,i)        work_group_broadcast(v,i)
109 
110 #endif
111 
112 /////////////////////////////////////////////////////////////////
113 //
114 // RASTERIZE
115 //
116 
117 #define SKC_RASTERIZE_SUBGROUP_SIZE             SKC_DEVICE_SUBBLOCK_WORDS
118 #define SKC_RASTERIZE_VECTOR_SIZE_LOG2          0
119 #define SKC_RASTERIZE_WORKGROUP_SUBGROUPS       1
120 
121 #define SKC_RASTERIZE_KERNEL_ATTRIBS                                    \
122   __attribute__((intel_reqd_sub_group_size(SKC_RASTERIZE_SUBGROUP_SIZE))) \
123   __attribute__((reqd_work_group_size(SKC_RASTERIZE_SUBGROUP_SIZE * SKC_RASTERIZE_WORKGROUP_SUBGROUPS, 1, 1)))
124 
125 #define SKC_RASTERIZE_FLOAT                     float
126 #define SKC_RASTERIZE_UINT                      uint
127 #define SKC_RASTERIZE_INT                       int
128 #define SKC_RASTERIZE_PREDICATE                 bool
129 #define SKC_RASTERIZE_POOL                      uint
130 
131 #define SKC_RASTERIZE_TILE_HASH_X_BITS          1
132 #define SKC_RASTERIZE_TILE_HASH_Y_BITS          2
133 
134 typedef skc_block_id_t skc_block_id_v_t;
135 typedef skc_uint2      skc_ttsk_v_t;
136 typedef skc_uint2      skc_ttsk_s_t;
137 
138 // SKC_STATIC_ASSERT(SKC_RASTERIZE_POOL_SIZE > SKC_RASTERIZE_SUBGROUP_SIZE);
139 
140 /////////////////////////////////////////////////////////////////
141 //
142 // PREFIX
143 //
144 
145 #define SKC_PREFIX_SUBGROUP_SIZE               8 // for now this had better be SKC_DEVICE_SUBBLOCK_WORDS
146 #define SKC_PREFIX_WORKGROUP_SUBGROUPS         1
147 
148 #define SKC_PREFIX_KERNEL_ATTRIBS                                       \
149   __attribute__((intel_reqd_sub_group_size(SKC_PREFIX_SUBGROUP_SIZE)))  \
150   __attribute__((reqd_work_group_size(SKC_PREFIX_SUBGROUP_SIZE * SKC_PREFIX_WORKGROUP_SUBGROUPS, 1, 1)))
151 
152 #define SKC_PREFIX_TTP_V                       skc_uint2
153 #define SKC_PREFIX_TTS_V_BITFIELD              skc_int
154 
155 #define SKC_PREFIX_TTS_VECTOR_INT_EXPAND       SKC_EXPAND_1
156 
157 #define SKC_PREFIX_SMEM_ZERO                   ulong
158 #define SKC_PREFIX_SMEM_ZERO_WIDTH             (sizeof(SKC_PREFIX_SMEM_ZERO) / sizeof(skc_ttp_t))
159 #define SKC_PREFIX_SMEM_COUNT_BLOCK_ID         8
160 
161 #define SKC_PREFIX_BLOCK_ID_V_SIZE             SKC_PREFIX_SUBGROUP_SIZE
162 
163 #define SKC_PREFIX_TTXK_V_SIZE                 SKC_PREFIX_SUBGROUP_SIZE
164 #define SKC_PREFIX_TTXK_V_MASK                 (SKC_PREFIX_TTXK_V_SIZE - 1)
165 
166 typedef skc_uint       skc_bp_elem_t;
167 
168 typedef skc_uint2      skc_ttrk_e_t;
169 typedef skc_uint2      skc_ttsk_v_t;
170 typedef skc_uint2      skc_ttsk_s_t;
171 typedef skc_uint2      skc_ttpk_s_t;
172 typedef skc_uint2      skc_ttxk_v_t;
173 
174 typedef skc_int        skc_tts_v_t;
175 
176 typedef skc_int        skc_ttp_t;
177 
178 typedef skc_uint       skc_raster_yx_s;
179 
180 typedef skc_block_id_t skc_block_id_v_t;
181 typedef skc_block_id_t skc_block_id_s_t;
182 
183 /////////////////////////////////////////////////////////////////
184 //
185 // PLACE
186 //
187 
188 #define SKC_PLACE_SUBGROUP_SIZE                16
189 #define SKC_PLACE_WORKGROUP_SUBGROUPS          1
190 
191 #define SKC_PLACE_KERNEL_ATTRIBS                                       \
192   __attribute__((intel_reqd_sub_group_size(SKC_PLACE_SUBGROUP_SIZE)))  \
193   __attribute__((reqd_work_group_size(SKC_PLACE_SUBGROUP_SIZE * SKC_PLACE_WORKGROUP_SUBGROUPS, 1, 1)))
194 
195 typedef skc_uint  skc_bp_elem_t;
196 
197 typedef skc_uint  skc_ttsk_lo_t;
198 typedef skc_uint  skc_ttsk_hi_t;
199 
200 typedef skc_uint  skc_ttpk_lo_t;
201 typedef skc_uint  skc_ttpk_hi_t;
202 
203 typedef skc_uint  skc_ttxk_lo_t;
204 typedef skc_uint  skc_ttxk_hi_t;
205 
206 typedef skc_uint2 skc_ttck_t;
207 
208 typedef skc_bool  skc_pred_v_t;
209 typedef skc_int   skc_int_v_t;
210 
211 /////////////////////////////////////////////////////////////////
212 //
213 // RENDER
214 //
215 
216 #define SKC_ARCH_GEN9
217 
218 #if defined(__OPENCL_C_VERSION__)
219 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
220 #endif
221 
222 #define SKC_RENDER_SUBGROUP_SIZE               8
223 #define SKC_RENDER_WORKGROUP_SUBGROUPS         1
224 
225 #define SKC_RENDER_KERNEL_ATTRIBS                                       \
226   __attribute__((intel_reqd_sub_group_size(SKC_RENDER_SUBGROUP_SIZE)))  \
227   __attribute__((reqd_work_group_size(SKC_RENDER_SUBGROUP_SIZE * SKC_RENDER_WORKGROUP_SUBGROUPS, 1, 1)))
228 
229 #define SKC_RENDER_SCANLINE_VECTOR_SIZE        2
230 
231 #define SKC_RENDER_REGS_COLOR_R                2
232 #define SKC_RENDER_REGS_COVER_R                3
233 
234 #define SKC_RENDER_TTSB_EXPAND()               SKC_EXPAND_1()
235 
236 #define SKC_RENDER_TTS_V                       skc_int
237 #define SKC_RENDER_TTS_V_BITFIELD              skc_int
238 
239 #define SKC_RENDER_TTP_V                       skc_int2
240 #define SKC_RENDER_AREA_V                      skc_int2
241 
242 #define SKC_RENDER_TILE_COLOR_PAIR             half2
243 #define SKC_RENDER_TILE_COLOR_PAIR_LOAD(x,v)   vload2(x,v)
244 
245 #define SKC_RENDER_SURFACE_COLOR               half4
246 #define SKC_RENDER_SURFACE_WRITE               write_imageh
247 
248 // #define SKC_RENDER_TTXB_VECTOR_INT             int2
249 // #define SKC_RENDER_TTXB_VECTOR_UINT            uint2
250 
251 #define SKC_RENDER_WIDE_AA                     ulong // SLM = 64 bytes/clock
252 
253 #define SKC_RENDER_TILE_COLOR                  half2
254 #define SKC_RENDER_TILE_COVER                  half2
255 
256 #define SKC_RENDER_ACC_COVER_INT               int2
257 #define SKC_RENDER_ACC_COVER_UINT              uint2
258 
259 #define SKC_RENDER_GRADIENT_FLOAT              float2
260 #define SKC_RENDER_GRADIENT_INT                int2
261 #define SKC_RENDER_GRADIENT_STOP               int2
262 #define SKC_RENDER_GRADIENT_FRAC               half2
263 #define SKC_RENDER_GRADIENT_COLOR_STOP         half
264 
265 #define SKC_RENDER_SURFACE_U8_RGBA             uint2
266 
267 #define SKC_RENDER_TILE_COLOR_VECTOR           uint16
268 #define SKC_RENDER_TILE_COLOR_VECTOR_COMPONENT uint
269 #define SKC_RENDER_TILE_COLOR_VECTOR_COUNT     ((sizeof(SKC_RENDER_TILE_COLOR) * 4 * SKC_TILE_WIDTH) / sizeof(SKC_RENDER_TILE_COLOR_VECTOR))
270 
271 /////////////////////////////////////////////////////////////////
272 //
273 // PATHS & RASTERS RECLAIM
274 //
275 // FIXME -- investigate enabling the stride option for a smaller grid
276 // that iterates over a fixed number of threads.  Since reclamation is
277 // a low-priority task, it's probably reasonable to trade longer
278 // reclamation times for lower occupancy of the device because it
279 // might delay the fastpath of the pipeline.
280 //
281 
282 #define SKC_RECLAIM_ARRAY_SIZE                  (7 * 8 / 2) // 8 EUs with 7 hardware threads divided by 2 is half a sub-slice
283 
284 /////////////////////////////////////////////////////////////////
285 //
286 // PATHS RECLAIM
287 //
288 
289 #define SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2    SKC_DEVICE_SUBBLOCK_WORDS_LOG2 // FIXME -- SUBGROUP OR THREADS PER BLOCK?
290 #define SKC_PATHS_RECLAIM_LOCAL_ELEMS           1
291 #define SKC_PATHS_RECLAIM_KERNEL_ATTRIBS        __attribute__((intel_reqd_sub_group_size(SKC_PATHS_RECLAIM_SUBGROUP_SIZE)))
292 
293 /////////////////////////////////////////////////////////////////
294 //
295 // RASTERS RECLAIM
296 //
297 
298 #define SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2  SKC_DEVICE_SUBBLOCK_WORDS_LOG2 // FIXME -- SUBGROUP OR THREADS PER BLOCK?
299 #define SKC_RASTERS_RECLAIM_LOCAL_ELEMS         1
300 #define SKC_RASTERS_RECLAIM_KERNEL_ATTRIBS      __attribute__((intel_reqd_sub_group_size(SKC_RASTERS_RECLAIM_SUBGROUP_SIZE)))
301 
302 //
303 // COMMON -- FIXME -- HOIST THESE ELSEWHERE
304 //
305 
306 #define SKC_DEVICE_BLOCK_WORDS                 (1u << SKC_DEVICE_BLOCK_WORDS_LOG2)
307 #define SKC_DEVICE_SUBBLOCK_WORDS              (1u << SKC_DEVICE_SUBBLOCK_WORDS_LOG2)
308 
309 #define SKC_DEVICE_BLOCK_DWORDS                (SKC_DEVICE_BLOCK_WORDS / 2)
310 
311 #define SKC_DEVICE_BLOCK_WORDS_MASK            SKC_BITS_TO_MASK(SKC_DEVICE_BLOCK_WORDS_LOG2)
312 #define SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK    SKC_BITS_TO_MASK(SKC_DEVICE_BLOCK_WORDS_LOG2 - SKC_DEVICE_SUBBLOCK_WORDS_LOG2)
313 
314 #define SKC_DEVICE_SUBBLOCKS_PER_BLOCK         (SKC_DEVICE_BLOCK_WORDS / SKC_DEVICE_SUBBLOCK_WORDS)
315 
316 #define SKC_TILE_RATIO                         (SKC_TILE_HEIGHT / SKC_TILE_WIDTH)
317 
318 //
319 //
320 //
321 
322 #define SKC_PATHS_COPY_SUBGROUP_SIZE           (1 << SKC_PATHS_COPY_SUBGROUP_SIZE_LOG2)
323 #define SKC_PATHS_RECLAIM_SUBGROUP_SIZE        (1 << SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2)
324 #define SKC_RASTERS_RECLAIM_SUBGROUP_SIZE      (1 << SKC_RASTERS_RECLAIM_SUBGROUP_SIZE_LOG2)
325 #define SKC_FILLS_EXPAND_SUBGROUP_SIZE         (1 << SKC_FILLS_EXPAND_SUBGROUP_SIZE_LOG2)
326 
327 //
328 //
329 //
330 
331 #endif
332 
333 //
334 //
335 //
336