1 /*
2  * Copyright 2017 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can
5  * be found in the LICENSE file.
6  *
7  */
8 
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include <assert.h>
12 
13 #include "common/cl/assert_cl.h"
14 
15 #include "tile.h"
16 #include "raster.h"
17 #include "macros.h"
18 
19 #include "config_cl.h"
20 #include "runtime_cl_12.h"
21 
22 #include "kernel_cl_12.h"
23 #include "device_cl_12.h"
24 
25 //
26 //
27 //
28 
29 #include "hs/cl/hs_cl.h"
30 #include "hs/cl/intel/gen8/u64/hs_target.h"
31 #include "hs/cl/intel/gen8/u64/hs_config.h"
32 
33 //
34 //
35 //
36 
//
// Kernel program format selector -- the first non-zero entry (tested
// in the order SPIR-V, binary, source) decides which set of kernel
// ".inl" files is included and how the programs are created.
//
#define SKC_KERNEL_SPIRV   0 // programs are SPIR-V modules
#define SKC_KERNEL_BINARY  1 // programs are device binaries
#define SKC_KERNEL_SRC     0 // programs are OpenCL C source
40 
41 //
42 //
43 //
44 
45 #if   SKC_KERNEL_SPIRV
46 
47 #include "inl/block_pool_init.pre.spv.inl"
48 #include "inl/paths_copy.pre.spv.inl"
49 #include "inl/fills_expand.pre.spv.inl"
50 #include "inl/rasterize.pre.spv.inl"
51 #include "inl/segment_ttrk.pre.spv.inl"
52 #include "inl/rasters_alloc.pre.spv.inl"
53 #include "inl/prefix.pre.spv.inl"
54 #include "inl/place.pre.spv.inl"
55 #include "inl/segment_ttck.pre.spv.inl"
56 #include "inl/render.pre.spv.inl"
57 #include "inl/paths_reclaim.pre.spv.inl"
58 #include "inl/rasters_reclaim.pre.spv.inl"
59 
60 #elif SKC_KERNEL_BINARY
61 
62 #include "inl/block_pool_init.pre.bin.inl"
63 #include "inl/paths_copy.pre.bin.inl"
64 #include "inl/fills_expand.pre.bin.inl"
65 #include "inl/rasterize.pre.bin.inl"
66 #include "inl/segment_ttrk.pre.bin.inl"
67 #include "inl/rasters_alloc.pre.bin.inl"
68 #include "inl/prefix.pre.bin.inl"
69 #include "inl/place.pre.bin.inl"
70 #include "inl/segment_ttck.pre.bin.inl"
71 #include "inl/render.pre.bin.inl"
72 #include "inl/paths_reclaim.pre.bin.inl"
73 #include "inl/rasters_reclaim.pre.bin.inl"
74 
75 #elif SKC_KERNEL_SRC
76 
77 #include "inl/block_pool_init.pre.src.inl"
78 #include "inl/paths_copy.pre.src.inl"
79 #include "inl/fills_expand.pre.src.inl"
80 #include "inl/rasterize.pre.src.inl"
81 #include "inl/segment_ttrk.pre.src.inl"
82 #include "inl/rasters_alloc.pre.src.inl"
83 #include "inl/prefix.pre.src.inl"
84 #include "inl/place.pre.src.inl"
85 #include "inl/segment_ttck.pre.src.inl"
86 #include "inl/render.pre.src.inl"
87 #include "inl/paths_reclaim.pre.src.inl"
88 #include "inl/rasters_reclaim.pre.src.inl"
89 
90 #endif
91 
92 //
93 // FIXME -- THE CONFIG INITIALIZATION IS ONLY HERE TEMPORARILY
94 //
95 // FIXME -- move these to log2 values where appropriate
96 //
97 
98 static
99 struct skc_config const config =
100   {
101     .suballocator = {
102       .host = {
103         .size       = 1024 * 1024, // words
104         .subbufs    = 1024 // must be <= (1 << (8 * sizeof(skc_subbuf_id_t)))
105       },
106       .device = {
107         .size       = 128 * 1024 * 1024,
108         .subbufs    = 1024 // must be <= (1 << (8 * sizeof(skc_subbuf_id_t)))
109       }
110     },
111 
112     .scheduler = {
113       .size         = 4096 // 128 // FIXME -- this is just for testing -- way too big -- schedulees should bring their own state
114     },
115 
116     .subblock = {
117       .words        = SKC_DEVICE_SUBBLOCK_WORDS,                         // words per subblock -- pow2
118       .bytes        = SKC_DEVICE_SUBBLOCK_WORDS * sizeof(skc_uint)       // bytes per subblock -- pow2
119     },
120 
121     .block = {
122       .words        = SKC_DEVICE_BLOCK_WORDS,                            // words per block     -- pow2
123       .bytes        = SKC_DEVICE_BLOCK_WORDS * sizeof(skc_uint),         // bytes per block     -- pow2
124       .subblocks    = SKC_DEVICE_BLOCK_WORDS / SKC_DEVICE_SUBBLOCK_WORDS // subblocks per block -- block.bytes >= subblock.bytes
125     },
126 
127     .block_pool = {
128       .pool_size    = 524288, // blocks in pool -- 128 MB
129       .ring_pow2    = 524288, // blocks in pool rounded up pow2
130       .ring_mask    = 524288 - 1
131     },
132 
133     .cq_pool     = {
134 #ifndef NDEBUG
135        .cq_props    = CL_QUEUE_PROFILING_ENABLE,
136 #else
137        .cq_props    = 0,
138 #endif
139       .size         = 8
140     },
141 
142     .handle_pool = {
143       .size         = 262144,  // large fraction of block pool size (for now, 1:2)
144       .width        = SKC_RECLAIM_ARRAY_SIZE,
145       .recs         = 256      // too many?  too few?
146     },
147 
148     .tile = {
149       .width        = SKC_TILE_WIDTH,                  // tile width  in pixels
150       .height       = SKC_TILE_HEIGHT,                 // tile height in pixels
151       .ratio        = SKC_TILE_HEIGHT / SKC_TILE_WIDTH // subblocks per TTPB
152     },
153 
154     .paths_copy = {
155 
156       .buffer = {
157         .count      = 16   // # of subbufs in buffer
158       },
159 
160       .subbuf = {
161         .count      = 1024 // # of blocks/commands in subbuf
162       },
163 
164       .block = {
165         .subbuf     = SKC_DEVICE_BLOCK_WORDS * sizeof(skc_uint) * 1024,     // block.bytes * subbuf.blocks -- multiple of CL_DEVICE_MEM_BASE_ADDR_ALIGN
166         .buffer     = SKC_DEVICE_BLOCK_WORDS * sizeof(skc_uint) * 1024 * 16 // block.bytes * subbuf.blocks * subbuf.count
167       },
168 
169       .command = {
170         .subbuf     = sizeof(skc_uint) * 1024,     // sizeof(skc_uint) * subbuf.blocks -- multiple of CL_DEVICE_MEM_BASE_ADDR_ALIGN
171         .buffer     = sizeof(skc_uint) * 1024 * 16 // sizeof(skc_uint) * subbuf.blocks * subbuf.count
172       },
173 
174       // skc_uint paths_lowat;
175     },
176 
177     .raster_cohort = {
178       .path_ids = {
179         .elem_count = 8192,
180         .snap_count = 1024 // FIXME -- THIS SHOULD BE WAYYYY BIGGER
181       },
182 
183       .transforms = {
184         .elem_count = 8192,
185         .snap_count = 1024 // FIXME -- THIS SHOULD BE WAYYYY BIGGER
186       },
187 
188       .clips = {
189         .elem_count = 8192,
190         .snap_count = 1024 // FIXME -- THIS SHOULD BE WAYYYY BIGGER
191       },
192 
193       .fill = {
194         .elem_count = 8192,
195         .snap_count = 1024 // FIXME -- THIS SHOULD BE WAYYYY BIGGER
196       },
197 
198       .raster_ids = {
199         .elem_count = 8192,
200         .snap_count = (1<<SKC_TTRK_HI_BITS_COHORT) // 256
201       },
202 
203       .expand = {
204         .cmds       = 1024*128,
205       },
206 
207       .rasterize = {
208         .keys       = 1024*1024
209       }
210     },
211 
212     .composition = {
213       .cmds = {
214         .elem_count = 1024*16,
215         .snap_count = 1024
216       },
217       .raster_ids = {
218         .elem_count = 1024*1024
219       },
220       .keys = {
221         .elem_count = 1024*1024,
222       }
223     },
224   };
225 
226 //
227 //
228 //
229 
//
// Options used for optimized kernel builds -- a single
// space-separated string that ends with a trailing space.
//
static char const cl_build_options_optimized[] =
  "-cl-std=CL1.2"
  " -cl-single-precision-constant"
  " -cl-denorms-are-zero"
  " -cl-mad-enable"
  " -cl-no-signed-zeros"
  " -cl-fast-relaxed-math"
  " -cl-kernel-arg-info"
  " ";
238 
//
// Options used for debug kernel builds.  A source directory can be
// passed to the compiler by appending "-s <path>".
//
static char const cl_build_options_debug[] =
  "-cl-std=CL1.2"
  " -cl-kernel-arg-info"
  " -g";

//
// Select the option string handed to every program build -- switch
// to cl_build_options_debug to build debuggable kernels.
//
#define SKC_BUILD_OPTIONS  cl_build_options_optimized
244 
245 //
246 //
247 //
248 
//
// Describes one program: its name, its build options and the blob
// (SPIR-V, binary or source text) it is created from.
//
// NOTE: member order matters -- initializers are positional.
//
struct skc_program_source
{
  const char * name;    // printed while the program is being built
  const char * options; // build option string
  const char * src;     // program blob
  const size_t srclen;  // size of the program blob in bytes
};
256 
257 //
258 // THIS IS A RELATIVELY COMPACT WAY OF DECLARING EACH PROGRAM SOURCE
259 // AND ITS BUILD OPTIONS
260 //
261 
262 union skc_program_sources
263 {
264   struct {
265     struct skc_program_source block_pool_init;
266     struct skc_program_source paths_copy;
267     struct skc_program_source fills_expand;
268     struct skc_program_source rasterize;
269     struct skc_program_source segment_ttrk;
270     struct skc_program_source rasters_alloc;
271     struct skc_program_source prefix;
272     struct skc_program_source place;
273     struct skc_program_source segment_ttck;
274     struct skc_program_source render;
275     struct skc_program_source paths_reclaim;
276     struct skc_program_source rasters_reclaim;
277   };
278   struct skc_program_source   sources[];
279 };
280 
281 typedef size_t * (*skc_grid_shaper)(size_t    const work_size,
282                                     cl_uint * const work_dim,
283                                     size_t  * const global_work_size,
284                                     size_t  * const local_work_size);
285 struct skc_program_kernel
286 {
287   char const *         name;
288   skc_grid_shaper      shaper;
289   skc_device_kernel_id id;
290 };
291 
292 union skc_program_kernels
293 {
294   struct {
295     struct skc_program_kernel block_pool_init[2];
296     struct skc_program_kernel paths_copy     [2];
297     struct skc_program_kernel fills_expand   [1];
298     struct skc_program_kernel rasterize      [6];
299     struct skc_program_kernel segment_ttrk   [1];
300     struct skc_program_kernel rasters_alloc  [1];
301     struct skc_program_kernel prefix         [1];
302     struct skc_program_kernel place          [1];
303     struct skc_program_kernel segment_ttck   [1];
304     struct skc_program_kernel render         [1];
305     struct skc_program_kernel paths_reclaim  [1];
306     struct skc_program_kernel rasters_reclaim[1];
307   };
308   struct skc_program_kernel   kernels[];
309 };
310 
311 //
312 //
313 //
314 
315 #if     SKC_KERNEL_SPIRV  // PROGRAM IS SPIR-V
316 #define SKC_KERNEL_SUFFIX(n) n ## _pre_spv
317 #elif   SKC_KERNEL_BINARY // PROGRAM IS BINARY
318 #define SKC_KERNEL_SUFFIX(n) n ## _pre_ir
319 #elif   SKC_KERNEL_SRC    // PROGRAM IS SOURCE CODE
320 #define SKC_KERNEL_SUFFIX(n) n ## _pre_cl
321 #else
322 #error  "SKC_KERNEL_???"
323 #endif
324 
325 //
326 //
327 //
328 
//
// Initializer helpers for the program and kernel tables:
//
//   SKC_PROGRAM_SOURCE_EXPAND(k,s,o) - designated initializer for member
//                                      k: name, options o, blob s, sizeof(s)
//   SKC_PROGRAM_SOURCE(k,o)          - same, with the blob symbol derived
//                                      from k via SKC_KERNEL_SUFFIX
//   SKC_PROGRAM_KERNEL(k)            - the first two initializers of a
//                                      kernel entry: the "skc_kernel_<k>"
//                                      name and the skc_device_shaper_<k>
//                                      function
//
#define SKC_PROGRAM_SOURCE_EXPAND(k,s,o)  .k = { SKC_STRINGIFY(k), o, s, sizeof(s) }

#define SKC_PROGRAM_SOURCE(k,o)           SKC_PROGRAM_SOURCE_EXPAND(k,SKC_KERNEL_SUFFIX(k),o)

#define SKC_PROGRAM_KERNEL(k)             "skc_kernel_" SKC_STRINGIFY(k), SKC_CONCAT(skc_device_shaper_,k)
332 
333 //
334 //
335 //
336 
337 static
338 size_t *
skc_device_shaper_block_pool_init_ids(size_t const work_size,cl_uint * const work_dim,size_t * const work_global,size_t * const work_local)339 skc_device_shaper_block_pool_init_ids(size_t    const work_size,
340                                       cl_uint * const work_dim,
341                                       size_t  * const work_global,
342                                       size_t  * const work_local)
343 {
344   work_dim   [0] = 1;
345   work_global[0] = work_size;
346 
347   return NULL; // let runtime figure out local work size
348 }
349 
350 static
351 size_t *
skc_device_shaper_block_pool_init_atomics(size_t const work_size,cl_uint * const work_dim,size_t * const work_global,size_t * const work_local)352 skc_device_shaper_block_pool_init_atomics(size_t    const work_size,
353                                           cl_uint * const work_dim,
354                                           size_t  * const work_global,
355                                           size_t  * const work_local)
356 {
357   work_dim   [0] = 1;
358   work_global[0] = 2;
359 
360   return NULL; // let runtime figure out local work size
361 }
362 
363 static
364 size_t *
skc_device_shaper_paths_alloc(size_t const work_size,cl_uint * const work_dim,size_t * const work_global,size_t * const work_local)365 skc_device_shaper_paths_alloc(size_t    const work_size,
366                               cl_uint * const work_dim,
367                               size_t  * const work_global,
368                               size_t  * const work_local)
369 {
370   work_dim   [0] = 1;
371   work_global[0] = 1;
372 
373   return NULL; // let runtime figure out local work size
374 }
375 
376 
377 static
378 size_t *
skc_device_shaper_paths_copy(size_t const work_size,cl_uint * const work_dim,size_t * const work_global,size_t * const work_local)379 skc_device_shaper_paths_copy(size_t    const work_size,
380                              cl_uint * const work_dim,
381                              size_t  * const work_global,
382                              size_t  * const work_local)
383 {
384   work_dim   [0] = 1;
385   work_global[0] = SKC_PATHS_COPY_SUBGROUP_SIZE * work_size;
386 #if 0
387   work_local [0] = SKC_PATHS_COPY_SUBGROUP_SIZE;
388 
389   return work_local;
390 #else
391   return NULL; // let runtime figure out local work size
392 #endif
393 }
394 
395 static
396 size_t *
skc_device_shaper_fills_expand(size_t const work_size,cl_uint * const work_dim,size_t * const work_global,size_t * const work_local)397 skc_device_shaper_fills_expand(size_t    const work_size,
398                                cl_uint * const work_dim,
399                                size_t  * const work_global,
400                                size_t  * const work_local)
401 {
402   work_dim   [0] = 1;
403   work_global[0] = SKC_FILLS_EXPAND_SUBGROUP_SIZE * work_size;
404   work_local [0] = SKC_FILLS_EXPAND_SUBGROUP_SIZE;
405 
406   return work_local;
407 }
408 
409 static
410 size_t *
skc_device_shaper_rasterize(size_t const work_size,cl_uint * const work_dim,size_t * const work_global,size_t * const work_local)411 skc_device_shaper_rasterize(size_t    const work_size,
412                             cl_uint * const work_dim,
413                             size_t  * const work_global,
414                             size_t  * const work_local)
415 {
416   work_dim   [0] = 1;
417   work_global[0] = SKC_RASTERIZE_SUBGROUP_SIZE * work_size;
418   work_local [0] = SKC_RASTERIZE_SUBGROUP_SIZE;
419 
420   return work_local;
421 }
422 
423 static
424 size_t *
skc_device_shaper_rasterize_all(size_t const work_size,cl_uint * const work_dim,size_t * const work_global,size_t * const work_local)425 skc_device_shaper_rasterize_all(size_t    const work_size,
426                                 cl_uint * const work_dim,
427                                 size_t  * const work_global,
428                                 size_t  * const work_local)
429 {
430   return skc_device_shaper_rasterize(work_size,work_dim,work_global,work_local);
431 }
432 
433 static
434 size_t *
skc_device_shaper_rasterize_lines(size_t const work_size,cl_uint * const work_dim,size_t * const work_global,size_t * const work_local)435 skc_device_shaper_rasterize_lines(size_t    const work_size,
436                                   cl_uint * const work_dim,
437                                   size_t  * const work_global,
438                                   size_t  * const work_local)
439 {
440   return skc_device_shaper_rasterize(work_size,work_dim,work_global,work_local);
441 }
442 
443 static
444 size_t *
skc_device_shaper_rasterize_quads(size_t const work_size,cl_uint * const work_dim,size_t * const work_global,size_t * const work_local)445 skc_device_shaper_rasterize_quads(size_t    const work_size,
446                                   cl_uint * const work_dim,
447                                   size_t  * const work_global,
448                                   size_t  * const work_local)
449 {
450   return skc_device_shaper_rasterize(work_size,work_dim,work_global,work_local);
451 }
452 
453 static
454 size_t *
skc_device_shaper_rasterize_cubics(size_t const work_size,cl_uint * const work_dim,size_t * const work_global,size_t * const work_local)455 skc_device_shaper_rasterize_cubics(size_t    const work_size,
456                                    cl_uint * const work_dim,
457                                    size_t  * const work_global,
458                                    size_t  * const work_local)
459 {
460   return skc_device_shaper_rasterize(work_size,work_dim,work_global,work_local);
461 }
462 
463 static
464 size_t *
skc_device_shaper_rasterize_rat_quads(size_t const work_size,cl_uint * const work_dim,size_t * const work_global,size_t * const work_local)465 skc_device_shaper_rasterize_rat_quads(size_t    const work_size,
466                                       cl_uint * const work_dim,
467                                       size_t  * const work_global,
468                                       size_t  * const work_local)
469 {
470   return skc_device_shaper_rasterize(work_size,work_dim,work_global,work_local);
471 }
472 
473 static
474 size_t *
skc_device_shaper_rasterize_rat_cubics(size_t const work_size,cl_uint * const work_dim,size_t * const work_global,size_t * const work_local)475 skc_device_shaper_rasterize_rat_cubics(size_t    const work_size,
476                                        cl_uint * const work_dim,
477                                        size_t  * const work_global,
478                                        size_t  * const work_local)
479 {
480   return skc_device_shaper_rasterize(work_size,work_dim,work_global,work_local);
481 }
482 
483 static
484 size_t *
skc_device_shaper_rasters_alloc(size_t const work_size,cl_uint * const work_dim,size_t * const work_global,size_t * const work_local)485 skc_device_shaper_rasters_alloc(size_t    const work_size,
486                                 cl_uint * const work_dim,
487                                 size_t  * const work_global,
488                                 size_t  * const work_local)
489 {
490   // round up to whole groups
491   size_t gs = SKC_ROUND_UP(work_size,SKC_RASTERS_ALLOC_GROUP_SIZE);
492 
493   work_dim   [0] = 1;
494   work_global[0] = gs;
495   work_local [0] = SKC_RASTERS_ALLOC_GROUP_SIZE;
496 
497   return work_local;
498 }
499 
500 static
501 size_t *
skc_device_shaper_segment_ttrk(size_t const work_size,cl_uint * const work_dim,size_t * const work_global,size_t * const work_local)502 skc_device_shaper_segment_ttrk(size_t    const work_size,
503                                cl_uint * const work_dim,
504                                size_t  * const work_global,
505                                size_t  * const work_local)
506 {
507   // work_size is number of keys -- round up to a whole slab
508   size_t keys_ru = SKC_ROUND_UP(work_size,HS_SLAB_WIDTH*HS_SLAB_HEIGHT);
509 
510   work_dim   [0] = 1;
511   work_global[0] = keys_ru / HS_SLAB_HEIGHT;
512   work_local [0] = HS_SLAB_WIDTH; // or just return NULL
513 
514   return work_local;
515 }
516 
517 static
518 size_t *
skc_device_shaper_segment_ttck(size_t const work_size,cl_uint * const work_dim,size_t * const work_global,size_t * const work_local)519 skc_device_shaper_segment_ttck(size_t    const work_size,
520                                cl_uint * const work_dim,
521                                size_t  * const work_global,
522                                size_t  * const work_local)
523 {
524   // work_size is number of keys -- round up to a whole slab
525   size_t keys_ru = SKC_ROUND_UP(work_size,HS_SLAB_WIDTH*HS_SLAB_HEIGHT);
526 
527   work_dim   [0] = 1;
528   work_global[0] = keys_ru / HS_SLAB_HEIGHT;
529   work_local [0] = HS_SLAB_WIDTH; // or just return NULL
530 
531   return work_local;
532 }
533 
534 static
535 size_t *
skc_device_shaper_prefix(size_t const work_size,cl_uint * const work_dim,size_t * const work_global,size_t * const work_local)536 skc_device_shaper_prefix(size_t    const work_size,
537                          cl_uint * const work_dim,
538                          size_t  * const work_global,
539                          size_t  * const work_local)
540 {
541   work_dim   [0] = 1;
542   work_global[0] = SKC_PREFIX_SUBGROUP_SIZE * work_size;
543   work_local [0] = SKC_PREFIX_SUBGROUP_SIZE;
544 
545   return work_local;
546 }
547 
548 static
549 size_t *
skc_device_shaper_place(size_t const work_size,cl_uint * const work_dim,size_t * const work_global,size_t * const work_local)550 skc_device_shaper_place(size_t    const work_size,
551                         cl_uint * const work_dim,
552                         size_t  * const work_global,
553                         size_t  * const work_local)
554 {
555   work_dim   [0] = 1;
556   work_global[0] = SKC_PLACE_SUBGROUP_SIZE * work_size;
557   work_local [0] = SKC_PLACE_SUBGROUP_SIZE;
558 
559   return work_local;
560 }
561 
562 static
563 size_t *
skc_device_shaper_render(size_t const work_size,cl_uint * const work_dim,size_t * const work_global,size_t * const work_local)564 skc_device_shaper_render(size_t    const work_size,
565                          cl_uint * const work_dim,
566                          size_t  * const work_global,
567                          size_t  * const work_local)
568 {
569   work_dim   [0] = 1;
570   work_global[0] = SKC_RENDER_SUBGROUP_SIZE * work_size;
571   work_local [0] = SKC_RENDER_SUBGROUP_SIZE;
572 
573   return work_local;
574 }
575 
576 static
577 size_t *
skc_device_shaper_paths_reclaim(size_t const work_size,cl_uint * const work_dim,size_t * const work_global,size_t * const work_local)578 skc_device_shaper_paths_reclaim(size_t    const work_size,
579                                 cl_uint * const work_dim,
580                                 size_t  * const work_global,
581                                 size_t  * const work_local)
582 {
583   assert(work_size == SKC_RECLAIM_ARRAY_SIZE);
584 
585   work_dim   [0] = 1;
586   work_global[0] = SKC_RECLAIM_ARRAY_SIZE * SKC_PATHS_RECLAIM_SUBGROUP_SIZE;
587 
588   return NULL; // let runtime figure out local work size
589 }
590 
591 static
592 size_t *
skc_device_shaper_rasters_reclaim(size_t const work_size,cl_uint * const work_dim,size_t * const work_global,size_t * const work_local)593 skc_device_shaper_rasters_reclaim(size_t    const work_size,
594                                   cl_uint * const work_dim,
595                                   size_t  * const work_global,
596                                   size_t  * const work_local)
597 {
598   assert(work_size == SKC_RECLAIM_ARRAY_SIZE);
599 
600   work_dim   [0] = 1;
601   work_global[0] = SKC_RECLAIM_ARRAY_SIZE * SKC_PATHS_RECLAIM_SUBGROUP_SIZE;
602 
603   return NULL; // let runtime figure out local work size
604 }
605 
606 //
607 //
608 //
609 
610 static union skc_program_sources const program_sources = {
611   SKC_PROGRAM_SOURCE(block_pool_init,SKC_BUILD_OPTIONS),
612   SKC_PROGRAM_SOURCE(paths_copy,     SKC_BUILD_OPTIONS),
613   SKC_PROGRAM_SOURCE(fills_expand,   SKC_BUILD_OPTIONS),
614   SKC_PROGRAM_SOURCE(rasterize,      SKC_BUILD_OPTIONS),
615   SKC_PROGRAM_SOURCE(segment_ttrk,   SKC_BUILD_OPTIONS),
616   SKC_PROGRAM_SOURCE(rasters_alloc,  SKC_BUILD_OPTIONS),
617   SKC_PROGRAM_SOURCE(prefix,         SKC_BUILD_OPTIONS),
618   SKC_PROGRAM_SOURCE(place,          SKC_BUILD_OPTIONS),
619   SKC_PROGRAM_SOURCE(segment_ttck,   SKC_BUILD_OPTIONS),
620   SKC_PROGRAM_SOURCE(render,         SKC_BUILD_OPTIONS),
621   SKC_PROGRAM_SOURCE(paths_reclaim,  SKC_BUILD_OPTIONS),
622   SKC_PROGRAM_SOURCE(rasters_reclaim,SKC_BUILD_OPTIONS)
623 };
624 
625 static union skc_program_kernels const program_kernels = {
626 
627   .block_pool_init = { { SKC_PROGRAM_KERNEL(block_pool_init_ids),     SKC_DEVICE_KERNEL_ID_BLOCK_POOL_INIT_IDS     },
628                        { SKC_PROGRAM_KERNEL(block_pool_init_atomics), SKC_DEVICE_KERNEL_ID_BLOCK_POOL_INIT_ATOMICS } },
629 
630   .paths_copy      = { { SKC_PROGRAM_KERNEL(paths_alloc),             SKC_DEVICE_KERNEL_ID_PATHS_ALLOC             },
631                        { SKC_PROGRAM_KERNEL(paths_copy) ,             SKC_DEVICE_KERNEL_ID_PATHS_COPY              } },
632 
633   .fills_expand    = { { SKC_PROGRAM_KERNEL(fills_expand),            SKC_DEVICE_KERNEL_ID_FILLS_EXPAND            } },
634 
635   .rasterize       = { { SKC_PROGRAM_KERNEL(rasterize_all),           SKC_DEVICE_KERNEL_ID_RASTERIZE_ALL           },
636                        { SKC_PROGRAM_KERNEL(rasterize_lines),         SKC_DEVICE_KERNEL_ID_RASTERIZE_LINES         },
637                        { SKC_PROGRAM_KERNEL(rasterize_quads),         SKC_DEVICE_KERNEL_ID_RASTERIZE_QUADS         },
638                        { SKC_PROGRAM_KERNEL(rasterize_cubics),        SKC_DEVICE_KERNEL_ID_RASTERIZE_CUBICS        },
639                        { SKC_PROGRAM_KERNEL(rasterize_rat_quads),     SKC_DEVICE_KERNEL_ID_RASTERIZE_RAT_QUADS     },
640                        { SKC_PROGRAM_KERNEL(rasterize_rat_cubics),    SKC_DEVICE_KERNEL_ID_RASTERIZE_RAT_CUBICS    } },
641 
642   .segment_ttrk    = { { SKC_PROGRAM_KERNEL(segment_ttrk),            SKC_DEVICE_KERNEL_ID_SEGMENT_TTRK            } },
643 
644   .rasters_alloc   = { { SKC_PROGRAM_KERNEL(rasters_alloc),           SKC_DEVICE_KERNEL_ID_RASTERS_ALLOC           } },
645 
646   .prefix          = { { SKC_PROGRAM_KERNEL(prefix),                  SKC_DEVICE_KERNEL_ID_PREFIX                  } },
647 
648   .place           = { { SKC_PROGRAM_KERNEL(place),                   SKC_DEVICE_KERNEL_ID_PLACE                   } },
649 
650   .segment_ttck    = { { SKC_PROGRAM_KERNEL(segment_ttck) ,           SKC_DEVICE_KERNEL_ID_SEGMENT_TTCK            } },
651 
652   .render          = { { SKC_PROGRAM_KERNEL(render),                  SKC_DEVICE_KERNEL_ID_RENDER                  } },
653 
654   .paths_reclaim   = { { SKC_PROGRAM_KERNEL(paths_reclaim),           SKC_DEVICE_KERNEL_ID_PATHS_RECLAIM           } },
655 
656   .rasters_reclaim = { { SKC_PROGRAM_KERNEL(rasters_reclaim),         SKC_DEVICE_KERNEL_ID_RASTERS_RECLAIM         } }
657 };
658 
659 //
660 //
661 //
662 
663 struct skc_device
664 {
665   //
666   // FIXME -- an OpenCL 2.1+ device would clone these kernels in a
667   // multithreaded system.
668   //
669   // Not having the ability to clone kernels (yet set their sticky
670   // args) was an oversight in previous versions of OpenCL.
671   //
672   // For now, we can probably get away with just a single kernel
673   // instance as long as args are set and the kernel is launched
674   // before having its arguments stomped on.
675   //
676   cl_kernel kernels [SKC_DEVICE_KERNEL_ID_COUNT];
677   size_t    reqd_szs[SKC_DEVICE_KERNEL_ID_COUNT][3];
678 };
679 
680 //
681 // CREATE KERNELS
682 //
683 
684 static
685 void
skc_device_create_kernels(struct skc_runtime * const runtime,struct skc_program_kernel const * const kernels,skc_uint const kernel_count,cl_program program)686 skc_device_create_kernels(struct skc_runtime              * const runtime,
687                           struct skc_program_kernel const * const kernels,
688                           skc_uint                          const kernel_count,
689                           cl_program                              program)
690 {
691   for (skc_uint ii=0; ii<kernel_count; ii++)
692     {
693       cl_int cl_err;
694 
695       char     const * name = kernels[ii].name;
696       skc_uint const   id   = kernels[ii].id;
697 
698       fprintf(stderr,"\t\"%s\"\n",name);
699 
700       // create the kernel
701       runtime->device->kernels[id] = clCreateKernel(program,name,&cl_err); cl_ok(cl_err);
702 
703       //
704       // release program now
705       //
706       // FIXME -- if/when we multithread then we need to clone kernels
707       // (>=2.1) or keep programs around (<=2.0)
708       //
709 
710       // get workgroup size
711       cl(GetKernelWorkGroupInfo(runtime->device->kernels[id],
712                                 runtime->cl.device_id,
713                                 CL_KERNEL_COMPILE_WORK_GROUP_SIZE,
714                                 sizeof(runtime->device->reqd_szs[0]),
715                                 runtime->device->reqd_szs[id],
716                                 NULL));
717 
718       //
719       // GEN9+ PROBING
720       //
721 #define SKC_TARGET_GEN9
722 #ifdef  SKC_TARGET_GEN9
723 
724 #define CL_DEVICE_SUB_GROUP_SIZES_INTEL         0x4108
725 #define CL_KERNEL_SPILL_MEM_SIZE_INTEL          0x4109
726 #define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL  0x410A
727 
728       cl_ulong spill_mem_size;
729 
730       cl(GetKernelWorkGroupInfo(runtime->device->kernels[id],
731                                 runtime->cl.device_id,
732                                 CL_KERNEL_SPILL_MEM_SIZE_INTEL,
733                                 sizeof(spill_mem_size),
734                                 &spill_mem_size,
735                                 NULL));
736 
737       fprintf(stderr,"\t\tspill mem size: %lu bytes\n",
738               (unsigned long)spill_mem_size);
739 
740       cl_ulong local_mem_size;
741 
742       cl(GetKernelWorkGroupInfo(runtime->device->kernels[id],
743                                 runtime->cl.device_id,
744                                 CL_KERNEL_LOCAL_MEM_SIZE,
745                                 sizeof(local_mem_size),
746                                 &local_mem_size,
747                                 NULL));
748 
749       fprintf(stderr,"\t\tlocal mem size: %lu bytes\n",
750               (unsigned long)local_mem_size);
751 #endif
752     }
753 }
754 
755 static
756 void
skc_device_build_program(struct skc_runtime * const runtime,struct skc_program_source const * const source,struct skc_program_kernel const * const kernels,skc_uint const kernel_count)757 skc_device_build_program(struct skc_runtime              * const runtime,
758                          struct skc_program_source const * const source,
759                          struct skc_program_kernel const * const kernels,
760                          skc_uint                          const kernel_count)
761 {
762   cl_program program;
763 
764   fprintf(stderr,"%-20s: ",source->name);
765 
766   cl_int cl_err;
767 
768 #if   SKC_KERNEL_SPIRV // PROGRAM IS SPIR-V
769 
770   fprintf(stderr,"Creating (SPIR-V) ... ");
771 
772   program = clCreateProgramWithIL(runtime->cl.context,
773                                   source->src,
774                                   source->srclen,
775                                   &cl_err);
776 
777 #elif SKC_KERNEL_BINARY // PROGRAM IS BINARY
778 
779   fprintf(stderr,"Creating (Binary) ... ");
780 
781   cl_int status;
782   program = clCreateProgramWithBinary(runtime->cl.context,
783                                       1,
784                                       &runtime->cl.device_id,
785                                       &source->srclen,
786                                       (unsigned char const *[]){ source->src },
787                                       &status,
788                                       &cl_err);
789 
790 #elif SKC_KERNEL_SRC // PROGRAM IS SOURCE CODE
791 
792   fprintf(stderr,"Creating (Source) ... ");
793 
794   program = clCreateProgramWithSource(runtime->cl.context,
795                                       1,
796                                       (char const *[]){ source->src },
797                                       &source->srclen,
798                                       &cl_err);
799 #else
800 
801 #error "SKC_KERNEL_???"
802 
803 #endif
804 
805   cl_ok(cl_err);
806 
807   fprintf(stderr,"Building ... ");
808 
809   // build the program
810   cl(BuildProgram(program,
811                   1,
812                   &runtime->cl.device_id,
813                   source->options, // build options are ignored by binary
814                   NULL,
815                   NULL));
816 
817   fprintf(stderr,"Done\n");
818 
819   // build the kernels
820   skc_device_create_kernels(runtime,kernels,kernel_count,program);
821 
822   // we're done with program for now
823   // can always recover it from a kernel instance
824   cl(ReleaseProgram(program));
825 }
826 
827 //
828 // RELEASE KERNELS
829 //
830 
831 static
832 void
skc_device_release_kernels(struct skc_device * const device)833 skc_device_release_kernels(struct skc_device * const device)
834 {
835   for (skc_int ii=0; ii<SKC_COUNT_OF(device->kernels); ii++)
836     cl(ReleaseKernel(device->kernels[ii]));
837 }
838 
839 
840 
841 cl_kernel
skc_device_acquire_kernel(struct skc_device * const device,skc_device_kernel_id const type)842 skc_device_acquire_kernel(struct skc_device  * const device,
843                           skc_device_kernel_id const type)
844 {
845   cl_kernel kernel = device->kernels[type];
846 
847   cl(RetainKernel(kernel));
848 
849   return kernel;
850 }
851 
852 
853 void
skc_device_release_kernel(struct skc_device * const device,cl_kernel kernel)854 skc_device_release_kernel(struct skc_device  * const device,
855                           cl_kernel                  kernel)
856 {
857   cl(ReleaseKernel(kernel));
858 }
859 
860 //
861 // INITIALIZE KERNEL ARGS
862 //
863 // FIXME
864 //
865 // pre-assign any kernel arguments that are never going to change --
866 // for example, the block pool
867 //
868 
869 //
870 //
871 //
872 
//
// Expands to a skc_device_build_program() call for the program whose
// member name in `program_sources` and `program_kernels` is `p`.
//
// NOTE: the expansion refers to a variable named `runtime`, which
// must be in scope wherever the macro is used.
//
#define SKC_DEVICE_BUILD_PROGRAM(p)                             \
  skc_device_build_program(runtime,                             \
                           &program_sources.p,                  \
                           program_kernels.p,                   \
                           SKC_COUNT_OF(program_kernels.p))
875 
876 
877 void
skc_device_create(struct skc_runtime * const runtime)878 skc_device_create(struct skc_runtime * const runtime)
879 {
880   struct skc_device * const device = skc_runtime_host_perm_alloc(runtime,SKC_MEM_FLAGS_READ_WRITE,sizeof(*device));
881 
882   // hang device off of runtime
883   runtime->device = device;
884 
885   // hang config off of runtime
886   runtime->config = &config;
887 
888   // create kernels
889   SKC_DEVICE_BUILD_PROGRAM(block_pool_init);
890   SKC_DEVICE_BUILD_PROGRAM(paths_copy);
891   SKC_DEVICE_BUILD_PROGRAM(fills_expand);
892   SKC_DEVICE_BUILD_PROGRAM(rasterize);
893   SKC_DEVICE_BUILD_PROGRAM(segment_ttrk);
894   SKC_DEVICE_BUILD_PROGRAM(rasters_alloc);
895   SKC_DEVICE_BUILD_PROGRAM(prefix);
896   SKC_DEVICE_BUILD_PROGRAM(place);
897   SKC_DEVICE_BUILD_PROGRAM(segment_ttck);
898   SKC_DEVICE_BUILD_PROGRAM(render);
899   SKC_DEVICE_BUILD_PROGRAM(paths_reclaim);
900   SKC_DEVICE_BUILD_PROGRAM(rasters_reclaim);
901 
902   // create HotSort instance
903   runtime->hs = hs_cl_create(&hs_intel_gen8_u64,
904                              runtime->cl.context,
905                              runtime->cl.device_id);
906 }
907 
908 void
skc_device_dispose(struct skc_runtime * const runtime)909 skc_device_dispose(struct skc_runtime * const runtime)
910 {
911   //
912   // FIXME -- dispose of programs, kernels, etc.
913   //
914 
915   skc_runtime_host_perm_free(runtime,runtime->device);
916 
917   // dispose of hotsort etc.
918 }
919 
920 //
921 // FIXME -- just pass the device type
922 //
923 
924 void
skc_device_enqueue_kernel(struct skc_device * const device,skc_device_kernel_id const type,cl_command_queue cq,cl_kernel kernel,size_t const work_size,cl_uint num_events_in_wait_list,cl_event const * const event_wait_list,cl_event * const event)925 skc_device_enqueue_kernel(struct skc_device  * const device,
926                           skc_device_kernel_id const type,
927                           cl_command_queue           cq,
928                           cl_kernel                  kernel,
929                           size_t               const work_size,
930                           cl_uint                    num_events_in_wait_list,
931                           cl_event const     * const event_wait_list,
932                           cl_event           * const event)
933 {
934   if (work_size == 0)
935     return;
936 
937   cl_uint  work_dim   [1];
938   size_t   work_global[3];
939   size_t   work_local [3];
940 
941   size_t * work_local_ptr = program_kernels.kernels[type].shaper(work_size,
942                                                                  work_dim,
943                                                                  work_global,
944                                                                  work_local);
945   cl(EnqueueNDRangeKernel(cq,
946                           kernel,// device->kernels[type],
947                           work_dim[0],
948                           NULL,
949                           work_global,
950                           work_local_ptr,
951                           num_events_in_wait_list,
952                           event_wait_list,
953                           event));
954 }
955 
956 //
957 //
958 //
959