1 /*
2  * Copyright 2016 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can
5  * be found in the LICENSE file.
6  *
7  */
8 
9 //
10 //
11 //
12 
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h>
16 #include <inttypes.h>
17 
18 //
19 //
20 //
21 
22 #include "common/cl/assert_cl.h"
23 #include "common/macros.h"
24 #include "common/util.h"
25 
26 //
27 //
28 //
29 
30 #include "hs_cl.h"
31 
32 //
33 //
34 //
35 
36 struct hs_cl
37 {
38   struct hs_cl_target_config config;
39 
40   uint32_t                   key_val_size;
41   uint32_t                   slab_keys;
42   uint32_t                   bs_slabs_log2_ru;
43   uint32_t                   bc_slabs_log2_max;
44 
45   struct {
46     uint32_t                 count;
47     cl_kernel              * bs;
48     cl_kernel              * bc;
49     cl_kernel              * fm[3];
50     cl_kernel              * hm[3];
51     cl_kernel              * transpose;
52     cl_kernel                all[];
53   } kernels;
54 };
55 
56 //
57 //
58 //
59 
60 struct hs_state
61 {
62 #ifndef NDEBUG
63   cl_ulong         t_total; // 0
64 #endif
65 
66   cl_command_queue cq;
67 
68   // key buffers
69   cl_mem           vin;
70   cl_mem           vout; // can be vin
71 
72   // enforces ordering on out-of-order queue
73   cl_event         wait_list[3]; // worst case
74   uint32_t         wait_list_size;
75 
76   // bx_ru is number of rounded up warps in vin
77   uint32_t         bx_ru;
78 };
79 
80 //
81 //
82 //
83 
84 static
85 void
hs_state_wait_list_release(struct hs_state * const state)86 hs_state_wait_list_release(struct hs_state * const state)
87 {
88   for (uint32_t ii=0; ii<state->wait_list_size; ii++)
89     cl(ReleaseEvent(state->wait_list[ii]));
90 
91   state->wait_list_size = 0;
92 }
93 
94 static
95 void
hs_state_wait_list_update(struct hs_state * const state,uint32_t const wait_list_size,cl_event const * const wait_list)96 hs_state_wait_list_update(struct hs_state * const state,
97                           uint32_t          const wait_list_size,
98                           cl_event  const * const wait_list)
99 {
100   uint32_t const new_size = state->wait_list_size + wait_list_size;
101 
102   for (uint32_t ii=state->wait_list_size; ii<new_size; ii++)
103     state->wait_list[ii] = wait_list[ii];
104 
105   state->wait_list_size = new_size;
106 }
107 
108 //
109 //
110 //
111 
112 #ifdef NDEBUG
113 
114 #define HS_STATE_WAIT_LIST_PROFILE(state)
115 #define HS_STATE_WAIT_LIST_PROFILE_EX(state,wait_list_size,wait_list)
116 
117 #else
118 
119 #include <stdio.h>
120 
121 #define HS_STATE_WAIT_LIST_PROFILE(state)               \
122   hs_state_wait_list_profile(state,                     \
123                              state->wait_list_size,     \
124                              state->wait_list)
125 
126 #define HS_STATE_WAIT_LIST_PROFILE_EX(state,wait_list_size,wait_list)   \
127   hs_state_wait_list_profile(state,                                     \
128                              wait_list_size,                            \
129                              wait_list)
130 
131 static
132 void
hs_state_wait_list_profile(struct hs_state * const state,uint32_t const wait_list_size,cl_event const * const wait_list)133 hs_state_wait_list_profile(struct hs_state  * const state,
134                            uint32_t           const wait_list_size,
135                            cl_event   const * const wait_list)
136 {
137   cl(Finish(state->cq));
138 
139   cl_command_queue_properties props;
140 
141   cl(GetCommandQueueInfo(state->cq,
142                          CL_QUEUE_PROPERTIES,
143                          sizeof(props),
144                          &props,
145                          NULL));
146 
147   for (uint32_t ii=0; ii<wait_list_size; ii++)
148     {
149       cl_event event = wait_list[ii];
150 
151       //
152       // profiling
153       //
154       cl_ulong t_start=0, t_end=0;
155 
156       if (props & CL_QUEUE_PROFILING_ENABLE)
157         {
158           // start
159           cl(GetEventProfilingInfo(event,
160                                    CL_PROFILING_COMMAND_START,
161                                    sizeof(cl_ulong),
162                                    &t_start,
163                                    NULL));
164           // end
165           cl(GetEventProfilingInfo(event,
166                                    CL_PROFILING_COMMAND_END,
167                                    sizeof(cl_ulong),
168                                    &t_end,
169                                    NULL));
170 
171           state->t_total += t_end - t_start;
172         }
173 
174       //
175       // status
176       //
177       cl_int          status;
178       cl_command_type type;
179 
180       cl_get_event_info(event,&status,&type);
181 
182       fprintf(stdout,"%-13s, %-28s, %20"PRIu64", %20"PRIu64", %20"PRIu64", %20"PRIu64"\n",
183               cl_get_event_command_status_string(status),
184               cl_get_event_command_type_string(type),
185               t_start,t_end,t_end-t_start,state->t_total);
186     }
187 }
188 
189 #endif
190 
191 //
192 //
193 //
194 
195 #ifdef NDEBUG
196 
197 #define HS_TRACE(k,g,l)
198 
199 #else
200 
201 #define HS_KERNEL_NAME_MAX 20
202 
203 static
204 void
hs_trace(cl_kernel kernel,uint32_t const dim,size_t const * const global_work_size)205 hs_trace(cl_kernel            kernel,
206                 uint32_t       const dim,
207                 size_t const * const global_work_size)
208 {
209   if (kernel == NULL)
210     return;
211 
212   char name[HS_KERNEL_NAME_MAX];
213 
214   cl(GetKernelInfo(kernel,CL_KERNEL_FUNCTION_NAME,HS_KERNEL_NAME_MAX,name,NULL));
215 
216   fprintf(stderr,"%-19s ( %6zu, %6zu, %6zu )\n",
217           name,
218           global_work_size[0],
219           dim < 2 ? 0 : global_work_size[1],
220           dim < 3 ? 0 : global_work_size[2]);
221 }
222 
223 #define HS_TRACE(k,d,g)  hs_trace(k,d,g)
224 
225 #endif
226 
227 //
228 //
229 //
230 
231 static
232 void
hs_transpose(struct hs_cl const * const hs,struct hs_state * const state)233 hs_transpose(struct hs_cl const * const hs,
234              struct hs_state    * const state)
235 {
236   size_t const size[1] = { state->bx_ru << hs->config.slab.threads_log2 };
237   cl_kernel    kernel  = hs->kernels.transpose[0];
238 
239   HS_TRACE(kernel,1,size);
240 
241   //
242   // The transpose kernel operates on a single slab.  For now, let's
243   // rely on the driver to choose a workgroup size.
244   //
245   // size_t local_work_size[1] = { HS_SLAB_THREADS };
246   //
247   cl(SetKernelArg(kernel,0,sizeof(state->vout),&state->vout));
248 
249   cl_event wait_list_out[1];
250 
251   cl(EnqueueNDRangeKernel(state->cq,
252                           kernel,
253                           1,
254                           NULL,
255                           size,
256                           NULL,
257                           state->wait_list_size,
258                           state->wait_list,
259                           wait_list_out));
260 
261   hs_state_wait_list_release(state);
262   hs_state_wait_list_update(state,1,wait_list_out);
263 
264   HS_STATE_WAIT_LIST_PROFILE(state);
265 }
266 
267 //
268 //
269 //
270 
271 static
272 void
hs_hm_enqueue(struct hs_cl const * const hs,struct hs_state * const state,uint32_t const scale_log2,uint32_t const spans,uint32_t const span_threads)273 hs_hm_enqueue(struct hs_cl const * const hs,
274               struct hs_state    * const state,
275               uint32_t             const scale_log2,
276               uint32_t             const spans,
277               uint32_t             const span_threads)
278 {
279   //
280   // Note that some platforms might need to use .z on large grids
281   //
282   size_t const size[3] = { span_threads, spans, 1 };
283   cl_kernel    kernel  = hs->kernels.hm[scale_log2][0];
284 
285   HS_TRACE(kernel,3,size);
286 
287   cl(SetKernelArg(kernel,0,sizeof(state->vout),&state->vout));
288 
289   cl_event wait_list_out[1];
290 
291   cl(EnqueueNDRangeKernel(state->cq,
292                           kernel,
293                           3,
294                           NULL,
295                           size,
296                           NULL,
297                           state->wait_list_size,
298                           state->wait_list,
299                           wait_list_out));
300 
301   hs_state_wait_list_release(state);
302   hs_state_wait_list_update(state,1,wait_list_out);
303 
304   HS_STATE_WAIT_LIST_PROFILE(state);
305 }
306 
307 //
308 //
309 //
310 
311 static
312 uint32_t
hs_hm(struct hs_cl const * const hs,struct hs_state * const state,uint32_t const down_slabs,uint32_t const clean_slabs_log2)313 hs_hm(struct hs_cl const * const hs,
314       struct hs_state    * const state,
315       uint32_t             const down_slabs,
316       uint32_t             const clean_slabs_log2)
317 {
318   // how many scaled half-merge spans are there?
319   uint32_t const frac_ru    = (1 << clean_slabs_log2) - 1;
320   uint32_t const spans      = (down_slabs + frac_ru) >> clean_slabs_log2;
321 
322   // for now, just clamp to the max
323   uint32_t const log2_rem   = clean_slabs_log2 - hs->bc_slabs_log2_max;
324   uint32_t const scale_log2 = MIN_MACRO(hs->config.merge.hm.scale_max,log2_rem);
325   uint32_t const log2_out   = log2_rem - scale_log2;
326 
327   // size the grid
328   uint32_t const span_threads = hs->slab_keys << log2_out;
329 
330   // launch the hm kernel
331   hs_hm_enqueue(hs,
332                 state,
333                 scale_log2,
334                 spans,
335                 span_threads);
336 
337   return log2_out;
338 }
339 
340 //
341 //
342 //
343 
344 static
345 void
hs_bc_enqueue(struct hs_cl const * const hs,struct hs_state * const state,uint32_t const full,uint32_t const clean_slabs_log2)346 hs_bc_enqueue(struct hs_cl const * const hs,
347               struct hs_state    * const state,
348               uint32_t             const full,
349               uint32_t             const clean_slabs_log2)
350 {
351   size_t const size[1] = { full << hs->config.slab.threads_log2 };
352   cl_kernel    kernel  = hs->kernels.bc[clean_slabs_log2];
353 
354   HS_TRACE(kernel,1,size);
355 
356   cl(SetKernelArg(kernel,0,sizeof(state->vout),&state->vout));
357 
358   cl_event wait_list_out[1];
359 
360   cl(EnqueueNDRangeKernel(state->cq,
361                           kernel,
362                           1,
363                           NULL,
364                           size,
365                           NULL,
366                           state->wait_list_size,
367                           state->wait_list,
368                           wait_list_out));
369 
370   hs_state_wait_list_release(state);
371   hs_state_wait_list_update(state,1,wait_list_out);
372 
373   HS_STATE_WAIT_LIST_PROFILE(state);
374 }
375 
376 //
377 //
378 //
379 
380 static
381 void
hs_bc(struct hs_cl const * const hs,struct hs_state * const state,uint32_t const down_slabs,uint32_t const clean_slabs_log2)382 hs_bc(struct hs_cl const * const hs,
383       struct hs_state    * const state,
384       uint32_t             const down_slabs,
385       uint32_t             const clean_slabs_log2)
386 {
387   // block clean the minimal number of down_slabs_log2 spans
388   uint32_t const frac_ru = (1u << clean_slabs_log2) - 1;
389   uint32_t const full    = (down_slabs + frac_ru) & ~frac_ru;
390 
391   // we better be capable of cleaning at least two warps !!!
392   hs_bc_enqueue(hs,state,full,clean_slabs_log2);
393 }
394 
395 //
396 // FIXME -- some of this logic can be skipped if BS is a power-of-two
397 //
398 
399 static
400 void
hs_fm_enqueue(struct hs_cl const * const hs,struct hs_state * const state,uint32_t const scale_log2,uint32_t const fm_full,uint32_t const fm_frac,uint32_t const span_threads)401 hs_fm_enqueue(struct hs_cl const * const hs,
402               struct hs_state    * const state,
403               uint32_t             const scale_log2,
404               uint32_t             const fm_full,
405               uint32_t             const fm_frac,
406               uint32_t             const span_threads)
407 {
408   //
409   // Note that some platforms might need to use .z on large grids
410   //
411   uint32_t wait_list_out_size = 0;
412   cl_event wait_list_out[2];
413 
414   if (fm_full > 0)
415     {
416       size_t const size_full[3] = { span_threads, fm_full, 1 };
417       cl_kernel    kernel_full  = hs->kernels.fm[scale_log2][hs->bs_slabs_log2_ru-1+scale_log2];
418 
419       HS_TRACE(kernel_full,3,size_full);
420 
421       cl(SetKernelArg(kernel_full,0,sizeof(state->vout),&state->vout));
422 
423       cl(EnqueueNDRangeKernel(state->cq,
424                               kernel_full,
425                               3,
426                               NULL,
427                               size_full,
428                               NULL,
429                               state->wait_list_size,
430                               state->wait_list,
431                               wait_list_out+wait_list_out_size++));
432     }
433 
434   if (fm_frac > 0)
435     {
436       size_t const offset_frac[3] = { 0,            fm_full, 0 };
437       size_t const size_frac  [3] = { span_threads, 1,       1 };
438       cl_kernel    kernel_frac    = hs->kernels.fm[scale_log2][msb_idx_u32(fm_frac)];
439 
440       HS_TRACE(kernel_frac,3,size_frac);
441 
442       cl(SetKernelArg(kernel_frac,0,sizeof(state->vout),&state->vout));
443 
444       cl(EnqueueNDRangeKernel(state->cq,
445                               kernel_frac,
446                               3,
447                               offset_frac,
448                               size_frac,
449                               NULL,
450                               state->wait_list_size,
451                               state->wait_list,
452                               wait_list_out+wait_list_out_size++));
453     }
454 
455   hs_state_wait_list_release(state);
456   hs_state_wait_list_update(state,wait_list_out_size,wait_list_out);
457 
458   HS_STATE_WAIT_LIST_PROFILE(state);
459 }
460 
461 //
462 //
463 //
464 
465 static
466 uint32_t
hs_fm(struct hs_cl const * const hs,struct hs_state * const state,uint32_t * const down_slabs,uint32_t const up_scale_log2)467 hs_fm(struct hs_cl const * const hs,
468       struct hs_state    * const state,
469       uint32_t           * const down_slabs,
470       uint32_t             const up_scale_log2)
471 {
472   //
473   // FIXME OPTIMIZATION: in previous HotSort launchers it's sometimes
474   // a performance win to bias toward launching the smaller flip merge
475   // kernel in order to get more warps in flight (increased
476   // occupancy).  This is useful when merging small numbers of slabs.
477   //
478   // Note that HS_FM_SCALE_MIN will always be 0 or 1.
479   //
480   // So, for now, just clamp to the max until there is a reason to
481   // restore the fancier and probably low-impact approach.
482   //
483   uint32_t const scale_log2 = MIN_MACRO(hs->config.merge.fm.scale_max,up_scale_log2);
484   uint32_t const clean_log2 = up_scale_log2 - scale_log2;
485 
486   // number of slabs in a full-sized scaled flip-merge span
487   uint32_t const full_span_slabs = hs->config.block.slabs << up_scale_log2;
488 
489   // how many full-sized scaled flip-merge spans are there?
490   uint32_t fm_full = state->bx_ru / full_span_slabs;
491   uint32_t fm_frac = 0;
492 
493   // initialize down_slabs
494   *down_slabs = fm_full * full_span_slabs;
495 
496   // how many half-size scaled + fractional scaled spans are there?
497   uint32_t const span_rem        = state->bx_ru - *down_slabs;
498   uint32_t const half_span_slabs = full_span_slabs >> 1;
499 
500   // if we have over a half-span then fractionally merge it
501   if (span_rem > half_span_slabs)
502     {
503       // the remaining slabs will be cleaned
504       *down_slabs += span_rem;
505 
506       uint32_t const frac_rem      = span_rem - half_span_slabs;
507       uint32_t const frac_rem_pow2 = pow2_ru_u32(frac_rem);
508 
509       if (frac_rem_pow2 >= half_span_slabs)
510         {
511           // bump it up to a full span
512           fm_full += 1;
513         }
514       else
515         {
516           // otherwise, add fractional
517           fm_frac  = MAX_MACRO(1,frac_rem_pow2 >> clean_log2);
518         }
519     }
520 
521   // size the grid
522   uint32_t const span_threads = hs->slab_keys << clean_log2;
523 
524   //
525   // launch the fm kernel
526   //
527   hs_fm_enqueue(hs,
528                 state,
529                 scale_log2,
530                 fm_full,
531                 fm_frac,
532                 span_threads);
533 
534   return clean_log2;
535 }
536 
537 //
538 //
539 //
540 
541 static
542 void
hs_bs_enqueue(struct hs_cl const * const hs,struct hs_state * const state,uint32_t const full,uint32_t const frac,uint32_t const wait_list_size,cl_event * wait_list)543 hs_bs_enqueue(struct hs_cl const * const hs,
544               struct hs_state    * const state,
545               uint32_t             const full,
546               uint32_t             const frac,
547               uint32_t             const wait_list_size,
548               cl_event           *       wait_list)
549 {
550   uint32_t wait_list_out_size = 0;
551   cl_event wait_list_out[2];
552 
553   if (full > 0)
554     {
555       size_t const size_full[1] = { full << hs->config.slab.threads_log2 };
556       cl_kernel    kernel_full  = hs->kernels.bs[hs->bs_slabs_log2_ru];
557 
558       HS_TRACE(kernel_full,1,size_full);
559 
560       cl(SetKernelArg(kernel_full,0,sizeof(state->vin), &state->vin));
561       cl(SetKernelArg(kernel_full,1,sizeof(state->vout),&state->vout));
562 
563       cl(EnqueueNDRangeKernel(state->cq,
564                               kernel_full,
565                               1,
566                               NULL,
567                               size_full,
568                               NULL,
569                               wait_list_size,
570                               wait_list,
571                               wait_list_out+wait_list_out_size++));
572     }
573 
574   if (frac > 0)
575     {
576       size_t const offset_frac[1] = { full << hs->config.slab.threads_log2 };
577       size_t const size_frac  [1] = { frac << hs->config.slab.threads_log2 };
578       cl_kernel    kernel_frac    = hs->kernels.bs[msb_idx_u32(frac)];
579 
580       HS_TRACE(kernel_frac,1,size_frac);
581 
582       cl(SetKernelArg(kernel_frac,0,sizeof(state->vin), &state->vin));
583       cl(SetKernelArg(kernel_frac,1,sizeof(state->vout),&state->vout));
584 
585       cl(EnqueueNDRangeKernel(state->cq,
586                               kernel_frac,
587                               1,
588                               offset_frac,
589                               size_frac,
590                               NULL,
591                               wait_list_size,
592                               wait_list,
593                               wait_list_out+wait_list_out_size++));
594     }
595 
596   hs_state_wait_list_release(state);
597   hs_state_wait_list_update(state,wait_list_out_size,wait_list_out);
598 
599   HS_STATE_WAIT_LIST_PROFILE(state);
600 }
601 
602 //
603 //
604 //
605 
606 static
607 void
hs_bs(struct hs_cl const * const hs,struct hs_state * const state,uint32_t const count_padded_in,uint32_t const wait_list_size,cl_event * wait_list)608 hs_bs(struct hs_cl const * const hs,
609       struct hs_state    * const state,
610       uint32_t             const count_padded_in,
611       uint32_t             const wait_list_size,
612       cl_event           *       wait_list)
613 {
614   uint32_t const slabs_in = count_padded_in / hs->slab_keys;
615   uint32_t const full     = (slabs_in / hs->config.block.slabs) * hs->config.block.slabs;
616   uint32_t const frac     = slabs_in - full;
617 
618   hs_bs_enqueue(hs,state,
619                 full,frac,
620                 wait_list_size,wait_list);
621 }
622 
623 //
624 //
625 //
626 
627 static
628 void
hs_keyset_pre_sort(struct hs_cl const * const hs,struct hs_state * const state,uint32_t const count,uint32_t const count_hi,uint32_t const wait_list_size,cl_event * wait_list,cl_event * event)629 hs_keyset_pre_sort(struct hs_cl const * const hs,
630                    struct hs_state    * const state,
631                    uint32_t             const count,
632                    uint32_t             const count_hi,
633                    uint32_t             const wait_list_size,
634                    cl_event           *       wait_list,
635                    cl_event           *       event)
636 {
637   uint32_t const vin_span = count_hi - count;
638   uint32_t const pattern  = UINT32_MAX;
639 
640   cl(EnqueueFillBuffer(state->cq,
641                        state->vin,
642                        &pattern,
643                        sizeof(pattern),
644                        count    * hs->key_val_size,
645                        vin_span * hs->key_val_size,
646                        wait_list_size,
647                        wait_list,
648                        event));
649 
650   HS_STATE_WAIT_LIST_PROFILE_EX(state,1,event);
651 }
652 
653 //
654 //
655 //
656 
657 static
658 void
hs_keyset_pre_merge(struct hs_cl const * const hs,struct hs_state * const state,uint32_t const count_lo,uint32_t const count_hi,uint32_t const wait_list_size,cl_event * wait_list)659 hs_keyset_pre_merge(struct hs_cl const * const hs,
660                     struct hs_state    * const state,
661                     uint32_t             const count_lo,
662                     uint32_t             const count_hi,
663                     uint32_t             const wait_list_size,
664                     cl_event           *       wait_list)
665 {
666   uint32_t const vout_span = count_hi - count_lo;
667   uint32_t const pattern   = UINT32_MAX;
668 
669   // appends event to incoming wait list
670   cl(EnqueueFillBuffer(state->cq,
671                        state->vout,
672                        &pattern,
673                        sizeof(pattern),
674                        count_lo  * hs->key_val_size,
675                        vout_span * hs->key_val_size,
676                        wait_list_size,
677                        wait_list,
678                        state->wait_list+state->wait_list_size++));
679 
680   HS_STATE_WAIT_LIST_PROFILE(state);
681 }
682 
683 //
684 // We want concurrent kernel execution to occur in a few places.
685 //
686 // The summary is:
687 //
688 //   1) If necessary, some max valued keys are written to the end of
689 //      the vin/vout buffers.
690 //
691 //   2) Blocks of slabs of keys are sorted.
692 //
693 //   3) If necesary, the blocks of slabs are merged until complete.
694 //
695 //   4) If requested, the slabs will be converted from slab ordering
696 //      to linear ordering.
697 //
698 // Below is the general "happens-before" relationship between HotSort
699 // compute kernels.
700 //
701 // Note the diagram assumes vin and vout are different buffers.  If
702 // they're not, then the first merge doesn't include the pad_vout
703 // event in the wait list.
704 //
705 //                    +----------+            +---------+
706 //                    | pad_vout |            | pad_vin |
707 //                    +----+-----+            +----+----+
708 //                         |                       |
709 //                         |                WAITFOR(pad_vin)
710 //                         |                       |
711 //                         |                 +-----v-----+
712 //                         |                 |           |
713 //                         |            +----v----+ +----v----+
714 //                         |            | bs_full | | bs_frac |
715 //                         |            +----+----+ +----+----+
716 //                         |                 |           |
717 //                         |                 +-----v-----+
718 //                         |                       |
719 //                         |  +------NO------JUST ONE BLOCK?
720 //                         | /                     |
721 //                         |/                     YES
722 //                         +                       |
723 //                         |                       v
724 //                         |         END_WITH_EVENTS(bs_full,bs_frac)
725 //                         |
726 //                         |
727 //        WAITFOR(pad_vout,bs_full,bs_frac) >>> first iteration of loop <<<
728 //                         |
729 //                         |
730 //                         +-----------<------------+
731 //                         |                        |
732 //                   +-----v-----+                  |
733 //                   |           |                  |
734 //              +----v----+ +----v----+             |
735 //              | fm_full | | fm_frac |             |
736 //              +----+----+ +----+----+             |
737 //                   |           |                  ^
738 //                   +-----v-----+                  |
739 //                         |                        |
740 //              WAITFOR(fm_full,fm_frac)            |
741 //                         |                        |
742 //                         v                        |
743 //                      +--v--+                WAITFOR(bc)
744 //                      | hm  |                     |
745 //                      +-----+                     |
746 //                         |                        |
747 //                    WAITFOR(hm)                   |
748 //                         |                        ^
749 //                      +--v--+                     |
750 //                      | bc  |                     |
751 //                      +-----+                     |
752 //                         |                        |
753 //                         v                        |
754 //                  MERGING COMPLETE?-------NO------+
755 //                         |
756 //                        YES
757 //                         |
758 //                         v
759 //                END_WITH_EVENTS(bc)
760 //
761 
762 void
hs_cl_sort(struct hs_cl const * const hs,cl_command_queue cq,uint32_t const wait_list_size,cl_event * wait_list,cl_event * event,cl_mem vin,cl_mem vout,uint32_t const count,uint32_t const count_padded_in,uint32_t const count_padded_out,bool const linearize)763 hs_cl_sort(struct hs_cl const * const hs,
764            cl_command_queue           cq,
765            uint32_t             const wait_list_size,
766            cl_event           *       wait_list,
767            cl_event           *       event,
768            cl_mem                     vin,
769            cl_mem                     vout,
770            uint32_t             const count,
771            uint32_t             const count_padded_in,
772            uint32_t             const count_padded_out,
773            bool                 const linearize)
774 {
775   // is this sort in place?
776   bool const is_in_place = (vout == NULL);
777 
778   // cq, buffers, wait list and slab count
779   struct hs_state state = {
780 #ifndef NDEBUG
781     .t_total        = 0,
782 #endif
783     .cq             = cq,
784     .vin            = vin,
785     .vout           = is_in_place ? vin : vout,
786     .wait_list_size = 0,
787     .bx_ru          = (count + hs->slab_keys - 1) / hs->slab_keys
788   };
789 
790   // initialize vin
791   uint32_t const count_hi                = is_in_place ? count_padded_out : count_padded_in;
792   bool     const is_pre_sort_keyset_reqd = count_hi > count;
793   cl_event       event_keyset_pre_sort[1];
794 
795   // initialize any trailing keys in vin before sorting
796   if (is_pre_sort_keyset_reqd)
797     {
798       hs_keyset_pre_sort(hs,&state,
799                          count,count_hi,
800                          wait_list_size,wait_list,
801                          event_keyset_pre_sort);
802     }
803 
804   // initialize any trailing keys in vout before merging
805   if (!is_in_place && (count_padded_out > count_padded_in))
806     {
807       hs_keyset_pre_merge(hs,&state,
808                           count_padded_in,count_padded_out,
809                           wait_list_size,wait_list);
810     }
811 
812   //
813   // sort blocks of slabs
814   //
815   hs_bs(hs,&state,
816         count_padded_in,
817         is_pre_sort_keyset_reqd ? 1                     : wait_list_size,
818         is_pre_sort_keyset_reqd ? event_keyset_pre_sort : wait_list);
819 
820   // release the event
821   if (is_pre_sort_keyset_reqd)
822     cl(ReleaseEvent(event_keyset_pre_sort[0]));
823 
824   //
825   // we're done if this was a single bs block...
826   //
827   // otherwise, merge sorted spans of slabs until done
828   //
829   if (state.bx_ru > hs->config.block.slabs)
830     {
831       int32_t up_scale_log2 = 1;
832 
833       while (true)
834         {
835           uint32_t down_slabs;
836 
837           // flip merge slabs -- return span of slabs that must be cleaned
838           uint32_t clean_slabs_log2 = hs_fm(hs,&state,
839                                             &down_slabs,
840                                             up_scale_log2);
841 
842           // if span is gt largest slab block cleaner then half merge
843           while (clean_slabs_log2 > hs->bc_slabs_log2_max)
844             {
845               clean_slabs_log2 = hs_hm(hs,&state,
846                                        down_slabs,
847                                        clean_slabs_log2);
848             }
849 
850           // launch clean slab grid -- is it the final launch?
851           hs_bc(hs,&state,down_slabs,clean_slabs_log2);
852 
853           // was this the final block clean?
854           if (((uint32_t)hs->config.block.slabs << up_scale_log2) >= state.bx_ru)
855             break;
856 
857           // otherwise, merge twice as many slabs
858           up_scale_log2 += 1;
859         }
860     }
861 
862   // slabs or linear?
863   if (linearize) {
864     hs_transpose(hs,&state);
865   }
866 
867   // does the caller want the final event?
868   if (event != NULL) {
869     *event = state.wait_list[0];
870   } else {
871     cl(ReleaseEvent(state.wait_list[0]));
872   }
873 }
874 
875 //
876 // all grids will be computed as a function of the minimum number of slabs
877 //
878 
879 void
hs_cl_pad(struct hs_cl const * const hs,uint32_t const count,uint32_t * const count_padded_in,uint32_t * const count_padded_out)880 hs_cl_pad(struct hs_cl const * const hs,
881           uint32_t             const count,
882           uint32_t           * const count_padded_in,
883           uint32_t           * const count_padded_out)
884 {
885   //
886   // round up the count to slabs
887   //
888   uint32_t const slabs_ru        = (count + hs->slab_keys - 1) / hs->slab_keys;
889   uint32_t const blocks          = slabs_ru / hs->config.block.slabs;
890   uint32_t const block_slabs     = blocks * hs->config.block.slabs;
891   uint32_t const slabs_ru_rem    = slabs_ru - block_slabs;
892   uint32_t const slabs_ru_rem_ru = MIN_MACRO(pow2_ru_u32(slabs_ru_rem),hs->config.block.slabs);
893 
894   *count_padded_in  = (block_slabs + slabs_ru_rem_ru) * hs->slab_keys;
895   *count_padded_out = *count_padded_in;
896 
897   //
898   // will merging be required?
899   //
900   if (slabs_ru > hs->config.block.slabs)
901     {
902       // more than one block
903       uint32_t const blocks_lo       = pow2_rd_u32(blocks);
904       uint32_t const block_slabs_lo  = blocks_lo * hs->config.block.slabs;
905       uint32_t const block_slabs_rem = slabs_ru - block_slabs_lo;
906 
907       if (block_slabs_rem > 0)
908         {
909           uint32_t const block_slabs_rem_ru     = pow2_ru_u32(block_slabs_rem);
910 
911           uint32_t const block_slabs_hi         = MAX_MACRO(block_slabs_rem_ru,
912                                                             blocks_lo << (1 - hs->config.merge.fm.scale_min));
913 
914           uint32_t const block_slabs_padded_out = MIN_MACRO(block_slabs_lo+block_slabs_hi,
915                                                             block_slabs_lo*2); // clamp non-pow2 blocks
916 
917           *count_padded_out = block_slabs_padded_out * hs->slab_keys;
918         }
919     }
920 }
921 
922 //
923 //
924 //
925 
926 static
927 void
hs_create_kernel(cl_program program,cl_kernel * const kernel,char const * const name)928 hs_create_kernel(cl_program         program,
929                  cl_kernel  * const kernel,
930                  char const * const name)
931 {
932   cl_int err;
933 
934   *kernel = clCreateKernel(program,name,&err);
935 
936   cl_ok(err);
937 }
938 
939 static
940 void
hs_create_kernels(cl_program program,cl_kernel * kernels,char name_template[],size_t const name_template_size,uint32_t const count)941 hs_create_kernels(cl_program     program,
942                   cl_kernel    * kernels,
943                   char           name_template[],
944                   size_t   const name_template_size,
945                   uint32_t const count)
946 {
947   char const n_max = '0'+(char)count;
948 
949   for (char n = '0'; n<n_max; n++)
950     {
951       cl_int err;
952 
953       name_template[name_template_size-2] = n;
954 
955       *kernels++ = clCreateKernel(program,name_template,&err);
956 
957       cl_ok(err);
958     }
959 }
960 
961 //
962 //
963 //
964 
965 struct hs_cl *
hs_cl_create(struct hs_cl_target const * const target,cl_context context,cl_device_id device_id)966 hs_cl_create(struct hs_cl_target const * const target,
967              cl_context                        context,
968              cl_device_id                      device_id)
969 {
970   //
971   // immediately try to build the OpenCL program
972   //
973   bool     const is_binary    = (target->program[0] == 0);
974   uint32_t const program_size = NPBTOHL_MACRO(target->program+1);
975 
976   cl_program program;
977 
978   if (is_binary) // program is a binary
979     {
980       cl_int status, err;
981 
982       size_t        const   bins_sizeof[] = { program_size      };
983       unsigned char const * bins[]        = { target->program+5 };
984 
985       program = clCreateProgramWithBinary(context,
986                                           1,
987                                           &device_id,
988                                           bins_sizeof,
989                                           bins,
990                                           &status,
991                                           &err);
992       cl_ok(err);
993 
994       fprintf(stdout,"Building binary... ");
995 
996       fflush(stdout);
997 
998       cl(BuildProgram(program,
999                       1,
1000                       &device_id,
1001                       NULL,
1002                       NULL,
1003                       NULL));
1004     }
1005   else // program is source code
1006     {
1007       cl_int err;
1008 
1009       size_t const   strings_sizeof[] = { program_size             };
1010       char   const * strings[]        = { (char*)target->program+5 };
1011 
1012       program = clCreateProgramWithSource(context,
1013                                           1,
1014                                           strings,
1015                                           strings_sizeof,
1016                                           &err);
1017       cl_ok(err);
1018 
1019       char const * const options =
1020         "-cl-std=CL1.2 -cl-fast-relaxed-math "
1021         "-cl-no-signed-zeros -cl-mad-enable "
1022         "-cl-denorms-are-zero "
1023         "-cl-kernel-arg-info";
1024 
1025       fprintf(stdout,"Building source... ");
1026 
1027       fflush(stdout);
1028 
1029       cl(BuildProgram(program,
1030                       1,
1031                       &device_id,
1032                       options,
1033                       NULL,
1034                       NULL));
1035     }
1036 
1037   //
1038   // we reference these values a lot
1039   //
1040   uint32_t const bs_slabs_log2_ru  = msb_idx_u32(pow2_ru_u32(target->config.block.slabs));
1041   uint32_t const bc_slabs_log2_max = msb_idx_u32(pow2_rd_u32(target->config.block.slabs));
1042 
1043   //
1044   // how many kernels will be created?
1045   //
1046   uint32_t const count_bs    = bs_slabs_log2_ru + 1;
1047   uint32_t const count_bc    = bc_slabs_log2_max + 1;
1048   uint32_t       count_fm[3] = { 0 };
1049   uint32_t       count_hm[3] = { 0 };
1050 
1051   // guaranteed to be in range [0,2]
1052   for (uint32_t scale = target->config.merge.fm.scale_min;
1053        scale <= target->config.merge.fm.scale_max;
1054        scale++)
1055     {
1056       uint32_t fm_left = (target->config.block.slabs / 2) << scale;
1057 
1058       count_fm[scale] = msb_idx_u32(pow2_ru_u32(fm_left)) + 1;
1059     }
1060 
1061   // guaranteed to be in range [0,2]
1062   for (uint32_t scale = target->config.merge.hm.scale_min;
1063        scale <= target->config.merge.hm.scale_max;
1064        scale++)
1065     {
1066       count_hm[scale] = 1;
1067     }
1068 
1069   uint32_t const count_all =
1070     1
1071     + count_bs
1072     + count_bc
1073     + count_fm[0] + count_fm[1] + count_fm[2]
1074     + count_hm[0] + count_hm[1] + count_hm[2];
1075 
1076   //
1077   // allocate hs_cl
1078   //
1079   struct hs_cl * hs = malloc(sizeof(*hs) + sizeof(cl_kernel) * count_all);
1080 
1081   memcpy(&hs->config,&target->config,sizeof(hs->config));
1082 
1083   // save some frequently used calculated values
1084   hs->key_val_size      = (target->config.words.key + target->config.words.val) * 4;
1085   hs->slab_keys         = target->config.slab.height << target->config.slab.width_log2;
1086   hs->bs_slabs_log2_ru  = bs_slabs_log2_ru;
1087   hs->bc_slabs_log2_max = bc_slabs_log2_max;
1088 
1089   // save kernel count
1090   hs->kernels.count     = count_all;
1091 
1092   //
1093   // create all the kernels and release the program
1094   //
1095   cl_kernel * kernel_next = hs->kernels.all;
1096 
1097   //
1098   // BS
1099   //
1100   {
1101     hs->kernels.bs = kernel_next;
1102 
1103     char bs_name[] = { "hs_kernel_bs_X" };
1104 
1105     hs_create_kernels(program,
1106                       kernel_next,
1107                       bs_name,sizeof(bs_name),
1108                       count_bs);
1109 
1110     kernel_next += count_bs;
1111   }
1112 
1113   //
1114   // BC
1115   //
1116   {
1117     hs->kernels.bc = kernel_next;
1118 
1119     char bc_name[] = { "hs_kernel_bc_X" };
1120 
1121     hs_create_kernels(program,
1122                       kernel_next,
1123                       bc_name,sizeof(bc_name),
1124                       count_bc);
1125 
1126     kernel_next += count_bc;
1127   }
1128 
1129   //
1130   // FM
1131   //
1132   if (count_fm[0] > 0)
1133     {
1134       hs->kernels.fm[0] = kernel_next;
1135 
1136       char fm_0_name[]  = { "hs_kernel_fm_0_X" };
1137 
1138       hs_create_kernels(program,
1139                         kernel_next,
1140                         fm_0_name,sizeof(fm_0_name),
1141                         count_fm[0]);
1142 
1143       kernel_next += count_fm[0];
1144     }
1145 
1146   if (count_fm[1] > 0)
1147     {
1148       hs->kernels.fm[1] = kernel_next;
1149 
1150       char fm_1_name[]  = { "hs_kernel_fm_1_X" };
1151 
1152       hs_create_kernels(program,
1153                         kernel_next,
1154                         fm_1_name,sizeof(fm_1_name),
1155                         count_fm[1]);
1156 
1157       kernel_next += count_fm[1];
1158     }
1159 
1160   if (count_fm[2] > 0)
1161     {
1162       hs->kernels.fm[2] = kernel_next;
1163 
1164       char fm_2_name[]  = { "hs_kernel_fm_2_X" };
1165 
1166       hs_create_kernels(program,
1167                         kernel_next,
1168                         fm_2_name,sizeof(fm_2_name),
1169                         count_fm[2]);
1170 
1171       kernel_next += count_fm[2];
1172     }
1173 
1174   //
1175   // HM
1176   //
1177   if (count_hm[0] > 0)
1178     {
1179       hs->kernels.hm[0] = kernel_next;
1180 
1181       hs_create_kernel(program,
1182                        kernel_next,
1183                        "hs_kernel_hm_0");
1184 
1185       kernel_next += count_hm[0];
1186     }
1187 
1188   if (count_hm[1] > 0)
1189     {
1190       hs->kernels.hm[1] = kernel_next;
1191 
1192       hs_create_kernel(program,
1193                        kernel_next,
1194                        "hs_kernel_hm_1");
1195 
1196       kernel_next += count_hm[1];
1197     }
1198 
1199   if (count_hm[2] > 0)
1200     {
1201       hs->kernels.hm[2] = kernel_next;
1202 
1203       hs_create_kernel(program,
1204                        kernel_next,
1205                        "hs_kernel_hm_2");
1206 
1207       kernel_next += count_hm[2]; // unnecessary
1208     }
1209 
1210   //
1211   // TRANSPOSE
1212   //
1213   {
1214     hs->kernels.transpose = kernel_next;
1215 
1216     hs_create_kernel(program,
1217                      kernel_next,
1218                      "hs_kernel_transpose");
1219 
1220     kernel_next += 1;
1221   }
1222 
1223   return hs;
1224 }
1225 
1226 //
1227 //
1228 //
1229 
1230 void
hs_cl_release(struct hs_cl * const hs)1231 hs_cl_release(struct hs_cl * const hs)
1232 {
1233   for (uint32_t ii=0; ii<hs->kernels.count; ii++)
1234     cl(ReleaseKernel(hs->kernels.all[ii]));
1235 
1236   free(hs);
1237 }
1238 
1239 //
1240 //
1241 //
1242