1 /*
2  * Copyright 2016 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can
5  * be found in the LICENSE file.
6  *
7  */
8 
9 //
10 //
11 //
12 
13 #include <stdlib.h>
14 #include <stdio.h>
15 #include <string.h>
16 #include <inttypes.h>
17 #include <float.h>
18 #include <stdbool.h>
19 
20 //
21 //
22 //
23 
24 #include <cuda_runtime.h>
25 
26 //
27 //
28 //
29 
30 #include "common/cuda/assert_cuda.h"
31 #include "common/macros.h"
32 
33 //
34 //
35 //
36 
37 #include "hs/cuda/sm_35/u32/hs_cuda.h"
38 #include "hs/cuda/sm_35/u64/hs_cuda.h"
39 
40 //
41 // PFNs to select between different key widths
42 //
43 
44 typedef void (*hs_cuda_info_pfn)(uint32_t * const key_words,
45                                  uint32_t * const val_words,
46                                  uint32_t * const slab_height,
47                                  uint32_t * const slab_width_log2);
48 
49 typedef void (*hs_cuda_pad_pfn)(uint32_t   const count,
50                                 uint32_t * const count_padded_in,
51                                 uint32_t * const count_padded_out);
52 
53 typedef void (*hs_cuda_sort_pfn)(void *   const vin,
54                                  void *   const vout,
55                                  uint32_t const count,
56                                  uint32_t const count_padded_in,
57                                  uint32_t const count_padded_out,
58                                  bool     const linearize,
59                                  cudaStream_t   stream0,
60                                  cudaStream_t   stream1,
61                                  cudaStream_t   stream2);
62 
63 //
64 // The quality of the RNG doesn't matter.  The same number of
65 // instructions will be run no matter what the key distribution looks
66 // like.  So here is something small and fast.
67 //
68 
69 static
70 uint32_t
hs_rand_u32()71 hs_rand_u32()
72 {
73   static uint32_t seed = 0xDEADBEEF;
74 
75   // Numerical Recipes
76   seed = seed * 1664525 + 1013904223;
77 
78   return seed;
79 }
80 
81 //
82 //
83 //
84 
85 static
86 void
hs_fill_rand(uint32_t * vin_h,uint32_t const count,uint32_t const words)87 hs_fill_rand(uint32_t * vin_h, uint32_t const count, uint32_t const words)
88 {
89 #if   1
90   for (uint32_t ii=0; ii<count*words; ii++)
91     vin_h[ii] = hs_rand_u32();
92 #elif 0 // in-order
93   memset(vin_h,0,count*words*sizeof(uint32_t));
94   for (uint32_t ii=0; ii<count; ii++)
95     vin_h[ii*words] = ii;
96 #else   // reverse order
97   memset(vin_h,0,count*words*sizeof(uint32_t));
98   for (uint32_t ii=0; ii<count; ii++)
99     vin_h[ii*words] = count - 1 - ii;
100 #endif
101 }
102 
103 //
104 //
105 //
106 
107 char const * hs_cpu_sort_u32(uint32_t * a, uint32_t const count, double * const cpu_ns);
108 char const * hs_cpu_sort_u64(uint64_t * a, uint32_t const count, double * const cpu_ns);
109 
110 //
111 //
112 //
113 
114 static
115 char const *
hs_cpu_sort(void * sorted_h,uint32_t const hs_words,uint32_t const count,double * const cpu_ns)116 hs_cpu_sort(void     *       sorted_h,
117             uint32_t   const hs_words,
118             uint32_t   const count,
119             double   * const cpu_ns)
120 {
121   if (hs_words == 1)
122     return hs_cpu_sort_u32(sorted_h,count,cpu_ns);
123   else
124     return hs_cpu_sort_u64(sorted_h,count,cpu_ns);
125 }
126 
127 static
128 bool
hs_verify_linear(uint32_t const hs_words,void * sorted_h,void * vout_h,uint32_t const count)129 hs_verify_linear(uint32_t const hs_words,
130                  void   *       sorted_h,
131                  void   *       vout_h,
132                  uint32_t const count)
133 {
134   return memcmp(sorted_h, vout_h, sizeof(uint32_t) * hs_words * count) == 0;
135 }
136 
137 static
138 void
hs_transpose_slabs_u32(uint32_t const hs_words,uint32_t const hs_width,uint32_t const hs_height,uint32_t * vout_h,uint32_t const count)139 hs_transpose_slabs_u32(uint32_t const hs_words,
140                        uint32_t const hs_width,
141                        uint32_t const hs_height,
142                        uint32_t *     vout_h,
143                        uint32_t const count)
144 {
145   uint32_t   const slab_keys  = hs_width * hs_height;
146   size_t     const slab_size  = sizeof(uint32_t) * hs_words * slab_keys;
147   uint32_t * const slab       = ALLOCA_MACRO(slab_size);
148   uint32_t         slab_count = count / slab_keys;
149 
150   while (slab_count-- > 0)
151     {
152       memcpy(slab,vout_h,slab_size);
153 
154       for (uint32_t row=0; row<hs_height; row++)
155         for (uint32_t col=0; col<hs_width; col++)
156           vout_h[col * hs_height + row] = slab[row * hs_width + col];
157 
158       vout_h += slab_keys;
159     }
160 }
161 
162 static
163 void
hs_transpose_slabs_u64(uint32_t const hs_words,uint32_t const hs_width,uint32_t const hs_height,uint64_t * vout_h,uint32_t const count)164 hs_transpose_slabs_u64(uint32_t const hs_words,
165                        uint32_t const hs_width,
166                        uint32_t const hs_height,
167                        uint64_t *     vout_h,
168                        uint32_t const count)
169 {
170   uint32_t   const slab_keys  = hs_width * hs_height;
171   size_t     const slab_size  = sizeof(uint32_t) * hs_words * slab_keys;
172   uint64_t * const slab       = ALLOCA_MACRO(slab_size);
173   uint32_t         slab_count = count / slab_keys;
174 
175   while (slab_count-- > 0)
176     {
177       memcpy(slab,vout_h,slab_size);
178 
179       for (uint32_t row=0; row<hs_height; row++)
180         for (uint32_t col=0; col<hs_width; col++)
181           vout_h[col * hs_height + row] = slab[row * hs_width + col];
182 
183       vout_h += slab_keys;
184     }
185 }
186 
187 static
188 void
hs_transpose_slabs(uint32_t const hs_words,uint32_t const hs_width,uint32_t const hs_height,void * vout_h,uint32_t const count)189 hs_transpose_slabs(uint32_t const hs_words,
190                    uint32_t const hs_width,
191                    uint32_t const hs_height,
192                    void   *       vout_h,
193                    uint32_t const count)
194 {
195   if (hs_words == 1)
196     hs_transpose_slabs_u32(hs_words,hs_width,hs_height,vout_h,count);
197   else
198     hs_transpose_slabs_u64(hs_words,hs_width,hs_height,vout_h,count);
199 }
200 
201 //
202 //
203 //
204 
205 static
206 void
hs_debug_u32(uint32_t const hs_width,uint32_t const hs_height,uint32_t const * vout_h,uint32_t const count)207 hs_debug_u32(uint32_t const   hs_width,
208              uint32_t const   hs_height,
209              uint32_t const * vout_h,
210              uint32_t const   count)
211 {
212   uint32_t const slab_keys = hs_width * hs_height;
213   uint32_t const slabs     = (count + slab_keys - 1) / slab_keys;
214 
215   for (uint32_t ss=0; ss<slabs; ss++) {
216     fprintf(stderr,"%u\n",ss);
217     for (uint32_t cc=0; cc<hs_height; cc++) {
218       for (uint32_t rr=0; rr<hs_width; rr++)
219         fprintf(stderr,"%8" PRIX32 " ",*vout_h++);
220       fprintf(stderr,"\n");
221     }
222   }
223 }
224 
225 static
226 void
hs_debug_u64(uint32_t const hs_width,uint32_t const hs_height,uint64_t const * vout_h,uint32_t const count)227 hs_debug_u64(uint32_t const   hs_width,
228              uint32_t const   hs_height,
229              uint64_t const * vout_h,
230              uint32_t const   count)
231 {
232   uint32_t const slab_keys = hs_width * hs_height;
233   uint32_t const slabs     = (count + slab_keys - 1) / slab_keys;
234 
235   for (uint32_t ss=0; ss<slabs; ss++) {
236     fprintf(stderr,"%u\n",ss);
237     for (uint32_t cc=0; cc<hs_height; cc++) {
238       for (uint32_t rr=0; rr<hs_width; rr++)
239         fprintf(stderr,"%16" PRIX64 " ",*vout_h++);
240       fprintf(stderr,"\n");
241     }
242   }
243 }
244 
245 //
246 //
247 //
248 
249 static
250 void
hs_bench(hs_cuda_pad_pfn hs_pad,hs_cuda_sort_pfn hs_sort,cudaStream_t stream0,cudaStream_t stream1,cudaStream_t stream2,struct cudaDeviceProp const * props,int const driver_version,uint32_t const hs_words,uint32_t const hs_height,uint32_t const hs_width,uint32_t const count_lo,uint32_t const count_hi,uint32_t const count_step,uint32_t const loops,uint32_t const warmup,bool const linearize,bool const verify)251 hs_bench(hs_cuda_pad_pfn               hs_pad,
252          hs_cuda_sort_pfn              hs_sort,
253          cudaStream_t                  stream0,
254          cudaStream_t                  stream1,
255          cudaStream_t                  stream2,
256          struct cudaDeviceProp const * props,
257          int                   const   driver_version,
258          uint32_t              const   hs_words,
259          uint32_t              const   hs_height,
260          uint32_t              const   hs_width,
261          uint32_t              const   count_lo,
262          uint32_t              const   count_hi,
263          uint32_t              const   count_step,
264          uint32_t              const   loops,
265          uint32_t              const   warmup,
266          bool                  const   linearize,
267          bool                  const   verify)
268 {
269   //
270   // return if nothing to do
271   //
272   if (count_hi <= 1)
273     return;
274 
275   //
276   // size for the largest array
277   //
278   uint32_t count_hi_padded_in, count_hi_padded_out;
279 
280   hs_pad(count_hi,&count_hi_padded_in,&count_hi_padded_out);
281 
282   size_t const key_size    = sizeof(uint32_t)    * hs_words;
283   size_t const size_hi_in  = count_hi_padded_in  * key_size;
284   size_t const size_hi_out = count_hi_padded_out * key_size;
285 
286   //
287   // allocate device extents
288   //
289   void * random_d;
290   void * vin_d;
291   void * vout_d;
292 
293   cuda(Malloc(&random_d,size_hi_in));
294   cuda(Malloc(&vin_d,   size_hi_in));
295   cuda(Malloc(&vout_d,  size_hi_out));
296 
297   //
298   // initialize device random extent
299   //
300   void * random_h = malloc(size_hi_in);
301 
302   // fill with random numbers
303   hs_fill_rand(random_h,count_hi,hs_words);
304 
305   // copy to device
306   cuda(Memcpy(random_d,random_h,size_hi_in,cudaMemcpyHostToDevice));
307 
308   free(random_h);
309 
310   //
311   // allocate host result extent
312   //
313   void * sorted_h = malloc(size_hi_in);
314   void * vout_h   = malloc(size_hi_in);
315 
316   //
317   // LABELS
318   //
319   fprintf(stdout,
320           "Device, "
321           "Driver, "
322           "Type, "
323           "Slab/Linear, "
324           "Verified?, "
325           "Keys, "
326           "Keys Padded In, "
327           "Keys Padded Out, "
328           "CPU Algorithm, "
329           "CPU Msecs, "
330           "CPU Mkeys/s, "
331           "Trials, "
332           "Avg. Msecs, "
333           "Min Msecs, "
334           "Max Msecs, "
335           "Avg. Mkeys/s, "
336           "Max. Mkeys/s\n");
337   //
338   // BENCHMARK
339   //
340   cudaEvent_t start, end;
341 
342   cuda(EventCreate(&start));
343   cuda(EventCreate(&end));
344 
345   for (uint32_t count=count_lo; count<=count_hi; count+=count_step)
346     {
347       // compute padding before sorting
348       uint32_t count_padded_in, count_padded_out;
349 
350       hs_pad(count,&count_padded_in,&count_padded_out);
351 
352       cuda(Memcpy(vin_d,random_d,count*key_size,cudaMemcpyDeviceToDevice));
353 
354       float elapsed_ms_min = FLT_MAX;
355       float elapsed_ms_max = 0.0f;
356       float elapsed_ms_sum = 0.0f;
357 
358       for (uint32_t ii=0; ii<warmup+loops; ii++)
359         {
360           if (ii == warmup)
361             {
362               elapsed_ms_min = FLT_MAX;
363               elapsed_ms_max = 0.0f;
364               elapsed_ms_sum = 0.0f;
365             }
366 
367           //
368           // sort vin/vout
369           //
370           cuda(EventRecord(start,stream0));
371           cuda(StreamWaitEvent(stream1,start,0));
372           cuda(StreamWaitEvent(stream2,start,0));
373 
374           hs_sort(vin_d,
375                   vout_d,
376                   count,
377                   count_padded_in,
378                   count_padded_out,
379                   linearize,
380                   stream0,
381                   stream1,
382                   stream2);
383 
384           cuda(EventRecord(end,stream0));
385 
386           cuda(EventSynchronize(end));
387 
388           float elapsed;
389 
390           cuda(EventElapsedTime(&elapsed,start,end));
391 
392           elapsed_ms_min  = MIN_MACRO(elapsed_ms_min,elapsed);
393           elapsed_ms_max  = MAX_MACRO(elapsed_ms_max,elapsed);
394           elapsed_ms_sum += elapsed;
395         }
396 
397       //
398       // verify
399       //
400       char const * cpu_algo = NULL;
401       double       cpu_ns   = 1.0;
402       bool         verified = false;
403 
404       if (verify)
405         {
406 	  //
407 	  // copy back the results
408 	  //
409 	  size_t const size_padded_in = count_padded_in * key_size;
410 
411 	  cuda(Memcpy(sorted_h,vin_d, size_padded_in,cudaMemcpyDeviceToHost));
412 	  cuda(Memcpy(vout_h,  vout_d,size_padded_in,cudaMemcpyDeviceToHost));
413 
414 	  //
415 	  // sort the input with another algorithm
416 	  //
417 	  cpu_algo = hs_cpu_sort(sorted_h,hs_words,count_padded_in,&cpu_ns);
418 
419 	  // transpose the cpu sorted slabs before comparison
420 	  if (!linearize) {
421 	    hs_transpose_slabs(hs_words,hs_width,hs_height,vout_h,count_padded_in);
422 	  }
423 
424 	  verified = hs_verify_linear(hs_words,sorted_h,vout_h,count_padded_in);
425 
426 #ifndef NDEBUG
427 	  if (!verified)
428 	    {
429 	      if (hs_words == 1) {
430 		hs_debug_u32(hs_width,hs_height,vout_h,  count);
431 		hs_debug_u32(hs_width,hs_height,sorted_h,count);
432 	      } else { // ulong
433 		hs_debug_u64(hs_width,hs_height,vout_h,  count);
434 		hs_debug_u64(hs_width,hs_height,sorted_h,count);
435 	      }
436 	    }
437 #endif
438 	}
439 
440       //
441       // REPORT
442       //
443       fprintf(stdout,"%s, %u, %s, %s, %s, %8u, %8u, %8u, CPU, %s, %9.2f, %6.2f, GPU, %9u, %7.3f, %7.3f, %7.3f, %6.2f, %6.2f\n",
444               props->name,
445               driver_version,
446               (hs_words == 1) ? "uint32_t" : "uint64_t",
447               linearize       ? "linear"   : "slab",
448               verify ? (verified ? "  OK  " : "*FAIL*") : "UNVERIFIED",
449               count,
450               count_padded_in,
451               count_padded_out,
452               // CPU
453               verify ? cpu_algo : "UNVERIFIED",
454               verify ? (cpu_ns / 1000000.0)      : 0.0,             // milliseconds
455               verify ? (1000.0 * count / cpu_ns) : 0.0,             // mkeys / sec
456               // GPU
457               loops,
458               elapsed_ms_sum / loops,                               // avg msecs
459               elapsed_ms_min,                                       // min msecs
460               elapsed_ms_max,                                       // max msecs
461               (double)(count * loops) / (1000.0 * elapsed_ms_sum),  // mkeys / sec - avg
462               (double) count          / (1000.0 * elapsed_ms_min)); // mkeys / sec - max
463 
464       // quit early if not verified
465       if (verify && !verified)
466         break;
467     }
468 
469   //
470   // dispose
471   //
472   cuda(EventDestroy(start));
473   cuda(EventDestroy(end));
474 
475   free(sorted_h);
476   free(vout_h);
477 
478   cuda(Free(random_d));
479   cuda(Free(vin_d));
480   cuda(Free(vout_d));
481 }
482 
483 //
484 //
485 //
486 
487 int
main(int argc,char const * argv[])488 main(int argc, char const * argv[])
489 {
490   //
491   // which CUDA device?
492   //
493   const int32_t device = (argc == 1) ? 0 : atoi(argv[1]);
494 
495   struct cudaDeviceProp props;
496   cuda(GetDeviceProperties(&props,device));
497 
498   cuda(SetDeviceFlags(cudaDeviceScheduleBlockingSync));
499   cuda(SetDevice(device));
500 
501   int driver_version;
502 
503   cuda(DriverGetVersion(&driver_version));
504 
505 #ifndef NDEBUG
506   fprintf(stdout,"%s (%2d) : %u\n",
507           props.name,
508           props.multiProcessorCount,
509           driver_version);
510 #endif
511 
512   //
513   // create some streams
514   //
515   cudaStream_t stream0,stream1,stream2;
516 
517   cuda(StreamCreate(&stream0));
518   cuda(StreamCreate(&stream1));
519   cuda(StreamCreate(&stream2));
520 
521   //
522   //
523   //
524 #ifdef NDEBUG
525 #define HS_BENCH_LOOPS   100
526 #define HS_BENCH_WARMUP  100
527 #else
528 #define HS_BENCH_LOOPS   1
529 #define HS_BENCH_WARMUP  0
530 #endif
531 
532   //
533   // are we sorting 32-bit or 64-bit keys?
534   //
535   uint32_t const key_size = (argc <= 2) ? 2 : strtoul(argv[2],NULL,0);
536 
537   hs_cuda_info_pfn hs_info;
538   hs_cuda_pad_pfn  hs_pad;
539   hs_cuda_sort_pfn hs_sort;
540 
541   if (key_size == 1)
542     {
543       hs_info = hs_cuda_info_u32;
544       hs_pad  = hs_cuda_pad_u32;
545       hs_sort = hs_cuda_sort_u32;
546     }
547   else
548     {
549       hs_info = hs_cuda_info_u64;
550       hs_pad  = hs_cuda_pad_u64;
551       hs_sort = hs_cuda_sort_u64;
552     }
553 
554   //
555   // get some configuration info
556   //
557   uint32_t key_words, val_words, slab_height, slab_width_log2;
558 
559   hs_info(&key_words,&val_words,&slab_height,&slab_width_log2);
560 
561   //
562   // sort sizes and loops
563   //
564   uint32_t const kpb        = slab_height << slab_width_log2;
565   uint32_t const count_lo   = (argc <= 3) ? kpb             : strtoul(argv[3],NULL,0);
566   uint32_t const count_hi   = (argc <= 4) ? count_lo        : strtoul(argv[4],NULL,0);
567   uint32_t const count_step = (argc <= 5) ? count_lo        : strtoul(argv[5],NULL,0);
568   uint32_t const loops      = (argc <= 6) ? HS_BENCH_LOOPS  : strtoul(argv[6],NULL,0);
569   uint32_t const warmup     = (argc <= 7) ? HS_BENCH_WARMUP : strtoul(argv[7],NULL,0);
570   bool     const linearize  = (argc <= 8) ? true            : strtoul(argv[8],NULL,0);
571   bool     const verify     = (argc <= 9) ? true            : strtoul(argv[9],NULL,0);
572 
573   //
574   // benchmark
575   //
576   hs_bench(hs_pad,
577            hs_sort,
578            stream0,
579            stream1,
580            stream2,
581            &props,
582            driver_version,
583            key_words + val_words,
584            slab_height,
585            1 << slab_width_log2,
586            count_lo,
587            count_hi,
588            count_step,
589            loops,
590            warmup,
591            linearize,
592 	   verify);
593 
594   //
595   // cleanup
596   //
597   cuda(StreamDestroy(stream0));
598   cuda(StreamDestroy(stream1));
599   cuda(StreamDestroy(stream2));
600 
601   cuda(DeviceReset());
602 
603   return EXIT_SUCCESS;
604 }
605