1 /*
2 * Copyright 2016 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can
5 * be found in the LICENSE file.
6 *
7 */
8
9 //
10 //
11 //
12
13 #include <stdlib.h>
14 #include <stdio.h>
15 #include <string.h>
16 #include <inttypes.h>
17 #include <float.h>
18 #include <stdbool.h>
19
20 //
21 //
22 //
23
24 #include <cuda_runtime.h>
25
26 //
27 //
28 //
29
30 #include "common/cuda/assert_cuda.h"
31 #include "common/macros.h"
32
33 //
34 //
35 //
36
37 #include "hs/cuda/sm_35/u32/hs_cuda.h"
38 #include "hs/cuda/sm_35/u64/hs_cuda.h"
39
40 //
41 // PFNs to select between different key widths
42 //
43
44 typedef void (*hs_cuda_info_pfn)(uint32_t * const key_words,
45 uint32_t * const val_words,
46 uint32_t * const slab_height,
47 uint32_t * const slab_width_log2);
48
49 typedef void (*hs_cuda_pad_pfn)(uint32_t const count,
50 uint32_t * const count_padded_in,
51 uint32_t * const count_padded_out);
52
53 typedef void (*hs_cuda_sort_pfn)(void * const vin,
54 void * const vout,
55 uint32_t const count,
56 uint32_t const count_padded_in,
57 uint32_t const count_padded_out,
58 bool const linearize,
59 cudaStream_t stream0,
60 cudaStream_t stream1,
61 cudaStream_t stream2);
62
63 //
64 // The quality of the RNG doesn't matter. The same number of
65 // instructions will be run no matter what the key distribution looks
66 // like. So here is something small and fast.
67 //
68
69 static
70 uint32_t
hs_rand_u32()71 hs_rand_u32()
72 {
73 static uint32_t seed = 0xDEADBEEF;
74
75 // Numerical Recipes
76 seed = seed * 1664525 + 1013904223;
77
78 return seed;
79 }
80
81 //
82 //
83 //
84
85 static
86 void
hs_fill_rand(uint32_t * vin_h,uint32_t const count,uint32_t const words)87 hs_fill_rand(uint32_t * vin_h, uint32_t const count, uint32_t const words)
88 {
89 #if 1
90 for (uint32_t ii=0; ii<count*words; ii++)
91 vin_h[ii] = hs_rand_u32();
92 #elif 0 // in-order
93 memset(vin_h,0,count*words*sizeof(uint32_t));
94 for (uint32_t ii=0; ii<count; ii++)
95 vin_h[ii*words] = ii;
96 #else // reverse order
97 memset(vin_h,0,count*words*sizeof(uint32_t));
98 for (uint32_t ii=0; ii<count; ii++)
99 vin_h[ii*words] = count - 1 - ii;
100 #endif
101 }
102
103 //
104 //
105 //
106
107 char const * hs_cpu_sort_u32(uint32_t * a, uint32_t const count, double * const cpu_ns);
108 char const * hs_cpu_sort_u64(uint64_t * a, uint32_t const count, double * const cpu_ns);
109
110 //
111 //
112 //
113
114 static
115 char const *
hs_cpu_sort(void * sorted_h,uint32_t const hs_words,uint32_t const count,double * const cpu_ns)116 hs_cpu_sort(void * sorted_h,
117 uint32_t const hs_words,
118 uint32_t const count,
119 double * const cpu_ns)
120 {
121 if (hs_words == 1)
122 return hs_cpu_sort_u32(sorted_h,count,cpu_ns);
123 else
124 return hs_cpu_sort_u64(sorted_h,count,cpu_ns);
125 }
126
127 static
128 bool
hs_verify_linear(uint32_t const hs_words,void * sorted_h,void * vout_h,uint32_t const count)129 hs_verify_linear(uint32_t const hs_words,
130 void * sorted_h,
131 void * vout_h,
132 uint32_t const count)
133 {
134 return memcmp(sorted_h, vout_h, sizeof(uint32_t) * hs_words * count) == 0;
135 }
136
137 static
138 void
hs_transpose_slabs_u32(uint32_t const hs_words,uint32_t const hs_width,uint32_t const hs_height,uint32_t * vout_h,uint32_t const count)139 hs_transpose_slabs_u32(uint32_t const hs_words,
140 uint32_t const hs_width,
141 uint32_t const hs_height,
142 uint32_t * vout_h,
143 uint32_t const count)
144 {
145 uint32_t const slab_keys = hs_width * hs_height;
146 size_t const slab_size = sizeof(uint32_t) * hs_words * slab_keys;
147 uint32_t * const slab = ALLOCA_MACRO(slab_size);
148 uint32_t slab_count = count / slab_keys;
149
150 while (slab_count-- > 0)
151 {
152 memcpy(slab,vout_h,slab_size);
153
154 for (uint32_t row=0; row<hs_height; row++)
155 for (uint32_t col=0; col<hs_width; col++)
156 vout_h[col * hs_height + row] = slab[row * hs_width + col];
157
158 vout_h += slab_keys;
159 }
160 }
161
162 static
163 void
hs_transpose_slabs_u64(uint32_t const hs_words,uint32_t const hs_width,uint32_t const hs_height,uint64_t * vout_h,uint32_t const count)164 hs_transpose_slabs_u64(uint32_t const hs_words,
165 uint32_t const hs_width,
166 uint32_t const hs_height,
167 uint64_t * vout_h,
168 uint32_t const count)
169 {
170 uint32_t const slab_keys = hs_width * hs_height;
171 size_t const slab_size = sizeof(uint32_t) * hs_words * slab_keys;
172 uint64_t * const slab = ALLOCA_MACRO(slab_size);
173 uint32_t slab_count = count / slab_keys;
174
175 while (slab_count-- > 0)
176 {
177 memcpy(slab,vout_h,slab_size);
178
179 for (uint32_t row=0; row<hs_height; row++)
180 for (uint32_t col=0; col<hs_width; col++)
181 vout_h[col * hs_height + row] = slab[row * hs_width + col];
182
183 vout_h += slab_keys;
184 }
185 }
186
187 static
188 void
hs_transpose_slabs(uint32_t const hs_words,uint32_t const hs_width,uint32_t const hs_height,void * vout_h,uint32_t const count)189 hs_transpose_slabs(uint32_t const hs_words,
190 uint32_t const hs_width,
191 uint32_t const hs_height,
192 void * vout_h,
193 uint32_t const count)
194 {
195 if (hs_words == 1)
196 hs_transpose_slabs_u32(hs_words,hs_width,hs_height,vout_h,count);
197 else
198 hs_transpose_slabs_u64(hs_words,hs_width,hs_height,vout_h,count);
199 }
200
201 //
202 //
203 //
204
205 static
206 void
hs_debug_u32(uint32_t const hs_width,uint32_t const hs_height,uint32_t const * vout_h,uint32_t const count)207 hs_debug_u32(uint32_t const hs_width,
208 uint32_t const hs_height,
209 uint32_t const * vout_h,
210 uint32_t const count)
211 {
212 uint32_t const slab_keys = hs_width * hs_height;
213 uint32_t const slabs = (count + slab_keys - 1) / slab_keys;
214
215 for (uint32_t ss=0; ss<slabs; ss++) {
216 fprintf(stderr,"%u\n",ss);
217 for (uint32_t cc=0; cc<hs_height; cc++) {
218 for (uint32_t rr=0; rr<hs_width; rr++)
219 fprintf(stderr,"%8" PRIX32 " ",*vout_h++);
220 fprintf(stderr,"\n");
221 }
222 }
223 }
224
225 static
226 void
hs_debug_u64(uint32_t const hs_width,uint32_t const hs_height,uint64_t const * vout_h,uint32_t const count)227 hs_debug_u64(uint32_t const hs_width,
228 uint32_t const hs_height,
229 uint64_t const * vout_h,
230 uint32_t const count)
231 {
232 uint32_t const slab_keys = hs_width * hs_height;
233 uint32_t const slabs = (count + slab_keys - 1) / slab_keys;
234
235 for (uint32_t ss=0; ss<slabs; ss++) {
236 fprintf(stderr,"%u\n",ss);
237 for (uint32_t cc=0; cc<hs_height; cc++) {
238 for (uint32_t rr=0; rr<hs_width; rr++)
239 fprintf(stderr,"%16" PRIX64 " ",*vout_h++);
240 fprintf(stderr,"\n");
241 }
242 }
243 }
244
245 //
246 //
247 //
248
249 static
250 void
hs_bench(hs_cuda_pad_pfn hs_pad,hs_cuda_sort_pfn hs_sort,cudaStream_t stream0,cudaStream_t stream1,cudaStream_t stream2,struct cudaDeviceProp const * props,int const driver_version,uint32_t const hs_words,uint32_t const hs_height,uint32_t const hs_width,uint32_t const count_lo,uint32_t const count_hi,uint32_t const count_step,uint32_t const loops,uint32_t const warmup,bool const linearize,bool const verify)251 hs_bench(hs_cuda_pad_pfn hs_pad,
252 hs_cuda_sort_pfn hs_sort,
253 cudaStream_t stream0,
254 cudaStream_t stream1,
255 cudaStream_t stream2,
256 struct cudaDeviceProp const * props,
257 int const driver_version,
258 uint32_t const hs_words,
259 uint32_t const hs_height,
260 uint32_t const hs_width,
261 uint32_t const count_lo,
262 uint32_t const count_hi,
263 uint32_t const count_step,
264 uint32_t const loops,
265 uint32_t const warmup,
266 bool const linearize,
267 bool const verify)
268 {
269 //
270 // return if nothing to do
271 //
272 if (count_hi <= 1)
273 return;
274
275 //
276 // size for the largest array
277 //
278 uint32_t count_hi_padded_in, count_hi_padded_out;
279
280 hs_pad(count_hi,&count_hi_padded_in,&count_hi_padded_out);
281
282 size_t const key_size = sizeof(uint32_t) * hs_words;
283 size_t const size_hi_in = count_hi_padded_in * key_size;
284 size_t const size_hi_out = count_hi_padded_out * key_size;
285
286 //
287 // allocate device extents
288 //
289 void * random_d;
290 void * vin_d;
291 void * vout_d;
292
293 cuda(Malloc(&random_d,size_hi_in));
294 cuda(Malloc(&vin_d, size_hi_in));
295 cuda(Malloc(&vout_d, size_hi_out));
296
297 //
298 // initialize device random extent
299 //
300 void * random_h = malloc(size_hi_in);
301
302 // fill with random numbers
303 hs_fill_rand(random_h,count_hi,hs_words);
304
305 // copy to device
306 cuda(Memcpy(random_d,random_h,size_hi_in,cudaMemcpyHostToDevice));
307
308 free(random_h);
309
310 //
311 // allocate host result extent
312 //
313 void * sorted_h = malloc(size_hi_in);
314 void * vout_h = malloc(size_hi_in);
315
316 //
317 // LABELS
318 //
319 fprintf(stdout,
320 "Device, "
321 "Driver, "
322 "Type, "
323 "Slab/Linear, "
324 "Verified?, "
325 "Keys, "
326 "Keys Padded In, "
327 "Keys Padded Out, "
328 "CPU Algorithm, "
329 "CPU Msecs, "
330 "CPU Mkeys/s, "
331 "Trials, "
332 "Avg. Msecs, "
333 "Min Msecs, "
334 "Max Msecs, "
335 "Avg. Mkeys/s, "
336 "Max. Mkeys/s\n");
337 //
338 // BENCHMARK
339 //
340 cudaEvent_t start, end;
341
342 cuda(EventCreate(&start));
343 cuda(EventCreate(&end));
344
345 for (uint32_t count=count_lo; count<=count_hi; count+=count_step)
346 {
347 // compute padding before sorting
348 uint32_t count_padded_in, count_padded_out;
349
350 hs_pad(count,&count_padded_in,&count_padded_out);
351
352 cuda(Memcpy(vin_d,random_d,count*key_size,cudaMemcpyDeviceToDevice));
353
354 float elapsed_ms_min = FLT_MAX;
355 float elapsed_ms_max = 0.0f;
356 float elapsed_ms_sum = 0.0f;
357
358 for (uint32_t ii=0; ii<warmup+loops; ii++)
359 {
360 if (ii == warmup)
361 {
362 elapsed_ms_min = FLT_MAX;
363 elapsed_ms_max = 0.0f;
364 elapsed_ms_sum = 0.0f;
365 }
366
367 //
368 // sort vin/vout
369 //
370 cuda(EventRecord(start,stream0));
371 cuda(StreamWaitEvent(stream1,start,0));
372 cuda(StreamWaitEvent(stream2,start,0));
373
374 hs_sort(vin_d,
375 vout_d,
376 count,
377 count_padded_in,
378 count_padded_out,
379 linearize,
380 stream0,
381 stream1,
382 stream2);
383
384 cuda(EventRecord(end,stream0));
385
386 cuda(EventSynchronize(end));
387
388 float elapsed;
389
390 cuda(EventElapsedTime(&elapsed,start,end));
391
392 elapsed_ms_min = MIN_MACRO(elapsed_ms_min,elapsed);
393 elapsed_ms_max = MAX_MACRO(elapsed_ms_max,elapsed);
394 elapsed_ms_sum += elapsed;
395 }
396
397 //
398 // verify
399 //
400 char const * cpu_algo = NULL;
401 double cpu_ns = 1.0;
402 bool verified = false;
403
404 if (verify)
405 {
406 //
407 // copy back the results
408 //
409 size_t const size_padded_in = count_padded_in * key_size;
410
411 cuda(Memcpy(sorted_h,vin_d, size_padded_in,cudaMemcpyDeviceToHost));
412 cuda(Memcpy(vout_h, vout_d,size_padded_in,cudaMemcpyDeviceToHost));
413
414 //
415 // sort the input with another algorithm
416 //
417 cpu_algo = hs_cpu_sort(sorted_h,hs_words,count_padded_in,&cpu_ns);
418
419 // transpose the cpu sorted slabs before comparison
420 if (!linearize) {
421 hs_transpose_slabs(hs_words,hs_width,hs_height,vout_h,count_padded_in);
422 }
423
424 verified = hs_verify_linear(hs_words,sorted_h,vout_h,count_padded_in);
425
426 #ifndef NDEBUG
427 if (!verified)
428 {
429 if (hs_words == 1) {
430 hs_debug_u32(hs_width,hs_height,vout_h, count);
431 hs_debug_u32(hs_width,hs_height,sorted_h,count);
432 } else { // ulong
433 hs_debug_u64(hs_width,hs_height,vout_h, count);
434 hs_debug_u64(hs_width,hs_height,sorted_h,count);
435 }
436 }
437 #endif
438 }
439
440 //
441 // REPORT
442 //
443 fprintf(stdout,"%s, %u, %s, %s, %s, %8u, %8u, %8u, CPU, %s, %9.2f, %6.2f, GPU, %9u, %7.3f, %7.3f, %7.3f, %6.2f, %6.2f\n",
444 props->name,
445 driver_version,
446 (hs_words == 1) ? "uint32_t" : "uint64_t",
447 linearize ? "linear" : "slab",
448 verify ? (verified ? " OK " : "*FAIL*") : "UNVERIFIED",
449 count,
450 count_padded_in,
451 count_padded_out,
452 // CPU
453 verify ? cpu_algo : "UNVERIFIED",
454 verify ? (cpu_ns / 1000000.0) : 0.0, // milliseconds
455 verify ? (1000.0 * count / cpu_ns) : 0.0, // mkeys / sec
456 // GPU
457 loops,
458 elapsed_ms_sum / loops, // avg msecs
459 elapsed_ms_min, // min msecs
460 elapsed_ms_max, // max msecs
461 (double)(count * loops) / (1000.0 * elapsed_ms_sum), // mkeys / sec - avg
462 (double) count / (1000.0 * elapsed_ms_min)); // mkeys / sec - max
463
464 // quit early if not verified
465 if (verify && !verified)
466 break;
467 }
468
469 //
470 // dispose
471 //
472 cuda(EventDestroy(start));
473 cuda(EventDestroy(end));
474
475 free(sorted_h);
476 free(vout_h);
477
478 cuda(Free(random_d));
479 cuda(Free(vin_d));
480 cuda(Free(vout_d));
481 }
482
483 //
484 //
485 //
486
487 int
main(int argc,char const * argv[])488 main(int argc, char const * argv[])
489 {
490 //
491 // which CUDA device?
492 //
493 const int32_t device = (argc == 1) ? 0 : atoi(argv[1]);
494
495 struct cudaDeviceProp props;
496 cuda(GetDeviceProperties(&props,device));
497
498 cuda(SetDeviceFlags(cudaDeviceScheduleBlockingSync));
499 cuda(SetDevice(device));
500
501 int driver_version;
502
503 cuda(DriverGetVersion(&driver_version));
504
505 #ifndef NDEBUG
506 fprintf(stdout,"%s (%2d) : %u\n",
507 props.name,
508 props.multiProcessorCount,
509 driver_version);
510 #endif
511
512 //
513 // create some streams
514 //
515 cudaStream_t stream0,stream1,stream2;
516
517 cuda(StreamCreate(&stream0));
518 cuda(StreamCreate(&stream1));
519 cuda(StreamCreate(&stream2));
520
521 //
522 //
523 //
524 #ifdef NDEBUG
525 #define HS_BENCH_LOOPS 100
526 #define HS_BENCH_WARMUP 100
527 #else
528 #define HS_BENCH_LOOPS 1
529 #define HS_BENCH_WARMUP 0
530 #endif
531
532 //
533 // are we sorting 32-bit or 64-bit keys?
534 //
535 uint32_t const key_size = (argc <= 2) ? 2 : strtoul(argv[2],NULL,0);
536
537 hs_cuda_info_pfn hs_info;
538 hs_cuda_pad_pfn hs_pad;
539 hs_cuda_sort_pfn hs_sort;
540
541 if (key_size == 1)
542 {
543 hs_info = hs_cuda_info_u32;
544 hs_pad = hs_cuda_pad_u32;
545 hs_sort = hs_cuda_sort_u32;
546 }
547 else
548 {
549 hs_info = hs_cuda_info_u64;
550 hs_pad = hs_cuda_pad_u64;
551 hs_sort = hs_cuda_sort_u64;
552 }
553
554 //
555 // get some configuration info
556 //
557 uint32_t key_words, val_words, slab_height, slab_width_log2;
558
559 hs_info(&key_words,&val_words,&slab_height,&slab_width_log2);
560
561 //
562 // sort sizes and loops
563 //
564 uint32_t const kpb = slab_height << slab_width_log2;
565 uint32_t const count_lo = (argc <= 3) ? kpb : strtoul(argv[3],NULL,0);
566 uint32_t const count_hi = (argc <= 4) ? count_lo : strtoul(argv[4],NULL,0);
567 uint32_t const count_step = (argc <= 5) ? count_lo : strtoul(argv[5],NULL,0);
568 uint32_t const loops = (argc <= 6) ? HS_BENCH_LOOPS : strtoul(argv[6],NULL,0);
569 uint32_t const warmup = (argc <= 7) ? HS_BENCH_WARMUP : strtoul(argv[7],NULL,0);
570 bool const linearize = (argc <= 8) ? true : strtoul(argv[8],NULL,0);
571 bool const verify = (argc <= 9) ? true : strtoul(argv[9],NULL,0);
572
573 //
574 // benchmark
575 //
576 hs_bench(hs_pad,
577 hs_sort,
578 stream0,
579 stream1,
580 stream2,
581 &props,
582 driver_version,
583 key_words + val_words,
584 slab_height,
585 1 << slab_width_log2,
586 count_lo,
587 count_hi,
588 count_step,
589 loops,
590 warmup,
591 linearize,
592 verify);
593
594 //
595 // cleanup
596 //
597 cuda(StreamDestroy(stream0));
598 cuda(StreamDestroy(stream1));
599 cuda(StreamDestroy(stream2));
600
601 cuda(DeviceReset());
602
603 return EXIT_SUCCESS;
604 }
605