/* * Copyright 2016 Google Inc. * * Use of this source code is governed by a BSD-style license that can * be found in the LICENSE file. * */
/*
 * Benchmark driver for the hs_cuda sort entry points.
 *
 * main() takes its options positionally:
 *
 *   argv[1]  CUDA device ordinal passed to atoi()            (default 0)
 *   argv[2]  key size selector: 1 selects the hs_cuda_*_u32
 *            functions, any other value the hs_cuda_*_u64    (default 2)
 *   argv[3]  count_lo    (default slab_height << slab_width_log2,
 *                         both as reported by hs_info())
 *   argv[4]  count_hi    (default count_lo)
 *   argv[5]  count_step  (default count_lo)
 *   argv[6]  loops       (default HS_BENCH_LOOPS:  100 with NDEBUG, else 1)
 *   argv[7]  warmup      (default HS_BENCH_WARMUP: 100 with NDEBUG, else 0)
 *   argv[8]  linearize   (default true; any non-zero value is true)
 *   argv[9]  verify      (default true; any non-zero value is true)
 *
 * It creates three streams, hands everything to hs_bench(), then
 * destroys the streams and resets the device.
 *
 * hs_rand_u32() is a linear congruential generator whose state lives in
 * a function-local static seeded with 0xDEADBEEF.
 *
 * NOTE(review): the text of this file looks damaged.  Line breaks have
 * been collapsed, so every "//" comment now swallows the remainder of
 * its physical line.  Everything between a '<' and the following '>'
 * also appears to be missing: the system include names, the loop
 * conditions, and the code from inside hs_fill_rand() up to the middle
 * of the fprintf() argument list in hs_bench().  Restore the file from
 * its original before attempting to build or modify it.
 *
 * NOTE(review): main() issues cuda(SetDeviceFlags(...)) before
 * cuda(SetDevice(device)).  Presumably the flags are meant for the
 * selected device - confirm against the CUDA runtime documentation for
 * the toolkit in use whether the order needs to be swapped.
 *
 * NOTE(review): the debug-only fprintf() in main() prints
 * driver_version, declared int, with "%u"; "%d" would match the type.
 *
 * NOTE(review): the strtoul() results are narrowed to uint32_t / bool
 * without range or error checking, and count_step is not validated.
 * TODO confirm that hs_bench() tolerates a count_step of 0.
 */
// // // #include #include #include #include #include #include // // // #include // // // #include "common/cuda/assert_cuda.h" #include "common/macros.h" // // // #include "hs/cuda/sm_35/u32/hs_cuda.h" #include "hs/cuda/sm_35/u64/hs_cuda.h" // // PFNs to select between different key widths // typedef void (*hs_cuda_info_pfn)(uint32_t * const key_words, uint32_t * const val_words, uint32_t * const slab_height, uint32_t * const slab_width_log2); typedef void (*hs_cuda_pad_pfn)(uint32_t const count, uint32_t * const count_padded_in, uint32_t * const count_padded_out); typedef void (*hs_cuda_sort_pfn)(void * const vin, void * const vout, uint32_t const count, uint32_t const count_padded_in, uint32_t const count_padded_out, bool const linearize, cudaStream_t stream0, cudaStream_t stream1, cudaStream_t stream2); // // The quality of the RNG doesn't matter. The same number of // instructions will be run no matter what the key distribution looks // like. So here is something small and fast. // static uint32_t hs_rand_u32() { static uint32_t seed = 0xDEADBEEF; // Numerical Recipes seed = seed * 1664525 + 1013904223; return seed; } // // // static void hs_fill_rand(uint32_t * vin_h, uint32_t const count, uint32_t const words) { #if 1 for (uint32_t ii=0; ii 0) { memcpy(slab,vout_h,slab_size); for (uint32_t row=0; row 0) { memcpy(slab,vout_h,slab_size); for (uint32_t row=0; rowname, driver_version, (hs_words == 1) ? "uint32_t" : "uint64_t", linearize ? "linear" : "slab", verify ? (verified ? " OK " : "*FAIL*") : "UNVERIFIED", count, count_padded_in, count_padded_out, // CPU verify ? cpu_algo : "UNVERIFIED", verify ? (cpu_ns / 1000000.0) : 0.0, // milliseconds verify ? 
(1000.0 * count / cpu_ns) : 0.0, // mkeys / sec // GPU loops, elapsed_ms_sum / loops, // avg msecs elapsed_ms_min, // min msecs elapsed_ms_max, // max msecs (double)(count * loops) / (1000.0 * elapsed_ms_sum), // mkeys / sec - avg (double) count / (1000.0 * elapsed_ms_min)); // mkeys / sec - max // quit early if not verified if (verify && !verified) break; } // // dispose // cuda(EventDestroy(start)); cuda(EventDestroy(end)); free(sorted_h); free(vout_h); cuda(Free(random_d)); cuda(Free(vin_d)); cuda(Free(vout_d)); } // // // int main(int argc, char const * argv[]) { // // which CUDA device? // const int32_t device = (argc == 1) ? 0 : atoi(argv[1]); struct cudaDeviceProp props; cuda(GetDeviceProperties(&props,device)); cuda(SetDeviceFlags(cudaDeviceScheduleBlockingSync)); cuda(SetDevice(device)); int driver_version; cuda(DriverGetVersion(&driver_version)); #ifndef NDEBUG fprintf(stdout,"%s (%2d) : %u\n", props.name, props.multiProcessorCount, driver_version); #endif // // create some streams // cudaStream_t stream0,stream1,stream2; cuda(StreamCreate(&stream0)); cuda(StreamCreate(&stream1)); cuda(StreamCreate(&stream2)); // // // #ifdef NDEBUG #define HS_BENCH_LOOPS 100 #define HS_BENCH_WARMUP 100 #else #define HS_BENCH_LOOPS 1 #define HS_BENCH_WARMUP 0 #endif // // are we sorting 32-bit or 64-bit keys? // uint32_t const key_size = (argc <= 2) ? 2 : strtoul(argv[2],NULL,0); hs_cuda_info_pfn hs_info; hs_cuda_pad_pfn hs_pad; hs_cuda_sort_pfn hs_sort; if (key_size == 1) { hs_info = hs_cuda_info_u32; hs_pad = hs_cuda_pad_u32; hs_sort = hs_cuda_sort_u32; } else { hs_info = hs_cuda_info_u64; hs_pad = hs_cuda_pad_u64; hs_sort = hs_cuda_sort_u64; } // // get some configuration info // uint32_t key_words, val_words, slab_height, slab_width_log2; hs_info(&key_words,&val_words,&slab_height,&slab_width_log2); // // sort sizes and loops // uint32_t const kpb = slab_height << slab_width_log2; uint32_t const count_lo = (argc <= 3) ? 
kpb : strtoul(argv[3],NULL,0); uint32_t const count_hi = (argc <= 4) ? count_lo : strtoul(argv[4],NULL,0); uint32_t const count_step = (argc <= 5) ? count_lo : strtoul(argv[5],NULL,0); uint32_t const loops = (argc <= 6) ? HS_BENCH_LOOPS : strtoul(argv[6],NULL,0); uint32_t const warmup = (argc <= 7) ? HS_BENCH_WARMUP : strtoul(argv[7],NULL,0); bool const linearize = (argc <= 8) ? true : strtoul(argv[8],NULL,0); bool const verify = (argc <= 9) ? true : strtoul(argv[9],NULL,0); // // benchmark // hs_bench(hs_pad, hs_sort, stream0, stream1, stream2, &props, driver_version, key_words + val_words, slab_height, 1 << slab_width_log2, count_lo, count_hi, count_step, loops, warmup, linearize, verify); // // cleanup // cuda(StreamDestroy(stream0)); cuda(StreamDestroy(stream1)); cuda(StreamDestroy(stream2)); cuda(DeviceReset()); return EXIT_SUCCESS; }