/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */

//
//
//

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>

//
//

#include "common/macros.h"
#include "common/vk/assert_vk.h"
#include "common/vk/host_alloc.h"
#include "common/vk/cache_vk.h"

//
//
//

#include "hs_vk.h"

//
// Compile-time images of HotSort targets
//

#include "hs/vk/intel/gen8/u32/hs_target.h"
#include "hs/vk/intel/gen8/u64/hs_target.h"
#include "hs/vk/nvidia/sm_35/u32/hs_target.h"
#include "hs/vk/nvidia/sm_35/u64/hs_target.h"
#include "hs/vk/amd/gcn/u32/hs_target.h"
#include "hs/vk/amd/gcn/u64/hs_target.h"

//
//
//

char const * hs_cpu_sort_u32(uint32_t * a, uint32_t const count, double * const cpu_ns);
char const * hs_cpu_sort_u64(uint64_t * a, uint32_t const count, double * const cpu_ns);

//
//
//

static
char const *
hs_cpu_sort(void   *       sorted_h,
            uint32_t const hs_words,
            uint32_t const count,
            double * const cpu_ns)
{
  if (hs_words == 1)
    return hs_cpu_sort_u32(sorted_h,count,cpu_ns);
  else
    return hs_cpu_sort_u64(sorted_h,count,cpu_ns);
}

static
void
hs_transpose_slabs_u32(uint32_t const hs_words,
                       uint32_t const hs_width,
                       uint32_t const hs_height,
                       uint32_t *     vout_h,
                       uint32_t const count)
{
  uint32_t   const slab_keys = hs_width * hs_height;
  size_t     const slab_size = sizeof(uint32_t) * hs_words * slab_keys;
  uint32_t * const slab      = ALLOCA_MACRO(slab_size);

  uint32_t slab_count = count / slab_keys;

  while (slab_count-- > 0)
    {
      memcpy(slab,vout_h,slab_size);

      // transpose one slab in place
      for (uint32_t row=0; row<hs_height; row++)
        for (uint32_t col=0; col<hs_width; col++)
          vout_h[col * hs_height + row] = slab[row * hs_width + col];

      vout_h += slab_keys;
    }
}

static
void
hs_transpose_slabs_u64(uint32_t const hs_words,
                       uint32_t const hs_width,
                       uint32_t const hs_height,
                       uint64_t *     vout_h,
                       uint32_t const count)
{
  uint32_t   const slab_keys = hs_width * hs_height;
  size_t     const slab_size = sizeof(uint32_t) * hs_words * slab_keys;
  uint64_t * const slab      = ALLOCA_MACRO(slab_size);

  uint32_t slab_count = count / slab_keys;

  while (slab_count-- > 0)
    {
      memcpy(slab,vout_h,slab_size);

      // transpose one slab in place
      for (uint32_t row=0; row<hs_height; row++)
        for (uint32_t col=0; col<hs_width; col++)
          vout_h[col * hs_height + row] = slab[row * hs_width + col];

      vout_h += slab_keys;
    }
}

static
void
hs_transpose_slabs(uint32_t const hs_words,
                   uint32_t const hs_width,
                   uint32_t const hs_height,
                   void   *       vout_h,
                   uint32_t const count)
{
  if (hs_words == 1)
    hs_transpose_slabs_u32(hs_words,hs_width,hs_height,vout_h,count);
  else
    hs_transpose_slabs_u64(hs_words,hs_width,hs_height,vout_h,count);
}

//
// fill a host buffer with random 32-bit words
// (simple placeholder body -- the original fill isn't shown here)
//

static
void
hs_fill_rand(uint32_t * vin_h, uint32_t const count, uint32_t const words)
{
  for (uint32_t ii=0; ii<count*words; ii++)
    vin_h[ii] = ((uint32_t)rand() << 16) ^ (uint32_t)rand();
}

//
// minimal key dumps used when verification fails
// (placeholder bodies -- the original dumps aren't shown here)
//

static
void
hs_debug_u32(uint32_t const   hs_width,
             uint32_t const   hs_height,
             uint32_t const * vout_h,
             uint32_t const   count)
{
  uint32_t const slab_keys = hs_width * hs_height;

  for (uint32_t ii=0; ii<count; ii++)
    {
      if ((ii % slab_keys) == 0)
        fprintf(stderr,"\n-- slab %u --",ii / slab_keys);
      if ((ii % hs_width) == 0)
        fprintf(stderr,"\n");

      fprintf(stderr,"%08X ",vout_h[ii]);
    }

  fprintf(stderr,"\n");
}

static
void
hs_debug_u64(uint32_t const   hs_width,
             uint32_t const   hs_height,
             uint64_t const * vout_h,
             uint32_t const   count)
{
  uint32_t const slab_keys = hs_width * hs_height;

  for (uint32_t ii=0; ii<count; ii++)
    {
      if ((ii % slab_keys) == 0)
        fprintf(stderr,"\n-- slab %u --",ii / slab_keys);
      if ((ii % hs_width) == 0)
        fprintf(stderr,"\n");

      fprintf(stderr,"%016llX ",(unsigned long long)vout_h[ii]);
    }

  fprintf(stderr,"\n");
}

//
// minimal debug report callback -- just print the validation message
//

static
VKAPI_ATTR VkBool32 VKAPI_CALL
vk_debug_report_cb(VkDebugReportFlagsEXT      flags,
                   VkDebugReportObjectTypeEXT objectType,
                   uint64_t                   object,
                   size_t                     location,
                   int32_t                    messageCode,
                   char const *               pLayerPrefix,
                   char const *               pMessage,
                   void *                     pUserData)
{
  fprintf(stderr,"%s: %s\n",pLayerPrefix,pMessage);

  return VK_FALSE;
}

//
// select a HotSort target for the requested vendor/device id
// (helper name is assumed -- the original identifier was lost)
//

static
bool
is_matching_device(VkPhysicalDeviceProperties const * const phy_device_props,
                   struct hs_vk_target const * *      const hs_target,
                   uint32_t                           const vendor_id,
                   uint32_t                           const device_id,
                   uint32_t                           const key_val_words)
{
  if ((phy_device_props->vendorID != vendor_id) ||
      (phy_device_props->deviceID != device_id))
    return false;

  if (phy_device_props->vendorID == 0x10DE)
    {
      //
      // FIXME -- for now, the kernels in this app are targeting
      // sm_35+ devices.  You could add some rigorous rejection by
      // device id here...
      //
      if (key_val_words == 1)
        *hs_target = &hs_nvidia_sm35_u32;
      else
        *hs_target = &hs_nvidia_sm35_u64;
    }
  else if (phy_device_props->vendorID == 0x8086)
    {
      //
      // FIXME -- for now, the kernels in this app are targeting GEN8+
      // devices -- this does *not* include variants of GEN9LP+
      // "Apollo Lake" because that device has a different
      // architectural "shape" than GEN8 GTx.  You could add some
      // rigorous rejection by device id here...
      //
      if (key_val_words == 1)
        *hs_target = &hs_intel_gen8_u32;
      else
        *hs_target = &hs_intel_gen8_u64;
    }
  else if (phy_device_props->vendorID == 0x1002)
    {
      //
      // AMD GCN
      //
      if (key_val_words == 1)
        *hs_target = &hs_amd_gcn_u32;
      else
        *hs_target = &hs_amd_gcn_u64;
    }
  else
    {
      return false;
    }

  return true;
}

//
//
//

uint32_t
vk_find_mem_type_idx(VkPhysicalDeviceMemoryProperties const * phy_device_mem_props,
                     uint32_t                           const compatible_mem_types,
                     VkMemoryPropertyFlags              const required_mem_props,
                     bool                               const abort)
{
  //
  // FIXME -- jump between indices in the memoryTypeBits mask
  //
  uint32_t const count = phy_device_mem_props->memoryTypeCount;

  for (uint32_t index=0; index<count; index++)
    {
      // skip memory types the resource can't use
      if ((compatible_mem_types & (1u << index)) == 0)
        continue;

      VkMemoryPropertyFlags const common_props =
        phy_device_mem_props->memoryTypes[index].propertyFlags & required_mem_props;

      if (common_props == required_mem_props)
        return index;
    }

  if (abort)
    {
      fprintf(stderr,"Memory type not found: %X\n",required_mem_props);
      exit(EXIT_FAILURE);
    }

  return UINT32_MAX;
}

//
//
//

#ifdef NDEBUG
#define HS_BENCH_LOOPS   100
#define HS_BENCH_WARMUP  100
#else
#define HS_BENCH_LOOPS   1
#define HS_BENCH_WARMUP  0
#endif

//
//
//

int
main(int argc, char const * argv[])
{
  //
  // select the target by vendor and device id
  //
  uint32_t const vendor_id = (argc <= 1) ? UINT32_MAX : strtoul(argv[1],NULL,16);
  uint32_t const device_id = (argc <= 2) ? UINT32_MAX : strtoul(argv[2],NULL,16);
  uint32_t const key_val_words = (argc <= 3) ? 1 : strtoul(argv[3],NULL,0);

  if ((key_val_words != 1) && (key_val_words != 2))
    {
      fprintf(stderr,"Key/Val Words must be 1 or 2\n");
      exit(EXIT_FAILURE);
    }

  //
  // create a Vulkan instance
  //
  VkApplicationInfo const app_info = {
    .sType              = VK_STRUCTURE_TYPE_APPLICATION_INFO,
    .pNext              = NULL,
    .pApplicationName   = "Google HotSort Bench",
    .applicationVersion = 0,
    .pEngineName        = "Google HotSort Gen",
    .engineVersion      = 0,
    .apiVersion         = VK_API_VERSION_1_1
  };

  char const * const instance_enabled_layers[] = {
    "VK_LAYER_LUNARG_standard_validation"
  };

  char const * const instance_enabled_extensions[] = {
    VK_EXT_DEBUG_REPORT_EXTENSION_NAME
  };

  uint32_t const instance_enabled_layer_count =
#ifndef NDEBUG
    ARRAY_LENGTH_MACRO(instance_enabled_layers)
#else
    0
#endif
    ;

  uint32_t const instance_enabled_extension_count =
#ifndef NDEBUG
    ARRAY_LENGTH_MACRO(instance_enabled_extensions)
#else
    0
#endif
    ;

  VkInstanceCreateInfo const instance_info = {
    .sType                   = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
    .pNext                   = NULL,
    .flags                   = 0,
    .pApplicationInfo        = &app_info,
    .enabledLayerCount       = instance_enabled_layer_count,
    .ppEnabledLayerNames     = instance_enabled_layers,
    .enabledExtensionCount   = instance_enabled_extension_count,
    .ppEnabledExtensionNames = instance_enabled_extensions
  };

  VkInstance instance;

  vk(CreateInstance(&instance_info,NULL,&instance));

  //
  //
  //
#ifndef NDEBUG
  PFN_vkCreateDebugReportCallbackEXT vkCreateDebugReportCallbackEXT =
    (PFN_vkCreateDebugReportCallbackEXT)
    vkGetInstanceProcAddr(instance,"vkCreateDebugReportCallbackEXT");

  PFN_vkDestroyDebugReportCallbackEXT vkDestroyDebugReportCallbackEXT =
    (PFN_vkDestroyDebugReportCallbackEXT)
    vkGetInstanceProcAddr(instance,"vkDestroyDebugReportCallbackEXT");

  struct VkDebugReportCallbackCreateInfoEXT const drcci = {
    .sType       = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT,
    .pNext       = NULL,
    .flags       = UINT32_MAX, // enable everything for now
    .pfnCallback = vk_debug_report_cb,
    .pUserData   = NULL
  };

  VkDebugReportCallbackEXT drc;

  vk(CreateDebugReportCallbackEXT(instance,
                                  &drcci,
                                  NULL,
                                  &drc));
#endif

  //
  // acquire all physical devices and select a match
  //
  uint32_t phy_device_count;

  vk(EnumeratePhysicalDevices(instance,
                              &phy_device_count,
                              NULL));

  VkPhysicalDevice * phy_devices = vk_host_alloc(NULL,phy_device_count * sizeof(*phy_devices));

  vk(EnumeratePhysicalDevices(instance,
                              &phy_device_count,
                              phy_devices));

  VkPhysicalDevice            phy_device = VK_NULL_HANDLE;
  VkPhysicalDeviceProperties  phy_device_props;

  struct hs_vk_target const * hs_target;

  // select the first physical device matching the requested ids
  for (uint32_t ii=0; ii<phy_device_count; ii++)
    {
      VkPhysicalDeviceProperties tmp_props;

      vkGetPhysicalDeviceProperties(phy_devices[ii],&tmp_props);

      if (is_matching_device(&tmp_props,
                             &hs_target,
                             vendor_id,
                             device_id,
                             key_val_words))
        {
          phy_device       = phy_devices[ii];
          phy_device_props = tmp_props;
          break;
        }
    }

  if (phy_device == VK_NULL_HANDLE)
    {
      fprintf(stderr,"Device %X : %X not found.\n",vendor_id,device_id);
      exit(EXIT_FAILURE);
    }

  vk_host_free(NULL,phy_devices);

  //
  // the remaining args shape the benchmark
  //
  uint32_t const slab_size  = hs_target->config.slab.height << hs_target->config.slab.width_log2;

  uint32_t const count_lo   = (argc <=  4) ? slab_size       : strtoul(argv[ 4],NULL,0);
  uint32_t const count_hi   = (argc <=  5) ? count_lo        : strtoul(argv[ 5],NULL,0);
  uint32_t const count_step = (argc <=  6) ? count_lo        : strtoul(argv[ 6],NULL,0);
  uint32_t const loops      = (argc <=  7) ? HS_BENCH_LOOPS  : strtoul(argv[ 7],NULL,0);
  uint32_t const warmup     = (argc <=  8) ? HS_BENCH_WARMUP : strtoul(argv[ 8],NULL,0);
  bool     const linearize  = (argc <=  9) ? true            : strtoul(argv[ 9],NULL,0) != 0;
  bool     const verify     = (argc <= 10) ? true            : strtoul(argv[10],NULL,0) != 0;
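
  //
  // Summary of the positional arguments parsed above:
  //
  //   1: vendor id (hex)     2: device id (hex)    3: key/val words (1|2)
  //   4: count_lo            5: count_hi           6: count_step
  //   7: timed loops         8: warmup loops       9: linearize (0|1)
  //  10: verify (0|1)
  //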
  //
  // get the physical device's memory props
  //
  VkPhysicalDeviceMemoryProperties phy_device_mem_props;

  vkGetPhysicalDeviceMemoryProperties(phy_device,&phy_device_mem_props);

  //
  // get queue properties
  //
  VkQueueFamilyProperties queue_fam_props[2];
  uint32_t                queue_fam_count = ARRAY_LENGTH_MACRO(queue_fam_props);

  vkGetPhysicalDeviceQueueFamilyProperties(phy_device,&queue_fam_count,queue_fam_props);

  //
  // create device
  //
  float const queue_priorities[] = { 1.0f };

  VkDeviceQueueCreateInfo const queue_info = {
    .sType            = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
    .pNext            = NULL,
    .flags            = 0,
    .queueFamilyIndex = 0,
    .queueCount       = 1,
    .pQueuePriorities = queue_priorities
  };

  //
  // clumsily enable AMD GCN shader info extension
  //
  char const * const device_enabled_extensions[] = {
#if defined( HS_VK_VERBOSE_STATISTICS_AMD ) || defined( HS_VK_VERBOSE_DISASSEMBLY_AMD )
    VK_AMD_SHADER_INFO_EXTENSION_NAME
#endif
  };

  uint32_t device_enabled_extension_count = 0;

#if defined( HS_VK_VERBOSE_STATISTICS_AMD ) || defined( HS_VK_VERBOSE_DISASSEMBLY_AMD )
  if (phy_device_props.vendorID == 0x1002)
    device_enabled_extension_count = 1;
#endif

  //
  //
  //
  VkPhysicalDeviceFeatures device_features = { false };

  if (key_val_words == 2)
    {
      device_features.shaderInt64 = true;
    }

  VkDeviceCreateInfo const device_info = {
    .sType                   = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
    .pNext                   = NULL,
    .flags                   = 0,
    .queueCreateInfoCount    = 1,
    .pQueueCreateInfos       = &queue_info,
    .enabledLayerCount       = 0,
    .ppEnabledLayerNames     = NULL,
    .enabledExtensionCount   = device_enabled_extension_count,
    .ppEnabledExtensionNames = device_enabled_extensions,
    .pEnabledFeatures        = &device_features
  };

  VkDevice device;

  vk(CreateDevice(phy_device,&device_info,NULL,&device));

  //
  // get a queue
  //
  VkQueue queue;

  vkGetDeviceQueue(device,0,0,&queue);

  //
  // get the pipeline cache
  //
  VkPipelineCache pipeline_cache;

  vk_pipeline_cache_create(device,NULL,".vk_cache",&pipeline_cache);

  //
  // create a descriptor set pool
  //
  VkDescriptorPoolSize const dps[] = {
    {
      .type            = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
      .descriptorCount = 2
    }
  };

  VkDescriptorPoolCreateInfo const dpci = {
    .sType         = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
    .pNext         = NULL,
    .flags         = 0, // VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT,
    .maxSets       = 1,
    .poolSizeCount = ARRAY_LENGTH_MACRO(dps),
    .pPoolSizes    = dps
  };

  VkDescriptorPool desc_pool;

  vk(CreateDescriptorPool(device,
                          &dpci,
                          NULL, // allocator
                          &desc_pool));

  //
  // create HotSort device instance
  //
  struct hs_vk * hs = hs_vk_create(hs_target,
                                   device,
                                   NULL,
                                   pipeline_cache);

  //
  // create a HotSort descriptor set for this thread
  //
  VkDescriptorSet hs_ds = hs_vk_ds_alloc(hs,desc_pool);

  //
  // create a command pool for this thread
  //
  VkCommandPoolCreateInfo const cmd_pool_info = {
    .sType            = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
    .pNext            = NULL,
    .flags            = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT |
                        VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
    .queueFamilyIndex = 0,
  };

  VkCommandPool cmd_pool;

  vk(CreateCommandPool(device,
                       &cmd_pool_info,
                       NULL,
                       &cmd_pool));

  //
  // create a query pool for benchmarking
  //
  static VkQueryPoolCreateInfo const query_pool_info = {
    .sType              = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
    .pNext              = NULL,
    .flags              = 0,
    .queryType          = VK_QUERY_TYPE_TIMESTAMP,
    .queryCount         = 4,
    .pipelineStatistics = 0
  };

  VkQueryPool query_pool;

  vk(CreateQueryPool(device,
                     &query_pool_info,
                     NULL,
                     &query_pool));

  //
  // create two big buffers -- buffer_out_count is always the largest
  //
  uint32_t buffer_in_count, buffer_out_count;
  hs_vk_pad(hs,count_hi,&buffer_in_count,&buffer_out_count);

  size_t const buffer_out_size = buffer_out_count * key_val_words * sizeof(uint32_t);

  VkBufferCreateInfo bci = {
    .sType                 = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
    .pNext                 = NULL,
    .flags                 = 0,
    .size                  = buffer_out_size,
    .usage                 = 0,
    .sharingMode           = VK_SHARING_MODE_EXCLUSIVE,
    .queueFamilyIndexCount = 0,
    .pQueueFamilyIndices   = NULL
  };

  VkBuffer vin, vout, sorted, rand;

  bci.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
              VK_BUFFER_USAGE_TRANSFER_DST_BIT;

  vk(CreateBuffer(device,
                  &bci,
                  NULL,
                  &vin));

  vk(CreateBuffer(device,
                  &bci,
                  NULL,
                  &sorted));

  bci.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
              VK_BUFFER_USAGE_TRANSFER_SRC_BIT   |
              VK_BUFFER_USAGE_TRANSFER_DST_BIT;

  vk(CreateBuffer(device,
                  &bci,
                  NULL,
                  &vout));

  bci.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
              VK_BUFFER_USAGE_TRANSFER_SRC_BIT;

  vk(CreateBuffer(device,
                  &bci,
                  NULL,
                  &rand));

  //
  // get the memory requirements for the buffers
  //
  VkMemoryRequirements mr_vin, mr_vout, mr_sorted, mr_rand;

  vkGetBufferMemoryRequirements(device,vin,   &mr_vin);
  vkGetBufferMemoryRequirements(device,vout,  &mr_vout);
  vkGetBufferMemoryRequirements(device,sorted,&mr_sorted);
  vkGetBufferMemoryRequirements(device,rand,  &mr_rand);

  //
  // allocate memory for the buffers
  //
  // for simplicity, all buffers are the same size
  //
  // vin and vout have the same usage
  //
  VkMemoryAllocateInfo const mai_vin_vout = {
    .sType           = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
    .pNext           = NULL,
    .allocationSize  = mr_vin.size,
    .memoryTypeIndex = vk_find_mem_type_idx(&phy_device_mem_props,
                                            mr_vin.memoryTypeBits,
                                            VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
                                            true)
  };

  VkMemoryAllocateInfo const mai_sorted_rand = {
    .sType           = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
    .pNext           = NULL,
    .allocationSize  = mr_sorted.size,
    .memoryTypeIndex = vk_find_mem_type_idx(&phy_device_mem_props,
                                            mr_sorted.memoryTypeBits,
                                            VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
                                            VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
                                            true)
  };

  VkDeviceMemory mem_vin, mem_vout, mem_sorted, mem_rand;

  vk(AllocateMemory(device,
                    &mai_vin_vout,
                    NULL,
                    &mem_vin));

  vk(AllocateMemory(device,
                    &mai_vin_vout,
                    NULL,
                    &mem_vout));

  vk(AllocateMemory(device,
                    &mai_sorted_rand,
                    NULL,
                    &mem_sorted));

  vk(AllocateMemory(device,
                    &mai_sorted_rand,
                    NULL,
                    &mem_rand));

  //
  // bind backing memory to the virtual allocations
  //
  vk(BindBufferMemory(device,vin,   mem_vin,   0));
  vk(BindBufferMemory(device,vout,  mem_vout,  0));
  vk(BindBufferMemory(device,sorted,mem_sorted,0));
  vk(BindBufferMemory(device,rand,  mem_rand,  0));

  //
  // map and fill the rand buffer with random values
  //
  void * rand_h   = vk_host_alloc(NULL,buffer_out_size);
  void * sorted_h = vk_host_alloc(NULL,buffer_out_size);

  hs_fill_rand(rand_h,buffer_out_count,key_val_words);

  void * rand_map;

  vk(MapMemory(device,mem_rand,0,VK_WHOLE_SIZE,0,&rand_map));

  memcpy(rand_map,rand_h,buffer_out_size);

  vkUnmapMemory(device,mem_rand);

  //
  // create a single command buffer for this thread
  //
  VkCommandBufferAllocateInfo const cmd_buffer_info = {
    .sType              = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
    .pNext              = NULL,
    .commandPool        = cmd_pool,
    .level              = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
    .commandBufferCount = 1
  };

  VkCommandBuffer cb;

  vk(AllocateCommandBuffers(device,
                            &cmd_buffer_info,
                            &cb));

  //
  //
  //
  static VkCommandBufferBeginInfo const cb_begin_info = {
    .sType            = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
    .pNext            = NULL,
    .flags            = 0, // VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
    .pInheritanceInfo = NULL
  };

  struct VkSubmitInfo const submit_info = {
    .sType                = VK_STRUCTURE_TYPE_SUBMIT_INFO,
    .pNext                = NULL,
    .waitSemaphoreCount   = 0,
    .pWaitSemaphores      = NULL,
    .pWaitDstStageMask    = NULL,
    .commandBufferCount   = 1,
    .pCommandBuffers      = &cb,
    .signalSemaphoreCount = 0,
    .pSignalSemaphores    = NULL
  };
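
  //
  // Timing model: each sort submission is bracketed by vkCmdWriteTimestamp
  // calls at query indices 0 and 1, and the tick delta is scaled by the
  // device's VkPhysicalDeviceLimits.timestampPeriod to get nanoseconds.
  //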
  //
  // labels
  //
  fprintf(stdout,
          "Device, "
          "Driver, "
          "Type, "
          "Slab/Linear, "
          "Verified?, "
          "Keys, "
          "Keys Padded In, "
          "Keys Padded Out, "
          "CPU, "
          "Algorithm, "
          "CPU Msecs, "
          "CPU Mkeys/s, "
          "GPU, "
          "Trials, "
          "Avg. Msecs, "
          "Min Msecs, "
          "Max Msecs, "
          "Avg. Mkeys/s, "
          "Max. Mkeys/s\n");

  //
  // test a range
  //
  for (uint32_t count=count_lo; count<=count_hi; count+=count_step)
    {
      //
      // size the vin and vout arrays
      //
      uint32_t count_padded_in, count_padded_out;

      hs_vk_pad(hs,count,&count_padded_in,&count_padded_out);

      //
      // initialize vin with 'count' random keys
      //
      vkBeginCommandBuffer(cb,&cb_begin_info);

      VkBufferCopy const copy_rand = {
        .srcOffset = 0,
        .dstOffset = 0,
        .size      = count * key_val_words * sizeof(uint32_t)
      };

      vkCmdCopyBuffer(cb,
                      rand,
                      vin,
                      1,
                      &copy_rand);

      vk(EndCommandBuffer(cb));

      vk(QueueSubmit(queue,
                     1,
                     &submit_info,
                     VK_NULL_HANDLE)); // FIXME -- put a fence here

      // wait for queue to drain
      vk(QueueWaitIdle(queue));
      vk(ResetCommandBuffer(cb,0));

      //
      // build the sorting command buffer
      //
      vkBeginCommandBuffer(cb,&cb_begin_info);

      //
      // starting timestamp
      //
      vkCmdWriteTimestamp(cb,VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,query_pool,0);

      //
      // bind the vin/vout buffers early
      //
      hs_vk_ds_bind(hs,hs_ds,cb,vin,vout);

      //
      // append sorting commands
      //
      hs_vk_sort(hs,
                 cb,
                 vin,0,0,
                 vout,0,0,
                 count,
                 count_padded_in,
                 count_padded_out,
                 linearize);

      //
      // end timestamp
      //
      vkCmdWriteTimestamp(cb,VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,query_pool,1);

      //
      // end the command buffer
      //
      vk(EndCommandBuffer(cb));

      //
      // measure the min/max/avg execution time
      //
      uint64_t elapsed_ns_min = UINT64_MAX;
      uint64_t elapsed_ns_max = 0;
      uint64_t elapsed_ns_sum = 0;

      for (uint32_t ii=0; ii<warmup+loops; ii++)
        {
          // discard the warmup runs from the stats
          if (ii == warmup)
            {
              elapsed_ns_min = UINT64_MAX;
              elapsed_ns_max = 0;
              elapsed_ns_sum = 0;
            }

          vk(QueueSubmit(queue,
                         1,
                         &submit_info,
                         VK_NULL_HANDLE));

          // wait for the sort to complete
          vk(QueueWaitIdle(queue));

          // read back the two timestamps
          uint64_t timestamps[2];

          vk(GetQueryPoolResults(device,query_pool,
                                 0,2,
                                 sizeof(timestamps),
                                 timestamps,
                                 sizeof(timestamps[0]),
                                 VK_QUERY_RESULT_64_BIT |
                                 VK_QUERY_RESULT_WAIT_BIT));

          uint64_t const elapsed_ns = timestamps[1] - timestamps[0];

          if (elapsed_ns < elapsed_ns_min) elapsed_ns_min = elapsed_ns;
          if (elapsed_ns > elapsed_ns_max) elapsed_ns_max = elapsed_ns;

          elapsed_ns_sum += elapsed_ns;
        }

      vk(ResetCommandBuffer(cb,0));

      //
      // verify
      //
      char const * cpu_algo = NULL;
      double       cpu_ns   = 0.0;
      bool         verified = false;

      if (verify)
        {
          size_t const size_padded_in = count_padded_in * key_val_words * sizeof(uint32_t);

          // copy the GPU-sorted keys from vout to the host-visible sorted buffer
          vkBeginCommandBuffer(cb,&cb_begin_info);

          VkBufferCopy const copy_sorted = {
            .srcOffset = 0,
            .dstOffset = 0,
            .size      = size_padded_in
          };

          vkCmdCopyBuffer(cb,
                          vout,
                          sorted,
                          1,
                          &copy_sorted);

          vk(EndCommandBuffer(cb));

          vk(QueueSubmit(queue,
                         1,
                         &submit_info,
                         VK_NULL_HANDLE));

          vk(QueueWaitIdle(queue));
          vk(ResetCommandBuffer(cb,0));

          // sort the same keys on the CPU
          memcpy(sorted_h,rand_h,size_padded_in);

          cpu_algo = hs_cpu_sort(sorted_h,key_val_words,count_padded_in,&cpu_ns);

          // map the sorted buffer
          void * sorted_map;

          vk(MapMemory(device,mem_sorted,0,VK_WHOLE_SIZE,0,&sorted_map));

          if (!linearize)
            {
              // the GPU output is in slab order -- transpose it before comparing
              hs_transpose_slabs(key_val_words,
                                 1u<<hs_target->config.slab.width_log2,
                                 hs_target->config.slab.height,
                                 sorted_map,
                                 count_padded_in);
            }

          // verify
          verified = memcmp(sorted_h,sorted_map,size_padded_in) == 0;

#ifndef NDEBUG
          if (!verified)
            {
              if (key_val_words == 1)
                {
                  hs_debug_u32(1u<<hs_target->config.slab.width_log2,
                               hs_target->config.slab.height,
                               sorted_h,
                               count);

                  hs_debug_u32(1u<<hs_target->config.slab.width_log2,
                               hs_target->config.slab.height,
                               sorted_map,
                               count);
                }
              else // ulong
                {
                  hs_debug_u64(1u<<hs_target->config.slab.width_log2,
                               hs_target->config.slab.height,
                               sorted_h,
                               count);

                  hs_debug_u64(1u<<hs_target->config.slab.width_log2,
                               hs_target->config.slab.height,
                               sorted_map,
                               count);
                }
            }
#endif

          vkUnmapMemory(device,mem_sorted);
        }

      //
      // REPORT
      //
      float const timestamp_period = phy_device_props.limits.timestampPeriod;

      fprintf(stdout,
              "%s, %u.%u.%u.%u, %s, %s, %s, %8u, %8u, %8u, "
              "CPU, %s, %9.2f, %6.2f, "
              "GPU, %9u, %7.3f, %7.3f, %7.3f, %6.2f, %6.2f\n",
              phy_device_props.deviceName,
              (phy_device_props.driverVersion>>24)&0xFF,
              (phy_device_props.driverVersion>>16)&0xFF,
              (phy_device_props.driverVersion>> 8)&0xFF,
              (phy_device_props.driverVersion    )&0xFF,
              (key_val_words == 1) ? "uint" : "ulong",
              linearize ? "linear" : "slab",
              verify ? (verified ? " OK " : "*FAIL*") : "UNVERIFIED",
              count,
              count_padded_in,
              count_padded_out,
              // CPU
              verify ? cpu_algo                  : "UNVERIFIED",
              verify ? (cpu_ns / 1000000.0)      : 0.0, // milliseconds
              verify ? (1000.0 * count / cpu_ns) : 0.0, // mkeys / sec
              // GPU
              loops,
              timestamp_period * elapsed_ns_sum / 1e6 / loops, // avg msecs
              timestamp_period * elapsed_ns_min / 1e6,         // min msecs
              timestamp_period * elapsed_ns_max / 1e6,         // max msecs
              1000.0 * count * loops / (timestamp_period * elapsed_ns_sum),  // mkeys / sec - avg
              1000.0 * count         / (timestamp_period * elapsed_ns_min)); // mkeys / sec - max
    }

  // reset the descriptor pool
  vk(ResetDescriptorPool(device,desc_pool,0));

  //
  // cleanup
  //

  // release shared HotSort state
  hs_vk_release(hs);

  // destroy the vin/vout buffers (before device memory)
  vkDestroyBuffer(device,vin,   NULL);
  vkDestroyBuffer(device,vout,  NULL);
  vkDestroyBuffer(device,sorted,NULL);
  vkDestroyBuffer(device,rand,  NULL);

  // free device memory
  vkFreeMemory(device,mem_vin,   NULL);
  vkFreeMemory(device,mem_vout,  NULL);
  vkFreeMemory(device,mem_sorted,NULL);
  vkFreeMemory(device,mem_rand,  NULL);

  // free host memory
  vk_host_free(NULL,rand_h);
  vk_host_free(NULL,sorted_h);

  // destroy the descriptor pool
  vkDestroyDescriptorPool(device,desc_pool,NULL);

  // destroy remaining...
  vkDestroyQueryPool(device,query_pool,NULL);
  vkFreeCommandBuffers(device,cmd_pool,1,&cb);
  vkDestroyCommandPool(device,cmd_pool,NULL);

  vk_pipeline_cache_destroy(device,NULL,".vk_cache",pipeline_cache);

  vkDestroyDevice(device,NULL);

#ifndef NDEBUG
  vkDestroyDebugReportCallbackEXT(instance,drc,NULL);
#endif

  vkDestroyInstance(instance,NULL);

  return EXIT_SUCCESS;
}

//
//
//