1 /*
2  * Copyright 2016 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can
5  * be found in the LICENSE file.
6  *
7  */
8 
9 //
10 //
11 //
12 
13 #include <stdlib.h>
14 #include <stdio.h>
15 #include <string.h>
16 #include <inttypes.h>
17 
18 //
19 //
20 
21 #include "common/macros.h"
22 #include "common/vk/assert_vk.h"
23 #include "common/vk/host_alloc.h"
24 #include "common/vk/cache_vk.h"
25 
26 //
27 //
28 //
29 
30 #include "hs_vk.h"
31 
32 //
33 // Compile-time images of HotSort targets
34 //
35 
36 #include "hs/vk/intel/gen8/u32/hs_target.h"
37 #include "hs/vk/intel/gen8/u64/hs_target.h"
38 
39 #include "hs/vk/nvidia/sm_35/u32/hs_target.h"
40 #include "hs/vk/nvidia/sm_35/u64/hs_target.h"
41 
42 #include "hs/vk/amd/gcn/u32/hs_target.h"
43 #include "hs/vk/amd/gcn/u64/hs_target.h"
44 
45 //
46 //
47 //
48 
//
// CPU sorts, one per key width.  Only the prototypes appear in this
// file; the definitions live elsewhere.  hs_cpu_sort() below picks
// one of the two based on the number of 32-bit words per key.
//
//   a      : array of keys (mutable -- presumably sorted in place,
//            TODO confirm against the definition)
//   count  : number of keys in 'a'
//   cpu_ns : output parameter (presumably the elapsed time in
//            nanoseconds -- TODO confirm against the definition)
//
// NOTE(review): the meaning of the returned string is not visible
// here -- it looks like a label for the algorithm used; confirm.
//
char const * hs_cpu_sort_u32(uint32_t * a, uint32_t const count, double * const cpu_ns);
char const * hs_cpu_sort_u64(uint64_t * a, uint32_t const count, double * const cpu_ns);
51 
52 //
53 //
54 //
55 
//
// Run the CPU sort that matches the key width.
//
//   sorted_h : host array of keys handed to the selected sort
//   hs_words : 32-bit words per key -- 1 selects the u32 sort, any
//              other value selects the u64 sort
//   count    : number of keys
//   cpu_ns   : forwarded untouched to the selected sort
//
// Returns whatever string the selected sort returns.
//
static
char const *
hs_cpu_sort(void     *       sorted_h,
            uint32_t   const hs_words,
            uint32_t   const count,
            double   * const cpu_ns)
{
  return (hs_words == 1)
    ? hs_cpu_sort_u32(sorted_h,count,cpu_ns)
    : hs_cpu_sort_u64(sorted_h,count,cpu_ns);
}
68 
//
// Transpose, in place, every complete slab of 32-bit keys in vout_h.
//
// A slab is hs_width * hs_height consecutive keys.  Each slab is
// read as hs_height rows of hs_width keys and written back as
// hs_width rows of hs_height keys:
//
//   out[col * hs_height + row] = in[row * hs_width + col]
//
//   hs_words  : unused here -- the element type already fixes the
//               key size; kept so the signature matches the u64
//               variant and the dispatcher
//   hs_width  : keys per input row
//   hs_height : input rows per slab
//   vout_h    : keys to transpose
//   count     : number of keys; only count / slab_keys whole slabs
//               are touched, any trailing partial slab is left as is
//
// A zero width or height is a no-op instead of a division by zero.
// The scratch slab comes from the heap and is sized from the
// element type; failure to allocate it is fatal.
//
static
void
hs_transpose_slabs_u32(uint32_t const hs_words,
                       uint32_t const hs_width,
                       uint32_t const hs_height,
                       uint32_t *     vout_h,
                       uint32_t const count)
{
  (void)hs_words;

  uint32_t const slab_keys = hs_width * hs_height;

  // degenerate slab shape -- nothing to transpose
  if (slab_keys == 0)
    return;

  uint32_t slab_count = count / slab_keys;

  // not even one whole slab -- skip the allocation entirely
  if (slab_count == 0)
    return;

  size_t     const slab_size = sizeof(*vout_h) * slab_keys;
  uint32_t * const slab      = malloc(slab_size);

  if (slab == NULL)
    {
      fprintf(stderr,"hs_transpose_slabs_u32: out of memory (%zu bytes)\n",slab_size);
      exit(EXIT_FAILURE);
    }

  while (slab_count-- > 0)
    {
      // snapshot the slab so it can be overwritten in place
      memcpy(slab,vout_h,slab_size);

      for (uint32_t row=0; row<hs_height; row++)
        for (uint32_t col=0; col<hs_width; col++)
          vout_h[col * hs_height + row] = slab[row * hs_width + col];

      vout_h += slab_keys;
    }

  free(slab);
}
93 
//
// Transpose, in place, every complete slab of 64-bit keys in vout_h.
//
// A slab is hs_width * hs_height consecutive keys.  Each slab is
// read as hs_height rows of hs_width keys and written back as
// hs_width rows of hs_height keys:
//
//   out[col * hs_height + row] = in[row * hs_width + col]
//
//   hs_words  : unused here -- the element type already fixes the
//               key size; kept so the signature matches the u32
//               variant and the dispatcher
//   hs_width  : keys per input row
//   hs_height : input rows per slab
//   vout_h    : keys to transpose
//   count     : number of keys; only count / slab_keys whole slabs
//               are touched, any trailing partial slab is left as is
//
// A zero width or height is a no-op instead of a division by zero.
// The scratch slab comes from the heap and is sized from the
// element type; failure to allocate it is fatal.
//
static
void
hs_transpose_slabs_u64(uint32_t const hs_words,
                       uint32_t const hs_width,
                       uint32_t const hs_height,
                       uint64_t *     vout_h,
                       uint32_t const count)
{
  (void)hs_words;

  uint32_t const slab_keys = hs_width * hs_height;

  // degenerate slab shape -- nothing to transpose
  if (slab_keys == 0)
    return;

  uint32_t slab_count = count / slab_keys;

  // not even one whole slab -- skip the allocation entirely
  if (slab_count == 0)
    return;

  size_t     const slab_size = sizeof(*vout_h) * slab_keys;
  uint64_t * const slab      = malloc(slab_size);

  if (slab == NULL)
    {
      fprintf(stderr,"hs_transpose_slabs_u64: out of memory (%zu bytes)\n",slab_size);
      exit(EXIT_FAILURE);
    }

  while (slab_count-- > 0)
    {
      // snapshot the slab so it can be overwritten in place
      memcpy(slab,vout_h,slab_size);

      for (uint32_t row=0; row<hs_height; row++)
        for (uint32_t col=0; col<hs_width; col++)
          vout_h[col * hs_height + row] = slab[row * hs_width + col];

      vout_h += slab_keys;
    }

  free(slab);
}
118 
//
// Transpose the slabs in vout_h using the routine that matches the
// key width: one 32-bit word per key selects the u32 routine, any
// other word count selects the u64 routine.  All arguments are
// forwarded unchanged.
//
static
void
hs_transpose_slabs(uint32_t const hs_words,
                   uint32_t const hs_width,
                   uint32_t const hs_height,
                   void   *       vout_h,
                   uint32_t const count)
{
  if (hs_words != 1)
    {
      hs_transpose_slabs_u64(hs_words,hs_width,hs_height,vout_h,count);
      return;
    }

  hs_transpose_slabs_u32(hs_words,hs_width,hs_height,vout_h,count);
}
132 
133 //
134 //
135 //
136 
137 #ifndef NDEBUG
138 
//
// Debug report callback -- main() installs it through the
// pfnCallback member of VkDebugReportCallbackCreateInfoEXT.  It is
// only compiled when NDEBUG is not defined.
//
// Prints "<flag name>  <layer prefix>  <message>" to stderr when
// 'flags' is exactly one of the report bits listed in the switch
// below.  Any other value of 'flags' -- including the information
// bit, whose case is commented out -- prints nothing.
//
// objectType, object, location, messageCode and pUserData are not
// used.
//
// Always returns VK_FALSE.
//
static
VkBool32
VKAPI_PTR
vk_debug_report_cb(VkDebugReportFlagsEXT      flags,
                   VkDebugReportObjectTypeEXT objectType,
                   uint64_t                   object,
                   size_t                     location,
                   int32_t                    messageCode,
                   const char*                pLayerPrefix,
                   const char*                pMessage,
                   void*                      pUserData)
{
  // name of the matched flag and whether anything matched at all
  char const * flag_str = "";
  bool         is_error = false;

  //
  // Expands to a 'case' that records the stringified flag name and
  // marks the report as printable.  Note that the macro is not
  // #undef'd, so it remains defined after this function.
  //
#define VK_FLAG_CASE_TO_STRING(c)               \
  case c:                                       \
    flag_str = #c;                              \
    is_error = true;                            \
    break

  // equality test against each bit -- not a bitwise mask test
  switch (flags)
    {
      // VK_FLAG_CASE_TO_STRING(VK_DEBUG_REPORT_INFORMATION_BIT_EXT);
      VK_FLAG_CASE_TO_STRING(VK_DEBUG_REPORT_WARNING_BIT_EXT);
      VK_FLAG_CASE_TO_STRING(VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT);
      VK_FLAG_CASE_TO_STRING(VK_DEBUG_REPORT_ERROR_BIT_EXT);
      VK_FLAG_CASE_TO_STRING(VK_DEBUG_REPORT_DEBUG_BIT_EXT);
    }

  // despite its name, is_error is set for every flag handled above
  if (is_error)
    {
      fprintf(stderr,"%s  %s  %s\n",
              flag_str,
              pLayerPrefix,
              pMessage);
    }

  return VK_FALSE;
}
179 
180 #endif
181 
182 //
183 //
184 //
185 
//
// Return the next value of a 32-bit linear congruential generator.
//
// The state is a function-local static that starts at 0xDEADBEEF,
// so the sequence is the same on every run.  Each call advances the
// state with
//
//   seed = seed * 1664525 + 1013904223   (mod 2^32)
//
// and returns the new state.  The static state means the function
// is not reentrant.
//
// The parameter list is spelled (void) so that this is a real
// prototype rather than an old-style unchecked declaration.
//
static
uint32_t
hs_rand_u32(void)
{
  static uint32_t seed = 0xDEADBEEF;

  // Numerical Recipes
  seed = seed * 1664525 + 1013904223;

  return seed;
}
197 
198 //
199 //
200 //
201 
//
// Fill vin_h with count keys of 'words' 32-bit words each.
//
// The enabled branch writes count * words values taken from
// hs_rand_u32().  The two disabled branches are alternative fill
// patterns that zero the buffer and then store an ascending or a
// descending index in the first word of every key.
//
static
void
hs_fill_rand(uint32_t * vin_h, uint32_t const count, uint32_t const words)
{
#if   1
  uint32_t const total_words = count * words;

  for (uint32_t idx=0; idx<total_words; idx++)
    {
      vin_h[idx] = hs_rand_u32();
    }
#elif 0 // in-order
  memset(vin_h,0,count*words*sizeof(uint32_t));
  for (uint32_t ii=0; ii<count; ii++)
    vin_h[ii*words] = ii;
#else   // reverse order
  memset(vin_h,0,count*words*sizeof(uint32_t));
  for (uint32_t ii=0; ii<count; ii++)
    vin_h[ii*words] = count - 1 - ii;
#endif
}
219 
220 
221 //
222 //
223 //
224 
//
// Dump 32-bit keys to stderr, one slab at a time.
//
// For every slab the slab index is printed on its own line,
// followed by hs_height lines of hs_width keys each, in uppercase
// hex.  Keys are consumed sequentially from vout_h.
//
// The number of slabs is count / (hs_width * hs_height) rounded up,
// and every slab is printed in full, so when count is not a
// multiple of the slab size the dump reads past the first 'count'
// keys -- vout_h must be large enough for that.
//
static
void
hs_debug_u32(uint32_t const   hs_width,
             uint32_t const   hs_height,
             uint32_t const * vout_h,
             uint32_t const   count)
{
  uint32_t const   keys_per_slab = hs_width * hs_height;
  uint32_t const   slab_total    = (count + keys_per_slab - 1) / keys_per_slab;
  uint32_t const * next_key      = vout_h;

  for (uint32_t slab_idx=0; slab_idx<slab_total; slab_idx++)
    {
      fprintf(stderr,"%u\n",slab_idx);

      for (uint32_t line=0; line<hs_height; line++)
        {
          for (uint32_t pos=0; pos<hs_width; pos++)
            {
              fprintf(stderr,"%8" PRIX32 " ",*next_key);
              next_key += 1;
            }

          fprintf(stderr,"\n");
        }
    }
}
244 
//
// Dump 64-bit keys to stderr, one slab at a time.
//
// For every slab the slab index is printed on its own line,
// followed by hs_height lines of hs_width keys each, in uppercase
// hex.  Keys are consumed sequentially from vout_h.
//
// The number of slabs is count / (hs_width * hs_height) rounded up,
// and every slab is printed in full, so when count is not a
// multiple of the slab size the dump reads past the first 'count'
// keys -- vout_h must be large enough for that.
//
static
void
hs_debug_u64(uint32_t const   hs_width,
             uint32_t const   hs_height,
             uint64_t const * vout_h,
             uint32_t const   count)
{
  uint32_t const   keys_per_slab = hs_width * hs_height;
  uint32_t const   slab_total    = (count + keys_per_slab - 1) / keys_per_slab;
  uint64_t const * next_key      = vout_h;

  for (uint32_t slab_idx=0; slab_idx<slab_total; slab_idx++)
    {
      fprintf(stderr,"%u\n",slab_idx);

      for (uint32_t line=0; line<hs_height; line++)
        {
          for (uint32_t pos=0; pos<hs_width; pos++)
            {
              fprintf(stderr,"%16" PRIX64 " ",*next_key);
              next_key += 1;
            }

          fprintf(stderr,"\n");
        }
    }
}
264 
265 //
266 //
267 //
268 
269 bool
is_matching_device(VkPhysicalDeviceProperties const * const phy_device_props,struct hs_vk_target const ** const hs_target,uint32_t const vendor_id,uint32_t const device_id,uint32_t const key_val_words)270 is_matching_device(VkPhysicalDeviceProperties const * const phy_device_props,
271                    struct hs_vk_target const *      * const hs_target,
272                    uint32_t                           const vendor_id,
273                    uint32_t                           const device_id,
274                    uint32_t                           const key_val_words)
275 {
276   if ((phy_device_props->vendorID != vendor_id) || (phy_device_props->deviceID != device_id))
277     return false;
278 
279   if (phy_device_props->vendorID == 0x10DE)
280     {
281       //
282       // FIXME -- for now, the kernels in this app are targeting
283       // sm_35+ devices.  You could add some rigorous rejection by
284       // device id here...
285       //
286       if (key_val_words == 1)
287         *hs_target = &hs_nvidia_sm35_u32;
288       else
289         *hs_target = &hs_nvidia_sm35_u64;
290     }
291   else if (phy_device_props->vendorID == 0x8086)
292     {
293       //
294       // FIXME -- for now, the kernels in this app are targeting GEN8+
295       // devices -- this does *not* include variants of GEN9LP+
296       // "Apollo Lake" because that device has a different
297       // architectural "shape" than GEN8 GTx.  You could add some
298       // rigorous rejection by device id here...
299       //
300       if (key_val_words == 1)
301         *hs_target = &hs_intel_gen8_u32;
302       else
303         *hs_target = &hs_intel_gen8_u64;
304     }
305   else if (phy_device_props->vendorID == 0x1002)
306     {
307       //
308       // AMD GCN
309       //
310       if (key_val_words == 1)
311         *hs_target = &hs_amd_gcn_u32;
312       else
313         *hs_target = &hs_amd_gcn_u64;
314     }
315   else
316     {
317       return false;
318     }
319 
320   return true;
321 }
322 
323 //
324 //
325 //
326 
327 uint32_t
vk_find_mem_type_idx(VkPhysicalDeviceMemoryProperties const * phy_device_mem_props,uint32_t const compatible_mem_types,VkMemoryPropertyFlags const required_mem_props,bool const abort)328 vk_find_mem_type_idx(VkPhysicalDeviceMemoryProperties const * phy_device_mem_props,
329                      uint32_t                         const   compatible_mem_types,
330                      VkMemoryPropertyFlags            const   required_mem_props,
331                      bool                             const   abort)
332 {
333   //
334   // FIXME -- jump between indices in the memoryTypeBits mask
335   //
336   uint32_t const count = phy_device_mem_props->memoryTypeCount;
337 
338   for (uint32_t index=0; index<count; index++)
339     {
340       // acceptable memory type for this resource?
341       if ((compatible_mem_types & (1<<index)) == 0)
342         continue;
343 
344       // otherwise, find first match...
345       VkMemoryPropertyFlags const common_props =
346         phy_device_mem_props->memoryTypes[index].propertyFlags & required_mem_props;
347 
348       if (common_props == required_mem_props)
349         return index;
350     }
351 
352   if (abort)
353     {
354       fprintf(stderr,"Memory type not found: %X\n",required_mem_props);
355       exit(EXIT_FAILURE);
356     }
357 
358   return UINT32_MAX;
359 }
360 
361 //
362 //
363 //
364 
//
// Default trial counts for the benchmark.  main() uses them when
// the loops / warmup command line arguments are absent.
//
//   HS_BENCH_LOOPS  : number of timed submissions
//   HS_BENCH_WARMUP : number of untimed submissions that run first
//
// Builds with NDEBUG defined run 100 warmup and 100 timed
// submissions; all other builds run a single timed submission with
// no warmup.
//
#ifdef NDEBUG
#define HS_BENCH_LOOPS   100
#define HS_BENCH_WARMUP  100
#else
#define HS_BENCH_LOOPS   1
#define HS_BENCH_WARMUP  0
#endif
372 
373 //
374 //
375 //
376 
377 int
main(int argc,char const * argv[])378 main(int argc, char const * argv[])
379 {
380   //
381   // select the target by vendor and device id
382   //
383   uint32_t const vendor_id     = (argc <= 1) ? UINT32_MAX : strtoul(argv[1],NULL,16);
384   uint32_t const device_id     = (argc <= 2) ? UINT32_MAX : strtoul(argv[2],NULL,16);
385   uint32_t const key_val_words = (argc <= 3) ? 1          : strtoul(argv[3],NULL,0);
386 
387   if ((key_val_words != 1) && (key_val_words != 2))
388     {
389       fprintf(stderr,"Key/Val Words must be 1 or 2\n");
390       exit(EXIT_FAILURE);
391     }
392 
393   //
394   // create a Vulkan instances
395   //
396   VkApplicationInfo const app_info = {
397       .sType                 = VK_STRUCTURE_TYPE_APPLICATION_INFO,
398       .pNext                 = NULL,
399       .pApplicationName      = "Google HotSort Bench",
400       .applicationVersion    = 0,
401       .pEngineName           = "Google HotSort Gen",
402       .engineVersion         = 0,
403       .apiVersion            = VK_API_VERSION_1_1
404   };
405 
406   char const * const instance_enabled_layers[] = {
407     "VK_LAYER_LUNARG_standard_validation"
408   };
409 
410   char const * const instance_enabled_extensions[] = {
411     VK_EXT_DEBUG_REPORT_EXTENSION_NAME
412   };
413 
414   uint32_t const instance_enabled_layer_count =
415 #ifndef NDEBUG
416     ARRAY_LENGTH_MACRO(instance_enabled_layers)
417 #else
418     0
419 #endif
420     ;
421 
422   uint32_t const instance_enabled_extension_count =
423 #ifndef NDEBUG
424     ARRAY_LENGTH_MACRO(instance_enabled_extensions)
425 #else
426     0
427 #endif
428     ;
429 
430   VkInstanceCreateInfo const instance_info = {
431     .sType                   = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
432     .pNext                   = NULL,
433     .flags                   = 0,
434     .pApplicationInfo        = &app_info,
435     .enabledLayerCount       = instance_enabled_layer_count,
436     .ppEnabledLayerNames     = instance_enabled_layers,
437     .enabledExtensionCount   = instance_enabled_extension_count,
438     .ppEnabledExtensionNames = instance_enabled_extensions
439   };
440 
441   VkInstance instance;
442 
443   vk(CreateInstance(&instance_info,NULL,&instance));
444 
445   //
446   //
447   //
448 #ifndef NDEBUG
449   PFN_vkCreateDebugReportCallbackEXT vkCreateDebugReportCallbackEXT =
450     (PFN_vkCreateDebugReportCallbackEXT)
451     vkGetInstanceProcAddr(instance,"vkCreateDebugReportCallbackEXT");
452 
453   PFN_vkDestroyDebugReportCallbackEXT vkDestroyDebugReportCallbackEXT =
454     (PFN_vkDestroyDebugReportCallbackEXT)
455     vkGetInstanceProcAddr(instance,"vkDestroyDebugReportCallbackEXT");
456 
457   struct VkDebugReportCallbackCreateInfoEXT const drcci = {
458     .sType       = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT,
459     .pNext       = NULL,
460     .flags       = UINT32_MAX, // enable everything for now
461     .pfnCallback = vk_debug_report_cb,
462     .pUserData   = NULL
463   };
464 
465   VkDebugReportCallbackEXT drc;
466 
467   vk(CreateDebugReportCallbackEXT(instance,
468                                   &drcci,
469                                   NULL,
470                                   &drc));
471 #endif
472 
473   //
474   // acquire all physical devices and select a match
475   //
476   uint32_t phy_device_count;
477 
478   vk(EnumeratePhysicalDevices(instance,
479                               &phy_device_count,
480                               NULL));
481 
482   VkPhysicalDevice * phy_devices = vk_host_alloc(NULL,phy_device_count * sizeof(*phy_devices));
483 
484   vk(EnumeratePhysicalDevices(instance,
485                               &phy_device_count,
486                               phy_devices));
487 
488   VkPhysicalDevice           phy_device = VK_NULL_HANDLE;
489   VkPhysicalDeviceProperties phy_device_props;
490 
491   struct hs_vk_target const * hs_target;
492 
493   for (uint32_t ii=0; ii<phy_device_count; ii++)
494     {
495       VkPhysicalDeviceProperties tmp;
496 
497       vkGetPhysicalDeviceProperties(phy_devices[ii],&tmp);
498 
499       bool const is_match = is_matching_device(&tmp,
500                                                &hs_target,
501                                                vendor_id,
502                                                device_id,
503                                                key_val_words);
504 
505       fprintf(stdout,"%c %4X : %4X : %s\n",
506               is_match ? '*' : ' ',
507               tmp.vendorID,
508               tmp.deviceID,
509               tmp.deviceName);
510 
511       if (is_match)
512         {
513           phy_device = phy_devices[ii];
514           memcpy(&phy_device_props,&tmp,sizeof(tmp));
515         }
516 
517     }
518 
519   if (phy_device == VK_NULL_HANDLE)
520     {
521       fprintf(stderr,"Device %4X:%4X not found.\n",
522               vendor_id & 0xFFFF,
523               device_id & 0xFFFF);
524 
525       return EXIT_FAILURE;
526     }
527 
528   vk_host_free(NULL,phy_devices);
529 
530   //
531   // Get rest of command line
532   //
533   uint32_t const slab_size    = hs_target->config.slab.height << hs_target->config.slab.width_log2;
534 
535   uint32_t const count_lo     = (argc <=  4) ? slab_size       : strtoul(argv[ 4],NULL,0);
536   uint32_t const count_hi     = (argc <=  5) ? count_lo        : strtoul(argv[ 5],NULL,0);
537   uint32_t const count_step   = (argc <=  6) ? count_lo        : strtoul(argv[ 6],NULL,0);
538   uint32_t const loops        = (argc <=  7) ? HS_BENCH_LOOPS  : strtoul(argv[ 7],NULL,0);
539   uint32_t const warmup       = (argc <=  8) ? HS_BENCH_WARMUP : strtoul(argv[ 8],NULL,0);
540   bool     const linearize    = (argc <=  9) ? true            : strtoul(argv[ 9],NULL,0) != 0;
541   bool     const verify       = (argc <= 10) ? true            : strtoul(argv[10],NULL,0) != 0;
542 
543   //
544   // get the physical device's memory props
545   //
546   VkPhysicalDeviceMemoryProperties phy_device_mem_props;
547 
548   vkGetPhysicalDeviceMemoryProperties(phy_device,&phy_device_mem_props);
549 
550   //
551   // get queue properties
552   //
553   VkQueueFamilyProperties queue_fam_props[2];
554   uint32_t                queue_fam_count = ARRAY_LENGTH_MACRO(queue_fam_props);
555 
556   vkGetPhysicalDeviceQueueFamilyProperties(phy_device,&queue_fam_count,queue_fam_props);
557 
558   //
559   // create device
560   //
561   float const queue_priorities[] = { 1.0f };
562 
563   VkDeviceQueueCreateInfo const queue_info = {
564     .sType            = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
565     .pNext            = NULL,
566     .flags            = 0,
567     .queueFamilyIndex = 0,
568     .queueCount       = 1,
569     .pQueuePriorities = queue_priorities
570   };
571 
572   //
573   // clumsily enable AMD GCN shader info extension
574   //
575   char const * const device_enabled_extensions[] = {
576 #if defined( HS_VK_VERBOSE_STATISTICS_AMD ) || defined( HS_VK_VERBOSE_DISASSEMBLY_AMD )
577     VK_AMD_SHADER_INFO_EXTENSION_NAME
578 #endif
579   };
580 
581   uint32_t device_enabled_extension_count = 0;
582 
583 #if defined( HS_VK_VERBOSE_STATISTICS_AMD ) || defined( HS_VK_VERBOSE_DISASSEMBLY_AMD )
584   if (phy_device_props.vendorID == 0x1002)
585     device_enabled_extension_count = 1;
586 #endif
587 
588   //
589   //
590   //
591   VkPhysicalDeviceFeatures device_features = { false };
592 
593   if (key_val_words == 2)
594     {
595       device_features.shaderInt64 = true;
596     }
597 
598   VkDeviceCreateInfo const device_info = {
599     .sType                   = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
600     .pNext                   = NULL,
601     .flags                   = 0,
602     .queueCreateInfoCount    = 1,
603     .pQueueCreateInfos       = &queue_info,
604     .enabledLayerCount       = 0,
605     .ppEnabledLayerNames     = NULL,
606     .enabledExtensionCount   = device_enabled_extension_count,
607     .ppEnabledExtensionNames = device_enabled_extensions,
608     .pEnabledFeatures        = &device_features
609   };
610 
611   VkDevice device;
612 
613   vk(CreateDevice(phy_device,&device_info,NULL,&device));
614 
615   //
616   // get a queue
617   //
618   VkQueue queue;
619 
620   vkGetDeviceQueue(device,0,0,&queue);
621 
622   //
623   // get the pipeline cache
624   //
625   VkPipelineCache pipeline_cache;
626 
627   vk_pipeline_cache_create(device,NULL,".vk_cache",&pipeline_cache);
628 
629   //
630   // create a descriptor set pool
631   //
632   VkDescriptorPoolSize const dps[] = {
633     {
634       .type            = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
635       .descriptorCount = 2
636     }
637   };
638 
639   VkDescriptorPoolCreateInfo const dpci = {
640     .sType         = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
641     .pNext         = NULL,
642     .flags         = 0, // VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT,
643     .maxSets       = 1,
644     .poolSizeCount = ARRAY_LENGTH_MACRO(dps),
645     .pPoolSizes    = dps
646   };
647 
648   VkDescriptorPool desc_pool;
649 
650   vk(CreateDescriptorPool(device,
651                           &dpci,
652                           NULL, // allocator
653                           &desc_pool));
654 
655   //
656   // create HotSort device instance
657   //
658   struct hs_vk * hs = hs_vk_create(hs_target,
659                                    device,
660                                    NULL,
661                                    pipeline_cache);
662   //
663   // create a HotSort descriptor set for this thread
664   //
665   VkDescriptorSet hs_ds = hs_vk_ds_alloc(hs,desc_pool);
666 
667   //
668   // create a command pool for this thread
669   //
670   VkCommandPoolCreateInfo const cmd_pool_info = {
671     .sType            = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
672     .pNext            = NULL,
673     .flags            = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
674     .queueFamilyIndex = 0,
675   };
676 
677   VkCommandPool cmd_pool;
678 
679   vk(CreateCommandPool(device,
680                        &cmd_pool_info,
681                        NULL,
682                        &cmd_pool));
683 
684   //
685   // create a query pool for benchmarking
686   //
687   static VkQueryPoolCreateInfo const query_pool_info = {
688     .sType              = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
689     .pNext              = NULL,
690     .flags              = 0,
691     .queryType          = VK_QUERY_TYPE_TIMESTAMP,
692     .queryCount         = 4,
693     .pipelineStatistics = 0
694   };
695 
696   VkQueryPool query_pool;
697 
698   vk(CreateQueryPool(device,
699                      &query_pool_info,
700                      NULL,
701                      &query_pool));
702 
703   //
704   // create two big buffers -- buffer_out_count is always the largest
705   //
706   uint32_t buffer_in_count, buffer_out_count;
707 
708   hs_vk_pad(hs,count_hi,&buffer_in_count,&buffer_out_count);
709 
710   size_t const buffer_out_size = buffer_out_count * key_val_words * sizeof(uint32_t);
711 
712   VkBufferCreateInfo bci = {
713     .sType                 = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
714     .pNext                 = NULL,
715     .flags                 = 0,
716     .size                  = buffer_out_size,
717     .usage                 = 0,
718     .sharingMode           = VK_SHARING_MODE_EXCLUSIVE,
719     .queueFamilyIndexCount = 0,
720     .pQueueFamilyIndices   = NULL
721   };
722 
723   VkBuffer vin, vout, sorted, rand;
724 
725   bci.usage =
726     VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
727     VK_BUFFER_USAGE_TRANSFER_DST_BIT,
728 
729   vk(CreateBuffer(device,
730                   &bci,
731                   NULL,
732                   &vin));
733 
734   vk(CreateBuffer(device,
735                   &bci,
736                   NULL,
737                   &sorted));
738 
739   bci.usage =
740     VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
741     VK_BUFFER_USAGE_TRANSFER_SRC_BIT   |
742     VK_BUFFER_USAGE_TRANSFER_DST_BIT;
743 
744   vk(CreateBuffer(device,
745                   &bci,
746                   NULL,
747                   &vout));
748 
749   bci.usage =
750     VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
751     VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
752 
753   vk(CreateBuffer(device,
754                   &bci,
755                   NULL,
756                   &rand));
757 
758   //
759   // get memory requirements for one of the buffers
760   //
761   VkMemoryRequirements mr_vin, mr_vout, mr_sorted, mr_rand;
762 
763   vkGetBufferMemoryRequirements(device,vin, &mr_vin);
764   vkGetBufferMemoryRequirements(device,vout,&mr_vout);
765 
766   vkGetBufferMemoryRequirements(device,rand,&mr_sorted);
767   vkGetBufferMemoryRequirements(device,rand,&mr_rand);
768 
769   //
770   // allocate memory for the buffers
771   //
772   // for simplicity, all buffers are the same size
773   //
774   // vin and vout have the same usage
775   //
776   VkMemoryAllocateInfo const mai_vin_vout = {
777     .sType           = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
778     .pNext           = NULL,
779     .allocationSize  = mr_vin.size,
780     .memoryTypeIndex = vk_find_mem_type_idx(&phy_device_mem_props,
781                                             mr_vin.memoryTypeBits,
782                                             VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
783                                             true)
784   };
785 
786   VkMemoryAllocateInfo const mai_sorted_rand = {
787     .sType           = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
788     .pNext           = NULL,
789     .allocationSize  = mr_sorted.size,
790     .memoryTypeIndex = vk_find_mem_type_idx(&phy_device_mem_props,
791                                             mr_sorted.memoryTypeBits,
792                                             VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
793                                             VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
794                                             true)
795   };
796 
797   VkDeviceMemory mem_vin, mem_vout, mem_sorted, mem_rand;
798 
799   vk(AllocateMemory(device,
800                     &mai_vin_vout,
801                     NULL,
802                     &mem_vin));
803 
804   vk(AllocateMemory(device,
805                     &mai_vin_vout,
806                     NULL,
807                     &mem_vout));
808 
809   vk(AllocateMemory(device,
810                     &mai_sorted_rand,
811                     NULL,
812                     &mem_sorted));
813 
814   vk(AllocateMemory(device,
815                     &mai_sorted_rand,
816                     NULL,
817                     &mem_rand));
818 
819   //
820   // bind backing memory to the virtual allocations
821   //
822   vk(BindBufferMemory(device,vin,   mem_vin,   0));
823   vk(BindBufferMemory(device,vout,  mem_vout,  0));
824 
825   vk(BindBufferMemory(device,sorted,mem_sorted,0));
826   vk(BindBufferMemory(device,rand,  mem_rand,  0));
827 
828   //
829   // map and fill the rand buffer with random values
830   //
831   void * rand_h   = vk_host_alloc(NULL,buffer_out_size);
832   void * sorted_h = vk_host_alloc(NULL,buffer_out_size);
833 
834   hs_fill_rand(rand_h,buffer_out_count,key_val_words);
835 
836   void * rand_map;
837 
838   vk(MapMemory(device,mem_rand,0,VK_WHOLE_SIZE,0,&rand_map));
839 
840   memcpy(rand_map,rand_h,buffer_out_size);
841 
842   vkUnmapMemory(device,mem_rand);
843 
844   //
845   // create a single command buffer for this thread
846   //
847   VkCommandBufferAllocateInfo const cmd_buffer_info = {
848     .sType              = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
849     .pNext              = NULL,
850     .commandPool        = cmd_pool,
851     .level              = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
852     .commandBufferCount = 1
853   };
854 
855   VkCommandBuffer cb;
856 
857   vk(AllocateCommandBuffers(device,
858                             &cmd_buffer_info,
859                             &cb));
860 
861   //
862   //
863   //
864   static VkCommandBufferBeginInfo const cb_begin_info = {
865     .sType            = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
866     .pNext            = NULL,
867     .flags            = 0, // VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
868     .pInheritanceInfo = NULL
869   };
870 
871   struct VkSubmitInfo const submit_info = {
872     .sType                = VK_STRUCTURE_TYPE_SUBMIT_INFO,
873     .pNext                = NULL,
874     .waitSemaphoreCount   = 0,
875     .pWaitSemaphores      = NULL,
876     .pWaitDstStageMask    = NULL,
877     .commandBufferCount   = 1,
878     .pCommandBuffers      = &cb,
879     .signalSemaphoreCount = 0,
880     .pSignalSemaphores    = NULL
881   };
882 
883   //
884   // labels
885   //
886   fprintf(stdout,
887           "Device, "
888           "Driver, "
889           "Type, "
890           "Slab/Linear, "
891           "Verified?, "
892           "Keys, "
893           "Keys Padded In, "
894           "Keys Padded Out, "
895           "CPU, "
896           "Algorithm, "
897           "CPU Msecs, "
898           "CPU Mkeys/s, "
899           "GPU, "
900           "Trials, "
901           "Avg. Msecs, "
902           "Min Msecs, "
903           "Max Msecs, "
904           "Avg. Mkeys/s, "
905           "Max. Mkeys/s\n");
906 
907   //
908   // test a range
909   //
910   for (uint32_t count=count_lo; count<=count_hi; count+=count_step)
911     {
912       //
913       // size the vin and vout arrays
914       //
915       uint32_t count_padded_in, count_padded_out;
916 
917       hs_vk_pad(hs,count,&count_padded_in,&count_padded_out);
918 
919       //
920       // initialize vin with 'count' random keys
921       //
922       vkBeginCommandBuffer(cb,&cb_begin_info);
923 
924       VkBufferCopy const copy_rand = {
925         .srcOffset = 0,
926         .dstOffset = 0,
927         .size      = count * key_val_words * sizeof(uint32_t)
928       };
929 
930       vkCmdCopyBuffer(cb,
931                       rand,
932                       vin,
933                       1,
934                       &copy_rand);
935 
936       vk(EndCommandBuffer(cb));
937 
938       vk(QueueSubmit(queue,
939                      1,
940                      &submit_info,
941                      VK_NULL_HANDLE)); // FIXME -- put a fence here
942 
943       // wait for queue to drain
944       vk(QueueWaitIdle(queue));
945       vk(ResetCommandBuffer(cb,0));
946 
947       //
948       // build the sorting command buffer
949       //
950       vkBeginCommandBuffer(cb,&cb_begin_info);
951 
952       //
953       // starting timestamp
954       //
955       vkCmdWriteTimestamp(cb,VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,query_pool,0);
956 
957       //
958       // bind the vin/vout buffers early
959       //
960       hs_vk_ds_bind(hs,hs_ds,cb,vin,vout);
961 
962       //
963       // append sorting commands
964       //
965       hs_vk_sort(hs,
966                  cb,
967                  vin,0,0,
968                  vout,0,0,
969                  count,
970                  count_padded_in,
971                  count_padded_out,
972                  linearize);
973 
974       //
975       // end timestamp
976       //
977       vkCmdWriteTimestamp(cb,VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,query_pool,1);
978 
979       //
980       // end the command buffer
981       //
982       vk(EndCommandBuffer(cb));
983 
984       //
985       // measure the min/max/avg execution time
986       //
987       uint64_t elapsed_ns_min = UINT64_MAX;
988       uint64_t elapsed_ns_max = 0;
989       uint64_t elapsed_ns_sum = 0;
990 
991       for (uint32_t ii=0; ii<warmup+loops; ii++)
992         {
993           if (ii == warmup)
994             {
995               elapsed_ns_min = UINT64_MAX;
996               elapsed_ns_max = 0;
997               elapsed_ns_sum = 0;
998             }
999 
1000           vk(QueueSubmit(queue,
1001                          1,
1002                          &submit_info,
1003                          VK_NULL_HANDLE)); // FIXME -- put a fence here
1004 
1005           // wait for queue to drain
1006           vk(QueueWaitIdle(queue));
1007 
1008           // get results
1009           uint64_t timestamps[2];
1010 
1011           vk(GetQueryPoolResults(device,query_pool,
1012                                  0,ARRAY_LENGTH_MACRO(timestamps),
1013                                  sizeof(timestamps),
1014                                  timestamps,
1015                                  sizeof(timestamps[0]),
1016                                  VK_QUERY_RESULT_64_BIT |
1017                                  VK_QUERY_RESULT_WAIT_BIT));
1018 
1019           uint64_t const t = timestamps[1] - timestamps[0];
1020 
1021           elapsed_ns_min  = MIN_MACRO(elapsed_ns_min,t);
1022           elapsed_ns_max  = MAX_MACRO(elapsed_ns_max,t);
1023           elapsed_ns_sum += t;
1024         }
1025 
1026       vk(ResetCommandBuffer(cb,0));
1027 
1028       //
1029       // copy the results back and, optionally, verify them
1030       //
1031       char const * cpu_algo = NULL;
1032       double       cpu_ns   = 0.0;
1033       bool         verified = false;
1034 
1035       if (verify)
1036         {
1037           size_t const size_padded_in = count_padded_in * key_val_words * sizeof(uint32_t);
1038 
1039           vkBeginCommandBuffer(cb,&cb_begin_info);
1040 
1041           VkBufferCopy const copy_vout = {
1042             .srcOffset = 0,
1043             .dstOffset = 0,
1044             .size      = size_padded_in
1045           };
1046 
1047           vkCmdCopyBuffer(cb,
1048                           vout,
1049                           sorted,
1050                           1,
1051                           &copy_vout);
1052 
1053           vk(EndCommandBuffer(cb));
1054 
1055           vk(QueueSubmit(queue,
1056                          1,
1057                          &submit_info,
1058                          VK_NULL_HANDLE)); // FIXME -- put a fence here
1059 
1060           // wait for queue to drain
1061           vk(QueueWaitIdle(queue));
1062           vk(ResetCommandBuffer(cb,0));
1063 
1064           size_t const size_sorted_h = count * key_val_words * sizeof(uint32_t);
1065 
1066           // copy and sort random data
1067           memcpy(sorted_h,rand_h,size_sorted_h);
1068           memset((uint8_t*)sorted_h + size_sorted_h,-1,size_padded_in-size_sorted_h);
1069 
1070           cpu_algo = hs_cpu_sort(sorted_h,key_val_words,count_padded_in,&cpu_ns);
1071 
1072           void * sorted_map;
1073 
1074           vk(MapMemory(device,mem_sorted,0,VK_WHOLE_SIZE,0,&sorted_map));
1075 
1076           if (!linearize) {
1077             hs_transpose_slabs(key_val_words,
1078                                1u<<hs_target->config.slab.width_log2,
1079                                hs_target->config.slab.height,
1080                                sorted_map,
1081                                count_padded_in);
1082           }
1083 
1084           // verify
1085           verified = memcmp(sorted_h,sorted_map,size_padded_in) == 0;
1086 
1087 #ifndef NDEBUG
1088           if (!verified)
1089             {
1090               if (key_val_words == 1)
1091                 {
1092                   hs_debug_u32(1u<<hs_target->config.slab.width_log2,
1093                                hs_target->config.slab.height,
1094                                sorted_h,
1095                                count);
1096 
1097                   hs_debug_u32(1u<<hs_target->config.slab.width_log2,
1098                                hs_target->config.slab.height,
1099                                sorted_map,
1100                                count);
1101                 }
1102               else // ulong
1103                 {
1104                   hs_debug_u64(1u<<hs_target->config.slab.width_log2,
1105                                hs_target->config.slab.height,
1106                                sorted_h,
1107                                count);
1108 
1109                   hs_debug_u64(1u<<hs_target->config.slab.width_log2,
1110                                hs_target->config.slab.height,
1111                                sorted_map,
1112                                count);
1113                 }
1114             }
1115 #endif
1116 
1117           vkUnmapMemory(device,mem_sorted);
1118         }
1119 
1120       //
1121       // REPORT
1122       //
1123       float const timestamp_period = phy_device_props.limits.timestampPeriod;
1124 
1125       fprintf(stdout,"%s, %u.%u.%u.%u, %s, %s, %s, %8u, %8u, %8u, CPU, %s, %9.2f, %6.2f, GPU, %9u, %7.3f, %7.3f, %7.3f, %6.2f, %6.2f\n",
1126               phy_device_props.deviceName,
1127               (phy_device_props.driverVersion>>24)&0xFF,
1128               (phy_device_props.driverVersion>>16)&0xFF,
1129               (phy_device_props.driverVersion>> 8)&0xFF,
1130               (phy_device_props.driverVersion    )&0xFF,
1131               (key_val_words == 1) ? "uint" : "ulong",
1132               linearize ? "linear" : "slab",
1133               verify ? (verified ? "  OK  " : "*FAIL*") : "UNVERIFIED",
1134               count,
1135               count_padded_in,
1136               count_padded_out,
1137               // CPU
1138               verify ? cpu_algo : "UNVERIFIED",
1139               verify ? (cpu_ns / 1000000.0)      : 0.0,                      // milliseconds
1140               verify ? (1000.0 * count / cpu_ns) : 0.0,                      // mkeys / sec
1141               // GPU
1142               loops,
1143               timestamp_period * elapsed_ns_sum / 1e6 / loops,               // avg msecs
1144               timestamp_period * elapsed_ns_min / 1e6,                       // min msecs
1145               timestamp_period * elapsed_ns_max / 1e6,                       // max msecs
1146               1000.0 * count * loops / (timestamp_period * elapsed_ns_sum),  // mkeys / sec - avg
1147               1000.0 * count         / (timestamp_period * elapsed_ns_min)); // mkeys / sec - max
1148     }
1149 
1150   // reset the descriptor pool
1151   vk(ResetDescriptorPool(device,desc_pool,0));
1152 
1153   //
1154   // cleanup
1155   //
1156 
1157   // release shared HotSort state
1158   hs_vk_release(hs);
1159 
1160   // destroy the vin/vout buffers (before device memory)
1161   vkDestroyBuffer(device,vin,   NULL);
1162   vkDestroyBuffer(device,vout,  NULL);
1163   vkDestroyBuffer(device,sorted,NULL);
1164   vkDestroyBuffer(device,rand,  NULL);
1165 
1166   // free device memory
1167   vkFreeMemory(device,mem_vin,   NULL);
1168   vkFreeMemory(device,mem_vout,  NULL);
1169   vkFreeMemory(device,mem_sorted,NULL);
1170   vkFreeMemory(device,mem_rand,  NULL);
1171 
1172   // free host memory
1173   vk_host_free(NULL,rand_h);
1174   vk_host_free(NULL,sorted_h);
1175 
1176   // destroy the descriptor pool
1177   vkDestroyDescriptorPool(device,desc_pool,NULL);
1178 
1179   // destroy remaining...
1180   vkDestroyQueryPool(device,query_pool,NULL);
1181   vkFreeCommandBuffers(device,cmd_pool,1,&cb);
1182   vkDestroyCommandPool(device,cmd_pool,NULL);
1183 
1184   vk_pipeline_cache_destroy(device,NULL,".vk_cache",pipeline_cache);
1185 
1186   vkDestroyDevice(device,NULL);
1187 
1188 #ifndef NDEBUG
1189   vkDestroyDebugReportCallbackEXT(instance,drc,NULL);
1190 #endif
1191 
1192   vkDestroyInstance(instance,NULL);
1193 
1194   return EXIT_SUCCESS;
1195 }
1196 
1197 //
1198 //
1199 //
1200