1 /*
2  * Copyright 2018 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can
5  * be found in the LICENSE file.
6  *
7  */
8 
9 #pragma once
10 
11 //
12 //
13 //
14 
15 #include <cuda.h>
16 #include <stdint.h>
17 #include <stdbool.h>
18 
19 //
20 // Info about the algorithm configuration.
21 //
22 
// NOTE(review): all four arguments are non-const pointers, so they look
// like out-parameters; whether NULL is tolerated is not visible here.
void hs_cuda_info_u32(
  uint32_t * const key_words,        // presumably 32-bit words per key   -- TODO confirm
  uint32_t * const val_words,        // presumably 32-bit words per value -- TODO confirm
  uint32_t * const slab_height,      // presumably the slab height        -- TODO confirm
  uint32_t * const slab_width_log2   // presumably log2 of the slab width -- TODO confirm
);
28 
29 //
30 // Determine what padding will be applied to the input and output
31 // buffers.
32 //
33 // Always check to see if the allocated buffers are large enough.
34 //
35 // count                    : number of keys
36 // count + count_padded_in  : additional keys required for sorting
37 // count + count_padded_out : additional keys required for merging
38 //
39 
// NOTE(review): the two pointer arguments look like out-parameters that
// receive the padding results; whether they hold totals or increments
// should be confirmed against the implementation.
void hs_cuda_pad_u32(
  uint32_t   const count,            // number of keys
  uint32_t * const count_padded_in,  // padding result for the input buffer (sorting)
  uint32_t * const count_padded_out  // padding result for the output buffer (merging)
);
44 
45 //
46 // Sort the keys in the vin buffer and store them in the vout buffer.
47 //
48 // If vout is NULL then the sort will be performed in place.
49 //
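// The implementation assumes the command queue is out-of-order.
//
// NOTE(review): "command queue" is OpenCL wording; this entry point takes
// CUDA streams, so confirm what ordering the implementation expects
// between stream0, stream1 and stream2.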
51 //
52 
// NOTE(review): count_padded_in / count_padded_out are presumably the
// values reported by hs_cuda_pad_u32 for the same count -- confirm.
void hs_cuda_sort_u32(
  uint32_t * const vin,              // keys to sort
  uint32_t * const vout,             // sorted keys; NULL requests an in-place sort
  uint32_t   const count,            // number of keys
  uint32_t   const count_padded_in,  // padding for the input buffer
  uint32_t   const count_padded_out, // padding for the output buffer
  bool       const linearize,        // NOTE(review): meaning not visible here -- confirm
  cudaStream_t     stream0,          // primary stream
  cudaStream_t     stream1,          // auxiliary streams
  cudaStream_t     stream2           // for concurrency
);
63 
64 //
65 //
66 //
67