1 /*
2  * Copyright 2016 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can
5  * be found in the LICENSE file.
6  *
7  */
8 
9 #pragma once
10 
11 //
12 // TODO:
13 //
14 // Add Key-Val sorting support -- easy.
15 //
16 
17 #include <stdio.h>
18 #include <stdint.h>
19 
20 //
21 // All code generation is driven by the specified architectural
22 // details and host platform API.
23 //
24 // In general, the warps-per-block and keys-per-thread are the
25 // critical knobs for tuning performance.
26 //
27 
struct hsg_config
{
  //
  // merge: a { warps, lo, hi } triple for each of the two merge
  // flavors named here (`flip` and `half`), followed by `max_log2`.
  //
  // NOTE(review): units and valid ranges of these values are not
  // visible in this header -- confirm against the code that fills
  // them in.
  //
  struct {
    struct { uint32_t warps, lo, hi; } flip;
    struct { uint32_t warps, lo, hi; } half;

    uint32_t max_log2;
  } merge;

  //
  // block: three `warps_*` settings and four `smem_*` settings.
  //
  // NOTE(review): `smem` presumably abbreviates shared memory and the
  // `_bs` / `_bc` suffixes presumably match the BS / BC kernel op
  // names used elsewhere in this header -- confirm.
  //
  struct {
    uint32_t warps_min, warps_max, warps_mod;

    uint32_t smem_min, smem_quantum;

    uint32_t smem_bs, smem_bc;
  } block;

  //
  // warp: lane count, its log2, and `skpw_bs`.
  //
  // NOTE(review): `lanes_log2` is presumably log2(lanes); nothing in
  // this header enforces that relationship.
  //
  struct {
    uint32_t lanes, lanes_log2, skpw_bs;
  } warp;

  //
  // thread: `regs` and `xtra` counts.
  //
  struct {
    uint32_t regs, xtra;
  } thread;

  //
  // type: size of the sorted type in `words`.
  //
  // NOTE(review): the width of a "word" is not stated here -- confirm.
  //
  struct {
    uint32_t words;
  } type;
};
75 
76 //
77 // HotSort can merge non-power-of-two blocks of warps
78 //
79 
struct hsg_level
{
  uint32_t count;            // networks >= 2

  //
  // Five two-element tables.
  //
  // NOTE(review): what index 0 versus index 1 selects is not stated
  // in this header -- confirm against the code that fills them in.
  //
  uint32_t diffs[2];
  uint32_t diff_masks[2];
  uint32_t evenodds[2];
  uint32_t evenodd_masks[2];
  uint32_t networks[2];

  //
  // A single 64-bit value that can also be read or written as two
  // 32-bit words occupying the same storage.
  //
  union {
    uint64_t b64;
    uint32_t b32a2[2];
  } active;
};
95 
96 //
97 //
98 //
99 
// Merge limit as a power of two: 2^7 = 128, i.e. merge up to 128 warps.
#define MERGE_LEVELS_MAX_LOG2  7
// The same limit as a plain count; derived, so only edit the LOG2 value.
#define MERGE_LEVELS_MAX_SIZE  (1 << MERGE_LEVELS_MAX_LOG2)
102 
103 //
104 // This is computed
105 //
106 
107 struct hsg_merge
108 {
109   uint32_t         offsets [MERGE_LEVELS_MAX_SIZE];
110   uint32_t         networks[MERGE_LEVELS_MAX_SIZE];
111 
112   struct hsg_level levels[MERGE_LEVELS_MAX_LOG2];
113 
114   uint32_t         index;
115 
116   uint32_t         warps;
117 
118   uint32_t         rows_bs;
119   uint32_t         rows_bc;
120 
121   uint32_t         skpw_bc;
122 };
123 
124 //
125 //
126 //
127 
//
// Disabled: everything between `#if 0` and the matching `#endif` is
// discarded by the preprocessor, so none of the declarations below
// are visible to code that includes this header.
//
#if 0

// Capacity, in chars, of the `name` buffer in struct hsg_file.
#define HSG_FILE_NAME_SIZE  80

// Bundles a stdio stream with a prefix string and a fixed-size name
// buffer.
struct hsg_file
{
  FILE       * file;
  char const * prefix;
  char         name[HSG_FILE_NAME_SIZE];
};

//
//
//

// File kinds.  HSG_FILE_TYPE_COUNT is listed last, so its value equals
// the number of kinds that precede it (2).
typedef enum hsg_file_type {

  HSG_FILE_TYPE_HEADER,
  HSG_FILE_TYPE_SOURCE,

  HSG_FILE_TYPE_COUNT

} hsg_file_type;

#endif
153 
154 //
155 //
156 //
157 
//
// X-macro list of every op type.
//
// HSG_OP_EXPAND_ALL() expands to one HSG_OP_EXPAND_X(<name>) per
// entry, in the order written here.  HSG_OP_EXPAND_X must be defined
// by the user of the list before HSG_OP_EXPAND_ALL() is expanded.
//
// enum hsg_op_type is generated from this list, so the position of an
// entry determines its numeric value: do not reorder entries without
// auditing everything that depends on those values.
// HSG_OP_TYPE_COUNT is the final entry, which makes its enum value
// equal to the number of entries that precede it.
//
#define HSG_OP_EXPAND_ALL()                                     \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_EXIT)                             \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_END)                              \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BEGIN)                            \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_ELSE)                             \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_TARGET_BEGIN)                     \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_TARGET_END)                       \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_TRANSPOSE_KERNEL_PROTO)           \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_TRANSPOSE_KERNEL_PREAMBLE)        \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_TRANSPOSE_KERNEL_BODY)            \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_KERNEL_PROTO)                  \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_KERNEL_PREAMBLE)               \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BC_KERNEL_PROTO)                  \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BC_KERNEL_PREAMBLE)               \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_KERNEL_PROTO)                  \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_KERNEL_PREAMBLE)               \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_HM_KERNEL_PROTO)                  \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_HM_KERNEL_PREAMBLE)               \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BX_REG_GLOBAL_LOAD)               \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BX_REG_GLOBAL_STORE)              \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_LEFT)          \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_STORE_LEFT)         \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_LOAD_RIGHT)         \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_REG_GLOBAL_STORE_RIGHT)        \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_FM_MERGE_RIGHT_PRED)              \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_HM_REG_GLOBAL_LOAD)               \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_HM_REG_GLOBAL_STORE)              \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_SLAB_FLIP)                        \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_SLAB_HALF)                        \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_CMP_FLIP)                         \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_CMP_HALF)                         \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_CMP_XCHG)                         \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_REG_SHARED_STORE_V)            \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_REG_SHARED_LOAD_V)             \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BC_REG_SHARED_LOAD_V)             \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BX_REG_SHARED_STORE_LEFT)         \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_REG_SHARED_STORE_RIGHT)        \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_REG_SHARED_LOAD_LEFT)          \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_REG_SHARED_LOAD_RIGHT)         \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BC_REG_GLOBAL_LOAD_LEFT)          \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BLOCK_SYNC)                       \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_FRAC_PRED)                     \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_MERGE_H_PREAMBLE)              \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BC_MERGE_H_PREAMBLE)              \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BX_MERGE_H_PRED)                  \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_BS_ACTIVE_PRED)                   \
                                                                \
  HSG_OP_EXPAND_X(HSG_OP_TYPE_COUNT)
228 
229 //
230 //
231 //
232 
233 #undef  HSG_OP_EXPAND_X
234 #define HSG_OP_EXPAND_X(t) t ,
235 
236 typedef enum hsg_op_type {
237 
238   HSG_OP_EXPAND_ALL()
239 
240 } hsg_op_type;
241 
242 //
243 //
244 //
245 
246 struct hsg_op
247 {
248   hsg_op_type  type;
249 
250   union {
251 
252     struct {
253       int32_t  a;
254       int32_t  b;
255       int32_t  c;
256     };
257 
258     struct {
259       int32_t  n;
260       int32_t  v;
261     };
262 
263     struct {
264       int32_t  m;
265       int32_t  w;
266     };
267 
268   };
269 };
270 
271 //
272 //
273 //
274 
// Array of constant C-string pointers; only declared here, so the
// definition (and its length) lives in another translation unit.
// NOTE(review): presumably holds one name per hsg_op_type value and is
// indexed by that enum -- confirm at the definition.
extern char const * const hsg_op_type_string[];
276 
277 //
278 //
279 //
280 
struct hsg_target
{
  // Read-only C string.
  // NOTE(review): presumably a preprocessor symbol associated with the
  // target -- confirm at the use sites.
  char const * define;

  // Per-target state.  struct hsg_target_state is only named in this
  // header, never defined, so it is opaque to includers.
  struct hsg_target_state * state;
};
286 
287 //
288 // All targets share this prototype
289 //
290 
//
// Pointer to a target function.
//
// Takes a mutable hsg_target, read-only config / merge / ops pointers
// and a `depth` value; returns nothing.
//
// NOTE(review): whether `ops` points at one op or at an array, and
// what `depth` counts, is not visible in this header -- confirm.
//
typedef void (*hsg_target_pfn)(
  struct hsg_target       * const target,
  struct hsg_config const * const config,
  struct hsg_merge  const * const merge,
  struct hsg_op     const * const ops,
  uint32_t                  const depth);
298 //
299 //
300 //
301 
//
// Target function with the same parameter list and return type as
// hsg_target_pfn; declared here, defined elsewhere.
//
// NOTE(review): presumably the debug target -- confirm at the
// definition.
//
extern void hsg_target_debug(
  struct hsg_target       * const target,
  struct hsg_config const * const config,
  struct hsg_merge  const * const merge,
  struct hsg_op     const * const ops,
  uint32_t                  const depth);
309 
//
// Target function with the same parameter list and return type as
// hsg_target_pfn; declared here, defined elsewhere.
//
// NOTE(review): presumably the CUDA target -- confirm at the
// definition.
//
extern void hsg_target_cuda(
  struct hsg_target       * const target,
  struct hsg_config const * const config,
  struct hsg_merge  const * const merge,
  struct hsg_op     const * const ops,
  uint32_t                  const depth);
317 
//
// Target function with the same parameter list and return type as
// hsg_target_pfn; declared here, defined elsewhere.
//
// NOTE(review): presumably the OpenCL target -- confirm at the
// definition.
//
extern void hsg_target_opencl(
  struct hsg_target       * const target,
  struct hsg_config const * const config,
  struct hsg_merge  const * const merge,
  struct hsg_op     const * const ops,
  uint32_t                  const depth);
325 
//
// Target function with the same parameter list and return type as
// hsg_target_pfn; declared here, defined elsewhere.
//
// NOTE(review): presumably the GLSL target -- confirm at the
// definition.
//
extern void hsg_target_glsl(
  struct hsg_target       * const target,
  struct hsg_config const * const config,
  struct hsg_merge  const * const merge,
  struct hsg_op     const * const ops,
  uint32_t                  const depth);
333 //
334 //
335 //
336