1//
2// Copyright 2016 Google Inc.
3//
4// Use of this source code is governed by a BSD-style
5// license that can be found in the LICENSE file.
6//
7
8// target-specific config
9#include "hs_config.h"
10
11// arch/target-specific macros
12#include "hs_cl_macros.h"
13
14//
15//
16//
17
//
// Kernel declared through HS_BS_KERNEL_PROTO(1, 0). The macro, and the
// meaning of its two arguments, come from the included headers and are
// not visible in this file.
//
// Shape of the body, exactly as written below:
//   1. read eight keys into r1..r8 with HS_SLAB_GLOBAL_LOAD(vin, 0..7)
//   2. apply 19 HS_CMP_XCHG steps to those eight registers
//   3. run four rounds; each round is one HS_CMP_FLIP block (preamble
//      argument 1, 3, 7, then 15), followed by HS_CMP_HALF blocks whose
//      preamble argument halves down to 1 (none in the first round),
//      followed by the same 12 HS_CMP_XCHG steps
//   4. write r1..r8 out with HS_SLAB_GLOBAL_STORE(0..7, ...)
//
// NOTE(review): this presumably sorts one "slab" of keys with a
// bitonic-style network; the FLIP/HALF macros presumably exchange keys
// with other lanes -- confirm against hs_cl_macros.h.
//
// Every step reads and rewrites the same r1..r8 registers, so the
// statement order must be preserved.
//
18HS_BS_KERNEL_PROTO(1, 0)
19{
20  HS_SLAB_GLOBAL_PREAMBLE();
  // Read the eight keys from vin (slots 0..7) into r1..r8.
21  HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0);
22  HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1);
23  HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2);
24  HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3);
25  HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4);
26  HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5);
27  HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6);
28  HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7);
  // 19 compare-exchange steps over r1..r8 (presumably an 8-input sorting
  // network -- TODO confirm the ordering HS_CMP_XCHG produces).
29  HS_CMP_XCHG(r1, r5);
30  HS_CMP_XCHG(r2, r6);
31  HS_CMP_XCHG(r3, r7);
32  HS_CMP_XCHG(r4, r8);
33  HS_CMP_XCHG(r1, r3);
34  HS_CMP_XCHG(r2, r4);
35  HS_CMP_XCHG(r5, r7);
36  HS_CMP_XCHG(r6, r8);
37  HS_CMP_XCHG(r3, r5);
38  HS_CMP_XCHG(r4, r6);
39  HS_CMP_XCHG(r1, r2);
40  HS_CMP_XCHG(r3, r4);
41  HS_CMP_XCHG(r5, r6);
42  HS_CMP_XCHG(r7, r8);
43  HS_CMP_XCHG(r2, r5);
44  HS_CMP_XCHG(r4, r7);
45  HS_CMP_XCHG(r2, r3);
46  HS_CMP_XCHG(r4, r5);
47  HS_CMP_XCHG(r6, r7);
  // Round 1: flip with preamble argument 1. Registers are paired from the
  // outside in: (r1,r8), (r2,r7), (r3,r6), (r4,r5).
48  {
49    HS_SLAB_FLIP_PREAMBLE(1);
50    HS_CMP_FLIP(0, r1, r8);
51    HS_CMP_FLIP(1, r2, r7);
52    HS_CMP_FLIP(2, r3, r6);
53    HS_CMP_FLIP(3, r4, r5);
54  }
  // 12 compare-exchange steps that close every round: odd registers
  // (r1,r3,r5,r7), then even registers (r2,r4,r6,r8), then neighbours.
55  HS_CMP_XCHG(r1, r5);
56  HS_CMP_XCHG(r3, r7);
57  HS_CMP_XCHG(r1, r3);
58  HS_CMP_XCHG(r5, r7);
59  HS_CMP_XCHG(r2, r6);
60  HS_CMP_XCHG(r4, r8);
61  HS_CMP_XCHG(r2, r4);
62  HS_CMP_XCHG(r6, r8);
63  HS_CMP_XCHG(r1, r2);
64  HS_CMP_XCHG(r3, r4);
65  HS_CMP_XCHG(r5, r6);
66  HS_CMP_XCHG(r7, r8);
  // Round 2: flip with argument 3, then a half step with argument 1.
67  {
68    HS_SLAB_FLIP_PREAMBLE(3);
69    HS_CMP_FLIP(0, r1, r8);
70    HS_CMP_FLIP(1, r2, r7);
71    HS_CMP_FLIP(2, r3, r6);
72    HS_CMP_FLIP(3, r4, r5);
73  }
74  {
75    HS_SLAB_HALF_PREAMBLE(1);
76    HS_CMP_HALF(0, r1);
77    HS_CMP_HALF(1, r2);
78    HS_CMP_HALF(2, r3);
79    HS_CMP_HALF(3, r4);
80    HS_CMP_HALF(4, r5);
81    HS_CMP_HALF(5, r6);
82    HS_CMP_HALF(6, r7);
83    HS_CMP_HALF(7, r8);
84  }
85  HS_CMP_XCHG(r1, r5);
86  HS_CMP_XCHG(r3, r7);
87  HS_CMP_XCHG(r1, r3);
88  HS_CMP_XCHG(r5, r7);
89  HS_CMP_XCHG(r2, r6);
90  HS_CMP_XCHG(r4, r8);
91  HS_CMP_XCHG(r2, r4);
92  HS_CMP_XCHG(r6, r8);
93  HS_CMP_XCHG(r1, r2);
94  HS_CMP_XCHG(r3, r4);
95  HS_CMP_XCHG(r5, r6);
96  HS_CMP_XCHG(r7, r8);
  // Round 3: flip with argument 7, then half steps with arguments 2, 1.
97  {
98    HS_SLAB_FLIP_PREAMBLE(7);
99    HS_CMP_FLIP(0, r1, r8);
100    HS_CMP_FLIP(1, r2, r7);
101    HS_CMP_FLIP(2, r3, r6);
102    HS_CMP_FLIP(3, r4, r5);
103  }
104  {
105    HS_SLAB_HALF_PREAMBLE(2);
106    HS_CMP_HALF(0, r1);
107    HS_CMP_HALF(1, r2);
108    HS_CMP_HALF(2, r3);
109    HS_CMP_HALF(3, r4);
110    HS_CMP_HALF(4, r5);
111    HS_CMP_HALF(5, r6);
112    HS_CMP_HALF(6, r7);
113    HS_CMP_HALF(7, r8);
114  }
115  {
116    HS_SLAB_HALF_PREAMBLE(1);
117    HS_CMP_HALF(0, r1);
118    HS_CMP_HALF(1, r2);
119    HS_CMP_HALF(2, r3);
120    HS_CMP_HALF(3, r4);
121    HS_CMP_HALF(4, r5);
122    HS_CMP_HALF(5, r6);
123    HS_CMP_HALF(6, r7);
124    HS_CMP_HALF(7, r8);
125  }
126  HS_CMP_XCHG(r1, r5);
127  HS_CMP_XCHG(r3, r7);
128  HS_CMP_XCHG(r1, r3);
129  HS_CMP_XCHG(r5, r7);
130  HS_CMP_XCHG(r2, r6);
131  HS_CMP_XCHG(r4, r8);
132  HS_CMP_XCHG(r2, r4);
133  HS_CMP_XCHG(r6, r8);
134  HS_CMP_XCHG(r1, r2);
135  HS_CMP_XCHG(r3, r4);
136  HS_CMP_XCHG(r5, r6);
137  HS_CMP_XCHG(r7, r8);
  // Round 4: flip with argument 15, then half steps with arguments 4, 2, 1.
138  {
139    HS_SLAB_FLIP_PREAMBLE(15);
140    HS_CMP_FLIP(0, r1, r8);
141    HS_CMP_FLIP(1, r2, r7);
142    HS_CMP_FLIP(2, r3, r6);
143    HS_CMP_FLIP(3, r4, r5);
144  }
145  {
146    HS_SLAB_HALF_PREAMBLE(4);
147    HS_CMP_HALF(0, r1);
148    HS_CMP_HALF(1, r2);
149    HS_CMP_HALF(2, r3);
150    HS_CMP_HALF(3, r4);
151    HS_CMP_HALF(4, r5);
152    HS_CMP_HALF(5, r6);
153    HS_CMP_HALF(6, r7);
154    HS_CMP_HALF(7, r8);
155  }
156  {
157    HS_SLAB_HALF_PREAMBLE(2);
158    HS_CMP_HALF(0, r1);
159    HS_CMP_HALF(1, r2);
160    HS_CMP_HALF(2, r3);
161    HS_CMP_HALF(3, r4);
162    HS_CMP_HALF(4, r5);
163    HS_CMP_HALF(5, r6);
164    HS_CMP_HALF(6, r7);
165    HS_CMP_HALF(7, r8);
166  }
167  {
168    HS_SLAB_HALF_PREAMBLE(1);
169    HS_CMP_HALF(0, r1);
170    HS_CMP_HALF(1, r2);
171    HS_CMP_HALF(2, r3);
172    HS_CMP_HALF(3, r4);
173    HS_CMP_HALF(4, r5);
174    HS_CMP_HALF(5, r6);
175    HS_CMP_HALF(6, r7);
176    HS_CMP_HALF(7, r8);
177  }
178  HS_CMP_XCHG(r1, r5);
179  HS_CMP_XCHG(r3, r7);
180  HS_CMP_XCHG(r1, r3);
181  HS_CMP_XCHG(r5, r7);
182  HS_CMP_XCHG(r2, r6);
183  HS_CMP_XCHG(r4, r8);
184  HS_CMP_XCHG(r2, r4);
185  HS_CMP_XCHG(r6, r8);
186  HS_CMP_XCHG(r1, r2);
187  HS_CMP_XCHG(r3, r4);
188  HS_CMP_XCHG(r5, r6);
189  HS_CMP_XCHG(r7, r8);
  // Write r1..r8 back out to slots 0..7.
190  HS_SLAB_GLOBAL_STORE(0, r1);
191  HS_SLAB_GLOBAL_STORE(1, r2);
192  HS_SLAB_GLOBAL_STORE(2, r3);
193  HS_SLAB_GLOBAL_STORE(3, r4);
194  HS_SLAB_GLOBAL_STORE(4, r5);
195  HS_SLAB_GLOBAL_STORE(5, r6);
196  HS_SLAB_GLOBAL_STORE(6, r7);
197  HS_SLAB_GLOBAL_STORE(7, r8);
198}
199
//
// Kernel declared through HS_BS_KERNEL_PROTO(2, 1). The macro, and the
// meaning of its two arguments, come from the included headers and are
// not visible in this file.
//
// Shape of the body, exactly as written below:
//   1. declare block-local storage with HS_BLOCK_LOCAL_MEM_DECL(32, 8)
//   2. read eight keys into r1..r8 and run the same register-only
//      sequence as the (1, 0) kernel: 19 HS_CMP_XCHG steps, then four
//      FLIP rounds with preamble arguments 1, 3, 7, 15
//   3. one local-memory round: write the registers through
//      HS_BX_LOCAL_V, barrier, compare-exchange HS_SLAB_LOCAL_L/R pairs,
//      barrier, read the registers back
//   4. HS_CMP_HALF blocks with arguments 8, 4, 2, 1 and 12 HS_CMP_XCHG
//   5. write r1..r8 out with HS_SLAB_GLOBAL_STORE(0..7, ...)
//
// NOTE(review): step 3 presumably merges two adjacent sorted slabs
// through local memory -- confirm against hs_cl_macros.h.
//
// Statement order and barrier placement must be preserved.
//
200HS_BS_KERNEL_PROTO(2, 1)
201{
  // NOTE(review): arguments are presumably (keys per row, rows) -- confirm.
202  HS_BLOCK_LOCAL_MEM_DECL(32, 8);
203
204  HS_SLAB_GLOBAL_PREAMBLE();
  // Read the eight keys from vin (slots 0..7) into r1..r8.
205  HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0);
206  HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1);
207  HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2);
208  HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3);
209  HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4);
210  HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5);
211  HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6);
212  HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7);
  // 19 compare-exchange steps over r1..r8 (presumably an 8-input sorting
  // network -- TODO confirm the ordering HS_CMP_XCHG produces).
213  HS_CMP_XCHG(r1, r5);
214  HS_CMP_XCHG(r2, r6);
215  HS_CMP_XCHG(r3, r7);
216  HS_CMP_XCHG(r4, r8);
217  HS_CMP_XCHG(r1, r3);
218  HS_CMP_XCHG(r2, r4);
219  HS_CMP_XCHG(r5, r7);
220  HS_CMP_XCHG(r6, r8);
221  HS_CMP_XCHG(r3, r5);
222  HS_CMP_XCHG(r4, r6);
223  HS_CMP_XCHG(r1, r2);
224  HS_CMP_XCHG(r3, r4);
225  HS_CMP_XCHG(r5, r6);
226  HS_CMP_XCHG(r7, r8);
227  HS_CMP_XCHG(r2, r5);
228  HS_CMP_XCHG(r4, r7);
229  HS_CMP_XCHG(r2, r3);
230  HS_CMP_XCHG(r4, r5);
231  HS_CMP_XCHG(r6, r7);
  // Round 1: flip with preamble argument 1. Registers are paired from the
  // outside in: (r1,r8), (r2,r7), (r3,r6), (r4,r5).
232  {
233    HS_SLAB_FLIP_PREAMBLE(1);
234    HS_CMP_FLIP(0, r1, r8);
235    HS_CMP_FLIP(1, r2, r7);
236    HS_CMP_FLIP(2, r3, r6);
237    HS_CMP_FLIP(3, r4, r5);
238  }
  // 12 compare-exchange steps that close every round.
239  HS_CMP_XCHG(r1, r5);
240  HS_CMP_XCHG(r3, r7);
241  HS_CMP_XCHG(r1, r3);
242  HS_CMP_XCHG(r5, r7);
243  HS_CMP_XCHG(r2, r6);
244  HS_CMP_XCHG(r4, r8);
245  HS_CMP_XCHG(r2, r4);
246  HS_CMP_XCHG(r6, r8);
247  HS_CMP_XCHG(r1, r2);
248  HS_CMP_XCHG(r3, r4);
249  HS_CMP_XCHG(r5, r6);
250  HS_CMP_XCHG(r7, r8);
  // Round 2: flip with argument 3, then a half step with argument 1.
251  {
252    HS_SLAB_FLIP_PREAMBLE(3);
253    HS_CMP_FLIP(0, r1, r8);
254    HS_CMP_FLIP(1, r2, r7);
255    HS_CMP_FLIP(2, r3, r6);
256    HS_CMP_FLIP(3, r4, r5);
257  }
258  {
259    HS_SLAB_HALF_PREAMBLE(1);
260    HS_CMP_HALF(0, r1);
261    HS_CMP_HALF(1, r2);
262    HS_CMP_HALF(2, r3);
263    HS_CMP_HALF(3, r4);
264    HS_CMP_HALF(4, r5);
265    HS_CMP_HALF(5, r6);
266    HS_CMP_HALF(6, r7);
267    HS_CMP_HALF(7, r8);
268  }
269  HS_CMP_XCHG(r1, r5);
270  HS_CMP_XCHG(r3, r7);
271  HS_CMP_XCHG(r1, r3);
272  HS_CMP_XCHG(r5, r7);
273  HS_CMP_XCHG(r2, r6);
274  HS_CMP_XCHG(r4, r8);
275  HS_CMP_XCHG(r2, r4);
276  HS_CMP_XCHG(r6, r8);
277  HS_CMP_XCHG(r1, r2);
278  HS_CMP_XCHG(r3, r4);
279  HS_CMP_XCHG(r5, r6);
280  HS_CMP_XCHG(r7, r8);
  // Round 3: flip with argument 7, then half steps with arguments 2, 1.
281  {
282    HS_SLAB_FLIP_PREAMBLE(7);
283    HS_CMP_FLIP(0, r1, r8);
284    HS_CMP_FLIP(1, r2, r7);
285    HS_CMP_FLIP(2, r3, r6);
286    HS_CMP_FLIP(3, r4, r5);
287  }
288  {
289    HS_SLAB_HALF_PREAMBLE(2);
290    HS_CMP_HALF(0, r1);
291    HS_CMP_HALF(1, r2);
292    HS_CMP_HALF(2, r3);
293    HS_CMP_HALF(3, r4);
294    HS_CMP_HALF(4, r5);
295    HS_CMP_HALF(5, r6);
296    HS_CMP_HALF(6, r7);
297    HS_CMP_HALF(7, r8);
298  }
299  {
300    HS_SLAB_HALF_PREAMBLE(1);
301    HS_CMP_HALF(0, r1);
302    HS_CMP_HALF(1, r2);
303    HS_CMP_HALF(2, r3);
304    HS_CMP_HALF(3, r4);
305    HS_CMP_HALF(4, r5);
306    HS_CMP_HALF(5, r6);
307    HS_CMP_HALF(6, r7);
308    HS_CMP_HALF(7, r8);
309  }
310  HS_CMP_XCHG(r1, r5);
311  HS_CMP_XCHG(r3, r7);
312  HS_CMP_XCHG(r1, r3);
313  HS_CMP_XCHG(r5, r7);
314  HS_CMP_XCHG(r2, r6);
315  HS_CMP_XCHG(r4, r8);
316  HS_CMP_XCHG(r2, r4);
317  HS_CMP_XCHG(r6, r8);
318  HS_CMP_XCHG(r1, r2);
319  HS_CMP_XCHG(r3, r4);
320  HS_CMP_XCHG(r5, r6);
321  HS_CMP_XCHG(r7, r8);
  // Round 4: flip with argument 15, then half steps with arguments 4, 2, 1.
322  {
323    HS_SLAB_FLIP_PREAMBLE(15);
324    HS_CMP_FLIP(0, r1, r8);
325    HS_CMP_FLIP(1, r2, r7);
326    HS_CMP_FLIP(2, r3, r6);
327    HS_CMP_FLIP(3, r4, r5);
328  }
329  {
330    HS_SLAB_HALF_PREAMBLE(4);
331    HS_CMP_HALF(0, r1);
332    HS_CMP_HALF(1, r2);
333    HS_CMP_HALF(2, r3);
334    HS_CMP_HALF(3, r4);
335    HS_CMP_HALF(4, r5);
336    HS_CMP_HALF(5, r6);
337    HS_CMP_HALF(6, r7);
338    HS_CMP_HALF(7, r8);
339  }
340  {
341    HS_SLAB_HALF_PREAMBLE(2);
342    HS_CMP_HALF(0, r1);
343    HS_CMP_HALF(1, r2);
344    HS_CMP_HALF(2, r3);
345    HS_CMP_HALF(3, r4);
346    HS_CMP_HALF(4, r5);
347    HS_CMP_HALF(5, r6);
348    HS_CMP_HALF(6, r7);
349    HS_CMP_HALF(7, r8);
350  }
351  {
352    HS_SLAB_HALF_PREAMBLE(1);
353    HS_CMP_HALF(0, r1);
354    HS_CMP_HALF(1, r2);
355    HS_CMP_HALF(2, r3);
356    HS_CMP_HALF(3, r4);
357    HS_CMP_HALF(4, r5);
358    HS_CMP_HALF(5, r6);
359    HS_CMP_HALF(6, r7);
360    HS_CMP_HALF(7, r8);
361  }
362  HS_CMP_XCHG(r1, r5);
363  HS_CMP_XCHG(r3, r7);
364  HS_CMP_XCHG(r1, r3);
365  HS_CMP_XCHG(r5, r7);
366  HS_CMP_XCHG(r2, r6);
367  HS_CMP_XCHG(r4, r8);
368  HS_CMP_XCHG(r2, r4);
369  HS_CMP_XCHG(r6, r8);
370  HS_CMP_XCHG(r1, r2);
371  HS_CMP_XCHG(r3, r4);
372  HS_CMP_XCHG(r5, r6);
373  HS_CMP_XCHG(r7, r8);
  // Local-memory round. Registers are written in the interleaved order
  // r1, r8, r2, r7, r3, r6, r4, r5 at offsets 2 * HS_SLAB_THREADS * k.
374  HS_BS_MERGE_H_PREAMBLE(2);
375  HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 0) = r1;
376  HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 1) = r8;
377  HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 2) = r2;
378  HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 3) = r7;
379  HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 4) = r3;
380  HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 5) = r6;
381  HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 6) = r4;
382  HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 7) = r5;
  // Barrier between the writes above and the reads below.
383  HS_BLOCK_BARRIER();
  // Four compare-exchanges, each between HS_SLAB_LOCAL_L(x) and
  // HS_SLAB_LOCAL_R(x + 16), for x = 0, 64, 128, 192.
384  {
385    {
386      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
387      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(16);
388      HS_CMP_XCHG(r0_1, r0_2);
389      HS_SLAB_LOCAL_L(0) = r0_1;
390      HS_SLAB_LOCAL_R(16) = r0_2;
391    }
392    {
393      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(64);
394      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(80);
395      HS_CMP_XCHG(r0_1, r0_2);
396      HS_SLAB_LOCAL_L(64) = r0_1;
397      HS_SLAB_LOCAL_R(80) = r0_2;
398    }
399    {
400      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(128);
401      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(144);
402      HS_CMP_XCHG(r0_1, r0_2);
403      HS_SLAB_LOCAL_L(128) = r0_1;
404      HS_SLAB_LOCAL_R(144) = r0_2;
405    }
406    {
407      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(192);
408      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(208);
409      HS_CMP_XCHG(r0_1, r0_2);
410      HS_SLAB_LOCAL_L(192) = r0_1;
411      HS_SLAB_LOCAL_R(208) = r0_2;
412    }
413  }
  // Barrier before the registers are read back, in the same interleaved
  // order they were written.
414  HS_BLOCK_BARRIER();
415  r1 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 0);
416  r8 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 1);
417  r2 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 2);
418  r7 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 3);
419  r3 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 4);
420  r6 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 5);
421  r4 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 6);
422  r5 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 7);
  // Half steps with arguments 8, 4, 2, 1, then the 12 closing
  // compare-exchange steps.
423  {
424    {
425      HS_SLAB_HALF_PREAMBLE(8);
426      HS_CMP_HALF(0, r1);
427      HS_CMP_HALF(1, r2);
428      HS_CMP_HALF(2, r3);
429      HS_CMP_HALF(3, r4);
430      HS_CMP_HALF(4, r5);
431      HS_CMP_HALF(5, r6);
432      HS_CMP_HALF(6, r7);
433      HS_CMP_HALF(7, r8);
434    }
435    {
436      HS_SLAB_HALF_PREAMBLE(4);
437      HS_CMP_HALF(0, r1);
438      HS_CMP_HALF(1, r2);
439      HS_CMP_HALF(2, r3);
440      HS_CMP_HALF(3, r4);
441      HS_CMP_HALF(4, r5);
442      HS_CMP_HALF(5, r6);
443      HS_CMP_HALF(6, r7);
444      HS_CMP_HALF(7, r8);
445    }
446    {
447      HS_SLAB_HALF_PREAMBLE(2);
448      HS_CMP_HALF(0, r1);
449      HS_CMP_HALF(1, r2);
450      HS_CMP_HALF(2, r3);
451      HS_CMP_HALF(3, r4);
452      HS_CMP_HALF(4, r5);
453      HS_CMP_HALF(5, r6);
454      HS_CMP_HALF(6, r7);
455      HS_CMP_HALF(7, r8);
456    }
457    {
458      HS_SLAB_HALF_PREAMBLE(1);
459      HS_CMP_HALF(0, r1);
460      HS_CMP_HALF(1, r2);
461      HS_CMP_HALF(2, r3);
462      HS_CMP_HALF(3, r4);
463      HS_CMP_HALF(4, r5);
464      HS_CMP_HALF(5, r6);
465      HS_CMP_HALF(6, r7);
466      HS_CMP_HALF(7, r8);
467    }
468    HS_CMP_XCHG(r1, r5);
469    HS_CMP_XCHG(r3, r7);
470    HS_CMP_XCHG(r1, r3);
471    HS_CMP_XCHG(r5, r7);
472    HS_CMP_XCHG(r2, r6);
473    HS_CMP_XCHG(r4, r8);
474    HS_CMP_XCHG(r2, r4);
475    HS_CMP_XCHG(r6, r8);
476    HS_CMP_XCHG(r1, r2);
477    HS_CMP_XCHG(r3, r4);
478    HS_CMP_XCHG(r5, r6);
479    HS_CMP_XCHG(r7, r8);
480  }
  // Write r1..r8 back out to slots 0..7.
481  HS_SLAB_GLOBAL_STORE(0, r1);
482  HS_SLAB_GLOBAL_STORE(1, r2);
483  HS_SLAB_GLOBAL_STORE(2, r3);
484  HS_SLAB_GLOBAL_STORE(3, r4);
485  HS_SLAB_GLOBAL_STORE(4, r5);
486  HS_SLAB_GLOBAL_STORE(5, r6);
487  HS_SLAB_GLOBAL_STORE(6, r7);
488  HS_SLAB_GLOBAL_STORE(7, r8);
489}
490
//
// Kernel declared through HS_BS_KERNEL_PROTO(4, 2). The macro, and the
// meaning of its two arguments, come from the included headers and are
// not visible in this file.
//
// Shape of the body, exactly as written below:
//   1. declare block-local storage with HS_BLOCK_LOCAL_MEM_DECL(64, 8)
//   2. read eight keys into r1..r8 and run the same register-only
//      sequence as the (1, 0) kernel: 19 HS_CMP_XCHG steps, then four
//      FLIP rounds with preamble arguments 1, 3, 7, 15
//   3. first local-memory round: two-key compare-exchanges between
//      HS_SLAB_LOCAL_L/R pairs, followed by HALF 8, 4, 2, 1 and 12
//      HS_CMP_XCHG in registers
//   4. second local-memory round: four-key groups with four
//      HS_CMP_XCHG each, followed by the same HALF + XCHG sequence
//   5. write r1..r8 out with HS_SLAB_GLOBAL_STORE(0..7, ...)
//
// NOTE(review): steps 3 and 4 presumably merge pairs of slabs and then
// groups of four slabs through local memory -- confirm against
// hs_cl_macros.h.
//
// Statement order and barrier placement must be preserved.
//
491HS_BS_KERNEL_PROTO(4, 2)
492{
  // NOTE(review): arguments are presumably (keys per row, rows) -- confirm.
493  HS_BLOCK_LOCAL_MEM_DECL(64, 8);
494
495  HS_SLAB_GLOBAL_PREAMBLE();
  // Read the eight keys from vin (slots 0..7) into r1..r8.
496  HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0);
497  HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1);
498  HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2);
499  HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3);
500  HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4);
501  HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5);
502  HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6);
503  HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7);
  // 19 compare-exchange steps over r1..r8 (presumably an 8-input sorting
  // network -- TODO confirm the ordering HS_CMP_XCHG produces).
504  HS_CMP_XCHG(r1, r5);
505  HS_CMP_XCHG(r2, r6);
506  HS_CMP_XCHG(r3, r7);
507  HS_CMP_XCHG(r4, r8);
508  HS_CMP_XCHG(r1, r3);
509  HS_CMP_XCHG(r2, r4);
510  HS_CMP_XCHG(r5, r7);
511  HS_CMP_XCHG(r6, r8);
512  HS_CMP_XCHG(r3, r5);
513  HS_CMP_XCHG(r4, r6);
514  HS_CMP_XCHG(r1, r2);
515  HS_CMP_XCHG(r3, r4);
516  HS_CMP_XCHG(r5, r6);
517  HS_CMP_XCHG(r7, r8);
518  HS_CMP_XCHG(r2, r5);
519  HS_CMP_XCHG(r4, r7);
520  HS_CMP_XCHG(r2, r3);
521  HS_CMP_XCHG(r4, r5);
522  HS_CMP_XCHG(r6, r7);
  // Round 1: flip with preamble argument 1. Registers are paired from the
  // outside in: (r1,r8), (r2,r7), (r3,r6), (r4,r5).
523  {
524    HS_SLAB_FLIP_PREAMBLE(1);
525    HS_CMP_FLIP(0, r1, r8);
526    HS_CMP_FLIP(1, r2, r7);
527    HS_CMP_FLIP(2, r3, r6);
528    HS_CMP_FLIP(3, r4, r5);
529  }
  // 12 compare-exchange steps that close every round.
530  HS_CMP_XCHG(r1, r5);
531  HS_CMP_XCHG(r3, r7);
532  HS_CMP_XCHG(r1, r3);
533  HS_CMP_XCHG(r5, r7);
534  HS_CMP_XCHG(r2, r6);
535  HS_CMP_XCHG(r4, r8);
536  HS_CMP_XCHG(r2, r4);
537  HS_CMP_XCHG(r6, r8);
538  HS_CMP_XCHG(r1, r2);
539  HS_CMP_XCHG(r3, r4);
540  HS_CMP_XCHG(r5, r6);
541  HS_CMP_XCHG(r7, r8);
  // Round 2: flip with argument 3, then a half step with argument 1.
542  {
543    HS_SLAB_FLIP_PREAMBLE(3);
544    HS_CMP_FLIP(0, r1, r8);
545    HS_CMP_FLIP(1, r2, r7);
546    HS_CMP_FLIP(2, r3, r6);
547    HS_CMP_FLIP(3, r4, r5);
548  }
549  {
550    HS_SLAB_HALF_PREAMBLE(1);
551    HS_CMP_HALF(0, r1);
552    HS_CMP_HALF(1, r2);
553    HS_CMP_HALF(2, r3);
554    HS_CMP_HALF(3, r4);
555    HS_CMP_HALF(4, r5);
556    HS_CMP_HALF(5, r6);
557    HS_CMP_HALF(6, r7);
558    HS_CMP_HALF(7, r8);
559  }
560  HS_CMP_XCHG(r1, r5);
561  HS_CMP_XCHG(r3, r7);
562  HS_CMP_XCHG(r1, r3);
563  HS_CMP_XCHG(r5, r7);
564  HS_CMP_XCHG(r2, r6);
565  HS_CMP_XCHG(r4, r8);
566  HS_CMP_XCHG(r2, r4);
567  HS_CMP_XCHG(r6, r8);
568  HS_CMP_XCHG(r1, r2);
569  HS_CMP_XCHG(r3, r4);
570  HS_CMP_XCHG(r5, r6);
571  HS_CMP_XCHG(r7, r8);
  // Round 3: flip with argument 7, then half steps with arguments 2, 1.
572  {
573    HS_SLAB_FLIP_PREAMBLE(7);
574    HS_CMP_FLIP(0, r1, r8);
575    HS_CMP_FLIP(1, r2, r7);
576    HS_CMP_FLIP(2, r3, r6);
577    HS_CMP_FLIP(3, r4, r5);
578  }
579  {
580    HS_SLAB_HALF_PREAMBLE(2);
581    HS_CMP_HALF(0, r1);
582    HS_CMP_HALF(1, r2);
583    HS_CMP_HALF(2, r3);
584    HS_CMP_HALF(3, r4);
585    HS_CMP_HALF(4, r5);
586    HS_CMP_HALF(5, r6);
587    HS_CMP_HALF(6, r7);
588    HS_CMP_HALF(7, r8);
589  }
590  {
591    HS_SLAB_HALF_PREAMBLE(1);
592    HS_CMP_HALF(0, r1);
593    HS_CMP_HALF(1, r2);
594    HS_CMP_HALF(2, r3);
595    HS_CMP_HALF(3, r4);
596    HS_CMP_HALF(4, r5);
597    HS_CMP_HALF(5, r6);
598    HS_CMP_HALF(6, r7);
599    HS_CMP_HALF(7, r8);
600  }
601  HS_CMP_XCHG(r1, r5);
602  HS_CMP_XCHG(r3, r7);
603  HS_CMP_XCHG(r1, r3);
604  HS_CMP_XCHG(r5, r7);
605  HS_CMP_XCHG(r2, r6);
606  HS_CMP_XCHG(r4, r8);
607  HS_CMP_XCHG(r2, r4);
608  HS_CMP_XCHG(r6, r8);
609  HS_CMP_XCHG(r1, r2);
610  HS_CMP_XCHG(r3, r4);
611  HS_CMP_XCHG(r5, r6);
612  HS_CMP_XCHG(r7, r8);
  // Round 4: flip with argument 15, then half steps with arguments 4, 2, 1.
613  {
614    HS_SLAB_FLIP_PREAMBLE(15);
615    HS_CMP_FLIP(0, r1, r8);
616    HS_CMP_FLIP(1, r2, r7);
617    HS_CMP_FLIP(2, r3, r6);
618    HS_CMP_FLIP(3, r4, r5);
619  }
620  {
621    HS_SLAB_HALF_PREAMBLE(4);
622    HS_CMP_HALF(0, r1);
623    HS_CMP_HALF(1, r2);
624    HS_CMP_HALF(2, r3);
625    HS_CMP_HALF(3, r4);
626    HS_CMP_HALF(4, r5);
627    HS_CMP_HALF(5, r6);
628    HS_CMP_HALF(6, r7);
629    HS_CMP_HALF(7, r8);
630  }
631  {
632    HS_SLAB_HALF_PREAMBLE(2);
633    HS_CMP_HALF(0, r1);
634    HS_CMP_HALF(1, r2);
635    HS_CMP_HALF(2, r3);
636    HS_CMP_HALF(3, r4);
637    HS_CMP_HALF(4, r5);
638    HS_CMP_HALF(5, r6);
639    HS_CMP_HALF(6, r7);
640    HS_CMP_HALF(7, r8);
641  }
642  {
643    HS_SLAB_HALF_PREAMBLE(1);
644    HS_CMP_HALF(0, r1);
645    HS_CMP_HALF(1, r2);
646    HS_CMP_HALF(2, r3);
647    HS_CMP_HALF(3, r4);
648    HS_CMP_HALF(4, r5);
649    HS_CMP_HALF(5, r6);
650    HS_CMP_HALF(6, r7);
651    HS_CMP_HALF(7, r8);
652  }
653  HS_CMP_XCHG(r1, r5);
654  HS_CMP_XCHG(r3, r7);
655  HS_CMP_XCHG(r1, r3);
656  HS_CMP_XCHG(r5, r7);
657  HS_CMP_XCHG(r2, r6);
658  HS_CMP_XCHG(r4, r8);
659  HS_CMP_XCHG(r2, r4);
660  HS_CMP_XCHG(r6, r8);
661  HS_CMP_XCHG(r1, r2);
662  HS_CMP_XCHG(r3, r4);
663  HS_CMP_XCHG(r5, r6);
664  HS_CMP_XCHG(r7, r8);
  // First local-memory round. Registers are written in the interleaved
  // order r1, r8, r2, r7, r3, r6, r4, r5 at offsets 4 * HS_SLAB_THREADS * k.
665  HS_BS_MERGE_H_PREAMBLE(4);
666  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0) = r1;
667  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1) = r8;
668  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2) = r2;
669  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3) = r7;
670  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4) = r3;
671  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5) = r6;
672  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6) = r4;
673  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7) = r5;
  // Barrier between the writes above and the reads below.
674  HS_BLOCK_BARRIER();
  // Four two-key compare-exchanges between HS_SLAB_LOCAL_L(x) and
  // HS_SLAB_LOCAL_R(x + 16), for x = 0, 32, 256, 288.
675  {
676    {
677      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
678      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(16);
679      HS_CMP_XCHG(r0_1, r0_2);
680      HS_SLAB_LOCAL_L(0) = r0_1;
681      HS_SLAB_LOCAL_R(16) = r0_2;
682    }
683    {
684      HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(32);
685      HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(48);
686      HS_CMP_XCHG(r1_1, r1_2);
687      HS_SLAB_LOCAL_L(32) = r1_1;
688      HS_SLAB_LOCAL_R(48) = r1_2;
689    }
690    {
691      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(256);
692      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(272);
693      HS_CMP_XCHG(r0_1, r0_2);
694      HS_SLAB_LOCAL_L(256) = r0_1;
695      HS_SLAB_LOCAL_R(272) = r0_2;
696    }
697    {
698      HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(288);
699      HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(304);
700      HS_CMP_XCHG(r1_1, r1_2);
701      HS_SLAB_LOCAL_L(288) = r1_1;
702      HS_SLAB_LOCAL_R(304) = r1_2;
703    }
704  }
  // Barrier before the registers are read back, in the same interleaved
  // order they were written.
705  HS_BLOCK_BARRIER();
706  r1 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0);
707  r8 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1);
708  r2 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2);
709  r7 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3);
710  r3 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4);
711  r6 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5);
712  r4 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6);
713  r5 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7);
  // Half steps with arguments 8, 4, 2, 1, then the 12 closing
  // compare-exchange steps.
714  {
715    {
716      HS_SLAB_HALF_PREAMBLE(8);
717      HS_CMP_HALF(0, r1);
718      HS_CMP_HALF(1, r2);
719      HS_CMP_HALF(2, r3);
720      HS_CMP_HALF(3, r4);
721      HS_CMP_HALF(4, r5);
722      HS_CMP_HALF(5, r6);
723      HS_CMP_HALF(6, r7);
724      HS_CMP_HALF(7, r8);
725    }
726    {
727      HS_SLAB_HALF_PREAMBLE(4);
728      HS_CMP_HALF(0, r1);
729      HS_CMP_HALF(1, r2);
730      HS_CMP_HALF(2, r3);
731      HS_CMP_HALF(3, r4);
732      HS_CMP_HALF(4, r5);
733      HS_CMP_HALF(5, r6);
734      HS_CMP_HALF(6, r7);
735      HS_CMP_HALF(7, r8);
736    }
737    {
738      HS_SLAB_HALF_PREAMBLE(2);
739      HS_CMP_HALF(0, r1);
740      HS_CMP_HALF(1, r2);
741      HS_CMP_HALF(2, r3);
742      HS_CMP_HALF(3, r4);
743      HS_CMP_HALF(4, r5);
744      HS_CMP_HALF(5, r6);
745      HS_CMP_HALF(6, r7);
746      HS_CMP_HALF(7, r8);
747    }
748    {
749      HS_SLAB_HALF_PREAMBLE(1);
750      HS_CMP_HALF(0, r1);
751      HS_CMP_HALF(1, r2);
752      HS_CMP_HALF(2, r3);
753      HS_CMP_HALF(3, r4);
754      HS_CMP_HALF(4, r5);
755      HS_CMP_HALF(5, r6);
756      HS_CMP_HALF(6, r7);
757      HS_CMP_HALF(7, r8);
758    }
759    HS_CMP_XCHG(r1, r5);
760    HS_CMP_XCHG(r3, r7);
761    HS_CMP_XCHG(r1, r3);
762    HS_CMP_XCHG(r5, r7);
763    HS_CMP_XCHG(r2, r6);
764    HS_CMP_XCHG(r4, r8);
765    HS_CMP_XCHG(r2, r4);
766    HS_CMP_XCHG(r6, r8);
767    HS_CMP_XCHG(r1, r2);
768    HS_CMP_XCHG(r3, r4);
769    HS_CMP_XCHG(r5, r6);
770    HS_CMP_XCHG(r7, r8);
771  }
  // Second local-memory round: write the registers out again in the same
  // interleaved order.
772  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0) = r1;
773  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1) = r8;
774  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2) = r2;
775  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3) = r7;
776  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4) = r3;
777  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5) = r6;
778  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6) = r4;
779  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7) = r5;
  // Barrier between the writes above and the reads below.
780  HS_BLOCK_BARRIER();
  // Two four-key groups, based at offsets 0 and 256. Each group reads
  // L(x), L(x + 16), R(x + 32), R(x + 48) and applies four compare-
  // exchanges: (2,3), (1,4), (3,4), (1,2).
781  {
782    {
783      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
784      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(16);
785      HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(32);
786      HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(48);
787      HS_CMP_XCHG(r0_2, r0_3);
788      HS_CMP_XCHG(r0_1, r0_4);
789      HS_CMP_XCHG(r0_3, r0_4);
790      HS_CMP_XCHG(r0_1, r0_2);
791      HS_SLAB_LOCAL_L(0) = r0_1;
792      HS_SLAB_LOCAL_L(16) = r0_2;
793      HS_SLAB_LOCAL_R(32) = r0_3;
794      HS_SLAB_LOCAL_R(48) = r0_4;
795    }
796    {
797      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(256);
798      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(272);
799      HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(288);
800      HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(304);
801      HS_CMP_XCHG(r0_2, r0_3);
802      HS_CMP_XCHG(r0_1, r0_4);
803      HS_CMP_XCHG(r0_3, r0_4);
804      HS_CMP_XCHG(r0_1, r0_2);
805      HS_SLAB_LOCAL_L(256) = r0_1;
806      HS_SLAB_LOCAL_L(272) = r0_2;
807      HS_SLAB_LOCAL_R(288) = r0_3;
808      HS_SLAB_LOCAL_R(304) = r0_4;
809    }
810  }
  // Barrier before the registers are read back.
811  HS_BLOCK_BARRIER();
812  r1 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0);
813  r8 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1);
814  r2 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2);
815  r7 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3);
816  r3 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4);
817  r6 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5);
818  r4 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6);
819  r5 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7);
  // Half steps with arguments 8, 4, 2, 1, then the 12 closing
  // compare-exchange steps.
820  {
821    {
822      HS_SLAB_HALF_PREAMBLE(8);
823      HS_CMP_HALF(0, r1);
824      HS_CMP_HALF(1, r2);
825      HS_CMP_HALF(2, r3);
826      HS_CMP_HALF(3, r4);
827      HS_CMP_HALF(4, r5);
828      HS_CMP_HALF(5, r6);
829      HS_CMP_HALF(6, r7);
830      HS_CMP_HALF(7, r8);
831    }
832    {
833      HS_SLAB_HALF_PREAMBLE(4);
834      HS_CMP_HALF(0, r1);
835      HS_CMP_HALF(1, r2);
836      HS_CMP_HALF(2, r3);
837      HS_CMP_HALF(3, r4);
838      HS_CMP_HALF(4, r5);
839      HS_CMP_HALF(5, r6);
840      HS_CMP_HALF(6, r7);
841      HS_CMP_HALF(7, r8);
842    }
843    {
844      HS_SLAB_HALF_PREAMBLE(2);
845      HS_CMP_HALF(0, r1);
846      HS_CMP_HALF(1, r2);
847      HS_CMP_HALF(2, r3);
848      HS_CMP_HALF(3, r4);
849      HS_CMP_HALF(4, r5);
850      HS_CMP_HALF(5, r6);
851      HS_CMP_HALF(6, r7);
852      HS_CMP_HALF(7, r8);
853    }
854    {
855      HS_SLAB_HALF_PREAMBLE(1);
856      HS_CMP_HALF(0, r1);
857      HS_CMP_HALF(1, r2);
858      HS_CMP_HALF(2, r3);
859      HS_CMP_HALF(3, r4);
860      HS_CMP_HALF(4, r5);
861      HS_CMP_HALF(5, r6);
862      HS_CMP_HALF(6, r7);
863      HS_CMP_HALF(7, r8);
864    }
865    HS_CMP_XCHG(r1, r5);
866    HS_CMP_XCHG(r3, r7);
867    HS_CMP_XCHG(r1, r3);
868    HS_CMP_XCHG(r5, r7);
869    HS_CMP_XCHG(r2, r6);
870    HS_CMP_XCHG(r4, r8);
871    HS_CMP_XCHG(r2, r4);
872    HS_CMP_XCHG(r6, r8);
873    HS_CMP_XCHG(r1, r2);
874    HS_CMP_XCHG(r3, r4);
875    HS_CMP_XCHG(r5, r6);
876    HS_CMP_XCHG(r7, r8);
877  }
  // Write r1..r8 back out to slots 0..7.
878  HS_SLAB_GLOBAL_STORE(0, r1);
879  HS_SLAB_GLOBAL_STORE(1, r2);
880  HS_SLAB_GLOBAL_STORE(2, r3);
881  HS_SLAB_GLOBAL_STORE(3, r4);
882  HS_SLAB_GLOBAL_STORE(4, r5);
883  HS_SLAB_GLOBAL_STORE(5, r6);
884  HS_SLAB_GLOBAL_STORE(6, r7);
885  HS_SLAB_GLOBAL_STORE(7, r8);
886}
887
888HS_BS_KERNEL_PROTO(8, 3)
889{
890  HS_BLOCK_LOCAL_MEM_DECL(128, 8);
891
892  HS_SLAB_GLOBAL_PREAMBLE();
893  HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0);
894  HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1);
895  HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2);
896  HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3);
897  HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4);
898  HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5);
899  HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6);
900  HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7);
901  HS_CMP_XCHG(r1, r5);
902  HS_CMP_XCHG(r2, r6);
903  HS_CMP_XCHG(r3, r7);
904  HS_CMP_XCHG(r4, r8);
905  HS_CMP_XCHG(r1, r3);
906  HS_CMP_XCHG(r2, r4);
907  HS_CMP_XCHG(r5, r7);
908  HS_CMP_XCHG(r6, r8);
909  HS_CMP_XCHG(r3, r5);
910  HS_CMP_XCHG(r4, r6);
911  HS_CMP_XCHG(r1, r2);
912  HS_CMP_XCHG(r3, r4);
913  HS_CMP_XCHG(r5, r6);
914  HS_CMP_XCHG(r7, r8);
915  HS_CMP_XCHG(r2, r5);
916  HS_CMP_XCHG(r4, r7);
917  HS_CMP_XCHG(r2, r3);
918  HS_CMP_XCHG(r4, r5);
919  HS_CMP_XCHG(r6, r7);
920  {
921    HS_SLAB_FLIP_PREAMBLE(1);
922    HS_CMP_FLIP(0, r1, r8);
923    HS_CMP_FLIP(1, r2, r7);
924    HS_CMP_FLIP(2, r3, r6);
925    HS_CMP_FLIP(3, r4, r5);
926  }
927  HS_CMP_XCHG(r1, r5);
928  HS_CMP_XCHG(r3, r7);
929  HS_CMP_XCHG(r1, r3);
930  HS_CMP_XCHG(r5, r7);
931  HS_CMP_XCHG(r2, r6);
932  HS_CMP_XCHG(r4, r8);
933  HS_CMP_XCHG(r2, r4);
934  HS_CMP_XCHG(r6, r8);
935  HS_CMP_XCHG(r1, r2);
936  HS_CMP_XCHG(r3, r4);
937  HS_CMP_XCHG(r5, r6);
938  HS_CMP_XCHG(r7, r8);
939  {
940    HS_SLAB_FLIP_PREAMBLE(3);
941    HS_CMP_FLIP(0, r1, r8);
942    HS_CMP_FLIP(1, r2, r7);
943    HS_CMP_FLIP(2, r3, r6);
944    HS_CMP_FLIP(3, r4, r5);
945  }
946  {
947    HS_SLAB_HALF_PREAMBLE(1);
948    HS_CMP_HALF(0, r1);
949    HS_CMP_HALF(1, r2);
950    HS_CMP_HALF(2, r3);
951    HS_CMP_HALF(3, r4);
952    HS_CMP_HALF(4, r5);
953    HS_CMP_HALF(5, r6);
954    HS_CMP_HALF(6, r7);
955    HS_CMP_HALF(7, r8);
956  }
957  HS_CMP_XCHG(r1, r5);
958  HS_CMP_XCHG(r3, r7);
959  HS_CMP_XCHG(r1, r3);
960  HS_CMP_XCHG(r5, r7);
961  HS_CMP_XCHG(r2, r6);
962  HS_CMP_XCHG(r4, r8);
963  HS_CMP_XCHG(r2, r4);
964  HS_CMP_XCHG(r6, r8);
965  HS_CMP_XCHG(r1, r2);
966  HS_CMP_XCHG(r3, r4);
967  HS_CMP_XCHG(r5, r6);
968  HS_CMP_XCHG(r7, r8);
969  {
970    HS_SLAB_FLIP_PREAMBLE(7);
971    HS_CMP_FLIP(0, r1, r8);
972    HS_CMP_FLIP(1, r2, r7);
973    HS_CMP_FLIP(2, r3, r6);
974    HS_CMP_FLIP(3, r4, r5);
975  }
976  {
977    HS_SLAB_HALF_PREAMBLE(2);
978    HS_CMP_HALF(0, r1);
979    HS_CMP_HALF(1, r2);
980    HS_CMP_HALF(2, r3);
981    HS_CMP_HALF(3, r4);
982    HS_CMP_HALF(4, r5);
983    HS_CMP_HALF(5, r6);
984    HS_CMP_HALF(6, r7);
985    HS_CMP_HALF(7, r8);
986  }
987  {
988    HS_SLAB_HALF_PREAMBLE(1);
989    HS_CMP_HALF(0, r1);
990    HS_CMP_HALF(1, r2);
991    HS_CMP_HALF(2, r3);
992    HS_CMP_HALF(3, r4);
993    HS_CMP_HALF(4, r5);
994    HS_CMP_HALF(5, r6);
995    HS_CMP_HALF(6, r7);
996    HS_CMP_HALF(7, r8);
997  }
998  HS_CMP_XCHG(r1, r5);
999  HS_CMP_XCHG(r3, r7);
1000  HS_CMP_XCHG(r1, r3);
1001  HS_CMP_XCHG(r5, r7);
1002  HS_CMP_XCHG(r2, r6);
1003  HS_CMP_XCHG(r4, r8);
1004  HS_CMP_XCHG(r2, r4);
1005  HS_CMP_XCHG(r6, r8);
1006  HS_CMP_XCHG(r1, r2);
1007  HS_CMP_XCHG(r3, r4);
1008  HS_CMP_XCHG(r5, r6);
1009  HS_CMP_XCHG(r7, r8);
1010  {
1011    HS_SLAB_FLIP_PREAMBLE(15);
1012    HS_CMP_FLIP(0, r1, r8);
1013    HS_CMP_FLIP(1, r2, r7);
1014    HS_CMP_FLIP(2, r3, r6);
1015    HS_CMP_FLIP(3, r4, r5);
1016  }
1017  {
1018    HS_SLAB_HALF_PREAMBLE(4);
1019    HS_CMP_HALF(0, r1);
1020    HS_CMP_HALF(1, r2);
1021    HS_CMP_HALF(2, r3);
1022    HS_CMP_HALF(3, r4);
1023    HS_CMP_HALF(4, r5);
1024    HS_CMP_HALF(5, r6);
1025    HS_CMP_HALF(6, r7);
1026    HS_CMP_HALF(7, r8);
1027  }
1028  {
1029    HS_SLAB_HALF_PREAMBLE(2);
1030    HS_CMP_HALF(0, r1);
1031    HS_CMP_HALF(1, r2);
1032    HS_CMP_HALF(2, r3);
1033    HS_CMP_HALF(3, r4);
1034    HS_CMP_HALF(4, r5);
1035    HS_CMP_HALF(5, r6);
1036    HS_CMP_HALF(6, r7);
1037    HS_CMP_HALF(7, r8);
1038  }
1039  {
1040    HS_SLAB_HALF_PREAMBLE(1);
1041    HS_CMP_HALF(0, r1);
1042    HS_CMP_HALF(1, r2);
1043    HS_CMP_HALF(2, r3);
1044    HS_CMP_HALF(3, r4);
1045    HS_CMP_HALF(4, r5);
1046    HS_CMP_HALF(5, r6);
1047    HS_CMP_HALF(6, r7);
1048    HS_CMP_HALF(7, r8);
1049  }
1050  HS_CMP_XCHG(r1, r5);
1051  HS_CMP_XCHG(r3, r7);
1052  HS_CMP_XCHG(r1, r3);
1053  HS_CMP_XCHG(r5, r7);
1054  HS_CMP_XCHG(r2, r6);
1055  HS_CMP_XCHG(r4, r8);
1056  HS_CMP_XCHG(r2, r4);
1057  HS_CMP_XCHG(r6, r8);
1058  HS_CMP_XCHG(r1, r2);
1059  HS_CMP_XCHG(r3, r4);
1060  HS_CMP_XCHG(r5, r6);
1061  HS_CMP_XCHG(r7, r8);
1062  HS_BS_MERGE_H_PREAMBLE(8);
1063  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0) = r1;
1064  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1) = r8;
1065  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2) = r2;
1066  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3) = r7;
1067  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4) = r3;
1068  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5) = r6;
1069  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6) = r4;
1070  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7) = r5;
1071  HS_BLOCK_BARRIER();
1072  {
1073    {
1074      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
1075      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(16);
1076      HS_CMP_XCHG(r0_1, r0_2);
1077      HS_SLAB_LOCAL_L(0) = r0_1;
1078      HS_SLAB_LOCAL_R(16) = r0_2;
1079    }
1080    {
1081      HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(32);
1082      HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(48);
1083      HS_CMP_XCHG(r1_1, r1_2);
1084      HS_SLAB_LOCAL_L(32) = r1_1;
1085      HS_SLAB_LOCAL_R(48) = r1_2;
1086    }
1087    {
1088      HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(64);
1089      HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_R(80);
1090      HS_CMP_XCHG(r2_1, r2_2);
1091      HS_SLAB_LOCAL_L(64) = r2_1;
1092      HS_SLAB_LOCAL_R(80) = r2_2;
1093    }
1094    {
1095      HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(96);
1096      HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_R(112);
1097      HS_CMP_XCHG(r3_1, r3_2);
1098      HS_SLAB_LOCAL_L(96) = r3_1;
1099      HS_SLAB_LOCAL_R(112) = r3_2;
1100    }
1101  }
1102  HS_BLOCK_BARRIER();
1103  r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0);
1104  r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1);
1105  r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2);
1106  r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3);
1107  r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4);
1108  r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5);
1109  r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6);
1110  r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7);
1111  {
1112    {
1113      HS_SLAB_HALF_PREAMBLE(8);
1114      HS_CMP_HALF(0, r1);
1115      HS_CMP_HALF(1, r2);
1116      HS_CMP_HALF(2, r3);
1117      HS_CMP_HALF(3, r4);
1118      HS_CMP_HALF(4, r5);
1119      HS_CMP_HALF(5, r6);
1120      HS_CMP_HALF(6, r7);
1121      HS_CMP_HALF(7, r8);
1122    }
1123    {
1124      HS_SLAB_HALF_PREAMBLE(4);
1125      HS_CMP_HALF(0, r1);
1126      HS_CMP_HALF(1, r2);
1127      HS_CMP_HALF(2, r3);
1128      HS_CMP_HALF(3, r4);
1129      HS_CMP_HALF(4, r5);
1130      HS_CMP_HALF(5, r6);
1131      HS_CMP_HALF(6, r7);
1132      HS_CMP_HALF(7, r8);
1133    }
1134    {
1135      HS_SLAB_HALF_PREAMBLE(2);
1136      HS_CMP_HALF(0, r1);
1137      HS_CMP_HALF(1, r2);
1138      HS_CMP_HALF(2, r3);
1139      HS_CMP_HALF(3, r4);
1140      HS_CMP_HALF(4, r5);
1141      HS_CMP_HALF(5, r6);
1142      HS_CMP_HALF(6, r7);
1143      HS_CMP_HALF(7, r8);
1144    }
1145    {
1146      HS_SLAB_HALF_PREAMBLE(1);
1147      HS_CMP_HALF(0, r1);
1148      HS_CMP_HALF(1, r2);
1149      HS_CMP_HALF(2, r3);
1150      HS_CMP_HALF(3, r4);
1151      HS_CMP_HALF(4, r5);
1152      HS_CMP_HALF(5, r6);
1153      HS_CMP_HALF(6, r7);
1154      HS_CMP_HALF(7, r8);
1155    }
1156    HS_CMP_XCHG(r1, r5);
1157    HS_CMP_XCHG(r3, r7);
1158    HS_CMP_XCHG(r1, r3);
1159    HS_CMP_XCHG(r5, r7);
1160    HS_CMP_XCHG(r2, r6);
1161    HS_CMP_XCHG(r4, r8);
1162    HS_CMP_XCHG(r2, r4);
1163    HS_CMP_XCHG(r6, r8);
1164    HS_CMP_XCHG(r1, r2);
1165    HS_CMP_XCHG(r3, r4);
1166    HS_CMP_XCHG(r5, r6);
1167    HS_CMP_XCHG(r7, r8);
1168  }
1169  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0) = r1;
1170  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1) = r8;
1171  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2) = r2;
1172  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3) = r7;
1173  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4) = r3;
1174  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5) = r6;
1175  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6) = r4;
1176  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7) = r5;
1177  HS_BLOCK_BARRIER();
1178  {
1179    {
1180      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
1181      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(16);
1182      HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(32);
1183      HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(48);
1184      HS_CMP_XCHG(r0_2, r0_3);
1185      HS_CMP_XCHG(r0_1, r0_4);
1186      HS_CMP_XCHG(r0_3, r0_4);
1187      HS_CMP_XCHG(r0_1, r0_2);
1188      HS_SLAB_LOCAL_L(0) = r0_1;
1189      HS_SLAB_LOCAL_L(16) = r0_2;
1190      HS_SLAB_LOCAL_R(32) = r0_3;
1191      HS_SLAB_LOCAL_R(48) = r0_4;
1192    }
1193    {
1194      HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(64);
1195      HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(80);
1196      HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_R(96);
1197      HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_R(112);
1198      HS_CMP_XCHG(r1_2, r1_3);
1199      HS_CMP_XCHG(r1_1, r1_4);
1200      HS_CMP_XCHG(r1_3, r1_4);
1201      HS_CMP_XCHG(r1_1, r1_2);
1202      HS_SLAB_LOCAL_L(64) = r1_1;
1203      HS_SLAB_LOCAL_L(80) = r1_2;
1204      HS_SLAB_LOCAL_R(96) = r1_3;
1205      HS_SLAB_LOCAL_R(112) = r1_4;
1206    }
1207  }
1208  HS_BLOCK_BARRIER();
1209  r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0);
1210  r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1);
1211  r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2);
1212  r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3);
1213  r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4);
1214  r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5);
1215  r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6);
1216  r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7);
1217  {
1218    {
1219      HS_SLAB_HALF_PREAMBLE(8);
1220      HS_CMP_HALF(0, r1);
1221      HS_CMP_HALF(1, r2);
1222      HS_CMP_HALF(2, r3);
1223      HS_CMP_HALF(3, r4);
1224      HS_CMP_HALF(4, r5);
1225      HS_CMP_HALF(5, r6);
1226      HS_CMP_HALF(6, r7);
1227      HS_CMP_HALF(7, r8);
1228    }
1229    {
1230      HS_SLAB_HALF_PREAMBLE(4);
1231      HS_CMP_HALF(0, r1);
1232      HS_CMP_HALF(1, r2);
1233      HS_CMP_HALF(2, r3);
1234      HS_CMP_HALF(3, r4);
1235      HS_CMP_HALF(4, r5);
1236      HS_CMP_HALF(5, r6);
1237      HS_CMP_HALF(6, r7);
1238      HS_CMP_HALF(7, r8);
1239    }
1240    {
1241      HS_SLAB_HALF_PREAMBLE(2);
1242      HS_CMP_HALF(0, r1);
1243      HS_CMP_HALF(1, r2);
1244      HS_CMP_HALF(2, r3);
1245      HS_CMP_HALF(3, r4);
1246      HS_CMP_HALF(4, r5);
1247      HS_CMP_HALF(5, r6);
1248      HS_CMP_HALF(6, r7);
1249      HS_CMP_HALF(7, r8);
1250    }
1251    {
1252      HS_SLAB_HALF_PREAMBLE(1);
1253      HS_CMP_HALF(0, r1);
1254      HS_CMP_HALF(1, r2);
1255      HS_CMP_HALF(2, r3);
1256      HS_CMP_HALF(3, r4);
1257      HS_CMP_HALF(4, r5);
1258      HS_CMP_HALF(5, r6);
1259      HS_CMP_HALF(6, r7);
1260      HS_CMP_HALF(7, r8);
1261    }
1262    HS_CMP_XCHG(r1, r5);
1263    HS_CMP_XCHG(r3, r7);
1264    HS_CMP_XCHG(r1, r3);
1265    HS_CMP_XCHG(r5, r7);
1266    HS_CMP_XCHG(r2, r6);
1267    HS_CMP_XCHG(r4, r8);
1268    HS_CMP_XCHG(r2, r4);
1269    HS_CMP_XCHG(r6, r8);
1270    HS_CMP_XCHG(r1, r2);
1271    HS_CMP_XCHG(r3, r4);
1272    HS_CMP_XCHG(r5, r6);
1273    HS_CMP_XCHG(r7, r8);
1274  }
1275  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0) = r1;
1276  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1) = r8;
1277  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2) = r2;
1278  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3) = r7;
1279  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4) = r3;
1280  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5) = r6;
1281  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6) = r4;
1282  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7) = r5;
1283  HS_BLOCK_BARRIER();
1284  {
1285    {
1286      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
1287      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(16);
1288      HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(32);
1289      HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(48);
1290      HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_R(64);
1291      HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_R(80);
1292      HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_R(96);
1293      HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_R(112);
1294      HS_CMP_XCHG(r0_4, r0_5);
1295      HS_CMP_XCHG(r0_3, r0_6);
1296      HS_CMP_XCHG(r0_2, r0_7);
1297      HS_CMP_XCHG(r0_1, r0_8);
1298      HS_CMP_XCHG(r0_5, r0_7);
1299      HS_CMP_XCHG(r0_6, r0_8);
1300      HS_CMP_XCHG(r0_5, r0_6);
1301      HS_CMP_XCHG(r0_7, r0_8);
1302      HS_CMP_XCHG(r0_1, r0_3);
1303      HS_CMP_XCHG(r0_2, r0_4);
1304      HS_CMP_XCHG(r0_1, r0_2);
1305      HS_CMP_XCHG(r0_3, r0_4);
1306      HS_SLAB_LOCAL_L(0) = r0_1;
1307      HS_SLAB_LOCAL_L(16) = r0_2;
1308      HS_SLAB_LOCAL_L(32) = r0_3;
1309      HS_SLAB_LOCAL_L(48) = r0_4;
1310      HS_SLAB_LOCAL_R(64) = r0_5;
1311      HS_SLAB_LOCAL_R(80) = r0_6;
1312      HS_SLAB_LOCAL_R(96) = r0_7;
1313      HS_SLAB_LOCAL_R(112) = r0_8;
1314    }
1315  }
1316  HS_BLOCK_BARRIER();
1317  r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0);
1318  r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1);
1319  r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2);
1320  r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3);
1321  r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4);
1322  r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5);
1323  r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6);
1324  r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7);
1325  {
1326    {
1327      HS_SLAB_HALF_PREAMBLE(8);
1328      HS_CMP_HALF(0, r1);
1329      HS_CMP_HALF(1, r2);
1330      HS_CMP_HALF(2, r3);
1331      HS_CMP_HALF(3, r4);
1332      HS_CMP_HALF(4, r5);
1333      HS_CMP_HALF(5, r6);
1334      HS_CMP_HALF(6, r7);
1335      HS_CMP_HALF(7, r8);
1336    }
1337    {
1338      HS_SLAB_HALF_PREAMBLE(4);
1339      HS_CMP_HALF(0, r1);
1340      HS_CMP_HALF(1, r2);
1341      HS_CMP_HALF(2, r3);
1342      HS_CMP_HALF(3, r4);
1343      HS_CMP_HALF(4, r5);
1344      HS_CMP_HALF(5, r6);
1345      HS_CMP_HALF(6, r7);
1346      HS_CMP_HALF(7, r8);
1347    }
1348    {
1349      HS_SLAB_HALF_PREAMBLE(2);
1350      HS_CMP_HALF(0, r1);
1351      HS_CMP_HALF(1, r2);
1352      HS_CMP_HALF(2, r3);
1353      HS_CMP_HALF(3, r4);
1354      HS_CMP_HALF(4, r5);
1355      HS_CMP_HALF(5, r6);
1356      HS_CMP_HALF(6, r7);
1357      HS_CMP_HALF(7, r8);
1358    }
1359    {
1360      HS_SLAB_HALF_PREAMBLE(1);
1361      HS_CMP_HALF(0, r1);
1362      HS_CMP_HALF(1, r2);
1363      HS_CMP_HALF(2, r3);
1364      HS_CMP_HALF(3, r4);
1365      HS_CMP_HALF(4, r5);
1366      HS_CMP_HALF(5, r6);
1367      HS_CMP_HALF(6, r7);
1368      HS_CMP_HALF(7, r8);
1369    }
1370    HS_CMP_XCHG(r1, r5);
1371    HS_CMP_XCHG(r3, r7);
1372    HS_CMP_XCHG(r1, r3);
1373    HS_CMP_XCHG(r5, r7);
1374    HS_CMP_XCHG(r2, r6);
1375    HS_CMP_XCHG(r4, r8);
1376    HS_CMP_XCHG(r2, r4);
1377    HS_CMP_XCHG(r6, r8);
1378    HS_CMP_XCHG(r1, r2);
1379    HS_CMP_XCHG(r3, r4);
1380    HS_CMP_XCHG(r5, r6);
1381    HS_CMP_XCHG(r7, r8);
1382  }
1383  HS_SLAB_GLOBAL_STORE(0, r1);
1384  HS_SLAB_GLOBAL_STORE(1, r2);
1385  HS_SLAB_GLOBAL_STORE(2, r3);
1386  HS_SLAB_GLOBAL_STORE(3, r4);
1387  HS_SLAB_GLOBAL_STORE(4, r5);
1388  HS_SLAB_GLOBAL_STORE(5, r6);
1389  HS_SLAB_GLOBAL_STORE(6, r7);
1390  HS_SLAB_GLOBAL_STORE(7, r8);
1391}
1392
//
// HS_BS_KERNEL_PROTO(16, 4) kernel body.
//
// Structure, as written below:
//
//   1. Eight keys, rows 0..7 of `vin`, are loaded into r1..r8.
//
//   2. Register stage: 19 HS_CMP_XCHG steps, then four flip stages with
//      HS_SLAB_FLIP_PREAMBLE arguments 1, 3, 7 and 15.  A flip stage is
//      followed by HS_SLAB_HALF_PREAMBLE passes (none; 1; 2,1; 4,2,1
//      respectively) and always closes with the same 12 HS_CMP_XCHG steps.
//
//   3. Four rounds through HS_BX_LOCAL_V / HS_SLAB_LOCAL_L / HS_SLAB_LOCAL_R:
//      store r1..r8, HS_BLOCK_BARRIER, HS_CMP_XCHG over groups of rows
//      (2, 4, 8, then 16 rows per group) guarded by HS_SUBGROUP_ID() < 8,
//      HS_BLOCK_BARRIER, reload r1..r8, then HALF passes 8, 4, 2, 1 plus
//      the 12-step HS_CMP_XCHG sequence.
//
//   4. r1..r8 are written out as rows 0..7 with HS_SLAB_GLOBAL_STORE.
//
// NOTE(review): "BS" presumably means block sort and (16, 4) presumably
// means 16 slabs per block and its log2 -- inferred from the macro names
// and the 16-row final exchange; confirm against hs_cl_macros.h.
//
// NOTE(review): the decimal prefix fused onto every line looks like a
// listing artifact rather than source text -- confirm and strip it before
// building.
//
1393HS_BS_KERNEL_PROTO(16, 4)
1394{
  // NOTE(review): presumably the storage behind the HS_BX_LOCAL_V and
  // HS_SLAB_LOCAL_L/R accesses below (their row offsets run 0..240 in
  // steps of 16) -- confirm the meaning of (256, 8) in hs_cl_macros.h.
1395  HS_BLOCK_LOCAL_MEM_DECL(256, 8);
1396
  // Load rows 0..7 of `vin` into r1..r8.
1397  HS_SLAB_GLOBAL_PREAMBLE();
1398  HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0);
1399  HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1);
1400  HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2);
1401  HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3);
1402  HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4);
1403  HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5);
1404  HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6);
1405  HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7);
  // Register-only stage: 19 HS_CMP_XCHG steps over r1..r8.
  // NOTE(review): presumably a sorting network for the eight registers --
  // the order of the steps is significant, do not reorder.
1406  HS_CMP_XCHG(r1, r5);
1407  HS_CMP_XCHG(r2, r6);
1408  HS_CMP_XCHG(r3, r7);
1409  HS_CMP_XCHG(r4, r8);
1410  HS_CMP_XCHG(r1, r3);
1411  HS_CMP_XCHG(r2, r4);
1412  HS_CMP_XCHG(r5, r7);
1413  HS_CMP_XCHG(r6, r8);
1414  HS_CMP_XCHG(r3, r5);
1415  HS_CMP_XCHG(r4, r6);
1416  HS_CMP_XCHG(r1, r2);
1417  HS_CMP_XCHG(r3, r4);
1418  HS_CMP_XCHG(r5, r6);
1419  HS_CMP_XCHG(r7, r8);
1420  HS_CMP_XCHG(r2, r5);
1421  HS_CMP_XCHG(r4, r7);
1422  HS_CMP_XCHG(r2, r3);
1423  HS_CMP_XCHG(r4, r5);
1424  HS_CMP_XCHG(r6, r7);
  // Flip stage, preamble argument 1.  HS_CMP_FLIP pairs the registers
  // outermost-to-innermost: r1/r8, r2/r7, r3/r6, r4/r5.  The braces scope
  // the preamble; keep them.
1425  {
1426    HS_SLAB_FLIP_PREAMBLE(1);
1427    HS_CMP_FLIP(0, r1, r8);
1428    HS_CMP_FLIP(1, r2, r7);
1429    HS_CMP_FLIP(2, r3, r6);
1430    HS_CMP_FLIP(3, r4, r5);
1431  }
  // The recurring 12-step sequence: odd registers (r1,r3,r5,r7) at
  // distance 4 then 2, even registers (r2,r4,r6,r8) likewise, then the
  // four adjacent pairs.
1432  HS_CMP_XCHG(r1, r5);
1433  HS_CMP_XCHG(r3, r7);
1434  HS_CMP_XCHG(r1, r3);
1435  HS_CMP_XCHG(r5, r7);
1436  HS_CMP_XCHG(r2, r6);
1437  HS_CMP_XCHG(r4, r8);
1438  HS_CMP_XCHG(r2, r4);
1439  HS_CMP_XCHG(r6, r8);
1440  HS_CMP_XCHG(r1, r2);
1441  HS_CMP_XCHG(r3, r4);
1442  HS_CMP_XCHG(r5, r6);
1443  HS_CMP_XCHG(r7, r8);
  // Flip stage, preamble argument 3, followed by one HALF pass (1).
1444  {
1445    HS_SLAB_FLIP_PREAMBLE(3);
1446    HS_CMP_FLIP(0, r1, r8);
1447    HS_CMP_FLIP(1, r2, r7);
1448    HS_CMP_FLIP(2, r3, r6);
1449    HS_CMP_FLIP(3, r4, r5);
1450  }
1451  {
1452    HS_SLAB_HALF_PREAMBLE(1);
1453    HS_CMP_HALF(0, r1);
1454    HS_CMP_HALF(1, r2);
1455    HS_CMP_HALF(2, r3);
1456    HS_CMP_HALF(3, r4);
1457    HS_CMP_HALF(4, r5);
1458    HS_CMP_HALF(5, r6);
1459    HS_CMP_HALF(6, r7);
1460    HS_CMP_HALF(7, r8);
1461  }
  // 12-step HS_CMP_XCHG sequence (same as above).
1462  HS_CMP_XCHG(r1, r5);
1463  HS_CMP_XCHG(r3, r7);
1464  HS_CMP_XCHG(r1, r3);
1465  HS_CMP_XCHG(r5, r7);
1466  HS_CMP_XCHG(r2, r6);
1467  HS_CMP_XCHG(r4, r8);
1468  HS_CMP_XCHG(r2, r4);
1469  HS_CMP_XCHG(r6, r8);
1470  HS_CMP_XCHG(r1, r2);
1471  HS_CMP_XCHG(r3, r4);
1472  HS_CMP_XCHG(r5, r6);
1473  HS_CMP_XCHG(r7, r8);
  // Flip stage, preamble argument 7, followed by HALF passes 2 and 1.
1474  {
1475    HS_SLAB_FLIP_PREAMBLE(7);
1476    HS_CMP_FLIP(0, r1, r8);
1477    HS_CMP_FLIP(1, r2, r7);
1478    HS_CMP_FLIP(2, r3, r6);
1479    HS_CMP_FLIP(3, r4, r5);
1480  }
1481  {
1482    HS_SLAB_HALF_PREAMBLE(2);
1483    HS_CMP_HALF(0, r1);
1484    HS_CMP_HALF(1, r2);
1485    HS_CMP_HALF(2, r3);
1486    HS_CMP_HALF(3, r4);
1487    HS_CMP_HALF(4, r5);
1488    HS_CMP_HALF(5, r6);
1489    HS_CMP_HALF(6, r7);
1490    HS_CMP_HALF(7, r8);
1491  }
1492  {
1493    HS_SLAB_HALF_PREAMBLE(1);
1494    HS_CMP_HALF(0, r1);
1495    HS_CMP_HALF(1, r2);
1496    HS_CMP_HALF(2, r3);
1497    HS_CMP_HALF(3, r4);
1498    HS_CMP_HALF(4, r5);
1499    HS_CMP_HALF(5, r6);
1500    HS_CMP_HALF(6, r7);
1501    HS_CMP_HALF(7, r8);
1502  }
  // 12-step HS_CMP_XCHG sequence.
1503  HS_CMP_XCHG(r1, r5);
1504  HS_CMP_XCHG(r3, r7);
1505  HS_CMP_XCHG(r1, r3);
1506  HS_CMP_XCHG(r5, r7);
1507  HS_CMP_XCHG(r2, r6);
1508  HS_CMP_XCHG(r4, r8);
1509  HS_CMP_XCHG(r2, r4);
1510  HS_CMP_XCHG(r6, r8);
1511  HS_CMP_XCHG(r1, r2);
1512  HS_CMP_XCHG(r3, r4);
1513  HS_CMP_XCHG(r5, r6);
1514  HS_CMP_XCHG(r7, r8);
  // Flip stage, preamble argument 15, followed by HALF passes 4, 2 and 1.
1515  {
1516    HS_SLAB_FLIP_PREAMBLE(15);
1517    HS_CMP_FLIP(0, r1, r8);
1518    HS_CMP_FLIP(1, r2, r7);
1519    HS_CMP_FLIP(2, r3, r6);
1520    HS_CMP_FLIP(3, r4, r5);
1521  }
1522  {
1523    HS_SLAB_HALF_PREAMBLE(4);
1524    HS_CMP_HALF(0, r1);
1525    HS_CMP_HALF(1, r2);
1526    HS_CMP_HALF(2, r3);
1527    HS_CMP_HALF(3, r4);
1528    HS_CMP_HALF(4, r5);
1529    HS_CMP_HALF(5, r6);
1530    HS_CMP_HALF(6, r7);
1531    HS_CMP_HALF(7, r8);
1532  }
1533  {
1534    HS_SLAB_HALF_PREAMBLE(2);
1535    HS_CMP_HALF(0, r1);
1536    HS_CMP_HALF(1, r2);
1537    HS_CMP_HALF(2, r3);
1538    HS_CMP_HALF(3, r4);
1539    HS_CMP_HALF(4, r5);
1540    HS_CMP_HALF(5, r6);
1541    HS_CMP_HALF(6, r7);
1542    HS_CMP_HALF(7, r8);
1543  }
1544  {
1545    HS_SLAB_HALF_PREAMBLE(1);
1546    HS_CMP_HALF(0, r1);
1547    HS_CMP_HALF(1, r2);
1548    HS_CMP_HALF(2, r3);
1549    HS_CMP_HALF(3, r4);
1550    HS_CMP_HALF(4, r5);
1551    HS_CMP_HALF(5, r6);
1552    HS_CMP_HALF(6, r7);
1553    HS_CMP_HALF(7, r8);
1554  }
  // 12-step HS_CMP_XCHG sequence; last step of the register-only stage.
1555  HS_CMP_XCHG(r1, r5);
1556  HS_CMP_XCHG(r3, r7);
1557  HS_CMP_XCHG(r1, r3);
1558  HS_CMP_XCHG(r5, r7);
1559  HS_CMP_XCHG(r2, r6);
1560  HS_CMP_XCHG(r4, r8);
1561  HS_CMP_XCHG(r2, r4);
1562  HS_CMP_XCHG(r6, r8);
1563  HS_CMP_XCHG(r1, r2);
1564  HS_CMP_XCHG(r3, r4);
1565  HS_CMP_XCHG(r5, r6);
1566  HS_CMP_XCHG(r7, r8);
  //
  // Local round 1 of 4: groups of 2 rows.
  //
  // Store r1..r8 to HS_BX_LOCAL_V slots 0..7 (stride 16 * HS_SLAB_THREADS).
  // The register order is r1, r8, r2, r7, r3, r6, r4, r5 -- not r1..r8 --
  // and every store and reload in this kernel uses that same order.
  //
1567  HS_BS_MERGE_H_PREAMBLE(16);
1568  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1;
1569  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r8;
1570  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2;
1571  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r7;
1572  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3;
1573  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r6;
1574  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4;
1575  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r5;
  // Barrier between the stores above and the HS_SLAB_LOCAL_L/R reads below.
1576  HS_BLOCK_BARRIER();
  // Only where HS_SUBGROUP_ID() < 8.  Eight independent pairs: row 32*k
  // through HS_SLAB_LOCAL_L against row 32*k + 16 through HS_SLAB_LOCAL_R,
  // for k = 0..7.  The barriers stay outside the `if`.
1577  if (HS_SUBGROUP_ID() < 8) {
1578    {
1579      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
1580      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(16);
1581      HS_CMP_XCHG(r0_1, r0_2);
1582      HS_SLAB_LOCAL_L(0) = r0_1;
1583      HS_SLAB_LOCAL_R(16) = r0_2;
1584    }
1585    {
1586      HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(32);
1587      HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(48);
1588      HS_CMP_XCHG(r1_1, r1_2);
1589      HS_SLAB_LOCAL_L(32) = r1_1;
1590      HS_SLAB_LOCAL_R(48) = r1_2;
1591    }
1592    {
1593      HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(64);
1594      HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_R(80);
1595      HS_CMP_XCHG(r2_1, r2_2);
1596      HS_SLAB_LOCAL_L(64) = r2_1;
1597      HS_SLAB_LOCAL_R(80) = r2_2;
1598    }
1599    {
1600      HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(96);
1601      HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_R(112);
1602      HS_CMP_XCHG(r3_1, r3_2);
1603      HS_SLAB_LOCAL_L(96) = r3_1;
1604      HS_SLAB_LOCAL_R(112) = r3_2;
1605    }
1606    {
1607      HS_KEY_TYPE r4_1 = HS_SLAB_LOCAL_L(128);
1608      HS_KEY_TYPE r4_2 = HS_SLAB_LOCAL_R(144);
1609      HS_CMP_XCHG(r4_1, r4_2);
1610      HS_SLAB_LOCAL_L(128) = r4_1;
1611      HS_SLAB_LOCAL_R(144) = r4_2;
1612    }
1613    {
1614      HS_KEY_TYPE r5_1 = HS_SLAB_LOCAL_L(160);
1615      HS_KEY_TYPE r5_2 = HS_SLAB_LOCAL_R(176);
1616      HS_CMP_XCHG(r5_1, r5_2);
1617      HS_SLAB_LOCAL_L(160) = r5_1;
1618      HS_SLAB_LOCAL_R(176) = r5_2;
1619    }
1620    {
1621      HS_KEY_TYPE r6_1 = HS_SLAB_LOCAL_L(192);
1622      HS_KEY_TYPE r6_2 = HS_SLAB_LOCAL_R(208);
1623      HS_CMP_XCHG(r6_1, r6_2);
1624      HS_SLAB_LOCAL_L(192) = r6_1;
1625      HS_SLAB_LOCAL_R(208) = r6_2;
1626    }
1627    {
1628      HS_KEY_TYPE r7_1 = HS_SLAB_LOCAL_L(224);
1629      HS_KEY_TYPE r7_2 = HS_SLAB_LOCAL_R(240);
1630      HS_CMP_XCHG(r7_1, r7_2);
1631      HS_SLAB_LOCAL_L(224) = r7_1;
1632      HS_SLAB_LOCAL_R(240) = r7_2;
1633    }
1634  }
  // Barrier, then reload r1..r8 from the same slots in the same order.
1635  HS_BLOCK_BARRIER();
1636  r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0);
1637  r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1);
1638  r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2);
1639  r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3);
1640  r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4);
1641  r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5);
1642  r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6);
1643  r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7);
  // HALF passes 8, 4, 2, 1 over r1..r8, then the 12-step HS_CMP_XCHG
  // sequence.  Each inner brace pair scopes one HS_SLAB_HALF_PREAMBLE.
1644  {
1645    {
1646      HS_SLAB_HALF_PREAMBLE(8);
1647      HS_CMP_HALF(0, r1);
1648      HS_CMP_HALF(1, r2);
1649      HS_CMP_HALF(2, r3);
1650      HS_CMP_HALF(3, r4);
1651      HS_CMP_HALF(4, r5);
1652      HS_CMP_HALF(5, r6);
1653      HS_CMP_HALF(6, r7);
1654      HS_CMP_HALF(7, r8);
1655    }
1656    {
1657      HS_SLAB_HALF_PREAMBLE(4);
1658      HS_CMP_HALF(0, r1);
1659      HS_CMP_HALF(1, r2);
1660      HS_CMP_HALF(2, r3);
1661      HS_CMP_HALF(3, r4);
1662      HS_CMP_HALF(4, r5);
1663      HS_CMP_HALF(5, r6);
1664      HS_CMP_HALF(6, r7);
1665      HS_CMP_HALF(7, r8);
1666    }
1667    {
1668      HS_SLAB_HALF_PREAMBLE(2);
1669      HS_CMP_HALF(0, r1);
1670      HS_CMP_HALF(1, r2);
1671      HS_CMP_HALF(2, r3);
1672      HS_CMP_HALF(3, r4);
1673      HS_CMP_HALF(4, r5);
1674      HS_CMP_HALF(5, r6);
1675      HS_CMP_HALF(6, r7);
1676      HS_CMP_HALF(7, r8);
1677    }
1678    {
1679      HS_SLAB_HALF_PREAMBLE(1);
1680      HS_CMP_HALF(0, r1);
1681      HS_CMP_HALF(1, r2);
1682      HS_CMP_HALF(2, r3);
1683      HS_CMP_HALF(3, r4);
1684      HS_CMP_HALF(4, r5);
1685      HS_CMP_HALF(5, r6);
1686      HS_CMP_HALF(6, r7);
1687      HS_CMP_HALF(7, r8);
1688    }
1689    HS_CMP_XCHG(r1, r5);
1690    HS_CMP_XCHG(r3, r7);
1691    HS_CMP_XCHG(r1, r3);
1692    HS_CMP_XCHG(r5, r7);
1693    HS_CMP_XCHG(r2, r6);
1694    HS_CMP_XCHG(r4, r8);
1695    HS_CMP_XCHG(r2, r4);
1696    HS_CMP_XCHG(r6, r8);
1697    HS_CMP_XCHG(r1, r2);
1698    HS_CMP_XCHG(r3, r4);
1699    HS_CMP_XCHG(r5, r6);
1700    HS_CMP_XCHG(r7, r8);
1701  }
  //
  // Local round 2 of 4: groups of 4 rows.
  //
1702  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1;
1703  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r8;
1704  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2;
1705  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r7;
1706  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3;
1707  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r6;
1708  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4;
1709  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r5;
1710  HS_BLOCK_BARRIER();
  // Four groups, 64 rows apart.  Within a group, rows +0 and +16 go
  // through HS_SLAB_LOCAL_L and rows +32 and +48 through HS_SLAB_LOCAL_R.
  // Steps per group: (2,3), (1,4), then (3,4), (1,2).
1711  if (HS_SUBGROUP_ID() < 8) {
1712    {
1713      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
1714      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(16);
1715      HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(32);
1716      HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(48);
1717      HS_CMP_XCHG(r0_2, r0_3);
1718      HS_CMP_XCHG(r0_1, r0_4);
1719      HS_CMP_XCHG(r0_3, r0_4);
1720      HS_CMP_XCHG(r0_1, r0_2);
1721      HS_SLAB_LOCAL_L(0) = r0_1;
1722      HS_SLAB_LOCAL_L(16) = r0_2;
1723      HS_SLAB_LOCAL_R(32) = r0_3;
1724      HS_SLAB_LOCAL_R(48) = r0_4;
1725    }
1726    {
1727      HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(64);
1728      HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(80);
1729      HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_R(96);
1730      HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_R(112);
1731      HS_CMP_XCHG(r1_2, r1_3);
1732      HS_CMP_XCHG(r1_1, r1_4);
1733      HS_CMP_XCHG(r1_3, r1_4);
1734      HS_CMP_XCHG(r1_1, r1_2);
1735      HS_SLAB_LOCAL_L(64) = r1_1;
1736      HS_SLAB_LOCAL_L(80) = r1_2;
1737      HS_SLAB_LOCAL_R(96) = r1_3;
1738      HS_SLAB_LOCAL_R(112) = r1_4;
1739    }
1740    {
1741      HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(128);
1742      HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_L(144);
1743      HS_KEY_TYPE r2_3 = HS_SLAB_LOCAL_R(160);
1744      HS_KEY_TYPE r2_4 = HS_SLAB_LOCAL_R(176);
1745      HS_CMP_XCHG(r2_2, r2_3);
1746      HS_CMP_XCHG(r2_1, r2_4);
1747      HS_CMP_XCHG(r2_3, r2_4);
1748      HS_CMP_XCHG(r2_1, r2_2);
1749      HS_SLAB_LOCAL_L(128) = r2_1;
1750      HS_SLAB_LOCAL_L(144) = r2_2;
1751      HS_SLAB_LOCAL_R(160) = r2_3;
1752      HS_SLAB_LOCAL_R(176) = r2_4;
1753    }
1754    {
1755      HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(192);
1756      HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_L(208);
1757      HS_KEY_TYPE r3_3 = HS_SLAB_LOCAL_R(224);
1758      HS_KEY_TYPE r3_4 = HS_SLAB_LOCAL_R(240);
1759      HS_CMP_XCHG(r3_2, r3_3);
1760      HS_CMP_XCHG(r3_1, r3_4);
1761      HS_CMP_XCHG(r3_3, r3_4);
1762      HS_CMP_XCHG(r3_1, r3_2);
1763      HS_SLAB_LOCAL_L(192) = r3_1;
1764      HS_SLAB_LOCAL_L(208) = r3_2;
1765      HS_SLAB_LOCAL_R(224) = r3_3;
1766      HS_SLAB_LOCAL_R(240) = r3_4;
1767    }
1768  }
  // Barrier, reload (same slot/register order), HALF 8/4/2/1 + 12 steps.
1769  HS_BLOCK_BARRIER();
1770  r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0);
1771  r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1);
1772  r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2);
1773  r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3);
1774  r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4);
1775  r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5);
1776  r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6);
1777  r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7);
1778  {
1779    {
1780      HS_SLAB_HALF_PREAMBLE(8);
1781      HS_CMP_HALF(0, r1);
1782      HS_CMP_HALF(1, r2);
1783      HS_CMP_HALF(2, r3);
1784      HS_CMP_HALF(3, r4);
1785      HS_CMP_HALF(4, r5);
1786      HS_CMP_HALF(5, r6);
1787      HS_CMP_HALF(6, r7);
1788      HS_CMP_HALF(7, r8);
1789    }
1790    {
1791      HS_SLAB_HALF_PREAMBLE(4);
1792      HS_CMP_HALF(0, r1);
1793      HS_CMP_HALF(1, r2);
1794      HS_CMP_HALF(2, r3);
1795      HS_CMP_HALF(3, r4);
1796      HS_CMP_HALF(4, r5);
1797      HS_CMP_HALF(5, r6);
1798      HS_CMP_HALF(6, r7);
1799      HS_CMP_HALF(7, r8);
1800    }
1801    {
1802      HS_SLAB_HALF_PREAMBLE(2);
1803      HS_CMP_HALF(0, r1);
1804      HS_CMP_HALF(1, r2);
1805      HS_CMP_HALF(2, r3);
1806      HS_CMP_HALF(3, r4);
1807      HS_CMP_HALF(4, r5);
1808      HS_CMP_HALF(5, r6);
1809      HS_CMP_HALF(6, r7);
1810      HS_CMP_HALF(7, r8);
1811    }
1812    {
1813      HS_SLAB_HALF_PREAMBLE(1);
1814      HS_CMP_HALF(0, r1);
1815      HS_CMP_HALF(1, r2);
1816      HS_CMP_HALF(2, r3);
1817      HS_CMP_HALF(3, r4);
1818      HS_CMP_HALF(4, r5);
1819      HS_CMP_HALF(5, r6);
1820      HS_CMP_HALF(6, r7);
1821      HS_CMP_HALF(7, r8);
1822    }
1823    HS_CMP_XCHG(r1, r5);
1824    HS_CMP_XCHG(r3, r7);
1825    HS_CMP_XCHG(r1, r3);
1826    HS_CMP_XCHG(r5, r7);
1827    HS_CMP_XCHG(r2, r6);
1828    HS_CMP_XCHG(r4, r8);
1829    HS_CMP_XCHG(r2, r4);
1830    HS_CMP_XCHG(r6, r8);
1831    HS_CMP_XCHG(r1, r2);
1832    HS_CMP_XCHG(r3, r4);
1833    HS_CMP_XCHG(r5, r6);
1834    HS_CMP_XCHG(r7, r8);
1835  }
  //
  // Local round 3 of 4: groups of 8 rows.
  //
1836  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1;
1837  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r8;
1838  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2;
1839  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r7;
1840  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3;
1841  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r6;
1842  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4;
1843  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r5;
1844  HS_BLOCK_BARRIER();
  // Two groups, 128 rows apart.  The first four rows of a group go through
  // HS_SLAB_LOCAL_L, the last four through HS_SLAB_LOCAL_R.  Steps per
  // group: (4,5), (3,6), (2,7), (1,8); then keys 5..8 at distance 2 and 1;
  // then keys 1..4 at distance 2 and 1.
1845  if (HS_SUBGROUP_ID() < 8) {
1846    {
1847      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
1848      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(16);
1849      HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(32);
1850      HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(48);
1851      HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_R(64);
1852      HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_R(80);
1853      HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_R(96);
1854      HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_R(112);
1855      HS_CMP_XCHG(r0_4, r0_5);
1856      HS_CMP_XCHG(r0_3, r0_6);
1857      HS_CMP_XCHG(r0_2, r0_7);
1858      HS_CMP_XCHG(r0_1, r0_8);
1859      HS_CMP_XCHG(r0_5, r0_7);
1860      HS_CMP_XCHG(r0_6, r0_8);
1861      HS_CMP_XCHG(r0_5, r0_6);
1862      HS_CMP_XCHG(r0_7, r0_8);
1863      HS_CMP_XCHG(r0_1, r0_3);
1864      HS_CMP_XCHG(r0_2, r0_4);
1865      HS_CMP_XCHG(r0_1, r0_2);
1866      HS_CMP_XCHG(r0_3, r0_4);
1867      HS_SLAB_LOCAL_L(0) = r0_1;
1868      HS_SLAB_LOCAL_L(16) = r0_2;
1869      HS_SLAB_LOCAL_L(32) = r0_3;
1870      HS_SLAB_LOCAL_L(48) = r0_4;
1871      HS_SLAB_LOCAL_R(64) = r0_5;
1872      HS_SLAB_LOCAL_R(80) = r0_6;
1873      HS_SLAB_LOCAL_R(96) = r0_7;
1874      HS_SLAB_LOCAL_R(112) = r0_8;
1875    }
1876    {
1877      HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(128);
1878      HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(144);
1879      HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_L(160);
1880      HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_L(176);
1881      HS_KEY_TYPE r1_5 = HS_SLAB_LOCAL_R(192);
1882      HS_KEY_TYPE r1_6 = HS_SLAB_LOCAL_R(208);
1883      HS_KEY_TYPE r1_7 = HS_SLAB_LOCAL_R(224);
1884      HS_KEY_TYPE r1_8 = HS_SLAB_LOCAL_R(240);
1885      HS_CMP_XCHG(r1_4, r1_5);
1886      HS_CMP_XCHG(r1_3, r1_6);
1887      HS_CMP_XCHG(r1_2, r1_7);
1888      HS_CMP_XCHG(r1_1, r1_8);
1889      HS_CMP_XCHG(r1_5, r1_7);
1890      HS_CMP_XCHG(r1_6, r1_8);
1891      HS_CMP_XCHG(r1_5, r1_6);
1892      HS_CMP_XCHG(r1_7, r1_8);
1893      HS_CMP_XCHG(r1_1, r1_3);
1894      HS_CMP_XCHG(r1_2, r1_4);
1895      HS_CMP_XCHG(r1_1, r1_2);
1896      HS_CMP_XCHG(r1_3, r1_4);
1897      HS_SLAB_LOCAL_L(128) = r1_1;
1898      HS_SLAB_LOCAL_L(144) = r1_2;
1899      HS_SLAB_LOCAL_L(160) = r1_3;
1900      HS_SLAB_LOCAL_L(176) = r1_4;
1901      HS_SLAB_LOCAL_R(192) = r1_5;
1902      HS_SLAB_LOCAL_R(208) = r1_6;
1903      HS_SLAB_LOCAL_R(224) = r1_7;
1904      HS_SLAB_LOCAL_R(240) = r1_8;
1905    }
1906  }
  // Barrier, reload (same slot/register order), HALF 8/4/2/1 + 12 steps.
1907  HS_BLOCK_BARRIER();
1908  r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0);
1909  r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1);
1910  r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2);
1911  r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3);
1912  r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4);
1913  r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5);
1914  r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6);
1915  r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7);
1916  {
1917    {
1918      HS_SLAB_HALF_PREAMBLE(8);
1919      HS_CMP_HALF(0, r1);
1920      HS_CMP_HALF(1, r2);
1921      HS_CMP_HALF(2, r3);
1922      HS_CMP_HALF(3, r4);
1923      HS_CMP_HALF(4, r5);
1924      HS_CMP_HALF(5, r6);
1925      HS_CMP_HALF(6, r7);
1926      HS_CMP_HALF(7, r8);
1927    }
1928    {
1929      HS_SLAB_HALF_PREAMBLE(4);
1930      HS_CMP_HALF(0, r1);
1931      HS_CMP_HALF(1, r2);
1932      HS_CMP_HALF(2, r3);
1933      HS_CMP_HALF(3, r4);
1934      HS_CMP_HALF(4, r5);
1935      HS_CMP_HALF(5, r6);
1936      HS_CMP_HALF(6, r7);
1937      HS_CMP_HALF(7, r8);
1938    }
1939    {
1940      HS_SLAB_HALF_PREAMBLE(2);
1941      HS_CMP_HALF(0, r1);
1942      HS_CMP_HALF(1, r2);
1943      HS_CMP_HALF(2, r3);
1944      HS_CMP_HALF(3, r4);
1945      HS_CMP_HALF(4, r5);
1946      HS_CMP_HALF(5, r6);
1947      HS_CMP_HALF(6, r7);
1948      HS_CMP_HALF(7, r8);
1949    }
1950    {
1951      HS_SLAB_HALF_PREAMBLE(1);
1952      HS_CMP_HALF(0, r1);
1953      HS_CMP_HALF(1, r2);
1954      HS_CMP_HALF(2, r3);
1955      HS_CMP_HALF(3, r4);
1956      HS_CMP_HALF(4, r5);
1957      HS_CMP_HALF(5, r6);
1958      HS_CMP_HALF(6, r7);
1959      HS_CMP_HALF(7, r8);
1960    }
1961    HS_CMP_XCHG(r1, r5);
1962    HS_CMP_XCHG(r3, r7);
1963    HS_CMP_XCHG(r1, r3);
1964    HS_CMP_XCHG(r5, r7);
1965    HS_CMP_XCHG(r2, r6);
1966    HS_CMP_XCHG(r4, r8);
1967    HS_CMP_XCHG(r2, r4);
1968    HS_CMP_XCHG(r6, r8);
1969    HS_CMP_XCHG(r1, r2);
1970    HS_CMP_XCHG(r3, r4);
1971    HS_CMP_XCHG(r5, r6);
1972    HS_CMP_XCHG(r7, r8);
1973  }
  //
  // Local round 4 of 4: one group of 16 rows.
  //
1974  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1;
1975  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r8;
1976  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2;
1977  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r7;
1978  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3;
1979  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r6;
1980  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4;
1981  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r5;
1982  HS_BLOCK_BARRIER();
  // Rows 0..112 (keys 1..8) go through HS_SLAB_LOCAL_L, rows 128..240
  // (keys 9..16) through HS_SLAB_LOCAL_R.
1983  if (HS_SUBGROUP_ID() < 8) {
1984    {
1985      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
1986      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(16);
1987      HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(32);
1988      HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(48);
1989      HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_L(64);
1990      HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_L(80);
1991      HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_L(96);
1992      HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_L(112);
1993      HS_KEY_TYPE r0_9 = HS_SLAB_LOCAL_R(128);
1994      HS_KEY_TYPE r0_10 = HS_SLAB_LOCAL_R(144);
1995      HS_KEY_TYPE r0_11 = HS_SLAB_LOCAL_R(160);
1996      HS_KEY_TYPE r0_12 = HS_SLAB_LOCAL_R(176);
1997      HS_KEY_TYPE r0_13 = HS_SLAB_LOCAL_R(192);
1998      HS_KEY_TYPE r0_14 = HS_SLAB_LOCAL_R(208);
1999      HS_KEY_TYPE r0_15 = HS_SLAB_LOCAL_R(224);
2000      HS_KEY_TYPE r0_16 = HS_SLAB_LOCAL_R(240);
      // Cross pairs, innermost first: (8,9), (7,10), ... (1,16).
2001      HS_CMP_XCHG(r0_8, r0_9);
2002      HS_CMP_XCHG(r0_7, r0_10);
2003      HS_CMP_XCHG(r0_6, r0_11);
2004      HS_CMP_XCHG(r0_5, r0_12);
2005      HS_CMP_XCHG(r0_4, r0_13);
2006      HS_CMP_XCHG(r0_3, r0_14);
2007      HS_CMP_XCHG(r0_2, r0_15);
2008      HS_CMP_XCHG(r0_1, r0_16);
      // 12-step sequence on keys 9..16 (same shape as the r1..r8 one).
2009      HS_CMP_XCHG(r0_9, r0_13);
2010      HS_CMP_XCHG(r0_11, r0_15);
2011      HS_CMP_XCHG(r0_9, r0_11);
2012      HS_CMP_XCHG(r0_13, r0_15);
2013      HS_CMP_XCHG(r0_10, r0_14);
2014      HS_CMP_XCHG(r0_12, r0_16);
2015      HS_CMP_XCHG(r0_10, r0_12);
2016      HS_CMP_XCHG(r0_14, r0_16);
2017      HS_CMP_XCHG(r0_9, r0_10);
2018      HS_CMP_XCHG(r0_11, r0_12);
2019      HS_CMP_XCHG(r0_13, r0_14);
2020      HS_CMP_XCHG(r0_15, r0_16);
      // 12-step sequence on keys 1..8.
2021      HS_CMP_XCHG(r0_1, r0_5);
2022      HS_CMP_XCHG(r0_3, r0_7);
2023      HS_CMP_XCHG(r0_1, r0_3);
2024      HS_CMP_XCHG(r0_5, r0_7);
2025      HS_CMP_XCHG(r0_2, r0_6);
2026      HS_CMP_XCHG(r0_4, r0_8);
2027      HS_CMP_XCHG(r0_2, r0_4);
2028      HS_CMP_XCHG(r0_6, r0_8);
2029      HS_CMP_XCHG(r0_1, r0_2);
2030      HS_CMP_XCHG(r0_3, r0_4);
2031      HS_CMP_XCHG(r0_5, r0_6);
2032      HS_CMP_XCHG(r0_7, r0_8);
2033      HS_SLAB_LOCAL_L(0) = r0_1;
2034      HS_SLAB_LOCAL_L(16) = r0_2;
2035      HS_SLAB_LOCAL_L(32) = r0_3;
2036      HS_SLAB_LOCAL_L(48) = r0_4;
2037      HS_SLAB_LOCAL_L(64) = r0_5;
2038      HS_SLAB_LOCAL_L(80) = r0_6;
2039      HS_SLAB_LOCAL_L(96) = r0_7;
2040      HS_SLAB_LOCAL_L(112) = r0_8;
2041      HS_SLAB_LOCAL_R(128) = r0_9;
2042      HS_SLAB_LOCAL_R(144) = r0_10;
2043      HS_SLAB_LOCAL_R(160) = r0_11;
2044      HS_SLAB_LOCAL_R(176) = r0_12;
2045      HS_SLAB_LOCAL_R(192) = r0_13;
2046      HS_SLAB_LOCAL_R(208) = r0_14;
2047      HS_SLAB_LOCAL_R(224) = r0_15;
2048      HS_SLAB_LOCAL_R(240) = r0_16;
2049    }
2050  }
  // Final barrier and reload, then the last HALF 8/4/2/1 + 12-step pass.
2051  HS_BLOCK_BARRIER();
2052  r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0);
2053  r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1);
2054  r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2);
2055  r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3);
2056  r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4);
2057  r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5);
2058  r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6);
2059  r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7);
2060  {
2061    {
2062      HS_SLAB_HALF_PREAMBLE(8);
2063      HS_CMP_HALF(0, r1);
2064      HS_CMP_HALF(1, r2);
2065      HS_CMP_HALF(2, r3);
2066      HS_CMP_HALF(3, r4);
2067      HS_CMP_HALF(4, r5);
2068      HS_CMP_HALF(5, r6);
2069      HS_CMP_HALF(6, r7);
2070      HS_CMP_HALF(7, r8);
2071    }
2072    {
2073      HS_SLAB_HALF_PREAMBLE(4);
2074      HS_CMP_HALF(0, r1);
2075      HS_CMP_HALF(1, r2);
2076      HS_CMP_HALF(2, r3);
2077      HS_CMP_HALF(3, r4);
2078      HS_CMP_HALF(4, r5);
2079      HS_CMP_HALF(5, r6);
2080      HS_CMP_HALF(6, r7);
2081      HS_CMP_HALF(7, r8);
2082    }
2083    {
2084      HS_SLAB_HALF_PREAMBLE(2);
2085      HS_CMP_HALF(0, r1);
2086      HS_CMP_HALF(1, r2);
2087      HS_CMP_HALF(2, r3);
2088      HS_CMP_HALF(3, r4);
2089      HS_CMP_HALF(4, r5);
2090      HS_CMP_HALF(5, r6);
2091      HS_CMP_HALF(6, r7);
2092      HS_CMP_HALF(7, r8);
2093    }
2094    {
2095      HS_SLAB_HALF_PREAMBLE(1);
2096      HS_CMP_HALF(0, r1);
2097      HS_CMP_HALF(1, r2);
2098      HS_CMP_HALF(2, r3);
2099      HS_CMP_HALF(3, r4);
2100      HS_CMP_HALF(4, r5);
2101      HS_CMP_HALF(5, r6);
2102      HS_CMP_HALF(6, r7);
2103      HS_CMP_HALF(7, r8);
2104    }
2105    HS_CMP_XCHG(r1, r5);
2106    HS_CMP_XCHG(r3, r7);
2107    HS_CMP_XCHG(r1, r3);
2108    HS_CMP_XCHG(r5, r7);
2109    HS_CMP_XCHG(r2, r6);
2110    HS_CMP_XCHG(r4, r8);
2111    HS_CMP_XCHG(r2, r4);
2112    HS_CMP_XCHG(r6, r8);
2113    HS_CMP_XCHG(r1, r2);
2114    HS_CMP_XCHG(r3, r4);
2115    HS_CMP_XCHG(r5, r6);
2116    HS_CMP_XCHG(r7, r8);
2117  }
  // Write r1..r8 out as rows 0..7, in plain register order.
2118  HS_SLAB_GLOBAL_STORE(0, r1);
2119  HS_SLAB_GLOBAL_STORE(1, r2);
2120  HS_SLAB_GLOBAL_STORE(2, r3);
2121  HS_SLAB_GLOBAL_STORE(3, r4);
2122  HS_SLAB_GLOBAL_STORE(4, r5);
2123  HS_SLAB_GLOBAL_STORE(5, r6);
2124  HS_SLAB_GLOBAL_STORE(6, r7);
2125  HS_SLAB_GLOBAL_STORE(7, r8);
2126}
2127
//
// HS_BC_KERNEL_PROTO(1, 0) kernel body.
//
// Loads rows 0..7 of `vout` into r1..r8, applies HS_SLAB_HALF_PREAMBLE
// passes 8, 4, 2, 1 followed by 12 HS_CMP_XCHG steps, and writes r1..r8
// back as rows 0..7 with HS_SLAB_GLOBAL_STORE.  No block-local memory and
// no HS_BLOCK_BARRIER are used.
//
// Unlike the HS_BS_* kernels in this file, the loads name `vout`, not
// `vin`.  The store macro takes no buffer argument, so its destination is
// not visible here.
//
// NOTE(review): "BC" presumably means block clean, i.e. a merge-only pass
// over keys already in `vout` -- inferred from the name and the absence of
// the opening 19-step and flip stages; confirm against hs_cl_macros.h.
//
// NOTE(review): the decimal prefix fused onto every line looks like a
// listing artifact rather than source text -- confirm and strip it before
// building.
//
2128HS_BC_KERNEL_PROTO(1, 0)
2129{
  // Load rows 0..7 of `vout` into r1..r8.
2130  HS_SLAB_GLOBAL_PREAMBLE();
2131  HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vout, 0);
2132  HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vout, 1);
2133  HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vout, 2);
2134  HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vout, 3);
2135  HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vout, 4);
2136  HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vout, 5);
2137  HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vout, 6);
2138  HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vout, 7);
  // HALF passes with preamble arguments 8, 4, 2, 1; every pass touches all
  // eight registers with indices 0..7.  Each inner brace pair scopes one
  // HS_SLAB_HALF_PREAMBLE; keep the braces.
2139  {
2140    {
2141      HS_SLAB_HALF_PREAMBLE(8);
2142      HS_CMP_HALF(0, r1);
2143      HS_CMP_HALF(1, r2);
2144      HS_CMP_HALF(2, r3);
2145      HS_CMP_HALF(3, r4);
2146      HS_CMP_HALF(4, r5);
2147      HS_CMP_HALF(5, r6);
2148      HS_CMP_HALF(6, r7);
2149      HS_CMP_HALF(7, r8);
2150    }
2151    {
2152      HS_SLAB_HALF_PREAMBLE(4);
2153      HS_CMP_HALF(0, r1);
2154      HS_CMP_HALF(1, r2);
2155      HS_CMP_HALF(2, r3);
2156      HS_CMP_HALF(3, r4);
2157      HS_CMP_HALF(4, r5);
2158      HS_CMP_HALF(5, r6);
2159      HS_CMP_HALF(6, r7);
2160      HS_CMP_HALF(7, r8);
2161    }
2162    {
2163      HS_SLAB_HALF_PREAMBLE(2);
2164      HS_CMP_HALF(0, r1);
2165      HS_CMP_HALF(1, r2);
2166      HS_CMP_HALF(2, r3);
2167      HS_CMP_HALF(3, r4);
2168      HS_CMP_HALF(4, r5);
2169      HS_CMP_HALF(5, r6);
2170      HS_CMP_HALF(6, r7);
2171      HS_CMP_HALF(7, r8);
2172    }
2173    {
2174      HS_SLAB_HALF_PREAMBLE(1);
2175      HS_CMP_HALF(0, r1);
2176      HS_CMP_HALF(1, r2);
2177      HS_CMP_HALF(2, r3);
2178      HS_CMP_HALF(3, r4);
2179      HS_CMP_HALF(4, r5);
2180      HS_CMP_HALF(5, r6);
2181      HS_CMP_HALF(6, r7);
2182      HS_CMP_HALF(7, r8);
2183    }
    // 12 HS_CMP_XCHG steps: odd registers (r1,r3,r5,r7) at distance 4 then
    // 2, even registers (r2,r4,r6,r8) likewise, then the adjacent pairs.
2184    HS_CMP_XCHG(r1, r5);
2185    HS_CMP_XCHG(r3, r7);
2186    HS_CMP_XCHG(r1, r3);
2187    HS_CMP_XCHG(r5, r7);
2188    HS_CMP_XCHG(r2, r6);
2189    HS_CMP_XCHG(r4, r8);
2190    HS_CMP_XCHG(r2, r4);
2191    HS_CMP_XCHG(r6, r8);
2192    HS_CMP_XCHG(r1, r2);
2193    HS_CMP_XCHG(r3, r4);
2194    HS_CMP_XCHG(r5, r6);
2195    HS_CMP_XCHG(r7, r8);
2196  }
  // Write r1..r8 back as rows 0..7.
2197  HS_SLAB_GLOBAL_STORE(0, r1);
2198  HS_SLAB_GLOBAL_STORE(1, r2);
2199  HS_SLAB_GLOBAL_STORE(2, r3);
2200  HS_SLAB_GLOBAL_STORE(3, r4);
2201  HS_SLAB_GLOBAL_STORE(4, r5);
2202  HS_SLAB_GLOBAL_STORE(5, r6);
2203  HS_SLAB_GLOBAL_STORE(6, r7);
2204  HS_SLAB_GLOBAL_STORE(7, r8);
2205}
2206
2207HS_BC_KERNEL_PROTO(2, 1)
2208{
2209  HS_BLOCK_LOCAL_MEM_DECL(32, 8);
2210
2211  HS_SLAB_GLOBAL_PREAMBLE();
2212  HS_BC_MERGE_H_PREAMBLE(2);
2213  {
2214    {
2215      HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0);
2216      HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8);
2217      HS_CMP_XCHG(r0_1, r0_2);
2218      HS_SLAB_LOCAL_L(0) = r0_1;
2219      HS_SLAB_LOCAL_L(16) = r0_2;
2220    }
2221    {
2222      HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(2);
2223      HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(10);
2224      HS_CMP_XCHG(r0_1, r0_2);
2225      HS_SLAB_LOCAL_L(64) = r0_1;
2226      HS_SLAB_LOCAL_L(80) = r0_2;
2227    }
2228    {
2229      HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(4);
2230      HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(12);
2231      HS_CMP_XCHG(r0_1, r0_2);
2232      HS_SLAB_LOCAL_L(128) = r0_1;
2233      HS_SLAB_LOCAL_L(144) = r0_2;
2234    }
2235    {
2236      HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(6);
2237      HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(14);
2238      HS_CMP_XCHG(r0_1, r0_2);
2239      HS_SLAB_LOCAL_L(192) = r0_1;
2240      HS_SLAB_LOCAL_L(208) = r0_2;
2241    }
2242  }
2243  HS_BLOCK_BARRIER();
2244  HS_KEY_TYPE r1 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 0);
2245  HS_KEY_TYPE r2 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 1);
2246  HS_KEY_TYPE r3 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 2);
2247  HS_KEY_TYPE r4 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 3);
2248  HS_KEY_TYPE r5 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 4);
2249  HS_KEY_TYPE r6 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 5);
2250  HS_KEY_TYPE r7 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 6);
2251  HS_KEY_TYPE r8 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 7);
2252  {
2253    {
2254      HS_SLAB_HALF_PREAMBLE(8);
2255      HS_CMP_HALF(0, r1);
2256      HS_CMP_HALF(1, r2);
2257      HS_CMP_HALF(2, r3);
2258      HS_CMP_HALF(3, r4);
2259      HS_CMP_HALF(4, r5);
2260      HS_CMP_HALF(5, r6);
2261      HS_CMP_HALF(6, r7);
2262      HS_CMP_HALF(7, r8);
2263    }
2264    {
2265      HS_SLAB_HALF_PREAMBLE(4);
2266      HS_CMP_HALF(0, r1);
2267      HS_CMP_HALF(1, r2);
2268      HS_CMP_HALF(2, r3);
2269      HS_CMP_HALF(3, r4);
2270      HS_CMP_HALF(4, r5);
2271      HS_CMP_HALF(5, r6);
2272      HS_CMP_HALF(6, r7);
2273      HS_CMP_HALF(7, r8);
2274    }
2275    {
2276      HS_SLAB_HALF_PREAMBLE(2);
2277      HS_CMP_HALF(0, r1);
2278      HS_CMP_HALF(1, r2);
2279      HS_CMP_HALF(2, r3);
2280      HS_CMP_HALF(3, r4);
2281      HS_CMP_HALF(4, r5);
2282      HS_CMP_HALF(5, r6);
2283      HS_CMP_HALF(6, r7);
2284      HS_CMP_HALF(7, r8);
2285    }
2286    {
2287      HS_SLAB_HALF_PREAMBLE(1);
2288      HS_CMP_HALF(0, r1);
2289      HS_CMP_HALF(1, r2);
2290      HS_CMP_HALF(2, r3);
2291      HS_CMP_HALF(3, r4);
2292      HS_CMP_HALF(4, r5);
2293      HS_CMP_HALF(5, r6);
2294      HS_CMP_HALF(6, r7);
2295      HS_CMP_HALF(7, r8);
2296    }
2297    HS_CMP_XCHG(r1, r5);
2298    HS_CMP_XCHG(r3, r7);
2299    HS_CMP_XCHG(r1, r3);
2300    HS_CMP_XCHG(r5, r7);
2301    HS_CMP_XCHG(r2, r6);
2302    HS_CMP_XCHG(r4, r8);
2303    HS_CMP_XCHG(r2, r4);
2304    HS_CMP_XCHG(r6, r8);
2305    HS_CMP_XCHG(r1, r2);
2306    HS_CMP_XCHG(r3, r4);
2307    HS_CMP_XCHG(r5, r6);
2308    HS_CMP_XCHG(r7, r8);
2309  }
2310  HS_SLAB_GLOBAL_STORE(0, r1);
2311  HS_SLAB_GLOBAL_STORE(1, r2);
2312  HS_SLAB_GLOBAL_STORE(2, r3);
2313  HS_SLAB_GLOBAL_STORE(3, r4);
2314  HS_SLAB_GLOBAL_STORE(4, r5);
2315  HS_SLAB_GLOBAL_STORE(5, r6);
2316  HS_SLAB_GLOBAL_STORE(6, r7);
2317  HS_SLAB_GLOBAL_STORE(7, r8);
2318}
2319
2320HS_BC_KERNEL_PROTO(4, 2)
2321{
2322  HS_BLOCK_LOCAL_MEM_DECL(64, 8);
2323
2324  HS_SLAB_GLOBAL_PREAMBLE();
2325  HS_BC_MERGE_H_PREAMBLE(4);
2326  {
2327    {
2328      HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0);
2329      HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8);
2330      HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(16);
2331      HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(24);
2332      HS_CMP_XCHG(r0_1, r0_3);
2333      HS_CMP_XCHG(r0_2, r0_4);
2334      HS_CMP_XCHG(r0_1, r0_2);
2335      HS_CMP_XCHG(r0_3, r0_4);
2336      HS_SLAB_LOCAL_L(0) = r0_1;
2337      HS_SLAB_LOCAL_L(16) = r0_2;
2338      HS_SLAB_LOCAL_L(32) = r0_3;
2339      HS_SLAB_LOCAL_L(48) = r0_4;
2340    }
2341    {
2342      HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(4);
2343      HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(12);
2344      HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(20);
2345      HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(28);
2346      HS_CMP_XCHG(r0_1, r0_3);
2347      HS_CMP_XCHG(r0_2, r0_4);
2348      HS_CMP_XCHG(r0_1, r0_2);
2349      HS_CMP_XCHG(r0_3, r0_4);
2350      HS_SLAB_LOCAL_L(256) = r0_1;
2351      HS_SLAB_LOCAL_L(272) = r0_2;
2352      HS_SLAB_LOCAL_L(288) = r0_3;
2353      HS_SLAB_LOCAL_L(304) = r0_4;
2354    }
2355  }
2356  HS_BLOCK_BARRIER();
2357  HS_KEY_TYPE r1 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0);
2358  HS_KEY_TYPE r2 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1);
2359  HS_KEY_TYPE r3 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2);
2360  HS_KEY_TYPE r4 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3);
2361  HS_KEY_TYPE r5 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4);
2362  HS_KEY_TYPE r6 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5);
2363  HS_KEY_TYPE r7 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6);
2364  HS_KEY_TYPE r8 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7);
2365  {
2366    {
2367      HS_SLAB_HALF_PREAMBLE(8);
2368      HS_CMP_HALF(0, r1);
2369      HS_CMP_HALF(1, r2);
2370      HS_CMP_HALF(2, r3);
2371      HS_CMP_HALF(3, r4);
2372      HS_CMP_HALF(4, r5);
2373      HS_CMP_HALF(5, r6);
2374      HS_CMP_HALF(6, r7);
2375      HS_CMP_HALF(7, r8);
2376    }
2377    {
2378      HS_SLAB_HALF_PREAMBLE(4);
2379      HS_CMP_HALF(0, r1);
2380      HS_CMP_HALF(1, r2);
2381      HS_CMP_HALF(2, r3);
2382      HS_CMP_HALF(3, r4);
2383      HS_CMP_HALF(4, r5);
2384      HS_CMP_HALF(5, r6);
2385      HS_CMP_HALF(6, r7);
2386      HS_CMP_HALF(7, r8);
2387    }
2388    {
2389      HS_SLAB_HALF_PREAMBLE(2);
2390      HS_CMP_HALF(0, r1);
2391      HS_CMP_HALF(1, r2);
2392      HS_CMP_HALF(2, r3);
2393      HS_CMP_HALF(3, r4);
2394      HS_CMP_HALF(4, r5);
2395      HS_CMP_HALF(5, r6);
2396      HS_CMP_HALF(6, r7);
2397      HS_CMP_HALF(7, r8);
2398    }
2399    {
2400      HS_SLAB_HALF_PREAMBLE(1);
2401      HS_CMP_HALF(0, r1);
2402      HS_CMP_HALF(1, r2);
2403      HS_CMP_HALF(2, r3);
2404      HS_CMP_HALF(3, r4);
2405      HS_CMP_HALF(4, r5);
2406      HS_CMP_HALF(5, r6);
2407      HS_CMP_HALF(6, r7);
2408      HS_CMP_HALF(7, r8);
2409    }
2410    HS_CMP_XCHG(r1, r5);
2411    HS_CMP_XCHG(r3, r7);
2412    HS_CMP_XCHG(r1, r3);
2413    HS_CMP_XCHG(r5, r7);
2414    HS_CMP_XCHG(r2, r6);
2415    HS_CMP_XCHG(r4, r8);
2416    HS_CMP_XCHG(r2, r4);
2417    HS_CMP_XCHG(r6, r8);
2418    HS_CMP_XCHG(r1, r2);
2419    HS_CMP_XCHG(r3, r4);
2420    HS_CMP_XCHG(r5, r6);
2421    HS_CMP_XCHG(r7, r8);
2422  }
2423  HS_SLAB_GLOBAL_STORE(0, r1);
2424  HS_SLAB_GLOBAL_STORE(1, r2);
2425  HS_SLAB_GLOBAL_STORE(2, r3);
2426  HS_SLAB_GLOBAL_STORE(3, r4);
2427  HS_SLAB_GLOBAL_STORE(4, r5);
2428  HS_SLAB_GLOBAL_STORE(5, r6);
2429  HS_SLAB_GLOBAL_STORE(6, r7);
2430  HS_SLAB_GLOBAL_STORE(7, r8);
2431}
2432
2433HS_BC_KERNEL_PROTO(8, 3)
2434{
2435  HS_BLOCK_LOCAL_MEM_DECL(128, 8);
2436
2437  HS_SLAB_GLOBAL_PREAMBLE();
2438  HS_BC_MERGE_H_PREAMBLE(8);
2439  {
2440    {
2441      HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0);
2442      HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8);
2443      HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(16);
2444      HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(24);
2445      HS_KEY_TYPE r0_5 = HS_BC_GLOBAL_LOAD_L(32);
2446      HS_KEY_TYPE r0_6 = HS_BC_GLOBAL_LOAD_L(40);
2447      HS_KEY_TYPE r0_7 = HS_BC_GLOBAL_LOAD_L(48);
2448      HS_KEY_TYPE r0_8 = HS_BC_GLOBAL_LOAD_L(56);
2449      HS_CMP_XCHG(r0_1, r0_5);
2450      HS_CMP_XCHG(r0_3, r0_7);
2451      HS_CMP_XCHG(r0_1, r0_3);
2452      HS_CMP_XCHG(r0_5, r0_7);
2453      HS_CMP_XCHG(r0_2, r0_6);
2454      HS_CMP_XCHG(r0_4, r0_8);
2455      HS_CMP_XCHG(r0_2, r0_4);
2456      HS_CMP_XCHG(r0_6, r0_8);
2457      HS_CMP_XCHG(r0_1, r0_2);
2458      HS_CMP_XCHG(r0_3, r0_4);
2459      HS_CMP_XCHG(r0_5, r0_6);
2460      HS_CMP_XCHG(r0_7, r0_8);
2461      HS_SLAB_LOCAL_L(0) = r0_1;
2462      HS_SLAB_LOCAL_L(16) = r0_2;
2463      HS_SLAB_LOCAL_L(32) = r0_3;
2464      HS_SLAB_LOCAL_L(48) = r0_4;
2465      HS_SLAB_LOCAL_L(64) = r0_5;
2466      HS_SLAB_LOCAL_L(80) = r0_6;
2467      HS_SLAB_LOCAL_L(96) = r0_7;
2468      HS_SLAB_LOCAL_L(112) = r0_8;
2469    }
2470  }
2471  HS_BLOCK_BARRIER();
2472  HS_KEY_TYPE r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0);
2473  HS_KEY_TYPE r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1);
2474  HS_KEY_TYPE r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2);
2475  HS_KEY_TYPE r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3);
2476  HS_KEY_TYPE r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4);
2477  HS_KEY_TYPE r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5);
2478  HS_KEY_TYPE r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6);
2479  HS_KEY_TYPE r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7);
2480  {
2481    {
2482      HS_SLAB_HALF_PREAMBLE(8);
2483      HS_CMP_HALF(0, r1);
2484      HS_CMP_HALF(1, r2);
2485      HS_CMP_HALF(2, r3);
2486      HS_CMP_HALF(3, r4);
2487      HS_CMP_HALF(4, r5);
2488      HS_CMP_HALF(5, r6);
2489      HS_CMP_HALF(6, r7);
2490      HS_CMP_HALF(7, r8);
2491    }
2492    {
2493      HS_SLAB_HALF_PREAMBLE(4);
2494      HS_CMP_HALF(0, r1);
2495      HS_CMP_HALF(1, r2);
2496      HS_CMP_HALF(2, r3);
2497      HS_CMP_HALF(3, r4);
2498      HS_CMP_HALF(4, r5);
2499      HS_CMP_HALF(5, r6);
2500      HS_CMP_HALF(6, r7);
2501      HS_CMP_HALF(7, r8);
2502    }
2503    {
2504      HS_SLAB_HALF_PREAMBLE(2);
2505      HS_CMP_HALF(0, r1);
2506      HS_CMP_HALF(1, r2);
2507      HS_CMP_HALF(2, r3);
2508      HS_CMP_HALF(3, r4);
2509      HS_CMP_HALF(4, r5);
2510      HS_CMP_HALF(5, r6);
2511      HS_CMP_HALF(6, r7);
2512      HS_CMP_HALF(7, r8);
2513    }
2514    {
2515      HS_SLAB_HALF_PREAMBLE(1);
2516      HS_CMP_HALF(0, r1);
2517      HS_CMP_HALF(1, r2);
2518      HS_CMP_HALF(2, r3);
2519      HS_CMP_HALF(3, r4);
2520      HS_CMP_HALF(4, r5);
2521      HS_CMP_HALF(5, r6);
2522      HS_CMP_HALF(6, r7);
2523      HS_CMP_HALF(7, r8);
2524    }
2525    HS_CMP_XCHG(r1, r5);
2526    HS_CMP_XCHG(r3, r7);
2527    HS_CMP_XCHG(r1, r3);
2528    HS_CMP_XCHG(r5, r7);
2529    HS_CMP_XCHG(r2, r6);
2530    HS_CMP_XCHG(r4, r8);
2531    HS_CMP_XCHG(r2, r4);
2532    HS_CMP_XCHG(r6, r8);
2533    HS_CMP_XCHG(r1, r2);
2534    HS_CMP_XCHG(r3, r4);
2535    HS_CMP_XCHG(r5, r6);
2536    HS_CMP_XCHG(r7, r8);
2537  }
2538  HS_SLAB_GLOBAL_STORE(0, r1);
2539  HS_SLAB_GLOBAL_STORE(1, r2);
2540  HS_SLAB_GLOBAL_STORE(2, r3);
2541  HS_SLAB_GLOBAL_STORE(3, r4);
2542  HS_SLAB_GLOBAL_STORE(4, r5);
2543  HS_SLAB_GLOBAL_STORE(5, r6);
2544  HS_SLAB_GLOBAL_STORE(6, r7);
2545  HS_SLAB_GLOBAL_STORE(7, r8);
2546}
2547
2548HS_BC_KERNEL_PROTO(16, 4)
2549{
2550  HS_BLOCK_LOCAL_MEM_DECL(256, 8);
2551
2552  HS_SLAB_GLOBAL_PREAMBLE();
2553  HS_BC_MERGE_H_PREAMBLE(16);
2554  if (HS_SUBGROUP_ID() < 8) {
2555    {
2556      HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0);
2557      HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8);
2558      HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(16);
2559      HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(24);
2560      HS_KEY_TYPE r0_5 = HS_BC_GLOBAL_LOAD_L(32);
2561      HS_KEY_TYPE r0_6 = HS_BC_GLOBAL_LOAD_L(40);
2562      HS_KEY_TYPE r0_7 = HS_BC_GLOBAL_LOAD_L(48);
2563      HS_KEY_TYPE r0_8 = HS_BC_GLOBAL_LOAD_L(56);
2564      HS_KEY_TYPE r0_9 = HS_BC_GLOBAL_LOAD_L(64);
2565      HS_KEY_TYPE r0_10 = HS_BC_GLOBAL_LOAD_L(72);
2566      HS_KEY_TYPE r0_11 = HS_BC_GLOBAL_LOAD_L(80);
2567      HS_KEY_TYPE r0_12 = HS_BC_GLOBAL_LOAD_L(88);
2568      HS_KEY_TYPE r0_13 = HS_BC_GLOBAL_LOAD_L(96);
2569      HS_KEY_TYPE r0_14 = HS_BC_GLOBAL_LOAD_L(104);
2570      HS_KEY_TYPE r0_15 = HS_BC_GLOBAL_LOAD_L(112);
2571      HS_KEY_TYPE r0_16 = HS_BC_GLOBAL_LOAD_L(120);
2572      HS_CMP_XCHG(r0_1, r0_9);
2573      HS_CMP_XCHG(r0_5, r0_13);
2574      HS_CMP_XCHG(r0_1, r0_5);
2575      HS_CMP_XCHG(r0_9, r0_13);
2576      HS_CMP_XCHG(r0_3, r0_11);
2577      HS_CMP_XCHG(r0_7, r0_15);
2578      HS_CMP_XCHG(r0_3, r0_7);
2579      HS_CMP_XCHG(r0_11, r0_15);
2580      HS_CMP_XCHG(r0_1, r0_3);
2581      HS_CMP_XCHG(r0_5, r0_7);
2582      HS_CMP_XCHG(r0_9, r0_11);
2583      HS_CMP_XCHG(r0_13, r0_15);
2584      HS_CMP_XCHG(r0_2, r0_10);
2585      HS_CMP_XCHG(r0_6, r0_14);
2586      HS_CMP_XCHG(r0_2, r0_6);
2587      HS_CMP_XCHG(r0_10, r0_14);
2588      HS_CMP_XCHG(r0_4, r0_12);
2589      HS_CMP_XCHG(r0_8, r0_16);
2590      HS_CMP_XCHG(r0_4, r0_8);
2591      HS_CMP_XCHG(r0_12, r0_16);
2592      HS_CMP_XCHG(r0_2, r0_4);
2593      HS_CMP_XCHG(r0_6, r0_8);
2594      HS_CMP_XCHG(r0_10, r0_12);
2595      HS_CMP_XCHG(r0_14, r0_16);
2596      HS_CMP_XCHG(r0_1, r0_2);
2597      HS_CMP_XCHG(r0_3, r0_4);
2598      HS_CMP_XCHG(r0_5, r0_6);
2599      HS_CMP_XCHG(r0_7, r0_8);
2600      HS_CMP_XCHG(r0_9, r0_10);
2601      HS_CMP_XCHG(r0_11, r0_12);
2602      HS_CMP_XCHG(r0_13, r0_14);
2603      HS_CMP_XCHG(r0_15, r0_16);
2604      HS_SLAB_LOCAL_L(0) = r0_1;
2605      HS_SLAB_LOCAL_L(16) = r0_2;
2606      HS_SLAB_LOCAL_L(32) = r0_3;
2607      HS_SLAB_LOCAL_L(48) = r0_4;
2608      HS_SLAB_LOCAL_L(64) = r0_5;
2609      HS_SLAB_LOCAL_L(80) = r0_6;
2610      HS_SLAB_LOCAL_L(96) = r0_7;
2611      HS_SLAB_LOCAL_L(112) = r0_8;
2612      HS_SLAB_LOCAL_L(128) = r0_9;
2613      HS_SLAB_LOCAL_L(144) = r0_10;
2614      HS_SLAB_LOCAL_L(160) = r0_11;
2615      HS_SLAB_LOCAL_L(176) = r0_12;
2616      HS_SLAB_LOCAL_L(192) = r0_13;
2617      HS_SLAB_LOCAL_L(208) = r0_14;
2618      HS_SLAB_LOCAL_L(224) = r0_15;
2619      HS_SLAB_LOCAL_L(240) = r0_16;
2620    }
2621  }
2622  HS_BLOCK_BARRIER();
2623  HS_KEY_TYPE r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0);
2624  HS_KEY_TYPE r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1);
2625  HS_KEY_TYPE r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2);
2626  HS_KEY_TYPE r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3);
2627  HS_KEY_TYPE r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4);
2628  HS_KEY_TYPE r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5);
2629  HS_KEY_TYPE r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6);
2630  HS_KEY_TYPE r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7);
2631  {
2632    {
2633      HS_SLAB_HALF_PREAMBLE(8);
2634      HS_CMP_HALF(0, r1);
2635      HS_CMP_HALF(1, r2);
2636      HS_CMP_HALF(2, r3);
2637      HS_CMP_HALF(3, r4);
2638      HS_CMP_HALF(4, r5);
2639      HS_CMP_HALF(5, r6);
2640      HS_CMP_HALF(6, r7);
2641      HS_CMP_HALF(7, r8);
2642    }
2643    {
2644      HS_SLAB_HALF_PREAMBLE(4);
2645      HS_CMP_HALF(0, r1);
2646      HS_CMP_HALF(1, r2);
2647      HS_CMP_HALF(2, r3);
2648      HS_CMP_HALF(3, r4);
2649      HS_CMP_HALF(4, r5);
2650      HS_CMP_HALF(5, r6);
2651      HS_CMP_HALF(6, r7);
2652      HS_CMP_HALF(7, r8);
2653    }
2654    {
2655      HS_SLAB_HALF_PREAMBLE(2);
2656      HS_CMP_HALF(0, r1);
2657      HS_CMP_HALF(1, r2);
2658      HS_CMP_HALF(2, r3);
2659      HS_CMP_HALF(3, r4);
2660      HS_CMP_HALF(4, r5);
2661      HS_CMP_HALF(5, r6);
2662      HS_CMP_HALF(6, r7);
2663      HS_CMP_HALF(7, r8);
2664    }
2665    {
2666      HS_SLAB_HALF_PREAMBLE(1);
2667      HS_CMP_HALF(0, r1);
2668      HS_CMP_HALF(1, r2);
2669      HS_CMP_HALF(2, r3);
2670      HS_CMP_HALF(3, r4);
2671      HS_CMP_HALF(4, r5);
2672      HS_CMP_HALF(5, r6);
2673      HS_CMP_HALF(6, r7);
2674      HS_CMP_HALF(7, r8);
2675    }
2676    HS_CMP_XCHG(r1, r5);
2677    HS_CMP_XCHG(r3, r7);
2678    HS_CMP_XCHG(r1, r3);
2679    HS_CMP_XCHG(r5, r7);
2680    HS_CMP_XCHG(r2, r6);
2681    HS_CMP_XCHG(r4, r8);
2682    HS_CMP_XCHG(r2, r4);
2683    HS_CMP_XCHG(r6, r8);
2684    HS_CMP_XCHG(r1, r2);
2685    HS_CMP_XCHG(r3, r4);
2686    HS_CMP_XCHG(r5, r6);
2687    HS_CMP_XCHG(r7, r8);
2688  }
2689  HS_SLAB_GLOBAL_STORE(0, r1);
2690  HS_SLAB_GLOBAL_STORE(1, r2);
2691  HS_SLAB_GLOBAL_STORE(2, r3);
2692  HS_SLAB_GLOBAL_STORE(3, r4);
2693  HS_SLAB_GLOBAL_STORE(4, r5);
2694  HS_SLAB_GLOBAL_STORE(5, r6);
2695  HS_SLAB_GLOBAL_STORE(6, r7);
2696  HS_SLAB_GLOBAL_STORE(7, r8);
2697}
2698
2699HS_FM_KERNEL_PROTO(0, 0)
2700{
2701  HS_FM_PREAMBLE(8);
2702  HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0);
2703  HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1);
2704  HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2);
2705  HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3);
2706  HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4);
2707  HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5);
2708  HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6);
2709  HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7);
2710  HS_KEY_TYPE r9 = HS_FM_GLOBAL_LOAD_R(0);
2711  HS_CMP_XCHG(r8, r9);
2712  HS_CMP_XCHG(r1, r5);
2713  HS_CMP_XCHG(r3, r7);
2714  HS_CMP_XCHG(r1, r3);
2715  HS_CMP_XCHG(r5, r7);
2716  HS_CMP_XCHG(r2, r6);
2717  HS_CMP_XCHG(r4, r8);
2718  HS_CMP_XCHG(r2, r4);
2719  HS_CMP_XCHG(r6, r8);
2720  HS_CMP_XCHG(r1, r2);
2721  HS_CMP_XCHG(r3, r4);
2722  HS_CMP_XCHG(r5, r6);
2723  HS_CMP_XCHG(r7, r8);
2724  HS_XM_GLOBAL_STORE_L(0, r1);
2725  HS_XM_GLOBAL_STORE_L(1, r2);
2726  HS_XM_GLOBAL_STORE_L(2, r3);
2727  HS_XM_GLOBAL_STORE_L(3, r4);
2728  HS_XM_GLOBAL_STORE_L(4, r5);
2729  HS_XM_GLOBAL_STORE_L(5, r6);
2730  HS_XM_GLOBAL_STORE_L(6, r7);
2731  HS_XM_GLOBAL_STORE_L(7, r8);
2732  HS_FM_GLOBAL_STORE_R(0, r9);
2733}
2734
2735HS_FM_KERNEL_PROTO(0, 1)
2736{
2737  HS_FM_PREAMBLE(8);
2738  HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0);
2739  HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1);
2740  HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2);
2741  HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3);
2742  HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4);
2743  HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5);
2744  HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6);
2745  HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7);
2746  HS_KEY_TYPE r9 = HS_FM_GLOBAL_LOAD_R(0);
2747  HS_KEY_TYPE r10 = HS_FM_GLOBAL_LOAD_R(1);
2748  HS_CMP_XCHG(r8, r9);
2749  HS_CMP_XCHG(r7, r10);
2750  HS_CMP_XCHG(r1, r5);
2751  HS_CMP_XCHG(r3, r7);
2752  HS_CMP_XCHG(r1, r3);
2753  HS_CMP_XCHG(r5, r7);
2754  HS_CMP_XCHG(r2, r6);
2755  HS_CMP_XCHG(r4, r8);
2756  HS_CMP_XCHG(r2, r4);
2757  HS_CMP_XCHG(r6, r8);
2758  HS_CMP_XCHG(r1, r2);
2759  HS_CMP_XCHG(r3, r4);
2760  HS_CMP_XCHG(r5, r6);
2761  HS_CMP_XCHG(r7, r8);
2762  HS_CMP_XCHG(r9, r10);
2763  HS_XM_GLOBAL_STORE_L(0, r1);
2764  HS_XM_GLOBAL_STORE_L(1, r2);
2765  HS_XM_GLOBAL_STORE_L(2, r3);
2766  HS_XM_GLOBAL_STORE_L(3, r4);
2767  HS_XM_GLOBAL_STORE_L(4, r5);
2768  HS_XM_GLOBAL_STORE_L(5, r6);
2769  HS_XM_GLOBAL_STORE_L(6, r7);
2770  HS_XM_GLOBAL_STORE_L(7, r8);
2771  HS_FM_GLOBAL_STORE_R(0, r9);
2772  HS_FM_GLOBAL_STORE_R(1, r10);
2773}
2774
2775HS_FM_KERNEL_PROTO(0, 2)
2776{
2777  HS_FM_PREAMBLE(8);
2778  HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0);
2779  HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1);
2780  HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2);
2781  HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3);
2782  HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4);
2783  HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5);
2784  HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6);
2785  HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7);
2786  HS_KEY_TYPE r9 = HS_FM_GLOBAL_LOAD_R(0);
2787  HS_KEY_TYPE r10 = HS_FM_GLOBAL_LOAD_R(1);
2788  HS_KEY_TYPE r11 = HS_FM_GLOBAL_LOAD_R(2);
2789  HS_KEY_TYPE r12 = HS_FM_GLOBAL_LOAD_R(3);
2790  HS_CMP_XCHG(r8, r9);
2791  HS_CMP_XCHG(r7, r10);
2792  HS_CMP_XCHG(r6, r11);
2793  HS_CMP_XCHG(r5, r12);
2794  HS_CMP_XCHG(r1, r5);
2795  HS_CMP_XCHG(r3, r7);
2796  HS_CMP_XCHG(r1, r3);
2797  HS_CMP_XCHG(r5, r7);
2798  HS_CMP_XCHG(r2, r6);
2799  HS_CMP_XCHG(r4, r8);
2800  HS_CMP_XCHG(r2, r4);
2801  HS_CMP_XCHG(r6, r8);
2802  HS_CMP_XCHG(r1, r2);
2803  HS_CMP_XCHG(r3, r4);
2804  HS_CMP_XCHG(r5, r6);
2805  HS_CMP_XCHG(r7, r8);
2806  HS_CMP_XCHG(r9, r11);
2807  HS_CMP_XCHG(r10, r12);
2808  HS_CMP_XCHG(r9, r10);
2809  HS_CMP_XCHG(r11, r12);
2810  HS_XM_GLOBAL_STORE_L(0, r1);
2811  HS_XM_GLOBAL_STORE_L(1, r2);
2812  HS_XM_GLOBAL_STORE_L(2, r3);
2813  HS_XM_GLOBAL_STORE_L(3, r4);
2814  HS_XM_GLOBAL_STORE_L(4, r5);
2815  HS_XM_GLOBAL_STORE_L(5, r6);
2816  HS_XM_GLOBAL_STORE_L(6, r7);
2817  HS_XM_GLOBAL_STORE_L(7, r8);
2818  HS_FM_GLOBAL_STORE_R(0, r9);
2819  HS_FM_GLOBAL_STORE_R(1, r10);
2820  HS_FM_GLOBAL_STORE_R(2, r11);
2821  HS_FM_GLOBAL_STORE_R(3, r12);
2822}
2823
2824HS_FM_KERNEL_PROTO(0, 3)
2825{
2826  HS_FM_PREAMBLE(8);
2827  HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0);
2828  HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1);
2829  HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2);
2830  HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3);
2831  HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4);
2832  HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5);
2833  HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6);
2834  HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7);
2835  HS_KEY_TYPE r9 = HS_FM_GLOBAL_LOAD_R(0);
2836  HS_KEY_TYPE r10 = HS_FM_GLOBAL_LOAD_R(1);
2837  HS_KEY_TYPE r11 = HS_FM_GLOBAL_LOAD_R(2);
2838  HS_KEY_TYPE r12 = HS_FM_GLOBAL_LOAD_R(3);
2839  HS_KEY_TYPE r13 = HS_FM_GLOBAL_LOAD_R(4);
2840  HS_KEY_TYPE r14 = HS_FM_GLOBAL_LOAD_R(5);
2841  HS_KEY_TYPE r15 = HS_FM_GLOBAL_LOAD_R(6);
2842  HS_KEY_TYPE r16 = HS_FM_GLOBAL_LOAD_R(7);
2843  HS_CMP_XCHG(r8, r9);
2844  HS_CMP_XCHG(r7, r10);
2845  HS_CMP_XCHG(r6, r11);
2846  HS_CMP_XCHG(r5, r12);
2847  HS_CMP_XCHG(r4, r13);
2848  HS_CMP_XCHG(r3, r14);
2849  HS_CMP_XCHG(r2, r15);
2850  HS_CMP_XCHG(r1, r16);
2851  HS_CMP_XCHG(r1, r5);
2852  HS_CMP_XCHG(r3, r7);
2853  HS_CMP_XCHG(r1, r3);
2854  HS_CMP_XCHG(r5, r7);
2855  HS_CMP_XCHG(r2, r6);
2856  HS_CMP_XCHG(r4, r8);
2857  HS_CMP_XCHG(r2, r4);
2858  HS_CMP_XCHG(r6, r8);
2859  HS_CMP_XCHG(r1, r2);
2860  HS_CMP_XCHG(r3, r4);
2861  HS_CMP_XCHG(r5, r6);
2862  HS_CMP_XCHG(r7, r8);
2863  HS_CMP_XCHG(r9, r13);
2864  HS_CMP_XCHG(r11, r15);
2865  HS_CMP_XCHG(r9, r11);
2866  HS_CMP_XCHG(r13, r15);
2867  HS_CMP_XCHG(r10, r14);
2868  HS_CMP_XCHG(r12, r16);
2869  HS_CMP_XCHG(r10, r12);
2870  HS_CMP_XCHG(r14, r16);
2871  HS_CMP_XCHG(r9, r10);
2872  HS_CMP_XCHG(r11, r12);
2873  HS_CMP_XCHG(r13, r14);
2874  HS_CMP_XCHG(r15, r16);
2875  HS_XM_GLOBAL_STORE_L(0, r1);
2876  HS_XM_GLOBAL_STORE_L(1, r2);
2877  HS_XM_GLOBAL_STORE_L(2, r3);
2878  HS_XM_GLOBAL_STORE_L(3, r4);
2879  HS_XM_GLOBAL_STORE_L(4, r5);
2880  HS_XM_GLOBAL_STORE_L(5, r6);
2881  HS_XM_GLOBAL_STORE_L(6, r7);
2882  HS_XM_GLOBAL_STORE_L(7, r8);
2883  HS_FM_GLOBAL_STORE_R(0, r9);
2884  HS_FM_GLOBAL_STORE_R(1, r10);
2885  HS_FM_GLOBAL_STORE_R(2, r11);
2886  HS_FM_GLOBAL_STORE_R(3, r12);
2887  HS_FM_GLOBAL_STORE_R(4, r13);
2888  HS_FM_GLOBAL_STORE_R(5, r14);
2889  HS_FM_GLOBAL_STORE_R(6, r15);
2890  HS_FM_GLOBAL_STORE_R(7, r16);
2891}
2892
2893HS_HM_KERNEL_PROTO(0)
2894{
2895  HS_HM_PREAMBLE(8);
2896  HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0);
2897  HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1);
2898  HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2);
2899  HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3);
2900  HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4);
2901  HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5);
2902  HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6);
2903  HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7);
2904  HS_KEY_TYPE r9 = HS_XM_GLOBAL_LOAD_L(8);
2905  HS_KEY_TYPE r10 = HS_XM_GLOBAL_LOAD_L(9);
2906  HS_KEY_TYPE r11 = HS_XM_GLOBAL_LOAD_L(10);
2907  HS_KEY_TYPE r12 = HS_XM_GLOBAL_LOAD_L(11);
2908  HS_KEY_TYPE r13 = HS_XM_GLOBAL_LOAD_L(12);
2909  HS_KEY_TYPE r14 = HS_XM_GLOBAL_LOAD_L(13);
2910  HS_KEY_TYPE r15 = HS_XM_GLOBAL_LOAD_L(14);
2911  HS_KEY_TYPE r16 = HS_XM_GLOBAL_LOAD_L(15);
2912  HS_CMP_XCHG(r1, r9);
2913  HS_CMP_XCHG(r5, r13);
2914  HS_CMP_XCHG(r1, r5);
2915  HS_CMP_XCHG(r9, r13);
2916  HS_CMP_XCHG(r3, r11);
2917  HS_CMP_XCHG(r7, r15);
2918  HS_CMP_XCHG(r3, r7);
2919  HS_CMP_XCHG(r11, r15);
2920  HS_CMP_XCHG(r1, r3);
2921  HS_CMP_XCHG(r5, r7);
2922  HS_CMP_XCHG(r9, r11);
2923  HS_CMP_XCHG(r13, r15);
2924  HS_CMP_XCHG(r2, r10);
2925  HS_CMP_XCHG(r6, r14);
2926  HS_CMP_XCHG(r2, r6);
2927  HS_CMP_XCHG(r10, r14);
2928  HS_CMP_XCHG(r4, r12);
2929  HS_CMP_XCHG(r8, r16);
2930  HS_CMP_XCHG(r4, r8);
2931  HS_CMP_XCHG(r12, r16);
2932  HS_CMP_XCHG(r2, r4);
2933  HS_CMP_XCHG(r6, r8);
2934  HS_CMP_XCHG(r10, r12);
2935  HS_CMP_XCHG(r14, r16);
2936  HS_CMP_XCHG(r1, r2);
2937  HS_CMP_XCHG(r3, r4);
2938  HS_CMP_XCHG(r5, r6);
2939  HS_CMP_XCHG(r7, r8);
2940  HS_CMP_XCHG(r9, r10);
2941  HS_CMP_XCHG(r11, r12);
2942  HS_CMP_XCHG(r13, r14);
2943  HS_CMP_XCHG(r15, r16);
2944  HS_XM_GLOBAL_STORE_L(0, r1);
2945  HS_XM_GLOBAL_STORE_L(1, r2);
2946  HS_XM_GLOBAL_STORE_L(2, r3);
2947  HS_XM_GLOBAL_STORE_L(3, r4);
2948  HS_XM_GLOBAL_STORE_L(4, r5);
2949  HS_XM_GLOBAL_STORE_L(5, r6);
2950  HS_XM_GLOBAL_STORE_L(6, r7);
2951  HS_XM_GLOBAL_STORE_L(7, r8);
2952  HS_XM_GLOBAL_STORE_L(8, r9);
2953  HS_XM_GLOBAL_STORE_L(9, r10);
2954  HS_XM_GLOBAL_STORE_L(10, r11);
2955  HS_XM_GLOBAL_STORE_L(11, r12);
2956  HS_XM_GLOBAL_STORE_L(12, r13);
2957  HS_XM_GLOBAL_STORE_L(13, r14);
2958  HS_XM_GLOBAL_STORE_L(14, r15);
2959  HS_XM_GLOBAL_STORE_L(15, r16);
2960}
2961
2962HS_TRANSPOSE_KERNEL_PROTO()
2963{
2964  HS_SLAB_GLOBAL_PREAMBLE();
2965  HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vout, 0);
2966  HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vout, 1);
2967  HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vout, 2);
2968  HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vout, 3);
2969  HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vout, 4);
2970  HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vout, 5);
2971  HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vout, 6);
2972  HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vout, 7);
2973  HS_TRANSPOSE_SLAB()
2974}
2975
2976//
2977//
2978//
2979