//
// Copyright 2016 Google Inc.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//
7
// target-specific config
9#include "hs_config.h"
10
// arch/target-specific macros
12#include "hs_cl_macros.h"
13
14//
15//
16//
17
//
// Kernel opened by HS_BS_KERNEL_PROTO(1, 0).
//
// What the visible code does:
//   1. loads 16 keys from `vin` (indices 0..15) into r1..r16,
//   2. applies a fixed, fully unrolled sequence of HS_CMP_XCHG steps to
//      those 16 variables, interleaved with HS_CMP_FLIP and HS_CMP_HALF
//      stages,
//   3. writes r1..r16 back to indices 0..15 with HS_SLAB_GLOBAL_STORE.
//
// This kernel uses no HS_BLOCK_LOCAL_MEM_DECL, HS_BX_LOCAL_V or
// HS_BLOCK_BARRIER.
//
// None of the HS_* macros are defined in the visible part of this file
// (presumably they come from hs_config.h / hs_cl_macros.h), so their exact
// semantics cannot be read off here.
// NOTE(review): presumably this sorts one "slab" of keys, with the FLIP and
// HALF stages exchanging keys between cooperating work-items -- confirm
// against the macro definitions.
// NOTE(review): the step order looks machine-generated; treat it as
// order-sensitive and regenerate rather than hand-editing.
// NOTE(review): every line of this listing starts with digits fused to the
// code (its own line number); this looks like an extraction artifact and
// has to be stripped before the file can compile.
//
18HS_BS_KERNEL_PROTO(1, 0)
19{
20  HS_SLAB_GLOBAL_PREAMBLE();
  // Load 16 keys from vin, indices 0..15, into r1..r16.
21  HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0);
22  HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1);
23  HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2);
24  HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3);
25  HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4);
26  HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5);
27  HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6);
28  HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7);
29  HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vin, 8);
30  HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vin, 9);
31  HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vin, 10);
32  HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vin, 11);
33  HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vin, 12);
34  HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vin, 13);
35  HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vin, 14);
36  HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vin, 15);
  // 60 compare-exchange steps over r1..r16.
  // NOTE(review): presumably a fixed 16-input sorting network that leaves
  // r1..r16 ordered -- confirm against the generator.
37  HS_CMP_XCHG(r1, r2);
38  HS_CMP_XCHG(r3, r4);
39  HS_CMP_XCHG(r5, r6);
40  HS_CMP_XCHG(r7, r8);
41  HS_CMP_XCHG(r9, r10);
42  HS_CMP_XCHG(r11, r12);
43  HS_CMP_XCHG(r13, r14);
44  HS_CMP_XCHG(r15, r16);
45  HS_CMP_XCHG(r1, r3);
46  HS_CMP_XCHG(r5, r7);
47  HS_CMP_XCHG(r9, r11);
48  HS_CMP_XCHG(r13, r15);
49  HS_CMP_XCHG(r2, r4);
50  HS_CMP_XCHG(r6, r8);
51  HS_CMP_XCHG(r10, r12);
52  HS_CMP_XCHG(r14, r16);
53  HS_CMP_XCHG(r1, r5);
54  HS_CMP_XCHG(r9, r13);
55  HS_CMP_XCHG(r2, r6);
56  HS_CMP_XCHG(r10, r14);
57  HS_CMP_XCHG(r3, r7);
58  HS_CMP_XCHG(r11, r15);
59  HS_CMP_XCHG(r4, r8);
60  HS_CMP_XCHG(r12, r16);
61  HS_CMP_XCHG(r1, r9);
62  HS_CMP_XCHG(r2, r10);
63  HS_CMP_XCHG(r3, r11);
64  HS_CMP_XCHG(r4, r12);
65  HS_CMP_XCHG(r5, r13);
66  HS_CMP_XCHG(r6, r14);
67  HS_CMP_XCHG(r7, r15);
68  HS_CMP_XCHG(r8, r16);
69  HS_CMP_XCHG(r6, r11);
70  HS_CMP_XCHG(r7, r10);
71  HS_CMP_XCHG(r4, r13);
72  HS_CMP_XCHG(r14, r15);
73  HS_CMP_XCHG(r8, r12);
74  HS_CMP_XCHG(r2, r3);
75  HS_CMP_XCHG(r5, r9);
76  HS_CMP_XCHG(r2, r5);
77  HS_CMP_XCHG(r8, r14);
78  HS_CMP_XCHG(r3, r9);
79  HS_CMP_XCHG(r12, r15);
80  HS_CMP_XCHG(r3, r5);
81  HS_CMP_XCHG(r6, r7);
82  HS_CMP_XCHG(r10, r11);
83  HS_CMP_XCHG(r12, r14);
84  HS_CMP_XCHG(r4, r9);
85  HS_CMP_XCHG(r8, r13);
86  HS_CMP_XCHG(r7, r9);
87  HS_CMP_XCHG(r11, r13);
88  HS_CMP_XCHG(r4, r6);
89  HS_CMP_XCHG(r8, r10);
90  HS_CMP_XCHG(r4, r5);
91  HS_CMP_XCHG(r6, r7);
92  HS_CMP_XCHG(r8, r9);
93  HS_CMP_XCHG(r10, r11);
94  HS_CMP_XCHG(r12, r13);
95  HS_CMP_XCHG(r7, r8);
96  HS_CMP_XCHG(r9, r10);
  // Flip stage, preamble argument 1. Each HS_CMP_FLIP names a mirrored pair
  // of variables: r1/r16, r2/r15, ... r8/r9.
  // NOTE(review): presumably merges this work-item's keys with a partner's;
  // the meaning of the preamble argument is not visible here -- confirm.
97  {
98    HS_SLAB_FLIP_PREAMBLE(1);
99    HS_CMP_FLIP(0, r1, r16);
100    HS_CMP_FLIP(1, r2, r15);
101    HS_CMP_FLIP(2, r3, r14);
102    HS_CMP_FLIP(3, r4, r13);
103    HS_CMP_FLIP(4, r5, r12);
104    HS_CMP_FLIP(5, r6, r11);
105    HS_CMP_FLIP(6, r7, r10);
106    HS_CMP_FLIP(7, r8, r9);
107  }
  // 32 compare-exchange steps: the odd-numbered variables at distances
  // 8, 4 and 2, then the even-numbered variables at distances 8, 4 and 2,
  // then the eight adjacent pairs (r1,r2) .. (r15,r16).
108  HS_CMP_XCHG(r1, r9);
109  HS_CMP_XCHG(r5, r13);
110  HS_CMP_XCHG(r1, r5);
111  HS_CMP_XCHG(r9, r13);
112  HS_CMP_XCHG(r3, r11);
113  HS_CMP_XCHG(r7, r15);
114  HS_CMP_XCHG(r3, r7);
115  HS_CMP_XCHG(r11, r15);
116  HS_CMP_XCHG(r1, r3);
117  HS_CMP_XCHG(r5, r7);
118  HS_CMP_XCHG(r9, r11);
119  HS_CMP_XCHG(r13, r15);
120  HS_CMP_XCHG(r2, r10);
121  HS_CMP_XCHG(r6, r14);
122  HS_CMP_XCHG(r2, r6);
123  HS_CMP_XCHG(r10, r14);
124  HS_CMP_XCHG(r4, r12);
125  HS_CMP_XCHG(r8, r16);
126  HS_CMP_XCHG(r4, r8);
127  HS_CMP_XCHG(r12, r16);
128  HS_CMP_XCHG(r2, r4);
129  HS_CMP_XCHG(r6, r8);
130  HS_CMP_XCHG(r10, r12);
131  HS_CMP_XCHG(r14, r16);
132  HS_CMP_XCHG(r1, r2);
133  HS_CMP_XCHG(r3, r4);
134  HS_CMP_XCHG(r5, r6);
135  HS_CMP_XCHG(r7, r8);
136  HS_CMP_XCHG(r9, r10);
137  HS_CMP_XCHG(r11, r12);
138  HS_CMP_XCHG(r13, r14);
139  HS_CMP_XCHG(r15, r16);
  // Flip stage, preamble argument 3; same mirrored variable pairs as before.
140  {
141    HS_SLAB_FLIP_PREAMBLE(3);
142    HS_CMP_FLIP(0, r1, r16);
143    HS_CMP_FLIP(1, r2, r15);
144    HS_CMP_FLIP(2, r3, r14);
145    HS_CMP_FLIP(3, r4, r13);
146    HS_CMP_FLIP(4, r5, r12);
147    HS_CMP_FLIP(5, r6, r11);
148    HS_CMP_FLIP(6, r7, r10);
149    HS_CMP_FLIP(7, r8, r9);
150  }
  // Half stage, preamble argument 1; HS_CMP_HALF is applied to each of
  // r1..r16 in turn with index 0..15.
  // NOTE(review): presumably a compare-exchange against a partner
  // work-item at the given distance -- confirm in the macro header.
151  {
152    HS_SLAB_HALF_PREAMBLE(1);
153    HS_CMP_HALF(0, r1);
154    HS_CMP_HALF(1, r2);
155    HS_CMP_HALF(2, r3);
156    HS_CMP_HALF(3, r4);
157    HS_CMP_HALF(4, r5);
158    HS_CMP_HALF(5, r6);
159    HS_CMP_HALF(6, r7);
160    HS_CMP_HALF(7, r8);
161    HS_CMP_HALF(8, r9);
162    HS_CMP_HALF(9, r10);
163    HS_CMP_HALF(10, r11);
164    HS_CMP_HALF(11, r12);
165    HS_CMP_HALF(12, r13);
166    HS_CMP_HALF(13, r14);
167    HS_CMP_HALF(14, r15);
168    HS_CMP_HALF(15, r16);
169  }
  // Same 32-step sequence as after the first flip stage
  // (distances 8/4/2 on odd, then even variables, then adjacent pairs).
170  HS_CMP_XCHG(r1, r9);
171  HS_CMP_XCHG(r5, r13);
172  HS_CMP_XCHG(r1, r5);
173  HS_CMP_XCHG(r9, r13);
174  HS_CMP_XCHG(r3, r11);
175  HS_CMP_XCHG(r7, r15);
176  HS_CMP_XCHG(r3, r7);
177  HS_CMP_XCHG(r11, r15);
178  HS_CMP_XCHG(r1, r3);
179  HS_CMP_XCHG(r5, r7);
180  HS_CMP_XCHG(r9, r11);
181  HS_CMP_XCHG(r13, r15);
182  HS_CMP_XCHG(r2, r10);
183  HS_CMP_XCHG(r6, r14);
184  HS_CMP_XCHG(r2, r6);
185  HS_CMP_XCHG(r10, r14);
186  HS_CMP_XCHG(r4, r12);
187  HS_CMP_XCHG(r8, r16);
188  HS_CMP_XCHG(r4, r8);
189  HS_CMP_XCHG(r12, r16);
190  HS_CMP_XCHG(r2, r4);
191  HS_CMP_XCHG(r6, r8);
192  HS_CMP_XCHG(r10, r12);
193  HS_CMP_XCHG(r14, r16);
194  HS_CMP_XCHG(r1, r2);
195  HS_CMP_XCHG(r3, r4);
196  HS_CMP_XCHG(r5, r6);
197  HS_CMP_XCHG(r7, r8);
198  HS_CMP_XCHG(r9, r10);
199  HS_CMP_XCHG(r11, r12);
200  HS_CMP_XCHG(r13, r14);
201  HS_CMP_XCHG(r15, r16);
  // Flip stage, preamble argument 7, followed by half stages with
  // preamble arguments 2 and then 1.
202  {
203    HS_SLAB_FLIP_PREAMBLE(7);
204    HS_CMP_FLIP(0, r1, r16);
205    HS_CMP_FLIP(1, r2, r15);
206    HS_CMP_FLIP(2, r3, r14);
207    HS_CMP_FLIP(3, r4, r13);
208    HS_CMP_FLIP(4, r5, r12);
209    HS_CMP_FLIP(5, r6, r11);
210    HS_CMP_FLIP(6, r7, r10);
211    HS_CMP_FLIP(7, r8, r9);
212  }
213  {
214    HS_SLAB_HALF_PREAMBLE(2);
215    HS_CMP_HALF(0, r1);
216    HS_CMP_HALF(1, r2);
217    HS_CMP_HALF(2, r3);
218    HS_CMP_HALF(3, r4);
219    HS_CMP_HALF(4, r5);
220    HS_CMP_HALF(5, r6);
221    HS_CMP_HALF(6, r7);
222    HS_CMP_HALF(7, r8);
223    HS_CMP_HALF(8, r9);
224    HS_CMP_HALF(9, r10);
225    HS_CMP_HALF(10, r11);
226    HS_CMP_HALF(11, r12);
227    HS_CMP_HALF(12, r13);
228    HS_CMP_HALF(13, r14);
229    HS_CMP_HALF(14, r15);
230    HS_CMP_HALF(15, r16);
231  }
232  {
233    HS_SLAB_HALF_PREAMBLE(1);
234    HS_CMP_HALF(0, r1);
235    HS_CMP_HALF(1, r2);
236    HS_CMP_HALF(2, r3);
237    HS_CMP_HALF(3, r4);
238    HS_CMP_HALF(4, r5);
239    HS_CMP_HALF(5, r6);
240    HS_CMP_HALF(6, r7);
241    HS_CMP_HALF(7, r8);
242    HS_CMP_HALF(8, r9);
243    HS_CMP_HALF(9, r10);
244    HS_CMP_HALF(10, r11);
245    HS_CMP_HALF(11, r12);
246    HS_CMP_HALF(12, r13);
247    HS_CMP_HALF(13, r14);
248    HS_CMP_HALF(14, r15);
249    HS_CMP_HALF(15, r16);
250  }
  // Final pass of the same 32-step sequence before the keys are stored.
251  HS_CMP_XCHG(r1, r9);
252  HS_CMP_XCHG(r5, r13);
253  HS_CMP_XCHG(r1, r5);
254  HS_CMP_XCHG(r9, r13);
255  HS_CMP_XCHG(r3, r11);
256  HS_CMP_XCHG(r7, r15);
257  HS_CMP_XCHG(r3, r7);
258  HS_CMP_XCHG(r11, r15);
259  HS_CMP_XCHG(r1, r3);
260  HS_CMP_XCHG(r5, r7);
261  HS_CMP_XCHG(r9, r11);
262  HS_CMP_XCHG(r13, r15);
263  HS_CMP_XCHG(r2, r10);
264  HS_CMP_XCHG(r6, r14);
265  HS_CMP_XCHG(r2, r6);
266  HS_CMP_XCHG(r10, r14);
267  HS_CMP_XCHG(r4, r12);
268  HS_CMP_XCHG(r8, r16);
269  HS_CMP_XCHG(r4, r8);
270  HS_CMP_XCHG(r12, r16);
271  HS_CMP_XCHG(r2, r4);
272  HS_CMP_XCHG(r6, r8);
273  HS_CMP_XCHG(r10, r12);
274  HS_CMP_XCHG(r14, r16);
275  HS_CMP_XCHG(r1, r2);
276  HS_CMP_XCHG(r3, r4);
277  HS_CMP_XCHG(r5, r6);
278  HS_CMP_XCHG(r7, r8);
279  HS_CMP_XCHG(r9, r10);
280  HS_CMP_XCHG(r11, r12);
281  HS_CMP_XCHG(r13, r14);
282  HS_CMP_XCHG(r15, r16);
  // Write r1..r16 back out at indices 0..15, mirroring the loads above.
283  HS_SLAB_GLOBAL_STORE(0, r1);
284  HS_SLAB_GLOBAL_STORE(1, r2);
285  HS_SLAB_GLOBAL_STORE(2, r3);
286  HS_SLAB_GLOBAL_STORE(3, r4);
287  HS_SLAB_GLOBAL_STORE(4, r5);
288  HS_SLAB_GLOBAL_STORE(5, r6);
289  HS_SLAB_GLOBAL_STORE(6, r7);
290  HS_SLAB_GLOBAL_STORE(7, r8);
291  HS_SLAB_GLOBAL_STORE(8, r9);
292  HS_SLAB_GLOBAL_STORE(9, r10);
293  HS_SLAB_GLOBAL_STORE(10, r11);
294  HS_SLAB_GLOBAL_STORE(11, r12);
295  HS_SLAB_GLOBAL_STORE(12, r13);
296  HS_SLAB_GLOBAL_STORE(13, r14);
297  HS_SLAB_GLOBAL_STORE(14, r15);
298  HS_SLAB_GLOBAL_STORE(15, r16);
299}
300
//
// Kernel opened by HS_BS_KERNEL_PROTO(2, 1).
//
// What the visible code does:
//   1. declares block-local storage with HS_BLOCK_LOCAL_MEM_DECL(16, 16),
//   2. loads 16 keys from `vin` (indices 0..15) into r1..r16 and runs a
//      fixed compare-exchange / flip / half sequence over r1..r16,
//   3. writes r1..r16 through HS_BX_LOCAL_V, runs eight compare-exchange
//      steps on HS_SLAB_LOCAL_L / HS_SLAB_LOCAL_R locations between two
//      HS_BLOCK_BARRIER() calls, and reads r1..r16 back,
//   4. runs half stages (preamble arguments 4, 2, 1) plus a final 32-step
//      compare-exchange sequence, then stores r1..r16 to indices 0..15.
//
// None of the HS_* macros are defined in the visible part of this file
// (presumably they come from hs_config.h / hs_cl_macros.h), so their exact
// semantics cannot be read off here.
// NOTE(review): presumably each slab is sorted first and step 3 then merges
// two slabs of the block through local memory -- confirm against the macro
// definitions and the generator.
// NOTE(review): the step order looks machine-generated; treat it as
// order-sensitive and regenerate rather than hand-editing.
// NOTE(review): every line of this listing starts with digits fused to the
// code (its own line number); this looks like an extraction artifact and
// has to be stripped before the file can compile.
//
301HS_BS_KERNEL_PROTO(2, 1)
302{
  // NOTE(review): meaning of the two arguments (16, 16) is not visible
  // here -- confirm in the macro header.
303  HS_BLOCK_LOCAL_MEM_DECL(16, 16);
304
305  HS_SLAB_GLOBAL_PREAMBLE();
  // Load 16 keys from vin, indices 0..15, into r1..r16.
306  HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0);
307  HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1);
308  HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2);
309  HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3);
310  HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4);
311  HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5);
312  HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6);
313  HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7);
314  HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vin, 8);
315  HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vin, 9);
316  HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vin, 10);
317  HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vin, 11);
318  HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vin, 12);
319  HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vin, 13);
320  HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vin, 14);
321  HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vin, 15);
  // 60 compare-exchange steps over r1..r16.
  // NOTE(review): presumably a fixed 16-input sorting network that leaves
  // r1..r16 ordered -- confirm against the generator.
322  HS_CMP_XCHG(r1, r2);
323  HS_CMP_XCHG(r3, r4);
324  HS_CMP_XCHG(r5, r6);
325  HS_CMP_XCHG(r7, r8);
326  HS_CMP_XCHG(r9, r10);
327  HS_CMP_XCHG(r11, r12);
328  HS_CMP_XCHG(r13, r14);
329  HS_CMP_XCHG(r15, r16);
330  HS_CMP_XCHG(r1, r3);
331  HS_CMP_XCHG(r5, r7);
332  HS_CMP_XCHG(r9, r11);
333  HS_CMP_XCHG(r13, r15);
334  HS_CMP_XCHG(r2, r4);
335  HS_CMP_XCHG(r6, r8);
336  HS_CMP_XCHG(r10, r12);
337  HS_CMP_XCHG(r14, r16);
338  HS_CMP_XCHG(r1, r5);
339  HS_CMP_XCHG(r9, r13);
340  HS_CMP_XCHG(r2, r6);
341  HS_CMP_XCHG(r10, r14);
342  HS_CMP_XCHG(r3, r7);
343  HS_CMP_XCHG(r11, r15);
344  HS_CMP_XCHG(r4, r8);
345  HS_CMP_XCHG(r12, r16);
346  HS_CMP_XCHG(r1, r9);
347  HS_CMP_XCHG(r2, r10);
348  HS_CMP_XCHG(r3, r11);
349  HS_CMP_XCHG(r4, r12);
350  HS_CMP_XCHG(r5, r13);
351  HS_CMP_XCHG(r6, r14);
352  HS_CMP_XCHG(r7, r15);
353  HS_CMP_XCHG(r8, r16);
354  HS_CMP_XCHG(r6, r11);
355  HS_CMP_XCHG(r7, r10);
356  HS_CMP_XCHG(r4, r13);
357  HS_CMP_XCHG(r14, r15);
358  HS_CMP_XCHG(r8, r12);
359  HS_CMP_XCHG(r2, r3);
360  HS_CMP_XCHG(r5, r9);
361  HS_CMP_XCHG(r2, r5);
362  HS_CMP_XCHG(r8, r14);
363  HS_CMP_XCHG(r3, r9);
364  HS_CMP_XCHG(r12, r15);
365  HS_CMP_XCHG(r3, r5);
366  HS_CMP_XCHG(r6, r7);
367  HS_CMP_XCHG(r10, r11);
368  HS_CMP_XCHG(r12, r14);
369  HS_CMP_XCHG(r4, r9);
370  HS_CMP_XCHG(r8, r13);
371  HS_CMP_XCHG(r7, r9);
372  HS_CMP_XCHG(r11, r13);
373  HS_CMP_XCHG(r4, r6);
374  HS_CMP_XCHG(r8, r10);
375  HS_CMP_XCHG(r4, r5);
376  HS_CMP_XCHG(r6, r7);
377  HS_CMP_XCHG(r8, r9);
378  HS_CMP_XCHG(r10, r11);
379  HS_CMP_XCHG(r12, r13);
380  HS_CMP_XCHG(r7, r8);
381  HS_CMP_XCHG(r9, r10);
  // Flip stage, preamble argument 1. Each HS_CMP_FLIP names a mirrored pair
  // of variables: r1/r16, r2/r15, ... r8/r9.
  // NOTE(review): presumably merges this work-item's keys with a partner's;
  // the meaning of the preamble argument is not visible here -- confirm.
382  {
383    HS_SLAB_FLIP_PREAMBLE(1);
384    HS_CMP_FLIP(0, r1, r16);
385    HS_CMP_FLIP(1, r2, r15);
386    HS_CMP_FLIP(2, r3, r14);
387    HS_CMP_FLIP(3, r4, r13);
388    HS_CMP_FLIP(4, r5, r12);
389    HS_CMP_FLIP(5, r6, r11);
390    HS_CMP_FLIP(6, r7, r10);
391    HS_CMP_FLIP(7, r8, r9);
392  }
  // 32 compare-exchange steps: the odd-numbered variables at distances
  // 8, 4 and 2, then the even-numbered variables at distances 8, 4 and 2,
  // then the eight adjacent pairs (r1,r2) .. (r15,r16).
393  HS_CMP_XCHG(r1, r9);
394  HS_CMP_XCHG(r5, r13);
395  HS_CMP_XCHG(r1, r5);
396  HS_CMP_XCHG(r9, r13);
397  HS_CMP_XCHG(r3, r11);
398  HS_CMP_XCHG(r7, r15);
399  HS_CMP_XCHG(r3, r7);
400  HS_CMP_XCHG(r11, r15);
401  HS_CMP_XCHG(r1, r3);
402  HS_CMP_XCHG(r5, r7);
403  HS_CMP_XCHG(r9, r11);
404  HS_CMP_XCHG(r13, r15);
405  HS_CMP_XCHG(r2, r10);
406  HS_CMP_XCHG(r6, r14);
407  HS_CMP_XCHG(r2, r6);
408  HS_CMP_XCHG(r10, r14);
409  HS_CMP_XCHG(r4, r12);
410  HS_CMP_XCHG(r8, r16);
411  HS_CMP_XCHG(r4, r8);
412  HS_CMP_XCHG(r12, r16);
413  HS_CMP_XCHG(r2, r4);
414  HS_CMP_XCHG(r6, r8);
415  HS_CMP_XCHG(r10, r12);
416  HS_CMP_XCHG(r14, r16);
417  HS_CMP_XCHG(r1, r2);
418  HS_CMP_XCHG(r3, r4);
419  HS_CMP_XCHG(r5, r6);
420  HS_CMP_XCHG(r7, r8);
421  HS_CMP_XCHG(r9, r10);
422  HS_CMP_XCHG(r11, r12);
423  HS_CMP_XCHG(r13, r14);
424  HS_CMP_XCHG(r15, r16);
  // Flip stage, preamble argument 3; same mirrored variable pairs as before.
425  {
426    HS_SLAB_FLIP_PREAMBLE(3);
427    HS_CMP_FLIP(0, r1, r16);
428    HS_CMP_FLIP(1, r2, r15);
429    HS_CMP_FLIP(2, r3, r14);
430    HS_CMP_FLIP(3, r4, r13);
431    HS_CMP_FLIP(4, r5, r12);
432    HS_CMP_FLIP(5, r6, r11);
433    HS_CMP_FLIP(6, r7, r10);
434    HS_CMP_FLIP(7, r8, r9);
435  }
  // Half stage, preamble argument 1; HS_CMP_HALF is applied to each of
  // r1..r16 in turn with index 0..15.
  // NOTE(review): presumably a compare-exchange against a partner
  // work-item at the given distance -- confirm in the macro header.
436  {
437    HS_SLAB_HALF_PREAMBLE(1);
438    HS_CMP_HALF(0, r1);
439    HS_CMP_HALF(1, r2);
440    HS_CMP_HALF(2, r3);
441    HS_CMP_HALF(3, r4);
442    HS_CMP_HALF(4, r5);
443    HS_CMP_HALF(5, r6);
444    HS_CMP_HALF(6, r7);
445    HS_CMP_HALF(7, r8);
446    HS_CMP_HALF(8, r9);
447    HS_CMP_HALF(9, r10);
448    HS_CMP_HALF(10, r11);
449    HS_CMP_HALF(11, r12);
450    HS_CMP_HALF(12, r13);
451    HS_CMP_HALF(13, r14);
452    HS_CMP_HALF(14, r15);
453    HS_CMP_HALF(15, r16);
454  }
  // Same 32-step sequence as after the first flip stage
  // (distances 8/4/2 on odd, then even variables, then adjacent pairs).
455  HS_CMP_XCHG(r1, r9);
456  HS_CMP_XCHG(r5, r13);
457  HS_CMP_XCHG(r1, r5);
458  HS_CMP_XCHG(r9, r13);
459  HS_CMP_XCHG(r3, r11);
460  HS_CMP_XCHG(r7, r15);
461  HS_CMP_XCHG(r3, r7);
462  HS_CMP_XCHG(r11, r15);
463  HS_CMP_XCHG(r1, r3);
464  HS_CMP_XCHG(r5, r7);
465  HS_CMP_XCHG(r9, r11);
466  HS_CMP_XCHG(r13, r15);
467  HS_CMP_XCHG(r2, r10);
468  HS_CMP_XCHG(r6, r14);
469  HS_CMP_XCHG(r2, r6);
470  HS_CMP_XCHG(r10, r14);
471  HS_CMP_XCHG(r4, r12);
472  HS_CMP_XCHG(r8, r16);
473  HS_CMP_XCHG(r4, r8);
474  HS_CMP_XCHG(r12, r16);
475  HS_CMP_XCHG(r2, r4);
476  HS_CMP_XCHG(r6, r8);
477  HS_CMP_XCHG(r10, r12);
478  HS_CMP_XCHG(r14, r16);
479  HS_CMP_XCHG(r1, r2);
480  HS_CMP_XCHG(r3, r4);
481  HS_CMP_XCHG(r5, r6);
482  HS_CMP_XCHG(r7, r8);
483  HS_CMP_XCHG(r9, r10);
484  HS_CMP_XCHG(r11, r12);
485  HS_CMP_XCHG(r13, r14);
486  HS_CMP_XCHG(r15, r16);
  // Flip stage, preamble argument 7, followed by half stages with
  // preamble arguments 2 and then 1.
487  {
488    HS_SLAB_FLIP_PREAMBLE(7);
489    HS_CMP_FLIP(0, r1, r16);
490    HS_CMP_FLIP(1, r2, r15);
491    HS_CMP_FLIP(2, r3, r14);
492    HS_CMP_FLIP(3, r4, r13);
493    HS_CMP_FLIP(4, r5, r12);
494    HS_CMP_FLIP(5, r6, r11);
495    HS_CMP_FLIP(6, r7, r10);
496    HS_CMP_FLIP(7, r8, r9);
497  }
498  {
499    HS_SLAB_HALF_PREAMBLE(2);
500    HS_CMP_HALF(0, r1);
501    HS_CMP_HALF(1, r2);
502    HS_CMP_HALF(2, r3);
503    HS_CMP_HALF(3, r4);
504    HS_CMP_HALF(4, r5);
505    HS_CMP_HALF(5, r6);
506    HS_CMP_HALF(6, r7);
507    HS_CMP_HALF(7, r8);
508    HS_CMP_HALF(8, r9);
509    HS_CMP_HALF(9, r10);
510    HS_CMP_HALF(10, r11);
511    HS_CMP_HALF(11, r12);
512    HS_CMP_HALF(12, r13);
513    HS_CMP_HALF(13, r14);
514    HS_CMP_HALF(14, r15);
515    HS_CMP_HALF(15, r16);
516  }
517  {
518    HS_SLAB_HALF_PREAMBLE(1);
519    HS_CMP_HALF(0, r1);
520    HS_CMP_HALF(1, r2);
521    HS_CMP_HALF(2, r3);
522    HS_CMP_HALF(3, r4);
523    HS_CMP_HALF(4, r5);
524    HS_CMP_HALF(5, r6);
525    HS_CMP_HALF(6, r7);
526    HS_CMP_HALF(7, r8);
527    HS_CMP_HALF(8, r9);
528    HS_CMP_HALF(9, r10);
529    HS_CMP_HALF(10, r11);
530    HS_CMP_HALF(11, r12);
531    HS_CMP_HALF(12, r13);
532    HS_CMP_HALF(13, r14);
533    HS_CMP_HALF(14, r15);
534    HS_CMP_HALF(15, r16);
535  }
  // Same 32-step sequence once more, before the local-memory exchange.
536  HS_CMP_XCHG(r1, r9);
537  HS_CMP_XCHG(r5, r13);
538  HS_CMP_XCHG(r1, r5);
539  HS_CMP_XCHG(r9, r13);
540  HS_CMP_XCHG(r3, r11);
541  HS_CMP_XCHG(r7, r15);
542  HS_CMP_XCHG(r3, r7);
543  HS_CMP_XCHG(r11, r15);
544  HS_CMP_XCHG(r1, r3);
545  HS_CMP_XCHG(r5, r7);
546  HS_CMP_XCHG(r9, r11);
547  HS_CMP_XCHG(r13, r15);
548  HS_CMP_XCHG(r2, r10);
549  HS_CMP_XCHG(r6, r14);
550  HS_CMP_XCHG(r2, r6);
551  HS_CMP_XCHG(r10, r14);
552  HS_CMP_XCHG(r4, r12);
553  HS_CMP_XCHG(r8, r16);
554  HS_CMP_XCHG(r4, r8);
555  HS_CMP_XCHG(r12, r16);
556  HS_CMP_XCHG(r2, r4);
557  HS_CMP_XCHG(r6, r8);
558  HS_CMP_XCHG(r10, r12);
559  HS_CMP_XCHG(r14, r16);
560  HS_CMP_XCHG(r1, r2);
561  HS_CMP_XCHG(r3, r4);
562  HS_CMP_XCHG(r5, r6);
563  HS_CMP_XCHG(r7, r8);
564  HS_CMP_XCHG(r9, r10);
565  HS_CMP_XCHG(r11, r12);
566  HS_CMP_XCHG(r13, r14);
567  HS_CMP_XCHG(r15, r16);
  // Write the 16 keys through HS_BX_LOCAL_V at offsets
  // 2 * HS_SLAB_THREADS * k, k = 0..15. The variables are written in the
  // interleaved order r1, r16, r2, r15, ... r8, r9 rather than r1..r16.
568  HS_BS_MERGE_H_PREAMBLE(2);
569  HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 0) = r1;
570  HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 1) = r16;
571  HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 2) = r2;
572  HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 3) = r15;
573  HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 4) = r3;
574  HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 5) = r14;
575  HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 6) = r4;
576  HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 7) = r13;
577  HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 8) = r5;
578  HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 9) = r12;
579  HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 10) = r6;
580  HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 11) = r11;
581  HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 12) = r7;
582  HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 13) = r10;
583  HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 14) = r8;
584  HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 15) = r9;
  // Barrier between the writes above and the reads below.
585  HS_BLOCK_BARRIER();
  // Eight compare-exchange steps on local locations: each reads
  // HS_SLAB_LOCAL_L(b) and HS_SLAB_LOCAL_R(b + 8), compare-exchanges the
  // two values and writes them back to the same two locations, for
  // b = 0, 32, 64, ... 224.
  // NOTE(review): presumably the cross-slab merge step; how the L/R
  // macros map these offsets to addresses is not visible here -- confirm.
586  {
587    {
588      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
589      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(8);
590      HS_CMP_XCHG(r0_1, r0_2);
591      HS_SLAB_LOCAL_L(0) = r0_1;
592      HS_SLAB_LOCAL_R(8) = r0_2;
593    }
594    {
595      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(32);
596      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(40);
597      HS_CMP_XCHG(r0_1, r0_2);
598      HS_SLAB_LOCAL_L(32) = r0_1;
599      HS_SLAB_LOCAL_R(40) = r0_2;
600    }
601    {
602      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(64);
603      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(72);
604      HS_CMP_XCHG(r0_1, r0_2);
605      HS_SLAB_LOCAL_L(64) = r0_1;
606      HS_SLAB_LOCAL_R(72) = r0_2;
607    }
608    {
609      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(96);
610      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(104);
611      HS_CMP_XCHG(r0_1, r0_2);
612      HS_SLAB_LOCAL_L(96) = r0_1;
613      HS_SLAB_LOCAL_R(104) = r0_2;
614    }
615    {
616      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(128);
617      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(136);
618      HS_CMP_XCHG(r0_1, r0_2);
619      HS_SLAB_LOCAL_L(128) = r0_1;
620      HS_SLAB_LOCAL_R(136) = r0_2;
621    }
622    {
623      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(160);
624      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(168);
625      HS_CMP_XCHG(r0_1, r0_2);
626      HS_SLAB_LOCAL_L(160) = r0_1;
627      HS_SLAB_LOCAL_R(168) = r0_2;
628    }
629    {
630      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(192);
631      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(200);
632      HS_CMP_XCHG(r0_1, r0_2);
633      HS_SLAB_LOCAL_L(192) = r0_1;
634      HS_SLAB_LOCAL_R(200) = r0_2;
635    }
636    {
637      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(224);
638      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(232);
639      HS_CMP_XCHG(r0_1, r0_2);
640      HS_SLAB_LOCAL_L(224) = r0_1;
641      HS_SLAB_LOCAL_R(232) = r0_2;
642    }
643  }
  // Second barrier, then read the keys back through HS_BX_LOCAL_V in the
  // same interleaved order (r1, r16, r2, r15, ...) they were written in.
644  HS_BLOCK_BARRIER();
645  r1 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 0);
646  r16 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 1);
647  r2 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 2);
648  r15 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 3);
649  r3 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 4);
650  r14 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 5);
651  r4 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 6);
652  r13 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 7);
653  r5 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 8);
654  r12 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 9);
655  r6 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 10);
656  r11 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 11);
657  r7 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 12);
658  r10 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 13);
659  r8 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 14);
660  r9 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 15);
  // Half stages with preamble arguments 4, 2 and 1 (no flip stage here),
  // followed by the 32-step compare-exchange sequence on r1..r16.
661  {
662    {
663      HS_SLAB_HALF_PREAMBLE(4);
664      HS_CMP_HALF(0, r1);
665      HS_CMP_HALF(1, r2);
666      HS_CMP_HALF(2, r3);
667      HS_CMP_HALF(3, r4);
668      HS_CMP_HALF(4, r5);
669      HS_CMP_HALF(5, r6);
670      HS_CMP_HALF(6, r7);
671      HS_CMP_HALF(7, r8);
672      HS_CMP_HALF(8, r9);
673      HS_CMP_HALF(9, r10);
674      HS_CMP_HALF(10, r11);
675      HS_CMP_HALF(11, r12);
676      HS_CMP_HALF(12, r13);
677      HS_CMP_HALF(13, r14);
678      HS_CMP_HALF(14, r15);
679      HS_CMP_HALF(15, r16);
680    }
681    {
682      HS_SLAB_HALF_PREAMBLE(2);
683      HS_CMP_HALF(0, r1);
684      HS_CMP_HALF(1, r2);
685      HS_CMP_HALF(2, r3);
686      HS_CMP_HALF(3, r4);
687      HS_CMP_HALF(4, r5);
688      HS_CMP_HALF(5, r6);
689      HS_CMP_HALF(6, r7);
690      HS_CMP_HALF(7, r8);
691      HS_CMP_HALF(8, r9);
692      HS_CMP_HALF(9, r10);
693      HS_CMP_HALF(10, r11);
694      HS_CMP_HALF(11, r12);
695      HS_CMP_HALF(12, r13);
696      HS_CMP_HALF(13, r14);
697      HS_CMP_HALF(14, r15);
698      HS_CMP_HALF(15, r16);
699    }
700    {
701      HS_SLAB_HALF_PREAMBLE(1);
702      HS_CMP_HALF(0, r1);
703      HS_CMP_HALF(1, r2);
704      HS_CMP_HALF(2, r3);
705      HS_CMP_HALF(3, r4);
706      HS_CMP_HALF(4, r5);
707      HS_CMP_HALF(5, r6);
708      HS_CMP_HALF(6, r7);
709      HS_CMP_HALF(7, r8);
710      HS_CMP_HALF(8, r9);
711      HS_CMP_HALF(9, r10);
712      HS_CMP_HALF(10, r11);
713      HS_CMP_HALF(11, r12);
714      HS_CMP_HALF(12, r13);
715      HS_CMP_HALF(13, r14);
716      HS_CMP_HALF(14, r15);
717      HS_CMP_HALF(15, r16);
718    }
719    HS_CMP_XCHG(r1, r9);
720    HS_CMP_XCHG(r5, r13);
721    HS_CMP_XCHG(r1, r5);
722    HS_CMP_XCHG(r9, r13);
723    HS_CMP_XCHG(r3, r11);
724    HS_CMP_XCHG(r7, r15);
725    HS_CMP_XCHG(r3, r7);
726    HS_CMP_XCHG(r11, r15);
727    HS_CMP_XCHG(r1, r3);
728    HS_CMP_XCHG(r5, r7);
729    HS_CMP_XCHG(r9, r11);
730    HS_CMP_XCHG(r13, r15);
731    HS_CMP_XCHG(r2, r10);
732    HS_CMP_XCHG(r6, r14);
733    HS_CMP_XCHG(r2, r6);
734    HS_CMP_XCHG(r10, r14);
735    HS_CMP_XCHG(r4, r12);
736    HS_CMP_XCHG(r8, r16);
737    HS_CMP_XCHG(r4, r8);
738    HS_CMP_XCHG(r12, r16);
739    HS_CMP_XCHG(r2, r4);
740    HS_CMP_XCHG(r6, r8);
741    HS_CMP_XCHG(r10, r12);
742    HS_CMP_XCHG(r14, r16);
743    HS_CMP_XCHG(r1, r2);
744    HS_CMP_XCHG(r3, r4);
745    HS_CMP_XCHG(r5, r6);
746    HS_CMP_XCHG(r7, r8);
747    HS_CMP_XCHG(r9, r10);
748    HS_CMP_XCHG(r11, r12);
749    HS_CMP_XCHG(r13, r14);
750    HS_CMP_XCHG(r15, r16);
751  }
  // Write r1..r16 back out at indices 0..15, mirroring the loads above.
752  HS_SLAB_GLOBAL_STORE(0, r1);
753  HS_SLAB_GLOBAL_STORE(1, r2);
754  HS_SLAB_GLOBAL_STORE(2, r3);
755  HS_SLAB_GLOBAL_STORE(3, r4);
756  HS_SLAB_GLOBAL_STORE(4, r5);
757  HS_SLAB_GLOBAL_STORE(5, r6);
758  HS_SLAB_GLOBAL_STORE(6, r7);
759  HS_SLAB_GLOBAL_STORE(7, r8);
760  HS_SLAB_GLOBAL_STORE(8, r9);
761  HS_SLAB_GLOBAL_STORE(9, r10);
762  HS_SLAB_GLOBAL_STORE(10, r11);
763  HS_SLAB_GLOBAL_STORE(11, r12);
764  HS_SLAB_GLOBAL_STORE(12, r13);
765  HS_SLAB_GLOBAL_STORE(13, r14);
766  HS_SLAB_GLOBAL_STORE(14, r15);
767  HS_SLAB_GLOBAL_STORE(15, r16);
768}
769
770HS_BS_KERNEL_PROTO(4, 2)
771{
772  HS_BLOCK_LOCAL_MEM_DECL(32, 16);
773
774  HS_SLAB_GLOBAL_PREAMBLE();
775  HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0);
776  HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1);
777  HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2);
778  HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3);
779  HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4);
780  HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5);
781  HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6);
782  HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7);
783  HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vin, 8);
784  HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vin, 9);
785  HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vin, 10);
786  HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vin, 11);
787  HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vin, 12);
788  HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vin, 13);
789  HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vin, 14);
790  HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vin, 15);
791  HS_CMP_XCHG(r1, r2);
792  HS_CMP_XCHG(r3, r4);
793  HS_CMP_XCHG(r5, r6);
794  HS_CMP_XCHG(r7, r8);
795  HS_CMP_XCHG(r9, r10);
796  HS_CMP_XCHG(r11, r12);
797  HS_CMP_XCHG(r13, r14);
798  HS_CMP_XCHG(r15, r16);
799  HS_CMP_XCHG(r1, r3);
800  HS_CMP_XCHG(r5, r7);
801  HS_CMP_XCHG(r9, r11);
802  HS_CMP_XCHG(r13, r15);
803  HS_CMP_XCHG(r2, r4);
804  HS_CMP_XCHG(r6, r8);
805  HS_CMP_XCHG(r10, r12);
806  HS_CMP_XCHG(r14, r16);
807  HS_CMP_XCHG(r1, r5);
808  HS_CMP_XCHG(r9, r13);
809  HS_CMP_XCHG(r2, r6);
810  HS_CMP_XCHG(r10, r14);
811  HS_CMP_XCHG(r3, r7);
812  HS_CMP_XCHG(r11, r15);
813  HS_CMP_XCHG(r4, r8);
814  HS_CMP_XCHG(r12, r16);
815  HS_CMP_XCHG(r1, r9);
816  HS_CMP_XCHG(r2, r10);
817  HS_CMP_XCHG(r3, r11);
818  HS_CMP_XCHG(r4, r12);
819  HS_CMP_XCHG(r5, r13);
820  HS_CMP_XCHG(r6, r14);
821  HS_CMP_XCHG(r7, r15);
822  HS_CMP_XCHG(r8, r16);
823  HS_CMP_XCHG(r6, r11);
824  HS_CMP_XCHG(r7, r10);
825  HS_CMP_XCHG(r4, r13);
826  HS_CMP_XCHG(r14, r15);
827  HS_CMP_XCHG(r8, r12);
828  HS_CMP_XCHG(r2, r3);
829  HS_CMP_XCHG(r5, r9);
830  HS_CMP_XCHG(r2, r5);
831  HS_CMP_XCHG(r8, r14);
832  HS_CMP_XCHG(r3, r9);
833  HS_CMP_XCHG(r12, r15);
834  HS_CMP_XCHG(r3, r5);
835  HS_CMP_XCHG(r6, r7);
836  HS_CMP_XCHG(r10, r11);
837  HS_CMP_XCHG(r12, r14);
838  HS_CMP_XCHG(r4, r9);
839  HS_CMP_XCHG(r8, r13);
840  HS_CMP_XCHG(r7, r9);
841  HS_CMP_XCHG(r11, r13);
842  HS_CMP_XCHG(r4, r6);
843  HS_CMP_XCHG(r8, r10);
844  HS_CMP_XCHG(r4, r5);
845  HS_CMP_XCHG(r6, r7);
846  HS_CMP_XCHG(r8, r9);
847  HS_CMP_XCHG(r10, r11);
848  HS_CMP_XCHG(r12, r13);
849  HS_CMP_XCHG(r7, r8);
850  HS_CMP_XCHG(r9, r10);
851  {
852    HS_SLAB_FLIP_PREAMBLE(1);
853    HS_CMP_FLIP(0, r1, r16);
854    HS_CMP_FLIP(1, r2, r15);
855    HS_CMP_FLIP(2, r3, r14);
856    HS_CMP_FLIP(3, r4, r13);
857    HS_CMP_FLIP(4, r5, r12);
858    HS_CMP_FLIP(5, r6, r11);
859    HS_CMP_FLIP(6, r7, r10);
860    HS_CMP_FLIP(7, r8, r9);
861  }
862  HS_CMP_XCHG(r1, r9);
863  HS_CMP_XCHG(r5, r13);
864  HS_CMP_XCHG(r1, r5);
865  HS_CMP_XCHG(r9, r13);
866  HS_CMP_XCHG(r3, r11);
867  HS_CMP_XCHG(r7, r15);
868  HS_CMP_XCHG(r3, r7);
869  HS_CMP_XCHG(r11, r15);
870  HS_CMP_XCHG(r1, r3);
871  HS_CMP_XCHG(r5, r7);
872  HS_CMP_XCHG(r9, r11);
873  HS_CMP_XCHG(r13, r15);
874  HS_CMP_XCHG(r2, r10);
875  HS_CMP_XCHG(r6, r14);
876  HS_CMP_XCHG(r2, r6);
877  HS_CMP_XCHG(r10, r14);
878  HS_CMP_XCHG(r4, r12);
879  HS_CMP_XCHG(r8, r16);
880  HS_CMP_XCHG(r4, r8);
881  HS_CMP_XCHG(r12, r16);
882  HS_CMP_XCHG(r2, r4);
883  HS_CMP_XCHG(r6, r8);
884  HS_CMP_XCHG(r10, r12);
885  HS_CMP_XCHG(r14, r16);
886  HS_CMP_XCHG(r1, r2);
887  HS_CMP_XCHG(r3, r4);
888  HS_CMP_XCHG(r5, r6);
889  HS_CMP_XCHG(r7, r8);
890  HS_CMP_XCHG(r9, r10);
891  HS_CMP_XCHG(r11, r12);
892  HS_CMP_XCHG(r13, r14);
893  HS_CMP_XCHG(r15, r16);
894  {
895    HS_SLAB_FLIP_PREAMBLE(3);
896    HS_CMP_FLIP(0, r1, r16);
897    HS_CMP_FLIP(1, r2, r15);
898    HS_CMP_FLIP(2, r3, r14);
899    HS_CMP_FLIP(3, r4, r13);
900    HS_CMP_FLIP(4, r5, r12);
901    HS_CMP_FLIP(5, r6, r11);
902    HS_CMP_FLIP(6, r7, r10);
903    HS_CMP_FLIP(7, r8, r9);
904  }
905  {
906    HS_SLAB_HALF_PREAMBLE(1);
907    HS_CMP_HALF(0, r1);
908    HS_CMP_HALF(1, r2);
909    HS_CMP_HALF(2, r3);
910    HS_CMP_HALF(3, r4);
911    HS_CMP_HALF(4, r5);
912    HS_CMP_HALF(5, r6);
913    HS_CMP_HALF(6, r7);
914    HS_CMP_HALF(7, r8);
915    HS_CMP_HALF(8, r9);
916    HS_CMP_HALF(9, r10);
917    HS_CMP_HALF(10, r11);
918    HS_CMP_HALF(11, r12);
919    HS_CMP_HALF(12, r13);
920    HS_CMP_HALF(13, r14);
921    HS_CMP_HALF(14, r15);
922    HS_CMP_HALF(15, r16);
923  }
924  HS_CMP_XCHG(r1, r9);
925  HS_CMP_XCHG(r5, r13);
926  HS_CMP_XCHG(r1, r5);
927  HS_CMP_XCHG(r9, r13);
928  HS_CMP_XCHG(r3, r11);
929  HS_CMP_XCHG(r7, r15);
930  HS_CMP_XCHG(r3, r7);
931  HS_CMP_XCHG(r11, r15);
932  HS_CMP_XCHG(r1, r3);
933  HS_CMP_XCHG(r5, r7);
934  HS_CMP_XCHG(r9, r11);
935  HS_CMP_XCHG(r13, r15);
936  HS_CMP_XCHG(r2, r10);
937  HS_CMP_XCHG(r6, r14);
938  HS_CMP_XCHG(r2, r6);
939  HS_CMP_XCHG(r10, r14);
940  HS_CMP_XCHG(r4, r12);
941  HS_CMP_XCHG(r8, r16);
942  HS_CMP_XCHG(r4, r8);
943  HS_CMP_XCHG(r12, r16);
944  HS_CMP_XCHG(r2, r4);
945  HS_CMP_XCHG(r6, r8);
946  HS_CMP_XCHG(r10, r12);
947  HS_CMP_XCHG(r14, r16);
948  HS_CMP_XCHG(r1, r2);
949  HS_CMP_XCHG(r3, r4);
950  HS_CMP_XCHG(r5, r6);
951  HS_CMP_XCHG(r7, r8);
952  HS_CMP_XCHG(r9, r10);
953  HS_CMP_XCHG(r11, r12);
954  HS_CMP_XCHG(r13, r14);
955  HS_CMP_XCHG(r15, r16);
956  {
957    HS_SLAB_FLIP_PREAMBLE(7);
958    HS_CMP_FLIP(0, r1, r16);
959    HS_CMP_FLIP(1, r2, r15);
960    HS_CMP_FLIP(2, r3, r14);
961    HS_CMP_FLIP(3, r4, r13);
962    HS_CMP_FLIP(4, r5, r12);
963    HS_CMP_FLIP(5, r6, r11);
964    HS_CMP_FLIP(6, r7, r10);
965    HS_CMP_FLIP(7, r8, r9);
966  }
967  {
968    HS_SLAB_HALF_PREAMBLE(2);
969    HS_CMP_HALF(0, r1);
970    HS_CMP_HALF(1, r2);
971    HS_CMP_HALF(2, r3);
972    HS_CMP_HALF(3, r4);
973    HS_CMP_HALF(4, r5);
974    HS_CMP_HALF(5, r6);
975    HS_CMP_HALF(6, r7);
976    HS_CMP_HALF(7, r8);
977    HS_CMP_HALF(8, r9);
978    HS_CMP_HALF(9, r10);
979    HS_CMP_HALF(10, r11);
980    HS_CMP_HALF(11, r12);
981    HS_CMP_HALF(12, r13);
982    HS_CMP_HALF(13, r14);
983    HS_CMP_HALF(14, r15);
984    HS_CMP_HALF(15, r16);
985  }
986  {
987    HS_SLAB_HALF_PREAMBLE(1);
988    HS_CMP_HALF(0, r1);
989    HS_CMP_HALF(1, r2);
990    HS_CMP_HALF(2, r3);
991    HS_CMP_HALF(3, r4);
992    HS_CMP_HALF(4, r5);
993    HS_CMP_HALF(5, r6);
994    HS_CMP_HALF(6, r7);
995    HS_CMP_HALF(7, r8);
996    HS_CMP_HALF(8, r9);
997    HS_CMP_HALF(9, r10);
998    HS_CMP_HALF(10, r11);
999    HS_CMP_HALF(11, r12);
1000    HS_CMP_HALF(12, r13);
1001    HS_CMP_HALF(13, r14);
1002    HS_CMP_HALF(14, r15);
1003    HS_CMP_HALF(15, r16);
1004  }
1005  HS_CMP_XCHG(r1, r9);
1006  HS_CMP_XCHG(r5, r13);
1007  HS_CMP_XCHG(r1, r5);
1008  HS_CMP_XCHG(r9, r13);
1009  HS_CMP_XCHG(r3, r11);
1010  HS_CMP_XCHG(r7, r15);
1011  HS_CMP_XCHG(r3, r7);
1012  HS_CMP_XCHG(r11, r15);
1013  HS_CMP_XCHG(r1, r3);
1014  HS_CMP_XCHG(r5, r7);
1015  HS_CMP_XCHG(r9, r11);
1016  HS_CMP_XCHG(r13, r15);
1017  HS_CMP_XCHG(r2, r10);
1018  HS_CMP_XCHG(r6, r14);
1019  HS_CMP_XCHG(r2, r6);
1020  HS_CMP_XCHG(r10, r14);
1021  HS_CMP_XCHG(r4, r12);
1022  HS_CMP_XCHG(r8, r16);
1023  HS_CMP_XCHG(r4, r8);
1024  HS_CMP_XCHG(r12, r16);
1025  HS_CMP_XCHG(r2, r4);
1026  HS_CMP_XCHG(r6, r8);
1027  HS_CMP_XCHG(r10, r12);
1028  HS_CMP_XCHG(r14, r16);
1029  HS_CMP_XCHG(r1, r2);
1030  HS_CMP_XCHG(r3, r4);
1031  HS_CMP_XCHG(r5, r6);
1032  HS_CMP_XCHG(r7, r8);
1033  HS_CMP_XCHG(r9, r10);
1034  HS_CMP_XCHG(r11, r12);
1035  HS_CMP_XCHG(r13, r14);
1036  HS_CMP_XCHG(r15, r16);
1037  HS_BS_MERGE_H_PREAMBLE(4);
1038  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0) = r1;
1039  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1) = r16;
1040  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2) = r2;
1041  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3) = r15;
1042  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4) = r3;
1043  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5) = r14;
1044  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6) = r4;
1045  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7) = r13;
1046  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 8) = r5;
1047  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 9) = r12;
1048  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 10) = r6;
1049  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 11) = r11;
1050  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 12) = r7;
1051  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 13) = r10;
1052  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 14) = r8;
1053  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 15) = r9;
1054  HS_BLOCK_BARRIER();
1055  {
1056    {
1057      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
1058      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(8);
1059      HS_CMP_XCHG(r0_1, r0_2);
1060      HS_SLAB_LOCAL_L(0) = r0_1;
1061      HS_SLAB_LOCAL_R(8) = r0_2;
1062    }
1063    {
1064      HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(16);
1065      HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(24);
1066      HS_CMP_XCHG(r1_1, r1_2);
1067      HS_SLAB_LOCAL_L(16) = r1_1;
1068      HS_SLAB_LOCAL_R(24) = r1_2;
1069    }
1070    {
1071      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(128);
1072      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(136);
1073      HS_CMP_XCHG(r0_1, r0_2);
1074      HS_SLAB_LOCAL_L(128) = r0_1;
1075      HS_SLAB_LOCAL_R(136) = r0_2;
1076    }
1077    {
1078      HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(144);
1079      HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(152);
1080      HS_CMP_XCHG(r1_1, r1_2);
1081      HS_SLAB_LOCAL_L(144) = r1_1;
1082      HS_SLAB_LOCAL_R(152) = r1_2;
1083    }
1084    {
1085      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(256);
1086      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(264);
1087      HS_CMP_XCHG(r0_1, r0_2);
1088      HS_SLAB_LOCAL_L(256) = r0_1;
1089      HS_SLAB_LOCAL_R(264) = r0_2;
1090    }
1091    {
1092      HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(272);
1093      HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(280);
1094      HS_CMP_XCHG(r1_1, r1_2);
1095      HS_SLAB_LOCAL_L(272) = r1_1;
1096      HS_SLAB_LOCAL_R(280) = r1_2;
1097    }
1098    {
1099      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(384);
1100      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(392);
1101      HS_CMP_XCHG(r0_1, r0_2);
1102      HS_SLAB_LOCAL_L(384) = r0_1;
1103      HS_SLAB_LOCAL_R(392) = r0_2;
1104    }
1105    {
1106      HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(400);
1107      HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(408);
1108      HS_CMP_XCHG(r1_1, r1_2);
1109      HS_SLAB_LOCAL_L(400) = r1_1;
1110      HS_SLAB_LOCAL_R(408) = r1_2;
1111    }
1112  }
1113  HS_BLOCK_BARRIER();
1114  r1 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0);
1115  r16 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1);
1116  r2 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2);
1117  r15 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3);
1118  r3 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4);
1119  r14 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5);
1120  r4 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6);
1121  r13 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7);
1122  r5 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 8);
1123  r12 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 9);
1124  r6 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 10);
1125  r11 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 11);
1126  r7 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 12);
1127  r10 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 13);
1128  r8 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 14);
1129  r9 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 15);
1130  {
1131    {
1132      HS_SLAB_HALF_PREAMBLE(4);
1133      HS_CMP_HALF(0, r1);
1134      HS_CMP_HALF(1, r2);
1135      HS_CMP_HALF(2, r3);
1136      HS_CMP_HALF(3, r4);
1137      HS_CMP_HALF(4, r5);
1138      HS_CMP_HALF(5, r6);
1139      HS_CMP_HALF(6, r7);
1140      HS_CMP_HALF(7, r8);
1141      HS_CMP_HALF(8, r9);
1142      HS_CMP_HALF(9, r10);
1143      HS_CMP_HALF(10, r11);
1144      HS_CMP_HALF(11, r12);
1145      HS_CMP_HALF(12, r13);
1146      HS_CMP_HALF(13, r14);
1147      HS_CMP_HALF(14, r15);
1148      HS_CMP_HALF(15, r16);
1149    }
1150    {
1151      HS_SLAB_HALF_PREAMBLE(2);
1152      HS_CMP_HALF(0, r1);
1153      HS_CMP_HALF(1, r2);
1154      HS_CMP_HALF(2, r3);
1155      HS_CMP_HALF(3, r4);
1156      HS_CMP_HALF(4, r5);
1157      HS_CMP_HALF(5, r6);
1158      HS_CMP_HALF(6, r7);
1159      HS_CMP_HALF(7, r8);
1160      HS_CMP_HALF(8, r9);
1161      HS_CMP_HALF(9, r10);
1162      HS_CMP_HALF(10, r11);
1163      HS_CMP_HALF(11, r12);
1164      HS_CMP_HALF(12, r13);
1165      HS_CMP_HALF(13, r14);
1166      HS_CMP_HALF(14, r15);
1167      HS_CMP_HALF(15, r16);
1168    }
1169    {
1170      HS_SLAB_HALF_PREAMBLE(1);
1171      HS_CMP_HALF(0, r1);
1172      HS_CMP_HALF(1, r2);
1173      HS_CMP_HALF(2, r3);
1174      HS_CMP_HALF(3, r4);
1175      HS_CMP_HALF(4, r5);
1176      HS_CMP_HALF(5, r6);
1177      HS_CMP_HALF(6, r7);
1178      HS_CMP_HALF(7, r8);
1179      HS_CMP_HALF(8, r9);
1180      HS_CMP_HALF(9, r10);
1181      HS_CMP_HALF(10, r11);
1182      HS_CMP_HALF(11, r12);
1183      HS_CMP_HALF(12, r13);
1184      HS_CMP_HALF(13, r14);
1185      HS_CMP_HALF(14, r15);
1186      HS_CMP_HALF(15, r16);
1187    }
1188    HS_CMP_XCHG(r1, r9);
1189    HS_CMP_XCHG(r5, r13);
1190    HS_CMP_XCHG(r1, r5);
1191    HS_CMP_XCHG(r9, r13);
1192    HS_CMP_XCHG(r3, r11);
1193    HS_CMP_XCHG(r7, r15);
1194    HS_CMP_XCHG(r3, r7);
1195    HS_CMP_XCHG(r11, r15);
1196    HS_CMP_XCHG(r1, r3);
1197    HS_CMP_XCHG(r5, r7);
1198    HS_CMP_XCHG(r9, r11);
1199    HS_CMP_XCHG(r13, r15);
1200    HS_CMP_XCHG(r2, r10);
1201    HS_CMP_XCHG(r6, r14);
1202    HS_CMP_XCHG(r2, r6);
1203    HS_CMP_XCHG(r10, r14);
1204    HS_CMP_XCHG(r4, r12);
1205    HS_CMP_XCHG(r8, r16);
1206    HS_CMP_XCHG(r4, r8);
1207    HS_CMP_XCHG(r12, r16);
1208    HS_CMP_XCHG(r2, r4);
1209    HS_CMP_XCHG(r6, r8);
1210    HS_CMP_XCHG(r10, r12);
1211    HS_CMP_XCHG(r14, r16);
1212    HS_CMP_XCHG(r1, r2);
1213    HS_CMP_XCHG(r3, r4);
1214    HS_CMP_XCHG(r5, r6);
1215    HS_CMP_XCHG(r7, r8);
1216    HS_CMP_XCHG(r9, r10);
1217    HS_CMP_XCHG(r11, r12);
1218    HS_CMP_XCHG(r13, r14);
1219    HS_CMP_XCHG(r15, r16);
1220  }
1221  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0) = r1;
1222  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1) = r16;
1223  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2) = r2;
1224  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3) = r15;
1225  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4) = r3;
1226  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5) = r14;
1227  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6) = r4;
1228  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7) = r13;
1229  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 8) = r5;
1230  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 9) = r12;
1231  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 10) = r6;
1232  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 11) = r11;
1233  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 12) = r7;
1234  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 13) = r10;
1235  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 14) = r8;
1236  HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 15) = r9;
1237  HS_BLOCK_BARRIER();
1238  {
1239    {
1240      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
1241      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(8);
1242      HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(16);
1243      HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(24);
1244      HS_CMP_XCHG(r0_2, r0_3);
1245      HS_CMP_XCHG(r0_1, r0_4);
1246      HS_CMP_XCHG(r0_3, r0_4);
1247      HS_CMP_XCHG(r0_1, r0_2);
1248      HS_SLAB_LOCAL_L(0) = r0_1;
1249      HS_SLAB_LOCAL_L(8) = r0_2;
1250      HS_SLAB_LOCAL_R(16) = r0_3;
1251      HS_SLAB_LOCAL_R(24) = r0_4;
1252    }
1253    {
1254      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(128);
1255      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(136);
1256      HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(144);
1257      HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(152);
1258      HS_CMP_XCHG(r0_2, r0_3);
1259      HS_CMP_XCHG(r0_1, r0_4);
1260      HS_CMP_XCHG(r0_3, r0_4);
1261      HS_CMP_XCHG(r0_1, r0_2);
1262      HS_SLAB_LOCAL_L(128) = r0_1;
1263      HS_SLAB_LOCAL_L(136) = r0_2;
1264      HS_SLAB_LOCAL_R(144) = r0_3;
1265      HS_SLAB_LOCAL_R(152) = r0_4;
1266    }
1267    {
1268      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(256);
1269      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(264);
1270      HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(272);
1271      HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(280);
1272      HS_CMP_XCHG(r0_2, r0_3);
1273      HS_CMP_XCHG(r0_1, r0_4);
1274      HS_CMP_XCHG(r0_3, r0_4);
1275      HS_CMP_XCHG(r0_1, r0_2);
1276      HS_SLAB_LOCAL_L(256) = r0_1;
1277      HS_SLAB_LOCAL_L(264) = r0_2;
1278      HS_SLAB_LOCAL_R(272) = r0_3;
1279      HS_SLAB_LOCAL_R(280) = r0_4;
1280    }
1281    {
1282      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(384);
1283      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(392);
1284      HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(400);
1285      HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(408);
1286      HS_CMP_XCHG(r0_2, r0_3);
1287      HS_CMP_XCHG(r0_1, r0_4);
1288      HS_CMP_XCHG(r0_3, r0_4);
1289      HS_CMP_XCHG(r0_1, r0_2);
1290      HS_SLAB_LOCAL_L(384) = r0_1;
1291      HS_SLAB_LOCAL_L(392) = r0_2;
1292      HS_SLAB_LOCAL_R(400) = r0_3;
1293      HS_SLAB_LOCAL_R(408) = r0_4;
1294    }
1295  }
1296  HS_BLOCK_BARRIER();
1297  r1 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0);
1298  r16 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1);
1299  r2 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2);
1300  r15 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3);
1301  r3 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4);
1302  r14 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5);
1303  r4 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6);
1304  r13 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7);
1305  r5 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 8);
1306  r12 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 9);
1307  r6 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 10);
1308  r11 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 11);
1309  r7 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 12);
1310  r10 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 13);
1311  r8 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 14);
1312  r9 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 15);
1313  {
1314    {
1315      HS_SLAB_HALF_PREAMBLE(4);
1316      HS_CMP_HALF(0, r1);
1317      HS_CMP_HALF(1, r2);
1318      HS_CMP_HALF(2, r3);
1319      HS_CMP_HALF(3, r4);
1320      HS_CMP_HALF(4, r5);
1321      HS_CMP_HALF(5, r6);
1322      HS_CMP_HALF(6, r7);
1323      HS_CMP_HALF(7, r8);
1324      HS_CMP_HALF(8, r9);
1325      HS_CMP_HALF(9, r10);
1326      HS_CMP_HALF(10, r11);
1327      HS_CMP_HALF(11, r12);
1328      HS_CMP_HALF(12, r13);
1329      HS_CMP_HALF(13, r14);
1330      HS_CMP_HALF(14, r15);
1331      HS_CMP_HALF(15, r16);
1332    }
1333    {
1334      HS_SLAB_HALF_PREAMBLE(2);
1335      HS_CMP_HALF(0, r1);
1336      HS_CMP_HALF(1, r2);
1337      HS_CMP_HALF(2, r3);
1338      HS_CMP_HALF(3, r4);
1339      HS_CMP_HALF(4, r5);
1340      HS_CMP_HALF(5, r6);
1341      HS_CMP_HALF(6, r7);
1342      HS_CMP_HALF(7, r8);
1343      HS_CMP_HALF(8, r9);
1344      HS_CMP_HALF(9, r10);
1345      HS_CMP_HALF(10, r11);
1346      HS_CMP_HALF(11, r12);
1347      HS_CMP_HALF(12, r13);
1348      HS_CMP_HALF(13, r14);
1349      HS_CMP_HALF(14, r15);
1350      HS_CMP_HALF(15, r16);
1351    }
1352    {
1353      HS_SLAB_HALF_PREAMBLE(1);
1354      HS_CMP_HALF(0, r1);
1355      HS_CMP_HALF(1, r2);
1356      HS_CMP_HALF(2, r3);
1357      HS_CMP_HALF(3, r4);
1358      HS_CMP_HALF(4, r5);
1359      HS_CMP_HALF(5, r6);
1360      HS_CMP_HALF(6, r7);
1361      HS_CMP_HALF(7, r8);
1362      HS_CMP_HALF(8, r9);
1363      HS_CMP_HALF(9, r10);
1364      HS_CMP_HALF(10, r11);
1365      HS_CMP_HALF(11, r12);
1366      HS_CMP_HALF(12, r13);
1367      HS_CMP_HALF(13, r14);
1368      HS_CMP_HALF(14, r15);
1369      HS_CMP_HALF(15, r16);
1370    }
1371    HS_CMP_XCHG(r1, r9);
1372    HS_CMP_XCHG(r5, r13);
1373    HS_CMP_XCHG(r1, r5);
1374    HS_CMP_XCHG(r9, r13);
1375    HS_CMP_XCHG(r3, r11);
1376    HS_CMP_XCHG(r7, r15);
1377    HS_CMP_XCHG(r3, r7);
1378    HS_CMP_XCHG(r11, r15);
1379    HS_CMP_XCHG(r1, r3);
1380    HS_CMP_XCHG(r5, r7);
1381    HS_CMP_XCHG(r9, r11);
1382    HS_CMP_XCHG(r13, r15);
1383    HS_CMP_XCHG(r2, r10);
1384    HS_CMP_XCHG(r6, r14);
1385    HS_CMP_XCHG(r2, r6);
1386    HS_CMP_XCHG(r10, r14);
1387    HS_CMP_XCHG(r4, r12);
1388    HS_CMP_XCHG(r8, r16);
1389    HS_CMP_XCHG(r4, r8);
1390    HS_CMP_XCHG(r12, r16);
1391    HS_CMP_XCHG(r2, r4);
1392    HS_CMP_XCHG(r6, r8);
1393    HS_CMP_XCHG(r10, r12);
1394    HS_CMP_XCHG(r14, r16);
1395    HS_CMP_XCHG(r1, r2);
1396    HS_CMP_XCHG(r3, r4);
1397    HS_CMP_XCHG(r5, r6);
1398    HS_CMP_XCHG(r7, r8);
1399    HS_CMP_XCHG(r9, r10);
1400    HS_CMP_XCHG(r11, r12);
1401    HS_CMP_XCHG(r13, r14);
1402    HS_CMP_XCHG(r15, r16);
1403  }
1404  HS_SLAB_GLOBAL_STORE(0, r1);
1405  HS_SLAB_GLOBAL_STORE(1, r2);
1406  HS_SLAB_GLOBAL_STORE(2, r3);
1407  HS_SLAB_GLOBAL_STORE(3, r4);
1408  HS_SLAB_GLOBAL_STORE(4, r5);
1409  HS_SLAB_GLOBAL_STORE(5, r6);
1410  HS_SLAB_GLOBAL_STORE(6, r7);
1411  HS_SLAB_GLOBAL_STORE(7, r8);
1412  HS_SLAB_GLOBAL_STORE(8, r9);
1413  HS_SLAB_GLOBAL_STORE(9, r10);
1414  HS_SLAB_GLOBAL_STORE(10, r11);
1415  HS_SLAB_GLOBAL_STORE(11, r12);
1416  HS_SLAB_GLOBAL_STORE(12, r13);
1417  HS_SLAB_GLOBAL_STORE(13, r14);
1418  HS_SLAB_GLOBAL_STORE(14, r15);
1419  HS_SLAB_GLOBAL_STORE(15, r16);
1420}
1421
1422HS_BS_KERNEL_PROTO(8, 3)
1423{
1424  HS_BLOCK_LOCAL_MEM_DECL(64, 16);
1425
1426  HS_SLAB_GLOBAL_PREAMBLE();
1427  HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0);
1428  HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1);
1429  HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2);
1430  HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3);
1431  HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4);
1432  HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5);
1433  HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6);
1434  HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7);
1435  HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vin, 8);
1436  HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vin, 9);
1437  HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vin, 10);
1438  HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vin, 11);
1439  HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vin, 12);
1440  HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vin, 13);
1441  HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vin, 14);
1442  HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vin, 15);
1443  HS_CMP_XCHG(r1, r2);
1444  HS_CMP_XCHG(r3, r4);
1445  HS_CMP_XCHG(r5, r6);
1446  HS_CMP_XCHG(r7, r8);
1447  HS_CMP_XCHG(r9, r10);
1448  HS_CMP_XCHG(r11, r12);
1449  HS_CMP_XCHG(r13, r14);
1450  HS_CMP_XCHG(r15, r16);
1451  HS_CMP_XCHG(r1, r3);
1452  HS_CMP_XCHG(r5, r7);
1453  HS_CMP_XCHG(r9, r11);
1454  HS_CMP_XCHG(r13, r15);
1455  HS_CMP_XCHG(r2, r4);
1456  HS_CMP_XCHG(r6, r8);
1457  HS_CMP_XCHG(r10, r12);
1458  HS_CMP_XCHG(r14, r16);
1459  HS_CMP_XCHG(r1, r5);
1460  HS_CMP_XCHG(r9, r13);
1461  HS_CMP_XCHG(r2, r6);
1462  HS_CMP_XCHG(r10, r14);
1463  HS_CMP_XCHG(r3, r7);
1464  HS_CMP_XCHG(r11, r15);
1465  HS_CMP_XCHG(r4, r8);
1466  HS_CMP_XCHG(r12, r16);
1467  HS_CMP_XCHG(r1, r9);
1468  HS_CMP_XCHG(r2, r10);
1469  HS_CMP_XCHG(r3, r11);
1470  HS_CMP_XCHG(r4, r12);
1471  HS_CMP_XCHG(r5, r13);
1472  HS_CMP_XCHG(r6, r14);
1473  HS_CMP_XCHG(r7, r15);
1474  HS_CMP_XCHG(r8, r16);
1475  HS_CMP_XCHG(r6, r11);
1476  HS_CMP_XCHG(r7, r10);
1477  HS_CMP_XCHG(r4, r13);
1478  HS_CMP_XCHG(r14, r15);
1479  HS_CMP_XCHG(r8, r12);
1480  HS_CMP_XCHG(r2, r3);
1481  HS_CMP_XCHG(r5, r9);
1482  HS_CMP_XCHG(r2, r5);
1483  HS_CMP_XCHG(r8, r14);
1484  HS_CMP_XCHG(r3, r9);
1485  HS_CMP_XCHG(r12, r15);
1486  HS_CMP_XCHG(r3, r5);
1487  HS_CMP_XCHG(r6, r7);
1488  HS_CMP_XCHG(r10, r11);
1489  HS_CMP_XCHG(r12, r14);
1490  HS_CMP_XCHG(r4, r9);
1491  HS_CMP_XCHG(r8, r13);
1492  HS_CMP_XCHG(r7, r9);
1493  HS_CMP_XCHG(r11, r13);
1494  HS_CMP_XCHG(r4, r6);
1495  HS_CMP_XCHG(r8, r10);
1496  HS_CMP_XCHG(r4, r5);
1497  HS_CMP_XCHG(r6, r7);
1498  HS_CMP_XCHG(r8, r9);
1499  HS_CMP_XCHG(r10, r11);
1500  HS_CMP_XCHG(r12, r13);
1501  HS_CMP_XCHG(r7, r8);
1502  HS_CMP_XCHG(r9, r10);
1503  {
1504    HS_SLAB_FLIP_PREAMBLE(1);
1505    HS_CMP_FLIP(0, r1, r16);
1506    HS_CMP_FLIP(1, r2, r15);
1507    HS_CMP_FLIP(2, r3, r14);
1508    HS_CMP_FLIP(3, r4, r13);
1509    HS_CMP_FLIP(4, r5, r12);
1510    HS_CMP_FLIP(5, r6, r11);
1511    HS_CMP_FLIP(6, r7, r10);
1512    HS_CMP_FLIP(7, r8, r9);
1513  }
1514  HS_CMP_XCHG(r1, r9);
1515  HS_CMP_XCHG(r5, r13);
1516  HS_CMP_XCHG(r1, r5);
1517  HS_CMP_XCHG(r9, r13);
1518  HS_CMP_XCHG(r3, r11);
1519  HS_CMP_XCHG(r7, r15);
1520  HS_CMP_XCHG(r3, r7);
1521  HS_CMP_XCHG(r11, r15);
1522  HS_CMP_XCHG(r1, r3);
1523  HS_CMP_XCHG(r5, r7);
1524  HS_CMP_XCHG(r9, r11);
1525  HS_CMP_XCHG(r13, r15);
1526  HS_CMP_XCHG(r2, r10);
1527  HS_CMP_XCHG(r6, r14);
1528  HS_CMP_XCHG(r2, r6);
1529  HS_CMP_XCHG(r10, r14);
1530  HS_CMP_XCHG(r4, r12);
1531  HS_CMP_XCHG(r8, r16);
1532  HS_CMP_XCHG(r4, r8);
1533  HS_CMP_XCHG(r12, r16);
1534  HS_CMP_XCHG(r2, r4);
1535  HS_CMP_XCHG(r6, r8);
1536  HS_CMP_XCHG(r10, r12);
1537  HS_CMP_XCHG(r14, r16);
1538  HS_CMP_XCHG(r1, r2);
1539  HS_CMP_XCHG(r3, r4);
1540  HS_CMP_XCHG(r5, r6);
1541  HS_CMP_XCHG(r7, r8);
1542  HS_CMP_XCHG(r9, r10);
1543  HS_CMP_XCHG(r11, r12);
1544  HS_CMP_XCHG(r13, r14);
1545  HS_CMP_XCHG(r15, r16);
1546  {
1547    HS_SLAB_FLIP_PREAMBLE(3);
1548    HS_CMP_FLIP(0, r1, r16);
1549    HS_CMP_FLIP(1, r2, r15);
1550    HS_CMP_FLIP(2, r3, r14);
1551    HS_CMP_FLIP(3, r4, r13);
1552    HS_CMP_FLIP(4, r5, r12);
1553    HS_CMP_FLIP(5, r6, r11);
1554    HS_CMP_FLIP(6, r7, r10);
1555    HS_CMP_FLIP(7, r8, r9);
1556  }
1557  {
1558    HS_SLAB_HALF_PREAMBLE(1);
1559    HS_CMP_HALF(0, r1);
1560    HS_CMP_HALF(1, r2);
1561    HS_CMP_HALF(2, r3);
1562    HS_CMP_HALF(3, r4);
1563    HS_CMP_HALF(4, r5);
1564    HS_CMP_HALF(5, r6);
1565    HS_CMP_HALF(6, r7);
1566    HS_CMP_HALF(7, r8);
1567    HS_CMP_HALF(8, r9);
1568    HS_CMP_HALF(9, r10);
1569    HS_CMP_HALF(10, r11);
1570    HS_CMP_HALF(11, r12);
1571    HS_CMP_HALF(12, r13);
1572    HS_CMP_HALF(13, r14);
1573    HS_CMP_HALF(14, r15);
1574    HS_CMP_HALF(15, r16);
1575  }
1576  HS_CMP_XCHG(r1, r9);
1577  HS_CMP_XCHG(r5, r13);
1578  HS_CMP_XCHG(r1, r5);
1579  HS_CMP_XCHG(r9, r13);
1580  HS_CMP_XCHG(r3, r11);
1581  HS_CMP_XCHG(r7, r15);
1582  HS_CMP_XCHG(r3, r7);
1583  HS_CMP_XCHG(r11, r15);
1584  HS_CMP_XCHG(r1, r3);
1585  HS_CMP_XCHG(r5, r7);
1586  HS_CMP_XCHG(r9, r11);
1587  HS_CMP_XCHG(r13, r15);
1588  HS_CMP_XCHG(r2, r10);
1589  HS_CMP_XCHG(r6, r14);
1590  HS_CMP_XCHG(r2, r6);
1591  HS_CMP_XCHG(r10, r14);
1592  HS_CMP_XCHG(r4, r12);
1593  HS_CMP_XCHG(r8, r16);
1594  HS_CMP_XCHG(r4, r8);
1595  HS_CMP_XCHG(r12, r16);
1596  HS_CMP_XCHG(r2, r4);
1597  HS_CMP_XCHG(r6, r8);
1598  HS_CMP_XCHG(r10, r12);
1599  HS_CMP_XCHG(r14, r16);
1600  HS_CMP_XCHG(r1, r2);
1601  HS_CMP_XCHG(r3, r4);
1602  HS_CMP_XCHG(r5, r6);
1603  HS_CMP_XCHG(r7, r8);
1604  HS_CMP_XCHG(r9, r10);
1605  HS_CMP_XCHG(r11, r12);
1606  HS_CMP_XCHG(r13, r14);
1607  HS_CMP_XCHG(r15, r16);
1608  {
1609    HS_SLAB_FLIP_PREAMBLE(7);
1610    HS_CMP_FLIP(0, r1, r16);
1611    HS_CMP_FLIP(1, r2, r15);
1612    HS_CMP_FLIP(2, r3, r14);
1613    HS_CMP_FLIP(3, r4, r13);
1614    HS_CMP_FLIP(4, r5, r12);
1615    HS_CMP_FLIP(5, r6, r11);
1616    HS_CMP_FLIP(6, r7, r10);
1617    HS_CMP_FLIP(7, r8, r9);
1618  }
1619  {
1620    HS_SLAB_HALF_PREAMBLE(2);
1621    HS_CMP_HALF(0, r1);
1622    HS_CMP_HALF(1, r2);
1623    HS_CMP_HALF(2, r3);
1624    HS_CMP_HALF(3, r4);
1625    HS_CMP_HALF(4, r5);
1626    HS_CMP_HALF(5, r6);
1627    HS_CMP_HALF(6, r7);
1628    HS_CMP_HALF(7, r8);
1629    HS_CMP_HALF(8, r9);
1630    HS_CMP_HALF(9, r10);
1631    HS_CMP_HALF(10, r11);
1632    HS_CMP_HALF(11, r12);
1633    HS_CMP_HALF(12, r13);
1634    HS_CMP_HALF(13, r14);
1635    HS_CMP_HALF(14, r15);
1636    HS_CMP_HALF(15, r16);
1637  }
1638  {
1639    HS_SLAB_HALF_PREAMBLE(1);
1640    HS_CMP_HALF(0, r1);
1641    HS_CMP_HALF(1, r2);
1642    HS_CMP_HALF(2, r3);
1643    HS_CMP_HALF(3, r4);
1644    HS_CMP_HALF(4, r5);
1645    HS_CMP_HALF(5, r6);
1646    HS_CMP_HALF(6, r7);
1647    HS_CMP_HALF(7, r8);
1648    HS_CMP_HALF(8, r9);
1649    HS_CMP_HALF(9, r10);
1650    HS_CMP_HALF(10, r11);
1651    HS_CMP_HALF(11, r12);
1652    HS_CMP_HALF(12, r13);
1653    HS_CMP_HALF(13, r14);
1654    HS_CMP_HALF(14, r15);
1655    HS_CMP_HALF(15, r16);
1656  }
1657  HS_CMP_XCHG(r1, r9);
1658  HS_CMP_XCHG(r5, r13);
1659  HS_CMP_XCHG(r1, r5);
1660  HS_CMP_XCHG(r9, r13);
1661  HS_CMP_XCHG(r3, r11);
1662  HS_CMP_XCHG(r7, r15);
1663  HS_CMP_XCHG(r3, r7);
1664  HS_CMP_XCHG(r11, r15);
1665  HS_CMP_XCHG(r1, r3);
1666  HS_CMP_XCHG(r5, r7);
1667  HS_CMP_XCHG(r9, r11);
1668  HS_CMP_XCHG(r13, r15);
1669  HS_CMP_XCHG(r2, r10);
1670  HS_CMP_XCHG(r6, r14);
1671  HS_CMP_XCHG(r2, r6);
1672  HS_CMP_XCHG(r10, r14);
1673  HS_CMP_XCHG(r4, r12);
1674  HS_CMP_XCHG(r8, r16);
1675  HS_CMP_XCHG(r4, r8);
1676  HS_CMP_XCHG(r12, r16);
1677  HS_CMP_XCHG(r2, r4);
1678  HS_CMP_XCHG(r6, r8);
1679  HS_CMP_XCHG(r10, r12);
1680  HS_CMP_XCHG(r14, r16);
1681  HS_CMP_XCHG(r1, r2);
1682  HS_CMP_XCHG(r3, r4);
1683  HS_CMP_XCHG(r5, r6);
1684  HS_CMP_XCHG(r7, r8);
1685  HS_CMP_XCHG(r9, r10);
1686  HS_CMP_XCHG(r11, r12);
1687  HS_CMP_XCHG(r13, r14);
1688  HS_CMP_XCHG(r15, r16);
1689  HS_BS_MERGE_H_PREAMBLE(8);
1690  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0) = r1;
1691  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1) = r16;
1692  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2) = r2;
1693  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3) = r15;
1694  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4) = r3;
1695  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5) = r14;
1696  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6) = r4;
1697  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7) = r13;
1698  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 8) = r5;
1699  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 9) = r12;
1700  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 10) = r6;
1701  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 11) = r11;
1702  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 12) = r7;
1703  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 13) = r10;
1704  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 14) = r8;
1705  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 15) = r9;
1706  HS_BLOCK_BARRIER();
1707  {
1708    {
1709      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
1710      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(8);
1711      HS_CMP_XCHG(r0_1, r0_2);
1712      HS_SLAB_LOCAL_L(0) = r0_1;
1713      HS_SLAB_LOCAL_R(8) = r0_2;
1714    }
1715    {
1716      HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(16);
1717      HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(24);
1718      HS_CMP_XCHG(r1_1, r1_2);
1719      HS_SLAB_LOCAL_L(16) = r1_1;
1720      HS_SLAB_LOCAL_R(24) = r1_2;
1721    }
1722    {
1723      HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(32);
1724      HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_R(40);
1725      HS_CMP_XCHG(r2_1, r2_2);
1726      HS_SLAB_LOCAL_L(32) = r2_1;
1727      HS_SLAB_LOCAL_R(40) = r2_2;
1728    }
1729    {
1730      HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(48);
1731      HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_R(56);
1732      HS_CMP_XCHG(r3_1, r3_2);
1733      HS_SLAB_LOCAL_L(48) = r3_1;
1734      HS_SLAB_LOCAL_R(56) = r3_2;
1735    }
1736    {
1737      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(512);
1738      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(520);
1739      HS_CMP_XCHG(r0_1, r0_2);
1740      HS_SLAB_LOCAL_L(512) = r0_1;
1741      HS_SLAB_LOCAL_R(520) = r0_2;
1742    }
1743    {
1744      HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(528);
1745      HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(536);
1746      HS_CMP_XCHG(r1_1, r1_2);
1747      HS_SLAB_LOCAL_L(528) = r1_1;
1748      HS_SLAB_LOCAL_R(536) = r1_2;
1749    }
1750    {
1751      HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(544);
1752      HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_R(552);
1753      HS_CMP_XCHG(r2_1, r2_2);
1754      HS_SLAB_LOCAL_L(544) = r2_1;
1755      HS_SLAB_LOCAL_R(552) = r2_2;
1756    }
1757    {
1758      HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(560);
1759      HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_R(568);
1760      HS_CMP_XCHG(r3_1, r3_2);
1761      HS_SLAB_LOCAL_L(560) = r3_1;
1762      HS_SLAB_LOCAL_R(568) = r3_2;
1763    }
1764  }
1765  HS_BLOCK_BARRIER();
1766  r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0);
1767  r16 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1);
1768  r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2);
1769  r15 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3);
1770  r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4);
1771  r14 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5);
1772  r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6);
1773  r13 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7);
1774  r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 8);
1775  r12 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 9);
1776  r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 10);
1777  r11 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 11);
1778  r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 12);
1779  r10 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 13);
1780  r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 14);
1781  r9 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 15);
1782  {
1783    {
1784      HS_SLAB_HALF_PREAMBLE(4);
1785      HS_CMP_HALF(0, r1);
1786      HS_CMP_HALF(1, r2);
1787      HS_CMP_HALF(2, r3);
1788      HS_CMP_HALF(3, r4);
1789      HS_CMP_HALF(4, r5);
1790      HS_CMP_HALF(5, r6);
1791      HS_CMP_HALF(6, r7);
1792      HS_CMP_HALF(7, r8);
1793      HS_CMP_HALF(8, r9);
1794      HS_CMP_HALF(9, r10);
1795      HS_CMP_HALF(10, r11);
1796      HS_CMP_HALF(11, r12);
1797      HS_CMP_HALF(12, r13);
1798      HS_CMP_HALF(13, r14);
1799      HS_CMP_HALF(14, r15);
1800      HS_CMP_HALF(15, r16);
1801    }
1802    {
1803      HS_SLAB_HALF_PREAMBLE(2);
1804      HS_CMP_HALF(0, r1);
1805      HS_CMP_HALF(1, r2);
1806      HS_CMP_HALF(2, r3);
1807      HS_CMP_HALF(3, r4);
1808      HS_CMP_HALF(4, r5);
1809      HS_CMP_HALF(5, r6);
1810      HS_CMP_HALF(6, r7);
1811      HS_CMP_HALF(7, r8);
1812      HS_CMP_HALF(8, r9);
1813      HS_CMP_HALF(9, r10);
1814      HS_CMP_HALF(10, r11);
1815      HS_CMP_HALF(11, r12);
1816      HS_CMP_HALF(12, r13);
1817      HS_CMP_HALF(13, r14);
1818      HS_CMP_HALF(14, r15);
1819      HS_CMP_HALF(15, r16);
1820    }
1821    {
1822      HS_SLAB_HALF_PREAMBLE(1);
1823      HS_CMP_HALF(0, r1);
1824      HS_CMP_HALF(1, r2);
1825      HS_CMP_HALF(2, r3);
1826      HS_CMP_HALF(3, r4);
1827      HS_CMP_HALF(4, r5);
1828      HS_CMP_HALF(5, r6);
1829      HS_CMP_HALF(6, r7);
1830      HS_CMP_HALF(7, r8);
1831      HS_CMP_HALF(8, r9);
1832      HS_CMP_HALF(9, r10);
1833      HS_CMP_HALF(10, r11);
1834      HS_CMP_HALF(11, r12);
1835      HS_CMP_HALF(12, r13);
1836      HS_CMP_HALF(13, r14);
1837      HS_CMP_HALF(14, r15);
1838      HS_CMP_HALF(15, r16);
1839    }
1840    HS_CMP_XCHG(r1, r9);
1841    HS_CMP_XCHG(r5, r13);
1842    HS_CMP_XCHG(r1, r5);
1843    HS_CMP_XCHG(r9, r13);
1844    HS_CMP_XCHG(r3, r11);
1845    HS_CMP_XCHG(r7, r15);
1846    HS_CMP_XCHG(r3, r7);
1847    HS_CMP_XCHG(r11, r15);
1848    HS_CMP_XCHG(r1, r3);
1849    HS_CMP_XCHG(r5, r7);
1850    HS_CMP_XCHG(r9, r11);
1851    HS_CMP_XCHG(r13, r15);
1852    HS_CMP_XCHG(r2, r10);
1853    HS_CMP_XCHG(r6, r14);
1854    HS_CMP_XCHG(r2, r6);
1855    HS_CMP_XCHG(r10, r14);
1856    HS_CMP_XCHG(r4, r12);
1857    HS_CMP_XCHG(r8, r16);
1858    HS_CMP_XCHG(r4, r8);
1859    HS_CMP_XCHG(r12, r16);
1860    HS_CMP_XCHG(r2, r4);
1861    HS_CMP_XCHG(r6, r8);
1862    HS_CMP_XCHG(r10, r12);
1863    HS_CMP_XCHG(r14, r16);
1864    HS_CMP_XCHG(r1, r2);
1865    HS_CMP_XCHG(r3, r4);
1866    HS_CMP_XCHG(r5, r6);
1867    HS_CMP_XCHG(r7, r8);
1868    HS_CMP_XCHG(r9, r10);
1869    HS_CMP_XCHG(r11, r12);
1870    HS_CMP_XCHG(r13, r14);
1871    HS_CMP_XCHG(r15, r16);
1872  }
1873  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0) = r1;
1874  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1) = r16;
1875  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2) = r2;
1876  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3) = r15;
1877  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4) = r3;
1878  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5) = r14;
1879  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6) = r4;
1880  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7) = r13;
1881  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 8) = r5;
1882  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 9) = r12;
1883  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 10) = r6;
1884  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 11) = r11;
1885  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 12) = r7;
1886  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 13) = r10;
1887  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 14) = r8;
1888  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 15) = r9;
1889  HS_BLOCK_BARRIER();
1890  {
1891    {
1892      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
1893      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(8);
1894      HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(16);
1895      HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(24);
1896      HS_CMP_XCHG(r0_2, r0_3);
1897      HS_CMP_XCHG(r0_1, r0_4);
1898      HS_CMP_XCHG(r0_3, r0_4);
1899      HS_CMP_XCHG(r0_1, r0_2);
1900      HS_SLAB_LOCAL_L(0) = r0_1;
1901      HS_SLAB_LOCAL_L(8) = r0_2;
1902      HS_SLAB_LOCAL_R(16) = r0_3;
1903      HS_SLAB_LOCAL_R(24) = r0_4;
1904    }
1905    {
1906      HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(32);
1907      HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(40);
1908      HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_R(48);
1909      HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_R(56);
1910      HS_CMP_XCHG(r1_2, r1_3);
1911      HS_CMP_XCHG(r1_1, r1_4);
1912      HS_CMP_XCHG(r1_3, r1_4);
1913      HS_CMP_XCHG(r1_1, r1_2);
1914      HS_SLAB_LOCAL_L(32) = r1_1;
1915      HS_SLAB_LOCAL_L(40) = r1_2;
1916      HS_SLAB_LOCAL_R(48) = r1_3;
1917      HS_SLAB_LOCAL_R(56) = r1_4;
1918    }
1919    {
1920      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(512);
1921      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(520);
1922      HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(528);
1923      HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(536);
1924      HS_CMP_XCHG(r0_2, r0_3);
1925      HS_CMP_XCHG(r0_1, r0_4);
1926      HS_CMP_XCHG(r0_3, r0_4);
1927      HS_CMP_XCHG(r0_1, r0_2);
1928      HS_SLAB_LOCAL_L(512) = r0_1;
1929      HS_SLAB_LOCAL_L(520) = r0_2;
1930      HS_SLAB_LOCAL_R(528) = r0_3;
1931      HS_SLAB_LOCAL_R(536) = r0_4;
1932    }
1933    {
1934      HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(544);
1935      HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(552);
1936      HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_R(560);
1937      HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_R(568);
1938      HS_CMP_XCHG(r1_2, r1_3);
1939      HS_CMP_XCHG(r1_1, r1_4);
1940      HS_CMP_XCHG(r1_3, r1_4);
1941      HS_CMP_XCHG(r1_1, r1_2);
1942      HS_SLAB_LOCAL_L(544) = r1_1;
1943      HS_SLAB_LOCAL_L(552) = r1_2;
1944      HS_SLAB_LOCAL_R(560) = r1_3;
1945      HS_SLAB_LOCAL_R(568) = r1_4;
1946    }
1947  }
1948  HS_BLOCK_BARRIER();
1949  r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0);
1950  r16 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1);
1951  r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2);
1952  r15 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3);
1953  r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4);
1954  r14 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5);
1955  r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6);
1956  r13 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7);
1957  r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 8);
1958  r12 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 9);
1959  r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 10);
1960  r11 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 11);
1961  r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 12);
1962  r10 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 13);
1963  r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 14);
1964  r9 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 15);
1965  {
1966    {
1967      HS_SLAB_HALF_PREAMBLE(4);
1968      HS_CMP_HALF(0, r1);
1969      HS_CMP_HALF(1, r2);
1970      HS_CMP_HALF(2, r3);
1971      HS_CMP_HALF(3, r4);
1972      HS_CMP_HALF(4, r5);
1973      HS_CMP_HALF(5, r6);
1974      HS_CMP_HALF(6, r7);
1975      HS_CMP_HALF(7, r8);
1976      HS_CMP_HALF(8, r9);
1977      HS_CMP_HALF(9, r10);
1978      HS_CMP_HALF(10, r11);
1979      HS_CMP_HALF(11, r12);
1980      HS_CMP_HALF(12, r13);
1981      HS_CMP_HALF(13, r14);
1982      HS_CMP_HALF(14, r15);
1983      HS_CMP_HALF(15, r16);
1984    }
1985    {
1986      HS_SLAB_HALF_PREAMBLE(2);
1987      HS_CMP_HALF(0, r1);
1988      HS_CMP_HALF(1, r2);
1989      HS_CMP_HALF(2, r3);
1990      HS_CMP_HALF(3, r4);
1991      HS_CMP_HALF(4, r5);
1992      HS_CMP_HALF(5, r6);
1993      HS_CMP_HALF(6, r7);
1994      HS_CMP_HALF(7, r8);
1995      HS_CMP_HALF(8, r9);
1996      HS_CMP_HALF(9, r10);
1997      HS_CMP_HALF(10, r11);
1998      HS_CMP_HALF(11, r12);
1999      HS_CMP_HALF(12, r13);
2000      HS_CMP_HALF(13, r14);
2001      HS_CMP_HALF(14, r15);
2002      HS_CMP_HALF(15, r16);
2003    }
2004    {
2005      HS_SLAB_HALF_PREAMBLE(1);
2006      HS_CMP_HALF(0, r1);
2007      HS_CMP_HALF(1, r2);
2008      HS_CMP_HALF(2, r3);
2009      HS_CMP_HALF(3, r4);
2010      HS_CMP_HALF(4, r5);
2011      HS_CMP_HALF(5, r6);
2012      HS_CMP_HALF(6, r7);
2013      HS_CMP_HALF(7, r8);
2014      HS_CMP_HALF(8, r9);
2015      HS_CMP_HALF(9, r10);
2016      HS_CMP_HALF(10, r11);
2017      HS_CMP_HALF(11, r12);
2018      HS_CMP_HALF(12, r13);
2019      HS_CMP_HALF(13, r14);
2020      HS_CMP_HALF(14, r15);
2021      HS_CMP_HALF(15, r16);
2022    }
2023    HS_CMP_XCHG(r1, r9);
2024    HS_CMP_XCHG(r5, r13);
2025    HS_CMP_XCHG(r1, r5);
2026    HS_CMP_XCHG(r9, r13);
2027    HS_CMP_XCHG(r3, r11);
2028    HS_CMP_XCHG(r7, r15);
2029    HS_CMP_XCHG(r3, r7);
2030    HS_CMP_XCHG(r11, r15);
2031    HS_CMP_XCHG(r1, r3);
2032    HS_CMP_XCHG(r5, r7);
2033    HS_CMP_XCHG(r9, r11);
2034    HS_CMP_XCHG(r13, r15);
2035    HS_CMP_XCHG(r2, r10);
2036    HS_CMP_XCHG(r6, r14);
2037    HS_CMP_XCHG(r2, r6);
2038    HS_CMP_XCHG(r10, r14);
2039    HS_CMP_XCHG(r4, r12);
2040    HS_CMP_XCHG(r8, r16);
2041    HS_CMP_XCHG(r4, r8);
2042    HS_CMP_XCHG(r12, r16);
2043    HS_CMP_XCHG(r2, r4);
2044    HS_CMP_XCHG(r6, r8);
2045    HS_CMP_XCHG(r10, r12);
2046    HS_CMP_XCHG(r14, r16);
2047    HS_CMP_XCHG(r1, r2);
2048    HS_CMP_XCHG(r3, r4);
2049    HS_CMP_XCHG(r5, r6);
2050    HS_CMP_XCHG(r7, r8);
2051    HS_CMP_XCHG(r9, r10);
2052    HS_CMP_XCHG(r11, r12);
2053    HS_CMP_XCHG(r13, r14);
2054    HS_CMP_XCHG(r15, r16);
2055  }
2056  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0) = r1;
2057  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1) = r16;
2058  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2) = r2;
2059  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3) = r15;
2060  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4) = r3;
2061  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5) = r14;
2062  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6) = r4;
2063  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7) = r13;
2064  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 8) = r5;
2065  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 9) = r12;
2066  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 10) = r6;
2067  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 11) = r11;
2068  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 12) = r7;
2069  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 13) = r10;
2070  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 14) = r8;
2071  HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 15) = r9;
2072  HS_BLOCK_BARRIER();
2073  {
2074    {
2075      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
2076      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(8);
2077      HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(16);
2078      HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(24);
2079      HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_R(32);
2080      HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_R(40);
2081      HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_R(48);
2082      HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_R(56);
2083      HS_CMP_XCHG(r0_4, r0_5);
2084      HS_CMP_XCHG(r0_3, r0_6);
2085      HS_CMP_XCHG(r0_2, r0_7);
2086      HS_CMP_XCHG(r0_1, r0_8);
2087      HS_CMP_XCHG(r0_5, r0_7);
2088      HS_CMP_XCHG(r0_6, r0_8);
2089      HS_CMP_XCHG(r0_5, r0_6);
2090      HS_CMP_XCHG(r0_7, r0_8);
2091      HS_CMP_XCHG(r0_1, r0_3);
2092      HS_CMP_XCHG(r0_2, r0_4);
2093      HS_CMP_XCHG(r0_1, r0_2);
2094      HS_CMP_XCHG(r0_3, r0_4);
2095      HS_SLAB_LOCAL_L(0) = r0_1;
2096      HS_SLAB_LOCAL_L(8) = r0_2;
2097      HS_SLAB_LOCAL_L(16) = r0_3;
2098      HS_SLAB_LOCAL_L(24) = r0_4;
2099      HS_SLAB_LOCAL_R(32) = r0_5;
2100      HS_SLAB_LOCAL_R(40) = r0_6;
2101      HS_SLAB_LOCAL_R(48) = r0_7;
2102      HS_SLAB_LOCAL_R(56) = r0_8;
2103    }
2104    {
2105      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(512);
2106      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(520);
2107      HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(528);
2108      HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(536);
2109      HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_R(544);
2110      HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_R(552);
2111      HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_R(560);
2112      HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_R(568);
2113      HS_CMP_XCHG(r0_4, r0_5);
2114      HS_CMP_XCHG(r0_3, r0_6);
2115      HS_CMP_XCHG(r0_2, r0_7);
2116      HS_CMP_XCHG(r0_1, r0_8);
2117      HS_CMP_XCHG(r0_5, r0_7);
2118      HS_CMP_XCHG(r0_6, r0_8);
2119      HS_CMP_XCHG(r0_5, r0_6);
2120      HS_CMP_XCHG(r0_7, r0_8);
2121      HS_CMP_XCHG(r0_1, r0_3);
2122      HS_CMP_XCHG(r0_2, r0_4);
2123      HS_CMP_XCHG(r0_1, r0_2);
2124      HS_CMP_XCHG(r0_3, r0_4);
2125      HS_SLAB_LOCAL_L(512) = r0_1;
2126      HS_SLAB_LOCAL_L(520) = r0_2;
2127      HS_SLAB_LOCAL_L(528) = r0_3;
2128      HS_SLAB_LOCAL_L(536) = r0_4;
2129      HS_SLAB_LOCAL_R(544) = r0_5;
2130      HS_SLAB_LOCAL_R(552) = r0_6;
2131      HS_SLAB_LOCAL_R(560) = r0_7;
2132      HS_SLAB_LOCAL_R(568) = r0_8;
2133    }
2134  }
2135  HS_BLOCK_BARRIER();
2136  r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0);
2137  r16 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1);
2138  r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2);
2139  r15 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3);
2140  r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4);
2141  r14 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5);
2142  r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6);
2143  r13 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7);
2144  r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 8);
2145  r12 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 9);
2146  r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 10);
2147  r11 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 11);
2148  r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 12);
2149  r10 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 13);
2150  r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 14);
2151  r9 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 15);
2152  {
2153    {
2154      HS_SLAB_HALF_PREAMBLE(4);
2155      HS_CMP_HALF(0, r1);
2156      HS_CMP_HALF(1, r2);
2157      HS_CMP_HALF(2, r3);
2158      HS_CMP_HALF(3, r4);
2159      HS_CMP_HALF(4, r5);
2160      HS_CMP_HALF(5, r6);
2161      HS_CMP_HALF(6, r7);
2162      HS_CMP_HALF(7, r8);
2163      HS_CMP_HALF(8, r9);
2164      HS_CMP_HALF(9, r10);
2165      HS_CMP_HALF(10, r11);
2166      HS_CMP_HALF(11, r12);
2167      HS_CMP_HALF(12, r13);
2168      HS_CMP_HALF(13, r14);
2169      HS_CMP_HALF(14, r15);
2170      HS_CMP_HALF(15, r16);
2171    }
2172    {
2173      HS_SLAB_HALF_PREAMBLE(2);
2174      HS_CMP_HALF(0, r1);
2175      HS_CMP_HALF(1, r2);
2176      HS_CMP_HALF(2, r3);
2177      HS_CMP_HALF(3, r4);
2178      HS_CMP_HALF(4, r5);
2179      HS_CMP_HALF(5, r6);
2180      HS_CMP_HALF(6, r7);
2181      HS_CMP_HALF(7, r8);
2182      HS_CMP_HALF(8, r9);
2183      HS_CMP_HALF(9, r10);
2184      HS_CMP_HALF(10, r11);
2185      HS_CMP_HALF(11, r12);
2186      HS_CMP_HALF(12, r13);
2187      HS_CMP_HALF(13, r14);
2188      HS_CMP_HALF(14, r15);
2189      HS_CMP_HALF(15, r16);
2190    }
2191    {
2192      HS_SLAB_HALF_PREAMBLE(1);
2193      HS_CMP_HALF(0, r1);
2194      HS_CMP_HALF(1, r2);
2195      HS_CMP_HALF(2, r3);
2196      HS_CMP_HALF(3, r4);
2197      HS_CMP_HALF(4, r5);
2198      HS_CMP_HALF(5, r6);
2199      HS_CMP_HALF(6, r7);
2200      HS_CMP_HALF(7, r8);
2201      HS_CMP_HALF(8, r9);
2202      HS_CMP_HALF(9, r10);
2203      HS_CMP_HALF(10, r11);
2204      HS_CMP_HALF(11, r12);
2205      HS_CMP_HALF(12, r13);
2206      HS_CMP_HALF(13, r14);
2207      HS_CMP_HALF(14, r15);
2208      HS_CMP_HALF(15, r16);
2209    }
2210    HS_CMP_XCHG(r1, r9);
2211    HS_CMP_XCHG(r5, r13);
2212    HS_CMP_XCHG(r1, r5);
2213    HS_CMP_XCHG(r9, r13);
2214    HS_CMP_XCHG(r3, r11);
2215    HS_CMP_XCHG(r7, r15);
2216    HS_CMP_XCHG(r3, r7);
2217    HS_CMP_XCHG(r11, r15);
2218    HS_CMP_XCHG(r1, r3);
2219    HS_CMP_XCHG(r5, r7);
2220    HS_CMP_XCHG(r9, r11);
2221    HS_CMP_XCHG(r13, r15);
2222    HS_CMP_XCHG(r2, r10);
2223    HS_CMP_XCHG(r6, r14);
2224    HS_CMP_XCHG(r2, r6);
2225    HS_CMP_XCHG(r10, r14);
2226    HS_CMP_XCHG(r4, r12);
2227    HS_CMP_XCHG(r8, r16);
2228    HS_CMP_XCHG(r4, r8);
2229    HS_CMP_XCHG(r12, r16);
2230    HS_CMP_XCHG(r2, r4);
2231    HS_CMP_XCHG(r6, r8);
2232    HS_CMP_XCHG(r10, r12);
2233    HS_CMP_XCHG(r14, r16);
2234    HS_CMP_XCHG(r1, r2);
2235    HS_CMP_XCHG(r3, r4);
2236    HS_CMP_XCHG(r5, r6);
2237    HS_CMP_XCHG(r7, r8);
2238    HS_CMP_XCHG(r9, r10);
2239    HS_CMP_XCHG(r11, r12);
2240    HS_CMP_XCHG(r13, r14);
2241    HS_CMP_XCHG(r15, r16);
2242  }
2243  HS_SLAB_GLOBAL_STORE(0, r1);
2244  HS_SLAB_GLOBAL_STORE(1, r2);
2245  HS_SLAB_GLOBAL_STORE(2, r3);
2246  HS_SLAB_GLOBAL_STORE(3, r4);
2247  HS_SLAB_GLOBAL_STORE(4, r5);
2248  HS_SLAB_GLOBAL_STORE(5, r6);
2249  HS_SLAB_GLOBAL_STORE(6, r7);
2250  HS_SLAB_GLOBAL_STORE(7, r8);
2251  HS_SLAB_GLOBAL_STORE(8, r9);
2252  HS_SLAB_GLOBAL_STORE(9, r10);
2253  HS_SLAB_GLOBAL_STORE(10, r11);
2254  HS_SLAB_GLOBAL_STORE(11, r12);
2255  HS_SLAB_GLOBAL_STORE(12, r13);
2256  HS_SLAB_GLOBAL_STORE(13, r14);
2257  HS_SLAB_GLOBAL_STORE(14, r15);
2258  HS_SLAB_GLOBAL_STORE(15, r16);
2259}
2260
2261HS_BS_KERNEL_PROTO(16, 4)
2262{
2263  HS_BLOCK_LOCAL_MEM_DECL(128, 16);
2264
2265  HS_SLAB_GLOBAL_PREAMBLE();
2266  HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0);
2267  HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1);
2268  HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2);
2269  HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3);
2270  HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4);
2271  HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5);
2272  HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6);
2273  HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7);
2274  HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vin, 8);
2275  HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vin, 9);
2276  HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vin, 10);
2277  HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vin, 11);
2278  HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vin, 12);
2279  HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vin, 13);
2280  HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vin, 14);
2281  HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vin, 15);
2282  HS_CMP_XCHG(r1, r2);
2283  HS_CMP_XCHG(r3, r4);
2284  HS_CMP_XCHG(r5, r6);
2285  HS_CMP_XCHG(r7, r8);
2286  HS_CMP_XCHG(r9, r10);
2287  HS_CMP_XCHG(r11, r12);
2288  HS_CMP_XCHG(r13, r14);
2289  HS_CMP_XCHG(r15, r16);
2290  HS_CMP_XCHG(r1, r3);
2291  HS_CMP_XCHG(r5, r7);
2292  HS_CMP_XCHG(r9, r11);
2293  HS_CMP_XCHG(r13, r15);
2294  HS_CMP_XCHG(r2, r4);
2295  HS_CMP_XCHG(r6, r8);
2296  HS_CMP_XCHG(r10, r12);
2297  HS_CMP_XCHG(r14, r16);
2298  HS_CMP_XCHG(r1, r5);
2299  HS_CMP_XCHG(r9, r13);
2300  HS_CMP_XCHG(r2, r6);
2301  HS_CMP_XCHG(r10, r14);
2302  HS_CMP_XCHG(r3, r7);
2303  HS_CMP_XCHG(r11, r15);
2304  HS_CMP_XCHG(r4, r8);
2305  HS_CMP_XCHG(r12, r16);
2306  HS_CMP_XCHG(r1, r9);
2307  HS_CMP_XCHG(r2, r10);
2308  HS_CMP_XCHG(r3, r11);
2309  HS_CMP_XCHG(r4, r12);
2310  HS_CMP_XCHG(r5, r13);
2311  HS_CMP_XCHG(r6, r14);
2312  HS_CMP_XCHG(r7, r15);
2313  HS_CMP_XCHG(r8, r16);
2314  HS_CMP_XCHG(r6, r11);
2315  HS_CMP_XCHG(r7, r10);
2316  HS_CMP_XCHG(r4, r13);
2317  HS_CMP_XCHG(r14, r15);
2318  HS_CMP_XCHG(r8, r12);
2319  HS_CMP_XCHG(r2, r3);
2320  HS_CMP_XCHG(r5, r9);
2321  HS_CMP_XCHG(r2, r5);
2322  HS_CMP_XCHG(r8, r14);
2323  HS_CMP_XCHG(r3, r9);
2324  HS_CMP_XCHG(r12, r15);
2325  HS_CMP_XCHG(r3, r5);
2326  HS_CMP_XCHG(r6, r7);
2327  HS_CMP_XCHG(r10, r11);
2328  HS_CMP_XCHG(r12, r14);
2329  HS_CMP_XCHG(r4, r9);
2330  HS_CMP_XCHG(r8, r13);
2331  HS_CMP_XCHG(r7, r9);
2332  HS_CMP_XCHG(r11, r13);
2333  HS_CMP_XCHG(r4, r6);
2334  HS_CMP_XCHG(r8, r10);
2335  HS_CMP_XCHG(r4, r5);
2336  HS_CMP_XCHG(r6, r7);
2337  HS_CMP_XCHG(r8, r9);
2338  HS_CMP_XCHG(r10, r11);
2339  HS_CMP_XCHG(r12, r13);
2340  HS_CMP_XCHG(r7, r8);
2341  HS_CMP_XCHG(r9, r10);
2342  {
2343    HS_SLAB_FLIP_PREAMBLE(1);
2344    HS_CMP_FLIP(0, r1, r16);
2345    HS_CMP_FLIP(1, r2, r15);
2346    HS_CMP_FLIP(2, r3, r14);
2347    HS_CMP_FLIP(3, r4, r13);
2348    HS_CMP_FLIP(4, r5, r12);
2349    HS_CMP_FLIP(5, r6, r11);
2350    HS_CMP_FLIP(6, r7, r10);
2351    HS_CMP_FLIP(7, r8, r9);
2352  }
2353  HS_CMP_XCHG(r1, r9);
2354  HS_CMP_XCHG(r5, r13);
2355  HS_CMP_XCHG(r1, r5);
2356  HS_CMP_XCHG(r9, r13);
2357  HS_CMP_XCHG(r3, r11);
2358  HS_CMP_XCHG(r7, r15);
2359  HS_CMP_XCHG(r3, r7);
2360  HS_CMP_XCHG(r11, r15);
2361  HS_CMP_XCHG(r1, r3);
2362  HS_CMP_XCHG(r5, r7);
2363  HS_CMP_XCHG(r9, r11);
2364  HS_CMP_XCHG(r13, r15);
2365  HS_CMP_XCHG(r2, r10);
2366  HS_CMP_XCHG(r6, r14);
2367  HS_CMP_XCHG(r2, r6);
2368  HS_CMP_XCHG(r10, r14);
2369  HS_CMP_XCHG(r4, r12);
2370  HS_CMP_XCHG(r8, r16);
2371  HS_CMP_XCHG(r4, r8);
2372  HS_CMP_XCHG(r12, r16);
2373  HS_CMP_XCHG(r2, r4);
2374  HS_CMP_XCHG(r6, r8);
2375  HS_CMP_XCHG(r10, r12);
2376  HS_CMP_XCHG(r14, r16);
2377  HS_CMP_XCHG(r1, r2);
2378  HS_CMP_XCHG(r3, r4);
2379  HS_CMP_XCHG(r5, r6);
2380  HS_CMP_XCHG(r7, r8);
2381  HS_CMP_XCHG(r9, r10);
2382  HS_CMP_XCHG(r11, r12);
2383  HS_CMP_XCHG(r13, r14);
2384  HS_CMP_XCHG(r15, r16);
2385  {
2386    HS_SLAB_FLIP_PREAMBLE(3);
2387    HS_CMP_FLIP(0, r1, r16);
2388    HS_CMP_FLIP(1, r2, r15);
2389    HS_CMP_FLIP(2, r3, r14);
2390    HS_CMP_FLIP(3, r4, r13);
2391    HS_CMP_FLIP(4, r5, r12);
2392    HS_CMP_FLIP(5, r6, r11);
2393    HS_CMP_FLIP(6, r7, r10);
2394    HS_CMP_FLIP(7, r8, r9);
2395  }
2396  {
2397    HS_SLAB_HALF_PREAMBLE(1);
2398    HS_CMP_HALF(0, r1);
2399    HS_CMP_HALF(1, r2);
2400    HS_CMP_HALF(2, r3);
2401    HS_CMP_HALF(3, r4);
2402    HS_CMP_HALF(4, r5);
2403    HS_CMP_HALF(5, r6);
2404    HS_CMP_HALF(6, r7);
2405    HS_CMP_HALF(7, r8);
2406    HS_CMP_HALF(8, r9);
2407    HS_CMP_HALF(9, r10);
2408    HS_CMP_HALF(10, r11);
2409    HS_CMP_HALF(11, r12);
2410    HS_CMP_HALF(12, r13);
2411    HS_CMP_HALF(13, r14);
2412    HS_CMP_HALF(14, r15);
2413    HS_CMP_HALF(15, r16);
2414  }
2415  HS_CMP_XCHG(r1, r9);
2416  HS_CMP_XCHG(r5, r13);
2417  HS_CMP_XCHG(r1, r5);
2418  HS_CMP_XCHG(r9, r13);
2419  HS_CMP_XCHG(r3, r11);
2420  HS_CMP_XCHG(r7, r15);
2421  HS_CMP_XCHG(r3, r7);
2422  HS_CMP_XCHG(r11, r15);
2423  HS_CMP_XCHG(r1, r3);
2424  HS_CMP_XCHG(r5, r7);
2425  HS_CMP_XCHG(r9, r11);
2426  HS_CMP_XCHG(r13, r15);
2427  HS_CMP_XCHG(r2, r10);
2428  HS_CMP_XCHG(r6, r14);
2429  HS_CMP_XCHG(r2, r6);
2430  HS_CMP_XCHG(r10, r14);
2431  HS_CMP_XCHG(r4, r12);
2432  HS_CMP_XCHG(r8, r16);
2433  HS_CMP_XCHG(r4, r8);
2434  HS_CMP_XCHG(r12, r16);
2435  HS_CMP_XCHG(r2, r4);
2436  HS_CMP_XCHG(r6, r8);
2437  HS_CMP_XCHG(r10, r12);
2438  HS_CMP_XCHG(r14, r16);
2439  HS_CMP_XCHG(r1, r2);
2440  HS_CMP_XCHG(r3, r4);
2441  HS_CMP_XCHG(r5, r6);
2442  HS_CMP_XCHG(r7, r8);
2443  HS_CMP_XCHG(r9, r10);
2444  HS_CMP_XCHG(r11, r12);
2445  HS_CMP_XCHG(r13, r14);
2446  HS_CMP_XCHG(r15, r16);
2447  {
2448    HS_SLAB_FLIP_PREAMBLE(7);
2449    HS_CMP_FLIP(0, r1, r16);
2450    HS_CMP_FLIP(1, r2, r15);
2451    HS_CMP_FLIP(2, r3, r14);
2452    HS_CMP_FLIP(3, r4, r13);
2453    HS_CMP_FLIP(4, r5, r12);
2454    HS_CMP_FLIP(5, r6, r11);
2455    HS_CMP_FLIP(6, r7, r10);
2456    HS_CMP_FLIP(7, r8, r9);
2457  }
2458  {
2459    HS_SLAB_HALF_PREAMBLE(2);
2460    HS_CMP_HALF(0, r1);
2461    HS_CMP_HALF(1, r2);
2462    HS_CMP_HALF(2, r3);
2463    HS_CMP_HALF(3, r4);
2464    HS_CMP_HALF(4, r5);
2465    HS_CMP_HALF(5, r6);
2466    HS_CMP_HALF(6, r7);
2467    HS_CMP_HALF(7, r8);
2468    HS_CMP_HALF(8, r9);
2469    HS_CMP_HALF(9, r10);
2470    HS_CMP_HALF(10, r11);
2471    HS_CMP_HALF(11, r12);
2472    HS_CMP_HALF(12, r13);
2473    HS_CMP_HALF(13, r14);
2474    HS_CMP_HALF(14, r15);
2475    HS_CMP_HALF(15, r16);
2476  }
2477  {
2478    HS_SLAB_HALF_PREAMBLE(1);
2479    HS_CMP_HALF(0, r1);
2480    HS_CMP_HALF(1, r2);
2481    HS_CMP_HALF(2, r3);
2482    HS_CMP_HALF(3, r4);
2483    HS_CMP_HALF(4, r5);
2484    HS_CMP_HALF(5, r6);
2485    HS_CMP_HALF(6, r7);
2486    HS_CMP_HALF(7, r8);
2487    HS_CMP_HALF(8, r9);
2488    HS_CMP_HALF(9, r10);
2489    HS_CMP_HALF(10, r11);
2490    HS_CMP_HALF(11, r12);
2491    HS_CMP_HALF(12, r13);
2492    HS_CMP_HALF(13, r14);
2493    HS_CMP_HALF(14, r15);
2494    HS_CMP_HALF(15, r16);
2495  }
2496  HS_CMP_XCHG(r1, r9);
2497  HS_CMP_XCHG(r5, r13);
2498  HS_CMP_XCHG(r1, r5);
2499  HS_CMP_XCHG(r9, r13);
2500  HS_CMP_XCHG(r3, r11);
2501  HS_CMP_XCHG(r7, r15);
2502  HS_CMP_XCHG(r3, r7);
2503  HS_CMP_XCHG(r11, r15);
2504  HS_CMP_XCHG(r1, r3);
2505  HS_CMP_XCHG(r5, r7);
2506  HS_CMP_XCHG(r9, r11);
2507  HS_CMP_XCHG(r13, r15);
2508  HS_CMP_XCHG(r2, r10);
2509  HS_CMP_XCHG(r6, r14);
2510  HS_CMP_XCHG(r2, r6);
2511  HS_CMP_XCHG(r10, r14);
2512  HS_CMP_XCHG(r4, r12);
2513  HS_CMP_XCHG(r8, r16);
2514  HS_CMP_XCHG(r4, r8);
2515  HS_CMP_XCHG(r12, r16);
2516  HS_CMP_XCHG(r2, r4);
2517  HS_CMP_XCHG(r6, r8);
2518  HS_CMP_XCHG(r10, r12);
2519  HS_CMP_XCHG(r14, r16);
2520  HS_CMP_XCHG(r1, r2);
2521  HS_CMP_XCHG(r3, r4);
2522  HS_CMP_XCHG(r5, r6);
2523  HS_CMP_XCHG(r7, r8);
2524  HS_CMP_XCHG(r9, r10);
2525  HS_CMP_XCHG(r11, r12);
2526  HS_CMP_XCHG(r13, r14);
2527  HS_CMP_XCHG(r15, r16);
2528  HS_BS_MERGE_H_PREAMBLE(16);
2529  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1;
2530  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r16;
2531  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2;
2532  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r15;
2533  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3;
2534  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r14;
2535  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4;
2536  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r13;
2537  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8) = r5;
2538  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9) = r12;
2539  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10) = r6;
2540  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11) = r11;
2541  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12) = r7;
2542  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13) = r10;
2543  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14) = r8;
2544  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15) = r9;
2545  HS_BLOCK_BARRIER();
2546  {
2547    {
2548      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
2549      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(8);
2550      HS_CMP_XCHG(r0_1, r0_2);
2551      HS_SLAB_LOCAL_L(0) = r0_1;
2552      HS_SLAB_LOCAL_R(8) = r0_2;
2553    }
2554    {
2555      HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(16);
2556      HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(24);
2557      HS_CMP_XCHG(r1_1, r1_2);
2558      HS_SLAB_LOCAL_L(16) = r1_1;
2559      HS_SLAB_LOCAL_R(24) = r1_2;
2560    }
2561    {
2562      HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(32);
2563      HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_R(40);
2564      HS_CMP_XCHG(r2_1, r2_2);
2565      HS_SLAB_LOCAL_L(32) = r2_1;
2566      HS_SLAB_LOCAL_R(40) = r2_2;
2567    }
2568    {
2569      HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(48);
2570      HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_R(56);
2571      HS_CMP_XCHG(r3_1, r3_2);
2572      HS_SLAB_LOCAL_L(48) = r3_1;
2573      HS_SLAB_LOCAL_R(56) = r3_2;
2574    }
2575    {
2576      HS_KEY_TYPE r4_1 = HS_SLAB_LOCAL_L(64);
2577      HS_KEY_TYPE r4_2 = HS_SLAB_LOCAL_R(72);
2578      HS_CMP_XCHG(r4_1, r4_2);
2579      HS_SLAB_LOCAL_L(64) = r4_1;
2580      HS_SLAB_LOCAL_R(72) = r4_2;
2581    }
2582    {
2583      HS_KEY_TYPE r5_1 = HS_SLAB_LOCAL_L(80);
2584      HS_KEY_TYPE r5_2 = HS_SLAB_LOCAL_R(88);
2585      HS_CMP_XCHG(r5_1, r5_2);
2586      HS_SLAB_LOCAL_L(80) = r5_1;
2587      HS_SLAB_LOCAL_R(88) = r5_2;
2588    }
2589    {
2590      HS_KEY_TYPE r6_1 = HS_SLAB_LOCAL_L(96);
2591      HS_KEY_TYPE r6_2 = HS_SLAB_LOCAL_R(104);
2592      HS_CMP_XCHG(r6_1, r6_2);
2593      HS_SLAB_LOCAL_L(96) = r6_1;
2594      HS_SLAB_LOCAL_R(104) = r6_2;
2595    }
2596    {
2597      HS_KEY_TYPE r7_1 = HS_SLAB_LOCAL_L(112);
2598      HS_KEY_TYPE r7_2 = HS_SLAB_LOCAL_R(120);
2599      HS_CMP_XCHG(r7_1, r7_2);
2600      HS_SLAB_LOCAL_L(112) = r7_1;
2601      HS_SLAB_LOCAL_R(120) = r7_2;
2602    }
2603  }
2604  HS_BLOCK_BARRIER();
2605  r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0);
2606  r16 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1);
2607  r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2);
2608  r15 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3);
2609  r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4);
2610  r14 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5);
2611  r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6);
2612  r13 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7);
2613  r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8);
2614  r12 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9);
2615  r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10);
2616  r11 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11);
2617  r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12);
2618  r10 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13);
2619  r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14);
2620  r9 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15);
2621  {
2622    {
2623      HS_SLAB_HALF_PREAMBLE(4);
2624      HS_CMP_HALF(0, r1);
2625      HS_CMP_HALF(1, r2);
2626      HS_CMP_HALF(2, r3);
2627      HS_CMP_HALF(3, r4);
2628      HS_CMP_HALF(4, r5);
2629      HS_CMP_HALF(5, r6);
2630      HS_CMP_HALF(6, r7);
2631      HS_CMP_HALF(7, r8);
2632      HS_CMP_HALF(8, r9);
2633      HS_CMP_HALF(9, r10);
2634      HS_CMP_HALF(10, r11);
2635      HS_CMP_HALF(11, r12);
2636      HS_CMP_HALF(12, r13);
2637      HS_CMP_HALF(13, r14);
2638      HS_CMP_HALF(14, r15);
2639      HS_CMP_HALF(15, r16);
2640    }
2641    {
2642      HS_SLAB_HALF_PREAMBLE(2);
2643      HS_CMP_HALF(0, r1);
2644      HS_CMP_HALF(1, r2);
2645      HS_CMP_HALF(2, r3);
2646      HS_CMP_HALF(3, r4);
2647      HS_CMP_HALF(4, r5);
2648      HS_CMP_HALF(5, r6);
2649      HS_CMP_HALF(6, r7);
2650      HS_CMP_HALF(7, r8);
2651      HS_CMP_HALF(8, r9);
2652      HS_CMP_HALF(9, r10);
2653      HS_CMP_HALF(10, r11);
2654      HS_CMP_HALF(11, r12);
2655      HS_CMP_HALF(12, r13);
2656      HS_CMP_HALF(13, r14);
2657      HS_CMP_HALF(14, r15);
2658      HS_CMP_HALF(15, r16);
2659    }
2660    {
2661      HS_SLAB_HALF_PREAMBLE(1);
2662      HS_CMP_HALF(0, r1);
2663      HS_CMP_HALF(1, r2);
2664      HS_CMP_HALF(2, r3);
2665      HS_CMP_HALF(3, r4);
2666      HS_CMP_HALF(4, r5);
2667      HS_CMP_HALF(5, r6);
2668      HS_CMP_HALF(6, r7);
2669      HS_CMP_HALF(7, r8);
2670      HS_CMP_HALF(8, r9);
2671      HS_CMP_HALF(9, r10);
2672      HS_CMP_HALF(10, r11);
2673      HS_CMP_HALF(11, r12);
2674      HS_CMP_HALF(12, r13);
2675      HS_CMP_HALF(13, r14);
2676      HS_CMP_HALF(14, r15);
2677      HS_CMP_HALF(15, r16);
2678    }
2679    HS_CMP_XCHG(r1, r9);
2680    HS_CMP_XCHG(r5, r13);
2681    HS_CMP_XCHG(r1, r5);
2682    HS_CMP_XCHG(r9, r13);
2683    HS_CMP_XCHG(r3, r11);
2684    HS_CMP_XCHG(r7, r15);
2685    HS_CMP_XCHG(r3, r7);
2686    HS_CMP_XCHG(r11, r15);
2687    HS_CMP_XCHG(r1, r3);
2688    HS_CMP_XCHG(r5, r7);
2689    HS_CMP_XCHG(r9, r11);
2690    HS_CMP_XCHG(r13, r15);
2691    HS_CMP_XCHG(r2, r10);
2692    HS_CMP_XCHG(r6, r14);
2693    HS_CMP_XCHG(r2, r6);
2694    HS_CMP_XCHG(r10, r14);
2695    HS_CMP_XCHG(r4, r12);
2696    HS_CMP_XCHG(r8, r16);
2697    HS_CMP_XCHG(r4, r8);
2698    HS_CMP_XCHG(r12, r16);
2699    HS_CMP_XCHG(r2, r4);
2700    HS_CMP_XCHG(r6, r8);
2701    HS_CMP_XCHG(r10, r12);
2702    HS_CMP_XCHG(r14, r16);
2703    HS_CMP_XCHG(r1, r2);
2704    HS_CMP_XCHG(r3, r4);
2705    HS_CMP_XCHG(r5, r6);
2706    HS_CMP_XCHG(r7, r8);
2707    HS_CMP_XCHG(r9, r10);
2708    HS_CMP_XCHG(r11, r12);
2709    HS_CMP_XCHG(r13, r14);
2710    HS_CMP_XCHG(r15, r16);
2711  }
2712  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1;
2713  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r16;
2714  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2;
2715  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r15;
2716  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3;
2717  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r14;
2718  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4;
2719  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r13;
2720  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8) = r5;
2721  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9) = r12;
2722  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10) = r6;
2723  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11) = r11;
2724  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12) = r7;
2725  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13) = r10;
2726  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14) = r8;
2727  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15) = r9;
2728  HS_BLOCK_BARRIER();
2729  {
2730    {
2731      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
2732      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(8);
2733      HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(16);
2734      HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(24);
2735      HS_CMP_XCHG(r0_2, r0_3);
2736      HS_CMP_XCHG(r0_1, r0_4);
2737      HS_CMP_XCHG(r0_3, r0_4);
2738      HS_CMP_XCHG(r0_1, r0_2);
2739      HS_SLAB_LOCAL_L(0) = r0_1;
2740      HS_SLAB_LOCAL_L(8) = r0_2;
2741      HS_SLAB_LOCAL_R(16) = r0_3;
2742      HS_SLAB_LOCAL_R(24) = r0_4;
2743    }
2744    {
2745      HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(32);
2746      HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(40);
2747      HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_R(48);
2748      HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_R(56);
2749      HS_CMP_XCHG(r1_2, r1_3);
2750      HS_CMP_XCHG(r1_1, r1_4);
2751      HS_CMP_XCHG(r1_3, r1_4);
2752      HS_CMP_XCHG(r1_1, r1_2);
2753      HS_SLAB_LOCAL_L(32) = r1_1;
2754      HS_SLAB_LOCAL_L(40) = r1_2;
2755      HS_SLAB_LOCAL_R(48) = r1_3;
2756      HS_SLAB_LOCAL_R(56) = r1_4;
2757    }
2758    {
2759      HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(64);
2760      HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_L(72);
2761      HS_KEY_TYPE r2_3 = HS_SLAB_LOCAL_R(80);
2762      HS_KEY_TYPE r2_4 = HS_SLAB_LOCAL_R(88);
2763      HS_CMP_XCHG(r2_2, r2_3);
2764      HS_CMP_XCHG(r2_1, r2_4);
2765      HS_CMP_XCHG(r2_3, r2_4);
2766      HS_CMP_XCHG(r2_1, r2_2);
2767      HS_SLAB_LOCAL_L(64) = r2_1;
2768      HS_SLAB_LOCAL_L(72) = r2_2;
2769      HS_SLAB_LOCAL_R(80) = r2_3;
2770      HS_SLAB_LOCAL_R(88) = r2_4;
2771    }
2772    {
2773      HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(96);
2774      HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_L(104);
2775      HS_KEY_TYPE r3_3 = HS_SLAB_LOCAL_R(112);
2776      HS_KEY_TYPE r3_4 = HS_SLAB_LOCAL_R(120);
2777      HS_CMP_XCHG(r3_2, r3_3);
2778      HS_CMP_XCHG(r3_1, r3_4);
2779      HS_CMP_XCHG(r3_3, r3_4);
2780      HS_CMP_XCHG(r3_1, r3_2);
2781      HS_SLAB_LOCAL_L(96) = r3_1;
2782      HS_SLAB_LOCAL_L(104) = r3_2;
2783      HS_SLAB_LOCAL_R(112) = r3_3;
2784      HS_SLAB_LOCAL_R(120) = r3_4;
2785    }
2786  }
2787  HS_BLOCK_BARRIER();
2788  r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0);
2789  r16 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1);
2790  r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2);
2791  r15 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3);
2792  r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4);
2793  r14 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5);
2794  r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6);
2795  r13 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7);
2796  r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8);
2797  r12 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9);
2798  r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10);
2799  r11 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11);
2800  r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12);
2801  r10 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13);
2802  r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14);
2803  r9 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15);
2804  {
2805    {
2806      HS_SLAB_HALF_PREAMBLE(4);
2807      HS_CMP_HALF(0, r1);
2808      HS_CMP_HALF(1, r2);
2809      HS_CMP_HALF(2, r3);
2810      HS_CMP_HALF(3, r4);
2811      HS_CMP_HALF(4, r5);
2812      HS_CMP_HALF(5, r6);
2813      HS_CMP_HALF(6, r7);
2814      HS_CMP_HALF(7, r8);
2815      HS_CMP_HALF(8, r9);
2816      HS_CMP_HALF(9, r10);
2817      HS_CMP_HALF(10, r11);
2818      HS_CMP_HALF(11, r12);
2819      HS_CMP_HALF(12, r13);
2820      HS_CMP_HALF(13, r14);
2821      HS_CMP_HALF(14, r15);
2822      HS_CMP_HALF(15, r16);
2823    }
2824    {
2825      HS_SLAB_HALF_PREAMBLE(2);
2826      HS_CMP_HALF(0, r1);
2827      HS_CMP_HALF(1, r2);
2828      HS_CMP_HALF(2, r3);
2829      HS_CMP_HALF(3, r4);
2830      HS_CMP_HALF(4, r5);
2831      HS_CMP_HALF(5, r6);
2832      HS_CMP_HALF(6, r7);
2833      HS_CMP_HALF(7, r8);
2834      HS_CMP_HALF(8, r9);
2835      HS_CMP_HALF(9, r10);
2836      HS_CMP_HALF(10, r11);
2837      HS_CMP_HALF(11, r12);
2838      HS_CMP_HALF(12, r13);
2839      HS_CMP_HALF(13, r14);
2840      HS_CMP_HALF(14, r15);
2841      HS_CMP_HALF(15, r16);
2842    }
2843    {
2844      HS_SLAB_HALF_PREAMBLE(1);
2845      HS_CMP_HALF(0, r1);
2846      HS_CMP_HALF(1, r2);
2847      HS_CMP_HALF(2, r3);
2848      HS_CMP_HALF(3, r4);
2849      HS_CMP_HALF(4, r5);
2850      HS_CMP_HALF(5, r6);
2851      HS_CMP_HALF(6, r7);
2852      HS_CMP_HALF(7, r8);
2853      HS_CMP_HALF(8, r9);
2854      HS_CMP_HALF(9, r10);
2855      HS_CMP_HALF(10, r11);
2856      HS_CMP_HALF(11, r12);
2857      HS_CMP_HALF(12, r13);
2858      HS_CMP_HALF(13, r14);
2859      HS_CMP_HALF(14, r15);
2860      HS_CMP_HALF(15, r16);
2861    }
2862    HS_CMP_XCHG(r1, r9);
2863    HS_CMP_XCHG(r5, r13);
2864    HS_CMP_XCHG(r1, r5);
2865    HS_CMP_XCHG(r9, r13);
2866    HS_CMP_XCHG(r3, r11);
2867    HS_CMP_XCHG(r7, r15);
2868    HS_CMP_XCHG(r3, r7);
2869    HS_CMP_XCHG(r11, r15);
2870    HS_CMP_XCHG(r1, r3);
2871    HS_CMP_XCHG(r5, r7);
2872    HS_CMP_XCHG(r9, r11);
2873    HS_CMP_XCHG(r13, r15);
2874    HS_CMP_XCHG(r2, r10);
2875    HS_CMP_XCHG(r6, r14);
2876    HS_CMP_XCHG(r2, r6);
2877    HS_CMP_XCHG(r10, r14);
2878    HS_CMP_XCHG(r4, r12);
2879    HS_CMP_XCHG(r8, r16);
2880    HS_CMP_XCHG(r4, r8);
2881    HS_CMP_XCHG(r12, r16);
2882    HS_CMP_XCHG(r2, r4);
2883    HS_CMP_XCHG(r6, r8);
2884    HS_CMP_XCHG(r10, r12);
2885    HS_CMP_XCHG(r14, r16);
2886    HS_CMP_XCHG(r1, r2);
2887    HS_CMP_XCHG(r3, r4);
2888    HS_CMP_XCHG(r5, r6);
2889    HS_CMP_XCHG(r7, r8);
2890    HS_CMP_XCHG(r9, r10);
2891    HS_CMP_XCHG(r11, r12);
2892    HS_CMP_XCHG(r13, r14);
2893    HS_CMP_XCHG(r15, r16);
2894  }
2895  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1;
2896  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r16;
2897  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2;
2898  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r15;
2899  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3;
2900  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r14;
2901  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4;
2902  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r13;
2903  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8) = r5;
2904  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9) = r12;
2905  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10) = r6;
2906  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11) = r11;
2907  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12) = r7;
2908  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13) = r10;
2909  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14) = r8;
2910  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15) = r9;
2911  HS_BLOCK_BARRIER();
2912  {
2913    {
2914      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
2915      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(8);
2916      HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(16);
2917      HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(24);
2918      HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_R(32);
2919      HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_R(40);
2920      HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_R(48);
2921      HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_R(56);
2922      HS_CMP_XCHG(r0_4, r0_5);
2923      HS_CMP_XCHG(r0_3, r0_6);
2924      HS_CMP_XCHG(r0_2, r0_7);
2925      HS_CMP_XCHG(r0_1, r0_8);
2926      HS_CMP_XCHG(r0_5, r0_7);
2927      HS_CMP_XCHG(r0_6, r0_8);
2928      HS_CMP_XCHG(r0_5, r0_6);
2929      HS_CMP_XCHG(r0_7, r0_8);
2930      HS_CMP_XCHG(r0_1, r0_3);
2931      HS_CMP_XCHG(r0_2, r0_4);
2932      HS_CMP_XCHG(r0_1, r0_2);
2933      HS_CMP_XCHG(r0_3, r0_4);
2934      HS_SLAB_LOCAL_L(0) = r0_1;
2935      HS_SLAB_LOCAL_L(8) = r0_2;
2936      HS_SLAB_LOCAL_L(16) = r0_3;
2937      HS_SLAB_LOCAL_L(24) = r0_4;
2938      HS_SLAB_LOCAL_R(32) = r0_5;
2939      HS_SLAB_LOCAL_R(40) = r0_6;
2940      HS_SLAB_LOCAL_R(48) = r0_7;
2941      HS_SLAB_LOCAL_R(56) = r0_8;
2942    }
2943    {
2944      HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(64);
2945      HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(72);
2946      HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_L(80);
2947      HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_L(88);
2948      HS_KEY_TYPE r1_5 = HS_SLAB_LOCAL_R(96);
2949      HS_KEY_TYPE r1_6 = HS_SLAB_LOCAL_R(104);
2950      HS_KEY_TYPE r1_7 = HS_SLAB_LOCAL_R(112);
2951      HS_KEY_TYPE r1_8 = HS_SLAB_LOCAL_R(120);
2952      HS_CMP_XCHG(r1_4, r1_5);
2953      HS_CMP_XCHG(r1_3, r1_6);
2954      HS_CMP_XCHG(r1_2, r1_7);
2955      HS_CMP_XCHG(r1_1, r1_8);
2956      HS_CMP_XCHG(r1_5, r1_7);
2957      HS_CMP_XCHG(r1_6, r1_8);
2958      HS_CMP_XCHG(r1_5, r1_6);
2959      HS_CMP_XCHG(r1_7, r1_8);
2960      HS_CMP_XCHG(r1_1, r1_3);
2961      HS_CMP_XCHG(r1_2, r1_4);
2962      HS_CMP_XCHG(r1_1, r1_2);
2963      HS_CMP_XCHG(r1_3, r1_4);
2964      HS_SLAB_LOCAL_L(64) = r1_1;
2965      HS_SLAB_LOCAL_L(72) = r1_2;
2966      HS_SLAB_LOCAL_L(80) = r1_3;
2967      HS_SLAB_LOCAL_L(88) = r1_4;
2968      HS_SLAB_LOCAL_R(96) = r1_5;
2969      HS_SLAB_LOCAL_R(104) = r1_6;
2970      HS_SLAB_LOCAL_R(112) = r1_7;
2971      HS_SLAB_LOCAL_R(120) = r1_8;
2972    }
2973  }
2974  HS_BLOCK_BARRIER();
2975  r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0);
2976  r16 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1);
2977  r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2);
2978  r15 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3);
2979  r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4);
2980  r14 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5);
2981  r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6);
2982  r13 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7);
2983  r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8);
2984  r12 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9);
2985  r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10);
2986  r11 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11);
2987  r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12);
2988  r10 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13);
2989  r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14);
2990  r9 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15);
2991  {
2992    {
2993      HS_SLAB_HALF_PREAMBLE(4);
2994      HS_CMP_HALF(0, r1);
2995      HS_CMP_HALF(1, r2);
2996      HS_CMP_HALF(2, r3);
2997      HS_CMP_HALF(3, r4);
2998      HS_CMP_HALF(4, r5);
2999      HS_CMP_HALF(5, r6);
3000      HS_CMP_HALF(6, r7);
3001      HS_CMP_HALF(7, r8);
3002      HS_CMP_HALF(8, r9);
3003      HS_CMP_HALF(9, r10);
3004      HS_CMP_HALF(10, r11);
3005      HS_CMP_HALF(11, r12);
3006      HS_CMP_HALF(12, r13);
3007      HS_CMP_HALF(13, r14);
3008      HS_CMP_HALF(14, r15);
3009      HS_CMP_HALF(15, r16);
3010    }
3011    {
3012      HS_SLAB_HALF_PREAMBLE(2);
3013      HS_CMP_HALF(0, r1);
3014      HS_CMP_HALF(1, r2);
3015      HS_CMP_HALF(2, r3);
3016      HS_CMP_HALF(3, r4);
3017      HS_CMP_HALF(4, r5);
3018      HS_CMP_HALF(5, r6);
3019      HS_CMP_HALF(6, r7);
3020      HS_CMP_HALF(7, r8);
3021      HS_CMP_HALF(8, r9);
3022      HS_CMP_HALF(9, r10);
3023      HS_CMP_HALF(10, r11);
3024      HS_CMP_HALF(11, r12);
3025      HS_CMP_HALF(12, r13);
3026      HS_CMP_HALF(13, r14);
3027      HS_CMP_HALF(14, r15);
3028      HS_CMP_HALF(15, r16);
3029    }
3030    {
3031      HS_SLAB_HALF_PREAMBLE(1);
3032      HS_CMP_HALF(0, r1);
3033      HS_CMP_HALF(1, r2);
3034      HS_CMP_HALF(2, r3);
3035      HS_CMP_HALF(3, r4);
3036      HS_CMP_HALF(4, r5);
3037      HS_CMP_HALF(5, r6);
3038      HS_CMP_HALF(6, r7);
3039      HS_CMP_HALF(7, r8);
3040      HS_CMP_HALF(8, r9);
3041      HS_CMP_HALF(9, r10);
3042      HS_CMP_HALF(10, r11);
3043      HS_CMP_HALF(11, r12);
3044      HS_CMP_HALF(12, r13);
3045      HS_CMP_HALF(13, r14);
3046      HS_CMP_HALF(14, r15);
3047      HS_CMP_HALF(15, r16);
3048    }
3049    HS_CMP_XCHG(r1, r9);
3050    HS_CMP_XCHG(r5, r13);
3051    HS_CMP_XCHG(r1, r5);
3052    HS_CMP_XCHG(r9, r13);
3053    HS_CMP_XCHG(r3, r11);
3054    HS_CMP_XCHG(r7, r15);
3055    HS_CMP_XCHG(r3, r7);
3056    HS_CMP_XCHG(r11, r15);
3057    HS_CMP_XCHG(r1, r3);
3058    HS_CMP_XCHG(r5, r7);
3059    HS_CMP_XCHG(r9, r11);
3060    HS_CMP_XCHG(r13, r15);
3061    HS_CMP_XCHG(r2, r10);
3062    HS_CMP_XCHG(r6, r14);
3063    HS_CMP_XCHG(r2, r6);
3064    HS_CMP_XCHG(r10, r14);
3065    HS_CMP_XCHG(r4, r12);
3066    HS_CMP_XCHG(r8, r16);
3067    HS_CMP_XCHG(r4, r8);
3068    HS_CMP_XCHG(r12, r16);
3069    HS_CMP_XCHG(r2, r4);
3070    HS_CMP_XCHG(r6, r8);
3071    HS_CMP_XCHG(r10, r12);
3072    HS_CMP_XCHG(r14, r16);
3073    HS_CMP_XCHG(r1, r2);
3074    HS_CMP_XCHG(r3, r4);
3075    HS_CMP_XCHG(r5, r6);
3076    HS_CMP_XCHG(r7, r8);
3077    HS_CMP_XCHG(r9, r10);
3078    HS_CMP_XCHG(r11, r12);
3079    HS_CMP_XCHG(r13, r14);
3080    HS_CMP_XCHG(r15, r16);
3081  }
3082  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1;
3083  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r16;
3084  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2;
3085  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r15;
3086  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3;
3087  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r14;
3088  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4;
3089  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r13;
3090  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8) = r5;
3091  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9) = r12;
3092  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10) = r6;
3093  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11) = r11;
3094  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12) = r7;
3095  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13) = r10;
3096  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14) = r8;
3097  HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15) = r9;
3098  HS_BLOCK_BARRIER();
3099  {
3100    {
3101      HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
3102      HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(8);
3103      HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(16);
3104      HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(24);
3105      HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_L(32);
3106      HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_L(40);
3107      HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_L(48);
3108      HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_L(56);
3109      HS_KEY_TYPE r0_9 = HS_SLAB_LOCAL_R(64);
3110      HS_KEY_TYPE r0_10 = HS_SLAB_LOCAL_R(72);
3111      HS_KEY_TYPE r0_11 = HS_SLAB_LOCAL_R(80);
3112      HS_KEY_TYPE r0_12 = HS_SLAB_LOCAL_R(88);
3113      HS_KEY_TYPE r0_13 = HS_SLAB_LOCAL_R(96);
3114      HS_KEY_TYPE r0_14 = HS_SLAB_LOCAL_R(104);
3115      HS_KEY_TYPE r0_15 = HS_SLAB_LOCAL_R(112);
3116      HS_KEY_TYPE r0_16 = HS_SLAB_LOCAL_R(120);
3117      HS_CMP_XCHG(r0_8, r0_9);
3118      HS_CMP_XCHG(r0_7, r0_10);
3119      HS_CMP_XCHG(r0_6, r0_11);
3120      HS_CMP_XCHG(r0_5, r0_12);
3121      HS_CMP_XCHG(r0_4, r0_13);
3122      HS_CMP_XCHG(r0_3, r0_14);
3123      HS_CMP_XCHG(r0_2, r0_15);
3124      HS_CMP_XCHG(r0_1, r0_16);
3125      HS_CMP_XCHG(r0_9, r0_13);
3126      HS_CMP_XCHG(r0_11, r0_15);
3127      HS_CMP_XCHG(r0_9, r0_11);
3128      HS_CMP_XCHG(r0_13, r0_15);
3129      HS_CMP_XCHG(r0_10, r0_14);
3130      HS_CMP_XCHG(r0_12, r0_16);
3131      HS_CMP_XCHG(r0_10, r0_12);
3132      HS_CMP_XCHG(r0_14, r0_16);
3133      HS_CMP_XCHG(r0_9, r0_10);
3134      HS_CMP_XCHG(r0_11, r0_12);
3135      HS_CMP_XCHG(r0_13, r0_14);
3136      HS_CMP_XCHG(r0_15, r0_16);
3137      HS_CMP_XCHG(r0_1, r0_5);
3138      HS_CMP_XCHG(r0_3, r0_7);
3139      HS_CMP_XCHG(r0_1, r0_3);
3140      HS_CMP_XCHG(r0_5, r0_7);
3141      HS_CMP_XCHG(r0_2, r0_6);
3142      HS_CMP_XCHG(r0_4, r0_8);
3143      HS_CMP_XCHG(r0_2, r0_4);
3144      HS_CMP_XCHG(r0_6, r0_8);
3145      HS_CMP_XCHG(r0_1, r0_2);
3146      HS_CMP_XCHG(r0_3, r0_4);
3147      HS_CMP_XCHG(r0_5, r0_6);
3148      HS_CMP_XCHG(r0_7, r0_8);
3149      HS_SLAB_LOCAL_L(0) = r0_1;
3150      HS_SLAB_LOCAL_L(8) = r0_2;
3151      HS_SLAB_LOCAL_L(16) = r0_3;
3152      HS_SLAB_LOCAL_L(24) = r0_4;
3153      HS_SLAB_LOCAL_L(32) = r0_5;
3154      HS_SLAB_LOCAL_L(40) = r0_6;
3155      HS_SLAB_LOCAL_L(48) = r0_7;
3156      HS_SLAB_LOCAL_L(56) = r0_8;
3157      HS_SLAB_LOCAL_R(64) = r0_9;
3158      HS_SLAB_LOCAL_R(72) = r0_10;
3159      HS_SLAB_LOCAL_R(80) = r0_11;
3160      HS_SLAB_LOCAL_R(88) = r0_12;
3161      HS_SLAB_LOCAL_R(96) = r0_13;
3162      HS_SLAB_LOCAL_R(104) = r0_14;
3163      HS_SLAB_LOCAL_R(112) = r0_15;
3164      HS_SLAB_LOCAL_R(120) = r0_16;
3165    }
3166  }
3167  HS_BLOCK_BARRIER();
3168  r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0);
3169  r16 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1);
3170  r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2);
3171  r15 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3);
3172  r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4);
3173  r14 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5);
3174  r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6);
3175  r13 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7);
3176  r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8);
3177  r12 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9);
3178  r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10);
3179  r11 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11);
3180  r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12);
3181  r10 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13);
3182  r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14);
3183  r9 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15);
3184  {
3185    {
3186      HS_SLAB_HALF_PREAMBLE(4);
3187      HS_CMP_HALF(0, r1);
3188      HS_CMP_HALF(1, r2);
3189      HS_CMP_HALF(2, r3);
3190      HS_CMP_HALF(3, r4);
3191      HS_CMP_HALF(4, r5);
3192      HS_CMP_HALF(5, r6);
3193      HS_CMP_HALF(6, r7);
3194      HS_CMP_HALF(7, r8);
3195      HS_CMP_HALF(8, r9);
3196      HS_CMP_HALF(9, r10);
3197      HS_CMP_HALF(10, r11);
3198      HS_CMP_HALF(11, r12);
3199      HS_CMP_HALF(12, r13);
3200      HS_CMP_HALF(13, r14);
3201      HS_CMP_HALF(14, r15);
3202      HS_CMP_HALF(15, r16);
3203    }
3204    {
3205      HS_SLAB_HALF_PREAMBLE(2);
3206      HS_CMP_HALF(0, r1);
3207      HS_CMP_HALF(1, r2);
3208      HS_CMP_HALF(2, r3);
3209      HS_CMP_HALF(3, r4);
3210      HS_CMP_HALF(4, r5);
3211      HS_CMP_HALF(5, r6);
3212      HS_CMP_HALF(6, r7);
3213      HS_CMP_HALF(7, r8);
3214      HS_CMP_HALF(8, r9);
3215      HS_CMP_HALF(9, r10);
3216      HS_CMP_HALF(10, r11);
3217      HS_CMP_HALF(11, r12);
3218      HS_CMP_HALF(12, r13);
3219      HS_CMP_HALF(13, r14);
3220      HS_CMP_HALF(14, r15);
3221      HS_CMP_HALF(15, r16);
3222    }
3223    {
3224      HS_SLAB_HALF_PREAMBLE(1);
3225      HS_CMP_HALF(0, r1);
3226      HS_CMP_HALF(1, r2);
3227      HS_CMP_HALF(2, r3);
3228      HS_CMP_HALF(3, r4);
3229      HS_CMP_HALF(4, r5);
3230      HS_CMP_HALF(5, r6);
3231      HS_CMP_HALF(6, r7);
3232      HS_CMP_HALF(7, r8);
3233      HS_CMP_HALF(8, r9);
3234      HS_CMP_HALF(9, r10);
3235      HS_CMP_HALF(10, r11);
3236      HS_CMP_HALF(11, r12);
3237      HS_CMP_HALF(12, r13);
3238      HS_CMP_HALF(13, r14);
3239      HS_CMP_HALF(14, r15);
3240      HS_CMP_HALF(15, r16);
3241    }
3242    HS_CMP_XCHG(r1, r9);
3243    HS_CMP_XCHG(r5, r13);
3244    HS_CMP_XCHG(r1, r5);
3245    HS_CMP_XCHG(r9, r13);
3246    HS_CMP_XCHG(r3, r11);
3247    HS_CMP_XCHG(r7, r15);
3248    HS_CMP_XCHG(r3, r7);
3249    HS_CMP_XCHG(r11, r15);
3250    HS_CMP_XCHG(r1, r3);
3251    HS_CMP_XCHG(r5, r7);
3252    HS_CMP_XCHG(r9, r11);
3253    HS_CMP_XCHG(r13, r15);
3254    HS_CMP_XCHG(r2, r10);
3255    HS_CMP_XCHG(r6, r14);
3256    HS_CMP_XCHG(r2, r6);
3257    HS_CMP_XCHG(r10, r14);
3258    HS_CMP_XCHG(r4, r12);
3259    HS_CMP_XCHG(r8, r16);
3260    HS_CMP_XCHG(r4, r8);
3261    HS_CMP_XCHG(r12, r16);
3262    HS_CMP_XCHG(r2, r4);
3263    HS_CMP_XCHG(r6, r8);
3264    HS_CMP_XCHG(r10, r12);
3265    HS_CMP_XCHG(r14, r16);
3266    HS_CMP_XCHG(r1, r2);
3267    HS_CMP_XCHG(r3, r4);
3268    HS_CMP_XCHG(r5, r6);
3269    HS_CMP_XCHG(r7, r8);
3270    HS_CMP_XCHG(r9, r10);
3271    HS_CMP_XCHG(r11, r12);
3272    HS_CMP_XCHG(r13, r14);
3273    HS_CMP_XCHG(r15, r16);
3274  }
3275  HS_SLAB_GLOBAL_STORE(0, r1);
3276  HS_SLAB_GLOBAL_STORE(1, r2);
3277  HS_SLAB_GLOBAL_STORE(2, r3);
3278  HS_SLAB_GLOBAL_STORE(3, r4);
3279  HS_SLAB_GLOBAL_STORE(4, r5);
3280  HS_SLAB_GLOBAL_STORE(5, r6);
3281  HS_SLAB_GLOBAL_STORE(6, r7);
3282  HS_SLAB_GLOBAL_STORE(7, r8);
3283  HS_SLAB_GLOBAL_STORE(8, r9);
3284  HS_SLAB_GLOBAL_STORE(9, r10);
3285  HS_SLAB_GLOBAL_STORE(10, r11);
3286  HS_SLAB_GLOBAL_STORE(11, r12);
3287  HS_SLAB_GLOBAL_STORE(12, r13);
3288  HS_SLAB_GLOBAL_STORE(13, r14);
3289  HS_SLAB_GLOBAL_STORE(14, r15);
3290  HS_SLAB_GLOBAL_STORE(15, r16);
3291}
3292
3293HS_BC_KERNEL_PROTO(1, 0)
3294{
3295  HS_SLAB_GLOBAL_PREAMBLE();
3296  HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vout, 0);
3297  HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vout, 1);
3298  HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vout, 2);
3299  HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vout, 3);
3300  HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vout, 4);
3301  HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vout, 5);
3302  HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vout, 6);
3303  HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vout, 7);
3304  HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vout, 8);
3305  HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vout, 9);
3306  HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vout, 10);
3307  HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vout, 11);
3308  HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vout, 12);
3309  HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vout, 13);
3310  HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vout, 14);
3311  HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vout, 15);
3312  {
3313    {
3314      HS_SLAB_HALF_PREAMBLE(4);
3315      HS_CMP_HALF(0, r1);
3316      HS_CMP_HALF(1, r2);
3317      HS_CMP_HALF(2, r3);
3318      HS_CMP_HALF(3, r4);
3319      HS_CMP_HALF(4, r5);
3320      HS_CMP_HALF(5, r6);
3321      HS_CMP_HALF(6, r7);
3322      HS_CMP_HALF(7, r8);
3323      HS_CMP_HALF(8, r9);
3324      HS_CMP_HALF(9, r10);
3325      HS_CMP_HALF(10, r11);
3326      HS_CMP_HALF(11, r12);
3327      HS_CMP_HALF(12, r13);
3328      HS_CMP_HALF(13, r14);
3329      HS_CMP_HALF(14, r15);
3330      HS_CMP_HALF(15, r16);
3331    }
3332    {
3333      HS_SLAB_HALF_PREAMBLE(2);
3334      HS_CMP_HALF(0, r1);
3335      HS_CMP_HALF(1, r2);
3336      HS_CMP_HALF(2, r3);
3337      HS_CMP_HALF(3, r4);
3338      HS_CMP_HALF(4, r5);
3339      HS_CMP_HALF(5, r6);
3340      HS_CMP_HALF(6, r7);
3341      HS_CMP_HALF(7, r8);
3342      HS_CMP_HALF(8, r9);
3343      HS_CMP_HALF(9, r10);
3344      HS_CMP_HALF(10, r11);
3345      HS_CMP_HALF(11, r12);
3346      HS_CMP_HALF(12, r13);
3347      HS_CMP_HALF(13, r14);
3348      HS_CMP_HALF(14, r15);
3349      HS_CMP_HALF(15, r16);
3350    }
3351    {
3352      HS_SLAB_HALF_PREAMBLE(1);
3353      HS_CMP_HALF(0, r1);
3354      HS_CMP_HALF(1, r2);
3355      HS_CMP_HALF(2, r3);
3356      HS_CMP_HALF(3, r4);
3357      HS_CMP_HALF(4, r5);
3358      HS_CMP_HALF(5, r6);
3359      HS_CMP_HALF(6, r7);
3360      HS_CMP_HALF(7, r8);
3361      HS_CMP_HALF(8, r9);
3362      HS_CMP_HALF(9, r10);
3363      HS_CMP_HALF(10, r11);
3364      HS_CMP_HALF(11, r12);
3365      HS_CMP_HALF(12, r13);
3366      HS_CMP_HALF(13, r14);
3367      HS_CMP_HALF(14, r15);
3368      HS_CMP_HALF(15, r16);
3369    }
3370    HS_CMP_XCHG(r1, r9);
3371    HS_CMP_XCHG(r5, r13);
3372    HS_CMP_XCHG(r1, r5);
3373    HS_CMP_XCHG(r9, r13);
3374    HS_CMP_XCHG(r3, r11);
3375    HS_CMP_XCHG(r7, r15);
3376    HS_CMP_XCHG(r3, r7);
3377    HS_CMP_XCHG(r11, r15);
3378    HS_CMP_XCHG(r1, r3);
3379    HS_CMP_XCHG(r5, r7);
3380    HS_CMP_XCHG(r9, r11);
3381    HS_CMP_XCHG(r13, r15);
3382    HS_CMP_XCHG(r2, r10);
3383    HS_CMP_XCHG(r6, r14);
3384    HS_CMP_XCHG(r2, r6);
3385    HS_CMP_XCHG(r10, r14);
3386    HS_CMP_XCHG(r4, r12);
3387    HS_CMP_XCHG(r8, r16);
3388    HS_CMP_XCHG(r4, r8);
3389    HS_CMP_XCHG(r12, r16);
3390    HS_CMP_XCHG(r2, r4);
3391    HS_CMP_XCHG(r6, r8);
3392    HS_CMP_XCHG(r10, r12);
3393    HS_CMP_XCHG(r14, r16);
3394    HS_CMP_XCHG(r1, r2);
3395    HS_CMP_XCHG(r3, r4);
3396    HS_CMP_XCHG(r5, r6);
3397    HS_CMP_XCHG(r7, r8);
3398    HS_CMP_XCHG(r9, r10);
3399    HS_CMP_XCHG(r11, r12);
3400    HS_CMP_XCHG(r13, r14);
3401    HS_CMP_XCHG(r15, r16);
3402  }
3403  HS_SLAB_GLOBAL_STORE(0, r1);
3404  HS_SLAB_GLOBAL_STORE(1, r2);
3405  HS_SLAB_GLOBAL_STORE(2, r3);
3406  HS_SLAB_GLOBAL_STORE(3, r4);
3407  HS_SLAB_GLOBAL_STORE(4, r5);
3408  HS_SLAB_GLOBAL_STORE(5, r6);
3409  HS_SLAB_GLOBAL_STORE(6, r7);
3410  HS_SLAB_GLOBAL_STORE(7, r8);
3411  HS_SLAB_GLOBAL_STORE(8, r9);
3412  HS_SLAB_GLOBAL_STORE(9, r10);
3413  HS_SLAB_GLOBAL_STORE(10, r11);
3414  HS_SLAB_GLOBAL_STORE(11, r12);
3415  HS_SLAB_GLOBAL_STORE(12, r13);
3416  HS_SLAB_GLOBAL_STORE(13, r14);
3417  HS_SLAB_GLOBAL_STORE(14, r15);
3418  HS_SLAB_GLOBAL_STORE(15, r16);
3419}
3420
3421HS_BC_KERNEL_PROTO(2, 1)
3422{
3423  HS_BLOCK_LOCAL_MEM_DECL(16, 16);
3424
3425  HS_SLAB_GLOBAL_PREAMBLE();
3426  HS_BC_MERGE_H_PREAMBLE(2);
3427  {
3428    {
3429      HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0);
3430      HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(16);
3431      HS_CMP_XCHG(r0_1, r0_2);
3432      HS_SLAB_LOCAL_L(0) = r0_1;
3433      HS_SLAB_LOCAL_L(8) = r0_2;
3434    }
3435    {
3436      HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(2);
3437      HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(18);
3438      HS_CMP_XCHG(r0_1, r0_2);
3439      HS_SLAB_LOCAL_L(32) = r0_1;
3440      HS_SLAB_LOCAL_L(40) = r0_2;
3441    }
3442    {
3443      HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(4);
3444      HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(20);
3445      HS_CMP_XCHG(r0_1, r0_2);
3446      HS_SLAB_LOCAL_L(64) = r0_1;
3447      HS_SLAB_LOCAL_L(72) = r0_2;
3448    }
3449    {
3450      HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(6);
3451      HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(22);
3452      HS_CMP_XCHG(r0_1, r0_2);
3453      HS_SLAB_LOCAL_L(96) = r0_1;
3454      HS_SLAB_LOCAL_L(104) = r0_2;
3455    }
3456    {
3457      HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8);
3458      HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(24);
3459      HS_CMP_XCHG(r0_1, r0_2);
3460      HS_SLAB_LOCAL_L(128) = r0_1;
3461      HS_SLAB_LOCAL_L(136) = r0_2;
3462    }
3463    {
3464      HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(10);
3465      HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(26);
3466      HS_CMP_XCHG(r0_1, r0_2);
3467      HS_SLAB_LOCAL_L(160) = r0_1;
3468      HS_SLAB_LOCAL_L(168) = r0_2;
3469    }
3470    {
3471      HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(12);
3472      HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(28);
3473      HS_CMP_XCHG(r0_1, r0_2);
3474      HS_SLAB_LOCAL_L(192) = r0_1;
3475      HS_SLAB_LOCAL_L(200) = r0_2;
3476    }
3477    {
3478      HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(14);
3479      HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(30);
3480      HS_CMP_XCHG(r0_1, r0_2);
3481      HS_SLAB_LOCAL_L(224) = r0_1;
3482      HS_SLAB_LOCAL_L(232) = r0_2;
3483    }
3484  }
3485  HS_BLOCK_BARRIER();
3486  HS_KEY_TYPE r1 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 0);
3487  HS_KEY_TYPE r2 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 1);
3488  HS_KEY_TYPE r3 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 2);
3489  HS_KEY_TYPE r4 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 3);
3490  HS_KEY_TYPE r5 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 4);
3491  HS_KEY_TYPE r6 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 5);
3492  HS_KEY_TYPE r7 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 6);
3493  HS_KEY_TYPE r8 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 7);
3494  HS_KEY_TYPE r9 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 8);
3495  HS_KEY_TYPE r10 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 9);
3496  HS_KEY_TYPE r11 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 10);
3497  HS_KEY_TYPE r12 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 11);
3498  HS_KEY_TYPE r13 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 12);
3499  HS_KEY_TYPE r14 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 13);
3500  HS_KEY_TYPE r15 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 14);
3501  HS_KEY_TYPE r16 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 15);
3502  {
3503    {
3504      HS_SLAB_HALF_PREAMBLE(4);
3505      HS_CMP_HALF(0, r1);
3506      HS_CMP_HALF(1, r2);
3507      HS_CMP_HALF(2, r3);
3508      HS_CMP_HALF(3, r4);
3509      HS_CMP_HALF(4, r5);
3510      HS_CMP_HALF(5, r6);
3511      HS_CMP_HALF(6, r7);
3512      HS_CMP_HALF(7, r8);
3513      HS_CMP_HALF(8, r9);
3514      HS_CMP_HALF(9, r10);
3515      HS_CMP_HALF(10, r11);
3516      HS_CMP_HALF(11, r12);
3517      HS_CMP_HALF(12, r13);
3518      HS_CMP_HALF(13, r14);
3519      HS_CMP_HALF(14, r15);
3520      HS_CMP_HALF(15, r16);
3521    }
3522    {
3523      HS_SLAB_HALF_PREAMBLE(2);
3524      HS_CMP_HALF(0, r1);
3525      HS_CMP_HALF(1, r2);
3526      HS_CMP_HALF(2, r3);
3527      HS_CMP_HALF(3, r4);
3528      HS_CMP_HALF(4, r5);
3529      HS_CMP_HALF(5, r6);
3530      HS_CMP_HALF(6, r7);
3531      HS_CMP_HALF(7, r8);
3532      HS_CMP_HALF(8, r9);
3533      HS_CMP_HALF(9, r10);
3534      HS_CMP_HALF(10, r11);
3535      HS_CMP_HALF(11, r12);
3536      HS_CMP_HALF(12, r13);
3537      HS_CMP_HALF(13, r14);
3538      HS_CMP_HALF(14, r15);
3539      HS_CMP_HALF(15, r16);
3540    }
3541    {
3542      HS_SLAB_HALF_PREAMBLE(1);
3543      HS_CMP_HALF(0, r1);
3544      HS_CMP_HALF(1, r2);
3545      HS_CMP_HALF(2, r3);
3546      HS_CMP_HALF(3, r4);
3547      HS_CMP_HALF(4, r5);
3548      HS_CMP_HALF(5, r6);
3549      HS_CMP_HALF(6, r7);
3550      HS_CMP_HALF(7, r8);
3551      HS_CMP_HALF(8, r9);
3552      HS_CMP_HALF(9, r10);
3553      HS_CMP_HALF(10, r11);
3554      HS_CMP_HALF(11, r12);
3555      HS_CMP_HALF(12, r13);
3556      HS_CMP_HALF(13, r14);
3557      HS_CMP_HALF(14, r15);
3558      HS_CMP_HALF(15, r16);
3559    }
3560    HS_CMP_XCHG(r1, r9);
3561    HS_CMP_XCHG(r5, r13);
3562    HS_CMP_XCHG(r1, r5);
3563    HS_CMP_XCHG(r9, r13);
3564    HS_CMP_XCHG(r3, r11);
3565    HS_CMP_XCHG(r7, r15);
3566    HS_CMP_XCHG(r3, r7);
3567    HS_CMP_XCHG(r11, r15);
3568    HS_CMP_XCHG(r1, r3);
3569    HS_CMP_XCHG(r5, r7);
3570    HS_CMP_XCHG(r9, r11);
3571    HS_CMP_XCHG(r13, r15);
3572    HS_CMP_XCHG(r2, r10);
3573    HS_CMP_XCHG(r6, r14);
3574    HS_CMP_XCHG(r2, r6);
3575    HS_CMP_XCHG(r10, r14);
3576    HS_CMP_XCHG(r4, r12);
3577    HS_CMP_XCHG(r8, r16);
3578    HS_CMP_XCHG(r4, r8);
3579    HS_CMP_XCHG(r12, r16);
3580    HS_CMP_XCHG(r2, r4);
3581    HS_CMP_XCHG(r6, r8);
3582    HS_CMP_XCHG(r10, r12);
3583    HS_CMP_XCHG(r14, r16);
3584    HS_CMP_XCHG(r1, r2);
3585    HS_CMP_XCHG(r3, r4);
3586    HS_CMP_XCHG(r5, r6);
3587    HS_CMP_XCHG(r7, r8);
3588    HS_CMP_XCHG(r9, r10);
3589    HS_CMP_XCHG(r11, r12);
3590    HS_CMP_XCHG(r13, r14);
3591    HS_CMP_XCHG(r15, r16);
3592  }
3593  HS_SLAB_GLOBAL_STORE(0, r1);
3594  HS_SLAB_GLOBAL_STORE(1, r2);
3595  HS_SLAB_GLOBAL_STORE(2, r3);
3596  HS_SLAB_GLOBAL_STORE(3, r4);
3597  HS_SLAB_GLOBAL_STORE(4, r5);
3598  HS_SLAB_GLOBAL_STORE(5, r6);
3599  HS_SLAB_GLOBAL_STORE(6, r7);
3600  HS_SLAB_GLOBAL_STORE(7, r8);
3601  HS_SLAB_GLOBAL_STORE(8, r9);
3602  HS_SLAB_GLOBAL_STORE(9, r10);
3603  HS_SLAB_GLOBAL_STORE(10, r11);
3604  HS_SLAB_GLOBAL_STORE(11, r12);
3605  HS_SLAB_GLOBAL_STORE(12, r13);
3606  HS_SLAB_GLOBAL_STORE(13, r14);
3607  HS_SLAB_GLOBAL_STORE(14, r15);
3608  HS_SLAB_GLOBAL_STORE(15, r16);
3609}
3610
3611HS_BC_KERNEL_PROTO(4, 2)
3612{
3613  HS_BLOCK_LOCAL_MEM_DECL(32, 16);
3614
3615  HS_SLAB_GLOBAL_PREAMBLE();
3616  HS_BC_MERGE_H_PREAMBLE(4);
3617  {
3618    {
3619      HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0);
3620      HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(16);
3621      HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(32);
3622      HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(48);
3623      HS_CMP_XCHG(r0_1, r0_3);
3624      HS_CMP_XCHG(r0_2, r0_4);
3625      HS_CMP_XCHG(r0_1, r0_2);
3626      HS_CMP_XCHG(r0_3, r0_4);
3627      HS_SLAB_LOCAL_L(0) = r0_1;
3628      HS_SLAB_LOCAL_L(8) = r0_2;
3629      HS_SLAB_LOCAL_L(16) = r0_3;
3630      HS_SLAB_LOCAL_L(24) = r0_4;
3631    }
3632    {
3633      HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(4);
3634      HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(20);
3635      HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(36);
3636      HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(52);
3637      HS_CMP_XCHG(r0_1, r0_3);
3638      HS_CMP_XCHG(r0_2, r0_4);
3639      HS_CMP_XCHG(r0_1, r0_2);
3640      HS_CMP_XCHG(r0_3, r0_4);
3641      HS_SLAB_LOCAL_L(128) = r0_1;
3642      HS_SLAB_LOCAL_L(136) = r0_2;
3643      HS_SLAB_LOCAL_L(144) = r0_3;
3644      HS_SLAB_LOCAL_L(152) = r0_4;
3645    }
3646    {
3647      HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8);
3648      HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(24);
3649      HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(40);
3650      HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(56);
3651      HS_CMP_XCHG(r0_1, r0_3);
3652      HS_CMP_XCHG(r0_2, r0_4);
3653      HS_CMP_XCHG(r0_1, r0_2);
3654      HS_CMP_XCHG(r0_3, r0_4);
3655      HS_SLAB_LOCAL_L(256) = r0_1;
3656      HS_SLAB_LOCAL_L(264) = r0_2;
3657      HS_SLAB_LOCAL_L(272) = r0_3;
3658      HS_SLAB_LOCAL_L(280) = r0_4;
3659    }
3660    {
3661      HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(12);
3662      HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(28);
3663      HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(44);
3664      HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(60);
3665      HS_CMP_XCHG(r0_1, r0_3);
3666      HS_CMP_XCHG(r0_2, r0_4);
3667      HS_CMP_XCHG(r0_1, r0_2);
3668      HS_CMP_XCHG(r0_3, r0_4);
3669      HS_SLAB_LOCAL_L(384) = r0_1;
3670      HS_SLAB_LOCAL_L(392) = r0_2;
3671      HS_SLAB_LOCAL_L(400) = r0_3;
3672      HS_SLAB_LOCAL_L(408) = r0_4;
3673    }
3674  }
3675  HS_BLOCK_BARRIER();
3676  HS_KEY_TYPE r1 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0);
3677  HS_KEY_TYPE r2 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1);
3678  HS_KEY_TYPE r3 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2);
3679  HS_KEY_TYPE r4 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3);
3680  HS_KEY_TYPE r5 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4);
3681  HS_KEY_TYPE r6 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5);
3682  HS_KEY_TYPE r7 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6);
3683  HS_KEY_TYPE r8 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7);
3684  HS_KEY_TYPE r9 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 8);
3685  HS_KEY_TYPE r10 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 9);
3686  HS_KEY_TYPE r11 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 10);
3687  HS_KEY_TYPE r12 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 11);
3688  HS_KEY_TYPE r13 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 12);
3689  HS_KEY_TYPE r14 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 13);
3690  HS_KEY_TYPE r15 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 14);
3691  HS_KEY_TYPE r16 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 15);
3692  {
3693    {
3694      HS_SLAB_HALF_PREAMBLE(4);
3695      HS_CMP_HALF(0, r1);
3696      HS_CMP_HALF(1, r2);
3697      HS_CMP_HALF(2, r3);
3698      HS_CMP_HALF(3, r4);
3699      HS_CMP_HALF(4, r5);
3700      HS_CMP_HALF(5, r6);
3701      HS_CMP_HALF(6, r7);
3702      HS_CMP_HALF(7, r8);
3703      HS_CMP_HALF(8, r9);
3704      HS_CMP_HALF(9, r10);
3705      HS_CMP_HALF(10, r11);
3706      HS_CMP_HALF(11, r12);
3707      HS_CMP_HALF(12, r13);
3708      HS_CMP_HALF(13, r14);
3709      HS_CMP_HALF(14, r15);
3710      HS_CMP_HALF(15, r16);
3711    }
3712    {
3713      HS_SLAB_HALF_PREAMBLE(2);
3714      HS_CMP_HALF(0, r1);
3715      HS_CMP_HALF(1, r2);
3716      HS_CMP_HALF(2, r3);
3717      HS_CMP_HALF(3, r4);
3718      HS_CMP_HALF(4, r5);
3719      HS_CMP_HALF(5, r6);
3720      HS_CMP_HALF(6, r7);
3721      HS_CMP_HALF(7, r8);
3722      HS_CMP_HALF(8, r9);
3723      HS_CMP_HALF(9, r10);
3724      HS_CMP_HALF(10, r11);
3725      HS_CMP_HALF(11, r12);
3726      HS_CMP_HALF(12, r13);
3727      HS_CMP_HALF(13, r14);
3728      HS_CMP_HALF(14, r15);
3729      HS_CMP_HALF(15, r16);
3730    }
3731    {
3732      HS_SLAB_HALF_PREAMBLE(1);
3733      HS_CMP_HALF(0, r1);
3734      HS_CMP_HALF(1, r2);
3735      HS_CMP_HALF(2, r3);
3736      HS_CMP_HALF(3, r4);
3737      HS_CMP_HALF(4, r5);
3738      HS_CMP_HALF(5, r6);
3739      HS_CMP_HALF(6, r7);
3740      HS_CMP_HALF(7, r8);
3741      HS_CMP_HALF(8, r9);
3742      HS_CMP_HALF(9, r10);
3743      HS_CMP_HALF(10, r11);
3744      HS_CMP_HALF(11, r12);
3745      HS_CMP_HALF(12, r13);
3746      HS_CMP_HALF(13, r14);
3747      HS_CMP_HALF(14, r15);
3748      HS_CMP_HALF(15, r16);
3749    }
3750    HS_CMP_XCHG(r1, r9);
3751    HS_CMP_XCHG(r5, r13);
3752    HS_CMP_XCHG(r1, r5);
3753    HS_CMP_XCHG(r9, r13);
3754    HS_CMP_XCHG(r3, r11);
3755    HS_CMP_XCHG(r7, r15);
3756    HS_CMP_XCHG(r3, r7);
3757    HS_CMP_XCHG(r11, r15);
3758    HS_CMP_XCHG(r1, r3);
3759    HS_CMP_XCHG(r5, r7);
3760    HS_CMP_XCHG(r9, r11);
3761    HS_CMP_XCHG(r13, r15);
3762    HS_CMP_XCHG(r2, r10);
3763    HS_CMP_XCHG(r6, r14);
3764    HS_CMP_XCHG(r2, r6);
3765    HS_CMP_XCHG(r10, r14);
3766    HS_CMP_XCHG(r4, r12);
3767    HS_CMP_XCHG(r8, r16);
3768    HS_CMP_XCHG(r4, r8);
3769    HS_CMP_XCHG(r12, r16);
3770    HS_CMP_XCHG(r2, r4);
3771    HS_CMP_XCHG(r6, r8);
3772    HS_CMP_XCHG(r10, r12);
3773    HS_CMP_XCHG(r14, r16);
3774    HS_CMP_XCHG(r1, r2);
3775    HS_CMP_XCHG(r3, r4);
3776    HS_CMP_XCHG(r5, r6);
3777    HS_CMP_XCHG(r7, r8);
3778    HS_CMP_XCHG(r9, r10);
3779    HS_CMP_XCHG(r11, r12);
3780    HS_CMP_XCHG(r13, r14);
3781    HS_CMP_XCHG(r15, r16);
3782  }
3783  HS_SLAB_GLOBAL_STORE(0, r1);
3784  HS_SLAB_GLOBAL_STORE(1, r2);
3785  HS_SLAB_GLOBAL_STORE(2, r3);
3786  HS_SLAB_GLOBAL_STORE(3, r4);
3787  HS_SLAB_GLOBAL_STORE(4, r5);
3788  HS_SLAB_GLOBAL_STORE(5, r6);
3789  HS_SLAB_GLOBAL_STORE(6, r7);
3790  HS_SLAB_GLOBAL_STORE(7, r8);
3791  HS_SLAB_GLOBAL_STORE(8, r9);
3792  HS_SLAB_GLOBAL_STORE(9, r10);
3793  HS_SLAB_GLOBAL_STORE(10, r11);
3794  HS_SLAB_GLOBAL_STORE(11, r12);
3795  HS_SLAB_GLOBAL_STORE(12, r13);
3796  HS_SLAB_GLOBAL_STORE(13, r14);
3797  HS_SLAB_GLOBAL_STORE(14, r15);
3798  HS_SLAB_GLOBAL_STORE(15, r16);
3799}
3800
3801HS_BC_KERNEL_PROTO(8, 3)
3802{
3803  HS_BLOCK_LOCAL_MEM_DECL(64, 16);
3804
3805  HS_SLAB_GLOBAL_PREAMBLE();
3806  HS_BC_MERGE_H_PREAMBLE(8);
3807  {
3808    {
3809      HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0);
3810      HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(16);
3811      HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(32);
3812      HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(48);
3813      HS_KEY_TYPE r0_5 = HS_BC_GLOBAL_LOAD_L(64);
3814      HS_KEY_TYPE r0_6 = HS_BC_GLOBAL_LOAD_L(80);
3815      HS_KEY_TYPE r0_7 = HS_BC_GLOBAL_LOAD_L(96);
3816      HS_KEY_TYPE r0_8 = HS_BC_GLOBAL_LOAD_L(112);
3817      HS_CMP_XCHG(r0_1, r0_5);
3818      HS_CMP_XCHG(r0_3, r0_7);
3819      HS_CMP_XCHG(r0_1, r0_3);
3820      HS_CMP_XCHG(r0_5, r0_7);
3821      HS_CMP_XCHG(r0_2, r0_6);
3822      HS_CMP_XCHG(r0_4, r0_8);
3823      HS_CMP_XCHG(r0_2, r0_4);
3824      HS_CMP_XCHG(r0_6, r0_8);
3825      HS_CMP_XCHG(r0_1, r0_2);
3826      HS_CMP_XCHG(r0_3, r0_4);
3827      HS_CMP_XCHG(r0_5, r0_6);
3828      HS_CMP_XCHG(r0_7, r0_8);
3829      HS_SLAB_LOCAL_L(0) = r0_1;
3830      HS_SLAB_LOCAL_L(8) = r0_2;
3831      HS_SLAB_LOCAL_L(16) = r0_3;
3832      HS_SLAB_LOCAL_L(24) = r0_4;
3833      HS_SLAB_LOCAL_L(32) = r0_5;
3834      HS_SLAB_LOCAL_L(40) = r0_6;
3835      HS_SLAB_LOCAL_L(48) = r0_7;
3836      HS_SLAB_LOCAL_L(56) = r0_8;
3837    }
3838    {
3839      HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8);
3840      HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(24);
3841      HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(40);
3842      HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(56);
3843      HS_KEY_TYPE r0_5 = HS_BC_GLOBAL_LOAD_L(72);
3844      HS_KEY_TYPE r0_6 = HS_BC_GLOBAL_LOAD_L(88);
3845      HS_KEY_TYPE r0_7 = HS_BC_GLOBAL_LOAD_L(104);
3846      HS_KEY_TYPE r0_8 = HS_BC_GLOBAL_LOAD_L(120);
3847      HS_CMP_XCHG(r0_1, r0_5);
3848      HS_CMP_XCHG(r0_3, r0_7);
3849      HS_CMP_XCHG(r0_1, r0_3);
3850      HS_CMP_XCHG(r0_5, r0_7);
3851      HS_CMP_XCHG(r0_2, r0_6);
3852      HS_CMP_XCHG(r0_4, r0_8);
3853      HS_CMP_XCHG(r0_2, r0_4);
3854      HS_CMP_XCHG(r0_6, r0_8);
3855      HS_CMP_XCHG(r0_1, r0_2);
3856      HS_CMP_XCHG(r0_3, r0_4);
3857      HS_CMP_XCHG(r0_5, r0_6);
3858      HS_CMP_XCHG(r0_7, r0_8);
3859      HS_SLAB_LOCAL_L(512) = r0_1;
3860      HS_SLAB_LOCAL_L(520) = r0_2;
3861      HS_SLAB_LOCAL_L(528) = r0_3;
3862      HS_SLAB_LOCAL_L(536) = r0_4;
3863      HS_SLAB_LOCAL_L(544) = r0_5;
3864      HS_SLAB_LOCAL_L(552) = r0_6;
3865      HS_SLAB_LOCAL_L(560) = r0_7;
3866      HS_SLAB_LOCAL_L(568) = r0_8;
3867    }
3868  }
3869  HS_BLOCK_BARRIER();
3870  HS_KEY_TYPE r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0);
3871  HS_KEY_TYPE r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1);
3872  HS_KEY_TYPE r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2);
3873  HS_KEY_TYPE r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3);
3874  HS_KEY_TYPE r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4);
3875  HS_KEY_TYPE r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5);
3876  HS_KEY_TYPE r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6);
3877  HS_KEY_TYPE r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7);
3878  HS_KEY_TYPE r9 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 8);
3879  HS_KEY_TYPE r10 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 9);
3880  HS_KEY_TYPE r11 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 10);
3881  HS_KEY_TYPE r12 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 11);
3882  HS_KEY_TYPE r13 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 12);
3883  HS_KEY_TYPE r14 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 13);
3884  HS_KEY_TYPE r15 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 14);
3885  HS_KEY_TYPE r16 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 15);
3886  {
3887    {
3888      HS_SLAB_HALF_PREAMBLE(4);
3889      HS_CMP_HALF(0, r1);
3890      HS_CMP_HALF(1, r2);
3891      HS_CMP_HALF(2, r3);
3892      HS_CMP_HALF(3, r4);
3893      HS_CMP_HALF(4, r5);
3894      HS_CMP_HALF(5, r6);
3895      HS_CMP_HALF(6, r7);
3896      HS_CMP_HALF(7, r8);
3897      HS_CMP_HALF(8, r9);
3898      HS_CMP_HALF(9, r10);
3899      HS_CMP_HALF(10, r11);
3900      HS_CMP_HALF(11, r12);
3901      HS_CMP_HALF(12, r13);
3902      HS_CMP_HALF(13, r14);
3903      HS_CMP_HALF(14, r15);
3904      HS_CMP_HALF(15, r16);
3905    }
3906    {
3907      HS_SLAB_HALF_PREAMBLE(2);
3908      HS_CMP_HALF(0, r1);
3909      HS_CMP_HALF(1, r2);
3910      HS_CMP_HALF(2, r3);
3911      HS_CMP_HALF(3, r4);
3912      HS_CMP_HALF(4, r5);
3913      HS_CMP_HALF(5, r6);
3914      HS_CMP_HALF(6, r7);
3915      HS_CMP_HALF(7, r8);
3916      HS_CMP_HALF(8, r9);
3917      HS_CMP_HALF(9, r10);
3918      HS_CMP_HALF(10, r11);
3919      HS_CMP_HALF(11, r12);
3920      HS_CMP_HALF(12, r13);
3921      HS_CMP_HALF(13, r14);
3922      HS_CMP_HALF(14, r15);
3923      HS_CMP_HALF(15, r16);
3924    }
3925    {
3926      HS_SLAB_HALF_PREAMBLE(1);
3927      HS_CMP_HALF(0, r1);
3928      HS_CMP_HALF(1, r2);
3929      HS_CMP_HALF(2, r3);
3930      HS_CMP_HALF(3, r4);
3931      HS_CMP_HALF(4, r5);
3932      HS_CMP_HALF(5, r6);
3933      HS_CMP_HALF(6, r7);
3934      HS_CMP_HALF(7, r8);
3935      HS_CMP_HALF(8, r9);
3936      HS_CMP_HALF(9, r10);
3937      HS_CMP_HALF(10, r11);
3938      HS_CMP_HALF(11, r12);
3939      HS_CMP_HALF(12, r13);
3940      HS_CMP_HALF(13, r14);
3941      HS_CMP_HALF(14, r15);
3942      HS_CMP_HALF(15, r16);
3943    }
3944    HS_CMP_XCHG(r1, r9);
3945    HS_CMP_XCHG(r5, r13);
3946    HS_CMP_XCHG(r1, r5);
3947    HS_CMP_XCHG(r9, r13);
3948    HS_CMP_XCHG(r3, r11);
3949    HS_CMP_XCHG(r7, r15);
3950    HS_CMP_XCHG(r3, r7);
3951    HS_CMP_XCHG(r11, r15);
3952    HS_CMP_XCHG(r1, r3);
3953    HS_CMP_XCHG(r5, r7);
3954    HS_CMP_XCHG(r9, r11);
3955    HS_CMP_XCHG(r13, r15);
3956    HS_CMP_XCHG(r2, r10);
3957    HS_CMP_XCHG(r6, r14);
3958    HS_CMP_XCHG(r2, r6);
3959    HS_CMP_XCHG(r10, r14);
3960    HS_CMP_XCHG(r4, r12);
3961    HS_CMP_XCHG(r8, r16);
3962    HS_CMP_XCHG(r4, r8);
3963    HS_CMP_XCHG(r12, r16);
3964    HS_CMP_XCHG(r2, r4);
3965    HS_CMP_XCHG(r6, r8);
3966    HS_CMP_XCHG(r10, r12);
3967    HS_CMP_XCHG(r14, r16);
3968    HS_CMP_XCHG(r1, r2);
3969    HS_CMP_XCHG(r3, r4);
3970    HS_CMP_XCHG(r5, r6);
3971    HS_CMP_XCHG(r7, r8);
3972    HS_CMP_XCHG(r9, r10);
3973    HS_CMP_XCHG(r11, r12);
3974    HS_CMP_XCHG(r13, r14);
3975    HS_CMP_XCHG(r15, r16);
3976  }
3977  HS_SLAB_GLOBAL_STORE(0, r1);
3978  HS_SLAB_GLOBAL_STORE(1, r2);
3979  HS_SLAB_GLOBAL_STORE(2, r3);
3980  HS_SLAB_GLOBAL_STORE(3, r4);
3981  HS_SLAB_GLOBAL_STORE(4, r5);
3982  HS_SLAB_GLOBAL_STORE(5, r6);
3983  HS_SLAB_GLOBAL_STORE(6, r7);
3984  HS_SLAB_GLOBAL_STORE(7, r8);
3985  HS_SLAB_GLOBAL_STORE(8, r9);
3986  HS_SLAB_GLOBAL_STORE(9, r10);
3987  HS_SLAB_GLOBAL_STORE(10, r11);
3988  HS_SLAB_GLOBAL_STORE(11, r12);
3989  HS_SLAB_GLOBAL_STORE(12, r13);
3990  HS_SLAB_GLOBAL_STORE(13, r14);
3991  HS_SLAB_GLOBAL_STORE(14, r15);
3992  HS_SLAB_GLOBAL_STORE(15, r16);
3993}
3994
//
// Kernel declared by HS_BC_KERNEL_PROTO(16, 4).
//
// Visible structure (all macros are defined outside this chunk, so their
// exact semantics must be confirmed in hs_cl_macros.h / hs_config.h):
//
//   1. Load 16 keys with HS_BC_GLOBAL_LOAD_L at offsets 0, 16, ..., 240,
//      run a fixed compare-exchange sequence over them and write them to
//      HS_SLAB_LOCAL_L at offsets 0, 8, ..., 120.
//   2. HS_BLOCK_BARRIER().
//   3. Read 16 keys back with HS_BX_LOCAL_V, apply three HS_CMP_HALF passes
//      (preamble arguments 4, 2, 1), repeat the same compare-exchange
//      sequence on r1..r16 and write them out with HS_SLAB_GLOBAL_STORE.
//
// NOTE(review): presumably a "block clean" (bitonic merge) stage of the
// sort -- inferred from the macro names only; confirm against the generator.
// The statement order is significant; do not reorder.
//
3995HS_BC_KERNEL_PROTO(16, 4)
3996{
  // Local-memory declaration for the block; the meaning of (128, 16) is
  // defined by the macro.
3997  HS_BLOCK_LOCAL_MEM_DECL(128, 16);
3998
3999  HS_SLAB_GLOBAL_PREAMBLE();
4000  HS_BC_MERGE_H_PREAMBLE(16);
4001  {
4002    {
      // Stage 1 loads: 16 keys, load offsets advance by 16.
4003      HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0);
4004      HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(16);
4005      HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(32);
4006      HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(48);
4007      HS_KEY_TYPE r0_5 = HS_BC_GLOBAL_LOAD_L(64);
4008      HS_KEY_TYPE r0_6 = HS_BC_GLOBAL_LOAD_L(80);
4009      HS_KEY_TYPE r0_7 = HS_BC_GLOBAL_LOAD_L(96);
4010      HS_KEY_TYPE r0_8 = HS_BC_GLOBAL_LOAD_L(112);
4011      HS_KEY_TYPE r0_9 = HS_BC_GLOBAL_LOAD_L(128);
4012      HS_KEY_TYPE r0_10 = HS_BC_GLOBAL_LOAD_L(144);
4013      HS_KEY_TYPE r0_11 = HS_BC_GLOBAL_LOAD_L(160);
4014      HS_KEY_TYPE r0_12 = HS_BC_GLOBAL_LOAD_L(176);
4015      HS_KEY_TYPE r0_13 = HS_BC_GLOBAL_LOAD_L(192);
4016      HS_KEY_TYPE r0_14 = HS_BC_GLOBAL_LOAD_L(208);
4017      HS_KEY_TYPE r0_15 = HS_BC_GLOBAL_LOAD_L(224);
4018      HS_KEY_TYPE r0_16 = HS_BC_GLOBAL_LOAD_L(240);
      // Compare-exchanges among the odd-numbered keys (index distances
      // 8, 4 and 2).
4019      HS_CMP_XCHG(r0_1, r0_9);
4020      HS_CMP_XCHG(r0_5, r0_13);
4021      HS_CMP_XCHG(r0_1, r0_5);
4022      HS_CMP_XCHG(r0_9, r0_13);
4023      HS_CMP_XCHG(r0_3, r0_11);
4024      HS_CMP_XCHG(r0_7, r0_15);
4025      HS_CMP_XCHG(r0_3, r0_7);
4026      HS_CMP_XCHG(r0_11, r0_15);
4027      HS_CMP_XCHG(r0_1, r0_3);
4028      HS_CMP_XCHG(r0_5, r0_7);
4029      HS_CMP_XCHG(r0_9, r0_11);
4030      HS_CMP_XCHG(r0_13, r0_15);
      // Same pattern among the even-numbered keys.
4031      HS_CMP_XCHG(r0_2, r0_10);
4032      HS_CMP_XCHG(r0_6, r0_14);
4033      HS_CMP_XCHG(r0_2, r0_6);
4034      HS_CMP_XCHG(r0_10, r0_14);
4035      HS_CMP_XCHG(r0_4, r0_12);
4036      HS_CMP_XCHG(r0_8, r0_16);
4037      HS_CMP_XCHG(r0_4, r0_8);
4038      HS_CMP_XCHG(r0_12, r0_16);
4039      HS_CMP_XCHG(r0_2, r0_4);
4040      HS_CMP_XCHG(r0_6, r0_8);
4041      HS_CMP_XCHG(r0_10, r0_12);
4042      HS_CMP_XCHG(r0_14, r0_16);
      // Final step: adjacent pairs.
4043      HS_CMP_XCHG(r0_1, r0_2);
4044      HS_CMP_XCHG(r0_3, r0_4);
4045      HS_CMP_XCHG(r0_5, r0_6);
4046      HS_CMP_XCHG(r0_7, r0_8);
4047      HS_CMP_XCHG(r0_9, r0_10);
4048      HS_CMP_XCHG(r0_11, r0_12);
4049      HS_CMP_XCHG(r0_13, r0_14);
4050      HS_CMP_XCHG(r0_15, r0_16);
      // Write the 16 keys to HS_SLAB_LOCAL_L; store offsets advance by 8
      // (half the load step used above).
4051      HS_SLAB_LOCAL_L(0) = r0_1;
4052      HS_SLAB_LOCAL_L(8) = r0_2;
4053      HS_SLAB_LOCAL_L(16) = r0_3;
4054      HS_SLAB_LOCAL_L(24) = r0_4;
4055      HS_SLAB_LOCAL_L(32) = r0_5;
4056      HS_SLAB_LOCAL_L(40) = r0_6;
4057      HS_SLAB_LOCAL_L(48) = r0_7;
4058      HS_SLAB_LOCAL_L(56) = r0_8;
4059      HS_SLAB_LOCAL_L(64) = r0_9;
4060      HS_SLAB_LOCAL_L(72) = r0_10;
4061      HS_SLAB_LOCAL_L(80) = r0_11;
4062      HS_SLAB_LOCAL_L(88) = r0_12;
4063      HS_SLAB_LOCAL_L(96) = r0_13;
4064      HS_SLAB_LOCAL_L(104) = r0_14;
4065      HS_SLAB_LOCAL_L(112) = r0_15;
4066      HS_SLAB_LOCAL_L(120) = r0_16;
4067    }
4068  }
  // Barrier between the HS_SLAB_LOCAL_L writes above and the HS_BX_LOCAL_V
  // reads below.
4069  HS_BLOCK_BARRIER();
  // Stage 2 loads: 16 keys, offsets are multiples of 16 * HS_SLAB_THREADS.
4070  HS_KEY_TYPE r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0);
4071  HS_KEY_TYPE r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1);
4072  HS_KEY_TYPE r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2);
4073  HS_KEY_TYPE r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3);
4074  HS_KEY_TYPE r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4);
4075  HS_KEY_TYPE r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5);
4076  HS_KEY_TYPE r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6);
4077  HS_KEY_TYPE r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7);
4078  HS_KEY_TYPE r9 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8);
4079  HS_KEY_TYPE r10 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9);
4080  HS_KEY_TYPE r11 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10);
4081  HS_KEY_TYPE r12 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11);
4082  HS_KEY_TYPE r13 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12);
4083  HS_KEY_TYPE r14 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13);
4084  HS_KEY_TYPE r15 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14);
4085  HS_KEY_TYPE r16 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15);
4086  {
    // Three HS_CMP_HALF passes over r1..r16 with preamble arguments 4, 2, 1.
    // NOTE(review): presumably cross-lane compare steps at halving lane
    // distances -- confirm in the macro definitions.
4087    {
4088      HS_SLAB_HALF_PREAMBLE(4);
4089      HS_CMP_HALF(0, r1);
4090      HS_CMP_HALF(1, r2);
4091      HS_CMP_HALF(2, r3);
4092      HS_CMP_HALF(3, r4);
4093      HS_CMP_HALF(4, r5);
4094      HS_CMP_HALF(5, r6);
4095      HS_CMP_HALF(6, r7);
4096      HS_CMP_HALF(7, r8);
4097      HS_CMP_HALF(8, r9);
4098      HS_CMP_HALF(9, r10);
4099      HS_CMP_HALF(10, r11);
4100      HS_CMP_HALF(11, r12);
4101      HS_CMP_HALF(12, r13);
4102      HS_CMP_HALF(13, r14);
4103      HS_CMP_HALF(14, r15);
4104      HS_CMP_HALF(15, r16);
4105    }
4106    {
4107      HS_SLAB_HALF_PREAMBLE(2);
4108      HS_CMP_HALF(0, r1);
4109      HS_CMP_HALF(1, r2);
4110      HS_CMP_HALF(2, r3);
4111      HS_CMP_HALF(3, r4);
4112      HS_CMP_HALF(4, r5);
4113      HS_CMP_HALF(5, r6);
4114      HS_CMP_HALF(6, r7);
4115      HS_CMP_HALF(7, r8);
4116      HS_CMP_HALF(8, r9);
4117      HS_CMP_HALF(9, r10);
4118      HS_CMP_HALF(10, r11);
4119      HS_CMP_HALF(11, r12);
4120      HS_CMP_HALF(12, r13);
4121      HS_CMP_HALF(13, r14);
4122      HS_CMP_HALF(14, r15);
4123      HS_CMP_HALF(15, r16);
4124    }
4125    {
4126      HS_SLAB_HALF_PREAMBLE(1);
4127      HS_CMP_HALF(0, r1);
4128      HS_CMP_HALF(1, r2);
4129      HS_CMP_HALF(2, r3);
4130      HS_CMP_HALF(3, r4);
4131      HS_CMP_HALF(4, r5);
4132      HS_CMP_HALF(5, r6);
4133      HS_CMP_HALF(6, r7);
4134      HS_CMP_HALF(7, r8);
4135      HS_CMP_HALF(8, r9);
4136      HS_CMP_HALF(9, r10);
4137      HS_CMP_HALF(10, r11);
4138      HS_CMP_HALF(11, r12);
4139      HS_CMP_HALF(12, r13);
4140      HS_CMP_HALF(13, r14);
4141      HS_CMP_HALF(14, r15);
4142      HS_CMP_HALF(15, r16);
4143    }
    // Same 16-register compare-exchange sequence as stage 1, now on r1..r16:
    // odd-numbered registers, then even-numbered, then adjacent pairs.
4144    HS_CMP_XCHG(r1, r9);
4145    HS_CMP_XCHG(r5, r13);
4146    HS_CMP_XCHG(r1, r5);
4147    HS_CMP_XCHG(r9, r13);
4148    HS_CMP_XCHG(r3, r11);
4149    HS_CMP_XCHG(r7, r15);
4150    HS_CMP_XCHG(r3, r7);
4151    HS_CMP_XCHG(r11, r15);
4152    HS_CMP_XCHG(r1, r3);
4153    HS_CMP_XCHG(r5, r7);
4154    HS_CMP_XCHG(r9, r11);
4155    HS_CMP_XCHG(r13, r15);
4156    HS_CMP_XCHG(r2, r10);
4157    HS_CMP_XCHG(r6, r14);
4158    HS_CMP_XCHG(r2, r6);
4159    HS_CMP_XCHG(r10, r14);
4160    HS_CMP_XCHG(r4, r12);
4161    HS_CMP_XCHG(r8, r16);
4162    HS_CMP_XCHG(r4, r8);
4163    HS_CMP_XCHG(r12, r16);
4164    HS_CMP_XCHG(r2, r4);
4165    HS_CMP_XCHG(r6, r8);
4166    HS_CMP_XCHG(r10, r12);
4167    HS_CMP_XCHG(r14, r16);
4168    HS_CMP_XCHG(r1, r2);
4169    HS_CMP_XCHG(r3, r4);
4170    HS_CMP_XCHG(r5, r6);
4171    HS_CMP_XCHG(r7, r8);
4172    HS_CMP_XCHG(r9, r10);
4173    HS_CMP_XCHG(r11, r12);
4174    HS_CMP_XCHG(r13, r14);
4175    HS_CMP_XCHG(r15, r16);
4176  }
  // Write r1..r16 out at indices 0..15.
4177  HS_SLAB_GLOBAL_STORE(0, r1);
4178  HS_SLAB_GLOBAL_STORE(1, r2);
4179  HS_SLAB_GLOBAL_STORE(2, r3);
4180  HS_SLAB_GLOBAL_STORE(3, r4);
4181  HS_SLAB_GLOBAL_STORE(4, r5);
4182  HS_SLAB_GLOBAL_STORE(5, r6);
4183  HS_SLAB_GLOBAL_STORE(6, r7);
4184  HS_SLAB_GLOBAL_STORE(7, r8);
4185  HS_SLAB_GLOBAL_STORE(8, r9);
4186  HS_SLAB_GLOBAL_STORE(9, r10);
4187  HS_SLAB_GLOBAL_STORE(10, r11);
4188  HS_SLAB_GLOBAL_STORE(11, r12);
4189  HS_SLAB_GLOBAL_STORE(12, r13);
4190  HS_SLAB_GLOBAL_STORE(13, r14);
4191  HS_SLAB_GLOBAL_STORE(14, r15);
4192  HS_SLAB_GLOBAL_STORE(15, r16);
4193}
4194
//
// Kernel declared by HS_FM_KERNEL_PROTO(1, 0).
//
// Loads 16 "left" keys (r1..r16) with HS_XM_GLOBAL_LOAD_L and 1 "right" key
// (r17) with HS_FM_GLOBAL_LOAD_R, compare-exchanges r16 with r17, runs the
// fixed 16-register compare-exchange sequence over r1..r16 and stores every
// key back to the index it was loaded from.
//
// NOTE(review): presumably the smallest "flip merge" variant -- inferred
// from the macro names; the macros are defined outside this chunk.
//
4195HS_FM_KERNEL_PROTO(1, 0)
4196{
4197  HS_FM_PREAMBLE(16);
  // Left keys: indices 0..15.
4198  HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0);
4199  HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1);
4200  HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2);
4201  HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3);
4202  HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4);
4203  HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5);
4204  HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6);
4205  HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7);
4206  HS_KEY_TYPE r9 = HS_XM_GLOBAL_LOAD_L(8);
4207  HS_KEY_TYPE r10 = HS_XM_GLOBAL_LOAD_L(9);
4208  HS_KEY_TYPE r11 = HS_XM_GLOBAL_LOAD_L(10);
4209  HS_KEY_TYPE r12 = HS_XM_GLOBAL_LOAD_L(11);
4210  HS_KEY_TYPE r13 = HS_XM_GLOBAL_LOAD_L(12);
4211  HS_KEY_TYPE r14 = HS_XM_GLOBAL_LOAD_L(13);
4212  HS_KEY_TYPE r15 = HS_XM_GLOBAL_LOAD_L(14);
4213  HS_KEY_TYPE r16 = HS_XM_GLOBAL_LOAD_L(15);
  // Right key: index 0.
4214  HS_KEY_TYPE r17 = HS_FM_GLOBAL_LOAD_R(0);
  // Cross step: last left key against the single right key.
4215  HS_CMP_XCHG(r16, r17);
  // Left side: odd-numbered registers (distances 8, 4, 2), then
  // even-numbered registers, then adjacent pairs.
4216  HS_CMP_XCHG(r1, r9);
4217  HS_CMP_XCHG(r5, r13);
4218  HS_CMP_XCHG(r1, r5);
4219  HS_CMP_XCHG(r9, r13);
4220  HS_CMP_XCHG(r3, r11);
4221  HS_CMP_XCHG(r7, r15);
4222  HS_CMP_XCHG(r3, r7);
4223  HS_CMP_XCHG(r11, r15);
4224  HS_CMP_XCHG(r1, r3);
4225  HS_CMP_XCHG(r5, r7);
4226  HS_CMP_XCHG(r9, r11);
4227  HS_CMP_XCHG(r13, r15);
4228  HS_CMP_XCHG(r2, r10);
4229  HS_CMP_XCHG(r6, r14);
4230  HS_CMP_XCHG(r2, r6);
4231  HS_CMP_XCHG(r10, r14);
4232  HS_CMP_XCHG(r4, r12);
4233  HS_CMP_XCHG(r8, r16);
4234  HS_CMP_XCHG(r4, r8);
4235  HS_CMP_XCHG(r12, r16);
4236  HS_CMP_XCHG(r2, r4);
4237  HS_CMP_XCHG(r6, r8);
4238  HS_CMP_XCHG(r10, r12);
4239  HS_CMP_XCHG(r14, r16);
4240  HS_CMP_XCHG(r1, r2);
4241  HS_CMP_XCHG(r3, r4);
4242  HS_CMP_XCHG(r5, r6);
4243  HS_CMP_XCHG(r7, r8);
4244  HS_CMP_XCHG(r9, r10);
4245  HS_CMP_XCHG(r11, r12);
4246  HS_CMP_XCHG(r13, r14);
4247  HS_CMP_XCHG(r15, r16);
  // Store every key back to the index it was loaded from.
4248  HS_XM_GLOBAL_STORE_L(0, r1);
4249  HS_XM_GLOBAL_STORE_L(1, r2);
4250  HS_XM_GLOBAL_STORE_L(2, r3);
4251  HS_XM_GLOBAL_STORE_L(3, r4);
4252  HS_XM_GLOBAL_STORE_L(4, r5);
4253  HS_XM_GLOBAL_STORE_L(5, r6);
4254  HS_XM_GLOBAL_STORE_L(6, r7);
4255  HS_XM_GLOBAL_STORE_L(7, r8);
4256  HS_XM_GLOBAL_STORE_L(8, r9);
4257  HS_XM_GLOBAL_STORE_L(9, r10);
4258  HS_XM_GLOBAL_STORE_L(10, r11);
4259  HS_XM_GLOBAL_STORE_L(11, r12);
4260  HS_XM_GLOBAL_STORE_L(12, r13);
4261  HS_XM_GLOBAL_STORE_L(13, r14);
4262  HS_XM_GLOBAL_STORE_L(14, r15);
4263  HS_XM_GLOBAL_STORE_L(15, r16);
4264  HS_FM_GLOBAL_STORE_R(0, r17);
4265}
4266
//
// Kernel declared by HS_FM_KERNEL_PROTO(1, 1).
//
// Loads 16 "left" keys (r1..r16) with HS_XM_GLOBAL_LOAD_L and 2 "right" keys
// (r17, r18) with HS_FM_GLOBAL_LOAD_R, compare-exchanges the last left keys
// against the right keys in mirrored order, runs the fixed 16-register
// compare-exchange sequence over the left side, compare-exchanges the two
// right keys and stores every key back to the index it was loaded from.
//
// NOTE(review): presumably a "flip merge" variant -- inferred from the macro
// names; the macros are defined outside this chunk.
//
4267HS_FM_KERNEL_PROTO(1, 1)
4268{
4269  HS_FM_PREAMBLE(16);
  // Left keys: indices 0..15.
4270  HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0);
4271  HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1);
4272  HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2);
4273  HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3);
4274  HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4);
4275  HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5);
4276  HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6);
4277  HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7);
4278  HS_KEY_TYPE r9 = HS_XM_GLOBAL_LOAD_L(8);
4279  HS_KEY_TYPE r10 = HS_XM_GLOBAL_LOAD_L(9);
4280  HS_KEY_TYPE r11 = HS_XM_GLOBAL_LOAD_L(10);
4281  HS_KEY_TYPE r12 = HS_XM_GLOBAL_LOAD_L(11);
4282  HS_KEY_TYPE r13 = HS_XM_GLOBAL_LOAD_L(12);
4283  HS_KEY_TYPE r14 = HS_XM_GLOBAL_LOAD_L(13);
4284  HS_KEY_TYPE r15 = HS_XM_GLOBAL_LOAD_L(14);
4285  HS_KEY_TYPE r16 = HS_XM_GLOBAL_LOAD_L(15);
  // Right keys: indices 0..1.
4286  HS_KEY_TYPE r17 = HS_FM_GLOBAL_LOAD_R(0);
4287  HS_KEY_TYPE r18 = HS_FM_GLOBAL_LOAD_R(1);
  // Cross step, mirrored pairing: r16 with r17, r15 with r18.
4288  HS_CMP_XCHG(r16, r17);
4289  HS_CMP_XCHG(r15, r18);
  // Left side: odd-numbered registers (distances 8, 4, 2), then
  // even-numbered registers, then adjacent pairs.
4290  HS_CMP_XCHG(r1, r9);
4291  HS_CMP_XCHG(r5, r13);
4292  HS_CMP_XCHG(r1, r5);
4293  HS_CMP_XCHG(r9, r13);
4294  HS_CMP_XCHG(r3, r11);
4295  HS_CMP_XCHG(r7, r15);
4296  HS_CMP_XCHG(r3, r7);
4297  HS_CMP_XCHG(r11, r15);
4298  HS_CMP_XCHG(r1, r3);
4299  HS_CMP_XCHG(r5, r7);
4300  HS_CMP_XCHG(r9, r11);
4301  HS_CMP_XCHG(r13, r15);
4302  HS_CMP_XCHG(r2, r10);
4303  HS_CMP_XCHG(r6, r14);
4304  HS_CMP_XCHG(r2, r6);
4305  HS_CMP_XCHG(r10, r14);
4306  HS_CMP_XCHG(r4, r12);
4307  HS_CMP_XCHG(r8, r16);
4308  HS_CMP_XCHG(r4, r8);
4309  HS_CMP_XCHG(r12, r16);
4310  HS_CMP_XCHG(r2, r4);
4311  HS_CMP_XCHG(r6, r8);
4312  HS_CMP_XCHG(r10, r12);
4313  HS_CMP_XCHG(r14, r16);
4314  HS_CMP_XCHG(r1, r2);
4315  HS_CMP_XCHG(r3, r4);
4316  HS_CMP_XCHG(r5, r6);
4317  HS_CMP_XCHG(r7, r8);
4318  HS_CMP_XCHG(r9, r10);
4319  HS_CMP_XCHG(r11, r12);
4320  HS_CMP_XCHG(r13, r14);
4321  HS_CMP_XCHG(r15, r16);
  // Right side: the single pair.
4322  HS_CMP_XCHG(r17, r18);
  // Store every key back to the index it was loaded from.
4323  HS_XM_GLOBAL_STORE_L(0, r1);
4324  HS_XM_GLOBAL_STORE_L(1, r2);
4325  HS_XM_GLOBAL_STORE_L(2, r3);
4326  HS_XM_GLOBAL_STORE_L(3, r4);
4327  HS_XM_GLOBAL_STORE_L(4, r5);
4328  HS_XM_GLOBAL_STORE_L(5, r6);
4329  HS_XM_GLOBAL_STORE_L(6, r7);
4330  HS_XM_GLOBAL_STORE_L(7, r8);
4331  HS_XM_GLOBAL_STORE_L(8, r9);
4332  HS_XM_GLOBAL_STORE_L(9, r10);
4333  HS_XM_GLOBAL_STORE_L(10, r11);
4334  HS_XM_GLOBAL_STORE_L(11, r12);
4335  HS_XM_GLOBAL_STORE_L(12, r13);
4336  HS_XM_GLOBAL_STORE_L(13, r14);
4337  HS_XM_GLOBAL_STORE_L(14, r15);
4338  HS_XM_GLOBAL_STORE_L(15, r16);
4339  HS_FM_GLOBAL_STORE_R(0, r17);
4340  HS_FM_GLOBAL_STORE_R(1, r18);
4341}
4342
//
// Kernel declared by HS_FM_KERNEL_PROTO(1, 2).
//
// Loads 16 "left" keys (r1..r16) with HS_XM_GLOBAL_LOAD_L and 4 "right" keys
// (r17..r20) with HS_FM_GLOBAL_LOAD_R, compare-exchanges the last four left
// keys against the right keys in mirrored order, runs the fixed 16-register
// compare-exchange sequence over the left side and a 4-register sequence
// over the right side, then stores every key back to the index it was
// loaded from.
//
// NOTE(review): presumably a "flip merge" variant -- inferred from the macro
// names; the macros are defined outside this chunk.
//
4343HS_FM_KERNEL_PROTO(1, 2)
4344{
4345  HS_FM_PREAMBLE(16);
  // Left keys: indices 0..15.
4346  HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0);
4347  HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1);
4348  HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2);
4349  HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3);
4350  HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4);
4351  HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5);
4352  HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6);
4353  HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7);
4354  HS_KEY_TYPE r9 = HS_XM_GLOBAL_LOAD_L(8);
4355  HS_KEY_TYPE r10 = HS_XM_GLOBAL_LOAD_L(9);
4356  HS_KEY_TYPE r11 = HS_XM_GLOBAL_LOAD_L(10);
4357  HS_KEY_TYPE r12 = HS_XM_GLOBAL_LOAD_L(11);
4358  HS_KEY_TYPE r13 = HS_XM_GLOBAL_LOAD_L(12);
4359  HS_KEY_TYPE r14 = HS_XM_GLOBAL_LOAD_L(13);
4360  HS_KEY_TYPE r15 = HS_XM_GLOBAL_LOAD_L(14);
4361  HS_KEY_TYPE r16 = HS_XM_GLOBAL_LOAD_L(15);
  // Right keys: indices 0..3.
4362  HS_KEY_TYPE r17 = HS_FM_GLOBAL_LOAD_R(0);
4363  HS_KEY_TYPE r18 = HS_FM_GLOBAL_LOAD_R(1);
4364  HS_KEY_TYPE r19 = HS_FM_GLOBAL_LOAD_R(2);
4365  HS_KEY_TYPE r20 = HS_FM_GLOBAL_LOAD_R(3);
  // Cross step, mirrored pairing: r16 with r17 down to r13 with r20.
4366  HS_CMP_XCHG(r16, r17);
4367  HS_CMP_XCHG(r15, r18);
4368  HS_CMP_XCHG(r14, r19);
4369  HS_CMP_XCHG(r13, r20);
  // Left side: odd-numbered registers (distances 8, 4, 2), then
  // even-numbered registers, then adjacent pairs.
4370  HS_CMP_XCHG(r1, r9);
4371  HS_CMP_XCHG(r5, r13);
4372  HS_CMP_XCHG(r1, r5);
4373  HS_CMP_XCHG(r9, r13);
4374  HS_CMP_XCHG(r3, r11);
4375  HS_CMP_XCHG(r7, r15);
4376  HS_CMP_XCHG(r3, r7);
4377  HS_CMP_XCHG(r11, r15);
4378  HS_CMP_XCHG(r1, r3);
4379  HS_CMP_XCHG(r5, r7);
4380  HS_CMP_XCHG(r9, r11);
4381  HS_CMP_XCHG(r13, r15);
4382  HS_CMP_XCHG(r2, r10);
4383  HS_CMP_XCHG(r6, r14);
4384  HS_CMP_XCHG(r2, r6);
4385  HS_CMP_XCHG(r10, r14);
4386  HS_CMP_XCHG(r4, r12);
4387  HS_CMP_XCHG(r8, r16);
4388  HS_CMP_XCHG(r4, r8);
4389  HS_CMP_XCHG(r12, r16);
4390  HS_CMP_XCHG(r2, r4);
4391  HS_CMP_XCHG(r6, r8);
4392  HS_CMP_XCHG(r10, r12);
4393  HS_CMP_XCHG(r14, r16);
4394  HS_CMP_XCHG(r1, r2);
4395  HS_CMP_XCHG(r3, r4);
4396  HS_CMP_XCHG(r5, r6);
4397  HS_CMP_XCHG(r7, r8);
4398  HS_CMP_XCHG(r9, r10);
4399  HS_CMP_XCHG(r11, r12);
4400  HS_CMP_XCHG(r13, r14);
4401  HS_CMP_XCHG(r15, r16);
  // Right side: distance-2 pairs, then adjacent pairs.
4402  HS_CMP_XCHG(r17, r19);
4403  HS_CMP_XCHG(r18, r20);
4404  HS_CMP_XCHG(r17, r18);
4405  HS_CMP_XCHG(r19, r20);
  // Store every key back to the index it was loaded from.
4406  HS_XM_GLOBAL_STORE_L(0, r1);
4407  HS_XM_GLOBAL_STORE_L(1, r2);
4408  HS_XM_GLOBAL_STORE_L(2, r3);
4409  HS_XM_GLOBAL_STORE_L(3, r4);
4410  HS_XM_GLOBAL_STORE_L(4, r5);
4411  HS_XM_GLOBAL_STORE_L(5, r6);
4412  HS_XM_GLOBAL_STORE_L(6, r7);
4413  HS_XM_GLOBAL_STORE_L(7, r8);
4414  HS_XM_GLOBAL_STORE_L(8, r9);
4415  HS_XM_GLOBAL_STORE_L(9, r10);
4416  HS_XM_GLOBAL_STORE_L(10, r11);
4417  HS_XM_GLOBAL_STORE_L(11, r12);
4418  HS_XM_GLOBAL_STORE_L(12, r13);
4419  HS_XM_GLOBAL_STORE_L(13, r14);
4420  HS_XM_GLOBAL_STORE_L(14, r15);
4421  HS_XM_GLOBAL_STORE_L(15, r16);
4422  HS_FM_GLOBAL_STORE_R(0, r17);
4423  HS_FM_GLOBAL_STORE_R(1, r18);
4424  HS_FM_GLOBAL_STORE_R(2, r19);
4425  HS_FM_GLOBAL_STORE_R(3, r20);
4426}
4427
//
// Kernel declared by HS_FM_KERNEL_PROTO(1, 3).
//
// Loads 16 "left" keys (r1..r16) with HS_XM_GLOBAL_LOAD_L and 8 "right" keys
// (r17..r24) with HS_FM_GLOBAL_LOAD_R, compare-exchanges the last eight left
// keys against the right keys in mirrored order, runs the fixed 16-register
// compare-exchange sequence over the left side and an 8-register sequence
// over the right side, then stores every key back to the index it was
// loaded from.
//
// NOTE(review): presumably a "flip merge" variant -- inferred from the macro
// names; the macros are defined outside this chunk.
//
4428HS_FM_KERNEL_PROTO(1, 3)
4429{
4430  HS_FM_PREAMBLE(16);
  // Left keys: indices 0..15.
4431  HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0);
4432  HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1);
4433  HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2);
4434  HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3);
4435  HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4);
4436  HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5);
4437  HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6);
4438  HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7);
4439  HS_KEY_TYPE r9 = HS_XM_GLOBAL_LOAD_L(8);
4440  HS_KEY_TYPE r10 = HS_XM_GLOBAL_LOAD_L(9);
4441  HS_KEY_TYPE r11 = HS_XM_GLOBAL_LOAD_L(10);
4442  HS_KEY_TYPE r12 = HS_XM_GLOBAL_LOAD_L(11);
4443  HS_KEY_TYPE r13 = HS_XM_GLOBAL_LOAD_L(12);
4444  HS_KEY_TYPE r14 = HS_XM_GLOBAL_LOAD_L(13);
4445  HS_KEY_TYPE r15 = HS_XM_GLOBAL_LOAD_L(14);
4446  HS_KEY_TYPE r16 = HS_XM_GLOBAL_LOAD_L(15);
  // Right keys: indices 0..7.
4447  HS_KEY_TYPE r17 = HS_FM_GLOBAL_LOAD_R(0);
4448  HS_KEY_TYPE r18 = HS_FM_GLOBAL_LOAD_R(1);
4449  HS_KEY_TYPE r19 = HS_FM_GLOBAL_LOAD_R(2);
4450  HS_KEY_TYPE r20 = HS_FM_GLOBAL_LOAD_R(3);
4451  HS_KEY_TYPE r21 = HS_FM_GLOBAL_LOAD_R(4);
4452  HS_KEY_TYPE r22 = HS_FM_GLOBAL_LOAD_R(5);
4453  HS_KEY_TYPE r23 = HS_FM_GLOBAL_LOAD_R(6);
4454  HS_KEY_TYPE r24 = HS_FM_GLOBAL_LOAD_R(7);
  // Cross step, mirrored pairing: r16 with r17 down to r9 with r24.
4455  HS_CMP_XCHG(r16, r17);
4456  HS_CMP_XCHG(r15, r18);
4457  HS_CMP_XCHG(r14, r19);
4458  HS_CMP_XCHG(r13, r20);
4459  HS_CMP_XCHG(r12, r21);
4460  HS_CMP_XCHG(r11, r22);
4461  HS_CMP_XCHG(r10, r23);
4462  HS_CMP_XCHG(r9, r24);
  // Left side: odd-numbered registers (distances 8, 4, 2), then
  // even-numbered registers, then adjacent pairs.
4463  HS_CMP_XCHG(r1, r9);
4464  HS_CMP_XCHG(r5, r13);
4465  HS_CMP_XCHG(r1, r5);
4466  HS_CMP_XCHG(r9, r13);
4467  HS_CMP_XCHG(r3, r11);
4468  HS_CMP_XCHG(r7, r15);
4469  HS_CMP_XCHG(r3, r7);
4470  HS_CMP_XCHG(r11, r15);
4471  HS_CMP_XCHG(r1, r3);
4472  HS_CMP_XCHG(r5, r7);
4473  HS_CMP_XCHG(r9, r11);
4474  HS_CMP_XCHG(r13, r15);
4475  HS_CMP_XCHG(r2, r10);
4476  HS_CMP_XCHG(r6, r14);
4477  HS_CMP_XCHG(r2, r6);
4478  HS_CMP_XCHG(r10, r14);
4479  HS_CMP_XCHG(r4, r12);
4480  HS_CMP_XCHG(r8, r16);
4481  HS_CMP_XCHG(r4, r8);
4482  HS_CMP_XCHG(r12, r16);
4483  HS_CMP_XCHG(r2, r4);
4484  HS_CMP_XCHG(r6, r8);
4485  HS_CMP_XCHG(r10, r12);
4486  HS_CMP_XCHG(r14, r16);
4487  HS_CMP_XCHG(r1, r2);
4488  HS_CMP_XCHG(r3, r4);
4489  HS_CMP_XCHG(r5, r6);
4490  HS_CMP_XCHG(r7, r8);
4491  HS_CMP_XCHG(r9, r10);
4492  HS_CMP_XCHG(r11, r12);
4493  HS_CMP_XCHG(r13, r14);
4494  HS_CMP_XCHG(r15, r16);
  // Right side: odd-numbered registers (distances 4, 2), then even-numbered
  // registers, then adjacent pairs.
4495  HS_CMP_XCHG(r17, r21);
4496  HS_CMP_XCHG(r19, r23);
4497  HS_CMP_XCHG(r17, r19);
4498  HS_CMP_XCHG(r21, r23);
4499  HS_CMP_XCHG(r18, r22);
4500  HS_CMP_XCHG(r20, r24);
4501  HS_CMP_XCHG(r18, r20);
4502  HS_CMP_XCHG(r22, r24);
4503  HS_CMP_XCHG(r17, r18);
4504  HS_CMP_XCHG(r19, r20);
4505  HS_CMP_XCHG(r21, r22);
4506  HS_CMP_XCHG(r23, r24);
  // Store every key back to the index it was loaded from.
4507  HS_XM_GLOBAL_STORE_L(0, r1);
4508  HS_XM_GLOBAL_STORE_L(1, r2);
4509  HS_XM_GLOBAL_STORE_L(2, r3);
4510  HS_XM_GLOBAL_STORE_L(3, r4);
4511  HS_XM_GLOBAL_STORE_L(4, r5);
4512  HS_XM_GLOBAL_STORE_L(5, r6);
4513  HS_XM_GLOBAL_STORE_L(6, r7);
4514  HS_XM_GLOBAL_STORE_L(7, r8);
4515  HS_XM_GLOBAL_STORE_L(8, r9);
4516  HS_XM_GLOBAL_STORE_L(9, r10);
4517  HS_XM_GLOBAL_STORE_L(10, r11);
4518  HS_XM_GLOBAL_STORE_L(11, r12);
4519  HS_XM_GLOBAL_STORE_L(12, r13);
4520  HS_XM_GLOBAL_STORE_L(13, r14);
4521  HS_XM_GLOBAL_STORE_L(14, r15);
4522  HS_XM_GLOBAL_STORE_L(15, r16);
4523  HS_FM_GLOBAL_STORE_R(0, r17);
4524  HS_FM_GLOBAL_STORE_R(1, r18);
4525  HS_FM_GLOBAL_STORE_R(2, r19);
4526  HS_FM_GLOBAL_STORE_R(3, r20);
4527  HS_FM_GLOBAL_STORE_R(4, r21);
4528  HS_FM_GLOBAL_STORE_R(5, r22);
4529  HS_FM_GLOBAL_STORE_R(6, r23);
4530  HS_FM_GLOBAL_STORE_R(7, r24);
4531}
4532
//
// Kernel declared by HS_FM_KERNEL_PROTO(1, 4).
//
// Loads 16 "left" keys (r1..r16) with HS_XM_GLOBAL_LOAD_L and 16 "right"
// keys (r17..r32) with HS_FM_GLOBAL_LOAD_R, compare-exchanges every left key
// against a right key in mirrored order, runs the fixed 16-register
// compare-exchange sequence over each side independently and stores every
// key back to the index it was loaded from.
//
// NOTE(review): presumably the full-width "flip merge" variant -- inferred
// from the macro names; the macros are defined outside this chunk.
//
4533HS_FM_KERNEL_PROTO(1, 4)
4534{
4535  HS_FM_PREAMBLE(16);
  // Left keys: indices 0..15.
4536  HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0);
4537  HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1);
4538  HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2);
4539  HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3);
4540  HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4);
4541  HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5);
4542  HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6);
4543  HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7);
4544  HS_KEY_TYPE r9 = HS_XM_GLOBAL_LOAD_L(8);
4545  HS_KEY_TYPE r10 = HS_XM_GLOBAL_LOAD_L(9);
4546  HS_KEY_TYPE r11 = HS_XM_GLOBAL_LOAD_L(10);
4547  HS_KEY_TYPE r12 = HS_XM_GLOBAL_LOAD_L(11);
4548  HS_KEY_TYPE r13 = HS_XM_GLOBAL_LOAD_L(12);
4549  HS_KEY_TYPE r14 = HS_XM_GLOBAL_LOAD_L(13);
4550  HS_KEY_TYPE r15 = HS_XM_GLOBAL_LOAD_L(14);
4551  HS_KEY_TYPE r16 = HS_XM_GLOBAL_LOAD_L(15);
  // Right keys: indices 0..15.
4552  HS_KEY_TYPE r17 = HS_FM_GLOBAL_LOAD_R(0);
4553  HS_KEY_TYPE r18 = HS_FM_GLOBAL_LOAD_R(1);
4554  HS_KEY_TYPE r19 = HS_FM_GLOBAL_LOAD_R(2);
4555  HS_KEY_TYPE r20 = HS_FM_GLOBAL_LOAD_R(3);
4556  HS_KEY_TYPE r21 = HS_FM_GLOBAL_LOAD_R(4);
4557  HS_KEY_TYPE r22 = HS_FM_GLOBAL_LOAD_R(5);
4558  HS_KEY_TYPE r23 = HS_FM_GLOBAL_LOAD_R(6);
4559  HS_KEY_TYPE r24 = HS_FM_GLOBAL_LOAD_R(7);
4560  HS_KEY_TYPE r25 = HS_FM_GLOBAL_LOAD_R(8);
4561  HS_KEY_TYPE r26 = HS_FM_GLOBAL_LOAD_R(9);
4562  HS_KEY_TYPE r27 = HS_FM_GLOBAL_LOAD_R(10);
4563  HS_KEY_TYPE r28 = HS_FM_GLOBAL_LOAD_R(11);
4564  HS_KEY_TYPE r29 = HS_FM_GLOBAL_LOAD_R(12);
4565  HS_KEY_TYPE r30 = HS_FM_GLOBAL_LOAD_R(13);
4566  HS_KEY_TYPE r31 = HS_FM_GLOBAL_LOAD_R(14);
4567  HS_KEY_TYPE r32 = HS_FM_GLOBAL_LOAD_R(15);
  // Cross step, mirrored pairing: r16 with r17 down to r1 with r32.
4568  HS_CMP_XCHG(r16, r17);
4569  HS_CMP_XCHG(r15, r18);
4570  HS_CMP_XCHG(r14, r19);
4571  HS_CMP_XCHG(r13, r20);
4572  HS_CMP_XCHG(r12, r21);
4573  HS_CMP_XCHG(r11, r22);
4574  HS_CMP_XCHG(r10, r23);
4575  HS_CMP_XCHG(r9, r24);
4576  HS_CMP_XCHG(r8, r25);
4577  HS_CMP_XCHG(r7, r26);
4578  HS_CMP_XCHG(r6, r27);
4579  HS_CMP_XCHG(r5, r28);
4580  HS_CMP_XCHG(r4, r29);
4581  HS_CMP_XCHG(r3, r30);
4582  HS_CMP_XCHG(r2, r31);
4583  HS_CMP_XCHG(r1, r32);
  // Left side: odd-numbered registers (distances 8, 4, 2), then
  // even-numbered registers, then adjacent pairs.
4584  HS_CMP_XCHG(r1, r9);
4585  HS_CMP_XCHG(r5, r13);
4586  HS_CMP_XCHG(r1, r5);
4587  HS_CMP_XCHG(r9, r13);
4588  HS_CMP_XCHG(r3, r11);
4589  HS_CMP_XCHG(r7, r15);
4590  HS_CMP_XCHG(r3, r7);
4591  HS_CMP_XCHG(r11, r15);
4592  HS_CMP_XCHG(r1, r3);
4593  HS_CMP_XCHG(r5, r7);
4594  HS_CMP_XCHG(r9, r11);
4595  HS_CMP_XCHG(r13, r15);
4596  HS_CMP_XCHG(r2, r10);
4597  HS_CMP_XCHG(r6, r14);
4598  HS_CMP_XCHG(r2, r6);
4599  HS_CMP_XCHG(r10, r14);
4600  HS_CMP_XCHG(r4, r12);
4601  HS_CMP_XCHG(r8, r16);
4602  HS_CMP_XCHG(r4, r8);
4603  HS_CMP_XCHG(r12, r16);
4604  HS_CMP_XCHG(r2, r4);
4605  HS_CMP_XCHG(r6, r8);
4606  HS_CMP_XCHG(r10, r12);
4607  HS_CMP_XCHG(r14, r16);
4608  HS_CMP_XCHG(r1, r2);
4609  HS_CMP_XCHG(r3, r4);
4610  HS_CMP_XCHG(r5, r6);
4611  HS_CMP_XCHG(r7, r8);
4612  HS_CMP_XCHG(r9, r10);
4613  HS_CMP_XCHG(r11, r12);
4614  HS_CMP_XCHG(r13, r14);
4615  HS_CMP_XCHG(r15, r16);
  // Right side: the same sequence with every register number shifted by 16.
4616  HS_CMP_XCHG(r17, r25);
4617  HS_CMP_XCHG(r21, r29);
4618  HS_CMP_XCHG(r17, r21);
4619  HS_CMP_XCHG(r25, r29);
4620  HS_CMP_XCHG(r19, r27);
4621  HS_CMP_XCHG(r23, r31);
4622  HS_CMP_XCHG(r19, r23);
4623  HS_CMP_XCHG(r27, r31);
4624  HS_CMP_XCHG(r17, r19);
4625  HS_CMP_XCHG(r21, r23);
4626  HS_CMP_XCHG(r25, r27);
4627  HS_CMP_XCHG(r29, r31);
4628  HS_CMP_XCHG(r18, r26);
4629  HS_CMP_XCHG(r22, r30);
4630  HS_CMP_XCHG(r18, r22);
4631  HS_CMP_XCHG(r26, r30);
4632  HS_CMP_XCHG(r20, r28);
4633  HS_CMP_XCHG(r24, r32);
4634  HS_CMP_XCHG(r20, r24);
4635  HS_CMP_XCHG(r28, r32);
4636  HS_CMP_XCHG(r18, r20);
4637  HS_CMP_XCHG(r22, r24);
4638  HS_CMP_XCHG(r26, r28);
4639  HS_CMP_XCHG(r30, r32);
4640  HS_CMP_XCHG(r17, r18);
4641  HS_CMP_XCHG(r19, r20);
4642  HS_CMP_XCHG(r21, r22);
4643  HS_CMP_XCHG(r23, r24);
4644  HS_CMP_XCHG(r25, r26);
4645  HS_CMP_XCHG(r27, r28);
4646  HS_CMP_XCHG(r29, r30);
4647  HS_CMP_XCHG(r31, r32);
  // Store every key back to the index it was loaded from.
4648  HS_XM_GLOBAL_STORE_L(0, r1);
4649  HS_XM_GLOBAL_STORE_L(1, r2);
4650  HS_XM_GLOBAL_STORE_L(2, r3);
4651  HS_XM_GLOBAL_STORE_L(3, r4);
4652  HS_XM_GLOBAL_STORE_L(4, r5);
4653  HS_XM_GLOBAL_STORE_L(5, r6);
4654  HS_XM_GLOBAL_STORE_L(6, r7);
4655  HS_XM_GLOBAL_STORE_L(7, r8);
4656  HS_XM_GLOBAL_STORE_L(8, r9);
4657  HS_XM_GLOBAL_STORE_L(9, r10);
4658  HS_XM_GLOBAL_STORE_L(10, r11);
4659  HS_XM_GLOBAL_STORE_L(11, r12);
4660  HS_XM_GLOBAL_STORE_L(12, r13);
4661  HS_XM_GLOBAL_STORE_L(13, r14);
4662  HS_XM_GLOBAL_STORE_L(14, r15);
4663  HS_XM_GLOBAL_STORE_L(15, r16);
4664  HS_FM_GLOBAL_STORE_R(0, r17);
4665  HS_FM_GLOBAL_STORE_R(1, r18);
4666  HS_FM_GLOBAL_STORE_R(2, r19);
4667  HS_FM_GLOBAL_STORE_R(3, r20);
4668  HS_FM_GLOBAL_STORE_R(4, r21);
4669  HS_FM_GLOBAL_STORE_R(5, r22);
4670  HS_FM_GLOBAL_STORE_R(6, r23);
4671  HS_FM_GLOBAL_STORE_R(7, r24);
4672  HS_FM_GLOBAL_STORE_R(8, r25);
4673  HS_FM_GLOBAL_STORE_R(9, r26);
4674  HS_FM_GLOBAL_STORE_R(10, r27);
4675  HS_FM_GLOBAL_STORE_R(11, r28);
4676  HS_FM_GLOBAL_STORE_R(12, r29);
4677  HS_FM_GLOBAL_STORE_R(13, r30);
4678  HS_FM_GLOBAL_STORE_R(14, r31);
4679  HS_FM_GLOBAL_STORE_R(15, r32);
4680}
4681
//
// Kernel declared by HS_HM_KERNEL_PROTO(1).
//
// Loads 32 keys (r1..r32) with HS_XM_GLOBAL_LOAD_L at indices 0..31, runs a
// fixed 32-register compare-exchange sequence over them and stores every key
// back to the index it was loaded from. Unlike the HS_FM kernels there is no
// mirrored cross step and no HS_FM_GLOBAL_LOAD_R.
//
// NOTE(review): presumably a "half merge" stage -- inferred from the macro
// names; the macros are defined outside this chunk.
//
4682HS_HM_KERNEL_PROTO(1)
4683{
4684  HS_HM_PREAMBLE(16);
  // Load 32 keys: indices 0..31.
4685  HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0);
4686  HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1);
4687  HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2);
4688  HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3);
4689  HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4);
4690  HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5);
4691  HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6);
4692  HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7);
4693  HS_KEY_TYPE r9 = HS_XM_GLOBAL_LOAD_L(8);
4694  HS_KEY_TYPE r10 = HS_XM_GLOBAL_LOAD_L(9);
4695  HS_KEY_TYPE r11 = HS_XM_GLOBAL_LOAD_L(10);
4696  HS_KEY_TYPE r12 = HS_XM_GLOBAL_LOAD_L(11);
4697  HS_KEY_TYPE r13 = HS_XM_GLOBAL_LOAD_L(12);
4698  HS_KEY_TYPE r14 = HS_XM_GLOBAL_LOAD_L(13);
4699  HS_KEY_TYPE r15 = HS_XM_GLOBAL_LOAD_L(14);
4700  HS_KEY_TYPE r16 = HS_XM_GLOBAL_LOAD_L(15);
4701  HS_KEY_TYPE r17 = HS_XM_GLOBAL_LOAD_L(16);
4702  HS_KEY_TYPE r18 = HS_XM_GLOBAL_LOAD_L(17);
4703  HS_KEY_TYPE r19 = HS_XM_GLOBAL_LOAD_L(18);
4704  HS_KEY_TYPE r20 = HS_XM_GLOBAL_LOAD_L(19);
4705  HS_KEY_TYPE r21 = HS_XM_GLOBAL_LOAD_L(20);
4706  HS_KEY_TYPE r22 = HS_XM_GLOBAL_LOAD_L(21);
4707  HS_KEY_TYPE r23 = HS_XM_GLOBAL_LOAD_L(22);
4708  HS_KEY_TYPE r24 = HS_XM_GLOBAL_LOAD_L(23);
4709  HS_KEY_TYPE r25 = HS_XM_GLOBAL_LOAD_L(24);
4710  HS_KEY_TYPE r26 = HS_XM_GLOBAL_LOAD_L(25);
4711  HS_KEY_TYPE r27 = HS_XM_GLOBAL_LOAD_L(26);
4712  HS_KEY_TYPE r28 = HS_XM_GLOBAL_LOAD_L(27);
4713  HS_KEY_TYPE r29 = HS_XM_GLOBAL_LOAD_L(28);
4714  HS_KEY_TYPE r30 = HS_XM_GLOBAL_LOAD_L(29);
4715  HS_KEY_TYPE r31 = HS_XM_GLOBAL_LOAD_L(30);
4716  HS_KEY_TYPE r32 = HS_XM_GLOBAL_LOAD_L(31);
  // Compare-exchanges among the odd-numbered registers (distances 16, 8, 4
  // and 2).
4717  HS_CMP_XCHG(r1, r17);
4718  HS_CMP_XCHG(r9, r25);
4719  HS_CMP_XCHG(r1, r9);
4720  HS_CMP_XCHG(r17, r25);
4721  HS_CMP_XCHG(r5, r21);
4722  HS_CMP_XCHG(r13, r29);
4723  HS_CMP_XCHG(r5, r13);
4724  HS_CMP_XCHG(r21, r29);
4725  HS_CMP_XCHG(r1, r5);
4726  HS_CMP_XCHG(r9, r13);
4727  HS_CMP_XCHG(r17, r21);
4728  HS_CMP_XCHG(r25, r29);
4729  HS_CMP_XCHG(r3, r19);
4730  HS_CMP_XCHG(r11, r27);
4731  HS_CMP_XCHG(r3, r11);
4732  HS_CMP_XCHG(r19, r27);
4733  HS_CMP_XCHG(r7, r23);
4734  HS_CMP_XCHG(r15, r31);
4735  HS_CMP_XCHG(r7, r15);
4736  HS_CMP_XCHG(r23, r31);
4737  HS_CMP_XCHG(r3, r7);
4738  HS_CMP_XCHG(r11, r15);
4739  HS_CMP_XCHG(r19, r23);
4740  HS_CMP_XCHG(r27, r31);
4741  HS_CMP_XCHG(r1, r3);
4742  HS_CMP_XCHG(r5, r7);
4743  HS_CMP_XCHG(r9, r11);
4744  HS_CMP_XCHG(r13, r15);
4745  HS_CMP_XCHG(r17, r19);
4746  HS_CMP_XCHG(r21, r23);
4747  HS_CMP_XCHG(r25, r27);
4748  HS_CMP_XCHG(r29, r31);
  // Same pattern among the even-numbered registers.
4749  HS_CMP_XCHG(r2, r18);
4750  HS_CMP_XCHG(r10, r26);
4751  HS_CMP_XCHG(r2, r10);
4752  HS_CMP_XCHG(r18, r26);
4753  HS_CMP_XCHG(r6, r22);
4754  HS_CMP_XCHG(r14, r30);
4755  HS_CMP_XCHG(r6, r14);
4756  HS_CMP_XCHG(r22, r30);
4757  HS_CMP_XCHG(r2, r6);
4758  HS_CMP_XCHG(r10, r14);
4759  HS_CMP_XCHG(r18, r22);
4760  HS_CMP_XCHG(r26, r30);
4761  HS_CMP_XCHG(r4, r20);
4762  HS_CMP_XCHG(r12, r28);
4763  HS_CMP_XCHG(r4, r12);
4764  HS_CMP_XCHG(r20, r28);
4765  HS_CMP_XCHG(r8, r24);
4766  HS_CMP_XCHG(r16, r32);
4767  HS_CMP_XCHG(r8, r16);
4768  HS_CMP_XCHG(r24, r32);
4769  HS_CMP_XCHG(r4, r8);
4770  HS_CMP_XCHG(r12, r16);
4771  HS_CMP_XCHG(r20, r24);
4772  HS_CMP_XCHG(r28, r32);
4773  HS_CMP_XCHG(r2, r4);
4774  HS_CMP_XCHG(r6, r8);
4775  HS_CMP_XCHG(r10, r12);
4776  HS_CMP_XCHG(r14, r16);
4777  HS_CMP_XCHG(r18, r20);
4778  HS_CMP_XCHG(r22, r24);
4779  HS_CMP_XCHG(r26, r28);
4780  HS_CMP_XCHG(r30, r32);
  // Final step: adjacent pairs.
4781  HS_CMP_XCHG(r1, r2);
4782  HS_CMP_XCHG(r3, r4);
4783  HS_CMP_XCHG(r5, r6);
4784  HS_CMP_XCHG(r7, r8);
4785  HS_CMP_XCHG(r9, r10);
4786  HS_CMP_XCHG(r11, r12);
4787  HS_CMP_XCHG(r13, r14);
4788  HS_CMP_XCHG(r15, r16);
4789  HS_CMP_XCHG(r17, r18);
4790  HS_CMP_XCHG(r19, r20);
4791  HS_CMP_XCHG(r21, r22);
4792  HS_CMP_XCHG(r23, r24);
4793  HS_CMP_XCHG(r25, r26);
4794  HS_CMP_XCHG(r27, r28);
4795  HS_CMP_XCHG(r29, r30);
4796  HS_CMP_XCHG(r31, r32);
  // Store every key back to the index it was loaded from.
4797  HS_XM_GLOBAL_STORE_L(0, r1);
4798  HS_XM_GLOBAL_STORE_L(1, r2);
4799  HS_XM_GLOBAL_STORE_L(2, r3);
4800  HS_XM_GLOBAL_STORE_L(3, r4);
4801  HS_XM_GLOBAL_STORE_L(4, r5);
4802  HS_XM_GLOBAL_STORE_L(5, r6);
4803  HS_XM_GLOBAL_STORE_L(6, r7);
4804  HS_XM_GLOBAL_STORE_L(7, r8);
4805  HS_XM_GLOBAL_STORE_L(8, r9);
4806  HS_XM_GLOBAL_STORE_L(9, r10);
4807  HS_XM_GLOBAL_STORE_L(10, r11);
4808  HS_XM_GLOBAL_STORE_L(11, r12);
4809  HS_XM_GLOBAL_STORE_L(12, r13);
4810  HS_XM_GLOBAL_STORE_L(13, r14);
4811  HS_XM_GLOBAL_STORE_L(14, r15);
4812  HS_XM_GLOBAL_STORE_L(15, r16);
4813  HS_XM_GLOBAL_STORE_L(16, r17);
4814  HS_XM_GLOBAL_STORE_L(17, r18);
4815  HS_XM_GLOBAL_STORE_L(18, r19);
4816  HS_XM_GLOBAL_STORE_L(19, r20);
4817  HS_XM_GLOBAL_STORE_L(20, r21);
4818  HS_XM_GLOBAL_STORE_L(21, r22);
4819  HS_XM_GLOBAL_STORE_L(22, r23);
4820  HS_XM_GLOBAL_STORE_L(23, r24);
4821  HS_XM_GLOBAL_STORE_L(24, r25);
4822  HS_XM_GLOBAL_STORE_L(25, r26);
4823  HS_XM_GLOBAL_STORE_L(26, r27);
4824  HS_XM_GLOBAL_STORE_L(27, r28);
4825  HS_XM_GLOBAL_STORE_L(28, r29);
4826  HS_XM_GLOBAL_STORE_L(29, r30);
4827  HS_XM_GLOBAL_STORE_L(30, r31);
4828  HS_XM_GLOBAL_STORE_L(31, r32);
4829}
4830
//
// Kernel declared by HS_TRANSPOSE_KERNEL_PROTO().
//
// Loads 16 keys (r1..r16) from `vout` with HS_SLAB_GLOBAL_LOAD at indices
// 0..15 and then invokes HS_TRANSPOSE_SLAB(). No store appears in this body,
// so any write-back must happen inside that macro.
//
// NOTE(review): HS_TRANSPOSE_SLAB() takes no arguments, so it presumably
// refers to r1..r16 by name -- do not rename these locals without checking
// the macro definition (outside this chunk).
//
4831HS_TRANSPOSE_KERNEL_PROTO()
4832{
4833  HS_SLAB_GLOBAL_PREAMBLE();
  // Source is `vout` (the other slab kernels visible in this file load from
  // `vin`).
4834  HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vout, 0);
4835  HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vout, 1);
4836  HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vout, 2);
4837  HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vout, 3);
4838  HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vout, 4);
4839  HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vout, 5);
4840  HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vout, 6);
4841  HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vout, 7);
4842  HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vout, 8);
4843  HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vout, 9);
4844  HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vout, 10);
4845  HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vout, 11);
4846  HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vout, 12);
4847  HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vout, 13);
4848  HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vout, 14);
4849  HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vout, 15);
  // The macro invocation carries no trailing semicolon in the original.
4850  HS_TRANSPOSE_SLAB()
4851}
4852
4853//
4854//
4855//
4856