1// 2// Copyright 2016 Google Inc. 3// 4// Use of this source code is governed by a BSD-style 5// license that can be found in the LICENSE file. 6// 7 8// target-specific config 9#include "hs_config.h" 10 11// arch/target-specific macros 12#include "hs_cl_macros.h" 13 14// 15// 16// 17 18HS_BS_KERNEL_PROTO(1, 0) 19{ 20 HS_SLAB_GLOBAL_PREAMBLE(); 21 HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0); 22 HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1); 23 HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2); 24 HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3); 25 HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4); 26 HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5); 27 HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6); 28 HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7); 29 HS_CMP_XCHG(r1, r5); 30 HS_CMP_XCHG(r2, r6); 31 HS_CMP_XCHG(r3, r7); 32 HS_CMP_XCHG(r4, r8); 33 HS_CMP_XCHG(r1, r3); 34 HS_CMP_XCHG(r2, r4); 35 HS_CMP_XCHG(r5, r7); 36 HS_CMP_XCHG(r6, r8); 37 HS_CMP_XCHG(r3, r5); 38 HS_CMP_XCHG(r4, r6); 39 HS_CMP_XCHG(r1, r2); 40 HS_CMP_XCHG(r3, r4); 41 HS_CMP_XCHG(r5, r6); 42 HS_CMP_XCHG(r7, r8); 43 HS_CMP_XCHG(r2, r5); 44 HS_CMP_XCHG(r4, r7); 45 HS_CMP_XCHG(r2, r3); 46 HS_CMP_XCHG(r4, r5); 47 HS_CMP_XCHG(r6, r7); 48 { 49 HS_SLAB_FLIP_PREAMBLE(1); 50 HS_CMP_FLIP(0, r1, r8); 51 HS_CMP_FLIP(1, r2, r7); 52 HS_CMP_FLIP(2, r3, r6); 53 HS_CMP_FLIP(3, r4, r5); 54 } 55 HS_CMP_XCHG(r1, r5); 56 HS_CMP_XCHG(r3, r7); 57 HS_CMP_XCHG(r1, r3); 58 HS_CMP_XCHG(r5, r7); 59 HS_CMP_XCHG(r2, r6); 60 HS_CMP_XCHG(r4, r8); 61 HS_CMP_XCHG(r2, r4); 62 HS_CMP_XCHG(r6, r8); 63 HS_CMP_XCHG(r1, r2); 64 HS_CMP_XCHG(r3, r4); 65 HS_CMP_XCHG(r5, r6); 66 HS_CMP_XCHG(r7, r8); 67 { 68 HS_SLAB_FLIP_PREAMBLE(3); 69 HS_CMP_FLIP(0, r1, r8); 70 HS_CMP_FLIP(1, r2, r7); 71 HS_CMP_FLIP(2, r3, r6); 72 HS_CMP_FLIP(3, r4, r5); 73 } 74 { 75 HS_SLAB_HALF_PREAMBLE(1); 76 HS_CMP_HALF(0, r1); 77 HS_CMP_HALF(1, r2); 78 HS_CMP_HALF(2, r3); 79 HS_CMP_HALF(3, r4); 80 HS_CMP_HALF(4, r5); 81 HS_CMP_HALF(5, r6); 82 HS_CMP_HALF(6, r7); 83 HS_CMP_HALF(7, r8); 84 
} 85 HS_CMP_XCHG(r1, r5); 86 HS_CMP_XCHG(r3, r7); 87 HS_CMP_XCHG(r1, r3); 88 HS_CMP_XCHG(r5, r7); 89 HS_CMP_XCHG(r2, r6); 90 HS_CMP_XCHG(r4, r8); 91 HS_CMP_XCHG(r2, r4); 92 HS_CMP_XCHG(r6, r8); 93 HS_CMP_XCHG(r1, r2); 94 HS_CMP_XCHG(r3, r4); 95 HS_CMP_XCHG(r5, r6); 96 HS_CMP_XCHG(r7, r8); 97 { 98 HS_SLAB_FLIP_PREAMBLE(7); 99 HS_CMP_FLIP(0, r1, r8); 100 HS_CMP_FLIP(1, r2, r7); 101 HS_CMP_FLIP(2, r3, r6); 102 HS_CMP_FLIP(3, r4, r5); 103 } 104 { 105 HS_SLAB_HALF_PREAMBLE(2); 106 HS_CMP_HALF(0, r1); 107 HS_CMP_HALF(1, r2); 108 HS_CMP_HALF(2, r3); 109 HS_CMP_HALF(3, r4); 110 HS_CMP_HALF(4, r5); 111 HS_CMP_HALF(5, r6); 112 HS_CMP_HALF(6, r7); 113 HS_CMP_HALF(7, r8); 114 } 115 { 116 HS_SLAB_HALF_PREAMBLE(1); 117 HS_CMP_HALF(0, r1); 118 HS_CMP_HALF(1, r2); 119 HS_CMP_HALF(2, r3); 120 HS_CMP_HALF(3, r4); 121 HS_CMP_HALF(4, r5); 122 HS_CMP_HALF(5, r6); 123 HS_CMP_HALF(6, r7); 124 HS_CMP_HALF(7, r8); 125 } 126 HS_CMP_XCHG(r1, r5); 127 HS_CMP_XCHG(r3, r7); 128 HS_CMP_XCHG(r1, r3); 129 HS_CMP_XCHG(r5, r7); 130 HS_CMP_XCHG(r2, r6); 131 HS_CMP_XCHG(r4, r8); 132 HS_CMP_XCHG(r2, r4); 133 HS_CMP_XCHG(r6, r8); 134 HS_CMP_XCHG(r1, r2); 135 HS_CMP_XCHG(r3, r4); 136 HS_CMP_XCHG(r5, r6); 137 HS_CMP_XCHG(r7, r8); 138 { 139 HS_SLAB_FLIP_PREAMBLE(15); 140 HS_CMP_FLIP(0, r1, r8); 141 HS_CMP_FLIP(1, r2, r7); 142 HS_CMP_FLIP(2, r3, r6); 143 HS_CMP_FLIP(3, r4, r5); 144 } 145 { 146 HS_SLAB_HALF_PREAMBLE(4); 147 HS_CMP_HALF(0, r1); 148 HS_CMP_HALF(1, r2); 149 HS_CMP_HALF(2, r3); 150 HS_CMP_HALF(3, r4); 151 HS_CMP_HALF(4, r5); 152 HS_CMP_HALF(5, r6); 153 HS_CMP_HALF(6, r7); 154 HS_CMP_HALF(7, r8); 155 } 156 { 157 HS_SLAB_HALF_PREAMBLE(2); 158 HS_CMP_HALF(0, r1); 159 HS_CMP_HALF(1, r2); 160 HS_CMP_HALF(2, r3); 161 HS_CMP_HALF(3, r4); 162 HS_CMP_HALF(4, r5); 163 HS_CMP_HALF(5, r6); 164 HS_CMP_HALF(6, r7); 165 HS_CMP_HALF(7, r8); 166 } 167 { 168 HS_SLAB_HALF_PREAMBLE(1); 169 HS_CMP_HALF(0, r1); 170 HS_CMP_HALF(1, r2); 171 HS_CMP_HALF(2, r3); 172 HS_CMP_HALF(3, r4); 173 HS_CMP_HALF(4, r5); 174 
HS_CMP_HALF(5, r6); 175 HS_CMP_HALF(6, r7); 176 HS_CMP_HALF(7, r8); 177 } 178 HS_CMP_XCHG(r1, r5); 179 HS_CMP_XCHG(r3, r7); 180 HS_CMP_XCHG(r1, r3); 181 HS_CMP_XCHG(r5, r7); 182 HS_CMP_XCHG(r2, r6); 183 HS_CMP_XCHG(r4, r8); 184 HS_CMP_XCHG(r2, r4); 185 HS_CMP_XCHG(r6, r8); 186 HS_CMP_XCHG(r1, r2); 187 HS_CMP_XCHG(r3, r4); 188 HS_CMP_XCHG(r5, r6); 189 HS_CMP_XCHG(r7, r8); 190 HS_SLAB_GLOBAL_STORE(0, r1); 191 HS_SLAB_GLOBAL_STORE(1, r2); 192 HS_SLAB_GLOBAL_STORE(2, r3); 193 HS_SLAB_GLOBAL_STORE(3, r4); 194 HS_SLAB_GLOBAL_STORE(4, r5); 195 HS_SLAB_GLOBAL_STORE(5, r6); 196 HS_SLAB_GLOBAL_STORE(6, r7); 197 HS_SLAB_GLOBAL_STORE(7, r8); 198} 199 200HS_BS_KERNEL_PROTO(2, 1) 201{ 202 HS_BLOCK_LOCAL_MEM_DECL(32, 8); 203 204 HS_SLAB_GLOBAL_PREAMBLE(); 205 HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0); 206 HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1); 207 HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2); 208 HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3); 209 HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4); 210 HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5); 211 HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6); 212 HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7); 213 HS_CMP_XCHG(r1, r5); 214 HS_CMP_XCHG(r2, r6); 215 HS_CMP_XCHG(r3, r7); 216 HS_CMP_XCHG(r4, r8); 217 HS_CMP_XCHG(r1, r3); 218 HS_CMP_XCHG(r2, r4); 219 HS_CMP_XCHG(r5, r7); 220 HS_CMP_XCHG(r6, r8); 221 HS_CMP_XCHG(r3, r5); 222 HS_CMP_XCHG(r4, r6); 223 HS_CMP_XCHG(r1, r2); 224 HS_CMP_XCHG(r3, r4); 225 HS_CMP_XCHG(r5, r6); 226 HS_CMP_XCHG(r7, r8); 227 HS_CMP_XCHG(r2, r5); 228 HS_CMP_XCHG(r4, r7); 229 HS_CMP_XCHG(r2, r3); 230 HS_CMP_XCHG(r4, r5); 231 HS_CMP_XCHG(r6, r7); 232 { 233 HS_SLAB_FLIP_PREAMBLE(1); 234 HS_CMP_FLIP(0, r1, r8); 235 HS_CMP_FLIP(1, r2, r7); 236 HS_CMP_FLIP(2, r3, r6); 237 HS_CMP_FLIP(3, r4, r5); 238 } 239 HS_CMP_XCHG(r1, r5); 240 HS_CMP_XCHG(r3, r7); 241 HS_CMP_XCHG(r1, r3); 242 HS_CMP_XCHG(r5, r7); 243 HS_CMP_XCHG(r2, r6); 244 HS_CMP_XCHG(r4, r8); 245 HS_CMP_XCHG(r2, r4); 246 HS_CMP_XCHG(r6, r8); 247 HS_CMP_XCHG(r1, 
r2); 248 HS_CMP_XCHG(r3, r4); 249 HS_CMP_XCHG(r5, r6); 250 HS_CMP_XCHG(r7, r8); 251 { 252 HS_SLAB_FLIP_PREAMBLE(3); 253 HS_CMP_FLIP(0, r1, r8); 254 HS_CMP_FLIP(1, r2, r7); 255 HS_CMP_FLIP(2, r3, r6); 256 HS_CMP_FLIP(3, r4, r5); 257 } 258 { 259 HS_SLAB_HALF_PREAMBLE(1); 260 HS_CMP_HALF(0, r1); 261 HS_CMP_HALF(1, r2); 262 HS_CMP_HALF(2, r3); 263 HS_CMP_HALF(3, r4); 264 HS_CMP_HALF(4, r5); 265 HS_CMP_HALF(5, r6); 266 HS_CMP_HALF(6, r7); 267 HS_CMP_HALF(7, r8); 268 } 269 HS_CMP_XCHG(r1, r5); 270 HS_CMP_XCHG(r3, r7); 271 HS_CMP_XCHG(r1, r3); 272 HS_CMP_XCHG(r5, r7); 273 HS_CMP_XCHG(r2, r6); 274 HS_CMP_XCHG(r4, r8); 275 HS_CMP_XCHG(r2, r4); 276 HS_CMP_XCHG(r6, r8); 277 HS_CMP_XCHG(r1, r2); 278 HS_CMP_XCHG(r3, r4); 279 HS_CMP_XCHG(r5, r6); 280 HS_CMP_XCHG(r7, r8); 281 { 282 HS_SLAB_FLIP_PREAMBLE(7); 283 HS_CMP_FLIP(0, r1, r8); 284 HS_CMP_FLIP(1, r2, r7); 285 HS_CMP_FLIP(2, r3, r6); 286 HS_CMP_FLIP(3, r4, r5); 287 } 288 { 289 HS_SLAB_HALF_PREAMBLE(2); 290 HS_CMP_HALF(0, r1); 291 HS_CMP_HALF(1, r2); 292 HS_CMP_HALF(2, r3); 293 HS_CMP_HALF(3, r4); 294 HS_CMP_HALF(4, r5); 295 HS_CMP_HALF(5, r6); 296 HS_CMP_HALF(6, r7); 297 HS_CMP_HALF(7, r8); 298 } 299 { 300 HS_SLAB_HALF_PREAMBLE(1); 301 HS_CMP_HALF(0, r1); 302 HS_CMP_HALF(1, r2); 303 HS_CMP_HALF(2, r3); 304 HS_CMP_HALF(3, r4); 305 HS_CMP_HALF(4, r5); 306 HS_CMP_HALF(5, r6); 307 HS_CMP_HALF(6, r7); 308 HS_CMP_HALF(7, r8); 309 } 310 HS_CMP_XCHG(r1, r5); 311 HS_CMP_XCHG(r3, r7); 312 HS_CMP_XCHG(r1, r3); 313 HS_CMP_XCHG(r5, r7); 314 HS_CMP_XCHG(r2, r6); 315 HS_CMP_XCHG(r4, r8); 316 HS_CMP_XCHG(r2, r4); 317 HS_CMP_XCHG(r6, r8); 318 HS_CMP_XCHG(r1, r2); 319 HS_CMP_XCHG(r3, r4); 320 HS_CMP_XCHG(r5, r6); 321 HS_CMP_XCHG(r7, r8); 322 { 323 HS_SLAB_FLIP_PREAMBLE(15); 324 HS_CMP_FLIP(0, r1, r8); 325 HS_CMP_FLIP(1, r2, r7); 326 HS_CMP_FLIP(2, r3, r6); 327 HS_CMP_FLIP(3, r4, r5); 328 } 329 { 330 HS_SLAB_HALF_PREAMBLE(4); 331 HS_CMP_HALF(0, r1); 332 HS_CMP_HALF(1, r2); 333 HS_CMP_HALF(2, r3); 334 HS_CMP_HALF(3, r4); 335 HS_CMP_HALF(4, 
r5); 336 HS_CMP_HALF(5, r6); 337 HS_CMP_HALF(6, r7); 338 HS_CMP_HALF(7, r8); 339 } 340 { 341 HS_SLAB_HALF_PREAMBLE(2); 342 HS_CMP_HALF(0, r1); 343 HS_CMP_HALF(1, r2); 344 HS_CMP_HALF(2, r3); 345 HS_CMP_HALF(3, r4); 346 HS_CMP_HALF(4, r5); 347 HS_CMP_HALF(5, r6); 348 HS_CMP_HALF(6, r7); 349 HS_CMP_HALF(7, r8); 350 } 351 { 352 HS_SLAB_HALF_PREAMBLE(1); 353 HS_CMP_HALF(0, r1); 354 HS_CMP_HALF(1, r2); 355 HS_CMP_HALF(2, r3); 356 HS_CMP_HALF(3, r4); 357 HS_CMP_HALF(4, r5); 358 HS_CMP_HALF(5, r6); 359 HS_CMP_HALF(6, r7); 360 HS_CMP_HALF(7, r8); 361 } 362 HS_CMP_XCHG(r1, r5); 363 HS_CMP_XCHG(r3, r7); 364 HS_CMP_XCHG(r1, r3); 365 HS_CMP_XCHG(r5, r7); 366 HS_CMP_XCHG(r2, r6); 367 HS_CMP_XCHG(r4, r8); 368 HS_CMP_XCHG(r2, r4); 369 HS_CMP_XCHG(r6, r8); 370 HS_CMP_XCHG(r1, r2); 371 HS_CMP_XCHG(r3, r4); 372 HS_CMP_XCHG(r5, r6); 373 HS_CMP_XCHG(r7, r8); 374 HS_BS_MERGE_H_PREAMBLE(2); 375 HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 0) = r1; 376 HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 1) = r8; 377 HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 2) = r2; 378 HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 3) = r7; 379 HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 4) = r3; 380 HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 5) = r6; 381 HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 6) = r4; 382 HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 7) = r5; 383 HS_BLOCK_BARRIER(); 384 { 385 { 386 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); 387 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(16); 388 HS_CMP_XCHG(r0_1, r0_2); 389 HS_SLAB_LOCAL_L(0) = r0_1; 390 HS_SLAB_LOCAL_R(16) = r0_2; 391 } 392 { 393 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(64); 394 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(80); 395 HS_CMP_XCHG(r0_1, r0_2); 396 HS_SLAB_LOCAL_L(64) = r0_1; 397 HS_SLAB_LOCAL_R(80) = r0_2; 398 } 399 { 400 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(128); 401 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(144); 402 HS_CMP_XCHG(r0_1, r0_2); 403 HS_SLAB_LOCAL_L(128) = r0_1; 404 HS_SLAB_LOCAL_R(144) = r0_2; 405 } 406 { 407 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(192); 408 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(208); 409 
HS_CMP_XCHG(r0_1, r0_2); 410 HS_SLAB_LOCAL_L(192) = r0_1; 411 HS_SLAB_LOCAL_R(208) = r0_2; 412 } 413 } 414 HS_BLOCK_BARRIER(); 415 r1 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 0); 416 r8 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 1); 417 r2 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 2); 418 r7 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 3); 419 r3 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 4); 420 r6 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 5); 421 r4 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 6); 422 r5 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 7); 423 { 424 { 425 HS_SLAB_HALF_PREAMBLE(8); 426 HS_CMP_HALF(0, r1); 427 HS_CMP_HALF(1, r2); 428 HS_CMP_HALF(2, r3); 429 HS_CMP_HALF(3, r4); 430 HS_CMP_HALF(4, r5); 431 HS_CMP_HALF(5, r6); 432 HS_CMP_HALF(6, r7); 433 HS_CMP_HALF(7, r8); 434 } 435 { 436 HS_SLAB_HALF_PREAMBLE(4); 437 HS_CMP_HALF(0, r1); 438 HS_CMP_HALF(1, r2); 439 HS_CMP_HALF(2, r3); 440 HS_CMP_HALF(3, r4); 441 HS_CMP_HALF(4, r5); 442 HS_CMP_HALF(5, r6); 443 HS_CMP_HALF(6, r7); 444 HS_CMP_HALF(7, r8); 445 } 446 { 447 HS_SLAB_HALF_PREAMBLE(2); 448 HS_CMP_HALF(0, r1); 449 HS_CMP_HALF(1, r2); 450 HS_CMP_HALF(2, r3); 451 HS_CMP_HALF(3, r4); 452 HS_CMP_HALF(4, r5); 453 HS_CMP_HALF(5, r6); 454 HS_CMP_HALF(6, r7); 455 HS_CMP_HALF(7, r8); 456 } 457 { 458 HS_SLAB_HALF_PREAMBLE(1); 459 HS_CMP_HALF(0, r1); 460 HS_CMP_HALF(1, r2); 461 HS_CMP_HALF(2, r3); 462 HS_CMP_HALF(3, r4); 463 HS_CMP_HALF(4, r5); 464 HS_CMP_HALF(5, r6); 465 HS_CMP_HALF(6, r7); 466 HS_CMP_HALF(7, r8); 467 } 468 HS_CMP_XCHG(r1, r5); 469 HS_CMP_XCHG(r3, r7); 470 HS_CMP_XCHG(r1, r3); 471 HS_CMP_XCHG(r5, r7); 472 HS_CMP_XCHG(r2, r6); 473 HS_CMP_XCHG(r4, r8); 474 HS_CMP_XCHG(r2, r4); 475 HS_CMP_XCHG(r6, r8); 476 HS_CMP_XCHG(r1, r2); 477 HS_CMP_XCHG(r3, r4); 478 HS_CMP_XCHG(r5, r6); 479 HS_CMP_XCHG(r7, r8); 480 } 481 HS_SLAB_GLOBAL_STORE(0, r1); 482 HS_SLAB_GLOBAL_STORE(1, r2); 483 HS_SLAB_GLOBAL_STORE(2, r3); 484 HS_SLAB_GLOBAL_STORE(3, r4); 485 HS_SLAB_GLOBAL_STORE(4, r5); 486 HS_SLAB_GLOBAL_STORE(5, r6); 487 HS_SLAB_GLOBAL_STORE(6, r7); 
488 HS_SLAB_GLOBAL_STORE(7, r8); 489} 490 491HS_BS_KERNEL_PROTO(4, 2) 492{ 493 HS_BLOCK_LOCAL_MEM_DECL(64, 8); 494 495 HS_SLAB_GLOBAL_PREAMBLE(); 496 HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0); 497 HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1); 498 HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2); 499 HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3); 500 HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4); 501 HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5); 502 HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6); 503 HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7); 504 HS_CMP_XCHG(r1, r5); 505 HS_CMP_XCHG(r2, r6); 506 HS_CMP_XCHG(r3, r7); 507 HS_CMP_XCHG(r4, r8); 508 HS_CMP_XCHG(r1, r3); 509 HS_CMP_XCHG(r2, r4); 510 HS_CMP_XCHG(r5, r7); 511 HS_CMP_XCHG(r6, r8); 512 HS_CMP_XCHG(r3, r5); 513 HS_CMP_XCHG(r4, r6); 514 HS_CMP_XCHG(r1, r2); 515 HS_CMP_XCHG(r3, r4); 516 HS_CMP_XCHG(r5, r6); 517 HS_CMP_XCHG(r7, r8); 518 HS_CMP_XCHG(r2, r5); 519 HS_CMP_XCHG(r4, r7); 520 HS_CMP_XCHG(r2, r3); 521 HS_CMP_XCHG(r4, r5); 522 HS_CMP_XCHG(r6, r7); 523 { 524 HS_SLAB_FLIP_PREAMBLE(1); 525 HS_CMP_FLIP(0, r1, r8); 526 HS_CMP_FLIP(1, r2, r7); 527 HS_CMP_FLIP(2, r3, r6); 528 HS_CMP_FLIP(3, r4, r5); 529 } 530 HS_CMP_XCHG(r1, r5); 531 HS_CMP_XCHG(r3, r7); 532 HS_CMP_XCHG(r1, r3); 533 HS_CMP_XCHG(r5, r7); 534 HS_CMP_XCHG(r2, r6); 535 HS_CMP_XCHG(r4, r8); 536 HS_CMP_XCHG(r2, r4); 537 HS_CMP_XCHG(r6, r8); 538 HS_CMP_XCHG(r1, r2); 539 HS_CMP_XCHG(r3, r4); 540 HS_CMP_XCHG(r5, r6); 541 HS_CMP_XCHG(r7, r8); 542 { 543 HS_SLAB_FLIP_PREAMBLE(3); 544 HS_CMP_FLIP(0, r1, r8); 545 HS_CMP_FLIP(1, r2, r7); 546 HS_CMP_FLIP(2, r3, r6); 547 HS_CMP_FLIP(3, r4, r5); 548 } 549 { 550 HS_SLAB_HALF_PREAMBLE(1); 551 HS_CMP_HALF(0, r1); 552 HS_CMP_HALF(1, r2); 553 HS_CMP_HALF(2, r3); 554 HS_CMP_HALF(3, r4); 555 HS_CMP_HALF(4, r5); 556 HS_CMP_HALF(5, r6); 557 HS_CMP_HALF(6, r7); 558 HS_CMP_HALF(7, r8); 559 } 560 HS_CMP_XCHG(r1, r5); 561 HS_CMP_XCHG(r3, r7); 562 HS_CMP_XCHG(r1, r3); 563 HS_CMP_XCHG(r5, r7); 564 HS_CMP_XCHG(r2, r6); 565 
HS_CMP_XCHG(r4, r8); 566 HS_CMP_XCHG(r2, r4); 567 HS_CMP_XCHG(r6, r8); 568 HS_CMP_XCHG(r1, r2); 569 HS_CMP_XCHG(r3, r4); 570 HS_CMP_XCHG(r5, r6); 571 HS_CMP_XCHG(r7, r8); 572 { 573 HS_SLAB_FLIP_PREAMBLE(7); 574 HS_CMP_FLIP(0, r1, r8); 575 HS_CMP_FLIP(1, r2, r7); 576 HS_CMP_FLIP(2, r3, r6); 577 HS_CMP_FLIP(3, r4, r5); 578 } 579 { 580 HS_SLAB_HALF_PREAMBLE(2); 581 HS_CMP_HALF(0, r1); 582 HS_CMP_HALF(1, r2); 583 HS_CMP_HALF(2, r3); 584 HS_CMP_HALF(3, r4); 585 HS_CMP_HALF(4, r5); 586 HS_CMP_HALF(5, r6); 587 HS_CMP_HALF(6, r7); 588 HS_CMP_HALF(7, r8); 589 } 590 { 591 HS_SLAB_HALF_PREAMBLE(1); 592 HS_CMP_HALF(0, r1); 593 HS_CMP_HALF(1, r2); 594 HS_CMP_HALF(2, r3); 595 HS_CMP_HALF(3, r4); 596 HS_CMP_HALF(4, r5); 597 HS_CMP_HALF(5, r6); 598 HS_CMP_HALF(6, r7); 599 HS_CMP_HALF(7, r8); 600 } 601 HS_CMP_XCHG(r1, r5); 602 HS_CMP_XCHG(r3, r7); 603 HS_CMP_XCHG(r1, r3); 604 HS_CMP_XCHG(r5, r7); 605 HS_CMP_XCHG(r2, r6); 606 HS_CMP_XCHG(r4, r8); 607 HS_CMP_XCHG(r2, r4); 608 HS_CMP_XCHG(r6, r8); 609 HS_CMP_XCHG(r1, r2); 610 HS_CMP_XCHG(r3, r4); 611 HS_CMP_XCHG(r5, r6); 612 HS_CMP_XCHG(r7, r8); 613 { 614 HS_SLAB_FLIP_PREAMBLE(15); 615 HS_CMP_FLIP(0, r1, r8); 616 HS_CMP_FLIP(1, r2, r7); 617 HS_CMP_FLIP(2, r3, r6); 618 HS_CMP_FLIP(3, r4, r5); 619 } 620 { 621 HS_SLAB_HALF_PREAMBLE(4); 622 HS_CMP_HALF(0, r1); 623 HS_CMP_HALF(1, r2); 624 HS_CMP_HALF(2, r3); 625 HS_CMP_HALF(3, r4); 626 HS_CMP_HALF(4, r5); 627 HS_CMP_HALF(5, r6); 628 HS_CMP_HALF(6, r7); 629 HS_CMP_HALF(7, r8); 630 } 631 { 632 HS_SLAB_HALF_PREAMBLE(2); 633 HS_CMP_HALF(0, r1); 634 HS_CMP_HALF(1, r2); 635 HS_CMP_HALF(2, r3); 636 HS_CMP_HALF(3, r4); 637 HS_CMP_HALF(4, r5); 638 HS_CMP_HALF(5, r6); 639 HS_CMP_HALF(6, r7); 640 HS_CMP_HALF(7, r8); 641 } 642 { 643 HS_SLAB_HALF_PREAMBLE(1); 644 HS_CMP_HALF(0, r1); 645 HS_CMP_HALF(1, r2); 646 HS_CMP_HALF(2, r3); 647 HS_CMP_HALF(3, r4); 648 HS_CMP_HALF(4, r5); 649 HS_CMP_HALF(5, r6); 650 HS_CMP_HALF(6, r7); 651 HS_CMP_HALF(7, r8); 652 } 653 HS_CMP_XCHG(r1, r5); 654 HS_CMP_XCHG(r3, r7); 
655 HS_CMP_XCHG(r1, r3); 656 HS_CMP_XCHG(r5, r7); 657 HS_CMP_XCHG(r2, r6); 658 HS_CMP_XCHG(r4, r8); 659 HS_CMP_XCHG(r2, r4); 660 HS_CMP_XCHG(r6, r8); 661 HS_CMP_XCHG(r1, r2); 662 HS_CMP_XCHG(r3, r4); 663 HS_CMP_XCHG(r5, r6); 664 HS_CMP_XCHG(r7, r8); 665 HS_BS_MERGE_H_PREAMBLE(4); 666 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0) = r1; 667 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1) = r8; 668 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2) = r2; 669 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3) = r7; 670 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4) = r3; 671 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5) = r6; 672 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6) = r4; 673 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7) = r5; 674 HS_BLOCK_BARRIER(); 675 { 676 { 677 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); 678 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(16); 679 HS_CMP_XCHG(r0_1, r0_2); 680 HS_SLAB_LOCAL_L(0) = r0_1; 681 HS_SLAB_LOCAL_R(16) = r0_2; 682 } 683 { 684 HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(32); 685 HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(48); 686 HS_CMP_XCHG(r1_1, r1_2); 687 HS_SLAB_LOCAL_L(32) = r1_1; 688 HS_SLAB_LOCAL_R(48) = r1_2; 689 } 690 { 691 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(256); 692 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(272); 693 HS_CMP_XCHG(r0_1, r0_2); 694 HS_SLAB_LOCAL_L(256) = r0_1; 695 HS_SLAB_LOCAL_R(272) = r0_2; 696 } 697 { 698 HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(288); 699 HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(304); 700 HS_CMP_XCHG(r1_1, r1_2); 701 HS_SLAB_LOCAL_L(288) = r1_1; 702 HS_SLAB_LOCAL_R(304) = r1_2; 703 } 704 } 705 HS_BLOCK_BARRIER(); 706 r1 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0); 707 r8 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1); 708 r2 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2); 709 r7 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3); 710 r3 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4); 711 r6 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5); 712 r4 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6); 713 r5 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7); 714 { 715 { 716 HS_SLAB_HALF_PREAMBLE(8); 717 HS_CMP_HALF(0, r1); 718 HS_CMP_HALF(1, r2); 719 
HS_CMP_HALF(2, r3); 720 HS_CMP_HALF(3, r4); 721 HS_CMP_HALF(4, r5); 722 HS_CMP_HALF(5, r6); 723 HS_CMP_HALF(6, r7); 724 HS_CMP_HALF(7, r8); 725 } 726 { 727 HS_SLAB_HALF_PREAMBLE(4); 728 HS_CMP_HALF(0, r1); 729 HS_CMP_HALF(1, r2); 730 HS_CMP_HALF(2, r3); 731 HS_CMP_HALF(3, r4); 732 HS_CMP_HALF(4, r5); 733 HS_CMP_HALF(5, r6); 734 HS_CMP_HALF(6, r7); 735 HS_CMP_HALF(7, r8); 736 } 737 { 738 HS_SLAB_HALF_PREAMBLE(2); 739 HS_CMP_HALF(0, r1); 740 HS_CMP_HALF(1, r2); 741 HS_CMP_HALF(2, r3); 742 HS_CMP_HALF(3, r4); 743 HS_CMP_HALF(4, r5); 744 HS_CMP_HALF(5, r6); 745 HS_CMP_HALF(6, r7); 746 HS_CMP_HALF(7, r8); 747 } 748 { 749 HS_SLAB_HALF_PREAMBLE(1); 750 HS_CMP_HALF(0, r1); 751 HS_CMP_HALF(1, r2); 752 HS_CMP_HALF(2, r3); 753 HS_CMP_HALF(3, r4); 754 HS_CMP_HALF(4, r5); 755 HS_CMP_HALF(5, r6); 756 HS_CMP_HALF(6, r7); 757 HS_CMP_HALF(7, r8); 758 } 759 HS_CMP_XCHG(r1, r5); 760 HS_CMP_XCHG(r3, r7); 761 HS_CMP_XCHG(r1, r3); 762 HS_CMP_XCHG(r5, r7); 763 HS_CMP_XCHG(r2, r6); 764 HS_CMP_XCHG(r4, r8); 765 HS_CMP_XCHG(r2, r4); 766 HS_CMP_XCHG(r6, r8); 767 HS_CMP_XCHG(r1, r2); 768 HS_CMP_XCHG(r3, r4); 769 HS_CMP_XCHG(r5, r6); 770 HS_CMP_XCHG(r7, r8); 771 } 772 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0) = r1; 773 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1) = r8; 774 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2) = r2; 775 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3) = r7; 776 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4) = r3; 777 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5) = r6; 778 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6) = r4; 779 HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7) = r5; 780 HS_BLOCK_BARRIER(); 781 { 782 { 783 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); 784 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(16); 785 HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(32); 786 HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(48); 787 HS_CMP_XCHG(r0_2, r0_3); 788 HS_CMP_XCHG(r0_1, r0_4); 789 HS_CMP_XCHG(r0_3, r0_4); 790 HS_CMP_XCHG(r0_1, r0_2); 791 HS_SLAB_LOCAL_L(0) = r0_1; 792 HS_SLAB_LOCAL_L(16) = r0_2; 793 HS_SLAB_LOCAL_R(32) = r0_3; 794 HS_SLAB_LOCAL_R(48) = r0_4; 
795 } 796 { 797 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(256); 798 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(272); 799 HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(288); 800 HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(304); 801 HS_CMP_XCHG(r0_2, r0_3); 802 HS_CMP_XCHG(r0_1, r0_4); 803 HS_CMP_XCHG(r0_3, r0_4); 804 HS_CMP_XCHG(r0_1, r0_2); 805 HS_SLAB_LOCAL_L(256) = r0_1; 806 HS_SLAB_LOCAL_L(272) = r0_2; 807 HS_SLAB_LOCAL_R(288) = r0_3; 808 HS_SLAB_LOCAL_R(304) = r0_4; 809 } 810 } 811 HS_BLOCK_BARRIER(); 812 r1 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0); 813 r8 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1); 814 r2 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2); 815 r7 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3); 816 r3 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4); 817 r6 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5); 818 r4 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6); 819 r5 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7); 820 { 821 { 822 HS_SLAB_HALF_PREAMBLE(8); 823 HS_CMP_HALF(0, r1); 824 HS_CMP_HALF(1, r2); 825 HS_CMP_HALF(2, r3); 826 HS_CMP_HALF(3, r4); 827 HS_CMP_HALF(4, r5); 828 HS_CMP_HALF(5, r6); 829 HS_CMP_HALF(6, r7); 830 HS_CMP_HALF(7, r8); 831 } 832 { 833 HS_SLAB_HALF_PREAMBLE(4); 834 HS_CMP_HALF(0, r1); 835 HS_CMP_HALF(1, r2); 836 HS_CMP_HALF(2, r3); 837 HS_CMP_HALF(3, r4); 838 HS_CMP_HALF(4, r5); 839 HS_CMP_HALF(5, r6); 840 HS_CMP_HALF(6, r7); 841 HS_CMP_HALF(7, r8); 842 } 843 { 844 HS_SLAB_HALF_PREAMBLE(2); 845 HS_CMP_HALF(0, r1); 846 HS_CMP_HALF(1, r2); 847 HS_CMP_HALF(2, r3); 848 HS_CMP_HALF(3, r4); 849 HS_CMP_HALF(4, r5); 850 HS_CMP_HALF(5, r6); 851 HS_CMP_HALF(6, r7); 852 HS_CMP_HALF(7, r8); 853 } 854 { 855 HS_SLAB_HALF_PREAMBLE(1); 856 HS_CMP_HALF(0, r1); 857 HS_CMP_HALF(1, r2); 858 HS_CMP_HALF(2, r3); 859 HS_CMP_HALF(3, r4); 860 HS_CMP_HALF(4, r5); 861 HS_CMP_HALF(5, r6); 862 HS_CMP_HALF(6, r7); 863 HS_CMP_HALF(7, r8); 864 } 865 HS_CMP_XCHG(r1, r5); 866 HS_CMP_XCHG(r3, r7); 867 HS_CMP_XCHG(r1, r3); 868 HS_CMP_XCHG(r5, r7); 869 HS_CMP_XCHG(r2, r6); 870 HS_CMP_XCHG(r4, r8); 871 HS_CMP_XCHG(r2, r4); 872 
HS_CMP_XCHG(r6, r8); 873 HS_CMP_XCHG(r1, r2); 874 HS_CMP_XCHG(r3, r4); 875 HS_CMP_XCHG(r5, r6); 876 HS_CMP_XCHG(r7, r8); 877 } 878 HS_SLAB_GLOBAL_STORE(0, r1); 879 HS_SLAB_GLOBAL_STORE(1, r2); 880 HS_SLAB_GLOBAL_STORE(2, r3); 881 HS_SLAB_GLOBAL_STORE(3, r4); 882 HS_SLAB_GLOBAL_STORE(4, r5); 883 HS_SLAB_GLOBAL_STORE(5, r6); 884 HS_SLAB_GLOBAL_STORE(6, r7); 885 HS_SLAB_GLOBAL_STORE(7, r8); 886} 887 888HS_BS_KERNEL_PROTO(8, 3) 889{ 890 HS_BLOCK_LOCAL_MEM_DECL(128, 8); 891 892 HS_SLAB_GLOBAL_PREAMBLE(); 893 HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0); 894 HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1); 895 HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2); 896 HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3); 897 HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4); 898 HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5); 899 HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6); 900 HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7); 901 HS_CMP_XCHG(r1, r5); 902 HS_CMP_XCHG(r2, r6); 903 HS_CMP_XCHG(r3, r7); 904 HS_CMP_XCHG(r4, r8); 905 HS_CMP_XCHG(r1, r3); 906 HS_CMP_XCHG(r2, r4); 907 HS_CMP_XCHG(r5, r7); 908 HS_CMP_XCHG(r6, r8); 909 HS_CMP_XCHG(r3, r5); 910 HS_CMP_XCHG(r4, r6); 911 HS_CMP_XCHG(r1, r2); 912 HS_CMP_XCHG(r3, r4); 913 HS_CMP_XCHG(r5, r6); 914 HS_CMP_XCHG(r7, r8); 915 HS_CMP_XCHG(r2, r5); 916 HS_CMP_XCHG(r4, r7); 917 HS_CMP_XCHG(r2, r3); 918 HS_CMP_XCHG(r4, r5); 919 HS_CMP_XCHG(r6, r7); 920 { 921 HS_SLAB_FLIP_PREAMBLE(1); 922 HS_CMP_FLIP(0, r1, r8); 923 HS_CMP_FLIP(1, r2, r7); 924 HS_CMP_FLIP(2, r3, r6); 925 HS_CMP_FLIP(3, r4, r5); 926 } 927 HS_CMP_XCHG(r1, r5); 928 HS_CMP_XCHG(r3, r7); 929 HS_CMP_XCHG(r1, r3); 930 HS_CMP_XCHG(r5, r7); 931 HS_CMP_XCHG(r2, r6); 932 HS_CMP_XCHG(r4, r8); 933 HS_CMP_XCHG(r2, r4); 934 HS_CMP_XCHG(r6, r8); 935 HS_CMP_XCHG(r1, r2); 936 HS_CMP_XCHG(r3, r4); 937 HS_CMP_XCHG(r5, r6); 938 HS_CMP_XCHG(r7, r8); 939 { 940 HS_SLAB_FLIP_PREAMBLE(3); 941 HS_CMP_FLIP(0, r1, r8); 942 HS_CMP_FLIP(1, r2, r7); 943 HS_CMP_FLIP(2, r3, r6); 944 HS_CMP_FLIP(3, r4, r5); 945 } 946 { 947 
HS_SLAB_HALF_PREAMBLE(1); 948 HS_CMP_HALF(0, r1); 949 HS_CMP_HALF(1, r2); 950 HS_CMP_HALF(2, r3); 951 HS_CMP_HALF(3, r4); 952 HS_CMP_HALF(4, r5); 953 HS_CMP_HALF(5, r6); 954 HS_CMP_HALF(6, r7); 955 HS_CMP_HALF(7, r8); 956 } 957 HS_CMP_XCHG(r1, r5); 958 HS_CMP_XCHG(r3, r7); 959 HS_CMP_XCHG(r1, r3); 960 HS_CMP_XCHG(r5, r7); 961 HS_CMP_XCHG(r2, r6); 962 HS_CMP_XCHG(r4, r8); 963 HS_CMP_XCHG(r2, r4); 964 HS_CMP_XCHG(r6, r8); 965 HS_CMP_XCHG(r1, r2); 966 HS_CMP_XCHG(r3, r4); 967 HS_CMP_XCHG(r5, r6); 968 HS_CMP_XCHG(r7, r8); 969 { 970 HS_SLAB_FLIP_PREAMBLE(7); 971 HS_CMP_FLIP(0, r1, r8); 972 HS_CMP_FLIP(1, r2, r7); 973 HS_CMP_FLIP(2, r3, r6); 974 HS_CMP_FLIP(3, r4, r5); 975 } 976 { 977 HS_SLAB_HALF_PREAMBLE(2); 978 HS_CMP_HALF(0, r1); 979 HS_CMP_HALF(1, r2); 980 HS_CMP_HALF(2, r3); 981 HS_CMP_HALF(3, r4); 982 HS_CMP_HALF(4, r5); 983 HS_CMP_HALF(5, r6); 984 HS_CMP_HALF(6, r7); 985 HS_CMP_HALF(7, r8); 986 } 987 { 988 HS_SLAB_HALF_PREAMBLE(1); 989 HS_CMP_HALF(0, r1); 990 HS_CMP_HALF(1, r2); 991 HS_CMP_HALF(2, r3); 992 HS_CMP_HALF(3, r4); 993 HS_CMP_HALF(4, r5); 994 HS_CMP_HALF(5, r6); 995 HS_CMP_HALF(6, r7); 996 HS_CMP_HALF(7, r8); 997 } 998 HS_CMP_XCHG(r1, r5); 999 HS_CMP_XCHG(r3, r7); 1000 HS_CMP_XCHG(r1, r3); 1001 HS_CMP_XCHG(r5, r7); 1002 HS_CMP_XCHG(r2, r6); 1003 HS_CMP_XCHG(r4, r8); 1004 HS_CMP_XCHG(r2, r4); 1005 HS_CMP_XCHG(r6, r8); 1006 HS_CMP_XCHG(r1, r2); 1007 HS_CMP_XCHG(r3, r4); 1008 HS_CMP_XCHG(r5, r6); 1009 HS_CMP_XCHG(r7, r8); 1010 { 1011 HS_SLAB_FLIP_PREAMBLE(15); 1012 HS_CMP_FLIP(0, r1, r8); 1013 HS_CMP_FLIP(1, r2, r7); 1014 HS_CMP_FLIP(2, r3, r6); 1015 HS_CMP_FLIP(3, r4, r5); 1016 } 1017 { 1018 HS_SLAB_HALF_PREAMBLE(4); 1019 HS_CMP_HALF(0, r1); 1020 HS_CMP_HALF(1, r2); 1021 HS_CMP_HALF(2, r3); 1022 HS_CMP_HALF(3, r4); 1023 HS_CMP_HALF(4, r5); 1024 HS_CMP_HALF(5, r6); 1025 HS_CMP_HALF(6, r7); 1026 HS_CMP_HALF(7, r8); 1027 } 1028 { 1029 HS_SLAB_HALF_PREAMBLE(2); 1030 HS_CMP_HALF(0, r1); 1031 HS_CMP_HALF(1, r2); 1032 HS_CMP_HALF(2, r3); 1033 HS_CMP_HALF(3, 
r4); 1034 HS_CMP_HALF(4, r5); 1035 HS_CMP_HALF(5, r6); 1036 HS_CMP_HALF(6, r7); 1037 HS_CMP_HALF(7, r8); 1038 } 1039 { 1040 HS_SLAB_HALF_PREAMBLE(1); 1041 HS_CMP_HALF(0, r1); 1042 HS_CMP_HALF(1, r2); 1043 HS_CMP_HALF(2, r3); 1044 HS_CMP_HALF(3, r4); 1045 HS_CMP_HALF(4, r5); 1046 HS_CMP_HALF(5, r6); 1047 HS_CMP_HALF(6, r7); 1048 HS_CMP_HALF(7, r8); 1049 } 1050 HS_CMP_XCHG(r1, r5); 1051 HS_CMP_XCHG(r3, r7); 1052 HS_CMP_XCHG(r1, r3); 1053 HS_CMP_XCHG(r5, r7); 1054 HS_CMP_XCHG(r2, r6); 1055 HS_CMP_XCHG(r4, r8); 1056 HS_CMP_XCHG(r2, r4); 1057 HS_CMP_XCHG(r6, r8); 1058 HS_CMP_XCHG(r1, r2); 1059 HS_CMP_XCHG(r3, r4); 1060 HS_CMP_XCHG(r5, r6); 1061 HS_CMP_XCHG(r7, r8); 1062 HS_BS_MERGE_H_PREAMBLE(8); 1063 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0) = r1; 1064 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1) = r8; 1065 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2) = r2; 1066 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3) = r7; 1067 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4) = r3; 1068 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5) = r6; 1069 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6) = r4; 1070 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7) = r5; 1071 HS_BLOCK_BARRIER(); 1072 { 1073 { 1074 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); 1075 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(16); 1076 HS_CMP_XCHG(r0_1, r0_2); 1077 HS_SLAB_LOCAL_L(0) = r0_1; 1078 HS_SLAB_LOCAL_R(16) = r0_2; 1079 } 1080 { 1081 HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(32); 1082 HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(48); 1083 HS_CMP_XCHG(r1_1, r1_2); 1084 HS_SLAB_LOCAL_L(32) = r1_1; 1085 HS_SLAB_LOCAL_R(48) = r1_2; 1086 } 1087 { 1088 HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(64); 1089 HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_R(80); 1090 HS_CMP_XCHG(r2_1, r2_2); 1091 HS_SLAB_LOCAL_L(64) = r2_1; 1092 HS_SLAB_LOCAL_R(80) = r2_2; 1093 } 1094 { 1095 HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(96); 1096 HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_R(112); 1097 HS_CMP_XCHG(r3_1, r3_2); 1098 HS_SLAB_LOCAL_L(96) = r3_1; 1099 HS_SLAB_LOCAL_R(112) = r3_2; 1100 } 1101 } 1102 HS_BLOCK_BARRIER(); 1103 r1 = HS_BX_LOCAL_V(8 * 
HS_SLAB_THREADS * 0); 1104 r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1); 1105 r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2); 1106 r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3); 1107 r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4); 1108 r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5); 1109 r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6); 1110 r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7); 1111 { 1112 { 1113 HS_SLAB_HALF_PREAMBLE(8); 1114 HS_CMP_HALF(0, r1); 1115 HS_CMP_HALF(1, r2); 1116 HS_CMP_HALF(2, r3); 1117 HS_CMP_HALF(3, r4); 1118 HS_CMP_HALF(4, r5); 1119 HS_CMP_HALF(5, r6); 1120 HS_CMP_HALF(6, r7); 1121 HS_CMP_HALF(7, r8); 1122 } 1123 { 1124 HS_SLAB_HALF_PREAMBLE(4); 1125 HS_CMP_HALF(0, r1); 1126 HS_CMP_HALF(1, r2); 1127 HS_CMP_HALF(2, r3); 1128 HS_CMP_HALF(3, r4); 1129 HS_CMP_HALF(4, r5); 1130 HS_CMP_HALF(5, r6); 1131 HS_CMP_HALF(6, r7); 1132 HS_CMP_HALF(7, r8); 1133 } 1134 { 1135 HS_SLAB_HALF_PREAMBLE(2); 1136 HS_CMP_HALF(0, r1); 1137 HS_CMP_HALF(1, r2); 1138 HS_CMP_HALF(2, r3); 1139 HS_CMP_HALF(3, r4); 1140 HS_CMP_HALF(4, r5); 1141 HS_CMP_HALF(5, r6); 1142 HS_CMP_HALF(6, r7); 1143 HS_CMP_HALF(7, r8); 1144 } 1145 { 1146 HS_SLAB_HALF_PREAMBLE(1); 1147 HS_CMP_HALF(0, r1); 1148 HS_CMP_HALF(1, r2); 1149 HS_CMP_HALF(2, r3); 1150 HS_CMP_HALF(3, r4); 1151 HS_CMP_HALF(4, r5); 1152 HS_CMP_HALF(5, r6); 1153 HS_CMP_HALF(6, r7); 1154 HS_CMP_HALF(7, r8); 1155 } 1156 HS_CMP_XCHG(r1, r5); 1157 HS_CMP_XCHG(r3, r7); 1158 HS_CMP_XCHG(r1, r3); 1159 HS_CMP_XCHG(r5, r7); 1160 HS_CMP_XCHG(r2, r6); 1161 HS_CMP_XCHG(r4, r8); 1162 HS_CMP_XCHG(r2, r4); 1163 HS_CMP_XCHG(r6, r8); 1164 HS_CMP_XCHG(r1, r2); 1165 HS_CMP_XCHG(r3, r4); 1166 HS_CMP_XCHG(r5, r6); 1167 HS_CMP_XCHG(r7, r8); 1168 } 1169 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0) = r1; 1170 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1) = r8; 1171 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2) = r2; 1172 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3) = r7; 1173 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4) = r3; 1174 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5) = r6; 1175 HS_BX_LOCAL_V(8 
* HS_SLAB_THREADS * 6) = r4; 1176 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7) = r5; 1177 HS_BLOCK_BARRIER(); 1178 { 1179 { 1180 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); 1181 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(16); 1182 HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(32); 1183 HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(48); 1184 HS_CMP_XCHG(r0_2, r0_3); 1185 HS_CMP_XCHG(r0_1, r0_4); 1186 HS_CMP_XCHG(r0_3, r0_4); 1187 HS_CMP_XCHG(r0_1, r0_2); 1188 HS_SLAB_LOCAL_L(0) = r0_1; 1189 HS_SLAB_LOCAL_L(16) = r0_2; 1190 HS_SLAB_LOCAL_R(32) = r0_3; 1191 HS_SLAB_LOCAL_R(48) = r0_4; 1192 } 1193 { 1194 HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(64); 1195 HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(80); 1196 HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_R(96); 1197 HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_R(112); 1198 HS_CMP_XCHG(r1_2, r1_3); 1199 HS_CMP_XCHG(r1_1, r1_4); 1200 HS_CMP_XCHG(r1_3, r1_4); 1201 HS_CMP_XCHG(r1_1, r1_2); 1202 HS_SLAB_LOCAL_L(64) = r1_1; 1203 HS_SLAB_LOCAL_L(80) = r1_2; 1204 HS_SLAB_LOCAL_R(96) = r1_3; 1205 HS_SLAB_LOCAL_R(112) = r1_4; 1206 } 1207 } 1208 HS_BLOCK_BARRIER(); 1209 r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0); 1210 r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1); 1211 r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2); 1212 r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3); 1213 r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4); 1214 r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5); 1215 r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6); 1216 r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7); 1217 { 1218 { 1219 HS_SLAB_HALF_PREAMBLE(8); 1220 HS_CMP_HALF(0, r1); 1221 HS_CMP_HALF(1, r2); 1222 HS_CMP_HALF(2, r3); 1223 HS_CMP_HALF(3, r4); 1224 HS_CMP_HALF(4, r5); 1225 HS_CMP_HALF(5, r6); 1226 HS_CMP_HALF(6, r7); 1227 HS_CMP_HALF(7, r8); 1228 } 1229 { 1230 HS_SLAB_HALF_PREAMBLE(4); 1231 HS_CMP_HALF(0, r1); 1232 HS_CMP_HALF(1, r2); 1233 HS_CMP_HALF(2, r3); 1234 HS_CMP_HALF(3, r4); 1235 HS_CMP_HALF(4, r5); 1236 HS_CMP_HALF(5, r6); 1237 HS_CMP_HALF(6, r7); 1238 HS_CMP_HALF(7, r8); 1239 } 1240 { 1241 HS_SLAB_HALF_PREAMBLE(2); 1242 HS_CMP_HALF(0, r1); 1243 
HS_CMP_HALF(1, r2); 1244 HS_CMP_HALF(2, r3); 1245 HS_CMP_HALF(3, r4); 1246 HS_CMP_HALF(4, r5); 1247 HS_CMP_HALF(5, r6); 1248 HS_CMP_HALF(6, r7); 1249 HS_CMP_HALF(7, r8); 1250 } 1251 { 1252 HS_SLAB_HALF_PREAMBLE(1); 1253 HS_CMP_HALF(0, r1); 1254 HS_CMP_HALF(1, r2); 1255 HS_CMP_HALF(2, r3); 1256 HS_CMP_HALF(3, r4); 1257 HS_CMP_HALF(4, r5); 1258 HS_CMP_HALF(5, r6); 1259 HS_CMP_HALF(6, r7); 1260 HS_CMP_HALF(7, r8); 1261 } 1262 HS_CMP_XCHG(r1, r5); 1263 HS_CMP_XCHG(r3, r7); 1264 HS_CMP_XCHG(r1, r3); 1265 HS_CMP_XCHG(r5, r7); 1266 HS_CMP_XCHG(r2, r6); 1267 HS_CMP_XCHG(r4, r8); 1268 HS_CMP_XCHG(r2, r4); 1269 HS_CMP_XCHG(r6, r8); 1270 HS_CMP_XCHG(r1, r2); 1271 HS_CMP_XCHG(r3, r4); 1272 HS_CMP_XCHG(r5, r6); 1273 HS_CMP_XCHG(r7, r8); 1274 } 1275 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0) = r1; 1276 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1) = r8; 1277 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2) = r2; 1278 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3) = r7; 1279 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4) = r3; 1280 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5) = r6; 1281 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6) = r4; 1282 HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7) = r5; 1283 HS_BLOCK_BARRIER(); 1284 { 1285 { 1286 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); 1287 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(16); 1288 HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(32); 1289 HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(48); 1290 HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_R(64); 1291 HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_R(80); 1292 HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_R(96); 1293 HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_R(112); 1294 HS_CMP_XCHG(r0_4, r0_5); 1295 HS_CMP_XCHG(r0_3, r0_6); 1296 HS_CMP_XCHG(r0_2, r0_7); 1297 HS_CMP_XCHG(r0_1, r0_8); 1298 HS_CMP_XCHG(r0_5, r0_7); 1299 HS_CMP_XCHG(r0_6, r0_8); 1300 HS_CMP_XCHG(r0_5, r0_6); 1301 HS_CMP_XCHG(r0_7, r0_8); 1302 HS_CMP_XCHG(r0_1, r0_3); 1303 HS_CMP_XCHG(r0_2, r0_4); 1304 HS_CMP_XCHG(r0_1, r0_2); 1305 HS_CMP_XCHG(r0_3, r0_4); 1306 HS_SLAB_LOCAL_L(0) = r0_1; 1307 HS_SLAB_LOCAL_L(16) = r0_2; 1308 HS_SLAB_LOCAL_L(32) = r0_3; 
1309 HS_SLAB_LOCAL_L(48) = r0_4; 1310 HS_SLAB_LOCAL_R(64) = r0_5; 1311 HS_SLAB_LOCAL_R(80) = r0_6; 1312 HS_SLAB_LOCAL_R(96) = r0_7; 1313 HS_SLAB_LOCAL_R(112) = r0_8; 1314 } 1315 } 1316 HS_BLOCK_BARRIER(); 1317 r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0); 1318 r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1); 1319 r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2); 1320 r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3); 1321 r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4); 1322 r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5); 1323 r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6); 1324 r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7); 1325 { 1326 { 1327 HS_SLAB_HALF_PREAMBLE(8); 1328 HS_CMP_HALF(0, r1); 1329 HS_CMP_HALF(1, r2); 1330 HS_CMP_HALF(2, r3); 1331 HS_CMP_HALF(3, r4); 1332 HS_CMP_HALF(4, r5); 1333 HS_CMP_HALF(5, r6); 1334 HS_CMP_HALF(6, r7); 1335 HS_CMP_HALF(7, r8); 1336 } 1337 { 1338 HS_SLAB_HALF_PREAMBLE(4); 1339 HS_CMP_HALF(0, r1); 1340 HS_CMP_HALF(1, r2); 1341 HS_CMP_HALF(2, r3); 1342 HS_CMP_HALF(3, r4); 1343 HS_CMP_HALF(4, r5); 1344 HS_CMP_HALF(5, r6); 1345 HS_CMP_HALF(6, r7); 1346 HS_CMP_HALF(7, r8); 1347 } 1348 { 1349 HS_SLAB_HALF_PREAMBLE(2); 1350 HS_CMP_HALF(0, r1); 1351 HS_CMP_HALF(1, r2); 1352 HS_CMP_HALF(2, r3); 1353 HS_CMP_HALF(3, r4); 1354 HS_CMP_HALF(4, r5); 1355 HS_CMP_HALF(5, r6); 1356 HS_CMP_HALF(6, r7); 1357 HS_CMP_HALF(7, r8); 1358 } 1359 { 1360 HS_SLAB_HALF_PREAMBLE(1); 1361 HS_CMP_HALF(0, r1); 1362 HS_CMP_HALF(1, r2); 1363 HS_CMP_HALF(2, r3); 1364 HS_CMP_HALF(3, r4); 1365 HS_CMP_HALF(4, r5); 1366 HS_CMP_HALF(5, r6); 1367 HS_CMP_HALF(6, r7); 1368 HS_CMP_HALF(7, r8); 1369 } 1370 HS_CMP_XCHG(r1, r5); 1371 HS_CMP_XCHG(r3, r7); 1372 HS_CMP_XCHG(r1, r3); 1373 HS_CMP_XCHG(r5, r7); 1374 HS_CMP_XCHG(r2, r6); 1375 HS_CMP_XCHG(r4, r8); 1376 HS_CMP_XCHG(r2, r4); 1377 HS_CMP_XCHG(r6, r8); 1378 HS_CMP_XCHG(r1, r2); 1379 HS_CMP_XCHG(r3, r4); 1380 HS_CMP_XCHG(r5, r6); 1381 HS_CMP_XCHG(r7, r8); 1382 } 1383 HS_SLAB_GLOBAL_STORE(0, r1); 1384 HS_SLAB_GLOBAL_STORE(1, r2); 1385 
//
// HS_BS_KERNEL_PROTO(16, 4)
//
// The kernel opens part-way through the next physical line, right after the
// HS_SLAB_GLOBAL_STORE(2..7) calls and closing brace of the preceding kernel.
//
// Visible flow of the kernel:
//   1. HS_BLOCK_LOCAL_MEM_DECL(256, 8) and HS_SLAB_GLOBAL_PREAMBLE().
//   2. Eight keys r1..r8 are read with HS_SLAB_GLOBAL_LOAD(vin, 0..7).
//   3. Fixed HS_CMP_XCHG sequences on r1..r8, interleaved with
//      HS_SLAB_FLIP_PREAMBLE(1), (3), (7), (15) groups (four HS_CMP_FLIP calls
//      pairing r1/r8, r2/r7, r3/r6, r4/r5) and HS_SLAB_HALF_PREAMBLE groups
//      (eight HS_CMP_HALF calls, one per key).
//   4. After HS_BS_MERGE_H_PREAMBLE(16), four rounds of: write the keys to
//      HS_BX_LOCAL_V, HS_BLOCK_BARRIER(), compare-exchange 2, 4, 8 and finally
//      16 keys read through HS_SLAB_LOCAL_L / HS_SLAB_LOCAL_R (only when
//      HS_SUBGROUP_ID() < 8), HS_BLOCK_BARRIER(), reload the keys, then
//      HALF_PREAMBLE(8), (4), (2), (1) groups and 12 HS_CMP_XCHG steps.
//   5. r1..r8 are written out with HS_SLAB_GLOBAL_STORE(0..7).
//
// Statement order is the algorithm here; do not reorder.
// NOTE(review): presumably a merge-sort network over a block of 16 slabs; the
// meaning of the (16, 4) arguments is defined by the macro -- confirm in
// hs_cl_macros.h.
// NOTE(review): the bare integers between statements (1392, 1393, ...) look
// like listing line numbers fused into the text -- confirm against upstream.
//
// This line: kernel header, the eight loads, 19 HS_CMP_XCHG steps, the
// FLIP_PREAMBLE(1) group, 12 HS_CMP_XCHG steps, the FLIP_PREAMBLE(3) group and
// the first four calls of a HALF_PREAMBLE(1) group.
//
HS_SLAB_GLOBAL_STORE(2, r3); 1386 HS_SLAB_GLOBAL_STORE(3, r4); 1387 HS_SLAB_GLOBAL_STORE(4, r5); 1388 HS_SLAB_GLOBAL_STORE(5, r6); 1389 HS_SLAB_GLOBAL_STORE(6, r7); 1390 HS_SLAB_GLOBAL_STORE(7, r8); 1391} 1392 1393HS_BS_KERNEL_PROTO(16, 4) 1394{ 1395 HS_BLOCK_LOCAL_MEM_DECL(256, 8); 1396 1397 HS_SLAB_GLOBAL_PREAMBLE(); 1398 HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0); 1399 HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1); 1400 HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2); 1401 HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3); 1402 HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4); 1403 HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5); 1404 HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6); 1405 HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7); 1406 HS_CMP_XCHG(r1, r5); 1407 HS_CMP_XCHG(r2, r6); 1408 HS_CMP_XCHG(r3, r7); 1409 HS_CMP_XCHG(r4, r8); 1410 HS_CMP_XCHG(r1, r3); 1411 HS_CMP_XCHG(r2, r4); 1412 HS_CMP_XCHG(r5, r7); 1413 HS_CMP_XCHG(r6, r8); 1414 HS_CMP_XCHG(r3, r5); 1415 HS_CMP_XCHG(r4, r6); 1416 HS_CMP_XCHG(r1, r2); 1417 HS_CMP_XCHG(r3, r4); 1418 HS_CMP_XCHG(r5, r6); 1419 HS_CMP_XCHG(r7, r8); 1420 HS_CMP_XCHG(r2, r5); 1421 HS_CMP_XCHG(r4, r7); 1422 HS_CMP_XCHG(r2, r3); 1423 HS_CMP_XCHG(r4, r5); 1424 HS_CMP_XCHG(r6, r7); 1425 { 1426 HS_SLAB_FLIP_PREAMBLE(1); 1427 HS_CMP_FLIP(0, r1, r8); 1428 HS_CMP_FLIP(1, r2, r7); 1429 HS_CMP_FLIP(2, r3, r6); 1430 HS_CMP_FLIP(3, r4, r5); 1431 } 1432 HS_CMP_XCHG(r1, r5); 1433 HS_CMP_XCHG(r3, r7); 1434 HS_CMP_XCHG(r1, r3); 1435 HS_CMP_XCHG(r5, r7); 1436 HS_CMP_XCHG(r2, r6); 1437 HS_CMP_XCHG(r4, r8); 1438 HS_CMP_XCHG(r2, r4); 1439 HS_CMP_XCHG(r6, r8); 1440 HS_CMP_XCHG(r1, r2); 1441 HS_CMP_XCHG(r3, r4); 1442 HS_CMP_XCHG(r5, r6); 1443 HS_CMP_XCHG(r7, r8); 1444 { 1445 HS_SLAB_FLIP_PREAMBLE(3); 1446 HS_CMP_FLIP(0, r1, r8); 1447 HS_CMP_FLIP(1, r2, r7); 1448 HS_CMP_FLIP(2, r3, r6); 1449 HS_CMP_FLIP(3, r4, r5); 1450 } 1451 { 1452 HS_SLAB_HALF_PREAMBLE(1); 1453 HS_CMP_HALF(0, r1); 1454 HS_CMP_HALF(1, r2); 1455 HS_CMP_HALF(2, r3); 1456 HS_CMP_HALF(3, r4); 1457 
// Continues the kernel: rest of the HALF_PREAMBLE(1) group, 12 HS_CMP_XCHG,
// FLIP_PREAMBLE(7) group, HALF_PREAMBLE(2) and (1) groups, 12 HS_CMP_XCHG,
// FLIP_PREAMBLE(15) group, HALF_PREAMBLE(4) group and all but the last call
// of a HALF_PREAMBLE(2) group.
HS_CMP_HALF(4, r5); 1458 HS_CMP_HALF(5, r6); 1459 HS_CMP_HALF(6, r7); 1460 HS_CMP_HALF(7, r8); 1461 } 1462 HS_CMP_XCHG(r1, r5); 1463 HS_CMP_XCHG(r3, r7); 1464 HS_CMP_XCHG(r1, r3); 1465 HS_CMP_XCHG(r5, r7); 1466 HS_CMP_XCHG(r2, r6); 1467 HS_CMP_XCHG(r4, r8); 1468 HS_CMP_XCHG(r2, r4); 1469 HS_CMP_XCHG(r6, r8); 1470 HS_CMP_XCHG(r1, r2); 1471 HS_CMP_XCHG(r3, r4); 1472 HS_CMP_XCHG(r5, r6); 1473 HS_CMP_XCHG(r7, r8); 1474 { 1475 HS_SLAB_FLIP_PREAMBLE(7); 1476 HS_CMP_FLIP(0, r1, r8); 1477 HS_CMP_FLIP(1, r2, r7); 1478 HS_CMP_FLIP(2, r3, r6); 1479 HS_CMP_FLIP(3, r4, r5); 1480 } 1481 { 1482 HS_SLAB_HALF_PREAMBLE(2); 1483 HS_CMP_HALF(0, r1); 1484 HS_CMP_HALF(1, r2); 1485 HS_CMP_HALF(2, r3); 1486 HS_CMP_HALF(3, r4); 1487 HS_CMP_HALF(4, r5); 1488 HS_CMP_HALF(5, r6); 1489 HS_CMP_HALF(6, r7); 1490 HS_CMP_HALF(7, r8); 1491 } 1492 { 1493 HS_SLAB_HALF_PREAMBLE(1); 1494 HS_CMP_HALF(0, r1); 1495 HS_CMP_HALF(1, r2); 1496 HS_CMP_HALF(2, r3); 1497 HS_CMP_HALF(3, r4); 1498 HS_CMP_HALF(4, r5); 1499 HS_CMP_HALF(5, r6); 1500 HS_CMP_HALF(6, r7); 1501 HS_CMP_HALF(7, r8); 1502 } 1503 HS_CMP_XCHG(r1, r5); 1504 HS_CMP_XCHG(r3, r7); 1505 HS_CMP_XCHG(r1, r3); 1506 HS_CMP_XCHG(r5, r7); 1507 HS_CMP_XCHG(r2, r6); 1508 HS_CMP_XCHG(r4, r8); 1509 HS_CMP_XCHG(r2, r4); 1510 HS_CMP_XCHG(r6, r8); 1511 HS_CMP_XCHG(r1, r2); 1512 HS_CMP_XCHG(r3, r4); 1513 HS_CMP_XCHG(r5, r6); 1514 HS_CMP_XCHG(r7, r8); 1515 { 1516 HS_SLAB_FLIP_PREAMBLE(15); 1517 HS_CMP_FLIP(0, r1, r8); 1518 HS_CMP_FLIP(1, r2, r7); 1519 HS_CMP_FLIP(2, r3, r6); 1520 HS_CMP_FLIP(3, r4, r5); 1521 } 1522 { 1523 HS_SLAB_HALF_PREAMBLE(4); 1524 HS_CMP_HALF(0, r1); 1525 HS_CMP_HALF(1, r2); 1526 HS_CMP_HALF(2, r3); 1527 HS_CMP_HALF(3, r4); 1528 HS_CMP_HALF(4, r5); 1529 HS_CMP_HALF(5, r6); 1530 HS_CMP_HALF(6, r7); 1531 HS_CMP_HALF(7, r8); 1532 } 1533 { 1534 HS_SLAB_HALF_PREAMBLE(2); 1535 HS_CMP_HALF(0, r1); 1536 HS_CMP_HALF(1, r2); 1537 HS_CMP_HALF(2, r3); 1538 HS_CMP_HALF(3, r4); 1539 HS_CMP_HALF(4, r5); 1540 HS_CMP_HALF(5, r6); 1541 HS_CMP_HALF(6, r7); 
// Continues the kernel: end of the HALF_PREAMBLE(2) group, HALF_PREAMBLE(1)
// group, 12 HS_CMP_XCHG, then HS_BS_MERGE_H_PREAMBLE(16).
// The keys are written to HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * k), k = 0..7,
// in the order r1, r8, r2, r7, r3, r6, r4, r5 (not r1..r8), followed by
// HS_BLOCK_BARRIER().
// Two-key round, executed only when HS_SUBGROUP_ID() < 8: each scope reads one
// HS_SLAB_LOCAL_L slot and one HS_SLAB_LOCAL_R slot (0/16, 32/48, 64/80,
// 96/112, ...), applies one HS_CMP_XCHG and writes both values back.
1542 HS_CMP_HALF(7, r8); 1543 } 1544 { 1545 HS_SLAB_HALF_PREAMBLE(1); 1546 HS_CMP_HALF(0, r1); 1547 HS_CMP_HALF(1, r2); 1548 HS_CMP_HALF(2, r3); 1549 HS_CMP_HALF(3, r4); 1550 HS_CMP_HALF(4, r5); 1551 HS_CMP_HALF(5, r6); 1552 HS_CMP_HALF(6, r7); 1553 HS_CMP_HALF(7, r8); 1554 } 1555 HS_CMP_XCHG(r1, r5); 1556 HS_CMP_XCHG(r3, r7); 1557 HS_CMP_XCHG(r1, r3); 1558 HS_CMP_XCHG(r5, r7); 1559 HS_CMP_XCHG(r2, r6); 1560 HS_CMP_XCHG(r4, r8); 1561 HS_CMP_XCHG(r2, r4); 1562 HS_CMP_XCHG(r6, r8); 1563 HS_CMP_XCHG(r1, r2); 1564 HS_CMP_XCHG(r3, r4); 1565 HS_CMP_XCHG(r5, r6); 1566 HS_CMP_XCHG(r7, r8); 1567 HS_BS_MERGE_H_PREAMBLE(16); 1568 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1; 1569 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r8; 1570 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2; 1571 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r7; 1572 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3; 1573 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r6; 1574 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4; 1575 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r5; 1576 HS_BLOCK_BARRIER(); 1577 if (HS_SUBGROUP_ID() < 8) { 1578 { 1579 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); 1580 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(16); 1581 HS_CMP_XCHG(r0_1, r0_2); 1582 HS_SLAB_LOCAL_L(0) = r0_1; 1583 HS_SLAB_LOCAL_R(16) = r0_2; 1584 } 1585 { 1586 HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(32); 1587 HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(48); 1588 HS_CMP_XCHG(r1_1, r1_2); 1589 HS_SLAB_LOCAL_L(32) = r1_1; 1590 HS_SLAB_LOCAL_R(48) = r1_2; 1591 } 1592 { 1593 HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(64); 1594 HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_R(80); 1595 HS_CMP_XCHG(r2_1, r2_2); 1596 HS_SLAB_LOCAL_L(64) = r2_1; 1597 HS_SLAB_LOCAL_R(80) = r2_2; 1598 } 1599 { 1600 HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(96); 1601 HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_R(112); 1602 HS_CMP_XCHG(r3_1, r3_2); 1603 HS_SLAB_LOCAL_L(96) = r3_1; 1604 HS_SLAB_LOCAL_R(112) = r3_2; 1605 } 1606 { 1607 HS_KEY_TYPE r4_1 = HS_SLAB_LOCAL_L(128); 1608 HS_KEY_TYPE r4_2 = HS_SLAB_LOCAL_R(144); 1609 
// Continues the kernel: remaining two-key scopes (128/144, 160/176, 192/208,
// 224/240) and the end of the guarded region; HS_BLOCK_BARRIER(); the keys are
// reloaded from HS_BX_LOCAL_V in the same r1, r8, r2, r7, r3, r6, r4, r5
// order they were written; HALF_PREAMBLE(8), (4), (2) groups and the first two
// calls of a HALF_PREAMBLE(1) group.
HS_CMP_XCHG(r4_1, r4_2); 1610 HS_SLAB_LOCAL_L(128) = r4_1; 1611 HS_SLAB_LOCAL_R(144) = r4_2; 1612 } 1613 { 1614 HS_KEY_TYPE r5_1 = HS_SLAB_LOCAL_L(160); 1615 HS_KEY_TYPE r5_2 = HS_SLAB_LOCAL_R(176); 1616 HS_CMP_XCHG(r5_1, r5_2); 1617 HS_SLAB_LOCAL_L(160) = r5_1; 1618 HS_SLAB_LOCAL_R(176) = r5_2; 1619 } 1620 { 1621 HS_KEY_TYPE r6_1 = HS_SLAB_LOCAL_L(192); 1622 HS_KEY_TYPE r6_2 = HS_SLAB_LOCAL_R(208); 1623 HS_CMP_XCHG(r6_1, r6_2); 1624 HS_SLAB_LOCAL_L(192) = r6_1; 1625 HS_SLAB_LOCAL_R(208) = r6_2; 1626 } 1627 { 1628 HS_KEY_TYPE r7_1 = HS_SLAB_LOCAL_L(224); 1629 HS_KEY_TYPE r7_2 = HS_SLAB_LOCAL_R(240); 1630 HS_CMP_XCHG(r7_1, r7_2); 1631 HS_SLAB_LOCAL_L(224) = r7_1; 1632 HS_SLAB_LOCAL_R(240) = r7_2; 1633 } 1634 } 1635 HS_BLOCK_BARRIER(); 1636 r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0); 1637 r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1); 1638 r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2); 1639 r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3); 1640 r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4); 1641 r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5); 1642 r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6); 1643 r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7); 1644 { 1645 { 1646 HS_SLAB_HALF_PREAMBLE(8); 1647 HS_CMP_HALF(0, r1); 1648 HS_CMP_HALF(1, r2); 1649 HS_CMP_HALF(2, r3); 1650 HS_CMP_HALF(3, r4); 1651 HS_CMP_HALF(4, r5); 1652 HS_CMP_HALF(5, r6); 1653 HS_CMP_HALF(6, r7); 1654 HS_CMP_HALF(7, r8); 1655 } 1656 { 1657 HS_SLAB_HALF_PREAMBLE(4); 1658 HS_CMP_HALF(0, r1); 1659 HS_CMP_HALF(1, r2); 1660 HS_CMP_HALF(2, r3); 1661 HS_CMP_HALF(3, r4); 1662 HS_CMP_HALF(4, r5); 1663 HS_CMP_HALF(5, r6); 1664 HS_CMP_HALF(6, r7); 1665 HS_CMP_HALF(7, r8); 1666 } 1667 { 1668 HS_SLAB_HALF_PREAMBLE(2); 1669 HS_CMP_HALF(0, r1); 1670 HS_CMP_HALF(1, r2); 1671 HS_CMP_HALF(2, r3); 1672 HS_CMP_HALF(3, r4); 1673 HS_CMP_HALF(4, r5); 1674 HS_CMP_HALF(5, r6); 1675 HS_CMP_HALF(6, r7); 1676 HS_CMP_HALF(7, r8); 1677 } 1678 { 1679 HS_SLAB_HALF_PREAMBLE(1); 1680 HS_CMP_HALF(0, r1); 1681 HS_CMP_HALF(1, r2); 1682 
// Continues the kernel: rest of the HALF_PREAMBLE(1) group, 12 HS_CMP_XCHG,
// write-back to HS_BX_LOCAL_V, HS_BLOCK_BARRIER().
// Four-key round (HS_SUBGROUP_ID() < 8): each scope reads two HS_SLAB_LOCAL_L
// and two HS_SLAB_LOCAL_R slots, applies four HS_CMP_XCHG steps (inner pair,
// outer pair, then each half) and writes the values back. Scopes on this line:
// offsets 0..48, 64..112 and the first three loads of 128..176; the r2_4
// declaration is split across the line break.
HS_CMP_HALF(2, r3); 1683 HS_CMP_HALF(3, r4); 1684 HS_CMP_HALF(4, r5); 1685 HS_CMP_HALF(5, r6); 1686 HS_CMP_HALF(6, r7); 1687 HS_CMP_HALF(7, r8); 1688 } 1689 HS_CMP_XCHG(r1, r5); 1690 HS_CMP_XCHG(r3, r7); 1691 HS_CMP_XCHG(r1, r3); 1692 HS_CMP_XCHG(r5, r7); 1693 HS_CMP_XCHG(r2, r6); 1694 HS_CMP_XCHG(r4, r8); 1695 HS_CMP_XCHG(r2, r4); 1696 HS_CMP_XCHG(r6, r8); 1697 HS_CMP_XCHG(r1, r2); 1698 HS_CMP_XCHG(r3, r4); 1699 HS_CMP_XCHG(r5, r6); 1700 HS_CMP_XCHG(r7, r8); 1701 } 1702 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1; 1703 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r8; 1704 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2; 1705 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r7; 1706 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3; 1707 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r6; 1708 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4; 1709 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r5; 1710 HS_BLOCK_BARRIER(); 1711 if (HS_SUBGROUP_ID() < 8) { 1712 { 1713 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); 1714 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(16); 1715 HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(32); 1716 HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(48); 1717 HS_CMP_XCHG(r0_2, r0_3); 1718 HS_CMP_XCHG(r0_1, r0_4); 1719 HS_CMP_XCHG(r0_3, r0_4); 1720 HS_CMP_XCHG(r0_1, r0_2); 1721 HS_SLAB_LOCAL_L(0) = r0_1; 1722 HS_SLAB_LOCAL_L(16) = r0_2; 1723 HS_SLAB_LOCAL_R(32) = r0_3; 1724 HS_SLAB_LOCAL_R(48) = r0_4; 1725 } 1726 { 1727 HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(64); 1728 HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(80); 1729 HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_R(96); 1730 HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_R(112); 1731 HS_CMP_XCHG(r1_2, r1_3); 1732 HS_CMP_XCHG(r1_1, r1_4); 1733 HS_CMP_XCHG(r1_3, r1_4); 1734 HS_CMP_XCHG(r1_1, r1_2); 1735 HS_SLAB_LOCAL_L(64) = r1_1; 1736 HS_SLAB_LOCAL_L(80) = r1_2; 1737 HS_SLAB_LOCAL_R(96) = r1_3; 1738 HS_SLAB_LOCAL_R(112) = r1_4; 1739 } 1740 { 1741 HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(128); 1742 HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_L(144); 1743 HS_KEY_TYPE r2_3 = HS_SLAB_LOCAL_R(160); 1744 HS_KEY_TYPE r2_4 = 
// Continues the kernel: remaining four-key scopes (128..176, 192..240) and the
// end of the guarded region; HS_BLOCK_BARRIER(); reload of the keys from
// HS_BX_LOCAL_V; HALF_PREAMBLE(8), (4), (2) groups and the first call of a
// HALF_PREAMBLE(1) group.
HS_SLAB_LOCAL_R(176); 1745 HS_CMP_XCHG(r2_2, r2_3); 1746 HS_CMP_XCHG(r2_1, r2_4); 1747 HS_CMP_XCHG(r2_3, r2_4); 1748 HS_CMP_XCHG(r2_1, r2_2); 1749 HS_SLAB_LOCAL_L(128) = r2_1; 1750 HS_SLAB_LOCAL_L(144) = r2_2; 1751 HS_SLAB_LOCAL_R(160) = r2_3; 1752 HS_SLAB_LOCAL_R(176) = r2_4; 1753 } 1754 { 1755 HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(192); 1756 HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_L(208); 1757 HS_KEY_TYPE r3_3 = HS_SLAB_LOCAL_R(224); 1758 HS_KEY_TYPE r3_4 = HS_SLAB_LOCAL_R(240); 1759 HS_CMP_XCHG(r3_2, r3_3); 1760 HS_CMP_XCHG(r3_1, r3_4); 1761 HS_CMP_XCHG(r3_3, r3_4); 1762 HS_CMP_XCHG(r3_1, r3_2); 1763 HS_SLAB_LOCAL_L(192) = r3_1; 1764 HS_SLAB_LOCAL_L(208) = r3_2; 1765 HS_SLAB_LOCAL_R(224) = r3_3; 1766 HS_SLAB_LOCAL_R(240) = r3_4; 1767 } 1768 } 1769 HS_BLOCK_BARRIER(); 1770 r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0); 1771 r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1); 1772 r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2); 1773 r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3); 1774 r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4); 1775 r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5); 1776 r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6); 1777 r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7); 1778 { 1779 { 1780 HS_SLAB_HALF_PREAMBLE(8); 1781 HS_CMP_HALF(0, r1); 1782 HS_CMP_HALF(1, r2); 1783 HS_CMP_HALF(2, r3); 1784 HS_CMP_HALF(3, r4); 1785 HS_CMP_HALF(4, r5); 1786 HS_CMP_HALF(5, r6); 1787 HS_CMP_HALF(6, r7); 1788 HS_CMP_HALF(7, r8); 1789 } 1790 { 1791 HS_SLAB_HALF_PREAMBLE(4); 1792 HS_CMP_HALF(0, r1); 1793 HS_CMP_HALF(1, r2); 1794 HS_CMP_HALF(2, r3); 1795 HS_CMP_HALF(3, r4); 1796 HS_CMP_HALF(4, r5); 1797 HS_CMP_HALF(5, r6); 1798 HS_CMP_HALF(6, r7); 1799 HS_CMP_HALF(7, r8); 1800 } 1801 { 1802 HS_SLAB_HALF_PREAMBLE(2); 1803 HS_CMP_HALF(0, r1); 1804 HS_CMP_HALF(1, r2); 1805 HS_CMP_HALF(2, r3); 1806 HS_CMP_HALF(3, r4); 1807 HS_CMP_HALF(4, r5); 1808 HS_CMP_HALF(5, r6); 1809 HS_CMP_HALF(6, r7); 1810 HS_CMP_HALF(7, r8); 1811 } 1812 { 1813 HS_SLAB_HALF_PREAMBLE(1); 1814 HS_CMP_HALF(0, r1); 1815 
// Continues the kernel: rest of the HALF_PREAMBLE(1) group, 12 HS_CMP_XCHG,
// write-back to HS_BX_LOCAL_V, HS_BLOCK_BARRIER().
// Eight-key round (HS_SUBGROUP_ID() < 8): the first scope reads four
// HS_SLAB_LOCAL_L slots (0..48) and four HS_SLAB_LOCAL_R slots (64..112),
// applies 12 HS_CMP_XCHG steps and writes all eight values back. The r1_1
// declaration of the second scope is split across the line break.
HS_CMP_HALF(1, r2); 1816 HS_CMP_HALF(2, r3); 1817 HS_CMP_HALF(3, r4); 1818 HS_CMP_HALF(4, r5); 1819 HS_CMP_HALF(5, r6); 1820 HS_CMP_HALF(6, r7); 1821 HS_CMP_HALF(7, r8); 1822 } 1823 HS_CMP_XCHG(r1, r5); 1824 HS_CMP_XCHG(r3, r7); 1825 HS_CMP_XCHG(r1, r3); 1826 HS_CMP_XCHG(r5, r7); 1827 HS_CMP_XCHG(r2, r6); 1828 HS_CMP_XCHG(r4, r8); 1829 HS_CMP_XCHG(r2, r4); 1830 HS_CMP_XCHG(r6, r8); 1831 HS_CMP_XCHG(r1, r2); 1832 HS_CMP_XCHG(r3, r4); 1833 HS_CMP_XCHG(r5, r6); 1834 HS_CMP_XCHG(r7, r8); 1835 } 1836 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1; 1837 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r8; 1838 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2; 1839 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r7; 1840 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3; 1841 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r6; 1842 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4; 1843 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r5; 1844 HS_BLOCK_BARRIER(); 1845 if (HS_SUBGROUP_ID() < 8) { 1846 { 1847 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); 1848 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(16); 1849 HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(32); 1850 HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(48); 1851 HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_R(64); 1852 HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_R(80); 1853 HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_R(96); 1854 HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_R(112); 1855 HS_CMP_XCHG(r0_4, r0_5); 1856 HS_CMP_XCHG(r0_3, r0_6); 1857 HS_CMP_XCHG(r0_2, r0_7); 1858 HS_CMP_XCHG(r0_1, r0_8); 1859 HS_CMP_XCHG(r0_5, r0_7); 1860 HS_CMP_XCHG(r0_6, r0_8); 1861 HS_CMP_XCHG(r0_5, r0_6); 1862 HS_CMP_XCHG(r0_7, r0_8); 1863 HS_CMP_XCHG(r0_1, r0_3); 1864 HS_CMP_XCHG(r0_2, r0_4); 1865 HS_CMP_XCHG(r0_1, r0_2); 1866 HS_CMP_XCHG(r0_3, r0_4); 1867 HS_SLAB_LOCAL_L(0) = r0_1; 1868 HS_SLAB_LOCAL_L(16) = r0_2; 1869 HS_SLAB_LOCAL_L(32) = r0_3; 1870 HS_SLAB_LOCAL_L(48) = r0_4; 1871 HS_SLAB_LOCAL_R(64) = r0_5; 1872 HS_SLAB_LOCAL_R(80) = r0_6; 1873 HS_SLAB_LOCAL_R(96) = r0_7; 1874 HS_SLAB_LOCAL_R(112) = r0_8; 1875 } 1876 { 1877 HS_KEY_TYPE r1_1 = 
// Continues the kernel: second eight-key scope (HS_SLAB_LOCAL_L 128..176,
// HS_SLAB_LOCAL_R 192..240) and the end of the guarded region;
// HS_BLOCK_BARRIER(); reload of the keys from HS_BX_LOCAL_V;
// HALF_PREAMBLE(8) and (4) groups and the first call of a HALF_PREAMBLE(2)
// group.
HS_SLAB_LOCAL_L(128); 1878 HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(144); 1879 HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_L(160); 1880 HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_L(176); 1881 HS_KEY_TYPE r1_5 = HS_SLAB_LOCAL_R(192); 1882 HS_KEY_TYPE r1_6 = HS_SLAB_LOCAL_R(208); 1883 HS_KEY_TYPE r1_7 = HS_SLAB_LOCAL_R(224); 1884 HS_KEY_TYPE r1_8 = HS_SLAB_LOCAL_R(240); 1885 HS_CMP_XCHG(r1_4, r1_5); 1886 HS_CMP_XCHG(r1_3, r1_6); 1887 HS_CMP_XCHG(r1_2, r1_7); 1888 HS_CMP_XCHG(r1_1, r1_8); 1889 HS_CMP_XCHG(r1_5, r1_7); 1890 HS_CMP_XCHG(r1_6, r1_8); 1891 HS_CMP_XCHG(r1_5, r1_6); 1892 HS_CMP_XCHG(r1_7, r1_8); 1893 HS_CMP_XCHG(r1_1, r1_3); 1894 HS_CMP_XCHG(r1_2, r1_4); 1895 HS_CMP_XCHG(r1_1, r1_2); 1896 HS_CMP_XCHG(r1_3, r1_4); 1897 HS_SLAB_LOCAL_L(128) = r1_1; 1898 HS_SLAB_LOCAL_L(144) = r1_2; 1899 HS_SLAB_LOCAL_L(160) = r1_3; 1900 HS_SLAB_LOCAL_L(176) = r1_4; 1901 HS_SLAB_LOCAL_R(192) = r1_5; 1902 HS_SLAB_LOCAL_R(208) = r1_6; 1903 HS_SLAB_LOCAL_R(224) = r1_7; 1904 HS_SLAB_LOCAL_R(240) = r1_8; 1905 } 1906 } 1907 HS_BLOCK_BARRIER(); 1908 r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0); 1909 r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1); 1910 r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2); 1911 r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3); 1912 r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4); 1913 r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5); 1914 r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6); 1915 r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7); 1916 { 1917 { 1918 HS_SLAB_HALF_PREAMBLE(8); 1919 HS_CMP_HALF(0, r1); 1920 HS_CMP_HALF(1, r2); 1921 HS_CMP_HALF(2, r3); 1922 HS_CMP_HALF(3, r4); 1923 HS_CMP_HALF(4, r5); 1924 HS_CMP_HALF(5, r6); 1925 HS_CMP_HALF(6, r7); 1926 HS_CMP_HALF(7, r8); 1927 } 1928 { 1929 HS_SLAB_HALF_PREAMBLE(4); 1930 HS_CMP_HALF(0, r1); 1931 HS_CMP_HALF(1, r2); 1932 HS_CMP_HALF(2, r3); 1933 HS_CMP_HALF(3, r4); 1934 HS_CMP_HALF(4, r5); 1935 HS_CMP_HALF(5, r6); 1936 HS_CMP_HALF(6, r7); 1937 HS_CMP_HALF(7, r8); 1938 } 1939 { 1940 HS_SLAB_HALF_PREAMBLE(2); 1941 HS_CMP_HALF(0, r1); 1942 
// Continues the kernel: rest of the HALF_PREAMBLE(2) group, HALF_PREAMBLE(1)
// group, 12 HS_CMP_XCHG, write-back to HS_BX_LOCAL_V, HS_BLOCK_BARRIER().
// Sixteen-key round (HS_SUBGROUP_ID() < 8): r0_1..r0_8 come from
// HS_SLAB_LOCAL_L(0..112) and r0_9..r0_16 from HS_SLAB_LOCAL_R(128..240); the
// first compare-exchange (r0_8, r0_9) is the last statement on this line.
HS_CMP_HALF(1, r2); 1943 HS_CMP_HALF(2, r3); 1944 HS_CMP_HALF(3, r4); 1945 HS_CMP_HALF(4, r5); 1946 HS_CMP_HALF(5, r6); 1947 HS_CMP_HALF(6, r7); 1948 HS_CMP_HALF(7, r8); 1949 } 1950 { 1951 HS_SLAB_HALF_PREAMBLE(1); 1952 HS_CMP_HALF(0, r1); 1953 HS_CMP_HALF(1, r2); 1954 HS_CMP_HALF(2, r3); 1955 HS_CMP_HALF(3, r4); 1956 HS_CMP_HALF(4, r5); 1957 HS_CMP_HALF(5, r6); 1958 HS_CMP_HALF(6, r7); 1959 HS_CMP_HALF(7, r8); 1960 } 1961 HS_CMP_XCHG(r1, r5); 1962 HS_CMP_XCHG(r3, r7); 1963 HS_CMP_XCHG(r1, r3); 1964 HS_CMP_XCHG(r5, r7); 1965 HS_CMP_XCHG(r2, r6); 1966 HS_CMP_XCHG(r4, r8); 1967 HS_CMP_XCHG(r2, r4); 1968 HS_CMP_XCHG(r6, r8); 1969 HS_CMP_XCHG(r1, r2); 1970 HS_CMP_XCHG(r3, r4); 1971 HS_CMP_XCHG(r5, r6); 1972 HS_CMP_XCHG(r7, r8); 1973 } 1974 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1; 1975 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r8; 1976 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2; 1977 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r7; 1978 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3; 1979 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r6; 1980 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4; 1981 HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r5; 1982 HS_BLOCK_BARRIER(); 1983 if (HS_SUBGROUP_ID() < 8) { 1984 { 1985 HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); 1986 HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(16); 1987 HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(32); 1988 HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(48); 1989 HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_L(64); 1990 HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_L(80); 1991 HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_L(96); 1992 HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_L(112); 1993 HS_KEY_TYPE r0_9 = HS_SLAB_LOCAL_R(128); 1994 HS_KEY_TYPE r0_10 = HS_SLAB_LOCAL_R(144); 1995 HS_KEY_TYPE r0_11 = HS_SLAB_LOCAL_R(160); 1996 HS_KEY_TYPE r0_12 = HS_SLAB_LOCAL_R(176); 1997 HS_KEY_TYPE r0_13 = HS_SLAB_LOCAL_R(192); 1998 HS_KEY_TYPE r0_14 = HS_SLAB_LOCAL_R(208); 1999 HS_KEY_TYPE r0_15 = HS_SLAB_LOCAL_R(224); 2000 HS_KEY_TYPE r0_16 = HS_SLAB_LOCAL_R(240); 2001 HS_CMP_XCHG(r0_8, r0_9); 2002 
// Continues the kernel: remaining compare-exchanges of the sixteen-key scope
// (seven more L/R cross pairs, then 12 steps on r0_9..r0_16 and 12 steps on
// r0_1..r0_8), its write-back to the same sixteen slots, HS_BLOCK_BARRIER(),
// reload of the keys from HS_BX_LOCAL_V, and HS_SLAB_HALF_PREAMBLE(8).
// The HS_CMP_HALF calls of that group, the remaining HALF groups, the final
// 12 HS_CMP_XCHG and the HS_SLAB_GLOBAL_STORE(0..7) calls follow on the next
// physical line.
HS_CMP_XCHG(r0_7, r0_10); 2003 HS_CMP_XCHG(r0_6, r0_11); 2004 HS_CMP_XCHG(r0_5, r0_12); 2005 HS_CMP_XCHG(r0_4, r0_13); 2006 HS_CMP_XCHG(r0_3, r0_14); 2007 HS_CMP_XCHG(r0_2, r0_15); 2008 HS_CMP_XCHG(r0_1, r0_16); 2009 HS_CMP_XCHG(r0_9, r0_13); 2010 HS_CMP_XCHG(r0_11, r0_15); 2011 HS_CMP_XCHG(r0_9, r0_11); 2012 HS_CMP_XCHG(r0_13, r0_15); 2013 HS_CMP_XCHG(r0_10, r0_14); 2014 HS_CMP_XCHG(r0_12, r0_16); 2015 HS_CMP_XCHG(r0_10, r0_12); 2016 HS_CMP_XCHG(r0_14, r0_16); 2017 HS_CMP_XCHG(r0_9, r0_10); 2018 HS_CMP_XCHG(r0_11, r0_12); 2019 HS_CMP_XCHG(r0_13, r0_14); 2020 HS_CMP_XCHG(r0_15, r0_16); 2021 HS_CMP_XCHG(r0_1, r0_5); 2022 HS_CMP_XCHG(r0_3, r0_7); 2023 HS_CMP_XCHG(r0_1, r0_3); 2024 HS_CMP_XCHG(r0_5, r0_7); 2025 HS_CMP_XCHG(r0_2, r0_6); 2026 HS_CMP_XCHG(r0_4, r0_8); 2027 HS_CMP_XCHG(r0_2, r0_4); 2028 HS_CMP_XCHG(r0_6, r0_8); 2029 HS_CMP_XCHG(r0_1, r0_2); 2030 HS_CMP_XCHG(r0_3, r0_4); 2031 HS_CMP_XCHG(r0_5, r0_6); 2032 HS_CMP_XCHG(r0_7, r0_8); 2033 HS_SLAB_LOCAL_L(0) = r0_1; 2034 HS_SLAB_LOCAL_L(16) = r0_2; 2035 HS_SLAB_LOCAL_L(32) = r0_3; 2036 HS_SLAB_LOCAL_L(48) = r0_4; 2037 HS_SLAB_LOCAL_L(64) = r0_5; 2038 HS_SLAB_LOCAL_L(80) = r0_6; 2039 HS_SLAB_LOCAL_L(96) = r0_7; 2040 HS_SLAB_LOCAL_L(112) = r0_8; 2041 HS_SLAB_LOCAL_R(128) = r0_9; 2042 HS_SLAB_LOCAL_R(144) = r0_10; 2043 HS_SLAB_LOCAL_R(160) = r0_11; 2044 HS_SLAB_LOCAL_R(176) = r0_12; 2045 HS_SLAB_LOCAL_R(192) = r0_13; 2046 HS_SLAB_LOCAL_R(208) = r0_14; 2047 HS_SLAB_LOCAL_R(224) = r0_15; 2048 HS_SLAB_LOCAL_R(240) = r0_16; 2049 } 2050 } 2051 HS_BLOCK_BARRIER(); 2052 r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0); 2053 r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1); 2054 r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2); 2055 r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3); 2056 r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4); 2057 r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5); 2058 r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6); 2059 r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7); 2060 { 2061 { 2062 HS_SLAB_HALF_PREAMBLE(8); 2063 
//
// HS_BC_KERNEL_PROTO(1, 0)
//
// The physical line below first finishes the preceding
// HS_BS_KERNEL_PROTO(16, 4) kernel: the HS_CMP_HALF calls of its
// HALF_PREAMBLE(8) group, HALF_PREAMBLE(4), (2), (1) groups, 12 HS_CMP_XCHG,
// HS_SLAB_GLOBAL_STORE(0..7) of r1..r8 and the closing brace.
//
// HS_BC_KERNEL_PROTO(1, 0) then opens on the same line:
//   - HS_SLAB_GLOBAL_PREAMBLE();
//   - r1..r8 are read with HS_SLAB_GLOBAL_LOAD(vout, 0..7) -- note `vout`,
//     whereas the HS_BS_* kernel above reads `vin`; the r8 declaration is
//     split across the line break;
//   - on the following physical line the body applies HALF_PREAMBLE(8), (4),
//     (2), (1) groups (eight HS_CMP_HALF calls each), 12 HS_CMP_XCHG steps and
//     writes r1..r8 back with HS_SLAB_GLOBAL_STORE(0..7).
// No HS_BLOCK_LOCAL_MEM_DECL, HS_SLAB_LOCAL_* or HS_BLOCK_BARRIER call appears
// in this kernel.
//
// NOTE(review): presumably the single-slab "clean" pass applied to data that
// is already in the output buffer -- confirm against the macro definitions in
// hs_cl_macros.h.
// NOTE(review): the bare integers between statements (2127, 2128, ...) look
// like listing line numbers fused into the text -- confirm against upstream.
//
HS_CMP_HALF(0, r1); 2064 HS_CMP_HALF(1, r2); 2065 HS_CMP_HALF(2, r3); 2066 HS_CMP_HALF(3, r4); 2067 HS_CMP_HALF(4, r5); 2068 HS_CMP_HALF(5, r6); 2069 HS_CMP_HALF(6, r7); 2070 HS_CMP_HALF(7, r8); 2071 } 2072 { 2073 HS_SLAB_HALF_PREAMBLE(4); 2074 HS_CMP_HALF(0, r1); 2075 HS_CMP_HALF(1, r2); 2076 HS_CMP_HALF(2, r3); 2077 HS_CMP_HALF(3, r4); 2078 HS_CMP_HALF(4, r5); 2079 HS_CMP_HALF(5, r6); 2080 HS_CMP_HALF(6, r7); 2081 HS_CMP_HALF(7, r8); 2082 } 2083 { 2084 HS_SLAB_HALF_PREAMBLE(2); 2085 HS_CMP_HALF(0, r1); 2086 HS_CMP_HALF(1, r2); 2087 HS_CMP_HALF(2, r3); 2088 HS_CMP_HALF(3, r4); 2089 HS_CMP_HALF(4, r5); 2090 HS_CMP_HALF(5, r6); 2091 HS_CMP_HALF(6, r7); 2092 HS_CMP_HALF(7, r8); 2093 } 2094 { 2095 HS_SLAB_HALF_PREAMBLE(1); 2096 HS_CMP_HALF(0, r1); 2097 HS_CMP_HALF(1, r2); 2098 HS_CMP_HALF(2, r3); 2099 HS_CMP_HALF(3, r4); 2100 HS_CMP_HALF(4, r5); 2101 HS_CMP_HALF(5, r6); 2102 HS_CMP_HALF(6, r7); 2103 HS_CMP_HALF(7, r8); 2104 } 2105 HS_CMP_XCHG(r1, r5); 2106 HS_CMP_XCHG(r3, r7); 2107 HS_CMP_XCHG(r1, r3); 2108 HS_CMP_XCHG(r5, r7); 2109 HS_CMP_XCHG(r2, r6); 2110 HS_CMP_XCHG(r4, r8); 2111 HS_CMP_XCHG(r2, r4); 2112 HS_CMP_XCHG(r6, r8); 2113 HS_CMP_XCHG(r1, r2); 2114 HS_CMP_XCHG(r3, r4); 2115 HS_CMP_XCHG(r5, r6); 2116 HS_CMP_XCHG(r7, r8); 2117 } 2118 HS_SLAB_GLOBAL_STORE(0, r1); 2119 HS_SLAB_GLOBAL_STORE(1, r2); 2120 HS_SLAB_GLOBAL_STORE(2, r3); 2121 HS_SLAB_GLOBAL_STORE(3, r4); 2122 HS_SLAB_GLOBAL_STORE(4, r5); 2123 HS_SLAB_GLOBAL_STORE(5, r6); 2124 HS_SLAB_GLOBAL_STORE(6, r7); 2125 HS_SLAB_GLOBAL_STORE(7, r8); 2126} 2127 2128HS_BC_KERNEL_PROTO(1, 0) 2129{ 2130 HS_SLAB_GLOBAL_PREAMBLE(); 2131 HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vout, 0); 2132 HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vout, 1); 2133 HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vout, 2); 2134 HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vout, 3); 2135 HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vout, 4); 2136 HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vout, 5); 2137 HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vout, 6); 2138 HS_KEY_TYPE r8 = 
//
// HS_BC_KERNEL_PROTO(2, 1)
//
// The physical line below first finishes the preceding
// HS_BC_KERNEL_PROTO(1, 0) kernel (its r8 load, HALF_PREAMBLE(8), (4), (2),
// (1) groups, 12 HS_CMP_XCHG, HS_SLAB_GLOBAL_STORE(0..7), closing brace).
//
// HS_BC_KERNEL_PROTO(2, 1) then opens on the same line:
//   1. HS_BLOCK_LOCAL_MEM_DECL(32, 8), HS_SLAB_GLOBAL_PREAMBLE(),
//      HS_BC_MERGE_H_PREAMBLE(2).
//   2. Four two-key scopes. Each reads a pair with HS_BC_GLOBAL_LOAD_L
//      (0/8, 2/10, 4/12, 6/14), applies one HS_CMP_XCHG and stores the pair
//      into HS_SLAB_LOCAL_L (0/16, 64/80, 128/144, 192/208). Every scope
//      reuses the names r0_1 / r0_2; the braces keep them separate.
//   3. HS_BLOCK_BARRIER(), then r1..r8 are declared from
//      HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * k), k = 0..7, in natural order.
//   4. HALF_PREAMBLE(8), (4), (2), (1) groups, 12 HS_CMP_XCHG and
//      HS_SLAB_GLOBAL_STORE(0..7).
//
// Only the first scope and the opening of the second fit on this line; the
// second scope's first declaration is split across the line break.
// NOTE(review): the load/store offsets are generated constants; their relation
// to slab width/height is defined by hs_config.h -- confirm there.
// NOTE(review): the bare integers between statements (2206, 2207, ...) look
// like listing line numbers fused into the text -- confirm against upstream.
//
HS_SLAB_GLOBAL_LOAD(vout, 7); 2139 { 2140 { 2141 HS_SLAB_HALF_PREAMBLE(8); 2142 HS_CMP_HALF(0, r1); 2143 HS_CMP_HALF(1, r2); 2144 HS_CMP_HALF(2, r3); 2145 HS_CMP_HALF(3, r4); 2146 HS_CMP_HALF(4, r5); 2147 HS_CMP_HALF(5, r6); 2148 HS_CMP_HALF(6, r7); 2149 HS_CMP_HALF(7, r8); 2150 } 2151 { 2152 HS_SLAB_HALF_PREAMBLE(4); 2153 HS_CMP_HALF(0, r1); 2154 HS_CMP_HALF(1, r2); 2155 HS_CMP_HALF(2, r3); 2156 HS_CMP_HALF(3, r4); 2157 HS_CMP_HALF(4, r5); 2158 HS_CMP_HALF(5, r6); 2159 HS_CMP_HALF(6, r7); 2160 HS_CMP_HALF(7, r8); 2161 } 2162 { 2163 HS_SLAB_HALF_PREAMBLE(2); 2164 HS_CMP_HALF(0, r1); 2165 HS_CMP_HALF(1, r2); 2166 HS_CMP_HALF(2, r3); 2167 HS_CMP_HALF(3, r4); 2168 HS_CMP_HALF(4, r5); 2169 HS_CMP_HALF(5, r6); 2170 HS_CMP_HALF(6, r7); 2171 HS_CMP_HALF(7, r8); 2172 } 2173 { 2174 HS_SLAB_HALF_PREAMBLE(1); 2175 HS_CMP_HALF(0, r1); 2176 HS_CMP_HALF(1, r2); 2177 HS_CMP_HALF(2, r3); 2178 HS_CMP_HALF(3, r4); 2179 HS_CMP_HALF(4, r5); 2180 HS_CMP_HALF(5, r6); 2181 HS_CMP_HALF(6, r7); 2182 HS_CMP_HALF(7, r8); 2183 } 2184 HS_CMP_XCHG(r1, r5); 2185 HS_CMP_XCHG(r3, r7); 2186 HS_CMP_XCHG(r1, r3); 2187 HS_CMP_XCHG(r5, r7); 2188 HS_CMP_XCHG(r2, r6); 2189 HS_CMP_XCHG(r4, r8); 2190 HS_CMP_XCHG(r2, r4); 2191 HS_CMP_XCHG(r6, r8); 2192 HS_CMP_XCHG(r1, r2); 2193 HS_CMP_XCHG(r3, r4); 2194 HS_CMP_XCHG(r5, r6); 2195 HS_CMP_XCHG(r7, r8); 2196 } 2197 HS_SLAB_GLOBAL_STORE(0, r1); 2198 HS_SLAB_GLOBAL_STORE(1, r2); 2199 HS_SLAB_GLOBAL_STORE(2, r3); 2200 HS_SLAB_GLOBAL_STORE(3, r4); 2201 HS_SLAB_GLOBAL_STORE(4, r5); 2202 HS_SLAB_GLOBAL_STORE(5, r6); 2203 HS_SLAB_GLOBAL_STORE(6, r7); 2204 HS_SLAB_GLOBAL_STORE(7, r8); 2205} 2206 2207HS_BC_KERNEL_PROTO(2, 1) 2208{ 2209 HS_BLOCK_LOCAL_MEM_DECL(32, 8); 2210 2211 HS_SLAB_GLOBAL_PREAMBLE(); 2212 HS_BC_MERGE_H_PREAMBLE(2); 2213 { 2214 { 2215 HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0); 2216 HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8); 2217 HS_CMP_XCHG(r0_1, r0_2); 2218 HS_SLAB_LOCAL_L(0) = r0_1; 2219 HS_SLAB_LOCAL_L(16) = r0_2; 2220 } 2221 { 2222 HS_KEY_TYPE 
// Continues HS_BC_KERNEL_PROTO(2, 1): the remaining three two-key scopes
// (loads 2/10, 4/12, 6/14 -> HS_SLAB_LOCAL_L 64/80, 128/144, 192/208),
// HS_BLOCK_BARRIER(), the r1..r8 declarations from
// HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * k), HALF_PREAMBLE(8), (4), (2) groups
// and the first three calls of the HALF_PREAMBLE(1) group. The kernel's last
// statements follow on the next physical line.
r0_1 = HS_BC_GLOBAL_LOAD_L(2); 2223 HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(10); 2224 HS_CMP_XCHG(r0_1, r0_2); 2225 HS_SLAB_LOCAL_L(64) = r0_1; 2226 HS_SLAB_LOCAL_L(80) = r0_2; 2227 } 2228 { 2229 HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(4); 2230 HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(12); 2231 HS_CMP_XCHG(r0_1, r0_2); 2232 HS_SLAB_LOCAL_L(128) = r0_1; 2233 HS_SLAB_LOCAL_L(144) = r0_2; 2234 } 2235 { 2236 HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(6); 2237 HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(14); 2238 HS_CMP_XCHG(r0_1, r0_2); 2239 HS_SLAB_LOCAL_L(192) = r0_1; 2240 HS_SLAB_LOCAL_L(208) = r0_2; 2241 } 2242 } 2243 HS_BLOCK_BARRIER(); 2244 HS_KEY_TYPE r1 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 0); 2245 HS_KEY_TYPE r2 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 1); 2246 HS_KEY_TYPE r3 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 2); 2247 HS_KEY_TYPE r4 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 3); 2248 HS_KEY_TYPE r5 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 4); 2249 HS_KEY_TYPE r6 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 5); 2250 HS_KEY_TYPE r7 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 6); 2251 HS_KEY_TYPE r8 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 7); 2252 { 2253 { 2254 HS_SLAB_HALF_PREAMBLE(8); 2255 HS_CMP_HALF(0, r1); 2256 HS_CMP_HALF(1, r2); 2257 HS_CMP_HALF(2, r3); 2258 HS_CMP_HALF(3, r4); 2259 HS_CMP_HALF(4, r5); 2260 HS_CMP_HALF(5, r6); 2261 HS_CMP_HALF(6, r7); 2262 HS_CMP_HALF(7, r8); 2263 } 2264 { 2265 HS_SLAB_HALF_PREAMBLE(4); 2266 HS_CMP_HALF(0, r1); 2267 HS_CMP_HALF(1, r2); 2268 HS_CMP_HALF(2, r3); 2269 HS_CMP_HALF(3, r4); 2270 HS_CMP_HALF(4, r5); 2271 HS_CMP_HALF(5, r6); 2272 HS_CMP_HALF(6, r7); 2273 HS_CMP_HALF(7, r8); 2274 } 2275 { 2276 HS_SLAB_HALF_PREAMBLE(2); 2277 HS_CMP_HALF(0, r1); 2278 HS_CMP_HALF(1, r2); 2279 HS_CMP_HALF(2, r3); 2280 HS_CMP_HALF(3, r4); 2281 HS_CMP_HALF(4, r5); 2282 HS_CMP_HALF(5, r6); 2283 HS_CMP_HALF(6, r7); 2284 HS_CMP_HALF(7, r8); 2285 } 2286 { 2287 HS_SLAB_HALF_PREAMBLE(1); 2288 HS_CMP_HALF(0, r1); 2289 HS_CMP_HALF(1, r2); 2290 HS_CMP_HALF(2, r3); 2291 
//
// HS_BC_KERNEL_PROTO(4, 2)
//
// The physical line below first finishes the preceding
// HS_BC_KERNEL_PROTO(2, 1) kernel (rest of its HALF_PREAMBLE(1) group,
// 12 HS_CMP_XCHG, HS_SLAB_GLOBAL_STORE(0..7), closing brace).
//
// HS_BC_KERNEL_PROTO(4, 2) then opens on the same line:
//   1. HS_BLOCK_LOCAL_MEM_DECL(64, 8), HS_SLAB_GLOBAL_PREAMBLE(),
//      HS_BC_MERGE_H_PREAMBLE(4).
//   2. Two four-key scopes. Each reads four keys with HS_BC_GLOBAL_LOAD_L
//      (0/8/16/24, then 4/12/20/28), applies four HS_CMP_XCHG steps
//      (1-3, 2-4, 1-2, 3-4) and stores them into HS_SLAB_LOCAL_L
//      (0/16/32/48, then 256/272/288/304). Both scopes reuse r0_1..r0_4.
//   3. HS_BLOCK_BARRIER(), then r1..r8 are declared from
//      HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * k), k = 0..7; only r1 and r2 fit on
//      this line and the r3 declaration is split across the line break.
//   4. On the following physical line: HALF_PREAMBLE(8), (4), (2), (1) groups,
//      12 HS_CMP_XCHG and HS_SLAB_GLOBAL_STORE(0..7).
//
// NOTE(review): the load/store offsets are generated constants; their relation
// to slab width/height is defined by hs_config.h -- confirm there.
// NOTE(review): the bare integers between statements (2319, 2320, ...) look
// like listing line numbers fused into the text -- confirm against upstream.
//
HS_CMP_HALF(3, r4); 2292 HS_CMP_HALF(4, r5); 2293 HS_CMP_HALF(5, r6); 2294 HS_CMP_HALF(6, r7); 2295 HS_CMP_HALF(7, r8); 2296 } 2297 HS_CMP_XCHG(r1, r5); 2298 HS_CMP_XCHG(r3, r7); 2299 HS_CMP_XCHG(r1, r3); 2300 HS_CMP_XCHG(r5, r7); 2301 HS_CMP_XCHG(r2, r6); 2302 HS_CMP_XCHG(r4, r8); 2303 HS_CMP_XCHG(r2, r4); 2304 HS_CMP_XCHG(r6, r8); 2305 HS_CMP_XCHG(r1, r2); 2306 HS_CMP_XCHG(r3, r4); 2307 HS_CMP_XCHG(r5, r6); 2308 HS_CMP_XCHG(r7, r8); 2309 } 2310 HS_SLAB_GLOBAL_STORE(0, r1); 2311 HS_SLAB_GLOBAL_STORE(1, r2); 2312 HS_SLAB_GLOBAL_STORE(2, r3); 2313 HS_SLAB_GLOBAL_STORE(3, r4); 2314 HS_SLAB_GLOBAL_STORE(4, r5); 2315 HS_SLAB_GLOBAL_STORE(5, r6); 2316 HS_SLAB_GLOBAL_STORE(6, r7); 2317 HS_SLAB_GLOBAL_STORE(7, r8); 2318} 2319 2320HS_BC_KERNEL_PROTO(4, 2) 2321{ 2322 HS_BLOCK_LOCAL_MEM_DECL(64, 8); 2323 2324 HS_SLAB_GLOBAL_PREAMBLE(); 2325 HS_BC_MERGE_H_PREAMBLE(4); 2326 { 2327 { 2328 HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0); 2329 HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8); 2330 HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(16); 2331 HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(24); 2332 HS_CMP_XCHG(r0_1, r0_3); 2333 HS_CMP_XCHG(r0_2, r0_4); 2334 HS_CMP_XCHG(r0_1, r0_2); 2335 HS_CMP_XCHG(r0_3, r0_4); 2336 HS_SLAB_LOCAL_L(0) = r0_1; 2337 HS_SLAB_LOCAL_L(16) = r0_2; 2338 HS_SLAB_LOCAL_L(32) = r0_3; 2339 HS_SLAB_LOCAL_L(48) = r0_4; 2340 } 2341 { 2342 HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(4); 2343 HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(12); 2344 HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(20); 2345 HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(28); 2346 HS_CMP_XCHG(r0_1, r0_3); 2347 HS_CMP_XCHG(r0_2, r0_4); 2348 HS_CMP_XCHG(r0_1, r0_2); 2349 HS_CMP_XCHG(r0_3, r0_4); 2350 HS_SLAB_LOCAL_L(256) = r0_1; 2351 HS_SLAB_LOCAL_L(272) = r0_2; 2352 HS_SLAB_LOCAL_L(288) = r0_3; 2353 HS_SLAB_LOCAL_L(304) = r0_4; 2354 } 2355 } 2356 HS_BLOCK_BARRIER(); 2357 HS_KEY_TYPE r1 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0); 2358 HS_KEY_TYPE r2 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1); 2359 HS_KEY_TYPE r3 = HS_BX_LOCAL_V(4 * 
//
// HS_BC_KERNEL_PROTO(8, 3)
//
// The physical line below first finishes the preceding
// HS_BC_KERNEL_PROTO(4, 2) kernel (its r3..r8 loads from HS_BX_LOCAL_V,
// HALF_PREAMBLE(8), (4), (2), (1) groups, 12 HS_CMP_XCHG,
// HS_SLAB_GLOBAL_STORE(0..7), closing brace).
//
// HS_BC_KERNEL_PROTO(8, 3) opens at the very end of that line with
// HS_BLOCK_LOCAL_MEM_DECL(128, 8). Visible flow of the kernel:
//   1. HS_SLAB_GLOBAL_PREAMBLE(), HS_BC_MERGE_H_PREAMBLE(8).
//   2. One eight-key scope: r0_1..r0_8 are read with
//      HS_BC_GLOBAL_LOAD_L(0, 8, ..., 56), 12 HS_CMP_XCHG steps are applied,
//      and the keys are stored into HS_SLAB_LOCAL_L(0, 16, ..., 112).
//   3. HS_BLOCK_BARRIER(), then r1..r8 are declared from
//      HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * k), k = 0..7.
//   4. HALF_PREAMBLE(8), (4), (2), (1) groups, 12 HS_CMP_XCHG and
//      HS_SLAB_GLOBAL_STORE(0..7).
//
// NOTE(review): the load/store offsets are generated constants; their relation
// to slab width/height is defined by hs_config.h -- confirm there.
// NOTE(review): the bare integers between statements (2432, 2433, ...) look
// like listing line numbers fused into the text -- confirm against upstream.
//
HS_SLAB_THREADS * 2); 2360 HS_KEY_TYPE r4 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3); 2361 HS_KEY_TYPE r5 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4); 2362 HS_KEY_TYPE r6 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5); 2363 HS_KEY_TYPE r7 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6); 2364 HS_KEY_TYPE r8 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7); 2365 { 2366 { 2367 HS_SLAB_HALF_PREAMBLE(8); 2368 HS_CMP_HALF(0, r1); 2369 HS_CMP_HALF(1, r2); 2370 HS_CMP_HALF(2, r3); 2371 HS_CMP_HALF(3, r4); 2372 HS_CMP_HALF(4, r5); 2373 HS_CMP_HALF(5, r6); 2374 HS_CMP_HALF(6, r7); 2375 HS_CMP_HALF(7, r8); 2376 } 2377 { 2378 HS_SLAB_HALF_PREAMBLE(4); 2379 HS_CMP_HALF(0, r1); 2380 HS_CMP_HALF(1, r2); 2381 HS_CMP_HALF(2, r3); 2382 HS_CMP_HALF(3, r4); 2383 HS_CMP_HALF(4, r5); 2384 HS_CMP_HALF(5, r6); 2385 HS_CMP_HALF(6, r7); 2386 HS_CMP_HALF(7, r8); 2387 } 2388 { 2389 HS_SLAB_HALF_PREAMBLE(2); 2390 HS_CMP_HALF(0, r1); 2391 HS_CMP_HALF(1, r2); 2392 HS_CMP_HALF(2, r3); 2393 HS_CMP_HALF(3, r4); 2394 HS_CMP_HALF(4, r5); 2395 HS_CMP_HALF(5, r6); 2396 HS_CMP_HALF(6, r7); 2397 HS_CMP_HALF(7, r8); 2398 } 2399 { 2400 HS_SLAB_HALF_PREAMBLE(1); 2401 HS_CMP_HALF(0, r1); 2402 HS_CMP_HALF(1, r2); 2403 HS_CMP_HALF(2, r3); 2404 HS_CMP_HALF(3, r4); 2405 HS_CMP_HALF(4, r5); 2406 HS_CMP_HALF(5, r6); 2407 HS_CMP_HALF(6, r7); 2408 HS_CMP_HALF(7, r8); 2409 } 2410 HS_CMP_XCHG(r1, r5); 2411 HS_CMP_XCHG(r3, r7); 2412 HS_CMP_XCHG(r1, r3); 2413 HS_CMP_XCHG(r5, r7); 2414 HS_CMP_XCHG(r2, r6); 2415 HS_CMP_XCHG(r4, r8); 2416 HS_CMP_XCHG(r2, r4); 2417 HS_CMP_XCHG(r6, r8); 2418 HS_CMP_XCHG(r1, r2); 2419 HS_CMP_XCHG(r3, r4); 2420 HS_CMP_XCHG(r5, r6); 2421 HS_CMP_XCHG(r7, r8); 2422 } 2423 HS_SLAB_GLOBAL_STORE(0, r1); 2424 HS_SLAB_GLOBAL_STORE(1, r2); 2425 HS_SLAB_GLOBAL_STORE(2, r3); 2426 HS_SLAB_GLOBAL_STORE(3, r4); 2427 HS_SLAB_GLOBAL_STORE(4, r5); 2428 HS_SLAB_GLOBAL_STORE(5, r6); 2429 HS_SLAB_GLOBAL_STORE(6, r7); 2430 HS_SLAB_GLOBAL_STORE(7, r8); 2431} 2432 2433HS_BC_KERNEL_PROTO(8, 3) 2434{ 2435 HS_BLOCK_LOCAL_MEM_DECL(128, 8); 2436 
// Continues HS_BC_KERNEL_PROTO(8, 3): preambles, the eight-key scope (loads,
// 12 HS_CMP_XCHG, stores to HS_SLAB_LOCAL_L), HS_BLOCK_BARRIER(), the r1..r8
// declarations from HS_BX_LOCAL_V, the HALF_PREAMBLE(8) group and the first
// three calls of the HALF_PREAMBLE(4) group.
2437 HS_SLAB_GLOBAL_PREAMBLE(); 2438 HS_BC_MERGE_H_PREAMBLE(8); 2439 { 2440 { 2441 HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0); 2442 HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8); 2443 HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(16); 2444 HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(24); 2445 HS_KEY_TYPE r0_5 = HS_BC_GLOBAL_LOAD_L(32); 2446 HS_KEY_TYPE r0_6 = HS_BC_GLOBAL_LOAD_L(40); 2447 HS_KEY_TYPE r0_7 = HS_BC_GLOBAL_LOAD_L(48); 2448 HS_KEY_TYPE r0_8 = HS_BC_GLOBAL_LOAD_L(56); 2449 HS_CMP_XCHG(r0_1, r0_5); 2450 HS_CMP_XCHG(r0_3, r0_7); 2451 HS_CMP_XCHG(r0_1, r0_3); 2452 HS_CMP_XCHG(r0_5, r0_7); 2453 HS_CMP_XCHG(r0_2, r0_6); 2454 HS_CMP_XCHG(r0_4, r0_8); 2455 HS_CMP_XCHG(r0_2, r0_4); 2456 HS_CMP_XCHG(r0_6, r0_8); 2457 HS_CMP_XCHG(r0_1, r0_2); 2458 HS_CMP_XCHG(r0_3, r0_4); 2459 HS_CMP_XCHG(r0_5, r0_6); 2460 HS_CMP_XCHG(r0_7, r0_8); 2461 HS_SLAB_LOCAL_L(0) = r0_1; 2462 HS_SLAB_LOCAL_L(16) = r0_2; 2463 HS_SLAB_LOCAL_L(32) = r0_3; 2464 HS_SLAB_LOCAL_L(48) = r0_4; 2465 HS_SLAB_LOCAL_L(64) = r0_5; 2466 HS_SLAB_LOCAL_L(80) = r0_6; 2467 HS_SLAB_LOCAL_L(96) = r0_7; 2468 HS_SLAB_LOCAL_L(112) = r0_8; 2469 } 2470 } 2471 HS_BLOCK_BARRIER(); 2472 HS_KEY_TYPE r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0); 2473 HS_KEY_TYPE r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1); 2474 HS_KEY_TYPE r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2); 2475 HS_KEY_TYPE r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3); 2476 HS_KEY_TYPE r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4); 2477 HS_KEY_TYPE r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5); 2478 HS_KEY_TYPE r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6); 2479 HS_KEY_TYPE r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7); 2480 { 2481 { 2482 HS_SLAB_HALF_PREAMBLE(8); 2483 HS_CMP_HALF(0, r1); 2484 HS_CMP_HALF(1, r2); 2485 HS_CMP_HALF(2, r3); 2486 HS_CMP_HALF(3, r4); 2487 HS_CMP_HALF(4, r5); 2488 HS_CMP_HALF(5, r6); 2489 HS_CMP_HALF(6, r7); 2490 HS_CMP_HALF(7, r8); 2491 } 2492 { 2493 HS_SLAB_HALF_PREAMBLE(4); 2494 HS_CMP_HALF(0, r1); 2495 HS_CMP_HALF(1, r2); 2496 HS_CMP_HALF(2, r3); 2497 
// Finishes HS_BC_KERNEL_PROTO(8, 3): rest of the HALF_PREAMBLE(4) group,
// HALF_PREAMBLE(2) and (1) groups, 12 HS_CMP_XCHG, HS_SLAB_GLOBAL_STORE(0..7)
// and the closing brace.
// The same physical line then opens HS_BC_KERNEL_PROTO(16, 4)
// (HS_BLOCK_LOCAL_MEM_DECL(256, 8), HS_SLAB_GLOBAL_PREAMBLE(),
// HS_BC_MERGE_H_PREAMBLE(16), a region guarded by HS_SUBGROUP_ID() < 8 that
// starts loading r0_1..r0_12 with HS_BC_GLOBAL_LOAD_L); that kernel continues
// beyond this line.
HS_CMP_HALF(3, r4); 2498 HS_CMP_HALF(4, r5); 2499 HS_CMP_HALF(5, r6); 2500 HS_CMP_HALF(6, r7); 2501 HS_CMP_HALF(7, r8); 2502 } 2503 { 2504 HS_SLAB_HALF_PREAMBLE(2); 2505 HS_CMP_HALF(0, r1); 2506 HS_CMP_HALF(1, r2); 2507 HS_CMP_HALF(2, r3); 2508 HS_CMP_HALF(3, r4); 2509 HS_CMP_HALF(4, r5); 2510 HS_CMP_HALF(5, r6); 2511 HS_CMP_HALF(6, r7); 2512 HS_CMP_HALF(7, r8); 2513 } 2514 { 2515 HS_SLAB_HALF_PREAMBLE(1); 2516 HS_CMP_HALF(0, r1); 2517 HS_CMP_HALF(1, r2); 2518 HS_CMP_HALF(2, r3); 2519 HS_CMP_HALF(3, r4); 2520 HS_CMP_HALF(4, r5); 2521 HS_CMP_HALF(5, r6); 2522 HS_CMP_HALF(6, r7); 2523 HS_CMP_HALF(7, r8); 2524 } 2525 HS_CMP_XCHG(r1, r5); 2526 HS_CMP_XCHG(r3, r7); 2527 HS_CMP_XCHG(r1, r3); 2528 HS_CMP_XCHG(r5, r7); 2529 HS_CMP_XCHG(r2, r6); 2530 HS_CMP_XCHG(r4, r8); 2531 HS_CMP_XCHG(r2, r4); 2532 HS_CMP_XCHG(r6, r8); 2533 HS_CMP_XCHG(r1, r2); 2534 HS_CMP_XCHG(r3, r4); 2535 HS_CMP_XCHG(r5, r6); 2536 HS_CMP_XCHG(r7, r8); 2537 } 2538 HS_SLAB_GLOBAL_STORE(0, r1); 2539 HS_SLAB_GLOBAL_STORE(1, r2); 2540 HS_SLAB_GLOBAL_STORE(2, r3); 2541 HS_SLAB_GLOBAL_STORE(3, r4); 2542 HS_SLAB_GLOBAL_STORE(4, r5); 2543 HS_SLAB_GLOBAL_STORE(5, r6); 2544 HS_SLAB_GLOBAL_STORE(6, r7); 2545 HS_SLAB_GLOBAL_STORE(7, r8); 2546} 2547 2548HS_BC_KERNEL_PROTO(16, 4) 2549{ 2550 HS_BLOCK_LOCAL_MEM_DECL(256, 8); 2551 2552 HS_SLAB_GLOBAL_PREAMBLE(); 2553 HS_BC_MERGE_H_PREAMBLE(16); 2554 if (HS_SUBGROUP_ID() < 8) { 2555 { 2556 HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0); 2557 HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(8); 2558 HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(16); 2559 HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(24); 2560 HS_KEY_TYPE r0_5 = HS_BC_GLOBAL_LOAD_L(32); 2561 HS_KEY_TYPE r0_6 = HS_BC_GLOBAL_LOAD_L(40); 2562 HS_KEY_TYPE r0_7 = HS_BC_GLOBAL_LOAD_L(48); 2563 HS_KEY_TYPE r0_8 = HS_BC_GLOBAL_LOAD_L(56); 2564 HS_KEY_TYPE r0_9 = HS_BC_GLOBAL_LOAD_L(64); 2565 HS_KEY_TYPE r0_10 = HS_BC_GLOBAL_LOAD_L(72); 2566 HS_KEY_TYPE r0_11 = HS_BC_GLOBAL_LOAD_L(80); 2567 HS_KEY_TYPE r0_12 = HS_BC_GLOBAL_LOAD_L(88); 2568 
HS_KEY_TYPE r0_13 = HS_BC_GLOBAL_LOAD_L(96); 2569 HS_KEY_TYPE r0_14 = HS_BC_GLOBAL_LOAD_L(104); 2570 HS_KEY_TYPE r0_15 = HS_BC_GLOBAL_LOAD_L(112); 2571 HS_KEY_TYPE r0_16 = HS_BC_GLOBAL_LOAD_L(120); 2572 HS_CMP_XCHG(r0_1, r0_9); 2573 HS_CMP_XCHG(r0_5, r0_13); 2574 HS_CMP_XCHG(r0_1, r0_5); 2575 HS_CMP_XCHG(r0_9, r0_13); 2576 HS_CMP_XCHG(r0_3, r0_11); 2577 HS_CMP_XCHG(r0_7, r0_15); 2578 HS_CMP_XCHG(r0_3, r0_7); 2579 HS_CMP_XCHG(r0_11, r0_15); 2580 HS_CMP_XCHG(r0_1, r0_3); 2581 HS_CMP_XCHG(r0_5, r0_7); 2582 HS_CMP_XCHG(r0_9, r0_11); 2583 HS_CMP_XCHG(r0_13, r0_15); 2584 HS_CMP_XCHG(r0_2, r0_10); 2585 HS_CMP_XCHG(r0_6, r0_14); 2586 HS_CMP_XCHG(r0_2, r0_6); 2587 HS_CMP_XCHG(r0_10, r0_14); 2588 HS_CMP_XCHG(r0_4, r0_12); 2589 HS_CMP_XCHG(r0_8, r0_16); 2590 HS_CMP_XCHG(r0_4, r0_8); 2591 HS_CMP_XCHG(r0_12, r0_16); 2592 HS_CMP_XCHG(r0_2, r0_4); 2593 HS_CMP_XCHG(r0_6, r0_8); 2594 HS_CMP_XCHG(r0_10, r0_12); 2595 HS_CMP_XCHG(r0_14, r0_16); 2596 HS_CMP_XCHG(r0_1, r0_2); 2597 HS_CMP_XCHG(r0_3, r0_4); 2598 HS_CMP_XCHG(r0_5, r0_6); 2599 HS_CMP_XCHG(r0_7, r0_8); 2600 HS_CMP_XCHG(r0_9, r0_10); 2601 HS_CMP_XCHG(r0_11, r0_12); 2602 HS_CMP_XCHG(r0_13, r0_14); 2603 HS_CMP_XCHG(r0_15, r0_16); 2604 HS_SLAB_LOCAL_L(0) = r0_1; 2605 HS_SLAB_LOCAL_L(16) = r0_2; 2606 HS_SLAB_LOCAL_L(32) = r0_3; 2607 HS_SLAB_LOCAL_L(48) = r0_4; 2608 HS_SLAB_LOCAL_L(64) = r0_5; 2609 HS_SLAB_LOCAL_L(80) = r0_6; 2610 HS_SLAB_LOCAL_L(96) = r0_7; 2611 HS_SLAB_LOCAL_L(112) = r0_8; 2612 HS_SLAB_LOCAL_L(128) = r0_9; 2613 HS_SLAB_LOCAL_L(144) = r0_10; 2614 HS_SLAB_LOCAL_L(160) = r0_11; 2615 HS_SLAB_LOCAL_L(176) = r0_12; 2616 HS_SLAB_LOCAL_L(192) = r0_13; 2617 HS_SLAB_LOCAL_L(208) = r0_14; 2618 HS_SLAB_LOCAL_L(224) = r0_15; 2619 HS_SLAB_LOCAL_L(240) = r0_16; 2620 } 2621 } 2622 HS_BLOCK_BARRIER(); 2623 HS_KEY_TYPE r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0); 2624 HS_KEY_TYPE r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1); 2625 HS_KEY_TYPE r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2); 2626 HS_KEY_TYPE r4 = HS_BX_LOCAL_V(16 * 
HS_SLAB_THREADS * 3); 2627 HS_KEY_TYPE r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4); 2628 HS_KEY_TYPE r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5); 2629 HS_KEY_TYPE r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6); 2630 HS_KEY_TYPE r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7); 2631 { 2632 { 2633 HS_SLAB_HALF_PREAMBLE(8); 2634 HS_CMP_HALF(0, r1); 2635 HS_CMP_HALF(1, r2); 2636 HS_CMP_HALF(2, r3); 2637 HS_CMP_HALF(3, r4); 2638 HS_CMP_HALF(4, r5); 2639 HS_CMP_HALF(5, r6); 2640 HS_CMP_HALF(6, r7); 2641 HS_CMP_HALF(7, r8); 2642 } 2643 { 2644 HS_SLAB_HALF_PREAMBLE(4); 2645 HS_CMP_HALF(0, r1); 2646 HS_CMP_HALF(1, r2); 2647 HS_CMP_HALF(2, r3); 2648 HS_CMP_HALF(3, r4); 2649 HS_CMP_HALF(4, r5); 2650 HS_CMP_HALF(5, r6); 2651 HS_CMP_HALF(6, r7); 2652 HS_CMP_HALF(7, r8); 2653 } 2654 { 2655 HS_SLAB_HALF_PREAMBLE(2); 2656 HS_CMP_HALF(0, r1); 2657 HS_CMP_HALF(1, r2); 2658 HS_CMP_HALF(2, r3); 2659 HS_CMP_HALF(3, r4); 2660 HS_CMP_HALF(4, r5); 2661 HS_CMP_HALF(5, r6); 2662 HS_CMP_HALF(6, r7); 2663 HS_CMP_HALF(7, r8); 2664 } 2665 { 2666 HS_SLAB_HALF_PREAMBLE(1); 2667 HS_CMP_HALF(0, r1); 2668 HS_CMP_HALF(1, r2); 2669 HS_CMP_HALF(2, r3); 2670 HS_CMP_HALF(3, r4); 2671 HS_CMP_HALF(4, r5); 2672 HS_CMP_HALF(5, r6); 2673 HS_CMP_HALF(6, r7); 2674 HS_CMP_HALF(7, r8); 2675 } 2676 HS_CMP_XCHG(r1, r5); 2677 HS_CMP_XCHG(r3, r7); 2678 HS_CMP_XCHG(r1, r3); 2679 HS_CMP_XCHG(r5, r7); 2680 HS_CMP_XCHG(r2, r6); 2681 HS_CMP_XCHG(r4, r8); 2682 HS_CMP_XCHG(r2, r4); 2683 HS_CMP_XCHG(r6, r8); 2684 HS_CMP_XCHG(r1, r2); 2685 HS_CMP_XCHG(r3, r4); 2686 HS_CMP_XCHG(r5, r6); 2687 HS_CMP_XCHG(r7, r8); 2688 } 2689 HS_SLAB_GLOBAL_STORE(0, r1); 2690 HS_SLAB_GLOBAL_STORE(1, r2); 2691 HS_SLAB_GLOBAL_STORE(2, r3); 2692 HS_SLAB_GLOBAL_STORE(3, r4); 2693 HS_SLAB_GLOBAL_STORE(4, r5); 2694 HS_SLAB_GLOBAL_STORE(5, r6); 2695 HS_SLAB_GLOBAL_STORE(6, r7); 2696 HS_SLAB_GLOBAL_STORE(7, r8); 2697} 2698 2699HS_FM_KERNEL_PROTO(0, 0) 2700{ 2701 HS_FM_PREAMBLE(8); 2702 HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0); 2703 HS_KEY_TYPE r2 = 
HS_XM_GLOBAL_LOAD_L(1); 2704 HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2); 2705 HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3); 2706 HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4); 2707 HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5); 2708 HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6); 2709 HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7); 2710 HS_KEY_TYPE r9 = HS_FM_GLOBAL_LOAD_R(0); 2711 HS_CMP_XCHG(r8, r9); 2712 HS_CMP_XCHG(r1, r5); 2713 HS_CMP_XCHG(r3, r7); 2714 HS_CMP_XCHG(r1, r3); 2715 HS_CMP_XCHG(r5, r7); 2716 HS_CMP_XCHG(r2, r6); 2717 HS_CMP_XCHG(r4, r8); 2718 HS_CMP_XCHG(r2, r4); 2719 HS_CMP_XCHG(r6, r8); 2720 HS_CMP_XCHG(r1, r2); 2721 HS_CMP_XCHG(r3, r4); 2722 HS_CMP_XCHG(r5, r6); 2723 HS_CMP_XCHG(r7, r8); 2724 HS_XM_GLOBAL_STORE_L(0, r1); 2725 HS_XM_GLOBAL_STORE_L(1, r2); 2726 HS_XM_GLOBAL_STORE_L(2, r3); 2727 HS_XM_GLOBAL_STORE_L(3, r4); 2728 HS_XM_GLOBAL_STORE_L(4, r5); 2729 HS_XM_GLOBAL_STORE_L(5, r6); 2730 HS_XM_GLOBAL_STORE_L(6, r7); 2731 HS_XM_GLOBAL_STORE_L(7, r8); 2732 HS_FM_GLOBAL_STORE_R(0, r9); 2733} 2734 2735HS_FM_KERNEL_PROTO(0, 1) 2736{ 2737 HS_FM_PREAMBLE(8); 2738 HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0); 2739 HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1); 2740 HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2); 2741 HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3); 2742 HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4); 2743 HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5); 2744 HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6); 2745 HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7); 2746 HS_KEY_TYPE r9 = HS_FM_GLOBAL_LOAD_R(0); 2747 HS_KEY_TYPE r10 = HS_FM_GLOBAL_LOAD_R(1); 2748 HS_CMP_XCHG(r8, r9); 2749 HS_CMP_XCHG(r7, r10); 2750 HS_CMP_XCHG(r1, r5); 2751 HS_CMP_XCHG(r3, r7); 2752 HS_CMP_XCHG(r1, r3); 2753 HS_CMP_XCHG(r5, r7); 2754 HS_CMP_XCHG(r2, r6); 2755 HS_CMP_XCHG(r4, r8); 2756 HS_CMP_XCHG(r2, r4); 2757 HS_CMP_XCHG(r6, r8); 2758 HS_CMP_XCHG(r1, r2); 2759 HS_CMP_XCHG(r3, r4); 2760 HS_CMP_XCHG(r5, r6); 2761 HS_CMP_XCHG(r7, r8); 2762 HS_CMP_XCHG(r9, r10); 2763 HS_XM_GLOBAL_STORE_L(0, r1); 2764 HS_XM_GLOBAL_STORE_L(1, r2); 2765 
HS_XM_GLOBAL_STORE_L(2, r3); 2766 HS_XM_GLOBAL_STORE_L(3, r4); 2767 HS_XM_GLOBAL_STORE_L(4, r5); 2768 HS_XM_GLOBAL_STORE_L(5, r6); 2769 HS_XM_GLOBAL_STORE_L(6, r7); 2770 HS_XM_GLOBAL_STORE_L(7, r8); 2771 HS_FM_GLOBAL_STORE_R(0, r9); 2772 HS_FM_GLOBAL_STORE_R(1, r10); 2773} 2774 2775HS_FM_KERNEL_PROTO(0, 2) 2776{ 2777 HS_FM_PREAMBLE(8); 2778 HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0); 2779 HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1); 2780 HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2); 2781 HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3); 2782 HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4); 2783 HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5); 2784 HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6); 2785 HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7); 2786 HS_KEY_TYPE r9 = HS_FM_GLOBAL_LOAD_R(0); 2787 HS_KEY_TYPE r10 = HS_FM_GLOBAL_LOAD_R(1); 2788 HS_KEY_TYPE r11 = HS_FM_GLOBAL_LOAD_R(2); 2789 HS_KEY_TYPE r12 = HS_FM_GLOBAL_LOAD_R(3); 2790 HS_CMP_XCHG(r8, r9); 2791 HS_CMP_XCHG(r7, r10); 2792 HS_CMP_XCHG(r6, r11); 2793 HS_CMP_XCHG(r5, r12); 2794 HS_CMP_XCHG(r1, r5); 2795 HS_CMP_XCHG(r3, r7); 2796 HS_CMP_XCHG(r1, r3); 2797 HS_CMP_XCHG(r5, r7); 2798 HS_CMP_XCHG(r2, r6); 2799 HS_CMP_XCHG(r4, r8); 2800 HS_CMP_XCHG(r2, r4); 2801 HS_CMP_XCHG(r6, r8); 2802 HS_CMP_XCHG(r1, r2); 2803 HS_CMP_XCHG(r3, r4); 2804 HS_CMP_XCHG(r5, r6); 2805 HS_CMP_XCHG(r7, r8); 2806 HS_CMP_XCHG(r9, r11); 2807 HS_CMP_XCHG(r10, r12); 2808 HS_CMP_XCHG(r9, r10); 2809 HS_CMP_XCHG(r11, r12); 2810 HS_XM_GLOBAL_STORE_L(0, r1); 2811 HS_XM_GLOBAL_STORE_L(1, r2); 2812 HS_XM_GLOBAL_STORE_L(2, r3); 2813 HS_XM_GLOBAL_STORE_L(3, r4); 2814 HS_XM_GLOBAL_STORE_L(4, r5); 2815 HS_XM_GLOBAL_STORE_L(5, r6); 2816 HS_XM_GLOBAL_STORE_L(6, r7); 2817 HS_XM_GLOBAL_STORE_L(7, r8); 2818 HS_FM_GLOBAL_STORE_R(0, r9); 2819 HS_FM_GLOBAL_STORE_R(1, r10); 2820 HS_FM_GLOBAL_STORE_R(2, r11); 2821 HS_FM_GLOBAL_STORE_R(3, r12); 2822} 2823 2824HS_FM_KERNEL_PROTO(0, 3) 2825{ 2826 HS_FM_PREAMBLE(8); 2827 HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0); 2828 HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1); 2829 
HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2); 2830 HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3); 2831 HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4); 2832 HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5); 2833 HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6); 2834 HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7); 2835 HS_KEY_TYPE r9 = HS_FM_GLOBAL_LOAD_R(0); 2836 HS_KEY_TYPE r10 = HS_FM_GLOBAL_LOAD_R(1); 2837 HS_KEY_TYPE r11 = HS_FM_GLOBAL_LOAD_R(2); 2838 HS_KEY_TYPE r12 = HS_FM_GLOBAL_LOAD_R(3); 2839 HS_KEY_TYPE r13 = HS_FM_GLOBAL_LOAD_R(4); 2840 HS_KEY_TYPE r14 = HS_FM_GLOBAL_LOAD_R(5); 2841 HS_KEY_TYPE r15 = HS_FM_GLOBAL_LOAD_R(6); 2842 HS_KEY_TYPE r16 = HS_FM_GLOBAL_LOAD_R(7); 2843 HS_CMP_XCHG(r8, r9); 2844 HS_CMP_XCHG(r7, r10); 2845 HS_CMP_XCHG(r6, r11); 2846 HS_CMP_XCHG(r5, r12); 2847 HS_CMP_XCHG(r4, r13); 2848 HS_CMP_XCHG(r3, r14); 2849 HS_CMP_XCHG(r2, r15); 2850 HS_CMP_XCHG(r1, r16); 2851 HS_CMP_XCHG(r1, r5); 2852 HS_CMP_XCHG(r3, r7); 2853 HS_CMP_XCHG(r1, r3); 2854 HS_CMP_XCHG(r5, r7); 2855 HS_CMP_XCHG(r2, r6); 2856 HS_CMP_XCHG(r4, r8); 2857 HS_CMP_XCHG(r2, r4); 2858 HS_CMP_XCHG(r6, r8); 2859 HS_CMP_XCHG(r1, r2); 2860 HS_CMP_XCHG(r3, r4); 2861 HS_CMP_XCHG(r5, r6); 2862 HS_CMP_XCHG(r7, r8); 2863 HS_CMP_XCHG(r9, r13); 2864 HS_CMP_XCHG(r11, r15); 2865 HS_CMP_XCHG(r9, r11); 2866 HS_CMP_XCHG(r13, r15); 2867 HS_CMP_XCHG(r10, r14); 2868 HS_CMP_XCHG(r12, r16); 2869 HS_CMP_XCHG(r10, r12); 2870 HS_CMP_XCHG(r14, r16); 2871 HS_CMP_XCHG(r9, r10); 2872 HS_CMP_XCHG(r11, r12); 2873 HS_CMP_XCHG(r13, r14); 2874 HS_CMP_XCHG(r15, r16); 2875 HS_XM_GLOBAL_STORE_L(0, r1); 2876 HS_XM_GLOBAL_STORE_L(1, r2); 2877 HS_XM_GLOBAL_STORE_L(2, r3); 2878 HS_XM_GLOBAL_STORE_L(3, r4); 2879 HS_XM_GLOBAL_STORE_L(4, r5); 2880 HS_XM_GLOBAL_STORE_L(5, r6); 2881 HS_XM_GLOBAL_STORE_L(6, r7); 2882 HS_XM_GLOBAL_STORE_L(7, r8); 2883 HS_FM_GLOBAL_STORE_R(0, r9); 2884 HS_FM_GLOBAL_STORE_R(1, r10); 2885 HS_FM_GLOBAL_STORE_R(2, r11); 2886 HS_FM_GLOBAL_STORE_R(3, r12); 2887 HS_FM_GLOBAL_STORE_R(4, r13); 2888 HS_FM_GLOBAL_STORE_R(5, r14); 2889 
HS_FM_GLOBAL_STORE_R(6, r15); 2890 HS_FM_GLOBAL_STORE_R(7, r16); 2891} 2892 2893HS_HM_KERNEL_PROTO(0) 2894{ 2895 HS_HM_PREAMBLE(8); 2896 HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0); 2897 HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1); 2898 HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2); 2899 HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3); 2900 HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4); 2901 HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5); 2902 HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6); 2903 HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7); 2904 HS_KEY_TYPE r9 = HS_XM_GLOBAL_LOAD_L(8); 2905 HS_KEY_TYPE r10 = HS_XM_GLOBAL_LOAD_L(9); 2906 HS_KEY_TYPE r11 = HS_XM_GLOBAL_LOAD_L(10); 2907 HS_KEY_TYPE r12 = HS_XM_GLOBAL_LOAD_L(11); 2908 HS_KEY_TYPE r13 = HS_XM_GLOBAL_LOAD_L(12); 2909 HS_KEY_TYPE r14 = HS_XM_GLOBAL_LOAD_L(13); 2910 HS_KEY_TYPE r15 = HS_XM_GLOBAL_LOAD_L(14); 2911 HS_KEY_TYPE r16 = HS_XM_GLOBAL_LOAD_L(15); 2912 HS_CMP_XCHG(r1, r9); 2913 HS_CMP_XCHG(r5, r13); 2914 HS_CMP_XCHG(r1, r5); 2915 HS_CMP_XCHG(r9, r13); 2916 HS_CMP_XCHG(r3, r11); 2917 HS_CMP_XCHG(r7, r15); 2918 HS_CMP_XCHG(r3, r7); 2919 HS_CMP_XCHG(r11, r15); 2920 HS_CMP_XCHG(r1, r3); 2921 HS_CMP_XCHG(r5, r7); 2922 HS_CMP_XCHG(r9, r11); 2923 HS_CMP_XCHG(r13, r15); 2924 HS_CMP_XCHG(r2, r10); 2925 HS_CMP_XCHG(r6, r14); 2926 HS_CMP_XCHG(r2, r6); 2927 HS_CMP_XCHG(r10, r14); 2928 HS_CMP_XCHG(r4, r12); 2929 HS_CMP_XCHG(r8, r16); 2930 HS_CMP_XCHG(r4, r8); 2931 HS_CMP_XCHG(r12, r16); 2932 HS_CMP_XCHG(r2, r4); 2933 HS_CMP_XCHG(r6, r8); 2934 HS_CMP_XCHG(r10, r12); 2935 HS_CMP_XCHG(r14, r16); 2936 HS_CMP_XCHG(r1, r2); 2937 HS_CMP_XCHG(r3, r4); 2938 HS_CMP_XCHG(r5, r6); 2939 HS_CMP_XCHG(r7, r8); 2940 HS_CMP_XCHG(r9, r10); 2941 HS_CMP_XCHG(r11, r12); 2942 HS_CMP_XCHG(r13, r14); 2943 HS_CMP_XCHG(r15, r16); 2944 HS_XM_GLOBAL_STORE_L(0, r1); 2945 HS_XM_GLOBAL_STORE_L(1, r2); 2946 HS_XM_GLOBAL_STORE_L(2, r3); 2947 HS_XM_GLOBAL_STORE_L(3, r4); 2948 HS_XM_GLOBAL_STORE_L(4, r5); 2949 HS_XM_GLOBAL_STORE_L(5, r6); 2950 HS_XM_GLOBAL_STORE_L(6, r7); 2951 
HS_XM_GLOBAL_STORE_L(7, r8); 2952 HS_XM_GLOBAL_STORE_L(8, r9); 2953 HS_XM_GLOBAL_STORE_L(9, r10); 2954 HS_XM_GLOBAL_STORE_L(10, r11); 2955 HS_XM_GLOBAL_STORE_L(11, r12); 2956 HS_XM_GLOBAL_STORE_L(12, r13); 2957 HS_XM_GLOBAL_STORE_L(13, r14); 2958 HS_XM_GLOBAL_STORE_L(14, r15); 2959 HS_XM_GLOBAL_STORE_L(15, r16); 2960} 2961 2962HS_TRANSPOSE_KERNEL_PROTO() 2963{ 2964 HS_SLAB_GLOBAL_PREAMBLE(); 2965 HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vout, 0); 2966 HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vout, 1); 2967 HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vout, 2); 2968 HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vout, 3); 2969 HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vout, 4); 2970 HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vout, 5); 2971 HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vout, 6); 2972 HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vout, 7); 2973 HS_TRANSPOSE_SLAB() 2974} 2975 2976// 2977// 2978// 2979