/*
   BLAKE2 reference source code package - optimized C implementations

   Copyright 2012, Samuel Neves <sneves@dei.uc.pt>.  You may use this under the
   terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
   your option.  The terms of these licenses can be found at:

   - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
   - OpenSSL license   : https://www.openssl.org/source/license.html
   - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0

   More information about the BLAKE2 hash function can be found at
   https://blake2.net.
*/
#pragma once
#ifndef __BLAKE2B_LOAD_SSE41_H__
#define __BLAKE2B_LOAD_SSE41_H__

/*
 * Message-load macros.
 *
 * Every LOAD_MSG_<r>_<k>(b0, b1) macro (r = 0..11, k = 1..4) assigns two
 * 128-bit values to its arguments b0 and b1.  The values are assembled from
 * the identifiers m0..m7, which are NOT macro parameters: they must be
 * visible at the point where the macro is expanded.
 *
 * This header includes no intrinsic headers itself; the including file has
 * to make the intrinsics used below available before expanding the macros.
 *
 * 64-bit lane behaviour of the intrinsics used here, written {low, high}:
 *
 *   _mm_unpacklo_epi64(a, b)                     -> {a.lo, b.lo}
 *   _mm_unpackhi_epi64(a, b)                     -> {a.hi, b.hi}
 *   _mm_alignr_epi8(a, b, 8)                     -> {b.hi, a.lo}
 *   _mm_blend_epi16(a, b, 0xF0)                  -> {a.lo, b.hi}
 *   _mm_shuffle_epi32(a, _MM_SHUFFLE(1,0,3,2))   -> {a.hi, a.lo}
 *
 * The "w:" note above each macro lists the 64-bit word indices that end up
 * in b0 and b1, ASSUMING m<i> holds words {w[2i], w[2i+1]}.
 * NOTE(review): that layout is established by the including file, not here
 * -- confirm against the code that defines m0..m7.  <r> is presumably the
 * round number and <k> the step within the round; verify against the caller.
 */

/* ---- LOAD_MSG_0_* ---------------------------------------------------- */

/* w: b0 = {0, 2}, b1 = {4, 6} */
#define LOAD_MSG_0_1(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m0, m1); \
        b1 = _mm_unpacklo_epi64(m2, m3); \
    } while (0)

/* w: b0 = {1, 3}, b1 = {5, 7} */
#define LOAD_MSG_0_2(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m0, m1); \
        b1 = _mm_unpackhi_epi64(m2, m3); \
    } while (0)

/* w: b0 = {8, 10}, b1 = {12, 14} */
#define LOAD_MSG_0_3(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m4, m5); \
        b1 = _mm_unpacklo_epi64(m6, m7); \
    } while (0)

/* w: b0 = {9, 11}, b1 = {13, 15} */
#define LOAD_MSG_0_4(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m4, m5); \
        b1 = _mm_unpackhi_epi64(m6, m7); \
    } while (0)

/* ---- LOAD_MSG_1_* ---------------------------------------------------- */

/* w: b0 = {14, 4}, b1 = {9, 13} */
#define LOAD_MSG_1_1(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m7, m2); \
        b1 = _mm_unpackhi_epi64(m4, m6); \
    } while (0)

/* w: b0 = {10, 8}, b1 = {15, 6} */
#define LOAD_MSG_1_2(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m5, m4); \
        b1 = _mm_alignr_epi8(m3, m7, 8); \
    } while (0)

/* w: b0 = {1, 0}, b1 = {11, 5} */
#define LOAD_MSG_1_3(b0, b1) \
    do { \
        b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
        b1 = _mm_unpackhi_epi64(m5, m2); \
    } while (0)

/* w: b0 = {12, 2}, b1 = {7, 3} */
#define LOAD_MSG_1_4(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m6, m1); \
        b1 = _mm_unpackhi_epi64(m3, m1); \
    } while (0)

/* ---- LOAD_MSG_2_* ---------------------------------------------------- */

/* w: b0 = {11, 12}, b1 = {5, 15} */
#define LOAD_MSG_2_1(b0, b1) \
    do { \
        b0 = _mm_alignr_epi8(m6, m5, 8); \
        b1 = _mm_unpackhi_epi64(m2, m7); \
    } while (0)

/* w: b0 = {8, 0}, b1 = {2, 13} */
#define LOAD_MSG_2_2(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m4, m0); \
        b1 = _mm_blend_epi16(m1, m6, 0xF0); \
    } while (0)

/* w: b0 = {10, 3}, b1 = {7, 9} */
#define LOAD_MSG_2_3(b0, b1) \
    do { \
        b0 = _mm_blend_epi16(m5, m1, 0xF0); \
        b1 = _mm_unpackhi_epi64(m3, m4); \
    } while (0)

/* w: b0 = {14, 6}, b1 = {1, 4} */
#define LOAD_MSG_2_4(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m7, m3); \
        b1 = _mm_alignr_epi8(m2, m0, 8); \
    } while (0)

/* ---- LOAD_MSG_3_* ---------------------------------------------------- */

/* w: b0 = {7, 3}, b1 = {13, 11} */
#define LOAD_MSG_3_1(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m3, m1); \
        b1 = _mm_unpackhi_epi64(m6, m5); \
    } while (0)

/* w: b0 = {9, 1}, b1 = {12, 14} */
#define LOAD_MSG_3_2(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m4, m0); \
        b1 = _mm_unpacklo_epi64(m6, m7); \
    } while (0)

/* w: b0 = {2, 5}, b1 = {4, 15} */
#define LOAD_MSG_3_3(b0, b1) \
    do { \
        b0 = _mm_blend_epi16(m1, m2, 0xF0); \
        b1 = _mm_blend_epi16(m2, m7, 0xF0); \
    } while (0)

/* w: b0 = {6, 10}, b1 = {0, 8} */
#define LOAD_MSG_3_4(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m3, m5); \
        b1 = _mm_unpacklo_epi64(m0, m4); \
    } while (0)

/* ---- LOAD_MSG_4_* ---------------------------------------------------- */

/* w: b0 = {9, 5}, b1 = {2, 10} */
#define LOAD_MSG_4_1(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m4, m2); \
        b1 = _mm_unpacklo_epi64(m1, m5); \
    } while (0)

/* w: b0 = {0, 7}, b1 = {4, 15} */
#define LOAD_MSG_4_2(b0, b1) \
    do { \
        b0 = _mm_blend_epi16(m0, m3, 0xF0); \
        b1 = _mm_blend_epi16(m2, m7, 0xF0); \
    } while (0)

/* w: b0 = {14, 11}, b1 = {6, 3} */
#define LOAD_MSG_4_3(b0, b1) \
    do { \
        b0 = _mm_blend_epi16(m7, m5, 0xF0); \
        b1 = _mm_blend_epi16(m3, m1, 0xF0); \
    } while (0)

/* w: b0 = {1, 12}, b1 = {8, 13} */
#define LOAD_MSG_4_4(b0, b1) \
    do { \
        b0 = _mm_alignr_epi8(m6, m0, 8); \
        b1 = _mm_blend_epi16(m4, m6, 0xF0); \
    } while (0)

/* ---- LOAD_MSG_5_* ---------------------------------------------------- */

/* w: b0 = {2, 6}, b1 = {0, 8} */
#define LOAD_MSG_5_1(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m1, m3); \
        b1 = _mm_unpacklo_epi64(m0, m4); \
    } while (0)

/* w: b0 = {12, 10}, b1 = {11, 3} */
#define LOAD_MSG_5_2(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m6, m5); \
        b1 = _mm_unpackhi_epi64(m5, m1); \
    } while (0)

/* w: b0 = {4, 7}, b1 = {15, 1} */
#define LOAD_MSG_5_3(b0, b1) \
    do { \
        b0 = _mm_blend_epi16(m2, m3, 0xF0); \
        b1 = _mm_unpackhi_epi64(m7, m0); \
    } while (0)

/* w: b0 = {13, 5}, b1 = {14, 9} */
#define LOAD_MSG_5_4(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m6, m2); \
        b1 = _mm_blend_epi16(m7, m4, 0xF0); \
    } while (0)

/* ---- LOAD_MSG_6_* ---------------------------------------------------- */

/* w: b0 = {12, 1}, b1 = {14, 4} */
#define LOAD_MSG_6_1(b0, b1) \
    do { \
        b0 = _mm_blend_epi16(m6, m0, 0xF0); \
        b1 = _mm_unpacklo_epi64(m7, m2); \
    } while (0)

/* w: b0 = {5, 15}, b1 = {13, 10} */
#define LOAD_MSG_6_2(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m2, m7); \
        b1 = _mm_alignr_epi8(m5, m6, 8); \
    } while (0)

/* w: b0 = {0, 6}, b1 = {9, 8} */
#define LOAD_MSG_6_3(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m0, m3); \
        b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \
    } while (0)

/* w: b0 = {7, 3}, b1 = {2, 11} */
#define LOAD_MSG_6_4(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m3, m1); \
        b1 = _mm_blend_epi16(m1, m5, 0xF0); \
    } while (0)

/* ---- LOAD_MSG_7_* ---------------------------------------------------- */

/* w: b0 = {13, 7}, b1 = {12, 3} */
#define LOAD_MSG_7_1(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m6, m3); \
        b1 = _mm_blend_epi16(m6, m1, 0xF0); \
    } while (0)

/* w: b0 = {11, 14}, b1 = {1, 9} */
#define LOAD_MSG_7_2(b0, b1) \
    do { \
        b0 = _mm_alignr_epi8(m7, m5, 8); \
        b1 = _mm_unpackhi_epi64(m0, m4); \
    } while (0)

/* w: b0 = {5, 15}, b1 = {8, 2} */
#define LOAD_MSG_7_3(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m2, m7); \
        b1 = _mm_unpacklo_epi64(m4, m1); \
    } while (0)

/* w: b0 = {0, 4}, b1 = {6, 10} */
#define LOAD_MSG_7_4(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m0, m2); \
        b1 = _mm_unpacklo_epi64(m3, m5); \
    } while (0)

/* ---- LOAD_MSG_8_* ---------------------------------------------------- */

/* w: b0 = {6, 14}, b1 = {11, 0} */
#define LOAD_MSG_8_1(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m3, m7); \
        b1 = _mm_alignr_epi8(m0, m5, 8); \
    } while (0)

/* w: b0 = {15, 9}, b1 = {3, 8} */
#define LOAD_MSG_8_2(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m7, m4); \
        b1 = _mm_alignr_epi8(m4, m1, 8); \
    } while (0)

/* w: b0 = {12, 13}, b1 = {1, 10}  (b0 is m6 unchanged) */
#define LOAD_MSG_8_3(b0, b1) \
    do { \
        b0 = m6; \
        b1 = _mm_alignr_epi8(m5, m0, 8); \
    } while (0)

/* w: b0 = {2, 7}, b1 = {4, 5}  (b1 is m2 unchanged) */
#define LOAD_MSG_8_4(b0, b1) \
    do { \
        b0 = _mm_blend_epi16(m1, m3, 0xF0); \
        b1 = m2; \
    } while (0)

/* ---- LOAD_MSG_9_* ---------------------------------------------------- */

/* w: b0 = {10, 8}, b1 = {7, 1} */
#define LOAD_MSG_9_1(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m5, m4); \
        b1 = _mm_unpackhi_epi64(m3, m0); \
    } while (0)

/* w: b0 = {2, 4}, b1 = {6, 5} */
#define LOAD_MSG_9_2(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m1, m2); \
        b1 = _mm_blend_epi16(m3, m2, 0xF0); \
    } while (0)

/* w: b0 = {15, 9}, b1 = {3, 13} */
#define LOAD_MSG_9_3(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m7, m4); \
        b1 = _mm_unpackhi_epi64(m1, m6); \
    } while (0)

/* w: b0 = {11, 14}, b1 = {12, 0} */
#define LOAD_MSG_9_4(b0, b1) \
    do { \
        b0 = _mm_alignr_epi8(m7, m5, 8); \
        b1 = _mm_unpacklo_epi64(m6, m0); \
    } while (0)

/* ---- LOAD_MSG_10_* and LOAD_MSG_11_* ---------------------------------- */

/*
 * Groups 10 and 11 perform exactly the same loads as groups 0 and 1, so
 * they are defined as aliases rather than repeating the bodies.
 */
#define LOAD_MSG_10_1(b0, b1) LOAD_MSG_0_1(b0, b1)
#define LOAD_MSG_10_2(b0, b1) LOAD_MSG_0_2(b0, b1)
#define LOAD_MSG_10_3(b0, b1) LOAD_MSG_0_3(b0, b1)
#define LOAD_MSG_10_4(b0, b1) LOAD_MSG_0_4(b0, b1)

#define LOAD_MSG_11_1(b0, b1) LOAD_MSG_1_1(b0, b1)
#define LOAD_MSG_11_2(b0, b1) LOAD_MSG_1_2(b0, b1)
#define LOAD_MSG_11_3(b0, b1) LOAD_MSG_1_3(b0, b1)
#define LOAD_MSG_11_4(b0, b1) LOAD_MSG_1_4(b0, b1)

#endif
