#!/usr/bin/env perl

# Copyright (c) 2017, Shay Gueron.
# Copyright (c) 2017, Google Inc.
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

use warnings FATAL => 'all';

$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$code.=<<___;
.data

.align 16
one:
.quad 1,0
two:
.quad 2,0
three:
.quad 3,0
four:
.quad 4,0
five:
.quad 5,0
six:
.quad 6,0
seven:
.quad 7,0
eight:
.quad 8,0

OR_MASK:
.long 0x00000000,0x00000000,0x00000000,0x80000000
poly:
.quad 0x1, 0xc200000000000000
mask:
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
con1:
.long 1,1,1,1
con2:
.long 0x1b,0x1b,0x1b,0x1b
con3:
.byte -1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7
and_mask:
.long 0,0xffffffff, 0xffffffff, 0xffffffff
___

$code.=<<___;
.text
___

sub gfmul {
  #########################
  # a = T
  # b = TMP0 - remains unchanged
  # res = T
  # uses also TMP1,TMP2,TMP3,TMP4
  # __m128i GFMUL(__m128i A, __m128i B);

  my $T = "%xmm0";
  my $TMP0 = "%xmm1";
  my $TMP1 = "%xmm2";
  my $TMP2 = "%xmm3";
  my $TMP3 = "%xmm4";
  my $TMP4 = "%xmm5";

  $code.=<<___;
.type GFMUL,\@abi-omnipotent
.align 16
GFMUL:
.cfi_startproc
  vpclmulqdq \$0x00, $TMP0, $T, $TMP1
  vpclmulqdq \$0x11, $TMP0, $T, $TMP4
  vpclmulqdq \$0x10, $TMP0, $T, $TMP2
  vpclmulqdq \$0x01, $TMP0, $T, $TMP3
  vpxor $TMP3, $TMP2, $TMP2
  vpslldq \$8, $TMP2, $TMP3
  vpsrldq \$8, $TMP2, $TMP2
  vpxor $TMP3, $TMP1, $TMP1
  vpxor $TMP2, $TMP4, $TMP4

  vpclmulqdq \$0x10, poly(%rip), $TMP1, $TMP2
  vpshufd \$78, $TMP1, $TMP3
  vpxor $TMP3, $TMP2, $TMP1

  vpclmulqdq \$0x10, poly(%rip), $TMP1, $TMP2
  vpshufd \$78, $TMP1, $TMP3
  vpxor $TMP3, $TMP2, $TMP1

  vpxor $TMP4, $TMP1, $T
  ret
.cfi_endproc
.size GFMUL, .-GFMUL
___
}
gfmul();

sub aesgcmsiv_htable_init {
  # aesgcmsiv_htable_init writes an eight-entry table of powers of |H| to
  # |out_htable|.
  # void aesgcmsiv_htable_init(uint8_t out_htable[16*8], uint8_t *H);

  my $Htbl = "%rdi";
  my $H = "%rsi";
  my $T = "%xmm0";
  my $TMP0 = "%xmm1";

  $code.=<<___;
.globl aesgcmsiv_htable_init
.type aesgcmsiv_htable_init,\@function,2
.align 16
aesgcmsiv_htable_init:
.cfi_startproc
  vmovdqa ($H), $T
  vmovdqa $T, $TMP0
  vmovdqa $T, ($Htbl) # H
  call GFMUL
  vmovdqa $T, 16($Htbl) # H^2
  call GFMUL
  vmovdqa $T, 32($Htbl) # H^3
  call GFMUL
  vmovdqa $T, 48($Htbl) # H^4
  call GFMUL
  vmovdqa $T, 64($Htbl) # H^5
  call GFMUL
  vmovdqa $T, 80($Htbl) # H^6
  call GFMUL
  vmovdqa $T, 96($Htbl) # H^7
  call GFMUL
  vmovdqa $T, 112($Htbl) # H^8
  ret
.cfi_endproc
.size aesgcmsiv_htable_init, .-aesgcmsiv_htable_init
___
}
aesgcmsiv_htable_init();

sub aesgcmsiv_htable6_init {
  # aesgcmsiv_htable6_init writes a six-entry table of powers of |H| to
  # |out_htable|.
  # void aesgcmsiv_htable6_init(uint8_t out_htable[16*6], uint8_t *H);
  #
  my $Htbl = "%rdi";
  my $H = "%rsi";
  my $T = "%xmm0";
  my $TMP0 = "%xmm1";

  $code.=<<___;
.globl aesgcmsiv_htable6_init
.type aesgcmsiv_htable6_init,\@function,2
.align 16
aesgcmsiv_htable6_init:
.cfi_startproc
  vmovdqa ($H), $T
  vmovdqa $T, $TMP0
  vmovdqa $T, ($Htbl) # H
  call GFMUL
  vmovdqa $T, 16($Htbl) # H^2
  call GFMUL
  vmovdqa $T, 32($Htbl) # H^3
  call GFMUL
  vmovdqa $T, 48($Htbl) # H^4
  call GFMUL
  vmovdqa $T, 64($Htbl) # H^5
  call GFMUL
  vmovdqa $T, 80($Htbl) # H^6
  ret
.cfi_endproc
.size aesgcmsiv_htable6_init, .-aesgcmsiv_htable6_init
___
}
aesgcmsiv_htable6_init();

sub aesgcmsiv_htable_polyval {
  # void aesgcmsiv_htable_polyval(uint8_t Htbl[16*8], uint8_t *MSG, uint64_t LEN, uint8_t *T);
  # parameter 1: %rdi Htable - pointer to Htable
  # parameter 2: %rsi INp - pointer to input
  # parameter 3: %rdx LEN - length of BUFFER in bytes
  # parameter 4: %rcx T - pointer to POLYVAL output

  my $DATA = "%xmm0";
  my $hlp0 = "%r11";
  my $Htbl = "%rdi";
  my $inp = "%rsi";
  my $len = "%rdx";
  my $TMP0 = "%xmm3";
  my $TMP1 = "%xmm4";
  my $TMP2 = "%xmm5";
  my $TMP3 = "%xmm6";
  my $TMP4 = "%xmm7";
  my $Tp = "%rcx";
  my $T = "%xmm1";
  my $Xhi = "%xmm9";

  my $SCHOOLBOOK_AAD = sub {
    my ($i)=@_;
    return <<___;
  vpclmulqdq \$0x01, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
  vpxor $TMP3, $TMP2, $TMP2
  vpclmulqdq \$0x00, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
  vpxor $TMP3, $TMP0, $TMP0
  vpclmulqdq \$0x11, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
  vpxor $TMP3, $TMP1, $TMP1
  vpclmulqdq \$0x10, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
  vpxor $TMP3, $TMP2, $TMP2
___
  };

  $code.=<<___;
.globl aesgcmsiv_htable_polyval
.type aesgcmsiv_htable_polyval,\@function,4
.align 16
aesgcmsiv_htable_polyval:
.cfi_startproc
  test $len, $len
  jnz .Lhtable_polyval_start
  ret

.Lhtable_polyval_start:
  vzeroall

  # We hash 8 blocks each iteration. If the total number of blocks is not a
  # multiple of 8, we first hash the leading n%8 blocks.
  movq $len, $hlp0
  andq \$127, $hlp0

  jz .Lhtable_polyval_no_prefix

  vpxor $Xhi, $Xhi, $Xhi
  vmovdqa ($Tp), $T
  sub $hlp0, $len

  sub \$16, $hlp0

  # hash first prefix block
  vmovdqu ($inp), $DATA
  vpxor $T, $DATA, $DATA

  vpclmulqdq \$0x01, ($Htbl,$hlp0), $DATA, $TMP2
  vpclmulqdq \$0x00, ($Htbl,$hlp0), $DATA, $TMP0
  vpclmulqdq \$0x11, ($Htbl,$hlp0), $DATA, $TMP1
  vpclmulqdq \$0x10, ($Htbl,$hlp0), $DATA, $TMP3
  vpxor $TMP3, $TMP2, $TMP2

  lea 16($inp), $inp
  test $hlp0, $hlp0
  jnz .Lhtable_polyval_prefix_loop
  jmp .Lhtable_polyval_prefix_complete

  # hash remaining prefix blocks (up to 7 total prefix blocks)
.align 64
.Lhtable_polyval_prefix_loop:
  sub \$16, $hlp0

  vmovdqu ($inp), $DATA # next data block

  vpclmulqdq \$0x00, ($Htbl,$hlp0), $DATA, $TMP3
  vpxor $TMP3, $TMP0, $TMP0
  vpclmulqdq \$0x11, ($Htbl,$hlp0), $DATA, $TMP3
  vpxor $TMP3, $TMP1, $TMP1
  vpclmulqdq \$0x01, ($Htbl,$hlp0), $DATA, $TMP3
  vpxor $TMP3, $TMP2, $TMP2
  vpclmulqdq \$0x10, ($Htbl,$hlp0), $DATA, $TMP3
  vpxor $TMP3, $TMP2, $TMP2

  test $hlp0, $hlp0

  lea 16($inp), $inp

  jnz .Lhtable_polyval_prefix_loop

.Lhtable_polyval_prefix_complete:
  vpsrldq \$8, $TMP2, $TMP3
  vpslldq \$8, $TMP2, $TMP2

  vpxor $TMP3, $TMP1, $Xhi
  vpxor $TMP2, $TMP0, $T

  jmp .Lhtable_polyval_main_loop

.Lhtable_polyval_no_prefix:
  # At this point we know the number of blocks is a multiple of 8. However,
  # the reduction in the main loop includes a multiplication by x^(-128). In
  # order to counter this, the existing tag needs to be multiplied by x^128.
  # In practice, this just means that it is loaded into $Xhi, not $T.
  vpxor $T, $T, $T
  vmovdqa ($Tp), $Xhi

.align 64
.Lhtable_polyval_main_loop:
  sub \$0x80, $len
  jb .Lhtable_polyval_out

  vmovdqu 16*7($inp), $DATA # Ii

  vpclmulqdq \$0x01, ($Htbl), $DATA, $TMP2
  vpclmulqdq \$0x00, ($Htbl), $DATA, $TMP0
  vpclmulqdq \$0x11, ($Htbl), $DATA, $TMP1
  vpclmulqdq \$0x10, ($Htbl), $DATA, $TMP3
  vpxor $TMP3, $TMP2, $TMP2

  #########################################################
  vmovdqu 16*6($inp), $DATA
  ${\$SCHOOLBOOK_AAD->(1)}

  #########################################################
  vmovdqu 16*5($inp), $DATA

  vpclmulqdq \$0x10, poly(%rip), $T, $TMP4 # reduction stage 1a
  vpalignr \$8, $T, $T, $T

  ${\$SCHOOLBOOK_AAD->(2)}

  vpxor $TMP4, $T, $T # reduction stage 1b
  #########################################################
  vmovdqu 16*4($inp), $DATA

  ${\$SCHOOLBOOK_AAD->(3)}
  #########################################################
  vmovdqu 16*3($inp), $DATA

  vpclmulqdq \$0x10, poly(%rip), $T, $TMP4 # reduction stage 2a
  vpalignr \$8, $T, $T, $T

  ${\$SCHOOLBOOK_AAD->(4)}

  vpxor $TMP4, $T, $T # reduction stage 2b
  #########################################################
  vmovdqu 16*2($inp), $DATA

  ${\$SCHOOLBOOK_AAD->(5)}

  vpxor $Xhi, $T, $T # reduction finalize
  #########################################################
  vmovdqu 16*1($inp), $DATA

  ${\$SCHOOLBOOK_AAD->(6)}
  #########################################################
  vmovdqu 16*0($inp), $DATA
  vpxor $T, $DATA, $DATA

  ${\$SCHOOLBOOK_AAD->(7)}
  #########################################################
  vpsrldq \$8, $TMP2, $TMP3
  vpslldq \$8, $TMP2, $TMP2

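  # Fold the middle terms: Xhi:T now holds this iteration's 256-bit product.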
  vpxor $TMP3, $TMP1, $Xhi
  vpxor $TMP2, $TMP0, $T

  lea 16*8($inp), $inp
  jmp .Lhtable_polyval_main_loop

  #########################################################

.Lhtable_polyval_out:
  vpclmulqdq \$0x10, poly(%rip), $T, $TMP3
  vpalignr \$8, $T, $T, $T
  vpxor $TMP3, $T, $T

  vpclmulqdq \$0x10, poly(%rip), $T, $TMP3
  vpalignr \$8, $T, $T, $T
  vpxor $TMP3, $T, $T
  vpxor $Xhi, $T, $T

  vmovdqu $T, ($Tp)
  vzeroupper
  ret
.cfi_endproc
.size aesgcmsiv_htable_polyval,.-aesgcmsiv_htable_polyval
___
}
aesgcmsiv_htable_polyval();

sub aesgcmsiv_polyval_horner {
  # void aesgcmsiv_polyval_horner(unsigned char T[16],   // output
  #                               const unsigned char* H, // H
  #                               unsigned char* BUF,     // Buffer
  #                               unsigned int blocks);   // Len2
  #
  # parameter 1: %rdi T - pointer to POLYVAL output
  # parameter 2: %rsi Hp - pointer to H (user key)
  # parameter 3: %rdx INp - pointer to input
  # parameter 4: %rcx L - total number of blocks in input BUFFER
  #
  my $T = "%rdi";
  my $Hp = "%rsi";
  my $INp = "%rdx";
  my $L = "%rcx";
  my $LOC = "%r10";
  my $LEN = "%eax";
  my $H = "%xmm1";
  my $RES = "%xmm0";

  $code.=<<___;
.globl aesgcmsiv_polyval_horner
.type aesgcmsiv_polyval_horner,\@function,4
.align 16
aesgcmsiv_polyval_horner:
.cfi_startproc
  test $L, $L
  jnz .Lpolyval_horner_start
  ret

.Lpolyval_horner_start:
  # We will start with L GFMULS for POLYVAL(BIG_BUFFER)
  # RES = GFMUL(RES, H)

  xorq $LOC, $LOC
  shlq \$4, $L # L contains number of bytes to process

  vmovdqa ($Hp), $H
  vmovdqa ($T), $RES

.Lpolyval_horner_loop:
  vpxor ($INp,$LOC), $RES, $RES # RES = RES + Xi
  call GFMUL # RES = RES * H

  add \$16, $LOC
  cmp $LOC, $L
  jne .Lpolyval_horner_loop

  # calculation of T is complete. RES = T
  vmovdqa $RES, ($T)
  ret
.cfi_endproc
.size aesgcmsiv_polyval_horner,.-aesgcmsiv_polyval_horner
___
}
aesgcmsiv_polyval_horner();

# void aes128gcmsiv_aes_ks(const uint8_t *key, uint8_t *out_expanded_key);
# parameter 1: %rdi
# parameter 2: %rsi
$code.=<<___;
.globl aes128gcmsiv_aes_ks
.type aes128gcmsiv_aes_ks,\@function,2
.align 16
aes128gcmsiv_aes_ks:
.cfi_startproc
  vmovdqu (%rdi), %xmm1 # xmm1 = user key
  vmovdqa %xmm1, (%rsi) # rsi points to output

  vmovdqa con1(%rip), %xmm0
  vmovdqa mask(%rip), %xmm15

  movq \$8, %rax

.Lks128_loop:
  addq \$16, %rsi # rsi points to the next key
  subq \$1, %rax
  vpshufb %xmm15, %xmm1, %xmm2 # xmm2 = shuffled user key
  vaesenclast %xmm0, %xmm2, %xmm2
  vpslld \$1, %xmm0, %xmm0
  vpslldq \$4, %xmm1, %xmm3
  vpxor %xmm3, %xmm1, %xmm1
  vpslldq \$4, %xmm3, %xmm3
  vpxor %xmm3, %xmm1, %xmm1
  vpslldq \$4, %xmm3, %xmm3
  vpxor %xmm3, %xmm1, %xmm1
  vpxor %xmm2, %xmm1, %xmm1
  vmovdqa %xmm1, (%rsi)
  jne .Lks128_loop

  vmovdqa con2(%rip), %xmm0
  vpshufb %xmm15, %xmm1, %xmm2
  vaesenclast %xmm0, %xmm2, %xmm2
  vpslld \$1, %xmm0, %xmm0
  vpslldq \$4, %xmm1, %xmm3
  vpxor %xmm3, %xmm1, %xmm1
  vpslldq \$4, %xmm3, %xmm3
  vpxor %xmm3, %xmm1, %xmm1
  vpslldq \$4, %xmm3, %xmm3
  vpxor %xmm3, %xmm1, %xmm1
  vpxor %xmm2, %xmm1, %xmm1
  vmovdqa %xmm1, 16(%rsi)

  vpshufb %xmm15, %xmm1, %xmm2
  vaesenclast %xmm0, %xmm2, %xmm2
  vpslldq \$4, %xmm1, %xmm3
  vpxor %xmm3, %xmm1, %xmm1
  vpslldq \$4, %xmm3, %xmm3
  vpxor %xmm3, %xmm1, %xmm1
  vpslldq \$4, %xmm3, %xmm3
  vpxor %xmm3, %xmm1, %xmm1
  vpxor %xmm2, %xmm1, %xmm1
  vmovdqa %xmm1, 32(%rsi)
  ret
.cfi_endproc
.size aes128gcmsiv_aes_ks,.-aes128gcmsiv_aes_ks
___

# void aes256gcmsiv_aes_ks(const uint8_t *key, uint8_t *out_expanded_key);
# parameter 1: %rdi
# parameter 2: %rsi
$code.=<<___;
.globl aes256gcmsiv_aes_ks
.type aes256gcmsiv_aes_ks,\@function,2
.align 16
aes256gcmsiv_aes_ks:
.cfi_startproc
  vmovdqu (%rdi), %xmm1
  vmovdqu 16(%rdi), %xmm3
  vmovdqa %xmm1, (%rsi)
  vmovdqa %xmm3, 16(%rsi)
  vmovdqa con1(%rip), %xmm0
  vmovdqa mask(%rip), %xmm15
  vpxor %xmm14, %xmm14, %xmm14
  mov \$6, %rax

.Lks256_loop:
  add \$32, %rsi
  subq \$1, %rax
  vpshufb %xmm15, %xmm3, %xmm2
  vaesenclast %xmm0, %xmm2, %xmm2
  vpslld \$1, %xmm0, %xmm0
  vpsllq \$32, %xmm1, %xmm4
  vpxor %xmm4, %xmm1, %xmm1
  vpshufb con3(%rip), %xmm1, %xmm4
  vpxor %xmm4, %xmm1, %xmm1
  vpxor %xmm2, %xmm1, %xmm1
  vmovdqa %xmm1, (%rsi)
  vpshufd \$0xff, %xmm1, %xmm2
  vaesenclast %xmm14, %xmm2, %xmm2
  vpsllq \$32, %xmm3, %xmm4
  vpxor %xmm4, %xmm3, %xmm3
  vpshufb con3(%rip), %xmm3, %xmm4
  vpxor %xmm4, %xmm3, %xmm3
  vpxor %xmm2, %xmm3, %xmm3
  vmovdqa %xmm3, 16(%rsi)
  jne .Lks256_loop

  vpshufb %xmm15, %xmm3, %xmm2
  vaesenclast %xmm0, %xmm2, %xmm2
  vpsllq \$32, %xmm1, %xmm4
  vpxor %xmm4, %xmm1, %xmm1
  vpshufb con3(%rip), %xmm1, %xmm4
  vpxor %xmm4, %xmm1, %xmm1
  vpxor %xmm2, %xmm1, %xmm1
  vmovdqa %xmm1, 32(%rsi)
  ret
.cfi_endproc
___

sub aes128gcmsiv_aes_ks_enc_x1 {
  my $KS1_REGA = "%xmm1";
  my $KS1_REGB = "%xmm2";
  my $BLOCK1 = "%xmm4";
  my $AUXREG = "%xmm3";

  my $KS_BLOCK = sub {
    my ($reg, $reg2, $auxReg) = @_;
    return <<___;
  vpsllq \$32, $reg, $auxReg #!!saving mov instruction to xmm3
  vpxor $auxReg, $reg, $reg
  vpshufb con3(%rip), $reg, $auxReg
  vpxor $auxReg, $reg, $reg
  vpxor $reg2, $reg, $reg
___
  };

  my $round = sub {
    my ($i, $j) = @_;
    return <<___;
  vpshufb %xmm15, %xmm1, %xmm2 #!!saving mov instruction to xmm2
  vaesenclast %xmm0, %xmm2, %xmm2
  vpslld \$1, %xmm0, %xmm0
  ${\$KS_BLOCK->($KS1_REGA, $KS1_REGB, $AUXREG)}
  vaesenc %xmm1, $BLOCK1, $BLOCK1
  vmovdqa %xmm1, ${\eval(16*$i)}($j)
___
  };

  my $roundlast = sub {
    my ($i, $j) = @_;
    return <<___;
  vpshufb %xmm15, %xmm1, %xmm2 #!!saving mov instruction to xmm2
  vaesenclast %xmm0, %xmm2, %xmm2
  ${\$KS_BLOCK->($KS1_REGA, $KS1_REGB, $AUXREG)}
  vaesenclast %xmm1, $BLOCK1, $BLOCK1
  vmovdqa %xmm1, ${\eval(16*$i)}($j)
___
  };

# parameter 1: %rdi Pointer to PT
# parameter 2: %rsi Pointer to CT
# parameter 3: %rdx Pointer to keys
# parameter 4: %rcx Pointer to initial key
  $code.=<<___;
.globl aes128gcmsiv_aes_ks_enc_x1
.type aes128gcmsiv_aes_ks_enc_x1,\@function,4
.align 16
aes128gcmsiv_aes_ks_enc_x1:
.cfi_startproc
  vmovdqa (%rcx), %xmm1 # xmm1 = first 16 bytes of random key
  vmovdqa 0*16(%rdi), $BLOCK1

  vmovdqa %xmm1, (%rdx) # KEY[0] = first 16 bytes of random key
  vpxor %xmm1, $BLOCK1, $BLOCK1

  vmovdqa con1(%rip), %xmm0 # xmm0 = 1,1,1,1
  vmovdqa mask(%rip), %xmm15 # xmm15 = mask

  ${\$round->(1, "%rdx")}
  ${\$round->(2, "%rdx")}
  ${\$round->(3, "%rdx")}
  ${\$round->(4, "%rdx")}
  ${\$round->(5, "%rdx")}
  ${\$round->(6, "%rdx")}
  ${\$round->(7, "%rdx")}
  ${\$round->(8, "%rdx")}

  vmovdqa con2(%rip), %xmm0

  ${\$round->(9, "%rdx")}
  ${\$roundlast->(10, "%rdx")}

  vmovdqa $BLOCK1, 0*16(%rsi)
  ret
.cfi_endproc
.size aes128gcmsiv_aes_ks_enc_x1,.-aes128gcmsiv_aes_ks_enc_x1
___
}
aes128gcmsiv_aes_ks_enc_x1();

sub aes128gcmsiv_kdf {
  my $BLOCK1 = "%xmm9";
  my $BLOCK2 = "%xmm10";
  my $BLOCK3 = "%xmm11";
  my $BLOCK4 = "%xmm12";
  my $BLOCK5 = "%xmm13";
  my $BLOCK6 = "%xmm14";
  my $ONE = "%xmm13";
  my $KSp = "%rdx";
  my $STATE_1 = "%xmm1";

  my $enc_roundx4 = sub {
    my ($i, $j) = @_;
    return <<___;
  vmovdqa ${\eval($i*16)}(%rdx), $j
  vaesenc $j, $BLOCK1, $BLOCK1
  vaesenc $j, $BLOCK2, $BLOCK2
  vaesenc $j, $BLOCK3, $BLOCK3
  vaesenc $j, $BLOCK4, $BLOCK4
___
  };

  my $enc_roundlastx4 = sub {
    my ($i, $j) = @_;
    return <<___;
  vmovdqa ${\eval($i*16)}(%rdx), $j
  vaesenclast $j, $BLOCK1, $BLOCK1
  vaesenclast $j, $BLOCK2, $BLOCK2
  vaesenclast $j, $BLOCK3, $BLOCK3
  vaesenclast $j, $BLOCK4, $BLOCK4
___
  };

# void aes128gcmsiv_kdf(const uint8_t nonce[16],
#                       uint8_t *out_key_material,
#                       const uint8_t *key_schedule);
  $code.=<<___;
.globl aes128gcmsiv_kdf
.type aes128gcmsiv_kdf,\@function,3
.align 16
aes128gcmsiv_kdf:
.cfi_startproc
# parameter 1: %rdi Pointer to NONCE
# parameter 2: %rsi Pointer to CT
# parameter 3: %rdx Pointer to keys

  vmovdqa (%rdx), %xmm1 # xmm1 = first 16 bytes of random key
  vmovdqa 0*16(%rdi), $BLOCK1
  vmovdqa and_mask(%rip), $BLOCK4
  vmovdqa one(%rip), $ONE
  vpshufd \$0x90, $BLOCK1, $BLOCK1
  vpand $BLOCK4, $BLOCK1, $BLOCK1
  vpaddd $ONE, $BLOCK1, $BLOCK2
  vpaddd $ONE, $BLOCK2, $BLOCK3
  vpaddd $ONE, $BLOCK3, $BLOCK4

  vpxor %xmm1, $BLOCK1, $BLOCK1
  vpxor %xmm1, $BLOCK2, $BLOCK2
  vpxor %xmm1, $BLOCK3, $BLOCK3
  vpxor %xmm1, $BLOCK4, $BLOCK4

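  # Ten AES-128 rounds over the four counter blocks; %xmm1 and %xmm2 alternate
  # as the scratch register holding each round key.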
  ${\$enc_roundx4->(1, "%xmm1")}
  ${\$enc_roundx4->(2, "%xmm2")}
  ${\$enc_roundx4->(3, "%xmm1")}
  ${\$enc_roundx4->(4, "%xmm2")}
  ${\$enc_roundx4->(5, "%xmm1")}
  ${\$enc_roundx4->(6, "%xmm2")}
  ${\$enc_roundx4->(7, "%xmm1")}
  ${\$enc_roundx4->(8, "%xmm2")}
  ${\$enc_roundx4->(9, "%xmm1")}
  ${\$enc_roundlastx4->(10, "%xmm2")}

  vmovdqa $BLOCK1, 0*16(%rsi)
  vmovdqa $BLOCK2, 1*16(%rsi)
  vmovdqa $BLOCK3, 2*16(%rsi)
  vmovdqa $BLOCK4, 3*16(%rsi)
  ret
.cfi_endproc
.size aes128gcmsiv_kdf,.-aes128gcmsiv_kdf
___
}
aes128gcmsiv_kdf();

sub aes128gcmsiv_enc_msg_x4 {
  my $CTR1 = "%xmm0";
  my $CTR2 = "%xmm1";
  my $CTR3 = "%xmm2";
  my $CTR4 = "%xmm3";
  my $ADDER = "%xmm4";

  my $STATE1 = "%xmm5";
  my $STATE2 = "%xmm6";
  my $STATE3 = "%xmm7";
  my $STATE4 = "%xmm8";

  my $TMP = "%xmm12";
  my $TMP2 = "%xmm13";
  my $TMP3 = "%xmm14";
  my $IV = "%xmm15";

  my $PT = "%rdi";
  my $CT = "%rsi";
  my $TAG = "%rdx";
  my $KS = "%rcx";
  my $LEN = "%r8";

  my $aes_round = sub {
    my ($i) = @_;
    return <<___;
  vmovdqu ${\eval($i*16)}($KS), $TMP
  vaesenc $TMP, $STATE1, $STATE1
  vaesenc $TMP, $STATE2, $STATE2
  vaesenc $TMP, $STATE3, $STATE3
  vaesenc $TMP, $STATE4, $STATE4
___
  };

  my $aes_lastround = sub {
    my ($i) = @_;
    return <<___;
  vmovdqu ${\eval($i*16)}($KS), $TMP
  vaesenclast $TMP, $STATE1, $STATE1
  vaesenclast $TMP, $STATE2, $STATE2
  vaesenclast $TMP, $STATE3, $STATE3
  vaesenclast $TMP, $STATE4, $STATE4
___
  };

# void aes128gcmsiv_enc_msg_x4(unsigned char* PT, unsigned char* CT,
#                              unsigned char* TAG, unsigned char* KS,
#                              size_t byte_len);
# parameter 1: %rdi #PT
# parameter 2: %rsi #CT
# parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32]
# parameter 4: %rcx #KS
# parameter 5: %r8 #LEN MSG_length in bytes
  $code.=<<___;
.globl aes128gcmsiv_enc_msg_x4
.type aes128gcmsiv_enc_msg_x4,\@function,5
.align 16
aes128gcmsiv_enc_msg_x4:
.cfi_startproc
  test $LEN, $LEN
  jnz .L128_enc_msg_x4_start
  ret

.L128_enc_msg_x4_start:
  pushq %r12
.cfi_push %r12
  pushq %r13
.cfi_push %r13

  shrq \$4, $LEN # LEN = num of blocks
  movq $LEN, %r10
  shlq \$62, %r10
  shrq \$62, %r10

  # make IV from TAG
  vmovdqa ($TAG), $IV
  vpor OR_MASK(%rip), $IV, $IV # IV = [1]TAG[126...32][00..00]

  vmovdqu four(%rip), $ADDER # Register to increment counters
  vmovdqa $IV, $CTR1 # CTR1 = TAG[1][127...32][00..00]
  vpaddd one(%rip), $IV, $CTR2 # CTR2 = TAG[1][127...32][00..01]
  vpaddd two(%rip), $IV, $CTR3 # CTR3 = TAG[1][127...32][00..02]
  vpaddd three(%rip), $IV, $CTR4 # CTR4 = TAG[1][127...32][00..03]

  shrq \$2, $LEN
  je .L128_enc_msg_x4_check_remainder

  subq \$64, $CT
  subq \$64, $PT

.L128_enc_msg_x4_loop1:
  addq \$64, $CT
  addq \$64, $PT

  vmovdqa $CTR1, $STATE1
  vmovdqa $CTR2, $STATE2
  vmovdqa $CTR3, $STATE3
  vmovdqa $CTR4, $STATE4

  vpxor ($KS), $STATE1, $STATE1
  vpxor ($KS), $STATE2, $STATE2
  vpxor ($KS), $STATE3, $STATE3
  vpxor ($KS), $STATE4, $STATE4

  ${\$aes_round->(1)}
  vpaddd $ADDER, $CTR1, $CTR1
  ${\$aes_round->(2)}
  vpaddd $ADDER, $CTR2, $CTR2
  ${\$aes_round->(3)}
  vpaddd $ADDER, $CTR3, $CTR3
  ${\$aes_round->(4)}
  vpaddd $ADDER, $CTR4, $CTR4

  ${\$aes_round->(5)}
  ${\$aes_round->(6)}
  ${\$aes_round->(7)}
  ${\$aes_round->(8)}
  ${\$aes_round->(9)}
  ${\$aes_lastround->(10)}

  # XOR with Plaintext
  vpxor 0*16($PT), $STATE1, $STATE1
  vpxor 1*16($PT), $STATE2, $STATE2
  vpxor 2*16($PT), $STATE3, $STATE3
  vpxor 3*16($PT), $STATE4, $STATE4

  subq \$1, $LEN

  vmovdqu $STATE1, 0*16($CT)
  vmovdqu $STATE2, 1*16($CT)
  vmovdqu $STATE3, 2*16($CT)
  vmovdqu $STATE4, 3*16($CT)

  jne .L128_enc_msg_x4_loop1

  addq \$64, $CT
  addq \$64, $PT

.L128_enc_msg_x4_check_remainder:
  cmpq \$0, %r10
  je .L128_enc_msg_x4_out

.L128_enc_msg_x4_loop2:
  # enc each block separately
  # CTR1 is the highest counter (even if no LOOP done)
  vmovdqa $CTR1, $STATE1
  vpaddd one(%rip), $CTR1, $CTR1 # inc counter

  vpxor ($KS), $STATE1, $STATE1
  vaesenc 16($KS), $STATE1, $STATE1
  vaesenc 32($KS), $STATE1, $STATE1
  vaesenc 48($KS), $STATE1, $STATE1
  vaesenc 64($KS), $STATE1, $STATE1
  vaesenc 80($KS), $STATE1, $STATE1
  vaesenc 96($KS), $STATE1, $STATE1
  vaesenc 112($KS), $STATE1, $STATE1
  vaesenc 128($KS), $STATE1, $STATE1
  vaesenc 144($KS), $STATE1, $STATE1
  vaesenclast 160($KS), $STATE1, $STATE1

  # XOR with plaintext
  vpxor ($PT), $STATE1, $STATE1
  vmovdqu $STATE1, ($CT)

  addq \$16, $PT
  addq \$16, $CT

  subq \$1, %r10
  jne .L128_enc_msg_x4_loop2

.L128_enc_msg_x4_out:
  popq %r13
.cfi_pop %r13
  popq %r12
.cfi_pop %r12
  ret
.cfi_endproc
.size aes128gcmsiv_enc_msg_x4,.-aes128gcmsiv_enc_msg_x4
___
}
aes128gcmsiv_enc_msg_x4();

sub aes128gcmsiv_enc_msg_x8 {
  my $STATE1 = "%xmm1";
  my $STATE2 = "%xmm2";
  my $STATE3 = "%xmm3";
  my $STATE4 = "%xmm4";
  my $STATE5 = "%xmm5";
  my $STATE6 = "%xmm6";
  my $STATE7 = "%xmm7";
  my $STATE8 = "%xmm8";

  my $CTR1 = "%xmm0";
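  # Counters use %xmm0 and %xmm9-%xmm14 so they never collide with the eight
  # state registers above.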
  my $CTR2 = "%xmm9";
  my $CTR3 = "%xmm10";
  my $CTR4 = "%xmm11";
  my $CTR5 = "%xmm12";
  my $CTR6 = "%xmm13";
  my $CTR7 = "%xmm14";
  my $SCHED = "%xmm15";

  my $TMP1 = "%xmm1";
  my $TMP2 = "%xmm2";

  my $PT = "%rdi";
  my $CT = "%rsi";
  my $TAG = "%rdx";
  my $KS = "%rcx";
  my $LEN = "%r8";

  my $aes_round8 = sub {
    my ($i) = @_;
    return <<___;
  vmovdqu ${\eval($i*16)}($KS), $SCHED
  vaesenc $SCHED, $STATE1, $STATE1
  vaesenc $SCHED, $STATE2, $STATE2
  vaesenc $SCHED, $STATE3, $STATE3
  vaesenc $SCHED, $STATE4, $STATE4
  vaesenc $SCHED, $STATE5, $STATE5
  vaesenc $SCHED, $STATE6, $STATE6
  vaesenc $SCHED, $STATE7, $STATE7
  vaesenc $SCHED, $STATE8, $STATE8
___
  };

  my $aes_lastround8 = sub {
    my ($i) = @_;
    return <<___;
  vmovdqu ${\eval($i*16)}($KS), $SCHED
  vaesenclast $SCHED, $STATE1, $STATE1
  vaesenclast $SCHED, $STATE2, $STATE2
  vaesenclast $SCHED, $STATE3, $STATE3
  vaesenclast $SCHED, $STATE4, $STATE4
  vaesenclast $SCHED, $STATE5, $STATE5
  vaesenclast $SCHED, $STATE6, $STATE6
  vaesenclast $SCHED, $STATE7, $STATE7
  vaesenclast $SCHED, $STATE8, $STATE8
___
  };

# void aes128gcmsiv_enc_msg_x8(unsigned char* PT,
#                              unsigned char* CT,
#                              unsigned char* TAG,
#                              unsigned char* KS,
#                              size_t byte_len);
# parameter 1: %rdi #PT
# parameter 2: %rsi #CT
# parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32]
# parameter 4: %rcx #KS
# parameter 5: %r8 #LEN MSG_length in bytes
  $code.=<<___;
.globl aes128gcmsiv_enc_msg_x8
.type aes128gcmsiv_enc_msg_x8,\@function,5
.align 16
aes128gcmsiv_enc_msg_x8:
.cfi_startproc
  test $LEN, $LEN
  jnz .L128_enc_msg_x8_start
  ret

.L128_enc_msg_x8_start:
  pushq %r12
.cfi_push %r12
  pushq %r13
.cfi_push %r13
  pushq %rbp
.cfi_push %rbp
  movq %rsp, %rbp
.cfi_def_cfa_register rbp

  # Allocate aligned scratch space on the stack
  subq \$128, %rsp
  andq \$-64, %rsp

  shrq \$4, $LEN # LEN = num of blocks
  movq $LEN, %r10
  shlq \$61, %r10
  shrq \$61, %r10

  # make IV from TAG
  vmovdqu ($TAG), $TMP1
  vpor OR_MASK(%rip), $TMP1, $TMP1 # TMP1 = IV = [1]TAG[126...32][00..00]

  # store counter8 on the stack
  vpaddd seven(%rip), $TMP1, $CTR1
  vmovdqu $CTR1, (%rsp) # CTR8 = TAG[127...32][00..07]
  vpaddd one(%rip), $TMP1, $CTR2 # CTR2 = TAG[127...32][00..01]
  vpaddd two(%rip), $TMP1, $CTR3 # CTR3 = TAG[127...32][00..02]
  vpaddd three(%rip), $TMP1, $CTR4 # CTR4 = TAG[127...32][00..03]
  vpaddd four(%rip), $TMP1, $CTR5 # CTR5 = TAG[127...32][00..04]
  vpaddd five(%rip), $TMP1, $CTR6 # CTR6 = TAG[127...32][00..05]
  vpaddd six(%rip), $TMP1, $CTR7 # CTR7 = TAG[127...32][00..06]
  vmovdqa $TMP1, $CTR1 # CTR1 = TAG[127...32][00..00]

  shrq \$3, $LEN
  je .L128_enc_msg_x8_check_remainder

  subq \$128, $CT
  subq \$128, $PT

.L128_enc_msg_x8_loop1:
  addq \$128, $CT
  addq \$128, $PT

  vmovdqa $CTR1, $STATE1
  vmovdqa $CTR2, $STATE2
  vmovdqa $CTR3, $STATE3
  vmovdqa $CTR4, $STATE4
  vmovdqa $CTR5, $STATE5
  vmovdqa $CTR6, $STATE6
  vmovdqa $CTR7, $STATE7
  # move from stack
  vmovdqu (%rsp), $STATE8

  vpxor ($KS), $STATE1, $STATE1
  vpxor ($KS), $STATE2, $STATE2
  vpxor ($KS), $STATE3, $STATE3
  vpxor ($KS), $STATE4, $STATE4
  vpxor ($KS), $STATE5, $STATE5
  vpxor ($KS), $STATE6, $STATE6
  vpxor ($KS), $STATE7, $STATE7
  vpxor ($KS), $STATE8, $STATE8

  ${\$aes_round8->(1)}
  vmovdqu (%rsp), $CTR7 # deal with CTR8
  vpaddd eight(%rip), $CTR7, $CTR7
  vmovdqu $CTR7, (%rsp)
  ${\$aes_round8->(2)}
  vpsubd one(%rip), $CTR7, $CTR7
  ${\$aes_round8->(3)}
  vpaddd eight(%rip), $CTR1, $CTR1
  ${\$aes_round8->(4)}
  vpaddd eight(%rip), $CTR2, $CTR2
  ${\$aes_round8->(5)}
  vpaddd eight(%rip), $CTR3, $CTR3
  ${\$aes_round8->(6)}
  vpaddd eight(%rip), $CTR4, $CTR4
  ${\$aes_round8->(7)}
  vpaddd eight(%rip), $CTR5, $CTR5
  ${\$aes_round8->(8)}
  vpaddd eight(%rip), $CTR6, $CTR6
  ${\$aes_round8->(9)}
  ${\$aes_lastround8->(10)}

  # XOR with Plaintext
  vpxor 0*16($PT), $STATE1, $STATE1
  vpxor 1*16($PT), $STATE2, $STATE2
  vpxor 2*16($PT), $STATE3, $STATE3
  vpxor 3*16($PT), $STATE4, $STATE4
  vpxor 4*16($PT), $STATE5, $STATE5
  vpxor 5*16($PT), $STATE6, $STATE6
  vpxor 6*16($PT), $STATE7, $STATE7
  vpxor 7*16($PT), $STATE8, $STATE8

  dec $LEN

  vmovdqu $STATE1, 0*16($CT)
  vmovdqu $STATE2, 1*16($CT)
  vmovdqu $STATE3, 2*16($CT)
  vmovdqu $STATE4, 3*16($CT)
  vmovdqu $STATE5, 4*16($CT)
  vmovdqu $STATE6, 5*16($CT)
  vmovdqu $STATE7, 6*16($CT)
  vmovdqu $STATE8, 7*16($CT)

  jne .L128_enc_msg_x8_loop1

  addq \$128, $CT
  addq \$128, $PT

.L128_enc_msg_x8_check_remainder:
  cmpq \$0, %r10
  je .L128_enc_msg_x8_out

.L128_enc_msg_x8_loop2:
  # enc each block separately
  # CTR1 is the highest counter (even if no LOOP done)
  vmovdqa $CTR1, $STATE1
  vpaddd one(%rip), $CTR1, $CTR1 # inc counter

  vpxor ($KS), $STATE1, $STATE1
  vaesenc 16($KS), $STATE1, $STATE1
  vaesenc 32($KS), $STATE1, $STATE1
  vaesenc 48($KS), $STATE1, $STATE1
  vaesenc 64($KS), $STATE1, $STATE1
  vaesenc 80($KS), $STATE1, $STATE1
  vaesenc 96($KS), $STATE1, $STATE1
  vaesenc 112($KS), $STATE1, $STATE1
  vaesenc 128($KS), $STATE1, $STATE1
  vaesenc 144($KS), $STATE1, $STATE1
  vaesenclast 160($KS), $STATE1, $STATE1

  # XOR with Plaintext
  vpxor ($PT), $STATE1, $STATE1

  vmovdqu $STATE1, ($CT)

  addq \$16, $PT
  addq \$16, $CT

  decq %r10
  jne .L128_enc_msg_x8_loop2

.L128_enc_msg_x8_out:
  movq %rbp, %rsp
.cfi_def_cfa_register %rsp
  popq %rbp
.cfi_pop %rbp
  popq %r13
.cfi_pop %r13
  popq %r12
.cfi_pop %r12
  ret
.cfi_endproc
.size aes128gcmsiv_enc_msg_x8,.-aes128gcmsiv_enc_msg_x8
___
}
aes128gcmsiv_enc_msg_x8();

sub aesgcmsiv_dec {
  my ($aes256) = @_;

  my $T = "%xmm0";
  my $TMP0 = "%xmm1";
  my $TMP1 = "%xmm2";
  my $TMP2 = "%xmm3";
  my $TMP3 = "%xmm4";
  my $TMP4 = "%xmm5";
  my $TMP5 = "%xmm6";
  my $CTR1 = "%xmm7";
  my $CTR2 = "%xmm8";
  my $CTR3 = "%xmm9";
  my $CTR4 = "%xmm10";
  my $CTR5 = "%xmm11";
  my $CTR6 = "%xmm12";
  my $CTR = "%xmm15";
  my $CT = "%rdi";
  my $PT = "%rsi";
  my $POL = "%rdx";
  my $Htbl = "%rcx";
  my $KS = "%r8";
  my $LEN = "%r9";
  my $secureBuffer = "%rax";
  my $HTABLE_ROUNDS = "%xmm13";

  my $labelPrefix = "128";
  if ($aes256) {
    $labelPrefix = "256";
  }

  my $aes_round_dec = sub {
    my ($i) = @_;
    return <<___;
  vmovdqu ${\eval($i*16)}($KS), $TMP3
  vaesenc $TMP3, $CTR1, $CTR1
  vaesenc $TMP3, $CTR2, $CTR2
  vaesenc $TMP3, $CTR3, $CTR3
  vaesenc $TMP3, $CTR4, $CTR4
  vaesenc $TMP3, $CTR5, $CTR5
  vaesenc $TMP3, $CTR6, $CTR6
___
  };

  my $aes_lastround_dec = sub {
    my ($i) = @_;
    return <<___;
  vmovdqu ${\eval($i*16)}($KS), $TMP3
  vaesenclast $TMP3, $CTR1, $CTR1
  vaesenclast $TMP3, $CTR2, $CTR2
  vaesenclast $TMP3, $CTR3, $CTR3
  vaesenclast $TMP3, $CTR4, $CTR4
  vaesenclast $TMP3, $CTR5, $CTR5
  vaesenclast $TMP3, $CTR6, $CTR6
___
  };

  my $schoolbook = sub {
    my ($i) = @_;
    return <<___;
  vmovdqu ${\eval($i*16-32)}($secureBuffer), $TMP5
  vmovdqu ${\eval($i*16-32)}($Htbl), $HTABLE_ROUNDS

  vpclmulqdq \$0x10, $HTABLE_ROUNDS, $TMP5, $TMP3
  vpxor $TMP3, $TMP0, $TMP0
  vpclmulqdq \$0x11, $HTABLE_ROUNDS, $TMP5, $TMP3
  vpxor $TMP3, $TMP1, $TMP1
  vpclmulqdq \$0x00, $HTABLE_ROUNDS, $TMP5, $TMP3
  vpxor $TMP3, $TMP2, $TMP2
  vpclmulqdq \$0x01, $HTABLE_ROUNDS, $TMP5, $TMP3
  vpxor $TMP3, $TMP0, $TMP0
___
  };

  if ($aes256) {
    $code.=<<___;
.globl aes256gcmsiv_dec
.type aes256gcmsiv_dec,\@function,6
.align 16
aes256gcmsiv_dec:
___
  } else {
    $code.=<<___;
.globl aes128gcmsiv_dec
.type aes128gcmsiv_dec,\@function,6
.align 16
aes128gcmsiv_dec:
___
  }

  $code.=<<___;
.cfi_startproc
  test \$~15, $LEN
  jnz .L${labelPrefix}_dec_start
  ret

.L${labelPrefix}_dec_start:
  vzeroupper
  vmovdqa ($POL), $T
  movq $POL, $secureBuffer

  leaq 32($secureBuffer), $secureBuffer
  leaq 32($Htbl), $Htbl

  # make CTRBLKs from given tag.
  vmovdqu ($CT,$LEN), $CTR
  vpor OR_MASK(%rip), $CTR, $CTR # CTR = [1]TAG[126...32][00..00]
  andq \$~15, $LEN

  # If less than 6 blocks, make singles
  cmp \$96, $LEN
  jb .L${labelPrefix}_dec_loop2

  # Decrypt the first six blocks
  sub \$96, $LEN
  vmovdqa $CTR, $CTR1
  vpaddd one(%rip), $CTR1, $CTR2
  vpaddd two(%rip), $CTR1, $CTR3
  vpaddd one(%rip), $CTR3, $CTR4
  vpaddd two(%rip), $CTR3, $CTR5
  vpaddd one(%rip), $CTR5, $CTR6
  vpaddd two(%rip), $CTR5, $CTR

  vpxor ($KS), $CTR1, $CTR1
  vpxor ($KS), $CTR2, $CTR2
  vpxor ($KS), $CTR3, $CTR3
  vpxor ($KS), $CTR4, $CTR4
  vpxor ($KS), $CTR5, $CTR5
  vpxor ($KS), $CTR6, $CTR6

  ${\$aes_round_dec->(1)}
  ${\$aes_round_dec->(2)}
  ${\$aes_round_dec->(3)}
  ${\$aes_round_dec->(4)}
  ${\$aes_round_dec->(5)}
  ${\$aes_round_dec->(6)}
  ${\$aes_round_dec->(7)}
  ${\$aes_round_dec->(8)}
  ${\$aes_round_dec->(9)}
___

if ($aes256) {
$code.=<<___;
  ${\$aes_round_dec->(10)}
  ${\$aes_round_dec->(11)}
  ${\$aes_round_dec->(12)}
  ${\$aes_round_dec->(13)}
  ${\$aes_lastround_dec->(14)}
___
} else {
$code.=<<___;
  ${\$aes_lastround_dec->(10)}
___
}

$code.=<<___;
  # XOR with CT
  vpxor 0*16($CT), $CTR1, $CTR1
  vpxor 1*16($CT), $CTR2, $CTR2
  vpxor 2*16($CT), $CTR3, $CTR3
  vpxor 3*16($CT), $CTR4, $CTR4
  vpxor 4*16($CT), $CTR5, $CTR5
  vpxor 5*16($CT), $CTR6, $CTR6

  vmovdqu $CTR1, 0*16($PT)
  vmovdqu $CTR2, 1*16($PT)
  vmovdqu $CTR3, 2*16($PT)
  vmovdqu $CTR4, 3*16($PT)
  vmovdqu $CTR5, 4*16($PT)
  vmovdqu $CTR6, 5*16($PT)

  addq \$96, $CT
  addq \$96, $PT
  jmp .L${labelPrefix}_dec_loop1

# Decrypt 6 blocks each time while hashing previous 6 blocks
.align 64
.L${labelPrefix}_dec_loop1:
  cmp \$96, $LEN
  jb .L${labelPrefix}_dec_finish_96
  sub \$96, $LEN

  vmovdqa $CTR6, $TMP5
  vmovdqa $CTR5, 1*16-32($secureBuffer)
  vmovdqa $CTR4, 2*16-32($secureBuffer)
  vmovdqa $CTR3, 3*16-32($secureBuffer)
  vmovdqa $CTR2, 4*16-32($secureBuffer)
  vmovdqa $CTR1, 5*16-32($secureBuffer)

  vmovdqa $CTR, $CTR1
  vpaddd one(%rip), $CTR1, $CTR2
  vpaddd two(%rip), $CTR1, $CTR3
  vpaddd one(%rip), $CTR3, $CTR4
  vpaddd two(%rip), $CTR3, $CTR5
  vpaddd one(%rip), $CTR5, $CTR6
  vpaddd two(%rip), $CTR5, $CTR

  vmovdqa ($KS), $TMP3
  vpxor $TMP3, $CTR1, $CTR1
  vpxor $TMP3, $CTR2, $CTR2
  vpxor $TMP3, $CTR3, $CTR3
  vpxor $TMP3, $CTR4, $CTR4
  vpxor $TMP3, $CTR5, $CTR5
  vpxor $TMP3, $CTR6, $CTR6

  vmovdqu 0*16-32($Htbl), $TMP3
  vpclmulqdq \$0x11, $TMP3, $TMP5, $TMP1
  vpclmulqdq \$0x00, $TMP3, $TMP5, $TMP2
  vpclmulqdq \$0x01, $TMP3, $TMP5, $TMP0
  vpclmulqdq \$0x10, $TMP3, $TMP5, $TMP3
  vpxor $TMP3, $TMP0, $TMP0

  ${\$aes_round_dec->(1)}
  ${\$schoolbook->(1)}

  ${\$aes_round_dec->(2)}
  ${\$schoolbook->(2)}

  ${\$aes_round_dec->(3)}
  ${\$schoolbook->(3)}

  ${\$aes_round_dec->(4)}
  ${\$schoolbook->(4)}

  ${\$aes_round_dec->(5)}
  ${\$aes_round_dec->(6)}
  ${\$aes_round_dec->(7)}

  vmovdqa 5*16-32($secureBuffer), $TMP5
  vpxor $T, $TMP5, $TMP5
  vmovdqu 5*16-32($Htbl), $TMP4

  vpclmulqdq \$0x01, $TMP4, $TMP5, $TMP3
  vpxor $TMP3, $TMP0, $TMP0
  vpclmulqdq \$0x11, $TMP4, $TMP5, $TMP3
  vpxor $TMP3, $TMP1, $TMP1
  vpclmulqdq \$0x00, $TMP4, $TMP5, $TMP3
  vpxor $TMP3, $TMP2, $TMP2
  vpclmulqdq \$0x10, $TMP4, $TMP5, $TMP3
  vpxor $TMP3, $TMP0, $TMP0

  ${\$aes_round_dec->(8)}

  vpsrldq \$8, $TMP0, $TMP3
  vpxor $TMP3, $TMP1, $TMP4
  vpslldq \$8, $TMP0, $TMP3
  vpxor $TMP3, $TMP2, $T

  vmovdqa poly(%rip), $TMP2

  ${\$aes_round_dec->(9)}
___

if ($aes256) {
$code.=<<___;
  ${\$aes_round_dec->(10)}
  ${\$aes_round_dec->(11)}
  ${\$aes_round_dec->(12)}
  ${\$aes_round_dec->(13)}
  vmovdqu 14*16($KS), $TMP5
___
} else {
$code.=<<___;
  vmovdqu 10*16($KS), $TMP5
___
}

$code.=<<___;
  vpalignr \$8, $T, $T, $TMP1
  vpclmulqdq \$0x10, $TMP2, $T, $T
  vpxor $T, $TMP1, $T

  vpxor 0*16($CT), $TMP5, $TMP3
  vaesenclast $TMP3, $CTR1, $CTR1
  vpxor 1*16($CT), $TMP5, $TMP3
  vaesenclast $TMP3, $CTR2, $CTR2
  vpxor 2*16($CT), $TMP5, $TMP3
  vaesenclast $TMP3, $CTR3, $CTR3
  vpxor 3*16($CT), $TMP5, $TMP3
  vaesenclast $TMP3, $CTR4, $CTR4
  vpxor 4*16($CT), $TMP5, $TMP3
  vaesenclast $TMP3, $CTR5, $CTR5
  vpxor 5*16($CT), $TMP5, $TMP3
  vaesenclast $TMP3, $CTR6, $CTR6

  vpalignr \$8, $T, $T, $TMP1
  vpclmulqdq \$0x10, $TMP2, $T, $T
  vpxor $T, $TMP1, $T

  vmovdqu $CTR1, 0*16($PT)
  vmovdqu $CTR2, 1*16($PT)
  vmovdqu $CTR3, 2*16($PT)
  vmovdqu $CTR4, 3*16($PT)
  vmovdqu $CTR5, 4*16($PT)
  vmovdqu $CTR6, 5*16($PT)

  vpxor $TMP4, $T, $T

  lea 96($CT), $CT
  lea 96($PT), $PT
  jmp .L${labelPrefix}_dec_loop1

.L${labelPrefix}_dec_finish_96:
  vmovdqa $CTR6, $TMP5
  vmovdqa $CTR5, 1*16-32($secureBuffer)
  vmovdqa $CTR4, 2*16-32($secureBuffer)
  vmovdqa $CTR3, 3*16-32($secureBuffer)
  vmovdqa $CTR2, 4*16-32($secureBuffer)
  vmovdqa $CTR1, 5*16-32($secureBuffer)

  vmovdqu 0*16-32($Htbl), $TMP3
  vpclmulqdq \$0x10, $TMP3, $TMP5, $TMP0
  vpclmulqdq \$0x11, $TMP3, $TMP5, $TMP1
  vpclmulqdq \$0x00, $TMP3, $TMP5, $TMP2
  vpclmulqdq \$0x01, $TMP3, $TMP5, $TMP3
  vpxor $TMP3, $TMP0, $TMP0

  ${\$schoolbook->(1)}
  ${\$schoolbook->(2)}
  ${\$schoolbook->(3)}
  ${\$schoolbook->(4)}

  vmovdqu 5*16-32($secureBuffer), $TMP5
  vpxor $T, $TMP5, $TMP5
  vmovdqu 5*16-32($Htbl), $TMP4
  vpclmulqdq \$0x11, $TMP4, $TMP5, $TMP3
  vpxor $TMP3, $TMP1, $TMP1
  vpclmulqdq \$0x00, $TMP4, $TMP5, $TMP3
  vpxor $TMP3, $TMP2, $TMP2
  vpclmulqdq \$0x10, $TMP4, $TMP5, $TMP3
  vpxor $TMP3, $TMP0, $TMP0
  vpclmulqdq \$0x01, $TMP4, $TMP5, $TMP3
  vpxor $TMP3, $TMP0, $TMP0

  vpsrldq \$8, $TMP0, $TMP3
  vpxor $TMP3, $TMP1, $TMP4
  vpslldq \$8, $TMP0, $TMP3
  vpxor $TMP3, $TMP2, $T

  vmovdqa poly(%rip), $TMP2

  vpalignr \$8, $T, $T, $TMP1
  vpclmulqdq \$0x10, $TMP2, $T, $T
  vpxor $T, $TMP1, $T

  vpalignr \$8, $T, $T, $TMP1
  vpclmulqdq \$0x10, $TMP2, $T, $T
  vpxor $T, $TMP1, $T

  vpxor $TMP4, $T, $T

.L${labelPrefix}_dec_loop2:
  # Here we encrypt any remaining whole block

  # if there are no whole blocks
  cmp \$16, $LEN
  jb .L${labelPrefix}_dec_out
  sub \$16, $LEN

  vmovdqa $CTR, $TMP1
  vpaddd one(%rip), $CTR, $CTR

  vpxor 0*16($KS), $TMP1, $TMP1
  vaesenc 1*16($KS), $TMP1, $TMP1
  vaesenc 2*16($KS), $TMP1, $TMP1
  vaesenc 3*16($KS), $TMP1, $TMP1
  vaesenc 4*16($KS), $TMP1, $TMP1
  vaesenc 5*16($KS), $TMP1, $TMP1
  vaesenc 6*16($KS), $TMP1, $TMP1
  vaesenc 7*16($KS), $TMP1, $TMP1
  vaesenc 8*16($KS), $TMP1, $TMP1
  vaesenc 9*16($KS), $TMP1, $TMP1
___
if ($aes256) {
$code.=<<___;
  vaesenc 10*16($KS), $TMP1, $TMP1
  vaesenc 11*16($KS), $TMP1, $TMP1
  vaesenc 12*16($KS), $TMP1, $TMP1
  vaesenc 13*16($KS), $TMP1, $TMP1
  vaesenclast 14*16($KS), $TMP1, $TMP1
___
} else {
$code.=<<___;
  vaesenclast 10*16($KS), $TMP1, $TMP1
___
}

$code.=<<___;
  vpxor ($CT), $TMP1, $TMP1
  vmovdqu $TMP1, ($PT)
  addq \$16, $CT
  addq \$16, $PT

  vpxor $TMP1, $T, $T
  vmovdqa -32($Htbl), $TMP0
  call GFMUL

  jmp .L${labelPrefix}_dec_loop2

.L${labelPrefix}_dec_out:
  vmovdqu $T, ($POL)
  ret
.cfi_endproc
___

  if ($aes256) {
    $code.=<<___;
.size aes256gcmsiv_dec, .-aes256gcmsiv_dec
___
  } else {
    $code.=<<___;
.size aes128gcmsiv_dec, .-aes128gcmsiv_dec
___
  }
}

aesgcmsiv_dec(0); # emit 128-bit version

sub aes128gcmsiv_ecb_enc_block {
  my $STATE_1 = "%xmm1";
  my $KSp = "%rdx";

  # parameter 1: PT %rdi (pointer to 128 bit)
  # parameter 2: CT %rsi (pointer to 128 bit)
  # parameter 3: ks %rdx (pointer to ks)
  $code.=<<___;
.globl aes128gcmsiv_ecb_enc_block
.type aes128gcmsiv_ecb_enc_block,\@function,3
.align 16
aes128gcmsiv_ecb_enc_block:
.cfi_startproc
  vmovdqa (%rdi), $STATE_1

  vpxor ($KSp), $STATE_1, $STATE_1
  vaesenc 1*16($KSp), $STATE_1, $STATE_1
  vaesenc 2*16($KSp), $STATE_1, $STATE_1
  vaesenc 3*16($KSp), $STATE_1, $STATE_1
  vaesenc 4*16($KSp), $STATE_1, $STATE_1
  vaesenc 5*16($KSp), $STATE_1, $STATE_1
  vaesenc 6*16($KSp), $STATE_1, $STATE_1
  vaesenc 7*16($KSp), $STATE_1, $STATE_1
  vaesenc 8*16($KSp), $STATE_1, $STATE_1
  vaesenc 9*16($KSp), $STATE_1, $STATE_1
  vaesenclast 10*16($KSp), $STATE_1, $STATE_1 # STATE_1 == IV

  vmovdqa $STATE_1, (%rsi)

  ret
.cfi_endproc
.size aes128gcmsiv_ecb_enc_block,.-aes128gcmsiv_ecb_enc_block
___
}
aes128gcmsiv_ecb_enc_block();

sub aes256gcmsiv_aes_ks_enc_x1 {
  my $KS = "%rdx";
  my $KEYp = "%rcx";
  my $CON_MASK = "%xmm0";
  my $MASK_256 = "%xmm15";
  my $KEY_1 = "%xmm1";
  my $KEY_2 = "%xmm3";
  my $BLOCK1 = "%xmm8";
  my $AUX_REG = "%xmm14";
  my $PT = "%rdi";
  my $CT = "%rsi";

  my $round_double = sub {
    my ($i, $j) = @_;
    return <<___;
  vpshufb %xmm15, %xmm3, %xmm2
  vaesenclast %xmm0, %xmm2, %xmm2
  vpslld \$1, %xmm0, %xmm0
  vpslldq \$4, %xmm1, %xmm4
  vpxor %xmm4, %xmm1, %xmm1
  vpslldq \$4, %xmm4, %xmm4
  vpxor %xmm4, %xmm1, %xmm1
  vpslldq \$4, %xmm4, %xmm4
  vpxor %xmm4, %xmm1, %xmm1
  vpxor %xmm2, %xmm1, %xmm1
  vaesenc %xmm1, $BLOCK1, $BLOCK1
  vmovdqu %xmm1, ${\eval(16*$i)}($KS)

  vpshufd \$0xff, %xmm1, %xmm2
  vaesenclast %xmm14, %xmm2, %xmm2
  vpslldq \$4, %xmm3, %xmm4
  vpxor %xmm4, %xmm3, %xmm3
  vpslldq \$4, %xmm4, %xmm4
  vpxor %xmm4, %xmm3, %xmm3
  vpslldq \$4, %xmm4, %xmm4
  vpxor %xmm4, %xmm3, %xmm3
  vpxor %xmm2, %xmm3, %xmm3
  vaesenc %xmm3, $BLOCK1, $BLOCK1
  vmovdqu %xmm3, ${\eval(16*$j)}($KS)
___
  };

  my $round_last = sub {
    my ($i) = @_;
    return <<___;
  vpshufb %xmm15, %xmm3, %xmm2
  vaesenclast %xmm0, %xmm2, %xmm2
  vpslldq \$4, %xmm1, %xmm4
  vpxor %xmm4, %xmm1, %xmm1
  vpslldq \$4, %xmm4, %xmm4
  vpxor %xmm4, %xmm1, %xmm1
  vpslldq \$4, %xmm4, %xmm4
  vpxor %xmm4, %xmm1, %xmm1
  vpxor %xmm2, %xmm1, %xmm1
  vaesenclast %xmm1, $BLOCK1, $BLOCK1
  vmovdqu %xmm1, ${\eval(16*$i)}($KS)
___
  };

  # parameter 1: %rdi Pointer to PT1
  # parameter 2: %rsi Pointer to CT1
  # parameter 3: %rdx Pointer to KS
  # parameter 4: %rcx Pointer to initial key
  $code.=<<___;
.globl aes256gcmsiv_aes_ks_enc_x1
.type aes256gcmsiv_aes_ks_enc_x1,\@function,4
.align 16
aes256gcmsiv_aes_ks_enc_x1:
.cfi_startproc
  vmovdqa con1(%rip), $CON_MASK # CON_MASK = 1,1,1,1
  vmovdqa mask(%rip), $MASK_256 # MASK_256
  vmovdqa ($PT), $BLOCK1
  vmovdqa ($KEYp), $KEY_1 # KEY_1 || KEY_2 [0..7] = user key
  vmovdqa 16($KEYp), $KEY_2
  vpxor $KEY_1, $BLOCK1, $BLOCK1
  vaesenc $KEY_2, $BLOCK1, $BLOCK1
  vmovdqu $KEY_1, ($KS) # First round key
  vmovdqu $KEY_2, 16($KS)
  vpxor $AUX_REG, $AUX_REG, $AUX_REG

  ${\$round_double->(2, 3)}
  ${\$round_double->(4, 5)}
  ${\$round_double->(6, 7)}
  ${\$round_double->(8, 9)}
  ${\$round_double->(10, 11)}
  ${\$round_double->(12, 13)}
  ${\$round_last->(14)}
  vmovdqa $BLOCK1, ($CT)
  ret
.cfi_endproc
.size aes256gcmsiv_aes_ks_enc_x1,.-aes256gcmsiv_aes_ks_enc_x1
___
}
aes256gcmsiv_aes_ks_enc_x1();

sub aes256gcmsiv_ecb_enc_block {
  my $STATE_1 = "%xmm1";
  my $PT = "%rdi";
  my $CT = "%rsi";
  my $KSp = "%rdx";

  # parameter 1: PT %rdi (pointer to 128 bit)
  # parameter 2: CT %rsi (pointer to 128 bit)
  # parameter 3: ks %rdx (pointer to ks)
  $code.=<<___;
.globl aes256gcmsiv_ecb_enc_block
.type aes256gcmsiv_ecb_enc_block,\@function,3
.align 16
aes256gcmsiv_ecb_enc_block:
.cfi_startproc
  vmovdqa (%rdi), $STATE_1
  vpxor ($KSp), $STATE_1, $STATE_1
  vaesenc 1*16($KSp), $STATE_1, $STATE_1
  vaesenc 2*16($KSp), $STATE_1, $STATE_1
  vaesenc 3*16($KSp), $STATE_1, $STATE_1
  vaesenc 4*16($KSp), $STATE_1, $STATE_1
  vaesenc 5*16($KSp), $STATE_1, $STATE_1
  vaesenc 6*16($KSp), $STATE_1, $STATE_1
  vaesenc 7*16($KSp), $STATE_1, $STATE_1
  vaesenc 8*16($KSp), $STATE_1, $STATE_1
  vaesenc 9*16($KSp), $STATE_1, $STATE_1
  vaesenc 10*16($KSp), $STATE_1, $STATE_1
  vaesenc 11*16($KSp), $STATE_1, $STATE_1
  vaesenc 12*16($KSp), $STATE_1, $STATE_1
  vaesenc 13*16($KSp), $STATE_1, $STATE_1
  vaesenclast 14*16($KSp), $STATE_1, $STATE_1 # $STATE_1 == IV
  vmovdqa $STATE_1, (%rsi)
  ret
.cfi_endproc
.size aes256gcmsiv_ecb_enc_block,.-aes256gcmsiv_ecb_enc_block
___
}
aes256gcmsiv_ecb_enc_block();

sub aes256gcmsiv_enc_msg_x4 {
  my $CTR1 = "%xmm0";
  my $CTR2 = "%xmm1";
  my $CTR3 = "%xmm2";
  my $CTR4 = "%xmm3";
  my $ADDER = "%xmm4";

  my $STATE1 = "%xmm5";
  my $STATE2 = "%xmm6";
  my $STATE3 = "%xmm7";
  my $STATE4 = "%xmm8";

  my $TMP = "%xmm12";
  my $TMP2 = "%xmm13";
  my $TMP3 = "%xmm14";
  my $IV = "%xmm15";

  my $PT = "%rdi";
  my $CT = "%rsi";
  my $TAG = "%rdx";
  my $KS = "%rcx";
  my $LEN = "%r8";

  my $aes_round = sub {
    my ($i) = @_;
    return <<___;
  vmovdqu ${\eval($i*16)}($KS), $TMP
  vaesenc $TMP, $STATE1, $STATE1
  vaesenc $TMP, $STATE2, $STATE2
  vaesenc $TMP, $STATE3, $STATE3
  vaesenc $TMP, $STATE4, $STATE4
___
  };

  my $aes_lastround = sub {
    my ($i) = @_;
    return <<___;
  vmovdqu ${\eval($i*16)}($KS), $TMP
  vaesenclast $TMP, $STATE1, $STATE1
  vaesenclast $TMP, $STATE2, $STATE2
  vaesenclast $TMP, $STATE3, $STATE3
  vaesenclast $TMP, $STATE4, $STATE4
___
  };

  # void aes256gcmsiv_enc_msg_x4(unsigned char* PT, unsigned char* CT,
  #                              unsigned char* TAG, unsigned char* KS,
  #                              size_t byte_len);
  # parameter 1: %rdi #PT
  # parameter 2: %rsi #CT
  # parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32]
  # parameter 4: %rcx #KS
  # parameter 5: %r8 #LEN MSG_length in bytes
  $code.=<<___;
.globl aes256gcmsiv_enc_msg_x4
.type aes256gcmsiv_enc_msg_x4,\@function,5
.align 16
aes256gcmsiv_enc_msg_x4:
.cfi_startproc
  test $LEN, $LEN
  jnz .L256_enc_msg_x4_start
  ret

.L256_enc_msg_x4_start:
  movq $LEN, %r10
  shrq \$4, $LEN # LEN = num of blocks
  shlq \$60, %r10
  jz .L256_enc_msg_x4_start2
  addq \$1, $LEN

.L256_enc_msg_x4_start2:
  movq $LEN, %r10
  shlq \$62, %r10
  shrq \$62, %r10

  # make IV from TAG
  vmovdqa ($TAG), $IV
  vpor OR_MASK(%rip), $IV, $IV # IV = [1]TAG[126...32][00..00]

  vmovdqa four(%rip), $ADDER # Register to increment counters
  vmovdqa $IV, $CTR1 # CTR1 = TAG[1][127...32][00..00]
  vpaddd one(%rip), $IV, $CTR2 # CTR2 = TAG[1][127...32][00..01]
  vpaddd two(%rip), $IV, $CTR3 # CTR3 = TAG[1][127...32][00..02]
  vpaddd three(%rip), $IV, $CTR4 # CTR4 = TAG[1][127...32][00..03]

  shrq \$2, $LEN
  je .L256_enc_msg_x4_check_remainder

  subq \$64, $CT
  subq \$64, $PT

.L256_enc_msg_x4_loop1:
  addq \$64, $CT
  addq \$64, $PT

  vmovdqa $CTR1, $STATE1
  vmovdqa $CTR2, $STATE2
  vmovdqa $CTR3, $STATE3
  vmovdqa $CTR4, $STATE4

  vpxor ($KS), $STATE1, $STATE1
  vpxor ($KS), $STATE2, $STATE2
  vpxor ($KS), $STATE3, $STATE3
  vpxor ($KS), $STATE4, $STATE4

  ${\$aes_round->(1)}
  vpaddd $ADDER, $CTR1, $CTR1
  ${\$aes_round->(2)}
  vpaddd $ADDER, $CTR2, $CTR2
  ${\$aes_round->(3)}
  vpaddd $ADDER, $CTR3, $CTR3
  ${\$aes_round->(4)}
  vpaddd $ADDER, $CTR4, $CTR4

  ${\$aes_round->(5)}
  ${\$aes_round->(6)}
  ${\$aes_round->(7)}
  ${\$aes_round->(8)}
  ${\$aes_round->(9)}
  ${\$aes_round->(10)}
  ${\$aes_round->(11)}
  ${\$aes_round->(12)}
  ${\$aes_round->(13)}
  ${\$aes_lastround->(14)}

  # XOR with Plaintext
  vpxor 0*16($PT), $STATE1, $STATE1
  vpxor 1*16($PT), $STATE2, $STATE2
  vpxor 2*16($PT), $STATE3, $STATE3
  vpxor 3*16($PT), $STATE4, $STATE4

  subq \$1, $LEN

  vmovdqu $STATE1, 0*16($CT)
  vmovdqu $STATE2, 1*16($CT)
  vmovdqu $STATE3, 2*16($CT)
  vmovdqu $STATE4, 3*16($CT)

  jne .L256_enc_msg_x4_loop1

  addq \$64, $CT
  addq \$64, $PT

.L256_enc_msg_x4_check_remainder:
  cmpq \$0, %r10
  je .L256_enc_msg_x4_out

.L256_enc_msg_x4_loop2:
  # encrypt each block separately
  # CTR1 is the highest counter (even if no LOOP done)

  vmovdqa $CTR1, $STATE1
  vpaddd one(%rip), $CTR1, $CTR1 # inc counter
  vpxor ($KS), $STATE1, $STATE1
  vaesenc 16($KS), $STATE1, $STATE1
  vaesenc 32($KS), $STATE1, $STATE1
  vaesenc 48($KS), $STATE1, $STATE1
  vaesenc 64($KS), $STATE1, $STATE1
  vaesenc 80($KS), $STATE1, $STATE1
  vaesenc 96($KS), $STATE1, $STATE1
  vaesenc 112($KS), $STATE1, $STATE1
  vaesenc 128($KS), $STATE1, $STATE1
  vaesenc 144($KS), $STATE1, $STATE1
  vaesenc 160($KS), $STATE1, $STATE1
  vaesenc 176($KS), $STATE1, $STATE1
  vaesenc 192($KS), $STATE1, $STATE1
  vaesenc 208($KS), $STATE1, $STATE1
  vaesenclast 224($KS), $STATE1, $STATE1

  # XOR with Plaintext
  vpxor ($PT), $STATE1, $STATE1

  vmovdqu $STATE1, ($CT)

  addq \$16, $PT
  addq \$16, $CT

  subq \$1, %r10
  jne .L256_enc_msg_x4_loop2

.L256_enc_msg_x4_out:
  ret
.cfi_endproc
.size aes256gcmsiv_enc_msg_x4,.-aes256gcmsiv_enc_msg_x4
___
}
aes256gcmsiv_enc_msg_x4();

sub aes256gcmsiv_enc_msg_x8() {
  my $STATE1 = "%xmm1";
  my $STATE2 = "%xmm2";
  my $STATE3 = "%xmm3";
  my $STATE4 = "%xmm4";
  my $STATE5 = "%xmm5";
  my $STATE6 = "%xmm6";
  my $STATE7 = "%xmm7";
  my $STATE8 = "%xmm8";
  my $CTR1 = "%xmm0";
  my $CTR2 = "%xmm9";
  my $CTR3 = "%xmm10";
  my $CTR4 = "%xmm11";
  my $CTR5 = "%xmm12";
  my $CTR6 = "%xmm13";
  my $CTR7 = "%xmm14";
  my $TMP1 = "%xmm1";
  my $TMP2 = "%xmm2";
  my $KS = "%rcx";
  my $LEN = "%r8";
  my $PT = "%rdi";
  my $CT = "%rsi";
  my $TAG = "%rdx";
  my $SCHED = "%xmm15";

  my $aes_round8 = sub {
    my ($i) = @_;
    return <<___;
  vmovdqu ${\eval($i*16)}($KS), $SCHED
  vaesenc $SCHED, $STATE1, $STATE1
  vaesenc $SCHED, $STATE2, $STATE2
  vaesenc $SCHED, $STATE3, $STATE3
  vaesenc $SCHED, $STATE4, $STATE4
  vaesenc $SCHED, $STATE5, $STATE5
  vaesenc $SCHED, $STATE6, $STATE6
  vaesenc $SCHED, $STATE7, $STATE7
  vaesenc $SCHED, $STATE8, $STATE8
___
  };

  my $aes_lastround8 = sub {
    my ($i) = @_;
    return <<___;
  vmovdqu ${\eval($i*16)}($KS), $SCHED
  vaesenclast $SCHED, $STATE1, $STATE1
  vaesenclast $SCHED, $STATE2, $STATE2
  vaesenclast $SCHED, $STATE3, $STATE3
  vaesenclast $SCHED, $STATE4, $STATE4
  vaesenclast $SCHED, $STATE5, $STATE5
  vaesenclast $SCHED, $STATE6, $STATE6
  vaesenclast $SCHED, $STATE7, $STATE7
  vaesenclast $SCHED, $STATE8, $STATE8
___
  };

  # void aes256gcmsiv_enc_msg_x8(unsigned char* PT,
  #                              unsigned char* CT,
  #                              unsigned char* TAG,
  #                              unsigned char* KS,
  #                              size_t byte_len);
  # parameter 1: %rdi #PT
  # parameter 2: %rsi #CT
  # parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32]
  # parameter 4: %rcx #KS
  # parameter 5: %r8 #LEN MSG_length in bytes
  $code.=<<___;
.globl aes256gcmsiv_enc_msg_x8
.type aes256gcmsiv_enc_msg_x8,\@function,5
.align 16
aes256gcmsiv_enc_msg_x8:
.cfi_startproc
  test $LEN, $LEN
  jnz .L256_enc_msg_x8_start
  ret

.L256_enc_msg_x8_start:
  # Set up an aligned scratch slot on the stack
  movq %rsp, %r11
  subq \$16, %r11
  andq \$-64, %r11

  movq $LEN, %r10
  shrq \$4, $LEN # LEN = num of blocks
  shlq \$60, %r10
  jz .L256_enc_msg_x8_start2
  addq \$1, $LEN

.L256_enc_msg_x8_start2:
  movq $LEN, %r10
  shlq \$61, %r10
  shrq \$61, %r10

  # Make IV from TAG
  vmovdqa ($TAG), $TMP1
  vpor OR_MASK(%rip), $TMP1, $TMP1 # TMP1 = IV = [1]TAG[126...32][00..00]

  # store counter8 on the stack
  vpaddd seven(%rip), $TMP1, $CTR1
  vmovdqa $CTR1, (%r11) # CTR8 = TAG[127...32][00..07]
  vpaddd one(%rip), $TMP1, $CTR2 # CTR2 = TAG[127...32][00..01]
  vpaddd two(%rip), $TMP1, $CTR3 # CTR3 = TAG[127...32][00..02]
  vpaddd three(%rip), $TMP1, $CTR4 # CTR4 = TAG[127...32][00..03]
  vpaddd four(%rip), $TMP1, $CTR5 # CTR5 = TAG[127...32][00..04]
  vpaddd five(%rip), $TMP1, $CTR6 # CTR6 = TAG[127...32][00..05]
  vpaddd six(%rip), $TMP1, $CTR7 # CTR7 = TAG[127...32][00..06]
  vmovdqa $TMP1, $CTR1 # CTR1 = TAG[127...32][00..00]

  shrq \$3, $LEN
  jz .L256_enc_msg_x8_check_remainder

  subq \$128, $CT
  subq \$128, $PT

.L256_enc_msg_x8_loop1:
  addq \$128, $CT
  addq \$128, $PT

  vmovdqa $CTR1, $STATE1
  vmovdqa $CTR2, $STATE2
  vmovdqa $CTR3, $STATE3
  vmovdqa $CTR4, $STATE4
  vmovdqa $CTR5, $STATE5
  vmovdqa $CTR6, $STATE6
  vmovdqa $CTR7, $STATE7
  # move from stack
  vmovdqa (%r11), $STATE8

  vpxor ($KS), $STATE1, $STATE1
  vpxor ($KS), $STATE2, $STATE2
  vpxor ($KS), $STATE3, $STATE3
  vpxor ($KS), $STATE4, $STATE4
  vpxor ($KS), $STATE5, $STATE5
  vpxor ($KS), $STATE6, $STATE6
  vpxor ($KS), $STATE7, $STATE7
  vpxor ($KS), $STATE8, $STATE8

  ${\$aes_round8->(1)}
  vmovdqa (%r11), $CTR7 # deal with CTR8
  vpaddd eight(%rip), $CTR7, $CTR7
  vmovdqa $CTR7, (%r11)
  ${\$aes_round8->(2)}
  vpsubd one(%rip), $CTR7, $CTR7
  ${\$aes_round8->(3)}
  vpaddd eight(%rip), $CTR1, $CTR1
  ${\$aes_round8->(4)}
  vpaddd eight(%rip), $CTR2, $CTR2
  ${\$aes_round8->(5)}
  vpaddd eight(%rip), $CTR3, $CTR3
  ${\$aes_round8->(6)}
  vpaddd eight(%rip), $CTR4, $CTR4
  ${\$aes_round8->(7)}
  vpaddd eight(%rip), $CTR5, $CTR5
  ${\$aes_round8->(8)}
  vpaddd eight(%rip), $CTR6, $CTR6
  ${\$aes_round8->(9)}
  ${\$aes_round8->(10)}
  ${\$aes_round8->(11)}
  ${\$aes_round8->(12)}
  ${\$aes_round8->(13)}
  ${\$aes_lastround8->(14)}

  # XOR with Plaintext
  vpxor 0*16($PT), $STATE1, $STATE1
  vpxor 1*16($PT), $STATE2, $STATE2
  vpxor 2*16($PT), $STATE3, $STATE3
  vpxor 3*16($PT), $STATE4, $STATE4
  vpxor 4*16($PT), $STATE5, $STATE5
  vpxor 5*16($PT), $STATE6, $STATE6
  vpxor 6*16($PT), $STATE7, $STATE7
  vpxor 7*16($PT), $STATE8, $STATE8

  subq \$1, $LEN

  vmovdqu $STATE1, 0*16($CT)
  vmovdqu $STATE2, 1*16($CT)
  vmovdqu $STATE3, 2*16($CT)
  vmovdqu $STATE4, 3*16($CT)
  vmovdqu $STATE5, 4*16($CT)
  vmovdqu $STATE6, 5*16($CT)
  vmovdqu $STATE7, 6*16($CT)
  vmovdqu $STATE8, 7*16($CT)

  jne .L256_enc_msg_x8_loop1
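  # All full 8-block chunks are done; %r10 holds the count of leftover blocks (0-7).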

  addq \$128, $CT
  addq \$128, $PT

.L256_enc_msg_x8_check_remainder:
  cmpq \$0, %r10
  je .L256_enc_msg_x8_out

.L256_enc_msg_x8_loop2:
  # encrypt each block separately
  # CTR1 is the highest counter (even if no LOOP done)
  vmovdqa $CTR1, $STATE1
  vpaddd one(%rip), $CTR1, $CTR1

  vpxor ($KS), $STATE1, $STATE1
  vaesenc 16($KS), $STATE1, $STATE1
  vaesenc 32($KS), $STATE1, $STATE1
  vaesenc 48($KS), $STATE1, $STATE1
  vaesenc 64($KS), $STATE1, $STATE1
  vaesenc 80($KS), $STATE1, $STATE1
  vaesenc 96($KS), $STATE1, $STATE1
  vaesenc 112($KS), $STATE1, $STATE1
  vaesenc 128($KS), $STATE1, $STATE1
  vaesenc 144($KS), $STATE1, $STATE1
  vaesenc 160($KS), $STATE1, $STATE1
  vaesenc 176($KS), $STATE1, $STATE1
  vaesenc 192($KS), $STATE1, $STATE1
  vaesenc 208($KS), $STATE1, $STATE1
  vaesenclast 224($KS), $STATE1, $STATE1

  # XOR with Plaintext
  vpxor ($PT), $STATE1, $STATE1

  vmovdqu $STATE1, ($CT)

  addq \$16, $PT
  addq \$16, $CT
  subq \$1, %r10
  jnz .L256_enc_msg_x8_loop2

.L256_enc_msg_x8_out:
  ret

.cfi_endproc
.size aes256gcmsiv_enc_msg_x8,.-aes256gcmsiv_enc_msg_x8
___
}
aes256gcmsiv_enc_msg_x8();
aesgcmsiv_dec(1);

sub aes256gcmsiv_kdf {
  my $ONE = "%xmm8";
  my $BLOCK1 = "%xmm4";
  my $BLOCK2 = "%xmm6";
  my $BLOCK3 = "%xmm7";
  my $BLOCK4 = "%xmm11";
  my $BLOCK5 = "%xmm12";
  my $BLOCK6 = "%xmm13";

  my $enc_roundx6 = sub {
    my ($i, $j) = @_;
    return <<___;
  vmovdqa ${\eval($i*16)}(%rdx), $j
  vaesenc $j, $BLOCK1, $BLOCK1
  vaesenc $j, $BLOCK2, $BLOCK2
  vaesenc $j, $BLOCK3, $BLOCK3
  vaesenc $j, $BLOCK4, $BLOCK4
  vaesenc $j, $BLOCK5, $BLOCK5
  vaesenc $j, $BLOCK6, $BLOCK6
___
  };

  my $enc_roundlastx6 = sub {
    my ($i, $j) = @_;
    return <<___;
  vmovdqa ${\eval($i*16)}(%rdx), $j
  vaesenclast $j, $BLOCK1, $BLOCK1
  vaesenclast $j, $BLOCK2, $BLOCK2
  vaesenclast $j, $BLOCK3, $BLOCK3
  vaesenclast $j, $BLOCK4, $BLOCK4
  vaesenclast $j, $BLOCK5, $BLOCK5
  vaesenclast $j, $BLOCK6, $BLOCK6
___
  };

  # void aes256gcmsiv_kdf(const uint8_t nonce[16],
  #                       uint8_t *out_key_material,
  #                       const uint8_t *key_schedule);
  $code.=<<___;
.globl aes256gcmsiv_kdf
.type aes256gcmsiv_kdf,\@function,3
.align 16
aes256gcmsiv_kdf:
.cfi_startproc
# parameter 1: %rdi Pointer to NONCE
# parameter 2: %rsi Pointer to CT
# parameter 3: %rdx Pointer to keys

  vmovdqa (%rdx), %xmm1 # xmm1 = first 16 bytes of random key
  vmovdqa 0*16(%rdi), $BLOCK1
  vmovdqa and_mask(%rip), $BLOCK4
  vmovdqa one(%rip), $ONE
  vpshufd \$0x90, $BLOCK1, $BLOCK1
  vpand $BLOCK4, $BLOCK1, $BLOCK1
  vpaddd $ONE, $BLOCK1, $BLOCK2
  vpaddd $ONE, $BLOCK2, $BLOCK3
  vpaddd $ONE, $BLOCK3, $BLOCK4
  vpaddd $ONE, $BLOCK4, $BLOCK5
  vpaddd $ONE, $BLOCK5, $BLOCK6

  vpxor %xmm1, $BLOCK1, $BLOCK1
  vpxor %xmm1, $BLOCK2, $BLOCK2
  vpxor %xmm1, $BLOCK3, $BLOCK3
  vpxor %xmm1, $BLOCK4, $BLOCK4
  vpxor %xmm1, $BLOCK5, $BLOCK5
  vpxor %xmm1, $BLOCK6, $BLOCK6

  ${\$enc_roundx6->(1, "%xmm1")}
  ${\$enc_roundx6->(2, "%xmm2")}
  ${\$enc_roundx6->(3, "%xmm1")}
  ${\$enc_roundx6->(4, "%xmm2")}
  ${\$enc_roundx6->(5, "%xmm1")}
  ${\$enc_roundx6->(6, "%xmm2")}
  ${\$enc_roundx6->(7, "%xmm1")}
  ${\$enc_roundx6->(8, "%xmm2")}
  ${\$enc_roundx6->(9, "%xmm1")}
  ${\$enc_roundx6->(10, "%xmm2")}
  ${\$enc_roundx6->(11, "%xmm1")}
  ${\$enc_roundx6->(12, "%xmm2")}
  ${\$enc_roundx6->(13, "%xmm1")}
  ${\$enc_roundlastx6->(14, "%xmm2")}

  vmovdqa $BLOCK1, 0*16(%rsi)
  vmovdqa $BLOCK2, 1*16(%rsi)
  vmovdqa $BLOCK3, 2*16(%rsi)
  vmovdqa $BLOCK4, 3*16(%rsi)
  vmovdqa $BLOCK5, 4*16(%rsi)
  vmovdqa $BLOCK6, 5*16(%rsi)
  ret
.cfi_endproc
.size aes256gcmsiv_kdf, .-aes256gcmsiv_kdf
___
}
aes256gcmsiv_kdf();

print $code;

close STDOUT;