1#!/usr/bin/env perl 2 3################################################################### 4### AES-128 [originally in CTR mode] ### 5### bitsliced implementation for Intel Core 2 processors ### 6### requires support of SSE extensions up to SSSE3 ### 7### Author: Emilia Käsper and Peter Schwabe ### 8### Date: 2009-03-19 ### 9### Public domain ### 10### ### 11### See http://homes.esat.kuleuven.be/~ekasper/#software for ### 12### further information. ### 13################################################################### 14# 15# September 2011. 16# 17# Started as transliteration to "perlasm" the original code has 18# undergone following changes: 19# 20# - code was made position-independent; 21# - rounds were folded into a loop resulting in >5x size reduction 22# from 12.5KB to 2.2KB; 23# - above was possibile thanks to mixcolumns() modification that 24# allowed to feed its output back to aesenc[last], this was 25# achieved at cost of two additional inter-registers moves; 26# - some instruction reordering and interleaving; 27# - this module doesn't implement key setup subroutine, instead it 28# relies on conversion of "conventional" key schedule as returned 29# by AES_set_encrypt_key (see discussion below); 30# - first and last round keys are treated differently, which allowed 31# to skip one shiftrows(), reduce bit-sliced key schedule and 32# speed-up conversion by 22%; 33# - support for 192- and 256-bit keys was added; 34# 35# Resulting performance in CPU cycles spent to encrypt one byte out 36# of 4096-byte buffer with 128-bit key is: 37# 38# Emilia's this(*) difference 39# 40# Core 2 9.30 8.69 +7% 41# Nehalem(**) 7.63 6.88 +11% 42# Atom 17.1 16.4 +4% 43# Silvermont - 12.9 44# Goldmont - 8.85 45# 46# (*) Comparison is not completely fair, because "this" is ECB, 47# i.e. no extra processing such as counter values calculation 48# and xor-ing input as in Emilia's CTR implementation is 49# performed. However, the CTR calculations stand for not more 50# than 1% of total time, so comparison is *rather* fair. 51# 52# (**) Results were collected on Westmere, which is considered to 53# be equivalent to Nehalem for this code. 54# 55# As for key schedule conversion subroutine. Interface to OpenSSL 56# relies on per-invocation on-the-fly conversion. This naturally 57# has impact on performance, especially for short inputs. Conversion 58# time in CPU cycles and its ratio to CPU cycles spent in 8x block 59# function is: 60# 61# conversion conversion/8x block 62# Core 2 240 0.22 63# Nehalem 180 0.20 64# Atom 430 0.20 65# 66# The ratio values mean that 128-byte blocks will be processed 67# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%, 68# etc. Then keep in mind that input sizes not divisible by 128 are 69# *effectively* slower, especially shortest ones, e.g. consecutive 70# 144-byte blocks are processed 44% slower than one would expect, 71# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings" 72# it's still faster than ["hyper-threading-safe" code path in] 73# aes-x86_64.pl on all lengths above 64 bytes... 74# 75# October 2011. 76# 77# Add decryption procedure. Performance in CPU cycles spent to decrypt 78# one byte out of 4096-byte buffer with 128-bit key is: 79# 80# Core 2 9.98 81# Nehalem 7.80 82# Atom 17.9 83# Silvermont 14.0 84# Goldmont 10.2 85# 86# November 2011. 87# 88# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is 89# suboptimal, but XTS is meant to be used with larger blocks... 90# 91# <appro@openssl.org> 92 93$flavour = shift; 94$output = shift; 95if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 96 97$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 98 99$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 100( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 101( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 102die "can't locate x86_64-xlate.pl"; 103 104open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; 105*STDOUT=*OUT; 106 107my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx"); 108my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15) 109my $ecb=0; # suppress unreferenced ECB subroutines, spare some space... 110 111{ 112my ($key,$rounds,$const)=("%rax","%r10d","%r11"); 113 114sub Sbox { 115# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 116# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb 117my @b=@_[0..7]; 118my @t=@_[8..11]; 119my @s=@_[12..15]; 120 &InBasisChange (@b); 121 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s); 122 &OutBasisChange (@b[7,1,4,2,6,5,0,3]); 123} 124 125sub InBasisChange { 126# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 127# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb 128my @b=@_[0..7]; 129$code.=<<___; 130 pxor @b[6], @b[5] 131 pxor @b[1], @b[2] 132 pxor @b[0], @b[3] 133 pxor @b[2], @b[6] 134 pxor @b[0], @b[5] 135 136 pxor @b[3], @b[6] 137 pxor @b[7], @b[3] 138 pxor @b[5], @b[7] 139 pxor @b[4], @b[3] 140 pxor @b[5], @b[4] 141 pxor @b[1], @b[3] 142 143 pxor @b[7], @b[2] 144 pxor @b[5], @b[1] 145___ 146} 147 148sub OutBasisChange { 149# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 150# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb 151my @b=@_[0..7]; 152$code.=<<___; 153 pxor @b[6], @b[0] 154 pxor @b[4], @b[1] 155 pxor @b[0], @b[2] 156 pxor @b[6], @b[4] 157 pxor @b[1], @b[6] 158 159 pxor @b[5], @b[1] 160 pxor @b[3], @b[5] 161 pxor @b[7], @b[3] 162 pxor @b[5], @b[7] 163 pxor @b[5], @b[2] 164 165 pxor @b[7], @b[4] 166___ 167} 168 169sub InvSbox { 170# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 171# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb 172my @b=@_[0..7]; 173my @t=@_[8..11]; 174my @s=@_[12..15]; 175 &InvInBasisChange (@b); 176 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s); 177 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]); 178} 179 180sub InvInBasisChange { # OutBasisChange in reverse 181my @b=@_[5,1,2,6,3,7,0,4]; 182$code.=<<___ 183 pxor @b[7], @b[4] 184 185 pxor @b[5], @b[7] 186 pxor @b[5], @b[2] 187 pxor @b[7], @b[3] 188 pxor @b[3], @b[5] 189 pxor @b[5], @b[1] 190 191 pxor @b[1], @b[6] 192 pxor @b[0], @b[2] 193 pxor @b[6], @b[4] 194 pxor @b[6], @b[0] 195 pxor @b[4], @b[1] 196___ 197} 198 199sub InvOutBasisChange { # InBasisChange in reverse 200my @b=@_[2,5,7,3,6,1,0,4]; 201$code.=<<___; 202 pxor @b[5], @b[1] 203 pxor @b[7], @b[2] 204 205 pxor @b[1], @b[3] 206 pxor @b[5], @b[4] 207 pxor @b[5], @b[7] 208 pxor @b[4], @b[3] 209 pxor @b[0], @b[5] 210 pxor @b[7], @b[3] 211 pxor @b[2], @b[6] 212 pxor @b[1], @b[2] 213 pxor @b[3], @b[6] 214 215 pxor @b[0], @b[3] 216 pxor @b[6], @b[5] 217___ 218} 219 220sub Mul_GF4 { 221#;************************************************************* 222#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) * 223#;************************************************************* 224my ($x0,$x1,$y0,$y1,$t0)=@_; 225$code.=<<___; 226 movdqa $y0, $t0 227 pxor $y1, $t0 228 pand $x0, $t0 229 pxor $x1, $x0 230 pand $y0, $x1 231 pand $y1, $x0 232 pxor $x1, $x0 233 pxor $t0, $x1 234___ 235} 236 237sub Mul_GF4_N { # not used, see next subroutine 238# multiply and scale by N 239my ($x0,$x1,$y0,$y1,$t0)=@_; 240$code.=<<___; 241 movdqa $y0, $t0 242 pxor $y1, $t0 243 pand $x0, $t0 244 pxor $x1, $x0 245 pand $y0, $x1 246 pand $y1, $x0 247 pxor $x0, $x1 248 pxor $t0, $x0 249___ 250} 251 252sub Mul_GF4_N_GF4 { 253# interleaved Mul_GF4_N and Mul_GF4 254my ($x0,$x1,$y0,$y1,$t0, 255 $x2,$x3,$y2,$y3,$t1)=@_; 256$code.=<<___; 257 movdqa $y0, $t0 258 movdqa $y2, $t1 259 pxor $y1, $t0 260 pxor $y3, $t1 261 pand $x0, $t0 262 pand $x2, $t1 263 pxor $x1, $x0 264 pxor $x3, $x2 265 pand $y0, $x1 266 pand $y2, $x3 267 pand $y1, $x0 268 pand $y3, $x2 269 pxor $x0, $x1 270 pxor $x3, $x2 271 pxor $t0, $x0 272 pxor $t1, $x3 273___ 274} 275sub Mul_GF16_2 { 276my @x=@_[0..7]; 277my @y=@_[8..11]; 278my @t=@_[12..15]; 279$code.=<<___; 280 movdqa @x[0], @t[0] 281 movdqa @x[1], @t[1] 282___ 283 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]); 284$code.=<<___; 285 pxor @x[2], @t[0] 286 pxor @x[3], @t[1] 287 pxor @y[2], @y[0] 288 pxor @y[3], @y[1] 289___ 290 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], 291 @x[2], @x[3], @y[2], @y[3], @t[2]); 292$code.=<<___; 293 pxor @t[0], @x[0] 294 pxor @t[0], @x[2] 295 pxor @t[1], @x[1] 296 pxor @t[1], @x[3] 297 298 movdqa @x[4], @t[0] 299 movdqa @x[5], @t[1] 300 pxor @x[6], @t[0] 301 pxor @x[7], @t[1] 302___ 303 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], 304 @x[6], @x[7], @y[2], @y[3], @t[2]); 305$code.=<<___; 306 pxor @y[2], @y[0] 307 pxor @y[3], @y[1] 308___ 309 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]); 310$code.=<<___; 311 pxor @t[0], @x[4] 312 pxor @t[0], @x[6] 313 pxor @t[1], @x[5] 314 pxor @t[1], @x[7] 315___ 316} 317sub Inv_GF256 { 318#;******************************************************************** 319#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) * 320#;******************************************************************** 321my @x=@_[0..7]; 322my @t=@_[8..11]; 323my @s=@_[12..15]; 324# direct optimizations from hardware 325$code.=<<___; 326 movdqa @x[4], @t[3] 327 movdqa @x[5], @t[2] 328 movdqa @x[1], @t[1] 329 movdqa @x[7], @s[1] 330 movdqa @x[0], @s[0] 331 332 pxor @x[6], @t[3] 333 pxor @x[7], @t[2] 334 pxor @x[3], @t[1] 335 movdqa @t[3], @s[2] 336 pxor @x[6], @s[1] 337 movdqa @t[2], @t[0] 338 pxor @x[2], @s[0] 339 movdqa @t[3], @s[3] 340 341 por @t[1], @t[2] 342 por @s[0], @t[3] 343 pxor @t[0], @s[3] 344 pand @s[0], @s[2] 345 pxor @t[1], @s[0] 346 pand @t[1], @t[0] 347 pand @s[0], @s[3] 348 movdqa @x[3], @s[0] 349 pxor @x[2], @s[0] 350 pand @s[0], @s[1] 351 pxor @s[1], @t[3] 352 pxor @s[1], @t[2] 353 movdqa @x[4], @s[1] 354 movdqa @x[1], @s[0] 355 pxor @x[5], @s[1] 356 pxor @x[0], @s[0] 357 movdqa @s[1], @t[1] 358 pand @s[0], @s[1] 359 por @s[0], @t[1] 360 pxor @s[1], @t[0] 361 pxor @s[3], @t[3] 362 pxor @s[2], @t[2] 363 pxor @s[3], @t[1] 364 movdqa @x[7], @s[0] 365 pxor @s[2], @t[0] 366 movdqa @x[6], @s[1] 367 pxor @s[2], @t[1] 368 movdqa @x[5], @s[2] 369 pand @x[3], @s[0] 370 movdqa @x[4], @s[3] 371 pand @x[2], @s[1] 372 pand @x[1], @s[2] 373 por @x[0], @s[3] 374 pxor @s[0], @t[3] 375 pxor @s[1], @t[2] 376 pxor @s[2], @t[1] 377 pxor @s[3], @t[0] 378 379 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3 380 381 # new smaller inversion 382 383 movdqa @t[3], @s[0] 384 pand @t[1], @t[3] 385 pxor @t[2], @s[0] 386 387 movdqa @t[0], @s[2] 388 movdqa @s[0], @s[3] 389 pxor @t[3], @s[2] 390 pand @s[2], @s[3] 391 392 movdqa @t[1], @s[1] 393 pxor @t[2], @s[3] 394 pxor @t[0], @s[1] 395 396 pxor @t[2], @t[3] 397 398 pand @t[3], @s[1] 399 400 movdqa @s[2], @t[2] 401 pxor @t[0], @s[1] 402 403 pxor @s[1], @t[2] 404 pxor @s[1], @t[1] 405 406 pand @t[0], @t[2] 407 408 pxor @t[2], @s[2] 409 pxor @t[2], @t[1] 410 411 pand @s[3], @s[2] 412 413 pxor @s[0], @s[2] 414___ 415# output in s3, s2, s1, t1 416 417# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3 418 419# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 420 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]); 421 422### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb 423} 424 425# AES linear components 426 427sub ShiftRows { 428my @x=@_[0..7]; 429my $mask=pop; 430$code.=<<___; 431 pxor 0x00($key),@x[0] 432 pxor 0x10($key),@x[1] 433 pxor 0x20($key),@x[2] 434 pxor 0x30($key),@x[3] 435 pshufb $mask,@x[0] 436 pshufb $mask,@x[1] 437 pxor 0x40($key),@x[4] 438 pxor 0x50($key),@x[5] 439 pshufb $mask,@x[2] 440 pshufb $mask,@x[3] 441 pxor 0x60($key),@x[6] 442 pxor 0x70($key),@x[7] 443 pshufb $mask,@x[4] 444 pshufb $mask,@x[5] 445 pshufb $mask,@x[6] 446 pshufb $mask,@x[7] 447 lea 0x80($key),$key 448___ 449} 450 451sub MixColumns { 452# modified to emit output in order suitable for feeding back to aesenc[last] 453my @x=@_[0..7]; 454my @t=@_[8..15]; 455my $inv=@_[16]; # optional 456$code.=<<___; 457 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32 458 pshufd \$0x93, @x[1], @t[1] 459 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32) 460 pshufd \$0x93, @x[2], @t[2] 461 pxor @t[1], @x[1] 462 pshufd \$0x93, @x[3], @t[3] 463 pxor @t[2], @x[2] 464 pshufd \$0x93, @x[4], @t[4] 465 pxor @t[3], @x[3] 466 pshufd \$0x93, @x[5], @t[5] 467 pxor @t[4], @x[4] 468 pshufd \$0x93, @x[6], @t[6] 469 pxor @t[5], @x[5] 470 pshufd \$0x93, @x[7], @t[7] 471 pxor @t[6], @x[6] 472 pxor @t[7], @x[7] 473 474 pxor @x[0], @t[1] 475 pxor @x[7], @t[0] 476 pxor @x[7], @t[1] 477 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64) 478 pxor @x[1], @t[2] 479 pshufd \$0x4E, @x[1], @x[1] 480 pxor @x[4], @t[5] 481 pxor @t[0], @x[0] 482 pxor @x[5], @t[6] 483 pxor @t[1], @x[1] 484 pxor @x[3], @t[4] 485 pshufd \$0x4E, @x[4], @t[0] 486 pxor @x[6], @t[7] 487 pshufd \$0x4E, @x[5], @t[1] 488 pxor @x[2], @t[3] 489 pshufd \$0x4E, @x[3], @x[4] 490 pxor @x[7], @t[3] 491 pshufd \$0x4E, @x[7], @x[5] 492 pxor @x[7], @t[4] 493 pshufd \$0x4E, @x[6], @x[3] 494 pxor @t[4], @t[0] 495 pshufd \$0x4E, @x[2], @x[6] 496 pxor @t[5], @t[1] 497___ 498$code.=<<___ if (!$inv); 499 pxor @t[3], @x[4] 500 pxor @t[7], @x[5] 501 pxor @t[6], @x[3] 502 movdqa @t[0], @x[2] 503 pxor @t[2], @x[6] 504 movdqa @t[1], @x[7] 505___ 506$code.=<<___ if ($inv); 507 pxor @x[4], @t[3] 508 pxor @t[7], @x[5] 509 pxor @x[3], @t[6] 510 movdqa @t[0], @x[3] 511 pxor @t[2], @x[6] 512 movdqa @t[6], @x[2] 513 movdqa @t[1], @x[7] 514 movdqa @x[6], @x[4] 515 movdqa @t[3], @x[6] 516___ 517} 518 519sub InvMixColumns_orig { 520my @x=@_[0..7]; 521my @t=@_[8..15]; 522 523$code.=<<___; 524 # multiplication by 0x0e 525 pshufd \$0x93, @x[7], @t[7] 526 movdqa @x[2], @t[2] 527 pxor @x[5], @x[7] # 7 5 528 pxor @x[5], @x[2] # 2 5 529 pshufd \$0x93, @x[0], @t[0] 530 movdqa @x[5], @t[5] 531 pxor @x[0], @x[5] # 5 0 [1] 532 pxor @x[1], @x[0] # 0 1 533 pshufd \$0x93, @x[1], @t[1] 534 pxor @x[2], @x[1] # 1 25 535 pxor @x[6], @x[0] # 01 6 [2] 536 pxor @x[3], @x[1] # 125 3 [4] 537 pshufd \$0x93, @x[3], @t[3] 538 pxor @x[0], @x[2] # 25 016 [3] 539 pxor @x[7], @x[3] # 3 75 540 pxor @x[6], @x[7] # 75 6 [0] 541 pshufd \$0x93, @x[6], @t[6] 542 movdqa @x[4], @t[4] 543 pxor @x[4], @x[6] # 6 4 544 pxor @x[3], @x[4] # 4 375 [6] 545 pxor @x[7], @x[3] # 375 756=36 546 pxor @t[5], @x[6] # 64 5 [7] 547 pxor @t[2], @x[3] # 36 2 548 pxor @t[4], @x[3] # 362 4 [5] 549 pshufd \$0x93, @t[5], @t[5] 550___ 551 my @y = @x[7,5,0,2,1,3,4,6]; 552$code.=<<___; 553 # multiplication by 0x0b 554 pxor @y[0], @y[1] 555 pxor @t[0], @y[0] 556 pxor @t[1], @y[1] 557 pshufd \$0x93, @t[2], @t[2] 558 pxor @t[5], @y[0] 559 pxor @t[6], @y[1] 560 pxor @t[7], @y[0] 561 pshufd \$0x93, @t[4], @t[4] 562 pxor @t[6], @t[7] # clobber t[7] 563 pxor @y[0], @y[1] 564 565 pxor @t[0], @y[3] 566 pshufd \$0x93, @t[0], @t[0] 567 pxor @t[1], @y[2] 568 pxor @t[1], @y[4] 569 pxor @t[2], @y[2] 570 pshufd \$0x93, @t[1], @t[1] 571 pxor @t[2], @y[3] 572 pxor @t[2], @y[5] 573 pxor @t[7], @y[2] 574 pshufd \$0x93, @t[2], @t[2] 575 pxor @t[3], @y[3] 576 pxor @t[3], @y[6] 577 pxor @t[3], @y[4] 578 pshufd \$0x93, @t[3], @t[3] 579 pxor @t[4], @y[7] 580 pxor @t[4], @y[5] 581 pxor @t[7], @y[7] 582 pxor @t[5], @y[3] 583 pxor @t[4], @y[4] 584 pxor @t[5], @t[7] # clobber t[7] even more 585 586 pxor @t[7], @y[5] 587 pshufd \$0x93, @t[4], @t[4] 588 pxor @t[7], @y[6] 589 pxor @t[7], @y[4] 590 591 pxor @t[5], @t[7] 592 pshufd \$0x93, @t[5], @t[5] 593 pxor @t[6], @t[7] # restore t[7] 594 595 # multiplication by 0x0d 596 pxor @y[7], @y[4] 597 pxor @t[4], @y[7] 598 pshufd \$0x93, @t[6], @t[6] 599 pxor @t[0], @y[2] 600 pxor @t[5], @y[7] 601 pxor @t[2], @y[2] 602 pshufd \$0x93, @t[7], @t[7] 603 604 pxor @y[1], @y[3] 605 pxor @t[1], @y[1] 606 pxor @t[0], @y[0] 607 pxor @t[0], @y[3] 608 pxor @t[5], @y[1] 609 pxor @t[5], @y[0] 610 pxor @t[7], @y[1] 611 pshufd \$0x93, @t[0], @t[0] 612 pxor @t[6], @y[0] 613 pxor @y[1], @y[3] 614 pxor @t[1], @y[4] 615 pshufd \$0x93, @t[1], @t[1] 616 617 pxor @t[7], @y[7] 618 pxor @t[2], @y[4] 619 pxor @t[2], @y[5] 620 pshufd \$0x93, @t[2], @t[2] 621 pxor @t[6], @y[2] 622 pxor @t[3], @t[6] # clobber t[6] 623 pxor @y[7], @y[4] 624 pxor @t[6], @y[3] 625 626 pxor @t[6], @y[6] 627 pxor @t[5], @y[5] 628 pxor @t[4], @y[6] 629 pshufd \$0x93, @t[4], @t[4] 630 pxor @t[6], @y[5] 631 pxor @t[7], @y[6] 632 pxor @t[3], @t[6] # restore t[6] 633 634 pshufd \$0x93, @t[5], @t[5] 635 pshufd \$0x93, @t[6], @t[6] 636 pshufd \$0x93, @t[7], @t[7] 637 pshufd \$0x93, @t[3], @t[3] 638 639 # multiplication by 0x09 640 pxor @y[1], @y[4] 641 pxor @y[1], @t[1] # t[1]=y[1] 642 pxor @t[5], @t[0] # clobber t[0] 643 pxor @t[5], @t[1] 644 pxor @t[0], @y[3] 645 pxor @y[0], @t[0] # t[0]=y[0] 646 pxor @t[6], @t[1] 647 pxor @t[7], @t[6] # clobber t[6] 648 pxor @t[1], @y[4] 649 pxor @t[4], @y[7] 650 pxor @y[4], @t[4] # t[4]=y[4] 651 pxor @t[3], @y[6] 652 pxor @y[3], @t[3] # t[3]=y[3] 653 pxor @t[2], @y[5] 654 pxor @y[2], @t[2] # t[2]=y[2] 655 pxor @t[7], @t[3] 656 pxor @y[5], @t[5] # t[5]=y[5] 657 pxor @t[6], @t[2] 658 pxor @t[6], @t[5] 659 pxor @y[6], @t[6] # t[6]=y[6] 660 pxor @y[7], @t[7] # t[7]=y[7] 661 662 movdqa @t[0],@XMM[0] 663 movdqa @t[1],@XMM[1] 664 movdqa @t[2],@XMM[2] 665 movdqa @t[3],@XMM[3] 666 movdqa @t[4],@XMM[4] 667 movdqa @t[5],@XMM[5] 668 movdqa @t[6],@XMM[6] 669 movdqa @t[7],@XMM[7] 670___ 671} 672 673sub InvMixColumns { 674my @x=@_[0..7]; 675my @t=@_[8..15]; 676 677# Thanks to Jussi Kivilinna for providing pointer to 678# 679# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 | 680# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 | 681# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 | 682# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 | 683 684$code.=<<___; 685 # multiplication by 0x05-0x00-0x04-0x00 686 pshufd \$0x4E, @x[0], @t[0] 687 pshufd \$0x4E, @x[6], @t[6] 688 pxor @x[0], @t[0] 689 pshufd \$0x4E, @x[7], @t[7] 690 pxor @x[6], @t[6] 691 pshufd \$0x4E, @x[1], @t[1] 692 pxor @x[7], @t[7] 693 pshufd \$0x4E, @x[2], @t[2] 694 pxor @x[1], @t[1] 695 pshufd \$0x4E, @x[3], @t[3] 696 pxor @x[2], @t[2] 697 pxor @t[6], @x[0] 698 pxor @t[6], @x[1] 699 pshufd \$0x4E, @x[4], @t[4] 700 pxor @x[3], @t[3] 701 pxor @t[0], @x[2] 702 pxor @t[1], @x[3] 703 pshufd \$0x4E, @x[5], @t[5] 704 pxor @x[4], @t[4] 705 pxor @t[7], @x[1] 706 pxor @t[2], @x[4] 707 pxor @x[5], @t[5] 708 709 pxor @t[7], @x[2] 710 pxor @t[6], @x[3] 711 pxor @t[6], @x[4] 712 pxor @t[3], @x[5] 713 pxor @t[4], @x[6] 714 pxor @t[7], @x[4] 715 pxor @t[7], @x[5] 716 pxor @t[5], @x[7] 717___ 718 &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6 719} 720 721sub aesenc { # not used 722my @b=@_[0..7]; 723my @t=@_[8..15]; 724$code.=<<___; 725 movdqa 0x30($const),@t[0] # .LSR 726___ 727 &ShiftRows (@b,@t[0]); 728 &Sbox (@b,@t); 729 &MixColumns (@b[0,1,4,6,3,7,2,5],@t); 730} 731 732sub aesenclast { # not used 733my @b=@_[0..7]; 734my @t=@_[8..15]; 735$code.=<<___; 736 movdqa 0x40($const),@t[0] # .LSRM0 737___ 738 &ShiftRows (@b,@t[0]); 739 &Sbox (@b,@t); 740$code.=<<___ 741 pxor 0x00($key),@b[0] 742 pxor 0x10($key),@b[1] 743 pxor 0x20($key),@b[4] 744 pxor 0x30($key),@b[6] 745 pxor 0x40($key),@b[3] 746 pxor 0x50($key),@b[7] 747 pxor 0x60($key),@b[2] 748 pxor 0x70($key),@b[5] 749___ 750} 751 752sub swapmove { 753my ($a,$b,$n,$mask,$t)=@_; 754$code.=<<___; 755 movdqa $b,$t 756 psrlq \$$n,$b 757 pxor $a,$b 758 pand $mask,$b 759 pxor $b,$a 760 psllq \$$n,$b 761 pxor $t,$b 762___ 763} 764sub swapmove2x { 765my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_; 766$code.=<<___; 767 movdqa $b0,$t0 768 psrlq \$$n,$b0 769 movdqa $b1,$t1 770 psrlq \$$n,$b1 771 pxor $a0,$b0 772 pxor $a1,$b1 773 pand $mask,$b0 774 pand $mask,$b1 775 pxor $b0,$a0 776 psllq \$$n,$b0 777 pxor $b1,$a1 778 psllq \$$n,$b1 779 pxor $t0,$b0 780 pxor $t1,$b1 781___ 782} 783 784sub bitslice { 785my @x=reverse(@_[0..7]); 786my ($t0,$t1,$t2,$t3)=@_[8..11]; 787$code.=<<___; 788 movdqa 0x00($const),$t0 # .LBS0 789 movdqa 0x10($const),$t1 # .LBS1 790___ 791 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3); 792 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); 793$code.=<<___; 794 movdqa 0x20($const),$t0 # .LBS2 795___ 796 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3); 797 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); 798 799 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3); 800 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3); 801} 802 803$code.=<<___; 804.text 805 806.extern asm_AES_encrypt 807.extern asm_AES_decrypt 808 809.type _bsaes_encrypt8,\@abi-omnipotent 810.align 64 811_bsaes_encrypt8: 812 lea .LBS0(%rip), $const # constants table 813 814 movdqa ($key), @XMM[9] # round 0 key 815 lea 0x10($key), $key 816 movdqa 0x50($const), @XMM[8] # .LM0SR 817 pxor @XMM[9], @XMM[0] # xor with round0 key 818 pxor @XMM[9], @XMM[1] 819 pxor @XMM[9], @XMM[2] 820 pxor @XMM[9], @XMM[3] 821 pshufb @XMM[8], @XMM[0] 822 pshufb @XMM[8], @XMM[1] 823 pxor @XMM[9], @XMM[4] 824 pxor @XMM[9], @XMM[5] 825 pshufb @XMM[8], @XMM[2] 826 pshufb @XMM[8], @XMM[3] 827 pxor @XMM[9], @XMM[6] 828 pxor @XMM[9], @XMM[7] 829 pshufb @XMM[8], @XMM[4] 830 pshufb @XMM[8], @XMM[5] 831 pshufb @XMM[8], @XMM[6] 832 pshufb @XMM[8], @XMM[7] 833_bsaes_encrypt8_bitslice: 834___ 835 &bitslice (@XMM[0..7, 8..11]); 836$code.=<<___; 837 dec $rounds 838 jmp .Lenc_sbox 839.align 16 840.Lenc_loop: 841___ 842 &ShiftRows (@XMM[0..7, 8]); 843$code.=".Lenc_sbox:\n"; 844 &Sbox (@XMM[0..7, 8..15]); 845$code.=<<___; 846 dec $rounds 847 jl .Lenc_done 848___ 849 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]); 850$code.=<<___; 851 movdqa 0x30($const), @XMM[8] # .LSR 852 jnz .Lenc_loop 853 movdqa 0x40($const), @XMM[8] # .LSRM0 854 jmp .Lenc_loop 855.align 16 856.Lenc_done: 857___ 858 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb 859 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]); 860$code.=<<___; 861 movdqa ($key), @XMM[8] # last round key 862 pxor @XMM[8], @XMM[4] 863 pxor @XMM[8], @XMM[6] 864 pxor @XMM[8], @XMM[3] 865 pxor @XMM[8], @XMM[7] 866 pxor @XMM[8], @XMM[2] 867 pxor @XMM[8], @XMM[5] 868 pxor @XMM[8], @XMM[0] 869 pxor @XMM[8], @XMM[1] 870 ret 871.size _bsaes_encrypt8,.-_bsaes_encrypt8 872 873.type _bsaes_decrypt8,\@abi-omnipotent 874.align 64 875_bsaes_decrypt8: 876 lea .LBS0(%rip), $const # constants table 877 878 movdqa ($key), @XMM[9] # round 0 key 879 lea 0x10($key), $key 880 movdqa -0x30($const), @XMM[8] # .LM0ISR 881 pxor @XMM[9], @XMM[0] # xor with round0 key 882 pxor @XMM[9], @XMM[1] 883 pxor @XMM[9], @XMM[2] 884 pxor @XMM[9], @XMM[3] 885 pshufb @XMM[8], @XMM[0] 886 pshufb @XMM[8], @XMM[1] 887 pxor @XMM[9], @XMM[4] 888 pxor @XMM[9], @XMM[5] 889 pshufb @XMM[8], @XMM[2] 890 pshufb @XMM[8], @XMM[3] 891 pxor @XMM[9], @XMM[6] 892 pxor @XMM[9], @XMM[7] 893 pshufb @XMM[8], @XMM[4] 894 pshufb @XMM[8], @XMM[5] 895 pshufb @XMM[8], @XMM[6] 896 pshufb @XMM[8], @XMM[7] 897___ 898 &bitslice (@XMM[0..7, 8..11]); 899$code.=<<___; 900 dec $rounds 901 jmp .Ldec_sbox 902.align 16 903.Ldec_loop: 904___ 905 &ShiftRows (@XMM[0..7, 8]); 906$code.=".Ldec_sbox:\n"; 907 &InvSbox (@XMM[0..7, 8..15]); 908$code.=<<___; 909 dec $rounds 910 jl .Ldec_done 911___ 912 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]); 913$code.=<<___; 914 movdqa -0x10($const), @XMM[8] # .LISR 915 jnz .Ldec_loop 916 movdqa -0x20($const), @XMM[8] # .LISRM0 917 jmp .Ldec_loop 918.align 16 919.Ldec_done: 920___ 921 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]); 922$code.=<<___; 923 movdqa ($key), @XMM[8] # last round key 924 pxor @XMM[8], @XMM[6] 925 pxor @XMM[8], @XMM[4] 926 pxor @XMM[8], @XMM[2] 927 pxor @XMM[8], @XMM[7] 928 pxor @XMM[8], @XMM[3] 929 pxor @XMM[8], @XMM[5] 930 pxor @XMM[8], @XMM[0] 931 pxor @XMM[8], @XMM[1] 932 ret 933.size _bsaes_decrypt8,.-_bsaes_decrypt8 934___ 935} 936{ 937my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11"); 938 939sub bitslice_key { 940my @x=reverse(@_[0..7]); 941my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12]; 942 943 &swapmove (@x[0,1],1,$bs0,$t2,$t3); 944$code.=<<___; 945 #&swapmove(@x[2,3],1,$t0,$t2,$t3); 946 movdqa @x[0], @x[2] 947 movdqa @x[1], @x[3] 948___ 949 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); 950 951 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3); 952$code.=<<___; 953 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); 954 movdqa @x[0], @x[4] 955 movdqa @x[2], @x[6] 956 movdqa @x[1], @x[5] 957 movdqa @x[3], @x[7] 958___ 959 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3); 960 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3); 961} 962 963$code.=<<___; 964.type _bsaes_key_convert,\@abi-omnipotent 965.align 16 966_bsaes_key_convert: 967 lea .Lmasks(%rip), $const 968 movdqu ($inp), %xmm7 # load round 0 key 969 lea 0x10($inp), $inp 970 movdqa 0x00($const), %xmm0 # 0x01... 971 movdqa 0x10($const), %xmm1 # 0x02... 972 movdqa 0x20($const), %xmm2 # 0x04... 973 movdqa 0x30($const), %xmm3 # 0x08... 974 movdqa 0x40($const), %xmm4 # .LM0 975 pcmpeqd %xmm5, %xmm5 # .LNOT 976 977 movdqu ($inp), %xmm6 # load round 1 key 978 movdqa %xmm7, ($out) # save round 0 key 979 lea 0x10($out), $out 980 dec $rounds 981 jmp .Lkey_loop 982.align 16 983.Lkey_loop: 984 pshufb %xmm4, %xmm6 # .LM0 985 986 movdqa %xmm0, %xmm8 987 movdqa %xmm1, %xmm9 988 989 pand %xmm6, %xmm8 990 pand %xmm6, %xmm9 991 movdqa %xmm2, %xmm10 992 pcmpeqb %xmm0, %xmm8 993 psllq \$4, %xmm0 # 0x10... 994 movdqa %xmm3, %xmm11 995 pcmpeqb %xmm1, %xmm9 996 psllq \$4, %xmm1 # 0x20... 997 998 pand %xmm6, %xmm10 999 pand %xmm6, %xmm11 1000 movdqa %xmm0, %xmm12 1001 pcmpeqb %xmm2, %xmm10 1002 psllq \$4, %xmm2 # 0x40... 1003 movdqa %xmm1, %xmm13 1004 pcmpeqb %xmm3, %xmm11 1005 psllq \$4, %xmm3 # 0x80... 1006 1007 movdqa %xmm2, %xmm14 1008 movdqa %xmm3, %xmm15 1009 pxor %xmm5, %xmm8 # "pnot" 1010 pxor %xmm5, %xmm9 1011 1012 pand %xmm6, %xmm12 1013 pand %xmm6, %xmm13 1014 movdqa %xmm8, 0x00($out) # write bit-sliced round key 1015 pcmpeqb %xmm0, %xmm12 1016 psrlq \$4, %xmm0 # 0x01... 1017 movdqa %xmm9, 0x10($out) 1018 pcmpeqb %xmm1, %xmm13 1019 psrlq \$4, %xmm1 # 0x02... 1020 lea 0x10($inp), $inp 1021 1022 pand %xmm6, %xmm14 1023 pand %xmm6, %xmm15 1024 movdqa %xmm10, 0x20($out) 1025 pcmpeqb %xmm2, %xmm14 1026 psrlq \$4, %xmm2 # 0x04... 1027 movdqa %xmm11, 0x30($out) 1028 pcmpeqb %xmm3, %xmm15 1029 psrlq \$4, %xmm3 # 0x08... 1030 movdqu ($inp), %xmm6 # load next round key 1031 1032 pxor %xmm5, %xmm13 # "pnot" 1033 pxor %xmm5, %xmm14 1034 movdqa %xmm12, 0x40($out) 1035 movdqa %xmm13, 0x50($out) 1036 movdqa %xmm14, 0x60($out) 1037 movdqa %xmm15, 0x70($out) 1038 lea 0x80($out),$out 1039 dec $rounds 1040 jnz .Lkey_loop 1041 1042 movdqa 0x50($const), %xmm7 # .L63 1043 #movdqa %xmm6, ($out) # don't save last round key 1044 ret 1045.size _bsaes_key_convert,.-_bsaes_key_convert 1046___ 1047} 1048 1049if (0 && !$win64) { # following four functions are unsupported interface 1050 # used for benchmarking... 1051$code.=<<___; 1052.globl bsaes_enc_key_convert 1053.type bsaes_enc_key_convert,\@function,2 1054.align 16 1055bsaes_enc_key_convert: 1056 mov 240($inp),%r10d # pass rounds 1057 mov $inp,%rcx # pass key 1058 mov $out,%rax # pass key schedule 1059 call _bsaes_key_convert 1060 pxor %xmm6,%xmm7 # fix up last round key 1061 movdqa %xmm7,(%rax) # save last round key 1062 ret 1063.size bsaes_enc_key_convert,.-bsaes_enc_key_convert 1064 1065.globl bsaes_encrypt_128 1066.type bsaes_encrypt_128,\@function,4 1067.align 16 1068bsaes_encrypt_128: 1069.Lenc128_loop: 1070 movdqu 0x00($inp), @XMM[0] # load input 1071 movdqu 0x10($inp), @XMM[1] 1072 movdqu 0x20($inp), @XMM[2] 1073 movdqu 0x30($inp), @XMM[3] 1074 movdqu 0x40($inp), @XMM[4] 1075 movdqu 0x50($inp), @XMM[5] 1076 movdqu 0x60($inp), @XMM[6] 1077 movdqu 0x70($inp), @XMM[7] 1078 mov $key, %rax # pass the $key 1079 lea 0x80($inp), $inp 1080 mov \$10,%r10d 1081 1082 call _bsaes_encrypt8 1083 1084 movdqu @XMM[0], 0x00($out) # write output 1085 movdqu @XMM[1], 0x10($out) 1086 movdqu @XMM[4], 0x20($out) 1087 movdqu @XMM[6], 0x30($out) 1088 movdqu @XMM[3], 0x40($out) 1089 movdqu @XMM[7], 0x50($out) 1090 movdqu @XMM[2], 0x60($out) 1091 movdqu @XMM[5], 0x70($out) 1092 lea 0x80($out), $out 1093 sub \$0x80,$len 1094 ja .Lenc128_loop 1095 ret 1096.size bsaes_encrypt_128,.-bsaes_encrypt_128 1097 1098.globl bsaes_dec_key_convert 1099.type bsaes_dec_key_convert,\@function,2 1100.align 16 1101bsaes_dec_key_convert: 1102 mov 240($inp),%r10d # pass rounds 1103 mov $inp,%rcx # pass key 1104 mov $out,%rax # pass key schedule 1105 call _bsaes_key_convert 1106 pxor ($out),%xmm7 # fix up round 0 key 1107 movdqa %xmm6,(%rax) # save last round key 1108 movdqa %xmm7,($out) 1109 ret 1110.size bsaes_dec_key_convert,.-bsaes_dec_key_convert 1111 1112.globl bsaes_decrypt_128 1113.type bsaes_decrypt_128,\@function,4 1114.align 16 1115bsaes_decrypt_128: 1116.Ldec128_loop: 1117 movdqu 0x00($inp), @XMM[0] # load input 1118 movdqu 0x10($inp), @XMM[1] 1119 movdqu 0x20($inp), @XMM[2] 1120 movdqu 0x30($inp), @XMM[3] 1121 movdqu 0x40($inp), @XMM[4] 1122 movdqu 0x50($inp), @XMM[5] 1123 movdqu 0x60($inp), @XMM[6] 1124 movdqu 0x70($inp), @XMM[7] 1125 mov $key, %rax # pass the $key 1126 lea 0x80($inp), $inp 1127 mov \$10,%r10d 1128 1129 call _bsaes_decrypt8 1130 1131 movdqu @XMM[0], 0x00($out) # write output 1132 movdqu @XMM[1], 0x10($out) 1133 movdqu @XMM[6], 0x20($out) 1134 movdqu @XMM[4], 0x30($out) 1135 movdqu @XMM[2], 0x40($out) 1136 movdqu @XMM[7], 0x50($out) 1137 movdqu @XMM[3], 0x60($out) 1138 movdqu @XMM[5], 0x70($out) 1139 lea 0x80($out), $out 1140 sub \$0x80,$len 1141 ja .Ldec128_loop 1142 ret 1143.size bsaes_decrypt_128,.-bsaes_decrypt_128 1144___ 1145} 1146{ 1147###################################################################### 1148# 1149# OpenSSL interface 1150# 1151my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d") 1152 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); 1153my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15"); 1154 1155if ($ecb) { 1156$code.=<<___; 1157.globl bsaes_ecb_encrypt_blocks 1158.type bsaes_ecb_encrypt_blocks,\@abi-omnipotent 1159.align 16 1160bsaes_ecb_encrypt_blocks: 1161 mov %rsp, %rax 1162.Lecb_enc_prologue: 1163 push %rbp 1164 push %rbx 1165 push %r12 1166 push %r13 1167 push %r14 1168 push %r15 1169 lea -0x48(%rsp),%rsp 1170___ 1171$code.=<<___ if ($win64); 1172 lea -0xa0(%rsp), %rsp 1173 movaps %xmm6, 0x40(%rsp) 1174 movaps %xmm7, 0x50(%rsp) 1175 movaps %xmm8, 0x60(%rsp) 1176 movaps %xmm9, 0x70(%rsp) 1177 movaps %xmm10, 0x80(%rsp) 1178 movaps %xmm11, 0x90(%rsp) 1179 movaps %xmm12, 0xa0(%rsp) 1180 movaps %xmm13, 0xb0(%rsp) 1181 movaps %xmm14, 0xc0(%rsp) 1182 movaps %xmm15, 0xd0(%rsp) 1183.Lecb_enc_body: 1184___ 1185$code.=<<___; 1186 mov %rsp,%rbp # backup %rsp 1187 mov 240($arg4),%eax # rounds 1188 mov $arg1,$inp # backup arguments 1189 mov $arg2,$out 1190 mov $arg3,$len 1191 mov $arg4,$key 1192 cmp \$8,$arg3 1193 jb .Lecb_enc_short 1194 1195 mov %eax,%ebx # backup rounds 1196 shl \$7,%rax # 128 bytes per inner round key 1197 sub \$`128-32`,%rax # size of bit-sliced key schedule 1198 sub %rax,%rsp 1199 mov %rsp,%rax # pass key schedule 1200 mov $key,%rcx # pass key 1201 mov %ebx,%r10d # pass rounds 1202 call _bsaes_key_convert 1203 pxor %xmm6,%xmm7 # fix up last round key 1204 movdqa %xmm7,(%rax) # save last round key 1205 1206 sub \$8,$len 1207.Lecb_enc_loop: 1208 movdqu 0x00($inp), @XMM[0] # load input 1209 movdqu 0x10($inp), @XMM[1] 1210 movdqu 0x20($inp), @XMM[2] 1211 movdqu 0x30($inp), @XMM[3] 1212 movdqu 0x40($inp), @XMM[4] 1213 movdqu 0x50($inp), @XMM[5] 1214 mov %rsp, %rax # pass key schedule 1215 movdqu 0x60($inp), @XMM[6] 1216 mov %ebx,%r10d # pass rounds 1217 movdqu 0x70($inp), @XMM[7] 1218 lea 0x80($inp), $inp 1219 1220 call _bsaes_encrypt8 1221 1222 movdqu @XMM[0], 0x00($out) # write output 1223 movdqu @XMM[1], 0x10($out) 1224 movdqu @XMM[4], 0x20($out) 1225 movdqu @XMM[6], 0x30($out) 1226 movdqu @XMM[3], 0x40($out) 1227 movdqu @XMM[7], 0x50($out) 1228 movdqu @XMM[2], 0x60($out) 1229 movdqu @XMM[5], 0x70($out) 1230 lea 0x80($out), $out 1231 sub \$8,$len 1232 jnc .Lecb_enc_loop 1233 1234 add \$8,$len 1235 jz .Lecb_enc_done 1236 1237 movdqu 0x00($inp), @XMM[0] # load input 1238 mov %rsp, %rax # pass key schedule 1239 mov %ebx,%r10d # pass rounds 1240 cmp \$2,$len 1241 jb .Lecb_enc_one 1242 movdqu 0x10($inp), @XMM[1] 1243 je .Lecb_enc_two 1244 movdqu 0x20($inp), @XMM[2] 1245 cmp \$4,$len 1246 jb .Lecb_enc_three 1247 movdqu 0x30($inp), @XMM[3] 1248 je .Lecb_enc_four 1249 movdqu 0x40($inp), @XMM[4] 1250 cmp \$6,$len 1251 jb .Lecb_enc_five 1252 movdqu 0x50($inp), @XMM[5] 1253 je .Lecb_enc_six 1254 movdqu 0x60($inp), @XMM[6] 1255 call _bsaes_encrypt8 1256 movdqu @XMM[0], 0x00($out) # write output 1257 movdqu @XMM[1], 0x10($out) 1258 movdqu @XMM[4], 0x20($out) 1259 movdqu @XMM[6], 0x30($out) 1260 movdqu @XMM[3], 0x40($out) 1261 movdqu @XMM[7], 0x50($out) 1262 movdqu @XMM[2], 0x60($out) 1263 jmp .Lecb_enc_done 1264.align 16 1265.Lecb_enc_six: 1266 call _bsaes_encrypt8 1267 movdqu @XMM[0], 0x00($out) # write output 1268 movdqu @XMM[1], 0x10($out) 1269 movdqu @XMM[4], 0x20($out) 1270 movdqu @XMM[6], 0x30($out) 1271 movdqu @XMM[3], 0x40($out) 1272 movdqu @XMM[7], 0x50($out) 1273 jmp .Lecb_enc_done 1274.align 16 1275.Lecb_enc_five: 1276 call _bsaes_encrypt8 1277 movdqu @XMM[0], 0x00($out) # write output 1278 movdqu @XMM[1], 0x10($out) 1279 movdqu @XMM[4], 0x20($out) 1280 movdqu @XMM[6], 0x30($out) 1281 movdqu @XMM[3], 0x40($out) 1282 jmp .Lecb_enc_done 1283.align 16 1284.Lecb_enc_four: 1285 call _bsaes_encrypt8 1286 movdqu @XMM[0], 0x00($out) # write output 1287 movdqu @XMM[1], 0x10($out) 1288 movdqu @XMM[4], 0x20($out) 1289 movdqu @XMM[6], 0x30($out) 1290 jmp .Lecb_enc_done 1291.align 16 1292.Lecb_enc_three: 1293 call _bsaes_encrypt8 1294 movdqu @XMM[0], 0x00($out) # write output 1295 movdqu @XMM[1], 0x10($out) 1296 movdqu @XMM[4], 0x20($out) 1297 jmp .Lecb_enc_done 1298.align 16 1299.Lecb_enc_two: 1300 call _bsaes_encrypt8 1301 movdqu @XMM[0], 0x00($out) # write output 1302 movdqu @XMM[1], 0x10($out) 1303 jmp .Lecb_enc_done 1304.align 16 1305.Lecb_enc_one: 1306 call _bsaes_encrypt8 1307 movdqu @XMM[0], 0x00($out) # write output 1308 jmp .Lecb_enc_done 1309.align 16 1310.Lecb_enc_short: 1311 lea ($inp), $arg1 1312 lea ($out), $arg2 1313 lea ($key), $arg3 1314 call asm_AES_encrypt 1315 lea 16($inp), $inp 1316 lea 16($out), $out 1317 dec $len 1318 jnz .Lecb_enc_short 1319 1320.Lecb_enc_done: 1321 lea (%rsp),%rax 1322 pxor %xmm0, %xmm0 1323.Lecb_enc_bzero: # wipe key schedule [if any] 1324 movdqa %xmm0, 0x00(%rax) 1325 movdqa %xmm0, 0x10(%rax) 1326 lea 0x20(%rax), %rax 1327 cmp %rax, %rbp 1328 jb .Lecb_enc_bzero 1329 1330 lea 0x78(%rbp),%rax 1331___ 1332$code.=<<___ if ($win64); 1333 movaps 0x40(%rbp), %xmm6 1334 movaps 0x50(%rbp), %xmm7 1335 movaps 0x60(%rbp), %xmm8 1336 movaps 0x70(%rbp), %xmm9 1337 movaps 0x80(%rbp), %xmm10 1338 movaps 0x90(%rbp), %xmm11 1339 movaps 0xa0(%rbp), %xmm12 1340 movaps 0xb0(%rbp), %xmm13 1341 movaps 0xc0(%rbp), %xmm14 1342 movaps 0xd0(%rbp), %xmm15 1343 lea 0xa0(%rax), %rax 1344.Lecb_enc_tail: 1345___ 1346$code.=<<___; 1347 mov -48(%rax), %r15 1348 mov -40(%rax), %r14 1349 mov -32(%rax), %r13 1350 mov -24(%rax), %r12 1351 mov -16(%rax), %rbx 1352 mov -8(%rax), %rbp 1353 lea (%rax), %rsp # restore %rsp 1354.Lecb_enc_epilogue: 1355 ret 1356.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks 1357 1358.globl bsaes_ecb_decrypt_blocks 1359.type bsaes_ecb_decrypt_blocks,\@abi-omnipotent 1360.align 16 1361bsaes_ecb_decrypt_blocks: 1362 mov %rsp, %rax 1363.Lecb_dec_prologue: 1364 push %rbp 1365 push %rbx 1366 push %r12 1367 push %r13 1368 push %r14 1369 push %r15 1370 lea -0x48(%rsp),%rsp 1371___ 1372$code.=<<___ if ($win64); 1373 lea -0xa0(%rsp), %rsp 1374 movaps %xmm6, 0x40(%rsp) 1375 movaps %xmm7, 0x50(%rsp) 1376 movaps %xmm8, 0x60(%rsp) 1377 movaps %xmm9, 0x70(%rsp) 1378 movaps %xmm10, 0x80(%rsp) 1379 movaps %xmm11, 0x90(%rsp) 1380 movaps %xmm12, 0xa0(%rsp) 1381 movaps %xmm13, 0xb0(%rsp) 1382 movaps %xmm14, 0xc0(%rsp) 1383 movaps %xmm15, 0xd0(%rsp) 1384.Lecb_dec_body: 1385___ 1386$code.=<<___; 1387 mov %rsp,%rbp # backup %rsp 1388 mov 240($arg4),%eax # rounds 1389 mov $arg1,$inp # backup arguments 1390 mov $arg2,$out 1391 mov $arg3,$len 1392 mov $arg4,$key 1393 cmp \$8,$arg3 1394 jb .Lecb_dec_short 1395 1396 mov %eax,%ebx # backup rounds 1397 shl \$7,%rax # 128 bytes per inner round key 1398 sub \$`128-32`,%rax # size of bit-sliced key schedule 1399 sub %rax,%rsp 1400 mov %rsp,%rax # pass key schedule 1401 mov $key,%rcx # pass key 1402 mov %ebx,%r10d # pass rounds 1403 call _bsaes_key_convert 1404 pxor (%rsp),%xmm7 # fix up 0 round key 1405 movdqa %xmm6,(%rax) # save last round key 1406 movdqa %xmm7,(%rsp) 1407 1408 sub \$8,$len 1409.Lecb_dec_loop: 1410 movdqu 0x00($inp), @XMM[0] # load input 1411 movdqu 0x10($inp), @XMM[1] 1412 movdqu 0x20($inp), @XMM[2] 1413 movdqu 0x30($inp), @XMM[3] 1414 movdqu 0x40($inp), @XMM[4] 1415 movdqu 0x50($inp), @XMM[5] 1416 mov %rsp, %rax # pass key schedule 1417 movdqu 0x60($inp), @XMM[6] 1418 mov %ebx,%r10d # pass rounds 1419 movdqu 0x70($inp), @XMM[7] 1420 lea 0x80($inp), $inp 1421 1422 call _bsaes_decrypt8 1423 1424 movdqu @XMM[0], 0x00($out) # write output 1425 movdqu @XMM[1], 0x10($out) 1426 movdqu @XMM[6], 0x20($out) 1427 movdqu @XMM[4], 0x30($out) 1428 movdqu @XMM[2], 0x40($out) 1429 movdqu @XMM[7], 0x50($out) 1430 movdqu @XMM[3], 0x60($out) 1431 movdqu @XMM[5], 0x70($out) 1432 lea 0x80($out), $out 1433 sub \$8,$len 1434 jnc .Lecb_dec_loop 1435 1436 add \$8,$len 1437 jz .Lecb_dec_done 1438 1439 movdqu 0x00($inp), @XMM[0] # load input 1440 mov %rsp, %rax # pass key schedule 1441 mov %ebx,%r10d # pass rounds 1442 cmp \$2,$len 1443 jb .Lecb_dec_one 1444 movdqu 0x10($inp), @XMM[1] 1445 je .Lecb_dec_two 1446 movdqu 0x20($inp), @XMM[2] 1447 cmp \$4,$len 1448 jb .Lecb_dec_three 1449 movdqu 0x30($inp), @XMM[3] 1450 je .Lecb_dec_four 1451 movdqu 0x40($inp), @XMM[4] 1452 cmp \$6,$len 1453 jb .Lecb_dec_five 1454 movdqu 0x50($inp), @XMM[5] 1455 je .Lecb_dec_six 1456 movdqu 0x60($inp), @XMM[6] 1457 call _bsaes_decrypt8 1458 movdqu @XMM[0], 0x00($out) # write output 1459 movdqu @XMM[1], 0x10($out) 1460 movdqu @XMM[6], 0x20($out) 1461 movdqu @XMM[4], 0x30($out) 1462 movdqu @XMM[2], 0x40($out) 1463 movdqu @XMM[7], 0x50($out) 1464 movdqu @XMM[3], 0x60($out) 1465 jmp .Lecb_dec_done 1466.align 16 1467.Lecb_dec_six: 1468 call _bsaes_decrypt8 1469 movdqu @XMM[0], 0x00($out) # write output 1470 movdqu @XMM[1], 0x10($out) 1471 movdqu @XMM[6], 0x20($out) 1472 movdqu @XMM[4], 0x30($out) 1473 movdqu @XMM[2], 0x40($out) 1474 movdqu @XMM[7], 0x50($out) 1475 jmp .Lecb_dec_done 1476.align 16 1477.Lecb_dec_five: 1478 call _bsaes_decrypt8 1479 movdqu @XMM[0], 0x00($out) # write output 1480 movdqu @XMM[1], 0x10($out) 1481 movdqu @XMM[6], 0x20($out) 1482 movdqu @XMM[4], 0x30($out) 1483 movdqu @XMM[2], 0x40($out) 1484 jmp .Lecb_dec_done 1485.align 16 1486.Lecb_dec_four: 1487 call _bsaes_decrypt8 1488 movdqu @XMM[0], 0x00($out) # write output 1489 movdqu @XMM[1], 0x10($out) 1490 movdqu @XMM[6], 0x20($out) 1491 movdqu @XMM[4], 0x30($out) 1492 jmp .Lecb_dec_done 1493.align 16 1494.Lecb_dec_three: 1495 call _bsaes_decrypt8 1496 movdqu @XMM[0], 0x00($out) # write output 1497 movdqu @XMM[1], 0x10($out) 1498 movdqu @XMM[6], 0x20($out) 1499 jmp .Lecb_dec_done 1500.align 16 1501.Lecb_dec_two: 1502 call _bsaes_decrypt8 1503 movdqu @XMM[0], 0x00($out) # write output 1504 movdqu @XMM[1], 0x10($out) 1505 jmp .Lecb_dec_done 1506.align 16 1507.Lecb_dec_one: 1508 call _bsaes_decrypt8 1509 movdqu @XMM[0], 0x00($out) # write output 1510 jmp .Lecb_dec_done 1511.align 16 1512.Lecb_dec_short: 1513 lea ($inp), $arg1 1514 lea ($out), $arg2 1515 lea ($key), $arg3 1516 call asm_AES_decrypt 1517 lea 16($inp), $inp 1518 lea 16($out), $out 1519 dec $len 1520 jnz .Lecb_dec_short 1521 1522.Lecb_dec_done: 1523 lea (%rsp),%rax 1524 pxor %xmm0, %xmm0 1525.Lecb_dec_bzero: # wipe key schedule [if any] 1526 movdqa %xmm0, 0x00(%rax) 1527 movdqa %xmm0, 0x10(%rax) 1528 lea 0x20(%rax), %rax 1529 cmp %rax, %rbp 1530 jb .Lecb_dec_bzero 1531 1532 lea 0x78(%rbp),%rax 1533___ 1534$code.=<<___ if ($win64); 1535 movaps 0x40(%rbp), %xmm6 1536 movaps 0x50(%rbp), %xmm7 1537 movaps 0x60(%rbp), %xmm8 1538 movaps 0x70(%rbp), %xmm9 1539 movaps 0x80(%rbp), %xmm10 1540 movaps 0x90(%rbp), %xmm11 1541 movaps 0xa0(%rbp), %xmm12 1542 movaps 0xb0(%rbp), %xmm13 1543 movaps 0xc0(%rbp), %xmm14 1544 movaps 0xd0(%rbp), %xmm15 1545 lea 0xa0(%rax), %rax 1546.Lecb_dec_tail: 1547___ 1548$code.=<<___; 1549 mov -48(%rax), %r15 1550 mov -40(%rax), %r14 1551 mov -32(%rax), %r13 1552 mov -24(%rax), %r12 1553 mov -16(%rax), %rbx 1554 mov -8(%rax), %rbp 1555 lea (%rax), %rsp # restore %rsp 1556.Lecb_dec_epilogue: 1557 ret 1558.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks 1559___ 1560} 1561$code.=<<___; 1562.extern asm_AES_cbc_encrypt 1563.globl bsaes_cbc_encrypt 1564.type bsaes_cbc_encrypt,\@abi-omnipotent 1565.align 16 1566bsaes_cbc_encrypt: 1567___ 1568$code.=<<___ if ($win64); 1569 mov 48(%rsp),$arg6 # pull direction flag 1570___ 1571$code.=<<___; 1572 cmp \$0,$arg6 1573 jne asm_AES_cbc_encrypt 1574 cmp \$128,$arg3 1575 jb asm_AES_cbc_encrypt 1576 1577 mov %rsp, %rax 1578.Lcbc_dec_prologue: 1579 push %rbp 1580 push %rbx 1581 push %r12 1582 push %r13 1583 push %r14 1584 push %r15 1585 lea -0x48(%rsp), %rsp 1586___ 1587$code.=<<___ if ($win64); 1588 mov 0xa0(%rsp),$arg5 # pull ivp 1589 lea -0xa0(%rsp), %rsp 1590 movaps %xmm6, 0x40(%rsp) 1591 movaps %xmm7, 0x50(%rsp) 1592 movaps %xmm8, 0x60(%rsp) 1593 movaps %xmm9, 0x70(%rsp) 1594 movaps %xmm10, 0x80(%rsp) 1595 movaps %xmm11, 0x90(%rsp) 1596 movaps %xmm12, 0xa0(%rsp) 1597 movaps %xmm13, 0xb0(%rsp) 1598 movaps %xmm14, 0xc0(%rsp) 1599 movaps %xmm15, 0xd0(%rsp) 1600.Lcbc_dec_body: 1601___ 1602$code.=<<___; 1603 mov %rsp, %rbp # backup %rsp 1604 mov 240($arg4), %eax # rounds 1605 mov $arg1, $inp # backup arguments 1606 mov $arg2, $out 1607 mov $arg3, $len 1608 mov $arg4, $key 1609 mov $arg5, %rbx 1610 shr \$4, $len # bytes to blocks 1611 1612 mov %eax, %edx # rounds 1613 shl \$7, %rax # 128 bytes per inner round key 1614 sub \$`128-32`, %rax # size of bit-sliced key schedule 1615 sub %rax, %rsp 1616 1617 mov %rsp, %rax # pass key schedule 1618 mov $key, %rcx # pass key 1619 mov %edx, %r10d # pass rounds 1620 call _bsaes_key_convert 1621 pxor (%rsp),%xmm7 # fix up 0 round key 1622 movdqa %xmm6,(%rax) # save last round key 1623 movdqa %xmm7,(%rsp) 1624 1625 movdqu (%rbx), @XMM[15] # load IV 1626 sub \$8,$len 1627.Lcbc_dec_loop: 1628 movdqu 0x00($inp), @XMM[0] # load input 1629 movdqu 0x10($inp), @XMM[1] 1630 movdqu 0x20($inp), @XMM[2] 1631 movdqu 0x30($inp), @XMM[3] 1632 movdqu 0x40($inp), @XMM[4] 1633 movdqu 0x50($inp), @XMM[5] 1634 mov %rsp, %rax # pass key schedule 1635 movdqu 0x60($inp), @XMM[6] 1636 mov %edx,%r10d # pass rounds 1637 movdqu 0x70($inp), @XMM[7] 1638 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1639 1640 call _bsaes_decrypt8 1641 1642 pxor 0x20(%rbp), @XMM[0] # ^= IV 1643 movdqu 0x00($inp), @XMM[8] # re-load input 1644 movdqu 0x10($inp), @XMM[9] 1645 pxor @XMM[8], @XMM[1] 1646 movdqu 0x20($inp), @XMM[10] 1647 pxor @XMM[9], @XMM[6] 1648 movdqu 0x30($inp), @XMM[11] 1649 pxor @XMM[10], @XMM[4] 1650 movdqu 0x40($inp), @XMM[12] 1651 pxor @XMM[11], @XMM[2] 1652 movdqu 0x50($inp), @XMM[13] 1653 pxor @XMM[12], @XMM[7] 1654 movdqu 0x60($inp), @XMM[14] 1655 pxor @XMM[13], @XMM[3] 1656 movdqu 0x70($inp), @XMM[15] # IV 1657 pxor @XMM[14], @XMM[5] 1658 movdqu @XMM[0], 0x00($out) # write output 1659 lea 0x80($inp), $inp 1660 movdqu @XMM[1], 0x10($out) 1661 movdqu @XMM[6], 0x20($out) 1662 movdqu @XMM[4], 0x30($out) 1663 movdqu @XMM[2], 0x40($out) 1664 movdqu @XMM[7], 0x50($out) 1665 movdqu @XMM[3], 0x60($out) 1666 movdqu @XMM[5], 0x70($out) 1667 lea 0x80($out), $out 1668 sub \$8,$len 1669 jnc .Lcbc_dec_loop 1670 1671 add \$8,$len 1672 jz .Lcbc_dec_done 1673 1674 movdqu 0x00($inp), @XMM[0] # load input 1675 mov %rsp, %rax # pass key schedule 1676 mov %edx, %r10d # pass rounds 1677 cmp \$2,$len 1678 jb .Lcbc_dec_one 1679 movdqu 0x10($inp), @XMM[1] 1680 je .Lcbc_dec_two 1681 movdqu 0x20($inp), @XMM[2] 1682 cmp \$4,$len 1683 jb .Lcbc_dec_three 1684 movdqu 0x30($inp), @XMM[3] 1685 je .Lcbc_dec_four 1686 movdqu 0x40($inp), @XMM[4] 1687 cmp \$6,$len 1688 jb .Lcbc_dec_five 1689 movdqu 0x50($inp), @XMM[5] 1690 je .Lcbc_dec_six 1691 movdqu 0x60($inp), @XMM[6] 1692 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1693 call _bsaes_decrypt8 1694 pxor 0x20(%rbp), @XMM[0] # ^= IV 1695 movdqu 0x00($inp), @XMM[8] # re-load input 1696 movdqu 0x10($inp), @XMM[9] 1697 pxor @XMM[8], @XMM[1] 1698 movdqu 0x20($inp), @XMM[10] 1699 pxor @XMM[9], @XMM[6] 1700 movdqu 0x30($inp), @XMM[11] 1701 pxor @XMM[10], @XMM[4] 1702 movdqu 0x40($inp), @XMM[12] 1703 pxor @XMM[11], @XMM[2] 1704 movdqu 0x50($inp), @XMM[13] 1705 pxor @XMM[12], @XMM[7] 1706 movdqu 0x60($inp), @XMM[15] # IV 1707 pxor @XMM[13], @XMM[3] 1708 movdqu @XMM[0], 0x00($out) # write output 1709 movdqu @XMM[1], 0x10($out) 1710 movdqu @XMM[6], 0x20($out) 1711 movdqu @XMM[4], 0x30($out) 1712 movdqu @XMM[2], 0x40($out) 1713 movdqu @XMM[7], 0x50($out) 1714 movdqu @XMM[3], 0x60($out) 1715 jmp .Lcbc_dec_done 1716.align 16 1717.Lcbc_dec_six: 1718 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1719 call _bsaes_decrypt8 1720 pxor 0x20(%rbp), @XMM[0] # ^= IV 1721 movdqu 0x00($inp), @XMM[8] # re-load input 1722 movdqu 0x10($inp), @XMM[9] 1723 pxor @XMM[8], @XMM[1] 1724 movdqu 0x20($inp), @XMM[10] 1725 pxor @XMM[9], @XMM[6] 1726 movdqu 0x30($inp), @XMM[11] 1727 pxor @XMM[10], @XMM[4] 1728 movdqu 0x40($inp), @XMM[12] 1729 pxor @XMM[11], @XMM[2] 1730 movdqu 0x50($inp), @XMM[15] # IV 1731 pxor @XMM[12], @XMM[7] 1732 movdqu @XMM[0], 0x00($out) # write output 1733 movdqu @XMM[1], 0x10($out) 1734 movdqu @XMM[6], 0x20($out) 1735 movdqu @XMM[4], 0x30($out) 1736 movdqu @XMM[2], 0x40($out) 1737 movdqu @XMM[7], 0x50($out) 1738 jmp .Lcbc_dec_done 1739.align 16 1740.Lcbc_dec_five: 1741 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1742 call _bsaes_decrypt8 1743 pxor 0x20(%rbp), @XMM[0] # ^= IV 1744 movdqu 0x00($inp), @XMM[8] # re-load input 1745 movdqu 0x10($inp), @XMM[9] 1746 pxor @XMM[8], @XMM[1] 1747 movdqu 0x20($inp), @XMM[10] 1748 pxor @XMM[9], @XMM[6] 1749 movdqu 0x30($inp), @XMM[11] 1750 pxor @XMM[10], @XMM[4] 1751 movdqu 0x40($inp), @XMM[15] # IV 1752 pxor @XMM[11], @XMM[2] 1753 movdqu @XMM[0], 0x00($out) # write output 1754 movdqu @XMM[1], 0x10($out) 1755 movdqu @XMM[6], 0x20($out) 1756 movdqu @XMM[4], 0x30($out) 1757 movdqu @XMM[2], 0x40($out) 1758 jmp .Lcbc_dec_done 1759.align 16 1760.Lcbc_dec_four: 1761 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1762 call _bsaes_decrypt8 1763 pxor 0x20(%rbp), @XMM[0] # ^= IV 1764 movdqu 0x00($inp), @XMM[8] # re-load input 1765 movdqu 0x10($inp), @XMM[9] 1766 pxor @XMM[8], @XMM[1] 1767 movdqu 0x20($inp), @XMM[10] 1768 pxor @XMM[9], @XMM[6] 1769 movdqu 0x30($inp), @XMM[15] # IV 1770 pxor @XMM[10], @XMM[4] 1771 movdqu @XMM[0], 0x00($out) # write output 1772 movdqu @XMM[1], 0x10($out) 1773 movdqu @XMM[6], 0x20($out) 1774 movdqu @XMM[4], 0x30($out) 1775 jmp .Lcbc_dec_done 1776.align 16 1777.Lcbc_dec_three: 1778 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1779 call _bsaes_decrypt8 1780 pxor 0x20(%rbp), @XMM[0] # ^= IV 1781 movdqu 0x00($inp), @XMM[8] # re-load input 1782 movdqu 0x10($inp), @XMM[9] 1783 pxor @XMM[8], @XMM[1] 1784 movdqu 0x20($inp), @XMM[15] # IV 1785 pxor @XMM[9], @XMM[6] 1786 movdqu @XMM[0], 0x00($out) # write output 1787 movdqu @XMM[1], 0x10($out) 1788 movdqu @XMM[6], 0x20($out) 1789 jmp .Lcbc_dec_done 1790.align 16 1791.Lcbc_dec_two: 1792 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1793 call _bsaes_decrypt8 1794 pxor 0x20(%rbp), @XMM[0] # ^= IV 1795 movdqu 0x00($inp), @XMM[8] # re-load input 1796 movdqu 0x10($inp), @XMM[15] # IV 1797 pxor @XMM[8], @XMM[1] 1798 movdqu @XMM[0], 0x00($out) # write output 1799 movdqu @XMM[1], 0x10($out) 1800 jmp .Lcbc_dec_done 1801.align 16 1802.Lcbc_dec_one: 1803 lea ($inp), $arg1 1804 lea 0x20(%rbp), $arg2 # buffer output 1805 lea ($key), $arg3 1806 call asm_AES_decrypt # doesn't touch %xmm 1807 pxor 0x20(%rbp), @XMM[15] # ^= IV 1808 movdqu @XMM[15], ($out) # write output 1809 movdqa @XMM[0], @XMM[15] # IV 1810 1811.Lcbc_dec_done: 1812 movdqu @XMM[15], (%rbx) # return IV 1813 lea (%rsp), %rax 1814 pxor %xmm0, %xmm0 1815.Lcbc_dec_bzero: # wipe key schedule [if any] 1816 movdqa %xmm0, 0x00(%rax) 1817 movdqa %xmm0, 0x10(%rax) 1818 lea 0x20(%rax), %rax 1819 cmp %rax, %rbp 1820 ja .Lcbc_dec_bzero 1821 1822 lea 0x78(%rbp),%rax 1823___ 1824$code.=<<___ if ($win64); 1825 movaps 0x40(%rbp), %xmm6 1826 movaps 0x50(%rbp), %xmm7 1827 movaps 0x60(%rbp), %xmm8 1828 movaps 0x70(%rbp), %xmm9 1829 movaps 0x80(%rbp), %xmm10 1830 movaps 0x90(%rbp), %xmm11 1831 movaps 0xa0(%rbp), %xmm12 1832 movaps 0xb0(%rbp), %xmm13 1833 movaps 0xc0(%rbp), %xmm14 1834 movaps 0xd0(%rbp), %xmm15 1835 lea 0xa0(%rax), %rax 1836.Lcbc_dec_tail: 1837___ 1838$code.=<<___; 1839 mov -48(%rax), %r15 1840 mov -40(%rax), %r14 1841 mov -32(%rax), %r13 1842 mov -24(%rax), %r12 1843 mov -16(%rax), %rbx 1844 mov -8(%rax), %rbp 1845 lea (%rax), %rsp # restore %rsp 1846.Lcbc_dec_epilogue: 1847 ret 1848.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt 1849 1850.globl bsaes_ctr32_encrypt_blocks 1851.type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent 1852.align 16 1853bsaes_ctr32_encrypt_blocks: 1854 mov %rsp, %rax 1855.Lctr_enc_prologue: 1856 push %rbp 1857 push %rbx 1858 push %r12 1859 push %r13 1860 push %r14 1861 push %r15 1862 lea -0x48(%rsp), %rsp 1863___ 1864$code.=<<___ if ($win64); 1865 mov 0xa0(%rsp),$arg5 # pull ivp 1866 lea -0xa0(%rsp), %rsp 1867 movaps %xmm6, 0x40(%rsp) 1868 movaps %xmm7, 0x50(%rsp) 1869 movaps %xmm8, 0x60(%rsp) 1870 movaps %xmm9, 0x70(%rsp) 1871 movaps %xmm10, 0x80(%rsp) 1872 movaps %xmm11, 0x90(%rsp) 1873 movaps %xmm12, 0xa0(%rsp) 1874 movaps %xmm13, 0xb0(%rsp) 1875 movaps %xmm14, 0xc0(%rsp) 1876 movaps %xmm15, 0xd0(%rsp) 1877.Lctr_enc_body: 1878___ 1879$code.=<<___; 1880 mov %rsp, %rbp # backup %rsp 1881 movdqu ($arg5), %xmm0 # load counter 1882 mov 240($arg4), %eax # rounds 1883 mov $arg1, $inp # backup arguments 1884 mov $arg2, $out 1885 mov $arg3, $len 1886 mov $arg4, $key 1887 movdqa %xmm0, 0x20(%rbp) # copy counter 1888 cmp \$8, $arg3 1889 jb .Lctr_enc_short 1890 1891 mov %eax, %ebx # rounds 1892 shl \$7, %rax # 128 bytes per inner round key 1893 sub \$`128-32`, %rax # size of bit-sliced key schedule 1894 sub %rax, %rsp 1895 1896 mov %rsp, %rax # pass key schedule 1897 mov $key, %rcx # pass key 1898 mov %ebx, %r10d # pass rounds 1899 call _bsaes_key_convert 1900 pxor %xmm6,%xmm7 # fix up last round key 1901 movdqa %xmm7,(%rax) # save last round key 1902 1903 movdqa (%rsp), @XMM[9] # load round0 key 1904 lea .LADD1(%rip), %r11 1905 movdqa 0x20(%rbp), @XMM[0] # counter copy 1906 movdqa -0x20(%r11), @XMM[8] # .LSWPUP 1907 pshufb @XMM[8], @XMM[9] # byte swap upper part 1908 pshufb @XMM[8], @XMM[0] 1909 movdqa @XMM[9], (%rsp) # save adjusted round0 key 1910 jmp .Lctr_enc_loop 1911.align 16 1912.Lctr_enc_loop: 1913 movdqa @XMM[0], 0x20(%rbp) # save counter 1914 movdqa @XMM[0], @XMM[1] # prepare 8 counter values 1915 movdqa @XMM[0], @XMM[2] 1916 paddd 0x00(%r11), @XMM[1] # .LADD1 1917 movdqa @XMM[0], @XMM[3] 1918 paddd 0x10(%r11), @XMM[2] # .LADD2 1919 movdqa @XMM[0], @XMM[4] 1920 paddd 0x20(%r11), @XMM[3] # .LADD3 1921 movdqa @XMM[0], @XMM[5] 1922 paddd 0x30(%r11), @XMM[4] # .LADD4 1923 movdqa @XMM[0], @XMM[6] 1924 paddd 0x40(%r11), @XMM[5] # .LADD5 1925 movdqa @XMM[0], @XMM[7] 1926 paddd 0x50(%r11), @XMM[6] # .LADD6 1927 paddd 0x60(%r11), @XMM[7] # .LADD7 1928 1929 # Borrow prologue from _bsaes_encrypt8 to use the opportunity 1930 # to flip byte order in 32-bit counter 1931 movdqa (%rsp), @XMM[9] # round 0 key 1932 lea 0x10(%rsp), %rax # pass key schedule 1933 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR 1934 pxor @XMM[9], @XMM[0] # xor with round0 key 1935 pxor @XMM[9], @XMM[1] 1936 pxor @XMM[9], @XMM[2] 1937 pxor @XMM[9], @XMM[3] 1938 pshufb @XMM[8], @XMM[0] 1939 pshufb @XMM[8], @XMM[1] 1940 pxor @XMM[9], @XMM[4] 1941 pxor @XMM[9], @XMM[5] 1942 pshufb @XMM[8], @XMM[2] 1943 pshufb @XMM[8], @XMM[3] 1944 pxor @XMM[9], @XMM[6] 1945 pxor @XMM[9], @XMM[7] 1946 pshufb @XMM[8], @XMM[4] 1947 pshufb @XMM[8], @XMM[5] 1948 pshufb @XMM[8], @XMM[6] 1949 pshufb @XMM[8], @XMM[7] 1950 lea .LBS0(%rip), %r11 # constants table 1951 mov %ebx,%r10d # pass rounds 1952 1953 call _bsaes_encrypt8_bitslice 1954 1955 sub \$8,$len 1956 jc .Lctr_enc_loop_done 1957 1958 movdqu 0x00($inp), @XMM[8] # load input 1959 movdqu 0x10($inp), @XMM[9] 1960 movdqu 0x20($inp), @XMM[10] 1961 movdqu 0x30($inp), @XMM[11] 1962 movdqu 0x40($inp), @XMM[12] 1963 movdqu 0x50($inp), @XMM[13] 1964 movdqu 0x60($inp), @XMM[14] 1965 movdqu 0x70($inp), @XMM[15] 1966 lea 0x80($inp),$inp 1967 pxor @XMM[0], @XMM[8] 1968 movdqa 0x20(%rbp), @XMM[0] # load counter 1969 pxor @XMM[9], @XMM[1] 1970 movdqu @XMM[8], 0x00($out) # write output 1971 pxor @XMM[10], @XMM[4] 1972 movdqu @XMM[1], 0x10($out) 1973 pxor @XMM[11], @XMM[6] 1974 movdqu @XMM[4], 0x20($out) 1975 pxor @XMM[12], @XMM[3] 1976 movdqu @XMM[6], 0x30($out) 1977 pxor @XMM[13], @XMM[7] 1978 movdqu @XMM[3], 0x40($out) 1979 pxor @XMM[14], @XMM[2] 1980 movdqu @XMM[7], 0x50($out) 1981 pxor @XMM[15], @XMM[5] 1982 movdqu @XMM[2], 0x60($out) 1983 lea .LADD1(%rip), %r11 1984 movdqu @XMM[5], 0x70($out) 1985 lea 0x80($out), $out 1986 paddd 0x70(%r11), @XMM[0] # .LADD8 1987 jnz .Lctr_enc_loop 1988 1989 jmp .Lctr_enc_done 1990.align 16 1991.Lctr_enc_loop_done: 1992 add \$8, $len 1993 movdqu 0x00($inp), @XMM[8] # load input 1994 pxor @XMM[8], @XMM[0] 1995 movdqu @XMM[0], 0x00($out) # write output 1996 cmp \$2,$len 1997 jb .Lctr_enc_done 1998 movdqu 0x10($inp), @XMM[9] 1999 pxor @XMM[9], @XMM[1] 2000 movdqu @XMM[1], 0x10($out) 2001 je .Lctr_enc_done 2002 movdqu 0x20($inp), @XMM[10] 2003 pxor @XMM[10], @XMM[4] 2004 movdqu @XMM[4], 0x20($out) 2005 cmp \$4,$len 2006 jb .Lctr_enc_done 2007 movdqu 0x30($inp), @XMM[11] 2008 pxor @XMM[11], @XMM[6] 2009 movdqu @XMM[6], 0x30($out) 2010 je .Lctr_enc_done 2011 movdqu 0x40($inp), @XMM[12] 2012 pxor @XMM[12], @XMM[3] 2013 movdqu @XMM[3], 0x40($out) 2014 cmp \$6,$len 2015 jb .Lctr_enc_done 2016 movdqu 0x50($inp), @XMM[13] 2017 pxor @XMM[13], @XMM[7] 2018 movdqu @XMM[7], 0x50($out) 2019 je .Lctr_enc_done 2020 movdqu 0x60($inp), @XMM[14] 2021 pxor @XMM[14], @XMM[2] 2022 movdqu @XMM[2], 0x60($out) 2023 jmp .Lctr_enc_done 2024 2025.align 16 2026.Lctr_enc_short: 2027 lea 0x20(%rbp), $arg1 2028 lea 0x30(%rbp), $arg2 2029 lea ($key), $arg3 2030 call asm_AES_encrypt 2031 movdqu ($inp), @XMM[1] 2032 lea 16($inp), $inp 2033 mov 0x2c(%rbp), %eax # load 32-bit counter 2034 bswap %eax 2035 pxor 0x30(%rbp), @XMM[1] 2036 inc %eax # increment 2037 movdqu @XMM[1], ($out) 2038 bswap %eax 2039 lea 16($out), $out 2040 mov %eax, 0x2c(%rsp) # save 32-bit counter 2041 dec $len 2042 jnz .Lctr_enc_short 2043 2044.Lctr_enc_done: 2045 lea (%rsp), %rax 2046 pxor %xmm0, %xmm0 2047.Lctr_enc_bzero: # wipe key schedule [if any] 2048 movdqa %xmm0, 0x00(%rax) 2049 movdqa %xmm0, 0x10(%rax) 2050 lea 0x20(%rax), %rax 2051 cmp %rax, %rbp 2052 ja .Lctr_enc_bzero 2053 2054 lea 0x78(%rbp),%rax 2055___ 2056$code.=<<___ if ($win64); 2057 movaps 0x40(%rbp), %xmm6 2058 movaps 0x50(%rbp), %xmm7 2059 movaps 0x60(%rbp), %xmm8 2060 movaps 0x70(%rbp), %xmm9 2061 movaps 0x80(%rbp), %xmm10 2062 movaps 0x90(%rbp), %xmm11 2063 movaps 0xa0(%rbp), %xmm12 2064 movaps 0xb0(%rbp), %xmm13 2065 movaps 0xc0(%rbp), %xmm14 2066 movaps 0xd0(%rbp), %xmm15 2067 lea 0xa0(%rax), %rax 2068.Lctr_enc_tail: 2069___ 2070$code.=<<___; 2071 mov -48(%rax), %r15 2072 mov -40(%rax), %r14 2073 mov -32(%rax), %r13 2074 mov -24(%rax), %r12 2075 mov -16(%rax), %rbx 2076 mov -8(%rax), %rbp 2077 lea (%rax), %rsp # restore %rsp 2078.Lctr_enc_epilogue: 2079 ret 2080.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks 2081___ 2082###################################################################### 2083# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len, 2084# const AES_KEY *key1, const AES_KEY *key2, 2085# const unsigned char iv[16]); 2086# 2087my ($twmask,$twres,$twtmp)=@XMM[13..15]; 2088$arg6=~s/d$//; 2089 2090$code.=<<___; 2091.globl bsaes_xts_encrypt 2092.type bsaes_xts_encrypt,\@abi-omnipotent 2093.align 16 2094bsaes_xts_encrypt: 2095 mov %rsp, %rax 2096.Lxts_enc_prologue: 2097 push %rbp 2098 push %rbx 2099 push %r12 2100 push %r13 2101 push %r14 2102 push %r15 2103 lea -0x48(%rsp), %rsp 2104___ 2105$code.=<<___ if ($win64); 2106 mov 0xa0(%rsp),$arg5 # pull key2 2107 mov 0xa8(%rsp),$arg6 # pull ivp 2108 lea -0xa0(%rsp), %rsp 2109 movaps %xmm6, 0x40(%rsp) 2110 movaps %xmm7, 0x50(%rsp) 2111 movaps %xmm8, 0x60(%rsp) 2112 movaps %xmm9, 0x70(%rsp) 2113 movaps %xmm10, 0x80(%rsp) 2114 movaps %xmm11, 0x90(%rsp) 2115 movaps %xmm12, 0xa0(%rsp) 2116 movaps %xmm13, 0xb0(%rsp) 2117 movaps %xmm14, 0xc0(%rsp) 2118 movaps %xmm15, 0xd0(%rsp) 2119.Lxts_enc_body: 2120___ 2121$code.=<<___; 2122 mov %rsp, %rbp # backup %rsp 2123 mov $arg1, $inp # backup arguments 2124 mov $arg2, $out 2125 mov $arg3, $len 2126 mov $arg4, $key 2127 2128 lea ($arg6), $arg1 2129 lea 0x20(%rbp), $arg2 2130 lea ($arg5), $arg3 2131 call asm_AES_encrypt # generate initial tweak 2132 2133 mov 240($key), %eax # rounds 2134 mov $len, %rbx # backup $len 2135 2136 mov %eax, %edx # rounds 2137 shl \$7, %rax # 128 bytes per inner round key 2138 sub \$`128-32`, %rax # size of bit-sliced key schedule 2139 sub %rax, %rsp 2140 2141 mov %rsp, %rax # pass key schedule 2142 mov $key, %rcx # pass key 2143 mov %edx, %r10d # pass rounds 2144 call _bsaes_key_convert 2145 pxor %xmm6, %xmm7 # fix up last round key 2146 movdqa %xmm7, (%rax) # save last round key 2147 2148 and \$-16, $len 2149 sub \$0x80, %rsp # place for tweak[8] 2150 movdqa 0x20(%rbp), @XMM[7] # initial tweak 2151 2152 pxor $twtmp, $twtmp 2153 movdqa .Lxts_magic(%rip), $twmask 2154 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2155 2156 sub \$0x80, $len 2157 jc .Lxts_enc_short 2158 jmp .Lxts_enc_loop 2159 2160.align 16 2161.Lxts_enc_loop: 2162___ 2163 for ($i=0;$i<7;$i++) { 2164 $code.=<<___; 2165 pshufd \$0x13, $twtmp, $twres 2166 pxor $twtmp, $twtmp 2167 movdqa @XMM[7], @XMM[$i] 2168 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2169 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2170 pand $twmask, $twres # isolate carry and residue 2171 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2172 pxor $twres, @XMM[7] 2173___ 2174 $code.=<<___ if ($i>=1); 2175 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2176___ 2177 $code.=<<___ if ($i>=2); 2178 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2179___ 2180 } 2181$code.=<<___; 2182 movdqu 0x60($inp), @XMM[8+6] 2183 pxor @XMM[8+5], @XMM[5] 2184 movdqu 0x70($inp), @XMM[8+7] 2185 lea 0x80($inp), $inp 2186 movdqa @XMM[7], 0x70(%rsp) 2187 pxor @XMM[8+6], @XMM[6] 2188 lea 0x80(%rsp), %rax # pass key schedule 2189 pxor @XMM[8+7], @XMM[7] 2190 mov %edx, %r10d # pass rounds 2191 2192 call _bsaes_encrypt8 2193 2194 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2195 pxor 0x10(%rsp), @XMM[1] 2196 movdqu @XMM[0], 0x00($out) # write output 2197 pxor 0x20(%rsp), @XMM[4] 2198 movdqu @XMM[1], 0x10($out) 2199 pxor 0x30(%rsp), @XMM[6] 2200 movdqu @XMM[4], 0x20($out) 2201 pxor 0x40(%rsp), @XMM[3] 2202 movdqu @XMM[6], 0x30($out) 2203 pxor 0x50(%rsp), @XMM[7] 2204 movdqu @XMM[3], 0x40($out) 2205 pxor 0x60(%rsp), @XMM[2] 2206 movdqu @XMM[7], 0x50($out) 2207 pxor 0x70(%rsp), @XMM[5] 2208 movdqu @XMM[2], 0x60($out) 2209 movdqu @XMM[5], 0x70($out) 2210 lea 0x80($out), $out 2211 2212 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak 2213 pxor $twtmp, $twtmp 2214 movdqa .Lxts_magic(%rip), $twmask 2215 pcmpgtd @XMM[7], $twtmp 2216 pshufd \$0x13, $twtmp, $twres 2217 pxor $twtmp, $twtmp 2218 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2219 pand $twmask, $twres # isolate carry and residue 2220 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2221 pxor $twres, @XMM[7] 2222 2223 sub \$0x80,$len 2224 jnc .Lxts_enc_loop 2225 2226.Lxts_enc_short: 2227 add \$0x80, $len 2228 jz .Lxts_enc_done 2229___ 2230 for ($i=0;$i<7;$i++) { 2231 $code.=<<___; 2232 pshufd \$0x13, $twtmp, $twres 2233 pxor $twtmp, $twtmp 2234 movdqa @XMM[7], @XMM[$i] 2235 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2236 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2237 pand $twmask, $twres # isolate carry and residue 2238 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2239 pxor $twres, @XMM[7] 2240___ 2241 $code.=<<___ if ($i>=1); 2242 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2243 cmp \$`0x10*$i`,$len 2244 je .Lxts_enc_$i 2245___ 2246 $code.=<<___ if ($i>=2); 2247 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2248___ 2249 } 2250$code.=<<___; 2251 movdqu 0x60($inp), @XMM[8+6] 2252 pxor @XMM[8+5], @XMM[5] 2253 movdqa @XMM[7], 0x70(%rsp) 2254 lea 0x70($inp), $inp 2255 pxor @XMM[8+6], @XMM[6] 2256 lea 0x80(%rsp), %rax # pass key schedule 2257 mov %edx, %r10d # pass rounds 2258 2259 call _bsaes_encrypt8 2260 2261 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2262 pxor 0x10(%rsp), @XMM[1] 2263 movdqu @XMM[0], 0x00($out) # write output 2264 pxor 0x20(%rsp), @XMM[4] 2265 movdqu @XMM[1], 0x10($out) 2266 pxor 0x30(%rsp), @XMM[6] 2267 movdqu @XMM[4], 0x20($out) 2268 pxor 0x40(%rsp), @XMM[3] 2269 movdqu @XMM[6], 0x30($out) 2270 pxor 0x50(%rsp), @XMM[7] 2271 movdqu @XMM[3], 0x40($out) 2272 pxor 0x60(%rsp), @XMM[2] 2273 movdqu @XMM[7], 0x50($out) 2274 movdqu @XMM[2], 0x60($out) 2275 lea 0x70($out), $out 2276 2277 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak 2278 jmp .Lxts_enc_done 2279.align 16 2280.Lxts_enc_6: 2281 pxor @XMM[8+4], @XMM[4] 2282 lea 0x60($inp), $inp 2283 pxor @XMM[8+5], @XMM[5] 2284 lea 0x80(%rsp), %rax # pass key schedule 2285 mov %edx, %r10d # pass rounds 2286 2287 call _bsaes_encrypt8 2288 2289 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2290 pxor 0x10(%rsp), @XMM[1] 2291 movdqu @XMM[0], 0x00($out) # write output 2292 pxor 0x20(%rsp), @XMM[4] 2293 movdqu @XMM[1], 0x10($out) 2294 pxor 0x30(%rsp), @XMM[6] 2295 movdqu @XMM[4], 0x20($out) 2296 pxor 0x40(%rsp), @XMM[3] 2297 movdqu @XMM[6], 0x30($out) 2298 pxor 0x50(%rsp), @XMM[7] 2299 movdqu @XMM[3], 0x40($out) 2300 movdqu @XMM[7], 0x50($out) 2301 lea 0x60($out), $out 2302 2303 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak 2304 jmp .Lxts_enc_done 2305.align 16 2306.Lxts_enc_5: 2307 pxor @XMM[8+3], @XMM[3] 2308 lea 0x50($inp), $inp 2309 pxor @XMM[8+4], @XMM[4] 2310 lea 0x80(%rsp), %rax # pass key schedule 2311 mov %edx, %r10d # pass rounds 2312 2313 call _bsaes_encrypt8 2314 2315 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2316 pxor 0x10(%rsp), @XMM[1] 2317 movdqu @XMM[0], 0x00($out) # write output 2318 pxor 0x20(%rsp), @XMM[4] 2319 movdqu @XMM[1], 0x10($out) 2320 pxor 0x30(%rsp), @XMM[6] 2321 movdqu @XMM[4], 0x20($out) 2322 pxor 0x40(%rsp), @XMM[3] 2323 movdqu @XMM[6], 0x30($out) 2324 movdqu @XMM[3], 0x40($out) 2325 lea 0x50($out), $out 2326 2327 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak 2328 jmp .Lxts_enc_done 2329.align 16 2330.Lxts_enc_4: 2331 pxor @XMM[8+2], @XMM[2] 2332 lea 0x40($inp), $inp 2333 pxor @XMM[8+3], @XMM[3] 2334 lea 0x80(%rsp), %rax # pass key schedule 2335 mov %edx, %r10d # pass rounds 2336 2337 call _bsaes_encrypt8 2338 2339 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2340 pxor 0x10(%rsp), @XMM[1] 2341 movdqu @XMM[0], 0x00($out) # write output 2342 pxor 0x20(%rsp), @XMM[4] 2343 movdqu @XMM[1], 0x10($out) 2344 pxor 0x30(%rsp), @XMM[6] 2345 movdqu @XMM[4], 0x20($out) 2346 movdqu @XMM[6], 0x30($out) 2347 lea 0x40($out), $out 2348 2349 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak 2350 jmp .Lxts_enc_done 2351.align 16 2352.Lxts_enc_3: 2353 pxor @XMM[8+1], @XMM[1] 2354 lea 0x30($inp), $inp 2355 pxor @XMM[8+2], @XMM[2] 2356 lea 0x80(%rsp), %rax # pass key schedule 2357 mov %edx, %r10d # pass rounds 2358 2359 call _bsaes_encrypt8 2360 2361 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2362 pxor 0x10(%rsp), @XMM[1] 2363 movdqu @XMM[0], 0x00($out) # write output 2364 pxor 0x20(%rsp), @XMM[4] 2365 movdqu @XMM[1], 0x10($out) 2366 movdqu @XMM[4], 0x20($out) 2367 lea 0x30($out), $out 2368 2369 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak 2370 jmp .Lxts_enc_done 2371.align 16 2372.Lxts_enc_2: 2373 pxor @XMM[8+0], @XMM[0] 2374 lea 0x20($inp), $inp 2375 pxor @XMM[8+1], @XMM[1] 2376 lea 0x80(%rsp), %rax # pass key schedule 2377 mov %edx, %r10d # pass rounds 2378 2379 call _bsaes_encrypt8 2380 2381 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2382 pxor 0x10(%rsp), @XMM[1] 2383 movdqu @XMM[0], 0x00($out) # write output 2384 movdqu @XMM[1], 0x10($out) 2385 lea 0x20($out), $out 2386 2387 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak 2388 jmp .Lxts_enc_done 2389.align 16 2390.Lxts_enc_1: 2391 pxor @XMM[0], @XMM[8] 2392 lea 0x10($inp), $inp 2393 movdqa @XMM[8], 0x20(%rbp) 2394 lea 0x20(%rbp), $arg1 2395 lea 0x20(%rbp), $arg2 2396 lea ($key), $arg3 2397 call asm_AES_encrypt # doesn't touch %xmm 2398 pxor 0x20(%rbp), @XMM[0] # ^= tweak[] 2399 #pxor @XMM[8], @XMM[0] 2400 #lea 0x80(%rsp), %rax # pass key schedule 2401 #mov %edx, %r10d # pass rounds 2402 #call _bsaes_encrypt8 2403 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2404 movdqu @XMM[0], 0x00($out) # write output 2405 lea 0x10($out), $out 2406 2407 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak 2408 2409.Lxts_enc_done: 2410 and \$15, %ebx 2411 jz .Lxts_enc_ret 2412 mov $out, %rdx 2413 2414.Lxts_enc_steal: 2415 movzb ($inp), %eax 2416 movzb -16(%rdx), %ecx 2417 lea 1($inp), $inp 2418 mov %al, -16(%rdx) 2419 mov %cl, 0(%rdx) 2420 lea 1(%rdx), %rdx 2421 sub \$1,%ebx 2422 jnz .Lxts_enc_steal 2423 2424 movdqu -16($out), @XMM[0] 2425 lea 0x20(%rbp), $arg1 2426 pxor @XMM[7], @XMM[0] 2427 lea 0x20(%rbp), $arg2 2428 movdqa @XMM[0], 0x20(%rbp) 2429 lea ($key), $arg3 2430 call asm_AES_encrypt # doesn't touch %xmm 2431 pxor 0x20(%rbp), @XMM[7] 2432 movdqu @XMM[7], -16($out) 2433 2434.Lxts_enc_ret: 2435 lea (%rsp), %rax 2436 pxor %xmm0, %xmm0 2437.Lxts_enc_bzero: # wipe key schedule [if any] 2438 movdqa %xmm0, 0x00(%rax) 2439 movdqa %xmm0, 0x10(%rax) 2440 lea 0x20(%rax), %rax 2441 cmp %rax, %rbp 2442 ja .Lxts_enc_bzero 2443 2444 lea 0x78(%rbp),%rax 2445___ 2446$code.=<<___ if ($win64); 2447 movaps 0x40(%rbp), %xmm6 2448 movaps 0x50(%rbp), %xmm7 2449 movaps 0x60(%rbp), %xmm8 2450 movaps 0x70(%rbp), %xmm9 2451 movaps 0x80(%rbp), %xmm10 2452 movaps 0x90(%rbp), %xmm11 2453 movaps 0xa0(%rbp), %xmm12 2454 movaps 0xb0(%rbp), %xmm13 2455 movaps 0xc0(%rbp), %xmm14 2456 movaps 0xd0(%rbp), %xmm15 2457 lea 0xa0(%rax), %rax 2458.Lxts_enc_tail: 2459___ 2460$code.=<<___; 2461 mov -48(%rax), %r15 2462 mov -40(%rax), %r14 2463 mov -32(%rax), %r13 2464 mov -24(%rax), %r12 2465 mov -16(%rax), %rbx 2466 mov -8(%rax), %rbp 2467 lea (%rax), %rsp # restore %rsp 2468.Lxts_enc_epilogue: 2469 ret 2470.size bsaes_xts_encrypt,.-bsaes_xts_encrypt 2471 2472.globl bsaes_xts_decrypt 2473.type bsaes_xts_decrypt,\@abi-omnipotent 2474.align 16 2475bsaes_xts_decrypt: 2476 mov %rsp, %rax 2477.Lxts_dec_prologue: 2478 push %rbp 2479 push %rbx 2480 push %r12 2481 push %r13 2482 push %r14 2483 push %r15 2484 lea -0x48(%rsp), %rsp 2485___ 2486$code.=<<___ if ($win64); 2487 mov 0xa0(%rsp),$arg5 # pull key2 2488 mov 0xa8(%rsp),$arg6 # pull ivp 2489 lea -0xa0(%rsp), %rsp 2490 movaps %xmm6, 0x40(%rsp) 2491 movaps %xmm7, 0x50(%rsp) 2492 movaps %xmm8, 0x60(%rsp) 2493 movaps %xmm9, 0x70(%rsp) 2494 movaps %xmm10, 0x80(%rsp) 2495 movaps %xmm11, 0x90(%rsp) 2496 movaps %xmm12, 0xa0(%rsp) 2497 movaps %xmm13, 0xb0(%rsp) 2498 movaps %xmm14, 0xc0(%rsp) 2499 movaps %xmm15, 0xd0(%rsp) 2500.Lxts_dec_body: 2501___ 2502$code.=<<___; 2503 mov %rsp, %rbp # backup %rsp 2504 mov $arg1, $inp # backup arguments 2505 mov $arg2, $out 2506 mov $arg3, $len 2507 mov $arg4, $key 2508 2509 lea ($arg6), $arg1 2510 lea 0x20(%rbp), $arg2 2511 lea ($arg5), $arg3 2512 call asm_AES_encrypt # generate initial tweak 2513 2514 mov 240($key), %eax # rounds 2515 mov $len, %rbx # backup $len 2516 2517 mov %eax, %edx # rounds 2518 shl \$7, %rax # 128 bytes per inner round key 2519 sub \$`128-32`, %rax # size of bit-sliced key schedule 2520 sub %rax, %rsp 2521 2522 mov %rsp, %rax # pass key schedule 2523 mov $key, %rcx # pass key 2524 mov %edx, %r10d # pass rounds 2525 call _bsaes_key_convert 2526 pxor (%rsp), %xmm7 # fix up round 0 key 2527 movdqa %xmm6, (%rax) # save last round key 2528 movdqa %xmm7, (%rsp) 2529 2530 xor %eax, %eax # if ($len%16) len-=16; 2531 and \$-16, $len 2532 test \$15, %ebx 2533 setnz %al 2534 shl \$4, %rax 2535 sub %rax, $len 2536 2537 sub \$0x80, %rsp # place for tweak[8] 2538 movdqa 0x20(%rbp), @XMM[7] # initial tweak 2539 2540 pxor $twtmp, $twtmp 2541 movdqa .Lxts_magic(%rip), $twmask 2542 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2543 2544 sub \$0x80, $len 2545 jc .Lxts_dec_short 2546 jmp .Lxts_dec_loop 2547 2548.align 16 2549.Lxts_dec_loop: 2550___ 2551 for ($i=0;$i<7;$i++) { 2552 $code.=<<___; 2553 pshufd \$0x13, $twtmp, $twres 2554 pxor $twtmp, $twtmp 2555 movdqa @XMM[7], @XMM[$i] 2556 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2557 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2558 pand $twmask, $twres # isolate carry and residue 2559 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2560 pxor $twres, @XMM[7] 2561___ 2562 $code.=<<___ if ($i>=1); 2563 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2564___ 2565 $code.=<<___ if ($i>=2); 2566 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2567___ 2568 } 2569$code.=<<___; 2570 movdqu 0x60($inp), @XMM[8+6] 2571 pxor @XMM[8+5], @XMM[5] 2572 movdqu 0x70($inp), @XMM[8+7] 2573 lea 0x80($inp), $inp 2574 movdqa @XMM[7], 0x70(%rsp) 2575 pxor @XMM[8+6], @XMM[6] 2576 lea 0x80(%rsp), %rax # pass key schedule 2577 pxor @XMM[8+7], @XMM[7] 2578 mov %edx, %r10d # pass rounds 2579 2580 call _bsaes_decrypt8 2581 2582 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2583 pxor 0x10(%rsp), @XMM[1] 2584 movdqu @XMM[0], 0x00($out) # write output 2585 pxor 0x20(%rsp), @XMM[6] 2586 movdqu @XMM[1], 0x10($out) 2587 pxor 0x30(%rsp), @XMM[4] 2588 movdqu @XMM[6], 0x20($out) 2589 pxor 0x40(%rsp), @XMM[2] 2590 movdqu @XMM[4], 0x30($out) 2591 pxor 0x50(%rsp), @XMM[7] 2592 movdqu @XMM[2], 0x40($out) 2593 pxor 0x60(%rsp), @XMM[3] 2594 movdqu @XMM[7], 0x50($out) 2595 pxor 0x70(%rsp), @XMM[5] 2596 movdqu @XMM[3], 0x60($out) 2597 movdqu @XMM[5], 0x70($out) 2598 lea 0x80($out), $out 2599 2600 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak 2601 pxor $twtmp, $twtmp 2602 movdqa .Lxts_magic(%rip), $twmask 2603 pcmpgtd @XMM[7], $twtmp 2604 pshufd \$0x13, $twtmp, $twres 2605 pxor $twtmp, $twtmp 2606 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2607 pand $twmask, $twres # isolate carry and residue 2608 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2609 pxor $twres, @XMM[7] 2610 2611 sub \$0x80,$len 2612 jnc .Lxts_dec_loop 2613 2614.Lxts_dec_short: 2615 add \$0x80, $len 2616 jz .Lxts_dec_done 2617___ 2618 for ($i=0;$i<7;$i++) { 2619 $code.=<<___; 2620 pshufd \$0x13, $twtmp, $twres 2621 pxor $twtmp, $twtmp 2622 movdqa @XMM[7], @XMM[$i] 2623 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2624 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2625 pand $twmask, $twres # isolate carry and residue 2626 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2627 pxor $twres, @XMM[7] 2628___ 2629 $code.=<<___ if ($i>=1); 2630 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2631 cmp \$`0x10*$i`,$len 2632 je .Lxts_dec_$i 2633___ 2634 $code.=<<___ if ($i>=2); 2635 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2636___ 2637 } 2638$code.=<<___; 2639 movdqu 0x60($inp), @XMM[8+6] 2640 pxor @XMM[8+5], @XMM[5] 2641 movdqa @XMM[7], 0x70(%rsp) 2642 lea 0x70($inp), $inp 2643 pxor @XMM[8+6], @XMM[6] 2644 lea 0x80(%rsp), %rax # pass key schedule 2645 mov %edx, %r10d # pass rounds 2646 2647 call _bsaes_decrypt8 2648 2649 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2650 pxor 0x10(%rsp), @XMM[1] 2651 movdqu @XMM[0], 0x00($out) # write output 2652 pxor 0x20(%rsp), @XMM[6] 2653 movdqu @XMM[1], 0x10($out) 2654 pxor 0x30(%rsp), @XMM[4] 2655 movdqu @XMM[6], 0x20($out) 2656 pxor 0x40(%rsp), @XMM[2] 2657 movdqu @XMM[4], 0x30($out) 2658 pxor 0x50(%rsp), @XMM[7] 2659 movdqu @XMM[2], 0x40($out) 2660 pxor 0x60(%rsp), @XMM[3] 2661 movdqu @XMM[7], 0x50($out) 2662 movdqu @XMM[3], 0x60($out) 2663 lea 0x70($out), $out 2664 2665 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak 2666 jmp .Lxts_dec_done 2667.align 16 2668.Lxts_dec_6: 2669 pxor @XMM[8+4], @XMM[4] 2670 lea 0x60($inp), $inp 2671 pxor @XMM[8+5], @XMM[5] 2672 lea 0x80(%rsp), %rax # pass key schedule 2673 mov %edx, %r10d # pass rounds 2674 2675 call _bsaes_decrypt8 2676 2677 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2678 pxor 0x10(%rsp), @XMM[1] 2679 movdqu @XMM[0], 0x00($out) # write output 2680 pxor 0x20(%rsp), @XMM[6] 2681 movdqu @XMM[1], 0x10($out) 2682 pxor 0x30(%rsp), @XMM[4] 2683 movdqu @XMM[6], 0x20($out) 2684 pxor 0x40(%rsp), @XMM[2] 2685 movdqu @XMM[4], 0x30($out) 2686 pxor 0x50(%rsp), @XMM[7] 2687 movdqu @XMM[2], 0x40($out) 2688 movdqu @XMM[7], 0x50($out) 2689 lea 0x60($out), $out 2690 2691 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak 2692 jmp .Lxts_dec_done 2693.align 16 2694.Lxts_dec_5: 2695 pxor @XMM[8+3], @XMM[3] 2696 lea 0x50($inp), $inp 2697 pxor @XMM[8+4], @XMM[4] 2698 lea 0x80(%rsp), %rax # pass key schedule 2699 mov %edx, %r10d # pass rounds 2700 2701 call _bsaes_decrypt8 2702 2703 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2704 pxor 0x10(%rsp), @XMM[1] 2705 movdqu @XMM[0], 0x00($out) # write output 2706 pxor 0x20(%rsp), @XMM[6] 2707 movdqu @XMM[1], 0x10($out) 2708 pxor 0x30(%rsp), @XMM[4] 2709 movdqu @XMM[6], 0x20($out) 2710 pxor 0x40(%rsp), @XMM[2] 2711 movdqu @XMM[4], 0x30($out) 2712 movdqu @XMM[2], 0x40($out) 2713 lea 0x50($out), $out 2714 2715 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak 2716 jmp .Lxts_dec_done 2717.align 16 2718.Lxts_dec_4: 2719 pxor @XMM[8+2], @XMM[2] 2720 lea 0x40($inp), $inp 2721 pxor @XMM[8+3], @XMM[3] 2722 lea 0x80(%rsp), %rax # pass key schedule 2723 mov %edx, %r10d # pass rounds 2724 2725 call _bsaes_decrypt8 2726 2727 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2728 pxor 0x10(%rsp), @XMM[1] 2729 movdqu @XMM[0], 0x00($out) # write output 2730 pxor 0x20(%rsp), @XMM[6] 2731 movdqu @XMM[1], 0x10($out) 2732 pxor 0x30(%rsp), @XMM[4] 2733 movdqu @XMM[6], 0x20($out) 2734 movdqu @XMM[4], 0x30($out) 2735 lea 0x40($out), $out 2736 2737 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak 2738 jmp .Lxts_dec_done 2739.align 16 2740.Lxts_dec_3: 2741 pxor @XMM[8+1], @XMM[1] 2742 lea 0x30($inp), $inp 2743 pxor @XMM[8+2], @XMM[2] 2744 lea 0x80(%rsp), %rax # pass key schedule 2745 mov %edx, %r10d # pass rounds 2746 2747 call _bsaes_decrypt8 2748 2749 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2750 pxor 0x10(%rsp), @XMM[1] 2751 movdqu @XMM[0], 0x00($out) # write output 2752 pxor 0x20(%rsp), @XMM[6] 2753 movdqu @XMM[1], 0x10($out) 2754 movdqu @XMM[6], 0x20($out) 2755 lea 0x30($out), $out 2756 2757 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak 2758 jmp .Lxts_dec_done 2759.align 16 2760.Lxts_dec_2: 2761 pxor @XMM[8+0], @XMM[0] 2762 lea 0x20($inp), $inp 2763 pxor @XMM[8+1], @XMM[1] 2764 lea 0x80(%rsp), %rax # pass key schedule 2765 mov %edx, %r10d # pass rounds 2766 2767 call _bsaes_decrypt8 2768 2769 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2770 pxor 0x10(%rsp), @XMM[1] 2771 movdqu @XMM[0], 0x00($out) # write output 2772 movdqu @XMM[1], 0x10($out) 2773 lea 0x20($out), $out 2774 2775 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak 2776 jmp .Lxts_dec_done 2777.align 16 2778.Lxts_dec_1: 2779 pxor @XMM[0], @XMM[8] 2780 lea 0x10($inp), $inp 2781 movdqa @XMM[8], 0x20(%rbp) 2782 lea 0x20(%rbp), $arg1 2783 lea 0x20(%rbp), $arg2 2784 lea ($key), $arg3 2785 call asm_AES_decrypt # doesn't touch %xmm 2786 pxor 0x20(%rbp), @XMM[0] # ^= tweak[] 2787 #pxor @XMM[8], @XMM[0] 2788 #lea 0x80(%rsp), %rax # pass key schedule 2789 #mov %edx, %r10d # pass rounds 2790 #call _bsaes_decrypt8 2791 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2792 movdqu @XMM[0], 0x00($out) # write output 2793 lea 0x10($out), $out 2794 2795 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak 2796 2797.Lxts_dec_done: 2798 and \$15, %ebx 2799 jz .Lxts_dec_ret 2800 2801 pxor $twtmp, $twtmp 2802 movdqa .Lxts_magic(%rip), $twmask 2803 pcmpgtd @XMM[7], $twtmp 2804 pshufd \$0x13, $twtmp, $twres 2805 movdqa @XMM[7], @XMM[6] 2806 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2807 pand $twmask, $twres # isolate carry and residue 2808 movdqu ($inp), @XMM[0] 2809 pxor $twres, @XMM[7] 2810 2811 lea 0x20(%rbp), $arg1 2812 pxor @XMM[7], @XMM[0] 2813 lea 0x20(%rbp), $arg2 2814 movdqa @XMM[0], 0x20(%rbp) 2815 lea ($key), $arg3 2816 call asm_AES_decrypt # doesn't touch %xmm 2817 pxor 0x20(%rbp), @XMM[7] 2818 mov $out, %rdx 2819 movdqu @XMM[7], ($out) 2820 2821.Lxts_dec_steal: 2822 movzb 16($inp), %eax 2823 movzb (%rdx), %ecx 2824 lea 1($inp), $inp 2825 mov %al, (%rdx) 2826 mov %cl, 16(%rdx) 2827 lea 1(%rdx), %rdx 2828 sub \$1,%ebx 2829 jnz .Lxts_dec_steal 2830 2831 movdqu ($out), @XMM[0] 2832 lea 0x20(%rbp), $arg1 2833 pxor @XMM[6], @XMM[0] 2834 lea 0x20(%rbp), $arg2 2835 movdqa @XMM[0], 0x20(%rbp) 2836 lea ($key), $arg3 2837 call asm_AES_decrypt # doesn't touch %xmm 2838 pxor 0x20(%rbp), @XMM[6] 2839 movdqu @XMM[6], ($out) 2840 2841.Lxts_dec_ret: 2842 lea (%rsp), %rax 2843 pxor %xmm0, %xmm0 2844.Lxts_dec_bzero: # wipe key schedule [if any] 2845 movdqa %xmm0, 0x00(%rax) 2846 movdqa %xmm0, 0x10(%rax) 2847 lea 0x20(%rax), %rax 2848 cmp %rax, %rbp 2849 ja .Lxts_dec_bzero 2850 2851 lea 0x78(%rbp),%rax 2852___ 2853$code.=<<___ if ($win64); 2854 movaps 0x40(%rbp), %xmm6 2855 movaps 0x50(%rbp), %xmm7 2856 movaps 0x60(%rbp), %xmm8 2857 movaps 0x70(%rbp), %xmm9 2858 movaps 0x80(%rbp), %xmm10 2859 movaps 0x90(%rbp), %xmm11 2860 movaps 0xa0(%rbp), %xmm12 2861 movaps 0xb0(%rbp), %xmm13 2862 movaps 0xc0(%rbp), %xmm14 2863 movaps 0xd0(%rbp), %xmm15 2864 lea 0xa0(%rax), %rax 2865.Lxts_dec_tail: 2866___ 2867$code.=<<___; 2868 mov -48(%rax), %r15 2869 mov -40(%rax), %r14 2870 mov -32(%rax), %r13 2871 mov -24(%rax), %r12 2872 mov -16(%rax), %rbx 2873 mov -8(%rax), %rbp 2874 lea (%rax), %rsp # restore %rsp 2875.Lxts_dec_epilogue: 2876 ret 2877.size bsaes_xts_decrypt,.-bsaes_xts_decrypt 2878___ 2879} 2880$code.=<<___; 2881.type _bsaes_const,\@object 2882.align 64 2883_bsaes_const: 2884.LM0ISR: # InvShiftRows constants 2885 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509 2886.LISRM0: 2887 .quad 0x01040b0e0205080f, 0x0306090c00070a0d 2888.LISR: 2889 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09 2890.LBS0: # bit-slice constants 2891 .quad 0x5555555555555555, 0x5555555555555555 2892.LBS1: 2893 .quad 0x3333333333333333, 0x3333333333333333 2894.LBS2: 2895 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f 2896.LSR: # shiftrows constants 2897 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b 2898.LSRM0: 2899 .quad 0x0304090e00050a0f, 0x01060b0c0207080d 2900.LM0SR: 2901 .quad 0x0a0e02060f03070b, 0x0004080c05090d01 2902.LSWPUP: # byte-swap upper dword 2903 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 2904.LSWPUPM0SR: 2905 .quad 0x0a0d02060c03070b, 0x0004080f05090e01 2906.LADD1: # counter increment constants 2907 .quad 0x0000000000000000, 0x0000000100000000 2908.LADD2: 2909 .quad 0x0000000000000000, 0x0000000200000000 2910.LADD3: 2911 .quad 0x0000000000000000, 0x0000000300000000 2912.LADD4: 2913 .quad 0x0000000000000000, 0x0000000400000000 2914.LADD5: 2915 .quad 0x0000000000000000, 0x0000000500000000 2916.LADD6: 2917 .quad 0x0000000000000000, 0x0000000600000000 2918.LADD7: 2919 .quad 0x0000000000000000, 0x0000000700000000 2920.LADD8: 2921 .quad 0x0000000000000000, 0x0000000800000000 2922.Lxts_magic: 2923 .long 0x87,0,1,0 2924.Lmasks: 2925 .quad 0x0101010101010101, 0x0101010101010101 2926 .quad 0x0202020202020202, 0x0202020202020202 2927 .quad 0x0404040404040404, 0x0404040404040404 2928 .quad 0x0808080808080808, 0x0808080808080808 2929.LM0: 2930 .quad 0x02060a0e03070b0f, 0x0004080c0105090d 2931.L63: 2932 .quad 0x6363636363636363, 0x6363636363636363 2933.asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov" 2934.align 64 2935.size _bsaes_const,.-_bsaes_const 2936___ 2937 2938# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 2939# CONTEXT *context,DISPATCHER_CONTEXT *disp) 2940if ($win64) { 2941$rec="%rcx"; 2942$frame="%rdx"; 2943$context="%r8"; 2944$disp="%r9"; 2945 2946$code.=<<___; 2947.extern __imp_RtlVirtualUnwind 2948.type se_handler,\@abi-omnipotent 2949.align 16 2950se_handler: 2951 push %rsi 2952 push %rdi 2953 push %rbx 2954 push %rbp 2955 push %r12 2956 push %r13 2957 push %r14 2958 push %r15 2959 pushfq 2960 sub \$64,%rsp 2961 2962 mov 120($context),%rax # pull context->Rax 2963 mov 248($context),%rbx # pull context->Rip 2964 2965 mov 8($disp),%rsi # disp->ImageBase 2966 mov 56($disp),%r11 # disp->HandlerData 2967 2968 mov 0(%r11),%r10d # HandlerData[0] 2969 lea (%rsi,%r10),%r10 # prologue label 2970 cmp %r10,%rbx # context->Rip<=prologue label 2971 jbe .Lin_prologue 2972 2973 mov 4(%r11),%r10d # HandlerData[1] 2974 lea (%rsi,%r10),%r10 # epilogue label 2975 cmp %r10,%rbx # context->Rip>=epilogue label 2976 jae .Lin_prologue 2977 2978 mov 8(%r11),%r10d # HandlerData[2] 2979 lea (%rsi,%r10),%r10 # epilogue label 2980 cmp %r10,%rbx # context->Rip>=tail label 2981 jae .Lin_tail 2982 2983 mov 160($context),%rax # pull context->Rbp 2984 2985 lea 0x40(%rax),%rsi # %xmm save area 2986 lea 512($context),%rdi # &context.Xmm6 2987 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 2988 .long 0xa548f3fc # cld; rep movsq 2989 lea 0xa0+0x78(%rax),%rax # adjust stack pointer 2990 2991.Lin_tail: 2992 mov -48(%rax),%rbp 2993 mov -40(%rax),%rbx 2994 mov -32(%rax),%r12 2995 mov -24(%rax),%r13 2996 mov -16(%rax),%r14 2997 mov -8(%rax),%r15 2998 mov %rbx,144($context) # restore context->Rbx 2999 mov %rbp,160($context) # restore context->Rbp 3000 mov %r12,216($context) # restore context->R12 3001 mov %r13,224($context) # restore context->R13 3002 mov %r14,232($context) # restore context->R14 3003 mov %r15,240($context) # restore context->R15 3004 3005.Lin_prologue: 3006 mov %rax,152($context) # restore context->Rsp 3007 3008 mov 40($disp),%rdi # disp->ContextRecord 3009 mov $context,%rsi # context 3010 mov \$`1232/8`,%ecx # sizeof(CONTEXT) 3011 .long 0xa548f3fc # cld; rep movsq 3012 3013 mov $disp,%rsi 3014 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 3015 mov 8(%rsi),%rdx # arg2, disp->ImageBase 3016 mov 0(%rsi),%r8 # arg3, disp->ControlPc 3017 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 3018 mov 40(%rsi),%r10 # disp->ContextRecord 3019 lea 56(%rsi),%r11 # &disp->HandlerData 3020 lea 24(%rsi),%r12 # &disp->EstablisherFrame 3021 mov %r10,32(%rsp) # arg5 3022 mov %r11,40(%rsp) # arg6 3023 mov %r12,48(%rsp) # arg7 3024 mov %rcx,56(%rsp) # arg8, (NULL) 3025 call *__imp_RtlVirtualUnwind(%rip) 3026 3027 mov \$1,%eax # ExceptionContinueSearch 3028 add \$64,%rsp 3029 popfq 3030 pop %r15 3031 pop %r14 3032 pop %r13 3033 pop %r12 3034 pop %rbp 3035 pop %rbx 3036 pop %rdi 3037 pop %rsi 3038 ret 3039.size se_handler,.-se_handler 3040 3041.section .pdata 3042.align 4 3043___ 3044$code.=<<___ if ($ecb); 3045 .rva .Lecb_enc_prologue 3046 .rva .Lecb_enc_epilogue 3047 .rva .Lecb_enc_info 3048 3049 .rva .Lecb_dec_prologue 3050 .rva .Lecb_dec_epilogue 3051 .rva .Lecb_dec_info 3052___ 3053$code.=<<___; 3054 .rva .Lcbc_dec_prologue 3055 .rva .Lcbc_dec_epilogue 3056 .rva .Lcbc_dec_info 3057 3058 .rva .Lctr_enc_prologue 3059 .rva .Lctr_enc_epilogue 3060 .rva .Lctr_enc_info 3061 3062 .rva .Lxts_enc_prologue 3063 .rva .Lxts_enc_epilogue 3064 .rva .Lxts_enc_info 3065 3066 .rva .Lxts_dec_prologue 3067 .rva .Lxts_dec_epilogue 3068 .rva .Lxts_dec_info 3069 3070.section .xdata 3071.align 8 3072___ 3073$code.=<<___ if ($ecb); 3074.Lecb_enc_info: 3075 .byte 9,0,0,0 3076 .rva se_handler 3077 .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[] 3078 .rva .Lecb_enc_tail 3079 .long 0 3080.Lecb_dec_info: 3081 .byte 9,0,0,0 3082 .rva se_handler 3083 .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[] 3084 .rva .Lecb_dec_tail 3085 .long 0 3086___ 3087$code.=<<___; 3088.Lcbc_dec_info: 3089 .byte 9,0,0,0 3090 .rva se_handler 3091 .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[] 3092 .rva .Lcbc_dec_tail 3093 .long 0 3094.Lctr_enc_info: 3095 .byte 9,0,0,0 3096 .rva se_handler 3097 .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[] 3098 .rva .Lctr_enc_tail 3099 .long 0 3100.Lxts_enc_info: 3101 .byte 9,0,0,0 3102 .rva se_handler 3103 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] 3104 .rva .Lxts_enc_tail 3105 .long 0 3106.Lxts_dec_info: 3107 .byte 9,0,0,0 3108 .rva se_handler 3109 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] 3110 .rva .Lxts_dec_tail 3111 .long 0 3112___ 3113} 3114 3115$code =~ s/\`([^\`]*)\`/eval($1)/gem; 3116 3117print $code; 3118 3119close STDOUT; 3120