#! /usr/bin/env perl
# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for Intel AES-NI extension. In
# OpenSSL context it's used with Intel engine, but can also be used as
# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
# details].
#
# Performance.
#
# Given aes(enc|dec) instructions' latency asymptotic performance for
# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
# processed with 128-bit key. And given their throughput asymptotic
# performance for parallelizable modes is 1.25 cycles per byte. Being
# asymptotic limit it's not something you commonly achieve in reality,
# but how close does one get? Below are results collected for
# different modes and block sizes. Pairs of numbers are for en-/
# decryption.
#
#	16-byte     64-byte     256-byte    1-KB        8-KB
# ECB	4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26   1.26/1.26
# CTR	5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
# CBC	4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
# CCM	5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07
# OFB	5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
# CFB	5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
#
# ECB, CTR, CBC and CCM results are free from EVP overhead.
# This means
# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
# The results were collected with specially crafted speed.c benchmark
# in order to compare them with results reported in "Intel Advanced
# Encryption Standard (AES) New Instruction Set" White Paper Revision
# 3.0 dated May 2010. All above results are consistently better. This
# module also provides better performance for block sizes smaller than
# 128 bytes in points *not* represented in the above table.
#
# Looking at the results for 8-KB buffer.
#
# CFB and OFB results are far from the limit, because implementation
# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
# single-block aesni_encrypt, which is not the most optimal way to go.
# CBC encrypt result is unexpectedly high and there is no documented
# explanation for it. Seemingly there is a small penalty for feeding
# the result back to AES unit the way it's done in CBC mode. There is
# nothing one can do and the result appears optimal. CCM result is
# identical to CBC, because CBC-MAC is essentially CBC encrypt without
# saving output. CCM CTR "stays invisible," because it's neatly
# interleaved with CBC-MAC. This provides ~30% improvement over
# "straightforward" CCM implementation with CTR and CBC-MAC performed
# disjointly. Parallelizable modes practically achieve the theoretical
# limit.
#
# Looking at how results vary with buffer size.
#
# Curves are practically saturated at 1-KB buffer size. In most cases
# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
# CTR curve doesn't follow this pattern and is "slowest" changing one
# with "256-byte" result being 87% of "8-KB." This is because overhead
# in CTR mode is most computationally intensive. Small-block CCM
# decrypt is slower than encrypt, because first CTR and last CBC-MAC
# iterations can't be interleaved.
#
# Results for 192- and 256-bit keys.
#
# EVP-free results were observed to scale perfectly with number of
# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
# are a tad smaller, because the above mentioned penalty biases all
# results by same constant value. In similar way function call
# overhead affects small-block performance, as well as OFB and CFB
# results. Differences are not large, most common coefficients are
# 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one
# can observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...

# January 2011
#
# While Westmere processor features 6 cycles latency for aes[enc|dec]
# instructions, which can be scheduled every second cycle, Sandy
# Bridge spends 8 cycles per instruction, but it can schedule them
# every cycle. This means that code targeting Westmere would perform
# suboptimally on Sandy Bridge. Therefore this update.
#
# In addition, non-parallelizable CBC encrypt (as well as CCM) is
# optimized. Relative improvement might appear modest, 8% on Westmere,
# but in absolute terms it's 3.77 cycles per byte encrypted with
# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
# should be compared to asymptotic limits of 3.75 for Westmere and
# 5.00 for Sandy Bridge. Actually, the fact that they get this close
# to asymptotic limits is quite amazing. Indeed, the limit is
# calculated as latency times number of rounds, 10 for 128-bit key,
# and divided by 16, the number of bytes in block, or in other words
# it accounts *solely* for aesenc instructions. But there are extra
# instructions, and numbers so close to the asymptotic limits mean
# that it's as if it takes as little as *one* additional cycle to
# execute all of them. How is it possible?
# It is possible thanks to
# out-of-order execution logic, which manages to overlap post-
# processing of previous block, things like saving the output, with
# actual encryption of current block, as well as pre-processing of
# current block, things like fetching input and xor-ing it with
# 0-round element of the key schedule, with actual encryption of
# previous block. Keep this in mind...
#
# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
# performance is achieved by interleaving instructions working on
# independent blocks. In which case asymptotic limit for such modes
# can be obtained by dividing above mentioned numbers by AES
# instructions' interleave factor. Westmere can execute at most 3
# instructions at a time, meaning that optimal interleave factor is 3,
# and that's where the "magic" number of 1.25 comes from. "Optimal
# interleave factor" means that increase of interleave factor does
# not improve performance. The formula has proven to reflect reality
# pretty well on Westmere... Sandy Bridge on the other hand can
# execute up to 8 AES instructions at a time, so how does varying
# interleave factor affect the performance? Here is table for ECB
# (numbers are cycles per byte processed with 128-bit key):
#
# instruction interleave factor		3x	6x	8x
# theoretical asymptotic limit		1.67	0.83	0.625
# measured performance for 8KB block	1.05	0.86	0.84
#
# "as if" interleave factor		4.7x	5.8x	6.0x
#
# Further data for other parallelizable modes:
#
# CBC decrypt				1.16	0.93	0.74
# CTR					1.14	0.91	0.74
#
# Well, given 3x column it's probably inappropriate to call the limit
# asymptotic, if it can be surpassed, isn't it? What happens there?
# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
# magic is responsible for this.
# Processor overlaps not only the
# additional instructions with AES ones, but even AES instructions
# processing adjacent triplets of independent blocks. In the 6x case
# additional instructions still claim disproportionally small amount
# of additional cycles, but in 8x case number of instructions must be
# a tad too high for out-of-order logic to cope with, and AES unit
# remains underutilized... As you can see 8x interleave is hardly
# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
# utilizes 6x interleave because of limited register bank capacity.
#
# Higher interleave factors do have negative impact on Westmere
# performance. While for ECB mode it's negligible ~1.5%, other
# parallelizables perform ~5% worse, which is outweighed by ~25%
# improvement on Sandy Bridge. To balance regression on Westmere
# CTR mode was implemented with 6x aesenc interleave factor.

# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
# in CTR mode AES instruction interleave factor was chosen to be 6x.

# November 2015
#
# Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was
# chosen to be 6x.

######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
#		CBC en-/decrypt	CTR	XTS	ECB	OCB
# Westmere	3.77/1.25	1.25	1.25	1.26
# * Bridge	5.07/0.74	0.75	0.90	0.85	0.98
# Haswell	4.44/0.63	0.63	0.73	0.63	0.70
# Skylake	2.62/0.63	0.63	0.63	0.63
# Silvermont	5.75/3.54	3.56	4.12	3.87(*)	4.11
# Goldmont	3.82/1.26	1.26	1.29	1.29	1.50
# Bulldozer	5.77/0.70	0.72	0.90	0.70	0.95
#
# (*)	Atom Silvermont ECB result is suboptimal because of penalties
#	incurred by operations on %xmm8-15.
# As ECB is not considered critical, nothing was done to mitigate the
# problem.

# ---------------------------------------------------------------------
# Script set-up: pick the symbol prefix, parse the command line, pipe
# everything printed on STDOUT through x86_64-xlate.pl and name the
# registers used by the code generators below.
# ---------------------------------------------------------------------

$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-x86_64.pl:-)

# Command line is "[flavour] output". A first argument containing a dot
# is taken to be the output file name, in which case there is no flavour.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 conventions are selected either by flavour or by an .asm output
# file name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the translator next to this script first, then in
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# A failed pipe open used to go unnoticed and the generated code was
# then printed into a dead handle; bail out loudly instead.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Both alternatives are "movups" at present; the conditional is kept as
# the single place where the key-load instruction can be switched
# per $PREFIX.
$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

# $code accumulates the assembly text emitted by everything below.
$code=".text\n";
$code.=".extern OPENSSL_ia32cap_P\n";

$rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
$inp="%rdi";
$out="%rsi";
$len="%rdx";
$key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
$ivp="%r8";	# cbc, ctr, ...

$rnds_="%r10d";	# backup copy for $rounds
$key_="%r11";	# backup copy for $key

# %xmm register layout
$rndkey0="%xmm0";	$rndkey1="%xmm1";
$inout0="%xmm2";	$inout1="%xmm3";
$inout2="%xmm4";	$inout3="%xmm5";
$inout4="%xmm6";	$inout5="%xmm7";
$inout6="%xmm8";	$inout7="%xmm9";

# Note that $in2/$in1/$in0/$iv alias $inout4..$inout7.
$in2="%xmm6";		$in1="%xmm7";	# used in CBC decrypt, CTR, ...
$in0="%xmm8";		$iv="%xmm9";

# Inline version of internal aesni_[en|de]crypt1.
#
# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
# cycles which take care of loop variables...
{ my $loop_id;	# bumped on every expansion so that loop labels stay unique

# aesni_generate1($p,$key,$rounds[,$inout[,$ivec]])
#
# Appends to $code an inline single-block AES sequence.
#   $p       "enc" or "dec"; spliced into the aes${p}/aes${p}last
#            mnemonics and into the loop label
#   $key     register pointing at the key schedule (advanced by the
#            emitted code)
#   $rounds  register holding the round count (decremented to zero by
#            the emitted code)
#   $inout   register holding the block; $inout0 when omitted
#   $ivec    optional register that is xor-ed with round key 0 and then
#            folded into $inout in place of the plain round-0 xor
sub aesni_generate1 {
    my ($p,$key,$rounds,$inout,$ivec)=@_;

    $inout=$inout0 unless (defined($inout));
    $loop_id++;

    # load round keys 0 and 1
    $code.=<<___;
	$movkey	($key),$rndkey0
	$movkey	16($key),$rndkey1
___
    # round-0 whitening, with or without the extra vector
    if (defined($ivec)) {
	$code.=<<___;
	xorps	$rndkey0,$ivec
	lea	32($key),$key
	xorps	$ivec,$inout
___
    } else {
	$code.=<<___;
	lea	32($key),$key
	xorps	$rndkey0,$inout
___
    }
    # one round per iteration, then the final round
    $code.=<<___;
.Loop_${p}1_$loop_id:
	aes${p}	$rndkey1,$inout
	dec	$rounds
	$movkey	($key),$rndkey1
	lea	16($key),$key
	jnz	.Loop_${p}1_$loop_id	# loop body is 16 bytes
	aes${p}last	$rndkey1,$inout
___
}}
# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
#
# The two public single-block entry points differ only in direction, so
# they are produced from one template. The third element of each entry
# is the exact text that closes the routine.
{ my ($inp,$out,$key) = @_4args;

my @single_block_variants = (
    ["enc","encrypt",".size\t${PREFIX}_encrypt,.-${PREFIX}_encrypt\n\n"],
    ["dec","decrypt",".size\t${PREFIX}_decrypt, .-${PREFIX}_decrypt\n"],
);

foreach my $variant (@single_block_variants) {
    my ($op,$name,$trailer)=@$variant;

    $code.=<<___;
.globl	${PREFIX}_${name}
.type	${PREFIX}_${name},\@abi-omnipotent
.align	16
${PREFIX}_${name}:
	movups	($inp),$inout0		# load input
	mov	240($key),$rounds	# key->rounds
___
    aesni_generate1($op,$key,$rounds);
    $code.=<<___;
	pxor	$rndkey0,$rndkey0	# clear register bank
	pxor	$rndkey1,$rndkey1
	movups	$inout0,($out)		# output
	pxor	$inout0,$inout0
	ret
___
    $code.=$trailer;
}
}

# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why were 3x subroutines originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e.
# when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it originally made no sense to implement 2x subroutine.
# But times change and it became appropriate to spend extra 192 bytes
# on 2x subroutine on Atom Silvermont account. For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals to corresponding instructions latency. 8x is optimal for
# * Bridge and "super-optimal" for other Intel CPUs...

# Common shape of the aesni_generateN subs below:
#   - the single argument ("enc" or "dec") is spliced into the
#     aes${dir}/aes${dir}last mnemonics, the local loop labels and the
#     routine name, so "enc" yields _aesni_encryptN and "dec" yields
#     _aesni_decryptN;
#   - the emitted routine scales $rounds by 16 (shl 4), points $key at
#     32+16*rounds bytes past the start of the schedule and walks the
#     round keys with a negative index in %rax that is stepped by 32
#     until it reaches zero, i.e. every loop iteration applies two
#     round keys to all N blocks;
#   - $rndkey0/$rndkey1 alternate as the register holding the round key.

# Emits _aesni_[en|de]crypt2: two blocks in $inout0..$inout1.
sub aesni_generate2 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-1] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt2,\@abi-omnipotent
.align	16
_aesni_${dir}rypt2:
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax				# $rounds
	add	\$16,%rax

.L${dir}_loop2:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop2

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	ret
.size	_aesni_${dir}rypt2,.-_aesni_${dir}rypt2
___
}
# Emits _aesni_[en|de]crypt3: three blocks in $inout0..$inout2.
sub aesni_generate3 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-2] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt3,\@abi-omnipotent
.align	16
_aesni_${dir}rypt3:
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax				# $rounds
	add	\$16,%rax

.L${dir}_loop3:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop3

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	ret
.size	_aesni_${dir}rypt3,.-_aesni_${dir}rypt3
___
}
# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
#
# Emits _aesni_[en|de]crypt4: four blocks in $inout0..$inout3. The
# ".byte 0x0f,0x1f,0x00" in the preamble is a 3-byte NOP encoding;
# NOTE(review): presumably there to pad the loop entry - confirm.
sub aesni_generate4 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-3] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt4,\@abi-omnipotent
.align	16
_aesni_${dir}rypt4:
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	xorps	$rndkey0,$inout3
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax				# $rounds
	.byte	0x0f,0x1f,0x00
	add	\$16,%rax

.L${dir}_loop4:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop4

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	ret
.size	_aesni_${dir}rypt4,.-_aesni_${dir}rypt4
___
}
# Emits _aesni_[en|de]crypt6: six blocks in $inout0..$inout5. Unlike
# the 2x/3x/4x variants the first round on $inout0..$inout2 is issued
# in the preamble, interleaved with the round-0 xor of the remaining
# blocks, and the loop is then entered in the middle, at
# .L${dir}_loop6_enter.
sub aesni_generate6 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-5] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt6,\@abi-omnipotent
.align	16
_aesni_${dir}rypt6:
	$movkey		($key),$rndkey0
	shl		\$4,$rounds
	$movkey		16($key),$rndkey1
	xorps		$rndkey0,$inout0
	pxor		$rndkey0,$inout1
	pxor		$rndkey0,$inout2
	aes${dir}	$rndkey1,$inout0
	lea		32($key,$rounds),$key
	neg		%rax			# $rounds
	aes${dir}	$rndkey1,$inout1
	pxor		$rndkey0,$inout3
	pxor		$rndkey0,$inout4
	aes${dir}	$rndkey1,$inout2
	pxor		$rndkey0,$inout5
	$movkey		($key,%rax),$rndkey0
	add		\$16,%rax
	jmp		.L${dir}_loop6_enter
.align	16
.L${dir}_loop6:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
.L${dir}_loop6_enter:
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	aes${dir}	$rndkey0,$inout4
	aes${dir}	$rndkey0,$inout5
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop6

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	aes${dir}last	$rndkey0,$inout4
	aes${dir}last	$rndkey0,$inout5
	ret
.size	_aesni_${dir}rypt6,.-_aesni_${dir}rypt6
___
}
# Emits _aesni_[en|de]crypt8: eight blocks in $inout0..$inout7. The
# first round on $inout0..$inout1 is issued in the preamble and the
# loop is entered at .L${dir}_loop8_inner. The .L${dir}_loop8_enter
# label is not referenced inside this sub;
# NOTE(review): presumably it is a secondary entry point for code
# elsewhere in the file - confirm before removing or renaming it.
sub aesni_generate8 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-7] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt8,\@abi-omnipotent
.align	16
_aesni_${dir}rypt8:
	$movkey		($key),$rndkey0
	shl		\$4,$rounds
	$movkey		16($key),$rndkey1
	xorps		$rndkey0,$inout0
	xorps		$rndkey0,$inout1
	pxor		$rndkey0,$inout2
	pxor		$rndkey0,$inout3
	pxor		$rndkey0,$inout4
	lea		32($key,$rounds),$key
	neg		%rax			# $rounds
	aes${dir}	$rndkey1,$inout0
	pxor		$rndkey0,$inout5
	pxor		$rndkey0,$inout6
	aes${dir}	$rndkey1,$inout1
	pxor		$rndkey0,$inout7
	$movkey		($key,%rax),$rndkey0
	add		\$16,%rax
	jmp		.L${dir}_loop8_inner
.align	16
.L${dir}_loop8:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
.L${dir}_loop8_inner:
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}	$rndkey1,$inout6
	aes${dir}	$rndkey1,$inout7
.L${dir}_loop8_enter:
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	aes${dir}	$rndkey0,$inout4
	aes${dir}	$rndkey0,$inout5
	aes${dir}	$rndkey0,$inout6
	aes${dir}	$rndkey0,$inout7
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop8

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}	$rndkey1,$inout6
	aes${dir}	$rndkey1,$inout7
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	aes${dir}last	$rndkey0,$inout4
	aes${dir}last	$rndkey0,$inout5
	aes${dir}last	$rndkey0,$inout6
	aes${dir}last	$rndkey0,$inout7
	ret
.size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
___
}
# Instantiate the private N-way routines. The decrypt flavours are
# always emitted; the encrypt flavours only when $PREFIX is "aesni".
&aesni_generate2("enc") if ($PREFIX eq "aesni");
&aesni_generate2("dec");
&aesni_generate3("enc") if ($PREFIX eq
"aesni"); 581&aesni_generate3("dec"); 582&aesni_generate4("enc") if ($PREFIX eq "aesni"); 583&aesni_generate4("dec"); 584&aesni_generate6("enc") if ($PREFIX eq "aesni"); 585&aesni_generate6("dec"); 586&aesni_generate8("enc") if ($PREFIX eq "aesni"); 587&aesni_generate8("dec"); 588 589if ($PREFIX eq "aesni") { 590######################################################################## 591# void aesni_ecb_encrypt (const void *in, void *out, 592# size_t length, const AES_KEY *key, 593# int enc); 594$code.=<<___; 595.globl aesni_ecb_encrypt 596.type aesni_ecb_encrypt,\@function,5 597.align 16 598aesni_ecb_encrypt: 599___ 600$code.=<<___ if ($win64); 601 lea -0x58(%rsp),%rsp 602 movaps %xmm6,(%rsp) # offload $inout4..7 603 movaps %xmm7,0x10(%rsp) 604 movaps %xmm8,0x20(%rsp) 605 movaps %xmm9,0x30(%rsp) 606.Lecb_enc_body: 607___ 608$code.=<<___; 609 and \$-16,$len # if ($len<16) 610 jz .Lecb_ret # return 611 612 mov 240($key),$rounds # key->rounds 613 $movkey ($key),$rndkey0 614 mov $key,$key_ # backup $key 615 mov $rounds,$rnds_ # backup $rounds 616 test %r8d,%r8d # 5th argument 617 jz .Lecb_decrypt 618#--------------------------- ECB ENCRYPT ------------------------------# 619 cmp \$0x80,$len # if ($len<8*16) 620 jb .Lecb_enc_tail # short input 621 622 movdqu ($inp),$inout0 # load 8 input blocks 623 movdqu 0x10($inp),$inout1 624 movdqu 0x20($inp),$inout2 625 movdqu 0x30($inp),$inout3 626 movdqu 0x40($inp),$inout4 627 movdqu 0x50($inp),$inout5 628 movdqu 0x60($inp),$inout6 629 movdqu 0x70($inp),$inout7 630 lea 0x80($inp),$inp # $inp+=8*16 631 sub \$0x80,$len # $len-=8*16 (can be zero) 632 jmp .Lecb_enc_loop8_enter 633.align 16 634.Lecb_enc_loop8: 635 movups $inout0,($out) # store 8 output blocks 636 mov $key_,$key # restore $key 637 movdqu ($inp),$inout0 # load 8 input blocks 638 mov $rnds_,$rounds # restore $rounds 639 movups $inout1,0x10($out) 640 movdqu 0x10($inp),$inout1 641 movups $inout2,0x20($out) 642 movdqu 0x20($inp),$inout2 643 movups $inout3,0x30($out) 644 
movdqu 0x30($inp),$inout3 645 movups $inout4,0x40($out) 646 movdqu 0x40($inp),$inout4 647 movups $inout5,0x50($out) 648 movdqu 0x50($inp),$inout5 649 movups $inout6,0x60($out) 650 movdqu 0x60($inp),$inout6 651 movups $inout7,0x70($out) 652 lea 0x80($out),$out # $out+=8*16 653 movdqu 0x70($inp),$inout7 654 lea 0x80($inp),$inp # $inp+=8*16 655.Lecb_enc_loop8_enter: 656 657 call _aesni_encrypt8 658 659 sub \$0x80,$len 660 jnc .Lecb_enc_loop8 # loop if $len-=8*16 didn't borrow 661 662 movups $inout0,($out) # store 8 output blocks 663 mov $key_,$key # restore $key 664 movups $inout1,0x10($out) 665 mov $rnds_,$rounds # restore $rounds 666 movups $inout2,0x20($out) 667 movups $inout3,0x30($out) 668 movups $inout4,0x40($out) 669 movups $inout5,0x50($out) 670 movups $inout6,0x60($out) 671 movups $inout7,0x70($out) 672 lea 0x80($out),$out # $out+=8*16 673 add \$0x80,$len # restore real remaining $len 674 jz .Lecb_ret # done if ($len==0) 675 676.Lecb_enc_tail: # $len is less than 8*16 677 movups ($inp),$inout0 678 cmp \$0x20,$len 679 jb .Lecb_enc_one 680 movups 0x10($inp),$inout1 681 je .Lecb_enc_two 682 movups 0x20($inp),$inout2 683 cmp \$0x40,$len 684 jb .Lecb_enc_three 685 movups 0x30($inp),$inout3 686 je .Lecb_enc_four 687 movups 0x40($inp),$inout4 688 cmp \$0x60,$len 689 jb .Lecb_enc_five 690 movups 0x50($inp),$inout5 691 je .Lecb_enc_six 692 movdqu 0x60($inp),$inout6 693 xorps $inout7,$inout7 694 call _aesni_encrypt8 695 movups $inout0,($out) # store 7 output blocks 696 movups $inout1,0x10($out) 697 movups $inout2,0x20($out) 698 movups $inout3,0x30($out) 699 movups $inout4,0x40($out) 700 movups $inout5,0x50($out) 701 movups $inout6,0x60($out) 702 jmp .Lecb_ret 703.align 16 704.Lecb_enc_one: 705___ 706 &aesni_generate1("enc",$key,$rounds); 707$code.=<<___; 708 movups $inout0,($out) # store one output block 709 jmp .Lecb_ret 710.align 16 711.Lecb_enc_two: 712 call _aesni_encrypt2 713 movups $inout0,($out) # store 2 output blocks 714 movups $inout1,0x10($out) 715 jmp 
.Lecb_ret 716.align 16 717.Lecb_enc_three: 718 call _aesni_encrypt3 719 movups $inout0,($out) # store 3 output blocks 720 movups $inout1,0x10($out) 721 movups $inout2,0x20($out) 722 jmp .Lecb_ret 723.align 16 724.Lecb_enc_four: 725 call _aesni_encrypt4 726 movups $inout0,($out) # store 4 output blocks 727 movups $inout1,0x10($out) 728 movups $inout2,0x20($out) 729 movups $inout3,0x30($out) 730 jmp .Lecb_ret 731.align 16 732.Lecb_enc_five: 733 xorps $inout5,$inout5 734 call _aesni_encrypt6 735 movups $inout0,($out) # store 5 output blocks 736 movups $inout1,0x10($out) 737 movups $inout2,0x20($out) 738 movups $inout3,0x30($out) 739 movups $inout4,0x40($out) 740 jmp .Lecb_ret 741.align 16 742.Lecb_enc_six: 743 call _aesni_encrypt6 744 movups $inout0,($out) # store 6 output blocks 745 movups $inout1,0x10($out) 746 movups $inout2,0x20($out) 747 movups $inout3,0x30($out) 748 movups $inout4,0x40($out) 749 movups $inout5,0x50($out) 750 jmp .Lecb_ret 751#--------------------------- ECB DECRYPT ------------------------------# 752.align 16 753.Lecb_decrypt: 754 cmp \$0x80,$len # if ($len<8*16) 755 jb .Lecb_dec_tail # short input 756 757 movdqu ($inp),$inout0 # load 8 input blocks 758 movdqu 0x10($inp),$inout1 759 movdqu 0x20($inp),$inout2 760 movdqu 0x30($inp),$inout3 761 movdqu 0x40($inp),$inout4 762 movdqu 0x50($inp),$inout5 763 movdqu 0x60($inp),$inout6 764 movdqu 0x70($inp),$inout7 765 lea 0x80($inp),$inp # $inp+=8*16 766 sub \$0x80,$len # $len-=8*16 (can be zero) 767 jmp .Lecb_dec_loop8_enter 768.align 16 769.Lecb_dec_loop8: 770 movups $inout0,($out) # store 8 output blocks 771 mov $key_,$key # restore $key 772 movdqu ($inp),$inout0 # load 8 input blocks 773 mov $rnds_,$rounds # restore $rounds 774 movups $inout1,0x10($out) 775 movdqu 0x10($inp),$inout1 776 movups $inout2,0x20($out) 777 movdqu 0x20($inp),$inout2 778 movups $inout3,0x30($out) 779 movdqu 0x30($inp),$inout3 780 movups $inout4,0x40($out) 781 movdqu 0x40($inp),$inout4 782 movups $inout5,0x50($out) 783 movdqu 
0x50($inp),$inout5 784 movups $inout6,0x60($out) 785 movdqu 0x60($inp),$inout6 786 movups $inout7,0x70($out) 787 lea 0x80($out),$out # $out+=8*16 788 movdqu 0x70($inp),$inout7 789 lea 0x80($inp),$inp # $inp+=8*16 790.Lecb_dec_loop8_enter: 791 792 call _aesni_decrypt8 793 794 $movkey ($key_),$rndkey0 795 sub \$0x80,$len 796 jnc .Lecb_dec_loop8 # loop if $len-=8*16 didn't borrow 797 798 movups $inout0,($out) # store 8 output blocks 799 pxor $inout0,$inout0 # clear register bank 800 mov $key_,$key # restore $key 801 movups $inout1,0x10($out) 802 pxor $inout1,$inout1 803 mov $rnds_,$rounds # restore $rounds 804 movups $inout2,0x20($out) 805 pxor $inout2,$inout2 806 movups $inout3,0x30($out) 807 pxor $inout3,$inout3 808 movups $inout4,0x40($out) 809 pxor $inout4,$inout4 810 movups $inout5,0x50($out) 811 pxor $inout5,$inout5 812 movups $inout6,0x60($out) 813 pxor $inout6,$inout6 814 movups $inout7,0x70($out) 815 pxor $inout7,$inout7 816 lea 0x80($out),$out # $out+=8*16 817 add \$0x80,$len # restore real remaining $len 818 jz .Lecb_ret # done if ($len==0) 819 820.Lecb_dec_tail: 821 movups ($inp),$inout0 822 cmp \$0x20,$len 823 jb .Lecb_dec_one 824 movups 0x10($inp),$inout1 825 je .Lecb_dec_two 826 movups 0x20($inp),$inout2 827 cmp \$0x40,$len 828 jb .Lecb_dec_three 829 movups 0x30($inp),$inout3 830 je .Lecb_dec_four 831 movups 0x40($inp),$inout4 832 cmp \$0x60,$len 833 jb .Lecb_dec_five 834 movups 0x50($inp),$inout5 835 je .Lecb_dec_six 836 movups 0x60($inp),$inout6 837 $movkey ($key),$rndkey0 838 xorps $inout7,$inout7 839 call _aesni_decrypt8 840 movups $inout0,($out) # store 7 output blocks 841 pxor $inout0,$inout0 # clear register bank 842 movups $inout1,0x10($out) 843 pxor $inout1,$inout1 844 movups $inout2,0x20($out) 845 pxor $inout2,$inout2 846 movups $inout3,0x30($out) 847 pxor $inout3,$inout3 848 movups $inout4,0x40($out) 849 pxor $inout4,$inout4 850 movups $inout5,0x50($out) 851 pxor $inout5,$inout5 852 movups $inout6,0x60($out) 853 pxor $inout6,$inout6 854 pxor 
$inout7,$inout7 855 jmp .Lecb_ret 856.align 16 857.Lecb_dec_one: 858___ 859 &aesni_generate1("dec",$key,$rounds); 860$code.=<<___; 861 movups $inout0,($out) # store one output block 862 pxor $inout0,$inout0 # clear register bank 863 jmp .Lecb_ret 864.align 16 865.Lecb_dec_two: 866 call _aesni_decrypt2 867 movups $inout0,($out) # store 2 output blocks 868 pxor $inout0,$inout0 # clear register bank 869 movups $inout1,0x10($out) 870 pxor $inout1,$inout1 871 jmp .Lecb_ret 872.align 16 873.Lecb_dec_three: 874 call _aesni_decrypt3 875 movups $inout0,($out) # store 3 output blocks 876 pxor $inout0,$inout0 # clear register bank 877 movups $inout1,0x10($out) 878 pxor $inout1,$inout1 879 movups $inout2,0x20($out) 880 pxor $inout2,$inout2 881 jmp .Lecb_ret 882.align 16 883.Lecb_dec_four: 884 call _aesni_decrypt4 885 movups $inout0,($out) # store 4 output blocks 886 pxor $inout0,$inout0 # clear register bank 887 movups $inout1,0x10($out) 888 pxor $inout1,$inout1 889 movups $inout2,0x20($out) 890 pxor $inout2,$inout2 891 movups $inout3,0x30($out) 892 pxor $inout3,$inout3 893 jmp .Lecb_ret 894.align 16 895.Lecb_dec_five: 896 xorps $inout5,$inout5 897 call _aesni_decrypt6 898 movups $inout0,($out) # store 5 output blocks 899 pxor $inout0,$inout0 # clear register bank 900 movups $inout1,0x10($out) 901 pxor $inout1,$inout1 902 movups $inout2,0x20($out) 903 pxor $inout2,$inout2 904 movups $inout3,0x30($out) 905 pxor $inout3,$inout3 906 movups $inout4,0x40($out) 907 pxor $inout4,$inout4 908 pxor $inout5,$inout5 909 jmp .Lecb_ret 910.align 16 911.Lecb_dec_six: 912 call _aesni_decrypt6 913 movups $inout0,($out) # store 6 output blocks 914 pxor $inout0,$inout0 # clear register bank 915 movups $inout1,0x10($out) 916 pxor $inout1,$inout1 917 movups $inout2,0x20($out) 918 pxor $inout2,$inout2 919 movups $inout3,0x30($out) 920 pxor $inout3,$inout3 921 movups $inout4,0x40($out) 922 pxor $inout4,$inout4 923 movups $inout5,0x50($out) 924 pxor $inout5,$inout5 925 926.Lecb_ret: 927 xorps 
$rndkey0,$rndkey0		# %xmm0
	pxor	$rndkey1,$rndkey1
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	%xmm0,(%rsp)		# clear stack
	movaps	0x10(%rsp),%xmm7
	movaps	%xmm0,0x10(%rsp)
	movaps	0x20(%rsp),%xmm8
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm9
	movaps	%xmm0,0x30(%rsp)
	lea	0x58(%rsp),%rsp
.Lecb_enc_ret:
___
$code.=<<___;
	ret
.size	aesni_ecb_encrypt,.-aesni_ecb_encrypt
___

{
######################################################################
# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec,char *cmac);
#
# Handles only complete blocks, operates on 64-bit counter and
# does not update *ivec! Nor does it finalize CMAC value
# (see engine/eng_aesni.c for details)
#
# Implementation notes (apply to both routines):
#
# - One 16-byte block is handled per outer-loop iteration with two AES
#   states in flight: $inout0 is the counter block whose encryption is
#   xor-ed with the data, $inout1 is the running CBC-MAC, loaded from
#   *cmac on entry and stored back to *cmac on exit.
# - $iv is kept byte-swapped (pshufb with .Lbswap_mask) and is advanced
#   with "paddq $increment" once per block; every iteration copies it to
#   $inout0 and swaps it back before it is encrypted.
# - $key_ keeps the start of the key schedule while $key is moved to its
#   end. %r10 caches the "twisted" round count, 16-16*rounds, which is
#   copied to %rax at the top of every block; the inner loop adds 32 to
#   %rax and stops when it reaches zero (jnz). The bare %rax/%r10
#   operands are used interchangeably with $rounds/$rnds_ here; those
#   variables are assigned outside this section.
# - With $win64 set, %xmm6-%xmm9 are saved in a 0x58-byte frame; on exit
#   each slot is overwritten with %xmm0 (already zeroed as $rndkey0)
#   right after the register is reloaded from it.
# - All XMM registers that held key, counter, MAC or data are zeroed
#   before returning.
#
{
my $cmac="%r9";	# 6th argument

my $increment="%xmm9";
my $iv="%xmm6";
my $bswap_mask="%xmm7";

$code.=<<___;
.globl	aesni_ccm64_encrypt_blocks
.type	aesni_ccm64_encrypt_blocks,\@function,6
.align	16
aesni_ccm64_encrypt_blocks:
___
$code.=<<___ if ($win64);
	lea	-0x58(%rsp),%rsp
	movaps	%xmm6,(%rsp)		# $iv
	movaps	%xmm7,0x10(%rsp)	# $bswap_mask
	movaps	%xmm8,0x20(%rsp)	# $in0
	movaps	%xmm9,0x30(%rsp)	# $increment
.Lccm64_enc_body:
___
$code.=<<___;
	mov	240($key),$rounds		# key->rounds
	movdqu	($ivp),$iv
	movdqa	.Lincrement64(%rip),$increment
	movdqa	.Lbswap_mask(%rip),$bswap_mask

	shl	\$4,$rounds
	mov	\$16,$rnds_
	lea	0($key),$key_
	movdqu	($cmac),$inout1
	movdqa	$iv,$inout0
	lea	32($key,$rounds),$key		# end of key schedule
	pshufb	$bswap_mask,$iv
	sub	%rax,%r10			# twisted $rounds
	jmp	.Lccm64_enc_outer
.align	16
.Lccm64_enc_outer:
	$movkey	($key_),$rndkey0
	mov	%r10,%rax
	movups	($inp),$in0			# load inp

	xorps	$rndkey0,$inout0		# counter
	$movkey	16($key_),$rndkey1
	xorps	$in0,$rndkey0
	xorps	$rndkey0,$inout1		# cmac^=inp
	$movkey	32($key_),$rndkey0

.Lccm64_enc2_loop:
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Lccm64_enc2_loop
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	paddq	$increment,$iv
	dec	$len				# $len-- ($len is in blocks)
	aesenclast	$rndkey0,$inout0
	aesenclast	$rndkey0,$inout1

	lea	16($inp),$inp
	xorps	$inout0,$in0			# inp ^= E(iv)
	movdqa	$iv,$inout0
	movups	$in0,($out)			# save output
	pshufb	$bswap_mask,$inout0
	lea	16($out),$out			# $out+=16
	jnz	.Lccm64_enc_outer		# loop if ($len!=0)

	pxor	$rndkey0,$rndkey0		# clear register bank
	pxor	$rndkey1,$rndkey1
	pxor	$inout0,$inout0
	movups	$inout1,($cmac)			# store resulting mac
	pxor	$inout1,$inout1
	pxor	$in0,$in0
	pxor	$iv,$iv
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	%xmm0,(%rsp)			# clear stack
	movaps	0x10(%rsp),%xmm7
	movaps	%xmm0,0x10(%rsp)
	movaps	0x20(%rsp),%xmm8
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm9
	movaps	%xmm0,0x30(%rsp)
	lea	0x58(%rsp),%rsp
.Lccm64_enc_ret:
___
$code.=<<___;
	ret
.size	aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
___
######################################################################
# Decrypt differs from encrypt in ordering only: the MAC is taken over
# the output block, which is not known until E(iv) has been applied.
# Hence
# - E(iv) for the first block is produced up front by aesni_generate1;
# - each .Lccm64_dec_outer iteration emits the output block first and
#   then, unless it was the last one, runs the next counter encryption
#   interleaved with the MAC update (cmac^=out);
# - the MAC update for the final block is finished on its own by the
#   aesni_generate1 call after .Lccm64_dec_break.
# NOTE(review): aesni_generate1 is defined elsewhere in this file; its
# 4th/5th arguments ($inout1,$in0) appear to select the block register
# and a value folded in before encryption, standing in for the
# commented-out xorps below -- confirm against its definition.
$code.=<<___;
.globl	aesni_ccm64_decrypt_blocks
.type	aesni_ccm64_decrypt_blocks,\@function,6
.align	16
aesni_ccm64_decrypt_blocks:
___
$code.=<<___ if ($win64);
	lea	-0x58(%rsp),%rsp
	movaps	%xmm6,(%rsp)		# $iv
	movaps	%xmm7,0x10(%rsp)	# $bswap_mask
	movaps	%xmm8,0x20(%rsp)	# $in0
	movaps	%xmm9,0x30(%rsp)	# $increment
.Lccm64_dec_body:
___
$code.=<<___;
	mov	240($key),$rounds		# key->rounds
	movups	($ivp),$iv
	movdqu	($cmac),$inout1
	movdqa	.Lincrement64(%rip),$increment
	movdqa	.Lbswap_mask(%rip),$bswap_mask

	movaps	$iv,$inout0
	mov	$rounds,$rnds_
	mov	$key,$key_
	pshufb	$bswap_mask,$iv
___
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	shl	\$4,$rnds_
	mov	\$16,$rounds
	movups	($inp),$in0			# load inp
	paddq	$increment,$iv
	lea	16($inp),$inp			# $inp+=16
	sub	%r10,%rax			# twisted $rounds
	lea	32($key_,$rnds_),$key		# end of key schedule
	mov	%rax,%r10
	jmp	.Lccm64_dec_outer
.align	16
.Lccm64_dec_outer:
	xorps	$inout0,$in0			# inp ^= E(iv)
	movdqa	$iv,$inout0
	movups	$in0,($out)			# save output
	lea	16($out),$out			# $out+=16
	pshufb	$bswap_mask,$inout0

	sub	\$1,$len			# $len-- ($len is in blocks)
	jz	.Lccm64_dec_break		# if ($len==0) break

	$movkey	($key_),$rndkey0
	mov	%r10,%rax
	$movkey	16($key_),$rndkey1
	xorps	$rndkey0,$in0
	xorps	$rndkey0,$inout0
	xorps	$in0,$inout1			# cmac^=out
	$movkey	32($key_),$rndkey0
	jmp	.Lccm64_dec2_loop
.align	16
.Lccm64_dec2_loop:
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Lccm64_dec2_loop
	movups	($inp),$in0			# load input
	paddq	$increment,$iv
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenclast	$rndkey0,$inout0
	aesenclast	$rndkey0,$inout1
	lea	16($inp),$inp			# $inp+=16
	jmp	.Lccm64_dec_outer

.align	16
.Lccm64_dec_break:
	#xorps	$in0,$inout1			# cmac^=out
	mov	240($key_),$rounds
___
	&aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
$code.=<<___;
	pxor	$rndkey0,$rndkey0		# clear register bank
	pxor	$rndkey1,$rndkey1
	pxor	$inout0,$inout0
	movups	$inout1,($cmac)			# store resulting mac
	pxor	$inout1,$inout1
	pxor	$in0,$in0
	pxor	$iv,$iv
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	%xmm0,(%rsp)			# clear stack
	movaps	0x10(%rsp),%xmm7
	movaps	%xmm0,0x10(%rsp)
	movaps	0x20(%rsp),%xmm8
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm9
	movaps	%xmm0,0x30(%rsp)
	lea	0x58(%rsp),%rsp
.Lccm64_dec_ret:
___
$code.=<<___;
	ret
.size	aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
___
}
######################################################################
# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
# does not update *ivec! (see crypto/modes/ctr128.c for details)
#
# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
# Keywords are full unroll and modulo-schedule counter calculations
# with zero-round key xor.
{
my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
my ($key0,$ctr)=("%ebp","${ivp}d");
my $frame_size = 0x80 + ($win64?160:0);

# Conventions used by the generated aesni_ctr32_encrypt_blocks:
#
#   $in0..$in5	%xmm10..%xmm15, scratch for input blocks
#   $key0	%ebp, last dword of the round-0 key; it is xor-ed into
#		every counter word before that word is stored
#   $ctr	32-bit form of $ivp, holding the byte-swapped last dword
#		of the IV so it can be advanced with add/lea
#   frame	0x80 bytes at (%rsp): eight counter blocks already xor-ed
#		with the round-0 key; plus 160 bytes for %xmm6-%xmm15 when
#		$win64 is set. $key_ keeps the original %rsp (frame
#		pointer); %rbp is pushed just below it.
#
# Code paths, selected by $len (in blocks) and OPENSSL_ia32cap_P:
#
#   $len == 1		single block, no stack frame
#   $len < 8		.Lctr32_tail: .Lctr32_loop3 (up to 3 blocks),
#			.Lctr32_loop4 (4 blocks) or .Lenc_loop8_enter
#			with $inout7 zeroed (5-7 blocks)
#   MOVBE w/o XSAVE	.Lctr32_loop6, 6 blocks per iteration; $key0 is
#			byte-swapped once and counters are stored with movbe
#   otherwise		.Lctr32_loop8, 8 blocks per iteration, unrolled
#
# What is left over after either bulk loop goes through .Lctr32_tail.
# .Lenc_loop6 and .Lenc_loop8_enter are labels defined elsewhere in this
# file. On exit the XMM registers and the counter area of the frame are
# zeroed.
# NOTE(review): a zero block count is not special-cased; it takes the
# .Lctr32_loop3 path, which stores its first block before looking at
# $len -- callers are presumably expected to pass $len >= 1.

$code.=<<___;
.globl	aesni_ctr32_encrypt_blocks
.type	aesni_ctr32_encrypt_blocks,\@function,5
.align	16
aesni_ctr32_encrypt_blocks:
	cmp	\$1,$len
	jne	.Lctr32_bulk

	# handle single block without allocating stack frame,
	# useful when handling edges
	movups	($ivp),$inout0
	movups	($inp),$inout1
	mov	240($key),%edx			# key->rounds
___
	&aesni_generate1("enc",$key,"%edx");
$code.=<<___;
	pxor	$rndkey0,$rndkey0		# clear register bank
	pxor	$rndkey1,$rndkey1
	xorps	$inout1,$inout0
	pxor	$inout1,$inout1
	movups	$inout0,($out)
	xorps	$inout0,$inout0
	jmp	.Lctr32_epilogue

.align	16
.Lctr32_bulk:
	lea	(%rsp),$key_			# use $key_ as frame pointer
	push	%rbp
	sub	\$$frame_size,%rsp
	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
	movaps	%xmm6,-0xa8($key_)		# offload everything
	movaps	%xmm7,-0x98($key_)
	movaps	%xmm8,-0x88($key_)
	movaps	%xmm9,-0x78($key_)
	movaps	%xmm10,-0x68($key_)
	movaps	%xmm11,-0x58($key_)
	movaps	%xmm12,-0x48($key_)
	movaps	%xmm13,-0x38($key_)
	movaps	%xmm14,-0x28($key_)
	movaps	%xmm15,-0x18($key_)
.Lctr32_body:
___
$code.=<<___;

	# 8 16-byte words on top of stack are counter values
	# xor-ed with zero-round key

	movdqu	($ivp),$inout0
	movdqu	($key),$rndkey0
	mov	12($ivp),$ctr			# counter LSB
	pxor	$rndkey0,$inout0
	mov	12($key),$key0			# 0-round key LSB
	movdqa	$inout0,0x00(%rsp)		# populate counter block
	bswap	$ctr
	movdqa	$inout0,$inout1
	movdqa	$inout0,$inout2
	movdqa	$inout0,$inout3
	movdqa	$inout0,0x40(%rsp)
	movdqa	$inout0,0x50(%rsp)
	movdqa	$inout0,0x60(%rsp)
	mov	%rdx,%r10			# about to borrow %rdx
	movdqa	$inout0,0x70(%rsp)

	lea	1($ctr),%rax
	lea	2($ctr),%rdx
	bswap	%eax
	bswap	%edx
	xor	$key0,%eax
	xor	$key0,%edx
	pinsrd	\$3,%eax,$inout1
	lea	3($ctr),%rax
	movdqa	$inout1,0x10(%rsp)
	pinsrd	\$3,%edx,$inout2
	bswap	%eax
	mov	%r10,%rdx			# restore %rdx
	lea	4($ctr),%r10
	movdqa	$inout2,0x20(%rsp)
	xor	$key0,%eax
	bswap	%r10d
	pinsrd	\$3,%eax,$inout3
	xor	$key0,%r10d
	movdqa	$inout3,0x30(%rsp)
	lea	5($ctr),%r9
	mov	%r10d,0x40+12(%rsp)
	bswap	%r9d
	lea	6($ctr),%r10
	mov	240($key),$rounds		# key->rounds
	xor	$key0,%r9d
	bswap	%r10d
	mov	%r9d,0x50+12(%rsp)
	xor	$key0,%r10d
	lea	7($ctr),%r9
	mov	%r10d,0x60+12(%rsp)
	bswap	%r9d
	mov	OPENSSL_ia32cap_P+4(%rip),%r10d
	xor	$key0,%r9d
	and	\$`1<<26|1<<22`,%r10d		# isolate XSAVE+MOVBE
	mov	%r9d,0x70+12(%rsp)

	$movkey	0x10($key),$rndkey1

	movdqa	0x40(%rsp),$inout4
	movdqa	0x50(%rsp),$inout5

	cmp	\$8,$len		# $len is in blocks
	jb	.Lctr32_tail		# short input if ($len<8)

	sub	\$6,$len		# $len is biased by -6
	cmp	\$`1<<22`,%r10d		# check for MOVBE without XSAVE
	je	.Lctr32_6x		# [which denotes Atom Silvermont]

	lea	0x80($key),$key		# size optimization
	sub	\$2,$len		# $len is biased by -8
	jmp	.Lctr32_loop8

.align	16
.Lctr32_6x:
	shl	\$4,$rounds
	mov	\$48,$rnds_
	bswap	$key0
	lea	32($key,$rounds),$key	# end of key schedule
	sub	%rax,%r10		# twisted $rounds
	jmp	.Lctr32_loop6

.align	16
.Lctr32_loop6:
	add	\$6,$ctr		# next counter value
	$movkey	-48($key,$rnds_),$rndkey0
	aesenc	$rndkey1,$inout0
	mov	$ctr,%eax
	xor	$key0,%eax
	aesenc	$rndkey1,$inout1
	movbe	%eax,`0x00+12`(%rsp)	# store next counter value
	lea	1($ctr),%eax
	aesenc	$rndkey1,$inout2
	xor	$key0,%eax
	movbe	%eax,`0x10+12`(%rsp)
	aesenc	$rndkey1,$inout3
	lea	2($ctr),%eax
	xor	$key0,%eax
	aesenc	$rndkey1,$inout4
	movbe	%eax,`0x20+12`(%rsp)
	lea	3($ctr),%eax
	aesenc	$rndkey1,$inout5
	$movkey	-32($key,$rnds_),$rndkey1
	xor	$key0,%eax

	aesenc	$rndkey0,$inout0
	movbe	%eax,`0x30+12`(%rsp)
	lea	4($ctr),%eax
	aesenc	$rndkey0,$inout1
	xor	$key0,%eax
	movbe	%eax,`0x40+12`(%rsp)
	aesenc	$rndkey0,$inout2
	lea	5($ctr),%eax
	xor	$key0,%eax
	aesenc	$rndkey0,$inout3
	movbe	%eax,`0x50+12`(%rsp)
	mov	%r10,%rax		# mov	$rnds_,$rounds
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	$movkey	-16($key,$rnds_),$rndkey0

	call	.Lenc_loop6

	movdqu	($inp),$inout6		# load 6 input blocks
	movdqu	0x10($inp),$inout7
	movdqu	0x20($inp),$in0
	movdqu	0x30($inp),$in1
	movdqu	0x40($inp),$in2
	movdqu	0x50($inp),$in3
	lea	0x60($inp),$inp		# $inp+=6*16
	$movkey	-64($key,$rnds_),$rndkey1
	pxor	$inout0,$inout6		# inp^=E(ctr)
	movaps	0x00(%rsp),$inout0	# load next counter [xor-ed with 0 round]
	pxor	$inout1,$inout7
	movaps	0x10(%rsp),$inout1
	pxor	$inout2,$in0
	movaps	0x20(%rsp),$inout2
	pxor	$inout3,$in1
	movaps	0x30(%rsp),$inout3
	pxor	$inout4,$in2
	movaps	0x40(%rsp),$inout4
	pxor	$inout5,$in3
	movaps	0x50(%rsp),$inout5
	movdqu	$inout6,($out)		# store 6 output blocks
	movdqu	$inout7,0x10($out)
	movdqu	$in0,0x20($out)
	movdqu	$in1,0x30($out)
	movdqu	$in2,0x40($out)
	movdqu	$in3,0x50($out)
	lea	0x60($out),$out		# $out+=6*16

	sub	\$6,$len
	jnc	.Lctr32_loop6		# loop if $len-=6 didn't borrow

	add	\$6,$len		# restore real remaining $len
	jz	.Lctr32_done		# done if ($len==0)

	lea	-48($rnds_),$rounds
	lea	-80($key,$rnds_),$key	# restore $key
	neg	$rounds
	shr	\$4,$rounds		# restore $rounds
	jmp	.Lctr32_tail

.align	32
.Lctr32_loop8:
	add	\$8,$ctr		# next counter value
	movdqa	0x60(%rsp),$inout6
	aesenc	$rndkey1,$inout0
	mov	$ctr,%r9d
	movdqa	0x70(%rsp),$inout7
	aesenc	$rndkey1,$inout1
	bswap	%r9d
	$movkey	0x20-0x80($key),$rndkey0
	aesenc	$rndkey1,$inout2
	xor	$key0,%r9d
	nop
	aesenc	$rndkey1,$inout3
	mov	%r9d,0x00+12(%rsp)	# store next counter value
	lea	1($ctr),%r9
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6
	aesenc	$rndkey1,$inout7
	$movkey	0x30-0x80($key),$rndkey1
___
# Rounds 2..7 of the 8x loop share one template. Each applies one round
# key to all eight states while the scalar side finishes the counter for
# stack slot $i-1 of the next batch (bswap, xor with $key0, store) and
# starts computing counter+$i. Round keys alternate between $rndkey0
# (even $i) and $rndkey1 (odd $i); the register just used is reloaded
# with the key two rounds ahead. The loop variable is lexical so this
# template does not touch a package-level $i.
for my $i (2..7) {
my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
$code.=<<___;
	bswap	%r9d
	aesenc	$rndkeyx,$inout0
	aesenc	$rndkeyx,$inout1
	xor	$key0,%r9d
	.byte	0x66,0x90
	aesenc	$rndkeyx,$inout2
	aesenc	$rndkeyx,$inout3
	mov	%r9d,`0x10*($i-1)`+12(%rsp)
	lea	$i($ctr),%r9
	aesenc	$rndkeyx,$inout4
	aesenc	$rndkeyx,$inout5
	aesenc	$rndkeyx,$inout6
	aesenc	$rndkeyx,$inout7
	$movkey	`0x20+0x10*$i`-0x80($key),$rndkeyx
___
}
# The comparison of $rounds with 11 below picks how many more round
# pairs follow: none when below 11, one when equal, two otherwise. Its
# flags are consumed twice (jb, then je), with only aesenc/$movkey in
# between.
$code.=<<___;
	bswap	%r9d
	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	xor	$key0,%r9d
	movdqu	0x00($inp),$in0		# start loading input
	aesenc	$rndkey0,$inout3
	mov	%r9d,0x70+12(%rsp)
	cmp	\$11,$rounds
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	aesenc	$rndkey0,$inout6
	aesenc	$rndkey0,$inout7
	$movkey	0xa0-0x80($key),$rndkey0

	jb	.Lctr32_enc_done

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6
	aesenc	$rndkey1,$inout7
	$movkey	0xb0-0x80($key),$rndkey1

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	aesenc	$rndkey0,$inout6
	aesenc	$rndkey0,$inout7
	$movkey	0xc0-0x80($key),$rndkey0
	je	.Lctr32_enc_done

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6
	aesenc	$rndkey1,$inout7
	$movkey	0xd0-0x80($key),$rndkey1

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	aesenc	$rndkey0,$inout6
	aesenc	$rndkey0,$inout7
	$movkey	0xe0-0x80($key),$rndkey0
	jmp	.Lctr32_enc_done

.align	16
.Lctr32_enc_done:
	movdqu	0x10($inp),$in1
	pxor	$rndkey0,$in0		# input^=round[last]
	movdqu	0x20($inp),$in2
	pxor	$rndkey0,$in1
	movdqu	0x30($inp),$in3
	pxor	$rndkey0,$in2
	movdqu	0x40($inp),$in4
	pxor	$rndkey0,$in3
	movdqu	0x50($inp),$in5
	pxor	$rndkey0,$in4
	pxor	$rndkey0,$in5
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6
	aesenc	$rndkey1,$inout7
	movdqu	0x60($inp),$rndkey1	# borrow $rndkey1 for inp[6]
	lea	0x80($inp),$inp		# $inp+=8*16

	aesenclast	$in0,$inout0	# $inN is inp[N]^round[last]
	pxor	$rndkey0,$rndkey1	# borrowed $rndkey
	movdqu	0x70-0x80($inp),$in0
	aesenclast	$in1,$inout1
	pxor	$rndkey0,$in0
	movdqa	0x00(%rsp),$in1		# load next counter block
	aesenclast	$in2,$inout2
	aesenclast	$in3,$inout3
	movdqa	0x10(%rsp),$in2
	movdqa	0x20(%rsp),$in3
	aesenclast	$in4,$inout4
	aesenclast	$in5,$inout5
	movdqa	0x30(%rsp),$in4
	movdqa	0x40(%rsp),$in5
	aesenclast	$rndkey1,$inout6
	movdqa	0x50(%rsp),$rndkey0
	$movkey	0x10-0x80($key),$rndkey1#real 1st-round key
	aesenclast	$in0,$inout7

	movups	$inout0,($out)		# store 8 output blocks
	movdqa	$in1,$inout0
	movups	$inout1,0x10($out)
	movdqa	$in2,$inout1
	movups	$inout2,0x20($out)
	movdqa	$in3,$inout2
	movups	$inout3,0x30($out)
	movdqa	$in4,$inout3
	movups	$inout4,0x40($out)
	movdqa	$in5,$inout4
	movups	$inout5,0x50($out)
	movdqa	$rndkey0,$inout5
	movups	$inout6,0x60($out)
	movups	$inout7,0x70($out)
	lea	0x80($out),$out		# $out+=8*16

	sub	\$8,$len
	jnc	.Lctr32_loop8		# loop if $len-=8 didn't borrow

	add	\$8,$len		# restore real remaining $len
	jz	.Lctr32_done		# done if ($len==0)
	lea	-0x80($key),$key

.Lctr32_tail:
	# note that at this point $inout0..5 are populated with
	# counter values xor-ed with 0-round key
	lea	16($key),$key
	cmp	\$4,$len
	jb	.Lctr32_loop3
	je	.Lctr32_loop4

	# if ($len>4) compute 7 E(counter)
	shl	\$4,$rounds
	movdqa	0x60(%rsp),$inout6
	pxor	$inout7,$inout7

	$movkey	16($key),$rndkey0
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	lea	32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter
	neg	%rax
	aesenc	$rndkey1,$inout2
	add	\$16,%rax		# prepare for .Lenc_loop8_enter
	movups	($inp),$in0
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	movups	0x10($inp),$in1		# pre-load input
	movups	0x20($inp),$in2
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6

	call	.Lenc_loop8_enter

	movdqu	0x30($inp),$in3
	pxor	$in0,$inout0
	movdqu	0x40($inp),$in0
	pxor	$in1,$inout1
	movdqu	$inout0,($out)		# store output
	pxor	$in2,$inout2
	movdqu	$inout1,0x10($out)
	pxor	$in3,$inout3
	movdqu	$inout2,0x20($out)
	pxor	$in0,$inout4
	movdqu	$inout3,0x30($out)
	movdqu	$inout4,0x40($out)
	cmp	\$6,$len
	jb	.Lctr32_done		# $len was 5, stop store

	movups	0x50($inp),$in1
	xorps	$in1,$inout5
	movups	$inout5,0x50($out)
	je	.Lctr32_done		# $len was 6, stop store

	movups	0x60($inp),$in2
	xorps	$in2,$inout6
	movups	$inout6,0x60($out)
	jmp	.Lctr32_done		# $len was 7, stop store

.align	32
.Lctr32_loop4:
	aesenc	$rndkey1,$inout0
	lea	16($key),$key
	dec	$rounds
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	$movkey	($key),$rndkey1
	jnz	.Lctr32_loop4
	aesenclast	$rndkey1,$inout0
	aesenclast	$rndkey1,$inout1
	movups	($inp),$in0		# load input
	movups	0x10($inp),$in1
	aesenclast	$rndkey1,$inout2
	aesenclast	$rndkey1,$inout3
	movups	0x20($inp),$in2
	movups	0x30($inp),$in3

	xorps	$in0,$inout0
	movups	$inout0,($out)		# store output
	xorps	$in1,$inout1
	movups	$inout1,0x10($out)
	pxor	$in2,$inout2
	movdqu	$inout2,0x20($out)
	pxor	$in3,$inout3
	movdqu	$inout3,0x30($out)
	jmp	.Lctr32_done		# $len was 4, stop store

.align	32
.Lctr32_loop3:
	aesenc	$rndkey1,$inout0
	lea	16($key),$key
	dec	$rounds
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	$movkey	($key),$rndkey1
	jnz	.Lctr32_loop3
	aesenclast	$rndkey1,$inout0
	aesenclast	$rndkey1,$inout1
	aesenclast	$rndkey1,$inout2

	movups	($inp),$in0		# load input
	xorps	$in0,$inout0
	movups	$inout0,($out)		# store output
	cmp	\$2,$len
	jb	.Lctr32_done		# $len was 1, stop store

	movups	0x10($inp),$in1
	xorps	$in1,$inout1
	movups	$inout1,0x10($out)
	je	.Lctr32_done		# $len was 2, stop store

	movups	0x20($inp),$in2
	xorps	$in2,$inout2
	movups	$inout2,0x20($out)	# $len was 3, stop store

.Lctr32_done:
	xorps	%xmm0,%xmm0		# clear register bank
	xor	$key0,$key0
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
$code.=<<___ if (!$win64);
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	movaps	%xmm0,0x00(%rsp)	# clear stack
	pxor	%xmm8,%xmm8
	movaps	%xmm0,0x10(%rsp)
	pxor	%xmm9,%xmm9
	movaps	%xmm0,0x20(%rsp)
	pxor	%xmm10,%xmm10
	movaps	%xmm0,0x30(%rsp)
	pxor	%xmm11,%xmm11
	movaps	%xmm0,0x40(%rsp)
	pxor	%xmm12,%xmm12
	movaps	%xmm0,0x50(%rsp)
	pxor	%xmm13,%xmm13
	movaps	%xmm0,0x60(%rsp)
	pxor	%xmm14,%xmm14
	movaps	%xmm0,0x70(%rsp)
	pxor	%xmm15,%xmm15
___
$code.=<<___ if ($win64);
	movaps	-0xa8($key_),%xmm6
	movaps	%xmm0,-0xa8($key_)	# clear stack
	movaps	-0x98($key_),%xmm7
	movaps	%xmm0,-0x98($key_)
	movaps	-0x88($key_),%xmm8
	movaps	%xmm0,-0x88($key_)
	movaps	-0x78($key_),%xmm9
	movaps	%xmm0,-0x78($key_)
	movaps	-0x68($key_),%xmm10
	movaps	%xmm0,-0x68($key_)
	movaps	-0x58($key_),%xmm11
	movaps	%xmm0,-0x58($key_)
	movaps	-0x48($key_),%xmm12
	movaps	%xmm0,-0x48($key_)
	movaps	-0x38($key_),%xmm13
	movaps	%xmm0,-0x38($key_)
	movaps	-0x28($key_),%xmm14
	movaps	%xmm0,-0x28($key_)
	movaps	-0x18($key_),%xmm15
	movaps	%xmm0,-0x18($key_)
	movaps	%xmm0,0x00(%rsp)
	movaps	%xmm0,0x10(%rsp)
	movaps	%xmm0,0x20(%rsp)
	movaps	%xmm0,0x30(%rsp)
	movaps	%xmm0,0x40(%rsp)
	movaps	%xmm0,0x50(%rsp)
	movaps	%xmm0,0x60(%rsp)
	movaps	%xmm0,0x70(%rsp)
___
$code.=<<___;
	mov	-8($key_),%rbp
	lea	($key_),%rsp
.Lctr32_epilogue:
	ret
.size	aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
___
}

######################################################################
# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2,
#	const unsigned char iv[16]);
#
{
my @tweak=map("%xmm$_",(10..15));
my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
my $frame_size = 0x70 + ($win64?160:0);
my $key_ = "%rbp";			# override so that we can use %r11 as FP

$code.=<<___;
.globl	aesni_xts_encrypt
.type
aesni_xts_encrypt,\@function,6 1747.align 16 1748aesni_xts_encrypt: 1749 lea (%rsp),%r11 # frame pointer 1750 push %rbp 1751 sub \$$frame_size,%rsp 1752 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 1753___ 1754$code.=<<___ if ($win64); 1755 movaps %xmm6,-0xa8(%r11) # offload everything 1756 movaps %xmm7,-0x98(%r11) 1757 movaps %xmm8,-0x88(%r11) 1758 movaps %xmm9,-0x78(%r11) 1759 movaps %xmm10,-0x68(%r11) 1760 movaps %xmm11,-0x58(%r11) 1761 movaps %xmm12,-0x48(%r11) 1762 movaps %xmm13,-0x38(%r11) 1763 movaps %xmm14,-0x28(%r11) 1764 movaps %xmm15,-0x18(%r11) 1765.Lxts_enc_body: 1766___ 1767$code.=<<___; 1768 movups ($ivp),$inout0 # load clear-text tweak 1769 mov 240(%r8),$rounds # key2->rounds 1770 mov 240($key),$rnds_ # key1->rounds 1771___ 1772 # generate the tweak 1773 &aesni_generate1("enc",$key2,$rounds,$inout0); 1774$code.=<<___; 1775 $movkey ($key),$rndkey0 # zero round key 1776 mov $key,$key_ # backup $key 1777 mov $rnds_,$rounds # backup $rounds 1778 shl \$4,$rnds_ 1779 mov $len,$len_ # backup $len 1780 and \$-16,$len 1781 1782 $movkey 16($key,$rnds_),$rndkey1 # last round key 1783 1784 movdqa .Lxts_magic(%rip),$twmask 1785 movdqa $inout0,@tweak[5] 1786 pshufd \$0x5f,$inout0,$twres 1787 pxor $rndkey0,$rndkey1 1788___ 1789 # alternative tweak calculation algorithm is based on suggestions 1790 # by Shay Gueron. psrad doesn't conflict with AES-NI instructions 1791 # and should help in the future... 
1792 for ($i=0;$i<4;$i++) { 1793 $code.=<<___; 1794 movdqa $twres,$twtmp 1795 paddd $twres,$twres 1796 movdqa @tweak[5],@tweak[$i] 1797 psrad \$31,$twtmp # broadcast upper bits 1798 paddq @tweak[5],@tweak[5] 1799 pand $twmask,$twtmp 1800 pxor $rndkey0,@tweak[$i] 1801 pxor $twtmp,@tweak[5] 1802___ 1803 } 1804$code.=<<___; 1805 movdqa @tweak[5],@tweak[4] 1806 psrad \$31,$twres 1807 paddq @tweak[5],@tweak[5] 1808 pand $twmask,$twres 1809 pxor $rndkey0,@tweak[4] 1810 pxor $twres,@tweak[5] 1811 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last] 1812 1813 sub \$16*6,$len 1814 jc .Lxts_enc_short # if $len-=6*16 borrowed 1815 1816 mov \$16+96,$rounds 1817 lea 32($key_,$rnds_),$key # end of key schedule 1818 sub %r10,%rax # twisted $rounds 1819 $movkey 16($key_),$rndkey1 1820 mov %rax,%r10 # backup twisted $rounds 1821 lea .Lxts_magic(%rip),%r8 1822 jmp .Lxts_enc_grandloop 1823 1824.align 32 1825.Lxts_enc_grandloop: 1826 movdqu `16*0`($inp),$inout0 # load input 1827 movdqa $rndkey0,$twmask 1828 movdqu `16*1`($inp),$inout1 1829 pxor @tweak[0],$inout0 # input^=tweak^round[0] 1830 movdqu `16*2`($inp),$inout2 1831 pxor @tweak[1],$inout1 1832 aesenc $rndkey1,$inout0 1833 movdqu `16*3`($inp),$inout3 1834 pxor @tweak[2],$inout2 1835 aesenc $rndkey1,$inout1 1836 movdqu `16*4`($inp),$inout4 1837 pxor @tweak[3],$inout3 1838 aesenc $rndkey1,$inout2 1839 movdqu `16*5`($inp),$inout5 1840 pxor @tweak[5],$twmask # round[0]^=tweak[5] 1841 movdqa 0x60(%rsp),$twres # load round[0]^round[last] 1842 pxor @tweak[4],$inout4 1843 aesenc $rndkey1,$inout3 1844 $movkey 32($key_),$rndkey0 1845 lea `16*6`($inp),$inp 1846 pxor $twmask,$inout5 1847 1848 pxor $twres,@tweak[0] # calclulate tweaks^round[last] 1849 aesenc $rndkey1,$inout4 1850 pxor $twres,@tweak[1] 1851 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^round[last] 1852 aesenc $rndkey1,$inout5 1853 $movkey 48($key_),$rndkey1 1854 pxor $twres,@tweak[2] 1855 1856 aesenc $rndkey0,$inout0 1857 pxor $twres,@tweak[3] 1858 movdqa 
@tweak[1],`16*1`(%rsp) 1859 aesenc $rndkey0,$inout1 1860 pxor $twres,@tweak[4] 1861 movdqa @tweak[2],`16*2`(%rsp) 1862 aesenc $rndkey0,$inout2 1863 aesenc $rndkey0,$inout3 1864 pxor $twres,$twmask 1865 movdqa @tweak[4],`16*4`(%rsp) 1866 aesenc $rndkey0,$inout4 1867 aesenc $rndkey0,$inout5 1868 $movkey 64($key_),$rndkey0 1869 movdqa $twmask,`16*5`(%rsp) 1870 pshufd \$0x5f,@tweak[5],$twres 1871 jmp .Lxts_enc_loop6 1872.align 32 1873.Lxts_enc_loop6: 1874 aesenc $rndkey1,$inout0 1875 aesenc $rndkey1,$inout1 1876 aesenc $rndkey1,$inout2 1877 aesenc $rndkey1,$inout3 1878 aesenc $rndkey1,$inout4 1879 aesenc $rndkey1,$inout5 1880 $movkey -64($key,%rax),$rndkey1 1881 add \$32,%rax 1882 1883 aesenc $rndkey0,$inout0 1884 aesenc $rndkey0,$inout1 1885 aesenc $rndkey0,$inout2 1886 aesenc $rndkey0,$inout3 1887 aesenc $rndkey0,$inout4 1888 aesenc $rndkey0,$inout5 1889 $movkey -80($key,%rax),$rndkey0 1890 jnz .Lxts_enc_loop6 1891 1892 movdqa (%r8),$twmask # start calculating next tweak 1893 movdqa $twres,$twtmp 1894 paddd $twres,$twres 1895 aesenc $rndkey1,$inout0 1896 paddq @tweak[5],@tweak[5] 1897 psrad \$31,$twtmp 1898 aesenc $rndkey1,$inout1 1899 pand $twmask,$twtmp 1900 $movkey ($key_),@tweak[0] # load round[0] 1901 aesenc $rndkey1,$inout2 1902 aesenc $rndkey1,$inout3 1903 aesenc $rndkey1,$inout4 1904 pxor $twtmp,@tweak[5] 1905 movaps @tweak[0],@tweak[1] # copy round[0] 1906 aesenc $rndkey1,$inout5 1907 $movkey -64($key),$rndkey1 1908 1909 movdqa $twres,$twtmp 1910 aesenc $rndkey0,$inout0 1911 paddd $twres,$twres 1912 pxor @tweak[5],@tweak[0] 1913 aesenc $rndkey0,$inout1 1914 psrad \$31,$twtmp 1915 paddq @tweak[5],@tweak[5] 1916 aesenc $rndkey0,$inout2 1917 aesenc $rndkey0,$inout3 1918 pand $twmask,$twtmp 1919 movaps @tweak[1],@tweak[2] 1920 aesenc $rndkey0,$inout4 1921 pxor $twtmp,@tweak[5] 1922 movdqa $twres,$twtmp 1923 aesenc $rndkey0,$inout5 1924 $movkey -48($key),$rndkey0 1925 1926 paddd $twres,$twres 1927 aesenc $rndkey1,$inout0 1928 pxor @tweak[5],@tweak[1] 1929 psrad 
\$31,$twtmp 1930 aesenc $rndkey1,$inout1 1931 paddq @tweak[5],@tweak[5] 1932 pand $twmask,$twtmp 1933 aesenc $rndkey1,$inout2 1934 aesenc $rndkey1,$inout3 1935 movdqa @tweak[3],`16*3`(%rsp) 1936 pxor $twtmp,@tweak[5] 1937 aesenc $rndkey1,$inout4 1938 movaps @tweak[2],@tweak[3] 1939 movdqa $twres,$twtmp 1940 aesenc $rndkey1,$inout5 1941 $movkey -32($key),$rndkey1 1942 1943 paddd $twres,$twres 1944 aesenc $rndkey0,$inout0 1945 pxor @tweak[5],@tweak[2] 1946 psrad \$31,$twtmp 1947 aesenc $rndkey0,$inout1 1948 paddq @tweak[5],@tweak[5] 1949 pand $twmask,$twtmp 1950 aesenc $rndkey0,$inout2 1951 aesenc $rndkey0,$inout3 1952 aesenc $rndkey0,$inout4 1953 pxor $twtmp,@tweak[5] 1954 movaps @tweak[3],@tweak[4] 1955 aesenc $rndkey0,$inout5 1956 1957 movdqa $twres,$rndkey0 1958 paddd $twres,$twres 1959 aesenc $rndkey1,$inout0 1960 pxor @tweak[5],@tweak[3] 1961 psrad \$31,$rndkey0 1962 aesenc $rndkey1,$inout1 1963 paddq @tweak[5],@tweak[5] 1964 pand $twmask,$rndkey0 1965 aesenc $rndkey1,$inout2 1966 aesenc $rndkey1,$inout3 1967 pxor $rndkey0,@tweak[5] 1968 $movkey ($key_),$rndkey0 1969 aesenc $rndkey1,$inout4 1970 aesenc $rndkey1,$inout5 1971 $movkey 16($key_),$rndkey1 1972 1973 pxor @tweak[5],@tweak[4] 1974 aesenclast `16*0`(%rsp),$inout0 1975 psrad \$31,$twres 1976 paddq @tweak[5],@tweak[5] 1977 aesenclast `16*1`(%rsp),$inout1 1978 aesenclast `16*2`(%rsp),$inout2 1979 pand $twmask,$twres 1980 mov %r10,%rax # restore $rounds 1981 aesenclast `16*3`(%rsp),$inout3 1982 aesenclast `16*4`(%rsp),$inout4 1983 aesenclast `16*5`(%rsp),$inout5 1984 pxor $twres,@tweak[5] 1985 1986 lea `16*6`($out),$out # $out+=6*16 1987 movups $inout0,`-16*6`($out) # store 6 output blocks 1988 movups $inout1,`-16*5`($out) 1989 movups $inout2,`-16*4`($out) 1990 movups $inout3,`-16*3`($out) 1991 movups $inout4,`-16*2`($out) 1992 movups $inout5,`-16*1`($out) 1993 sub \$16*6,$len 1994 jnc .Lxts_enc_grandloop # loop if $len-=6*16 didn't borrow 1995 1996 mov \$16+96,$rounds 1997 sub $rnds_,$rounds 1998 mov 
$key_,$key # restore $key 1999 shr \$4,$rounds # restore original value 2000 2001.Lxts_enc_short: 2002 # at the point @tweak[0..5] are populated with tweak values 2003 mov $rounds,$rnds_ # backup $rounds 2004 pxor $rndkey0,@tweak[0] 2005 add \$16*6,$len # restore real remaining $len 2006 jz .Lxts_enc_done # done if ($len==0) 2007 2008 pxor $rndkey0,@tweak[1] 2009 cmp \$0x20,$len 2010 jb .Lxts_enc_one # $len is 1*16 2011 pxor $rndkey0,@tweak[2] 2012 je .Lxts_enc_two # $len is 2*16 2013 2014 pxor $rndkey0,@tweak[3] 2015 cmp \$0x40,$len 2016 jb .Lxts_enc_three # $len is 3*16 2017 pxor $rndkey0,@tweak[4] 2018 je .Lxts_enc_four # $len is 4*16 2019 2020 movdqu ($inp),$inout0 # $len is 5*16 2021 movdqu 16*1($inp),$inout1 2022 movdqu 16*2($inp),$inout2 2023 pxor @tweak[0],$inout0 2024 movdqu 16*3($inp),$inout3 2025 pxor @tweak[1],$inout1 2026 movdqu 16*4($inp),$inout4 2027 lea 16*5($inp),$inp # $inp+=5*16 2028 pxor @tweak[2],$inout2 2029 pxor @tweak[3],$inout3 2030 pxor @tweak[4],$inout4 2031 pxor $inout5,$inout5 2032 2033 call _aesni_encrypt6 2034 2035 xorps @tweak[0],$inout0 2036 movdqa @tweak[5],@tweak[0] 2037 xorps @tweak[1],$inout1 2038 xorps @tweak[2],$inout2 2039 movdqu $inout0,($out) # store 5 output blocks 2040 xorps @tweak[3],$inout3 2041 movdqu $inout1,16*1($out) 2042 xorps @tweak[4],$inout4 2043 movdqu $inout2,16*2($out) 2044 movdqu $inout3,16*3($out) 2045 movdqu $inout4,16*4($out) 2046 lea 16*5($out),$out # $out+=5*16 2047 jmp .Lxts_enc_done 2048 2049.align 16 2050.Lxts_enc_one: 2051 movups ($inp),$inout0 2052 lea 16*1($inp),$inp # inp+=1*16 2053 xorps @tweak[0],$inout0 2054___ 2055 &aesni_generate1("enc",$key,$rounds); 2056$code.=<<___; 2057 xorps @tweak[0],$inout0 2058 movdqa @tweak[1],@tweak[0] 2059 movups $inout0,($out) # store one output block 2060 lea 16*1($out),$out # $out+=1*16 2061 jmp .Lxts_enc_done 2062 2063.align 16 2064.Lxts_enc_two: 2065 movups ($inp),$inout0 2066 movups 16($inp),$inout1 2067 lea 32($inp),$inp # $inp+=2*16 2068 xorps 
@tweak[0],$inout0 2069 xorps @tweak[1],$inout1 2070 2071 call _aesni_encrypt2 2072 2073 xorps @tweak[0],$inout0 2074 movdqa @tweak[2],@tweak[0] 2075 xorps @tweak[1],$inout1 2076 movups $inout0,($out) # store 2 output blocks 2077 movups $inout1,16*1($out) 2078 lea 16*2($out),$out # $out+=2*16 2079 jmp .Lxts_enc_done 2080 2081.align 16 2082.Lxts_enc_three: 2083 movups ($inp),$inout0 2084 movups 16*1($inp),$inout1 2085 movups 16*2($inp),$inout2 2086 lea 16*3($inp),$inp # $inp+=3*16 2087 xorps @tweak[0],$inout0 2088 xorps @tweak[1],$inout1 2089 xorps @tweak[2],$inout2 2090 2091 call _aesni_encrypt3 2092 2093 xorps @tweak[0],$inout0 2094 movdqa @tweak[3],@tweak[0] 2095 xorps @tweak[1],$inout1 2096 xorps @tweak[2],$inout2 2097 movups $inout0,($out) # store 3 output blocks 2098 movups $inout1,16*1($out) 2099 movups $inout2,16*2($out) 2100 lea 16*3($out),$out # $out+=3*16 2101 jmp .Lxts_enc_done 2102 2103.align 16 2104.Lxts_enc_four: 2105 movups ($inp),$inout0 2106 movups 16*1($inp),$inout1 2107 movups 16*2($inp),$inout2 2108 xorps @tweak[0],$inout0 2109 movups 16*3($inp),$inout3 2110 lea 16*4($inp),$inp # $inp+=4*16 2111 xorps @tweak[1],$inout1 2112 xorps @tweak[2],$inout2 2113 xorps @tweak[3],$inout3 2114 2115 call _aesni_encrypt4 2116 2117 pxor @tweak[0],$inout0 2118 movdqa @tweak[4],@tweak[0] 2119 pxor @tweak[1],$inout1 2120 pxor @tweak[2],$inout2 2121 movdqu $inout0,($out) # store 4 output blocks 2122 pxor @tweak[3],$inout3 2123 movdqu $inout1,16*1($out) 2124 movdqu $inout2,16*2($out) 2125 movdqu $inout3,16*3($out) 2126 lea 16*4($out),$out # $out+=4*16 2127 jmp .Lxts_enc_done 2128 2129.align 16 2130.Lxts_enc_done: 2131 and \$15,$len_ # see if $len%16 is 0 2132 jz .Lxts_enc_ret 2133 mov $len_,$len 2134 2135.Lxts_enc_steal: 2136 movzb ($inp),%eax # borrow $rounds ... 2137 movzb -16($out),%ecx # ... 
and $key 2138 lea 1($inp),$inp 2139 mov %al,-16($out) 2140 mov %cl,0($out) 2141 lea 1($out),$out 2142 sub \$1,$len 2143 jnz .Lxts_enc_steal 2144 2145 sub $len_,$out # rewind $out 2146 mov $key_,$key # restore $key 2147 mov $rnds_,$rounds # restore $rounds 2148 2149 movups -16($out),$inout0 2150 xorps @tweak[0],$inout0 2151___ 2152 &aesni_generate1("enc",$key,$rounds); 2153$code.=<<___; 2154 xorps @tweak[0],$inout0 2155 movups $inout0,-16($out) 2156 2157.Lxts_enc_ret: 2158 xorps %xmm0,%xmm0 # clear register bank 2159 pxor %xmm1,%xmm1 2160 pxor %xmm2,%xmm2 2161 pxor %xmm3,%xmm3 2162 pxor %xmm4,%xmm4 2163 pxor %xmm5,%xmm5 2164___ 2165$code.=<<___ if (!$win64); 2166 pxor %xmm6,%xmm6 2167 pxor %xmm7,%xmm7 2168 movaps %xmm0,0x00(%rsp) # clear stack 2169 pxor %xmm8,%xmm8 2170 movaps %xmm0,0x10(%rsp) 2171 pxor %xmm9,%xmm9 2172 movaps %xmm0,0x20(%rsp) 2173 pxor %xmm10,%xmm10 2174 movaps %xmm0,0x30(%rsp) 2175 pxor %xmm11,%xmm11 2176 movaps %xmm0,0x40(%rsp) 2177 pxor %xmm12,%xmm12 2178 movaps %xmm0,0x50(%rsp) 2179 pxor %xmm13,%xmm13 2180 movaps %xmm0,0x60(%rsp) 2181 pxor %xmm14,%xmm14 2182 pxor %xmm15,%xmm15 2183___ 2184$code.=<<___ if ($win64); 2185 movaps -0xa8(%r11),%xmm6 2186 movaps %xmm0,-0xa8(%r11) # clear stack 2187 movaps -0x98(%r11),%xmm7 2188 movaps %xmm0,-0x98(%r11) 2189 movaps -0x88(%r11),%xmm8 2190 movaps %xmm0,-0x88(%r11) 2191 movaps -0x78(%r11),%xmm9 2192 movaps %xmm0,-0x78(%r11) 2193 movaps -0x68(%r11),%xmm10 2194 movaps %xmm0,-0x68(%r11) 2195 movaps -0x58(%r11),%xmm11 2196 movaps %xmm0,-0x58(%r11) 2197 movaps -0x48(%r11),%xmm12 2198 movaps %xmm0,-0x48(%r11) 2199 movaps -0x38(%r11),%xmm13 2200 movaps %xmm0,-0x38(%r11) 2201 movaps -0x28(%r11),%xmm14 2202 movaps %xmm0,-0x28(%r11) 2203 movaps -0x18(%r11),%xmm15 2204 movaps %xmm0,-0x18(%r11) 2205 movaps %xmm0,0x00(%rsp) 2206 movaps %xmm0,0x10(%rsp) 2207 movaps %xmm0,0x20(%rsp) 2208 movaps %xmm0,0x30(%rsp) 2209 movaps %xmm0,0x40(%rsp) 2210 movaps %xmm0,0x50(%rsp) 2211 movaps %xmm0,0x60(%rsp) 2212___ 
# Final part of the aesni_xts_encrypt epilogue: reload %rbp and %rsp
# relative to %r11 and return.
$code.=<<___;
	mov	-8(%r11),%rbp
	lea	(%r11),%rsp
.Lxts_enc_epilogue:
	ret
.size	aesni_xts_encrypt,.-aesni_xts_encrypt
___

# aesni_xts_decrypt
#
# Flow, as visible in the code below:
#   - the clear-text tweak is loaded from ($ivp) and encrypted with $key2;
#   - input is consumed six blocks per iteration in .Lxts_dec_grandloop,
#     and the next six tweak values are computed in between the AES rounds;
#   - one to five leftover whole blocks are handled from .Lxts_dec_short;
#   - if the length is not a multiple of 16, one whole block is held back
#     up front and the tail is finished at .Lxts_dec_done2/.Lxts_dec_steal,
#     which decrypts two blocks and swaps the trailing bytes between them;
#   - the XMM registers and the stack scratch area are zeroed on exit.
$code.=<<___;
.globl	aesni_xts_decrypt
.type	aesni_xts_decrypt,\@function,6
.align	16
aesni_xts_decrypt:
	lea	(%rsp),%r11			# frame pointer
	push	%rbp
	sub	\$$frame_size,%rsp
	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
___
# Win64 only: save %xmm6-%xmm15 below the frame pointer.
$code.=<<___ if ($win64);
	movaps	%xmm6,-0xa8(%r11)		# offload everything
	movaps	%xmm7,-0x98(%r11)
	movaps	%xmm8,-0x88(%r11)
	movaps	%xmm9,-0x78(%r11)
	movaps	%xmm10,-0x68(%r11)
	movaps	%xmm11,-0x58(%r11)
	movaps	%xmm12,-0x48(%r11)
	movaps	%xmm13,-0x38(%r11)
	movaps	%xmm14,-0x28(%r11)
	movaps	%xmm15,-0x18(%r11)
.Lxts_dec_body:
___
$code.=<<___;
	movups	($ivp),$inout0			# load clear-text tweak
	mov	240($key2),$rounds		# key2->rounds
	mov	240($key),$rnds_		# key1->rounds
___
	# generate the tweak
	&aesni_generate1("enc",$key2,$rounds,$inout0);
$code.=<<___;
	xor	%eax,%eax			# if ($len%16) len-=16;
	test	\$15,$len
	setnz	%al
	shl	\$4,%rax
	sub	%rax,$len

	$movkey	($key),$rndkey0			# zero round key
	mov	$key,$key_			# backup $key
	mov	$rnds_,$rounds			# backup $rounds
	shl	\$4,$rnds_
	mov	$len,$len_			# backup $len
	and	\$-16,$len

	$movkey	16($key,$rnds_),$rndkey1	# last round key

	movdqa	.Lxts_magic(%rip),$twmask
	movdqa	$inout0,@tweak[5]
	pshufd	\$0x5f,$inout0,$twres
	pxor	$rndkey0,$rndkey1
___
    # Emit tweak[0..3]: each is a copy of the running value in tweak[5],
    # xored with round[0]; tweak[5] is then stepped to the next value.
    for ($i=0;$i<4;$i++) {
    $code.=<<___;
	movdqa	$twres,$twtmp
	paddd	$twres,$twres
	movdqa	@tweak[5],@tweak[$i]
	psrad	\$31,$twtmp			# broadcast upper bits
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twtmp
	pxor	$rndkey0,@tweak[$i]
	pxor	$twtmp,@tweak[5]
___
    }
$code.=<<___;
	movdqa	@tweak[5],@tweak[4]
	psrad	\$31,$twres
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twres
	pxor	$rndkey0,@tweak[4]
	pxor	$twres,@tweak[5]
	movaps	$rndkey1,0x60(%rsp)		# save round[0]^round[last]

	sub	\$16*6,$len
	jc	.Lxts_dec_short			# if $len-=6*16 borrowed

	mov	\$16+96,$rounds
	lea	32($key_,$rnds_),$key		# end of key schedule
	sub	%r10,%rax			# twisted $rounds
	$movkey	16($key_),$rndkey1
	mov	%rax,%r10			# backup twisted $rounds
	lea	.Lxts_magic(%rip),%r8
	jmp	.Lxts_dec_grandloop

.align	32
.Lxts_dec_grandloop:
	# six blocks per iteration
	movdqu	`16*0`($inp),$inout0		# load input
	movdqa	$rndkey0,$twmask
	movdqu	`16*1`($inp),$inout1
	pxor	@tweak[0],$inout0		# input^=tweak^round[0]
	movdqu	`16*2`($inp),$inout2
	pxor	@tweak[1],$inout1
	aesdec	$rndkey1,$inout0
	movdqu	`16*3`($inp),$inout3
	pxor	@tweak[2],$inout2
	aesdec	$rndkey1,$inout1
	movdqu	`16*4`($inp),$inout4
	pxor	@tweak[3],$inout3
	aesdec	$rndkey1,$inout2
	movdqu	`16*5`($inp),$inout5
	pxor	@tweak[5],$twmask		# round[0]^=tweak[5]
	movdqa	0x60(%rsp),$twres		# load round[0]^round[last]
	pxor	@tweak[4],$inout4
	aesdec	$rndkey1,$inout3
	$movkey	32($key_),$rndkey0
	lea	`16*6`($inp),$inp
	pxor	$twmask,$inout5

	pxor	$twres,@tweak[0]		# calculate tweaks^round[last]
	aesdec	$rndkey1,$inout4
	pxor	$twres,@tweak[1]
	movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks^last round key
	aesdec	$rndkey1,$inout5
	$movkey	48($key_),$rndkey1
	pxor	$twres,@tweak[2]

	aesdec	$rndkey0,$inout0
	pxor	$twres,@tweak[3]
	movdqa	@tweak[1],`16*1`(%rsp)
	aesdec	$rndkey0,$inout1
	pxor	$twres,@tweak[4]
	movdqa	@tweak[2],`16*2`(%rsp)
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	pxor	$twres,$twmask
	movdqa	@tweak[4],`16*4`(%rsp)
	aesdec	$rndkey0,$inout4
	aesdec	$rndkey0,$inout5
	$movkey	64($key_),$rndkey0
	movdqa	$twmask,`16*5`(%rsp)
	pshufd	\$0x5f,@tweak[5],$twres
	jmp	.Lxts_dec_loop6
.align	32
.Lxts_dec_loop6:
	# two rounds per pass; %rax counts up and the loop exits when it hits zero
	aesdec	$rndkey1,$inout0
	aesdec	$rndkey1,$inout1
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	aesdec	$rndkey1,$inout4
	aesdec	$rndkey1,$inout5
	$movkey	-64($key,%rax),$rndkey1
	add	\$32,%rax

	aesdec	$rndkey0,$inout0
	aesdec	$rndkey0,$inout1
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	aesdec	$rndkey0,$inout4
	aesdec	$rndkey0,$inout5
	$movkey	-80($key,%rax),$rndkey0
	jnz	.Lxts_dec_loop6

	movdqa	(%r8),$twmask			# start calculating next tweak
	movdqa	$twres,$twtmp
	paddd	$twres,$twres
	aesdec	$rndkey1,$inout0
	paddq	@tweak[5],@tweak[5]
	psrad	\$31,$twtmp
	aesdec	$rndkey1,$inout1
	pand	$twmask,$twtmp
	$movkey	($key_),@tweak[0]		# load round[0]
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	aesdec	$rndkey1,$inout4
	pxor	$twtmp,@tweak[5]
	movaps	@tweak[0],@tweak[1]		# copy round[0]
	aesdec	$rndkey1,$inout5
	$movkey	-64($key),$rndkey1

	movdqa	$twres,$twtmp
	aesdec	$rndkey0,$inout0
	paddd	$twres,$twres
	pxor	@tweak[5],@tweak[0]
	aesdec	$rndkey0,$inout1
	psrad	\$31,$twtmp
	paddq	@tweak[5],@tweak[5]
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	pand	$twmask,$twtmp
	movaps	@tweak[1],@tweak[2]
	aesdec	$rndkey0,$inout4
	pxor	$twtmp,@tweak[5]
	movdqa	$twres,$twtmp
	aesdec	$rndkey0,$inout5
	$movkey	-48($key),$rndkey0

	paddd	$twres,$twres
	aesdec	$rndkey1,$inout0
	pxor	@tweak[5],@tweak[1]
	psrad	\$31,$twtmp
	aesdec	$rndkey1,$inout1
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twtmp
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	movdqa	@tweak[3],`16*3`(%rsp)
	pxor	$twtmp,@tweak[5]
	aesdec	$rndkey1,$inout4
	movaps	@tweak[2],@tweak[3]
	movdqa	$twres,$twtmp
	aesdec	$rndkey1,$inout5
	$movkey	-32($key),$rndkey1

	paddd	$twres,$twres
	aesdec	$rndkey0,$inout0
	pxor	@tweak[5],@tweak[2]
	psrad	\$31,$twtmp
	aesdec	$rndkey0,$inout1
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twtmp
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	aesdec	$rndkey0,$inout4
	pxor	$twtmp,@tweak[5]
	movaps	@tweak[3],@tweak[4]
	aesdec	$rndkey0,$inout5

	movdqa	$twres,$rndkey0
	paddd	$twres,$twres
	aesdec	$rndkey1,$inout0
	pxor	@tweak[5],@tweak[3]
	psrad	\$31,$rndkey0
	aesdec	$rndkey1,$inout1
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$rndkey0
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	pxor	$rndkey0,@tweak[5]
	$movkey	($key_),$rndkey0
	aesdec	$rndkey1,$inout4
	aesdec	$rndkey1,$inout5
	$movkey	16($key_),$rndkey1

	# last round: the operands on the stack are the tweaks^round[last]
	# values put aside at the top of the iteration
	pxor	@tweak[5],@tweak[4]
	aesdeclast	`16*0`(%rsp),$inout0
	psrad	\$31,$twres
	paddq	@tweak[5],@tweak[5]
	aesdeclast	`16*1`(%rsp),$inout1
	aesdeclast	`16*2`(%rsp),$inout2
	pand	$twmask,$twres
	mov	%r10,%rax			# restore $rounds
	aesdeclast	`16*3`(%rsp),$inout3
	aesdeclast	`16*4`(%rsp),$inout4
	aesdeclast	`16*5`(%rsp),$inout5
	pxor	$twres,@tweak[5]

	lea	`16*6`($out),$out		# $out+=6*16
	movups	$inout0,`-16*6`($out)		# store 6 output blocks
	movups	$inout1,`-16*5`($out)
	movups	$inout2,`-16*4`($out)
	movups	$inout3,`-16*3`($out)
	movups	$inout4,`-16*2`($out)
	movups	$inout5,`-16*1`($out)
	sub	\$16*6,$len
	jnc	.Lxts_dec_grandloop		# loop if $len-=6*16 didn't borrow

	mov	\$16+96,$rounds
	sub	$rnds_,$rounds
	mov	$key_,$key			# restore $key
	shr	\$4,$rounds			# restore original value

.Lxts_dec_short:
	# at the point @tweak[0..5] are populated with tweak values
	mov	$rounds,$rnds_			# backup $rounds
	pxor	$rndkey0,@tweak[0]
	pxor	$rndkey0,@tweak[1]
	add	\$16*6,$len			# restore real remaining $len
	jz	.Lxts_dec_done			# done if ($len==0)

	pxor	$rndkey0,@tweak[2]
	cmp	\$0x20,$len
	jb	.Lxts_dec_one			# $len is 1*16
	pxor	$rndkey0,@tweak[3]
	je	.Lxts_dec_two			# $len is 2*16

	pxor	$rndkey0,@tweak[4]
	cmp	\$0x40,$len
	jb	.Lxts_dec_three			# $len is 3*16
	je	.Lxts_dec_four			# $len is 4*16

	movdqu	($inp),$inout0			# $len is 5*16
	movdqu	16*1($inp),$inout1
	movdqu	16*2($inp),$inout2
	pxor	@tweak[0],$inout0
	movdqu	16*3($inp),$inout3
	pxor	@tweak[1],$inout1
	movdqu	16*4($inp),$inout4
	lea	16*5($inp),$inp			# $inp+=5*16
	pxor	@tweak[2],$inout2
	pxor	@tweak[3],$inout3
	pxor	@tweak[4],$inout4

	call	_aesni_decrypt6

	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2
	movdqu	$inout0,($out)			# store 5 output blocks
	xorps	@tweak[3],$inout3
	movdqu	$inout1,16*1($out)
	xorps	@tweak[4],$inout4
	movdqu	$inout2,16*2($out)
	pxor	$twtmp,$twtmp
	movdqu	$inout3,16*3($out)
	pcmpgtd	@tweak[5],$twtmp
	movdqu	$inout4,16*4($out)
	lea	16*5($out),$out			# $out+=5*16
	pshufd	\$0x13,$twtmp,@tweak[1]		# $twres
	and	\$15,$len_
	jz	.Lxts_dec_ret

	# a partial tail follows: keep tweak[5] in tweak[0] and derive the
	# following tweak value into tweak[1] for the two tail decryptions
	movdqa	@tweak[5],@tweak[0]
	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
	pand	$twmask,@tweak[1]		# isolate carry and residue
	pxor	@tweak[5],@tweak[1]
	jmp	.Lxts_dec_done2

.align	16
.Lxts_dec_one:
	movups	($inp),$inout0
	lea	16*1($inp),$inp			# $inp+=1*16
	xorps	@tweak[0],$inout0
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	@tweak[0],$inout0
	movdqa	@tweak[1],@tweak[0]
	movups	$inout0,($out)			# store one output block
	movdqa	@tweak[2],@tweak[1]
	lea	16*1($out),$out			# $out+=1*16
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_two:
	movups	($inp),$inout0
	movups	16($inp),$inout1
	lea	32($inp),$inp			# $inp+=2*16
	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1

	call	_aesni_decrypt2

	xorps	@tweak[0],$inout0
	movdqa	@tweak[2],@tweak[0]
	xorps	@tweak[1],$inout1
	movdqa	@tweak[3],@tweak[1]
	movups	$inout0,($out)			# store 2 output blocks
	movups	$inout1,16*1($out)
	lea	16*2($out),$out			# $out+=2*16
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_three:
	movups	($inp),$inout0
	movups	16*1($inp),$inout1
	movups	16*2($inp),$inout2
	lea	16*3($inp),$inp			# $inp+=3*16
	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2

	call	_aesni_decrypt3

	xorps	@tweak[0],$inout0
	movdqa	@tweak[3],@tweak[0]
	xorps	@tweak[1],$inout1
	movdqa	@tweak[4],@tweak[1]
	xorps	@tweak[2],$inout2
	movups	$inout0,($out)			# store 3 output blocks
	movups	$inout1,16*1($out)
	movups	$inout2,16*2($out)
	lea	16*3($out),$out			# $out+=3*16
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_four:
	movups	($inp),$inout0
	movups	16*1($inp),$inout1
	movups	16*2($inp),$inout2
	xorps	@tweak[0],$inout0
	movups	16*3($inp),$inout3
	lea	16*4($inp),$inp			# $inp+=4*16
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2
	xorps	@tweak[3],$inout3

	call	_aesni_decrypt4

	pxor	@tweak[0],$inout0
	movdqa	@tweak[4],@tweak[0]
	pxor	@tweak[1],$inout1
	movdqa	@tweak[5],@tweak[1]
	pxor	@tweak[2],$inout2
	movdqu	$inout0,($out)			# store 4 output blocks
	pxor	@tweak[3],$inout3
	movdqu	$inout1,16*1($out)
	movdqu	$inout2,16*2($out)
	movdqu	$inout3,16*3($out)
	lea	16*4($out),$out			# $out+=4*16
	jmp	.Lxts_dec_done

.align	16
.Lxts_dec_done:
	and	\$15,$len_			# see if $len%16 is 0
	jz	.Lxts_dec_ret
.Lxts_dec_done2:
	mov	$len_,$len
	mov	$key_,$key			# restore $key
	mov	$rnds_,$rounds			# restore $rounds

	# decrypt the held-back whole block with tweak[1]
	movups	($inp),$inout0
	xorps	@tweak[1],$inout0
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	@tweak[1],$inout0
	movups	$inout0,($out)

.Lxts_dec_steal:
	# byte-wise swap: the input tail at 16($inp) replaces the leading
	# bytes of the block at ($out), and the displaced bytes are written
	# to 16($out)
	movzb	16($inp),%eax			# borrow $rounds ...
	movzb	($out),%ecx			# ... and $key
	lea	1($inp),$inp
	mov	%al,($out)
	mov	%cl,16($out)
	lea	1($out),$out
	sub	\$1,$len
	jnz	.Lxts_dec_steal

	sub	$len_,$out			# rewind $out
	mov	$key_,$key			# restore $key
	mov	$rnds_,$rounds			# restore $rounds

	# decrypt the patched block in place with tweak[0]
	movups	($out),$inout0
	xorps	@tweak[0],$inout0
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	@tweak[0],$inout0
	movups	$inout0,($out)

.Lxts_dec_ret:
	xorps	%xmm0,%xmm0			# clear register bank
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
# Non-Win64: zero %xmm6-%xmm15 as well, interleaved with zeroing the
# 0x00-0x60 stack scratch area.
$code.=<<___ if (!$win64);
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	movaps	%xmm0,0x00(%rsp)		# clear stack
	pxor	%xmm8,%xmm8
	movaps	%xmm0,0x10(%rsp)
	pxor	%xmm9,%xmm9
	movaps	%xmm0,0x20(%rsp)
	pxor	%xmm10,%xmm10
	movaps	%xmm0,0x30(%rsp)
	pxor	%xmm11,%xmm11
	movaps	%xmm0,0x40(%rsp)
	pxor	%xmm12,%xmm12
	movaps	%xmm0,0x50(%rsp)
	pxor	%xmm13,%xmm13
	movaps	%xmm0,0x60(%rsp)
	pxor	%xmm14,%xmm14
	pxor	%xmm15,%xmm15
___
# Win64: reload the saved %xmm6-%xmm15 and overwrite their save slots and
# the stack scratch area with zeros.
$code.=<<___ if ($win64);
	movaps	-0xa8(%r11),%xmm6
	movaps	%xmm0,-0xa8(%r11)		# clear stack
	movaps	-0x98(%r11),%xmm7
	movaps	%xmm0,-0x98(%r11)
	movaps	-0x88(%r11),%xmm8
	movaps	%xmm0,-0x88(%r11)
	movaps	-0x78(%r11),%xmm9
	movaps	%xmm0,-0x78(%r11)
	movaps	-0x68(%r11),%xmm10
	movaps	%xmm0,-0x68(%r11)
	movaps	-0x58(%r11),%xmm11
	movaps	%xmm0,-0x58(%r11)
	movaps	-0x48(%r11),%xmm12
	movaps	%xmm0,-0x48(%r11)
	movaps	-0x38(%r11),%xmm13
	movaps	%xmm0,-0x38(%r11)
	movaps	-0x28(%r11),%xmm14
	movaps	%xmm0,-0x28(%r11)
	movaps	-0x18(%r11),%xmm15
	movaps	%xmm0,-0x18(%r11)
	movaps	%xmm0,0x00(%rsp)
	movaps	%xmm0,0x10(%rsp)
	movaps	%xmm0,0x20(%rsp)
	movaps	%xmm0,0x30(%rsp)
	movaps	%xmm0,0x40(%rsp)
	movaps	%xmm0,0x50(%rsp)
	movaps	%xmm0,0x60(%rsp)
___
$code.=<<___;
	mov	-8(%r11),%rbp
	lea	(%r11),%rsp
.Lxts_dec_epilogue:
	ret
.size	aesni_xts_decrypt,.-aesni_xts_decrypt
___
}

######################################################################
# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
#	const AES_KEY *key, unsigned int start_block_num,
#	unsigned char offset_i[16], const unsigned char L_[][16],
#	unsigned char checksum[16]);
#
# The 7th and 8th arguments (L_ and checksum) are read from the stack in
# the prologue; the running offset and the checksum are loaded on entry
# and written back at the "done" label of each routine.
#
{
my @offset=map("%xmm$_",(10..15));		# %xmm10..%xmm15
my ($checksum,$rndkey0l)=("%xmm8","%xmm9");	# $rndkey0l is set to round[0]^round[last]
my ($block_num,$offset_p)=("%r8","%r9");		# 5th and 6th arguments
my ($L_p,$checksum_p) = ("%rbx","%rbp");		# 7th and 8th arguments, loaded from the stack
my ($i1,$i3,$i5) = ("%r12","%r13","%r14");	# byte offsets into the table at $L_p
my $seventh_arg = $win64 ? 56 : 8;		# displacement of the 7th argument from the entry %rsp
my $blocks = $len;				# the block count arrives in the $len register

$code.=<<___;
.globl	aesni_ocb_encrypt
.type	aesni_ocb_encrypt,\@function,6
.align	32
aesni_ocb_encrypt:
	lea	(%rsp),%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
___
# Win64 only: save %xmm6-%xmm15 in a 0xa0-byte area below the pushed registers.
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp),%rsp
	movaps	%xmm6,0x00(%rsp)		# offload everything
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,0x60(%rsp)
	movaps	%xmm13,0x70(%rsp)
	movaps	%xmm14,0x80(%rsp)
	movaps	%xmm15,0x90(%rsp)
.Locb_enc_body:
___
$code.=<<___;
	mov	$seventh_arg(%rax),$L_p		# 7th argument
	mov	$seventh_arg+8(%rax),$checksum_p# 8th argument

	mov	240($key),$rnds_
	mov	$key,$key_
	shl	\$4,$rnds_
	$movkey	($key),$rndkey0l		# round[0]
	$movkey	16($key,$rnds_),$rndkey1	# round[last]

	movdqu	($offset_p),@offset[5]		# load last offset_i
	pxor	$rndkey1,$rndkey0l		# round[0] ^ round[last]
	pxor	$rndkey1,@offset[5]		# offset_i ^ round[last]

	mov	\$16+32,$rounds
lea	32($key_,$rnds_),$key
	$movkey	16($key_),$rndkey1		# round[1]
	sub	%r10,%rax			# twisted $rounds
	mov	%rax,%r10			# backup twisted $rounds

	movdqu	($L_p),@offset[0]		# L_0 for all odd-numbered blocks
	movdqu	($checksum_p),$checksum		# load checksum

	test	\$1,$block_num			# is first block number odd?
	jnz	.Locb_enc_odd

	# even first block number: process one block on its own so that the
	# code from .Locb_enc_odd on always starts at an odd block number
	bsf	$block_num,$i1
	add	\$1,$block_num
	shl	\$4,$i1
	movdqu	($L_p,$i1),$inout5		# borrow
	movdqu	($inp),$inout0
	lea	16($inp),$inp

	call	__ocb_encrypt1

	movdqa	$inout5,@offset[5]
	movups	$inout0,($out)
	lea	16($out),$out
	sub	\$1,$blocks
	jz	.Locb_enc_done

.Locb_enc_odd:
	lea	1($block_num),$i1		# even-numbered blocks
	lea	3($block_num),$i3
	lea	5($block_num),$i5
	lea	6($block_num),$block_num
	bsf	$i1,$i1				# ntz(block)
	bsf	$i3,$i3
	bsf	$i5,$i5
	shl	\$4,$i1				# ntz(block) -> table offset
	shl	\$4,$i3
	shl	\$4,$i5

	sub	\$6,$blocks
	jc	.Locb_enc_short
	jmp	.Locb_enc_grandloop

.align	32
.Locb_enc_grandloop:
	# six blocks per iteration
	movdqu	`16*0`($inp),$inout0		# load input
	movdqu	`16*1`($inp),$inout1
	movdqu	`16*2`($inp),$inout2
	movdqu	`16*3`($inp),$inout3
	movdqu	`16*4`($inp),$inout4
	movdqu	`16*5`($inp),$inout5
	lea	`16*6`($inp),$inp

	call	__ocb_encrypt6

	movups	$inout0,`16*0`($out)		# store output
	movups	$inout1,`16*1`($out)
	movups	$inout2,`16*2`($out)
	movups	$inout3,`16*3`($out)
	movups	$inout4,`16*4`($out)
	movups	$inout5,`16*5`($out)
	lea	`16*6`($out),$out
	sub	\$6,$blocks
	jnc	.Locb_enc_grandloop

.Locb_enc_short:
	# zero to five blocks remain; dispatch on the count
	add	\$6,$blocks
	jz	.Locb_enc_done

	movdqu	`16*0`($inp),$inout0
	cmp	\$2,$blocks
	jb	.Locb_enc_one
	movdqu	`16*1`($inp),$inout1
	je	.Locb_enc_two

	movdqu	`16*2`($inp),$inout2
	cmp	\$4,$blocks
	jb	.Locb_enc_three
	movdqu	`16*3`($inp),$inout3
	je	.Locb_enc_four

	# five blocks: use the six-block helper with a zeroed sixth input,
	# store five results and keep the fifth offset as the running one
	movdqu	`16*4`($inp),$inout4
	pxor	$inout5,$inout5

	call	__ocb_encrypt6

	movdqa	@offset[4],@offset[5]
	movups	$inout0,`16*0`($out)
	movups	$inout1,`16*1`($out)
	movups	$inout2,`16*2`($out)
	movups	$inout3,`16*3`($out)
	movups	$inout4,`16*4`($out)

	jmp	.Locb_enc_done

.align	16
.Locb_enc_one:
	movdqa	@offset[0],$inout5		# borrow

	call	__ocb_encrypt1

	movdqa	$inout5,@offset[5]
	movups	$inout0,`16*0`($out)
	jmp	.Locb_enc_done

.align	16
.Locb_enc_two:
	# two blocks: four-block helper with the unused inputs zeroed
	pxor	$inout2,$inout2
	pxor	$inout3,$inout3

	call	__ocb_encrypt4

	movdqa	@offset[1],@offset[5]
	movups	$inout0,`16*0`($out)
	movups	$inout1,`16*1`($out)

	jmp	.Locb_enc_done

.align	16
.Locb_enc_three:
	pxor	$inout3,$inout3

	call	__ocb_encrypt4

	movdqa	@offset[2],@offset[5]
	movups	$inout0,`16*0`($out)
	movups	$inout1,`16*1`($out)
	movups	$inout2,`16*2`($out)

	jmp	.Locb_enc_done

.align	16
.Locb_enc_four:
	call	__ocb_encrypt4

	movdqa	@offset[3],@offset[5]
	movups	$inout0,`16*0`($out)
	movups	$inout1,`16*1`($out)
	movups	$inout2,`16*2`($out)
	movups	$inout3,`16*3`($out)

.Locb_enc_done:
	pxor	$rndkey0,@offset[5]		# "remove" round[last]
	movdqu	$checksum,($checksum_p)		# store checksum
	movdqu	@offset[5],($offset_p)		# store last offset_i

	xorps	%xmm0,%xmm0			# clear register bank
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
# Non-Win64: zero %xmm6-%xmm15 too, then point %rax just above the five
# pushed registers (0x28 bytes).
$code.=<<___ if (!$win64);
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	pxor	%xmm8,%xmm8
	pxor	%xmm9,%xmm9
	pxor	%xmm10,%xmm10
	pxor	%xmm11,%xmm11
	pxor	%xmm12,%xmm12
	pxor	%xmm13,%xmm13
	pxor	%xmm14,%xmm14
	pxor	%xmm15,%xmm15
	lea	0x28(%rsp),%rax
___
# Win64: reload the saved %xmm6-%xmm15, zero their save slots, and point
# %rax above both the save area and the pushed registers.
$code.=<<___ if ($win64);
	movaps	0x00(%rsp),%xmm6
	movaps	%xmm0,0x00(%rsp)		# clear stack
	movaps	0x10(%rsp),%xmm7
	movaps	%xmm0,0x10(%rsp)
	movaps	0x20(%rsp),%xmm8
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm9
	movaps	%xmm0,0x30(%rsp)
	movaps	0x40(%rsp),%xmm10
	movaps	%xmm0,0x40(%rsp)
	movaps	0x50(%rsp),%xmm11
	movaps	%xmm0,0x50(%rsp)
	movaps	0x60(%rsp),%xmm12
	movaps	%xmm0,0x60(%rsp)
	movaps	0x70(%rsp),%xmm13
	movaps	%xmm0,0x70(%rsp)
	movaps	0x80(%rsp),%xmm14
	movaps	%xmm0,0x80(%rsp)
	movaps	0x90(%rsp),%xmm15
	movaps	%xmm0,0x90(%rsp)
	lea	0xa0+0x28(%rsp),%rax
.Locb_enc_pop:
___
$code.=<<___;
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Locb_enc_epilogue:
	ret
.size	aesni_ocb_encrypt,.-aesni_ocb_encrypt

# __ocb_encrypt6: encrypt the six blocks held in the inout registers.
# The six offsets are chained from the running offset using L_0 and the
# three table entries selected by the i1/i3/i5 indices; every input block
# is xored into the checksum before it is encrypted.
.type	__ocb_encrypt6,\@abi-omnipotent
.align	32
__ocb_encrypt6:
	pxor	$rndkey0l,@offset[5]		# offset_i ^ round[0]
	movdqu	($L_p,$i1),@offset[1]
	movdqa	@offset[0],@offset[2]
	movdqu	($L_p,$i3),@offset[3]
	movdqa	@offset[0],@offset[4]
	pxor	@offset[5],@offset[0]
	movdqu	($L_p,$i5),@offset[5]
	pxor	@offset[0],@offset[1]
	pxor	$inout0,$checksum		# accumulate checksum
	pxor	@offset[0],$inout0		# input ^ round[0] ^ offset_i
	pxor	@offset[1],@offset[2]
	pxor	$inout1,$checksum
	pxor	@offset[1],$inout1
	pxor	@offset[2],@offset[3]
	pxor	$inout2,$checksum
	pxor	@offset[2],$inout2
	pxor	@offset[3],@offset[4]
	pxor	$inout3,$checksum
	pxor	@offset[3],$inout3
	pxor	@offset[4],@offset[5]
	pxor	$inout4,$checksum
	pxor	@offset[4],$inout4
	pxor	$inout5,$checksum
	pxor	@offset[5],$inout5
	$movkey	32($key_),$rndkey0

	# table indices for the next group of six blocks
	lea	1($block_num),$i1		# even-numbered blocks
	lea	3($block_num),$i3
	lea	5($block_num),$i5
	add	\$6,$block_num
	pxor	$rndkey0l,@offset[0]		# offset_i ^ round[last]
bsf	$i1,$i1				# ntz(block)
	bsf	$i3,$i3
	bsf	$i5,$i5

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	pxor	$rndkey0l,@offset[1]
	pxor	$rndkey0l,@offset[2]
	aesenc	$rndkey1,$inout4
	pxor	$rndkey0l,@offset[3]
	pxor	$rndkey0l,@offset[4]
	aesenc	$rndkey1,$inout5
	$movkey	48($key_),$rndkey1
	pxor	$rndkey0l,@offset[5]

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	$movkey	64($key_),$rndkey0
	shl	\$4,$i1				# ntz(block) -> table offset
	shl	\$4,$i3
	jmp	.Locb_enc_loop6

.align	32
.Locb_enc_loop6:
	# two rounds per pass; %rax counts up and the loop exits when it hits zero
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Locb_enc_loop6

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	$movkey	16($key_),$rndkey1
	shl	\$4,$i5

	# last round takes offset_i ^ round[last] as its key operand
	aesenclast	@offset[0],$inout0
	movdqu	($L_p),@offset[0]		# L_0 for all odd-numbered blocks
	mov	%r10,%rax			# restore twisted rounds
	aesenclast	@offset[1],$inout1
	aesenclast	@offset[2],$inout2
	aesenclast	@offset[3],$inout3
	aesenclast	@offset[4],$inout4
	aesenclast	@offset[5],$inout5
	ret
.size	__ocb_encrypt6,.-__ocb_encrypt6

# __ocb_encrypt4: four-block variant of the helper above. It chains four
# offsets, does not update the block counter or the table indices, and
# leaves the running-offset register for the caller to update.
.type	__ocb_encrypt4,\@abi-omnipotent
.align	32
__ocb_encrypt4:
	pxor	$rndkey0l,@offset[5]		# offset_i ^ round[0]
	movdqu	($L_p,$i1),@offset[1]
	movdqa	@offset[0],@offset[2]
	movdqu	($L_p,$i3),@offset[3]
	pxor	@offset[5],@offset[0]
	pxor	@offset[0],@offset[1]
	pxor	$inout0,$checksum		# accumulate checksum
	pxor	@offset[0],$inout0		# input ^ round[0] ^ offset_i
	pxor	@offset[1],@offset[2]
	pxor	$inout1,$checksum
	pxor	@offset[1],$inout1
	pxor	@offset[2],@offset[3]
	pxor	$inout2,$checksum
	pxor	@offset[2],$inout2
	pxor	$inout3,$checksum
	pxor	@offset[3],$inout3
	$movkey	32($key_),$rndkey0

	pxor	$rndkey0l,@offset[0]		# offset_i ^ round[last]
	pxor	$rndkey0l,@offset[1]
	pxor	$rndkey0l,@offset[2]
	pxor	$rndkey0l,@offset[3]

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	$movkey	48($key_),$rndkey1

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	$movkey	64($key_),$rndkey0
	jmp	.Locb_enc_loop4

.align	32
.Locb_enc_loop4:
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Locb_enc_loop4

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	$movkey	16($key_),$rndkey1
	mov	%r10,%rax			# restore twisted rounds

	aesenclast	@offset[0],$inout0
	aesenclast	@offset[1],$inout1
	aesenclast	@offset[2],$inout2
	aesenclast	@offset[3],$inout3
	ret
.size	__ocb_encrypt4,.-__ocb_encrypt4

# __ocb_encrypt1: encrypt the single block in $inout0. The caller passes
# the table entry for this block in $inout5; on return $inout5 holds the
# new offset xored with round[last], which the caller copies into the
# running-offset register.
.type	__ocb_encrypt1,\@abi-omnipotent
.align	32
__ocb_encrypt1:
	pxor	@offset[5],$inout5		# offset_i
	pxor	$rndkey0l,$inout5		# offset_i ^ round[0]
	pxor	$inout0,$checksum
# accumulate checksum
	pxor	$inout5,$inout0			# input ^ round[0] ^ offset_i
	$movkey	32($key_),$rndkey0

	aesenc	$rndkey1,$inout0
	$movkey	48($key_),$rndkey1
	pxor	$rndkey0l,$inout5		# offset_i ^ round[last]

	aesenc	$rndkey0,$inout0
	$movkey	64($key_),$rndkey0
	jmp	.Locb_enc_loop1

.align	32
.Locb_enc_loop1:
	aesenc	$rndkey1,$inout0
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax

	aesenc	$rndkey0,$inout0
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Locb_enc_loop1

	aesenc	$rndkey1,$inout0
	$movkey	16($key_),$rndkey1		# redundant in tail
	mov	%r10,%rax			# restore twisted rounds

	aesenclast	$inout5,$inout0
	ret
.size	__ocb_encrypt1,.-__ocb_encrypt1

# aesni_ocb_decrypt: same control flow as aesni_ocb_encrypt. Here the
# checksum is accumulated from the decrypted output in this routine, after
# each helper call returns.
.globl	aesni_ocb_decrypt
.type	aesni_ocb_decrypt,\@function,6
.align	32
aesni_ocb_decrypt:
	lea	(%rsp),%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
___
# Win64 only: save %xmm6-%xmm15 in a 0xa0-byte area below the pushed registers.
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp),%rsp
	movaps	%xmm6,0x00(%rsp)		# offload everything
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,0x60(%rsp)
	movaps	%xmm13,0x70(%rsp)
	movaps	%xmm14,0x80(%rsp)
	movaps	%xmm15,0x90(%rsp)
.Locb_dec_body:
___
$code.=<<___;
	mov	$seventh_arg(%rax),$L_p		# 7th argument
	mov	$seventh_arg+8(%rax),$checksum_p# 8th argument

	mov	240($key),$rnds_
	mov	$key,$key_
	shl	\$4,$rnds_
	$movkey	($key),$rndkey0l		# round[0]
	$movkey	16($key,$rnds_),$rndkey1	# round[last]

	movdqu	($offset_p),@offset[5]		# load last offset_i
	pxor	$rndkey1,$rndkey0l		# round[0] ^ round[last]
	pxor	$rndkey1,@offset[5]		# offset_i ^ round[last]

	mov	\$16+32,$rounds
	lea	32($key_,$rnds_),$key
	$movkey	16($key_),$rndkey1		# round[1]
	sub	%r10,%rax			# twisted $rounds
	mov	%rax,%r10			# backup twisted $rounds

	movdqu	($L_p),@offset[0]		# L_0 for all odd-numbered blocks
	movdqu	($checksum_p),$checksum		# load checksum

	test	\$1,$block_num			# is first block number odd?
	jnz	.Locb_dec_odd

	# even first block number: process one block on its own so that the
	# code from .Locb_dec_odd on always starts at an odd block number
	bsf	$block_num,$i1
	add	\$1,$block_num
	shl	\$4,$i1
	movdqu	($L_p,$i1),$inout5		# borrow
	movdqu	($inp),$inout0
	lea	16($inp),$inp

	call	__ocb_decrypt1

	movdqa	$inout5,@offset[5]
	movups	$inout0,($out)
	xorps	$inout0,$checksum		# accumulate checksum
	lea	16($out),$out
	sub	\$1,$blocks
	jz	.Locb_dec_done

.Locb_dec_odd:
	lea	1($block_num),$i1		# even-numbered blocks
	lea	3($block_num),$i3
	lea	5($block_num),$i5
	lea	6($block_num),$block_num
	bsf	$i1,$i1				# ntz(block)
	bsf	$i3,$i3
	bsf	$i5,$i5
	shl	\$4,$i1				# ntz(block) -> table offset
	shl	\$4,$i3
	shl	\$4,$i5

	sub	\$6,$blocks
	jc	.Locb_dec_short
	jmp	.Locb_dec_grandloop

.align	32
.Locb_dec_grandloop:
	# six blocks per iteration
	movdqu	`16*0`($inp),$inout0		# load input
	movdqu	`16*1`($inp),$inout1
	movdqu	`16*2`($inp),$inout2
	movdqu	`16*3`($inp),$inout3
	movdqu	`16*4`($inp),$inout4
	movdqu	`16*5`($inp),$inout5
	lea	`16*6`($inp),$inp

	call	__ocb_decrypt6

	movups	$inout0,`16*0`($out)		# store output
	pxor	$inout0,$checksum		# accumulate checksum
	movups	$inout1,`16*1`($out)
	pxor	$inout1,$checksum
	movups	$inout2,`16*2`($out)
	pxor	$inout2,$checksum
	movups	$inout3,`16*3`($out)
	pxor	$inout3,$checksum
	movups	$inout4,`16*4`($out)
	pxor	$inout4,$checksum
	movups	$inout5,`16*5`($out)
	pxor	$inout5,$checksum
	lea	`16*6`($out),$out
	sub	\$6,$blocks
	jnc	.Locb_dec_grandloop

.Locb_dec_short:
	# zero to five blocks remain; dispatch on the count
	add	\$6,$blocks
	jz	.Locb_dec_done

	movdqu	`16*0`($inp),$inout0
	cmp	\$2,$blocks
	jb	.Locb_dec_one
	movdqu	`16*1`($inp),$inout1
	je	.Locb_dec_two

	movdqu	`16*2`($inp),$inout2
	cmp	\$4,$blocks
	jb	.Locb_dec_three
	movdqu	`16*3`($inp),$inout3
	je	.Locb_dec_four

	# five blocks: six-block helper with a zeroed sixth input; only the
	# five real outputs are stored and added to the checksum
	movdqu	`16*4`($inp),$inout4
	pxor	$inout5,$inout5

	call	__ocb_decrypt6

	movdqa	@offset[4],@offset[5]
	movups	$inout0,`16*0`($out)		# store output
	pxor	$inout0,$checksum		# accumulate checksum
	movups	$inout1,`16*1`($out)
	pxor	$inout1,$checksum
	movups	$inout2,`16*2`($out)
	pxor	$inout2,$checksum
	movups	$inout3,`16*3`($out)
	pxor	$inout3,$checksum
	movups	$inout4,`16*4`($out)
	pxor	$inout4,$checksum

	jmp	.Locb_dec_done

.align	16
.Locb_dec_one:
	movdqa	@offset[0],$inout5		# borrow

	call	__ocb_decrypt1

	movdqa	$inout5,@offset[5]
	movups	$inout0,`16*0`($out)		# store output
	xorps	$inout0,$checksum		# accumulate checksum
	jmp	.Locb_dec_done

.align	16
.Locb_dec_two:
	# two blocks: four-block helper with the unused inputs zeroed
	pxor	$inout2,$inout2
	pxor	$inout3,$inout3

	call	__ocb_decrypt4

	movdqa	@offset[1],@offset[5]
	movups	$inout0,`16*0`($out)		# store output
	xorps	$inout0,$checksum		# accumulate checksum
	movups	$inout1,`16*1`($out)
	xorps	$inout1,$checksum

	jmp	.Locb_dec_done

.align	16
.Locb_dec_three:
	pxor	$inout3,$inout3

	call	__ocb_decrypt4

	movdqa	@offset[2],@offset[5]
	movups	$inout0,`16*0`($out)		# store output
	xorps	$inout0,$checksum		# accumulate checksum
	movups	$inout1,`16*1`($out)
	xorps	$inout1,$checksum
	movups	$inout2,`16*2`($out)
	xorps	$inout2,$checksum

	jmp	.Locb_dec_done

.align	16
.Locb_dec_four:
	call	__ocb_decrypt4

	movdqa	@offset[3],@offset[5]
	movups	$inout0,`16*0`($out)		# store output
	pxor	$inout0,$checksum		# accumulate checksum
	movups	$inout1,`16*1`($out)
	pxor	$inout1,$checksum
	movups	$inout2,`16*2`($out)
	pxor	$inout2,$checksum
	movups	$inout3,`16*3`($out)
	pxor	$inout3,$checksum

.Locb_dec_done:
	pxor	$rndkey0,@offset[5]		# "remove" round[last]
	movdqu	$checksum,($checksum_p)		# store checksum
	movdqu	@offset[5],($offset_p)		# store last offset_i

	xorps	%xmm0,%xmm0			# clear register bank
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
# Non-Win64: zero %xmm6-%xmm15 too, then point %rax just above the five
# pushed registers (0x28 bytes).
$code.=<<___ if (!$win64);
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	pxor	%xmm8,%xmm8
	pxor	%xmm9,%xmm9
	pxor	%xmm10,%xmm10
	pxor	%xmm11,%xmm11
	pxor	%xmm12,%xmm12
	pxor	%xmm13,%xmm13
	pxor	%xmm14,%xmm14
	pxor	%xmm15,%xmm15
	lea	0x28(%rsp),%rax
___
# Win64: reload the saved %xmm6-%xmm15, zero their save slots, and point
# %rax above both the save area and the pushed registers.
$code.=<<___ if ($win64);
	movaps	0x00(%rsp),%xmm6
	movaps	%xmm0,0x00(%rsp)		# clear stack
	movaps	0x10(%rsp),%xmm7
	movaps	%xmm0,0x10(%rsp)
	movaps	0x20(%rsp),%xmm8
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm9
	movaps	%xmm0,0x30(%rsp)
	movaps	0x40(%rsp),%xmm10
	movaps	%xmm0,0x40(%rsp)
	movaps	0x50(%rsp),%xmm11
	movaps	%xmm0,0x50(%rsp)
	movaps	0x60(%rsp),%xmm12
	movaps	%xmm0,0x60(%rsp)
	movaps	0x70(%rsp),%xmm13
	movaps	%xmm0,0x70(%rsp)
	movaps	0x80(%rsp),%xmm14
	movaps	%xmm0,0x80(%rsp)
	movaps	0x90(%rsp),%xmm15
	movaps	%xmm0,0x90(%rsp)
	lea	0xa0+0x28(%rsp),%rax
.Locb_dec_pop:
___
$code.=<<___;
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Locb_dec_epilogue:
	ret
.size	aesni_ocb_decrypt,.-aesni_ocb_decrypt

.type	__ocb_decrypt6,\@abi-omnipotent
.align	32
__ocb_decrypt6:
	pxor	$rndkey0l,@offset[5]		# offset_i ^ round[0]
	movdqu	($L_p,$i1),@offset[1]
	movdqa	@offset[0],@offset[2]
	movdqu	($L_p,$i3),@offset[3]
	movdqa	@offset[0],@offset[4]
	pxor	@offset[5],@offset[0]
	movdqu	($L_p,$i5),@offset[5]
	pxor	@offset[0],@offset[1]
	pxor	@offset[0],$inout0		#
input ^ round[0] ^ offset_i 3461 pxor @offset[1],@offset[2] 3462 pxor @offset[1],$inout1 3463 pxor @offset[2],@offset[3] 3464 pxor @offset[2],$inout2 3465 pxor @offset[3],@offset[4] 3466 pxor @offset[3],$inout3 3467 pxor @offset[4],@offset[5] 3468 pxor @offset[4],$inout4 3469 pxor @offset[5],$inout5 3470 $movkey 32($key_),$rndkey0 3471 3472 lea 1($block_num),$i1 # even-numbered blocks 3473 lea 3($block_num),$i3 3474 lea 5($block_num),$i5 3475 add \$6,$block_num 3476 pxor $rndkey0l,@offset[0] # offset_i ^ round[last] 3477 bsf $i1,$i1 # ntz(block) 3478 bsf $i3,$i3 3479 bsf $i5,$i5 3480 3481 aesdec $rndkey1,$inout0 3482 aesdec $rndkey1,$inout1 3483 aesdec $rndkey1,$inout2 3484 aesdec $rndkey1,$inout3 3485 pxor $rndkey0l,@offset[1] 3486 pxor $rndkey0l,@offset[2] 3487 aesdec $rndkey1,$inout4 3488 pxor $rndkey0l,@offset[3] 3489 pxor $rndkey0l,@offset[4] 3490 aesdec $rndkey1,$inout5 3491 $movkey 48($key_),$rndkey1 3492 pxor $rndkey0l,@offset[5] 3493 3494 aesdec $rndkey0,$inout0 3495 aesdec $rndkey0,$inout1 3496 aesdec $rndkey0,$inout2 3497 aesdec $rndkey0,$inout3 3498 aesdec $rndkey0,$inout4 3499 aesdec $rndkey0,$inout5 3500 $movkey 64($key_),$rndkey0 3501 shl \$4,$i1 # ntz(block) -> table offset 3502 shl \$4,$i3 3503 jmp .Locb_dec_loop6 3504 3505.align 32 3506.Locb_dec_loop6: 3507 aesdec $rndkey1,$inout0 3508 aesdec $rndkey1,$inout1 3509 aesdec $rndkey1,$inout2 3510 aesdec $rndkey1,$inout3 3511 aesdec $rndkey1,$inout4 3512 aesdec $rndkey1,$inout5 3513 $movkey ($key,%rax),$rndkey1 3514 add \$32,%rax 3515 3516 aesdec $rndkey0,$inout0 3517 aesdec $rndkey0,$inout1 3518 aesdec $rndkey0,$inout2 3519 aesdec $rndkey0,$inout3 3520 aesdec $rndkey0,$inout4 3521 aesdec $rndkey0,$inout5 3522 $movkey -16($key,%rax),$rndkey0 3523 jnz .Locb_dec_loop6 3524 3525 aesdec $rndkey1,$inout0 3526 aesdec $rndkey1,$inout1 3527 aesdec $rndkey1,$inout2 3528 aesdec $rndkey1,$inout3 3529 aesdec $rndkey1,$inout4 3530 aesdec $rndkey1,$inout5 3531 $movkey 16($key_),$rndkey1 3532 shl \$4,$i5 3533 3534 
aesdeclast @offset[0],$inout0 3535 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks 3536 mov %r10,%rax # restore twisted rounds 3537 aesdeclast @offset[1],$inout1 3538 aesdeclast @offset[2],$inout2 3539 aesdeclast @offset[3],$inout3 3540 aesdeclast @offset[4],$inout4 3541 aesdeclast @offset[5],$inout5 3542 ret 3543.size __ocb_decrypt6,.-__ocb_decrypt6 3544 3545.type __ocb_decrypt4,\@abi-omnipotent 3546.align 32 3547__ocb_decrypt4: 3548 pxor $rndkey0l,@offset[5] # offset_i ^ round[0] 3549 movdqu ($L_p,$i1),@offset[1] 3550 movdqa @offset[0],@offset[2] 3551 movdqu ($L_p,$i3),@offset[3] 3552 pxor @offset[5],@offset[0] 3553 pxor @offset[0],@offset[1] 3554 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i 3555 pxor @offset[1],@offset[2] 3556 pxor @offset[1],$inout1 3557 pxor @offset[2],@offset[3] 3558 pxor @offset[2],$inout2 3559 pxor @offset[3],$inout3 3560 $movkey 32($key_),$rndkey0 3561 3562 pxor $rndkey0l,@offset[0] # offset_i ^ round[last] 3563 pxor $rndkey0l,@offset[1] 3564 pxor $rndkey0l,@offset[2] 3565 pxor $rndkey0l,@offset[3] 3566 3567 aesdec $rndkey1,$inout0 3568 aesdec $rndkey1,$inout1 3569 aesdec $rndkey1,$inout2 3570 aesdec $rndkey1,$inout3 3571 $movkey 48($key_),$rndkey1 3572 3573 aesdec $rndkey0,$inout0 3574 aesdec $rndkey0,$inout1 3575 aesdec $rndkey0,$inout2 3576 aesdec $rndkey0,$inout3 3577 $movkey 64($key_),$rndkey0 3578 jmp .Locb_dec_loop4 3579 3580.align 32 3581.Locb_dec_loop4: 3582 aesdec $rndkey1,$inout0 3583 aesdec $rndkey1,$inout1 3584 aesdec $rndkey1,$inout2 3585 aesdec $rndkey1,$inout3 3586 $movkey ($key,%rax),$rndkey1 3587 add \$32,%rax 3588 3589 aesdec $rndkey0,$inout0 3590 aesdec $rndkey0,$inout1 3591 aesdec $rndkey0,$inout2 3592 aesdec $rndkey0,$inout3 3593 $movkey -16($key,%rax),$rndkey0 3594 jnz .Locb_dec_loop4 3595 3596 aesdec $rndkey1,$inout0 3597 aesdec $rndkey1,$inout1 3598 aesdec $rndkey1,$inout2 3599 aesdec $rndkey1,$inout3 3600 $movkey 16($key_),$rndkey1 3601 mov %r10,%rax # restore twisted rounds 3602 3603 
aesdeclast @offset[0],$inout0 3604 aesdeclast @offset[1],$inout1 3605 aesdeclast @offset[2],$inout2 3606 aesdeclast @offset[3],$inout3 3607 ret 3608.size __ocb_decrypt4,.-__ocb_decrypt4 3609 3610.type __ocb_decrypt1,\@abi-omnipotent 3611.align 32 3612__ocb_decrypt1: 3613 pxor @offset[5],$inout5 # offset_i 3614 pxor $rndkey0l,$inout5 # offset_i ^ round[0] 3615 pxor $inout5,$inout0 # input ^ round[0] ^ offset_i 3616 $movkey 32($key_),$rndkey0 3617 3618 aesdec $rndkey1,$inout0 3619 $movkey 48($key_),$rndkey1 3620 pxor $rndkey0l,$inout5 # offset_i ^ round[last] 3621 3622 aesdec $rndkey0,$inout0 3623 $movkey 64($key_),$rndkey0 3624 jmp .Locb_dec_loop1 3625 3626.align 32 3627.Locb_dec_loop1: 3628 aesdec $rndkey1,$inout0 3629 $movkey ($key,%rax),$rndkey1 3630 add \$32,%rax 3631 3632 aesdec $rndkey0,$inout0 3633 $movkey -16($key,%rax),$rndkey0 3634 jnz .Locb_dec_loop1 3635 3636 aesdec $rndkey1,$inout0 3637 $movkey 16($key_),$rndkey1 # redundant in tail 3638 mov %r10,%rax # restore twisted rounds 3639 3640 aesdeclast $inout5,$inout0 3641 ret 3642.size __ocb_decrypt1,.-__ocb_decrypt1 3643___ 3644} }} 3645 3646######################################################################## 3647# void $PREFIX_cbc_encrypt (const void *inp, void *out, 3648# size_t length, const AES_KEY *key, 3649# unsigned char *ivp,const int enc); 3650{ 3651my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt 3652my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15)); 3653 3654$code.=<<___; 3655.globl ${PREFIX}_cbc_encrypt 3656.type ${PREFIX}_cbc_encrypt,\@function,6 3657.align 16 3658${PREFIX}_cbc_encrypt: 3659 test $len,$len # check length 3660 jz .Lcbc_ret 3661 3662 mov 240($key),$rnds_ # key->rounds 3663 mov $key,$key_ # backup $key 3664 test %r9d,%r9d # 6th argument 3665 jz .Lcbc_decrypt 3666#--------------------------- CBC ENCRYPT ------------------------------# 3667 movups ($ivp),$inout0 # load iv as initial state 3668 mov $rnds_,$rounds 3669 cmp \$16,$len 3670 jb .Lcbc_enc_tail 3671 sub 
\$16,$len 3672 jmp .Lcbc_enc_loop 3673.align 16 3674.Lcbc_enc_loop: 3675 movups ($inp),$inout1 # load input 3676 lea 16($inp),$inp 3677 #xorps $inout1,$inout0 3678___ 3679 &aesni_generate1("enc",$key,$rounds,$inout0,$inout1); 3680$code.=<<___; 3681 mov $rnds_,$rounds # restore $rounds 3682 mov $key_,$key # restore $key 3683 movups $inout0,0($out) # store output 3684 lea 16($out),$out 3685 sub \$16,$len 3686 jnc .Lcbc_enc_loop 3687 add \$16,$len 3688 jnz .Lcbc_enc_tail 3689 pxor $rndkey0,$rndkey0 # clear register bank 3690 pxor $rndkey1,$rndkey1 3691 movups $inout0,($ivp) 3692 pxor $inout0,$inout0 3693 pxor $inout1,$inout1 3694 jmp .Lcbc_ret 3695 3696.Lcbc_enc_tail: 3697 mov $len,%rcx # zaps $key 3698 xchg $inp,$out # $inp is %rsi and $out is %rdi now 3699 .long 0x9066A4F3 # rep movsb 3700 mov \$16,%ecx # zero tail 3701 sub $len,%rcx 3702 xor %eax,%eax 3703 .long 0x9066AAF3 # rep stosb 3704 lea -16(%rdi),%rdi # rewind $out by 1 block 3705 mov $rnds_,$rounds # restore $rounds 3706 mov %rdi,%rsi # $inp and $out are the same 3707 mov $key_,$key # restore $key 3708 xor $len,$len # len=16 3709 jmp .Lcbc_enc_loop # one more spin 3710#--------------------------- CBC DECRYPT ------------------------------# 3711.align 16 3712.Lcbc_decrypt: 3713 cmp \$16,$len 3714 jne .Lcbc_decrypt_bulk 3715 3716 # handle single block without allocating stack frame, 3717 # useful in ciphertext stealing mode 3718 movdqu ($inp),$inout0 # load input 3719 movdqu ($ivp),$inout1 # load iv 3720 movdqa $inout0,$inout2 # future iv 3721___ 3722 &aesni_generate1("dec",$key,$rnds_); 3723$code.=<<___; 3724 pxor $rndkey0,$rndkey0 # clear register bank 3725 pxor $rndkey1,$rndkey1 3726 movdqu $inout2,($ivp) # store iv 3727 xorps $inout1,$inout0 # ^=iv 3728 pxor $inout1,$inout1 3729 movups $inout0,($out) # store output 3730 pxor $inout0,$inout0 3731 jmp .Lcbc_ret 3732.align 16 3733.Lcbc_decrypt_bulk: 3734 lea (%rsp),%r11 # frame pointer 3735 push %rbp 3736 sub \$$frame_size,%rsp 3737 and \$-16,%rsp # Linux 
kernel stack can be incorrectly seeded 3738___ 3739$code.=<<___ if ($win64); 3740 movaps %xmm6,0x10(%rsp) 3741 movaps %xmm7,0x20(%rsp) 3742 movaps %xmm8,0x30(%rsp) 3743 movaps %xmm9,0x40(%rsp) 3744 movaps %xmm10,0x50(%rsp) 3745 movaps %xmm11,0x60(%rsp) 3746 movaps %xmm12,0x70(%rsp) 3747 movaps %xmm13,0x80(%rsp) 3748 movaps %xmm14,0x90(%rsp) 3749 movaps %xmm15,0xa0(%rsp) 3750.Lcbc_decrypt_body: 3751___ 3752 3753my $inp_=$key_="%rbp"; # reassign $key_ 3754 3755$code.=<<___; 3756 mov $key,$key_ # [re-]backup $key [after reassignment] 3757 movups ($ivp),$iv 3758 mov $rnds_,$rounds 3759 cmp \$0x50,$len 3760 jbe .Lcbc_dec_tail 3761 3762 $movkey ($key),$rndkey0 3763 movdqu 0x00($inp),$inout0 # load input 3764 movdqu 0x10($inp),$inout1 3765 movdqa $inout0,$in0 3766 movdqu 0x20($inp),$inout2 3767 movdqa $inout1,$in1 3768 movdqu 0x30($inp),$inout3 3769 movdqa $inout2,$in2 3770 movdqu 0x40($inp),$inout4 3771 movdqa $inout3,$in3 3772 movdqu 0x50($inp),$inout5 3773 movdqa $inout4,$in4 3774 mov OPENSSL_ia32cap_P+4(%rip),%r9d 3775 cmp \$0x70,$len 3776 jbe .Lcbc_dec_six_or_seven 3777 3778 and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE 3779 sub \$0x50,$len # $len is biased by -5*16 3780 cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE 3781 je .Lcbc_dec_loop6_enter # [which denotes Atom Silvermont] 3782 sub \$0x20,$len # $len is biased by -7*16 3783 lea 0x70($key),$key # size optimization 3784 jmp .Lcbc_dec_loop8_enter 3785.align 16 3786.Lcbc_dec_loop8: 3787 movups $inout7,($out) 3788 lea 0x10($out),$out 3789.Lcbc_dec_loop8_enter: 3790 movdqu 0x60($inp),$inout6 3791 pxor $rndkey0,$inout0 3792 movdqu 0x70($inp),$inout7 3793 pxor $rndkey0,$inout1 3794 $movkey 0x10-0x70($key),$rndkey1 3795 pxor $rndkey0,$inout2 3796 mov \$-1,$inp_ 3797 cmp \$0x70,$len # is there at least 0x60 bytes ahead? 
3798 pxor $rndkey0,$inout3 3799 pxor $rndkey0,$inout4 3800 pxor $rndkey0,$inout5 3801 pxor $rndkey0,$inout6 3802 3803 aesdec $rndkey1,$inout0 3804 pxor $rndkey0,$inout7 3805 $movkey 0x20-0x70($key),$rndkey0 3806 aesdec $rndkey1,$inout1 3807 aesdec $rndkey1,$inout2 3808 aesdec $rndkey1,$inout3 3809 aesdec $rndkey1,$inout4 3810 aesdec $rndkey1,$inout5 3811 aesdec $rndkey1,$inout6 3812 adc \$0,$inp_ 3813 and \$128,$inp_ 3814 aesdec $rndkey1,$inout7 3815 add $inp,$inp_ 3816 $movkey 0x30-0x70($key),$rndkey1 3817___ 3818for($i=1;$i<12;$i++) { 3819my $rndkeyx = ($i&1)?$rndkey0:$rndkey1; 3820$code.=<<___ if ($i==7); 3821 cmp \$11,$rounds 3822___ 3823$code.=<<___; 3824 aesdec $rndkeyx,$inout0 3825 aesdec $rndkeyx,$inout1 3826 aesdec $rndkeyx,$inout2 3827 aesdec $rndkeyx,$inout3 3828 aesdec $rndkeyx,$inout4 3829 aesdec $rndkeyx,$inout5 3830 aesdec $rndkeyx,$inout6 3831 aesdec $rndkeyx,$inout7 3832 $movkey `0x30+0x10*$i`-0x70($key),$rndkeyx 3833___ 3834$code.=<<___ if ($i<6 || (!($i&1) && $i>7)); 3835 nop 3836___ 3837$code.=<<___ if ($i==7); 3838 jb .Lcbc_dec_done 3839___ 3840$code.=<<___ if ($i==9); 3841 je .Lcbc_dec_done 3842___ 3843$code.=<<___ if ($i==11); 3844 jmp .Lcbc_dec_done 3845___ 3846} 3847$code.=<<___; 3848.align 16 3849.Lcbc_dec_done: 3850 aesdec $rndkey1,$inout0 3851 aesdec $rndkey1,$inout1 3852 pxor $rndkey0,$iv 3853 pxor $rndkey0,$in0 3854 aesdec $rndkey1,$inout2 3855 aesdec $rndkey1,$inout3 3856 pxor $rndkey0,$in1 3857 pxor $rndkey0,$in2 3858 aesdec $rndkey1,$inout4 3859 aesdec $rndkey1,$inout5 3860 pxor $rndkey0,$in3 3861 pxor $rndkey0,$in4 3862 aesdec $rndkey1,$inout6 3863 aesdec $rndkey1,$inout7 3864 movdqu 0x50($inp),$rndkey1 3865 3866 aesdeclast $iv,$inout0 3867 movdqu 0x60($inp),$iv # borrow $iv 3868 pxor $rndkey0,$rndkey1 3869 aesdeclast $in0,$inout1 3870 pxor $rndkey0,$iv 3871 movdqu 0x70($inp),$rndkey0 # next IV 3872 aesdeclast $in1,$inout2 3873 lea 0x80($inp),$inp 3874 movdqu 0x00($inp_),$in0 3875 aesdeclast $in2,$inout3 3876 aesdeclast 
$in3,$inout4 3877 movdqu 0x10($inp_),$in1 3878 movdqu 0x20($inp_),$in2 3879 aesdeclast $in4,$inout5 3880 aesdeclast $rndkey1,$inout6 3881 movdqu 0x30($inp_),$in3 3882 movdqu 0x40($inp_),$in4 3883 aesdeclast $iv,$inout7 3884 movdqa $rndkey0,$iv # return $iv 3885 movdqu 0x50($inp_),$rndkey1 3886 $movkey -0x70($key),$rndkey0 3887 3888 movups $inout0,($out) # store output 3889 movdqa $in0,$inout0 3890 movups $inout1,0x10($out) 3891 movdqa $in1,$inout1 3892 movups $inout2,0x20($out) 3893 movdqa $in2,$inout2 3894 movups $inout3,0x30($out) 3895 movdqa $in3,$inout3 3896 movups $inout4,0x40($out) 3897 movdqa $in4,$inout4 3898 movups $inout5,0x50($out) 3899 movdqa $rndkey1,$inout5 3900 movups $inout6,0x60($out) 3901 lea 0x70($out),$out 3902 3903 sub \$0x80,$len 3904 ja .Lcbc_dec_loop8 3905 3906 movaps $inout7,$inout0 3907 lea -0x70($key),$key 3908 add \$0x70,$len 3909 jle .Lcbc_dec_clear_tail_collected 3910 movups $inout7,($out) 3911 lea 0x10($out),$out 3912 cmp \$0x50,$len 3913 jbe .Lcbc_dec_tail 3914 3915 movaps $in0,$inout0 3916.Lcbc_dec_six_or_seven: 3917 cmp \$0x60,$len 3918 ja .Lcbc_dec_seven 3919 3920 movaps $inout5,$inout6 3921 call _aesni_decrypt6 3922 pxor $iv,$inout0 # ^= IV 3923 movaps $inout6,$iv 3924 pxor $in0,$inout1 3925 movdqu $inout0,($out) 3926 pxor $in1,$inout2 3927 movdqu $inout1,0x10($out) 3928 pxor $inout1,$inout1 # clear register bank 3929 pxor $in2,$inout3 3930 movdqu $inout2,0x20($out) 3931 pxor $inout2,$inout2 3932 pxor $in3,$inout4 3933 movdqu $inout3,0x30($out) 3934 pxor $inout3,$inout3 3935 pxor $in4,$inout5 3936 movdqu $inout4,0x40($out) 3937 pxor $inout4,$inout4 3938 lea 0x50($out),$out 3939 movdqa $inout5,$inout0 3940 pxor $inout5,$inout5 3941 jmp .Lcbc_dec_tail_collected 3942 3943.align 16 3944.Lcbc_dec_seven: 3945 movups 0x60($inp),$inout6 3946 xorps $inout7,$inout7 3947 call _aesni_decrypt8 3948 movups 0x50($inp),$inout7 3949 pxor $iv,$inout0 # ^= IV 3950 movups 0x60($inp),$iv 3951 pxor $in0,$inout1 3952 movdqu $inout0,($out) 3953 pxor 
$in1,$inout2 3954 movdqu $inout1,0x10($out) 3955 pxor $inout1,$inout1 # clear register bank 3956 pxor $in2,$inout3 3957 movdqu $inout2,0x20($out) 3958 pxor $inout2,$inout2 3959 pxor $in3,$inout4 3960 movdqu $inout3,0x30($out) 3961 pxor $inout3,$inout3 3962 pxor $in4,$inout5 3963 movdqu $inout4,0x40($out) 3964 pxor $inout4,$inout4 3965 pxor $inout7,$inout6 3966 movdqu $inout5,0x50($out) 3967 pxor $inout5,$inout5 3968 lea 0x60($out),$out 3969 movdqa $inout6,$inout0 3970 pxor $inout6,$inout6 3971 pxor $inout7,$inout7 3972 jmp .Lcbc_dec_tail_collected 3973 3974.align 16 3975.Lcbc_dec_loop6: 3976 movups $inout5,($out) 3977 lea 0x10($out),$out 3978 movdqu 0x00($inp),$inout0 # load input 3979 movdqu 0x10($inp),$inout1 3980 movdqa $inout0,$in0 3981 movdqu 0x20($inp),$inout2 3982 movdqa $inout1,$in1 3983 movdqu 0x30($inp),$inout3 3984 movdqa $inout2,$in2 3985 movdqu 0x40($inp),$inout4 3986 movdqa $inout3,$in3 3987 movdqu 0x50($inp),$inout5 3988 movdqa $inout4,$in4 3989.Lcbc_dec_loop6_enter: 3990 lea 0x60($inp),$inp 3991 movdqa $inout5,$inout6 3992 3993 call _aesni_decrypt6 3994 3995 pxor $iv,$inout0 # ^= IV 3996 movdqa $inout6,$iv 3997 pxor $in0,$inout1 3998 movdqu $inout0,($out) 3999 pxor $in1,$inout2 4000 movdqu $inout1,0x10($out) 4001 pxor $in2,$inout3 4002 movdqu $inout2,0x20($out) 4003 pxor $in3,$inout4 4004 mov $key_,$key 4005 movdqu $inout3,0x30($out) 4006 pxor $in4,$inout5 4007 mov $rnds_,$rounds 4008 movdqu $inout4,0x40($out) 4009 lea 0x50($out),$out 4010 sub \$0x60,$len 4011 ja .Lcbc_dec_loop6 4012 4013 movdqa $inout5,$inout0 4014 add \$0x50,$len 4015 jle .Lcbc_dec_clear_tail_collected 4016 movups $inout5,($out) 4017 lea 0x10($out),$out 4018 4019.Lcbc_dec_tail: 4020 movups ($inp),$inout0 4021 sub \$0x10,$len 4022 jbe .Lcbc_dec_one # $len is 1*16 or less 4023 4024 movups 0x10($inp),$inout1 4025 movaps $inout0,$in0 4026 sub \$0x10,$len 4027 jbe .Lcbc_dec_two # $len is 2*16 or less 4028 4029 movups 0x20($inp),$inout2 4030 movaps $inout1,$in1 4031 sub \$0x10,$len 4032 
jbe .Lcbc_dec_three # $len is 3*16 or less 4033 4034 movups 0x30($inp),$inout3 4035 movaps $inout2,$in2 4036 sub \$0x10,$len 4037 jbe .Lcbc_dec_four # $len is 4*16 or less 4038 4039 movups 0x40($inp),$inout4 # $len is 5*16 or less 4040 movaps $inout3,$in3 4041 movaps $inout4,$in4 4042 xorps $inout5,$inout5 4043 call _aesni_decrypt6 4044 pxor $iv,$inout0 4045 movaps $in4,$iv 4046 pxor $in0,$inout1 4047 movdqu $inout0,($out) 4048 pxor $in1,$inout2 4049 movdqu $inout1,0x10($out) 4050 pxor $inout1,$inout1 # clear register bank 4051 pxor $in2,$inout3 4052 movdqu $inout2,0x20($out) 4053 pxor $inout2,$inout2 4054 pxor $in3,$inout4 4055 movdqu $inout3,0x30($out) 4056 pxor $inout3,$inout3 4057 lea 0x40($out),$out 4058 movdqa $inout4,$inout0 4059 pxor $inout4,$inout4 4060 pxor $inout5,$inout5 4061 sub \$0x10,$len 4062 jmp .Lcbc_dec_tail_collected 4063 4064.align 16 4065.Lcbc_dec_one: 4066 movaps $inout0,$in0 4067___ 4068 &aesni_generate1("dec",$key,$rounds); 4069$code.=<<___; 4070 xorps $iv,$inout0 4071 movaps $in0,$iv 4072 jmp .Lcbc_dec_tail_collected 4073.align 16 4074.Lcbc_dec_two: 4075 movaps $inout1,$in1 4076 call _aesni_decrypt2 4077 pxor $iv,$inout0 4078 movaps $in1,$iv 4079 pxor $in0,$inout1 4080 movdqu $inout0,($out) 4081 movdqa $inout1,$inout0 4082 pxor $inout1,$inout1 # clear register bank 4083 lea 0x10($out),$out 4084 jmp .Lcbc_dec_tail_collected 4085.align 16 4086.Lcbc_dec_three: 4087 movaps $inout2,$in2 4088 call _aesni_decrypt3 4089 pxor $iv,$inout0 4090 movaps $in2,$iv 4091 pxor $in0,$inout1 4092 movdqu $inout0,($out) 4093 pxor $in1,$inout2 4094 movdqu $inout1,0x10($out) 4095 pxor $inout1,$inout1 # clear register bank 4096 movdqa $inout2,$inout0 4097 pxor $inout2,$inout2 4098 lea 0x20($out),$out 4099 jmp .Lcbc_dec_tail_collected 4100.align 16 4101.Lcbc_dec_four: 4102 movaps $inout3,$in3 4103 call _aesni_decrypt4 4104 pxor $iv,$inout0 4105 movaps $in3,$iv 4106 pxor $in0,$inout1 4107 movdqu $inout0,($out) 4108 pxor $in1,$inout2 4109 movdqu $inout1,0x10($out) 
	pxor	$inout1,$inout1		# clear register bank
	pxor	$in2,$inout3
	movdqu	$inout2,0x20($out)
	pxor	$inout2,$inout2
	movdqa	$inout3,$inout0
	pxor	$inout3,$inout3
	lea	0x30($out),$out
	jmp	.Lcbc_dec_tail_collected

.align	16
.Lcbc_dec_clear_tail_collected:
	pxor	$inout1,$inout1		# clear register bank
	pxor	$inout2,$inout2
	pxor	$inout3,$inout3
___
$code.=<<___ if (!$win64);
	pxor	$inout4,$inout4		# %xmm6..9
	pxor	$inout5,$inout5
	pxor	$inout6,$inout6
	pxor	$inout7,$inout7
___
$code.=<<___;
.Lcbc_dec_tail_collected:
	movups	$iv,($ivp)
	and	\$15,$len
	jnz	.Lcbc_dec_tail_partial
	movups	$inout0,($out)
	pxor	$inout0,$inout0
	jmp	.Lcbc_dec_ret
.align	16
.Lcbc_dec_tail_partial:
	movaps	$inout0,(%rsp)
	pxor	$inout0,$inout0
	mov	\$16,%rcx
	mov	$out,%rdi
	sub	$len,%rcx
	lea	(%rsp),%rsi
	.long	0x9066A4F3		# rep movsb
	movdqa	$inout0,(%rsp)

.Lcbc_dec_ret:
	xorps	$rndkey0,$rndkey0	# %xmm0
	pxor	$rndkey1,$rndkey1
___
$code.=<<___ if ($win64);
	movaps	0x10(%rsp),%xmm6
	movaps	%xmm0,0x10(%rsp)	# clear stack
	movaps	0x20(%rsp),%xmm7
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm8
	movaps	%xmm0,0x30(%rsp)
	movaps	0x40(%rsp),%xmm9
	movaps	%xmm0,0x40(%rsp)
	movaps	0x50(%rsp),%xmm10
	movaps	%xmm0,0x50(%rsp)
	movaps	0x60(%rsp),%xmm11
	movaps	%xmm0,0x60(%rsp)
	movaps	0x70(%rsp),%xmm12
	movaps	%xmm0,0x70(%rsp)
	movaps	0x80(%rsp),%xmm13
	movaps	%xmm0,0x80(%rsp)
	movaps	0x90(%rsp),%xmm14
	movaps	%xmm0,0x90(%rsp)
	movaps	0xa0(%rsp),%xmm15
	movaps	%xmm0,0xa0(%rsp)
___
$code.=<<___;
	mov	-8(%r11),%rbp
	lea	(%r11),%rsp
.Lcbc_ret:
	ret
.size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
___
}
# int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
#				int bits, AES_KEY *key)
#
# Builds the encryption key schedule by calling __aesni_set_encrypt_key
# and, if that call returns 0, converts the schedule in place for
# decryption: round keys are swapped end-for-end, and every round key
# except the outermost pair is additionally passed through aesimc.
# A non-zero result from __aesni_set_encrypt_key is returned unchanged
# and the schedule is left as that call produced it.
#
# input:	$inp	user-supplied key
#		$bits	$inp length in bits
#		$key	pointer to key schedule
# output:	%eax	0 denoting success, -1 or -2 - failure (see C)
#		*$key	key schedule
#
{ my ($inp,$bits,$key) = @_4args;
  $bits =~ s/%r/%e/;

$code.=<<___;
.globl	${PREFIX}_set_decrypt_key
.type	${PREFIX}_set_decrypt_key,\@abi-omnipotent
.align	16
${PREFIX}_set_decrypt_key:
	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
	call	__aesni_set_encrypt_key
	shl	\$4,$bits		# rounds-1 after _aesni_set_encrypt_key
	test	%eax,%eax
	jnz	.Ldec_key_ret		# key setup failed, pass its result through
	lea	16($key,$bits),$inp	# points at the end of key schedule

	$movkey	($key),%xmm0		# just swap
	$movkey	($inp),%xmm1
	$movkey	%xmm0,($inp)
	$movkey	%xmm1,($key)
	lea	16($key),$key
	lea	-16($inp),$inp

.Ldec_key_inverse:
	$movkey	($key),%xmm0		# swap and inverse
	$movkey	($inp),%xmm1
	aesimc	%xmm0,%xmm0
	aesimc	%xmm1,%xmm1
	lea	16($key),$key
	lea	-16($inp),$inp
	$movkey	%xmm0,16($inp)
	$movkey	%xmm1,-16($key)
	cmp	$key,$inp
	ja	.Ldec_key_inverse

	$movkey	($key),%xmm0		# inverse middle
	aesimc	%xmm0,%xmm0
	pxor	%xmm1,%xmm1
	$movkey	%xmm0,($inp)
	pxor	%xmm0,%xmm0
.Ldec_key_ret:
	add	\$8,%rsp
	ret
.LSEH_end_set_decrypt_key:
.size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
___

# This is based on submission by
#
#	Huang Ying <ying.huang@intel.com>
#	Vinodh Gopal <vinodh.gopal@intel.com>
#	Kahraman Akdemir
#
# Aggressively optimized in respect to aeskeygenassist's critical path
# and is contained in %xmm0-5 to meet Win64 ABI requirement.
#
# int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
#				int bits, AES_KEY * const key);
#
# Expands a 128-, 192- or 256-bit user key into an AES encryption key
# schedule and records rounds-1 at offset 240 of the schedule.
#
# input:	$inp	user-supplied key
#		$bits	$inp length in bits
#		$key	pointer to key schedule
# output:	%eax	0 denoting success, -1 or -2 - failure (see C)
#		$bits	rounds-1 (used in aesni_set_decrypt_key)
#		*$key	key schedule
#		$key	pointer to key schedule (used in
#			aesni_set_decrypt_key)
#
# Failure codes: -1 when either $inp or $key is NULL, -2 when $bits is
# not one of 128, 192 or 256.
#
# Two code paths exist per key size. When the capability word at
# OPENSSL_ia32cap_P+4 has the AVX bit set and the XOP bit clear, the
# .L*rounds_alt loops are used, which derive round keys with
# pshufb/pshufd + aesenclast. Otherwise the aeskeygenassist sequences
# together with the .Lkey_expansion_* helpers are used.
#
# All exits go through .Lenc_key_ret, which zeroes %xmm0-%xmm5.
#
# Subroutine is frame-less, which means that only volatile registers
# are used. Note that it's declared "abi-omnipotent", which means that
# amount of volatile registers is smaller on Windows.
#
$code.=<<___;
.globl	${PREFIX}_set_encrypt_key
.type	${PREFIX}_set_encrypt_key,\@abi-omnipotent
.align	16
${PREFIX}_set_encrypt_key:
__aesni_set_encrypt_key:
	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
	mov	\$-1,%rax		# preset NULL-pointer failure code
	test	$inp,$inp
	jz	.Lenc_key_ret
	test	$key,$key
	jz	.Lenc_key_ret

	mov	\$`1<<28|1<<11`,%r10d	# AVX and XOP bits
	movups	($inp),%xmm0		# pull first 128 bits of *userKey
	xorps	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
	and	OPENSSL_ia32cap_P+4(%rip),%r10d
	lea	16($key),%rax		# %rax is used as modifiable copy of $key
	cmp	\$256,$bits
	je	.L14rounds
	cmp	\$192,$bits
	je	.L12rounds
	cmp	\$128,$bits
	jne	.Lbad_keybits

.L10rounds:
	mov	\$9,$bits			# 10 rounds for 128-bit key
	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
	je	.L10rounds_alt

	$movkey	%xmm0,($key)			# round 0
	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 1
	call		.Lkey_expansion_128_cold
	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 2
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 3
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 4
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 5
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 6
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x40,%xmm0,%xmm1	# round 7
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x80,%xmm0,%xmm1	# round 8
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x1b,%xmm0,%xmm1	# round 9
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x36,%xmm0,%xmm1	# round 10
	call		.Lkey_expansion_128
	$movkey	%xmm0,(%rax)
	mov	$bits,80(%rax)	# 240($key)
	xor	%eax,%eax	# return 0
	jmp	.Lenc_key_ret

.align	16
.L10rounds_alt:
	movdqa	.Lkey_rotate(%rip),%xmm5
	mov	\$8,%r10d
	movdqa	.Lkey_rcon1(%rip),%xmm4
	movdqa	%xmm0,%xmm2
	movdqu	%xmm0,($key)
	jmp	.Loop_key128

.align	16
.Loop_key128:
	pshufb		%xmm5,%xmm0
	aesenclast	%xmm4,%xmm0
	pslld		\$1,%xmm4
	lea		16(%rax),%rax

	movdqa		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm3,%xmm2

	pxor		%xmm2,%xmm0
	movdqu		%xmm0,-16(%rax)
	movdqa		%xmm0,%xmm2

	dec	%r10d
	jnz	.Loop_key128

	movdqa		.Lkey_rcon1b(%rip),%xmm4

	pshufb		%xmm5,%xmm0
	aesenclast	%xmm4,%xmm0
	pslld		\$1,%xmm4

	movdqa		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm3,%xmm2

	pxor		%xmm2,%xmm0
	movdqu		%xmm0,(%rax)

	movdqa		%xmm0,%xmm2
	pshufb		%xmm5,%xmm0
	aesenclast	%xmm4,%xmm0

	movdqa		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm3,%xmm2

	pxor		%xmm2,%xmm0
	movdqu		%xmm0,16(%rax)

	mov	$bits,96(%rax)	# 240($key)
	xor	%eax,%eax	# return 0
	jmp	.Lenc_key_ret

.align	16
.L12rounds:
	movq	16($inp),%xmm2			# remaining 1/3 of *userKey
	mov	\$11,$bits			# 12 rounds for 192
	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
	je	.L12rounds_alt

	$movkey	%xmm0,($key)			# round 0
	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 1,2
	call		.Lkey_expansion_192a_cold
	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 2,3
	call		.Lkey_expansion_192b
	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 4,5
	call		.Lkey_expansion_192a
	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 5,6
	call		.Lkey_expansion_192b
	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 7,8
	call		.Lkey_expansion_192a
	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 8,9
	call		.Lkey_expansion_192b
	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 10,11
	call		.Lkey_expansion_192a
	aeskeygenassist	\$0x80,%xmm2,%xmm1	# round 11,12
	call		.Lkey_expansion_192b
	$movkey	%xmm0,(%rax)
	mov	$bits,48(%rax)	# 240($key)
	xor	%eax,%eax	# return 0; 32-bit form also clears upper half of %rax
	jmp	.Lenc_key_ret

.align	16
.L12rounds_alt:
	movdqa	.Lkey_rotate192(%rip),%xmm5
	movdqa	.Lkey_rcon1(%rip),%xmm4
	mov	\$8,%r10d
	movdqu	%xmm0,($key)
	jmp	.Loop_key192

.align	16
.Loop_key192:
	movq		%xmm2,0(%rax)
	movdqa		%xmm2,%xmm1
	pshufb		%xmm5,%xmm2
	aesenclast	%xmm4,%xmm2
	pslld		\$1, %xmm4
	lea		24(%rax),%rax

	movdqa		%xmm0,%xmm3
	pslldq		\$4,%xmm0
	pxor		%xmm0,%xmm3
	pslldq		\$4,%xmm0
	pxor		%xmm0,%xmm3
	pslldq		\$4,%xmm0
	pxor		%xmm3,%xmm0

	pshufd		\$0xff,%xmm0,%xmm3
	pxor		%xmm1,%xmm3
	pslldq		\$4,%xmm1
	pxor		%xmm1,%xmm3

	pxor		%xmm2,%xmm0
	pxor		%xmm3,%xmm2
	movdqu		%xmm0,-16(%rax)

	dec	%r10d
	jnz	.Loop_key192

	mov	$bits,32(%rax)	# 240($key)
	xor	%eax,%eax	# return 0
	jmp	.Lenc_key_ret

.align	16
.L14rounds:
	movups	16($inp),%xmm2			# remaining half of *userKey
	mov	\$13,$bits			# 14 rounds for 256
	lea	16(%rax),%rax
	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
	je	.L14rounds_alt

	$movkey	%xmm0,($key)			# round 0
	$movkey	%xmm2,16($key)			# round 1
	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 2
	call		.Lkey_expansion_256a_cold
	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 3
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 4
	call		.Lkey_expansion_256a
	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 5
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 6
	call		.Lkey_expansion_256a
	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 7
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 8
	call		.Lkey_expansion_256a
	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 9
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 10
	call		.Lkey_expansion_256a
	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 11
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 12
	call		.Lkey_expansion_256a
	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 13
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 14
	call		.Lkey_expansion_256a
	$movkey	%xmm0,(%rax)
	mov	$bits,16(%rax)	# 240($key)
	xor	%eax,%eax	# return 0; 32-bit form also clears upper half of %rax
	jmp	.Lenc_key_ret

.align	16
.L14rounds_alt:
	movdqa	.Lkey_rotate(%rip),%xmm5
	movdqa	.Lkey_rcon1(%rip),%xmm4
	mov	\$7,%r10d
	movdqu	%xmm0,0($key)
	movdqa	%xmm2,%xmm1
	movdqu	%xmm2,16($key)
	jmp	.Loop_key256

.align	16
.Loop_key256:
	pshufb		%xmm5,%xmm2
	aesenclast	%xmm4,%xmm2

	movdqa		%xmm0,%xmm3
	pslldq		\$4,%xmm0
	pxor		%xmm0,%xmm3
	pslldq		\$4,%xmm0
	pxor		%xmm0,%xmm3
	pslldq		\$4,%xmm0
	pxor		%xmm3,%xmm0
	pslld		\$1,%xmm4

	pxor		%xmm2,%xmm0
	movdqu		%xmm0,(%rax)

	dec	%r10d
	jz	.Ldone_key256

	pshufd		\$0xff,%xmm0,%xmm2
	pxor		%xmm3,%xmm3
	aesenclast	%xmm3,%xmm2

	movdqa		%xmm1,%xmm3
	pslldq		\$4,%xmm1
	pxor		%xmm1,%xmm3
	pslldq		\$4,%xmm1
	pxor		%xmm1,%xmm3
	pslldq		\$4,%xmm1
	pxor		%xmm3,%xmm1

	pxor		%xmm1,%xmm2
	movdqu		%xmm2,16(%rax)
	lea		32(%rax),%rax
	movdqa		%xmm2,%xmm1

	jmp	.Loop_key256

.Ldone_key256:
	mov	$bits,16(%rax)	# 240($key)
	xor	%eax,%eax	# return 0
	jmp	.Lenc_key_ret

.align	16
.Lbad_keybits:
	mov	\$-2,%rax		# unsupported key length
.Lenc_key_ret:
	pxor	%xmm0,%xmm0		# wipe key material from registers
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	add	\$8,%rsp
	ret
.LSEH_end_set_encrypt_key:

.align	16
.Lkey_expansion_128:
	$movkey	%xmm0,(%rax)
	lea	16(%rax),%rax
.Lkey_expansion_128_cold:
	shufps	\$0b00010000,%xmm0,%xmm4
	xorps	%xmm4, %xmm0
	shufps	\$0b10001100,%xmm0,%xmm4
	xorps	%xmm4, %xmm0
	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
	xorps	%xmm1,%xmm0
	ret

.align	16
.Lkey_expansion_192a:
	$movkey	%xmm0,(%rax)
	lea	16(%rax),%rax
.Lkey_expansion_192a_cold:
	movaps	%xmm2, %xmm5
.Lkey_expansion_192b_warm:
	shufps	\$0b00010000,%xmm0,%xmm4
	movdqa	%xmm2,%xmm3
	xorps	%xmm4,%xmm0
	shufps	\$0b10001100,%xmm0,%xmm4
	pslldq	\$4,%xmm3
	xorps	%xmm4,%xmm0
	pshufd	\$0b01010101,%xmm1,%xmm1	# critical path
	pxor	%xmm3,%xmm2
	pxor	%xmm1,%xmm0
	pshufd	\$0b11111111,%xmm0,%xmm3
	pxor	%xmm3,%xmm2
	ret

.align	16
.Lkey_expansion_192b:
	movaps	%xmm0,%xmm3
	shufps	\$0b01000100,%xmm0,%xmm5
	$movkey	%xmm5,(%rax)
	shufps	\$0b01001110,%xmm2,%xmm3
	$movkey	%xmm3,16(%rax)
	lea	32(%rax),%rax
	jmp	.Lkey_expansion_192b_warm

.align	16
.Lkey_expansion_256a:
	$movkey	%xmm2,(%rax)
	lea	16(%rax),%rax
.Lkey_expansion_256a_cold:
	shufps	\$0b00010000,%xmm0,%xmm4
	xorps	%xmm4,%xmm0
	shufps	\$0b10001100,%xmm0,%xmm4
	xorps	%xmm4,%xmm0
	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
	xorps	%xmm1,%xmm0
	ret

.align	16
.Lkey_expansion_256b:
	$movkey	%xmm0,(%rax)
	lea	16(%rax),%rax

	shufps	\$0b00010000,%xmm2,%xmm4
	xorps	%xmm4,%xmm2
	shufps	\$0b10001100,%xmm2,%xmm4
	xorps	%xmm4,%xmm2
	shufps	\$0b10101010,%xmm1,%xmm1	# critical path
	xorps	%xmm1,%xmm2
	ret
.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
.size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
___
}

$code.=<<___;
.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lincrement32:
	.long	6,6,6,0
.Lincrement64:
	.long	1,0,0,0
.Lxts_magic:
	.long	0x87,0,1,0
.Lincrement1:
	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Lkey_rotate:
	.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
.Lkey_rotate192:
	.long	0x04070605,0x04070605,0x04070605,0x04070605
.Lkey_rcon1:
	.long	1,1,1,1
.Lkey_rcon1b:
	.long	0x1b,0x1b,0x1b,0x1b

.asciz	"AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
___
$code.=<<___ if ($PREFIX eq "aesni");
.type	ecb_ccm64_se_handler,\@abi-omnipotent
.align	16
ecb_ccm64_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue
label 4705 cmp %r10,%rbx # context->Rip>=epilogue label 4706 jae .Lcommon_seh_tail 4707 4708 lea 0(%rax),%rsi # %xmm save area 4709 lea 512($context),%rdi # &context.Xmm6 4710 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) 4711 .long 0xa548f3fc # cld; rep movsq 4712 lea 0x58(%rax),%rax # adjust stack pointer 4713 4714 jmp .Lcommon_seh_tail 4715.size ecb_ccm64_se_handler,.-ecb_ccm64_se_handler 4716 4717.type ctr_xts_se_handler,\@abi-omnipotent 4718.align 16 4719ctr_xts_se_handler: 4720 push %rsi 4721 push %rdi 4722 push %rbx 4723 push %rbp 4724 push %r12 4725 push %r13 4726 push %r14 4727 push %r15 4728 pushfq 4729 sub \$64,%rsp 4730 4731 mov 120($context),%rax # pull context->Rax 4732 mov 248($context),%rbx # pull context->Rip 4733 4734 mov 8($disp),%rsi # disp->ImageBase 4735 mov 56($disp),%r11 # disp->HandlerData 4736 4737 mov 0(%r11),%r10d # HandlerData[0] 4738 lea (%rsi,%r10),%r10 # prologue lable 4739 cmp %r10,%rbx # context->Rip<prologue label 4740 jb .Lcommon_seh_tail 4741 4742 mov 152($context),%rax # pull context->Rsp 4743 4744 mov 4(%r11),%r10d # HandlerData[1] 4745 lea (%rsi,%r10),%r10 # epilogue label 4746 cmp %r10,%rbx # context->Rip>=epilogue label 4747 jae .Lcommon_seh_tail 4748 4749 mov 208($context),%rax # pull context->R11 4750 4751 lea -0xa8(%rax),%rsi # %xmm save area 4752 lea 512($context),%rdi # & context.Xmm6 4753 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 4754 .long 0xa548f3fc # cld; rep movsq 4755 4756 mov -8(%rax),%rbp # restore saved %rbp 4757 mov %rbp,160($context) # restore context->Rbp 4758 jmp .Lcommon_seh_tail 4759.size ctr_xts_se_handler,.-ctr_xts_se_handler 4760 4761.type ocb_se_handler,\@abi-omnipotent 4762.align 16 4763ocb_se_handler: 4764 push %rsi 4765 push %rdi 4766 push %rbx 4767 push %rbp 4768 push %r12 4769 push %r13 4770 push %r14 4771 push %r15 4772 pushfq 4773 sub \$64,%rsp 4774 4775 mov 120($context),%rax # pull context->Rax 4776 mov 248($context),%rbx # pull context->Rip 4777 4778 mov 8($disp),%rsi # disp->ImageBase 
4779 mov 56($disp),%r11 # disp->HandlerData 4780 4781 mov 0(%r11),%r10d # HandlerData[0] 4782 lea (%rsi,%r10),%r10 # prologue lable 4783 cmp %r10,%rbx # context->Rip<prologue label 4784 jb .Lcommon_seh_tail 4785 4786 mov 4(%r11),%r10d # HandlerData[1] 4787 lea (%rsi,%r10),%r10 # epilogue label 4788 cmp %r10,%rbx # context->Rip>=epilogue label 4789 jae .Lcommon_seh_tail 4790 4791 mov 8(%r11),%r10d # HandlerData[2] 4792 lea (%rsi,%r10),%r10 4793 cmp %r10,%rbx # context->Rip>=pop label 4794 jae .Locb_no_xmm 4795 4796 mov 152($context),%rax # pull context->Rsp 4797 4798 lea (%rax),%rsi # %xmm save area 4799 lea 512($context),%rdi # & context.Xmm6 4800 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 4801 .long 0xa548f3fc # cld; rep movsq 4802 lea 0xa0+0x28(%rax),%rax 4803 4804.Locb_no_xmm: 4805 mov -8(%rax),%rbx 4806 mov -16(%rax),%rbp 4807 mov -24(%rax),%r12 4808 mov -32(%rax),%r13 4809 mov -40(%rax),%r14 4810 4811 mov %rbx,144($context) # restore context->Rbx 4812 mov %rbp,160($context) # restore context->Rbp 4813 mov %r12,216($context) # restore context->R12 4814 mov %r13,224($context) # restore context->R13 4815 mov %r14,232($context) # restore context->R14 4816 4817 jmp .Lcommon_seh_tail 4818.size ocb_se_handler,.-ocb_se_handler 4819___ 4820$code.=<<___; 4821.type cbc_se_handler,\@abi-omnipotent 4822.align 16 4823cbc_se_handler: 4824 push %rsi 4825 push %rdi 4826 push %rbx 4827 push %rbp 4828 push %r12 4829 push %r13 4830 push %r14 4831 push %r15 4832 pushfq 4833 sub \$64,%rsp 4834 4835 mov 152($context),%rax # pull context->Rsp 4836 mov 248($context),%rbx # pull context->Rip 4837 4838 lea .Lcbc_decrypt_bulk(%rip),%r10 4839 cmp %r10,%rbx # context->Rip<"prologue" label 4840 jb .Lcommon_seh_tail 4841 4842 mov 120($context),%rax # pull context->Rax 4843 4844 lea .Lcbc_decrypt_body(%rip),%r10 4845 cmp %r10,%rbx # context->Rip<cbc_decrypt_body 4846 jb .Lcommon_seh_tail 4847 4848 mov 152($context),%rax # pull context->Rsp 4849 4850 lea .Lcbc_ret(%rip),%r10 4851 cmp 
%r10,%rbx # context->Rip>="epilogue" label 4852 jae .Lcommon_seh_tail 4853 4854 lea 16(%rax),%rsi # %xmm save area 4855 lea 512($context),%rdi # &context.Xmm6 4856 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 4857 .long 0xa548f3fc # cld; rep movsq 4858 4859 mov 208($context),%rax # pull context->R11 4860 4861 mov -8(%rax),%rbp # restore saved %rbp 4862 mov %rbp,160($context) # restore context->Rbp 4863 4864.Lcommon_seh_tail: 4865 mov 8(%rax),%rdi 4866 mov 16(%rax),%rsi 4867 mov %rax,152($context) # restore context->Rsp 4868 mov %rsi,168($context) # restore context->Rsi 4869 mov %rdi,176($context) # restore context->Rdi 4870 4871 mov 40($disp),%rdi # disp->ContextRecord 4872 mov $context,%rsi # context 4873 mov \$154,%ecx # sizeof(CONTEXT) 4874 .long 0xa548f3fc # cld; rep movsq 4875 4876 mov $disp,%rsi 4877 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 4878 mov 8(%rsi),%rdx # arg2, disp->ImageBase 4879 mov 0(%rsi),%r8 # arg3, disp->ControlPc 4880 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 4881 mov 40(%rsi),%r10 # disp->ContextRecord 4882 lea 56(%rsi),%r11 # &disp->HandlerData 4883 lea 24(%rsi),%r12 # &disp->EstablisherFrame 4884 mov %r10,32(%rsp) # arg5 4885 mov %r11,40(%rsp) # arg6 4886 mov %r12,48(%rsp) # arg7 4887 mov %rcx,56(%rsp) # arg8, (NULL) 4888 call *__imp_RtlVirtualUnwind(%rip) 4889 4890 mov \$1,%eax # ExceptionContinueSearch 4891 add \$64,%rsp 4892 popfq 4893 pop %r15 4894 pop %r14 4895 pop %r13 4896 pop %r12 4897 pop %rbp 4898 pop %rbx 4899 pop %rdi 4900 pop %rsi 4901 ret 4902.size cbc_se_handler,.-cbc_se_handler 4903 4904.section .pdata 4905.align 4 4906___ 4907$code.=<<___ if ($PREFIX eq "aesni"); 4908 .rva .LSEH_begin_aesni_ecb_encrypt 4909 .rva .LSEH_end_aesni_ecb_encrypt 4910 .rva .LSEH_info_ecb 4911 4912 .rva .LSEH_begin_aesni_ccm64_encrypt_blocks 4913 .rva .LSEH_end_aesni_ccm64_encrypt_blocks 4914 .rva .LSEH_info_ccm64_enc 4915 4916 .rva .LSEH_begin_aesni_ccm64_decrypt_blocks 4917 .rva .LSEH_end_aesni_ccm64_decrypt_blocks 4918 .rva 
.LSEH_info_ccm64_dec 4919 4920 .rva .LSEH_begin_aesni_ctr32_encrypt_blocks 4921 .rva .LSEH_end_aesni_ctr32_encrypt_blocks 4922 .rva .LSEH_info_ctr32 4923 4924 .rva .LSEH_begin_aesni_xts_encrypt 4925 .rva .LSEH_end_aesni_xts_encrypt 4926 .rva .LSEH_info_xts_enc 4927 4928 .rva .LSEH_begin_aesni_xts_decrypt 4929 .rva .LSEH_end_aesni_xts_decrypt 4930 .rva .LSEH_info_xts_dec 4931 4932 .rva .LSEH_begin_aesni_ocb_encrypt 4933 .rva .LSEH_end_aesni_ocb_encrypt 4934 .rva .LSEH_info_ocb_enc 4935 4936 .rva .LSEH_begin_aesni_ocb_decrypt 4937 .rva .LSEH_end_aesni_ocb_decrypt 4938 .rva .LSEH_info_ocb_dec 4939___ 4940$code.=<<___; 4941 .rva .LSEH_begin_${PREFIX}_cbc_encrypt 4942 .rva .LSEH_end_${PREFIX}_cbc_encrypt 4943 .rva .LSEH_info_cbc 4944 4945 .rva ${PREFIX}_set_decrypt_key 4946 .rva .LSEH_end_set_decrypt_key 4947 .rva .LSEH_info_key 4948 4949 .rva ${PREFIX}_set_encrypt_key 4950 .rva .LSEH_end_set_encrypt_key 4951 .rva .LSEH_info_key 4952.section .xdata 4953.align 8 4954___ 4955$code.=<<___ if ($PREFIX eq "aesni"); 4956.LSEH_info_ecb: 4957 .byte 9,0,0,0 4958 .rva ecb_ccm64_se_handler 4959 .rva .Lecb_enc_body,.Lecb_enc_ret # HandlerData[] 4960.LSEH_info_ccm64_enc: 4961 .byte 9,0,0,0 4962 .rva ecb_ccm64_se_handler 4963 .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[] 4964.LSEH_info_ccm64_dec: 4965 .byte 9,0,0,0 4966 .rva ecb_ccm64_se_handler 4967 .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[] 4968.LSEH_info_ctr32: 4969 .byte 9,0,0,0 4970 .rva ctr_xts_se_handler 4971 .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[] 4972.LSEH_info_xts_enc: 4973 .byte 9,0,0,0 4974 .rva ctr_xts_se_handler 4975 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] 4976.LSEH_info_xts_dec: 4977 .byte 9,0,0,0 4978 .rva ctr_xts_se_handler 4979 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] 4980.LSEH_info_ocb_enc: 4981 .byte 9,0,0,0 4982 .rva ocb_se_handler 4983 .rva .Locb_enc_body,.Locb_enc_epilogue # HandlerData[] 4984 .rva .Locb_enc_pop 4985 .long 0 4986.LSEH_info_ocb_dec: 4987 
.byte	9,0,0,0
	.rva	ocb_se_handler
	.rva	.Locb_dec_body,.Locb_dec_epilogue	# HandlerData[]
	.rva	.Locb_dec_pop
	.long	0
___
# Unwind descriptors appended unconditionally: one that routes to
# cbc_se_handler and one shared by the set_*_key entry points.
$code.=<<___;
.LSEH_info_cbc:
	.byte	9,0,0,0
	.rva	cbc_se_handler
.LSEH_info_key:
	.byte	0x01,0x04,0x01,0x00
	.byte	0x04,0x02,0x00,0x00	# sub rsp,8
___
}

# rex(\@opcode,$dst,$src)
#
# Append a REX prefix byte to @opcode if, and only if, one of the two
# register numbers is 8 or higher.
#   \@opcode - reference to the list of opcode bytes being assembled
#   $dst     - register number encoded in ModR/M.reg (REX.R, 0x04)
#   $src     - register number encoded in ModR/M.r/m (REX.B, 0x01)
sub rex {
  my ($opcode,$dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    push @$opcode,$rex|0x40	if($rex);
}

# aesni($line)
#
# Translate one AES-NI instruction written in AT&T syntax into an
# equivalent ".byte" directive, so that the output can be assembled by
# tools that do not know the mnemonics.  Three operand shapes are
# recognised:
#
#   aeskeygenassist $imm,%xmmS,%xmmD
#   aes{imc,enc,enclast,dec,declast} %xmmS,%xmmD
#   aes{imc,enc,enclast,dec,declast} [off](%rsp),%xmmD
#
# Anything else, including an unknown aes* mnemonic, is returned
# unchanged.  The caller substitutes the return value for the original
# text, so returning undef here would silently delete the instruction
# from the generated assembly.
sub aesni {
  my $line=shift;
  my @opcode=(0x66);			# mandatory operand-size prefix
  my %opcodelet = (
	"aesimc" => 0xdb,
	"aesenc" => 0xdc,	"aesenclast" => 0xdd,
	"aesdec" => 0xde,	"aesdeclast" => 0xdf
  );

    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	# Copy the captures first: later pattern matches clobber $1..$4.
	my ($imm,$src,$dst)=($2,$3,$4);
	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x3a,0xdf;
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	# A leading 0 means hex/octal/binary notation; let oct() decode it.
	push @opcode,$imm=~/^0/?oct($imm):$imm;
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$src,$dst)=($1,$2,$3);
	return $line if (!defined($opcodelet{$mnemonic}));
	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$off,$dst)=($1,$2,$3);
	return $line if (!defined($opcodelet{$mnemonic}));
	# An absent offset is encoded as displacement 0.
	my $disp=$off=~/^0/?oct($off):int($off||0);
	push @opcode,0x44 if ($dst>=8);			# REX.R
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	if ($disp<=0x7f) {
	    # mod=01: 8-bit displacement, SIB selects %rsp as base.
	    push @opcode,0x44|(($dst&7)<<3),0x24;	# ModR/M, SIB
	    push @opcode,$disp;
	}
	else {
	    # mod=10: a disp8 would be sign-extended and address the
	    # wrong slot, so emit a little-endian 32-bit displacement.
	    push @opcode,0x84|(($dst&7)<<3),0x24;	# ModR/M, SIB
	    push @opcode,map { ($disp>>$_)&0xff } (0,8,16,24);
	}
	return ".byte\t".join(',',@opcode);
    }
    return $line;
}

# movbe($off)
#
# Return the ".byte" encoding of "movbe %eax,$off(%rsp)"; $off is
# appended verbatim as the trailing 8-bit displacement.
sub movbe {
	return ".byte 0x0f,0x38,0xf1,0x44,0x24,".shift;
}

# Post-process the accumulated assembly text:
#  1. evaluate `...` spans as Perl expressions and splice in the result;
#  2. hand every aes* instruction that mentions an %xmm register to
#     aesni(); text following the last %xmmN on that line is dropped;
#  3. hand "movbe %eax,N(%rsp)" to movbe().
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm;	# debugging artefact
$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;

print $code;

# Buffered write errors only surface at close; fail loudly rather than
# leaving truncated output behind with a zero exit status.
close STDOUT or die "error closing STDOUT: $!";