1#!/usr/bin/env perl 2 3# ==================================================================== 4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 5# project. The module is, however, dual licensed under OpenSSL and 6# CRYPTOGAMS licenses depending on where you obtain it. For further 7# details see http://www.openssl.org/~appro/cryptogams/. 8# ==================================================================== 9# 10# This module implements support for Intel AES-NI extension. In 11# OpenSSL context it's used with Intel engine, but can also be used as 12# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for 13# details]. 14# 15# Performance. 16# 17# To start with see corresponding paragraph in aesni-x86_64.pl... 18# Instead of filling table similar to one found there I've chosen to 19# summarize *comparison* results for raw ECB, CTR and CBC benchmarks. 20# The simplified table below represents 32-bit performance relative 21# to 64-bit one in every given point. Ratios vary for different 22# encryption modes, therefore interval values. 23# 24# 16-byte 64-byte 256-byte 1-KB 8-KB 25# 53-67% 67-84% 91-94% 95-98% 97-99.5% 26# 27# Lower ratios for smaller block sizes are perfectly understandable, 28# because function call overhead is higher in 32-bit mode. Largest 29# 8-KB block performance is virtually same: 32-bit code is less than 30# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise. 31 32# January 2011 33# 34# See aesni-x86_64.pl for details. Unlike x86_64 version this module 35# interleaves at most 6 aes[enc|dec] instructions, because there are 36# not enough registers for 8x interleave [which should be optimal for 37# Sandy Bridge]. Actually, performance results for 6x interleave 38# factor presented in aesni-x86_64.pl (except for CTR) are for this 39# module. 40 41# April 2011 42# 43# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing 44# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09. 45 46###################################################################### 47# Current large-block performance in cycles per byte processed with 48# 128-bit key (less is better). 49# 50# CBC en-/decrypt CTR XTS ECB 51# Westmere 3.77/1.37 1.37 1.52 1.27 52# * Bridge 5.07/0.98 0.99 1.09 0.91 53# Haswell 4.44/0.80 0.97 1.03 0.72 54# Silvermont 5.77/3.56 3.67 4.03 3.46 55# Bulldozer 5.80/0.98 1.05 1.24 0.93 56 57$PREFIX="aesni"; # if $PREFIX is set to "AES", the script 58 # generates drop-in replacement for 59 # crypto/aes/asm/aes-586.pl:-) 60$inline=1; # inline _aesni_[en|de]crypt 61 62$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 63push(@INC,"${dir}","${dir}../../perlasm"); 64require "x86asm.pl"; 65 66&asm_init($ARGV[0],$0); 67 68&external_label("OPENSSL_ia32cap_P"); 69&static_label("key_const"); 70 71if ($PREFIX eq "aesni") { $movekey=\&movups; } 72else { $movekey=\&movups; } 73 74$len="eax"; 75$rounds="ecx"; 76$key="edx"; 77$inp="esi"; 78$out="edi"; 79$rounds_="ebx"; # backup copy for $rounds 80$key_="ebp"; # backup copy for $key 81 82$rndkey0="xmm0"; 83$rndkey1="xmm1"; 84$inout0="xmm2"; 85$inout1="xmm3"; 86$inout2="xmm4"; 87$inout3="xmm5"; $in1="xmm5"; 88$inout4="xmm6"; $in0="xmm6"; 89$inout5="xmm7"; $ivec="xmm7"; 90 91# AESNI extenstion 92sub aeskeygenassist 93{ my($dst,$src,$imm)=@_; 94 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) 95 { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); } 96} 97sub aescommon 98{ my($opcodelet,$dst,$src)=@_; 99 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) 100 { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);} 101} 102sub aesimc { aescommon(0xdb,@_); } 103sub aesenc { aescommon(0xdc,@_); } 104sub aesenclast { aescommon(0xdd,@_); } 105sub aesdec { aescommon(0xde,@_); } 106sub aesdeclast { aescommon(0xdf,@_); } 107 108# Inline version of internal aesni_[en|de]crypt1 109{ my $sn; 110sub aesni_inline_generate1 111{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout)); 112 $sn++; 113 114 &$movekey ($rndkey0,&QWP(0,$key)); 115 &$movekey ($rndkey1,&QWP(16,$key)); 116 &xorps ($ivec,$rndkey0) if (defined($ivec)); 117 &lea ($key,&DWP(32,$key)); 118 &xorps ($inout,$ivec) if (defined($ivec)); 119 &xorps ($inout,$rndkey0) if (!defined($ivec)); 120 &set_label("${p}1_loop_$sn"); 121 eval"&aes${p} ($inout,$rndkey1)"; 122 &dec ($rounds); 123 &$movekey ($rndkey1,&QWP(0,$key)); 124 &lea ($key,&DWP(16,$key)); 125 &jnz (&label("${p}1_loop_$sn")); 126 eval"&aes${p}last ($inout,$rndkey1)"; 127}} 128 129sub aesni_generate1 # fully unrolled loop 130{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout)); 131 132 &function_begin_B("_aesni_${p}rypt1"); 133 &movups ($rndkey0,&QWP(0,$key)); 134 &$movekey ($rndkey1,&QWP(0x10,$key)); 135 &xorps ($inout,$rndkey0); 136 &$movekey ($rndkey0,&QWP(0x20,$key)); 137 &lea ($key,&DWP(0x30,$key)); 138 &cmp ($rounds,11); 139 &jb (&label("${p}128")); 140 &lea ($key,&DWP(0x20,$key)); 141 &je (&label("${p}192")); 142 &lea ($key,&DWP(0x20,$key)); 143 eval"&aes${p} ($inout,$rndkey1)"; 144 &$movekey ($rndkey1,&QWP(-0x40,$key)); 145 eval"&aes${p} ($inout,$rndkey0)"; 146 &$movekey ($rndkey0,&QWP(-0x30,$key)); 147 &set_label("${p}192"); 148 eval"&aes${p} ($inout,$rndkey1)"; 149 &$movekey ($rndkey1,&QWP(-0x20,$key)); 150 eval"&aes${p} ($inout,$rndkey0)"; 151 &$movekey ($rndkey0,&QWP(-0x10,$key)); 152 &set_label("${p}128"); 153 eval"&aes${p} ($inout,$rndkey1)"; 154 &$movekey ($rndkey1,&QWP(0,$key)); 155 eval"&aes${p} ($inout,$rndkey0)"; 156 &$movekey ($rndkey0,&QWP(0x10,$key)); 157 eval"&aes${p} ($inout,$rndkey1)"; 158 &$movekey ($rndkey1,&QWP(0x20,$key)); 159 eval"&aes${p} ($inout,$rndkey0)"; 160 &$movekey ($rndkey0,&QWP(0x30,$key)); 161 eval"&aes${p} ($inout,$rndkey1)"; 162 &$movekey ($rndkey1,&QWP(0x40,$key)); 163 eval"&aes${p} ($inout,$rndkey0)"; 164 &$movekey ($rndkey0,&QWP(0x50,$key)); 165 eval"&aes${p} ($inout,$rndkey1)"; 166 &$movekey ($rndkey1,&QWP(0x60,$key)); 167 eval"&aes${p} ($inout,$rndkey0)"; 168 &$movekey ($rndkey0,&QWP(0x70,$key)); 169 eval"&aes${p} ($inout,$rndkey1)"; 170 eval"&aes${p}last ($inout,$rndkey0)"; 171 &ret(); 172 &function_end_B("_aesni_${p}rypt1"); 173} 174 175# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key); 176&aesni_generate1("enc") if (!$inline); 177&function_begin_B("${PREFIX}_encrypt"); 178 &mov ("eax",&wparam(0)); 179 &mov ($key,&wparam(2)); 180 &movups ($inout0,&QWP(0,"eax")); 181 &mov ($rounds,&DWP(240,$key)); 182 &mov ("eax",&wparam(1)); 183 if ($inline) 184 { &aesni_inline_generate1("enc"); } 185 else 186 { &call ("_aesni_encrypt1"); } 187 &pxor ($rndkey0,$rndkey0); # clear register bank 188 &pxor ($rndkey1,$rndkey1); 189 &movups (&QWP(0,"eax"),$inout0); 190 &pxor ($inout0,$inout0); 191 &ret (); 192&function_end_B("${PREFIX}_encrypt"); 193 194# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key); 195&aesni_generate1("dec") if(!$inline); 196&function_begin_B("${PREFIX}_decrypt"); 197 &mov ("eax",&wparam(0)); 198 &mov ($key,&wparam(2)); 199 &movups ($inout0,&QWP(0,"eax")); 200 &mov ($rounds,&DWP(240,$key)); 201 &mov ("eax",&wparam(1)); 202 if ($inline) 203 { &aesni_inline_generate1("dec"); } 204 else 205 { &call ("_aesni_decrypt1"); } 206 &pxor ($rndkey0,$rndkey0); # clear register bank 207 &pxor ($rndkey1,$rndkey1); 208 &movups (&QWP(0,"eax"),$inout0); 209 &pxor ($inout0,$inout0); 210 &ret (); 211&function_end_B("${PREFIX}_decrypt"); 212 213# _aesni_[en|de]cryptN are private interfaces, N denotes interleave 214# factor. Why 3x subroutine were originally used in loops? Even though 215# aes[enc|dec] latency was originally 6, it could be scheduled only 216# every *2nd* cycle. Thus 3x interleave was the one providing optimal 217# utilization, i.e. when subroutine's throughput is virtually same as 218# of non-interleaved subroutine [for number of input blocks up to 3]. 219# This is why it originally made no sense to implement 2x subroutine. 220# But times change and it became appropriate to spend extra 192 bytes 221# on 2x subroutine on Atom Silvermont account. For processors that 222# can schedule aes[enc|dec] every cycle optimal interleave factor 223# equals to corresponding instructions latency. 8x is optimal for 224# * Bridge, but it's unfeasible to accommodate such implementation 225# in XMM registers addreassable in 32-bit mode and therefore maximum 226# of 6x is used instead... 227 228sub aesni_generate2 229{ my $p=shift; 230 231 &function_begin_B("_aesni_${p}rypt2"); 232 &$movekey ($rndkey0,&QWP(0,$key)); 233 &shl ($rounds,4); 234 &$movekey ($rndkey1,&QWP(16,$key)); 235 &xorps ($inout0,$rndkey0); 236 &pxor ($inout1,$rndkey0); 237 &$movekey ($rndkey0,&QWP(32,$key)); 238 &lea ($key,&DWP(32,$key,$rounds)); 239 &neg ($rounds); 240 &add ($rounds,16); 241 242 &set_label("${p}2_loop"); 243 eval"&aes${p} ($inout0,$rndkey1)"; 244 eval"&aes${p} ($inout1,$rndkey1)"; 245 &$movekey ($rndkey1,&QWP(0,$key,$rounds)); 246 &add ($rounds,32); 247 eval"&aes${p} ($inout0,$rndkey0)"; 248 eval"&aes${p} ($inout1,$rndkey0)"; 249 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 250 &jnz (&label("${p}2_loop")); 251 eval"&aes${p} ($inout0,$rndkey1)"; 252 eval"&aes${p} ($inout1,$rndkey1)"; 253 eval"&aes${p}last ($inout0,$rndkey0)"; 254 eval"&aes${p}last ($inout1,$rndkey0)"; 255 &ret(); 256 &function_end_B("_aesni_${p}rypt2"); 257} 258 259sub aesni_generate3 260{ my $p=shift; 261 262 &function_begin_B("_aesni_${p}rypt3"); 263 &$movekey ($rndkey0,&QWP(0,$key)); 264 &shl ($rounds,4); 265 &$movekey ($rndkey1,&QWP(16,$key)); 266 &xorps ($inout0,$rndkey0); 267 &pxor ($inout1,$rndkey0); 268 &pxor ($inout2,$rndkey0); 269 &$movekey ($rndkey0,&QWP(32,$key)); 270 &lea ($key,&DWP(32,$key,$rounds)); 271 &neg ($rounds); 272 &add ($rounds,16); 273 274 &set_label("${p}3_loop"); 275 eval"&aes${p} ($inout0,$rndkey1)"; 276 eval"&aes${p} ($inout1,$rndkey1)"; 277 eval"&aes${p} ($inout2,$rndkey1)"; 278 &$movekey ($rndkey1,&QWP(0,$key,$rounds)); 279 &add ($rounds,32); 280 eval"&aes${p} ($inout0,$rndkey0)"; 281 eval"&aes${p} ($inout1,$rndkey0)"; 282 eval"&aes${p} ($inout2,$rndkey0)"; 283 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 284 &jnz (&label("${p}3_loop")); 285 eval"&aes${p} ($inout0,$rndkey1)"; 286 eval"&aes${p} ($inout1,$rndkey1)"; 287 eval"&aes${p} ($inout2,$rndkey1)"; 288 eval"&aes${p}last ($inout0,$rndkey0)"; 289 eval"&aes${p}last ($inout1,$rndkey0)"; 290 eval"&aes${p}last ($inout2,$rndkey0)"; 291 &ret(); 292 &function_end_B("_aesni_${p}rypt3"); 293} 294 295# 4x interleave is implemented to improve small block performance, 296# most notably [and naturally] 4 block by ~30%. One can argue that one 297# should have implemented 5x as well, but improvement would be <20%, 298# so it's not worth it... 299sub aesni_generate4 300{ my $p=shift; 301 302 &function_begin_B("_aesni_${p}rypt4"); 303 &$movekey ($rndkey0,&QWP(0,$key)); 304 &$movekey ($rndkey1,&QWP(16,$key)); 305 &shl ($rounds,4); 306 &xorps ($inout0,$rndkey0); 307 &pxor ($inout1,$rndkey0); 308 &pxor ($inout2,$rndkey0); 309 &pxor ($inout3,$rndkey0); 310 &$movekey ($rndkey0,&QWP(32,$key)); 311 &lea ($key,&DWP(32,$key,$rounds)); 312 &neg ($rounds); 313 &data_byte (0x0f,0x1f,0x40,0x00); 314 &add ($rounds,16); 315 316 &set_label("${p}4_loop"); 317 eval"&aes${p} ($inout0,$rndkey1)"; 318 eval"&aes${p} ($inout1,$rndkey1)"; 319 eval"&aes${p} ($inout2,$rndkey1)"; 320 eval"&aes${p} ($inout3,$rndkey1)"; 321 &$movekey ($rndkey1,&QWP(0,$key,$rounds)); 322 &add ($rounds,32); 323 eval"&aes${p} ($inout0,$rndkey0)"; 324 eval"&aes${p} ($inout1,$rndkey0)"; 325 eval"&aes${p} ($inout2,$rndkey0)"; 326 eval"&aes${p} ($inout3,$rndkey0)"; 327 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 328 &jnz (&label("${p}4_loop")); 329 330 eval"&aes${p} ($inout0,$rndkey1)"; 331 eval"&aes${p} ($inout1,$rndkey1)"; 332 eval"&aes${p} ($inout2,$rndkey1)"; 333 eval"&aes${p} ($inout3,$rndkey1)"; 334 eval"&aes${p}last ($inout0,$rndkey0)"; 335 eval"&aes${p}last ($inout1,$rndkey0)"; 336 eval"&aes${p}last ($inout2,$rndkey0)"; 337 eval"&aes${p}last ($inout3,$rndkey0)"; 338 &ret(); 339 &function_end_B("_aesni_${p}rypt4"); 340} 341 342sub aesni_generate6 343{ my $p=shift; 344 345 &function_begin_B("_aesni_${p}rypt6"); 346 &static_label("_aesni_${p}rypt6_enter"); 347 &$movekey ($rndkey0,&QWP(0,$key)); 348 &shl ($rounds,4); 349 &$movekey ($rndkey1,&QWP(16,$key)); 350 &xorps ($inout0,$rndkey0); 351 &pxor ($inout1,$rndkey0); # pxor does better here 352 &pxor ($inout2,$rndkey0); 353 eval"&aes${p} ($inout0,$rndkey1)"; 354 &pxor ($inout3,$rndkey0); 355 &pxor ($inout4,$rndkey0); 356 eval"&aes${p} ($inout1,$rndkey1)"; 357 &lea ($key,&DWP(32,$key,$rounds)); 358 &neg ($rounds); 359 eval"&aes${p} ($inout2,$rndkey1)"; 360 &pxor ($inout5,$rndkey0); 361 &$movekey ($rndkey0,&QWP(0,$key,$rounds)); 362 &add ($rounds,16); 363 &jmp (&label("_aesni_${p}rypt6_inner")); 364 365 &set_label("${p}6_loop",16); 366 eval"&aes${p} ($inout0,$rndkey1)"; 367 eval"&aes${p} ($inout1,$rndkey1)"; 368 eval"&aes${p} ($inout2,$rndkey1)"; 369 &set_label("_aesni_${p}rypt6_inner"); 370 eval"&aes${p} ($inout3,$rndkey1)"; 371 eval"&aes${p} ($inout4,$rndkey1)"; 372 eval"&aes${p} ($inout5,$rndkey1)"; 373 &set_label("_aesni_${p}rypt6_enter"); 374 &$movekey ($rndkey1,&QWP(0,$key,$rounds)); 375 &add ($rounds,32); 376 eval"&aes${p} ($inout0,$rndkey0)"; 377 eval"&aes${p} ($inout1,$rndkey0)"; 378 eval"&aes${p} ($inout2,$rndkey0)"; 379 eval"&aes${p} ($inout3,$rndkey0)"; 380 eval"&aes${p} ($inout4,$rndkey0)"; 381 eval"&aes${p} ($inout5,$rndkey0)"; 382 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 383 &jnz (&label("${p}6_loop")); 384 385 eval"&aes${p} ($inout0,$rndkey1)"; 386 eval"&aes${p} ($inout1,$rndkey1)"; 387 eval"&aes${p} ($inout2,$rndkey1)"; 388 eval"&aes${p} ($inout3,$rndkey1)"; 389 eval"&aes${p} ($inout4,$rndkey1)"; 390 eval"&aes${p} ($inout5,$rndkey1)"; 391 eval"&aes${p}last ($inout0,$rndkey0)"; 392 eval"&aes${p}last ($inout1,$rndkey0)"; 393 eval"&aes${p}last ($inout2,$rndkey0)"; 394 eval"&aes${p}last ($inout3,$rndkey0)"; 395 eval"&aes${p}last ($inout4,$rndkey0)"; 396 eval"&aes${p}last ($inout5,$rndkey0)"; 397 &ret(); 398 &function_end_B("_aesni_${p}rypt6"); 399} 400&aesni_generate2("enc") if ($PREFIX eq "aesni"); 401&aesni_generate2("dec"); 402&aesni_generate3("enc") if ($PREFIX eq "aesni"); 403&aesni_generate3("dec"); 404&aesni_generate4("enc") if ($PREFIX eq "aesni"); 405&aesni_generate4("dec"); 406&aesni_generate6("enc") if ($PREFIX eq "aesni"); 407&aesni_generate6("dec"); 408 409if ($PREFIX eq "aesni") { 410###################################################################### 411# void aesni_ecb_encrypt (const void *in, void *out, 412# size_t length, const AES_KEY *key, 413# int enc); 414&function_begin("aesni_ecb_encrypt"); 415 &mov ($inp,&wparam(0)); 416 &mov ($out,&wparam(1)); 417 &mov ($len,&wparam(2)); 418 &mov ($key,&wparam(3)); 419 &mov ($rounds_,&wparam(4)); 420 &and ($len,-16); 421 &jz (&label("ecb_ret")); 422 &mov ($rounds,&DWP(240,$key)); 423 &test ($rounds_,$rounds_); 424 &jz (&label("ecb_decrypt")); 425 426 &mov ($key_,$key); # backup $key 427 &mov ($rounds_,$rounds); # backup $rounds 428 &cmp ($len,0x60); 429 &jb (&label("ecb_enc_tail")); 430 431 &movdqu ($inout0,&QWP(0,$inp)); 432 &movdqu ($inout1,&QWP(0x10,$inp)); 433 &movdqu ($inout2,&QWP(0x20,$inp)); 434 &movdqu ($inout3,&QWP(0x30,$inp)); 435 &movdqu ($inout4,&QWP(0x40,$inp)); 436 &movdqu ($inout5,&QWP(0x50,$inp)); 437 &lea ($inp,&DWP(0x60,$inp)); 438 &sub ($len,0x60); 439 &jmp (&label("ecb_enc_loop6_enter")); 440 441&set_label("ecb_enc_loop6",16); 442 &movups (&QWP(0,$out),$inout0); 443 &movdqu ($inout0,&QWP(0,$inp)); 444 &movups (&QWP(0x10,$out),$inout1); 445 &movdqu ($inout1,&QWP(0x10,$inp)); 446 &movups (&QWP(0x20,$out),$inout2); 447 &movdqu ($inout2,&QWP(0x20,$inp)); 448 &movups (&QWP(0x30,$out),$inout3); 449 &movdqu ($inout3,&QWP(0x30,$inp)); 450 &movups (&QWP(0x40,$out),$inout4); 451 &movdqu ($inout4,&QWP(0x40,$inp)); 452 &movups (&QWP(0x50,$out),$inout5); 453 &lea ($out,&DWP(0x60,$out)); 454 &movdqu ($inout5,&QWP(0x50,$inp)); 455 &lea ($inp,&DWP(0x60,$inp)); 456&set_label("ecb_enc_loop6_enter"); 457 458 &call ("_aesni_encrypt6"); 459 460 &mov ($key,$key_); # restore $key 461 &mov ($rounds,$rounds_); # restore $rounds 462 &sub ($len,0x60); 463 &jnc (&label("ecb_enc_loop6")); 464 465 &movups (&QWP(0,$out),$inout0); 466 &movups (&QWP(0x10,$out),$inout1); 467 &movups (&QWP(0x20,$out),$inout2); 468 &movups (&QWP(0x30,$out),$inout3); 469 &movups (&QWP(0x40,$out),$inout4); 470 &movups (&QWP(0x50,$out),$inout5); 471 &lea ($out,&DWP(0x60,$out)); 472 &add ($len,0x60); 473 &jz (&label("ecb_ret")); 474 475&set_label("ecb_enc_tail"); 476 &movups ($inout0,&QWP(0,$inp)); 477 &cmp ($len,0x20); 478 &jb (&label("ecb_enc_one")); 479 &movups ($inout1,&QWP(0x10,$inp)); 480 &je (&label("ecb_enc_two")); 481 &movups ($inout2,&QWP(0x20,$inp)); 482 &cmp ($len,0x40); 483 &jb (&label("ecb_enc_three")); 484 &movups ($inout3,&QWP(0x30,$inp)); 485 &je (&label("ecb_enc_four")); 486 &movups ($inout4,&QWP(0x40,$inp)); 487 &xorps ($inout5,$inout5); 488 &call ("_aesni_encrypt6"); 489 &movups (&QWP(0,$out),$inout0); 490 &movups (&QWP(0x10,$out),$inout1); 491 &movups (&QWP(0x20,$out),$inout2); 492 &movups (&QWP(0x30,$out),$inout3); 493 &movups (&QWP(0x40,$out),$inout4); 494 jmp (&label("ecb_ret")); 495 496&set_label("ecb_enc_one",16); 497 if ($inline) 498 { &aesni_inline_generate1("enc"); } 499 else 500 { &call ("_aesni_encrypt1"); } 501 &movups (&QWP(0,$out),$inout0); 502 &jmp (&label("ecb_ret")); 503 504&set_label("ecb_enc_two",16); 505 &call ("_aesni_encrypt2"); 506 &movups (&QWP(0,$out),$inout0); 507 &movups (&QWP(0x10,$out),$inout1); 508 &jmp (&label("ecb_ret")); 509 510&set_label("ecb_enc_three",16); 511 &call ("_aesni_encrypt3"); 512 &movups (&QWP(0,$out),$inout0); 513 &movups (&QWP(0x10,$out),$inout1); 514 &movups (&QWP(0x20,$out),$inout2); 515 &jmp (&label("ecb_ret")); 516 517&set_label("ecb_enc_four",16); 518 &call ("_aesni_encrypt4"); 519 &movups (&QWP(0,$out),$inout0); 520 &movups (&QWP(0x10,$out),$inout1); 521 &movups (&QWP(0x20,$out),$inout2); 522 &movups (&QWP(0x30,$out),$inout3); 523 &jmp (&label("ecb_ret")); 524###################################################################### 525&set_label("ecb_decrypt",16); 526 &mov ($key_,$key); # backup $key 527 &mov ($rounds_,$rounds); # backup $rounds 528 &cmp ($len,0x60); 529 &jb (&label("ecb_dec_tail")); 530 531 &movdqu ($inout0,&QWP(0,$inp)); 532 &movdqu ($inout1,&QWP(0x10,$inp)); 533 &movdqu ($inout2,&QWP(0x20,$inp)); 534 &movdqu ($inout3,&QWP(0x30,$inp)); 535 &movdqu ($inout4,&QWP(0x40,$inp)); 536 &movdqu ($inout5,&QWP(0x50,$inp)); 537 &lea ($inp,&DWP(0x60,$inp)); 538 &sub ($len,0x60); 539 &jmp (&label("ecb_dec_loop6_enter")); 540 541&set_label("ecb_dec_loop6",16); 542 &movups (&QWP(0,$out),$inout0); 543 &movdqu ($inout0,&QWP(0,$inp)); 544 &movups (&QWP(0x10,$out),$inout1); 545 &movdqu ($inout1,&QWP(0x10,$inp)); 546 &movups (&QWP(0x20,$out),$inout2); 547 &movdqu ($inout2,&QWP(0x20,$inp)); 548 &movups (&QWP(0x30,$out),$inout3); 549 &movdqu ($inout3,&QWP(0x30,$inp)); 550 &movups (&QWP(0x40,$out),$inout4); 551 &movdqu ($inout4,&QWP(0x40,$inp)); 552 &movups (&QWP(0x50,$out),$inout5); 553 &lea ($out,&DWP(0x60,$out)); 554 &movdqu ($inout5,&QWP(0x50,$inp)); 555 &lea ($inp,&DWP(0x60,$inp)); 556&set_label("ecb_dec_loop6_enter"); 557 558 &call ("_aesni_decrypt6"); 559 560 &mov ($key,$key_); # restore $key 561 &mov ($rounds,$rounds_); # restore $rounds 562 &sub ($len,0x60); 563 &jnc (&label("ecb_dec_loop6")); 564 565 &movups (&QWP(0,$out),$inout0); 566 &movups (&QWP(0x10,$out),$inout1); 567 &movups (&QWP(0x20,$out),$inout2); 568 &movups (&QWP(0x30,$out),$inout3); 569 &movups (&QWP(0x40,$out),$inout4); 570 &movups (&QWP(0x50,$out),$inout5); 571 &lea ($out,&DWP(0x60,$out)); 572 &add ($len,0x60); 573 &jz (&label("ecb_ret")); 574 575&set_label("ecb_dec_tail"); 576 &movups ($inout0,&QWP(0,$inp)); 577 &cmp ($len,0x20); 578 &jb (&label("ecb_dec_one")); 579 &movups ($inout1,&QWP(0x10,$inp)); 580 &je (&label("ecb_dec_two")); 581 &movups ($inout2,&QWP(0x20,$inp)); 582 &cmp ($len,0x40); 583 &jb (&label("ecb_dec_three")); 584 &movups ($inout3,&QWP(0x30,$inp)); 585 &je (&label("ecb_dec_four")); 586 &movups ($inout4,&QWP(0x40,$inp)); 587 &xorps ($inout5,$inout5); 588 &call ("_aesni_decrypt6"); 589 &movups (&QWP(0,$out),$inout0); 590 &movups (&QWP(0x10,$out),$inout1); 591 &movups (&QWP(0x20,$out),$inout2); 592 &movups (&QWP(0x30,$out),$inout3); 593 &movups (&QWP(0x40,$out),$inout4); 594 &jmp (&label("ecb_ret")); 595 596&set_label("ecb_dec_one",16); 597 if ($inline) 598 { &aesni_inline_generate1("dec"); } 599 else 600 { &call ("_aesni_decrypt1"); } 601 &movups (&QWP(0,$out),$inout0); 602 &jmp (&label("ecb_ret")); 603 604&set_label("ecb_dec_two",16); 605 &call ("_aesni_decrypt2"); 606 &movups (&QWP(0,$out),$inout0); 607 &movups (&QWP(0x10,$out),$inout1); 608 &jmp (&label("ecb_ret")); 609 610&set_label("ecb_dec_three",16); 611 &call ("_aesni_decrypt3"); 612 &movups (&QWP(0,$out),$inout0); 613 &movups (&QWP(0x10,$out),$inout1); 614 &movups (&QWP(0x20,$out),$inout2); 615 &jmp (&label("ecb_ret")); 616 617&set_label("ecb_dec_four",16); 618 &call ("_aesni_decrypt4"); 619 &movups (&QWP(0,$out),$inout0); 620 &movups (&QWP(0x10,$out),$inout1); 621 &movups (&QWP(0x20,$out),$inout2); 622 &movups (&QWP(0x30,$out),$inout3); 623 624&set_label("ecb_ret"); 625 &pxor ("xmm0","xmm0"); # clear register bank 626 &pxor ("xmm1","xmm1"); 627 &pxor ("xmm2","xmm2"); 628 &pxor ("xmm3","xmm3"); 629 &pxor ("xmm4","xmm4"); 630 &pxor ("xmm5","xmm5"); 631 &pxor ("xmm6","xmm6"); 632 &pxor ("xmm7","xmm7"); 633&function_end("aesni_ecb_encrypt"); 634 635###################################################################### 636# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out, 637# size_t blocks, const AES_KEY *key, 638# const char *ivec,char *cmac); 639# 640# Handles only complete blocks, operates on 64-bit counter and 641# does not update *ivec! Nor does it finalize CMAC value 642# (see engine/eng_aesni.c for details) 643# 644{ my $cmac=$inout1; 645&function_begin("aesni_ccm64_encrypt_blocks"); 646 &mov ($inp,&wparam(0)); 647 &mov ($out,&wparam(1)); 648 &mov ($len,&wparam(2)); 649 &mov ($key,&wparam(3)); 650 &mov ($rounds_,&wparam(4)); 651 &mov ($rounds,&wparam(5)); 652 &mov ($key_,"esp"); 653 &sub ("esp",60); 654 &and ("esp",-16); # align stack 655 &mov (&DWP(48,"esp"),$key_); 656 657 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec 658 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac 659 &mov ($rounds,&DWP(240,$key)); 660 661 # compose byte-swap control mask for pshufb on stack 662 &mov (&DWP(0,"esp"),0x0c0d0e0f); 663 &mov (&DWP(4,"esp"),0x08090a0b); 664 &mov (&DWP(8,"esp"),0x04050607); 665 &mov (&DWP(12,"esp"),0x00010203); 666 667 # compose counter increment vector on stack 668 &mov ($rounds_,1); 669 &xor ($key_,$key_); 670 &mov (&DWP(16,"esp"),$rounds_); 671 &mov (&DWP(20,"esp"),$key_); 672 &mov (&DWP(24,"esp"),$key_); 673 &mov (&DWP(28,"esp"),$key_); 674 675 &shl ($rounds,4); 676 &mov ($rounds_,16); 677 &lea ($key_,&DWP(0,$key)); 678 &movdqa ($inout3,&QWP(0,"esp")); 679 &movdqa ($inout0,$ivec); 680 &lea ($key,&DWP(32,$key,$rounds)); 681 &sub ($rounds_,$rounds); 682 &pshufb ($ivec,$inout3); 683 684&set_label("ccm64_enc_outer"); 685 &$movekey ($rndkey0,&QWP(0,$key_)); 686 &mov ($rounds,$rounds_); 687 &movups ($in0,&QWP(0,$inp)); 688 689 &xorps ($inout0,$rndkey0); 690 &$movekey ($rndkey1,&QWP(16,$key_)); 691 &xorps ($rndkey0,$in0); 692 &xorps ($cmac,$rndkey0); # cmac^=inp 693 &$movekey ($rndkey0,&QWP(32,$key_)); 694 695&set_label("ccm64_enc2_loop"); 696 &aesenc ($inout0,$rndkey1); 697 &aesenc ($cmac,$rndkey1); 698 &$movekey ($rndkey1,&QWP(0,$key,$rounds)); 699 &add ($rounds,32); 700 &aesenc ($inout0,$rndkey0); 701 &aesenc ($cmac,$rndkey0); 702 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 703 &jnz (&label("ccm64_enc2_loop")); 704 &aesenc ($inout0,$rndkey1); 705 &aesenc ($cmac,$rndkey1); 706 &paddq ($ivec,&QWP(16,"esp")); 707 &dec ($len); 708 &aesenclast ($inout0,$rndkey0); 709 &aesenclast ($cmac,$rndkey0); 710 711 &lea ($inp,&DWP(16,$inp)); 712 &xorps ($in0,$inout0); # inp^=E(ivec) 713 &movdqa ($inout0,$ivec); 714 &movups (&QWP(0,$out),$in0); # save output 715 &pshufb ($inout0,$inout3); 716 &lea ($out,&DWP(16,$out)); 717 &jnz (&label("ccm64_enc_outer")); 718 719 &mov ("esp",&DWP(48,"esp")); 720 &mov ($out,&wparam(5)); 721 &movups (&QWP(0,$out),$cmac); 722 723 &pxor ("xmm0","xmm0"); # clear register bank 724 &pxor ("xmm1","xmm1"); 725 &pxor ("xmm2","xmm2"); 726 &pxor ("xmm3","xmm3"); 727 &pxor ("xmm4","xmm4"); 728 &pxor ("xmm5","xmm5"); 729 &pxor ("xmm6","xmm6"); 730 &pxor ("xmm7","xmm7"); 731&function_end("aesni_ccm64_encrypt_blocks"); 732 733&function_begin("aesni_ccm64_decrypt_blocks"); 734 &mov ($inp,&wparam(0)); 735 &mov ($out,&wparam(1)); 736 &mov ($len,&wparam(2)); 737 &mov ($key,&wparam(3)); 738 &mov ($rounds_,&wparam(4)); 739 &mov ($rounds,&wparam(5)); 740 &mov ($key_,"esp"); 741 &sub ("esp",60); 742 &and ("esp",-16); # align stack 743 &mov (&DWP(48,"esp"),$key_); 744 745 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec 746 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac 747 &mov ($rounds,&DWP(240,$key)); 748 749 # compose byte-swap control mask for pshufb on stack 750 &mov (&DWP(0,"esp"),0x0c0d0e0f); 751 &mov (&DWP(4,"esp"),0x08090a0b); 752 &mov (&DWP(8,"esp"),0x04050607); 753 &mov (&DWP(12,"esp"),0x00010203); 754 755 # compose counter increment vector on stack 756 &mov ($rounds_,1); 757 &xor ($key_,$key_); 758 &mov (&DWP(16,"esp"),$rounds_); 759 &mov (&DWP(20,"esp"),$key_); 760 &mov (&DWP(24,"esp"),$key_); 761 &mov (&DWP(28,"esp"),$key_); 762 763 &movdqa ($inout3,&QWP(0,"esp")); # bswap mask 764 &movdqa ($inout0,$ivec); 765 766 &mov ($key_,$key); 767 &mov ($rounds_,$rounds); 768 769 &pshufb ($ivec,$inout3); 770 if ($inline) 771 { &aesni_inline_generate1("enc"); } 772 else 773 { &call ("_aesni_encrypt1"); } 774 &shl ($rounds_,4); 775 &mov ($rounds,16); 776 &movups ($in0,&QWP(0,$inp)); # load inp 777 &paddq ($ivec,&QWP(16,"esp")); 778 &lea ($inp,&QWP(16,$inp)); 779 &sub ($rounds,$rounds_); 780 &lea ($key,&DWP(32,$key_,$rounds_)); 781 &mov ($rounds_,$rounds); 782 &jmp (&label("ccm64_dec_outer")); 783 784&set_label("ccm64_dec_outer",16); 785 &xorps ($in0,$inout0); # inp ^= E(ivec) 786 &movdqa ($inout0,$ivec); 787 &movups (&QWP(0,$out),$in0); # save output 788 &lea ($out,&DWP(16,$out)); 789 &pshufb ($inout0,$inout3); 790 791 &sub ($len,1); 792 &jz (&label("ccm64_dec_break")); 793 794 &$movekey ($rndkey0,&QWP(0,$key_)); 795 &mov ($rounds,$rounds_); 796 &$movekey ($rndkey1,&QWP(16,$key_)); 797 &xorps ($in0,$rndkey0); 798 &xorps ($inout0,$rndkey0); 799 &xorps ($cmac,$in0); # cmac^=out 800 &$movekey ($rndkey0,&QWP(32,$key_)); 801 802&set_label("ccm64_dec2_loop"); 803 &aesenc ($inout0,$rndkey1); 804 &aesenc ($cmac,$rndkey1); 805 &$movekey ($rndkey1,&QWP(0,$key,$rounds)); 806 &add ($rounds,32); 807 &aesenc ($inout0,$rndkey0); 808 &aesenc ($cmac,$rndkey0); 809 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 810 &jnz (&label("ccm64_dec2_loop")); 811 &movups ($in0,&QWP(0,$inp)); # load inp 812 &paddq ($ivec,&QWP(16,"esp")); 813 &aesenc ($inout0,$rndkey1); 814 &aesenc ($cmac,$rndkey1); 815 &aesenclast ($inout0,$rndkey0); 816 &aesenclast ($cmac,$rndkey0); 817 &lea ($inp,&QWP(16,$inp)); 818 &jmp (&label("ccm64_dec_outer")); 819 820&set_label("ccm64_dec_break",16); 821 &mov ($rounds,&DWP(240,$key_)); 822 &mov ($key,$key_); 823 if ($inline) 824 { &aesni_inline_generate1("enc",$cmac,$in0); } 825 else 826 { &call ("_aesni_encrypt1",$cmac); } 827 828 &mov ("esp",&DWP(48,"esp")); 829 &mov ($out,&wparam(5)); 830 &movups (&QWP(0,$out),$cmac); 831 832 &pxor ("xmm0","xmm0"); # clear register bank 833 &pxor ("xmm1","xmm1"); 834 &pxor ("xmm2","xmm2"); 835 &pxor ("xmm3","xmm3"); 836 &pxor ("xmm4","xmm4"); 837 &pxor ("xmm5","xmm5"); 838 &pxor ("xmm6","xmm6"); 839 &pxor ("xmm7","xmm7"); 840&function_end("aesni_ccm64_decrypt_blocks"); 841} 842 843###################################################################### 844# void aesni_ctr32_encrypt_blocks (const void *in, void *out, 845# size_t blocks, const AES_KEY *key, 846# const char *ivec); 847# 848# Handles only complete blocks, operates on 32-bit counter and 849# does not update *ivec! (see crypto/modes/ctr128.c for details) 850# 851# stack layout: 852# 0 pshufb mask 853# 16 vector addend: 0,6,6,6 854# 32 counter-less ivec 855# 48 1st triplet of counter vector 856# 64 2nd triplet of counter vector 857# 80 saved %esp 858 859&function_begin("aesni_ctr32_encrypt_blocks"); 860 &mov ($inp,&wparam(0)); 861 &mov ($out,&wparam(1)); 862 &mov ($len,&wparam(2)); 863 &mov ($key,&wparam(3)); 864 &mov ($rounds_,&wparam(4)); 865 &mov ($key_,"esp"); 866 &sub ("esp",88); 867 &and ("esp",-16); # align stack 868 &mov (&DWP(80,"esp"),$key_); 869 870 &cmp ($len,1); 871 &je (&label("ctr32_one_shortcut")); 872 873 &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec 874 875 # compose byte-swap control mask for pshufb on stack 876 &mov (&DWP(0,"esp"),0x0c0d0e0f); 877 &mov (&DWP(4,"esp"),0x08090a0b); 878 &mov (&DWP(8,"esp"),0x04050607); 879 &mov (&DWP(12,"esp"),0x00010203); 880 881 # compose counter increment vector on stack 882 &mov ($rounds,6); 883 &xor ($key_,$key_); 884 &mov (&DWP(16,"esp"),$rounds); 885 &mov (&DWP(20,"esp"),$rounds); 886 &mov (&DWP(24,"esp"),$rounds); 887 &mov (&DWP(28,"esp"),$key_); 888 889 &pextrd ($rounds_,$inout5,3); # pull 32-bit counter 890 &pinsrd ($inout5,$key_,3); # wipe 32-bit counter 891 892 &mov ($rounds,&DWP(240,$key)); # key->rounds 893 894 # compose 2 vectors of 3x32-bit counters 895 &bswap ($rounds_); 896 &pxor ($rndkey0,$rndkey0); 897 &pxor ($rndkey1,$rndkey1); 898 &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask 899 &pinsrd ($rndkey0,$rounds_,0); 900 &lea ($key_,&DWP(3,$rounds_)); 901 &pinsrd ($rndkey1,$key_,0); 902 &inc ($rounds_); 903 &pinsrd ($rndkey0,$rounds_,1); 904 &inc ($key_); 905 &pinsrd ($rndkey1,$key_,1); 906 &inc ($rounds_); 907 &pinsrd ($rndkey0,$rounds_,2); 908 &inc ($key_); 909 &pinsrd ($rndkey1,$key_,2); 910 &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet 911 &pshufb ($rndkey0,$inout0); # byte swap 912 &movdqu ($inout4,&QWP(0,$key)); # key[0] 913 &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet 914 &pshufb ($rndkey1,$inout0); # byte swap 915 916 &pshufd ($inout0,$rndkey0,3<<6); # place counter to upper dword 917 &pshufd ($inout1,$rndkey0,2<<6); 918 &cmp ($len,6); 919 &jb (&label("ctr32_tail")); 920 &pxor ($inout5,$inout4); # counter-less ivec^key[0] 921 &shl ($rounds,4); 922 &mov ($rounds_,16); 923 &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec^key[0] 924 &mov ($key_,$key); # backup $key 925 &sub ($rounds_,$rounds); # backup twisted $rounds 926 &lea ($key,&DWP(32,$key,$rounds)); 927 &sub ($len,6); 928 &jmp (&label("ctr32_loop6")); 929 930&set_label("ctr32_loop6",16); 931 # inlining _aesni_encrypt6's prologue gives ~6% improvement... 932 &pshufd ($inout2,$rndkey0,1<<6); 933 &movdqa ($rndkey0,&QWP(32,"esp")); # pull counter-less ivec 934 &pshufd ($inout3,$rndkey1,3<<6); 935 &pxor ($inout0,$rndkey0); # merge counter-less ivec 936 &pshufd ($inout4,$rndkey1,2<<6); 937 &pxor ($inout1,$rndkey0); 938 &pshufd ($inout5,$rndkey1,1<<6); 939 &$movekey ($rndkey1,&QWP(16,$key_)); 940 &pxor ($inout2,$rndkey0); 941 &pxor ($inout3,$rndkey0); 942 &aesenc ($inout0,$rndkey1); 943 &pxor ($inout4,$rndkey0); 944 &pxor ($inout5,$rndkey0); 945 &aesenc ($inout1,$rndkey1); 946 &$movekey ($rndkey0,&QWP(32,$key_)); 947 &mov ($rounds,$rounds_); 948 &aesenc ($inout2,$rndkey1); 949 &aesenc ($inout3,$rndkey1); 950 &aesenc ($inout4,$rndkey1); 951 &aesenc ($inout5,$rndkey1); 952 953 &call (&label("_aesni_encrypt6_enter")); 954 955 &movups ($rndkey1,&QWP(0,$inp)); 956 &movups ($rndkey0,&QWP(0x10,$inp)); 957 &xorps ($inout0,$rndkey1); 958 &movups ($rndkey1,&QWP(0x20,$inp)); 959 &xorps ($inout1,$rndkey0); 960 &movups (&QWP(0,$out),$inout0); 961 &movdqa ($rndkey0,&QWP(16,"esp")); # load increment 962 &xorps ($inout2,$rndkey1); 963 &movdqa ($rndkey1,&QWP(64,"esp")); # load 2nd triplet 964 &movups (&QWP(0x10,$out),$inout1); 965 &movups (&QWP(0x20,$out),$inout2); 966 967 &paddd ($rndkey1,$rndkey0); # 2nd triplet increment 968 &paddd ($rndkey0,&QWP(48,"esp")); # 1st triplet increment 969 &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask 970 971 &movups ($inout1,&QWP(0x30,$inp)); 972 &movups ($inout2,&QWP(0x40,$inp)); 973 &xorps ($inout3,$inout1); 974 &movups ($inout1,&QWP(0x50,$inp)); 975 &lea ($inp,&DWP(0x60,$inp)); 976 &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet 977 &pshufb ($rndkey0,$inout0); # byte swap 978 &xorps ($inout4,$inout2); 979 &movups (&QWP(0x30,$out),$inout3); 980 &xorps ($inout5,$inout1); 981 &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet 982 &pshufb ($rndkey1,$inout0); # byte swap 983 &movups (&QWP(0x40,$out),$inout4); 984 &pshufd ($inout0,$rndkey0,3<<6); 985 &movups (&QWP(0x50,$out),$inout5); 986 &lea ($out,&DWP(0x60,$out)); 987 988 &pshufd ($inout1,$rndkey0,2<<6); 989 &sub ($len,6); 990 &jnc (&label("ctr32_loop6")); 991 992 &add ($len,6); 993 &jz (&label("ctr32_ret")); 994 &movdqu ($inout5,&QWP(0,$key_)); 995 &mov ($key,$key_); 996 &pxor ($inout5,&QWP(32,"esp")); # restore count-less ivec 997 &mov ($rounds,&DWP(240,$key_)); # restore $rounds 998 999&set_label("ctr32_tail"); 1000 &por ($inout0,$inout5); 1001 &cmp ($len,2); 1002 &jb (&label("ctr32_one")); 1003 1004 &pshufd ($inout2,$rndkey0,1<<6); 1005 &por ($inout1,$inout5); 1006 &je (&label("ctr32_two")); 1007 1008 &pshufd ($inout3,$rndkey1,3<<6); 1009 &por ($inout2,$inout5); 1010 &cmp ($len,4); 1011 &jb (&label("ctr32_three")); 1012 1013 &pshufd ($inout4,$rndkey1,2<<6); 1014 &por ($inout3,$inout5); 1015 &je (&label("ctr32_four")); 1016 1017 &por ($inout4,$inout5); 1018 &call ("_aesni_encrypt6"); 1019 &movups ($rndkey1,&QWP(0,$inp)); 1020 &movups ($rndkey0,&QWP(0x10,$inp)); 1021 &xorps ($inout0,$rndkey1); 1022 &movups ($rndkey1,&QWP(0x20,$inp)); 1023 &xorps ($inout1,$rndkey0); 1024 &movups ($rndkey0,&QWP(0x30,$inp)); 1025 &xorps ($inout2,$rndkey1); 1026 &movups ($rndkey1,&QWP(0x40,$inp)); 1027 &xorps ($inout3,$rndkey0); 1028 &movups (&QWP(0,$out),$inout0); 1029 &xorps ($inout4,$rndkey1); 1030 &movups (&QWP(0x10,$out),$inout1); 1031 &movups (&QWP(0x20,$out),$inout2); 1032 &movups (&QWP(0x30,$out),$inout3); 1033 &movups (&QWP(0x40,$out),$inout4); 1034 &jmp (&label("ctr32_ret")); 1035 1036&set_label("ctr32_one_shortcut",16); 1037 &movups ($inout0,&QWP(0,$rounds_)); # load ivec 1038 &mov ($rounds,&DWP(240,$key)); 1039 1040&set_label("ctr32_one"); 1041 if ($inline) 1042 { &aesni_inline_generate1("enc"); } 1043 else 1044 { &call ("_aesni_encrypt1"); } 1045 &movups ($in0,&QWP(0,$inp)); 1046 &xorps ($in0,$inout0); 1047 &movups (&QWP(0,$out),$in0); 1048 &jmp (&label("ctr32_ret")); 1049 1050&set_label("ctr32_two",16); 1051 &call ("_aesni_encrypt2"); 1052 &movups ($inout3,&QWP(0,$inp)); 1053 &movups ($inout4,&QWP(0x10,$inp)); 1054 &xorps ($inout0,$inout3); 1055 &xorps ($inout1,$inout4); 1056 &movups (&QWP(0,$out),$inout0); 1057 &movups (&QWP(0x10,$out),$inout1); 1058 &jmp (&label("ctr32_ret")); 1059 1060&set_label("ctr32_three",16); 1061 &call ("_aesni_encrypt3"); 1062 &movups ($inout3,&QWP(0,$inp)); 1063 &movups ($inout4,&QWP(0x10,$inp)); 1064 &xorps ($inout0,$inout3); 1065 &movups ($inout5,&QWP(0x20,$inp)); 1066 &xorps ($inout1,$inout4); 1067 &movups (&QWP(0,$out),$inout0); 1068 &xorps ($inout2,$inout5); 1069 &movups (&QWP(0x10,$out),$inout1); 1070 &movups (&QWP(0x20,$out),$inout2); 1071 &jmp (&label("ctr32_ret")); 1072 1073&set_label("ctr32_four",16); 1074 &call ("_aesni_encrypt4"); 1075 &movups ($inout4,&QWP(0,$inp)); 1076 &movups ($inout5,&QWP(0x10,$inp)); 1077 &movups ($rndkey1,&QWP(0x20,$inp)); 1078 &xorps ($inout0,$inout4); 1079 &movups ($rndkey0,&QWP(0x30,$inp)); 1080 &xorps ($inout1,$inout5); 1081 &movups (&QWP(0,$out),$inout0); 1082 &xorps ($inout2,$rndkey1); 1083 &movups (&QWP(0x10,$out),$inout1); 1084 &xorps ($inout3,$rndkey0); 1085 &movups (&QWP(0x20,$out),$inout2); 1086 &movups (&QWP(0x30,$out),$inout3); 1087 1088&set_label("ctr32_ret"); 1089 &pxor ("xmm0","xmm0"); # clear register bank 1090 &pxor ("xmm1","xmm1"); 1091 &pxor ("xmm2","xmm2"); 1092 &pxor ("xmm3","xmm3"); 1093 &pxor ("xmm4","xmm4"); 1094 &movdqa (&QWP(32,"esp"),"xmm0"); # clear stack 1095 &pxor ("xmm5","xmm5"); 1096 &movdqa (&QWP(48,"esp"),"xmm0"); 1097 &pxor ("xmm6","xmm6"); 1098 &movdqa (&QWP(64,"esp"),"xmm0"); 1099 &pxor ("xmm7","xmm7"); 1100 &mov ("esp",&DWP(80,"esp")); 1101&function_end("aesni_ctr32_encrypt_blocks"); 1102 1103###################################################################### 1104# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len, 1105# const AES_KEY *key1, const AES_KEY *key2 1106# const unsigned char iv[16]); 1107# 1108{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1); 1109 1110&function_begin("aesni_xts_encrypt"); 1111 &mov ($key,&wparam(4)); # key2 1112 &mov ($inp,&wparam(5)); # clear-text tweak 1113 1114 &mov ($rounds,&DWP(240,$key)); # key2->rounds 1115 &movups ($inout0,&QWP(0,$inp)); 1116 if ($inline) 1117 { &aesni_inline_generate1("enc"); } 1118 else 1119 { &call ("_aesni_encrypt1"); } 1120 1121 &mov ($inp,&wparam(0)); 1122 &mov ($out,&wparam(1)); 1123 &mov ($len,&wparam(2)); 1124 &mov ($key,&wparam(3)); # key1 1125 1126 &mov ($key_,"esp"); 1127 &sub ("esp",16*7+8); 1128 &mov ($rounds,&DWP(240,$key)); # key1->rounds 1129 &and ("esp",-16); # align stack 1130 1131 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant 1132 &mov (&DWP(16*6+4,"esp"),0); 1133 &mov (&DWP(16*6+8,"esp"),1); 1134 &mov (&DWP(16*6+12,"esp"),0); 1135 &mov (&DWP(16*7+0,"esp"),$len); # save original $len 1136 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp 1137 1138 &movdqa ($tweak,$inout0); 1139 &pxor ($twtmp,$twtmp); 1140 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 1141 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1142 1143 &and ($len,-16); 1144 &mov ($key_,$key); # backup $key 1145 &mov ($rounds_,$rounds); # backup $rounds 1146 &sub ($len,16*6); 1147 &jc (&label("xts_enc_short")); 1148 1149 &shl ($rounds,4); 1150 &mov ($rounds_,16); 1151 &sub ($rounds_,$rounds); 1152 &lea ($key,&DWP(32,$key,$rounds)); 1153 &jmp (&label("xts_enc_loop6")); 1154 1155&set_label("xts_enc_loop6",16); 1156 for ($i=0;$i<4;$i++) { 1157 &pshufd ($twres,$twtmp,0x13); 1158 &pxor ($twtmp,$twtmp); 1159 &movdqa (&QWP(16*$i,"esp"),$tweak); 1160 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1161 &pand ($twres,$twmask); # isolate carry and residue 1162 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1163 &pxor ($tweak,$twres); 1164 } 1165 &pshufd ($inout5,$twtmp,0x13); 1166 &movdqa (&QWP(16*$i++,"esp"),$tweak); 1167 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1168 &$movekey ($rndkey0,&QWP(0,$key_)); 1169 &pand ($inout5,$twmask); # isolate carry and residue 1170 &movups ($inout0,&QWP(0,$inp)); # load input 1171 &pxor ($inout5,$tweak); 1172 1173 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] 1174 &mov ($rounds,$rounds_); # restore $rounds 1175 &movdqu ($inout1,&QWP(16*1,$inp)); 1176 &xorps ($inout0,$rndkey0); # input^=rndkey[0] 1177 &movdqu ($inout2,&QWP(16*2,$inp)); 1178 &pxor ($inout1,$rndkey0); 1179 &movdqu ($inout3,&QWP(16*3,$inp)); 1180 &pxor ($inout2,$rndkey0); 1181 &movdqu ($inout4,&QWP(16*4,$inp)); 1182 &pxor ($inout3,$rndkey0); 1183 &movdqu ($rndkey1,&QWP(16*5,$inp)); 1184 &pxor ($inout4,$rndkey0); 1185 &lea ($inp,&DWP(16*6,$inp)); 1186 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1187 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak 1188 &pxor ($inout5,$rndkey1); 1189 1190 &$movekey ($rndkey1,&QWP(16,$key_)); 1191 &pxor ($inout1,&QWP(16*1,"esp")); 1192 &pxor ($inout2,&QWP(16*2,"esp")); 1193 &aesenc ($inout0,$rndkey1); 1194 &pxor ($inout3,&QWP(16*3,"esp")); 1195 &pxor ($inout4,&QWP(16*4,"esp")); 1196 &aesenc ($inout1,$rndkey1); 1197 &pxor ($inout5,$rndkey0); 1198 &$movekey ($rndkey0,&QWP(32,$key_)); 1199 &aesenc ($inout2,$rndkey1); 1200 &aesenc ($inout3,$rndkey1); 1201 &aesenc ($inout4,$rndkey1); 1202 &aesenc ($inout5,$rndkey1); 1203 &call (&label("_aesni_encrypt6_enter")); 1204 1205 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak 1206 &pxor ($twtmp,$twtmp); 1207 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1208 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1209 &xorps ($inout1,&QWP(16*1,"esp")); 1210 &movups (&QWP(16*0,$out),$inout0); # write output 1211 &xorps ($inout2,&QWP(16*2,"esp")); 1212 &movups (&QWP(16*1,$out),$inout1); 1213 &xorps ($inout3,&QWP(16*3,"esp")); 1214 &movups (&QWP(16*2,$out),$inout2); 1215 &xorps ($inout4,&QWP(16*4,"esp")); 1216 &movups (&QWP(16*3,$out),$inout3); 1217 &xorps ($inout5,$tweak); 1218 &movups (&QWP(16*4,$out),$inout4); 1219 &pshufd ($twres,$twtmp,0x13); 1220 &movups (&QWP(16*5,$out),$inout5); 1221 &lea ($out,&DWP(16*6,$out)); 1222 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 1223 1224 &pxor ($twtmp,$twtmp); 1225 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1226 &pand ($twres,$twmask); # isolate carry and residue 1227 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1228 &pxor ($tweak,$twres); 1229 1230 &sub ($len,16*6); 1231 &jnc (&label("xts_enc_loop6")); 1232 1233 &mov ($rounds,&DWP(240,$key_)); # restore $rounds 1234 &mov ($key,$key_); # restore $key 1235 &mov ($rounds_,$rounds); 1236 1237&set_label("xts_enc_short"); 1238 &add ($len,16*6); 1239 &jz (&label("xts_enc_done6x")); 1240 1241 &movdqa ($inout3,$tweak); # put aside previous tweak 1242 &cmp ($len,0x20); 1243 &jb (&label("xts_enc_one")); 1244 1245 &pshufd ($twres,$twtmp,0x13); 1246 &pxor ($twtmp,$twtmp); 1247 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1248 &pand ($twres,$twmask); # isolate carry and residue 1249 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1250 &pxor ($tweak,$twres); 1251 &je (&label("xts_enc_two")); 1252 1253 &pshufd ($twres,$twtmp,0x13); 1254 &pxor ($twtmp,$twtmp); 1255 &movdqa ($inout4,$tweak); # put aside previous tweak 1256 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1257 &pand ($twres,$twmask); # isolate carry and residue 1258 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1259 &pxor ($tweak,$twres); 1260 &cmp ($len,0x40); 1261 &jb (&label("xts_enc_three")); 1262 1263 &pshufd ($twres,$twtmp,0x13); 1264 &pxor ($twtmp,$twtmp); 1265 &movdqa ($inout5,$tweak); # put aside previous tweak 1266 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1267 &pand ($twres,$twmask); # isolate carry and residue 1268 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1269 &pxor ($tweak,$twres); 1270 &movdqa (&QWP(16*0,"esp"),$inout3); 1271 &movdqa (&QWP(16*1,"esp"),$inout4); 1272 &je (&label("xts_enc_four")); 1273 1274 &movdqa (&QWP(16*2,"esp"),$inout5); 1275 &pshufd ($inout5,$twtmp,0x13); 1276 &movdqa (&QWP(16*3,"esp"),$tweak); 1277 &paddq ($tweak,$tweak); # &psllq($inout0,1); 1278 &pand ($inout5,$twmask); # isolate carry and residue 1279 &pxor ($inout5,$tweak); 1280 1281 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 1282 &movdqu ($inout1,&QWP(16*1,$inp)); 1283 &movdqu ($inout2,&QWP(16*2,$inp)); 1284 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1285 &movdqu ($inout3,&QWP(16*3,$inp)); 1286 &pxor ($inout1,&QWP(16*1,"esp")); 1287 &movdqu ($inout4,&QWP(16*4,$inp)); 1288 &pxor ($inout2,&QWP(16*2,"esp")); 1289 &lea ($inp,&DWP(16*5,$inp)); 1290 &pxor ($inout3,&QWP(16*3,"esp")); 1291 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak 1292 &pxor ($inout4,$inout5); 1293 1294 &call ("_aesni_encrypt6"); 1295 1296 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak 1297 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1298 &xorps ($inout1,&QWP(16*1,"esp")); 1299 &xorps ($inout2,&QWP(16*2,"esp")); 1300 &movups (&QWP(16*0,$out),$inout0); # write output 1301 &xorps ($inout3,&QWP(16*3,"esp")); 1302 &movups (&QWP(16*1,$out),$inout1); 1303 &xorps ($inout4,$tweak); 1304 &movups (&QWP(16*2,$out),$inout2); 1305 &movups (&QWP(16*3,$out),$inout3); 1306 &movups (&QWP(16*4,$out),$inout4); 1307 &lea ($out,&DWP(16*5,$out)); 1308 &jmp (&label("xts_enc_done")); 1309 1310&set_label("xts_enc_one",16); 1311 &movups ($inout0,&QWP(16*0,$inp)); # load input 1312 &lea ($inp,&DWP(16*1,$inp)); 1313 &xorps ($inout0,$inout3); # input^=tweak 1314 if ($inline) 1315 { &aesni_inline_generate1("enc"); } 1316 else 1317 { &call ("_aesni_encrypt1"); } 1318 &xorps ($inout0,$inout3); # output^=tweak 1319 &movups (&QWP(16*0,$out),$inout0); # write output 1320 &lea ($out,&DWP(16*1,$out)); 1321 1322 &movdqa ($tweak,$inout3); # last tweak 1323 &jmp (&label("xts_enc_done")); 1324 1325&set_label("xts_enc_two",16); 1326 &movaps ($inout4,$tweak); # put aside last tweak 1327 1328 &movups ($inout0,&QWP(16*0,$inp)); # load input 1329 &movups ($inout1,&QWP(16*1,$inp)); 1330 &lea ($inp,&DWP(16*2,$inp)); 1331 &xorps ($inout0,$inout3); # input^=tweak 1332 &xorps ($inout1,$inout4); 1333 1334 &call ("_aesni_encrypt2"); 1335 1336 &xorps ($inout0,$inout3); # output^=tweak 1337 &xorps ($inout1,$inout4); 1338 &movups (&QWP(16*0,$out),$inout0); # write output 1339 &movups (&QWP(16*1,$out),$inout1); 1340 &lea ($out,&DWP(16*2,$out)); 1341 1342 &movdqa ($tweak,$inout4); # last tweak 1343 &jmp (&label("xts_enc_done")); 1344 1345&set_label("xts_enc_three",16); 1346 &movaps ($inout5,$tweak); # put aside last tweak 1347 &movups ($inout0,&QWP(16*0,$inp)); # load input 1348 &movups ($inout1,&QWP(16*1,$inp)); 1349 &movups ($inout2,&QWP(16*2,$inp)); 1350 &lea ($inp,&DWP(16*3,$inp)); 1351 &xorps ($inout0,$inout3); # input^=tweak 1352 &xorps ($inout1,$inout4); 1353 &xorps ($inout2,$inout5); 1354 1355 &call ("_aesni_encrypt3"); 1356 1357 &xorps ($inout0,$inout3); # output^=tweak 1358 &xorps ($inout1,$inout4); 1359 &xorps ($inout2,$inout5); 1360 &movups (&QWP(16*0,$out),$inout0); # write output 1361 &movups (&QWP(16*1,$out),$inout1); 1362 &movups (&QWP(16*2,$out),$inout2); 1363 &lea ($out,&DWP(16*3,$out)); 1364 1365 &movdqa ($tweak,$inout5); # last tweak 1366 &jmp (&label("xts_enc_done")); 1367 1368&set_label("xts_enc_four",16); 1369 &movaps ($inout4,$tweak); # put aside last tweak 1370 1371 &movups ($inout0,&QWP(16*0,$inp)); # load input 1372 &movups ($inout1,&QWP(16*1,$inp)); 1373 &movups ($inout2,&QWP(16*2,$inp)); 1374 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak 1375 &movups ($inout3,&QWP(16*3,$inp)); 1376 &lea ($inp,&DWP(16*4,$inp)); 1377 &xorps ($inout1,&QWP(16*1,"esp")); 1378 &xorps ($inout2,$inout5); 1379 &xorps ($inout3,$inout4); 1380 1381 &call ("_aesni_encrypt4"); 1382 1383 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1384 &xorps ($inout1,&QWP(16*1,"esp")); 1385 &xorps ($inout2,$inout5); 1386 &movups (&QWP(16*0,$out),$inout0); # write output 1387 &xorps ($inout3,$inout4); 1388 &movups (&QWP(16*1,$out),$inout1); 1389 &movups (&QWP(16*2,$out),$inout2); 1390 &movups (&QWP(16*3,$out),$inout3); 1391 &lea ($out,&DWP(16*4,$out)); 1392 1393 &movdqa ($tweak,$inout4); # last tweak 1394 &jmp (&label("xts_enc_done")); 1395 1396&set_label("xts_enc_done6x",16); # $tweak is pre-calculated 1397 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1398 &and ($len,15); 1399 &jz (&label("xts_enc_ret")); 1400 &movdqa ($inout3,$tweak); 1401 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1402 &jmp (&label("xts_enc_steal")); 1403 1404&set_label("xts_enc_done",16); 1405 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1406 &pxor ($twtmp,$twtmp); 1407 &and ($len,15); 1408 &jz (&label("xts_enc_ret")); 1409 1410 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1411 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1412 &pshufd ($inout3,$twtmp,0x13); 1413 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1414 &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue 1415 &pxor ($inout3,$tweak); 1416 1417&set_label("xts_enc_steal"); 1418 &movz ($rounds,&BP(0,$inp)); 1419 &movz ($key,&BP(-16,$out)); 1420 &lea ($inp,&DWP(1,$inp)); 1421 &mov (&BP(-16,$out),&LB($rounds)); 1422 &mov (&BP(0,$out),&LB($key)); 1423 &lea ($out,&DWP(1,$out)); 1424 &sub ($len,1); 1425 &jnz (&label("xts_enc_steal")); 1426 1427 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out 1428 &mov ($key,$key_); # restore $key 1429 &mov ($rounds,$rounds_); # restore $rounds 1430 1431 &movups ($inout0,&QWP(-16,$out)); # load input 1432 &xorps ($inout0,$inout3); # input^=tweak 1433 if ($inline) 1434 { &aesni_inline_generate1("enc"); } 1435 else 1436 { &call ("_aesni_encrypt1"); } 1437 &xorps ($inout0,$inout3); # output^=tweak 1438 &movups (&QWP(-16,$out),$inout0); # write output 1439 1440&set_label("xts_enc_ret"); 1441 &pxor ("xmm0","xmm0"); # clear register bank 1442 &pxor ("xmm1","xmm1"); 1443 &pxor ("xmm2","xmm2"); 1444 &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack 1445 &pxor ("xmm3","xmm3"); 1446 &movdqa (&QWP(16*1,"esp"),"xmm0"); 1447 &pxor ("xmm4","xmm4"); 1448 &movdqa (&QWP(16*2,"esp"),"xmm0"); 1449 &pxor ("xmm5","xmm5"); 1450 &movdqa (&QWP(16*3,"esp"),"xmm0"); 1451 &pxor ("xmm6","xmm6"); 1452 &movdqa (&QWP(16*4,"esp"),"xmm0"); 1453 &pxor ("xmm7","xmm7"); 1454 &movdqa (&QWP(16*5,"esp"),"xmm0"); 1455 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp 1456&function_end("aesni_xts_encrypt"); 1457 1458&function_begin("aesni_xts_decrypt"); 1459 &mov ($key,&wparam(4)); # key2 1460 &mov ($inp,&wparam(5)); # clear-text tweak 1461 1462 &mov ($rounds,&DWP(240,$key)); # key2->rounds 1463 &movups ($inout0,&QWP(0,$inp)); 1464 if ($inline) 1465 { &aesni_inline_generate1("enc"); } 1466 else 1467 { &call ("_aesni_encrypt1"); } 1468 1469 &mov ($inp,&wparam(0)); 1470 &mov ($out,&wparam(1)); 1471 &mov ($len,&wparam(2)); 1472 &mov ($key,&wparam(3)); # key1 1473 1474 &mov ($key_,"esp"); 1475 &sub ("esp",16*7+8); 1476 &and ("esp",-16); # align stack 1477 1478 &xor ($rounds_,$rounds_); # if(len%16) len-=16; 1479 &test ($len,15); 1480 &setnz (&LB($rounds_)); 1481 &shl ($rounds_,4); 1482 &sub ($len,$rounds_); 1483 1484 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant 1485 &mov (&DWP(16*6+4,"esp"),0); 1486 &mov (&DWP(16*6+8,"esp"),1); 1487 &mov (&DWP(16*6+12,"esp"),0); 1488 &mov (&DWP(16*7+0,"esp"),$len); # save original $len 1489 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp 1490 1491 &mov ($rounds,&DWP(240,$key)); # key1->rounds 1492 &mov ($key_,$key); # backup $key 1493 &mov ($rounds_,$rounds); # backup $rounds 1494 1495 &movdqa ($tweak,$inout0); 1496 &pxor ($twtmp,$twtmp); 1497 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 1498 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1499 1500 &and ($len,-16); 1501 &sub ($len,16*6); 1502 &jc (&label("xts_dec_short")); 1503 1504 &shl ($rounds,4); 1505 &mov ($rounds_,16); 1506 &sub ($rounds_,$rounds); 1507 &lea ($key,&DWP(32,$key,$rounds)); 1508 &jmp (&label("xts_dec_loop6")); 1509 1510&set_label("xts_dec_loop6",16); 1511 for ($i=0;$i<4;$i++) { 1512 &pshufd ($twres,$twtmp,0x13); 1513 &pxor ($twtmp,$twtmp); 1514 &movdqa (&QWP(16*$i,"esp"),$tweak); 1515 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1516 &pand ($twres,$twmask); # isolate carry and residue 1517 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1518 &pxor ($tweak,$twres); 1519 } 1520 &pshufd ($inout5,$twtmp,0x13); 1521 &movdqa (&QWP(16*$i++,"esp"),$tweak); 1522 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1523 &$movekey ($rndkey0,&QWP(0,$key_)); 1524 &pand ($inout5,$twmask); # isolate carry and residue 1525 &movups ($inout0,&QWP(0,$inp)); # load input 1526 &pxor ($inout5,$tweak); 1527 1528 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] 1529 &mov ($rounds,$rounds_); 1530 &movdqu ($inout1,&QWP(16*1,$inp)); 1531 &xorps ($inout0,$rndkey0); # input^=rndkey[0] 1532 &movdqu ($inout2,&QWP(16*2,$inp)); 1533 &pxor ($inout1,$rndkey0); 1534 &movdqu ($inout3,&QWP(16*3,$inp)); 1535 &pxor ($inout2,$rndkey0); 1536 &movdqu ($inout4,&QWP(16*4,$inp)); 1537 &pxor ($inout3,$rndkey0); 1538 &movdqu ($rndkey1,&QWP(16*5,$inp)); 1539 &pxor ($inout4,$rndkey0); 1540 &lea ($inp,&DWP(16*6,$inp)); 1541 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1542 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak 1543 &pxor ($inout5,$rndkey1); 1544 1545 &$movekey ($rndkey1,&QWP(16,$key_)); 1546 &pxor ($inout1,&QWP(16*1,"esp")); 1547 &pxor ($inout2,&QWP(16*2,"esp")); 1548 &aesdec ($inout0,$rndkey1); 1549 &pxor ($inout3,&QWP(16*3,"esp")); 1550 &pxor ($inout4,&QWP(16*4,"esp")); 1551 &aesdec ($inout1,$rndkey1); 1552 &pxor ($inout5,$rndkey0); 1553 &$movekey ($rndkey0,&QWP(32,$key_)); 1554 &aesdec ($inout2,$rndkey1); 1555 &aesdec ($inout3,$rndkey1); 1556 &aesdec ($inout4,$rndkey1); 1557 &aesdec ($inout5,$rndkey1); 1558 &call (&label("_aesni_decrypt6_enter")); 1559 1560 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak 1561 &pxor ($twtmp,$twtmp); 1562 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1563 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1564 &xorps ($inout1,&QWP(16*1,"esp")); 1565 &movups (&QWP(16*0,$out),$inout0); # write output 1566 &xorps ($inout2,&QWP(16*2,"esp")); 1567 &movups (&QWP(16*1,$out),$inout1); 1568 &xorps ($inout3,&QWP(16*3,"esp")); 1569 &movups (&QWP(16*2,$out),$inout2); 1570 &xorps ($inout4,&QWP(16*4,"esp")); 1571 &movups (&QWP(16*3,$out),$inout3); 1572 &xorps ($inout5,$tweak); 1573 &movups (&QWP(16*4,$out),$inout4); 1574 &pshufd ($twres,$twtmp,0x13); 1575 &movups (&QWP(16*5,$out),$inout5); 1576 &lea ($out,&DWP(16*6,$out)); 1577 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 1578 1579 &pxor ($twtmp,$twtmp); 1580 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1581 &pand ($twres,$twmask); # isolate carry and residue 1582 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1583 &pxor ($tweak,$twres); 1584 1585 &sub ($len,16*6); 1586 &jnc (&label("xts_dec_loop6")); 1587 1588 &mov ($rounds,&DWP(240,$key_)); # restore $rounds 1589 &mov ($key,$key_); # restore $key 1590 &mov ($rounds_,$rounds); 1591 1592&set_label("xts_dec_short"); 1593 &add ($len,16*6); 1594 &jz (&label("xts_dec_done6x")); 1595 1596 &movdqa ($inout3,$tweak); # put aside previous tweak 1597 &cmp ($len,0x20); 1598 &jb (&label("xts_dec_one")); 1599 1600 &pshufd ($twres,$twtmp,0x13); 1601 &pxor ($twtmp,$twtmp); 1602 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1603 &pand ($twres,$twmask); # isolate carry and residue 1604 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1605 &pxor ($tweak,$twres); 1606 &je (&label("xts_dec_two")); 1607 1608 &pshufd ($twres,$twtmp,0x13); 1609 &pxor ($twtmp,$twtmp); 1610 &movdqa ($inout4,$tweak); # put aside previous tweak 1611 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1612 &pand ($twres,$twmask); # isolate carry and residue 1613 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1614 &pxor ($tweak,$twres); 1615 &cmp ($len,0x40); 1616 &jb (&label("xts_dec_three")); 1617 1618 &pshufd ($twres,$twtmp,0x13); 1619 &pxor ($twtmp,$twtmp); 1620 &movdqa ($inout5,$tweak); # put aside previous tweak 1621 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1622 &pand ($twres,$twmask); # isolate carry and residue 1623 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1624 &pxor ($tweak,$twres); 1625 &movdqa (&QWP(16*0,"esp"),$inout3); 1626 &movdqa (&QWP(16*1,"esp"),$inout4); 1627 &je (&label("xts_dec_four")); 1628 1629 &movdqa (&QWP(16*2,"esp"),$inout5); 1630 &pshufd ($inout5,$twtmp,0x13); 1631 &movdqa (&QWP(16*3,"esp"),$tweak); 1632 &paddq ($tweak,$tweak); # &psllq($inout0,1); 1633 &pand ($inout5,$twmask); # isolate carry and residue 1634 &pxor ($inout5,$tweak); 1635 1636 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 1637 &movdqu ($inout1,&QWP(16*1,$inp)); 1638 &movdqu ($inout2,&QWP(16*2,$inp)); 1639 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1640 &movdqu ($inout3,&QWP(16*3,$inp)); 1641 &pxor ($inout1,&QWP(16*1,"esp")); 1642 &movdqu ($inout4,&QWP(16*4,$inp)); 1643 &pxor ($inout2,&QWP(16*2,"esp")); 1644 &lea ($inp,&DWP(16*5,$inp)); 1645 &pxor ($inout3,&QWP(16*3,"esp")); 1646 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak 1647 &pxor ($inout4,$inout5); 1648 1649 &call ("_aesni_decrypt6"); 1650 1651 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak 1652 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1653 &xorps ($inout1,&QWP(16*1,"esp")); 1654 &xorps ($inout2,&QWP(16*2,"esp")); 1655 &movups (&QWP(16*0,$out),$inout0); # write output 1656 &xorps ($inout3,&QWP(16*3,"esp")); 1657 &movups (&QWP(16*1,$out),$inout1); 1658 &xorps ($inout4,$tweak); 1659 &movups (&QWP(16*2,$out),$inout2); 1660 &movups (&QWP(16*3,$out),$inout3); 1661 &movups (&QWP(16*4,$out),$inout4); 1662 &lea ($out,&DWP(16*5,$out)); 1663 &jmp (&label("xts_dec_done")); 1664 1665&set_label("xts_dec_one",16); 1666 &movups ($inout0,&QWP(16*0,$inp)); # load input 1667 &lea ($inp,&DWP(16*1,$inp)); 1668 &xorps ($inout0,$inout3); # input^=tweak 1669 if ($inline) 1670 { &aesni_inline_generate1("dec"); } 1671 else 1672 { &call ("_aesni_decrypt1"); } 1673 &xorps ($inout0,$inout3); # output^=tweak 1674 &movups (&QWP(16*0,$out),$inout0); # write output 1675 &lea ($out,&DWP(16*1,$out)); 1676 1677 &movdqa ($tweak,$inout3); # last tweak 1678 &jmp (&label("xts_dec_done")); 1679 1680&set_label("xts_dec_two",16); 1681 &movaps ($inout4,$tweak); # put aside last tweak 1682 1683 &movups ($inout0,&QWP(16*0,$inp)); # load input 1684 &movups ($inout1,&QWP(16*1,$inp)); 1685 &lea ($inp,&DWP(16*2,$inp)); 1686 &xorps ($inout0,$inout3); # input^=tweak 1687 &xorps ($inout1,$inout4); 1688 1689 &call ("_aesni_decrypt2"); 1690 1691 &xorps ($inout0,$inout3); # output^=tweak 1692 &xorps ($inout1,$inout4); 1693 &movups (&QWP(16*0,$out),$inout0); # write output 1694 &movups (&QWP(16*1,$out),$inout1); 1695 &lea ($out,&DWP(16*2,$out)); 1696 1697 &movdqa ($tweak,$inout4); # last tweak 1698 &jmp (&label("xts_dec_done")); 1699 1700&set_label("xts_dec_three",16); 1701 &movaps ($inout5,$tweak); # put aside last tweak 1702 &movups ($inout0,&QWP(16*0,$inp)); # load input 1703 &movups ($inout1,&QWP(16*1,$inp)); 1704 &movups ($inout2,&QWP(16*2,$inp)); 1705 &lea ($inp,&DWP(16*3,$inp)); 1706 &xorps ($inout0,$inout3); # input^=tweak 1707 &xorps ($inout1,$inout4); 1708 &xorps ($inout2,$inout5); 1709 1710 &call ("_aesni_decrypt3"); 1711 1712 &xorps ($inout0,$inout3); # output^=tweak 1713 &xorps ($inout1,$inout4); 1714 &xorps ($inout2,$inout5); 1715 &movups (&QWP(16*0,$out),$inout0); # write output 1716 &movups (&QWP(16*1,$out),$inout1); 1717 &movups (&QWP(16*2,$out),$inout2); 1718 &lea ($out,&DWP(16*3,$out)); 1719 1720 &movdqa ($tweak,$inout5); # last tweak 1721 &jmp (&label("xts_dec_done")); 1722 1723&set_label("xts_dec_four",16); 1724 &movaps ($inout4,$tweak); # put aside last tweak 1725 1726 &movups ($inout0,&QWP(16*0,$inp)); # load input 1727 &movups ($inout1,&QWP(16*1,$inp)); 1728 &movups ($inout2,&QWP(16*2,$inp)); 1729 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak 1730 &movups ($inout3,&QWP(16*3,$inp)); 1731 &lea ($inp,&DWP(16*4,$inp)); 1732 &xorps ($inout1,&QWP(16*1,"esp")); 1733 &xorps ($inout2,$inout5); 1734 &xorps ($inout3,$inout4); 1735 1736 &call ("_aesni_decrypt4"); 1737 1738 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1739 &xorps ($inout1,&QWP(16*1,"esp")); 1740 &xorps ($inout2,$inout5); 1741 &movups (&QWP(16*0,$out),$inout0); # write output 1742 &xorps ($inout3,$inout4); 1743 &movups (&QWP(16*1,$out),$inout1); 1744 &movups (&QWP(16*2,$out),$inout2); 1745 &movups (&QWP(16*3,$out),$inout3); 1746 &lea ($out,&DWP(16*4,$out)); 1747 1748 &movdqa ($tweak,$inout4); # last tweak 1749 &jmp (&label("xts_dec_done")); 1750 1751&set_label("xts_dec_done6x",16); # $tweak is pre-calculated 1752 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1753 &and ($len,15); 1754 &jz (&label("xts_dec_ret")); 1755 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1756 &jmp (&label("xts_dec_only_one_more")); 1757 1758&set_label("xts_dec_done",16); 1759 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1760 &pxor ($twtmp,$twtmp); 1761 &and ($len,15); 1762 &jz (&label("xts_dec_ret")); 1763 1764 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1765 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1766 &pshufd ($twres,$twtmp,0x13); 1767 &pxor ($twtmp,$twtmp); 1768 &movdqa ($twmask,&QWP(16*6,"esp")); 1769 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1770 &pand ($twres,$twmask); # isolate carry and residue 1771 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1772 &pxor ($tweak,$twres); 1773 1774&set_label("xts_dec_only_one_more"); 1775 &pshufd ($inout3,$twtmp,0x13); 1776 &movdqa ($inout4,$tweak); # put aside previous tweak 1777 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1778 &pand ($inout3,$twmask); # isolate carry and residue 1779 &pxor ($inout3,$tweak); 1780 1781 &mov ($key,$key_); # restore $key 1782 &mov ($rounds,$rounds_); # restore $rounds 1783 1784 &movups ($inout0,&QWP(0,$inp)); # load input 1785 &xorps ($inout0,$inout3); # input^=tweak 1786 if ($inline) 1787 { &aesni_inline_generate1("dec"); } 1788 else 1789 { &call ("_aesni_decrypt1"); } 1790 &xorps ($inout0,$inout3); # output^=tweak 1791 &movups (&QWP(0,$out),$inout0); # write output 1792 1793&set_label("xts_dec_steal"); 1794 &movz ($rounds,&BP(16,$inp)); 1795 &movz ($key,&BP(0,$out)); 1796 &lea ($inp,&DWP(1,$inp)); 1797 &mov (&BP(0,$out),&LB($rounds)); 1798 &mov (&BP(16,$out),&LB($key)); 1799 &lea ($out,&DWP(1,$out)); 1800 &sub ($len,1); 1801 &jnz (&label("xts_dec_steal")); 1802 1803 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out 1804 &mov ($key,$key_); # restore $key 1805 &mov ($rounds,$rounds_); # restore $rounds 1806 1807 &movups ($inout0,&QWP(0,$out)); # load input 1808 &xorps ($inout0,$inout4); # input^=tweak 1809 if ($inline) 1810 { &aesni_inline_generate1("dec"); } 1811 else 1812 { &call ("_aesni_decrypt1"); } 1813 &xorps ($inout0,$inout4); # output^=tweak 1814 &movups (&QWP(0,$out),$inout0); # write output 1815 1816&set_label("xts_dec_ret"); 1817 &pxor ("xmm0","xmm0"); # clear register bank 1818 &pxor ("xmm1","xmm1"); 1819 &pxor ("xmm2","xmm2"); 1820 &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack 1821 &pxor ("xmm3","xmm3"); 1822 &movdqa (&QWP(16*1,"esp"),"xmm0"); 1823 &pxor ("xmm4","xmm4"); 1824 &movdqa (&QWP(16*2,"esp"),"xmm0"); 1825 &pxor ("xmm5","xmm5"); 1826 &movdqa (&QWP(16*3,"esp"),"xmm0"); 1827 &pxor ("xmm6","xmm6"); 1828 &movdqa (&QWP(16*4,"esp"),"xmm0"); 1829 &pxor ("xmm7","xmm7"); 1830 &movdqa (&QWP(16*5,"esp"),"xmm0"); 1831 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp 1832&function_end("aesni_xts_decrypt"); 1833} 1834} 1835 1836###################################################################### 1837# void $PREFIX_cbc_encrypt (const void *inp, void *out, 1838# size_t length, const AES_KEY *key, 1839# unsigned char *ivp,const int enc); 1840&function_begin("${PREFIX}_cbc_encrypt"); 1841 &mov ($inp,&wparam(0)); 1842 &mov ($rounds_,"esp"); 1843 &mov ($out,&wparam(1)); 1844 &sub ($rounds_,24); 1845 &mov ($len,&wparam(2)); 1846 &and ($rounds_,-16); 1847 &mov ($key,&wparam(3)); 1848 &mov ($key_,&wparam(4)); 1849 &test ($len,$len); 1850 &jz (&label("cbc_abort")); 1851 1852 &cmp (&wparam(5),0); 1853 &xchg ($rounds_,"esp"); # alloca 1854 &movups ($ivec,&QWP(0,$key_)); # load IV 1855 &mov ($rounds,&DWP(240,$key)); 1856 &mov ($key_,$key); # backup $key 1857 &mov (&DWP(16,"esp"),$rounds_); # save original %esp 1858 &mov ($rounds_,$rounds); # backup $rounds 1859 &je (&label("cbc_decrypt")); 1860 1861 &movaps ($inout0,$ivec); 1862 &cmp ($len,16); 1863 &jb (&label("cbc_enc_tail")); 1864 &sub ($len,16); 1865 &jmp (&label("cbc_enc_loop")); 1866 1867&set_label("cbc_enc_loop",16); 1868 &movups ($ivec,&QWP(0,$inp)); # input actually 1869 &lea ($inp,&DWP(16,$inp)); 1870 if ($inline) 1871 { &aesni_inline_generate1("enc",$inout0,$ivec); } 1872 else 1873 { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); } 1874 &mov ($rounds,$rounds_); # restore $rounds 1875 &mov ($key,$key_); # restore $key 1876 &movups (&QWP(0,$out),$inout0); # store output 1877 &lea ($out,&DWP(16,$out)); 1878 &sub ($len,16); 1879 &jnc (&label("cbc_enc_loop")); 1880 &add ($len,16); 1881 &jnz (&label("cbc_enc_tail")); 1882 &movaps ($ivec,$inout0); 1883 &pxor ($inout0,$inout0); 1884 &jmp (&label("cbc_ret")); 1885 1886&set_label("cbc_enc_tail"); 1887 &mov ("ecx",$len); # zaps $rounds 1888 &data_word(0xA4F3F689); # rep movsb 1889 &mov ("ecx",16); # zero tail 1890 &sub ("ecx",$len); 1891 &xor ("eax","eax"); # zaps $len 1892 &data_word(0xAAF3F689); # rep stosb 1893 &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block 1894 &mov ($rounds,$rounds_); # restore $rounds 1895 &mov ($inp,$out); # $inp and $out are the same 1896 &mov ($key,$key_); # restore $key 1897 &jmp (&label("cbc_enc_loop")); 1898###################################################################### 1899&set_label("cbc_decrypt",16); 1900 &cmp ($len,0x50); 1901 &jbe (&label("cbc_dec_tail")); 1902 &movaps (&QWP(0,"esp"),$ivec); # save IV 1903 &sub ($len,0x50); 1904 &jmp (&label("cbc_dec_loop6_enter")); 1905 1906&set_label("cbc_dec_loop6",16); 1907 &movaps (&QWP(0,"esp"),$rndkey0); # save IV 1908 &movups (&QWP(0,$out),$inout5); 1909 &lea ($out,&DWP(0x10,$out)); 1910&set_label("cbc_dec_loop6_enter"); 1911 &movdqu ($inout0,&QWP(0,$inp)); 1912 &movdqu ($inout1,&QWP(0x10,$inp)); 1913 &movdqu ($inout2,&QWP(0x20,$inp)); 1914 &movdqu ($inout3,&QWP(0x30,$inp)); 1915 &movdqu ($inout4,&QWP(0x40,$inp)); 1916 &movdqu ($inout5,&QWP(0x50,$inp)); 1917 1918 &call ("_aesni_decrypt6"); 1919 1920 &movups ($rndkey1,&QWP(0,$inp)); 1921 &movups ($rndkey0,&QWP(0x10,$inp)); 1922 &xorps ($inout0,&QWP(0,"esp")); # ^=IV 1923 &xorps ($inout1,$rndkey1); 1924 &movups ($rndkey1,&QWP(0x20,$inp)); 1925 &xorps ($inout2,$rndkey0); 1926 &movups ($rndkey0,&QWP(0x30,$inp)); 1927 &xorps ($inout3,$rndkey1); 1928 &movups ($rndkey1,&QWP(0x40,$inp)); 1929 &xorps ($inout4,$rndkey0); 1930 &movups ($rndkey0,&QWP(0x50,$inp)); # IV 1931 &xorps ($inout5,$rndkey1); 1932 &movups (&QWP(0,$out),$inout0); 1933 &movups (&QWP(0x10,$out),$inout1); 1934 &lea ($inp,&DWP(0x60,$inp)); 1935 &movups (&QWP(0x20,$out),$inout2); 1936 &mov ($rounds,$rounds_); # restore $rounds 1937 &movups (&QWP(0x30,$out),$inout3); 1938 &mov ($key,$key_); # restore $key 1939 &movups (&QWP(0x40,$out),$inout4); 1940 &lea ($out,&DWP(0x50,$out)); 1941 &sub ($len,0x60); 1942 &ja (&label("cbc_dec_loop6")); 1943 1944 &movaps ($inout0,$inout5); 1945 &movaps ($ivec,$rndkey0); 1946 &add ($len,0x50); 1947 &jle (&label("cbc_dec_clear_tail_collected")); 1948 &movups (&QWP(0,$out),$inout0); 1949 &lea ($out,&DWP(0x10,$out)); 1950&set_label("cbc_dec_tail"); 1951 &movups ($inout0,&QWP(0,$inp)); 1952 &movaps ($in0,$inout0); 1953 &cmp ($len,0x10); 1954 &jbe (&label("cbc_dec_one")); 1955 1956 &movups ($inout1,&QWP(0x10,$inp)); 1957 &movaps ($in1,$inout1); 1958 &cmp ($len,0x20); 1959 &jbe (&label("cbc_dec_two")); 1960 1961 &movups ($inout2,&QWP(0x20,$inp)); 1962 &cmp ($len,0x30); 1963 &jbe (&label("cbc_dec_three")); 1964 1965 &movups ($inout3,&QWP(0x30,$inp)); 1966 &cmp ($len,0x40); 1967 &jbe (&label("cbc_dec_four")); 1968 1969 &movups ($inout4,&QWP(0x40,$inp)); 1970 &movaps (&QWP(0,"esp"),$ivec); # save IV 1971 &movups ($inout0,&QWP(0,$inp)); 1972 &xorps ($inout5,$inout5); 1973 &call ("_aesni_decrypt6"); 1974 &movups ($rndkey1,&QWP(0,$inp)); 1975 &movups ($rndkey0,&QWP(0x10,$inp)); 1976 &xorps ($inout0,&QWP(0,"esp")); # ^= IV 1977 &xorps ($inout1,$rndkey1); 1978 &movups ($rndkey1,&QWP(0x20,$inp)); 1979 &xorps ($inout2,$rndkey0); 1980 &movups ($rndkey0,&QWP(0x30,$inp)); 1981 &xorps ($inout3,$rndkey1); 1982 &movups ($ivec,&QWP(0x40,$inp)); # IV 1983 &xorps ($inout4,$rndkey0); 1984 &movups (&QWP(0,$out),$inout0); 1985 &movups (&QWP(0x10,$out),$inout1); 1986 &pxor ($inout1,$inout1); 1987 &movups (&QWP(0x20,$out),$inout2); 1988 &pxor ($inout2,$inout2); 1989 &movups (&QWP(0x30,$out),$inout3); 1990 &pxor ($inout3,$inout3); 1991 &lea ($out,&DWP(0x40,$out)); 1992 &movaps ($inout0,$inout4); 1993 &pxor ($inout4,$inout4); 1994 &sub ($len,0x50); 1995 &jmp (&label("cbc_dec_tail_collected")); 1996 1997&set_label("cbc_dec_one",16); 1998 if ($inline) 1999 { &aesni_inline_generate1("dec"); } 2000 else 2001 { &call ("_aesni_decrypt1"); } 2002 &xorps ($inout0,$ivec); 2003 &movaps ($ivec,$in0); 2004 &sub ($len,0x10); 2005 &jmp (&label("cbc_dec_tail_collected")); 2006 2007&set_label("cbc_dec_two",16); 2008 &call ("_aesni_decrypt2"); 2009 &xorps ($inout0,$ivec); 2010 &xorps ($inout1,$in0); 2011 &movups (&QWP(0,$out),$inout0); 2012 &movaps ($inout0,$inout1); 2013 &pxor ($inout1,$inout1); 2014 &lea ($out,&DWP(0x10,$out)); 2015 &movaps ($ivec,$in1); 2016 &sub ($len,0x20); 2017 &jmp (&label("cbc_dec_tail_collected")); 2018 2019&set_label("cbc_dec_three",16); 2020 &call ("_aesni_decrypt3"); 2021 &xorps ($inout0,$ivec); 2022 &xorps ($inout1,$in0); 2023 &xorps ($inout2,$in1); 2024 &movups (&QWP(0,$out),$inout0); 2025 &movaps ($inout0,$inout2); 2026 &pxor ($inout2,$inout2); 2027 &movups (&QWP(0x10,$out),$inout1); 2028 &pxor ($inout1,$inout1); 2029 &lea ($out,&DWP(0x20,$out)); 2030 &movups ($ivec,&QWP(0x20,$inp)); 2031 &sub ($len,0x30); 2032 &jmp (&label("cbc_dec_tail_collected")); 2033 2034&set_label("cbc_dec_four",16); 2035 &call ("_aesni_decrypt4"); 2036 &movups ($rndkey1,&QWP(0x10,$inp)); 2037 &movups ($rndkey0,&QWP(0x20,$inp)); 2038 &xorps ($inout0,$ivec); 2039 &movups ($ivec,&QWP(0x30,$inp)); 2040 &xorps ($inout1,$in0); 2041 &movups (&QWP(0,$out),$inout0); 2042 &xorps ($inout2,$rndkey1); 2043 &movups (&QWP(0x10,$out),$inout1); 2044 &pxor ($inout1,$inout1); 2045 &xorps ($inout3,$rndkey0); 2046 &movups (&QWP(0x20,$out),$inout2); 2047 &pxor ($inout2,$inout2); 2048 &lea ($out,&DWP(0x30,$out)); 2049 &movaps ($inout0,$inout3); 2050 &pxor ($inout3,$inout3); 2051 &sub ($len,0x40); 2052 &jmp (&label("cbc_dec_tail_collected")); 2053 2054&set_label("cbc_dec_clear_tail_collected",16); 2055 &pxor ($inout1,$inout1); 2056 &pxor ($inout2,$inout2); 2057 &pxor ($inout3,$inout3); 2058 &pxor ($inout4,$inout4); 2059&set_label("cbc_dec_tail_collected"); 2060 &and ($len,15); 2061 &jnz (&label("cbc_dec_tail_partial")); 2062 &movups (&QWP(0,$out),$inout0); 2063 &pxor ($rndkey0,$rndkey0); 2064 &jmp (&label("cbc_ret")); 2065 2066&set_label("cbc_dec_tail_partial",16); 2067 &movaps (&QWP(0,"esp"),$inout0); 2068 &pxor ($rndkey0,$rndkey0); 2069 &mov ("ecx",16); 2070 &mov ($inp,"esp"); 2071 &sub ("ecx",$len); 2072 &data_word(0xA4F3F689); # rep movsb 2073 &movdqa (&QWP(0,"esp"),$inout0); 2074 2075&set_label("cbc_ret"); 2076 &mov ("esp",&DWP(16,"esp")); # pull original %esp 2077 &mov ($key_,&wparam(4)); 2078 &pxor ($inout0,$inout0); 2079 &pxor ($rndkey1,$rndkey1); 2080 &movups (&QWP(0,$key_),$ivec); # output IV 2081 &pxor ($ivec,$ivec); 2082&set_label("cbc_abort"); 2083&function_end("${PREFIX}_cbc_encrypt"); 2084 2085###################################################################### 2086# Mechanical port from aesni-x86_64.pl. 2087# 2088# _aesni_set_encrypt_key is private interface, 2089# input: 2090# "eax" const unsigned char *userKey 2091# $rounds int bits 2092# $key AES_KEY *key 2093# output: 2094# "eax" return code 2095# $round rounds 2096 2097&function_begin_B("_aesni_set_encrypt_key"); 2098 &push ("ebp"); 2099 &push ("ebx"); 2100 &test ("eax","eax"); 2101 &jz (&label("bad_pointer")); 2102 &test ($key,$key); 2103 &jz (&label("bad_pointer")); 2104 2105 &call (&label("pic")); 2106&set_label("pic"); 2107 &blindpop("ebx"); 2108 &lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx")); 2109 2110 &picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const")); 2111 &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey 2112 &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0 2113 &mov ("ebp",&DWP(4,"ebp")); 2114 &lea ($key,&DWP(16,$key)); 2115 &and ("ebp",1<<28|1<<11); # AVX and XOP bits 2116 &cmp ($rounds,256); 2117 &je (&label("14rounds")); 2118 &cmp ($rounds,192); 2119 &je (&label("12rounds")); 2120 &cmp ($rounds,128); 2121 &jne (&label("bad_keybits")); 2122 2123&set_label("10rounds",16); 2124 &cmp ("ebp",1<<28); 2125 &je (&label("10rounds_alt")); 2126 2127 &mov ($rounds,9); 2128 &$movekey (&QWP(-16,$key),"xmm0"); # round 0 2129 &aeskeygenassist("xmm1","xmm0",0x01); # round 1 2130 &call (&label("key_128_cold")); 2131 &aeskeygenassist("xmm1","xmm0",0x2); # round 2 2132 &call (&label("key_128")); 2133 &aeskeygenassist("xmm1","xmm0",0x04); # round 3 2134 &call (&label("key_128")); 2135 &aeskeygenassist("xmm1","xmm0",0x08); # round 4 2136 &call (&label("key_128")); 2137 &aeskeygenassist("xmm1","xmm0",0x10); # round 5 2138 &call (&label("key_128")); 2139 &aeskeygenassist("xmm1","xmm0",0x20); # round 6 2140 &call (&label("key_128")); 2141 &aeskeygenassist("xmm1","xmm0",0x40); # round 7 2142 &call (&label("key_128")); 2143 &aeskeygenassist("xmm1","xmm0",0x80); # round 8 2144 &call (&label("key_128")); 2145 &aeskeygenassist("xmm1","xmm0",0x1b); # round 9 2146 &call (&label("key_128")); 2147 &aeskeygenassist("xmm1","xmm0",0x36); # round 10 2148 &call (&label("key_128")); 2149 &$movekey (&QWP(0,$key),"xmm0"); 2150 &mov (&DWP(80,$key),$rounds); 2151 2152 &jmp (&label("good_key")); 2153 2154&set_label("key_128",16); 2155 &$movekey (&QWP(0,$key),"xmm0"); 2156 &lea ($key,&DWP(16,$key)); 2157&set_label("key_128_cold"); 2158 &shufps ("xmm4","xmm0",0b00010000); 2159 &xorps ("xmm0","xmm4"); 2160 &shufps ("xmm4","xmm0",0b10001100); 2161 &xorps ("xmm0","xmm4"); 2162 &shufps ("xmm1","xmm1",0b11111111); # critical path 2163 &xorps ("xmm0","xmm1"); 2164 &ret(); 2165 2166&set_label("10rounds_alt",16); 2167 &movdqa ("xmm5",&QWP(0x00,"ebx")); 2168 &mov ($rounds,8); 2169 &movdqa ("xmm4",&QWP(0x20,"ebx")); 2170 &movdqa ("xmm2","xmm0"); 2171 &movdqu (&QWP(-16,$key),"xmm0"); 2172 2173&set_label("loop_key128"); 2174 &pshufb ("xmm0","xmm5"); 2175 &aesenclast ("xmm0","xmm4"); 2176 &pslld ("xmm4",1); 2177 &lea ($key,&DWP(16,$key)); 2178 2179 &movdqa ("xmm3","xmm2"); 2180 &pslldq ("xmm2",4); 2181 &pxor ("xmm3","xmm2"); 2182 &pslldq ("xmm2",4); 2183 &pxor ("xmm3","xmm2"); 2184 &pslldq ("xmm2",4); 2185 &pxor ("xmm2","xmm3"); 2186 2187 &pxor ("xmm0","xmm2"); 2188 &movdqu (&QWP(-16,$key),"xmm0"); 2189 &movdqa ("xmm2","xmm0"); 2190 2191 &dec ($rounds); 2192 &jnz (&label("loop_key128")); 2193 2194 &movdqa ("xmm4",&QWP(0x30,"ebx")); 2195 2196 &pshufb ("xmm0","xmm5"); 2197 &aesenclast ("xmm0","xmm4"); 2198 &pslld ("xmm4",1); 2199 2200 &movdqa ("xmm3","xmm2"); 2201 &pslldq ("xmm2",4); 2202 &pxor ("xmm3","xmm2"); 2203 &pslldq ("xmm2",4); 2204 &pxor ("xmm3","xmm2"); 2205 &pslldq ("xmm2",4); 2206 &pxor ("xmm2","xmm3"); 2207 2208 &pxor ("xmm0","xmm2"); 2209 &movdqu (&QWP(0,$key),"xmm0"); 2210 2211 &movdqa ("xmm2","xmm0"); 2212 &pshufb ("xmm0","xmm5"); 2213 &aesenclast ("xmm0","xmm4"); 2214 2215 &movdqa ("xmm3","xmm2"); 2216 &pslldq ("xmm2",4); 2217 &pxor ("xmm3","xmm2"); 2218 &pslldq ("xmm2",4); 2219 &pxor ("xmm3","xmm2"); 2220 &pslldq ("xmm2",4); 2221 &pxor ("xmm2","xmm3"); 2222 2223 &pxor ("xmm0","xmm2"); 2224 &movdqu (&QWP(16,$key),"xmm0"); 2225 2226 &mov ($rounds,9); 2227 &mov (&DWP(96,$key),$rounds); 2228 2229 &jmp (&label("good_key")); 2230 2231&set_label("12rounds",16); 2232 &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey 2233 &cmp ("ebp",1<<28); 2234 &je (&label("12rounds_alt")); 2235 2236 &mov ($rounds,11); 2237 &$movekey (&QWP(-16,$key),"xmm0"); # round 0 2238 &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2 2239 &call (&label("key_192a_cold")); 2240 &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3 2241 &call (&label("key_192b")); 2242 &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5 2243 &call (&label("key_192a")); 2244 &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6 2245 &call (&label("key_192b")); 2246 &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8 2247 &call (&label("key_192a")); 2248 &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9 2249 &call (&label("key_192b")); 2250 &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11 2251 &call (&label("key_192a")); 2252 &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12 2253 &call (&label("key_192b")); 2254 &$movekey (&QWP(0,$key),"xmm0"); 2255 &mov (&DWP(48,$key),$rounds); 2256 2257 &jmp (&label("good_key")); 2258 2259&set_label("key_192a",16); 2260 &$movekey (&QWP(0,$key),"xmm0"); 2261 &lea ($key,&DWP(16,$key)); 2262&set_label("key_192a_cold",16); 2263 &movaps ("xmm5","xmm2"); 2264&set_label("key_192b_warm"); 2265 &shufps ("xmm4","xmm0",0b00010000); 2266 &movdqa ("xmm3","xmm2"); 2267 &xorps ("xmm0","xmm4"); 2268 &shufps ("xmm4","xmm0",0b10001100); 2269 &pslldq ("xmm3",4); 2270 &xorps ("xmm0","xmm4"); 2271 &pshufd ("xmm1","xmm1",0b01010101); # critical path 2272 &pxor ("xmm2","xmm3"); 2273 &pxor ("xmm0","xmm1"); 2274 &pshufd ("xmm3","xmm0",0b11111111); 2275 &pxor ("xmm2","xmm3"); 2276 &ret(); 2277 2278&set_label("key_192b",16); 2279 &movaps ("xmm3","xmm0"); 2280 &shufps ("xmm5","xmm0",0b01000100); 2281 &$movekey (&QWP(0,$key),"xmm5"); 2282 &shufps ("xmm3","xmm2",0b01001110); 2283 &$movekey (&QWP(16,$key),"xmm3"); 2284 &lea ($key,&DWP(32,$key)); 2285 &jmp (&label("key_192b_warm")); 2286 2287&set_label("12rounds_alt",16); 2288 &movdqa ("xmm5",&QWP(0x10,"ebx")); 2289 &movdqa ("xmm4",&QWP(0x20,"ebx")); 2290 &mov ($rounds,8); 2291 &movdqu (&QWP(-16,$key),"xmm0"); 2292 2293&set_label("loop_key192"); 2294 &movq (&QWP(0,$key),"xmm2"); 2295 &movdqa ("xmm1","xmm2"); 2296 &pshufb ("xmm2","xmm5"); 2297 &aesenclast ("xmm2","xmm4"); 2298 &pslld ("xmm4",1); 2299 &lea ($key,&DWP(24,$key)); 2300 2301 &movdqa ("xmm3","xmm0"); 2302 &pslldq ("xmm0",4); 2303 &pxor ("xmm3","xmm0"); 2304 &pslldq ("xmm0",4); 2305 &pxor ("xmm3","xmm0"); 2306 &pslldq ("xmm0",4); 2307 &pxor ("xmm0","xmm3"); 2308 2309 &pshufd ("xmm3","xmm0",0xff); 2310 &pxor ("xmm3","xmm1"); 2311 &pslldq ("xmm1",4); 2312 &pxor ("xmm3","xmm1"); 2313 2314 &pxor ("xmm0","xmm2"); 2315 &pxor ("xmm2","xmm3"); 2316 &movdqu (&QWP(-16,$key),"xmm0"); 2317 2318 &dec ($rounds); 2319 &jnz (&label("loop_key192")); 2320 2321 &mov ($rounds,11); 2322 &mov (&DWP(32,$key),$rounds); 2323 2324 &jmp (&label("good_key")); 2325 2326&set_label("14rounds",16); 2327 &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey 2328 &lea ($key,&DWP(16,$key)); 2329 &cmp ("ebp",1<<28); 2330 &je (&label("14rounds_alt")); 2331 2332 &mov ($rounds,13); 2333 &$movekey (&QWP(-32,$key),"xmm0"); # round 0 2334 &$movekey (&QWP(-16,$key),"xmm2"); # round 1 2335 &aeskeygenassist("xmm1","xmm2",0x01); # round 2 2336 &call (&label("key_256a_cold")); 2337 &aeskeygenassist("xmm1","xmm0",0x01); # round 3 2338 &call (&label("key_256b")); 2339 &aeskeygenassist("xmm1","xmm2",0x02); # round 4 2340 &call (&label("key_256a")); 2341 &aeskeygenassist("xmm1","xmm0",0x02); # round 5 2342 &call (&label("key_256b")); 2343 &aeskeygenassist("xmm1","xmm2",0x04); # round 6 2344 &call (&label("key_256a")); 2345 &aeskeygenassist("xmm1","xmm0",0x04); # round 7 2346 &call (&label("key_256b")); 2347 &aeskeygenassist("xmm1","xmm2",0x08); # round 8 2348 &call (&label("key_256a")); 2349 &aeskeygenassist("xmm1","xmm0",0x08); # round 9 2350 &call (&label("key_256b")); 2351 &aeskeygenassist("xmm1","xmm2",0x10); # round 10 2352 &call (&label("key_256a")); 2353 &aeskeygenassist("xmm1","xmm0",0x10); # round 11 2354 &call (&label("key_256b")); 2355 &aeskeygenassist("xmm1","xmm2",0x20); # round 12 2356 &call (&label("key_256a")); 2357 &aeskeygenassist("xmm1","xmm0",0x20); # round 13 2358 &call (&label("key_256b")); 2359 &aeskeygenassist("xmm1","xmm2",0x40); # round 14 2360 &call (&label("key_256a")); 2361 &$movekey (&QWP(0,$key),"xmm0"); 2362 &mov (&DWP(16,$key),$rounds); 2363 &xor ("eax","eax"); 2364 2365 &jmp (&label("good_key")); 2366 2367&set_label("key_256a",16); 2368 &$movekey (&QWP(0,$key),"xmm2"); 2369 &lea ($key,&DWP(16,$key)); 2370&set_label("key_256a_cold"); 2371 &shufps ("xmm4","xmm0",0b00010000); 2372 &xorps ("xmm0","xmm4"); 2373 &shufps ("xmm4","xmm0",0b10001100); 2374 &xorps ("xmm0","xmm4"); 2375 &shufps ("xmm1","xmm1",0b11111111); # critical path 2376 &xorps ("xmm0","xmm1"); 2377 &ret(); 2378 2379&set_label("key_256b",16); 2380 &$movekey (&QWP(0,$key),"xmm0"); 2381 &lea ($key,&DWP(16,$key)); 2382 2383 &shufps ("xmm4","xmm2",0b00010000); 2384 &xorps ("xmm2","xmm4"); 2385 &shufps ("xmm4","xmm2",0b10001100); 2386 &xorps ("xmm2","xmm4"); 2387 &shufps ("xmm1","xmm1",0b10101010); # critical path 2388 &xorps ("xmm2","xmm1"); 2389 &ret(); 2390 2391&set_label("14rounds_alt",16); 2392 &movdqa ("xmm5",&QWP(0x00,"ebx")); 2393 &movdqa ("xmm4",&QWP(0x20,"ebx")); 2394 &mov ($rounds,7); 2395 &movdqu (&QWP(-32,$key),"xmm0"); 2396 &movdqa ("xmm1","xmm2"); 2397 &movdqu (&QWP(-16,$key),"xmm2"); 2398 2399&set_label("loop_key256"); 2400 &pshufb ("xmm2","xmm5"); 2401 &aesenclast ("xmm2","xmm4"); 2402 2403 &movdqa ("xmm3","xmm0"); 2404 &pslldq ("xmm0",4); 2405 &pxor ("xmm3","xmm0"); 2406 &pslldq ("xmm0",4); 2407 &pxor ("xmm3","xmm0"); 2408 &pslldq ("xmm0",4); 2409 &pxor ("xmm0","xmm3"); 2410 &pslld ("xmm4",1); 2411 2412 &pxor ("xmm0","xmm2"); 2413 &movdqu (&QWP(0,$key),"xmm0"); 2414 2415 &dec ($rounds); 2416 &jz (&label("done_key256")); 2417 2418 &pshufd ("xmm2","xmm0",0xff); 2419 &pxor ("xmm3","xmm3"); 2420 &aesenclast ("xmm2","xmm3"); 2421 2422 &movdqa ("xmm3","xmm1") 2423 &pslldq ("xmm1",4); 2424 &pxor ("xmm3","xmm1"); 2425 &pslldq ("xmm1",4); 2426 &pxor ("xmm3","xmm1"); 2427 &pslldq ("xmm1",4); 2428 &pxor ("xmm1","xmm3"); 2429 2430 &pxor ("xmm2","xmm1"); 2431 &movdqu (&QWP(16,$key),"xmm2"); 2432 &lea ($key,&DWP(32,$key)); 2433 &movdqa ("xmm1","xmm2"); 2434 &jmp (&label("loop_key256")); 2435 2436&set_label("done_key256"); 2437 &mov ($rounds,13); 2438 &mov (&DWP(16,$key),$rounds); 2439 2440&set_label("good_key"); 2441 &pxor ("xmm0","xmm0"); 2442 &pxor ("xmm1","xmm1"); 2443 &pxor ("xmm2","xmm2"); 2444 &pxor ("xmm3","xmm3"); 2445 &pxor ("xmm4","xmm4"); 2446 &pxor ("xmm5","xmm5"); 2447 &xor ("eax","eax"); 2448 &pop ("ebx"); 2449 &pop ("ebp"); 2450 &ret (); 2451 2452&set_label("bad_pointer",4); 2453 &mov ("eax",-1); 2454 &pop ("ebx"); 2455 &pop ("ebp"); 2456 &ret (); 2457&set_label("bad_keybits",4); 2458 &pxor ("xmm0","xmm0"); 2459 &mov ("eax",-2); 2460 &pop ("ebx"); 2461 &pop ("ebp"); 2462 &ret (); 2463&function_end_B("_aesni_set_encrypt_key"); 2464 2465# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits, 2466# AES_KEY *key) 2467&function_begin_B("${PREFIX}_set_encrypt_key"); 2468 &mov ("eax",&wparam(0)); 2469 &mov ($rounds,&wparam(1)); 2470 &mov ($key,&wparam(2)); 2471 &call ("_aesni_set_encrypt_key"); 2472 &ret (); 2473&function_end_B("${PREFIX}_set_encrypt_key"); 2474 2475# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits, 2476# AES_KEY *key) 2477&function_begin_B("${PREFIX}_set_decrypt_key"); 2478 &mov ("eax",&wparam(0)); 2479 &mov ($rounds,&wparam(1)); 2480 &mov ($key,&wparam(2)); 2481 &call ("_aesni_set_encrypt_key"); 2482 &mov ($key,&wparam(2)); 2483 &shl ($rounds,4); # rounds-1 after _aesni_set_encrypt_key 2484 &test ("eax","eax"); 2485 &jnz (&label("dec_key_ret")); 2486 &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule 2487 2488 &$movekey ("xmm0",&QWP(0,$key)); # just swap 2489 &$movekey ("xmm1",&QWP(0,"eax")); 2490 &$movekey (&QWP(0,"eax"),"xmm0"); 2491 &$movekey (&QWP(0,$key),"xmm1"); 2492 &lea ($key,&DWP(16,$key)); 2493 &lea ("eax",&DWP(-16,"eax")); 2494 2495&set_label("dec_key_inverse"); 2496 &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse 2497 &$movekey ("xmm1",&QWP(0,"eax")); 2498 &aesimc ("xmm0","xmm0"); 2499 &aesimc ("xmm1","xmm1"); 2500 &lea ($key,&DWP(16,$key)); 2501 &lea ("eax",&DWP(-16,"eax")); 2502 &$movekey (&QWP(16,"eax"),"xmm0"); 2503 &$movekey (&QWP(-16,$key),"xmm1"); 2504 &cmp ("eax",$key); 2505 &ja (&label("dec_key_inverse")); 2506 2507 &$movekey ("xmm0",&QWP(0,$key)); # inverse middle 2508 &aesimc ("xmm0","xmm0"); 2509 &$movekey (&QWP(0,$key),"xmm0"); 2510 2511 &pxor ("xmm0","xmm0"); 2512 &pxor ("xmm1","xmm1"); 2513 &xor ("eax","eax"); # return success 2514&set_label("dec_key_ret"); 2515 &ret (); 2516&function_end_B("${PREFIX}_set_decrypt_key"); 2517 2518&set_label("key_const",64); 2519&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d); 2520&data_word(0x04070605,0x04070605,0x04070605,0x04070605); 2521&data_word(1,1,1,1); 2522&data_word(0x1b,0x1b,0x1b,0x1b); 2523&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"); 2524 2525&asm_finish(); 2526