#!/usr/bin/env perl

######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.

######################################################################
# September 2011.
#
# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
# doesn't handle partial vectors (doesn't have to if called from
# EVP only). "Drop-in" implies that this module doesn't share key
# schedule structure with the original nor does it make assumption
# about its alignment...
#
# Performance summary. aes-586.pl column lists large-block CBC
# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
# byte processed with 128-bit key, and vpaes-x86.pl column - [also
# large-block CBC] encrypt/decrypt.
#
#               aes-586.pl              vpaes-x86.pl
#
# Core 2(**)    28.1/41.4/18.3          21.9/25.2(***)
# Nehalem       27.9/40.4/18.1          10.2/11.9
# Atom          70.7/92.1/60.1          61.1/75.4(***)
# Silvermont    45.4/62.9/24.1          49.2/61.1(***)
#
# (*)   "Hyper-threading" in the context refers rather to cache shared
#       among multiple cores, than to specifically Intel HTT. As vast
#       majority of contemporary cores share cache, slower code path
#       is common place. In other words "with-hyper-threading-off"
#       results are presented mostly for reference purposes.
#
# (**)  "Core 2" refers to initial 65nm design, a.k.a. Conroe.
#
# (***) Less impressive improvement on Core 2 and Atom is due to slow
#       pshufb, yet it's respectable +28%/64% improvement on Core 2
#       and +15% on Atom (as implied, over "hyper-threading-safe"
#       code path).
#
#                                               <appro@openssl.org>

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");

$PREFIX="vpaes";

# Symbolic names for the general-purpose registers used throughout.
my  ($round, $base, $magic, $key, $const, $inp, $out)=
    ("eax",  "ebx", "ecx",  "edx","ebp",  "esi","edi");

&static_label("_vpaes_consts");
&static_label("_vpaes_schedule_low_round");

##
##  Constant tables
##
##  Every $k_* value below is a byte offset relative to
##  _vpaes_consts+0x30, which is the address the entry points leave in
##  $const.  The first table therefore sits at offset -0x30, and each
##  data_word() call emits 16 bytes.
##
&set_label("_vpaes_consts",64);
$k_inv=-0x30;           # inv, inva
    &data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
    &data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);

$k_s0F=-0x10;           # s0F
    &data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);

$k_ipt=0x00;            # input transform (lo, hi)
    &data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
    &data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);

$k_sb1=0x20;            # sb1u, sb1t
    &data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
    &data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
$k_sb2=0x40;            # sb2u, sb2t
    &data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
    &data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
$k_sbo=0x60;            # sbou, sbot
    &data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
    &data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);

$k_mc_forward=0x80;     # mc_forward
    &data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
    &data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
    &data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
    &data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);

$k_mc_backward=0xc0;    # mc_backward
    &data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
    &data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
    &data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
    &data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);

$k_sr=0x100;            # sr
    &data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
    &data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
    &data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
    &data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);

$k_rcon=0x140;          # rcon
    &data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);

$k_s63=0x150;           # s63: all equal to 0x63 transformed
    &data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);

$k_opt=0x160;           # output transform
    &data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
    &data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);

$k_deskew=0x180;        # deskew tables: inverts the sbox's "skew"
    &data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
    &data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
##
##  Decryption stuff
##  Key schedule constants
##
$k_dksd=0x1a0;          # decryption key schedule: invskew x*D
    &data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
    &data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
$k_dksb=0x1c0;          # decryption key schedule: invskew x*B
    &data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
    &data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
$k_dkse=0x1e0;          # decryption key schedule: invskew x*E + 0x63
    &data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
    &data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
$k_dks9=0x200;          # decryption key schedule: invskew x*9
    &data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
    &data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);

##
##  Decryption stuff
##  Round function constants
##
$k_dipt=0x220;          # decryption input transform
    &data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
    &data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);

$k_dsb9=0x240;          # decryption sbox output *9*u, *9*t
    &data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
    &data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
$k_dsbd=0x260;          # decryption sbox output *D*u, *D*t
    &data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
    &data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
$k_dsbb=0x280;          # decryption sbox output *B*u, *B*t
    &data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
    &data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
$k_dsbe=0x2a0;          # decryption sbox output *E*u, *E*t
    &data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
    &data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
$k_dsbo=0x2c0;          # decryption sbox final output
    &data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
    &data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
&asciz  ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
&align  (64);

##
##  _vpaes_preheat
##
##  Expects $const to hold the link-time difference
##  (_vpaes_consts+0x30) - (return address of this call).  Adding the
##  return address found at 0(%esp) turns it into the run-time address
##  of the tables, without needing absolute relocations.
##
##  Outputs:
##     $const = address of _vpaes_consts+0x30
##     %xmm7  = 16 bytes at $k_inv
##     %xmm6  = 16 bytes at $k_s0F
##
&function_begin_B("_vpaes_preheat");
    &add    ($const,&DWP(0,"esp"));         # + return address
    &movdqa ("xmm7",&QWP($k_inv,$const));
    &movdqa ("xmm6",&QWP($k_s0F,$const));
    &ret    ();
&function_end_B("_vpaes_preheat");

##
##  _aes_encrypt_core
##
##  AES-encrypt %xmm0.
##
##  Inputs:
##     %xmm0 = input
##     %xmm6-%xmm7 as in _vpaes_preheat
##    (%edx) = scheduled keys
##
##  Output in %xmm0
##  Clobbers  %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
##
##  The round count is read from offset 240 of the key structure.
##  $key is advanced by 16 for every round key consumed, so callers
##  that need the key pointer again must reload it.
##
&function_begin_B("_vpaes_encrypt_core");
    &mov    ($magic,16);
    &mov    ($round,&DWP(240,$key));
    &movdqa ("xmm1","xmm6");
    &movdqa ("xmm2",&QWP($k_ipt,$const));
    &pandn  ("xmm1","xmm0");
    &pand   ("xmm0","xmm6");
    &movdqu ("xmm5",&QWP(0,$key));
    &pshufb ("xmm2","xmm0");
    &movdqa ("xmm0",&QWP($k_ipt+16,$const));
    &pxor   ("xmm2","xmm5");
    &psrld  ("xmm1",4);
    &add    ($key,16);
    &pshufb ("xmm0","xmm1");
    &lea    ($base,&DWP($k_mc_backward,$const));
    &pxor   ("xmm0","xmm2");
    &jmp    (&label("enc_entry"));


&set_label("enc_loop",16);
    # middle of middle round
    &movdqa ("xmm4",&QWP($k_sb1,$const));       # 4 : sb1u
    &movdqa ("xmm0",&QWP($k_sb1+16,$const));    # 0 : sb1t
    &pshufb ("xmm4","xmm2");                    # 4 = sb1u
    &pshufb ("xmm0","xmm3");                    # 0 = sb1t
    &pxor   ("xmm4","xmm5");                    # 4 = sb1u + k
    &movdqa ("xmm5",&QWP($k_sb2,$const));       # 4 : sb2u
    &pxor   ("xmm0","xmm4");                    # 0 = A
    &movdqa ("xmm1",&QWP(-0x40,$base,$magic));  # .Lk_mc_forward[]
    &pshufb ("xmm5","xmm2");                    # 4 = sb2u
    &movdqa ("xmm2",&QWP($k_sb2+16,$const));    # 2 : sb2t
    &movdqa ("xmm4",&QWP(0,$base,$magic));      # .Lk_mc_backward[]
    &pshufb ("xmm2","xmm3");                    # 2 = sb2t
    &movdqa ("xmm3","xmm0");                    # 3 = A
    &pxor   ("xmm2","xmm5");                    # 2 = 2A
    &pshufb ("xmm0","xmm1");                    # 0 = B
    &add    ($key,16);                          # next key
    &pxor   ("xmm0","xmm2");                    # 0 = 2A+B
    &pshufb ("xmm3","xmm4");                    # 3 = D
    &add    ($magic,16);                        # next mc
    &pxor   ("xmm3","xmm0");                    # 3 = 2A+B+D
    &pshufb ("xmm0","xmm1");                    # 0 = 2B+C
    &and    ($magic,0x30);                      # ... mod 4
    &sub    ($round,1);                         # nr--
    &pxor   ("xmm0","xmm3");                    # 0 = 2A+3B+C+D

&set_label("enc_entry");
    # top of round
    # (no instruction from here to the jnz writes the flags, so the jnz
    #  tests the result of the "sub $round,1" above)
    &movdqa ("xmm1","xmm6");                    # 1 : i
    &movdqa ("xmm5",&QWP($k_inv+16,$const));    # 2 : a/k
    &pandn  ("xmm1","xmm0");                    # 1 = i<<4
    &psrld  ("xmm1",4);                         # 1 = i
    &pand   ("xmm0","xmm6");                    # 0 = k
    &pshufb ("xmm5","xmm0");                    # 2 = a/k
    &movdqa ("xmm3","xmm7");                    # 3 : 1/i
    &pxor   ("xmm0","xmm1");                    # 0 = j
    &pshufb ("xmm3","xmm1");                    # 3 = 1/i
    &movdqa ("xmm4","xmm7");                    # 4 : 1/j
    &pxor   ("xmm3","xmm5");                    # 3 = iak = 1/i + a/k
    &pshufb ("xmm4","xmm0");                    # 4 = 1/j
    &movdqa ("xmm2","xmm7");                    # 2 : 1/iak
    &pxor   ("xmm4","xmm5");                    # 4 = jak = 1/j + a/k
    &pshufb ("xmm2","xmm3");                    # 2 = 1/iak
    &movdqa ("xmm3","xmm7");                    # 3 : 1/jak
    &pxor   ("xmm2","xmm0");                    # 2 = io
    &pshufb ("xmm3","xmm4");                    # 3 = 1/jak
    &movdqu ("xmm5",&QWP(0,$key));
    &pxor   ("xmm3","xmm1");                    # 3 = jo
    &jnz    (&label("enc_loop"));

    # middle of last round
    &movdqa ("xmm4",&QWP($k_sbo,$const));       # 3 : sbou      .Lk_sbo
    &movdqa ("xmm0",&QWP($k_sbo+16,$const));    # 3 : sbot      .Lk_sbo+16
    &pshufb ("xmm4","xmm2");                    # 4 = sbou
    &pxor   ("xmm4","xmm5");                    # 4 = sb1u + k
    &pshufb ("xmm0","xmm3");                    # 0 = sb1t
    &movdqa ("xmm1",&QWP(0x40,$base,$magic));   # .Lk_sr[]
    &pxor   ("xmm0","xmm4");                    # 0 = A
    &pshufb ("xmm0","xmm1");
    &ret    ();
&function_end_B("_vpaes_encrypt_core");

##
##  Decryption core
##
##  Same API as encryption core.
##
&function_begin_B("_vpaes_decrypt_core");
    # $base points at the *D* tables; the other decryption tables are
    # addressed relative to it.
    &lea    ($base,&DWP($k_dsbd,$const));
    &mov    ($round,&DWP(240,$key));
    &movdqa ("xmm1","xmm6");
    &movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base));
    &pandn  ("xmm1","xmm0");
    &mov    ($magic,$round);
    &psrld  ("xmm1",4);
    &movdqu ("xmm5",&QWP(0,$key));
    &shl    ($magic,4);
    &pand   ("xmm0","xmm6");
    &pshufb ("xmm2","xmm0");
    &movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));
    &xor    ($magic,0x30);
    &pshufb ("xmm0","xmm1");
    &and    ($magic,0x30);
    &pxor   ("xmm2","xmm5");
    &movdqa ("xmm5",&QWP($k_mc_forward+48,$const));
    &pxor   ("xmm0","xmm2");
    &add    ($key,16);
    # $magic becomes the address of the $k_sr row used after the loop
    &lea    ($magic,&DWP($k_sr-$k_dsbd,$base,$magic));
    &jmp    (&label("dec_entry"));

&set_label("dec_loop",16);
##
##  Inverse mix columns
##
    &movdqa ("xmm4",&QWP(-0x20,$base));         # 4 : sb9u
    &movdqa ("xmm1",&QWP(-0x10,$base));         # 0 : sb9t
    &pshufb ("xmm4","xmm2");                    # 4 = sb9u
    &pshufb ("xmm1","xmm3");                    # 0 = sb9t
    &pxor   ("xmm0","xmm4");
    &movdqa ("xmm4",&QWP(0,$base));             # 4 : sbdu
    &pxor   ("xmm0","xmm1");                    # 0 = ch
    &movdqa ("xmm1",&QWP(0x10,$base));          # 0 : sbdt

    &pshufb ("xmm4","xmm2");                    # 4 = sbdu
    &pshufb ("xmm0","xmm5");                    # MC ch
    &pshufb ("xmm1","xmm3");                    # 0 = sbdt
    &pxor   ("xmm0","xmm4");                    # 4 = ch
    &movdqa ("xmm4",&QWP(0x20,$base));          # 4 : sbbu
    &pxor   ("xmm0","xmm1");                    # 0 = ch
    &movdqa ("xmm1",&QWP(0x30,$base));          # 0 : sbbt

    &pshufb ("xmm4","xmm2");                    # 4 = sbbu
    &pshufb ("xmm0","xmm5");                    # MC ch
    &pshufb ("xmm1","xmm3");                    # 0 = sbbt
    &pxor   ("xmm0","xmm4");                    # 4 = ch
    &movdqa ("xmm4",&QWP(0x40,$base));          # 4 : sbeu
    &pxor   ("xmm0","xmm1");                    # 0 = ch
    &movdqa ("xmm1",&QWP(0x50,$base));          # 0 : sbet

    &pshufb ("xmm4","xmm2");                    # 4 = sbeu
    &pshufb ("xmm0","xmm5");                    # MC ch
    &pshufb ("xmm1","xmm3");                    # 0 = sbet
    &pxor   ("xmm0","xmm4");                    # 4 = ch
    &add    ($key,16);                          # next round key
    &palignr("xmm5","xmm5",12);
    &pxor   ("xmm0","xmm1");                    # 0 = ch
    &sub    ($round,1);                         # nr--

&set_label("dec_entry");
    # top of round
    # (no instruction from here to the jnz writes the flags, so the jnz
    #  tests the result of the "sub $round,1" above)
    &movdqa ("xmm1","xmm6");                    # 1 : i
    &movdqa ("xmm2",&QWP($k_inv+16,$const));    # 2 : a/k
    &pandn  ("xmm1","xmm0");                    # 1 = i<<4
    &pand   ("xmm0","xmm6");                    # 0 = k
    &psrld  ("xmm1",4);                         # 1 = i
    &pshufb ("xmm2","xmm0");                    # 2 = a/k
    &movdqa ("xmm3","xmm7");                    # 3 : 1/i
    &pxor   ("xmm0","xmm1");                    # 0 = j
    &pshufb ("xmm3","xmm1");                    # 3 = 1/i
    &movdqa ("xmm4","xmm7");                    # 4 : 1/j
    &pxor   ("xmm3","xmm2");                    # 3 = iak = 1/i + a/k
    &pshufb ("xmm4","xmm0");                    # 4 = 1/j
    &pxor   ("xmm4","xmm2");                    # 4 = jak = 1/j + a/k
    &movdqa ("xmm2","xmm7");                    # 2 : 1/iak
    &pshufb ("xmm2","xmm3");                    # 2 = 1/iak
    &movdqa ("xmm3","xmm7");                    # 3 : 1/jak
    &pxor   ("xmm2","xmm0");                    # 2 = io
    &pshufb ("xmm3","xmm4");                    # 3 = 1/jak
    &movdqu ("xmm0",&QWP(0,$key));
    &pxor   ("xmm3","xmm1");                    # 3 = jo
    &jnz    (&label("dec_loop"));

    # middle of last round
    &movdqa ("xmm4",&QWP(0x60,$base));          # 3 : sbou
    &pshufb ("xmm4","xmm2");                    # 4 = sbou
    &pxor   ("xmm4","xmm0");                    # 4 = sb1u + k
    &movdqa ("xmm0",&QWP(0x70,$base));          # 0 : sbot
    &movdqa ("xmm2",&QWP(0,$magic));
    &pshufb ("xmm0","xmm3");                    # 0 = sb1t
    &pxor   ("xmm0","xmm4");                    # 0 = A
    &pshufb ("xmm0","xmm2");
    &ret    ();
&function_end_B("_vpaes_decrypt_core");

########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################
##
##  Register inputs, as read by the code below:
##     $inp   = pointer to the user key (loaded unaligned)
##     $round = key length in bits (compared against 192)
##     $key   = where round keys are written
##     $out   = zero when encrypting, non-zero when decrypting
##     $magic = byte offset of the current row inside $k_sr
##     $const = (_vpaes_consts+0x30) - (return address of this call)
##
##  Stack: 4(%esp) and 20(%esp) are used with movdqa, i.e. the caller
##  must provide a 16-byte aligned scratch area just above the return
##  address.  The first slot holds the rcon vector (annotated "xmm8",
##  presumably the register the x86_64 original keeps it in).
##
&function_begin_B("_vpaes_schedule_core");
    &add    ($const,&DWP(0,"esp"));             # + return address
    &movdqu ("xmm0",&QWP(0,$inp));              # load key (unaligned)
    &movdqa ("xmm2",&QWP($k_rcon,$const));      # load rcon

    # input transform
    &movdqa ("xmm3","xmm0");
    &lea    ($base,&DWP($k_ipt,$const));
    &movdqa (&QWP(4,"esp"),"xmm2");             # xmm8
    &call   ("_vpaes_schedule_transform");
    &movdqa ("xmm7","xmm0");

    &test   ($out,$out);
    &jnz    (&label("schedule_am_decrypting"));

    # encrypting, output zeroth round key after transform
    &movdqu (&QWP(0,$key),"xmm0");
    &jmp    (&label("schedule_go"));

&set_label("schedule_am_decrypting");
    # decrypting, output zeroth round key after shiftrows
    &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
    &pshufb ("xmm3","xmm1");
    &movdqu (&QWP(0,$key),"xmm3");
    &xor    ($magic,0x30);

&set_label("schedule_go");
    &cmp    ($round,192);
    &ja     (&label("schedule_256"));
    &je     (&label("schedule_192"));
    # 128: fall through

##
##  .schedule_128
##
##  128-bit specific part of key schedule.
##
##  This schedule is really simple, because all its parts
##  are accomplished by the subroutines.
##
&set_label("schedule_128");
    &mov    ($round,10);

&set_label("loop_schedule_128");
    &call   ("_vpaes_schedule_round");
    &dec    ($round);
    &jz     (&label("schedule_mangle_last"));
    &call   ("_vpaes_schedule_mangle");         # write output
    &jmp    (&label("loop_schedule_128"));

##
##  .aes_schedule_192
##
##  192-bit specific part of key schedule.
##
##  The main body of this schedule is the same as the 128-bit
##  schedule, but with more smearing.  The long, high side is
##  stored in %xmm7 as before, and the short, low side is in
##  the high bits of %xmm6.
##
##  This schedule is somewhat nastier, however, because each
##  round produces 192 bits of key material, or 1.5 round keys.
##  Therefore, on each cycle we do 2 rounds and produce 3 round
##  keys.
##
&set_label("schedule_192",16);
    &movdqu ("xmm0",&QWP(8,$inp));              # load key part 2 (very unaligned)
    &call   ("_vpaes_schedule_transform");      # input transform
    &movdqa ("xmm6","xmm0");                    # save short part
    &pxor   ("xmm4","xmm4");                    # clear 4
    &movhlps("xmm6","xmm4");                    # clobber low side with zeros
    &mov    ($round,4);

&set_label("loop_schedule_192");
    &call   ("_vpaes_schedule_round");
    &palignr("xmm0","xmm6",8);
    &call   ("_vpaes_schedule_mangle");         # save key n
    &call   ("_vpaes_schedule_192_smear");
    &call   ("_vpaes_schedule_mangle");         # save key n+1
    &call   ("_vpaes_schedule_round");
    &dec    ($round);
    &jz     (&label("schedule_mangle_last"));
    &call   ("_vpaes_schedule_mangle");         # save key n+2
    &call   ("_vpaes_schedule_192_smear");
    &jmp    (&label("loop_schedule_192"));

##
##  .aes_schedule_256
##
##  256-bit specific part of key schedule.
##
##  The structure here is very similar to the 128-bit
##  schedule, but with an additional "low side" in
##  %xmm6.  The low side's rounds are the same as the
##  high side's, except no rcon and no rotation.
##
&set_label("schedule_256",16);
    &movdqu ("xmm0",&QWP(16,$inp));             # load key part 2 (unaligned)
    &call   ("_vpaes_schedule_transform");      # input transform
    &mov    ($round,7);

&set_label("loop_schedule_256");
    &call   ("_vpaes_schedule_mangle");         # output low result
    &movdqa ("xmm6","xmm0");                    # save cur_lo in xmm6

    # high round
    &call   ("_vpaes_schedule_round");
    &dec    ($round);
    &jz     (&label("schedule_mangle_last"));
    &call   ("_vpaes_schedule_mangle");

    # low round. swap xmm7 and xmm6
    &pshufd ("xmm0","xmm0",0xFF);
    &movdqa (&QWP(20,"esp"),"xmm7");            # park xmm7 in the second scratch slot
    &movdqa ("xmm7","xmm6");
    &call   ("_vpaes_schedule_low_round");
    &movdqa ("xmm7",&QWP(20,"esp"));

    &jmp    (&label("loop_schedule_256"));

##
##  .aes_schedule_mangle_last
##
##  Mangler for last round of key schedule
##  Mangles %xmm0
##    when encrypting, outputs out(%xmm0) ^ 63
##    when decrypting, outputs unskew(%xmm0)
##
##  Always called right before return... jumps to cleanup and exits
##
&set_label("schedule_mangle_last",16);
    # schedule last round key from xmm0
    &lea    ($base,&DWP($k_deskew,$const));
    &test   ($out,$out);
    &jnz    (&label("schedule_mangle_last_dec"));

    # encrypting
    &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
    &pshufb ("xmm0","xmm1");                    # output permute
    &lea    ($base,&DWP($k_opt,$const));        # prepare to output transform
    &add    ($key,32);

&set_label("schedule_mangle_last_dec");
    &add    ($key,-16);
    &pxor   ("xmm0",&QWP($k_s63,$const));
    &call   ("_vpaes_schedule_transform");      # output transform
    &movdqu (&QWP(0,$key),"xmm0");              # save last key

    # cleanup: zero every xmm register this routine used
    &pxor   ("xmm0","xmm0");
    &pxor   ("xmm1","xmm1");
    &pxor   ("xmm2","xmm2");
    &pxor   ("xmm3","xmm3");
    &pxor   ("xmm4","xmm4");
    &pxor   ("xmm5","xmm5");
    &pxor   ("xmm6","xmm6");
    &pxor   ("xmm7","xmm7");
    &ret    ();
&function_end_B("_vpaes_schedule_core");

##
##  .aes_schedule_192_smear
##
##  Smear the short, low side in the 192-bit key schedule.
##
##  Inputs:
##    %xmm7: high side, b  a  x  y
##    %xmm6:  low side, d  c  0  0
##    %xmm13: 0
##
##  Outputs:
##    %xmm6: b+c+d  b+c  0  0
##    %xmm0: b+c+d  b+c  b  a
##
##  NOTE(review): the code below zeroes %xmm1 and uses it where the
##  header says %xmm13; confirm the header is a leftover from the
##  x86_64 original.
##
&function_begin_B("_vpaes_schedule_192_smear");
    &pshufd ("xmm1","xmm6",0x80);               # d c 0 0 -> c 0 0 0
    &pshufd ("xmm0","xmm7",0xFE);               # b a _ _ -> b b b a
    &pxor   ("xmm6","xmm1");                    # -> c+d c 0 0
    &pxor   ("xmm1","xmm1");
    &pxor   ("xmm6","xmm0");                    # -> b+c+d b+c b a
    &movdqa ("xmm0","xmm6");
    &movhlps("xmm6","xmm1");                    # clobber low side with zeros
    &ret    ();
&function_end_B("_vpaes_schedule_192_smear");

##
##  .aes_schedule_round
##
##  Runs one main round of the key schedule on %xmm0, %xmm7
##
##  Specifically, runs subbytes on the high dword of %xmm0
##  then rotates it by one byte and xors into the low dword of
##  %xmm7.
##
##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
##  next rcon.
##
##  Smears the dwords of %xmm7 by xoring the low into the
##  second low, result into third, result into highest.
##
##  Returns results in %xmm7 = %xmm0.
##  Clobbers %xmm1-%xmm5.
##
##  In this 32-bit port the "%xmm8" value is kept on the stack at
##  8(%esp) as seen from here, which is the same location that
##  _vpaes_schedule_core addresses as 4(%esp) one call level up.
##
&function_begin_B("_vpaes_schedule_round");
    # extract rcon from xmm8
    &movdqa ("xmm2",&QWP(8,"esp"));             # xmm8
    &pxor   ("xmm1","xmm1");
    &palignr("xmm1","xmm2",15);
    &palignr("xmm2","xmm2",15);
    &pxor   ("xmm7","xmm1");

    # rotate
    &pshufd ("xmm0","xmm0",0xFF);
    &palignr("xmm0","xmm0",1);

    # fall through...
    &movdqa (&QWP(8,"esp"),"xmm2");             # xmm8

    # low round: same as high round, but no rotation and no rcon.
&set_label("_vpaes_schedule_low_round");
    # smear xmm7
    &movdqa ("xmm1","xmm7");
    &pslldq ("xmm7",4);
    &pxor   ("xmm7","xmm1");
    &movdqa ("xmm1","xmm7");
    &pslldq ("xmm7",8);
    &pxor   ("xmm7","xmm1");
    &pxor   ("xmm7",&QWP($k_s63,$const));

    # subbyte
    &movdqa ("xmm4",&QWP($k_s0F,$const));
    &movdqa ("xmm5",&QWP($k_inv,$const));       # 4 : 1/j
    &movdqa ("xmm1","xmm4");
    &pandn  ("xmm1","xmm0");
    &psrld  ("xmm1",4);                         # 1 = i
    &pand   ("xmm0","xmm4");                    # 0 = k
    &movdqa ("xmm2",&QWP($k_inv+16,$const));    # 2 : a/k
    &pshufb ("xmm2","xmm0");                    # 2 = a/k
    &pxor   ("xmm0","xmm1");                    # 0 = j
    &movdqa ("xmm3","xmm5");                    # 3 : 1/i
    &pshufb ("xmm3","xmm1");                    # 3 = 1/i
    &pxor   ("xmm3","xmm2");                    # 3 = iak = 1/i + a/k
    &movdqa ("xmm4","xmm5");                    # 4 : 1/j
    &pshufb ("xmm4","xmm0");                    # 4 = 1/j
    &pxor   ("xmm4","xmm2");                    # 4 = jak = 1/j + a/k
    &movdqa ("xmm2","xmm5");                    # 2 : 1/iak
    &pshufb ("xmm2","xmm3");                    # 2 = 1/iak
    &pxor   ("xmm2","xmm0");                    # 2 = io
    &movdqa ("xmm3","xmm5");                    # 3 : 1/jak
    &pshufb ("xmm3","xmm4");                    # 3 = 1/jak
    &pxor   ("xmm3","xmm1");                    # 3 = jo
    &movdqa ("xmm4",&QWP($k_sb1,$const));       # 4 : sbou
    &pshufb ("xmm4","xmm2");                    # 4 = sbou
    &movdqa ("xmm0",&QWP($k_sb1+16,$const));    # 0 : sbot
    &pshufb ("xmm0","xmm3");                    # 0 = sb1t
    &pxor   ("xmm0","xmm4");                    # 0 = sbox output

    # add in smeared stuff
    &pxor   ("xmm0","xmm7");
    &movdqa ("xmm7","xmm0");
    &ret    ();
&function_end_B("_vpaes_schedule_round");

##
##  .aes_schedule_transform
##
##  Linear-transform %xmm0 according to tables at (%ebx)
##
##  Output in %xmm0
##  Clobbers %xmm1, %xmm2
##
&function_begin_B("_vpaes_schedule_transform");
    &movdqa ("xmm2",&QWP($k_s0F,$const));
    &movdqa ("xmm1","xmm2");
    &pandn  ("xmm1","xmm0");
    &psrld  ("xmm1",4);
    &pand   ("xmm0","xmm2");
    &movdqa ("xmm2",&QWP(0,$base));
    &pshufb ("xmm2","xmm0");
    &movdqa ("xmm0",&QWP(16,$base));
    &pshufb ("xmm0","xmm1");
    &pxor   ("xmm0","xmm2");
    &ret    ();
&function_end_B("_vpaes_schedule_transform");

##
##  .aes_schedule_mangle
##
##  Mangle xmm0 from (basis-transformed) standard version
##  to our version.
##
##  On encrypt,
##    xor with 0x63
##    multiply by circulant 0,1,1,1
##    apply shiftrows transform
##
##  On decrypt,
##    xor with 0x63
##    multiply by "inverse mixcolumns" circulant E,B,D,9
##    deskew
##    apply shiftrows transform
##
##
##  Writes out to (%edx), and increments or decrements it
##  Keeps track of round number mod 4 in %ecx
##  Preserves xmm0
##  Clobbers xmm1-xmm5
##
##  The decrypt path also overwrites $inp, using it as a pointer to
##  the $k_dksd tables.
##
&function_begin_B("_vpaes_schedule_mangle");
    &movdqa ("xmm4","xmm0");                    # save xmm0 for later
    &movdqa ("xmm5",&QWP($k_mc_forward,$const));
    &test   ($out,$out);
    &jnz    (&label("schedule_mangle_dec"));

    # encrypting
    &add    ($key,16);
    &pxor   ("xmm4",&QWP($k_s63,$const));
    &pshufb ("xmm4","xmm5");
    &movdqa ("xmm3","xmm4");
    &pshufb ("xmm4","xmm5");
    &pxor   ("xmm3","xmm4");
    &pshufb ("xmm4","xmm5");
    &pxor   ("xmm3","xmm4");

    &jmp    (&label("schedule_mangle_both"));

&set_label("schedule_mangle_dec",16);
    # inverse mix columns
    &movdqa ("xmm2",&QWP($k_s0F,$const));
    &lea    ($inp,&DWP($k_dksd,$const));
    &movdqa ("xmm1","xmm2");
    &pandn  ("xmm1","xmm4");
    &psrld  ("xmm1",4);                         # 1 = hi
    &pand   ("xmm4","xmm2");                    # 4 = lo

    &movdqa ("xmm2",&QWP(0,$inp));
    &pshufb ("xmm2","xmm4");
    &movdqa ("xmm3",&QWP(0x10,$inp));
    &pshufb ("xmm3","xmm1");
    &pxor   ("xmm3","xmm2");
    &pshufb ("xmm3","xmm5");

    &movdqa ("xmm2",&QWP(0x20,$inp));
    &pshufb ("xmm2","xmm4");
    &pxor   ("xmm2","xmm3");
    &movdqa ("xmm3",&QWP(0x30,$inp));
    &pshufb ("xmm3","xmm1");
    &pxor   ("xmm3","xmm2");
    &pshufb ("xmm3","xmm5");

    &movdqa ("xmm2",&QWP(0x40,$inp));
    &pshufb ("xmm2","xmm4");
    &pxor   ("xmm2","xmm3");
    &movdqa ("xmm3",&QWP(0x50,$inp));
    &pshufb ("xmm3","xmm1");
    &pxor   ("xmm3","xmm2");
    &pshufb ("xmm3","xmm5");

    &movdqa ("xmm2",&QWP(0x60,$inp));
    &pshufb ("xmm2","xmm4");
    &pxor   ("xmm2","xmm3");
    &movdqa ("xmm3",&QWP(0x70,$inp));
    &pshufb ("xmm3","xmm1");
    &pxor   ("xmm3","xmm2");

    &add    ($key,-16);

&set_label("schedule_mangle_both");
    &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
    &pshufb ("xmm3","xmm1");
    &add    ($magic,-16);
    &and    ($magic,0x30);
    &movdqu (&QWP(0,$key),"xmm3");
    &ret    ();
&function_end_B("_vpaes_schedule_mangle");

#
# Interface to OpenSSL
#
# Every entry point below builds the same scratch frame: %esp is moved
# down by at least 56 bytes and rounded down to a 16-byte boundary, and
# the previous %esp is stored at 48(%esp) so it can be restored before
# returning.
#

# ${PREFIX}_set_encrypt_key(inp, bits, key)
# Expands the user key at inp into an encryption schedule at key,
# stores bits/32+5 at offset 240 of key, and returns 0 in %eax.
&function_begin("${PREFIX}_set_encrypt_key");
    &mov    ($inp,&wparam(0));                  # inp
    &lea    ($base,&DWP(-56,"esp"));
    &mov    ($round,&wparam(1));                # bits
    &and    ($base,-16);
    &mov    ($key,&wparam(2));                  # key
    &xchg   ($base,"esp");                      # alloca
    &mov    (&DWP(48,"esp"),$base);

    &mov    ($base,$round);
    &shr    ($base,5);
    &add    ($base,5);
    &mov    (&DWP(240,$key),$base);             # AES_KEY->rounds = nbits/32+5;
    &mov    ($magic,0x30);
    &mov    ($out,0);                           # 0 = encrypting

    &lea    ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
    &call   ("_vpaes_schedule_core");
&set_label("pic_point");

    &mov    ("esp",&DWP(48,"esp"));
    &xor    ("eax","eax");
&function_end("${PREFIX}_set_encrypt_key");

# ${PREFIX}_set_decrypt_key(inp, bits, key)
# Same as above for decryption: $out is set to 1 and $key starts at
# key + 16 + 16*rounds, with the schedule routines stepping it
# downwards.  Returns 0 in %eax.
&function_begin("${PREFIX}_set_decrypt_key");
    &mov    ($inp,&wparam(0));                  # inp
    &lea    ($base,&DWP(-56,"esp"));
    &mov    ($round,&wparam(1));                # bits
    &and    ($base,-16);
    &mov    ($key,&wparam(2));                  # key
    &xchg   ($base,"esp");                      # alloca
    &mov    (&DWP(48,"esp"),$base);

    &mov    ($base,$round);
    &shr    ($base,5);
    &add    ($base,5);
    &mov    (&DWP(240,$key),$base);             # AES_KEY->rounds = nbits/32+5;
    &shl    ($base,4);
    &lea    ($key,&DWP(16,$key,$base));

    &mov    ($out,1);                           # non-zero = decrypting
    &mov    ($magic,$round);
    &shr    ($magic,1);
    &and    ($magic,32);
    &xor    ($magic,32);                        # nbits==192?0:32;

    &lea    ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
    &call   ("_vpaes_schedule_core");
&set_label("pic_point");

    &mov    ("esp",&DWP(48,"esp"));
    &xor    ("eax","eax");
&function_end("${PREFIX}_set_decrypt_key");

# ${PREFIX}_encrypt(inp, out, key)
# Encrypts the 16 bytes at inp into out (both accessed unaligned).
&function_begin("${PREFIX}_encrypt");
    &lea    ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
    &call   ("_vpaes_preheat");
&set_label("pic_point");
    &mov    ($inp,&wparam(0));                  # inp
    &lea    ($base,&DWP(-56,"esp"));
    &mov    ($out,&wparam(1));                  # out
    &and    ($base,-16);
    &mov    ($key,&wparam(2));                  # key
    &xchg   ($base,"esp");                      # alloca
    &mov    (&DWP(48,"esp"),$base);

    &movdqu ("xmm0",&QWP(0,$inp));
    &call   ("_vpaes_encrypt_core");
    &movdqu (&QWP(0,$out),"xmm0");

    &mov    ("esp",&DWP(48,"esp"));
&function_end("${PREFIX}_encrypt");

# ${PREFIX}_decrypt(inp, out, key)
# Decrypts the 16 bytes at inp into out (both accessed unaligned).
&function_begin("${PREFIX}_decrypt");
    &lea    ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
    &call   ("_vpaes_preheat");
&set_label("pic_point");
    &mov    ($inp,&wparam(0));                  # inp
    &lea    ($base,&DWP(-56,"esp"));
    &mov    ($out,&wparam(1));                  # out
    &and    ($base,-16);
    &mov    ($key,&wparam(2));                  # key
    &xchg   ($base,"esp");                      # alloca
    &mov    (&DWP(48,"esp"),$base);

    &movdqu ("xmm0",&QWP(0,$inp));
    &call   ("_vpaes_decrypt_core");
    &movdqu (&QWP(0,$out),"xmm0");

    &mov    ("esp",&DWP(48,"esp"));
&function_end("${PREFIX}_decrypt");

# ${PREFIX}_cbc_encrypt(inp, out, len, key, ivp, enc)
# CBC-processes whole 16-byte blocks; enc==0 selects decryption.
# Does nothing at all (IV untouched) when len < 16, and never touches a
# trailing partial block.  The final chaining value is written back to
# ivp.
#
# Scratch frame layout:
#    0(%esp)  out - inp, so the output address is $base+$inp
#    4(%esp)  key pointer (the cores advance $key)
#    8(%esp)  ivp
#   16(%esp)  current IV   (decrypt only)
#   32(%esp)  next IV      (decrypt only)
#   48(%esp)  caller's %esp
&function_begin("${PREFIX}_cbc_encrypt");
    &mov    ($inp,&wparam(0));                  # inp
    &mov    ($out,&wparam(1));                  # out
    &mov    ($round,&wparam(2));                # len
    &mov    ($key,&wparam(3));                  # key
    &sub    ($round,16);
    &jc     (&label("cbc_abort"));
    &lea    ($base,&DWP(-56,"esp"));
    &mov    ($const,&wparam(4));                # ivp
    &and    ($base,-16);
    &mov    ($magic,&wparam(5));                # enc
    &xchg   ($base,"esp");                      # alloca
    &movdqu ("xmm1",&QWP(0,$const));            # load IV
    &sub    ($out,$inp);
    &mov    (&DWP(48,"esp"),$base);

    &mov    (&DWP(0,"esp"),$out);               # save out
    &mov    (&DWP(4,"esp"),$key);               # save key
    &mov    (&DWP(8,"esp"),$const);             # save ivp
    &mov    ($out,$round);                      # $out works as $len

    &lea    ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
    &call   ("_vpaes_preheat");
&set_label("pic_point");
    &cmp    ($magic,0);
    &je     (&label("cbc_dec_loop"));
    &jmp    (&label("cbc_enc_loop"));

&set_label("cbc_enc_loop",16);
    &movdqu ("xmm0",&QWP(0,$inp));              # load input
    &pxor   ("xmm0","xmm1");                    # inp^=iv
    &call   ("_vpaes_encrypt_core");
    &mov    ($base,&DWP(0,"esp"));              # restore out
    &mov    ($key,&DWP(4,"esp"));               # restore key
    &movdqa ("xmm1","xmm0");
    &movdqu (&QWP(0,$base,$inp),"xmm0");        # write output
    &lea    ($inp,&DWP(16,$inp));
    &sub    ($out,16);
    &jnc    (&label("cbc_enc_loop"));
    &jmp    (&label("cbc_done"));

&set_label("cbc_dec_loop",16);
    &movdqu ("xmm0",&QWP(0,$inp));              # load input
    &movdqa (&QWP(16,"esp"),"xmm1");            # save IV
    &movdqa (&QWP(32,"esp"),"xmm0");            # save future IV
    &call   ("_vpaes_decrypt_core");
    &mov    ($base,&DWP(0,"esp"));              # restore out
    &mov    ($key,&DWP(4,"esp"));               # restore key
    &pxor   ("xmm0",&QWP(16,"esp"));            # out^=iv
    &movdqa ("xmm1",&QWP(32,"esp"));            # load next IV
    &movdqu (&QWP(0,$base,$inp),"xmm0");        # write output
    &lea    ($inp,&DWP(16,$inp));
    &sub    ($out,16);
    &jnc    (&label("cbc_dec_loop"));

&set_label("cbc_done");
    &mov    ($base,&DWP(8,"esp"));              # restore ivp
    &mov    ("esp",&DWP(48,"esp"));
    &movdqu (&QWP(0,$base),"xmm1");             # write IV
&set_label("cbc_abort");
&function_end("${PREFIX}_cbc_encrypt");

&asm_finish();