#!/usr/bin/env perl

######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.

######################################################################
# September 2011.
#
# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
# doesn't handle partial vectors (doesn't have to if called from
# EVP only). "Drop-in" implies that this module doesn't share key
# schedule structure with the original nor does it make assumption
# about its alignment...
#
# Performance summary. aes-586.pl column lists large-block CBC
# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
# byte processed with 128-bit key, and vpaes-x86.pl column - [also
# large-block CBC] encrypt/decrypt.
#
#               aes-586.pl              vpaes-x86.pl
#
# Core 2(**)    28.1/41.4/18.3          21.9/25.2(***)
# Nehalem       27.9/40.4/18.1          10.2/11.9
# Atom          70.7/92.1/60.1          61.1/75.4(***)
# Silvermont    45.4/62.9/24.1          49.2/61.1(***)
#
# (*)   "Hyper-threading" in the context refers rather to cache shared
#       among multiple cores, than to specifically Intel HTT. As vast
#       majority of contemporary cores share cache, slower code path
#       is common place. In other words "with-hyper-threading-off"
#       results are presented mostly for reference purposes.
#
# (**)  "Core 2" refers to initial 65nm design, a.k.a. Conroe.
#
# (***) Less impressive improvement on Core 2 and Atom is due to slow
#       pshufb, yet it's respectable +28%/64% improvement on Core 2
#       and +15% on Atom (as implied, over "hyper-threading-safe"
#       code path).
#
#						<appro@openssl.org>

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file.  STDOUT is
# aliased to it, so anything printed to STDOUT lands in that file.
# Fail loudly instead of silently discarding the generated code when
# the file cannot be created.
$output = pop;
defined($output) or die "$0: no output file specified\n";
open OUT,">",$output or die "can't open $output: $!";
*STDOUT=*OUT;

&asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");

$PREFIX="vpaes";

# Symbolic names for the general-purpose registers used throughout.
my  ($round, $base, $magic, $key, $const, $inp, $out)=
    ("eax",  "ebx", "ecx",  "edx","ebp",  "esi","edi");

&static_label("_vpaes_consts");
&static_label("_vpaes_schedule_low_round");

##
##  Constant tables.
##
##  Every $k_* value below is a byte offset relative to
##  _vpaes_consts+0x30, which is the address the public entry points
##  arrange to have in $const.  Hence the first two tables sit at
##  negative offsets.
##
&set_label("_vpaes_consts",64);
$k_inv=-0x30;		# inv, inva
	&data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
	&data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);

$k_s0F=-0x10;		# s0F
	&data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);

$k_ipt=0x00;		# input transform (lo, hi)
	&data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
	&data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);

$k_sb1=0x20;		# sb1u, sb1t
	&data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
	&data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
$k_sb2=0x40;		# sb2u, sb2t
	&data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
	&data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
$k_sbo=0x60;		# sbou, sbot
	&data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
	&data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);

$k_mc_forward=0x80;	# mc_forward
	&data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
	&data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
	&data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
	&data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);

$k_mc_backward=0xc0;	# mc_backward
	&data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
	&data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
	&data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
	&data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);

$k_sr=0x100;		# sr
	&data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
	&data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
	&data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
	&data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);

$k_rcon=0x140;		# rcon
	&data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);

$k_s63=0x150;		# s63: all equal to 0x63 transformed
	&data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);

$k_opt=0x160;		# output transform
	&data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
	&data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);

$k_deskew=0x180;	# deskew tables: inverts the sbox's "skew"
	&data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
	&data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
##
##  Decryption stuff
##  Key schedule constants
##
$k_dksd=0x1a0;		# decryption key schedule: invskew x*D
	&data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
	&data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
$k_dksb=0x1c0;		# decryption key schedule: invskew x*B
	&data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
	&data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
$k_dkse=0x1e0;		# decryption key schedule: invskew x*E + 0x63
	&data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
	&data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
$k_dks9=0x200;		# decryption key schedule: invskew x*9
	&data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
	&data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);

##
##  Decryption stuff
##  Round function constants
##
$k_dipt=0x220;		# decryption input transform
	&data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
	&data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);

$k_dsb9=0x240;		# decryption sbox output *9*u, *9*t
	&data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
	&data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
$k_dsbd=0x260;		# decryption sbox output *D*u, *D*t
	&data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
	&data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
$k_dsbb=0x280;		# decryption sbox output *B*u, *B*t
	&data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
	&data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
$k_dsbe=0x2a0;		# decryption sbox output *E*u, *E*t
	&data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
	&data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
$k_dsbo=0x2c0;		# decryption sbox final output
	&data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
	&data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
&asciz	("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
&align	(64);

##
##  _vpaes_preheat
##
##  On entry $const holds (_vpaes_consts+0x30 - pic_point), where
##  pic_point is the label placed right after the call to this
##  routine.  Adding the return address found at (%esp) therefore
##  yields the run-time address that the $k_* offsets are based on.
##
##  Also loads the two constants the cipher cores keep in registers:
##     %xmm7 = $k_inv
##     %xmm6 = $k_s0F
##
&function_begin_B("_vpaes_preheat");
	&add	($const,&DWP(0,"esp"));
	&movdqa	("xmm7",&QWP($k_inv,$const));
	&movdqa	("xmm6",&QWP($k_s0F,$const));
	&ret	();
&function_end_B("_vpaes_preheat");

##
##  _aes_encrypt_core
##
##  AES-encrypt %xmm0.
##
##  Inputs:
##     %xmm0 = input
##     %xmm6-%xmm7 as in _vpaes_preheat
##    (%edx) = scheduled keys
##
##  Output in %xmm0
##  Clobbers  %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
##
##
&function_begin_B("_vpaes_encrypt_core");
	&mov	($magic,16);
	&mov	($round,&DWP(240,$key));	# round count stored at offset 240 of the schedule
	&movdqa	("xmm1","xmm6");
	&movdqa	("xmm2",&QWP($k_ipt,$const));
	&pandn	("xmm1","xmm0");
	&pand	("xmm0","xmm6");
	&movdqu	("xmm5",&QWP(0,$key));
	&pshufb	("xmm2","xmm0");
	&movdqa	("xmm0",&QWP($k_ipt+16,$const));
	&pxor	("xmm2","xmm5");
	&psrld	("xmm1",4);
	&add	($key,16);
	&pshufb	("xmm0","xmm1");
	&lea	($base,&DWP($k_mc_backward,$const));
	&pxor	("xmm0","xmm2");
	&jmp	(&label("enc_entry"));


&set_label("enc_loop",16);
	# middle of middle round
	&movdqa	("xmm4",&QWP($k_sb1,$const));	# 4 : sb1u
	&movdqa	("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
	&pshufb	("xmm4","xmm2");		# 4 = sb1u
	&pshufb	("xmm0","xmm3");		# 0 = sb1t
	&pxor	("xmm4","xmm5");		# 4 = sb1u + k
	&movdqa	("xmm5",&QWP($k_sb2,$const));	# 4 : sb2u
	&pxor	("xmm0","xmm4");		# 0 = A
	&movdqa	("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
	&pshufb	("xmm5","xmm2");		# 4 = sb2u
	&movdqa	("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
	&movdqa	("xmm4",&QWP(0,$base,$magic));	# .Lk_mc_backward[]
	&pshufb	("xmm2","xmm3");		# 2 = sb2t
	&movdqa	("xmm3","xmm0");		# 3 = A
	&pxor	("xmm2","xmm5");		# 2 = 2A
	&pshufb	("xmm0","xmm1");		# 0 = B
	&add	($key,16);			# next key
	&pxor	("xmm0","xmm2");		# 0 = 2A+B
	&pshufb	("xmm3","xmm4");		# 3 = D
	&add	($magic,16);			# next mc
	&pxor	("xmm3","xmm0");		# 3 = 2A+B+D
	&pshufb	("xmm0","xmm1");		# 0 = 2B+C
	&and	($magic,0x30);			# ... mod 4
	&sub	($round,1);			# nr--
	&pxor	("xmm0","xmm3");		# 0 = 2A+3B+C+D

&set_label("enc_entry");
	# top of round
	&movdqa	("xmm1","xmm6");		# 1 : i
	&movdqa	("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
	&pandn	("xmm1","xmm0");		# 1 = i<<4
	&psrld	("xmm1",4);			# 1 = i
	&pand	("xmm0","xmm6");		# 0 = k
	&pshufb	("xmm5","xmm0");		# 2 = a/k
	&movdqa	("xmm3","xmm7");		# 3 : 1/i
	&pxor	("xmm0","xmm1");		# 0 = j
	&pshufb	("xmm3","xmm1");		# 3 = 1/i
	&movdqa	("xmm4","xmm7");		# 4 : 1/j
	&pxor	("xmm3","xmm5");		# 3 = iak = 1/i + a/k
	&pshufb	("xmm4","xmm0");		# 4 = 1/j
	&movdqa	("xmm2","xmm7");		# 2 : 1/iak
	&pxor	("xmm4","xmm5");		# 4 = jak = 1/j + a/k
	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
	&movdqa	("xmm3","xmm7");		# 3 : 1/jak
	&pxor	("xmm2","xmm0");		# 2 = io
	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
	&movdqu	("xmm5",&QWP(0,$key));
	&pxor	("xmm3","xmm1");		# 3 = jo
	# The flags tested here were set by the last integer add/sub
	# above; the SSE instructions in between do not modify EFLAGS.
	&jnz	(&label("enc_loop"));

	# middle of last round
	&movdqa	("xmm4",&QWP($k_sbo,$const));	# 3 : sbou	.Lk_sbo
	&movdqa	("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot	.Lk_sbo+16
	&pshufb	("xmm4","xmm2");		# 4 = sbou
	&pxor	("xmm4","xmm5");		# 4 = sb1u + k
	&pshufb	("xmm0","xmm3");		# 0 = sb1t
	&movdqa	("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
	&pxor	("xmm0","xmm4");		# 0 = A
	&pshufb	("xmm0","xmm1");
	&ret	();
&function_end_B("_vpaes_encrypt_core");

##
##  Decryption core
##
##  Same API as encryption core.
##
&function_begin_B("_vpaes_decrypt_core");
	&lea	($base,&DWP($k_dsbd,$const));
	&mov	($round,&DWP(240,$key));	# round count stored at offset 240 of the schedule
	&movdqa	("xmm1","xmm6");
	&movdqa	("xmm2",&QWP($k_dipt-$k_dsbd,$base));
	&pandn	("xmm1","xmm0");
	&mov	($magic,$round);
	&psrld	("xmm1",4);
	&movdqu	("xmm5",&QWP(0,$key));
	&shl	($magic,4);
	&pand	("xmm0","xmm6");
	&pshufb	("xmm2","xmm0");
	&movdqa	("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));
	&xor	($magic,0x30);
	&pshufb	("xmm0","xmm1");
	&and	($magic,0x30);
	&pxor	("xmm2","xmm5");
	&movdqa	("xmm5",&QWP($k_mc_forward+48,$const));
	&pxor	("xmm0","xmm2");
	&add	($key,16);
	&lea	($magic,&DWP($k_sr-$k_dsbd,$base,$magic));
	&jmp	(&label("dec_entry"));

&set_label("dec_loop",16);
##
##  Inverse mix columns
##
	&movdqa	("xmm4",&QWP(-0x20,$base));	# 4 : sb9u
	&movdqa	("xmm1",&QWP(-0x10,$base));	# 0 : sb9t
	&pshufb	("xmm4","xmm2");		# 4 = sb9u
	&pshufb	("xmm1","xmm3");		# 0 = sb9t
	&pxor	("xmm0","xmm4");
	&movdqa	("xmm4",&QWP(0,$base));		# 4 : sbdu
	&pxor	("xmm0","xmm1");		# 0 = ch
	&movdqa	("xmm1",&QWP(0x10,$base));	# 0 : sbdt

	&pshufb	("xmm4","xmm2");		# 4 = sbdu
	&pshufb	("xmm0","xmm5");		# MC ch
	&pshufb	("xmm1","xmm3");		# 0 = sbdt
	&pxor	("xmm0","xmm4");		# 4 = ch
	&movdqa	("xmm4",&QWP(0x20,$base));	# 4 : sbbu
	&pxor	("xmm0","xmm1");		# 0 = ch
	&movdqa	("xmm1",&QWP(0x30,$base));	# 0 : sbbt

	&pshufb	("xmm4","xmm2");		# 4 = sbbu
	&pshufb	("xmm0","xmm5");		# MC ch
	&pshufb	("xmm1","xmm3");		# 0 = sbbt
	&pxor	("xmm0","xmm4");		# 4 = ch
	&movdqa	("xmm4",&QWP(0x40,$base));	# 4 : sbeu
	&pxor	("xmm0","xmm1");		# 0 = ch
	&movdqa	("xmm1",&QWP(0x50,$base));	# 0 : sbet

	&pshufb	("xmm4","xmm2");		# 4 = sbeu
	&pshufb	("xmm0","xmm5");		# MC ch
	&pshufb	("xmm1","xmm3");		# 0 = sbet
	&pxor	("xmm0","xmm4");		# 4 = ch
	&add	($key,16);			# next round key
	&palignr("xmm5","xmm5",12);
	&pxor	("xmm0","xmm1");		# 0 = ch
	&sub	($round,1);			# nr--

&set_label("dec_entry");
	# top of round
	&movdqa	("xmm1","xmm6");		# 1 : i
	&movdqa	("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
	&pandn	("xmm1","xmm0");		# 1 = i<<4
	&pand	("xmm0","xmm6");		# 0 = k
	&psrld	("xmm1",4);			# 1 = i
	&pshufb	("xmm2","xmm0");		# 2 = a/k
	&movdqa	("xmm3","xmm7");		# 3 : 1/i
	&pxor	("xmm0","xmm1");		# 0 = j
	&pshufb	("xmm3","xmm1");		# 3 = 1/i
	&movdqa	("xmm4","xmm7");		# 4 : 1/j
	&pxor	("xmm3","xmm2");		# 3 = iak = 1/i + a/k
	&pshufb	("xmm4","xmm0");		# 4 = 1/j
	&pxor	("xmm4","xmm2");		# 4 = jak = 1/j + a/k
	&movdqa	("xmm2","xmm7");		# 2 : 1/iak
	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
	&movdqa	("xmm3","xmm7");		# 3 : 1/jak
	&pxor	("xmm2","xmm0");		# 2 = io
	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
	&movdqu	("xmm0",&QWP(0,$key));
	&pxor	("xmm3","xmm1");		# 3 = jo
	# The flags tested here were set by the last integer add/sub
	# above; the SSE instructions in between do not modify EFLAGS.
	&jnz	(&label("dec_loop"));

	# middle of last round
	&movdqa	("xmm4",&QWP(0x60,$base));	# 3 : sbou
	&pshufb	("xmm4","xmm2");		# 4 = sbou
	&pxor	("xmm4","xmm0");		# 4 = sb1u + k
	&movdqa	("xmm0",&QWP(0x70,$base));	# 0 : sbot
	&movdqa	("xmm2",&QWP(0,$magic));
	&pshufb	("xmm0","xmm3");		# 0 = sb1t
	&pxor	("xmm0","xmm4");		# 0 = A
	&pshufb	("xmm0","xmm2");
	&ret	();
&function_end_B("_vpaes_decrypt_core");

########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################
##
##  _vpaes_schedule_core
##
##  Register usage on entry, as set up by the set_*_key wrappers:
##     %esi ($inp)   = user key
##     %eax ($round) = key length in bits
##     %edx ($key)   = where the first round key is written
##     %edi ($out)   = zero when encrypting, non-zero when decrypting
##     %ecx ($magic) = offset into the $k_sr table
##     %ebp ($const) = PIC offset, made absolute here by adding the
##                     return address found at (%esp)
##
##  The caller provides a 16-byte aligned scratch area just above the
##  return address.  4(%esp) here is the same slot that the
##  subroutines, one call deeper, address as 8(%esp).  It stands in
##  for the %xmm8 of the x86_64 code and carries rcon.
##
&function_begin_B("_vpaes_schedule_core");
	&add	($const,&DWP(0,"esp"));
	&movdqu	("xmm0",&QWP(0,$inp));		# load key (unaligned)
	&movdqa	("xmm2",&QWP($k_rcon,$const));	# load rcon

	# input transform
	&movdqa	("xmm3","xmm0");
	&lea	($base,&DWP($k_ipt,$const));
	&movdqa	(&QWP(4,"esp"),"xmm2");		# xmm8
	&call	("_vpaes_schedule_transform");
	&movdqa	("xmm7","xmm0");

	&test	($out,$out);
	&jnz	(&label("schedule_am_decrypting"));

	# encrypting, output zeroth round key after transform
	&movdqu	(&QWP(0,$key),"xmm0");
	&jmp	(&label("schedule_go"));

&set_label("schedule_am_decrypting");
	# decrypting, output zeroth round key after shiftrows
	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));
	&pshufb	("xmm3","xmm1");
	&movdqu	(&QWP(0,$key),"xmm3");
	&xor	($magic,0x30);

&set_label("schedule_go");
	&cmp	($round,192);
	&ja	(&label("schedule_256"));
	&je	(&label("schedule_192"));
	# 128: fall through

##
##  .schedule_128
##
##  128-bit specific part of key schedule.
##
##  This schedule is really simple, because all its parts
##  are accomplished by the subroutines.
##
&set_label("schedule_128");
	&mov	($round,10);

&set_label("loop_schedule_128");
	&call	("_vpaes_schedule_round");
	&dec	($round);
	&jz	(&label("schedule_mangle_last"));
	&call	("_vpaes_schedule_mangle");	# write output
	&jmp	(&label("loop_schedule_128"));

##
##  .aes_schedule_192
##
##  192-bit specific part of key schedule.
##
##  The main body of this schedule is the same as the 128-bit
##  schedule, but with more smearing.  The long, high side is
##  stored in %xmm7 as before, and the short, low side is in
##  the high bits of %xmm6.
##
##  This schedule is somewhat nastier, however, because each
##  round produces 192 bits of key material, or 1.5 round keys.
##  Therefore, on each cycle we do 2 rounds and produce 3 round
##  keys.
##
&set_label("schedule_192",16);
	&movdqu	("xmm0",&QWP(8,$inp));		# load key part 2 (very unaligned)
	&call	("_vpaes_schedule_transform");	# input transform
	&movdqa	("xmm6","xmm0");		# save short part
	&pxor	("xmm4","xmm4");		# clear 4
	&movhlps("xmm6","xmm4");		# clobber low side with zeros
	&mov	($round,4);

&set_label("loop_schedule_192");
	&call	("_vpaes_schedule_round");
	&palignr("xmm0","xmm6",8);
	&call	("_vpaes_schedule_mangle");	# save key n
	&call	("_vpaes_schedule_192_smear");
	&call	("_vpaes_schedule_mangle");	# save key n+1
	&call	("_vpaes_schedule_round");
	&dec	($round);
	&jz	(&label("schedule_mangle_last"));
	&call	("_vpaes_schedule_mangle");	# save key n+2
	&call	("_vpaes_schedule_192_smear");
	&jmp	(&label("loop_schedule_192"));

##
##  .aes_schedule_256
##
##  256-bit specific part of key schedule.
##
##  The structure here is very similar to the 128-bit
##  schedule, but with an additional "low side" in
##  %xmm6.  The low side's rounds are the same as the
##  high side's, except no rcon and no rotation.
##
&set_label("schedule_256",16);
	&movdqu	("xmm0",&QWP(16,$inp));		# load key part 2 (unaligned)
	&call	("_vpaes_schedule_transform");	# input transform
	&mov	($round,7);

&set_label("loop_schedule_256");
	&call	("_vpaes_schedule_mangle");	# output low result
	&movdqa	("xmm6","xmm0");		# save cur_lo in xmm6

	# high round
	&call	("_vpaes_schedule_round");
	&dec	($round);
	&jz	(&label("schedule_mangle_last"));
	&call	("_vpaes_schedule_mangle");

	# low round. swap xmm7 and xmm6
	&pshufd	("xmm0","xmm0",0xFF);
	&movdqa	(&QWP(20,"esp"),"xmm7");	# park xmm7 in the second scratch slot
	&movdqa	("xmm7","xmm6");
	&call	("_vpaes_schedule_low_round");
	&movdqa	("xmm7",&QWP(20,"esp"));

	&jmp	(&label("loop_schedule_256"));

##
##  .aes_schedule_mangle_last
##
##  Mangler for last round of key schedule
##  Mangles %xmm0
##    when encrypting, outputs out(%xmm0) ^ 63
##    when decrypting, outputs unskew(%xmm0)
##
##  Always called right before return... jumps to cleanup and exits
##
&set_label("schedule_mangle_last",16);
	# schedule last round key from xmm0
	&lea	($base,&DWP($k_deskew,$const));
	&test	($out,$out);
	&jnz	(&label("schedule_mangle_last_dec"));

	# encrypting
	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));
	&pshufb	("xmm0","xmm1");		# output permute
	&lea	($base,&DWP($k_opt,$const));	# prepare to output transform
	&add	($key,32);

&set_label("schedule_mangle_last_dec");
	&add	($key,-16);
	&pxor	("xmm0",&QWP($k_s63,$const));
	&call	("_vpaes_schedule_transform");	# output transform
	&movdqu	(&QWP(0,$key),"xmm0");		# save last key

	# cleanup
	&pxor	("xmm0","xmm0");
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&pxor	("xmm3","xmm3");
	&pxor	("xmm4","xmm4");
	&pxor	("xmm5","xmm5");
	&pxor	("xmm6","xmm6");
	&pxor	("xmm7","xmm7");
	&ret	();
&function_end_B("_vpaes_schedule_core");

##
##  .aes_schedule_192_smear
##
##  Smear the short, low side in the 192-bit key schedule.
##
##  Inputs:
##    %xmm7: high side, b  a  x  y
##    %xmm6:  low side, d  c  0  0
##
##  Outputs:
##    %xmm6: b+c+d  b+c  0  0
##    %xmm0: b+c+d  b+c  b  a
##
##  Clobbers %xmm1, which is zeroed locally and used as the source
##  of zeros.  There is no %xmm13 on 32-bit x86.
##
&function_begin_B("_vpaes_schedule_192_smear");
	&pshufd	("xmm1","xmm6",0x80);		# d c 0 0 -> c 0 0 0
	&pshufd	("xmm0","xmm7",0xFE);		# b a _ _ -> b b b a
	&pxor	("xmm6","xmm1");		# -> c+d c 0 0
	&pxor	("xmm1","xmm1");
	&pxor	("xmm6","xmm0");		# -> b+c+d b+c b a
	&movdqa	("xmm0","xmm6");
	&movhlps("xmm6","xmm1");		# clobber low side with zeros
	&ret	();
&function_end_B("_vpaes_schedule_192_smear");

##
##  .aes_schedule_round
##
##  Runs one main round of the key schedule on %xmm0, %xmm7
##
##  Specifically, runs subbytes on the high dword of %xmm0
##  then rotates it by one byte and xors into the low dword of
##  %xmm7.
##
##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
##  next rcon.  ("%xmm8" is the 16-byte stack slot at 8(%esp) in
##  this 32-bit port.)
##
##  Smears the dwords of %xmm7 by xoring the low into the
##  second low, result into third, result into highest.
##
##  Returns results in %xmm7 = %xmm0.
##  Clobbers %xmm1-%xmm5.
##
&function_begin_B("_vpaes_schedule_round");
	# extract rcon from xmm8
	&movdqa	("xmm2",&QWP(8,"esp"));		# xmm8
	&pxor	("xmm1","xmm1");
	&palignr("xmm1","xmm2",15);
	&palignr("xmm2","xmm2",15);
	&pxor	("xmm7","xmm1");

	# rotate
	&pshufd	("xmm0","xmm0",0xFF);
	&palignr("xmm0","xmm0",1);

	# fall through...
	&movdqa	(&QWP(8,"esp"),"xmm2");		# xmm8

	# low round: same as high round, but no rotation and no rcon.
&set_label("_vpaes_schedule_low_round");
	# smear xmm7
	&movdqa	("xmm1","xmm7");
	&pslldq	("xmm7",4);
	&pxor	("xmm7","xmm1");
	&movdqa	("xmm1","xmm7");
	&pslldq	("xmm7",8);
	&pxor	("xmm7","xmm1");
	&pxor	("xmm7",&QWP($k_s63,$const));

	# subbyte
	&movdqa	("xmm4",&QWP($k_s0F,$const));
	&movdqa	("xmm5",&QWP($k_inv,$const));	# 4 : 1/j
	&movdqa	("xmm1","xmm4");
	&pandn	("xmm1","xmm0");
	&psrld	("xmm1",4);			# 1 = i
	&pand	("xmm0","xmm4");		# 0 = k
	&movdqa	("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
	&pshufb	("xmm2","xmm0");		# 2 = a/k
	&pxor	("xmm0","xmm1");		# 0 = j
	&movdqa	("xmm3","xmm5");		# 3 : 1/i
	&pshufb	("xmm3","xmm1");		# 3 = 1/i
	&pxor	("xmm3","xmm2");		# 3 = iak = 1/i + a/k
	&movdqa	("xmm4","xmm5");		# 4 : 1/j
	&pshufb	("xmm4","xmm0");		# 4 = 1/j
	&pxor	("xmm4","xmm2");		# 4 = jak = 1/j + a/k
	&movdqa	("xmm2","xmm5");		# 2 : 1/iak
	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
	&pxor	("xmm2","xmm0");		# 2 = io
	&movdqa	("xmm3","xmm5");		# 3 : 1/jak
	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
	&pxor	("xmm3","xmm1");		# 3 = jo
	&movdqa	("xmm4",&QWP($k_sb1,$const));	# 4 : sbou
	&pshufb	("xmm4","xmm2");		# 4 = sbou
	&movdqa	("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot
	&pshufb	("xmm0","xmm3");		# 0 = sb1t
	&pxor	("xmm0","xmm4");		# 0 = sbox output

	# add in smeared stuff
	&pxor	("xmm0","xmm7");
	&movdqa	("xmm7","xmm0");
	&ret	();
&function_end_B("_vpaes_schedule_round");

##
##  .aes_schedule_transform
##
##  Linear-transform %xmm0 according to tables at (%ebx)
##
##  Output in %xmm0
##  Clobbers %xmm1, %xmm2
##
&function_begin_B("_vpaes_schedule_transform");
	&movdqa	("xmm2",&QWP($k_s0F,$const));
	&movdqa	("xmm1","xmm2");
	&pandn	("xmm1","xmm0");
	&psrld	("xmm1",4);
	&pand	("xmm0","xmm2");
	&movdqa	("xmm2",&QWP(0,$base));
	&pshufb	("xmm2","xmm0");
	&movdqa	("xmm0",&QWP(16,$base));
	&pshufb	("xmm0","xmm1");
	&pxor	("xmm0","xmm2");
	&ret	();
&function_end_B("_vpaes_schedule_transform");

##
##  .aes_schedule_mangle
##
##  Mangle xmm0 from (basis-transformed) standard version
##  to our version.
##
##  On encrypt,
##    xor with 0x63
##    multiply by circulant 0,1,1,1
##    apply shiftrows transform
##
##  On decrypt,
##    xor with 0x63
##    multiply by "inverse mixcolumns" circulant E,B,D,9
##    deskew
##    apply shiftrows transform
##
##
##  Writes out to (%edx), and increments or decrements it
##  Keeps track of round number mod 4 in %ecx
##  Preserves xmm0
##  Clobbers xmm1-xmm5
##  The decrypt path also overwrites %esi with a table pointer.
##
&function_begin_B("_vpaes_schedule_mangle");
	&movdqa	("xmm4","xmm0");	# save xmm0 for later
	&movdqa	("xmm5",&QWP($k_mc_forward,$const));
	&test	($out,$out);
	&jnz	(&label("schedule_mangle_dec"));

	# encrypting
	&add	($key,16);
	&pxor	("xmm4",&QWP($k_s63,$const));
	&pshufb	("xmm4","xmm5");
	&movdqa	("xmm3","xmm4");
	&pshufb	("xmm4","xmm5");
	&pxor	("xmm3","xmm4");
	&pshufb	("xmm4","xmm5");
	&pxor	("xmm3","xmm4");

	&jmp	(&label("schedule_mangle_both"));

&set_label("schedule_mangle_dec",16);
	# inverse mix columns
	&movdqa	("xmm2",&QWP($k_s0F,$const));
	&lea	($inp,&DWP($k_dksd,$const));
	&movdqa	("xmm1","xmm2");
	&pandn	("xmm1","xmm4");
	&psrld	("xmm1",4);			# 1 = hi
	&pand	("xmm4","xmm2");		# 4 = lo

	&movdqa	("xmm2",&QWP(0,$inp));
	&pshufb	("xmm2","xmm4");
	&movdqa	("xmm3",&QWP(0x10,$inp));
	&pshufb	("xmm3","xmm1");
	&pxor	("xmm3","xmm2");
	&pshufb	("xmm3","xmm5");

	&movdqa	("xmm2",&QWP(0x20,$inp));
	&pshufb	("xmm2","xmm4");
	&pxor	("xmm2","xmm3");
	&movdqa	("xmm3",&QWP(0x30,$inp));
	&pshufb	("xmm3","xmm1");
	&pxor	("xmm3","xmm2");
	&pshufb	("xmm3","xmm5");

	&movdqa	("xmm2",&QWP(0x40,$inp));
	&pshufb	("xmm2","xmm4");
	&pxor	("xmm2","xmm3");
	&movdqa	("xmm3",&QWP(0x50,$inp));
	&pshufb	("xmm3","xmm1");
	&pxor	("xmm3","xmm2");
	&pshufb	("xmm3","xmm5");

	&movdqa	("xmm2",&QWP(0x60,$inp));
	&pshufb	("xmm2","xmm4");
	&pxor	("xmm2","xmm3");
	&movdqa	("xmm3",&QWP(0x70,$inp));
	&pshufb	("xmm3","xmm1");
	&pxor	("xmm3","xmm2");

	&add	($key,-16);

&set_label("schedule_mangle_both");
	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));
	&pshufb	("xmm3","xmm1");
	&add	($magic,-16);
	&and	($magic,0x30);
	&movdqu	(&QWP(0,$key),"xmm3");
	&ret	();
&function_end_B("_vpaes_schedule_mangle");

#
# Interface to OpenSSL
#
# Each entry point below carves a 16-byte aligned scratch frame out of
# the stack (at least 56 bytes below the incoming %esp) and keeps the
# original %esp at 48(%esp) so it can be restored before returning.
#

# ${PREFIX}_set_encrypt_key(inp, bits, key): expands the user key into
# an encryption schedule at key and stores the round count at
# offset 240.  Leaves 0 in %eax.
&function_begin("${PREFIX}_set_encrypt_key");
	&mov	($inp,&wparam(0));		# inp
	&lea	($base,&DWP(-56,"esp"));
	&mov	($round,&wparam(1));		# bits
	&and	($base,-16);
	&mov	($key,&wparam(2));		# key
	&xchg	($base,"esp");			# alloca
	&mov	(&DWP(48,"esp"),$base);

	&mov	($base,$round);
	&shr	($base,5);
	&add	($base,5);
	&mov	(&DWP(240,$key),$base);		# AES_KEY->rounds = nbits/32+5;
	&mov	($magic,0x30);
	&mov	($out,0);

	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
	&call	("_vpaes_schedule_core");
&set_label("pic_point");

	&mov	("esp",&DWP(48,"esp"));
	&xor	("eax","eax");
&function_end("${PREFIX}_set_encrypt_key");

# ${PREFIX}_set_decrypt_key(inp, bits, key): same as above for
# decryption.  $key is advanced to 16+16*rounds past the start of the
# schedule because the decrypting mangler steps $key downwards.
# Leaves 0 in %eax.
&function_begin("${PREFIX}_set_decrypt_key");
	&mov	($inp,&wparam(0));		# inp
	&lea	($base,&DWP(-56,"esp"));
	&mov	($round,&wparam(1));		# bits
	&and	($base,-16);
	&mov	($key,&wparam(2));		# key
	&xchg	($base,"esp");			# alloca
	&mov	(&DWP(48,"esp"),$base);

	&mov	($base,$round);
	&shr	($base,5);
	&add	($base,5);
	&mov	(&DWP(240,$key),$base);	# AES_KEY->rounds = nbits/32+5;
	&shl	($base,4);
	&lea	($key,&DWP(16,$key,$base));

	&mov	($out,1);
	&mov	($magic,$round);
	&shr	($magic,1);
	&and	($magic,32);
	&xor	($magic,32);			# nbits==192?0:32;

	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
	&call	("_vpaes_schedule_core");
&set_label("pic_point");

	&mov	("esp",&DWP(48,"esp"));
	&xor	("eax","eax");
&function_end("${PREFIX}_set_decrypt_key");

# ${PREFIX}_encrypt(inp, out, key): encrypts the 16 bytes at inp into
# out.  Both are accessed with unaligned loads/stores.
&function_begin("${PREFIX}_encrypt");
	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
	&call	("_vpaes_preheat");
&set_label("pic_point");
	&mov	($inp,&wparam(0));		# inp
	&lea	($base,&DWP(-56,"esp"));
	&mov	($out,&wparam(1));		# out
	&and	($base,-16);
	&mov	($key,&wparam(2));		# key
	&xchg	($base,"esp");			# alloca
	&mov	(&DWP(48,"esp"),$base);

	&movdqu	("xmm0",&QWP(0,$inp));
	&call	("_vpaes_encrypt_core");
	&movdqu	(&QWP(0,$out),"xmm0");

	&mov	("esp",&DWP(48,"esp"));
&function_end("${PREFIX}_encrypt");

# ${PREFIX}_decrypt(inp, out, key): decrypts the 16 bytes at inp into
# out.  Both are accessed with unaligned loads/stores.
&function_begin("${PREFIX}_decrypt");
	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
	&call	("_vpaes_preheat");
&set_label("pic_point");
	&mov	($inp,&wparam(0));		# inp
	&lea	($base,&DWP(-56,"esp"));
	&mov	($out,&wparam(1));		# out
	&and	($base,-16);
	&mov	($key,&wparam(2));		# key
	&xchg	($base,"esp");			# alloca
	&mov	(&DWP(48,"esp"),$base);

	&movdqu	("xmm0",&QWP(0,$inp));
	&call	("_vpaes_decrypt_core");
	&movdqu	(&QWP(0,$out),"xmm0");

	&mov	("esp",&DWP(48,"esp"));
&function_end("${PREFIX}_decrypt");

# ${PREFIX}_cbc_encrypt(inp, out, len, key, ivp, enc): CBC mode over
# whole 16-byte blocks; enc==0 selects decryption.  Returns without
# touching anything when len < 16.  The final chaining value is
# written back to ivp.
#
# Scratch frame layout:
#    0(%esp)  out-inp (output is addressed relative to $inp)
#    4(%esp)  key
#    8(%esp)  ivp
#   16(%esp)  current IV        (decrypt only)
#   32(%esp)  next IV           (decrypt only)
#   48(%esp)  original %esp
&function_begin("${PREFIX}_cbc_encrypt");
	&mov	($inp,&wparam(0));		# inp
	&mov	($out,&wparam(1));		# out
	&mov	($round,&wparam(2));		# len
	&mov	($key,&wparam(3));		# key
	&sub	($round,16);
	&jc	(&label("cbc_abort"));
	&lea	($base,&DWP(-56,"esp"));
	&mov	($const,&wparam(4));		# ivp
	&and	($base,-16);
	&mov	($magic,&wparam(5));		# enc
	&xchg	($base,"esp");			# alloca
	&movdqu	("xmm1",&QWP(0,$const));	# load IV
	&sub	($out,$inp);
	&mov	(&DWP(48,"esp"),$base);

	&mov	(&DWP(0,"esp"),$out);		# save out
	&mov	(&DWP(4,"esp"),$key);		# save key
	&mov	(&DWP(8,"esp"),$const);		# save ivp
	&mov	($out,$round);			# $out works as $len

	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
	&call	("_vpaes_preheat");
&set_label("pic_point");
	&cmp	($magic,0);
	&je	(&label("cbc_dec_loop"));
	&jmp	(&label("cbc_enc_loop"));

&set_label("cbc_enc_loop",16);
	&movdqu	("xmm0",&QWP(0,$inp));		# load input
	&pxor	("xmm0","xmm1");		# inp^=iv
	&call	("_vpaes_encrypt_core");
	&mov	($base,&DWP(0,"esp"));		# restore out
	&mov	($key,&DWP(4,"esp"));		# restore key
	&movdqa	("xmm1","xmm0");
	&movdqu	(&QWP(0,$base,$inp),"xmm0");	# write output
	&lea	($inp,&DWP(16,$inp));
	&sub	($out,16);
	&jnc	(&label("cbc_enc_loop"));
	&jmp	(&label("cbc_done"));

&set_label("cbc_dec_loop",16);
	&movdqu	("xmm0",&QWP(0,$inp));		# load input
	&movdqa	(&QWP(16,"esp"),"xmm1");	# save IV
	&movdqa	(&QWP(32,"esp"),"xmm0");	# save future IV
	&call	("_vpaes_decrypt_core");
	&mov	($base,&DWP(0,"esp"));		# restore out
	&mov	($key,&DWP(4,"esp"));		# restore key
	&pxor	("xmm0",&QWP(16,"esp"));	# out^=iv
	&movdqa	("xmm1",&QWP(32,"esp"));	# load next IV
	&movdqu	(&QWP(0,$base,$inp),"xmm0");	# write output
	&lea	($inp,&DWP(16,$inp));
	&sub	($out,16);
	&jnc	(&label("cbc_dec_loop"));

&set_label("cbc_done");
	&mov	($base,&DWP(8,"esp"));		# restore ivp
	&mov	("esp",&DWP(48,"esp"));
	&movdqu	(&QWP(0,$base),"xmm1");		# write IV
&set_label("cbc_abort");
&function_end("${PREFIX}_cbc_encrypt");

&asm_finish();

# Buffered write errors only surface at close time, so check it.
close STDOUT or die "error closing STDOUT: $!";