#!/usr/bin/env perl

######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.

######################################################################
# September 2011.
#
# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
# doesn't handle partial vectors (doesn't have to if called from
# EVP only). "Drop-in" implies that this module doesn't share key
# schedule structure with the original nor does it make assumptions
# about its alignment...
#
# Performance summary. aes-586.pl column lists large-block CBC
# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
# byte processed with 128-bit key, and vpaes-x86.pl column - [also
# large-block CBC] encrypt/decrypt.
#
#		aes-586.pl		vpaes-x86.pl
#
# Core 2(**)	28.1/41.4/18.3		21.9/25.2(***)
# Nehalem	27.9/40.4/18.1		10.2/11.9
# Atom		70.7/92.1/60.1		61.1/75.4(***)
# Silvermont	45.4/62.9/24.1		49.2/61.1(***)
#
# (*)	"Hyper-threading" in this context refers to a cache shared
#	among multiple cores rather than specifically to Intel HTT. As
#	the vast majority of contemporary cores share cache, the slower
#	code path is commonplace. In other words "with-hyper-threading-off"
#	results are presented mostly for reference purposes.
#
# (**)	"Core 2" refers to initial 65nm design, a.k.a. Conroe.
#
# (***)	Less impressive improvement on Core 2 and Atom is due to slow
#	pshufb,	yet it's respectable +28%/64%  improvement on Core 2
#	and +15% on Atom (as implied, over "hyper-threading-safe"
#	code path).
#
#						<appro@openssl.org>

# Add this script's own directory and ../../perlasm (relative to it) to
# @INC, then load x86asm.pl, which is expected to supply the &asm_init,
# &function_begin, &set_label, ... helpers and the instruction emitters
# used throughout this file.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file; STDOUT is
# aliased to it so that everything emitted below lands there.  Use
# three-argument open (the file name is never parsed as a mode) and
# fail loudly instead of silently generating nothing.
$output = pop;
open OUT,">",$output or die "can't open $output: $!";
*STDOUT=*OUT;

# $ARGV[0] is passed through to asm_init; a trailing "386" argument
# sets $x86only.
&asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");

# Prefix of every public symbol generated by this file
# (vpaes_encrypt, vpaes_set_encrypt_key, ...).
$PREFIX="vpaes";

# Symbolic names for the general-purpose registers.  The comments in
# the routines below refer to the raw register names (e.g. (%edx) is
# $key, (%ebx) is $base, %ecx is $magic).
my  ($round, $base, $magic, $key, $const, $inp, $out)=
    ("eax",  "ebx", "ecx",  "edx","ebp",  "esi","edi");

&static_label("_vpaes_consts");
&static_label("_vpaes_schedule_low_round");

##
##  Constant tables.
##
##  Each $k_* variable is the byte offset of a table relative to
##  _vpaes_consts+0x30, which is the address the public entry points
##  load into $const (hence the first two tables have negative offsets).
##  The order and size of the &data_word() lines must therefore stay in
##  sync with the $k_* values.
##
&set_label("_vpaes_consts",64);
$k_inv=-0x30;		# inv, inva
	&data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
	&data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);

$k_s0F=-0x10;		# s0F
	&data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);

$k_ipt=0x00;		# input transform (lo, hi)
	&data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
	&data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);

$k_sb1=0x20;		# sb1u, sb1t
	&data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
	&data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
$k_sb2=0x40;		# sb2u, sb2t
	&data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
	&data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
$k_sbo=0x60;		# sbou, sbot
	&data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
	&data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);

$k_mc_forward=0x80;	# mc_forward
	&data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
	&data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
	&data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
	&data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);

$k_mc_backward=0xc0;	# mc_backward
	&data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
	&data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
	&data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
	&data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);

$k_sr=0x100;		# sr
	&data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
	&data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
	&data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
	&data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);

$k_rcon=0x140;		# rcon
	&data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);

$k_s63=0x150;		# s63: all equal to 0x63 transformed
	&data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);

$k_opt=0x160;		# output transform
	&data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
	&data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);

$k_deskew=0x180;	# deskew tables: inverts the sbox's "skew"
	&data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
	&data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
##
##  Decryption stuff
##  Key schedule constants
##
$k_dksd=0x1a0;		# decryption key schedule: invskew x*D
	&data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
	&data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
$k_dksb=0x1c0;		# decryption key schedule: invskew x*B
	&data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
	&data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
$k_dkse=0x1e0;		# decryption key schedule: invskew x*E + 0x63
	&data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
	&data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
$k_dks9=0x200;		# decryption key schedule: invskew x*9
	&data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
	&data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);

##
##  Decryption stuff
##  Round function constants
##
$k_dipt=0x220;		# decryption input transform
	&data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
	&data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);

$k_dsb9=0x240;		# decryption sbox output *9*u, *9*t
	&data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
	&data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
$k_dsbd=0x260;		# decryption sbox output *D*u, *D*t
	&data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
	&data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
$k_dsbb=0x280;		# decryption sbox output *B*u, *B*t
	&data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
	&data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
$k_dsbe=0x2a0;		# decryption sbox output *E*u, *E*t
	&data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
	&data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
$k_dsbo=0x2c0;		# decryption sbox final output
	&data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
	&data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
&asciz	("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
&align	(64);

##
##  _vpaes_preheat
##
##  Completes the position-independent address of the constant tables
##  and loads the constants the round functions keep in registers.
##
##  Inputs:
##     %ebp  = link-time difference "_vpaes_consts+0x30 - pic_point"
##    (%esp) = return address, i.e. the caller's pic_point label
##
##  Outputs:
##     %ebp  = run-time address of _vpaes_consts+0x30
##     %xmm7 = k_inv, %xmm6 = k_s0F
##
&function_begin_B("_vpaes_preheat");
	&add	($const,&DWP(0,"esp"));		# + return address
	&movdqa	("xmm7",&QWP($k_inv,$const));
	&movdqa	("xmm6",&QWP($k_s0F,$const));
	&ret	();
&function_end_B("_vpaes_preheat");

##
##  _aes_encrypt_core
##
##  AES-encrypt %xmm0.
##
##  Inputs:
##     %xmm0 = input
##     %xmm6-%xmm7 as in _vpaes_preheat
##    (%edx) = scheduled keys
##     %ebp  = constants base as set up by _vpaes_preheat
##
##  Output in %xmm0
##  Clobbers  %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
##
##  The loop counter is decremented with "sub" inside enc_loop and
##  tested by the "jnz" at the end of enc_entry; only SSE instructions
##  sit in between, so the flags survive.  Keep it that way when
##  rescheduling instructions.
##
&function_begin_B("_vpaes_encrypt_core");
	&mov	($magic,16);
	&mov	($round,&DWP(240,$key));
	&movdqa	("xmm1","xmm6");
	&movdqa	("xmm2",&QWP($k_ipt,$const));
	&pandn	("xmm1","xmm0");
	&pand	("xmm0","xmm6");
	&movdqu	("xmm5",&QWP(0,$key));
	&pshufb	("xmm2","xmm0");
	&movdqa	("xmm0",&QWP($k_ipt+16,$const));
	&pxor	("xmm2","xmm5");
	&psrld	("xmm1",4);
	&add	($key,16);
	&pshufb	("xmm0","xmm1");
	&lea	($base,&DWP($k_mc_backward,$const));
	&pxor	("xmm0","xmm2");
	&jmp	(&label("enc_entry"));


&set_label("enc_loop",16);
	# middle of middle round
	&movdqa	("xmm4",&QWP($k_sb1,$const));	# 4 : sb1u
	&movdqa	("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
	&pshufb	("xmm4","xmm2");		# 4 = sb1u
	&pshufb	("xmm0","xmm3");		# 0 = sb1t
	&pxor	("xmm4","xmm5");		# 4 = sb1u + k
	&movdqa	("xmm5",&QWP($k_sb2,$const));	# 5 : sb2u
	&pxor	("xmm0","xmm4");		# 0 = A
	&movdqa	("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
	&pshufb	("xmm5","xmm2");		# 5 = sb2u
	&movdqa	("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
	&movdqa	("xmm4",&QWP(0,$base,$magic));	# .Lk_mc_backward[]
	&pshufb	("xmm2","xmm3");		# 2 = sb2t
	&movdqa	("xmm3","xmm0");		# 3 = A
	&pxor	("xmm2","xmm5");		# 2 = 2A
	&pshufb	("xmm0","xmm1");		# 0 = B
	&add	($key,16);			# next key
	&pxor	("xmm0","xmm2");		# 0 = 2A+B
	&pshufb	("xmm3","xmm4");		# 3 = D
	&add	($magic,16);			# next mc
	&pxor	("xmm3","xmm0");		# 3 = 2A+B+D
	&pshufb	("xmm0","xmm1");		# 0 = 2B+C
	&and	($magic,0x30);			# ... mod 4
	&sub	($round,1);			# nr--
	&pxor	("xmm0","xmm3");		# 0 = 2A+3B+C+D

&set_label("enc_entry");
	# top of round
	&movdqa	("xmm1","xmm6");		# 1 : i
	&movdqa	("xmm5",&QWP($k_inv+16,$const));# 5 : a/k
	&pandn	("xmm1","xmm0");		# 1 = i<<4
	&psrld	("xmm1",4);			# 1 = i
	&pand	("xmm0","xmm6");		# 0 = k
	&pshufb	("xmm5","xmm0");		# 5 = a/k
	&movdqa	("xmm3","xmm7");		# 3 : 1/i
	&pxor	("xmm0","xmm1");		# 0 = j
	&pshufb	("xmm3","xmm1");		# 3 = 1/i
	&movdqa	("xmm4","xmm7");		# 4 : 1/j
	&pxor	("xmm3","xmm5");		# 3 = iak = 1/i + a/k
	&pshufb	("xmm4","xmm0");		# 4 = 1/j
	&movdqa	("xmm2","xmm7");		# 2 : 1/iak
	&pxor	("xmm4","xmm5");		# 4 = jak = 1/j + a/k
	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
	&movdqa	("xmm3","xmm7");		# 3 : 1/jak
	&pxor	("xmm2","xmm0");		# 2 = io
	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
	&movdqu	("xmm5",&QWP(0,$key));		# 5 = round key
	&pxor	("xmm3","xmm1");		# 3 = jo
	&jnz	(&label("enc_loop"));		# flags from "sub" above

	# middle of last round
	&movdqa	("xmm4",&QWP($k_sbo,$const));	# 4 : sbou      .Lk_sbo
	&movdqa	("xmm0",&QWP($k_sbo+16,$const));# 0 : sbot      .Lk_sbo+16
	&pshufb	("xmm4","xmm2");		# 4 = sbou
	&pxor	("xmm4","xmm5");		# 4 = sbou + k
	&pshufb	("xmm0","xmm3");		# 0 = sbot
	&movdqa	("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
	&pxor	("xmm0","xmm4");		# 0 = A
	&pshufb	("xmm0","xmm1");
	&ret	();
&function_end_B("_vpaes_encrypt_core");

##
##  Decryption core
##
##  Same API as encryption core.
##
##  $base points at k_dsbd, so the decryption sbox tables are reached
##  with small displacements from it (-0x20 = dsb9 ... 0x60 = dsbo);
##  $magic ends up holding the address of the k_sr row applied after
##  the last round.
##
&function_begin_B("_vpaes_decrypt_core");
	&lea	($base,&DWP($k_dsbd,$const));
	&mov	($round,&DWP(240,$key));
	&movdqa	("xmm1","xmm6");
	&movdqa	("xmm2",&QWP($k_dipt-$k_dsbd,$base));
	&pandn	("xmm1","xmm0");
	&mov	($magic,$round);
	&psrld	("xmm1",4);
	&movdqu	("xmm5",&QWP(0,$key));
	&shl	($magic,4);
	&pand	("xmm0","xmm6");
	&pshufb	("xmm2","xmm0");
	&movdqa	("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));
	&xor	($magic,0x30);
	&pshufb	("xmm0","xmm1");
	&and	($magic,0x30);
	&pxor	("xmm2","xmm5");
	&movdqa	("xmm5",&QWP($k_mc_forward+48,$const));
	&pxor	("xmm0","xmm2");
	&add	($key,16);
	&lea	($magic,&DWP($k_sr-$k_dsbd,$base,$magic));
	&jmp	(&label("dec_entry"));

&set_label("dec_loop",16);
##
##  Inverse mix columns
##
	&movdqa	("xmm4",&QWP(-0x20,$base));	# 4 : sb9u
	&movdqa	("xmm1",&QWP(-0x10,$base));	# 1 : sb9t
	&pshufb	("xmm4","xmm2");		# 4 = sb9u
	&pshufb	("xmm1","xmm3");		# 1 = sb9t
	&pxor	("xmm0","xmm4");
	&movdqa	("xmm4",&QWP(0,$base));		# 4 : sbdu
	&pxor	("xmm0","xmm1");		# 0 = ch
	&movdqa	("xmm1",&QWP(0x10,$base));	# 1 : sbdt

	&pshufb	("xmm4","xmm2");		# 4 = sbdu
	&pshufb	("xmm0","xmm5");		# MC ch
	&pshufb	("xmm1","xmm3");		# 1 = sbdt
	&pxor	("xmm0","xmm4");		# 0 = ch
	&movdqa	("xmm4",&QWP(0x20,$base));	# 4 : sbbu
	&pxor	("xmm0","xmm1");		# 0 = ch
	&movdqa	("xmm1",&QWP(0x30,$base));	# 1 : sbbt

	&pshufb	("xmm4","xmm2");		# 4 = sbbu
	&pshufb	("xmm0","xmm5");		# MC ch
	&pshufb	("xmm1","xmm3");		# 1 = sbbt
	&pxor	("xmm0","xmm4");		# 0 = ch
	&movdqa	("xmm4",&QWP(0x40,$base));	# 4 : sbeu
	&pxor	("xmm0","xmm1");		# 0 = ch
	&movdqa	("xmm1",&QWP(0x50,$base));	# 1 : sbet

	&pshufb	("xmm4","xmm2");		# 4 = sbeu
	&pshufb	("xmm0","xmm5");		# MC ch
	&pshufb	("xmm1","xmm3");		# 1 = sbet
	&pxor	("xmm0","xmm4");		# 0 = ch
	&add	($key,16);			# next round key
	&palignr("xmm5","xmm5",12);
	&pxor	("xmm0","xmm1");		# 0 = ch
	&sub	($round,1);			# nr--

&set_label("dec_entry");
	# top of round
	&movdqa	("xmm1","xmm6");		# 1 : i
	&movdqa	("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
	&pandn	("xmm1","xmm0");		# 1 = i<<4
	&pand	("xmm0","xmm6");		# 0 = k
	&psrld	("xmm1",4);			# 1 = i
	&pshufb	("xmm2","xmm0");		# 2 = a/k
	&movdqa	("xmm3","xmm7");		# 3 : 1/i
	&pxor	("xmm0","xmm1");		# 0 = j
	&pshufb	("xmm3","xmm1");		# 3 = 1/i
	&movdqa	("xmm4","xmm7");		# 4 : 1/j
	&pxor	("xmm3","xmm2");		# 3 = iak = 1/i + a/k
	&pshufb	("xmm4","xmm0");		# 4 = 1/j
	&pxor	("xmm4","xmm2");		# 4 = jak = 1/j + a/k
	&movdqa	("xmm2","xmm7");		# 2 : 1/iak
	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
	&movdqa	("xmm3","xmm7");		# 3 : 1/jak
	&pxor	("xmm2","xmm0");		# 2 = io
	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
	&movdqu	("xmm0",&QWP(0,$key));		# 0 = round key
	&pxor	("xmm3","xmm1");		# 3 = jo
	&jnz	(&label("dec_loop"));		# flags from "sub" above

	# middle of last round
	&movdqa	("xmm4",&QWP(0x60,$base));	# 4 : sbou
	&pshufb	("xmm4","xmm2");		# 4 = sbou
	&pxor	("xmm4","xmm0");		# 4 = sbou + k
	&movdqa	("xmm0",&QWP(0x70,$base));	# 0 : sbot
	&movdqa	("xmm2",&QWP(0,$magic));	# 2 : k_sr row selected above
	&pshufb	("xmm0","xmm3");		# 0 = sbot
	&pxor	("xmm0","xmm4");		# 0 = A
	&pshufb	("xmm0","xmm2");
	&ret	();
&function_end_B("_vpaes_decrypt_core");

########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################
##
##  _vpaes_schedule_core
##
##  Inputs (as set up by the set_{en,de}crypt_key wrappers below):
##     %esi  = user key
##     %eax  = key length in bits
##     %edx  = where the first round key is written
##     %edi  = 0 when encrypting, non-zero when decrypting
##     %ecx  = initial offset into the k_sr table
##     %ebp  = "_vpaes_consts+0x30 - pic_point"; completed here with
##             the return address, exactly as _vpaes_preheat does
##
##  The x86_64 original keeps rcon in %xmm8.  There is no such register
##  in 32-bit mode, so here "xmm8" is a 16-byte stack slot: 4(%esp) as
##  seen from this function, 8(%esp) as seen from
##  _vpaes_schedule_round.  20(%esp) is used as a spill slot for %xmm7
##  in the 256-bit schedule.
##
&function_begin_B("_vpaes_schedule_core");
	&add	($const,&DWP(0,"esp"));
	&movdqu	("xmm0",&QWP(0,$inp));		# load key (unaligned)
	&movdqa	("xmm2",&QWP($k_rcon,$const));	# load rcon

	# input transform
	&movdqa	("xmm3","xmm0");
	&lea	($base,&DWP($k_ipt,$const));
	&movdqa	(&QWP(4,"esp"),"xmm2");		# xmm8
	&call	("_vpaes_schedule_transform");
	&movdqa	("xmm7","xmm0");

	&test	($out,$out);
	&jnz	(&label("schedule_am_decrypting"));

	# encrypting, output zeroth round key after transform
	&movdqu	(&QWP(0,$key),"xmm0");
	&jmp	(&label("schedule_go"));

&set_label("schedule_am_decrypting");
	# decrypting, output zeroth round key after shiftrows
	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));
	&pshufb	("xmm3","xmm1");
	&movdqu	(&QWP(0,$key),"xmm3");
	&xor	($magic,0x30);

&set_label("schedule_go");
	&cmp	($round,192);
	&ja	(&label("schedule_256"));
	&je	(&label("schedule_192"));
	# 128: fall through

##
##  .schedule_128
##
##  128-bit specific part of key schedule.
##
##  This schedule is really simple, because all its parts
##  are accomplished by the subroutines.
##
&set_label("schedule_128");
	&mov	($round,10);

&set_label("loop_schedule_128");
	&call	("_vpaes_schedule_round");
	&dec	($round);
	&jz	(&label("schedule_mangle_last"));
	&call	("_vpaes_schedule_mangle");	# write output
	&jmp	(&label("loop_schedule_128"));

##
##  .aes_schedule_192
##
##  192-bit specific part of key schedule.
##
##  The main body of this schedule is the same as the 128-bit
##  schedule, but with more smearing.  The long, high side is
##  stored in %xmm7 as before, and the short, low side is in
##  the high bits of %xmm6.
##
##  This schedule is somewhat nastier, however, because each
##  round produces 192 bits of key material, or 1.5 round keys.
##  Therefore, on each cycle we do 2 rounds and produce 3 round
##  keys.
##
&set_label("schedule_192",16);
	&movdqu	("xmm0",&QWP(8,$inp));		# load key part 2 (very unaligned)
	&call	("_vpaes_schedule_transform");	# input transform
	&movdqa	("xmm6","xmm0");		# save short part
	&pxor	("xmm4","xmm4");		# clear 4
	&movhlps("xmm6","xmm4");		# clobber low side with zeros
	&mov	($round,4);

&set_label("loop_schedule_192");
	&call	("_vpaes_schedule_round");
	&palignr("xmm0","xmm6",8);
	&call	("_vpaes_schedule_mangle");	# save key n
	&call	("_vpaes_schedule_192_smear");
	&call	("_vpaes_schedule_mangle");	# save key n+1
	&call	("_vpaes_schedule_round");
	&dec	($round);
	&jz	(&label("schedule_mangle_last"));
	&call	("_vpaes_schedule_mangle");	# save key n+2
	&call	("_vpaes_schedule_192_smear");
	&jmp	(&label("loop_schedule_192"));

##
##  .aes_schedule_256
##
##  256-bit specific part of key schedule.
##
##  The structure here is very similar to the 128-bit
##  schedule, but with an additional "low side" in
##  %xmm6.  The low side's rounds are the same as the
##  high side's, except no rcon and no rotation.
##
&set_label("schedule_256",16);
	&movdqu	("xmm0",&QWP(16,$inp));		# load key part 2 (unaligned)
	&call	("_vpaes_schedule_transform");	# input transform
	&mov	($round,7);

&set_label("loop_schedule_256");
	&call	("_vpaes_schedule_mangle");	# output low result
	&movdqa	("xmm6","xmm0");		# save cur_lo in xmm6

	# high round
	&call	("_vpaes_schedule_round");
	&dec	($round);
	&jz	(&label("schedule_mangle_last"));
	&call	("_vpaes_schedule_mangle");

	# low round. swap xmm7 and xmm6
	&pshufd	("xmm0","xmm0",0xFF);
	&movdqa	(&QWP(20,"esp"),"xmm7");	# spill xmm7 across the call
	&movdqa	("xmm7","xmm6");
	&call	("_vpaes_schedule_low_round");
	&movdqa	("xmm7",&QWP(20,"esp"));

	&jmp	(&label("loop_schedule_256"));

##
##  .aes_schedule_mangle_last
##
##  Mangler for last round of key schedule
##  Mangles %xmm0
##    when encrypting, outputs out(%xmm0) ^ 63
##    when decrypting, outputs unskew(%xmm0)
##
##  Always called right before return... jumps to cleanup and exits
##
&set_label("schedule_mangle_last",16);
	# schedule last round key from xmm0
	&lea	($base,&DWP($k_deskew,$const));
	&test	($out,$out);
	&jnz	(&label("schedule_mangle_last_dec"));

	# encrypting
	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));
	&pshufb	("xmm0","xmm1");		# output permute
	&lea	($base,&DWP($k_opt,$const));	# prepare to output transform
	&add	($key,32);			# net +16 after the -16 below

&set_label("schedule_mangle_last_dec");
	&add	($key,-16);
	&pxor	("xmm0",&QWP($k_s63,$const));
	&call	("_vpaes_schedule_transform");	# output transform
	&movdqu	(&QWP(0,$key),"xmm0");		# save last key

	# cleanup: wipe key material from the vector registers
	&pxor	("xmm0","xmm0");
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&pxor	("xmm3","xmm3");
	&pxor	("xmm4","xmm4");
	&pxor	("xmm5","xmm5");
	&pxor	("xmm6","xmm6");
	&pxor	("xmm7","xmm7");
	&ret	();
&function_end_B("_vpaes_schedule_core");

##
##  .aes_schedule_192_smear
##
##  Smear the short, low side in the 192-bit key schedule.
##
##  Inputs:
##    %xmm7: high side, b  a  x  y
##    %xmm6:  low side, d  c  0  0
##
##  Outputs:
##    %xmm6: b+c+d  b+c  0  0
##    %xmm0: b+c+d  b+c  b  a
##
##  Clobbers %xmm1 (zeroed here; it serves as the all-zero register
##  needed to clear the low half of %xmm6).
##
&function_begin_B("_vpaes_schedule_192_smear");
	&pshufd	("xmm1","xmm6",0x80);		# d c 0 0 -> c 0 0 0
	&pshufd	("xmm0","xmm7",0xFE);		# b a _ _ -> b b b a
	&pxor	("xmm6","xmm1");		# -> c+d c 0 0
	&pxor	("xmm1","xmm1");		# 1 = 0
	&pxor	("xmm6","xmm0");		# -> b+c+d b+c b a
	&movdqa	("xmm0","xmm6");
	&movhlps("xmm6","xmm1");		# clobber low side with zeros
	&ret	();
&function_end_B("_vpaes_schedule_192_smear");

##
##  .aes_schedule_round
##
##  Runs one main round of the key schedule on %xmm0, %xmm7
##
##  Specifically, runs subbytes on the high dword of %xmm0
##  then rotates it by one byte and xors into the low dword of
##  %xmm7.
##
##  Adds rcon from low byte of "xmm8", then rotates "xmm8" for
##  next rcon.  In this 32-bit port "xmm8" is the 16-byte stack
##  slot at 8(%esp) as seen from this function.
##
##  Smears the dwords of %xmm7 by xoring the low into the
##  second low, result into third, result into highest.
##
##  Returns results in %xmm7 = %xmm0.
##  Clobbers %xmm1-%xmm5.
##
##  _vpaes_schedule_low_round is a second entry point that skips the
##  rcon and rotation steps (and does not touch the stack slot).
##
&function_begin_B("_vpaes_schedule_round");
	# extract rcon from xmm8
	&movdqa	("xmm2",&QWP(8,"esp"));		# xmm8
	&pxor	("xmm1","xmm1");
	&palignr("xmm1","xmm2",15);
	&palignr("xmm2","xmm2",15);
	&pxor	("xmm7","xmm1");

	# rotate
	&pshufd	("xmm0","xmm0",0xFF);
	&palignr("xmm0","xmm0",1);

	# fall through...
	&movdqa	(&QWP(8,"esp"),"xmm2");		# xmm8

	# low round: same as high round, but no rotation and no rcon.
&set_label("_vpaes_schedule_low_round");
	# smear xmm7
	&movdqa	("xmm1","xmm7");
	&pslldq	("xmm7",4);
	&pxor	("xmm7","xmm1");
	&movdqa	("xmm1","xmm7");
	&pslldq	("xmm7",8);
	&pxor	("xmm7","xmm1");
	&pxor	("xmm7",&QWP($k_s63,$const));

	# subbyte
	&movdqa	("xmm4",&QWP($k_s0F,$const));	# 4 : s0F
	&movdqa	("xmm5",&QWP($k_inv,$const));	# 5 : inv (1/i, 1/j lookups)
	&movdqa	("xmm1","xmm4");
	&pandn	("xmm1","xmm0");
	&psrld	("xmm1",4);			# 1 = i
	&pand	("xmm0","xmm4");		# 0 = k
	&movdqa	("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
	&pshufb	("xmm2","xmm0");		# 2 = a/k
	&pxor	("xmm0","xmm1");		# 0 = j
	&movdqa	("xmm3","xmm5");		# 3 : 1/i
	&pshufb	("xmm3","xmm1");		# 3 = 1/i
	&pxor	("xmm3","xmm2");		# 3 = iak = 1/i + a/k
	&movdqa	("xmm4","xmm5");		# 4 : 1/j
	&pshufb	("xmm4","xmm0");		# 4 = 1/j
	&pxor	("xmm4","xmm2");		# 4 = jak = 1/j + a/k
	&movdqa	("xmm2","xmm5");		# 2 : 1/iak
	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
	&pxor	("xmm2","xmm0");		# 2 = io
	&movdqa	("xmm3","xmm5");		# 3 : 1/jak
	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
	&pxor	("xmm3","xmm1");		# 3 = jo
	&movdqa	("xmm4",&QWP($k_sb1,$const));	# 4 : sb1u
	&pshufb	("xmm4","xmm2");		# 4 = sb1u
	&movdqa	("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
	&pshufb	("xmm0","xmm3");		# 0 = sb1t
	&pxor	("xmm0","xmm4");		# 0 = sbox output

	# add in smeared stuff
	&pxor	("xmm0","xmm7");
	&movdqa	("xmm7","xmm0");
	&ret	();
&function_end_B("_vpaes_schedule_round");

##
##  .aes_schedule_transform
##
##  Linear-transform %xmm0 according to tables at (%ebx)
##
##  The low nibble of every byte indexes the 16-byte table at 0(%ebx),
##  the high nibble the table at 16(%ebx); the two lookups are xored.
##
##  Output in %xmm0
##  Clobbers %xmm1, %xmm2
##
&function_begin_B("_vpaes_schedule_transform");
	&movdqa	("xmm2",&QWP($k_s0F,$const));	# 2 : nibble mask
	&movdqa	("xmm1","xmm2");
	&pandn	("xmm1","xmm0");
	&psrld	("xmm1",4);			# 1 = high nibbles
	&pand	("xmm0","xmm2");		# 0 = low nibbles
	&movdqa	("xmm2",&QWP(0,$base));
	&pshufb	("xmm2","xmm0");
	&movdqa	("xmm0",&QWP(16,$base));
	&pshufb	("xmm0","xmm1");
	&pxor	("xmm0","xmm2");
	&ret	();
&function_end_B("_vpaes_schedule_transform");

##
##  .aes_schedule_mangle
##
##  Mangle xmm0 from (basis-transformed) standard version
##  to our version.
##
##  On encrypt,
##    xor with 0x63
##    multiply by circulant 0,1,1,1
##    apply shiftrows transform
##
##  On decrypt,
##    xor with 0x63
##    multiply by "inverse mixcolumns" circulant E,B,D,9
##    deskew
##    apply shiftrows transform
##
##
##  Writes out to (%edx), and increments or decrements it
##  Keeps track of round number mod 4 in %ecx
##  Preserves xmm0
##  Clobbers xmm1-xmm5
##  The decrypt path additionally overwrites %esi (it is used as the
##  base pointer for the k_dks* tables).
##
&function_begin_B("_vpaes_schedule_mangle");
	&movdqa	("xmm4","xmm0");	# save xmm0 for later
	&movdqa	("xmm5",&QWP($k_mc_forward,$const));
	&test	($out,$out);
	&jnz	(&label("schedule_mangle_dec"));

	# encrypting
	&add	($key,16);
	&pxor	("xmm4",&QWP($k_s63,$const));
	&pshufb	("xmm4","xmm5");
	&movdqa	("xmm3","xmm4");
	&pshufb	("xmm4","xmm5");
	&pxor	("xmm3","xmm4");
	&pshufb	("xmm4","xmm5");
	&pxor	("xmm3","xmm4");

	&jmp	(&label("schedule_mangle_both"));

&set_label("schedule_mangle_dec",16);
	# inverse mix columns
	&movdqa	("xmm2",&QWP($k_s0F,$const));
	&lea	($inp,&DWP($k_dksd,$const));
	&movdqa	("xmm1","xmm2");
	&pandn	("xmm1","xmm4");
	&psrld	("xmm1",4);			# 1 = hi
	&pand	("xmm4","xmm2");		# 4 = lo

	&movdqa	("xmm2",&QWP(0,$inp));		# k_dksd
	&pshufb	("xmm2","xmm4");
	&movdqa	("xmm3",&QWP(0x10,$inp));
	&pshufb	("xmm3","xmm1");
	&pxor	("xmm3","xmm2");
	&pshufb	("xmm3","xmm5");

	&movdqa	("xmm2",&QWP(0x20,$inp));	# k_dksb
	&pshufb	("xmm2","xmm4");
	&pxor	("xmm2","xmm3");
	&movdqa	("xmm3",&QWP(0x30,$inp));
	&pshufb	("xmm3","xmm1");
	&pxor	("xmm3","xmm2");
	&pshufb	("xmm3","xmm5");

	&movdqa	("xmm2",&QWP(0x40,$inp));	# k_dkse
	&pshufb	("xmm2","xmm4");
	&pxor	("xmm2","xmm3");
	&movdqa	("xmm3",&QWP(0x50,$inp));
	&pshufb	("xmm3","xmm1");
	&pxor	("xmm3","xmm2");
	&pshufb	("xmm3","xmm5");

	&movdqa	("xmm2",&QWP(0x60,$inp));	# k_dks9
	&pshufb	("xmm2","xmm4");
	&pxor	("xmm2","xmm3");
	&movdqa	("xmm3",&QWP(0x70,$inp));
	&pshufb	("xmm3","xmm1");
	&pxor	("xmm3","xmm2");

	&add	($key,-16);

&set_label("schedule_mangle_both");
	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));
	&pshufb	("xmm3","xmm1");
	&add	($magic,-16);
	&and	($magic,0x30);
	&movdqu	(&QWP(0,$key),"xmm3");
	&ret	();
&function_end_B("_vpaes_schedule_mangle");

#
# Interface to OpenSSL
#

# ${PREFIX}_set_encrypt_key(inp, bits, key)
#
# Expands the user key at inp (bits = key length in bits) into the
# schedule at key, storing bits/32+5 as the round count at offset 240.
# Always returns 0 in %eax.
#
# Stack frame: %esp is lowered and aligned to 16 bytes for the scratch
# slots _vpaes_schedule_core uses; the original %esp is saved at
# 48(%esp) and restored before returning.
&function_begin("${PREFIX}_set_encrypt_key");
	&mov	($inp,&wparam(0));		# inp
	&lea	($base,&DWP(-56,"esp"));
	&mov	($round,&wparam(1));		# bits
	&and	($base,-16);
	&mov	($key,&wparam(2));		# key
	&xchg	($base,"esp");			# alloca
	&mov	(&DWP(48,"esp"),$base);

	&mov	($base,$round);
	&shr	($base,5);
	&add	($base,5);
	&mov	(&DWP(240,$key),$base);		# AES_KEY->rounds = nbits/32+5;
	&mov	($magic,0x30);
	&mov	($out,0);			# 0 = encrypting

	# The call pushes the address of pic_point; _vpaes_schedule_core
	# adds it to $const to obtain the address of _vpaes_consts+0x30.
	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
	&call	("_vpaes_schedule_core");
&set_label("pic_point");

	&mov	("esp",&DWP(48,"esp"));
	&xor	("eax","eax");
&function_end("${PREFIX}_set_encrypt_key");

# ${PREFIX}_set_decrypt_key(inp, bits, key)
#
# Same as the encrypt variant, except that $out is set to 1 (decrypt
# mode), $key is advanced to key+16+16*rounds because the decryption
# schedule is written downwards from there, and the initial k_sr offset
# in $magic depends on the key length.  Always returns 0 in %eax.
&function_begin("${PREFIX}_set_decrypt_key");
	&mov	($inp,&wparam(0));		# inp
	&lea	($base,&DWP(-56,"esp"));
	&mov	($round,&wparam(1));		# bits
	&and	($base,-16);
	&mov	($key,&wparam(2));		# key
	&xchg	($base,"esp");			# alloca
	&mov	(&DWP(48,"esp"),$base);

	&mov	($base,$round);
	&shr	($base,5);
	&add	($base,5);
	&mov	(&DWP(240,$key),$base);	# AES_KEY->rounds = nbits/32+5;
	&shl	($base,4);
	&lea	($key,&DWP(16,$key,$base));

	&mov	($out,1);			# non-zero = decrypting
	&mov	($magic,$round);
	&shr	($magic,1);
	&and	($magic,32);
	&xor	($magic,32);			# nbits==192?0:32;

	# The call pushes the address of pic_point; _vpaes_schedule_core
	# adds it to $const to obtain the address of _vpaes_consts+0x30.
	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
	&call	("_vpaes_schedule_core");
&set_label("pic_point");

	&mov	("esp",&DWP(48,"esp"));
	&xor	("eax","eax");
&function_end("${PREFIX}_set_decrypt_key");

# ${PREFIX}_encrypt(inp, out, key)
#
# Encrypts the single 16-byte block at inp into out using the schedule
# at key.  inp and out are accessed with unaligned loads/stores.
&function_begin("${PREFIX}_encrypt");
	# _vpaes_preheat must be called before %esp is realigned: it reads
	# its own return address (pic_point) from the stack.
	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
	&call	("_vpaes_preheat");
&set_label("pic_point");
	&mov	($inp,&wparam(0));		# inp
	&lea	($base,&DWP(-56,"esp"));
	&mov	($out,&wparam(1));		# out
	&and	($base,-16);
	&mov	($key,&wparam(2));		# key
	&xchg	($base,"esp");			# alloca
	&mov	(&DWP(48,"esp"),$base);		# save original %esp

	&movdqu	("xmm0",&QWP(0,$inp));
	&call	("_vpaes_encrypt_core");
	&movdqu	(&QWP(0,$out),"xmm0");

	&mov	("esp",&DWP(48,"esp"));
&function_end("${PREFIX}_encrypt");

# ${PREFIX}_decrypt(inp, out, key)
#
# Decrypts the single 16-byte block at inp into out using the schedule
# at key.  inp and out are accessed with unaligned loads/stores.
&function_begin("${PREFIX}_decrypt");
	# _vpaes_preheat must be called before %esp is realigned: it reads
	# its own return address (pic_point) from the stack.
	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
	&call	("_vpaes_preheat");
&set_label("pic_point");
	&mov	($inp,&wparam(0));		# inp
	&lea	($base,&DWP(-56,"esp"));
	&mov	($out,&wparam(1));		# out
	&and	($base,-16);
	&mov	($key,&wparam(2));		# key
	&xchg	($base,"esp");			# alloca
	&mov	(&DWP(48,"esp"),$base);		# save original %esp

	&movdqu	("xmm0",&QWP(0,$inp));
	&call	("_vpaes_decrypt_core");
	&movdqu	(&QWP(0,$out),"xmm0");

	&mov	("esp",&DWP(48,"esp"));
&function_end("${PREFIX}_decrypt");

# ${PREFIX}_cbc_encrypt(inp, out, len, key, ivp, enc)
#
# CBC-mode encryption (enc != 0) or decryption (enc == 0) of len bytes.
# Only whole 16-byte blocks are processed: if len < 16 the function
# returns without touching anything, and trailing bytes of a partial
# final block are ignored.  The chaining value is written back to ivp
# on exit.
#
# Stack frame after alignment:
#    0(%esp)  out-inp (output is addressed as inp + this difference)
#    4(%esp)  key
#    8(%esp)  ivp
#   16(%esp)  current IV          (decrypt only)
#   32(%esp)  next IV = ciphertext (decrypt only)
#   48(%esp)  caller's %esp
# $out is reused as the remaining-length counter inside the loops.
&function_begin("${PREFIX}_cbc_encrypt");
	&mov	($inp,&wparam(0));		# inp
	&mov	($out,&wparam(1));		# out
	&mov	($round,&wparam(2));		# len
	&mov	($key,&wparam(3));		# key
	&sub	($round,16);
	&jc	(&label("cbc_abort"));		# len < 16: nothing to do
	&lea	($base,&DWP(-56,"esp"));
	&mov	($const,&wparam(4));		# ivp
	&and	($base,-16);
	&mov	($magic,&wparam(5));		# enc
	&xchg	($base,"esp");			# alloca
	&movdqu	("xmm1",&QWP(0,$const));	# load IV
	&sub	($out,$inp);
	&mov	(&DWP(48,"esp"),$base);

	&mov	(&DWP(0,"esp"),$out);		# save out
	&mov	(&DWP(4,"esp"),$key);		# save key
	&mov	(&DWP(8,"esp"),$const);		# save ivp
	&mov	($out,$round);			# $out works as $len

	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
	&call	("_vpaes_preheat");
&set_label("pic_point");
	&cmp	($magic,0);
	&je	(&label("cbc_dec_loop"));
	&jmp	(&label("cbc_enc_loop"));

&set_label("cbc_enc_loop",16);
	&movdqu	("xmm0",&QWP(0,$inp));		# load input
	&pxor	("xmm0","xmm1");		# inp^=iv
	&call	("_vpaes_encrypt_core");
	&mov	($base,&DWP(0,"esp"));		# restore out
	&mov	($key,&DWP(4,"esp"));		# restore key
	&movdqa	("xmm1","xmm0");		# ciphertext is the next IV
	&movdqu	(&QWP(0,$base,$inp),"xmm0");	# write output
	&lea	($inp,&DWP(16,$inp));
	&sub	($out,16);
	&jnc	(&label("cbc_enc_loop"));
	&jmp	(&label("cbc_done"));

&set_label("cbc_dec_loop",16);
	&movdqu	("xmm0",&QWP(0,$inp));		# load input
	&movdqa	(&QWP(16,"esp"),"xmm1");	# save IV
	&movdqa	(&QWP(32,"esp"),"xmm0");	# save future IV
	&call	("_vpaes_decrypt_core");
	&mov	($base,&DWP(0,"esp"));		# restore out
	&mov	($key,&DWP(4,"esp"));		# restore key
	&pxor	("xmm0",&QWP(16,"esp"));	# out^=iv
	&movdqa	("xmm1",&QWP(32,"esp"));	# load next IV
	&movdqu	(&QWP(0,$base,$inp),"xmm0");	# write output
	&lea	($inp,&DWP(16,$inp));
	&sub	($out,16);
	&jnc	(&label("cbc_dec_loop"));

&set_label("cbc_done");
	&mov	($base,&DWP(8,"esp"));		# restore ivp
	&mov	("esp",&DWP(48,"esp"));
	&movdqu	(&QWP(0,$base),"xmm1");		# write IV
&set_label("cbc_abort");
&function_end("${PREFIX}_cbc_encrypt");

&asm_finish();

# Buffered write errors (disk full, ...) only surface when the handle
# is closed; fail the build rather than leave a truncated output file.
close STDOUT or die "error closing STDOUT: $!";
