#! /usr/bin/env perl
# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.

######################################################################
# September 2011.
#
# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
# doesn't handle partial vectors (doesn't have to if called from
# EVP only). "Drop-in" implies that this module doesn't share key
# schedule structure with the original nor does it make assumption
# about its alignment...
#
# Performance summary. aes-586.pl column lists large-block CBC
# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
# byte processed with 128-bit key, and vpaes-x86.pl column - [also
# large-block CBC] encrypt/decrypt.
#
#		aes-586.pl		vpaes-x86.pl
#
# Core 2(**)	28.1/41.4/18.3		21.9/25.2(***)
# Nehalem	27.9/40.4/18.1		10.2/11.9
# Atom		70.7/92.1/60.1		61.1/75.4(***)
# Silvermont	45.4/62.9/24.1		49.2/61.1(***)
#
# (*)	"Hyper-threading" in the context refers rather to cache shared
#	among multiple cores, than to specifically Intel HTT. As vast
#	majority of contemporary cores share cache, slower code path
#	is commonplace. In other words "with-hyper-threading-off"
#	results are presented mostly for reference purposes.
#
# (**)	"Core 2" refers to initial 65nm design, a.k.a. Conroe.
#
# (***)	Less impressive improvement on Core 2 and Atom is due to slow
#	pshufb,	yet it's respectable +28%/64% improvement on Core 2
#	and +15% on Atom (as implied, over "hyper-threading-safe"
#	code path).
#
#						<appro@openssl.org>

# Derive the directory this script lives in (empty when $0 has no directory
# part) so that x86asm.pl can be loaded either from that directory or from
# the perlasm directory three levels above it.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file.  STDOUT is aliased
# to it, so everything printed from here on lands in that file.
$output = pop;
defined($output) or die "$0: missing output file argument\n";
open OUT,">",$output or die "$0: can't open $output for writing: $!\n";
*STDOUT=*OUT;

# $x86only is true when the (new) last argument is the literal "386".
# NOTE(review): the meaning of both asm_init arguments is defined by
# x86asm.pl, which is not visible here.
&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386");

# Prefix of every exported symbol generated below.
$PREFIX="vpaes";

# Symbolic names for the general-purpose registers used by all routines.
my  ($round, $base, $magic, $key, $const, $inp, $out)=
    ("eax",  "ebx", "ecx",  "edx","ebp",  "esi","edi");

# Each directive is its own statement.  Without the terminating semicolon
# Perl would parse two adjacent "&name(...)" calls as one bitwise-and
# expression.
&preprocessor_ifdef("BORINGSSL_DISPATCH_TEST");
&external_label("BORINGSSL_function_hit");
&preprocessor_endif();
&static_label("_vpaes_consts");
&static_label("_vpaes_schedule_low_round");

##
##  Constant pool.
##
##  Each $k_* value is the byte offset of a table relative to
##  _vpaes_consts+0x30, which is the address the entry points below
##  establish in $const (they load "_vpaes_consts+0x30-pic_point" and the
##  callee adds its own return address).  The first table therefore sits
##  at offset -0x30.
##
&set_label("_vpaes_consts",64);
$k_inv=-0x30;		# inv, inva
	&data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
	&data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);

$k_s0F=-0x10;		# s0F
	&data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);

$k_ipt=0x00;		# input transform (lo, hi)
	&data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
	&data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);

$k_sb1=0x20;		# sb1u, sb1t
	&data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
	&data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
$k_sb2=0x40;		# sb2u, sb2t
	&data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
	&data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
$k_sbo=0x60;		# sbou, sbot
	&data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
	&data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);

$k_mc_forward=0x80;	# mc_forward
	&data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
	&data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
	&data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
	&data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);

$k_mc_backward=0xc0;	# mc_backward
	&data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
	&data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
	&data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
	&data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);

$k_sr=0x100;		# sr
	&data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
	&data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
	&data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
	&data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);

$k_rcon=0x140;		# rcon
	&data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);

$k_s63=0x150;		# s63: all equal to 0x63 transformed
	&data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);

$k_opt=0x160;		# output transform
	&data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
	&data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);

$k_deskew=0x180;	# deskew tables: inverts the sbox's "skew"
	&data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
	&data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
##
##  Decryption stuff
##  Key schedule constants
##
$k_dksd=0x1a0;		# decryption key schedule: invskew x*D
	&data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
	&data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
$k_dksb=0x1c0;		# decryption key schedule: invskew x*B
	&data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
	&data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
$k_dkse=0x1e0;		# decryption key schedule: invskew x*E + 0x63
	&data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
	&data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
$k_dks9=0x200;		# decryption key schedule: invskew x*9
	&data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
	&data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);

##
##  Decryption stuff
##  Round function constants
##
$k_dipt=0x220;		# decryption input transform
	&data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
	&data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);

$k_dsb9=0x240;		# decryption sbox output *9*u, *9*t
	&data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
	&data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
$k_dsbd=0x260;		# decryption sbox output *D*u, *D*t
	&data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
	&data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
$k_dsbb=0x280;		# decryption sbox output *B*u, *B*t
	&data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
	&data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
$k_dsbe=0x2a0;		# decryption sbox output *E*u, *E*t
	&data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
	&data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
$k_dsbo=0x2c0;		# decryption sbox final output
	&data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
	&data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
&asciz	("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
&align	(64);

##
##  _vpaes_preheat
##
##  Turns the position-independent offset in %ebp into the absolute
##  address of the constant pool by adding the caller's return address
##  (found at the top of the stack), then loads the two constants every
##  round needs.
##
##  Inputs:
##     %ebp = _vpaes_consts+0x30 minus the address following the call
##
##  Outputs:
##     %ebp  = address of _vpaes_consts+0x30
##     %xmm7 = k_inv table
##     %xmm6 = k_s0F mask
##
&function_begin_B("_vpaes_preheat");
	&add	($const,&DWP(0,"esp"));
	&movdqa	("xmm7",&QWP($k_inv,$const));
	&movdqa	("xmm6",&QWP($k_s0F,$const));
	&ret	();
&function_end_B("_vpaes_preheat");

##
##  _aes_encrypt_core
##
##  AES-encrypt %xmm0.
##
##  Inputs:
##     %xmm0 = input
##     %xmm6-%xmm7 as in _vpaes_preheat
##     %ebp  = constant pool pointer as left by _vpaes_preheat
##    (%edx) = scheduled keys, round count at 240(%edx)
##
##  Output in %xmm0
##  Clobbers  %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
##
##  The loop is controlled by the flags of "sub $round,1" in the loop
##  body; every instruction between that and the jnz at the bottom of
##  enc_entry is an SSE or mov-class instruction that leaves EFLAGS
##  untouched, so the instruction order must not be disturbed.
##
&function_begin_B("_vpaes_encrypt_core");
	&mov	($magic,16);
	&mov	($round,&DWP(240,$key));
	&movdqa	("xmm1","xmm6");
	&movdqa	("xmm2",&QWP($k_ipt,$const));
	&pandn	("xmm1","xmm0");
	&pand	("xmm0","xmm6");
	&movdqu	("xmm5",&QWP(0,$key));
	&pshufb	("xmm2","xmm0");
	&movdqa	("xmm0",&QWP($k_ipt+16,$const));
	&pxor	("xmm2","xmm5");
	&psrld	("xmm1",4);
	&add	($key,16);
	&pshufb	("xmm0","xmm1");
	&lea	($base,&DWP($k_mc_backward,$const));
	&pxor	("xmm0","xmm2");
	&jmp	(&label("enc_entry"));


&set_label("enc_loop",16);
	# middle of middle round
	&movdqa	("xmm4",&QWP($k_sb1,$const));	# 4 : sb1u
	&movdqa	("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
	&pshufb	("xmm4","xmm2");		# 4 = sb1u
	&pshufb	("xmm0","xmm3");		# 0 = sb1t
	&pxor	("xmm4","xmm5");		# 4 = sb1u + k
	&movdqa	("xmm5",&QWP($k_sb2,$const));	# 5 : sb2u
	&pxor	("xmm0","xmm4");		# 0 = A
	&movdqa	("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
	&pshufb	("xmm5","xmm2");		# 5 = sb2u
	&movdqa	("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
	&movdqa	("xmm4",&QWP(0,$base,$magic));	# .Lk_mc_backward[]
	&pshufb	("xmm2","xmm3");		# 2 = sb2t
	&movdqa	("xmm3","xmm0");		# 3 = A
	&pxor	("xmm2","xmm5");		# 2 = 2A
	&pshufb	("xmm0","xmm1");		# 0 = B
	&add	($key,16);			# next key
	&pxor	("xmm0","xmm2");		# 0 = 2A+B
	&pshufb	("xmm3","xmm4");		# 3 = D
	&add	($magic,16);			# next mc
	&pxor	("xmm3","xmm0");		# 3 = 2A+B+D
	&pshufb	("xmm0","xmm1");		# 0 = 2B+C
	&and	($magic,0x30);			# ... mod 4
	&sub	($round,1);			# nr--
	&pxor	("xmm0","xmm3");		# 0 = 2A+3B+C+D

&set_label("enc_entry");
	# top of round
	&movdqa	("xmm1","xmm6");		# 1 : i
	&movdqa	("xmm5",&QWP($k_inv+16,$const));# 5 : a/k
	&pandn	("xmm1","xmm0");		# 1 = i<<4
	&psrld	("xmm1",4);			# 1 = i
	&pand	("xmm0","xmm6");		# 0 = k
	&pshufb	("xmm5","xmm0");		# 5 = a/k
	&movdqa	("xmm3","xmm7");		# 3 : 1/i
	&pxor	("xmm0","xmm1");		# 0 = j
	&pshufb	("xmm3","xmm1");		# 3 = 1/i
	&movdqa	("xmm4","xmm7");		# 4 : 1/j
	&pxor	("xmm3","xmm5");		# 3 = iak = 1/i + a/k
	&pshufb	("xmm4","xmm0");		# 4 = 1/j
	&movdqa	("xmm2","xmm7");		# 2 : 1/iak
	&pxor	("xmm4","xmm5");		# 4 = jak = 1/j + a/k
	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
	&movdqa	("xmm3","xmm7");		# 3 : 1/jak
	&pxor	("xmm2","xmm0");		# 2 = io
	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
	&movdqu	("xmm5",&QWP(0,$key));		# 5 = next round key
	&pxor	("xmm3","xmm1");		# 3 = jo
	&jnz	(&label("enc_loop"));		# flags from "sub $round,1"

	# middle of last round
	&movdqa	("xmm4",&QWP($k_sbo,$const));	# 4 : sbou      .Lk_sbo
	&movdqa	("xmm0",&QWP($k_sbo+16,$const));# 0 : sbot      .Lk_sbo+16
	&pshufb	("xmm4","xmm2");		# 4 = sbou
	&pxor	("xmm4","xmm5");		# 4 = sbou + k
	&pshufb	("xmm0","xmm3");		# 0 = sbot
	&movdqa	("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
	&pxor	("xmm0","xmm4");		# 0 = A
	&pshufb	("xmm0","xmm1");
	&ret	();
&function_end_B("_vpaes_encrypt_core");

##
##  Decryption core
##
##  Same API as encryption core: input and output in %xmm0, %xmm6-%xmm7
##  and %ebp as left by _vpaes_preheat, (%edx) = scheduled keys with the
##  round count at 240(%edx).
##
##  Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
##
##  %ebx is pointed at k_dsbd so that the sb9/sbd/sbb/sbe/sbo tables are
##  reachable with small displacements (-0x20 .. 0x70); %ecx ends up
##  holding the address of the k_sr row used for the final permutation.
##
##  As in the encryption core, the jnz at the bottom of dec_entry
##  consumes the flags of "sub $round,1"; nothing in between modifies
##  EFLAGS.
##
&function_begin_B("_vpaes_decrypt_core");
	&lea	($base,&DWP($k_dsbd,$const));
	&mov	($round,&DWP(240,$key));
	&movdqa	("xmm1","xmm6");
	&movdqa	("xmm2",&QWP($k_dipt-$k_dsbd,$base));
	&pandn	("xmm1","xmm0");
	&mov	($magic,$round);
	&psrld	("xmm1",4);
	&movdqu	("xmm5",&QWP(0,$key));
	&shl	($magic,4);
	&pand	("xmm0","xmm6");
	&pshufb	("xmm2","xmm0");
	&movdqa	("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));
	&xor	($magic,0x30);
	&pshufb	("xmm0","xmm1");
	&and	($magic,0x30);
	&pxor	("xmm2","xmm5");
	&movdqa	("xmm5",&QWP($k_mc_forward+48,$const));
	&pxor	("xmm0","xmm2");
	&add	($key,16);
	&lea	($magic,&DWP($k_sr-$k_dsbd,$base,$magic));
	&jmp	(&label("dec_entry"));

&set_label("dec_loop",16);
##
##  Inverse mix columns
##
	&movdqa	("xmm4",&QWP(-0x20,$base));	# 4 : sb9u
	&movdqa	("xmm1",&QWP(-0x10,$base));	# 1 : sb9t
	&pshufb	("xmm4","xmm2");		# 4 = sb9u
	&pshufb	("xmm1","xmm3");		# 1 = sb9t
	&pxor	("xmm0","xmm4");
	&movdqa	("xmm4",&QWP(0,$base));		# 4 : sbdu
	&pxor	("xmm0","xmm1");		# 0 = ch
	&movdqa	("xmm1",&QWP(0x10,$base));	# 1 : sbdt

	&pshufb	("xmm4","xmm2");		# 4 = sbdu
	&pshufb	("xmm0","xmm5");		# MC ch
	&pshufb	("xmm1","xmm3");		# 1 = sbdt
	&pxor	("xmm0","xmm4");		# 0 = ch
	&movdqa	("xmm4",&QWP(0x20,$base));	# 4 : sbbu
	&pxor	("xmm0","xmm1");		# 0 = ch
	&movdqa	("xmm1",&QWP(0x30,$base));	# 1 : sbbt

	&pshufb	("xmm4","xmm2");		# 4 = sbbu
	&pshufb	("xmm0","xmm5");		# MC ch
	&pshufb	("xmm1","xmm3");		# 1 = sbbt
	&pxor	("xmm0","xmm4");		# 0 = ch
	&movdqa	("xmm4",&QWP(0x40,$base));	# 4 : sbeu
	&pxor	("xmm0","xmm1");		# 0 = ch
	&movdqa	("xmm1",&QWP(0x50,$base));	# 1 : sbet

	&pshufb	("xmm4","xmm2");		# 4 = sbeu
	&pshufb	("xmm0","xmm5");		# MC ch
	&pshufb	("xmm1","xmm3");		# 1 = sbet
	&pxor	("xmm0","xmm4");		# 0 = ch
	&add	($key,16);			# next round key
	&palignr("xmm5","xmm5",12);
	&pxor	("xmm0","xmm1");		# 0 = ch
	&sub	($round,1);			# nr--

&set_label("dec_entry");
	# top of round
	&movdqa	("xmm1","xmm6");		# 1 : i
	&movdqa	("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
	&pandn	("xmm1","xmm0");		# 1 = i<<4
	&pand	("xmm0","xmm6");		# 0 = k
	&psrld	("xmm1",4);			# 1 = i
	&pshufb	("xmm2","xmm0");		# 2 = a/k
	&movdqa	("xmm3","xmm7");		# 3 : 1/i
	&pxor	("xmm0","xmm1");		# 0 = j
	&pshufb	("xmm3","xmm1");		# 3 = 1/i
	&movdqa	("xmm4","xmm7");		# 4 : 1/j
	&pxor	("xmm3","xmm2");		# 3 = iak = 1/i + a/k
	&pshufb	("xmm4","xmm0");		# 4 = 1/j
	&pxor	("xmm4","xmm2");		# 4 = jak = 1/j + a/k
	&movdqa	("xmm2","xmm7");		# 2 : 1/iak
	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
	&movdqa	("xmm3","xmm7");		# 3 : 1/jak
	&pxor	("xmm2","xmm0");		# 2 = io
	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
	&movdqu	("xmm0",&QWP(0,$key));		# 0 = next round key
	&pxor	("xmm3","xmm1");		# 3 = jo
	&jnz	(&label("dec_loop"));		# flags from "sub $round,1"

	# middle of last round
	&movdqa	("xmm4",&QWP(0x60,$base));	# 4 : sbou
	&pshufb	("xmm4","xmm2");		# 4 = sbou
	&pxor	("xmm4","xmm0");		# 4 = sbou + k
	&movdqa	("xmm0",&QWP(0x70,$base));	# 0 : sbot
	&movdqa	("xmm2",&QWP(0,$magic));	# 2 : k_sr row selected at entry
	&pshufb	("xmm0","xmm3");		# 0 = sbot
	&pxor	("xmm0","xmm4");		# 0 = A
	&pshufb	("xmm0","xmm2");
	&ret	();
&function_end_B("_vpaes_decrypt_core");

########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################
##
##  _vpaes_schedule_core
##
##  Expands the user key at (%esi) into the schedule at (%edx).
##
##  Inputs (as set up by the set_*_key wrappers):
##     %eax  = key length in bits
##     %ecx  = initial offset into the k_sr table
##     %edx  = where the first round key is written
##     %edi  = zero for an encryption schedule, non-zero for decryption
##     %esi  = user key
##     %ebp  = PIC offset, turned into the constant pool address here
##
##  The rcon state (the %xmm8 of the commented names) lives in the
##  16-byte stack slot at 4(%esp); the 256-bit path additionally parks
##  %xmm7 at 20(%esp).  Both slots belong to the caller's frame.
##
##  All of %xmm0-%xmm7 are zeroed before returning.
##
&function_begin_B("_vpaes_schedule_core");
	&add	($const,&DWP(0,"esp"));
	&movdqu	("xmm0",&QWP(0,$inp));		# load key (unaligned)
	&movdqa	("xmm2",&QWP($k_rcon,$const));	# load rcon

	# input transform
	&movdqa	("xmm3","xmm0");
	&lea	($base,&DWP($k_ipt,$const));
	&movdqa	(&QWP(4,"esp"),"xmm2");		# xmm8
	&call	("_vpaes_schedule_transform");
	&movdqa	("xmm7","xmm0");

	&test	($out,$out);
	&jnz	(&label("schedule_am_decrypting"));

	# encrypting, output zeroth round key after transform
	&movdqu	(&QWP(0,$key),"xmm0");
	&jmp	(&label("schedule_go"));

&set_label("schedule_am_decrypting");
	# decrypting, output zeroth round key after shiftrows
	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));
	&pshufb	("xmm3","xmm1");
	&movdqu	(&QWP(0,$key),"xmm3");
	&xor	($magic,0x30);

&set_label("schedule_go");
	&cmp	($round,192);
	&ja	(&label("schedule_256"));
	&je	(&label("schedule_192"));
	# 128: fall through

##
##  .schedule_128
##
##  128-bit specific part of key schedule.
##
##  This schedule is really simple, because all its parts
##  are accomplished by the subroutines.
##
&set_label("schedule_128");
	&mov	($round,10);

&set_label("loop_schedule_128");
	&call	("_vpaes_schedule_round");
	&dec	($round);
	&jz	(&label("schedule_mangle_last"));
	&call	("_vpaes_schedule_mangle");	# write output
	&jmp	(&label("loop_schedule_128"));

##
##  .aes_schedule_192
##
##  192-bit specific part of key schedule.
##
##  The main body of this schedule is the same as the 128-bit
##  schedule, but with more smearing.  The long, high side is
##  stored in %xmm7 as before, and the short, low side is in
##  the high bits of %xmm6.
##
##  This schedule is somewhat nastier, however, because each
##  round produces 192 bits of key material, or 1.5 round keys.
##  Therefore, on each cycle we do 2 rounds and produce 3 round
##  keys.
##
&set_label("schedule_192",16);
	&movdqu	("xmm0",&QWP(8,$inp));		# load key part 2 (very unaligned)
	&call	("_vpaes_schedule_transform");	# input transform
	&movdqa	("xmm6","xmm0");		# save short part
	&pxor	("xmm4","xmm4");		# clear 4
	&movhlps("xmm6","xmm4");		# clobber low side with zeros
	&mov	($round,4);

&set_label("loop_schedule_192");
	&call	("_vpaes_schedule_round");
	&palignr("xmm0","xmm6",8);
	&call	("_vpaes_schedule_mangle");	# save key n
	&call	("_vpaes_schedule_192_smear");
	&call	("_vpaes_schedule_mangle");	# save key n+1
	&call	("_vpaes_schedule_round");
	&dec	($round);
	&jz	(&label("schedule_mangle_last"));
	&call	("_vpaes_schedule_mangle");	# save key n+2
	&call	("_vpaes_schedule_192_smear");
	&jmp	(&label("loop_schedule_192"));

##
##  .aes_schedule_256
##
##  256-bit specific part of key schedule.
##
##  The structure here is very similar to the 128-bit
##  schedule, but with an additional "low side" in
##  %xmm6.  The low side's rounds are the same as the
##  high side's, except no rcon and no rotation.
##
&set_label("schedule_256",16);
	&movdqu	("xmm0",&QWP(16,$inp));		# load key part 2 (unaligned)
	&call	("_vpaes_schedule_transform");	# input transform
	&mov	($round,7);

&set_label("loop_schedule_256");
	&call	("_vpaes_schedule_mangle");	# output low result
	&movdqa	("xmm6","xmm0");		# save cur_lo in xmm6

	# high round
	&call	("_vpaes_schedule_round");
	&dec	($round);
	&jz	(&label("schedule_mangle_last"));
	&call	("_vpaes_schedule_mangle");

	# low round. swap xmm7 and xmm6
	&pshufd	("xmm0","xmm0",0xFF);
	&movdqa	(&QWP(20,"esp"),"xmm7");	# park high side on the stack
	&movdqa	("xmm7","xmm6");
	&call	("_vpaes_schedule_low_round");
	&movdqa	("xmm7",&QWP(20,"esp"));	# restore high side

	&jmp	(&label("loop_schedule_256"));

##
##  .aes_schedule_mangle_last
##
##  Mangler for last round of key schedule
##  Mangles %xmm0
##    when encrypting, outputs out(%xmm0) ^ 63
##    when decrypting, outputs unskew(%xmm0)
##
##  Always called right before return... jumps to cleanup and exits
##
&set_label("schedule_mangle_last",16);
	# schedule last round key from xmm0
	&lea	($base,&DWP($k_deskew,$const));
	&test	($out,$out);
	&jnz	(&label("schedule_mangle_last_dec"));

	# encrypting
	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));
	&pshufb	("xmm0","xmm1");		# output permute
	&lea	($base,&DWP($k_opt,$const));	# prepare to output transform
	&add	($key,32);

&set_label("schedule_mangle_last_dec");
	&add	($key,-16);
	&pxor	("xmm0",&QWP($k_s63,$const));
	&call	("_vpaes_schedule_transform");	# output transform
	&movdqu	(&QWP(0,$key),"xmm0");		# save last key

	# cleanup
	&pxor	("xmm0","xmm0");
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&pxor	("xmm3","xmm3");
	&pxor	("xmm4","xmm4");
	&pxor	("xmm5","xmm5");
	&pxor	("xmm6","xmm6");
	&pxor	("xmm7","xmm7");
	&ret	();
&function_end_B("_vpaes_schedule_core");

##
##  .aes_schedule_192_smear
##
##  Smear the short, low side in the 192-bit key schedule.
##
##  Inputs:
##    %xmm7: high side, b  a  x  y
##    %xmm6:  low side, d  c  0  0
##
##  Outputs:
##    %xmm6: b+c+d  b+c  0  0
##    %xmm0: b+c+d  b+c  b  a
##
##  Clobbers %xmm1, which is zeroed here and used as the zero source
##  for the final movhlps.
##
&function_begin_B("_vpaes_schedule_192_smear");
	&pshufd	("xmm1","xmm6",0x80);		# d c 0 0 -> c 0 0 0
	&pshufd	("xmm0","xmm7",0xFE);		# b a _ _ -> b b b a
	&pxor	("xmm6","xmm1");		# -> c+d c 0 0
	&pxor	("xmm1","xmm1");		# 1 = 0
	&pxor	("xmm6","xmm0");		# -> b+c+d b+c b a
	&movdqa	("xmm0","xmm6");
	&movhlps("xmm6","xmm1");		# clobber low side with zeros
	&ret	();
&function_end_B("_vpaes_schedule_192_smear");

##
##  .aes_schedule_round
##
##  Runs one main round of the key schedule on %xmm0, %xmm7
##
##  Specifically, runs subbytes on the high dword of %xmm0
##  then rotates it by one byte and xors into the low dword of
##  %xmm7.
##
##  Adds rcon from the rcon state, then rotates that state for the
##  next rcon.  The state (labelled "xmm8" in the comments) is kept in
##  the stack slot that _vpaes_schedule_core stores at 4(%esp); it is
##  addressed as 8(%esp) here because of this routine's own return
##  address.
##
##  Smears the dwords of %xmm7 by xoring the low into the
##  second low, result into third, result into highest.
##
##  Returns results in %xmm7 = %xmm0.
##  Clobbers %xmm1-%xmm5.
##
##  _vpaes_schedule_low_round is a second entry point that skips the
##  rcon and rotation steps.
##
&function_begin_B("_vpaes_schedule_round");
	# extract rcon from xmm8
	&movdqa	("xmm2",&QWP(8,"esp"));		# xmm8
	&pxor	("xmm1","xmm1");
	&palignr("xmm1","xmm2",15);
	&palignr("xmm2","xmm2",15);
	&pxor	("xmm7","xmm1");

	# rotate
	&pshufd	("xmm0","xmm0",0xFF);
	&palignr("xmm0","xmm0",1);

	# fall through...
	&movdqa	(&QWP(8,"esp"),"xmm2");		# xmm8

	# low round: same as high round, but no rotation and no rcon.
&set_label("_vpaes_schedule_low_round");
	# smear xmm7
	&movdqa	("xmm1","xmm7");
	&pslldq	("xmm7",4);
	&pxor	("xmm7","xmm1");
	&movdqa	("xmm1","xmm7");
	&pslldq	("xmm7",8);
	&pxor	("xmm7","xmm1");
	&pxor	("xmm7",&QWP($k_s63,$const));

	# subbyte
	&movdqa	("xmm4",&QWP($k_s0F,$const));	# 4 : s0F
	&movdqa	("xmm5",&QWP($k_inv,$const));	# 5 : inv
	&movdqa	("xmm1","xmm4");
	&pandn	("xmm1","xmm0");
	&psrld	("xmm1",4);			# 1 = i
	&pand	("xmm0","xmm4");		# 0 = k
	&movdqa	("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
	&pshufb	("xmm2","xmm0");		# 2 = a/k
	&pxor	("xmm0","xmm1");		# 0 = j
	&movdqa	("xmm3","xmm5");		# 3 : 1/i
	&pshufb	("xmm3","xmm1");		# 3 = 1/i
	&pxor	("xmm3","xmm2");		# 3 = iak = 1/i + a/k
	&movdqa	("xmm4","xmm5");		# 4 : 1/j
	&pshufb	("xmm4","xmm0");		# 4 = 1/j
	&pxor	("xmm4","xmm2");		# 4 = jak = 1/j + a/k
	&movdqa	("xmm2","xmm5");		# 2 : 1/iak
	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
	&pxor	("xmm2","xmm0");		# 2 = io
	&movdqa	("xmm3","xmm5");		# 3 : 1/jak
	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
	&pxor	("xmm3","xmm1");		# 3 = jo
	&movdqa	("xmm4",&QWP($k_sb1,$const));	# 4 : sb1u
	&pshufb	("xmm4","xmm2");		# 4 = sb1u
	&movdqa	("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
	&pshufb	("xmm0","xmm3");		# 0 = sb1t
	&pxor	("xmm0","xmm4");		# 0 = sbox output

	# add in smeared stuff
	&pxor	("xmm0","xmm7");
	&movdqa	("xmm7","xmm0");
	&ret	();
&function_end_B("_vpaes_schedule_round");

##
##  .aes_schedule_transform
##
##  Linear-transform %xmm0 according to tables at (%ebx)
##
##  Splits every byte of %xmm0 into its low and high nibble, looks the
##  low nibbles up in the table at 0(%ebx) and the high nibbles in the
##  table at 16(%ebx), and xors the two results.
##
##  Requires %ebp to point at the constant pool (the nibble mask k_s0F
##  is read through it).
##
##  Output in %xmm0
##  Clobbers %xmm1, %xmm2
##
&function_begin_B("_vpaes_schedule_transform");
	&movdqa	("xmm2",&QWP($k_s0F,$const));
	&movdqa	("xmm1","xmm2");
	&pandn	("xmm1","xmm0");
	&psrld	("xmm1",4);			# 1 = high nibbles
	&pand	("xmm0","xmm2");		# 0 = low nibbles
	&movdqa	("xmm2",&QWP(0,$base));
	&pshufb	("xmm2","xmm0");
	&movdqa	("xmm0",&QWP(16,$base));
	&pshufb	("xmm0","xmm1");
	&pxor	("xmm0","xmm2");
	&ret	();
&function_end_B("_vpaes_schedule_transform");

##
##  .aes_schedule_mangle
##
##  Mangle xmm0 from (basis-transformed) standard version
##  to our version.
##
##  On encrypt,
##    xor with 0x63
##    multiply by circulant 0,1,1,1
##    apply shiftrows transform
##
##  On decrypt,
##    xor with 0x63
##    multiply by "inverse mixcolumns" circulant E,B,D,9
##    deskew
##    apply shiftrows transform
##
##
##  Writes out to (%edx), and increments or decrements it
##  Keeps track of round number mod 4 in %ecx
##  Preserves xmm0
##  Clobbers xmm1-xmm5
##  On the decrypt path %esi is also overwritten (it is pointed at the
##  k_dksd tables).
##
&function_begin_B("_vpaes_schedule_mangle");
	&movdqa	("xmm4","xmm0");	# save xmm0 for later
	&movdqa	("xmm5",&QWP($k_mc_forward,$const));
	&test	($out,$out);
	&jnz	(&label("schedule_mangle_dec"));

	# encrypting
	&add	($key,16);
	&pxor	("xmm4",&QWP($k_s63,$const));
	&pshufb	("xmm4","xmm5");
	&movdqa	("xmm3","xmm4");
	&pshufb	("xmm4","xmm5");
	&pxor	("xmm3","xmm4");
	&pshufb	("xmm4","xmm5");
	&pxor	("xmm3","xmm4");

	&jmp	(&label("schedule_mangle_both"));

&set_label("schedule_mangle_dec",16);
	# inverse mix columns
	&movdqa	("xmm2",&QWP($k_s0F,$const));
	&lea	($inp,&DWP($k_dksd,$const));
	&movdqa	("xmm1","xmm2");
	&pandn	("xmm1","xmm4");
	&psrld	("xmm1",4);			# 1 = hi
	&pand	("xmm4","xmm2");		# 4 = lo

	&movdqa	("xmm2",&QWP(0,$inp));
	&pshufb	("xmm2","xmm4");
	&movdqa	("xmm3",&QWP(0x10,$inp));
	&pshufb	("xmm3","xmm1");
	&pxor	("xmm3","xmm2");
	&pshufb	("xmm3","xmm5");

	&movdqa	("xmm2",&QWP(0x20,$inp));
	&pshufb	("xmm2","xmm4");
	&pxor	("xmm2","xmm3");
	&movdqa	("xmm3",&QWP(0x30,$inp));
	&pshufb	("xmm3","xmm1");
	&pxor	("xmm3","xmm2");
	&pshufb	("xmm3","xmm5");

	&movdqa	("xmm2",&QWP(0x40,$inp));
	&pshufb	("xmm2","xmm4");
	&pxor	("xmm2","xmm3");
	&movdqa	("xmm3",&QWP(0x50,$inp));
	&pshufb	("xmm3","xmm1");
	&pxor	("xmm3","xmm2");
	&pshufb	("xmm3","xmm5");

	&movdqa	("xmm2",&QWP(0x60,$inp));
	&pshufb	("xmm2","xmm4");
	&pxor	("xmm2","xmm3");
	&movdqa	("xmm3",&QWP(0x70,$inp));
	&pshufb	("xmm3","xmm1");
	&pxor	("xmm3","xmm2");

	&add	($key,-16);

&set_label("schedule_mangle_both");
	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));
	&pshufb	("xmm3","xmm1");
	&add	($magic,-16);
	&and	($magic,0x30);
	&movdqu	(&QWP(0,$key),"xmm3");
	&ret	();
&function_end_B("_vpaes_schedule_mangle");

#
# Interface to OpenSSL
#

#
# ${PREFIX}_set_encrypt_key(inp, bits, key)
#
# Builds an encryption key schedule.  Stores bits/32+5 at offset 240 of
# the key structure, then runs _vpaes_schedule_core with %edi = 0
# (encrypting) and %ecx = 0x30.  Always returns 0 in %eax.
#
# A 16-byte aligned scratch frame is carved below the incoming stack
# pointer for the schedule core; the original %esp is kept at 48(%esp)
# and restored before returning.
#
&function_begin("${PREFIX}_set_encrypt_key");
	record_function_hit(5);

	&mov	($inp,&wparam(0));		# inp
	&lea	($base,&DWP(-56,"esp"));
	&mov	($round,&wparam(1));		# bits
	&and	($base,-16);
	&mov	($key,&wparam(2));		# key
	&xchg	($base,"esp");			# alloca
	&mov	(&DWP(48,"esp"),$base);		# save original esp

	&mov	($base,$round);
	&shr	($base,5);
	&add	($base,5);
	&mov	(&DWP(240,$key),$base);		# AES_KEY->rounds = nbits/32+5;
	&mov	($magic,0x30);
	&mov	($out,0);

	# $const receives a PIC offset; the callee adds its return address
	# (pic_point) to turn it into the address of _vpaes_consts+0x30.
	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
	&call	("_vpaes_schedule_core");
&set_label("pic_point");

	&mov	("esp",&DWP(48,"esp"));
	&xor	("eax","eax");
&function_end("${PREFIX}_set_encrypt_key");

#
# ${PREFIX}_set_decrypt_key(inp, bits, key)
#
# Builds a decryption key schedule.  Stores bits/32+5 at offset 240 of
# the key structure, points %edx at key + 16 + 16*rounds so that the
# schedule core can fill the round keys from that end downwards, and
# runs _vpaes_schedule_core with %edi = 1 (decrypting).  Always returns
# 0 in %eax.
#
# Uses the same aligned scratch frame as the encrypt-key wrapper: the
# original %esp is kept at 48(%esp) and restored before returning.
#
&function_begin("${PREFIX}_set_decrypt_key");
	&mov	($inp,&wparam(0));		# inp
	&lea	($base,&DWP(-56,"esp"));
	&mov	($round,&wparam(1));		# bits
	&and	($base,-16);
	&mov	($key,&wparam(2));		# key
	&xchg	($base,"esp");			# alloca
	&mov	(&DWP(48,"esp"),$base);		# save original esp

	&mov	($base,$round);
	&shr	($base,5);
	&add	($base,5);
	&mov	(&DWP(240,$key),$base);	# AES_KEY->rounds = nbits/32+5;
	&shl	($base,4);
	&lea	($key,&DWP(16,$key,$base));

	&mov	($out,1);
	&mov	($magic,$round);
	&shr	($magic,1);
	&and	($magic,32);
	&xor	($magic,32);			# nbits==192?0:32;

	# $const receives a PIC offset; the callee adds its return address
	# (pic_point) to turn it into the address of _vpaes_consts+0x30.
	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
	&call	("_vpaes_schedule_core");
&set_label("pic_point");

	&mov	("esp",&DWP(48,"esp"));
	&xor	("eax","eax");
&function_end("${PREFIX}_set_decrypt_key");

#
# ${PREFIX}_encrypt(inp, out, key)
#
# Encrypts the single 16-byte block at inp into out using the key
# schedule at key.  Both buffers are accessed with unaligned loads and
# stores.
#
&function_begin("${PREFIX}_encrypt");
	record_function_hit(4);

	# _vpaes_preheat adds its return address (pic_point) to $const,
	# leaving it pointing at _vpaes_consts+0x30, and loads xmm6/xmm7.
	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
	&call	("_vpaes_preheat");
&set_label("pic_point");
	&mov	($inp,&wparam(0));		# inp
	&lea	($base,&DWP(-56,"esp"));
	&mov	($out,&wparam(1));		# out
	&and	($base,-16);
	&mov	($key,&wparam(2));		# key
	&xchg	($base,"esp");			# alloca
	&mov	(&DWP(48,"esp"),$base);		# save original esp

	&movdqu	("xmm0",&QWP(0,$inp));
	&call	("_vpaes_encrypt_core");
	&movdqu	(&QWP(0,$out),"xmm0");

	&mov	("esp",&DWP(48,"esp"));
&function_end("${PREFIX}_encrypt");

#
# ${PREFIX}_decrypt(inp, out, key)
#
# Decrypts the single 16-byte block at inp into out using the key
# schedule at key.  Both buffers are accessed with unaligned loads and
# stores.
#
&function_begin("${PREFIX}_decrypt");
	# _vpaes_preheat adds its return address (pic_point) to $const,
	# leaving it pointing at _vpaes_consts+0x30, and loads xmm6/xmm7.
	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
	&call	("_vpaes_preheat");
&set_label("pic_point");
	&mov	($inp,&wparam(0));		# inp
	&lea	($base,&DWP(-56,"esp"));
	&mov	($out,&wparam(1));		# out
	&and	($base,-16);
	&mov	($key,&wparam(2));		# key
	&xchg	($base,"esp");			# alloca
	&mov	(&DWP(48,"esp"),$base);		# save original esp

	&movdqu	("xmm0",&QWP(0,$inp));
	&call	("_vpaes_decrypt_core");
	&movdqu	(&QWP(0,$out),"xmm0");

	&mov	("esp",&DWP(48,"esp"));
&function_end("${PREFIX}_decrypt");

#
# ${PREFIX}_cbc_encrypt(inp, out, len, key, ivp, enc)
#
# CBC-mode encryption (enc != 0) or decryption (enc == 0) of whole
# 16-byte blocks.  If len is below 16 the routine returns at once
# without touching out or ivp.  A trailing partial block is not
# processed.  On completion the last chaining value is written back
# to ivp.
#
# Scratch frame layout (16-byte aligned):
#    0(%esp)  out - inp, so that the output address is inp + this delta
#    4(%esp)  key
#    8(%esp)  ivp
#   16(%esp)  current IV            (decrypt only)
#   32(%esp)  next IV = ciphertext  (decrypt only)
#   48(%esp)  original %esp
#
&function_begin("${PREFIX}_cbc_encrypt");
	&mov	($inp,&wparam(0));		# inp
	&mov	($out,&wparam(1));		# out
	&mov	($round,&wparam(2));		# len
	&mov	($key,&wparam(3));		# key
	&sub	($round,16);
	&jc	(&label("cbc_abort"));		# len < 16: nothing to do
	&lea	($base,&DWP(-56,"esp"));
	&mov	($const,&wparam(4));		# ivp
	&and	($base,-16);
	&mov	($magic,&wparam(5));		# enc
	&xchg	($base,"esp");			# alloca
	&movdqu	("xmm1",&QWP(0,$const));	# load IV
	&sub	($out,$inp);
	&mov	(&DWP(48,"esp"),$base);		# save original esp

	&mov	(&DWP(0,"esp"),$out);		# save out
	&mov	(&DWP(4,"esp"),$key);		# save key
	&mov	(&DWP(8,"esp"),$const);		# save ivp
	&mov	($out,$round);			# $out works as $len

	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
	&call	("_vpaes_preheat");
&set_label("pic_point");
	&cmp	($magic,0);
	&je	(&label("cbc_dec_loop"));
	&jmp	(&label("cbc_enc_loop"));

&set_label("cbc_enc_loop",16);
	&movdqu	("xmm0",&QWP(0,$inp));		# load input
	&pxor	("xmm0","xmm1");		# inp^=iv
	&call	("_vpaes_encrypt_core");
	&mov	($base,&DWP(0,"esp"));		# restore out
	&mov	($key,&DWP(4,"esp"));		# restore key
	&movdqa	("xmm1","xmm0");		# ciphertext is the next IV
	&movdqu	(&QWP(0,$base,$inp),"xmm0");	# write output
	&lea	($inp,&DWP(16,$inp));
	&sub	($out,16);
	&jnc	(&label("cbc_enc_loop"));
	&jmp	(&label("cbc_done"));

&set_label("cbc_dec_loop",16);
	&movdqu	("xmm0",&QWP(0,$inp));		# load input
	&movdqa	(&QWP(16,"esp"),"xmm1");	# save IV
	&movdqa	(&QWP(32,"esp"),"xmm0");	# save future IV
	&call	("_vpaes_decrypt_core");
	&mov	($base,&DWP(0,"esp"));		# restore out
	&mov	($key,&DWP(4,"esp"));		# restore key
	&pxor	("xmm0",&QWP(16,"esp"));	# out^=iv
	&movdqa	("xmm1",&QWP(32,"esp"));	# load next IV
	&movdqu	(&QWP(0,$base,$inp),"xmm0");	# write output
	&lea	($inp,&DWP(16,$inp));
	&sub	($out,16);
	&jnc	(&label("cbc_dec_loop"));

&set_label("cbc_done");
	&mov	($base,&DWP(8,"esp"));		# restore ivp
	&mov	("esp",&DWP(48,"esp"));
	&movdqu	(&QWP(0,$base),"xmm1");		# write IV
&set_label("cbc_abort");
&function_end("${PREFIX}_cbc_encrypt");

&asm_finish();

# Buffered write errors only surface when the handle is closed, so a
# failed close must abort the build.
close STDOUT or die "error closing STDOUT";