1#!/usr/bin/env perl
2
3###################################################################
4### AES-128 [originally in CTR mode]				###
5### bitsliced implementation for Intel Core 2 processors	###
6### requires support of SSE extensions up to SSSE3		###
7### Author: Emilia Käsper and Peter Schwabe			###
8### Date: 2009-03-19						###
9### Public domain						###
10###								###
11### See http://homes.esat.kuleuven.be/~ekasper/#software for	###
12### further information.					###
13###################################################################
14#
15# September 2011.
16#
17# Started as transliteration to "perlasm" the original code has
18# undergone following changes:
19#
20# - code was made position-independent;
21# - rounds were folded into a loop resulting in >5x size reduction
22#   from 12.5KB to 2.2KB;
23# - above was possibile thanks to mixcolumns() modification that
24#   allowed to feed its output back to aesenc[last], this was
25#   achieved at cost of two additional inter-registers moves;
26# - some instruction reordering and interleaving;
27# - this module doesn't implement key setup subroutine, instead it
28#   relies on conversion of "conventional" key schedule as returned
29#   by AES_set_encrypt_key (see discussion below);
30# - first and last round keys are treated differently, which allowed
31#   to skip one shiftrows(), reduce bit-sliced key schedule and
32#   speed-up conversion by 22%;
33# - support for 192- and 256-bit keys was added;
34#
35# Resulting performance in CPU cycles spent to encrypt one byte out
36# of 4096-byte buffer with 128-bit key is:
37#
38#		Emilia's	this(*)		difference
39#
40# Core 2    	9.30		8.69		+7%
41# Nehalem(**) 	7.63		6.88		+11%
42# Atom	    	17.1		16.4		+4%
43# Silvermont	-		12.9
44# Goldmont	-		8.85
45#
46# (*)	Comparison is not completely fair, because "this" is ECB,
47#	i.e. no extra processing such as counter values calculation
48#	and xor-ing input as in Emilia's CTR implementation is
49#	performed. However, the CTR calculations stand for not more
50#	than 1% of total time, so comparison is *rather* fair.
51#
52# (**)	Results were collected on Westmere, which is considered to
53#	be equivalent to Nehalem for this code.
54#
55# As for key schedule conversion subroutine. Interface to OpenSSL
56# relies on per-invocation on-the-fly conversion. This naturally
57# has impact on performance, especially for short inputs. Conversion
58# time in CPU cycles and its ratio to CPU cycles spent in 8x block
59# function is:
60#
61# 		conversion	conversion/8x block
62# Core 2	240		0.22
63# Nehalem	180		0.20
64# Atom		430		0.20
65#
66# The ratio values mean that 128-byte blocks will be processed
67# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
68# etc. Then keep in mind that input sizes not divisible by 128 are
69# *effectively* slower, especially shortest ones, e.g. consecutive
70# 144-byte blocks are processed 44% slower than one would expect,
71# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
72# it's still faster than ["hyper-threading-safe" code path in]
73# aes-x86_64.pl on all lengths above 64 bytes...
74#
75# October 2011.
76#
77# Add decryption procedure. Performance in CPU cycles spent to decrypt
78# one byte out of 4096-byte buffer with 128-bit key is:
79#
80# Core 2	9.98
81# Nehalem	7.80
82# Atom		17.9
83# Silvermont	14.0
84# Goldmont	10.2
85#
86# November 2011.
87#
88# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
89# suboptimal, but XTS is meant to be used with larger blocks...
90#
91#						<appro@openssl.org>
92
93$flavour = shift;
94$output  = shift;
95if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
96
97$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
98
99$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
100( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
101( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
102die "can't locate x86_64-xlate.pl";
103
104open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
105*STDOUT=*OUT;
106
107my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
108my @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)
109my $ecb=0;	# suppress unreferenced ECB subroutines, spare some space...
110
111{
112my ($key,$rounds,$const)=("%rax","%r10d","%r11");
113
114sub Sbox {
115# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
116# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
117my @b=@_[0..7];
118my @t=@_[8..11];
119my @s=@_[12..15];
120	&InBasisChange	(@b);
121	&Inv_GF256	(@b[6,5,0,3,7,1,4,2],@t,@s);
122	&OutBasisChange	(@b[7,1,4,2,6,5,0,3]);
123}
124
125sub InBasisChange {
126# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
127# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
128my @b=@_[0..7];
129$code.=<<___;
130	pxor	@b[6], @b[5]
131	pxor	@b[1], @b[2]
132	pxor	@b[0], @b[3]
133	pxor	@b[2], @b[6]
134	pxor 	@b[0], @b[5]
135
136	pxor	@b[3], @b[6]
137	pxor	@b[7], @b[3]
138	pxor	@b[5], @b[7]
139	pxor	@b[4], @b[3]
140	pxor	@b[5], @b[4]
141	pxor	@b[1], @b[3]
142
143	pxor	@b[7], @b[2]
144	pxor	@b[5], @b[1]
145___
146}
147
148sub OutBasisChange {
149# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
150# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
151my @b=@_[0..7];
152$code.=<<___;
153	pxor	@b[6], @b[0]
154	pxor	@b[4], @b[1]
155	pxor	@b[0], @b[2]
156	pxor	@b[6], @b[4]
157	pxor	@b[1], @b[6]
158
159	pxor	@b[5], @b[1]
160	pxor	@b[3], @b[5]
161	pxor	@b[7], @b[3]
162	pxor	@b[5], @b[7]
163	pxor	@b[5], @b[2]
164
165	pxor	@b[7], @b[4]
166___
167}
168
169sub InvSbox {
170# input in lsb 	> [b0, b1, b2, b3, b4, b5, b6, b7] < msb
171# output in lsb	> [b0, b1, b6, b4, b2, b7, b3, b5] < msb
172my @b=@_[0..7];
173my @t=@_[8..11];
174my @s=@_[12..15];
175	&InvInBasisChange	(@b);
176	&Inv_GF256		(@b[5,1,2,6,3,7,0,4],@t,@s);
177	&InvOutBasisChange	(@b[3,7,0,4,5,1,2,6]);
178}
179
180sub InvInBasisChange {		# OutBasisChange in reverse
181my @b=@_[5,1,2,6,3,7,0,4];
182$code.=<<___
183	pxor	@b[7], @b[4]
184
185	pxor	@b[5], @b[7]
186	pxor	@b[5], @b[2]
187	pxor	@b[7], @b[3]
188	pxor	@b[3], @b[5]
189	pxor	@b[5], @b[1]
190
191	pxor	@b[1], @b[6]
192	pxor	@b[0], @b[2]
193	pxor	@b[6], @b[4]
194	pxor	@b[6], @b[0]
195	pxor	@b[4], @b[1]
196___
197}
198
199sub InvOutBasisChange {		# InBasisChange in reverse
200my @b=@_[2,5,7,3,6,1,0,4];
201$code.=<<___;
202	pxor	@b[5], @b[1]
203	pxor	@b[7], @b[2]
204
205	pxor	@b[1], @b[3]
206	pxor	@b[5], @b[4]
207	pxor	@b[5], @b[7]
208	pxor	@b[4], @b[3]
209	 pxor 	@b[0], @b[5]
210	pxor	@b[7], @b[3]
211	 pxor	@b[2], @b[6]
212	 pxor	@b[1], @b[2]
213	pxor	@b[3], @b[6]
214
215	pxor	@b[0], @b[3]
216	pxor	@b[6], @b[5]
217___
218}
219
220sub Mul_GF4 {
221#;*************************************************************
222#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
223#;*************************************************************
224my ($x0,$x1,$y0,$y1,$t0)=@_;
225$code.=<<___;
226	movdqa	$y0, $t0
227	pxor 	$y1, $t0
228	pand	$x0, $t0
229	pxor	$x1, $x0
230	pand	$y0, $x1
231	pand	$y1, $x0
232	pxor	$x1, $x0
233	pxor	$t0, $x1
234___
235}
236
237sub Mul_GF4_N {				# not used, see next subroutine
238# multiply and scale by N
239my ($x0,$x1,$y0,$y1,$t0)=@_;
240$code.=<<___;
241	movdqa	$y0, $t0
242	pxor	$y1, $t0
243	pand	$x0, $t0
244	pxor	$x1, $x0
245	pand	$y0, $x1
246	pand	$y1, $x0
247	pxor	$x0, $x1
248	pxor	$t0, $x0
249___
250}
251
252sub Mul_GF4_N_GF4 {
253# interleaved Mul_GF4_N and Mul_GF4
254my ($x0,$x1,$y0,$y1,$t0,
255    $x2,$x3,$y2,$y3,$t1)=@_;
256$code.=<<___;
257	movdqa	$y0, $t0
258	 movdqa	$y2, $t1
259	pxor	$y1, $t0
260	 pxor 	$y3, $t1
261	pand	$x0, $t0
262	 pand	$x2, $t1
263	pxor	$x1, $x0
264	 pxor	$x3, $x2
265	pand	$y0, $x1
266	 pand	$y2, $x3
267	pand	$y1, $x0
268	 pand	$y3, $x2
269	pxor	$x0, $x1
270	 pxor	$x3, $x2
271	pxor	$t0, $x0
272	 pxor	$t1, $x3
273___
274}
275sub Mul_GF16_2 {
276my @x=@_[0..7];
277my @y=@_[8..11];
278my @t=@_[12..15];
279$code.=<<___;
280	movdqa	@x[0], @t[0]
281	movdqa	@x[1], @t[1]
282___
283	&Mul_GF4  	(@x[0], @x[1], @y[0], @y[1], @t[2]);
284$code.=<<___;
285	pxor	@x[2], @t[0]
286	pxor	@x[3], @t[1]
287	pxor	@y[2], @y[0]
288	pxor	@y[3], @y[1]
289___
290	Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
291			 @x[2], @x[3], @y[2], @y[3], @t[2]);
292$code.=<<___;
293	pxor	@t[0], @x[0]
294	pxor	@t[0], @x[2]
295	pxor	@t[1], @x[1]
296	pxor	@t[1], @x[3]
297
298	movdqa	@x[4], @t[0]
299	movdqa	@x[5], @t[1]
300	pxor	@x[6], @t[0]
301	pxor	@x[7], @t[1]
302___
303	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
304			 @x[6], @x[7], @y[2], @y[3], @t[2]);
305$code.=<<___;
306	pxor	@y[2], @y[0]
307	pxor	@y[3], @y[1]
308___
309	&Mul_GF4  	(@x[4], @x[5], @y[0], @y[1], @t[3]);
310$code.=<<___;
311	pxor	@t[0], @x[4]
312	pxor	@t[0], @x[6]
313	pxor	@t[1], @x[5]
314	pxor	@t[1], @x[7]
315___
316}
317sub Inv_GF256 {
318#;********************************************************************
319#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
320#;********************************************************************
321my @x=@_[0..7];
322my @t=@_[8..11];
323my @s=@_[12..15];
324# direct optimizations from hardware
325$code.=<<___;
326	movdqa	@x[4], @t[3]
327	movdqa	@x[5], @t[2]
328	movdqa	@x[1], @t[1]
329	movdqa	@x[7], @s[1]
330	movdqa	@x[0], @s[0]
331
332	pxor	@x[6], @t[3]
333	pxor	@x[7], @t[2]
334	pxor	@x[3], @t[1]
335	 movdqa	@t[3], @s[2]
336	pxor	@x[6], @s[1]
337	 movdqa	@t[2], @t[0]
338	pxor	@x[2], @s[0]
339	 movdqa	@t[3], @s[3]
340
341	por	@t[1], @t[2]
342	por	@s[0], @t[3]
343	pxor	@t[0], @s[3]
344	pand	@s[0], @s[2]
345	pxor	@t[1], @s[0]
346	pand	@t[1], @t[0]
347	pand	@s[0], @s[3]
348	movdqa	@x[3], @s[0]
349	pxor	@x[2], @s[0]
350	pand	@s[0], @s[1]
351	pxor	@s[1], @t[3]
352	pxor	@s[1], @t[2]
353	movdqa	@x[4], @s[1]
354	movdqa	@x[1], @s[0]
355	pxor	@x[5], @s[1]
356	pxor	@x[0], @s[0]
357	movdqa	@s[1], @t[1]
358	pand	@s[0], @s[1]
359	por	@s[0], @t[1]
360	pxor	@s[1], @t[0]
361	pxor	@s[3], @t[3]
362	pxor	@s[2], @t[2]
363	pxor	@s[3], @t[1]
364	movdqa	@x[7], @s[0]
365	pxor	@s[2], @t[0]
366	movdqa	@x[6], @s[1]
367	pxor	@s[2], @t[1]
368	movdqa	@x[5], @s[2]
369	pand	@x[3], @s[0]
370	movdqa	@x[4], @s[3]
371	pand	@x[2], @s[1]
372	pand	@x[1], @s[2]
373	por	@x[0], @s[3]
374	pxor	@s[0], @t[3]
375	pxor	@s[1], @t[2]
376	pxor	@s[2], @t[1]
377	pxor	@s[3], @t[0]
378
379	#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
380
381	# new smaller inversion
382
383	movdqa	@t[3], @s[0]
384	pand	@t[1], @t[3]
385	pxor	@t[2], @s[0]
386
387	movdqa	@t[0], @s[2]
388	movdqa	@s[0], @s[3]
389	pxor	@t[3], @s[2]
390	pand	@s[2], @s[3]
391
392	movdqa	@t[1], @s[1]
393	pxor	@t[2], @s[3]
394	pxor	@t[0], @s[1]
395
396	pxor	@t[2], @t[3]
397
398	pand	@t[3], @s[1]
399
400	movdqa	@s[2], @t[2]
401	pxor	@t[0], @s[1]
402
403	pxor	@s[1], @t[2]
404	pxor	@s[1], @t[1]
405
406	pand	@t[0], @t[2]
407
408	pxor	@t[2], @s[2]
409	pxor	@t[2], @t[1]
410
411	pand	@s[3], @s[2]
412
413	pxor	@s[0], @s[2]
414___
415# output in s3, s2, s1, t1
416
417# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
418
419# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
420	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
421
422### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
423}
424
425# AES linear components
426
427sub ShiftRows {
428my @x=@_[0..7];
429my $mask=pop;
430$code.=<<___;
431	pxor	0x00($key),@x[0]
432	pxor	0x10($key),@x[1]
433	pxor	0x20($key),@x[2]
434	pxor	0x30($key),@x[3]
435	pshufb	$mask,@x[0]
436	pshufb	$mask,@x[1]
437	pxor	0x40($key),@x[4]
438	pxor	0x50($key),@x[5]
439	pshufb	$mask,@x[2]
440	pshufb	$mask,@x[3]
441	pxor	0x60($key),@x[6]
442	pxor	0x70($key),@x[7]
443	pshufb	$mask,@x[4]
444	pshufb	$mask,@x[5]
445	pshufb	$mask,@x[6]
446	pshufb	$mask,@x[7]
447	lea	0x80($key),$key
448___
449}
450
451sub MixColumns {
452# modified to emit output in order suitable for feeding back to aesenc[last]
453my @x=@_[0..7];
454my @t=@_[8..15];
455my $inv=@_[16];	# optional
456$code.=<<___;
457	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
458	pshufd	\$0x93, @x[1], @t[1]
459	 pxor	@t[0], @x[0]		# x0 ^ (x0 <<< 32)
460	pshufd	\$0x93, @x[2], @t[2]
461	 pxor	@t[1], @x[1]
462	pshufd	\$0x93, @x[3], @t[3]
463	 pxor	@t[2], @x[2]
464	pshufd	\$0x93, @x[4], @t[4]
465	 pxor	@t[3], @x[3]
466	pshufd	\$0x93, @x[5], @t[5]
467	 pxor	@t[4], @x[4]
468	pshufd	\$0x93, @x[6], @t[6]
469	 pxor	@t[5], @x[5]
470	pshufd	\$0x93, @x[7], @t[7]
471	 pxor	@t[6], @x[6]
472	 pxor	@t[7], @x[7]
473
474	pxor	@x[0], @t[1]
475	pxor	@x[7], @t[0]
476	pxor	@x[7], @t[1]
477	 pshufd	\$0x4E, @x[0], @x[0] 	# (x0 ^ (x0 <<< 32)) <<< 64)
478	pxor	@x[1], @t[2]
479	 pshufd	\$0x4E, @x[1], @x[1]
480	pxor	@x[4], @t[5]
481	 pxor	@t[0], @x[0]
482	pxor	@x[5], @t[6]
483	 pxor	@t[1], @x[1]
484	pxor	@x[3], @t[4]
485	 pshufd	\$0x4E, @x[4], @t[0]
486	pxor	@x[6], @t[7]
487	 pshufd	\$0x4E, @x[5], @t[1]
488	pxor	@x[2], @t[3]
489	 pshufd	\$0x4E, @x[3], @x[4]
490	pxor	@x[7], @t[3]
491	 pshufd	\$0x4E, @x[7], @x[5]
492	pxor	@x[7], @t[4]
493	 pshufd	\$0x4E, @x[6], @x[3]
494	pxor	@t[4], @t[0]
495	 pshufd	\$0x4E, @x[2], @x[6]
496	pxor	@t[5], @t[1]
497___
498$code.=<<___ if (!$inv);
499	pxor	@t[3], @x[4]
500	pxor	@t[7], @x[5]
501	pxor	@t[6], @x[3]
502	 movdqa	@t[0], @x[2]
503	pxor	@t[2], @x[6]
504	 movdqa	@t[1], @x[7]
505___
506$code.=<<___ if ($inv);
507	pxor	@x[4], @t[3]
508	pxor	@t[7], @x[5]
509	pxor	@x[3], @t[6]
510	 movdqa	@t[0], @x[3]
511	pxor	@t[2], @x[6]
512	 movdqa	@t[6], @x[2]
513	 movdqa	@t[1], @x[7]
514	 movdqa	@x[6], @x[4]
515	 movdqa	@t[3], @x[6]
516___
517}
518
519sub InvMixColumns_orig {
520my @x=@_[0..7];
521my @t=@_[8..15];
522
523$code.=<<___;
524	# multiplication by 0x0e
525	pshufd	\$0x93, @x[7], @t[7]
526	movdqa	@x[2], @t[2]
527	pxor	@x[5], @x[7]		# 7 5
528	pxor	@x[5], @x[2]		# 2 5
529	pshufd	\$0x93, @x[0], @t[0]
530	movdqa	@x[5], @t[5]
531	pxor	@x[0], @x[5]		# 5 0		[1]
532	pxor	@x[1], @x[0]		# 0 1
533	pshufd	\$0x93, @x[1], @t[1]
534	pxor	@x[2], @x[1]		# 1 25
535	pxor	@x[6], @x[0]		# 01 6		[2]
536	pxor	@x[3], @x[1]		# 125 3		[4]
537	pshufd	\$0x93, @x[3], @t[3]
538	pxor	@x[0], @x[2]		# 25 016	[3]
539	pxor	@x[7], @x[3]		# 3 75
540	pxor	@x[6], @x[7]		# 75 6		[0]
541	pshufd	\$0x93, @x[6], @t[6]
542	movdqa	@x[4], @t[4]
543	pxor	@x[4], @x[6]		# 6 4
544	pxor	@x[3], @x[4]		# 4 375		[6]
545	pxor	@x[7], @x[3]		# 375 756=36
546	pxor	@t[5], @x[6]		# 64 5		[7]
547	pxor	@t[2], @x[3]		# 36 2
548	pxor	@t[4], @x[3]		# 362 4		[5]
549	pshufd	\$0x93, @t[5], @t[5]
550___
551					my @y = @x[7,5,0,2,1,3,4,6];
552$code.=<<___;
553	# multiplication by 0x0b
554	pxor	@y[0], @y[1]
555	pxor	@t[0], @y[0]
556	pxor	@t[1], @y[1]
557	pshufd	\$0x93, @t[2], @t[2]
558	pxor	@t[5], @y[0]
559	pxor	@t[6], @y[1]
560	pxor	@t[7], @y[0]
561	pshufd	\$0x93, @t[4], @t[4]
562	pxor	@t[6], @t[7]		# clobber t[7]
563	pxor	@y[0], @y[1]
564
565	pxor	@t[0], @y[3]
566	pshufd	\$0x93, @t[0], @t[0]
567	pxor	@t[1], @y[2]
568	pxor	@t[1], @y[4]
569	pxor	@t[2], @y[2]
570	pshufd	\$0x93, @t[1], @t[1]
571	pxor	@t[2], @y[3]
572	pxor	@t[2], @y[5]
573	pxor	@t[7], @y[2]
574	pshufd	\$0x93, @t[2], @t[2]
575	pxor	@t[3], @y[3]
576	pxor	@t[3], @y[6]
577	pxor	@t[3], @y[4]
578	pshufd	\$0x93, @t[3], @t[3]
579	pxor	@t[4], @y[7]
580	pxor	@t[4], @y[5]
581	pxor	@t[7], @y[7]
582	pxor	@t[5], @y[3]
583	pxor	@t[4], @y[4]
584	pxor	@t[5], @t[7]		# clobber t[7] even more
585
586	pxor	@t[7], @y[5]
587	pshufd	\$0x93, @t[4], @t[4]
588	pxor	@t[7], @y[6]
589	pxor	@t[7], @y[4]
590
591	pxor	@t[5], @t[7]
592	pshufd	\$0x93, @t[5], @t[5]
593	pxor	@t[6], @t[7]		# restore t[7]
594
595	# multiplication by 0x0d
596	pxor	@y[7], @y[4]
597	pxor	@t[4], @y[7]
598	pshufd	\$0x93, @t[6], @t[6]
599	pxor	@t[0], @y[2]
600	pxor	@t[5], @y[7]
601	pxor	@t[2], @y[2]
602	pshufd	\$0x93, @t[7], @t[7]
603
604	pxor	@y[1], @y[3]
605	pxor	@t[1], @y[1]
606	pxor	@t[0], @y[0]
607	pxor	@t[0], @y[3]
608	pxor	@t[5], @y[1]
609	pxor	@t[5], @y[0]
610	pxor	@t[7], @y[1]
611	pshufd	\$0x93, @t[0], @t[0]
612	pxor	@t[6], @y[0]
613	pxor	@y[1], @y[3]
614	pxor	@t[1], @y[4]
615	pshufd	\$0x93, @t[1], @t[1]
616
617	pxor	@t[7], @y[7]
618	pxor	@t[2], @y[4]
619	pxor	@t[2], @y[5]
620	pshufd	\$0x93, @t[2], @t[2]
621	pxor	@t[6], @y[2]
622	pxor	@t[3], @t[6]		# clobber t[6]
623	pxor	@y[7], @y[4]
624	pxor	@t[6], @y[3]
625
626	pxor	@t[6], @y[6]
627	pxor	@t[5], @y[5]
628	pxor	@t[4], @y[6]
629	pshufd	\$0x93, @t[4], @t[4]
630	pxor	@t[6], @y[5]
631	pxor	@t[7], @y[6]
632	pxor	@t[3], @t[6]		# restore t[6]
633
634	pshufd	\$0x93, @t[5], @t[5]
635	pshufd	\$0x93, @t[6], @t[6]
636	pshufd	\$0x93, @t[7], @t[7]
637	pshufd	\$0x93, @t[3], @t[3]
638
639	# multiplication by 0x09
640	pxor	@y[1], @y[4]
641	pxor	@y[1], @t[1]		# t[1]=y[1]
642	pxor	@t[5], @t[0]		# clobber t[0]
643	pxor	@t[5], @t[1]
644	pxor	@t[0], @y[3]
645	pxor	@y[0], @t[0]		# t[0]=y[0]
646	pxor	@t[6], @t[1]
647	pxor	@t[7], @t[6]		# clobber t[6]
648	pxor	@t[1], @y[4]
649	pxor	@t[4], @y[7]
650	pxor	@y[4], @t[4]		# t[4]=y[4]
651	pxor	@t[3], @y[6]
652	pxor	@y[3], @t[3]		# t[3]=y[3]
653	pxor	@t[2], @y[5]
654	pxor	@y[2], @t[2]		# t[2]=y[2]
655	pxor	@t[7], @t[3]
656	pxor	@y[5], @t[5]		# t[5]=y[5]
657	pxor	@t[6], @t[2]
658	pxor	@t[6], @t[5]
659	pxor	@y[6], @t[6]		# t[6]=y[6]
660	pxor	@y[7], @t[7]		# t[7]=y[7]
661
662	movdqa	@t[0],@XMM[0]
663	movdqa	@t[1],@XMM[1]
664	movdqa	@t[2],@XMM[2]
665	movdqa	@t[3],@XMM[3]
666	movdqa	@t[4],@XMM[4]
667	movdqa	@t[5],@XMM[5]
668	movdqa	@t[6],@XMM[6]
669	movdqa	@t[7],@XMM[7]
670___
671}
672
673sub InvMixColumns {
674my @x=@_[0..7];
675my @t=@_[8..15];
676
677# Thanks to Jussi Kivilinna for providing pointer to
678#
679# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
680# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
681# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
682# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
683
684$code.=<<___;
685	# multiplication by 0x05-0x00-0x04-0x00
686	pshufd	\$0x4E, @x[0], @t[0]
687	pshufd	\$0x4E, @x[6], @t[6]
688	pxor	@x[0], @t[0]
689	pshufd	\$0x4E, @x[7], @t[7]
690	pxor	@x[6], @t[6]
691	pshufd	\$0x4E, @x[1], @t[1]
692	pxor	@x[7], @t[7]
693	pshufd	\$0x4E, @x[2], @t[2]
694	pxor	@x[1], @t[1]
695	pshufd	\$0x4E, @x[3], @t[3]
696	pxor	@x[2], @t[2]
697	 pxor	@t[6], @x[0]
698	 pxor	@t[6], @x[1]
699	pshufd	\$0x4E, @x[4], @t[4]
700	pxor	@x[3], @t[3]
701	 pxor	@t[0], @x[2]
702	 pxor	@t[1], @x[3]
703	pshufd	\$0x4E, @x[5], @t[5]
704	pxor	@x[4], @t[4]
705	 pxor	@t[7], @x[1]
706	 pxor	@t[2], @x[4]
707	pxor	@x[5], @t[5]
708
709	 pxor	@t[7], @x[2]
710	 pxor	@t[6], @x[3]
711	 pxor	@t[6], @x[4]
712	 pxor	@t[3], @x[5]
713	 pxor	@t[4], @x[6]
714	 pxor	@t[7], @x[4]
715	 pxor	@t[7], @x[5]
716	 pxor	@t[5], @x[7]
717___
718	&MixColumns	(@x,@t,1);	# flipped 2<->3 and 4<->6
719}
720
721sub aesenc {				# not used
722my @b=@_[0..7];
723my @t=@_[8..15];
724$code.=<<___;
725	movdqa	0x30($const),@t[0]	# .LSR
726___
727	&ShiftRows	(@b,@t[0]);
728	&Sbox		(@b,@t);
729	&MixColumns	(@b[0,1,4,6,3,7,2,5],@t);
730}
731
732sub aesenclast {			# not used
733my @b=@_[0..7];
734my @t=@_[8..15];
735$code.=<<___;
736	movdqa	0x40($const),@t[0]	# .LSRM0
737___
738	&ShiftRows	(@b,@t[0]);
739	&Sbox		(@b,@t);
740$code.=<<___
741	pxor	0x00($key),@b[0]
742	pxor	0x10($key),@b[1]
743	pxor	0x20($key),@b[4]
744	pxor	0x30($key),@b[6]
745	pxor	0x40($key),@b[3]
746	pxor	0x50($key),@b[7]
747	pxor	0x60($key),@b[2]
748	pxor	0x70($key),@b[5]
749___
750}
751
752sub swapmove {
753my ($a,$b,$n,$mask,$t)=@_;
754$code.=<<___;
755	movdqa	$b,$t
756	psrlq	\$$n,$b
757	pxor  	$a,$b
758	pand	$mask,$b
759	pxor	$b,$a
760	psllq	\$$n,$b
761	pxor	$t,$b
762___
763}
764sub swapmove2x {
765my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
766$code.=<<___;
767	movdqa	$b0,$t0
768	psrlq	\$$n,$b0
769	 movdqa	$b1,$t1
770	 psrlq	\$$n,$b1
771	pxor  	$a0,$b0
772	 pxor  	$a1,$b1
773	pand	$mask,$b0
774	 pand	$mask,$b1
775	pxor	$b0,$a0
776	psllq	\$$n,$b0
777	 pxor	$b1,$a1
778	 psllq	\$$n,$b1
779	pxor	$t0,$b0
780	 pxor	$t1,$b1
781___
782}
783
784sub bitslice {
785my @x=reverse(@_[0..7]);
786my ($t0,$t1,$t2,$t3)=@_[8..11];
787$code.=<<___;
788	movdqa	0x00($const),$t0	# .LBS0
789	movdqa	0x10($const),$t1	# .LBS1
790___
791	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
792	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
793$code.=<<___;
794	movdqa	0x20($const),$t0	# .LBS2
795___
796	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
797	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
798
799	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
800	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
801}
802
803$code.=<<___;
804.text
805
806.extern	asm_AES_encrypt
807.extern	asm_AES_decrypt
808
809.type	_bsaes_encrypt8,\@abi-omnipotent
810.align	64
811_bsaes_encrypt8:
812	lea	.LBS0(%rip), $const	# constants table
813
814	movdqa	($key), @XMM[9]		# round 0 key
815	lea	0x10($key), $key
816	movdqa	0x50($const), @XMM[8]	# .LM0SR
817	pxor	@XMM[9], @XMM[0]	# xor with round0 key
818	pxor	@XMM[9], @XMM[1]
819	pxor	@XMM[9], @XMM[2]
820	pxor	@XMM[9], @XMM[3]
821	 pshufb	@XMM[8], @XMM[0]
822	 pshufb	@XMM[8], @XMM[1]
823	pxor	@XMM[9], @XMM[4]
824	pxor	@XMM[9], @XMM[5]
825	 pshufb	@XMM[8], @XMM[2]
826	 pshufb	@XMM[8], @XMM[3]
827	pxor	@XMM[9], @XMM[6]
828	pxor	@XMM[9], @XMM[7]
829	 pshufb	@XMM[8], @XMM[4]
830	 pshufb	@XMM[8], @XMM[5]
831	 pshufb	@XMM[8], @XMM[6]
832	 pshufb	@XMM[8], @XMM[7]
833_bsaes_encrypt8_bitslice:
834___
835	&bitslice	(@XMM[0..7, 8..11]);
836$code.=<<___;
837	dec	$rounds
838	jmp	.Lenc_sbox
839.align	16
840.Lenc_loop:
841___
842	&ShiftRows	(@XMM[0..7, 8]);
843$code.=".Lenc_sbox:\n";
844	&Sbox		(@XMM[0..7, 8..15]);
845$code.=<<___;
846	dec	$rounds
847	jl	.Lenc_done
848___
849	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
850$code.=<<___;
851	movdqa	0x30($const), @XMM[8]	# .LSR
852	jnz	.Lenc_loop
853	movdqa	0x40($const), @XMM[8]	# .LSRM0
854	jmp	.Lenc_loop
855.align	16
856.Lenc_done:
857___
858	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
859	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
860$code.=<<___;
861	movdqa	($key), @XMM[8]		# last round key
862	pxor	@XMM[8], @XMM[4]
863	pxor	@XMM[8], @XMM[6]
864	pxor	@XMM[8], @XMM[3]
865	pxor	@XMM[8], @XMM[7]
866	pxor	@XMM[8], @XMM[2]
867	pxor	@XMM[8], @XMM[5]
868	pxor	@XMM[8], @XMM[0]
869	pxor	@XMM[8], @XMM[1]
870	ret
871.size	_bsaes_encrypt8,.-_bsaes_encrypt8
872
873.type	_bsaes_decrypt8,\@abi-omnipotent
874.align	64
875_bsaes_decrypt8:
876	lea	.LBS0(%rip), $const	# constants table
877
878	movdqa	($key), @XMM[9]		# round 0 key
879	lea	0x10($key), $key
880	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
881	pxor	@XMM[9], @XMM[0]	# xor with round0 key
882	pxor	@XMM[9], @XMM[1]
883	pxor	@XMM[9], @XMM[2]
884	pxor	@XMM[9], @XMM[3]
885	 pshufb	@XMM[8], @XMM[0]
886	 pshufb	@XMM[8], @XMM[1]
887	pxor	@XMM[9], @XMM[4]
888	pxor	@XMM[9], @XMM[5]
889	 pshufb	@XMM[8], @XMM[2]
890	 pshufb	@XMM[8], @XMM[3]
891	pxor	@XMM[9], @XMM[6]
892	pxor	@XMM[9], @XMM[7]
893	 pshufb	@XMM[8], @XMM[4]
894	 pshufb	@XMM[8], @XMM[5]
895	 pshufb	@XMM[8], @XMM[6]
896	 pshufb	@XMM[8], @XMM[7]
897___
898	&bitslice	(@XMM[0..7, 8..11]);
899$code.=<<___;
900	dec	$rounds
901	jmp	.Ldec_sbox
902.align	16
903.Ldec_loop:
904___
905	&ShiftRows	(@XMM[0..7, 8]);
906$code.=".Ldec_sbox:\n";
907	&InvSbox	(@XMM[0..7, 8..15]);
908$code.=<<___;
909	dec	$rounds
910	jl	.Ldec_done
911___
912	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
913$code.=<<___;
914	movdqa	-0x10($const), @XMM[8]	# .LISR
915	jnz	.Ldec_loop
916	movdqa	-0x20($const), @XMM[8]	# .LISRM0
917	jmp	.Ldec_loop
918.align	16
919.Ldec_done:
920___
921	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
922$code.=<<___;
923	movdqa	($key), @XMM[8]		# last round key
924	pxor	@XMM[8], @XMM[6]
925	pxor	@XMM[8], @XMM[4]
926	pxor	@XMM[8], @XMM[2]
927	pxor	@XMM[8], @XMM[7]
928	pxor	@XMM[8], @XMM[3]
929	pxor	@XMM[8], @XMM[5]
930	pxor	@XMM[8], @XMM[0]
931	pxor	@XMM[8], @XMM[1]
932	ret
933.size	_bsaes_decrypt8,.-_bsaes_decrypt8
934___
935}
936{
937my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
938
939sub bitslice_key {
940my @x=reverse(@_[0..7]);
941my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
942
943	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
944$code.=<<___;
945	#&swapmove(@x[2,3],1,$t0,$t2,$t3);
946	movdqa	@x[0], @x[2]
947	movdqa	@x[1], @x[3]
948___
949	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
950
951	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
952$code.=<<___;
953	#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
954	movdqa	@x[0], @x[4]
955	movdqa	@x[2], @x[6]
956	movdqa	@x[1], @x[5]
957	movdqa	@x[3], @x[7]
958___
959	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
960	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
961}
962
963$code.=<<___;
964.type	_bsaes_key_convert,\@abi-omnipotent
965.align	16
966_bsaes_key_convert:
967	lea	.Lmasks(%rip), $const
968	movdqu	($inp), %xmm7		# load round 0 key
969	lea	0x10($inp), $inp
970	movdqa	0x00($const), %xmm0	# 0x01...
971	movdqa	0x10($const), %xmm1	# 0x02...
972	movdqa	0x20($const), %xmm2	# 0x04...
973	movdqa	0x30($const), %xmm3	# 0x08...
974	movdqa	0x40($const), %xmm4	# .LM0
975	pcmpeqd	%xmm5, %xmm5		# .LNOT
976
977	movdqu	($inp), %xmm6		# load round 1 key
978	movdqa	%xmm7, ($out)		# save round 0 key
979	lea	0x10($out), $out
980	dec	$rounds
981	jmp	.Lkey_loop
982.align	16
983.Lkey_loop:
984	pshufb	%xmm4, %xmm6		# .LM0
985
986	movdqa	%xmm0,	%xmm8
987	movdqa	%xmm1,	%xmm9
988
989	pand	%xmm6,	%xmm8
990	pand	%xmm6,	%xmm9
991	movdqa	%xmm2,	%xmm10
992	pcmpeqb	%xmm0,	%xmm8
993	psllq	\$4,	%xmm0		# 0x10...
994	movdqa	%xmm3,	%xmm11
995	pcmpeqb	%xmm1,	%xmm9
996	psllq	\$4,	%xmm1		# 0x20...
997
998	pand	%xmm6,	%xmm10
999	pand	%xmm6,	%xmm11
1000	movdqa	%xmm0,	%xmm12
1001	pcmpeqb	%xmm2,	%xmm10
1002	psllq	\$4,	%xmm2		# 0x40...
1003	movdqa	%xmm1,	%xmm13
1004	pcmpeqb	%xmm3,	%xmm11
1005	psllq	\$4,	%xmm3		# 0x80...
1006
1007	movdqa	%xmm2,	%xmm14
1008	movdqa	%xmm3,	%xmm15
1009	 pxor	%xmm5,	%xmm8		# "pnot"
1010	 pxor	%xmm5,	%xmm9
1011
1012	pand	%xmm6,	%xmm12
1013	pand	%xmm6,	%xmm13
1014	 movdqa	%xmm8, 0x00($out)	# write bit-sliced round key
1015	pcmpeqb	%xmm0,	%xmm12
1016	psrlq	\$4,	%xmm0		# 0x01...
1017	 movdqa	%xmm9, 0x10($out)
1018	pcmpeqb	%xmm1,	%xmm13
1019	psrlq	\$4,	%xmm1		# 0x02...
1020	 lea	0x10($inp), $inp
1021
1022	pand	%xmm6,	%xmm14
1023	pand	%xmm6,	%xmm15
1024	 movdqa	%xmm10, 0x20($out)
1025	pcmpeqb	%xmm2,	%xmm14
1026	psrlq	\$4,	%xmm2		# 0x04...
1027	 movdqa	%xmm11, 0x30($out)
1028	pcmpeqb	%xmm3,	%xmm15
1029	psrlq	\$4,	%xmm3		# 0x08...
1030	 movdqu	($inp), %xmm6		# load next round key
1031
1032	pxor	%xmm5, %xmm13		# "pnot"
1033	pxor	%xmm5, %xmm14
1034	movdqa	%xmm12, 0x40($out)
1035	movdqa	%xmm13, 0x50($out)
1036	movdqa	%xmm14, 0x60($out)
1037	movdqa	%xmm15, 0x70($out)
1038	lea	0x80($out),$out
1039	dec	$rounds
1040	jnz	.Lkey_loop
1041
1042	movdqa	0x50($const), %xmm7	# .L63
1043	#movdqa	%xmm6, ($out)		# don't save last round key
1044	ret
1045.size	_bsaes_key_convert,.-_bsaes_key_convert
1046___
1047}
1048
1049if (0 && !$win64) {	# following four functions are unsupported interface
1050			# used for benchmarking...
1051$code.=<<___;
1052.globl	bsaes_enc_key_convert
1053.type	bsaes_enc_key_convert,\@function,2
1054.align	16
1055bsaes_enc_key_convert:
1056	mov	240($inp),%r10d		# pass rounds
1057	mov	$inp,%rcx		# pass key
1058	mov	$out,%rax		# pass key schedule
1059	call	_bsaes_key_convert
1060	pxor	%xmm6,%xmm7		# fix up last round key
1061	movdqa	%xmm7,(%rax)		# save last round key
1062	ret
1063.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert
1064
1065.globl	bsaes_encrypt_128
1066.type	bsaes_encrypt_128,\@function,4
1067.align	16
1068bsaes_encrypt_128:
1069.Lenc128_loop:
1070	movdqu	0x00($inp), @XMM[0]	# load input
1071	movdqu	0x10($inp), @XMM[1]
1072	movdqu	0x20($inp), @XMM[2]
1073	movdqu	0x30($inp), @XMM[3]
1074	movdqu	0x40($inp), @XMM[4]
1075	movdqu	0x50($inp), @XMM[5]
1076	movdqu	0x60($inp), @XMM[6]
1077	movdqu	0x70($inp), @XMM[7]
1078	mov	$key, %rax		# pass the $key
1079	lea	0x80($inp), $inp
1080	mov	\$10,%r10d
1081
1082	call	_bsaes_encrypt8
1083
1084	movdqu	@XMM[0], 0x00($out)	# write output
1085	movdqu	@XMM[1], 0x10($out)
1086	movdqu	@XMM[4], 0x20($out)
1087	movdqu	@XMM[6], 0x30($out)
1088	movdqu	@XMM[3], 0x40($out)
1089	movdqu	@XMM[7], 0x50($out)
1090	movdqu	@XMM[2], 0x60($out)
1091	movdqu	@XMM[5], 0x70($out)
1092	lea	0x80($out), $out
1093	sub	\$0x80,$len
1094	ja	.Lenc128_loop
1095	ret
1096.size	bsaes_encrypt_128,.-bsaes_encrypt_128
1097
1098.globl	bsaes_dec_key_convert
1099.type	bsaes_dec_key_convert,\@function,2
1100.align	16
1101bsaes_dec_key_convert:
1102	mov	240($inp),%r10d		# pass rounds
1103	mov	$inp,%rcx		# pass key
1104	mov	$out,%rax		# pass key schedule
1105	call	_bsaes_key_convert
1106	pxor	($out),%xmm7		# fix up round 0 key
1107	movdqa	%xmm6,(%rax)		# save last round key
1108	movdqa	%xmm7,($out)
1109	ret
1110.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert
1111
1112.globl	bsaes_decrypt_128
1113.type	bsaes_decrypt_128,\@function,4
1114.align	16
1115bsaes_decrypt_128:
1116.Ldec128_loop:
1117	movdqu	0x00($inp), @XMM[0]	# load input
1118	movdqu	0x10($inp), @XMM[1]
1119	movdqu	0x20($inp), @XMM[2]
1120	movdqu	0x30($inp), @XMM[3]
1121	movdqu	0x40($inp), @XMM[4]
1122	movdqu	0x50($inp), @XMM[5]
1123	movdqu	0x60($inp), @XMM[6]
1124	movdqu	0x70($inp), @XMM[7]
1125	mov	$key, %rax		# pass the $key
1126	lea	0x80($inp), $inp
1127	mov	\$10,%r10d
1128
1129	call	_bsaes_decrypt8
1130
1131	movdqu	@XMM[0], 0x00($out)	# write output
1132	movdqu	@XMM[1], 0x10($out)
1133	movdqu	@XMM[6], 0x20($out)
1134	movdqu	@XMM[4], 0x30($out)
1135	movdqu	@XMM[2], 0x40($out)
1136	movdqu	@XMM[7], 0x50($out)
1137	movdqu	@XMM[3], 0x60($out)
1138	movdqu	@XMM[5], 0x70($out)
1139	lea	0x80($out), $out
1140	sub	\$0x80,$len
1141	ja	.Ldec128_loop
1142	ret
1143.size	bsaes_decrypt_128,.-bsaes_decrypt_128
1144___
1145}
1146{
1147######################################################################
1148#
1149# OpenSSL interface
1150#
1151my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64	? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1152						: ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1153my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1154
1155if ($ecb) {
1156$code.=<<___;
1157.globl	bsaes_ecb_encrypt_blocks
1158.type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1159.align	16
1160bsaes_ecb_encrypt_blocks:
1161	mov	%rsp, %rax
1162.Lecb_enc_prologue:
1163	push	%rbp
1164	push	%rbx
1165	push	%r12
1166	push	%r13
1167	push	%r14
1168	push	%r15
1169	lea	-0x48(%rsp),%rsp
1170___
1171$code.=<<___ if ($win64);
1172	lea	-0xa0(%rsp), %rsp
1173	movaps	%xmm6, 0x40(%rsp)
1174	movaps	%xmm7, 0x50(%rsp)
1175	movaps	%xmm8, 0x60(%rsp)
1176	movaps	%xmm9, 0x70(%rsp)
1177	movaps	%xmm10, 0x80(%rsp)
1178	movaps	%xmm11, 0x90(%rsp)
1179	movaps	%xmm12, 0xa0(%rsp)
1180	movaps	%xmm13, 0xb0(%rsp)
1181	movaps	%xmm14, 0xc0(%rsp)
1182	movaps	%xmm15, 0xd0(%rsp)
1183.Lecb_enc_body:
1184___
1185$code.=<<___;
1186	mov	%rsp,%rbp		# backup %rsp
1187	mov	240($arg4),%eax		# rounds
1188	mov	$arg1,$inp		# backup arguments
1189	mov	$arg2,$out
1190	mov	$arg3,$len
1191	mov	$arg4,$key
1192	cmp	\$8,$arg3
1193	jb	.Lecb_enc_short
1194
1195	mov	%eax,%ebx		# backup rounds
1196	shl	\$7,%rax		# 128 bytes per inner round key
1197	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
1198	sub	%rax,%rsp
1199	mov	%rsp,%rax		# pass key schedule
1200	mov	$key,%rcx		# pass key
1201	mov	%ebx,%r10d		# pass rounds
1202	call	_bsaes_key_convert
1203	pxor	%xmm6,%xmm7		# fix up last round key
1204	movdqa	%xmm7,(%rax)		# save last round key
1205
1206	sub	\$8,$len
1207.Lecb_enc_loop:
1208	movdqu	0x00($inp), @XMM[0]	# load input
1209	movdqu	0x10($inp), @XMM[1]
1210	movdqu	0x20($inp), @XMM[2]
1211	movdqu	0x30($inp), @XMM[3]
1212	movdqu	0x40($inp), @XMM[4]
1213	movdqu	0x50($inp), @XMM[5]
1214	mov	%rsp, %rax		# pass key schedule
1215	movdqu	0x60($inp), @XMM[6]
1216	mov	%ebx,%r10d		# pass rounds
1217	movdqu	0x70($inp), @XMM[7]
1218	lea	0x80($inp), $inp
1219
1220	call	_bsaes_encrypt8
1221
1222	movdqu	@XMM[0], 0x00($out)	# write output
1223	movdqu	@XMM[1], 0x10($out)
1224	movdqu	@XMM[4], 0x20($out)
1225	movdqu	@XMM[6], 0x30($out)
1226	movdqu	@XMM[3], 0x40($out)
1227	movdqu	@XMM[7], 0x50($out)
1228	movdqu	@XMM[2], 0x60($out)
1229	movdqu	@XMM[5], 0x70($out)
1230	lea	0x80($out), $out
1231	sub	\$8,$len
1232	jnc	.Lecb_enc_loop
1233
1234	add	\$8,$len
1235	jz	.Lecb_enc_done
1236
1237	movdqu	0x00($inp), @XMM[0]	# load input
1238	mov	%rsp, %rax		# pass key schedule
1239	mov	%ebx,%r10d		# pass rounds
1240	cmp	\$2,$len
1241	jb	.Lecb_enc_one
1242	movdqu	0x10($inp), @XMM[1]
1243	je	.Lecb_enc_two
1244	movdqu	0x20($inp), @XMM[2]
1245	cmp	\$4,$len
1246	jb	.Lecb_enc_three
1247	movdqu	0x30($inp), @XMM[3]
1248	je	.Lecb_enc_four
1249	movdqu	0x40($inp), @XMM[4]
1250	cmp	\$6,$len
1251	jb	.Lecb_enc_five
1252	movdqu	0x50($inp), @XMM[5]
1253	je	.Lecb_enc_six
1254	movdqu	0x60($inp), @XMM[6]
1255	call	_bsaes_encrypt8
1256	movdqu	@XMM[0], 0x00($out)	# write output
1257	movdqu	@XMM[1], 0x10($out)
1258	movdqu	@XMM[4], 0x20($out)
1259	movdqu	@XMM[6], 0x30($out)
1260	movdqu	@XMM[3], 0x40($out)
1261	movdqu	@XMM[7], 0x50($out)
1262	movdqu	@XMM[2], 0x60($out)
1263	jmp	.Lecb_enc_done
1264.align	16
1265.Lecb_enc_six:
1266	call	_bsaes_encrypt8
1267	movdqu	@XMM[0], 0x00($out)	# write output
1268	movdqu	@XMM[1], 0x10($out)
1269	movdqu	@XMM[4], 0x20($out)
1270	movdqu	@XMM[6], 0x30($out)
1271	movdqu	@XMM[3], 0x40($out)
1272	movdqu	@XMM[7], 0x50($out)
1273	jmp	.Lecb_enc_done
1274.align	16
1275.Lecb_enc_five:
1276	call	_bsaes_encrypt8
1277	movdqu	@XMM[0], 0x00($out)	# write output
1278	movdqu	@XMM[1], 0x10($out)
1279	movdqu	@XMM[4], 0x20($out)
1280	movdqu	@XMM[6], 0x30($out)
1281	movdqu	@XMM[3], 0x40($out)
1282	jmp	.Lecb_enc_done
1283.align	16
1284.Lecb_enc_four:
1285	call	_bsaes_encrypt8
1286	movdqu	@XMM[0], 0x00($out)	# write output
1287	movdqu	@XMM[1], 0x10($out)
1288	movdqu	@XMM[4], 0x20($out)
1289	movdqu	@XMM[6], 0x30($out)
1290	jmp	.Lecb_enc_done
1291.align	16
1292.Lecb_enc_three:
1293	call	_bsaes_encrypt8
1294	movdqu	@XMM[0], 0x00($out)	# write output
1295	movdqu	@XMM[1], 0x10($out)
1296	movdqu	@XMM[4], 0x20($out)
1297	jmp	.Lecb_enc_done
1298.align	16
1299.Lecb_enc_two:
1300	call	_bsaes_encrypt8
1301	movdqu	@XMM[0], 0x00($out)	# write output
1302	movdqu	@XMM[1], 0x10($out)
1303	jmp	.Lecb_enc_done
1304.align	16
1305.Lecb_enc_one:
1306	call	_bsaes_encrypt8
1307	movdqu	@XMM[0], 0x00($out)	# write output
1308	jmp	.Lecb_enc_done
1309.align	16
1310.Lecb_enc_short:
1311	lea	($inp), $arg1
1312	lea	($out), $arg2
1313	lea	($key), $arg3
1314	call	asm_AES_encrypt
1315	lea	16($inp), $inp
1316	lea	16($out), $out
1317	dec	$len
1318	jnz	.Lecb_enc_short
1319
1320.Lecb_enc_done:
1321	lea	(%rsp),%rax
1322	pxor	%xmm0, %xmm0
1323.Lecb_enc_bzero:			# wipe key schedule [if any]
1324	movdqa	%xmm0, 0x00(%rax)
1325	movdqa	%xmm0, 0x10(%rax)
1326	lea	0x20(%rax), %rax
1327	cmp	%rax, %rbp
1328	jb	.Lecb_enc_bzero
1329
1330	lea	0x78(%rbp),%rax
1331___
1332$code.=<<___ if ($win64);
1333	movaps	0x40(%rbp), %xmm6
1334	movaps	0x50(%rbp), %xmm7
1335	movaps	0x60(%rbp), %xmm8
1336	movaps	0x70(%rbp), %xmm9
1337	movaps	0x80(%rbp), %xmm10
1338	movaps	0x90(%rbp), %xmm11
1339	movaps	0xa0(%rbp), %xmm12
1340	movaps	0xb0(%rbp), %xmm13
1341	movaps	0xc0(%rbp), %xmm14
1342	movaps	0xd0(%rbp), %xmm15
1343	lea	0xa0(%rax), %rax
1344.Lecb_enc_tail:
1345___
1346$code.=<<___;
1347	mov	-48(%rax), %r15
1348	mov	-40(%rax), %r14
1349	mov	-32(%rax), %r13
1350	mov	-24(%rax), %r12
1351	mov	-16(%rax), %rbx
1352	mov	-8(%rax), %rbp
1353	lea	(%rax), %rsp		# restore %rsp
1354.Lecb_enc_epilogue:
1355	ret
1356.size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1357
1358.globl	bsaes_ecb_decrypt_blocks
1359.type	bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1360.align	16
1361bsaes_ecb_decrypt_blocks:
1362	mov	%rsp, %rax
1363.Lecb_dec_prologue:
1364	push	%rbp
1365	push	%rbx
1366	push	%r12
1367	push	%r13
1368	push	%r14
1369	push	%r15
1370	lea	-0x48(%rsp),%rsp
1371___
1372$code.=<<___ if ($win64);
1373	lea	-0xa0(%rsp), %rsp
1374	movaps	%xmm6, 0x40(%rsp)
1375	movaps	%xmm7, 0x50(%rsp)
1376	movaps	%xmm8, 0x60(%rsp)
1377	movaps	%xmm9, 0x70(%rsp)
1378	movaps	%xmm10, 0x80(%rsp)
1379	movaps	%xmm11, 0x90(%rsp)
1380	movaps	%xmm12, 0xa0(%rsp)
1381	movaps	%xmm13, 0xb0(%rsp)
1382	movaps	%xmm14, 0xc0(%rsp)
1383	movaps	%xmm15, 0xd0(%rsp)
1384.Lecb_dec_body:
1385___
1386$code.=<<___;
1387	mov	%rsp,%rbp		# backup %rsp
1388	mov	240($arg4),%eax		# rounds
1389	mov	$arg1,$inp		# backup arguments
1390	mov	$arg2,$out
1391	mov	$arg3,$len
1392	mov	$arg4,$key
1393	cmp	\$8,$arg3
1394	jb	.Lecb_dec_short
1395
1396	mov	%eax,%ebx		# backup rounds
1397	shl	\$7,%rax		# 128 bytes per inner round key
1398	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
1399	sub	%rax,%rsp
1400	mov	%rsp,%rax		# pass key schedule
1401	mov	$key,%rcx		# pass key
1402	mov	%ebx,%r10d		# pass rounds
1403	call	_bsaes_key_convert
1404	pxor	(%rsp),%xmm7		# fix up 0 round key
1405	movdqa	%xmm6,(%rax)		# save last round key
1406	movdqa	%xmm7,(%rsp)
1407
1408	sub	\$8,$len
1409.Lecb_dec_loop:
1410	movdqu	0x00($inp), @XMM[0]	# load input
1411	movdqu	0x10($inp), @XMM[1]
1412	movdqu	0x20($inp), @XMM[2]
1413	movdqu	0x30($inp), @XMM[3]
1414	movdqu	0x40($inp), @XMM[4]
1415	movdqu	0x50($inp), @XMM[5]
1416	mov	%rsp, %rax		# pass key schedule
1417	movdqu	0x60($inp), @XMM[6]
1418	mov	%ebx,%r10d		# pass rounds
1419	movdqu	0x70($inp), @XMM[7]
1420	lea	0x80($inp), $inp
1421
1422	call	_bsaes_decrypt8
1423
1424	movdqu	@XMM[0], 0x00($out)	# write output
1425	movdqu	@XMM[1], 0x10($out)
1426	movdqu	@XMM[6], 0x20($out)
1427	movdqu	@XMM[4], 0x30($out)
1428	movdqu	@XMM[2], 0x40($out)
1429	movdqu	@XMM[7], 0x50($out)
1430	movdqu	@XMM[3], 0x60($out)
1431	movdqu	@XMM[5], 0x70($out)
1432	lea	0x80($out), $out
1433	sub	\$8,$len
1434	jnc	.Lecb_dec_loop
1435
1436	add	\$8,$len
1437	jz	.Lecb_dec_done
1438
1439	movdqu	0x00($inp), @XMM[0]	# load input
1440	mov	%rsp, %rax		# pass key schedule
1441	mov	%ebx,%r10d		# pass rounds
1442	cmp	\$2,$len
1443	jb	.Lecb_dec_one
1444	movdqu	0x10($inp), @XMM[1]
1445	je	.Lecb_dec_two
1446	movdqu	0x20($inp), @XMM[2]
1447	cmp	\$4,$len
1448	jb	.Lecb_dec_three
1449	movdqu	0x30($inp), @XMM[3]
1450	je	.Lecb_dec_four
1451	movdqu	0x40($inp), @XMM[4]
1452	cmp	\$6,$len
1453	jb	.Lecb_dec_five
1454	movdqu	0x50($inp), @XMM[5]
1455	je	.Lecb_dec_six
1456	movdqu	0x60($inp), @XMM[6]
1457	call	_bsaes_decrypt8
1458	movdqu	@XMM[0], 0x00($out)	# write output
1459	movdqu	@XMM[1], 0x10($out)
1460	movdqu	@XMM[6], 0x20($out)
1461	movdqu	@XMM[4], 0x30($out)
1462	movdqu	@XMM[2], 0x40($out)
1463	movdqu	@XMM[7], 0x50($out)
1464	movdqu	@XMM[3], 0x60($out)
1465	jmp	.Lecb_dec_done
1466.align	16
1467.Lecb_dec_six:
1468	call	_bsaes_decrypt8
1469	movdqu	@XMM[0], 0x00($out)	# write output
1470	movdqu	@XMM[1], 0x10($out)
1471	movdqu	@XMM[6], 0x20($out)
1472	movdqu	@XMM[4], 0x30($out)
1473	movdqu	@XMM[2], 0x40($out)
1474	movdqu	@XMM[7], 0x50($out)
1475	jmp	.Lecb_dec_done
1476.align	16
1477.Lecb_dec_five:
1478	call	_bsaes_decrypt8
1479	movdqu	@XMM[0], 0x00($out)	# write output
1480	movdqu	@XMM[1], 0x10($out)
1481	movdqu	@XMM[6], 0x20($out)
1482	movdqu	@XMM[4], 0x30($out)
1483	movdqu	@XMM[2], 0x40($out)
1484	jmp	.Lecb_dec_done
1485.align	16
1486.Lecb_dec_four:
1487	call	_bsaes_decrypt8
1488	movdqu	@XMM[0], 0x00($out)	# write output
1489	movdqu	@XMM[1], 0x10($out)
1490	movdqu	@XMM[6], 0x20($out)
1491	movdqu	@XMM[4], 0x30($out)
1492	jmp	.Lecb_dec_done
1493.align	16
1494.Lecb_dec_three:
1495	call	_bsaes_decrypt8
1496	movdqu	@XMM[0], 0x00($out)	# write output
1497	movdqu	@XMM[1], 0x10($out)
1498	movdqu	@XMM[6], 0x20($out)
1499	jmp	.Lecb_dec_done
1500.align	16
1501.Lecb_dec_two:
1502	call	_bsaes_decrypt8
1503	movdqu	@XMM[0], 0x00($out)	# write output
1504	movdqu	@XMM[1], 0x10($out)
1505	jmp	.Lecb_dec_done
1506.align	16
1507.Lecb_dec_one:
1508	call	_bsaes_decrypt8
1509	movdqu	@XMM[0], 0x00($out)	# write output
1510	jmp	.Lecb_dec_done
1511.align	16
1512.Lecb_dec_short:
1513	lea	($inp), $arg1
1514	lea	($out), $arg2
1515	lea	($key), $arg3
1516	call	asm_AES_decrypt
1517	lea	16($inp), $inp
1518	lea	16($out), $out
1519	dec	$len
1520	jnz	.Lecb_dec_short
1521
1522.Lecb_dec_done:
1523	lea	(%rsp),%rax
1524	pxor	%xmm0, %xmm0
1525.Lecb_dec_bzero:			# wipe key schedule [if any]
1526	movdqa	%xmm0, 0x00(%rax)
1527	movdqa	%xmm0, 0x10(%rax)
1528	lea	0x20(%rax), %rax
1529	cmp	%rax, %rbp
1530	jb	.Lecb_dec_bzero
1531
1532	lea	0x78(%rbp),%rax
1533___
1534$code.=<<___ if ($win64);
1535	movaps	0x40(%rbp), %xmm6
1536	movaps	0x50(%rbp), %xmm7
1537	movaps	0x60(%rbp), %xmm8
1538	movaps	0x70(%rbp), %xmm9
1539	movaps	0x80(%rbp), %xmm10
1540	movaps	0x90(%rbp), %xmm11
1541	movaps	0xa0(%rbp), %xmm12
1542	movaps	0xb0(%rbp), %xmm13
1543	movaps	0xc0(%rbp), %xmm14
1544	movaps	0xd0(%rbp), %xmm15
1545	lea	0xa0(%rax), %rax
1546.Lecb_dec_tail:
1547___
1548$code.=<<___;
1549	mov	-48(%rax), %r15
1550	mov	-40(%rax), %r14
1551	mov	-32(%rax), %r13
1552	mov	-24(%rax), %r12
1553	mov	-16(%rax), %rbx
1554	mov	-8(%rax), %rbp
1555	lea	(%rax), %rsp		# restore %rsp
1556.Lecb_dec_epilogue:
1557	ret
1558.size	bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1559___
1560}
1561$code.=<<___;
1562.extern	asm_AES_cbc_encrypt
1563.globl	bsaes_cbc_encrypt
1564.type	bsaes_cbc_encrypt,\@abi-omnipotent
1565.align	16
1566bsaes_cbc_encrypt:
1567___
1568$code.=<<___ if ($win64);
1569	mov	48(%rsp),$arg6		# pull direction flag
1570___
1571$code.=<<___;
1572	cmp	\$0,$arg6
1573	jne	asm_AES_cbc_encrypt
1574	cmp	\$128,$arg3
1575	jb	asm_AES_cbc_encrypt
1576
1577	mov	%rsp, %rax
1578.Lcbc_dec_prologue:
1579	push	%rbp
1580	push	%rbx
1581	push	%r12
1582	push	%r13
1583	push	%r14
1584	push	%r15
1585	lea	-0x48(%rsp), %rsp
1586___
1587$code.=<<___ if ($win64);
1588	mov	0xa0(%rsp),$arg5	# pull ivp
1589	lea	-0xa0(%rsp), %rsp
1590	movaps	%xmm6, 0x40(%rsp)
1591	movaps	%xmm7, 0x50(%rsp)
1592	movaps	%xmm8, 0x60(%rsp)
1593	movaps	%xmm9, 0x70(%rsp)
1594	movaps	%xmm10, 0x80(%rsp)
1595	movaps	%xmm11, 0x90(%rsp)
1596	movaps	%xmm12, 0xa0(%rsp)
1597	movaps	%xmm13, 0xb0(%rsp)
1598	movaps	%xmm14, 0xc0(%rsp)
1599	movaps	%xmm15, 0xd0(%rsp)
1600.Lcbc_dec_body:
1601___
1602$code.=<<___;
1603	mov	%rsp, %rbp		# backup %rsp
1604	mov	240($arg4), %eax	# rounds
1605	mov	$arg1, $inp		# backup arguments
1606	mov	$arg2, $out
1607	mov	$arg3, $len
1608	mov	$arg4, $key
1609	mov	$arg5, %rbx
1610	shr	\$4, $len		# bytes to blocks
1611
1612	mov	%eax, %edx		# rounds
1613	shl	\$7, %rax		# 128 bytes per inner round key
1614	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
1615	sub	%rax, %rsp
1616
1617	mov	%rsp, %rax		# pass key schedule
1618	mov	$key, %rcx		# pass key
1619	mov	%edx, %r10d		# pass rounds
1620	call	_bsaes_key_convert
1621	pxor	(%rsp),%xmm7		# fix up 0 round key
1622	movdqa	%xmm6,(%rax)		# save last round key
1623	movdqa	%xmm7,(%rsp)
1624
1625	movdqu	(%rbx), @XMM[15]	# load IV
1626	sub	\$8,$len
1627.Lcbc_dec_loop:
1628	movdqu	0x00($inp), @XMM[0]	# load input
1629	movdqu	0x10($inp), @XMM[1]
1630	movdqu	0x20($inp), @XMM[2]
1631	movdqu	0x30($inp), @XMM[3]
1632	movdqu	0x40($inp), @XMM[4]
1633	movdqu	0x50($inp), @XMM[5]
1634	mov	%rsp, %rax		# pass key schedule
1635	movdqu	0x60($inp), @XMM[6]
1636	mov	%edx,%r10d		# pass rounds
1637	movdqu	0x70($inp), @XMM[7]
1638	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1639
1640	call	_bsaes_decrypt8
1641
1642	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1643	movdqu	0x00($inp), @XMM[8]	# re-load input
1644	movdqu	0x10($inp), @XMM[9]
1645	pxor	@XMM[8], @XMM[1]
1646	movdqu	0x20($inp), @XMM[10]
1647	pxor	@XMM[9], @XMM[6]
1648	movdqu	0x30($inp), @XMM[11]
1649	pxor	@XMM[10], @XMM[4]
1650	movdqu	0x40($inp), @XMM[12]
1651	pxor	@XMM[11], @XMM[2]
1652	movdqu	0x50($inp), @XMM[13]
1653	pxor	@XMM[12], @XMM[7]
1654	movdqu	0x60($inp), @XMM[14]
1655	pxor	@XMM[13], @XMM[3]
1656	movdqu	0x70($inp), @XMM[15]	# IV
1657	pxor	@XMM[14], @XMM[5]
1658	movdqu	@XMM[0], 0x00($out)	# write output
1659	lea	0x80($inp), $inp
1660	movdqu	@XMM[1], 0x10($out)
1661	movdqu	@XMM[6], 0x20($out)
1662	movdqu	@XMM[4], 0x30($out)
1663	movdqu	@XMM[2], 0x40($out)
1664	movdqu	@XMM[7], 0x50($out)
1665	movdqu	@XMM[3], 0x60($out)
1666	movdqu	@XMM[5], 0x70($out)
1667	lea	0x80($out), $out
1668	sub	\$8,$len
1669	jnc	.Lcbc_dec_loop
1670
1671	add	\$8,$len
1672	jz	.Lcbc_dec_done
1673
1674	movdqu	0x00($inp), @XMM[0]	# load input
1675	mov	%rsp, %rax		# pass key schedule
1676	mov	%edx, %r10d		# pass rounds
1677	cmp	\$2,$len
1678	jb	.Lcbc_dec_one
1679	movdqu	0x10($inp), @XMM[1]
1680	je	.Lcbc_dec_two
1681	movdqu	0x20($inp), @XMM[2]
1682	cmp	\$4,$len
1683	jb	.Lcbc_dec_three
1684	movdqu	0x30($inp), @XMM[3]
1685	je	.Lcbc_dec_four
1686	movdqu	0x40($inp), @XMM[4]
1687	cmp	\$6,$len
1688	jb	.Lcbc_dec_five
1689	movdqu	0x50($inp), @XMM[5]
1690	je	.Lcbc_dec_six
1691	movdqu	0x60($inp), @XMM[6]
1692	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1693	call	_bsaes_decrypt8
1694	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1695	movdqu	0x00($inp), @XMM[8]	# re-load input
1696	movdqu	0x10($inp), @XMM[9]
1697	pxor	@XMM[8], @XMM[1]
1698	movdqu	0x20($inp), @XMM[10]
1699	pxor	@XMM[9], @XMM[6]
1700	movdqu	0x30($inp), @XMM[11]
1701	pxor	@XMM[10], @XMM[4]
1702	movdqu	0x40($inp), @XMM[12]
1703	pxor	@XMM[11], @XMM[2]
1704	movdqu	0x50($inp), @XMM[13]
1705	pxor	@XMM[12], @XMM[7]
1706	movdqu	0x60($inp), @XMM[15]	# IV
1707	pxor	@XMM[13], @XMM[3]
1708	movdqu	@XMM[0], 0x00($out)	# write output
1709	movdqu	@XMM[1], 0x10($out)
1710	movdqu	@XMM[6], 0x20($out)
1711	movdqu	@XMM[4], 0x30($out)
1712	movdqu	@XMM[2], 0x40($out)
1713	movdqu	@XMM[7], 0x50($out)
1714	movdqu	@XMM[3], 0x60($out)
1715	jmp	.Lcbc_dec_done
1716.align	16
1717.Lcbc_dec_six:
1718	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1719	call	_bsaes_decrypt8
1720	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1721	movdqu	0x00($inp), @XMM[8]	# re-load input
1722	movdqu	0x10($inp), @XMM[9]
1723	pxor	@XMM[8], @XMM[1]
1724	movdqu	0x20($inp), @XMM[10]
1725	pxor	@XMM[9], @XMM[6]
1726	movdqu	0x30($inp), @XMM[11]
1727	pxor	@XMM[10], @XMM[4]
1728	movdqu	0x40($inp), @XMM[12]
1729	pxor	@XMM[11], @XMM[2]
1730	movdqu	0x50($inp), @XMM[15]	# IV
1731	pxor	@XMM[12], @XMM[7]
1732	movdqu	@XMM[0], 0x00($out)	# write output
1733	movdqu	@XMM[1], 0x10($out)
1734	movdqu	@XMM[6], 0x20($out)
1735	movdqu	@XMM[4], 0x30($out)
1736	movdqu	@XMM[2], 0x40($out)
1737	movdqu	@XMM[7], 0x50($out)
1738	jmp	.Lcbc_dec_done
1739.align	16
1740.Lcbc_dec_five:
1741	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1742	call	_bsaes_decrypt8
1743	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1744	movdqu	0x00($inp), @XMM[8]	# re-load input
1745	movdqu	0x10($inp), @XMM[9]
1746	pxor	@XMM[8], @XMM[1]
1747	movdqu	0x20($inp), @XMM[10]
1748	pxor	@XMM[9], @XMM[6]
1749	movdqu	0x30($inp), @XMM[11]
1750	pxor	@XMM[10], @XMM[4]
1751	movdqu	0x40($inp), @XMM[15]	# IV
1752	pxor	@XMM[11], @XMM[2]
1753	movdqu	@XMM[0], 0x00($out)	# write output
1754	movdqu	@XMM[1], 0x10($out)
1755	movdqu	@XMM[6], 0x20($out)
1756	movdqu	@XMM[4], 0x30($out)
1757	movdqu	@XMM[2], 0x40($out)
1758	jmp	.Lcbc_dec_done
1759.align	16
1760.Lcbc_dec_four:
1761	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1762	call	_bsaes_decrypt8
1763	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1764	movdqu	0x00($inp), @XMM[8]	# re-load input
1765	movdqu	0x10($inp), @XMM[9]
1766	pxor	@XMM[8], @XMM[1]
1767	movdqu	0x20($inp), @XMM[10]
1768	pxor	@XMM[9], @XMM[6]
1769	movdqu	0x30($inp), @XMM[15]	# IV
1770	pxor	@XMM[10], @XMM[4]
1771	movdqu	@XMM[0], 0x00($out)	# write output
1772	movdqu	@XMM[1], 0x10($out)
1773	movdqu	@XMM[6], 0x20($out)
1774	movdqu	@XMM[4], 0x30($out)
1775	jmp	.Lcbc_dec_done
1776.align	16
1777.Lcbc_dec_three:
1778	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1779	call	_bsaes_decrypt8
1780	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1781	movdqu	0x00($inp), @XMM[8]	# re-load input
1782	movdqu	0x10($inp), @XMM[9]
1783	pxor	@XMM[8], @XMM[1]
1784	movdqu	0x20($inp), @XMM[15]	# IV
1785	pxor	@XMM[9], @XMM[6]
1786	movdqu	@XMM[0], 0x00($out)	# write output
1787	movdqu	@XMM[1], 0x10($out)
1788	movdqu	@XMM[6], 0x20($out)
1789	jmp	.Lcbc_dec_done
1790.align	16
1791.Lcbc_dec_two:
1792	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1793	call	_bsaes_decrypt8
1794	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1795	movdqu	0x00($inp), @XMM[8]	# re-load input
1796	movdqu	0x10($inp), @XMM[15]	# IV
1797	pxor	@XMM[8], @XMM[1]
1798	movdqu	@XMM[0], 0x00($out)	# write output
1799	movdqu	@XMM[1], 0x10($out)
1800	jmp	.Lcbc_dec_done
1801.align	16
1802.Lcbc_dec_one:
1803	lea	($inp), $arg1
1804	lea	0x20(%rbp), $arg2	# buffer output
1805	lea	($key), $arg3
1806	call	asm_AES_decrypt		# doesn't touch %xmm
1807	pxor	0x20(%rbp), @XMM[15]	# ^= IV
1808	movdqu	@XMM[15], ($out)	# write output
1809	movdqa	@XMM[0], @XMM[15]	# IV
1810
1811.Lcbc_dec_done:
1812	movdqu	@XMM[15], (%rbx)	# return IV
1813	lea	(%rsp), %rax
1814	pxor	%xmm0, %xmm0
1815.Lcbc_dec_bzero:			# wipe key schedule [if any]
1816	movdqa	%xmm0, 0x00(%rax)
1817	movdqa	%xmm0, 0x10(%rax)
1818	lea	0x20(%rax), %rax
1819	cmp	%rax, %rbp
1820	ja	.Lcbc_dec_bzero
1821
1822	lea	0x78(%rbp),%rax
1823___
1824$code.=<<___ if ($win64);
1825	movaps	0x40(%rbp), %xmm6
1826	movaps	0x50(%rbp), %xmm7
1827	movaps	0x60(%rbp), %xmm8
1828	movaps	0x70(%rbp), %xmm9
1829	movaps	0x80(%rbp), %xmm10
1830	movaps	0x90(%rbp), %xmm11
1831	movaps	0xa0(%rbp), %xmm12
1832	movaps	0xb0(%rbp), %xmm13
1833	movaps	0xc0(%rbp), %xmm14
1834	movaps	0xd0(%rbp), %xmm15
1835	lea	0xa0(%rax), %rax
1836.Lcbc_dec_tail:
1837___
1838$code.=<<___;
1839	mov	-48(%rax), %r15
1840	mov	-40(%rax), %r14
1841	mov	-32(%rax), %r13
1842	mov	-24(%rax), %r12
1843	mov	-16(%rax), %rbx
1844	mov	-8(%rax), %rbp
1845	lea	(%rax), %rsp		# restore %rsp
1846.Lcbc_dec_epilogue:
1847	ret
1848.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1849
1850.globl	bsaes_ctr32_encrypt_blocks
1851.type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1852.align	16
1853bsaes_ctr32_encrypt_blocks:
1854	mov	%rsp, %rax
1855.Lctr_enc_prologue:
1856	push	%rbp
1857	push	%rbx
1858	push	%r12
1859	push	%r13
1860	push	%r14
1861	push	%r15
1862	lea	-0x48(%rsp), %rsp
1863___
1864$code.=<<___ if ($win64);
1865	mov	0xa0(%rsp),$arg5	# pull ivp
1866	lea	-0xa0(%rsp), %rsp
1867	movaps	%xmm6, 0x40(%rsp)
1868	movaps	%xmm7, 0x50(%rsp)
1869	movaps	%xmm8, 0x60(%rsp)
1870	movaps	%xmm9, 0x70(%rsp)
1871	movaps	%xmm10, 0x80(%rsp)
1872	movaps	%xmm11, 0x90(%rsp)
1873	movaps	%xmm12, 0xa0(%rsp)
1874	movaps	%xmm13, 0xb0(%rsp)
1875	movaps	%xmm14, 0xc0(%rsp)
1876	movaps	%xmm15, 0xd0(%rsp)
1877.Lctr_enc_body:
1878___
1879$code.=<<___;
1880	mov	%rsp, %rbp		# backup %rsp
1881	movdqu	($arg5), %xmm0		# load counter
1882	mov	240($arg4), %eax	# rounds
1883	mov	$arg1, $inp		# backup arguments
1884	mov	$arg2, $out
1885	mov	$arg3, $len
1886	mov	$arg4, $key
1887	movdqa	%xmm0, 0x20(%rbp)	# copy counter
1888	cmp	\$8, $arg3
1889	jb	.Lctr_enc_short
1890
1891	mov	%eax, %ebx		# rounds
1892	shl	\$7, %rax		# 128 bytes per inner round key
1893	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
1894	sub	%rax, %rsp
1895
1896	mov	%rsp, %rax		# pass key schedule
1897	mov	$key, %rcx		# pass key
1898	mov	%ebx, %r10d		# pass rounds
1899	call	_bsaes_key_convert
1900	pxor	%xmm6,%xmm7		# fix up last round key
1901	movdqa	%xmm7,(%rax)		# save last round key
1902
1903	movdqa	(%rsp), @XMM[9]		# load round0 key
1904	lea	.LADD1(%rip), %r11
1905	movdqa	0x20(%rbp), @XMM[0]	# counter copy
1906	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
1907	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
1908	pshufb	@XMM[8], @XMM[0]
1909	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
1910	jmp	.Lctr_enc_loop
1911.align	16
1912.Lctr_enc_loop:
1913	movdqa	@XMM[0], 0x20(%rbp)	# save counter
1914	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
1915	movdqa	@XMM[0], @XMM[2]
1916	paddd	0x00(%r11), @XMM[1]	# .LADD1
1917	movdqa	@XMM[0], @XMM[3]
1918	paddd	0x10(%r11), @XMM[2]	# .LADD2
1919	movdqa	@XMM[0], @XMM[4]
1920	paddd	0x20(%r11), @XMM[3]	# .LADD3
1921	movdqa	@XMM[0], @XMM[5]
1922	paddd	0x30(%r11), @XMM[4]	# .LADD4
1923	movdqa	@XMM[0], @XMM[6]
1924	paddd	0x40(%r11), @XMM[5]	# .LADD5
1925	movdqa	@XMM[0], @XMM[7]
1926	paddd	0x50(%r11), @XMM[6]	# .LADD6
1927	paddd	0x60(%r11), @XMM[7]	# .LADD7
1928
1929	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
1930	# to flip byte order in 32-bit counter
1931	movdqa	(%rsp), @XMM[9]		# round 0 key
1932	lea	0x10(%rsp), %rax	# pass key schedule
1933	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
1934	pxor	@XMM[9], @XMM[0]	# xor with round0 key
1935	pxor	@XMM[9], @XMM[1]
1936	pxor	@XMM[9], @XMM[2]
1937	pxor	@XMM[9], @XMM[3]
1938	 pshufb	@XMM[8], @XMM[0]
1939	 pshufb	@XMM[8], @XMM[1]
1940	pxor	@XMM[9], @XMM[4]
1941	pxor	@XMM[9], @XMM[5]
1942	 pshufb	@XMM[8], @XMM[2]
1943	 pshufb	@XMM[8], @XMM[3]
1944	pxor	@XMM[9], @XMM[6]
1945	pxor	@XMM[9], @XMM[7]
1946	 pshufb	@XMM[8], @XMM[4]
1947	 pshufb	@XMM[8], @XMM[5]
1948	 pshufb	@XMM[8], @XMM[6]
1949	 pshufb	@XMM[8], @XMM[7]
1950	lea	.LBS0(%rip), %r11	# constants table
1951	mov	%ebx,%r10d		# pass rounds
1952
1953	call	_bsaes_encrypt8_bitslice
1954
1955	sub	\$8,$len
1956	jc	.Lctr_enc_loop_done
1957
1958	movdqu	0x00($inp), @XMM[8]	# load input
1959	movdqu	0x10($inp), @XMM[9]
1960	movdqu	0x20($inp), @XMM[10]
1961	movdqu	0x30($inp), @XMM[11]
1962	movdqu	0x40($inp), @XMM[12]
1963	movdqu	0x50($inp), @XMM[13]
1964	movdqu	0x60($inp), @XMM[14]
1965	movdqu	0x70($inp), @XMM[15]
1966	lea	0x80($inp),$inp
1967	pxor	@XMM[0], @XMM[8]
1968	movdqa	0x20(%rbp), @XMM[0]	# load counter
1969	pxor	@XMM[9], @XMM[1]
1970	movdqu	@XMM[8], 0x00($out)	# write output
1971	pxor	@XMM[10], @XMM[4]
1972	movdqu	@XMM[1], 0x10($out)
1973	pxor	@XMM[11], @XMM[6]
1974	movdqu	@XMM[4], 0x20($out)
1975	pxor	@XMM[12], @XMM[3]
1976	movdqu	@XMM[6], 0x30($out)
1977	pxor	@XMM[13], @XMM[7]
1978	movdqu	@XMM[3], 0x40($out)
1979	pxor	@XMM[14], @XMM[2]
1980	movdqu	@XMM[7], 0x50($out)
1981	pxor	@XMM[15], @XMM[5]
1982	movdqu	@XMM[2], 0x60($out)
1983	lea	.LADD1(%rip), %r11
1984	movdqu	@XMM[5], 0x70($out)
1985	lea	0x80($out), $out
1986	paddd	0x70(%r11), @XMM[0]	# .LADD8
1987	jnz	.Lctr_enc_loop
1988
1989	jmp	.Lctr_enc_done
1990.align	16
1991.Lctr_enc_loop_done:
1992	add	\$8, $len
1993	movdqu	0x00($inp), @XMM[8]	# load input
1994	pxor	@XMM[8], @XMM[0]
1995	movdqu	@XMM[0], 0x00($out)	# write output
1996	cmp	\$2,$len
1997	jb	.Lctr_enc_done
1998	movdqu	0x10($inp), @XMM[9]
1999	pxor	@XMM[9], @XMM[1]
2000	movdqu	@XMM[1], 0x10($out)
2001	je	.Lctr_enc_done
2002	movdqu	0x20($inp), @XMM[10]
2003	pxor	@XMM[10], @XMM[4]
2004	movdqu	@XMM[4], 0x20($out)
2005	cmp	\$4,$len
2006	jb	.Lctr_enc_done
2007	movdqu	0x30($inp), @XMM[11]
2008	pxor	@XMM[11], @XMM[6]
2009	movdqu	@XMM[6], 0x30($out)
2010	je	.Lctr_enc_done
2011	movdqu	0x40($inp), @XMM[12]
2012	pxor	@XMM[12], @XMM[3]
2013	movdqu	@XMM[3], 0x40($out)
2014	cmp	\$6,$len
2015	jb	.Lctr_enc_done
2016	movdqu	0x50($inp), @XMM[13]
2017	pxor	@XMM[13], @XMM[7]
2018	movdqu	@XMM[7], 0x50($out)
2019	je	.Lctr_enc_done
2020	movdqu	0x60($inp), @XMM[14]
2021	pxor	@XMM[14], @XMM[2]
2022	movdqu	@XMM[2], 0x60($out)
2023	jmp	.Lctr_enc_done
2024
2025.align	16
2026.Lctr_enc_short:
2027	lea	0x20(%rbp), $arg1
2028	lea	0x30(%rbp), $arg2
2029	lea	($key), $arg3
2030	call	asm_AES_encrypt
2031	movdqu	($inp), @XMM[1]
2032	lea	16($inp), $inp
2033	mov	0x2c(%rbp), %eax	# load 32-bit counter
2034	bswap	%eax
2035	pxor	0x30(%rbp), @XMM[1]
2036	inc	%eax			# increment
2037	movdqu	@XMM[1], ($out)
2038	bswap	%eax
2039	lea	16($out), $out
2040	mov	%eax, 0x2c(%rsp)	# save 32-bit counter
2041	dec	$len
2042	jnz	.Lctr_enc_short
2043
2044.Lctr_enc_done:
2045	lea	(%rsp), %rax
2046	pxor	%xmm0, %xmm0
2047.Lctr_enc_bzero:			# wipe key schedule [if any]
2048	movdqa	%xmm0, 0x00(%rax)
2049	movdqa	%xmm0, 0x10(%rax)
2050	lea	0x20(%rax), %rax
2051	cmp	%rax, %rbp
2052	ja	.Lctr_enc_bzero
2053
2054	lea	0x78(%rbp),%rax
2055___
2056$code.=<<___ if ($win64);
2057	movaps	0x40(%rbp), %xmm6
2058	movaps	0x50(%rbp), %xmm7
2059	movaps	0x60(%rbp), %xmm8
2060	movaps	0x70(%rbp), %xmm9
2061	movaps	0x80(%rbp), %xmm10
2062	movaps	0x90(%rbp), %xmm11
2063	movaps	0xa0(%rbp), %xmm12
2064	movaps	0xb0(%rbp), %xmm13
2065	movaps	0xc0(%rbp), %xmm14
2066	movaps	0xd0(%rbp), %xmm15
2067	lea	0xa0(%rax), %rax
2068.Lctr_enc_tail:
2069___
2070$code.=<<___;
2071	mov	-48(%rax), %r15
2072	mov	-40(%rax), %r14
2073	mov	-32(%rax), %r13
2074	mov	-24(%rax), %r12
2075	mov	-16(%rax), %rbx
2076	mov	-8(%rax), %rbp
2077	lea	(%rax), %rsp		# restore %rsp
2078.Lctr_enc_epilogue:
2079	ret
2080.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2081___
2082######################################################################
2083# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2084#	const AES_KEY *key1, const AES_KEY *key2,
2085#	const unsigned char iv[16]);
2086#
2087my ($twmask,$twres,$twtmp)=@XMM[13..15];
2088$arg6=~s/d$//;
2089
2090$code.=<<___;
2091.globl	bsaes_xts_encrypt
2092.type	bsaes_xts_encrypt,\@abi-omnipotent
2093.align	16
2094bsaes_xts_encrypt:
2095	mov	%rsp, %rax
2096.Lxts_enc_prologue:
2097	push	%rbp
2098	push	%rbx
2099	push	%r12
2100	push	%r13
2101	push	%r14
2102	push	%r15
2103	lea	-0x48(%rsp), %rsp
2104___
2105$code.=<<___ if ($win64);
2106	mov	0xa0(%rsp),$arg5	# pull key2
2107	mov	0xa8(%rsp),$arg6	# pull ivp
2108	lea	-0xa0(%rsp), %rsp
2109	movaps	%xmm6, 0x40(%rsp)
2110	movaps	%xmm7, 0x50(%rsp)
2111	movaps	%xmm8, 0x60(%rsp)
2112	movaps	%xmm9, 0x70(%rsp)
2113	movaps	%xmm10, 0x80(%rsp)
2114	movaps	%xmm11, 0x90(%rsp)
2115	movaps	%xmm12, 0xa0(%rsp)
2116	movaps	%xmm13, 0xb0(%rsp)
2117	movaps	%xmm14, 0xc0(%rsp)
2118	movaps	%xmm15, 0xd0(%rsp)
2119.Lxts_enc_body:
2120___
2121$code.=<<___;
2122	mov	%rsp, %rbp		# backup %rsp
2123	mov	$arg1, $inp		# backup arguments
2124	mov	$arg2, $out
2125	mov	$arg3, $len
2126	mov	$arg4, $key
2127
2128	lea	($arg6), $arg1
2129	lea	0x20(%rbp), $arg2
2130	lea	($arg5), $arg3
2131	call	asm_AES_encrypt		# generate initial tweak
2132
2133	mov	240($key), %eax		# rounds
2134	mov	$len, %rbx		# backup $len
2135
2136	mov	%eax, %edx		# rounds
2137	shl	\$7, %rax		# 128 bytes per inner round key
2138	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
2139	sub	%rax, %rsp
2140
2141	mov	%rsp, %rax		# pass key schedule
2142	mov	$key, %rcx		# pass key
2143	mov	%edx, %r10d		# pass rounds
2144	call	_bsaes_key_convert
2145	pxor	%xmm6, %xmm7		# fix up last round key
2146	movdqa	%xmm7, (%rax)		# save last round key
2147
2148	and	\$-16, $len
2149	sub	\$0x80, %rsp		# place for tweak[8]
2150	movdqa	0x20(%rbp), @XMM[7]	# initial tweak
2151
2152	pxor	$twtmp, $twtmp
2153	movdqa	.Lxts_magic(%rip), $twmask
2154	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2155
2156	sub	\$0x80, $len
2157	jc	.Lxts_enc_short
2158	jmp	.Lxts_enc_loop
2159
2160.align	16
2161.Lxts_enc_loop:
2162___
2163    for ($i=0;$i<7;$i++) {
2164    $code.=<<___;
2165	pshufd	\$0x13, $twtmp, $twres
2166	pxor	$twtmp, $twtmp
2167	movdqa	@XMM[7], @XMM[$i]
2168	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2169	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2170	pand	$twmask, $twres		# isolate carry and residue
2171	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2172	pxor	$twres, @XMM[7]
2173___
2174    $code.=<<___ if ($i>=1);
2175	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2176___
2177    $code.=<<___ if ($i>=2);
2178	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2179___
2180    }
2181$code.=<<___;
2182	movdqu	0x60($inp), @XMM[8+6]
2183	pxor	@XMM[8+5], @XMM[5]
2184	movdqu	0x70($inp), @XMM[8+7]
2185	lea	0x80($inp), $inp
2186	movdqa	@XMM[7], 0x70(%rsp)
2187	pxor	@XMM[8+6], @XMM[6]
2188	lea	0x80(%rsp), %rax	# pass key schedule
2189	pxor	@XMM[8+7], @XMM[7]
2190	mov	%edx, %r10d		# pass rounds
2191
2192	call	_bsaes_encrypt8
2193
2194	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2195	pxor	0x10(%rsp), @XMM[1]
2196	movdqu	@XMM[0], 0x00($out)	# write output
2197	pxor	0x20(%rsp), @XMM[4]
2198	movdqu	@XMM[1], 0x10($out)
2199	pxor	0x30(%rsp), @XMM[6]
2200	movdqu	@XMM[4], 0x20($out)
2201	pxor	0x40(%rsp), @XMM[3]
2202	movdqu	@XMM[6], 0x30($out)
2203	pxor	0x50(%rsp), @XMM[7]
2204	movdqu	@XMM[3], 0x40($out)
2205	pxor	0x60(%rsp), @XMM[2]
2206	movdqu	@XMM[7], 0x50($out)
2207	pxor	0x70(%rsp), @XMM[5]
2208	movdqu	@XMM[2], 0x60($out)
2209	movdqu	@XMM[5], 0x70($out)
2210	lea	0x80($out), $out
2211
2212	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
2213	pxor	$twtmp, $twtmp
2214	movdqa	.Lxts_magic(%rip), $twmask
2215	pcmpgtd	@XMM[7], $twtmp
2216	pshufd	\$0x13, $twtmp, $twres
2217	pxor	$twtmp, $twtmp
2218	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2219	pand	$twmask, $twres		# isolate carry and residue
2220	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2221	pxor	$twres, @XMM[7]
2222
2223	sub	\$0x80,$len
2224	jnc	.Lxts_enc_loop
2225
2226.Lxts_enc_short:
2227	add	\$0x80, $len
2228	jz	.Lxts_enc_done
2229___
2230    for ($i=0;$i<7;$i++) {
2231    $code.=<<___;
2232	pshufd	\$0x13, $twtmp, $twres
2233	pxor	$twtmp, $twtmp
2234	movdqa	@XMM[7], @XMM[$i]
2235	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2236	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2237	pand	$twmask, $twres		# isolate carry and residue
2238	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2239	pxor	$twres, @XMM[7]
2240___
2241    $code.=<<___ if ($i>=1);
2242	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2243	cmp	\$`0x10*$i`,$len
2244	je	.Lxts_enc_$i
2245___
2246    $code.=<<___ if ($i>=2);
2247	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2248___
2249    }
2250$code.=<<___;
2251	movdqu	0x60($inp), @XMM[8+6]
2252	pxor	@XMM[8+5], @XMM[5]
2253	movdqa	@XMM[7], 0x70(%rsp)
2254	lea	0x70($inp), $inp
2255	pxor	@XMM[8+6], @XMM[6]
2256	lea	0x80(%rsp), %rax	# pass key schedule
2257	mov	%edx, %r10d		# pass rounds
2258
2259	call	_bsaes_encrypt8
2260
2261	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2262	pxor	0x10(%rsp), @XMM[1]
2263	movdqu	@XMM[0], 0x00($out)	# write output
2264	pxor	0x20(%rsp), @XMM[4]
2265	movdqu	@XMM[1], 0x10($out)
2266	pxor	0x30(%rsp), @XMM[6]
2267	movdqu	@XMM[4], 0x20($out)
2268	pxor	0x40(%rsp), @XMM[3]
2269	movdqu	@XMM[6], 0x30($out)
2270	pxor	0x50(%rsp), @XMM[7]
2271	movdqu	@XMM[3], 0x40($out)
2272	pxor	0x60(%rsp), @XMM[2]
2273	movdqu	@XMM[7], 0x50($out)
2274	movdqu	@XMM[2], 0x60($out)
2275	lea	0x70($out), $out
2276
2277	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
2278	jmp	.Lxts_enc_done
2279.align	16
2280.Lxts_enc_6:
2281	pxor	@XMM[8+4], @XMM[4]
2282	lea	0x60($inp), $inp
2283	pxor	@XMM[8+5], @XMM[5]
2284	lea	0x80(%rsp), %rax	# pass key schedule
2285	mov	%edx, %r10d		# pass rounds
2286
2287	call	_bsaes_encrypt8
2288
2289	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2290	pxor	0x10(%rsp), @XMM[1]
2291	movdqu	@XMM[0], 0x00($out)	# write output
2292	pxor	0x20(%rsp), @XMM[4]
2293	movdqu	@XMM[1], 0x10($out)
2294	pxor	0x30(%rsp), @XMM[6]
2295	movdqu	@XMM[4], 0x20($out)
2296	pxor	0x40(%rsp), @XMM[3]
2297	movdqu	@XMM[6], 0x30($out)
2298	pxor	0x50(%rsp), @XMM[7]
2299	movdqu	@XMM[3], 0x40($out)
2300	movdqu	@XMM[7], 0x50($out)
2301	lea	0x60($out), $out
2302
2303	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
2304	jmp	.Lxts_enc_done
2305.align	16
2306.Lxts_enc_5:
2307	pxor	@XMM[8+3], @XMM[3]
2308	lea	0x50($inp), $inp
2309	pxor	@XMM[8+4], @XMM[4]
2310	lea	0x80(%rsp), %rax	# pass key schedule
2311	mov	%edx, %r10d		# pass rounds
2312
2313	call	_bsaes_encrypt8
2314
2315	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2316	pxor	0x10(%rsp), @XMM[1]
2317	movdqu	@XMM[0], 0x00($out)	# write output
2318	pxor	0x20(%rsp), @XMM[4]
2319	movdqu	@XMM[1], 0x10($out)
2320	pxor	0x30(%rsp), @XMM[6]
2321	movdqu	@XMM[4], 0x20($out)
2322	pxor	0x40(%rsp), @XMM[3]
2323	movdqu	@XMM[6], 0x30($out)
2324	movdqu	@XMM[3], 0x40($out)
2325	lea	0x50($out), $out
2326
2327	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
2328	jmp	.Lxts_enc_done
2329.align	16
2330.Lxts_enc_4:
2331	pxor	@XMM[8+2], @XMM[2]
2332	lea	0x40($inp), $inp
2333	pxor	@XMM[8+3], @XMM[3]
2334	lea	0x80(%rsp), %rax	# pass key schedule
2335	mov	%edx, %r10d		# pass rounds
2336
2337	call	_bsaes_encrypt8
2338
2339	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2340	pxor	0x10(%rsp), @XMM[1]
2341	movdqu	@XMM[0], 0x00($out)	# write output
2342	pxor	0x20(%rsp), @XMM[4]
2343	movdqu	@XMM[1], 0x10($out)
2344	pxor	0x30(%rsp), @XMM[6]
2345	movdqu	@XMM[4], 0x20($out)
2346	movdqu	@XMM[6], 0x30($out)
2347	lea	0x40($out), $out
2348
2349	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
2350	jmp	.Lxts_enc_done
2351.align	16
2352.Lxts_enc_3:
2353	pxor	@XMM[8+1], @XMM[1]
2354	lea	0x30($inp), $inp
2355	pxor	@XMM[8+2], @XMM[2]
2356	lea	0x80(%rsp), %rax	# pass key schedule
2357	mov	%edx, %r10d		# pass rounds
2358
2359	call	_bsaes_encrypt8
2360
2361	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2362	pxor	0x10(%rsp), @XMM[1]
2363	movdqu	@XMM[0], 0x00($out)	# write output
2364	pxor	0x20(%rsp), @XMM[4]
2365	movdqu	@XMM[1], 0x10($out)
2366	movdqu	@XMM[4], 0x20($out)
2367	lea	0x30($out), $out
2368
2369	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
2370	jmp	.Lxts_enc_done
2371.align	16
2372.Lxts_enc_2:
2373	pxor	@XMM[8+0], @XMM[0]
2374	lea	0x20($inp), $inp
2375	pxor	@XMM[8+1], @XMM[1]
2376	lea	0x80(%rsp), %rax	# pass key schedule
2377	mov	%edx, %r10d		# pass rounds
2378
2379	call	_bsaes_encrypt8
2380
2381	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2382	pxor	0x10(%rsp), @XMM[1]
2383	movdqu	@XMM[0], 0x00($out)	# write output
2384	movdqu	@XMM[1], 0x10($out)
2385	lea	0x20($out), $out
2386
2387	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
2388	jmp	.Lxts_enc_done
2389.align	16
2390.Lxts_enc_1:
2391	pxor	@XMM[0], @XMM[8]
2392	lea	0x10($inp), $inp
2393	movdqa	@XMM[8], 0x20(%rbp)
2394	lea	0x20(%rbp), $arg1
2395	lea	0x20(%rbp), $arg2
2396	lea	($key), $arg3
2397	call	asm_AES_encrypt		# doesn't touch %xmm
2398	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
2399	#pxor	@XMM[8], @XMM[0]
2400	#lea	0x80(%rsp), %rax	# pass key schedule
2401	#mov	%edx, %r10d		# pass rounds
2402	#call	_bsaes_encrypt8
2403	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2404	movdqu	@XMM[0], 0x00($out)	# write output
2405	lea	0x10($out), $out
2406
2407	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak
2408
2409.Lxts_enc_done:
2410	and	\$15, %ebx
2411	jz	.Lxts_enc_ret
2412	mov	$out, %rdx
2413
2414.Lxts_enc_steal:
2415	movzb	($inp), %eax
2416	movzb	-16(%rdx), %ecx
2417	lea	1($inp), $inp
2418	mov	%al, -16(%rdx)
2419	mov	%cl, 0(%rdx)
2420	lea	1(%rdx), %rdx
2421	sub	\$1,%ebx
2422	jnz	.Lxts_enc_steal
2423
2424	movdqu	-16($out), @XMM[0]
2425	lea	0x20(%rbp), $arg1
2426	pxor	@XMM[7], @XMM[0]
2427	lea	0x20(%rbp), $arg2
2428	movdqa	@XMM[0], 0x20(%rbp)
2429	lea	($key), $arg3
2430	call	asm_AES_encrypt		# doesn't touch %xmm
2431	pxor	0x20(%rbp), @XMM[7]
2432	movdqu	@XMM[7], -16($out)
2433
2434.Lxts_enc_ret:
2435	lea	(%rsp), %rax
2436	pxor	%xmm0, %xmm0
2437.Lxts_enc_bzero:			# wipe key schedule [if any]
2438	movdqa	%xmm0, 0x00(%rax)
2439	movdqa	%xmm0, 0x10(%rax)
2440	lea	0x20(%rax), %rax
2441	cmp	%rax, %rbp
2442	ja	.Lxts_enc_bzero
2443
2444	lea	0x78(%rbp),%rax
2445___
2446$code.=<<___ if ($win64);
2447	movaps	0x40(%rbp), %xmm6
2448	movaps	0x50(%rbp), %xmm7
2449	movaps	0x60(%rbp), %xmm8
2450	movaps	0x70(%rbp), %xmm9
2451	movaps	0x80(%rbp), %xmm10
2452	movaps	0x90(%rbp), %xmm11
2453	movaps	0xa0(%rbp), %xmm12
2454	movaps	0xb0(%rbp), %xmm13
2455	movaps	0xc0(%rbp), %xmm14
2456	movaps	0xd0(%rbp), %xmm15
2457	lea	0xa0(%rax), %rax
2458.Lxts_enc_tail:
2459___
2460$code.=<<___;
2461	mov	-48(%rax), %r15
2462	mov	-40(%rax), %r14
2463	mov	-32(%rax), %r13
2464	mov	-24(%rax), %r12
2465	mov	-16(%rax), %rbx
2466	mov	-8(%rax), %rbp
2467	lea	(%rax), %rsp		# restore %rsp
2468.Lxts_enc_epilogue:
2469	ret
2470.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt
2471
2472.globl	bsaes_xts_decrypt
2473.type	bsaes_xts_decrypt,\@abi-omnipotent
2474.align	16
2475bsaes_xts_decrypt:
2476	mov	%rsp, %rax
2477.Lxts_dec_prologue:
2478	push	%rbp
2479	push	%rbx
2480	push	%r12
2481	push	%r13
2482	push	%r14
2483	push	%r15
2484	lea	-0x48(%rsp), %rsp
2485___
2486$code.=<<___ if ($win64);
2487	mov	0xa0(%rsp),$arg5	# pull key2
2488	mov	0xa8(%rsp),$arg6	# pull ivp
2489	lea	-0xa0(%rsp), %rsp
2490	movaps	%xmm6, 0x40(%rsp)
2491	movaps	%xmm7, 0x50(%rsp)
2492	movaps	%xmm8, 0x60(%rsp)
2493	movaps	%xmm9, 0x70(%rsp)
2494	movaps	%xmm10, 0x80(%rsp)
2495	movaps	%xmm11, 0x90(%rsp)
2496	movaps	%xmm12, 0xa0(%rsp)
2497	movaps	%xmm13, 0xb0(%rsp)
2498	movaps	%xmm14, 0xc0(%rsp)
2499	movaps	%xmm15, 0xd0(%rsp)
2500.Lxts_dec_body:
2501___
2502$code.=<<___;
2503	mov	%rsp, %rbp		# backup %rsp
2504	mov	$arg1, $inp		# backup arguments
2505	mov	$arg2, $out
2506	mov	$arg3, $len
2507	mov	$arg4, $key
2508
2509	lea	($arg6), $arg1
2510	lea	0x20(%rbp), $arg2
2511	lea	($arg5), $arg3
2512	call	asm_AES_encrypt		# generate initial tweak
2513
2514	mov	240($key), %eax		# rounds
2515	mov	$len, %rbx		# backup $len
2516
2517	mov	%eax, %edx		# rounds
2518	shl	\$7, %rax		# 128 bytes per inner round key
2519	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
2520	sub	%rax, %rsp
2521
2522	mov	%rsp, %rax		# pass key schedule
2523	mov	$key, %rcx		# pass key
2524	mov	%edx, %r10d		# pass rounds
2525	call	_bsaes_key_convert
2526	pxor	(%rsp), %xmm7		# fix up round 0 key
2527	movdqa	%xmm6, (%rax)		# save last round key
2528	movdqa	%xmm7, (%rsp)
2529
2530	xor	%eax, %eax		# if ($len%16) len-=16;
2531	and	\$-16, $len
2532	test	\$15, %ebx
2533	setnz	%al
2534	shl	\$4, %rax
2535	sub	%rax, $len
2536
2537	sub	\$0x80, %rsp		# place for tweak[8]
2538	movdqa	0x20(%rbp), @XMM[7]	# initial tweak
2539
2540	pxor	$twtmp, $twtmp
2541	movdqa	.Lxts_magic(%rip), $twmask
2542	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2543
2544	sub	\$0x80, $len
2545	jc	.Lxts_dec_short
2546	jmp	.Lxts_dec_loop
2547
2548.align	16
2549.Lxts_dec_loop:
2550___
2551    for ($i=0;$i<7;$i++) {
2552    $code.=<<___;
2553	pshufd	\$0x13, $twtmp, $twres
2554	pxor	$twtmp, $twtmp
2555	movdqa	@XMM[7], @XMM[$i]
2556	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2557	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2558	pand	$twmask, $twres		# isolate carry and residue
2559	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2560	pxor	$twres, @XMM[7]
2561___
2562    $code.=<<___ if ($i>=1);
2563	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2564___
2565    $code.=<<___ if ($i>=2);
2566	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2567___
2568    }
2569$code.=<<___;
2570	movdqu	0x60($inp), @XMM[8+6]
2571	pxor	@XMM[8+5], @XMM[5]
2572	movdqu	0x70($inp), @XMM[8+7]
2573	lea	0x80($inp), $inp
2574	movdqa	@XMM[7], 0x70(%rsp)
2575	pxor	@XMM[8+6], @XMM[6]
2576	lea	0x80(%rsp), %rax	# pass key schedule
2577	pxor	@XMM[8+7], @XMM[7]
2578	mov	%edx, %r10d		# pass rounds
2579
2580	call	_bsaes_decrypt8
2581
2582	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2583	pxor	0x10(%rsp), @XMM[1]
2584	movdqu	@XMM[0], 0x00($out)	# write output
2585	pxor	0x20(%rsp), @XMM[6]
2586	movdqu	@XMM[1], 0x10($out)
2587	pxor	0x30(%rsp), @XMM[4]
2588	movdqu	@XMM[6], 0x20($out)
2589	pxor	0x40(%rsp), @XMM[2]
2590	movdqu	@XMM[4], 0x30($out)
2591	pxor	0x50(%rsp), @XMM[7]
2592	movdqu	@XMM[2], 0x40($out)
2593	pxor	0x60(%rsp), @XMM[3]
2594	movdqu	@XMM[7], 0x50($out)
2595	pxor	0x70(%rsp), @XMM[5]
2596	movdqu	@XMM[3], 0x60($out)
2597	movdqu	@XMM[5], 0x70($out)
2598	lea	0x80($out), $out
2599
2600	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
2601	pxor	$twtmp, $twtmp
2602	movdqa	.Lxts_magic(%rip), $twmask
2603	pcmpgtd	@XMM[7], $twtmp
2604	pshufd	\$0x13, $twtmp, $twres
2605	pxor	$twtmp, $twtmp
2606	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2607	pand	$twmask, $twres		# isolate carry and residue
2608	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2609	pxor	$twres, @XMM[7]
2610
2611	sub	\$0x80,$len
2612	jnc	.Lxts_dec_loop
2613
2614.Lxts_dec_short:
2615	add	\$0x80, $len
2616	jz	.Lxts_dec_done
2617___
2618    for ($i=0;$i<7;$i++) {
2619    $code.=<<___;
2620	pshufd	\$0x13, $twtmp, $twres
2621	pxor	$twtmp, $twtmp
2622	movdqa	@XMM[7], @XMM[$i]
2623	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2624	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2625	pand	$twmask, $twres		# isolate carry and residue
2626	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2627	pxor	$twres, @XMM[7]
2628___
2629    $code.=<<___ if ($i>=1);
2630	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2631	cmp	\$`0x10*$i`,$len
2632	je	.Lxts_dec_$i
2633___
2634    $code.=<<___ if ($i>=2);
2635	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2636___
2637    }
2638$code.=<<___;
2639	movdqu	0x60($inp), @XMM[8+6]
2640	pxor	@XMM[8+5], @XMM[5]
2641	movdqa	@XMM[7], 0x70(%rsp)
2642	lea	0x70($inp), $inp
2643	pxor	@XMM[8+6], @XMM[6]
2644	lea	0x80(%rsp), %rax	# pass key schedule
2645	mov	%edx, %r10d		# pass rounds
2646
2647	call	_bsaes_decrypt8
2648
2649	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2650	pxor	0x10(%rsp), @XMM[1]
2651	movdqu	@XMM[0], 0x00($out)	# write output
2652	pxor	0x20(%rsp), @XMM[6]
2653	movdqu	@XMM[1], 0x10($out)
2654	pxor	0x30(%rsp), @XMM[4]
2655	movdqu	@XMM[6], 0x20($out)
2656	pxor	0x40(%rsp), @XMM[2]
2657	movdqu	@XMM[4], 0x30($out)
2658	pxor	0x50(%rsp), @XMM[7]
2659	movdqu	@XMM[2], 0x40($out)
2660	pxor	0x60(%rsp), @XMM[3]
2661	movdqu	@XMM[7], 0x50($out)
2662	movdqu	@XMM[3], 0x60($out)
2663	lea	0x70($out), $out
2664
2665	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
2666	jmp	.Lxts_dec_done
2667.align	16
2668.Lxts_dec_6:
2669	pxor	@XMM[8+4], @XMM[4]
2670	lea	0x60($inp), $inp
2671	pxor	@XMM[8+5], @XMM[5]
2672	lea	0x80(%rsp), %rax	# pass key schedule
2673	mov	%edx, %r10d		# pass rounds
2674
2675	call	_bsaes_decrypt8
2676
2677	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2678	pxor	0x10(%rsp), @XMM[1]
2679	movdqu	@XMM[0], 0x00($out)	# write output
2680	pxor	0x20(%rsp), @XMM[6]
2681	movdqu	@XMM[1], 0x10($out)
2682	pxor	0x30(%rsp), @XMM[4]
2683	movdqu	@XMM[6], 0x20($out)
2684	pxor	0x40(%rsp), @XMM[2]
2685	movdqu	@XMM[4], 0x30($out)
2686	pxor	0x50(%rsp), @XMM[7]
2687	movdqu	@XMM[2], 0x40($out)
2688	movdqu	@XMM[7], 0x50($out)
2689	lea	0x60($out), $out
2690
2691	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
2692	jmp	.Lxts_dec_done
2693.align	16
2694.Lxts_dec_5:
2695	pxor	@XMM[8+3], @XMM[3]
2696	lea	0x50($inp), $inp
2697	pxor	@XMM[8+4], @XMM[4]
2698	lea	0x80(%rsp), %rax	# pass key schedule
2699	mov	%edx, %r10d		# pass rounds
2700
2701	call	_bsaes_decrypt8
2702
2703	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2704	pxor	0x10(%rsp), @XMM[1]
2705	movdqu	@XMM[0], 0x00($out)	# write output
2706	pxor	0x20(%rsp), @XMM[6]
2707	movdqu	@XMM[1], 0x10($out)
2708	pxor	0x30(%rsp), @XMM[4]
2709	movdqu	@XMM[6], 0x20($out)
2710	pxor	0x40(%rsp), @XMM[2]
2711	movdqu	@XMM[4], 0x30($out)
2712	movdqu	@XMM[2], 0x40($out)
2713	lea	0x50($out), $out
2714
2715	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
2716	jmp	.Lxts_dec_done
2717.align	16
2718.Lxts_dec_4:
2719	pxor	@XMM[8+2], @XMM[2]
2720	lea	0x40($inp), $inp
2721	pxor	@XMM[8+3], @XMM[3]
2722	lea	0x80(%rsp), %rax	# pass key schedule
2723	mov	%edx, %r10d		# pass rounds
2724
2725	call	_bsaes_decrypt8
2726
2727	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2728	pxor	0x10(%rsp), @XMM[1]
2729	movdqu	@XMM[0], 0x00($out)	# write output
2730	pxor	0x20(%rsp), @XMM[6]
2731	movdqu	@XMM[1], 0x10($out)
2732	pxor	0x30(%rsp), @XMM[4]
2733	movdqu	@XMM[6], 0x20($out)
2734	movdqu	@XMM[4], 0x30($out)
2735	lea	0x40($out), $out
2736
2737	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
2738	jmp	.Lxts_dec_done
2739.align	16
2740.Lxts_dec_3:
2741	pxor	@XMM[8+1], @XMM[1]
2742	lea	0x30($inp), $inp
2743	pxor	@XMM[8+2], @XMM[2]
2744	lea	0x80(%rsp), %rax	# pass key schedule
2745	mov	%edx, %r10d		# pass rounds
2746
2747	call	_bsaes_decrypt8
2748
2749	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2750	pxor	0x10(%rsp), @XMM[1]
2751	movdqu	@XMM[0], 0x00($out)	# write output
2752	pxor	0x20(%rsp), @XMM[6]
2753	movdqu	@XMM[1], 0x10($out)
2754	movdqu	@XMM[6], 0x20($out)
2755	lea	0x30($out), $out
2756
2757	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
2758	jmp	.Lxts_dec_done
2759.align	16
2760.Lxts_dec_2:
2761	pxor	@XMM[8+0], @XMM[0]
2762	lea	0x20($inp), $inp
2763	pxor	@XMM[8+1], @XMM[1]
2764	lea	0x80(%rsp), %rax	# pass key schedule
2765	mov	%edx, %r10d		# pass rounds
2766
2767	call	_bsaes_decrypt8
2768
2769	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2770	pxor	0x10(%rsp), @XMM[1]
2771	movdqu	@XMM[0], 0x00($out)	# write output
2772	movdqu	@XMM[1], 0x10($out)
2773	lea	0x20($out), $out
2774
2775	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
2776	jmp	.Lxts_dec_done
2777.align	16
2778.Lxts_dec_1:
2779	pxor	@XMM[0], @XMM[8]
2780	lea	0x10($inp), $inp
2781	movdqa	@XMM[8], 0x20(%rbp)
2782	lea	0x20(%rbp), $arg1
2783	lea	0x20(%rbp), $arg2
2784	lea	($key), $arg3
2785	call	asm_AES_decrypt		# doesn't touch %xmm
2786	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
2787	#pxor	@XMM[8], @XMM[0]
2788	#lea	0x80(%rsp), %rax	# pass key schedule
2789	#mov	%edx, %r10d		# pass rounds
2790	#call	_bsaes_decrypt8
2791	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2792	movdqu	@XMM[0], 0x00($out)	# write output
2793	lea	0x10($out), $out
2794
2795	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak
2796
2797.Lxts_dec_done:
2798	and	\$15, %ebx
2799	jz	.Lxts_dec_ret
2800
2801	pxor	$twtmp, $twtmp
2802	movdqa	.Lxts_magic(%rip), $twmask
2803	pcmpgtd	@XMM[7], $twtmp
2804	pshufd	\$0x13, $twtmp, $twres
2805	movdqa	@XMM[7], @XMM[6]
2806	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
2807	pand	$twmask, $twres		# isolate carry and residue
2808	movdqu	($inp), @XMM[0]
2809	pxor	$twres, @XMM[7]
2810
2811	lea	0x20(%rbp), $arg1
2812	pxor	@XMM[7], @XMM[0]
2813	lea	0x20(%rbp), $arg2
2814	movdqa	@XMM[0], 0x20(%rbp)
2815	lea	($key), $arg3
2816	call	asm_AES_decrypt		# doesn't touch %xmm
2817	pxor	0x20(%rbp), @XMM[7]
2818	mov	$out, %rdx
2819	movdqu	@XMM[7], ($out)
2820
2821.Lxts_dec_steal:
2822	movzb	16($inp), %eax
2823	movzb	(%rdx), %ecx
2824	lea	1($inp), $inp
2825	mov	%al, (%rdx)
2826	mov	%cl, 16(%rdx)
2827	lea	1(%rdx), %rdx
2828	sub	\$1,%ebx
2829	jnz	.Lxts_dec_steal
2830
2831	movdqu	($out), @XMM[0]
2832	lea	0x20(%rbp), $arg1
2833	pxor	@XMM[6], @XMM[0]
2834	lea	0x20(%rbp), $arg2
2835	movdqa	@XMM[0], 0x20(%rbp)
2836	lea	($key), $arg3
2837	call	asm_AES_decrypt		# doesn't touch %xmm
2838	pxor	0x20(%rbp), @XMM[6]
2839	movdqu	@XMM[6], ($out)
2840
2841.Lxts_dec_ret:
2842	lea	(%rsp), %rax
2843	pxor	%xmm0, %xmm0
2844.Lxts_dec_bzero:			# wipe key schedule [if any]
2845	movdqa	%xmm0, 0x00(%rax)
2846	movdqa	%xmm0, 0x10(%rax)
2847	lea	0x20(%rax), %rax
2848	cmp	%rax, %rbp
2849	ja	.Lxts_dec_bzero
2850
2851	lea	0x78(%rbp),%rax
2852___
2853$code.=<<___ if ($win64);
2854	movaps	0x40(%rbp), %xmm6
2855	movaps	0x50(%rbp), %xmm7
2856	movaps	0x60(%rbp), %xmm8
2857	movaps	0x70(%rbp), %xmm9
2858	movaps	0x80(%rbp), %xmm10
2859	movaps	0x90(%rbp), %xmm11
2860	movaps	0xa0(%rbp), %xmm12
2861	movaps	0xb0(%rbp), %xmm13
2862	movaps	0xc0(%rbp), %xmm14
2863	movaps	0xd0(%rbp), %xmm15
2864	lea	0xa0(%rax), %rax
2865.Lxts_dec_tail:
2866___
2867$code.=<<___;
2868	mov	-48(%rax), %r15
2869	mov	-40(%rax), %r14
2870	mov	-32(%rax), %r13
2871	mov	-24(%rax), %r12
2872	mov	-16(%rax), %rbx
2873	mov	-8(%rax), %rbp
2874	lea	(%rax), %rsp		# restore %rsp
2875.Lxts_dec_epilogue:
2876	ret
2877.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
2878___
2879}
2880$code.=<<___;
2881.type	_bsaes_const,\@object
2882.align	64
2883_bsaes_const:
2884.LM0ISR:	# InvShiftRows constants
2885	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
2886.LISRM0:
2887	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
2888.LISR:
2889	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
2890.LBS0:		# bit-slice constants
2891	.quad	0x5555555555555555, 0x5555555555555555
2892.LBS1:
2893	.quad	0x3333333333333333, 0x3333333333333333
2894.LBS2:
2895	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
2896.LSR:		# shiftrows constants
2897	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
2898.LSRM0:
2899	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
2900.LM0SR:
2901	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
2902.LSWPUP:	# byte-swap upper dword
2903	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
2904.LSWPUPM0SR:
2905	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
2906.LADD1:		# counter increment constants
2907	.quad	0x0000000000000000, 0x0000000100000000
2908.LADD2:
2909	.quad	0x0000000000000000, 0x0000000200000000
2910.LADD3:
2911	.quad	0x0000000000000000, 0x0000000300000000
2912.LADD4:
2913	.quad	0x0000000000000000, 0x0000000400000000
2914.LADD5:
2915	.quad	0x0000000000000000, 0x0000000500000000
2916.LADD6:
2917	.quad	0x0000000000000000, 0x0000000600000000
2918.LADD7:
2919	.quad	0x0000000000000000, 0x0000000700000000
2920.LADD8:
2921	.quad	0x0000000000000000, 0x0000000800000000
2922.Lxts_magic:
2923	.long	0x87,0,1,0
2924.Lmasks:
2925	.quad	0x0101010101010101, 0x0101010101010101
2926	.quad	0x0202020202020202, 0x0202020202020202
2927	.quad	0x0404040404040404, 0x0404040404040404
2928	.quad	0x0808080808080808, 0x0808080808080808
2929.LM0:
2930	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
2931.L63:
2932	.quad	0x6363636363636363, 0x6363636363636363
2933.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
2934.align	64
2935.size	_bsaes_const,.-_bsaes_const
2936___
2937
2938# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2939#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
2940if ($win64) {
2941$rec="%rcx";
2942$frame="%rdx";
2943$context="%r8";
2944$disp="%r9";
2945
2946$code.=<<___;
2947.extern	__imp_RtlVirtualUnwind
2948.type	se_handler,\@abi-omnipotent
2949.align	16
2950se_handler:
2951	push	%rsi
2952	push	%rdi
2953	push	%rbx
2954	push	%rbp
2955	push	%r12
2956	push	%r13
2957	push	%r14
2958	push	%r15
2959	pushfq
2960	sub	\$64,%rsp
2961
2962	mov	120($context),%rax	# pull context->Rax
2963	mov	248($context),%rbx	# pull context->Rip
2964
2965	mov	8($disp),%rsi		# disp->ImageBase
2966	mov	56($disp),%r11		# disp->HandlerData
2967
2968	mov	0(%r11),%r10d		# HandlerData[0]
2969	lea	(%rsi,%r10),%r10	# prologue label
2970	cmp	%r10,%rbx		# context->Rip<=prologue label
2971	jbe	.Lin_prologue
2972
2973	mov	4(%r11),%r10d		# HandlerData[1]
2974	lea	(%rsi,%r10),%r10	# epilogue label
2975	cmp	%r10,%rbx		# context->Rip>=epilogue label
2976	jae	.Lin_prologue
2977
2978	mov	8(%r11),%r10d		# HandlerData[2]
2979	lea	(%rsi,%r10),%r10	# epilogue label
2980	cmp	%r10,%rbx		# context->Rip>=tail label
2981	jae	.Lin_tail
2982
2983	mov	160($context),%rax	# pull context->Rbp
2984
2985	lea	0x40(%rax),%rsi		# %xmm save area
2986	lea	512($context),%rdi	# &context.Xmm6
2987	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
2988	.long	0xa548f3fc		# cld; rep movsq
2989	lea	0xa0+0x78(%rax),%rax	# adjust stack pointer
2990
2991.Lin_tail:
2992	mov	-48(%rax),%rbp
2993	mov	-40(%rax),%rbx
2994	mov	-32(%rax),%r12
2995	mov	-24(%rax),%r13
2996	mov	-16(%rax),%r14
2997	mov	-8(%rax),%r15
2998	mov	%rbx,144($context)	# restore context->Rbx
2999	mov	%rbp,160($context)	# restore context->Rbp
3000	mov	%r12,216($context)	# restore context->R12
3001	mov	%r13,224($context)	# restore context->R13
3002	mov	%r14,232($context)	# restore context->R14
3003	mov	%r15,240($context)	# restore context->R15
3004
3005.Lin_prologue:
3006	mov	%rax,152($context)	# restore context->Rsp
3007
3008	mov	40($disp),%rdi		# disp->ContextRecord
3009	mov	$context,%rsi		# context
3010	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
3011	.long	0xa548f3fc		# cld; rep movsq
3012
3013	mov	$disp,%rsi
3014	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
3015	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
3016	mov	0(%rsi),%r8		# arg3, disp->ControlPc
3017	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
3018	mov	40(%rsi),%r10		# disp->ContextRecord
3019	lea	56(%rsi),%r11		# &disp->HandlerData
3020	lea	24(%rsi),%r12		# &disp->EstablisherFrame
3021	mov	%r10,32(%rsp)		# arg5
3022	mov	%r11,40(%rsp)		# arg6
3023	mov	%r12,48(%rsp)		# arg7
3024	mov	%rcx,56(%rsp)		# arg8, (NULL)
3025	call	*__imp_RtlVirtualUnwind(%rip)
3026
3027	mov	\$1,%eax		# ExceptionContinueSearch
3028	add	\$64,%rsp
3029	popfq
3030	pop	%r15
3031	pop	%r14
3032	pop	%r13
3033	pop	%r12
3034	pop	%rbp
3035	pop	%rbx
3036	pop	%rdi
3037	pop	%rsi
3038	ret
3039.size	se_handler,.-se_handler
3040
3041.section	.pdata
3042.align	4
3043___
3044$code.=<<___ if ($ecb);
3045	.rva	.Lecb_enc_prologue
3046	.rva	.Lecb_enc_epilogue
3047	.rva	.Lecb_enc_info
3048
3049	.rva	.Lecb_dec_prologue
3050	.rva	.Lecb_dec_epilogue
3051	.rva	.Lecb_dec_info
3052___
3053$code.=<<___;
3054	.rva	.Lcbc_dec_prologue
3055	.rva	.Lcbc_dec_epilogue
3056	.rva	.Lcbc_dec_info
3057
3058	.rva	.Lctr_enc_prologue
3059	.rva	.Lctr_enc_epilogue
3060	.rva	.Lctr_enc_info
3061
3062	.rva	.Lxts_enc_prologue
3063	.rva	.Lxts_enc_epilogue
3064	.rva	.Lxts_enc_info
3065
3066	.rva	.Lxts_dec_prologue
3067	.rva	.Lxts_dec_epilogue
3068	.rva	.Lxts_dec_info
3069
3070.section	.xdata
3071.align	8
3072___
3073$code.=<<___ if ($ecb);
3074.Lecb_enc_info:
3075	.byte	9,0,0,0
3076	.rva	se_handler
3077	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
3078	.rva	.Lecb_enc_tail
3079	.long	0
3080.Lecb_dec_info:
3081	.byte	9,0,0,0
3082	.rva	se_handler
3083	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
3084	.rva	.Lecb_dec_tail
3085	.long	0
3086___
3087$code.=<<___;
3088.Lcbc_dec_info:
3089	.byte	9,0,0,0
3090	.rva	se_handler
3091	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
3092	.rva	.Lcbc_dec_tail
3093	.long	0
3094.Lctr_enc_info:
3095	.byte	9,0,0,0
3096	.rva	se_handler
3097	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
3098	.rva	.Lctr_enc_tail
3099	.long	0
3100.Lxts_enc_info:
3101	.byte	9,0,0,0
3102	.rva	se_handler
3103	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
3104	.rva	.Lxts_enc_tail
3105	.long	0
3106.Lxts_dec_info:
3107	.byte	9,0,0,0
3108	.rva	se_handler
3109	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
3110	.rva	.Lxts_dec_tail
3111	.long	0
3112___
3113}
3114
3115$code =~ s/\`([^\`]*)\`/eval($1)/gem;
3116
3117print $code;
3118
3119close STDOUT;
3120