1#! /usr/bin/env perl
2# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements support for Intel AES-NI extension. In
18# OpenSSL context it's used with Intel engine, but can also be used as
19# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
20# details].
21#
22# Performance.
23#
24# To start with see corresponding paragraph in aesni-x86_64.pl...
25# Instead of filling table similar to one found there I've chosen to
26# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
27# The simplified table below represents 32-bit performance relative
28# to 64-bit one in every given point. Ratios vary for different
29# encryption modes, therefore interval values.
30#
31#	16-byte     64-byte     256-byte    1-KB        8-KB
32#	53-67%      67-84%      91-94%      95-98%      97-99.5%
33#
34# Lower ratios for smaller block sizes are perfectly understandable,
35# because function call overhead is higher in 32-bit mode. Largest
36# 8-KB block performance is virtually same: 32-bit code is less than
37# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
38
39# January 2011
40#
41# See aesni-x86_64.pl for details. Unlike x86_64 version this module
42# interleaves at most 6 aes[enc|dec] instructions, because there are
43# not enough registers for 8x interleave [which should be optimal for
44# Sandy Bridge]. Actually, performance results for 6x interleave
45# factor presented in aesni-x86_64.pl (except for CTR) are for this
46# module.
47
48# April 2011
49#
50# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
51# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
52
53# November 2015
54#
55# Add aesni_ocb_[en|de]crypt. [Removed in BoringSSL]
56
57######################################################################
58# Current large-block performance in cycles per byte processed with
59# 128-bit key (less is better).
60#
61#		CBC en-/decrypt	CTR	XTS	ECB	OCB
62# Westmere	3.77/1.37	1.37	1.52	1.27
63# * Bridge	5.07/0.98	0.99	1.09	0.91	1.10
64# Haswell	4.44/0.80	0.97	1.03	0.72	0.76
65# Skylake	2.68/0.65	0.65	0.66	0.64	0.66
66# Silvermont	5.77/3.56	3.67	4.03	3.46	4.03
67# Goldmont	3.84/1.39	1.39	1.63	1.31	1.70
68# Bulldozer	5.80/0.98	1.05	1.24	0.93	1.23
69
$PREFIX="aes_hw";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-586.pl:-)
$AESNI_PREFIX="aes_hw";
$inline=1;		# inline _aesni_[en|de]crypt

# Make x86asm.pl loadable from the script's own directory or from
# ../../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file; everything
# printed to STDOUT from here on lands in it.  Fail loudly if the file
# cannot be created instead of silently generating nothing.
$output = pop;
open OUT,">",$output or die "can't open $output: $!";
*STDOUT=*OUT;

&asm_init($ARGV[0]);

&external_label("OPENSSL_ia32cap_P");
# Each directive is its own statement.  (Without the terminating
# semicolon the first two calls were parsed as one bitwise-and
# expression and only happened to run in the right order.)
&preprocessor_ifndef("NDEBUG");
&external_label("BORINGSSL_function_hit");
&preprocessor_endif();
&static_label("key_const");

# Instruction used for every round-key load.  It used to be selected by
# comparing $PREFIX with $AESNI_PREFIX, but both outcomes were movups.
$movekey=\&movups;
94
# General-purpose register roles used by the routines in this file.
($len,$rounds,$key)=("eax","ecx","edx");
($inp,$out)=("esi","edi");
($rounds_,$key_)=("ebx","ebp");	# backup copies for $rounds and $key

# XMM register roles.  xmm5-xmm7 go by two names each: $in1, $in0 and
# $ivec denote the very same registers as $inout3, $inout4 and $inout5.
($rndkey0,$rndkey1)=("xmm0","xmm1");
($inout0,$inout1,$inout2)=("xmm2","xmm3","xmm4");
($inout3,$inout4,$inout5)=("xmm5","xmm6","xmm7");
($in1,$in0,$ivec)=($inout3,$inout4,$inout5);
111
# AESNI extension.  The instructions are emitted as raw opcode bytes
# through &data_byte; only the register-to-register form on xmm0-xmm7
# is encoded, any other operand combination emits nothing.

# aeskeygenassist dst,src,imm: 66 0f 3a df /r ib
sub aeskeygenassist
{ my ($dst,$src,$imm)=@_;
  my $operands="$dst:$src";

    if ($operands =~ /xmm([0-7]):xmm([0-7])/)
    {	my $modrm=0xc0|($1<<3)|$2;	# dst in bits 5:3, src in bits 2:0
	&data_byte(0x66,0x0f,0x3a,0xdf,$modrm,$imm);
    }
}

# Shared encoder for the two-operand instructions: 66 0f 38 <opcodelet> /r
sub aescommon
{ my ($opcodelet,$dst,$src)=@_;
  my $operands="$dst:$src";

    if ($operands =~ /xmm([0-7]):xmm([0-7])/)
    {	my $modrm=0xc0|($1<<3)|$2;	# dst in bits 5:3, src in bits 2:0
	&data_byte(0x66,0x0f,0x38,$opcodelet,$modrm);
    }
}

# One thin wrapper per instruction; each takes (dst,src).
sub aesimc	{ return aescommon(0xdb,@_); }
sub aesenc	{ return aescommon(0xdc,@_); }
sub aesenclast	{ return aescommon(0xdd,@_); }
sub aesdec	{ return aescommon(0xde,@_); }
sub aesdeclast	{ return aescommon(0xdf,@_); }
128
# Inline version of internal aesni_[en|de]crypt1.
#
#   $p      "enc" or "dec"; selects aes${p} and aes${p}last
#   $inout  register holding the block, $inout0 when omitted
#   $ivec   optional register; when given, round key 0 is xored into it
#           and the result is xored into $inout (instead of xoring
#           round key 0 into $inout directly)
#
# The emitted code walks $key forward through the key schedule and counts
# $rounds down to zero.  Every invocation gets its own loop label, made
# unique by the private counter $sn.
{ my $sn;
sub aesni_inline_generate1
{ my ($p,$inout,$ivec)=@_;
  my ($round,$final)=(\&{"aes${p}"},\&{"aes${p}last"});

    $inout=$inout0 unless (defined($inout));
    my $loop="${p}1_loop_".(++$sn);

    &$movekey		($rndkey0,&QWP(0,$key));
    &$movekey		($rndkey1,&QWP(16,$key));
    if (defined($ivec))
    {	&xorps		($ivec,$rndkey0);
	&lea		($key,&DWP(32,$key));
	&xorps		($inout,$ivec);
    }
    else
    {	&lea		($key,&DWP(32,$key));
	&xorps		($inout,$rndkey0);
    }
    &set_label($loop);
	&$round		($inout,$rndkey1);
	&dec		($rounds);
	&$movekey	($rndkey1,&QWP(0,$key));
	&lea		($key,&DWP(16,$key));
    &jnz		(&label($loop));
    &$final		($inout,$rndkey1);
}}
149
# Emits _aesni_${p}rypt1, the fully unrolled single-block routine.
#
#   $p      "enc" or "dec"; selects aes${p} and aes${p}last
#   $inout  register holding the block, $inout0 when omitted
#
# The emitted code compares $rounds with 11 to pick its entry point:
# below 11 it jumps to the "${p}128" label, at 11 to "${p}192", and
# otherwise it falls through the four extra rounds at the top.  $key is
# advanced so that the common tail always addresses the last nine round
# keys with displacements 0..0x70.
sub aesni_generate1	# fully unrolled loop
{ my ($p,$inout)=@_;
  my ($round,$final)=(\&{"aes${p}"},\&{"aes${p}last"});
  my @rndkey=($rndkey1,$rndkey0);	# round-key registers are used alternately

    $inout=$inout0 unless (defined($inout));

    # one round with the given key register, then refill that register
    my $round_and_reload=sub
    { my ($rk,$disp)=@_;
	&$round		($inout,$rk);
	&$movekey	($rk,&QWP($disp,$key));
    };

    &function_begin_B("_aesni_${p}rypt1");
	&movups		($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(0x10,$key));
	&xorps		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x20,$key));
	&lea		($key,&DWP(0x30,$key));
	&cmp		($rounds,11);
	&jb		(&label("${p}128"));
	&lea		($key,&DWP(0x20,$key));
	&je		(&label("${p}192"));
	&lea		($key,&DWP(0x20,$key));
	&$round_and_reload($rndkey1,-0x40);
	&$round_and_reload($rndkey0,-0x30);
    &set_label("${p}192");
	&$round_and_reload($rndkey1,-0x20);
	&$round_and_reload($rndkey0,-0x10);
    &set_label("${p}128");
	foreach my $i (0..7)
	{   &$round_and_reload($rndkey[$i&1],0x10*$i);	}
	&$round		($inout,$rndkey1);
    &$final		($inout,$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt1");
}
195
# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
#
# Emits the single-block encryption entry point: one 16-byte block is
# loaded from inp, run through the schedule in key (round count read
# from offset 240 of the key structure) and stored to out.  xmm0-xmm2
# are zeroed before returning.
&aesni_generate1("enc") unless ($inline);
&function_begin_B("${PREFIX}_encrypt");
{ my $ptr="eax";	# holds inp for the load, then out for the store

	&record_function_hit(1);

	&mov	($ptr,&wparam(0));
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,$ptr));
	&mov	($rounds,&DWP(240,$key));
	&mov	($ptr,&wparam(1));
	if ($inline)	{ &aesni_inline_generate1("enc"); }
	else		{ &call("_aesni_encrypt1"); }
	# clear register bank
	foreach my $scratch ($rndkey0,$rndkey1)
	{   &pxor	($scratch,$scratch);	}
	&movups	(&QWP(0,$ptr),$inout0);
	&pxor	($inout0,$inout0);
	&ret	();
}
&function_end_B("${PREFIX}_encrypt");
216
# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Emits the single-block decryption entry point: one 16-byte block is
# loaded from inp, run through the schedule in key (round count read
# from offset 240 of the key structure) and stored to out.  xmm0-xmm2
# are zeroed before returning.
&aesni_generate1("dec") unless ($inline);
&function_begin_B("${PREFIX}_decrypt");
{ my $ptr="eax";	# holds inp for the load, then out for the store

	&mov	($ptr,&wparam(0));
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,$ptr));
	&mov	($rounds,&DWP(240,$key));
	&mov	($ptr,&wparam(1));
	if ($inline)	{ &aesni_inline_generate1("dec"); }
	else		{ &call("_aesni_decrypt1"); }
	# clear register bank
	foreach my $scratch ($rndkey0,$rndkey1)
	{   &pxor	($scratch,$scratch);	}
	&movups	(&QWP(0,$ptr),$inout0);
	&pxor	($inout0,$inout0);
	&ret	();
}
&function_end_B("${PREFIX}_decrypt");
235
# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why were 3x subroutines originally used in loops? Even though
# aes[enc|dec] latency was originally 6, the instruction could be
# scheduled only every *2nd* cycle. Thus 3x interleave was the one
# providing optimal utilization, i.e. the subroutine's throughput was
# virtually the same as that of a non-interleaved subroutine [for a
# number of input blocks up to 3]. This is why it originally made no
# sense to implement a 2x subroutine. But times change and it became
# appropriate to spend an extra 192 bytes on a 2x subroutine for the
# sake of Atom Silvermont. For processors that can schedule
# aes[enc|dec] every cycle the optimal interleave factor equals the
# corresponding instruction's latency. 8x is optimal for * Bridge, but
# it's infeasible to accommodate such an implementation in the XMM
# registers addressable in 32-bit mode and therefore a maximum of 6x
# is used instead...
250
# Emits _aesni_${p}rypt2, which runs $inout0 and $inout1 through the
# cipher side by side.  $p is "enc" or "dec".
#
# The emitted code scales $rounds by 16, points $key past the schedule
# and negates $rounds, so that $rounds serves as a negative byte offset
# that the loop steps towards zero, two round keys per iteration.
sub aesni_generate2
{ my $p=shift;
  my ($round,$final)=(\&{"aes${p}"},\&{"aes${p}last"});
  my @blocks=($inout0,$inout1);

    # apply one instruction with the same round key to every block, in order
    my $all=sub
    { my ($insn,$rk)=@_;
	foreach my $blk (@blocks) { &$insn($blk,$rk); }
    };

    &function_begin_B("_aesni_${p}rypt2");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label("${p}2_loop");
	&$all		($round,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&$all		($round,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("${p}2_loop"));
    &$all		($round,$rndkey1);
    &$all		($final,$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt2");
}
281
# Emits _aesni_${p}rypt3, which runs $inout0..$inout2 through the cipher
# side by side.  $p is "enc" or "dec".
#
# As in the other interleaved routines, the emitted code turns $rounds
# into a negative byte offset relative to the end of the key schedule
# and steps it towards zero, two round keys per loop iteration.
sub aesni_generate3
{ my $p=shift;
  my ($round,$final)=(\&{"aes${p}"},\&{"aes${p}last"});
  my @blocks=($inout0,$inout1,$inout2);

    # apply one instruction with the same round key to every block, in order
    my $all=sub
    { my ($insn,$rk)=@_;
	foreach my $blk (@blocks) { &$insn($blk,$rk); }
    };

    &function_begin_B("_aesni_${p}rypt3");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	foreach my $blk (@blocks[1..2]) { &pxor($blk,$rndkey0); }
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label("${p}3_loop");
	&$all		($round,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&$all		($round,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("${p}3_loop"));
    &$all		($round,$rndkey1);
    &$all		($final,$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt3");
}
317
# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
#
# Emits _aesni_${p}rypt4, which runs $inout0..$inout3 through the cipher
# side by side.  $p is "enc" or "dec".  $rounds is turned into a negative
# byte offset relative to the end of the key schedule and stepped towards
# zero, two round keys per loop iteration.
sub aesni_generate4
{ my $p=shift;
  my ($round,$final)=(\&{"aes${p}"},\&{"aes${p}last"});
  my @blocks=($inout0,$inout1,$inout2,$inout3);

    # apply one instruction with the same round key to every block, in order
    my $all=sub
    { my ($insn,$rk)=@_;
	foreach my $blk (@blocks) { &$insn($blk,$rk); }
    };

    &function_begin_B("_aesni_${p}rypt4");
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(16,$key));
	&shl		($rounds,4);
	&xorps		($inout0,$rndkey0);
	foreach my $blk (@blocks[1..3]) { &pxor($blk,$rndkey0); }
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&data_byte	(0x0f,0x1f,0x40,0x00);	# raw bytes of a 4-byte nop
	&add		($rounds,16);

    &set_label("${p}4_loop");
	&$all		($round,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&$all		($round,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label("${p}4_loop"));

    &$all		($round,$rndkey1);
    &$all		($final,$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt4");
}
364
# Emits _aesni_${p}rypt6, which runs $inout0..$inout5 through the cipher
# side by side.  $p is "enc" or "dec".
#
# The first round for $inout0..$inout2 is interleaved with the
# whitening of the remaining blocks, after which control jumps into the
# middle of the loop body at the "_inner" label.  A second mid-loop
# label, "_aesni_${p}rypt6_enter", is declared with &static_label so it
# can be referenced from outside this routine.
sub aesni_generate6
{ my $p=shift;
  my ($round,$final)=(\&{"aes${p}"},\&{"aes${p}last"});
  my @blocks=($inout0,$inout1,$inout2,$inout3,$inout4,$inout5);

    # apply one instruction with the same round key to the given blocks
    my $each=sub
    { my ($insn,$rk,@regs)=@_;
	foreach my $blk (@regs) { &$insn($blk,$rk); }
    };

    &function_begin_B("_aesni_${p}rypt6");
    &static_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);	# pxor does better here
	&pxor		($inout2,$rndkey0);
	&$round		($inout0,$rndkey1);
	&pxor		($inout3,$rndkey0);
	&pxor		($inout4,$rndkey0);
	&$round		($inout1,$rndkey1);
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&$round		($inout2,$rndkey1);
	&pxor		($inout5,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key,$rounds));
	&add		($rounds,16);
	&jmp		(&label("_aesni_${p}rypt6_inner"));

    &set_label("${p}6_loop",16);
	&$each		($round,$rndkey1,@blocks[0..2]);
    &set_label("_aesni_${p}rypt6_inner");
	&$each		($round,$rndkey1,@blocks[3..5]);
    &set_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&$each		($round,$rndkey0,@blocks);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label("${p}6_loop"));

    &$each		($round,$rndkey1,@blocks);
    &$each		($final,$rndkey0,@blocks);
    &ret();
    &function_end_B("_aesni_${p}rypt6");
}
# Emit the interleaved subroutines in order of interleave factor.  The
# decrypt flavour is always generated; the encrypt flavour only when
# $PREFIX equals $AESNI_PREFIX.
foreach my $generate (\&aesni_generate2,\&aesni_generate3,
		      \&aesni_generate4,\&aesni_generate6)
{   &$generate("enc") if ($PREFIX eq $AESNI_PREFIX);
    &$generate("dec");
}
431
432if ($PREFIX eq $AESNI_PREFIX) {
433######################################################################
434# void aes_hw_ecb_encrypt (const void *in, void *out,
435#                         size_t length, const AES_KEY *key,
436#                         int enc);
&function_begin("${PREFIX}_ecb_encrypt");
{ my @blk=($inout0,$inout1,$inout2,$inout3,$inout4,$inout5);

  # store the first $n block registers to consecutive 16-byte slots at $out
  my $store=sub
  { my $n=shift;
    foreach my $i (0..$n-1) { &movups(&QWP(0x10*$i,$out),$blk[$i]); }
  };

	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));	# enc flag, tested below
	&and	($len,-16);		# whole blocks only
	&jz	(&label("ecb_ret"));
	&mov	($rounds,&DWP(240,$key));
	&test	($rounds_,$rounds_);
	&jz	(&label("ecb_decrypt"));

  # The encrypt and decrypt paths are mirror images: same structure,
  # "enc"/"dec" in every label and in the name of the called subroutine.
  foreach my $dir ("enc","dec")
  { my $crypt="_aesni_${dir}rypt";	# prefix of the N-block subroutines
    my $lbl="ecb_${dir}";		# prefix of this path's labels
    my %name_of=(2=>"two",3=>"three",4=>"four");

    ##################################################################
	&set_label("ecb_decrypt",16)	if ($dir eq "dec");
	&mov	($key_,$key);		# backup $key
	&mov	($rounds_,$rounds);	# backup $rounds
	&cmp	($len,0x60);
	&jb	(&label("${lbl}_tail"));

	foreach my $i (0..5) { &movdqu($blk[$i],&QWP(0x10*$i,$inp)); }
	&lea	($inp,&DWP(0x60,$inp));
	&sub	($len,0x60);
	&jmp	(&label("${lbl}_loop6_enter"));

    # main loop: store the previous six results while loading the next six
    &set_label("${lbl}_loop6",16);
	foreach my $i (0..4)
	{   &movups	(&QWP(0x10*$i,$out),$blk[$i]);
	    &movdqu	($blk[$i],&QWP(0x10*$i,$inp));
	}
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
    &set_label("${lbl}_loop6_enter");

	&call	("${crypt}6");

	&mov	($key,$key_);		# restore $key
	&mov	($rounds,$rounds_);	# restore $rounds
	&sub	($len,0x60);
	&jnc	(&label("${lbl}_loop6"));

	&$store	(6);
	&lea	($out,&DWP(0x60,$out));
	&add	($len,0x60);
	&jz	(&label("ecb_ret"));

    # one to five blocks left: load as many as needed and dispatch
    &set_label("${lbl}_tail");
	&movups	($inout0,&QWP(0,$inp));
	&cmp	($len,0x20);
	&jb	(&label("${lbl}_one"));
	&movups	($inout1,&QWP(0x10,$inp));
	&je	(&label("${lbl}_two"));
	&movups	($inout2,&QWP(0x20,$inp));
	&cmp	($len,0x40);
	&jb	(&label("${lbl}_three"));
	&movups	($inout3,&QWP(0x30,$inp));
	&je	(&label("${lbl}_four"));
	&movups	($inout4,&QWP(0x40,$inp));
	&xorps	($inout5,$inout5);
	&call	("${crypt}6");
	&$store	(5);
	&jmp	(&label("ecb_ret"));

    &set_label("${lbl}_one",16);
	if ($inline)	{ &aesni_inline_generate1($dir); }
	else		{ &call("${crypt}1"); }
	&$store	(1);
	&jmp	(&label("ecb_ret"));

    foreach my $n (2..4)
    {	&set_label("${lbl}_$name_of{$n}",16);
	&call	("${crypt}$n");
	&$store	($n);
	# the very last tail (decrypt, four blocks) falls through to ecb_ret
	&jmp	(&label("ecb_ret"))	unless ($dir eq "dec" && $n==4);
    }
  }

&set_label("ecb_ret");
	# clear register bank
	foreach my $n (0..7) { &pxor("xmm$n","xmm$n"); }
}
&function_end("${PREFIX}_ecb_encrypt");
657
658######################################################################
659# void aes_hw_ccm64_[en|de]crypt_blocks (const void *in, void *out,
660#                         size_t blocks, const AES_KEY *key,
661#                         const char *ivec,char *cmac);
662#
663# Handles only complete blocks, operates on 64-bit counter and
664# does not update *ivec! Nor does it finalize CMAC value
665# (see engine/eng_aesni.c for details)
666#
{ my $cmac=$inout1;

  # Prologue shared by both directions.  Loads the six arguments, carves
  # a 16-byte aligned scratch area out of the stack and fills it:
  #	0(esp)	byte-swap mask for pshufb
  #	16(esp)	counter increment vector (1,0,0,0)
  #	48(esp)	caller's %esp
  # On exit $ivec and $cmac hold *ivec and *cmac and $rounds holds the
  # round count read from offset 240 of the key structure.
  my $prologue=sub
  {	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&mov	($rounds,&wparam(5));
	&mov	($key_,"esp");
	&sub	("esp",60);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(48,"esp"),$key_);

	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
	&mov	($rounds,&DWP(240,$key));

	# compose byte-swap control mask for pshufb on stack
	my @mask=(0x0c0d0e0f,0x08090a0b,0x04050607,0x00010203);
	foreach my $i (0..3) { &mov(&DWP(4*$i,"esp"),$mask[$i]); }

	# compose counter increment vector on stack
	&mov	($rounds_,1);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds_);
	foreach my $disp (20,24,28) { &mov(&DWP($disp,"esp"),$key_); }
  };

  # Epilogue shared by both directions: restore %esp, write the running
  # CMAC back through the sixth argument and clear the register bank.
  my $epilogue=sub
  {	&mov	("esp",&DWP(48,"esp"));
	&mov	($out,&wparam(5));
	&movups	(&QWP(0,$out),$cmac);

	foreach my $n (0..7) { &pxor("xmm$n","xmm$n"); }
  };

  # one instruction applied to the counter block and then to the CMAC
  my $both=sub
  { my ($insn,$rk)=@_;
	&$insn	($inout0,$rk);
	&$insn	($cmac,$rk);
  };

&function_begin("${PREFIX}_ccm64_encrypt_blocks");
	&$prologue();

	&shl	($rounds,4);
	&mov	($rounds_,16);
	&lea	($key_,&DWP(0,$key));
	&movdqa	($inout3,&QWP(0,"esp"));
	&movdqa	($inout0,$ivec);
	&lea	($key,&DWP(32,$key,$rounds));
	&sub	($rounds_,$rounds);
	&pshufb	($ivec,$inout3);

&set_label("ccm64_enc_outer");
	&$movekey	($rndkey0,&QWP(0,$key_));
	&mov		($rounds,$rounds_);
	&movups		($in0,&QWP(0,$inp));

	&xorps		($inout0,$rndkey0);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps		($rndkey0,$in0);
	&xorps		($cmac,$rndkey0);		# cmac^=inp
	&$movekey	($rndkey0,&QWP(32,$key_));

&set_label("ccm64_enc2_loop");
	&$both		(\&aesenc,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&$both		(\&aesenc,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("ccm64_enc2_loop"));
	&$both		(\&aesenc,$rndkey1);
	&paddq		($ivec,&QWP(16,"esp"));
	&dec		($len);
	&$both		(\&aesenclast,$rndkey0);

	&lea	($inp,&DWP(16,$inp));
	&xorps	($in0,$inout0);			# inp^=E(ivec)
	&movdqa	($inout0,$ivec);
	&movups	(&QWP(0,$out),$in0);		# save output
	&pshufb	($inout0,$inout3);
	&lea	($out,&DWP(16,$out));
	&jnz	(&label("ccm64_enc_outer"));

	&$epilogue();
&function_end("${PREFIX}_ccm64_encrypt_blocks");

&function_begin("${PREFIX}_ccm64_decrypt_blocks");
	&$prologue();

	&movdqa	($inout3,&QWP(0,"esp"));	# bswap mask
	&movdqa	($inout0,$ivec);

	&mov	($key_,$key);
	&mov	($rounds_,$rounds);

	&pshufb	($ivec,$inout3);
	if ($inline)	{ &aesni_inline_generate1("enc"); }
	else		{ &call("_aesni_encrypt1"); }
	&shl	($rounds_,4);
	&mov	($rounds,16);
	&movups	($in0,&QWP(0,$inp));		# load inp
	&paddq	($ivec,&QWP(16,"esp"));
	&lea	($inp,&QWP(16,$inp));
	&sub	($rounds,$rounds_);
	&lea	($key,&DWP(32,$key_,$rounds_));
	&mov	($rounds_,$rounds);
	&jmp	(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_outer",16);
	&xorps	($in0,$inout0);			# inp ^= E(ivec)
	&movdqa	($inout0,$ivec);
	&movups	(&QWP(0,$out),$in0);		# save output
	&lea	($out,&DWP(16,$out));
	&pshufb	($inout0,$inout3);

	&sub	($len,1);
	&jz	(&label("ccm64_dec_break"));

	&$movekey	($rndkey0,&QWP(0,$key_));
	&mov		($rounds,$rounds_);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps		($in0,$rndkey0);
	&xorps		($inout0,$rndkey0);
	&xorps		($cmac,$in0);		# cmac^=out
	&$movekey	($rndkey0,&QWP(32,$key_));

&set_label("ccm64_dec2_loop");
	&$both		(\&aesenc,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&$both		(\&aesenc,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("ccm64_dec2_loop"));
	&movups		($in0,&QWP(0,$inp));	# load inp
	&paddq		($ivec,&QWP(16,"esp"));
	&$both		(\&aesenc,$rndkey1);
	&$both		(\&aesenclast,$rndkey0);
	&lea		($inp,&QWP(16,$inp));
	&jmp	(&label("ccm64_dec_outer"));

	# last block: only the CMAC is still pending, fold the final
	# output block into it with a single-block encryption
&set_label("ccm64_dec_break",16);
	&mov	($rounds,&DWP(240,$key_));
	&mov	($key,$key_);
	if ($inline)	{ &aesni_inline_generate1("enc",$cmac,$in0); }
	else		{ &call("_aesni_encrypt1",$cmac); }

	&$epilogue();
&function_end("${PREFIX}_ccm64_decrypt_blocks");
}
865
866######################################################################
867# void aes_hw_ctr32_encrypt_blocks (const void *in, void *out,
868#                         size_t blocks, const AES_KEY *key,
869#                         const char *ivec);
870#
871# Handles only complete blocks, operates on 32-bit counter and
872# does not update *ivec! (see crypto/modes/ctr128.c for details)
873#
874# stack layout:
875#	0	pshufb mask
876#	16	vector addend: 0,6,6,6
877# 	32	counter-less ivec
878#	48	1st triplet of counter vector
879#	64	2nd triplet of counter vector
880#	80	saved %esp
881
882&function_begin("${PREFIX}_ctr32_encrypt_blocks");
883	&record_function_hit(0);
884
885	&mov	($inp,&wparam(0));
886	&mov	($out,&wparam(1));
887	&mov	($len,&wparam(2));
888	&mov	($key,&wparam(3));
889	&mov	($rounds_,&wparam(4));
890	&mov	($key_,"esp");
891	&sub	("esp",88);
892	&and	("esp",-16);			# align stack
893	&mov	(&DWP(80,"esp"),$key_);
894
895	&cmp	($len,1);
896	&je	(&label("ctr32_one_shortcut"));
897
898	&movdqu	($inout5,&QWP(0,$rounds_));	# load ivec
899
900	# compose byte-swap control mask for pshufb on stack
901	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
902	&mov	(&DWP(4,"esp"),0x08090a0b);
903	&mov	(&DWP(8,"esp"),0x04050607);
904	&mov	(&DWP(12,"esp"),0x00010203);
905
906	# compose counter increment vector on stack
907	&mov	($rounds,6);
908	&xor	($key_,$key_);
909	&mov	(&DWP(16,"esp"),$rounds);
910	&mov	(&DWP(20,"esp"),$rounds);
911	&mov	(&DWP(24,"esp"),$rounds);
912	&mov	(&DWP(28,"esp"),$key_);
913
914	&pextrd	($rounds_,$inout5,3);		# pull 32-bit counter
915	&pinsrd	($inout5,$key_,3);		# wipe 32-bit counter
916
917	&mov	($rounds,&DWP(240,$key));	# key->rounds
918
919	# compose 2 vectors of 3x32-bit counters
920	&bswap	($rounds_);
921	&pxor	($rndkey0,$rndkey0);
922	&pxor	($rndkey1,$rndkey1);
923	&movdqa	($inout0,&QWP(0,"esp"));	# load byte-swap mask
924	&pinsrd	($rndkey0,$rounds_,0);
925	&lea	($key_,&DWP(3,$rounds_));
926	&pinsrd	($rndkey1,$key_,0);
927	&inc	($rounds_);
928	&pinsrd	($rndkey0,$rounds_,1);
929	&inc	($key_);
930	&pinsrd	($rndkey1,$key_,1);
931	&inc	($rounds_);
932	&pinsrd	($rndkey0,$rounds_,2);
933	&inc	($key_);
934	&pinsrd	($rndkey1,$key_,2);
935	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
936	&pshufb	($rndkey0,$inout0);		# byte swap
937	&movdqu	($inout4,&QWP(0,$key));		# key[0]
938	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
939	&pshufb	($rndkey1,$inout0);		# byte swap
940
941	&pshufd	($inout0,$rndkey0,3<<6);	# place counter to upper dword
942	&pshufd	($inout1,$rndkey0,2<<6);
943	&cmp	($len,6);
944	&jb	(&label("ctr32_tail"));
945	&pxor	($inout5,$inout4);		# counter-less ivec^key[0]
946	&shl	($rounds,4);
947	&mov	($rounds_,16);
948	&movdqa	(&QWP(32,"esp"),$inout5);	# save counter-less ivec^key[0]
949	&mov	($key_,$key);			# backup $key
950	&sub	($rounds_,$rounds);		# backup twisted $rounds
951	&lea	($key,&DWP(32,$key,$rounds));
952	&sub	($len,6);
953	&jmp	(&label("ctr32_loop6"));
954
955&set_label("ctr32_loop6",16);
956	# inlining _aesni_encrypt6's prologue gives ~6% improvement...
957	&pshufd	($inout2,$rndkey0,1<<6);
958	&movdqa	($rndkey0,&QWP(32,"esp"));	# pull counter-less ivec
959	&pshufd	($inout3,$rndkey1,3<<6);
960	&pxor		($inout0,$rndkey0);	# merge counter-less ivec
961	&pshufd	($inout4,$rndkey1,2<<6);
962	&pxor		($inout1,$rndkey0);
963	&pshufd	($inout5,$rndkey1,1<<6);
964	&$movekey	($rndkey1,&QWP(16,$key_));
965	&pxor		($inout2,$rndkey0);
966	&pxor		($inout3,$rndkey0);
967	&aesenc		($inout0,$rndkey1);
968	&pxor		($inout4,$rndkey0);
969	&pxor		($inout5,$rndkey0);
970	&aesenc		($inout1,$rndkey1);
971	&$movekey	($rndkey0,&QWP(32,$key_));
972	&mov		($rounds,$rounds_);
973	&aesenc		($inout2,$rndkey1);
974	&aesenc		($inout3,$rndkey1);
975	&aesenc		($inout4,$rndkey1);
976	&aesenc		($inout5,$rndkey1);
977
978	&call		(&label("_aesni_encrypt6_enter"));
979
980	&movups	($rndkey1,&QWP(0,$inp));
981	&movups	($rndkey0,&QWP(0x10,$inp));
982	&xorps	($inout0,$rndkey1);
983	&movups	($rndkey1,&QWP(0x20,$inp));
984	&xorps	($inout1,$rndkey0);
985	&movups	(&QWP(0,$out),$inout0);
986	&movdqa	($rndkey0,&QWP(16,"esp"));	# load increment
987	&xorps	($inout2,$rndkey1);
988	&movdqa	($rndkey1,&QWP(64,"esp"));	# load 2nd triplet
989	&movups	(&QWP(0x10,$out),$inout1);
990	&movups	(&QWP(0x20,$out),$inout2);
991
992	&paddd	($rndkey1,$rndkey0);		# 2nd triplet increment
993	&paddd	($rndkey0,&QWP(48,"esp"));	# 1st triplet increment
994	&movdqa	($inout0,&QWP(0,"esp"));	# load byte swap mask
995
996	&movups	($inout1,&QWP(0x30,$inp));
997	&movups	($inout2,&QWP(0x40,$inp));
998	&xorps	($inout3,$inout1);
999	&movups	($inout1,&QWP(0x50,$inp));
1000	&lea	($inp,&DWP(0x60,$inp));
1001	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
1002	&pshufb	($rndkey0,$inout0);		# byte swap
1003	&xorps	($inout4,$inout2);
1004	&movups	(&QWP(0x30,$out),$inout3);
1005	&xorps	($inout5,$inout1);
1006	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
1007	&pshufb	($rndkey1,$inout0);		# byte swap
1008	&movups	(&QWP(0x40,$out),$inout4);
1009	&pshufd	($inout0,$rndkey0,3<<6);
1010	&movups	(&QWP(0x50,$out),$inout5);
1011	&lea	($out,&DWP(0x60,$out));
1012
1013	&pshufd	($inout1,$rndkey0,2<<6);
1014	&sub	($len,6);
1015	&jnc	(&label("ctr32_loop6"));
1016
1017	&add	($len,6);
1018	&jz	(&label("ctr32_ret"));
1019	&movdqu	($inout5,&QWP(0,$key_));
1020	&mov	($key,$key_);
1021	&pxor	($inout5,&QWP(32,"esp"));	# restore count-less ivec
1022	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
1023
1024&set_label("ctr32_tail");
1025	&por	($inout0,$inout5);
1026	&cmp	($len,2);
1027	&jb	(&label("ctr32_one"));
1028
1029	&pshufd	($inout2,$rndkey0,1<<6);
1030	&por	($inout1,$inout5);
1031	&je	(&label("ctr32_two"));
1032
1033	&pshufd	($inout3,$rndkey1,3<<6);
1034	&por	($inout2,$inout5);
1035	&cmp	($len,4);
1036	&jb	(&label("ctr32_three"));
1037
1038	&pshufd	($inout4,$rndkey1,2<<6);
1039	&por	($inout3,$inout5);
1040	&je	(&label("ctr32_four"));
1041
1042	&por	($inout4,$inout5);
1043	&call	("_aesni_encrypt6");
1044	&movups	($rndkey1,&QWP(0,$inp));
1045	&movups	($rndkey0,&QWP(0x10,$inp));
1046	&xorps	($inout0,$rndkey1);
1047	&movups	($rndkey1,&QWP(0x20,$inp));
1048	&xorps	($inout1,$rndkey0);
1049	&movups	($rndkey0,&QWP(0x30,$inp));
1050	&xorps	($inout2,$rndkey1);
1051	&movups	($rndkey1,&QWP(0x40,$inp));
1052	&xorps	($inout3,$rndkey0);
1053	&movups	(&QWP(0,$out),$inout0);
1054	&xorps	($inout4,$rndkey1);
1055	&movups	(&QWP(0x10,$out),$inout1);
1056	&movups	(&QWP(0x20,$out),$inout2);
1057	&movups	(&QWP(0x30,$out),$inout3);
1058	&movups	(&QWP(0x40,$out),$inout4);
1059	&jmp	(&label("ctr32_ret"));
1060
1061&set_label("ctr32_one_shortcut",16);
1062	&movups	($inout0,&QWP(0,$rounds_));	# load ivec
1063	&mov	($rounds,&DWP(240,$key));
1064
1065&set_label("ctr32_one");
1066	if ($inline)
1067	{   &aesni_inline_generate1("enc");	}
1068	else
1069	{   &call	("_aesni_encrypt1");	}
1070	&movups	($in0,&QWP(0,$inp));
1071	&xorps	($in0,$inout0);
1072	&movups	(&QWP(0,$out),$in0);
1073	&jmp	(&label("ctr32_ret"));
1074
1075&set_label("ctr32_two",16);
1076	&call	("_aesni_encrypt2");
1077	&movups	($inout3,&QWP(0,$inp));
1078	&movups	($inout4,&QWP(0x10,$inp));
1079	&xorps	($inout0,$inout3);
1080	&xorps	($inout1,$inout4);
1081	&movups	(&QWP(0,$out),$inout0);
1082	&movups	(&QWP(0x10,$out),$inout1);
1083	&jmp	(&label("ctr32_ret"));
1084
1085&set_label("ctr32_three",16);
1086	&call	("_aesni_encrypt3");
1087	&movups	($inout3,&QWP(0,$inp));
1088	&movups	($inout4,&QWP(0x10,$inp));
1089	&xorps	($inout0,$inout3);
1090	&movups	($inout5,&QWP(0x20,$inp));
1091	&xorps	($inout1,$inout4);
1092	&movups	(&QWP(0,$out),$inout0);
1093	&xorps	($inout2,$inout5);
1094	&movups	(&QWP(0x10,$out),$inout1);
1095	&movups	(&QWP(0x20,$out),$inout2);
1096	&jmp	(&label("ctr32_ret"));
1097
1098&set_label("ctr32_four",16);
1099	&call	("_aesni_encrypt4");
1100	&movups	($inout4,&QWP(0,$inp));
1101	&movups	($inout5,&QWP(0x10,$inp));
1102	&movups	($rndkey1,&QWP(0x20,$inp));
1103	&xorps	($inout0,$inout4);
1104	&movups	($rndkey0,&QWP(0x30,$inp));
1105	&xorps	($inout1,$inout5);
1106	&movups	(&QWP(0,$out),$inout0);
1107	&xorps	($inout2,$rndkey1);
1108	&movups	(&QWP(0x10,$out),$inout1);
1109	&xorps	($inout3,$rndkey0);
1110	&movups	(&QWP(0x20,$out),$inout2);
1111	&movups	(&QWP(0x30,$out),$inout3);
1112
1113&set_label("ctr32_ret");
1114	&pxor	("xmm0","xmm0");		# clear register bank
1115	&pxor	("xmm1","xmm1");
1116	&pxor	("xmm2","xmm2");
1117	&pxor	("xmm3","xmm3");
1118	&pxor	("xmm4","xmm4");
1119	&movdqa	(&QWP(32,"esp"),"xmm0");	# clear stack
1120	&pxor	("xmm5","xmm5");
1121	&movdqa	(&QWP(48,"esp"),"xmm0");
1122	&pxor	("xmm6","xmm6");
1123	&movdqa	(&QWP(64,"esp"),"xmm0");
1124	&pxor	("xmm7","xmm7");
1125	&mov	("esp",&DWP(80,"esp"));
1126&function_end("${PREFIX}_ctr32_encrypt_blocks");
1127
1128######################################################################
1129# void aes_hw_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1130#	const AES_KEY *key1, const AES_KEY *key2,
1131#	const unsigned char iv[16]);
1132#
1133{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
1134
# ${PREFIX}_xts_encrypt: emit the XTS encryption routine.
#
# Arguments, as fetched through wparam() below:
#   0 = inp, 1 = out, 2 = len, 3 = key1, 4 = key2,
#   5 = pointer to the 16-byte clear-text tweak.
# len is treated as a byte count: it is masked with -16 to get the whole
# blocks and with 15 to get the trailing partial block.
#
# $tweak/$twtmp/$twres/$twmask are the lexical aliases declared just
# before this function for $rndkey1/$rndkey0/$inout0/$inout1.  They are
# therefore overwritten whenever round keys or input blocks are loaded,
# which is why the tweak and the mask are re-read from the stack after
# every 6-block batch.
#
# Scratch frame, relative to the 16-byte aligned %esp:
#   16*0 .. 16*5	tweaks of the batch currently being processed
#   16*6		constant dwords {0x87,0,1,0}
#   16*7+0		original $len, later overwritten with $len%16
#   16*7+4		caller's %esp
#
# Tweak update: paddq doubles both 64-bit halves of $tweak; the sign
# masks produced by pcmpgtd are shuffled (pshufd 0x13) and masked with
# the constant above so that the bit leaving the low half is xor-ed into
# bit 0 of the high half and 0x87 is xor-ed into the low dword when the
# top bit of the whole value was set.
#
# Flow: six blocks per iteration while possible, then 1..5 remaining
# whole blocks, then (if len%16 != 0) a byte-swapping pass over the last
# full output block followed by one more single-block encryption.
1135&function_begin("${PREFIX}_xts_encrypt");
1136	&mov	($key,&wparam(4));		# key2
1137	&mov	($inp,&wparam(5));		# clear-text tweak
1138
	# Encrypt the clear-text tweak; the result is used from $inout0.
	# NOTE(review): the single-block helper is defined outside this
	# section - presumably it consumes $key/$rounds as set up here.
1139	&mov	($rounds,&DWP(240,$key));	# key2->rounds
1140	&movups	($inout0,&QWP(0,$inp));
1141	if ($inline)
1142	{   &aesni_inline_generate1("enc");	}
1143	else
1144	{   &call	("_aesni_encrypt1");	}
1145
1146	&mov	($inp,&wparam(0));
1147	&mov	($out,&wparam(1));
1148	&mov	($len,&wparam(2));
1149	&mov	($key,&wparam(3));		# key1
1150
	# Carve out the aligned scratch frame; $key_ briefly carries the
	# caller's %esp until it is stored at 16*7+4.
1151	&mov	($key_,"esp");
1152	&sub	("esp",16*7+8);
1153	&mov	($rounds,&DWP(240,$key));	# key1->rounds
1154	&and	("esp",-16);			# align stack
1155
1156	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
1157	&mov	(&DWP(16*6+4,"esp"),0);
1158	&mov	(&DWP(16*6+8,"esp"),1);
1159	&mov	(&DWP(16*6+12,"esp"),0);
1160	&mov	(&DWP(16*7+0,"esp"),$len);	# save original $len
1161	&mov	(&DWP(16*7+4,"esp"),$key_);	# save original %esp
1162
	# First tweak = encrypted IV; $twtmp = per-dword sign masks of it.
1163	&movdqa	($tweak,$inout0);
1164	&pxor	($twtmp,$twtmp);
1165	&movdqa	($twmask,&QWP(6*16,"esp"));	# 0x0...010...87
1166	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1167
	# Keep only whole blocks in $len and bias it by one 6-block batch;
	# a borrow means fewer than six whole blocks are available.
1168	&and	($len,-16);
1169	&mov	($key_,$key);			# backup $key
1170	&mov	($rounds_,$rounds);		# backup $rounds
1171	&sub	($len,16*6);
1172	&jc	(&label("xts_enc_short"));
1173
	# $rounds_ = 16 - 16*rounds and $key = key1 + 16*rounds + 32; these
	# are the values in place when _aesni_encrypt6_enter is called below.
	# NOTE(review): that entry point lives outside this section - confirm
	# this is the form it expects.
1174	&shl	($rounds,4);
1175	&mov	($rounds_,16);
1176	&sub	($rounds_,$rounds);
1177	&lea	($key,&DWP(32,$key,$rounds));
1178	&jmp	(&label("xts_enc_loop6"));
1179
1180&set_label("xts_enc_loop6",16);
	# Generator-time loop, unrolled four times: store the current tweak
	# in stack slot $i, then advance $tweak to the next one.
1181	for ($i=0;$i<4;$i++) {
1182	    &pshufd	($twres,$twtmp,0x13);
1183	    &pxor	($twtmp,$twtmp);
1184	    &movdqa	(&QWP(16*$i,"esp"),$tweak);
1185	    &paddq	($tweak,$tweak);	# &psllq($tweak,1);
1186	    &pand	($twres,$twmask);	# isolate carry and residue
1187	    &pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
1188	    &pxor	($tweak,$twres);
1189	}
	# Fifth tweak goes to slot 4 ($i becomes 5 afterwards); the sixth is
	# assembled in $inout5 and saved to slot 5 further down.
1190	&pshufd	($inout5,$twtmp,0x13);
1191	&movdqa	(&QWP(16*$i++,"esp"),$tweak);
1192	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1193	 &$movekey	($rndkey0,&QWP(0,$key_));
1194	&pand	($inout5,$twmask);		# isolate carry and residue
1195	 &movups	($inout0,&QWP(0,$inp));	# load input
1196	&pxor	($inout5,$tweak);
1197
1198	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1199	&mov	($rounds,$rounds_);		# restore $rounds
1200	&movdqu	($inout1,&QWP(16*1,$inp));
1201	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
1202	&movdqu	($inout2,&QWP(16*2,$inp));
1203	 &pxor		($inout1,$rndkey0);
1204	&movdqu	($inout3,&QWP(16*3,$inp));
1205	 &pxor		($inout2,$rndkey0);
1206	&movdqu	($inout4,&QWP(16*4,$inp));
1207	 &pxor		($inout3,$rndkey0);
1208	&movdqu	($rndkey1,&QWP(16*5,$inp));
1209	 &pxor		($inout4,$rndkey0);
1210	&lea	($inp,&DWP(16*6,$inp));
1211	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1212	&movdqa	(&QWP(16*$i,"esp"),$inout5);	# save last tweak
1213	&pxor	($inout5,$rndkey1);
1214
	# First AES round (key[1]) is issued here, interleaved with the
	# remaining tweak xors, before handing over to the shared 6x body.
1215	 &$movekey	($rndkey1,&QWP(16,$key_));
1216	&pxor	($inout1,&QWP(16*1,"esp"));
1217	&pxor	($inout2,&QWP(16*2,"esp"));
1218	 &aesenc	($inout0,$rndkey1);
1219	&pxor	($inout3,&QWP(16*3,"esp"));
1220	&pxor	($inout4,&QWP(16*4,"esp"));
1221	 &aesenc	($inout1,$rndkey1);
1222	&pxor		($inout5,$rndkey0);
1223	 &$movekey	($rndkey0,&QWP(32,$key_));
1224	 &aesenc	($inout2,$rndkey1);
1225	 &aesenc	($inout3,$rndkey1);
1226	 &aesenc	($inout4,$rndkey1);
1227	 &aesenc	($inout5,$rndkey1);
1228	&call		(&label("_aesni_encrypt6_enter"));
1229
	# Reload the batch's last tweak from slot 5 and, interleaved with
	# the output xors/stores, derive the first tweak of the next batch.
1230	&movdqa	($tweak,&QWP(16*5,"esp"));	# last tweak
1231       &pxor	($twtmp,$twtmp);
1232	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1233       &pcmpgtd	($twtmp,$tweak);		# broadcast upper bits
1234	&xorps	($inout1,&QWP(16*1,"esp"));
1235	&movups	(&QWP(16*0,$out),$inout0);	# write output
1236	&xorps	($inout2,&QWP(16*2,"esp"));
1237	&movups	(&QWP(16*1,$out),$inout1);
1238	&xorps	($inout3,&QWP(16*3,"esp"));
1239	&movups	(&QWP(16*2,$out),$inout2);
1240	&xorps	($inout4,&QWP(16*4,"esp"));
1241	&movups	(&QWP(16*3,$out),$inout3);
1242	&xorps	($inout5,$tweak);
1243	&movups	(&QWP(16*4,$out),$inout4);
1244       &pshufd	($twres,$twtmp,0x13);
1245	&movups	(&QWP(16*5,$out),$inout5);
1246	&lea	($out,&DWP(16*6,$out));
1247       &movdqa	($twmask,&QWP(16*6,"esp"));	# 0x0...010...87
1248
1249	&pxor	($twtmp,$twtmp);
1250	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1251	&pand	($twres,$twmask);		# isolate carry and residue
1252	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1253	&pxor	($tweak,$twres);
1254
1255	&sub	($len,16*6);
1256	&jnc	(&label("xts_enc_loop6"));
1257
	# Leaving the 6x loop: put back the plain key pointer/round count.
1258	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
1259	&mov	($key,$key_);			# restore $key
1260	&mov	($rounds_,$rounds);
1261
1262&set_label("xts_enc_short");
	# Undo the bias: $len is now the byte count of the remaining whole
	# blocks.  Dispatch on it; every fall-through advances $tweak once
	# and parks the previous value in $inout3/$inout4/$inout5.  Only SSE
	# instructions sit between each cmp and the je that follows the jb,
	# so the je still tests that cmp.
1263	&add	($len,16*6);
1264	&jz	(&label("xts_enc_done6x"));
1265
1266	&movdqa	($inout3,$tweak);		# put aside previous tweak
1267	&cmp	($len,0x20);
1268	&jb	(&label("xts_enc_one"));
1269
1270	&pshufd	($twres,$twtmp,0x13);
1271	&pxor	($twtmp,$twtmp);
1272	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1273	&pand	($twres,$twmask);		# isolate carry and residue
1274	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1275	&pxor	($tweak,$twres);
1276	&je	(&label("xts_enc_two"));
1277
1278	&pshufd	($twres,$twtmp,0x13);
1279	&pxor	($twtmp,$twtmp);
1280	&movdqa	($inout4,$tweak);		# put aside previous tweak
1281	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1282	&pand	($twres,$twmask);		# isolate carry and residue
1283	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1284	&pxor	($tweak,$twres);
1285	&cmp	($len,0x40);
1286	&jb	(&label("xts_enc_three"));
1287
1288	&pshufd	($twres,$twtmp,0x13);
1289	&pxor	($twtmp,$twtmp);
1290	&movdqa	($inout5,$tweak);		# put aside previous tweak
1291	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1292	&pand	($twres,$twmask);		# isolate carry and residue
1293	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1294	&pxor	($tweak,$twres);
1295	&movdqa	(&QWP(16*0,"esp"),$inout3);
1296	&movdqa	(&QWP(16*1,"esp"),$inout4);
1297	&je	(&label("xts_enc_four"));
1298
	# Five blocks: tweaks 1..4 go to stack slots 0..3, the fifth is
	# built in $inout5 and kept in slot 4.
1299	&movdqa	(&QWP(16*2,"esp"),$inout5);
1300	&pshufd	($inout5,$twtmp,0x13);
1301	&movdqa	(&QWP(16*3,"esp"),$tweak);
1302	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1303	&pand	($inout5,$twmask);		# isolate carry and residue
1304	&pxor	($inout5,$tweak);
1305
1306	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
1307	&movdqu	($inout1,&QWP(16*1,$inp));
1308	&movdqu	($inout2,&QWP(16*2,$inp));
1309	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1310	&movdqu	($inout3,&QWP(16*3,$inp));
1311	&pxor	($inout1,&QWP(16*1,"esp"));
1312	&movdqu	($inout4,&QWP(16*4,$inp));
1313	&pxor	($inout2,&QWP(16*2,"esp"));
1314	&lea	($inp,&DWP(16*5,$inp));
1315	&pxor	($inout3,&QWP(16*3,"esp"));
1316	&movdqa	(&QWP(16*4,"esp"),$inout5);	# save last tweak
1317	&pxor	($inout4,$inout5);
1318
1319	&call	("_aesni_encrypt6");
1320
1321	&movaps	($tweak,&QWP(16*4,"esp"));	# last tweak
1322	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1323	&xorps	($inout1,&QWP(16*1,"esp"));
1324	&xorps	($inout2,&QWP(16*2,"esp"));
1325	&movups	(&QWP(16*0,$out),$inout0);	# write output
1326	&xorps	($inout3,&QWP(16*3,"esp"));
1327	&movups	(&QWP(16*1,$out),$inout1);
1328	&xorps	($inout4,$tweak);
1329	&movups	(&QWP(16*2,$out),$inout2);
1330	&movups	(&QWP(16*3,$out),$inout3);
1331	&movups	(&QWP(16*4,$out),$inout4);
1332	&lea	($out,&DWP(16*5,$out));
1333	&jmp	(&label("xts_enc_done"));
1334
	# One block: its tweak was parked in $inout3 before the dispatch.
1335&set_label("xts_enc_one",16);
1336	&movups	($inout0,&QWP(16*0,$inp));	# load input
1337	&lea	($inp,&DWP(16*1,$inp));
1338	&xorps	($inout0,$inout3);		# input^=tweak
1339	if ($inline)
1340	{   &aesni_inline_generate1("enc");	}
1341	else
1342	{   &call	("_aesni_encrypt1");	}
1343	&xorps	($inout0,$inout3);		# output^=tweak
1344	&movups	(&QWP(16*0,$out),$inout0);	# write output
1345	&lea	($out,&DWP(16*1,$out));
1346
1347	&movdqa	($tweak,$inout3);		# last tweak
1348	&jmp	(&label("xts_enc_done"));
1349
	# Two blocks: tweaks in $inout3 and $inout4.
1350&set_label("xts_enc_two",16);
1351	&movaps	($inout4,$tweak);		# put aside last tweak
1352
1353	&movups	($inout0,&QWP(16*0,$inp));	# load input
1354	&movups	($inout1,&QWP(16*1,$inp));
1355	&lea	($inp,&DWP(16*2,$inp));
1356	&xorps	($inout0,$inout3);		# input^=tweak
1357	&xorps	($inout1,$inout4);
1358
1359	&call	("_aesni_encrypt2");
1360
1361	&xorps	($inout0,$inout3);		# output^=tweak
1362	&xorps	($inout1,$inout4);
1363	&movups	(&QWP(16*0,$out),$inout0);	# write output
1364	&movups	(&QWP(16*1,$out),$inout1);
1365	&lea	($out,&DWP(16*2,$out));
1366
1367	&movdqa	($tweak,$inout4);		# last tweak
1368	&jmp	(&label("xts_enc_done"));
1369
	# Three blocks: tweaks in $inout3, $inout4 and $inout5.
1370&set_label("xts_enc_three",16);
1371	&movaps	($inout5,$tweak);		# put aside last tweak
1372	&movups	($inout0,&QWP(16*0,$inp));	# load input
1373	&movups	($inout1,&QWP(16*1,$inp));
1374	&movups	($inout2,&QWP(16*2,$inp));
1375	&lea	($inp,&DWP(16*3,$inp));
1376	&xorps	($inout0,$inout3);		# input^=tweak
1377	&xorps	($inout1,$inout4);
1378	&xorps	($inout2,$inout5);
1379
1380	&call	("_aesni_encrypt3");
1381
1382	&xorps	($inout0,$inout3);		# output^=tweak
1383	&xorps	($inout1,$inout4);
1384	&xorps	($inout2,$inout5);
1385	&movups	(&QWP(16*0,$out),$inout0);	# write output
1386	&movups	(&QWP(16*1,$out),$inout1);
1387	&movups	(&QWP(16*2,$out),$inout2);
1388	&lea	($out,&DWP(16*3,$out));
1389
1390	&movdqa	($tweak,$inout5);		# last tweak
1391	&jmp	(&label("xts_enc_done"));
1392
	# Four blocks: tweaks 1 and 2 were spilled to stack slots 0 and 1
	# before the branch, tweak 3 is in $inout5 and tweak 4 in $inout4.
1393&set_label("xts_enc_four",16);
1394	&movaps	($inout4,$tweak);		# put aside last tweak
1395
1396	&movups	($inout0,&QWP(16*0,$inp));	# load input
1397	&movups	($inout1,&QWP(16*1,$inp));
1398	&movups	($inout2,&QWP(16*2,$inp));
1399	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1400	&movups	($inout3,&QWP(16*3,$inp));
1401	&lea	($inp,&DWP(16*4,$inp));
1402	&xorps	($inout1,&QWP(16*1,"esp"));
1403	&xorps	($inout2,$inout5);
1404	&xorps	($inout3,$inout4);
1405
1406	&call	("_aesni_encrypt4");
1407
1408	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1409	&xorps	($inout1,&QWP(16*1,"esp"));
1410	&xorps	($inout2,$inout5);
1411	&movups	(&QWP(16*0,$out),$inout0);	# write output
1412	&xorps	($inout3,$inout4);
1413	&movups	(&QWP(16*1,$out),$inout1);
1414	&movups	(&QWP(16*2,$out),$inout2);
1415	&movups	(&QWP(16*3,$out),$inout3);
1416	&lea	($out,&DWP(16*4,$out));
1417
1418	&movdqa	($tweak,$inout4);		# last tweak
1419	&jmp	(&label("xts_enc_done"));
1420
	# No whole blocks were left after the 6x loop (or at all): $tweak is
	# already the next unused tweak, so it is used as-is for the partial
	# block.
1421&set_label("xts_enc_done6x",16);		# $tweak is pre-calculated
1422	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
1423	&and	($len,15);
1424	&jz	(&label("xts_enc_ret"));
1425	&movdqa	($inout3,$tweak);
1426	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
1427	&jmp	(&label("xts_enc_steal"));
1428
	# Arriving from the 1..5 block paths $tweak is the last tweak used;
	# when a partial block remains, advance it once into $inout3.
1429&set_label("xts_enc_done",16);
1430	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
1431	&pxor	($twtmp,$twtmp);
1432	&and	($len,15);
1433	&jz	(&label("xts_enc_ret"));
1434
1435	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1436	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
1437	&pshufd	($inout3,$twtmp,0x13);
1438	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1439	&pand	($inout3,&QWP(16*6,"esp"));	# isolate carry and residue
1440	&pxor	($inout3,$tweak);
1441
	# Byte loop, run $len%16 times: the next tail input byte overwrites
	# a byte of the last full output block (at $out-16) and the byte it
	# displaces is written out as tail output.  $rounds/$key serve as
	# byte scratch here and are restored afterwards.
1442&set_label("xts_enc_steal");
1443	&movz	($rounds,&BP(0,$inp));
1444	&movz	($key,&BP(-16,$out));
1445	&lea	($inp,&DWP(1,$inp));
1446	&mov	(&BP(-16,$out),&LB($rounds));
1447	&mov	(&BP(0,$out),&LB($key));
1448	&lea	($out,&DWP(1,$out));
1449	&sub	($len,1);
1450	&jnz	(&label("xts_enc_steal"));
1451
1452	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
1453	&mov	($key,$key_);			# restore $key
1454	&mov	($rounds,$rounds_);		# restore $rounds
1455
	# Encrypt the patched block in place with the tweak in $inout3.
1456	&movups	($inout0,&QWP(-16,$out));	# load input
1457	&xorps	($inout0,$inout3);		# input^=tweak
1458	if ($inline)
1459	{   &aesni_inline_generate1("enc");	}
1460	else
1461	{   &call	("_aesni_encrypt1");	}
1462	&xorps	($inout0,$inout3);		# output^=tweak
1463	&movups	(&QWP(-16,$out),$inout0);	# write output
1464
	# Common exit: zero xmm0-xmm7 and the six tweak slots, then switch
	# back to the caller's stack pointer saved at 16*7+4.
1465&set_label("xts_enc_ret");
1466	&pxor	("xmm0","xmm0");		# clear register bank
1467	&pxor	("xmm1","xmm1");
1468	&pxor	("xmm2","xmm2");
1469	&movdqa	(&QWP(16*0,"esp"),"xmm0");	# clear stack
1470	&pxor	("xmm3","xmm3");
1471	&movdqa	(&QWP(16*1,"esp"),"xmm0");
1472	&pxor	("xmm4","xmm4");
1473	&movdqa	(&QWP(16*2,"esp"),"xmm0");
1474	&pxor	("xmm5","xmm5");
1475	&movdqa	(&QWP(16*3,"esp"),"xmm0");
1476	&pxor	("xmm6","xmm6");
1477	&movdqa	(&QWP(16*4,"esp"),"xmm0");
1478	&pxor	("xmm7","xmm7");
1479	&movdqa	(&QWP(16*5,"esp"),"xmm0");
1480	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
1481&function_end("${PREFIX}_xts_encrypt");
1482
# ${PREFIX}_xts_decrypt: emit the XTS decryption routine.
#
# Arguments, as fetched through wparam() below:
#   0 = inp, 1 = out, 2 = len, 3 = key1, 4 = key2,
#   5 = pointer to the 16-byte clear-text tweak.
# The tweak is still *encrypted* with key2 (the "enc" helper is used);
# only the data blocks go through the decrypt helpers with key1.
#
# $tweak/$twtmp/$twres/$twmask are the lexical aliases declared just
# before the XTS functions for $rndkey1/$rndkey0/$inout0/$inout1, so they
# are overwritten whenever round keys or input blocks are loaded and are
# re-read from the stack after every 6-block batch.
#
# Scratch frame, relative to the 16-byte aligned %esp:
#   16*0 .. 16*5	tweaks of the batch currently being processed
#   16*6		constant dwords {0x87,0,1,0}
#   16*7+0		adjusted $len, later overwritten with $len%16
#   16*7+4		caller's %esp
#
# When len is not a multiple of 16, one full block is held back from the
# bulk paths (len -= 16).  That block and the partial tail are handled at
# xts_dec_only_one_more/xts_dec_steal, where the two last tweaks are
# applied in swapped order: the later tweak decrypts the held-back
# block, the earlier one decrypts the block rebuilt by the byte loop.
1483&function_begin("${PREFIX}_xts_decrypt");
1484	&mov	($key,&wparam(4));		# key2
1485	&mov	($inp,&wparam(5));		# clear-text tweak
1486
	# Encrypt the clear-text tweak; the result is used from $inout0.
	# NOTE(review): the single-block helper is defined outside this
	# section - presumably it consumes $key/$rounds as set up here.
1487	&mov	($rounds,&DWP(240,$key));	# key2->rounds
1488	&movups	($inout0,&QWP(0,$inp));
1489	if ($inline)
1490	{   &aesni_inline_generate1("enc");	}
1491	else
1492	{   &call	("_aesni_encrypt1");	}
1493
1494	&mov	($inp,&wparam(0));
1495	&mov	($out,&wparam(1));
1496	&mov	($len,&wparam(2));
1497	&mov	($key,&wparam(3));		# key1
1498
	# Carve out the aligned scratch frame; $key_ briefly carries the
	# caller's %esp until it is stored at 16*7+4.
1499	&mov	($key_,"esp");
1500	&sub	("esp",16*7+8);
1501	&and	("esp",-16);			# align stack
1502
	# $rounds_ = (len&15) ? 16 : 0, computed branch-free with setnz.
1503	&xor	($rounds_,$rounds_);		# if(len%16) len-=16;
1504	&test	($len,15);
1505	&setnz	(&LB($rounds_));
1506	&shl	($rounds_,4);
1507	&sub	($len,$rounds_);
1508
1509	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
1510	&mov	(&DWP(16*6+4,"esp"),0);
1511	&mov	(&DWP(16*6+8,"esp"),1);
1512	&mov	(&DWP(16*6+12,"esp"),0);
1513	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len (already reduced by 16 if len%16; low 4 bits unchanged)
1514	&mov	(&DWP(16*7+4,"esp"),$key_);	# save original %esp
1515
1516	&mov	($rounds,&DWP(240,$key));	# key1->rounds
1517	&mov	($key_,$key);			# backup $key
1518	&mov	($rounds_,$rounds);		# backup $rounds
1519
	# First tweak = encrypted IV; $twtmp = per-dword sign masks of it.
1520	&movdqa	($tweak,$inout0);
1521	&pxor	($twtmp,$twtmp);
1522	&movdqa	($twmask,&QWP(6*16,"esp"));	# 0x0...010...87
1523	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1524
	# Keep only whole blocks in $len and bias it by one 6-block batch;
	# a borrow means fewer than six whole blocks are available.
1525	&and	($len,-16);
1526	&sub	($len,16*6);
1527	&jc	(&label("xts_dec_short"));
1528
	# $rounds_ = 16 - 16*rounds and $key = key1 + 16*rounds + 32; these
	# are the values in place when _aesni_decrypt6_enter is called below.
	# NOTE(review): that entry point lives outside this section - confirm
	# this is the form it expects.
1529	&shl	($rounds,4);
1530	&mov	($rounds_,16);
1531	&sub	($rounds_,$rounds);
1532	&lea	($key,&DWP(32,$key,$rounds));
1533	&jmp	(&label("xts_dec_loop6"));
1534
1535&set_label("xts_dec_loop6",16);
	# Generator-time loop, unrolled four times: store the current tweak
	# in stack slot $i, then advance $tweak to the next one.
1536	for ($i=0;$i<4;$i++) {
1537	    &pshufd	($twres,$twtmp,0x13);
1538	    &pxor	($twtmp,$twtmp);
1539	    &movdqa	(&QWP(16*$i,"esp"),$tweak);
1540	    &paddq	($tweak,$tweak);	# &psllq($tweak,1);
1541	    &pand	($twres,$twmask);	# isolate carry and residue
1542	    &pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
1543	    &pxor	($tweak,$twres);
1544	}
	# Fifth tweak goes to slot 4 ($i becomes 5 afterwards); the sixth is
	# assembled in $inout5 and saved to slot 5 further down.
1545	&pshufd	($inout5,$twtmp,0x13);
1546	&movdqa	(&QWP(16*$i++,"esp"),$tweak);
1547	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1548	 &$movekey	($rndkey0,&QWP(0,$key_));
1549	&pand	($inout5,$twmask);		# isolate carry and residue
1550	 &movups	($inout0,&QWP(0,$inp));	# load input
1551	&pxor	($inout5,$tweak);
1552
1553	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1554	&mov	($rounds,$rounds_);
1555	&movdqu	($inout1,&QWP(16*1,$inp));
1556	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
1557	&movdqu	($inout2,&QWP(16*2,$inp));
1558	 &pxor		($inout1,$rndkey0);
1559	&movdqu	($inout3,&QWP(16*3,$inp));
1560	 &pxor		($inout2,$rndkey0);
1561	&movdqu	($inout4,&QWP(16*4,$inp));
1562	 &pxor		($inout3,$rndkey0);
1563	&movdqu	($rndkey1,&QWP(16*5,$inp));
1564	 &pxor		($inout4,$rndkey0);
1565	&lea	($inp,&DWP(16*6,$inp));
1566	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1567	&movdqa	(&QWP(16*$i,"esp"),$inout5);	# save last tweak
1568	&pxor	($inout5,$rndkey1);
1569
	# First AES round (key[1]) is issued here, interleaved with the
	# remaining tweak xors, before handing over to the shared 6x body.
1570	 &$movekey	($rndkey1,&QWP(16,$key_));
1571	&pxor	($inout1,&QWP(16*1,"esp"));
1572	&pxor	($inout2,&QWP(16*2,"esp"));
1573	 &aesdec	($inout0,$rndkey1);
1574	&pxor	($inout3,&QWP(16*3,"esp"));
1575	&pxor	($inout4,&QWP(16*4,"esp"));
1576	 &aesdec	($inout1,$rndkey1);
1577	&pxor		($inout5,$rndkey0);
1578	 &$movekey	($rndkey0,&QWP(32,$key_));
1579	 &aesdec	($inout2,$rndkey1);
1580	 &aesdec	($inout3,$rndkey1);
1581	 &aesdec	($inout4,$rndkey1);
1582	 &aesdec	($inout5,$rndkey1);
1583	&call		(&label("_aesni_decrypt6_enter"));
1584
	# Reload the batch's last tweak from slot 5 and, interleaved with
	# the output xors/stores, derive the first tweak of the next batch.
1585	&movdqa	($tweak,&QWP(16*5,"esp"));	# last tweak
1586       &pxor	($twtmp,$twtmp);
1587	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1588       &pcmpgtd	($twtmp,$tweak);		# broadcast upper bits
1589	&xorps	($inout1,&QWP(16*1,"esp"));
1590	&movups	(&QWP(16*0,$out),$inout0);	# write output
1591	&xorps	($inout2,&QWP(16*2,"esp"));
1592	&movups	(&QWP(16*1,$out),$inout1);
1593	&xorps	($inout3,&QWP(16*3,"esp"));
1594	&movups	(&QWP(16*2,$out),$inout2);
1595	&xorps	($inout4,&QWP(16*4,"esp"));
1596	&movups	(&QWP(16*3,$out),$inout3);
1597	&xorps	($inout5,$tweak);
1598	&movups	(&QWP(16*4,$out),$inout4);
1599       &pshufd	($twres,$twtmp,0x13);
1600	&movups	(&QWP(16*5,$out),$inout5);
1601	&lea	($out,&DWP(16*6,$out));
1602       &movdqa	($twmask,&QWP(16*6,"esp"));	# 0x0...010...87
1603
1604	&pxor	($twtmp,$twtmp);
1605	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1606	&pand	($twres,$twmask);		# isolate carry and residue
1607	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1608	&pxor	($tweak,$twres);
1609
1610	&sub	($len,16*6);
1611	&jnc	(&label("xts_dec_loop6"));
1612
	# Leaving the 6x loop: put back the plain key pointer/round count.
1613	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
1614	&mov	($key,$key_);			# restore $key
1615	&mov	($rounds_,$rounds);
1616
1617&set_label("xts_dec_short");
	# Undo the bias: $len is now the byte count of the remaining whole
	# blocks.  Dispatch on it; every fall-through advances $tweak once
	# and parks the previous value in $inout3/$inout4/$inout5.  Only SSE
	# instructions sit between each cmp and the je that follows the jb,
	# so the je still tests that cmp.
1618	&add	($len,16*6);
1619	&jz	(&label("xts_dec_done6x"));
1620
1621	&movdqa	($inout3,$tweak);		# put aside previous tweak
1622	&cmp	($len,0x20);
1623	&jb	(&label("xts_dec_one"));
1624
1625	&pshufd	($twres,$twtmp,0x13);
1626	&pxor	($twtmp,$twtmp);
1627	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1628	&pand	($twres,$twmask);		# isolate carry and residue
1629	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1630	&pxor	($tweak,$twres);
1631	&je	(&label("xts_dec_two"));
1632
1633	&pshufd	($twres,$twtmp,0x13);
1634	&pxor	($twtmp,$twtmp);
1635	&movdqa	($inout4,$tweak);		# put aside previous tweak
1636	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1637	&pand	($twres,$twmask);		# isolate carry and residue
1638	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1639	&pxor	($tweak,$twres);
1640	&cmp	($len,0x40);
1641	&jb	(&label("xts_dec_three"));
1642
1643	&pshufd	($twres,$twtmp,0x13);
1644	&pxor	($twtmp,$twtmp);
1645	&movdqa	($inout5,$tweak);		# put aside previous tweak
1646	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1647	&pand	($twres,$twmask);		# isolate carry and residue
1648	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1649	&pxor	($tweak,$twres);
1650	&movdqa	(&QWP(16*0,"esp"),$inout3);
1651	&movdqa	(&QWP(16*1,"esp"),$inout4);
1652	&je	(&label("xts_dec_four"));
1653
	# Five blocks: tweaks 1..4 go to stack slots 0..3, the fifth is
	# built in $inout5 and kept in slot 4.
1654	&movdqa	(&QWP(16*2,"esp"),$inout5);
1655	&pshufd	($inout5,$twtmp,0x13);
1656	&movdqa	(&QWP(16*3,"esp"),$tweak);
1657	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1658	&pand	($inout5,$twmask);		# isolate carry and residue
1659	&pxor	($inout5,$tweak);
1660
1661	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
1662	&movdqu	($inout1,&QWP(16*1,$inp));
1663	&movdqu	($inout2,&QWP(16*2,$inp));
1664	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1665	&movdqu	($inout3,&QWP(16*3,$inp));
1666	&pxor	($inout1,&QWP(16*1,"esp"));
1667	&movdqu	($inout4,&QWP(16*4,$inp));
1668	&pxor	($inout2,&QWP(16*2,"esp"));
1669	&lea	($inp,&DWP(16*5,$inp));
1670	&pxor	($inout3,&QWP(16*3,"esp"));
1671	&movdqa	(&QWP(16*4,"esp"),$inout5);	# save last tweak
1672	&pxor	($inout4,$inout5);
1673
1674	&call	("_aesni_decrypt6");
1675
1676	&movaps	($tweak,&QWP(16*4,"esp"));	# last tweak
1677	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1678	&xorps	($inout1,&QWP(16*1,"esp"));
1679	&xorps	($inout2,&QWP(16*2,"esp"));
1680	&movups	(&QWP(16*0,$out),$inout0);	# write output
1681	&xorps	($inout3,&QWP(16*3,"esp"));
1682	&movups	(&QWP(16*1,$out),$inout1);
1683	&xorps	($inout4,$tweak);
1684	&movups	(&QWP(16*2,$out),$inout2);
1685	&movups	(&QWP(16*3,$out),$inout3);
1686	&movups	(&QWP(16*4,$out),$inout4);
1687	&lea	($out,&DWP(16*5,$out));
1688	&jmp	(&label("xts_dec_done"));
1689
	# One block: its tweak was parked in $inout3 before the dispatch.
1690&set_label("xts_dec_one",16);
1691	&movups	($inout0,&QWP(16*0,$inp));	# load input
1692	&lea	($inp,&DWP(16*1,$inp));
1693	&xorps	($inout0,$inout3);		# input^=tweak
1694	if ($inline)
1695	{   &aesni_inline_generate1("dec");	}
1696	else
1697	{   &call	("_aesni_decrypt1");	}
1698	&xorps	($inout0,$inout3);		# output^=tweak
1699	&movups	(&QWP(16*0,$out),$inout0);	# write output
1700	&lea	($out,&DWP(16*1,$out));
1701
1702	&movdqa	($tweak,$inout3);		# last tweak
1703	&jmp	(&label("xts_dec_done"));
1704
	# Two blocks: tweaks in $inout3 and $inout4.
1705&set_label("xts_dec_two",16);
1706	&movaps	($inout4,$tweak);		# put aside last tweak
1707
1708	&movups	($inout0,&QWP(16*0,$inp));	# load input
1709	&movups	($inout1,&QWP(16*1,$inp));
1710	&lea	($inp,&DWP(16*2,$inp));
1711	&xorps	($inout0,$inout3);		# input^=tweak
1712	&xorps	($inout1,$inout4);
1713
1714	&call	("_aesni_decrypt2");
1715
1716	&xorps	($inout0,$inout3);		# output^=tweak
1717	&xorps	($inout1,$inout4);
1718	&movups	(&QWP(16*0,$out),$inout0);	# write output
1719	&movups	(&QWP(16*1,$out),$inout1);
1720	&lea	($out,&DWP(16*2,$out));
1721
1722	&movdqa	($tweak,$inout4);		# last tweak
1723	&jmp	(&label("xts_dec_done"));
1724
	# Three blocks: tweaks in $inout3, $inout4 and $inout5.
1725&set_label("xts_dec_three",16);
1726	&movaps	($inout5,$tweak);		# put aside last tweak
1727	&movups	($inout0,&QWP(16*0,$inp));	# load input
1728	&movups	($inout1,&QWP(16*1,$inp));
1729	&movups	($inout2,&QWP(16*2,$inp));
1730	&lea	($inp,&DWP(16*3,$inp));
1731	&xorps	($inout0,$inout3);		# input^=tweak
1732	&xorps	($inout1,$inout4);
1733	&xorps	($inout2,$inout5);
1734
1735	&call	("_aesni_decrypt3");
1736
1737	&xorps	($inout0,$inout3);		# output^=tweak
1738	&xorps	($inout1,$inout4);
1739	&xorps	($inout2,$inout5);
1740	&movups	(&QWP(16*0,$out),$inout0);	# write output
1741	&movups	(&QWP(16*1,$out),$inout1);
1742	&movups	(&QWP(16*2,$out),$inout2);
1743	&lea	($out,&DWP(16*3,$out));
1744
1745	&movdqa	($tweak,$inout5);		# last tweak
1746	&jmp	(&label("xts_dec_done"));
1747
	# Four blocks: tweaks 1 and 2 were spilled to stack slots 0 and 1
	# before the branch, tweak 3 is in $inout5 and tweak 4 in $inout4.
1748&set_label("xts_dec_four",16);
1749	&movaps	($inout4,$tweak);		# put aside last tweak
1750
1751	&movups	($inout0,&QWP(16*0,$inp));	# load input
1752	&movups	($inout1,&QWP(16*1,$inp));
1753	&movups	($inout2,&QWP(16*2,$inp));
1754	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1755	&movups	($inout3,&QWP(16*3,$inp));
1756	&lea	($inp,&DWP(16*4,$inp));
1757	&xorps	($inout1,&QWP(16*1,"esp"));
1758	&xorps	($inout2,$inout5);
1759	&xorps	($inout3,$inout4);
1760
1761	&call	("_aesni_decrypt4");
1762
1763	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1764	&xorps	($inout1,&QWP(16*1,"esp"));
1765	&xorps	($inout2,$inout5);
1766	&movups	(&QWP(16*0,$out),$inout0);	# write output
1767	&xorps	($inout3,$inout4);
1768	&movups	(&QWP(16*1,$out),$inout1);
1769	&movups	(&QWP(16*2,$out),$inout2);
1770	&movups	(&QWP(16*3,$out),$inout3);
1771	&lea	($out,&DWP(16*4,$out));
1772
1773	&movdqa	($tweak,$inout4);		# last tweak
1774	&jmp	(&label("xts_dec_done"));
1775
	# No whole blocks were left for the short paths: $tweak, $twtmp and
	# $twmask are already set up for the next unused tweak, so go
	# straight to the held-back block.
1776&set_label("xts_dec_done6x",16);		# $tweak is pre-calculated
1777	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
1778	&and	($len,15);
1779	&jz	(&label("xts_dec_ret"));
1780	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
1781	&jmp	(&label("xts_dec_only_one_more"));
1782
	# Arriving from the 1..5 block paths $tweak is the last tweak used;
	# when a partial block remains, advance it once (refreshing $twtmp
	# and $twmask) before falling into xts_dec_only_one_more.
1783&set_label("xts_dec_done",16);
1784	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
1785	&pxor	($twtmp,$twtmp);
1786	&and	($len,15);
1787	&jz	(&label("xts_dec_ret"));
1788
1789	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1790	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
1791	&pshufd	($twres,$twtmp,0x13);
1792	&pxor	($twtmp,$twtmp);
1793	&movdqa	($twmask,&QWP(16*6,"esp"));
1794	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1795	&pand	($twres,$twmask);		# isolate carry and residue
1796	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1797	&pxor	($tweak,$twres);
1798
	# $inout4 keeps the current tweak, $inout3 receives the one after
	# it.  The held-back full block at $inp is decrypted with the later
	# tweak ($inout3) and written to $out; neither pointer is advanced.
1799&set_label("xts_dec_only_one_more");
1800	&pshufd	($inout3,$twtmp,0x13);
1801	&movdqa	($inout4,$tweak);		# put aside previous tweak
1802	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1803	&pand	($inout3,$twmask);		# isolate carry and residue
1804	&pxor	($inout3,$tweak);
1805
1806	&mov	($key,$key_);			# restore $key
1807	&mov	($rounds,$rounds_);		# restore $rounds
1808
1809	&movups	($inout0,&QWP(0,$inp));		# load input
1810	&xorps	($inout0,$inout3);		# input^=tweak
1811	if ($inline)
1812	{   &aesni_inline_generate1("dec");	}
1813	else
1814	{   &call	("_aesni_decrypt1");	}
1815	&xorps	($inout0,$inout3);		# output^=tweak
1816	&movups	(&QWP(0,$out),$inout0);		# write output
1817
	# Byte loop, run $len%16 times: the tail input byte at $inp+16
	# overwrites a byte of the block just written at $out, and the byte
	# it displaces is written out at $out+16 as tail output.
	# $rounds/$key serve as byte scratch and are restored afterwards.
1818&set_label("xts_dec_steal");
1819	&movz	($rounds,&BP(16,$inp));
1820	&movz	($key,&BP(0,$out));
1821	&lea	($inp,&DWP(1,$inp));
1822	&mov	(&BP(0,$out),&LB($rounds));
1823	&mov	(&BP(16,$out),&LB($key));
1824	&lea	($out,&DWP(1,$out));
1825	&sub	($len,1);
1826	&jnz	(&label("xts_dec_steal"));
1827
1828	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
1829	&mov	($key,$key_);			# restore $key
1830	&mov	($rounds,$rounds_);		# restore $rounds
1831
	# Decrypt the patched block in place with the earlier tweak kept in
	# $inout4.
1832	&movups	($inout0,&QWP(0,$out));		# load input
1833	&xorps	($inout0,$inout4);		# input^=tweak
1834	if ($inline)
1835	{   &aesni_inline_generate1("dec");	}
1836	else
1837	{   &call	("_aesni_decrypt1");	}
1838	&xorps	($inout0,$inout4);		# output^=tweak
1839	&movups	(&QWP(0,$out),$inout0);		# write output
1840
	# Common exit: zero xmm0-xmm7 and the six tweak slots, then switch
	# back to the caller's stack pointer saved at 16*7+4.
1841&set_label("xts_dec_ret");
1842	&pxor	("xmm0","xmm0");		# clear register bank
1843	&pxor	("xmm1","xmm1");
1844	&pxor	("xmm2","xmm2");
1845	&movdqa	(&QWP(16*0,"esp"),"xmm0");	# clear stack
1846	&pxor	("xmm3","xmm3");
1847	&movdqa	(&QWP(16*1,"esp"),"xmm0");
1848	&pxor	("xmm4","xmm4");
1849	&movdqa	(&QWP(16*2,"esp"),"xmm0");
1850	&pxor	("xmm5","xmm5");
1851	&movdqa	(&QWP(16*3,"esp"),"xmm0");
1852	&pxor	("xmm6","xmm6");
1853	&movdqa	(&QWP(16*4,"esp"),"xmm0");
1854	&pxor	("xmm7","xmm7");
1855	&movdqa	(&QWP(16*5,"esp"),"xmm0");
1856	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
1857&function_end("${PREFIX}_xts_decrypt");
1858}
1859}
1860
1861######################################################################
1862# void $PREFIX_cbc_encrypt (const void *inp, void *out,
1863#                           size_t length, const AES_KEY *key,
1864#                           unsigned char *ivp,const int enc);
# CBC-mode bulk routine.  Arguments are fetched through wparam():
#	0: inp		1: out		2: length
#	3: key		4: ivp		5: enc (0 selects decryption)
#
# A zero length returns immediately without touching *ivp.  Otherwise a
# small 16-byte aligned frame is carved below the caller's %esp:
#	 0(%esp)	16-byte scratch slot (saved IV / partial last block)
#	16(%esp)	caller's original %esp
# $key_ and $rounds_ keep backups of $key and $rounds, which are restored
# after every call into the block primitives.  On exit the chaining value
# left in $ivec is written back to *ivp.
&function_begin("${PREFIX}_cbc_encrypt");
	&mov	($inp,&wparam(0));
	&mov	($rounds_,"esp");
	&mov	($out,&wparam(1));
	&sub	($rounds_,24);
	&mov	($len,&wparam(2));
	&and	($rounds_,-16);			# aligned frame address
	&mov	($key,&wparam(3));
	&mov	($key_,&wparam(4));		# ivp, only needed to load the IV
	&test	($len,$len);
	&jz	(&label("cbc_abort"));		# nothing to do, frame not set up

	&cmp	(&wparam(5),0);			# flags consumed by &je below
	&xchg	($rounds_,"esp");		# alloca
	&movups	($ivec,&QWP(0,$key_));		# load IV
	&mov	($rounds,&DWP(240,$key));
	&mov	($key_,$key);			# backup $key
	&mov	(&DWP(16,"esp"),$rounds_);	# save original %esp
	&mov	($rounds_,$rounds);		# backup $rounds
	&je	(&label("cbc_decrypt"));

	# ---- encryption: strictly one block at a time -------------------
	&movaps	($inout0,$ivec);
	&cmp	($len,16);
	&jb	(&label("cbc_enc_tail"));	# input shorter than one block
	&sub	($len,16);
	&jmp	(&label("cbc_enc_loop"));

&set_label("cbc_enc_loop",16);
	&movups	($ivec,&QWP(0,$inp));		# input actually
	&lea	($inp,&DWP(16,$inp));
	if ($inline)
	{   &aesni_inline_generate1("enc",$inout0,$ivec);	}
	else
	{   &xorps($inout0,$ivec); &call("_aesni_encrypt1");	}
	&mov	($rounds,$rounds_);	# restore $rounds
	&mov	($key,$key_);		# restore $key
	&movups	(&QWP(0,$out),$inout0);	# store output
	&lea	($out,&DWP(16,$out));
	&sub	($len,16);
	&jnc	(&label("cbc_enc_loop"));
	&add	($len,16);		# undo the borrow: bytes still pending
	&jnz	(&label("cbc_enc_tail"));
	&movaps	($ivec,$inout0);	# last ciphertext block is the new IV
	&pxor	($inout0,$inout0);
	&jmp	(&label("cbc_ret"));

	# Partial final block: copy the $len leftover bytes to the output
	# buffer, zero-fill up to 16 bytes there, then run the loop body once
	# more with $inp pointing at that padded block.
&set_label("cbc_enc_tail");
	&mov	("ecx",$len);		# zaps $rounds
	&data_word(0xA4F3F689);		# rep movsb
	&mov	("ecx",16);		# zero tail
	&sub	("ecx",$len);
	&xor	("eax","eax");		# zaps $len
	&data_word(0xAAF3F689);		# rep stosb
	&lea	($out,&DWP(-16,$out));	# rewind $out by 1 block
	&mov	($rounds,$rounds_);	# restore $rounds
	&mov	($inp,$out);		# $inp and $out are the same
	&mov	($key,$key_);		# restore $key
	&jmp	(&label("cbc_enc_loop"));
######################################################################
	# ---- decryption: six blocks per iteration while more than 0x50
	# bytes remain, then a 1..5 block tail ----------------------------
&set_label("cbc_decrypt",16);
	&cmp	($len,0x50);
	&jbe	(&label("cbc_dec_tail"));
	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
	&sub	($len,0x50);
	&jmp	(&label("cbc_dec_loop6_enter"));

&set_label("cbc_dec_loop6",16);
	# The sixth block of the previous iteration is written here, after
	# its ciphertext (now in $rndkey0) has been parked as the next IV.
	&movaps	(&QWP(0,"esp"),$rndkey0);	# save IV
	&movups	(&QWP(0,$out),$inout5);
	&lea	($out,&DWP(0x10,$out));
&set_label("cbc_dec_loop6_enter");
	&movdqu	($inout0,&QWP(0,$inp));
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movdqu	($inout5,&QWP(0x50,$inp));

	&call	("_aesni_decrypt6");

	# Re-read the ciphertext blocks and xor each decrypted block with its
	# predecessor; loads and xors are interleaved through two registers.
	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,&QWP(0,"esp"));	# ^=IV
	&xorps	($inout1,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout2,$rndkey0);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout3,$rndkey1);
	&movups	($rndkey1,&QWP(0x40,$inp));
	&xorps	($inout4,$rndkey0);
	&movups	($rndkey0,&QWP(0x50,$inp));	# IV
	&xorps	($inout5,$rndkey1);
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&lea	($inp,&DWP(0x60,$inp));
	&movups	(&QWP(0x20,$out),$inout2);
	&mov	($rounds,$rounds_);		# restore $rounds
	&movups	(&QWP(0x30,$out),$inout3);
	&mov	($key,$key_);			# restore $key
	&movups	(&QWP(0x40,$out),$inout4);
	&lea	($out,&DWP(0x50,$out));		# sixth block is still pending
	&sub	($len,0x60);
	&ja	(&label("cbc_dec_loop6"));

	&movaps	($inout0,$inout5);		# pending block -> $inout0
	&movaps	($ivec,$rndkey0);
	&add	($len,0x50);
	&jle	(&label("cbc_dec_clear_tail_collected"));
	&movups	(&QWP(0,$out),$inout0);
	&lea	($out,&DWP(0x10,$out));
&set_label("cbc_dec_tail");
	# Load blocks one by one, branching out as soon as $len is covered.
	&movups	($inout0,&QWP(0,$inp));
	&movaps	($in0,$inout0);
	&cmp	($len,0x10);
	&jbe	(&label("cbc_dec_one"));

	&movups	($inout1,&QWP(0x10,$inp));
	&movaps	($in1,$inout1);
	&cmp	($len,0x20);
	&jbe	(&label("cbc_dec_two"));

	&movups	($inout2,&QWP(0x20,$inp));
	&cmp	($len,0x30);
	&jbe	(&label("cbc_dec_three"));

	&movups	($inout3,&QWP(0x30,$inp));
	&cmp	($len,0x40);
	&jbe	(&label("cbc_dec_four"));

	# Five blocks: reuse the six-way primitive with a zeroed sixth lane.
	&movups	($inout4,&QWP(0x40,$inp));
	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
	&movups	($inout0,&QWP(0,$inp));
	&xorps	($inout5,$inout5);
	&call	("_aesni_decrypt6");
	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,&QWP(0,"esp"));	# ^= IV
	&xorps	($inout1,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout2,$rndkey0);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout3,$rndkey1);
	&movups	($ivec,&QWP(0x40,$inp));	# IV
	&xorps	($inout4,$rndkey0);
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&pxor	($inout1,$inout1);		# wipe as soon as stored
	&movups	(&QWP(0x20,$out),$inout2);
	&pxor	($inout2,$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&pxor	($inout3,$inout3);
	&lea	($out,&DWP(0x40,$out));
	&movaps	($inout0,$inout4);		# last block is stored later
	&pxor	($inout4,$inout4);
	&sub	($len,0x50);
	&jmp	(&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_one",16);
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&xorps	($inout0,$ivec);
	&movaps	($ivec,$in0);			# ciphertext becomes next IV
	&sub	($len,0x10);
	&jmp	(&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_two",16);
	&call	("_aesni_decrypt2");
	&xorps	($inout0,$ivec);
	&xorps	($inout1,$in0);
	&movups	(&QWP(0,$out),$inout0);
	&movaps	($inout0,$inout1);
	&pxor	($inout1,$inout1);
	&lea	($out,&DWP(0x10,$out));
	&movaps	($ivec,$in1);
	&sub	($len,0x20);
	&jmp	(&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_three",16);
	&call	("_aesni_decrypt3");
	&xorps	($inout0,$ivec);
	&xorps	($inout1,$in0);
	&xorps	($inout2,$in1);
	&movups	(&QWP(0,$out),$inout0);
	&movaps	($inout0,$inout2);
	&pxor	($inout2,$inout2);
	&movups	(&QWP(0x10,$out),$inout1);
	&pxor	($inout1,$inout1);
	&lea	($out,&DWP(0x20,$out));
	&movups	($ivec,&QWP(0x20,$inp));
	&sub	($len,0x30);
	&jmp	(&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_four",16);
	&call	("_aesni_decrypt4");
	&movups	($rndkey1,&QWP(0x10,$inp));
	&movups	($rndkey0,&QWP(0x20,$inp));
	&xorps	($inout0,$ivec);
	&movups	($ivec,&QWP(0x30,$inp));
	&xorps	($inout1,$in0);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout2,$rndkey1);
	&movups	(&QWP(0x10,$out),$inout1);
	&pxor	($inout1,$inout1);
	&xorps	($inout3,$rndkey0);
	&movups	(&QWP(0x20,$out),$inout2);
	&pxor	($inout2,$inout2);
	&lea	($out,&DWP(0x30,$out));
	&movaps	($inout0,$inout3);
	&pxor	($inout3,$inout3);
	&sub	($len,0x40);
	&jmp	(&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_clear_tail_collected",16);
	&pxor	($inout1,$inout1);
	&pxor	($inout2,$inout2);
	&pxor	($inout3,$inout3);
	&pxor	($inout4,$inout4);
&set_label("cbc_dec_tail_collected");
	# $inout0 holds the final decrypted block, not yet written out.
	&and	($len,15);
	&jnz	(&label("cbc_dec_tail_partial"));
	&movups	(&QWP(0,$out),$inout0);
	&pxor	($rndkey0,$rndkey0);
	&jmp	(&label("cbc_ret"));

&set_label("cbc_dec_tail_partial",16);
	# Spill the last block to the scratch slot and copy part of it out
	# with rep movsb (byte count is 16-$len, as in the original code).
	&movaps	(&QWP(0,"esp"),$inout0);
	&pxor	($rndkey0,$rndkey0);
	&mov	("ecx",16);
	&mov	($inp,"esp");
	&sub	("ecx",$len);
	&data_word(0xA4F3F689);		# rep movsb
	# Scrub the spilled plaintext.  The store must use the zeroed
	# $rndkey0: writing $inout0 back would merely re-store the plaintext
	# and leave it on the stack after %esp is restored.
	&movdqa	(&QWP(0,"esp"),$rndkey0);

&set_label("cbc_ret");
	&mov	("esp",&DWP(16,"esp"));	# pull original %esp
	&mov	($key_,&wparam(4));
	&pxor	($inout0,$inout0);
	&pxor	($rndkey1,$rndkey1);
	&movups	(&QWP(0,$key_),$ivec);	# output IV
	&pxor	($ivec,$ivec);
&set_label("cbc_abort");
&function_end("${PREFIX}_cbc_encrypt");
2109
2110######################################################################
2111# Mechanical port from aesni-x86_64.pl.
2112#
2113# _aesni_set_encrypt_key is private interface,
2114# input:
2115#	"eax"	const unsigned char *userKey
2116#	$rounds	int bits
2117#	$key	AES_KEY *key
2118# output:
2119#	"eax"	return code
#	$rounds	rounds-1 (9, 11 or 13; the same value is stored in the key schedule)
2121
# Generator-side helper for the "_alt" key-schedule paths below.  It emits
#	$tmp = $acc;
#	$acc <<= 4 bytes; $tmp ^= $acc;		(twice)
#	$acc <<= 4 bytes; $acc ^= $tmp;
# leaving $acc = $acc ^ $acc<<32 ^ $acc<<64 ^ $acc<<96 and clobbering $tmp.
my $emit_dword_prefix_xor = sub {
	my ($acc,$tmp) = @_;

	&movdqa		($tmp,$acc);
	foreach my $pass (1..2) {
	    &pslldq	($acc,4);
	    &pxor	($tmp,$acc);
	}
	&pslldq		($acc,4);
	&pxor		($acc,$tmp);
};

# Private key-expansion worker.  Register interface:
#	in:	"eax" = userKey, $rounds = key length in bits, $key = schedule
#	out:	"eax" = 0 on success, -1 for a NULL pointer, -2 for an
#		unsupported bit length; on success $rounds holds 9/11/13,
#		which is also stored in the schedule.
# "ebp" and "ebx" are saved on entry and restored on every exit path.
# Each key size has two code paths: one built on aeskeygenassist and an
# "_alt" one built on pshufb/aesenclast plus constants from key_const.
# The "_alt" path is taken when the capability word masked with the AVX
# and XOP bits equals the AVX bit alone.
&function_begin_B("_aesni_set_encrypt_key");
	&push	("ebp");
	&push	("ebx");
	# Both pointers must be non-NULL.
	foreach my $ptr ("eax",$key) {
	    &test	($ptr,$ptr);
	    &jz		(&label("bad_pointer"));
	}

	# Position-independent address of key_const -> "ebx".
	&call	(&label("pic"));
&set_label("pic");
	&blindpop("ebx");
	&lea	("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));

	&picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
	&movups	("xmm0",&QWP(0,"eax"));	# pull first 128 bits of *userKey
	&xorps	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
	&mov	("ebp",&DWP(4,"ebp"));
	&lea	($key,&DWP(16,$key));	# $key now points one slot ahead
	&and	("ebp",1<<28|1<<11);	# AVX and XOP bits

	# Dispatch on the requested key length; 128 falls through.
	foreach my $variant ([256,"14rounds"],[192,"12rounds"]) {
	    my ($bits,$target) = @$variant;
	    &cmp	($rounds,$bits);
	    &je		(&label($target));
	}
	&cmp	($rounds,128);
	&jne	(&label("bad_keybits"));

&set_label("10rounds",16);
	&cmp		("ebp",1<<28);
	&je		(&label("10rounds_alt"));

	&mov		($rounds,9);
	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
	&aeskeygenassist("xmm1","xmm0",0x01);		# round 1
	&call		(&label("key_128_cold"));
	# rounds 2..10: one round constant and one key_128 call per round
	foreach my $rcon (0x02,0x04,0x08,0x10,0x20,0x40,0x80,0x1b,0x36) {
	    &aeskeygenassist("xmm1","xmm0",$rcon);
	    &call	(&label("key_128"));
	}
	&$movekey	(&QWP(0,$key),"xmm0");		# last round key
	&mov		(&DWP(80,$key),$rounds);

	&jmp	(&label("good_key"));

	# key_128 stores the previous round key and advances $key before
	# mixing; key_128_cold is the entry used when nothing is to be stored.
&set_label("key_128",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea		($key,&DWP(16,$key));
&set_label("key_128_cold");
	&shufps		("xmm4","xmm0",0b00010000);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm1","xmm1",0b11111111);	# critical path
	&xorps		("xmm0","xmm1");
	&ret();

&set_label("10rounds_alt",16);
	&movdqa		("xmm5",&QWP(0x00,"ebx"));	# pshufb mask, row 0
	&mov		($rounds,8);			# loop trip count
	&movdqa		("xmm4",&QWP(0x20,"ebx"));	# (1,1,1,1), doubled per round
	&movdqa		("xmm2","xmm0");
	&movdqu		(&QWP(-16,$key),"xmm0");

&set_label("loop_key128");
	&pshufb		("xmm0","xmm5");
	&aesenclast	("xmm0","xmm4");
	&pslld		("xmm4",1);
	&lea		($key,&DWP(16,$key));

	$emit_dword_prefix_xor->("xmm2","xmm3");

	&pxor		("xmm0","xmm2");
	&movdqu		(&QWP(-16,$key),"xmm0");
	&movdqa		("xmm2","xmm0");

	&dec		($rounds);
	&jnz		(&label("loop_key128"));

	# The two remaining round keys start from the 0x1b row of key_const.
	&movdqa		("xmm4",&QWP(0x30,"ebx"));

	&pshufb		("xmm0","xmm5");
	&aesenclast	("xmm0","xmm4");
	&pslld		("xmm4",1);

	$emit_dword_prefix_xor->("xmm2","xmm3");

	&pxor		("xmm0","xmm2");
	&movdqu		(&QWP(0,$key),"xmm0");

	&movdqa		("xmm2","xmm0");
	&pshufb		("xmm0","xmm5");
	&aesenclast	("xmm0","xmm4");

	$emit_dword_prefix_xor->("xmm2","xmm3");

	&pxor		("xmm0","xmm2");
	&movdqu		(&QWP(16,$key),"xmm0");

	&mov		($rounds,9);
	&mov		(&DWP(96,$key),$rounds);

	&jmp	(&label("good_key"));

&set_label("12rounds",16);
	&movq		("xmm2",&QWP(16,"eax"));	# remaining 1/3 of *userKey
	&cmp		("ebp",1<<28);
	&je		(&label("12rounds_alt"));

	&mov		($rounds,11);
	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
	# Round constants 0x01..0x80, alternating between the "a" and "b"
	# helpers; only the very first step uses the cold entry.
	{
	    my @step192 = ("key_192a_cold","key_192b",
			   "key_192a","key_192b",
			   "key_192a","key_192b",
			   "key_192a","key_192b");
	    foreach my $i (0..$#step192) {
		&aeskeygenassist("xmm1","xmm2",1<<$i);
		&call	(&label($step192[$i]));
	    }
	}
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov		(&DWP(48,$key),$rounds);

	&jmp	(&label("good_key"));

&set_label("key_192a",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea		($key,&DWP(16,$key));
&set_label("key_192a_cold",16);
	&movaps		("xmm5","xmm2");
&set_label("key_192b_warm");
	&shufps		("xmm4","xmm0",0b00010000);
	&movdqa		("xmm3","xmm2");
	&xorps		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&pslldq		("xmm3",4);
	&xorps		("xmm0","xmm4");
	&pshufd		("xmm1","xmm1",0b01010101);	# critical path
	&pxor		("xmm2","xmm3");
	&pxor		("xmm0","xmm1");
	&pshufd		("xmm3","xmm0",0b11111111);
	&pxor		("xmm2","xmm3");
	&ret();

	# key_192b writes 32 bytes assembled from xmm5, xmm0 and xmm2, then
	# joins the common mixing tail.
&set_label("key_192b",16);
	&movaps		("xmm3","xmm0");
	&shufps		("xmm5","xmm0",0b01000100);
	&$movekey	(&QWP(0,$key),"xmm5");
	&shufps		("xmm3","xmm2",0b01001110);
	&$movekey	(&QWP(16,$key),"xmm3");
	&lea		($key,&DWP(32,$key));
	&jmp		(&label("key_192b_warm"));

&set_label("12rounds_alt",16);
	&movdqa		("xmm5",&QWP(0x10,"ebx"));	# pshufb mask, row 1
	&movdqa		("xmm4",&QWP(0x20,"ebx"));
	&mov		($rounds,8);			# loop trip count
	&movdqu		(&QWP(-16,$key),"xmm0");

	# Each iteration produces 24 bytes of schedule.
&set_label("loop_key192");
	&movq		(&QWP(0,$key),"xmm2");
	&movdqa		("xmm1","xmm2");
	&pshufb		("xmm2","xmm5");
	&aesenclast	("xmm2","xmm4");
	&pslld		("xmm4",1);
	&lea		($key,&DWP(24,$key));

	$emit_dword_prefix_xor->("xmm0","xmm3");

	&pshufd		("xmm3","xmm0",0xff);
	&pxor		("xmm3","xmm1");
	&pslldq		("xmm1",4);
	&pxor		("xmm3","xmm1");

	&pxor		("xmm0","xmm2");
	&pxor		("xmm2","xmm3");
	&movdqu		(&QWP(-16,$key),"xmm0");

	&dec		($rounds);
	&jnz		(&label("loop_key192"));

	&mov	($rounds,11);
	&mov	(&DWP(32,$key),$rounds);

	&jmp	(&label("good_key"));

&set_label("14rounds",16);
	&movups		("xmm2",&QWP(16,"eax"));	# remaining half of *userKey
	&lea		($key,&DWP(16,$key));
	&cmp		("ebp",1<<28);
	&je		(&label("14rounds_alt"));

	&mov		($rounds,13);
	&$movekey	(&QWP(-32,$key),"xmm0");	# round 0
	&$movekey	(&QWP(-16,$key),"xmm2");	# round 1
	# rounds 2..13 come in a/b pairs that share one round constant
	foreach my $i (0..5) {
	    &aeskeygenassist("xmm1","xmm2",1<<$i);
	    &call	(&label($i ? "key_256a" : "key_256a_cold"));
	    &aeskeygenassist("xmm1","xmm0",1<<$i);
	    &call	(&label("key_256b"));
	}
	&aeskeygenassist("xmm1","xmm2",0x40);		# round 14
	&call		(&label("key_256a"));
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov		(&DWP(16,$key),$rounds);
	&xor		("eax","eax");

	&jmp	(&label("good_key"));

&set_label("key_256a",16);
	&$movekey	(&QWP(0,$key),"xmm2");
	&lea		($key,&DWP(16,$key));
&set_label("key_256a_cold");
	&shufps		("xmm4","xmm0",0b00010000);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm1","xmm1",0b11111111);	# critical path
	&xorps		("xmm0","xmm1");
	&ret();

&set_label("key_256b",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea		($key,&DWP(16,$key));

	&shufps		("xmm4","xmm2",0b00010000);
	&xorps		("xmm2","xmm4");
	&shufps		("xmm4","xmm2",0b10001100);
	&xorps		("xmm2","xmm4");
	&shufps		("xmm1","xmm1",0b10101010);	# critical path
	&xorps		("xmm2","xmm1");
	&ret();

&set_label("14rounds_alt",16);
	&movdqa		("xmm5",&QWP(0x00,"ebx"));
	&movdqa		("xmm4",&QWP(0x20,"ebx"));
	&mov		($rounds,7);			# loop trip count
	&movdqu		(&QWP(-32,$key),"xmm0");
	&movdqa		("xmm1","xmm2");
	&movdqu		(&QWP(-16,$key),"xmm2");

	# Each pass derives one key from xmm0 and, unless the counter just
	# hit zero, a second one from xmm1.
&set_label("loop_key256");
	&pshufb		("xmm2","xmm5");
	&aesenclast	("xmm2","xmm4");

	$emit_dword_prefix_xor->("xmm0","xmm3");
	&pslld		("xmm4",1);

	&pxor		("xmm0","xmm2");
	&movdqu		(&QWP(0,$key),"xmm0");

	&dec		($rounds);
	&jz		(&label("done_key256"));

	&pshufd		("xmm2","xmm0",0xff);
	&pxor		("xmm3","xmm3");
	&aesenclast	("xmm2","xmm3");

	$emit_dword_prefix_xor->("xmm1","xmm3");

	&pxor		("xmm2","xmm1");
	&movdqu		(&QWP(16,$key),"xmm2");
	&lea		($key,&DWP(32,$key));
	&movdqa		("xmm1","xmm2");
	&jmp		(&label("loop_key256"));

&set_label("done_key256");
	&mov		($rounds,13);
	&mov		(&DWP(16,$key),$rounds);

	# Common success exit: wipe xmm0..xmm5, return 0.
&set_label("good_key");
	&pxor	("xmm$_","xmm$_") foreach (0..5);
	&xor	("eax","eax");
	&pop	("ebx");
	&pop	("ebp");
	&ret	();

&set_label("bad_pointer",4);
	&mov	("eax",-1);
	&pop	("ebx");
	&pop	("ebp");
	&ret	();
&set_label("bad_keybits",4);
	&pxor	("xmm0","xmm0");	# first 16 key bytes were already loaded
	&mov	("eax",-2);
	&pop	("ebx");
	&pop	("ebp");
	&ret	();
&function_end_B("_aesni_set_encrypt_key");
2489
2490# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
2491#                              AES_KEY *key)
# Public wrapper: moves the three stack arguments into the registers the
# private worker reads (userKey -> "eax", bits -> $rounds, key -> $key),
# calls it and returns whatever it left in "eax".
&function_begin_B("${PREFIX}_set_encrypt_key");
	&record_function_hit(3);

	{
	    my @worker_reg = ("eax",$rounds,$key);
	    foreach my $argno (0..$#worker_reg) {
		&mov	($worker_reg[$argno],&wparam($argno));
	    }
	}
	&call	("_aesni_set_encrypt_key");
	&ret	();
&function_end_B("${PREFIX}_set_encrypt_key");
2501
2502# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
2503#                              AES_KEY *key)
# Public wrapper: builds the encryption schedule with the private worker,
# then converts it in place.  The first and last round keys are swapped
# as they are; every other pair is swapped and passed through aesimc, and
# the middle key is passed through aesimc where it stands.  A non-zero
# worker result is returned unchanged and the schedule is not converted.
&function_begin_B("${PREFIX}_set_decrypt_key");
	{
	    my @worker_reg = ("eax",$rounds,$key);
	    foreach my $argno (0..$#worker_reg) {
		&mov	($worker_reg[$argno],&wparam($argno));
	    }
	}
	&call	("_aesni_set_encrypt_key");
	&mov	($key,&wparam(2));	# reload schedule base
	&shl	($rounds,4);	# rounds-1 after _aesni_set_encrypt_key
	&test	("eax","eax");
	&jnz	(&label("dec_key_ret"));
	&lea	("eax",&DWP(16,$key,$rounds));	# end of key schedule

	{
	    # $head walks up from the first round key, $tail walks down
	    # from the last one; "eax" is free because the result was tested.
	    my ($head,$tail) = ($key,"eax");
	    my ($blk_h,$blk_t) = ("xmm0","xmm1");

	    # outermost pair: just swap
	    &$movekey	($blk_h,&QWP(0,$head));
	    &$movekey	($blk_t,&QWP(0,$tail));
	    &$movekey	(&QWP(0,$tail),$blk_h);
	    &$movekey	(&QWP(0,$head),$blk_t);
	    &lea	($head,&DWP(16,$head));
	    &lea	($tail,&DWP(-16,$tail));

&set_label("dec_key_inverse");
	    # inner pairs: swap and inverse
	    &$movekey	($blk_h,&QWP(0,$head));
	    &$movekey	($blk_t,&QWP(0,$tail));
	    &aesimc	($blk_h,$blk_h);
	    &aesimc	($blk_t,$blk_t);
	    &lea	($head,&DWP(16,$head));
	    &lea	($tail,&DWP(-16,$tail));
	    # pointers already moved, hence the +16/-16 displacements
	    &$movekey	(&QWP(16,$tail),$blk_h);
	    &$movekey	(&QWP(-16,$head),$blk_t);
	    &cmp	($tail,$head);
	    &ja		(&label("dec_key_inverse"));

	    # the pointers have met: inverse middle
	    &$movekey	($blk_h,&QWP(0,$head));
	    &aesimc	($blk_h,$blk_h);
	    &$movekey	(&QWP(0,$head),$blk_h);

	    # wipe the key material held in registers
	    &pxor	($blk_h,$blk_h);
	    &pxor	($blk_t,$blk_t);
	}
	&xor	("eax","eax");		# return success
&set_label("dec_key_ret");
	&ret	();
&function_end_B("${PREFIX}_set_decrypt_key");
2544
# Constant pool addressed through "ebx" by the key-setup code, aligned
# to 64 bytes.  Four rows of four identical dwords each:
#	+0x00 and +0x10	byte-shuffle masks
#	+0x20		1, the starting value that is doubled per round
#	+0x30		0x1b
# followed by the identification string.
&set_label("key_const",64);
foreach my $dword (0x0c0f0e0d, 0x04070605, 1, 0x1b) {
	&data_word(($dword) x 4);
}
&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
2551
# Flush the generated assembly to STDOUT.
&asm_finish();

# Buffered write errors (full disk, closed pipe) only surface when the
# handle is closed, so a failing close must abort the build instead of
# silently leaving a truncated output file behind.
close STDOUT or die "error closing STDOUT: $!";
2555