1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# This module implements support for Intel AES-NI extension. In
11# OpenSSL context it's used with Intel engine, but can also be used as
12# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
13# details].
14#
15# Performance.
16#
17# To start with see corresponding paragraph in aesni-x86_64.pl...
18# Instead of filling table similar to one found there I've chosen to
19# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
20# The simplified table below represents 32-bit performance relative
21# to 64-bit one in every given point. Ratios vary for different
22# encryption modes, therefore interval values.
23#
24#	16-byte     64-byte     256-byte    1-KB        8-KB
25#	53-67%      67-84%      91-94%      95-98%      97-99.5%
26#
27# Lower ratios for smaller block sizes are perfectly understandable,
28# because function call overhead is higher in 32-bit mode. Largest
29# 8-KB block performance is virtually same: 32-bit code is less than
30# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
31
32# January 2011
33#
34# See aesni-x86_64.pl for details. Unlike x86_64 version this module
35# interleaves at most 6 aes[enc|dec] instructions, because there are
36# not enough registers for 8x interleave [which should be optimal for
37# Sandy Bridge]. Actually, performance results for 6x interleave
38# factor presented in aesni-x86_64.pl (except for CTR) are for this
39# module.
40
41# April 2011
42#
43# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
44# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
45
46######################################################################
47# Current large-block performance in cycles per byte processed with
48# 128-bit key (less is better).
49#
50#		CBC en-/decrypt	CTR	XTS	ECB
51# Westmere	3.77/1.37	1.37	1.52	1.27
52# * Bridge	5.07/0.98	0.99	1.09	0.91
53# Haswell	4.44/0.80	0.97	1.03	0.72
54# Silvermont	5.77/3.56	3.67	4.03	3.46
55# Bulldozer	5.80/0.98	1.05	1.24	0.93
56
$PREFIX="aesni";	# "aesni" builds the AES-NI module proper; if
			# $PREFIX is set to "AES", the script generates
			# drop-in replacement for
			# crypto/aes/asm/aes-586.pl:-)
$inline=1;		# expand single-block rounds in place instead of
			# calling _aesni_[en|de]crypt1

# Make x86asm.pl reachable from the directory this script was started
# from as well as from ../../perlasm relative to it. $dir stays
# undefined (interpolating as "") when $0 carries no directory part.
$dir=($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : undef;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],$0);

&external_label("OPENSSL_ia32cap_P");
&static_label("key_const");

# Round keys are fetched with movups whichever $PREFIX is in effect.
$movekey=\&movups;

# General-purpose register assignments.
($len,$rounds,$key)=("eax","ecx","edx");
($inp,$out)=("esi","edi");
($rounds_,$key_)=("ebx","ebp");	# backup copies for $rounds and $key

# XMM register assignments: two round-key slots followed by six data
# slots. $in1, $in0 and $ivec share registers with the last three data
# slots.
($rndkey0,$rndkey1)=("xmm0","xmm1");
($inout0,$inout1,$inout2,$inout3,$inout4,$inout5)=map { "xmm$_" } (2..7);
($in1,$in0,$ivec)=($inout3,$inout4,$inout5);
90
# AESNI extension
# Emit "aeskeygenassist $dst,$src,$imm" as raw bytes
# (66 0F 3A DF, ModR/M, imm8). Both operands have to be xmm0..xmm7;
# for any other operand pair nothing is emitted.
sub aeskeygenassist
{ my($dst,$src,$imm)=@_;
  my($reg,$rm)=("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/);

    return if (!defined($reg));
    &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($reg<<3)|$rm,$imm);
}
# Shared encoder for the two-operand AES-NI instructions: emits
# 66 0F 38 $opcodelet followed by a register-register ModR/M byte.
# Both operands have to be xmm0..xmm7; for any other operand pair
# nothing is emitted.
sub aescommon
{ my($opcodelet,$dst,$src)=@_;
  my($reg,$rm)=("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/);

    return if (!defined($reg));
    &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($reg<<3)|$rm);
}
# Mnemonic front-ends for aescommon: each one supplies the opcodelet of
# its instruction and forwards the destination/source register pair.
sub aesimc
{ my($dst,$src)=@_;	aescommon(0xdb,$dst,$src);	}
sub aesenc
{ my($dst,$src)=@_;	aescommon(0xdc,$dst,$src);	}
sub aesenclast
{ my($dst,$src)=@_;	aescommon(0xdd,$dst,$src);	}
sub aesdec
{ my($dst,$src)=@_;	aescommon(0xde,$dst,$src);	}
sub aesdeclast
{ my($dst,$src)=@_;	aescommon(0xdf,$dst,$src);	}
107
# Inline version of internal aesni_[en|de]crypt1: expands the
# single-block round loop at the point of the call.
#
#	$p	"enc" or "dec", selects aes${p} and aes${p}last;
#	$inout	register holding the block, $inout0 when omitted;
#	$ivec	optional register; when given it is xor-ed with round
#		key 0 and the result is xor-ed into $inout, otherwise
#		round key 0 is xor-ed into $inout directly.
#
# The emitted code reads the key schedule through $key and counts
# $rounds down to zero, so both registers are modified. Every expansion
# gets a loop label of its own, numbered by the private counter $sn.
{ my $sn;
sub aesni_inline_generate1
{ my ($p,$inout,$ivec)=@_;
  my $round=\&{"aes${p}"};
  my $final=\&{"aes${p}last"};
  my $loop;

    $inout=$inout0 if (!defined($inout));
    $loop="${p}1_loop_".(++$sn);

    &$movekey		($rndkey0,&QWP(0,$key));
    &$movekey		($rndkey1,&QWP(16,$key));
    if (defined($ivec))
    {	&xorps		($ivec,$rndkey0);
	&lea		($key,&DWP(32,$key));
	&xorps		($inout,$ivec);
    }
    else
    {	&lea		($key,&DWP(32,$key));
	&xorps		($inout,$rndkey0);
    }
    &set_label($loop);
	&$round		($inout,$rndkey1);
	&dec		($rounds);
	&$movekey	($rndkey1,&QWP(0,$key));
	&lea		($key,&DWP(16,$key));
    &jnz		(&label($loop));
    &$final		($inout,$rndkey1);
}}
128
# Emit the out-of-line routine _aesni_${p}rypt1, which pushes one block
# through a fully unrolled round sequence.
#
#	$p	"enc" or "dec", selects aes${p} and aes${p}last;
#	$inout	register holding the block, $inout0 when omitted.
#
# The emitted code compares $rounds with 11: below 11 it jumps to the
# "${p}128" entry, on 11 to the "${p}192" entry, otherwise it falls
# through the longest path. $key is advanced along the way, so callers
# have to reload it afterwards.
sub aesni_generate1	# fully unrolled loop
{ my ($p,$inout)=@_;
  my $round=\&{"aes${p}"};
  my $final=\&{"aes${p}last"};
  my @rk=($rndkey1,$rndkey0);	# round-key slots alternate, $rndkey1 first

    $inout=$inout0 if (!defined($inout));

    &function_begin_B("_aesni_${p}rypt1");
	&movups		($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(0x10,$key));
	&xorps		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x20,$key));
	&lea		($key,&DWP(0x30,$key));
	&cmp		($rounds,11);
	&jb		(&label("${p}128"));
	&lea		($key,&DWP(0x20,$key));
	&je		(&label("${p}192"));
	&lea		($key,&DWP(0x20,$key));
	# each step applies one round and then refills the slot it has
	# just consumed
	foreach my $i (0..1)
	{   &$round	($inout,$rk[$i]);
	    &$movekey	($rk[$i],&QWP(-0x40+$i*0x10,$key));
	}
    &set_label("${p}192");
	foreach my $i (0..1)
	{   &$round	($inout,$rk[$i]);
	    &$movekey	($rk[$i],&QWP(-0x20+$i*0x10,$key));
	}
    &set_label("${p}128");
	foreach my $i (0..7)
	{   &$round	($inout,$rk[$i&1]);
	    &$movekey	($rk[$i&1],&QWP($i*0x10,$key));
	}
	&$round		($inout,$rndkey1);
    &$final		($inout,$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt1");
}
174
# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
#
# Single-block entry point: loads 16 bytes from inp, runs them through
# the one-block round sequence (expanded in place when $inline is set,
# otherwise via a call to _aesni_encrypt1, which is then generated
# first) and stores the result to out. The round-key registers and the
# data register are zeroed before returning.
&aesni_generate1("enc") unless ($inline);
&function_begin_B("${PREFIX}_encrypt");
	&mov	("eax",&wparam(0));		# inp
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,"eax"));
	&mov	($rounds,&DWP(240,$key));	# rounds field at key+240
	&mov	("eax",&wparam(1));		# out
	$inline	? &aesni_inline_generate1("enc")
		: &call("_aesni_encrypt1");
	foreach my $slot ($rndkey0,$rndkey1)	# clear register bank
	{   &pxor	($slot,$slot);	}
	&movups	(&QWP(0,"eax"),$inout0);
	&pxor	($inout0,$inout0);
	&ret	();
&function_end_B("${PREFIX}_encrypt");
193
# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Single-block entry point: loads 16 bytes from inp, runs them through
# the one-block round sequence (expanded in place when $inline is set,
# otherwise via a call to _aesni_decrypt1, which is then generated
# first) and stores the result to out. The round-key registers and the
# data register are zeroed before returning.
&aesni_generate1("dec") unless ($inline);
&function_begin_B("${PREFIX}_decrypt");
	&mov	("eax",&wparam(0));		# inp
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,"eax"));
	&mov	($rounds,&DWP(240,$key));	# rounds field at key+240
	&mov	("eax",&wparam(1));		# out
	$inline	? &aesni_inline_generate1("dec")
		: &call("_aesni_decrypt1");
	foreach my $slot ($rndkey0,$rndkey1)	# clear register bank
	{   &pxor	($slot,$slot);	}
	&movups	(&QWP(0,"eax"),$inout0);
	&pxor	($inout0,$inout0);
	&ret	();
&function_end_B("${PREFIX}_decrypt");
212
# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why were 3x subroutines originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it originally made no sense to implement 2x subroutine.
# But times change and it became appropriate to spend extra 192 bytes
# on 2x subroutine on Atom Silvermont account. For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals to corresponding instructions latency. 8x is optimal for
# * Bridge, but it's unfeasible to accommodate such implementation
# in XMM registers addressable in 32-bit mode and therefore maximum
# of 6x is used instead...
227
# Emit _aesni_${p}rypt2, which runs the two blocks held in $inout0 and
# $inout1 through the rounds side by side ($p is "enc" or "dec").
#
# The emitted prologue turns $rounds into a negative byte offset and
# $key into a pointer past the schedule, so that the loop can index
# round keys with ($key,$rounds) and terminate when the offset reaches
# zero. Both registers are therefore modified.
sub aesni_generate2
{ my $p=shift;
  my $round=\&{"aes${p}"};
  my $final=\&{"aes${p}last"};
  my @blocks=($inout0,$inout1);

    &function_begin_B("_aesni_${p}rypt2");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label("${p}2_loop");
	foreach my $blk (@blocks)	{ &$round($blk,$rndkey1); }
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	foreach my $blk (@blocks)	{ &$round($blk,$rndkey0); }
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("${p}2_loop"));
    foreach my $blk (@blocks)	{ &$round($blk,$rndkey1); }
    foreach my $blk (@blocks)	{ &$final($blk,$rndkey0); }
    &ret();
    &function_end_B("_aesni_${p}rypt2");
}
258
# Emit _aesni_${p}rypt3, which runs the three blocks held in
# $inout0..$inout2 through the rounds side by side ($p is "enc" or
# "dec").
#
# The emitted prologue turns $rounds into a negative byte offset and
# $key into a pointer past the schedule, so that the loop can index
# round keys with ($key,$rounds) and terminate when the offset reaches
# zero. Both registers are therefore modified.
sub aesni_generate3
{ my $p=shift;
  my $round=\&{"aes${p}"};
  my $final=\&{"aes${p}last"};
  my @blocks=($inout0,$inout1,$inout2);

    &function_begin_B("_aesni_${p}rypt3");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label("${p}3_loop");
	foreach my $blk (@blocks)	{ &$round($blk,$rndkey1); }
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	foreach my $blk (@blocks)	{ &$round($blk,$rndkey0); }
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("${p}3_loop"));
    foreach my $blk (@blocks)	{ &$round($blk,$rndkey1); }
    foreach my $blk (@blocks)	{ &$final($blk,$rndkey0); }
    &ret();
    &function_end_B("_aesni_${p}rypt3");
}
294
# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
#
# Emits _aesni_${p}rypt4, which runs the four blocks held in
# $inout0..$inout3 through the rounds side by side ($p is "enc" or
# "dec"). As in the other interleaved routines, $rounds ends up as a
# negative byte offset counting up to zero and $key as a pointer past
# the schedule, so both registers are modified.
sub aesni_generate4
{ my $p=shift;
  my $round=\&{"aes${p}"};
  my $final=\&{"aes${p}last"};
  my @blocks=($inout0,$inout1,$inout2,$inout3);

    &function_begin_B("_aesni_${p}rypt4");
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(16,$key));
	&shl		($rounds,4);
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&pxor		($inout3,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	# four raw bytes 0F 1F 40 00 (a multi-byte nop encoding),
	# presumably padding that positions the loop below
	&data_byte	(0x0f,0x1f,0x40,0x00);
	&add		($rounds,16);

    &set_label("${p}4_loop");
	foreach my $blk (@blocks)	{ &$round($blk,$rndkey1); }
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	foreach my $blk (@blocks)	{ &$round($blk,$rndkey0); }
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label("${p}4_loop"));

    foreach my $blk (@blocks)	{ &$round($blk,$rndkey1); }
    foreach my $blk (@blocks)	{ &$final($blk,$rndkey0); }
    &ret();
    &function_end_B("_aesni_${p}rypt4");
}
341
# Emit _aesni_${p}rypt6, which runs the six blocks held in
# $inout0..$inout5 through the rounds side by side ($p is "enc" or
# "dec").
#
# The prologue interleaves the round-key-0 whitening with the first
# round and then jumps into the middle of the loop body. In addition
# "_aesni_${p}rypt6_enter" is declared as a static label, which lets
# code that has performed the whitening and the first round on its own
# enter the routine right past them (the CTR code further down calls
# "_aesni_encrypt6_enter" that way). $rounds ends up as a negative byte
# offset counting up to zero and $key as a pointer past the schedule,
# so both registers are modified.
sub aesni_generate6
{ my $p=shift;
  my $round=\&{"aes${p}"};
  my $final=\&{"aes${p}last"};
  my @blocks=($inout0,$inout1,$inout2,$inout3,$inout4,$inout5);

    &function_begin_B("_aesni_${p}rypt6");
    &static_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);	# pxor does better here
	&pxor		($inout2,$rndkey0);
	&$round		($inout0,$rndkey1);
	&pxor		($inout3,$rndkey0);
	&pxor		($inout4,$rndkey0);
	&$round		($inout1,$rndkey1);
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&$round		($inout2,$rndkey1);
	&pxor		($inout5,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key,$rounds));
	&add		($rounds,16);
	&jmp		(&label("_aesni_${p}rypt6_inner"));

    &set_label("${p}6_loop",16);
	foreach my $blk (@blocks[0..2])	{ &$round($blk,$rndkey1); }
    &set_label("_aesni_${p}rypt6_inner");
	foreach my $blk (@blocks[3..5])	{ &$round($blk,$rndkey1); }
    &set_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	foreach my $blk (@blocks)	{ &$round($blk,$rndkey0); }
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label("${p}6_loop"));

    foreach my $blk (@blocks)	{ &$round($blk,$rndkey1); }
    foreach my $blk (@blocks)	{ &$final($blk,$rndkey0); }
    &ret();
    &function_end_B("_aesni_${p}rypt6");
}
# Instantiate the interleaved helpers in order of growing interleave
# factor. The decrypt flavour is always emitted, the encrypt flavour
# only when building the "aesni" module proper.
foreach my $generate (\&aesni_generate2,\&aesni_generate3,
		      \&aesni_generate4,\&aesni_generate6)
{   &$generate("enc") if ($PREFIX eq "aesni");
    &$generate("dec");
}
408
409if ($PREFIX eq "aesni") {
######################################################################
# void aesni_ecb_encrypt (const void *in, void *out,
#                         size_t length, const AES_KEY *key,
#                         int enc);
#
# length is rounded down to a multiple of 16; nothing is done if that
# leaves zero. Non-zero enc selects encryption, zero selects
# decryption. Input is consumed six blocks at a time through
# _aesni_[en|de]crypt6, with the stores of one batch interleaved with
# the loads of the next. What remains (1..5 blocks) is dispatched to
# the routine of matching width; five blocks go through the 6x routine
# with the sixth register zeroed. All eight XMM registers are cleared
# before returning.
&function_begin("aesni_ecb_encrypt");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));	# enc flag, register is reused below
	&and	($len,-16);		# whole blocks only
	&jz	(&label("ecb_ret"));
	&mov	($rounds,&DWP(240,$key));
	&test	($rounds_,$rounds_);
	&jz	(&label("ecb_decrypt"));

	# the helper routines modify $key and $rounds, hence the backups
	&mov	($key_,$key);		# backup $key
	&mov	($rounds_,$rounds);	# backup $rounds
	&cmp	($len,0x60);
	&jb	(&label("ecb_enc_tail"));

	# prime the pipeline with the first six blocks
	&movdqu	($inout0,&QWP(0,$inp));
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
	&sub	($len,0x60);
	&jmp	(&label("ecb_enc_loop6_enter"));

# store the previous batch while loading the next one
&set_label("ecb_enc_loop6",16);
	&movups	(&QWP(0,$out),$inout0);
	&movdqu	($inout0,&QWP(0,$inp));
	&movups	(&QWP(0x10,$out),$inout1);
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movups	(&QWP(0x20,$out),$inout2);
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movups	(&QWP(0x30,$out),$inout3);
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movups	(&QWP(0x40,$out),$inout4);
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
&set_label("ecb_enc_loop6_enter");

	&call	("_aesni_encrypt6");

	&mov	($key,$key_);		# restore $key
	&mov	($rounds,$rounds_);	# restore $rounds
	&sub	($len,0x60);
	&jnc	(&label("ecb_enc_loop6"));	# another full batch is left

	# flush the last full batch
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&add	($len,0x60);		# undo the last subtraction
	&jz	(&label("ecb_ret"));

# 1..5 blocks are left; load as many as needed and pick the routine
&set_label("ecb_enc_tail");
	&movups	($inout0,&QWP(0,$inp));
	&cmp	($len,0x20);
	&jb	(&label("ecb_enc_one"));
	&movups	($inout1,&QWP(0x10,$inp));
	&je	(&label("ecb_enc_two"));
	&movups	($inout2,&QWP(0x20,$inp));
	&cmp	($len,0x40);
	&jb	(&label("ecb_enc_three"));
	&movups	($inout3,&QWP(0x30,$inp));
	&je	(&label("ecb_enc_four"));
	&movups	($inout4,&QWP(0x40,$inp));
	&xorps	($inout5,$inout5);	# sixth lane is unused, zero it
	&call	("_aesni_encrypt6");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_enc_one",16);
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&movups	(&QWP(0,$out),$inout0);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_enc_two",16);
	&call	("_aesni_encrypt2");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_enc_three",16);
	&call	("_aesni_encrypt3");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_enc_four",16);
	&call	("_aesni_encrypt4");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&jmp	(&label("ecb_ret"));
######################################################################
# decryption mirrors the encryption path above
&set_label("ecb_decrypt",16);
	&mov	($key_,$key);		# backup $key
	&mov	($rounds_,$rounds);	# backup $rounds
	&cmp	($len,0x60);
	&jb	(&label("ecb_dec_tail"));

	# prime the pipeline with the first six blocks
	&movdqu	($inout0,&QWP(0,$inp));
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
	&sub	($len,0x60);
	&jmp	(&label("ecb_dec_loop6_enter"));

# store the previous batch while loading the next one
&set_label("ecb_dec_loop6",16);
	&movups	(&QWP(0,$out),$inout0);
	&movdqu	($inout0,&QWP(0,$inp));
	&movups	(&QWP(0x10,$out),$inout1);
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movups	(&QWP(0x20,$out),$inout2);
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movups	(&QWP(0x30,$out),$inout3);
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movups	(&QWP(0x40,$out),$inout4);
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
&set_label("ecb_dec_loop6_enter");

	&call	("_aesni_decrypt6");

	&mov	($key,$key_);		# restore $key
	&mov	($rounds,$rounds_);	# restore $rounds
	&sub	($len,0x60);
	&jnc	(&label("ecb_dec_loop6"));	# another full batch is left

	# flush the last full batch
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&add	($len,0x60);		# undo the last subtraction
	&jz	(&label("ecb_ret"));

# 1..5 blocks are left; load as many as needed and pick the routine
&set_label("ecb_dec_tail");
	&movups	($inout0,&QWP(0,$inp));
	&cmp	($len,0x20);
	&jb	(&label("ecb_dec_one"));
	&movups	($inout1,&QWP(0x10,$inp));
	&je	(&label("ecb_dec_two"));
	&movups	($inout2,&QWP(0x20,$inp));
	&cmp	($len,0x40);
	&jb	(&label("ecb_dec_three"));
	&movups	($inout3,&QWP(0x30,$inp));
	&je	(&label("ecb_dec_four"));
	&movups	($inout4,&QWP(0x40,$inp));
	&xorps	($inout5,$inout5);	# sixth lane is unused, zero it
	&call	("_aesni_decrypt6");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_one",16);
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&movups	(&QWP(0,$out),$inout0);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_two",16);
	&call	("_aesni_decrypt2");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_three",16);
	&call	("_aesni_decrypt3");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_four",16);
	&call	("_aesni_decrypt4");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	# falls through to the common exit

&set_label("ecb_ret");
	&pxor	("xmm0","xmm0");		# clear register bank
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&pxor	("xmm3","xmm3");
	&pxor	("xmm4","xmm4");
	&pxor	("xmm5","xmm5");
	&pxor	("xmm6","xmm6");
	&pxor	("xmm7","xmm7");
&function_end("aesni_ecb_encrypt");
634
635######################################################################
636# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
637#                         size_t blocks, const AES_KEY *key,
638#                         const char *ivec,char *cmac);
639#
640# Handles only complete blocks, operates on 64-bit counter and
641# does not update *ivec! Nor does it finalize CMAC value
642# (see engine/eng_aesni.c for details)
643#
644{ my $cmac=$inout1;
# Encrypt flavour: for every 16-byte block the counter block and the
# running CMAC are pushed through the rounds side by side, the CMAC
# having absorbed the input block first. $cmac is the lexical alias
# for $inout1 set up at the top of the enclosing block.
#
# Arguments by position: 0 in, 1 out, 2 block count, 3 key schedule,
# 4 ivec, 5 cmac. *ivec is only read; *cmac is read on entry and
# written back on exit.
#
# Frame, once %esp has been aligned to 16 bytes:
#	0	pshufb mask reversing all 16 bytes
#	16	counter increment, dwords 1,0,0,0
#	48	saved %esp
#
# NOTE(review): there is no guard for a zero block count, the loop
# counter is decremented before it is tested; presumably callers never
# pass 0 - confirm.
&function_begin("aesni_ccm64_encrypt_blocks");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));		# ivec pointer, for now
	&mov	($rounds,&wparam(5));		# cmac pointer, for now
	&mov	($key_,"esp");
	&sub	("esp",60);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(48,"esp"),$key_);		# save original %esp

	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
	&mov	($rounds,&DWP(240,$key));

	# compose byte-swap control mask for pshufb on stack
	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
	&mov	(&DWP(4,"esp"),0x08090a0b);
	&mov	(&DWP(8,"esp"),0x04050607);
	&mov	(&DWP(12,"esp"),0x00010203);

	# compose counter increment vector on stack
	&mov	($rounds_,1);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds_);
	&mov	(&DWP(20,"esp"),$key_);
	&mov	(&DWP(24,"esp"),$key_);
	&mov	(&DWP(28,"esp"),$key_);

	# $key_ keeps the start of the schedule, $key points past it and
	# $rounds_ holds the negative loop offset reloaded per block
	&shl	($rounds,4);
	&mov	($rounds_,16);
	&lea	($key_,&DWP(0,$key));
	&movdqa	($inout3,&QWP(0,"esp"));	# byte-swap mask
	&movdqa	($inout0,$ivec);		# first counter block, as given
	&lea	($key,&DWP(32,$key,$rounds));
	&sub	($rounds_,$rounds);
	&pshufb	($ivec,$inout3);		# byte-reversed copy for paddq

&set_label("ccm64_enc_outer");
	&$movekey	($rndkey0,&QWP(0,$key_));
	&mov		($rounds,$rounds_);
	&movups		($in0,&QWP(0,$inp));

	&xorps		($inout0,$rndkey0);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps		($rndkey0,$in0);
	&xorps		($cmac,$rndkey0);		# cmac^=inp
	&$movekey	($rndkey0,&QWP(32,$key_));

&set_label("ccm64_enc2_loop");
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&aesenc		($inout0,$rndkey0);
	&aesenc		($cmac,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("ccm64_enc2_loop"));
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&paddq		($ivec,&QWP(16,"esp"));	# bump counter
	&dec		($len);			# flags are consumed by jnz below
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);

	&lea	($inp,&DWP(16,$inp));
	&xorps	($in0,$inout0);			# inp^=E(ivec)
	&movdqa	($inout0,$ivec);
	&movups	(&QWP(0,$out),$in0);		# save output
	&pshufb	($inout0,$inout3);		# next counter block
	&lea	($out,&DWP(16,$out));
	&jnz	(&label("ccm64_enc_outer"));

	&mov	("esp",&DWP(48,"esp"));		# restore original %esp
	&mov	($out,&wparam(5));
	&movups	(&QWP(0,$out),$cmac);		# write cmac back

	&pxor	("xmm0","xmm0");		# clear register bank
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&pxor	("xmm3","xmm3");
	&pxor	("xmm4","xmm4");
	&pxor	("xmm5","xmm5");
	&pxor	("xmm6","xmm6");
	&pxor	("xmm7","xmm7");
&function_end("aesni_ccm64_encrypt_blocks");
732
# Decrypt flavour: the first counter block is encrypted on its own;
# after that every pass of the outer loop recovers one output block,
# folds it into the CMAC and encrypts the next counter block and the
# CMAC side by side. Once the last block is out, the CMAC gets its
# final single-block encryption. $cmac is the lexical alias for
# $inout1 set up at the top of the enclosing block.
#
# Arguments by position: 0 in, 1 out, 2 block count, 3 key schedule,
# 4 ivec, 5 cmac. *ivec is only read; *cmac is read on entry and
# written back on exit.
#
# Frame, once %esp has been aligned to 16 bytes:
#	0	pshufb mask reversing all 16 bytes
#	16	counter increment, dwords 1,0,0,0
#	48	saved %esp
#
# NOTE(review): there is no guard for a zero block count, the counter
# is decremented before it is tested; presumably callers never pass
# 0 - confirm.
&function_begin("aesni_ccm64_decrypt_blocks");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));		# ivec pointer, for now
	&mov	($rounds,&wparam(5));		# cmac pointer, for now
	&mov	($key_,"esp");
	&sub	("esp",60);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(48,"esp"),$key_);		# save original %esp

	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
	&mov	($rounds,&DWP(240,$key));

	# compose byte-swap control mask for pshufb on stack
	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
	&mov	(&DWP(4,"esp"),0x08090a0b);
	&mov	(&DWP(8,"esp"),0x04050607);
	&mov	(&DWP(12,"esp"),0x00010203);

	# compose counter increment vector on stack
	&mov	($rounds_,1);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds_);
	&mov	(&DWP(20,"esp"),$key_);
	&mov	(&DWP(24,"esp"),$key_);
	&mov	(&DWP(28,"esp"),$key_);

	&movdqa	($inout3,&QWP(0,"esp"));	# bswap mask
	&movdqa	($inout0,$ivec);		# first counter block, as given

	&mov	($key_,$key);			# backup $key
	&mov	($rounds_,$rounds);		# backup $rounds

	&pshufb	($ivec,$inout3);		# byte-reversed copy for paddq
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	# $key points past the schedule from here on and $rounds_ holds
	# the negative loop offset reloaded per block
	&shl	($rounds_,4);
	&mov	($rounds,16);
	&movups	($in0,&QWP(0,$inp));		# load inp
	&paddq	($ivec,&QWP(16,"esp"));		# bump counter
	&lea	($inp,&DWP(16,$inp));
	&sub	($rounds,$rounds_);
	&lea	($key,&DWP(32,$key_,$rounds_));
	&mov	($rounds_,$rounds);
	&jmp	(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_outer",16);
	&xorps	($in0,$inout0);			# inp ^= E(ivec)
	&movdqa	($inout0,$ivec);
	&movups	(&QWP(0,$out),$in0);		# save output
	&lea	($out,&DWP(16,$out));
	&pshufb	($inout0,$inout3);		# next counter block

	&sub	($len,1);
	&jz	(&label("ccm64_dec_break"));

	&$movekey	($rndkey0,&QWP(0,$key_));
	&mov		($rounds,$rounds_);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps		($in0,$rndkey0);
	&xorps		($inout0,$rndkey0);
	&xorps		($cmac,$in0);		# cmac^=out
	&$movekey	($rndkey0,&QWP(32,$key_));

&set_label("ccm64_dec2_loop");
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&aesenc		($inout0,$rndkey0);
	&aesenc		($cmac,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("ccm64_dec2_loop"));
	&movups		($in0,&QWP(0,$inp));	# load inp
	&paddq		($ivec,&QWP(16,"esp"));	# bump counter
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);
	&lea		($inp,&DWP(16,$inp));
	&jmp	(&label("ccm64_dec_outer"));

# last block is out: fold it into the CMAC and encrypt the CMAC once more
&set_label("ccm64_dec_break",16);
	&mov	($rounds,&DWP(240,$key_));
	&mov	($key,$key_);
	if ($inline)
	{   &aesni_inline_generate1("enc",$cmac,$in0);	}
	else
	{   # _aesni_encrypt1 operates on $inout0 only and knows nothing
	    # about $in0, so fold the last output block in by hand and
	    # shuttle the CMAC through $inout0, whose counter block is
	    # no longer needed at this point
	    &xorps	($cmac,$in0);
	    &movdqa	($inout0,$cmac);
	    &call	("_aesni_encrypt1");
	    &movdqa	($cmac,$inout0);
	}

	&mov	("esp",&DWP(48,"esp"));		# restore original %esp
	&mov	($out,&wparam(5));
	&movups	(&QWP(0,$out),$cmac);		# write cmac back

	&pxor	("xmm0","xmm0");		# clear register bank
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&pxor	("xmm3","xmm3");
	&pxor	("xmm4","xmm4");
	&pxor	("xmm5","xmm5");
	&pxor	("xmm6","xmm6");
	&pxor	("xmm7","xmm7");
&function_end("aesni_ccm64_decrypt_blocks");
841}
842
843######################################################################
844# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
845#                         size_t blocks, const AES_KEY *key,
846#                         const char *ivec);
847#
848# Handles only complete blocks, operates on 32-bit counter and
849# does not update *ivec! (see crypto/modes/ctr128.c for details)
850#
851# stack layout:
852#	0	pshufb mask
853#	16	vector addend: 0,6,6,6
854# 	32	counter-less ivec
855#	48	1st triplet of counter vector
856#	64	2nd triplet of counter vector
857#	80	saved %esp
858
859&function_begin("aesni_ctr32_encrypt_blocks");
860	&mov	($inp,&wparam(0));
861	&mov	($out,&wparam(1));
862	&mov	($len,&wparam(2));
863	&mov	($key,&wparam(3));
864	&mov	($rounds_,&wparam(4));
865	&mov	($key_,"esp");
866	&sub	("esp",88);
867	&and	("esp",-16);			# align stack
868	&mov	(&DWP(80,"esp"),$key_);
869
870	&cmp	($len,1);
871	&je	(&label("ctr32_one_shortcut"));
872
873	&movdqu	($inout5,&QWP(0,$rounds_));	# load ivec
874
875	# compose byte-swap control mask for pshufb on stack
876	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
877	&mov	(&DWP(4,"esp"),0x08090a0b);
878	&mov	(&DWP(8,"esp"),0x04050607);
879	&mov	(&DWP(12,"esp"),0x00010203);
880
881	# compose counter increment vector on stack
882	&mov	($rounds,6);
883	&xor	($key_,$key_);
884	&mov	(&DWP(16,"esp"),$rounds);
885	&mov	(&DWP(20,"esp"),$rounds);
886	&mov	(&DWP(24,"esp"),$rounds);
887	&mov	(&DWP(28,"esp"),$key_);
888
889	&pextrd	($rounds_,$inout5,3);		# pull 32-bit counter
890	&pinsrd	($inout5,$key_,3);		# wipe 32-bit counter
891
892	&mov	($rounds,&DWP(240,$key));	# key->rounds
893
894	# compose 2 vectors of 3x32-bit counters
895	&bswap	($rounds_);
896	&pxor	($rndkey0,$rndkey0);
897	&pxor	($rndkey1,$rndkey1);
898	&movdqa	($inout0,&QWP(0,"esp"));	# load byte-swap mask
899	&pinsrd	($rndkey0,$rounds_,0);
900	&lea	($key_,&DWP(3,$rounds_));
901	&pinsrd	($rndkey1,$key_,0);
902	&inc	($rounds_);
903	&pinsrd	($rndkey0,$rounds_,1);
904	&inc	($key_);
905	&pinsrd	($rndkey1,$key_,1);
906	&inc	($rounds_);
907	&pinsrd	($rndkey0,$rounds_,2);
908	&inc	($key_);
909	&pinsrd	($rndkey1,$key_,2);
910	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
911	&pshufb	($rndkey0,$inout0);		# byte swap
912	&movdqu	($inout4,&QWP(0,$key));		# key[0]
913	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
914	&pshufb	($rndkey1,$inout0);		# byte swap
915
916	&pshufd	($inout0,$rndkey0,3<<6);	# place counter to upper dword
917	&pshufd	($inout1,$rndkey0,2<<6);
918	&cmp	($len,6);
919	&jb	(&label("ctr32_tail"));
920	&pxor	($inout5,$inout4);		# counter-less ivec^key[0]
921	&shl	($rounds,4);
922	&mov	($rounds_,16);
923	&movdqa	(&QWP(32,"esp"),$inout5);	# save counter-less ivec^key[0]
924	&mov	($key_,$key);			# backup $key
925	&sub	($rounds_,$rounds);		# backup twisted $rounds
926	&lea	($key,&DWP(32,$key,$rounds));
927	&sub	($len,6);
928	&jmp	(&label("ctr32_loop6"));
929
930&set_label("ctr32_loop6",16);
931	# inlining _aesni_encrypt6's prologue gives ~6% improvement...
932	&pshufd	($inout2,$rndkey0,1<<6);
933	&movdqa	($rndkey0,&QWP(32,"esp"));	# pull counter-less ivec
934	&pshufd	($inout3,$rndkey1,3<<6);
935	&pxor		($inout0,$rndkey0);	# merge counter-less ivec
936	&pshufd	($inout4,$rndkey1,2<<6);
937	&pxor		($inout1,$rndkey0);
938	&pshufd	($inout5,$rndkey1,1<<6);
939	&$movekey	($rndkey1,&QWP(16,$key_));
940	&pxor		($inout2,$rndkey0);
941	&pxor		($inout3,$rndkey0);
942	&aesenc		($inout0,$rndkey1);
943	&pxor		($inout4,$rndkey0);
944	&pxor		($inout5,$rndkey0);
945	&aesenc		($inout1,$rndkey1);
946	&$movekey	($rndkey0,&QWP(32,$key_));
947	&mov		($rounds,$rounds_);
948	&aesenc		($inout2,$rndkey1);
949	&aesenc		($inout3,$rndkey1);
950	&aesenc		($inout4,$rndkey1);
951	&aesenc		($inout5,$rndkey1);
952
953	&call		(&label("_aesni_encrypt6_enter"));
954
955	&movups	($rndkey1,&QWP(0,$inp));
956	&movups	($rndkey0,&QWP(0x10,$inp));
957	&xorps	($inout0,$rndkey1);
958	&movups	($rndkey1,&QWP(0x20,$inp));
959	&xorps	($inout1,$rndkey0);
960	&movups	(&QWP(0,$out),$inout0);
961	&movdqa	($rndkey0,&QWP(16,"esp"));	# load increment
962	&xorps	($inout2,$rndkey1);
963	&movdqa	($rndkey1,&QWP(64,"esp"));	# load 2nd triplet
964	&movups	(&QWP(0x10,$out),$inout1);
965	&movups	(&QWP(0x20,$out),$inout2);
966
967	&paddd	($rndkey1,$rndkey0);		# 2nd triplet increment
968	&paddd	($rndkey0,&QWP(48,"esp"));	# 1st triplet increment
969	&movdqa	($inout0,&QWP(0,"esp"));	# load byte swap mask
970
971	&movups	($inout1,&QWP(0x30,$inp));
972	&movups	($inout2,&QWP(0x40,$inp));
973	&xorps	($inout3,$inout1);
974	&movups	($inout1,&QWP(0x50,$inp));
975	&lea	($inp,&DWP(0x60,$inp));
976	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
977	&pshufb	($rndkey0,$inout0);		# byte swap
978	&xorps	($inout4,$inout2);
979	&movups	(&QWP(0x30,$out),$inout3);
980	&xorps	($inout5,$inout1);
981	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
982	&pshufb	($rndkey1,$inout0);		# byte swap
983	&movups	(&QWP(0x40,$out),$inout4);
984	&pshufd	($inout0,$rndkey0,3<<6);
985	&movups	(&QWP(0x50,$out),$inout5);
986	&lea	($out,&DWP(0x60,$out));
987
988	&pshufd	($inout1,$rndkey0,2<<6);
989	&sub	($len,6);
990	&jnc	(&label("ctr32_loop6"));
991
992	&add	($len,6);
993	&jz	(&label("ctr32_ret"));
994	&movdqu	($inout5,&QWP(0,$key_));
995	&mov	($key,$key_);
996	&pxor	($inout5,&QWP(32,"esp"));	# restore count-less ivec
997	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
998
999&set_label("ctr32_tail");
1000	&por	($inout0,$inout5);
1001	&cmp	($len,2);
1002	&jb	(&label("ctr32_one"));
1003
1004	&pshufd	($inout2,$rndkey0,1<<6);
1005	&por	($inout1,$inout5);
1006	&je	(&label("ctr32_two"));
1007
1008	&pshufd	($inout3,$rndkey1,3<<6);
1009	&por	($inout2,$inout5);
1010	&cmp	($len,4);
1011	&jb	(&label("ctr32_three"));
1012
1013	&pshufd	($inout4,$rndkey1,2<<6);
1014	&por	($inout3,$inout5);
1015	&je	(&label("ctr32_four"));
1016
1017	&por	($inout4,$inout5);
1018	&call	("_aesni_encrypt6");
1019	&movups	($rndkey1,&QWP(0,$inp));
1020	&movups	($rndkey0,&QWP(0x10,$inp));
1021	&xorps	($inout0,$rndkey1);
1022	&movups	($rndkey1,&QWP(0x20,$inp));
1023	&xorps	($inout1,$rndkey0);
1024	&movups	($rndkey0,&QWP(0x30,$inp));
1025	&xorps	($inout2,$rndkey1);
1026	&movups	($rndkey1,&QWP(0x40,$inp));
1027	&xorps	($inout3,$rndkey0);
1028	&movups	(&QWP(0,$out),$inout0);
1029	&xorps	($inout4,$rndkey1);
1030	&movups	(&QWP(0x10,$out),$inout1);
1031	&movups	(&QWP(0x20,$out),$inout2);
1032	&movups	(&QWP(0x30,$out),$inout3);
1033	&movups	(&QWP(0x40,$out),$inout4);
1034	&jmp	(&label("ctr32_ret"));
1035
1036&set_label("ctr32_one_shortcut",16);
1037	&movups	($inout0,&QWP(0,$rounds_));	# load ivec
1038	&mov	($rounds,&DWP(240,$key));
1039
1040&set_label("ctr32_one");
1041	if ($inline)
1042	{   &aesni_inline_generate1("enc");	}
1043	else
1044	{   &call	("_aesni_encrypt1");	}
1045	&movups	($in0,&QWP(0,$inp));
1046	&xorps	($in0,$inout0);
1047	&movups	(&QWP(0,$out),$in0);
1048	&jmp	(&label("ctr32_ret"));
1049
1050&set_label("ctr32_two",16);
1051	&call	("_aesni_encrypt2");
1052	&movups	($inout3,&QWP(0,$inp));
1053	&movups	($inout4,&QWP(0x10,$inp));
1054	&xorps	($inout0,$inout3);
1055	&xorps	($inout1,$inout4);
1056	&movups	(&QWP(0,$out),$inout0);
1057	&movups	(&QWP(0x10,$out),$inout1);
1058	&jmp	(&label("ctr32_ret"));
1059
1060&set_label("ctr32_three",16);
1061	&call	("_aesni_encrypt3");
1062	&movups	($inout3,&QWP(0,$inp));
1063	&movups	($inout4,&QWP(0x10,$inp));
1064	&xorps	($inout0,$inout3);
1065	&movups	($inout5,&QWP(0x20,$inp));
1066	&xorps	($inout1,$inout4);
1067	&movups	(&QWP(0,$out),$inout0);
1068	&xorps	($inout2,$inout5);
1069	&movups	(&QWP(0x10,$out),$inout1);
1070	&movups	(&QWP(0x20,$out),$inout2);
1071	&jmp	(&label("ctr32_ret"));
1072
1073&set_label("ctr32_four",16);
1074	&call	("_aesni_encrypt4");
1075	&movups	($inout4,&QWP(0,$inp));
1076	&movups	($inout5,&QWP(0x10,$inp));
1077	&movups	($rndkey1,&QWP(0x20,$inp));
1078	&xorps	($inout0,$inout4);
1079	&movups	($rndkey0,&QWP(0x30,$inp));
1080	&xorps	($inout1,$inout5);
1081	&movups	(&QWP(0,$out),$inout0);
1082	&xorps	($inout2,$rndkey1);
1083	&movups	(&QWP(0x10,$out),$inout1);
1084	&xorps	($inout3,$rndkey0);
1085	&movups	(&QWP(0x20,$out),$inout2);
1086	&movups	(&QWP(0x30,$out),$inout3);
1087
1088&set_label("ctr32_ret");
1089	&pxor	("xmm0","xmm0");		# clear register bank
1090	&pxor	("xmm1","xmm1");
1091	&pxor	("xmm2","xmm2");
1092	&pxor	("xmm3","xmm3");
1093	&pxor	("xmm4","xmm4");
1094	&movdqa	(&QWP(32,"esp"),"xmm0");	# clear stack
1095	&pxor	("xmm5","xmm5");
1096	&movdqa	(&QWP(48,"esp"),"xmm0");
1097	&pxor	("xmm6","xmm6");
1098	&movdqa	(&QWP(64,"esp"),"xmm0");
1099	&pxor	("xmm7","xmm7");
1100	&mov	("esp",&DWP(80,"esp"));
1101&function_end("aesni_ctr32_encrypt_blocks");
1102
1103######################################################################
1104# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1105#	const AES_KEY *key1, const AES_KEY *key2,
1106#	const unsigned char iv[16]);
1107#
1108{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
1109
# aesni_xts_encrypt code generator.
#
# Stack arguments as read below via &wparam():
#   0: inp, 1: out, 2: len, 3: key1, 4: key2, 5: clear-text tweak.
#
# Outline of the emitted code:
#   1. The clear-text tweak is encrypted as a single block with key2;
#      the result becomes the initial $tweak.
#   2. A 16-byte aligned frame of 16*7+8 bytes is carved out of the stack:
#        16*0..16*5  per-block tweak slots
#        16*6        magic constant (dwords 0x87,0,1,0)
#        16*7+0      saved $len (later overwritten with $len%16)
#        16*7+4      caller's %esp
#   3. While at least 6 whole blocks remain they are processed 6 at a
#      time in xts_enc_loop6; 1..5 remaining whole blocks are handled by
#      the xts_enc_one/two/three/four paths or the 5-block fall-through.
#   4. If $len is not a multiple of 16, xts_enc_steal swaps the trailing
#      bytes with the last output block and encrypts that block again
#      with the next tweak.
#   5. xts_enc_ret zeroes xmm0-xmm7 and tweak slots 0..5, then restores
#      %esp.
#
# The recurring pshufd/pxor/paddq/pand/pcmpgtd/pxor group advances
# $tweak: paddq doubles both 64-bit halves, and the bits that fell off
# (captured beforehand by pcmpgtd against zero) are folded back in
# through the magic constant in $twmask.
1110&function_begin("aesni_xts_encrypt");
1111	&mov	($key,&wparam(4));		# key2
1112	&mov	($inp,&wparam(5));		# clear-text tweak
1113
1114	&mov	($rounds,&DWP(240,$key));	# key2->rounds
1115	&movups	($inout0,&QWP(0,$inp));
	# encrypt the tweak block in $inout0 with key2
1116	if ($inline)
1117	{   &aesni_inline_generate1("enc");	}
1118	else
1119	{   &call	("_aesni_encrypt1");	}
1120
1121	&mov	($inp,&wparam(0));
1122	&mov	($out,&wparam(1));
1123	&mov	($len,&wparam(2));
1124	&mov	($key,&wparam(3));		# key1
1125
1126	&mov	($key_,"esp");
1127	&sub	("esp",16*7+8);
1128	&mov	($rounds,&DWP(240,$key));	# key1->rounds
1129	&and	("esp",-16);			# align stack
1130
1131	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
1132	&mov	(&DWP(16*6+4,"esp"),0);
1133	&mov	(&DWP(16*6+8,"esp"),1);
1134	&mov	(&DWP(16*6+12,"esp"),0);
1135	&mov	(&DWP(16*7+0,"esp"),$len);	# save original $len
1136	&mov	(&DWP(16*7+4,"esp"),$key_);	# save original %esp
1137
1138	&movdqa	($tweak,$inout0);
1139	&pxor	($twtmp,$twtmp);
1140	&movdqa	($twmask,&QWP(6*16,"esp"));	# 0x0...010...87
1141	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1142
1143	&and	($len,-16);
1144	&mov	($key_,$key);			# backup $key
1145	&mov	($rounds_,$rounds);		# backup $rounds
1146	&sub	($len,16*6);
1147	&jc	(&label("xts_enc_short"));
1148
	# Bias $key/$rounds_ for the 6x loop: $key points 32+16*rounds past
	# the schedule start and $rounds_ holds 16-16*rounds; $key_ keeps
	# the unbiased schedule pointer.
1149	&shl	($rounds,4);
1150	&mov	($rounds_,16);
1151	&sub	($rounds_,$rounds);
1152	&lea	($key,&DWP(32,$key,$rounds));
1153	&jmp	(&label("xts_enc_loop6"));
1154
1155&set_label("xts_enc_loop6",16);
	# The Perl loop is unrolled at generation time: four copies, each
	# storing the current tweak in slot $i and then advancing $tweak.
1156	for ($i=0;$i<4;$i++) {
1157	    &pshufd	($twres,$twtmp,0x13);
1158	    &pxor	($twtmp,$twtmp);
1159	    &movdqa	(&QWP(16*$i,"esp"),$tweak);
1160	    &paddq	($tweak,$tweak);	# &psllq($tweak,1);
1161	    &pand	($twres,$twmask);	# isolate carry and residue
1162	    &pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
1163	    &pxor	($tweak,$twres);
1164	}
	# 5th tweak goes to slot 4; the post-increment leaves $i==5, which
	# names the "last tweak" slot written further down.
1165	&pshufd	($inout5,$twtmp,0x13);
1166	&movdqa	(&QWP(16*$i++,"esp"),$tweak);
1167	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1168	 &$movekey	($rndkey0,&QWP(0,$key_));
1169	&pand	($inout5,$twmask);		# isolate carry and residue
1170	 &movups	($inout0,&QWP(0,$inp));	# load input
1171	&pxor	($inout5,$tweak);
1172
1173	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1174	&mov	($rounds,$rounds_);		# restore $rounds
1175	&movdqu	($inout1,&QWP(16*1,$inp));
1176	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
1177	&movdqu	($inout2,&QWP(16*2,$inp));
1178	 &pxor		($inout1,$rndkey0);
1179	&movdqu	($inout3,&QWP(16*3,$inp));
1180	 &pxor		($inout2,$rndkey0);
1181	&movdqu	($inout4,&QWP(16*4,$inp));
1182	 &pxor		($inout3,$rndkey0);
1183	&movdqu	($rndkey1,&QWP(16*5,$inp));
1184	 &pxor		($inout4,$rndkey0);
1185	&lea	($inp,&DWP(16*6,$inp));
1186	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1187	&movdqa	(&QWP(16*$i,"esp"),$inout5);	# save last tweak
1188	&pxor	($inout5,$rndkey1);
1189
1190	 &$movekey	($rndkey1,&QWP(16,$key_));
1191	&pxor	($inout1,&QWP(16*1,"esp"));
1192	&pxor	($inout2,&QWP(16*2,"esp"));
1193	 &aesenc	($inout0,$rndkey1);
1194	&pxor	($inout3,&QWP(16*3,"esp"));
1195	&pxor	($inout4,&QWP(16*4,"esp"));
1196	 &aesenc	($inout1,$rndkey1);
1197	&pxor		($inout5,$rndkey0);
1198	 &$movekey	($rndkey0,&QWP(32,$key_));
1199	 &aesenc	($inout2,$rndkey1);
1200	 &aesenc	($inout3,$rndkey1);
1201	 &aesenc	($inout4,$rndkey1);
1202	 &aesenc	($inout5,$rndkey1);
1203	&call		(&label("_aesni_encrypt6_enter"));
1204
	# The 7-space indented instructions below start the next tweak
	# update and are interleaved with the output xor/store sequence.
1205	&movdqa	($tweak,&QWP(16*5,"esp"));	# last tweak
1206       &pxor	($twtmp,$twtmp);
1207	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1208       &pcmpgtd	($twtmp,$tweak);		# broadcast upper bits
1209	&xorps	($inout1,&QWP(16*1,"esp"));
1210	&movups	(&QWP(16*0,$out),$inout0);	# write output
1211	&xorps	($inout2,&QWP(16*2,"esp"));
1212	&movups	(&QWP(16*1,$out),$inout1);
1213	&xorps	($inout3,&QWP(16*3,"esp"));
1214	&movups	(&QWP(16*2,$out),$inout2);
1215	&xorps	($inout4,&QWP(16*4,"esp"));
1216	&movups	(&QWP(16*3,$out),$inout3);
1217	&xorps	($inout5,$tweak);
1218	&movups	(&QWP(16*4,$out),$inout4);
1219       &pshufd	($twres,$twtmp,0x13);
1220	&movups	(&QWP(16*5,$out),$inout5);
1221	&lea	($out,&DWP(16*6,$out));
1222       &movdqa	($twmask,&QWP(16*6,"esp"));	# 0x0...010...87
1223
1224	&pxor	($twtmp,$twtmp);
1225	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1226	&pand	($twres,$twmask);		# isolate carry and residue
1227	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1228	&pxor	($tweak,$twres);
1229
1230	&sub	($len,16*6);
1231	&jnc	(&label("xts_enc_loop6"));
1232
1233	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
1234	&mov	($key,$key_);			# restore $key
1235	&mov	($rounds_,$rounds);
1236
1237&set_label("xts_enc_short");
	# Undo the 16*6 bias: $len becomes the byte count of the remaining
	# whole blocks (0, 16, ..., 80).
1238	&add	($len,16*6);
1239	&jz	(&label("xts_enc_done6x"));
1240
1241	&movdqa	($inout3,$tweak);		# put aside previous tweak
1242	&cmp	($len,0x20);
1243	&jb	(&label("xts_enc_one"));
1244
1245	&pshufd	($twres,$twtmp,0x13);
1246	&pxor	($twtmp,$twtmp);
1247	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1248	&pand	($twres,$twmask);		# isolate carry and residue
1249	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1250	&pxor	($tweak,$twres);
	# the je consumes the flags of the cmp with 0x20 above; only SSE
	# instructions were emitted in between
1251	&je	(&label("xts_enc_two"));
1252
1253	&pshufd	($twres,$twtmp,0x13);
1254	&pxor	($twtmp,$twtmp);
1255	&movdqa	($inout4,$tweak);		# put aside previous tweak
1256	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1257	&pand	($twres,$twmask);		# isolate carry and residue
1258	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1259	&pxor	($tweak,$twres);
1260	&cmp	($len,0x40);
1261	&jb	(&label("xts_enc_three"));
1262
1263	&pshufd	($twres,$twtmp,0x13);
1264	&pxor	($twtmp,$twtmp);
1265	&movdqa	($inout5,$tweak);		# put aside previous tweak
1266	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1267	&pand	($twres,$twmask);		# isolate carry and residue
1268	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1269	&pxor	($tweak,$twres);
1270	&movdqa	(&QWP(16*0,"esp"),$inout3);
1271	&movdqa	(&QWP(16*1,"esp"),$inout4);
	# flags still come from the cmp with 0x40 above
1272	&je	(&label("xts_enc_four"));
1273
	# Five blocks left. $inout5 ends up holding the 5th tweak and is
	# passed to _aesni_encrypt6 as a sixth lane whose result is never
	# written out.
1274	&movdqa	(&QWP(16*2,"esp"),$inout5);
1275	&pshufd	($inout5,$twtmp,0x13);
1276	&movdqa	(&QWP(16*3,"esp"),$tweak);
1277	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1278	&pand	($inout5,$twmask);		# isolate carry and residue
1279	&pxor	($inout5,$tweak);
1280
1281	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
1282	&movdqu	($inout1,&QWP(16*1,$inp));
1283	&movdqu	($inout2,&QWP(16*2,$inp));
1284	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1285	&movdqu	($inout3,&QWP(16*3,$inp));
1286	&pxor	($inout1,&QWP(16*1,"esp"));
1287	&movdqu	($inout4,&QWP(16*4,$inp));
1288	&pxor	($inout2,&QWP(16*2,"esp"));
1289	&lea	($inp,&DWP(16*5,$inp));
1290	&pxor	($inout3,&QWP(16*3,"esp"));
1291	&movdqa	(&QWP(16*4,"esp"),$inout5);	# save last tweak
1292	&pxor	($inout4,$inout5);
1293
1294	&call	("_aesni_encrypt6");
1295
1296	&movaps	($tweak,&QWP(16*4,"esp"));	# last tweak
1297	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1298	&xorps	($inout1,&QWP(16*1,"esp"));
1299	&xorps	($inout2,&QWP(16*2,"esp"));
1300	&movups	(&QWP(16*0,$out),$inout0);	# write output
1301	&xorps	($inout3,&QWP(16*3,"esp"));
1302	&movups	(&QWP(16*1,$out),$inout1);
1303	&xorps	($inout4,$tweak);
1304	&movups	(&QWP(16*2,$out),$inout2);
1305	&movups	(&QWP(16*3,$out),$inout3);
1306	&movups	(&QWP(16*4,$out),$inout4);
1307	&lea	($out,&DWP(16*5,$out));
1308	&jmp	(&label("xts_enc_done"));
1309
	# One block left; its tweak was put aside in $inout3.
1310&set_label("xts_enc_one",16);
1311	&movups	($inout0,&QWP(16*0,$inp));	# load input
1312	&lea	($inp,&DWP(16*1,$inp));
1313	&xorps	($inout0,$inout3);		# input^=tweak
1314	if ($inline)
1315	{   &aesni_inline_generate1("enc");	}
1316	else
1317	{   &call	("_aesni_encrypt1");	}
1318	&xorps	($inout0,$inout3);		# output^=tweak
1319	&movups	(&QWP(16*0,$out),$inout0);	# write output
1320	&lea	($out,&DWP(16*1,$out));
1321
1322	&movdqa	($tweak,$inout3);		# last tweak
1323	&jmp	(&label("xts_enc_done"));
1324
	# Two blocks left; tweaks are in $inout3 and $tweak.
1325&set_label("xts_enc_two",16);
1326	&movaps	($inout4,$tweak);		# put aside last tweak
1327
1328	&movups	($inout0,&QWP(16*0,$inp));	# load input
1329	&movups	($inout1,&QWP(16*1,$inp));
1330	&lea	($inp,&DWP(16*2,$inp));
1331	&xorps	($inout0,$inout3);		# input^=tweak
1332	&xorps	($inout1,$inout4);
1333
1334	&call	("_aesni_encrypt2");
1335
1336	&xorps	($inout0,$inout3);		# output^=tweak
1337	&xorps	($inout1,$inout4);
1338	&movups	(&QWP(16*0,$out),$inout0);	# write output
1339	&movups	(&QWP(16*1,$out),$inout1);
1340	&lea	($out,&DWP(16*2,$out));
1341
1342	&movdqa	($tweak,$inout4);		# last tweak
1343	&jmp	(&label("xts_enc_done"));
1344
	# Three blocks left; tweaks are in $inout3, $inout4 and $tweak.
1345&set_label("xts_enc_three",16);
1346	&movaps	($inout5,$tweak);		# put aside last tweak
1347	&movups	($inout0,&QWP(16*0,$inp));	# load input
1348	&movups	($inout1,&QWP(16*1,$inp));
1349	&movups	($inout2,&QWP(16*2,$inp));
1350	&lea	($inp,&DWP(16*3,$inp));
1351	&xorps	($inout0,$inout3);		# input^=tweak
1352	&xorps	($inout1,$inout4);
1353	&xorps	($inout2,$inout5);
1354
1355	&call	("_aesni_encrypt3");
1356
1357	&xorps	($inout0,$inout3);		# output^=tweak
1358	&xorps	($inout1,$inout4);
1359	&xorps	($inout2,$inout5);
1360	&movups	(&QWP(16*0,$out),$inout0);	# write output
1361	&movups	(&QWP(16*1,$out),$inout1);
1362	&movups	(&QWP(16*2,$out),$inout2);
1363	&lea	($out,&DWP(16*3,$out));
1364
1365	&movdqa	($tweak,$inout5);		# last tweak
1366	&jmp	(&label("xts_enc_done"));
1367
	# Four blocks left; tweaks 1 and 2 were stored in slots 0 and 1
	# before the branch, tweak 3 is in $inout5 and tweak 4 in $tweak.
1368&set_label("xts_enc_four",16);
1369	&movaps	($inout4,$tweak);		# put aside last tweak
1370
1371	&movups	($inout0,&QWP(16*0,$inp));	# load input
1372	&movups	($inout1,&QWP(16*1,$inp));
1373	&movups	($inout2,&QWP(16*2,$inp));
1374	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1375	&movups	($inout3,&QWP(16*3,$inp));
1376	&lea	($inp,&DWP(16*4,$inp));
1377	&xorps	($inout1,&QWP(16*1,"esp"));
1378	&xorps	($inout2,$inout5);
1379	&xorps	($inout3,$inout4);
1380
1381	&call	("_aesni_encrypt4");
1382
1383	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1384	&xorps	($inout1,&QWP(16*1,"esp"));
1385	&xorps	($inout2,$inout5);
1386	&movups	(&QWP(16*0,$out),$inout0);	# write output
1387	&xorps	($inout3,$inout4);
1388	&movups	(&QWP(16*1,$out),$inout1);
1389	&movups	(&QWP(16*2,$out),$inout2);
1390	&movups	(&QWP(16*3,$out),$inout3);
1391	&lea	($out,&DWP(16*4,$out));
1392
1393	&movdqa	($tweak,$inout4);		# last tweak
1394	&jmp	(&label("xts_enc_done"));
1395
1396&set_label("xts_enc_done6x",16);		# $tweak is pre-calculated
1397	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
1398	&and	($len,15);
1399	&jz	(&label("xts_enc_ret"));
1400	&movdqa	($inout3,$tweak);
1401	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
1402	&jmp	(&label("xts_enc_steal"));
1403
	# Reached from the 1..5 block paths with $tweak holding the last
	# tweak that was used; the next tweak is derived into $inout3 only
	# when a partial block follows.
1404&set_label("xts_enc_done",16);
1405	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
1406	&pxor	($twtmp,$twtmp);
1407	&and	($len,15);
1408	&jz	(&label("xts_enc_ret"));
1409
1410	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1411	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
1412	&pshufd	($inout3,$twtmp,0x13);
1413	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1414	&pand	($inout3,&QWP(16*6,"esp"));	# isolate carry and residue
1415	&pxor	($inout3,$tweak);
1416
	# Byte-wise swap, $len (= original len%16) iterations: the input
	# byte replaces a byte of the last written output block, and the
	# displaced output byte moves to the partial block position.
	# $rounds and $key serve as byte scratch here and are restored
	# after the loop.
1417&set_label("xts_enc_steal");
1418	&movz	($rounds,&BP(0,$inp));
1419	&movz	($key,&BP(-16,$out));
1420	&lea	($inp,&DWP(1,$inp));
1421	&mov	(&BP(-16,$out),&LB($rounds));
1422	&mov	(&BP(0,$out),&LB($key));
1423	&lea	($out,&DWP(1,$out));
1424	&sub	($len,1);
1425	&jnz	(&label("xts_enc_steal"));
1426
1427	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
1428	&mov	($key,$key_);			# restore $key
1429	&mov	($rounds,$rounds_);		# restore $rounds
1430
1431	&movups	($inout0,&QWP(-16,$out));	# load input
1432	&xorps	($inout0,$inout3);		# input^=tweak
1433	if ($inline)
1434	{   &aesni_inline_generate1("enc");	}
1435	else
1436	{   &call	("_aesni_encrypt1");	}
1437	&xorps	($inout0,$inout3);		# output^=tweak
1438	&movups	(&QWP(-16,$out),$inout0);	# write output
1439
1440&set_label("xts_enc_ret");
1441	&pxor	("xmm0","xmm0");		# clear register bank
1442	&pxor	("xmm1","xmm1");
1443	&pxor	("xmm2","xmm2");
1444	&movdqa	(&QWP(16*0,"esp"),"xmm0");	# clear stack
1445	&pxor	("xmm3","xmm3");
1446	&movdqa	(&QWP(16*1,"esp"),"xmm0");
1447	&pxor	("xmm4","xmm4");
1448	&movdqa	(&QWP(16*2,"esp"),"xmm0");
1449	&pxor	("xmm5","xmm5");
1450	&movdqa	(&QWP(16*3,"esp"),"xmm0");
1451	&pxor	("xmm6","xmm6");
1452	&movdqa	(&QWP(16*4,"esp"),"xmm0");
1453	&pxor	("xmm7","xmm7");
1454	&movdqa	(&QWP(16*5,"esp"),"xmm0");
1455	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
1456&function_end("aesni_xts_encrypt");
1457
# aesni_xts_decrypt code generator.
#
# Stack arguments as read below via &wparam():
#   0: inp, 1: out, 2: len, 3: key1, 4: key2, 5: clear-text tweak.
#
# Outline of the emitted code:
#   1. The clear-text tweak is *encrypted* as a single block with key2;
#      the result becomes the initial $tweak.
#   2. A 16-byte aligned frame of 16*7+8 bytes is carved out of the stack:
#        16*0..16*5  per-block tweak slots
#        16*6        magic constant (dwords 0x87,0,1,0)
#        16*7+0      saved $len (later overwritten with $len%16)
#        16*7+4      caller's %esp
#   3. If $len is not a multiple of 16, 16 is subtracted from it up
#      front so that the last whole block is left for the stealing step.
#      The low four bits are unaffected, so the saved value still yields
#      $len%16 later on.
#   4. While at least 6 whole blocks remain they are processed 6 at a
#      time in xts_dec_loop6; 1..5 remaining whole blocks are handled by
#      the xts_dec_one/two/three/four paths or the 5-block fall-through.
#   5. For a trailing partial block, xts_dec_only_one_more decrypts the
#      last whole block with the tweak *after* the current one, then
#      xts_dec_steal swaps the tail bytes and the merged block is
#      decrypted with the current tweak (kept in $inout4).
#   6. xts_dec_ret zeroes xmm0-xmm7 and tweak slots 0..5, then restores
#      %esp.
#
# The recurring pshufd/pxor/paddq/pand/pcmpgtd/pxor group advances
# $tweak: paddq doubles both 64-bit halves, and the bits that fell off
# (captured beforehand by pcmpgtd against zero) are folded back in
# through the magic constant in $twmask.
1458&function_begin("aesni_xts_decrypt");
1459	&mov	($key,&wparam(4));		# key2
1460	&mov	($inp,&wparam(5));		# clear-text tweak
1461
1462	&mov	($rounds,&DWP(240,$key));	# key2->rounds
1463	&movups	($inout0,&QWP(0,$inp));
	# the tweak is encrypted with key2 on the decrypt path too
1464	if ($inline)
1465	{   &aesni_inline_generate1("enc");	}
1466	else
1467	{   &call	("_aesni_encrypt1");	}
1468
1469	&mov	($inp,&wparam(0));
1470	&mov	($out,&wparam(1));
1471	&mov	($len,&wparam(2));
1472	&mov	($key,&wparam(3));		# key1
1473
1474	&mov	($key_,"esp");
1475	&sub	("esp",16*7+8);
1476	&and	("esp",-16);			# align stack
1477
	# $rounds_ is plain scratch here: 16 when $len has a partial tail,
	# 0 otherwise
1478	&xor	($rounds_,$rounds_);		# if(len%16) len-=16;
1479	&test	($len,15);
1480	&setnz	(&LB($rounds_));
1481	&shl	($rounds_,4);
1482	&sub	($len,$rounds_);
1483
1484	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
1485	&mov	(&DWP(16*6+4,"esp"),0);
1486	&mov	(&DWP(16*6+8,"esp"),1);
1487	&mov	(&DWP(16*6+12,"esp"),0);
1488	&mov	(&DWP(16*7+0,"esp"),$len);	# save original $len
1489	&mov	(&DWP(16*7+4,"esp"),$key_);	# save original %esp
1490
1491	&mov	($rounds,&DWP(240,$key));	# key1->rounds
1492	&mov	($key_,$key);			# backup $key
1493	&mov	($rounds_,$rounds);		# backup $rounds
1494
1495	&movdqa	($tweak,$inout0);
1496	&pxor	($twtmp,$twtmp);
1497	&movdqa	($twmask,&QWP(6*16,"esp"));	# 0x0...010...87
1498	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1499
1500	&and	($len,-16);
1501	&sub	($len,16*6);
1502	&jc	(&label("xts_dec_short"));
1503
	# Bias $key/$rounds_ for the 6x loop: $key points 32+16*rounds past
	# the schedule start and $rounds_ holds 16-16*rounds; $key_ keeps
	# the unbiased schedule pointer.
1504	&shl	($rounds,4);
1505	&mov	($rounds_,16);
1506	&sub	($rounds_,$rounds);
1507	&lea	($key,&DWP(32,$key,$rounds));
1508	&jmp	(&label("xts_dec_loop6"));
1509
1510&set_label("xts_dec_loop6",16);
	# The Perl loop is unrolled at generation time: four copies, each
	# storing the current tweak in slot $i and then advancing $tweak.
1511	for ($i=0;$i<4;$i++) {
1512	    &pshufd	($twres,$twtmp,0x13);
1513	    &pxor	($twtmp,$twtmp);
1514	    &movdqa	(&QWP(16*$i,"esp"),$tweak);
1515	    &paddq	($tweak,$tweak);	# &psllq($tweak,1);
1516	    &pand	($twres,$twmask);	# isolate carry and residue
1517	    &pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
1518	    &pxor	($tweak,$twres);
1519	}
	# 5th tweak goes to slot 4; the post-increment leaves $i==5, which
	# names the "last tweak" slot written further down.
1520	&pshufd	($inout5,$twtmp,0x13);
1521	&movdqa	(&QWP(16*$i++,"esp"),$tweak);
1522	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1523	 &$movekey	($rndkey0,&QWP(0,$key_));
1524	&pand	($inout5,$twmask);		# isolate carry and residue
1525	 &movups	($inout0,&QWP(0,$inp));	# load input
1526	&pxor	($inout5,$tweak);
1527
1528	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
	# (the instructions emitted here are aesdec and the call target is
	# _aesni_decrypt6_enter)
1529	&mov	($rounds,$rounds_);
1530	&movdqu	($inout1,&QWP(16*1,$inp));
1531	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
1532	&movdqu	($inout2,&QWP(16*2,$inp));
1533	 &pxor		($inout1,$rndkey0);
1534	&movdqu	($inout3,&QWP(16*3,$inp));
1535	 &pxor		($inout2,$rndkey0);
1536	&movdqu	($inout4,&QWP(16*4,$inp));
1537	 &pxor		($inout3,$rndkey0);
1538	&movdqu	($rndkey1,&QWP(16*5,$inp));
1539	 &pxor		($inout4,$rndkey0);
1540	&lea	($inp,&DWP(16*6,$inp));
1541	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1542	&movdqa	(&QWP(16*$i,"esp"),$inout5);	# save last tweak
1543	&pxor	($inout5,$rndkey1);
1544
1545	 &$movekey	($rndkey1,&QWP(16,$key_));
1546	&pxor	($inout1,&QWP(16*1,"esp"));
1547	&pxor	($inout2,&QWP(16*2,"esp"));
1548	 &aesdec	($inout0,$rndkey1);
1549	&pxor	($inout3,&QWP(16*3,"esp"));
1550	&pxor	($inout4,&QWP(16*4,"esp"));
1551	 &aesdec	($inout1,$rndkey1);
1552	&pxor		($inout5,$rndkey0);
1553	 &$movekey	($rndkey0,&QWP(32,$key_));
1554	 &aesdec	($inout2,$rndkey1);
1555	 &aesdec	($inout3,$rndkey1);
1556	 &aesdec	($inout4,$rndkey1);
1557	 &aesdec	($inout5,$rndkey1);
1558	&call		(&label("_aesni_decrypt6_enter"));
1559
	# The 7-space indented instructions below start the next tweak
	# update and are interleaved with the output xor/store sequence.
1560	&movdqa	($tweak,&QWP(16*5,"esp"));	# last tweak
1561       &pxor	($twtmp,$twtmp);
1562	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1563       &pcmpgtd	($twtmp,$tweak);		# broadcast upper bits
1564	&xorps	($inout1,&QWP(16*1,"esp"));
1565	&movups	(&QWP(16*0,$out),$inout0);	# write output
1566	&xorps	($inout2,&QWP(16*2,"esp"));
1567	&movups	(&QWP(16*1,$out),$inout1);
1568	&xorps	($inout3,&QWP(16*3,"esp"));
1569	&movups	(&QWP(16*2,$out),$inout2);
1570	&xorps	($inout4,&QWP(16*4,"esp"));
1571	&movups	(&QWP(16*3,$out),$inout3);
1572	&xorps	($inout5,$tweak);
1573	&movups	(&QWP(16*4,$out),$inout4);
1574       &pshufd	($twres,$twtmp,0x13);
1575	&movups	(&QWP(16*5,$out),$inout5);
1576	&lea	($out,&DWP(16*6,$out));
1577       &movdqa	($twmask,&QWP(16*6,"esp"));	# 0x0...010...87
1578
1579	&pxor	($twtmp,$twtmp);
1580	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1581	&pand	($twres,$twmask);		# isolate carry and residue
1582	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1583	&pxor	($tweak,$twres);
1584
1585	&sub	($len,16*6);
1586	&jnc	(&label("xts_dec_loop6"));
1587
1588	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
1589	&mov	($key,$key_);			# restore $key
1590	&mov	($rounds_,$rounds);
1591
1592&set_label("xts_dec_short");
	# Undo the 16*6 bias: $len becomes the byte count of the remaining
	# whole blocks (0, 16, ..., 80).
1593	&add	($len,16*6);
1594	&jz	(&label("xts_dec_done6x"));
1595
1596	&movdqa	($inout3,$tweak);		# put aside previous tweak
1597	&cmp	($len,0x20);
1598	&jb	(&label("xts_dec_one"));
1599
1600	&pshufd	($twres,$twtmp,0x13);
1601	&pxor	($twtmp,$twtmp);
1602	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1603	&pand	($twres,$twmask);		# isolate carry and residue
1604	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1605	&pxor	($tweak,$twres);
	# the je consumes the flags of the cmp with 0x20 above; only SSE
	# instructions were emitted in between
1606	&je	(&label("xts_dec_two"));
1607
1608	&pshufd	($twres,$twtmp,0x13);
1609	&pxor	($twtmp,$twtmp);
1610	&movdqa	($inout4,$tweak);		# put aside previous tweak
1611	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1612	&pand	($twres,$twmask);		# isolate carry and residue
1613	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1614	&pxor	($tweak,$twres);
1615	&cmp	($len,0x40);
1616	&jb	(&label("xts_dec_three"));
1617
1618	&pshufd	($twres,$twtmp,0x13);
1619	&pxor	($twtmp,$twtmp);
1620	&movdqa	($inout5,$tweak);		# put aside previous tweak
1621	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1622	&pand	($twres,$twmask);		# isolate carry and residue
1623	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1624	&pxor	($tweak,$twres);
1625	&movdqa	(&QWP(16*0,"esp"),$inout3);
1626	&movdqa	(&QWP(16*1,"esp"),$inout4);
	# flags still come from the cmp with 0x40 above
1627	&je	(&label("xts_dec_four"));
1628
	# Five blocks left. $inout5 ends up holding the 5th tweak and is
	# passed to _aesni_decrypt6 as a sixth lane whose result is never
	# written out.
1629	&movdqa	(&QWP(16*2,"esp"),$inout5);
1630	&pshufd	($inout5,$twtmp,0x13);
1631	&movdqa	(&QWP(16*3,"esp"),$tweak);
1632	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1633	&pand	($inout5,$twmask);		# isolate carry and residue
1634	&pxor	($inout5,$tweak);
1635
1636	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
1637	&movdqu	($inout1,&QWP(16*1,$inp));
1638	&movdqu	($inout2,&QWP(16*2,$inp));
1639	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1640	&movdqu	($inout3,&QWP(16*3,$inp));
1641	&pxor	($inout1,&QWP(16*1,"esp"));
1642	&movdqu	($inout4,&QWP(16*4,$inp));
1643	&pxor	($inout2,&QWP(16*2,"esp"));
1644	&lea	($inp,&DWP(16*5,$inp));
1645	&pxor	($inout3,&QWP(16*3,"esp"));
1646	&movdqa	(&QWP(16*4,"esp"),$inout5);	# save last tweak
1647	&pxor	($inout4,$inout5);
1648
1649	&call	("_aesni_decrypt6");
1650
1651	&movaps	($tweak,&QWP(16*4,"esp"));	# last tweak
1652	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1653	&xorps	($inout1,&QWP(16*1,"esp"));
1654	&xorps	($inout2,&QWP(16*2,"esp"));
1655	&movups	(&QWP(16*0,$out),$inout0);	# write output
1656	&xorps	($inout3,&QWP(16*3,"esp"));
1657	&movups	(&QWP(16*1,$out),$inout1);
1658	&xorps	($inout4,$tweak);
1659	&movups	(&QWP(16*2,$out),$inout2);
1660	&movups	(&QWP(16*3,$out),$inout3);
1661	&movups	(&QWP(16*4,$out),$inout4);
1662	&lea	($out,&DWP(16*5,$out));
1663	&jmp	(&label("xts_dec_done"));
1664
	# One block left; its tweak was put aside in $inout3.
1665&set_label("xts_dec_one",16);
1666	&movups	($inout0,&QWP(16*0,$inp));	# load input
1667	&lea	($inp,&DWP(16*1,$inp));
1668	&xorps	($inout0,$inout3);		# input^=tweak
1669	if ($inline)
1670	{   &aesni_inline_generate1("dec");	}
1671	else
1672	{   &call	("_aesni_decrypt1");	}
1673	&xorps	($inout0,$inout3);		# output^=tweak
1674	&movups	(&QWP(16*0,$out),$inout0);	# write output
1675	&lea	($out,&DWP(16*1,$out));
1676
1677	&movdqa	($tweak,$inout3);		# last tweak
1678	&jmp	(&label("xts_dec_done"));
1679
	# Two blocks left; tweaks are in $inout3 and $tweak.
1680&set_label("xts_dec_two",16);
1681	&movaps	($inout4,$tweak);		# put aside last tweak
1682
1683	&movups	($inout0,&QWP(16*0,$inp));	# load input
1684	&movups	($inout1,&QWP(16*1,$inp));
1685	&lea	($inp,&DWP(16*2,$inp));
1686	&xorps	($inout0,$inout3);		# input^=tweak
1687	&xorps	($inout1,$inout4);
1688
1689	&call	("_aesni_decrypt2");
1690
1691	&xorps	($inout0,$inout3);		# output^=tweak
1692	&xorps	($inout1,$inout4);
1693	&movups	(&QWP(16*0,$out),$inout0);	# write output
1694	&movups	(&QWP(16*1,$out),$inout1);
1695	&lea	($out,&DWP(16*2,$out));
1696
1697	&movdqa	($tweak,$inout4);		# last tweak
1698	&jmp	(&label("xts_dec_done"));
1699
	# Three blocks left; tweaks are in $inout3, $inout4 and $tweak.
1700&set_label("xts_dec_three",16);
1701	&movaps	($inout5,$tweak);		# put aside last tweak
1702	&movups	($inout0,&QWP(16*0,$inp));	# load input
1703	&movups	($inout1,&QWP(16*1,$inp));
1704	&movups	($inout2,&QWP(16*2,$inp));
1705	&lea	($inp,&DWP(16*3,$inp));
1706	&xorps	($inout0,$inout3);		# input^=tweak
1707	&xorps	($inout1,$inout4);
1708	&xorps	($inout2,$inout5);
1709
1710	&call	("_aesni_decrypt3");
1711
1712	&xorps	($inout0,$inout3);		# output^=tweak
1713	&xorps	($inout1,$inout4);
1714	&xorps	($inout2,$inout5);
1715	&movups	(&QWP(16*0,$out),$inout0);	# write output
1716	&movups	(&QWP(16*1,$out),$inout1);
1717	&movups	(&QWP(16*2,$out),$inout2);
1718	&lea	($out,&DWP(16*3,$out));
1719
1720	&movdqa	($tweak,$inout5);		# last tweak
1721	&jmp	(&label("xts_dec_done"));
1722
	# Four blocks left; tweaks 1 and 2 were stored in slots 0 and 1
	# before the branch, tweak 3 is in $inout5 and tweak 4 in $tweak.
1723&set_label("xts_dec_four",16);
1724	&movaps	($inout4,$tweak);		# put aside last tweak
1725
1726	&movups	($inout0,&QWP(16*0,$inp));	# load input
1727	&movups	($inout1,&QWP(16*1,$inp));
1728	&movups	($inout2,&QWP(16*2,$inp));
1729	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1730	&movups	($inout3,&QWP(16*3,$inp));
1731	&lea	($inp,&DWP(16*4,$inp));
1732	&xorps	($inout1,&QWP(16*1,"esp"));
1733	&xorps	($inout2,$inout5);
1734	&xorps	($inout3,$inout4);
1735
1736	&call	("_aesni_decrypt4");
1737
1738	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1739	&xorps	($inout1,&QWP(16*1,"esp"));
1740	&xorps	($inout2,$inout5);
1741	&movups	(&QWP(16*0,$out),$inout0);	# write output
1742	&xorps	($inout3,$inout4);
1743	&movups	(&QWP(16*1,$out),$inout1);
1744	&movups	(&QWP(16*2,$out),$inout2);
1745	&movups	(&QWP(16*3,$out),$inout3);
1746	&lea	($out,&DWP(16*4,$out));
1747
1748	&movdqa	($tweak,$inout4);		# last tweak
1749	&jmp	(&label("xts_dec_done"));
1750
	# On this path $twtmp and $twmask are still the values set up by the
	# initial tweak setup or by the end of the 6x loop, which is what
	# xts_dec_only_one_more consumes.
1751&set_label("xts_dec_done6x",16);		# $tweak is pre-calculated
1752	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
1753	&and	($len,15);
1754	&jz	(&label("xts_dec_ret"));
1755	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
1756	&jmp	(&label("xts_dec_only_one_more"));
1757
	# Reached from the 1..5 block paths with $tweak holding the last
	# tweak that was used; it is advanced once here (with $twmask
	# reloaded from the stack) when a partial block follows.
1758&set_label("xts_dec_done",16);
1759	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
1760	&pxor	($twtmp,$twtmp);
1761	&and	($len,15);
1762	&jz	(&label("xts_dec_ret"));
1763
1764	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1765	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
1766	&pshufd	($twres,$twtmp,0x13);
1767	&pxor	($twtmp,$twtmp);
1768	&movdqa	($twmask,&QWP(16*6,"esp"));
1769	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1770	&pand	($twres,$twmask);		# isolate carry and residue
1771	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1772	&pxor	($tweak,$twres);
1773
	# $inout4 keeps the current tweak for the merged block below, while
	# $inout3 receives the following tweak, used right away for the last
	# whole input block.
1774&set_label("xts_dec_only_one_more");
1775	&pshufd	($inout3,$twtmp,0x13);
1776	&movdqa	($inout4,$tweak);		# put aside previous tweak
1777	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1778	&pand	($inout3,$twmask);		# isolate carry and residue
1779	&pxor	($inout3,$tweak);
1780
1781	&mov	($key,$key_);			# restore $key
1782	&mov	($rounds,$rounds_);		# restore $rounds
1783
1784	&movups	($inout0,&QWP(0,$inp));		# load input
1785	&xorps	($inout0,$inout3);		# input^=tweak
1786	if ($inline)
1787	{   &aesni_inline_generate1("dec");	}
1788	else
1789	{   &call	("_aesni_decrypt1");	}
1790	&xorps	($inout0,$inout3);		# output^=tweak
1791	&movups	(&QWP(0,$out),$inout0);		# write output
1792
	# Byte-wise swap, $len (= len%16) iterations: the input byte at
	# offset 16 replaces a byte of the block just written, and the
	# displaced byte moves to offset 16 of the output. $rounds and $key
	# serve as byte scratch here and are restored after the loop.
1793&set_label("xts_dec_steal");
1794	&movz	($rounds,&BP(16,$inp));
1795	&movz	($key,&BP(0,$out));
1796	&lea	($inp,&DWP(1,$inp));
1797	&mov	(&BP(0,$out),&LB($rounds));
1798	&mov	(&BP(16,$out),&LB($key));
1799	&lea	($out,&DWP(1,$out));
1800	&sub	($len,1);
1801	&jnz	(&label("xts_dec_steal"));
1802
1803	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
1804	&mov	($key,$key_);			# restore $key
1805	&mov	($rounds,$rounds_);		# restore $rounds
1806
1807	&movups	($inout0,&QWP(0,$out));		# load input
1808	&xorps	($inout0,$inout4);		# input^=tweak
1809	if ($inline)
1810	{   &aesni_inline_generate1("dec");	}
1811	else
1812	{   &call	("_aesni_decrypt1");	}
1813	&xorps	($inout0,$inout4);		# output^=tweak
1814	&movups	(&QWP(0,$out),$inout0);		# write output
1815
1816&set_label("xts_dec_ret");
1817	&pxor	("xmm0","xmm0");		# clear register bank
1818	&pxor	("xmm1","xmm1");
1819	&pxor	("xmm2","xmm2");
1820	&movdqa	(&QWP(16*0,"esp"),"xmm0");	# clear stack
1821	&pxor	("xmm3","xmm3");
1822	&movdqa	(&QWP(16*1,"esp"),"xmm0");
1823	&pxor	("xmm4","xmm4");
1824	&movdqa	(&QWP(16*2,"esp"),"xmm0");
1825	&pxor	("xmm5","xmm5");
1826	&movdqa	(&QWP(16*3,"esp"),"xmm0");
1827	&pxor	("xmm6","xmm6");
1828	&movdqa	(&QWP(16*4,"esp"),"xmm0");
1829	&pxor	("xmm7","xmm7");
1830	&movdqa	(&QWP(16*5,"esp"),"xmm0");
1831	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
1832&function_end("aesni_xts_decrypt");
1833}
1834}
1835
1836######################################################################
1837# void $PREFIX_cbc_encrypt (const void *inp, void *out,
1838#                           size_t length, const AES_KEY *key,
1839#                           unsigned char *ivp,const int enc);
1840&function_begin("${PREFIX}_cbc_encrypt");
1841	&mov	($inp,&wparam(0));
1842	&mov	($rounds_,"esp");
1843	&mov	($out,&wparam(1));
1844	&sub	($rounds_,24);
1845	&mov	($len,&wparam(2));
1846	&and	($rounds_,-16);
1847	&mov	($key,&wparam(3));
1848	&mov	($key_,&wparam(4));
1849	&test	($len,$len);
1850	&jz	(&label("cbc_abort"));
1851
1852	&cmp	(&wparam(5),0);
1853	&xchg	($rounds_,"esp");		# alloca
1854	&movups	($ivec,&QWP(0,$key_));		# load IV
1855	&mov	($rounds,&DWP(240,$key));
1856	&mov	($key_,$key);			# backup $key
1857	&mov	(&DWP(16,"esp"),$rounds_);	# save original %esp
1858	&mov	($rounds_,$rounds);		# backup $rounds
1859	&je	(&label("cbc_decrypt"));
1860
1861	&movaps	($inout0,$ivec);
1862	&cmp	($len,16);
1863	&jb	(&label("cbc_enc_tail"));
1864	&sub	($len,16);
1865	&jmp	(&label("cbc_enc_loop"));
1866
1867&set_label("cbc_enc_loop",16);
1868	&movups	($ivec,&QWP(0,$inp));		# input actually
1869	&lea	($inp,&DWP(16,$inp));
1870	if ($inline)
1871	{   &aesni_inline_generate1("enc",$inout0,$ivec);	}
1872	else
1873	{   &xorps($inout0,$ivec); &call("_aesni_encrypt1");	}
1874	&mov	($rounds,$rounds_);	# restore $rounds
1875	&mov	($key,$key_);		# restore $key
1876	&movups	(&QWP(0,$out),$inout0);	# store output
1877	&lea	($out,&DWP(16,$out));
1878	&sub	($len,16);
1879	&jnc	(&label("cbc_enc_loop"));
1880	&add	($len,16);
1881	&jnz	(&label("cbc_enc_tail"));
1882	&movaps	($ivec,$inout0);
1883	&pxor	($inout0,$inout0);
1884	&jmp	(&label("cbc_ret"));
1885
1886&set_label("cbc_enc_tail");
1887	&mov	("ecx",$len);		# zaps $rounds
1888	&data_word(0xA4F3F689);		# rep movsb
1889	&mov	("ecx",16);		# zero tail
1890	&sub	("ecx",$len);
1891	&xor	("eax","eax");		# zaps $len
1892	&data_word(0xAAF3F689);		# rep stosb
1893	&lea	($out,&DWP(-16,$out));	# rewind $out by 1 block
1894	&mov	($rounds,$rounds_);	# restore $rounds
1895	&mov	($inp,$out);		# $inp and $out are the same
1896	&mov	($key,$key_);		# restore $key
1897	&jmp	(&label("cbc_enc_loop"));
1898######################################################################
1899&set_label("cbc_decrypt",16);
1900	&cmp	($len,0x50);
1901	&jbe	(&label("cbc_dec_tail"));
1902	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
1903	&sub	($len,0x50);
1904	&jmp	(&label("cbc_dec_loop6_enter"));
1905
1906&set_label("cbc_dec_loop6",16);
1907	&movaps	(&QWP(0,"esp"),$rndkey0);	# save IV
1908	&movups	(&QWP(0,$out),$inout5);
1909	&lea	($out,&DWP(0x10,$out));
1910&set_label("cbc_dec_loop6_enter");
1911	&movdqu	($inout0,&QWP(0,$inp));
1912	&movdqu	($inout1,&QWP(0x10,$inp));
1913	&movdqu	($inout2,&QWP(0x20,$inp));
1914	&movdqu	($inout3,&QWP(0x30,$inp));
1915	&movdqu	($inout4,&QWP(0x40,$inp));
1916	&movdqu	($inout5,&QWP(0x50,$inp));
1917
1918	&call	("_aesni_decrypt6");
1919
1920	&movups	($rndkey1,&QWP(0,$inp));
1921	&movups	($rndkey0,&QWP(0x10,$inp));
1922	&xorps	($inout0,&QWP(0,"esp"));	# ^=IV
1923	&xorps	($inout1,$rndkey1);
1924	&movups	($rndkey1,&QWP(0x20,$inp));
1925	&xorps	($inout2,$rndkey0);
1926	&movups	($rndkey0,&QWP(0x30,$inp));
1927	&xorps	($inout3,$rndkey1);
1928	&movups	($rndkey1,&QWP(0x40,$inp));
1929	&xorps	($inout4,$rndkey0);
1930	&movups	($rndkey0,&QWP(0x50,$inp));	# IV
1931	&xorps	($inout5,$rndkey1);
1932	&movups	(&QWP(0,$out),$inout0);
1933	&movups	(&QWP(0x10,$out),$inout1);
1934	&lea	($inp,&DWP(0x60,$inp));
1935	&movups	(&QWP(0x20,$out),$inout2);
1936	&mov	($rounds,$rounds_);		# restore $rounds
1937	&movups	(&QWP(0x30,$out),$inout3);
1938	&mov	($key,$key_);			# restore $key
1939	&movups	(&QWP(0x40,$out),$inout4);
1940	&lea	($out,&DWP(0x50,$out));
1941	&sub	($len,0x60);
1942	&ja	(&label("cbc_dec_loop6"));
1943
1944	&movaps	($inout0,$inout5);
1945	&movaps	($ivec,$rndkey0);
1946	&add	($len,0x50);
1947	&jle	(&label("cbc_dec_clear_tail_collected"));
1948	&movups	(&QWP(0,$out),$inout0);
1949	&lea	($out,&DWP(0x10,$out));
1950&set_label("cbc_dec_tail");
1951	&movups	($inout0,&QWP(0,$inp));
1952	&movaps	($in0,$inout0);
1953	&cmp	($len,0x10);
1954	&jbe	(&label("cbc_dec_one"));
1955
1956	&movups	($inout1,&QWP(0x10,$inp));
1957	&movaps	($in1,$inout1);
1958	&cmp	($len,0x20);
1959	&jbe	(&label("cbc_dec_two"));
1960
1961	&movups	($inout2,&QWP(0x20,$inp));
1962	&cmp	($len,0x30);
1963	&jbe	(&label("cbc_dec_three"));
1964
1965	&movups	($inout3,&QWP(0x30,$inp));
1966	&cmp	($len,0x40);
1967	&jbe	(&label("cbc_dec_four"));
1968
1969	&movups	($inout4,&QWP(0x40,$inp));
1970	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
1971	&movups	($inout0,&QWP(0,$inp));
1972	&xorps	($inout5,$inout5);
1973	&call	("_aesni_decrypt6");
1974	&movups	($rndkey1,&QWP(0,$inp));
1975	&movups	($rndkey0,&QWP(0x10,$inp));
1976	&xorps	($inout0,&QWP(0,"esp"));	# ^= IV
1977	&xorps	($inout1,$rndkey1);
1978	&movups	($rndkey1,&QWP(0x20,$inp));
1979	&xorps	($inout2,$rndkey0);
1980	&movups	($rndkey0,&QWP(0x30,$inp));
1981	&xorps	($inout3,$rndkey1);
1982	&movups	($ivec,&QWP(0x40,$inp));	# IV
1983	&xorps	($inout4,$rndkey0);
1984	&movups	(&QWP(0,$out),$inout0);
1985	&movups	(&QWP(0x10,$out),$inout1);
1986	&pxor	($inout1,$inout1);
1987	&movups	(&QWP(0x20,$out),$inout2);
1988	&pxor	($inout2,$inout2);
1989	&movups	(&QWP(0x30,$out),$inout3);
1990	&pxor	($inout3,$inout3);
1991	&lea	($out,&DWP(0x40,$out));
1992	&movaps	($inout0,$inout4);
1993	&pxor	($inout4,$inout4);
1994	&sub	($len,0x50);
1995	&jmp	(&label("cbc_dec_tail_collected"));
1996
1997&set_label("cbc_dec_one",16);
1998	if ($inline)
1999	{   &aesni_inline_generate1("dec");	}
2000	else
2001	{   &call	("_aesni_decrypt1");	}
2002	&xorps	($inout0,$ivec);
2003	&movaps	($ivec,$in0);
2004	&sub	($len,0x10);
2005	&jmp	(&label("cbc_dec_tail_collected"));
2006
2007&set_label("cbc_dec_two",16);
2008	&call	("_aesni_decrypt2");
2009	&xorps	($inout0,$ivec);
2010	&xorps	($inout1,$in0);
2011	&movups	(&QWP(0,$out),$inout0);
2012	&movaps	($inout0,$inout1);
2013	&pxor	($inout1,$inout1);
2014	&lea	($out,&DWP(0x10,$out));
2015	&movaps	($ivec,$in1);
2016	&sub	($len,0x20);
2017	&jmp	(&label("cbc_dec_tail_collected"));
2018
2019&set_label("cbc_dec_three",16);
2020	&call	("_aesni_decrypt3");
2021	&xorps	($inout0,$ivec);
2022	&xorps	($inout1,$in0);
2023	&xorps	($inout2,$in1);
2024	&movups	(&QWP(0,$out),$inout0);
2025	&movaps	($inout0,$inout2);
2026	&pxor	($inout2,$inout2);
2027	&movups	(&QWP(0x10,$out),$inout1);
2028	&pxor	($inout1,$inout1);
2029	&lea	($out,&DWP(0x20,$out));
2030	&movups	($ivec,&QWP(0x20,$inp));
2031	&sub	($len,0x30);
2032	&jmp	(&label("cbc_dec_tail_collected"));
2033
2034&set_label("cbc_dec_four",16);
2035	&call	("_aesni_decrypt4");
2036	&movups	($rndkey1,&QWP(0x10,$inp));
2037	&movups	($rndkey0,&QWP(0x20,$inp));
2038	&xorps	($inout0,$ivec);
2039	&movups	($ivec,&QWP(0x30,$inp));
2040	&xorps	($inout1,$in0);
2041	&movups	(&QWP(0,$out),$inout0);
2042	&xorps	($inout2,$rndkey1);
2043	&movups	(&QWP(0x10,$out),$inout1);
2044	&pxor	($inout1,$inout1);
2045	&xorps	($inout3,$rndkey0);
2046	&movups	(&QWP(0x20,$out),$inout2);
2047	&pxor	($inout2,$inout2);
2048	&lea	($out,&DWP(0x30,$out));
2049	&movaps	($inout0,$inout3);
2050	&pxor	($inout3,$inout3);
2051	&sub	($len,0x40);
2052	&jmp	(&label("cbc_dec_tail_collected"));
2053
2054&set_label("cbc_dec_clear_tail_collected",16);
2055	&pxor	($inout1,$inout1);
2056	&pxor	($inout2,$inout2);
2057	&pxor	($inout3,$inout3);
2058	&pxor	($inout4,$inout4);
2059&set_label("cbc_dec_tail_collected");
2060	&and	($len,15);
2061	&jnz	(&label("cbc_dec_tail_partial"));
2062	&movups	(&QWP(0,$out),$inout0);
2063	&pxor	($rndkey0,$rndkey0);
2064	&jmp	(&label("cbc_ret"));
2065
2066&set_label("cbc_dec_tail_partial",16);
2067	&movaps	(&QWP(0,"esp"),$inout0);
2068	&pxor	($rndkey0,$rndkey0);
2069	&mov	("ecx",16);
2070	&mov	($inp,"esp");
2071	&sub	("ecx",$len);
2072	&data_word(0xA4F3F689);		# rep movsb
2073	&movdqa	(&QWP(0,"esp"),$inout0);
2074
2075&set_label("cbc_ret");
2076	&mov	("esp",&DWP(16,"esp"));	# pull original %esp
2077	&mov	($key_,&wparam(4));
2078	&pxor	($inout0,$inout0);
2079	&pxor	($rndkey1,$rndkey1);
2080	&movups	(&QWP(0,$key_),$ivec);	# output IV
2081	&pxor	($ivec,$ivec);
2082&set_label("cbc_abort");
2083&function_end("${PREFIX}_cbc_encrypt");
2084
2085######################################################################
2086# Mechanical port from aesni-x86_64.pl.
2087#
2088# _aesni_set_encrypt_key is private interface,
2089# input:
2090#	"eax"	const unsigned char *userKey
2091#	$rounds	int bits
2092#	$key	AES_KEY *key
2093# output:
2094#	"eax"	return code
2095#	$round	rounds
2096
2097&function_begin_B("_aesni_set_encrypt_key");
2098	&push	("ebp");
2099	&push	("ebx");
2100	&test	("eax","eax");
2101	&jz	(&label("bad_pointer"));
2102	&test	($key,$key);
2103	&jz	(&label("bad_pointer"));
2104
2105	&call	(&label("pic"));
2106&set_label("pic");
2107	&blindpop("ebx");
2108	&lea	("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
2109
2110	&picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
2111	&movups	("xmm0",&QWP(0,"eax"));	# pull first 128 bits of *userKey
2112	&xorps	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
2113	&mov	("ebp",&DWP(4,"ebp"));
2114	&lea	($key,&DWP(16,$key));
2115	&and	("ebp",1<<28|1<<11);	# AVX and XOP bits
2116	&cmp	($rounds,256);
2117	&je	(&label("14rounds"));
2118	&cmp	($rounds,192);
2119	&je	(&label("12rounds"));
2120	&cmp	($rounds,128);
2121	&jne	(&label("bad_keybits"));
2122
2123&set_label("10rounds",16);
2124	&cmp		("ebp",1<<28);
2125	&je		(&label("10rounds_alt"));
2126
2127	&mov		($rounds,9);
2128	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
2129	&aeskeygenassist("xmm1","xmm0",0x01);		# round 1
2130	&call		(&label("key_128_cold"));
2131	&aeskeygenassist("xmm1","xmm0",0x2);		# round 2
2132	&call		(&label("key_128"));
2133	&aeskeygenassist("xmm1","xmm0",0x04);		# round 3
2134	&call		(&label("key_128"));
2135	&aeskeygenassist("xmm1","xmm0",0x08);		# round 4
2136	&call		(&label("key_128"));
2137	&aeskeygenassist("xmm1","xmm0",0x10);		# round 5
2138	&call		(&label("key_128"));
2139	&aeskeygenassist("xmm1","xmm0",0x20);		# round 6
2140	&call		(&label("key_128"));
2141	&aeskeygenassist("xmm1","xmm0",0x40);		# round 7
2142	&call		(&label("key_128"));
2143	&aeskeygenassist("xmm1","xmm0",0x80);		# round 8
2144	&call		(&label("key_128"));
2145	&aeskeygenassist("xmm1","xmm0",0x1b);		# round 9
2146	&call		(&label("key_128"));
2147	&aeskeygenassist("xmm1","xmm0",0x36);		# round 10
2148	&call		(&label("key_128"));
2149	&$movekey	(&QWP(0,$key),"xmm0");
2150	&mov		(&DWP(80,$key),$rounds);
2151
2152	&jmp	(&label("good_key"));
2153
2154&set_label("key_128",16);
2155	&$movekey	(&QWP(0,$key),"xmm0");
2156	&lea		($key,&DWP(16,$key));
2157&set_label("key_128_cold");
2158	&shufps		("xmm4","xmm0",0b00010000);
2159	&xorps		("xmm0","xmm4");
2160	&shufps		("xmm4","xmm0",0b10001100);
2161	&xorps		("xmm0","xmm4");
2162	&shufps		("xmm1","xmm1",0b11111111);	# critical path
2163	&xorps		("xmm0","xmm1");
2164	&ret();
2165
2166&set_label("10rounds_alt",16);
2167	&movdqa		("xmm5",&QWP(0x00,"ebx"));
2168	&mov		($rounds,8);
2169	&movdqa		("xmm4",&QWP(0x20,"ebx"));
2170	&movdqa		("xmm2","xmm0");
2171	&movdqu		(&QWP(-16,$key),"xmm0");
2172
2173&set_label("loop_key128");
2174	&pshufb		("xmm0","xmm5");
2175	&aesenclast	("xmm0","xmm4");
2176	&pslld		("xmm4",1);
2177	&lea		($key,&DWP(16,$key));
2178
2179	&movdqa		("xmm3","xmm2");
2180	&pslldq		("xmm2",4);
2181	&pxor		("xmm3","xmm2");
2182	&pslldq		("xmm2",4);
2183	&pxor		("xmm3","xmm2");
2184	&pslldq		("xmm2",4);
2185	&pxor		("xmm2","xmm3");
2186
2187	&pxor		("xmm0","xmm2");
2188	&movdqu		(&QWP(-16,$key),"xmm0");
2189	&movdqa		("xmm2","xmm0");
2190
2191	&dec		($rounds);
2192	&jnz		(&label("loop_key128"));
2193
2194	&movdqa		("xmm4",&QWP(0x30,"ebx"));
2195
2196	&pshufb		("xmm0","xmm5");
2197	&aesenclast	("xmm0","xmm4");
2198	&pslld		("xmm4",1);
2199
2200	&movdqa		("xmm3","xmm2");
2201	&pslldq		("xmm2",4);
2202	&pxor		("xmm3","xmm2");
2203	&pslldq		("xmm2",4);
2204	&pxor		("xmm3","xmm2");
2205	&pslldq		("xmm2",4);
2206	&pxor		("xmm2","xmm3");
2207
2208	&pxor		("xmm0","xmm2");
2209	&movdqu		(&QWP(0,$key),"xmm0");
2210
2211	&movdqa		("xmm2","xmm0");
2212	&pshufb		("xmm0","xmm5");
2213	&aesenclast	("xmm0","xmm4");
2214
2215	&movdqa		("xmm3","xmm2");
2216	&pslldq		("xmm2",4);
2217	&pxor		("xmm3","xmm2");
2218	&pslldq		("xmm2",4);
2219	&pxor		("xmm3","xmm2");
2220	&pslldq		("xmm2",4);
2221	&pxor		("xmm2","xmm3");
2222
2223	&pxor		("xmm0","xmm2");
2224	&movdqu		(&QWP(16,$key),"xmm0");
2225
2226	&mov		($rounds,9);
2227	&mov		(&DWP(96,$key),$rounds);
2228
2229	&jmp	(&label("good_key"));
2230
2231&set_label("12rounds",16);
2232	&movq		("xmm2",&QWP(16,"eax"));	# remaining 1/3 of *userKey
2233	&cmp		("ebp",1<<28);
2234	&je		(&label("12rounds_alt"));
2235
2236	&mov		($rounds,11);
2237	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
2238	&aeskeygenassist("xmm1","xmm2",0x01);		# round 1,2
2239	&call		(&label("key_192a_cold"));
2240	&aeskeygenassist("xmm1","xmm2",0x02);		# round 2,3
2241	&call		(&label("key_192b"));
2242	&aeskeygenassist("xmm1","xmm2",0x04);		# round 4,5
2243	&call		(&label("key_192a"));
2244	&aeskeygenassist("xmm1","xmm2",0x08);		# round 5,6
2245	&call		(&label("key_192b"));
2246	&aeskeygenassist("xmm1","xmm2",0x10);		# round 7,8
2247	&call		(&label("key_192a"));
2248	&aeskeygenassist("xmm1","xmm2",0x20);		# round 8,9
2249	&call		(&label("key_192b"));
2250	&aeskeygenassist("xmm1","xmm2",0x40);		# round 10,11
2251	&call		(&label("key_192a"));
2252	&aeskeygenassist("xmm1","xmm2",0x80);		# round 11,12
2253	&call		(&label("key_192b"));
2254	&$movekey	(&QWP(0,$key),"xmm0");
2255	&mov		(&DWP(48,$key),$rounds);
2256
2257	&jmp	(&label("good_key"));
2258
2259&set_label("key_192a",16);
2260	&$movekey	(&QWP(0,$key),"xmm0");
2261	&lea		($key,&DWP(16,$key));
2262&set_label("key_192a_cold",16);
2263	&movaps		("xmm5","xmm2");
2264&set_label("key_192b_warm");
2265	&shufps		("xmm4","xmm0",0b00010000);
2266	&movdqa		("xmm3","xmm2");
2267	&xorps		("xmm0","xmm4");
2268	&shufps		("xmm4","xmm0",0b10001100);
2269	&pslldq		("xmm3",4);
2270	&xorps		("xmm0","xmm4");
2271	&pshufd		("xmm1","xmm1",0b01010101);	# critical path
2272	&pxor		("xmm2","xmm3");
2273	&pxor		("xmm0","xmm1");
2274	&pshufd		("xmm3","xmm0",0b11111111);
2275	&pxor		("xmm2","xmm3");
2276	&ret();
2277
2278&set_label("key_192b",16);
2279	&movaps		("xmm3","xmm0");
2280	&shufps		("xmm5","xmm0",0b01000100);
2281	&$movekey	(&QWP(0,$key),"xmm5");
2282	&shufps		("xmm3","xmm2",0b01001110);
2283	&$movekey	(&QWP(16,$key),"xmm3");
2284	&lea		($key,&DWP(32,$key));
2285	&jmp		(&label("key_192b_warm"));
2286
2287&set_label("12rounds_alt",16);
2288	&movdqa		("xmm5",&QWP(0x10,"ebx"));
2289	&movdqa		("xmm4",&QWP(0x20,"ebx"));
2290	&mov		($rounds,8);
2291	&movdqu		(&QWP(-16,$key),"xmm0");
2292
2293&set_label("loop_key192");
2294	&movq		(&QWP(0,$key),"xmm2");
2295	&movdqa		("xmm1","xmm2");
2296	&pshufb		("xmm2","xmm5");
2297	&aesenclast	("xmm2","xmm4");
2298	&pslld		("xmm4",1);
2299	&lea		($key,&DWP(24,$key));
2300
2301	&movdqa		("xmm3","xmm0");
2302	&pslldq		("xmm0",4);
2303	&pxor		("xmm3","xmm0");
2304	&pslldq		("xmm0",4);
2305	&pxor		("xmm3","xmm0");
2306	&pslldq		("xmm0",4);
2307	&pxor		("xmm0","xmm3");
2308
2309	&pshufd		("xmm3","xmm0",0xff);
2310	&pxor		("xmm3","xmm1");
2311	&pslldq		("xmm1",4);
2312	&pxor		("xmm3","xmm1");
2313
2314	&pxor		("xmm0","xmm2");
2315	&pxor		("xmm2","xmm3");
2316	&movdqu		(&QWP(-16,$key),"xmm0");
2317
2318	&dec		($rounds);
2319	&jnz		(&label("loop_key192"));
2320
2321	&mov	($rounds,11);
2322	&mov	(&DWP(32,$key),$rounds);
2323
2324	&jmp	(&label("good_key"));
2325
2326&set_label("14rounds",16);
2327	&movups		("xmm2",&QWP(16,"eax"));	# remaining half of *userKey
2328	&lea		($key,&DWP(16,$key));
2329	&cmp		("ebp",1<<28);
2330	&je		(&label("14rounds_alt"));
2331
2332	&mov		($rounds,13);
2333	&$movekey	(&QWP(-32,$key),"xmm0");	# round 0
2334	&$movekey	(&QWP(-16,$key),"xmm2");	# round 1
2335	&aeskeygenassist("xmm1","xmm2",0x01);		# round 2
2336	&call		(&label("key_256a_cold"));
2337	&aeskeygenassist("xmm1","xmm0",0x01);		# round 3
2338	&call		(&label("key_256b"));
2339	&aeskeygenassist("xmm1","xmm2",0x02);		# round 4
2340	&call		(&label("key_256a"));
2341	&aeskeygenassist("xmm1","xmm0",0x02);		# round 5
2342	&call		(&label("key_256b"));
2343	&aeskeygenassist("xmm1","xmm2",0x04);		# round 6
2344	&call		(&label("key_256a"));
2345	&aeskeygenassist("xmm1","xmm0",0x04);		# round 7
2346	&call		(&label("key_256b"));
2347	&aeskeygenassist("xmm1","xmm2",0x08);		# round 8
2348	&call		(&label("key_256a"));
2349	&aeskeygenassist("xmm1","xmm0",0x08);		# round 9
2350	&call		(&label("key_256b"));
2351	&aeskeygenassist("xmm1","xmm2",0x10);		# round 10
2352	&call		(&label("key_256a"));
2353	&aeskeygenassist("xmm1","xmm0",0x10);		# round 11
2354	&call		(&label("key_256b"));
2355	&aeskeygenassist("xmm1","xmm2",0x20);		# round 12
2356	&call		(&label("key_256a"));
2357	&aeskeygenassist("xmm1","xmm0",0x20);		# round 13
2358	&call		(&label("key_256b"));
2359	&aeskeygenassist("xmm1","xmm2",0x40);		# round 14
2360	&call		(&label("key_256a"));
2361	&$movekey	(&QWP(0,$key),"xmm0");
2362	&mov		(&DWP(16,$key),$rounds);
2363	&xor		("eax","eax");
2364
2365	&jmp	(&label("good_key"));
2366
2367&set_label("key_256a",16);
2368	&$movekey	(&QWP(0,$key),"xmm2");
2369	&lea		($key,&DWP(16,$key));
2370&set_label("key_256a_cold");
2371	&shufps		("xmm4","xmm0",0b00010000);
2372	&xorps		("xmm0","xmm4");
2373	&shufps		("xmm4","xmm0",0b10001100);
2374	&xorps		("xmm0","xmm4");
2375	&shufps		("xmm1","xmm1",0b11111111);	# critical path
2376	&xorps		("xmm0","xmm1");
2377	&ret();
2378
2379&set_label("key_256b",16);
2380	&$movekey	(&QWP(0,$key),"xmm0");
2381	&lea		($key,&DWP(16,$key));
2382
2383	&shufps		("xmm4","xmm2",0b00010000);
2384	&xorps		("xmm2","xmm4");
2385	&shufps		("xmm4","xmm2",0b10001100);
2386	&xorps		("xmm2","xmm4");
2387	&shufps		("xmm1","xmm1",0b10101010);	# critical path
2388	&xorps		("xmm2","xmm1");
2389	&ret();
2390
2391&set_label("14rounds_alt",16);
2392	&movdqa		("xmm5",&QWP(0x00,"ebx"));
2393	&movdqa		("xmm4",&QWP(0x20,"ebx"));
2394	&mov		($rounds,7);
2395	&movdqu		(&QWP(-32,$key),"xmm0");
2396	&movdqa		("xmm1","xmm2");
2397	&movdqu		(&QWP(-16,$key),"xmm2");
2398
2399&set_label("loop_key256");
2400	&pshufb		("xmm2","xmm5");
2401	&aesenclast	("xmm2","xmm4");
2402
2403	&movdqa		("xmm3","xmm0");
2404	&pslldq		("xmm0",4);
2405	&pxor		("xmm3","xmm0");
2406	&pslldq		("xmm0",4);
2407	&pxor		("xmm3","xmm0");
2408	&pslldq		("xmm0",4);
2409	&pxor		("xmm0","xmm3");
2410	&pslld		("xmm4",1);
2411
2412	&pxor		("xmm0","xmm2");
2413	&movdqu		(&QWP(0,$key),"xmm0");
2414
2415	&dec		($rounds);
2416	&jz		(&label("done_key256"));
2417
2418	&pshufd		("xmm2","xmm0",0xff);
2419	&pxor		("xmm3","xmm3");
2420	&aesenclast	("xmm2","xmm3");
2421
2422	&movdqa		("xmm3","xmm1")
2423	&pslldq		("xmm1",4);
2424	&pxor		("xmm3","xmm1");
2425	&pslldq		("xmm1",4);
2426	&pxor		("xmm3","xmm1");
2427	&pslldq		("xmm1",4);
2428	&pxor		("xmm1","xmm3");
2429
2430	&pxor		("xmm2","xmm1");
2431	&movdqu		(&QWP(16,$key),"xmm2");
2432	&lea		($key,&DWP(32,$key));
2433	&movdqa		("xmm1","xmm2");
2434	&jmp		(&label("loop_key256"));
2435
2436&set_label("done_key256");
2437	&mov		($rounds,13);
2438	&mov		(&DWP(16,$key),$rounds);
2439
2440&set_label("good_key");
2441	&pxor	("xmm0","xmm0");
2442	&pxor	("xmm1","xmm1");
2443	&pxor	("xmm2","xmm2");
2444	&pxor	("xmm3","xmm3");
2445	&pxor	("xmm4","xmm4");
2446	&pxor	("xmm5","xmm5");
2447	&xor	("eax","eax");
2448	&pop	("ebx");
2449	&pop	("ebp");
2450	&ret	();
2451
2452&set_label("bad_pointer",4);
2453	&mov	("eax",-1);
2454	&pop	("ebx");
2455	&pop	("ebp");
2456	&ret	();
2457&set_label("bad_keybits",4);
2458	&pxor	("xmm0","xmm0");
2459	&mov	("eax",-2);
2460	&pop	("ebx");
2461	&pop	("ebp");
2462	&ret	();
2463&function_end_B("_aesni_set_encrypt_key");
2464
# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Public entry point: loads the three stack arguments into the
# registers _aesni_set_encrypt_key expects ("eax", $rounds, $key) and
# returns that routine's result in "eax" unchanged.
&function_begin_B("${PREFIX}_set_encrypt_key");
	&mov	("eax",&wparam(0));
	&mov	($rounds,&wparam(1));
	&mov	($key,&wparam(2));
	&call	("_aesni_set_encrypt_key");
	&ret	();
&function_end_B("${PREFIX}_set_encrypt_key");
2474
# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Builds the encryption key schedule with _aesni_set_encrypt_key and,
# if that succeeded, converts it in place: the order of the round keys
# is reversed and every round key except the first and the last is
# passed through aesimc.  A non-zero code from _aesni_set_encrypt_key
# is returned as is, without touching the schedule further.
&function_begin_B("${PREFIX}_set_decrypt_key");
	&mov	("eax",&wparam(0));
	&mov	($rounds,&wparam(1));
	&mov	($key,&wparam(2));
	&call	("_aesni_set_encrypt_key");
	&mov	($key,&wparam(2));	# reload, the callee advanced $key
	&shl	($rounds,4);	# rounds-1 after _aesni_set_encrypt_key
	&test	("eax","eax");
	&jnz	(&label("dec_key_ret"));
	&lea	("eax",&DWP(16,$key,$rounds));	# end of key schedule

	# $key walks up from the first round key, "eax" walks down from the
	# last one.
	&$movekey	("xmm0",&QWP(0,$key));	# just swap
	&$movekey	("xmm1",&QWP(0,"eax"));
	&$movekey	(&QWP(0,"eax"),"xmm0");
	&$movekey	(&QWP(0,$key),"xmm1");
	&lea		($key,&DWP(16,$key));
	&lea		("eax",&DWP(-16,"eax"));

&set_label("dec_key_inverse");
	&$movekey	("xmm0",&QWP(0,$key));	# swap and inverse
	&$movekey	("xmm1",&QWP(0,"eax"));
	&aesimc		("xmm0","xmm0");
	&aesimc		("xmm1","xmm1");
	&lea		($key,&DWP(16,$key));
	&lea		("eax",&DWP(-16,"eax"));
	&$movekey	(&QWP(16,"eax"),"xmm0");	# pointers already moved, hence +16/-16
	&$movekey	(&QWP(-16,$key),"xmm1");
	&cmp		("eax",$key);
	&ja		(&label("dec_key_inverse"));	# until the two pointers meet

	&$movekey	("xmm0",&QWP(0,$key));	# inverse middle
	&aesimc		("xmm0","xmm0");
	&$movekey	(&QWP(0,$key),"xmm0");

	&pxor		("xmm0","xmm0");	# scrub round keys from registers
	&pxor		("xmm1","xmm1");
	&xor		("eax","eax");		# return success
&set_label("dec_key_ret");
	&ret	();
&function_end_B("${PREFIX}_set_decrypt_key");
2517
# Constants for the "_alt" paths of _aesni_set_encrypt_key, which
# addresses them relative to "key_const" through "ebx":
#	0x00	pshufb mask for the 128- and 256-bit loops
#	0x10	pshufb mask for the 192-bit loop
#	0x20	initial round constant, shifted left by one per round
#	0x30	round constant 0x1b, loaded before round 9 of a 128-bit key
&set_label("key_const",64);
&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d);
&data_word(0x04070605,0x04070605,0x04070605,0x04070605);
&data_word(1,1,1,1);
&data_word(0x1b,0x1b,0x1b,0x1b);
&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");

# Emit the accumulated assembly.
&asm_finish();
2526