#! /usr/bin/env perl
# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for Intel AES-NI extension. In
# OpenSSL context it's used with Intel engine, but can also be used as
# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
# details].
#
# Performance.
#
# Given aes(enc|dec) instructions' latency, asymptotic performance for
# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
# processed with 128-bit key. And given their throughput, asymptotic
# performance for parallelizable modes is 1.25 cycles per byte. Being
# an asymptotic limit, it's not something you commonly achieve in
# reality, but how close does one get? Below are results collected for
# different modes and block sizes. Pairs of numbers are for en-/
# decryption.
#
#	16-byte     64-byte     256-byte    1-KB        8-KB
# ECB	4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26	1.26/1.26
# CTR	5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
# CBC	4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
# CCM	5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07
# OFB	5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
# CFB	5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
#
# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
# that the otherwise used 'openssl speed -evp aes-128-??? -engine aesni
# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
# The results were collected with a specially crafted speed.c benchmark
# in order to compare them with results reported in "Intel Advanced
# Encryption Standard (AES) New Instruction Set" White Paper Revision
# 3.0 dated May 2010. All above results are consistently better. This
# module also provides better performance for block sizes smaller than
# 128 bytes at points *not* represented in the above table.
#
# Looking at the results for the 8-KB buffer.
#
# CFB and OFB results are far from the limit, because the implementation
# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
# single-block aesni_encrypt, which is not the optimal way to go.
# CBC encrypt result is unexpectedly high and there is no documented
# explanation for it. Seemingly there is a small penalty for feeding
# the result back to the AES unit the way it's done in CBC mode. There
# is nothing one can do and the result appears optimal. CCM result is
# identical to CBC, because CBC-MAC is essentially CBC encrypt without
# saving output. CCM CTR "stays invisible," because it's neatly
# interleaved with CBC-MAC. This provides ~30% improvement over a
# "straightforward" CCM implementation with CTR and CBC-MAC performed
# disjointly. Parallelizable modes practically achieve the theoretical
# limit.
#
# Looking at how results vary with buffer size.
#
# Curves are practically saturated at 1-KB buffer size. In most cases
# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
# The CTR curve doesn't follow this pattern and is the "slowest"-changing
# one, with the "256-byte" result being 87% of "8-KB." This is because
# the overhead in CTR mode is the most computationally intensive.
# Small-block CCM decrypt is slower than encrypt, because the first CTR
# and last CBC-MAC iterations can't be interleaved.
#
# Results for 192- and 256-bit keys.
#
# EVP-free results were observed to scale perfectly with the number of
# rounds for larger block sizes, i.e. the 192-bit result being 10/12
# times lower and the 256-bit one 10/14. Well, in the CBC encrypt case
# differences are a tad smaller, because the above mentioned penalty
# biases all results by the same constant value. In a similar way
# function call overhead affects small-block performance, as well as
# OFB and CFB results. Differences are not large, most common
# coefficients are 10/11.7 and 10/13.4 (as opposed to 10/12.0 and
# 10/14.0), but one can observe even 10/11.2 and 10/12.4 (CTR, OFB,
# CFB)...
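#
# As a worked example of that scaling (an estimate only, derived from
# the 8-KB ECB figure above): 1.26 cycles per byte with a 128-bit key
# would be expected to become roughly 1.26*12/10 = 1.51 with a 192-bit
# key and 1.26*14/10 = 1.76 with a 256-bit key for large
# parallelizable blocks.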

# January 2011
#
# While Westmere processor features 6 cycles latency for aes[enc|dec]
# instructions, which can be scheduled every second cycle, Sandy
# Bridge spends 8 cycles per instruction, but it can schedule them
# every cycle. This means that code targeting Westmere would perform
# suboptimally on Sandy Bridge. Therefore this update.
#
# In addition, non-parallelizable CBC encrypt (as well as CCM) is
# optimized. Relative improvement might appear modest, 8% on Westmere,
# but in absolute terms it's 3.77 cycles per byte encrypted with
# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
# should be compared to asymptotic limits of 3.75 for Westmere and
# 5.00 for Sandy Bridge. Actually, the fact that they get this close
# to asymptotic limits is quite amazing. Indeed, the limit is
# calculated as latency times number of rounds, 10 for 128-bit key,
# and divided by 16, the number of bytes in block, or in other words
# it accounts *solely* for aesenc instructions. But there are extra
# instructions, and numbers so close to the asymptotic limits mean
# that it's as if it takes as little as *one* additional cycle to
# execute all of them. How is it possible? It is possible thanks to
# out-of-order execution logic, which manages to overlap post-
# processing of previous block, things like saving the output, with
# actual encryption of current block, as well as pre-processing of
# current block, things like fetching input and xor-ing it with
# 0-round element of the key schedule, with actual encryption of
# previous block. Keep this in mind...
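#
# For reference, the limit works out as follows (numbers taken from
# the text above):
#
#	Westmere:     6 cycles latency * 10 rounds / 16 bytes = 3.75 c/b
#	Sandy Bridge: 8 cycles latency * 10 rounds / 16 bytes = 5.00 c/b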
#
# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
# performance is achieved by interleaving instructions working on
# independent blocks. In which case asymptotic limit for such modes
# can be obtained by dividing above mentioned numbers by AES
# instructions' interleave factor. Westmere can execute at most 3
# instructions at a time, meaning that optimal interleave factor is 3,
# and that's where the "magic" number of 1.25 comes from. "Optimal
# interleave factor" means that increasing the interleave factor
# further does not improve performance. The formula has proven to
# reflect reality pretty well on Westmere... Sandy Bridge on the other
# hand can execute up to 8 AES instructions at a time, so how does
# varying the interleave factor affect the performance? Here is a
# table for ECB (numbers are cycles per byte processed with 128-bit
# key):
#
# instruction interleave factor		3x	6x	8x
# theoretical asymptotic limit		1.67	0.83	0.625
# measured performance for 8KB block	1.05	0.86	0.84
#
# "as if" interleave factor		4.7x	5.8x	6.0x
#
# Further data for other parallelizable modes:
#
# CBC decrypt				1.16	0.93	0.74
# CTR					1.14	0.91	0.74
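#
# (The theoretical figures above are simply the 5.00 c/b Sandy Bridge
# limit divided by the interleave factor, e.g. 5.00/3 = 1.67 and
# 5.00/8 = 0.625; the "as if" factors are the same limit divided by
# the measured result, e.g. 5.00/1.05 = ~4.7.)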
#
# Well, given the 3x column it's probably inappropriate to call the
# limit asymptotic, if it can be surpassed, isn't it? What happens
# there? Rewind to the CBC paragraph for the answer. Yes, out-of-order
# execution magic is responsible for this. The processor overlaps not
# only the additional instructions with AES ones, but even AES
# instructions processing adjacent triplets of independent blocks. In
# the 6x case additional instructions still claim a disproportionally
# small amount of additional cycles, but in the 8x case the number of
# instructions must be a tad too high for the out-of-order logic to
# cope with, and the AES unit remains underutilized... As you can see
# 8x interleave is hardly justifiable, so there is no need to feel bad
# that 32-bit aesni-x86.pl utilizes 6x interleave because of limited
# register bank capacity.
#
# Higher interleave factors do have a negative impact on Westmere
# performance. While for ECB mode it's a negligible ~1.5%, other
# parallelizable modes perform ~5% worse, which is outweighed by the
# ~25% improvement on Sandy Bridge. To balance the regression on
# Westmere, CTR mode was implemented with a 6x aesenc interleave
# factor.

# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
# in CTR mode, the AES instruction interleave factor was chosen to be
# 6x.

# November 2015
#
# Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was
# chosen to be 6x.

######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
#		CBC en-/decrypt	CTR	XTS	ECB	OCB
# Westmere	3.77/1.25	1.25	1.25	1.26
# * Bridge	5.07/0.74	0.75	0.90	0.85	0.98
# Haswell	4.44/0.63	0.63	0.73	0.63	0.70
# Skylake	2.62/0.63	0.63	0.63	0.63
# Silvermont	5.75/3.54	3.56	4.12	3.87(*)	4.11
# Goldmont	3.82/1.26	1.26	1.29	1.29	1.50
# Bulldozer	5.77/0.70	0.72	0.90	0.70	0.95
#
# (*)	Atom Silvermont ECB result is suboptimal because of penalties
#	incurred by operations on %xmm8-15. As ECB is not considered
#	critical, nothing was done to mitigate the problem.
$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-x86_64.pl:-)

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

$code=".text\n";
$code.=".extern	OPENSSL_ia32cap_P\n";

$rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
$inp="%rdi";
$out="%rsi";
$len="%rdx";
$key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
$ivp="%r8";	# cbc, ctr, ...

$rnds_="%r10d";	# backup copy for $rounds
$key_="%r11";	# backup copy for $key

# %xmm register layout
$rndkey0="%xmm0";	$rndkey1="%xmm1";
$inout0="%xmm2";	$inout1="%xmm3";
$inout2="%xmm4";	$inout3="%xmm5";
$inout4="%xmm6";	$inout5="%xmm7";
$inout6="%xmm8";	$inout7="%xmm9";

$in2="%xmm6";		$in1="%xmm7";	# used in CBC decrypt, CTR, ...
$in0="%xmm8";		$iv="%xmm9";

# Inline version of internal aesni_[en|de]crypt1.
#
# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
# cycles which take care of loop variables...
239{ my $sn;
240sub aesni_generate1 {
241my ($p,$key,$rounds,$inout,$ivec)=@_;	$inout=$inout0 if (!defined($inout));
242++$sn;
243$code.=<<___;
244	$movkey	($key),$rndkey0
245	$movkey	16($key),$rndkey1
246___
247$code.=<<___ if (defined($ivec));
248	xorps	$rndkey0,$ivec
249	lea	32($key),$key
250	xorps	$ivec,$inout
251___
252$code.=<<___ if (!defined($ivec));
253	lea	32($key),$key
254	xorps	$rndkey0,$inout
255___
256$code.=<<___;
257.Loop_${p}1_$sn:
258	aes${p}	$rndkey1,$inout
259	dec	$rounds
260	$movkey	($key),$rndkey1
261	lea	16($key),$key
262	jnz	.Loop_${p}1_$sn	# loop body is 16 bytes
263	aes${p}last	$rndkey1,$inout
264___
265}}
266# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
267#
268{ my ($inp,$out,$key) = @_4args;
269
270$code.=<<___;
271.globl	${PREFIX}_encrypt
272.type	${PREFIX}_encrypt,\@abi-omnipotent
273.align	16
274${PREFIX}_encrypt:
275	movups	($inp),$inout0		# load input
276	mov	240($key),$rounds	# key->rounds
277___
278	&aesni_generate1("enc",$key,$rounds);
279$code.=<<___;
280	 pxor	$rndkey0,$rndkey0	# clear register bank
281	 pxor	$rndkey1,$rndkey1
282	movups	$inout0,($out)		# output
283	 pxor	$inout0,$inout0
284	ret
285.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt
286
287.globl	${PREFIX}_decrypt
288.type	${PREFIX}_decrypt,\@abi-omnipotent
289.align	16
290${PREFIX}_decrypt:
291	movups	($inp),$inout0		# load input
292	mov	240($key),$rounds	# key->rounds
293___
294	&aesni_generate1("dec",$key,$rounds);
295$code.=<<___;
296	 pxor	$rndkey0,$rndkey0	# clear register bank
297	 pxor	$rndkey1,$rndkey1
298	movups	$inout0,($out)		# output
299	 pxor	$inout0,$inout0
300	ret
301.size	${PREFIX}_decrypt, .-${PREFIX}_decrypt
302___
303}
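# A minimal C-level usage sketch for the two routines above (a sketch
# only; aesni_set_encrypt_key is assumed to be the matching key-schedule
# routine of this module):
#
#	AES_KEY ks;
#	unsigned char in[16], out[16];
#
#	aesni_set_encrypt_key(user_key, 128, &ks);	/* assumed setup routine */
#	aesni_encrypt(in, out, &ks);			/* exactly one 16-byte block */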
304
# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why were 3x subroutines originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when the subroutine's throughput is virtually the
# same as that of the non-interleaved subroutine [for number of input
# blocks up to 3]. This is why it originally made no sense to implement
# a 2x subroutine. But times change and it became appropriate to spend
# the extra 192 bytes on a 2x subroutine on account of Atom Silvermont.
# For processors that can schedule aes[enc|dec] every cycle the optimal
# interleave factor equals the corresponding instruction latency. 8x is
# optimal for * Bridge and "super-optimal" for other Intel CPUs...
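#
# (In other words the optimal factor is latency divided by issue
# interval: 6/2 = 3 on Westmere versus 8/1 = 8 on * Bridge, which is
# where the figures above come from.)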
317
318sub aesni_generate2 {
319my $dir=shift;
320# As already mentioned it takes in $key and $rounds, which are *not*
321# preserved. $inout[0-1] is cipher/clear text...
322$code.=<<___;
323.type	_aesni_${dir}rypt2,\@abi-omnipotent
324.align	16
325_aesni_${dir}rypt2:
326	$movkey	($key),$rndkey0
327	shl	\$4,$rounds
328	$movkey	16($key),$rndkey1
329	xorps	$rndkey0,$inout0
330	xorps	$rndkey0,$inout1
331	$movkey	32($key),$rndkey0
332	lea	32($key,$rounds),$key
333	neg	%rax				# $rounds
334	add	\$16,%rax
335
336.L${dir}_loop2:
337	aes${dir}	$rndkey1,$inout0
338	aes${dir}	$rndkey1,$inout1
339	$movkey		($key,%rax),$rndkey1
340	add		\$32,%rax
341	aes${dir}	$rndkey0,$inout0
342	aes${dir}	$rndkey0,$inout1
343	$movkey		-16($key,%rax),$rndkey0
344	jnz		.L${dir}_loop2
345
346	aes${dir}	$rndkey1,$inout0
347	aes${dir}	$rndkey1,$inout1
348	aes${dir}last	$rndkey0,$inout0
349	aes${dir}last	$rndkey0,$inout1
350	ret
351.size	_aesni_${dir}rypt2,.-_aesni_${dir}rypt2
352___
353}
354sub aesni_generate3 {
355my $dir=shift;
356# As already mentioned it takes in $key and $rounds, which are *not*
357# preserved. $inout[0-2] is cipher/clear text...
358$code.=<<___;
359.type	_aesni_${dir}rypt3,\@abi-omnipotent
360.align	16
361_aesni_${dir}rypt3:
362	$movkey	($key),$rndkey0
363	shl	\$4,$rounds
364	$movkey	16($key),$rndkey1
365	xorps	$rndkey0,$inout0
366	xorps	$rndkey0,$inout1
367	xorps	$rndkey0,$inout2
368	$movkey	32($key),$rndkey0
369	lea	32($key,$rounds),$key
370	neg	%rax				# $rounds
371	add	\$16,%rax
372
373.L${dir}_loop3:
374	aes${dir}	$rndkey1,$inout0
375	aes${dir}	$rndkey1,$inout1
376	aes${dir}	$rndkey1,$inout2
377	$movkey		($key,%rax),$rndkey1
378	add		\$32,%rax
379	aes${dir}	$rndkey0,$inout0
380	aes${dir}	$rndkey0,$inout1
381	aes${dir}	$rndkey0,$inout2
382	$movkey		-16($key,%rax),$rndkey0
383	jnz		.L${dir}_loop3
384
385	aes${dir}	$rndkey1,$inout0
386	aes${dir}	$rndkey1,$inout1
387	aes${dir}	$rndkey1,$inout2
388	aes${dir}last	$rndkey0,$inout0
389	aes${dir}last	$rndkey0,$inout1
390	aes${dir}last	$rndkey0,$inout2
391	ret
392.size	_aesni_${dir}rypt3,.-_aesni_${dir}rypt3
393___
394}
# 4x interleave is implemented to improve small-block performance,
# most notably [and naturally] the 4-block case, by ~30%. One can argue
# that one should have implemented 5x as well, but the improvement
# would be <20%, so it's not worth it...
399sub aesni_generate4 {
400my $dir=shift;
401# As already mentioned it takes in $key and $rounds, which are *not*
402# preserved. $inout[0-3] is cipher/clear text...
403$code.=<<___;
404.type	_aesni_${dir}rypt4,\@abi-omnipotent
405.align	16
406_aesni_${dir}rypt4:
407	$movkey	($key),$rndkey0
408	shl	\$4,$rounds
409	$movkey	16($key),$rndkey1
410	xorps	$rndkey0,$inout0
411	xorps	$rndkey0,$inout1
412	xorps	$rndkey0,$inout2
413	xorps	$rndkey0,$inout3
414	$movkey	32($key),$rndkey0
415	lea	32($key,$rounds),$key
416	neg	%rax				# $rounds
417	.byte	0x0f,0x1f,0x00
418	add	\$16,%rax
419
420.L${dir}_loop4:
421	aes${dir}	$rndkey1,$inout0
422	aes${dir}	$rndkey1,$inout1
423	aes${dir}	$rndkey1,$inout2
424	aes${dir}	$rndkey1,$inout3
425	$movkey		($key,%rax),$rndkey1
426	add		\$32,%rax
427	aes${dir}	$rndkey0,$inout0
428	aes${dir}	$rndkey0,$inout1
429	aes${dir}	$rndkey0,$inout2
430	aes${dir}	$rndkey0,$inout3
431	$movkey		-16($key,%rax),$rndkey0
432	jnz		.L${dir}_loop4
433
434	aes${dir}	$rndkey1,$inout0
435	aes${dir}	$rndkey1,$inout1
436	aes${dir}	$rndkey1,$inout2
437	aes${dir}	$rndkey1,$inout3
438	aes${dir}last	$rndkey0,$inout0
439	aes${dir}last	$rndkey0,$inout1
440	aes${dir}last	$rndkey0,$inout2
441	aes${dir}last	$rndkey0,$inout3
442	ret
443.size	_aesni_${dir}rypt4,.-_aesni_${dir}rypt4
444___
445}
446sub aesni_generate6 {
447my $dir=shift;
448# As already mentioned it takes in $key and $rounds, which are *not*
449# preserved. $inout[0-5] is cipher/clear text...
450$code.=<<___;
451.type	_aesni_${dir}rypt6,\@abi-omnipotent
452.align	16
453_aesni_${dir}rypt6:
454	$movkey		($key),$rndkey0
455	shl		\$4,$rounds
456	$movkey		16($key),$rndkey1
457	xorps		$rndkey0,$inout0
458	pxor		$rndkey0,$inout1
459	pxor		$rndkey0,$inout2
460	aes${dir}	$rndkey1,$inout0
461	lea		32($key,$rounds),$key
462	neg		%rax			# $rounds
463	aes${dir}	$rndkey1,$inout1
464	pxor		$rndkey0,$inout3
465	pxor		$rndkey0,$inout4
466	aes${dir}	$rndkey1,$inout2
467	pxor		$rndkey0,$inout5
468	$movkey		($key,%rax),$rndkey0
469	add		\$16,%rax
470	jmp		.L${dir}_loop6_enter
471.align	16
472.L${dir}_loop6:
473	aes${dir}	$rndkey1,$inout0
474	aes${dir}	$rndkey1,$inout1
475	aes${dir}	$rndkey1,$inout2
476.L${dir}_loop6_enter:
477	aes${dir}	$rndkey1,$inout3
478	aes${dir}	$rndkey1,$inout4
479	aes${dir}	$rndkey1,$inout5
480	$movkey		($key,%rax),$rndkey1
481	add		\$32,%rax
482	aes${dir}	$rndkey0,$inout0
483	aes${dir}	$rndkey0,$inout1
484	aes${dir}	$rndkey0,$inout2
485	aes${dir}	$rndkey0,$inout3
486	aes${dir}	$rndkey0,$inout4
487	aes${dir}	$rndkey0,$inout5
488	$movkey		-16($key,%rax),$rndkey0
489	jnz		.L${dir}_loop6
490
491	aes${dir}	$rndkey1,$inout0
492	aes${dir}	$rndkey1,$inout1
493	aes${dir}	$rndkey1,$inout2
494	aes${dir}	$rndkey1,$inout3
495	aes${dir}	$rndkey1,$inout4
496	aes${dir}	$rndkey1,$inout5
497	aes${dir}last	$rndkey0,$inout0
498	aes${dir}last	$rndkey0,$inout1
499	aes${dir}last	$rndkey0,$inout2
500	aes${dir}last	$rndkey0,$inout3
501	aes${dir}last	$rndkey0,$inout4
502	aes${dir}last	$rndkey0,$inout5
503	ret
504.size	_aesni_${dir}rypt6,.-_aesni_${dir}rypt6
505___
506}
507sub aesni_generate8 {
508my $dir=shift;
509# As already mentioned it takes in $key and $rounds, which are *not*
510# preserved. $inout[0-7] is cipher/clear text...
511$code.=<<___;
512.type	_aesni_${dir}rypt8,\@abi-omnipotent
513.align	16
514_aesni_${dir}rypt8:
515	$movkey		($key),$rndkey0
516	shl		\$4,$rounds
517	$movkey		16($key),$rndkey1
518	xorps		$rndkey0,$inout0
519	xorps		$rndkey0,$inout1
520	pxor		$rndkey0,$inout2
521	pxor		$rndkey0,$inout3
522	pxor		$rndkey0,$inout4
523	lea		32($key,$rounds),$key
524	neg		%rax			# $rounds
525	aes${dir}	$rndkey1,$inout0
526	pxor		$rndkey0,$inout5
527	pxor		$rndkey0,$inout6
528	aes${dir}	$rndkey1,$inout1
529	pxor		$rndkey0,$inout7
530	$movkey		($key,%rax),$rndkey0
531	add		\$16,%rax
532	jmp		.L${dir}_loop8_inner
533.align	16
534.L${dir}_loop8:
535	aes${dir}	$rndkey1,$inout0
536	aes${dir}	$rndkey1,$inout1
537.L${dir}_loop8_inner:
538	aes${dir}	$rndkey1,$inout2
539	aes${dir}	$rndkey1,$inout3
540	aes${dir}	$rndkey1,$inout4
541	aes${dir}	$rndkey1,$inout5
542	aes${dir}	$rndkey1,$inout6
543	aes${dir}	$rndkey1,$inout7
544.L${dir}_loop8_enter:
545	$movkey		($key,%rax),$rndkey1
546	add		\$32,%rax
547	aes${dir}	$rndkey0,$inout0
548	aes${dir}	$rndkey0,$inout1
549	aes${dir}	$rndkey0,$inout2
550	aes${dir}	$rndkey0,$inout3
551	aes${dir}	$rndkey0,$inout4
552	aes${dir}	$rndkey0,$inout5
553	aes${dir}	$rndkey0,$inout6
554	aes${dir}	$rndkey0,$inout7
555	$movkey		-16($key,%rax),$rndkey0
556	jnz		.L${dir}_loop8
557
558	aes${dir}	$rndkey1,$inout0
559	aes${dir}	$rndkey1,$inout1
560	aes${dir}	$rndkey1,$inout2
561	aes${dir}	$rndkey1,$inout3
562	aes${dir}	$rndkey1,$inout4
563	aes${dir}	$rndkey1,$inout5
564	aes${dir}	$rndkey1,$inout6
565	aes${dir}	$rndkey1,$inout7
566	aes${dir}last	$rndkey0,$inout0
567	aes${dir}last	$rndkey0,$inout1
568	aes${dir}last	$rndkey0,$inout2
569	aes${dir}last	$rndkey0,$inout3
570	aes${dir}last	$rndkey0,$inout4
571	aes${dir}last	$rndkey0,$inout5
572	aes${dir}last	$rndkey0,$inout6
573	aes${dir}last	$rndkey0,$inout7
574	ret
575.size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
576___
577}
578&aesni_generate2("enc") if ($PREFIX eq "aesni");
579&aesni_generate2("dec");
580&aesni_generate3("enc") if ($PREFIX eq "aesni");
581&aesni_generate3("dec");
582&aesni_generate4("enc") if ($PREFIX eq "aesni");
583&aesni_generate4("dec");
584&aesni_generate6("enc") if ($PREFIX eq "aesni");
585&aesni_generate6("dec");
586&aesni_generate8("enc") if ($PREFIX eq "aesni");
587&aesni_generate8("dec");
588
589if ($PREFIX eq "aesni") {
########################################################################
# void aesni_ecb_encrypt (const void *in, void *out,
#			  size_t length, const AES_KEY *key,
#			  int enc);
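#
# Rough C-level call shape (a sketch only; length is in bytes and is
# truncated to a multiple of 16 below, the 5th argument selects the
# direction, and the key schedule must match that direction):
#
#	aesni_ecb_encrypt(in, out, len, &enc_key, 1);	/* encrypt */
#	aesni_ecb_encrypt(in, out, len, &dec_key, 0);	/* decrypt */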
594$code.=<<___;
595.globl	aesni_ecb_encrypt
596.type	aesni_ecb_encrypt,\@function,5
597.align	16
598aesni_ecb_encrypt:
599___
600$code.=<<___ if ($win64);
601	lea	-0x58(%rsp),%rsp
602	movaps	%xmm6,(%rsp)		# offload $inout4..7
603	movaps	%xmm7,0x10(%rsp)
604	movaps	%xmm8,0x20(%rsp)
605	movaps	%xmm9,0x30(%rsp)
606.Lecb_enc_body:
607___
608$code.=<<___;
609	and	\$-16,$len		# if ($len<16)
610	jz	.Lecb_ret		# return
611
612	mov	240($key),$rounds	# key->rounds
613	$movkey	($key),$rndkey0
614	mov	$key,$key_		# backup $key
615	mov	$rounds,$rnds_		# backup $rounds
616	test	%r8d,%r8d		# 5th argument
617	jz	.Lecb_decrypt
618#--------------------------- ECB ENCRYPT ------------------------------#
619	cmp	\$0x80,$len		# if ($len<8*16)
620	jb	.Lecb_enc_tail		# short input
621
622	movdqu	($inp),$inout0		# load 8 input blocks
623	movdqu	0x10($inp),$inout1
624	movdqu	0x20($inp),$inout2
625	movdqu	0x30($inp),$inout3
626	movdqu	0x40($inp),$inout4
627	movdqu	0x50($inp),$inout5
628	movdqu	0x60($inp),$inout6
629	movdqu	0x70($inp),$inout7
630	lea	0x80($inp),$inp		# $inp+=8*16
631	sub	\$0x80,$len		# $len-=8*16 (can be zero)
632	jmp	.Lecb_enc_loop8_enter
633.align 16
634.Lecb_enc_loop8:
635	movups	$inout0,($out)		# store 8 output blocks
636	mov	$key_,$key		# restore $key
637	movdqu	($inp),$inout0		# load 8 input blocks
638	mov	$rnds_,$rounds		# restore $rounds
639	movups	$inout1,0x10($out)
640	movdqu	0x10($inp),$inout1
641	movups	$inout2,0x20($out)
642	movdqu	0x20($inp),$inout2
643	movups	$inout3,0x30($out)
644	movdqu	0x30($inp),$inout3
645	movups	$inout4,0x40($out)
646	movdqu	0x40($inp),$inout4
647	movups	$inout5,0x50($out)
648	movdqu	0x50($inp),$inout5
649	movups	$inout6,0x60($out)
650	movdqu	0x60($inp),$inout6
651	movups	$inout7,0x70($out)
652	lea	0x80($out),$out		# $out+=8*16
653	movdqu	0x70($inp),$inout7
654	lea	0x80($inp),$inp		# $inp+=8*16
655.Lecb_enc_loop8_enter:
656
657	call	_aesni_encrypt8
658
659	sub	\$0x80,$len
660	jnc	.Lecb_enc_loop8		# loop if $len-=8*16 didn't borrow
661
662	movups	$inout0,($out)		# store 8 output blocks
663	mov	$key_,$key		# restore $key
664	movups	$inout1,0x10($out)
665	mov	$rnds_,$rounds		# restore $rounds
666	movups	$inout2,0x20($out)
667	movups	$inout3,0x30($out)
668	movups	$inout4,0x40($out)
669	movups	$inout5,0x50($out)
670	movups	$inout6,0x60($out)
671	movups	$inout7,0x70($out)
672	lea	0x80($out),$out		# $out+=8*16
673	add	\$0x80,$len		# restore real remaining $len
674	jz	.Lecb_ret		# done if ($len==0)
675
676.Lecb_enc_tail:				# $len is less than 8*16
677	movups	($inp),$inout0
678	cmp	\$0x20,$len
679	jb	.Lecb_enc_one
680	movups	0x10($inp),$inout1
681	je	.Lecb_enc_two
682	movups	0x20($inp),$inout2
683	cmp	\$0x40,$len
684	jb	.Lecb_enc_three
685	movups	0x30($inp),$inout3
686	je	.Lecb_enc_four
687	movups	0x40($inp),$inout4
688	cmp	\$0x60,$len
689	jb	.Lecb_enc_five
690	movups	0x50($inp),$inout5
691	je	.Lecb_enc_six
692	movdqu	0x60($inp),$inout6
693	xorps	$inout7,$inout7
694	call	_aesni_encrypt8
695	movups	$inout0,($out)		# store 7 output blocks
696	movups	$inout1,0x10($out)
697	movups	$inout2,0x20($out)
698	movups	$inout3,0x30($out)
699	movups	$inout4,0x40($out)
700	movups	$inout5,0x50($out)
701	movups	$inout6,0x60($out)
702	jmp	.Lecb_ret
703.align	16
704.Lecb_enc_one:
705___
706	&aesni_generate1("enc",$key,$rounds);
707$code.=<<___;
708	movups	$inout0,($out)		# store one output block
709	jmp	.Lecb_ret
710.align	16
711.Lecb_enc_two:
712	call	_aesni_encrypt2
713	movups	$inout0,($out)		# store 2 output blocks
714	movups	$inout1,0x10($out)
715	jmp	.Lecb_ret
716.align	16
717.Lecb_enc_three:
718	call	_aesni_encrypt3
719	movups	$inout0,($out)		# store 3 output blocks
720	movups	$inout1,0x10($out)
721	movups	$inout2,0x20($out)
722	jmp	.Lecb_ret
723.align	16
724.Lecb_enc_four:
725	call	_aesni_encrypt4
726	movups	$inout0,($out)		# store 4 output blocks
727	movups	$inout1,0x10($out)
728	movups	$inout2,0x20($out)
729	movups	$inout3,0x30($out)
730	jmp	.Lecb_ret
731.align	16
732.Lecb_enc_five:
733	xorps	$inout5,$inout5
734	call	_aesni_encrypt6
735	movups	$inout0,($out)		# store 5 output blocks
736	movups	$inout1,0x10($out)
737	movups	$inout2,0x20($out)
738	movups	$inout3,0x30($out)
739	movups	$inout4,0x40($out)
740	jmp	.Lecb_ret
741.align	16
742.Lecb_enc_six:
743	call	_aesni_encrypt6
744	movups	$inout0,($out)		# store 6 output blocks
745	movups	$inout1,0x10($out)
746	movups	$inout2,0x20($out)
747	movups	$inout3,0x30($out)
748	movups	$inout4,0x40($out)
749	movups	$inout5,0x50($out)
750	jmp	.Lecb_ret
751#--------------------------- ECB DECRYPT ------------------------------#
752.align	16
753.Lecb_decrypt:
754	cmp	\$0x80,$len		# if ($len<8*16)
755	jb	.Lecb_dec_tail		# short input
756
757	movdqu	($inp),$inout0		# load 8 input blocks
758	movdqu	0x10($inp),$inout1
759	movdqu	0x20($inp),$inout2
760	movdqu	0x30($inp),$inout3
761	movdqu	0x40($inp),$inout4
762	movdqu	0x50($inp),$inout5
763	movdqu	0x60($inp),$inout6
764	movdqu	0x70($inp),$inout7
765	lea	0x80($inp),$inp		# $inp+=8*16
766	sub	\$0x80,$len		# $len-=8*16 (can be zero)
767	jmp	.Lecb_dec_loop8_enter
768.align 16
769.Lecb_dec_loop8:
770	movups	$inout0,($out)		# store 8 output blocks
771	mov	$key_,$key		# restore $key
772	movdqu	($inp),$inout0		# load 8 input blocks
773	mov	$rnds_,$rounds		# restore $rounds
774	movups	$inout1,0x10($out)
775	movdqu	0x10($inp),$inout1
776	movups	$inout2,0x20($out)
777	movdqu	0x20($inp),$inout2
778	movups	$inout3,0x30($out)
779	movdqu	0x30($inp),$inout3
780	movups	$inout4,0x40($out)
781	movdqu	0x40($inp),$inout4
782	movups	$inout5,0x50($out)
783	movdqu	0x50($inp),$inout5
784	movups	$inout6,0x60($out)
785	movdqu	0x60($inp),$inout6
786	movups	$inout7,0x70($out)
787	lea	0x80($out),$out		# $out+=8*16
788	movdqu	0x70($inp),$inout7
789	lea	0x80($inp),$inp		# $inp+=8*16
790.Lecb_dec_loop8_enter:
791
792	call	_aesni_decrypt8
793
794	$movkey	($key_),$rndkey0
795	sub	\$0x80,$len
796	jnc	.Lecb_dec_loop8		# loop if $len-=8*16 didn't borrow
797
798	movups	$inout0,($out)		# store 8 output blocks
799	 pxor	$inout0,$inout0		# clear register bank
800	mov	$key_,$key		# restore $key
801	movups	$inout1,0x10($out)
802	 pxor	$inout1,$inout1
803	mov	$rnds_,$rounds		# restore $rounds
804	movups	$inout2,0x20($out)
805	 pxor	$inout2,$inout2
806	movups	$inout3,0x30($out)
807	 pxor	$inout3,$inout3
808	movups	$inout4,0x40($out)
809	 pxor	$inout4,$inout4
810	movups	$inout5,0x50($out)
811	 pxor	$inout5,$inout5
812	movups	$inout6,0x60($out)
813	 pxor	$inout6,$inout6
814	movups	$inout7,0x70($out)
815	 pxor	$inout7,$inout7
816	lea	0x80($out),$out		# $out+=8*16
817	add	\$0x80,$len		# restore real remaining $len
818	jz	.Lecb_ret		# done if ($len==0)
819
820.Lecb_dec_tail:
821	movups	($inp),$inout0
822	cmp	\$0x20,$len
823	jb	.Lecb_dec_one
824	movups	0x10($inp),$inout1
825	je	.Lecb_dec_two
826	movups	0x20($inp),$inout2
827	cmp	\$0x40,$len
828	jb	.Lecb_dec_three
829	movups	0x30($inp),$inout3
830	je	.Lecb_dec_four
831	movups	0x40($inp),$inout4
832	cmp	\$0x60,$len
833	jb	.Lecb_dec_five
834	movups	0x50($inp),$inout5
835	je	.Lecb_dec_six
836	movups	0x60($inp),$inout6
837	$movkey	($key),$rndkey0
838	xorps	$inout7,$inout7
839	call	_aesni_decrypt8
840	movups	$inout0,($out)		# store 7 output blocks
841	 pxor	$inout0,$inout0		# clear register bank
842	movups	$inout1,0x10($out)
843	 pxor	$inout1,$inout1
844	movups	$inout2,0x20($out)
845	 pxor	$inout2,$inout2
846	movups	$inout3,0x30($out)
847	 pxor	$inout3,$inout3
848	movups	$inout4,0x40($out)
849	 pxor	$inout4,$inout4
850	movups	$inout5,0x50($out)
851	 pxor	$inout5,$inout5
852	movups	$inout6,0x60($out)
853	 pxor	$inout6,$inout6
854	 pxor	$inout7,$inout7
855	jmp	.Lecb_ret
856.align	16
857.Lecb_dec_one:
858___
859	&aesni_generate1("dec",$key,$rounds);
860$code.=<<___;
861	movups	$inout0,($out)		# store one output block
862	 pxor	$inout0,$inout0		# clear register bank
863	jmp	.Lecb_ret
864.align	16
865.Lecb_dec_two:
866	call	_aesni_decrypt2
867	movups	$inout0,($out)		# store 2 output blocks
868	 pxor	$inout0,$inout0		# clear register bank
869	movups	$inout1,0x10($out)
870	 pxor	$inout1,$inout1
871	jmp	.Lecb_ret
872.align	16
873.Lecb_dec_three:
874	call	_aesni_decrypt3
875	movups	$inout0,($out)		# store 3 output blocks
876	 pxor	$inout0,$inout0		# clear register bank
877	movups	$inout1,0x10($out)
878	 pxor	$inout1,$inout1
879	movups	$inout2,0x20($out)
880	 pxor	$inout2,$inout2
881	jmp	.Lecb_ret
882.align	16
883.Lecb_dec_four:
884	call	_aesni_decrypt4
885	movups	$inout0,($out)		# store 4 output blocks
886	 pxor	$inout0,$inout0		# clear register bank
887	movups	$inout1,0x10($out)
888	 pxor	$inout1,$inout1
889	movups	$inout2,0x20($out)
890	 pxor	$inout2,$inout2
891	movups	$inout3,0x30($out)
892	 pxor	$inout3,$inout3
893	jmp	.Lecb_ret
894.align	16
895.Lecb_dec_five:
896	xorps	$inout5,$inout5
897	call	_aesni_decrypt6
898	movups	$inout0,($out)		# store 5 output blocks
899	 pxor	$inout0,$inout0		# clear register bank
900	movups	$inout1,0x10($out)
901	 pxor	$inout1,$inout1
902	movups	$inout2,0x20($out)
903	 pxor	$inout2,$inout2
904	movups	$inout3,0x30($out)
905	 pxor	$inout3,$inout3
906	movups	$inout4,0x40($out)
907	 pxor	$inout4,$inout4
908	 pxor	$inout5,$inout5
909	jmp	.Lecb_ret
910.align	16
911.Lecb_dec_six:
912	call	_aesni_decrypt6
913	movups	$inout0,($out)		# store 6 output blocks
914	 pxor	$inout0,$inout0		# clear register bank
915	movups	$inout1,0x10($out)
916	 pxor	$inout1,$inout1
917	movups	$inout2,0x20($out)
918	 pxor	$inout2,$inout2
919	movups	$inout3,0x30($out)
920	 pxor	$inout3,$inout3
921	movups	$inout4,0x40($out)
922	 pxor	$inout4,$inout4
923	movups	$inout5,0x50($out)
924	 pxor	$inout5,$inout5
925
926.Lecb_ret:
927	xorps	$rndkey0,$rndkey0	# %xmm0
928	pxor	$rndkey1,$rndkey1
929___
930$code.=<<___ if ($win64);
931	movaps	(%rsp),%xmm6
932	movaps	%xmm0,(%rsp)		# clear stack
933	movaps	0x10(%rsp),%xmm7
934	movaps	%xmm0,0x10(%rsp)
935	movaps	0x20(%rsp),%xmm8
936	movaps	%xmm0,0x20(%rsp)
937	movaps	0x30(%rsp),%xmm9
938	movaps	%xmm0,0x30(%rsp)
939	lea	0x58(%rsp),%rsp
940.Lecb_enc_ret:
941___
942$code.=<<___;
943	ret
944.size	aesni_ecb_encrypt,.-aesni_ecb_encrypt
945___
946
947{
######################################################################
# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec,char *cmac);
#
# Handles only complete blocks, operates on 64-bit counter and
# does not update *ivec! Nor does it finalize CMAC value
# (see engine/eng_aesni.c for details)
#
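# Rough call shape (a sketch only; CCM block formatting, partial blocks
# and CMAC finalization remain the caller's responsibility): ivec holds
# the initial counter block and cmac the running CBC-MAC state, which is
# updated in place while ivec is left untouched.
#
#	aesni_ccm64_encrypt_blocks(in, out, nblocks, &enc_key, ivec, cmac);
#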
957{
958my $cmac="%r9";	# 6th argument
959
960my $increment="%xmm9";
961my $iv="%xmm6";
962my $bswap_mask="%xmm7";
963
964$code.=<<___;
965.globl	aesni_ccm64_encrypt_blocks
966.type	aesni_ccm64_encrypt_blocks,\@function,6
967.align	16
968aesni_ccm64_encrypt_blocks:
969___
970$code.=<<___ if ($win64);
971	lea	-0x58(%rsp),%rsp
972	movaps	%xmm6,(%rsp)		# $iv
973	movaps	%xmm7,0x10(%rsp)	# $bswap_mask
974	movaps	%xmm8,0x20(%rsp)	# $in0
975	movaps	%xmm9,0x30(%rsp)	# $increment
976.Lccm64_enc_body:
977___
978$code.=<<___;
979	mov	240($key),$rounds		# key->rounds
980	movdqu	($ivp),$iv
981	movdqa	.Lincrement64(%rip),$increment
982	movdqa	.Lbswap_mask(%rip),$bswap_mask
983
984	shl	\$4,$rounds
985	mov	\$16,$rnds_
986	lea	0($key),$key_
987	movdqu	($cmac),$inout1
988	movdqa	$iv,$inout0
989	lea	32($key,$rounds),$key		# end of key schedule
990	pshufb	$bswap_mask,$iv
991	sub	%rax,%r10			# twisted $rounds
992	jmp	.Lccm64_enc_outer
993.align	16
994.Lccm64_enc_outer:
995	$movkey	($key_),$rndkey0
996	mov	%r10,%rax
997	movups	($inp),$in0			# load inp
998
999	xorps	$rndkey0,$inout0		# counter
1000	$movkey	16($key_),$rndkey1
1001	xorps	$in0,$rndkey0
1002	xorps	$rndkey0,$inout1		# cmac^=inp
1003	$movkey	32($key_),$rndkey0
1004
1005.Lccm64_enc2_loop:
1006	aesenc	$rndkey1,$inout0
1007	aesenc	$rndkey1,$inout1
1008	$movkey	($key,%rax),$rndkey1
1009	add	\$32,%rax
1010	aesenc	$rndkey0,$inout0
1011	aesenc	$rndkey0,$inout1
1012	$movkey	-16($key,%rax),$rndkey0
1013	jnz	.Lccm64_enc2_loop
1014	aesenc	$rndkey1,$inout0
1015	aesenc	$rndkey1,$inout1
1016	paddq	$increment,$iv
1017	dec	$len				# $len-- ($len is in blocks)
1018	aesenclast	$rndkey0,$inout0
1019	aesenclast	$rndkey0,$inout1
1020
1021	lea	16($inp),$inp
1022	xorps	$inout0,$in0			# inp ^= E(iv)
1023	movdqa	$iv,$inout0
1024	movups	$in0,($out)			# save output
1025	pshufb	$bswap_mask,$inout0
1026	lea	16($out),$out			# $out+=16
1027	jnz	.Lccm64_enc_outer		# loop if ($len!=0)
1028
1029	 pxor	$rndkey0,$rndkey0		# clear register bank
1030	 pxor	$rndkey1,$rndkey1
1031	 pxor	$inout0,$inout0
1032	movups	$inout1,($cmac)			# store resulting mac
1033	 pxor	$inout1,$inout1
1034	 pxor	$in0,$in0
1035	 pxor	$iv,$iv
1036___
1037$code.=<<___ if ($win64);
1038	movaps	(%rsp),%xmm6
1039	movaps	%xmm0,(%rsp)			# clear stack
1040	movaps	0x10(%rsp),%xmm7
1041	movaps	%xmm0,0x10(%rsp)
1042	movaps	0x20(%rsp),%xmm8
1043	movaps	%xmm0,0x20(%rsp)
1044	movaps	0x30(%rsp),%xmm9
1045	movaps	%xmm0,0x30(%rsp)
1046	lea	0x58(%rsp),%rsp
1047.Lccm64_enc_ret:
1048___
1049$code.=<<___;
1050	ret
1051.size	aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
1052___
1053######################################################################
1054$code.=<<___;
1055.globl	aesni_ccm64_decrypt_blocks
1056.type	aesni_ccm64_decrypt_blocks,\@function,6
1057.align	16
1058aesni_ccm64_decrypt_blocks:
1059___
1060$code.=<<___ if ($win64);
1061	lea	-0x58(%rsp),%rsp
1062	movaps	%xmm6,(%rsp)		# $iv
1063	movaps	%xmm7,0x10(%rsp)	# $bswap_mask
1064	movaps	%xmm8,0x20(%rsp)	# $in8
1065	movaps	%xmm9,0x30(%rsp)	# $increment
1066.Lccm64_dec_body:
1067___
1068$code.=<<___;
1069	mov	240($key),$rounds		# key->rounds
1070	movups	($ivp),$iv
1071	movdqu	($cmac),$inout1
1072	movdqa	.Lincrement64(%rip),$increment
1073	movdqa	.Lbswap_mask(%rip),$bswap_mask
1074
1075	movaps	$iv,$inout0
1076	mov	$rounds,$rnds_
1077	mov	$key,$key_
1078	pshufb	$bswap_mask,$iv
1079___
1080	&aesni_generate1("enc",$key,$rounds);
1081$code.=<<___;
1082	shl	\$4,$rnds_
1083	mov	\$16,$rounds
1084	movups	($inp),$in0			# load inp
1085	paddq	$increment,$iv
1086	lea	16($inp),$inp			# $inp+=16
1087	sub	%r10,%rax			# twisted $rounds
1088	lea	32($key_,$rnds_),$key		# end of key schedule
1089	mov	%rax,%r10
1090	jmp	.Lccm64_dec_outer
1091.align	16
1092.Lccm64_dec_outer:
1093	xorps	$inout0,$in0			# inp ^= E(iv)
1094	movdqa	$iv,$inout0
1095	movups	$in0,($out)			# save output
1096	lea	16($out),$out			# $out+=16
1097	pshufb	$bswap_mask,$inout0
1098
1099	sub	\$1,$len			# $len-- ($len is in blocks)
1100	jz	.Lccm64_dec_break		# if ($len==0) break
1101
1102	$movkey	($key_),$rndkey0
1103	mov	%r10,%rax
1104	$movkey	16($key_),$rndkey1
1105	xorps	$rndkey0,$in0
1106	xorps	$rndkey0,$inout0
1107	xorps	$in0,$inout1			# cmac^=out
1108	$movkey	32($key_),$rndkey0
1109	jmp	.Lccm64_dec2_loop
1110.align	16
1111.Lccm64_dec2_loop:
1112	aesenc	$rndkey1,$inout0
1113	aesenc	$rndkey1,$inout1
1114	$movkey	($key,%rax),$rndkey1
1115	add	\$32,%rax
1116	aesenc	$rndkey0,$inout0
1117	aesenc	$rndkey0,$inout1
1118	$movkey	-16($key,%rax),$rndkey0
1119	jnz	.Lccm64_dec2_loop
1120	movups	($inp),$in0			# load input
1121	paddq	$increment,$iv
1122	aesenc	$rndkey1,$inout0
1123	aesenc	$rndkey1,$inout1
1124	aesenclast	$rndkey0,$inout0
1125	aesenclast	$rndkey0,$inout1
1126	lea	16($inp),$inp			# $inp+=16
1127	jmp	.Lccm64_dec_outer
1128
1129.align	16
1130.Lccm64_dec_break:
1131	#xorps	$in0,$inout1			# cmac^=out
1132	mov	240($key_),$rounds
1133___
1134	&aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
1135$code.=<<___;
1136	 pxor	$rndkey0,$rndkey0		# clear register bank
1137	 pxor	$rndkey1,$rndkey1
1138	 pxor	$inout0,$inout0
1139	movups	$inout1,($cmac)			# store resulting mac
1140	 pxor	$inout1,$inout1
1141	 pxor	$in0,$in0
1142	 pxor	$iv,$iv
1143___
1144$code.=<<___ if ($win64);
1145	movaps	(%rsp),%xmm6
1146	movaps	%xmm0,(%rsp)			# clear stack
1147	movaps	0x10(%rsp),%xmm7
1148	movaps	%xmm0,0x10(%rsp)
1149	movaps	0x20(%rsp),%xmm8
1150	movaps	%xmm0,0x20(%rsp)
1151	movaps	0x30(%rsp),%xmm9
1152	movaps	%xmm0,0x30(%rsp)
1153	lea	0x58(%rsp),%rsp
1154.Lccm64_dec_ret:
1155___
1156$code.=<<___;
1157	ret
1158.size	aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
1159___
1160}
######################################################################
# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
# does not update *ivec! (see crypto/modes/ctr128.c for details)
#
# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
# Keywords are full unroll and modulo-schedule counter calculations
# with zero-round key xor.
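#
# The per-block idea in C-like pseudo-code (an illustration only, not
# of the exact scheduling; helper names are hypothetical):
#
#	/* the last 4 bytes of ivec are the big-endian 32-bit counter */
#	block   = ivec with its last 4 bytes set to bswap(ctr+i);
#	block  ^= key->rd_key[0..3];	   /* 0-round xor folded in up front,  */
#	block   = rounds_1_to_last(block); /* while other blocks are in flight */
#	out[i]  = in[i] ^ block;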
1173{
1174my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
1175my ($key0,$ctr)=("%ebp","${ivp}d");
1176my $frame_size = 0x80 + ($win64?160:0);
1177
1178$code.=<<___;
1179.globl	aesni_ctr32_encrypt_blocks
1180.type	aesni_ctr32_encrypt_blocks,\@function,5
1181.align	16
1182aesni_ctr32_encrypt_blocks:
1183	cmp	\$1,$len
1184	jne	.Lctr32_bulk
1185
1186	# handle single block without allocating stack frame,
1187	# useful when handling edges
1188	movups	($ivp),$inout0
1189	movups	($inp),$inout1
1190	mov	240($key),%edx			# key->rounds
1191___
1192	&aesni_generate1("enc",$key,"%edx");
1193$code.=<<___;
1194	 pxor	$rndkey0,$rndkey0		# clear register bank
1195	 pxor	$rndkey1,$rndkey1
1196	xorps	$inout1,$inout0
1197	 pxor	$inout1,$inout1
1198	movups	$inout0,($out)
1199	 xorps	$inout0,$inout0
1200	jmp	.Lctr32_epilogue
1201
1202.align	16
1203.Lctr32_bulk:
1204	lea	(%rsp),$key_			# use $key_ as frame pointer
1205	push	%rbp
1206	sub	\$$frame_size,%rsp
1207	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
1208___
1209$code.=<<___ if ($win64);
1210	movaps	%xmm6,-0xa8($key_)		# offload everything
1211	movaps	%xmm7,-0x98($key_)
1212	movaps	%xmm8,-0x88($key_)
1213	movaps	%xmm9,-0x78($key_)
1214	movaps	%xmm10,-0x68($key_)
1215	movaps	%xmm11,-0x58($key_)
1216	movaps	%xmm12,-0x48($key_)
1217	movaps	%xmm13,-0x38($key_)
1218	movaps	%xmm14,-0x28($key_)
1219	movaps	%xmm15,-0x18($key_)
1220.Lctr32_body:
1221___
1222$code.=<<___;
1223
1224	# 8 16-byte words on top of stack are counter values
1225	# xor-ed with zero-round key
1226
1227	movdqu	($ivp),$inout0
1228	movdqu	($key),$rndkey0
1229	mov	12($ivp),$ctr			# counter LSB
1230	pxor	$rndkey0,$inout0
1231	mov	12($key),$key0			# 0-round key LSB
1232	movdqa	$inout0,0x00(%rsp)		# populate counter block
1233	bswap	$ctr
1234	movdqa	$inout0,$inout1
1235	movdqa	$inout0,$inout2
1236	movdqa	$inout0,$inout3
1237	movdqa	$inout0,0x40(%rsp)
1238	movdqa	$inout0,0x50(%rsp)
1239	movdqa	$inout0,0x60(%rsp)
1240	mov	%rdx,%r10			# about to borrow %rdx
1241	movdqa	$inout0,0x70(%rsp)
1242
1243	lea	1($ctr),%rax
1244	 lea	2($ctr),%rdx
1245	bswap	%eax
1246	 bswap	%edx
1247	xor	$key0,%eax
1248	 xor	$key0,%edx
1249	pinsrd	\$3,%eax,$inout1
1250	lea	3($ctr),%rax
1251	movdqa	$inout1,0x10(%rsp)
1252	 pinsrd	\$3,%edx,$inout2
1253	bswap	%eax
1254	 mov	%r10,%rdx			# restore %rdx
1255	 lea	4($ctr),%r10
1256	 movdqa	$inout2,0x20(%rsp)
1257	xor	$key0,%eax
1258	 bswap	%r10d
1259	pinsrd	\$3,%eax,$inout3
1260	 xor	$key0,%r10d
1261	movdqa	$inout3,0x30(%rsp)
1262	lea	5($ctr),%r9
1263	 mov	%r10d,0x40+12(%rsp)
1264	bswap	%r9d
1265	 lea	6($ctr),%r10
1266	mov	240($key),$rounds		# key->rounds
1267	xor	$key0,%r9d
1268	 bswap	%r10d
1269	mov	%r9d,0x50+12(%rsp)
1270	 xor	$key0,%r10d
1271	lea	7($ctr),%r9
1272	 mov	%r10d,0x60+12(%rsp)
1273	bswap	%r9d
1274	 mov	OPENSSL_ia32cap_P+4(%rip),%r10d
1275	xor	$key0,%r9d
1276	 and	\$`1<<26|1<<22`,%r10d		# isolate XSAVE+MOVBE
1277	mov	%r9d,0x70+12(%rsp)
1278
1279	$movkey	0x10($key),$rndkey1
1280
1281	movdqa	0x40(%rsp),$inout4
1282	movdqa	0x50(%rsp),$inout5
1283
1284	cmp	\$8,$len		# $len is in blocks
1285	jb	.Lctr32_tail		# short input if ($len<8)
1286
1287	sub	\$6,$len		# $len is biased by -6
1288	cmp	\$`1<<22`,%r10d		# check for MOVBE without XSAVE
1289	je	.Lctr32_6x		# [which denotes Atom Silvermont]
1290
1291	lea	0x80($key),$key		# size optimization
1292	sub	\$2,$len		# $len is biased by -8
1293	jmp	.Lctr32_loop8
1294
1295.align	16
1296.Lctr32_6x:
1297	shl	\$4,$rounds
1298	mov	\$48,$rnds_
1299	bswap	$key0
1300	lea	32($key,$rounds),$key	# end of key schedule
1301	sub	%rax,%r10		# twisted $rounds
1302	jmp	.Lctr32_loop6
1303
1304.align	16
1305.Lctr32_loop6:
1306	 add	\$6,$ctr		# next counter value
1307	$movkey	-48($key,$rnds_),$rndkey0
1308	aesenc	$rndkey1,$inout0
1309	 mov	$ctr,%eax
1310	 xor	$key0,%eax
1311	aesenc	$rndkey1,$inout1
1312	 movbe	%eax,`0x00+12`(%rsp)	# store next counter value
1313	 lea	1($ctr),%eax
1314	aesenc	$rndkey1,$inout2
1315	 xor	$key0,%eax
1316	 movbe	%eax,`0x10+12`(%rsp)
1317	aesenc	$rndkey1,$inout3
1318	 lea	2($ctr),%eax
1319	 xor	$key0,%eax
1320	aesenc	$rndkey1,$inout4
1321	 movbe	%eax,`0x20+12`(%rsp)
1322	 lea	3($ctr),%eax
1323	aesenc	$rndkey1,$inout5
1324	$movkey	-32($key,$rnds_),$rndkey1
1325	 xor	$key0,%eax
1326
1327	aesenc	$rndkey0,$inout0
1328	 movbe	%eax,`0x30+12`(%rsp)
1329	 lea	4($ctr),%eax
1330	aesenc	$rndkey0,$inout1
1331	 xor	$key0,%eax
1332	 movbe	%eax,`0x40+12`(%rsp)
1333	aesenc	$rndkey0,$inout2
1334	 lea	5($ctr),%eax
1335	 xor	$key0,%eax
1336	aesenc	$rndkey0,$inout3
1337	 movbe	%eax,`0x50+12`(%rsp)
1338	 mov	%r10,%rax		# mov	$rnds_,$rounds
1339	aesenc	$rndkey0,$inout4
1340	aesenc	$rndkey0,$inout5
1341	$movkey	-16($key,$rnds_),$rndkey0
1342
1343	call	.Lenc_loop6
1344
1345	movdqu	($inp),$inout6		# load 6 input blocks
1346	movdqu	0x10($inp),$inout7
1347	movdqu	0x20($inp),$in0
1348	movdqu	0x30($inp),$in1
1349	movdqu	0x40($inp),$in2
1350	movdqu	0x50($inp),$in3
1351	lea	0x60($inp),$inp		# $inp+=6*16
1352	$movkey	-64($key,$rnds_),$rndkey1
1353	pxor	$inout0,$inout6		# inp^=E(ctr)
1354	movaps	0x00(%rsp),$inout0	# load next counter [xor-ed with 0 round]
1355	pxor	$inout1,$inout7
1356	movaps	0x10(%rsp),$inout1
1357	pxor	$inout2,$in0
1358	movaps	0x20(%rsp),$inout2
1359	pxor	$inout3,$in1
1360	movaps	0x30(%rsp),$inout3
1361	pxor	$inout4,$in2
1362	movaps	0x40(%rsp),$inout4
1363	pxor	$inout5,$in3
1364	movaps	0x50(%rsp),$inout5
1365	movdqu	$inout6,($out)		# store 6 output blocks
1366	movdqu	$inout7,0x10($out)
1367	movdqu	$in0,0x20($out)
1368	movdqu	$in1,0x30($out)
1369	movdqu	$in2,0x40($out)
1370	movdqu	$in3,0x50($out)
1371	lea	0x60($out),$out		# $out+=6*16
1372
1373	sub	\$6,$len
1374	jnc	.Lctr32_loop6		# loop if $len-=6 didn't borrow
1375
1376	add	\$6,$len		# restore real remaining $len
1377	jz	.Lctr32_done		# done if ($len==0)
1378
1379	lea	-48($rnds_),$rounds
1380	lea	-80($key,$rnds_),$key	# restore $key
1381	neg	$rounds
1382	shr	\$4,$rounds		# restore $rounds
1383	jmp	.Lctr32_tail
1384
1385.align	32
1386.Lctr32_loop8:
1387	 add		\$8,$ctr		# next counter value
1388	movdqa		0x60(%rsp),$inout6
1389	aesenc		$rndkey1,$inout0
1390	 mov		$ctr,%r9d
1391	movdqa		0x70(%rsp),$inout7
1392	aesenc		$rndkey1,$inout1
1393	 bswap		%r9d
1394	$movkey		0x20-0x80($key),$rndkey0
1395	aesenc		$rndkey1,$inout2
1396	 xor		$key0,%r9d
1397	 nop
1398	aesenc		$rndkey1,$inout3
1399	 mov		%r9d,0x00+12(%rsp)	# store next counter value
1400	 lea		1($ctr),%r9
1401	aesenc		$rndkey1,$inout4
1402	aesenc		$rndkey1,$inout5
1403	aesenc		$rndkey1,$inout6
1404	aesenc		$rndkey1,$inout7
1405	$movkey		0x30-0x80($key),$rndkey1
1406___
1407for($i=2;$i<8;$i++) {
1408my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
1409$code.=<<___;
1410	 bswap		%r9d
1411	aesenc		$rndkeyx,$inout0
1412	aesenc		$rndkeyx,$inout1
1413	 xor		$key0,%r9d
1414	 .byte		0x66,0x90
1415	aesenc		$rndkeyx,$inout2
1416	aesenc		$rndkeyx,$inout3
1417	 mov		%r9d,`0x10*($i-1)`+12(%rsp)
1418	 lea		$i($ctr),%r9
1419	aesenc		$rndkeyx,$inout4
1420	aesenc		$rndkeyx,$inout5
1421	aesenc		$rndkeyx,$inout6
1422	aesenc		$rndkeyx,$inout7
1423	$movkey		`0x20+0x10*$i`-0x80($key),$rndkeyx
1424___
1425}
1426$code.=<<___;
1427	 bswap		%r9d
1428	aesenc		$rndkey0,$inout0
1429	aesenc		$rndkey0,$inout1
1430	aesenc		$rndkey0,$inout2
1431	 xor		$key0,%r9d
1432	 movdqu		0x00($inp),$in0		# start loading input
1433	aesenc		$rndkey0,$inout3
1434	 mov		%r9d,0x70+12(%rsp)
1435	 cmp		\$11,$rounds
1436	aesenc		$rndkey0,$inout4
1437	aesenc		$rndkey0,$inout5
1438	aesenc		$rndkey0,$inout6
1439	aesenc		$rndkey0,$inout7
1440	$movkey		0xa0-0x80($key),$rndkey0
1441
1442	jb		.Lctr32_enc_done
1443
1444	aesenc		$rndkey1,$inout0
1445	aesenc		$rndkey1,$inout1
1446	aesenc		$rndkey1,$inout2
1447	aesenc		$rndkey1,$inout3
1448	aesenc		$rndkey1,$inout4
1449	aesenc		$rndkey1,$inout5
1450	aesenc		$rndkey1,$inout6
1451	aesenc		$rndkey1,$inout7
1452	$movkey		0xb0-0x80($key),$rndkey1
1453
1454	aesenc		$rndkey0,$inout0
1455	aesenc		$rndkey0,$inout1
1456	aesenc		$rndkey0,$inout2
1457	aesenc		$rndkey0,$inout3
1458	aesenc		$rndkey0,$inout4
1459	aesenc		$rndkey0,$inout5
1460	aesenc		$rndkey0,$inout6
1461	aesenc		$rndkey0,$inout7
1462	$movkey		0xc0-0x80($key),$rndkey0
1463	je		.Lctr32_enc_done
1464
1465	aesenc		$rndkey1,$inout0
1466	aesenc		$rndkey1,$inout1
1467	aesenc		$rndkey1,$inout2
1468	aesenc		$rndkey1,$inout3
1469	aesenc		$rndkey1,$inout4
1470	aesenc		$rndkey1,$inout5
1471	aesenc		$rndkey1,$inout6
1472	aesenc		$rndkey1,$inout7
1473	$movkey		0xd0-0x80($key),$rndkey1
1474
1475	aesenc		$rndkey0,$inout0
1476	aesenc		$rndkey0,$inout1
1477	aesenc		$rndkey0,$inout2
1478	aesenc		$rndkey0,$inout3
1479	aesenc		$rndkey0,$inout4
1480	aesenc		$rndkey0,$inout5
1481	aesenc		$rndkey0,$inout6
1482	aesenc		$rndkey0,$inout7
1483	$movkey		0xe0-0x80($key),$rndkey0
1484	jmp		.Lctr32_enc_done
1485
1486.align	16
1487.Lctr32_enc_done:
1488	movdqu		0x10($inp),$in1
1489	pxor		$rndkey0,$in0		# input^=round[last]
1490	movdqu		0x20($inp),$in2
1491	pxor		$rndkey0,$in1
1492	movdqu		0x30($inp),$in3
1493	pxor		$rndkey0,$in2
1494	movdqu		0x40($inp),$in4
1495	pxor		$rndkey0,$in3
1496	movdqu		0x50($inp),$in5
1497	pxor		$rndkey0,$in4
1498	pxor		$rndkey0,$in5
1499	aesenc		$rndkey1,$inout0
1500	aesenc		$rndkey1,$inout1
1501	aesenc		$rndkey1,$inout2
1502	aesenc		$rndkey1,$inout3
1503	aesenc		$rndkey1,$inout4
1504	aesenc		$rndkey1,$inout5
1505	aesenc		$rndkey1,$inout6
1506	aesenc		$rndkey1,$inout7
1507	movdqu		0x60($inp),$rndkey1	# borrow $rndkey1 for inp[6]
1508	lea		0x80($inp),$inp		# $inp+=8*16
1509
1510	aesenclast	$in0,$inout0		# $inN is inp[N]^round[last]
1511	pxor		$rndkey0,$rndkey1	# borrowed $rndkey
1512	movdqu		0x70-0x80($inp),$in0
1513	aesenclast	$in1,$inout1
1514	pxor		$rndkey0,$in0
1515	movdqa		0x00(%rsp),$in1		# load next counter block
1516	aesenclast	$in2,$inout2
1517	aesenclast	$in3,$inout3
1518	movdqa		0x10(%rsp),$in2
1519	movdqa		0x20(%rsp),$in3
1520	aesenclast	$in4,$inout4
1521	aesenclast	$in5,$inout5
1522	movdqa		0x30(%rsp),$in4
1523	movdqa		0x40(%rsp),$in5
1524	aesenclast	$rndkey1,$inout6
1525	movdqa		0x50(%rsp),$rndkey0
1526	$movkey		0x10-0x80($key),$rndkey1#real 1st-round key
1527	aesenclast	$in0,$inout7
1528
1529	movups		$inout0,($out)		# store 8 output blocks
1530	movdqa		$in1,$inout0
1531	movups		$inout1,0x10($out)
1532	movdqa		$in2,$inout1
1533	movups		$inout2,0x20($out)
1534	movdqa		$in3,$inout2
1535	movups		$inout3,0x30($out)
1536	movdqa		$in4,$inout3
1537	movups		$inout4,0x40($out)
1538	movdqa		$in5,$inout4
1539	movups		$inout5,0x50($out)
1540	movdqa		$rndkey0,$inout5
1541	movups		$inout6,0x60($out)
1542	movups		$inout7,0x70($out)
1543	lea		0x80($out),$out		# $out+=8*16
1544
1545	sub	\$8,$len
1546	jnc	.Lctr32_loop8			# loop if $len-=8 didn't borrow
1547
	add	\$8,$len			# restore real remaining $len
1549	jz	.Lctr32_done			# done if ($len==0)
1550	lea	-0x80($key),$key
1551
1552.Lctr32_tail:
1553	# note that at this point $inout0..5 are populated with
1554	# counter values xor-ed with 0-round key
1555	lea	16($key),$key
1556	cmp	\$4,$len
1557	jb	.Lctr32_loop3
1558	je	.Lctr32_loop4
1559
1560	# if ($len>4) compute 7 E(counter)
1561	shl		\$4,$rounds
1562	movdqa		0x60(%rsp),$inout6
1563	pxor		$inout7,$inout7
1564
1565	$movkey		16($key),$rndkey0
1566	aesenc		$rndkey1,$inout0
1567	aesenc		$rndkey1,$inout1
1568	lea		32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter
1569	neg		%rax
1570	aesenc		$rndkey1,$inout2
1571	add		\$16,%rax		# prepare for .Lenc_loop8_enter
1572	 movups		($inp),$in0
1573	aesenc		$rndkey1,$inout3
1574	aesenc		$rndkey1,$inout4
1575	 movups		0x10($inp),$in1		# pre-load input
1576	 movups		0x20($inp),$in2
1577	aesenc		$rndkey1,$inout5
1578	aesenc		$rndkey1,$inout6
1579
1580	call            .Lenc_loop8_enter
1581
1582	movdqu	0x30($inp),$in3
1583	pxor	$in0,$inout0
1584	movdqu	0x40($inp),$in0
1585	pxor	$in1,$inout1
1586	movdqu	$inout0,($out)			# store output
1587	pxor	$in2,$inout2
1588	movdqu	$inout1,0x10($out)
1589	pxor	$in3,$inout3
1590	movdqu	$inout2,0x20($out)
1591	pxor	$in0,$inout4
1592	movdqu	$inout3,0x30($out)
1593	movdqu	$inout4,0x40($out)
1594	cmp	\$6,$len
1595	jb	.Lctr32_done			# $len was 5, stop store
1596
1597	movups	0x50($inp),$in1
1598	xorps	$in1,$inout5
1599	movups	$inout5,0x50($out)
1600	je	.Lctr32_done			# $len was 6, stop store
1601
1602	movups	0x60($inp),$in2
1603	xorps	$in2,$inout6
1604	movups	$inout6,0x60($out)
1605	jmp	.Lctr32_done			# $len was 7, stop store
1606
1607.align	32
1608.Lctr32_loop4:
1609	aesenc		$rndkey1,$inout0
1610	lea		16($key),$key
1611	dec		$rounds
1612	aesenc		$rndkey1,$inout1
1613	aesenc		$rndkey1,$inout2
1614	aesenc		$rndkey1,$inout3
1615	$movkey		($key),$rndkey1
1616	jnz		.Lctr32_loop4
1617	aesenclast	$rndkey1,$inout0
1618	aesenclast	$rndkey1,$inout1
1619	 movups		($inp),$in0		# load input
1620	 movups		0x10($inp),$in1
1621	aesenclast	$rndkey1,$inout2
1622	aesenclast	$rndkey1,$inout3
1623	 movups		0x20($inp),$in2
1624	 movups		0x30($inp),$in3
1625
1626	xorps	$in0,$inout0
1627	movups	$inout0,($out)			# store output
1628	xorps	$in1,$inout1
1629	movups	$inout1,0x10($out)
1630	pxor	$in2,$inout2
1631	movdqu	$inout2,0x20($out)
1632	pxor	$in3,$inout3
1633	movdqu	$inout3,0x30($out)
1634	jmp	.Lctr32_done			# $len was 4, stop store
1635
1636.align	32
1637.Lctr32_loop3:
1638	aesenc		$rndkey1,$inout0
1639	lea		16($key),$key
1640	dec		$rounds
1641	aesenc		$rndkey1,$inout1
1642	aesenc		$rndkey1,$inout2
1643	$movkey		($key),$rndkey1
1644	jnz		.Lctr32_loop3
1645	aesenclast	$rndkey1,$inout0
1646	aesenclast	$rndkey1,$inout1
1647	aesenclast	$rndkey1,$inout2
1648
1649	movups	($inp),$in0			# load input
1650	xorps	$in0,$inout0
1651	movups	$inout0,($out)			# store output
1652	cmp	\$2,$len
1653	jb	.Lctr32_done			# $len was 1, stop store
1654
1655	movups	0x10($inp),$in1
1656	xorps	$in1,$inout1
1657	movups	$inout1,0x10($out)
1658	je	.Lctr32_done			# $len was 2, stop store
1659
1660	movups	0x20($inp),$in2
1661	xorps	$in2,$inout2
1662	movups	$inout2,0x20($out)		# $len was 3, stop store
1663
1664.Lctr32_done:
	xorps	%xmm0,%xmm0			# clear register bank
1666	xor	$key0,$key0
1667	pxor	%xmm1,%xmm1
1668	pxor	%xmm2,%xmm2
1669	pxor	%xmm3,%xmm3
1670	pxor	%xmm4,%xmm4
1671	pxor	%xmm5,%xmm5
1672___
1673$code.=<<___ if (!$win64);
1674	pxor	%xmm6,%xmm6
1675	pxor	%xmm7,%xmm7
1676	movaps	%xmm0,0x00(%rsp)		# clear stack
1677	pxor	%xmm8,%xmm8
1678	movaps	%xmm0,0x10(%rsp)
1679	pxor	%xmm9,%xmm9
1680	movaps	%xmm0,0x20(%rsp)
1681	pxor	%xmm10,%xmm10
1682	movaps	%xmm0,0x30(%rsp)
1683	pxor	%xmm11,%xmm11
1684	movaps	%xmm0,0x40(%rsp)
1685	pxor	%xmm12,%xmm12
1686	movaps	%xmm0,0x50(%rsp)
1687	pxor	%xmm13,%xmm13
1688	movaps	%xmm0,0x60(%rsp)
1689	pxor	%xmm14,%xmm14
1690	movaps	%xmm0,0x70(%rsp)
1691	pxor	%xmm15,%xmm15
1692___
1693$code.=<<___ if ($win64);
1694	movaps	-0xa8($key_),%xmm6
1695	movaps	%xmm0,-0xa8($key_)		# clear stack
1696	movaps	-0x98($key_),%xmm7
1697	movaps	%xmm0,-0x98($key_)
1698	movaps	-0x88($key_),%xmm8
1699	movaps	%xmm0,-0x88($key_)
1700	movaps	-0x78($key_),%xmm9
1701	movaps	%xmm0,-0x78($key_)
1702	movaps	-0x68($key_),%xmm10
1703	movaps	%xmm0,-0x68($key_)
1704	movaps	-0x58($key_),%xmm11
1705	movaps	%xmm0,-0x58($key_)
1706	movaps	-0x48($key_),%xmm12
1707	movaps	%xmm0,-0x48($key_)
1708	movaps	-0x38($key_),%xmm13
1709	movaps	%xmm0,-0x38($key_)
1710	movaps	-0x28($key_),%xmm14
1711	movaps	%xmm0,-0x28($key_)
1712	movaps	-0x18($key_),%xmm15
1713	movaps	%xmm0,-0x18($key_)
1714	movaps	%xmm0,0x00(%rsp)
1715	movaps	%xmm0,0x10(%rsp)
1716	movaps	%xmm0,0x20(%rsp)
1717	movaps	%xmm0,0x30(%rsp)
1718	movaps	%xmm0,0x40(%rsp)
1719	movaps	%xmm0,0x50(%rsp)
1720	movaps	%xmm0,0x60(%rsp)
1721	movaps	%xmm0,0x70(%rsp)
1722___
1723$code.=<<___;
1724	mov	-8($key_),%rbp
1725	lea	($key_),%rsp
1726.Lctr32_epilogue:
1727	ret
1728.size	aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
1729___
1730}
1731
######################################################################
# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2,
#	const unsigned char iv[16]);
#
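# Rough C-level call shape (a sketch only): key1 is the data-encryption
# key schedule, key2 the tweak-encryption key schedule, and iv is the
# 16-byte tweak (e.g. a sector number), which is encrypted with key2
# right below:
#
#	aesni_xts_encrypt(in, out, len, &key1, &key2, iv);
#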
1737{
1738my @tweak=map("%xmm$_",(10..15));
1739my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
1740my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
1741my $frame_size = 0x70 + ($win64?160:0);
1742my $key_ = "%rbp";	# override so that we can use %r11 as FP
1743
1744$code.=<<___;
1745.globl	aesni_xts_encrypt
1746.type	aesni_xts_encrypt,\@function,6
1747.align	16
1748aesni_xts_encrypt:
1749	lea	(%rsp),%r11			# frame pointer
1750	push	%rbp
1751	sub	\$$frame_size,%rsp
1752	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
1753___
1754$code.=<<___ if ($win64);
1755	movaps	%xmm6,-0xa8(%r11)		# offload everything
1756	movaps	%xmm7,-0x98(%r11)
1757	movaps	%xmm8,-0x88(%r11)
1758	movaps	%xmm9,-0x78(%r11)
1759	movaps	%xmm10,-0x68(%r11)
1760	movaps	%xmm11,-0x58(%r11)
1761	movaps	%xmm12,-0x48(%r11)
1762	movaps	%xmm13,-0x38(%r11)
1763	movaps	%xmm14,-0x28(%r11)
1764	movaps	%xmm15,-0x18(%r11)
1765.Lxts_enc_body:
1766___
1767$code.=<<___;
1768	movups	($ivp),$inout0			# load clear-text tweak
1769	mov	240(%r8),$rounds		# key2->rounds
1770	mov	240($key),$rnds_		# key1->rounds
1771___
1772	# generate the tweak
1773	&aesni_generate1("enc",$key2,$rounds,$inout0);
1774$code.=<<___;
1775	$movkey	($key),$rndkey0			# zero round key
1776	mov	$key,$key_			# backup $key
1777	mov	$rnds_,$rounds			# backup $rounds
1778	shl	\$4,$rnds_
1779	mov	$len,$len_			# backup $len
1780	and	\$-16,$len
1781
1782	$movkey	16($key,$rnds_),$rndkey1	# last round key
1783
1784	movdqa	.Lxts_magic(%rip),$twmask
1785	movdqa	$inout0,@tweak[5]
1786	pshufd	\$0x5f,$inout0,$twres
1787	pxor	$rndkey0,$rndkey1
1788___
    # alternative tweak calculation algorithm is based on suggestions
    # by Shay Gueron. psrad doesn't conflict with AES-NI instructions
    # and should help in the future...
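    #
    # The per-block update below is the standard XTS tweak doubling in
    # GF(2^128); in scalar C terms (a reference sketch, with the tweak T
    # held as two little-endian 64-bit halves lo and hi):
    #
    #	carry_lo = (int64_t)T.lo >> 63;		/* 0 or -1 */
    #	carry_hi = (int64_t)T.hi >> 63;
    #	T.lo = (T.lo << 1) ^ (carry_hi & 0x87);	/* x^128+x^7+x^2+x+1 */
    #	T.hi = (T.hi << 1) ^ (carry_lo & 1);	/* carry into high half */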
1792    for ($i=0;$i<4;$i++) {
1793    $code.=<<___;
1794	movdqa	$twres,$twtmp
1795	paddd	$twres,$twres
1796	movdqa	@tweak[5],@tweak[$i]
1797	psrad	\$31,$twtmp			# broadcast upper bits
1798	paddq	@tweak[5],@tweak[5]
1799	pand	$twmask,$twtmp
1800	pxor	$rndkey0,@tweak[$i]
1801	pxor	$twtmp,@tweak[5]
1802___
1803    }
1804$code.=<<___;
1805	movdqa	@tweak[5],@tweak[4]
1806	psrad	\$31,$twres
1807	paddq	@tweak[5],@tweak[5]
1808	pand	$twmask,$twres
1809	pxor	$rndkey0,@tweak[4]
1810	pxor	$twres,@tweak[5]
1811	movaps	$rndkey1,0x60(%rsp)		# save round[0]^round[last]
1812
1813	sub	\$16*6,$len
1814	jc	.Lxts_enc_short			# if $len-=6*16 borrowed
1815
1816	mov	\$16+96,$rounds
1817	lea	32($key_,$rnds_),$key		# end of key schedule
1818	sub	%r10,%rax			# twisted $rounds
1819	$movkey	16($key_),$rndkey1
1820	mov	%rax,%r10			# backup twisted $rounds
1821	lea	.Lxts_magic(%rip),%r8
1822	jmp	.Lxts_enc_grandloop
1823
1824.align	32
1825.Lxts_enc_grandloop:
1826	movdqu	`16*0`($inp),$inout0		# load input
1827	movdqa	$rndkey0,$twmask
1828	movdqu	`16*1`($inp),$inout1
1829	pxor	@tweak[0],$inout0		# input^=tweak^round[0]
1830	movdqu	`16*2`($inp),$inout2
1831	pxor	@tweak[1],$inout1
1832	 aesenc		$rndkey1,$inout0
1833	movdqu	`16*3`($inp),$inout3
1834	pxor	@tweak[2],$inout2
1835	 aesenc		$rndkey1,$inout1
1836	movdqu	`16*4`($inp),$inout4
1837	pxor	@tweak[3],$inout3
1838	 aesenc		$rndkey1,$inout2
1839	movdqu	`16*5`($inp),$inout5
1840	pxor	@tweak[5],$twmask		# round[0]^=tweak[5]
1841	 movdqa	0x60(%rsp),$twres		# load round[0]^round[last]
1842	pxor	@tweak[4],$inout4
1843	 aesenc		$rndkey1,$inout3
1844	$movkey	32($key_),$rndkey0
1845	lea	`16*6`($inp),$inp
1846	pxor	$twmask,$inout5
1847
1848	 pxor	$twres,@tweak[0]		# calculate tweaks^round[last]
1849	aesenc		$rndkey1,$inout4
1850	 pxor	$twres,@tweak[1]
1851	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks^round[last]
1852	aesenc		$rndkey1,$inout5
1853	$movkey		48($key_),$rndkey1
1854	 pxor	$twres,@tweak[2]
1855
1856	aesenc		$rndkey0,$inout0
1857	 pxor	$twres,@tweak[3]
1858	 movdqa	@tweak[1],`16*1`(%rsp)
1859	aesenc		$rndkey0,$inout1
1860	 pxor	$twres,@tweak[4]
1861	 movdqa	@tweak[2],`16*2`(%rsp)
1862	aesenc		$rndkey0,$inout2
1863	aesenc		$rndkey0,$inout3
1864	 pxor	$twres,$twmask
1865	 movdqa	@tweak[4],`16*4`(%rsp)
1866	aesenc		$rndkey0,$inout4
1867	aesenc		$rndkey0,$inout5
1868	$movkey		64($key_),$rndkey0
1869	 movdqa	$twmask,`16*5`(%rsp)
1870	pshufd	\$0x5f,@tweak[5],$twres
1871	jmp	.Lxts_enc_loop6
1872.align	32
1873.Lxts_enc_loop6:
1874	aesenc		$rndkey1,$inout0
1875	aesenc		$rndkey1,$inout1
1876	aesenc		$rndkey1,$inout2
1877	aesenc		$rndkey1,$inout3
1878	aesenc		$rndkey1,$inout4
1879	aesenc		$rndkey1,$inout5
1880	$movkey		-64($key,%rax),$rndkey1
1881	add		\$32,%rax
1882
1883	aesenc		$rndkey0,$inout0
1884	aesenc		$rndkey0,$inout1
1885	aesenc		$rndkey0,$inout2
1886	aesenc		$rndkey0,$inout3
1887	aesenc		$rndkey0,$inout4
1888	aesenc		$rndkey0,$inout5
1889	$movkey		-80($key,%rax),$rndkey0
1890	jnz		.Lxts_enc_loop6
1891
1892	movdqa	(%r8),$twmask			# start calculating next tweak
1893	movdqa	$twres,$twtmp
1894	paddd	$twres,$twres
1895	 aesenc		$rndkey1,$inout0
1896	paddq	@tweak[5],@tweak[5]
1897	psrad	\$31,$twtmp
1898	 aesenc		$rndkey1,$inout1
1899	pand	$twmask,$twtmp
1900	$movkey	($key_),@tweak[0]		# load round[0]
1901	 aesenc		$rndkey1,$inout2
1902	 aesenc		$rndkey1,$inout3
1903	 aesenc		$rndkey1,$inout4
1904	pxor	$twtmp,@tweak[5]
1905	movaps	@tweak[0],@tweak[1]		# copy round[0]
1906	 aesenc		$rndkey1,$inout5
1907	 $movkey	-64($key),$rndkey1
1908
1909	movdqa	$twres,$twtmp
1910	 aesenc		$rndkey0,$inout0
1911	paddd	$twres,$twres
1912	pxor	@tweak[5],@tweak[0]
1913	 aesenc		$rndkey0,$inout1
1914	psrad	\$31,$twtmp
1915	paddq	@tweak[5],@tweak[5]
1916	 aesenc		$rndkey0,$inout2
1917	 aesenc		$rndkey0,$inout3
1918	pand	$twmask,$twtmp
1919	movaps	@tweak[1],@tweak[2]
1920	 aesenc		$rndkey0,$inout4
1921	pxor	$twtmp,@tweak[5]
1922	movdqa	$twres,$twtmp
1923	 aesenc		$rndkey0,$inout5
1924	 $movkey	-48($key),$rndkey0
1925
1926	paddd	$twres,$twres
1927	 aesenc		$rndkey1,$inout0
1928	pxor	@tweak[5],@tweak[1]
1929	psrad	\$31,$twtmp
1930	 aesenc		$rndkey1,$inout1
1931	paddq	@tweak[5],@tweak[5]
1932	pand	$twmask,$twtmp
1933	 aesenc		$rndkey1,$inout2
1934	 aesenc		$rndkey1,$inout3
1935	 movdqa	@tweak[3],`16*3`(%rsp)
1936	pxor	$twtmp,@tweak[5]
1937	 aesenc		$rndkey1,$inout4
1938	movaps	@tweak[2],@tweak[3]
1939	movdqa	$twres,$twtmp
1940	 aesenc		$rndkey1,$inout5
1941	 $movkey	-32($key),$rndkey1
1942
1943	paddd	$twres,$twres
1944	 aesenc		$rndkey0,$inout0
1945	pxor	@tweak[5],@tweak[2]
1946	psrad	\$31,$twtmp
1947	 aesenc		$rndkey0,$inout1
1948	paddq	@tweak[5],@tweak[5]
1949	pand	$twmask,$twtmp
1950	 aesenc		$rndkey0,$inout2
1951	 aesenc		$rndkey0,$inout3
1952	 aesenc		$rndkey0,$inout4
1953	pxor	$twtmp,@tweak[5]
1954	movaps	@tweak[3],@tweak[4]
1955	 aesenc		$rndkey0,$inout5
1956
1957	movdqa	$twres,$rndkey0
1958	paddd	$twres,$twres
1959	 aesenc		$rndkey1,$inout0
1960	pxor	@tweak[5],@tweak[3]
1961	psrad	\$31,$rndkey0
1962	 aesenc		$rndkey1,$inout1
1963	paddq	@tweak[5],@tweak[5]
1964	pand	$twmask,$rndkey0
1965	 aesenc		$rndkey1,$inout2
1966	 aesenc		$rndkey1,$inout3
1967	pxor	$rndkey0,@tweak[5]
1968	$movkey		($key_),$rndkey0
1969	 aesenc		$rndkey1,$inout4
1970	 aesenc		$rndkey1,$inout5
1971	$movkey		16($key_),$rndkey1
1972
1973	pxor	@tweak[5],@tweak[4]
1974	 aesenclast	`16*0`(%rsp),$inout0
1975	psrad	\$31,$twres
1976	paddq	@tweak[5],@tweak[5]
1977	 aesenclast	`16*1`(%rsp),$inout1
1978	 aesenclast	`16*2`(%rsp),$inout2
1979	pand	$twmask,$twres
1980	mov	%r10,%rax			# restore $rounds
1981	 aesenclast	`16*3`(%rsp),$inout3
1982	 aesenclast	`16*4`(%rsp),$inout4
1983	 aesenclast	`16*5`(%rsp),$inout5
1984	pxor	$twres,@tweak[5]
1985
1986	lea	`16*6`($out),$out		# $out+=6*16
1987	movups	$inout0,`-16*6`($out)		# store 6 output blocks
1988	movups	$inout1,`-16*5`($out)
1989	movups	$inout2,`-16*4`($out)
1990	movups	$inout3,`-16*3`($out)
1991	movups	$inout4,`-16*2`($out)
1992	movups	$inout5,`-16*1`($out)
1993	sub	\$16*6,$len
1994	jnc	.Lxts_enc_grandloop		# loop if $len-=6*16 didn't borrow
1995
1996	mov	\$16+96,$rounds
1997	sub	$rnds_,$rounds
1998	mov	$key_,$key			# restore $key
1999	shr	\$4,$rounds			# restore original value
2000
2001.Lxts_enc_short:
2002	# at this point @tweak[0..5] are populated with tweak values
2003	mov	$rounds,$rnds_			# backup $rounds
2004	pxor	$rndkey0,@tweak[0]
2005	add	\$16*6,$len			# restore real remaining $len
2006	jz	.Lxts_enc_done			# done if ($len==0)
2007
2008	pxor	$rndkey0,@tweak[1]
2009	cmp	\$0x20,$len
2010	jb	.Lxts_enc_one			# $len is 1*16
2011	pxor	$rndkey0,@tweak[2]
2012	je	.Lxts_enc_two			# $len is 2*16
2013
2014	pxor	$rndkey0,@tweak[3]
2015	cmp	\$0x40,$len
2016	jb	.Lxts_enc_three			# $len is 3*16
2017	pxor	$rndkey0,@tweak[4]
2018	je	.Lxts_enc_four			# $len is 4*16
2019
2020	movdqu	($inp),$inout0			# $len is 5*16
2021	movdqu	16*1($inp),$inout1
2022	movdqu	16*2($inp),$inout2
2023	pxor	@tweak[0],$inout0
2024	movdqu	16*3($inp),$inout3
2025	pxor	@tweak[1],$inout1
2026	movdqu	16*4($inp),$inout4
2027	lea	16*5($inp),$inp			# $inp+=5*16
2028	pxor	@tweak[2],$inout2
2029	pxor	@tweak[3],$inout3
2030	pxor	@tweak[4],$inout4
2031	pxor	$inout5,$inout5
2032
2033	call	_aesni_encrypt6
2034
2035	xorps	@tweak[0],$inout0
2036	movdqa	@tweak[5],@tweak[0]
2037	xorps	@tweak[1],$inout1
2038	xorps	@tweak[2],$inout2
2039	movdqu	$inout0,($out)			# store 5 output blocks
2040	xorps	@tweak[3],$inout3
2041	movdqu	$inout1,16*1($out)
2042	xorps	@tweak[4],$inout4
2043	movdqu	$inout2,16*2($out)
2044	movdqu	$inout3,16*3($out)
2045	movdqu	$inout4,16*4($out)
2046	lea	16*5($out),$out			# $out+=5*16
2047	jmp	.Lxts_enc_done
2048
2049.align	16
2050.Lxts_enc_one:
2051	movups	($inp),$inout0
2052	lea	16*1($inp),$inp			# $inp+=1*16
2053	xorps	@tweak[0],$inout0
2054___
2055	&aesni_generate1("enc",$key,$rounds);
2056$code.=<<___;
2057	xorps	@tweak[0],$inout0
2058	movdqa	@tweak[1],@tweak[0]
2059	movups	$inout0,($out)			# store one output block
2060	lea	16*1($out),$out			# $out+=1*16
2061	jmp	.Lxts_enc_done
2062
2063.align	16
2064.Lxts_enc_two:
2065	movups	($inp),$inout0
2066	movups	16($inp),$inout1
2067	lea	32($inp),$inp			# $inp+=2*16
2068	xorps	@tweak[0],$inout0
2069	xorps	@tweak[1],$inout1
2070
2071	call	_aesni_encrypt2
2072
2073	xorps	@tweak[0],$inout0
2074	movdqa	@tweak[2],@tweak[0]
2075	xorps	@tweak[1],$inout1
2076	movups	$inout0,($out)			# store 2 output blocks
2077	movups	$inout1,16*1($out)
2078	lea	16*2($out),$out			# $out+=2*16
2079	jmp	.Lxts_enc_done
2080
2081.align	16
2082.Lxts_enc_three:
2083	movups	($inp),$inout0
2084	movups	16*1($inp),$inout1
2085	movups	16*2($inp),$inout2
2086	lea	16*3($inp),$inp			# $inp+=3*16
2087	xorps	@tweak[0],$inout0
2088	xorps	@tweak[1],$inout1
2089	xorps	@tweak[2],$inout2
2090
2091	call	_aesni_encrypt3
2092
2093	xorps	@tweak[0],$inout0
2094	movdqa	@tweak[3],@tweak[0]
2095	xorps	@tweak[1],$inout1
2096	xorps	@tweak[2],$inout2
2097	movups	$inout0,($out)			# store 3 output blocks
2098	movups	$inout1,16*1($out)
2099	movups	$inout2,16*2($out)
2100	lea	16*3($out),$out			# $out+=3*16
2101	jmp	.Lxts_enc_done
2102
2103.align	16
2104.Lxts_enc_four:
2105	movups	($inp),$inout0
2106	movups	16*1($inp),$inout1
2107	movups	16*2($inp),$inout2
2108	xorps	@tweak[0],$inout0
2109	movups	16*3($inp),$inout3
2110	lea	16*4($inp),$inp			# $inp+=4*16
2111	xorps	@tweak[1],$inout1
2112	xorps	@tweak[2],$inout2
2113	xorps	@tweak[3],$inout3
2114
2115	call	_aesni_encrypt4
2116
2117	pxor	@tweak[0],$inout0
2118	movdqa	@tweak[4],@tweak[0]
2119	pxor	@tweak[1],$inout1
2120	pxor	@tweak[2],$inout2
2121	movdqu	$inout0,($out)			# store 4 output blocks
2122	pxor	@tweak[3],$inout3
2123	movdqu	$inout1,16*1($out)
2124	movdqu	$inout2,16*2($out)
2125	movdqu	$inout3,16*3($out)
2126	lea	16*4($out),$out			# $out+=4*16
2127	jmp	.Lxts_enc_done
2128
2129.align	16
2130.Lxts_enc_done:
2131	and	\$15,$len_			# see if $len%16 is 0
2132	jz	.Lxts_enc_ret
2133	mov	$len_,$len
2134
2135.Lxts_enc_steal:
2136	movzb	($inp),%eax			# borrow $rounds ...
2137	movzb	-16($out),%ecx			# ... and $key
2138	lea	1($inp),$inp
2139	mov	%al,-16($out)
2140	mov	%cl,0($out)
2141	lea	1($out),$out
2142	sub	\$1,$len
2143	jnz	.Lxts_enc_steal
2144
2145	sub	$len_,$out			# rewind $out
2146	mov	$key_,$key			# restore $key
2147	mov	$rnds_,$rounds			# restore $rounds
2148
2149	movups	-16($out),$inout0
2150	xorps	@tweak[0],$inout0
2151___
2152	&aesni_generate1("enc",$key,$rounds);
2153$code.=<<___;
2154	xorps	@tweak[0],$inout0
2155	movups	$inout0,-16($out)
2156
2157.Lxts_enc_ret:
2158	xorps	%xmm0,%xmm0			# clear register bank
2159	pxor	%xmm1,%xmm1
2160	pxor	%xmm2,%xmm2
2161	pxor	%xmm3,%xmm3
2162	pxor	%xmm4,%xmm4
2163	pxor	%xmm5,%xmm5
2164___
2165$code.=<<___ if (!$win64);
2166	pxor	%xmm6,%xmm6
2167	pxor	%xmm7,%xmm7
2168	movaps	%xmm0,0x00(%rsp)		# clear stack
2169	pxor	%xmm8,%xmm8
2170	movaps	%xmm0,0x10(%rsp)
2171	pxor	%xmm9,%xmm9
2172	movaps	%xmm0,0x20(%rsp)
2173	pxor	%xmm10,%xmm10
2174	movaps	%xmm0,0x30(%rsp)
2175	pxor	%xmm11,%xmm11
2176	movaps	%xmm0,0x40(%rsp)
2177	pxor	%xmm12,%xmm12
2178	movaps	%xmm0,0x50(%rsp)
2179	pxor	%xmm13,%xmm13
2180	movaps	%xmm0,0x60(%rsp)
2181	pxor	%xmm14,%xmm14
2182	pxor	%xmm15,%xmm15
2183___
2184$code.=<<___ if ($win64);
2185	movaps	-0xa8(%r11),%xmm6
2186	movaps	%xmm0,-0xa8(%r11)		# clear stack
2187	movaps	-0x98(%r11),%xmm7
2188	movaps	%xmm0,-0x98(%r11)
2189	movaps	-0x88(%r11),%xmm8
2190	movaps	%xmm0,-0x88(%r11)
2191	movaps	-0x78(%r11),%xmm9
2192	movaps	%xmm0,-0x78(%r11)
2193	movaps	-0x68(%r11),%xmm10
2194	movaps	%xmm0,-0x68(%r11)
2195	movaps	-0x58(%r11),%xmm11
2196	movaps	%xmm0,-0x58(%r11)
2197	movaps	-0x48(%r11),%xmm12
2198	movaps	%xmm0,-0x48(%r11)
2199	movaps	-0x38(%r11),%xmm13
2200	movaps	%xmm0,-0x38(%r11)
2201	movaps	-0x28(%r11),%xmm14
2202	movaps	%xmm0,-0x28(%r11)
2203	movaps	-0x18(%r11),%xmm15
2204	movaps	%xmm0,-0x18(%r11)
2205	movaps	%xmm0,0x00(%rsp)
2206	movaps	%xmm0,0x10(%rsp)
2207	movaps	%xmm0,0x20(%rsp)
2208	movaps	%xmm0,0x30(%rsp)
2209	movaps	%xmm0,0x40(%rsp)
2210	movaps	%xmm0,0x50(%rsp)
2211	movaps	%xmm0,0x60(%rsp)
2212___
2213$code.=<<___;
2214	mov	-8(%r11),%rbp
2215	lea	(%r11),%rsp
2216.Lxts_enc_epilogue:
2217	ret
2218.size	aesni_xts_encrypt,.-aesni_xts_encrypt
2219___
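
# The .Lxts_enc_steal loop above (and .Lxts_dec_steal below) implements
# ciphertext stealing for a trailing partial block. What follows is a
# reference-only Perl sketch of the encrypt-side transform, never called by
# this module; names are hypothetical and $encrypt_block is an assumed
# callback mapping one 16-byte string to another ("^" is Perl's byte-string
# XOR). The generated code performs the same byte swap in place on the
# output buffer.
sub xts_steal_encrypt_ref {
    my ($encrypt_block,$tweak,$prev_ct,$tail_pt) = @_;
    my $b = length($tail_pt);				# 1..15 trailing bytes
    my $stolen = substr($prev_ct,0,$b);			# becomes final partial block C_m
    my $merged = $tail_pt.substr($prev_ct,$b);		# P_m padded with tail of old C_{m-1}
    my $new_prev = $encrypt_block->($merged ^ $tweak) ^ $tweak;
    return ($new_prev,$stolen);				# (new C_{m-1}, C_m)
}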
2220
2221$code.=<<___;
2222.globl	aesni_xts_decrypt
2223.type	aesni_xts_decrypt,\@function,6
2224.align	16
2225aesni_xts_decrypt:
2226	lea	(%rsp),%r11			# frame pointer
2227	push	%rbp
2228	sub	\$$frame_size,%rsp
2229	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
2230___
2231$code.=<<___ if ($win64);
2232	movaps	%xmm6,-0xa8(%r11)		# offload everything
2233	movaps	%xmm7,-0x98(%r11)
2234	movaps	%xmm8,-0x88(%r11)
2235	movaps	%xmm9,-0x78(%r11)
2236	movaps	%xmm10,-0x68(%r11)
2237	movaps	%xmm11,-0x58(%r11)
2238	movaps	%xmm12,-0x48(%r11)
2239	movaps	%xmm13,-0x38(%r11)
2240	movaps	%xmm14,-0x28(%r11)
2241	movaps	%xmm15,-0x18(%r11)
2242.Lxts_dec_body:
2243___
2244$code.=<<___;
2245	movups	($ivp),$inout0			# load clear-text tweak
2246	mov	240($key2),$rounds		# key2->rounds
2247	mov	240($key),$rnds_		# key1->rounds
2248___
2249	# generate the tweak
2250	&aesni_generate1("enc",$key2,$rounds,$inout0);
2251$code.=<<___;
2252	xor	%eax,%eax			# if ($len%16) len-=16;
2253	test	\$15,$len
2254	setnz	%al
2255	shl	\$4,%rax
2256	sub	%rax,$len
2257
2258	$movkey	($key),$rndkey0			# zero round key
2259	mov	$key,$key_			# backup $key
2260	mov	$rnds_,$rounds			# backup $rounds
2261	shl	\$4,$rnds_
2262	mov	$len,$len_			# backup $len
2263	and	\$-16,$len
2264
2265	$movkey	16($key,$rnds_),$rndkey1	# last round key
2266
2267	movdqa	.Lxts_magic(%rip),$twmask
2268	movdqa	$inout0,@tweak[5]
2269	pshufd	\$0x5f,$inout0,$twres
2270	pxor	$rndkey0,$rndkey1
2271___
2272    for ($i=0;$i<4;$i++) {
2273    $code.=<<___;
2274	movdqa	$twres,$twtmp
2275	paddd	$twres,$twres
2276	movdqa	@tweak[5],@tweak[$i]
2277	psrad	\$31,$twtmp			# broadcast upper bits
2278	paddq	@tweak[5],@tweak[5]
2279	pand	$twmask,$twtmp
2280	pxor	$rndkey0,@tweak[$i]
2281	pxor	$twtmp,@tweak[5]
2282___
2283    }
2284$code.=<<___;
2285	movdqa	@tweak[5],@tweak[4]
2286	psrad	\$31,$twres
2287	paddq	@tweak[5],@tweak[5]
2288	pand	$twmask,$twres
2289	pxor	$rndkey0,@tweak[4]
2290	pxor	$twres,@tweak[5]
2291	movaps	$rndkey1,0x60(%rsp)		# save round[0]^round[last]
2292
2293	sub	\$16*6,$len
2294	jc	.Lxts_dec_short			# if $len-=6*16 borrowed
2295
2296	mov	\$16+96,$rounds
2297	lea	32($key_,$rnds_),$key		# end of key schedule
2298	sub	%r10,%rax			# twisted $rounds
2299	$movkey	16($key_),$rndkey1
2300	mov	%rax,%r10			# backup twisted $rounds
2301	lea	.Lxts_magic(%rip),%r8
2302	jmp	.Lxts_dec_grandloop
2303
2304.align	32
2305.Lxts_dec_grandloop:
2306	movdqu	`16*0`($inp),$inout0		# load input
2307	movdqa	$rndkey0,$twmask
2308	movdqu	`16*1`($inp),$inout1
2309	pxor	@tweak[0],$inout0		# input^=tweak^round[0]
2310	movdqu	`16*2`($inp),$inout2
2311	pxor	@tweak[1],$inout1
2312	 aesdec		$rndkey1,$inout0
2313	movdqu	`16*3`($inp),$inout3
2314	pxor	@tweak[2],$inout2
2315	 aesdec		$rndkey1,$inout1
2316	movdqu	`16*4`($inp),$inout4
2317	pxor	@tweak[3],$inout3
2318	 aesdec		$rndkey1,$inout2
2319	movdqu	`16*5`($inp),$inout5
2320	pxor	@tweak[5],$twmask		# round[0]^=tweak[5]
2321	 movdqa	0x60(%rsp),$twres		# load round[0]^round[last]
2322	pxor	@tweak[4],$inout4
2323	 aesdec		$rndkey1,$inout3
2324	$movkey	32($key_),$rndkey0
2325	lea	`16*6`($inp),$inp
2326	pxor	$twmask,$inout5
2327
2328	 pxor	$twres,@tweak[0]		# calculate tweaks^round[last]
2329	aesdec		$rndkey1,$inout4
2330	 pxor	$twres,@tweak[1]
2331	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks^last round key
2332	aesdec		$rndkey1,$inout5
2333	$movkey		48($key_),$rndkey1
2334	 pxor	$twres,@tweak[2]
2335
2336	aesdec		$rndkey0,$inout0
2337	 pxor	$twres,@tweak[3]
2338	 movdqa	@tweak[1],`16*1`(%rsp)
2339	aesdec		$rndkey0,$inout1
2340	 pxor	$twres,@tweak[4]
2341	 movdqa	@tweak[2],`16*2`(%rsp)
2342	aesdec		$rndkey0,$inout2
2343	aesdec		$rndkey0,$inout3
2344	 pxor	$twres,$twmask
2345	 movdqa	@tweak[4],`16*4`(%rsp)
2346	aesdec		$rndkey0,$inout4
2347	aesdec		$rndkey0,$inout5
2348	$movkey		64($key_),$rndkey0
2349	 movdqa	$twmask,`16*5`(%rsp)
2350	pshufd	\$0x5f,@tweak[5],$twres
2351	jmp	.Lxts_dec_loop6
2352.align	32
2353.Lxts_dec_loop6:
2354	aesdec		$rndkey1,$inout0
2355	aesdec		$rndkey1,$inout1
2356	aesdec		$rndkey1,$inout2
2357	aesdec		$rndkey1,$inout3
2358	aesdec		$rndkey1,$inout4
2359	aesdec		$rndkey1,$inout5
2360	$movkey		-64($key,%rax),$rndkey1
2361	add		\$32,%rax
2362
2363	aesdec		$rndkey0,$inout0
2364	aesdec		$rndkey0,$inout1
2365	aesdec		$rndkey0,$inout2
2366	aesdec		$rndkey0,$inout3
2367	aesdec		$rndkey0,$inout4
2368	aesdec		$rndkey0,$inout5
2369	$movkey		-80($key,%rax),$rndkey0
2370	jnz		.Lxts_dec_loop6
2371
2372	movdqa	(%r8),$twmask			# start calculating next tweak
2373	movdqa	$twres,$twtmp
2374	paddd	$twres,$twres
2375	 aesdec		$rndkey1,$inout0
2376	paddq	@tweak[5],@tweak[5]
2377	psrad	\$31,$twtmp
2378	 aesdec		$rndkey1,$inout1
2379	pand	$twmask,$twtmp
2380	$movkey	($key_),@tweak[0]		# load round[0]
2381	 aesdec		$rndkey1,$inout2
2382	 aesdec		$rndkey1,$inout3
2383	 aesdec		$rndkey1,$inout4
2384	pxor	$twtmp,@tweak[5]
2385	movaps	@tweak[0],@tweak[1]		# copy round[0]
2386	 aesdec		$rndkey1,$inout5
2387	 $movkey	-64($key),$rndkey1
2388
2389	movdqa	$twres,$twtmp
2390	 aesdec		$rndkey0,$inout0
2391	paddd	$twres,$twres
2392	pxor	@tweak[5],@tweak[0]
2393	 aesdec		$rndkey0,$inout1
2394	psrad	\$31,$twtmp
2395	paddq	@tweak[5],@tweak[5]
2396	 aesdec		$rndkey0,$inout2
2397	 aesdec		$rndkey0,$inout3
2398	pand	$twmask,$twtmp
2399	movaps	@tweak[1],@tweak[2]
2400	 aesdec		$rndkey0,$inout4
2401	pxor	$twtmp,@tweak[5]
2402	movdqa	$twres,$twtmp
2403	 aesdec		$rndkey0,$inout5
2404	 $movkey	-48($key),$rndkey0
2405
2406	paddd	$twres,$twres
2407	 aesdec		$rndkey1,$inout0
2408	pxor	@tweak[5],@tweak[1]
2409	psrad	\$31,$twtmp
2410	 aesdec		$rndkey1,$inout1
2411	paddq	@tweak[5],@tweak[5]
2412	pand	$twmask,$twtmp
2413	 aesdec		$rndkey1,$inout2
2414	 aesdec		$rndkey1,$inout3
2415	 movdqa	@tweak[3],`16*3`(%rsp)
2416	pxor	$twtmp,@tweak[5]
2417	 aesdec		$rndkey1,$inout4
2418	movaps	@tweak[2],@tweak[3]
2419	movdqa	$twres,$twtmp
2420	 aesdec		$rndkey1,$inout5
2421	 $movkey	-32($key),$rndkey1
2422
2423	paddd	$twres,$twres
2424	 aesdec		$rndkey0,$inout0
2425	pxor	@tweak[5],@tweak[2]
2426	psrad	\$31,$twtmp
2427	 aesdec		$rndkey0,$inout1
2428	paddq	@tweak[5],@tweak[5]
2429	pand	$twmask,$twtmp
2430	 aesdec		$rndkey0,$inout2
2431	 aesdec		$rndkey0,$inout3
2432	 aesdec		$rndkey0,$inout4
2433	pxor	$twtmp,@tweak[5]
2434	movaps	@tweak[3],@tweak[4]
2435	 aesdec		$rndkey0,$inout5
2436
2437	movdqa	$twres,$rndkey0
2438	paddd	$twres,$twres
2439	 aesdec		$rndkey1,$inout0
2440	pxor	@tweak[5],@tweak[3]
2441	psrad	\$31,$rndkey0
2442	 aesdec		$rndkey1,$inout1
2443	paddq	@tweak[5],@tweak[5]
2444	pand	$twmask,$rndkey0
2445	 aesdec		$rndkey1,$inout2
2446	 aesdec		$rndkey1,$inout3
2447	pxor	$rndkey0,@tweak[5]
2448	$movkey		($key_),$rndkey0
2449	 aesdec		$rndkey1,$inout4
2450	 aesdec		$rndkey1,$inout5
2451	$movkey		16($key_),$rndkey1
2452
2453	pxor	@tweak[5],@tweak[4]
2454	 aesdeclast	`16*0`(%rsp),$inout0
2455	psrad	\$31,$twres
2456	paddq	@tweak[5],@tweak[5]
2457	 aesdeclast	`16*1`(%rsp),$inout1
2458	 aesdeclast	`16*2`(%rsp),$inout2
2459	pand	$twmask,$twres
2460	mov	%r10,%rax			# restore $rounds
2461	 aesdeclast	`16*3`(%rsp),$inout3
2462	 aesdeclast	`16*4`(%rsp),$inout4
2463	 aesdeclast	`16*5`(%rsp),$inout5
2464	pxor	$twres,@tweak[5]
2465
2466	lea	`16*6`($out),$out		# $out+=6*16
2467	movups	$inout0,`-16*6`($out)		# store 6 output blocks
2468	movups	$inout1,`-16*5`($out)
2469	movups	$inout2,`-16*4`($out)
2470	movups	$inout3,`-16*3`($out)
2471	movups	$inout4,`-16*2`($out)
2472	movups	$inout5,`-16*1`($out)
2473	sub	\$16*6,$len
2474	jnc	.Lxts_dec_grandloop		# loop if $len-=6*16 didn't borrow
2475
2476	mov	\$16+96,$rounds
2477	sub	$rnds_,$rounds
2478	mov	$key_,$key			# restore $key
2479	shr	\$4,$rounds			# restore original value
2480
2481.Lxts_dec_short:
2482	# at this point @tweak[0..5] are populated with tweak values
2483	mov	$rounds,$rnds_			# backup $rounds
2484	pxor	$rndkey0,@tweak[0]
2485	pxor	$rndkey0,@tweak[1]
2486	add	\$16*6,$len			# restore real remaining $len
2487	jz	.Lxts_dec_done			# done if ($len==0)
2488
2489	pxor	$rndkey0,@tweak[2]
2490	cmp	\$0x20,$len
2491	jb	.Lxts_dec_one			# $len is 1*16
2492	pxor	$rndkey0,@tweak[3]
2493	je	.Lxts_dec_two			# $len is 2*16
2494
2495	pxor	$rndkey0,@tweak[4]
2496	cmp	\$0x40,$len
2497	jb	.Lxts_dec_three			# $len is 3*16
2498	je	.Lxts_dec_four			# $len is 4*16
2499
2500	movdqu	($inp),$inout0			# $len is 5*16
2501	movdqu	16*1($inp),$inout1
2502	movdqu	16*2($inp),$inout2
2503	pxor	@tweak[0],$inout0
2504	movdqu	16*3($inp),$inout3
2505	pxor	@tweak[1],$inout1
2506	movdqu	16*4($inp),$inout4
2507	lea	16*5($inp),$inp			# $inp+=5*16
2508	pxor	@tweak[2],$inout2
2509	pxor	@tweak[3],$inout3
2510	pxor	@tweak[4],$inout4
2511
2512	call	_aesni_decrypt6
2513
2514	xorps	@tweak[0],$inout0
2515	xorps	@tweak[1],$inout1
2516	xorps	@tweak[2],$inout2
2517	movdqu	$inout0,($out)			# store 5 output blocks
2518	xorps	@tweak[3],$inout3
2519	movdqu	$inout1,16*1($out)
2520	xorps	@tweak[4],$inout4
2521	movdqu	$inout2,16*2($out)
2522	 pxor		$twtmp,$twtmp
2523	movdqu	$inout3,16*3($out)
2524	 pcmpgtd	@tweak[5],$twtmp
2525	movdqu	$inout4,16*4($out)
2526	lea	16*5($out),$out			# $out+=5*16
2527	 pshufd		\$0x13,$twtmp,@tweak[1]	# $twres
2528	and	\$15,$len_
2529	jz	.Lxts_dec_ret
2530
2531	movdqa	@tweak[5],@tweak[0]
2532	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
2533	pand	$twmask,@tweak[1]		# isolate carry and residue
2534	pxor	@tweak[5],@tweak[1]
2535	jmp	.Lxts_dec_done2
2536
2537.align	16
2538.Lxts_dec_one:
2539	movups	($inp),$inout0
2540	lea	16*1($inp),$inp			# $inp+=1*16
2541	xorps	@tweak[0],$inout0
2542___
2543	&aesni_generate1("dec",$key,$rounds);
2544$code.=<<___;
2545	xorps	@tweak[0],$inout0
2546	movdqa	@tweak[1],@tweak[0]
2547	movups	$inout0,($out)			# store one output block
2548	movdqa	@tweak[2],@tweak[1]
2549	lea	16*1($out),$out			# $out+=1*16
2550	jmp	.Lxts_dec_done
2551
2552.align	16
2553.Lxts_dec_two:
2554	movups	($inp),$inout0
2555	movups	16($inp),$inout1
2556	lea	32($inp),$inp			# $inp+=2*16
2557	xorps	@tweak[0],$inout0
2558	xorps	@tweak[1],$inout1
2559
2560	call	_aesni_decrypt2
2561
2562	xorps	@tweak[0],$inout0
2563	movdqa	@tweak[2],@tweak[0]
2564	xorps	@tweak[1],$inout1
2565	movdqa	@tweak[3],@tweak[1]
2566	movups	$inout0,($out)			# store 2 output blocks
2567	movups	$inout1,16*1($out)
2568	lea	16*2($out),$out			# $out+=2*16
2569	jmp	.Lxts_dec_done
2570
2571.align	16
2572.Lxts_dec_three:
2573	movups	($inp),$inout0
2574	movups	16*1($inp),$inout1
2575	movups	16*2($inp),$inout2
2576	lea	16*3($inp),$inp			# $inp+=3*16
2577	xorps	@tweak[0],$inout0
2578	xorps	@tweak[1],$inout1
2579	xorps	@tweak[2],$inout2
2580
2581	call	_aesni_decrypt3
2582
2583	xorps	@tweak[0],$inout0
2584	movdqa	@tweak[3],@tweak[0]
2585	xorps	@tweak[1],$inout1
2586	movdqa	@tweak[4],@tweak[1]
2587	xorps	@tweak[2],$inout2
2588	movups	$inout0,($out)			# store 3 output blocks
2589	movups	$inout1,16*1($out)
2590	movups	$inout2,16*2($out)
2591	lea	16*3($out),$out			# $out+=3*16
2592	jmp	.Lxts_dec_done
2593
2594.align	16
2595.Lxts_dec_four:
2596	movups	($inp),$inout0
2597	movups	16*1($inp),$inout1
2598	movups	16*2($inp),$inout2
2599	xorps	@tweak[0],$inout0
2600	movups	16*3($inp),$inout3
2601	lea	16*4($inp),$inp			# $inp+=4*16
2602	xorps	@tweak[1],$inout1
2603	xorps	@tweak[2],$inout2
2604	xorps	@tweak[3],$inout3
2605
2606	call	_aesni_decrypt4
2607
2608	pxor	@tweak[0],$inout0
2609	movdqa	@tweak[4],@tweak[0]
2610	pxor	@tweak[1],$inout1
2611	movdqa	@tweak[5],@tweak[1]
2612	pxor	@tweak[2],$inout2
2613	movdqu	$inout0,($out)			# store 4 output blocks
2614	pxor	@tweak[3],$inout3
2615	movdqu	$inout1,16*1($out)
2616	movdqu	$inout2,16*2($out)
2617	movdqu	$inout3,16*3($out)
2618	lea	16*4($out),$out			# $out+=4*16
2619	jmp	.Lxts_dec_done
2620
2621.align	16
2622.Lxts_dec_done:
2623	and	\$15,$len_			# see if $len%16 is 0
2624	jz	.Lxts_dec_ret
2625.Lxts_dec_done2:
2626	mov	$len_,$len
2627	mov	$key_,$key			# restore $key
2628	mov	$rnds_,$rounds			# restore $rounds
2629
2630	movups	($inp),$inout0
2631	xorps	@tweak[1],$inout0
2632___
2633	&aesni_generate1("dec",$key,$rounds);
2634$code.=<<___;
2635	xorps	@tweak[1],$inout0
2636	movups	$inout0,($out)
2637
2638.Lxts_dec_steal:
2639	movzb	16($inp),%eax			# borrow $rounds ...
2640	movzb	($out),%ecx			# ... and $key
2641	lea	1($inp),$inp
2642	mov	%al,($out)
2643	mov	%cl,16($out)
2644	lea	1($out),$out
2645	sub	\$1,$len
2646	jnz	.Lxts_dec_steal
2647
2648	sub	$len_,$out			# rewind $out
2649	mov	$key_,$key			# restore $key
2650	mov	$rnds_,$rounds			# restore $rounds
2651
2652	movups	($out),$inout0
2653	xorps	@tweak[0],$inout0
2654___
2655	&aesni_generate1("dec",$key,$rounds);
2656$code.=<<___;
2657	xorps	@tweak[0],$inout0
2658	movups	$inout0,($out)
2659
2660.Lxts_dec_ret:
2661	xorps	%xmm0,%xmm0			# clear register bank
2662	pxor	%xmm1,%xmm1
2663	pxor	%xmm2,%xmm2
2664	pxor	%xmm3,%xmm3
2665	pxor	%xmm4,%xmm4
2666	pxor	%xmm5,%xmm5
2667___
2668$code.=<<___ if (!$win64);
2669	pxor	%xmm6,%xmm6
2670	pxor	%xmm7,%xmm7
2671	movaps	%xmm0,0x00(%rsp)		# clear stack
2672	pxor	%xmm8,%xmm8
2673	movaps	%xmm0,0x10(%rsp)
2674	pxor	%xmm9,%xmm9
2675	movaps	%xmm0,0x20(%rsp)
2676	pxor	%xmm10,%xmm10
2677	movaps	%xmm0,0x30(%rsp)
2678	pxor	%xmm11,%xmm11
2679	movaps	%xmm0,0x40(%rsp)
2680	pxor	%xmm12,%xmm12
2681	movaps	%xmm0,0x50(%rsp)
2682	pxor	%xmm13,%xmm13
2683	movaps	%xmm0,0x60(%rsp)
2684	pxor	%xmm14,%xmm14
2685	pxor	%xmm15,%xmm15
2686___
2687$code.=<<___ if ($win64);
2688	movaps	-0xa8(%r11),%xmm6
2689	movaps	%xmm0,-0xa8(%r11)		# clear stack
2690	movaps	-0x98(%r11),%xmm7
2691	movaps	%xmm0,-0x98(%r11)
2692	movaps	-0x88(%r11),%xmm8
2693	movaps	%xmm0,-0x88(%r11)
2694	movaps	-0x78(%r11),%xmm9
2695	movaps	%xmm0,-0x78(%r11)
2696	movaps	-0x68(%r11),%xmm10
2697	movaps	%xmm0,-0x68(%r11)
2698	movaps	-0x58(%r11),%xmm11
2699	movaps	%xmm0,-0x58(%r11)
2700	movaps	-0x48(%r11),%xmm12
2701	movaps	%xmm0,-0x48(%r11)
2702	movaps	-0x38(%r11),%xmm13
2703	movaps	%xmm0,-0x38(%r11)
2704	movaps	-0x28(%r11),%xmm14
2705	movaps	%xmm0,-0x28(%r11)
2706	movaps	-0x18(%r11),%xmm15
2707	movaps	%xmm0,-0x18(%r11)
2708	movaps	%xmm0,0x00(%rsp)
2709	movaps	%xmm0,0x10(%rsp)
2710	movaps	%xmm0,0x20(%rsp)
2711	movaps	%xmm0,0x30(%rsp)
2712	movaps	%xmm0,0x40(%rsp)
2713	movaps	%xmm0,0x50(%rsp)
2714	movaps	%xmm0,0x60(%rsp)
2715___
2716$code.=<<___;
2717	mov	-8(%r11),%rbp
2718	lea	(%r11),%rsp
2719.Lxts_dec_epilogue:
2720	ret
2721.size	aesni_xts_decrypt,.-aesni_xts_decrypt
2722___
2723}
2724
2725######################################################################
2726# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
2727#	const AES_KEY *key, unsigned int start_block_num,
2728#	unsigned char offset_i[16], const unsigned char L_[][16],
2729#	unsigned char checksum[16]);
2730#
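#
# Illustrative only (hypothetical helper, never called by this module): the
# per-block offset recursion that the bsf/shl arithmetic and the caller's
# L_[] table below implement, Offset_i = Offset_{i-1} xor L_{ntz(i)} with a
# 1-based block index; offsets and table entries are 16-byte strings and
# "^" is Perl's byte-string XOR.
sub ocb_next_offset_ref {
    my ($offset,$L,$block_num) = @_;	# $L is a ref to an array of L_i strings
    my $ntz = 0;			# ntz(i), what bsf computes below
    $ntz++ while !(($block_num>>$ntz)&1);
    return $offset ^ $L->[$ntz];	# Offset_i = Offset_{i-1} ^ L_{ntz(i)}
}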
2731{
2732my @offset=map("%xmm$_",(10..15));
2733my ($checksum,$rndkey0l)=("%xmm8","%xmm9");
2734my ($block_num,$offset_p)=("%r8","%r9");		# 5th and 6th arguments
2735my ($L_p,$checksum_p) = ("%rbx","%rbp");
2736my ($i1,$i3,$i5) = ("%r12","%r13","%r14");
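# the 7th and 8th arguments are passed on the stack: at 8(%rsp) on SysV (just
# above the return address) and at 56(%rsp) on Win64 (past the return address,
# the 32-byte shadow area and the 5th/6th stack slots); %rax captures the
# entry %rsp before any registers are pushed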
2737my $seventh_arg = $win64 ? 56 : 8;
2738my $blocks = $len;
2739
2740$code.=<<___;
2741.globl	aesni_ocb_encrypt
2742.type	aesni_ocb_encrypt,\@function,6
2743.align	32
2744aesni_ocb_encrypt:
2745	lea	(%rsp),%rax
2746	push	%rbx
2747	push	%rbp
2748	push	%r12
2749	push	%r13
2750	push	%r14
2751___
2752$code.=<<___ if ($win64);
2753	lea	-0xa0(%rsp),%rsp
2754	movaps	%xmm6,0x00(%rsp)		# offload everything
2755	movaps	%xmm7,0x10(%rsp)
2756	movaps	%xmm8,0x20(%rsp)
2757	movaps	%xmm9,0x30(%rsp)
2758	movaps	%xmm10,0x40(%rsp)
2759	movaps	%xmm11,0x50(%rsp)
2760	movaps	%xmm12,0x60(%rsp)
2761	movaps	%xmm13,0x70(%rsp)
2762	movaps	%xmm14,0x80(%rsp)
2763	movaps	%xmm15,0x90(%rsp)
2764.Locb_enc_body:
2765___
2766$code.=<<___;
2767	mov	$seventh_arg(%rax),$L_p		# 7th argument
2768	mov	$seventh_arg+8(%rax),$checksum_p	# 8th argument
2769
2770	mov	240($key),$rnds_
2771	mov	$key,$key_
2772	shl	\$4,$rnds_
2773	$movkey	($key),$rndkey0l		# round[0]
2774	$movkey	16($key,$rnds_),$rndkey1	# round[last]
2775
2776	movdqu	($offset_p),@offset[5]		# load last offset_i
2777	pxor	$rndkey1,$rndkey0l		# round[0] ^ round[last]
2778	pxor	$rndkey1,@offset[5]		# offset_i ^ round[last]
2779
2780	mov	\$16+32,$rounds
2781	lea	32($key_,$rnds_),$key
2782	$movkey	16($key_),$rndkey1		# round[1]
2783	sub	%r10,%rax			# twisted $rounds
2784	mov	%rax,%r10			# backup twisted $rounds
2785
2786	movdqu	($L_p),@offset[0]		# L_0 for all odd-numbered blocks
2787	movdqu	($checksum_p),$checksum		# load checksum
2788
2789	test	\$1,$block_num			# is first block number odd?
2790	jnz	.Locb_enc_odd
2791
2792	bsf	$block_num,$i1
2793	add	\$1,$block_num
2794	shl	\$4,$i1
2795	movdqu	($L_p,$i1),$inout5		# borrow
2796	movdqu	($inp),$inout0
2797	lea	16($inp),$inp
2798
2799	call	__ocb_encrypt1
2800
2801	movdqa	$inout5,@offset[5]
2802	movups	$inout0,($out)
2803	lea	16($out),$out
2804	sub	\$1,$blocks
2805	jz	.Locb_enc_done
2806
2807.Locb_enc_odd:
2808	lea	1($block_num),$i1		# even-numbered blocks
2809	lea	3($block_num),$i3
2810	lea	5($block_num),$i5
2811	lea	6($block_num),$block_num
2812	bsf	$i1,$i1				# ntz(block)
2813	bsf	$i3,$i3
2814	bsf	$i5,$i5
2815	shl	\$4,$i1				# ntz(block) -> table offset
2816	shl	\$4,$i3
2817	shl	\$4,$i5
2818
2819	sub	\$6,$blocks
2820	jc	.Locb_enc_short
2821	jmp	.Locb_enc_grandloop
2822
2823.align	32
2824.Locb_enc_grandloop:
2825	movdqu	`16*0`($inp),$inout0		# load input
2826	movdqu	`16*1`($inp),$inout1
2827	movdqu	`16*2`($inp),$inout2
2828	movdqu	`16*3`($inp),$inout3
2829	movdqu	`16*4`($inp),$inout4
2830	movdqu	`16*5`($inp),$inout5
2831	lea	`16*6`($inp),$inp
2832
2833	call	__ocb_encrypt6
2834
2835	movups	$inout0,`16*0`($out)		# store output
2836	movups	$inout1,`16*1`($out)
2837	movups	$inout2,`16*2`($out)
2838	movups	$inout3,`16*3`($out)
2839	movups	$inout4,`16*4`($out)
2840	movups	$inout5,`16*5`($out)
2841	lea	`16*6`($out),$out
2842	sub	\$6,$blocks
2843	jnc	.Locb_enc_grandloop
2844
2845.Locb_enc_short:
2846	add	\$6,$blocks
2847	jz	.Locb_enc_done
2848
2849	movdqu	`16*0`($inp),$inout0
2850	cmp	\$2,$blocks
2851	jb	.Locb_enc_one
2852	movdqu	`16*1`($inp),$inout1
2853	je	.Locb_enc_two
2854
2855	movdqu	`16*2`($inp),$inout2
2856	cmp	\$4,$blocks
2857	jb	.Locb_enc_three
2858	movdqu	`16*3`($inp),$inout3
2859	je	.Locb_enc_four
2860
2861	movdqu	`16*4`($inp),$inout4
2862	pxor	$inout5,$inout5
2863
2864	call	__ocb_encrypt6
2865
2866	movdqa	@offset[4],@offset[5]
2867	movups	$inout0,`16*0`($out)
2868	movups	$inout1,`16*1`($out)
2869	movups	$inout2,`16*2`($out)
2870	movups	$inout3,`16*3`($out)
2871	movups	$inout4,`16*4`($out)
2872
2873	jmp	.Locb_enc_done
2874
2875.align	16
2876.Locb_enc_one:
2877	movdqa	@offset[0],$inout5		# borrow
2878
2879	call	__ocb_encrypt1
2880
2881	movdqa	$inout5,@offset[5]
2882	movups	$inout0,`16*0`($out)
2883	jmp	.Locb_enc_done
2884
2885.align	16
2886.Locb_enc_two:
2887	pxor	$inout2,$inout2
2888	pxor	$inout3,$inout3
2889
2890	call	__ocb_encrypt4
2891
2892	movdqa	@offset[1],@offset[5]
2893	movups	$inout0,`16*0`($out)
2894	movups	$inout1,`16*1`($out)
2895
2896	jmp	.Locb_enc_done
2897
2898.align	16
2899.Locb_enc_three:
2900	pxor	$inout3,$inout3
2901
2902	call	__ocb_encrypt4
2903
2904	movdqa	@offset[2],@offset[5]
2905	movups	$inout0,`16*0`($out)
2906	movups	$inout1,`16*1`($out)
2907	movups	$inout2,`16*2`($out)
2908
2909	jmp	.Locb_enc_done
2910
2911.align	16
2912.Locb_enc_four:
2913	call	__ocb_encrypt4
2914
2915	movdqa	@offset[3],@offset[5]
2916	movups	$inout0,`16*0`($out)
2917	movups	$inout1,`16*1`($out)
2918	movups	$inout2,`16*2`($out)
2919	movups	$inout3,`16*3`($out)
2920
2921.Locb_enc_done:
2922	pxor	$rndkey0,@offset[5]		# "remove" round[last]
2923	movdqu	$checksum,($checksum_p)		# store checksum
2924	movdqu	@offset[5],($offset_p)		# store last offset_i
2925
2926	xorps	%xmm0,%xmm0			# clear register bank
2927	pxor	%xmm1,%xmm1
2928	pxor	%xmm2,%xmm2
2929	pxor	%xmm3,%xmm3
2930	pxor	%xmm4,%xmm4
2931	pxor	%xmm5,%xmm5
2932___
2933$code.=<<___ if (!$win64);
2934	pxor	%xmm6,%xmm6
2935	pxor	%xmm7,%xmm7
2936	pxor	%xmm8,%xmm8
2937	pxor	%xmm9,%xmm9
2938	pxor	%xmm10,%xmm10
2939	pxor	%xmm11,%xmm11
2940	pxor	%xmm12,%xmm12
2941	pxor	%xmm13,%xmm13
2942	pxor	%xmm14,%xmm14
2943	pxor	%xmm15,%xmm15
2944	lea	0x28(%rsp),%rax
2945___
2946$code.=<<___ if ($win64);
2947	movaps	0x00(%rsp),%xmm6
2948	movaps	%xmm0,0x00(%rsp)		# clear stack
2949	movaps	0x10(%rsp),%xmm7
2950	movaps	%xmm0,0x10(%rsp)
2951	movaps	0x20(%rsp),%xmm8
2952	movaps	%xmm0,0x20(%rsp)
2953	movaps	0x30(%rsp),%xmm9
2954	movaps	%xmm0,0x30(%rsp)
2955	movaps	0x40(%rsp),%xmm10
2956	movaps	%xmm0,0x40(%rsp)
2957	movaps	0x50(%rsp),%xmm11
2958	movaps	%xmm0,0x50(%rsp)
2959	movaps	0x60(%rsp),%xmm12
2960	movaps	%xmm0,0x60(%rsp)
2961	movaps	0x70(%rsp),%xmm13
2962	movaps	%xmm0,0x70(%rsp)
2963	movaps	0x80(%rsp),%xmm14
2964	movaps	%xmm0,0x80(%rsp)
2965	movaps	0x90(%rsp),%xmm15
2966	movaps	%xmm0,0x90(%rsp)
2967	lea	0xa0+0x28(%rsp),%rax
2968.Locb_enc_pop:
2969___
2970$code.=<<___;
2971	mov	-40(%rax),%r14
2972	mov	-32(%rax),%r13
2973	mov	-24(%rax),%r12
2974	mov	-16(%rax),%rbp
2975	mov	-8(%rax),%rbx
2976	lea	(%rax),%rsp
2977.Locb_enc_epilogue:
2978	ret
2979.size	aesni_ocb_encrypt,.-aesni_ocb_encrypt
2980
2981.type	__ocb_encrypt6,\@abi-omnipotent
2982.align	32
2983__ocb_encrypt6:
2984	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
2985	 movdqu		($L_p,$i1),@offset[1]
2986	 movdqa		@offset[0],@offset[2]
2987	 movdqu		($L_p,$i3),@offset[3]
2988	 movdqa		@offset[0],@offset[4]
2989	 pxor		@offset[5],@offset[0]
2990	 movdqu		($L_p,$i5),@offset[5]
2991	 pxor		@offset[0],@offset[1]
2992	pxor		$inout0,$checksum	# accumulate checksum
2993	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
2994	 pxor		@offset[1],@offset[2]
2995	pxor		$inout1,$checksum
2996	pxor		@offset[1],$inout1
2997	 pxor		@offset[2],@offset[3]
2998	pxor		$inout2,$checksum
2999	pxor		@offset[2],$inout2
3000	 pxor		@offset[3],@offset[4]
3001	pxor		$inout3,$checksum
3002	pxor		@offset[3],$inout3
3003	 pxor		@offset[4],@offset[5]
3004	pxor		$inout4,$checksum
3005	pxor		@offset[4],$inout4
3006	pxor		$inout5,$checksum
3007	pxor		@offset[5],$inout5
3008	$movkey		32($key_),$rndkey0
3009
3010	lea		1($block_num),$i1	# even-numbered blocks
3011	lea		3($block_num),$i3
3012	lea		5($block_num),$i5
3013	add		\$6,$block_num
3014	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
3015	bsf		$i1,$i1			# ntz(block)
3016	bsf		$i3,$i3
3017	bsf		$i5,$i5
3018
3019	aesenc		$rndkey1,$inout0
3020	aesenc		$rndkey1,$inout1
3021	aesenc		$rndkey1,$inout2
3022	aesenc		$rndkey1,$inout3
3023	 pxor		$rndkey0l,@offset[1]
3024	 pxor		$rndkey0l,@offset[2]
3025	aesenc		$rndkey1,$inout4
3026	 pxor		$rndkey0l,@offset[3]
3027	 pxor		$rndkey0l,@offset[4]
3028	aesenc		$rndkey1,$inout5
3029	$movkey		48($key_),$rndkey1
3030	 pxor		$rndkey0l,@offset[5]
3031
3032	aesenc		$rndkey0,$inout0
3033	aesenc		$rndkey0,$inout1
3034	aesenc		$rndkey0,$inout2
3035	aesenc		$rndkey0,$inout3
3036	aesenc		$rndkey0,$inout4
3037	aesenc		$rndkey0,$inout5
3038	$movkey		64($key_),$rndkey0
3039	shl		\$4,$i1			# ntz(block) -> table offset
3040	shl		\$4,$i3
3041	jmp		.Locb_enc_loop6
3042
3043.align	32
3044.Locb_enc_loop6:
3045	aesenc		$rndkey1,$inout0
3046	aesenc		$rndkey1,$inout1
3047	aesenc		$rndkey1,$inout2
3048	aesenc		$rndkey1,$inout3
3049	aesenc		$rndkey1,$inout4
3050	aesenc		$rndkey1,$inout5
3051	$movkey		($key,%rax),$rndkey1
3052	add		\$32,%rax
3053
3054	aesenc		$rndkey0,$inout0
3055	aesenc		$rndkey0,$inout1
3056	aesenc		$rndkey0,$inout2
3057	aesenc		$rndkey0,$inout3
3058	aesenc		$rndkey0,$inout4
3059	aesenc		$rndkey0,$inout5
3060	$movkey		-16($key,%rax),$rndkey0
3061	jnz		.Locb_enc_loop6
3062
3063	aesenc		$rndkey1,$inout0
3064	aesenc		$rndkey1,$inout1
3065	aesenc		$rndkey1,$inout2
3066	aesenc		$rndkey1,$inout3
3067	aesenc		$rndkey1,$inout4
3068	aesenc		$rndkey1,$inout5
3069	$movkey		16($key_),$rndkey1
3070	shl		\$4,$i5
3071
3072	aesenclast	@offset[0],$inout0
3073	movdqu		($L_p),@offset[0]	# L_0 for all odd-numbered blocks
3074	mov		%r10,%rax		# restore twisted rounds
3075	aesenclast	@offset[1],$inout1
3076	aesenclast	@offset[2],$inout2
3077	aesenclast	@offset[3],$inout3
3078	aesenclast	@offset[4],$inout4
3079	aesenclast	@offset[5],$inout5
3080	ret
3081.size	__ocb_encrypt6,.-__ocb_encrypt6
3082
3083.type	__ocb_encrypt4,\@abi-omnipotent
3084.align	32
3085__ocb_encrypt4:
3086	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
3087	 movdqu		($L_p,$i1),@offset[1]
3088	 movdqa		@offset[0],@offset[2]
3089	 movdqu		($L_p,$i3),@offset[3]
3090	 pxor		@offset[5],@offset[0]
3091	 pxor		@offset[0],@offset[1]
3092	pxor		$inout0,$checksum	# accumulate checksum
3093	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
3094	 pxor		@offset[1],@offset[2]
3095	pxor		$inout1,$checksum
3096	pxor		@offset[1],$inout1
3097	 pxor		@offset[2],@offset[3]
3098	pxor		$inout2,$checksum
3099	pxor		@offset[2],$inout2
3100	pxor		$inout3,$checksum
3101	pxor		@offset[3],$inout3
3102	$movkey		32($key_),$rndkey0
3103
3104	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
3105	 pxor		$rndkey0l,@offset[1]
3106	 pxor		$rndkey0l,@offset[2]
3107	 pxor		$rndkey0l,@offset[3]
3108
3109	aesenc		$rndkey1,$inout0
3110	aesenc		$rndkey1,$inout1
3111	aesenc		$rndkey1,$inout2
3112	aesenc		$rndkey1,$inout3
3113	$movkey		48($key_),$rndkey1
3114
3115	aesenc		$rndkey0,$inout0
3116	aesenc		$rndkey0,$inout1
3117	aesenc		$rndkey0,$inout2
3118	aesenc		$rndkey0,$inout3
3119	$movkey		64($key_),$rndkey0
3120	jmp		.Locb_enc_loop4
3121
3122.align	32
3123.Locb_enc_loop4:
3124	aesenc		$rndkey1,$inout0
3125	aesenc		$rndkey1,$inout1
3126	aesenc		$rndkey1,$inout2
3127	aesenc		$rndkey1,$inout3
3128	$movkey		($key,%rax),$rndkey1
3129	add		\$32,%rax
3130
3131	aesenc		$rndkey0,$inout0
3132	aesenc		$rndkey0,$inout1
3133	aesenc		$rndkey0,$inout2
3134	aesenc		$rndkey0,$inout3
3135	$movkey		-16($key,%rax),$rndkey0
3136	jnz		.Locb_enc_loop4
3137
3138	aesenc		$rndkey1,$inout0
3139	aesenc		$rndkey1,$inout1
3140	aesenc		$rndkey1,$inout2
3141	aesenc		$rndkey1,$inout3
3142	$movkey		16($key_),$rndkey1
3143	mov		%r10,%rax		# restore twisted rounds
3144
3145	aesenclast	@offset[0],$inout0
3146	aesenclast	@offset[1],$inout1
3147	aesenclast	@offset[2],$inout2
3148	aesenclast	@offset[3],$inout3
3149	ret
3150.size	__ocb_encrypt4,.-__ocb_encrypt4
3151
3152.type	__ocb_encrypt1,\@abi-omnipotent
3153.align	32
3154__ocb_encrypt1:
3155	 pxor		@offset[5],$inout5	# offset_i
3156	 pxor		$rndkey0l,$inout5	# offset_i ^ round[0]
3157	pxor		$inout0,$checksum	# accumulate checksum
3158	pxor		$inout5,$inout0		# input ^ round[0] ^ offset_i
3159	$movkey		32($key_),$rndkey0
3160
3161	aesenc		$rndkey1,$inout0
3162	$movkey		48($key_),$rndkey1
3163	pxor		$rndkey0l,$inout5	# offset_i ^ round[last]
3164
3165	aesenc		$rndkey0,$inout0
3166	$movkey		64($key_),$rndkey0
3167	jmp		.Locb_enc_loop1
3168
3169.align	32
3170.Locb_enc_loop1:
3171	aesenc		$rndkey1,$inout0
3172	$movkey		($key,%rax),$rndkey1
3173	add		\$32,%rax
3174
3175	aesenc		$rndkey0,$inout0
3176	$movkey		-16($key,%rax),$rndkey0
3177	jnz		.Locb_enc_loop1
3178
3179	aesenc		$rndkey1,$inout0
3180	$movkey		16($key_),$rndkey1	# redundant in tail
3181	mov		%r10,%rax		# restore twisted rounds
3182
3183	aesenclast	$inout5,$inout0
3184	ret
3185.size	__ocb_encrypt1,.-__ocb_encrypt1
3186
3187.globl	aesni_ocb_decrypt
3188.type	aesni_ocb_decrypt,\@function,6
3189.align	32
3190aesni_ocb_decrypt:
3191	lea	(%rsp),%rax
3192	push	%rbx
3193	push	%rbp
3194	push	%r12
3195	push	%r13
3196	push	%r14
3197___
3198$code.=<<___ if ($win64);
3199	lea	-0xa0(%rsp),%rsp
3200	movaps	%xmm6,0x00(%rsp)		# offload everything
3201	movaps	%xmm7,0x10(%rsp)
3202	movaps	%xmm8,0x20(%rsp)
3203	movaps	%xmm9,0x30(%rsp)
3204	movaps	%xmm10,0x40(%rsp)
3205	movaps	%xmm11,0x50(%rsp)
3206	movaps	%xmm12,0x60(%rsp)
3207	movaps	%xmm13,0x70(%rsp)
3208	movaps	%xmm14,0x80(%rsp)
3209	movaps	%xmm15,0x90(%rsp)
3210.Locb_dec_body:
3211___
3212$code.=<<___;
3213	mov	$seventh_arg(%rax),$L_p		# 7th argument
3214	mov	$seventh_arg+8(%rax),$checksum_p	# 8th argument
3215
3216	mov	240($key),$rnds_
3217	mov	$key,$key_
3218	shl	\$4,$rnds_
3219	$movkey	($key),$rndkey0l		# round[0]
3220	$movkey	16($key,$rnds_),$rndkey1	# round[last]
3221
3222	movdqu	($offset_p),@offset[5]		# load last offset_i
3223	pxor	$rndkey1,$rndkey0l		# round[0] ^ round[last]
3224	pxor	$rndkey1,@offset[5]		# offset_i ^ round[last]
3225
3226	mov	\$16+32,$rounds
3227	lea	32($key_,$rnds_),$key
3228	$movkey	16($key_),$rndkey1		# round[1]
3229	sub	%r10,%rax			# twisted $rounds
3230	mov	%rax,%r10			# backup twisted $rounds
3231
3232	movdqu	($L_p),@offset[0]		# L_0 for all odd-numbered blocks
3233	movdqu	($checksum_p),$checksum		# load checksum
3234
3235	test	\$1,$block_num			# is first block number odd?
3236	jnz	.Locb_dec_odd
3237
3238	bsf	$block_num,$i1
3239	add	\$1,$block_num
3240	shl	\$4,$i1
3241	movdqu	($L_p,$i1),$inout5		# borrow
3242	movdqu	($inp),$inout0
3243	lea	16($inp),$inp
3244
3245	call	__ocb_decrypt1
3246
3247	movdqa	$inout5,@offset[5]
3248	movups	$inout0,($out)
3249	xorps	$inout0,$checksum		# accumulate checksum
3250	lea	16($out),$out
3251	sub	\$1,$blocks
3252	jz	.Locb_dec_done
3253
3254.Locb_dec_odd:
3255	lea	1($block_num),$i1		# even-numbered blocks
3256	lea	3($block_num),$i3
3257	lea	5($block_num),$i5
3258	lea	6($block_num),$block_num
3259	bsf	$i1,$i1				# ntz(block)
3260	bsf	$i3,$i3
3261	bsf	$i5,$i5
3262	shl	\$4,$i1				# ntz(block) -> table offset
3263	shl	\$4,$i3
3264	shl	\$4,$i5
3265
3266	sub	\$6,$blocks
3267	jc	.Locb_dec_short
3268	jmp	.Locb_dec_grandloop
3269
3270.align	32
3271.Locb_dec_grandloop:
3272	movdqu	`16*0`($inp),$inout0		# load input
3273	movdqu	`16*1`($inp),$inout1
3274	movdqu	`16*2`($inp),$inout2
3275	movdqu	`16*3`($inp),$inout3
3276	movdqu	`16*4`($inp),$inout4
3277	movdqu	`16*5`($inp),$inout5
3278	lea	`16*6`($inp),$inp
3279
3280	call	__ocb_decrypt6
3281
3282	movups	$inout0,`16*0`($out)		# store output
3283	pxor	$inout0,$checksum		# accumulate checksum
3284	movups	$inout1,`16*1`($out)
3285	pxor	$inout1,$checksum
3286	movups	$inout2,`16*2`($out)
3287	pxor	$inout2,$checksum
3288	movups	$inout3,`16*3`($out)
3289	pxor	$inout3,$checksum
3290	movups	$inout4,`16*4`($out)
3291	pxor	$inout4,$checksum
3292	movups	$inout5,`16*5`($out)
3293	pxor	$inout5,$checksum
3294	lea	`16*6`($out),$out
3295	sub	\$6,$blocks
3296	jnc	.Locb_dec_grandloop
3297
3298.Locb_dec_short:
3299	add	\$6,$blocks
3300	jz	.Locb_dec_done
3301
3302	movdqu	`16*0`($inp),$inout0
3303	cmp	\$2,$blocks
3304	jb	.Locb_dec_one
3305	movdqu	`16*1`($inp),$inout1
3306	je	.Locb_dec_two
3307
3308	movdqu	`16*2`($inp),$inout2
3309	cmp	\$4,$blocks
3310	jb	.Locb_dec_three
3311	movdqu	`16*3`($inp),$inout3
3312	je	.Locb_dec_four
3313
3314	movdqu	`16*4`($inp),$inout4
3315	pxor	$inout5,$inout5
3316
3317	call	__ocb_decrypt6
3318
3319	movdqa	@offset[4],@offset[5]
3320	movups	$inout0,`16*0`($out)		# store output
3321	pxor	$inout0,$checksum		# accumulate checksum
3322	movups	$inout1,`16*1`($out)
3323	pxor	$inout1,$checksum
3324	movups	$inout2,`16*2`($out)
3325	pxor	$inout2,$checksum
3326	movups	$inout3,`16*3`($out)
3327	pxor	$inout3,$checksum
3328	movups	$inout4,`16*4`($out)
3329	pxor	$inout4,$checksum
3330
3331	jmp	.Locb_dec_done
3332
3333.align	16
3334.Locb_dec_one:
3335	movdqa	@offset[0],$inout5		# borrow
3336
3337	call	__ocb_decrypt1
3338
3339	movdqa	$inout5,@offset[5]
3340	movups	$inout0,`16*0`($out)		# store output
3341	xorps	$inout0,$checksum		# accumulate checksum
3342	jmp	.Locb_dec_done
3343
3344.align	16
3345.Locb_dec_two:
3346	pxor	$inout2,$inout2
3347	pxor	$inout3,$inout3
3348
3349	call	__ocb_decrypt4
3350
3351	movdqa	@offset[1],@offset[5]
3352	movups	$inout0,`16*0`($out)		# store output
3353	xorps	$inout0,$checksum		# accumulate checksum
3354	movups	$inout1,`16*1`($out)
3355	xorps	$inout1,$checksum
3356
3357	jmp	.Locb_dec_done
3358
3359.align	16
3360.Locb_dec_three:
3361	pxor	$inout3,$inout3
3362
3363	call	__ocb_decrypt4
3364
3365	movdqa	@offset[2],@offset[5]
3366	movups	$inout0,`16*0`($out)		# store output
3367	xorps	$inout0,$checksum		# accumulate checksum
3368	movups	$inout1,`16*1`($out)
3369	xorps	$inout1,$checksum
3370	movups	$inout2,`16*2`($out)
3371	xorps	$inout2,$checksum
3372
3373	jmp	.Locb_dec_done
3374
3375.align	16
3376.Locb_dec_four:
3377	call	__ocb_decrypt4
3378
3379	movdqa	@offset[3],@offset[5]
3380	movups	$inout0,`16*0`($out)		# store output
3381	pxor	$inout0,$checksum		# accumulate checksum
3382	movups	$inout1,`16*1`($out)
3383	pxor	$inout1,$checksum
3384	movups	$inout2,`16*2`($out)
3385	pxor	$inout2,$checksum
3386	movups	$inout3,`16*3`($out)
3387	pxor	$inout3,$checksum
3388
3389.Locb_dec_done:
3390	pxor	$rndkey0,@offset[5]		# "remove" round[last]
3391	movdqu	$checksum,($checksum_p)		# store checksum
3392	movdqu	@offset[5],($offset_p)		# store last offset_i
3393
3394	xorps	%xmm0,%xmm0			# clear register bank
3395	pxor	%xmm1,%xmm1
3396	pxor	%xmm2,%xmm2
3397	pxor	%xmm3,%xmm3
3398	pxor	%xmm4,%xmm4
3399	pxor	%xmm5,%xmm5
3400___
3401$code.=<<___ if (!$win64);
3402	pxor	%xmm6,%xmm6
3403	pxor	%xmm7,%xmm7
3404	pxor	%xmm8,%xmm8
3405	pxor	%xmm9,%xmm9
3406	pxor	%xmm10,%xmm10
3407	pxor	%xmm11,%xmm11
3408	pxor	%xmm12,%xmm12
3409	pxor	%xmm13,%xmm13
3410	pxor	%xmm14,%xmm14
3411	pxor	%xmm15,%xmm15
3412	lea	0x28(%rsp),%rax
3413___
3414$code.=<<___ if ($win64);
3415	movaps	0x00(%rsp),%xmm6
3416	movaps	%xmm0,0x00(%rsp)		# clear stack
3417	movaps	0x10(%rsp),%xmm7
3418	movaps	%xmm0,0x10(%rsp)
3419	movaps	0x20(%rsp),%xmm8
3420	movaps	%xmm0,0x20(%rsp)
3421	movaps	0x30(%rsp),%xmm9
3422	movaps	%xmm0,0x30(%rsp)
3423	movaps	0x40(%rsp),%xmm10
3424	movaps	%xmm0,0x40(%rsp)
3425	movaps	0x50(%rsp),%xmm11
3426	movaps	%xmm0,0x50(%rsp)
3427	movaps	0x60(%rsp),%xmm12
3428	movaps	%xmm0,0x60(%rsp)
3429	movaps	0x70(%rsp),%xmm13
3430	movaps	%xmm0,0x70(%rsp)
3431	movaps	0x80(%rsp),%xmm14
3432	movaps	%xmm0,0x80(%rsp)
3433	movaps	0x90(%rsp),%xmm15
3434	movaps	%xmm0,0x90(%rsp)
3435	lea	0xa0+0x28(%rsp),%rax
3436.Locb_dec_pop:
3437___
3438$code.=<<___;
3439	mov	-40(%rax),%r14
3440	mov	-32(%rax),%r13
3441	mov	-24(%rax),%r12
3442	mov	-16(%rax),%rbp
3443	mov	-8(%rax),%rbx
3444	lea	(%rax),%rsp
3445.Locb_dec_epilogue:
3446	ret
3447.size	aesni_ocb_decrypt,.-aesni_ocb_decrypt
3448
3449.type	__ocb_decrypt6,\@abi-omnipotent
3450.align	32
3451__ocb_decrypt6:
3452	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
3453	 movdqu		($L_p,$i1),@offset[1]
3454	 movdqa		@offset[0],@offset[2]
3455	 movdqu		($L_p,$i3),@offset[3]
3456	 movdqa		@offset[0],@offset[4]
3457	 pxor		@offset[5],@offset[0]
3458	 movdqu		($L_p,$i5),@offset[5]
3459	 pxor		@offset[0],@offset[1]
3460	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
3461	 pxor		@offset[1],@offset[2]
3462	pxor		@offset[1],$inout1
3463	 pxor		@offset[2],@offset[3]
3464	pxor		@offset[2],$inout2
3465	 pxor		@offset[3],@offset[4]
3466	pxor		@offset[3],$inout3
3467	 pxor		@offset[4],@offset[5]
3468	pxor		@offset[4],$inout4
3469	pxor		@offset[5],$inout5
3470	$movkey		32($key_),$rndkey0
3471
3472	lea		1($block_num),$i1	# even-numbered blocks
3473	lea		3($block_num),$i3
3474	lea		5($block_num),$i5
3475	add		\$6,$block_num
3476	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
3477	bsf		$i1,$i1			# ntz(block)
3478	bsf		$i3,$i3
3479	bsf		$i5,$i5
3480
3481	aesdec		$rndkey1,$inout0
3482	aesdec		$rndkey1,$inout1
3483	aesdec		$rndkey1,$inout2
3484	aesdec		$rndkey1,$inout3
3485	 pxor		$rndkey0l,@offset[1]
3486	 pxor		$rndkey0l,@offset[2]
3487	aesdec		$rndkey1,$inout4
3488	 pxor		$rndkey0l,@offset[3]
3489	 pxor		$rndkey0l,@offset[4]
3490	aesdec		$rndkey1,$inout5
3491	$movkey		48($key_),$rndkey1
3492	 pxor		$rndkey0l,@offset[5]
3493
3494	aesdec		$rndkey0,$inout0
3495	aesdec		$rndkey0,$inout1
3496	aesdec		$rndkey0,$inout2
3497	aesdec		$rndkey0,$inout3
3498	aesdec		$rndkey0,$inout4
3499	aesdec		$rndkey0,$inout5
3500	$movkey		64($key_),$rndkey0
3501	shl		\$4,$i1			# ntz(block) -> table offset
3502	shl		\$4,$i3
3503	jmp		.Locb_dec_loop6
3504
3505.align	32
3506.Locb_dec_loop6:
3507	aesdec		$rndkey1,$inout0
3508	aesdec		$rndkey1,$inout1
3509	aesdec		$rndkey1,$inout2
3510	aesdec		$rndkey1,$inout3
3511	aesdec		$rndkey1,$inout4
3512	aesdec		$rndkey1,$inout5
3513	$movkey		($key,%rax),$rndkey1
3514	add		\$32,%rax
3515
3516	aesdec		$rndkey0,$inout0
3517	aesdec		$rndkey0,$inout1
3518	aesdec		$rndkey0,$inout2
3519	aesdec		$rndkey0,$inout3
3520	aesdec		$rndkey0,$inout4
3521	aesdec		$rndkey0,$inout5
3522	$movkey		-16($key,%rax),$rndkey0
3523	jnz		.Locb_dec_loop6
3524
3525	aesdec		$rndkey1,$inout0
3526	aesdec		$rndkey1,$inout1
3527	aesdec		$rndkey1,$inout2
3528	aesdec		$rndkey1,$inout3
3529	aesdec		$rndkey1,$inout4
3530	aesdec		$rndkey1,$inout5
3531	$movkey		16($key_),$rndkey1
3532	shl		\$4,$i5
3533
3534	aesdeclast	@offset[0],$inout0
3535	movdqu		($L_p),@offset[0]	# L_0 for all odd-numbered blocks
3536	mov		%r10,%rax		# restore twisted rounds
3537	aesdeclast	@offset[1],$inout1
3538	aesdeclast	@offset[2],$inout2
3539	aesdeclast	@offset[3],$inout3
3540	aesdeclast	@offset[4],$inout4
3541	aesdeclast	@offset[5],$inout5
3542	ret
3543.size	__ocb_decrypt6,.-__ocb_decrypt6
3544
3545.type	__ocb_decrypt4,\@abi-omnipotent
3546.align	32
3547__ocb_decrypt4:
3548	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
3549	 movdqu		($L_p,$i1),@offset[1]
3550	 movdqa		@offset[0],@offset[2]
3551	 movdqu		($L_p,$i3),@offset[3]
3552	 pxor		@offset[5],@offset[0]
3553	 pxor		@offset[0],@offset[1]
3554	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
3555	 pxor		@offset[1],@offset[2]
3556	pxor		@offset[1],$inout1
3557	 pxor		@offset[2],@offset[3]
3558	pxor		@offset[2],$inout2
3559	pxor		@offset[3],$inout3
3560	$movkey		32($key_),$rndkey0
3561
3562	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
3563	 pxor		$rndkey0l,@offset[1]
3564	 pxor		$rndkey0l,@offset[2]
3565	 pxor		$rndkey0l,@offset[3]
3566
3567	aesdec		$rndkey1,$inout0
3568	aesdec		$rndkey1,$inout1
3569	aesdec		$rndkey1,$inout2
3570	aesdec		$rndkey1,$inout3
3571	$movkey		48($key_),$rndkey1
3572
3573	aesdec		$rndkey0,$inout0
3574	aesdec		$rndkey0,$inout1
3575	aesdec		$rndkey0,$inout2
3576	aesdec		$rndkey0,$inout3
3577	$movkey		64($key_),$rndkey0
3578	jmp		.Locb_dec_loop4
3579
3580.align	32
3581.Locb_dec_loop4:
3582	aesdec		$rndkey1,$inout0
3583	aesdec		$rndkey1,$inout1
3584	aesdec		$rndkey1,$inout2
3585	aesdec		$rndkey1,$inout3
3586	$movkey		($key,%rax),$rndkey1
3587	add		\$32,%rax
3588
3589	aesdec		$rndkey0,$inout0
3590	aesdec		$rndkey0,$inout1
3591	aesdec		$rndkey0,$inout2
3592	aesdec		$rndkey0,$inout3
3593	$movkey		-16($key,%rax),$rndkey0
3594	jnz		.Locb_dec_loop4
3595
3596	aesdec		$rndkey1,$inout0
3597	aesdec		$rndkey1,$inout1
3598	aesdec		$rndkey1,$inout2
3599	aesdec		$rndkey1,$inout3
3600	$movkey		16($key_),$rndkey1
3601	mov		%r10,%rax		# restore twisted rounds
3602
3603	aesdeclast	@offset[0],$inout0
3604	aesdeclast	@offset[1],$inout1
3605	aesdeclast	@offset[2],$inout2
3606	aesdeclast	@offset[3],$inout3
3607	ret
3608.size	__ocb_decrypt4,.-__ocb_decrypt4
3609
3610.type	__ocb_decrypt1,\@abi-omnipotent
3611.align	32
3612__ocb_decrypt1:
3613	 pxor		@offset[5],$inout5	# offset_i
3614	 pxor		$rndkey0l,$inout5	# offset_i ^ round[0]
3615	pxor		$inout5,$inout0		# input ^ round[0] ^ offset_i
3616	$movkey		32($key_),$rndkey0
3617
3618	aesdec		$rndkey1,$inout0
3619	$movkey		48($key_),$rndkey1
3620	pxor		$rndkey0l,$inout5	# offset_i ^ round[last]
3621
3622	aesdec		$rndkey0,$inout0
3623	$movkey		64($key_),$rndkey0
3624	jmp		.Locb_dec_loop1
3625
3626.align	32
3627.Locb_dec_loop1:
3628	aesdec		$rndkey1,$inout0
3629	$movkey		($key,%rax),$rndkey1
3630	add		\$32,%rax
3631
3632	aesdec		$rndkey0,$inout0
3633	$movkey		-16($key,%rax),$rndkey0
3634	jnz		.Locb_dec_loop1
3635
3636	aesdec		$rndkey1,$inout0
3637	$movkey		16($key_),$rndkey1	# redundant in tail
3638	mov		%r10,%rax		# restore twisted rounds
3639
3640	aesdeclast	$inout5,$inout0
3641	ret
3642.size	__ocb_decrypt1,.-__ocb_decrypt1
3643___
3644} }}
3645
3646########################################################################
3647# void $PREFIX_cbc_encrypt (const void *inp, void *out,
3648#			    size_t length, const AES_KEY *key,
3649#			    unsigned char *ivp, const int enc);
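#
# A reference-only sketch of the chaining the encrypt loop below performs
# (hypothetical helper, never called by this module; $encrypt_block is an
# assumed single-block cipher callback and "^" is Perl's byte-string XOR).
# The generated code keeps the running IV in $inout0 instead of a variable.
sub cbc_encrypt_ref {
    my ($encrypt_block,$iv,@blocks) = @_;	# 16-byte strings throughout
    my @out;
    for my $block (@blocks) {
	$iv = $encrypt_block->($block ^ $iv);	# C_i = E_K(P_i ^ C_{i-1})
	push @out,$iv;
    }
    return @out;
}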
3650{
3651my $frame_size = 0x10 + ($win64?0xa0:0);	# used in decrypt
3652my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
3653
3654$code.=<<___;
3655.globl	${PREFIX}_cbc_encrypt
3656.type	${PREFIX}_cbc_encrypt,\@function,6
3657.align	16
3658${PREFIX}_cbc_encrypt:
3659	test	$len,$len		# check length
3660	jz	.Lcbc_ret
3661
3662	mov	240($key),$rnds_	# key->rounds
3663	mov	$key,$key_		# backup $key
3664	test	%r9d,%r9d		# 6th argument
3665	jz	.Lcbc_decrypt
3666#--------------------------- CBC ENCRYPT ------------------------------#
3667	movups	($ivp),$inout0		# load iv as initial state
3668	mov	$rnds_,$rounds
3669	cmp	\$16,$len
3670	jb	.Lcbc_enc_tail
3671	sub	\$16,$len
3672	jmp	.Lcbc_enc_loop
3673.align	16
3674.Lcbc_enc_loop:
3675	movups	($inp),$inout1		# load input
3676	lea	16($inp),$inp
3677	#xorps	$inout1,$inout0
3678___
3679	&aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
3680$code.=<<___;
3681	mov	$rnds_,$rounds		# restore $rounds
3682	mov	$key_,$key		# restore $key
3683	movups	$inout0,0($out)		# store output
3684	lea	16($out),$out
3685	sub	\$16,$len
3686	jnc	.Lcbc_enc_loop
3687	add	\$16,$len
3688	jnz	.Lcbc_enc_tail
3689	 pxor	$rndkey0,$rndkey0	# clear register bank
3690	 pxor	$rndkey1,$rndkey1
3691	movups	$inout0,($ivp)
3692	 pxor	$inout0,$inout0
3693	 pxor	$inout1,$inout1
3694	jmp	.Lcbc_ret
3695
3696.Lcbc_enc_tail:
3697	mov	$len,%rcx	# zaps $key
3698	xchg	$inp,$out	# $inp is %rsi and $out is %rdi now
3699	.long	0x9066A4F3	# rep movsb
3700	mov	\$16,%ecx	# zero tail
3701	sub	$len,%rcx
3702	xor	%eax,%eax
3703	.long	0x9066AAF3	# rep stosb
3704	lea	-16(%rdi),%rdi	# rewind $out by 1 block
3705	mov	$rnds_,$rounds	# restore $rounds
3706	mov	%rdi,%rsi	# $inp and $out are the same
3707	mov	$key_,$key	# restore $key
3708	xor	$len,$len	# len=16
3709	jmp	.Lcbc_enc_loop	# one more spin
3710#--------------------------- CBC DECRYPT ------------------------------#
3711.align	16
3712.Lcbc_decrypt:
3713	cmp	\$16,$len
3714	jne	.Lcbc_decrypt_bulk
3715
3716	# handle single block without allocating stack frame,
3717	# useful in ciphertext stealing mode
3718	movdqu	($inp),$inout0		# load input
3719	movdqu	($ivp),$inout1		# load iv
3720	movdqa	$inout0,$inout2		# future iv
3721___
3722	&aesni_generate1("dec",$key,$rnds_);
3723$code.=<<___;
3724	 pxor	$rndkey0,$rndkey0	# clear register bank
3725	 pxor	$rndkey1,$rndkey1
3726	movdqu	$inout2,($ivp)		# store iv
3727	xorps	$inout1,$inout0		# ^=iv
3728	 pxor	$inout1,$inout1
3729	movups	$inout0,($out)		# store output
3730	 pxor	$inout0,$inout0
3731	jmp	.Lcbc_ret
3732.align	16
3733.Lcbc_decrypt_bulk:
3734	lea	(%rsp),%r11		# frame pointer
3735	push	%rbp
3736	sub	\$$frame_size,%rsp
3737	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
3738___
3739$code.=<<___ if ($win64);
3740	movaps	%xmm6,0x10(%rsp)
3741	movaps	%xmm7,0x20(%rsp)
3742	movaps	%xmm8,0x30(%rsp)
3743	movaps	%xmm9,0x40(%rsp)
3744	movaps	%xmm10,0x50(%rsp)
3745	movaps	%xmm11,0x60(%rsp)
3746	movaps	%xmm12,0x70(%rsp)
3747	movaps	%xmm13,0x80(%rsp)
3748	movaps	%xmm14,0x90(%rsp)
3749	movaps	%xmm15,0xa0(%rsp)
3750.Lcbc_decrypt_body:
3751___
3752
3753my $inp_=$key_="%rbp";			# reassign $key_
3754
3755$code.=<<___;
3756	mov	$key,$key_		# [re-]backup $key [after reassignment]
3757	movups	($ivp),$iv
3758	mov	$rnds_,$rounds
3759	cmp	\$0x50,$len
3760	jbe	.Lcbc_dec_tail
3761
3762	$movkey	($key),$rndkey0
3763	movdqu	0x00($inp),$inout0	# load input
3764	movdqu	0x10($inp),$inout1
3765	movdqa	$inout0,$in0
3766	movdqu	0x20($inp),$inout2
3767	movdqa	$inout1,$in1
3768	movdqu	0x30($inp),$inout3
3769	movdqa	$inout2,$in2
3770	movdqu	0x40($inp),$inout4
3771	movdqa	$inout3,$in3
3772	movdqu	0x50($inp),$inout5
3773	movdqa	$inout4,$in4
3774	mov	OPENSSL_ia32cap_P+4(%rip),%r9d
3775	cmp	\$0x70,$len
3776	jbe	.Lcbc_dec_six_or_seven
3777
3778	and	\$`1<<26|1<<22`,%r9d	# isolate XSAVE+MOVBE
3779	sub	\$0x50,$len		# $len is biased by -5*16
3780	cmp	\$`1<<22`,%r9d		# check for MOVBE without XSAVE
3781	je	.Lcbc_dec_loop6_enter	# [which denotes Atom Silvermont]
3782	sub	\$0x20,$len		# $len is biased by -7*16
3783	lea	0x70($key),$key		# size optimization
3784	jmp	.Lcbc_dec_loop8_enter
3785.align	16
3786.Lcbc_dec_loop8:
3787	movups	$inout7,($out)
3788	lea	0x10($out),$out
3789.Lcbc_dec_loop8_enter:
3790	movdqu		0x60($inp),$inout6
3791	pxor		$rndkey0,$inout0
3792	movdqu		0x70($inp),$inout7
3793	pxor		$rndkey0,$inout1
3794	$movkey		0x10-0x70($key),$rndkey1
3795	pxor		$rndkey0,$inout2
3796	mov		\$-1,$inp_		# all-ones for carry-based select below
3797	cmp		\$0x70,$len	# are there at least 0x60 bytes ahead?
3798	pxor		$rndkey0,$inout3
3799	pxor		$rndkey0,$inout4
3800	pxor		$rndkey0,$inout5
3801	pxor		$rndkey0,$inout6
3802
3803	aesdec		$rndkey1,$inout0
3804	pxor		$rndkey0,$inout7
3805	$movkey		0x20-0x70($key),$rndkey0
3806	aesdec		$rndkey1,$inout1
3807	aesdec		$rndkey1,$inout2
3808	aesdec		$rndkey1,$inout3
3809	aesdec		$rndkey1,$inout4
3810	aesdec		$rndkey1,$inout5
3811	aesdec		$rndkey1,$inout6
3812	adc		\$0,$inp_		# $inp_ = $len<0x70 ? 0 : -1
3813	and		\$128,$inp_		# ... and then 0 or 0x80
3814	aesdec		$rndkey1,$inout7
3815	add		$inp,$inp_		# $inp_ = $inp, or $inp+0x80 if enough input ahead
3816	$movkey		0x30-0x70($key),$rndkey1
3817___
3818for($i=1;$i<12;$i++) {
3819my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
3820$code.=<<___	if ($i==7);
3821	cmp		\$11,$rounds
3822___
3823$code.=<<___;
3824	aesdec		$rndkeyx,$inout0
3825	aesdec		$rndkeyx,$inout1
3826	aesdec		$rndkeyx,$inout2
3827	aesdec		$rndkeyx,$inout3
3828	aesdec		$rndkeyx,$inout4
3829	aesdec		$rndkeyx,$inout5
3830	aesdec		$rndkeyx,$inout6
3831	aesdec		$rndkeyx,$inout7
3832	$movkey		`0x30+0x10*$i`-0x70($key),$rndkeyx
3833___
3834$code.=<<___	if ($i<6 || (!($i&1) && $i>7));
3835	nop
3836___
3837$code.=<<___	if ($i==7);
3838	jb		.Lcbc_dec_done
3839___
3840$code.=<<___	if ($i==9);
3841	je		.Lcbc_dec_done
3842___
3843$code.=<<___	if ($i==11);
3844	jmp		.Lcbc_dec_done
3845___
3846}
3847$code.=<<___;
3848.align	16
3849.Lcbc_dec_done:
3850	aesdec		$rndkey1,$inout0
3851	aesdec		$rndkey1,$inout1
3852	pxor		$rndkey0,$iv
3853	pxor		$rndkey0,$in0
3854	aesdec		$rndkey1,$inout2
3855	aesdec		$rndkey1,$inout3
3856	pxor		$rndkey0,$in1
3857	pxor		$rndkey0,$in2
3858	aesdec		$rndkey1,$inout4
3859	aesdec		$rndkey1,$inout5
3860	pxor		$rndkey0,$in3
3861	pxor		$rndkey0,$in4
3862	aesdec		$rndkey1,$inout6
3863	aesdec		$rndkey1,$inout7
3864	movdqu		0x50($inp),$rndkey1
3865
3866	aesdeclast	$iv,$inout0
3867	movdqu		0x60($inp),$iv		# borrow $iv
3868	pxor		$rndkey0,$rndkey1
3869	aesdeclast	$in0,$inout1
3870	pxor		$rndkey0,$iv
3871	movdqu		0x70($inp),$rndkey0	# next IV
3872	aesdeclast	$in1,$inout2
3873	lea		0x80($inp),$inp
3874	movdqu		0x00($inp_),$in0
3875	aesdeclast	$in2,$inout3
3876	aesdeclast	$in3,$inout4
3877	movdqu		0x10($inp_),$in1
3878	movdqu		0x20($inp_),$in2
3879	aesdeclast	$in4,$inout5
3880	aesdeclast	$rndkey1,$inout6
3881	movdqu		0x30($inp_),$in3
3882	movdqu		0x40($inp_),$in4
3883	aesdeclast	$iv,$inout7
3884	movdqa		$rndkey0,$iv		# return $iv
3885	movdqu		0x50($inp_),$rndkey1
3886	$movkey		-0x70($key),$rndkey0
3887
3888	movups		$inout0,($out)		# store output
3889	movdqa		$in0,$inout0
3890	movups		$inout1,0x10($out)
3891	movdqa		$in1,$inout1
3892	movups		$inout2,0x20($out)
3893	movdqa		$in2,$inout2
3894	movups		$inout3,0x30($out)
3895	movdqa		$in3,$inout3
3896	movups		$inout4,0x40($out)
3897	movdqa		$in4,$inout4
3898	movups		$inout5,0x50($out)
3899	movdqa		$rndkey1,$inout5
3900	movups		$inout6,0x60($out)
3901	lea		0x70($out),$out
3902
3903	sub	\$0x80,$len
3904	ja	.Lcbc_dec_loop8
3905
3906	movaps	$inout7,$inout0
3907	lea	-0x70($key),$key
3908	add	\$0x70,$len
3909	jle	.Lcbc_dec_clear_tail_collected
3910	movups	$inout7,($out)
3911	lea	0x10($out),$out
3912	cmp	\$0x50,$len
3913	jbe	.Lcbc_dec_tail
3914
3915	movaps	$in0,$inout0
3916.Lcbc_dec_six_or_seven:
3917	cmp	\$0x60,$len
3918	ja	.Lcbc_dec_seven
3919
3920	movaps	$inout5,$inout6
3921	call	_aesni_decrypt6
3922	pxor	$iv,$inout0		# ^= IV
3923	movaps	$inout6,$iv
3924	pxor	$in0,$inout1
3925	movdqu	$inout0,($out)
3926	pxor	$in1,$inout2
3927	movdqu	$inout1,0x10($out)
3928	 pxor	$inout1,$inout1		# clear register bank
3929	pxor	$in2,$inout3
3930	movdqu	$inout2,0x20($out)
3931	 pxor	$inout2,$inout2
3932	pxor	$in3,$inout4
3933	movdqu	$inout3,0x30($out)
3934	 pxor	$inout3,$inout3
3935	pxor	$in4,$inout5
3936	movdqu	$inout4,0x40($out)
3937	 pxor	$inout4,$inout4
3938	lea	0x50($out),$out
3939	movdqa	$inout5,$inout0
3940	 pxor	$inout5,$inout5
3941	jmp	.Lcbc_dec_tail_collected
3942
3943.align	16
3944.Lcbc_dec_seven:
3945	movups	0x60($inp),$inout6
3946	xorps	$inout7,$inout7
3947	call	_aesni_decrypt8
3948	movups	0x50($inp),$inout7
3949	pxor	$iv,$inout0		# ^= IV
3950	movups	0x60($inp),$iv
3951	pxor	$in0,$inout1
3952	movdqu	$inout0,($out)
3953	pxor	$in1,$inout2
3954	movdqu	$inout1,0x10($out)
3955	 pxor	$inout1,$inout1		# clear register bank
3956	pxor	$in2,$inout3
3957	movdqu	$inout2,0x20($out)
3958	 pxor	$inout2,$inout2
3959	pxor	$in3,$inout4
3960	movdqu	$inout3,0x30($out)
3961	 pxor	$inout3,$inout3
3962	pxor	$in4,$inout5
3963	movdqu	$inout4,0x40($out)
3964	 pxor	$inout4,$inout4
3965	pxor	$inout7,$inout6
3966	movdqu	$inout5,0x50($out)
3967	 pxor	$inout5,$inout5
3968	lea	0x60($out),$out
3969	movdqa	$inout6,$inout0
3970	 pxor	$inout6,$inout6
3971	 pxor	$inout7,$inout7
3972	jmp	.Lcbc_dec_tail_collected
3973
3974.align	16
3975.Lcbc_dec_loop6:
3976	movups	$inout5,($out)
3977	lea	0x10($out),$out
3978	movdqu	0x00($inp),$inout0	# load input
3979	movdqu	0x10($inp),$inout1
3980	movdqa	$inout0,$in0
3981	movdqu	0x20($inp),$inout2
3982	movdqa	$inout1,$in1
3983	movdqu	0x30($inp),$inout3
3984	movdqa	$inout2,$in2
3985	movdqu	0x40($inp),$inout4
3986	movdqa	$inout3,$in3
3987	movdqu	0x50($inp),$inout5
3988	movdqa	$inout4,$in4
3989.Lcbc_dec_loop6_enter:
3990	lea	0x60($inp),$inp
3991	movdqa	$inout5,$inout6
3992
3993	call	_aesni_decrypt6
3994
3995	pxor	$iv,$inout0		# ^= IV
3996	movdqa	$inout6,$iv
3997	pxor	$in0,$inout1
3998	movdqu	$inout0,($out)
3999	pxor	$in1,$inout2
4000	movdqu	$inout1,0x10($out)
4001	pxor	$in2,$inout3
4002	movdqu	$inout2,0x20($out)
4003	pxor	$in3,$inout4
4004	mov	$key_,$key
4005	movdqu	$inout3,0x30($out)
4006	pxor	$in4,$inout5
4007	mov	$rnds_,$rounds
4008	movdqu	$inout4,0x40($out)
4009	lea	0x50($out),$out
4010	sub	\$0x60,$len
4011	ja	.Lcbc_dec_loop6
4012
4013	movdqa	$inout5,$inout0
4014	add	\$0x50,$len
4015	jle	.Lcbc_dec_clear_tail_collected
4016	movups	$inout5,($out)
4017	lea	0x10($out),$out
4018
4019.Lcbc_dec_tail:
4020	movups	($inp),$inout0
4021	sub	\$0x10,$len
4022	jbe	.Lcbc_dec_one		# $len is 1*16 or less
4023
4024	movups	0x10($inp),$inout1
4025	movaps	$inout0,$in0
4026	sub	\$0x10,$len
4027	jbe	.Lcbc_dec_two		# $len is 2*16 or less
4028
4029	movups	0x20($inp),$inout2
4030	movaps	$inout1,$in1
4031	sub	\$0x10,$len
4032	jbe	.Lcbc_dec_three		# $len is 3*16 or less
4033
4034	movups	0x30($inp),$inout3
4035	movaps	$inout2,$in2
4036	sub	\$0x10,$len
4037	jbe	.Lcbc_dec_four		# $len is 4*16 or less
4038
4039	movups	0x40($inp),$inout4	# $len is 5*16 or less
4040	movaps	$inout3,$in3
4041	movaps	$inout4,$in4
4042	xorps	$inout5,$inout5
4043	call	_aesni_decrypt6
4044	pxor	$iv,$inout0
4045	movaps	$in4,$iv
4046	pxor	$in0,$inout1
4047	movdqu	$inout0,($out)
4048	pxor	$in1,$inout2
4049	movdqu	$inout1,0x10($out)
4050	 pxor	$inout1,$inout1		# clear register bank
4051	pxor	$in2,$inout3
4052	movdqu	$inout2,0x20($out)
4053	 pxor	$inout2,$inout2
4054	pxor	$in3,$inout4
4055	movdqu	$inout3,0x30($out)
4056	 pxor	$inout3,$inout3
4057	lea	0x40($out),$out
4058	movdqa	$inout4,$inout0
4059	 pxor	$inout4,$inout4
4060	 pxor	$inout5,$inout5
4061	sub	\$0x10,$len
4062	jmp	.Lcbc_dec_tail_collected
4063
4064.align	16
4065.Lcbc_dec_one:
4066	movaps	$inout0,$in0
4067___
4068	&aesni_generate1("dec",$key,$rounds);
4069$code.=<<___;
4070	xorps	$iv,$inout0
4071	movaps	$in0,$iv
4072	jmp	.Lcbc_dec_tail_collected
4073.align	16
4074.Lcbc_dec_two:
4075	movaps	$inout1,$in1
4076	call	_aesni_decrypt2
4077	pxor	$iv,$inout0
4078	movaps	$in1,$iv
4079	pxor	$in0,$inout1
4080	movdqu	$inout0,($out)
4081	movdqa	$inout1,$inout0
4082	 pxor	$inout1,$inout1		# clear register bank
4083	lea	0x10($out),$out
4084	jmp	.Lcbc_dec_tail_collected
4085.align	16
4086.Lcbc_dec_three:
4087	movaps	$inout2,$in2
4088	call	_aesni_decrypt3
4089	pxor	$iv,$inout0
4090	movaps	$in2,$iv
4091	pxor	$in0,$inout1
4092	movdqu	$inout0,($out)
4093	pxor	$in1,$inout2
4094	movdqu	$inout1,0x10($out)
4095	 pxor	$inout1,$inout1		# clear register bank
4096	movdqa	$inout2,$inout0
4097	 pxor	$inout2,$inout2
4098	lea	0x20($out),$out
4099	jmp	.Lcbc_dec_tail_collected
4100.align	16
4101.Lcbc_dec_four:
4102	movaps	$inout3,$in3
4103	call	_aesni_decrypt4
4104	pxor	$iv,$inout0
4105	movaps	$in3,$iv
4106	pxor	$in0,$inout1
4107	movdqu	$inout0,($out)
4108	pxor	$in1,$inout2
4109	movdqu	$inout1,0x10($out)
4110	 pxor	$inout1,$inout1		# clear register bank
4111	pxor	$in2,$inout3
4112	movdqu	$inout2,0x20($out)
4113	 pxor	$inout2,$inout2
4114	movdqa	$inout3,$inout0
4115	 pxor	$inout3,$inout3
4116	lea	0x30($out),$out
4117	jmp	.Lcbc_dec_tail_collected
4118
4119.align	16
4120.Lcbc_dec_clear_tail_collected:
4121	pxor	$inout1,$inout1		# clear register bank
4122	pxor	$inout2,$inout2
4123	pxor	$inout3,$inout3
4124___
4125$code.=<<___ if (!$win64);
4126	pxor	$inout4,$inout4		# %xmm6..9
4127	pxor	$inout5,$inout5
4128	pxor	$inout6,$inout6
4129	pxor	$inout7,$inout7
4130___
4131$code.=<<___;
4132.Lcbc_dec_tail_collected:
4133	movups	$iv,($ivp)
4134	and	\$15,$len
4135	jnz	.Lcbc_dec_tail_partial
4136	movups	$inout0,($out)
4137	pxor	$inout0,$inout0
4138	jmp	.Lcbc_dec_ret
4139.align	16
4140.Lcbc_dec_tail_partial:
4141	movaps	$inout0,(%rsp)
4142	pxor	$inout0,$inout0
4143	mov	\$16,%rcx
4144	mov	$out,%rdi
4145	sub	$len,%rcx
4146	lea	(%rsp),%rsi
4147	.long	0x9066A4F3		# rep movsb
4148	movdqa	$inout0,(%rsp)
4149
4150.Lcbc_dec_ret:
4151	xorps	$rndkey0,$rndkey0	# %xmm0
4152	pxor	$rndkey1,$rndkey1
4153___
4154$code.=<<___ if ($win64);
4155	movaps	0x10(%rsp),%xmm6
4156	movaps	%xmm0,0x10(%rsp)	# clear stack
4157	movaps	0x20(%rsp),%xmm7
4158	movaps	%xmm0,0x20(%rsp)
4159	movaps	0x30(%rsp),%xmm8
4160	movaps	%xmm0,0x30(%rsp)
4161	movaps	0x40(%rsp),%xmm9
4162	movaps	%xmm0,0x40(%rsp)
4163	movaps	0x50(%rsp),%xmm10
4164	movaps	%xmm0,0x50(%rsp)
4165	movaps	0x60(%rsp),%xmm11
4166	movaps	%xmm0,0x60(%rsp)
4167	movaps	0x70(%rsp),%xmm12
4168	movaps	%xmm0,0x70(%rsp)
4169	movaps	0x80(%rsp),%xmm13
4170	movaps	%xmm0,0x80(%rsp)
4171	movaps	0x90(%rsp),%xmm14
4172	movaps	%xmm0,0x90(%rsp)
4173	movaps	0xa0(%rsp),%xmm15
4174	movaps	%xmm0,0xa0(%rsp)
4175___
4176$code.=<<___;
4177	mov	-8(%r11),%rbp
4178	lea	(%r11),%rsp
4179.Lcbc_ret:
4180	ret
4181.size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
4182___
4183}
4184# int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
4185#				int bits, AES_KEY *key)
4186#
4187# input:	$inp	user-supplied key
4188#		$bits	$inp length in bits
4189#		$key	pointer to key schedule
# output:	%eax	0 on success, -1 or -2 on failure (see C)
4191#		*$key	key schedule
4192#
4193{ my ($inp,$bits,$key) = @_4args;
  $bits =~ s/%r/%e/;			# $bits is a 32-bit argument
4195
4196$code.=<<___;
4197.globl	${PREFIX}_set_decrypt_key
4198.type	${PREFIX}_set_decrypt_key,\@abi-omnipotent
4199.align	16
4200${PREFIX}_set_decrypt_key:
4201	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
4202	call	__aesni_set_encrypt_key
4203	shl	\$4,$bits		# rounds-1 after _aesni_set_encrypt_key
4204	test	%eax,%eax
4205	jnz	.Ldec_key_ret
4206	lea	16($key,$bits),$inp	# points at the end of key schedule
4207
4208	$movkey	($key),%xmm0		# just swap
4209	$movkey	($inp),%xmm1
4210	$movkey	%xmm0,($inp)
4211	$movkey	%xmm1,($key)
4212	lea	16($key),$key
4213	lea	-16($inp),$inp
4214
4215.Ldec_key_inverse:
	$movkey	($key),%xmm0		# swap and invert
4217	$movkey	($inp),%xmm1
4218	aesimc	%xmm0,%xmm0
4219	aesimc	%xmm1,%xmm1
4220	lea	16($key),$key
4221	lea	-16($inp),$inp
4222	$movkey	%xmm0,16($inp)
4223	$movkey	%xmm1,-16($key)
4224	cmp	$key,$inp
4225	ja	.Ldec_key_inverse
4226
	$movkey	($key),%xmm0		# invert middle round key
4228	aesimc	%xmm0,%xmm0
4229	pxor	%xmm1,%xmm1
4230	$movkey	%xmm0,($inp)
4231	pxor	%xmm0,%xmm0
4232.Ldec_key_ret:
4233	add	\$8,%rsp
4234	ret
4235.LSEH_end_set_decrypt_key:
4236.size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
4237___
4238
4239# This is based on submission by
4240#
4241#	Huang Ying <ying.huang@intel.com>
4242#	Vinodh Gopal <vinodh.gopal@intel.com>
4243#	Kahraman Akdemir
4244#
# The expansion is aggressively optimized with respect to aeskeygenassist's
# critical path and is contained in %xmm0-5 to meet the Win64 ABI requirement.
4247#
4248# int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
4249#				int bits, AES_KEY * const key);
4250#
4251# input:	$inp	user-supplied key
4252#		$bits	$inp length in bits
4253#		$key	pointer to key schedule
# output:	%eax	0 on success, -1 or -2 on failure (see C)
4255#		$bits	rounds-1 (used in aesni_set_decrypt_key)
4256#		*$key	key schedule
4257#		$key	pointer to key schedule (used in
4258#			aesni_set_decrypt_key)
4259#
# The subroutine is frame-less, which means that only volatile registers
# are used. Note that it's declared "abi-omnipotent", which means that the
# number of volatile registers available is smaller on Windows.
4263#
4264$code.=<<___;
4265.globl	${PREFIX}_set_encrypt_key
4266.type	${PREFIX}_set_encrypt_key,\@abi-omnipotent
4267.align	16
4268${PREFIX}_set_encrypt_key:
4269__aesni_set_encrypt_key:
4270	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
4271	mov	\$-1,%rax
4272	test	$inp,$inp
4273	jz	.Lenc_key_ret
4274	test	$key,$key
4275	jz	.Lenc_key_ret
4276
4277	mov	\$`1<<28|1<<11`,%r10d	# AVX and XOP bits
4278	movups	($inp),%xmm0		# pull first 128 bits of *userKey
4279	xorps	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
4280	and	OPENSSL_ia32cap_P+4(%rip),%r10d
4281	lea	16($key),%rax		# %rax is used as modifiable copy of $key
4282	cmp	\$256,$bits
4283	je	.L14rounds
4284	cmp	\$192,$bits
4285	je	.L12rounds
4286	cmp	\$128,$bits
4287	jne	.Lbad_keybits
4288
4289.L10rounds:
4290	mov	\$9,$bits			# 10 rounds for 128-bit key
	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
4292	je	.L10rounds_alt
4293
4294	$movkey	%xmm0,($key)			# round 0
4295	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 1
4296	call		.Lkey_expansion_128_cold
4297	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 2
4298	call		.Lkey_expansion_128
4299	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 3
4300	call		.Lkey_expansion_128
4301	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 4
4302	call		.Lkey_expansion_128
4303	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 5
4304	call		.Lkey_expansion_128
4305	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 6
4306	call		.Lkey_expansion_128
4307	aeskeygenassist	\$0x40,%xmm0,%xmm1	# round 7
4308	call		.Lkey_expansion_128
4309	aeskeygenassist	\$0x80,%xmm0,%xmm1	# round 8
4310	call		.Lkey_expansion_128
4311	aeskeygenassist	\$0x1b,%xmm0,%xmm1	# round 9
4312	call		.Lkey_expansion_128
4313	aeskeygenassist	\$0x36,%xmm0,%xmm1	# round 10
4314	call		.Lkey_expansion_128
4315	$movkey	%xmm0,(%rax)
4316	mov	$bits,80(%rax)	# 240(%rdx)
4317	xor	%eax,%eax
4318	jmp	.Lenc_key_ret
4319
4320.align	16
4321.L10rounds_alt:
4322	movdqa	.Lkey_rotate(%rip),%xmm5
4323	mov	\$8,%r10d
4324	movdqa	.Lkey_rcon1(%rip),%xmm4
4325	movdqa	%xmm0,%xmm2
4326	movdqu	%xmm0,($key)
4327	jmp	.Loop_key128
4328
4329.align	16
4330.Loop_key128:
4331	pshufb		%xmm5,%xmm0
4332	aesenclast	%xmm4,%xmm0
4333	pslld		\$1,%xmm4
4334	lea		16(%rax),%rax
4335
4336	movdqa		%xmm2,%xmm3
4337	pslldq		\$4,%xmm2
4338	pxor		%xmm2,%xmm3
4339	pslldq		\$4,%xmm2
4340	pxor		%xmm2,%xmm3
4341	pslldq		\$4,%xmm2
4342	pxor		%xmm3,%xmm2
4343
4344	pxor		%xmm2,%xmm0
4345	movdqu		%xmm0,-16(%rax)
4346	movdqa		%xmm0,%xmm2
4347
4348	dec	%r10d
4349	jnz	.Loop_key128
4350
4351	movdqa		.Lkey_rcon1b(%rip),%xmm4
4352
4353	pshufb		%xmm5,%xmm0
4354	aesenclast	%xmm4,%xmm0
4355	pslld		\$1,%xmm4
4356
4357	movdqa		%xmm2,%xmm3
4358	pslldq		\$4,%xmm2
4359	pxor		%xmm2,%xmm3
4360	pslldq		\$4,%xmm2
4361	pxor		%xmm2,%xmm3
4362	pslldq		\$4,%xmm2
4363	pxor		%xmm3,%xmm2
4364
4365	pxor		%xmm2,%xmm0
4366	movdqu		%xmm0,(%rax)
4367
4368	movdqa		%xmm0,%xmm2
4369	pshufb		%xmm5,%xmm0
4370	aesenclast	%xmm4,%xmm0
4371
4372	movdqa		%xmm2,%xmm3
4373	pslldq		\$4,%xmm2
4374	pxor		%xmm2,%xmm3
4375	pslldq		\$4,%xmm2
4376	pxor		%xmm2,%xmm3
4377	pslldq		\$4,%xmm2
4378	pxor		%xmm3,%xmm2
4379
4380	pxor		%xmm2,%xmm0
4381	movdqu		%xmm0,16(%rax)
4382
4383	mov	$bits,96(%rax)	# 240($key)
4384	xor	%eax,%eax
4385	jmp	.Lenc_key_ret
4386
4387.align	16
4388.L12rounds:
4389	movq	16($inp),%xmm2			# remaining 1/3 of *userKey
4390	mov	\$11,$bits			# 12 rounds for 192
4391	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
4392	je	.L12rounds_alt
4393
4394	$movkey	%xmm0,($key)			# round 0
4395	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 1,2
4396	call		.Lkey_expansion_192a_cold
4397	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 2,3
4398	call		.Lkey_expansion_192b
4399	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 4,5
4400	call		.Lkey_expansion_192a
4401	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 5,6
4402	call		.Lkey_expansion_192b
4403	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 7,8
4404	call		.Lkey_expansion_192a
4405	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 8,9
4406	call		.Lkey_expansion_192b
4407	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 10,11
4408	call		.Lkey_expansion_192a
4409	aeskeygenassist	\$0x80,%xmm2,%xmm1	# round 11,12
4410	call		.Lkey_expansion_192b
4411	$movkey	%xmm0,(%rax)
4412	mov	$bits,48(%rax)	# 240(%rdx)
4413	xor	%rax, %rax
4414	jmp	.Lenc_key_ret
4415
4416.align	16
4417.L12rounds_alt:
4418	movdqa	.Lkey_rotate192(%rip),%xmm5
4419	movdqa	.Lkey_rcon1(%rip),%xmm4
4420	mov	\$8,%r10d
4421	movdqu	%xmm0,($key)
4422	jmp	.Loop_key192
4423
4424.align	16
4425.Loop_key192:
4426	movq		%xmm2,0(%rax)
4427	movdqa		%xmm2,%xmm1
4428	pshufb		%xmm5,%xmm2
4429	aesenclast	%xmm4,%xmm2
4430	pslld		\$1, %xmm4
4431	lea		24(%rax),%rax
4432
4433	movdqa		%xmm0,%xmm3
4434	pslldq		\$4,%xmm0
4435	pxor		%xmm0,%xmm3
4436	pslldq		\$4,%xmm0
4437	pxor		%xmm0,%xmm3
4438	pslldq		\$4,%xmm0
4439	pxor		%xmm3,%xmm0
4440
4441	pshufd		\$0xff,%xmm0,%xmm3
4442	pxor		%xmm1,%xmm3
4443	pslldq		\$4,%xmm1
4444	pxor		%xmm1,%xmm3
4445
4446	pxor		%xmm2,%xmm0
4447	pxor		%xmm3,%xmm2
4448	movdqu		%xmm0,-16(%rax)
4449
4450	dec	%r10d
4451	jnz	.Loop_key192
4452
4453	mov	$bits,32(%rax)	# 240($key)
4454	xor	%eax,%eax
4455	jmp	.Lenc_key_ret
4456
4457.align	16
4458.L14rounds:
	movups	16($inp),%xmm2			# remaining half of *userKey
4460	mov	\$13,$bits			# 14 rounds for 256
4461	lea	16(%rax),%rax
4462	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
4463	je	.L14rounds_alt
4464
4465	$movkey	%xmm0,($key)			# round 0
4466	$movkey	%xmm2,16($key)			# round 1
4467	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 2
4468	call		.Lkey_expansion_256a_cold
4469	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 3
4470	call		.Lkey_expansion_256b
4471	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 4
4472	call		.Lkey_expansion_256a
4473	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 5
4474	call		.Lkey_expansion_256b
4475	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 6
4476	call		.Lkey_expansion_256a
4477	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 7
4478	call		.Lkey_expansion_256b
4479	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 8
4480	call		.Lkey_expansion_256a
4481	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 9
4482	call		.Lkey_expansion_256b
4483	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 10
4484	call		.Lkey_expansion_256a
4485	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 11
4486	call		.Lkey_expansion_256b
4487	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 12
4488	call		.Lkey_expansion_256a
4489	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 13
4490	call		.Lkey_expansion_256b
4491	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 14
4492	call		.Lkey_expansion_256a
4493	$movkey	%xmm0,(%rax)
4494	mov	$bits,16(%rax)	# 240(%rdx)
4495	xor	%rax,%rax
4496	jmp	.Lenc_key_ret
4497
4498.align	16
4499.L14rounds_alt:
4500	movdqa	.Lkey_rotate(%rip),%xmm5
4501	movdqa	.Lkey_rcon1(%rip),%xmm4
4502	mov	\$7,%r10d
4503	movdqu	%xmm0,0($key)
4504	movdqa	%xmm2,%xmm1
4505	movdqu	%xmm2,16($key)
4506	jmp	.Loop_key256
4507
4508.align	16
4509.Loop_key256:
4510	pshufb		%xmm5,%xmm2
4511	aesenclast	%xmm4,%xmm2
4512
4513	movdqa		%xmm0,%xmm3
4514	pslldq		\$4,%xmm0
4515	pxor		%xmm0,%xmm3
4516	pslldq		\$4,%xmm0
4517	pxor		%xmm0,%xmm3
4518	pslldq		\$4,%xmm0
4519	pxor		%xmm3,%xmm0
4520	pslld		\$1,%xmm4
4521
4522	pxor		%xmm2,%xmm0
4523	movdqu		%xmm0,(%rax)
4524
4525	dec	%r10d
4526	jz	.Ldone_key256
4527
4528	pshufd		\$0xff,%xmm0,%xmm2
4529	pxor		%xmm3,%xmm3
4530	aesenclast	%xmm3,%xmm2
4531
4532	movdqa		%xmm1,%xmm3
4533	pslldq		\$4,%xmm1
4534	pxor		%xmm1,%xmm3
4535	pslldq		\$4,%xmm1
4536	pxor		%xmm1,%xmm3
4537	pslldq		\$4,%xmm1
4538	pxor		%xmm3,%xmm1
4539
4540	pxor		%xmm1,%xmm2
4541	movdqu		%xmm2,16(%rax)
4542	lea		32(%rax),%rax
4543	movdqa		%xmm2,%xmm1
4544
4545	jmp	.Loop_key256
4546
4547.Ldone_key256:
4548	mov	$bits,16(%rax)	# 240($key)
4549	xor	%eax,%eax
4550	jmp	.Lenc_key_ret
4551
4552.align	16
4553.Lbad_keybits:
4554	mov	\$-2,%rax
4555.Lenc_key_ret:
4556	pxor	%xmm0,%xmm0
4557	pxor	%xmm1,%xmm1
4558	pxor	%xmm2,%xmm2
4559	pxor	%xmm3,%xmm3
4560	pxor	%xmm4,%xmm4
4561	pxor	%xmm5,%xmm5
4562	add	\$8,%rsp
4563	ret
4564.LSEH_end_set_encrypt_key:
4565
4566.align	16
4567.Lkey_expansion_128:
4568	$movkey	%xmm0,(%rax)
4569	lea	16(%rax),%rax
4570.Lkey_expansion_128_cold:
4571	shufps	\$0b00010000,%xmm0,%xmm4
4572	xorps	%xmm4, %xmm0
4573	shufps	\$0b10001100,%xmm0,%xmm4
4574	xorps	%xmm4, %xmm0
4575	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
4576	xorps	%xmm1,%xmm0
4577	ret
4578
4579.align 16
4580.Lkey_expansion_192a:
4581	$movkey	%xmm0,(%rax)
4582	lea	16(%rax),%rax
4583.Lkey_expansion_192a_cold:
4584	movaps	%xmm2, %xmm5
4585.Lkey_expansion_192b_warm:
4586	shufps	\$0b00010000,%xmm0,%xmm4
4587	movdqa	%xmm2,%xmm3
4588	xorps	%xmm4,%xmm0
4589	shufps	\$0b10001100,%xmm0,%xmm4
4590	pslldq	\$4,%xmm3
4591	xorps	%xmm4,%xmm0
4592	pshufd	\$0b01010101,%xmm1,%xmm1	# critical path
4593	pxor	%xmm3,%xmm2
4594	pxor	%xmm1,%xmm0
4595	pshufd	\$0b11111111,%xmm0,%xmm3
4596	pxor	%xmm3,%xmm2
4597	ret
4598
4599.align 16
4600.Lkey_expansion_192b:
4601	movaps	%xmm0,%xmm3
4602	shufps	\$0b01000100,%xmm0,%xmm5
4603	$movkey	%xmm5,(%rax)
4604	shufps	\$0b01001110,%xmm2,%xmm3
4605	$movkey	%xmm3,16(%rax)
4606	lea	32(%rax),%rax
4607	jmp	.Lkey_expansion_192b_warm
4608
4609.align	16
4610.Lkey_expansion_256a:
4611	$movkey	%xmm2,(%rax)
4612	lea	16(%rax),%rax
4613.Lkey_expansion_256a_cold:
4614	shufps	\$0b00010000,%xmm0,%xmm4
4615	xorps	%xmm4,%xmm0
4616	shufps	\$0b10001100,%xmm0,%xmm4
4617	xorps	%xmm4,%xmm0
4618	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
4619	xorps	%xmm1,%xmm0
4620	ret
4621
4622.align 16
4623.Lkey_expansion_256b:
4624	$movkey	%xmm0,(%rax)
4625	lea	16(%rax),%rax
4626
4627	shufps	\$0b00010000,%xmm2,%xmm4
4628	xorps	%xmm4,%xmm2
4629	shufps	\$0b10001100,%xmm2,%xmm4
4630	xorps	%xmm4,%xmm2
4631	shufps	\$0b10101010,%xmm1,%xmm1	# critical path
4632	xorps	%xmm1,%xmm2
4633	ret
4634.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
4635.size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
4636___
4637}
4638
4639$code.=<<___;
4640.align	64
4641.Lbswap_mask:
4642	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
4643.Lincrement32:
4644	.long	6,6,6,0
4645.Lincrement64:
4646	.long	1,0,0,0
4647.Lxts_magic:
4648	.long	0x87,0,1,0
4649.Lincrement1:
4650	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4651.Lkey_rotate:
4652	.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
4653.Lkey_rotate192:
4654	.long	0x04070605,0x04070605,0x04070605,0x04070605
4655.Lkey_rcon1:
4656	.long	1,1,1,1
4657.Lkey_rcon1b:
4658	.long	0x1b,0x1b,0x1b,0x1b
4659
4660.asciz  "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
4661.align	64
4662___
4663
4664# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
4665#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
4666if ($win64) {
4667$rec="%rcx";
4668$frame="%rdx";
4669$context="%r8";
4670$disp="%r9";
4671
4672$code.=<<___;
4673.extern	__imp_RtlVirtualUnwind
4674___
4675$code.=<<___ if ($PREFIX eq "aesni");
4676.type	ecb_ccm64_se_handler,\@abi-omnipotent
4677.align	16
4678ecb_ccm64_se_handler:
4679	push	%rsi
4680	push	%rdi
4681	push	%rbx
4682	push	%rbp
4683	push	%r12
4684	push	%r13
4685	push	%r14
4686	push	%r15
4687	pushfq
4688	sub	\$64,%rsp
4689
4690	mov	120($context),%rax	# pull context->Rax
4691	mov	248($context),%rbx	# pull context->Rip
4692
4693	mov	8($disp),%rsi		# disp->ImageBase
4694	mov	56($disp),%r11		# disp->HandlerData
4695
4696	mov	0(%r11),%r10d		# HandlerData[0]
4697	lea	(%rsi,%r10),%r10	# prologue label
4698	cmp	%r10,%rbx		# context->Rip<prologue label
4699	jb	.Lcommon_seh_tail
4700
4701	mov	152($context),%rax	# pull context->Rsp
4702
4703	mov	4(%r11),%r10d		# HandlerData[1]
4704	lea	(%rsi,%r10),%r10	# epilogue label
4705	cmp	%r10,%rbx		# context->Rip>=epilogue label
4706	jae	.Lcommon_seh_tail
4707
4708	lea	0(%rax),%rsi		# %xmm save area
4709	lea	512($context),%rdi	# &context.Xmm6
4710	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
4711	.long	0xa548f3fc		# cld; rep movsq
4712	lea	0x58(%rax),%rax		# adjust stack pointer
4713
4714	jmp	.Lcommon_seh_tail
4715.size	ecb_ccm64_se_handler,.-ecb_ccm64_se_handler
4716
4717.type	ctr_xts_se_handler,\@abi-omnipotent
4718.align	16
4719ctr_xts_se_handler:
4720	push	%rsi
4721	push	%rdi
4722	push	%rbx
4723	push	%rbp
4724	push	%r12
4725	push	%r13
4726	push	%r14
4727	push	%r15
4728	pushfq
4729	sub	\$64,%rsp
4730
4731	mov	120($context),%rax	# pull context->Rax
4732	mov	248($context),%rbx	# pull context->Rip
4733
4734	mov	8($disp),%rsi		# disp->ImageBase
4735	mov	56($disp),%r11		# disp->HandlerData
4736
4737	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
4739	cmp	%r10,%rbx		# context->Rip<prologue label
4740	jb	.Lcommon_seh_tail
4741
4742	mov	152($context),%rax	# pull context->Rsp
4743
4744	mov	4(%r11),%r10d		# HandlerData[1]
4745	lea	(%rsi,%r10),%r10	# epilogue label
4746	cmp	%r10,%rbx		# context->Rip>=epilogue label
4747	jae	.Lcommon_seh_tail
4748
4749	mov	208($context),%rax	# pull context->R11
4750
4751	lea	-0xa8(%rax),%rsi	# %xmm save area
4752	lea	512($context),%rdi	# & context.Xmm6
4753	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
4754	.long	0xa548f3fc		# cld; rep movsq
4755
4756	mov	-8(%rax),%rbp		# restore saved %rbp
4757	mov	%rbp,160($context)	# restore context->Rbp
4758	jmp	.Lcommon_seh_tail
4759.size	ctr_xts_se_handler,.-ctr_xts_se_handler
4760
4761.type	ocb_se_handler,\@abi-omnipotent
4762.align	16
4763ocb_se_handler:
4764	push	%rsi
4765	push	%rdi
4766	push	%rbx
4767	push	%rbp
4768	push	%r12
4769	push	%r13
4770	push	%r14
4771	push	%r15
4772	pushfq
4773	sub	\$64,%rsp
4774
4775	mov	120($context),%rax	# pull context->Rax
4776	mov	248($context),%rbx	# pull context->Rip
4777
4778	mov	8($disp),%rsi		# disp->ImageBase
4779	mov	56($disp),%r11		# disp->HandlerData
4780
4781	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
4783	cmp	%r10,%rbx		# context->Rip<prologue label
4784	jb	.Lcommon_seh_tail
4785
4786	mov	4(%r11),%r10d		# HandlerData[1]
4787	lea	(%rsi,%r10),%r10	# epilogue label
4788	cmp	%r10,%rbx		# context->Rip>=epilogue label
4789	jae	.Lcommon_seh_tail
4790
4791	mov	8(%r11),%r10d		# HandlerData[2]
4792	lea	(%rsi,%r10),%r10
4793	cmp	%r10,%rbx		# context->Rip>=pop label
4794	jae	.Locb_no_xmm
4795
4796	mov	152($context),%rax	# pull context->Rsp
4797
4798	lea	(%rax),%rsi		# %xmm save area
4799	lea	512($context),%rdi	# & context.Xmm6
4800	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
4801	.long	0xa548f3fc		# cld; rep movsq
4802	lea	0xa0+0x28(%rax),%rax
4803
4804.Locb_no_xmm:
4805	mov	-8(%rax),%rbx
4806	mov	-16(%rax),%rbp
4807	mov	-24(%rax),%r12
4808	mov	-32(%rax),%r13
4809	mov	-40(%rax),%r14
4810
4811	mov	%rbx,144($context)	# restore context->Rbx
4812	mov	%rbp,160($context)	# restore context->Rbp
4813	mov	%r12,216($context)	# restore context->R12
4814	mov	%r13,224($context)	# restore context->R13
4815	mov	%r14,232($context)	# restore context->R14
4816
4817	jmp	.Lcommon_seh_tail
4818.size	ocb_se_handler,.-ocb_se_handler
4819___
4820$code.=<<___;
4821.type	cbc_se_handler,\@abi-omnipotent
4822.align	16
4823cbc_se_handler:
4824	push	%rsi
4825	push	%rdi
4826	push	%rbx
4827	push	%rbp
4828	push	%r12
4829	push	%r13
4830	push	%r14
4831	push	%r15
4832	pushfq
4833	sub	\$64,%rsp
4834
4835	mov	152($context),%rax	# pull context->Rsp
4836	mov	248($context),%rbx	# pull context->Rip
4837
4838	lea	.Lcbc_decrypt_bulk(%rip),%r10
4839	cmp	%r10,%rbx		# context->Rip<"prologue" label
4840	jb	.Lcommon_seh_tail
4841
4842	mov	120($context),%rax	# pull context->Rax
4843
4844	lea	.Lcbc_decrypt_body(%rip),%r10
4845	cmp	%r10,%rbx		# context->Rip<cbc_decrypt_body
4846	jb	.Lcommon_seh_tail
4847
4848	mov	152($context),%rax	# pull context->Rsp
4849
4850	lea	.Lcbc_ret(%rip),%r10
4851	cmp	%r10,%rbx		# context->Rip>="epilogue" label
4852	jae	.Lcommon_seh_tail
4853
4854	lea	16(%rax),%rsi		# %xmm save area
4855	lea	512($context),%rdi	# &context.Xmm6
4856	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
4857	.long	0xa548f3fc		# cld; rep movsq
4858
4859	mov	208($context),%rax	# pull context->R11
4860
4861	mov	-8(%rax),%rbp		# restore saved %rbp
4862	mov	%rbp,160($context)	# restore context->Rbp
4863
4864.Lcommon_seh_tail:
4865	mov	8(%rax),%rdi
4866	mov	16(%rax),%rsi
4867	mov	%rax,152($context)	# restore context->Rsp
4868	mov	%rsi,168($context)	# restore context->Rsi
4869	mov	%rdi,176($context)	# restore context->Rdi
4870
4871	mov	40($disp),%rdi		# disp->ContextRecord
4872	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT) in quadwords
4874	.long	0xa548f3fc		# cld; rep movsq
4875
4876	mov	$disp,%rsi
4877	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
4878	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
4879	mov	0(%rsi),%r8		# arg3, disp->ControlPc
4880	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
4881	mov	40(%rsi),%r10		# disp->ContextRecord
4882	lea	56(%rsi),%r11		# &disp->HandlerData
4883	lea	24(%rsi),%r12		# &disp->EstablisherFrame
4884	mov	%r10,32(%rsp)		# arg5
4885	mov	%r11,40(%rsp)		# arg6
4886	mov	%r12,48(%rsp)		# arg7
4887	mov	%rcx,56(%rsp)		# arg8, (NULL)
4888	call	*__imp_RtlVirtualUnwind(%rip)
4889
4890	mov	\$1,%eax		# ExceptionContinueSearch
4891	add	\$64,%rsp
4892	popfq
4893	pop	%r15
4894	pop	%r14
4895	pop	%r13
4896	pop	%r12
4897	pop	%rbp
4898	pop	%rbx
4899	pop	%rdi
4900	pop	%rsi
4901	ret
4902.size	cbc_se_handler,.-cbc_se_handler
4903
4904.section	.pdata
4905.align	4
4906___
4907$code.=<<___ if ($PREFIX eq "aesni");
4908	.rva	.LSEH_begin_aesni_ecb_encrypt
4909	.rva	.LSEH_end_aesni_ecb_encrypt
4910	.rva	.LSEH_info_ecb
4911
4912	.rva	.LSEH_begin_aesni_ccm64_encrypt_blocks
4913	.rva	.LSEH_end_aesni_ccm64_encrypt_blocks
4914	.rva	.LSEH_info_ccm64_enc
4915
4916	.rva	.LSEH_begin_aesni_ccm64_decrypt_blocks
4917	.rva	.LSEH_end_aesni_ccm64_decrypt_blocks
4918	.rva	.LSEH_info_ccm64_dec
4919
4920	.rva	.LSEH_begin_aesni_ctr32_encrypt_blocks
4921	.rva	.LSEH_end_aesni_ctr32_encrypt_blocks
4922	.rva	.LSEH_info_ctr32
4923
4924	.rva	.LSEH_begin_aesni_xts_encrypt
4925	.rva	.LSEH_end_aesni_xts_encrypt
4926	.rva	.LSEH_info_xts_enc
4927
4928	.rva	.LSEH_begin_aesni_xts_decrypt
4929	.rva	.LSEH_end_aesni_xts_decrypt
4930	.rva	.LSEH_info_xts_dec
4931
4932	.rva	.LSEH_begin_aesni_ocb_encrypt
4933	.rva	.LSEH_end_aesni_ocb_encrypt
4934	.rva	.LSEH_info_ocb_enc
4935
4936	.rva	.LSEH_begin_aesni_ocb_decrypt
4937	.rva	.LSEH_end_aesni_ocb_decrypt
4938	.rva	.LSEH_info_ocb_dec
4939___
4940$code.=<<___;
4941	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
4942	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
4943	.rva	.LSEH_info_cbc
4944
4945	.rva	${PREFIX}_set_decrypt_key
4946	.rva	.LSEH_end_set_decrypt_key
4947	.rva	.LSEH_info_key
4948
4949	.rva	${PREFIX}_set_encrypt_key
4950	.rva	.LSEH_end_set_encrypt_key
4951	.rva	.LSEH_info_key
4952.section	.xdata
4953.align	8
4954___
4955$code.=<<___ if ($PREFIX eq "aesni");
4956.LSEH_info_ecb:
4957	.byte	9,0,0,0
4958	.rva	ecb_ccm64_se_handler
4959	.rva	.Lecb_enc_body,.Lecb_enc_ret		# HandlerData[]
4960.LSEH_info_ccm64_enc:
4961	.byte	9,0,0,0
4962	.rva	ecb_ccm64_se_handler
4963	.rva	.Lccm64_enc_body,.Lccm64_enc_ret	# HandlerData[]
4964.LSEH_info_ccm64_dec:
4965	.byte	9,0,0,0
4966	.rva	ecb_ccm64_se_handler
4967	.rva	.Lccm64_dec_body,.Lccm64_dec_ret	# HandlerData[]
4968.LSEH_info_ctr32:
4969	.byte	9,0,0,0
4970	.rva	ctr_xts_se_handler
4971	.rva	.Lctr32_body,.Lctr32_epilogue		# HandlerData[]
4972.LSEH_info_xts_enc:
4973	.byte	9,0,0,0
4974	.rva	ctr_xts_se_handler
4975	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
4976.LSEH_info_xts_dec:
4977	.byte	9,0,0,0
4978	.rva	ctr_xts_se_handler
4979	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
4980.LSEH_info_ocb_enc:
4981	.byte	9,0,0,0
4982	.rva	ocb_se_handler
4983	.rva	.Locb_enc_body,.Locb_enc_epilogue	# HandlerData[]
4984	.rva	.Locb_enc_pop
4985	.long	0
4986.LSEH_info_ocb_dec:
4987	.byte	9,0,0,0
4988	.rva	ocb_se_handler
4989	.rva	.Locb_dec_body,.Locb_dec_epilogue	# HandlerData[]
4990	.rva	.Locb_dec_pop
4991	.long	0
4992___
4993$code.=<<___;
4994.LSEH_info_cbc:
4995	.byte	9,0,0,0
4996	.rva	cbc_se_handler
4997.LSEH_info_key:
4998	.byte	0x01,0x04,0x01,0x00
4999	.byte	0x04,0x02,0x00,0x00	# sub rsp,8
5000___
5001}
5002
# Helpers that hand-assemble AES-NI (and MOVBE) instructions as .byte
# sequences, so the module also builds with assemblers that predate
# these extensions.
sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);	# REX.R extends the ModR/M reg field
    $rex|=0x01			if($src>=8);	# REX.B extends the ModR/M r/m field
    push @opcode,$rex|0x40	if($rex);	# emit REX prefix only when needed
}
5012
sub aesni {
  my $line=shift;
  my @opcode=(0x66);				# mandatory 0x66 prefix

    # aeskeygenassist \$imm,%xmmN,%xmmM: 66 [REX] 0F 3A DF /r ib
    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	rex(\@opcode,$4,$3);
	push @opcode,0x0f,0x3a,0xdf;
	push @opcode,0xc0|($3&7)|(($4&7)<<3);	# ModR/M
	my $c=$2;
	push @opcode,$c=~/^0/?oct($c):$c;
	return ".byte\t".join(',',@opcode);
    }
    # register-register forms: 66 [REX] 0F 38 <op> /r
    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my %opcodelet = (
		"aesimc" => 0xdb,
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	return undef if (!defined($opcodelet{$1}));
	rex(\@opcode,$3,$2);
	push @opcode,0x0f,0x38,$opcodelet{$1};
	push @opcode,0xc0|($2&7)|(($3&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    }
    # forms with an %rsp-relative memory operand: 66 [REX] 0F 38 <op> /r disp8
    elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
	my %opcodelet = (
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	return undef if (!defined($opcodelet{$1}));
	my $off = $2;
	push @opcode,0x44 if ($3>=8);		# REX.R
	push @opcode,0x0f,0x38,$opcodelet{$1};
	push @opcode,0x44|(($3&7)<<3),0x24;	# ModR/M, SIB (%rsp base)
	push @opcode,($off=~/^0/?oct($off):$off)&0xff;
	return ".byte\t".join(',',@opcode);
    }
    return $line;				# not an AES-NI instruction, leave as-is
}
5052
sub movbe {
	".byte	0x0f,0x38,0xf1,0x44,0x24,".shift;	# movbe %eax,disp8(%rsp)
}
5056
# post-process the generated code: evaluate backticked expressions and
# translate AES-NI/MOVBE mnemonics into .byte sequences for assemblers
# that lack them
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm;	# debugging artefact
$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;
5061
5062print $code;
5063
5064close STDOUT;
5065