1#! /usr/bin/env perl
2# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements support for AES instructions as per PowerISA
18# specification version 2.07, first implemented by POWER8 processor.
19# The module is endian-agnostic in sense that it supports both big-
20# and little-endian cases. Data alignment in parallelizable modes is
21# handled with VSX loads and stores, which implies MSR.VSX flag being
22# set. It should also be noted that ISA specification doesn't prohibit
23# alignment exceptions for these instructions on page boundaries.
# Initially alignment was handled in pure AltiVec/VMX way [when data
# is aligned programmatically, which in turn guarantees exception-
# free execution], but it turned out to hamper performance when
# vcipher instructions are interleaved. It's reckoned that eventual
# misalignment penalties at page boundaries are on average lower
# than additional overhead in pure AltiVec approach.
30#
31# May 2016
32#
# Add XTS subroutine; 9x improvement on little-endian and 12x on
# big-endian systems was measured.
35#
36######################################################################
37# Current large-block performance in cycles per byte processed with
38# 128-bit key (less is better).
39#
40#		CBC en-/decrypt	CTR	XTS
41# POWER8[le]	3.96/0.72	0.74	1.1
42# POWER8[be]	3.75/0.65	0.66	1.0
43
# Target "flavour" (first command-line argument).  It selects the pointer
# width and the load/store/compare/shift mnemonics that are interpolated
# into the assembly templates below.
$flavour = shift;

if ($flavour =~ /64/) {
	$SIZE_T	=8;			# pointer size in bytes
	$LRSAVE	=2*$SIZE_T;		# frame offset where LR is saved
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
	$UCMP	="cmpld";
	$SHL	="sldi";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;			# pointer size in bytes
	$LRSAVE	=$SIZE_T;		# frame offset where LR is saved
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
	$UCMP	="cmplw";
	$SHL	="slwi";
} else { die "nonsense $flavour"; }

# Non-zero ($SIZE_T) when the flavour name ends in "le", zero otherwise.
$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;

# Directory part of the script path (empty when $0 has no directory
# component); used to find the ppc-xlate.pl translator next to this script
# or under ../../perlasm/.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Pipe everything printed to STDOUT through ppc-xlate.pl.  The next
# command-line argument is appended to the translator's command line
# (presumably the output file name -- confirm against ppc-xlate.pl).
# Low-precedence "or" is required here: with "||" the die was attached to
# the command-string expression (always true), so a failed open went
# unreported.
open STDOUT,"| $^X $xlate $flavour ".shift(@ARGV) or die "can't call $xlate: $!";

$FRAME=8*$SIZE_T;	# base stack frame size in bytes
$prefix="aes_hw";	# prefix of every exported symbol

$sp="r1";		# stack pointer register
$vrsave="r12";		# GPR holding the saved value of SPR 256 (VRSAVE)
78
79#########################################################################
{{{	# Key setup procedures						#
# Appends to $code:
#   * the "rcon" constant table and the Lconsts helper, which returns the
#     address of rcon in $ptr;
#   * ${prefix}_set_encrypt_key: returns -1 in r3 when $inp or $out is zero,
#     -2 when $bits is below 128, above 256 or not a multiple of 64;
#     otherwise expands the key (Loop128 / L192 / L256), stores the round
#     count (10, 12 or 14) with "stw $rounds,0($out)" and returns 0;
#   * ${prefix}_set_decrypt_key: calls Lset_encrypt_key and, on success,
#     swaps the round keys end-for-end in place (Ldeckey) and returns 0.
#
# General-purpose registers r3..r8.
my ($inp, $bits, $out, $ptr, $cnt, $rounds) = map { "r$_" } 3 .. 8;
# Vector registers v0..v11.
my ($zero, $in0, $in1, $key, $rcon, $mask, $tmp,
    $stage, $outperm, $outmask, $outhead, $outtail) = map { "v$_" } 0 .. 11;

$code .= <<"___";
.machine	"any"

.text

.align	7
rcon:
.long	0x01000000, 0x01000000, 0x01000000, 0x01000000	?rev
.long	0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000	?rev
.long	0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c	?rev
.long	0,0,0,0						?asis
Lconsts:
	mflr	r0
	bcl	20,31,\$+4
	mflr	$ptr	 #vvvvv "distance between . and rcon
	addi	$ptr,$ptr,-0x48
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.asciz	"AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"

.globl	.${prefix}_set_encrypt_key
.align	5
.${prefix}_set_encrypt_key:
Lset_encrypt_key:
	mflr		r11
	$PUSH		r11,$LRSAVE($sp)

	li		$ptr,-1
	${UCMP}i	$inp,0
	beq-		Lenc_key_abort		# if ($inp==0) return -1;
	${UCMP}i	$out,0
	beq-		Lenc_key_abort		# if ($out==0) return -1;
	li		$ptr,-2
	cmpwi		$bits,128
	blt-		Lenc_key_abort
	cmpwi		$bits,256
	bgt-		Lenc_key_abort
	andi.		r0,$bits,0x3f
	bne-		Lenc_key_abort

	lis		r0,0xfff0
	mfspr		$vrsave,256
	mtspr		256,r0

	bl		Lconsts
	mtlr		r11

	neg		r9,$inp
	lvx		$in0,0,$inp
	addi		$inp,$inp,15		# 15 is not typo
	lvsr		$key,0,r9		# borrow $key
	li		r8,0x20
	cmpwi		$bits,192
	lvx		$in1,0,$inp
	le?vspltisb	$mask,0x0f		# borrow $mask
	lvx		$rcon,0,$ptr
	le?vxor		$key,$key,$mask		# adjust for byte swap
	lvx		$mask,r8,$ptr
	addi		$ptr,$ptr,0x10
	vperm		$in0,$in0,$in1,$key	# align [and byte swap in LE]
	li		$cnt,8
	vxor		$zero,$zero,$zero
	mtctr		$cnt

	?lvsr		$outperm,0,$out
	vspltisb	$outmask,-1
	lvx		$outhead,0,$out
	?vperm		$outmask,$zero,$outmask,$outperm

	blt		Loop128
	addi		$inp,$inp,8
	beq		L192
	addi		$inp,$inp,8
	b		L256

.align	4
Loop128:
	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	 vadduwm	$rcon,$rcon,$rcon
	vxor		$in0,$in0,$key
	bdnz		Loop128

	lvx		$rcon,0,$ptr		# last two round keys

	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	 vadduwm	$rcon,$rcon,$rcon
	vxor		$in0,$in0,$key

	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vxor		$in0,$in0,$key
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	 stvx		$stage,0,$out

	addi		$inp,$out,15		# 15 is not typo
	addi		$out,$out,0x50

	li		$rounds,10
	b		Ldone

.align	4
L192:
	lvx		$tmp,0,$inp
	li		$cnt,4
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	 stvx		$stage,0,$out
	 addi		$out,$out,16
	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
	vspltisb	$key,8			# borrow $key
	mtctr		$cnt
	vsububm		$mask,$mask,$key	# adjust the mask

Loop192:
	vperm		$key,$in1,$in1,$mask	# roate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	vcipherlast	$key,$key,$rcon

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp

	 vsldoi		$stage,$zero,$in1,8
	vspltw		$tmp,$in0,3
	vxor		$tmp,$tmp,$in1
	vsldoi		$in1,$zero,$in1,12	# >>32
	 vadduwm	$rcon,$rcon,$rcon
	vxor		$in1,$in1,$tmp
	vxor		$in0,$in0,$key
	vxor		$in1,$in1,$key
	 vsldoi		$stage,$stage,$in0,8

	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	 vperm		$outtail,$stage,$stage,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	 vsldoi		$stage,$in0,$in1,8
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	 vperm		$outtail,$stage,$stage,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	vspltw		$tmp,$in0,3
	vxor		$tmp,$tmp,$in1
	vsldoi		$in1,$zero,$in1,12	# >>32
	 vadduwm	$rcon,$rcon,$rcon
	vxor		$in1,$in1,$tmp
	vxor		$in0,$in0,$key
	vxor		$in1,$in1,$key
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	 stvx		$stage,0,$out
	 addi		$inp,$out,15		# 15 is not typo
	 addi		$out,$out,16
	bdnz		Loop192

	li		$rounds,12
	addi		$out,$out,0x20
	b		Ldone

.align	4
L256:
	lvx		$tmp,0,$inp
	li		$cnt,7
	li		$rounds,14
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	 stvx		$stage,0,$out
	 addi		$out,$out,16
	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
	mtctr		$cnt

Loop256:
	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	 vperm		$outtail,$in1,$in1,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	 vadduwm	$rcon,$rcon,$rcon
	vxor		$in0,$in0,$key
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	 stvx		$stage,0,$out
	 addi		$inp,$out,15		# 15 is not typo
	 addi		$out,$out,16
	bdz		Ldone

	vspltw		$key,$in0,3		# just splat
	vsldoi		$tmp,$zero,$in1,12	# >>32
	vsbox		$key,$key

	vxor		$in1,$in1,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in1,$in1,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in1,$in1,$tmp

	vxor		$in1,$in1,$key
	b		Loop256

.align	4
Ldone:
	lvx		$in1,0,$inp		# redundant in aligned case
	vsel		$in1,$outhead,$in1,$outmask
	stvx		$in1,0,$inp
	li		$ptr,0
	mtspr		256,$vrsave
	stw		$rounds,0($out)

Lenc_key_abort:
	mr		r3,$ptr
	blr
	.long		0
	.byte		0,12,0x14,1,0,0,3,0
	.long		0
.size	.${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key

.globl	.${prefix}_set_decrypt_key
.align	5
.${prefix}_set_decrypt_key:
	$STU		$sp,-$FRAME($sp)
	mflr		r10
	$PUSH		r10,$FRAME+$LRSAVE($sp)
	bl		Lset_encrypt_key
	mtlr		r10

	cmpwi		r3,0
	bne-		Ldec_key_abort

	slwi		$cnt,$rounds,4
	subi		$inp,$out,240		# first round key
	srwi		$rounds,$rounds,1
	add		$out,$inp,$cnt		# last round key
	mtctr		$rounds

Ldeckey:
	lwz		r0, 0($inp)
	lwz		r6, 4($inp)
	lwz		r7, 8($inp)
	lwz		r8, 12($inp)
	addi		$inp,$inp,16
	lwz		r9, 0($out)
	lwz		r10,4($out)
	lwz		r11,8($out)
	lwz		r12,12($out)
	stw		r0, 0($out)
	stw		r6, 4($out)
	stw		r7, 8($out)
	stw		r8, 12($out)
	subi		$out,$out,16
	stw		r9, -16($inp)
	stw		r10,-12($inp)
	stw		r11,-8($inp)
	stw		r12,-4($inp)
	bdnz		Ldeckey

	xor		r3,r3,r3		# return value
Ldec_key_abort:
	addi		$sp,$sp,$FRAME
	blr
	.long		0
	.byte		0,12,4,1,0x80,0,3,0
	.long		0
.size	.${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
___
}}}
419#########################################################################
{{{	# Single block en- and decrypt procedures			#
# gen_block($dir) appends to $code the single-block routine
# ${prefix}_${dir}crypt.
#
#   $dir  "en" or "de".  "de" inserts the "n" infix so that the template
#         uses vncipher/vncipherlast instead of vcipher/vcipherlast.
#
# The emitted routine takes inp, out and key in r3, r4 and r5, reads the
# round count from offset 240 of the key, loads the [possibly unaligned]
# input block, runs the rounds two per loop iteration and stores the
# result through a masked, possibly unaligned, two-vector store.
#
# No prototype is declared: the sub takes one argument, and the former
# empty prototype "()" only worked because the calls used the "&" sigil,
# which bypasses prototype checking.
sub gen_block {
my $dir = shift;
my $n   = $dir eq "de" ? "n" : "";
my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));

$code.=<<___;
.globl	.${prefix}_${dir}crypt
.align	5
.${prefix}_${dir}crypt:
	lwz		$rounds,240($key)
	lis		r0,0xfc00
	mfspr		$vrsave,256
	li		$idx,15			# 15 is not typo
	mtspr		256,r0

	lvx		v0,0,$inp
	neg		r11,$out
	lvx		v1,$idx,$inp
	lvsl		v2,0,$inp		# inpperm
	le?vspltisb	v4,0x0f
	?lvsl		v3,0,r11		# outperm
	le?vxor		v2,v2,v4
	li		$idx,16
	vperm		v0,v0,v1,v2		# align [and byte swap in LE]
	lvx		v1,0,$key
	?lvsl		v5,0,$key		# keyperm
	srwi		$rounds,$rounds,1
	lvx		v2,$idx,$key
	addi		$idx,$idx,16
	subi		$rounds,$rounds,1
	?vperm		v1,v1,v2,v5		# align round key

	vxor		v0,v0,v1
	lvx		v1,$idx,$key
	addi		$idx,$idx,16
	mtctr		$rounds

Loop_${dir}c:
	?vperm		v2,v2,v1,v5
	v${n}cipher	v0,v0,v2
	lvx		v2,$idx,$key
	addi		$idx,$idx,16
	?vperm		v1,v1,v2,v5
	v${n}cipher	v0,v0,v1
	lvx		v1,$idx,$key
	addi		$idx,$idx,16
	bdnz		Loop_${dir}c

	?vperm		v2,v2,v1,v5
	v${n}cipher	v0,v0,v2
	lvx		v2,$idx,$key
	?vperm		v1,v1,v2,v5
	v${n}cipherlast	v0,v0,v1

	vspltisb	v2,-1
	vxor		v1,v1,v1
	li		$idx,15			# 15 is not typo
	?vperm		v2,v1,v2,v3		# outmask
	le?vxor		v3,v3,v4
	lvx		v1,0,$out		# outhead
	vperm		v0,v0,v0,v3		# rotate [and byte swap in LE]
	vsel		v1,v1,v0,v2
	lvx		v4,$idx,$out
	stvx		v1,0,$out
	vsel		v0,v0,v4,v2
	stvx		v0,$idx,$out

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,3,0
	.long		0
.size	.${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
___
}
gen_block("en");
gen_block("de");
}}}
499#########################################################################
500{{{	# CBC en- and decrypt procedures				#
# Register assignment for ${prefix}_cbc_encrypt.  These lexicals stay in
# scope for the code that follows, which reuses several of them.
#
# General-purpose registers r3..r10; the first six are the routine's
# arguments (inp, out, len, key, ivp, enc).
my ($inp, $out, $len, $key, $ivp, $enc, $rounds, $idx) = map { "r$_" } 3 .. 10;
# Vector registers v0..v10.
my ($rndkey0, $rndkey1, $inout, $tmp,
    $ivec, $inptail, $inpperm, $outhead, $outperm, $outmask, $keyperm)
    = map { "v$_" } 0 .. 10;

# Template for ${prefix}_cbc_encrypt: returns at once when $len is below
# 16; a zero $enc selects decryption (Lcbc_dec), which hands lengths of 128
# bytes or more to _aesp8_cbc_decrypt8x; the updated IV is written back
# through $ivp at Lcbc_done.
$code .= <<"___";
.globl	.${prefix}_cbc_encrypt
.align	5
.${prefix}_cbc_encrypt:
	${UCMP}i	$len,16
	bltlr-

	cmpwi		$enc,0			# test direction
	lis		r0,0xffe0
	mfspr		$vrsave,256
	mtspr		256,r0

	li		$idx,15
	vxor		$rndkey0,$rndkey0,$rndkey0
	le?vspltisb	$tmp,0x0f

	lvx		$ivec,0,$ivp		# load [unaligned] iv
	lvsl		$inpperm,0,$ivp
	lvx		$inptail,$idx,$ivp
	le?vxor		$inpperm,$inpperm,$tmp
	vperm		$ivec,$ivec,$inptail,$inpperm

	neg		r11,$inp
	?lvsl		$keyperm,0,$key		# prepare for unaligned key
	lwz		$rounds,240($key)

	lvsr		$inpperm,0,r11		# prepare for unaligned load
	lvx		$inptail,0,$inp
	addi		$inp,$inp,15		# 15 is not typo
	le?vxor		$inpperm,$inpperm,$tmp

	?lvsr		$outperm,0,$out		# prepare for unaligned store
	vspltisb	$outmask,-1
	lvx		$outhead,0,$out
	?vperm		$outmask,$rndkey0,$outmask,$outperm
	le?vxor		$outperm,$outperm,$tmp

	srwi		$rounds,$rounds,1
	li		$idx,16
	subi		$rounds,$rounds,1
	beq		Lcbc_dec

Lcbc_enc:
	vmr		$inout,$inptail
	lvx		$inptail,0,$inp
	addi		$inp,$inp,16
	mtctr		$rounds
	subi		$len,$len,16		# len-=16

	lvx		$rndkey0,0,$key
	 vperm		$inout,$inout,$inptail,$inpperm
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vxor		$inout,$inout,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	vxor		$inout,$inout,$ivec

Loop_cbc_enc:
	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vcipher		$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vcipher		$inout,$inout,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	bdnz		Loop_cbc_enc

	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vcipher		$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	li		$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vcipherlast	$ivec,$inout,$rndkey0
	${UCMP}i	$len,16

	vperm		$tmp,$ivec,$ivec,$outperm
	vsel		$inout,$outhead,$tmp,$outmask
	vmr		$outhead,$tmp
	stvx		$inout,0,$out
	addi		$out,$out,16
	bge		Lcbc_enc

	b		Lcbc_done

.align	4
Lcbc_dec:
	${UCMP}i	$len,128
	bge		_aesp8_cbc_decrypt8x
	vmr		$tmp,$inptail
	lvx		$inptail,0,$inp
	addi		$inp,$inp,16
	mtctr		$rounds
	subi		$len,$len,16		# len-=16

	lvx		$rndkey0,0,$key
	 vperm		$tmp,$tmp,$inptail,$inpperm
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vxor		$inout,$tmp,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16

Loop_cbc_dec:
	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vncipher	$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vncipher	$inout,$inout,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	bdnz		Loop_cbc_dec

	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vncipher	$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	li		$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vncipherlast	$inout,$inout,$rndkey0
	${UCMP}i	$len,16

	vxor		$inout,$inout,$ivec
	vmr		$ivec,$tmp
	vperm		$tmp,$inout,$inout,$outperm
	vsel		$inout,$outhead,$tmp,$outmask
	vmr		$outhead,$tmp
	stvx		$inout,0,$out
	addi		$out,$out,16
	bge		Lcbc_dec

Lcbc_done:
	addi		$out,$out,-1
	lvx		$inout,0,$out		# redundant in aligned case
	vsel		$inout,$outhead,$inout,$outmask
	stvx		$inout,0,$out

	neg		$enc,$ivp		# write [unaligned] iv
	li		$idx,15			# 15 is not typo
	vxor		$rndkey0,$rndkey0,$rndkey0
	vspltisb	$outmask,-1
	le?vspltisb	$tmp,0x0f
	?lvsl		$outperm,0,$enc
	?vperm		$outmask,$rndkey0,$outmask,$outperm
	le?vxor		$outperm,$outperm,$tmp
	lvx		$outhead,0,$ivp
	vperm		$ivec,$ivec,$ivec,$outperm
	vsel		$inout,$outhead,$ivec,$outmask
	lvx		$inptail,$idx,$ivp
	stvx		$inout,0,$ivp
	vsel		$inout,$ivec,$inptail,$outmask
	stvx		$inout,$idx,$ivp

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,6,0
	.long		0
___
667#########################################################################
668{{	# Optimized CBC decrypt procedure				#
669my $key_="r11";
670my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
671    $x00=0 if ($flavour =~ /osx/);
672my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13));
673my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21));
674my $rndkey0="v23";	# v24-v25 rotating buffer for first found keys
675			# v26-v31 last 6 round keys
676my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment
677
678$code.=<<___;
679.align	5
680_aesp8_cbc_decrypt8x:
681	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
682	li		r10,`$FRAME+8*16+15`
683	li		r11,`$FRAME+8*16+31`
684	stvx		v20,r10,$sp		# ABI says so
685	addi		r10,r10,32
686	stvx		v21,r11,$sp
687	addi		r11,r11,32
688	stvx		v22,r10,$sp
689	addi		r10,r10,32
690	stvx		v23,r11,$sp
691	addi		r11,r11,32
692	stvx		v24,r10,$sp
693	addi		r10,r10,32
694	stvx		v25,r11,$sp
695	addi		r11,r11,32
696	stvx		v26,r10,$sp
697	addi		r10,r10,32
698	stvx		v27,r11,$sp
699	addi		r11,r11,32
700	stvx		v28,r10,$sp
701	addi		r10,r10,32
702	stvx		v29,r11,$sp
703	addi		r11,r11,32
704	stvx		v30,r10,$sp
705	stvx		v31,r11,$sp
706	li		r0,-1
707	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
708	li		$x10,0x10
709	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
710	li		$x20,0x20
711	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
712	li		$x30,0x30
713	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
714	li		$x40,0x40
715	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
716	li		$x50,0x50
717	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
718	li		$x60,0x60
719	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
720	li		$x70,0x70
721	mtspr		256,r0
722
723	subi		$rounds,$rounds,3	# -4 in total
724	subi		$len,$len,128		# bias
725
726	lvx		$rndkey0,$x00,$key	# load key schedule
727	lvx		v30,$x10,$key
728	addi		$key,$key,0x20
729	lvx		v31,$x00,$key
730	?vperm		$rndkey0,$rndkey0,v30,$keyperm
731	addi		$key_,$sp,$FRAME+15
732	mtctr		$rounds
733
734Load_cbc_dec_key:
735	?vperm		v24,v30,v31,$keyperm
736	lvx		v30,$x10,$key
737	addi		$key,$key,0x20
738	stvx		v24,$x00,$key_		# off-load round[1]
739	?vperm		v25,v31,v30,$keyperm
740	lvx		v31,$x00,$key
741	stvx		v25,$x10,$key_		# off-load round[2]
742	addi		$key_,$key_,0x20
743	bdnz		Load_cbc_dec_key
744
745	lvx		v26,$x10,$key
746	?vperm		v24,v30,v31,$keyperm
747	lvx		v27,$x20,$key
748	stvx		v24,$x00,$key_		# off-load round[3]
749	?vperm		v25,v31,v26,$keyperm
750	lvx		v28,$x30,$key
751	stvx		v25,$x10,$key_		# off-load round[4]
752	addi		$key_,$sp,$FRAME+15	# rewind $key_
753	?vperm		v26,v26,v27,$keyperm
754	lvx		v29,$x40,$key
755	?vperm		v27,v27,v28,$keyperm
756	lvx		v30,$x50,$key
757	?vperm		v28,v28,v29,$keyperm
758	lvx		v31,$x60,$key
759	?vperm		v29,v29,v30,$keyperm
760	lvx		$out0,$x70,$key		# borrow $out0
761	?vperm		v30,v30,v31,$keyperm
762	lvx		v24,$x00,$key_		# pre-load round[1]
763	?vperm		v31,v31,$out0,$keyperm
764	lvx		v25,$x10,$key_		# pre-load round[2]
765
766	#lvx		$inptail,0,$inp		# "caller" already did this
767	#addi		$inp,$inp,15		# 15 is not typo
768	subi		$inp,$inp,15		# undo "caller"
769
770	 le?li		$idx,8
771	lvx_u		$in0,$x00,$inp		# load first 8 "words"
772	 le?lvsl	$inpperm,0,$idx
773	 le?vspltisb	$tmp,0x0f
774	lvx_u		$in1,$x10,$inp
775	 le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
776	lvx_u		$in2,$x20,$inp
777	 le?vperm	$in0,$in0,$in0,$inpperm
778	lvx_u		$in3,$x30,$inp
779	 le?vperm	$in1,$in1,$in1,$inpperm
780	lvx_u		$in4,$x40,$inp
781	 le?vperm	$in2,$in2,$in2,$inpperm
782	vxor		$out0,$in0,$rndkey0
783	lvx_u		$in5,$x50,$inp
784	 le?vperm	$in3,$in3,$in3,$inpperm
785	vxor		$out1,$in1,$rndkey0
786	lvx_u		$in6,$x60,$inp
787	 le?vperm	$in4,$in4,$in4,$inpperm
788	vxor		$out2,$in2,$rndkey0
789	lvx_u		$in7,$x70,$inp
790	addi		$inp,$inp,0x80
791	 le?vperm	$in5,$in5,$in5,$inpperm
792	vxor		$out3,$in3,$rndkey0
793	 le?vperm	$in6,$in6,$in6,$inpperm
794	vxor		$out4,$in4,$rndkey0
795	 le?vperm	$in7,$in7,$in7,$inpperm
796	vxor		$out5,$in5,$rndkey0
797	vxor		$out6,$in6,$rndkey0
798	vxor		$out7,$in7,$rndkey0
799
800	mtctr		$rounds
801	b		Loop_cbc_dec8x
802.align	5
803Loop_cbc_dec8x:
804	vncipher	$out0,$out0,v24
805	vncipher	$out1,$out1,v24
806	vncipher	$out2,$out2,v24
807	vncipher	$out3,$out3,v24
808	vncipher	$out4,$out4,v24
809	vncipher	$out5,$out5,v24
810	vncipher	$out6,$out6,v24
811	vncipher	$out7,$out7,v24
812	lvx		v24,$x20,$key_		# round[3]
813	addi		$key_,$key_,0x20
814
815	vncipher	$out0,$out0,v25
816	vncipher	$out1,$out1,v25
817	vncipher	$out2,$out2,v25
818	vncipher	$out3,$out3,v25
819	vncipher	$out4,$out4,v25
820	vncipher	$out5,$out5,v25
821	vncipher	$out6,$out6,v25
822	vncipher	$out7,$out7,v25
823	lvx		v25,$x10,$key_		# round[4]
824	bdnz		Loop_cbc_dec8x
825
826	subic		$len,$len,128		# $len-=128
827	vncipher	$out0,$out0,v24
828	vncipher	$out1,$out1,v24
829	vncipher	$out2,$out2,v24
830	vncipher	$out3,$out3,v24
831	vncipher	$out4,$out4,v24
832	vncipher	$out5,$out5,v24
833	vncipher	$out6,$out6,v24
834	vncipher	$out7,$out7,v24
835
836	subfe.		r0,r0,r0		# borrow?-1:0
837	vncipher	$out0,$out0,v25
838	vncipher	$out1,$out1,v25
839	vncipher	$out2,$out2,v25
840	vncipher	$out3,$out3,v25
841	vncipher	$out4,$out4,v25
842	vncipher	$out5,$out5,v25
843	vncipher	$out6,$out6,v25
844	vncipher	$out7,$out7,v25
845
846	and		r0,r0,$len
847	vncipher	$out0,$out0,v26
848	vncipher	$out1,$out1,v26
849	vncipher	$out2,$out2,v26
850	vncipher	$out3,$out3,v26
851	vncipher	$out4,$out4,v26
852	vncipher	$out5,$out5,v26
853	vncipher	$out6,$out6,v26
854	vncipher	$out7,$out7,v26
855
856	add		$inp,$inp,r0		# $inp is adjusted in such
857						# way that at exit from the
858						# loop inX-in7 are loaded
859						# with last "words"
860	vncipher	$out0,$out0,v27
861	vncipher	$out1,$out1,v27
862	vncipher	$out2,$out2,v27
863	vncipher	$out3,$out3,v27
864	vncipher	$out4,$out4,v27
865	vncipher	$out5,$out5,v27
866	vncipher	$out6,$out6,v27
867	vncipher	$out7,$out7,v27
868
869	addi		$key_,$sp,$FRAME+15	# rewind $key_
870	vncipher	$out0,$out0,v28
871	vncipher	$out1,$out1,v28
872	vncipher	$out2,$out2,v28
873	vncipher	$out3,$out3,v28
874	vncipher	$out4,$out4,v28
875	vncipher	$out5,$out5,v28
876	vncipher	$out6,$out6,v28
877	vncipher	$out7,$out7,v28
878	lvx		v24,$x00,$key_		# re-pre-load round[1]
879
880	vncipher	$out0,$out0,v29
881	vncipher	$out1,$out1,v29
882	vncipher	$out2,$out2,v29
883	vncipher	$out3,$out3,v29
884	vncipher	$out4,$out4,v29
885	vncipher	$out5,$out5,v29
886	vncipher	$out6,$out6,v29
887	vncipher	$out7,$out7,v29
888	lvx		v25,$x10,$key_		# re-pre-load round[2]
889
890	vncipher	$out0,$out0,v30
891	 vxor		$ivec,$ivec,v31		# xor with last round key
892	vncipher	$out1,$out1,v30
893	 vxor		$in0,$in0,v31
894	vncipher	$out2,$out2,v30
895	 vxor		$in1,$in1,v31
896	vncipher	$out3,$out3,v30
897	 vxor		$in2,$in2,v31
898	vncipher	$out4,$out4,v30
899	 vxor		$in3,$in3,v31
900	vncipher	$out5,$out5,v30
901	 vxor		$in4,$in4,v31
902	vncipher	$out6,$out6,v30
903	 vxor		$in5,$in5,v31
904	vncipher	$out7,$out7,v30
905	 vxor		$in6,$in6,v31
906
907	vncipherlast	$out0,$out0,$ivec
908	vncipherlast	$out1,$out1,$in0
909	 lvx_u		$in0,$x00,$inp		# load next input block
910	vncipherlast	$out2,$out2,$in1
911	 lvx_u		$in1,$x10,$inp
912	vncipherlast	$out3,$out3,$in2
913	 le?vperm	$in0,$in0,$in0,$inpperm
914	 lvx_u		$in2,$x20,$inp
915	vncipherlast	$out4,$out4,$in3
916	 le?vperm	$in1,$in1,$in1,$inpperm
917	 lvx_u		$in3,$x30,$inp
918	vncipherlast	$out5,$out5,$in4
919	 le?vperm	$in2,$in2,$in2,$inpperm
920	 lvx_u		$in4,$x40,$inp
921	vncipherlast	$out6,$out6,$in5
922	 le?vperm	$in3,$in3,$in3,$inpperm
923	 lvx_u		$in5,$x50,$inp
924	vncipherlast	$out7,$out7,$in6
925	 le?vperm	$in4,$in4,$in4,$inpperm
926	 lvx_u		$in6,$x60,$inp
927	vmr		$ivec,$in7
928	 le?vperm	$in5,$in5,$in5,$inpperm
929	 lvx_u		$in7,$x70,$inp
930	 addi		$inp,$inp,0x80
931
932	le?vperm	$out0,$out0,$out0,$inpperm
933	le?vperm	$out1,$out1,$out1,$inpperm
934	stvx_u		$out0,$x00,$out
935	 le?vperm	$in6,$in6,$in6,$inpperm
936	 vxor		$out0,$in0,$rndkey0
937	le?vperm	$out2,$out2,$out2,$inpperm
938	stvx_u		$out1,$x10,$out
939	 le?vperm	$in7,$in7,$in7,$inpperm
940	 vxor		$out1,$in1,$rndkey0
941	le?vperm	$out3,$out3,$out3,$inpperm
942	stvx_u		$out2,$x20,$out
943	 vxor		$out2,$in2,$rndkey0
944	le?vperm	$out4,$out4,$out4,$inpperm
945	stvx_u		$out3,$x30,$out
946	 vxor		$out3,$in3,$rndkey0
947	le?vperm	$out5,$out5,$out5,$inpperm
948	stvx_u		$out4,$x40,$out
949	 vxor		$out4,$in4,$rndkey0
950	le?vperm	$out6,$out6,$out6,$inpperm
951	stvx_u		$out5,$x50,$out
952	 vxor		$out5,$in5,$rndkey0
953	le?vperm	$out7,$out7,$out7,$inpperm
954	stvx_u		$out6,$x60,$out
955	 vxor		$out6,$in6,$rndkey0
956	stvx_u		$out7,$x70,$out
957	addi		$out,$out,0x80
958	 vxor		$out7,$in7,$rndkey0
959
960	mtctr		$rounds
961	beq		Loop_cbc_dec8x		# did $len-=128 borrow?
962
963	addic.		$len,$len,128
964	beq		Lcbc_dec8x_done
965	nop
966	nop
967
968Loop_cbc_dec8x_tail:				# up to 7 "words" tail...
969	vncipher	$out1,$out1,v24
970	vncipher	$out2,$out2,v24
971	vncipher	$out3,$out3,v24
972	vncipher	$out4,$out4,v24
973	vncipher	$out5,$out5,v24
974	vncipher	$out6,$out6,v24
975	vncipher	$out7,$out7,v24
976	lvx		v24,$x20,$key_		# round[3]
977	addi		$key_,$key_,0x20
978
979	vncipher	$out1,$out1,v25
980	vncipher	$out2,$out2,v25
981	vncipher	$out3,$out3,v25
982	vncipher	$out4,$out4,v25
983	vncipher	$out5,$out5,v25
984	vncipher	$out6,$out6,v25
985	vncipher	$out7,$out7,v25
986	lvx		v25,$x10,$key_		# round[4]
987	bdnz		Loop_cbc_dec8x_tail
988
989	vncipher	$out1,$out1,v24
990	vncipher	$out2,$out2,v24
991	vncipher	$out3,$out3,v24
992	vncipher	$out4,$out4,v24
993	vncipher	$out5,$out5,v24
994	vncipher	$out6,$out6,v24
995	vncipher	$out7,$out7,v24
996
997	vncipher	$out1,$out1,v25
998	vncipher	$out2,$out2,v25
999	vncipher	$out3,$out3,v25
1000	vncipher	$out4,$out4,v25
1001	vncipher	$out5,$out5,v25
1002	vncipher	$out6,$out6,v25
1003	vncipher	$out7,$out7,v25
1004
1005	vncipher	$out1,$out1,v26
1006	vncipher	$out2,$out2,v26
1007	vncipher	$out3,$out3,v26
1008	vncipher	$out4,$out4,v26
1009	vncipher	$out5,$out5,v26
1010	vncipher	$out6,$out6,v26
1011	vncipher	$out7,$out7,v26
1012
1013	vncipher	$out1,$out1,v27
1014	vncipher	$out2,$out2,v27
1015	vncipher	$out3,$out3,v27
1016	vncipher	$out4,$out4,v27
1017	vncipher	$out5,$out5,v27
1018	vncipher	$out6,$out6,v27
1019	vncipher	$out7,$out7,v27
1020
1021	vncipher	$out1,$out1,v28
1022	vncipher	$out2,$out2,v28
1023	vncipher	$out3,$out3,v28
1024	vncipher	$out4,$out4,v28
1025	vncipher	$out5,$out5,v28
1026	vncipher	$out6,$out6,v28
1027	vncipher	$out7,$out7,v28
1028
1029	vncipher	$out1,$out1,v29
1030	vncipher	$out2,$out2,v29
1031	vncipher	$out3,$out3,v29
1032	vncipher	$out4,$out4,v29
1033	vncipher	$out5,$out5,v29
1034	vncipher	$out6,$out6,v29
1035	vncipher	$out7,$out7,v29
1036
1037	vncipher	$out1,$out1,v30
1038	 vxor		$ivec,$ivec,v31		# last round key
1039	vncipher	$out2,$out2,v30
1040	 vxor		$in1,$in1,v31
1041	vncipher	$out3,$out3,v30
1042	 vxor		$in2,$in2,v31
1043	vncipher	$out4,$out4,v30
1044	 vxor		$in3,$in3,v31
1045	vncipher	$out5,$out5,v30
1046	 vxor		$in4,$in4,v31
1047	vncipher	$out6,$out6,v30
1048	 vxor		$in5,$in5,v31
1049	vncipher	$out7,$out7,v30
1050	 vxor		$in6,$in6,v31
1051
1052	cmplwi		$len,32			# switch($len)
1053	blt		Lcbc_dec8x_one
1054	nop
1055	beq		Lcbc_dec8x_two
1056	cmplwi		$len,64
1057	blt		Lcbc_dec8x_three
1058	nop
1059	beq		Lcbc_dec8x_four
1060	cmplwi		$len,96
1061	blt		Lcbc_dec8x_five
1062	nop
1063	beq		Lcbc_dec8x_six
1064
1065Lcbc_dec8x_seven:
1066	vncipherlast	$out1,$out1,$ivec
1067	vncipherlast	$out2,$out2,$in1
1068	vncipherlast	$out3,$out3,$in2
1069	vncipherlast	$out4,$out4,$in3
1070	vncipherlast	$out5,$out5,$in4
1071	vncipherlast	$out6,$out6,$in5
1072	vncipherlast	$out7,$out7,$in6
1073	vmr		$ivec,$in7
1074
1075	le?vperm	$out1,$out1,$out1,$inpperm
1076	le?vperm	$out2,$out2,$out2,$inpperm
1077	stvx_u		$out1,$x00,$out
1078	le?vperm	$out3,$out3,$out3,$inpperm
1079	stvx_u		$out2,$x10,$out
1080	le?vperm	$out4,$out4,$out4,$inpperm
1081	stvx_u		$out3,$x20,$out
1082	le?vperm	$out5,$out5,$out5,$inpperm
1083	stvx_u		$out4,$x30,$out
1084	le?vperm	$out6,$out6,$out6,$inpperm
1085	stvx_u		$out5,$x40,$out
1086	le?vperm	$out7,$out7,$out7,$inpperm
1087	stvx_u		$out6,$x50,$out
1088	stvx_u		$out7,$x60,$out
1089	addi		$out,$out,0x70
1090	b		Lcbc_dec8x_done
1091
1092.align	5
1093Lcbc_dec8x_six:
1094	vncipherlast	$out2,$out2,$ivec
1095	vncipherlast	$out3,$out3,$in2
1096	vncipherlast	$out4,$out4,$in3
1097	vncipherlast	$out5,$out5,$in4
1098	vncipherlast	$out6,$out6,$in5
1099	vncipherlast	$out7,$out7,$in6
1100	vmr		$ivec,$in7
1101
1102	le?vperm	$out2,$out2,$out2,$inpperm
1103	le?vperm	$out3,$out3,$out3,$inpperm
1104	stvx_u		$out2,$x00,$out
1105	le?vperm	$out4,$out4,$out4,$inpperm
1106	stvx_u		$out3,$x10,$out
1107	le?vperm	$out5,$out5,$out5,$inpperm
1108	stvx_u		$out4,$x20,$out
1109	le?vperm	$out6,$out6,$out6,$inpperm
1110	stvx_u		$out5,$x30,$out
1111	le?vperm	$out7,$out7,$out7,$inpperm
1112	stvx_u		$out6,$x40,$out
1113	stvx_u		$out7,$x50,$out
1114	addi		$out,$out,0x60
1115	b		Lcbc_dec8x_done
1116
1117.align	5
1118Lcbc_dec8x_five:
1119	vncipherlast	$out3,$out3,$ivec
1120	vncipherlast	$out4,$out4,$in3
1121	vncipherlast	$out5,$out5,$in4
1122	vncipherlast	$out6,$out6,$in5
1123	vncipherlast	$out7,$out7,$in6
1124	vmr		$ivec,$in7
1125
1126	le?vperm	$out3,$out3,$out3,$inpperm
1127	le?vperm	$out4,$out4,$out4,$inpperm
1128	stvx_u		$out3,$x00,$out
1129	le?vperm	$out5,$out5,$out5,$inpperm
1130	stvx_u		$out4,$x10,$out
1131	le?vperm	$out6,$out6,$out6,$inpperm
1132	stvx_u		$out5,$x20,$out
1133	le?vperm	$out7,$out7,$out7,$inpperm
1134	stvx_u		$out6,$x30,$out
1135	stvx_u		$out7,$x40,$out
1136	addi		$out,$out,0x50
1137	b		Lcbc_dec8x_done
1138
1139.align	5
1140Lcbc_dec8x_four:
1141	vncipherlast	$out4,$out4,$ivec
1142	vncipherlast	$out5,$out5,$in4
1143	vncipherlast	$out6,$out6,$in5
1144	vncipherlast	$out7,$out7,$in6
1145	vmr		$ivec,$in7
1146
1147	le?vperm	$out4,$out4,$out4,$inpperm
1148	le?vperm	$out5,$out5,$out5,$inpperm
1149	stvx_u		$out4,$x00,$out
1150	le?vperm	$out6,$out6,$out6,$inpperm
1151	stvx_u		$out5,$x10,$out
1152	le?vperm	$out7,$out7,$out7,$inpperm
1153	stvx_u		$out6,$x20,$out
1154	stvx_u		$out7,$x30,$out
1155	addi		$out,$out,0x40
1156	b		Lcbc_dec8x_done
1157
1158.align	5
1159Lcbc_dec8x_three:
1160	vncipherlast	$out5,$out5,$ivec
1161	vncipherlast	$out6,$out6,$in5
1162	vncipherlast	$out7,$out7,$in6
1163	vmr		$ivec,$in7
1164
1165	le?vperm	$out5,$out5,$out5,$inpperm
1166	le?vperm	$out6,$out6,$out6,$inpperm
1167	stvx_u		$out5,$x00,$out
1168	le?vperm	$out7,$out7,$out7,$inpperm
1169	stvx_u		$out6,$x10,$out
1170	stvx_u		$out7,$x20,$out
1171	addi		$out,$out,0x30
1172	b		Lcbc_dec8x_done
1173
1174.align	5
1175Lcbc_dec8x_two:
1176	vncipherlast	$out6,$out6,$ivec
1177	vncipherlast	$out7,$out7,$in6
1178	vmr		$ivec,$in7
1179
1180	le?vperm	$out6,$out6,$out6,$inpperm
1181	le?vperm	$out7,$out7,$out7,$inpperm
1182	stvx_u		$out6,$x00,$out
1183	stvx_u		$out7,$x10,$out
1184	addi		$out,$out,0x20
1185	b		Lcbc_dec8x_done
1186
1187.align	5
1188Lcbc_dec8x_one:
1189	vncipherlast	$out7,$out7,$ivec
1190	vmr		$ivec,$in7
1191
1192	le?vperm	$out7,$out7,$out7,$inpperm
1193	stvx_u		$out7,0,$out
1194	addi		$out,$out,0x10
1195
1196Lcbc_dec8x_done:
1197	le?vperm	$ivec,$ivec,$ivec,$inpperm
1198	stvx_u		$ivec,0,$ivp		# write [unaligned] iv
1199
1200	li		r10,`$FRAME+15`
1201	li		r11,`$FRAME+31`
1202	stvx		$inpperm,r10,$sp	# wipe copies of round keys
1203	addi		r10,r10,32
1204	stvx		$inpperm,r11,$sp
1205	addi		r11,r11,32
1206	stvx		$inpperm,r10,$sp
1207	addi		r10,r10,32
1208	stvx		$inpperm,r11,$sp
1209	addi		r11,r11,32
1210	stvx		$inpperm,r10,$sp
1211	addi		r10,r10,32
1212	stvx		$inpperm,r11,$sp
1213	addi		r11,r11,32
1214	stvx		$inpperm,r10,$sp
1215	addi		r10,r10,32
1216	stvx		$inpperm,r11,$sp
1217	addi		r11,r11,32
1218
1219	mtspr		256,$vrsave
1220	lvx		v20,r10,$sp		# ABI says so
1221	addi		r10,r10,32
1222	lvx		v21,r11,$sp
1223	addi		r11,r11,32
1224	lvx		v22,r10,$sp
1225	addi		r10,r10,32
1226	lvx		v23,r11,$sp
1227	addi		r11,r11,32
1228	lvx		v24,r10,$sp
1229	addi		r10,r10,32
1230	lvx		v25,r11,$sp
1231	addi		r11,r11,32
1232	lvx		v26,r10,$sp
1233	addi		r10,r10,32
1234	lvx		v27,r11,$sp
1235	addi		r11,r11,32
1236	lvx		v28,r10,$sp
1237	addi		r10,r10,32
1238	lvx		v29,r11,$sp
1239	addi		r11,r11,32
1240	lvx		v30,r10,$sp
1241	lvx		v31,r11,$sp
1242	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1243	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1244	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1245	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1246	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1247	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1248	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
1249	blr
1250	.long		0
1251	.byte		0,12,0x04,0,0x80,6,6,0
1252	.long		0
1253.size	.${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
1254___
1255}}	}}}
1256
1257#########################################################################
1258{{{	# CTR procedure[s]						#
# Register map for the generic (one block per iteration) CTR32 code.
#
# General-purpose registers r3..r10, in declaration order: input pointer,
# output pointer, length (decremented by one per processed block), key
# schedule pointer, pointer to the initial counter block, $x10, round
# counter and a scratch offset register.  $x10 (r8) is not referenced by
# the generic code emitted below; the 8x code re-declares the same name
# for the same register.
my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));
# v0..v3: two round-key staging registers, the block being encrypted and
# a temporary.
my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
# v4..v11: counter block, most recently loaded input quadword, input
# permutation, previous output quadword with the permutation and mask used
# for unaligned stores, key permutation, and the increment that is added
# to the counter after every block.
my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
						map("v$_",(4..11));
# Input data is assembled in the same register as $tmp (v3).
my $dat=$tmp;
1264
1265$code.=<<___;
1266.globl	.${prefix}_ctr32_encrypt_blocks
1267.align	5
1268.${prefix}_ctr32_encrypt_blocks:
1269	${UCMP}i	$len,1
1270	bltlr-
1271
1272	lis		r0,0xfff0
1273	mfspr		$vrsave,256
1274	mtspr		256,r0
1275
1276	li		$idx,15
1277	vxor		$rndkey0,$rndkey0,$rndkey0
1278	le?vspltisb	$tmp,0x0f
1279
1280	lvx		$ivec,0,$ivp		# load [unaligned] iv
1281	lvsl		$inpperm,0,$ivp
1282	lvx		$inptail,$idx,$ivp
1283	 vspltisb	$one,1
1284	le?vxor		$inpperm,$inpperm,$tmp
1285	vperm		$ivec,$ivec,$inptail,$inpperm
1286	 vsldoi		$one,$rndkey0,$one,1
1287
1288	neg		r11,$inp
1289	?lvsl		$keyperm,0,$key		# prepare for unaligned key
1290	lwz		$rounds,240($key)
1291
1292	lvsr		$inpperm,0,r11		# prepare for unaligned load
1293	lvx		$inptail,0,$inp
1294	addi		$inp,$inp,15		# 15 is not typo
1295	le?vxor		$inpperm,$inpperm,$tmp
1296
1297	srwi		$rounds,$rounds,1
1298	li		$idx,16
1299	subi		$rounds,$rounds,1
1300
1301	${UCMP}i	$len,8
1302	bge		_aesp8_ctr32_encrypt8x
1303
1304	?lvsr		$outperm,0,$out		# prepare for unaligned store
1305	vspltisb	$outmask,-1
1306	lvx		$outhead,0,$out
1307	?vperm		$outmask,$rndkey0,$outmask,$outperm
1308	le?vxor		$outperm,$outperm,$tmp
1309
1310	lvx		$rndkey0,0,$key
1311	mtctr		$rounds
1312	lvx		$rndkey1,$idx,$key
1313	addi		$idx,$idx,16
1314	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1315	vxor		$inout,$ivec,$rndkey0
1316	lvx		$rndkey0,$idx,$key
1317	addi		$idx,$idx,16
1318	b		Loop_ctr32_enc
1319
1320.align	5
1321Loop_ctr32_enc:
1322	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
1323	vcipher		$inout,$inout,$rndkey1
1324	lvx		$rndkey1,$idx,$key
1325	addi		$idx,$idx,16
1326	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1327	vcipher		$inout,$inout,$rndkey0
1328	lvx		$rndkey0,$idx,$key
1329	addi		$idx,$idx,16
1330	bdnz		Loop_ctr32_enc
1331
1332	vadduwm		$ivec,$ivec,$one
1333	 vmr		$dat,$inptail
1334	 lvx		$inptail,0,$inp
1335	 addi		$inp,$inp,16
1336	 subic.		$len,$len,1		# blocks--
1337
1338	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
1339	vcipher		$inout,$inout,$rndkey1
1340	lvx		$rndkey1,$idx,$key
1341	 vperm		$dat,$dat,$inptail,$inpperm
1342	 li		$idx,16
1343	?vperm		$rndkey1,$rndkey0,$rndkey1,$keyperm
1344	 lvx		$rndkey0,0,$key
1345	vxor		$dat,$dat,$rndkey1	# last round key
1346	vcipherlast	$inout,$inout,$dat
1347
1348	 lvx		$rndkey1,$idx,$key
1349	 addi		$idx,$idx,16
1350	vperm		$inout,$inout,$inout,$outperm
1351	vsel		$dat,$outhead,$inout,$outmask
1352	 mtctr		$rounds
1353	 ?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1354	vmr		$outhead,$inout
1355	 vxor		$inout,$ivec,$rndkey0
1356	 lvx		$rndkey0,$idx,$key
1357	 addi		$idx,$idx,16
1358	stvx		$dat,0,$out
1359	addi		$out,$out,16
1360	bne		Loop_ctr32_enc
1361
1362	addi		$out,$out,-1
1363	lvx		$inout,0,$out		# redundant in aligned case
1364	vsel		$inout,$outhead,$inout,$outmask
1365	stvx		$inout,0,$out
1366
1367	mtspr		256,$vrsave
1368	blr
1369	.long		0
1370	.byte		0,12,0x14,0,0,0,6,0
1371	.long		0
1372___
1373#########################################################################
1374{{	# Optimized CTR procedure					#
# Register map for the 8-blocks-per-iteration CTR32 code.
#
# $key_ walks the copies of the round keys that are off-loaded to the
# stack frame.
my $key_="r11";
# Index registers for the indexed vector loads/stores.  $x10..$x70 are
# loaded with 0x10..0x70 in the prologue.  $x00 names r0, which is never
# loaded with an offset: r0 is also used as scratch, and a register number
# of 0 in the index-base slot of an indexed load/store denotes the value
# zero per the Power ISA.
my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
    # osx flavours get a literal 0 instead of the name r0
    # NOTE(review): presumably an assembler syntax requirement - confirm.
    $x00=0 if ($flavour =~ /osx/);
# Eight input blocks live in v0..v3, v10, v12..v14 ...
my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14));
# ... and the eight counter/keystream blocks in v15..v22.
my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22));
my $rndkey0="v23";	# v24-v25 rotating buffer for first round keys
			# v26-v31 last 6 round keys
# $in3 is v3 and $in4 is v10, i.e. the very registers $tmp and $keyperm
# already name in the enclosing scope.
my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment
# Counter increments reuse v7..v9, which the generic path uses for its
# unaligned-store state; the 8x code below only references $two.
my ($two,$three,$four)=($outhead,$outperm,$outmask);
1384
1385$code.=<<___;
1386.align	5
1387_aesp8_ctr32_encrypt8x:
1388	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
1389	li		r10,`$FRAME+8*16+15`
1390	li		r11,`$FRAME+8*16+31`
1391	stvx		v20,r10,$sp		# ABI says so
1392	addi		r10,r10,32
1393	stvx		v21,r11,$sp
1394	addi		r11,r11,32
1395	stvx		v22,r10,$sp
1396	addi		r10,r10,32
1397	stvx		v23,r11,$sp
1398	addi		r11,r11,32
1399	stvx		v24,r10,$sp
1400	addi		r10,r10,32
1401	stvx		v25,r11,$sp
1402	addi		r11,r11,32
1403	stvx		v26,r10,$sp
1404	addi		r10,r10,32
1405	stvx		v27,r11,$sp
1406	addi		r11,r11,32
1407	stvx		v28,r10,$sp
1408	addi		r10,r10,32
1409	stvx		v29,r11,$sp
1410	addi		r11,r11,32
1411	stvx		v30,r10,$sp
1412	stvx		v31,r11,$sp
1413	li		r0,-1
1414	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
1415	li		$x10,0x10
1416	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1417	li		$x20,0x20
1418	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1419	li		$x30,0x30
1420	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1421	li		$x40,0x40
1422	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1423	li		$x50,0x50
1424	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1425	li		$x60,0x60
1426	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1427	li		$x70,0x70
1428	mtspr		256,r0
1429
1430	subi		$rounds,$rounds,3	# -4 in total
1431
1432	lvx		$rndkey0,$x00,$key	# load key schedule
1433	lvx		v30,$x10,$key
1434	addi		$key,$key,0x20
1435	lvx		v31,$x00,$key
1436	?vperm		$rndkey0,$rndkey0,v30,$keyperm
1437	addi		$key_,$sp,$FRAME+15
1438	mtctr		$rounds
1439
1440Load_ctr32_enc_key:
1441	?vperm		v24,v30,v31,$keyperm
1442	lvx		v30,$x10,$key
1443	addi		$key,$key,0x20
1444	stvx		v24,$x00,$key_		# off-load round[1]
1445	?vperm		v25,v31,v30,$keyperm
1446	lvx		v31,$x00,$key
1447	stvx		v25,$x10,$key_		# off-load round[2]
1448	addi		$key_,$key_,0x20
1449	bdnz		Load_ctr32_enc_key
1450
1451	lvx		v26,$x10,$key
1452	?vperm		v24,v30,v31,$keyperm
1453	lvx		v27,$x20,$key
1454	stvx		v24,$x00,$key_		# off-load round[3]
1455	?vperm		v25,v31,v26,$keyperm
1456	lvx		v28,$x30,$key
1457	stvx		v25,$x10,$key_		# off-load round[4]
1458	addi		$key_,$sp,$FRAME+15	# rewind $key_
1459	?vperm		v26,v26,v27,$keyperm
1460	lvx		v29,$x40,$key
1461	?vperm		v27,v27,v28,$keyperm
1462	lvx		v30,$x50,$key
1463	?vperm		v28,v28,v29,$keyperm
1464	lvx		v31,$x60,$key
1465	?vperm		v29,v29,v30,$keyperm
1466	lvx		$out0,$x70,$key		# borrow $out0
1467	?vperm		v30,v30,v31,$keyperm
1468	lvx		v24,$x00,$key_		# pre-load round[1]
1469	?vperm		v31,v31,$out0,$keyperm
1470	lvx		v25,$x10,$key_		# pre-load round[2]
1471
1472	vadduwm		$two,$one,$one
1473	subi		$inp,$inp,15		# undo "caller"
1474	$SHL		$len,$len,4
1475
1476	vadduwm		$out1,$ivec,$one	# counter values ...
1477	vadduwm		$out2,$ivec,$two
1478	vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
1479	 le?li		$idx,8
1480	vadduwm		$out3,$out1,$two
1481	vxor		$out1,$out1,$rndkey0
1482	 le?lvsl	$inpperm,0,$idx
1483	vadduwm		$out4,$out2,$two
1484	vxor		$out2,$out2,$rndkey0
1485	 le?vspltisb	$tmp,0x0f
1486	vadduwm		$out5,$out3,$two
1487	vxor		$out3,$out3,$rndkey0
1488	 le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
1489	vadduwm		$out6,$out4,$two
1490	vxor		$out4,$out4,$rndkey0
1491	vadduwm		$out7,$out5,$two
1492	vxor		$out5,$out5,$rndkey0
1493	vadduwm		$ivec,$out6,$two	# next counter value
1494	vxor		$out6,$out6,$rndkey0
1495	vxor		$out7,$out7,$rndkey0
1496
1497	mtctr		$rounds
1498	b		Loop_ctr32_enc8x
1499.align	5
1500Loop_ctr32_enc8x:
1501	vcipher 	$out0,$out0,v24
1502	vcipher 	$out1,$out1,v24
1503	vcipher 	$out2,$out2,v24
1504	vcipher 	$out3,$out3,v24
1505	vcipher 	$out4,$out4,v24
1506	vcipher 	$out5,$out5,v24
1507	vcipher 	$out6,$out6,v24
1508	vcipher 	$out7,$out7,v24
1509Loop_ctr32_enc8x_middle:
1510	lvx		v24,$x20,$key_		# round[3]
1511	addi		$key_,$key_,0x20
1512
1513	vcipher 	$out0,$out0,v25
1514	vcipher 	$out1,$out1,v25
1515	vcipher 	$out2,$out2,v25
1516	vcipher 	$out3,$out3,v25
1517	vcipher 	$out4,$out4,v25
1518	vcipher 	$out5,$out5,v25
1519	vcipher 	$out6,$out6,v25
1520	vcipher 	$out7,$out7,v25
1521	lvx		v25,$x10,$key_		# round[4]
1522	bdnz		Loop_ctr32_enc8x
1523
1524	subic		r11,$len,256		# $len-256, borrow $key_
1525	vcipher 	$out0,$out0,v24
1526	vcipher 	$out1,$out1,v24
1527	vcipher 	$out2,$out2,v24
1528	vcipher 	$out3,$out3,v24
1529	vcipher 	$out4,$out4,v24
1530	vcipher 	$out5,$out5,v24
1531	vcipher 	$out6,$out6,v24
1532	vcipher 	$out7,$out7,v24
1533
1534	subfe		r0,r0,r0		# borrow?-1:0
1535	vcipher 	$out0,$out0,v25
1536	vcipher 	$out1,$out1,v25
1537	vcipher 	$out2,$out2,v25
1538	vcipher 	$out3,$out3,v25
1539	vcipher 	$out4,$out4,v25
1540	vcipher		$out5,$out5,v25
1541	vcipher		$out6,$out6,v25
1542	vcipher		$out7,$out7,v25
1543
1544	and		r0,r0,r11
1545	addi		$key_,$sp,$FRAME+15	# rewind $key_
1546	vcipher		$out0,$out0,v26
1547	vcipher		$out1,$out1,v26
1548	vcipher		$out2,$out2,v26
1549	vcipher		$out3,$out3,v26
1550	vcipher		$out4,$out4,v26
1551	vcipher		$out5,$out5,v26
1552	vcipher		$out6,$out6,v26
1553	vcipher		$out7,$out7,v26
1554	lvx		v24,$x00,$key_		# re-pre-load round[1]
1555
1556	subic		$len,$len,129		# $len-=129
1557	vcipher		$out0,$out0,v27
1558	addi		$len,$len,1		# $len-=128 really
1559	vcipher		$out1,$out1,v27
1560	vcipher		$out2,$out2,v27
1561	vcipher		$out3,$out3,v27
1562	vcipher		$out4,$out4,v27
1563	vcipher		$out5,$out5,v27
1564	vcipher		$out6,$out6,v27
1565	vcipher		$out7,$out7,v27
1566	lvx		v25,$x10,$key_		# re-pre-load round[2]
1567
1568	vcipher		$out0,$out0,v28
1569	 lvx_u		$in0,$x00,$inp		# load input
1570	vcipher		$out1,$out1,v28
1571	 lvx_u		$in1,$x10,$inp
1572	vcipher		$out2,$out2,v28
1573	 lvx_u		$in2,$x20,$inp
1574	vcipher		$out3,$out3,v28
1575	 lvx_u		$in3,$x30,$inp
1576	vcipher		$out4,$out4,v28
1577	 lvx_u		$in4,$x40,$inp
1578	vcipher		$out5,$out5,v28
1579	 lvx_u		$in5,$x50,$inp
1580	vcipher		$out6,$out6,v28
1581	 lvx_u		$in6,$x60,$inp
1582	vcipher		$out7,$out7,v28
1583	 lvx_u		$in7,$x70,$inp
1584	 addi		$inp,$inp,0x80
1585
1586	vcipher		$out0,$out0,v29
1587	 le?vperm	$in0,$in0,$in0,$inpperm
1588	vcipher		$out1,$out1,v29
1589	 le?vperm	$in1,$in1,$in1,$inpperm
1590	vcipher		$out2,$out2,v29
1591	 le?vperm	$in2,$in2,$in2,$inpperm
1592	vcipher		$out3,$out3,v29
1593	 le?vperm	$in3,$in3,$in3,$inpperm
1594	vcipher		$out4,$out4,v29
1595	 le?vperm	$in4,$in4,$in4,$inpperm
1596	vcipher		$out5,$out5,v29
1597	 le?vperm	$in5,$in5,$in5,$inpperm
1598	vcipher		$out6,$out6,v29
1599	 le?vperm	$in6,$in6,$in6,$inpperm
1600	vcipher		$out7,$out7,v29
1601	 le?vperm	$in7,$in7,$in7,$inpperm
1602
1603	add		$inp,$inp,r0		# $inp is adjusted in such
1604						# way that at exit from the
1605						# loop inX-in7 are loaded
1606						# with last "words"
1607	subfe.		r0,r0,r0		# borrow?-1:0
1608	vcipher		$out0,$out0,v30
1609	 vxor		$in0,$in0,v31		# xor with last round key
1610	vcipher		$out1,$out1,v30
1611	 vxor		$in1,$in1,v31
1612	vcipher		$out2,$out2,v30
1613	 vxor		$in2,$in2,v31
1614	vcipher		$out3,$out3,v30
1615	 vxor		$in3,$in3,v31
1616	vcipher		$out4,$out4,v30
1617	 vxor		$in4,$in4,v31
1618	vcipher		$out5,$out5,v30
1619	 vxor		$in5,$in5,v31
1620	vcipher		$out6,$out6,v30
1621	 vxor		$in6,$in6,v31
1622	vcipher		$out7,$out7,v30
1623	 vxor		$in7,$in7,v31
1624
1625	bne		Lctr32_enc8x_break	# did $len-129 borrow?
1626
1627	vcipherlast	$in0,$out0,$in0
1628	vcipherlast	$in1,$out1,$in1
1629	 vadduwm	$out1,$ivec,$one	# counter values ...
1630	vcipherlast	$in2,$out2,$in2
1631	 vadduwm	$out2,$ivec,$two
1632	 vxor		$out0,$ivec,$rndkey0	# ... xored with rndkey[0]
1633	vcipherlast	$in3,$out3,$in3
1634	 vadduwm	$out3,$out1,$two
1635	 vxor		$out1,$out1,$rndkey0
1636	vcipherlast	$in4,$out4,$in4
1637	 vadduwm	$out4,$out2,$two
1638	 vxor		$out2,$out2,$rndkey0
1639	vcipherlast	$in5,$out5,$in5
1640	 vadduwm	$out5,$out3,$two
1641	 vxor		$out3,$out3,$rndkey0
1642	vcipherlast	$in6,$out6,$in6
1643	 vadduwm	$out6,$out4,$two
1644	 vxor		$out4,$out4,$rndkey0
1645	vcipherlast	$in7,$out7,$in7
1646	 vadduwm	$out7,$out5,$two
1647	 vxor		$out5,$out5,$rndkey0
1648	le?vperm	$in0,$in0,$in0,$inpperm
1649	 vadduwm	$ivec,$out6,$two	# next counter value
1650	 vxor		$out6,$out6,$rndkey0
1651	le?vperm	$in1,$in1,$in1,$inpperm
1652	 vxor		$out7,$out7,$rndkey0
1653	mtctr		$rounds
1654
1655	 vcipher	$out0,$out0,v24
1656	stvx_u		$in0,$x00,$out
1657	le?vperm	$in2,$in2,$in2,$inpperm
1658	 vcipher	$out1,$out1,v24
1659	stvx_u		$in1,$x10,$out
1660	le?vperm	$in3,$in3,$in3,$inpperm
1661	 vcipher	$out2,$out2,v24
1662	stvx_u		$in2,$x20,$out
1663	le?vperm	$in4,$in4,$in4,$inpperm
1664	 vcipher	$out3,$out3,v24
1665	stvx_u		$in3,$x30,$out
1666	le?vperm	$in5,$in5,$in5,$inpperm
1667	 vcipher	$out4,$out4,v24
1668	stvx_u		$in4,$x40,$out
1669	le?vperm	$in6,$in6,$in6,$inpperm
1670	 vcipher	$out5,$out5,v24
1671	stvx_u		$in5,$x50,$out
1672	le?vperm	$in7,$in7,$in7,$inpperm
1673	 vcipher	$out6,$out6,v24
1674	stvx_u		$in6,$x60,$out
1675	 vcipher	$out7,$out7,v24
1676	stvx_u		$in7,$x70,$out
1677	addi		$out,$out,0x80
1678
1679	b		Loop_ctr32_enc8x_middle
1680
1681.align	5
1682Lctr32_enc8x_break:
1683	cmpwi		$len,-0x60
1684	blt		Lctr32_enc8x_one
1685	nop
1686	beq		Lctr32_enc8x_two
1687	cmpwi		$len,-0x40
1688	blt		Lctr32_enc8x_three
1689	nop
1690	beq		Lctr32_enc8x_four
1691	cmpwi		$len,-0x20
1692	blt		Lctr32_enc8x_five
1693	nop
1694	beq		Lctr32_enc8x_six
1695	cmpwi		$len,0x00
1696	blt		Lctr32_enc8x_seven
1697
1698Lctr32_enc8x_eight:
1699	vcipherlast	$out0,$out0,$in0
1700	vcipherlast	$out1,$out1,$in1
1701	vcipherlast	$out2,$out2,$in2
1702	vcipherlast	$out3,$out3,$in3
1703	vcipherlast	$out4,$out4,$in4
1704	vcipherlast	$out5,$out5,$in5
1705	vcipherlast	$out6,$out6,$in6
1706	vcipherlast	$out7,$out7,$in7
1707
1708	le?vperm	$out0,$out0,$out0,$inpperm
1709	le?vperm	$out1,$out1,$out1,$inpperm
1710	stvx_u		$out0,$x00,$out
1711	le?vperm	$out2,$out2,$out2,$inpperm
1712	stvx_u		$out1,$x10,$out
1713	le?vperm	$out3,$out3,$out3,$inpperm
1714	stvx_u		$out2,$x20,$out
1715	le?vperm	$out4,$out4,$out4,$inpperm
1716	stvx_u		$out3,$x30,$out
1717	le?vperm	$out5,$out5,$out5,$inpperm
1718	stvx_u		$out4,$x40,$out
1719	le?vperm	$out6,$out6,$out6,$inpperm
1720	stvx_u		$out5,$x50,$out
1721	le?vperm	$out7,$out7,$out7,$inpperm
1722	stvx_u		$out6,$x60,$out
1723	stvx_u		$out7,$x70,$out
1724	addi		$out,$out,0x80
1725	b		Lctr32_enc8x_done
1726
1727.align	5
1728Lctr32_enc8x_seven:
1729	vcipherlast	$out0,$out0,$in1
1730	vcipherlast	$out1,$out1,$in2
1731	vcipherlast	$out2,$out2,$in3
1732	vcipherlast	$out3,$out3,$in4
1733	vcipherlast	$out4,$out4,$in5
1734	vcipherlast	$out5,$out5,$in6
1735	vcipherlast	$out6,$out6,$in7
1736
1737	le?vperm	$out0,$out0,$out0,$inpperm
1738	le?vperm	$out1,$out1,$out1,$inpperm
1739	stvx_u		$out0,$x00,$out
1740	le?vperm	$out2,$out2,$out2,$inpperm
1741	stvx_u		$out1,$x10,$out
1742	le?vperm	$out3,$out3,$out3,$inpperm
1743	stvx_u		$out2,$x20,$out
1744	le?vperm	$out4,$out4,$out4,$inpperm
1745	stvx_u		$out3,$x30,$out
1746	le?vperm	$out5,$out5,$out5,$inpperm
1747	stvx_u		$out4,$x40,$out
1748	le?vperm	$out6,$out6,$out6,$inpperm
1749	stvx_u		$out5,$x50,$out
1750	stvx_u		$out6,$x60,$out
1751	addi		$out,$out,0x70
1752	b		Lctr32_enc8x_done
1753
1754.align	5
1755Lctr32_enc8x_six:
1756	vcipherlast	$out0,$out0,$in2
1757	vcipherlast	$out1,$out1,$in3
1758	vcipherlast	$out2,$out2,$in4
1759	vcipherlast	$out3,$out3,$in5
1760	vcipherlast	$out4,$out4,$in6
1761	vcipherlast	$out5,$out5,$in7
1762
1763	le?vperm	$out0,$out0,$out0,$inpperm
1764	le?vperm	$out1,$out1,$out1,$inpperm
1765	stvx_u		$out0,$x00,$out
1766	le?vperm	$out2,$out2,$out2,$inpperm
1767	stvx_u		$out1,$x10,$out
1768	le?vperm	$out3,$out3,$out3,$inpperm
1769	stvx_u		$out2,$x20,$out
1770	le?vperm	$out4,$out4,$out4,$inpperm
1771	stvx_u		$out3,$x30,$out
1772	le?vperm	$out5,$out5,$out5,$inpperm
1773	stvx_u		$out4,$x40,$out
1774	stvx_u		$out5,$x50,$out
1775	addi		$out,$out,0x60
1776	b		Lctr32_enc8x_done
1777
1778.align	5
1779Lctr32_enc8x_five:
1780	vcipherlast	$out0,$out0,$in3
1781	vcipherlast	$out1,$out1,$in4
1782	vcipherlast	$out2,$out2,$in5
1783	vcipherlast	$out3,$out3,$in6
1784	vcipherlast	$out4,$out4,$in7
1785
1786	le?vperm	$out0,$out0,$out0,$inpperm
1787	le?vperm	$out1,$out1,$out1,$inpperm
1788	stvx_u		$out0,$x00,$out
1789	le?vperm	$out2,$out2,$out2,$inpperm
1790	stvx_u		$out1,$x10,$out
1791	le?vperm	$out3,$out3,$out3,$inpperm
1792	stvx_u		$out2,$x20,$out
1793	le?vperm	$out4,$out4,$out4,$inpperm
1794	stvx_u		$out3,$x30,$out
1795	stvx_u		$out4,$x40,$out
1796	addi		$out,$out,0x50
1797	b		Lctr32_enc8x_done
1798
1799.align	5
1800Lctr32_enc8x_four:
1801	vcipherlast	$out0,$out0,$in4
1802	vcipherlast	$out1,$out1,$in5
1803	vcipherlast	$out2,$out2,$in6
1804	vcipherlast	$out3,$out3,$in7
1805
1806	le?vperm	$out0,$out0,$out0,$inpperm
1807	le?vperm	$out1,$out1,$out1,$inpperm
1808	stvx_u		$out0,$x00,$out
1809	le?vperm	$out2,$out2,$out2,$inpperm
1810	stvx_u		$out1,$x10,$out
1811	le?vperm	$out3,$out3,$out3,$inpperm
1812	stvx_u		$out2,$x20,$out
1813	stvx_u		$out3,$x30,$out
1814	addi		$out,$out,0x40
1815	b		Lctr32_enc8x_done
1816
1817.align	5
Lctr32_enc8x_three:
	# Tail of three blocks. The last three input blocks are in
	# in5..in7 and were already xored with the last round key, so
	# vcipherlast yields the final output directly.
	vcipherlast	$out0,$out0,$in5
	vcipherlast	$out1,$out1,$in6
	vcipherlast	$out2,$out2,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	stvx_u		$out2,$x20,$out
	addi		$out,$out,0x30
	# Leave through the CTR epilogue. The CBC exit label would
	# first store the counter through the iv pointer and thereby
	# clobber the iv buffer of the caller.
	b		Lctr32_enc8x_done

.align	5
Lctr32_enc8x_two:
	# Tail of two blocks, inputs in in6..in7 (see above).
	vcipherlast	$out0,$out0,$in6
	vcipherlast	$out1,$out1,$in7

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	stvx_u		$out1,$x10,$out
	addi		$out,$out,0x20
	b		Lctr32_enc8x_done	# CTR epilogue, not the CBC one
1843
1844.align	5
1845Lctr32_enc8x_one:
1846	vcipherlast	$out0,$out0,$in7
1847
1848	le?vperm	$out0,$out0,$out0,$inpperm
1849	stvx_u		$out0,0,$out
1850	addi		$out,$out,0x10
1851
1852Lctr32_enc8x_done:
1853	li		r10,`$FRAME+15`
1854	li		r11,`$FRAME+31`
1855	stvx		$inpperm,r10,$sp	# wipe copies of round keys
1856	addi		r10,r10,32
1857	stvx		$inpperm,r11,$sp
1858	addi		r11,r11,32
1859	stvx		$inpperm,r10,$sp
1860	addi		r10,r10,32
1861	stvx		$inpperm,r11,$sp
1862	addi		r11,r11,32
1863	stvx		$inpperm,r10,$sp
1864	addi		r10,r10,32
1865	stvx		$inpperm,r11,$sp
1866	addi		r11,r11,32
1867	stvx		$inpperm,r10,$sp
1868	addi		r10,r10,32
1869	stvx		$inpperm,r11,$sp
1870	addi		r11,r11,32
1871
1872	mtspr		256,$vrsave
1873	lvx		v20,r10,$sp		# ABI says so
1874	addi		r10,r10,32
1875	lvx		v21,r11,$sp
1876	addi		r11,r11,32
1877	lvx		v22,r10,$sp
1878	addi		r10,r10,32
1879	lvx		v23,r11,$sp
1880	addi		r11,r11,32
1881	lvx		v24,r10,$sp
1882	addi		r10,r10,32
1883	lvx		v25,r11,$sp
1884	addi		r11,r11,32
1885	lvx		v26,r10,$sp
1886	addi		r10,r10,32
1887	lvx		v27,r11,$sp
1888	addi		r11,r11,32
1889	lvx		v28,r10,$sp
1890	addi		r10,r10,32
1891	lvx		v29,r11,$sp
1892	addi		r11,r11,32
1893	lvx		v30,r10,$sp
1894	lvx		v31,r11,$sp
1895	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1896	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1897	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1898	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1899	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1900	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1901	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
1902	blr
1903	.long		0
1904	.byte		0,12,0x04,0,0x80,6,6,0
1905	.long		0
1906.size	.${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks
1907___
1908}}	}}}
1909
1910#########################################################################
1911{{{	# XTS procedures						#
1912# int aes_p8_xts_[en|de]crypt(const char *inp, char *out, size_t len,	#
1913#                             const AES_KEY *key1, const AES_KEY *key2,	#
1914#                             [const] unsigned char iv[16]);		#
1915# If $key2 is NULL, then a "tweak chaining" mode is engaged, in which	#
1916# input tweak value is assumed to be encrypted already, and last tweak	#
1917# value, one suitable for consecutive call on same chunk of data, is	#
1918# written back to original buffer. In addition, in "tweak chaining"	#
1919# mode only complete input blocks are processed.			#
1920
# Register map shared by the XTS procedures.
#
# r3..r8 follow the argument order of the prototype above (inp, out, len,
# key1, key2, iv); r9 and r10 are the round counter and a scratch offset
# register.
my ($inp,$out,$len,$key1,$key2,$ivp,$rounds,$idx) =	map("r$_",(3..10));
# v0..v2: two round-key staging registers and the block being processed.
my ($rndkey0,$rndkey1,$inout) =				map("v$_",(0..2));
# v3..v7: last output block, most recently loaded input quadword, input
# permutation, little-endian byte permutation and key permutation.
my ($output,$inptail,$inpperm,$leperm,$keyperm) =	map("v$_",(3..7));
# v8..v12: current tweak, the constants 0x07..07 and 0x870101..01 used to
# derive the next tweak, a temporary and a second tweak value.
my ($tweak,$seven,$eighty7,$tmp,$tweak1) =		map("v$_",(8..12));
# The tail length shares its register (r7) with $key2.
my $taillen = $key2;

# After the swap $inp is r10 and $idx is r3. The emitted code starts by
# copying r3 into $inp and then loads r3 with -1.
   ($inp,$idx) = ($idx,$inp);				# reassign
1928
1929$code.=<<___;
1930.globl	.${prefix}_xts_encrypt
1931.align	5
1932.${prefix}_xts_encrypt:
1933	mr		$inp,r3				# reassign
1934	li		r3,-1
1935	${UCMP}i	$len,16
1936	bltlr-
1937
1938	lis		r0,0xfff0
1939	mfspr		r12,256				# save vrsave
1940	li		r11,0
1941	mtspr		256,r0
1942
1943	vspltisb	$seven,0x07			# 0x070707..07
1944	le?lvsl		$leperm,r11,r11
1945	le?vspltisb	$tmp,0x0f
1946	le?vxor		$leperm,$leperm,$seven
1947
1948	li		$idx,15
1949	lvx		$tweak,0,$ivp			# load [unaligned] iv
1950	lvsl		$inpperm,0,$ivp
1951	lvx		$inptail,$idx,$ivp
1952	le?vxor		$inpperm,$inpperm,$tmp
1953	vperm		$tweak,$tweak,$inptail,$inpperm
1954
1955	neg		r11,$inp
1956	lvsr		$inpperm,0,r11			# prepare for unaligned load
1957	lvx		$inout,0,$inp
1958	addi		$inp,$inp,15			# 15 is not typo
1959	le?vxor		$inpperm,$inpperm,$tmp
1960
1961	${UCMP}i	$key2,0				# key2==NULL?
1962	beq		Lxts_enc_no_key2
1963
1964	?lvsl		$keyperm,0,$key2		# prepare for unaligned key
1965	lwz		$rounds,240($key2)
1966	srwi		$rounds,$rounds,1
1967	subi		$rounds,$rounds,1
1968	li		$idx,16
1969
1970	lvx		$rndkey0,0,$key2
1971	lvx		$rndkey1,$idx,$key2
1972	addi		$idx,$idx,16
1973	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1974	vxor		$tweak,$tweak,$rndkey0
1975	lvx		$rndkey0,$idx,$key2
1976	addi		$idx,$idx,16
1977	mtctr		$rounds
1978
1979Ltweak_xts_enc:
1980	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
1981	vcipher		$tweak,$tweak,$rndkey1
1982	lvx		$rndkey1,$idx,$key2
1983	addi		$idx,$idx,16
1984	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1985	vcipher		$tweak,$tweak,$rndkey0
1986	lvx		$rndkey0,$idx,$key2
1987	addi		$idx,$idx,16
1988	bdnz		Ltweak_xts_enc
1989
1990	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
1991	vcipher		$tweak,$tweak,$rndkey1
1992	lvx		$rndkey1,$idx,$key2
1993	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
1994	vcipherlast	$tweak,$tweak,$rndkey0
1995
1996	li		$ivp,0				# don't chain the tweak
1997	b		Lxts_enc
1998
1999Lxts_enc_no_key2:
2000	li		$idx,-16
2001	and		$len,$len,$idx			# in "tweak chaining"
2002							# mode only complete
2003							# blocks are processed
2004Lxts_enc:
2005	lvx		$inptail,0,$inp
2006	addi		$inp,$inp,16
2007
2008	?lvsl		$keyperm,0,$key1		# prepare for unaligned key
2009	lwz		$rounds,240($key1)
2010	srwi		$rounds,$rounds,1
2011	subi		$rounds,$rounds,1
2012	li		$idx,16
2013
2014	vslb		$eighty7,$seven,$seven		# 0x808080..80
2015	vor		$eighty7,$eighty7,$seven	# 0x878787..87
2016	vspltisb	$tmp,1				# 0x010101..01
2017	vsldoi		$eighty7,$eighty7,$tmp,15	# 0x870101..01
2018
2019	${UCMP}i	$len,96
2020	bge		_aesp8_xts_encrypt6x
2021
2022	andi.		$taillen,$len,15
2023	subic		r0,$len,32
2024	subi		$taillen,$taillen,16
2025	subfe		r0,r0,r0
2026	and		r0,r0,$taillen
2027	add		$inp,$inp,r0
2028
2029	lvx		$rndkey0,0,$key1
2030	lvx		$rndkey1,$idx,$key1
2031	addi		$idx,$idx,16
2032	vperm		$inout,$inout,$inptail,$inpperm
2033	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2034	vxor		$inout,$inout,$tweak
2035	vxor		$inout,$inout,$rndkey0
2036	lvx		$rndkey0,$idx,$key1
2037	addi		$idx,$idx,16
2038	mtctr		$rounds
2039	b		Loop_xts_enc
2040
2041.align	5
2042Loop_xts_enc:
2043	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2044	vcipher		$inout,$inout,$rndkey1
2045	lvx		$rndkey1,$idx,$key1
2046	addi		$idx,$idx,16
2047	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2048	vcipher		$inout,$inout,$rndkey0
2049	lvx		$rndkey0,$idx,$key1
2050	addi		$idx,$idx,16
2051	bdnz		Loop_xts_enc
2052
2053	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2054	vcipher		$inout,$inout,$rndkey1
2055	lvx		$rndkey1,$idx,$key1
2056	li		$idx,16
2057	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2058	vxor		$rndkey0,$rndkey0,$tweak
2059	vcipherlast	$output,$inout,$rndkey0
2060
2061	le?vperm	$tmp,$output,$output,$leperm
2062	be?nop
2063	le?stvx_u	$tmp,0,$out
2064	be?stvx_u	$output,0,$out
2065	addi		$out,$out,16
2066
2067	subic.		$len,$len,16
2068	beq		Lxts_enc_done
2069
2070	vmr		$inout,$inptail
2071	lvx		$inptail,0,$inp
2072	addi		$inp,$inp,16
2073	lvx		$rndkey0,0,$key1
2074	lvx		$rndkey1,$idx,$key1
2075	addi		$idx,$idx,16
2076
2077	subic		r0,$len,32
2078	subfe		r0,r0,r0
2079	and		r0,r0,$taillen
2080	add		$inp,$inp,r0
2081
2082	vsrab		$tmp,$tweak,$seven		# next tweak value
2083	vaddubm		$tweak,$tweak,$tweak
2084	vsldoi		$tmp,$tmp,$tmp,15
2085	vand		$tmp,$tmp,$eighty7
2086	vxor		$tweak,$tweak,$tmp
2087
2088	vperm		$inout,$inout,$inptail,$inpperm
2089	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2090	vxor		$inout,$inout,$tweak
2091	vxor		$output,$output,$rndkey0	# just in case $len<16
2092	vxor		$inout,$inout,$rndkey0
2093	lvx		$rndkey0,$idx,$key1
2094	addi		$idx,$idx,16
2095
2096	mtctr		$rounds
2097	${UCMP}i	$len,16
2098	bge		Loop_xts_enc
2099
2100	vxor		$output,$output,$tweak
2101	lvsr		$inpperm,0,$len			# $inpperm is no longer needed
2102	vxor		$inptail,$inptail,$inptail	# $inptail is no longer needed
2103	vspltisb	$tmp,-1
2104	vperm		$inptail,$inptail,$tmp,$inpperm
2105	vsel		$inout,$inout,$output,$inptail
2106
2107	subi		r11,$out,17
2108	subi		$out,$out,16
2109	mtctr		$len
2110	li		$len,16
2111Loop_xts_enc_steal:
2112	lbzu		r0,1(r11)
2113	stb		r0,16(r11)
2114	bdnz		Loop_xts_enc_steal
2115
2116	mtctr		$rounds
2117	b		Loop_xts_enc			# one more time...
2118
2119Lxts_enc_done:
2120	${UCMP}i	$ivp,0
2121	beq		Lxts_enc_ret
2122
2123	vsrab		$tmp,$tweak,$seven		# next tweak value
2124	vaddubm		$tweak,$tweak,$tweak
2125	vsldoi		$tmp,$tmp,$tmp,15
2126	vand		$tmp,$tmp,$eighty7
2127	vxor		$tweak,$tweak,$tmp
2128
2129	le?vperm	$tweak,$tweak,$tweak,$leperm
2130	stvx_u		$tweak,0,$ivp
2131
2132Lxts_enc_ret:
2133	mtspr		256,r12				# restore vrsave
2134	li		r3,0
2135	blr
2136	.long		0
2137	.byte		0,12,0x04,0,0x80,6,6,0
2138	.long		0
2139.size	.${prefix}_xts_encrypt,.-.${prefix}_xts_encrypt
2140
2141.globl	.${prefix}_xts_decrypt
2142.align	5
2143.${prefix}_xts_decrypt:
2144	mr		$inp,r3				# reassign
2145	li		r3,-1
2146	${UCMP}i	$len,16
2147	bltlr-
2148
2149	lis		r0,0xfff8
2150	mfspr		r12,256				# save vrsave
2151	li		r11,0
2152	mtspr		256,r0
2153
2154	andi.		r0,$len,15
2155	neg		r0,r0
2156	andi.		r0,r0,16
2157	sub		$len,$len,r0
2158
2159	vspltisb	$seven,0x07			# 0x070707..07
2160	le?lvsl		$leperm,r11,r11
2161	le?vspltisb	$tmp,0x0f
2162	le?vxor		$leperm,$leperm,$seven
2163
2164	li		$idx,15
2165	lvx		$tweak,0,$ivp			# load [unaligned] iv
2166	lvsl		$inpperm,0,$ivp
2167	lvx		$inptail,$idx,$ivp
2168	le?vxor		$inpperm,$inpperm,$tmp
2169	vperm		$tweak,$tweak,$inptail,$inpperm
2170
2171	neg		r11,$inp
2172	lvsr		$inpperm,0,r11			# prepare for unaligned load
2173	lvx		$inout,0,$inp
2174	addi		$inp,$inp,15			# 15 is not typo
2175	le?vxor		$inpperm,$inpperm,$tmp
2176
2177	${UCMP}i	$key2,0				# key2==NULL?
2178	beq		Lxts_dec_no_key2
2179
2180	?lvsl		$keyperm,0,$key2		# prepare for unaligned key
2181	lwz		$rounds,240($key2)
2182	srwi		$rounds,$rounds,1
2183	subi		$rounds,$rounds,1
2184	li		$idx,16
2185
2186	lvx		$rndkey0,0,$key2
2187	lvx		$rndkey1,$idx,$key2
2188	addi		$idx,$idx,16
2189	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2190	vxor		$tweak,$tweak,$rndkey0
2191	lvx		$rndkey0,$idx,$key2
2192	addi		$idx,$idx,16
2193	mtctr		$rounds
2194
2195Ltweak_xts_dec:
2196	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2197	vcipher		$tweak,$tweak,$rndkey1
2198	lvx		$rndkey1,$idx,$key2
2199	addi		$idx,$idx,16
2200	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2201	vcipher		$tweak,$tweak,$rndkey0
2202	lvx		$rndkey0,$idx,$key2
2203	addi		$idx,$idx,16
2204	bdnz		Ltweak_xts_dec
2205
2206	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2207	vcipher		$tweak,$tweak,$rndkey1
2208	lvx		$rndkey1,$idx,$key2
2209	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2210	vcipherlast	$tweak,$tweak,$rndkey0
2211
2212	li		$ivp,0				# don't chain the tweak
2213	b		Lxts_dec
2214
2215Lxts_dec_no_key2:
2216	neg		$idx,$len
2217	andi.		$idx,$idx,15
2218	add		$len,$len,$idx			# in "tweak chaining"
2219							# mode only complete
2220							# blocks are processed
2221Lxts_dec:
2222	lvx		$inptail,0,$inp
2223	addi		$inp,$inp,16
2224
2225	?lvsl		$keyperm,0,$key1		# prepare for unaligned key
2226	lwz		$rounds,240($key1)
2227	srwi		$rounds,$rounds,1
2228	subi		$rounds,$rounds,1
2229	li		$idx,16
2230
2231	vslb		$eighty7,$seven,$seven		# 0x808080..80
2232	vor		$eighty7,$eighty7,$seven	# 0x878787..87
2233	vspltisb	$tmp,1				# 0x010101..01
2234	vsldoi		$eighty7,$eighty7,$tmp,15	# 0x870101..01
2235
2236	${UCMP}i	$len,96
2237	bge		_aesp8_xts_decrypt6x
2238
2239	lvx		$rndkey0,0,$key1
2240	lvx		$rndkey1,$idx,$key1
2241	addi		$idx,$idx,16
2242	vperm		$inout,$inout,$inptail,$inpperm
2243	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2244	vxor		$inout,$inout,$tweak
2245	vxor		$inout,$inout,$rndkey0
2246	lvx		$rndkey0,$idx,$key1
2247	addi		$idx,$idx,16
2248	mtctr		$rounds
2249
2250	${UCMP}i	$len,16
2251	blt		Ltail_xts_dec
2252	be?b		Loop_xts_dec
2253
2254.align	5
2255Loop_xts_dec:
2256	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2257	vncipher	$inout,$inout,$rndkey1
2258	lvx		$rndkey1,$idx,$key1
2259	addi		$idx,$idx,16
2260	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2261	vncipher	$inout,$inout,$rndkey0
2262	lvx		$rndkey0,$idx,$key1
2263	addi		$idx,$idx,16
2264	bdnz		Loop_xts_dec
2265
2266	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2267	vncipher	$inout,$inout,$rndkey1
2268	lvx		$rndkey1,$idx,$key1
2269	li		$idx,16
2270	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2271	vxor		$rndkey0,$rndkey0,$tweak
2272	vncipherlast	$output,$inout,$rndkey0
2273
2274	le?vperm	$tmp,$output,$output,$leperm
2275	be?nop
2276	le?stvx_u	$tmp,0,$out
2277	be?stvx_u	$output,0,$out
2278	addi		$out,$out,16
2279
2280	subic.		$len,$len,16
2281	beq		Lxts_dec_done
2282
2283	vmr		$inout,$inptail
2284	lvx		$inptail,0,$inp
2285	addi		$inp,$inp,16
2286	lvx		$rndkey0,0,$key1
2287	lvx		$rndkey1,$idx,$key1
2288	addi		$idx,$idx,16
2289
2290	vsrab		$tmp,$tweak,$seven		# next tweak value
2291	vaddubm		$tweak,$tweak,$tweak
2292	vsldoi		$tmp,$tmp,$tmp,15
2293	vand		$tmp,$tmp,$eighty7
2294	vxor		$tweak,$tweak,$tmp
2295
2296	vperm		$inout,$inout,$inptail,$inpperm
2297	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2298	vxor		$inout,$inout,$tweak
2299	vxor		$inout,$inout,$rndkey0
2300	lvx		$rndkey0,$idx,$key1
2301	addi		$idx,$idx,16
2302
2303	mtctr		$rounds
2304	${UCMP}i	$len,16
2305	bge		Loop_xts_dec
2306
2307Ltail_xts_dec:
2308	vsrab		$tmp,$tweak,$seven		# next tweak value
2309	vaddubm		$tweak1,$tweak,$tweak
2310	vsldoi		$tmp,$tmp,$tmp,15
2311	vand		$tmp,$tmp,$eighty7
2312	vxor		$tweak1,$tweak1,$tmp
2313
2314	subi		$inp,$inp,16
2315	add		$inp,$inp,$len
2316
2317	vxor		$inout,$inout,$tweak		# :-(
2318	vxor		$inout,$inout,$tweak1		# :-)
2319
2320Loop_xts_dec_short:
2321	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2322	vncipher	$inout,$inout,$rndkey1
2323	lvx		$rndkey1,$idx,$key1
2324	addi		$idx,$idx,16
2325	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2326	vncipher	$inout,$inout,$rndkey0
2327	lvx		$rndkey0,$idx,$key1
2328	addi		$idx,$idx,16
2329	bdnz		Loop_xts_dec_short
2330
2331	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
2332	vncipher	$inout,$inout,$rndkey1
2333	lvx		$rndkey1,$idx,$key1
2334	li		$idx,16
2335	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2336	vxor		$rndkey0,$rndkey0,$tweak1
2337	vncipherlast	$output,$inout,$rndkey0
2338
2339	le?vperm	$tmp,$output,$output,$leperm
2340	be?nop
2341	le?stvx_u	$tmp,0,$out
2342	be?stvx_u	$output,0,$out
2343
2344	vmr		$inout,$inptail
2345	lvx		$inptail,0,$inp
2346	#addi		$inp,$inp,16
2347	lvx		$rndkey0,0,$key1
2348	lvx		$rndkey1,$idx,$key1
2349	addi		$idx,$idx,16
2350	vperm		$inout,$inout,$inptail,$inpperm
2351	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
2352
2353	lvsr		$inpperm,0,$len			# $inpperm is no longer needed
2354	vxor		$inptail,$inptail,$inptail	# $inptail is no longer needed
2355	vspltisb	$tmp,-1
2356	vperm		$inptail,$inptail,$tmp,$inpperm
2357	vsel		$inout,$inout,$output,$inptail
2358
2359	vxor		$rndkey0,$rndkey0,$tweak
2360	vxor		$inout,$inout,$rndkey0
2361	lvx		$rndkey0,$idx,$key1
2362	addi		$idx,$idx,16
2363
2364	subi		r11,$out,1
2365	mtctr		$len
2366	li		$len,16
2367Loop_xts_dec_steal:
2368	lbzu		r0,1(r11)
2369	stb		r0,16(r11)
2370	bdnz		Loop_xts_dec_steal
2371
2372	mtctr		$rounds
2373	b		Loop_xts_dec			# one more time...
2374
2375Lxts_dec_done:
2376	${UCMP}i	$ivp,0
2377	beq		Lxts_dec_ret
2378
2379	vsrab		$tmp,$tweak,$seven		# next tweak value
2380	vaddubm		$tweak,$tweak,$tweak
2381	vsldoi		$tmp,$tmp,$tmp,15
2382	vand		$tmp,$tmp,$eighty7
2383	vxor		$tweak,$tweak,$tmp
2384
2385	le?vperm	$tweak,$tweak,$tweak,$leperm
2386	stvx_u		$tweak,0,$ivp
2387
2388Lxts_dec_ret:
2389	mtspr		256,r12				# restore vrsave
2390	li		r3,0
2391	blr
2392	.long		0
2393	.byte		0,12,0x04,0,0x80,6,6,0
2394	.long		0
2395.size	.${prefix}_xts_decrypt,.-.${prefix}_xts_decrypt
2396___
2397#########################################################################
2398{{	# Optimized XTS procedures					#
# Register aliases for the six-blocks-at-a-time XTS routines.  The names
# below are interpolated into the assembly template that follows.

# Pointer to the on-stack copy of the round keys; it reuses the register
# of $key2.
my $key_=$key2;
# Index registers for the byte offsets 0x00..0x70.  $x10..$x70 are loaded
# with "li" in the routine prologues.  $x00 is r0 and is never loaded: a
# base-register field of 0 in an indexed load/store denotes the value zero
# rather than the contents of r0.  For "osx" flavours it is spelled as a
# literal 0 (presumably because that assembler does not accept r0 there;
# confirm against the translator).
my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,3,26..31));
    $x00=0 if ($flavour =~ /osx/);
# Six input blocks, six blocks in flight, and their six tweaks.
my ($in0,  $in1,  $in2,  $in3,  $in4,  $in5 )=map("v$_",(0..5));
my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16));
my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22));
my $rndkey0="v23";	# v24-v25 rotating buffer for first round keys
			# v26-v31 last 6 round keys
my ($keyperm)=($out0);	# aliases with "caller", redundant assignment
# Length of the trailing partial block.  Shares a register with $x70, so
# $x70 is only usable as an offset until $taillen is first written.
my $taillen=$x70;
2409
2410$code.=<<___;
# _aesp8_xts_encrypt6x: bulk XTS encryption, six blocks per pass.
#
# Relies on state prepared by its "caller" (see the "undo caller" and
# "aliases with caller" notes): $inp, $out, $len, $key1, $ivp, $rounds,
# $keyperm, $inpperm, $leperm, $inout, $inptail, $tweak, $seven and
# $eighty7 are read without being set up here.
#
# Layout of the code:
#   1. prologue - frame, LR, v20-v31, vrsave, r26-r31
#   2. key schedule realigned; middle round keys parked on the stack
#   3. first six blocks primed, then the main six-block loop
#   4. tail of zero to five whole blocks, optional ciphertext stealing
#   5. optional store of the next tweak, key wipe, epilogue
.align	5
_aesp8_xts_encrypt6x:
	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
	mflr		r11
	# stvx and lvx ignore the low four address bits, so the +15 and +31
	# biases round each save slot up to a 16-byte boundary.
	li		r7,`$FRAME+8*16+15`
	li		r3,`$FRAME+8*16+31`
	$PUSH		r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
	stvx		v20,r7,$sp		# ABI says so
	addi		r7,r7,32
	stvx		v21,r3,$sp
	addi		r3,r3,32
	stvx		v22,r7,$sp
	addi		r7,r7,32
	stvx		v23,r3,$sp
	addi		r3,r3,32
	stvx		v24,r7,$sp
	addi		r7,r7,32
	stvx		v25,r3,$sp
	addi		r3,r3,32
	stvx		v26,r7,$sp
	addi		r7,r7,32
	stvx		v27,r3,$sp
	addi		r3,r3,32
	stvx		v28,r7,$sp
	addi		r7,r7,32
	stvx		v29,r3,$sp
	addi		r3,r3,32
	stvx		v30,r7,$sp
	stvx		v31,r3,$sp
	li		r0,-1
	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
	li		$x10,0x10
	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
	li		$x20,0x20
	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
	li		$x30,0x30
	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
	li		$x40,0x40
	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
	li		$x50,0x50
	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
	li		$x60,0x60
	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
	li		$x70,0x70
	mtspr		256,r0			# vrsave = all ones

	subi		$rounds,$rounds,3	# -4 in total

	# Key schedule.  Round key 0 stays in $rndkey0 and the last six round
	# keys end up in v26-v31.  The keys in between are realigned with
	# $keyperm and stored at $key_ on the stack, two per loop pass.
	lvx		$rndkey0,$x00,$key1	# load key schedule
	lvx		v30,$x10,$key1
	addi		$key1,$key1,0x20
	lvx		v31,$x00,$key1
	?vperm		$rndkey0,$rndkey0,v30,$keyperm
	addi		$key_,$sp,$FRAME+15
	mtctr		$rounds

Load_xts_enc_key:
	?vperm		v24,v30,v31,$keyperm
	lvx		v30,$x10,$key1
	addi		$key1,$key1,0x20
	stvx		v24,$x00,$key_		# off-load round[1]
	?vperm		v25,v31,v30,$keyperm
	lvx		v31,$x00,$key1
	stvx		v25,$x10,$key_		# off-load round[2]
	addi		$key_,$key_,0x20
	bdnz		Load_xts_enc_key

	lvx		v26,$x10,$key1
	?vperm		v24,v30,v31,$keyperm
	lvx		v27,$x20,$key1
	stvx		v24,$x00,$key_		# off-load round[3]
	?vperm		v25,v31,v26,$keyperm
	lvx		v28,$x30,$key1
	stvx		v25,$x10,$key_		# off-load round[4]
	addi		$key_,$sp,$FRAME+15	# rewind $key_
	?vperm		v26,v26,v27,$keyperm
	lvx		v29,$x40,$key1
	?vperm		v27,v27,v28,$keyperm
	lvx		v30,$x50,$key1
	?vperm		v28,v28,v29,$keyperm
	lvx		v31,$x60,$key1
	?vperm		v29,v29,v30,$keyperm
	lvx		$twk5,$x70,$key1	# borrow $twk5
	?vperm		v30,v30,v31,$keyperm
	lvx		v24,$x00,$key_		# pre-load round[1]
	?vperm		v31,v31,$twk5,$keyperm
	lvx		v25,$x10,$key_		# pre-load round[2]

	# Prime the first six blocks.  Each twkN is the tweak xor round key
	# 0, so a single vxor applies both the tweak and the first
	# AddRoundKey.  The vsrab/vaddubm/vsldoi/vand/vxor group advances the
	# tweak to the next block: a one-bit left shift of the 128-bit value
	# with the carry-out folded back in through the 0x87 byte of
	# $eighty7.  Lines indented by one extra space belong to the
	# interleaved input/length bookkeeping stream.
	 vperm		$in0,$inout,$inptail,$inpperm
	 subi		$inp,$inp,31		# undo "caller"
	vxor		$twk0,$tweak,$rndkey0
	vsrab		$tmp,$tweak,$seven	# next tweak value
	vaddubm		$tweak,$tweak,$tweak
	vsldoi		$tmp,$tmp,$tmp,15
	vand		$tmp,$tmp,$eighty7
	 vxor		$out0,$in0,$twk0
	vxor		$tweak,$tweak,$tmp

	 lvx_u		$in1,$x10,$inp
	vxor		$twk1,$tweak,$rndkey0
	vsrab		$tmp,$tweak,$seven	# next tweak value
	vaddubm		$tweak,$tweak,$tweak
	vsldoi		$tmp,$tmp,$tmp,15
	 le?vperm	$in1,$in1,$in1,$leperm
	vand		$tmp,$tmp,$eighty7
	 vxor		$out1,$in1,$twk1
	vxor		$tweak,$tweak,$tmp

	 lvx_u		$in2,$x20,$inp
	 andi.		$taillen,$len,15
	vxor		$twk2,$tweak,$rndkey0
	vsrab		$tmp,$tweak,$seven	# next tweak value
	vaddubm		$tweak,$tweak,$tweak
	vsldoi		$tmp,$tmp,$tmp,15
	 le?vperm	$in2,$in2,$in2,$leperm
	vand		$tmp,$tmp,$eighty7
	 vxor		$out2,$in2,$twk2
	vxor		$tweak,$tweak,$tmp

	 lvx_u		$in3,$x30,$inp
	 sub		$len,$len,$taillen
	vxor		$twk3,$tweak,$rndkey0
	vsrab		$tmp,$tweak,$seven	# next tweak value
	vaddubm		$tweak,$tweak,$tweak
	vsldoi		$tmp,$tmp,$tmp,15
	 le?vperm	$in3,$in3,$in3,$leperm
	vand		$tmp,$tmp,$eighty7
	 vxor		$out3,$in3,$twk3
	vxor		$tweak,$tweak,$tmp

	 lvx_u		$in4,$x40,$inp
	 subi		$len,$len,0x60
	vxor		$twk4,$tweak,$rndkey0
	vsrab		$tmp,$tweak,$seven	# next tweak value
	vaddubm		$tweak,$tweak,$tweak
	vsldoi		$tmp,$tmp,$tmp,15
	 le?vperm	$in4,$in4,$in4,$leperm
	vand		$tmp,$tmp,$eighty7
	 vxor		$out4,$in4,$twk4
	vxor		$tweak,$tweak,$tmp

	 lvx_u		$in5,$x50,$inp
	 addi		$inp,$inp,0x60
	vxor		$twk5,$tweak,$rndkey0
	vsrab		$tmp,$tweak,$seven	# next tweak value
	vaddubm		$tweak,$tweak,$tweak
	vsldoi		$tmp,$tmp,$tmp,15
	 le?vperm	$in5,$in5,$in5,$leperm
	vand		$tmp,$tmp,$eighty7
	 vxor		$out5,$in5,$twk5
	vxor		$tweak,$tweak,$tmp

	# v31 becomes last round key xor round key 0, so that twkN xor v31
	# is the tweak xor the last round key.
	vxor		v31,v31,$rndkey0
	mtctr		$rounds
	b		Loop_xts_enc6x

	# Main loop: two rounds per pass over all six blocks, with v24/v25
	# refilled from the stack copy of the key schedule.
.align	5
Loop_xts_enc6x:
	vcipher		$out0,$out0,v24
	vcipher		$out1,$out1,v24
	vcipher		$out2,$out2,v24
	vcipher		$out3,$out3,v24
	vcipher		$out4,$out4,v24
	vcipher		$out5,$out5,v24
	lvx		v24,$x20,$key_		# round[3]
	addi		$key_,$key_,0x20

	vcipher		$out0,$out0,v25
	vcipher		$out1,$out1,v25
	vcipher		$out2,$out2,v25
	vcipher		$out3,$out3,v25
	vcipher		$out4,$out4,v25
	vcipher		$out5,$out5,v25
	lvx		v25,$x10,$key_		# round[4]
	bdnz		Loop_xts_enc6x

	# Remaining rounds (v24, v25, then the resident v26-v30).  The
	# interleaved stream computes the six tweaks for the next pass,
	# turns each old twkN into a final-round operand inN, and adjusts
	# $inp when fewer than six blocks remain.
	subic		$len,$len,96		# $len-=96
	 vxor		$in0,$twk0,v31		# xor with last round key
	vcipher		$out0,$out0,v24
	vcipher		$out1,$out1,v24
	 vsrab		$tmp,$tweak,$seven	# next tweak value
	 vxor		$twk0,$tweak,$rndkey0
	 vaddubm	$tweak,$tweak,$tweak
	vcipher		$out2,$out2,v24
	vcipher		$out3,$out3,v24
	 vsldoi		$tmp,$tmp,$tmp,15
	vcipher		$out4,$out4,v24
	vcipher		$out5,$out5,v24

	subfe.		r0,r0,r0		# borrow?-1:0
	 vand		$tmp,$tmp,$eighty7
	vcipher		$out0,$out0,v25
	vcipher		$out1,$out1,v25
	 vxor		$tweak,$tweak,$tmp
	vcipher		$out2,$out2,v25
	vcipher		$out3,$out3,v25
	 vxor		$in1,$twk1,v31
	 vsrab		$tmp,$tweak,$seven	# next tweak value
	 vxor		$twk1,$tweak,$rndkey0
	vcipher		$out4,$out4,v25
	vcipher		$out5,$out5,v25

	and		r0,r0,$len
	 vaddubm	$tweak,$tweak,$tweak
	 vsldoi		$tmp,$tmp,$tmp,15
	vcipher		$out0,$out0,v26
	vcipher		$out1,$out1,v26
	 vand		$tmp,$tmp,$eighty7
	vcipher		$out2,$out2,v26
	vcipher		$out3,$out3,v26
	 vxor		$tweak,$tweak,$tmp
	vcipher		$out4,$out4,v26
	vcipher		$out5,$out5,v26

	add		$inp,$inp,r0		# $inp is adjusted in such
						# way that at exit from the
						# loop inX-in5 are loaded
						# with last "words"
	 vxor		$in2,$twk2,v31
	 vsrab		$tmp,$tweak,$seven	# next tweak value
	 vxor		$twk2,$tweak,$rndkey0
	 vaddubm	$tweak,$tweak,$tweak
	vcipher		$out0,$out0,v27
	vcipher		$out1,$out1,v27
	 vsldoi		$tmp,$tmp,$tmp,15
	vcipher		$out2,$out2,v27
	vcipher		$out3,$out3,v27
	 vand		$tmp,$tmp,$eighty7
	vcipher		$out4,$out4,v27
	vcipher		$out5,$out5,v27

	addi		$key_,$sp,$FRAME+15	# rewind $key_
	 vxor		$tweak,$tweak,$tmp
	vcipher		$out0,$out0,v28
	vcipher		$out1,$out1,v28
	 vxor		$in3,$twk3,v31
	 vsrab		$tmp,$tweak,$seven	# next tweak value
	 vxor		$twk3,$tweak,$rndkey0
	vcipher		$out2,$out2,v28
	vcipher		$out3,$out3,v28
	 vaddubm	$tweak,$tweak,$tweak
	 vsldoi		$tmp,$tmp,$tmp,15
	vcipher		$out4,$out4,v28
	vcipher		$out5,$out5,v28
	lvx		v24,$x00,$key_		# re-pre-load round[1]
	 vand		$tmp,$tmp,$eighty7

	vcipher		$out0,$out0,v29
	vcipher		$out1,$out1,v29
	 vxor		$tweak,$tweak,$tmp
	vcipher		$out2,$out2,v29
	vcipher		$out3,$out3,v29
	 vxor		$in4,$twk4,v31
	 vsrab		$tmp,$tweak,$seven	# next tweak value
	 vxor		$twk4,$tweak,$rndkey0
	vcipher		$out4,$out4,v29
	vcipher		$out5,$out5,v29
	lvx		v25,$x10,$key_		# re-pre-load round[2]
	 vaddubm	$tweak,$tweak,$tweak
	 vsldoi		$tmp,$tmp,$tmp,15

	vcipher		$out0,$out0,v30
	vcipher		$out1,$out1,v30
	 vand		$tmp,$tmp,$eighty7
	vcipher		$out2,$out2,v30
	vcipher		$out3,$out3,v30
	 vxor		$tweak,$tweak,$tmp
	vcipher		$out4,$out4,v30
	vcipher		$out5,$out5,v30
	 vxor		$in5,$twk5,v31
	 vsrab		$tmp,$tweak,$seven	# next tweak value
	 vxor		$twk5,$tweak,$rndkey0

	vcipherlast	$out0,$out0,$in0
	 lvx_u		$in0,$x00,$inp		# load next input block
	 vaddubm	$tweak,$tweak,$tweak
	 vsldoi		$tmp,$tmp,$tmp,15
	vcipherlast	$out1,$out1,$in1
	 lvx_u		$in1,$x10,$inp
	vcipherlast	$out2,$out2,$in2
	 le?vperm	$in0,$in0,$in0,$leperm
	 lvx_u		$in2,$x20,$inp
	 vand		$tmp,$tmp,$eighty7
	vcipherlast	$out3,$out3,$in3
	 le?vperm	$in1,$in1,$in1,$leperm
	 lvx_u		$in3,$x30,$inp
	vcipherlast	$out4,$out4,$in4
	 le?vperm	$in2,$in2,$in2,$leperm
	 lvx_u		$in4,$x40,$inp
	 vxor		$tweak,$tweak,$tmp
	vcipherlast	$tmp,$out5,$in5		# last block might be needed
						# in stealing mode
	 le?vperm	$in3,$in3,$in3,$leperm
	 lvx_u		$in5,$x50,$inp
	 addi		$inp,$inp,0x60
	 le?vperm	$in4,$in4,$in4,$leperm
	 le?vperm	$in5,$in5,$in5,$leperm

	le?vperm	$out0,$out0,$out0,$leperm
	le?vperm	$out1,$out1,$out1,$leperm
	stvx_u		$out0,$x00,$out		# store output
	 vxor		$out0,$in0,$twk0
	le?vperm	$out2,$out2,$out2,$leperm
	stvx_u		$out1,$x10,$out
	 vxor		$out1,$in1,$twk1
	le?vperm	$out3,$out3,$out3,$leperm
	stvx_u		$out2,$x20,$out
	 vxor		$out2,$in2,$twk2
	le?vperm	$out4,$out4,$out4,$leperm
	stvx_u		$out3,$x30,$out
	 vxor		$out3,$in3,$twk3
	le?vperm	$out5,$tmp,$tmp,$leperm
	stvx_u		$out4,$x40,$out
	 vxor		$out4,$in4,$twk4
	le?stvx_u	$out5,$x50,$out
	be?stvx_u	$tmp, $x50,$out
	 vxor		$out5,$in5,$twk5
	addi		$out,$out,0x60

	# CR0 still comes from the subfe. above: r0 is zero when the
	# subtraction did not borrow, that is when six more whole blocks
	# remain, and in that case the loop is taken again.
	mtctr		$rounds
	beq		Loop_xts_enc6x		# did $len-=96 borrow?

	# Tail dispatch.  After the addic. $len is the byte count of the
	# whole blocks still to encrypt (0x00 to 0x50); the last of them
	# sits in $in5 thanks to the $inp adjustment above.
	addic.		$len,$len,0x60
	beq		Lxts_enc6x_zero
	cmpwi		$len,0x20
	blt		Lxts_enc6x_one
	nop
	beq		Lxts_enc6x_two
	cmpwi		$len,0x40
	blt		Lxts_enc6x_three
	nop
	beq		Lxts_enc6x_four

	# Five blocks left, held in $in1 to $in5.
Lxts_enc6x_five:
	vxor		$out0,$in1,$twk0
	vxor		$out1,$in2,$twk1
	vxor		$out2,$in3,$twk2
	vxor		$out3,$in4,$twk3
	vxor		$out4,$in5,$twk4

	bl		_aesp8_xts_enc5x

	le?vperm	$out0,$out0,$out0,$leperm
	vmr		$twk0,$twk5		# unused tweak
	le?vperm	$out1,$out1,$out1,$leperm
	stvx_u		$out0,$x00,$out		# store output
	le?vperm	$out2,$out2,$out2,$leperm
	stvx_u		$out1,$x10,$out
	le?vperm	$out3,$out3,$out3,$leperm
	stvx_u		$out2,$x20,$out
	vxor		$tmp,$out4,$twk5	# last block prep for stealing
	le?vperm	$out4,$out4,$out4,$leperm
	stvx_u		$out3,$x30,$out
	stvx_u		$out4,$x40,$out
	addi		$out,$out,0x50
	# CR0 was set by the cmpwi on $taillen inside _aesp8_xts_enc5x.
	bne		Lxts_enc6x_steal
	b		Lxts_enc6x_done

	# Four blocks left, held in $in2 to $in5; the spare lane is zeroed.
.align	4
Lxts_enc6x_four:
	vxor		$out0,$in2,$twk0
	vxor		$out1,$in3,$twk1
	vxor		$out2,$in4,$twk2
	vxor		$out3,$in5,$twk3
	vxor		$out4,$out4,$out4

	bl		_aesp8_xts_enc5x

	le?vperm	$out0,$out0,$out0,$leperm
	vmr		$twk0,$twk4		# unused tweak
	le?vperm	$out1,$out1,$out1,$leperm
	stvx_u		$out0,$x00,$out		# store output
	le?vperm	$out2,$out2,$out2,$leperm
	stvx_u		$out1,$x10,$out
	vxor		$tmp,$out3,$twk4	# last block prep for stealing
	le?vperm	$out3,$out3,$out3,$leperm
	stvx_u		$out2,$x20,$out
	stvx_u		$out3,$x30,$out
	addi		$out,$out,0x40
	# CR0 was set by the cmpwi on $taillen inside _aesp8_xts_enc5x.
	bne		Lxts_enc6x_steal
	b		Lxts_enc6x_done

	# Three blocks left, held in $in3 to $in5.
.align	4
Lxts_enc6x_three:
	vxor		$out0,$in3,$twk0
	vxor		$out1,$in4,$twk1
	vxor		$out2,$in5,$twk2
	vxor		$out3,$out3,$out3
	vxor		$out4,$out4,$out4

	bl		_aesp8_xts_enc5x

	le?vperm	$out0,$out0,$out0,$leperm
	vmr		$twk0,$twk3		# unused tweak
	le?vperm	$out1,$out1,$out1,$leperm
	stvx_u		$out0,$x00,$out		# store output
	vxor		$tmp,$out2,$twk3	# last block prep for stealing
	le?vperm	$out2,$out2,$out2,$leperm
	stvx_u		$out1,$x10,$out
	stvx_u		$out2,$x20,$out
	addi		$out,$out,0x30
	# CR0 was set by the cmpwi on $taillen inside _aesp8_xts_enc5x.
	bne		Lxts_enc6x_steal
	b		Lxts_enc6x_done

	# Two blocks left, held in $in4 and $in5.
.align	4
Lxts_enc6x_two:
	vxor		$out0,$in4,$twk0
	vxor		$out1,$in5,$twk1
	vxor		$out2,$out2,$out2
	vxor		$out3,$out3,$out3
	vxor		$out4,$out4,$out4

	bl		_aesp8_xts_enc5x

	le?vperm	$out0,$out0,$out0,$leperm
	vmr		$twk0,$twk2		# unused tweak
	vxor		$tmp,$out1,$twk2	# last block prep for stealing
	le?vperm	$out1,$out1,$out1,$leperm
	stvx_u		$out0,$x00,$out		# store output
	stvx_u		$out1,$x10,$out
	addi		$out,$out,0x20
	# CR0 was set by the cmpwi on $taillen inside _aesp8_xts_enc5x.
	bne		Lxts_enc6x_steal
	b		Lxts_enc6x_done

	# One block left, held in $in5.  Loop_xts_enc1x is also the target
	# of the stealing path, which arrives with $out0 already prepared
	# and $taillen cleared.
.align	4
Lxts_enc6x_one:
	vxor		$out0,$in5,$twk0
	nop
Loop_xts_enc1x:
	vcipher		$out0,$out0,v24
	lvx		v24,$x20,$key_		# round[3]
	addi		$key_,$key_,0x20

	vcipher		$out0,$out0,v25
	lvx		v25,$x10,$key_		# round[4]
	bdnz		Loop_xts_enc1x

	# While the last rounds run, point $inp at the final 16 input bytes
	# and fetch them into $in0 in case a partial tail has to be stolen.
	add		$inp,$inp,$taillen
	cmpwi		$taillen,0
	vcipher		$out0,$out0,v24

	subi		$inp,$inp,16
	vcipher		$out0,$out0,v25

	lvsr		$inpperm,0,$taillen
	vcipher		$out0,$out0,v26

	lvx_u		$in0,0,$inp
	vcipher		$out0,$out0,v27

	addi		$key_,$sp,$FRAME+15	# rewind $key_
	vcipher		$out0,$out0,v28
	lvx		v24,$x00,$key_		# re-pre-load round[1]

	vcipher		$out0,$out0,v29
	lvx		v25,$x10,$key_		# re-pre-load round[2]
	 vxor		$twk0,$twk0,v31

	le?vperm	$in0,$in0,$in0,$leperm
	vcipher		$out0,$out0,v30

	vperm		$in0,$in0,$in0,$inpperm
	vcipherlast	$out0,$out0,$twk0

	vmr		$twk0,$twk1		# unused tweak
	vxor		$tmp,$out0,$twk1	# last block prep for stealing
	le?vperm	$out0,$out0,$out0,$leperm
	stvx_u		$out0,$x00,$out		# store output
	addi		$out,$out,0x10
	# CR0 was set by the cmpwi on $taillen above.
	bne		Lxts_enc6x_steal
	b		Lxts_enc6x_done

	# No whole block left.  Without a partial tail the work is done;
	# otherwise fetch the final 16 input bytes and reuse the last
	# ciphertext block of the main loop, which is still in $tmp.
.align	4
Lxts_enc6x_zero:
	cmpwi		$taillen,0
	beq		Lxts_enc6x_done

	add		$inp,$inp,$taillen
	subi		$inp,$inp,16
	lvx_u		$in0,0,$inp
	lvsr		$inpperm,0,$taillen	# $in5 is no more
	le?vperm	$in0,$in0,$in0,$leperm
	vperm		$in0,$in0,$in0,$inpperm
	vxor		$tmp,$tmp,$twk0
	# Ciphertext stealing.  The final input block is merged from the
	# tail input bytes and the previous ciphertext block under a byte
	# mask derived from $taillen.  The byte loop copies the first
	# $taillen bytes of the last stored block to the position right
	# after it.  $out is stepped back by 16 so that the re-encrypted
	# block overwrites that last stored block.
Lxts_enc6x_steal:
	vxor		$in0,$in0,$twk0
	vxor		$out0,$out0,$out0
	vspltisb	$out1,-1
	vperm		$out0,$out0,$out1,$inpperm
	vsel		$out0,$in0,$tmp,$out0	# $tmp is last block, remember?

	subi		r30,$out,17
	subi		$out,$out,16
	mtctr		$taillen
Loop_xts_enc6x_steal:
	lbzu		r0,1(r30)
	stb		r0,16(r30)
	bdnz		Loop_xts_enc6x_steal

	li		$taillen,0		# no second stealing pass
	mtctr		$rounds
	b		Loop_xts_enc1x		# one more time...

	# If $ivp is non-null, write the next tweak back.  $twk0 still has
	# round key 0 folded in, which the vxor removes.
.align	4
Lxts_enc6x_done:
	${UCMP}i	$ivp,0
	beq		Lxts_enc6x_ret

	vxor		$tweak,$twk0,$rndkey0
	le?vperm	$tweak,$tweak,$tweak,$leperm
	stvx_u		$tweak,0,$ivp

	# Epilogue.  r11 has held the return address since the prologue and
	# is reused as a scratch index once LR is restored.
Lxts_enc6x_ret:
	mtlr		r11
	li		r10,`$FRAME+15`
	li		r11,`$FRAME+31`
	stvx		$seven,r10,$sp		# wipe copies of round keys
	addi		r10,r10,32
	stvx		$seven,r11,$sp
	addi		r11,r11,32
	stvx		$seven,r10,$sp
	addi		r10,r10,32
	stvx		$seven,r11,$sp
	addi		r11,r11,32
	stvx		$seven,r10,$sp
	addi		r10,r10,32
	stvx		$seven,r11,$sp
	addi		r11,r11,32
	stvx		$seven,r10,$sp
	addi		r10,r10,32
	stvx		$seven,r11,$sp
	addi		r11,r11,32

	# After eight wiped slots r10/r11 are at the v20/v21 save slots, so
	# the restore simply continues with the same two indices.
	mtspr		256,$vrsave
	lvx		v20,r10,$sp		# ABI says so
	addi		r10,r10,32
	lvx		v21,r11,$sp
	addi		r11,r11,32
	lvx		v22,r10,$sp
	addi		r10,r10,32
	lvx		v23,r11,$sp
	addi		r11,r11,32
	lvx		v24,r10,$sp
	addi		r10,r10,32
	lvx		v25,r11,$sp
	addi		r11,r11,32
	lvx		v26,r10,$sp
	addi		r10,r10,32
	lvx		v27,r11,$sp
	addi		r11,r11,32
	lvx		v28,r10,$sp
	addi		r10,r10,32
	lvx		v29,r11,$sp
	addi		r11,r11,32
	lvx		v30,r10,$sp
	lvx		v31,r11,$sp
	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
	blr
	.long		0
	.byte		0,12,0x04,1,0x80,6,6,0
	.long		0
2978
# _aesp8_xts_enc5x: encrypt up to five blocks for the tail of
# _aesp8_xts_encrypt6x.  Called with "bl" and returns with "blr".
#
# On entry:  $out0..$out4 hold the blocks already xored with their tweaks
#            (unused lanes are zeroed by the caller), CTR holds $rounds,
#            v24/v25 hold the first two stacked round keys and $key_
#            points at the start of the stacked key schedule.
# On return: $out0..$out4 hold the ciphertext (not yet byte-swapped for
#            little-endian stores); CR0 reflects the compare of $taillen
#            with 0; $inp points at the final 16 input bytes, which are
#            loaded into $in0 and permuted for stealing; $inpperm is
#            replaced; $twk0 and $in1..$in4 are overwritten with
#            final-round operands; v24/v25 and $key_ are re-primed.
# The entry label doubles as the head of the two-rounds-per-pass loop.
.align	5
_aesp8_xts_enc5x:
	vcipher		$out0,$out0,v24
	vcipher		$out1,$out1,v24
	vcipher		$out2,$out2,v24
	vcipher		$out3,$out3,v24
	vcipher		$out4,$out4,v24
	lvx		v24,$x20,$key_		# round[3]
	addi		$key_,$key_,0x20

	vcipher		$out0,$out0,v25
	vcipher		$out1,$out1,v25
	vcipher		$out2,$out2,v25
	vcipher		$out3,$out3,v25
	vcipher		$out4,$out4,v25
	lvx		v25,$x10,$key_		# round[4]
	bdnz		_aesp8_xts_enc5x

	add		$inp,$inp,$taillen
	cmpwi		$taillen,0		# CR0 is consumed by the caller
	vcipher		$out0,$out0,v24
	vcipher		$out1,$out1,v24
	vcipher		$out2,$out2,v24
	vcipher		$out3,$out3,v24
	vcipher		$out4,$out4,v24

	subi		$inp,$inp,16		# final 16 input bytes
	vcipher		$out0,$out0,v25
	vcipher		$out1,$out1,v25
	vcipher		$out2,$out2,v25
	vcipher		$out3,$out3,v25
	vcipher		$out4,$out4,v25
	 vxor		$twk0,$twk0,v31		# final-round operand, lane 0

	vcipher		$out0,$out0,v26
	lvsr		$inpperm,0,$taillen	# $in5 is no more
	vcipher		$out1,$out1,v26
	vcipher		$out2,$out2,v26
	vcipher		$out3,$out3,v26
	vcipher		$out4,$out4,v26
	 vxor		$in1,$twk1,v31

	vcipher		$out0,$out0,v27
	lvx_u		$in0,0,$inp
	vcipher		$out1,$out1,v27
	vcipher		$out2,$out2,v27
	vcipher		$out3,$out3,v27
	vcipher		$out4,$out4,v27
	 vxor		$in2,$twk2,v31

	addi		$key_,$sp,$FRAME+15	# rewind $key_
	vcipher		$out0,$out0,v28
	vcipher		$out1,$out1,v28
	vcipher		$out2,$out2,v28
	vcipher		$out3,$out3,v28
	vcipher		$out4,$out4,v28
	lvx		v24,$x00,$key_		# re-pre-load round[1]
	 vxor		$in3,$twk3,v31

	vcipher		$out0,$out0,v29
	le?vperm	$in0,$in0,$in0,$leperm
	vcipher		$out1,$out1,v29
	vcipher		$out2,$out2,v29
	vcipher		$out3,$out3,v29
	vcipher		$out4,$out4,v29
	lvx		v25,$x10,$key_		# re-pre-load round[2]
	 vxor		$in4,$twk4,v31

	vcipher		$out0,$out0,v30
	vperm		$in0,$in0,$in0,$inpperm
	vcipher		$out1,$out1,v30
	vcipher		$out2,$out2,v30
	vcipher		$out3,$out3,v30
	vcipher		$out4,$out4,v30

	vcipherlast	$out0,$out0,$twk0
	vcipherlast	$out1,$out1,$in1
	vcipherlast	$out2,$out2,$in2
	vcipherlast	$out3,$out3,$in3
	vcipherlast	$out4,$out4,$in4
	blr
        .long   	0
        .byte   	0,12,0x14,0,0,0,0,0
3062
3063.align	5
3064_aesp8_xts_decrypt6x:
3065	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
3066	mflr		r11
3067	li		r7,`$FRAME+8*16+15`
3068	li		r3,`$FRAME+8*16+31`
3069	$PUSH		r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
3070	stvx		v20,r7,$sp		# ABI says so
3071	addi		r7,r7,32
3072	stvx		v21,r3,$sp
3073	addi		r3,r3,32
3074	stvx		v22,r7,$sp
3075	addi		r7,r7,32
3076	stvx		v23,r3,$sp
3077	addi		r3,r3,32
3078	stvx		v24,r7,$sp
3079	addi		r7,r7,32
3080	stvx		v25,r3,$sp
3081	addi		r3,r3,32
3082	stvx		v26,r7,$sp
3083	addi		r7,r7,32
3084	stvx		v27,r3,$sp
3085	addi		r3,r3,32
3086	stvx		v28,r7,$sp
3087	addi		r7,r7,32
3088	stvx		v29,r3,$sp
3089	addi		r3,r3,32
3090	stvx		v30,r7,$sp
3091	stvx		v31,r3,$sp
3092	li		r0,-1
3093	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
3094	li		$x10,0x10
3095	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
3096	li		$x20,0x20
3097	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
3098	li		$x30,0x30
3099	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
3100	li		$x40,0x40
3101	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
3102	li		$x50,0x50
3103	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
3104	li		$x60,0x60
3105	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
3106	li		$x70,0x70
3107	mtspr		256,r0
3108
3109	subi		$rounds,$rounds,3	# -4 in total
3110
3111	lvx		$rndkey0,$x00,$key1	# load key schedule
3112	lvx		v30,$x10,$key1
3113	addi		$key1,$key1,0x20
3114	lvx		v31,$x00,$key1
3115	?vperm		$rndkey0,$rndkey0,v30,$keyperm
3116	addi		$key_,$sp,$FRAME+15
3117	mtctr		$rounds
3118
3119Load_xts_dec_key:
3120	?vperm		v24,v30,v31,$keyperm
3121	lvx		v30,$x10,$key1
3122	addi		$key1,$key1,0x20
3123	stvx		v24,$x00,$key_		# off-load round[1]
3124	?vperm		v25,v31,v30,$keyperm
3125	lvx		v31,$x00,$key1
3126	stvx		v25,$x10,$key_		# off-load round[2]
3127	addi		$key_,$key_,0x20
3128	bdnz		Load_xts_dec_key
3129
3130	lvx		v26,$x10,$key1
3131	?vperm		v24,v30,v31,$keyperm
3132	lvx		v27,$x20,$key1
3133	stvx		v24,$x00,$key_		# off-load round[3]
3134	?vperm		v25,v31,v26,$keyperm
3135	lvx		v28,$x30,$key1
3136	stvx		v25,$x10,$key_		# off-load round[4]
3137	addi		$key_,$sp,$FRAME+15	# rewind $key_
3138	?vperm		v26,v26,v27,$keyperm
3139	lvx		v29,$x40,$key1
3140	?vperm		v27,v27,v28,$keyperm
3141	lvx		v30,$x50,$key1
3142	?vperm		v28,v28,v29,$keyperm
3143	lvx		v31,$x60,$key1
3144	?vperm		v29,v29,v30,$keyperm
3145	lvx		$twk5,$x70,$key1	# borrow $twk5
3146	?vperm		v30,v30,v31,$keyperm
3147	lvx		v24,$x00,$key_		# pre-load round[1]
3148	?vperm		v31,v31,$twk5,$keyperm
3149	lvx		v25,$x10,$key_		# pre-load round[2]
3150
3151	 vperm		$in0,$inout,$inptail,$inpperm
3152	 subi		$inp,$inp,31		# undo "caller"
3153	vxor		$twk0,$tweak,$rndkey0
3154	vsrab		$tmp,$tweak,$seven	# next tweak value
3155	vaddubm		$tweak,$tweak,$tweak
3156	vsldoi		$tmp,$tmp,$tmp,15
3157	vand		$tmp,$tmp,$eighty7
3158	 vxor		$out0,$in0,$twk0
3159	vxor		$tweak,$tweak,$tmp
3160
3161	 lvx_u		$in1,$x10,$inp
3162	vxor		$twk1,$tweak,$rndkey0
3163	vsrab		$tmp,$tweak,$seven	# next tweak value
3164	vaddubm		$tweak,$tweak,$tweak
3165	vsldoi		$tmp,$tmp,$tmp,15
3166	 le?vperm	$in1,$in1,$in1,$leperm
3167	vand		$tmp,$tmp,$eighty7
3168	 vxor		$out1,$in1,$twk1
3169	vxor		$tweak,$tweak,$tmp
3170
3171	 lvx_u		$in2,$x20,$inp
3172	 andi.		$taillen,$len,15
3173	vxor		$twk2,$tweak,$rndkey0
3174	vsrab		$tmp,$tweak,$seven	# next tweak value
3175	vaddubm		$tweak,$tweak,$tweak
3176	vsldoi		$tmp,$tmp,$tmp,15
3177	 le?vperm	$in2,$in2,$in2,$leperm
3178	vand		$tmp,$tmp,$eighty7
3179	 vxor		$out2,$in2,$twk2
3180	vxor		$tweak,$tweak,$tmp
3181
3182	 lvx_u		$in3,$x30,$inp
3183	 sub		$len,$len,$taillen
3184	vxor		$twk3,$tweak,$rndkey0
3185	vsrab		$tmp,$tweak,$seven	# next tweak value
3186	vaddubm		$tweak,$tweak,$tweak
3187	vsldoi		$tmp,$tmp,$tmp,15
3188	 le?vperm	$in3,$in3,$in3,$leperm
3189	vand		$tmp,$tmp,$eighty7
3190	 vxor		$out3,$in3,$twk3
3191	vxor		$tweak,$tweak,$tmp
3192
3193	 lvx_u		$in4,$x40,$inp
3194	 subi		$len,$len,0x60
3195	vxor		$twk4,$tweak,$rndkey0
3196	vsrab		$tmp,$tweak,$seven	# next tweak value
3197	vaddubm		$tweak,$tweak,$tweak
3198	vsldoi		$tmp,$tmp,$tmp,15
3199	 le?vperm	$in4,$in4,$in4,$leperm
3200	vand		$tmp,$tmp,$eighty7
3201	 vxor		$out4,$in4,$twk4
3202	vxor		$tweak,$tweak,$tmp
3203
3204	 lvx_u		$in5,$x50,$inp
3205	 addi		$inp,$inp,0x60
3206	vxor		$twk5,$tweak,$rndkey0
3207	vsrab		$tmp,$tweak,$seven	# next tweak value
3208	vaddubm		$tweak,$tweak,$tweak
3209	vsldoi		$tmp,$tmp,$tmp,15
3210	 le?vperm	$in5,$in5,$in5,$leperm
3211	vand		$tmp,$tmp,$eighty7
3212	 vxor		$out5,$in5,$twk5
3213	vxor		$tweak,$tweak,$tmp
3214
3215	vxor		v31,v31,$rndkey0
3216	mtctr		$rounds
3217	b		Loop_xts_dec6x
3218
3219.align	5
3220Loop_xts_dec6x:
3221	vncipher	$out0,$out0,v24
3222	vncipher	$out1,$out1,v24
3223	vncipher	$out2,$out2,v24
3224	vncipher	$out3,$out3,v24
3225	vncipher	$out4,$out4,v24
3226	vncipher	$out5,$out5,v24
3227	lvx		v24,$x20,$key_		# round[3]
3228	addi		$key_,$key_,0x20
3229
3230	vncipher	$out0,$out0,v25
3231	vncipher	$out1,$out1,v25
3232	vncipher	$out2,$out2,v25
3233	vncipher	$out3,$out3,v25
3234	vncipher	$out4,$out4,v25
3235	vncipher	$out5,$out5,v25
3236	lvx		v25,$x10,$key_		# round[4]
3237	bdnz		Loop_xts_dec6x
3238
3239	subic		$len,$len,96		# $len-=96
3240	 vxor		$in0,$twk0,v31		# xor with last round key
3241	vncipher	$out0,$out0,v24
3242	vncipher	$out1,$out1,v24
3243	 vsrab		$tmp,$tweak,$seven	# next tweak value
3244	 vxor		$twk0,$tweak,$rndkey0
3245	 vaddubm	$tweak,$tweak,$tweak
3246	vncipher	$out2,$out2,v24
3247	vncipher	$out3,$out3,v24
3248	 vsldoi		$tmp,$tmp,$tmp,15
3249	vncipher	$out4,$out4,v24
3250	vncipher	$out5,$out5,v24
3251
3252	subfe.		r0,r0,r0		# borrow?-1:0
3253	 vand		$tmp,$tmp,$eighty7
3254	vncipher	$out0,$out0,v25
3255	vncipher	$out1,$out1,v25
3256	 vxor		$tweak,$tweak,$tmp
3257	vncipher	$out2,$out2,v25
3258	vncipher	$out3,$out3,v25
3259	 vxor		$in1,$twk1,v31
3260	 vsrab		$tmp,$tweak,$seven	# next tweak value
3261	 vxor		$twk1,$tweak,$rndkey0
3262	vncipher	$out4,$out4,v25
3263	vncipher	$out5,$out5,v25
3264
3265	and		r0,r0,$len
3266	 vaddubm	$tweak,$tweak,$tweak
3267	 vsldoi		$tmp,$tmp,$tmp,15
3268	vncipher	$out0,$out0,v26
3269	vncipher	$out1,$out1,v26
3270	 vand		$tmp,$tmp,$eighty7
3271	vncipher	$out2,$out2,v26
3272	vncipher	$out3,$out3,v26
3273	 vxor		$tweak,$tweak,$tmp
3274	vncipher	$out4,$out4,v26
3275	vncipher	$out5,$out5,v26
3276
3277	add		$inp,$inp,r0		# $inp is adjusted in such
3278						# way that at exit from the
3279						# loop inX-in5 are loaded
3280						# with last "words"
3281	 vxor		$in2,$twk2,v31
3282	 vsrab		$tmp,$tweak,$seven	# next tweak value
3283	 vxor		$twk2,$tweak,$rndkey0
3284	 vaddubm	$tweak,$tweak,$tweak
3285	vncipher	$out0,$out0,v27
3286	vncipher	$out1,$out1,v27
3287	 vsldoi		$tmp,$tmp,$tmp,15
3288	vncipher	$out2,$out2,v27
3289	vncipher	$out3,$out3,v27
3290	 vand		$tmp,$tmp,$eighty7
3291	vncipher	$out4,$out4,v27
3292	vncipher	$out5,$out5,v27
3293
3294	addi		$key_,$sp,$FRAME+15	# rewind $key_
3295	 vxor		$tweak,$tweak,$tmp
3296	vncipher	$out0,$out0,v28
3297	vncipher	$out1,$out1,v28
3298	 vxor		$in3,$twk3,v31
3299	 vsrab		$tmp,$tweak,$seven	# next tweak value
3300	 vxor		$twk3,$tweak,$rndkey0
3301	vncipher	$out2,$out2,v28
3302	vncipher	$out3,$out3,v28
3303	 vaddubm	$tweak,$tweak,$tweak
3304	 vsldoi		$tmp,$tmp,$tmp,15
3305	vncipher	$out4,$out4,v28
3306	vncipher	$out5,$out5,v28
3307	lvx		v24,$x00,$key_		# re-pre-load round[1]
3308	 vand		$tmp,$tmp,$eighty7
3309
3310	vncipher	$out0,$out0,v29
3311	vncipher	$out1,$out1,v29
3312	 vxor		$tweak,$tweak,$tmp
3313	vncipher	$out2,$out2,v29
3314	vncipher	$out3,$out3,v29
3315	 vxor		$in4,$twk4,v31
3316	 vsrab		$tmp,$tweak,$seven	# next tweak value
3317	 vxor		$twk4,$tweak,$rndkey0
3318	vncipher	$out4,$out4,v29
3319	vncipher	$out5,$out5,v29
3320	lvx		v25,$x10,$key_		# re-pre-load round[2]
3321	 vaddubm	$tweak,$tweak,$tweak
3322	 vsldoi		$tmp,$tmp,$tmp,15
3323
3324	vncipher	$out0,$out0,v30
3325	vncipher	$out1,$out1,v30
3326	 vand		$tmp,$tmp,$eighty7
3327	vncipher	$out2,$out2,v30
3328	vncipher	$out3,$out3,v30
3329	 vxor		$tweak,$tweak,$tmp
3330	vncipher	$out4,$out4,v30
3331	vncipher	$out5,$out5,v30
3332	 vxor		$in5,$twk5,v31
3333	 vsrab		$tmp,$tweak,$seven	# next tweak value
3334	 vxor		$twk5,$tweak,$rndkey0
3335
3336	vncipherlast	$out0,$out0,$in0
3337	 lvx_u		$in0,$x00,$inp		# load next input block
3338	 vaddubm	$tweak,$tweak,$tweak
3339	 vsldoi		$tmp,$tmp,$tmp,15
3340	vncipherlast	$out1,$out1,$in1
3341	 lvx_u		$in1,$x10,$inp
3342	vncipherlast	$out2,$out2,$in2
3343	 le?vperm	$in0,$in0,$in0,$leperm
3344	 lvx_u		$in2,$x20,$inp
3345	 vand		$tmp,$tmp,$eighty7
3346	vncipherlast	$out3,$out3,$in3
3347	 le?vperm	$in1,$in1,$in1,$leperm
3348	 lvx_u		$in3,$x30,$inp
3349	vncipherlast	$out4,$out4,$in4
3350	 le?vperm	$in2,$in2,$in2,$leperm
3351	 lvx_u		$in4,$x40,$inp
3352	 vxor		$tweak,$tweak,$tmp
3353	vncipherlast	$out5,$out5,$in5
3354	 le?vperm	$in3,$in3,$in3,$leperm
3355	 lvx_u		$in5,$x50,$inp
3356	 addi		$inp,$inp,0x60
3357	 le?vperm	$in4,$in4,$in4,$leperm
3358	 le?vperm	$in5,$in5,$in5,$leperm
3359
3360	le?vperm	$out0,$out0,$out0,$leperm
3361	le?vperm	$out1,$out1,$out1,$leperm
3362	stvx_u		$out0,$x00,$out		# store output
3363	 vxor		$out0,$in0,$twk0
3364	le?vperm	$out2,$out2,$out2,$leperm
3365	stvx_u		$out1,$x10,$out
3366	 vxor		$out1,$in1,$twk1
3367	le?vperm	$out3,$out3,$out3,$leperm
3368	stvx_u		$out2,$x20,$out
3369	 vxor		$out2,$in2,$twk2
3370	le?vperm	$out4,$out4,$out4,$leperm
3371	stvx_u		$out3,$x30,$out
3372	 vxor		$out3,$in3,$twk3
3373	le?vperm	$out5,$out5,$out5,$leperm
3374	stvx_u		$out4,$x40,$out
3375	 vxor		$out4,$in4,$twk4
3376	stvx_u		$out5,$x50,$out
3377	 vxor		$out5,$in5,$twk5
3378	addi		$out,$out,0x60
3379
3380	mtctr		$rounds
3381	beq		Loop_xts_dec6x		# did $len-=96 borrow?
3382
3383	addic.		$len,$len,0x60
3384	beq		Lxts_dec6x_zero
3385	cmpwi		$len,0x20
3386	blt		Lxts_dec6x_one
3387	nop
3388	beq		Lxts_dec6x_two
3389	cmpwi		$len,0x40
3390	blt		Lxts_dec6x_three
3391	nop
3392	beq		Lxts_dec6x_four
3393
# Tail with five whole blocks pending (reached when more than 0x40
# bytes remained after the borrow test). The ciphertext sits in
# $in1..$in5 and is xored with $twk0..$twk4 before the five-lane
# helper runs the rounds.
Lxts_dec6x_five:
	vxor		$out0,$in1,$twk0
	vxor		$out1,$in2,$twk1
	vxor		$out2,$in3,$twk2
	vxor		$out3,$in4,$twk3
	vxor		$out4,$in5,$twk4

	bl		_aesp8_xts_dec5x

# On return the helper has also fetched one more input block into
# $in0 and its last compare was $taillen against zero.
	le?vperm	$out0,$out0,$out0,$leperm
	vmr		$twk0,$twk5		# unused tweak
# form the following tweak the same way the main loop does, by
# xoring the running tweak with $rndkey0
	vxor		$twk1,$tweak,$rndkey0
	le?vperm	$out1,$out1,$out1,$leperm
	stvx_u		$out0,$x00,$out		# store output
# prepare the helper-fetched block with the later tweak for the
# stealing path
	vxor		$out0,$in0,$twk1
	le?vperm	$out2,$out2,$out2,$leperm
	stvx_u		$out1,$x10,$out
	le?vperm	$out3,$out3,$out3,$leperm
	stvx_u		$out2,$x20,$out
	le?vperm	$out4,$out4,$out4,$leperm
	stvx_u		$out3,$x30,$out
	stvx_u		$out4,$x40,$out
	addi		$out,$out,0x50
# non-zero tail length means a partial block still needs stealing
	bne		Lxts_dec6x_steal
	b		Lxts_dec6x_done
3419
3420.align	4
# Tail with four whole blocks pending (exactly 0x40 bytes remained).
# The ciphertext sits in $in2..$in5 and is xored with $twk0..$twk3.
# The fifth lane is zeroed: it still passes through the helper but
# its result is not stored here.
Lxts_dec6x_four:
	vxor		$out0,$in2,$twk0
	vxor		$out1,$in3,$twk1
	vxor		$out2,$in4,$twk2
	vxor		$out3,$in5,$twk3
	vxor		$out4,$out4,$out4

	bl		_aesp8_xts_dec5x

# On return the helper has also fetched one more input block into
# $in0 and its last compare was $taillen against zero.
	le?vperm	$out0,$out0,$out0,$leperm
	vmr		$twk0,$twk4		# unused tweak
	vmr		$twk1,$twk5
	le?vperm	$out1,$out1,$out1,$leperm
	stvx_u		$out0,$x00,$out		# store output
# prepare the helper-fetched block with the later of the two
# remaining tweaks for the stealing path
	vxor		$out0,$in0,$twk5
	le?vperm	$out2,$out2,$out2,$leperm
	stvx_u		$out1,$x10,$out
	le?vperm	$out3,$out3,$out3,$leperm
	stvx_u		$out2,$x20,$out
	stvx_u		$out3,$x30,$out
	addi		$out,$out,0x40
# non-zero tail length means a partial block still needs stealing
	bne		Lxts_dec6x_steal
	b		Lxts_dec6x_done
3444
3445.align	4
# Tail with three whole blocks pending (more than 0x20 but fewer than
# 0x40 bytes remained). The ciphertext sits in $in3..$in5 and is xored
# with $twk0..$twk2. Lanes four and five are zeroed and their results
# are not stored here.
Lxts_dec6x_three:
	vxor		$out0,$in3,$twk0
	vxor		$out1,$in4,$twk1
	vxor		$out2,$in5,$twk2
	vxor		$out3,$out3,$out3
	vxor		$out4,$out4,$out4

	bl		_aesp8_xts_dec5x

# On return the helper has also fetched one more input block into
# $in0 and its last compare was $taillen against zero.
	le?vperm	$out0,$out0,$out0,$leperm
	vmr		$twk0,$twk3		# unused tweak
	vmr		$twk1,$twk4
	le?vperm	$out1,$out1,$out1,$leperm
	stvx_u		$out0,$x00,$out		# store output
# prepare the helper-fetched block with the later of the two
# remaining tweaks for the stealing path
	vxor		$out0,$in0,$twk4
	le?vperm	$out2,$out2,$out2,$leperm
	stvx_u		$out1,$x10,$out
	stvx_u		$out2,$x20,$out
	addi		$out,$out,0x30
# non-zero tail length means a partial block still needs stealing
	bne		Lxts_dec6x_steal
	b		Lxts_dec6x_done
3467
3468.align	4
# Tail with two whole blocks pending (exactly 0x20 bytes remained).
# The ciphertext sits in $in4 and $in5 and is xored with $twk0 and
# $twk1. The other three lanes are zeroed and their results are not
# stored here.
Lxts_dec6x_two:
	vxor		$out0,$in4,$twk0
	vxor		$out1,$in5,$twk1
	vxor		$out2,$out2,$out2
	vxor		$out3,$out3,$out3
	vxor		$out4,$out4,$out4

	bl		_aesp8_xts_dec5x

# On return the helper has also fetched one more input block into
# $in0 and its last compare was $taillen against zero.
	le?vperm	$out0,$out0,$out0,$leperm
	vmr		$twk0,$twk2		# unused tweak
	vmr		$twk1,$twk3
	le?vperm	$out1,$out1,$out1,$leperm
	stvx_u		$out0,$x00,$out		# store output
# prepare the helper-fetched block with the later of the two
# remaining tweaks for the stealing path
	vxor		$out0,$in0,$twk3
	stvx_u		$out1,$x10,$out
	addi		$out,$out,0x20
# non-zero tail length means a partial block still needs stealing
	bne		Lxts_dec6x_steal
	b		Lxts_dec6x_done
3488
3489.align	4
# Tail with a single whole block pending (fewer than 0x20 bytes
# remained). The ciphertext sits in $in5 and is xored with $twk0.
Lxts_dec6x_one:
	vxor		$out0,$in5,$twk0
	nop
# One-lane round loop: two rounds per pass, with v24 and v25 refilled
# through $key_, which advances by 0x20 per pass. CTR bounds the loop.
# The stealing path branches back here for its final block.
Loop_xts_dec1x:
	vncipher	$out0,$out0,v24
	lvx		v24,$x20,$key_		# round[3]
	addi		$key_,$key_,0x20

	vncipher	$out0,$out0,v25
	lvx		v25,$x10,$key_		# round[4]
	bdnz		Loop_xts_dec1x

# r0 = ($taillen - 1) AND 16, which is 16 when $taillen is zero and 0
# for tail lengths 1 to 16. The scalar work below is interleaved with
# the remaining rounds.
	subi		r0,$taillen,1
	vncipher	$out0,$out0,v24

	andi.		r0,r0,16
# this compare is what the bne at the end of the block consumes
	cmpwi		$taillen,0
	vncipher	$out0,$out0,v25

# step the input pointer back by r0, then fetch the block it addresses
	sub		$inp,$inp,r0
	vncipher	$out0,$out0,v26

	lvx_u		$in0,0,$inp
	vncipher	$out0,$out0,v27

	addi		$key_,$sp,$FRAME+15	# rewind $key_
	vncipher	$out0,$out0,v28
	lvx		v24,$x00,$key_		# re-pre-load round[1]

	vncipher	$out0,$out0,v29
	lvx		v25,$x10,$key_		# re-pre-load round[2]
# fold v31 into the tweak so the last round applies both at once
	 vxor		$twk0,$twk0,v31

	le?vperm	$in0,$in0,$in0,$leperm
	vncipher	$out0,$out0,v30

# reload the loop counter for whichever round loop runs next
	mtctr		$rounds
	vncipherlast	$out0,$out0,$twk0

	vmr		$twk0,$twk1		# unused tweak
	vmr		$twk1,$twk2
	le?vperm	$out0,$out0,$out0,$leperm
	stvx_u		$out0,$x00,$out		# store output
	addi		$out,$out,0x10
# prepare the freshly fetched block with the later tweak for the
# stealing path
	vxor		$out0,$in0,$twk2
# non-zero tail length means a partial block still needs stealing
	bne		Lxts_dec6x_steal
	b		Lxts_dec6x_done
3537
3538.align	4
# No whole blocks were left over. Finish immediately when there is no
# partial tail either. Otherwise fetch the block at $inp, xor it with
# $twk1 and fall through into the stealing sequence.
Lxts_dec6x_zero:
	cmpwi		$taillen,0
	beq		Lxts_dec6x_done

	lvx_u		$in0,0,$inp
	le?vperm	$in0,$in0,$in0,$leperm
	vxor		$out0,$in0,$twk1
# Ciphertext stealing. Entered with $out0 already xored with the later
# tweak. The label doubles as the top of a one-lane round loop (two
# rounds per pass, v24 and v25 refilled through $key_, CTR bounds it).
# After the rounds the result lands in $tmp and is stored at $out. It
# is then merged with the tail bytes, xored with $twk0 and handed to
# Loop_xts_dec1x for one more single-block pass.
Lxts_dec6x_steal:
	vncipher	$out0,$out0,v24
	lvx		v24,$x20,$key_		# round[3]
	addi		$key_,$key_,0x20

	vncipher	$out0,$out0,v25
	lvx		v25,$x10,$key_		# round[4]
	bdnz		Lxts_dec6x_steal

# advance the input pointer by the tail length before fetching the
# block that supplies the tail bytes
	add		$inp,$inp,$taillen
	vncipher	$out0,$out0,v24

	cmpwi		$taillen,0
	vncipher	$out0,$out0,v25

	lvx_u		$in0,0,$inp
	vncipher	$out0,$out0,v26

	lvsr		$inpperm,0,$taillen	# $in5 is no more
	vncipher	$out0,$out0,v27

	addi		$key_,$sp,$FRAME+15	# rewind $key_
	vncipher	$out0,$out0,v28
	lvx		v24,$x00,$key_		# re-pre-load round[1]

	vncipher	$out0,$out0,v29
	lvx		v25,$x10,$key_		# re-pre-load round[2]
# fold v31 into the tweak so the last round applies both at once
	 vxor		$twk1,$twk1,v31

	le?vperm	$in0,$in0,$in0,$leperm
	vncipher	$out0,$out0,v30

# rearrange the fetched bytes according to the tail length
	vperm		$in0,$in0,$in0,$inpperm
	vncipherlast	$tmp,$out0,$twk1

# store the full result; the first $taillen bytes of it are copied
# to the following block by the byte loop below
	le?vperm	$out0,$tmp,$tmp,$leperm
	le?stvx_u	$out0,0,$out
	be?stvx_u	$tmp,0,$out

# build a byte mask by permuting an all-zeros and an all-ones vector
# with $inpperm, select per byte between $in0 and $tmp, then xor the
# merged block with $twk0
	vxor		$out0,$out0,$out0
	vspltisb	$out1,-1
	vperm		$out0,$out0,$out1,$inpperm
	vsel		$out0,$in0,$tmp,$out0
	vxor		$out0,$out0,$twk0

# copy $taillen bytes from the block just stored at $out to the same
# offsets 16 bytes further on; r30 is the walking pointer
	subi		r30,$out,1
	mtctr		$taillen
Loop_xts_dec6x_steal:
	lbzu		r0,1(r30)
	stb		r0,16(r30)
	bdnz		Loop_xts_dec6x_steal

# clear the tail length so the single-block pass ends at
# Lxts_dec6x_done instead of coming back here
	li		$taillen,0
	mtctr		$rounds
	b		Loop_xts_dec1x		# one more time...
3601
3602.align	4
# Common exit. When $ivp is non-zero, $twk0 is xored with $rndkey0
# once more (the tweaks above are formed by xoring with $rndkey0) and
# the result is stored through $ivp.
Lxts_dec6x_done:
	${UCMP}i	$ivp,0
	beq		Lxts_dec6x_ret

	vxor		$tweak,$twk0,$rndkey0
	le?vperm	$tweak,$tweak,$tweak,$leperm
	stvx_u		$tweak,0,$ivp

# Epilogue: restore LR from r11, overwrite the stack copies of the
# round keys, then reload the saved vector and general registers and
# release the frame.
Lxts_dec6x_ret:
	mtlr		r11
	li		r10,`$FRAME+15`
	li		r11,`$FRAME+31`
# eight vector stores of $seven, alternating between the r10 and r11
# offsets, each of which advances by 32
	stvx		$seven,r10,$sp		# wipe copies of round keys
	addi		r10,r10,32
	stvx		$seven,r11,$sp
	addi		r11,r11,32
	stvx		$seven,r10,$sp
	addi		r10,r10,32
	stvx		$seven,r11,$sp
	addi		r11,r11,32
	stvx		$seven,r10,$sp
	addi		r10,r10,32
	stvx		$seven,r11,$sp
	addi		r11,r11,32
	stvx		$seven,r10,$sp
	addi		r10,r10,32
	stvx		$seven,r11,$sp
	addi		r11,r11,32

# SPR 256 is VRSAVE; r10 and r11 keep walking upwards through the
# save area that follows the wiped region
	mtspr		256,$vrsave
	lvx		v20,r10,$sp		# ABI says so
	addi		r10,r10,32
	lvx		v21,r11,$sp
	addi		r11,r11,32
	lvx		v22,r10,$sp
	addi		r10,r10,32
	lvx		v23,r11,$sp
	addi		r11,r11,32
	lvx		v24,r10,$sp
	addi		r10,r10,32
	lvx		v25,r11,$sp
	addi		r11,r11,32
	lvx		v26,r10,$sp
	addi		r10,r10,32
	lvx		v27,r11,$sp
	addi		r11,r11,32
	lvx		v28,r10,$sp
	addi		r10,r10,32
	lvx		v29,r11,$sp
	addi		r11,r11,32
	lvx		v30,r10,$sp
	lvx		v31,r11,$sp
# general registers r26 to r31 sit above the 21 vector-sized slots
	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
	blr
	.long		0
	.byte		0,12,0x04,1,0x80,6,6,0
	.long		0
3666
3667.align	5
# _aesp8_xts_dec5x: run the decrypt rounds on five lanes, $out0..$out4,
# which the callers xor with tweaks beforehand. Besides the cipher
# work, interleaved with it, the routine
#  - steps $inp back by 16 when $taillen is zero and loads $in0 from
#    the resulting address,
#  - leaves the comparison of $taillen with zero as its last compare
#    (callers branch on it after storing their results),
#  - rewinds $key_, re-preloads v24 and v25, and reloads CTR from
#    $rounds before returning.
# $twk0 is replaced by $twk0 xor v31. $in1..$in4 receive the same
# combination for $twk1..$twk4 and serve as last-round operands.
_aesp8_xts_dec5x:
	vncipher	$out0,$out0,v24
	vncipher	$out1,$out1,v24
	vncipher	$out2,$out2,v24
	vncipher	$out3,$out3,v24
	vncipher	$out4,$out4,v24
	lvx		v24,$x20,$key_		# round[3]
	addi		$key_,$key_,0x20

	vncipher	$out0,$out0,v25
	vncipher	$out1,$out1,v25
	vncipher	$out2,$out2,v25
	vncipher	$out3,$out3,v25
	vncipher	$out4,$out4,v25
	lvx		v25,$x10,$key_		# round[4]
# two rounds per pass; the entry label is also the loop top
	bdnz		_aesp8_xts_dec5x

# r0 = ($taillen - 1) AND 16, which is 16 when $taillen is zero and 0
# for tail lengths 1 to 16
	subi		r0,$taillen,1
	vncipher	$out0,$out0,v24
	vncipher	$out1,$out1,v24
	vncipher	$out2,$out2,v24
	vncipher	$out3,$out3,v24
	vncipher	$out4,$out4,v24

	andi.		r0,r0,16
# last compare before returning; callers test it with bne
	cmpwi		$taillen,0
	vncipher	$out0,$out0,v25
	vncipher	$out1,$out1,v25
	vncipher	$out2,$out2,v25
	vncipher	$out3,$out3,v25
	vncipher	$out4,$out4,v25
	 vxor		$twk0,$twk0,v31

# step the input pointer back by r0 ahead of the load into $in0
	sub		$inp,$inp,r0
	vncipher	$out0,$out0,v26
	vncipher	$out1,$out1,v26
	vncipher	$out2,$out2,v26
	vncipher	$out3,$out3,v26
	vncipher	$out4,$out4,v26
	 vxor		$in1,$twk1,v31

	vncipher	$out0,$out0,v27
	lvx_u		$in0,0,$inp
	vncipher	$out1,$out1,v27
	vncipher	$out2,$out2,v27
	vncipher	$out3,$out3,v27
	vncipher	$out4,$out4,v27
	 vxor		$in2,$twk2,v31

	addi		$key_,$sp,$FRAME+15	# rewind $key_
	vncipher	$out0,$out0,v28
	vncipher	$out1,$out1,v28
	vncipher	$out2,$out2,v28
	vncipher	$out3,$out3,v28
	vncipher	$out4,$out4,v28
	lvx		v24,$x00,$key_		# re-pre-load round[1]
	 vxor		$in3,$twk3,v31

	vncipher	$out0,$out0,v29
	le?vperm	$in0,$in0,$in0,$leperm
	vncipher	$out1,$out1,v29
	vncipher	$out2,$out2,v29
	vncipher	$out3,$out3,v29
	vncipher	$out4,$out4,v29
	lvx		v25,$x10,$key_		# re-pre-load round[2]
	 vxor		$in4,$twk4,v31

	vncipher	$out0,$out0,v30
	vncipher	$out1,$out1,v30
	vncipher	$out2,$out2,v30
	vncipher	$out3,$out3,v30
	vncipher	$out4,$out4,v30

# last round: each lane is keyed with its tweak already xored with v31
	vncipherlast	$out0,$out0,$twk0
	vncipherlast	$out1,$out1,$in1
	vncipherlast	$out2,$out2,$in2
	vncipherlast	$out3,$out3,$in3
	vncipherlast	$out4,$out4,$in4
	mtctr		$rounds
	blr
        .long   	0
        .byte   	0,12,0x14,0,0,0,0,0
3750___
3751}}	}}}
3752
# Final pass: walk the generated perlasm text in $code one line at a
# time, specialise it for the target selected by $flavour and print it
# to STDOUT.
#
#  * Every `expr` span is replaced by the value of eval(expr).
#  * Until a line containing "Lconsts:" has been seen, .long/.byte rows
#    that end in a ?tag are re-emitted as plain .byte rows.  For
#    little-endian flavours a ?inv tag xors every byte with 0xf and a
#    ?rev tag reverses the byte order; any other tag leaves the bytes
#    as they are.
#  * le?/be? prefixes and ?-prefixed instructions are resolved for the
#    target endianness; at most one of the substitutions is applied
#    per line.
my $consts=1;			# true while still inside the constants table
my $is_le=($flavour =~ /le$/);	# little-endian flavours end in "le"
foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	# constants table endian-specific conversion
	if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) {
	    # copy the captures right away, later matches would clobber them
	    my ($directive,$values,$conv)=($1,$2,$3);
	    my @bytes=();

	    # convert to endian-agnostic format
	    if ($directive eq "long") {
		# each 32-bit word contributes four bytes, most
		# significant byte first
		foreach my $item (split(/,\s*/,$values)) {
		    my $word = ($item =~ /^0/) ? oct($item) : int($item);
		    push @bytes,($word>>24)&0xff,($word>>16)&0xff,
				($word>>8)&0xff,$word&0xff;
		}
	    } else {
		@bytes = map(/^0/ ? oct($_) : int($_),split(/,\s*/,$values));
	    }

	    # little-endian conversion
	    if ($is_le) {
		if    ($conv =~ /\?inv/)	{ @bytes=map($_^0xf,@bytes); }
		elsif ($conv =~ /\?rev/)	{ @bytes=reverse(@bytes);    }
	    }

	    #emit
	    print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
	    next;
	}
	$consts=0 if (m/Lconsts:/o);	# end of table

	# instructions prefixed with '?' are endian-specific and need
	# to be adjusted accordingly...
	if ($is_le) {			# little-endian
	    s/le\?//o		or
	    s/be\?/#be#/o	or
	    s/\?lvsr/lvsl/o	or
	    s/\?lvsl/lvsr/o	or
	    s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
	    s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
	    s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
	} else {			# big-endian
	    s/le\?/#le#/o	or
	    s/be\?//o		or
	    s/\?([a-z]+)/$1/o;
	}

	print $_,"\n";
}

# close() is where buffered write errors surface, and when STDOUT is a
# pipe it also reports a failing downstream process.  Abort rather than
# let truncated output pass for success.
close STDOUT or die "error closing STDOUT: $!";
3806