#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in
# "absolute" terms is ~2250 cycles per 64-byte block or ~35 cycles
# per byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but there it's 50% faster than its
# integer-only code (meaning that the latter performs sub-optimally;
# nothing was done about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}
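# (The usual perlasm calling convention: the first argument selects a
# flavour such as "linux32" or "ios32", in which case output is piped
# through arm-xlate.pl, which adapts the unified syntax below to the
# target assembler; otherwise a bare output file name is expected.)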

$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
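
# The triples above are the rotate/shift amounts of the SHA-256
# functions, e.g. Sigma1(x)=ROR(x,6)^ROR(x,11)^ROR(x,25); the third
# element of @sigma0/@sigma1 is a plain right shift, not a rotation.
# The integer code below evaluates them with *relative* rotations,
# using ROR(x,b)=ROR(ROR(x,a),b-a), so the final ror#$Sigma1[0] (or
# ror#$Sigma0[0]) is folded into the accumulating "add". A minimal
# reference model of the function itself, not used by the generator:
sub ROR32      { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n)))&0xffffffff; }
sub Sigma1_ref { my ($x)=@_;    ROR32($x,6)^ROR32($x,11)^ROR32($x,25); }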

sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}
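
# A note on the "from the past" comments: Maj(a,b,c) is computed as
# ((a^b)&(b^c))^b, where this round's a^b is recycled as next round's
# b^c (the working registers rotate by one position each round). Its
# addition into h is therefore deferred to the following round (and to
# the epilogue for the very last one), which shortens the dependency
# chain; $t2 and $t3 are swapped every round to carry the two halves
# of this recurrence.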

sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}
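
# BODY_16_XX prepends the message-schedule update to a regular round,
# i.e. the canonical W[i]=W[i-16]+sigma0(W[i-15])+W[i-7]+sigma1(W[i-2])
# with all indices reduced modulo the 16-word circular buffer kept on
# the stack: X[i%16] += sigma0(X[(i+1)%16]) + X[(i+9)%16] +
# sigma1(X[(i+14)%16]).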

$code=<<___;
#ifndef __KERNEL__
# include <openssl/arm_arch.h>
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

.text
#if __ARM_ARCH__<7
.code	32
#else
.syntax unified
# if defined(__thumb2__) && !defined(__APPLE__)
#  define adrl adr
.thumb
# else
.code   32
# endif
#endif

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lsha256_block_data_order
#endif
.align	5

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,sha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
#ifdef	__APPLE__
	ldr	r12,[r12]
#endif
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#if __ARM_ARCH__>=7
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
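
# Courtesy of AUTOLOAD, a call like &vshr_u32($T2,$T0,$sigma0[0])
# simply appends "vshr.u32	q10,q8,#7" to $code: underscores in the
# method name become dots and a numeric last argument gets the '#'
# prefix.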

sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}
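
# Each vshr.u32/vsli.32 pair above emulates a 32-bit vector rotate:
# vshr.u32 T,X,#n yields X>>n, and vsli.32 T,X,#32-n shifts X left by
# 32-n and inserts it above the surviving low bits, producing ROR(X,n).
# The plain-shift sigma terms naturally need no vsli companion.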

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}
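
# Unlike BODY_00_15 above, body_00_15 returns the round as a list of
# Perl snippet strings; Xupdate and Xpreload eval() them a few at a
# time between NEON instructions, interleaving four scalar rounds with
# the vectorized message-schedule update to hide latencies on in-order
# cores.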

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
.align	4
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16
	adrl	$Ktbl,K256
	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia		$ctx,{$A-$H}
	sub		$Xfer,$Xfer,#64
	ldr		$t1,[sp,#0]
	eor		$t2,$t2,$t2
	eor		$t3,$B,$C
	b		.L_00_48

.align	4
.L_00_48:
___
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	it		eq
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	it		ne
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

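# The ARMv8 path uses the Crypto Extensions: sha256h/sha256h2 each
# perform four rounds on one half of the state, while sha256su0 and
# sha256su1 advance the message schedule four words at a time. Since
# older assemblers don't recognize these mnemonics, they are emitted
# as raw bytes via INST() (see unsha256 at the bottom of this file).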
$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# if defined(__thumb2__) && !defined(__APPLE__)
#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d
# endif

.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
# ifdef	__APPLE__
	sub	$Ktbl,$Ktbl,#256+32
# elif	defined(__thumb2__)
	adr	$Ktbl,.LARMv8
	sub	$Ktbl,$Ktbl,#.LARMv8-K256
# else
	adrl	$Ktbl,K256
# endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it		ne
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm   OPENSSL_armcap_P,4,4
.hidden OPENSSL_armcap_P
#endif
___

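# Copy the commentary from the top of this file into the output,
# converting '#' to the '@' assembler comment character and stopping
# at the first line of actual code.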
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

{   my  %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # ARMv7 instructions are always encoded little-endian, hence
	    # the byte order. The correct solution is to use the .inst
	    # directive, but older assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
}
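
# For instance, "sha256h q0,q1,q12" (base opcode 0xf3000c40 with d=0,
# n=1, m=12) encodes as 0xf3020c68 and is emitted byte-wise as
# INST(0x68,0x0c,0x02,0xf3).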

foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT; # enforce flush
