1#! /usr/bin/env perl
2# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15#
16# Permission to use under GPL terms is granted.
17# ====================================================================
18
19# SHA256 block procedure for ARMv4. May 2007.
20
21# Performance is ~2x better than gcc 3.4 generated code and in "abso-
22# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
23# byte [on single-issue Xscale PXA250 core].
24
25# July 2010.
26#
27# Rescheduling for dual-issue pipeline resulted in 22% improvement on
28# Cortex A8 core and ~20 cycles per processed byte.
29
30# February 2011.
31#
32# Profiler-assisted and platform-specific optimization resulted in 16%
33# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
34
35# September 2013.
36#
37# Add NEON implementation. On Cortex A8 it was measured to process one
38# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
39# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
40# code (meaning that latter performs sub-optimally, nothing was done
41# about it).
42
43# May 2014.
44#
45# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
46
# Command line: an optional "flavour" argument followed by the output
# file name.  An argument that looks like a file name ("name.ext") is
# taken to be the output file; if the first argument already looks like
# one, there is no flavour.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # Look for arm-xlate.pl next to this script, then in the shared
    # perlasm directory, and pipe everything printed below through it.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Quote the paths so that directories containing blanks survive the
    # shell, and fail loudly if the translator cannot be started.
    open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
	or die "can't call $xlate: $!";
} elsif (defined($output) && $output ne "") {
    # No translation requested: write the raw perlasm output to the file.
    open STDOUT,">",$output or die "can't open $output: $!";
}
# With no output file given, STDOUT is left untouched.
61
# Register allocation for the integer-only code path.  r0-r3 do double
# duty: they carry the arguments/scratch named $ctx, $inp, $len and $T1,
# and are also used under the temporary names $t0, $t4, $t1 and $t3.
($ctx,$inp,$len,$T1) = map("r$_",(0..3));
($t0,$t4,$t1,$t3)    = ($ctx,$inp,$len,$T1);

# Working variables a..h live in r4-r11; @V is rotated once per round.
@V = map("r$_",(4..11));
($A,$B,$C,$D,$E,$F,$G,$H) = @V;

$t2   = "r12";
$Ktbl = "r14";

# Rotate/shift amounts used by the Sigma0/Sigma1/sigma0/sigma1 code below.
@Sigma0 = ( 2,13,22);
@Sigma1 = ( 6,11,25);
@sigma0 = ( 7,18, 3);
@sigma1 = (17,19,10);
82
# BODY_00_15($i,$a,$b,$c,$d,$e,$f,$g,$h)
#
# Appends the assembly for round $i of the integer-only code path to
# $code.  $a..$h are the register names currently playing the roles of
# the eight working variables; the caller rotates them between rounds.
# Lines starting with "#if"/"# if" inside the heredocs are C preprocessor
# conditionals; those mentioning $i are emitted with the round number
# already interpolated.  Expressions in backticks are evaluated by the
# post-processing loop at the end of this script.
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

# Rounds 0..15 only: finish fetching the input word into $t1 (a single
# byte-reversed load on ARMv7+, byte-by-byte assembly otherwise) and
# start Sigma1(e).  For later rounds BODY_16_XX has already emitted the
# equivalent preparation.
$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
# Common part of every round: accumulate X[i], K256[i], Sigma1(e),
# Ch(e,f,g) and Sigma0(a) into h, store X[i] in the 16-word window on the
# stack, and prepare Maj(a,b,c), whose addition is deferred to the next
# round.  Round 31 additionally compares the low byte of the constant
# just loaded with 0xf2 to detect the end of the K256 table.
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	# The two scratch registers trade roles for the next round: the one
	# holding Maj(a,b,c) becomes "$t2" (added "from the past"), the one
	# holding a^b becomes "$t3" (next round's b^c).
	($t2,$t3)=($t3,$t2);
}
150
# BODY_16_XX($i,$a,$b,$c,$d,$e,$f,$g,$h)
#
# Appends the assembly for a round with $i>=16 to $code.  It first emits
# the message-schedule step: sigma0 of the word already loaded into $t1
# and sigma1 of the word in $t4 (both loaded by the previous round, see
# the commented-out ldr lines) are added to two further words read from
# the 16-word window on the stack, giving X[i] in $t1.  It also starts
# Sigma1(e), then hands over to BODY_00_15 with the same arguments for
# the rest of the round.
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	# $i>=16, so BODY_00_15 skips its own input-loading heredoc.
	&BODY_00_15(@_);
}
175
# Start of the generated assembly: preprocessor setup, the K256 constant
# table (64 words followed by a zero terminator word), and the entry point
# sha256_block_data_order.  In non-kernel builds with __ARM_MAX_ARCH__>=7
# the entry point reads OPENSSL_armcap_P and branches to .LARMv8 or .LNEON
# when the corresponding capability bit is set; otherwise it falls through
# to the integer-only implementation, which keeps ctx, inp and the
# end-of-input pointer above a 16-word X[] window on the stack.
$code=<<___;
#ifndef __KERNEL__
# include <openssl/arm_arch.h>
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those
@ instructions are manually-encoded. (See unsha256.)
.arch  armv7-a

.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code   32
#endif

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lsha256_block_data_order
#endif
.align	5

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
#ifdef	__APPLE__
	ldr	r12,[r12]
#endif
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
# Rounds 0..15 are emitted once.  Rounds 16..31 form the body of the
# .Lrounds_16_xx loop, which the generated code repeats until round 31
# sees the last K256 constant.  @V is rotated after every round so that
# each register takes the role of the next working variable.
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
# After the last round: add the working variables to the state stored in
# ctx, loop back to .Loop while input remains, then release the stack
# frame and return.
$code.=<<___;
#if __ARM_ARCH__>=7
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
303######################################################################
304# NEON stuff
305#
306{{{
# Names used by the NEON code path: @X holds the four q registers with
# the current message words (rotated by Xupdate/Xpreload), $T0..$T5 are
# vector temporaries, $Xfer is the register used to address the transfer
# area on the stack, and $j counts the scalar rounds emitted so far.
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

# Dlo("qN") / Dhi("qN") return the name of the d register overlaying the
# low ("d2N") or high ("d2N+1") half of qN, or "" if the argument does
# not contain a q register name.  They are declared without a prototype:
# both take one argument, and the former empty prototype made any call
# written without the "&" sigil a compile-time error.
sub Dlo { my $reg=shift; return $reg=~m|q([1]?[0-9])| ? "d".($1*2)   : ""; }
sub Dhi { my $reg=shift; return $reg=~m|q([1]?[0-9])| ? "d".($1*2+1) : ""; }
314
# Catch-all for calls to undefined subs such as &vext_8(...) or &add(...):
# the sub name becomes the instruction mnemonic and the arguments become
# its operands, appended to $code as one line of assembly.
sub AUTOLOAD()
{
    # Mnemonic: called name without the package qualifier, with the
    # first underscore turned into a dot (vext_8 -> vext.8).
    (my $mnemonic = $AUTOLOAD) =~ s/.*:://;
    $mnemonic =~ s/_/\./;

    # A purely numeric last operand is an immediate and gets a "#".
    my @operands = @_;
    my $last = pop(@operands);
    if ($last*1 eq $last) { $last = "#$last"; }
    push(@operands,$last);

    $code .= "\t" . $mnemonic . "\t" . join(',',@operands) . "\n";
}
321
# Xupdate(\&body)
#
# Emits one quarter of the NEON main loop.  $body is called four times to
# obtain the instruction strings for four scalar rounds; those strings
# are eval'ed (which appends the scalar instructions to $code via
# AUTOLOAD) a few at a time in between the NEON instructions below, so
# that the two instruction streams end up interleaved.  The NEON stream
# updates @X[0] with the next four message words, adds the constants
# loaded from $Ktbl and stores the sums at $Xfer.  On return @X has been
# rotated by one position.
#
# The lexicals $a..$h are declared here because the eval'ed strings
# assign to and read them.
sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 # flush all but the last two scalar instructions before the store
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}
422
# Xpreload(\&body)
#
# Counterpart of Xupdate for the last 16 rounds of a block: no message
# schedule update is needed, so the NEON stream only byte-swaps the next
# input block's words in @X[0], adds the constants loaded from $Ktbl and
# stores the sums at $Xfer.  The scalar instruction strings obtained from
# four calls of $body are eval'ed four at a time in between.  @X is
# rotated by one position on return.
#
# All evals must stay directly in this sub's scope: the eval'ed strings
# use the lexicals $a..$h declared below.
sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = map { &$body } (1..4);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	eval(shift(@insns)) for (1..4);
	&vld1_32	("{$T0}","[$Ktbl,:128]!");

	eval(shift(@insns)) for (1..4);
	&vrev32_8	(@X[0],@X[0]);

	eval(shift(@insns)) for (1..4);
	&vadd_i32	($T0,$T0,@X[0]);

	# everything that is left goes in front of the store
	while (@insns) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	# "rotate" X[] so that the next call works on the next register
	push(@X,shift(@X));
}
449
# Returns the scalar instructions of one round for the NEON code path as
# a list of Perl source strings.  Each string is eval'ed later, inside
# Xupdate/Xpreload, where the calls it contains append one instruction
# (occasionally two) to $code.  The first string binds $a..$h to the
# current register assignment in @V; the last one advances the round
# counter $j, rotates @V and swaps the roles of $t2 and $t3.
sub body_00_15 () {
	my @round = (
	    '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	    '&add	($h,$h,$t1)',				# h+=X[i]+K[i]
	    '&eor	($t1,$f,$g)',
	    '&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	    '&add	($a,$a,$t2)',				# h+=Maj(a,b,c) from the past
	    '&and	($t1,$t1,$e)',
	    '&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	    '&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	    '&eor	($t1,$t1,$g)',				# Ch(e,f,g)
	    '&add	($h,$h,$t2,"ror#$Sigma1[0]")',		# h+=Sigma1(e)
	    '&eor	($t2,$a,$b)',				# a^b, b^c in next round
	    '&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	    '&add	($h,$h,$t1)',				# h+=Ch(e,f,g)
	    # reload $t1: next X[i]+K[i] from the stack, or a word from the
	    # K256 table / the stack slot at sp+64 at the round counts tested
	    join('',
		'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);',
		'&ldr	($t1,"[$Ktbl]")				if ($j==15);',
		'&ldr	($t1,"[sp,#64]")			if ($j==31)'),
	    '&and	($t3,$t3,$t2)',				# (b^c)&=(a^b)
	    '&add	($d,$d,$h)',				# d+=h
	    '&add	($h,$h,$t0,"ror#$Sigma0[0]");'.		# h+=Sigma0(a)
	    '&eor	($t3,$t3,$b)',				# Maj(a,b,c)
	    '$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);',
	);
	return @round;
}
475
# NEON entry point (also reachable as .LNEON from the dispatcher in
# sha256_block_data_order).  The prologue carves a 16-byte aligned area
# out of the stack: the first 64 bytes receive X[i]+K[i] sums for the
# scalar rounds, and ctx, inp, the end-of-input pointer and the original
# sp are saved at offsets 64, 68, 72 and 76.  The first input block is
# loaded, byte-swapped and pre-added to the first 16 constants before
# entering the .L_00_48 loop.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
.align	5
.skip	16
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16
	adr	$Ktbl,K256
	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia		$ctx,{$A-$H}
	sub		$Xfer,$Xfer,#64
	ldr		$t1,[sp,#0]
	eor		$t2,$t2,$t2
	eor		$t3,$B,$C
	b		.L_00_48

.align	4
.L_00_48:
___
	# Loop body: four Xupdate expansions, i.e. 16 scalar rounds
	# interleaved with the computation of the next 16 message words.
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
# The loop repeats until the word fetched from the constant table by the
# scalar code is the zero terminator that follows K256.  Then the next
# input block is loaded; at the end of the input the pointer is stepped
# back by 64 so the load re-reads the block just processed instead of
# reading past the end.
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	it		eq
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	it		ne
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___
	# Last 16 rounds of the block, interleaved with the preparation of
	# X[i]+K[i] for the first 16 rounds of the next block.
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
# Add the working variables to the state in ctx (its address was loaded
# into $t1 from the stack slot at sp+64 by the last scalar round), then
# either restart the loop for the next block or restore the original sp
# and return.
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
597}}}
598######################################################################
599# ARMv8 stuff
600#
601{{{
# Register names for the ARMv8 code path: the hash state in two q
# registers plus a scratch copy, four message registers, two registers
# alternating as holders of X+K sums, and two registers saving the state
# at the start of each block.  $Ktbl is shadowed here with r3, the
# register in which the dispatcher in sha256_block_data_order leaves the
# address of .Lsha256_block_data_order before branching to .LARMv8.
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

# The sha256* mnemonics emitted below are not passed to the assembler as
# such: the post-processing loop at the end of this script replaces them,
# via unsha256, with INST(...) byte sequences using the macro defined
# here.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# if defined(__thumb2__)
#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d
# endif

.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
	sub	$Ktbl,$Ktbl,#256+32
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	b	.Loop_v8

.align	4
.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
# Twelve groups of four rounds that also extend the message schedule.
# After each group $W0/$W1 swap and @MSG rotates, so the same template
# addresses the next registers.
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
# The last four groups need no further schedule updates.  The state saved
# at the top of the loop is added back, and the loop repeats while the
# condition set by the earlier "teq $inp,$len" is "ne".
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it		ne
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
686}}}
# Trailer: an identification string and, for non-kernel builds with
# __ARM_MAX_ARCH__>=7, the declaration of OPENSSL_armcap_P, the word the
# dispatcher in sha256_block_data_order tests for NEON/SHA256 support.
$code.=<<___;
.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm   OPENSSL_armcap_P,4,4
.hidden OPENSSL_armcap_P
#endif
___
695
# Copy this script's leading comment block (licence and change history)
# to the output as assembler comments: "#" at the start of a line becomes
# "@", the "#!" line is skipped, and copying stops at the first line that
# is neither a comment nor empty.  The header is cosmetic, so an
# unreadable script file only produces a warning.
if (open(my $self,"<",$0)) {
	while (my $line=<$self>) {
		next if ($line =~ /^#!/);
		last if (!($line =~ s/^#/@/) and !($line =~ /^$/));
		print $line;
	}
	close $self;
} else {
	warn "can't read $0 to copy the licence header: $!\n";
}
703
{   # Base encodings of the four SHA256 instructions with all register
    # fields zero.
    my  %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    # unsha256($mnemonic,$arg)
    #
    # Turns one SHA256 instruction, given as mnemonic and operand string
    # ("qD,qM" or "qD,qN,qM"), into an INST(b0,b1,b2,b3) line listing the
    # four bytes of the encoded instruction, least significant first,
    # followed by the original text as a comment.
    #
    # If the mnemonic is unknown or the operands do not parse, the
    # instruction text is returned unchanged instead of an empty string
    # or an encoding built from a missing opcode, so that the problem
    # shows up when the output is assembled rather than as a silently
    # dropped or corrupted instruction.
    sub unsha256 {
	my ($mnemonic,$arg)=@_;
	my $base = $opcode{$mnemonic};

	if (defined($base) &&
	    $arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    # The middle operand is absent in the two-operand form and
	    # then contributes zero bits.
	    my ($r1,$r2,$r3) = ($1, defined($2) ? $2 : 0, $3);

	    # Each register number is split into its low three bits and
	    # its fourth bit, which occupy separate fields of the word.
	    my $word = $base|(($r1&7)<<13)|(($r1&8)<<19)
			    |(($r2&7)<<17)|(($r2&8)<<4)
			    |(($r3&7)<<1) |(($r3&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    return sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}

	return "$mnemonic\t$arg";
    }
}
725
# Post-process the accumulated assembly line by line and print it:
#  - expressions in backticks are evaluated and replaced by their value;
#  - SHA256 instructions are replaced by their INST(...) encoding;
#  - "ret" becomes "bx lr"; any other "bx lr" is emitted as a literal
#    instruction word.
foreach my $line (split($/,$code)) {

	$line =~ s/\`([^\`]*)\`/eval $1/geo;

	$line =~ s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	$line =~ s/\bret\b/bx	lr/go		or
	$line =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $line,"\n";
}

# Closing flushes the output and, when STDOUT is a pipe to arm-xlate.pl,
# waits for the translator.  A write error or a failing translator must
# fail the build instead of leaving a truncated file behind.
close STDOUT or die "error closing STDOUT: $! (exit status $?)";
739