#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in
# "absolute" terms is ~2250 cycles per 64-byte block or ~35 cycles
# per byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally; nothing was
# done about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);

sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
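	@ 0xf2 is the low byte of the last K256 word (0xc67178f2),
	@ so a match here means the final 16-round pass just finished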
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}

sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}

$code=<<___;
#ifndef __KERNEL__
# include <openssl/arm_arch.h>
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

.text
#if __ARM_ARCH__<7
.code	32
#else
.syntax unified
# if defined(__thumb2__) && !defined(__APPLE__)
#  define adrl adr
.thumb
# else
.code	32
# endif
#endif

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lsha256_block_data_order
#endif
.align	5

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,sha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
#ifdef __APPLE__
	ldr	r12,[r12]
#endif
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#if __ARM_ARCH__>=7
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);	# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);	# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
.align	4
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16
	adrl	$Ktbl,K256
	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia		$ctx,{$A-$H}
	sub		$Xfer,$Xfer,#64
	ldr		$t1,[sp,#0]
	eor		$t2,$t2,$t2
	eor		$t3,$B,$C
	b		.L_00_48

.align	4
.L_00_48:
___
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	it		eq
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	it		ne
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# if defined(__thumb2__) && !defined(__APPLE__)
#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d
# endif

.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
# ifdef __APPLE__
	sub	$Ktbl,$Ktbl,#256+32
# elif defined(__thumb2__)
	adr	$Ktbl,.LARMv8
	sub	$Ktbl,$Ktbl,#.LARMv8-K256
# else
	adrl	$Ktbl,K256
# endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it		ne
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm   OPENSSL_armcap_P,4,4
.hidden OPENSSL_armcap_P
#endif
___

open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

{   my  %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
}

foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT; # enforce flush