#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2014
#
# ChaCha20 for ARMv4.
#
# Performance in cycles per byte out of a large buffer.
#
#			IALU/gcc-4.4	1xNEON		3xNEON+1xIALU
#
# Cortex-A5		19.3(*)/+95%	21.8		14.1
# Cortex-A8		10.5(*)/+160%	13.9		6.35
# Cortex-A9		12.9(**)/+110%	14.3		6.50
# Cortex-A15		11.0/+40%	16.0		5.00
# Snapdragon S4		11.5/+125%	13.6		4.90
#
# (*)	most "favourable" result for aligned data on a little-endian
#	processor; the result for misaligned data is 10-15% lower;
# (**)	this result is a trade-off: it can be improved by 20%,
#	but then the Snapdragon S4 and Cortex-A8 results get
#	20-25% worse;

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x"));
my @t=map("r$_",(8..11));

sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my $odd = $d0&1;
my ($xc,$xc_) = (@t[0..1]);
my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]);
my @ret;

	# Consider the order in which the variables are addressed by
	# their index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a' and 'b' are permanently allocated in registers, @x[0..7],
	# while the 'c's and a pair of 'd's are maintained in memory. If
	# you observe the 'c' column, you'll notice that the pair of 'c's
	# is invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see a
	# bunch of 'c' stores and loads in the middle, but none at the
	# beginning or end. If you observe the 'd' column, you'll notice
	# that 15 and 13 are reused in the next pair of rounds. This is
	# why these two are chosen for offloading to memory, to make the
	# loads count more.
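	# For orientation, each column/diagonal processed below is the
	# standard ChaCha quarter-round; a pseudocode sketch, where
	# rol(x,n) is rotate-left, which ARM expresses as ror #(32-n):
	#
	#	a += b; d ^= a; d = rol(d,16);
	#	c += d; b ^= c; b = rol(b,12);
	#	a += b; d ^= a; d = rol(d, 8);
	#	c += d; b ^= c; b = rol(b, 7);
	#
	# hence the ror#16/ror#20/ror#24/ror#25 amounts below.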
	push @ret,(
	"&add (@x[$a0],@x[$a0],@x[$b0])",
	"&mov ($xd,$xd,'ror#16')",
	"&add (@x[$a1],@x[$a1],@x[$b1])",
	"&mov ($xd_,$xd_,'ror#16')",
	"&eor ($xd,$xd,@x[$a0],'ror#16')",
	"&eor ($xd_,$xd_,@x[$a1],'ror#16')",

	"&add ($xc,$xc,$xd)",
	"&mov (@x[$b0],@x[$b0],'ror#20')",
	"&add ($xc_,$xc_,$xd_)",
	"&mov (@x[$b1],@x[$b1],'ror#20')",
	"&eor (@x[$b0],@x[$b0],$xc,'ror#20')",
	"&eor (@x[$b1],@x[$b1],$xc_,'ror#20')",

	"&add (@x[$a0],@x[$a0],@x[$b0])",
	"&mov ($xd,$xd,'ror#24')",
	"&add (@x[$a1],@x[$a1],@x[$b1])",
	"&mov ($xd_,$xd_,'ror#24')",
	"&eor ($xd,$xd,@x[$a0],'ror#24')",
	"&eor ($xd_,$xd_,@x[$a1],'ror#24')",

	"&add ($xc,$xc,$xd)",
	"&mov (@x[$b0],@x[$b0],'ror#25')" );
	push @ret,(
	"&str ($xd,'[sp,#4*(16+$d0)]')",
	"&ldr ($xd,'[sp,#4*(16+$d2)]')" ) if ($odd);
	push @ret,(
	"&add ($xc_,$xc_,$xd_)",
	"&mov (@x[$b1],@x[$b1],'ror#25')" );
	push @ret,(
	"&str ($xd_,'[sp,#4*(16+$d1)]')",
	"&ldr ($xd_,'[sp,#4*(16+$d3)]')" ) if (!$odd);
	push @ret,(
	"&eor (@x[$b0],@x[$b0],$xc,'ror#25')",
	"&eor (@x[$b1],@x[$b1],$xc_,'ror#25')" );

	$xd=@x[$d2] if (!$odd);
	$xd_=@x[$d3] if ($odd);
	push @ret,(
	"&str ($xc,'[sp,#4*(16+$c0)]')",
	"&ldr ($xc,'[sp,#4*(16+$c2)]')",
	"&add (@x[$a2],@x[$a2],@x[$b2])",
	"&mov ($xd,$xd,'ror#16')",
	"&str ($xc_,'[sp,#4*(16+$c1)]')",
	"&ldr ($xc_,'[sp,#4*(16+$c3)]')",
	"&add (@x[$a3],@x[$a3],@x[$b3])",
	"&mov ($xd_,$xd_,'ror#16')",
	"&eor ($xd,$xd,@x[$a2],'ror#16')",
	"&eor ($xd_,$xd_,@x[$a3],'ror#16')",

	"&add ($xc,$xc,$xd)",
	"&mov (@x[$b2],@x[$b2],'ror#20')",
	"&add ($xc_,$xc_,$xd_)",
	"&mov (@x[$b3],@x[$b3],'ror#20')",
	"&eor (@x[$b2],@x[$b2],$xc,'ror#20')",
	"&eor (@x[$b3],@x[$b3],$xc_,'ror#20')",

	"&add (@x[$a2],@x[$a2],@x[$b2])",
	"&mov ($xd,$xd,'ror#24')",
	"&add (@x[$a3],@x[$a3],@x[$b3])",
	"&mov ($xd_,$xd_,'ror#24')",
	"&eor ($xd,$xd,@x[$a2],'ror#24')",
	"&eor ($xd_,$xd_,@x[$a3],'ror#24')",

	"&add ($xc,$xc,$xd)",
	"&mov (@x[$b2],@x[$b2],'ror#25')",
	"&add ($xc_,$xc_,$xd_)",
	"&mov (@x[$b3],@x[$b3],'ror#25')",
	"&eor (@x[$b2],@x[$b2],$xc,'ror#25')",
	"&eor (@x[$b3],@x[$b3],$xc_,'ror#25')" );

	@ret;
}

$code.=<<___;
#include <openssl/arm_arch.h>

.text
#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

#if defined(__thumb2__) || defined(__clang__)
#define ldrhsb	ldrbhs
#endif

.align	5
.Lsigma:
.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	@ endian-neutral
.Lone:
.long	1,0,0,0
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.LChaCha20_ctr32
#else
.word	-1
#endif

.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
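@ ChaCha20_ctr32(out, inp, len, key, counter)
@	r0=out, r1=inp, r2=len, r3=key, [sp,#0]=pointer to counter+nonce
@ (argument layout noted for orientation; it is inferred from the
@ prologue below, which saves r0-r2 and pulls the counter pointer
@ from the caller's stack)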
ChaCha20_ctr32:
.LChaCha20_ctr32:
	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb	sp!,{r0-r2,r4-r11,lr}
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r14,pc,#16		@ ChaCha20_ctr32
#else
	adr	r14,.LChaCha20_ctr32
#endif
	cmp	r2,#0			@ len==0?
#ifdef	__thumb2__
	itt	eq
#endif
	addeq	sp,sp,#4*3
	beq	.Lno_data
#if __ARM_MAX_ARCH__>=7
	cmp	r2,#192			@ test len
	bls	.Lshort
	ldr	r4,[r14,#-32]
	ldr	r4,[r14,r4]
# ifdef	__APPLE__
	ldr	r4,[r4]
# endif
	tst	r4,#ARMV7_NEON
	bne	.LChaCha20_neon
.Lshort:
#endif
	ldmia	r12,{r4-r7}		@ load counter and nonce
	sub	sp,sp,#4*(16)		@ off-load area
	sub	r14,r14,#64		@ .Lsigma
	stmdb	sp!,{r4-r7}		@ copy counter and nonce
	ldmia	r3,{r4-r11}		@ load key
	ldmia	r14,{r0-r3}		@ load sigma
	stmdb	sp!,{r4-r11}		@ copy key
	stmdb	sp!,{r0-r3}		@ copy sigma
	str	r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str	r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
	b	.Loop_outer_enter

.align	4
.Loop_outer:
	ldmia	sp,{r0-r9}		@ load key material
	str	@t[3],[sp,#4*(32+2)]	@ save len
	str	r12, [sp,#4*(32+1)]	@ save inp
	str	r14, [sp,#4*(32+0)]	@ save out
.Loop_outer_enter:
	ldr	@t[3], [sp,#4*(15)]
	ldr	@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	ldr	@t[2], [sp,#4*(13)]
	ldr	@x[14],[sp,#4*(14)]
	str	@t[3], [sp,#4*(16+15)]
	mov	@t[3],#10
	b	.Loop

.align	4
.Loop:
	subs	@t[3],@t[3],#1
___
	foreach (&ROUND(0, 4, 8,12)) { eval; }
	foreach (&ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	bne	.Loop

	ldr	@t[3],[sp,#4*(32+2)]	@ load len

	str	@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
	str	@t[1], [sp,#4*(16+9)]
	str	@x[12],[sp,#4*(16+12)]
	str	@t[2], [sp,#4*(16+13)]
	str	@x[14],[sp,#4*(16+14)]

	@ at this point we have first half of 512-bit result in
	@ @x[0-7] and second half at sp+4*(16+8)

	cmp	@t[3],#64		@ done yet?
#ifdef	__thumb2__
	itete	lo
#endif
	addlo	r12,sp,#4*(0)		@ shortcut or ...
	ldrhs	r12,[sp,#4*(32+1)]	@ ... load inp
	addlo	r14,sp,#4*(0)		@ shortcut or ...
	ldrhs	r14,[sp,#4*(32+0)]	@ ... load out

	ldr	@t[0],[sp,#4*(0)]	@ load key material
	ldr	@t[1],[sp,#4*(1)]

#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
# if __ARM_ARCH__<7
	orr	@t[2],r12,r14
	tst	@t[2],#3		@ are input and output aligned?
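	@ (if either pointer is unaligned, word-sized ld/st can't be
	@ used on pre-ARMv7 cores, so branch to the byte-wise
	@ .Lunaligned path below)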
	ldr	@t[2],[sp,#4*(2)]
	bne	.Lunaligned
	cmp	@t[3],#64		@ restore flags
# else
	ldr	@t[2],[sp,#4*(2)]
# endif
	ldr	@t[3],[sp,#4*(3)]

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]

	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]	@ xor with input
	eorhs	@x[1],@x[1],@t[1]
	add	@t[0],sp,#4*(4)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[1],[r14,#-12]
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
	add	@t[0],sp,#4*(8)
	str	@x[4],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[6],@x[6],@t[2]
	eorhs	@x[7],@x[7],@t[3]
	str	@x[5],[r14,#-12]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[6],[r14,#-8]
	add	@x[0],sp,#4*(16+8)
	str	@x[7],[r14,#-4]

	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
# ifdef	__thumb2__
	itt	hi
# endif
	strhi	@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
	strhi	@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]
	eorhs	@x[1],@x[1],@t[1]
	add	@t[0],sp,#4*(12)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	str	@x[1],[r14,#-12]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	itt	hi
# endif
	addhi	@t[0],@t[0],#1		@ next counter value
	strhi	@t[0],[sp,#4*(12)]	@ save next counter value
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	it	ne
# endif
	ldrne	@t[0],[sp,#4*(32+2)]	@ re-load len
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[6],@x[6],@t[2]
	eorhs	@x[7],@x[7],@t[3]
	str	@x[4],[r14],#16		@ store output
	str	@x[5],[r14,#-12]
# ifdef	__thumb2__
	it	hs
# endif
	subhs	@t[3],@t[0],#64		@ len-=64
	str	@x[6],[r14,#-8]
	str	@x[7],[r14,#-4]
	bhi	.Loop_outer

	beq	.Ldone
# if __ARM_ARCH__<7
	b	.Ltail

.align	4
.Lunaligned:				@ unaligned endian-neutral path
	cmp	@t[3],#64		@ restore flags
# endif
#endif
#if __ARM_ARCH__<7
	ldr	@t[3],[sp,#4*(3)]
___
for ($i=0;$i<16;$i+=4) {
my $j=$i&0x7;

$code.=<<___	if ($i==4);
	add	@x[0],sp,#4*(16+8)
___
$code.=<<___	if ($i==8);
	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half
# ifdef	__thumb2__
	itt	hi
# endif
	strhi	@t[2],[sp,#4*(16+10)]	@ copy "@x[10]"
	strhi	@t[3],[sp,#4*(16+11)]	@ copy "@x[11]"
___
$code.=<<___;
	add	@x[$j+0],@x[$j+0],@t[0]	@ accumulate key material
___
$code.=<<___	if ($i==12);
# ifdef	__thumb2__
	itt	hi
# endif
	addhi	@t[0],@t[0],#1		@ next counter value
	strhi	@t[0],[sp,#4*(12)]	@ save next counter value
___
$code.=<<___;
	add	@x[$j+1],@x[$j+1],@t[1]
	add	@x[$j+2],@x[$j+2],@t[2]
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[0],@t[0],@t[0]	@ zero or ...
	ldrhsb	@t[0],[r12],#16		@ ... load input
	eorlo	@t[1],@t[1],@t[1]
	ldrhsb	@t[1],[r12,#-12]

	add	@x[$j+3],@x[$j+3],@t[3]
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[2],@t[2],@t[2]
	ldrhsb	@t[2],[r12,#-8]
	eorlo	@t[3],@t[3],@t[3]
	ldrhsb	@t[3],[r12,#-4]

	eor	@x[$j+0],@t[0],@x[$j+0]	@ xor with input (or zero)
	eor	@x[$j+1],@t[1],@x[$j+1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-15]	@ load more input
	ldrhsb	@t[1],[r12,#-11]
	eor	@x[$j+2],@t[2],@x[$j+2]
	strb	@x[$j+0],[r14],#16	@ store output
	eor	@x[$j+3],@t[3],@x[$j+3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-7]
	ldrhsb	@t[3],[r12,#-3]
	strb	@x[$j+1],[r14,#-12]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	strb	@x[$j+2],[r14,#-8]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-14]	@ load more input
	ldrhsb	@t[1],[r12,#-10]
	strb	@x[$j+3],[r14,#-4]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	strb	@x[$j+0],[r14,#-15]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-6]
	ldrhsb	@t[3],[r12,#-2]
	strb	@x[$j+1],[r14,#-11]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	strb	@x[$j+2],[r14,#-7]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-13]	@ load more input
	ldrhsb	@t[1],[r12,#-9]
	strb	@x[$j+3],[r14,#-3]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	strb	@x[$j+0],[r14,#-14]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-5]
	ldrhsb	@t[3],[r12,#-1]
	strb	@x[$j+1],[r14,#-10]
	strb	@x[$j+2],[r14,#-6]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	strb	@x[$j+3],[r14,#-2]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
	strb	@x[$j+0],[r14,#-13]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	strb	@x[$j+1],[r14,#-9]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
	strb	@x[$j+2],[r14,#-5]
	strb	@x[$j+3],[r14,#-1]
___
$code.=<<___	if ($i<12);
	add	@t[0],sp,#4*(4+$i)
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
___
}
$code.=<<___;
# ifdef	__thumb2__
	it	ne
# endif
	ldrne	@t[0],[sp,#4*(32+2)]	@ re-load len
# ifdef	__thumb2__
	it	hs
# endif
	subhs	@t[3],@t[0],#64		@ len-=64
	bhi	.Loop_outer

	beq	.Ldone
#endif

.Ltail:
	ldr	r12,[sp,#4*(32+1)]	@ load inp
	add	@t[1],sp,#4*(0)
	ldr	r14,[sp,#4*(32+0)]	@ load out

.Loop_tail:
	ldrb	@t[2],[@t[1]],#1	@ read buffer on stack
	ldrb	@t[3],[r12],#1		@ read input
	subs	@t[0],@t[0],#1
	eor	@t[3],@t[3],@t[2]
	strb	@t[3],[r14],#1		@ store output
	bne	.Loop_tail

.Ldone:
	add	sp,sp,#4*(32+3)
.Lno_data:
	ldmia	sp!,{r4-r11,pc}
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___

{{{
my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) =
    map("q$_",(0..15));

sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

	(
	"&vadd_i32	($a,$a,$b)",
	"&veor		($d,$d,$a)",
	"&vrev32_16	($d,$d)",	# vrot ($d,16)

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,20)",
	"&vsli_32	($b,$t,12)",

	"&vadd_i32	($a,$a,$b)",
	"&veor		($t,$d,$a)",
	"&vshr_u32	($d,$t,24)",
	"&vsli_32	($d,$t,8)",

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,25)",
	"&vsli_32	($b,$t,7)",

	"&vext_8	($c,$c,$c,8)",
	"&vext_8	($b,$b,$b,$odd?12:4)",
	"&vext_8	($d,$d,$d,$odd?4:12)"
	);
}
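
# A note on NEONROUND (informal): rol(x,n) is synthesized as
# vshr.u32 dst,x,#(32-n) followed by vsli.32 dst,x,#n, while the
# rotate by 16 is done more cheaply with vrev32.16, which swaps the
# 16-bit halves of each 32-bit lane. The trailing vext.8 lane
# rotations shift the b/c/d rows by one/two/three 32-bit lanes so
# that the same column code computes the diagonal round; $odd
# selects the forward or the inverse lane rotation.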
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb	sp!,{r0-r2,r4-r11,lr}
.LChaCha20_neon:
	adr	r14,.Lsigma
	vstmdb	sp!,{d8-d15}		@ ABI spec says so
	stmdb	sp!,{r0-r3}

	vld1.32	{$b0-$c0},[r3]		@ load key
	ldmia	r3,{r4-r11}		@ load key

	sub	sp,sp,#4*(16+16)
	vld1.32	{$d0},[r12]		@ load counter and nonce
	add	r12,sp,#4*8
	ldmia	r14,{r0-r3}		@ load sigma
	vld1.32	{$a0},[r14]!		@ load sigma
	vld1.32	{$t0},[r14]		@ one
	vst1.32	{$c0-$d0},[r12]		@ copy 1/2key|counter|nonce
	vst1.32	{$a0-$b0},[sp]		@ copy sigma|1/2key

	str	r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str	r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
	vshl.i32	$t1#lo,$t0#lo,#1	@ two
	vstr	$t0#lo,[sp,#4*(16+0)]
	vshl.i32	$t2#lo,$t0#lo,#2	@ four
	vstr	$t1#lo,[sp,#4*(16+2)]
	vmov	$a1,$a0
	vstr	$t2#lo,[sp,#4*(16+4)]
	vmov	$a2,$a0
	vmov	$b1,$b0
	vmov	$b2,$b0
	b	.Loop_neon_enter

.align	4
.Loop_neon_outer:
	ldmia	sp,{r0-r9}		@ load key material
	cmp	@t[3],#64*2		@ if len<=64*2
	bls	.Lbreak_neon		@ switch to integer-only
	vmov	$a1,$a0
	str	@t[3],[sp,#4*(32+2)]	@ save len
	vmov	$a2,$a0
	str	r12, [sp,#4*(32+1)]	@ save inp
	vmov	$b1,$b0
	str	r14, [sp,#4*(32+0)]	@ save out
	vmov	$b2,$b0
.Loop_neon_enter:
	ldr	@t[3], [sp,#4*(15)]
	vadd.i32	$d1,$d0,$t0	@ counter+1
	ldr	@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	vmov	$c1,$c0
	ldr	@t[2], [sp,#4*(13)]
	vmov	$c2,$c0
	ldr	@x[14],[sp,#4*(14)]
	vadd.i32	$d2,$d1,$t0	@ counter+2
	str	@t[3], [sp,#4*(16+15)]
	mov	@t[3],#10
	add	@x[12],@x[12],#3	@ counter+3
	b	.Loop_neon

.align	4
.Loop_neon:
	subs	@t[3],@t[3],#1
___
	my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
	my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
	my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
	my @thread3=&ROUND(0,4,8,12);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}

	@thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
	@thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
	@thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
	@thread3=&ROUND(0,5,10,15);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}
$code.=<<___;
	bne	.Loop_neon

	add	@t[3],sp,#32
	vld1.32	{$t0-$t1},[sp]		@ load key material
	vld1.32	{$t2-$t3},[@t[3]]

	ldr	@t[3],[sp,#4*(32+2)]	@ load len

	str	@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
	str	@t[1], [sp,#4*(16+9)]
	str	@x[12],[sp,#4*(16+12)]
	str	@t[2], [sp,#4*(16+13)]
	str	@x[14],[sp,#4*(16+14)]

	@ at this point we have first half of 512-bit result in
	@ @x[0-7] and second half at sp+4*(16+8)

	ldr	r12,[sp,#4*(32+1)]	@ load inp
	ldr	r14,[sp,#4*(32+0)]	@ load out

	vadd.i32	$a0,$a0,$t0	@ accumulate key material
	vadd.i32	$a1,$a1,$t0
	vadd.i32	$a2,$a2,$t0
	vldr	$t0#lo,[sp,#4*(16+0)]	@ one

	vadd.i32	$b0,$b0,$t1
	vadd.i32	$b1,$b1,$t1
	vadd.i32	$b2,$b2,$t1
	vldr	$t1#lo,[sp,#4*(16+2)]	@ two

	vadd.i32	$c0,$c0,$t2
	vadd.i32	$c1,$c1,$t2
	vadd.i32	$c2,$c2,$t2
	vadd.i32	$d1#lo,$d1#lo,$t0#lo	@ counter+1
	vadd.i32	$d2#lo,$d2#lo,$t1#lo	@ counter+2

	vadd.i32	$d0,$d0,$t3
	vadd.i32	$d1,$d1,$t3
	vadd.i32	$d2,$d2,$t3

	cmp	@t[3],#64*4
	blo	.Ltail_neon

	vld1.8	{$t0-$t1},[r12]!	@ load input
	mov	@t[3],sp
	vld1.8	{$t2-$t3},[r12]!
	veor	$a0,$a0,$t0		@ xor with input
	veor	$b0,$b0,$t1
	vld1.8	{$t0-$t1},[r12]!
	veor	$c0,$c0,$t2
	veor	$d0,$d0,$t3
	vld1.8	{$t2-$t3},[r12]!

	veor	$a1,$a1,$t0
	vst1.8	{$a0-$b0},[r14]!	@ store output
	veor	$b1,$b1,$t1
	vld1.8	{$t0-$t1},[r12]!
	veor	$c1,$c1,$t2
	vst1.8	{$c0-$d0},[r14]!
	veor	$d1,$d1,$t3
	vld1.8	{$t2-$t3},[r12]!

	veor	$a2,$a2,$t0
	vld1.32	{$a0-$b0},[@t[3]]!	@ load for next iteration
	veor	$t0#hi,$t0#hi,$t0#hi
	vldr	$t0#lo,[sp,#4*(16+4)]	@ four
	veor	$b2,$b2,$t1
	vld1.32	{$c0-$d0},[@t[3]]
	veor	$c2,$c2,$t2
	vst1.8	{$a1-$b1},[r14]!
	veor	$d2,$d2,$t3
	vst1.8	{$c1-$d1},[r14]!

	vadd.i32	$d0#lo,$d0#lo,$t0#lo	@ next counter value
	vldr	$t0#lo,[sp,#4*(16+0)]	@ one

	ldmia	sp,{@t[0]-@t[3]}	@ load key material
	add	@x[0],@x[0],@t[0]	@ accumulate key material
	ldr	@t[0],[r12],#16		@ load input
	vst1.8	{$a2-$b2},[r14]!
	add	@x[1],@x[1],@t[1]
	ldr	@t[1],[r12,#-12]
	vst1.8	{$c2-$d2},[r14]!
	add	@x[2],@x[2],@t[2]
	ldr	@t[2],[r12,#-8]
	add	@x[3],@x[3],@t[3]
	ldr	@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
	eor	@x[0],@x[0],@t[0]	@ xor with input
	add	@t[0],sp,#4*(4)
	eor	@x[1],@x[1],@t[1]
	str	@x[0],[r14],#16		@ store output
	eor	@x[2],@x[2],@t[2]
	str	@x[1],[r14,#-12]
	eor	@x[3],@x[3],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	ldr	@t[0],[r12],#16		@ load input
	add	@x[5],@x[5],@t[1]
	ldr	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	ldr	@t[2],[r12,#-8]
	add	@x[7],@x[7],@t[3]
	ldr	@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
	eor	@x[4],@x[4],@t[0]
	add	@t[0],sp,#4*(8)
	eor	@x[5],@x[5],@t[1]
	str	@x[4],[r14],#16		@ store output
	eor	@x[6],@x[6],@t[2]
	str	@x[5],[r14,#-12]
	eor	@x[7],@x[7],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[6],[r14,#-8]
	add	@x[0],sp,#4*(16+8)
	str	@x[7],[r14,#-4]

	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	ldr	@t[0],[r12],#16		@ load input
	add	@x[1],@x[1],@t[1]
	ldr	@t[1],[r12,#-12]
# ifdef	__thumb2__
	it	hi
# endif
	strhi	@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
	add	@x[2],@x[2],@t[2]
	ldr	@t[2],[r12,#-8]
# ifdef	__thumb2__
	it	hi
# endif
	strhi	@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
	add	@x[3],@x[3],@t[3]
	ldr	@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
	eor	@x[0],@x[0],@t[0]
	add	@t[0],sp,#4*(12)
	eor	@x[1],@x[1],@t[1]
	str	@x[0],[r14],#16		@ store output
	eor	@x[2],@x[2],@t[2]
	str	@x[1],[r14,#-12]
	eor	@x[3],@x[3],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@t[0],@t[0],#4		@ next counter value
	add	@x[5],@x[5],@t[1]
	str	@t[0],[sp,#4*(12)]	@ save next counter value
	ldr	@t[0],[r12],#16		@ load input
	add	@x[6],@x[6],@t[2]
	add	@x[4],@x[4],#3		@ counter+3
	ldr	@t[1],[r12,#-12]
	add	@x[7],@x[7],@t[3]
	ldr	@t[2],[r12,#-8]
	ldr	@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
	eor	@x[4],@x[4],@t[0]
# ifdef	__thumb2__
	it	hi
# endif
	ldrhi	@t[0],[sp,#4*(32+2)]	@ re-load len
	eor	@x[5],@x[5],@t[1]
	eor	@x[6],@x[6],@t[2]
	str	@x[4],[r14],#16		@ store output
	eor	@x[7],@x[7],@t[3]
	str	@x[5],[r14,#-12]
	sub	@t[3],@t[0],#64*4	@ len-=64*4
	str	@x[6],[r14,#-8]
	str	@x[7],[r14,#-4]
	bhi	.Loop_neon_outer

	b	.Ldone_neon

.align	4
.Lbreak_neon:
	@ harmonize NEON and integer-only stack frames: load data
	@ from NEON frame, but save to integer-only one; distance
	@ between the two is 4*(32+4+16-32)=4*(20).

	str	@t[3], [sp,#4*(20+32+2)]	@ save len
	add	@t[3],sp,#4*(32+4)
	str	r12,   [sp,#4*(20+32+1)]	@ save inp
	str	r14,   [sp,#4*(20+32+0)]	@ save out

	ldr	@x[12],[sp,#4*(16+10)]
	ldr	@x[14],[sp,#4*(16+11)]
	vldmia	@t[3],{d8-d15}			@ fulfill ABI requirement
	str	@x[12],[sp,#4*(20+16+10)]	@ copy "@x[10]"
	str	@x[14],[sp,#4*(20+16+11)]	@ copy "@x[11]"

	ldr	@t[3], [sp,#4*(15)]
	ldr	@x[12],[sp,#4*(12)]		@ modulo-scheduled load
	ldr	@t[2], [sp,#4*(13)]
	ldr	@x[14],[sp,#4*(14)]
	str	@t[3], [sp,#4*(20+16+15)]
	add	@t[3],sp,#4*(20)
	vst1.32	{$a0-$b0},[@t[3]]!		@ copy key
	add	sp,sp,#4*(20)			@ switch frame
	vst1.32	{$c0-$d0},[@t[3]]
	mov	@t[3],#10
	b	.Loop				@ go integer-only

.align	4
.Ltail_neon:
	cmp	@t[3],#64*3
	bhs	.L192_or_more_neon
	cmp	@t[3],#64*2
	bhs	.L128_or_more_neon
	cmp	@t[3],#64*1
	bhs	.L64_or_more_neon

	add	@t[0],sp,#4*(8)
	vst1.8	{$a0-$b0},[sp]
	add	@t[2],sp,#4*(0)
	vst1.8	{$c0-$d0},[@t[0]]
	b	.Loop_tail_neon

.align	4
.L64_or_more_neon:
	vld1.8	{$t0-$t1},[r12]!
	vld1.8	{$t2-$t3},[r12]!
	veor	$a0,$a0,$t0
	veor	$b0,$b0,$t1
	veor	$c0,$c0,$t2
	veor	$d0,$d0,$t3
	vst1.8	{$a0-$b0},[r14]!
	vst1.8	{$c0-$d0},[r14]!

	beq	.Ldone_neon

	add	@t[0],sp,#4*(8)
	vst1.8	{$a1-$b1},[sp]
	add	@t[2],sp,#4*(0)
	vst1.8	{$c1-$d1},[@t[0]]
	sub	@t[3],@t[3],#64*1	@ len-=64*1
	b	.Loop_tail_neon

.align	4
.L128_or_more_neon:
	vld1.8	{$t0-$t1},[r12]!
	vld1.8	{$t2-$t3},[r12]!
	veor	$a0,$a0,$t0
	veor	$b0,$b0,$t1
	vld1.8	{$t0-$t1},[r12]!
	veor	$c0,$c0,$t2
	veor	$d0,$d0,$t3
	vld1.8	{$t2-$t3},[r12]!

	veor	$a1,$a1,$t0
	veor	$b1,$b1,$t1
	vst1.8	{$a0-$b0},[r14]!
	veor	$c1,$c1,$t2
	vst1.8	{$c0-$d0},[r14]!
	veor	$d1,$d1,$t3
	vst1.8	{$a1-$b1},[r14]!
	vst1.8	{$c1-$d1},[r14]!

	beq	.Ldone_neon

	add	@t[0],sp,#4*(8)
	vst1.8	{$a2-$b2},[sp]
	add	@t[2],sp,#4*(0)
	vst1.8	{$c2-$d2},[@t[0]]
	sub	@t[3],@t[3],#64*2	@ len-=64*2
	b	.Loop_tail_neon

.align	4
.L192_or_more_neon:
	vld1.8	{$t0-$t1},[r12]!
	vld1.8	{$t2-$t3},[r12]!
	veor	$a0,$a0,$t0
	veor	$b0,$b0,$t1
	vld1.8	{$t0-$t1},[r12]!
	veor	$c0,$c0,$t2
	veor	$d0,$d0,$t3
	vld1.8	{$t2-$t3},[r12]!

	veor	$a1,$a1,$t0
	veor	$b1,$b1,$t1
	vld1.8	{$t0-$t1},[r12]!
	veor	$c1,$c1,$t2
	vst1.8	{$a0-$b0},[r14]!
	veor	$d1,$d1,$t3
	vld1.8	{$t2-$t3},[r12]!

	veor	$a2,$a2,$t0
	vst1.8	{$c0-$d0},[r14]!
	veor	$b2,$b2,$t1
	vst1.8	{$a1-$b1},[r14]!
	veor	$c2,$c2,$t2
	vst1.8	{$c1-$d1},[r14]!
	veor	$d2,$d2,$t3
	vst1.8	{$a2-$b2},[r14]!
	vst1.8	{$c2-$d2},[r14]!
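
	@ 192 bytes have been stored at this point; if len was exactly
	@ 192 we are done, otherwise the remaining 1..63 bytes are
	@ produced from the fourth (integer) block below and copied out
	@ byte-by-byte in .Loop_tail_neon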
	beq	.Ldone_neon

	ldmia	sp,{@t[0]-@t[3]}	@ load key material
	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@t[0],sp,#4*(4)
	add	@x[1],@x[1],@t[1]
	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@t[0],sp,#4*(8)
	add	@x[5],@x[5],@t[1]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
# ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
	stmia	sp,{@x[0]-@x[7]}
	add	@x[0],sp,#4*(16+8)

	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@t[0],sp,#4*(12)
	add	@x[1],@x[1],@t[1]
	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@t[0],sp,#4*(8)
	add	@x[5],@x[5],@t[1]
	add	@x[4],@x[4],#3		@ counter+3
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
	ldr	@t[3],[sp,#4*(32+2)]	@ re-load len
# ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
	stmia	@t[0],{@x[0]-@x[7]}
	add	@t[2],sp,#4*(0)
	sub	@t[3],@t[3],#64*3	@ len-=64*3

.Loop_tail_neon:
	ldrb	@t[0],[@t[2]],#1	@ read buffer on stack
	ldrb	@t[1],[r12],#1		@ read input
	subs	@t[3],@t[3],#1
	eor	@t[0],@t[0],@t[1]
	strb	@t[0],[r14],#1		@ store output
	bne	.Loop_tail_neon

.Ldone_neon:
	add	sp,sp,#4*(32+4)
	vldmia	sp,{d8-d15}
	add	sp,sp,#4*(16+3)
	ldmia	sp!,{r4-r11,pc}
.size	ChaCha20_neon,.-ChaCha20_neon
.comm	OPENSSL_armcap_P,4,4
#endif
___
}}}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
close STDOUT;
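
# The final loop above post-processes $code: backticked expressions
# are evaluated, and the synthetic q<N>#lo/q<N>#hi notation is mapped
# to the overlapping d-registers (q<N> aliases d<2N> and d<2N+1>).
# A typical invocation (an assumption based on the flavour handling
# at the top of this file) would be:
#
#	perl chacha-armv4.pl linux32 chacha-armv4.S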