1#!/usr/bin/env perl 2# 3# ==================================================================== 4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 5# project. The module is, however, dual licensed under OpenSSL and 6# CRYPTOGAMS licenses depending on where you obtain it. For further 7# details see http://www.openssl.org/~appro/cryptogams/. 8# ==================================================================== 9# 10# This module implements support for ARMv8 AES instructions. The 11# module is endian-agnostic in sense that it supports both big- and 12# little-endian cases. As does it support both 32- and 64-bit modes 13# of operation. Latter is achieved by limiting amount of utilized 14# registers to 16, which implies additional NEON load and integer 15# instructions. This has no effect on mighty Apple A7, where results 16# are literally equal to the theoretical estimates based on AES 17# instruction latencies and issue rates. On Cortex-A53, an in-order 18# execution core, this costs up to 10-15%, which is partially 19# compensated by implementing dedicated code path for 128-bit 20# CBC encrypt case. On Cortex-A57 parallelizable mode performance 21# seems to be limited by sheer amount of NEON instructions... 22# 23# Performance in cycles per byte processed with 128-bit key: 24# 25# CBC enc CBC dec CTR 26# Apple A7 2.39 1.20 1.20 27# Cortex-A53 1.32 1.29 1.46 28# Cortex-A57(*) 1.95 0.85 0.93 29# Denver 1.96 0.86 0.80 30# 31# (*) original 3.64/1.34/1.32 results were for r0p0 revision 32# and are still same even for updated module; 33 34$flavour = shift; 35$output = shift; 36 37$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 38( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or 39( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or 40die "can't locate arm-xlate.pl"; 41 42open OUT,"| \"$^X\" $xlate $flavour $output"; 43*STDOUT=*OUT; 44 45$prefix="aes_v8"; 46 47$code=<<___; 48#include <openssl/arm_arch.h> 49 50#if __ARM_MAX_ARCH__>=7 51.text 52___ 53$code.=<<___ if ($flavour =~ /64/); 54#if !defined(__clang__) 55.arch armv8-a+crypto 56#endif 57___ 58$code.=".arch armv7-a\n.fpu neon\n.code 32\n" if ($flavour !~ /64/); 59 #^^^^^^ this is done to simplify adoption by not depending 60 # on latest binutils. 61 62# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax, 63# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to 64# maintain both 32- and 64-bit codes within single module and 65# transliterate common code to either flavour with regex vodoo. 66# 67{{{ 68my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12"); 69my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)= 70 $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10)); 71 72 73$code.=<<___; 74.align 5 75.Lrcon: 76.long 0x01,0x01,0x01,0x01 77.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat 78.long 0x1b,0x1b,0x1b,0x1b 79 80.globl ${prefix}_set_encrypt_key 81.type ${prefix}_set_encrypt_key,%function 82.align 5 83${prefix}_set_encrypt_key: 84.Lenc_key: 85___ 86$code.=<<___ if ($flavour =~ /64/); 87 stp x29,x30,[sp,#-16]! 88 add x29,sp,#0 89___ 90$code.=<<___; 91 mov $ptr,#-1 92 cmp $inp,#0 93 b.eq .Lenc_key_abort 94 cmp $out,#0 95 b.eq .Lenc_key_abort 96 mov $ptr,#-2 97 cmp $bits,#128 98 b.lt .Lenc_key_abort 99 cmp $bits,#256 100 b.gt .Lenc_key_abort 101 tst $bits,#0x3f 102 b.ne .Lenc_key_abort 103 104 adr $ptr,.Lrcon 105 cmp $bits,#192 106 107 veor $zero,$zero,$zero 108 vld1.8 {$in0},[$inp],#16 109 mov $bits,#8 // reuse $bits 110 vld1.32 {$rcon,$mask},[$ptr],#32 111 112 b.lt .Loop128 113 b.eq .L192 114 b .L256 115 116.align 4 117.Loop128: 118 vtbl.8 $key,{$in0},$mask 119 vext.8 $tmp,$zero,$in0,#12 120 vst1.32 {$in0},[$out],#16 121 aese $key,$zero 122 subs $bits,$bits,#1 123 124 veor $in0,$in0,$tmp 125 vext.8 $tmp,$zero,$tmp,#12 126 veor $in0,$in0,$tmp 127 vext.8 $tmp,$zero,$tmp,#12 128 veor $key,$key,$rcon 129 veor $in0,$in0,$tmp 130 vshl.u8 $rcon,$rcon,#1 131 veor $in0,$in0,$key 132 b.ne .Loop128 133 134 vld1.32 {$rcon},[$ptr] 135 136 vtbl.8 $key,{$in0},$mask 137 vext.8 $tmp,$zero,$in0,#12 138 vst1.32 {$in0},[$out],#16 139 aese $key,$zero 140 141 veor $in0,$in0,$tmp 142 vext.8 $tmp,$zero,$tmp,#12 143 veor $in0,$in0,$tmp 144 vext.8 $tmp,$zero,$tmp,#12 145 veor $key,$key,$rcon 146 veor $in0,$in0,$tmp 147 vshl.u8 $rcon,$rcon,#1 148 veor $in0,$in0,$key 149 150 vtbl.8 $key,{$in0},$mask 151 vext.8 $tmp,$zero,$in0,#12 152 vst1.32 {$in0},[$out],#16 153 aese $key,$zero 154 155 veor $in0,$in0,$tmp 156 vext.8 $tmp,$zero,$tmp,#12 157 veor $in0,$in0,$tmp 158 vext.8 $tmp,$zero,$tmp,#12 159 veor $key,$key,$rcon 160 veor $in0,$in0,$tmp 161 veor $in0,$in0,$key 162 vst1.32 {$in0},[$out] 163 add $out,$out,#0x50 164 165 mov $rounds,#10 166 b .Ldone 167 168.align 4 169.L192: 170 vld1.8 {$in1},[$inp],#8 171 vmov.i8 $key,#8 // borrow $key 172 vst1.32 {$in0},[$out],#16 173 vsub.i8 $mask,$mask,$key // adjust the mask 174 175.Loop192: 176 vtbl.8 $key,{$in1},$mask 177 vext.8 $tmp,$zero,$in0,#12 178 vst1.32 {$in1},[$out],#8 179 aese $key,$zero 180 subs $bits,$bits,#1 181 182 veor $in0,$in0,$tmp 183 vext.8 $tmp,$zero,$tmp,#12 184 veor $in0,$in0,$tmp 185 vext.8 $tmp,$zero,$tmp,#12 186 veor $in0,$in0,$tmp 187 188 vdup.32 $tmp,${in0}[3] 189 veor $tmp,$tmp,$in1 190 veor $key,$key,$rcon 191 vext.8 $in1,$zero,$in1,#12 192 vshl.u8 $rcon,$rcon,#1 193 veor $in1,$in1,$tmp 194 veor $in0,$in0,$key 195 veor $in1,$in1,$key 196 vst1.32 {$in0},[$out],#16 197 b.ne .Loop192 198 199 mov $rounds,#12 200 add $out,$out,#0x20 201 b .Ldone 202 203.align 4 204.L256: 205 vld1.8 {$in1},[$inp] 206 mov $bits,#7 207 mov $rounds,#14 208 vst1.32 {$in0},[$out],#16 209 210.Loop256: 211 vtbl.8 $key,{$in1},$mask 212 vext.8 $tmp,$zero,$in0,#12 213 vst1.32 {$in1},[$out],#16 214 aese $key,$zero 215 subs $bits,$bits,#1 216 217 veor $in0,$in0,$tmp 218 vext.8 $tmp,$zero,$tmp,#12 219 veor $in0,$in0,$tmp 220 vext.8 $tmp,$zero,$tmp,#12 221 veor $key,$key,$rcon 222 veor $in0,$in0,$tmp 223 vshl.u8 $rcon,$rcon,#1 224 veor $in0,$in0,$key 225 vst1.32 {$in0},[$out],#16 226 b.eq .Ldone 227 228 vdup.32 $key,${in0}[3] // just splat 229 vext.8 $tmp,$zero,$in1,#12 230 aese $key,$zero 231 232 veor $in1,$in1,$tmp 233 vext.8 $tmp,$zero,$tmp,#12 234 veor $in1,$in1,$tmp 235 vext.8 $tmp,$zero,$tmp,#12 236 veor $in1,$in1,$tmp 237 238 veor $in1,$in1,$key 239 b .Loop256 240 241.Ldone: 242 str $rounds,[$out] 243 mov $ptr,#0 244 245.Lenc_key_abort: 246 mov x0,$ptr // return value 247 `"ldr x29,[sp],#16" if ($flavour =~ /64/)` 248 ret 249.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key 250 251.globl ${prefix}_set_decrypt_key 252.type ${prefix}_set_decrypt_key,%function 253.align 5 254${prefix}_set_decrypt_key: 255___ 256$code.=<<___ if ($flavour =~ /64/); 257 stp x29,x30,[sp,#-16]! 258 add x29,sp,#0 259___ 260$code.=<<___ if ($flavour !~ /64/); 261 stmdb sp!,{r4,lr} 262___ 263$code.=<<___; 264 bl .Lenc_key 265 266 cmp x0,#0 267 b.ne .Ldec_key_abort 268 269 sub $out,$out,#240 // restore original $out 270 mov x4,#-16 271 add $inp,$out,x12,lsl#4 // end of key schedule 272 273 vld1.32 {v0.16b},[$out] 274 vld1.32 {v1.16b},[$inp] 275 vst1.32 {v0.16b},[$inp],x4 276 vst1.32 {v1.16b},[$out],#16 277 278.Loop_imc: 279 vld1.32 {v0.16b},[$out] 280 vld1.32 {v1.16b},[$inp] 281 aesimc v0.16b,v0.16b 282 aesimc v1.16b,v1.16b 283 vst1.32 {v0.16b},[$inp],x4 284 vst1.32 {v1.16b},[$out],#16 285 cmp $inp,$out 286 b.hi .Loop_imc 287 288 vld1.32 {v0.16b},[$out] 289 aesimc v0.16b,v0.16b 290 vst1.32 {v0.16b},[$inp] 291 292 eor x0,x0,x0 // return value 293.Ldec_key_abort: 294___ 295$code.=<<___ if ($flavour !~ /64/); 296 ldmia sp!,{r4,pc} 297___ 298$code.=<<___ if ($flavour =~ /64/); 299 ldp x29,x30,[sp],#16 300 ret 301___ 302$code.=<<___; 303.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key 304___ 305}}} 306{{{ 307sub gen_block () { 308my $dir = shift; 309my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc"); 310my ($inp,$out,$key)=map("x$_",(0..2)); 311my $rounds="w3"; 312my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3)); 313 314$code.=<<___; 315.globl ${prefix}_${dir}crypt 316.type ${prefix}_${dir}crypt,%function 317.align 5 318${prefix}_${dir}crypt: 319 ldr $rounds,[$key,#240] 320 vld1.32 {$rndkey0},[$key],#16 321 vld1.8 {$inout},[$inp] 322 sub $rounds,$rounds,#2 323 vld1.32 {$rndkey1},[$key],#16 324 325.Loop_${dir}c: 326 aes$e $inout,$rndkey0 327 aes$mc $inout,$inout 328 vld1.32 {$rndkey0},[$key],#16 329 subs $rounds,$rounds,#2 330 aes$e $inout,$rndkey1 331 aes$mc $inout,$inout 332 vld1.32 {$rndkey1},[$key],#16 333 b.gt .Loop_${dir}c 334 335 aes$e $inout,$rndkey0 336 aes$mc $inout,$inout 337 vld1.32 {$rndkey0},[$key] 338 aes$e $inout,$rndkey1 339 veor $inout,$inout,$rndkey0 340 341 vst1.8 {$inout},[$out] 342 ret 343.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt 344___ 345} 346&gen_block("en"); 347&gen_block("de"); 348}}} 349{{{ 350my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5"; 351my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12"); 352my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); 353 354my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); 355my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key); 356 357### q8-q15 preloaded key schedule 358 359$code.=<<___; 360.globl ${prefix}_cbc_encrypt 361.type ${prefix}_cbc_encrypt,%function 362.align 5 363${prefix}_cbc_encrypt: 364___ 365$code.=<<___ if ($flavour =~ /64/); 366 stp x29,x30,[sp,#-16]! 367 add x29,sp,#0 368___ 369$code.=<<___ if ($flavour !~ /64/); 370 mov ip,sp 371 stmdb sp!,{r4-r8,lr} 372 vstmdb sp!,{d8-d15} @ ABI specification says so 373 ldmia ip,{r4-r5} @ load remaining args 374___ 375$code.=<<___; 376 subs $len,$len,#16 377 mov $step,#16 378 b.lo .Lcbc_abort 379 cclr $step,eq 380 381 cmp $enc,#0 // en- or decrypting? 382 ldr $rounds,[$key,#240] 383 and $len,$len,#-16 384 vld1.8 {$ivec},[$ivp] 385 vld1.8 {$dat},[$inp],$step 386 387 vld1.32 {q8-q9},[$key] // load key schedule... 388 sub $rounds,$rounds,#6 389 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys 390 sub $rounds,$rounds,#2 391 vld1.32 {q10-q11},[$key_],#32 392 vld1.32 {q12-q13},[$key_],#32 393 vld1.32 {q14-q15},[$key_],#32 394 vld1.32 {$rndlast},[$key_] 395 396 add $key_,$key,#32 397 mov $cnt,$rounds 398 b.eq .Lcbc_dec 399 400 cmp $rounds,#2 401 veor $dat,$dat,$ivec 402 veor $rndzero_n_last,q8,$rndlast 403 b.eq .Lcbc_enc128 404 405 vld1.32 {$in0-$in1},[$key_] 406 add $key_,$key,#16 407 add $key4,$key,#16*4 408 add $key5,$key,#16*5 409 aese $dat,q8 410 aesmc $dat,$dat 411 add $key6,$key,#16*6 412 add $key7,$key,#16*7 413 b .Lenter_cbc_enc 414 415.align 4 416.Loop_cbc_enc: 417 aese $dat,q8 418 aesmc $dat,$dat 419 vst1.8 {$ivec},[$out],#16 420.Lenter_cbc_enc: 421 aese $dat,q9 422 aesmc $dat,$dat 423 aese $dat,$in0 424 aesmc $dat,$dat 425 vld1.32 {q8},[$key4] 426 cmp $rounds,#4 427 aese $dat,$in1 428 aesmc $dat,$dat 429 vld1.32 {q9},[$key5] 430 b.eq .Lcbc_enc192 431 432 aese $dat,q8 433 aesmc $dat,$dat 434 vld1.32 {q8},[$key6] 435 aese $dat,q9 436 aesmc $dat,$dat 437 vld1.32 {q9},[$key7] 438 nop 439 440.Lcbc_enc192: 441 aese $dat,q8 442 aesmc $dat,$dat 443 subs $len,$len,#16 444 aese $dat,q9 445 aesmc $dat,$dat 446 cclr $step,eq 447 aese $dat,q10 448 aesmc $dat,$dat 449 aese $dat,q11 450 aesmc $dat,$dat 451 vld1.8 {q8},[$inp],$step 452 aese $dat,q12 453 aesmc $dat,$dat 454 veor q8,q8,$rndzero_n_last 455 aese $dat,q13 456 aesmc $dat,$dat 457 vld1.32 {q9},[$key_] // re-pre-load rndkey[1] 458 aese $dat,q14 459 aesmc $dat,$dat 460 aese $dat,q15 461 veor $ivec,$dat,$rndlast 462 b.hs .Loop_cbc_enc 463 464 vst1.8 {$ivec},[$out],#16 465 b .Lcbc_done 466 467.align 5 468.Lcbc_enc128: 469 vld1.32 {$in0-$in1},[$key_] 470 aese $dat,q8 471 aesmc $dat,$dat 472 b .Lenter_cbc_enc128 473.Loop_cbc_enc128: 474 aese $dat,q8 475 aesmc $dat,$dat 476 vst1.8 {$ivec},[$out],#16 477.Lenter_cbc_enc128: 478 aese $dat,q9 479 aesmc $dat,$dat 480 subs $len,$len,#16 481 aese $dat,$in0 482 aesmc $dat,$dat 483 cclr $step,eq 484 aese $dat,$in1 485 aesmc $dat,$dat 486 aese $dat,q10 487 aesmc $dat,$dat 488 aese $dat,q11 489 aesmc $dat,$dat 490 vld1.8 {q8},[$inp],$step 491 aese $dat,q12 492 aesmc $dat,$dat 493 aese $dat,q13 494 aesmc $dat,$dat 495 aese $dat,q14 496 aesmc $dat,$dat 497 veor q8,q8,$rndzero_n_last 498 aese $dat,q15 499 veor $ivec,$dat,$rndlast 500 b.hs .Loop_cbc_enc128 501 502 vst1.8 {$ivec},[$out],#16 503 b .Lcbc_done 504___ 505{ 506my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); 507$code.=<<___; 508.align 5 509.Lcbc_dec: 510 vld1.8 {$dat2},[$inp],#16 511 subs $len,$len,#32 // bias 512 add $cnt,$rounds,#2 513 vorr $in1,$dat,$dat 514 vorr $dat1,$dat,$dat 515 vorr $in2,$dat2,$dat2 516 b.lo .Lcbc_dec_tail 517 518 vorr $dat1,$dat2,$dat2 519 vld1.8 {$dat2},[$inp],#16 520 vorr $in0,$dat,$dat 521 vorr $in1,$dat1,$dat1 522 vorr $in2,$dat2,$dat2 523 524.Loop3x_cbc_dec: 525 aesd $dat0,q8 526 aesimc $dat0,$dat0 527 aesd $dat1,q8 528 aesimc $dat1,$dat1 529 aesd $dat2,q8 530 aesimc $dat2,$dat2 531 vld1.32 {q8},[$key_],#16 532 subs $cnt,$cnt,#2 533 aesd $dat0,q9 534 aesimc $dat0,$dat0 535 aesd $dat1,q9 536 aesimc $dat1,$dat1 537 aesd $dat2,q9 538 aesimc $dat2,$dat2 539 vld1.32 {q9},[$key_],#16 540 b.gt .Loop3x_cbc_dec 541 542 aesd $dat0,q8 543 aesimc $dat0,$dat0 544 aesd $dat1,q8 545 aesimc $dat1,$dat1 546 aesd $dat2,q8 547 aesimc $dat2,$dat2 548 veor $tmp0,$ivec,$rndlast 549 subs $len,$len,#0x30 550 veor $tmp1,$in0,$rndlast 551 mov.lo x6,$len // x6, $cnt, is zero at this point 552 aesd $dat0,q9 553 aesimc $dat0,$dat0 554 aesd $dat1,q9 555 aesimc $dat1,$dat1 556 aesd $dat2,q9 557 aesimc $dat2,$dat2 558 veor $tmp2,$in1,$rndlast 559 add $inp,$inp,x6 // $inp is adjusted in such way that 560 // at exit from the loop $dat1-$dat2 561 // are loaded with last "words" 562 vorr $ivec,$in2,$in2 563 mov $key_,$key 564 aesd $dat0,q12 565 aesimc $dat0,$dat0 566 aesd $dat1,q12 567 aesimc $dat1,$dat1 568 aesd $dat2,q12 569 aesimc $dat2,$dat2 570 vld1.8 {$in0},[$inp],#16 571 aesd $dat0,q13 572 aesimc $dat0,$dat0 573 aesd $dat1,q13 574 aesimc $dat1,$dat1 575 aesd $dat2,q13 576 aesimc $dat2,$dat2 577 vld1.8 {$in1},[$inp],#16 578 aesd $dat0,q14 579 aesimc $dat0,$dat0 580 aesd $dat1,q14 581 aesimc $dat1,$dat1 582 aesd $dat2,q14 583 aesimc $dat2,$dat2 584 vld1.8 {$in2},[$inp],#16 585 aesd $dat0,q15 586 aesd $dat1,q15 587 aesd $dat2,q15 588 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] 589 add $cnt,$rounds,#2 590 veor $tmp0,$tmp0,$dat0 591 veor $tmp1,$tmp1,$dat1 592 veor $dat2,$dat2,$tmp2 593 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] 594 vst1.8 {$tmp0},[$out],#16 595 vorr $dat0,$in0,$in0 596 vst1.8 {$tmp1},[$out],#16 597 vorr $dat1,$in1,$in1 598 vst1.8 {$dat2},[$out],#16 599 vorr $dat2,$in2,$in2 600 b.hs .Loop3x_cbc_dec 601 602 cmn $len,#0x30 603 b.eq .Lcbc_done 604 nop 605 606.Lcbc_dec_tail: 607 aesd $dat1,q8 608 aesimc $dat1,$dat1 609 aesd $dat2,q8 610 aesimc $dat2,$dat2 611 vld1.32 {q8},[$key_],#16 612 subs $cnt,$cnt,#2 613 aesd $dat1,q9 614 aesimc $dat1,$dat1 615 aesd $dat2,q9 616 aesimc $dat2,$dat2 617 vld1.32 {q9},[$key_],#16 618 b.gt .Lcbc_dec_tail 619 620 aesd $dat1,q8 621 aesimc $dat1,$dat1 622 aesd $dat2,q8 623 aesimc $dat2,$dat2 624 aesd $dat1,q9 625 aesimc $dat1,$dat1 626 aesd $dat2,q9 627 aesimc $dat2,$dat2 628 aesd $dat1,q12 629 aesimc $dat1,$dat1 630 aesd $dat2,q12 631 aesimc $dat2,$dat2 632 cmn $len,#0x20 633 aesd $dat1,q13 634 aesimc $dat1,$dat1 635 aesd $dat2,q13 636 aesimc $dat2,$dat2 637 veor $tmp1,$ivec,$rndlast 638 aesd $dat1,q14 639 aesimc $dat1,$dat1 640 aesd $dat2,q14 641 aesimc $dat2,$dat2 642 veor $tmp2,$in1,$rndlast 643 aesd $dat1,q15 644 aesd $dat2,q15 645 b.eq .Lcbc_dec_one 646 veor $tmp1,$tmp1,$dat1 647 veor $tmp2,$tmp2,$dat2 648 vorr $ivec,$in2,$in2 649 vst1.8 {$tmp1},[$out],#16 650 vst1.8 {$tmp2},[$out],#16 651 b .Lcbc_done 652 653.Lcbc_dec_one: 654 veor $tmp1,$tmp1,$dat2 655 vorr $ivec,$in2,$in2 656 vst1.8 {$tmp1},[$out],#16 657 658.Lcbc_done: 659 vst1.8 {$ivec},[$ivp] 660.Lcbc_abort: 661___ 662} 663$code.=<<___ if ($flavour !~ /64/); 664 vldmia sp!,{d8-d15} 665 ldmia sp!,{r4-r8,pc} 666___ 667$code.=<<___ if ($flavour =~ /64/); 668 ldr x29,[sp],#16 669 ret 670___ 671$code.=<<___; 672.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt 673___ 674}}} 675{{{ 676my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); 677my ($rounds,$cnt,$key_)=("w5","w6","x7"); 678my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12)); 679my $step="x12"; # aliases with $tctr2 680 681my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); 682my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); 683 684my ($dat,$tmp)=($dat0,$tmp0); 685 686### q8-q15 preloaded key schedule 687 688$code.=<<___; 689.globl ${prefix}_ctr32_encrypt_blocks 690.type ${prefix}_ctr32_encrypt_blocks,%function 691.align 5 692${prefix}_ctr32_encrypt_blocks: 693___ 694$code.=<<___ if ($flavour =~ /64/); 695 stp x29,x30,[sp,#-16]! 696 add x29,sp,#0 697___ 698$code.=<<___ if ($flavour !~ /64/); 699 mov ip,sp 700 stmdb sp!,{r4-r10,lr} 701 vstmdb sp!,{d8-d15} @ ABI specification says so 702 ldr r4, [ip] @ load remaining arg 703___ 704$code.=<<___; 705 ldr $rounds,[$key,#240] 706 707 ldr $ctr, [$ivp, #12] 708 vld1.32 {$dat0},[$ivp] 709 710 vld1.32 {q8-q9},[$key] // load key schedule... 711 sub $rounds,$rounds,#4 712 mov $step,#16 713 cmp $len,#2 714 add $key_,$key,x5,lsl#4 // pointer to last 5 round keys 715 sub $rounds,$rounds,#2 716 vld1.32 {q12-q13},[$key_],#32 717 vld1.32 {q14-q15},[$key_],#32 718 vld1.32 {$rndlast},[$key_] 719 add $key_,$key,#32 720 mov $cnt,$rounds 721 cclr $step,lo 722#ifndef __ARMEB__ 723 rev $ctr, $ctr 724#endif 725 vorr $dat1,$dat0,$dat0 726 add $tctr1, $ctr, #1 727 vorr $dat2,$dat0,$dat0 728 add $ctr, $ctr, #2 729 vorr $ivec,$dat0,$dat0 730 rev $tctr1, $tctr1 731 vmov.32 ${dat1}[3],$tctr1 732 b.ls .Lctr32_tail 733 rev $tctr2, $ctr 734 sub $len,$len,#3 // bias 735 vmov.32 ${dat2}[3],$tctr2 736 b .Loop3x_ctr32 737 738.align 4 739.Loop3x_ctr32: 740 aese $dat0,q8 741 aesmc $dat0,$dat0 742 aese $dat1,q8 743 aesmc $dat1,$dat1 744 aese $dat2,q8 745 aesmc $dat2,$dat2 746 vld1.32 {q8},[$key_],#16 747 subs $cnt,$cnt,#2 748 aese $dat0,q9 749 aesmc $dat0,$dat0 750 aese $dat1,q9 751 aesmc $dat1,$dat1 752 aese $dat2,q9 753 aesmc $dat2,$dat2 754 vld1.32 {q9},[$key_],#16 755 b.gt .Loop3x_ctr32 756 757 aese $dat0,q8 758 aesmc $tmp0,$dat0 759 aese $dat1,q8 760 aesmc $tmp1,$dat1 761 vld1.8 {$in0},[$inp],#16 762 vorr $dat0,$ivec,$ivec 763 aese $dat2,q8 764 aesmc $dat2,$dat2 765 vld1.8 {$in1},[$inp],#16 766 vorr $dat1,$ivec,$ivec 767 aese $tmp0,q9 768 aesmc $tmp0,$tmp0 769 aese $tmp1,q9 770 aesmc $tmp1,$tmp1 771 vld1.8 {$in2},[$inp],#16 772 mov $key_,$key 773 aese $dat2,q9 774 aesmc $tmp2,$dat2 775 vorr $dat2,$ivec,$ivec 776 add $tctr0,$ctr,#1 777 aese $tmp0,q12 778 aesmc $tmp0,$tmp0 779 aese $tmp1,q12 780 aesmc $tmp1,$tmp1 781 veor $in0,$in0,$rndlast 782 add $tctr1,$ctr,#2 783 aese $tmp2,q12 784 aesmc $tmp2,$tmp2 785 veor $in1,$in1,$rndlast 786 add $ctr,$ctr,#3 787 aese $tmp0,q13 788 aesmc $tmp0,$tmp0 789 aese $tmp1,q13 790 aesmc $tmp1,$tmp1 791 veor $in2,$in2,$rndlast 792 rev $tctr0,$tctr0 793 aese $tmp2,q13 794 aesmc $tmp2,$tmp2 795 vmov.32 ${dat0}[3], $tctr0 796 rev $tctr1,$tctr1 797 aese $tmp0,q14 798 aesmc $tmp0,$tmp0 799 aese $tmp1,q14 800 aesmc $tmp1,$tmp1 801 vmov.32 ${dat1}[3], $tctr1 802 rev $tctr2,$ctr 803 aese $tmp2,q14 804 aesmc $tmp2,$tmp2 805 vmov.32 ${dat2}[3], $tctr2 806 subs $len,$len,#3 807 aese $tmp0,q15 808 aese $tmp1,q15 809 aese $tmp2,q15 810 811 veor $in0,$in0,$tmp0 812 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] 813 vst1.8 {$in0},[$out],#16 814 veor $in1,$in1,$tmp1 815 mov $cnt,$rounds 816 vst1.8 {$in1},[$out],#16 817 veor $in2,$in2,$tmp2 818 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] 819 vst1.8 {$in2},[$out],#16 820 b.hs .Loop3x_ctr32 821 822 adds $len,$len,#3 823 b.eq .Lctr32_done 824 cmp $len,#1 825 mov $step,#16 826 cclr $step,eq 827 828.Lctr32_tail: 829 aese $dat0,q8 830 aesmc $dat0,$dat0 831 aese $dat1,q8 832 aesmc $dat1,$dat1 833 vld1.32 {q8},[$key_],#16 834 subs $cnt,$cnt,#2 835 aese $dat0,q9 836 aesmc $dat0,$dat0 837 aese $dat1,q9 838 aesmc $dat1,$dat1 839 vld1.32 {q9},[$key_],#16 840 b.gt .Lctr32_tail 841 842 aese $dat0,q8 843 aesmc $dat0,$dat0 844 aese $dat1,q8 845 aesmc $dat1,$dat1 846 aese $dat0,q9 847 aesmc $dat0,$dat0 848 aese $dat1,q9 849 aesmc $dat1,$dat1 850 vld1.8 {$in0},[$inp],$step 851 aese $dat0,q12 852 aesmc $dat0,$dat0 853 aese $dat1,q12 854 aesmc $dat1,$dat1 855 vld1.8 {$in1},[$inp] 856 aese $dat0,q13 857 aesmc $dat0,$dat0 858 aese $dat1,q13 859 aesmc $dat1,$dat1 860 veor $in0,$in0,$rndlast 861 aese $dat0,q14 862 aesmc $dat0,$dat0 863 aese $dat1,q14 864 aesmc $dat1,$dat1 865 veor $in1,$in1,$rndlast 866 aese $dat0,q15 867 aese $dat1,q15 868 869 cmp $len,#1 870 veor $in0,$in0,$dat0 871 veor $in1,$in1,$dat1 872 vst1.8 {$in0},[$out],#16 873 b.eq .Lctr32_done 874 vst1.8 {$in1},[$out] 875 876.Lctr32_done: 877___ 878$code.=<<___ if ($flavour !~ /64/); 879 vldmia sp!,{d8-d15} 880 ldmia sp!,{r4-r10,pc} 881___ 882$code.=<<___ if ($flavour =~ /64/); 883 ldr x29,[sp],#16 884 ret 885___ 886$code.=<<___; 887.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks 888___ 889}}} 890$code.=<<___; 891#endif 892___ 893######################################## 894if ($flavour =~ /64/) { ######## 64-bit code 895 my %opcode = ( 896 "aesd" => 0x4e285800, "aese" => 0x4e284800, 897 "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 ); 898 899 local *unaes = sub { 900 my ($mnemonic,$arg)=@_; 901 902 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o && 903 sprintf ".inst\t0x%08x\t//%s %s", 904 $opcode{$mnemonic}|$1|($2<<5), 905 $mnemonic,$arg; 906 }; 907 908 foreach(split("\n",$code)) { 909 s/\`([^\`]*)\`/eval($1)/geo; 910 911 s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers 912 s/@\s/\/\//o; # old->new style commentary 913 914 #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or 915 s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or 916 s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or 917 s/vmov\.i8/movi/o or # fix up legacy mnemonics 918 s/vext\.8/ext/o or 919 s/vrev32\.8/rev32/o or 920 s/vtst\.8/cmtst/o or 921 s/vshr/ushr/o or 922 s/^(\s+)v/$1/o or # strip off v prefix 923 s/\bbx\s+lr\b/ret/o; 924 925 # fix up remainig legacy suffixes 926 s/\.[ui]?8//o; 927 m/\],#8/o and s/\.16b/\.8b/go; 928 s/\.[ui]?32//o and s/\.16b/\.4s/go; 929 s/\.[ui]?64//o and s/\.16b/\.2d/go; 930 s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o; 931 932 print $_,"\n"; 933 } 934} else { ######## 32-bit code 935 my %opcode = ( 936 "aesd" => 0xf3b00340, "aese" => 0xf3b00300, 937 "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 ); 938 939 local *unaes = sub { 940 my ($mnemonic,$arg)=@_; 941 942 if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) { 943 my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19) 944 |(($2&7)<<1) |(($2&8)<<2); 945 # since ARMv7 instructions are always encoded little-endian. 946 # correct solution is to use .inst directive, but older 947 # assemblers don't implement it:-( 948 sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", 949 $word&0xff,($word>>8)&0xff, 950 ($word>>16)&0xff,($word>>24)&0xff, 951 $mnemonic,$arg; 952 } 953 }; 954 955 sub unvtbl { 956 my $arg=shift; 957 958 $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o && 959 sprintf "vtbl.8 d%d,{q%d},d%d\n\t". 960 "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1; 961 } 962 963 sub unvdup32 { 964 my $arg=shift; 965 966 $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o && 967 sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1; 968 } 969 970 sub unvmov32 { 971 my $arg=shift; 972 973 $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o && 974 sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3; 975 } 976 977 foreach(split("\n",$code)) { 978 s/\`([^\`]*)\`/eval($1)/geo; 979 980 s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers 981 s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers 982 s/\/\/\s?/@ /o; # new->old style commentary 983 984 # fix up remainig new-style suffixes 985 s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or 986 s/\],#[0-9]+/]!/o; 987 988 s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or 989 s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or 990 s/vtbl\.8\s+(.*)/unvtbl($1)/geo or 991 s/vdup\.32\s+(.*)/unvdup32($1)/geo or 992 s/vmov\.32\s+(.*)/unvmov32($1)/geo or 993 s/^(\s+)b\./$1b/o or 994 s/^(\s+)mov\./$1mov/o or 995 s/^(\s+)ret/$1bx\tlr/o; 996 997 print $_,"\n"; 998 } 999} 1000 1001close STDOUT; 1002