#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big- and
# little-endian cases. It also supports both 32- and 64-bit modes
# of operation. Latter is achieved by limiting amount of utilized
# registers to 16, which implies additional NEON load and integer
# instructions. This has no effect on mighty Apple A7, where results
# are literally equal to the theoretical estimates based on AES
# instruction latencies and issue rates. On Cortex-A53, an in-order
# execution core, this costs up to 10-15%, which is partially
# compensated by implementing dedicated code path for 128-bit
# CBC encrypt case. On Cortex-A57 parallelizable mode performance
# seems to be limited by sheer amount of NEON instructions...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec		CTR
# Apple A7	2.39		1.20		1.20
# Cortex-A53	1.32		1.29		1.46
# Cortex-A57(*)	1.95		0.85		0.93
# Denver	1.96		0.86		0.80
# Mongoose	1.33		1.20		1.20
#
# (*)	original 3.64/1.34/1.32 results were for r0p0 revision
#	and are still same even for updated module;

# Command line: <flavour> <output>. Both values are forwarded verbatim to
# arm-xlate.pl; a flavour containing "64" selects the AArch64 code paths
# below, anything else selects the 32-bit ARM paths.
$flavour = shift;
$output  = shift;

# Locate arm-xlate.pl, first next to this script, then in the shared
# perlasm directory three levels up.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Everything printed to STDOUT from here on is piped through arm-xlate.pl.
# Fail loudly if the translator cannot be started; otherwise the build
# would silently end up with missing output.
open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Prefix shared by every exported symbol generated below.
$prefix="aes_hw";

$code=<<___;
#include <openssl/arm_arch.h>

#if __ARM_MAX_ARCH__>=7
.text
___
$code.=".arch	armv8-a+crypto\n"			if ($flavour =~ /64/);
$code.=<<___						if ($flavour !~ /64/);
.arch	armv7-a	// don't confuse not-so-latest binutils with argv8 :-)
.fpu	neon
.code	32
#undef	__thumb2__
___

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
# maintain both 32- and 64-bit codes within single module and
# transliterate common code to either flavour with regex voodoo.
#
# ${prefix}_set_encrypt_key / ${prefix}_set_decrypt_key
#
# set_encrypt_key takes the user key pointer, the key length in bits and
# the output key-schedule pointer. It returns -1 if either pointer is
# NULL, -2 if the bit length is outside 128..256 or not a multiple of 64,
# and 0 on success. The round count (10/12/14) is stored right after the
# expanded schedule; the other routines read it back from offset 240.
#
# set_decrypt_key calls into set_encrypt_key (via .Lenc_key) and then
# reverses the order of the round keys in place, applying aesimc to all
# of them except the first and the last.
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


# On AArch64, put the data in .rodata and use adrp + add for compatibility
# with execute-only memory. On AArch32, put it in .text and use adr.
$code.= ".section .rodata\n" if ($flavour =~ /64/);
$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.text

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

___
$code.=<<___	if ($flavour =~ /64/);
	adrp	$ptr,:pg_hi21:.Lrcon
	add	$ptr,$ptr,:lo12:.Lrcon
___
$code.=<<___	if ($flavour !~ /64/);
	adr	$ptr,.Lrcon
___
$code.=<<___;
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#8
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
# gen_block($dir) appends the single-block routine ${prefix}_${dir}crypt to
# $code. $dir is "en" or "de": "en" selects the aese/aesmc instruction
# pair, anything else selects aesd/aesimc. The generated routine takes
# (in, out, key), reads the round count from offset 240 of the key
# schedule and processes exactly one 16-byte block.
sub gen_block {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..2));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
gen_block("en");
gen_block("de");
}}}
# ${prefix}_cbc_encrypt
#
# Register arguments are (in, out, len, key, ivec, enc); in the 32-bit
# build the last two are fetched from the stack. Inputs shorter than 16
# bytes return immediately; otherwise len is rounded down to a multiple
# of 16. A non-zero enc encrypts (with a dedicated path for 10-round,
# i.e. 128-bit, keys), zero decrypts three blocks per iteration with a
# one/two block tail. The final chaining value is written back to ivec.
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}            @ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	$dat,q8
	aesmc	$dat,$dat
	vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
	aese	$dat,q9
	aesmc	$dat,$dat
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop

.Lcbc_enc192:
	aese	$dat,q8
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	vld1.32	{q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
# The decrypt path needs a third data/input/temporary register set.
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2

.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	veor	$tmp0,$ivec,$rndlast
	subs	$len,$len,#0x30
	veor	$tmp1,$in0,$rndlast
	mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	veor	$tmp2,$in1,$rndlast
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	vorr	$ivec,$in2,$in2
	mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
# ${prefix}_ctr32_encrypt_blocks
#
# Register arguments are (in, out, len, key, ivec); in the 32-bit build
# ivec is fetched from the stack. len is compared against block counts
# (2, 3, 1) rather than byte counts. The counter is the 32-bit word at
# byte offset 12 of ivec; it is byte-reversed before being incremented
# unless __ARMEB__ is defined. Three blocks are processed per iteration,
# with a one/two block tail.
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r10,lr}
	vstmdb	sp!,{d8-d15}            @ ABI specification says so
	ldr	r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr	$rounds,[$key,#240]

	ldr	$ctr, [$ivp, #12]
	vld1.32	{$dat0},[$ivp]

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#4
	mov	$step,#16
	cmp	$len,#2
	add	$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]
	add	$key_,$key,#32
	mov	$cnt,$rounds
	cclr	$step,lo
#ifndef __ARMEB__
	rev	$ctr, $ctr
#endif
	vorr	$dat1,$dat0,$dat0
	add	$tctr1, $ctr, #1
	vorr	$dat2,$dat0,$dat0
	add	$ctr, $ctr, #2
	vorr	$ivec,$dat0,$dat0
	rev	$tctr1, $tctr1
	vmov.32	${dat1}[3],$tctr1
	b.ls	.Lctr32_tail
	rev	$tctr2, $ctr
	sub	$len,$len,#3		// bias
	vmov.32	${dat2}[3],$tctr2
	b	.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ctr32

	aese	$dat0,q8
	aesmc	$tmp0,$dat0
	aese	$dat1,q8
	aesmc	$tmp1,$dat1
	vld1.8	{$in0},[$inp],#16
	vorr	$dat0,$ivec,$ivec
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	vorr	$dat1,$ivec,$ivec
	aese	$tmp0,q9
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q9
	aesmc	$tmp1,$tmp1
	vld1.8	{$in2},[$inp],#16
	mov	$key_,$key
	aese	$dat2,q9
	aesmc	$tmp2,$dat2
	vorr	$dat2,$ivec,$ivec
	add	$tctr0,$ctr,#1
	aese	$tmp0,q12
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q12
	aesmc	$tmp1,$tmp1
	veor	$in0,$in0,$rndlast
	add	$tctr1,$ctr,#2
	aese	$tmp2,q12
	aesmc	$tmp2,$tmp2
	veor	$in1,$in1,$rndlast
	add	$ctr,$ctr,#3
	aese	$tmp0,q13
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q13
	aesmc	$tmp1,$tmp1
	veor	$in2,$in2,$rndlast
	rev	$tctr0,$tctr0
	aese	$tmp2,q13
	aesmc	$tmp2,$tmp2
	vmov.32	${dat0}[3], $tctr0
	rev	$tctr1,$tctr1
	aese	$tmp0,q14
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q14
	aesmc	$tmp1,$tmp1
	vmov.32	${dat1}[3], $tctr1
	rev	$tctr2,$ctr
	aese	$tmp2,q14
	aesmc	$tmp2,$tmp2
	vmov.32	${dat2}[3], $tctr2
	subs	$len,$len,#3
	aese	$tmp0,q15
	aese	$tmp1,q15
	aese	$tmp2,q15

	veor	$in0,$in0,$tmp0
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8	{$in0},[$out],#16
	veor	$in1,$in1,$tmp1
	mov	$cnt,$rounds
	vst1.8	{$in1},[$out],#16
	veor	$in2,$in2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$in2},[$out],#16
	b.hs	.Loop3x_ctr32

	adds	$len,$len,#3
	b.eq	.Lctr32_done
	cmp	$len,#1
	mov	$step,#16
	cclr	$step,eq

.Lctr32_tail:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	vld1.32	{q9},[$key_],#16
	b.gt	.Lctr32_tail

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	vld1.8	{$in0},[$inp],$step
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	vld1.8	{$in1},[$inp]
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	veor	$in0,$in0,$rndlast
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	veor	$in1,$in1,$rndlast
	aese	$dat0,q15
	aese	$dat1,q15

	cmp	$len,#1
	veor	$in0,$in0,$dat0
	veor	$in1,$in1,$dat1
	vst1.8	{$in0},[$out],#16
	b.eq	.Lctr32_done
	vst1.8	{$in1},[$out]

.Lctr32_done:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
# Closes the "#if __ARM_MAX_ARCH__>=7" opened at the top of $code.
$code.=<<___;
#endif
___
########################################
# Post-processing: $code was written in a mixed 32-/64-bit dialect. Walk it
# line by line, expand `...` snippets with eval, rewrite registers and
# mnemonics for the selected flavour and print the result to STDOUT.
if ($flavour =~ /64/) {			######## 64-bit code
    # Raw AArch64 encodings of the AES instructions. Only used by unaes
    # below, whose call site in the loop is currently commented out.
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    # unaes($mnemonic, $operands): encode "aesX Vd,Vn" as a .inst
    # directive. Returns false if the operands do not look like two
    # q/v registers.
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };

    foreach(split("\n",$code)) {
	# evaluate `...` snippets embedded in the assembly text
	s/\`([^\`]*)\`/eval($1)/geo;

	# q0-q7 map to v0-v7, q8 and up map to v16 and up
	# NOTE(review): presumably this keeps clear of v8-v15, which are
	# partially callee-saved on AArch64 -- confirm against the ABI.
	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	# At most one of the following rewrites is applied per line.
	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    # Base ARMv7/AArch32 encodings of the AES instructions; register
    # fields are OR-ed in by unaes.
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    # unaes($mnemonic, $operands): encode "aesX qD,qM" as four .byte
    # values, keeping the original instruction as a trailing comment.
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };

    # unvtbl("qD,{qN},qM"): split a q-register vtbl.8 into two d-register
    # vtbl.8 instructions, one per half of the destination and index.
    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }

    # unvdup32("qD,qM[i]"): rewrite the lane reference qM[i] as the
    # d-register lane that holds the same 32-bit element.
    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    # unvmov32("qD[i],src"): same lane rewrite for vmov.32 destinations.
    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }

    foreach(split("\n",$code)) {
	# evaluate `...` snippets embedded in the assembly text
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	# At most one of the following rewrites is applied per line.
	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)mov\./$1mov/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

# STDOUT is a pipe into arm-xlate.pl. close() reports both flush errors
# and a non-zero exit status of that child, so check it instead of letting
# a truncated or missing output file go unnoticed. When only the child
# failed, $! is 0 and the status is in $?.
close STDOUT
    or die "error closing STDOUT: ", ($! ? $! : "child exit status $?"), "\n";