1#! /usr/bin/env perl 2# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9# 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16# 17# April 2010 18# 19# The module implements "4-bit" GCM GHASH function and underlying 20# single multiplication operation in GF(2^128). "4-bit" means that it 21# uses 256 bytes per-key table [+32 bytes shared table]. There is no 22# experimental performance data available yet. The only approximation 23# that can be made at this point is based on code size. Inner loop is 24# 32 instructions long and on single-issue core should execute in <40 25# cycles. Having verified that gcc 3.4 didn't unroll corresponding 26# loop, this assembler loop body was found to be ~3x smaller than 27# compiler-generated one... 28# 29# July 2010 30# 31# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on 32# Cortex A8 core and ~25 cycles per processed byte (which was observed 33# to be ~3 times faster than gcc-generated code:-) 34# 35# February 2011 36# 37# Profiler-assisted and platform-specific optimization resulted in 7% 38# improvement on Cortex A8 core and ~23.5 cycles per byte. 39# 40# March 2011 41# 42# Add NEON implementation featuring polynomial multiplication, i.e. no 43# lookup tables involved. On Cortex A8 it was measured to process one 44# byte in 15 cycles or 55% faster than integer-only code. 45# 46# April 2014 47# 48# Switch to multiplication algorithm suggested in paper referred 49# below and combine it with reduction algorithm from x86 module. 50# Performance improvement over previous version varies from 65% on 51# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8 52# processes one byte in 8.45 cycles, A9 - in 10.2, A15 - in 7.63, 53# Snapdragon S4 - in 9.33. 54# 55# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software 56# Polynomial Multiplication on ARM Processors using the NEON Engine. 57# 58# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf 59 60# ==================================================================== 61# Note about "528B" variant. In ARM case it makes lesser sense to 62# implement it for following reasons: 63# 64# - performance improvement won't be anywhere near 50%, because 128- 65# bit shift operation is neatly fused with 128-bit xor here, and 66# "538B" variant would eliminate only 4-5 instructions out of 32 67# in the inner loop (meaning that estimated improvement is ~15%); 68# - ARM-based systems are often embedded ones and extra memory 69# consumption might be unappreciated (for so little improvement); 70# 71# Byte order [in]dependence. ========================================= 72# 73# Caller is expected to maintain specific *dword* order in Htable, 74# namely with *least* significant dword of 128-bit value at *lower* 75# address. This differs completely from C code and has everything to 76# do with ldm instruction and order in which dwords are "consumed" by 77# algorithm. *Byte* order within these dwords in turn is whatever 78# *native* byte order on current platform. See gcm128.c for working 79# example... 80 81$flavour = shift; 82if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } 83else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } 84 85if ($flavour && $flavour ne "void") { 86 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 87 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or 88 ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or 89 die "can't locate arm-xlate.pl"; 90 91 open STDOUT,"| \"$^X\" $xlate $flavour $output"; 92} else { 93 open STDOUT,">$output"; 94} 95 96$Xi="r0"; # argument block 97$Htbl="r1"; 98$inp="r2"; 99$len="r3"; 100 101$Zll="r4"; # variables 102$Zlh="r5"; 103$Zhl="r6"; 104$Zhh="r7"; 105$Tll="r8"; 106$Tlh="r9"; 107$Thl="r10"; 108$Thh="r11"; 109$nlo="r12"; 110################# r13 is stack pointer 111$nhi="r14"; 112################# r15 is program counter 113 114$rem_4bit=$inp; # used in gcm_gmult_4bit 115$cnt=$len; 116 117sub Zsmash() { 118 my $i=12; 119 my @args=@_; 120 for ($Zll,$Zlh,$Zhl,$Zhh) { 121 $code.=<<___; 122#if __ARM_ARCH__>=7 && defined(__ARMEL__) 123 rev $_,$_ 124 str $_,[$Xi,#$i] 125#elif defined(__ARMEB__) 126 str $_,[$Xi,#$i] 127#else 128 mov $Tlh,$_,lsr#8 129 strb $_,[$Xi,#$i+3] 130 mov $Thl,$_,lsr#16 131 strb $Tlh,[$Xi,#$i+2] 132 mov $Thh,$_,lsr#24 133 strb $Thl,[$Xi,#$i+1] 134 strb $Thh,[$Xi,#$i] 135#endif 136___ 137 $code.="\t".shift(@args)."\n"; 138 $i-=4; 139 } 140} 141 142$code=<<___; 143#include <openssl/arm_arch.h> 144 145@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both 146@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL 147@ instructions are in aesv8-armx.pl.) 148.arch armv7-a 149 150.text 151#if defined(__thumb2__) || defined(__clang__) 152.syntax unified 153#endif 154#if defined(__thumb2__) 155.thumb 156#else 157.code 32 158#endif 159 160#ifdef __clang__ 161#define ldrplb ldrbpl 162#define ldrneb ldrbne 163#endif 164 165.type rem_4bit,%object 166.align 5 167rem_4bit: 168.short 0x0000,0x1C20,0x3840,0x2460 169.short 0x7080,0x6CA0,0x48C0,0x54E0 170.short 0xE100,0xFD20,0xD940,0xC560 171.short 0x9180,0x8DA0,0xA9C0,0xB5E0 172.size rem_4bit,.-rem_4bit 173 174.type rem_4bit_get,%function 175rem_4bit_get: 176#if defined(__thumb2__) 177 adr $rem_4bit,rem_4bit 178#else 179 sub $rem_4bit,pc,#8+32 @ &rem_4bit 180#endif 181 b .Lrem_4bit_got 182 nop 183 nop 184.size rem_4bit_get,.-rem_4bit_get 185 186.global gcm_ghash_4bit 187.type gcm_ghash_4bit,%function 188.align 4 189gcm_ghash_4bit: 190#if defined(__thumb2__) 191 adr r12,rem_4bit 192#else 193 sub r12,pc,#8+48 @ &rem_4bit 194#endif 195 add $len,$inp,$len @ $len to point at the end 196 stmdb sp!,{r3-r11,lr} @ save $len/end too 197 198 ldmia r12,{r4-r11} @ copy rem_4bit ... 199 stmdb sp!,{r4-r11} @ ... to stack 200 201 ldrb $nlo,[$inp,#15] 202 ldrb $nhi,[$Xi,#15] 203.Louter: 204 eor $nlo,$nlo,$nhi 205 and $nhi,$nlo,#0xf0 206 and $nlo,$nlo,#0x0f 207 mov $cnt,#14 208 209 add $Zhh,$Htbl,$nlo,lsl#4 210 ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo] 211 add $Thh,$Htbl,$nhi 212 ldrb $nlo,[$inp,#14] 213 214 and $nhi,$Zll,#0xf @ rem 215 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] 216 add $nhi,$nhi,$nhi 217 eor $Zll,$Tll,$Zll,lsr#4 218 ldrh $Tll,[sp,$nhi] @ rem_4bit[rem] 219 eor $Zll,$Zll,$Zlh,lsl#28 220 ldrb $nhi,[$Xi,#14] 221 eor $Zlh,$Tlh,$Zlh,lsr#4 222 eor $Zlh,$Zlh,$Zhl,lsl#28 223 eor $Zhl,$Thl,$Zhl,lsr#4 224 eor $Zhl,$Zhl,$Zhh,lsl#28 225 eor $Zhh,$Thh,$Zhh,lsr#4 226 eor $nlo,$nlo,$nhi 227 and $nhi,$nlo,#0xf0 228 and $nlo,$nlo,#0x0f 229 eor $Zhh,$Zhh,$Tll,lsl#16 230 231.Linner: 232 add $Thh,$Htbl,$nlo,lsl#4 233 and $nlo,$Zll,#0xf @ rem 234 subs $cnt,$cnt,#1 235 add $nlo,$nlo,$nlo 236 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo] 237 eor $Zll,$Tll,$Zll,lsr#4 238 eor $Zll,$Zll,$Zlh,lsl#28 239 eor $Zlh,$Tlh,$Zlh,lsr#4 240 eor $Zlh,$Zlh,$Zhl,lsl#28 241 ldrh $Tll,[sp,$nlo] @ rem_4bit[rem] 242 eor $Zhl,$Thl,$Zhl,lsr#4 243#ifdef __thumb2__ 244 it pl 245#endif 246 ldrplb $nlo,[$inp,$cnt] 247 eor $Zhl,$Zhl,$Zhh,lsl#28 248 eor $Zhh,$Thh,$Zhh,lsr#4 249 250 add $Thh,$Htbl,$nhi 251 and $nhi,$Zll,#0xf @ rem 252 eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] 253 add $nhi,$nhi,$nhi 254 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] 255 eor $Zll,$Tll,$Zll,lsr#4 256#ifdef __thumb2__ 257 it pl 258#endif 259 ldrplb $Tll,[$Xi,$cnt] 260 eor $Zll,$Zll,$Zlh,lsl#28 261 eor $Zlh,$Tlh,$Zlh,lsr#4 262 ldrh $Tlh,[sp,$nhi] 263 eor $Zlh,$Zlh,$Zhl,lsl#28 264 eor $Zhl,$Thl,$Zhl,lsr#4 265 eor $Zhl,$Zhl,$Zhh,lsl#28 266#ifdef __thumb2__ 267 it pl 268#endif 269 eorpl $nlo,$nlo,$Tll 270 eor $Zhh,$Thh,$Zhh,lsr#4 271#ifdef __thumb2__ 272 itt pl 273#endif 274 andpl $nhi,$nlo,#0xf0 275 andpl $nlo,$nlo,#0x0f 276 eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem] 277 bpl .Linner 278 279 ldr $len,[sp,#32] @ re-load $len/end 280 add $inp,$inp,#16 281 mov $nhi,$Zll 282___ 283 &Zsmash("cmp\t$inp,$len","\n". 284 "#ifdef __thumb2__\n". 285 " it ne\n". 286 "#endif\n". 287 " ldrneb $nlo,[$inp,#15]"); 288$code.=<<___; 289 bne .Louter 290 291 add sp,sp,#36 292#if __ARM_ARCH__>=5 293 ldmia sp!,{r4-r11,pc} 294#else 295 ldmia sp!,{r4-r11,lr} 296 tst lr,#1 297 moveq pc,lr @ be binary compatible with V4, yet 298 bx lr @ interoperable with Thumb ISA:-) 299#endif 300.size gcm_ghash_4bit,.-gcm_ghash_4bit 301 302.global gcm_gmult_4bit 303.type gcm_gmult_4bit,%function 304gcm_gmult_4bit: 305 stmdb sp!,{r4-r11,lr} 306 ldrb $nlo,[$Xi,#15] 307 b rem_4bit_get 308.Lrem_4bit_got: 309 and $nhi,$nlo,#0xf0 310 and $nlo,$nlo,#0x0f 311 mov $cnt,#14 312 313 add $Zhh,$Htbl,$nlo,lsl#4 314 ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo] 315 ldrb $nlo,[$Xi,#14] 316 317 add $Thh,$Htbl,$nhi 318 and $nhi,$Zll,#0xf @ rem 319 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] 320 add $nhi,$nhi,$nhi 321 eor $Zll,$Tll,$Zll,lsr#4 322 ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem] 323 eor $Zll,$Zll,$Zlh,lsl#28 324 eor $Zlh,$Tlh,$Zlh,lsr#4 325 eor $Zlh,$Zlh,$Zhl,lsl#28 326 eor $Zhl,$Thl,$Zhl,lsr#4 327 eor $Zhl,$Zhl,$Zhh,lsl#28 328 eor $Zhh,$Thh,$Zhh,lsr#4 329 and $nhi,$nlo,#0xf0 330 eor $Zhh,$Zhh,$Tll,lsl#16 331 and $nlo,$nlo,#0x0f 332 333.Loop: 334 add $Thh,$Htbl,$nlo,lsl#4 335 and $nlo,$Zll,#0xf @ rem 336 subs $cnt,$cnt,#1 337 add $nlo,$nlo,$nlo 338 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo] 339 eor $Zll,$Tll,$Zll,lsr#4 340 eor $Zll,$Zll,$Zlh,lsl#28 341 eor $Zlh,$Tlh,$Zlh,lsr#4 342 eor $Zlh,$Zlh,$Zhl,lsl#28 343 ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem] 344 eor $Zhl,$Thl,$Zhl,lsr#4 345#ifdef __thumb2__ 346 it pl 347#endif 348 ldrplb $nlo,[$Xi,$cnt] 349 eor $Zhl,$Zhl,$Zhh,lsl#28 350 eor $Zhh,$Thh,$Zhh,lsr#4 351 352 add $Thh,$Htbl,$nhi 353 and $nhi,$Zll,#0xf @ rem 354 eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] 355 add $nhi,$nhi,$nhi 356 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] 357 eor $Zll,$Tll,$Zll,lsr#4 358 eor $Zll,$Zll,$Zlh,lsl#28 359 eor $Zlh,$Tlh,$Zlh,lsr#4 360 ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem] 361 eor $Zlh,$Zlh,$Zhl,lsl#28 362 eor $Zhl,$Thl,$Zhl,lsr#4 363 eor $Zhl,$Zhl,$Zhh,lsl#28 364 eor $Zhh,$Thh,$Zhh,lsr#4 365#ifdef __thumb2__ 366 itt pl 367#endif 368 andpl $nhi,$nlo,#0xf0 369 andpl $nlo,$nlo,#0x0f 370 eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] 371 bpl .Loop 372___ 373 &Zsmash(); 374$code.=<<___; 375#if __ARM_ARCH__>=5 376 ldmia sp!,{r4-r11,pc} 377#else 378 ldmia sp!,{r4-r11,lr} 379 tst lr,#1 380 moveq pc,lr @ be binary compatible with V4, yet 381 bx lr @ interoperable with Thumb ISA:-) 382#endif 383.size gcm_gmult_4bit,.-gcm_gmult_4bit 384___ 385{ 386my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3)); 387my ($t0,$t1,$t2,$t3)=map("q$_",(8..12)); 388my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31)); 389 390sub clmul64x64 { 391my ($r,$a,$b)=@_; 392$code.=<<___; 393 vext.8 $t0#lo, $a, $a, #1 @ A1 394 vmull.p8 $t0, $t0#lo, $b @ F = A1*B 395 vext.8 $r#lo, $b, $b, #1 @ B1 396 vmull.p8 $r, $a, $r#lo @ E = A*B1 397 vext.8 $t1#lo, $a, $a, #2 @ A2 398 vmull.p8 $t1, $t1#lo, $b @ H = A2*B 399 vext.8 $t3#lo, $b, $b, #2 @ B2 400 vmull.p8 $t3, $a, $t3#lo @ G = A*B2 401 vext.8 $t2#lo, $a, $a, #3 @ A3 402 veor $t0, $t0, $r @ L = E + F 403 vmull.p8 $t2, $t2#lo, $b @ J = A3*B 404 vext.8 $r#lo, $b, $b, #3 @ B3 405 veor $t1, $t1, $t3 @ M = G + H 406 vmull.p8 $r, $a, $r#lo @ I = A*B3 407 veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8 408 vand $t0#hi, $t0#hi, $k48 409 vext.8 $t3#lo, $b, $b, #4 @ B4 410 veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16 411 vand $t1#hi, $t1#hi, $k32 412 vmull.p8 $t3, $a, $t3#lo @ K = A*B4 413 veor $t2, $t2, $r @ N = I + J 414 veor $t0#lo, $t0#lo, $t0#hi 415 veor $t1#lo, $t1#lo, $t1#hi 416 veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24 417 vand $t2#hi, $t2#hi, $k16 418 vext.8 $t0, $t0, $t0, #15 419 veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32 420 vmov.i64 $t3#hi, #0 421 vext.8 $t1, $t1, $t1, #14 422 veor $t2#lo, $t2#lo, $t2#hi 423 vmull.p8 $r, $a, $b @ D = A*B 424 vext.8 $t3, $t3, $t3, #12 425 vext.8 $t2, $t2, $t2, #13 426 veor $t0, $t0, $t1 427 veor $t2, $t2, $t3 428 veor $r, $r, $t0 429 veor $r, $r, $t2 430___ 431} 432 433$code.=<<___; 434#if __ARM_MAX_ARCH__>=7 435.arch armv7-a 436.fpu neon 437 438.global gcm_init_neon 439.type gcm_init_neon,%function 440.align 4 441gcm_init_neon: 442 vld1.64 $IN#hi,[r1]! @ load H 443 vmov.i8 $t0,#0xe1 444 vld1.64 $IN#lo,[r1] 445 vshl.i64 $t0#hi,#57 446 vshr.u64 $t0#lo,#63 @ t0=0xc2....01 447 vdup.8 $t1,$IN#hi[7] 448 vshr.u64 $Hlo,$IN#lo,#63 449 vshr.s8 $t1,#7 @ broadcast carry bit 450 vshl.i64 $IN,$IN,#1 451 vand $t0,$t0,$t1 452 vorr $IN#hi,$Hlo @ H<<<=1 453 veor $IN,$IN,$t0 @ twisted H 454 vstmia r0,{$IN} 455 456 ret @ bx lr 457.size gcm_init_neon,.-gcm_init_neon 458 459.global gcm_gmult_neon 460.type gcm_gmult_neon,%function 461.align 4 462gcm_gmult_neon: 463 vld1.64 $IN#hi,[$Xi]! @ load Xi 464 vld1.64 $IN#lo,[$Xi]! 465 vmov.i64 $k48,#0x0000ffffffffffff 466 vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H 467 vmov.i64 $k32,#0x00000000ffffffff 468#ifdef __ARMEL__ 469 vrev64.8 $IN,$IN 470#endif 471 vmov.i64 $k16,#0x000000000000ffff 472 veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing 473 mov $len,#16 474 b .Lgmult_neon 475.size gcm_gmult_neon,.-gcm_gmult_neon 476 477.global gcm_ghash_neon 478.type gcm_ghash_neon,%function 479.align 4 480gcm_ghash_neon: 481 vld1.64 $Xl#hi,[$Xi]! @ load Xi 482 vld1.64 $Xl#lo,[$Xi]! 483 vmov.i64 $k48,#0x0000ffffffffffff 484 vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H 485 vmov.i64 $k32,#0x00000000ffffffff 486#ifdef __ARMEL__ 487 vrev64.8 $Xl,$Xl 488#endif 489 vmov.i64 $k16,#0x000000000000ffff 490 veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing 491 492.Loop_neon: 493 vld1.64 $IN#hi,[$inp]! @ load inp 494 vld1.64 $IN#lo,[$inp]! 495#ifdef __ARMEL__ 496 vrev64.8 $IN,$IN 497#endif 498 veor $IN,$Xl @ inp^=Xi 499.Lgmult_neon: 500___ 501 &clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.lo·Xi.lo 502$code.=<<___; 503 veor $IN#lo,$IN#lo,$IN#hi @ Karatsuba pre-processing 504___ 505 &clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)·(Xi.lo+Xi.hi) 506 &clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hi·Xi.hi 507$code.=<<___; 508 veor $Xm,$Xm,$Xl @ Karatsuba post-processing 509 veor $Xm,$Xm,$Xh 510 veor $Xl#hi,$Xl#hi,$Xm#lo 511 veor $Xh#lo,$Xh#lo,$Xm#hi @ Xh|Xl - 256-bit result 512 513 @ equivalent of reduction_avx from ghash-x86_64.pl 514 vshl.i64 $t1,$Xl,#57 @ 1st phase 515 vshl.i64 $t2,$Xl,#62 516 veor $t2,$t2,$t1 @ 517 vshl.i64 $t1,$Xl,#63 518 veor $t2, $t2, $t1 @ 519 veor $Xl#hi,$Xl#hi,$t2#lo @ 520 veor $Xh#lo,$Xh#lo,$t2#hi 521 522 vshr.u64 $t2,$Xl,#1 @ 2nd phase 523 veor $Xh,$Xh,$Xl 524 veor $Xl,$Xl,$t2 @ 525 vshr.u64 $t2,$t2,#6 526 vshr.u64 $Xl,$Xl,#1 @ 527 veor $Xl,$Xl,$Xh @ 528 veor $Xl,$Xl,$t2 @ 529 530 subs $len,#16 531 bne .Loop_neon 532 533#ifdef __ARMEL__ 534 vrev64.8 $Xl,$Xl 535#endif 536 sub $Xi,#16 537 vst1.64 $Xl#hi,[$Xi]! @ write out Xi 538 vst1.64 $Xl#lo,[$Xi] 539 540 ret @ bx lr 541.size gcm_ghash_neon,.-gcm_ghash_neon 542#endif 543___ 544} 545$code.=<<___; 546.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" 547.align 2 548___ 549 550foreach (split("\n",$code)) { 551 s/\`([^\`]*)\`/eval $1/geo; 552 553 s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or 554 s/\bret\b/bx lr/go or 555 s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 556 557 print $_,"\n"; 558} 559close STDOUT; # enforce flush 560