#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# This file was adapted to AArch64 from the 32-bit version in ghash-armv4.pl.
# It implements the multiplication algorithm described in:
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
#
# The main distinction to keep in mind between 32-bit NEON and AArch64 SIMD is
# that AArch64 cannot compute over the upper halves of SIMD registers. In
# 32-bit NEON, the low and high halves of the 128-bit register q0 are
# accessible as 64-bit registers d0 and d1, respectively. In AArch64, dN is the
# lower half of vN. Where the 32-bit version would use the upper half, this
# file must keep halves in separate registers.
#
# The other distinction is in syntax. 32-bit NEON embeds lane information in
# the instruction name, while AArch64 uses suffixes on the registers. For
# instance, left-shifting the 64-bit lanes of a SIMD register in 32-bit NEON
# would be written:
#
#     vshl.i64 q0, q0, #1
#
# In 64-bit, it would be written:
#
#     shl v0.2d, v0.2d, #1
#
# See Programmer's Guide for ARMv8-A, section 7 for details.
# http://infocenter.arm.com/help/topic/com.arm.doc.den0024a/DEN0024A_v8_architecture_PG.pdf
#
# Finally, note the 8-bit and 64-bit polynomial multipliers in AArch64 differ
# only by suffix. pmull vR.8h, vA.8b, vB.8b multiplies eight pairs of 8-bit
# polynomials and is always available, while pmull vR.1q, vA.1d, vB.1d
# multiplies one pair of 64-bit polynomials and requires the PMULL extension.
# This file emulates the latter with the former.

use strict;

my $flavour = shift;
my $output;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/;
    my $dir = $1;
    my $xlate;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open OUT,"| \"$^X\" $xlate $flavour $output";
    *STDOUT=*OUT;
} else {
    open OUT,">$output";
    *STDOUT=*OUT;
}

my ($Xi, $Htbl, $inp, $len) = map("x$_", (0..3));   # argument block
my ($Xl, $Xm, $Xh, $INlo, $INhi) = map("v$_", (0..4));
my ($Hlo, $Hhi, $Hhl) = map("v$_", (5..7));
# d8-d15 are callee-saved, so avoid v8-v15. AArch64 SIMD has plenty of
# registers to spare.
my ($t0, $t1, $t2, $t3) = map("v$_", (16..19));
my ($t0l_t1l, $t0h_t1h, $t2l_t3l, $t2h_t3h) = map("v$_", (20..23));
my ($k48_k32, $k16_k0) = map("v$_", (24..25));

my $code = "";

# clmul64x64 emits code which emulates pmull $r.1q, $a.1d, $b.1d. $r, $a, and $b
# must be distinct from $t* and $k*. $t* are clobbered by the emitted code.
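#
# Roughly, the emitted code follows the construction from the paper referenced
# above: eight pmull multiplies of A and B against byte rotations of each
# other produce D = A*B, E = A*B1, F = A1*B, G = A*B2, H = A2*B, I = A*B3,
# J = A3*B and K = A*B4. The sums L = E+F, M = G+H and N = I+J, together with
# K, are masked with k48/k32/k16/k0 so that the lanes which wrapped around are
# folded back into the low halves, then shifted left by 8, 16, 24 and 32 bits
# respectively and XORed into D to reconstruct the 128-bit product.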
sub clmul64x64 {
my ($r, $a, $b) = @_;
$code .= <<___;
        ext     $t0.8b, $a.8b, $a.8b, #1        // A1
        pmull   $t0.8h, $t0.8b, $b.8b           // F = A1*B
        ext     $r.8b, $b.8b, $b.8b, #1         // B1
        pmull   $r.8h, $a.8b, $r.8b             // E = A*B1
        ext     $t1.8b, $a.8b, $a.8b, #2        // A2
        pmull   $t1.8h, $t1.8b, $b.8b           // H = A2*B
        ext     $t3.8b, $b.8b, $b.8b, #2        // B2
        pmull   $t3.8h, $a.8b, $t3.8b           // G = A*B2
        ext     $t2.8b, $a.8b, $a.8b, #3        // A3
        eor     $t0.16b, $t0.16b, $r.16b        // L = E + F
        pmull   $t2.8h, $t2.8b, $b.8b           // J = A3*B
        ext     $r.8b, $b.8b, $b.8b, #3         // B3
        eor     $t1.16b, $t1.16b, $t3.16b       // M = G + H
        pmull   $r.8h, $a.8b, $r.8b             // I = A*B3

        // Here we diverge from the 32-bit version. It computes the following
        // (instructions reordered for clarity):
        //
        // veor     \$t0#lo, \$t0#lo, \$t0#hi   @ t0 = P0 + P1 (L)
        // vand     \$t0#hi, \$t0#hi, \$k48
        // veor     \$t0#lo, \$t0#lo, \$t0#hi
        //
        // veor     \$t1#lo, \$t1#lo, \$t1#hi   @ t1 = P2 + P3 (M)
        // vand     \$t1#hi, \$t1#hi, \$k32
        // veor     \$t1#lo, \$t1#lo, \$t1#hi
        //
        // veor     \$t2#lo, \$t2#lo, \$t2#hi   @ t2 = P4 + P5 (N)
        // vand     \$t2#hi, \$t2#hi, \$k16
        // veor     \$t2#lo, \$t2#lo, \$t2#hi
        //
        // veor     \$t3#lo, \$t3#lo, \$t3#hi   @ t3 = P6 + P7 (K)
        // vmov.i64 \$t3#hi, #0
        //
        // \$kN is a mask with the bottom N bits set. AArch64 cannot compute on
        // upper halves of SIMD registers, so we must split each half into
        // separate registers. To compensate, we pair computations up and
        // parallelize.

        ext     $t3.8b, $b.8b, $b.8b, #4        // B4
        eor     $t2.16b, $t2.16b, $r.16b        // N = I + J
        pmull   $t3.8h, $a.8b, $t3.8b           // K = A*B4

        // This can probably be scheduled more efficiently. For now, we just
        // pair up independent instructions.
        zip1    $t0l_t1l.2d, $t0.2d, $t1.2d
        zip1    $t2l_t3l.2d, $t2.2d, $t3.2d
        zip2    $t0h_t1h.2d, $t0.2d, $t1.2d
        zip2    $t2h_t3h.2d, $t2.2d, $t3.2d
        eor     $t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b
        eor     $t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b
        and     $t0h_t1h.16b, $t0h_t1h.16b, $k48_k32.16b
        and     $t2h_t3h.16b, $t2h_t3h.16b, $k16_k0.16b
        eor     $t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b
        eor     $t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b
        zip1    $t0.2d, $t0l_t1l.2d, $t0h_t1h.2d
        zip1    $t2.2d, $t2l_t3l.2d, $t2h_t3h.2d
        zip2    $t1.2d, $t0l_t1l.2d, $t0h_t1h.2d
        zip2    $t3.2d, $t2l_t3l.2d, $t2h_t3h.2d

        ext     $t0.16b, $t0.16b, $t0.16b, #15  // t0 = t0 << 8
        ext     $t1.16b, $t1.16b, $t1.16b, #14  // t1 = t1 << 16
        pmull   $r.8h, $a.8b, $b.8b             // D = A*B
        ext     $t3.16b, $t3.16b, $t3.16b, #12  // t3 = t3 << 32
        ext     $t2.16b, $t2.16b, $t2.16b, #13  // t2 = t2 << 24
        eor     $t0.16b, $t0.16b, $t1.16b
        eor     $t2.16b, $t2.16b, $t3.16b
        eor     $r.16b, $r.16b, $t0.16b
        eor     $r.16b, $r.16b, $t2.16b
___
}

$code .= <<___;
#include <openssl/arm_arch.h>

.text

.global gcm_init_neon
.type   gcm_init_neon,%function
.align  4
gcm_init_neon:
        AARCH64_VALID_CALL_TARGET
        // This function is adapted from gcm_init_v8. xC2 is t3.
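        // As in gcm_init_v8: H is shifted left by one bit (H<<<=1) and, based
        // on the bit carried out, the 0xc2...01 constant is folded back in.
        // The resulting "twisted H" is stored as Htable[0] and later loaded
        // by gcm_gmult_neon and gcm_ghash_neon.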
        ld1     {$t1.2d}, [x1]                  // load H
        movi    $t3.16b, #0xe1
        shl     $t3.2d, $t3.2d, #57             // 0xc2.0
        ext     $INlo.16b, $t1.16b, $t1.16b, #8
        ushr    $t2.2d, $t3.2d, #63
        dup     $t1.4s, $t1.s[1]
        ext     $t0.16b, $t2.16b, $t3.16b, #8   // t0=0xc2....01
        ushr    $t2.2d, $INlo.2d, #63
        sshr    $t1.4s, $t1.4s, #31             // broadcast carry bit
        and     $t2.16b, $t2.16b, $t0.16b
        shl     $INlo.2d, $INlo.2d, #1
        ext     $t2.16b, $t2.16b, $t2.16b, #8
        and     $t0.16b, $t0.16b, $t1.16b
        orr     $INlo.16b, $INlo.16b, $t2.16b   // H<<<=1
        eor     $Hlo.16b, $INlo.16b, $t0.16b    // twisted H
        st1     {$Hlo.2d}, [x0]                 // store Htable[0]
        ret
.size   gcm_init_neon,.-gcm_init_neon

.global gcm_gmult_neon
.type   gcm_gmult_neon,%function
.align  4
gcm_gmult_neon:
        AARCH64_VALID_CALL_TARGET
        ld1     {$INlo.16b}, [$Xi]              // load Xi
        ld1     {$Hlo.1d}, [$Htbl], #8          // load twisted H
        ld1     {$Hhi.1d}, [$Htbl]
        adrp    x9, :pg_hi21:.Lmasks            // load constants
        add     x9, x9, :lo12:.Lmasks
        ld1     {$k48_k32.2d, $k16_k0.2d}, [x9]
        rev64   $INlo.16b, $INlo.16b            // byteswap Xi
        ext     $INlo.16b, $INlo.16b, $INlo.16b, #8
        eor     $Hhl.8b, $Hlo.8b, $Hhi.8b       // Karatsuba pre-processing

        mov     $len, #16
        b       .Lgmult_neon
.size   gcm_gmult_neon,.-gcm_gmult_neon

.global gcm_ghash_neon
.type   gcm_ghash_neon,%function
.align  4
gcm_ghash_neon:
        AARCH64_VALID_CALL_TARGET
        ld1     {$Xl.16b}, [$Xi]                // load Xi
        ld1     {$Hlo.1d}, [$Htbl], #8          // load twisted H
        ld1     {$Hhi.1d}, [$Htbl]
        adrp    x9, :pg_hi21:.Lmasks            // load constants
        add     x9, x9, :lo12:.Lmasks
        ld1     {$k48_k32.2d, $k16_k0.2d}, [x9]
        rev64   $Xl.16b, $Xl.16b                // byteswap Xi
        ext     $Xl.16b, $Xl.16b, $Xl.16b, #8
        eor     $Hhl.8b, $Hlo.8b, $Hhi.8b       // Karatsuba pre-processing

.Loop_neon:
        ld1     {$INlo.16b}, [$inp], #16        // load inp
        rev64   $INlo.16b, $INlo.16b            // byteswap inp
        ext     $INlo.16b, $INlo.16b, $INlo.16b, #8
        eor     $INlo.16b, $INlo.16b, $Xl.16b   // inp ^= Xi

.Lgmult_neon:
        // Split the input into $INlo and $INhi. (The upper halves are unused,
        // so it is okay to leave them alone.)
        ins     $INhi.d[0], $INlo.d[1]
___
&clmul64x64     ($Xl, $Hlo, $INlo);             # H.lo·Xi.lo
$code .= <<___;
        eor     $INlo.8b, $INlo.8b, $INhi.8b    // Karatsuba pre-processing
___
&clmul64x64     ($Xm, $Hhl, $INlo);             # (H.lo+H.hi)·(Xi.lo+Xi.hi)
&clmul64x64     ($Xh, $Hhi, $INhi);             # H.hi·Xi.hi
$code .= <<___;
        ext     $t0.16b, $Xl.16b, $Xh.16b, #8
        eor     $Xm.16b, $Xm.16b, $Xl.16b       // Karatsuba post-processing
        eor     $Xm.16b, $Xm.16b, $Xh.16b
        eor     $Xm.16b, $Xm.16b, $t0.16b       // Xm overlaps Xh.lo and Xl.hi
        ins     $Xl.d[1], $Xm.d[0]              // Xh|Xl - 256-bit result
        // This is a no-op due to the ins instruction below.
        // ins  $Xh.d[0], $Xm.d[1]

        // equivalent of reduction_avx from ghash-x86_64.pl
        shl     $t1.2d, $Xl.2d, #57             // 1st phase
        shl     $t2.2d, $Xl.2d, #62
        eor     $t2.16b, $t2.16b, $t1.16b       //
        shl     $t1.2d, $Xl.2d, #63
        eor     $t2.16b, $t2.16b, $t1.16b       //
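        // t2 now holds the three shifted copies of Xl XORed together; the eor
        // with Xm below folds them into the middle words Xl.d[1] and Xh.d[0].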
        // Note Xm contains {Xl.d[1], Xh.d[0]}.
        eor     $t2.16b, $t2.16b, $Xm.16b
        ins     $Xl.d[1], $t2.d[0]              // Xl.d[1] ^= t2.d[0]
        ins     $Xh.d[0], $t2.d[1]              // Xh.d[0] ^= t2.d[1]

        ushr    $t2.2d, $Xl.2d, #1              // 2nd phase
        eor     $Xh.16b, $Xh.16b, $Xl.16b
        eor     $Xl.16b, $Xl.16b, $t2.16b       //
        ushr    $t2.2d, $t2.2d, #6
        ushr    $Xl.2d, $Xl.2d, #1              //
        eor     $Xl.16b, $Xl.16b, $Xh.16b       //
        eor     $Xl.16b, $Xl.16b, $t2.16b       //

        subs    $len, $len, #16
        bne     .Loop_neon

        rev64   $Xl.16b, $Xl.16b                // byteswap Xi and write
        ext     $Xl.16b, $Xl.16b, $Xl.16b, #8
        st1     {$Xl.16b}, [$Xi]

        ret
.size   gcm_ghash_neon,.-gcm_ghash_neon

.section        .rodata
.align  4
.Lmasks:
.quad   0x0000ffffffffffff      // k48
.quad   0x00000000ffffffff      // k32
.quad   0x000000000000ffff      // k16
.quad   0x0000000000000000      // k0
.asciz  "GHASH for ARMv8, derived from ARMv4 version by <appro\@openssl.org>"
.align  2
___

foreach (split("\n",$code)) {
        s/\`([^\`]*)\`/eval $1/geo;

        print $_,"\n";
}
close STDOUT or die "error closing STDOUT"; # enforce flush