// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
//
// GHASH (GCM authentication) for ARMv8 using the Crypto Extensions
// (PMULL/PMULL2 polynomial multiply). Field elements live in the
// "twisted" (Montgomery-style) representation; reduction uses the
// 0xc2..0 constant composed in v19. Syntax: GNU as, AArch64, AAPCS64.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

.text
.arch	armv8-a+crypto

//----------------------------------------------------------------------
// void gcm_init_v8(u128 Htable[], const uint64_t H[2])
// In:    x0 = Htable (output table), x1 = H (hash subkey)
// Out:   Htable[0..2] = twisted H, packed Karatsuba term, H^2
// Clobbers: v0-v3, v16-v22, flags untouched; no stack use (leaf).
//----------------------------------------------------------------------
.globl	gcm_init_v8
.hidden	gcm_init_v8
.type	gcm_init_v8,%function
.align	4
gcm_init_v8:
	AARCH64_VALID_CALL_TARGET
	ld1	{v17.2d},[x1]			//load input H
	movi	v19.16b,#0xe1
	shl	v19.2d,v19.2d,#57		//0xc2.0
	ext	v3.16b,v17.16b,v17.16b,#8
	ushr	v18.2d,v19.2d,#63
	dup	v17.4s,v17.s[1]
	ext	v16.16b,v18.16b,v19.16b,#8	//t0=0xc2....01
	ushr	v18.2d,v3.2d,#63
	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
	and	v18.16b,v18.16b,v16.16b
	shl	v3.2d,v3.2d,#1
	ext	v18.16b,v18.16b,v18.16b,#8
	and	v16.16b,v16.16b,v17.16b
	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
	eor	v20.16b,v3.16b,v16.16b		//twisted H
	st1	{v20.2d},[x0],#16		//store Htable[0]

	//calculate H^2
	ext	v16.16b,v20.16b,v20.16b,#8	//Karatsuba pre-processing
	pmull	v0.1q,v20.1d,v20.1d
	eor	v16.16b,v16.16b,v20.16b
	pmull2	v2.1q,v20.2d,v20.2d
	pmull	v1.1q,v16.1d,v16.1d

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v22.16b,v0.16b,v18.16b

	ext	v17.16b,v22.16b,v22.16b,#8	//Karatsuba pre-processing
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8	//pack Karatsuba pre-processed
	st1	{v21.2d,v22.2d},[x0]		//store Htable[1..2]

	ret
.size	gcm_init_v8,.-gcm_init_v8

//----------------------------------------------------------------------
// void gcm_gmult_v8(uint64_t Xi[2], const u128 Htable[])
// One GHASH multiplication: Xi = (Xi * H) mod P, big-endian Xi in memory.
// In:    x0 = Xi (in/out), x1 = Htable (from gcm_init_v8)
// Clobbers: v0-v3, v16-v21; no stack use (leaf).
//----------------------------------------------------------------------
.globl	gcm_gmult_v8
.hidden	gcm_gmult_v8
.type	gcm_gmult_v8,%function
.align	4
gcm_gmult_v8:
	AARCH64_VALID_CALL_TARGET
	ld1	{v17.2d},[x0]			//load Xi
	movi	v19.16b,#0xe1
	ld1	{v20.2d,v21.2d},[x1]		//load twisted H, ...
	shl	v19.2d,v19.2d,#57
#ifndef	__ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v3.16b,v17.16b,v17.16b,#8

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

#ifndef	__ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]			//write out Xi

	ret
.size	gcm_gmult_v8,.-gcm_gmult_v8

//----------------------------------------------------------------------
// void gcm_ghash_v8(uint64_t Xi[2], const u128 Htable[],
//                   const uint8_t *inp, size_t len)
// Hash `len` bytes (len is a multiple of 16) into Xi. Processes two
// blocks per iteration with H^2 (modulo-scheduled); a single trailing
// block falls through to .Lodd_tail_v8.
// In:    x0 = Xi (in/out), x1 = Htable, x2 = inp, x3 = len
// Clobbers: x12, v0-v7, v16-v22; no stack use (leaf).
//----------------------------------------------------------------------
.globl	gcm_ghash_v8
.hidden	gcm_ghash_v8
.type	gcm_ghash_v8,%function
.align	4
gcm_ghash_v8:
	AARCH64_VALID_CALL_TARGET
	ld1	{v0.2d},[x0]			//load [rotated] Xi
						//"[rotated]" means that
						//loaded value would have
						//to be rotated in order to
						//make it appear as in
						//algorithm specification
	subs	x3,x3,#32			//see if x3 is 32 or larger
	mov	x12,#16				//x12 is used as post-
						//increment for input pointer;
						//as loop is modulo-scheduled
						//x12 is zeroed just in time
						//to preclude overstepping
						//inp[len], which means that
						//last block[s] are actually
						//loaded twice, but last
						//copy is not processed
	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v22.2d},[x1]
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
	ld1	{v16.2d},[x2],#16		//load [rotated] I[0]
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
#ifndef	__ARMEB__
	rev64	v16.16b,v16.16b
	rev64	v0.16b,v0.16b
#endif
	ext	v3.16b,v16.16b,v16.16b,#8	//rotate I[0]
	b.lo	.Lodd_tail_v8			//x3 was less than 32
	ld1	{v17.2d},[x2],x12		//load [rotated] I[1]
#ifndef	__ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v7.16b,v17.16b,v17.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	pmull2	v6.1q,v20.2d,v7.2d
	b	.Loop_mod2x_v8

.align	4
.Loop_mod2x_v8:
	ext	v18.16b,v3.16b,v3.16b,#8
	subs	x3,x3,#32			//is there more data?
	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
	csel	x12,xzr,x12,lo			//is it time to zero x12?

	pmull	v5.1q,v21.1d,v17.1d
	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
	eor	v0.16b,v0.16b,v4.16b		//accumulate
	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	ld1	{v16.2d},[x2],x12		//load [rotated] I[i+2]

	eor	v2.16b,v2.16b,v6.16b
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	eor	v1.16b,v1.16b,v5.16b

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v17.2d},[x2],x12		//load [rotated] I[i+3]
#ifndef	__ARMEB__
	rev64	v16.16b,v16.16b
#endif
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

#ifndef	__ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v7.16b,v17.16b,v17.16b,#8
	ext	v3.16b,v16.16b,v16.16b,#8
	eor	v0.16b,v1.16b,v18.16b
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v3.16b,v3.16b,v18.16b
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	eor	v3.16b,v3.16b,v0.16b
	pmull2	v6.1q,v20.2d,v7.2d
	b.hs	.Loop_mod2x_v8			//there was at least 32 more bytes

	eor	v2.16b,v2.16b,v18.16b
	ext	v3.16b,v16.16b,v16.16b,#8	//re-construct v3.16b
	adds	x3,x3,#32			//re-construct x3
	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
	b.eq	.Ldone_v8			//is x3 zero?
.Lodd_tail_v8:
	ext	v18.16b,v0.16b,v0.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

.Ldone_v8:
#ifndef	__ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]			//write out Xi

	ret
.size	gcm_ghash_v8,.-gcm_ghash_v8
// "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif
#endif  // !OPENSSL_NO_ASM
.section	.note.GNU-stack,"",%progbits