// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

.text

.globl	_gcm_init_neon
.private_extern	_gcm_init_neon

.align	4
_gcm_init_neon:
	AARCH64_VALID_CALL_TARGET
	// This function is adapted from gcm_init_v8. xC2 is t3.
	ld1	{v17.2d}, [x1]			// load H
	movi	v19.16b, #0xe1
	shl	v19.2d, v19.2d, #57		// 0xc2.0
	ext	v3.16b, v17.16b, v17.16b, #8
	ushr	v18.2d, v19.2d, #63
	dup	v17.4s, v17.s[1]
	ext	v16.16b, v18.16b, v19.16b, #8	// t0=0xc2....01
	ushr	v18.2d, v3.2d, #63
	sshr	v17.4s, v17.4s, #31		// broadcast carry bit
	and	v18.16b, v18.16b, v16.16b
	shl	v3.2d, v3.2d, #1
	ext	v18.16b, v18.16b, v18.16b, #8
	and	v16.16b, v16.16b, v17.16b
	orr	v3.16b, v3.16b, v18.16b		// H<<<=1
	eor	v5.16b, v3.16b, v16.16b		// twisted H
	st1	{v5.2d}, [x0]			// store Htable[0]
	ret


.globl	_gcm_gmult_neon
.private_extern	_gcm_gmult_neon

.align	4
_gcm_gmult_neon:
	AARCH64_VALID_CALL_TARGET
	ld1	{v3.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8	// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, Lmasks@PAGE		// load constants
	add	x9, x9, Lmasks@PAGEOFF
	ld1	{v24.2d, v25.2d}, [x9]
	rev64	v3.16b, v3.16b		// byteswap Xi
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

	mov	x3, #16
	b	Lgmult_neon


.globl	_gcm_ghash_neon
.private_extern	_gcm_ghash_neon

.align	4
_gcm_ghash_neon:
	AARCH64_VALID_CALL_TARGET
	ld1	{v0.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8	// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, Lmasks@PAGE		// load constants
	add	x9, x9, Lmasks@PAGEOFF
	ld1	{v24.2d, v25.2d}, [x9]
	rev64	v0.16b, v0.16b		// byteswap Xi
	ext	v0.16b, v0.16b, v0.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

Loop_neon:
	ld1	{v3.16b}, [x2], #16	// load inp
	rev64	v3.16b, v3.16b		// byteswap inp
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v3.16b, v3.16b, v0.16b	// inp ^= Xi

Lgmult_neon:
	// Split the input into v3 and v4. (The upper halves are unused,
	// so it is okay to leave them alone.)
	ins	v4.d[0], v3.d[1]
	ext	v16.8b, v5.8b, v5.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v0.8b, v3.8b, v3.8b, #1		// B1
	pmull	v0.8h, v5.8b, v0.8b		// E = A*B1
	ext	v17.8b, v5.8b, v5.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v5.8b, v19.8b		// G = A*B2
	ext	v18.8b, v5.8b, v5.8b, #3	// A3
	eor	v16.16b, v16.16b, v0.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v0.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v0.8h, v5.8b, v0.8b		// I = A*B3

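	// The ext/pmull pairs above multiply byte-rotated copies of the
	// operands on 8-bit polynomial lanes; together with D = A*B and
	// K = A*B4 below, the masked and shifted partial products are
	// recombined into a full 64x64 carry-less product of v5 (one half of
	// the twisted H) and v3 (one half of the byte-reversed input block).
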
	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//	veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//	vand	$t0#hi, $t0#hi, $k48
	//	veor	$t0#lo, $t0#lo, $t0#hi
	//
	//	veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//	vand	$t1#hi, $t1#hi, $k32
	//	veor	$t1#lo, $t1#lo, $t1#hi
	//
	//	veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//	vand	$t2#hi, $t2#hi, $k16
	//	veor	$t2#lo, $t2#lo, $t2#hi
	//
	//	veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//	vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v0.16b	// N = I + J
	pmull	v19.8h, v5.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v0.8h, v5.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v0.16b, v0.16b, v16.16b
	eor	v0.16b, v0.16b, v18.16b
	eor	v3.8b, v3.8b, v4.8b		// Karatsuba pre-processing
	ext	v16.8b, v7.8b, v7.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v1.8b, v3.8b, v3.8b, #1		// B1
	pmull	v1.8h, v7.8b, v1.8b		// E = A*B1
	ext	v17.8b, v7.8b, v7.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v7.8b, v19.8b		// G = A*B2
	ext	v18.8b, v7.8b, v7.8b, #3	// A3
	eor	v16.16b, v16.16b, v1.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v1.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v1.8h, v7.8b, v1.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//	veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//	vand	$t0#hi, $t0#hi, $k48
	//	veor	$t0#lo, $t0#lo, $t0#hi
	//
	//	veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//	vand	$t1#hi, $t1#hi, $k32
	//	veor	$t1#lo, $t1#lo, $t1#hi
	//
	//	veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//	vand	$t2#hi, $t2#hi, $k16
	//	veor	$t2#lo, $t2#lo, $t2#hi
	//
	//	veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//	vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.
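
	// The instructions below complete the middle of the three Karatsuba
	// products: v7 (H.lo ^ H.hi) multiplied by v3, which at this point
	// holds the xor of the two halves of the input block. The result,
	// Xm, is folded into Xl and Xh during the post-processing step
	// further down.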

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v1.16b	// N = I + J
	pmull	v19.8h, v7.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v1.8h, v7.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v1.16b, v1.16b, v18.16b
	ext	v16.8b, v6.8b, v6.8b, #1	// A1
	pmull	v16.8h, v16.8b, v4.8b		// F = A1*B
	ext	v2.8b, v4.8b, v4.8b, #1		// B1
	pmull	v2.8h, v6.8b, v2.8b		// E = A*B1
	ext	v17.8b, v6.8b, v6.8b, #2	// A2
	pmull	v17.8h, v17.8b, v4.8b		// H = A2*B
	ext	v19.8b, v4.8b, v4.8b, #2	// B2
	pmull	v19.8h, v6.8b, v19.8b		// G = A*B2
	ext	v18.8b, v6.8b, v6.8b, #3	// A3
	eor	v16.16b, v16.16b, v2.16b	// L = E + F
	pmull	v18.8h, v18.8b, v4.8b		// J = A3*B
	ext	v2.8b, v4.8b, v4.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v2.8h, v6.8b, v2.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//	veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//	vand	$t0#hi, $t0#hi, $k48
	//	veor	$t0#lo, $t0#lo, $t0#hi
	//
	//	veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//	vand	$t1#hi, $t1#hi, $k32
	//	veor	$t1#lo, $t1#lo, $t1#hi
	//
	//	veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//	vand	$t2#hi, $t2#hi, $k16
	//	veor	$t2#lo, $t2#lo, $t2#hi
	//
	//	veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//	vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v4.8b, v4.8b, #4	// B4
	eor	v18.16b, v18.16b, v2.16b	// N = I + J
	pmull	v19.8h, v6.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
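	// As in the two blocks above, the zip1/zip2 pairs gather the low and
	// high 64-bit halves of t0..t3 into shared registers so that the
	// eor/and/eor masking steps can run on two products at a time; the
	// second zip1/zip2 group then scatters the results back into v16-v19.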
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v2.8h, v6.8b, v4.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v2.16b, v2.16b, v18.16b
	ext	v16.16b, v0.16b, v2.16b, #8
	eor	v1.16b, v1.16b, v0.16b		// Karatsuba post-processing
	eor	v1.16b, v1.16b, v2.16b
	eor	v1.16b, v1.16b, v16.16b		// Xm overlaps Xh.lo and Xl.hi
	ins	v0.d[1], v1.d[0]		// Xh|Xl - 256-bit result
	// This is a no-op due to the ins instruction below.
	// ins	v2.d[0], v1.d[1]

	// equivalent of reduction_avx from ghash-x86_64.pl
	shl	v17.2d, v0.2d, #57		// 1st phase
	shl	v18.2d, v0.2d, #62
	eor	v18.16b, v18.16b, v17.16b	//
	shl	v17.2d, v0.2d, #63
	eor	v18.16b, v18.16b, v17.16b	//
	// Note Xm contains {Xl.d[1], Xh.d[0]}.
	eor	v18.16b, v18.16b, v1.16b
	ins	v0.d[1], v18.d[0]		// Xl.d[1] ^= t2.d[0]
	ins	v2.d[0], v18.d[1]		// Xh.d[0] ^= t2.d[1]

	ushr	v18.2d, v0.2d, #1		// 2nd phase
	eor	v2.16b, v2.16b, v0.16b
	eor	v0.16b, v0.16b, v18.16b		//
	ushr	v18.2d, v18.2d, #6
	ushr	v0.2d, v0.2d, #1		//
	eor	v0.16b, v0.16b, v2.16b		//
	eor	v0.16b, v0.16b, v18.16b		//

	subs	x3, x3, #16
	bne	Loop_neon

	rev64	v0.16b, v0.16b			// byteswap Xi and write
	ext	v0.16b, v0.16b, v0.16b, #8
	st1	{v0.16b}, [x0]

	ret


.section	__TEXT,__const
.align	4
Lmasks:
.quad	0x0000ffffffffffff	// k48
.quad	0x00000000ffffffff	// k32
.quad	0x000000000000ffff	// k16
.quad	0x0000000000000000	// k0
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif  // !OPENSSL_NO_ASM