// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
.text

.globl _gcm_init_neon
.private_extern _gcm_init_neon

.align 4
_gcm_init_neon:
	// This function is adapted from gcm_init_v8. xC2 is t3.
	ld1 {v17.2d}, [x1] // load H
	movi v19.16b, #0xe1
	shl v19.2d, v19.2d, #57 // 0xc2.0
	ext v3.16b, v17.16b, v17.16b, #8
	ushr v18.2d, v19.2d, #63
	dup v17.4s, v17.s[1]
	ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01
	ushr v18.2d, v3.2d, #63
	sshr v17.4s, v17.4s, #31 // broadcast carry bit
	and v18.16b, v18.16b, v16.16b
	shl v3.2d, v3.2d, #1
	ext v18.16b, v18.16b, v18.16b, #8
	and v16.16b, v16.16b, v17.16b
	orr v3.16b, v3.16b, v18.16b // H<<<=1
	eor v5.16b, v3.16b, v16.16b // twisted H
	st1 {v5.2d}, [x0] // store Htable[0]
	ret


.globl _gcm_gmult_neon
.private_extern _gcm_gmult_neon

.align 4
_gcm_gmult_neon:
	ld1 {v3.16b}, [x0] // load Xi
	ld1 {v5.1d}, [x1], #8 // load twisted H
	ld1 {v6.1d}, [x1]
	adrp x9, Lmasks@PAGE // load constants
	add x9, x9, Lmasks@PAGEOFF
	ld1 {v24.2d, v25.2d}, [x9]
	rev64 v3.16b, v3.16b // byteswap Xi
	ext v3.16b, v3.16b, v3.16b, #8
	eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing

	mov x3, #16
	b Lgmult_neon


.globl _gcm_ghash_neon
.private_extern _gcm_ghash_neon

.align 4
_gcm_ghash_neon:
	ld1 {v0.16b}, [x0] // load Xi
	ld1 {v5.1d}, [x1], #8 // load twisted H
	ld1 {v6.1d}, [x1]
	adrp x9, Lmasks@PAGE // load constants
	add x9, x9, Lmasks@PAGEOFF
	ld1 {v24.2d, v25.2d}, [x9]
	rev64 v0.16b, v0.16b // byteswap Xi
	ext v0.16b, v0.16b, v0.16b, #8
	eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing

Loop_neon:
	ld1 {v3.16b}, [x2], #16 // load inp
	rev64 v3.16b, v3.16b // byteswap inp
	ext v3.16b, v3.16b, v3.16b, #8
	eor v3.16b, v3.16b, v0.16b // inp ^= Xi

Lgmult_neon:
	// Split the input into v3 and v4. (The upper halves are unused,
	// so it is okay to leave them alone.)
	ins v4.d[0], v3.d[1]
	ext v16.8b, v5.8b, v5.8b, #1 // A1
	pmull v16.8h, v16.8b, v3.8b // F = A1*B
	ext v0.8b, v3.8b, v3.8b, #1 // B1
	pmull v0.8h, v5.8b, v0.8b // E = A*B1
	ext v17.8b, v5.8b, v5.8b, #2 // A2
	pmull v17.8h, v17.8b, v3.8b // H = A2*B
	ext v19.8b, v3.8b, v3.8b, #2 // B2
	pmull v19.8h, v5.8b, v19.8b // G = A*B2
	ext v18.8b, v5.8b, v5.8b, #3 // A3
	eor v16.16b, v16.16b, v0.16b // L = E + F
	pmull v18.8h, v18.8b, v3.8b // J = A3*B
	ext v0.8b, v3.8b, v3.8b, #3 // B3
	eor v17.16b, v17.16b, v19.16b // M = G + H
	pmull v0.8h, v5.8b, v0.8b // I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	// veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
	// vand $t0#hi, $t0#hi, $k48
	// veor $t0#lo, $t0#lo, $t0#hi
	//
	// veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
	// vand $t1#hi, $t1#hi, $k32
	// veor $t1#lo, $t1#lo, $t1#hi
	//
	// veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
	// vand $t2#hi, $t2#hi, $k16
	// veor $t2#lo, $t2#lo, $t2#hi
	//
	// veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
	// vmov.i64 $t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext v19.8b, v3.8b, v3.8b, #4 // B4
	eor v18.16b, v18.16b, v0.16b // N = I + J
	pmull v19.8h, v5.8b, v19.8b // K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1 v20.2d, v16.2d, v17.2d
	zip1 v22.2d, v18.2d, v19.2d
	zip2 v21.2d, v16.2d, v17.2d
	zip2 v23.2d, v18.2d, v19.2d
	eor v20.16b, v20.16b, v21.16b
	eor v22.16b, v22.16b, v23.16b
	and v21.16b, v21.16b, v24.16b
	and v23.16b, v23.16b, v25.16b
	eor v20.16b, v20.16b, v21.16b
	eor v22.16b, v22.16b, v23.16b
	zip1 v16.2d, v20.2d, v21.2d
	zip1 v18.2d, v22.2d, v23.2d
	zip2 v17.2d, v20.2d, v21.2d
	zip2 v19.2d, v22.2d, v23.2d

	ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
	ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
	pmull v0.8h, v5.8b, v3.8b // D = A*B
	ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
	ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
	eor v16.16b, v16.16b, v17.16b
	eor v18.16b, v18.16b, v19.16b
	eor v0.16b, v0.16b, v16.16b
	eor v0.16b, v0.16b, v18.16b
	eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing
	ext v16.8b, v7.8b, v7.8b, #1 // A1
	pmull v16.8h, v16.8b, v3.8b // F = A1*B
	ext v1.8b, v3.8b, v3.8b, #1 // B1
	pmull v1.8h, v7.8b, v1.8b // E = A*B1
	ext v17.8b, v7.8b, v7.8b, #2 // A2
	pmull v17.8h, v17.8b, v3.8b // H = A2*B
	ext v19.8b, v3.8b, v3.8b, #2 // B2
	pmull v19.8h, v7.8b, v19.8b // G = A*B2
	ext v18.8b, v7.8b, v7.8b, #3 // A3
	eor v16.16b, v16.16b, v1.16b // L = E + F
	pmull v18.8h, v18.8b, v3.8b // J = A3*B
	ext v1.8b, v3.8b, v3.8b, #3 // B3
	eor v17.16b, v17.16b, v19.16b // M = G + H
	pmull v1.8h, v7.8b, v1.8b // I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	// veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
	// vand $t0#hi, $t0#hi, $k48
	// veor $t0#lo, $t0#lo, $t0#hi
	//
	// veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
	// vand $t1#hi, $t1#hi, $k32
	// veor $t1#lo, $t1#lo, $t1#hi
	//
	// veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
	// vand $t2#hi, $t2#hi, $k16
	// veor $t2#lo, $t2#lo, $t2#hi
	//
	// veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
	// vmov.i64 $t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.
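	//
	// Note: with the Lmasks layout at the end of this file, v24 = {k48, k32}
	// and v25 = {k16, k0}, so each zip1/zip2, eor, and, eor, zip1/zip2
	// sequence in this function applies the mask-and-fold steps above to two
	// partial products at a time: {t0, t1} against v24 and {t2, t3} against
	// v25 (k0 = 0 stands in for the vmov.i64 of the 32-bit code).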

	ext v19.8b, v3.8b, v3.8b, #4 // B4
	eor v18.16b, v18.16b, v1.16b // N = I + J
	pmull v19.8h, v7.8b, v19.8b // K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1 v20.2d, v16.2d, v17.2d
	zip1 v22.2d, v18.2d, v19.2d
	zip2 v21.2d, v16.2d, v17.2d
	zip2 v23.2d, v18.2d, v19.2d
	eor v20.16b, v20.16b, v21.16b
	eor v22.16b, v22.16b, v23.16b
	and v21.16b, v21.16b, v24.16b
	and v23.16b, v23.16b, v25.16b
	eor v20.16b, v20.16b, v21.16b
	eor v22.16b, v22.16b, v23.16b
	zip1 v16.2d, v20.2d, v21.2d
	zip1 v18.2d, v22.2d, v23.2d
	zip2 v17.2d, v20.2d, v21.2d
	zip2 v19.2d, v22.2d, v23.2d

	ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
	ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
	pmull v1.8h, v7.8b, v3.8b // D = A*B
	ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
	ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
	eor v16.16b, v16.16b, v17.16b
	eor v18.16b, v18.16b, v19.16b
	eor v1.16b, v1.16b, v16.16b
	eor v1.16b, v1.16b, v18.16b
	ext v16.8b, v6.8b, v6.8b, #1 // A1
	pmull v16.8h, v16.8b, v4.8b // F = A1*B
	ext v2.8b, v4.8b, v4.8b, #1 // B1
	pmull v2.8h, v6.8b, v2.8b // E = A*B1
	ext v17.8b, v6.8b, v6.8b, #2 // A2
	pmull v17.8h, v17.8b, v4.8b // H = A2*B
	ext v19.8b, v4.8b, v4.8b, #2 // B2
	pmull v19.8h, v6.8b, v19.8b // G = A*B2
	ext v18.8b, v6.8b, v6.8b, #3 // A3
	eor v16.16b, v16.16b, v2.16b // L = E + F
	pmull v18.8h, v18.8b, v4.8b // J = A3*B
	ext v2.8b, v4.8b, v4.8b, #3 // B3
	eor v17.16b, v17.16b, v19.16b // M = G + H
	pmull v2.8h, v6.8b, v2.8b // I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	// veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
	// vand $t0#hi, $t0#hi, $k48
	// veor $t0#lo, $t0#lo, $t0#hi
	//
	// veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
	// vand $t1#hi, $t1#hi, $k32
	// veor $t1#lo, $t1#lo, $t1#hi
	//
	// veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
	// vand $t2#hi, $t2#hi, $k16
	// veor $t2#lo, $t2#lo, $t2#hi
	//
	// veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
	// vmov.i64 $t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext v19.8b, v4.8b, v4.8b, #4 // B4
	eor v18.16b, v18.16b, v2.16b // N = I + J
	pmull v19.8h, v6.8b, v19.8b // K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
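	// The block below finishes the third and last of the three 64x64-bit
	// polynomial multiplications (v6 and v4, whose product becomes Xh); the
	// Karatsuba recombination and the reduction follow.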
	zip1 v20.2d, v16.2d, v17.2d
	zip1 v22.2d, v18.2d, v19.2d
	zip2 v21.2d, v16.2d, v17.2d
	zip2 v23.2d, v18.2d, v19.2d
	eor v20.16b, v20.16b, v21.16b
	eor v22.16b, v22.16b, v23.16b
	and v21.16b, v21.16b, v24.16b
	and v23.16b, v23.16b, v25.16b
	eor v20.16b, v20.16b, v21.16b
	eor v22.16b, v22.16b, v23.16b
	zip1 v16.2d, v20.2d, v21.2d
	zip1 v18.2d, v22.2d, v23.2d
	zip2 v17.2d, v20.2d, v21.2d
	zip2 v19.2d, v22.2d, v23.2d

	ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
	ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
	pmull v2.8h, v6.8b, v4.8b // D = A*B
	ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
	ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
	eor v16.16b, v16.16b, v17.16b
	eor v18.16b, v18.16b, v19.16b
	eor v2.16b, v2.16b, v16.16b
	eor v2.16b, v2.16b, v18.16b
	ext v16.16b, v0.16b, v2.16b, #8
	eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing
	eor v1.16b, v1.16b, v2.16b
	eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi
	ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result
	// This is a no-op due to the ins instruction below.
	// ins v2.d[0], v1.d[1]

	// equivalent of reduction_avx from ghash-x86_64.pl
	shl v17.2d, v0.2d, #57 // 1st phase
	shl v18.2d, v0.2d, #62
	eor v18.16b, v18.16b, v17.16b //
	shl v17.2d, v0.2d, #63
	eor v18.16b, v18.16b, v17.16b //
	// Note Xm contains {Xl.d[1], Xh.d[0]}.
	eor v18.16b, v18.16b, v1.16b
	ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0]
	ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1]

	ushr v18.2d, v0.2d, #1 // 2nd phase
	eor v2.16b, v2.16b, v0.16b
	eor v0.16b, v0.16b, v18.16b //
	ushr v18.2d, v18.2d, #6
	ushr v0.2d, v0.2d, #1 //
	eor v0.16b, v0.16b, v2.16b //
	eor v0.16b, v0.16b, v18.16b //

	subs x3, x3, #16
	bne Loop_neon

	rev64 v0.16b, v0.16b // byteswap Xi and write
	ext v0.16b, v0.16b, v0.16b, #8
	st1 {v0.16b}, [x0]

	ret


.section __TEXT,__const
.align 4
Lmasks:
.quad 0x0000ffffffffffff // k48
.quad 0x00000000ffffffff // k32
.quad 0x000000000000ffff // k16
.quad 0x0000000000000000 // k0
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#endif // !OPENSSL_NO_ASM
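
// A note on the reduction in Lgmult_neon above: GHASH multiplies in
// GF(2^128) with bit-reflected operands, reducing modulo
// x^128 + x^7 + x^2 + x + 1. In that representation the first phase folds
// Xl<<57 ^ Xl<<62 ^ Xl<<63 (together with the pending Xm term) across the
// middle of the 256-bit product, and the second phase folds
// Xl ^ (Xl>>1) ^ (Xl>>2) ^ (Xl>>7) into Xh, mirroring reduction_avx in
// ghash-x86_64.pl.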