// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#include <GFp/arm_arch.h>

.text

.globl _GFp_gcm_init_neon
.private_extern _GFp_gcm_init_neon

.align 4
_GFp_gcm_init_neon:
    AARCH64_VALID_CALL_TARGET
    // This function is adapted from gcm_init_v8. xC2 is t3.
    ld1 {v17.2d}, [x1]                  // load H
    movi v19.16b, #0xe1
    shl v19.2d, v19.2d, #57             // 0xc2.0
    ext v3.16b, v17.16b, v17.16b, #8
    ushr v18.2d, v19.2d, #63
    dup v17.4s, v17.s[1]
    ext v16.16b, v18.16b, v19.16b, #8   // t0=0xc2....01
    ushr v18.2d, v3.2d, #63
    sshr v17.4s, v17.4s, #31            // broadcast carry bit
    and v18.16b, v18.16b, v16.16b
    shl v3.2d, v3.2d, #1
    ext v18.16b, v18.16b, v18.16b, #8
    and v16.16b, v16.16b, v17.16b
    orr v3.16b, v3.16b, v18.16b         // H<<<=1
    eor v5.16b, v3.16b, v16.16b         // twisted H
    st1 {v5.2d}, [x0]                   // store Htable[0]
    ret


.globl _GFp_gcm_gmult_neon
.private_extern _GFp_gcm_gmult_neon

.align 4
_GFp_gcm_gmult_neon:
    AARCH64_VALID_CALL_TARGET
    ld1 {v3.16b}, [x0]                  // load Xi
    ld1 {v5.1d}, [x1], #8               // load twisted H
    ld1 {v6.1d}, [x1]
    adrp x9, Lmasks@PAGE                // load constants
    add x9, x9, Lmasks@PAGEOFF
    ld1 {v24.2d, v25.2d}, [x9]
    rev64 v3.16b, v3.16b                // byteswap Xi
    ext v3.16b, v3.16b, v3.16b, #8
    eor v7.8b, v5.8b, v6.8b             // Karatsuba pre-processing

    mov x3, #16
    b Lgmult_neon


.globl _GFp_gcm_ghash_neon
.private_extern _GFp_gcm_ghash_neon

.align 4
_GFp_gcm_ghash_neon:
    AARCH64_VALID_CALL_TARGET
    ld1 {v0.16b}, [x0]                  // load Xi
    ld1 {v5.1d}, [x1], #8               // load twisted H
    ld1 {v6.1d}, [x1]
    adrp x9, Lmasks@PAGE                // load constants
    add x9, x9, Lmasks@PAGEOFF
    ld1 {v24.2d, v25.2d}, [x9]
    rev64 v0.16b, v0.16b                // byteswap Xi
    ext v0.16b, v0.16b, v0.16b, #8
    eor v7.8b, v5.8b, v6.8b             // Karatsuba pre-processing

Loop_neon:
    ld1 {v3.16b}, [x2], #16             // load inp
    rev64 v3.16b, v3.16b                // byteswap inp
    ext v3.16b, v3.16b, v3.16b, #8
    eor v3.16b, v3.16b, v0.16b          // inp ^= Xi

Lgmult_neon:
    // Split the input into v3 and v4. (The upper halves are unused,
    // so it is okay to leave them alone.)
    ins v4.d[0], v3.d[1]
    ext v16.8b, v5.8b, v5.8b, #1        // A1
    pmull v16.8h, v16.8b, v3.8b         // F = A1*B
    ext v0.8b, v3.8b, v3.8b, #1         // B1
    pmull v0.8h, v5.8b, v0.8b           // E = A*B1
    ext v17.8b, v5.8b, v5.8b, #2        // A2
    pmull v17.8h, v17.8b, v3.8b         // H = A2*B
    ext v19.8b, v3.8b, v3.8b, #2        // B2
    pmull v19.8h, v5.8b, v19.8b         // G = A*B2
    ext v18.8b, v5.8b, v5.8b, #3        // A3
    eor v16.16b, v16.16b, v0.16b        // L = E + F
    pmull v18.8h, v18.8b, v3.8b         // J = A3*B
    ext v0.8b, v3.8b, v3.8b, #3         // B3
    eor v17.16b, v17.16b, v19.16b       // M = G + H
    pmull v0.8h, v5.8b, v0.8b           // I = A*B3

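    // Note: the pmull instructions above use 8-bit source lanes, which are
    // part of baseline Advanced SIMD, so this path does not need the Crypto
    // extension's 64-bit PMULL. A1..A3 and B1..B3 are byte rotations of the
    // operands; the partial products F/E, H/G, J/I (and K below) are masked,
    // shifted by 8/16/24/32 bits, and folded into D = A*B further down.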
    // Here we diverge from the 32-bit version. It computes the following
    // (instructions reordered for clarity):
    //
    // veor $t0#lo, $t0#lo, $t0#hi    @ t0 = P0 + P1 (L)
    // vand $t0#hi, $t0#hi, $k48
    // veor $t0#lo, $t0#lo, $t0#hi
    //
    // veor $t1#lo, $t1#lo, $t1#hi    @ t1 = P2 + P3 (M)
    // vand $t1#hi, $t1#hi, $k32
    // veor $t1#lo, $t1#lo, $t1#hi
    //
    // veor $t2#lo, $t2#lo, $t2#hi    @ t2 = P4 + P5 (N)
    // vand $t2#hi, $t2#hi, $k16
    // veor $t2#lo, $t2#lo, $t2#hi
    //
    // veor $t3#lo, $t3#lo, $t3#hi    @ t3 = P6 + P7 (K)
    // vmov.i64 $t3#hi, #0
    //
    // $kN is a mask with the bottom N bits set. AArch64 cannot compute on
    // upper halves of SIMD registers, so we must split each half into
    // separate registers. To compensate, we pair computations up and
    // parallelize.

    ext v19.8b, v3.8b, v3.8b, #4        // B4
    eor v18.16b, v18.16b, v0.16b        // N = I + J
    pmull v19.8h, v5.8b, v19.8b         // K = A*B4

    // This can probably be scheduled more efficiently. For now, we just
    // pair up independent instructions.
    zip1 v20.2d, v16.2d, v17.2d
    zip1 v22.2d, v18.2d, v19.2d
    zip2 v21.2d, v16.2d, v17.2d
    zip2 v23.2d, v18.2d, v19.2d
    eor v20.16b, v20.16b, v21.16b
    eor v22.16b, v22.16b, v23.16b
    and v21.16b, v21.16b, v24.16b
    and v23.16b, v23.16b, v25.16b
    eor v20.16b, v20.16b, v21.16b
    eor v22.16b, v22.16b, v23.16b
    zip1 v16.2d, v20.2d, v21.2d
    zip1 v18.2d, v22.2d, v23.2d
    zip2 v17.2d, v20.2d, v21.2d
    zip2 v19.2d, v22.2d, v23.2d

    ext v16.16b, v16.16b, v16.16b, #15  // t0 = t0 << 8
    ext v17.16b, v17.16b, v17.16b, #14  // t1 = t1 << 16
    pmull v0.8h, v5.8b, v3.8b           // D = A*B
    ext v19.16b, v19.16b, v19.16b, #12  // t3 = t3 << 32
    ext v18.16b, v18.16b, v18.16b, #13  // t2 = t2 << 24
    eor v16.16b, v16.16b, v17.16b
    eor v18.16b, v18.16b, v19.16b
    eor v0.16b, v0.16b, v16.16b
    eor v0.16b, v0.16b, v18.16b
    eor v3.8b, v3.8b, v4.8b             // Karatsuba pre-processing
    ext v16.8b, v7.8b, v7.8b, #1        // A1
    pmull v16.8h, v16.8b, v3.8b         // F = A1*B
    ext v1.8b, v3.8b, v3.8b, #1         // B1
    pmull v1.8h, v7.8b, v1.8b           // E = A*B1
    ext v17.8b, v7.8b, v7.8b, #2        // A2
    pmull v17.8h, v17.8b, v3.8b         // H = A2*B
    ext v19.8b, v3.8b, v3.8b, #2        // B2
    pmull v19.8h, v7.8b, v19.8b         // G = A*B2
    ext v18.8b, v7.8b, v7.8b, #3        // A3
    eor v16.16b, v16.16b, v1.16b        // L = E + F
    pmull v18.8h, v18.8b, v3.8b         // J = A3*B
    ext v1.8b, v3.8b, v3.8b, #3         // B3
    eor v17.16b, v17.16b, v19.16b       // M = G + H
    pmull v1.8h, v7.8b, v1.8b           // I = A*B3

    // Here we diverge from the 32-bit version. It computes the following
    // (instructions reordered for clarity):
    //
    // veor $t0#lo, $t0#lo, $t0#hi    @ t0 = P0 + P1 (L)
    // vand $t0#hi, $t0#hi, $k48
    // veor $t0#lo, $t0#lo, $t0#hi
    //
    // veor $t1#lo, $t1#lo, $t1#hi    @ t1 = P2 + P3 (M)
    // vand $t1#hi, $t1#hi, $k32
    // veor $t1#lo, $t1#lo, $t1#hi
    //
    // veor $t2#lo, $t2#lo, $t2#hi    @ t2 = P4 + P5 (N)
    // vand $t2#hi, $t2#hi, $k16
    // veor $t2#lo, $t2#lo, $t2#hi
    //
    // veor $t3#lo, $t3#lo, $t3#hi    @ t3 = P6 + P7 (K)
    // vmov.i64 $t3#hi, #0
    //
    // $kN is a mask with the bottom N bits set. AArch64 cannot compute on
    // upper halves of SIMD registers, so we must split each half into
    // separate registers. To compensate, we pair computations up and
    // parallelize.
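    // Note: v24 and v25 were loaded from Lmasks (end of this file), so
    // v24 = {k48, k32} and v25 = {k16, k0}. In the zip1/zip2 blocks, the low
    // and high halves of {t0, t1} and of {t2, t3} are paired so that one
    // eor/and/eor sequence applies the mask-and-fold above to two terms at a
    // time; and-ing with k0 clears t3's high half, matching the vmov.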

    ext v19.8b, v3.8b, v3.8b, #4        // B4
    eor v18.16b, v18.16b, v1.16b        // N = I + J
    pmull v19.8h, v7.8b, v19.8b         // K = A*B4

    // This can probably be scheduled more efficiently. For now, we just
    // pair up independent instructions.
    zip1 v20.2d, v16.2d, v17.2d
    zip1 v22.2d, v18.2d, v19.2d
    zip2 v21.2d, v16.2d, v17.2d
    zip2 v23.2d, v18.2d, v19.2d
    eor v20.16b, v20.16b, v21.16b
    eor v22.16b, v22.16b, v23.16b
    and v21.16b, v21.16b, v24.16b
    and v23.16b, v23.16b, v25.16b
    eor v20.16b, v20.16b, v21.16b
    eor v22.16b, v22.16b, v23.16b
    zip1 v16.2d, v20.2d, v21.2d
    zip1 v18.2d, v22.2d, v23.2d
    zip2 v17.2d, v20.2d, v21.2d
    zip2 v19.2d, v22.2d, v23.2d

    ext v16.16b, v16.16b, v16.16b, #15  // t0 = t0 << 8
    ext v17.16b, v17.16b, v17.16b, #14  // t1 = t1 << 16
    pmull v1.8h, v7.8b, v3.8b           // D = A*B
    ext v19.16b, v19.16b, v19.16b, #12  // t3 = t3 << 32
    ext v18.16b, v18.16b, v18.16b, #13  // t2 = t2 << 24
    eor v16.16b, v16.16b, v17.16b
    eor v18.16b, v18.16b, v19.16b
    eor v1.16b, v1.16b, v16.16b
    eor v1.16b, v1.16b, v18.16b
    ext v16.8b, v6.8b, v6.8b, #1        // A1
    pmull v16.8h, v16.8b, v4.8b         // F = A1*B
    ext v2.8b, v4.8b, v4.8b, #1         // B1
    pmull v2.8h, v6.8b, v2.8b           // E = A*B1
    ext v17.8b, v6.8b, v6.8b, #2        // A2
    pmull v17.8h, v17.8b, v4.8b         // H = A2*B
    ext v19.8b, v4.8b, v4.8b, #2        // B2
    pmull v19.8h, v6.8b, v19.8b         // G = A*B2
    ext v18.8b, v6.8b, v6.8b, #3        // A3
    eor v16.16b, v16.16b, v2.16b        // L = E + F
    pmull v18.8h, v18.8b, v4.8b         // J = A3*B
    ext v2.8b, v4.8b, v4.8b, #3         // B3
    eor v17.16b, v17.16b, v19.16b       // M = G + H
    pmull v2.8h, v6.8b, v2.8b           // I = A*B3

    // Here we diverge from the 32-bit version. It computes the following
    // (instructions reordered for clarity):
    //
    // veor $t0#lo, $t0#lo, $t0#hi    @ t0 = P0 + P1 (L)
    // vand $t0#hi, $t0#hi, $k48
    // veor $t0#lo, $t0#lo, $t0#hi
    //
    // veor $t1#lo, $t1#lo, $t1#hi    @ t1 = P2 + P3 (M)
    // vand $t1#hi, $t1#hi, $k32
    // veor $t1#lo, $t1#lo, $t1#hi
    //
    // veor $t2#lo, $t2#lo, $t2#hi    @ t2 = P4 + P5 (N)
    // vand $t2#hi, $t2#hi, $k16
    // veor $t2#lo, $t2#lo, $t2#hi
    //
    // veor $t3#lo, $t3#lo, $t3#hi    @ t3 = P6 + P7 (K)
    // vmov.i64 $t3#hi, #0
    //
    // $kN is a mask with the bottom N bits set. AArch64 cannot compute on
    // upper halves of SIMD registers, so we must split each half into
    // separate registers. To compensate, we pair computations up and
    // parallelize.

    ext v19.8b, v4.8b, v4.8b, #4        // B4
    eor v18.16b, v18.16b, v2.16b        // N = I + J
    pmull v19.8h, v6.8b, v19.8b         // K = A*B4

    // This can probably be scheduled more efficiently. For now, we just
    // pair up independent instructions.
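    // Note: this is the third and final shift-and-fold pass; it completes the
    // high-half product (v6 * v4). At this point v0 holds lo*lo and v1 holds
    // (lo^hi)*(lo^hi); v2 will hold hi*hi once D = A*B below is folded in,
    // after which the Karatsuba post-processing and the reduction run.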
    zip1 v20.2d, v16.2d, v17.2d
    zip1 v22.2d, v18.2d, v19.2d
    zip2 v21.2d, v16.2d, v17.2d
    zip2 v23.2d, v18.2d, v19.2d
    eor v20.16b, v20.16b, v21.16b
    eor v22.16b, v22.16b, v23.16b
    and v21.16b, v21.16b, v24.16b
    and v23.16b, v23.16b, v25.16b
    eor v20.16b, v20.16b, v21.16b
    eor v22.16b, v22.16b, v23.16b
    zip1 v16.2d, v20.2d, v21.2d
    zip1 v18.2d, v22.2d, v23.2d
    zip2 v17.2d, v20.2d, v21.2d
    zip2 v19.2d, v22.2d, v23.2d

    ext v16.16b, v16.16b, v16.16b, #15  // t0 = t0 << 8
    ext v17.16b, v17.16b, v17.16b, #14  // t1 = t1 << 16
    pmull v2.8h, v6.8b, v4.8b           // D = A*B
    ext v19.16b, v19.16b, v19.16b, #12  // t3 = t3 << 32
    ext v18.16b, v18.16b, v18.16b, #13  // t2 = t2 << 24
    eor v16.16b, v16.16b, v17.16b
    eor v18.16b, v18.16b, v19.16b
    eor v2.16b, v2.16b, v16.16b
    eor v2.16b, v2.16b, v18.16b
    ext v16.16b, v0.16b, v2.16b, #8
    eor v1.16b, v1.16b, v0.16b          // Karatsuba post-processing
    eor v1.16b, v1.16b, v2.16b
    eor v1.16b, v1.16b, v16.16b         // Xm overlaps Xh.lo and Xl.hi
    ins v0.d[1], v1.d[0]                // Xh|Xl - 256-bit result
    // This is a no-op due to the ins instruction below.
    // ins v2.d[0], v1.d[1]

    // equivalent of reduction_avx from ghash-x86_64.pl
    shl v17.2d, v0.2d, #57              // 1st phase
    shl v18.2d, v0.2d, #62
    eor v18.16b, v18.16b, v17.16b       //
    shl v17.2d, v0.2d, #63
    eor v18.16b, v18.16b, v17.16b       //
    // Note Xm contains {Xl.d[1], Xh.d[0]}.
    eor v18.16b, v18.16b, v1.16b
    ins v0.d[1], v18.d[0]               // Xl.d[1] ^= t2.d[0]
    ins v2.d[0], v18.d[1]               // Xh.d[0] ^= t2.d[1]

    ushr v18.2d, v0.2d, #1              // 2nd phase
    eor v2.16b, v2.16b, v0.16b
    eor v0.16b, v0.16b, v18.16b         //
    ushr v18.2d, v18.2d, #6
    ushr v0.2d, v0.2d, #1               //
    eor v0.16b, v0.16b, v2.16b          //
    eor v0.16b, v0.16b, v18.16b         //

    subs x3, x3, #16
    bne Loop_neon

    rev64 v0.16b, v0.16b                // byteswap Xi and write
    ext v0.16b, v0.16b, v0.16b, #8
    st1 {v0.16b}, [x0]

    ret


.section __TEXT,__const
.align 4
Lmasks:
.quad 0x0000ffffffffffff // k48
.quad 0x00000000ffffffff // k32
.quad 0x000000000000ffff // k16
.quad 0x0000000000000000 // k0
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#endif  // !OPENSSL_NO_ASM
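
// A plausible C-level view of the three entry points, inferred only from the
// register usage in this file (the authoritative declarations live in the
// calling code and may use different types), with x0..x3 carrying the
// arguments in order:
//
//   void GFp_gcm_init_neon(uint8_t Htable[16], const uint8_t H[16]);
//   void GFp_gcm_gmult_neon(uint8_t Xi[16], const uint8_t Htable[16]);
//   void GFp_gcm_ghash_neon(uint8_t Xi[16], const uint8_t Htable[16],
//                           const uint8_t *inp, size_t len);
//
// Xi is updated in place, and the Loop_neon loop assumes len is a non-zero
// multiple of 16 bytes. (The Mach-O symbols above carry a leading underscore;
// the C-level names drop it.)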