// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#include <GFp/arm_arch.h>

.text

.globl GFp_gcm_init_neon
.hidden GFp_gcm_init_neon
.type GFp_gcm_init_neon,%function
.align 4
GFp_gcm_init_neon:
	AARCH64_VALID_CALL_TARGET
	// This function is adapted from gcm_init_v8. xC2 is t3.
	ld1 {v17.2d}, [x1] // load H
	movi v19.16b, #0xe1
	shl v19.2d, v19.2d, #57 // 0xc2.0
	ext v3.16b, v17.16b, v17.16b, #8
	ushr v18.2d, v19.2d, #63
	dup v17.4s, v17.s[1]
	ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01
	ushr v18.2d, v3.2d, #63
	sshr v17.4s, v17.4s, #31 // broadcast carry bit
	and v18.16b, v18.16b, v16.16b
	shl v3.2d, v3.2d, #1
	ext v18.16b, v18.16b, v18.16b, #8
	and v16.16b, v16.16b, v17.16b
	orr v3.16b, v3.16b, v18.16b // H<<<=1
	eor v5.16b, v3.16b, v16.16b // twisted H
	st1 {v5.2d}, [x0] // store Htable[0]
	ret
.size GFp_gcm_init_neon,.-GFp_gcm_init_neon

.globl GFp_gcm_gmult_neon
.hidden GFp_gcm_gmult_neon
.type GFp_gcm_gmult_neon,%function
.align 4
GFp_gcm_gmult_neon:
	AARCH64_VALID_CALL_TARGET
	ld1 {v3.16b}, [x0] // load Xi
	ld1 {v5.1d}, [x1], #8 // load twisted H
	ld1 {v6.1d}, [x1]
	adrp x9, .Lmasks // load constants
	add x9, x9, :lo12:.Lmasks
	ld1 {v24.2d, v25.2d}, [x9]
	rev64 v3.16b, v3.16b // byteswap Xi
	ext v3.16b, v3.16b, v3.16b, #8
	eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing

	mov x3, #16
	b .Lgmult_neon
.size GFp_gcm_gmult_neon,.-GFp_gcm_gmult_neon

.globl GFp_gcm_ghash_neon
.hidden GFp_gcm_ghash_neon
.type GFp_gcm_ghash_neon,%function
.align 4
GFp_gcm_ghash_neon:
	AARCH64_VALID_CALL_TARGET
	ld1 {v0.16b}, [x0] // load Xi
	ld1 {v5.1d}, [x1], #8 // load twisted H
	ld1 {v6.1d}, [x1]
	adrp x9, .Lmasks // load constants
	add x9, x9, :lo12:.Lmasks
	ld1 {v24.2d, v25.2d}, [x9]
	rev64 v0.16b, v0.16b // byteswap Xi
	ext v0.16b, v0.16b, v0.16b, #8
	eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing

.Loop_neon:
	ld1 {v3.16b}, [x2], #16 // load inp
	rev64 v3.16b, v3.16b // byteswap inp
	ext v3.16b, v3.16b, v3.16b, #8
	eor v3.16b, v3.16b, v0.16b // inp ^= Xi

.Lgmult_neon:
	// Split the input into v3 and v4. (The upper halves are unused,
	// so it is okay to leave them alone.)
	ins v4.d[0], v3.d[1]
	ext v16.8b, v5.8b, v5.8b, #1 // A1
	pmull v16.8h, v16.8b, v3.8b // F = A1*B
	ext v0.8b, v3.8b, v3.8b, #1 // B1
	pmull v0.8h, v5.8b, v0.8b // E = A*B1
	ext v17.8b, v5.8b, v5.8b, #2 // A2
	pmull v17.8h, v17.8b, v3.8b // H = A2*B
	ext v19.8b, v3.8b, v3.8b, #2 // B2
	pmull v19.8h, v5.8b, v19.8b // G = A*B2
	ext v18.8b, v5.8b, v5.8b, #3 // A3
	eor v16.16b, v16.16b, v0.16b // L = E + F
	pmull v18.8h, v18.8b, v3.8b // J = A3*B
	ext v0.8b, v3.8b, v3.8b, #3 // B3
	eor v17.16b, v17.16b, v19.16b // M = G + H
	pmull v0.8h, v5.8b, v0.8b // I = A*B3

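	// The pmull results above and below (D through K) make up one
	// 64x64-bit carryless multiply built from 8x8-bit multiplies of
	// byte-rotated inputs (A1..A4, B1..B4). The same pattern repeats
	// twice more below with v7 and v6, giving the three Karatsuba
	// products that are combined in the post-processing step near the
	// end of the loop.
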
	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	// veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
	// vand $t0#hi, $t0#hi, $k48
	// veor $t0#lo, $t0#lo, $t0#hi
	//
	// veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
	// vand $t1#hi, $t1#hi, $k32
	// veor $t1#lo, $t1#lo, $t1#hi
	//
	// veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
	// vand $t2#hi, $t2#hi, $k16
	// veor $t2#lo, $t2#lo, $t2#hi
	//
	// veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
	// vmov.i64 $t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext v19.8b, v3.8b, v3.8b, #4 // B4
	eor v18.16b, v18.16b, v0.16b // N = I + J
	pmull v19.8h, v5.8b, v19.8b // K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1 v20.2d, v16.2d, v17.2d
	zip1 v22.2d, v18.2d, v19.2d
	zip2 v21.2d, v16.2d, v17.2d
	zip2 v23.2d, v18.2d, v19.2d
	eor v20.16b, v20.16b, v21.16b
	eor v22.16b, v22.16b, v23.16b
	and v21.16b, v21.16b, v24.16b
	and v23.16b, v23.16b, v25.16b
	eor v20.16b, v20.16b, v21.16b
	eor v22.16b, v22.16b, v23.16b
	zip1 v16.2d, v20.2d, v21.2d
	zip1 v18.2d, v22.2d, v23.2d
	zip2 v17.2d, v20.2d, v21.2d
	zip2 v19.2d, v22.2d, v23.2d

	ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
	ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
	pmull v0.8h, v5.8b, v3.8b // D = A*B
	ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
	ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
	eor v16.16b, v16.16b, v17.16b
	eor v18.16b, v18.16b, v19.16b
	eor v0.16b, v0.16b, v16.16b
	eor v0.16b, v0.16b, v18.16b
	eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing
	ext v16.8b, v7.8b, v7.8b, #1 // A1
	pmull v16.8h, v16.8b, v3.8b // F = A1*B
	ext v1.8b, v3.8b, v3.8b, #1 // B1
	pmull v1.8h, v7.8b, v1.8b // E = A*B1
	ext v17.8b, v7.8b, v7.8b, #2 // A2
	pmull v17.8h, v17.8b, v3.8b // H = A2*B
	ext v19.8b, v3.8b, v3.8b, #2 // B2
	pmull v19.8h, v7.8b, v19.8b // G = A*B2
	ext v18.8b, v7.8b, v7.8b, #3 // A3
	eor v16.16b, v16.16b, v1.16b // L = E + F
	pmull v18.8h, v18.8b, v3.8b // J = A3*B
	ext v1.8b, v3.8b, v3.8b, #3 // B3
	eor v17.16b, v17.16b, v19.16b // M = G + H
	pmull v1.8h, v7.8b, v1.8b // I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	// veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
	// vand $t0#hi, $t0#hi, $k48
	// veor $t0#lo, $t0#lo, $t0#hi
	//
	// veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
	// vand $t1#hi, $t1#hi, $k32
	// veor $t1#lo, $t1#lo, $t1#hi
	//
	// veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
	// vand $t2#hi, $t2#hi, $k16
	// veor $t2#lo, $t2#lo, $t2#hi
	//
	// veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
	// vmov.i64 $t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.
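	// v24 = {k48, k32} and v25 = {k16, k0} were loaded from .Lmasks
	// above, so the single "and v21, v21, v24" below masks the high
	// halves of t0 and t1 at once, and "and v23, v23, v25" handles t2
	// and t3 (k0 zeroes t3's high half, standing in for the vmov.i64
	// in the 32-bit sequence quoted above).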

	ext v19.8b, v3.8b, v3.8b, #4 // B4
	eor v18.16b, v18.16b, v1.16b // N = I + J
	pmull v19.8h, v7.8b, v19.8b // K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1 v20.2d, v16.2d, v17.2d
	zip1 v22.2d, v18.2d, v19.2d
	zip2 v21.2d, v16.2d, v17.2d
	zip2 v23.2d, v18.2d, v19.2d
	eor v20.16b, v20.16b, v21.16b
	eor v22.16b, v22.16b, v23.16b
	and v21.16b, v21.16b, v24.16b
	and v23.16b, v23.16b, v25.16b
	eor v20.16b, v20.16b, v21.16b
	eor v22.16b, v22.16b, v23.16b
	zip1 v16.2d, v20.2d, v21.2d
	zip1 v18.2d, v22.2d, v23.2d
	zip2 v17.2d, v20.2d, v21.2d
	zip2 v19.2d, v22.2d, v23.2d

	ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
	ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
	pmull v1.8h, v7.8b, v3.8b // D = A*B
	ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
	ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
	eor v16.16b, v16.16b, v17.16b
	eor v18.16b, v18.16b, v19.16b
	eor v1.16b, v1.16b, v16.16b
	eor v1.16b, v1.16b, v18.16b
	ext v16.8b, v6.8b, v6.8b, #1 // A1
	pmull v16.8h, v16.8b, v4.8b // F = A1*B
	ext v2.8b, v4.8b, v4.8b, #1 // B1
	pmull v2.8h, v6.8b, v2.8b // E = A*B1
	ext v17.8b, v6.8b, v6.8b, #2 // A2
	pmull v17.8h, v17.8b, v4.8b // H = A2*B
	ext v19.8b, v4.8b, v4.8b, #2 // B2
	pmull v19.8h, v6.8b, v19.8b // G = A*B2
	ext v18.8b, v6.8b, v6.8b, #3 // A3
	eor v16.16b, v16.16b, v2.16b // L = E + F
	pmull v18.8h, v18.8b, v4.8b // J = A3*B
	ext v2.8b, v4.8b, v4.8b, #3 // B3
	eor v17.16b, v17.16b, v19.16b // M = G + H
	pmull v2.8h, v6.8b, v2.8b // I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	// veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
	// vand $t0#hi, $t0#hi, $k48
	// veor $t0#lo, $t0#lo, $t0#hi
	//
	// veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
	// vand $t1#hi, $t1#hi, $k32
	// veor $t1#lo, $t1#lo, $t1#hi
	//
	// veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
	// vand $t2#hi, $t2#hi, $k16
	// veor $t2#lo, $t2#lo, $t2#hi
	//
	// veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
	// vmov.i64 $t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext v19.8b, v4.8b, v4.8b, #4 // B4
	eor v18.16b, v18.16b, v2.16b // N = I + J
	pmull v19.8h, v6.8b, v19.8b // K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
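	// As in the two blocks above: zip1/zip2 gather the low and high
	// 64-bit halves of the four partial products so each eor/and
	// touches two of them per instruction, and the final zips put the
	// halves back where the shifts below expect them.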
	zip1 v20.2d, v16.2d, v17.2d
	zip1 v22.2d, v18.2d, v19.2d
	zip2 v21.2d, v16.2d, v17.2d
	zip2 v23.2d, v18.2d, v19.2d
	eor v20.16b, v20.16b, v21.16b
	eor v22.16b, v22.16b, v23.16b
	and v21.16b, v21.16b, v24.16b
	and v23.16b, v23.16b, v25.16b
	eor v20.16b, v20.16b, v21.16b
	eor v22.16b, v22.16b, v23.16b
	zip1 v16.2d, v20.2d, v21.2d
	zip1 v18.2d, v22.2d, v23.2d
	zip2 v17.2d, v20.2d, v21.2d
	zip2 v19.2d, v22.2d, v23.2d

	ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
	ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
	pmull v2.8h, v6.8b, v4.8b // D = A*B
	ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
	ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
	eor v16.16b, v16.16b, v17.16b
	eor v18.16b, v18.16b, v19.16b
	eor v2.16b, v2.16b, v16.16b
	eor v2.16b, v2.16b, v18.16b
	ext v16.16b, v0.16b, v2.16b, #8
	eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing
	eor v1.16b, v1.16b, v2.16b
	eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi
	ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result
	// This is a no-op due to the ins instruction below.
	// ins v2.d[0], v1.d[1]

	// equivalent of reduction_avx from ghash-x86_64.pl
	shl v17.2d, v0.2d, #57 // 1st phase
	shl v18.2d, v0.2d, #62
	eor v18.16b, v18.16b, v17.16b //
	shl v17.2d, v0.2d, #63
	eor v18.16b, v18.16b, v17.16b //
	// Note Xm contains {Xl.d[1], Xh.d[0]}.
	eor v18.16b, v18.16b, v1.16b
	ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0]
	ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1]

	ushr v18.2d, v0.2d, #1 // 2nd phase
	eor v2.16b, v2.16b, v0.16b
	eor v0.16b, v0.16b, v18.16b //
	ushr v18.2d, v18.2d, #6
	ushr v0.2d, v0.2d, #1 //
	eor v0.16b, v0.16b, v2.16b //
	eor v0.16b, v0.16b, v18.16b //

	subs x3, x3, #16
	bne .Loop_neon

	rev64 v0.16b, v0.16b // byteswap Xi and write
	ext v0.16b, v0.16b, v0.16b, #8
	st1 {v0.16b}, [x0]

	ret
.size GFp_gcm_ghash_neon,.-GFp_gcm_ghash_neon

.section .rodata
.align 4
.Lmasks:
.quad 0x0000ffffffffffff // k48
.quad 0x00000000ffffffff // k32
.quad 0x000000000000ffff // k16
.quad 0x0000000000000000 // k0
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#endif
#endif // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits