// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

.text

.globl	_gcm_init_v8
.private_extern	_gcm_init_v8

.align	4
_gcm_init_v8:
	ld1	{v17.2d},[x1]		//load input H
	movi	v19.16b,#0xe1
	shl	v19.2d,v19.2d,#57	//0xc2.0
	ext	v3.16b,v17.16b,v17.16b,#8
	ushr	v18.2d,v19.2d,#63
	dup	v17.4s,v17.s[1]
	ext	v16.16b,v18.16b,v19.16b,#8	//t0=0xc2....01
	ushr	v18.2d,v3.2d,#63
	sshr	v17.4s,v17.4s,#31	//broadcast carry bit
	and	v18.16b,v18.16b,v16.16b
	shl	v3.2d,v3.2d,#1
	ext	v18.16b,v18.16b,v18.16b,#8
	and	v16.16b,v16.16b,v17.16b
	orr	v3.16b,v3.16b,v18.16b	//H<<<=1
	eor	v20.16b,v3.16b,v16.16b	//twisted H
	st1	{v20.2d},[x0],#16	//store Htable[0]

	//calculate H^2
	ext	v16.16b,v20.16b,v20.16b,#8	//Karatsuba pre-processing
	pmull	v0.1q,v20.1d,v20.1d
	eor	v16.16b,v16.16b,v20.16b
	pmull2	v2.1q,v20.2d,v20.2d
	pmull	v1.1q,v16.1d,v16.1d

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v22.16b,v0.16b,v18.16b

	ext	v17.16b,v22.16b,v22.16b,#8	//Karatsuba pre-processing
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8	//pack Karatsuba pre-processed
	st1	{v21.2d,v22.2d},[x0]	//store Htable[1..2]

	ret

.globl	_gcm_gmult_v8
.private_extern	_gcm_gmult_v8

.align	4
_gcm_gmult_v8:
	ld1	{v17.2d},[x0]		//load Xi
	movi	v19.16b,#0xe1
	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
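	//Editorial note, not generator output: the sequence below is one
	//GHASH multiplication, Xi = Xi*H over GF(2^128) with the
	//polynomial x^128+x^7+x^2+x+1.  Three pmull/pmull2 instructions
	//implement a Karatsuba 128x128-bit carry-less multiply (low
	//halves, high halves, and the xor of the halves for the middle
	//term), and the 0xc2...01 constant built in v19 drives the usual
	//two-phase reduction of the 256-bit product back to 128 bits.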
	shl	v19.2d,v19.2d,#57
#ifndef	__ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v3.16b,v17.16b,v17.16b,#8

	pmull	v0.1q,v20.1d,v3.1d	//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b	//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d	//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d	//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

#ifndef	__ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret

.globl	_gcm_ghash_v8
.private_extern	_gcm_ghash_v8

.align	4
_gcm_ghash_v8:
	ld1	{v0.2d},[x0]		//load [rotated] Xi
					//"[rotated]" means that the
					//loaded value has to be
					//rotated to match the
					//algorithm specification
	subs	x3,x3,#32		//see if x3 is 32 or larger
	mov	x12,#16			//x12 is used as the post-
					//increment for the input
					//pointer; as the loop is
					//modulo-scheduled, x12 is
					//zeroed just in time to keep
					//from overstepping inp[len],
					//which means the last
					//block(s) are actually loaded
					//twice, but the last copy is
					//not processed
	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v22.2d},[x1]
	csel	x12,xzr,x12,eq		//is it time to zero x12?
	ext	v0.16b,v0.16b,v0.16b,#8	//rotate Xi
	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
	shl	v19.2d,v19.2d,#57	//compose 0xc2.0 constant
#ifndef	__ARMEB__
	rev64	v16.16b,v16.16b
	rev64	v0.16b,v0.16b
#endif
	ext	v3.16b,v16.16b,v16.16b,#8	//rotate I[0]
	b.lo	Lodd_tail_v8		//x3 was less than 32
	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
#ifndef	__ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v7.16b,v17.16b,v17.16b,#8
	eor	v3.16b,v3.16b,v0.16b	//I[i]^=Xi
	pmull	v4.1q,v20.1d,v7.1d	//H·Ii+1
	eor	v17.16b,v17.16b,v7.16b	//Karatsuba pre-processing
	pmull2	v6.1q,v20.2d,v7.2d
	b	Loop_mod2x_v8

.align	4
Loop_mod2x_v8:
	ext	v18.16b,v3.16b,v3.16b,#8
	subs	x3,x3,#32		//is there more data?
	pmull	v0.1q,v22.1d,v3.1d	//H^2.lo·Xi.lo
	csel	x12,xzr,x12,lo		//is it time to zero x12?

	pmull	v5.1q,v21.1d,v17.1d
	eor	v18.16b,v18.16b,v3.16b	//Karatsuba pre-processing
	pmull2	v2.1q,v22.2d,v3.2d	//H^2.hi·Xi.hi
	eor	v0.16b,v0.16b,v4.16b	//accumulate
	pmull2	v1.1q,v21.2d,v18.2d	//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]

	eor	v2.16b,v2.16b,v6.16b
	csel	x12,xzr,x12,eq		//is it time to zero x12?
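	//Editorial note, not generator output: each trip through
	//Loop_mod2x_v8 folds two blocks into the hash as
	//Xi = (Xi^I[i])*H^2 ^ I[i+1]*H, so one reduction is amortized
	//over 32 bytes of input.  The H·I[i+1] multiply and the
	//I[i+2]/I[i+3] loads for the next iteration are interleaved
	//with the current reduction (modulo-scheduling); x12 drops to
	//zero on the final trip so these look-ahead loads re-read the
	//last blocks instead of running past inp[len].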
	eor	v1.16b,v1.16b,v5.16b

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
#ifndef	__ARMEB__
	rev64	v16.16b,v16.16b
#endif
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase of reduction

#ifndef	__ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v7.16b,v17.16b,v17.16b,#8
	ext	v3.16b,v16.16b,v16.16b,#8
	eor	v0.16b,v1.16b,v18.16b
	pmull	v4.1q,v20.1d,v7.1d	//H·Ii+1
	eor	v3.16b,v3.16b,v2.16b	//accumulate v3.16b early

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v3.16b,v3.16b,v18.16b
	eor	v17.16b,v17.16b,v7.16b	//Karatsuba pre-processing
	eor	v3.16b,v3.16b,v0.16b
	pmull2	v6.1q,v20.2d,v7.2d
	b.hs	Loop_mod2x_v8		//there were at least 32 more bytes

	eor	v2.16b,v2.16b,v18.16b
	ext	v3.16b,v16.16b,v16.16b,#8	//re-construct v3.16b
	adds	x3,x3,#32		//re-construct x3
	eor	v0.16b,v0.16b,v2.16b	//re-construct v0.16b
	b.eq	Ldone_v8		//is x3 zero?
Lodd_tail_v8:
	ext	v18.16b,v0.16b,v0.16b,#8
	eor	v3.16b,v3.16b,v0.16b	//inp^=Xi
	eor	v17.16b,v16.16b,v18.16b	//v17.16b is rotated inp^Xi

	pmull	v0.1q,v20.1d,v3.1d	//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b	//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d	//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d	//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

Ldone_v8:
#ifndef	__ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret

.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif  // !OPENSSL_NO_ASM
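// Editorial reference, not generator output.  As read off the code
// above, the entry points take their arguments in the standard AAPCS64
// registers:
//   _gcm_init_v8:  x0 = Htable (output; stores the twisted H, a packed
//                  Karatsuba term, and H^2), x1 = the raw hash key H
//   _gcm_gmult_v8: x0 = Xi (16-byte hash state, updated in place),
//                  x1 = Htable
//   _gcm_ghash_v8: x0 = Xi, x1 = Htable, x2 = inp, x3 = len in bytes;
//                  the code only handles whole 16-byte blocks
// The exact C prototypes live in BoringSSL's GCM internals and are not
// restated here.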