// GHASH (GCM authentication) for AArch64 using the ARMv8 Crypto
// Extensions' polynomial multipliers (pmull/pmull2).
//
// NOTE(review): this looks like the generated output of OpenSSL's
// CRYPTOGAMS ghashv8-armx perlasm module (see the .byte credits string at
// the bottom) — if so, fixes belong in the perl source, not this file.
//
// Conventions used throughout:
//   v19 holds the reduction constant 0xc2...0 (0xe1 shifted left by 57),
//       used for the two-phase Montgomery-style reduction mod the GHASH
//       polynomial.
//   Each 128x128-bit multiply is done Karatsuba-style: lo*lo (pmull),
//   hi*hi (pmull2), and (lo+hi)*(lo+hi) (pmull) on pre-XORed halves,
//   followed by "post-processing" XORs to recover the middle term.
//   On little-endian (#ifndef __ARMEB__) inputs/outputs are byte-reversed
//   with rev64 to match the big-endian GHASH bit convention.
#if defined(__aarch64__)
#include <openssl/arm_arch.h>

.text
#if !defined(__clang__) || defined(BORINGSSL_CLANG_SUPPORTS_DOT_ARCH)
.arch	armv8-a+crypto
#endif

// void gcm_init_v8(Htable, const H)
//   x0 = Htable (out): [0] = twisted H, [1] = packed Karatsuba
//        pre-processed halves, [2] = twisted H^2
//   x1 = H (in): the raw 128-bit hash subkey
// Computes the "twisted" form of H (H<<1 mod P, folded with the 0xc2
// constant) and H^2 so gcm_ghash_v8 can process two blocks per iteration.
// Clobbers v0-v3, v16-v22; no stack use; leaf function.
.globl	gcm_init_v8
.hidden	gcm_init_v8
.type	gcm_init_v8,%function
.align	4
gcm_init_v8:
	ld1	{v17.2d},[x1]			//load input H
	movi	v19.16b,#0xe1
	shl	v19.2d,v19.2d,#57		//0xc2.0
	ext	v3.16b,v17.16b,v17.16b,#8
	ushr	v18.2d,v19.2d,#63
	dup	v17.4s,v17.s[1]
	ext	v16.16b,v18.16b,v19.16b,#8	//t0=0xc2....01
	ushr	v18.2d,v3.2d,#63
	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
	and	v18.16b,v18.16b,v16.16b
	shl	v3.2d,v3.2d,#1
	ext	v18.16b,v18.16b,v18.16b,#8
	and	v16.16b,v16.16b,v17.16b
	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
	eor	v20.16b,v3.16b,v16.16b		//twisted H
	st1	{v20.2d},[x0],#16		//store Htable[0]

	//calculate H^2 (square of twisted H, same multiply+reduce sequence
	//as gcm_gmult_v8 but with both operands equal to v20)
	ext	v16.16b,v20.16b,v20.16b,#8	//Karatsuba pre-processing
	pmull	v0.1q,v20.1d,v20.1d
	eor	v16.16b,v16.16b,v20.16b
	pmull2	v2.1q,v20.2d,v20.2d
	pmull	v1.1q,v16.1d,v16.1d

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v22.16b,v0.16b,v18.16b		//v22 = twisted H^2

	ext	v17.16b,v22.16b,v22.16b,#8	//Karatsuba pre-processing
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8	//pack Karatsuba pre-processed
	st1	{v21.2d,v22.2d},[x0]		//store Htable[1..2]

	ret
.size	gcm_init_v8,.-gcm_init_v8

// void gcm_gmult_v8(Xi, const Htable)
//   x0 = Xi (in/out): 128-bit accumulator; replaced by Xi * H mod P
//   x1 = Htable (in): table produced by gcm_init_v8 (twisted H and
//        pre-processed halves are loaded; H^2 is not needed here)
// Single-block GHASH multiplication.  Clobbers v0-v3, v16-v21; leaf.
.globl	gcm_gmult_v8
.hidden	gcm_gmult_v8
.type	gcm_gmult_v8,%function
.align	4
gcm_gmult_v8:
	ld1	{v17.2d},[x0]			//load Xi
	movi	v19.16b,#0xe1
	ld1	{v20.2d,v21.2d},[x1]		//load twisted H, ...
	shl	v19.2d,v19.2d,#57
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v3.16b,v17.16b,v17.16b,#8

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]			//write out Xi

	ret
.size	gcm_gmult_v8,.-gcm_gmult_v8

// void gcm_ghash_v8(Xi, const Htable, const inp, len)
//   x0 = Xi (in/out): 128-bit GHASH accumulator
//   x1 = Htable (in): twisted H, packed halves, and H^2 from gcm_init_v8
//   x2 = inp (in): data to authenticate
//   x3 = len: byte count — assumed a multiple of 16 (TODO confirm with
//        caller; the code only branches on 16-vs-32+ granularity)
// Hashes the buffer into Xi, two blocks per iteration of the
// modulo-scheduled main loop (using H^2), with an odd-tail path for a
// trailing single block.  Clobbers x12, v0-v7, v16-v22; leaf.
.globl	gcm_ghash_v8
.hidden	gcm_ghash_v8
.type	gcm_ghash_v8,%function
.align	4
gcm_ghash_v8:
	ld1	{v0.2d},[x0]		//load [rotated] Xi
					//"[rotated]" means that
					//loaded value would have
					//to be rotated in order to
					//make it appear as in
					//algorithm specification
	subs	x3,x3,#32		//see if x3 is 32 or larger
	mov	x12,#16			//x12 is used as post-
					//increment for input pointer;
					//as loop is modulo-scheduled
					//x12 is zeroed just in time
					//to preclude overstepping
					//inp[len], which means that
					//last block[s] are actually
					//loaded twice, but last
					//copy is not processed
	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v22.2d},[x1]
	csel	x12,xzr,x12,eq		//is it time to zero x12?
	ext	v0.16b,v0.16b,v0.16b,#8	//rotate Xi
	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
	shl	v19.2d,v19.2d,#57	//compose 0xc2.0 constant
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
	rev64	v0.16b,v0.16b
#endif
	ext	v3.16b,v16.16b,v16.16b,#8	//rotate I[0]
	b.lo	.Lodd_tail_v8		//x3 was less than 32
	//Main loop preamble: start the multiply for block I[1] by H so the
	//loop body can overlap it with the H^2 multiply of (Xi^I[0]).
	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v7.16b,v17.16b,v17.16b,#8
	eor	v3.16b,v3.16b,v0.16b	//I[i]^=Xi
	pmull	v4.1q,v20.1d,v7.1d	//H·Ii+1
	eor	v17.16b,v17.16b,v7.16b	//Karatsuba pre-processing
	pmull2	v6.1q,v20.2d,v7.2d
	b	.Loop_mod2x_v8

.align	4
//Each iteration folds two input blocks: (Xi^I[i])*H^2 + I[i+1]*H,
//interleaving loads/rev64 for the next pair with the reduction.
.Loop_mod2x_v8:
	ext	v18.16b,v3.16b,v3.16b,#8
	subs	x3,x3,#32		//is there more data?
	pmull	v0.1q,v22.1d,v3.1d	//H^2.lo·Xi.lo
	csel	x12,xzr,x12,lo		//is it time to zero x12?

	pmull	v5.1q,v21.1d,v17.1d
	eor	v18.16b,v18.16b,v3.16b	//Karatsuba pre-processing
	pmull2	v2.1q,v22.2d,v3.2d	//H^2.hi·Xi.hi
	eor	v0.16b,v0.16b,v4.16b	//accumulate
	pmull2	v1.1q,v21.2d,v18.2d	//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]

	eor	v2.16b,v2.16b,v6.16b
	csel	x12,xzr,x12,eq		//is it time to zero x12?
	eor	v1.16b,v1.16b,v5.16b

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
#endif
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase of reduction

#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v7.16b,v17.16b,v17.16b,#8
	ext	v3.16b,v16.16b,v16.16b,#8
	eor	v0.16b,v1.16b,v18.16b
	pmull	v4.1q,v20.1d,v7.1d	//H·Ii+1
	eor	v3.16b,v3.16b,v2.16b	//accumulate v3.16b early

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v3.16b,v3.16b,v18.16b
	eor	v17.16b,v17.16b,v7.16b	//Karatsuba pre-processing
	eor	v3.16b,v3.16b,v0.16b
	pmull2	v6.1q,v20.2d,v7.2d
	b.hs	.Loop_mod2x_v8		//there was at least 32 more bytes

	//Loop exit: the last iteration folded v2/v18 into v3 speculatively;
	//undo that to recover Xi (v0) and the pending input block (v3).
	eor	v2.16b,v2.16b,v18.16b
	ext	v3.16b,v16.16b,v16.16b,#8	//re-construct v3.16b
	adds	x3,x3,#32		//re-construct x3
	eor	v0.16b,v0.16b,v2.16b	//re-construct v0.16b
	b.eq	.Ldone_v8		//is x3 zero?
//Single trailing block: one multiply by twisted H, same sequence as
//gcm_gmult_v8 but with the input block folded into Xi first.
.Lodd_tail_v8:
	ext	v18.16b,v0.16b,v0.16b,#8
	eor	v3.16b,v3.16b,v0.16b	//inp^=Xi
	eor	v17.16b,v16.16b,v18.16b	//v17.16b is rotated inp^Xi

	pmull	v0.1q,v20.1d,v3.1d	//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b	//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d	//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d	//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

.Ldone_v8:
#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_ghash_v8,.-gcm_ghash_v8
//Credits string: "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif