#if defined(__aarch64__)
#include <openssl/arm_arch.h>

// GHASH for ARMv8 with Crypto Extensions (PMULL/PMULL2 64x64->128
// carry-less multiply).  Provides the three GCM hash primitives
// (presumably matching OpenSSL's gcm_init/gmult/ghash prototypes —
// TODO confirm against the C header):
//   gcm_init_v8  — x0 = Htable out, x1 = H in
//   gcm_gmult_v8 — x0 = Xi (in/out), x1 = Htable
//   gcm_ghash_v8 — x0 = Xi (in/out), x1 = Htable, x2 = inp, x3 = len
// Multiplications use a Karatsuba split (lo, hi, lo+hi products) and a
// two-phase reduction against the GCM polynomial constant 0xc2...01
// composed in v19.  Comment char is '//' — ';' is NOT a comment in
// GNU AArch64 assembly.

.text
#if !defined(__clang__)
.arch	armv8-a+crypto
#endif

// gcm_init_v8: precompute the hash-key table.
// In:  x1 -> 128-bit hash key H.
// Out: x0 -> Htable[0..2] = twisted H (H<<<1 with the carry folded in,
//      i.e. H in the "twisted" bit order the multiply steps expect),
//      packed Karatsuba pre-processed halves, and H^2.
.globl	gcm_init_v8
.type	gcm_init_v8,%function
.align	4
gcm_init_v8:
	ld1	{v17.2d},[x1]		//load input H
	movi	v19.16b,#0xe1
	shl	v19.2d,v19.2d,#57	//0xc2.0  (reduction constant)
	ext	v3.16b,v17.16b,v17.16b,#8
	ushr	v18.2d,v19.2d,#63
	dup	v17.4s,v17.s[1]
	ext	v16.16b,v18.16b,v19.16b,#8	//t0=0xc2....01
	ushr	v18.2d,v3.2d,#63
	sshr	v17.4s,v17.4s,#31	//broadcast carry bit
	and	v18.16b,v18.16b,v16.16b
	shl	v3.2d,v3.2d,#1
	ext	v18.16b,v18.16b,v18.16b,#8
	and	v16.16b,v16.16b,v17.16b
	orr	v3.16b,v3.16b,v18.16b	//H<<<=1
	eor	v20.16b,v3.16b,v16.16b	//twisted H
	st1	{v20.2d},[x0],#16	//store Htable[0]

	//calculate H^2  (square of twisted H via Karatsuba + reduction)
	ext	v16.16b,v20.16b,v20.16b,#8	//Karatsuba pre-processing
	pmull	v0.1q,v20.1d,v20.1d
	eor	v16.16b,v16.16b,v20.16b
	pmull2	v2.1q,v20.2d,v20.2d
	pmull	v1.1q,v16.1d,v16.1d

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v22.16b,v0.16b,v18.16b	//v22 = H^2

	ext	v17.16b,v22.16b,v22.16b,#8	//Karatsuba pre-processing
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8	//pack Karatsuba pre-processed
	st1	{v21.2d,v22.2d},[x0]	//store Htable[1..2]

	ret
.size	gcm_init_v8,.-gcm_init_v8

// gcm_gmult_v8: Xi = Xi * H mod P — one GHASH multiply.
// In/out: x0 -> 128-bit Xi;  x1 -> Htable built by gcm_init_v8.
// Byte order of Xi is reversed on little-endian (__ARMEB__ not set).
.globl	gcm_gmult_v8
.type	gcm_gmult_v8,%function
.align	4
gcm_gmult_v8:
	ld1	{v17.2d},[x0]		//load Xi
	movi	v19.16b,#0xe1
	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
	shl	v19.2d,v19.2d,#57	//compose 0xc2.0 constant
#ifndef	__ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v3.16b,v17.16b,v17.16b,#8

	pmull	v0.1q,v20.1d,v3.1d	//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b	//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d	//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d	//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

#ifndef	__ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_gmult_v8,.-gcm_gmult_v8

// gcm_ghash_v8: hash x3 bytes of input into Xi.
// In/out: x0 -> Xi;  x1 -> Htable;  x2 -> input;  x3 = length.
// Processes two blocks per iteration of the software-pipelined
// .Loop_mod2x_v8 (using H^2), with .Lodd_tail_v8 handling a final
// single block.  x12 is the input post-increment and is zeroed just
// in time so the pipelined loads never run past inp[len].
.globl	gcm_ghash_v8
.type	gcm_ghash_v8,%function
.align	4
gcm_ghash_v8:
	ld1	{v0.2d},[x0]		//load [rotated] Xi
					//"[rotated]" means that
					//loaded value would have
					//to be rotated in order to
					//make it appear as in
					//algorithm specification
	subs	x3,x3,#32		//see if x3 is 32 or larger
	mov	x12,#16			//x12 is used as post-
					//increment for input pointer;
					//as loop is modulo-scheduled
					//x12 is zeroed just in time
					//to preclude overstepping
					//inp[len], which means that
					//last block[s] are actually
					//loaded twice, but last
					//copy is not processed
	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v22.2d},[x1]
	csel	x12,xzr,x12,eq		//is it time to zero x12?
	ext	v0.16b,v0.16b,v0.16b,#8	//rotate Xi
	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
	shl	v19.2d,v19.2d,#57	//compose 0xc2.0 constant
#ifndef	__ARMEB__
	rev64	v16.16b,v16.16b
	rev64	v0.16b,v0.16b
#endif
	ext	v3.16b,v16.16b,v16.16b,#8	//rotate I[0]
	b.lo	.Lodd_tail_v8		//x3 was less than 32
	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
#ifndef	__ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v7.16b,v17.16b,v17.16b,#8
	eor	v3.16b,v3.16b,v0.16b	//I[i]^=Xi
	pmull	v4.1q,v20.1d,v7.1d	//H·Ii+1
	eor	v17.16b,v17.16b,v7.16b	//Karatsuba pre-processing
	pmull2	v6.1q,v20.2d,v7.2d
	b	.Loop_mod2x_v8

.align	4
.Loop_mod2x_v8:
	//main loop: two blocks per iteration, (Xi^I[i])·H^2 + I[i+1]·H,
	//with loads/multiplies for the next pair interleaved throughout
	ext	v18.16b,v3.16b,v3.16b,#8
	subs	x3,x3,#32		//is there more data?
	pmull	v0.1q,v22.1d,v3.1d	//H^2.lo·Xi.lo
	csel	x12,xzr,x12,lo		//is it time to zero x12?

	pmull	v5.1q,v21.1d,v17.1d
	eor	v18.16b,v18.16b,v3.16b	//Karatsuba pre-processing
	pmull2	v2.1q,v22.2d,v3.2d	//H^2.hi·Xi.hi
	eor	v0.16b,v0.16b,v4.16b	//accumulate
	pmull2	v1.1q,v21.2d,v18.2d	//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]

	eor	v2.16b,v2.16b,v6.16b
	csel	x12,xzr,x12,eq		//is it time to zero x12?
	eor	v1.16b,v1.16b,v5.16b

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
#ifndef	__ARMEB__
	rev64	v16.16b,v16.16b
#endif
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase of reduction

#ifndef	__ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v7.16b,v17.16b,v17.16b,#8
	ext	v3.16b,v16.16b,v16.16b,#8
	eor	v0.16b,v1.16b,v18.16b
	pmull	v4.1q,v20.1d,v7.1d	//H·Ii+1
	eor	v3.16b,v3.16b,v2.16b	//accumulate v3.16b early

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v3.16b,v3.16b,v18.16b
	eor	v17.16b,v17.16b,v7.16b	//Karatsuba pre-processing
	eor	v3.16b,v3.16b,v0.16b
	pmull2	v6.1q,v20.2d,v7.2d
	b.hs	.Loop_mod2x_v8		//there was at least 32 more bytes

	//loop exit: undo the speculative work of the final iteration
	eor	v2.16b,v2.16b,v18.16b
	ext	v3.16b,v16.16b,v16.16b,#8	//re-construct v3.16b
	adds	x3,x3,#32		//re-construct x3
	eor	v0.16b,v0.16b,v2.16b	//re-construct v0.16b
	b.eq	.Ldone_v8		//is x3 zero?
.Lodd_tail_v8:
	//one trailing block: Xi = (Xi^inp)·H, same multiply/reduce
	//sequence as gcm_gmult_v8 above
	ext	v18.16b,v0.16b,v0.16b,#8
	eor	v3.16b,v3.16b,v0.16b	//inp^=Xi
	eor	v17.16b,v16.16b,v18.16b	//v17.16b is rotated inp^Xi

	pmull	v0.1q,v20.1d,v3.1d	//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b	//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d	//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d	//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

.Ldone_v8:
#ifndef	__ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_ghash_v8,.-gcm_ghash_v8
//NUL-terminated signature string:
//"GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif