// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#include <GFp/arm_arch.h>

.section .rodata

.type _vpaes_consts,%object
.align 7 // totally strategic alignment
_vpaes_consts:
.Lk_mc_forward: // mc_forward
.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
.quad 0x080B0A0904070605, 0x000302010C0F0E0D
.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
.quad 0x000302010C0F0E0D, 0x080B0A0904070605
.Lk_mc_backward: // mc_backward
.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
.quad 0x020100030E0D0C0F, 0x0A09080B06050407
.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
.quad 0x0A09080B06050407, 0x020100030E0D0C0F
.Lk_sr: // sr
.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
.quad 0x030E09040F0A0500, 0x0B06010C07020D08
.quad 0x0F060D040B020900, 0x070E050C030A0108
.quad 0x0B0E0104070A0D00, 0x0306090C0F020508

//
// "Hot" constants
//
.Lk_inv: // inv, inva
.quad 0x0E05060F0D080180, 0x040703090A0B0C02
.quad 0x01040A060F0B0780, 0x030D0E0C02050809
.Lk_ipt: // input transform (lo, hi)
.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_sbo: // sbou, sbot
.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
.Lk_sb1: // sb1u, sb1t
.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
.Lk_sb2: // sb2u, sb2t
.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD

//
// Key schedule constants
//
.Lk_dksd: // decryption key schedule: invskew x*D
.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb: // decryption key schedule: invskew x*B
.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse: // decryption key schedule: invskew x*E + 0x63
.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9: // decryption key schedule: invskew x*9
.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE

.Lk_rcon: // rcon
.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81

.Lk_opt: // output transform
.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
.Lk_deskew: // deskew tables: inverts the sbox's "skew"
.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77

.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
.align 2
.size _vpaes_consts,.-_vpaes_consts
.align 6

.text
##
## _aes_preheat
##
## Fills register %r10 -> .aes_consts (so you can -fPIC)
## and %xmm9-%xmm15 as specified below.
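## (In this AArch64 translation the table pointer is x10 rather than %r10, and
## the "preheated" constants live in v17-v27: v17 = the 0x0f nibble mask,
## v18/v19 = .Lk_inv, v20/v21 = .Lk_ipt, v22/v23 = .Lk_sbo, and v24-v27 =
## .Lk_sb1/.Lk_sb2, as loaded by _vpaes_encrypt_preheat below.)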
##
.type _vpaes_encrypt_preheat,%function
.align 4
_vpaes_encrypt_preheat:
    adrp x10, .Lk_inv
    add x10, x10, :lo12:.Lk_inv
    movi v17.16b, #0x0f
    ld1 {v18.2d,v19.2d}, [x10],#32  // .Lk_inv
    ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64  // .Lk_ipt, .Lk_sbo
    ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10]  // .Lk_sb1, .Lk_sb2
    ret
.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat

##
## _aes_encrypt_core
##
## AES-encrypt %xmm0.
##
## Inputs:
## %xmm0 = input
## %xmm9-%xmm15 as in _vpaes_preheat
## (%rdx) = scheduled keys
##
## Output in %xmm0
## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
## Preserves %xmm6 - %xmm8 so you get some local vectors
##
##
.type _vpaes_encrypt_core,%function
.align 4
_vpaes_encrypt_core:
    mov x9, x2
    ldr w8, [x2,#240]  // pull rounds
    adrp x11, .Lk_mc_forward+16
    add x11, x11, :lo12:.Lk_mc_forward+16
    // vmovdqa .Lk_ipt(%rip), %xmm2  # iptlo
    ld1 {v16.2d}, [x9], #16  // vmovdqu (%r9), %xmm5  # round0 key
    and v1.16b, v7.16b, v17.16b  // vpand %xmm9, %xmm0, %xmm1
    ushr v0.16b, v7.16b, #4  // vpsrlb $4, %xmm0, %xmm0
    tbl v1.16b, {v20.16b}, v1.16b  // vpshufb %xmm1, %xmm2, %xmm1
    // vmovdqa .Lk_ipt+16(%rip), %xmm3  # ipthi
    tbl v2.16b, {v21.16b}, v0.16b  // vpshufb %xmm0, %xmm3, %xmm2
    eor v0.16b, v1.16b, v16.16b  // vpxor %xmm5, %xmm1, %xmm0
    eor v0.16b, v0.16b, v2.16b  // vpxor %xmm2, %xmm0, %xmm0
    b .Lenc_entry

.align 4
.Lenc_loop:
    // middle of middle round
    add x10, x11, #0x40
    tbl v4.16b, {v25.16b}, v2.16b  // vpshufb %xmm2, %xmm13, %xmm4  # 4 = sb1u
    ld1 {v1.2d}, [x11], #16  // vmovdqa -0x40(%r11,%r10), %xmm1  # .Lk_mc_forward[]
    tbl v0.16b, {v24.16b}, v3.16b  // vpshufb %xmm3, %xmm12, %xmm0  # 0 = sb1t
    eor v4.16b, v4.16b, v16.16b  // vpxor %xmm5, %xmm4, %xmm4  # 4 = sb1u + k
    tbl v5.16b, {v27.16b}, v2.16b  // vpshufb %xmm2, %xmm15, %xmm5  # 4 = sb2u
    eor v0.16b, v0.16b, v4.16b  // vpxor %xmm4, %xmm0, %xmm0  # 0 = A
    tbl v2.16b, {v26.16b}, v3.16b  // vpshufb %xmm3, %xmm14, %xmm2  # 2 = sb2t
    ld1 {v4.2d}, [x10]  // vmovdqa (%r11,%r10), %xmm4  # .Lk_mc_backward[]
    tbl v3.16b, {v0.16b}, v1.16b  // vpshufb %xmm1, %xmm0, %xmm3  # 0 = B
    eor v2.16b, v2.16b, v5.16b  // vpxor %xmm5, %xmm2, %xmm2  # 2 = 2A
    tbl v0.16b, {v0.16b}, v4.16b  // vpshufb %xmm4, %xmm0, %xmm0  # 3 = D
    eor v3.16b, v3.16b, v2.16b  // vpxor %xmm2, %xmm3, %xmm3  # 0 = 2A+B
    tbl v4.16b, {v3.16b}, v1.16b  // vpshufb %xmm1, %xmm3, %xmm4  # 0 = 2B+C
    eor v0.16b, v0.16b, v3.16b  // vpxor %xmm3, %xmm0, %xmm0  # 3 = 2A+B+D
    and x11, x11, #~(1<<6)  // and $0x30, %r11  # ... mod 4
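    // The #~(1<<6) mask wraps x11 within the 64-byte .Lk_mc_forward table
    // (four 16-byte rows, i.e. round index mod 4). This relies on
    // _vpaes_consts being 128-byte aligned (.align 7 above), so clearing
    // bit 6 of the pointer matches the x86 "and $0x30, %r11" index math.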
    eor v0.16b, v0.16b, v4.16b  // vpxor %xmm4, %xmm0, %xmm0  # 0 = 2A+3B+C+D
    sub w8, w8, #1  // nr--

.Lenc_entry:
    // top of round
    and v1.16b, v0.16b, v17.16b  // vpand %xmm0, %xmm9, %xmm1  # 0 = k
    ushr v0.16b, v0.16b, #4  // vpsrlb $4, %xmm0, %xmm0  # 1 = i
    tbl v5.16b, {v19.16b}, v1.16b  // vpshufb %xmm1, %xmm11, %xmm5  # 2 = a/k
    eor v1.16b, v1.16b, v0.16b  // vpxor %xmm0, %xmm1, %xmm1  # 0 = j
    tbl v3.16b, {v18.16b}, v0.16b  // vpshufb %xmm0, %xmm10, %xmm3  # 3 = 1/i
    tbl v4.16b, {v18.16b}, v1.16b  // vpshufb %xmm1, %xmm10, %xmm4  # 4 = 1/j
    eor v3.16b, v3.16b, v5.16b  // vpxor %xmm5, %xmm3, %xmm3  # 3 = iak = 1/i + a/k
    eor v4.16b, v4.16b, v5.16b  // vpxor %xmm5, %xmm4, %xmm4  # 4 = jak = 1/j + a/k
    tbl v2.16b, {v18.16b}, v3.16b  // vpshufb %xmm3, %xmm10, %xmm2  # 2 = 1/iak
    tbl v3.16b, {v18.16b}, v4.16b  // vpshufb %xmm4, %xmm10, %xmm3  # 3 = 1/jak
    eor v2.16b, v2.16b, v1.16b  // vpxor %xmm1, %xmm2, %xmm2  # 2 = io
    eor v3.16b, v3.16b, v0.16b  // vpxor %xmm0, %xmm3, %xmm3  # 3 = jo
    ld1 {v16.2d}, [x9],#16  // vmovdqu (%r9), %xmm5
    cbnz w8, .Lenc_loop

    // middle of last round
    add x10, x11, #0x80
    // vmovdqa -0x60(%r10), %xmm4  # 3 : sbou .Lk_sbo
    // vmovdqa -0x50(%r10), %xmm0  # 0 : sbot .Lk_sbo+16
    tbl v4.16b, {v22.16b}, v2.16b  // vpshufb %xmm2, %xmm4, %xmm4  # 4 = sbou
    ld1 {v1.2d}, [x10]  // vmovdqa 0x40(%r11,%r10), %xmm1  # .Lk_sr[]
    tbl v0.16b, {v23.16b}, v3.16b  // vpshufb %xmm3, %xmm0, %xmm0  # 0 = sb1t
    eor v4.16b, v4.16b, v16.16b  // vpxor %xmm5, %xmm4, %xmm4  # 4 = sb1u + k
    eor v0.16b, v0.16b, v4.16b  // vpxor %xmm4, %xmm0, %xmm0  # 0 = A
    tbl v0.16b, {v0.16b}, v1.16b  // vpshufb %xmm1, %xmm0, %xmm0
    ret
.size _vpaes_encrypt_core,.-_vpaes_encrypt_core

.globl GFp_vpaes_encrypt
.hidden GFp_vpaes_encrypt
.type GFp_vpaes_encrypt,%function
.align 4
GFp_vpaes_encrypt:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0

    ld1 {v7.16b}, [x0]
    bl _vpaes_encrypt_preheat
    bl _vpaes_encrypt_core
    st1 {v0.16b}, [x1]

    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size GFp_vpaes_encrypt,.-GFp_vpaes_encrypt

.type _vpaes_encrypt_2x,%function
.align 4
_vpaes_encrypt_2x:
    mov x9, x2
    ldr w8, [x2,#240]  // pull rounds
    adrp x11, .Lk_mc_forward+16
    add x11, x11, :lo12:.Lk_mc_forward+16
    // vmovdqa .Lk_ipt(%rip), %xmm2  # iptlo
    ld1 {v16.2d}, [x9], #16  // vmovdqu (%r9), %xmm5  # round0 key
    and v1.16b, v14.16b, v17.16b  // vpand %xmm9, %xmm0, %xmm1
    ushr v0.16b, v14.16b, #4  // vpsrlb $4, %xmm0, %xmm0
    and v9.16b, v15.16b, v17.16b
    ushr v8.16b, v15.16b, #4
    tbl v1.16b, {v20.16b}, v1.16b  // vpshufb %xmm1, %xmm2, %xmm1
    tbl v9.16b, {v20.16b}, v9.16b
    // vmovdqa .Lk_ipt+16(%rip), %xmm3  # ipthi
    tbl v2.16b, {v21.16b}, v0.16b  // vpshufb %xmm0, %xmm3, %xmm2
    tbl v10.16b, {v21.16b}, v8.16b
    eor v0.16b, v1.16b, v16.16b  // vpxor %xmm5, %xmm1, %xmm0
    eor v8.16b, v9.16b, v16.16b
    eor v0.16b, v0.16b, v2.16b  // vpxor %xmm2, %xmm0, %xmm0
    eor v8.16b, v8.16b, v10.16b
    b .Lenc_2x_entry

.align 4
.Lenc_2x_loop:
    // middle of middle round
    add x10, x11, #0x40
    tbl v4.16b, {v25.16b}, v2.16b  // vpshufb %xmm2, %xmm13, %xmm4  # 4 = sb1u
    tbl v12.16b, {v25.16b}, v10.16b
    ld1 {v1.2d}, [x11], #16  // vmovdqa -0x40(%r11,%r10), %xmm1  # .Lk_mc_forward[]
    tbl v0.16b, {v24.16b}, v3.16b  // vpshufb %xmm3, %xmm12, %xmm0  # 0 = sb1t
    tbl v8.16b, {v24.16b}, v11.16b
    eor v4.16b, v4.16b, v16.16b  // vpxor %xmm5, %xmm4, %xmm4  # 4 = sb1u + k
    eor v12.16b, v12.16b, v16.16b
    tbl v5.16b, {v27.16b}, v2.16b  // vpshufb %xmm2, %xmm15, %xmm5  # 4 = sb2u
    tbl v13.16b, {v27.16b}, v10.16b
    eor v0.16b, v0.16b, v4.16b  // vpxor %xmm4, %xmm0, %xmm0  # 0 = A
    eor v8.16b, v8.16b, v12.16b
    tbl v2.16b, {v26.16b}, v3.16b  // vpshufb %xmm3, %xmm14, %xmm2  # 2 = sb2t
    tbl v10.16b, {v26.16b}, v11.16b
    ld1 {v4.2d}, [x10]  // vmovdqa (%r11,%r10), %xmm4  # .Lk_mc_backward[]
    tbl v3.16b, {v0.16b}, v1.16b  // vpshufb %xmm1, %xmm0, %xmm3  # 0 = B
    tbl v11.16b, {v8.16b}, v1.16b
    eor v2.16b, v2.16b, v5.16b  // vpxor %xmm5, %xmm2, %xmm2  # 2 = 2A
    eor v10.16b, v10.16b, v13.16b
    tbl v0.16b, {v0.16b}, v4.16b  // vpshufb %xmm4, %xmm0, %xmm0  # 3 = D
    tbl v8.16b, {v8.16b}, v4.16b
    eor v3.16b, v3.16b, v2.16b  // vpxor %xmm2, %xmm3, %xmm3  # 0 = 2A+B
    eor v11.16b, v11.16b, v10.16b
    tbl v4.16b, {v3.16b}, v1.16b  // vpshufb %xmm1, %xmm3, %xmm4  # 0 = 2B+C
    tbl v12.16b, {v11.16b},v1.16b
    eor v0.16b, v0.16b, v3.16b  // vpxor %xmm3, %xmm0, %xmm0  # 3 = 2A+B+D
    eor v8.16b, v8.16b, v11.16b
    and x11, x11, #~(1<<6)  // and $0x30, %r11  # ... mod 4
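    // Same mod-4 wrap as in _vpaes_encrypt_core. Throughout this loop the
    // second block is carried in v8-v13, mirroring v0-v5 for the first
    // block, so both blocks share a single pass over the key schedule at x9.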
    eor v0.16b, v0.16b, v4.16b  // vpxor %xmm4, %xmm0, %xmm0  # 0 = 2A+3B+C+D
    eor v8.16b, v8.16b, v12.16b
    sub w8, w8, #1  // nr--

.Lenc_2x_entry:
    // top of round
    and v1.16b, v0.16b, v17.16b  // vpand %xmm0, %xmm9, %xmm1  # 0 = k
    ushr v0.16b, v0.16b, #4  // vpsrlb $4, %xmm0, %xmm0  # 1 = i
    and v9.16b, v8.16b, v17.16b
    ushr v8.16b, v8.16b, #4
    tbl v5.16b, {v19.16b},v1.16b  // vpshufb %xmm1, %xmm11, %xmm5  # 2 = a/k
    tbl v13.16b, {v19.16b},v9.16b
    eor v1.16b, v1.16b, v0.16b  // vpxor %xmm0, %xmm1, %xmm1  # 0 = j
    eor v9.16b, v9.16b, v8.16b
    tbl v3.16b, {v18.16b},v0.16b  // vpshufb %xmm0, %xmm10, %xmm3  # 3 = 1/i
    tbl v11.16b, {v18.16b},v8.16b
    tbl v4.16b, {v18.16b},v1.16b  // vpshufb %xmm1, %xmm10, %xmm4  # 4 = 1/j
    tbl v12.16b, {v18.16b},v9.16b
    eor v3.16b, v3.16b, v5.16b  // vpxor %xmm5, %xmm3, %xmm3  # 3 = iak = 1/i + a/k
    eor v11.16b, v11.16b, v13.16b
    eor v4.16b, v4.16b, v5.16b  // vpxor %xmm5, %xmm4, %xmm4  # 4 = jak = 1/j + a/k
    eor v12.16b, v12.16b, v13.16b
    tbl v2.16b, {v18.16b},v3.16b  // vpshufb %xmm3, %xmm10, %xmm2  # 2 = 1/iak
    tbl v10.16b, {v18.16b},v11.16b
    tbl v3.16b, {v18.16b},v4.16b  // vpshufb %xmm4, %xmm10, %xmm3  # 3 = 1/jak
    tbl v11.16b, {v18.16b},v12.16b
    eor v2.16b, v2.16b, v1.16b  // vpxor %xmm1, %xmm2, %xmm2  # 2 = io
    eor v10.16b, v10.16b, v9.16b
    eor v3.16b, v3.16b, v0.16b  // vpxor %xmm0, %xmm3, %xmm3  # 3 = jo
    eor v11.16b, v11.16b, v8.16b
    ld1 {v16.2d}, [x9],#16  // vmovdqu (%r9), %xmm5
    cbnz w8, .Lenc_2x_loop

    // middle of last round
    add x10, x11, #0x80
    // vmovdqa -0x60(%r10), %xmm4  # 3 : sbou .Lk_sbo
    // vmovdqa -0x50(%r10), %xmm0  # 0 : sbot .Lk_sbo+16
    tbl v4.16b, {v22.16b}, v2.16b  // vpshufb %xmm2, %xmm4, %xmm4  # 4 = sbou
    tbl v12.16b, {v22.16b}, v10.16b
    ld1 {v1.2d}, [x10]  // vmovdqa 0x40(%r11,%r10), %xmm1  # .Lk_sr[]
    tbl v0.16b, {v23.16b}, v3.16b  // vpshufb %xmm3, %xmm0, %xmm0  # 0 = sb1t
    tbl v8.16b, {v23.16b}, v11.16b
    eor v4.16b, v4.16b, v16.16b  // vpxor %xmm5, %xmm4, %xmm4  # 4 = sb1u + k
    eor v12.16b, v12.16b, v16.16b
    eor v0.16b, v0.16b, v4.16b  // vpxor %xmm4, %xmm0, %xmm0  # 0 = A
    eor v8.16b, v8.16b, v12.16b
    tbl v0.16b, {v0.16b},v1.16b  // vpshufb %xmm1, %xmm0, %xmm0
    tbl v1.16b, {v8.16b},v1.16b
    ret
.size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x
########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################
.type _vpaes_key_preheat,%function
.align 4
_vpaes_key_preheat:
    adrp x10, .Lk_inv
    add x10, x10, :lo12:.Lk_inv
    movi v16.16b, #0x5b  // .Lk_s63
    adrp x11, .Lk_sb1
    add x11, x11, :lo12:.Lk_sb1
    movi v17.16b, #0x0f  // .Lk_s0F
    ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10]  // .Lk_inv, .Lk_ipt
    adrp x10, .Lk_dksd
    add x10, x10, :lo12:.Lk_dksd
    ld1 {v22.2d,v23.2d}, [x11]  // .Lk_sb1
    adrp x11, .Lk_mc_forward
    add x11, x11, :lo12:.Lk_mc_forward
    ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64  // .Lk_dksd, .Lk_dksb
    ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64  // .Lk_dkse, .Lk_dks9
    ld1 {v8.2d}, [x10]  // .Lk_rcon
    ld1 {v9.2d}, [x11]  // .Lk_mc_forward[0]
    ret
.size _vpaes_key_preheat,.-_vpaes_key_preheat

.type _vpaes_schedule_core,%function
.align 4
_vpaes_schedule_core:
    AARCH64_SIGN_LINK_REGISTER
    stp x29, x30, [sp,#-16]!
    add x29,sp,#0

    bl _vpaes_key_preheat  // load the tables

    ld1 {v0.16b}, [x0],#16  // vmovdqu (%rdi), %xmm0  # load key (unaligned)

    // input transform
    mov v3.16b, v0.16b  // vmovdqa %xmm0, %xmm3
    bl _vpaes_schedule_transform
    mov v7.16b, v0.16b  // vmovdqa %xmm0, %xmm7

    adrp x10, .Lk_sr  // lea .Lk_sr(%rip),%r10
    add x10, x10, :lo12:.Lk_sr

    add x8, x8, x10

    // encrypting, output zeroth round key after transform
    st1 {v0.2d}, [x2]  // vmovdqu %xmm0, (%rdx)

    cmp w1, #192  // cmp $192, %esi
    b.hi .Lschedule_256
    b.eq .Lschedule_192
    // 128: fall through

##
## .schedule_128
##
## 128-bit specific part of key schedule.
##
## This schedule is really simple, because all its parts
## are accomplished by the subroutines.
##
.Lschedule_128:
    mov x0, #10  // mov $10, %esi

.Loop_schedule_128:
    sub x0, x0, #1  // dec %esi
    bl _vpaes_schedule_round
    cbz x0, .Lschedule_mangle_last
    bl _vpaes_schedule_mangle  // write output
    b .Loop_schedule_128

##
## .aes_schedule_192
##
## 192-bit specific part of key schedule.
##
## The main body of this schedule is the same as the 128-bit
## schedule, but with more smearing. The long, high side is
## stored in %xmm7 as before, and the short, low side is in
## the high bits of %xmm6.
##
## This schedule is somewhat nastier, however, because each
## round produces 192 bits of key material, or 1.5 round keys.
## Therefore, on each cycle we do 2 rounds and produce 3 round
## keys.
##
.align 4
.Lschedule_192:
    sub x0, x0, #8
    ld1 {v0.16b}, [x0]  // vmovdqu 8(%rdi),%xmm0  # load key part 2 (very unaligned)
    bl _vpaes_schedule_transform  // input transform
    mov v6.16b, v0.16b  // vmovdqa %xmm0, %xmm6  # save short part
    eor v4.16b, v4.16b, v4.16b  // vpxor %xmm4, %xmm4, %xmm4  # clear 4
    ins v6.d[0], v4.d[0]  // vmovhlps %xmm4, %xmm6, %xmm6  # clobber low side with zeros
    mov x0, #4  // mov $4, %esi

.Loop_schedule_192:
    sub x0, x0, #1  // dec %esi
    bl _vpaes_schedule_round
    ext v0.16b, v6.16b, v0.16b, #8  // vpalignr $8,%xmm6,%xmm0,%xmm0
    bl _vpaes_schedule_mangle  // save key n
    bl _vpaes_schedule_192_smear
    bl _vpaes_schedule_mangle  // save key n+1
    bl _vpaes_schedule_round
    cbz x0, .Lschedule_mangle_last
    bl _vpaes_schedule_mangle  // save key n+2
    bl _vpaes_schedule_192_smear
    b .Loop_schedule_192

##
## .aes_schedule_256
##
## 256-bit specific part of key schedule.
##
## The structure here is very similar to the 128-bit
## schedule, but with an additional "low side" in
## %xmm6. The low side's rounds are the same as the
## high side's, except no rcon and no rotation.
##
.align 4
.Lschedule_256:
    ld1 {v0.16b}, [x0]  // vmovdqu 16(%rdi),%xmm0  # load key part 2 (unaligned)
    bl _vpaes_schedule_transform  // input transform
    mov x0, #7  // mov $7, %esi

.Loop_schedule_256:
    sub x0, x0, #1  // dec %esi
    bl _vpaes_schedule_mangle  // output low result
    mov v6.16b, v0.16b  // vmovdqa %xmm0, %xmm6  # save cur_lo in xmm6

    // high round
    bl _vpaes_schedule_round
    cbz x0, .Lschedule_mangle_last
    bl _vpaes_schedule_mangle

    // low round. swap xmm7 and xmm6
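    // The 256-bit schedule alternates high and low rounds: the running key
    // in v7 is parked in v5 while the low half (v6) takes its place for
    // _vpaes_schedule_low_round, then v7 is restored from v5 afterwards.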
    dup v0.4s, v0.s[3]  // vpshufd $0xFF, %xmm0, %xmm0
    movi v4.16b, #0
    mov v5.16b, v7.16b  // vmovdqa %xmm7, %xmm5
    mov v7.16b, v6.16b  // vmovdqa %xmm6, %xmm7
    bl _vpaes_schedule_low_round
    mov v7.16b, v5.16b  // vmovdqa %xmm5, %xmm7

    b .Loop_schedule_256

##
## .aes_schedule_mangle_last
##
## Mangler for last round of key schedule
## Mangles %xmm0
## when encrypting, outputs out(%xmm0) ^ 63
## when decrypting, outputs unskew(%xmm0)
##
## Always called right before return... jumps to cleanup and exits
##
.align 4
.Lschedule_mangle_last:
    // schedule last round key from xmm0
    adrp x11, .Lk_deskew  // lea .Lk_deskew(%rip),%r11  # prepare to deskew
    add x11, x11, :lo12:.Lk_deskew

    cbnz w3, .Lschedule_mangle_last_dec

    // encrypting
    ld1 {v1.2d}, [x8]  // vmovdqa (%r8,%r10),%xmm1
    adrp x11, .Lk_opt  // lea .Lk_opt(%rip), %r11  # prepare to output transform
    add x11, x11, :lo12:.Lk_opt
    add x2, x2, #32  // add $32, %rdx
    tbl v0.16b, {v0.16b}, v1.16b  // vpshufb %xmm1, %xmm0, %xmm0  # output permute

.Lschedule_mangle_last_dec:
    ld1 {v20.2d,v21.2d}, [x11]  // reload constants
    sub x2, x2, #16  // add $-16, %rdx
    eor v0.16b, v0.16b, v16.16b  // vpxor .Lk_s63(%rip), %xmm0, %xmm0
    bl _vpaes_schedule_transform  // output transform
    st1 {v0.2d}, [x2]  // vmovdqu %xmm0, (%rdx)  # save last key

    // cleanup
    eor v0.16b, v0.16b, v0.16b  // vpxor %xmm0, %xmm0, %xmm0
    eor v1.16b, v1.16b, v1.16b  // vpxor %xmm1, %xmm1, %xmm1
    eor v2.16b, v2.16b, v2.16b  // vpxor %xmm2, %xmm2, %xmm2
    eor v3.16b, v3.16b, v3.16b  // vpxor %xmm3, %xmm3, %xmm3
    eor v4.16b, v4.16b, v4.16b  // vpxor %xmm4, %xmm4, %xmm4
    eor v5.16b, v5.16b, v5.16b  // vpxor %xmm5, %xmm5, %xmm5
    eor v6.16b, v6.16b, v6.16b  // vpxor %xmm6, %xmm6, %xmm6
    eor v7.16b, v7.16b, v7.16b  // vpxor %xmm7, %xmm7, %xmm7
    ldp x29, x30, [sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size _vpaes_schedule_core,.-_vpaes_schedule_core

##
## .aes_schedule_192_smear
##
## Smear the short, low side in the 192-bit key schedule.
##
## Inputs:
## %xmm7: high side, b a x y
## %xmm6: low side, d c 0 0
## %xmm13: 0
##
## Outputs:
## %xmm6: b+c+d b+c 0 0
## %xmm0: b+c+d b+c b a
##
.type _vpaes_schedule_192_smear,%function
.align 4
_vpaes_schedule_192_smear:
    movi v1.16b, #0
    dup v0.4s, v7.s[3]
    ins v1.s[3], v6.s[2]  // vpshufd $0x80, %xmm6, %xmm1  # d c 0 0 -> c 0 0 0
    ins v0.s[0], v7.s[2]  // vpshufd $0xFE, %xmm7, %xmm0  # b a _ _ -> b b b a
    eor v6.16b, v6.16b, v1.16b  // vpxor %xmm1, %xmm6, %xmm6  # -> c+d c 0 0
    eor v1.16b, v1.16b, v1.16b  // vpxor %xmm1, %xmm1, %xmm1
    eor v6.16b, v6.16b, v0.16b  // vpxor %xmm0, %xmm6, %xmm6  # -> b+c+d b+c b a
    mov v0.16b, v6.16b  // vmovdqa %xmm6, %xmm0
    ins v6.d[0], v1.d[0]  // vmovhlps %xmm1, %xmm6, %xmm6  # clobber low side with zeros
    ret
.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear

##
## .aes_schedule_round
##
## Runs one main round of the key schedule on %xmm0, %xmm7
##
## Specifically, runs subbytes on the high dword of %xmm0
## then rotates it by one byte and xors into the low dword of
## %xmm7.
##
## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
## next rcon.
##
## Smears the dwords of %xmm7 by xoring the low into the
## second low, result into third, result into highest.
##
## Returns results in %xmm7 = %xmm0.
## Clobbers %xmm1-%xmm4, %r11.
##
.type _vpaes_schedule_round,%function
.align 4
_vpaes_schedule_round:
    // extract rcon from xmm8
    movi v4.16b, #0  // vpxor %xmm4, %xmm4, %xmm4
    ext v1.16b, v8.16b, v4.16b, #15  // vpalignr $15, %xmm8, %xmm4, %xmm1
    ext v8.16b, v8.16b, v8.16b, #15  // vpalignr $15, %xmm8, %xmm8, %xmm8
    eor v7.16b, v7.16b, v1.16b  // vpxor %xmm1, %xmm7, %xmm7

    // rotate
    dup v0.4s, v0.s[3]  // vpshufd $0xFF, %xmm0, %xmm0
    ext v0.16b, v0.16b, v0.16b, #1  // vpalignr $1, %xmm0, %xmm0, %xmm0

    // fall through...

    // low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
    // smear xmm7
    ext v1.16b, v4.16b, v7.16b, #12  // vpslldq $4, %xmm7, %xmm1
    eor v7.16b, v7.16b, v1.16b  // vpxor %xmm1, %xmm7, %xmm7
    ext v4.16b, v4.16b, v7.16b, #8  // vpslldq $8, %xmm7, %xmm4

    // subbytes
    and v1.16b, v0.16b, v17.16b  // vpand %xmm9, %xmm0, %xmm1  # 0 = k
    ushr v0.16b, v0.16b, #4  // vpsrlb $4, %xmm0, %xmm0  # 1 = i
    eor v7.16b, v7.16b, v4.16b  // vpxor %xmm4, %xmm7, %xmm7
    tbl v2.16b, {v19.16b}, v1.16b  // vpshufb %xmm1, %xmm11, %xmm2  # 2 = a/k
    eor v1.16b, v1.16b, v0.16b  // vpxor %xmm0, %xmm1, %xmm1  # 0 = j
    tbl v3.16b, {v18.16b}, v0.16b  // vpshufb %xmm0, %xmm10, %xmm3  # 3 = 1/i
    eor v3.16b, v3.16b, v2.16b  // vpxor %xmm2, %xmm3, %xmm3  # 3 = iak = 1/i + a/k
    tbl v4.16b, {v18.16b}, v1.16b  // vpshufb %xmm1, %xmm10, %xmm4  # 4 = 1/j
    eor v7.16b, v7.16b, v16.16b  // vpxor .Lk_s63(%rip), %xmm7, %xmm7
    tbl v3.16b, {v18.16b}, v3.16b  // vpshufb %xmm3, %xmm10, %xmm3  # 2 = 1/iak
    eor v4.16b, v4.16b, v2.16b  // vpxor %xmm2, %xmm4, %xmm4  # 4 = jak = 1/j + a/k
    tbl v2.16b, {v18.16b}, v4.16b  // vpshufb %xmm4, %xmm10, %xmm2  # 3 = 1/jak
    eor v3.16b, v3.16b, v1.16b  // vpxor %xmm1, %xmm3, %xmm3  # 2 = io
    eor v2.16b, v2.16b, v0.16b  // vpxor %xmm0, %xmm2, %xmm2  # 3 = jo
    tbl v4.16b, {v23.16b}, v3.16b  // vpshufb %xmm3, %xmm13, %xmm4  # 4 = sbou
    tbl v1.16b, {v22.16b}, v2.16b  // vpshufb %xmm2, %xmm12, %xmm1  # 0 = sb1t
    eor v1.16b, v1.16b, v4.16b  // vpxor %xmm4, %xmm1, %xmm1  # 0 = sbox output

    // add in smeared stuff
    eor v0.16b, v1.16b, v7.16b  // vpxor %xmm7, %xmm1, %xmm0
    eor v7.16b, v1.16b, v7.16b  // vmovdqa %xmm0, %xmm7
    ret
.size _vpaes_schedule_round,.-_vpaes_schedule_round

##
## .aes_schedule_transform
##
## Linear-transform %xmm0 according to tables at (%r11)
##
## Requires that %xmm9 = 0x0F0F... as in preheat
## Output in %xmm0
## Clobbers %xmm1, %xmm2
##
.type _vpaes_schedule_transform,%function
.align 4
_vpaes_schedule_transform:
    and v1.16b, v0.16b, v17.16b  // vpand %xmm9, %xmm0, %xmm1
    ushr v0.16b, v0.16b, #4  // vpsrlb $4, %xmm0, %xmm0
    // vmovdqa (%r11), %xmm2  # lo
    tbl v2.16b, {v20.16b}, v1.16b  // vpshufb %xmm1, %xmm2, %xmm2
    // vmovdqa 16(%r11), %xmm1  # hi
    tbl v0.16b, {v21.16b}, v0.16b  // vpshufb %xmm0, %xmm1, %xmm0
    eor v0.16b, v0.16b, v2.16b  // vpxor %xmm2, %xmm0, %xmm0
    ret
.size _vpaes_schedule_transform,.-_vpaes_schedule_transform

##
## .aes_schedule_mangle
##
## Mangle xmm0 from (basis-transformed) standard version
## to our version.
##
## On encrypt,
## xor with 0x63
## multiply by circulant 0,1,1,1
## apply shiftrows transform
##
## On decrypt,
## xor with 0x63
## multiply by "inverse mixcolumns" circulant E,B,D,9
## deskew
## apply shiftrows transform
##
##
## Writes out to (%rdx), and increments or decrements it
## Keeps track of round number mod 4 in %r8
## Preserves xmm0
## Clobbers xmm1-xmm5
##
.type _vpaes_schedule_mangle,%function
.align 4
_vpaes_schedule_mangle:
    mov v4.16b, v0.16b  // vmovdqa %xmm0, %xmm4  # save xmm0 for later
    // vmovdqa .Lk_mc_forward(%rip),%xmm5

    // encrypting
    eor v4.16b, v0.16b, v16.16b  // vpxor .Lk_s63(%rip), %xmm0, %xmm4
    add x2, x2, #16  // add $16, %rdx
    tbl v4.16b, {v4.16b}, v9.16b  // vpshufb %xmm5, %xmm4, %xmm4
    tbl v1.16b, {v4.16b}, v9.16b  // vpshufb %xmm5, %xmm4, %xmm1
    tbl v3.16b, {v1.16b}, v9.16b  // vpshufb %xmm5, %xmm1, %xmm3
    eor v4.16b, v4.16b, v1.16b  // vpxor %xmm1, %xmm4, %xmm4
    ld1 {v1.2d}, [x8]  // vmovdqa (%r8,%r10), %xmm1
    eor v3.16b, v3.16b, v4.16b  // vpxor %xmm4, %xmm3, %xmm3

.Lschedule_mangle_both:
    tbl v3.16b, {v3.16b}, v1.16b  // vpshufb %xmm1, %xmm3, %xmm3
    add x8, x8, #64-16  // add $-16, %r8
    and x8, x8, #~(1<<6)  // and $0x30, %r8
    st1 {v3.2d}, [x2]  // vmovdqu %xmm3, (%rdx)
    ret
.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle

.globl GFp_vpaes_set_encrypt_key
.hidden GFp_vpaes_set_encrypt_key
.type GFp_vpaes_set_encrypt_key,%function
.align 4
GFp_vpaes_set_encrypt_key:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0
    stp d8,d9,[sp,#-16]!  // ABI spec says so

    lsr w9, w1, #5  // shr $5,%eax
    add w9, w9, #5  // $5,%eax
    str w9, [x2,#240]  // mov %eax,240(%rdx)  # AES_KEY->rounds = nbits/32+5;

    mov w3, #0  // mov $0,%ecx
    mov x8, #0x30  // mov $0x30,%r8d
    bl _vpaes_schedule_core
    eor x0, x0, x0

    ldp d8,d9,[sp],#16
    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size GFp_vpaes_set_encrypt_key,.-GFp_vpaes_set_encrypt_key
.globl GFp_vpaes_ctr32_encrypt_blocks
.hidden GFp_vpaes_ctr32_encrypt_blocks
.type GFp_vpaes_ctr32_encrypt_blocks,%function
.align 4
GFp_vpaes_ctr32_encrypt_blocks:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0
    stp d8,d9,[sp,#-16]!  // ABI spec says so
    stp d10,d11,[sp,#-16]!
    stp d12,d13,[sp,#-16]!
    stp d14,d15,[sp,#-16]!

    cbz x2, .Lctr32_done

    // Note, unlike the other functions, x2 here is measured in blocks,
    // not bytes.
    mov x17, x2
    mov x2, x3

    // Load the IV and counter portion.
    ldr w6, [x4, #12]
    ld1 {v7.16b}, [x4]

    bl _vpaes_encrypt_preheat
    tst x17, #1
    rev w6, w6  // The counter is big-endian.
    b.eq .Lctr32_prep_loop

    // Handle one block so the remaining block count is even for
    // _vpaes_encrypt_2x.
    ld1 {v6.16b}, [x0], #16  // Load input ahead of time
    bl _vpaes_encrypt_core
    eor v0.16b, v0.16b, v6.16b  // XOR input and result
    st1 {v0.16b}, [x1], #16
    subs x17, x17, #1
    // Update the counter.
    add w6, w6, #1
    rev w7, w6
    mov v7.s[3], w7
    b.ls .Lctr32_done

.Lctr32_prep_loop:
    // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x
    // uses v14 and v15.
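    // v14 keeps the current counter block and v15 gets counter+1 in its
    // (big-endian) lane 3, so each pass of .Lctr32_loop below encrypts two
    // consecutive counter values.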
    mov v15.16b, v7.16b
    mov v14.16b, v7.16b
    add w6, w6, #1
    rev w7, w6
    mov v15.s[3], w7

.Lctr32_loop:
    ld1 {v6.16b,v7.16b}, [x0], #32  // Load input ahead of time
    bl _vpaes_encrypt_2x
    eor v0.16b, v0.16b, v6.16b  // XOR input and result
    eor v1.16b, v1.16b, v7.16b  // XOR input and result (#2)
    st1 {v0.16b,v1.16b}, [x1], #32
    subs x17, x17, #2
    // Update the counter.
    add w7, w6, #1
    add w6, w6, #2
    rev w7, w7
    mov v14.s[3], w7
    rev w7, w6
    mov v15.s[3], w7
    b.hi .Lctr32_loop

.Lctr32_done:
    ldp d14,d15,[sp],#16
    ldp d12,d13,[sp],#16
    ldp d10,d11,[sp],#16
    ldp d8,d9,[sp],#16
    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size GFp_vpaes_ctr32_encrypt_blocks,.-GFp_vpaes_ctr32_encrypt_blocks
#endif
#endif  // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits