1#! /usr/bin/env perl 2# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9 10###################################################################### 11## Constant-time SSSE3 AES core implementation. 12## version 0.1 13## 14## By Mike Hamburg (Stanford University), 2009 15## Public domain. 16## 17## For details see http://shiftleft.org/papers/vector_aes/ and 18## http://crypto.stanford.edu/vpaes/. 19## 20###################################################################### 21# ARMv8 NEON adaptation by <appro@openssl.org> 22# 23# Reason for undertaken effort is that there is at least one popular 24# SoC based on Cortex-A53 that doesn't have crypto extensions. 25# 26# CBC enc ECB enc/dec(*) [bit-sliced enc/dec] 27# Cortex-A53 21.5 18.1/20.6 [17.5/19.8 ] 28# Cortex-A57 36.0(**) 20.4/24.9(**) [14.4/16.6 ] 29# X-Gene 45.9(**) 45.8/57.7(**) [33.1/37.6(**) ] 30# Denver(***) 16.6(**) 15.1/17.8(**) [8.80/9.93 ] 31# Apple A7(***) 22.7(**) 10.9/14.3 [8.45/10.0 ] 32# Mongoose(***) 26.3(**) 21.0/25.0(**) [13.3/16.8 ] 33# 34# (*) ECB denotes approximate result for parallelizable modes 35# such as CBC decrypt, CTR, etc.; 36# (**) these results are worse than scalar compiler-generated 37# code, but it's constant-time and therefore preferred; 38# (***) presented for reference/comparison purposes; 39 40$flavour = shift; 41while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} 42 43$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 44( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or 45( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or 46die "can't locate arm-xlate.pl"; 47 48open OUT,"| \"$^X\" $xlate $flavour $output"; 49*STDOUT=*OUT; 50 51$code.=<<___; 52.section .rodata 53 54.type _vpaes_consts,%object 55.align 7 // totally strategic alignment 56_vpaes_consts: 57.Lk_mc_forward: // mc_forward 58 .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 59 .quad 0x080B0A0904070605, 0x000302010C0F0E0D 60 .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 61 .quad 0x000302010C0F0E0D, 0x080B0A0904070605 62.Lk_mc_backward:// mc_backward 63 .quad 0x0605040702010003, 0x0E0D0C0F0A09080B 64 .quad 0x020100030E0D0C0F, 0x0A09080B06050407 65 .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 66 .quad 0x0A09080B06050407, 0x020100030E0D0C0F 67.Lk_sr: // sr 68 .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 69 .quad 0x030E09040F0A0500, 0x0B06010C07020D08 70 .quad 0x0F060D040B020900, 0x070E050C030A0108 71 .quad 0x0B0E0104070A0D00, 0x0306090C0F020508 72 73// 74// "Hot" constants 75// 76.Lk_inv: // inv, inva 77 .quad 0x0E05060F0D080180, 0x040703090A0B0C02 78 .quad 0x01040A060F0B0780, 0x030D0E0C02050809 79.Lk_ipt: // input transform (lo, hi) 80 .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 81 .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 82.Lk_sbo: // sbou, sbot 83 .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 84 .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA 85.Lk_sb1: // sb1u, sb1t 86 .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF 87 .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 88.Lk_sb2: // sb2u, sb2t 89 .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A 90 .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD 91 92// 93// Decryption stuff 94// 95.Lk_dipt: // decryption input transform 96 .quad 0x0F505B040B545F00, 0x154A411E114E451A 97 .quad 0x86E383E660056500, 0x12771772F491F194 98.Lk_dsbo: // decryption sbox final output 99 .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D 100 .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C 101.Lk_dsb9: // decryption sbox output *9*u, *9*t 102 .quad 0x851C03539A86D600, 0xCAD51F504F994CC9 103 .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 104.Lk_dsbd: // decryption sbox output *D*u, *D*t 105 .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 106 .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 107.Lk_dsbb: // decryption sbox output *B*u, *B*t 108 .quad 0xD022649296B44200, 0x602646F6B0F2D404 109 .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B 110.Lk_dsbe: // decryption sbox output *E*u, *E*t 111 .quad 0x46F2929626D4D000, 0x2242600464B4F6B0 112 .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 113 114// 115// Key schedule constants 116// 117.Lk_dksd: // decryption key schedule: invskew x*D 118 .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 119 .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E 120.Lk_dksb: // decryption key schedule: invskew x*B 121 .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 122 .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 123.Lk_dkse: // decryption key schedule: invskew x*E + 0x63 124 .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 125 .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 126.Lk_dks9: // decryption key schedule: invskew x*9 127 .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC 128 .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE 129 130.Lk_rcon: // rcon 131 .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 132 133.Lk_opt: // output transform 134 .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 135 .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 136.Lk_deskew: // deskew tables: inverts the sbox's "skew" 137 .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A 138 .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 139 140.asciz "Vector Permutation AES for ARMv8, Mike Hamburg (Stanford University)" 141.size _vpaes_consts,.-_vpaes_consts 142.align 6 143 144.text 145___ 146 147{ 148my ($inp,$out,$key) = map("x$_",(0..2)); 149 150my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_.16b",(18..23)); 151my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27)); 152my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_.16b",(24..31)); 153 154$code.=<<___; 155## 156## _aes_preheat 157## 158## Fills register %r10 -> .aes_consts (so you can -fPIC) 159## and %xmm9-%xmm15 as specified below. 160## 161.type _vpaes_encrypt_preheat,%function 162.align 4 163_vpaes_encrypt_preheat: 164 adrp x10, :pg_hi21:.Lk_inv 165 add x10, x10, :lo12:.Lk_inv 166 movi v17.16b, #0x0f 167 ld1 {v18.2d-v19.2d}, [x10],#32 // .Lk_inv 168 ld1 {v20.2d-v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo 169 ld1 {v24.2d-v27.2d}, [x10] // .Lk_sb1, .Lk_sb2 170 ret 171.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat 172 173## 174## _aes_encrypt_core 175## 176## AES-encrypt %xmm0. 177## 178## Inputs: 179## %xmm0 = input 180## %xmm9-%xmm15 as in _vpaes_preheat 181## (%rdx) = scheduled keys 182## 183## Output in %xmm0 184## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax 185## Preserves %xmm6 - %xmm8 so you get some local vectors 186## 187## 188.type _vpaes_encrypt_core,%function 189.align 4 190_vpaes_encrypt_core: 191 mov x9, $key 192 ldr w8, [$key,#240] // pull rounds 193 adrp x11, :pg_hi21:.Lk_mc_forward+16 194 add x11, x11, :lo12:.Lk_mc_forward+16 195 // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo 196 ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key 197 and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 198 ushr v0.16b, v7.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 199 tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 200 // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi 201 tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 202 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 203 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 204 b .Lenc_entry 205 206.align 4 207.Lenc_loop: 208 // middle of middle round 209 add x10, x11, #0x40 210 tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u 211 ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] 212 tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t 213 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k 214 tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u 215 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A 216 tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t 217 ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] 218 tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B 219 eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A 220 tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D 221 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B 222 tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C 223 eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D 224 and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... mod 4 225 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D 226 sub w8, w8, #1 // nr-- 227 228.Lenc_entry: 229 // top of round 230 and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k 231 ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i 232 tbl v5.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k 233 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j 234 tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i 235 tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j 236 eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k 237 eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k 238 tbl v2.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak 239 tbl v3.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak 240 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io 241 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo 242 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 243 cbnz w8, .Lenc_loop 244 245 // middle of last round 246 add x10, x11, #0x80 247 // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo 248 // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 249 tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou 250 ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] 251 tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t 252 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k 253 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A 254 tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 255 ret 256.size _vpaes_encrypt_core,.-_vpaes_encrypt_core 257 258.globl vpaes_encrypt 259.type vpaes_encrypt,%function 260.align 4 261vpaes_encrypt: 262 stp x29,x30,[sp,#-16]! 263 add x29,sp,#0 264 265 ld1 {v7.16b}, [$inp] 266 bl _vpaes_encrypt_preheat 267 bl _vpaes_encrypt_core 268 st1 {v0.16b}, [$out] 269 270 ldp x29,x30,[sp],#16 271 ret 272.size vpaes_encrypt,.-vpaes_encrypt 273 274.type _vpaes_encrypt_2x,%function 275.align 4 276_vpaes_encrypt_2x: 277 mov x9, $key 278 ldr w8, [$key,#240] // pull rounds 279 adrp x11, :pg_hi21:.Lk_mc_forward+16 280 add x11, x11, :lo12:.Lk_mc_forward+16 281 // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo 282 ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key 283 and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 284 ushr v0.16b, v14.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 285 and v9.16b, v15.16b, v17.16b 286 ushr v8.16b, v15.16b, #4 287 tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 288 tbl v9.16b, {$iptlo}, v9.16b 289 // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi 290 tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 291 tbl v10.16b, {$ipthi}, v8.16b 292 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 293 eor v8.16b, v9.16b, v16.16b 294 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 295 eor v8.16b, v8.16b, v10.16b 296 b .Lenc_2x_entry 297 298.align 4 299.Lenc_2x_loop: 300 // middle of middle round 301 add x10, x11, #0x40 302 tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u 303 tbl v12.16b, {$sb1t}, v10.16b 304 ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] 305 tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t 306 tbl v8.16b, {$sb1u}, v11.16b 307 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k 308 eor v12.16b, v12.16b, v16.16b 309 tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u 310 tbl v13.16b, {$sb2t}, v10.16b 311 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A 312 eor v8.16b, v8.16b, v12.16b 313 tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t 314 tbl v10.16b, {$sb2u}, v11.16b 315 ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] 316 tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B 317 tbl v11.16b, {v8.16b}, v1.16b 318 eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A 319 eor v10.16b, v10.16b, v13.16b 320 tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D 321 tbl v8.16b, {v8.16b}, v4.16b 322 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B 323 eor v11.16b, v11.16b, v10.16b 324 tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C 325 tbl v12.16b, {v11.16b},v1.16b 326 eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D 327 eor v8.16b, v8.16b, v11.16b 328 and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... mod 4 329 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D 330 eor v8.16b, v8.16b, v12.16b 331 sub w8, w8, #1 // nr-- 332 333.Lenc_2x_entry: 334 // top of round 335 and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k 336 ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i 337 and v9.16b, v8.16b, v17.16b 338 ushr v8.16b, v8.16b, #4 339 tbl v5.16b, {$invhi},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k 340 tbl v13.16b, {$invhi},v9.16b 341 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j 342 eor v9.16b, v9.16b, v8.16b 343 tbl v3.16b, {$invlo},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i 344 tbl v11.16b, {$invlo},v8.16b 345 tbl v4.16b, {$invlo},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j 346 tbl v12.16b, {$invlo},v9.16b 347 eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k 348 eor v11.16b, v11.16b, v13.16b 349 eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k 350 eor v12.16b, v12.16b, v13.16b 351 tbl v2.16b, {$invlo},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak 352 tbl v10.16b, {$invlo},v11.16b 353 tbl v3.16b, {$invlo},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak 354 tbl v11.16b, {$invlo},v12.16b 355 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io 356 eor v10.16b, v10.16b, v9.16b 357 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo 358 eor v11.16b, v11.16b, v8.16b 359 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 360 cbnz w8, .Lenc_2x_loop 361 362 // middle of last round 363 add x10, x11, #0x80 364 // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo 365 // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 366 tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou 367 tbl v12.16b, {$sbou}, v10.16b 368 ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] 369 tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t 370 tbl v8.16b, {$sbot}, v11.16b 371 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k 372 eor v12.16b, v12.16b, v16.16b 373 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A 374 eor v8.16b, v8.16b, v12.16b 375 tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 376 tbl v1.16b, {v8.16b},v1.16b 377 ret 378.size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x 379 380.type _vpaes_decrypt_preheat,%function 381.align 4 382_vpaes_decrypt_preheat: 383 adrp x10, :pg_hi21:.Lk_inv 384 add x10, x10, :lo12:.Lk_inv 385 movi v17.16b, #0x0f 386 adrp x11, :pg_hi21:.Lk_dipt 387 add x11, x11, :lo12:.Lk_dipt 388 ld1 {v18.2d-v19.2d}, [x10],#32 // .Lk_inv 389 ld1 {v20.2d-v23.2d}, [x11],#64 // .Lk_dipt, .Lk_dsbo 390 ld1 {v24.2d-v27.2d}, [x11],#64 // .Lk_dsb9, .Lk_dsbd 391 ld1 {v28.2d-v31.2d}, [x11] // .Lk_dsbb, .Lk_dsbe 392 ret 393.size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat 394 395## 396## Decryption core 397## 398## Same API as encryption core. 399## 400.type _vpaes_decrypt_core,%function 401.align 4 402_vpaes_decrypt_core: 403 mov x9, $key 404 ldr w8, [$key,#240] // pull rounds 405 406 // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo 407 lsl x11, x8, #4 // mov %rax, %r11; shl \$4, %r11 408 eor x11, x11, #0x30 // xor \$0x30, %r11 409 adrp x10, :pg_hi21:.Lk_sr 410 add x10, x10, :lo12:.Lk_sr 411 and x11, x11, #0x30 // and \$0x30, %r11 412 add x11, x11, x10 413 adrp x10, :pg_hi21:.Lk_mc_forward+48 414 add x10, x10, :lo12:.Lk_mc_forward+48 415 416 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key 417 and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 418 ushr v0.16b, v7.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 419 tbl v2.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 420 ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 421 // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi 422 tbl v0.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 423 eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 424 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 425 b .Ldec_entry 426 427.align 4 428.Ldec_loop: 429// 430// Inverse mix columns 431// 432 // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u 433 // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t 434 tbl v4.16b, {$sb9u}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u 435 tbl v1.16b, {$sb9t}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t 436 eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 437 // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu 438 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 439 // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt 440 441 tbl v4.16b, {$sbdu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu 442 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch 443 tbl v1.16b, {$sbdt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt 444 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 445 // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu 446 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 447 // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt 448 449 tbl v4.16b, {$sbbu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu 450 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch 451 tbl v1.16b, {$sbbt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt 452 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 453 // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu 454 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 455 // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet 456 457 tbl v4.16b, {$sbeu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu 458 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch 459 tbl v1.16b, {$sbet}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet 460 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 461 ext v5.16b, v5.16b, v5.16b, #12 // vpalignr \$12, %xmm5, %xmm5, %xmm5 462 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 463 sub w8, w8, #1 // sub \$1,%rax # nr-- 464 465.Ldec_entry: 466 // top of round 467 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k 468 ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i 469 tbl v2.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k 470 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j 471 tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i 472 tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j 473 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k 474 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k 475 tbl v2.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak 476 tbl v3.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak 477 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io 478 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo 479 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 480 cbnz w8, .Ldec_loop 481 482 // middle of last round 483 // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou 484 tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou 485 // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot 486 ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 487 tbl v1.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t 488 eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k 489 eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A 490 tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0 491 ret 492.size _vpaes_decrypt_core,.-_vpaes_decrypt_core 493 494.globl vpaes_decrypt 495.type vpaes_decrypt,%function 496.align 4 497vpaes_decrypt: 498 stp x29,x30,[sp,#-16]! 499 add x29,sp,#0 500 501 ld1 {v7.16b}, [$inp] 502 bl _vpaes_decrypt_preheat 503 bl _vpaes_decrypt_core 504 st1 {v0.16b}, [$out] 505 506 ldp x29,x30,[sp],#16 507 ret 508.size vpaes_decrypt,.-vpaes_decrypt 509 510// v14-v15 input, v0-v1 output 511.type _vpaes_decrypt_2x,%function 512.align 4 513_vpaes_decrypt_2x: 514 mov x9, $key 515 ldr w8, [$key,#240] // pull rounds 516 517 // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo 518 lsl x11, x8, #4 // mov %rax, %r11; shl \$4, %r11 519 eor x11, x11, #0x30 // xor \$0x30, %r11 520 adrp x10, :pg_hi21:.Lk_sr 521 add x10, x10, :lo12:.Lk_sr 522 and x11, x11, #0x30 // and \$0x30, %r11 523 add x11, x11, x10 524 adrp x10, :pg_hi21:.Lk_mc_forward+48 525 add x10, x10, :lo12:.Lk_mc_forward+48 526 527 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key 528 and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 529 ushr v0.16b, v14.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 530 and v9.16b, v15.16b, v17.16b 531 ushr v8.16b, v15.16b, #4 532 tbl v2.16b, {$iptlo},v1.16b // vpshufb %xmm1, %xmm2, %xmm2 533 tbl v10.16b, {$iptlo},v9.16b 534 ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 535 // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi 536 tbl v0.16b, {$ipthi},v0.16b // vpshufb %xmm0, %xmm1, %xmm0 537 tbl v8.16b, {$ipthi},v8.16b 538 eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 539 eor v10.16b, v10.16b, v16.16b 540 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 541 eor v8.16b, v8.16b, v10.16b 542 b .Ldec_2x_entry 543 544.align 4 545.Ldec_2x_loop: 546// 547// Inverse mix columns 548// 549 // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u 550 // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t 551 tbl v4.16b, {$sb9u}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u 552 tbl v12.16b, {$sb9u}, v10.16b 553 tbl v1.16b, {$sb9t}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t 554 tbl v9.16b, {$sb9t}, v11.16b 555 eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 556 eor v8.16b, v12.16b, v16.16b 557 // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu 558 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 559 eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 560 // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt 561 562 tbl v4.16b, {$sbdu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu 563 tbl v12.16b, {$sbdu}, v10.16b 564 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch 565 tbl v8.16b, {v8.16b},v5.16b 566 tbl v1.16b, {$sbdt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt 567 tbl v9.16b, {$sbdt}, v11.16b 568 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 569 eor v8.16b, v8.16b, v12.16b 570 // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu 571 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 572 eor v8.16b, v8.16b, v9.16b 573 // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt 574 575 tbl v4.16b, {$sbbu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu 576 tbl v12.16b, {$sbbu}, v10.16b 577 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch 578 tbl v8.16b, {v8.16b},v5.16b 579 tbl v1.16b, {$sbbt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt 580 tbl v9.16b, {$sbbt}, v11.16b 581 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 582 eor v8.16b, v8.16b, v12.16b 583 // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu 584 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 585 eor v8.16b, v8.16b, v9.16b 586 // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet 587 588 tbl v4.16b, {$sbeu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu 589 tbl v12.16b, {$sbeu}, v10.16b 590 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch 591 tbl v8.16b, {v8.16b},v5.16b 592 tbl v1.16b, {$sbet}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet 593 tbl v9.16b, {$sbet}, v11.16b 594 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 595 eor v8.16b, v8.16b, v12.16b 596 ext v5.16b, v5.16b, v5.16b, #12 // vpalignr \$12, %xmm5, %xmm5, %xmm5 597 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 598 eor v8.16b, v8.16b, v9.16b 599 sub w8, w8, #1 // sub \$1,%rax # nr-- 600 601.Ldec_2x_entry: 602 // top of round 603 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k 604 ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i 605 and v9.16b, v8.16b, v17.16b 606 ushr v8.16b, v8.16b, #4 607 tbl v2.16b, {$invhi},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k 608 tbl v10.16b, {$invhi},v9.16b 609 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j 610 eor v9.16b, v9.16b, v8.16b 611 tbl v3.16b, {$invlo},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i 612 tbl v11.16b, {$invlo},v8.16b 613 tbl v4.16b, {$invlo},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j 614 tbl v12.16b, {$invlo},v9.16b 615 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k 616 eor v11.16b, v11.16b, v10.16b 617 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k 618 eor v12.16b, v12.16b, v10.16b 619 tbl v2.16b, {$invlo},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak 620 tbl v10.16b, {$invlo},v11.16b 621 tbl v3.16b, {$invlo},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak 622 tbl v11.16b, {$invlo},v12.16b 623 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io 624 eor v10.16b, v10.16b, v9.16b 625 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo 626 eor v11.16b, v11.16b, v8.16b 627 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 628 cbnz w8, .Ldec_2x_loop 629 630 // middle of last round 631 // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou 632 tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou 633 tbl v12.16b, {$sbou}, v10.16b 634 // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot 635 tbl v1.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t 636 tbl v9.16b, {$sbot}, v11.16b 637 ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 638 eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k 639 eor v12.16b, v12.16b, v16.16b 640 eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A 641 eor v8.16b, v9.16b, v12.16b 642 tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0 643 tbl v1.16b, {v8.16b},v2.16b 644 ret 645.size _vpaes_decrypt_2x,.-_vpaes_decrypt_2x 646___ 647} 648{ 649my ($inp,$bits,$out,$dir)=("x0","w1","x2","w3"); 650my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8)); 651 652$code.=<<___; 653######################################################## 654## ## 655## AES key schedule ## 656## ## 657######################################################## 658.type _vpaes_key_preheat,%function 659.align 4 660_vpaes_key_preheat: 661 adrp x10, :pg_hi21:.Lk_inv 662 add x10, x10, :lo12:.Lk_inv 663 movi v16.16b, #0x5b // .Lk_s63 664 adrp x11, :pg_hi21:.Lk_sb1 665 add x11, x11, :lo12:.Lk_sb1 666 movi v17.16b, #0x0f // .Lk_s0F 667 ld1 {v18.2d-v21.2d}, [x10] // .Lk_inv, .Lk_ipt 668 adrp x10, :pg_hi21:.Lk_dksd 669 add x10, x10, :lo12:.Lk_dksd 670 ld1 {v22.2d-v23.2d}, [x11] // .Lk_sb1 671 adrp x11, :pg_hi21:.Lk_mc_forward 672 add x11, x11, :lo12:.Lk_mc_forward 673 ld1 {v24.2d-v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb 674 ld1 {v28.2d-v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9 675 ld1 {v8.2d}, [x10] // .Lk_rcon 676 ld1 {v9.2d}, [x11] // .Lk_mc_forward[0] 677 ret 678.size _vpaes_key_preheat,.-_vpaes_key_preheat 679 680.type _vpaes_schedule_core,%function 681.align 4 682_vpaes_schedule_core: 683 stp x29, x30, [sp,#-16]! 684 add x29,sp,#0 685 686 bl _vpaes_key_preheat // load the tables 687 688 ld1 {v0.16b}, [$inp],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) 689 690 // input transform 691 mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3 692 bl _vpaes_schedule_transform 693 mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 694 695 adrp x10, :pg_hi21:.Lk_sr // lea .Lk_sr(%rip),%r10 696 add x10, x10, :lo12:.Lk_sr 697 698 add x8, x8, x10 699 cbnz $dir, .Lschedule_am_decrypting 700 701 // encrypting, output zeroth round key after transform 702 st1 {v0.2d}, [$out] // vmovdqu %xmm0, (%rdx) 703 b .Lschedule_go 704 705.Lschedule_am_decrypting: 706 // decrypting, output zeroth round key after shiftrows 707 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 708 tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 709 st1 {v3.2d}, [$out] // vmovdqu %xmm3, (%rdx) 710 eor x8, x8, #0x30 // xor \$0x30, %r8 711 712.Lschedule_go: 713 cmp $bits, #192 // cmp \$192, %esi 714 b.hi .Lschedule_256 715 b.eq .Lschedule_192 716 // 128: fall though 717 718## 719## .schedule_128 720## 721## 128-bit specific part of key schedule. 722## 723## This schedule is really simple, because all its parts 724## are accomplished by the subroutines. 725## 726.Lschedule_128: 727 mov $inp, #10 // mov \$10, %esi 728 729.Loop_schedule_128: 730 sub $inp, $inp, #1 // dec %esi 731 bl _vpaes_schedule_round 732 cbz $inp, .Lschedule_mangle_last 733 bl _vpaes_schedule_mangle // write output 734 b .Loop_schedule_128 735 736## 737## .aes_schedule_192 738## 739## 192-bit specific part of key schedule. 740## 741## The main body of this schedule is the same as the 128-bit 742## schedule, but with more smearing. The long, high side is 743## stored in %xmm7 as before, and the short, low side is in 744## the high bits of %xmm6. 745## 746## This schedule is somewhat nastier, however, because each 747## round produces 192 bits of key material, or 1.5 round keys. 748## Therefore, on each cycle we do 2 rounds and produce 3 round 749## keys. 750## 751.align 4 752.Lschedule_192: 753 sub $inp, $inp, #8 754 ld1 {v0.16b}, [$inp] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) 755 bl _vpaes_schedule_transform // input transform 756 mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part 757 eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 758 ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros 759 mov $inp, #4 // mov \$4, %esi 760 761.Loop_schedule_192: 762 sub $inp, $inp, #1 // dec %esi 763 bl _vpaes_schedule_round 764 ext v0.16b, v6.16b, v0.16b, #8 // vpalignr \$8,%xmm6,%xmm0,%xmm0 765 bl _vpaes_schedule_mangle // save key n 766 bl _vpaes_schedule_192_smear 767 bl _vpaes_schedule_mangle // save key n+1 768 bl _vpaes_schedule_round 769 cbz $inp, .Lschedule_mangle_last 770 bl _vpaes_schedule_mangle // save key n+2 771 bl _vpaes_schedule_192_smear 772 b .Loop_schedule_192 773 774## 775## .aes_schedule_256 776## 777## 256-bit specific part of key schedule. 778## 779## The structure here is very similar to the 128-bit 780## schedule, but with an additional "low side" in 781## %xmm6. The low side's rounds are the same as the 782## high side's, except no rcon and no rotation. 783## 784.align 4 785.Lschedule_256: 786 ld1 {v0.16b}, [$inp] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) 787 bl _vpaes_schedule_transform // input transform 788 mov $inp, #7 // mov \$7, %esi 789 790.Loop_schedule_256: 791 sub $inp, $inp, #1 // dec %esi 792 bl _vpaes_schedule_mangle // output low result 793 mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 794 795 // high round 796 bl _vpaes_schedule_round 797 cbz $inp, .Lschedule_mangle_last 798 bl _vpaes_schedule_mangle 799 800 // low round. swap xmm7 and xmm6 801 dup v0.4s, v0.s[3] // vpshufd \$0xFF, %xmm0, %xmm0 802 movi v4.16b, #0 803 mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 804 mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 805 bl _vpaes_schedule_low_round 806 mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 807 808 b .Loop_schedule_256 809 810## 811## .aes_schedule_mangle_last 812## 813## Mangler for last round of key schedule 814## Mangles %xmm0 815## when encrypting, outputs out(%xmm0) ^ 63 816## when decrypting, outputs unskew(%xmm0) 817## 818## Always called right before return... jumps to cleanup and exits 819## 820.align 4 821.Lschedule_mangle_last: 822 // schedule last round key from xmm0 823 adrp x11, :pg_hi21:.Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew 824 add x11, x11, :lo12:.Lk_deskew 825 826 cbnz $dir, .Lschedule_mangle_last_dec 827 828 // encrypting 829 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 830 adrp x11, :pg_hi21:.Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform 831 add x11, x11, :lo12:.Lk_opt 832 add $out, $out, #32 // add \$32, %rdx 833 tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute 834 835.Lschedule_mangle_last_dec: 836 ld1 {v20.2d-v21.2d}, [x11] // reload constants 837 sub $out, $out, #16 // add \$-16, %rdx 838 eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0 839 bl _vpaes_schedule_transform // output transform 840 st1 {v0.2d}, [$out] // vmovdqu %xmm0, (%rdx) # save last key 841 842 // cleanup 843 eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 844 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 845 eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2 846 eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 847 eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 848 eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 849 eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 850 eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 851 ldp x29, x30, [sp],#16 852 ret 853.size _vpaes_schedule_core,.-_vpaes_schedule_core 854 855## 856## .aes_schedule_192_smear 857## 858## Smear the short, low side in the 192-bit key schedule. 859## 860## Inputs: 861## %xmm7: high side, b a x y 862## %xmm6: low side, d c 0 0 863## %xmm13: 0 864## 865## Outputs: 866## %xmm6: b+c+d b+c 0 0 867## %xmm0: b+c+d b+c b a 868## 869.type _vpaes_schedule_192_smear,%function 870.align 4 871_vpaes_schedule_192_smear: 872 movi v1.16b, #0 873 dup v0.4s, v7.s[3] 874 ins v1.s[3], v6.s[2] // vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 875 ins v0.s[0], v7.s[2] // vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a 876 eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 877 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 878 eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a 879 mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 880 ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros 881 ret 882.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear 883 884## 885## .aes_schedule_round 886## 887## Runs one main round of the key schedule on %xmm0, %xmm7 888## 889## Specifically, runs subbytes on the high dword of %xmm0 890## then rotates it by one byte and xors into the low dword of 891## %xmm7. 892## 893## Adds rcon from low byte of %xmm8, then rotates %xmm8 for 894## next rcon. 895## 896## Smears the dwords of %xmm7 by xoring the low into the 897## second low, result into third, result into highest. 898## 899## Returns results in %xmm7 = %xmm0. 900## Clobbers %xmm1-%xmm4, %r11. 901## 902.type _vpaes_schedule_round,%function 903.align 4 904_vpaes_schedule_round: 905 // extract rcon from xmm8 906 movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 907 ext v1.16b, $rcon, v4.16b, #15 // vpalignr \$15, %xmm8, %xmm4, %xmm1 908 ext $rcon, $rcon, $rcon, #15 // vpalignr \$15, %xmm8, %xmm8, %xmm8 909 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 910 911 // rotate 912 dup v0.4s, v0.s[3] // vpshufd \$0xFF, %xmm0, %xmm0 913 ext v0.16b, v0.16b, v0.16b, #1 // vpalignr \$1, %xmm0, %xmm0, %xmm0 914 915 // fall through... 916 917 // low round: same as high round, but no rotation and no rcon. 918_vpaes_schedule_low_round: 919 // smear xmm7 920 ext v1.16b, v4.16b, v7.16b, #12 // vpslldq \$4, %xmm7, %xmm1 921 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 922 ext v4.16b, v4.16b, v7.16b, #8 // vpslldq \$8, %xmm7, %xmm4 923 924 // subbytes 925 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k 926 ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i 927 eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 928 tbl v2.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k 929 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j 930 tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i 931 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k 932 tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j 933 eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7 934 tbl v3.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak 935 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k 936 tbl v2.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak 937 eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io 938 eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo 939 tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou 940 tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t 941 eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output 942 943 // add in smeared stuff 944 eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 945 eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 946 ret 947.size _vpaes_schedule_round,.-_vpaes_schedule_round 948 949## 950## .aes_schedule_transform 951## 952## Linear-transform %xmm0 according to tables at (%r11) 953## 954## Requires that %xmm9 = 0x0F0F... as in preheat 955## Output in %xmm0 956## Clobbers %xmm1, %xmm2 957## 958.type _vpaes_schedule_transform,%function 959.align 4 960_vpaes_schedule_transform: 961 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 962 ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 963 // vmovdqa (%r11), %xmm2 # lo 964 tbl v2.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 965 // vmovdqa 16(%r11), %xmm1 # hi 966 tbl v0.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 967 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 968 ret 969.size _vpaes_schedule_transform,.-_vpaes_schedule_transform 970 971## 972## .aes_schedule_mangle 973## 974## Mangle xmm0 from (basis-transformed) standard version 975## to our version. 976## 977## On encrypt, 978## xor with 0x63 979## multiply by circulant 0,1,1,1 980## apply shiftrows transform 981## 982## On decrypt, 983## xor with 0x63 984## multiply by "inverse mixcolumns" circulant E,B,D,9 985## deskew 986## apply shiftrows transform 987## 988## 989## Writes out to (%rdx), and increments or decrements it 990## Keeps track of round number mod 4 in %r8 991## Preserves xmm0 992## Clobbers xmm1-xmm5 993## 994.type _vpaes_schedule_mangle,%function 995.align 4 996_vpaes_schedule_mangle: 997 mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later 998 // vmovdqa .Lk_mc_forward(%rip),%xmm5 999 cbnz $dir, .Lschedule_mangle_dec 1000 1001 // encrypting 1002 eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4 1003 add $out, $out, #16 // add \$16, %rdx 1004 tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 1005 tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 1006 tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 1007 eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 1008 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 1009 eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 1010 1011 b .Lschedule_mangle_both 1012.align 4 1013.Lschedule_mangle_dec: 1014 // inverse mix columns 1015 // lea .Lk_dksd(%rip),%r11 1016 ushr v1.16b, v4.16b, #4 // vpsrlb \$4, %xmm4, %xmm1 # 1 = hi 1017 and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo 1018 1019 // vmovdqa 0x00(%r11), %xmm2 1020 tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 1021 // vmovdqa 0x10(%r11), %xmm3 1022 tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 1023 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 1024 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 1025 1026 // vmovdqa 0x20(%r11), %xmm2 1027 tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 1028 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 1029 // vmovdqa 0x30(%r11), %xmm3 1030 tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 1031 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 1032 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 1033 1034 // vmovdqa 0x40(%r11), %xmm2 1035 tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 1036 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 1037 // vmovdqa 0x50(%r11), %xmm3 1038 tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 1039 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 1040 1041 // vmovdqa 0x60(%r11), %xmm2 1042 tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 1043 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 1044 // vmovdqa 0x70(%r11), %xmm4 1045 tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4 1046 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 1047 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 1048 eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3 1049 1050 sub $out, $out, #16 // add \$-16, %rdx 1051 1052.Lschedule_mangle_both: 1053 tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 1054 add x8, x8, #64-16 // add \$-16, %r8 1055 and x8, x8, #~(1<<6) // and \$0x30, %r8 1056 st1 {v3.2d}, [$out] // vmovdqu %xmm3, (%rdx) 1057 ret 1058.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle 1059 1060.globl vpaes_set_encrypt_key 1061.type vpaes_set_encrypt_key,%function 1062.align 4 1063vpaes_set_encrypt_key: 1064 stp x29,x30,[sp,#-16]! 1065 add x29,sp,#0 1066 stp d8,d9,[sp,#-16]! // ABI spec says so 1067 1068 lsr w9, $bits, #5 // shr \$5,%eax 1069 add w9, w9, #5 // \$5,%eax 1070 str w9, [$out,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; 1071 1072 mov $dir, #0 // mov \$0,%ecx 1073 mov x8, #0x30 // mov \$0x30,%r8d 1074 bl _vpaes_schedule_core 1075 eor x0, x0, x0 1076 1077 ldp d8,d9,[sp],#16 1078 ldp x29,x30,[sp],#16 1079 ret 1080.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key 1081 1082.globl vpaes_set_decrypt_key 1083.type vpaes_set_decrypt_key,%function 1084.align 4 1085vpaes_set_decrypt_key: 1086 stp x29,x30,[sp,#-16]! 1087 add x29,sp,#0 1088 stp d8,d9,[sp,#-16]! // ABI spec says so 1089 1090 lsr w9, $bits, #5 // shr \$5,%eax 1091 add w9, w9, #5 // \$5,%eax 1092 str w9, [$out,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; 1093 lsl w9, w9, #4 // shl \$4,%eax 1094 add $out, $out, #16 // lea 16(%rdx,%rax),%rdx 1095 add $out, $out, x9 1096 1097 mov $dir, #1 // mov \$1,%ecx 1098 lsr w8, $bits, #1 // shr \$1,%r8d 1099 and x8, x8, #32 // and \$32,%r8d 1100 eor x8, x8, #32 // xor \$32,%r8d # nbits==192?0:32 1101 bl _vpaes_schedule_core 1102 1103 ldp d8,d9,[sp],#16 1104 ldp x29,x30,[sp],#16 1105 ret 1106.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key 1107___ 1108} 1109{ 1110my ($inp,$out,$len,$key,$ivec,$dir) = map("x$_",(0..5)); 1111 1112$code.=<<___; 1113.globl vpaes_cbc_encrypt 1114.type vpaes_cbc_encrypt,%function 1115.align 4 1116vpaes_cbc_encrypt: 1117 cbz $len, .Lcbc_abort 1118 cmp w5, #0 // check direction 1119 b.eq vpaes_cbc_decrypt 1120 1121 stp x29,x30,[sp,#-16]! 1122 add x29,sp,#0 1123 1124 mov x17, $len // reassign 1125 mov x2, $key // reassign 1126 1127 ld1 {v0.16b}, [$ivec] // load ivec 1128 bl _vpaes_encrypt_preheat 1129 b .Lcbc_enc_loop 1130 1131.align 4 1132.Lcbc_enc_loop: 1133 ld1 {v7.16b}, [$inp],#16 // load input 1134 eor v7.16b, v7.16b, v0.16b // xor with ivec 1135 bl _vpaes_encrypt_core 1136 st1 {v0.16b}, [$out],#16 // save output 1137 subs x17, x17, #16 1138 b.hi .Lcbc_enc_loop 1139 1140 st1 {v0.16b}, [$ivec] // write ivec 1141 1142 ldp x29,x30,[sp],#16 1143.Lcbc_abort: 1144 ret 1145.size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt 1146 1147.type vpaes_cbc_decrypt,%function 1148.align 4 1149vpaes_cbc_decrypt: 1150 stp x29,x30,[sp,#-16]! 1151 add x29,sp,#0 1152 stp d8,d9,[sp,#-16]! // ABI spec says so 1153 stp d10,d11,[sp,#-16]! 1154 stp d12,d13,[sp,#-16]! 1155 stp d14,d15,[sp,#-16]! 1156 1157 mov x17, $len // reassign 1158 mov x2, $key // reassign 1159 ld1 {v6.16b}, [$ivec] // load ivec 1160 bl _vpaes_decrypt_preheat 1161 tst x17, #16 1162 b.eq .Lcbc_dec_loop2x 1163 1164 ld1 {v7.16b}, [$inp], #16 // load input 1165 bl _vpaes_decrypt_core 1166 eor v0.16b, v0.16b, v6.16b // xor with ivec 1167 orr v6.16b, v7.16b, v7.16b // next ivec value 1168 st1 {v0.16b}, [$out], #16 1169 subs x17, x17, #16 1170 b.ls .Lcbc_dec_done 1171 1172.align 4 1173.Lcbc_dec_loop2x: 1174 ld1 {v14.16b,v15.16b}, [$inp], #32 1175 bl _vpaes_decrypt_2x 1176 eor v0.16b, v0.16b, v6.16b // xor with ivec 1177 eor v1.16b, v1.16b, v14.16b 1178 orr v6.16b, v15.16b, v15.16b 1179 st1 {v0.16b,v1.16b}, [$out], #32 1180 subs x17, x17, #32 1181 b.hi .Lcbc_dec_loop2x 1182 1183.Lcbc_dec_done: 1184 st1 {v6.16b}, [$ivec] 1185 1186 ldp d14,d15,[sp],#16 1187 ldp d12,d13,[sp],#16 1188 ldp d10,d11,[sp],#16 1189 ldp d8,d9,[sp],#16 1190 ldp x29,x30,[sp],#16 1191 ret 1192.size vpaes_cbc_decrypt,.-vpaes_cbc_decrypt 1193___ 1194# We omit vpaes_ecb_* in BoringSSL. They are unused. 1195if (0) { 1196$code.=<<___; 1197.globl vpaes_ecb_encrypt 1198.type vpaes_ecb_encrypt,%function 1199.align 4 1200vpaes_ecb_encrypt: 1201 stp x29,x30,[sp,#-16]! 1202 add x29,sp,#0 1203 stp d8,d9,[sp,#-16]! // ABI spec says so 1204 stp d10,d11,[sp,#-16]! 1205 stp d12,d13,[sp,#-16]! 1206 stp d14,d15,[sp,#-16]! 1207 1208 mov x17, $len 1209 mov x2, $key 1210 bl _vpaes_encrypt_preheat 1211 tst x17, #16 1212 b.eq .Lecb_enc_loop 1213 1214 ld1 {v7.16b}, [$inp],#16 1215 bl _vpaes_encrypt_core 1216 st1 {v0.16b}, [$out],#16 1217 subs x17, x17, #16 1218 b.ls .Lecb_enc_done 1219 1220.align 4 1221.Lecb_enc_loop: 1222 ld1 {v14.16b,v15.16b}, [$inp], #32 1223 bl _vpaes_encrypt_2x 1224 st1 {v0.16b,v1.16b}, [$out], #32 1225 subs x17, x17, #32 1226 b.hi .Lecb_enc_loop 1227 1228.Lecb_enc_done: 1229 ldp d14,d15,[sp],#16 1230 ldp d12,d13,[sp],#16 1231 ldp d10,d11,[sp],#16 1232 ldp d8,d9,[sp],#16 1233 ldp x29,x30,[sp],#16 1234 ret 1235.size vpaes_ecb_encrypt,.-vpaes_ecb_encrypt 1236 1237.globl vpaes_ecb_decrypt 1238.type vpaes_ecb_decrypt,%function 1239.align 4 1240vpaes_ecb_decrypt: 1241 stp x29,x30,[sp,#-16]! 1242 add x29,sp,#0 1243 stp d8,d9,[sp,#-16]! // ABI spec says so 1244 stp d10,d11,[sp,#-16]! 1245 stp d12,d13,[sp,#-16]! 1246 stp d14,d15,[sp,#-16]! 1247 1248 mov x17, $len 1249 mov x2, $key 1250 bl _vpaes_decrypt_preheat 1251 tst x17, #16 1252 b.eq .Lecb_dec_loop 1253 1254 ld1 {v7.16b}, [$inp],#16 1255 bl _vpaes_encrypt_core 1256 st1 {v0.16b}, [$out],#16 1257 subs x17, x17, #16 1258 b.ls .Lecb_dec_done 1259 1260.align 4 1261.Lecb_dec_loop: 1262 ld1 {v14.16b,v15.16b}, [$inp], #32 1263 bl _vpaes_decrypt_2x 1264 st1 {v0.16b,v1.16b}, [$out], #32 1265 subs x17, x17, #32 1266 b.hi .Lecb_dec_loop 1267 1268.Lecb_dec_done: 1269 ldp d14,d15,[sp],#16 1270 ldp d12,d13,[sp],#16 1271 ldp d10,d11,[sp],#16 1272 ldp d8,d9,[sp],#16 1273 ldp x29,x30,[sp],#16 1274 ret 1275.size vpaes_ecb_decrypt,.-vpaes_ecb_decrypt 1276___ 1277} 1278 1279my ($ctr, $ctr_tmp) = ("w6", "w7"); 1280 1281# void vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len, 1282# const AES_KEY *key, const uint8_t ivec[16]); 1283$code.=<<___; 1284.globl vpaes_ctr32_encrypt_blocks 1285.type vpaes_ctr32_encrypt_blocks,%function 1286.align 4 1287vpaes_ctr32_encrypt_blocks: 1288 stp x29,x30,[sp,#-16]! 1289 add x29,sp,#0 1290 stp d8,d9,[sp,#-16]! // ABI spec says so 1291 stp d10,d11,[sp,#-16]! 1292 stp d12,d13,[sp,#-16]! 1293 stp d14,d15,[sp,#-16]! 1294 1295 cbz $len, .Lctr32_done 1296 1297 // Note, unlike the other functions, $len here is measured in blocks, 1298 // not bytes. 1299 mov x17, $len 1300 mov x2, $key 1301 1302 // Load the IV and counter portion. 1303 ldr $ctr, [$ivec, #12] 1304 ld1 {v7.16b}, [$ivec] 1305 1306 bl _vpaes_encrypt_preheat 1307 tst x17, #1 1308 rev $ctr, $ctr // The counter is big-endian. 1309 b.eq .Lctr32_prep_loop 1310 1311 // Handle one block so the remaining block count is even for 1312 // _vpaes_encrypt_2x. 1313 ld1 {v6.16b}, [$inp], #16 // Load input ahead of time 1314 bl _vpaes_encrypt_core 1315 eor v0.16b, v0.16b, v6.16b // XOR input and result 1316 st1 {v0.16b}, [$out], #16 1317 subs x17, x17, #1 1318 // Update the counter. 1319 add $ctr, $ctr, #1 1320 rev $ctr_tmp, $ctr 1321 mov v7.s[3], $ctr_tmp 1322 b.ls .Lctr32_done 1323 1324.Lctr32_prep_loop: 1325 // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x 1326 // uses v14 and v15. 1327 mov v15.16b, v7.16b 1328 mov v14.16b, v7.16b 1329 add $ctr, $ctr, #1 1330 rev $ctr_tmp, $ctr 1331 mov v15.s[3], $ctr_tmp 1332 1333.Lctr32_loop: 1334 ld1 {v6.16b,v7.16b}, [$inp], #32 // Load input ahead of time 1335 bl _vpaes_encrypt_2x 1336 eor v0.16b, v0.16b, v6.16b // XOR input and result 1337 eor v1.16b, v1.16b, v7.16b // XOR input and result (#2) 1338 st1 {v0.16b,v1.16b}, [$out], #32 1339 subs x17, x17, #2 1340 // Update the counter. 1341 add $ctr_tmp, $ctr, #1 1342 add $ctr, $ctr, #2 1343 rev $ctr_tmp, $ctr_tmp 1344 mov v14.s[3], $ctr_tmp 1345 rev $ctr_tmp, $ctr 1346 mov v15.s[3], $ctr_tmp 1347 b.hi .Lctr32_loop 1348 1349.Lctr32_done: 1350 ldp d14,d15,[sp],#16 1351 ldp d12,d13,[sp],#16 1352 ldp d10,d11,[sp],#16 1353 ldp d8,d9,[sp],#16 1354 ldp x29,x30,[sp],#16 1355 ret 1356.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks 1357___ 1358} 1359 1360print $code; 1361 1362close STDOUT; 1363