1#!/usr/bin/env perl 2# Copyright (c) 2019, Google Inc. 3# 4# Permission to use, copy, modify, and/or distribute this software for any 5# purpose with or without fee is hereby granted, provided that the above 6# copyright notice and this permission notice appear in all copies. 7# 8# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 9# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 10# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY 11# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 12# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION 13# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN 14# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 15 16# ghash-ssse3-x86_64.pl is a constant-time variant of the traditional 4-bit 17# table-based GHASH implementation. It requires SSSE3 instructions. 18# 19# For background, the table-based strategy is a 4-bit windowed multiplication. 20# It precomputes all 4-bit multiples of H (this is 16 128-bit rows), then loops 21# over 4-bit windows of the input and indexes them up into the table. Visually, 22# it multiplies as in the schoolbook multiplication diagram below, but with 23# more terms. (Each term is 4 bits, so there are 32 terms in each row.) First 24# it incorporates the terms labeled '1' by indexing the most significant term 25# of X into the table. Then it shifts and repeats for '2' and so on. 26# 27# hhhhhh 28# * xxxxxx 29# ============ 30# 666666 31# 555555 32# 444444 33# 333333 34# 222222 35# 111111 36# 37# This implementation changes the order. We treat the table as a 16×16 matrix 38# and transpose it. The first row is then the first byte of each multiple of H, 39# and so on. We then reorder terms as below. Observe that the terms labeled '1' 40# and '2' are all lookups into the first row, etc. This maps well to the SSSE3 41# pshufb instruction, using alternating terms of X in parallel as indices. This 42# alternation is needed because pshufb maps 4 bits to 8 bits. Then we shift and 43# repeat for each row. 44# 45# hhhhhh 46# * xxxxxx 47# ============ 48# 224466 49# 113355 50# 224466 51# 113355 52# 224466 53# 113355 54# 55# Next we account for GCM's confusing bit order. The "first" bit is the least 56# significant coefficient, but GCM treats the most sigificant bit within a byte 57# as first. Bytes are little-endian, and bits are big-endian. We reverse the 58# bytes in XMM registers for a consistent bit and byte ordering, but this means 59# the least significant bit is the most significant coefficient and vice versa. 60# 61# For consistency, "low", "high", "left-shift", and "right-shift" refer to the 62# bit ordering within the XMM register, rather than the reversed coefficient 63# ordering. Low bits are less significant bits and more significant 64# coefficients. Right-shifts move from MSB to the LSB and correspond to 65# increasing the power of each coefficient. 66# 67# Note this bit reversal enters into the table's column indices. H*1 is stored 68# in column 0b1000 and H*x^3 is stored in column 0b0001. It also means earlier 69# table rows contain more significant coefficients, so we iterate forwards. 70 71use strict; 72 73my $flavour = shift; 74my $output = shift; 75if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 76 77my $win64 = 0; 78$win64 = 1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 79 80$0 =~ m/(.*[\/\\])[^\/\\]+$/; 81my $dir = $1; 82my $xlate; 83( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 84( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or 85die "can't locate x86_64-xlate.pl"; 86 87open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\""; 88*STDOUT = *OUT; 89 90my ($Xi, $Htable, $in, $len) = $win64 ? ("%rcx", "%rdx", "%r8", "%r9") : 91 ("%rdi", "%rsi", "%rdx", "%rcx"); 92 93 94my $code = <<____; 95.text 96 97# gcm_gmult_ssse3 multiplies |Xi| by |Htable| and writes the result to |Xi|. 98# |Xi| is represented in GHASH's serialized byte representation. |Htable| is 99# formatted as described above. 100# void gcm_gmult_ssse3(uint64_t Xi[2], const u128 Htable[16]); 101.type gcm_gmult_ssse3, \@abi-omnipotent 102.globl gcm_gmult_ssse3 103.align 16 104gcm_gmult_ssse3: 105.cfi_startproc 106.Lgmult_seh_begin: 107____ 108$code .= <<____ if ($win64); 109 subq \$40, %rsp 110.Lgmult_seh_allocstack: 111 movdqa %xmm6, (%rsp) 112.Lgmult_seh_save_xmm6: 113 movdqa %xmm10, 16(%rsp) 114.Lgmult_seh_save_xmm10: 115.Lgmult_seh_prolog_end: 116____ 117$code .= <<____; 118 movdqu ($Xi), %xmm0 119 movdqa .Lreverse_bytes(%rip), %xmm10 120 movdqa .Llow4_mask(%rip), %xmm2 121 122 # Reverse input bytes to deserialize. 123 pshufb %xmm10, %xmm0 124 125 # Split each byte into low (%xmm0) and high (%xmm1) halves. 126 movdqa %xmm2, %xmm1 127 pandn %xmm0, %xmm1 128 psrld \$4, %xmm1 129 pand %xmm2, %xmm0 130 131 # Maintain the result in %xmm2 (the value) and %xmm3 (carry bits). Note 132 # that, due to bit reversal, %xmm3 contains bits that fall off when 133 # right-shifting, not left-shifting. 134 pxor %xmm2, %xmm2 135 pxor %xmm3, %xmm3 136____ 137 138my $call_counter = 0; 139# process_rows returns assembly code to process $rows rows of the table. On 140# input, $Htable stores the pointer to the next row. %xmm0 and %xmm1 store the 141# low and high halves of the input. The result so far is passed in %xmm2. %xmm3 142# must be zero. On output, $Htable is advanced to the next row and %xmm2 is 143# updated. %xmm3 remains zero. It clobbers %rax, %xmm4, %xmm5, and %xmm6. 144sub process_rows { 145 my ($rows) = @_; 146 $call_counter++; 147 148 # Shifting whole XMM registers by bits is complex. psrldq shifts by bytes, 149 # and psrlq shifts the two 64-bit halves separately. Each row produces 8 150 # bits of carry, and the reduction needs an additional 7-bit shift. This 151 # must fit in 64 bits so reduction can use psrlq. This allows up to 7 rows 152 # at a time. 153 die "Carry register would overflow 64 bits." if ($rows*8 + 7 > 64); 154 155 return <<____; 156 movq \$$rows, %rax 157.Loop_row_$call_counter: 158 movdqa ($Htable), %xmm4 159 leaq 16($Htable), $Htable 160 161 # Right-shift %xmm2 and %xmm3 by 8 bytes. 162 movdqa %xmm2, %xmm6 163 palignr \$1, %xmm3, %xmm6 164 movdqa %xmm6, %xmm3 165 psrldq \$1, %xmm2 166 167 # Load the next table row and index the low and high bits of the input. 168 # Note the low (respectively, high) half corresponds to more 169 # (respectively, less) significant coefficients. 170 movdqa %xmm4, %xmm5 171 pshufb %xmm0, %xmm4 172 pshufb %xmm1, %xmm5 173 174 # Add the high half (%xmm5) without shifting. 175 pxor %xmm5, %xmm2 176 177 # Add the low half (%xmm4). This must be right-shifted by 4 bits. First, 178 # add into the carry register (%xmm3). 179 movdqa %xmm4, %xmm5 180 psllq \$60, %xmm5 181 movdqa %xmm5, %xmm6 182 pslldq \$8, %xmm6 183 pxor %xmm6, %xmm3 184 185 # Next, add into %xmm2. 186 psrldq \$8, %xmm5 187 pxor %xmm5, %xmm2 188 psrlq \$4, %xmm4 189 pxor %xmm4, %xmm2 190 191 subq \$1, %rax 192 jnz .Loop_row_$call_counter 193 194 # Reduce the carry register. The reduction polynomial is 1 + x + x^2 + 195 # x^7, so we shift and XOR four times. 196 pxor %xmm3, %xmm2 # x^0 = 0 197 psrlq \$1, %xmm3 198 pxor %xmm3, %xmm2 # x^1 = x 199 psrlq \$1, %xmm3 200 pxor %xmm3, %xmm2 # x^(1+1) = x^2 201 psrlq \$5, %xmm3 202 pxor %xmm3, %xmm2 # x^(1+1+5) = x^7 203 pxor %xmm3, %xmm3 204____ 205} 206 207# We must reduce at least once every 7 rows, so divide into three chunks. 208$code .= process_rows(5); 209$code .= process_rows(5); 210$code .= process_rows(6); 211 212$code .= <<____; 213 # Store the result. Reverse bytes to serialize. 214 pshufb %xmm10, %xmm2 215 movdqu %xmm2, ($Xi) 216 217 # Zero any registers which contain secrets. 218 pxor %xmm0, %xmm0 219 pxor %xmm1, %xmm1 220 pxor %xmm2, %xmm2 221 pxor %xmm3, %xmm3 222 pxor %xmm4, %xmm4 223 pxor %xmm5, %xmm5 224 pxor %xmm6, %xmm6 225____ 226$code .= <<____ if ($win64); 227 movdqa (%rsp), %xmm6 228 movdqa 16(%rsp), %xmm10 229 addq \$40, %rsp 230____ 231$code .= <<____; 232 ret 233.Lgmult_seh_end: 234.cfi_endproc 235.size gcm_gmult_ssse3,.-gcm_gmult_ssse3 236____ 237 238$code .= <<____; 239# gcm_ghash_ssse3 incorporates |len| bytes from |in| to |Xi|, using |Htable| as 240# the key. It writes the result back to |Xi|. |Xi| is represented in GHASH's 241# serialized byte representation. |Htable| is formatted as described above. 242# void gcm_ghash_ssse3(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in, 243# size_t len); 244.type gcm_ghash_ssse3, \@abi-omnipotent 245.globl gcm_ghash_ssse3 246.align 16 247gcm_ghash_ssse3: 248.Lghash_seh_begin: 249.cfi_startproc 250____ 251$code .= <<____ if ($win64); 252 subq \$56, %rsp 253.Lghash_seh_allocstack: 254 movdqa %xmm6, (%rsp) 255.Lghash_seh_save_xmm6: 256 movdqa %xmm10, 16(%rsp) 257.Lghash_seh_save_xmm10: 258 movdqa %xmm11, 32(%rsp) 259.Lghash_seh_save_xmm11: 260.Lghash_seh_prolog_end: 261____ 262$code .= <<____; 263 movdqu ($Xi), %xmm0 264 movdqa .Lreverse_bytes(%rip), %xmm10 265 movdqa .Llow4_mask(%rip), %xmm11 266 267 # This function only processes whole blocks. 268 andq \$-16, $len 269 270 # Reverse input bytes to deserialize. We maintain the running 271 # total in %xmm0. 272 pshufb %xmm10, %xmm0 273 274 # Iterate over each block. On entry to each iteration, %xmm3 is zero. 275 pxor %xmm3, %xmm3 276.Loop_ghash: 277 # Incorporate the next block of input. 278 movdqu ($in), %xmm1 279 pshufb %xmm10, %xmm1 # Reverse bytes. 280 pxor %xmm1, %xmm0 281 282 # Split each byte into low (%xmm0) and high (%xmm1) halves. 283 movdqa %xmm11, %xmm1 284 pandn %xmm0, %xmm1 285 psrld \$4, %xmm1 286 pand %xmm11, %xmm0 287 288 # Maintain the result in %xmm2 (the value) and %xmm3 (carry bits). Note 289 # that, due to bit reversal, %xmm3 contains bits that fall off when 290 # right-shifting, not left-shifting. 291 pxor %xmm2, %xmm2 292 # %xmm3 is already zero at this point. 293____ 294 295# We must reduce at least once every 7 rows, so divide into three chunks. 296$code .= process_rows(5); 297$code .= process_rows(5); 298$code .= process_rows(6); 299 300$code .= <<____; 301 movdqa %xmm2, %xmm0 302 303 # Rewind $Htable for the next iteration. 304 leaq -256($Htable), $Htable 305 306 # Advance input and continue. 307 leaq 16($in), $in 308 subq \$16, $len 309 jnz .Loop_ghash 310 311 # Reverse bytes and store the result. 312 pshufb %xmm10, %xmm0 313 movdqu %xmm0, ($Xi) 314 315 # Zero any registers which contain secrets. 316 pxor %xmm0, %xmm0 317 pxor %xmm1, %xmm1 318 pxor %xmm2, %xmm2 319 pxor %xmm3, %xmm3 320 pxor %xmm4, %xmm4 321 pxor %xmm5, %xmm5 322 pxor %xmm6, %xmm6 323____ 324$code .= <<____ if ($win64); 325 movdqa (%rsp), %xmm6 326 movdqa 16(%rsp), %xmm10 327 movdqa 32(%rsp), %xmm11 328 addq \$56, %rsp 329____ 330$code .= <<____; 331 ret 332.Lghash_seh_end: 333.cfi_endproc 334.size gcm_ghash_ssse3,.-gcm_ghash_ssse3 335 336.align 16 337# .Lreverse_bytes is a permutation which, if applied with pshufb, reverses the 338# bytes in an XMM register. 339.Lreverse_bytes: 340.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 341# .Llow4_mask is an XMM mask which selects the low four bits of each byte. 342.Llow4_mask: 343.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f 344____ 345 346if ($win64) { 347 # Add unwind metadata for SEH. 348 # 349 # TODO(davidben): This is all manual right now. Once we've added SEH tests, 350 # add support for emitting these in x86_64-xlate.pl, probably based on MASM 351 # and Yasm's unwind directives, and unify with CFI. Then upstream it to 352 # replace the error-prone and non-standard custom handlers. 353 354 # See https://docs.microsoft.com/en-us/cpp/build/struct-unwind-code?view=vs-2017 355 my $UWOP_ALLOC_SMALL = 2; 356 my $UWOP_SAVE_XMM128 = 8; 357 358 $code .= <<____; 359.section .pdata 360.align 4 361 .rva .Lgmult_seh_begin 362 .rva .Lgmult_seh_end 363 .rva .Lgmult_seh_info 364 365 .rva .Lghash_seh_begin 366 .rva .Lghash_seh_end 367 .rva .Lghash_seh_info 368 369.section .xdata 370.align 8 371.Lgmult_seh_info: 372 .byte 1 # version 1, no flags 373 .byte .Lgmult_seh_prolog_end-.Lgmult_seh_begin 374 .byte 5 # num_slots = 1 + 2 + 2 375 .byte 0 # no frame register 376 377 .byte .Lgmult_seh_save_xmm10-.Lgmult_seh_begin 378 .byte @{[$UWOP_SAVE_XMM128 | (10 << 4)]} 379 .value 1 380 381 .byte .Lgmult_seh_save_xmm6-.Lgmult_seh_begin 382 .byte @{[$UWOP_SAVE_XMM128 | (6 << 4)]} 383 .value 0 384 385 .byte .Lgmult_seh_allocstack-.Lgmult_seh_begin 386 .byte @{[$UWOP_ALLOC_SMALL | (((40 - 8) / 8) << 4)]} 387 388.align 8 389.Lghash_seh_info: 390 .byte 1 # version 1, no flags 391 .byte .Lghash_seh_prolog_end-.Lghash_seh_begin 392 .byte 7 # num_slots = 1 + 2 + 2 + 2 393 .byte 0 # no frame register 394 395 .byte .Lghash_seh_save_xmm11-.Lghash_seh_begin 396 .byte @{[$UWOP_SAVE_XMM128 | (11 << 4)]} 397 .value 2 398 399 .byte .Lghash_seh_save_xmm10-.Lghash_seh_begin 400 .byte @{[$UWOP_SAVE_XMM128 | (10 << 4)]} 401 .value 1 402 403 .byte .Lghash_seh_save_xmm6-.Lghash_seh_begin 404 .byte @{[$UWOP_SAVE_XMM128 | (6 << 4)]} 405 .value 0 406 407 .byte .Lghash_seh_allocstack-.Lghash_seh_begin 408 .byte @{[$UWOP_ALLOC_SMALL | (((56 - 8) / 8) << 4)]} 409____ 410} 411 412print $code; 413close STDOUT; 414