1#!/usr/bin/env perl 2# Copyright (c) 2019, Google Inc. 3# 4# Permission to use, copy, modify, and/or distribute this software for any 5# purpose with or without fee is hereby granted, provided that the above 6# copyright notice and this permission notice appear in all copies. 7# 8# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 9# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 10# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY 11# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 12# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION 13# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN 14# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 15 16# ghash-ssse3-x86.pl is a constant-time variant of the traditional 4-bit 17# table-based GHASH implementation. It requires SSSE3 instructions. 18# 19# For background, the table-based strategy is a 4-bit windowed multiplication. 20# It precomputes all 4-bit multiples of H (this is 16 128-bit rows), then loops 21# over 4-bit windows of the input and indexes them up into the table. Visually, 22# it multiplies as in the schoolbook multiplication diagram below, but with 23# more terms. (Each term is 4 bits, so there are 32 terms in each row.) First 24# it incorporates the terms labeled '1' by indexing the most significant term 25# of X into the table. Then it shifts and repeats for '2' and so on. 26# 27# hhhhhh 28# * xxxxxx 29# ============ 30# 666666 31# 555555 32# 444444 33# 333333 34# 222222 35# 111111 36# 37# This implementation changes the order. We treat the table as a 16×16 matrix 38# and transpose it. The first row is then the first byte of each multiple of H, 39# and so on. We then reorder terms as below. Observe that the terms labeled '1' 40# and '2' are all lookups into the first row, etc. This maps well to the SSSE3 41# pshufb instruction, using alternating terms of X in parallel as indices. This 42# alternation is needed because pshufb maps 4 bits to 8 bits. Then we shift and 43# repeat for each row. 44# 45# hhhhhh 46# * xxxxxx 47# ============ 48# 224466 49# 113355 50# 224466 51# 113355 52# 224466 53# 113355 54# 55# Next we account for GCM's confusing bit order. The "first" bit is the least 56# significant coefficient, but GCM treats the most sigificant bit within a byte 57# as first. Bytes are little-endian, and bits are big-endian. We reverse the 58# bytes in XMM registers for a consistent bit and byte ordering, but this means 59# the least significant bit is the most significant coefficient and vice versa. 60# 61# For consistency, "low", "high", "left-shift", and "right-shift" refer to the 62# bit ordering within the XMM register, rather than the reversed coefficient 63# ordering. Low bits are less significant bits and more significant 64# coefficients. Right-shifts move from MSB to the LSB and correspond to 65# increasing the power of each coefficient. 66# 67# Note this bit reversal enters into the table's column indices. H*1 is stored 68# in column 0b1000 and H*x^3 is stored in column 0b0001. It also means earlier 69# table rows contain more significant coefficients, so we iterate forwards. 70 71$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 72push(@INC,"${dir}","${dir}../../../perlasm"); 73require "x86asm.pl"; 74 75$output = pop; 76open STDOUT, ">$output"; 77 78&asm_init($ARGV[0]); 79 80my ($Xi, $Htable, $in, $len) = ("edi", "esi", "edx", "ecx"); 81&static_label("reverse_bytes"); 82&static_label("low4_mask"); 83 84my $call_counter = 0; 85# process_rows emits assembly code to process $rows rows of the table. On 86# input, $Htable stores the pointer to the next row. xmm0 and xmm1 store the 87# low and high halves of the input. The result so far is passed in xmm2. xmm3 88# must be zero. On output, $Htable is advanced to the next row and xmm2 is 89# updated. xmm3 remains zero. It clobbers eax, xmm4, xmm5, and xmm6. 90sub process_rows { 91 my ($rows) = @_; 92 $call_counter++; 93 94 # Shifting whole XMM registers by bits is complex. psrldq shifts by 95 # bytes, and psrlq shifts the two 64-bit halves separately. Each row 96 # produces 8 bits of carry, and the reduction needs an additional 7-bit 97 # shift. This must fit in 64 bits so reduction can use psrlq. This 98 # allows up to 7 rows at a time. 99 die "Carry register would overflow 64 bits." if ($rows*8 + 7 > 64); 100 101 &mov("eax", $rows); 102&set_label("loop_row_$call_counter"); 103 &movdqa("xmm4", &QWP(0, $Htable)); 104 &lea($Htable, &DWP(16, $Htable)); 105 106 # Right-shift xmm2 and xmm3 by 8 bytes. 107 &movdqa("xmm6", "xmm2"); 108 &palignr("xmm6", "xmm3", 1); 109 &movdqa("xmm3", "xmm6"); 110 &psrldq("xmm2", 1); 111 112 # Load the next table row and index the low and high bits of the input. 113 # Note the low (respectively, high) half corresponds to more 114 # (respectively, less) significant coefficients. 115 &movdqa("xmm5", "xmm4"); 116 &pshufb("xmm4", "xmm0"); 117 &pshufb("xmm5", "xmm1"); 118 119 # Add the high half (xmm5) without shifting. 120 &pxor("xmm2", "xmm5"); 121 122 # Add the low half (xmm4). This must be right-shifted by 4 bits. First, 123 # add into the carry register (xmm3). 124 &movdqa("xmm5", "xmm4"); 125 &psllq("xmm5", 60); 126 &movdqa("xmm6", "xmm5"); 127 &pslldq("xmm6", 8); 128 &pxor("xmm3", "xmm6"); 129 130 # Next, add into xmm2. 131 &psrldq("xmm5", 8); 132 &pxor("xmm2", "xmm5"); 133 &psrlq("xmm4", 4); 134 &pxor("xmm2", "xmm4"); 135 136 &sub("eax", 1); 137 &jnz(&label("loop_row_$call_counter")); 138 139 # Reduce the carry register. The reduction polynomial is 1 + x + x^2 + 140 # x^7, so we shift and XOR four times. 141 &pxor("xmm2", "xmm3"); # x^0 = 0 142 &psrlq("xmm3", 1); 143 &pxor("xmm2", "xmm3"); # x^1 = x 144 &psrlq("xmm3", 1); 145 &pxor("xmm2", "xmm3"); # x^(1+1) = x^2 146 &psrlq("xmm3", 5); 147 &pxor("xmm2", "xmm3"); # x^(1+1+5) = x^7 148 &pxor("xmm3", "xmm3"); 149____ 150} 151 152# gcm_gmult_ssse3 multiplies |Xi| by |Htable| and writes the result to |Xi|. 153# |Xi| is represented in GHASH's serialized byte representation. |Htable| is 154# formatted as described above. 155# void gcm_gmult_ssse3(uint64_t Xi[2], const u128 Htable[16]); 156&function_begin("gcm_gmult_ssse3"); 157 &mov($Xi, &wparam(0)); 158 &mov($Htable, &wparam(1)); 159 160 &movdqu("xmm0", &QWP(0, $Xi)); 161 &call(&label("pic_point")); 162&set_label("pic_point"); 163 &blindpop("eax"); 164 &movdqa("xmm7", &QWP(&label("reverse_bytes")."-".&label("pic_point"), "eax")); 165 &movdqa("xmm2", &QWP(&label("low4_mask")."-".&label("pic_point"), "eax")); 166 167 # Reverse input bytes to deserialize. 168 &pshufb("xmm0", "xmm7"); 169 170 # Split each byte into low (xmm0) and high (xmm1) halves. 171 &movdqa("xmm1", "xmm2"); 172 &pandn("xmm1", "xmm0"); 173 &psrld("xmm1", 4); 174 &pand("xmm0", "xmm2"); 175 176 # Maintain the result in xmm2 (the value) and xmm3 (carry bits). Note 177 # that, due to bit reversal, xmm3 contains bits that fall off when 178 # right-shifting, not left-shifting. 179 &pxor("xmm2", "xmm2"); 180 &pxor("xmm3", "xmm3"); 181 182 # We must reduce at least once every 7 rows, so divide into three 183 # chunks. 184 &process_rows(5); 185 &process_rows(5); 186 &process_rows(6); 187 188 # Store the result. Reverse bytes to serialize. 189 &pshufb("xmm2", "xmm7"); 190 &movdqu(&QWP(0, $Xi), "xmm2"); 191 192 # Zero any registers which contain secrets. 193 &pxor("xmm0", "xmm0"); 194 &pxor("xmm1", "xmm1"); 195 &pxor("xmm2", "xmm2"); 196 &pxor("xmm3", "xmm3"); 197 &pxor("xmm4", "xmm4"); 198 &pxor("xmm5", "xmm5"); 199 &pxor("xmm6", "xmm6"); 200&function_end("gcm_gmult_ssse3"); 201 202# gcm_ghash_ssse3 incorporates |len| bytes from |in| to |Xi|, using |Htable| as 203# the key. It writes the result back to |Xi|. |Xi| is represented in GHASH's 204# serialized byte representation. |Htable| is formatted as described above. 205# void gcm_ghash_ssse3(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in, 206# size_t len); 207&function_begin("gcm_ghash_ssse3"); 208 &mov($Xi, &wparam(0)); 209 &mov($Htable, &wparam(1)); 210 &mov($in, &wparam(2)); 211 &mov($len, &wparam(3)); 212 213 &movdqu("xmm0", &QWP(0, $Xi)); 214 &call(&label("pic_point")); 215&set_label("pic_point"); 216 &blindpop("ebx"); 217 &movdqa("xmm7", &QWP(&label("reverse_bytes")."-".&label("pic_point"), "ebx")); 218 219 # This function only processes whole blocks. 220 &and($len, -16); 221 222 # Reverse input bytes to deserialize. We maintain the running 223 # total in xmm0. 224 &pshufb("xmm0", "xmm7"); 225 226 # Iterate over each block. On entry to each iteration, xmm3 is zero. 227 &pxor("xmm3", "xmm3"); 228&set_label("loop_ghash"); 229 &movdqa("xmm2", &QWP(&label("low4_mask")."-".&label("pic_point"), "ebx")); 230 231 # Incorporate the next block of input. 232 &movdqu("xmm1", &QWP(0, $in)); 233 &pshufb("xmm1", "xmm7"); # Reverse bytes. 234 &pxor("xmm0", "xmm1"); 235 236 # Split each byte into low (xmm0) and high (xmm1) halves. 237 &movdqa("xmm1", "xmm2"); 238 &pandn("xmm1", "xmm0"); 239 &psrld("xmm1", 4); 240 &pand("xmm0", "xmm2"); 241 242 # Maintain the result in xmm2 (the value) and xmm3 (carry bits). Note 243 # that, due to bit reversal, xmm3 contains bits that fall off when 244 # right-shifting, not left-shifting. 245 &pxor("xmm2", "xmm2"); 246 # xmm3 is already zero at this point. 247 248 # We must reduce at least once every 7 rows, so divide into three 249 # chunks. 250 &process_rows(5); 251 &process_rows(5); 252 &process_rows(6); 253 254 &movdqa("xmm0", "xmm2"); 255 256 # Rewind $Htable for the next iteration. 257 &lea($Htable, &DWP(-256, $Htable)); 258 259 # Advance input and continue. 260 &lea($in, &DWP(16, $in)); 261 &sub($len, 16); 262 &jnz(&label("loop_ghash")); 263 264 # Reverse bytes and store the result. 265 &pshufb("xmm0", "xmm7"); 266 &movdqu(&QWP(0, $Xi), "xmm0"); 267 268 # Zero any registers which contain secrets. 269 &pxor("xmm0", "xmm0"); 270 &pxor("xmm1", "xmm1"); 271 &pxor("xmm2", "xmm2"); 272 &pxor("xmm3", "xmm3"); 273 &pxor("xmm4", "xmm4"); 274 &pxor("xmm5", "xmm5"); 275 &pxor("xmm6", "xmm6"); 276&function_end("gcm_ghash_ssse3"); 277 278# reverse_bytes is a permutation which, if applied with pshufb, reverses the 279# bytes in an XMM register. 280&set_label("reverse_bytes", 16); 281&data_byte(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 282# low4_mask is an XMM mask which selects the low four bits of each byte. 283&set_label("low4_mask", 16); 284&data_word(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f); 285 286&asm_finish(); 287 288close STDOUT; 289