1#!/usr/bin/env perl 2 3############################################################################## 4# # 5# Copyright (c) 2012, Intel Corporation # 6# # 7# All rights reserved. # 8# # 9# Redistribution and use in source and binary forms, with or without # 10# modification, are permitted provided that the following conditions are # 11# met: # 12# # 13# * Redistributions of source code must retain the above copyright # 14# notice, this list of conditions and the following disclaimer. # 15# # 16# * Redistributions in binary form must reproduce the above copyright # 17# notice, this list of conditions and the following disclaimer in the # 18# documentation and/or other materials provided with the # 19# distribution. # 20# # 21# * Neither the name of the Intel Corporation nor the names of its # 22# contributors may be used to endorse or promote products derived from # 23# this software without specific prior written permission. # 24# # 25# # 26# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY # 27# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # 28# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # 29# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR # 30# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # 31# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # 32# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # 33# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # 34# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # 35# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # 36# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# 37# # 38############################################################################## 39# Developers and authors: # 40# Shay Gueron (1, 2), and Vlad Krasnov (1) # 41# (1) Intel Corporation, Israel Development Center, Haifa, Israel # 42# (2) University of Haifa, Israel # 43############################################################################## 44# Reference: # 45# [1] S. Gueron, V. Krasnov: "Software Implementation of Modular # 46# Exponentiation, Using Advanced Vector Instructions Architectures", # 47# F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369, # 48# pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012 # 49# [2] S. Gueron: "Efficient Software Implementations of Modular # 50# Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012). # 51# [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring", IEEE # 52# Proceedings of 9th International Conference on Information Technology: # 53# New Generations (ITNG 2012), pp.821-823 (2012) # 54# [4] S. Gueron, V. 
#     Krasnov: "[PATCH] Efficient and side channel analysis                  #
#     resistant 1024-bit modular exponentiation, for optimizing RSA2048      #
#     on AVX2 capable x86_64 platforms",                                     #
#     http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest#
##############################################################################
#
# +13% improvement over original submission by <appro@openssl.org>
#
# rsa2048 sign/sec      OpenSSL 1.0.1   scalar(*)       this
# 2.3GHz Haswell        621             765/+23%        1113/+79%
# 2.3GHz Broadwell(**)  688             1200(***)/+74%  1120/+63%
#
# (*)   if system doesn't support AVX2, for reference purposes;
# (**)  scaled to 2.3GHz to simplify comparison;
# (***) scalar AD*X code is faster than AVX2 and is preferred code
#       path for Broadwell;

# Command line: [flavour] output
#
# The first argument is the perlasm flavour and the second is the output
# file. If the first argument contains a dot it is taken to be the output
# file instead, and the flavour is left undefined.
$flavour = shift;
$output  = shift;
if (defined $flavour && $flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 is selected either by flavour (nasm, masm, mingw64) or by an output
# file ending in ".asm". Either value may be undefined at this point, so
# guard each pattern match instead of matching against undef.
$win64 = 0;
$win64 = 1 if ((defined $flavour && $flavour =~ /[nm]asm|mingw64/) ||
               (defined $output  && $output  =~ /\.asm$/));

# Directory part of this script's path, including the trailing separator.
# When $0 has no directory component the match fails; fall back to an empty
# prefix explicitly rather than reading a stale or undefined $1.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# Look for the translator next to this script first, then in ../../perlasm/.
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
# $avx goes up to 2 and $addx to 1; both are set to those maximum values
# immediately below.
87$avx = 2; 88$addx = 1; 89 90open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; 91*STDOUT = *OUT; 92 93if ($avx>1) {{{ 94{ # void AMS_WW( 95my $rp="%rdi"; # BN_ULONG *rp, 96my $ap="%rsi"; # const BN_ULONG *ap, 97my $np="%rdx"; # const BN_ULONG *np, 98my $n0="%ecx"; # const BN_ULONG n0, 99my $rep="%r8d"; # int repeat); 100 101# The registers that hold the accumulated redundant result 102# The AMM works on 1024 bit operands, and redundant word size is 29 103# Therefore: ceil(1024/29)/4 = 9 104my $ACC0="%ymm0"; 105my $ACC1="%ymm1"; 106my $ACC2="%ymm2"; 107my $ACC3="%ymm3"; 108my $ACC4="%ymm4"; 109my $ACC5="%ymm5"; 110my $ACC6="%ymm6"; 111my $ACC7="%ymm7"; 112my $ACC8="%ymm8"; 113my $ACC9="%ymm9"; 114# Registers that hold the broadcasted words of bp, currently used 115my $B1="%ymm10"; 116my $B2="%ymm11"; 117# Registers that hold the broadcasted words of Y, currently used 118my $Y1="%ymm12"; 119my $Y2="%ymm13"; 120# Helper registers 121my $TEMP1="%ymm14"; 122my $AND_MASK="%ymm15"; 123# alu registers that hold the first words of the ACC 124my $r0="%r9"; 125my $r1="%r10"; 126my $r2="%r11"; 127my $r3="%r12"; 128 129my $i="%r14d"; # loop counter 130my $tmp = "%r15"; 131 132my $FrameSize=32*18+32*8; # place for A^2 and 2*A 133 134my $aap=$r0; 135my $tp0="%rbx"; 136my $tp1=$r3; 137my $tpa=$tmp; 138 139$np="%r13"; # reassigned argument 140 141$code.=<<___; 142.text 143 144.globl rsaz_1024_sqr_avx2 145.type rsaz_1024_sqr_avx2,\@function,5 146.align 64 147rsaz_1024_sqr_avx2: # 702 cycles, 14% faster than rsaz_1024_mul_avx2 148.cfi_startproc 149 lea (%rsp), %rax 150.cfi_def_cfa_register %rax 151 push %rbx 152.cfi_push %rbx 153 push %rbp 154.cfi_push %rbp 155 push %r12 156.cfi_push %r12 157 push %r13 158.cfi_push %r13 159 push %r14 160.cfi_push %r14 161 push %r15 162.cfi_push %r15 163 vzeroupper 164___ 165$code.=<<___ if ($win64); 166 lea -0xa8(%rsp),%rsp 167 vmovaps %xmm6,-0xd8(%rax) 168 vmovaps %xmm7,-0xc8(%rax) 169 vmovaps %xmm8,-0xb8(%rax) 170 vmovaps %xmm9,-0xa8(%rax) 171 
vmovaps %xmm10,-0x98(%rax) 172 vmovaps %xmm11,-0x88(%rax) 173 vmovaps %xmm12,-0x78(%rax) 174 vmovaps %xmm13,-0x68(%rax) 175 vmovaps %xmm14,-0x58(%rax) 176 vmovaps %xmm15,-0x48(%rax) 177.Lsqr_1024_body: 178___ 179$code.=<<___; 180 mov %rax,%rbp 181.cfi_def_cfa_register %rbp 182 mov %rdx, $np # reassigned argument 183 sub \$$FrameSize, %rsp 184 mov $np, $tmp 185 sub \$-128, $rp # size optimization 186 sub \$-128, $ap 187 sub \$-128, $np 188 189 and \$4095, $tmp # see if $np crosses page 190 add \$32*10, $tmp 191 shr \$12, $tmp 192 vpxor $ACC9,$ACC9,$ACC9 193 jz .Lsqr_1024_no_n_copy 194 195 # unaligned 256-bit load that crosses page boundary can 196 # cause >2x performance degradation here, so if $np does 197 # cross page boundary, copy it to stack and make sure stack 198 # frame doesn't... 199 sub \$32*10,%rsp 200 vmovdqu 32*0-128($np), $ACC0 201 and \$-2048, %rsp 202 vmovdqu 32*1-128($np), $ACC1 203 vmovdqu 32*2-128($np), $ACC2 204 vmovdqu 32*3-128($np), $ACC3 205 vmovdqu 32*4-128($np), $ACC4 206 vmovdqu 32*5-128($np), $ACC5 207 vmovdqu 32*6-128($np), $ACC6 208 vmovdqu 32*7-128($np), $ACC7 209 vmovdqu 32*8-128($np), $ACC8 210 lea $FrameSize+128(%rsp),$np 211 vmovdqu $ACC0, 32*0-128($np) 212 vmovdqu $ACC1, 32*1-128($np) 213 vmovdqu $ACC2, 32*2-128($np) 214 vmovdqu $ACC3, 32*3-128($np) 215 vmovdqu $ACC4, 32*4-128($np) 216 vmovdqu $ACC5, 32*5-128($np) 217 vmovdqu $ACC6, 32*6-128($np) 218 vmovdqu $ACC7, 32*7-128($np) 219 vmovdqu $ACC8, 32*8-128($np) 220 vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero 221 222.Lsqr_1024_no_n_copy: 223 and \$-1024, %rsp 224 225 vmovdqu 32*1-128($ap), $ACC1 226 vmovdqu 32*2-128($ap), $ACC2 227 vmovdqu 32*3-128($ap), $ACC3 228 vmovdqu 32*4-128($ap), $ACC4 229 vmovdqu 32*5-128($ap), $ACC5 230 vmovdqu 32*6-128($ap), $ACC6 231 vmovdqu 32*7-128($ap), $ACC7 232 vmovdqu 32*8-128($ap), $ACC8 233 234 lea 192(%rsp), $tp0 # 64+128=192 235 vpbroadcastq .Land_mask(%rip), $AND_MASK 236 jmp .LOOP_GRANDE_SQR_1024 237 238.align 32 239.LOOP_GRANDE_SQR_1024: 240 
lea 32*18+128(%rsp), $aap # size optimization 241 lea 448(%rsp), $tp1 # 64+128+256=448 242 243 # the squaring is performed as described in Variant B of 244 # "Speeding up Big-Number Squaring", so start by calculating 245 # the A*2=A+A vector 246 vpaddq $ACC1, $ACC1, $ACC1 247 vpbroadcastq 32*0-128($ap), $B1 248 vpaddq $ACC2, $ACC2, $ACC2 249 vmovdqa $ACC1, 32*0-128($aap) 250 vpaddq $ACC3, $ACC3, $ACC3 251 vmovdqa $ACC2, 32*1-128($aap) 252 vpaddq $ACC4, $ACC4, $ACC4 253 vmovdqa $ACC3, 32*2-128($aap) 254 vpaddq $ACC5, $ACC5, $ACC5 255 vmovdqa $ACC4, 32*3-128($aap) 256 vpaddq $ACC6, $ACC6, $ACC6 257 vmovdqa $ACC5, 32*4-128($aap) 258 vpaddq $ACC7, $ACC7, $ACC7 259 vmovdqa $ACC6, 32*5-128($aap) 260 vpaddq $ACC8, $ACC8, $ACC8 261 vmovdqa $ACC7, 32*6-128($aap) 262 vpxor $ACC9, $ACC9, $ACC9 263 vmovdqa $ACC8, 32*7-128($aap) 264 265 vpmuludq 32*0-128($ap), $B1, $ACC0 266 vpbroadcastq 32*1-128($ap), $B2 267 vmovdqu $ACC9, 32*9-192($tp0) # zero upper half 268 vpmuludq $B1, $ACC1, $ACC1 269 vmovdqu $ACC9, 32*10-448($tp1) 270 vpmuludq $B1, $ACC2, $ACC2 271 vmovdqu $ACC9, 32*11-448($tp1) 272 vpmuludq $B1, $ACC3, $ACC3 273 vmovdqu $ACC9, 32*12-448($tp1) 274 vpmuludq $B1, $ACC4, $ACC4 275 vmovdqu $ACC9, 32*13-448($tp1) 276 vpmuludq $B1, $ACC5, $ACC5 277 vmovdqu $ACC9, 32*14-448($tp1) 278 vpmuludq $B1, $ACC6, $ACC6 279 vmovdqu $ACC9, 32*15-448($tp1) 280 vpmuludq $B1, $ACC7, $ACC7 281 vmovdqu $ACC9, 32*16-448($tp1) 282 vpmuludq $B1, $ACC8, $ACC8 283 vpbroadcastq 32*2-128($ap), $B1 284 vmovdqu $ACC9, 32*17-448($tp1) 285 286 mov $ap, $tpa 287 mov \$4, $i 288 jmp .Lsqr_entry_1024 289___ 290$TEMP0=$Y1; 291$TEMP2=$Y2; 292$code.=<<___; 293.align 32 294.LOOP_SQR_1024: 295 vpbroadcastq 32*1-128($tpa), $B2 296 vpmuludq 32*0-128($ap), $B1, $ACC0 297 vpaddq 32*0-192($tp0), $ACC0, $ACC0 298 vpmuludq 32*0-128($aap), $B1, $ACC1 299 vpaddq 32*1-192($tp0), $ACC1, $ACC1 300 vpmuludq 32*1-128($aap), $B1, $ACC2 301 vpaddq 32*2-192($tp0), $ACC2, $ACC2 302 vpmuludq 32*2-128($aap), $B1, $ACC3 303 vpaddq 
32*3-192($tp0), $ACC3, $ACC3 304 vpmuludq 32*3-128($aap), $B1, $ACC4 305 vpaddq 32*4-192($tp0), $ACC4, $ACC4 306 vpmuludq 32*4-128($aap), $B1, $ACC5 307 vpaddq 32*5-192($tp0), $ACC5, $ACC5 308 vpmuludq 32*5-128($aap), $B1, $ACC6 309 vpaddq 32*6-192($tp0), $ACC6, $ACC6 310 vpmuludq 32*6-128($aap), $B1, $ACC7 311 vpaddq 32*7-192($tp0), $ACC7, $ACC7 312 vpmuludq 32*7-128($aap), $B1, $ACC8 313 vpbroadcastq 32*2-128($tpa), $B1 314 vpaddq 32*8-192($tp0), $ACC8, $ACC8 315.Lsqr_entry_1024: 316 vmovdqu $ACC0, 32*0-192($tp0) 317 vmovdqu $ACC1, 32*1-192($tp0) 318 319 vpmuludq 32*1-128($ap), $B2, $TEMP0 320 vpaddq $TEMP0, $ACC2, $ACC2 321 vpmuludq 32*1-128($aap), $B2, $TEMP1 322 vpaddq $TEMP1, $ACC3, $ACC3 323 vpmuludq 32*2-128($aap), $B2, $TEMP2 324 vpaddq $TEMP2, $ACC4, $ACC4 325 vpmuludq 32*3-128($aap), $B2, $TEMP0 326 vpaddq $TEMP0, $ACC5, $ACC5 327 vpmuludq 32*4-128($aap), $B2, $TEMP1 328 vpaddq $TEMP1, $ACC6, $ACC6 329 vpmuludq 32*5-128($aap), $B2, $TEMP2 330 vpaddq $TEMP2, $ACC7, $ACC7 331 vpmuludq 32*6-128($aap), $B2, $TEMP0 332 vpaddq $TEMP0, $ACC8, $ACC8 333 vpmuludq 32*7-128($aap), $B2, $ACC0 334 vpbroadcastq 32*3-128($tpa), $B2 335 vpaddq 32*9-192($tp0), $ACC0, $ACC0 336 337 vmovdqu $ACC2, 32*2-192($tp0) 338 vmovdqu $ACC3, 32*3-192($tp0) 339 340 vpmuludq 32*2-128($ap), $B1, $TEMP2 341 vpaddq $TEMP2, $ACC4, $ACC4 342 vpmuludq 32*2-128($aap), $B1, $TEMP0 343 vpaddq $TEMP0, $ACC5, $ACC5 344 vpmuludq 32*3-128($aap), $B1, $TEMP1 345 vpaddq $TEMP1, $ACC6, $ACC6 346 vpmuludq 32*4-128($aap), $B1, $TEMP2 347 vpaddq $TEMP2, $ACC7, $ACC7 348 vpmuludq 32*5-128($aap), $B1, $TEMP0 349 vpaddq $TEMP0, $ACC8, $ACC8 350 vpmuludq 32*6-128($aap), $B1, $TEMP1 351 vpaddq $TEMP1, $ACC0, $ACC0 352 vpmuludq 32*7-128($aap), $B1, $ACC1 353 vpbroadcastq 32*4-128($tpa), $B1 354 vpaddq 32*10-448($tp1), $ACC1, $ACC1 355 356 vmovdqu $ACC4, 32*4-192($tp0) 357 vmovdqu $ACC5, 32*5-192($tp0) 358 359 vpmuludq 32*3-128($ap), $B2, $TEMP0 360 vpaddq $TEMP0, $ACC6, $ACC6 361 vpmuludq 32*3-128($aap), $B2, 
$TEMP1 362 vpaddq $TEMP1, $ACC7, $ACC7 363 vpmuludq 32*4-128($aap), $B2, $TEMP2 364 vpaddq $TEMP2, $ACC8, $ACC8 365 vpmuludq 32*5-128($aap), $B2, $TEMP0 366 vpaddq $TEMP0, $ACC0, $ACC0 367 vpmuludq 32*6-128($aap), $B2, $TEMP1 368 vpaddq $TEMP1, $ACC1, $ACC1 369 vpmuludq 32*7-128($aap), $B2, $ACC2 370 vpbroadcastq 32*5-128($tpa), $B2 371 vpaddq 32*11-448($tp1), $ACC2, $ACC2 372 373 vmovdqu $ACC6, 32*6-192($tp0) 374 vmovdqu $ACC7, 32*7-192($tp0) 375 376 vpmuludq 32*4-128($ap), $B1, $TEMP0 377 vpaddq $TEMP0, $ACC8, $ACC8 378 vpmuludq 32*4-128($aap), $B1, $TEMP1 379 vpaddq $TEMP1, $ACC0, $ACC0 380 vpmuludq 32*5-128($aap), $B1, $TEMP2 381 vpaddq $TEMP2, $ACC1, $ACC1 382 vpmuludq 32*6-128($aap), $B1, $TEMP0 383 vpaddq $TEMP0, $ACC2, $ACC2 384 vpmuludq 32*7-128($aap), $B1, $ACC3 385 vpbroadcastq 32*6-128($tpa), $B1 386 vpaddq 32*12-448($tp1), $ACC3, $ACC3 387 388 vmovdqu $ACC8, 32*8-192($tp0) 389 vmovdqu $ACC0, 32*9-192($tp0) 390 lea 8($tp0), $tp0 391 392 vpmuludq 32*5-128($ap), $B2, $TEMP2 393 vpaddq $TEMP2, $ACC1, $ACC1 394 vpmuludq 32*5-128($aap), $B2, $TEMP0 395 vpaddq $TEMP0, $ACC2, $ACC2 396 vpmuludq 32*6-128($aap), $B2, $TEMP1 397 vpaddq $TEMP1, $ACC3, $ACC3 398 vpmuludq 32*7-128($aap), $B2, $ACC4 399 vpbroadcastq 32*7-128($tpa), $B2 400 vpaddq 32*13-448($tp1), $ACC4, $ACC4 401 402 vmovdqu $ACC1, 32*10-448($tp1) 403 vmovdqu $ACC2, 32*11-448($tp1) 404 405 vpmuludq 32*6-128($ap), $B1, $TEMP0 406 vpaddq $TEMP0, $ACC3, $ACC3 407 vpmuludq 32*6-128($aap), $B1, $TEMP1 408 vpbroadcastq 32*8-128($tpa), $ACC0 # borrow $ACC0 for $B1 409 vpaddq $TEMP1, $ACC4, $ACC4 410 vpmuludq 32*7-128($aap), $B1, $ACC5 411 vpbroadcastq 32*0+8-128($tpa), $B1 # for next iteration 412 vpaddq 32*14-448($tp1), $ACC5, $ACC5 413 414 vmovdqu $ACC3, 32*12-448($tp1) 415 vmovdqu $ACC4, 32*13-448($tp1) 416 lea 8($tpa), $tpa 417 418 vpmuludq 32*7-128($ap), $B2, $TEMP0 419 vpaddq $TEMP0, $ACC5, $ACC5 420 vpmuludq 32*7-128($aap), $B2, $ACC6 421 vpaddq 32*15-448($tp1), $ACC6, $ACC6 422 423 vpmuludq 
32*8-128($ap), $ACC0, $ACC7 424 vmovdqu $ACC5, 32*14-448($tp1) 425 vpaddq 32*16-448($tp1), $ACC7, $ACC7 426 vmovdqu $ACC6, 32*15-448($tp1) 427 vmovdqu $ACC7, 32*16-448($tp1) 428 lea 8($tp1), $tp1 429 430 dec $i 431 jnz .LOOP_SQR_1024 432___ 433$ZERO = $ACC9; 434$TEMP0 = $B1; 435$TEMP2 = $B2; 436$TEMP3 = $Y1; 437$TEMP4 = $Y2; 438$code.=<<___; 439 # we need to fix indices 32-39 to avoid overflow 440 vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0), 441 vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0) 442 vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0) 443 lea 192(%rsp), $tp0 # 64+128=192 444 445 vpsrlq \$29, $ACC8, $TEMP1 446 vpand $AND_MASK, $ACC8, $ACC8 447 vpsrlq \$29, $ACC1, $TEMP2 448 vpand $AND_MASK, $ACC1, $ACC1 449 450 vpermq \$0x93, $TEMP1, $TEMP1 451 vpxor $ZERO, $ZERO, $ZERO 452 vpermq \$0x93, $TEMP2, $TEMP2 453 454 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 455 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 456 vpaddq $TEMP0, $ACC8, $ACC8 457 vpblendd \$3, $TEMP2, $ZERO, $TEMP2 458 vpaddq $TEMP1, $ACC1, $ACC1 459 vpaddq $TEMP2, $ACC2, $ACC2 460 vmovdqu $ACC1, 32*9-192($tp0) 461 vmovdqu $ACC2, 32*10-192($tp0) 462 463 mov (%rsp), %rax 464 mov 8(%rsp), $r1 465 mov 16(%rsp), $r2 466 mov 24(%rsp), $r3 467 vmovdqu 32*1(%rsp), $ACC1 468 vmovdqu 32*2-192($tp0), $ACC2 469 vmovdqu 32*3-192($tp0), $ACC3 470 vmovdqu 32*4-192($tp0), $ACC4 471 vmovdqu 32*5-192($tp0), $ACC5 472 vmovdqu 32*6-192($tp0), $ACC6 473 vmovdqu 32*7-192($tp0), $ACC7 474 475 mov %rax, $r0 476 imull $n0, %eax 477 and \$0x1fffffff, %eax 478 vmovd %eax, $Y1 479 480 mov %rax, %rdx 481 imulq -128($np), %rax 482 vpbroadcastq $Y1, $Y1 483 add %rax, $r0 484 mov %rdx, %rax 485 imulq 8-128($np), %rax 486 shr \$29, $r0 487 add %rax, $r1 488 mov %rdx, %rax 489 imulq 16-128($np), %rax 490 add $r0, $r1 491 add %rax, $r2 492 imulq 24-128($np), %rdx 493 add %rdx, $r3 494 495 mov $r1, %rax 496 imull $n0, %eax 497 and \$0x1fffffff, %eax 498 499 mov \$9, $i 500 jmp .LOOP_REDUCE_1024 501 502.align 32 503.LOOP_REDUCE_1024: 504 vmovd %eax, $Y2 505 
vpbroadcastq $Y2, $Y2 506 507 vpmuludq 32*1-128($np), $Y1, $TEMP0 508 mov %rax, %rdx 509 imulq -128($np), %rax 510 vpaddq $TEMP0, $ACC1, $ACC1 511 add %rax, $r1 512 vpmuludq 32*2-128($np), $Y1, $TEMP1 513 mov %rdx, %rax 514 imulq 8-128($np), %rax 515 vpaddq $TEMP1, $ACC2, $ACC2 516 vpmuludq 32*3-128($np), $Y1, $TEMP2 517 .byte 0x67 518 add %rax, $r2 519 .byte 0x67 520 mov %rdx, %rax 521 imulq 16-128($np), %rax 522 shr \$29, $r1 523 vpaddq $TEMP2, $ACC3, $ACC3 524 vpmuludq 32*4-128($np), $Y1, $TEMP0 525 add %rax, $r3 526 add $r1, $r2 527 vpaddq $TEMP0, $ACC4, $ACC4 528 vpmuludq 32*5-128($np), $Y1, $TEMP1 529 mov $r2, %rax 530 imull $n0, %eax 531 vpaddq $TEMP1, $ACC5, $ACC5 532 vpmuludq 32*6-128($np), $Y1, $TEMP2 533 and \$0x1fffffff, %eax 534 vpaddq $TEMP2, $ACC6, $ACC6 535 vpmuludq 32*7-128($np), $Y1, $TEMP0 536 vpaddq $TEMP0, $ACC7, $ACC7 537 vpmuludq 32*8-128($np), $Y1, $TEMP1 538 vmovd %eax, $Y1 539 #vmovdqu 32*1-8-128($np), $TEMP2 # moved below 540 vpaddq $TEMP1, $ACC8, $ACC8 541 #vmovdqu 32*2-8-128($np), $TEMP0 # moved below 542 vpbroadcastq $Y1, $Y1 543 544 vpmuludq 32*1-8-128($np), $Y2, $TEMP2 # see above 545 vmovdqu 32*3-8-128($np), $TEMP1 546 mov %rax, %rdx 547 imulq -128($np), %rax 548 vpaddq $TEMP2, $ACC1, $ACC1 549 vpmuludq 32*2-8-128($np), $Y2, $TEMP0 # see above 550 vmovdqu 32*4-8-128($np), $TEMP2 551 add %rax, $r2 552 mov %rdx, %rax 553 imulq 8-128($np), %rax 554 vpaddq $TEMP0, $ACC2, $ACC2 555 add $r3, %rax 556 shr \$29, $r2 557 vpmuludq $Y2, $TEMP1, $TEMP1 558 vmovdqu 32*5-8-128($np), $TEMP0 559 add $r2, %rax 560 vpaddq $TEMP1, $ACC3, $ACC3 561 vpmuludq $Y2, $TEMP2, $TEMP2 562 vmovdqu 32*6-8-128($np), $TEMP1 563 .byte 0x67 564 mov %rax, $r3 565 imull $n0, %eax 566 vpaddq $TEMP2, $ACC4, $ACC4 567 vpmuludq $Y2, $TEMP0, $TEMP0 568 .byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 # vmovdqu 32*7-8-128($np), $TEMP2 569 and \$0x1fffffff, %eax 570 vpaddq $TEMP0, $ACC5, $ACC5 571 vpmuludq $Y2, $TEMP1, $TEMP1 572 vmovdqu 32*8-8-128($np), $TEMP0 573 vpaddq 
$TEMP1, $ACC6, $ACC6 574 vpmuludq $Y2, $TEMP2, $TEMP2 575 vmovdqu 32*9-8-128($np), $ACC9 576 vmovd %eax, $ACC0 # borrow ACC0 for Y2 577 imulq -128($np), %rax 578 vpaddq $TEMP2, $ACC7, $ACC7 579 vpmuludq $Y2, $TEMP0, $TEMP0 580 vmovdqu 32*1-16-128($np), $TEMP1 581 vpbroadcastq $ACC0, $ACC0 582 vpaddq $TEMP0, $ACC8, $ACC8 583 vpmuludq $Y2, $ACC9, $ACC9 584 vmovdqu 32*2-16-128($np), $TEMP2 585 add %rax, $r3 586 587___ 588($ACC0,$Y2)=($Y2,$ACC0); 589$code.=<<___; 590 vmovdqu 32*1-24-128($np), $ACC0 591 vpmuludq $Y1, $TEMP1, $TEMP1 592 vmovdqu 32*3-16-128($np), $TEMP0 593 vpaddq $TEMP1, $ACC1, $ACC1 594 vpmuludq $Y2, $ACC0, $ACC0 595 vpmuludq $Y1, $TEMP2, $TEMP2 596 .byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff # vmovdqu 32*4-16-128($np), $TEMP1 597 vpaddq $ACC1, $ACC0, $ACC0 598 vpaddq $TEMP2, $ACC2, $ACC2 599 vpmuludq $Y1, $TEMP0, $TEMP0 600 vmovdqu 32*5-16-128($np), $TEMP2 601 .byte 0x67 602 vmovq $ACC0, %rax 603 vmovdqu $ACC0, (%rsp) # transfer $r0-$r3 604 vpaddq $TEMP0, $ACC3, $ACC3 605 vpmuludq $Y1, $TEMP1, $TEMP1 606 vmovdqu 32*6-16-128($np), $TEMP0 607 vpaddq $TEMP1, $ACC4, $ACC4 608 vpmuludq $Y1, $TEMP2, $TEMP2 609 vmovdqu 32*7-16-128($np), $TEMP1 610 vpaddq $TEMP2, $ACC5, $ACC5 611 vpmuludq $Y1, $TEMP0, $TEMP0 612 vmovdqu 32*8-16-128($np), $TEMP2 613 vpaddq $TEMP0, $ACC6, $ACC6 614 vpmuludq $Y1, $TEMP1, $TEMP1 615 shr \$29, $r3 616 vmovdqu 32*9-16-128($np), $TEMP0 617 add $r3, %rax 618 vpaddq $TEMP1, $ACC7, $ACC7 619 vpmuludq $Y1, $TEMP2, $TEMP2 620 #vmovdqu 32*2-24-128($np), $TEMP1 # moved below 621 mov %rax, $r0 622 imull $n0, %eax 623 vpaddq $TEMP2, $ACC8, $ACC8 624 vpmuludq $Y1, $TEMP0, $TEMP0 625 and \$0x1fffffff, %eax 626 vmovd %eax, $Y1 627 vmovdqu 32*3-24-128($np), $TEMP2 628 .byte 0x67 629 vpaddq $TEMP0, $ACC9, $ACC9 630 vpbroadcastq $Y1, $Y1 631 632 vpmuludq 32*2-24-128($np), $Y2, $TEMP1 # see above 633 vmovdqu 32*4-24-128($np), $TEMP0 634 mov %rax, %rdx 635 imulq -128($np), %rax 636 mov 8(%rsp), $r1 637 vpaddq $TEMP1, $ACC2, $ACC1 638 vpmuludq 
$Y2, $TEMP2, $TEMP2 639 vmovdqu 32*5-24-128($np), $TEMP1 640 add %rax, $r0 641 mov %rdx, %rax 642 imulq 8-128($np), %rax 643 .byte 0x67 644 shr \$29, $r0 645 mov 16(%rsp), $r2 646 vpaddq $TEMP2, $ACC3, $ACC2 647 vpmuludq $Y2, $TEMP0, $TEMP0 648 vmovdqu 32*6-24-128($np), $TEMP2 649 add %rax, $r1 650 mov %rdx, %rax 651 imulq 16-128($np), %rax 652 vpaddq $TEMP0, $ACC4, $ACC3 653 vpmuludq $Y2, $TEMP1, $TEMP1 654 vmovdqu 32*7-24-128($np), $TEMP0 655 imulq 24-128($np), %rdx # future $r3 656 add %rax, $r2 657 lea ($r0,$r1), %rax 658 vpaddq $TEMP1, $ACC5, $ACC4 659 vpmuludq $Y2, $TEMP2, $TEMP2 660 vmovdqu 32*8-24-128($np), $TEMP1 661 mov %rax, $r1 662 imull $n0, %eax 663 vpmuludq $Y2, $TEMP0, $TEMP0 664 vpaddq $TEMP2, $ACC6, $ACC5 665 vmovdqu 32*9-24-128($np), $TEMP2 666 and \$0x1fffffff, %eax 667 vpaddq $TEMP0, $ACC7, $ACC6 668 vpmuludq $Y2, $TEMP1, $TEMP1 669 add 24(%rsp), %rdx 670 vpaddq $TEMP1, $ACC8, $ACC7 671 vpmuludq $Y2, $TEMP2, $TEMP2 672 vpaddq $TEMP2, $ACC9, $ACC8 673 vmovq $r3, $ACC9 674 mov %rdx, $r3 675 676 dec $i 677 jnz .LOOP_REDUCE_1024 678___ 679($ACC0,$Y2)=($Y2,$ACC0); 680$code.=<<___; 681 lea 448(%rsp), $tp1 # size optimization 682 vpaddq $ACC9, $Y2, $ACC0 683 vpxor $ZERO, $ZERO, $ZERO 684 685 vpaddq 32*9-192($tp0), $ACC0, $ACC0 686 vpaddq 32*10-448($tp1), $ACC1, $ACC1 687 vpaddq 32*11-448($tp1), $ACC2, $ACC2 688 vpaddq 32*12-448($tp1), $ACC3, $ACC3 689 vpaddq 32*13-448($tp1), $ACC4, $ACC4 690 vpaddq 32*14-448($tp1), $ACC5, $ACC5 691 vpaddq 32*15-448($tp1), $ACC6, $ACC6 692 vpaddq 32*16-448($tp1), $ACC7, $ACC7 693 vpaddq 32*17-448($tp1), $ACC8, $ACC8 694 695 vpsrlq \$29, $ACC0, $TEMP1 696 vpand $AND_MASK, $ACC0, $ACC0 697 vpsrlq \$29, $ACC1, $TEMP2 698 vpand $AND_MASK, $ACC1, $ACC1 699 vpsrlq \$29, $ACC2, $TEMP3 700 vpermq \$0x93, $TEMP1, $TEMP1 701 vpand $AND_MASK, $ACC2, $ACC2 702 vpsrlq \$29, $ACC3, $TEMP4 703 vpermq \$0x93, $TEMP2, $TEMP2 704 vpand $AND_MASK, $ACC3, $ACC3 705 vpermq \$0x93, $TEMP3, $TEMP3 706 707 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 
708 vpermq \$0x93, $TEMP4, $TEMP4 709 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 710 vpaddq $TEMP0, $ACC0, $ACC0 711 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 712 vpaddq $TEMP1, $ACC1, $ACC1 713 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 714 vpaddq $TEMP2, $ACC2, $ACC2 715 vpblendd \$3, $TEMP4, $ZERO, $TEMP4 716 vpaddq $TEMP3, $ACC3, $ACC3 717 vpaddq $TEMP4, $ACC4, $ACC4 718 719 vpsrlq \$29, $ACC0, $TEMP1 720 vpand $AND_MASK, $ACC0, $ACC0 721 vpsrlq \$29, $ACC1, $TEMP2 722 vpand $AND_MASK, $ACC1, $ACC1 723 vpsrlq \$29, $ACC2, $TEMP3 724 vpermq \$0x93, $TEMP1, $TEMP1 725 vpand $AND_MASK, $ACC2, $ACC2 726 vpsrlq \$29, $ACC3, $TEMP4 727 vpermq \$0x93, $TEMP2, $TEMP2 728 vpand $AND_MASK, $ACC3, $ACC3 729 vpermq \$0x93, $TEMP3, $TEMP3 730 731 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 732 vpermq \$0x93, $TEMP4, $TEMP4 733 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 734 vpaddq $TEMP0, $ACC0, $ACC0 735 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 736 vpaddq $TEMP1, $ACC1, $ACC1 737 vmovdqu $ACC0, 32*0-128($rp) 738 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 739 vpaddq $TEMP2, $ACC2, $ACC2 740 vmovdqu $ACC1, 32*1-128($rp) 741 vpblendd \$3, $TEMP4, $ZERO, $TEMP4 742 vpaddq $TEMP3, $ACC3, $ACC3 743 vmovdqu $ACC2, 32*2-128($rp) 744 vpaddq $TEMP4, $ACC4, $ACC4 745 vmovdqu $ACC3, 32*3-128($rp) 746___ 747$TEMP5=$ACC0; 748$code.=<<___; 749 vpsrlq \$29, $ACC4, $TEMP1 750 vpand $AND_MASK, $ACC4, $ACC4 751 vpsrlq \$29, $ACC5, $TEMP2 752 vpand $AND_MASK, $ACC5, $ACC5 753 vpsrlq \$29, $ACC6, $TEMP3 754 vpermq \$0x93, $TEMP1, $TEMP1 755 vpand $AND_MASK, $ACC6, $ACC6 756 vpsrlq \$29, $ACC7, $TEMP4 757 vpermq \$0x93, $TEMP2, $TEMP2 758 vpand $AND_MASK, $ACC7, $ACC7 759 vpsrlq \$29, $ACC8, $TEMP5 760 vpermq \$0x93, $TEMP3, $TEMP3 761 vpand $AND_MASK, $ACC8, $ACC8 762 vpermq \$0x93, $TEMP4, $TEMP4 763 764 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 765 vpermq \$0x93, $TEMP5, $TEMP5 766 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 767 vpaddq $TEMP0, $ACC4, $ACC4 768 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 769 vpaddq $TEMP1, $ACC5, $ACC5 770 vpblendd \$3, 
$TEMP3, $TEMP4, $TEMP3 771 vpaddq $TEMP2, $ACC6, $ACC6 772 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 773 vpaddq $TEMP3, $ACC7, $ACC7 774 vpaddq $TEMP4, $ACC8, $ACC8 775 776 vpsrlq \$29, $ACC4, $TEMP1 777 vpand $AND_MASK, $ACC4, $ACC4 778 vpsrlq \$29, $ACC5, $TEMP2 779 vpand $AND_MASK, $ACC5, $ACC5 780 vpsrlq \$29, $ACC6, $TEMP3 781 vpermq \$0x93, $TEMP1, $TEMP1 782 vpand $AND_MASK, $ACC6, $ACC6 783 vpsrlq \$29, $ACC7, $TEMP4 784 vpermq \$0x93, $TEMP2, $TEMP2 785 vpand $AND_MASK, $ACC7, $ACC7 786 vpsrlq \$29, $ACC8, $TEMP5 787 vpermq \$0x93, $TEMP3, $TEMP3 788 vpand $AND_MASK, $ACC8, $ACC8 789 vpermq \$0x93, $TEMP4, $TEMP4 790 791 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 792 vpermq \$0x93, $TEMP5, $TEMP5 793 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 794 vpaddq $TEMP0, $ACC4, $ACC4 795 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 796 vpaddq $TEMP1, $ACC5, $ACC5 797 vmovdqu $ACC4, 32*4-128($rp) 798 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 799 vpaddq $TEMP2, $ACC6, $ACC6 800 vmovdqu $ACC5, 32*5-128($rp) 801 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 802 vpaddq $TEMP3, $ACC7, $ACC7 803 vmovdqu $ACC6, 32*6-128($rp) 804 vpaddq $TEMP4, $ACC8, $ACC8 805 vmovdqu $ACC7, 32*7-128($rp) 806 vmovdqu $ACC8, 32*8-128($rp) 807 808 mov $rp, $ap 809 dec $rep 810 jne .LOOP_GRANDE_SQR_1024 811 812 vzeroall 813 mov %rbp, %rax 814.cfi_def_cfa_register %rax 815___ 816$code.=<<___ if ($win64); 817.Lsqr_1024_in_tail: 818 movaps -0xd8(%rax),%xmm6 819 movaps -0xc8(%rax),%xmm7 820 movaps -0xb8(%rax),%xmm8 821 movaps -0xa8(%rax),%xmm9 822 movaps -0x98(%rax),%xmm10 823 movaps -0x88(%rax),%xmm11 824 movaps -0x78(%rax),%xmm12 825 movaps -0x68(%rax),%xmm13 826 movaps -0x58(%rax),%xmm14 827 movaps -0x48(%rax),%xmm15 828___ 829$code.=<<___; 830 mov -48(%rax),%r15 831.cfi_restore %r15 832 mov -40(%rax),%r14 833.cfi_restore %r14 834 mov -32(%rax),%r13 835.cfi_restore %r13 836 mov -24(%rax),%r12 837.cfi_restore %r12 838 mov -16(%rax),%rbp 839.cfi_restore %rbp 840 mov -8(%rax),%rbx 841.cfi_restore %rbx 842 lea (%rax),%rsp # restore %rsp 
843.cfi_def_cfa_register %rsp 844.Lsqr_1024_epilogue: 845 ret 846.cfi_endproc 847.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 848___ 849} 850 851{ # void AMM_WW( 852my $rp="%rdi"; # BN_ULONG *rp, 853my $ap="%rsi"; # const BN_ULONG *ap, 854my $bp="%rdx"; # const BN_ULONG *bp, 855my $np="%rcx"; # const BN_ULONG *np, 856my $n0="%r8d"; # unsigned int n0); 857 858# The registers that hold the accumulated redundant result 859# The AMM works on 1024 bit operands, and redundant word size is 29 860# Therefore: ceil(1024/29)/4 = 9 861my $ACC0="%ymm0"; 862my $ACC1="%ymm1"; 863my $ACC2="%ymm2"; 864my $ACC3="%ymm3"; 865my $ACC4="%ymm4"; 866my $ACC5="%ymm5"; 867my $ACC6="%ymm6"; 868my $ACC7="%ymm7"; 869my $ACC8="%ymm8"; 870my $ACC9="%ymm9"; 871 872# Registers that hold the broadcasted words of multiplier, currently used 873my $Bi="%ymm10"; 874my $Yi="%ymm11"; 875 876# Helper registers 877my $TEMP0=$ACC0; 878my $TEMP1="%ymm12"; 879my $TEMP2="%ymm13"; 880my $ZERO="%ymm14"; 881my $AND_MASK="%ymm15"; 882 883# alu registers that hold the first words of the ACC 884my $r0="%r9"; 885my $r1="%r10"; 886my $r2="%r11"; 887my $r3="%r12"; 888 889my $i="%r14d"; 890my $tmp="%r15"; 891 892$bp="%r13"; # reassigned argument 893 894$code.=<<___; 895.globl rsaz_1024_mul_avx2 896.type rsaz_1024_mul_avx2,\@function,5 897.align 64 898rsaz_1024_mul_avx2: 899.cfi_startproc 900 lea (%rsp), %rax 901.cfi_def_cfa_register %rax 902 push %rbx 903.cfi_push %rbx 904 push %rbp 905.cfi_push %rbp 906 push %r12 907.cfi_push %r12 908 push %r13 909.cfi_push %r13 910 push %r14 911.cfi_push %r14 912 push %r15 913.cfi_push %r15 914___ 915$code.=<<___ if ($win64); 916 vzeroupper 917 lea -0xa8(%rsp),%rsp 918 vmovaps %xmm6,-0xd8(%rax) 919 vmovaps %xmm7,-0xc8(%rax) 920 vmovaps %xmm8,-0xb8(%rax) 921 vmovaps %xmm9,-0xa8(%rax) 922 vmovaps %xmm10,-0x98(%rax) 923 vmovaps %xmm11,-0x88(%rax) 924 vmovaps %xmm12,-0x78(%rax) 925 vmovaps %xmm13,-0x68(%rax) 926 vmovaps %xmm14,-0x58(%rax) 927 vmovaps %xmm15,-0x48(%rax) 
928.Lmul_1024_body: 929___ 930$code.=<<___; 931 mov %rax,%rbp 932.cfi_def_cfa_register %rbp 933 vzeroall 934 mov %rdx, $bp # reassigned argument 935 sub \$64,%rsp 936 937 # unaligned 256-bit load that crosses page boundary can 938 # cause severe performance degradation here, so if $ap does 939 # cross page boundary, swap it with $bp [meaning that caller 940 # is advised to lay down $ap and $bp next to each other, so 941 # that only one can cross page boundary]. 942 .byte 0x67,0x67 943 mov $ap, $tmp 944 and \$4095, $tmp 945 add \$32*10, $tmp 946 shr \$12, $tmp 947 mov $ap, $tmp 948 cmovnz $bp, $ap 949 cmovnz $tmp, $bp 950 951 mov $np, $tmp 952 sub \$-128,$ap # size optimization 953 sub \$-128,$np 954 sub \$-128,$rp 955 956 and \$4095, $tmp # see if $np crosses page 957 add \$32*10, $tmp 958 .byte 0x67,0x67 959 shr \$12, $tmp 960 jz .Lmul_1024_no_n_copy 961 962 # unaligned 256-bit load that crosses page boundary can 963 # cause severe performance degradation here, so if $np does 964 # cross page boundary, copy it to stack and make sure stack 965 # frame doesn't... 
966 sub \$32*10,%rsp 967 vmovdqu 32*0-128($np), $ACC0 968 and \$-512, %rsp 969 vmovdqu 32*1-128($np), $ACC1 970 vmovdqu 32*2-128($np), $ACC2 971 vmovdqu 32*3-128($np), $ACC3 972 vmovdqu 32*4-128($np), $ACC4 973 vmovdqu 32*5-128($np), $ACC5 974 vmovdqu 32*6-128($np), $ACC6 975 vmovdqu 32*7-128($np), $ACC7 976 vmovdqu 32*8-128($np), $ACC8 977 lea 64+128(%rsp),$np 978 vmovdqu $ACC0, 32*0-128($np) 979 vpxor $ACC0, $ACC0, $ACC0 980 vmovdqu $ACC1, 32*1-128($np) 981 vpxor $ACC1, $ACC1, $ACC1 982 vmovdqu $ACC2, 32*2-128($np) 983 vpxor $ACC2, $ACC2, $ACC2 984 vmovdqu $ACC3, 32*3-128($np) 985 vpxor $ACC3, $ACC3, $ACC3 986 vmovdqu $ACC4, 32*4-128($np) 987 vpxor $ACC4, $ACC4, $ACC4 988 vmovdqu $ACC5, 32*5-128($np) 989 vpxor $ACC5, $ACC5, $ACC5 990 vmovdqu $ACC6, 32*6-128($np) 991 vpxor $ACC6, $ACC6, $ACC6 992 vmovdqu $ACC7, 32*7-128($np) 993 vpxor $ACC7, $ACC7, $ACC7 994 vmovdqu $ACC8, 32*8-128($np) 995 vmovdqa $ACC0, $ACC8 996 vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero after vzeroall 997.Lmul_1024_no_n_copy: 998 and \$-64,%rsp 999 1000 mov ($bp), %rbx 1001 vpbroadcastq ($bp), $Bi 1002 vmovdqu $ACC0, (%rsp) # clear top of stack 1003 xor $r0, $r0 1004 .byte 0x67 1005 xor $r1, $r1 1006 xor $r2, $r2 1007 xor $r3, $r3 1008 1009 vmovdqu .Land_mask(%rip), $AND_MASK 1010 mov \$9, $i 1011 vmovdqu $ACC9, 32*9-128($rp) # $ACC9 is zero after vzeroall 1012 jmp .Loop_mul_1024 1013 1014.align 32 1015.Loop_mul_1024: 1016 vpsrlq \$29, $ACC3, $ACC9 # correct $ACC3(*) 1017 mov %rbx, %rax 1018 imulq -128($ap), %rax 1019 add $r0, %rax 1020 mov %rbx, $r1 1021 imulq 8-128($ap), $r1 1022 add 8(%rsp), $r1 1023 1024 mov %rax, $r0 1025 imull $n0, %eax 1026 and \$0x1fffffff, %eax 1027 1028 mov %rbx, $r2 1029 imulq 16-128($ap), $r2 1030 add 16(%rsp), $r2 1031 1032 mov %rbx, $r3 1033 imulq 24-128($ap), $r3 1034 add 24(%rsp), $r3 1035 vpmuludq 32*1-128($ap),$Bi,$TEMP0 1036 vmovd %eax, $Yi 1037 vpaddq $TEMP0,$ACC1,$ACC1 1038 vpmuludq 32*2-128($ap),$Bi,$TEMP1 1039 vpbroadcastq $Yi, $Yi 1040 vpaddq 
$TEMP1,$ACC2,$ACC2 1041 vpmuludq 32*3-128($ap),$Bi,$TEMP2 1042 vpand $AND_MASK, $ACC3, $ACC3 # correct $ACC3 1043 vpaddq $TEMP2,$ACC3,$ACC3 1044 vpmuludq 32*4-128($ap),$Bi,$TEMP0 1045 vpaddq $TEMP0,$ACC4,$ACC4 1046 vpmuludq 32*5-128($ap),$Bi,$TEMP1 1047 vpaddq $TEMP1,$ACC5,$ACC5 1048 vpmuludq 32*6-128($ap),$Bi,$TEMP2 1049 vpaddq $TEMP2,$ACC6,$ACC6 1050 vpmuludq 32*7-128($ap),$Bi,$TEMP0 1051 vpermq \$0x93, $ACC9, $ACC9 # correct $ACC3 1052 vpaddq $TEMP0,$ACC7,$ACC7 1053 vpmuludq 32*8-128($ap),$Bi,$TEMP1 1054 vpbroadcastq 8($bp), $Bi 1055 vpaddq $TEMP1,$ACC8,$ACC8 1056 1057 mov %rax,%rdx 1058 imulq -128($np),%rax 1059 add %rax,$r0 1060 mov %rdx,%rax 1061 imulq 8-128($np),%rax 1062 add %rax,$r1 1063 mov %rdx,%rax 1064 imulq 16-128($np),%rax 1065 add %rax,$r2 1066 shr \$29, $r0 1067 imulq 24-128($np),%rdx 1068 add %rdx,$r3 1069 add $r0, $r1 1070 1071 vpmuludq 32*1-128($np),$Yi,$TEMP2 1072 vmovq $Bi, %rbx 1073 vpaddq $TEMP2,$ACC1,$ACC1 1074 vpmuludq 32*2-128($np),$Yi,$TEMP0 1075 vpaddq $TEMP0,$ACC2,$ACC2 1076 vpmuludq 32*3-128($np),$Yi,$TEMP1 1077 vpaddq $TEMP1,$ACC3,$ACC3 1078 vpmuludq 32*4-128($np),$Yi,$TEMP2 1079 vpaddq $TEMP2,$ACC4,$ACC4 1080 vpmuludq 32*5-128($np),$Yi,$TEMP0 1081 vpaddq $TEMP0,$ACC5,$ACC5 1082 vpmuludq 32*6-128($np),$Yi,$TEMP1 1083 vpaddq $TEMP1,$ACC6,$ACC6 1084 vpmuludq 32*7-128($np),$Yi,$TEMP2 1085 vpblendd \$3, $ZERO, $ACC9, $ACC9 # correct $ACC3 1086 vpaddq $TEMP2,$ACC7,$ACC7 1087 vpmuludq 32*8-128($np),$Yi,$TEMP0 1088 vpaddq $ACC9, $ACC3, $ACC3 # correct $ACC3 1089 vpaddq $TEMP0,$ACC8,$ACC8 1090 1091 mov %rbx, %rax 1092 imulq -128($ap),%rax 1093 add %rax,$r1 1094 vmovdqu -8+32*1-128($ap),$TEMP1 1095 mov %rbx, %rax 1096 imulq 8-128($ap),%rax 1097 add %rax,$r2 1098 vmovdqu -8+32*2-128($ap),$TEMP2 1099 1100 mov $r1, %rax 1101 imull $n0, %eax 1102 and \$0x1fffffff, %eax 1103 1104 imulq 16-128($ap),%rbx 1105 add %rbx,$r3 1106 vpmuludq $Bi,$TEMP1,$TEMP1 1107 vmovd %eax, $Yi 1108 vmovdqu -8+32*3-128($ap),$TEMP0 1109 vpaddq $TEMP1,$ACC1,$ACC1 1110 
vpmuludq $Bi,$TEMP2,$TEMP2 1111 vpbroadcastq $Yi, $Yi 1112 vmovdqu -8+32*4-128($ap),$TEMP1 1113 vpaddq $TEMP2,$ACC2,$ACC2 1114 vpmuludq $Bi,$TEMP0,$TEMP0 1115 vmovdqu -8+32*5-128($ap),$TEMP2 1116 vpaddq $TEMP0,$ACC3,$ACC3 1117 vpmuludq $Bi,$TEMP1,$TEMP1 1118 vmovdqu -8+32*6-128($ap),$TEMP0 1119 vpaddq $TEMP1,$ACC4,$ACC4 1120 vpmuludq $Bi,$TEMP2,$TEMP2 1121 vmovdqu -8+32*7-128($ap),$TEMP1 1122 vpaddq $TEMP2,$ACC5,$ACC5 1123 vpmuludq $Bi,$TEMP0,$TEMP0 1124 vmovdqu -8+32*8-128($ap),$TEMP2 1125 vpaddq $TEMP0,$ACC6,$ACC6 1126 vpmuludq $Bi,$TEMP1,$TEMP1 1127 vmovdqu -8+32*9-128($ap),$ACC9 1128 vpaddq $TEMP1,$ACC7,$ACC7 1129 vpmuludq $Bi,$TEMP2,$TEMP2 1130 vpaddq $TEMP2,$ACC8,$ACC8 1131 vpmuludq $Bi,$ACC9,$ACC9 1132 vpbroadcastq 16($bp), $Bi 1133 1134 mov %rax,%rdx 1135 imulq -128($np),%rax 1136 add %rax,$r1 1137 vmovdqu -8+32*1-128($np),$TEMP0 1138 mov %rdx,%rax 1139 imulq 8-128($np),%rax 1140 add %rax,$r2 1141 vmovdqu -8+32*2-128($np),$TEMP1 1142 shr \$29, $r1 1143 imulq 16-128($np),%rdx 1144 add %rdx,$r3 1145 add $r1, $r2 1146 1147 vpmuludq $Yi,$TEMP0,$TEMP0 1148 vmovq $Bi, %rbx 1149 vmovdqu -8+32*3-128($np),$TEMP2 1150 vpaddq $TEMP0,$ACC1,$ACC1 1151 vpmuludq $Yi,$TEMP1,$TEMP1 1152 vmovdqu -8+32*4-128($np),$TEMP0 1153 vpaddq $TEMP1,$ACC2,$ACC2 1154 vpmuludq $Yi,$TEMP2,$TEMP2 1155 vmovdqu -8+32*5-128($np),$TEMP1 1156 vpaddq $TEMP2,$ACC3,$ACC3 1157 vpmuludq $Yi,$TEMP0,$TEMP0 1158 vmovdqu -8+32*6-128($np),$TEMP2 1159 vpaddq $TEMP0,$ACC4,$ACC4 1160 vpmuludq $Yi,$TEMP1,$TEMP1 1161 vmovdqu -8+32*7-128($np),$TEMP0 1162 vpaddq $TEMP1,$ACC5,$ACC5 1163 vpmuludq $Yi,$TEMP2,$TEMP2 1164 vmovdqu -8+32*8-128($np),$TEMP1 1165 vpaddq $TEMP2,$ACC6,$ACC6 1166 vpmuludq $Yi,$TEMP0,$TEMP0 1167 vmovdqu -8+32*9-128($np),$TEMP2 1168 vpaddq $TEMP0,$ACC7,$ACC7 1169 vpmuludq $Yi,$TEMP1,$TEMP1 1170 vpaddq $TEMP1,$ACC8,$ACC8 1171 vpmuludq $Yi,$TEMP2,$TEMP2 1172 vpaddq $TEMP2,$ACC9,$ACC9 1173 1174 vmovdqu -16+32*1-128($ap),$TEMP0 1175 mov %rbx,%rax 1176 imulq -128($ap),%rax 1177 add $r2,%rax 1178 
1179 vmovdqu -16+32*2-128($ap),$TEMP1 1180 mov %rax,$r2 1181 imull $n0, %eax 1182 and \$0x1fffffff, %eax 1183 1184 imulq 8-128($ap),%rbx 1185 add %rbx,$r3 1186 vpmuludq $Bi,$TEMP0,$TEMP0 1187 vmovd %eax, $Yi 1188 vmovdqu -16+32*3-128($ap),$TEMP2 1189 vpaddq $TEMP0,$ACC1,$ACC1 1190 vpmuludq $Bi,$TEMP1,$TEMP1 1191 vpbroadcastq $Yi, $Yi 1192 vmovdqu -16+32*4-128($ap),$TEMP0 1193 vpaddq $TEMP1,$ACC2,$ACC2 1194 vpmuludq $Bi,$TEMP2,$TEMP2 1195 vmovdqu -16+32*5-128($ap),$TEMP1 1196 vpaddq $TEMP2,$ACC3,$ACC3 1197 vpmuludq $Bi,$TEMP0,$TEMP0 1198 vmovdqu -16+32*6-128($ap),$TEMP2 1199 vpaddq $TEMP0,$ACC4,$ACC4 1200 vpmuludq $Bi,$TEMP1,$TEMP1 1201 vmovdqu -16+32*7-128($ap),$TEMP0 1202 vpaddq $TEMP1,$ACC5,$ACC5 1203 vpmuludq $Bi,$TEMP2,$TEMP2 1204 vmovdqu -16+32*8-128($ap),$TEMP1 1205 vpaddq $TEMP2,$ACC6,$ACC6 1206 vpmuludq $Bi,$TEMP0,$TEMP0 1207 vmovdqu -16+32*9-128($ap),$TEMP2 1208 vpaddq $TEMP0,$ACC7,$ACC7 1209 vpmuludq $Bi,$TEMP1,$TEMP1 1210 vpaddq $TEMP1,$ACC8,$ACC8 1211 vpmuludq $Bi,$TEMP2,$TEMP2 1212 vpbroadcastq 24($bp), $Bi 1213 vpaddq $TEMP2,$ACC9,$ACC9 1214 1215 vmovdqu -16+32*1-128($np),$TEMP0 1216 mov %rax,%rdx 1217 imulq -128($np),%rax 1218 add %rax,$r2 1219 vmovdqu -16+32*2-128($np),$TEMP1 1220 imulq 8-128($np),%rdx 1221 add %rdx,$r3 1222 shr \$29, $r2 1223 1224 vpmuludq $Yi,$TEMP0,$TEMP0 1225 vmovq $Bi, %rbx 1226 vmovdqu -16+32*3-128($np),$TEMP2 1227 vpaddq $TEMP0,$ACC1,$ACC1 1228 vpmuludq $Yi,$TEMP1,$TEMP1 1229 vmovdqu -16+32*4-128($np),$TEMP0 1230 vpaddq $TEMP1,$ACC2,$ACC2 1231 vpmuludq $Yi,$TEMP2,$TEMP2 1232 vmovdqu -16+32*5-128($np),$TEMP1 1233 vpaddq $TEMP2,$ACC3,$ACC3 1234 vpmuludq $Yi,$TEMP0,$TEMP0 1235 vmovdqu -16+32*6-128($np),$TEMP2 1236 vpaddq $TEMP0,$ACC4,$ACC4 1237 vpmuludq $Yi,$TEMP1,$TEMP1 1238 vmovdqu -16+32*7-128($np),$TEMP0 1239 vpaddq $TEMP1,$ACC5,$ACC5 1240 vpmuludq $Yi,$TEMP2,$TEMP2 1241 vmovdqu -16+32*8-128($np),$TEMP1 1242 vpaddq $TEMP2,$ACC6,$ACC6 1243 vpmuludq $Yi,$TEMP0,$TEMP0 1244 vmovdqu -16+32*9-128($np),$TEMP2 1245 vpaddq 
$TEMP0,$ACC7,$ACC7 1246 vpmuludq $Yi,$TEMP1,$TEMP1 1247 vmovdqu -24+32*1-128($ap),$TEMP0 1248 vpaddq $TEMP1,$ACC8,$ACC8 1249 vpmuludq $Yi,$TEMP2,$TEMP2 1250 vmovdqu -24+32*2-128($ap),$TEMP1 1251 vpaddq $TEMP2,$ACC9,$ACC9 1252 1253 add $r2, $r3 1254 imulq -128($ap),%rbx 1255 add %rbx,$r3 1256 1257 mov $r3, %rax 1258 imull $n0, %eax 1259 and \$0x1fffffff, %eax 1260 1261 vpmuludq $Bi,$TEMP0,$TEMP0 1262 vmovd %eax, $Yi 1263 vmovdqu -24+32*3-128($ap),$TEMP2 1264 vpaddq $TEMP0,$ACC1,$ACC1 1265 vpmuludq $Bi,$TEMP1,$TEMP1 1266 vpbroadcastq $Yi, $Yi 1267 vmovdqu -24+32*4-128($ap),$TEMP0 1268 vpaddq $TEMP1,$ACC2,$ACC2 1269 vpmuludq $Bi,$TEMP2,$TEMP2 1270 vmovdqu -24+32*5-128($ap),$TEMP1 1271 vpaddq $TEMP2,$ACC3,$ACC3 1272 vpmuludq $Bi,$TEMP0,$TEMP0 1273 vmovdqu -24+32*6-128($ap),$TEMP2 1274 vpaddq $TEMP0,$ACC4,$ACC4 1275 vpmuludq $Bi,$TEMP1,$TEMP1 1276 vmovdqu -24+32*7-128($ap),$TEMP0 1277 vpaddq $TEMP1,$ACC5,$ACC5 1278 vpmuludq $Bi,$TEMP2,$TEMP2 1279 vmovdqu -24+32*8-128($ap),$TEMP1 1280 vpaddq $TEMP2,$ACC6,$ACC6 1281 vpmuludq $Bi,$TEMP0,$TEMP0 1282 vmovdqu -24+32*9-128($ap),$TEMP2 1283 vpaddq $TEMP0,$ACC7,$ACC7 1284 vpmuludq $Bi,$TEMP1,$TEMP1 1285 vpaddq $TEMP1,$ACC8,$ACC8 1286 vpmuludq $Bi,$TEMP2,$TEMP2 1287 vpbroadcastq 32($bp), $Bi 1288 vpaddq $TEMP2,$ACC9,$ACC9 1289 add \$32, $bp # $bp++ 1290 1291 vmovdqu -24+32*1-128($np),$TEMP0 1292 imulq -128($np),%rax 1293 add %rax,$r3 1294 shr \$29, $r3 1295 1296 vmovdqu -24+32*2-128($np),$TEMP1 1297 vpmuludq $Yi,$TEMP0,$TEMP0 1298 vmovq $Bi, %rbx 1299 vmovdqu -24+32*3-128($np),$TEMP2 1300 vpaddq $TEMP0,$ACC1,$ACC0 # $ACC0==$TEMP0 1301 vpmuludq $Yi,$TEMP1,$TEMP1 1302 vmovdqu $ACC0, (%rsp) # transfer $r0-$r3 1303 vpaddq $TEMP1,$ACC2,$ACC1 1304 vmovdqu -24+32*4-128($np),$TEMP0 1305 vpmuludq $Yi,$TEMP2,$TEMP2 1306 vmovdqu -24+32*5-128($np),$TEMP1 1307 vpaddq $TEMP2,$ACC3,$ACC2 1308 vpmuludq $Yi,$TEMP0,$TEMP0 1309 vmovdqu -24+32*6-128($np),$TEMP2 1310 vpaddq $TEMP0,$ACC4,$ACC3 1311 vpmuludq $Yi,$TEMP1,$TEMP1 1312 vmovdqu 
-24+32*7-128($np),$TEMP0 1313 vpaddq $TEMP1,$ACC5,$ACC4 1314 vpmuludq $Yi,$TEMP2,$TEMP2 1315 vmovdqu -24+32*8-128($np),$TEMP1 1316 vpaddq $TEMP2,$ACC6,$ACC5 1317 vpmuludq $Yi,$TEMP0,$TEMP0 1318 vmovdqu -24+32*9-128($np),$TEMP2 1319 mov $r3, $r0 1320 vpaddq $TEMP0,$ACC7,$ACC6 1321 vpmuludq $Yi,$TEMP1,$TEMP1 1322 add (%rsp), $r0 1323 vpaddq $TEMP1,$ACC8,$ACC7 1324 vpmuludq $Yi,$TEMP2,$TEMP2 1325 vmovq $r3, $TEMP1 1326 vpaddq $TEMP2,$ACC9,$ACC8 1327 1328 dec $i 1329 jnz .Loop_mul_1024 1330___ 1331 1332# (*) Original implementation was correcting ACC1-ACC3 for overflow 1333# after 7 loop runs, or after 28 iterations, or 56 additions. 1334# But as we underutilize resources, it's possible to correct in 1335# each iteration with marginal performance loss. But then, as 1336# we do it in each iteration, we can correct less digits, and 1337# avoid performance penalties completely. Also note that we 1338# correct only three digits out of four. This works because 1339# most significant digit is subjected to less additions. 
1340 1341$TEMP0 = $ACC9; 1342$TEMP3 = $Bi; 1343$TEMP4 = $Yi; 1344$code.=<<___; 1345 vpermq \$0, $AND_MASK, $AND_MASK 1346 vpaddq (%rsp), $TEMP1, $ACC0 1347 1348 vpsrlq \$29, $ACC0, $TEMP1 1349 vpand $AND_MASK, $ACC0, $ACC0 1350 vpsrlq \$29, $ACC1, $TEMP2 1351 vpand $AND_MASK, $ACC1, $ACC1 1352 vpsrlq \$29, $ACC2, $TEMP3 1353 vpermq \$0x93, $TEMP1, $TEMP1 1354 vpand $AND_MASK, $ACC2, $ACC2 1355 vpsrlq \$29, $ACC3, $TEMP4 1356 vpermq \$0x93, $TEMP2, $TEMP2 1357 vpand $AND_MASK, $ACC3, $ACC3 1358 1359 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 1360 vpermq \$0x93, $TEMP3, $TEMP3 1361 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 1362 vpermq \$0x93, $TEMP4, $TEMP4 1363 vpaddq $TEMP0, $ACC0, $ACC0 1364 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 1365 vpaddq $TEMP1, $ACC1, $ACC1 1366 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 1367 vpaddq $TEMP2, $ACC2, $ACC2 1368 vpblendd \$3, $TEMP4, $ZERO, $TEMP4 1369 vpaddq $TEMP3, $ACC3, $ACC3 1370 vpaddq $TEMP4, $ACC4, $ACC4 1371 1372 vpsrlq \$29, $ACC0, $TEMP1 1373 vpand $AND_MASK, $ACC0, $ACC0 1374 vpsrlq \$29, $ACC1, $TEMP2 1375 vpand $AND_MASK, $ACC1, $ACC1 1376 vpsrlq \$29, $ACC2, $TEMP3 1377 vpermq \$0x93, $TEMP1, $TEMP1 1378 vpand $AND_MASK, $ACC2, $ACC2 1379 vpsrlq \$29, $ACC3, $TEMP4 1380 vpermq \$0x93, $TEMP2, $TEMP2 1381 vpand $AND_MASK, $ACC3, $ACC3 1382 vpermq \$0x93, $TEMP3, $TEMP3 1383 1384 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 1385 vpermq \$0x93, $TEMP4, $TEMP4 1386 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 1387 vpaddq $TEMP0, $ACC0, $ACC0 1388 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 1389 vpaddq $TEMP1, $ACC1, $ACC1 1390 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 1391 vpaddq $TEMP2, $ACC2, $ACC2 1392 vpblendd \$3, $TEMP4, $ZERO, $TEMP4 1393 vpaddq $TEMP3, $ACC3, $ACC3 1394 vpaddq $TEMP4, $ACC4, $ACC4 1395 1396 vmovdqu $ACC0, 0-128($rp) 1397 vmovdqu $ACC1, 32-128($rp) 1398 vmovdqu $ACC2, 64-128($rp) 1399 vmovdqu $ACC3, 96-128($rp) 1400___ 1401 1402$TEMP5=$ACC0; 1403$code.=<<___; 1404 vpsrlq \$29, $ACC4, $TEMP1 1405 vpand $AND_MASK, $ACC4, $ACC4 1406 vpsrlq \$29, 
$ACC5, $TEMP2 1407 vpand $AND_MASK, $ACC5, $ACC5 1408 vpsrlq \$29, $ACC6, $TEMP3 1409 vpermq \$0x93, $TEMP1, $TEMP1 1410 vpand $AND_MASK, $ACC6, $ACC6 1411 vpsrlq \$29, $ACC7, $TEMP4 1412 vpermq \$0x93, $TEMP2, $TEMP2 1413 vpand $AND_MASK, $ACC7, $ACC7 1414 vpsrlq \$29, $ACC8, $TEMP5 1415 vpermq \$0x93, $TEMP3, $TEMP3 1416 vpand $AND_MASK, $ACC8, $ACC8 1417 vpermq \$0x93, $TEMP4, $TEMP4 1418 1419 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 1420 vpermq \$0x93, $TEMP5, $TEMP5 1421 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 1422 vpaddq $TEMP0, $ACC4, $ACC4 1423 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 1424 vpaddq $TEMP1, $ACC5, $ACC5 1425 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 1426 vpaddq $TEMP2, $ACC6, $ACC6 1427 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 1428 vpaddq $TEMP3, $ACC7, $ACC7 1429 vpaddq $TEMP4, $ACC8, $ACC8 1430 1431 vpsrlq \$29, $ACC4, $TEMP1 1432 vpand $AND_MASK, $ACC4, $ACC4 1433 vpsrlq \$29, $ACC5, $TEMP2 1434 vpand $AND_MASK, $ACC5, $ACC5 1435 vpsrlq \$29, $ACC6, $TEMP3 1436 vpermq \$0x93, $TEMP1, $TEMP1 1437 vpand $AND_MASK, $ACC6, $ACC6 1438 vpsrlq \$29, $ACC7, $TEMP4 1439 vpermq \$0x93, $TEMP2, $TEMP2 1440 vpand $AND_MASK, $ACC7, $ACC7 1441 vpsrlq \$29, $ACC8, $TEMP5 1442 vpermq \$0x93, $TEMP3, $TEMP3 1443 vpand $AND_MASK, $ACC8, $ACC8 1444 vpermq \$0x93, $TEMP4, $TEMP4 1445 1446 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 1447 vpermq \$0x93, $TEMP5, $TEMP5 1448 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 1449 vpaddq $TEMP0, $ACC4, $ACC4 1450 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 1451 vpaddq $TEMP1, $ACC5, $ACC5 1452 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 1453 vpaddq $TEMP2, $ACC6, $ACC6 1454 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 1455 vpaddq $TEMP3, $ACC7, $ACC7 1456 vpaddq $TEMP4, $ACC8, $ACC8 1457 1458 vmovdqu $ACC4, 128-128($rp) 1459 vmovdqu $ACC5, 160-128($rp) 1460 vmovdqu $ACC6, 192-128($rp) 1461 vmovdqu $ACC7, 224-128($rp) 1462 vmovdqu $ACC8, 256-128($rp) 1463 vzeroupper 1464 1465 mov %rbp, %rax 1466.cfi_def_cfa_register %rax 1467___ 1468$code.=<<___ if ($win64); 
.Lmul_1024_in_tail:
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
# Common tail of rsaz_1024_mul_avx2 (the Win64-only fragment just above
# reloads %xmm6-%xmm15 first): reload %r15, %r14, %r13, %r12, %rbp and
# %rbx from the slots below the address held in %rax, then restore %rsp
# from %rax and return.
$code.=<<___;
	mov	-48(%rax),%r15
.cfi_restore	%r15
	mov	-40(%rax),%r14
.cfi_restore	%r14
	mov	-32(%rax),%r13
.cfi_restore	%r13
	mov	-24(%rax),%r12
.cfi_restore	%r12
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp		# restore %rsp
.cfi_def_cfa_register	%rsp
.Lmul_1024_epilogue:
	ret
.cfi_endproc
.size	rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
___
}
# rsaz_1024_red2norm_avx2 / rsaz_1024_norm2red_avx2
#
# Conversion between the "redundant" layout (one 29-bit digit per 64-bit
# word, see the 29*$j shifts and the 0x1fffffff mask below) and 16 ordinary
# packed 64-bit words.  Both routines are emitted fully unrolled by the
# Perl loops in this block.
#
# $out/$inp are the first two argument registers for the selected ABI.
# @T is a pool of four scratch registers (%r8..%r11) that is rotated with
# push(@T,shift(@T)) as values are consumed.
{
my ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi");
my @T = map("%r$_",(8..11));

# $inp is biased by +128 (done as "sub \$-128") so that the displacements
# used below are written as 8*$j-128.
$code.=<<___;
.globl	rsaz_1024_red2norm_avx2
.type	rsaz_1024_red2norm_avx2,\@abi-omnipotent
.align	32
rsaz_1024_red2norm_avx2:
	sub	\$-128,$inp	# size optimization
	xor	%rax,%rax
___

# One pass per 64-bit output word $i; $j indexes the input digits.  Every
# digit whose bit offset 29*$j lies below the end of word $i is loaded,
# shifted into place and added into %rax.  The last digit loaded is split:
# its left-shifted copy contributes to the current word, its right-shifted
# copy (plus the carry out of the additions) becomes the initial %rax of
# the next word.  Shift counts are written unreduced here; they are taken
# modulo 64 when the generated text is post-processed.
for ($j=0,$i=0; $i<16; $i++) {
    my $k=0;
    while (29*$j<64*($i+1)) {	# load data till boundary
	$code.=" mov `8*$j-128`($inp), @T[0]\n";
	$j++; $k++; push(@T,shift(@T));
    }
    $l=$k;			# remember how many values were loaded
    while ($k>1) {		# shift loaded data but last value
	$code.=" shl \$`29*($j-$k)`,@T[-$k]\n";
	$k--;
    }
    $code.=<<___;		# shift last value
	mov	@T[-1], @T[0]
	shl	\$`29*($j-1)`, @T[-1]
	shr	\$`-29*($j-1)`, @T[0]
___
    while ($l) {		# accumulate all values
	$code.=" add @T[-$l], %rax\n";
	$l--;
    }
    $code.=<<___;
	adc	\$0, @T[0]	# consume eventual carry
	mov	%rax, 8*$i($out)
	mov	@T[0], %rax
___
    push(@T,shift(@T));
}
# End of red2norm; norm2red follows.  It biases $out by +128, preloads the
# first input word into @T[0] and keeps the digit mask 0x1fffffff in %eax.
$code.=<<___;
	ret
.size	rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2

.globl	rsaz_1024_norm2red_avx2
.type	rsaz_1024_norm2red_avx2,\@abi-omnipotent
.align	32
rsaz_1024_norm2red_avx2:
	sub	\$-128,$out	# size optimization
	mov	($inp),@T[0]
	mov	\$0x1fffffff,%eax
___
# One pass per 64-bit input word $i.  @T[0] holds the current word and
# @T[1] the following one (zeroed with xor after the last word).  Digits
# that end before the end of the current word are extracted with shr+and;
# the digit reaching into the next word is assembled with shrd.
for ($j=0,$i=0; $i<16; $i++) {
    $code.=" mov `8*($i+1)`($inp),@T[1]\n" if ($i<15);
    $code.=" xor @T[1],@T[1]\n" if ($i==15);
    my $k=1;
    while (29*($j+1)<64*($i+1)) {
	$code.=<<___;
	mov	@T[0],@T[-$k]
	shr	\$`29*$j`,@T[-$k]
	and	%rax,@T[-$k]				# &0x1fffffff
	mov	@T[-$k],`8*$j-128`($out)
___
	$j++; $k++;
    }
    $code.=<<___;
	shrd	\$`29*$j`,@T[1],@T[0]
	and	%rax,@T[0]
	mov	@T[0],`8*$j-128`($out)
___
    $j++;
    push(@T,shift(@T));
}
# After the loop @T[0] is the register that was zeroed for $i==15; it is
# stored to the four words following the last digit.
$code.=<<___;
	mov	@T[0],`8*$j-128`($out)			# zero
	mov	@T[0],`8*($j+1)-128`($out)
	mov	@T[0],`8*($j+2)-128`($out)
	mov	@T[0],`8*($j+3)-128`($out)
	ret
.size	rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
___
}
# rsaz_1024_scatter5_avx2 / rsaz_1024_gather5_avx2
#
# scatter5: nine iterations; each loads 32 bytes from $inp, keeps dwords
# 0,2,4,6 (via .Lscatter_permd) and stores the resulting 16 bytes at
# $out + 16*$power, advancing $out by 16*32 bytes per iteration.
#
# gather5: builds sixteen compare masks "index == $power" from .Linc (each
# mask register covers two consecutive index values; eight masks are kept
# on the stack, eight in %ymm8-%ymm15).  Each of the nine iterations then
# reads all sixteen 32-byte chunks of a row, ANDs them with the masks and
# ORs the results together, so the loads performed do not depend on
# $power.  The selected 16 bytes are spread back to one dword per qword
# with .Lgather_permd and stored; a final all-zero 32-byte store follows
# the loop.
{
my ($out,$inp,$power) = $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");

$code.=<<___;
.globl	rsaz_1024_scatter5_avx2
.type	rsaz_1024_scatter5_avx2,\@abi-omnipotent
.align	32
rsaz_1024_scatter5_avx2:
	vzeroupper
	vmovdqu	.Lscatter_permd(%rip),%ymm5
	shl	\$4,$power
	lea	($out,$power),$out
	mov	\$9,%eax
	jmp	.Loop_scatter_1024

.align	32
.Loop_scatter_1024:
	vmovdqu	($inp),%ymm0
	lea	32($inp),$inp
	vpermd	%ymm0,%ymm5,%ymm0
	vmovdqu	%xmm0,($out)
	lea	16*32($out),$out
	dec	%eax
	jnz	.Loop_scatter_1024

	vzeroupper
	ret
.size	rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2

.globl	rsaz_1024_gather5_avx2
.type	rsaz_1024_gather5_avx2,\@abi-omnipotent
.align	32
rsaz_1024_gather5_avx2:
.cfi_startproc
	vzeroupper
	mov	%rsp,%r11
.cfi_def_cfa_register	%r11
___
# Win64 only: save %xmm6-%xmm15.  The instructions are emitted as raw bytes
# so that their encodings are fixed (the unwind codes in .xdata at the end
# of the file refer to this prologue).
$code.=<<___ if ($win64);
	lea	-0x88(%rsp),%rax
.LSEH_begin_rsaz_1024_gather5:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x8d,0x60,0xe0		# lea -0x20(%rax),%rsp
	.byte	0xc5,0xf8,0x29,0x70,0xe0	# vmovaps %xmm6,-0x20(%rax)
	.byte	0xc5,0xf8,0x29,0x78,0xf0	# vmovaps %xmm7,-0x10(%rax)
	.byte	0xc5,0x78,0x29,0x40,0x00	# vmovaps %xmm8,0(%rax)
	.byte	0xc5,0x78,0x29,0x48,0x10	# vmovaps %xmm9,0x10(%rax)
	.byte	0xc5,0x78,0x29,0x50,0x20	# vmovaps %xmm10,0x20(%rax)
	.byte	0xc5,0x78,0x29,0x58,0x30	# vmovaps %xmm11,0x30(%rax)
	.byte	0xc5,0x78,0x29,0x60,0x40	# vmovaps %xmm12,0x40(%rax)
	.byte	0xc5,0x78,0x29,0x68,0x50	# vmovaps %xmm13,0x50(%rax)
	.byte	0xc5,0x78,0x29,0x70,0x60	# vmovaps %xmm14,0x60(%rax)
	.byte	0xc5,0x78,0x29,0x78,0x70	# vmovaps %xmm15,0x70(%rax)
___
# Mask construction followed by the gather loop.  %rax is set to %rsp-128,
# so the mask slots addressed as 32*k+128(%rax) start at the new,
# 32-byte-aligned %rsp.
$code.=<<___;
	lea	-0x100(%rsp),%rsp
	and	\$-32, %rsp
	lea	.Linc(%rip), %r10
	lea	-128(%rsp),%rax			# control u-op density

	vmovd	$power, %xmm4
	vmovdqa	(%r10),%ymm0
	vmovdqa	32(%r10),%ymm1
	vmovdqa	64(%r10),%ymm5
	vpbroadcastd	%xmm4,%ymm4

	vpaddd	%ymm5, %ymm0, %ymm2
	vpcmpeqd	%ymm4, %ymm0, %ymm0
	vpaddd	%ymm5, %ymm1, %ymm3
	vpcmpeqd	%ymm4, %ymm1, %ymm1
	vmovdqa	%ymm0, 32*0+128(%rax)
	vpaddd	%ymm5, %ymm2, %ymm0
	vpcmpeqd	%ymm4, %ymm2, %ymm2
	vmovdqa	%ymm1, 32*1+128(%rax)
	vpaddd	%ymm5, %ymm3, %ymm1
	vpcmpeqd	%ymm4, %ymm3, %ymm3
	vmovdqa	%ymm2, 32*2+128(%rax)
	vpaddd	%ymm5, %ymm0, %ymm2
	vpcmpeqd	%ymm4, %ymm0, %ymm0
	vmovdqa	%ymm3, 32*3+128(%rax)
	vpaddd	%ymm5, %ymm1, %ymm3
	vpcmpeqd	%ymm4, %ymm1, %ymm1
	vmovdqa	%ymm0, 32*4+128(%rax)
	vpaddd	%ymm5, %ymm2, %ymm8
	vpcmpeqd	%ymm4, %ymm2, %ymm2
	vmovdqa	%ymm1, 32*5+128(%rax)
	vpaddd	%ymm5, %ymm3, %ymm9
	vpcmpeqd	%ymm4, %ymm3, %ymm3
	vmovdqa	%ymm2, 32*6+128(%rax)
	vpaddd	%ymm5, %ymm8, %ymm10
	vpcmpeqd	%ymm4, %ymm8, %ymm8
	vmovdqa	%ymm3, 32*7+128(%rax)
	vpaddd	%ymm5, %ymm9, %ymm11
	vpcmpeqd	%ymm4, %ymm9, %ymm9
	vpaddd	%ymm5, %ymm10, %ymm12
	vpcmpeqd	%ymm4, %ymm10, %ymm10
	vpaddd	%ymm5, %ymm11, %ymm13
	vpcmpeqd	%ymm4, %ymm11, %ymm11
	vpaddd	%ymm5, %ymm12, %ymm14
	vpcmpeqd	%ymm4, %ymm12, %ymm12
	vpaddd	%ymm5, %ymm13, %ymm15
	vpcmpeqd	%ymm4, %ymm13, %ymm13
	vpcmpeqd	%ymm4, %ymm14, %ymm14
	vpcmpeqd	%ymm4, %ymm15, %ymm15

	vmovdqa	-32(%r10),%ymm7			# .Lgather_permd
	lea	128($inp), $inp
	mov	\$9,$power

.Loop_gather_1024:
	vmovdqa	32*0-128($inp), %ymm0
	vmovdqa	32*1-128($inp), %ymm1
	vmovdqa	32*2-128($inp), %ymm2
	vmovdqa	32*3-128($inp), %ymm3
	vpand	32*0+128(%rax), %ymm0, %ymm0
	vpand	32*1+128(%rax), %ymm1, %ymm1
	vpand	32*2+128(%rax), %ymm2, %ymm2
	vpor	%ymm0, %ymm1, %ymm4
	vpand	32*3+128(%rax), %ymm3, %ymm3
	vmovdqa	32*4-128($inp), %ymm0
	vmovdqa	32*5-128($inp), %ymm1
	vpor	%ymm2, %ymm3, %ymm5
	vmovdqa	32*6-128($inp), %ymm2
	vmovdqa	32*7-128($inp), %ymm3
	vpand	32*4+128(%rax), %ymm0, %ymm0
	vpand	32*5+128(%rax), %ymm1, %ymm1
	vpand	32*6+128(%rax), %ymm2, %ymm2
	vpor	%ymm0, %ymm4, %ymm4
	vpand	32*7+128(%rax), %ymm3, %ymm3
	vpand	32*8-128($inp), %ymm8, %ymm0
	vpor	%ymm1, %ymm5, %ymm5
	vpand	32*9-128($inp), %ymm9, %ymm1
	vpor	%ymm2, %ymm4, %ymm4
	vpand	32*10-128($inp),%ymm10, %ymm2
	vpor	%ymm3, %ymm5, %ymm5
	vpand	32*11-128($inp),%ymm11, %ymm3
	vpor	%ymm0, %ymm4, %ymm4
	vpand	32*12-128($inp),%ymm12, %ymm0
	vpor	%ymm1, %ymm5, %ymm5
	vpand	32*13-128($inp),%ymm13, %ymm1
	vpor	%ymm2, %ymm4, %ymm4
	vpand	32*14-128($inp),%ymm14, %ymm2
	vpor	%ymm3, %ymm5, %ymm5
	vpand	32*15-128($inp),%ymm15, %ymm3
	lea	32*16($inp), $inp
	vpor	%ymm0, %ymm4, %ymm4
	vpor	%ymm1, %ymm5, %ymm5
	vpor	%ymm2, %ymm4, %ymm4
	vpor	%ymm3, %ymm5, %ymm5

	vpor	%ymm5, %ymm4, %ymm4
	vextracti128	\$1, %ymm4, %xmm5	# upper half is cleared
	vpor	%xmm4, %xmm5, %xmm5
	vpermd	%ymm5,%ymm7,%ymm5
	vmovdqu	%ymm5,($out)
	lea	32($out),$out
	dec	$power
	jnz	.Loop_gather_1024

	vpxor	%ymm0,%ymm0,%ymm0
	vmovdqu	%ymm0,($out)
	vzeroupper
___
# Win64 only: reload %xmm6-%xmm15, addressed relative to the entry %rsp
# that was copied to %r11.
$code.=<<___ if ($win64);
	movaps	-0xa8(%r11),%xmm6
	movaps	-0x98(%r11),%xmm7
	movaps	-0x88(%r11),%xmm8
	movaps	-0x78(%r11),%xmm9
	movaps	-0x68(%r11),%xmm10
	movaps	-0x58(%r11),%xmm11
	movaps	-0x48(%r11),%xmm12
	movaps	-0x38(%r11),%xmm13
	movaps	-0x28(%r11),%xmm14
	movaps	-0x18(%r11),%xmm15
___
# Restore %rsp from %r11 and return.
$code.=<<___;
	lea	(%r11),%rsp
.cfi_def_cfa_register	%rsp
	ret
.cfi_endproc
.LSEH_end_rsaz_1024_gather5:
.size	rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
___
}

# rsaz_avx2_eligible: returns bit 5 of the dword at OPENSSL_ia32cap_P+8
# (presumably the AVX2 feature flag - confirm against the capability
# vector layout) as 0 or 1.  When the script was run with $addx set, the
# result is forced to 0 if bits 8 and 19 are both set ("BMI2+AD*X" per the
# comment in the generated code).
$code.=<<___;
.extern	OPENSSL_ia32cap_P
.globl	rsaz_avx2_eligible
.type	rsaz_avx2_eligible,\@abi-omnipotent
.align	32
rsaz_avx2_eligible:
	mov	OPENSSL_ia32cap_P+8(%rip),%eax
___
$code.=<<___ if ($addx);
	mov	\$`1<<8|1<<19`,%ecx
	mov	\$0,%edx
	and	%eax,%ecx
	cmp	\$`1<<8|1<<19`,%ecx	# check for BMI2+AD*X
	cmove	%edx,%eax
___
# Tail of rsaz_avx2_eligible followed by the constant tables referenced by
# the routines above (.Land_mask, .Lscatter_permd, .Lgather_permd, .Linc).
# gather5 addresses .Lgather_permd as -32(.Linc), so the order and sizes of
# these tables must not change.
$code.=<<___;
	and	\$`1<<5`,%eax
	shr	\$5,%eax
	ret
.size	rsaz_avx2_eligible,.-rsaz_avx2_eligible

.align	64
.Land_mask:
	.quad	0x1fffffff,0x1fffffff,0x1fffffff,-1
.Lscatter_permd:
	.long	0,2,4,6,7,7,7,7
.Lgather_permd:
	.long	0,7,1,7,2,7,3,7
.Linc:
	.long	0,0,0,0, 1,1,1,1
	.long	2,2,2,2, 3,3,3,3
	.long	4,4,4,4, 4,4,4,4
.align	64
___

# Win64 structured-exception-handling support: the language-specific
# handler rsaz_se_handler plus the .pdata/.xdata records for the sqr, mul
# and gather5 routines.  sqr and mul share the handler and pass three
# labels (body, epilogue, "in tail") as HandlerData; gather5 is described
# with plain unwind codes instead.
if ($win64) {
# Registers carrying the handler's four arguments.
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___
.extern	__imp_RtlVirtualUnwind
.type	rsaz_se_handler,\@abi-omnipotent
.align	16
rsaz_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	160($context),%rbp	# pull context->Rbp

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# "in tail" label
	cmp	%r10,%rbx		# context->Rip>="in tail" label
	cmovc	%rbp,%rax

	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	mov	%r15,240($context)
	mov	%r14,232($context)
	mov	%r13,224($context)
	mov	%r12,216($context)
	mov	%rbp,160($context)
	mov	%rbx,144($context)

	lea	-0xd8(%rax),%rsi	# %xmm save area
	lea	512($context),%rdi	# & context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	rsaz_se_handler,.-rsaz_se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_rsaz_1024_sqr_avx2
	.rva	.LSEH_end_rsaz_1024_sqr_avx2
	.rva	.LSEH_info_rsaz_1024_sqr_avx2

	.rva	.LSEH_begin_rsaz_1024_mul_avx2
	.rva	.LSEH_end_rsaz_1024_mul_avx2
	.rva	.LSEH_info_rsaz_1024_mul_avx2

	.rva	.LSEH_begin_rsaz_1024_gather5
	.rva	.LSEH_end_rsaz_1024_gather5
	.rva	.LSEH_info_rsaz_1024_gather5
.section	.xdata
.align	8
.LSEH_info_rsaz_1024_sqr_avx2:
	.byte	9,0,0,0
	.rva	rsaz_se_handler
	.rva	.Lsqr_1024_body,.Lsqr_1024_epilogue,.Lsqr_1024_in_tail
	.long	0
.LSEH_info_rsaz_1024_mul_avx2:
	.byte	9,0,0,0
	.rva	rsaz_se_handler
	.rva	.Lmul_1024_body,.Lmul_1024_epilogue,.Lmul_1024_in_tail
	.long	0
.LSEH_info_rsaz_1024_gather5:
	.byte	0x01,0x36,0x17,0x0b
	.byte	0x36,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
	.byte	0x31,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
	.byte	0x2c,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
	.byte	0x27,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
	.byte	0x22,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
	.byte	0x1d,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
	.byte	0x18,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
	.byte	0x13,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
	.byte	0x0e,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
	.byte	0x09,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
	.byte	0x04,0x01,0x15,0x00	# sub rsp,0xa8
	.byte	0x00,0xb3,0x00,0x00	# set_frame r11
___
}

# Post-process the accumulated text one line at a time:
#   - evaluate every `...` expression as Perl;
#   - then apply at most one rewrite from the "or" chain below (the chain
#     stops at the first substitution that succeeds): immediate shift
#     counts are reduced modulo 64, and %ymmN is rewritten to %xmmN for
#     mnemonics written with a ymm register name but needing an xmm one.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;

	s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge		or

	s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go		or
# Remaining register-width rewrites of the per-line fix-up chain: a
# "%x%ymmN" operand of vmovdqu, and %ymmN operands of vpinsr[qd],
# vpextr[qd] and vpbroadcast[qd] (source operand), become %xmmN.
	s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
	s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
	print $_,"\n";
}

# Fallback when the AVX2 code path is not generated: rsaz_avx2_eligible
# always returns 0, and every other entry point is an alias of a single
# stub that executes ud2, so an accidental call traps instead of returning
# garbage.
}}} else {{{
print <<___;	# assembler is too old
.text

.globl	rsaz_avx2_eligible
.type	rsaz_avx2_eligible,\@abi-omnipotent
rsaz_avx2_eligible:
	xor	%eax,%eax
	ret
.size	rsaz_avx2_eligible,.-rsaz_avx2_eligible

.globl	rsaz_1024_sqr_avx2
.globl	rsaz_1024_mul_avx2
.globl	rsaz_1024_norm2red_avx2
.globl	rsaz_1024_red2norm_avx2
.globl	rsaz_1024_scatter5_avx2
.globl	rsaz_1024_gather5_avx2
.type	rsaz_1024_sqr_avx2,\@abi-omnipotent
rsaz_1024_sqr_avx2:
rsaz_1024_mul_avx2:
rsaz_1024_norm2red_avx2:
rsaz_1024_red2norm_avx2:
rsaz_1024_scatter5_avx2:
rsaz_1024_gather5_avx2:
	.byte	0x0f,0x0b	# ud2
	ret
.size	rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
___
}}}

# Buffered output is only flushed for certain at close time, and if STDOUT
# is attached to a pipe, close also reports a failing downstream process.
# Check the result so that truncated output makes the script exit non-zero
# instead of being silently accepted.
close STDOUT or die "error closing STDOUT: $!";